Browse Source

feat: Add extract flag to add command

Cristian 5 years ago
parent
commit
44eede96e5
3 changed files with 28 additions and 6 deletions
  1. archivebox/cli/archivebox_add.py (+8 -1)
  2. archivebox/main.py (+10 -4)
  3. tests/test_add.py (+10 -1)

+ 8 - 1
archivebox/cli/archivebox_add.py

@@ -62,10 +62,16 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
         help="Re-archive URLs from scratch, overwriting any existing files"
     )
     parser.add_argument(
-        '--init', #'-i',
+        "--init", #'-i',
         action='store_true',
         help="Init/upgrade the curent data directory before adding",
     )
+    parser.add_argument(
+        "--extract",
+        nargs="+",
+        help="Pass a list of the extractors to be used. If the method name is not correct, it will be ignored. \
+              This does not take precedence over the configuration"
+    )
     command = parser.parse_args(args or ())
     urls = command.urls
     stdin_urls = accept_stdin(stdin)
@@ -83,6 +89,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
         overwrite=command.overwrite,
         init=command.init,
         out_dir=pwd or OUTPUT_DIR,
+        extractors = command.extract or [],
     )
 
 

+ 10 - 4
archivebox/main.py

@@ -525,7 +525,8 @@ def add(urls: Union[str, List[str]],
         index_only: bool=False,
         overwrite: bool=False,
         init: bool=False,
-        out_dir: Path=OUTPUT_DIR) -> List[Link]:
+        out_dir: Path=OUTPUT_DIR,
+        extractors: list=[]) -> List[Link]:
     """Add a new URL or list of URLs to your archive"""
 
     assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)'
@@ -567,12 +568,17 @@ def add(urls: Union[str, List[str]],
         return all_links
 
     # Run the archive methods for each link
+    archive_kwargs = {
+        "out_dir": out_dir,
+    }
+    if extractors:
+        archive_kwargs["methods"] = extractors
     if update_all:
-        archive_links(all_links, overwrite=overwrite, out_dir=out_dir)
+        archive_links(all_links, overwrite=overwrite, **archive_kwargs)
     elif overwrite:
-        archive_links(imported_links, overwrite=True, out_dir=out_dir)
+        archive_links(imported_links, overwrite=True, **archive_kwargs)
     elif new_links:
-        archive_links(new_links, overwrite=False, out_dir=out_dir)
+        archive_links(new_links, overwrite=False, **archive_kwargs)
     
     return all_links
 

+ 10 - 1
tests/test_add.py

@@ -81,4 +81,13 @@ def test_add_updates_history_json_index(tmp_path, process, disable_extractors_di
 
     with open(archived_item_path / "index.json", "r") as f:
         output_json = json.load(f)
-    assert output_json["history"] != {}
+    assert output_json["history"] != {}
+
+def test_extract_input_uses_only_passed_extractors(tmp_path, process):
+    subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--extract", "wget"],
+                    capture_output=True)
+    
+    archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
+
+    assert (archived_item_path / "warc").exists()
+    assert not (archived_item_path / "singlefile.html").exists()