
add overwrite flag to add command to force re-archiving

Nick Sweeting, 5 years ago
commit b681a477ae
4 files changed, 35 insertions(+), 18 deletions(-)
  1. archivebox/cli/archivebox_add.py (+7, -0)
  2. archivebox/extractors/__init__.py (+12, -12)
  3. archivebox/main.py (+15, -6)
  4. archivebox/system.py (+1, -0)

archivebox/cli/archivebox_add.py (+7, -0)

@@ -55,6 +55,12 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
         type=int,
         help="Recursively archive all linked pages up to this many hops away"
     )
+    parser.add_argument(
+        "--overwrite",
+        default=False,
+        action="store_true",
+        help="Re-archive URLs from scratch, overwriting any existing files"
+    )
     command = parser.parse_args(args or ())
     urls = command.urls
     stdin_urls = accept_stdin(stdin)
@@ -69,6 +75,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
         depth=command.depth,
         update_all=command.update_all,
         index_only=command.index_only,
+        overwrite=command.overwrite,
         out_dir=pwd or OUTPUT_DIR,
     )
 
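With the flag wired into the parser, forcing a re-archive from the command line is just `archivebox add --overwrite <url>`. A minimal sketch of the equivalent programmatic call, assuming the module layout shown above (the URL is illustrative, and passing `args` explicitly is how the sketch avoids relying on piped stdin):

    from archivebox.cli.archivebox_add import main

    # Equivalent to: archivebox add --overwrite https://example.com
    # (example URL; in a real run the positional url is parsed into command.urls)
    main(args=['--overwrite', 'https://example.com'])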

archivebox/extractors/__init__.py (+12, -12)

@@ -36,18 +36,18 @@ from .archive_org import should_save_archive_dot_org, save_archive_dot_org
 
 def get_default_archive_methods():
     return [
-            ('title', should_save_title, save_title),
-            ('favicon', should_save_favicon, save_favicon),
-            ('wget', should_save_wget, save_wget),
-            ('singlefile', should_save_singlefile, save_singlefile),
-            ('pdf', should_save_pdf, save_pdf),
-            ('screenshot', should_save_screenshot, save_screenshot),
-            ('dom', should_save_dom, save_dom),
-            ('readability', should_save_readability, save_readability), #keep readability below wget and singlefile, as it depends on them
-            ('git', should_save_git, save_git),
-            ('media', should_save_media, save_media),
-            ('archive_org', should_save_archive_dot_org, save_archive_dot_org),
-        ]
+        ('title', should_save_title, save_title),
+        ('favicon', should_save_favicon, save_favicon),
+        ('wget', should_save_wget, save_wget),
+        ('singlefile', should_save_singlefile, save_singlefile),
+        ('pdf', should_save_pdf, save_pdf),
+        ('screenshot', should_save_screenshot, save_screenshot),
+        ('dom', should_save_dom, save_dom),
+        ('readability', should_save_readability, save_readability), #keep readability below wget and singlefile, as it depends on them
+        ('git', should_save_git, save_git),
+        ('media', should_save_media, save_media),
+        ('archive_org', should_save_archive_dot_org, save_archive_dot_org),
+    ]
 
 @enforce_types
 def ignore_methods(to_ignore: List[str]):

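Each tuple above pairs a method name with a `should_save_*` predicate and a `save_*` function. This hunk only re-indents the list, but it shows the contract an overwrite-aware runner depends on. A minimal sketch of that dispatch loop, where `run_archive_methods` and the predicate/saver signatures are assumptions for illustration (the real runner in extractors/__init__.py is more involved):

    def run_archive_methods(link, out_dir, overwrite=False):
        # Hypothetical sketch: iterate the (name, should_save, save) tuples.
        for name, should_save, save in get_default_archive_methods():
            # With overwrite=True, skip the "already saved?" check and re-run the method.
            if overwrite or should_save(link, out_dir):
                save(link, out_dir)
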
archivebox/main.py (+15, -6)

@@ -522,6 +522,7 @@ def add(urls: Union[str, List[str]],
         depth: int=0,
         update_all: bool=not ONLY_NEW,
         index_only: bool=False,
+        overwrite: bool=False,
         out_dir: str=OUTPUT_DIR) -> List[Link]:
     """Add a new URL or list of URLs to your archive"""
 
@@ -551,20 +552,28 @@ def add(urls: Union[str, List[str]],
         for new_link in new_links:
             downloaded_file = save_file_as_source(new_link.url, filename='{ts}-crawl-{basename}.txt', out_dir=out_dir)
             new_links_depth += parse_links_from_source(downloaded_file)
-    all_links, new_links = dedupe_links(all_links, new_links + new_links_depth)
+
+    imported_links = new_links + new_links_depth
+    all_links, new_links = dedupe_links(all_links, imported_links)
     write_main_index(links=all_links, out_dir=out_dir, finished=not new_links)
 
     if index_only:
         return all_links
 
     # Run the archive methods for each link
-    to_archive = all_links if update_all else new_links
-    archive_links(to_archive, out_dir=out_dir)
+    if update_all:
+        archive_links(all_links, overwrite=overwrite, out_dir=out_dir)
+    elif overwrite:
+        archive_links(imported_links, overwrite=True, out_dir=out_dir)
+    elif new_links:
+        archive_links(new_links, overwrite=False, out_dir=out_dir)
+    else:
+        # nothing was updated, don't bother re-saving the index
+        return all_links
 
     # Step 4: Re-write links index with updated titles, icons, and resources
-    if to_archive:
-        all_links = load_main_index(out_dir=out_dir)
-        write_main_index(links=list(all_links), out_dir=out_dir, finished=True)
+    all_links = load_main_index(out_dir=out_dir)
+    write_main_index(links=list(all_links), out_dir=out_dir, finished=True)
     return all_links
 
 @enforce_types

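The reworked dispatch means `--overwrite` re-runs the extractors only for the links imported in this call (`imported_links`), `--update-all` widens that to the whole index, and a plain `add` still archives only genuinely new links. A usage sketch of the Python API with the new keyword, matching the signature in the hunk above (the URL and path are illustrative):

    from archivebox.main import add

    # Re-run all extractors for these URLs even if snapshot output already exists:
    add(['https://example.com'], overwrite=True, out_dir='/path/to/archive')
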
archivebox/system.py (+1, -0)

@@ -16,6 +16,7 @@ from .util import enforce_types, ExtendedEncoder
 from .config import OUTPUT_PERMISSIONS
 
 
+
 def run(*args, input=None, capture_output=True, text=False, **kwargs):
     """Patched of subprocess.run to fix blocking io making timeout=innefective"""