
feat: Add tests to refactored init command

Cristian 5 years ago
parent
commit
be0dff8126
3 changed files with 81 additions and 12 deletions
  1. archivebox/index/__init__.py  +11 -11
  2. archivebox/main.py  +2 -1
  3. tests/test_init.py  +68 -0

archivebox/index/__init__.py  +11 -11

@@ -261,6 +261,11 @@ def write_main_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=
     log_indexing_process_finished()
 
 
+def get_empty_snapshot_queryset(out_dir: str=OUTPUT_DIR):
+    setup_django(out_dir, check_db=True)
+    from core.models import Snapshot
+    return Snapshot.objects.none()
+
 @enforce_types
 def load_main_index(out_dir: str=OUTPUT_DIR, warn: bool=True) -> List[Link]:
     """parse and load existing index with any new links from import_path merged in"""
@@ -432,23 +437,19 @@ def get_duplicate_folders(snapshots, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """dirs that conflict with other directories that have the same link URL or timestamp"""
     by_url = {}
     by_timestamp = {}
-    indexed_folders = set()
-    for snapshot in snapshots.iterator():
-        link = snapshot.as_link()
-        by_url[link.url] = 0
-        by_timestamp[link.timestamp] = 0
-        indexed_folders.update([link.link_dir])
-
     duplicate_folders = {}
 
     data_folders = (
-        entry.path
+        str(entry)
         for entry in (Path(out_dir) / ARCHIVE_DIR_NAME).iterdir()
-            if entry.is_dir() and str(entry) not in indexed_folders
+            if entry.is_dir() and not snapshots.filter(timestamp=entry.name).exists()
     )
 
-    for path in chain(sorted(indexed_folders), sorted(data_folders)):
+    for path in chain(snapshots.iterator(), data_folders):
         link = None
+        if type(path) is not str:
+            path = path.as_link().link_dir
+
         try:
             link = parse_json_link_details(path)
         except Exception:
@@ -464,7 +465,6 @@ def get_duplicate_folders(snapshots, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
             by_url[link.url] = by_url.get(link.url, 0) + 1
             if by_url[link.url] > 1:
                 duplicate_folders[path] = link
-
     return duplicate_folders
 
 def get_orphaned_folders(snapshots, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
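
The get_duplicate_folders() refactor above drops the pre-computed indexed_folders set: data folders are now checked against the Snapshot table directly (snapshots.filter(timestamp=entry.name).exists()), and a folder is reported as a duplicate once its URL or timestamp has been counted more than once. A minimal sketch of that counting idea, using plain pathlib/json in place of ArchiveBox's parse_json_link_details() and the Django queryset (the archive-dir layout and the "url"/"timestamp" keys mirror what the tests below manipulate; everything else is illustrative):

from collections import defaultdict
from pathlib import Path
import json

def find_duplicate_folders(archive_dir: str) -> dict:
    """Flag folders whose index.json repeats a URL or timestamp seen in another folder."""
    by_url = defaultdict(int)
    by_timestamp = defaultdict(int)
    duplicates = {}
    for entry in sorted(Path(archive_dir).iterdir()):
        index_file = entry / "index.json"
        if not entry.is_dir() or not index_file.exists():
            continue
        details = json.loads(index_file.read_text())
        by_url[details["url"]] += 1
        by_timestamp[details["timestamp"]] += 1
        if by_url[details["url"]] > 1 or by_timestamp[details["timestamp"]] > 1:
            duplicates[str(entry)] = details
    return duplicates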

archivebox/main.py  +2 -1

@@ -26,6 +26,7 @@ from .util import enforce_types                         # type: ignore
 from .system import get_dir_size, dedupe_cron_jobs, CRON_COMMENT
 from .index import (
     load_main_index,
+    get_empty_snapshot_queryset,
     parse_links_from_source,
     dedupe_links,
     write_main_index,
@@ -317,7 +318,7 @@ def init(force: bool=False, out_dir: str=OUTPUT_DIR) -> None:
     print()
     print('{green}[*] Collecting links from any existing indexes and archive folders...{reset}'.format(**ANSI))
 
-    all_links: Dict[str, Link] = {}
+    all_links = get_empty_snapshot_queryset()
     pending_links: Dict[str, Link] = {}
 
     if existing_index:
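
With this change, init() starts all_links off as an empty Snapshot queryset rather than a Dict[str, Link], so the rest of the command can fold existing rows in with ordinary queryset operations. A rough sketch of how the new helper composes, assuming it is run from inside an already-initialized ArchiveBox data directory (the timestamp literal is made up purely for illustration):

from archivebox.index import get_empty_snapshot_queryset  # helper added in this commit

all_links = get_empty_snapshot_queryset()  # calls setup_django() and returns Snapshot.objects.none()

# core.models is only importable after setup_django() has run, hence the late import;
# querysets of the same model combine with |, much like merging dicts starting from {}.
from core.models import Snapshot
all_links = all_links | Snapshot.objects.filter(timestamp="1600000000.0")
print(all_links.count())  # 0 unless a snapshot happens to have that (made-up) timestamp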

tests/test_init.py  +68 -0

@@ -5,6 +5,7 @@ import os
 import subprocess
 from pathlib import Path
 import json
+import sqlite3
 
 from archivebox.config import OUTPUT_PERMISSIONS
 
@@ -63,4 +64,71 @@ def test_correct_permissions_add_command_results(tmp_path, process, disable_extractors_dict):
     for path in archived_item_path.iterdir():
         assert oct(path.stat().st_mode)[-3:] == OUTPUT_PERMISSIONS
 
+def test_collision_urls_different_timestamps(tmp_path, process, disable_extractors_dict):
+    os.chdir(tmp_path)
+    subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True,
+                     env=disable_extractors_dict)
+    subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/iana.org.html'], capture_output=True,
+                     env=disable_extractors_dict)
+    archive_folders = [x.name for x in (tmp_path / "archive").iterdir()]
+    
+    first_archive = tmp_path / "archive" / str(min([float(folder) for folder in archive_folders]))
+    json_index = str(first_archive / "index.json")
+    with open(json_index, "r") as f:
+        link_details = json.loads(f.read())
+
+    link_details["url"] = "http://127.0.0.1:8080/static/iana.org.html"
+    with open(json_index, "w") as f:
+        json.dump(link_details, f)
+
+    init_process = subprocess.run(['archivebox', 'init'], capture_output=True, env=disable_extractors_dict)
+    # 1 from duplicated url, 1 from corrupted index
+    assert "Skipped adding 2 invalid link data directories" in init_process.stdout.decode("utf-8")
+    assert init_process.returncode == 0
+
+def test_collision_timestamps_different_urls(tmp_path, process, disable_extractors_dict):
+    os.chdir(tmp_path)
+    subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True,
+                     env=disable_extractors_dict)
+    subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/iana.org.html'], capture_output=True,
+                     env=disable_extractors_dict)
+    archive_folders = [x.name for x in (tmp_path / "archive").iterdir()]
+    first_archive = tmp_path / "archive" / str(min([float(folder) for folder in archive_folders]))
+    archive_folders.remove(first_archive.name)
+    json_index = str(first_archive / "index.json")
+
+    with open(json_index, "r") as f:
+        link_details = json.loads(f.read())
+
+    link_details["timestamp"] = archive_folders[0]
+
+    with open(json_index, "w") as f:
+        json.dump(link_details, f)
+
+    init_process = subprocess.run(['archivebox', 'init'], capture_output=True, env=disable_extractors_dict)
+    assert "Skipped adding 1 invalid link data directories" in init_process.stdout.decode("utf-8")
+    assert init_process.returncode == 0
+
+def test_orphaned_folders(tmp_path, process, disable_extractors_dict):
+    os.chdir(tmp_path)
+    subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True,
+                     env=disable_extractors_dict)
+    conn = sqlite3.connect("index.sqlite3")
+    c = conn.cursor()
+    c.execute("DELETE from core_snapshot")
+    conn.commit()
+    conn.close()
+
+    init_process = subprocess.run(['archivebox', 'init'], capture_output=True, env=disable_extractors_dict)
+    assert "Added 1 orphaned links from existing JSON index" in init_process.stdout.decode("utf-8")
+    assert init_process.returncode == 0
+
+def test_unrecognized_folders(tmp_path, process, disable_extractors_dict):
+    os.chdir(tmp_path)
+    subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True,
+                     env=disable_extractors_dict)
+    (tmp_path / "archive" / "some_random_folder").mkdir()
 
+    init_process = subprocess.run(['archivebox', 'init'], capture_output=True, env=disable_extractors_dict)
+    assert "Skipped adding 1 invalid link data directories" in init_process.stdout.decode("utf-8")
+    assert init_process.returncode == 0
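
The new tests reuse the suite's existing fixtures (process, disable_extractors_dict) and the local static file server at http://127.0.0.1:8080 that the earlier tests in tests/test_init.py already rely on. One hedged way to run just these cases from the repository root, assuming pytest is installed and those fixtures are provided by the repo's conftest (the -k expression simply matches the test names added above):

import sys
import pytest

# Run only the init-refactor tests added in this commit, stopping at the first failure.
sys.exit(pytest.main(["-x", "tests/test_init.py",
                      "-k", "collision or orphaned or unrecognized"]))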