
feat: Add tests to refactored init command

Cristian 5 years ago
parent
commit
be0dff8126
3 changed files with 81 additions and 12 deletions
  1. archivebox/index/__init__.py  +11 -11
  2. archivebox/main.py  +2 -1
  3. tests/test_init.py  +68 -0

archivebox/index/__init__.py  +11 -11

@@ -261,6 +261,11 @@ def write_main_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=
     log_indexing_process_finished()
 
 
+def get_empty_snapshot_queryset(out_dir: str=OUTPUT_DIR):
+    setup_django(out_dir, check_db=True)
+    from core.models import Snapshot
+    return Snapshot.objects.none()
+
 @enforce_types
 def load_main_index(out_dir: str=OUTPUT_DIR, warn: bool=True) -> List[Link]:
     """parse and load existing index with any new links from import_path merged in"""
@@ -432,23 +437,19 @@ def get_duplicate_folders(snapshots, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
     """dirs that conflict with other directories that have the same link URL or timestamp"""
     by_url = {}
     by_timestamp = {}
-    indexed_folders = set()
-    for snapshot in snapshots.iterator():
-        link = snapshot.as_link()
-        by_url[link.url] = 0
-        by_timestamp[link.timestamp] = 0
-        indexed_folders.update([link.link_dir])
-
     duplicate_folders = {}
 
     data_folders = (
-        entry.path
+        str(entry)
         for entry in (Path(out_dir) / ARCHIVE_DIR_NAME).iterdir()
-            if entry.is_dir() and str(entry) not in indexed_folders
+            if entry.is_dir() and not snapshots.filter(timestamp=entry.name).exists()
     )
 
-    for path in chain(sorted(indexed_folders), sorted(data_folders)):
+    for path in chain(snapshots.iterator(), data_folders):
         link = None
+        if type(path) is not str:
+            path = path.as_link().link_dir
+
         try:
             link = parse_json_link_details(path)
         except Exception:
@@ -464,7 +465,6 @@ def get_duplicate_folders(snapshots, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
             by_url[link.url] = by_url.get(link.url, 0) + 1
             if by_url[link.url] > 1:
                 duplicate_folders[path] = link
-
     return duplicate_folders
 
 def get_orphaned_folders(snapshots, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
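
The get_duplicate_folders() refactor above drops the pre-computed indexed_folders set: data folders are now checked against the Snapshot table directly (snapshots.filter(timestamp=entry.name).exists()), and a folder is reported as a duplicate once its URL or timestamp has been counted more than once. A minimal sketch of that counting idea, using plain pathlib/json in place of ArchiveBox's parse_json_link_details() and the Django queryset (the archive-dir layout and the "url"/"timestamp" keys mirror what the tests below manipulate; everything else is illustrative):

from collections import defaultdict
from pathlib import Path
import json

def find_duplicate_folders(archive_dir: str) -> dict:
    """Flag folders whose index.json repeats a URL or timestamp seen in another folder."""
    by_url = defaultdict(int)
    by_timestamp = defaultdict(int)
    duplicates = {}
    for entry in sorted(Path(archive_dir).iterdir()):
        index_file = entry / "index.json"
        if not entry.is_dir() or not index_file.exists():
            continue
        details = json.loads(index_file.read_text())
        by_url[details["url"]] += 1
        by_timestamp[details["timestamp"]] += 1
        if by_url[details["url"]] > 1 or by_timestamp[details["timestamp"]] > 1:
            duplicates[str(entry)] = details
    return duplicates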

archivebox/main.py  +2 -1

@@ -26,6 +26,7 @@ from .util import enforce_types                         # type: ignore
 from .system import get_dir_size, dedupe_cron_jobs, CRON_COMMENT
 from .index import (
     load_main_index,
+    get_empty_snapshot_queryset,
     parse_links_from_source,
     dedupe_links,
     write_main_index,
@@ -317,7 +318,7 @@ def init(force: bool=False, out_dir: str=OUTPUT_DIR) -> None:
     print()
     print('{green}[*] Collecting links from any existing indexes and archive folders...{reset}'.format(**ANSI))
 
-    all_links: Dict[str, Link] = {}
+    all_links = get_empty_snapshot_queryset()
     pending_links: Dict[str, Link] = {}
 
     if existing_index:
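
With this change, init() starts all_links off as an empty Snapshot queryset rather than a Dict[str, Link], so the rest of the command can fold existing rows in with ordinary queryset operations. A rough sketch of how the new helper composes, assuming it is run from inside an already-initialized ArchiveBox data directory (the timestamp literal is made up purely for illustration):

from archivebox.index import get_empty_snapshot_queryset  # helper added in this commit

all_links = get_empty_snapshot_queryset()  # calls setup_django() and returns Snapshot.objects.none()

# core.models is only importable after setup_django() has run, hence the late import;
# querysets of the same model combine with |, much like merging dicts starting from {}.
from core.models import Snapshot
all_links = all_links | Snapshot.objects.filter(timestamp="1600000000.0")
print(all_links.count())  # 0 unless a snapshot happens to have that (made-up) timestamp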

tests/test_init.py  +68 -0

@@ -5,6 +5,7 @@ import os
 import subprocess
 from pathlib import Path
 import json
+import sqlite3
 
 from archivebox.config import OUTPUT_PERMISSIONS
 
@@ -63,4 +64,71 @@ def test_correct_permissions_add_command_results(tmp_path, process, disable_extractors_dict):
     for path in archived_item_path.iterdir():
         assert oct(path.stat().st_mode)[-3:] == OUTPUT_PERMISSIONS
 
+def test_collision_urls_different_timestamps(tmp_path, process, disable_extractors_dict):
+    os.chdir(tmp_path)
+    subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True,
+                     env=disable_extractors_dict)
+    subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/iana.org.html'], capture_output=True,
+                     env=disable_extractors_dict)
+    archive_folders = [x.name for x in (tmp_path / "archive").iterdir()]
+    
+    first_archive = tmp_path / "archive" / str(min([float(folder) for folder in archive_folders]))
+    json_index = str(first_archive / "index.json")
+    with open(json_index, "r") as f:
+        link_details = json.loads(f.read())
+
+    link_details["url"] = "http://127.0.0.1:8080/static/iana.org.html"
+    with open(json_index, "w") as f:
+        json.dump(link_details, f)
+
+    init_process = subprocess.run(['archivebox', 'init'], capture_output=True, env=disable_extractors_dict)
+    # 1 from duplicated url, 1 from corrupted index
+    assert "Skipped adding 2 invalid link data directories" in init_process.stdout.decode("utf-8")
+    assert init_process.returncode == 0
+
+def test_collision_timestamps_different_urls(tmp_path, process, disable_extractors_dict):
+    os.chdir(tmp_path)
+    subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True,
+                     env=disable_extractors_dict)
+    subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/iana.org.html'], capture_output=True,
+                     env=disable_extractors_dict)
+    archive_folders = [x.name for x in (tmp_path / "archive").iterdir()]
+    first_archive = tmp_path / "archive" / str(min([float(folder) for folder in archive_folders]))
+    archive_folders.remove(first_archive.name)
+    json_index = str(first_archive / "index.json")
+
+    with open(json_index, "r") as f:
+        link_details = json.loads(f.read())
+
+    link_details["timestamp"] = archive_folders[0]
+
+    with open(json_index, "w") as f:
+        json.dump(link_details, f)
+
+    init_process = subprocess.run(['archivebox', 'init'], capture_output=True, env=disable_extractors_dict)
+    assert "Skipped adding 1 invalid link data directories" in init_process.stdout.decode("utf-8")
+    assert init_process.returncode == 0
+
+def test_orphaned_folders(tmp_path, process, disable_extractors_dict):
+    os.chdir(tmp_path)
+    subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True,
+                     env=disable_extractors_dict)
+    conn = sqlite3.connect("index.sqlite3")
+    c = conn.cursor()
+    c.execute("DELETE from core_snapshot")
+    conn.commit()
+    conn.close()
+
+    init_process = subprocess.run(['archivebox', 'init'], capture_output=True, env=disable_extractors_dict)
+    assert "Added 1 orphaned links from existing JSON index" in init_process.stdout.decode("utf-8")
+    assert init_process.returncode == 0
+
+def test_unrecognized_folders(tmp_path, process, disable_extractors_dict):
+    os.chdir(tmp_path)
+    subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True,
+                     env=disable_extractors_dict)
+    (tmp_path / "archive" / "some_random_folder").mkdir()
 
+    init_process = subprocess.run(['archivebox', 'init'], capture_output=True, env=disable_extractors_dict)
+    assert "Skipped adding 1 invalid link data directories" in init_process.stdout.decode("utf-8")
+    assert init_process.returncode == 0
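
The new tests reuse the suite's existing fixtures (process, disable_extractors_dict) and the local static file server at http://127.0.0.1:8080 that the earlier tests in tests/test_init.py already rely on. One hedged way to run just these cases from the repository root, assuming pytest is installed and those fixtures are provided by the repo's conftest (the -k expression simply matches the test names added above):

import sys
import pytest

# Run only the init-refactor tests added in this commit, stopping at the first failure.
sys.exit(pytest.main(["-x", "tests/test_init.py",
                      "-k", "collision or orphaned or unrecognized"]))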