Browse Source

feat: Finish reversal. Add ArchiveResults that are not found in the index.json

Cristian 5 years ago
parent
commit
b237e412df
2 changed files with 14 additions and 8 deletions
  1. 13 7
      archivebox/core/migrations/0007_archiveresult.py
  2. 1 1
      archivebox/core/utils.py

+ 13 - 7
archivebox/core/migrations/0007_archiveresult.py

@@ -7,6 +7,7 @@ from django.db import migrations, models
 import django.db.models.deletion
 
 from config import CONFIG
+from index.json import to_json
 
 
 def forwards_func(apps, schema_editor):
@@ -33,26 +34,31 @@ def forwards_func(apps, schema_editor):
                 start_ts=result["start_ts"], end_ts=result["end_ts"], status=result["status"], pwd=result["pwd"], output=result["output"])
 
 
-def verify_json_index_integrity(results):
+def verify_json_index_integrity(snapshot):
     results = snapshot.archiveresult_set.all()
     out_dir = Path(CONFIG['ARCHIVE_DIR']) / snapshot.timestamp
     with open(out_dir / "index.json", "r") as f:
         index = json.load(f)
 
     history = index["history"]
-    extractors = [extractor for extractor in history]
-    index_results = [(result, extractor) for result in history[extractor]]
-    flattened_results = [(result["start_ts"], extractor) for result, extractor in index_results]
+    index_results = [result for extractor in history for result in history[extractor]]
+    flattened_results = [result["start_ts"] for result in index_results]
     
-    missing = [result for result in results if result.start_ts not in flattened_results]
-
-    #process missing elements here. Re-add to the index.json
+    missing_results = [result for result in results if result.start_ts.isoformat() not in flattened_results]
 
+    for missing in missing_results:
+        index["history"][missing.extractor].append({"cmd": missing.cmd, "cmd_version": missing.cmd_version, "end_ts": missing.end_ts.isoformat(),
+                                                    "start_ts": missing.start_ts.isoformat(), "pwd": missing.pwd, "output": missing.output,
+                                                    "schema": "ArchiveResult", "status": missing.status})
 
+    json_index = to_json(index)
+    with open(out_dir / "index.json", "w") as f:
+        f.write(json_index)
 
 
 def reverse_func(apps, schema_editor):
     Snapshot = apps.get_model("core", "Snapshot")
+    ArchiveResult = apps.get_model("core", "ArchiveResult")
     for snapshot in Snapshot.objects.all():
         verify_json_index_integrity(snapshot)
 

+ 1 - 1
archivebox/core/utils.py

@@ -35,7 +35,7 @@ def get_icons(snapshot: Snapshot) -> str:
             if extractor == "wget":
                 # warc isn't technically it's own extractor, so we have to add it after wget
 
-                output += output_template.format(path, canon[f"warc_path"],
+                output += output_template.format(path, canon["warc_path"],
                                                  exists, "warc", icons.get("warc", "?"))
 
         except Exception as e: