Forráskód Böngészése

Merge pull request #389 from cdvv7788/recover-index

fix: Guess timestamps and add placeholders to support older indices
Nick Sweeting 5 éve
szülő
commit
f4c0616332
3 módosított fájl, 51 hozzáadás és 16 törlés
  1. archivebox/index/__init__.py (+12 −4)
  2. archivebox/index/json.py (+8 −6)
  3. archivebox/index/schema.py (+31 −6)

+ 12 - 4
archivebox/index/__init__.py

@@ -529,8 +529,16 @@ def get_unrecognized_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Option
             link = None
             try:
                 link = parse_json_link_details(entry.path)
-            except Exception:
-                pass
+            except KeyError:
+                # Try to fix index
+                if index_exists:
+                    try:
+                        # Last attempt to repair the detail index
+                        link_guessed = parse_json_link_details(entry.path, guess=True)
+                        write_json_link_details(link_guessed, out_dir=entry.path)
+                        link = parse_json_link_details(entry.path)
+                    except Exception as e:
+                        pass
 
             if index_exists and link is None:
                 # index exists but it's corrupted or unparseable
@@ -555,9 +563,9 @@ def is_valid(link: Link) -> bool:
         return False
     if dir_exists and index_exists:
         try:
-            parsed_link = parse_json_link_details(link.link_dir)
+            parsed_link = parse_json_link_details(link.link_dir, guess=True)
             return link.url == parsed_link.url
-        except Exception:
+        except Exception as e:
             pass
     return False
 

+ 8 - 6
archivebox/index/json.py

@@ -39,7 +39,6 @@ MAIN_INDEX_HEADER = {
     },
 }
 
-
 ### Main Links Index
 
 @enforce_types
@@ -58,8 +57,12 @@ def parse_json_main_index(out_dir: str=OUTPUT_DIR) -> Iterator[Link]:
                         detail_index_path = Path(OUTPUT_DIR) / ARCHIVE_DIR_NAME / link_json['timestamp']
                         yield parse_json_link_details(str(detail_index_path))
                     except KeyError: 
-                        print("    {lightyellow}! Failed to load the index.json from {}".format(detail_index_path, **ANSI))
-                        continue
+                        # as a last effort, try to guess the missing values out of existing ones
+                        try:
+                            yield Link.from_json(link_json, guess=True)
+                        except KeyError:
+                            print("    {lightyellow}! Failed to load the index.json from {}".format(detail_index_path, **ANSI))
+                            continue
     return ()
 
 @enforce_types
@@ -94,19 +97,18 @@ def write_json_link_details(link: Link, out_dir: Optional[str]=None) -> None:
     
     out_dir = out_dir or link.link_dir
     path = os.path.join(out_dir, JSON_INDEX_FILENAME)
-
     atomic_write(path, link._asdict(extended=True))
 
 
 @enforce_types
-def parse_json_link_details(out_dir: str) -> Optional[Link]:
+def parse_json_link_details(out_dir: str, guess: Optional[bool]=False) -> Optional[Link]:
     """load the json link index from a given directory"""
     existing_index = os.path.join(out_dir, JSON_INDEX_FILENAME)
     if os.path.exists(existing_index):
         with open(existing_index, 'r', encoding='utf-8') as f:
             try:
                 link_json = pyjson.load(f)
-                return Link.from_json(link_json)
+                return Link.from_json(link_json, guess)
             except pyjson.JSONDecodeError:
                 pass
     return None

+ 31 - 6
archivebox/index/schema.py

@@ -1,6 +1,7 @@
 __package__ = 'archivebox.index'
 
 import os
+from pathlib import Path
 
 from datetime import datetime, timedelta
 
@@ -11,6 +12,8 @@ from dataclasses import dataclass, asdict, field, fields
 
 from ..system import get_dir_size
 
+from ..config import OUTPUT_DIR, ARCHIVE_DIR_NAME
+
 class ArchiveError(Exception):
     def __init__(self, message, hints=None):
         super().__init__(message)
@@ -51,7 +54,15 @@ class ArchiveResult:
             assert self.output
 
     @classmethod
-    def from_json(cls, json_info):
+    def guess_ts(_cls, dict_info):
+        from ..util import parse_date
+        parsed_timestamp = parse_date(dict_info["timestamp"])
+        start_ts = parsed_timestamp
+        end_ts = parsed_timestamp + timedelta(seconds=int(dict_info["duration"]))
+        return start_ts, end_ts
+
+    @classmethod
+    def from_json(cls, json_info, guess=False):
         from ..util import parse_date
 
         info = {
@@ -59,9 +70,23 @@ class ArchiveResult:
             for key, val in json_info.items()
             if key in cls.field_names()
         }
-        info['start_ts'] = parse_date(info['start_ts'])
-        info['end_ts'] = parse_date(info['end_ts'])
-        info['cmd_version'] = info.get('cmd_version')
+        if guess:
+            keys = info.keys()
+            if "start_ts" not in keys:
+                info["start_ts"], info["end_ts"] = cls.guess_ts(json_info)
+            else:
+                info['start_ts'] = parse_date(info['start_ts'])
+                info['end_ts'] = parse_date(info['end_ts'])
+            if "pwd" not in keys:
+                info["pwd"] = str(Path(OUTPUT_DIR) / ARCHIVE_DIR_NAME / json_info["timestamp"])
+            if "cmd_version" not in keys:
+                info["cmd_version"] = "Undefined"
+            if "cmd" not in keys:
+                info["cmd"] = []
+        else:
+            info['start_ts'] = parse_date(info['start_ts'])
+            info['end_ts'] = parse_date(info['end_ts'])
+            info['cmd_version'] = info.get('cmd_version')
         return cls(**info)
 
     def to_dict(self, *keys) -> dict:
@@ -182,7 +207,7 @@ class Link:
         return info
 
     @classmethod
-    def from_json(cls, json_info):
+    def from_json(cls, json_info, guess=False):
         from ..util import parse_date
         
         info = {
@@ -200,7 +225,7 @@ class Link:
             cast_history[method] = []
             for json_result in method_history:
                 assert isinstance(json_result, dict), 'Items in Link["history"][method] must be dicts'
-                cast_result = ArchiveResult.from_json(json_result)
+                cast_result = ArchiveResult.from_json(json_result, guess)
                 cast_history[method].append(cast_result)
 
         info['history'] = cast_history