
feat: Replace index.json with index.sql as the main index in init

Cristian · 5 years ago · commit 02f36b2096
2 changed files with 14 additions and 39 deletions
  1. archivebox/index/__init__.py (+8 −25)
  2. archivebox/main.py (+6 −14)

+ 8 - 25
archivebox/index/__init__.py

@@ -236,7 +236,7 @@ def timed_index_update(out_path: str):
 
 
 @enforce_types
-def write_main_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=False) -> None:
+def write_main_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=False, write_static: bool=False) -> None:
     """create index.html file for a given list of links"""
 
     log_indexing_process_started(len(links))
@@ -246,11 +246,12 @@ def write_main_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=
             write_sql_main_index(links, out_dir=out_dir)
             os.chmod(os.path.join(out_dir, SQL_INDEX_FILENAME), int(OUTPUT_PERMISSIONS, base=8)) # set here because we don't write it with atomic writes
 
-        with timed_index_update(os.path.join(out_dir, JSON_INDEX_FILENAME)):
-            write_json_main_index(links, out_dir=out_dir)
+        if write_static:
+            with timed_index_update(os.path.join(out_dir, JSON_INDEX_FILENAME)):
+                write_json_main_index(links, out_dir=out_dir)
 
-        with timed_index_update(os.path.join(out_dir, HTML_INDEX_FILENAME)):
-            write_html_main_index(links, out_dir=out_dir, finished=finished)
+            with timed_index_update(os.path.join(out_dir, HTML_INDEX_FILENAME)):
+                write_html_main_index(links, out_dir=out_dir, finished=finished)
     except (KeyboardInterrupt, SystemExit):
         stderr('[!] Warning: Still writing index to disk...', color='lightyellow')
         stderr('    Run archivebox init to fix any inconsistencies from an ungraceful exit.')
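
This hunk makes index.sqlite3 the only index that gets written unconditionally; the JSON and HTML indexes become opt-in behind the new write_static flag. A minimal sketch of the two call patterns, assuming a prepared List[Link] (the function and flag names come straight from this diff):

    from archivebox.index import write_main_index

    links = []  # assume a List[Link] produced by the parsers

    # Routine operation: only the SQL index (index.sqlite3) is rewritten.
    write_main_index(links, out_dir='/path/to/collection')

    # From `archivebox init`: also regenerate the static index.json / index.html.
    write_main_index(links, out_dir='/path/to/collection', finished=True, write_static=True)

Note the explicit os.chmod on the SQL file above: it is not written through the atomic-write helpers, so the configured OUTPUT_PERMISSIONS string is parsed as octal (int(..., base=8)) and applied manually.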
@@ -268,26 +269,8 @@ def load_main_index(out_dir: str=OUTPUT_DIR, warn: bool=True) -> List[Link]:
 
     all_links: List[Link] = []
     try:
-        all_links = list(parse_json_main_index(out_dir))
-        links_from_sql = list(parse_sql_main_index(out_dir))
-
-        json_urls = set(l.url for l in all_links)
-        sql_urls = set(l.url for l in links_from_sql)
-        only_in_sql = sql_urls - json_urls
-        only_in_json = json_urls - sql_urls
-
-        if only_in_json:
-            stderr('{red}[!] Warning: SQL index does not match JSON index!{reset}'.format(**ANSI))
-            if only_in_json:
-                stderr('    > Only in JSON: {}...'.format(', '.join(list(only_in_json)[:5])))
-            if only_in_sql:
-                stderr('    > Only in SQL: {}...'.format(', '.join(list(only_in_sql)[:5])))
-
-            stderr('    To repair the index and re-import any orphaned links run:')
-            stderr('        archivebox init')
-        if only_in_sql:
-            # meh, this is harmless, it'll get overwritten on next run anyway
-            pass
+        all_links = list(parse_sql_main_index(out_dir))
+
     except (KeyboardInterrupt, SystemExit):
         raise SystemExit(0)
 
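load_main_index now trusts index.sqlite3 alone instead of parsing both indexes and warning when the JSON and SQL URL sets disagree. Purely as illustration, reading URLs out of a SQLite index file could look like the sketch below; the table and column names are hypothetical, not ArchiveBox's actual schema (the real logic lives in parse_sql_main_index):

    import sqlite3
    from typing import Iterator

    def iter_index_urls(db_path: str) -> Iterator[str]:
        """Yield archived URLs from a SQLite index file (illustrative schema only)."""
        conn = sqlite3.connect(db_path)
        try:
            # 'snapshot' and 'url' are assumed names, not ArchiveBox's real tables.
            for (url,) in conn.execute('SELECT url FROM snapshot'):
                yield url
        finally:
            conn.close()
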

+ 6 - 14
archivebox/main.py

@@ -3,6 +3,7 @@ __package__ = 'archivebox'
 import os
 import sys
 import shutil
+from pathlib import Path
 
 from typing import Dict, List, Optional, Iterable, IO, Union
 from crontab import CronTab, CronSlices
@@ -252,7 +253,8 @@ def init(force: bool=False, out_dir: str=OUTPUT_DIR) -> None:
     """Initialize a new ArchiveBox collection in the current directory"""
     os.makedirs(out_dir, exist_ok=True)
     is_empty = not len(set(os.listdir(out_dir)) - ALLOWED_IN_OUTPUT_DIR)
-    existing_index = os.path.exists(os.path.join(out_dir, JSON_INDEX_FILENAME))
+
+    existing_index = (Path(out_dir) / SQL_INDEX_FILENAME).exists()
 
     if is_empty and not existing_index:
         print('{green}[+] Initializing a new ArchiveBox collection in this folder...{reset}'.format(**ANSI))
@@ -264,11 +266,11 @@ def init(force: bool=False, out_dir: str=OUTPUT_DIR) -> None:
         print('{green}------------------------------------------------------------------{reset}'.format(**ANSI))
     else:
         if force:
-            stderr('[!] This folder appears to already have files in it, but no index.json is present.', color='lightyellow')
+            stderr('[!] This folder appears to already have files in it, but no index.sqlite3 is present.', color='lightyellow')
             stderr('    Because --force was passed, ArchiveBox will initialize anyway (which may overwrite existing files).')
         else:
             stderr(
-                ("{red}[X] This folder appears to already have files in it, but no index.json is present.{reset}\n\n"
+                ("{red}[X] This folder appears to already have files in it, but no index.sqlite3 is present.{reset}\n\n"
                "    You must run init in a completely empty directory, or in an existing data folder.\n\n"
                "    {lightred}Hint:{reset} To import an existing data folder make sure to cd into the folder first, \n"
                "    then run 'archivebox init' to pick up where you left off.\n\n"
@@ -342,16 +344,6 @@ def init(force: bool=False, out_dir: str=OUTPUT_DIR) -> None:
         all_links.update(orphaned_json_links)
         print('    {lightyellow}√ Added {} orphaned links from existing JSON index...{reset}'.format(len(orphaned_json_links), **ANSI))
 
-    # Links in SQL index but not in main index
-    orphaned_sql_links = {
-        link.url: link
-        for link in parse_sql_main_index(out_dir)
-        if link.url not in all_links
-    }
-    if orphaned_sql_links:
-        all_links.update(orphaned_sql_links)
-        print('    {lightyellow}√ Added {} orphaned links from existing SQL index...{reset}'.format(len(orphaned_sql_links), **ANSI))
-
     # Links in data dir indexes but not in main index
     orphaned_data_dir_links = {
         link.url: link
@@ -376,7 +368,7 @@ def init(force: bool=False, out_dir: str=OUTPUT_DIR) -> None:
         print('        archivebox list --status=invalid')
 
 
-    write_main_index(list(all_links.values()), out_dir=out_dir)
+    write_main_index(list(all_links.values()), out_dir=out_dir, write_static=True)
 
     print('\n{green}------------------------------------------------------------------{reset}'.format(**ANSI))
     if existing_index:
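
A closing note on the orphan-recovery hunk above: with SQL as the index of record, a link that exists only in the SQL index is already in the main index by definition, so that recovery block could be deleted outright, while the JSON index and per-snapshot data dirs are still scanned. The surviving merge pattern is a dict update keyed by URL, along these lines (toy data, pattern mirrors the diff):

    # Merge links found in a secondary source into the main mapping, keyed by URL.
    all_links = {'https://example.com': '<Link for example.com>'}
    secondary = ['https://example.com', 'https://example.org']  # e.g. parsed from index.json

    orphaned = {url: url for url in secondary if url not in all_links}
    if orphaned:
        all_links.update(orphaned)
        print('√ Added {} orphaned links from the secondary index'.format(len(orphaned)))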