Browse Source

feat: Replace index.json with index.sql as the main index in init

Cristian 5 years ago
parent
commit
02f36b2096
2 changed files with 15 additions and 39 deletions
  1. 9 25
      archivebox/index/__init__.py
  2. 6 14
      archivebox/main.py

+ 9 - 25
archivebox/index/__init__.py

@@ -236,7 +236,7 @@ def timed_index_update(out_path: str):
 
 
 
 
 @enforce_types
 @enforce_types
-def write_main_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=False) -> None:
+def write_main_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=False, write_static: bool=False) -> None:
     """create index.html file for a given list of links"""
     """create index.html file for a given list of links"""
 
 
     log_indexing_process_started(len(links))
     log_indexing_process_started(len(links))
@@ -246,11 +246,12 @@ def write_main_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=
             write_sql_main_index(links, out_dir=out_dir)
             write_sql_main_index(links, out_dir=out_dir)
             os.chmod(os.path.join(out_dir, SQL_INDEX_FILENAME), int(OUTPUT_PERMISSIONS, base=8)) # set here because we don't write it with atomic writes
             os.chmod(os.path.join(out_dir, SQL_INDEX_FILENAME), int(OUTPUT_PERMISSIONS, base=8)) # set here because we don't write it with atomic writes
 
 
-        with timed_index_update(os.path.join(out_dir, JSON_INDEX_FILENAME)):
-            write_json_main_index(links, out_dir=out_dir)
+        if write_static:
+            with timed_index_update(os.path.join(out_dir, JSON_INDEX_FILENAME)):
+                write_json_main_index(links, out_dir=out_dir)
 
 
-        with timed_index_update(os.path.join(out_dir, HTML_INDEX_FILENAME)):
-            write_html_main_index(links, out_dir=out_dir, finished=finished)
+            with timed_index_update(os.path.join(out_dir, HTML_INDEX_FILENAME)):
+                write_html_main_index(links, out_dir=out_dir, finished=finished)
     except (KeyboardInterrupt, SystemExit):
     except (KeyboardInterrupt, SystemExit):
         stderr('[!] Warning: Still writing index to disk...', color='lightyellow')
         stderr('[!] Warning: Still writing index to disk...', color='lightyellow')
         stderr('    Run archivebox init to fix any inconsistencies from an ungraceful exit.')
         stderr('    Run archivebox init to fix any inconsistencies from an ungraceful exit.')
@@ -268,26 +269,9 @@ def load_main_index(out_dir: str=OUTPUT_DIR, warn: bool=True) -> List[Link]:
 
 
     all_links: List[Link] = []
     all_links: List[Link] = []
     try:
     try:
-        all_links = list(parse_json_main_index(out_dir))
-        links_from_sql = list(parse_sql_main_index(out_dir))
-
-        json_urls = set(l.url for l in all_links)
-        sql_urls = set(l.url for l in links_from_sql)
-        only_in_sql = sql_urls - json_urls
-        only_in_json = json_urls - sql_urls
-
-        if only_in_json:
-            stderr('{red}[!] Warning: SQL index does not match JSON index!{reset}'.format(**ANSI))
-            if only_in_json:
-                stderr('    > Only in JSON: {}...'.format(', '.join(list(only_in_json)[:5])))
-            if only_in_sql:
-                stderr('    > Only in SQL: {}...'.format(', '.join(list(only_in_sql)[:5])))
-
-            stderr('    To repair the index and re-import any orphaned links run:')
-            stderr('        archivebox init')
-        if only_in_sql:
-            # meh, this harmless, it'll get overwritten on next run anyway
-            pass
+        all_links = list(parse_sql_main_index(out_dir))
+        list(parse_sql_main_index(out_dir))
+
     except (KeyboardInterrupt, SystemExit):
     except (KeyboardInterrupt, SystemExit):
         raise SystemExit(0)
         raise SystemExit(0)
 
 

+ 6 - 14
archivebox/main.py

@@ -3,6 +3,7 @@ __package__ = 'archivebox'
 import os
 import os
 import sys
 import sys
 import shutil
 import shutil
+from pathlib import Path
 
 
 from typing import Dict, List, Optional, Iterable, IO, Union
 from typing import Dict, List, Optional, Iterable, IO, Union
 from crontab import CronTab, CronSlices
 from crontab import CronTab, CronSlices
@@ -252,7 +253,8 @@ def init(force: bool=False, out_dir: str=OUTPUT_DIR) -> None:
     """Initialize a new ArchiveBox collection in the current directory"""
     """Initialize a new ArchiveBox collection in the current directory"""
     os.makedirs(out_dir, exist_ok=True)
     os.makedirs(out_dir, exist_ok=True)
     is_empty = not len(set(os.listdir(out_dir)) - ALLOWED_IN_OUTPUT_DIR)
     is_empty = not len(set(os.listdir(out_dir)) - ALLOWED_IN_OUTPUT_DIR)
-    existing_index = os.path.exists(os.path.join(out_dir, JSON_INDEX_FILENAME))
+
+    existing_index = (Path(out_dir) / SQL_INDEX_FILENAME).exists()
 
 
     if is_empty and not existing_index:
     if is_empty and not existing_index:
         print('{green}[+] Initializing a new ArchiveBox collection in this folder...{reset}'.format(**ANSI))
         print('{green}[+] Initializing a new ArchiveBox collection in this folder...{reset}'.format(**ANSI))
@@ -264,11 +266,11 @@ def init(force: bool=False, out_dir: str=OUTPUT_DIR) -> None:
         print('{green}------------------------------------------------------------------{reset}'.format(**ANSI))
         print('{green}------------------------------------------------------------------{reset}'.format(**ANSI))
     else:
     else:
         if force:
         if force:
-            stderr('[!] This folder appears to already have files in it, but no index.json is present.', color='lightyellow')
+            stderr('[!] This folder appears to already have files in it, but no index.sqlite3 is present.', color='lightyellow')
             stderr('    Because --force was passed, ArchiveBox will initialize anyway (which may overwrite existing files).')
             stderr('    Because --force was passed, ArchiveBox will initialize anyway (which may overwrite existing files).')
         else:
         else:
             stderr(
             stderr(
-                ("{red}[X] This folder appears to already have files in it, but no index.json is present.{reset}\n\n"
+                ("{red}[X] This folder appears to already have files in it, but no index.sqlite3 is present.{reset}\n\n"
                 "    You must run init in a completely empty directory, or an existing data folder.\n\n"
                 "    You must run init in a completely empty directory, or an existing data folder.\n\n"
                 "    {lightred}Hint:{reset} To import an existing data folder make sure to cd into the folder first, \n"
                 "    {lightred}Hint:{reset} To import an existing data folder make sure to cd into the folder first, \n"
                 "    then run 'archivebox init' to pick up where you left off.\n\n"
                 "    then run 'archivebox init' to pick up where you left off.\n\n"
@@ -342,16 +344,6 @@ def init(force: bool=False, out_dir: str=OUTPUT_DIR) -> None:
         all_links.update(orphaned_json_links)
         all_links.update(orphaned_json_links)
         print('    {lightyellow}√ Added {} orphaned links from existing JSON index...{reset}'.format(len(orphaned_json_links), **ANSI))
         print('    {lightyellow}√ Added {} orphaned links from existing JSON index...{reset}'.format(len(orphaned_json_links), **ANSI))
 
 
-    # Links in SQL index but not in main index
-    orphaned_sql_links = {
-        link.url: link
-        for link in parse_sql_main_index(out_dir)
-        if link.url not in all_links
-    }
-    if orphaned_sql_links:
-        all_links.update(orphaned_sql_links)
-        print('    {lightyellow}√ Added {} orphaned links from existing SQL index...{reset}'.format(len(orphaned_sql_links), **ANSI))
-
     # Links in data dir indexes but not in main index
     # Links in data dir indexes but not in main index
     orphaned_data_dir_links = {
     orphaned_data_dir_links = {
         link.url: link
         link.url: link
@@ -376,7 +368,7 @@ def init(force: bool=False, out_dir: str=OUTPUT_DIR) -> None:
         print('        archivebox list --status=invalid')
         print('        archivebox list --status=invalid')
 
 
 
 
-    write_main_index(list(all_links.values()), out_dir=out_dir)
+    write_main_index(list(all_links.values()), out_dir=out_dir, write_static=True)
 
 
     print('\n{green}------------------------------------------------------------------{reset}'.format(**ANSI))
     print('\n{green}------------------------------------------------------------------{reset}'.format(**ANSI))
     if existing_index:
     if existing_index: