Sfoglia il codice sorgente

pathlib with / syntax for config, index

apkallum 5 anni fa
parent
commit
b99784b919
3 file modificati con 65 aggiunte e 64 eliminazioni
  1. 46 46
      archivebox/config/__init__.py
  2. 16 16
      archivebox/index/__init__.py
  3. 3 2
      archivebox/logging_util.py

+ 46 - 46
archivebox/config/__init__.py

@@ -222,17 +222,17 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = {
     'USER':                     {'default': lambda c: getpass.getuser() or os.getlogin()},
     'ANSI':                     {'default': lambda c: DEFAULT_CLI_COLORS if c['USE_COLOR'] else {k: '' for k in DEFAULT_CLI_COLORS.keys()}},
 
-    'REPO_DIR':                 {'default': lambda c: Path(os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', '..')))},
-    'PYTHON_DIR':               {'default': lambda c: Path.joinpath(Path(c['REPO_DIR']), PYTHON_DIR_NAME)},
-    'TEMPLATES_DIR':            {'default': lambda c: Path.joinpath(c['PYTHON_DIR'], TEMPLATES_DIR_NAME, 'legacy')},
-
-    'OUTPUT_DIR':               {'default': lambda c: Path(os.path.abspath(os.path.expanduser(c['OUTPUT_DIR'])) if c['OUTPUT_DIR'] else os.path.abspath(os.curdir))},
-    'ARCHIVE_DIR':              {'default': lambda c: Path.joinpath(c['OUTPUT_DIR'], ARCHIVE_DIR_NAME)},
-    'SOURCES_DIR':              {'default': lambda c: Path.joinpath(c['OUTPUT_DIR'], SOURCES_DIR_NAME)},
-    'LOGS_DIR':                 {'default': lambda c: Path.joinpath(c['OUTPUT_DIR'], LOGS_DIR_NAME)},
-    'CONFIG_FILE':              {'default': lambda c: os.path.abspath(os.path.expanduser(c['CONFIG_FILE'])) if c['CONFIG_FILE'] else Path.joinpath(c['OUTPUT_DIR'], CONFIG_FILENAME)},
-    'COOKIES_FILE':             {'default': lambda c: c['COOKIES_FILE'] and os.path.abspath(os.path.expanduser(c['COOKIES_FILE']))},
-    'CHROME_USER_DATA_DIR':     {'default': lambda c: find_chrome_data_dir() if c['CHROME_USER_DATA_DIR'] is None else (os.path.abspath(os.path.expanduser(c['CHROME_USER_DATA_DIR'])) or None)},
+    'REPO_DIR':                 {'default': lambda c: Path(__file__).resolve().parent.parent.parent},
+    'PYTHON_DIR':               {'default': lambda c: c['REPO_DIR'] / PYTHON_DIR_NAME},
+    'TEMPLATES_DIR':            {'default': lambda c: c['PYTHON_DIR'] / TEMPLATES_DIR_NAME / 'legacy'},
+
+    'OUTPUT_DIR':               {'default': lambda c: Path.home() / c['OUTPUT_DIR'] if c['OUTPUT_DIR'] else Path(os.curdir).resolve()},
+    'ARCHIVE_DIR':              {'default': lambda c: c['OUTPUT_DIR'] / ARCHIVE_DIR_NAME},
+    'SOURCES_DIR':              {'default': lambda c: c['OUTPUT_DIR'] / SOURCES_DIR_NAME},
+    'LOGS_DIR':                 {'default': lambda c: c['OUTPUT_DIR'] / LOGS_DIR_NAME},
+    'CONFIG_FILE':              {'default': lambda c: Path.home() / c['CONFIG_FILE'] if c['CONFIG_FILE'] else c['OUTPUT_DIR'] / CONFIG_FILENAME},
+    'COOKIES_FILE':             {'default': lambda c: c['COOKIES_FILE'] and Path.home() / c['COOKIES_FILE']},
+    'CHROME_USER_DATA_DIR':     {'default': lambda c: find_chrome_data_dir() if c['CHROME_USER_DATA_DIR'] is None else Path.home() / c['CHROME_USER_DATA_DIR'] or None},
     'URL_BLACKLIST_PTN':        {'default': lambda c: c['URL_BLACKLIST'] and re.compile(c['URL_BLACKLIST'] or '', re.IGNORECASE | re.UNICODE | re.MULTILINE)},
 
     'ARCHIVEBOX_BINARY':        {'default': lambda c: sys.argv[0]},
@@ -347,9 +347,9 @@ def load_config_val(key: str,
 def load_config_file(out_dir: str=None) -> Optional[Dict[str, str]]:
     """load the ini-formatted config file from OUTPUT_DIR/Archivebox.conf"""
 
-    out_dir = out_dir or os.path.abspath(os.getenv('OUTPUT_DIR', '.'))
-    config_path = Path.joinpath(Path(out_dir), CONFIG_FILENAME)
-    if os.path.exists(config_path):
+    out_dir = out_dir or Path(os.getenv('OUTPUT_DIR', '.')).resolve()
+    config_path = Path(out_dir) / CONFIG_FILENAME
+    if config_path.exists():
         config_file = ConfigParser()
         config_file.optionxform = str 
         config_file.read(config_path)
@@ -370,10 +370,10 @@ def write_config_file(config: Dict[str, str], out_dir: str=None) -> ConfigDict:
 
     from ..system import atomic_write
 
-    out_dir = out_dir or os.path.abspath(os.getenv('OUTPUT_DIR', '.'))
-    config_path = Path.joinpath(out_dir, CONFIG_FILENAME)
+    out_dir = out_dir or Path(os.getenv('OUTPUT_DIR', '.')).resolve()
+    config_path = Path(out_dir) /  CONFIG_FILENAME
     
-    if not os.path.exists(config_path):
+    if not config_path.exists():
         atomic_write(config_path, CONFIG_HEADER)
 
     config_file = ConfigParser()
@@ -593,8 +593,8 @@ def find_chrome_data_dir() -> Optional[str]:
         '~/.config/google-chrome-dev',
     )
     for path in default_profile_paths:
-        full_path = os.path.expanduser(path)
-        if os.path.exists(full_path):
+        full_path = Path.home() / path
+        if full_path.exists():
             return full_path
     return None
 
@@ -609,19 +609,19 @@ def wget_supports_compression(config):
 def get_code_locations(config: ConfigDict) -> SimpleConfigValueDict:
     return {
         'REPO_DIR': {
-            'path': os.path.abspath(config['REPO_DIR']),
+            'path': config['REPO_DIR'].resolve(),
             'enabled': True,
-            'is_valid': os.path.exists(Path.joinpath(config['REPO_DIR'], 'archivebox')),
+            'is_valid': (config['REPO_DIR'] / 'archivebox').exists(),
         },
         'PYTHON_DIR': {
-            'path': os.path.abspath(config['PYTHON_DIR']),
+            'path': (config['PYTHON_DIR']).resolve(),
             'enabled': True,
-            'is_valid': os.path.exists(Path.joinpath(config['PYTHON_DIR'], '__main__.py')),
+            'is_valid': (config['PYTHON_DIR'] / '__main__.py').exists(),
         },
         'TEMPLATES_DIR': {
-            'path': os.path.abspath(config['TEMPLATES_DIR']),
+            'path': (config['TEMPLATES_DIR']).resolve(),
             'enabled': True,
-            'is_valid': os.path.exists(Path.joinpath(config['TEMPLATES_DIR'], 'static')),
+            'is_valid': (config['TEMPLATES_DIR'] / 'static').exists(),
         },
     }
 
@@ -643,44 +643,44 @@ def get_external_locations(config: ConfigDict) -> ConfigValue:
 def get_data_locations(config: ConfigDict) -> ConfigValue:
     return {
         'OUTPUT_DIR': {
-            'path': os.path.abspath(config['OUTPUT_DIR']),
+            'path': config['OUTPUT_DIR'].resolve(),
             'enabled': True,
-            'is_valid': os.path.exists(Path.joinpath(config['OUTPUT_DIR'], JSON_INDEX_FILENAME)),
+            'is_valid': (config['OUTPUT_DIR'] / JSON_INDEX_FILENAME).exists(),
         },
         'SOURCES_DIR': {
-            'path': os.path.abspath(config['SOURCES_DIR']),
+            'path': config['SOURCES_DIR'].resolve(),
             'enabled': True,
-            'is_valid': os.path.exists(config['SOURCES_DIR']),
+            'is_valid': config['SOURCES_DIR'].exists(),
         },
         'LOGS_DIR': {
-            'path': os.path.abspath(config['LOGS_DIR']),
+            'path': config['LOGS_DIR'].resolve(),
             'enabled': True,
-            'is_valid': os.path.exists(config['LOGS_DIR']),
+            'is_valid': config['LOGS_DIR'].exists(),
         },
         'ARCHIVE_DIR': {
-            'path': os.path.abspath(config['ARCHIVE_DIR']),
+            'path': config['ARCHIVE_DIR'].resolve(),
             'enabled': True,
-            'is_valid': os.path.exists(config['ARCHIVE_DIR']),
+            'is_valid': config['ARCHIVE_DIR'].exists(),
         },
         'CONFIG_FILE': {
-            'path': os.path.abspath(config['CONFIG_FILE']),
+            'path': config['CONFIG_FILE'].resolve(),
             'enabled': True,
-            'is_valid': os.path.exists(config['CONFIG_FILE']),
+            'is_valid': config['CONFIG_FILE'].exists(),
         },
         'SQL_INDEX': {
-            'path': os.path.abspath(Path.joinpath(config['OUTPUT_DIR'], SQL_INDEX_FILENAME)),
+            'path': (config['OUTPUT_DIR'] / SQL_INDEX_FILENAME).resolve(),
             'enabled': True,
-            'is_valid': os.path.exists(Path.joinpath(config['OUTPUT_DIR'], SQL_INDEX_FILENAME)),
+            'is_valid': (config['OUTPUT_DIR'] / SQL_INDEX_FILENAME).exists(),
         },
         'JSON_INDEX': {
-            'path': os.path.abspath(Path.joinpath(config['OUTPUT_DIR'], JSON_INDEX_FILENAME)),
+            'path': (config['OUTPUT_DIR'] / JSON_INDEX_FILENAME).resolve(),
             'enabled': True,
-            'is_valid': os.path.exists(Path.joinpath(config['OUTPUT_DIR'], JSON_INDEX_FILENAME)),
+            'is_valid': (config['OUTPUT_DIR'] / JSON_INDEX_FILENAME).exists(),
         },
         'HTML_INDEX': {
-            'path': os.path.abspath(Path.joinpath(config['OUTPUT_DIR'], HTML_INDEX_FILENAME)),
+            'path': (config['OUTPUT_DIR'] / HTML_INDEX_FILENAME).resolve(),
             'enabled': True,
-            'is_valid': os.path.exists(Path.joinpath(config['OUTPUT_DIR'], HTML_INDEX_FILENAME)),
+            'is_valid': (config['OUTPUT_DIR'] / HTML_INDEX_FILENAME).exists(),
         },
     }
 
@@ -909,9 +909,9 @@ def check_data_folder(out_dir: Optional[str]=None, config: ConfigDict=CONFIG) ->
         stderr('        archivebox init')
         raise SystemExit(3)
 
-    sources_dir = Path.joinpath(output_dir, SOURCES_DIR_NAME)
-    if not os.path.exists(sources_dir):
-        os.makedirs(sources_dir)
+    sources_dir = Path(output_dir) / SOURCES_DIR_NAME
+    if not sources_dir.exists():
+        sources_dir.mkdir()
 
 
 
@@ -930,8 +930,8 @@ def setup_django(out_dir: str=None, check_db=False, config: ConfigDict=CONFIG) -
         django.setup()
 
         if check_db:
-            sql_index_path = Path.joinpath(output_dir, SQL_INDEX_FILENAME)
-            assert os.path.exists(sql_index_path), (
+            sql_index_path = Path(output_dir) / SQL_INDEX_FILENAME
+            assert sql_index_path.exists(), (
                 f'No database file {SQL_INDEX_FILENAME} found in OUTPUT_DIR: {config["OUTPUT_DIR"]}')
     except KeyboardInterrupt:
         raise SystemExit(2)

+ 16 - 16
archivebox/index/__init__.py

@@ -212,7 +212,7 @@ def lowest_uniq_timestamp(used_timestamps: OrderedDict, timestamp: str) -> str:
 
 @contextmanager
 @enforce_types
-def timed_index_update(out_path: str):
+def timed_index_update(out_path: Path):
     log_indexing_started(out_path)
     timer = TimedProgress(TIMEOUT * 2, prefix='      ')
     try:
@@ -220,7 +220,7 @@ def timed_index_update(out_path: str):
     finally:
         timer.end()
 
-    assert os.path.exists(out_path), f'Failed to write index file: {out_path}'
+    assert out_path.exists(), f'Failed to write index file: {out_path}'
     log_indexing_finished(out_path)
 
 
@@ -231,27 +231,27 @@ def write_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR, finished: bool
     log_indexing_process_started(len(links))
 
     try:
-        with timed_index_update(os.path.join(out_dir, SQL_INDEX_FILENAME)):
+        with timed_index_update(out_dir / SQL_INDEX_FILENAME):
             write_sql_main_index(links, out_dir=out_dir)
-            os.chmod(os.path.join(out_dir, SQL_INDEX_FILENAME), int(OUTPUT_PERMISSIONS, base=8)) # set here because we don't write it with atomic writes
+            os.chmod(out_dir / SQL_INDEX_FILENAME, int(OUTPUT_PERMISSIONS, base=8)) # set here because we don't write it with atomic writes
 
         if finished:
             write_static_index(links, out_dir=out_dir)
     except (KeyboardInterrupt, SystemExit):
         stderr('[!] Warning: Still writing index to disk...', color='lightyellow')
         stderr('    Run archivebox init to fix any inconsisntencies from an ungraceful exit.')
-        with timed_index_update(os.path.join(out_dir, SQL_INDEX_FILENAME)):
+        with timed_index_update(out_dir / SQL_INDEX_FILENAME):
             write_sql_main_index(links, out_dir=out_dir)
-            os.chmod(os.path.join(out_dir, SQL_INDEX_FILENAME), int(OUTPUT_PERMISSIONS, base=8)) # set here because we don't write it with atomic writes
+            os.chmod(out_dir / SQL_INDEX_FILENAME, int(OUTPUT_PERMISSIONS, base=8)) # set here because we don't write it with atomic writes
         raise SystemExit(0)
 
     log_indexing_process_finished()
 
 @enforce_types
-def write_static_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
-    with timed_index_update(os.path.join(out_dir, JSON_INDEX_FILENAME)):
+def write_static_index(links: List[Link], out_dir: Path=OUTPUT_DIR) -> None:
+    with timed_index_update(str(out_dir / JSON_INDEX_FILENAME)):
         write_json_main_index(links)
-    with timed_index_update(os.path.join(out_dir, HTML_INDEX_FILENAME)):
+    with timed_index_update(str(out_dir / HTML_INDEX_FILENAME)):
         write_html_main_index(links, out_dir=out_dir, finished=True)
 
 @enforce_types
@@ -273,8 +273,8 @@ def load_main_index(out_dir: Path=OUTPUT_DIR, warn: bool=True) -> List[Link]:
 
 @enforce_types
 def load_main_index_meta(out_dir: Path=OUTPUT_DIR) -> Optional[dict]:
-    index_path = os.path.join(out_dir, JSON_INDEX_FILENAME)
-    if os.path.exists(index_path):
+    index_path = out_dir / JSON_INDEX_FILENAME
+    if index_path.exists():
         with open(index_path, 'r', encoding='utf-8') as f:
             meta_dict = pyjson.load(f)
             meta_dict.pop('links')
@@ -422,7 +422,7 @@ def get_present_folders(links, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[L
 
     all_folders = {}
 
-    for entry in (Path(out_dir) / ARCHIVE_DIR_NAME).iterdir():
+    for entry in (out_dir / ARCHIVE_DIR_NAME).iterdir():
         if entry.is_dir():
             link = None
             try:
@@ -584,9 +584,9 @@ def is_unarchived(link: Link) -> bool:
 def fix_invalid_folder_locations(out_dir: Path=OUTPUT_DIR) -> Tuple[List[str], List[str]]:
     fixed = []
     cant_fix = []
-    for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
+    for entry in os.scandir(out_dir / ARCHIVE_DIR_NAME):
         if entry.is_dir(follow_symlinks=True):
-            if os.path.exists(os.path.join(entry.path, 'index.json')):
+            if (Path(entry.path) / 'index.json').exists():
                 try:
                     link = parse_json_link_details(entry.path)
                 except KeyError:
@@ -595,8 +595,8 @@ def fix_invalid_folder_locations(out_dir: Path=OUTPUT_DIR) -> Tuple[List[str], L
                     continue
 
                 if not entry.path.endswith(f'/{link.timestamp}'):
-                    dest = os.path.join(out_dir, ARCHIVE_DIR_NAME, link.timestamp)
-                    if os.path.exists(dest):
+                    dest = out_dir / ARCHIVE_DIR_NAME / link.timestamp
+                    if dest.exists():
                         cant_fix.append(entry.path)
                     else:
                         shutil.move(entry.path, dest)

+ 3 - 2
archivebox/logging_util.py

@@ -6,6 +6,7 @@ import sys
 import time
 import argparse
 from multiprocessing import Process
+from pathlib import Path
 
 from datetime import datetime
 from dataclasses import dataclass
@@ -442,11 +443,11 @@ def log_shell_welcome_msg():
 ### Helpers
 
 @enforce_types
-def pretty_path(path: str) -> str:
+def pretty_path(path: Union[Path, str]) -> str:
     """convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc"""
     pwd = os.path.abspath('.')
     # parent = os.path.abspath(os.path.join(pwd, os.path.pardir))
-    return path.replace(pwd + '/', './')
+    return str(path).replace(pwd + '/', './')
 
 
 @enforce_types