| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138 |
- """
- ArchiveBox config definitons (including defaults and dynamic config options).
- Config Usage Example:
- archivebox config --set MEDIA_TIMEOUT=600
- env MEDIA_TIMEOUT=600 USE_COLOR=False ... archivebox [subcommand] ...
- Config Precedence Order:
- 1. cli args (--update-all / --index-only / etc.)
- 2. shell environment vars (env USE_COLOR=False archivebox add '...')
- 3. config file (echo "SAVE_FAVICON=False" >> ArchiveBox.conf)
- 4. defaults (defined below in Python)
- Documentation:
- https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration
- """
- __package__ = 'archivebox'
- import os
- import io
- import re
- import sys
- import json
- import getpass
- import platform
- import shutil
- import sqlite3
- import django
- from hashlib import md5
- from pathlib import Path
- from datetime import datetime
- from typing import Optional, Type, Tuple, Dict, Union, List
- from subprocess import run, PIPE, DEVNULL
- from configparser import ConfigParser
- from collections import defaultdict
- from .config_stubs import (
- SimpleConfigValueDict,
- ConfigValue,
- ConfigDict,
- ConfigDefaultValue,
- ConfigDefaultDict,
- )
- ############################### Config Schema ##################################
- CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
- 'SHELL_CONFIG': {
- 'IS_TTY': {'type': bool, 'default': lambda _: sys.stdout.isatty()},
- 'USE_COLOR': {'type': bool, 'default': lambda c: c['IS_TTY']},
- 'SHOW_PROGRESS': {'type': bool, 'default': lambda c: (c['IS_TTY'] and platform.system() != 'Darwin')}, # progress bars are buggy on mac, disable for now
- 'IN_DOCKER': {'type': bool, 'default': False},
- # TODO: 'SHOW_HINTS': {'type: bool, 'default': True},
- },
- 'GENERAL_CONFIG': {
- 'OUTPUT_DIR': {'type': str, 'default': None},
- 'CONFIG_FILE': {'type': str, 'default': None},
- 'ONLY_NEW': {'type': bool, 'default': True},
- 'TIMEOUT': {'type': int, 'default': 60},
- 'MEDIA_TIMEOUT': {'type': int, 'default': 3600},
- 'OUTPUT_PERMISSIONS': {'type': str, 'default': '755'},
- 'RESTRICT_FILE_NAMES': {'type': str, 'default': 'windows'},
- 'URL_BLACKLIST': {'type': str, 'default': r'\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$'}, # to avoid downloading code assets as their own pages
- },
- 'SERVER_CONFIG': {
- 'SECRET_KEY': {'type': str, 'default': None},
- 'BIND_ADDR': {'type': str, 'default': lambda c: ['127.0.0.1:8000', '0.0.0.0:8000'][c['IN_DOCKER']]},
- 'ALLOWED_HOSTS': {'type': str, 'default': '*'},
- 'DEBUG': {'type': bool, 'default': False},
- 'PUBLIC_INDEX': {'type': bool, 'default': True},
- 'PUBLIC_SNAPSHOTS': {'type': bool, 'default': True},
- 'PUBLIC_ADD_VIEW': {'type': bool, 'default': False},
- 'FOOTER_INFO': {'type': str, 'default': 'Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests.'},
- 'SNAPSHOTS_PER_PAGE': {'type': int, 'default': 40},
- 'CUSTOM_TEMPLATES_DIR': {'type': str, 'default': None}
- },
- 'ARCHIVE_METHOD_TOGGLES': {
- 'SAVE_TITLE': {'type': bool, 'default': True, 'aliases': ('FETCH_TITLE',)},
- 'SAVE_FAVICON': {'type': bool, 'default': True, 'aliases': ('FETCH_FAVICON',)},
- 'SAVE_WGET': {'type': bool, 'default': True, 'aliases': ('FETCH_WGET',)},
- 'SAVE_WGET_REQUISITES': {'type': bool, 'default': True, 'aliases': ('FETCH_WGET_REQUISITES',)},
- 'SAVE_SINGLEFILE': {'type': bool, 'default': True, 'aliases': ('FETCH_SINGLEFILE',)},
- 'SAVE_READABILITY': {'type': bool, 'default': True, 'aliases': ('FETCH_READABILITY',)},
- 'SAVE_MERCURY': {'type': bool, 'default': True, 'aliases': ('FETCH_MERCURY',)},
- 'SAVE_PDF': {'type': bool, 'default': True, 'aliases': ('FETCH_PDF',)},
- 'SAVE_SCREENSHOT': {'type': bool, 'default': True, 'aliases': ('FETCH_SCREENSHOT',)},
- 'SAVE_DOM': {'type': bool, 'default': True, 'aliases': ('FETCH_DOM',)},
- 'SAVE_HEADERS': {'type': bool, 'default': True, 'aliases': ('FETCH_HEADERS',)},
- 'SAVE_WARC': {'type': bool, 'default': True, 'aliases': ('FETCH_WARC',)},
- 'SAVE_GIT': {'type': bool, 'default': True, 'aliases': ('FETCH_GIT',)},
- 'SAVE_MEDIA': {'type': bool, 'default': True, 'aliases': ('FETCH_MEDIA',)},
- 'SAVE_ARCHIVE_DOT_ORG': {'type': bool, 'default': True, 'aliases': ('SUBMIT_ARCHIVE_DOT_ORG',)},
- },
- 'ARCHIVE_METHOD_OPTIONS': {
- 'RESOLUTION': {'type': str, 'default': '1440,2000', 'aliases': ('SCREENSHOT_RESOLUTION',)},
- 'GIT_DOMAINS': {'type': str, 'default': 'github.com,bitbucket.org,gitlab.com,gist.github.com'},
- 'CHECK_SSL_VALIDITY': {'type': bool, 'default': True},
- 'MEDIA_MAX_SIZE': {'type': str, 'default': '750m'},
- 'CURL_USER_AGENT': {'type': str, 'default': 'ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) curl/{CURL_VERSION}'},
- 'WGET_USER_AGENT': {'type': str, 'default': 'ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) wget/{WGET_VERSION}'},
- 'CHROME_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'},
- 'COOKIES_FILE': {'type': str, 'default': None},
- 'CHROME_USER_DATA_DIR': {'type': str, 'default': None},
- 'CHROME_HEADLESS': {'type': bool, 'default': True},
- 'CHROME_SANDBOX': {'type': bool, 'default': lambda c: not c['IN_DOCKER']},
- 'YOUTUBEDL_ARGS': {'type': list, 'default': lambda c: ['--write-description',
- '--write-info-json',
- '--write-annotations',
- '--write-thumbnail',
- '--no-call-home',
- '--all-subs',
- '--yes-playlist',
- '--continue',
- '--ignore-errors',
- '--geo-bypass',
- '--add-metadata',
- '--max-filesize={}'.format(c['MEDIA_MAX_SIZE']),
- ]},
-
- 'WGET_ARGS': {'type': list, 'default': ['--no-verbose',
- '--adjust-extension',
- '--convert-links',
- '--force-directories',
- '--backup-converted',
- '--span-hosts',
- '--no-parent',
- '-e', 'robots=off',
- ]},
- 'CURL_ARGS': {'type': list, 'default': ['--silent',
- '--location',
- '--compressed'
- ]},
- 'GIT_ARGS': {'type': list, 'default': ['--recursive']},
- },
- 'SEARCH_BACKEND_CONFIG' : {
- 'USE_INDEXING_BACKEND': {'type': bool, 'default': True},
- 'USE_SEARCHING_BACKEND': {'type': bool, 'default': True},
- 'SEARCH_BACKEND_ENGINE': {'type': str, 'default': 'ripgrep'},
- 'SEARCH_BACKEND_HOST_NAME': {'type': str, 'default': 'localhost'},
- 'SEARCH_BACKEND_PORT': {'type': int, 'default': 1491},
- 'SEARCH_BACKEND_PASSWORD': {'type': str, 'default': 'SecretPassword'},
- # SONIC
- 'SONIC_COLLECTION': {'type': str, 'default': 'archivebox'},
- 'SONIC_BUCKET': {'type': str, 'default': 'snapshots'},
- },
- 'DEPENDENCY_CONFIG': {
- 'USE_CURL': {'type': bool, 'default': True},
- 'USE_WGET': {'type': bool, 'default': True},
- 'USE_SINGLEFILE': {'type': bool, 'default': True},
- 'USE_READABILITY': {'type': bool, 'default': True},
- 'USE_MERCURY': {'type': bool, 'default': True},
- 'USE_GIT': {'type': bool, 'default': True},
- 'USE_CHROME': {'type': bool, 'default': True},
- 'USE_NODE': {'type': bool, 'default': True},
- 'USE_YOUTUBEDL': {'type': bool, 'default': True},
- 'USE_RIPGREP': {'type': bool, 'default': True},
-
- 'CURL_BINARY': {'type': str, 'default': 'curl'},
- 'GIT_BINARY': {'type': str, 'default': 'git'},
- 'WGET_BINARY': {'type': str, 'default': 'wget'},
- 'SINGLEFILE_BINARY': {'type': str, 'default': lambda c: bin_path('single-file')},
- 'READABILITY_BINARY': {'type': str, 'default': lambda c: bin_path('readability-extractor')},
- 'MERCURY_BINARY': {'type': str, 'default': lambda c: bin_path('mercury-parser')},
- 'YOUTUBEDL_BINARY': {'type': str, 'default': 'youtube-dl'},
- 'NODE_BINARY': {'type': str, 'default': 'node'},
- 'RIPGREP_BINARY': {'type': str, 'default': 'rg'},
- 'CHROME_BINARY': {'type': str, 'default': None},
- 'POCKET_CONSUMER_KEY': {'type': str, 'default': None},
- 'POCKET_ACCESS_TOKENS': {'type': dict, 'default': {}},
- },
- }
- ########################## Backwards-Compatibility #############################
- # for backwards compatibility with old config files, check old/deprecated names for each key
- CONFIG_ALIASES = {
- alias: key
- for section in CONFIG_SCHEMA.values()
- for key, default in section.items()
- for alias in default.get('aliases', ())
- }
- USER_CONFIG = {key for section in CONFIG_SCHEMA.values() for key in section.keys()}
- def get_real_name(key: str) -> str:
- """get the current canonical name for a given deprecated config key"""
- return CONFIG_ALIASES.get(key.upper().strip(), key.upper().strip())
- ################################ Constants #####################################
- PACKAGE_DIR_NAME = 'archivebox'
- TEMPLATES_DIR_NAME = 'templates'
- ARCHIVE_DIR_NAME = 'archive'
- SOURCES_DIR_NAME = 'sources'
- LOGS_DIR_NAME = 'logs'
- SQL_INDEX_FILENAME = 'index.sqlite3'
- JSON_INDEX_FILENAME = 'index.json'
- HTML_INDEX_FILENAME = 'index.html'
- ROBOTS_TXT_FILENAME = 'robots.txt'
- FAVICON_FILENAME = 'favicon.ico'
- CONFIG_FILENAME = 'ArchiveBox.conf'
- DEFAULT_CLI_COLORS = {
- 'reset': '\033[00;00m',
- 'lightblue': '\033[01;30m',
- 'lightyellow': '\033[01;33m',
- 'lightred': '\033[01;35m',
- 'red': '\033[01;31m',
- 'green': '\033[01;32m',
- 'blue': '\033[01;34m',
- 'white': '\033[01;37m',
- 'black': '\033[01;30m',
- }
- ANSI = {k: '' for k in DEFAULT_CLI_COLORS.keys()}
- COLOR_DICT = defaultdict(lambda: [(0, 0, 0), (0, 0, 0)], {
- '00': [(0, 0, 0), (0, 0, 0)],
- '30': [(0, 0, 0), (0, 0, 0)],
- '31': [(255, 0, 0), (128, 0, 0)],
- '32': [(0, 200, 0), (0, 128, 0)],
- '33': [(255, 255, 0), (128, 128, 0)],
- '34': [(0, 0, 255), (0, 0, 128)],
- '35': [(255, 0, 255), (128, 0, 128)],
- '36': [(0, 255, 255), (0, 128, 128)],
- '37': [(255, 255, 255), (255, 255, 255)],
- })
- STATICFILE_EXTENSIONS = {
- # 99.999% of the time, URLs ending in these extensions are static files
- # that can be downloaded as-is, not html pages that need to be rendered
- 'gif', 'jpeg', 'jpg', 'png', 'tif', 'tiff', 'wbmp', 'ico', 'jng', 'bmp',
- 'svg', 'svgz', 'webp', 'ps', 'eps', 'ai',
- 'mp3', 'mp4', 'm4a', 'mpeg', 'mpg', 'mkv', 'mov', 'webm', 'm4v',
- 'flv', 'wmv', 'avi', 'ogg', 'ts', 'm3u8',
- 'pdf', 'txt', 'rtf', 'rtfd', 'doc', 'docx', 'ppt', 'pptx', 'xls', 'xlsx',
- 'atom', 'rss', 'css', 'js', 'json',
- 'dmg', 'iso', 'img',
- 'rar', 'war', 'hqx', 'zip', 'gz', 'bz2', '7z',
- # Less common extensions to consider adding later
- # jar, swf, bin, com, exe, dll, deb
- # ear, hqx, eot, wmlc, kml, kmz, cco, jardiff, jnlp, run, msi, msp, msm,
- # pl pm, prc pdb, rar, rpm, sea, sit, tcl tk, der, pem, crt, xpi, xspf,
- # ra, mng, asx, asf, 3gpp, 3gp, mid, midi, kar, jad, wml, htc, mml
- # These are always treated as pages, not as static files, never add them:
- # html, htm, shtml, xhtml, xml, aspx, php, cgi
- }
- ############################## Derived Config ##################################
- DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
- 'TERM_WIDTH': {'default': lambda c: lambda: shutil.get_terminal_size((100, 10)).columns},
- 'USER': {'default': lambda c: getpass.getuser() or os.getlogin()},
- 'ANSI': {'default': lambda c: DEFAULT_CLI_COLORS if c['USE_COLOR'] else {k: '' for k in DEFAULT_CLI_COLORS.keys()}},
- 'PACKAGE_DIR': {'default': lambda c: Path(__file__).resolve().parent},
- 'TEMPLATES_DIR': {'default': lambda c: c['PACKAGE_DIR'] / TEMPLATES_DIR_NAME},
- 'CUSTOM_TEMPLATES_DIR': {'default': lambda c: c['CUSTOM_TEMPLATES_DIR'] and Path(c['CUSTOM_TEMPLATES_DIR'])},
- 'OUTPUT_DIR': {'default': lambda c: Path(c['OUTPUT_DIR']).resolve() if c['OUTPUT_DIR'] else Path(os.curdir).resolve()},
- 'ARCHIVE_DIR': {'default': lambda c: c['OUTPUT_DIR'] / ARCHIVE_DIR_NAME},
- 'SOURCES_DIR': {'default': lambda c: c['OUTPUT_DIR'] / SOURCES_DIR_NAME},
- 'LOGS_DIR': {'default': lambda c: c['OUTPUT_DIR'] / LOGS_DIR_NAME},
- 'CONFIG_FILE': {'default': lambda c: Path(c['CONFIG_FILE']).resolve() if c['CONFIG_FILE'] else c['OUTPUT_DIR'] / CONFIG_FILENAME},
- 'COOKIES_FILE': {'default': lambda c: c['COOKIES_FILE'] and Path(c['COOKIES_FILE']).resolve()},
- 'CHROME_USER_DATA_DIR': {'default': lambda c: find_chrome_data_dir() if c['CHROME_USER_DATA_DIR'] is None else (Path(c['CHROME_USER_DATA_DIR']).resolve() if c['CHROME_USER_DATA_DIR'] else None)}, # None means unset, so we autodetect it with find_chrome_Data_dir(), but emptystring '' means user manually set it to '', and we should store it as None
- 'URL_BLACKLIST_PTN': {'default': lambda c: c['URL_BLACKLIST'] and re.compile(c['URL_BLACKLIST'] or '', re.IGNORECASE | re.UNICODE | re.MULTILINE)},
- 'ARCHIVEBOX_BINARY': {'default': lambda c: sys.argv[0] or bin_path('archivebox')},
- 'VERSION': {'default': lambda c: json.loads((Path(c['PACKAGE_DIR']) / 'package.json').read_text().strip())['version']},
- 'PYTHON_BINARY': {'default': lambda c: sys.executable},
- 'PYTHON_ENCODING': {'default': lambda c: sys.stdout.encoding.upper()},
- 'PYTHON_VERSION': {'default': lambda c: '{}.{}.{}'.format(*sys.version_info[:3])},
- 'DJANGO_BINARY': {'default': lambda c: django.__file__.replace('__init__.py', 'bin/django-admin.py')},
- 'DJANGO_VERSION': {'default': lambda c: '{}.{}.{} {} ({})'.format(*django.VERSION)},
- 'USE_CURL': {'default': lambda c: c['USE_CURL'] and (c['SAVE_FAVICON'] or c['SAVE_TITLE'] or c['SAVE_ARCHIVE_DOT_ORG'])},
- 'CURL_VERSION': {'default': lambda c: bin_version(c['CURL_BINARY']) if c['USE_CURL'] else None},
- 'CURL_USER_AGENT': {'default': lambda c: c['CURL_USER_AGENT'].format(**c)},
- 'CURL_ARGS': {'default': lambda c: c['CURL_ARGS'] or []},
- 'SAVE_FAVICON': {'default': lambda c: c['USE_CURL'] and c['SAVE_FAVICON']},
- 'SAVE_ARCHIVE_DOT_ORG': {'default': lambda c: c['USE_CURL'] and c['SAVE_ARCHIVE_DOT_ORG']},
- 'USE_WGET': {'default': lambda c: c['USE_WGET'] and (c['SAVE_WGET'] or c['SAVE_WARC'])},
- 'WGET_VERSION': {'default': lambda c: bin_version(c['WGET_BINARY']) if c['USE_WGET'] else None},
- 'WGET_AUTO_COMPRESSION': {'default': lambda c: wget_supports_compression(c) if c['USE_WGET'] else False},
- 'WGET_USER_AGENT': {'default': lambda c: c['WGET_USER_AGENT'].format(**c)},
- 'SAVE_WGET': {'default': lambda c: c['USE_WGET'] and c['SAVE_WGET']},
- 'SAVE_WARC': {'default': lambda c: c['USE_WGET'] and c['SAVE_WARC']},
- 'WGET_ARGS': {'default': lambda c: c['WGET_ARGS'] or []},
- 'RIPGREP_VERSION': {'default': lambda c: bin_version(c['RIPGREP_BINARY']) if c['USE_RIPGREP'] else None},
- 'USE_SINGLEFILE': {'default': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE']},
- 'SINGLEFILE_VERSION': {'default': lambda c: bin_version(c['SINGLEFILE_BINARY']) if c['USE_SINGLEFILE'] else None},
- 'USE_READABILITY': {'default': lambda c: c['USE_READABILITY'] and c['SAVE_READABILITY']},
- 'READABILITY_VERSION': {'default': lambda c: bin_version(c['READABILITY_BINARY']) if c['USE_READABILITY'] else None},
- 'USE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['SAVE_MERCURY']},
- 'MERCURY_VERSION': {'default': lambda c: '1.0.0' if shutil.which(str(bin_path(c['MERCURY_BINARY']))) else None}, # mercury is unversioned
- 'USE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
- 'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None},
- 'SAVE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
- 'USE_YOUTUBEDL': {'default': lambda c: c['USE_YOUTUBEDL'] and c['SAVE_MEDIA']},
- 'YOUTUBEDL_VERSION': {'default': lambda c: bin_version(c['YOUTUBEDL_BINARY']) if c['USE_YOUTUBEDL'] else None},
- 'SAVE_MEDIA': {'default': lambda c: c['USE_YOUTUBEDL'] and c['SAVE_MEDIA']},
- 'YOUTUBEDL_ARGS': {'default': lambda c: c['YOUTUBEDL_ARGS'] or []},
- 'CHROME_BINARY': {'default': lambda c: c['CHROME_BINARY'] or find_chrome_binary()},
- 'USE_CHROME': {'default': lambda c: c['USE_CHROME'] and c['CHROME_BINARY'] and (c['SAVE_PDF'] or c['SAVE_SCREENSHOT'] or c['SAVE_DOM'] or c['SAVE_SINGLEFILE'])},
- 'CHROME_VERSION': {'default': lambda c: bin_version(c['CHROME_BINARY']) if c['USE_CHROME'] else None},
-
- 'SAVE_PDF': {'default': lambda c: c['USE_CHROME'] and c['SAVE_PDF']},
- 'SAVE_SCREENSHOT': {'default': lambda c: c['USE_CHROME'] and c['SAVE_SCREENSHOT']},
- 'SAVE_DOM': {'default': lambda c: c['USE_CHROME'] and c['SAVE_DOM']},
- 'SAVE_SINGLEFILE': {'default': lambda c: c['USE_CHROME'] and c['SAVE_SINGLEFILE'] and c['USE_NODE']},
- 'SAVE_READABILITY': {'default': lambda c: c['USE_READABILITY'] and c['USE_NODE']},
- 'SAVE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['USE_NODE']},
-
- 'USE_NODE': {'default': lambda c: c['USE_NODE'] and (c['SAVE_READABILITY'] or c['SAVE_SINGLEFILE'] or c['SAVE_MERCURY'])},
- 'NODE_VERSION': {'default': lambda c: bin_version(c['NODE_BINARY']) if c['USE_NODE'] else None},
- 'DEPENDENCIES': {'default': lambda c: get_dependency_info(c)},
- 'CODE_LOCATIONS': {'default': lambda c: get_code_locations(c)},
- 'EXTERNAL_LOCATIONS': {'default': lambda c: get_external_locations(c)},
- 'DATA_LOCATIONS': {'default': lambda c: get_data_locations(c)},
- 'CHROME_OPTIONS': {'default': lambda c: get_chrome_info(c)},
- }
- ################################### Helpers ####################################
- def load_config_val(key: str,
- default: ConfigDefaultValue=None,
- type: Optional[Type]=None,
- aliases: Optional[Tuple[str, ...]]=None,
- config: Optional[ConfigDict]=None,
- env_vars: Optional[os._Environ]=None,
- config_file_vars: Optional[Dict[str, str]]=None) -> ConfigValue:
- """parse bool, int, and str key=value pairs from env"""
- config_keys_to_check = (key, *(aliases or ()))
- for key in config_keys_to_check:
- if env_vars:
- val = env_vars.get(key)
- if val:
- break
- if config_file_vars:
- val = config_file_vars.get(key)
- if val:
- break
- if type is None or val is None:
- if callable(default):
- assert isinstance(config, dict)
- return default(config)
- return default
- elif type is bool:
- if val.lower() in ('true', 'yes', '1'):
- return True
- elif val.lower() in ('false', 'no', '0'):
- return False
- else:
- raise ValueError(f'Invalid configuration option {key}={val} (expected a boolean: True/False)')
- elif type is str:
- if val.lower() in ('true', 'false', 'yes', 'no', '1', '0'):
- raise ValueError(f'Invalid configuration option {key}={val} (expected a string)')
- return val.strip()
- elif type is int:
- if not val.isdigit():
- raise ValueError(f'Invalid configuration option {key}={val} (expected an integer)')
- return int(val)
- elif type is list or type is dict:
- return json.loads(val)
- raise Exception('Config values can only be str, bool, int or json')
- def load_config_file(out_dir: str=None) -> Optional[Dict[str, str]]:
- """load the ini-formatted config file from OUTPUT_DIR/Archivebox.conf"""
- out_dir = out_dir or Path(os.getenv('OUTPUT_DIR', '.')).resolve()
- config_path = Path(out_dir) / CONFIG_FILENAME
- if config_path.exists():
- config_file = ConfigParser()
- config_file.optionxform = str
- config_file.read(config_path)
- # flatten into one namespace
- config_file_vars = {
- key.upper(): val
- for section, options in config_file.items()
- for key, val in options.items()
- }
- # print('[i] Loaded config file', os.path.abspath(config_path))
- # print(config_file_vars)
- return config_file_vars
- return None
- def write_config_file(config: Dict[str, str], out_dir: str=None) -> ConfigDict:
- """load the ini-formatted config file from OUTPUT_DIR/Archivebox.conf"""
- from .system import atomic_write
- CONFIG_HEADER = (
- """# This is the config file for your ArchiveBox collection.
- #
- # You can add options here manually in INI format, or automatically by running:
- # archivebox config --set KEY=VALUE
- #
- # If you modify this file manually, make sure to update your archive after by running:
- # archivebox init
- #
- # A list of all possible config with documentation and examples can be found here:
- # https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration
- """)
- out_dir = out_dir or Path(os.getenv('OUTPUT_DIR', '.')).resolve()
- config_path = Path(out_dir) / CONFIG_FILENAME
-
- if not config_path.exists():
- atomic_write(config_path, CONFIG_HEADER)
- config_file = ConfigParser()
- config_file.optionxform = str
- config_file.read(config_path)
- with open(config_path, 'r', encoding='utf-8') as old:
- atomic_write(f'{config_path}.bak', old.read())
- find_section = lambda key: [name for name, opts in CONFIG_SCHEMA.items() if key in opts][0]
- # Set up sections in empty config file
- for key, val in config.items():
- section = find_section(key)
- if section in config_file:
- existing_config = dict(config_file[section])
- else:
- existing_config = {}
- config_file[section] = {**existing_config, key: val}
- # always make sure there's a SECRET_KEY defined for Django
- existing_secret_key = None
- if 'SERVER_CONFIG' in config_file and 'SECRET_KEY' in config_file['SERVER_CONFIG']:
- existing_secret_key = config_file['SERVER_CONFIG']['SECRET_KEY']
- if (not existing_secret_key) or ('not a valid secret' in existing_secret_key):
- from django.utils.crypto import get_random_string
- chars = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_'
- random_secret_key = get_random_string(50, chars)
- if 'SERVER_CONFIG' in config_file:
- config_file['SERVER_CONFIG']['SECRET_KEY'] = random_secret_key
- else:
- config_file['SERVER_CONFIG'] = {'SECRET_KEY': random_secret_key}
- with open(config_path, 'w+', encoding='utf-8') as new:
- config_file.write(new)
-
- try:
- # validate the config by attempting to re-parse it
- CONFIG = load_all_config()
- return {
- key.upper(): CONFIG.get(key.upper())
- for key in config.keys()
- }
- except BaseException:
- # something went horribly wrong, rever to the previous version
- with open(f'{config_path}.bak', 'r', encoding='utf-8') as old:
- atomic_write(config_path, old.read())
- if Path(f'{config_path}.bak').exists():
- os.remove(f'{config_path}.bak')
- return {}
-
- def load_config(defaults: ConfigDefaultDict,
- config: Optional[ConfigDict]=None,
- out_dir: Optional[str]=None,
- env_vars: Optional[os._Environ]=None,
- config_file_vars: Optional[Dict[str, str]]=None) -> ConfigDict:
-
- env_vars = env_vars or os.environ
- config_file_vars = config_file_vars or load_config_file(out_dir=out_dir)
- extended_config: ConfigDict = config.copy() if config else {}
- for key, default in defaults.items():
- try:
- extended_config[key] = load_config_val(
- key,
- default=default['default'],
- type=default.get('type'),
- aliases=default.get('aliases'),
- config=extended_config,
- env_vars=env_vars,
- config_file_vars=config_file_vars,
- )
- except KeyboardInterrupt:
- raise SystemExit(0)
- except Exception as e:
- stderr()
- stderr(f'[X] Error while loading configuration value: {key}', color='red', config=extended_config)
- stderr(' {}: {}'.format(e.__class__.__name__, e))
- stderr()
- stderr(' Check your config for mistakes and try again (your archive data is unaffected).')
- stderr()
- stderr(' For config documentation and examples see:')
- stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration')
- stderr()
- # raise
- raise SystemExit(2)
-
- return extended_config
- # def write_config(config: ConfigDict):
- # with open(os.path.join(config['OUTPUT_DIR'], CONFIG_FILENAME), 'w+') as f:
- # Logging Helpers
- def stdout(*args, color: Optional[str]=None, prefix: str='', config: Optional[ConfigDict]=None) -> None:
- ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI
- if color:
- strs = [ansi[color], ' '.join(str(a) for a in args), ansi['reset'], '\n']
- else:
- strs = [' '.join(str(a) for a in args), '\n']
- sys.stdout.write(prefix + ''.join(strs))
- def stderr(*args, color: Optional[str]=None, prefix: str='', config: Optional[ConfigDict]=None) -> None:
- ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI
- if color:
- strs = [ansi[color], ' '.join(str(a) for a in args), ansi['reset'], '\n']
- else:
- strs = [' '.join(str(a) for a in args), '\n']
- sys.stderr.write(prefix + ''.join(strs))
- def hint(text: Union[Tuple[str, ...], List[str], str], prefix=' ', config: Optional[ConfigDict]=None) -> None:
- ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI
- if isinstance(text, str):
- stderr('{}{lightred}Hint:{reset} {}'.format(prefix, text, **ansi))
- else:
- stderr('{}{lightred}Hint:{reset} {}'.format(prefix, text[0], **ansi))
- for line in text[1:]:
- stderr('{} {}'.format(prefix, line))
- # Dependency Metadata Helpers
- def bin_version(binary: Optional[str]) -> Optional[str]:
- """check the presence and return valid version line of a specified binary"""
- abspath = bin_path(binary)
- if not binary or not abspath:
- return None
- try:
- version_str = run([abspath, "--version"], stdout=PIPE).stdout.strip().decode()
- # take first 3 columns of first line of version info
- return ' '.join(version_str.split('\n')[0].strip().split()[:3])
- except OSError:
- pass
- # stderr(f'[X] Unable to find working version of dependency: {binary}', color='red')
- # stderr(' Make sure it\'s installed, then confirm it\'s working by running:')
- # stderr(f' {binary} --version')
- # stderr()
- # stderr(' If you don\'t want to install it, you can disable it via config. See here for more info:')
- # stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Install')
- return None
- def bin_path(binary: Optional[str]) -> Optional[str]:
- if binary is None:
- return None
- node_modules_bin = Path('.') / 'node_modules' / '.bin' / binary
- if node_modules_bin.exists():
- return str(node_modules_bin.resolve())
- return shutil.which(str(Path(binary).expanduser())) or shutil.which(str(binary)) or binary
- def bin_hash(binary: Optional[str]) -> Optional[str]:
- if binary is None:
- return None
- abs_path = bin_path(binary)
- if abs_path is None or not Path(abs_path).exists():
- return None
- file_hash = md5()
- with io.open(abs_path, mode='rb') as f:
- for chunk in iter(lambda: f.read(io.DEFAULT_BUFFER_SIZE), b''):
- file_hash.update(chunk)
-
- return f'md5:{file_hash.hexdigest()}'
- def find_chrome_binary() -> Optional[str]:
- """find any installed chrome binaries in the default locations"""
- # Precedence: Chromium, Chrome, Beta, Canary, Unstable, Dev
- # make sure data dir finding precedence order always matches binary finding order
- default_executable_paths = (
- 'chromium-browser',
- 'chromium',
- '/Applications/Chromium.app/Contents/MacOS/Chromium',
- 'chrome',
- 'google-chrome',
- '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
- 'google-chrome-stable',
- 'google-chrome-beta',
- 'google-chrome-canary',
- '/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary',
- 'google-chrome-unstable',
- 'google-chrome-dev',
- )
- for name in default_executable_paths:
- full_path_exists = shutil.which(name)
- if full_path_exists:
- return name
-
- return None
- def find_chrome_data_dir() -> Optional[str]:
- """find any installed chrome user data directories in the default locations"""
- # Precedence: Chromium, Chrome, Beta, Canary, Unstable, Dev
- # make sure data dir finding precedence order always matches binary finding order
- default_profile_paths = (
- '~/.config/chromium',
- '~/Library/Application Support/Chromium',
- '~/AppData/Local/Chromium/User Data',
- '~/.config/chrome',
- '~/.config/google-chrome',
- '~/Library/Application Support/Google/Chrome',
- '~/AppData/Local/Google/Chrome/User Data',
- '~/.config/google-chrome-stable',
- '~/.config/google-chrome-beta',
- '~/Library/Application Support/Google/Chrome Canary',
- '~/AppData/Local/Google/Chrome SxS/User Data',
- '~/.config/google-chrome-unstable',
- '~/.config/google-chrome-dev',
- )
- for path in default_profile_paths:
- full_path = Path(path).resolve()
- if full_path.exists():
- return full_path
- return None
- def wget_supports_compression(config):
- try:
- cmd = [
- config['WGET_BINARY'],
- "--compression=auto",
- "--help",
- ]
- return not run(cmd, stdout=DEVNULL, stderr=DEVNULL).returncode
- except (FileNotFoundError, OSError):
- return False
- def get_code_locations(config: ConfigDict) -> SimpleConfigValueDict:
- return {
- 'PACKAGE_DIR': {
- 'path': (config['PACKAGE_DIR']).resolve(),
- 'enabled': True,
- 'is_valid': (config['PACKAGE_DIR'] / '__main__.py').exists(),
- },
- 'TEMPLATES_DIR': {
- 'path': (config['TEMPLATES_DIR']).resolve(),
- 'enabled': True,
- 'is_valid': (config['TEMPLATES_DIR'] / 'static').exists(),
- },
- 'CUSTOM_TEMPLATES_DIR': {
- 'path': config['CUSTOM_TEMPLATES_DIR'] and Path(config['CUSTOM_TEMPLATES_DIR']).resolve(),
- 'enabled': bool(config['CUSTOM_TEMPLATES_DIR']),
- 'is_valid': config['CUSTOM_TEMPLATES_DIR'] and Path(config['CUSTOM_TEMPLATES_DIR']).exists(),
- },
- # 'NODE_MODULES_DIR': {
- # 'path': ,
- # 'enabled': ,
- # 'is_valid': (...).exists(),
- # },
- }
- def get_external_locations(config: ConfigDict) -> ConfigValue:
- abspath = lambda path: None if path is None else Path(path).resolve()
- return {
- 'CHROME_USER_DATA_DIR': {
- 'path': abspath(config['CHROME_USER_DATA_DIR']),
- 'enabled': config['USE_CHROME'] and config['CHROME_USER_DATA_DIR'],
- 'is_valid': False if config['CHROME_USER_DATA_DIR'] is None else (Path(config['CHROME_USER_DATA_DIR']) / 'Default').exists(),
- },
- 'COOKIES_FILE': {
- 'path': abspath(config['COOKIES_FILE']),
- 'enabled': config['USE_WGET'] and config['COOKIES_FILE'],
- 'is_valid': False if config['COOKIES_FILE'] is None else Path(config['COOKIES_FILE']).exists(),
- },
- }
- def get_data_locations(config: ConfigDict) -> ConfigValue:
- return {
- 'OUTPUT_DIR': {
- 'path': config['OUTPUT_DIR'].resolve(),
- 'enabled': True,
- 'is_valid': (config['OUTPUT_DIR'] / SQL_INDEX_FILENAME).exists(),
- },
- 'SOURCES_DIR': {
- 'path': config['SOURCES_DIR'].resolve(),
- 'enabled': True,
- 'is_valid': config['SOURCES_DIR'].exists(),
- },
- 'LOGS_DIR': {
- 'path': config['LOGS_DIR'].resolve(),
- 'enabled': True,
- 'is_valid': config['LOGS_DIR'].exists(),
- },
- 'ARCHIVE_DIR': {
- 'path': config['ARCHIVE_DIR'].resolve(),
- 'enabled': True,
- 'is_valid': config['ARCHIVE_DIR'].exists(),
- },
- 'CONFIG_FILE': {
- 'path': config['CONFIG_FILE'].resolve(),
- 'enabled': True,
- 'is_valid': config['CONFIG_FILE'].exists(),
- },
- 'SQL_INDEX': {
- 'path': (config['OUTPUT_DIR'] / SQL_INDEX_FILENAME).resolve(),
- 'enabled': True,
- 'is_valid': (config['OUTPUT_DIR'] / SQL_INDEX_FILENAME).exists(),
- },
- }
- def get_dependency_info(config: ConfigDict) -> ConfigValue:
- return {
- 'ARCHIVEBOX_BINARY': {
- 'path': bin_path(config['ARCHIVEBOX_BINARY']),
- 'version': config['VERSION'],
- 'hash': bin_hash(config['ARCHIVEBOX_BINARY']),
- 'enabled': True,
- 'is_valid': True,
- },
- 'PYTHON_BINARY': {
- 'path': bin_path(config['PYTHON_BINARY']),
- 'version': config['PYTHON_VERSION'],
- 'hash': bin_hash(config['PYTHON_BINARY']),
- 'enabled': True,
- 'is_valid': bool(config['PYTHON_VERSION']),
- },
- 'DJANGO_BINARY': {
- 'path': bin_path(config['DJANGO_BINARY']),
- 'version': config['DJANGO_VERSION'],
- 'hash': bin_hash(config['DJANGO_BINARY']),
- 'enabled': True,
- 'is_valid': bool(config['DJANGO_VERSION']),
- },
- 'CURL_BINARY': {
- 'path': bin_path(config['CURL_BINARY']),
- 'version': config['CURL_VERSION'],
- 'hash': bin_hash(config['CURL_BINARY']),
- 'enabled': config['USE_CURL'],
- 'is_valid': bool(config['CURL_VERSION']),
- },
- 'WGET_BINARY': {
- 'path': bin_path(config['WGET_BINARY']),
- 'version': config['WGET_VERSION'],
- 'hash': bin_hash(config['WGET_BINARY']),
- 'enabled': config['USE_WGET'],
- 'is_valid': bool(config['WGET_VERSION']),
- },
- 'NODE_BINARY': {
- 'path': bin_path(config['NODE_BINARY']),
- 'version': config['NODE_VERSION'],
- 'hash': bin_hash(config['NODE_BINARY']),
- 'enabled': config['USE_NODE'],
- 'is_valid': bool(config['NODE_VERSION']),
- },
- 'SINGLEFILE_BINARY': {
- 'path': bin_path(config['SINGLEFILE_BINARY']),
- 'version': config['SINGLEFILE_VERSION'],
- 'hash': bin_hash(config['SINGLEFILE_BINARY']),
- 'enabled': config['USE_SINGLEFILE'],
- 'is_valid': bool(config['SINGLEFILE_VERSION']),
- },
- 'READABILITY_BINARY': {
- 'path': bin_path(config['READABILITY_BINARY']),
- 'version': config['READABILITY_VERSION'],
- 'hash': bin_hash(config['READABILITY_BINARY']),
- 'enabled': config['USE_READABILITY'],
- 'is_valid': bool(config['READABILITY_VERSION']),
- },
- 'MERCURY_BINARY': {
- 'path': bin_path(config['MERCURY_BINARY']),
- 'version': config['MERCURY_VERSION'],
- 'hash': bin_hash(config['MERCURY_BINARY']),
- 'enabled': config['USE_MERCURY'],
- 'is_valid': bool(config['MERCURY_VERSION']),
- },
- 'GIT_BINARY': {
- 'path': bin_path(config['GIT_BINARY']),
- 'version': config['GIT_VERSION'],
- 'hash': bin_hash(config['GIT_BINARY']),
- 'enabled': config['USE_GIT'],
- 'is_valid': bool(config['GIT_VERSION']),
- },
- 'YOUTUBEDL_BINARY': {
- 'path': bin_path(config['YOUTUBEDL_BINARY']),
- 'version': config['YOUTUBEDL_VERSION'],
- 'hash': bin_hash(config['YOUTUBEDL_BINARY']),
- 'enabled': config['USE_YOUTUBEDL'],
- 'is_valid': bool(config['YOUTUBEDL_VERSION']),
- },
- 'CHROME_BINARY': {
- 'path': bin_path(config['CHROME_BINARY']),
- 'version': config['CHROME_VERSION'],
- 'hash': bin_hash(config['CHROME_BINARY']),
- 'enabled': config['USE_CHROME'],
- 'is_valid': bool(config['CHROME_VERSION']),
- },
- 'RIPGREP_BINARY': {
- 'path': bin_path(config['RIPGREP_BINARY']),
- 'version': config['RIPGREP_VERSION'],
- 'hash': bin_hash(config['RIPGREP_BINARY']),
- 'enabled': config['USE_RIPGREP'],
- 'is_valid': bool(config['RIPGREP_VERSION']),
- },
- # TODO: add an entry for the sonic search backend?
- # 'SONIC_BINARY': {
- # 'path': bin_path(config['SONIC_BINARY']),
- # 'version': config['SONIC_VERSION'],
- # 'hash': bin_hash(config['SONIC_BINARY']),
- # 'enabled': config['USE_SONIC'],
- # 'is_valid': bool(config['SONIC_VERSION']),
- # },
- }
- def get_chrome_info(config: ConfigDict) -> ConfigValue:
- return {
- 'TIMEOUT': config['TIMEOUT'],
- 'RESOLUTION': config['RESOLUTION'],
- 'CHECK_SSL_VALIDITY': config['CHECK_SSL_VALIDITY'],
- 'CHROME_BINARY': config['CHROME_BINARY'],
- 'CHROME_HEADLESS': config['CHROME_HEADLESS'],
- 'CHROME_SANDBOX': config['CHROME_SANDBOX'],
- 'CHROME_USER_AGENT': config['CHROME_USER_AGENT'],
- 'CHROME_USER_DATA_DIR': config['CHROME_USER_DATA_DIR'],
- }
- # ******************************************************************************
- # ******************************************************************************
- # ******************************** Load Config *********************************
- # ******* (compile the defaults, configs, and metadata all into CONFIG) ********
- # ******************************************************************************
- # ******************************************************************************
- def load_all_config():
- CONFIG: ConfigDict = {}
- for section_name, section_config in CONFIG_SCHEMA.items():
- CONFIG = load_config(section_config, CONFIG)
- return load_config(DYNAMIC_CONFIG_SCHEMA, CONFIG)
- # add all final config values in CONFIG to globals in this file
- CONFIG = load_all_config()
- globals().update(CONFIG)
- # this lets us do: from .config import DEBUG, MEDIA_TIMEOUT, ...
- # ******************************************************************************
- # ******************************************************************************
- # ******************************************************************************
- # ******************************************************************************
- # ******************************************************************************
- ########################### System Environment Setup ###########################
- # Set timezone to UTC and umask to OUTPUT_PERMISSIONS
- os.environ["TZ"] = 'UTC'
- os.umask(0o777 - int(OUTPUT_PERMISSIONS, base=8)) # noqa: F821
- # add ./node_modules/.bin to $PATH so we can use node scripts in extractors
- NODE_BIN_PATH = str((Path(CONFIG["OUTPUT_DIR"]).absolute() / 'node_modules' / '.bin'))
- sys.path.append(NODE_BIN_PATH)
- # disable stderr "you really shouldnt disable ssl" warnings with library config
- if not CONFIG['CHECK_SSL_VALIDITY']:
- import urllib3
- import requests
- requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)
- urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
- ########################### Config Validity Checkers ###########################
- def check_system_config(config: ConfigDict=CONFIG) -> None:
- ### Check system environment
- if config['USER'] == 'root':
- stderr('[!] ArchiveBox should never be run as root!', color='red')
- stderr(' For more information, see the security overview documentation:')
- stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#do-not-run-as-root')
- raise SystemExit(2)
- ### Check Python environment
- if sys.version_info[:3] < (3, 6, 0):
- stderr(f'[X] Python version is not new enough: {config["PYTHON_VERSION"]} (>3.6 is required)', color='red')
- stderr(' See https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting#python for help upgrading your Python installation.')
- raise SystemExit(2)
- if config['PYTHON_ENCODING'] not in ('UTF-8', 'UTF8'):
- stderr(f'[X] Your system is running python3 scripts with a bad locale setting: {config["PYTHON_ENCODING"]} (it should be UTF-8).', color='red')
- stderr(' To fix it, add the line "export PYTHONIOENCODING=UTF-8" to your ~/.bashrc file (without quotes)')
- stderr(' Or if you\'re using ubuntu/debian, run "dpkg-reconfigure locales"')
- stderr('')
- stderr(' Confirm that it\'s fixed by opening a new shell and running:')
- stderr(' python3 -c "import sys; print(sys.stdout.encoding)" # should output UTF-8')
- raise SystemExit(2)
- # stderr('[i] Using Chrome binary: {}'.format(shutil.which(CHROME_BINARY) or CHROME_BINARY))
- # stderr('[i] Using Chrome data dir: {}'.format(os.path.abspath(CHROME_USER_DATA_DIR)))
- if config['CHROME_USER_DATA_DIR'] is not None:
- if not (Path(config['CHROME_USER_DATA_DIR']) / 'Default').exists():
- stderr('[X] Could not find profile "Default" in CHROME_USER_DATA_DIR.', color='red')
- stderr(f' {config["CHROME_USER_DATA_DIR"]}')
- stderr(' Make sure you set it to a Chrome user data directory containing a Default profile folder.')
- stderr(' For more info see:')
- stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#CHROME_USER_DATA_DIR')
- if '/Default' in str(config['CHROME_USER_DATA_DIR']):
- stderr()
- stderr(' Try removing /Default from the end e.g.:')
- stderr(' CHROME_USER_DATA_DIR="{}"'.format(config['CHROME_USER_DATA_DIR'].split('/Default')[0]))
- raise SystemExit(2)
- def check_dependencies(config: ConfigDict=CONFIG, show_help: bool=True) -> None:
- invalid_dependencies = [
- (name, info) for name, info in config['DEPENDENCIES'].items()
- if info['enabled'] and not info['is_valid']
- ]
- if invalid_dependencies and show_help:
- stderr(f'[!] Warning: Missing {len(invalid_dependencies)} recommended dependencies', color='lightyellow')
- for dependency, info in invalid_dependencies:
- stderr(
- ' ! {}: {} ({})'.format(
- dependency,
- info['path'] or 'unable to find binary',
- info['version'] or 'unable to detect version',
- )
- )
- if dependency in ('YOUTUBEDL_BINARY', 'CHROME_BINARY', 'SINGLEFILE_BINARY', 'READABILITY_BINARY', 'MERCURY_BINARY'):
- hint(('To install all packages automatically run: archivebox setup',
- f'or to disable it and silence this warning: archivebox config --set SAVE_{dependency.rsplit("_", 1)[0]}=False',
- ''), prefix=' ')
- stderr('')
- if config['TIMEOUT'] < 5:
- stderr(f'[!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={config["TIMEOUT"]} seconds)', color='red')
- stderr(' You must allow *at least* 5 seconds for indexing and archive methods to run succesfully.')
- stderr(' (Setting it to somewhere between 30 and 3000 seconds is recommended)')
- stderr()
- stderr(' If you want to make ArchiveBox run faster, disable specific archive methods instead:')
- stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles')
- stderr()
- elif config['USE_CHROME'] and config['TIMEOUT'] < 15:
- stderr(f'[!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={config["TIMEOUT"]} seconds)', color='red')
- stderr(' Chrome will fail to archive all sites if set to less than ~15 seconds.')
- stderr(' (Setting it to somewhere between 30 and 300 seconds is recommended)')
- stderr()
- stderr(' If you want to make ArchiveBox run faster, disable specific archive methods instead:')
- stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles')
- stderr()
- if config['USE_YOUTUBEDL'] and config['MEDIA_TIMEOUT'] < 20:
- stderr(f'[!] Warning: MEDIA_TIMEOUT is set too low! (currently set to MEDIA_TIMEOUT={config["MEDIA_TIMEOUT"]} seconds)', color='red')
- stderr(' Youtube-dl will fail to archive all media if set to less than ~20 seconds.')
- stderr(' (Setting it somewhere over 60 seconds is recommended)')
- stderr()
- stderr(' If you want to disable media archiving entirely, set SAVE_MEDIA=False instead:')
- stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_media')
- stderr()
-
- def check_data_folder(out_dir: Union[str, Path, None]=None, config: ConfigDict=CONFIG) -> None:
- output_dir = out_dir or config['OUTPUT_DIR']
- assert isinstance(output_dir, (str, Path))
- archive_dir_exists = (Path(output_dir) / ARCHIVE_DIR_NAME).exists()
- if not archive_dir_exists:
- stderr('[X] No archivebox index found in the current directory.', color='red')
- stderr(f' {output_dir}', color='lightyellow')
- stderr()
- stderr(' {lightred}Hint{reset}: Are you running archivebox in the right folder?'.format(**config['ANSI']))
- stderr(' cd path/to/your/archive/folder')
- stderr(' archivebox [command]')
- stderr()
- stderr(' {lightred}Hint{reset}: To create a new archive collection or import existing data in this folder, run:'.format(**config['ANSI']))
- stderr(' archivebox init')
- raise SystemExit(2)
- def check_migrations(out_dir: Union[str, Path, None]=None, config: ConfigDict=CONFIG):
- output_dir = out_dir or config['OUTPUT_DIR']
- from .index.sql import list_migrations
- pending_migrations = [name for status, name in list_migrations() if not status]
- if pending_migrations:
- stderr('[X] This collection was created with an older version of ArchiveBox and must be upgraded first.', color='lightyellow')
- stderr(f' {output_dir}')
- stderr()
- stderr(f' To upgrade it to the latest version and apply the {len(pending_migrations)} pending migrations, run:')
- stderr(' archivebox init')
- raise SystemExit(3)
- (Path(output_dir) / SOURCES_DIR_NAME).mkdir(exist_ok=True)
- (Path(output_dir) / LOGS_DIR_NAME).mkdir(exist_ok=True)
- def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG, in_memory_db=False) -> None:
- check_system_config()
-
- output_dir = out_dir or Path(config['OUTPUT_DIR'])
- assert isinstance(output_dir, Path) and isinstance(config['PACKAGE_DIR'], Path)
- try:
- from django.core.management import call_command
- sys.path.append(str(config['PACKAGE_DIR']))
- os.environ.setdefault('OUTPUT_DIR', str(output_dir))
- assert (config['PACKAGE_DIR'] / 'core' / 'settings.py').exists(), 'settings.py was not found at archivebox/core/settings.py'
- os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings')
- # Check to make sure JSON extension is available in our Sqlite3 instance
- try:
- cursor = sqlite3.connect(':memory:').cursor()
- cursor.execute('SELECT JSON(\'{"a": "b"}\')')
- except sqlite3.OperationalError as exc:
- stderr(f'[X] Your SQLite3 version is missing the required JSON1 extension: {exc}', color='red')
- hint([
- 'Upgrade your Python version or install the extension manually:',
- 'https://code.djangoproject.com/wiki/JSON1Extension'
- ])
- if in_memory_db:
- # some commands (e.g. oneshot) dont store a long-lived sqlite3 db file on disk.
- # in those cases we create a temporary in-memory db and run the migrations
- # immediately to get a usable in-memory-database at startup
- os.environ.setdefault("ARCHIVEBOX_DATABASE_NAME", ":memory:")
- django.setup()
- call_command("migrate", interactive=False, verbosity=0)
- else:
- # Otherwise use default sqlite3 file-based database and initialize django
- # without running migrations automatically (user runs them manually by calling init)
- django.setup()
-
- from django.conf import settings
- # log startup message to the error log
- with open(settings.ERROR_LOG, "a+", encoding='utf-8') as f:
- command = ' '.join(sys.argv)
- ts = datetime.now().strftime('%Y-%m-%d__%H:%M:%S')
- f.write(f"\n> {command}; ts={ts} version={config['VERSION']} docker={config['IN_DOCKER']} is_tty={config['IS_TTY']}\n")
- if check_db:
- # Enable WAL mode in sqlite3
- from django.db import connection
- with connection.cursor() as cursor:
- current_mode = cursor.execute("PRAGMA journal_mode")
- if current_mode != 'wal':
- cursor.execute("PRAGMA journal_mode=wal;")
- # Create cache table in DB if needed
- try:
- from django.core.cache import cache
- cache.get('test', None)
- except django.db.utils.OperationalError:
- call_command("createcachetable", verbosity=0)
- # if archivebox gets imported multiple times, we have to close
- # the sqlite3 whenever we init from scratch to avoid multiple threads
- # sharing the same connection by accident
- from django.db import connections
- for conn in connections.all():
- conn.close_if_unusable_or_obsolete()
- sql_index_path = Path(output_dir) / SQL_INDEX_FILENAME
- assert sql_index_path.exists(), (
- f'No database file {SQL_INDEX_FILENAME} found in: {config["OUTPUT_DIR"]} (Are you in an ArchiveBox collection directory?)')
- except KeyboardInterrupt:
- raise SystemExit(2)
|