config.py 34 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784
# Pin the package name explicitly; presumably so the relative imports used in
# this module (`.config_stubs`, `.storage.sql`) resolve against archivebox.legacy.
__package__ = 'archivebox.legacy'
  2. import os
  3. import io
  4. import re
  5. import sys
  6. import django
  7. import getpass
  8. import shutil
  9. from hashlib import md5
  10. from typing import Optional, Type, Tuple, Dict
  11. from subprocess import run, PIPE, DEVNULL
  12. from configparser import ConfigParser
  13. from .config_stubs import (
  14. SimpleConfigValueDict,
  15. ConfigValue,
  16. ConfigDict,
  17. ConfigDefaultValue,
  18. ConfigDefaultDict,
  19. )
  20. # ******************************************************************************
  21. # Documentation: https://github.com/pirate/ArchiveBox/wiki/Configuration
  22. # Use the 'env' command to pass config options to ArchiveBox. e.g.:
  23. # env USE_COLOR=True CHROME_BINARY=chromium archivebox add < example.html
  24. # ******************************************************************************
  25. ################################# User Config ##################################
# User-settable options, grouped by section name.
#
# Each option maps to a dict with:
#   'type':    the Python type (bool, str or int) the raw string value is parsed into
#   'default': the fallback value, or a callable that receives the config loaded so far
#   'aliases': optional tuple of alternative names that are also checked for a value
#
# Sections are loaded in the order they are written here, so a callable default
# may only read options that were defined earlier (e.g. USE_COLOR reads IS_TTY).
CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
    'SHELL_CONFIG': {
        'IS_TTY': {'type': bool, 'default': lambda _: sys.stdout.isatty()},
        'USE_COLOR': {'type': bool, 'default': lambda c: c['IS_TTY']},
        'SHOW_PROGRESS': {'type': bool, 'default': lambda c: c['IS_TTY']},
    },
    'GENERAL_CONFIG': {
        'OUTPUT_DIR': {'type': str, 'default': None},
        'CONFIG_FILE': {'type': str, 'default': None},
        'ONLY_NEW': {'type': bool, 'default': False},
        'TIMEOUT': {'type': int, 'default': 60},
        'MEDIA_TIMEOUT': {'type': int, 'default': 3600},
        'OUTPUT_PERMISSIONS': {'type': str, 'default': '755'},
        'FOOTER_INFO': {'type': str, 'default': 'Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests.'},
        'URL_BLACKLIST': {'type': str, 'default': None},
    },
    'ARCHIVE_METHOD_TOGGLES': {
        'SAVE_TITLE': {'type': bool, 'default': True, 'aliases': ('FETCH_TITLE',)},
        'SAVE_FAVICON': {'type': bool, 'default': True, 'aliases': ('FETCH_FAVICON',)},
        'SAVE_WGET': {'type': bool, 'default': True, 'aliases': ('FETCH_WGET',)},
        'SAVE_WGET_REQUISITES': {'type': bool, 'default': True, 'aliases': ('FETCH_WGET_REQUISITES',)},
        'SAVE_PDF': {'type': bool, 'default': True, 'aliases': ('FETCH_PDF',)},
        'SAVE_SCREENSHOT': {'type': bool, 'default': True, 'aliases': ('FETCH_SCREENSHOT',)},
        'SAVE_DOM': {'type': bool, 'default': True, 'aliases': ('FETCH_DOM',)},
        'SAVE_WARC': {'type': bool, 'default': True, 'aliases': ('FETCH_WARC',)},
        'SAVE_GIT': {'type': bool, 'default': True, 'aliases': ('FETCH_GIT',)},
        'SAVE_MEDIA': {'type': bool, 'default': True, 'aliases': ('FETCH_MEDIA',)},
        'SAVE_ARCHIVE_DOT_ORG': {'type': bool, 'default': True, 'aliases': ('SUBMIT_ARCHIVE_DOT_ORG',)},
    },
    'ARCHIVE_METHOD_OPTIONS': {
        'RESOLUTION': {'type': str, 'default': '1440,2000', 'aliases': ('SCREENSHOT_RESOLUTION',)},
        'GIT_DOMAINS': {'type': str, 'default': 'github.com,bitbucket.org,gitlab.com'},
        'CHECK_SSL_VALIDITY': {'type': bool, 'default': True},
        # The {VERSION} and {WGET_VERSION} placeholders are filled in by the
        # derived WGET_USER_AGENT entry, which calls .format(**config) on this value.
        'WGET_USER_AGENT': {'type': str, 'default': 'ArchiveBox/{VERSION} (+https://github.com/pirate/ArchiveBox/) wget/{WGET_VERSION}'},
        'CHROME_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'},
        'COOKIES_FILE': {'type': str, 'default': None},
        'CHROME_USER_DATA_DIR': {'type': str, 'default': None},
        'CHROME_HEADLESS': {'type': bool, 'default': True},
        'CHROME_SANDBOX': {'type': bool, 'default': True},
    },
    'DEPENDENCY_CONFIG': {
        'USE_CURL': {'type': bool, 'default': True},
        'USE_WGET': {'type': bool, 'default': True},
        'USE_GIT': {'type': bool, 'default': True},
        'USE_CHROME': {'type': bool, 'default': True},
        'USE_YOUTUBEDL': {'type': bool, 'default': True},
        'CURL_BINARY': {'type': str, 'default': 'curl'},
        'GIT_BINARY': {'type': str, 'default': 'git'},
        'WGET_BINARY': {'type': str, 'default': 'wget'},
        'YOUTUBEDL_BINARY': {'type': str, 'default': 'youtube-dl'},
        # None here means "auto-detect": the derived CHROME_BINARY entry falls
        # back to find_chrome_binary() when no value is configured.
        'CHROME_BINARY': {'type': str, 'default': None},
    },
}
# Reverse lookup table: alias name -> canonical option name, built from the
# 'aliases' tuples declared in CONFIG_DEFAULTS (e.g. 'FETCH_TITLE' -> 'SAVE_TITLE').
CONFIG_ALIASES = {
    alias: key
    for section in CONFIG_DEFAULTS.values()
        for key, default in section.items()
            for alias in default.get('aliases', ())
}
# Set of every canonical user-settable option name across all sections.
USER_CONFIG = {key for section in CONFIG_DEFAULTS.values() for key in section.keys()}
  86. def get_real_name(key: str) -> str:
  87. return CONFIG_ALIASES.get(key.upper().strip(), key.upper().strip())
  88. ############################## Derived Config ##############################
  89. # Constants
# ANSI escape sequences used to colorize terminal output.
# NOTE(review): 'lightblue' and 'black' map to the same escape code (01;30) — confirm this is intended.
DEFAULT_CLI_COLORS = {
    'reset': '\033[00;00m',
    'lightblue': '\033[01;30m',
    'lightyellow': '\033[01;33m',
    'lightred': '\033[01;35m',
    'red': '\033[01;31m',
    'green': '\033[01;32m',
    'blue': '\033[01;34m',
    'white': '\033[01;37m',
    'black': '\033[01;30m',
}
# Same keys as DEFAULT_CLI_COLORS but with empty strings, i.e. the "no color" palette.
ANSI = {k: '' for k in DEFAULT_CLI_COLORS.keys()}

# Names of files and folders that the derived config and checkers join onto
# REPO_DIR / OUTPUT_DIR to build full paths.
VERSION_FILENAME = 'VERSION'
PYTHON_DIR_NAME = 'archivebox'
LEGACY_DIR_NAME = 'legacy'
TEMPLATES_DIR_NAME = 'templates'
ARCHIVE_DIR_NAME = 'archive'
SOURCES_DIR_NAME = 'sources'
LOGS_DIR_NAME = 'logs'
STATIC_DIR_NAME = 'static'
SQL_INDEX_FILENAME = 'index.sqlite3'
JSON_INDEX_FILENAME = 'index.json'
HTML_INDEX_FILENAME = 'index.html'
ROBOTS_TXT_FILENAME = 'robots.txt'
FAVICON_FILENAME = 'favicon.ico'
CONFIG_FILENAME = 'ArchiveBox.conf'

# Comment header written by write_config_file() when it creates a new config file.
CONFIG_HEADER = (
"""# This is the config file for your ArchiveBox collection.
#
# You can add options here manually in INI format, or automatically by running:
# archivebox config --set KEY=VALUE
#
# If you modify this file manually, make sure to update your archive after by running:
# archivebox init
#
# A list of all possible config with documentation and examples can be found here:
# https://github.com/pirate/ArchiveBox/wiki/Configuration
""")
# Options computed from the already-loaded user config.
#
# Entries have no 'type', so load_config_val() always calls 'default' with the
# config built so far. load_config() walks this dict in order and stores each
# result back into that same config, which means:
#   - an entry may only read keys defined in CONFIG_DEFAULTS or earlier in this dict
#   - an entry that reuses a user-config key (e.g. OUTPUT_DIR, USE_CURL, SAVE_PDF)
#     overwrites the raw user value with the normalized/derived one
# Do not reorder entries without checking what each lambda reads.
DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = {
    # The stored value is itself a zero-argument callable, so the width is
    # re-measured every time it is called rather than fixed at load time.
    'TERM_WIDTH': {'default': lambda c: lambda: shutil.get_terminal_size((100, 10)).columns},
    'USER': {'default': lambda c: getpass.getuser() or os.getlogin()},
    'ANSI': {'default': lambda c: DEFAULT_CLI_COLORS if c['USE_COLOR'] else {k: '' for k in DEFAULT_CLI_COLORS.keys()}},

    # Code locations: REPO_DIR is two directories above the folder containing this file.
    'REPO_DIR': {'default': lambda c: os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', '..'))},
    'PYTHON_DIR': {'default': lambda c: os.path.join(c['REPO_DIR'], PYTHON_DIR_NAME)},
    'LEGACY_DIR': {'default': lambda c: os.path.join(c['PYTHON_DIR'], LEGACY_DIR_NAME)},
    'TEMPLATES_DIR': {'default': lambda c: os.path.join(c['LEGACY_DIR'], TEMPLATES_DIR_NAME)},

    # Data locations: OUTPUT_DIR falls back to the current working directory when unset.
    'OUTPUT_DIR': {'default': lambda c: os.path.abspath(os.path.expanduser(c['OUTPUT_DIR'])) if c['OUTPUT_DIR'] else os.path.abspath(os.curdir)},
    'ARCHIVE_DIR': {'default': lambda c: os.path.join(c['OUTPUT_DIR'], ARCHIVE_DIR_NAME)},
    'SOURCES_DIR': {'default': lambda c: os.path.join(c['OUTPUT_DIR'], SOURCES_DIR_NAME)},
    'LOGS_DIR': {'default': lambda c: os.path.join(c['OUTPUT_DIR'], LOGS_DIR_NAME)},
    'CONFIG_FILE': {'default': lambda c: os.path.abspath(os.path.expanduser(c['CONFIG_FILE'])) if c['CONFIG_FILE'] else os.path.join(c['OUTPUT_DIR'], CONFIG_FILENAME)},
    'COOKIES_FILE': {'default': lambda c: c['COOKIES_FILE'] and os.path.abspath(os.path.expanduser(c['COOKIES_FILE']))},
    'CHROME_USER_DATA_DIR': {'default': lambda c: find_chrome_data_dir() if c['CHROME_USER_DATA_DIR'] is None else (os.path.abspath(os.path.expanduser(c['CHROME_USER_DATA_DIR'])) or None)},
    'URL_BLACKLIST_PTN': {'default': lambda c: c['URL_BLACKLIST'] and re.compile(c['URL_BLACKLIST'], re.IGNORECASE)},

    'ARCHIVEBOX_BINARY': {'default': lambda c: sys.argv[0]},
    # NOTE(review): the VERSION file is opened without an explicit close()/with block.
    'VERSION': {'default': lambda c: open(os.path.join(c['REPO_DIR'], VERSION_FILENAME), 'r').read().strip()},
    # NOTE(review): when VERSION contains no '+', split('+')[-1] is the whole
    # version string, so 'unknown' is only used for an empty suffix — confirm intent.
    'GIT_SHA': {'default': lambda c: c['VERSION'].split('+')[-1] or 'unknown'},

    'PYTHON_BINARY': {'default': lambda c: sys.executable},
    'PYTHON_ENCODING': {'default': lambda c: sys.stdout.encoding.upper()},
    'PYTHON_VERSION': {'default': lambda c: '{}.{}'.format(sys.version_info.major, sys.version_info.minor)},

    'DJANGO_BINARY': {'default': lambda c: django.__file__.replace('__init__.py', 'bin/django-admin.py')},
    'DJANGO_VERSION': {'default': lambda c: '{}.{}.{} {} ({})'.format(*django.VERSION)},

    # Each USE_<tool> flag is switched off when no archive method needing the tool
    # is enabled, and each SAVE_<method> flag is switched off when its tool is disabled.
    'USE_CURL': {'default': lambda c: c['USE_CURL'] and (c['SAVE_FAVICON'] or c['SAVE_ARCHIVE_DOT_ORG'])},
    'CURL_VERSION': {'default': lambda c: bin_version(c['CURL_BINARY']) if c['USE_CURL'] else None},
    'SAVE_FAVICON': {'default': lambda c: c['USE_CURL'] and c['SAVE_FAVICON']},
    'SAVE_ARCHIVE_DOT_ORG': {'default': lambda c: c['USE_CURL'] and c['SAVE_ARCHIVE_DOT_ORG']},

    'USE_WGET': {'default': lambda c: c['USE_WGET'] and (c['SAVE_WGET'] or c['SAVE_WARC'])},
    'WGET_VERSION': {'default': lambda c: bin_version(c['WGET_BINARY']) if c['USE_WGET'] else None},
    'WGET_AUTO_COMPRESSION': {'default': lambda c: wget_supports_compression(c) if c['USE_WGET'] else False},
    # Fills the {VERSION} / {WGET_VERSION} placeholders using the config loaded so far.
    'WGET_USER_AGENT': {'default': lambda c: c['WGET_USER_AGENT'].format(**c)},
    'SAVE_WGET': {'default': lambda c: c['USE_WGET'] and c['SAVE_WGET']},
    'SAVE_WARC': {'default': lambda c: c['USE_WGET'] and c['SAVE_WARC']},

    'USE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
    'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None},
    'SAVE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},

    'USE_YOUTUBEDL': {'default': lambda c: c['USE_YOUTUBEDL'] and c['SAVE_MEDIA']},
    'YOUTUBEDL_VERSION': {'default': lambda c: bin_version(c['YOUTUBEDL_BINARY']) if c['USE_YOUTUBEDL'] else None},
    'SAVE_MEDIA': {'default': lambda c: c['USE_YOUTUBEDL'] and c['SAVE_MEDIA']},

    'USE_CHROME': {'default': lambda c: c['USE_CHROME'] and (c['SAVE_PDF'] or c['SAVE_SCREENSHOT'] or c['SAVE_DOM'])},
    'CHROME_BINARY': {'default': lambda c: c['CHROME_BINARY'] if c['CHROME_BINARY'] else find_chrome_binary()},
    'CHROME_VERSION': {'default': lambda c: bin_version(c['CHROME_BINARY']) if c['USE_CHROME'] else None},
    'SAVE_PDF': {'default': lambda c: c['USE_CHROME'] and c['SAVE_PDF']},
    'SAVE_SCREENSHOT': {'default': lambda c: c['USE_CHROME'] and c['SAVE_SCREENSHOT']},
    'SAVE_DOM': {'default': lambda c: c['USE_CHROME'] and c['SAVE_DOM']},

    # Summary dicts built by the helper functions defined later in this file.
    'DEPENDENCIES': {'default': lambda c: get_dependency_info(c)},
    'CODE_LOCATIONS': {'default': lambda c: get_code_locations(c)},
    'CONFIG_LOCATIONS': {'default': lambda c: get_config_locations(c)},
    'DATA_LOCATIONS': {'default': lambda c: get_data_locations(c)},
    'CHROME_OPTIONS': {'default': lambda c: get_chrome_info(c)},
}
  180. ################################### Helpers ####################################
  181. def load_config_val(key: str,
  182. default: ConfigDefaultValue=None,
  183. type: Optional[Type]=None,
  184. aliases: Optional[Tuple[str, ...]]=None,
  185. config: Optional[ConfigDict]=None,
  186. env_vars: Optional[os._Environ]=None,
  187. config_file_vars: Optional[Dict[str, str]]=None) -> ConfigValue:
  188. config_keys_to_check = (key, *(aliases or ()))
  189. for key in config_keys_to_check:
  190. if env_vars:
  191. val = env_vars.get(key)
  192. if val:
  193. break
  194. if config_file_vars:
  195. val = config_file_vars.get(key)
  196. if val:
  197. break
  198. if type is None or val is None:
  199. if callable(default):
  200. assert isinstance(config, dict)
  201. return default(config)
  202. return default
  203. elif type is bool:
  204. if val.lower() in ('true', 'yes', '1'):
  205. return True
  206. elif val.lower() in ('false', 'no', '0'):
  207. return False
  208. else:
  209. raise ValueError(f'Invalid configuration option {key}={val} (expected a boolean: True/False)')
  210. elif type is str:
  211. if val.lower() in ('true', 'false', 'yes', 'no', '1', '0'):
  212. raise ValueError(f'Invalid configuration option {key}={val} (expected a string)')
  213. return val.strip()
  214. elif type is int:
  215. if not val.isdigit():
  216. raise ValueError(f'Invalid configuration option {key}={val} (expected an integer)')
  217. return int(val)
  218. raise Exception('Config values can only be str, bool, or int')
  219. def load_config_file(out_dir: str=None) -> Optional[Dict[str, str]]:
  220. """load the ini-formatted config file from OUTPUT_DIR/Archivebox.conf"""
  221. out_dir = out_dir or os.path.abspath(os.getenv('OUTPUT_DIR', '.'))
  222. config_path = os.path.join(out_dir, CONFIG_FILENAME)
  223. if os.path.exists(config_path):
  224. config_file = ConfigParser()
  225. config_file.optionxform = str
  226. config_file.read(config_path)
  227. # flatten into one namespace
  228. config_file_vars = {
  229. key.upper(): val
  230. for section, options in config_file.items()
  231. for key, val in options.items()
  232. }
  233. # print('[i] Loaded config file', os.path.abspath(config_path))
  234. # print(config_file_vars)
  235. return config_file_vars
  236. return None
  237. def write_config_file(config: Dict[str, str], out_dir: str=None) -> ConfigDict:
  238. """load the ini-formatted config file from OUTPUT_DIR/Archivebox.conf"""
  239. out_dir = out_dir or os.path.abspath(os.getenv('OUTPUT_DIR', '.'))
  240. config_path = os.path.join(out_dir, CONFIG_FILENAME)
  241. if not os.path.exists(config_path):
  242. with open(config_path, 'w+') as f:
  243. f.write(CONFIG_HEADER)
  244. if not config:
  245. return {}
  246. config_file = ConfigParser()
  247. config_file.optionxform = str
  248. config_file.read(config_path)
  249. find_section = lambda key: [name for name, opts in CONFIG_DEFAULTS.items() if key in opts][0]
  250. with open(f'{config_path}.old', 'w+') as old:
  251. with open(config_path, 'r') as new:
  252. old.write(new.read())
  253. with open(config_path, 'w+') as f:
  254. for key, val in config.items():
  255. section = find_section(key)
  256. if section in config_file:
  257. existing_config = dict(config_file[section])
  258. else:
  259. existing_config = {}
  260. config_file[section] = {**existing_config, key: val}
  261. config_file.write(f)
  262. try:
  263. CONFIG = load_all_config()
  264. return {
  265. key.upper(): CONFIG.get(key.upper())
  266. for key in config.keys()
  267. }
  268. except:
  269. with open(f'{config_path}.old', 'r') as old:
  270. with open(config_path, 'w+') as new:
  271. new.write(old.read())
  272. if os.path.exists(f'{config_path}.old'):
  273. os.remove(f'{config_path}.old')
  274. return {}
  275. def load_config(defaults: ConfigDefaultDict,
  276. config: Optional[ConfigDict]=None,
  277. out_dir: Optional[str]=None,
  278. env_vars: Optional[os._Environ]=None,
  279. config_file_vars: Optional[Dict[str, str]]=None) -> ConfigDict:
  280. env_vars = env_vars or os.environ
  281. config_file_vars = config_file_vars or load_config_file(out_dir=out_dir)
  282. extended_config: ConfigDict = config.copy() if config else {}
  283. for key, default in defaults.items():
  284. try:
  285. extended_config[key] = load_config_val(
  286. key,
  287. default=default['default'],
  288. type=default.get('type'),
  289. aliases=default.get('aliases'),
  290. config=extended_config,
  291. env_vars=env_vars,
  292. config_file_vars=config_file_vars,
  293. )
  294. except KeyboardInterrupt:
  295. raise SystemExit(0)
  296. except Exception as e:
  297. stderr()
  298. stderr(f'[X] Error while loading configuration value: {key}', color='red', config=extended_config)
  299. stderr(' {}: {}'.format(e.__class__.__name__, e))
  300. stderr()
  301. stderr(' Check your config for mistakes and try again (your archive data is unaffected).')
  302. stderr()
  303. stderr(' For config documentation and examples see:')
  304. stderr(' https://github.com/pirate/ArchiveBox/wiki/Configuration')
  305. stderr()
  306. raise SystemExit(2)
  307. return extended_config
  308. # def write_config(config: ConfigDict):
  309. # with open(os.path.join(config['OUTPUT_DIR'], CONFIG_FILENAME), 'w+') as f:
  310. def stderr(*args, color: Optional[str]=None, config: Optional[ConfigDict]=None) -> None:
  311. ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI
  312. if color:
  313. strs = [ansi[color], ' '.join(str(a) for a in args), ansi['reset'], '\n']
  314. else:
  315. strs = [' '.join(str(a) for a in args), '\n']
  316. sys.stderr.write(''.join(strs))
  317. def bin_version(binary: Optional[str]) -> Optional[str]:
  318. """check the presence and return valid version line of a specified binary"""
  319. abspath = bin_path(binary)
  320. if not abspath:
  321. return None
  322. try:
  323. version_str = run([abspath, "--version"], stdout=PIPE).stdout.strip().decode()
  324. # take first 3 columns of first line of version info
  325. return ' '.join(version_str.split('\n')[0].strip().split()[:3])
  326. except Exception:
  327. # stderr(f'[X] Unable to find working version of dependency: {binary}', color='red')
  328. # stderr(' Make sure it\'s installed, then confirm it\'s working by running:')
  329. # stderr(f' {binary} --version')
  330. # stderr()
  331. # stderr(' If you don\'t want to install it, you can disable it via config. See here for more info:')
  332. # stderr(' https://github.com/pirate/ArchiveBox/wiki/Install')
  333. # stderr()
  334. return None
  335. def bin_path(binary: Optional[str]) -> Optional[str]:
  336. if binary is None:
  337. return None
  338. return shutil.which(os.path.expanduser(binary)) or binary
  339. def bin_hash(binary: Optional[str]) -> Optional[str]:
  340. abs_path = bin_path(binary)
  341. if abs_path is None:
  342. return None
  343. file_hash = md5()
  344. with io.open(abs_path, mode='rb') as f:
  345. for chunk in iter(lambda: f.read(io.DEFAULT_BUFFER_SIZE), b''):
  346. file_hash.update(chunk)
  347. return f'md5:{file_hash.hexdigest()}'
  348. def find_chrome_binary() -> Optional[str]:
  349. """find any installed chrome binaries in the default locations"""
  350. # Precedence: Chromium, Chrome, Beta, Canary, Unstable, Dev
  351. # make sure data dir finding precedence order always matches binary finding order
  352. default_executable_paths = (
  353. 'chromium-browser',
  354. 'chromium',
  355. '/Applications/Chromium.app/Contents/MacOS/Chromium',
  356. 'google-chrome',
  357. '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
  358. 'google-chrome-stable',
  359. 'google-chrome-beta',
  360. 'google-chrome-canary',
  361. '/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary',
  362. 'google-chrome-unstable',
  363. 'google-chrome-dev',
  364. )
  365. for name in default_executable_paths:
  366. full_path_exists = shutil.which(name)
  367. if full_path_exists:
  368. return name
  369. stderr('[X] Unable to find a working version of Chrome/Chromium, is it installed and in your $PATH?', color='red')
  370. stderr()
  371. return None
  372. def find_chrome_data_dir() -> Optional[str]:
  373. """find any installed chrome user data directories in the default locations"""
  374. # Precedence: Chromium, Chrome, Beta, Canary, Unstable, Dev
  375. # make sure data dir finding precedence order always matches binary finding order
  376. default_profile_paths = (
  377. '~/.config/chromium',
  378. '~/Library/Application Support/Chromium',
  379. '~/AppData/Local/Chromium/User Data',
  380. '~/.config/google-chrome',
  381. '~/Library/Application Support/Google/Chrome',
  382. '~/AppData/Local/Google/Chrome/User Data',
  383. '~/.config/google-chrome-stable',
  384. '~/.config/google-chrome-beta',
  385. '~/Library/Application Support/Google/Chrome Canary',
  386. '~/AppData/Local/Google/Chrome SxS/User Data',
  387. '~/.config/google-chrome-unstable',
  388. '~/.config/google-chrome-dev',
  389. )
  390. for path in default_profile_paths:
  391. full_path = os.path.expanduser(path)
  392. if os.path.exists(full_path):
  393. return full_path
  394. return None
  395. def wget_supports_compression(config):
  396. cmd = [
  397. config['WGET_BINARY'],
  398. "--compression=auto",
  399. "--help",
  400. ]
  401. return not run(cmd, stdout=DEVNULL, stderr=DEVNULL).returncode
  402. def get_code_locations(config: ConfigDict) -> SimpleConfigValueDict:
  403. return {
  404. 'REPO_DIR': {
  405. 'path': os.path.abspath(config['REPO_DIR']),
  406. 'enabled': True,
  407. 'is_valid': os.path.exists(os.path.join(config['REPO_DIR'], '.github')),
  408. },
  409. 'PYTHON_DIR': {
  410. 'path': os.path.abspath(config['PYTHON_DIR']),
  411. 'enabled': True,
  412. 'is_valid': os.path.exists(os.path.join(config['PYTHON_DIR'], '__main__.py')),
  413. },
  414. 'LEGACY_DIR': {
  415. 'path': os.path.abspath(config['LEGACY_DIR']),
  416. 'enabled': True,
  417. 'is_valid': os.path.exists(os.path.join(config['LEGACY_DIR'], 'util.py')),
  418. },
  419. 'TEMPLATES_DIR': {
  420. 'path': os.path.abspath(config['TEMPLATES_DIR']),
  421. 'enabled': True,
  422. 'is_valid': os.path.exists(os.path.join(config['TEMPLATES_DIR'], 'static')),
  423. },
  424. }
  425. def get_config_locations(config: ConfigDict) -> ConfigValue:
  426. abspath = lambda path: None if path is None else os.path.abspath(path)
  427. return {
  428. 'CONFIG_FILE': {
  429. 'path': abspath(config['CHROME_USER_DATA_DIR']),
  430. 'enabled': config['USE_CHROME'] and config['CHROME_USER_DATA_DIR'],
  431. 'is_valid': False if config['CHROME_USER_DATA_DIR'] is None else os.path.exists(os.path.join(config['CHROME_USER_DATA_DIR'], 'Default')),
  432. },
  433. 'CHROME_USER_DATA_DIR': {
  434. 'path': abspath(config['CHROME_USER_DATA_DIR']),
  435. 'enabled': config['USE_CHROME'] and config['CHROME_USER_DATA_DIR'],
  436. 'is_valid': False if config['CHROME_USER_DATA_DIR'] is None else os.path.exists(os.path.join(config['CHROME_USER_DATA_DIR'], 'Default')),
  437. },
  438. 'COOKIES_FILE': {
  439. 'path': abspath(config['COOKIES_FILE']),
  440. 'enabled': config['USE_WGET'] and config['COOKIES_FILE'],
  441. 'is_valid': False if config['COOKIES_FILE'] is None else os.path.exists(config['COOKIES_FILE']),
  442. },
  443. }
  444. def get_data_locations(config: ConfigDict) -> ConfigValue:
  445. return {
  446. 'OUTPUT_DIR': {
  447. 'path': os.path.abspath(config['OUTPUT_DIR']),
  448. 'enabled': True,
  449. 'is_valid': os.path.exists(os.path.join(config['OUTPUT_DIR'], JSON_INDEX_FILENAME)),
  450. },
  451. 'SOURCES_DIR': {
  452. 'path': os.path.abspath(config['SOURCES_DIR']),
  453. 'enabled': True,
  454. 'is_valid': os.path.exists(config['SOURCES_DIR']),
  455. },
  456. 'LOGS_DIR': {
  457. 'path': os.path.abspath(config['LOGS_DIR']),
  458. 'enabled': True,
  459. 'is_valid': os.path.exists(config['LOGS_DIR']),
  460. },
  461. 'ARCHIVE_DIR': {
  462. 'path': os.path.abspath(config['ARCHIVE_DIR']),
  463. 'enabled': True,
  464. 'is_valid': os.path.exists(config['ARCHIVE_DIR']),
  465. },
  466. 'SQL_INDEX': {
  467. 'path': os.path.abspath(os.path.join(config['OUTPUT_DIR'], JSON_INDEX_FILENAME)),
  468. 'enabled': True,
  469. 'is_valid': os.path.exists(os.path.join(config['OUTPUT_DIR'], JSON_INDEX_FILENAME)),
  470. },
  471. }
  472. def get_dependency_info(config: ConfigDict) -> ConfigValue:
  473. return {
  474. 'PYTHON_BINARY': {
  475. 'path': bin_path(config['PYTHON_BINARY']),
  476. 'version': config['PYTHON_VERSION'],
  477. 'hash': bin_hash(config['PYTHON_BINARY']),
  478. 'enabled': True,
  479. 'is_valid': bool(config['DJANGO_VERSION']),
  480. },
  481. 'DJANGO_BINARY': {
  482. 'path': bin_path(config['DJANGO_BINARY']),
  483. 'version': config['DJANGO_VERSION'],
  484. 'hash': bin_hash(config['DJANGO_BINARY']),
  485. 'enabled': True,
  486. 'is_valid': bool(config['DJANGO_VERSION']),
  487. },
  488. 'CURL_BINARY': {
  489. 'path': bin_path(config['CURL_BINARY']),
  490. 'version': config['CURL_VERSION'],
  491. 'hash': bin_hash(config['PYTHON_BINARY']),
  492. 'enabled': config['USE_CURL'],
  493. 'is_valid': bool(config['CURL_VERSION']),
  494. },
  495. 'WGET_BINARY': {
  496. 'path': bin_path(config['WGET_BINARY']),
  497. 'version': config['WGET_VERSION'],
  498. 'hash': bin_hash(config['WGET_BINARY']),
  499. 'enabled': config['USE_WGET'],
  500. 'is_valid': bool(config['WGET_VERSION']),
  501. },
  502. 'GIT_BINARY': {
  503. 'path': bin_path(config['GIT_BINARY']),
  504. 'version': config['GIT_VERSION'],
  505. 'hash': bin_hash(config['GIT_BINARY']),
  506. 'enabled': config['USE_GIT'],
  507. 'is_valid': bool(config['GIT_VERSION']),
  508. },
  509. 'YOUTUBEDL_BINARY': {
  510. 'path': bin_path(config['YOUTUBEDL_BINARY']),
  511. 'version': config['YOUTUBEDL_VERSION'],
  512. 'hash': bin_hash(config['YOUTUBEDL_BINARY']),
  513. 'enabled': config['USE_YOUTUBEDL'],
  514. 'is_valid': bool(config['YOUTUBEDL_VERSION']),
  515. },
  516. 'CHROME_BINARY': {
  517. 'path': bin_path(config['CHROME_BINARY']),
  518. 'version': config['CHROME_VERSION'],
  519. 'hash': bin_hash(config['CHROME_BINARY']),
  520. 'enabled': config['USE_CHROME'],
  521. 'is_valid': bool(config['CHROME_VERSION']),
  522. },
  523. }
  524. def get_chrome_info(config: ConfigDict) -> ConfigValue:
  525. return {
  526. 'TIMEOUT': config['TIMEOUT'],
  527. 'RESOLUTION': config['RESOLUTION'],
  528. 'CHECK_SSL_VALIDITY': config['CHECK_SSL_VALIDITY'],
  529. 'CHROME_BINARY': config['CHROME_BINARY'],
  530. 'CHROME_HEADLESS': config['CHROME_HEADLESS'],
  531. 'CHROME_SANDBOX': config['CHROME_SANDBOX'],
  532. 'CHROME_USER_AGENT': config['CHROME_USER_AGENT'],
  533. 'CHROME_USER_DATA_DIR': config['CHROME_USER_DATA_DIR'],
  534. }
  535. ################################## Load Config #################################
  536. def load_all_config():
  537. CONFIG: ConfigDict = {}
  538. for section_name, section_config in CONFIG_DEFAULTS.items():
  539. CONFIG = load_config(section_config, CONFIG)
  540. return load_config(DERIVED_CONFIG_DEFAULTS, CONFIG)
# Load the complete config once, at import time. Invalid options make
# load_config() print an error and raise SystemExit while this module is imported.
CONFIG = load_all_config()
# Also expose every option as a module-level global, alongside the CONFIG dict.
globals().update(CONFIG)
  543. ############################## Importable Checkers #############################
  544. def check_system_config(config: ConfigDict=CONFIG) -> None:
  545. ### Check system environment
  546. if config['USER'] == 'root':
  547. stderr('[!] ArchiveBox should never be run as root!', color='red')
  548. stderr(' For more information, see the security overview documentation:')
  549. stderr(' https://github.com/pirate/ArchiveBox/wiki/Security-Overview#do-not-run-as-root')
  550. raise SystemExit(2)
  551. ### Check Python environment
  552. if float(config['PYTHON_VERSION']) < 3.6:
  553. stderr(f'[X] Python version is not new enough: {config["PYTHON_VERSION"]} (>3.6 is required)', color='red')
  554. stderr(' See https://github.com/pirate/ArchiveBox/wiki/Troubleshooting#python for help upgrading your Python installation.')
  555. raise SystemExit(2)
  556. if config['PYTHON_ENCODING'] not in ('UTF-8', 'UTF8'):
  557. stderr(f'[X] Your system is running python3 scripts with a bad locale setting: {config["PYTHON_ENCODING"]} (it should be UTF-8).', color='red')
  558. stderr(' To fix it, add the line "export PYTHONIOENCODING=UTF-8" to your ~/.bashrc file (without quotes)')
  559. stderr(' Or if you\'re using ubuntu/debian, run "dpkg-reconfigure locales"')
  560. stderr('')
  561. stderr(' Confirm that it\'s fixed by opening a new shell and running:')
  562. stderr(' python3 -c "import sys; print(sys.stdout.encoding)" # should output UTF-8')
  563. raise SystemExit(2)
  564. # stderr('[i] Using Chrome binary: {}'.format(shutil.which(CHROME_BINARY) or CHROME_BINARY))
  565. # stderr('[i] Using Chrome data dir: {}'.format(os.path.abspath(CHROME_USER_DATA_DIR)))
  566. if config['CHROME_USER_DATA_DIR'] is not None:
  567. if not os.path.exists(os.path.join(config['CHROME_USER_DATA_DIR'], 'Default')):
  568. stderr('[X] Could not find profile "Default" in CHROME_USER_DATA_DIR.', color='red')
  569. stderr(f' {config["CHROME_USER_DATA_DIR"]}')
  570. stderr(' Make sure you set it to a Chrome user data directory containing a Default profile folder.')
  571. stderr(' For more info see:')
  572. stderr(' https://github.com/pirate/ArchiveBox/wiki/Configuration#CHROME_USER_DATA_DIR')
  573. if 'Default' in config['CHROME_USER_DATA_DIR']:
  574. stderr()
  575. stderr(' Try removing /Default from the end e.g.:')
  576. stderr(' CHROME_USER_DATA_DIR="{}"'.format(config['CHROME_USER_DATA_DIR'].split('/Default')[0]))
  577. raise SystemExit(2)
  578. def check_dependencies(config: ConfigDict=CONFIG, show_help: bool=True) -> None:
  579. invalid = [
  580. '{}: {} ({})'.format(name, info['path'] or 'unable to find binary', info['version'] or 'unable to detect version')
  581. for name, info in config['DEPENDENCIES'].items()
  582. if info['enabled'] and not info['is_valid']
  583. ]
  584. if invalid:
  585. stderr('[X] Missing some required dependencies.', color='red')
  586. stderr()
  587. stderr(' {}'.format('\n '.join(invalid)))
  588. if show_help:
  589. stderr()
  590. stderr(' To get more info on dependency status run:')
  591. stderr(' archivebox --version')
  592. raise SystemExit(2)
  593. if config['TIMEOUT'] < 5:
  594. stderr()
  595. stderr(f'[!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={config["TIMEOUT"]} seconds)', color='red')
  596. stderr(' You must allow *at least* 5 seconds for indexing and archive methods to run succesfully.')
  597. stderr(' (Setting it to somewhere between 30 and 300 seconds is recommended)')
  598. stderr()
  599. stderr(' If you want to make ArchiveBox run faster, disable specific archive methods instead:')
  600. stderr(' https://github.com/pirate/ArchiveBox/wiki/Configuration#archive-method-toggles')
  601. elif config['USE_CHROME'] and config['TIMEOUT'] < 15:
  602. stderr()
  603. stderr(f'[!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={config["TIMEOUT"]} seconds)', color='red')
  604. stderr(' Chrome will fail to archive all sites if set to less than ~15 seconds.')
  605. stderr(' (Setting it to somewhere between 30 and 300 seconds is recommended)')
  606. stderr()
  607. stderr(' If you want to make ArchiveBox run faster, disable specific archive methods instead:')
  608. stderr(' https://github.com/pirate/ArchiveBox/wiki/Configuration#archive-method-toggles')
  609. if config['USE_YOUTUBEDL'] and config['MEDIA_TIMEOUT'] < 20:
  610. stderr()
  611. stderr(f'[!] Warning: MEDIA_TIMEOUT is set too low! (currently set to MEDIA_TIMEOUT={config["MEDIA_TIMEOUT"]} seconds)', color='red')
  612. stderr(' Youtube-dl will fail to archive all media if set to less than ~20 seconds.')
  613. stderr(' (Setting it somewhere over 60 seconds is recommended)')
  614. stderr()
  615. stderr(' If you want to disable media archiving entirely, set SAVE_MEDIA=False instead:')
  616. stderr(' https://github.com/pirate/ArchiveBox/wiki/Configuration#save_media')
  617. def check_data_folder(out_dir: Optional[str]=None, config: ConfigDict=CONFIG) -> None:
  618. output_dir = out_dir or config['OUTPUT_DIR']
  619. assert isinstance(output_dir, str)
  620. json_index_exists = os.path.exists(os.path.join(output_dir, JSON_INDEX_FILENAME))
  621. if not json_index_exists:
  622. stderr('[X] No archive index was found in current directory.', color='red')
  623. stderr(f' {output_dir}')
  624. stderr()
  625. stderr(' Are you running archivebox in the right folder?')
  626. stderr(' cd path/to/your/archive/folder')
  627. stderr(' archivebox [command]')
  628. stderr()
  629. stderr(' To create a new archive collection or import existing data in this folder, run:')
  630. stderr(' archivebox init')
  631. raise SystemExit(2)
  632. sql_index_exists = os.path.exists(os.path.join(output_dir, SQL_INDEX_FILENAME))
  633. from .storage.sql import list_migrations
  634. pending_migrations = [name for status, name in list_migrations() if not status]
  635. if (not sql_index_exists) or pending_migrations:
  636. if sql_index_exists:
  637. pending_operation = f'apply the {len(pending_migrations)} pending migrations'
  638. else:
  639. pending_operation = 'generate the new SQL main index'
  640. stderr('[X] This collection was created with an older version of ArchiveBox and must be upgraded first.', color='lightyellow')
  641. stderr(f' {output_dir}')
  642. stderr()
  643. stderr(f' To upgrade it to the latest version and {pending_operation} run:')
  644. stderr(' archivebox init')
  645. raise SystemExit(3)
  646. def setup_django(out_dir: str=None, check_db=False, config: ConfigDict=CONFIG) -> None:
  647. output_dir = out_dir or config['OUTPUT_DIR']
  648. assert isinstance(output_dir, str) and isinstance(config['PYTHON_DIR'], str)
  649. try:
  650. import django
  651. sys.path.append(config['PYTHON_DIR'])
  652. os.environ.setdefault('OUTPUT_DIR', output_dir)
  653. os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings')
  654. django.setup()
  655. if check_db:
  656. sql_index_path = os.path.join(output_dir, SQL_INDEX_FILENAME)
  657. assert os.path.exists(sql_index_path), (
  658. f'No database file {SQL_INDEX_FILENAME} found in OUTPUT_DIR: {config["OUTPUT_DIR"]}')
  659. except KeyboardInterrupt:
  660. raise SystemExit(2)
# Run the system checks with the default CONFIG as soon as this module is
# imported; a failed check prints help to stderr and raises SystemExit(2).
check_system_config()