config.py 46 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001
  1. __package__ = 'archivebox'
  2. import os
  3. import io
  4. import re
  5. import sys
  6. import json
  7. import getpass
  8. import shutil
  9. import django
  10. from hashlib import md5
  11. from pathlib import Path
  12. from typing import Optional, Type, Tuple, Dict, Union, List
  13. from subprocess import run, PIPE, DEVNULL
  14. from configparser import ConfigParser
  15. from collections import defaultdict
  16. from .config_stubs import (
  17. SimpleConfigValueDict,
  18. ConfigValue,
  19. ConfigDict,
  20. ConfigDefaultValue,
  21. ConfigDefaultDict,
  22. )
  23. # precedence order for config:
  24. # 1. cli args (e.g. )
  25. # 2. shell environment vars (env USE_COLOR=False archivebox add '...')
  26. # 3. config file (echo "SAVE_FAVICON=False" >> ArchiveBox.conf)
  27. # 4. defaults (defined below in Python)
  28. #
  29. # env SHOW_PROGRESS=1 archivebox add '...'
  30. # archivebox config --set TIMEOUT=600
  31. #
  32. # ******************************************************************************
  33. # Documentation: https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration
  34. # Use the 'env' command to pass config options to ArchiveBox. e.g.:
  35. # env USE_COLOR=True CHROME_BINARY=chromium archivebox add < example.html
  36. # ******************************************************************************
  37. ################################# User Config ##################################
  38. CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
  39. 'SHELL_CONFIG': {
  40. 'IS_TTY': {'type': bool, 'default': lambda _: sys.stdout.isatty()},
  41. 'USE_COLOR': {'type': bool, 'default': lambda c: c['IS_TTY']},
  42. 'SHOW_PROGRESS': {'type': bool, 'default': lambda c: c['IS_TTY']},
  43. 'IN_DOCKER': {'type': bool, 'default': False},
  44. # TODO: 'SHOW_HINTS': {'type: bool, 'default': True},
  45. },
  46. 'GENERAL_CONFIG': {
  47. 'OUTPUT_DIR': {'type': str, 'default': None},
  48. 'CONFIG_FILE': {'type': str, 'default': None},
  49. 'ONLY_NEW': {'type': bool, 'default': True},
  50. 'TIMEOUT': {'type': int, 'default': 60},
  51. 'MEDIA_TIMEOUT': {'type': int, 'default': 3600},
  52. 'OUTPUT_PERMISSIONS': {'type': str, 'default': '755'},
  53. 'RESTRICT_FILE_NAMES': {'type': str, 'default': 'windows'},
  54. 'URL_BLACKLIST': {'type': str, 'default': r'\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$'}, # to avoid downloading code assets as their own pages
  55. },
  56. 'SERVER_CONFIG': {
  57. 'SECRET_KEY': {'type': str, 'default': None},
  58. 'BIND_ADDR': {'type': str, 'default': lambda c: ['127.0.0.1:8000', '0.0.0.0:8000'][c['IN_DOCKER']]},
  59. 'ALLOWED_HOSTS': {'type': str, 'default': '*'},
  60. 'DEBUG': {'type': bool, 'default': False},
  61. 'PUBLIC_INDEX': {'type': bool, 'default': True},
  62. 'PUBLIC_SNAPSHOTS': {'type': bool, 'default': True},
  63. 'PUBLIC_ADD_VIEW': {'type': bool, 'default': False},
  64. 'FOOTER_INFO': {'type': str, 'default': 'Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests.'},
  65. 'ACTIVE_THEME': {'type': str, 'default': 'default'},
  66. },
  67. 'ARCHIVE_METHOD_TOGGLES': {
  68. 'SAVE_TITLE': {'type': bool, 'default': True, 'aliases': ('FETCH_TITLE',)},
  69. 'SAVE_FAVICON': {'type': bool, 'default': True, 'aliases': ('FETCH_FAVICON',)},
  70. 'SAVE_WGET': {'type': bool, 'default': True, 'aliases': ('FETCH_WGET',)},
  71. 'SAVE_WGET_REQUISITES': {'type': bool, 'default': True, 'aliases': ('FETCH_WGET_REQUISITES',)},
  72. 'SAVE_SINGLEFILE': {'type': bool, 'default': True, 'aliases': ('FETCH_SINGLEFILE',)},
  73. 'SAVE_READABILITY': {'type': bool, 'default': True, 'aliases': ('FETCH_READABILITY',)},
  74. 'SAVE_MERCURY': {'type': bool, 'default': True, 'aliases': ('FETCH_MERCURY',)},
  75. 'SAVE_PDF': {'type': bool, 'default': True, 'aliases': ('FETCH_PDF',)},
  76. 'SAVE_SCREENSHOT': {'type': bool, 'default': True, 'aliases': ('FETCH_SCREENSHOT',)},
  77. 'SAVE_DOM': {'type': bool, 'default': True, 'aliases': ('FETCH_DOM',)},
  78. 'SAVE_HEADERS': {'type': bool, 'default': True, 'aliases': ('FETCH_HEADERS',)},
  79. 'SAVE_WARC': {'type': bool, 'default': True, 'aliases': ('FETCH_WARC',)},
  80. 'SAVE_GIT': {'type': bool, 'default': True, 'aliases': ('FETCH_GIT',)},
  81. 'SAVE_MEDIA': {'type': bool, 'default': True, 'aliases': ('FETCH_MEDIA',)},
  82. 'SAVE_ARCHIVE_DOT_ORG': {'type': bool, 'default': True, 'aliases': ('SUBMIT_ARCHIVE_DOT_ORG',)},
  83. },
  84. 'ARCHIVE_METHOD_OPTIONS': {
  85. 'RESOLUTION': {'type': str, 'default': '1440,2000', 'aliases': ('SCREENSHOT_RESOLUTION',)},
  86. 'GIT_DOMAINS': {'type': str, 'default': 'github.com,bitbucket.org,gitlab.com'},
  87. 'CHECK_SSL_VALIDITY': {'type': bool, 'default': True},
  88. 'CURL_USER_AGENT': {'type': str, 'default': 'ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) curl/{CURL_VERSION}'},
  89. 'WGET_USER_AGENT': {'type': str, 'default': 'ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) wget/{WGET_VERSION}'},
  90. 'CHROME_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'},
  91. 'COOKIES_FILE': {'type': str, 'default': None},
  92. 'CHROME_USER_DATA_DIR': {'type': str, 'default': None},
  93. 'CHROME_HEADLESS': {'type': bool, 'default': True},
  94. 'CHROME_SANDBOX': {'type': bool, 'default': lambda c: not c['IN_DOCKER']},
  95. 'YOUTUBEDL_ARGS': {'type': list, 'default': ['--write-description',
  96. '--write-info-json',
  97. '--write-annotations',
  98. '--write-thumbnail',
  99. '--no-call-home',
  100. '--user-agent',
  101. '--all-subs',
  102. '--extract-audio',
  103. '--keep-video',
  104. '--ignore-errors',
  105. '--geo-bypass',
  106. '--audio-format', 'mp3',
  107. '--audio-quality', '320K',
  108. '--embed-thumbnail',
  109. '--add-metadata']},
  110. 'WGET_ARGS': {'type': list, 'default': ['--no-verbose',
  111. '--adjust-extension',
  112. '--convert-links',
  113. '--force-directories',
  114. '--backup-converted',
  115. '--span-hosts',
  116. '--no-parent',
  117. '-e', 'robots=off',
  118. ]},
  119. 'CURL_ARGS': {'type': list, 'default': ['--silent',
  120. '--location',
  121. '--compressed'
  122. ]},
  123. 'GIT_ARGS': {'type': list, 'default': ['--recursive']},
  124. },
  125. 'DEPENDENCY_CONFIG': {
  126. 'USE_CURL': {'type': bool, 'default': True},
  127. 'USE_WGET': {'type': bool, 'default': True},
  128. 'USE_SINGLEFILE': {'type': bool, 'default': True},
  129. 'USE_READABILITY': {'type': bool, 'default': True},
  130. 'USE_MERCURY': {'type': bool, 'default': True},
  131. 'USE_GIT': {'type': bool, 'default': True},
  132. 'USE_CHROME': {'type': bool, 'default': True},
  133. 'USE_NODE': {'type': bool, 'default': True},
  134. 'USE_YOUTUBEDL': {'type': bool, 'default': True},
  135. 'CURL_BINARY': {'type': str, 'default': 'curl'},
  136. 'GIT_BINARY': {'type': str, 'default': 'git'},
  137. 'WGET_BINARY': {'type': str, 'default': 'wget'},
  138. 'SINGLEFILE_BINARY': {'type': str, 'default': 'single-file'},
  139. 'READABILITY_BINARY': {'type': str, 'default': 'readability-extractor'},
  140. 'MERCURY_BINARY': {'type': str, 'default': 'mercury-parser'},
  141. 'YOUTUBEDL_BINARY': {'type': str, 'default': 'youtube-dl'},
  142. 'NODE_BINARY': {'type': str, 'default': 'node'},
  143. 'CHROME_BINARY': {'type': str, 'default': None},
  144. },
  145. }
  146. # for backwards compatibility with old config files, check old/deprecated names for each key
  147. CONFIG_ALIASES = {
  148. alias: key
  149. for section in CONFIG_DEFAULTS.values()
  150. for key, default in section.items()
  151. for alias in default.get('aliases', ())
  152. }
  153. USER_CONFIG = {key for section in CONFIG_DEFAULTS.values() for key in section.keys()}
  154. def get_real_name(key: str) -> str:
  155. return CONFIG_ALIASES.get(key.upper().strip(), key.upper().strip())
  156. ############################## Derived Config ##############################
  157. # Constants
  158. DEFAULT_CLI_COLORS = {
  159. 'reset': '\033[00;00m',
  160. 'lightblue': '\033[01;30m',
  161. 'lightyellow': '\033[01;33m',
  162. 'lightred': '\033[01;35m',
  163. 'red': '\033[01;31m',
  164. 'green': '\033[01;32m',
  165. 'blue': '\033[01;34m',
  166. 'white': '\033[01;37m',
  167. 'black': '\033[01;30m',
  168. }
  169. ANSI = {k: '' for k in DEFAULT_CLI_COLORS.keys()}
  170. COLOR_DICT = defaultdict(lambda: [(0, 0, 0), (0, 0, 0)], {
  171. '00': [(0, 0, 0), (0, 0, 0)],
  172. '30': [(0, 0, 0), (0, 0, 0)],
  173. '31': [(255, 0, 0), (128, 0, 0)],
  174. '32': [(0, 200, 0), (0, 128, 0)],
  175. '33': [(255, 255, 0), (128, 128, 0)],
  176. '34': [(0, 0, 255), (0, 0, 128)],
  177. '35': [(255, 0, 255), (128, 0, 128)],
  178. '36': [(0, 255, 255), (0, 128, 128)],
  179. '37': [(255, 255, 255), (255, 255, 255)],
  180. })
  181. STATICFILE_EXTENSIONS = {
  182. # 99.999% of the time, URLs ending in these extensions are static files
  183. # that can be downloaded as-is, not html pages that need to be rendered
  184. 'gif', 'jpeg', 'jpg', 'png', 'tif', 'tiff', 'wbmp', 'ico', 'jng', 'bmp',
  185. 'svg', 'svgz', 'webp', 'ps', 'eps', 'ai',
  186. 'mp3', 'mp4', 'm4a', 'mpeg', 'mpg', 'mkv', 'mov', 'webm', 'm4v',
  187. 'flv', 'wmv', 'avi', 'ogg', 'ts', 'm3u8',
  188. 'pdf', 'txt', 'rtf', 'rtfd', 'doc', 'docx', 'ppt', 'pptx', 'xls', 'xlsx',
  189. 'atom', 'rss', 'css', 'js', 'json',
  190. 'dmg', 'iso', 'img',
  191. 'rar', 'war', 'hqx', 'zip', 'gz', 'bz2', '7z',
  192. # Less common extensions to consider adding later
  193. # jar, swf, bin, com, exe, dll, deb
  194. # ear, hqx, eot, wmlc, kml, kmz, cco, jardiff, jnlp, run, msi, msp, msm,
  195. # pl pm, prc pdb, rar, rpm, sea, sit, tcl tk, der, pem, crt, xpi, xspf,
  196. # ra, mng, asx, asf, 3gpp, 3gp, mid, midi, kar, jad, wml, htc, mml
  197. # These are always treated as pages, not as static files, never add them:
  198. # html, htm, shtml, xhtml, xml, aspx, php, cgi
  199. }
  200. PACKAGE_DIR_NAME = 'archivebox'
  201. TEMPLATES_DIR_NAME = 'themes'
  202. ARCHIVE_DIR_NAME = 'archive'
  203. SOURCES_DIR_NAME = 'sources'
  204. LOGS_DIR_NAME = 'logs'
  205. STATIC_DIR_NAME = 'static'
  206. SQL_INDEX_FILENAME = 'index.sqlite3'
  207. JSON_INDEX_FILENAME = 'index.json'
  208. HTML_INDEX_FILENAME = 'index.html'
  209. ROBOTS_TXT_FILENAME = 'robots.txt'
  210. FAVICON_FILENAME = 'favicon.ico'
  211. CONFIG_FILENAME = 'ArchiveBox.conf'
  212. CONFIG_HEADER = (
  213. """# This is the config file for your ArchiveBox collection.
  214. #
  215. # You can add options here manually in INI format, or automatically by running:
  216. # archivebox config --set KEY=VALUE
  217. #
  218. # If you modify this file manually, make sure to update your archive after by running:
  219. # archivebox init
  220. #
  221. # A list of all possible config with documentation and examples can be found here:
  222. # https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration
  223. """)
  224. DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = {
  225. 'TERM_WIDTH': {'default': lambda c: lambda: shutil.get_terminal_size((100, 10)).columns},
  226. 'USER': {'default': lambda c: getpass.getuser() or os.getlogin()},
  227. 'ANSI': {'default': lambda c: DEFAULT_CLI_COLORS if c['USE_COLOR'] else {k: '' for k in DEFAULT_CLI_COLORS.keys()}},
  228. 'PACKAGE_DIR': {'default': lambda c: Path(__file__).resolve().parent},
  229. 'TEMPLATES_DIR': {'default': lambda c: c['PACKAGE_DIR'] / TEMPLATES_DIR_NAME / 'legacy'},
  230. 'OUTPUT_DIR': {'default': lambda c: Path(c['OUTPUT_DIR']).resolve() if c['OUTPUT_DIR'] else Path(os.curdir).resolve()},
  231. 'ARCHIVE_DIR': {'default': lambda c: c['OUTPUT_DIR'] / ARCHIVE_DIR_NAME},
  232. 'SOURCES_DIR': {'default': lambda c: c['OUTPUT_DIR'] / SOURCES_DIR_NAME},
  233. 'LOGS_DIR': {'default': lambda c: c['OUTPUT_DIR'] / LOGS_DIR_NAME},
  234. 'CONFIG_FILE': {'default': lambda c: Path(c['CONFIG_FILE']).resolve() if c['CONFIG_FILE'] else c['OUTPUT_DIR'] / CONFIG_FILENAME},
  235. 'COOKIES_FILE': {'default': lambda c: c['COOKIES_FILE'] and Path(c['COOKIES_FILE']).resolve()},
  236. 'CHROME_USER_DATA_DIR': {'default': lambda c: find_chrome_data_dir() if c['CHROME_USER_DATA_DIR'] is None else (Path(c['CHROME_USER_DATA_DIR']).resolve() if c['CHROME_USER_DATA_DIR'] else None)}, # None means unset, so we autodetect it with find_chrome_Data_dir(), but emptystring '' means user manually set it to '', and we should store it as None
  237. 'URL_BLACKLIST_PTN': {'default': lambda c: c['URL_BLACKLIST'] and re.compile(c['URL_BLACKLIST'] or '', re.IGNORECASE | re.UNICODE | re.MULTILINE)},
  238. 'ARCHIVEBOX_BINARY': {'default': lambda c: sys.argv[0]},
  239. 'VERSION': {'default': lambda c: json.loads((Path(c['PACKAGE_DIR']) / 'package.json').read_text().strip())['version']},
  240. 'GIT_SHA': {'default': lambda c: c['VERSION'].split('+')[-1] or 'unknown'},
  241. 'PYTHON_BINARY': {'default': lambda c: sys.executable},
  242. 'PYTHON_ENCODING': {'default': lambda c: sys.stdout.encoding.upper()},
  243. 'PYTHON_VERSION': {'default': lambda c: '{}.{}.{}'.format(*sys.version_info[:3])},
  244. 'DJANGO_BINARY': {'default': lambda c: django.__file__.replace('__init__.py', 'bin/django-admin.py')},
  245. 'DJANGO_VERSION': {'default': lambda c: '{}.{}.{} {} ({})'.format(*django.VERSION)},
  246. 'USE_CURL': {'default': lambda c: c['USE_CURL'] and (c['SAVE_FAVICON'] or c['SAVE_TITLE'] or c['SAVE_ARCHIVE_DOT_ORG'])},
  247. 'CURL_VERSION': {'default': lambda c: bin_version(c['CURL_BINARY']) if c['USE_CURL'] else None},
  248. 'CURL_USER_AGENT': {'default': lambda c: c['CURL_USER_AGENT'].format(**c)},
  249. 'CURL_ARGS': {'default': lambda c: c['CURL_ARGS'] or []},
  250. 'SAVE_FAVICON': {'default': lambda c: c['USE_CURL'] and c['SAVE_FAVICON']},
  251. 'SAVE_ARCHIVE_DOT_ORG': {'default': lambda c: c['USE_CURL'] and c['SAVE_ARCHIVE_DOT_ORG']},
  252. 'USE_WGET': {'default': lambda c: c['USE_WGET'] and (c['SAVE_WGET'] or c['SAVE_WARC'])},
  253. 'WGET_VERSION': {'default': lambda c: bin_version(c['WGET_BINARY']) if c['USE_WGET'] else None},
  254. 'WGET_AUTO_COMPRESSION': {'default': lambda c: wget_supports_compression(c) if c['USE_WGET'] else False},
  255. 'WGET_USER_AGENT': {'default': lambda c: c['WGET_USER_AGENT'].format(**c)},
  256. 'SAVE_WGET': {'default': lambda c: c['USE_WGET'] and c['SAVE_WGET']},
  257. 'SAVE_WARC': {'default': lambda c: c['USE_WGET'] and c['SAVE_WARC']},
  258. 'WGET_ARGS': {'default': lambda c: c['WGET_ARGS'] or []},
  259. 'USE_SINGLEFILE': {'default': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE']},
  260. 'SINGLEFILE_VERSION': {'default': lambda c: bin_version(c['SINGLEFILE_BINARY']) if c['USE_SINGLEFILE'] else None},
  261. 'USE_READABILITY': {'default': lambda c: c['USE_READABILITY'] and c['SAVE_READABILITY']},
  262. 'READABILITY_VERSION': {'default': lambda c: bin_version(c['READABILITY_BINARY']) if c['USE_READABILITY'] else None},
  263. 'USE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['SAVE_MERCURY']},
  264. 'MERCURY_VERSION': {'default': lambda c: '1.0.0' if (c['USE_MERCURY'] and c['MERCURY_BINARY']) else None}, # mercury is unversioned
  265. 'USE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
  266. 'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None},
  267. 'SAVE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
  268. 'USE_YOUTUBEDL': {'default': lambda c: c['USE_YOUTUBEDL'] and c['SAVE_MEDIA']},
  269. 'YOUTUBEDL_VERSION': {'default': lambda c: bin_version(c['YOUTUBEDL_BINARY']) if c['USE_YOUTUBEDL'] else None},
  270. 'SAVE_MEDIA': {'default': lambda c: c['USE_YOUTUBEDL'] and c['SAVE_MEDIA']},
  271. 'YOUTUBEDL_ARGS': {'default': lambda c: c['YOUTUBEDL_ARGS'] or []},
  272. 'USE_CHROME': {'default': lambda c: c['USE_CHROME'] and (c['SAVE_PDF'] or c['SAVE_SCREENSHOT'] or c['SAVE_DOM'] or c['SAVE_SINGLEFILE'])},
  273. 'CHROME_BINARY': {'default': lambda c: c['CHROME_BINARY'] if c['CHROME_BINARY'] else find_chrome_binary()},
  274. 'CHROME_VERSION': {'default': lambda c: bin_version(c['CHROME_BINARY']) if c['USE_CHROME'] else None},
  275. 'USE_NODE': {'default': lambda c: c['USE_NODE'] and (c['SAVE_READABILITY'] or c['SAVE_SINGLEFILE'])},
  276. 'NODE_VERSION': {'default': lambda c: bin_version(c['NODE_BINARY']) if c['USE_NODE'] else None},
  277. 'SAVE_PDF': {'default': lambda c: c['USE_CHROME'] and c['SAVE_PDF']},
  278. 'SAVE_SCREENSHOT': {'default': lambda c: c['USE_CHROME'] and c['SAVE_SCREENSHOT']},
  279. 'SAVE_DOM': {'default': lambda c: c['USE_CHROME'] and c['SAVE_DOM']},
  280. 'SAVE_SINGLEFILE': {'default': lambda c: c['USE_CHROME'] and c['SAVE_SINGLEFILE'] and c['USE_NODE']},
  281. 'SAVE_READABILITY': {'default': lambda c: c['USE_READABILITY'] and c['USE_NODE']},
  282. 'SAVE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['USE_NODE']},
  283. 'DEPENDENCIES': {'default': lambda c: get_dependency_info(c)},
  284. 'CODE_LOCATIONS': {'default': lambda c: get_code_locations(c)},
  285. 'EXTERNAL_LOCATIONS': {'default': lambda c: get_external_locations(c)},
  286. 'DATA_LOCATIONS': {'default': lambda c: get_data_locations(c)},
  287. 'CHROME_OPTIONS': {'default': lambda c: get_chrome_info(c)},
  288. }
  289. ################################### Helpers ####################################
  290. def load_config_val(key: str,
  291. default: ConfigDefaultValue=None,
  292. type: Optional[Type]=None,
  293. aliases: Optional[Tuple[str, ...]]=None,
  294. config: Optional[ConfigDict]=None,
  295. env_vars: Optional[os._Environ]=None,
  296. config_file_vars: Optional[Dict[str, str]]=None) -> ConfigValue:
  297. """parse bool, int, and str key=value pairs from env"""
  298. config_keys_to_check = (key, *(aliases or ()))
  299. for key in config_keys_to_check:
  300. if env_vars:
  301. val = env_vars.get(key)
  302. if val:
  303. break
  304. if config_file_vars:
  305. val = config_file_vars.get(key)
  306. if val:
  307. break
  308. if type is None or val is None:
  309. if callable(default):
  310. assert isinstance(config, dict)
  311. return default(config)
  312. return default
  313. elif type is bool:
  314. if val.lower() in ('true', 'yes', '1'):
  315. return True
  316. elif val.lower() in ('false', 'no', '0'):
  317. return False
  318. else:
  319. raise ValueError(f'Invalid configuration option {key}={val} (expected a boolean: True/False)')
  320. elif type is str:
  321. if val.lower() in ('true', 'false', 'yes', 'no', '1', '0'):
  322. raise ValueError(f'Invalid configuration option {key}={val} (expected a string)')
  323. return val.strip()
  324. elif type is int:
  325. if not val.isdigit():
  326. raise ValueError(f'Invalid configuration option {key}={val} (expected an integer)')
  327. return int(val)
  328. elif type is list:
  329. return json.loads(val)
  330. raise Exception('Config values can only be str, bool, int or json')
  331. def load_config_file(out_dir: str=None) -> Optional[Dict[str, str]]:
  332. """load the ini-formatted config file from OUTPUT_DIR/Archivebox.conf"""
  333. out_dir = out_dir or Path(os.getenv('OUTPUT_DIR', '.')).resolve()
  334. config_path = Path(out_dir) / CONFIG_FILENAME
  335. if config_path.exists():
  336. config_file = ConfigParser()
  337. config_file.optionxform = str
  338. config_file.read(config_path)
  339. # flatten into one namespace
  340. config_file_vars = {
  341. key.upper(): val
  342. for section, options in config_file.items()
  343. for key, val in options.items()
  344. }
  345. # print('[i] Loaded config file', os.path.abspath(config_path))
  346. # print(config_file_vars)
  347. return config_file_vars
  348. return None
  349. def write_config_file(config: Dict[str, str], out_dir: str=None) -> ConfigDict:
  350. """load the ini-formatted config file from OUTPUT_DIR/Archivebox.conf"""
  351. from .system import atomic_write
  352. out_dir = out_dir or Path(os.getenv('OUTPUT_DIR', '.')).resolve()
  353. config_path = Path(out_dir) / CONFIG_FILENAME
  354. if not config_path.exists():
  355. atomic_write(config_path, CONFIG_HEADER)
  356. config_file = ConfigParser()
  357. config_file.optionxform = str
  358. config_file.read(config_path)
  359. with open(config_path, 'r') as old:
  360. atomic_write(f'{config_path}.bak', old.read())
  361. find_section = lambda key: [name for name, opts in CONFIG_DEFAULTS.items() if key in opts][0]
  362. # Set up sections in empty config file
  363. for key, val in config.items():
  364. section = find_section(key)
  365. if section in config_file:
  366. existing_config = dict(config_file[section])
  367. else:
  368. existing_config = {}
  369. config_file[section] = {**existing_config, key: val}
  370. # always make sure there's a SECRET_KEY defined for Django
  371. existing_secret_key = None
  372. if 'SERVER_CONFIG' in config_file and 'SECRET_KEY' in config_file['SERVER_CONFIG']:
  373. existing_secret_key = config_file['SERVER_CONFIG']['SECRET_KEY']
  374. if (not existing_secret_key) or ('not a valid secret' in existing_secret_key):
  375. from django.utils.crypto import get_random_string
  376. chars = 'abcdefghijklmnopqrstuvwxyz0123456789-_+!.'
  377. random_secret_key = get_random_string(50, chars)
  378. if 'SERVER_CONFIG' in config_file:
  379. config_file['SERVER_CONFIG']['SECRET_KEY'] = random_secret_key
  380. else:
  381. config_file['SERVER_CONFIG'] = {'SECRET_KEY': random_secret_key}
  382. with open(config_path, 'w+') as new:
  383. config_file.write(new)
  384. try:
  385. # validate the config by attempting to re-parse it
  386. CONFIG = load_all_config()
  387. return {
  388. key.upper(): CONFIG.get(key.upper())
  389. for key in config.keys()
  390. }
  391. except:
  392. # something went horribly wrong, rever to the previous version
  393. with open(f'{config_path}.bak', 'r') as old:
  394. atomic_write(config_path, old.read())
  395. if Path(f'{config_path}.bak').exists():
  396. os.remove(f'{config_path}.bak')
  397. return {}
  398. def load_config(defaults: ConfigDefaultDict,
  399. config: Optional[ConfigDict]=None,
  400. out_dir: Optional[str]=None,
  401. env_vars: Optional[os._Environ]=None,
  402. config_file_vars: Optional[Dict[str, str]]=None) -> ConfigDict:
  403. env_vars = env_vars or os.environ
  404. config_file_vars = config_file_vars or load_config_file(out_dir=out_dir)
  405. extended_config: ConfigDict = config.copy() if config else {}
  406. for key, default in defaults.items():
  407. try:
  408. extended_config[key] = load_config_val(
  409. key,
  410. default=default['default'],
  411. type=default.get('type'),
  412. aliases=default.get('aliases'),
  413. config=extended_config,
  414. env_vars=env_vars,
  415. config_file_vars=config_file_vars,
  416. )
  417. except KeyboardInterrupt:
  418. raise SystemExit(0)
  419. except Exception as e:
  420. stderr()
  421. stderr(f'[X] Error while loading configuration value: {key}', color='red', config=extended_config)
  422. stderr(' {}: {}'.format(e.__class__.__name__, e))
  423. stderr()
  424. stderr(' Check your config for mistakes and try again (your archive data is unaffected).')
  425. stderr()
  426. stderr(' For config documentation and examples see:')
  427. stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration')
  428. stderr()
  429. raise
  430. raise SystemExit(2)
  431. return extended_config
  432. # def write_config(config: ConfigDict):
  433. # with open(os.path.join(config['OUTPUT_DIR'], CONFIG_FILENAME), 'w+') as f:
  434. def stdout(*args, color: Optional[str]=None, prefix: str='', config: Optional[ConfigDict]=None) -> None:
  435. ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI
  436. if color:
  437. strs = [ansi[color], ' '.join(str(a) for a in args), ansi['reset'], '\n']
  438. else:
  439. strs = [' '.join(str(a) for a in args), '\n']
  440. sys.stdout.write(prefix + ''.join(strs))
  441. def stderr(*args, color: Optional[str]=None, prefix: str='', config: Optional[ConfigDict]=None) -> None:
  442. ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI
  443. if color:
  444. strs = [ansi[color], ' '.join(str(a) for a in args), ansi['reset'], '\n']
  445. else:
  446. strs = [' '.join(str(a) for a in args), '\n']
  447. sys.stderr.write(prefix + ''.join(strs))
  448. def hint(text: Union[Tuple[str, ...], List[str], str], prefix=' ', config: Optional[ConfigDict]=None) -> None:
  449. ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI
  450. if isinstance(text, str):
  451. stderr('{}{lightred}Hint:{reset} {}'.format(prefix, text, **ansi))
  452. else:
  453. stderr('{}{lightred}Hint:{reset} {}'.format(prefix, text[0], **ansi))
  454. for line in text[1:]:
  455. stderr('{} {}'.format(prefix, line))
  456. def bin_version(binary: Optional[str]) -> Optional[str]:
  457. """check the presence and return valid version line of a specified binary"""
  458. abspath = bin_path(binary)
  459. if not binary or not abspath:
  460. return None
  461. try:
  462. version_str = run([abspath, "--version"], stdout=PIPE).stdout.strip().decode()
  463. # take first 3 columns of first line of version info
  464. return ' '.join(version_str.split('\n')[0].strip().split()[:3])
  465. except OSError:
  466. pass
  467. # stderr(f'[X] Unable to find working version of dependency: {binary}', color='red')
  468. # stderr(' Make sure it\'s installed, then confirm it\'s working by running:')
  469. # stderr(f' {binary} --version')
  470. # stderr()
  471. # stderr(' If you don\'t want to install it, you can disable it via config. See here for more info:')
  472. # stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Install')
  473. return None
  474. def bin_path(binary: Optional[str]) -> Optional[str]:
  475. if binary is None:
  476. return None
  477. node_modules_bin = Path('.') / 'node_modules' / '.bin' / binary
  478. if node_modules_bin.exists():
  479. return str(node_modules_bin.resolve())
  480. return shutil.which(Path(binary).expanduser()) or binary
  481. def bin_hash(binary: Optional[str]) -> Optional[str]:
  482. if binary is None:
  483. return None
  484. abs_path = bin_path(binary)
  485. if abs_path is None or not Path(abs_path).exists():
  486. return None
  487. file_hash = md5()
  488. with io.open(abs_path, mode='rb') as f:
  489. for chunk in iter(lambda: f.read(io.DEFAULT_BUFFER_SIZE), b''):
  490. file_hash.update(chunk)
  491. return f'md5:{file_hash.hexdigest()}'
  492. def find_chrome_binary() -> Optional[str]:
  493. """find any installed chrome binaries in the default locations"""
  494. # Precedence: Chromium, Chrome, Beta, Canary, Unstable, Dev
  495. # make sure data dir finding precedence order always matches binary finding order
  496. default_executable_paths = (
  497. 'chromium-browser',
  498. 'chromium',
  499. '/Applications/Chromium.app/Contents/MacOS/Chromium',
  500. 'chrome',
  501. 'google-chrome',
  502. '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
  503. 'google-chrome-stable',
  504. 'google-chrome-beta',
  505. 'google-chrome-canary',
  506. '/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary',
  507. 'google-chrome-unstable',
  508. 'google-chrome-dev',
  509. )
  510. for name in default_executable_paths:
  511. full_path_exists = shutil.which(name)
  512. if full_path_exists:
  513. return name
  514. return None
  515. def find_chrome_data_dir() -> Optional[str]:
  516. """find any installed chrome user data directories in the default locations"""
  517. # Precedence: Chromium, Chrome, Beta, Canary, Unstable, Dev
  518. # make sure data dir finding precedence order always matches binary finding order
  519. default_profile_paths = (
  520. '~/.config/chromium',
  521. '~/Library/Application Support/Chromium',
  522. '~/AppData/Local/Chromium/User Data',
  523. '~/.config/chrome',
  524. '~/.config/google-chrome',
  525. '~/Library/Application Support/Google/Chrome',
  526. '~/AppData/Local/Google/Chrome/User Data',
  527. '~/.config/google-chrome-stable',
  528. '~/.config/google-chrome-beta',
  529. '~/Library/Application Support/Google/Chrome Canary',
  530. '~/AppData/Local/Google/Chrome SxS/User Data',
  531. '~/.config/google-chrome-unstable',
  532. '~/.config/google-chrome-dev',
  533. )
  534. for path in default_profile_paths:
  535. full_path = Path(path).resolve()
  536. if full_path.exists():
  537. return full_path
  538. return None
  539. def wget_supports_compression(config):
  540. try:
  541. cmd = [
  542. config['WGET_BINARY'],
  543. "--compression=auto",
  544. "--help",
  545. ]
  546. return not run(cmd, stdout=DEVNULL, stderr=DEVNULL).returncode
  547. except (FileNotFoundError, OSError):
  548. return False
  549. def get_code_locations(config: ConfigDict) -> SimpleConfigValueDict:
  550. return {
  551. 'PACKAGE_DIR': {
  552. 'path': (config['PACKAGE_DIR']).resolve(),
  553. 'enabled': True,
  554. 'is_valid': (config['PACKAGE_DIR'] / '__main__.py').exists(),
  555. },
  556. 'TEMPLATES_DIR': {
  557. 'path': (config['TEMPLATES_DIR']).resolve(),
  558. 'enabled': True,
  559. 'is_valid': (config['TEMPLATES_DIR'] / 'static').exists(),
  560. },
  561. # 'NODE_MODULES_DIR': {
  562. # 'path': ,
  563. # 'enabled': ,
  564. # 'is_valid': (...).exists(),
  565. # },
  566. }
  567. def get_external_locations(config: ConfigDict) -> ConfigValue:
  568. abspath = lambda path: None if path is None else Path(path).resolve()
  569. return {
  570. 'CHROME_USER_DATA_DIR': {
  571. 'path': abspath(config['CHROME_USER_DATA_DIR']),
  572. 'enabled': config['USE_CHROME'] and config['CHROME_USER_DATA_DIR'],
  573. 'is_valid': False if config['CHROME_USER_DATA_DIR'] is None else (Path(config['CHROME_USER_DATA_DIR']) / 'Default').exists(),
  574. },
  575. 'COOKIES_FILE': {
  576. 'path': abspath(config['COOKIES_FILE']),
  577. 'enabled': config['USE_WGET'] and config['COOKIES_FILE'],
  578. 'is_valid': False if config['COOKIES_FILE'] is None else Path(config['COOKIES_FILE']).exists(),
  579. },
  580. }
  581. def get_data_locations(config: ConfigDict) -> ConfigValue:
  582. return {
  583. 'OUTPUT_DIR': {
  584. 'path': config['OUTPUT_DIR'].resolve(),
  585. 'enabled': True,
  586. 'is_valid': (config['OUTPUT_DIR'] / SQL_INDEX_FILENAME).exists(),
  587. },
  588. 'SOURCES_DIR': {
  589. 'path': config['SOURCES_DIR'].resolve(),
  590. 'enabled': True,
  591. 'is_valid': config['SOURCES_DIR'].exists(),
  592. },
  593. 'LOGS_DIR': {
  594. 'path': config['LOGS_DIR'].resolve(),
  595. 'enabled': True,
  596. 'is_valid': config['LOGS_DIR'].exists(),
  597. },
  598. 'ARCHIVE_DIR': {
  599. 'path': config['ARCHIVE_DIR'].resolve(),
  600. 'enabled': True,
  601. 'is_valid': config['ARCHIVE_DIR'].exists(),
  602. },
  603. 'CONFIG_FILE': {
  604. 'path': config['CONFIG_FILE'].resolve(),
  605. 'enabled': True,
  606. 'is_valid': config['CONFIG_FILE'].exists(),
  607. },
  608. 'SQL_INDEX': {
  609. 'path': (config['OUTPUT_DIR'] / SQL_INDEX_FILENAME).resolve(),
  610. 'enabled': True,
  611. 'is_valid': (config['OUTPUT_DIR'] / SQL_INDEX_FILENAME).exists(),
  612. },
  613. }
  614. def get_dependency_info(config: ConfigDict) -> ConfigValue:
  615. return {
  616. 'ARCHIVEBOX_BINARY': {
  617. 'path': bin_path(config['ARCHIVEBOX_BINARY']),
  618. 'version': config['VERSION'],
  619. 'hash': bin_hash(config['ARCHIVEBOX_BINARY']),
  620. 'enabled': True,
  621. 'is_valid': True,
  622. },
  623. 'PYTHON_BINARY': {
  624. 'path': bin_path(config['PYTHON_BINARY']),
  625. 'version': config['PYTHON_VERSION'],
  626. 'hash': bin_hash(config['PYTHON_BINARY']),
  627. 'enabled': True,
  628. 'is_valid': bool(config['DJANGO_VERSION']),
  629. },
  630. 'DJANGO_BINARY': {
  631. 'path': bin_path(config['DJANGO_BINARY']),
  632. 'version': config['DJANGO_VERSION'],
  633. 'hash': bin_hash(config['DJANGO_BINARY']),
  634. 'enabled': True,
  635. 'is_valid': bool(config['DJANGO_VERSION']),
  636. },
  637. 'CURL_BINARY': {
  638. 'path': bin_path(config['CURL_BINARY']),
  639. 'version': config['CURL_VERSION'],
  640. 'hash': bin_hash(config['PYTHON_BINARY']),
  641. 'enabled': config['USE_CURL'],
  642. 'is_valid': bool(config['CURL_VERSION']),
  643. },
  644. 'WGET_BINARY': {
  645. 'path': bin_path(config['WGET_BINARY']),
  646. 'version': config['WGET_VERSION'],
  647. 'hash': bin_hash(config['WGET_BINARY']),
  648. 'enabled': config['USE_WGET'],
  649. 'is_valid': bool(config['WGET_VERSION']),
  650. },
  651. 'NODE_BINARY': {
  652. 'path': bin_path(config['NODE_BINARY']),
  653. 'version': config['NODE_VERSION'],
  654. 'hash': bin_hash(config['NODE_BINARY']),
  655. 'enabled': config['USE_NODE'],
  656. 'is_valid': bool(config['SINGLEFILE_VERSION']),
  657. },
  658. 'SINGLEFILE_BINARY': {
  659. 'path': bin_path(config['SINGLEFILE_BINARY']),
  660. 'version': config['SINGLEFILE_VERSION'],
  661. 'hash': bin_hash(config['SINGLEFILE_BINARY']),
  662. 'enabled': config['USE_SINGLEFILE'],
  663. 'is_valid': bool(config['SINGLEFILE_VERSION']),
  664. },
  665. 'READABILITY_BINARY': {
  666. 'path': bin_path(config['READABILITY_BINARY']),
  667. 'version': config['READABILITY_VERSION'],
  668. 'hash': bin_hash(config['READABILITY_BINARY']),
  669. 'enabled': config['USE_READABILITY'],
  670. 'is_valid': bool(config['READABILITY_VERSION']),
  671. },
  672. 'MERCURY_BINARY': {
  673. 'path': bin_path(config['MERCURY_BINARY']),
  674. 'version': config['MERCURY_VERSION'],
  675. 'hash': bin_hash(config['MERCURY_BINARY']),
  676. 'enabled': config['USE_MERCURY'],
  677. 'is_valid': bool(config['MERCURY_VERSION']),
  678. },
  679. 'GIT_BINARY': {
  680. 'path': bin_path(config['GIT_BINARY']),
  681. 'version': config['GIT_VERSION'],
  682. 'hash': bin_hash(config['GIT_BINARY']),
  683. 'enabled': config['USE_GIT'],
  684. 'is_valid': bool(config['GIT_VERSION']),
  685. },
  686. 'YOUTUBEDL_BINARY': {
  687. 'path': bin_path(config['YOUTUBEDL_BINARY']),
  688. 'version': config['YOUTUBEDL_VERSION'],
  689. 'hash': bin_hash(config['YOUTUBEDL_BINARY']),
  690. 'enabled': config['USE_YOUTUBEDL'],
  691. 'is_valid': bool(config['YOUTUBEDL_VERSION']),
  692. },
  693. 'CHROME_BINARY': {
  694. 'path': bin_path(config['CHROME_BINARY']),
  695. 'version': config['CHROME_VERSION'],
  696. 'hash': bin_hash(config['CHROME_BINARY']),
  697. 'enabled': config['USE_CHROME'],
  698. 'is_valid': bool(config['CHROME_VERSION']),
  699. },
  700. }
  701. def get_chrome_info(config: ConfigDict) -> ConfigValue:
  702. return {
  703. 'TIMEOUT': config['TIMEOUT'],
  704. 'RESOLUTION': config['RESOLUTION'],
  705. 'CHECK_SSL_VALIDITY': config['CHECK_SSL_VALIDITY'],
  706. 'CHROME_BINARY': config['CHROME_BINARY'],
  707. 'CHROME_HEADLESS': config['CHROME_HEADLESS'],
  708. 'CHROME_SANDBOX': config['CHROME_SANDBOX'],
  709. 'CHROME_USER_AGENT': config['CHROME_USER_AGENT'],
  710. 'CHROME_USER_DATA_DIR': config['CHROME_USER_DATA_DIR'],
  711. }
  712. ################################## Load Config #################################
  713. def load_all_config():
  714. CONFIG: ConfigDict = {}
  715. for section_name, section_config in CONFIG_DEFAULTS.items():
  716. CONFIG = load_config(section_config, CONFIG)
  717. return load_config(DERIVED_CONFIG_DEFAULTS, CONFIG)
  718. CONFIG = load_all_config()
  719. globals().update(CONFIG)
  720. # Timezone set as UTC
  721. os.environ["TZ"] = 'UTC'
  722. # add ./node_modules/.bin to $PATH so we can use node scripts in extractors
  723. NODE_BIN_PATH = str((Path(CONFIG["OUTPUT_DIR"]).absolute() / 'node_modules' / '.bin'))
  724. sys.path.append(NODE_BIN_PATH)
  725. ############################## Importable Checkers #############################
  726. def check_system_config(config: ConfigDict=CONFIG) -> None:
  727. ### Check system environment
  728. if config['USER'] == 'root':
  729. stderr('[!] ArchiveBox should never be run as root!', color='red')
  730. stderr(' For more information, see the security overview documentation:')
  731. stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#do-not-run-as-root')
  732. raise SystemExit(2)
  733. ### Check Python environment
  734. if sys.version_info[:3] < (3, 6, 0):
  735. stderr(f'[X] Python version is not new enough: {config["PYTHON_VERSION"]} (>3.6 is required)', color='red')
  736. stderr(' See https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting#python for help upgrading your Python installation.')
  737. raise SystemExit(2)
  738. if config['PYTHON_ENCODING'] not in ('UTF-8', 'UTF8'):
  739. stderr(f'[X] Your system is running python3 scripts with a bad locale setting: {config["PYTHON_ENCODING"]} (it should be UTF-8).', color='red')
  740. stderr(' To fix it, add the line "export PYTHONIOENCODING=UTF-8" to your ~/.bashrc file (without quotes)')
  741. stderr(' Or if you\'re using ubuntu/debian, run "dpkg-reconfigure locales"')
  742. stderr('')
  743. stderr(' Confirm that it\'s fixed by opening a new shell and running:')
  744. stderr(' python3 -c "import sys; print(sys.stdout.encoding)" # should output UTF-8')
  745. raise SystemExit(2)
  746. # stderr('[i] Using Chrome binary: {}'.format(shutil.which(CHROME_BINARY) or CHROME_BINARY))
  747. # stderr('[i] Using Chrome data dir: {}'.format(os.path.abspath(CHROME_USER_DATA_DIR)))
  748. if config['CHROME_USER_DATA_DIR'] is not None:
  749. if not (Path(config['CHROME_USER_DATA_DIR']) / 'Default').exists():
  750. stderr('[X] Could not find profile "Default" in CHROME_USER_DATA_DIR.', color='red')
  751. stderr(f' {config["CHROME_USER_DATA_DIR"]}')
  752. stderr(' Make sure you set it to a Chrome user data directory containing a Default profile folder.')
  753. stderr(' For more info see:')
  754. stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#CHROME_USER_DATA_DIR')
  755. if '/Default' in str(config['CHROME_USER_DATA_DIR']):
  756. stderr()
  757. stderr(' Try removing /Default from the end e.g.:')
  758. stderr(' CHROME_USER_DATA_DIR="{}"'.format(config['CHROME_USER_DATA_DIR'].split('/Default')[0]))
  759. raise SystemExit(2)
  760. def check_dependencies(config: ConfigDict=CONFIG, show_help: bool=True) -> None:
  761. invalid_dependencies = [
  762. (name, info) for name, info in config['DEPENDENCIES'].items()
  763. if info['enabled'] and not info['is_valid']
  764. ]
  765. if invalid_dependencies and show_help:
  766. stderr(f'[!] Warning: Missing {len(invalid_dependencies)} recommended dependencies', color='lightyellow')
  767. for dependency, info in invalid_dependencies:
  768. stderr(
  769. ' ! {}: {} ({})'.format(
  770. dependency,
  771. info['path'] or 'unable to find binary',
  772. info['version'] or 'unable to detect version',
  773. )
  774. )
  775. if dependency in ('SINGLEFILE_BINARY', 'READABILITY_BINARY', 'MERCURY_BINARY'):
  776. hint(('npm install --prefix . "git+https://github.com/ArchiveBox/ArchiveBox.git"',
  777. f'or archivebox config --set SAVE_{dependency.rsplit("_", 1)[0]}=False to silence this warning',
  778. ''), prefix=' ')
  779. stderr('')
  780. if config['TIMEOUT'] < 5:
  781. stderr(f'[!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={config["TIMEOUT"]} seconds)', color='red')
  782. stderr(' You must allow *at least* 5 seconds for indexing and archive methods to run succesfully.')
  783. stderr(' (Setting it to somewhere between 30 and 3000 seconds is recommended)')
  784. stderr()
  785. stderr(' If you want to make ArchiveBox run faster, disable specific archive methods instead:')
  786. stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles')
  787. stderr()
  788. elif config['USE_CHROME'] and config['TIMEOUT'] < 15:
  789. stderr(f'[!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={config["TIMEOUT"]} seconds)', color='red')
  790. stderr(' Chrome will fail to archive all sites if set to less than ~15 seconds.')
  791. stderr(' (Setting it to somewhere between 30 and 300 seconds is recommended)')
  792. stderr()
  793. stderr(' If you want to make ArchiveBox run faster, disable specific archive methods instead:')
  794. stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles')
  795. stderr()
  796. if config['USE_YOUTUBEDL'] and config['MEDIA_TIMEOUT'] < 20:
  797. stderr(f'[!] Warning: MEDIA_TIMEOUT is set too low! (currently set to MEDIA_TIMEOUT={config["MEDIA_TIMEOUT"]} seconds)', color='red')
  798. stderr(' Youtube-dl will fail to archive all media if set to less than ~20 seconds.')
  799. stderr(' (Setting it somewhere over 60 seconds is recommended)')
  800. stderr()
  801. stderr(' If you want to disable media archiving entirely, set SAVE_MEDIA=False instead:')
  802. stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_media')
  803. stderr()
  804. def check_data_folder(out_dir: Optional[str]=None, config: ConfigDict=CONFIG) -> None:
  805. output_dir = out_dir or config['OUTPUT_DIR']
  806. assert isinstance(output_dir, (str, Path))
  807. sql_index_exists = (Path(output_dir) / SQL_INDEX_FILENAME).exists()
  808. if not sql_index_exists:
  809. stderr('[X] No archivebox index found in the current directory.', color='red')
  810. stderr(f' {output_dir}', color='lightyellow')
  811. stderr()
  812. stderr(' {lightred}Hint{reset}: Are you running archivebox in the right folder?'.format(**config['ANSI']))
  813. stderr(' cd path/to/your/archive/folder')
  814. stderr(' archivebox [command]')
  815. stderr()
  816. stderr(' {lightred}Hint{reset}: To create a new archive collection or import existing data in this folder, run:'.format(**config['ANSI']))
  817. stderr(' archivebox init')
  818. raise SystemExit(2)
  819. from .index.sql import list_migrations
  820. pending_migrations = [name for status, name in list_migrations() if not status]
  821. if (not sql_index_exists) or pending_migrations:
  822. if sql_index_exists:
  823. pending_operation = f'apply the {len(pending_migrations)} pending migrations'
  824. else:
  825. pending_operation = 'generate the new SQL main index'
  826. stderr('[X] This collection was created with an older version of ArchiveBox and must be upgraded first.', color='lightyellow')
  827. stderr(f' {output_dir}')
  828. stderr()
  829. stderr(f' To upgrade it to the latest version and {pending_operation} run:')
  830. stderr(' archivebox init')
  831. raise SystemExit(3)
  832. sources_dir = Path(output_dir) / SOURCES_DIR_NAME
  833. if not sources_dir.exists():
  834. sources_dir.mkdir()
  835. def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG) -> None:
  836. check_system_config()
  837. output_dir = out_dir or Path(config['OUTPUT_DIR'])
  838. assert isinstance(output_dir, Path) and isinstance(config['PACKAGE_DIR'], Path)
  839. try:
  840. import django
  841. sys.path.append(str(config['PACKAGE_DIR']))
  842. os.environ.setdefault('OUTPUT_DIR', str(output_dir))
  843. assert (config['PACKAGE_DIR'] / 'core' / 'settings.py').exists(), 'settings.py was not found at archivebox/core/settings.py'
  844. os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings')
  845. django.setup()
  846. if check_db:
  847. sql_index_path = Path(output_dir) / SQL_INDEX_FILENAME
  848. assert sql_index_path.exists(), (
  849. f'No database file {SQL_INDEX_FILENAME} found in OUTPUT_DIR: {config["OUTPUT_DIR"]}')
  850. except KeyboardInterrupt:
  851. raise SystemExit(2)
  852. os.umask(0o777 - int(OUTPUT_PERMISSIONS, base=8)) # noqa: F821