__package__ = 'archivebox'

import os
import sys
import shutil

from typing import Dict, List, Optional, Iterable, IO, Union

from crontab import CronTab, CronSlices

from .cli import (
    list_subcommands,
    run_subcommand,
    display_first,
    meta_cmds,
    main_cmds,
    archive_cmds,
)
from .parsers import (
    save_text_as_source,
    save_file_as_source,
    parse_links_memory,
)
from .index.schema import Link
from .util import enforce_types  # type: ignore
from .system import get_dir_size, dedupe_cron_jobs, CRON_COMMENT
from .index import (
    load_main_index,
    parse_links_from_source,
    dedupe_links,
    write_main_index,
    link_matches_filter,
    get_indexed_folders,
    get_archived_folders,
    get_unarchived_folders,
    get_present_folders,
    get_valid_folders,
    get_invalid_folders,
    get_duplicate_folders,
    get_orphaned_folders,
    get_corrupted_folders,
    get_unrecognized_folders,
    fix_invalid_folder_locations,
)
from .index.json import (
    parse_json_main_index,
    parse_json_links_details,
)
from .index.sql import (
    parse_sql_main_index,
    get_admins,
    apply_migrations,
    remove_from_sql_main_index,
)
from .index.html import parse_html_main_index
from .extractors import archive_links
from .config import (
    stderr,
    ConfigDict,
    ANSI,
    # IS_TTY,
    USER,
    ARCHIVEBOX_BINARY,
    ONLY_NEW,
    OUTPUT_DIR,
    SOURCES_DIR,
    ARCHIVE_DIR,
    LOGS_DIR,
    CONFIG_FILE,
    ARCHIVE_DIR_NAME,
    SOURCES_DIR_NAME,
    LOGS_DIR_NAME,
    STATIC_DIR_NAME,
    JSON_INDEX_FILENAME,
    HTML_INDEX_FILENAME,
    SQL_INDEX_FILENAME,
    ROBOTS_TXT_FILENAME,
    FAVICON_FILENAME,
    check_dependencies,
    check_data_folder,
    write_config_file,
    setup_django,
    VERSION,
    CODE_LOCATIONS,
    EXTERNAL_LOCATIONS,
    DATA_LOCATIONS,
    DEPENDENCIES,
    load_all_config,
    CONFIG,
    USER_CONFIG,
    get_real_name,
)
from .logging_util import (
    TERM_WIDTH,
    TimedProgress,
    log_importing_started,
    log_crawl_started,
    log_removal_started,
    log_removal_finished,
    log_list_started,
    log_list_finished,
    printable_config,
    printable_folders,
    printable_filesize,
    printable_folder_status,
    printable_dependency_version,
)


ALLOWED_IN_OUTPUT_DIR = {
    '.DS_Store',
    '.venv',
    'venv',
    'virtualenv',
    '.virtualenv',
    ARCHIVE_DIR_NAME,
    SOURCES_DIR_NAME,
    LOGS_DIR_NAME,
    STATIC_DIR_NAME,
    SQL_INDEX_FILENAME,
    JSON_INDEX_FILENAME,
    HTML_INDEX_FILENAME,
    ROBOTS_TXT_FILENAME,
    FAVICON_FILENAME,
}
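
# Note: init() treats a directory containing only the names above as effectively
# empty when deciding whether it is safe to create a new collection in it.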


@enforce_types
def help(out_dir: str=OUTPUT_DIR) -> None:
    """Print the ArchiveBox help message and usage"""

    all_subcommands = list_subcommands()
    COMMANDS_HELP_TEXT = '\n '.join(
        f'{cmd.ljust(20)} {summary}'
        for cmd, summary in all_subcommands.items()
        if cmd in meta_cmds
    ) + '\n\n ' + '\n '.join(
        f'{cmd.ljust(20)} {summary}'
        for cmd, summary in all_subcommands.items()
        if cmd in main_cmds
    ) + '\n\n ' + '\n '.join(
        f'{cmd.ljust(20)} {summary}'
        for cmd, summary in all_subcommands.items()
        if cmd in archive_cmds
    ) + '\n\n ' + '\n '.join(
        f'{cmd.ljust(20)} {summary}'
        for cmd, summary in all_subcommands.items()
        if cmd not in display_first
    )

    if os.path.exists(os.path.join(out_dir, JSON_INDEX_FILENAME)):
        print('''{green}ArchiveBox v{}: The self-hosted internet archive.{reset}

{lightred}Active data directory:{reset}
    {}

{lightred}Usage:{reset}
    archivebox [command] [--help] [--version] [...args]

{lightred}Commands:{reset}
    {}

{lightred}Example Use:{reset}
    mkdir my-archive; cd my-archive/
    archivebox init
    archivebox status
    archivebox add https://example.com/some/page
    archivebox add --depth=1 ~/Downloads/bookmarks_export.html
    archivebox list --sort=timestamp --csv=timestamp,url,is_archived
    archivebox schedule --every=week https://example.com/some/feed.rss
    archivebox update --resume=15109948213.123

{lightred}Documentation:{reset}
    https://github.com/pirate/ArchiveBox/wiki
'''.format(VERSION, out_dir, COMMANDS_HELP_TEXT, **ANSI))
    else:
        print('{green}Welcome to ArchiveBox v{}!{reset}'.format(VERSION, **ANSI))
        print()
        print('To import an existing archive (from a previous version of ArchiveBox):')
        print(' 1. cd into your data dir OUTPUT_DIR (usually ArchiveBox/output) and run:')
        print(' 2. archivebox init')
        print()
        print('To start a new archive:')
        print(' 1. Create an empty directory, then cd into it and run:')
        print(' 2. archivebox init')
        print()
        print('For more information, see the documentation here:')
        print(' https://github.com/pirate/ArchiveBox/wiki')


@enforce_types
def version(quiet: bool=False,
            out_dir: str=OUTPUT_DIR) -> None:
    """Print the ArchiveBox version and dependency information"""

    if quiet:
        print(VERSION)
    else:
        print('ArchiveBox v{}'.format(VERSION))
        print()

        print('{white}[i] Dependency versions:{reset}'.format(**ANSI))
        for name, dependency in DEPENDENCIES.items():
            print(printable_dependency_version(name, dependency))

        print()
        print('{white}[i] Code locations:{reset}'.format(**ANSI))
        for name, folder in CODE_LOCATIONS.items():
            print(printable_folder_status(name, folder))

        print()
        print('{white}[i] External locations:{reset}'.format(**ANSI))
        for name, folder in EXTERNAL_LOCATIONS.items():
            print(printable_folder_status(name, folder))

        print()
        print('{white}[i] Data locations:{reset}'.format(**ANSI))
        for name, folder in DATA_LOCATIONS.items():
            print(printable_folder_status(name, folder))

        print()
        check_dependencies()


@enforce_types
def run(subcommand: str,
        subcommand_args: Optional[List[str]],
        stdin: Optional[IO]=None,
        out_dir: str=OUTPUT_DIR) -> None:
    """Run a given ArchiveBox subcommand with the given list of args"""
    run_subcommand(
        subcommand=subcommand,
        subcommand_args=subcommand_args,
        stdin=stdin,
        pwd=out_dir,
    )


@enforce_types
def init(force: bool=False, out_dir: str=OUTPUT_DIR) -> None:
    """Initialize a new ArchiveBox collection in the current directory"""

    os.makedirs(out_dir, exist_ok=True)
    is_empty = not len(set(os.listdir(out_dir)) - ALLOWED_IN_OUTPUT_DIR)
    existing_index = os.path.exists(os.path.join(out_dir, JSON_INDEX_FILENAME))

    if is_empty and not existing_index:
        print('{green}[+] Initializing a new ArchiveBox collection in this folder...{reset}'.format(**ANSI))
        print(f' {out_dir}')
        print('{green}------------------------------------------------------------------{reset}'.format(**ANSI))
    elif existing_index:
        print('{green}[*] Updating existing ArchiveBox collection in this folder...{reset}'.format(**ANSI))
        print(f' {out_dir}')
        print('{green}------------------------------------------------------------------{reset}'.format(**ANSI))
    else:
        if force:
            stderr('[!] This folder appears to already have files in it, but no index.json is present.', color='lightyellow')
            stderr(' Because --force was passed, ArchiveBox will initialize anyway (which may overwrite existing files).')
        else:
            stderr(
                ("{red}[X] This folder appears to already have files in it, but no index.json is present.{reset}\n\n"
                 " You must run init in a completely empty directory, or an existing data folder.\n\n"
                 " {lightred}Hint:{reset} To import an existing data folder make sure to cd into the folder first, \n"
                 " then run 'archivebox init' to pick up where you left off.\n\n"
                 " (Always make sure your data folder is backed up first before updating ArchiveBox)"
                ).format(out_dir, **ANSI)
            )
            raise SystemExit(2)

    if existing_index:
        print('\n{green}[*] Verifying archive folder structure...{reset}'.format(**ANSI))
    else:
        print('\n{green}[+] Building archive folder structure...{reset}'.format(**ANSI))

    os.makedirs(SOURCES_DIR, exist_ok=True)
    print(f' √ {SOURCES_DIR}')
    os.makedirs(ARCHIVE_DIR, exist_ok=True)
    print(f' √ {ARCHIVE_DIR}')
    os.makedirs(LOGS_DIR, exist_ok=True)
    print(f' √ {LOGS_DIR}')
    write_config_file({}, out_dir=out_dir)
    print(f' √ {CONFIG_FILE}')

    if os.path.exists(os.path.join(out_dir, SQL_INDEX_FILENAME)):
        print('\n{green}[*] Verifying main SQL index and running migrations...{reset}'.format(**ANSI))
    else:
        print('\n{green}[+] Building main SQL index and running migrations...{reset}'.format(**ANSI))

    setup_django(out_dir, check_db=False)
    DATABASE_FILE = os.path.join(out_dir, SQL_INDEX_FILENAME)
    print(f' √ {DATABASE_FILE}')
    print()
    for migration_line in apply_migrations(out_dir):
        print(f' {migration_line}')

    assert os.path.exists(DATABASE_FILE)

    # from django.contrib.auth.models import User
    # if IS_TTY and not User.objects.filter(is_superuser=True).exists():
    #     print('{green}[+] Creating admin user account...{reset}'.format(**ANSI))
    #     call_command("createsuperuser", interactive=True)

    print()
    print('{green}[*] Collecting links from any existing indexes and archive folders...{reset}'.format(**ANSI))

    all_links: Dict[str, Link] = {}
    if existing_index:
        all_links = {
            link.url: link
            for link in load_main_index(out_dir=out_dir, warn=False)
        }
        print(' √ Loaded {} links from existing main index.'.format(len(all_links)))

    # Links in data folders that don't match their timestamp
    fixed, cant_fix = fix_invalid_folder_locations(out_dir=out_dir)
    if fixed:
        print(' {lightyellow}√ Fixed {} data directory locations that didn\'t match their link timestamps.{reset}'.format(len(fixed), **ANSI))
    if cant_fix:
        print(' {lightyellow}! Could not fix {} data directory locations due to conflicts with existing folders.{reset}'.format(len(cant_fix), **ANSI))

    # Links in JSON index but not in main index
    orphaned_json_links = {
        link.url: link
        for link in parse_json_main_index(out_dir)
        if link.url not in all_links
    }
    if orphaned_json_links:
        all_links.update(orphaned_json_links)
        print(' {lightyellow}√ Added {} orphaned links from existing JSON index...{reset}'.format(len(orphaned_json_links), **ANSI))

    # Links in SQL index but not in main index
    orphaned_sql_links = {
        link.url: link
        for link in parse_sql_main_index(out_dir)
        if link.url not in all_links
    }
    if orphaned_sql_links:
        all_links.update(orphaned_sql_links)
        print(' {lightyellow}√ Added {} orphaned links from existing SQL index...{reset}'.format(len(orphaned_sql_links), **ANSI))

    # Links in data dir indexes but not in main index
    orphaned_data_dir_links = {
        link.url: link
        for link in parse_json_links_details(out_dir)
        if link.url not in all_links
    }
    if orphaned_data_dir_links:
        all_links.update(orphaned_data_dir_links)
        print(' {lightyellow}√ Added {} orphaned links from existing archive directories.{reset}'.format(len(orphaned_data_dir_links), **ANSI))

    # Links in invalid/duplicate data dirs
    invalid_folders = {
        folder: link
        for folder, link in get_invalid_folders(all_links.values(), out_dir=out_dir).items()
    }
    if invalid_folders:
        print(' {lightyellow}! Skipped adding {} invalid link data directories.{reset}'.format(len(invalid_folders), **ANSI))
        print(' X ' + '\n X '.join(f'{folder} {link}' for folder, link in invalid_folders.items()))
        print()
        print(' {lightred}Hint:{reset} For more information about the link data directories that were skipped, run:'.format(**ANSI))
        print(' archivebox status')
        print(' archivebox list --status=invalid')

    write_main_index(list(all_links.values()), out_dir=out_dir)

    print('\n{green}------------------------------------------------------------------{reset}'.format(**ANSI))
    if existing_index:
        print('{green}[√] Done. Verified and updated the existing ArchiveBox collection.{reset}'.format(**ANSI))
    else:
        print('{green}[√] Done. A new ArchiveBox collection was initialized ({} links).{reset}'.format(len(all_links), **ANSI))
    print()
    print(' {lightred}Hint:{reset} To view your archive index, run:'.format(**ANSI))
    print(' archivebox server  # then visit http://127.0.0.1:8000')
    print()
    print(' To add new links, you can run:')
    print(" archivebox add ~/some/path/or/url/to/list_of_links.txt")
    print()
    print(' For more usage and examples, run:')
    print(' archivebox help')


@enforce_types
def status(out_dir: str=OUTPUT_DIR) -> None:
    """Print out some info and statistics about the archive collection"""

    check_data_folder(out_dir=out_dir)

    from core.models import Snapshot
    from django.contrib.auth import get_user_model
    User = get_user_model()

    print('{green}[*] Scanning archive main index...{reset}'.format(**ANSI))
    print(ANSI['lightyellow'], f' {out_dir}/*', ANSI['reset'])
    num_bytes, num_dirs, num_files = get_dir_size(out_dir, recursive=False, pattern='index.')
    size = printable_filesize(num_bytes)
    print(f' Index size: {size} across {num_files} files')
    print()

    links = list(load_main_index(out_dir=out_dir))
    num_json_links = len(links)
    num_sql_links = sum(1 for link in parse_sql_main_index(out_dir=out_dir))
    num_html_links = sum(1 for url in parse_html_main_index(out_dir=out_dir))
    num_link_details = sum(1 for link in parse_json_links_details(out_dir=out_dir))
    print(f' > JSON Main Index: {num_json_links} links'.ljust(36), f'(found in {JSON_INDEX_FILENAME})')
    print(f' > SQL Main Index: {num_sql_links} links'.ljust(36), f'(found in {SQL_INDEX_FILENAME})')
    print(f' > HTML Main Index: {num_html_links} links'.ljust(36), f'(found in {HTML_INDEX_FILENAME})')
    print(f' > JSON Link Details: {num_link_details} links'.ljust(36), f'(found in {ARCHIVE_DIR_NAME}/*/index.json)')

    if num_html_links != len(links) or num_sql_links != len(links):
        print()
        print(' {lightred}Hint:{reset} You can fix index count differences automatically by running:'.format(**ANSI))
        print(' archivebox init')

    print()
    print('{green}[*] Scanning archive data directories...{reset}'.format(**ANSI))
    print(ANSI['lightyellow'], f' {ARCHIVE_DIR}/*', ANSI['reset'])
    num_bytes, num_dirs, num_files = get_dir_size(ARCHIVE_DIR)
    size = printable_filesize(num_bytes)
    print(f' Size: {size} across {num_files} files in {num_dirs} directories')
    print(ANSI['black'])

    num_indexed = len(get_indexed_folders(links, out_dir=out_dir))
    num_archived = len(get_archived_folders(links, out_dir=out_dir))
    num_unarchived = len(get_unarchived_folders(links, out_dir=out_dir))
    print(f' > indexed: {num_indexed}'.ljust(36), f'({get_indexed_folders.__doc__})')
    print(f' > archived: {num_archived}'.ljust(36), f'({get_archived_folders.__doc__})')
    print(f' > unarchived: {num_unarchived}'.ljust(36), f'({get_unarchived_folders.__doc__})')

    num_present = len(get_present_folders(links, out_dir=out_dir))
    num_valid = len(get_valid_folders(links, out_dir=out_dir))
    print()
    print(f' > present: {num_present}'.ljust(36), f'({get_present_folders.__doc__})')
    print(f' > valid: {num_valid}'.ljust(36), f'({get_valid_folders.__doc__})')

    duplicate = get_duplicate_folders(links, out_dir=out_dir)
    orphaned = get_orphaned_folders(links, out_dir=out_dir)
    corrupted = get_corrupted_folders(links, out_dir=out_dir)
    unrecognized = get_unrecognized_folders(links, out_dir=out_dir)
    num_invalid = len({**duplicate, **orphaned, **corrupted, **unrecognized})
    print(f' > invalid: {num_invalid}'.ljust(36), f'({get_invalid_folders.__doc__})')
    print(f' > duplicate: {len(duplicate)}'.ljust(36), f'({get_duplicate_folders.__doc__})')
    print(f' > orphaned: {len(orphaned)}'.ljust(36), f'({get_orphaned_folders.__doc__})')
    print(f' > corrupted: {len(corrupted)}'.ljust(36), f'({get_corrupted_folders.__doc__})')
    print(f' > unrecognized: {len(unrecognized)}'.ljust(36), f'({get_unrecognized_folders.__doc__})')
    print(ANSI['reset'])

    if num_indexed:
        print(' {lightred}Hint:{reset} You can list link data directories by status like so:'.format(**ANSI))
        print(' archivebox list --status=<status>  (e.g. indexed, corrupted, archived, etc.)')
    if orphaned:
        print(' {lightred}Hint:{reset} To automatically import orphaned data directories into the main index, run:'.format(**ANSI))
        print(' archivebox init')
    if num_invalid:
        print(' {lightred}Hint:{reset} You may need to manually remove or fix some invalid data directories, afterwards make sure to run:'.format(**ANSI))
        print(' archivebox init')
    print()

    print('{green}[*] Scanning recent archive changes and user logins:{reset}'.format(**ANSI))
    print(ANSI['lightyellow'], f' {LOGS_DIR}/*', ANSI['reset'])
    users = get_admins().values_list('username', flat=True)
    print(f' UI users {len(users)}: {", ".join(users)}')
    last_login = User.objects.order_by('last_login').last()
    if last_login:
        print(f' Last UI login: {last_login.username} @ {str(last_login.last_login)[:16]}')
    last_updated = Snapshot.objects.order_by('updated').last()
    if last_updated:
        print(f' Last changes: {str(last_updated.updated)[:16]}')

    if not users:
        print()
        print(' {lightred}Hint:{reset} You can create an admin user by running:'.format(**ANSI))
        print(' archivebox manage createsuperuser')

    print()
    for snapshot in Snapshot.objects.order_by('-updated')[:10]:
        if not snapshot.updated:
            continue
        print(
            ANSI['black'],
            (
                f' > {str(snapshot.updated)[:16]} '
                f'[{snapshot.num_outputs} {("X", "√")[snapshot.is_archived]} {printable_filesize(snapshot.archive_size)}] '
                f'"{snapshot.title}": {snapshot.url}'
            )[:TERM_WIDTH()],
            ANSI['reset'],
        )
    print(ANSI['black'], ' ...', ANSI['reset'])


@enforce_types
def oneshot(url: str, out_dir: str=OUTPUT_DIR):
    """Archive a single URL into out_dir without adding it to the main index (one-off run)"""
    oneshot_links, _ = parse_links_memory([url])
    oneshot_links, _ = dedupe_links([], oneshot_links)
    archive_links(oneshot_links, out_dir=out_dir, skip_index=True, oneshot=True)
    return oneshot_links


@enforce_types
def add(urls: Union[str, List[str]],
        depth: int=0,
        update_all: bool=not ONLY_NEW,
        index_only: bool=False,
        out_dir: str=OUTPUT_DIR) -> List[Link]:
    """Add a new URL or list of URLs to your archive"""

    assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)'

    # Load list of links from the existing index
    check_data_folder(out_dir=out_dir)
    check_dependencies()
    all_links: List[Link] = []
    new_links: List[Link] = []
    all_links = load_main_index(out_dir=out_dir)

    log_importing_started(urls=urls, depth=depth, index_only=index_only)
    if isinstance(urls, str):
        # save verbatim stdin to sources
        write_ahead_log = save_text_as_source(urls, filename='{ts}-import.txt', out_dir=out_dir)
    elif isinstance(urls, list):
        # save verbatim args to sources
        write_ahead_log = save_text_as_source('\n'.join(urls), filename='{ts}-import.txt', out_dir=out_dir)
    new_links += parse_links_from_source(write_ahead_log)

    # If we're going one level deeper, download each link and look for more links
    new_links_depth = []
    if new_links and depth == 1:
        log_crawl_started(new_links)
        for new_link in new_links:
            downloaded_file = save_file_as_source(new_link.url, filename='{ts}-crawl-{basename}.txt', out_dir=out_dir)
            new_links_depth += parse_links_from_source(downloaded_file)

    all_links, new_links = dedupe_links(all_links, new_links + new_links_depth)
    write_main_index(links=all_links, out_dir=out_dir, finished=not new_links)

    if index_only:
        return all_links

    # Run the archive methods for each link
    to_archive = all_links if update_all else new_links
    archive_links(to_archive, out_dir=out_dir)

    # Step 4: Re-write links index with updated titles, icons, and resources
    if to_archive:
        all_links = load_main_index(out_dir=out_dir)
        write_main_index(links=list(all_links), out_dir=out_dir, finished=True)

    return all_links
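
# Illustrative programmatic usage of add() (not part of the original file; assumes
# an already-initialized collection at out_dir):
#   from archivebox.main import add
#   add('https://example.com/some/page', depth=0, index_only=False, out_dir='/path/to/archive')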


@enforce_types
def remove(filter_str: Optional[str]=None,
           filter_patterns: Optional[List[str]]=None,
           filter_type: str='exact',
           links: Optional[List[Link]]=None,
           after: Optional[float]=None,
           before: Optional[float]=None,
           yes: bool=False,
           delete: bool=False,
           out_dir: str=OUTPUT_DIR) -> List[Link]:
    """Remove the specified URLs from the archive"""

    check_data_folder(out_dir=out_dir)

    if links is None:
        if filter_str and filter_patterns:
            stderr(
                '[X] You should pass either a pattern as an argument, '
                'or pass a list of patterns via stdin, but not both.\n',
                color='red',
            )
            raise SystemExit(2)
        elif not (filter_str or filter_patterns):
            stderr(
                '[X] You should pass either a pattern as an argument, '
                'or pass a list of patterns via stdin.',
                color='red',
            )
            stderr()
            stderr(' {lightred}Hint:{reset} To remove all urls you can run:'.format(**ANSI))
            stderr(" archivebox remove --filter-type=regex '.*'")
            stderr()
            raise SystemExit(2)
        elif filter_str:
            filter_patterns = [ptn.strip() for ptn in filter_str.split('\n')]

        log_list_started(filter_patterns, filter_type)
        timer = TimedProgress(360, prefix=' ')
        try:
            links = list(list_links(
                filter_patterns=filter_patterns,
                filter_type=filter_type,
                after=after,
                before=before,
            ))
        finally:
            timer.end()

        if not len(links):
            log_removal_finished(0, 0)
            raise SystemExit(1)

        log_list_finished(links)
        log_removal_started(links, yes=yes, delete=delete)

    timer = TimedProgress(360, prefix=' ')
    try:
        to_keep = []
        to_delete = []
        all_links = load_main_index(out_dir=out_dir)
        for link in all_links:
            should_remove = (
                (after is not None and float(link.timestamp) < after)
                or (before is not None and float(link.timestamp) > before)
                or link_matches_filter(link, filter_patterns or [], filter_type)
                or link in links
            )
            if should_remove:
                to_delete.append(link)
                if delete:
                    shutil.rmtree(link.link_dir, ignore_errors=True)
            else:
                to_keep.append(link)
    finally:
        timer.end()

    remove_from_sql_main_index(links=to_delete, out_dir=out_dir)
    write_main_index(links=to_keep, out_dir=out_dir, finished=True)
    log_removal_finished(len(all_links), len(to_keep))

    return to_keep


@enforce_types
def update(resume: Optional[float]=None,
           only_new: bool=ONLY_NEW,
           index_only: bool=False,
           overwrite: bool=False,
           filter_patterns_str: Optional[str]=None,
           filter_patterns: Optional[List[str]]=None,
           filter_type: Optional[str]=None,
           status: Optional[str]=None,
           after: Optional[str]=None,
           before: Optional[str]=None,
           out_dir: str=OUTPUT_DIR) -> List[Link]:
    """Import any new links from subscriptions and retry any previously failed/skipped links"""

    check_data_folder(out_dir=out_dir)
    check_dependencies()

    # Step 1: Load list of links from the existing index
    #         merge in and dedupe new links from import_path
    all_links: List[Link] = []
    new_links: List[Link] = []
    all_links = load_main_index(out_dir=out_dir)

    # Step 2: Write updated index with deduped old and new links back to disk
    write_main_index(links=list(all_links), out_dir=out_dir)

    # Step 3: Filter for selected_links
    matching_links = list_links(
        filter_patterns=filter_patterns,
        filter_type=filter_type,
        before=before,
        after=after,
    )
    matching_folders = list_folders(
        links=list(matching_links),
        status=status,
        out_dir=out_dir,
    )
    all_links = [link for link in matching_folders.values() if link]

    if index_only:
        return all_links

    # Step 3: Run the archive methods for each link
    to_archive = new_links if only_new else all_links
    archive_links(to_archive, overwrite=overwrite, out_dir=out_dir)

    # Step 4: Re-write links index with updated titles, icons, and resources
    all_links = load_main_index(out_dir=out_dir)
    write_main_index(links=list(all_links), out_dir=out_dir, finished=True)
    return all_links


@enforce_types
def list_all(filter_patterns_str: Optional[str]=None,
             filter_patterns: Optional[List[str]]=None,
             filter_type: str='exact',
             status: Optional[str]=None,
             after: Optional[float]=None,
             before: Optional[float]=None,
             sort: Optional[str]=None,
             csv: Optional[str]=None,
             json: bool=False,
             out_dir: str=OUTPUT_DIR) -> Iterable[Link]:
    """List, filter, and export information about archive entries"""

    check_data_folder(out_dir=out_dir)

    if filter_patterns and filter_patterns_str:
        stderr(
            '[X] You should either pass filter patterns as arguments '
            'or via stdin, but not both.\n',
            color='red',
        )
        raise SystemExit(2)
    elif filter_patterns_str:
        filter_patterns = filter_patterns_str.split('\n')

    links = list_links(
        filter_patterns=filter_patterns,
        filter_type=filter_type,
        before=before,
        after=after,
    )

    if sort:
        links = sorted(links, key=lambda link: getattr(link, sort))

    folders = list_folders(
        links=list(links),
        status=status,
        out_dir=out_dir,
    )
    print(printable_folders(folders, json=json, csv=csv))
    return folders


@enforce_types
def list_links(filter_patterns: Optional[List[str]]=None,
               filter_type: str='exact',
               after: Optional[float]=None,
               before: Optional[float]=None,
               out_dir: str=OUTPUT_DIR) -> Iterable[Link]:

    check_data_folder(out_dir=out_dir)

    all_links = load_main_index(out_dir=out_dir)

    for link in all_links:
        if after is not None and float(link.timestamp) < after:
            continue
        if before is not None and float(link.timestamp) > before:
            continue
        if filter_patterns:
            if link_matches_filter(link, filter_patterns, filter_type):
                yield link
        else:
            yield link
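
# Illustrative usage (assumed): stream links matching a regex filter that were added
# after a given timestamp, without materializing the whole index:
#   for link in list_links(filter_patterns=[r'.*example\.com.*'], filter_type='regex', after=1510994821.0):
#       print(link.url)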


@enforce_types
def list_folders(links: List[Link],
                 status: str,
                 out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:

    check_data_folder(out_dir=out_dir)

    if status == 'indexed':
        return get_indexed_folders(links, out_dir=out_dir)
    elif status == 'archived':
        return get_archived_folders(links, out_dir=out_dir)
    elif status == 'unarchived':
        return get_unarchived_folders(links, out_dir=out_dir)
    elif status == 'present':
        return get_present_folders(links, out_dir=out_dir)
    elif status == 'valid':
        return get_valid_folders(links, out_dir=out_dir)
    elif status == 'invalid':
        return get_invalid_folders(links, out_dir=out_dir)
    elif status == 'duplicate':
        return get_duplicate_folders(links, out_dir=out_dir)
    elif status == 'orphaned':
        return get_orphaned_folders(links, out_dir=out_dir)
    elif status == 'corrupted':
        return get_corrupted_folders(links, out_dir=out_dir)
    elif status == 'unrecognized':
        return get_unrecognized_folders(links, out_dir=out_dir)

    raise ValueError('Status not recognized.')


@enforce_types
def config(config_options_str: Optional[str]=None,
           config_options: Optional[List[str]]=None,
           get: bool=False,
           set: bool=False,
           reset: bool=False,
           out_dir: str=OUTPUT_DIR) -> None:
    """Get and set your ArchiveBox project configuration values"""

    check_data_folder(out_dir=out_dir)

    if config_options and config_options_str:
        stderr(
            '[X] You should either pass config values as arguments '
            'or via stdin, but not both.\n',
            color='red',
        )
        raise SystemExit(2)
    elif config_options_str:
        config_options = config_options_str.split('\n')

    config_options = config_options or []

    no_args = not (get or set or reset or config_options)

    matching_config: ConfigDict = {}
    if get or no_args:
        if config_options:
            config_options = [get_real_name(key) for key in config_options]
            matching_config = {key: CONFIG[key] for key in config_options if key in CONFIG}
            failed_config = [key for key in config_options if key not in CONFIG]
            if failed_config:
                stderr()
                stderr('[X] These options failed to get', color='red')
                stderr(' {}'.format('\n '.join(failed_config)))
                raise SystemExit(1)
        else:
            matching_config = CONFIG

        print(printable_config(matching_config))
        raise SystemExit(not matching_config)
    elif set:
        new_config = {}
        failed_options = []
        for line in config_options:
            if line.startswith('#') or not line.strip():
                continue
            if '=' not in line:
                stderr('[X] Config KEY=VALUE must have an = sign in it', color='red')
                stderr(f' {line}')
                raise SystemExit(2)

            raw_key, val = line.split('=', 1)
            raw_key = raw_key.upper().strip()
            key = get_real_name(raw_key)
            if key != raw_key:
                stderr(f'[i] Note: The config option {raw_key} has been renamed to {key}, please use the new name going forwards.', color='lightyellow')

            if key in CONFIG:
                new_config[key] = val.strip()
            else:
                failed_options.append(line)

        if new_config:
            before = CONFIG
            matching_config = write_config_file(new_config, out_dir=OUTPUT_DIR)
            after = load_all_config()
            print(printable_config(matching_config))

            side_effect_changes: ConfigDict = {}
            for key, val in after.items():
                if key in USER_CONFIG and (before[key] != after[key]) and (key not in matching_config):
                    side_effect_changes[key] = after[key]

            if side_effect_changes:
                stderr()
                stderr('[i] Note: This change also affected these other options that depended on it:', color='lightyellow')
                print(' {}'.format(printable_config(side_effect_changes, prefix=' ')))

        if failed_options:
            stderr()
            stderr('[X] These options failed to set (check for typos):', color='red')
            stderr(' {}'.format('\n '.join(failed_options)))
        raise SystemExit(bool(failed_options))
    elif reset:
        stderr('[X] This command is not implemented yet.', color='red')
        stderr(' Please manually remove the relevant lines from your config file:')
        stderr(f' {CONFIG_FILE}')
        raise SystemExit(2)
    else:
        stderr('[X] You must pass either --get or --set, or no arguments to get the whole config.', color='red')
        stderr(' archivebox config')
        stderr(' archivebox config --get SOME_KEY')
        stderr(' archivebox config --set SOME_KEY=SOME_VALUE')
        raise SystemExit(2)
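
# Illustrative usage (assumed), mirroring the CLI examples printed above:
#   config(get=True, config_options=['ONLY_NEW'])        # print a single value
#   config(set=True, config_options=['ONLY_NEW=True'])   # persist a new value to the config file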


@enforce_types
def schedule(add: bool=False,
             show: bool=False,
             clear: bool=False,
             foreground: bool=False,
             run_all: bool=False,
             quiet: bool=False,
             every: Optional[str]=None,
             import_path: Optional[str]=None,
             out_dir: str=OUTPUT_DIR):
    """Set ArchiveBox to regularly import URLs at specific times using cron"""

    check_data_folder(out_dir=out_dir)

    os.makedirs(os.path.join(out_dir, LOGS_DIR_NAME), exist_ok=True)

    cron = CronTab(user=True)
    cron = dedupe_cron_jobs(cron)

    existing_jobs = list(cron.find_comment(CRON_COMMENT))
    if foreground or run_all:
        if import_path or (not existing_jobs):
            stderr('{red}[X] You must schedule some jobs first before running in foreground mode.{reset}'.format(**ANSI))
            stderr(' archivebox schedule --every=hour https://example.com/some/rss/feed.xml')
            raise SystemExit(1)
        print('{green}[*] Running {} ArchiveBox jobs in foreground task scheduler...{reset}'.format(len(existing_jobs), **ANSI))
        if run_all:
            try:
                for job in existing_jobs:
                    sys.stdout.write(f' > {job.command}')
                    sys.stdout.flush()
                    job.run()
                    sys.stdout.write(f'\r √ {job.command}\n')
            except KeyboardInterrupt:
                print('\n{green}[√] Stopped.{reset}'.format(**ANSI))
                raise SystemExit(1)
        if foreground:
            try:
                for result in cron.run_scheduler():
                    print(result)
            except KeyboardInterrupt:
                print('\n{green}[√] Stopped.{reset}'.format(**ANSI))
                raise SystemExit(1)
    elif show:
        if existing_jobs:
            print('\n'.join(str(cmd) for cmd in existing_jobs))
        else:
            stderr('{red}[X] There are no ArchiveBox cron jobs scheduled for your user ({}).{reset}'.format(USER, **ANSI))
            stderr(' To schedule a new job, run:')
            stderr(' archivebox schedule --every=[timeperiod] https://example.com/some/rss/feed.xml')
        raise SystemExit(0)
    elif clear:
        print(cron.remove_all(comment=CRON_COMMENT))
        cron.write()
        raise SystemExit(0)
    elif every:
        quoted = lambda s: f'"{s}"' if s and ' ' in s else s
        cmd = [
            'cd',
            quoted(out_dir),
            '&&',
            quoted(ARCHIVEBOX_BINARY),
            *(['add', f'"{import_path}"'] if import_path else ['update']),
            '2>&1',
            '>',
            quoted(os.path.join(LOGS_DIR, 'archivebox.log')),
        ]
        new_job = cron.new(command=' '.join(cmd), comment=CRON_COMMENT)

        if every in ('minute', 'hour', 'day', 'week', 'month', 'year'):
            set_every = getattr(new_job.every(), every)
            set_every()
        elif CronSlices.is_valid(every):
            new_job.setall(every)
        else:
            stderr('{red}[X] Got invalid timeperiod for cron task.{reset}'.format(**ANSI))
            stderr(' It must be one of minute/hour/day/week/month/year')
            stderr(' or a quoted cron-format schedule like:')
            stderr(' archivebox schedule --every=day https://example.com/some/rss/feed.xml')
            stderr(' archivebox schedule --every="0/5 * * * *" https://example.com/some/rss/feed.xml')
            raise SystemExit(1)

        cron = dedupe_cron_jobs(cron)
        cron.write()

        total_runs = sum(j.frequency_per_year() for j in cron)
        existing_jobs = list(cron.find_comment(CRON_COMMENT))

        print()
        print('{green}[√] Scheduled new ArchiveBox cron job for user: {} ({} jobs are active).{reset}'.format(USER, len(existing_jobs), **ANSI))
        print('\n'.join(f' > {cmd}' if str(cmd) == str(new_job) else f' {cmd}' for cmd in existing_jobs))
        if total_runs > 60 and not quiet:
            stderr()
            stderr('{lightyellow}[!] With the current cron config, ArchiveBox is estimated to run >{} times per year.{reset}'.format(total_runs, **ANSI))
            stderr(' Congrats on being an enthusiastic internet archiver! 👌')
            stderr()
            stderr(' Make sure you have enough storage space available to hold all the data.')
            stderr(' Using a compressed/deduped filesystem like ZFS is recommended if you plan on archiving a lot.')
        raise SystemExit(0)
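
# Roughly, the crontab command assembled above has the following shape (paths are
# illustrative; the schedule prefix depends on --every and the job is tagged with CRON_COMMENT):
#   cd "/data/archive" && archivebox update 2>&1 > "/data/archive/logs/archivebox.log"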


@enforce_types
def server(runserver_args: Optional[List[str]]=None,
           reload: bool=False,
           debug: bool=False,
           init: bool=False,
           out_dir: str=OUTPUT_DIR) -> None:
    """Run the ArchiveBox HTTP server"""

    runserver_args = runserver_args or []

    if init:
        run_subcommand('init', stdin=None, pwd=out_dir)

    # setup config for django runserver
    from . import config
    config.SHOW_PROGRESS = False
    config.DEBUG = config.DEBUG or debug

    check_data_folder(out_dir=out_dir)
    setup_django(out_dir)

    from django.core.management import call_command
    from django.contrib.auth.models import User

    admin_user = User.objects.filter(is_superuser=True).order_by('date_joined').only('username').last()

    print('{green}[+] Starting ArchiveBox webserver...{reset}'.format(**ANSI))
    if admin_user:
        print("{lightred}[i] The admin username is:{lightblue} {}{reset}".format(admin_user.username, **ANSI))
    else:
        print('{lightyellow}[!] No admin users exist yet, you will not be able to edit links in the UI.{reset}'.format(**ANSI))
        print()
        print(' To create an admin user, run:')
        print(' archivebox manage createsuperuser')
        print()

    # fallback to serving staticfiles insecurely with django when DEBUG=False
    if not config.DEBUG:
        runserver_args.append('--insecure')  # TODO: serve statics w/ nginx instead

    # toggle autoreloading when archivebox code changes (it's on by default)
    if not reload:
        runserver_args.append('--noreload')

    config.SHOW_PROGRESS = False
    config.DEBUG = config.DEBUG or debug

    call_command("runserver", *runserver_args)


@enforce_types
def manage(args: Optional[List[str]]=None, out_dir: str=OUTPUT_DIR) -> None:
    """Run an ArchiveBox Django management command"""

    check_data_folder(out_dir=out_dir)
    setup_django(out_dir)
    from django.core.management import execute_from_command_line

    execute_from_command_line([f'{ARCHIVEBOX_BINARY} manage', *(args or ['help'])])


@enforce_types
def shell(out_dir: str=OUTPUT_DIR) -> None:
    """Enter an interactive ArchiveBox Django shell"""

    check_data_folder(out_dir=out_dir)
    setup_django(OUTPUT_DIR)
    from django.core.management import call_command
    call_command("shell_plus")