# archivebox/main.py
__package__ = 'archivebox'

import os
import sys
import shutil

from typing import Dict, List, Optional, Iterable, IO, Union
from crontab import CronTab, CronSlices

from .cli import (
    list_subcommands,
    run_subcommand,
    display_first,
    meta_cmds,
    main_cmds,
    archive_cmds,
)
from .parsers import (
    save_text_as_source,
    save_file_as_source,
)
from .index.schema import Link
from .util import enforce_types  # type: ignore
from .system import get_dir_size, dedupe_cron_jobs, CRON_COMMENT
from .index import (
    load_main_index,
    parse_links_from_source,
    dedupe_links,
    write_main_index,
    link_matches_filter,
    get_indexed_folders,
    get_archived_folders,
    get_unarchived_folders,
    get_present_folders,
    get_valid_folders,
    get_invalid_folders,
    get_duplicate_folders,
    get_orphaned_folders,
    get_corrupted_folders,
    get_unrecognized_folders,
    fix_invalid_folder_locations,
)
from .index.json import (
    parse_json_main_index,
    parse_json_links_details,
)
from .index.sql import (
    parse_sql_main_index,
    get_admins,
    apply_migrations,
    remove_from_sql_main_index,
)
from .index.html import parse_html_main_index
from .extractors import archive_links
from .config import (
    stderr,
    ConfigDict,
    ANSI,
    IS_TTY,
    USER,
    ARCHIVEBOX_BINARY,
    ONLY_NEW,
    OUTPUT_DIR,
    SOURCES_DIR,
    ARCHIVE_DIR,
    LOGS_DIR,
    CONFIG_FILE,
    ARCHIVE_DIR_NAME,
    SOURCES_DIR_NAME,
    LOGS_DIR_NAME,
    STATIC_DIR_NAME,
    JSON_INDEX_FILENAME,
    HTML_INDEX_FILENAME,
    SQL_INDEX_FILENAME,
    ROBOTS_TXT_FILENAME,
    FAVICON_FILENAME,
    check_dependencies,
    check_data_folder,
    write_config_file,
    setup_django,
    VERSION,
    CODE_LOCATIONS,
    EXTERNAL_LOCATIONS,
    DATA_LOCATIONS,
    DEPENDENCIES,
    load_all_config,
    CONFIG,
    USER_CONFIG,
    get_real_name,
)
from .logging_util import (
    TERM_WIDTH,
    TimedProgress,
    log_importing_started,
    log_crawl_started,
    log_removal_started,
    log_removal_finished,
    log_list_started,
    log_list_finished,
    printable_config,
    printable_folders,
    printable_filesize,
    printable_folder_status,
    printable_dependency_version,
)


ALLOWED_IN_OUTPUT_DIR = {
    '.DS_Store',
    '.venv',
    'venv',
    'virtualenv',
    '.virtualenv',
    ARCHIVE_DIR_NAME,
    SOURCES_DIR_NAME,
    LOGS_DIR_NAME,
    STATIC_DIR_NAME,
    SQL_INDEX_FILENAME,
    JSON_INDEX_FILENAME,
    HTML_INDEX_FILENAME,
    ROBOTS_TXT_FILENAME,
    FAVICON_FILENAME,
}
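
# ALLOWED_IN_OUTPUT_DIR is how init() below decides whether a folder is "empty
# enough" to initialize into. A minimal sketch of that check (the same logic
# init() uses, repeated here only for illustration):
#   is_empty = not (set(os.listdir(out_dir)) - ALLOWED_IN_OUTPUT_DIR)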

@enforce_types
def help(out_dir: str=OUTPUT_DIR) -> None:
    """Print the ArchiveBox help message and usage"""

    all_subcommands = list_subcommands()
    COMMANDS_HELP_TEXT = '\n    '.join(
        f'{cmd.ljust(20)} {summary}'
        for cmd, summary in all_subcommands.items()
        if cmd in meta_cmds
    ) + '\n\n    ' + '\n    '.join(
        f'{cmd.ljust(20)} {summary}'
        for cmd, summary in all_subcommands.items()
        if cmd in main_cmds
    ) + '\n\n    ' + '\n    '.join(
        f'{cmd.ljust(20)} {summary}'
        for cmd, summary in all_subcommands.items()
        if cmd in archive_cmds
    ) + '\n\n    ' + '\n    '.join(
        f'{cmd.ljust(20)} {summary}'
        for cmd, summary in all_subcommands.items()
        if cmd not in display_first
    )

    if os.path.exists(os.path.join(out_dir, JSON_INDEX_FILENAME)):
        print('''{green}ArchiveBox v{}: The self-hosted internet archive.{reset}

{lightred}Active data directory:{reset}
    {}

{lightred}Usage:{reset}
    archivebox [command] [--help] [--version] [...args]

{lightred}Commands:{reset}
    {}

{lightred}Example Use:{reset}
    mkdir my-archive; cd my-archive/
    archivebox init
    archivebox status

    archivebox add https://example.com/some/page
    archivebox add --depth=1 ~/Downloads/bookmarks_export.html
    archivebox list --sort=timestamp --csv=timestamp,url,is_archived
    archivebox schedule --every=week https://example.com/some/feed.rss
    archivebox update --resume=15109948213.123

{lightred}Documentation:{reset}
    https://github.com/pirate/ArchiveBox/wiki
'''.format(VERSION, out_dir, COMMANDS_HELP_TEXT, **ANSI))
    else:
        print('{green}Welcome to ArchiveBox v{}!{reset}'.format(VERSION, **ANSI))
        print()
        print('To import an existing archive (from a previous version of ArchiveBox):')
        print('    1. cd into your data dir OUTPUT_DIR (usually ArchiveBox/output) and run:')
        print('    2. archivebox init')
        print()
        print('To start a new archive:')
        print('    1. Create an empty directory, then cd into it and run:')
        print('    2. archivebox init')
        print()
        print('For more information, see the documentation here:')
        print('    https://github.com/pirate/ArchiveBox/wiki')

@enforce_types
def version(quiet: bool=False,
            out_dir: str=OUTPUT_DIR) -> None:
    """Print the ArchiveBox version and dependency information"""

    if quiet:
        print(VERSION)
    else:
        print('ArchiveBox v{}'.format(VERSION))
        print()

        print('{white}[i] Dependency versions:{reset}'.format(**ANSI))
        for name, dependency in DEPENDENCIES.items():
            print(printable_dependency_version(name, dependency))

        print()
        print('{white}[i] Code locations:{reset}'.format(**ANSI))
        for name, folder in CODE_LOCATIONS.items():
            print(printable_folder_status(name, folder))

        print()
        print('{white}[i] External locations:{reset}'.format(**ANSI))
        for name, folder in EXTERNAL_LOCATIONS.items():
            print(printable_folder_status(name, folder))

        print()
        print('{white}[i] Data locations:{reset}'.format(**ANSI))
        for name, folder in DATA_LOCATIONS.items():
            print(printable_folder_status(name, folder))

        print()
        check_dependencies()

@enforce_types
def run(subcommand: str,
        subcommand_args: Optional[List[str]],
        stdin: Optional[IO]=None,
        out_dir: str=OUTPUT_DIR) -> None:
    """Run a given ArchiveBox subcommand with the given list of args"""

    run_subcommand(
        subcommand=subcommand,
        subcommand_args=subcommand_args,
        stdin=stdin,
        pwd=out_dir,
    )
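
# Illustrative usage sketch only: calling
#   run(subcommand='add', subcommand_args=['https://example.com'])
# from Python is roughly equivalent to running `archivebox add https://example.com`
# from a shell inside the data folder.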

@enforce_types
def init(force: bool=False, out_dir: str=OUTPUT_DIR) -> None:
    """Initialize a new ArchiveBox collection in the current directory"""

    os.makedirs(out_dir, exist_ok=True)
    is_empty = not len(set(os.listdir(out_dir)) - ALLOWED_IN_OUTPUT_DIR)
    existing_index = os.path.exists(os.path.join(out_dir, JSON_INDEX_FILENAME))

    if is_empty and not existing_index:
        print('{green}[+] Initializing a new ArchiveBox collection in this folder...{reset}'.format(**ANSI))
        print(f'    {out_dir}')
        print('{green}------------------------------------------------------------------{reset}'.format(**ANSI))
    elif existing_index:
        print('{green}[*] Updating existing ArchiveBox collection in this folder...{reset}'.format(**ANSI))
        print(f'    {out_dir}')
        print('{green}------------------------------------------------------------------{reset}'.format(**ANSI))
    else:
        if force:
            stderr('[!] This folder appears to already have files in it, but no index.json is present.', color='lightyellow')
            stderr('    Because --force was passed, ArchiveBox will initialize anyway (which may overwrite existing files).')
        else:
            stderr(
                ("{red}[X] This folder appears to already have files in it, but no index.json is present.{reset}\n\n"
                 "    You must run init in a completely empty directory, or an existing data folder.\n\n"
                 "    {lightred}Hint:{reset} To import an existing data folder make sure to cd into the folder first,\n"
                 "    then run 'archivebox init' to pick up where you left off.\n\n"
                 "    (Always make sure your data folder is backed up first before updating ArchiveBox)"
                ).format(**ANSI)
            )
            raise SystemExit(2)

    if existing_index:
        print('\n{green}[*] Verifying archive folder structure...{reset}'.format(**ANSI))
    else:
        print('\n{green}[+] Building archive folder structure...{reset}'.format(**ANSI))

    os.makedirs(SOURCES_DIR, exist_ok=True)
    print(f'    √ {SOURCES_DIR}')

    os.makedirs(ARCHIVE_DIR, exist_ok=True)
    print(f'    √ {ARCHIVE_DIR}')

    os.makedirs(LOGS_DIR, exist_ok=True)
    print(f'    √ {LOGS_DIR}')

    write_config_file({}, out_dir=out_dir)
    print(f'    √ {CONFIG_FILE}')

    if os.path.exists(os.path.join(out_dir, SQL_INDEX_FILENAME)):
        print('\n{green}[*] Verifying main SQL index and running migrations...{reset}'.format(**ANSI))
    else:
        print('\n{green}[+] Building main SQL index and running migrations...{reset}'.format(**ANSI))

    setup_django(out_dir, check_db=False)
    DATABASE_FILE = os.path.join(out_dir, SQL_INDEX_FILENAME)
    print(f'    √ {DATABASE_FILE}')
    print()
    for migration_line in apply_migrations(out_dir):
        print(f'    {migration_line}')

    assert os.path.exists(DATABASE_FILE)

    # from django.contrib.auth.models import User
    # if IS_TTY and not User.objects.filter(is_superuser=True).exists():
    #     print('{green}[+] Creating admin user account...{reset}'.format(**ANSI))
    #     call_command("createsuperuser", interactive=True)

    print()
    print('{green}[*] Collecting links from any existing indexes and archive folders...{reset}'.format(**ANSI))

    all_links: Dict[str, Link] = {}
    if existing_index:
        all_links = {
            link.url: link
            for link in load_main_index(out_dir=out_dir, warn=False)
        }
        print('    √ Loaded {} links from existing main index.'.format(len(all_links)))

    # Links in data folders that dont match their timestamp
    fixed, cant_fix = fix_invalid_folder_locations(out_dir=out_dir)
    if fixed:
        print('    {lightyellow}√ Fixed {} data directory locations that didn\'t match their link timestamps.{reset}'.format(len(fixed), **ANSI))
    if cant_fix:
        print('    {lightyellow}! Could not fix {} data directory locations due to conflicts with existing folders.{reset}'.format(len(cant_fix), **ANSI))

    # Links in JSON index but not in main index
    orphaned_json_links = {
        link.url: link
        for link in parse_json_main_index(out_dir)
        if link.url not in all_links
    }
    if orphaned_json_links:
        all_links.update(orphaned_json_links)
        print('    {lightyellow}√ Added {} orphaned links from existing JSON index...{reset}'.format(len(orphaned_json_links), **ANSI))

    # Links in SQL index but not in main index
    orphaned_sql_links = {
        link.url: link
        for link in parse_sql_main_index(out_dir)
        if link.url not in all_links
    }
    if orphaned_sql_links:
        all_links.update(orphaned_sql_links)
        print('    {lightyellow}√ Added {} orphaned links from existing SQL index...{reset}'.format(len(orphaned_sql_links), **ANSI))

    # Links in data dir indexes but not in main index
    orphaned_data_dir_links = {
        link.url: link
        for link in parse_json_links_details(out_dir)
        if link.url not in all_links
    }
    if orphaned_data_dir_links:
        all_links.update(orphaned_data_dir_links)
        print('    {lightyellow}√ Added {} orphaned links from existing archive directories.{reset}'.format(len(orphaned_data_dir_links), **ANSI))

    # Links in invalid/duplicate data dirs
    invalid_folders = {
        folder: link
        for folder, link in get_invalid_folders(all_links.values(), out_dir=out_dir).items()
    }
    if invalid_folders:
        print('    {lightyellow}! Skipped adding {} invalid link data directories.{reset}'.format(len(invalid_folders), **ANSI))
        print('        X ' + '\n        X '.join(f'{folder} {link}' for folder, link in invalid_folders.items()))
        print()
        print('    {lightred}Hint:{reset} For more information about the link data directories that were skipped, run:'.format(**ANSI))
        print('        archivebox status')
        print('        archivebox list --status=invalid')

    write_main_index(list(all_links.values()), out_dir=out_dir)

    print('\n{green}------------------------------------------------------------------{reset}'.format(**ANSI))
    if existing_index:
        print('{green}[√] Done. Verified and updated the existing ArchiveBox collection.{reset}'.format(**ANSI))
    else:
        print('{green}[√] Done. A new ArchiveBox collection was initialized ({} links).{reset}'.format(len(all_links), **ANSI))
    print()
    print('    {lightred}Hint:{reset} To view your archive index, run:'.format(**ANSI))
    print('        archivebox server  # then visit http://127.0.0.1:8000')
    print()
    print('    To add new links, you can run:')
    print("        archivebox add ~/some/path/or/url/to/list_of_links.txt")
    print()
    print('    For more usage and examples, run:')
    print('        archivebox help')

@enforce_types
def status(out_dir: str=OUTPUT_DIR) -> None:
    """Print out some info and statistics about the archive collection"""

    check_data_folder(out_dir=out_dir)

    from core.models import Snapshot
    from django.contrib.auth import get_user_model
    User = get_user_model()

    print('{green}[*] Scanning archive main index...{reset}'.format(**ANSI))
    print(ANSI['lightyellow'], f'   {out_dir}/*', ANSI['reset'])
    num_bytes, num_dirs, num_files = get_dir_size(out_dir, recursive=False, pattern='index.')
    size = printable_filesize(num_bytes)
    print(f'    Index size: {size} across {num_files} files')
    print()

    links = list(load_main_index(out_dir=out_dir))
    num_json_links = len(links)
    num_sql_links = sum(1 for link in parse_sql_main_index(out_dir=out_dir))
    num_html_links = sum(1 for url in parse_html_main_index(out_dir=out_dir))
    num_link_details = sum(1 for link in parse_json_links_details(out_dir=out_dir))
    print(f'    > JSON Main Index: {num_json_links} links'.ljust(36), f'(found in {JSON_INDEX_FILENAME})')
    print(f'    > SQL Main Index: {num_sql_links} links'.ljust(36), f'(found in {SQL_INDEX_FILENAME})')
    print(f'    > HTML Main Index: {num_html_links} links'.ljust(36), f'(found in {HTML_INDEX_FILENAME})')
    print(f'    > JSON Link Details: {num_link_details} links'.ljust(36), f'(found in {ARCHIVE_DIR_NAME}/*/index.json)')

    if num_html_links != len(links) or num_sql_links != len(links):
        print()
        print('    {lightred}Hint:{reset} You can fix index count differences automatically by running:'.format(**ANSI))
        print('        archivebox init')

    print()
    print('{green}[*] Scanning archive data directories...{reset}'.format(**ANSI))
    print(ANSI['lightyellow'], f'   {ARCHIVE_DIR}/*', ANSI['reset'])
    num_bytes, num_dirs, num_files = get_dir_size(ARCHIVE_DIR)
    size = printable_filesize(num_bytes)
    print(f'    Size: {size} across {num_files} files in {num_dirs} directories')
    print(ANSI['black'])
    num_indexed = len(get_indexed_folders(links, out_dir=out_dir))
    num_archived = len(get_archived_folders(links, out_dir=out_dir))
    num_unarchived = len(get_unarchived_folders(links, out_dir=out_dir))
    print(f'    > indexed: {num_indexed}'.ljust(36), f'({get_indexed_folders.__doc__})')
    print(f'      > archived: {num_archived}'.ljust(36), f'({get_archived_folders.__doc__})')
    print(f'      > unarchived: {num_unarchived}'.ljust(36), f'({get_unarchived_folders.__doc__})')

    num_present = len(get_present_folders(links, out_dir=out_dir))
    num_valid = len(get_valid_folders(links, out_dir=out_dir))
    print()
    print(f'    > present: {num_present}'.ljust(36), f'({get_present_folders.__doc__})')
    print(f'      > valid: {num_valid}'.ljust(36), f'({get_valid_folders.__doc__})')

    duplicate = get_duplicate_folders(links, out_dir=out_dir)
    orphaned = get_orphaned_folders(links, out_dir=out_dir)
    corrupted = get_corrupted_folders(links, out_dir=out_dir)
    unrecognized = get_unrecognized_folders(links, out_dir=out_dir)
    num_invalid = len({**duplicate, **orphaned, **corrupted, **unrecognized})
    print(f'      > invalid: {num_invalid}'.ljust(36), f'({get_invalid_folders.__doc__})')
    print(f'        > duplicate: {len(duplicate)}'.ljust(36), f'({get_duplicate_folders.__doc__})')
    print(f'        > orphaned: {len(orphaned)}'.ljust(36), f'({get_orphaned_folders.__doc__})')
    print(f'        > corrupted: {len(corrupted)}'.ljust(36), f'({get_corrupted_folders.__doc__})')
    print(f'        > unrecognized: {len(unrecognized)}'.ljust(36), f'({get_unrecognized_folders.__doc__})')
    print(ANSI['reset'])

    if num_indexed:
        print('    {lightred}Hint:{reset} You can list link data directories by status like so:'.format(**ANSI))
        print('        archivebox list --status=<status>  (e.g. indexed, corrupted, archived, etc.)')

    if orphaned:
        print('    {lightred}Hint:{reset} To automatically import orphaned data directories into the main index, run:'.format(**ANSI))
        print('        archivebox init')

    if num_invalid:
        print('    {lightred}Hint:{reset} You may need to manually remove or fix some invalid data directories, afterwards make sure to run:'.format(**ANSI))
        print('        archivebox init')

    print()
    print('{green}[*] Scanning recent archive changes and user logins:{reset}'.format(**ANSI))
    print(ANSI['lightyellow'], f'   {LOGS_DIR}/*', ANSI['reset'])
    users = get_admins().values_list('username', flat=True)
    print(f'    UI users {len(users)}: {", ".join(users)}')
    last_login = User.objects.order_by('last_login').last()
    if last_login:
        print(f'    Last UI login: {last_login.username} @ {str(last_login.last_login)[:16]}')
    last_updated = Snapshot.objects.order_by('updated').last()
    if last_updated:  # guard against an empty collection with no snapshots yet
        print(f'    Last changes: {str(last_updated.updated)[:16]}')

    if not users:
        print()
        print('    {lightred}Hint:{reset} You can create an admin user by running:'.format(**ANSI))
        print('        archivebox manage createsuperuser')

    print()
    for snapshot in Snapshot.objects.order_by('-updated')[:10]:
        if not snapshot.updated:
            continue
        print(
            ANSI['black'],
            (
                f'   > {str(snapshot.updated)[:16]} '
                f'[{snapshot.num_outputs} {("X", "√")[snapshot.is_archived]} {printable_filesize(snapshot.archive_size)}] '
                f'"{snapshot.title}": {snapshot.url}'
            )[:TERM_WIDTH()],
            ANSI['reset'],
        )
    print(ANSI['black'], '   ...', ANSI['reset'])

@enforce_types
def add(urls: Union[str, List[str]],
        depth: int=0,
        update_all: bool=not ONLY_NEW,
        index_only: bool=False,
        out_dir: str=OUTPUT_DIR) -> List[Link]:
    """Add a new URL or list of URLs to your archive"""

    assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)'

    check_data_folder(out_dir=out_dir)
    check_dependencies()

    # Load list of links from the existing index
    all_links: List[Link] = []
    new_links: List[Link] = []
    all_links = load_main_index(out_dir=out_dir)

    log_importing_started(urls=urls, depth=depth, index_only=index_only)
    if isinstance(urls, str):
        # save verbatim stdin to sources
        write_ahead_log = save_text_as_source(urls, filename='{ts}-import.txt', out_dir=out_dir)
    elif isinstance(urls, list):
        # save verbatim args to sources
        write_ahead_log = save_text_as_source('\n'.join(urls), filename='{ts}-import.txt', out_dir=out_dir)
    new_links += parse_links_from_source(write_ahead_log)

    # If we're going one level deeper, download each link and look for more links
    new_links_depth = []
    if new_links and depth == 1:
        log_crawl_started(new_links)
        for new_link in new_links:
            downloaded_file = save_file_as_source(new_link.url, filename='{ts}-crawl-{basename}.txt', out_dir=out_dir)
            new_links_depth += parse_links_from_source(downloaded_file)

    all_links, new_links = dedupe_links(all_links, new_links + new_links_depth)
    write_main_index(links=all_links, out_dir=out_dir, finished=not new_links)

    if index_only:
        return all_links

    # Run the archive methods for each link
    to_archive = all_links if update_all else new_links
    archive_links(to_archive, out_dir=out_dir)

    # Re-write links index with updated titles, icons, and resources
    if to_archive:
        all_links = load_main_index(out_dir=out_dir)
        write_main_index(links=list(all_links), out_dir=out_dir, finished=True)

    return all_links
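
# Illustrative usage sketch (assumes an already-initialized collection):
#   add('https://example.com/some/page')              # archive a single URL
#   add(['https://a.example', 'https://b.example'])   # archive a list of URLs
#   add('https://example.com/feed.rss', depth=1)      # also crawl links found one hop away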

@enforce_types
def remove(filter_str: Optional[str]=None,
           filter_patterns: Optional[List[str]]=None,
           filter_type: str='exact',
           after: Optional[float]=None,
           before: Optional[float]=None,
           yes: bool=False,
           delete: bool=False,
           out_dir: str=OUTPUT_DIR) -> List[Link]:
    """Remove the specified URLs from the archive"""

    check_data_folder(out_dir=out_dir)

    if filter_str and filter_patterns:
        stderr(
            '[X] You should pass either a pattern as an argument, '
            'or pass a list of patterns via stdin, but not both.\n',
            color='red',
        )
        raise SystemExit(2)
    elif not (filter_str or filter_patterns):
        stderr(
            '[X] You should pass either a pattern as an argument, '
            'or pass a list of patterns via stdin.',
            color='red',
        )
        stderr()
        stderr('    {lightred}Hint:{reset} To remove all urls you can run:'.format(**ANSI))
        stderr("        archivebox remove --filter-type=regex '.*'")
        stderr()
        raise SystemExit(2)
    elif filter_str:
        filter_patterns = [ptn.strip() for ptn in filter_str.split('\n')]

    log_list_started(filter_patterns, filter_type)
    timer = TimedProgress(360, prefix='      ')
    try:
        links = list(list_links(
            filter_patterns=filter_patterns,
            filter_type=filter_type,
            after=after,
            before=before,
        ))
    finally:
        timer.end()

    if not links:
        log_removal_finished(0, 0)
        raise SystemExit(1)

    log_list_finished(links)
    log_removal_started(links, yes=yes, delete=delete)

    timer = TimedProgress(360, prefix='      ')
    try:
        to_keep = []
        to_delete = []
        all_links = load_main_index(out_dir=out_dir)
        for link in all_links:
            should_remove = (
                (after is not None and float(link.timestamp) < after)
                or (before is not None and float(link.timestamp) > before)
                or link_matches_filter(link, filter_patterns, filter_type)
            )
            if should_remove:
                to_delete.append(link)
                if delete:
                    shutil.rmtree(link.link_dir, ignore_errors=True)
            else:
                to_keep.append(link)
    finally:
        timer.end()

    remove_from_sql_main_index(links=to_delete, out_dir=out_dir)
    write_main_index(links=to_keep, out_dir=out_dir, finished=True)
    log_removal_finished(len(all_links), len(to_keep))

    return to_keep
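
# Illustrative usage sketch (destructive; delete=True also removes the data dirs):
#   remove(filter_str='https://example.com/some/page', yes=True)
#   remove(filter_patterns=['.*example\\.com.*'], filter_type='regex', yes=True, delete=True)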

@enforce_types
def update(resume: Optional[float]=None,
           only_new: bool=ONLY_NEW,
           index_only: bool=False,
           overwrite: bool=False,
           filter_patterns_str: Optional[str]=None,
           filter_patterns: Optional[List[str]]=None,
           filter_type: Optional[str]=None,
           status: Optional[str]=None,
           after: Optional[str]=None,
           before: Optional[str]=None,
           out_dir: str=OUTPUT_DIR) -> List[Link]:
    """Import any new links from subscriptions and retry any previously failed/skipped links"""

    check_data_folder(out_dir=out_dir)
    check_dependencies()

    # Step 1: Load list of links from the existing index,
    #         merge in and dedupe new links from import_path
    all_links: List[Link] = []
    new_links: List[Link] = []
    all_links = load_main_index(out_dir=out_dir)

    # Step 2: Write updated index with deduped old and new links back to disk
    write_main_index(links=list(all_links), out_dir=out_dir)

    # Step 3: Filter for selected_links
    matching_links = list_links(
        filter_patterns=filter_patterns,
        filter_type=filter_type,
        before=before,
        after=after,
    )
    matching_folders = list_folders(
        links=list(matching_links),
        status=status,
        out_dir=out_dir,
    )
    all_links = [link for link in matching_folders.values() if link]

    if index_only:
        return all_links

    # Step 4: Run the archive methods for each link
    to_archive = new_links if only_new else all_links
    archive_links(to_archive, out_dir=out_dir)

    # Step 5: Re-write links index with updated titles, icons, and resources
    all_links = load_main_index(out_dir=out_dir)
    write_main_index(links=list(all_links), out_dir=out_dir, finished=True)
    return all_links
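
# Illustrative usage sketch:
#   update(index_only=True)    # just rebuild the indexes without archiving anything
#   update(only_new=False)     # retry archiving for every link in the index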

@enforce_types
def list_all(filter_patterns_str: Optional[str]=None,
             filter_patterns: Optional[List[str]]=None,
             filter_type: str='exact',
             status: Optional[str]=None,
             after: Optional[float]=None,
             before: Optional[float]=None,
             sort: Optional[str]=None,
             csv: Optional[str]=None,
             json: bool=False,
             out_dir: str=OUTPUT_DIR) -> Iterable[Link]:
    """List, filter, and export information about archive entries"""

    check_data_folder(out_dir=out_dir)

    if filter_patterns and filter_patterns_str:
        stderr(
            '[X] You should either pass filter patterns as arguments '
            'or via stdin, but not both.\n',
            color='red',
        )
        raise SystemExit(2)
    elif filter_patterns_str:
        filter_patterns = filter_patterns_str.split('\n')

    links = list_links(
        filter_patterns=filter_patterns,
        filter_type=filter_type,
        before=before,
        after=after,
    )

    if sort:
        links = sorted(links, key=lambda link: getattr(link, sort))

    folders = list_folders(
        links=list(links),
        status=status,
        out_dir=out_dir,
    )
    print(printable_folders(folders, json=json, csv=csv))
    return folders

@enforce_types
def list_links(filter_patterns: Optional[List[str]]=None,
               filter_type: str='exact',
               after: Optional[float]=None,
               before: Optional[float]=None,
               out_dir: str=OUTPUT_DIR) -> Iterable[Link]:

    check_data_folder(out_dir=out_dir)

    all_links = load_main_index(out_dir=out_dir)

    for link in all_links:
        if after is not None and float(link.timestamp) < after:
            continue
        if before is not None and float(link.timestamp) > before:
            continue

        if filter_patterns:
            if link_matches_filter(link, filter_patterns, filter_type):
                yield link
        else:
            yield link
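
# list_links() is a generator, so results can be consumed lazily. Illustrative
# sketch only (timestamps are compared as floats, per the checks above):
#   for link in list_links(after=1546297200.0):
#       print(link.timestamp, link.url)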

@enforce_types
def list_folders(links: List[Link],
                 status: str,
                 out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:

    check_data_folder(out_dir=out_dir)

    if status == 'indexed':
        return get_indexed_folders(links, out_dir=out_dir)
    elif status == 'archived':
        return get_archived_folders(links, out_dir=out_dir)
    elif status == 'unarchived':
        return get_unarchived_folders(links, out_dir=out_dir)
    elif status == 'present':
        return get_present_folders(links, out_dir=out_dir)
    elif status == 'valid':
        return get_valid_folders(links, out_dir=out_dir)
    elif status == 'invalid':
        return get_invalid_folders(links, out_dir=out_dir)
    elif status == 'duplicate':
        return get_duplicate_folders(links, out_dir=out_dir)
    elif status == 'orphaned':
        return get_orphaned_folders(links, out_dir=out_dir)
    elif status == 'corrupted':
        return get_corrupted_folders(links, out_dir=out_dir)
    elif status == 'unrecognized':
        return get_unrecognized_folders(links, out_dir=out_dir)

    raise ValueError(f'Status not recognized: {status}')
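
# Illustrative usage sketch: list_folders() maps folder paths to their parsed
# Link (or None), so problem entries can be inspected directly, e.g.:
#   corrupted = list_folders(links=list(list_links()), status='corrupted')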

@enforce_types
def config(config_options_str: Optional[str]=None,
           config_options: Optional[List[str]]=None,
           get: bool=False,
           set: bool=False,
           reset: bool=False,
           out_dir: str=OUTPUT_DIR) -> None:
    """Get and set your ArchiveBox project configuration values"""

    check_data_folder(out_dir=out_dir)

    if config_options and config_options_str:
        stderr(
            '[X] You should either pass config values as arguments '
            'or via stdin, but not both.\n',
            color='red',
        )
        raise SystemExit(2)
    elif config_options_str:
        config_options = config_options_str.split('\n')

    config_options = config_options or []
    no_args = not (get or set or reset or config_options)

    matching_config: ConfigDict = {}
    if get or no_args:
        if config_options:
            config_options = [get_real_name(key) for key in config_options]
            matching_config = {key: CONFIG[key] for key in config_options if key in CONFIG}
            failed_config = [key for key in config_options if key not in CONFIG]
            if failed_config:
                stderr()
                stderr('[X] These options failed to get', color='red')
                stderr('    {}'.format('\n    '.join(failed_config)))
                raise SystemExit(1)
        else:
            matching_config = CONFIG

        print(printable_config(matching_config))
        raise SystemExit(not matching_config)
    elif set:
        new_config = {}
        failed_options = []
        for line in config_options:
            if line.startswith('#') or not line.strip():
                continue
            if '=' not in line:
                stderr('[X] Config KEY=VALUE must have an = sign in it', color='red')
                stderr(f'    {line}')
                raise SystemExit(2)

            raw_key, val = line.split('=', 1)  # split on the first = only, values may contain =
            raw_key = raw_key.upper().strip()
            key = get_real_name(raw_key)
            if key != raw_key:
                stderr(f'[i] Note: The config option {raw_key} has been renamed to {key}, please use the new name going forwards.', color='lightyellow')

            if key in CONFIG:
                new_config[key] = val.strip()
            else:
                failed_options.append(line)

        if new_config:
            before = CONFIG
            matching_config = write_config_file(new_config, out_dir=out_dir)
            after = load_all_config()
            print(printable_config(matching_config))

            side_effect_changes: ConfigDict = {}
            for key, val in after.items():
                if key in USER_CONFIG and (before[key] != after[key]) and (key not in matching_config):
                    side_effect_changes[key] = after[key]

            if side_effect_changes:
                stderr()
                stderr('[i] Note: This change also affected these other options that depended on it:', color='lightyellow')
                print('    {}'.format(printable_config(side_effect_changes, prefix='  ')))
        if failed_options:
            stderr()
            stderr('[X] These options failed to set (check for typos):', color='red')
            stderr('    {}'.format('\n    '.join(failed_options)))
        raise SystemExit(bool(failed_options))
    elif reset:
        stderr('[X] This command is not implemented yet.', color='red')
        stderr('    Please manually remove the relevant lines from your config file:')
        stderr(f'    {CONFIG_FILE}')
        raise SystemExit(2)
    else:
        stderr('[X] You must pass either --get or --set, or no arguments to get the whole config.', color='red')
        stderr('        archivebox config')
        stderr('        archivebox config --get SOME_KEY')
        stderr('        archivebox config --set SOME_KEY=SOME_VALUE')
        raise SystemExit(2)
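
# Illustrative usage sketch, mirroring the CLI forms printed above (TIMEOUT is
# just an example key here, not a recommendation):
#   config()                                        # print the whole config
#   config(config_options=['TIMEOUT'], get=True)    # get one value
#   config(config_options=['TIMEOUT=120'], set=True)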

@enforce_types
def schedule(add: bool=False,
             show: bool=False,
             clear: bool=False,
             foreground: bool=False,
             run_all: bool=False,
             quiet: bool=False,
             every: Optional[str]=None,
             import_path: Optional[str]=None,
             out_dir: str=OUTPUT_DIR):
    """Set ArchiveBox to regularly import URLs at specific times using cron"""

    check_data_folder(out_dir=out_dir)

    os.makedirs(os.path.join(out_dir, LOGS_DIR_NAME), exist_ok=True)

    cron = CronTab(user=True)
    cron = dedupe_cron_jobs(cron)
    existing_jobs = list(cron.find_comment(CRON_COMMENT))

    if foreground or run_all:
        if import_path or (not existing_jobs):
            stderr('{red}[X] You must schedule some jobs first before running in foreground mode.{reset}'.format(**ANSI))
            stderr('        archivebox schedule --every=hour https://example.com/some/rss/feed.xml')
            raise SystemExit(1)
        print('{green}[*] Running {} ArchiveBox jobs in foreground task scheduler...{reset}'.format(len(existing_jobs), **ANSI))
        if run_all:
            try:
                for job in existing_jobs:
                    sys.stdout.write(f'  > {job.command}')
                    sys.stdout.flush()
                    job.run()
                    sys.stdout.write(f'\r  √ {job.command}\n')
            except KeyboardInterrupt:
                print('\n{green}[√] Stopped.{reset}'.format(**ANSI))
                raise SystemExit(1)
        if foreground:
            try:
                for result in cron.run_scheduler():
                    print(result)
            except KeyboardInterrupt:
                print('\n{green}[√] Stopped.{reset}'.format(**ANSI))
                raise SystemExit(1)
    elif show:
        if existing_jobs:
            print('\n'.join(str(cmd) for cmd in existing_jobs))
        else:
            stderr('{red}[X] There are no ArchiveBox cron jobs scheduled for your user ({}).{reset}'.format(USER, **ANSI))
            stderr('    To schedule a new job, run:')
            stderr('        archivebox schedule --every=[timeperiod] https://example.com/some/rss/feed.xml')
        raise SystemExit(0)
    elif clear:
        print(cron.remove_all(comment=CRON_COMMENT))
        cron.write()
        raise SystemExit(0)
    elif every:
        quoted = lambda s: f'"{s}"' if s and ' ' in s else s
        cmd = [
            'cd',
            quoted(out_dir),
            '&&',
            quoted(ARCHIVEBOX_BINARY),
            *(['add', f'"{import_path}"'] if import_path else ['update']),
            '2>&1',
            '>',
            quoted(os.path.join(LOGS_DIR, 'archivebox.log')),
        ]
        new_job = cron.new(command=' '.join(cmd), comment=CRON_COMMENT)

        if every in ('minute', 'hour', 'day', 'week', 'month', 'year'):
            set_every = getattr(new_job.every(), every)
            set_every()
        elif CronSlices.is_valid(every):
            new_job.setall(every)
        else:
            stderr('{red}[X] Got invalid timeperiod for cron task.{reset}'.format(**ANSI))
            stderr('    It must be one of minute/hour/day/week/month/year')
            stderr('    or a quoted cron-format schedule like:')
            stderr('        archivebox schedule --every=day https://example.com/some/rss/feed.xml')
            stderr('        archivebox schedule --every="0/5 * * * *" https://example.com/some/rss/feed.xml')
            raise SystemExit(1)

        cron = dedupe_cron_jobs(cron)
        cron.write()

        total_runs = sum(j.frequency_per_year() for j in cron)
        existing_jobs = list(cron.find_comment(CRON_COMMENT))

        print()
        print('{green}[√] Scheduled new ArchiveBox cron job for user: {} ({} jobs are active).{reset}'.format(USER, len(existing_jobs), **ANSI))
        print('\n'.join(f'  > {cmd}' if str(cmd) == str(new_job) else f'    {cmd}' for cmd in existing_jobs))
        if total_runs > 60 and not quiet:
            stderr()
            stderr('{lightyellow}[!] With the current cron config, ArchiveBox is estimated to run >{} times per year.{reset}'.format(total_runs, **ANSI))
            stderr('    Congrats on being an enthusiastic internet archiver! 👌')
            stderr()
            stderr('    Make sure you have enough storage space available to hold all the data.')
            stderr('    Using a compressed/deduped filesystem like ZFS is recommended if you plan on archiving a lot.')

        raise SystemExit(0)
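
# For reference, the command assembled in the `every` branch above yields a
# crontab line shaped roughly like this (paths and schedule are illustrative):
#   0 0 * * * cd "/path/to/data" && archivebox add "https://example.com/feed.rss" 2>&1 > /path/to/data/logs/archivebox.log
# Each job is tagged with CRON_COMMENT so dedupe_cron_jobs() can find and
# manage it on later runs.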

@enforce_types
def server(runserver_args: Optional[List[str]]=None,
           reload: bool=False,
           debug: bool=False,
           out_dir: str=OUTPUT_DIR) -> None:
    """Run the ArchiveBox HTTP server"""

    runserver_args = runserver_args or []

    from . import config
    config.SHOW_PROGRESS = False

    if debug:
        # if --debug is passed, patch config.DEBUG to be True for this run
        config.DEBUG = True
    else:
        # force staticfiles to be served when DEBUG=False
        # TODO: do this using nginx or another server instead of django?
        runserver_args.append('--insecure')

    check_data_folder(out_dir=out_dir)
    setup_django(out_dir)

    from django.core.management import call_command
    from django.contrib.auth.models import User

    if IS_TTY and not User.objects.filter(is_superuser=True).exists():
        print('{lightyellow}[!] No admin users exist yet, you will not be able to edit links in the UI.{reset}'.format(**ANSI))
        print()
        print('    To create an admin user, run:')
        print('        archivebox manage createsuperuser')
        print()

    print('{green}[+] Starting ArchiveBox webserver...{reset}'.format(**ANSI))
    if not reload:
        runserver_args.append('--noreload')
    call_command("runserver", *runserver_args)
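
# Illustrative usage sketch: extra args are passed straight through to Django's
# runserver, so binding to another address/port works the usual Django way:
#   server(runserver_args=['0.0.0.0:8000'])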

@enforce_types
def manage(args: Optional[List[str]]=None, out_dir: str=OUTPUT_DIR) -> None:
    """Run an ArchiveBox Django management command"""

    check_data_folder(out_dir=out_dir)
    setup_django(out_dir)
    from django.core.management import execute_from_command_line
    execute_from_command_line([f'{ARCHIVEBOX_BINARY} manage', *(args or ['help'])])


@enforce_types
def shell(out_dir: str=OUTPUT_DIR) -> None:
    """Enter an interactive ArchiveBox Django shell"""

    check_data_folder(out_dir=out_dir)
    setup_django(out_dir)  # use the passed out_dir rather than the global OUTPUT_DIR
    from django.core.management import call_command
    call_command("shell_plus")