__package__ = 'archivebox'

import os
import sys
import shutil

from typing import Dict, List, Optional, Iterable, IO

from crontab import CronTab, CronSlices

from .cli import (
    list_subcommands,
    run_subcommand,
    display_first,
    meta_cmds,
    main_cmds,
    archive_cmds,
)
from .parsers import (
    save_stdin_to_sources,
    save_file_to_sources,
)
from .index.schema import Link
from .util import enforce_types, docstring
from .system import get_dir_size, dedupe_cron_jobs, CRON_COMMENT
from .index import (
    links_after_timestamp,
    load_main_index,
    import_new_links,
    write_main_index,
    link_matches_filter,
    get_indexed_folders,
    get_archived_folders,
    get_unarchived_folders,
    get_present_folders,
    get_valid_folders,
    get_invalid_folders,
    get_duplicate_folders,
    get_orphaned_folders,
    get_corrupted_folders,
    get_unrecognized_folders,
    fix_invalid_folder_locations,
)
from .index.json import (
    parse_json_main_index,
    parse_json_links_details,
)
from .index.sql import (
    parse_sql_main_index,
    get_admins,
    apply_migrations,
)
from .index.html import parse_html_main_index
from .extractors import archive_link
from .config import (
    stderr,
    ConfigDict,
    ANSI,
    IS_TTY,
    USER,
    ARCHIVEBOX_BINARY,
    ONLY_NEW,
    OUTPUT_DIR,
    SOURCES_DIR,
    ARCHIVE_DIR,
    LOGS_DIR,
    CONFIG_FILE,
    ARCHIVE_DIR_NAME,
    SOURCES_DIR_NAME,
    LOGS_DIR_NAME,
    STATIC_DIR_NAME,
    JSON_INDEX_FILENAME,
    HTML_INDEX_FILENAME,
    SQL_INDEX_FILENAME,
    ROBOTS_TXT_FILENAME,
    FAVICON_FILENAME,
    check_dependencies,
    check_data_folder,
    write_config_file,
    setup_django,
    VERSION,
    CODE_LOCATIONS,
    EXTERNAL_LOCATIONS,
    DATA_LOCATIONS,
    DEPENDENCIES,
    load_all_config,
    CONFIG,
    USER_CONFIG,
    get_real_name,
)
from .cli.logging import (
    TimedProgress,
    log_archiving_started,
    log_archiving_paused,
    log_archiving_finished,
    log_removal_started,
    log_removal_finished,
    log_list_started,
    log_list_finished,
    printable_config,
    printable_folders,
    printable_filesize,
    printable_folder_status,
    printable_dependency_version,
)

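# Entries that may already exist in OUTPUT_DIR without the folder counting as
# "non-empty" when init() decides whether it is safe to create a collection here.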
ALLOWED_IN_OUTPUT_DIR = {
    '.DS_Store',
    '.venv',
    'venv',
    'virtualenv',
    '.virtualenv',
    ARCHIVE_DIR_NAME,
    SOURCES_DIR_NAME,
    LOGS_DIR_NAME,
    STATIC_DIR_NAME,
    SQL_INDEX_FILENAME,
    JSON_INDEX_FILENAME,
    HTML_INDEX_FILENAME,
    ROBOTS_TXT_FILENAME,
    FAVICON_FILENAME,
}

@enforce_types
def help(out_dir: str=OUTPUT_DIR) -> None:
    """Print the ArchiveBox help message and usage"""

    all_subcommands = list_subcommands()
    COMMANDS_HELP_TEXT = '\n    '.join(
        f'{cmd.ljust(20)} {summary}'
        for cmd, summary in all_subcommands.items()
        if cmd in meta_cmds
    ) + '\n\n    ' + '\n    '.join(
        f'{cmd.ljust(20)} {summary}'
        for cmd, summary in all_subcommands.items()
        if cmd in main_cmds
    ) + '\n\n    ' + '\n    '.join(
        f'{cmd.ljust(20)} {summary}'
        for cmd, summary in all_subcommands.items()
        if cmd in archive_cmds
    ) + '\n\n    ' + '\n    '.join(
        f'{cmd.ljust(20)} {summary}'
        for cmd, summary in all_subcommands.items()
        if cmd not in display_first
    )

    if os.path.exists(os.path.join(out_dir, JSON_INDEX_FILENAME)):
        print('''{green}ArchiveBox v{}: The self-hosted internet archive.{reset}

{lightred}Active data directory:{reset}
    {}

{lightred}Usage:{reset}
    archivebox [command] [--help] [--version] [...args]

{lightred}Commands:{reset}
    {}

{lightred}Example Use:{reset}
    mkdir my-archive; cd my-archive/
    archivebox init
    archivebox info
    archivebox add https://example.com/some/page
    archivebox add --depth=1 ~/Downloads/bookmarks_export.html
    archivebox list --sort=timestamp --csv=timestamp,url,is_archived
    archivebox schedule --every=week https://example.com/some/feed.rss
    archivebox update --resume=15109948213.123

{lightred}Documentation:{reset}
    https://github.com/pirate/ArchiveBox/wiki
'''.format(VERSION, out_dir, COMMANDS_HELP_TEXT, **ANSI))
    else:
        print('{green}Welcome to ArchiveBox v{}!{reset}'.format(VERSION, **ANSI))
        print()
        print('To import an existing archive (from a previous version of ArchiveBox):')
        print('    1. cd into your data dir OUTPUT_DIR (usually ArchiveBox/output) and run:')
        print('    2. archivebox init')
        print()
        print('To start a new archive:')
        print('    1. Create an empty directory, then cd into it and run:')
        print('    2. archivebox init')
        print()
        print('For more information, see the documentation here:')
        print('    https://github.com/pirate/ArchiveBox/wiki')

@enforce_types
def version(quiet: bool=False,
            out_dir: str=OUTPUT_DIR) -> None:
    """Print the ArchiveBox version and dependency information"""

    if quiet:
        print(VERSION)
    else:
        print('ArchiveBox v{}'.format(VERSION))
        print()

        print('{white}[i] Dependency versions:{reset}'.format(**ANSI))
        for name, dependency in DEPENDENCIES.items():
            print(printable_dependency_version(name, dependency))

        print()
        print('{white}[i] Code locations:{reset}'.format(**ANSI))
        for name, folder in CODE_LOCATIONS.items():
            print(printable_folder_status(name, folder))

        print()
        print('{white}[i] External locations:{reset}'.format(**ANSI))
        for name, folder in EXTERNAL_LOCATIONS.items():
            print(printable_folder_status(name, folder))

        print()
        print('{white}[i] Data locations:{reset}'.format(**ANSI))
        for name, folder in DATA_LOCATIONS.items():
            print(printable_folder_status(name, folder))

        print()
        check_dependencies()

@enforce_types
def run(subcommand: str,
        subcommand_args: Optional[List[str]],
        stdin: Optional[IO]=None,
        out_dir: str=OUTPUT_DIR) -> None:
    """Run a given ArchiveBox subcommand with the given list of args"""

    run_subcommand(
        subcommand=subcommand,
        subcommand_args=subcommand_args,
        stdin=stdin,
        pwd=out_dir,
    )

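# init is safe to re-run on an existing collection: it verifies the folder
# structure, applies any pending migrations, and pulls orphaned links found in
# the JSON/SQL indexes and per-link archive folders back into the main index.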
@enforce_types
def init(force: bool=False, out_dir: str=OUTPUT_DIR) -> None:
    """Initialize a new ArchiveBox collection in the current directory"""

    os.makedirs(out_dir, exist_ok=True)
    is_empty = not len(set(os.listdir(out_dir)) - ALLOWED_IN_OUTPUT_DIR)
    existing_index = os.path.exists(os.path.join(out_dir, JSON_INDEX_FILENAME))

    if is_empty and not existing_index:
        print('{green}[+] Initializing a new ArchiveBox collection in this folder...{reset}'.format(**ANSI))
        print(f'    {out_dir}')
        print('{green}------------------------------------------------------------------{reset}'.format(**ANSI))
    elif existing_index:
        print('{green}[*] Updating existing ArchiveBox collection in this folder...{reset}'.format(**ANSI))
        print(f'    {out_dir}')
        print('{green}------------------------------------------------------------------{reset}'.format(**ANSI))
    else:
        if force:
            stderr('[!] This folder appears to already have files in it, but no index.json is present.', color='lightyellow')
            stderr('    Because --force was passed, ArchiveBox will initialize anyway (which may overwrite existing files).')
        else:
            stderr(
                ("{red}[X] This folder appears to already have files in it, but no index.json is present.{reset}\n\n"
                 "    You must run init in a completely empty directory, or an existing data folder.\n\n"
                 "    {lightred}Hint:{reset} To import an existing data folder, make sure to cd into the folder first,\n"
                 "    then run 'archivebox init' to pick up where you left off.\n\n"
                 "    (Always make sure your data folder is backed up first before updating ArchiveBox)"
                ).format(out_dir, **ANSI)
            )
            raise SystemExit(2)

    if existing_index:
        print('\n{green}[*] Verifying archive folder structure...{reset}'.format(**ANSI))
    else:
        print('\n{green}[+] Building archive folder structure...{reset}'.format(**ANSI))

    os.makedirs(SOURCES_DIR, exist_ok=True)
    print(f'    √ {SOURCES_DIR}')

    os.makedirs(ARCHIVE_DIR, exist_ok=True)
    print(f'    √ {ARCHIVE_DIR}')

    os.makedirs(LOGS_DIR, exist_ok=True)
    print(f'    √ {LOGS_DIR}')

    write_config_file({}, out_dir=out_dir)
    print(f'    √ {CONFIG_FILE}')

    if os.path.exists(os.path.join(out_dir, SQL_INDEX_FILENAME)):
        print('\n{green}[*] Verifying main SQL index and running migrations...{reset}'.format(**ANSI))
    else:
        print('\n{green}[+] Building main SQL index and running migrations...{reset}'.format(**ANSI))

    setup_django(out_dir, check_db=False)
    from django.conf import settings
    DATABASE_FILE = os.path.join(out_dir, SQL_INDEX_FILENAME)
    print(f'    √ {DATABASE_FILE}')
    print()
    for migration_line in apply_migrations(out_dir):
        print(f'    {migration_line}')

    assert os.path.exists(DATABASE_FILE)

    # from django.contrib.auth.models import User
    # if IS_TTY and not User.objects.filter(is_superuser=True).exists():
    #     print('{green}[+] Creating admin user account...{reset}'.format(**ANSI))
    #     call_command("createsuperuser", interactive=True)

    print()
    print('{green}[*] Collecting links from any existing indexes and archive folders...{reset}'.format(**ANSI))

    all_links: Dict[str, Link] = {}
    if existing_index:
        all_links = {
            link.url: link
            for link in load_main_index(out_dir=out_dir, warn=False)
        }
        print('    √ Loaded {} links from existing main index.'.format(len(all_links)))

    # Links in data folders that don't match their timestamp
    fixed, cant_fix = fix_invalid_folder_locations(out_dir=out_dir)
    if fixed:
        print('    {lightyellow}√ Fixed {} data directory locations that didn\'t match their link timestamps.{reset}'.format(len(fixed), **ANSI))
    if cant_fix:
        print('    {lightyellow}! Could not fix {} data directory locations due to conflicts with existing folders.{reset}'.format(len(cant_fix), **ANSI))

    # Links in JSON index but not in main index
    orphaned_json_links = {
        link.url: link
        for link in parse_json_main_index(out_dir)
        if link.url not in all_links
    }
    if orphaned_json_links:
        all_links.update(orphaned_json_links)
        print('    {lightyellow}√ Added {} orphaned links from existing JSON index...{reset}'.format(len(orphaned_json_links), **ANSI))

    # Links in SQL index but not in main index
    orphaned_sql_links = {
        link.url: link
        for link in parse_sql_main_index(out_dir)
        if link.url not in all_links
    }
    if orphaned_sql_links:
        all_links.update(orphaned_sql_links)
        print('    {lightyellow}√ Added {} orphaned links from existing SQL index...{reset}'.format(len(orphaned_sql_links), **ANSI))

    # Links in data dir indexes but not in main index
    orphaned_data_dir_links = {
        link.url: link
        for link in parse_json_links_details(out_dir)
        if link.url not in all_links
    }
    if orphaned_data_dir_links:
        all_links.update(orphaned_data_dir_links)
        print('    {lightyellow}√ Added {} orphaned links from existing archive directories.{reset}'.format(len(orphaned_data_dir_links), **ANSI))

    # Links in invalid/duplicate data dirs
    invalid_folders = {
        folder: link
        for folder, link in get_invalid_folders(all_links.values(), out_dir=out_dir).items()
    }
    if invalid_folders:
        print('    {lightyellow}! Skipped adding {} invalid link data directories.{reset}'.format(len(invalid_folders), **ANSI))
        print('        X ' + '\n        X '.join(f'{folder} {link}' for folder, link in invalid_folders.items()))
        print()
        print('        {lightred}Hint:{reset} For more information about the link data directories that were skipped, run:'.format(**ANSI))
        print('            archivebox info')
        print('            archivebox list --status=invalid')

    write_main_index(list(all_links.values()), out_dir=out_dir)

    print('\n{green}------------------------------------------------------------------{reset}'.format(**ANSI))
    if existing_index:
        print('{green}[√] Done. Verified and updated the existing ArchiveBox collection.{reset}'.format(**ANSI))
    else:
        print('{green}[√] Done. A new ArchiveBox collection was initialized ({} links).{reset}'.format(len(all_links), **ANSI))
    print()
    print('    To view your archive index, open:')
    print('        {}'.format(os.path.join(out_dir, HTML_INDEX_FILENAME)))
    print()
    print('    To add new links, you can run:')
    print("        archivebox add 'https://example.com'")
    print()
    print('    For more usage and examples, run:')
    print('        archivebox help')

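# info only reads the collection: it scans the index files and archive folders
# to report counts and sizes, but never modifies the index or archive data.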
@enforce_types
def info(out_dir: str=OUTPUT_DIR) -> None:
    """Print out some info and statistics about the archive collection"""

    check_data_folder(out_dir=out_dir)

    print('{green}[*] Scanning archive collection main index...{reset}'.format(**ANSI))
    print(f'    {out_dir}/*')
    num_bytes, num_dirs, num_files = get_dir_size(out_dir, recursive=False, pattern='index.')
    size = printable_filesize(num_bytes)
    print(f'    Size: {size} across {num_files} files')
    print()

    links = list(load_main_index(out_dir=out_dir))
    num_json_links = len(links)
    num_sql_links = sum(1 for link in parse_sql_main_index(out_dir=out_dir))
    num_html_links = sum(1 for url in parse_html_main_index(out_dir=out_dir))
    num_link_details = sum(1 for link in parse_json_links_details(out_dir=out_dir))
    users = get_admins().values_list('username', flat=True)
    print(f'    > JSON Main Index: {num_json_links} links'.ljust(36), f'(found in {JSON_INDEX_FILENAME})')
    print(f'    > SQL Main Index: {num_sql_links} links'.ljust(36), f'(found in {SQL_INDEX_FILENAME})')
    print(f'    > HTML Main Index: {num_html_links} links'.ljust(36), f'(found in {HTML_INDEX_FILENAME})')
    print(f'    > JSON Link Details: {num_link_details} links'.ljust(36), f'(found in {ARCHIVE_DIR_NAME}/*/index.json)')
    print(f'    > Admin: {len(users)} users {", ".join(users)}'.ljust(36), f'(found in {SQL_INDEX_FILENAME})')

    if num_html_links != len(links) or num_sql_links != len(links):
        print()
        print('    {lightred}Hint:{reset} You can fix index count differences automatically by running:'.format(**ANSI))
        print('        archivebox init')

    if not users:
        print()
        print('    {lightred}Hint:{reset} You can create an admin user by running:'.format(**ANSI))
        print('        archivebox manage createsuperuser')

    print()
    print('{green}[*] Scanning archive collection link data directories...{reset}'.format(**ANSI))
    print(f'    {ARCHIVE_DIR}/*')

    num_bytes, num_dirs, num_files = get_dir_size(ARCHIVE_DIR)
    size = printable_filesize(num_bytes)
    print(f'    Size: {size} across {num_files} files in {num_dirs} directories')
    print()

    num_indexed = len(get_indexed_folders(links, out_dir=out_dir))
    num_archived = len(get_archived_folders(links, out_dir=out_dir))
    num_unarchived = len(get_unarchived_folders(links, out_dir=out_dir))
    print(f'    > indexed: {num_indexed}'.ljust(36), f'({get_indexed_folders.__doc__})')
    print(f'    > archived: {num_archived}'.ljust(36), f'({get_archived_folders.__doc__})')
    print(f'    > unarchived: {num_unarchived}'.ljust(36), f'({get_unarchived_folders.__doc__})')

    num_present = len(get_present_folders(links, out_dir=out_dir))
    num_valid = len(get_valid_folders(links, out_dir=out_dir))
    print()
    print(f'    > present: {num_present}'.ljust(36), f'({get_present_folders.__doc__})')
    print(f'    > valid: {num_valid}'.ljust(36), f'({get_valid_folders.__doc__})')

    duplicate = get_duplicate_folders(links, out_dir=out_dir)
    orphaned = get_orphaned_folders(links, out_dir=out_dir)
    corrupted = get_corrupted_folders(links, out_dir=out_dir)
    unrecognized = get_unrecognized_folders(links, out_dir=out_dir)
    num_invalid = len({**duplicate, **orphaned, **corrupted, **unrecognized})
    print(f'    > invalid: {num_invalid}'.ljust(36), f'({get_invalid_folders.__doc__})')
    print(f'      > duplicate: {len(duplicate)}'.ljust(36), f'({get_duplicate_folders.__doc__})')
    print(f'      > orphaned: {len(orphaned)}'.ljust(36), f'({get_orphaned_folders.__doc__})')
    print(f'      > corrupted: {len(corrupted)}'.ljust(36), f'({get_corrupted_folders.__doc__})')
    print(f'      > unrecognized: {len(unrecognized)}'.ljust(36), f'({get_unrecognized_folders.__doc__})')

    if num_indexed:
        print()
        print('    {lightred}Hint:{reset} You can list link data directories by status like so:'.format(**ANSI))
        print('        archivebox list --status=<status>  (e.g. indexed, corrupted, archived, etc.)')

    if orphaned:
        print()
        print('    {lightred}Hint:{reset} To automatically import orphaned data directories into the main index, run:'.format(**ANSI))
        print('        archivebox init')

    if num_invalid:
        print()
        print('    {lightred}Hint:{reset} You may need to manually remove or fix some invalid data directories, afterwards make sure to run:'.format(**ANSI))
        print('        archivebox init')

    print()

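# Example usage (assuming a valid collection in the current directory):
#   archivebox add 'https://example.com/some/page'
#   archivebox add ~/Downloads/bookmarks_export.html
#   echo 'https://example.com' | archivebox add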
@enforce_types
def add(import_str: Optional[str]=None,
        import_path: Optional[str]=None,
        update_all: bool=not ONLY_NEW,
        index_only: bool=False,
        out_dir: str=OUTPUT_DIR) -> List[Link]:
    """Add a new URL or list of URLs to your archive"""

    check_data_folder(out_dir=out_dir)

    if (import_str and import_path) or (not import_str and not import_path):
        stderr(
            '[X] You should pass either an import path as an argument, '
            'or pass a list of links via stdin, but not both.\n',
            color='red',
        )
        raise SystemExit(2)
    elif import_str:
        import_path = save_stdin_to_sources(import_str, out_dir=out_dir)
    elif import_path:
        import_path = save_file_to_sources(import_path, out_dir=out_dir)

    check_dependencies()

    # Step 1: Load list of links from the existing index
    #         merge in and dedupe new links from import_path
    all_links: List[Link] = []
    new_links: List[Link] = []
    all_links = load_main_index(out_dir=out_dir)
    if import_path:
        all_links, new_links = import_new_links(all_links, import_path, out_dir=out_dir)

    # Step 2: Write updated index with deduped old and new links back to disk
    write_main_index(links=all_links, out_dir=out_dir)

    if index_only:
        return all_links

    # Step 3: Run the archive methods for each link
    links = all_links if update_all else new_links
    log_archiving_started(len(links))
    idx: int = 0
    link: Link = None  # type: ignore
    try:
        for idx, link in enumerate(links):
            archive_link(link, out_dir=link.link_dir)
    except KeyboardInterrupt:
        log_archiving_paused(len(links), idx, link.timestamp if link else '0')
        raise SystemExit(0)
    except:
        print()
        raise

    log_archiving_finished(len(links))

    # Step 4: Re-write links index with updated titles, icons, and resources
    all_links = load_main_index(out_dir=out_dir)
    write_main_index(links=list(all_links), out_dir=out_dir, finished=True)
    return all_links

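# Example usage (the regex below matches every URL in the index):
#   archivebox remove --filter-type=regex '.*'
# Pass --delete to also remove the matching links' data directories from disk.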
@enforce_types
def remove(filter_str: Optional[str]=None,
           filter_patterns: Optional[List[str]]=None,
           filter_type: str='exact',
           after: Optional[float]=None,
           before: Optional[float]=None,
           yes: bool=False,
           delete: bool=False,
           out_dir: str=OUTPUT_DIR) -> List[Link]:
    """Remove the specified URLs from the archive"""

    check_data_folder(out_dir=out_dir)

    if filter_str and filter_patterns:
        stderr(
            '[X] You should pass either a pattern as an argument, '
            'or pass a list of patterns via stdin, but not both.\n',
            color='red',
        )
        raise SystemExit(2)
    elif not (filter_str or filter_patterns):
        stderr(
            '[X] You should pass either a pattern as an argument, '
            'or pass a list of patterns via stdin.',
            color='red',
        )
        stderr()
        stderr('    {lightred}Hint:{reset} To remove all URLs you can run:'.format(**ANSI))
        stderr("        archivebox remove --filter-type=regex '.*'")
        stderr()
        raise SystemExit(2)
    elif filter_str:
        filter_patterns = [ptn.strip() for ptn in filter_str.split('\n')]

    log_list_started(filter_patterns, filter_type)
    timer = TimedProgress(360, prefix='      ')
    try:
        links = list(list_links(
            filter_patterns=filter_patterns,
            filter_type=filter_type,
            after=after,
            before=before,
        ))
    finally:
        timer.end()

    if not len(links):
        log_removal_finished(0, 0)
        raise SystemExit(1)

    log_list_finished(links)
    log_removal_started(links, yes=yes, delete=delete)

    timer = TimedProgress(360, prefix='      ')
    try:
        to_keep = []
        all_links = load_main_index(out_dir=out_dir)
        for link in all_links:
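            # A link is removed if it falls outside the requested time range
            # or matches one of the filter patterns; everything else is kept.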
            should_remove = (
                (after is not None and float(link.timestamp) < after)
                or (before is not None and float(link.timestamp) > before)
                or link_matches_filter(link, filter_patterns, filter_type)
            )
            if not should_remove:
                to_keep.append(link)
            elif should_remove and delete:
                shutil.rmtree(link.link_dir, ignore_errors=True)
    finally:
        timer.end()

    write_main_index(links=to_keep, out_dir=out_dir, finished=True)
    log_removal_finished(len(all_links), len(to_keep))

    return to_keep

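# Example usage (resume timestamp taken from the help text above):
#   archivebox update --resume=15109948213.123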
@enforce_types
def update(resume: Optional[float]=None,
           only_new: bool=ONLY_NEW,
           index_only: bool=False,
           overwrite: bool=False,
           filter_patterns_str: Optional[str]=None,
           filter_patterns: Optional[List[str]]=None,
           filter_type: Optional[str]=None,
           status: Optional[str]=None,
           after: Optional[str]=None,
           before: Optional[str]=None,
           out_dir: str=OUTPUT_DIR) -> List[Link]:
    """Import any new links from subscriptions and retry any previously failed/skipped links"""

    check_dependencies()
    check_data_folder(out_dir=out_dir)

    # Step 1: Load list of links from the existing index
    #         merge in and dedupe new links from import_path
    all_links: List[Link] = []
    new_links: List[Link] = []
    all_links = load_main_index(out_dir=out_dir)

    # Step 2: Write updated index with deduped old and new links back to disk
    write_main_index(links=list(all_links), out_dir=out_dir)

    # Step 3: Filter for selected_links
    matching_links = list_links(
        filter_patterns=filter_patterns,
        filter_type=filter_type,
        before=before,
        after=after,
    )
    matching_folders = list_folders(
        links=list(matching_links),
        status=status,
        out_dir=out_dir,
    )
    all_links = [link for link in matching_folders.values() if link]

    if index_only:
        return all_links

    # Step 4: Run the archive methods for each link
    links = new_links if only_new else all_links
    log_archiving_started(len(links), resume)
    idx: int = 0
    link: Link = None  # type: ignore
    try:
        for idx, link in enumerate(links_after_timestamp(links, resume)):
            archive_link(link, overwrite=overwrite, out_dir=link.link_dir)
    except KeyboardInterrupt:
        log_archiving_paused(len(links), idx, link.timestamp if link else '0')
        raise SystemExit(0)
    except:
        print()
        raise

    log_archiving_finished(len(links))

    # Step 5: Re-write links index with updated titles, icons, and resources
    all_links = load_main_index(out_dir=out_dir)
    write_main_index(links=list(all_links), out_dir=out_dir, finished=True)
    return all_links

@enforce_types
def list_all(filter_patterns_str: Optional[str]=None,
             filter_patterns: Optional[List[str]]=None,
             filter_type: str='exact',
             status: Optional[str]=None,
             after: Optional[float]=None,
             before: Optional[float]=None,
             sort: Optional[str]=None,
             csv: Optional[str]=None,
             json: bool=False,
             out_dir: str=OUTPUT_DIR) -> Iterable[Link]:
    """List, filter, and export information about archive entries"""

    check_data_folder(out_dir=out_dir)

    if filter_patterns and filter_patterns_str:
        stderr(
            '[X] You should either pass filter patterns as arguments '
            'or via stdin, but not both.\n',
            color='red',
        )
        raise SystemExit(2)
    elif filter_patterns_str:
        filter_patterns = filter_patterns_str.split('\n')

    links = list_links(
        filter_patterns=filter_patterns,
        filter_type=filter_type,
        before=before,
        after=after,
    )

    if sort:
        links = sorted(links, key=lambda link: getattr(link, sort))

    folders = list_folders(
        links=list(links),
        status=status,
        out_dir=out_dir,
    )

    print(printable_folders(folders, json=json, csv=csv))
    return folders

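# Example usage (CSV columns are Link attribute names):
#   archivebox list --sort=timestamp --csv=timestamp,url,is_archived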
@enforce_types
def list_links(filter_patterns: Optional[List[str]]=None,
               filter_type: str='exact',
               after: Optional[float]=None,
               before: Optional[float]=None,
               out_dir: str=OUTPUT_DIR) -> Iterable[Link]:

    check_data_folder(out_dir=out_dir)

    all_links = load_main_index(out_dir=out_dir)

    for link in all_links:
        if after is not None and float(link.timestamp) < after:
            continue
        if before is not None and float(link.timestamp) > before:
            continue

        if filter_patterns:
            if link_matches_filter(link, filter_patterns, filter_type):
                yield link
        else:
            yield link

@enforce_types
def list_folders(links: List[Link],
                 status: str,
                 out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:

    check_data_folder(out_dir=out_dir)

    if status == 'indexed':
        return get_indexed_folders(links, out_dir=out_dir)
    elif status == 'archived':
        return get_archived_folders(links, out_dir=out_dir)
    elif status == 'unarchived':
        return get_unarchived_folders(links, out_dir=out_dir)
    elif status == 'present':
        return get_present_folders(links, out_dir=out_dir)
    elif status == 'valid':
        return get_valid_folders(links, out_dir=out_dir)
    elif status == 'invalid':
        return get_invalid_folders(links, out_dir=out_dir)
    elif status == 'duplicate':
        return get_duplicate_folders(links, out_dir=out_dir)
    elif status == 'orphaned':
        return get_orphaned_folders(links, out_dir=out_dir)
    elif status == 'corrupted':
        return get_corrupted_folders(links, out_dir=out_dir)
    elif status == 'unrecognized':
        return get_unrecognized_folders(links, out_dir=out_dir)

    raise ValueError('Status not recognized.')

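# Example usage (SOME_KEY stands in for any real config key, e.g. ONLY_NEW):
#   archivebox config                            # print the whole config
#   archivebox config --get SOME_KEY
#   archivebox config --set SOME_KEY=SOME_VALUE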
@enforce_types
def config(config_options_str: Optional[str]=None,
           config_options: Optional[List[str]]=None,
           get: bool=False,
           set: bool=False,
           reset: bool=False,
           out_dir: str=OUTPUT_DIR) -> None:
    """Get and set your ArchiveBox project configuration values"""

    check_data_folder(out_dir=out_dir)

    if config_options and config_options_str:
        stderr(
            '[X] You should either pass config values as arguments '
            'or via stdin, but not both.\n',
            color='red',
        )
        raise SystemExit(2)
    elif config_options_str:
        config_options = config_options_str.split('\n')

    config_options = config_options or []

    no_args = not (get or set or reset or config_options)

    matching_config: ConfigDict = {}
    if get or no_args:
        if config_options:
            config_options = [get_real_name(key) for key in config_options]
            matching_config = {key: CONFIG[key] for key in config_options if key in CONFIG}
            failed_config = [key for key in config_options if key not in CONFIG]
            if failed_config:
                stderr()
                stderr('[X] These options failed to get:', color='red')
                stderr('    {}'.format('\n    '.join(failed_config)))
                raise SystemExit(1)
        else:
            matching_config = CONFIG

        print(printable_config(matching_config))
        raise SystemExit(not matching_config)
    elif set:
        new_config = {}
        failed_options = []
        for line in config_options:
            if line.startswith('#') or not line.strip():
                continue
            if '=' not in line:
                stderr('[X] Config KEY=VALUE must have an = sign in it', color='red')
                stderr(f'    {line}')
                raise SystemExit(2)

            raw_key, val = line.split('=', 1)
            raw_key = raw_key.upper().strip()
            key = get_real_name(raw_key)
            if key != raw_key:
                stderr(f'[i] Note: The config option {raw_key} has been renamed to {key}, please use the new name going forwards.', color='lightyellow')

            if key in CONFIG:
                new_config[key] = val.strip()
            else:
                failed_options.append(line)

        if new_config:
            before = CONFIG
            matching_config = write_config_file(new_config, out_dir=OUTPUT_DIR)
            after = load_all_config()
            print(printable_config(matching_config))

            side_effect_changes: ConfigDict = {}
            for key, val in after.items():
                if key in USER_CONFIG and (before[key] != after[key]) and (key not in matching_config):
                    side_effect_changes[key] = after[key]

            if side_effect_changes:
                stderr()
                stderr('[i] Note: This change also affected these other options that depended on it:', color='lightyellow')
                print('    {}'.format(printable_config(side_effect_changes, prefix='    ')))
        if failed_options:
            stderr()
            stderr('[X] These options failed to set:', color='red')
            stderr('    {}'.format('\n    '.join(failed_options)))
        raise SystemExit(bool(failed_options))
    elif reset:
        stderr('[X] This command is not implemented yet.', color='red')
        stderr('    Please manually remove the relevant lines from your config file:')
        stderr(f'    {CONFIG_FILE}')
        raise SystemExit(2)
    else:
        stderr('[X] You must pass either --get or --set, or no arguments to get the whole config.', color='red')
        stderr('    archivebox config')
        stderr('    archivebox config --get SOME_KEY')
        stderr('    archivebox config --set SOME_KEY=SOME_VALUE')
        raise SystemExit(2)

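# Example usage (hypothetical paths shown): running
#   archivebox schedule --every=day https://example.com/some/rss/feed.xml
# adds a crontab entry for the current user roughly like:
#   0 0 * * * cd "/path/to/archive" && "/usr/bin/archivebox" add "https://example.com/some/rss/feed.xml" > "/path/to/archive/logs/archivebox.log" 2>&1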
@enforce_types
def schedule(add: bool=False,
             show: bool=False,
             clear: bool=False,
             foreground: bool=False,
             run_all: bool=False,
             quiet: bool=False,
             every: Optional[str]=None,
             import_path: Optional[str]=None,
             out_dir: str=OUTPUT_DIR):
    """Set ArchiveBox to regularly import URLs at specific times using cron"""

    check_data_folder(out_dir=out_dir)

    os.makedirs(os.path.join(out_dir, LOGS_DIR_NAME), exist_ok=True)

    cron = CronTab(user=True)
    cron = dedupe_cron_jobs(cron)

    existing_jobs = list(cron.find_comment(CRON_COMMENT))
    if foreground or run_all:
        if import_path or (not existing_jobs):
            stderr('{red}[X] You must schedule some jobs first before running in foreground mode.{reset}'.format(**ANSI))
            stderr('    archivebox schedule --every=hour https://example.com/some/rss/feed.xml')
            raise SystemExit(1)
        print('{green}[*] Running {} ArchiveBox jobs in foreground task scheduler...{reset}'.format(len(existing_jobs), **ANSI))
        if run_all:
            try:
                for job in existing_jobs:
                    sys.stdout.write(f'  > {job.command}')
                    sys.stdout.flush()
                    job.run()
                    sys.stdout.write(f'\r  √ {job.command}\n')
            except KeyboardInterrupt:
                print('\n{green}[√] Stopped.{reset}'.format(**ANSI))
                raise SystemExit(1)
        if foreground:
            try:
                for result in cron.run_scheduler():
                    print(result)
            except KeyboardInterrupt:
                print('\n{green}[√] Stopped.{reset}'.format(**ANSI))
                raise SystemExit(1)
    elif show:
        if existing_jobs:
            print('\n'.join(str(cmd) for cmd in existing_jobs))
        else:
            stderr('{red}[X] There are no ArchiveBox cron jobs scheduled for your user ({}).{reset}'.format(USER, **ANSI))
            stderr('    To schedule a new job, run:')
            stderr('        archivebox schedule --every=[timeperiod] https://example.com/some/rss/feed.xml')
        raise SystemExit(0)
    elif clear:
        print(cron.remove_all(comment=CRON_COMMENT))
        cron.write()
        raise SystemExit(0)
    elif every:
        quoted = lambda s: f'"{s}"' if s and ' ' in s else s
        cmd = [
            'cd',
            quoted(out_dir),
            '&&',
            quoted(ARCHIVEBOX_BINARY),
            *(['add', f'"{import_path}"'] if import_path else ['update']),
            '>',
            quoted(os.path.join(LOGS_DIR, 'archivebox.log')),
            '2>&1',
        ]
        new_job = cron.new(command=' '.join(cmd), comment=CRON_COMMENT)

        if every in ('minute', 'hour', 'day', 'week', 'month', 'year'):
            set_every = getattr(new_job.every(), every)
            set_every()
        elif CronSlices.is_valid(every):
            new_job.setall(every)
        else:
            stderr('{red}[X] Got invalid timeperiod for cron task.{reset}'.format(**ANSI))
            stderr('    It must be one of minute/hour/day/week/month/year')
            stderr('    or a quoted cron-format schedule like:')
            stderr('        archivebox schedule --every=day https://example.com/some/rss/feed.xml')
            stderr('        archivebox schedule --every="0/5 * * * *" https://example.com/some/rss/feed.xml')
            raise SystemExit(1)

        cron = dedupe_cron_jobs(cron)
        cron.write()

        total_runs = sum(j.frequency_per_year() for j in cron)
        existing_jobs = list(cron.find_comment(CRON_COMMENT))

        print()
        print('{green}[√] Scheduled new ArchiveBox cron job for user: {} ({} jobs are active).{reset}'.format(USER, len(existing_jobs), **ANSI))
        print('\n'.join(f'  > {cmd}' if str(cmd) == str(new_job) else f'    {cmd}' for cmd in existing_jobs))
        if total_runs > 60 and not quiet:
            stderr()
            stderr('{lightyellow}[!] With the current cron config, ArchiveBox is estimated to run >{} times per year.{reset}'.format(total_runs, **ANSI))
            stderr('    Congrats on being an enthusiastic internet archiver! 👌')
            stderr()
            stderr('    Make sure you have enough storage space available to hold all the data.')
            stderr('    Using a compressed/deduped filesystem like ZFS is recommended if you plan on archiving a lot.')

    raise SystemExit(0)

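# server hands any extra arguments through to Django's runserver, so the usual
# runserver options work, e.g. (hypothetical bind address):
#   archivebox server 0.0.0.0:8000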
@enforce_types
def server(runserver_args: Optional[List[str]]=None,
           reload: bool=False,
           debug: bool=False,
           out_dir: str=OUTPUT_DIR) -> None:
    """Run the ArchiveBox HTTP server"""

    runserver_args = runserver_args or []

    check_data_folder(out_dir=out_dir)

    if debug:
        os.environ['DEBUG'] = 'True'
    else:
        runserver_args.append('--insecure')

    setup_django(out_dir)
    from django.core.management import call_command
    from django.contrib.auth.models import User

    if IS_TTY and not User.objects.filter(is_superuser=True).exists():
        print('{lightyellow}[!] No admin users exist yet, you will not be able to edit links in the UI.{reset}'.format(**ANSI))
        print()
        print('    To create an admin user, run:')
        print('        archivebox manage createsuperuser')
        print()

    print('{green}[+] Starting ArchiveBox webserver...{reset}'.format(**ANSI))
    if not reload:
        runserver_args.append('--noreload')
    call_command("runserver", *runserver_args)

@enforce_types
def manage(args: Optional[List[str]]=None, out_dir: str=OUTPUT_DIR) -> None:
    """Run an ArchiveBox Django management command"""

    check_data_folder(out_dir=out_dir)
    setup_django(out_dir)
    from django.core.management import execute_from_command_line
    execute_from_command_line([f'{ARCHIVEBOX_BINARY} manage', *(args or ['help'])])

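# Example usage (any Django management command works here):
#   archivebox manage createsuperuser
#   archivebox manage migrate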
@enforce_types
def shell(out_dir: str=OUTPUT_DIR) -> None:
    """Enter an interactive ArchiveBox Django shell"""

    check_data_folder(out_dir=out_dir)
    setup_django(out_dir)
    from django.core.management import call_command
    call_command("shell_plus")