# archivebox_add.py
#!/usr/bin/env python3
"""CLI entrypoint for `archivebox add`: queue URLs for archiving via a Crawl."""

__package__ = 'archivebox.cli'
__command__ = 'archivebox add'

import sys
from typing import TYPE_CHECKING

import rich_click as click

from django.utils import timezone
from django.db.models import QuerySet

from archivebox.misc.util import enforce_types, docstring
from archivebox import CONSTANTS
from archivebox.config.common import ARCHIVING_CONFIG
from archivebox.config.permissions import USER, HOSTNAME

if TYPE_CHECKING:
    # Imported only for the QuerySet['Snapshot'] annotation below;
    # deferring the real import avoids touching Django ORM setup at import time.
    from archivebox.core.models import Snapshot
  15. @enforce_types
  16. def add(urls: str | list[str],
  17. depth: int | str=0,
  18. tag: str='',
  19. parser: str="auto",
  20. plugins: str="",
  21. persona: str='Default',
  22. overwrite: bool=False,
  23. update: bool=not ARCHIVING_CONFIG.ONLY_NEW,
  24. index_only: bool=False,
  25. bg: bool=False,
  26. created_by_id: int | None=None) -> QuerySet['Snapshot']:
  27. """Add a new URL or list of URLs to your archive.
  28. The flow is:
  29. 1. Save URLs to sources file
  30. 2. Create Crawl with URLs and max_depth
  31. 3. Orchestrator creates Snapshots from Crawl URLs (depth=0)
  32. 4. Orchestrator runs parser extractors on root snapshots
  33. 5. Parser extractors output to urls.jsonl
  34. 6. URLs are added to Crawl.urls and child Snapshots are created
  35. 7. Repeat until max_depth is reached
  36. """
  37. from rich import print
  38. depth = int(depth)
  39. assert depth in (0, 1, 2, 3, 4), 'Depth must be 0-4'
  40. # import models once django is set up
  41. from archivebox.core.models import Snapshot
  42. from archivebox.crawls.models import Crawl
  43. from archivebox.base_models.models import get_or_create_system_user_pk
  44. from archivebox.workers.orchestrator import Orchestrator
  45. created_by_id = created_by_id or get_or_create_system_user_pk()
  46. # 1. Save the provided URLs to sources/2024-11-05__23-59-59__cli_add.txt
  47. sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__cli_add.txt'
  48. sources_file.parent.mkdir(parents=True, exist_ok=True)
  49. sources_file.write_text(urls if isinstance(urls, str) else '\n'.join(urls))
  50. # 2. Create a new Crawl with inline URLs
  51. cli_args = [*sys.argv]
  52. if cli_args[0].lower().endswith('archivebox'):
  53. cli_args[0] = 'archivebox'
  54. cmd_str = ' '.join(cli_args)
  55. timestamp = timezone.now().strftime("%Y-%m-%d__%H-%M-%S")
  56. # Read URLs directly into crawl
  57. urls_content = sources_file.read_text()
  58. crawl = Crawl.objects.create(
  59. urls=urls_content,
  60. max_depth=depth,
  61. tags_str=tag,
  62. label=f'{USER}@{HOSTNAME} $ {cmd_str} [{timestamp}]',
  63. created_by_id=created_by_id,
  64. config={
  65. 'ONLY_NEW': not update,
  66. 'INDEX_ONLY': index_only,
  67. 'OVERWRITE': overwrite,
  68. 'PLUGINS': plugins,
  69. 'DEFAULT_PERSONA': persona or 'Default',
  70. 'PARSER': parser,
  71. }
  72. )
  73. print(f'[green]\\[+] Created Crawl {crawl.id} with max_depth={depth}[/green]')
  74. first_url = crawl.get_urls_list()[0] if crawl.get_urls_list() else ''
  75. print(f' [dim]First URL: {first_url}[/dim]')
  76. # 3. The CrawlMachine will create the root Snapshot when started
  77. # If URLs are from a file: first URL = file:///path/to/sources/...txt
  78. # Parser extractors will run on it and discover more URLs
  79. # Those URLs become child Snapshots (depth=1)
  80. if index_only:
  81. # Just create the crawl but don't start processing
  82. print('[yellow]\\[*] Index-only mode - crawl created but not started[/yellow]')
  83. # Create snapshots for all URLs in the crawl
  84. for url in crawl.get_urls_list():
  85. Snapshot.objects.update_or_create(
  86. crawl=crawl, url=url,
  87. defaults={
  88. 'status': Snapshot.INITIAL_STATE,
  89. 'retry_at': timezone.now(),
  90. 'timestamp': str(timezone.now().timestamp()),
  91. 'depth': 0,
  92. },
  93. )
  94. return crawl.snapshot_set.all()
  95. # 5. Start the orchestrator to process the queue
  96. # The orchestrator will:
  97. # - Process Crawl -> create root Snapshot
  98. # - Process root Snapshot -> run parser extractors -> discover URLs
  99. # - Create child Snapshots from discovered URLs
  100. # - Process child Snapshots -> run extractors
  101. # - Repeat until max_depth reached
  102. if bg:
  103. # Background mode: just queue work and return (orchestrator via server will pick it up)
  104. print('[yellow]\\[*] URLs queued. Orchestrator will process them (run `archivebox server` if not already running).[/yellow]')
  105. else:
  106. # Foreground mode: run orchestrator inline until all work is done
  107. print(f'[green]\\[*] Starting orchestrator to process crawl...[/green]')
  108. orchestrator = Orchestrator(exit_on_idle=True)
  109. orchestrator.runloop() # Block until complete
  110. # 6. Return the list of Snapshots in this crawl
  111. return crawl.snapshot_set.all()
@click.command()
@click.option('--depth', '-d', type=click.Choice([str(i) for i in range(5)]), default='0', help='Recursively archive linked pages up to N hops away')
@click.option('--tag', '-t', default='', help='Comma-separated list of tags to add to each snapshot e.g. tag1,tag2,tag3')
@click.option('--parser', default='auto', help='Parser for reading input URLs (auto, txt, html, rss, json, jsonl, netscape, ...)')
@click.option('--plugins', '-p', default='', help='Comma-separated list of plugins to run e.g. title,favicon,screenshot,singlefile,...')
@click.option('--persona', default='Default', help='Authentication profile to use when archiving')
@click.option('--overwrite', '-F', is_flag=True, help='Overwrite existing data if URLs have been archived previously')
# NOTE(review): this default is ARCHIVING_CONFIG.ONLY_NEW, but add()'s own default for
# `update` is `not ARCHIVING_CONFIG.ONLY_NEW` — looks inverted; confirm intended behavior.
@click.option('--update', is_flag=True, default=ARCHIVING_CONFIG.ONLY_NEW, help='Retry any previously skipped/failed URLs when re-adding them')
@click.option('--index-only', is_flag=True, help='Just add the URLs to the index without archiving them now')
@click.option('--bg', is_flag=True, help='Run archiving in background (start orchestrator and return immediately)')
@click.argument('urls', nargs=-1, type=click.Path())
@docstring(add.__doc__)
def main(**kwargs):
    """Add a new URL or list of URLs to your archive"""
    # NOTE(review): click passes `urls` as a tuple (nargs=-1) while add() is annotated
    # `str | list[str]` and is wrapped in @enforce_types — verify a tuple is accepted.
    add(**kwargs)


if __name__ == '__main__':
    main()