archivebox_add.py 9.7 KB

#!/usr/bin/env python3

__package__ = 'archivebox.cli'
__command__ = 'archivebox add'

import sys

from typing import TYPE_CHECKING

import rich_click as click

from django.utils import timezone
from django.db.models import QuerySet

from archivebox.misc.util import enforce_types, docstring
from archivebox import CONSTANTS
from archivebox.config.common import ARCHIVING_CONFIG
from archivebox.config.permissions import USER, HOSTNAME
from archivebox.parsers import PARSERS

if TYPE_CHECKING:
    from core.models import Snapshot


ORCHESTRATOR = None

@enforce_types
def add(urls: str | list[str],
        depth: int | str=0,
        tag: str='',
        parser: str="auto",
        extract: str="",
        persona: str='Default',
        overwrite: bool=False,
        update: bool=not ARCHIVING_CONFIG.ONLY_NEW,
        index_only: bool=False,
        bg: bool=False,
        created_by_id: int | None=None) -> QuerySet['Snapshot']:
    """Add a new URL or list of URLs to your archive"""

    global ORCHESTRATOR

    depth = int(depth)
    assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)'
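    # depth=0 archives only the URLs passed in; depth=1 also archives each page they link to (one extra hop)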

    # import models once django is set up
    from crawls.models import Seed, Crawl
    from workers.orchestrator import Orchestrator
    from archivebox.base_models.models import get_or_create_system_user_pk

    created_by_id = created_by_id or get_or_create_system_user_pk()

    # 1. save the provided urls to sources/2024-11-05__23-59-59__cli_add.txt
    sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__cli_add.txt'
    sources_file.write_text(urls if isinstance(urls, str) else '\n'.join(urls))

    # 2. create a new Seed pointing to the sources/2024-11-05__23-59-59__cli_add.txt file
    cli_args = [*sys.argv]
    if cli_args[0].lower().endswith('archivebox'):
        cli_args[0] = 'archivebox'    # shorten the full path to the archivebox binary to just "archivebox", e.g. /Volumes/NVME/Users/squash/archivebox/.venv/bin/archivebox -> archivebox
    cmd_str = ' '.join(cli_args)
    seed = Seed.from_file(sources_file, label=f'{USER}@{HOSTNAME} $ {cmd_str}', parser=parser, tag=tag, created_by=created_by_id, config={
        'ONLY_NEW': not update,
        'INDEX_ONLY': index_only,
        'OVERWRITE': overwrite,
        'EXTRACTORS': extract,
        'DEFAULT_PERSONA': persona or 'Default',
    })

    # 3. create a new Crawl pointing to the Seed
    crawl = Crawl.from_seed(seed, max_depth=depth)

    # 4. start the Orchestrator & wait until it completes
    # ... the orchestrator will create the root Snapshot, which creates pending ArchiveResults, which get run by the ArchiveResultActors ...
    # from crawls.actors import CrawlActor
    # from core.actors import SnapshotActor, ArchiveResultActor
    if not bg:
        orchestrator = Orchestrator(exit_on_idle=True, max_concurrent_actors=4)
        orchestrator.start()

    # 5. return the list of new Snapshots created
    return crawl.snapshot_set.all()

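
# Example (illustrative, not part of the original module): calling add() directly from Python,
# e.g. inside `archivebox shell` after Django has been set up. The URL, tag, and extractor
# values below are placeholders.
#
#   snapshots = add(['https://example.com'], tag='docs,example', extract='title,screenshot', index_only=True)
#   print(snapshots.count())
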

@click.command()
@click.option('--depth', '-d', type=click.Choice(('0', '1')), default='0', help='Recursively archive linked pages up to N hops away')
@click.option('--tag', '-t', default='', help='Comma-separated list of tags to add to each snapshot e.g. tag1,tag2,tag3')
@click.option('--parser', type=click.Choice(['auto', *PARSERS.keys()]), default='auto', help='Parser for reading input URLs')
@click.option('--extract', '-e', default='', help='Comma-separated list of extractors to use e.g. title,favicon,screenshot,singlefile,...')
@click.option('--persona', default='Default', help='Authentication profile to use when archiving')
@click.option('--overwrite', '-F', is_flag=True, help='Overwrite existing data if URLs have been archived previously')
@click.option('--update', is_flag=True, default=not ARCHIVING_CONFIG.ONLY_NEW, help='Retry any previously skipped/failed URLs when re-adding them')
@click.option('--index-only', is_flag=True, help='Just add the URLs to the index without archiving them now')
# @click.option('--update-all', is_flag=True, help='Update ALL links in index when finished adding new ones')
@click.option('--bg', is_flag=True, help='Run crawl in background worker instead of immediately')
@click.argument('urls', nargs=-1, type=click.Path())
@docstring(add.__doc__)
def main(**kwargs):
    """Add a new URL or list of URLs to your archive"""
    add(**kwargs)


if __name__ == '__main__':
    main()

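
# Example CLI usage (illustrative; the URLs and tags below are placeholders):
#
#   archivebox add 'https://example.com'
#   archivebox add --depth=1 --tag=docs,reference 'https://example.com'
#   archivebox add --index-only --parser=auto 'https://example.com'
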

# OLD VERSION:
# def add(urls: Union[str, List[str]],
#         tag: str='',
#         depth: int=0,
#         update: bool=not ARCHIVING_CONFIG.ONLY_NEW,
#         update_all: bool=False,
#         index_only: bool=False,
#         overwrite: bool=False,
#         # duplicate: bool=False,  # TODO: reuse the logic from admin.py resnapshot to allow adding multiple snapshots by appending timestamp automatically
#         init: bool=False,
#         extractors: str="",
#         parser: str="auto",
#         created_by_id: int | None=None,
#         out_dir: Path=DATA_DIR) -> List[Link]:
#     """Add a new URL or list of URLs to your archive"""
#
#     from core.models import Snapshot, Tag
#     # from workers.supervisord_util import start_cli_workers, tail_worker_logs
#     # from workers.tasks import bg_archive_link
#
#     assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)'
#     extractors = extractors.split(",") if extractors else []
#
#     if init:
#         run_subcommand('init', stdin=None, pwd=out_dir)
#
#     # Load list of links from the existing index
#     check_data_folder()
#     # worker = start_cli_workers()
#
#     new_links: List[Link] = []
#     all_links = load_main_index(out_dir=out_dir)
#
#     log_importing_started(urls=urls, depth=depth, index_only=index_only)
#     if isinstance(urls, str):
#         # save verbatim stdin to sources
#         write_ahead_log = save_text_as_source(urls, filename='{ts}-import.txt', out_dir=out_dir)
#     elif isinstance(urls, list):
#         # save verbatim args to sources
#         write_ahead_log = save_text_as_source('\n'.join(urls), filename='{ts}-import.txt', out_dir=out_dir)
#
#     new_links += parse_links_from_source(write_ahead_log, root_url=None, parser=parser)
#
#     # If we're going one level deeper, download each link and look for more links
#     new_links_depth = []
#     if new_links and depth == 1:
#         log_crawl_started(new_links)
#         for new_link in new_links:
#             try:
#                 downloaded_file = save_file_as_source(new_link.url, filename=f'{new_link.timestamp}-crawl-{new_link.domain}.txt', out_dir=out_dir)
#                 new_links_depth += parse_links_from_source(downloaded_file, root_url=new_link.url)
#             except Exception as err:
#                 stderr('[!] Failed to get contents of URL {new_link.url}', err, color='red')
#
#     imported_links = list({link.url: link for link in (new_links + new_links_depth)}.values())
#     new_links = dedupe_links(all_links, imported_links)
#
#     write_main_index(links=new_links, out_dir=out_dir, created_by_id=created_by_id)
#     all_links = load_main_index(out_dir=out_dir)
#
#     tags = [
#         Tag.objects.get_or_create(name=name.strip(), defaults={'created_by_id': created_by_id})[0]
#         for name in tag.split(',')
#         if name.strip()
#     ]
#     if tags:
#         for link in imported_links:
#             snapshot = Snapshot.objects.get(url=link.url)
#             snapshot.tags.add(*tags)
#             snapshot.tags_str(nocache=True)
#             snapshot.save()
#         # print(f' √ Tagged {len(imported_links)} Snapshots with {len(tags)} tags {tags_str}')
#
#     if index_only:
#         # mock archive all the links using the fake index_only extractor method in order to update their state
#         if overwrite:
#             archive_links(imported_links, overwrite=overwrite, methods=['index_only'], out_dir=out_dir, created_by_id=created_by_id)
#         else:
#             archive_links(new_links, overwrite=False, methods=['index_only'], out_dir=out_dir, created_by_id=created_by_id)
#     else:
#         # fully run the archive extractor methods for each link
#         archive_kwargs = {
#             "out_dir": out_dir,
#             "created_by_id": created_by_id,
#         }
#         if extractors:
#             archive_kwargs["methods"] = extractors
#
#         stderr()
#         ts = datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')
#         if update:
#             stderr(f'[*] [{ts}] Archiving + updating {len(imported_links)}/{len(all_links)}', len(imported_links), 'URLs from added set...', color='green')
#             archive_links(imported_links, overwrite=overwrite, **archive_kwargs)
#         elif update_all:
#             stderr(f'[*] [{ts}] Archiving + updating {len(all_links)}/{len(all_links)}', len(all_links), 'URLs from entire library...', color='green')
#             archive_links(all_links, overwrite=overwrite, **archive_kwargs)
#         elif overwrite:
#             stderr(f'[*] [{ts}] Archiving + overwriting {len(imported_links)}/{len(all_links)}', len(imported_links), 'URLs from added set...', color='green')
#             archive_links(imported_links, overwrite=True, **archive_kwargs)
#         elif new_links:
#             stderr(f'[*] [{ts}] Archiving {len(new_links)}/{len(all_links)} URLs from added set...', color='green')
#             archive_links(new_links, overwrite=False, **archive_kwargs)
#
#     # tail_worker_logs(worker['stdout_logfile'])
#
#     # if CAN_UPGRADE:
#     #     hint(f"There's a new version of ArchiveBox available! Your current version is {VERSION}. You can upgrade to {VERSIONS_AVAILABLE['recommended_version']['tag_name']} ({VERSIONS_AVAILABLE['recommended_version']['html_url']}). For more on how to upgrade: https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives\n")
#
#     return new_links