  1. #!/usr/bin/env python3
  2. """
  3. archivebox crawl [urls...] [--depth=N] [--tag=TAG]
  4. Create Crawl jobs from URLs. Accepts URLs as arguments, from stdin, or via JSONL.
  5. Does NOT immediately start the crawl - pipe to `archivebox snapshot` to process.
  6. Input formats:
  7. - Plain URLs (one per line)
  8. - JSONL: {"url": "...", "depth": 1, "tags": "..."}
  9. Output (JSONL):
  10. {"type": "Crawl", "id": "...", "urls": "...", "status": "queued", ...}
  11. Examples:
  12. # Create a crawl job
  13. archivebox crawl https://example.com
  14. # Create crawl with depth
  15. archivebox crawl --depth=1 https://example.com
  16. # Full pipeline: create crawl, create snapshots, run extractors
  17. archivebox crawl https://example.com | archivebox snapshot | archivebox extract
  18. # Process existing Crawl by ID (runs the crawl state machine)
  19. archivebox crawl 01234567-89ab-cdef-0123-456789abcdef
  20. """
  21. __package__ = 'archivebox.cli'
  22. __command__ = 'archivebox crawl'
  23. import sys
  24. from typing import Optional
  25. import rich_click as click
  26. def create_crawls(
  27. records: list,
  28. depth: int = 0,
  29. tag: str = '',
  30. created_by_id: Optional[int] = None,
  31. ) -> int:
  32. """
  33. Create a single Crawl job from all input URLs.
  34. Takes pre-read records, creates one Crawl with all URLs, outputs JSONL.
  35. Does NOT start the crawl - just creates the job in QUEUED state.
  36. Exit codes:
  37. 0: Success
  38. 1: Failure
  39. """
  40. from rich import print as rprint
  41. from archivebox.misc.jsonl import write_record
  42. from archivebox.base_models.models import get_or_create_system_user_pk
  43. from archivebox.crawls.models import Crawl
  44. created_by_id = created_by_id or get_or_create_system_user_pk()
  45. is_tty = sys.stdout.isatty()
  46. if not records:
  47. rprint('[yellow]No URLs provided. Pass URLs as arguments or via stdin.[/yellow]', file=sys.stderr)
  48. return 1
  49. # Collect all URLs into a single newline-separated string
  50. urls = []
  51. for record in records:
  52. url = record.get('url')
  53. if url:
  54. urls.append(url)
  55. if not urls:
  56. rprint('[red]No valid URLs found[/red]', file=sys.stderr)
  57. return 1
  58. try:
  59. # Build crawl record with all URLs as newline-separated string
  60. crawl_record = {
  61. 'urls': '\n'.join(urls),
  62. 'max_depth': depth,
  63. 'tags_str': tag,
  64. 'label': '',
  65. }
  66. crawl = Crawl.from_jsonl(crawl_record, overrides={'created_by_id': created_by_id})
  67. if not crawl:
  68. rprint('[red]Failed to create crawl[/red]', file=sys.stderr)
  69. return 1
  70. # Output JSONL record (only when piped)
  71. if not is_tty:
  72. write_record(crawl.to_jsonl())
  73. rprint(f'[green]Created crawl with {len(urls)} URLs[/green]', file=sys.stderr)
  74. # If TTY, show human-readable output
  75. if is_tty:
  76. rprint(f' [dim]{crawl.id}[/dim]', file=sys.stderr)
  77. for url in urls[:5]: # Show first 5 URLs
  78. rprint(f' {url[:70]}', file=sys.stderr)
  79. if len(urls) > 5:
  80. rprint(f' ... and {len(urls) - 5} more', file=sys.stderr)
  81. return 0
  82. except Exception as e:
  83. rprint(f'[red]Error creating crawl: {e}[/red]', file=sys.stderr)
  84. return 1
  85. def process_crawl_by_id(crawl_id: str) -> int:
  86. """
  87. Process a single Crawl by ID (used by workers).
  88. Triggers the Crawl's state machine tick() which will:
  89. - Transition from queued -> started (creates root snapshot)
  90. - Transition from started -> sealed (when all snapshots done)
  91. """
  92. from rich import print as rprint
  93. from archivebox.crawls.models import Crawl
  94. try:
  95. crawl = Crawl.objects.get(id=crawl_id)
  96. except Crawl.DoesNotExist:
  97. rprint(f'[red]Crawl {crawl_id} not found[/red]', file=sys.stderr)
  98. return 1
  99. rprint(f'[blue]Processing Crawl {crawl.id} (status={crawl.status})[/blue]', file=sys.stderr)
  100. try:
  101. crawl.sm.tick()
  102. crawl.refresh_from_db()
  103. rprint(f'[green]Crawl complete (status={crawl.status})[/green]', file=sys.stderr)
  104. return 0
  105. except Exception as e:
  106. rprint(f'[red]Crawl error: {type(e).__name__}: {e}[/red]', file=sys.stderr)
  107. return 1
  108. def is_crawl_id(value: str) -> bool:
  109. """Check if value looks like a Crawl UUID."""
  110. import re
  111. uuid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.I)
  112. if not uuid_pattern.match(value):
  113. return False
  114. # Verify it's actually a Crawl (not a Snapshot or other object)
  115. from archivebox.crawls.models import Crawl
  116. return Crawl.objects.filter(id=value).exists()
  117. @click.command()
  118. @click.option('--depth', '-d', type=int, default=0, help='Max depth for recursive crawling (default: 0, no recursion)')
  119. @click.option('--tag', '-t', default='', help='Comma-separated tags to add to snapshots')
  120. @click.argument('args', nargs=-1)
  121. def main(depth: int, tag: str, args: tuple):
  122. """Create Crawl jobs from URLs, or process existing Crawls by ID"""
  123. from archivebox.misc.jsonl import read_args_or_stdin
  124. # Read all input
  125. records = list(read_args_or_stdin(args))
  126. if not records:
  127. from rich import print as rprint
  128. rprint('[yellow]No URLs or Crawl IDs provided. Pass as arguments or via stdin.[/yellow]', file=sys.stderr)
  129. sys.exit(1)
  130. # Check if input looks like existing Crawl IDs to process
  131. # If ALL inputs are Crawl UUIDs, process them
  132. all_are_crawl_ids = all(
  133. is_crawl_id(r.get('id') or r.get('url', ''))
  134. for r in records
  135. )
  136. if all_are_crawl_ids:
  137. # Process existing Crawls by ID
  138. exit_code = 0
  139. for record in records:
  140. crawl_id = record.get('id') or record.get('url')
  141. result = process_crawl_by_id(crawl_id)
  142. if result != 0:
  143. exit_code = result
  144. sys.exit(exit_code)
  145. else:
  146. # Default behavior: create Crawl jobs from URLs
  147. sys.exit(create_crawls(records, depth=depth, tag=tag))
  148. if __name__ == '__main__':
  149. main()