# archivebox_crawl.py
#!/usr/bin/env python3
"""
archivebox crawl <action> [args...] [--filters]

Manage Crawl records.

Actions:
    create - Create Crawl jobs from URLs
    list   - List Crawls as JSONL (with optional filters)
    update - Update Crawls from stdin JSONL
    delete - Delete Crawls from stdin JSONL

Examples:
    # Create
    archivebox crawl create https://example.com https://foo.com --depth=1
    archivebox crawl create --tag=news https://example.com

    # List with filters
    archivebox crawl list --status=queued
    archivebox crawl list --urls__icontains=example.com

    # Update
    archivebox crawl list --status=started | archivebox crawl update --status=queued

    # Delete
    archivebox crawl list --urls__icontains=spam.com | archivebox crawl delete --yes

    # Full pipeline
    archivebox crawl create https://example.com | archivebox snapshot create | archivebox run
"""

__package__ = 'archivebox.cli'
__command__ = 'archivebox crawl'

# stdlib
import sys
from typing import Optional, Iterable

# third-party CLI/console helpers
import rich_click as click
from rich import print as rprint

# project-local filter helper shared by the `list` subcommands
from archivebox.cli.cli_utils import apply_filters
  31. # =============================================================================
  32. # CREATE
  33. # =============================================================================
  34. def create_crawl(
  35. urls: Iterable[str],
  36. depth: int = 0,
  37. tag: str = '',
  38. status: str = 'queued',
  39. created_by_id: Optional[int] = None,
  40. ) -> int:
  41. """
  42. Create a Crawl job from URLs.
  43. Takes URLs as args or stdin, creates one Crawl with all URLs, outputs JSONL.
  44. Pass-through: Records that are not URLs are output unchanged (for piping).
  45. Exit codes:
  46. 0: Success
  47. 1: Failure
  48. """
  49. from archivebox.misc.jsonl import read_args_or_stdin, write_record, TYPE_CRAWL
  50. from archivebox.base_models.models import get_or_create_system_user_pk
  51. from archivebox.crawls.models import Crawl
  52. created_by_id = created_by_id or get_or_create_system_user_pk()
  53. is_tty = sys.stdout.isatty()
  54. # Collect all input records
  55. records = list(read_args_or_stdin(urls))
  56. if not records:
  57. rprint('[yellow]No URLs provided. Pass URLs as arguments or via stdin.[/yellow]', file=sys.stderr)
  58. return 1
  59. # Separate pass-through records from URL records
  60. url_list = []
  61. pass_through_records = []
  62. for record in records:
  63. record_type = record.get('type', '')
  64. # Pass-through: output records that aren't URL/Crawl types
  65. if record_type and record_type != TYPE_CRAWL and not record.get('url') and not record.get('urls'):
  66. pass_through_records.append(record)
  67. continue
  68. # Handle existing Crawl records (just pass through with id)
  69. if record_type == TYPE_CRAWL and record.get('id'):
  70. pass_through_records.append(record)
  71. continue
  72. # Collect URLs
  73. url = record.get('url')
  74. if url:
  75. url_list.append(url)
  76. # Handle 'urls' field (newline-separated)
  77. urls_field = record.get('urls')
  78. if urls_field:
  79. for line in urls_field.split('\n'):
  80. line = line.strip()
  81. if line and not line.startswith('#'):
  82. url_list.append(line)
  83. # Output pass-through records first
  84. if not is_tty:
  85. for record in pass_through_records:
  86. write_record(record)
  87. if not url_list:
  88. if pass_through_records:
  89. # If we had pass-through records but no URLs, that's OK
  90. rprint(f'[dim]Passed through {len(pass_through_records)} records, no new URLs[/dim]', file=sys.stderr)
  91. return 0
  92. rprint('[red]No valid URLs found[/red]', file=sys.stderr)
  93. return 1
  94. try:
  95. # Build crawl record with all URLs as newline-separated string
  96. crawl_record = {
  97. 'urls': '\n'.join(url_list),
  98. 'max_depth': depth,
  99. 'tags_str': tag,
  100. 'status': status,
  101. 'label': '',
  102. }
  103. crawl = Crawl.from_json(crawl_record, overrides={'created_by_id': created_by_id})
  104. if not crawl:
  105. rprint('[red]Failed to create crawl[/red]', file=sys.stderr)
  106. return 1
  107. # Output JSONL record (only when piped)
  108. if not is_tty:
  109. write_record(crawl.to_json())
  110. rprint(f'[green]Created crawl with {len(url_list)} URLs[/green]', file=sys.stderr)
  111. # If TTY, show human-readable output
  112. if is_tty:
  113. rprint(f' [dim]{crawl.id}[/dim]', file=sys.stderr)
  114. for url in url_list[:5]: # Show first 5 URLs
  115. rprint(f' {url[:70]}', file=sys.stderr)
  116. if len(url_list) > 5:
  117. rprint(f' ... and {len(url_list) - 5} more', file=sys.stderr)
  118. return 0
  119. except Exception as e:
  120. rprint(f'[red]Error creating crawl: {e}[/red]', file=sys.stderr)
  121. return 1
  122. # =============================================================================
  123. # LIST
  124. # =============================================================================
  125. def list_crawls(
  126. status: Optional[str] = None,
  127. urls__icontains: Optional[str] = None,
  128. max_depth: Optional[int] = None,
  129. limit: Optional[int] = None,
  130. ) -> int:
  131. """
  132. List Crawls as JSONL with optional filters.
  133. Exit codes:
  134. 0: Success (even if no results)
  135. """
  136. from archivebox.misc.jsonl import write_record
  137. from archivebox.crawls.models import Crawl
  138. is_tty = sys.stdout.isatty()
  139. queryset = Crawl.objects.all().order_by('-created_at')
  140. # Apply filters
  141. filter_kwargs = {
  142. 'status': status,
  143. 'urls__icontains': urls__icontains,
  144. 'max_depth': max_depth,
  145. }
  146. queryset = apply_filters(queryset, filter_kwargs, limit=limit)
  147. count = 0
  148. for crawl in queryset:
  149. if is_tty:
  150. status_color = {
  151. 'queued': 'yellow',
  152. 'started': 'blue',
  153. 'sealed': 'green',
  154. }.get(crawl.status, 'dim')
  155. url_preview = crawl.urls[:50].replace('\n', ' ')
  156. rprint(f'[{status_color}]{crawl.status:8}[/{status_color}] [dim]{crawl.id}[/dim] {url_preview}...')
  157. else:
  158. write_record(crawl.to_json())
  159. count += 1
  160. rprint(f'[dim]Listed {count} crawls[/dim]', file=sys.stderr)
  161. return 0
  162. # =============================================================================
  163. # UPDATE
  164. # =============================================================================
  165. def update_crawls(
  166. status: Optional[str] = None,
  167. max_depth: Optional[int] = None,
  168. ) -> int:
  169. """
  170. Update Crawls from stdin JSONL.
  171. Reads Crawl records from stdin and applies updates.
  172. Uses PATCH semantics - only specified fields are updated.
  173. Exit codes:
  174. 0: Success
  175. 1: No input or error
  176. """
  177. from django.utils import timezone
  178. from archivebox.misc.jsonl import read_stdin, write_record
  179. from archivebox.crawls.models import Crawl
  180. is_tty = sys.stdout.isatty()
  181. records = list(read_stdin())
  182. if not records:
  183. rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
  184. return 1
  185. updated_count = 0
  186. for record in records:
  187. crawl_id = record.get('id')
  188. if not crawl_id:
  189. continue
  190. try:
  191. crawl = Crawl.objects.get(id=crawl_id)
  192. # Apply updates from CLI flags
  193. if status:
  194. crawl.status = status
  195. crawl.retry_at = timezone.now()
  196. if max_depth is not None:
  197. crawl.max_depth = max_depth
  198. crawl.save()
  199. updated_count += 1
  200. if not is_tty:
  201. write_record(crawl.to_json())
  202. except Crawl.DoesNotExist:
  203. rprint(f'[yellow]Crawl not found: {crawl_id}[/yellow]', file=sys.stderr)
  204. continue
  205. rprint(f'[green]Updated {updated_count} crawls[/green]', file=sys.stderr)
  206. return 0
  207. # =============================================================================
  208. # DELETE
  209. # =============================================================================
  210. def delete_crawls(yes: bool = False, dry_run: bool = False) -> int:
  211. """
  212. Delete Crawls from stdin JSONL.
  213. Requires --yes flag to confirm deletion.
  214. Exit codes:
  215. 0: Success
  216. 1: No input or missing --yes flag
  217. """
  218. from archivebox.misc.jsonl import read_stdin
  219. from archivebox.crawls.models import Crawl
  220. records = list(read_stdin())
  221. if not records:
  222. rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
  223. return 1
  224. crawl_ids = [r.get('id') for r in records if r.get('id')]
  225. if not crawl_ids:
  226. rprint('[yellow]No valid crawl IDs in input[/yellow]', file=sys.stderr)
  227. return 1
  228. crawls = Crawl.objects.filter(id__in=crawl_ids)
  229. count = crawls.count()
  230. if count == 0:
  231. rprint('[yellow]No matching crawls found[/yellow]', file=sys.stderr)
  232. return 0
  233. if dry_run:
  234. rprint(f'[yellow]Would delete {count} crawls (dry run)[/yellow]', file=sys.stderr)
  235. for crawl in crawls:
  236. url_preview = crawl.urls[:50].replace('\n', ' ')
  237. rprint(f' [dim]{crawl.id}[/dim] {url_preview}...', file=sys.stderr)
  238. return 0
  239. if not yes:
  240. rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr)
  241. return 1
  242. # Perform deletion
  243. deleted_count, _ = crawls.delete()
  244. rprint(f'[green]Deleted {deleted_count} crawls[/green]', file=sys.stderr)
  245. return 0
  246. # =============================================================================
  247. # CLI Commands
  248. # =============================================================================
  249. @click.group()
  250. def main():
  251. """Manage Crawl records."""
  252. pass
  253. @main.command('create')
  254. @click.argument('urls', nargs=-1)
  255. @click.option('--depth', '-d', type=int, default=0, help='Max crawl depth (default: 0)')
  256. @click.option('--tag', '-t', default='', help='Comma-separated tags to add')
  257. @click.option('--status', '-s', default='queued', help='Initial status (default: queued)')
  258. def create_cmd(urls: tuple, depth: int, tag: str, status: str):
  259. """Create a Crawl job from URLs or stdin."""
  260. sys.exit(create_crawl(urls, depth=depth, tag=tag, status=status))
  261. @main.command('list')
  262. @click.option('--status', '-s', help='Filter by status (queued, started, sealed)')
  263. @click.option('--urls__icontains', help='Filter by URLs contains')
  264. @click.option('--max-depth', type=int, help='Filter by max depth')
  265. @click.option('--limit', '-n', type=int, help='Limit number of results')
  266. def list_cmd(status: Optional[str], urls__icontains: Optional[str],
  267. max_depth: Optional[int], limit: Optional[int]):
  268. """List Crawls as JSONL."""
  269. sys.exit(list_crawls(
  270. status=status,
  271. urls__icontains=urls__icontains,
  272. max_depth=max_depth,
  273. limit=limit,
  274. ))
  275. @main.command('update')
  276. @click.option('--status', '-s', help='Set status')
  277. @click.option('--max-depth', type=int, help='Set max depth')
  278. def update_cmd(status: Optional[str], max_depth: Optional[int]):
  279. """Update Crawls from stdin JSONL."""
  280. sys.exit(update_crawls(status=status, max_depth=max_depth))
  281. @main.command('delete')
  282. @click.option('--yes', '-y', is_flag=True, help='Confirm deletion')
  283. @click.option('--dry-run', is_flag=True, help='Show what would be deleted')
  284. def delete_cmd(yes: bool, dry_run: bool):
  285. """Delete Crawls from stdin JSONL."""
  286. sys.exit(delete_crawls(yes=yes, dry_run=dry_run))
# Allow running this module directly as a script (in addition to via the CLI).
if __name__ == '__main__':
    main()