#!/usr/bin/env python3
"""
archivebox snapshot <action> [args...] [--filters]

Manage Snapshot records.

Actions:
    create - Create Snapshots from URLs or Crawl JSONL
    list   - List Snapshots as JSONL (with optional filters)
    update - Update Snapshots from stdin JSONL
    delete - Delete Snapshots from stdin JSONL

Examples:
    # Create
    archivebox snapshot create https://example.com --tag=news
    archivebox crawl create https://example.com | archivebox snapshot create

    # List with filters
    archivebox snapshot list --status=queued
    archivebox snapshot list --url__icontains=example.com

    # Update
    archivebox snapshot list --tag=old | archivebox snapshot update --tag=new

    # Delete
    archivebox snapshot list --url__icontains=spam.com | archivebox snapshot delete --yes
"""

__package__ = 'archivebox.cli'
__command__ = 'archivebox snapshot'

import sys
from typing import Optional, Iterable

import rich_click as click
from rich import print as rprint

from archivebox.cli.cli_utils import apply_filters

# =============================================================================
# CREATE
# =============================================================================
def create_snapshots(
    urls: Iterable[str],
    tag: str = '',
    status: str = 'queued',
    depth: int = 0,
    created_by_id: Optional[int] = None,
) -> int:
    """
    Create Snapshots from URLs or stdin JSONL (Crawl or Snapshot records).

    Pass-through: Records that are not Crawl/Snapshot/URL are output unchanged.

    Args:
        urls: URLs given as CLI arguments; when empty, records come from stdin.
        tag: Comma-separated tags applied to created snapshots.
        status: Initial status written onto each created snapshot.
        depth: Default crawl depth for records that don't carry their own.
        created_by_id: User PK to attribute creation to; falls back to the
            system user when not given.

    Exit codes:
        0: Success
        1: Failure (no input, or nothing created and nothing passed through)
    """
    from archivebox.misc.jsonl import (
        read_args_or_stdin, write_record,
        TYPE_SNAPSHOT, TYPE_CRAWL
    )
    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.core.models import Snapshot
    from archivebox.crawls.models import Crawl

    created_by_id = created_by_id or get_or_create_system_user_pk()
    # JSONL records are only written when stdout is piped; on a TTY we print a
    # human-readable summary at the end instead.
    is_tty = sys.stdout.isatty()

    # Collect all input records
    records = list(read_args_or_stdin(urls))
    if not records:
        rprint('[yellow]No URLs or Crawls provided. Pass URLs as arguments or via stdin.[/yellow]', file=sys.stderr)
        return 1

    # Process each record - handle Crawls and plain URLs/Snapshots
    created_snapshots = []
    pass_through_count = 0
    for record in records:
        record_type = record.get('type', '')
        try:
            if record_type == TYPE_CRAWL:
                # Pass through the Crawl record itself first
                if not is_tty:
                    write_record(record)
                # Input is a Crawl - get or create it, then create Snapshots for its URLs
                crawl = None
                crawl_id = record.get('id')
                if crawl_id:
                    try:
                        crawl = Crawl.objects.get(id=crawl_id)
                    except Crawl.DoesNotExist:
                        # ID given but unknown: materialize the Crawl from the record
                        crawl = Crawl.from_json(record, overrides={'created_by_id': created_by_id})
                else:
                    crawl = Crawl.from_json(record, overrides={'created_by_id': created_by_id})
                if not crawl:
                    continue
                # Create snapshots for each URL in the crawl
                for url in crawl.get_urls_list():
                    # Merge the crawl's own tags with any --tag from the CLI
                    merged_tags = crawl.tags_str
                    if tag:
                        merged_tags = f"{merged_tags},{tag}" if merged_tags else tag
                    snapshot_record = {
                        'url': url,
                        'tags': merged_tags,
                        'crawl_id': str(crawl.id),
                        'depth': depth,
                        'status': status,
                    }
                    snapshot = Snapshot.from_json(snapshot_record, overrides={'created_by_id': created_by_id})
                    if snapshot:
                        created_snapshots.append(snapshot)
                        if not is_tty:
                            write_record(snapshot.to_json())
            elif record_type == TYPE_SNAPSHOT or record.get('url'):
                # Input is a Snapshot or plain URL
                # NOTE(review): unlike the Crawl branch above, --tag is NOT merged
                # with the record's existing tags here - it only applies when the
                # record has no tags at all. Confirm whether that is intentional.
                if tag and not record.get('tags'):
                    record['tags'] = tag
                if status:
                    record['status'] = status
                record['depth'] = record.get('depth', depth)
                snapshot = Snapshot.from_json(record, overrides={'created_by_id': created_by_id})
                if snapshot:
                    created_snapshots.append(snapshot)
                    if not is_tty:
                        write_record(snapshot.to_json())
            else:
                # Pass-through: output records we don't handle
                if not is_tty:
                    write_record(record)
                pass_through_count += 1
        except Exception as e:
            # Best-effort: report the failure and keep processing remaining records
            rprint(f'[red]Error creating snapshot: {e}[/red]', file=sys.stderr)
            continue

    if not created_snapshots:
        if pass_through_count > 0:
            rprint(f'[dim]Passed through {pass_through_count} records, no new snapshots[/dim]', file=sys.stderr)
            return 0
        rprint('[red]No snapshots created[/red]', file=sys.stderr)
        return 1

    rprint(f'[green]Created {len(created_snapshots)} snapshots[/green]', file=sys.stderr)
    if is_tty:
        for snapshot in created_snapshots:
            rprint(f' [dim]{snapshot.id}[/dim] {snapshot.url[:60]}', file=sys.stderr)
    return 0
# =============================================================================
# LIST
# =============================================================================
  133. def list_snapshots(
  134. status: Optional[str] = None,
  135. url__icontains: Optional[str] = None,
  136. url__istartswith: Optional[str] = None,
  137. tag: Optional[str] = None,
  138. crawl_id: Optional[str] = None,
  139. limit: Optional[int] = None,
  140. ) -> int:
  141. """
  142. List Snapshots as JSONL with optional filters.
  143. Exit codes:
  144. 0: Success (even if no results)
  145. """
  146. from archivebox.misc.jsonl import write_record
  147. from archivebox.core.models import Snapshot
  148. is_tty = sys.stdout.isatty()
  149. queryset = Snapshot.objects.all().order_by('-created_at')
  150. # Apply filters
  151. filter_kwargs = {
  152. 'status': status,
  153. 'url__icontains': url__icontains,
  154. 'url__istartswith': url__istartswith,
  155. 'crawl_id': crawl_id,
  156. }
  157. queryset = apply_filters(queryset, filter_kwargs, limit=limit)
  158. # Tag filter requires special handling (M2M)
  159. if tag:
  160. queryset = queryset.filter(tags__name__iexact=tag)
  161. count = 0
  162. for snapshot in queryset:
  163. if is_tty:
  164. status_color = {
  165. 'queued': 'yellow',
  166. 'started': 'blue',
  167. 'sealed': 'green',
  168. }.get(snapshot.status, 'dim')
  169. rprint(f'[{status_color}]{snapshot.status:8}[/{status_color}] [dim]{snapshot.id}[/dim] {snapshot.url[:60]}')
  170. else:
  171. write_record(snapshot.to_json())
  172. count += 1
  173. rprint(f'[dim]Listed {count} snapshots[/dim]', file=sys.stderr)
  174. return 0
# =============================================================================
# UPDATE
# =============================================================================
  178. def update_snapshots(
  179. status: Optional[str] = None,
  180. tag: Optional[str] = None,
  181. ) -> int:
  182. """
  183. Update Snapshots from stdin JSONL.
  184. Reads Snapshot records from stdin and applies updates.
  185. Uses PATCH semantics - only specified fields are updated.
  186. Exit codes:
  187. 0: Success
  188. 1: No input or error
  189. """
  190. from django.utils import timezone
  191. from archivebox.misc.jsonl import read_stdin, write_record
  192. from archivebox.core.models import Snapshot
  193. is_tty = sys.stdout.isatty()
  194. records = list(read_stdin())
  195. if not records:
  196. rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
  197. return 1
  198. updated_count = 0
  199. for record in records:
  200. snapshot_id = record.get('id')
  201. if not snapshot_id:
  202. continue
  203. try:
  204. snapshot = Snapshot.objects.get(id=snapshot_id)
  205. # Apply updates from CLI flags (override stdin values)
  206. if status:
  207. snapshot.status = status
  208. snapshot.retry_at = timezone.now()
  209. if tag:
  210. # Add tag to existing tags
  211. snapshot.save() # Ensure saved before M2M
  212. from archivebox.core.models import Tag
  213. tag_obj, _ = Tag.objects.get_or_create(name=tag)
  214. snapshot.tags.add(tag_obj)
  215. snapshot.save()
  216. updated_count += 1
  217. if not is_tty:
  218. write_record(snapshot.to_json())
  219. except Snapshot.DoesNotExist:
  220. rprint(f'[yellow]Snapshot not found: {snapshot_id}[/yellow]', file=sys.stderr)
  221. continue
  222. rprint(f'[green]Updated {updated_count} snapshots[/green]', file=sys.stderr)
  223. return 0
# =============================================================================
# DELETE
# =============================================================================
  227. def delete_snapshots(yes: bool = False, dry_run: bool = False) -> int:
  228. """
  229. Delete Snapshots from stdin JSONL.
  230. Requires --yes flag to confirm deletion.
  231. Exit codes:
  232. 0: Success
  233. 1: No input or missing --yes flag
  234. """
  235. from archivebox.misc.jsonl import read_stdin
  236. from archivebox.core.models import Snapshot
  237. records = list(read_stdin())
  238. if not records:
  239. rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
  240. return 1
  241. snapshot_ids = [r.get('id') for r in records if r.get('id')]
  242. if not snapshot_ids:
  243. rprint('[yellow]No valid snapshot IDs in input[/yellow]', file=sys.stderr)
  244. return 1
  245. snapshots = Snapshot.objects.filter(id__in=snapshot_ids)
  246. count = snapshots.count()
  247. if count == 0:
  248. rprint('[yellow]No matching snapshots found[/yellow]', file=sys.stderr)
  249. return 0
  250. if dry_run:
  251. rprint(f'[yellow]Would delete {count} snapshots (dry run)[/yellow]', file=sys.stderr)
  252. for snapshot in snapshots:
  253. rprint(f' [dim]{snapshot.id}[/dim] {snapshot.url[:60]}', file=sys.stderr)
  254. return 0
  255. if not yes:
  256. rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr)
  257. return 1
  258. # Perform deletion
  259. deleted_count, _ = snapshots.delete()
  260. rprint(f'[green]Deleted {deleted_count} snapshots[/green]', file=sys.stderr)
  261. return 0
# =============================================================================
# CLI Commands
# =============================================================================
  265. @click.group()
  266. def main():
  267. """Manage Snapshot records."""
  268. pass
  269. @main.command('create')
  270. @click.argument('urls', nargs=-1)
  271. @click.option('--tag', '-t', default='', help='Comma-separated tags to add')
  272. @click.option('--status', '-s', default='queued', help='Initial status (default: queued)')
  273. @click.option('--depth', '-d', type=int, default=0, help='Crawl depth (default: 0)')
  274. def create_cmd(urls: tuple, tag: str, status: str, depth: int):
  275. """Create Snapshots from URLs or stdin JSONL."""
  276. sys.exit(create_snapshots(urls, tag=tag, status=status, depth=depth))
  277. @main.command('list')
  278. @click.option('--status', '-s', help='Filter by status (queued, started, sealed)')
  279. @click.option('--url__icontains', help='Filter by URL contains')
  280. @click.option('--url__istartswith', help='Filter by URL starts with')
  281. @click.option('--tag', '-t', help='Filter by tag name')
  282. @click.option('--crawl-id', help='Filter by crawl ID')
  283. @click.option('--limit', '-n', type=int, help='Limit number of results')
  284. def list_cmd(status: Optional[str], url__icontains: Optional[str], url__istartswith: Optional[str],
  285. tag: Optional[str], crawl_id: Optional[str], limit: Optional[int]):
  286. """List Snapshots as JSONL."""
  287. sys.exit(list_snapshots(
  288. status=status,
  289. url__icontains=url__icontains,
  290. url__istartswith=url__istartswith,
  291. tag=tag,
  292. crawl_id=crawl_id,
  293. limit=limit,
  294. ))
  295. @main.command('update')
  296. @click.option('--status', '-s', help='Set status')
  297. @click.option('--tag', '-t', help='Add tag')
  298. def update_cmd(status: Optional[str], tag: Optional[str]):
  299. """Update Snapshots from stdin JSONL."""
  300. sys.exit(update_snapshots(status=status, tag=tag))
  301. @main.command('delete')
  302. @click.option('--yes', '-y', is_flag=True, help='Confirm deletion')
  303. @click.option('--dry-run', is_flag=True, help='Show what would be deleted')
  304. def delete_cmd(yes: bool, dry_run: bool):
  305. """Delete Snapshots from stdin JSONL."""
  306. sys.exit(delete_snapshots(yes=yes, dry_run=dry_run))
# Allow running this module directly as a standalone script.
if __name__ == '__main__':
    main()