#!/usr/bin/env python3
# archivebox_archiveresult.py
"""
archivebox archiveresult <action> [args...] [--filters]

Manage ArchiveResult records (plugin extraction results).

Actions:
    create  - Create ArchiveResults for Snapshots (queue extractions)
    list    - List ArchiveResults as JSONL (with optional filters)
    update  - Update ArchiveResults from stdin JSONL
    delete  - Delete ArchiveResults from stdin JSONL

Examples:
    # Create ArchiveResults for snapshots (queue for extraction)
    archivebox snapshot list --status=queued | archivebox archiveresult create
    archivebox archiveresult create --plugin=screenshot --snapshot-id=<uuid>

    # List with filters
    archivebox archiveresult list --status=failed
    archivebox archiveresult list --plugin=screenshot --status=succeeded

    # Update (reset failed extractions to queued)
    archivebox archiveresult list --status=failed | archivebox archiveresult update --status=queued

    # Delete
    archivebox archiveresult list --plugin=singlefile | archivebox archiveresult delete --yes

    # Re-run failed extractions
    archivebox archiveresult list --status=failed | archivebox run
"""
__package__ = 'archivebox.cli'
__command__ = 'archivebox archiveresult'

import sys
from typing import Optional

import rich_click as click
from rich import print as rprint
from rich.markup import escape

from archivebox.cli.cli_utils import apply_filters


  31. # =============================================================================
  32. # CREATE
  33. # =============================================================================
  34. def create_archiveresults(
  35. snapshot_id: Optional[str] = None,
  36. plugin: Optional[str] = None,
  37. status: str = 'queued',
  38. ) -> int:
  39. """
  40. Create ArchiveResults for Snapshots.
  41. Reads Snapshot records from stdin and creates ArchiveResult entries.
  42. Pass-through: Non-Snapshot/ArchiveResult records are output unchanged.
  43. If --plugin is specified, only creates results for that plugin.
  44. Otherwise, creates results for all pending plugins.
  45. Exit codes:
  46. 0: Success
  47. 1: Failure
  48. """
  49. from django.utils import timezone
  50. from archivebox.misc.jsonl import read_stdin, write_record, TYPE_SNAPSHOT, TYPE_ARCHIVERESULT
  51. from archivebox.core.models import Snapshot, ArchiveResult
  52. is_tty = sys.stdout.isatty()
  53. # If snapshot_id provided directly, use that
  54. if snapshot_id:
  55. try:
  56. snapshots = [Snapshot.objects.get(id=snapshot_id)]
  57. pass_through_records = []
  58. except Snapshot.DoesNotExist:
  59. rprint(f'[red]Snapshot not found: {snapshot_id}[/red]', file=sys.stderr)
  60. return 1
  61. else:
  62. # Read from stdin
  63. records = list(read_stdin())
  64. if not records:
  65. rprint('[yellow]No Snapshot records provided via stdin[/yellow]', file=sys.stderr)
  66. return 1
  67. # Separate snapshot records from pass-through records
  68. snapshot_ids = []
  69. pass_through_records = []
  70. for record in records:
  71. record_type = record.get('type', '')
  72. if record_type == TYPE_SNAPSHOT:
  73. # Pass through the Snapshot record itself
  74. pass_through_records.append(record)
  75. if record.get('id'):
  76. snapshot_ids.append(record['id'])
  77. elif record_type == TYPE_ARCHIVERESULT:
  78. # ArchiveResult records: pass through if they have an id
  79. if record.get('id'):
  80. pass_through_records.append(record)
  81. # If no id, we could create it, but for now just pass through
  82. else:
  83. pass_through_records.append(record)
  84. elif record_type:
  85. # Other typed records (Crawl, Tag, etc): pass through
  86. pass_through_records.append(record)
  87. elif record.get('id'):
  88. # Untyped record with id - assume it's a snapshot ID
  89. snapshot_ids.append(record['id'])
  90. # Output pass-through records first
  91. if not is_tty:
  92. for record in pass_through_records:
  93. write_record(record)
  94. if not snapshot_ids:
  95. if pass_through_records:
  96. rprint(f'[dim]Passed through {len(pass_through_records)} records, no new snapshots to process[/dim]', file=sys.stderr)
  97. return 0
  98. rprint('[yellow]No valid Snapshot IDs in input[/yellow]', file=sys.stderr)
  99. return 1
  100. snapshots = list(Snapshot.objects.filter(id__in=snapshot_ids))
  101. if not snapshots:
  102. rprint('[yellow]No matching snapshots found[/yellow]', file=sys.stderr)
  103. return 0 if pass_through_records else 1
  104. created_count = 0
  105. for snapshot in snapshots:
  106. if plugin:
  107. # Create for specific plugin only
  108. result, created = ArchiveResult.objects.get_or_create(
  109. snapshot=snapshot,
  110. plugin=plugin,
  111. defaults={
  112. 'status': status,
  113. 'retry_at': timezone.now(),
  114. }
  115. )
  116. if not created and result.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED]:
  117. # Reset for retry
  118. result.status = status
  119. result.retry_at = timezone.now()
  120. result.save()
  121. if not is_tty:
  122. write_record(result.to_json())
  123. created_count += 1
  124. else:
  125. # Create all pending plugins
  126. snapshot.create_pending_archiveresults()
  127. for result in snapshot.archiveresult_set.filter(status=ArchiveResult.StatusChoices.QUEUED):
  128. if not is_tty:
  129. write_record(result.to_json())
  130. created_count += 1
  131. rprint(f'[green]Created/queued {created_count} archive results[/green]', file=sys.stderr)
  132. return 0
  133. # =============================================================================
  134. # LIST
  135. # =============================================================================
  136. def list_archiveresults(
  137. status: Optional[str] = None,
  138. plugin: Optional[str] = None,
  139. snapshot_id: Optional[str] = None,
  140. limit: Optional[int] = None,
  141. ) -> int:
  142. """
  143. List ArchiveResults as JSONL with optional filters.
  144. Exit codes:
  145. 0: Success (even if no results)
  146. """
  147. from archivebox.misc.jsonl import write_record
  148. from archivebox.core.models import ArchiveResult
  149. is_tty = sys.stdout.isatty()
  150. queryset = ArchiveResult.objects.all().order_by('-start_ts')
  151. # Apply filters
  152. filter_kwargs = {
  153. 'status': status,
  154. 'plugin': plugin,
  155. 'snapshot_id': snapshot_id,
  156. }
  157. queryset = apply_filters(queryset, filter_kwargs, limit=limit)
  158. count = 0
  159. for result in queryset:
  160. if is_tty:
  161. status_color = {
  162. 'queued': 'yellow',
  163. 'started': 'blue',
  164. 'succeeded': 'green',
  165. 'failed': 'red',
  166. 'skipped': 'dim',
  167. 'backoff': 'magenta',
  168. }.get(result.status, 'dim')
  169. rprint(f'[{status_color}]{result.status:10}[/{status_color}] {result.plugin:15} [dim]{result.id}[/dim] {result.snapshot.url[:40]}')
  170. else:
  171. write_record(result.to_json())
  172. count += 1
  173. rprint(f'[dim]Listed {count} archive results[/dim]', file=sys.stderr)
  174. return 0
  175. # =============================================================================
  176. # UPDATE
  177. # =============================================================================
  178. def update_archiveresults(
  179. status: Optional[str] = None,
  180. ) -> int:
  181. """
  182. Update ArchiveResults from stdin JSONL.
  183. Reads ArchiveResult records from stdin and applies updates.
  184. Uses PATCH semantics - only specified fields are updated.
  185. Exit codes:
  186. 0: Success
  187. 1: No input or error
  188. """
  189. from django.utils import timezone
  190. from archivebox.misc.jsonl import read_stdin, write_record
  191. from archivebox.core.models import ArchiveResult
  192. is_tty = sys.stdout.isatty()
  193. records = list(read_stdin())
  194. if not records:
  195. rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
  196. return 1
  197. updated_count = 0
  198. for record in records:
  199. result_id = record.get('id')
  200. if not result_id:
  201. continue
  202. try:
  203. result = ArchiveResult.objects.get(id=result_id)
  204. # Apply updates from CLI flags
  205. if status:
  206. result.status = status
  207. result.retry_at = timezone.now()
  208. result.save()
  209. updated_count += 1
  210. if not is_tty:
  211. write_record(result.to_json())
  212. except ArchiveResult.DoesNotExist:
  213. rprint(f'[yellow]ArchiveResult not found: {result_id}[/yellow]', file=sys.stderr)
  214. continue
  215. rprint(f'[green]Updated {updated_count} archive results[/green]', file=sys.stderr)
  216. return 0
  217. # =============================================================================
  218. # DELETE
  219. # =============================================================================
  220. def delete_archiveresults(yes: bool = False, dry_run: bool = False) -> int:
  221. """
  222. Delete ArchiveResults from stdin JSONL.
  223. Requires --yes flag to confirm deletion.
  224. Exit codes:
  225. 0: Success
  226. 1: No input or missing --yes flag
  227. """
  228. from archivebox.misc.jsonl import read_stdin
  229. from archivebox.core.models import ArchiveResult
  230. records = list(read_stdin())
  231. if not records:
  232. rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
  233. return 1
  234. result_ids = [r.get('id') for r in records if r.get('id')]
  235. if not result_ids:
  236. rprint('[yellow]No valid archive result IDs in input[/yellow]', file=sys.stderr)
  237. return 1
  238. results = ArchiveResult.objects.filter(id__in=result_ids)
  239. count = results.count()
  240. if count == 0:
  241. rprint('[yellow]No matching archive results found[/yellow]', file=sys.stderr)
  242. return 0
  243. if dry_run:
  244. rprint(f'[yellow]Would delete {count} archive results (dry run)[/yellow]', file=sys.stderr)
  245. for result in results[:10]:
  246. rprint(f' [dim]{result.id}[/dim] {result.plugin} {result.snapshot.url[:40]}', file=sys.stderr)
  247. if count > 10:
  248. rprint(f' ... and {count - 10} more', file=sys.stderr)
  249. return 0
  250. if not yes:
  251. rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr)
  252. return 1
  253. # Perform deletion
  254. deleted_count, _ = results.delete()
  255. rprint(f'[green]Deleted {deleted_count} archive results[/green]', file=sys.stderr)
  256. return 0
  257. # =============================================================================
  258. # CLI Commands
  259. # =============================================================================
  260. @click.group()
  261. def main():
  262. """Manage ArchiveResult records (plugin extraction results)."""
  263. pass
  264. @main.command('create')
  265. @click.option('--snapshot-id', help='Snapshot ID to create results for')
  266. @click.option('--plugin', '-p', help='Plugin name (e.g., screenshot, singlefile)')
  267. @click.option('--status', '-s', default='queued', help='Initial status (default: queued)')
  268. def create_cmd(snapshot_id: Optional[str], plugin: Optional[str], status: str):
  269. """Create ArchiveResults for Snapshots from stdin JSONL."""
  270. sys.exit(create_archiveresults(snapshot_id=snapshot_id, plugin=plugin, status=status))
  271. @main.command('list')
  272. @click.option('--status', '-s', help='Filter by status (queued, started, succeeded, failed, skipped)')
  273. @click.option('--plugin', '-p', help='Filter by plugin name')
  274. @click.option('--snapshot-id', help='Filter by snapshot ID')
  275. @click.option('--limit', '-n', type=int, help='Limit number of results')
  276. def list_cmd(status: Optional[str], plugin: Optional[str],
  277. snapshot_id: Optional[str], limit: Optional[int]):
  278. """List ArchiveResults as JSONL."""
  279. sys.exit(list_archiveresults(
  280. status=status,
  281. plugin=plugin,
  282. snapshot_id=snapshot_id,
  283. limit=limit,
  284. ))
  285. @main.command('update')
  286. @click.option('--status', '-s', help='Set status')
  287. def update_cmd(status: Optional[str]):
  288. """Update ArchiveResults from stdin JSONL."""
  289. sys.exit(update_archiveresults(status=status))
  290. @main.command('delete')
  291. @click.option('--yes', '-y', is_flag=True, help='Confirm deletion')
  292. @click.option('--dry-run', is_flag=True, help='Show what would be deleted')
  293. def delete_cmd(yes: bool, dry_run: bool):
  294. """Delete ArchiveResults from stdin JSONL."""
  295. sys.exit(delete_archiveresults(yes=yes, dry_run=dry_run))
  296. if __name__ == '__main__':
  297. main()