# models.py
  1. __package__ = 'archivebox.crawls'
  2. from typing import TYPE_CHECKING, Iterable
  3. from datetime import timedelta
  4. from archivebox.uuid_compat import uuid7
  5. from pathlib import Path
  6. from django.db import models
  7. from django.db.models import QuerySet
  8. from django.core.validators import MaxValueValidator, MinValueValidator
  9. from django.conf import settings
  10. from django.urls import reverse_lazy
  11. from django.utils import timezone
  12. from django_stubs_ext.db.models import TypedModelMeta
  13. from statemachine import State, registry
  14. from rich import print
  15. from archivebox.config import CONSTANTS
  16. from archivebox.base_models.models import ModelWithUUID, ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, get_or_create_system_user_pk
  17. from archivebox.workers.models import ModelWithStateMachine, BaseStateMachine
  18. if TYPE_CHECKING:
  19. from archivebox.core.models import Snapshot, ArchiveResult
  20. class CrawlSchedule(ModelWithUUID, ModelWithNotes):
  21. id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
  22. created_at = models.DateTimeField(default=timezone.now, db_index=True)
  23. created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False)
  24. modified_at = models.DateTimeField(auto_now=True)
  25. template: 'Crawl' = models.ForeignKey('Crawl', on_delete=models.CASCADE, null=False, blank=False) # type: ignore
  26. schedule = models.CharField(max_length=64, blank=False, null=False)
  27. is_enabled = models.BooleanField(default=True)
  28. label = models.CharField(max_length=64, blank=True, null=False, default='')
  29. notes = models.TextField(blank=True, null=False, default='')
  30. crawl_set: models.Manager['Crawl']
  31. class Meta(TypedModelMeta):
  32. app_label = 'crawls'
  33. verbose_name = 'Scheduled Crawl'
  34. verbose_name_plural = 'Scheduled Crawls'
  35. def __str__(self) -> str:
  36. urls_preview = self.template.urls[:64] if self.template and self.template.urls else ""
  37. return f'[{self.id}] {urls_preview} @ {self.schedule}'
  38. @property
  39. def api_url(self) -> str:
  40. return reverse_lazy('api-1:get_any', args=[self.id])
  41. def save(self, *args, **kwargs):
  42. self.label = self.label or (self.template.label if self.template else '')
  43. super().save(*args, **kwargs)
  44. if self.template:
  45. self.template.schedule = self
  46. self.template.save()
  47. class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWithStateMachine):
  48. id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
  49. created_at = models.DateTimeField(default=timezone.now, db_index=True)
  50. created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False)
  51. modified_at = models.DateTimeField(auto_now=True)
  52. urls = models.TextField(blank=False, null=False, help_text='Newline-separated list of URLs to crawl')
  53. config = models.JSONField(default=dict, null=True, blank=True)
  54. max_depth = models.PositiveSmallIntegerField(default=0, validators=[MinValueValidator(0), MaxValueValidator(4)])
  55. tags_str = models.CharField(max_length=1024, blank=True, null=False, default='')
  56. persona_id = models.UUIDField(null=True, blank=True)
  57. label = models.CharField(max_length=64, blank=True, null=False, default='')
  58. notes = models.TextField(blank=True, null=False, default='')
  59. schedule = models.ForeignKey(CrawlSchedule, on_delete=models.SET_NULL, null=True, blank=True, editable=True)
  60. output_dir = models.CharField(max_length=512, null=False, blank=True, default='')
  61. status = ModelWithStateMachine.StatusField(choices=ModelWithStateMachine.StatusChoices, default=ModelWithStateMachine.StatusChoices.QUEUED)
  62. retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
  63. state_machine_name = 'archivebox.crawls.models.CrawlMachine'
  64. retry_at_field_name = 'retry_at'
  65. state_field_name = 'status'
  66. StatusChoices = ModelWithStateMachine.StatusChoices
  67. active_state = StatusChoices.STARTED
  68. snapshot_set: models.Manager['Snapshot']
  69. class Meta(TypedModelMeta):
  70. app_label = 'crawls'
  71. verbose_name = 'Crawl'
  72. verbose_name_plural = 'Crawls'
  73. def __str__(self):
  74. first_url = self.get_urls_list()[0] if self.get_urls_list() else ''
  75. # Show last 8 digits of UUID and more of the URL
  76. short_id = str(self.id)[-8:]
  77. return f'[...{short_id}] {first_url[:120]}'
  78. def save(self, *args, **kwargs):
  79. is_new = self._state.adding
  80. super().save(*args, **kwargs)
  81. if is_new:
  82. from archivebox.misc.logging_util import log_worker_event
  83. first_url = self.get_urls_list()[0] if self.get_urls_list() else ''
  84. log_worker_event(
  85. worker_type='DB',
  86. event='Created Crawl',
  87. indent_level=1,
  88. metadata={
  89. 'id': str(self.id),
  90. 'first_url': first_url[:64],
  91. 'max_depth': self.max_depth,
  92. 'status': self.status,
  93. },
  94. )
  95. @property
  96. def api_url(self) -> str:
  97. return reverse_lazy('api-1:get_crawl', args=[self.id])
  98. def to_json(self) -> dict:
  99. """
  100. Convert Crawl model instance to a JSON-serializable dict.
  101. """
  102. from archivebox.config import VERSION
  103. return {
  104. 'type': 'Crawl',
  105. 'schema_version': VERSION,
  106. 'id': str(self.id),
  107. 'urls': self.urls,
  108. 'status': self.status,
  109. 'max_depth': self.max_depth,
  110. 'tags_str': self.tags_str,
  111. 'label': self.label,
  112. 'created_at': self.created_at.isoformat() if self.created_at else None,
  113. }
  114. @staticmethod
  115. def from_json(record: dict, overrides: dict = None):
  116. """
  117. Create or get a Crawl from a JSON dict.
  118. Args:
  119. record: Dict with 'urls' (required), optional 'max_depth', 'tags_str', 'label'
  120. overrides: Dict of field overrides (e.g., created_by_id)
  121. Returns:
  122. Crawl instance or None if invalid
  123. """
  124. from django.utils import timezone
  125. overrides = overrides or {}
  126. # Check if crawl already exists by ID
  127. crawl_id = record.get('id')
  128. if crawl_id:
  129. try:
  130. return Crawl.objects.get(id=crawl_id)
  131. except Crawl.DoesNotExist:
  132. pass
  133. # Get URLs - can be string (newline-separated) or from 'url' field
  134. urls = record.get('urls', '')
  135. if not urls and record.get('url'):
  136. urls = record['url']
  137. if not urls:
  138. return None
  139. # Create new crawl (status stays QUEUED, not started)
  140. crawl = Crawl.objects.create(
  141. urls=urls,
  142. max_depth=record.get('max_depth', record.get('depth', 0)),
  143. tags_str=record.get('tags_str', record.get('tags', '')),
  144. label=record.get('label', ''),
  145. status=Crawl.StatusChoices.QUEUED,
  146. retry_at=timezone.now(),
  147. **overrides,
  148. )
  149. return crawl
  150. @property
  151. def output_dir(self) -> Path:
  152. """
  153. Construct output directory: users/{username}/crawls/{YYYYMMDD}/{domain}/{crawl-id}
  154. Domain is extracted from the first URL in the crawl.
  155. """
  156. from archivebox import DATA_DIR
  157. from archivebox.core.models import Snapshot
  158. date_str = self.created_at.strftime('%Y%m%d')
  159. urls = self.get_urls_list()
  160. domain = Snapshot.extract_domain_from_url(urls[0]) if urls else 'unknown'
  161. return DATA_DIR / 'users' / self.created_by.username / 'crawls' / date_str / domain / str(self.id)
  162. def get_urls_list(self) -> list[str]:
  163. """Get list of URLs from urls field, filtering out comments and empty lines."""
  164. if not self.urls:
  165. return []
  166. return [
  167. url.strip()
  168. for url in self.urls.split('\n')
  169. if url.strip() and not url.strip().startswith('#')
  170. ]
  171. def add_url(self, entry: dict) -> bool:
  172. """
  173. Add a URL to the crawl queue if not already present.
  174. Args:
  175. entry: dict with 'url', optional 'depth', 'title', 'timestamp', 'tags', 'via_snapshot', 'plugin'
  176. Returns:
  177. True if URL was added, False if skipped (duplicate or depth exceeded)
  178. """
  179. import json
  180. url = entry.get('url', '')
  181. if not url:
  182. return False
  183. depth = entry.get('depth', 1)
  184. # Skip if depth exceeds max_depth
  185. if depth > self.max_depth:
  186. return False
  187. # Skip if already a Snapshot for this crawl
  188. if self.snapshot_set.filter(url=url).exists():
  189. return False
  190. # Check if already in urls (parse existing JSONL entries)
  191. existing_urls = set()
  192. for line in self.urls.splitlines():
  193. if not line.strip():
  194. continue
  195. try:
  196. existing_entry = json.loads(line)
  197. existing_urls.add(existing_entry.get('url', ''))
  198. except json.JSONDecodeError:
  199. existing_urls.add(line.strip())
  200. if url in existing_urls:
  201. return False
  202. # Append as JSONL
  203. jsonl_entry = json.dumps(entry)
  204. self.urls = (self.urls.rstrip() + '\n' + jsonl_entry).lstrip('\n')
  205. self.save(update_fields=['urls', 'modified_at'])
  206. return True
  207. def create_snapshots_from_urls(self) -> list['Snapshot']:
  208. """
  209. Create Snapshot objects for each URL in self.urls that doesn't already exist.
  210. Returns:
  211. List of newly created Snapshot objects
  212. """
  213. import sys
  214. import json
  215. from archivebox.core.models import Snapshot
  216. created_snapshots = []
  217. print(f'[cyan]DEBUG create_snapshots_from_urls: self.urls={repr(self.urls)}[/cyan]', file=sys.stderr)
  218. print(f'[cyan]DEBUG create_snapshots_from_urls: lines={self.urls.splitlines()}[/cyan]', file=sys.stderr)
  219. for line in self.urls.splitlines():
  220. if not line.strip():
  221. continue
  222. # Parse JSONL or plain URL
  223. try:
  224. entry = json.loads(line)
  225. url = entry.get('url', '')
  226. depth = entry.get('depth', 0)
  227. title = entry.get('title')
  228. timestamp = entry.get('timestamp')
  229. tags = entry.get('tags', '')
  230. except json.JSONDecodeError:
  231. url = line.strip()
  232. depth = 0
  233. title = None
  234. timestamp = None
  235. tags = ''
  236. if not url:
  237. continue
  238. # Skip if depth exceeds max_depth
  239. if depth > self.max_depth:
  240. continue
  241. # Create snapshot if doesn't exist
  242. snapshot, created = Snapshot.objects.get_or_create(
  243. url=url,
  244. crawl=self,
  245. defaults={
  246. 'depth': depth,
  247. 'title': title,
  248. 'timestamp': timestamp or str(timezone.now().timestamp()),
  249. 'status': Snapshot.INITIAL_STATE,
  250. 'retry_at': timezone.now(),
  251. # Note: created_by removed in 0.9.0 - Snapshot inherits from Crawl
  252. }
  253. )
  254. if created:
  255. created_snapshots.append(snapshot)
  256. # Save tags if present
  257. if tags:
  258. snapshot.save_tags(tags.split(','))
  259. # Ensure crawl -> snapshot symlink exists for both new and existing snapshots
  260. try:
  261. snapshot.ensure_crawl_symlink()
  262. except Exception:
  263. pass
  264. return created_snapshots
  265. def run(self) -> 'Snapshot | None':
  266. """
  267. Execute this Crawl: run hooks, process JSONL, create snapshots.
  268. Called by the state machine when entering the 'started' state.
  269. Returns:
  270. The root Snapshot for this crawl, or None for system crawls that don't create snapshots
  271. """
  272. import time
  273. from pathlib import Path
  274. from archivebox.hooks import run_hook, discover_hooks, process_hook_records
  275. from archivebox.config.configset import get_config
  276. # Debug logging to file (since stdout/stderr redirected to /dev/null in progress mode)
  277. debug_log = Path('/tmp/archivebox_crawl_debug.log')
  278. with open(debug_log, 'a') as f:
  279. f.write(f'\n=== Crawl.run() starting for {self.id} at {time.time()} ===\n')
  280. f.flush()
  281. # Get merged config with crawl context
  282. config = get_config(crawl=self)
  283. # Discover and run on_Crawl hooks
  284. with open(debug_log, 'a') as f:
  285. f.write(f'Discovering Crawl hooks...\n')
  286. f.flush()
  287. hooks = discover_hooks('Crawl', config=config)
  288. with open(debug_log, 'a') as f:
  289. f.write(f'Found {len(hooks)} hooks\n')
  290. f.flush()
  291. for hook in hooks:
  292. with open(debug_log, 'a') as f:
  293. f.write(f'Running hook: {hook.name}\n')
  294. f.flush()
  295. hook_start = time.time()
  296. plugin_name = hook.parent.name
  297. output_dir = self.output_dir / plugin_name
  298. output_dir.mkdir(parents=True, exist_ok=True)
  299. # Run hook using Process.launch() - returns Process model
  300. process = run_hook(
  301. hook,
  302. output_dir=output_dir,
  303. config=config,
  304. crawl_id=str(self.id),
  305. source_url=self.urls, # Pass full newline-separated URLs
  306. )
  307. with open(debug_log, 'a') as f:
  308. f.write(f'Hook {hook.name} completed with status={process.status}\n')
  309. f.flush()
  310. hook_elapsed = time.time() - hook_start
  311. if hook_elapsed > 0.5: # Log slow hooks
  312. print(f'[yellow]โฑ๏ธ Hook {hook.name} took {hook_elapsed:.2f}s[/yellow]')
  313. # Background hook - still running
  314. if process.status == process.StatusChoices.RUNNING:
  315. continue
  316. # Foreground hook - process JSONL records
  317. from archivebox.hooks import extract_records_from_process
  318. records = extract_records_from_process(process)
  319. if records:
  320. print(f'[cyan]๐Ÿ“ Processing {len(records)} records from {hook.name}[/cyan]')
  321. for record in records[:3]: # Show first 3
  322. print(f' Record: type={record.get("type")}, keys={list(record.keys())[:5]}')
  323. overrides = {'crawl': self}
  324. stats = process_hook_records(records, overrides=overrides)
  325. if stats:
  326. print(f'[green]โœ“ Created: {stats}[/green]')
  327. # Ensure any newly declared binaries are installed before creating snapshots
  328. from archivebox.machine.models import Binary, Machine
  329. from django.utils import timezone
  330. machine = Machine.current()
  331. while True:
  332. pending_binaries = Binary.objects.filter(
  333. machine=machine,
  334. status=Binary.StatusChoices.QUEUED,
  335. retry_at__lte=timezone.now(),
  336. ).order_by('retry_at')
  337. if not pending_binaries.exists():
  338. break
  339. for binary in pending_binaries:
  340. try:
  341. binary.sm.tick()
  342. except Exception:
  343. continue
  344. # Exit if nothing else is immediately retryable
  345. if not Binary.objects.filter(
  346. machine=machine,
  347. status=Binary.StatusChoices.QUEUED,
  348. retry_at__lte=timezone.now(),
  349. ).exists():
  350. break
  351. # Create snapshots from all URLs in self.urls
  352. with open(debug_log, 'a') as f:
  353. f.write(f'Creating snapshots from URLs...\n')
  354. f.flush()
  355. created_snapshots = self.create_snapshots_from_urls()
  356. with open(debug_log, 'a') as f:
  357. f.write(f'Created {len(created_snapshots)} snapshots\n')
  358. f.write(f'=== Crawl.run() complete ===\n\n')
  359. f.flush()
  360. # Return first snapshot for this crawl (newly created or existing)
  361. # This ensures the crawl doesn't seal if snapshots exist, even if they weren't just created
  362. return self.snapshot_set.first()
  363. def is_finished(self) -> bool:
  364. """Check if crawl is finished (all snapshots sealed or no snapshots exist)."""
  365. from archivebox.core.models import Snapshot
  366. # Check if any snapshots exist for this crawl
  367. snapshots = Snapshot.objects.filter(crawl=self)
  368. # If no snapshots exist, allow finishing (e.g., archivebox://install crawls that only run hooks)
  369. if not snapshots.exists():
  370. return True
  371. # If snapshots exist, check if all are sealed
  372. if snapshots.filter(status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]).exists():
  373. return False
  374. return True
  375. def cleanup(self):
  376. """Clean up background hooks and run on_CrawlEnd hooks."""
  377. from archivebox.hooks import run_hook, discover_hooks
  378. from archivebox.machine.models import Process
  379. # Kill any background Crawl hooks using Process records
  380. # Find all running hook Processes that are children of this crawl's workers
  381. # (CrawlWorker already kills its hooks via on_shutdown, but this is backup for orphans)
  382. running_hooks = Process.objects.filter(
  383. parent__worker_type='crawl',
  384. process_type=Process.TypeChoices.HOOK,
  385. status=Process.StatusChoices.RUNNING,
  386. ).distinct()
  387. for process in running_hooks:
  388. # Use Process.kill_tree() to gracefully kill parent + children
  389. killed_count = process.kill_tree(graceful_timeout=2.0)
  390. if killed_count > 0:
  391. print(f'[yellow]๐Ÿ”ช Killed {killed_count} orphaned crawl hook process(es)[/yellow]')
  392. # Clean up .pid files from output directory
  393. if self.output_dir.exists():
  394. for pid_file in self.output_dir.glob('**/*.pid'):
  395. pid_file.unlink(missing_ok=True)
  396. # Run on_CrawlEnd hooks
  397. from archivebox.config.configset import get_config
  398. config = get_config(crawl=self)
  399. hooks = discover_hooks('CrawlEnd', config=config)
  400. for hook in hooks:
  401. plugin_name = hook.parent.name
  402. output_dir = self.output_dir / plugin_name
  403. output_dir.mkdir(parents=True, exist_ok=True)
  404. process = run_hook(
  405. hook,
  406. output_dir=output_dir,
  407. config=config,
  408. crawl_id=str(self.id),
  409. source_url=self.urls, # Pass full newline-separated URLs
  410. )
  411. # Log failures but don't block
  412. if process.exit_code != 0:
  413. print(f'[yellow]โš ๏ธ CrawlEnd hook failed: {hook.name}[/yellow]')
  414. # =============================================================================
  415. # State Machines
  416. # =============================================================================
  417. class CrawlMachine(BaseStateMachine, strict_states=True):
  418. """
  419. State machine for managing Crawl lifecycle.
  420. Hook Lifecycle:
  421. โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”
  422. โ”‚ QUEUED State โ”‚
  423. โ”‚ โ€ข Waiting for crawl to be ready (has URLs) โ”‚
  424. โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜
  425. โ†“ tick() when can_start()
  426. โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”
  427. โ”‚ STARTED State โ†’ enter_started() โ”‚
  428. โ”‚ 1. crawl.run() โ”‚
  429. โ”‚ โ€ข discover_hooks('Crawl') โ†’ finds all crawl hooks โ”‚
  430. โ”‚ โ€ข For each hook: โ”‚
  431. โ”‚ - run_hook(script, output_dir, ...) โ”‚
  432. โ”‚ - Parse JSONL from hook output โ”‚
  433. โ”‚ - process_hook_records() โ†’ creates Snapshots โ”‚
  434. โ”‚ โ€ข create_snapshots_from_urls() โ†’ from self.urls field โ”‚
  435. โ”‚ โ”‚
  436. โ”‚ 2. Snapshots process independently with their own โ”‚
  437. โ”‚ state machines (see SnapshotMachine) โ”‚
  438. โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜
  439. โ†“ tick() when is_finished()
  440. โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”
  441. โ”‚ SEALED State โ†’ enter_sealed() โ”‚
  442. โ”‚ โ€ข cleanup() โ†’ runs on_CrawlEnd hooks, kills background โ”‚
  443. โ”‚ โ€ข Set retry_at=None (no more processing) โ”‚
  444. โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜
  445. """
  446. model_attr_name = 'crawl'
  447. # States
  448. queued = State(value=Crawl.StatusChoices.QUEUED, initial=True)
  449. started = State(value=Crawl.StatusChoices.STARTED)
  450. sealed = State(value=Crawl.StatusChoices.SEALED, final=True)
  451. # Tick Event (polled by workers)
  452. tick = (
  453. queued.to.itself(unless='can_start') |
  454. queued.to(started, cond='can_start') |
  455. started.to(sealed, cond='is_finished')
  456. )
  457. # Manual event (triggered by last Snapshot sealing)
  458. seal = started.to(sealed)
  459. def can_start(self) -> bool:
  460. if not self.crawl.urls:
  461. print(f'[red]โš ๏ธ Crawl {self.crawl.id} cannot start: no URLs[/red]')
  462. return False
  463. urls_list = self.crawl.get_urls_list()
  464. if not urls_list:
  465. print(f'[red]โš ๏ธ Crawl {self.crawl.id} cannot start: no valid URLs in urls field[/red]')
  466. return False
  467. return True
  468. def is_finished(self) -> bool:
  469. """Check if all Snapshots for this crawl are finished."""
  470. return self.crawl.is_finished()
  471. @started.enter
  472. def enter_started(self):
  473. import sys
  474. from archivebox.core.models import Snapshot
  475. print(f'[cyan]๐Ÿ”„ CrawlMachine.enter_started() - creating snapshots for {self.crawl.id}[/cyan]', file=sys.stderr)
  476. try:
  477. # Run the crawl - runs hooks, processes JSONL, creates snapshots
  478. first_snapshot = self.crawl.run()
  479. if first_snapshot:
  480. print(f'[cyan]๐Ÿ”„ Created {self.crawl.snapshot_set.count()} snapshot(s), first: {first_snapshot.url}[/cyan]', file=sys.stderr)
  481. # Update status to STARTED
  482. # Set retry_at to near future so tick() can poll and check is_finished()
  483. self.crawl.update_and_requeue(
  484. retry_at=timezone.now() + timedelta(seconds=2),
  485. status=Crawl.StatusChoices.STARTED,
  486. )
  487. else:
  488. # No snapshots (system crawl like archivebox://install)
  489. print(f'[cyan]๐Ÿ”„ No snapshots created, sealing crawl immediately[/cyan]', file=sys.stderr)
  490. # Seal immediately since there's no work to do
  491. self.seal()
  492. except Exception as e:
  493. print(f'[red]โš ๏ธ Crawl {self.crawl.id} failed to start: {e}[/red]')
  494. import traceback
  495. traceback.print_exc()
  496. raise
  497. @sealed.enter
  498. def enter_sealed(self):
  499. # Clean up background hooks and run on_CrawlEnd hooks
  500. self.crawl.cleanup()
  501. self.crawl.update_and_requeue(
  502. retry_at=None,
  503. status=Crawl.StatusChoices.SEALED,
  504. )
  505. # =============================================================================
  506. # Register State Machines
  507. # =============================================================================
  508. # Manually register state machines with python-statemachine registry
  509. # (normally auto-discovered from statemachines.py, but we define them here for clarity)
  510. registry.register(CrawlMachine)