progress_layout.py 34 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862
  1. """
  2. Rich Layout-based live progress display for ArchiveBox orchestrator.
  3. Shows a comprehensive dashboard with:
  4. - Top: Crawl queue status (full width)
  5. - Middle: Crawl queue tree with hook outputs
  6. - Bottom: Running process logs (dynamic panels)
  7. """
  8. __package__ = 'archivebox.misc'
  9. from datetime import datetime, timezone
  10. import os
  11. import re
  12. from typing import List, Optional, Any
  13. from collections import deque
  14. from pathlib import Path
  15. from rich import box
  16. from rich.console import Group
  17. from rich.layout import Layout
  18. from rich.columns import Columns
  19. from rich.panel import Panel
  20. from rich.text import Text
  21. from rich.table import Table
  22. from rich.tree import Tree
  23. from rich.cells import cell_len
  24. from archivebox.config import VERSION
  25. _RICH_TAG_RE = re.compile(r'\[/?[^\]]+\]')
  26. def _strip_rich(text: str) -> str:
  27. return _RICH_TAG_RE.sub('', text or '').strip()
  28. class CrawlQueuePanel:
  29. """Display crawl queue status across full width."""
  30. def __init__(self):
  31. self.orchestrator_status = "Idle"
  32. self.crawl_queue_count = 0
  33. self.crawl_workers_count = 0
  34. self.binary_queue_count = 0
  35. self.binary_workers_count = 0
  36. self.max_crawl_workers = 8
  37. self.crawl_id: Optional[str] = None
  38. def __rich__(self) -> Panel:
  39. grid = Table.grid(expand=True)
  40. grid.add_column(justify="left", ratio=1)
  41. grid.add_column(justify="center", ratio=1)
  42. grid.add_column(justify="center", ratio=1)
  43. grid.add_column(justify="right", ratio=1)
  44. # Left: ArchiveBox version + timestamp
  45. left_text = Text()
  46. left_text.append("ArchiveBox ", style="bold cyan")
  47. left_text.append(f"v{VERSION}", style="bold yellow")
  48. left_text.append(f" • {datetime.now(timezone.utc).strftime('%H:%M:%S')}", style="grey53")
  49. # Center-left: Crawl + Binary queue status
  50. queue_style = "yellow" if self.crawl_queue_count > 0 else "grey53"
  51. center_left_text = Text()
  52. center_left_text.append("Crawls: ", style="white")
  53. center_left_text.append(str(self.crawl_queue_count), style=f"bold {queue_style}")
  54. center_left_text.append(" queued", style="grey53")
  55. center_left_text.append(" • Binaries: ", style="white")
  56. binary_queue_style = "yellow" if self.binary_queue_count > 0 else "grey53"
  57. center_left_text.append(str(self.binary_queue_count), style=f"bold {binary_queue_style}")
  58. center_left_text.append(" queued", style="grey53")
  59. # Center-right: Worker status
  60. worker_style = "green" if self.crawl_workers_count > 0 else "grey53"
  61. center_right_text = Text()
  62. center_right_text.append("Workers: ", style="white")
  63. center_right_text.append(f"{self.crawl_workers_count}/{self.max_crawl_workers}", style=f"bold {worker_style}")
  64. center_right_text.append(" crawl", style="grey53")
  65. binary_worker_style = "green" if self.binary_workers_count > 0 else "grey53"
  66. center_right_text.append(" • ", style="grey53")
  67. center_right_text.append(str(self.binary_workers_count), style=f"bold {binary_worker_style}")
  68. center_right_text.append(" binary", style="grey53")
  69. # Right: Orchestrator status
  70. status_color = "green" if self.crawl_workers_count > 0 else "grey53"
  71. right_text = Text()
  72. right_text.append("Status: ", style="white")
  73. right_text.append(self.orchestrator_status, style=f"bold {status_color}")
  74. if self.crawl_id:
  75. right_text.append(f" [{self.crawl_id[:8]}]", style="grey53")
  76. grid.add_row(left_text, center_left_text, center_right_text, right_text)
  77. return Panel(grid, style="white on blue", box=box.HORIZONTALS)
class ProcessLogPanel:
    """Display logs for a running Process.

    Wraps a single Process object (duck-typed: accessed entirely via getattr
    and tail_stdout/tail_stderr) and renders its recent output as a rich Panel.
    """

    def __init__(self, process: Any, max_lines: int = 8, compact: bool | None = None, bg_terminating: bool = False):
        # process: the Process-like object whose logs are displayed.
        # max_lines: max log lines tailed from each of stdout/stderr.
        # compact: explicit compact-mode override; None means "auto-detect
        #          from whether this is a background (.bg.) hook".
        # bg_terminating: set by the layout when only background hooks remain
        #                 running; colors this panel's border red.
        self.process = process
        self.max_lines = max_lines
        self.compact = compact
        self.bg_terminating = bg_terminating

    def __rich__(self) -> Panel:
        """Render the process's recent logs.

        NOTE(review): despite the -> Panel annotation, the completed-process
        branch returns a bare Text renderable — both are valid rich
        renderables, but the annotation is imprecise.
        """
        # Exited processes that produced output files collapse to one line.
        completed_line = self._completed_output_line()
        if completed_line:
            style = "green" if self._completed_ok() else "yellow"
            return Text(completed_line, style=style)
        is_pending = self._is_pending()
        # Pending processes have no working dir worth showing yet.
        output_line = '' if is_pending else self._output_line()
        stdout_lines = []
        stderr_lines = []
        try:
            stdout_lines = list(self.process.tail_stdout(lines=self.max_lines, follow=False))
            stderr_lines = list(self.process.tail_stderr(lines=self.max_lines, follow=False))
        except Exception:
            # Best-effort: missing/unreadable log files render as empty.
            stdout_lines = []
            stderr_lines = []
        header_lines = []
        # Special header for chrome_launch hooks: show Chrome pid / CDP URL.
        chrome_launch_line = self._chrome_launch_line(stderr_lines, stdout_lines)
        if chrome_launch_line:
            header_lines.append(Text(chrome_launch_line, style="grey53"))
        if output_line:
            header_lines.append(Text(output_line, style="grey53"))
        log_lines = []
        for line in stdout_lines:
            if line:
                log_lines.append(Text(line, style="white"))
        for line in stderr_lines:
            if line:
                log_lines.append(Text(line, style="cyan"))
        # NOTE(review): `compact` is computed here but never used below.
        compact = self.compact if self.compact is not None else self._is_background_hook()
        # Headers eat into the line budget, but always show at least one body line.
        max_body = max(1, self.max_lines - len(header_lines))
        # NOTE(review): this branch is a no-op (log_lines is already []).
        if not log_lines:
            log_lines = []
        lines = header_lines + log_lines[-max_body:]
        content = Group(*lines) if lines else Text("")
        title = self._title()
        border_style = self._border_style(is_pending=is_pending)
        # Pending panels are collapsed to a 2-row stub.
        height = 2 if is_pending else None
        return Panel(
            content,
            title=title,
            border_style=border_style,
            box=box.HORIZONTALS,
            padding=(0, 1),
            height=height,
        )

    def plain_lines(self) -> list[str]:
        """Return the same content as __rich__ but as plain strings.

        Used for non-TTY / log-file output.
        """
        completed_line = self._completed_output_line()
        if completed_line:
            return [completed_line]
        lines = []
        if not self._is_pending():
            output_line = self._output_line()
            if output_line:
                lines.append(output_line)
        try:
            stdout_lines = list(self.process.tail_stdout(lines=self.max_lines, follow=False))
            stderr_lines = list(self.process.tail_stderr(lines=self.max_lines, follow=False))
        except Exception:
            stdout_lines = []
            stderr_lines = []
        for line in stdout_lines:
            if line:
                lines.append(line)
        for line in stderr_lines:
            if line:
                lines.append(line)
        return lines

    def _title(self) -> str:
        """Build the panel title: label + pid/id/url/elapsed suffixes (rich markup)."""
        process_type = getattr(self.process, 'process_type', 'process')
        worker_type = getattr(self.process, 'worker_type', '')
        pid = getattr(self.process, 'pid', None)
        label = process_type
        if process_type == 'worker' and worker_type:
            label, worker_suffix = self._worker_label(worker_type)
        elif process_type == 'hook':
            # Hook label is derived from the script path: "plugin/hook_name".
            try:
                cmd = getattr(self.process, 'cmd', [])
                hook_path = Path(cmd[1]) if len(cmd) > 1 else None
                hook_name = hook_path.name if hook_path else 'hook'
                plugin_name = hook_path.parent.name if hook_path and hook_path.parent.name else 'hook'
            except Exception:
                hook_name = 'hook'
                plugin_name = 'hook'
            label = f"{plugin_name}/{hook_name}"
            worker_suffix = ''
        else:
            worker_suffix = ''
        url = self._extract_url()
        url_suffix = f" url={self._abbrev_url(url)}" if url else ""
        time_suffix = self._elapsed_suffix()
        title_style = "grey53" if self._is_pending() else "bold white"
        if pid:
            return f"[{title_style}]{label}[/{title_style}] [grey53]pid={pid}{worker_suffix}{url_suffix}{time_suffix}[/grey53]"
        return f"[{title_style}]{label}[/{title_style}]{f' [grey53]{worker_suffix.strip()} {url_suffix.strip()}{time_suffix}[/grey53]' if (worker_suffix or url_suffix or time_suffix) else ''}".rstrip()

    def _is_background_hook(self) -> bool:
        """True when this process is a hook whose filename contains '.bg.'."""
        if getattr(self.process, 'process_type', '') != 'hook':
            return False
        try:
            cmd = getattr(self.process, 'cmd', [])
            hook_path = Path(cmd[1]) if len(cmd) > 1 else None
            hook_name = hook_path.name if hook_path else ''
            return '.bg.' in hook_name
        except Exception:
            return False

    def _is_pending(self) -> bool:
        """True when the process has not actually started running yet."""
        status = getattr(self.process, 'status', '')
        if status in ('queued', 'pending', 'backoff'):
            return True
        # A hook without a pid hasn't been spawned yet.
        if getattr(self.process, 'process_type', '') == 'hook' and not getattr(self.process, 'pid', None):
            return True
        return False

    def _completed_ok(self) -> bool:
        """True when the process exited cleanly (exit code 0 or unknown)."""
        exit_code = getattr(self.process, 'exit_code', None)
        return exit_code in (0, None)

    def _completed_output_line(self) -> str:
        """Return the one-line summary shown for exited processes.

        Empty string unless the process has exited, has a displayable output
        path, AND produced at least one real output file.
        """
        status = getattr(self.process, 'status', '')
        if status != 'exited':
            return ''
        output_line = self._output_line()
        if not output_line:
            return ''
        if not self._has_output_files():
            return ''
        return output_line

    def _has_output_files(self) -> bool:
        """True if the process's working dir contains any non-bookkeeping file."""
        pwd = getattr(self.process, 'pwd', None)
        if not pwd:
            return False
        try:
            base = Path(pwd)
            if not base.exists():
                return False
            # Housekeeping files that don't count as archive output.
            ignore = {'stdout.log', 'stderr.log', 'cmd.sh', 'process.pid', 'hook.pid', 'listener.pid'}
            for path in base.rglob('*'):
                if path.is_file() and path.name not in ignore:
                    return True
        except Exception:
            return False
        return False

    def _border_style(self, is_pending: bool) -> str:
        """Pick the panel border color from the process state."""
        if is_pending:
            return "grey53"
        status = getattr(self.process, 'status', '')
        if status == 'exited':
            exit_code = getattr(self.process, 'exit_code', None)
            return "green" if exit_code in (0, None) else "yellow"
        is_hook = getattr(self.process, 'process_type', '') == 'hook'
        if is_hook and not self._is_background_hook():
            return "green"
        # Background hooks turn red while the orchestrator is winding them down.
        if is_hook and self._is_background_hook() and self.bg_terminating:
            return "red"
        return "cyan"

    def _worker_label(self, worker_type: str) -> tuple[str, str]:
        """Return (label, suffix) for a worker process title.

        For crawl/snapshot workers the suffix includes the short object id
        and, when the ORM lookup succeeds, the target URL(s).
        """
        cmd = getattr(self.process, 'cmd', []) or []
        if worker_type == 'crawl':
            crawl_id = self._extract_arg(cmd, '--crawl-id')
            suffix = ''
            if crawl_id:
                suffix = f" id={str(crawl_id)[-8:]}"
                # Best-effort DB lookup for the crawl's URL list.
                try:
                    from archivebox.crawls.models import Crawl
                    crawl = Crawl.objects.filter(id=crawl_id).first()
                    if crawl:
                        urls = crawl.get_urls_list()
                        if urls:
                            url_list = self._abbrev_urls(urls)
                            suffix += f" urls={url_list}"
                except Exception:
                    pass
            return 'crawl', suffix
        if worker_type == 'snapshot':
            snapshot_id = self._extract_arg(cmd, '--snapshot-id')
            suffix = ''
            if snapshot_id:
                suffix = f" id={str(snapshot_id)[-8:]}"
                # Best-effort DB lookup for the snapshot's URL.
                try:
                    from archivebox.core.models import Snapshot
                    snap = Snapshot.objects.filter(id=snapshot_id).first()
                    if snap and snap.url:
                        suffix += f" url={self._abbrev_url(snap.url, max_len=48)}"
                except Exception:
                    pass
            return 'snapshot', suffix
        return f"worker:{worker_type}", ''

    @staticmethod
    def _extract_arg(cmd: list[str], key: str) -> str | None:
        """Find a CLI arg value, supporting both `--key=value` and `--key value`."""
        for i, part in enumerate(cmd):
            if part.startswith(f'{key}='):
                return part.split('=', 1)[1]
            if part == key and i + 1 < len(cmd):
                return cmd[i + 1]
        return None

    def _abbrev_urls(self, urls: list[str], max_len: int = 48) -> str:
        """Abbreviate a URL list to the first URL plus a '+N' remainder count."""
        if not urls:
            return ''
        if len(urls) == 1:
            return self._abbrev_url(urls[0], max_len=max_len)
        first = self._abbrev_url(urls[0], max_len=max_len)
        return f"{first},+{len(urls) - 1}"

    def _extract_url(self) -> str:
        """Get the process's target URL from its attribute or its --url CLI arg."""
        url = getattr(self.process, 'url', None)
        if url:
            return str(url)
        cmd = getattr(self.process, 'cmd', []) or []
        for i, part in enumerate(cmd):
            if part.startswith('--url='):
                return part.split('=', 1)[1].strip()
            if part == '--url' and i + 1 < len(cmd):
                return str(cmd[i + 1]).strip()
        return ''

    def _abbrev_url(self, url: str, max_len: int = 48) -> str:
        """Truncate a URL to max_len characters with a '...' suffix."""
        if not url:
            return ''
        if len(url) <= max_len:
            return url
        return f"{url[:max_len - 3]}..."

    def _chrome_launch_line(self, stderr_lines: list[str], stdout_lines: list[str]) -> str:
        """For chrome_launch hooks, summarize the Chrome pid and CDP websocket URL.

        First scans the tailed log lines for 'PID:' / 'CDP URL:' markers, then
        falls back to reading chrome.pid / cdp_url.txt from the hook's working
        directory. Returns '' for non-chrome hooks or on any failure.
        """
        try:
            cmd = getattr(self.process, 'cmd', [])
            hook_path = Path(cmd[1]) if len(cmd) > 1 else None
            hook_name = hook_path.name if hook_path else ''
            if 'chrome_launch' not in hook_name:
                return ''
            pid = ''
            ws = ''
            for line in stderr_lines + stdout_lines:
                if not ws and 'CDP URL:' in line:
                    ws = line.split('CDP URL:', 1)[1].strip()
                if not pid and 'PID:' in line:
                    pid = line.split('PID:', 1)[1].strip()
            if pid and ws:
                return f"Chrome pid={pid} {ws}"
            if ws:
                return f"Chrome {ws}"
            if pid:
                return f"Chrome pid={pid}"
            # Fallback: read pid/CDP URL from files Chrome's launcher writes.
            try:
                from archivebox import DATA_DIR
                base = Path(DATA_DIR)
                pwd = getattr(self.process, 'pwd', None)
                if pwd:
                    chrome_dir = Path(pwd)
                    if not chrome_dir.is_absolute():
                        chrome_dir = (base / chrome_dir).resolve()
                    cdp_file = chrome_dir / 'cdp_url.txt'
                    pid_file = chrome_dir / 'chrome.pid'
                    if cdp_file.exists():
                        ws = cdp_file.read_text().strip()
                    if pid_file.exists():
                        pid = pid_file.read_text().strip()
                    if pid and ws:
                        return f"Chrome pid={pid} {ws}"
                    if ws:
                        return f"Chrome {ws}"
                    if pid:
                        return f"Chrome pid={pid}"
            except Exception:
                pass
        except Exception:
            return ''
        return ''

    def _elapsed_suffix(self) -> str:
        """Return ' [elapsed/timeouts]' for running processes, or ''."""
        started_at = getattr(self.process, 'started_at', None)
        timeout = getattr(self.process, 'timeout', None)
        if not started_at or not timeout:
            return ''
        try:
            # Match naive/aware arithmetic to started_at's tz-awareness.
            now = datetime.now(timezone.utc) if started_at.tzinfo else datetime.now()
            elapsed = int((now - started_at).total_seconds())
            elapsed = max(elapsed, 0)
            return f" [{elapsed}/{int(timeout)}s]"
        except Exception:
            return ''

    def _output_line(self) -> str:
        """Return the process working dir as a './relative' path under DATA_DIR."""
        pwd = getattr(self.process, 'pwd', None)
        if not pwd:
            return ''
        try:
            from archivebox import DATA_DIR
            rel = Path(pwd)
            base = Path(DATA_DIR)
            if rel.is_absolute():
                try:
                    rel = rel.relative_to(base)
                except Exception:
                    # pwd outside DATA_DIR: show it as-is.
                    pass
            rel_str = f"./{rel}" if not str(rel).startswith("./") else str(rel)
            return f"{rel_str}"
        except Exception:
            return f"{pwd}"
  375. class WorkerLogPanel:
  376. """Display worker logs by tailing stdout/stderr from Process."""
  377. def __init__(self, title: str, empty_message: str, running_message: str, max_lines: int = 8):
  378. self.title = title
  379. self.empty_message = empty_message
  380. self.running_message = running_message
  381. self.log_lines: deque = deque(maxlen=max_lines * 2) # Allow more buffer
  382. self.max_lines = max_lines
  383. self.last_stdout_pos = 0 # Track file position for efficient tailing
  384. self.last_stderr_pos = 0
  385. self.last_process_running = False
  386. def update_from_process(self, process: Any):
  387. """Update logs by tailing the Process stdout/stderr files."""
  388. if not process:
  389. self.last_process_running = False
  390. return
  391. # Use Process tail helpers for consistency
  392. try:
  393. self.last_process_running = bool(getattr(process, 'is_running', False))
  394. stdout_lines = list(process.tail_stdout(lines=self.max_lines, follow=False))
  395. stderr_lines = list(process.tail_stderr(lines=self.max_lines, follow=False))
  396. except Exception:
  397. return
  398. self.log_lines.clear()
  399. # Preserve ordering by showing stdout then stderr
  400. for line in stdout_lines:
  401. if line:
  402. self.log_lines.append(('stdout', line))
  403. for line in stderr_lines:
  404. if line:
  405. self.log_lines.append(('stderr', line))
  406. def __rich__(self) -> Panel:
  407. if not self.log_lines:
  408. message = self.running_message if self.last_process_running else self.empty_message
  409. content = Text(message, style="grey53", justify="center")
  410. else:
  411. # Get the last max_lines for display
  412. display_lines = list(self.log_lines)[-self.max_lines:]
  413. lines = []
  414. for stream, message in display_lines:
  415. line = Text()
  416. # Color code by stream - stderr is usually debug output
  417. if stream == 'stderr':
  418. # Rich formatted logs from stderr
  419. line.append(message, style="cyan")
  420. else:
  421. line.append(message, style="white")
  422. lines.append(line)
  423. content = Group(*lines)
  424. return Panel(
  425. content,
  426. title=f"[bold cyan]{self.title}",
  427. border_style="cyan",
  428. box=box.HORIZONTALS,
  429. )
class CrawlQueueTreePanel:
    """Display crawl queue with snapshots + hook summary in a tree view.

    Consumes plain dicts (produced elsewhere) with keys like 'status',
    'label', 'id', 'snapshots', 'hooks' — no ORM access happens here.
    """

    def __init__(self, max_crawls: int = 8, max_snapshots: int = 16):
        # Trimmed crawl dicts currently displayed.
        self.crawls: list[dict[str, Any]] = []
        self.max_crawls = max_crawls
        self.max_snapshots = max_snapshots

    def update_crawls(self, crawls: list[dict[str, Any]]) -> None:
        """Update crawl tree data."""
        self.crawls = crawls[:self.max_crawls]

    def __rich__(self) -> Panel:
        """Render crawls as trees: crawl -> snapshots -> output path + hooks."""
        if not self.crawls:
            content = Text("No active crawls", style="grey53", justify="center")
        else:
            trees = []
            for crawl in self.crawls:
                crawl_status = crawl.get('status', '')
                crawl_label = crawl.get('label', '')
                crawl_id = crawl.get('id', '')[:8]
                crawl_text = Text(f"{self._status_icon(crawl_status)} {crawl_id} {crawl_label}", style="white")
                crawl_tree = Tree(crawl_text, guide_style="grey53")
                snapshots = crawl.get('snapshots', [])[:self.max_snapshots]
                for snap in snapshots:
                    snap_status = snap.get('status', '')
                    snap_label = snap.get('label', '')
                    snap_text = Text(f"{self._status_icon(snap_status)} {snap_label}", style="white")
                    snap_node = crawl_tree.add(snap_text)
                    output_path = snap.get('output_path', '')
                    if output_path:
                        snap_node.add(Text(output_path, style="grey53"))
                    hooks = snap.get('hooks', []) or []
                    for hook in hooks:
                        status = hook.get('status', '')
                        path = hook.get('path', '')
                        size = hook.get('size', '')
                        elapsed = hook.get('elapsed', '')
                        timeout = hook.get('timeout', '')
                        is_bg = hook.get('is_bg', False)
                        is_running = hook.get('is_running', False)
                        is_pending = hook.get('is_pending', False)
                        icon, color = self._hook_style(status, is_bg=is_bg, is_running=is_running, is_pending=is_pending)
                        stats = self._hook_stats(size=size, elapsed=elapsed, timeout=timeout, status=status)
                        line = Text(f"{icon} {path}{stats}", style=color)
                        stderr_tail = hook.get('stderr', '')
                        if stderr_tail:
                            # Right-align a truncated stderr tail next to the hook line.
                            left_str = f"{icon} {path}{stats}"
                            avail = self._available_width(left_str, indent=16)
                            # NOTE(review): the getattr fallback is redundant —
                            # _truncate_tail always exists on this class.
                            trunc = getattr(self, "_truncate_tail", self._truncate_to_width)
                            stderr_tail = trunc(stderr_tail, avail)
                            if not stderr_tail:
                                # Nothing fits: fall back to a plain hook line.
                                snap_node.add(line)
                                continue
                            row = Table.grid(expand=True)
                            row.add_column(justify="left", ratio=1)
                            row.add_column(justify="right")
                            row.add_row(line, Text(stderr_tail, style="grey70"))
                            snap_node.add(row)
                        else:
                            snap_node.add(line)
                trees.append(crawl_tree)
            content = Group(*trees)
        return Panel(
            content,
            title="[bold white]Crawl Queue",
            border_style="white",
            box=box.HORIZONTALS,
        )

    @staticmethod
    def _status_icon(status: str) -> str:
        """Map a crawl/snapshot status string to a single display glyph."""
        if status in ('queued', 'pending'):
            return '⏳'
        if status in ('started', 'running'):
            return '▶'
        if status in ('sealed', 'done', 'completed'):
            return '✅'
        if status in ('failed', 'error'):
            return '✖'
        return '•'

    @staticmethod
    def _hook_style(status: str, is_bg: bool = False, is_running: bool = False, is_pending: bool = False) -> tuple[str, str]:
        """Map a hook's state to an (icon, color) pair.

        Terminal statuses win over the is_* flags; background running hooks
        get a distinct glyph from foreground ones.
        """
        if status == 'succeeded':
            return '✅', 'green'
        if status == 'failed':
            return '✖', 'red'
        if status == 'skipped':
            return '⏭', 'grey53'
        if is_pending:
            return '⌛️', 'grey53'
        if is_running and is_bg:
            return '᠁', 'cyan'
        if is_running:
            return '▶️', 'cyan'
        if status == 'started':
            return '▶️', 'cyan'
        return '•', 'grey53'

    @staticmethod
    def _hook_stats(size: str = '', elapsed: str = '', timeout: str = '', status: str = '') -> str:
        """Format a ' (size | elapsed)' stats suffix for a hook line.

        Finished hooks show size/elapsed; in-flight hooks show a '...' size
        placeholder and elapsed[/timeout]. Returns '' when nothing to show.
        """
        if status in ('succeeded', 'failed', 'skipped'):
            parts = []
            if size:
                parts.append(size)
            if elapsed:
                parts.append(elapsed)
            if not parts:
                return ''
            return f" ({' | '.join(parts)})"
        if elapsed or timeout:
            # NOTE(review): this condition is always true here (guarded by
            # the enclosing if), so size_part is always '...'.
            size_part = '...' if elapsed or timeout else ''
            time_part = ''
            if elapsed and timeout:
                time_part = f"{elapsed}/{timeout}"
            elif elapsed:
                time_part = f"{elapsed}"
            return f" ({size_part} | {time_part})" if time_part else f" ({size_part})"
        return ''

    @staticmethod
    def _terminal_width() -> int:
        """Current terminal width in columns; 120 when not attached to a tty."""
        try:
            return os.get_terminal_size().columns
        except OSError:
            return 120

    @staticmethod
    def _truncate_to_width(text: str, max_width: int) -> str:
        """Truncate keeping the HEAD of the text, using rich's ellipsis logic."""
        if not text or max_width <= 0:
            return ''
        t = Text(text)
        t.truncate(max_width, overflow="ellipsis")
        return t.plain

    @staticmethod
    def _truncate_tail(text: str, max_width: int) -> str:
        """Truncate keeping the TAIL of the text, prefixing '…' when cut."""
        if not text or max_width <= 0:
            return ''
        if cell_len(text) <= max_width:
            return text
        if max_width <= 1:
            return '…'
        return f"…{text[-(max_width - 1):]}"

    def _available_width(self, left_text: str, indent: int = 0) -> int:
        """Columns available for the right-aligned stderr tail.

        Space left after the hook line, tree indent, and padding — capped at
        2/5 of the terminal width so the tail never dominates the row.
        """
        width = self._terminal_width()
        base = max(0, width - cell_len(left_text) - indent - 6)
        cap = max(0, (width * 2) // 5)
        return max(0, min(base, cap))
class ArchiveBoxProgressLayout:
    """
    Main layout manager for ArchiveBox orchestrator progress display.
    Layout structure:
    ┌─────────────────────────────────────────────────────────────┐
    │ Crawl Queue (full width)                                    │
    ├─────────────────────────────────────────────────────────────┤
    │ Crawl Queue Tree (hooks + outputs)                          │
    ├─────────────────────────────────────────────────────────────┤
    │ Running Process Logs (dynamic panels)                       │
    └─────────────────────────────────────────────────────────────┘
    """

    def __init__(self, crawl_id: Optional[str] = None):
        self.crawl_id = crawl_id
        self.start_time = datetime.now(timezone.utc)
        # Create components
        self.crawl_queue = CrawlQueuePanel()
        self.crawl_queue.crawl_id = crawl_id
        self.process_panels: List[ProcessLogPanel] = []
        self.crawl_queue_tree = CrawlQueueTreePanel(max_crawls=8, max_snapshots=16)
        # Create layout
        self.layout = self._make_layout()

    def _make_layout(self) -> Layout:
        """Define the layout structure."""
        layout = Layout(name="root")
        # Top-level split: crawl_queue, crawl_tree, processes
        layout.split(
            Layout(name="crawl_queue", size=3),
            Layout(name="crawl_tree", size=20),
            Layout(name="processes", ratio=1),
        )
        # Assign components to layout sections
        layout["crawl_queue"].update(self.crawl_queue)
        layout["crawl_tree"].update(self.crawl_queue_tree)
        layout["processes"].update(Columns([]))
        return layout

    def update_orchestrator_status(
        self,
        status: str,
        crawl_queue_count: int = 0,
        crawl_workers_count: int = 0,
        binary_queue_count: int = 0,
        binary_workers_count: int = 0,
        max_crawl_workers: int = 8,
    ):
        """Update orchestrator status in the crawl queue panel."""
        self.crawl_queue.orchestrator_status = status
        self.crawl_queue.crawl_queue_count = crawl_queue_count
        self.crawl_queue.crawl_workers_count = crawl_workers_count
        self.crawl_queue.binary_queue_count = binary_queue_count
        self.crawl_queue.binary_workers_count = binary_workers_count
        self.crawl_queue.max_crawl_workers = max_crawl_workers

    def update_process_panels(self, processes: List[Any], pending: Optional[List[Any]] = None) -> None:
        """Update process panels to show all running processes.

        Background ('.bg.') hooks are excluded from panels; they are only
        used to decide bg_terminating (no foreground hooks left running or
        pending while processes remain).
        """
        panels = []
        all_processes = list(processes) + list(pending or [])
        # Detect whether any FOREGROUND hook is still running.
        fg_running = False
        for process in processes:
            if getattr(process, 'process_type', '') != 'hook':
                continue
            try:
                cmd = getattr(process, 'cmd', [])
                hook_path = Path(cmd[1]) if len(cmd) > 1 else None
                hook_name = hook_path.name if hook_path else ''
                if '.bg.' in hook_name:
                    continue
                # NOTE(review): this check is redundant — the '.bg.' case
                # was already skipped by the continue above.
                if '.bg.' not in hook_name:
                    fg_running = True
                    break
            except Exception:
                continue
        # Same detection for foreground hooks that are still pending.
        fg_pending = False
        for process in (pending or []):
            if getattr(process, 'process_type', '') != 'hook':
                continue
            try:
                cmd = getattr(process, 'cmd', [])
                hook_path = Path(cmd[1]) if len(cmd) > 1 else None
                hook_name = hook_path.name if hook_path else ''
                if '.bg.' in hook_name:
                    continue
                # NOTE(review): redundant check, as above.
                if '.bg.' not in hook_name:
                    fg_pending = True
                    break
            except Exception:
                continue
        # Only background work remains => those hooks are being wound down.
        bg_terminating = bool(processes) and not fg_running and not fg_pending
        for process in all_processes:
            is_hook = getattr(process, 'process_type', '') == 'hook'
            is_bg = False
            if is_hook:
                try:
                    cmd = getattr(process, 'cmd', [])
                    hook_path = Path(cmd[1]) if len(cmd) > 1 else None
                    hook_name = hook_path.name if hook_path else ''
                    is_bg = '.bg.' in hook_name
                except Exception:
                    is_bg = False
            # Background hooks never get their own panel.
            if is_hook and is_bg:
                continue
            # Skip processes with no output yet — nothing to show.
            if not self._has_log_lines(process):
                continue
            is_pending = getattr(process, 'status', '') in ('queued', 'pending', 'backoff') or (is_hook and not getattr(process, 'pid', None))
            # Smaller panels for pending/background processes.
            max_lines = 2 if is_pending else (4 if is_bg else 7)
            panels.append(ProcessLogPanel(process, max_lines=max_lines, compact=is_bg, bg_terminating=bg_terminating))
        if not panels:
            # Collapse the processes section entirely when there is nothing to show.
            self.layout["processes"].size = 0
            self.layout["processes"].update(Text(""))
            self.process_panels = []
            return
        self.process_panels = panels
        self.layout["processes"].size = None
        self.layout["processes"].ratio = 1
        self.layout["processes"].update(Columns(panels, equal=True, expand=True))

    def update_crawl_tree(self, crawls: list[dict[str, Any]]) -> None:
        """Update the crawl queue tree panel."""
        self.crawl_queue_tree.update_crawls(crawls)
        # Auto-size crawl tree panel to content
        line_count = 0
        for crawl in crawls:
            line_count += 1
            for snap in crawl.get('snapshots', []) or []:
                line_count += 1
                if snap.get('output_path'):
                    line_count += 1
                for _ in snap.get('hooks', []) or []:
                    line_count += 1
        # +2 for the panel borders; never shrink below 4 rows.
        self.layout["crawl_tree"].size = max(4, line_count + 2)

    def log_event(self, message: str, style: str = "white") -> None:
        """Add an event to the orchestrator log.

        NOTE(review): intentionally a no-op stub — events are currently
        discarded; kept for interface compatibility with callers.
        """
        return

    def get_layout(self) -> Layout:
        """Get the Rich Layout object for rendering."""
        return self.layout

    def plain_lines(self) -> list[tuple[str, str]]:
        """Return the whole dashboard as (section, text) pairs for plain output.

        Sections are 'crawl_queue', one per process panel title, and
        'crawl_tree' for the tree rows.
        """
        lines: list[tuple[str, str]] = []
        queue = self.crawl_queue
        queue_line = (
            f"Status: {queue.orchestrator_status} | Crawls: {queue.crawl_queue_count} queued | "
            f"Binaries: {queue.binary_queue_count} queued | Workers: {queue.crawl_workers_count}/{queue.max_crawl_workers} "
            f"crawl, {queue.binary_workers_count} binary"
        )
        lines.append(("crawl_queue", queue_line))
        for panel in self.process_panels:
            # Strip rich markup from the panel title for plain text use.
            title = _strip_rich(panel._title())
            for line in panel.plain_lines():
                if line:
                    lines.append((title or "process", line))
        for crawl in self.crawl_queue_tree.crawls:
            crawl_line = f"{self.crawl_queue_tree._status_icon(crawl.get('status', ''))} {crawl.get('id', '')[:8]} {crawl.get('label', '')}".strip()
            lines.append(("crawl_tree", crawl_line))
            for snap in crawl.get('snapshots', []):
                snap_line = f" {self.crawl_queue_tree._status_icon(snap.get('status', ''))} {snap.get('label', '')}".rstrip()
                lines.append(("crawl_tree", snap_line))
                output_path = snap.get('output_path', '')
                if output_path:
                    lines.append(("crawl_tree", f" {output_path}"))
                for hook in snap.get('hooks', []) or []:
                    status = hook.get('status', '')
                    path = hook.get('path', '')
                    icon, _ = self.crawl_queue_tree._hook_style(
                        status,
                        is_bg=hook.get('is_bg', False),
                        is_running=hook.get('is_running', False),
                        is_pending=hook.get('is_pending', False),
                    )
                    stats = self.crawl_queue_tree._hook_stats(
                        size=hook.get('size', ''),
                        elapsed=hook.get('elapsed', ''),
                        timeout=hook.get('timeout', ''),
                        status=status,
                    )
                    stderr_tail = hook.get('stderr', '')
                    hook_line = f" {icon} {path}{stats}".strip()
                    if stderr_tail:
                        avail = self.crawl_queue_tree._available_width(hook_line, indent=16)
                        # NOTE(review): getattr fallback is redundant —
                        # _truncate_tail always exists on CrawlQueueTreePanel.
                        trunc = getattr(self.crawl_queue_tree, "_truncate_tail", self.crawl_queue_tree._truncate_to_width)
                        stderr_tail = trunc(stderr_tail, avail)
                        if stderr_tail:
                            hook_line = f"{hook_line} {stderr_tail}"
                    if hook_line:
                        lines.append(("crawl_tree", hook_line))
        return lines

    @staticmethod
    def _has_log_lines(process: Any) -> bool:
        """True if the process has written at least one non-blank log line."""
        try:
            stdout_lines = list(process.tail_stdout(lines=1, follow=False))
            if any(line.strip() for line in stdout_lines):
                return True
            stderr_lines = list(process.tail_stderr(lines=1, follow=False))
            if any(line.strip() for line in stderr_lines):
                return True
        except Exception:
            # Unreadable/missing log files count as "no output".
            return False
        return False