|
|
@@ -3,30 +3,29 @@ Rich Layout-based live progress display for ArchiveBox orchestrator.
|
|
|
|
|
|
Shows a comprehensive dashboard with:
|
|
|
- Top: Crawl queue status (full width)
|
|
|
-- Middle: 4-column grid of SnapshotWorker progress panels
|
|
|
+- Middle: Running process logs (dynamic panels)
|
|
|
- Bottom: Orchestrator/Daphne logs
|
|
|
"""
|
|
|
|
|
|
__package__ = 'archivebox.misc'
|
|
|
|
|
|
from datetime import datetime, timezone
|
|
|
-from typing import Dict, List, Optional, Any
|
|
|
+from typing import List, Optional, Any
|
|
|
from collections import deque
|
|
|
+from pathlib import Path
|
|
|
|
|
|
from rich import box
|
|
|
from rich.align import Align
|
|
|
-from rich.console import Console, Group, RenderableType
|
|
|
+from rich.console import Group
|
|
|
from rich.layout import Layout
|
|
|
+from rich.columns import Columns
|
|
|
from rich.panel import Panel
|
|
|
-from rich.progress import Progress, BarColumn, TextColumn, TaskProgressColumn, SpinnerColumn
|
|
|
-from rich.table import Table
|
|
|
from rich.text import Text
|
|
|
+from rich.table import Table
|
|
|
+from rich.tree import Tree
|
|
|
|
|
|
from archivebox.config import VERSION
|
|
|
|
|
|
-# Maximum number of SnapshotWorker columns to display
|
|
|
-MAX_WORKER_COLUMNS = 4
|
|
|
-
|
|
|
|
|
|
class CrawlQueuePanel:
|
|
|
"""Display crawl queue status across full width."""
|
|
|
@@ -35,6 +34,8 @@ class CrawlQueuePanel:
|
|
|
self.orchestrator_status = "Idle"
|
|
|
self.crawl_queue_count = 0
|
|
|
self.crawl_workers_count = 0
|
|
|
+ self.binary_queue_count = 0
|
|
|
+ self.binary_workers_count = 0
|
|
|
self.max_crawl_workers = 8
|
|
|
self.crawl_id: Optional[str] = None
|
|
|
|
|
|
@@ -51,19 +52,27 @@ class CrawlQueuePanel:
|
|
|
left_text.append(f"v{VERSION}", style="bold yellow")
|
|
|
left_text.append(f" • {datetime.now(timezone.utc).strftime('%H:%M:%S')}", style="grey53")
|
|
|
|
|
|
- # Center-left: Crawl queue status
|
|
|
+ # Center-left: Crawl + Binary queue status
|
|
|
queue_style = "yellow" if self.crawl_queue_count > 0 else "grey53"
|
|
|
center_left_text = Text()
|
|
|
center_left_text.append("Crawls: ", style="white")
|
|
|
center_left_text.append(str(self.crawl_queue_count), style=f"bold {queue_style}")
|
|
|
center_left_text.append(" queued", style="grey53")
|
|
|
+ center_left_text.append(" • Binaries: ", style="white")
|
|
|
+ binary_queue_style = "yellow" if self.binary_queue_count > 0 else "grey53"
|
|
|
+ center_left_text.append(str(self.binary_queue_count), style=f"bold {binary_queue_style}")
|
|
|
+ center_left_text.append(" queued", style="grey53")
|
|
|
|
|
|
- # Center-right: CrawlWorker status
|
|
|
+ # Center-right: Worker status
|
|
|
worker_style = "green" if self.crawl_workers_count > 0 else "grey53"
|
|
|
center_right_text = Text()
|
|
|
center_right_text.append("Workers: ", style="white")
|
|
|
center_right_text.append(f"{self.crawl_workers_count}/{self.max_crawl_workers}", style=f"bold {worker_style}")
|
|
|
- center_right_text.append(" active", style="grey53")
|
|
|
+ center_right_text.append(" crawl", style="grey53")
|
|
|
+ binary_worker_style = "green" if self.binary_workers_count > 0 else "grey53"
|
|
|
+ center_right_text.append(" • ", style="grey53")
|
|
|
+ center_right_text.append(str(self.binary_workers_count), style=f"bold {binary_worker_style}")
|
|
|
+ center_right_text.append(" binary", style="grey53")
|
|
|
|
|
|
# Right: Orchestrator status
|
|
|
status_color = "green" if self.crawl_workers_count > 0 else "grey53"
|
|
|
@@ -74,151 +83,302 @@ class CrawlQueuePanel:
|
|
|
right_text.append(f" [{self.crawl_id[:8]}]", style="grey53")
|
|
|
|
|
|
grid.add_row(left_text, center_left_text, center_right_text, right_text)
|
|
|
- return Panel(grid, style="white on blue", box=box.ROUNDED)
|
|
|
+ return Panel(grid, style="white on blue", box=box.HORIZONTALS)
|
|
|
|
|
|
|
|
|
-class SnapshotWorkerPanel:
|
|
|
- """Display progress for a single SnapshotWorker."""
|
|
|
class ProcessLogPanel:
    """Render one Process as a Rich panel: a markup title plus its latest log lines.

    ``process`` is duck-typed. Every attribute (``process_type``, ``worker_type``,
    ``pid``, ``status``, ``cmd``, ``url``, ``pwd``, ``started_at``, ``timeout``)
    is read with ``getattr`` and a default, and the log tails come from
    ``process.tail_stdout`` / ``process.tail_stderr``.
    """

    def __init__(self, process: Any, max_lines: int = 8, compact: bool | None = None):
        """Store the process and display settings.

        Args:
            process: Object describing the process to display (see class docstring).
            max_lines: Number of lines requested from each of stdout/stderr, and
                the line budget shared by header and log lines in the panel.
            compact: Stored on the instance for callers; rendering does not
                currently vary with it.
        """
        self.process = process
        self.max_lines = max_lines
        self.compact = compact

    def __rich__(self) -> Panel:
        """Build the panel: optional header lines followed by the newest log lines."""
        is_pending = self._is_pending()
        output_line = '' if is_pending else self._output_line()

        # Both streams are tailed together: if either call fails, no log lines
        # are shown rather than letting the live display crash.
        try:
            stdout_lines = list(self.process.tail_stdout(lines=self.max_lines, follow=False))
            stderr_lines = list(self.process.tail_stderr(lines=self.max_lines, follow=False))
        except Exception:
            stdout_lines, stderr_lines = [], []

        header_candidates = (self._chrome_launch_line(stderr_lines, stdout_lines), output_line)
        header_lines = [Text(line, style="grey53") for line in header_candidates if line]

        # stdout first (white), then stderr (cyan); blank lines are dropped.
        log_lines = [Text(line, style="white") for line in stdout_lines if line]
        log_lines += [Text(line, style="cyan") for line in stderr_lines if line]

        # Header lines eat into the budget, but at least one log line is kept.
        max_body = max(1, self.max_lines - len(header_lines))
        lines = header_lines + log_lines[-max_body:]
        content = Group(*lines) if lines else Text("")

        return Panel(
            content,
            title=self._title(),
            border_style="grey53" if is_pending else "cyan",
            box=box.HORIZONTALS,
            padding=(0, 1),
            height=2 if is_pending else None,
        )

    def _hook_path(self) -> Path | None:
        """Return ``cmd[1]`` as a Path, or None when it is missing or unusable."""
        try:
            cmd = getattr(self.process, 'cmd', [])
            return Path(cmd[1]) if len(cmd) > 1 else None
        except Exception:
            # cmd may be None or hold non-path values; treat as "no hook path".
            return None

    def _title(self) -> str:
        """Return the Rich-markup panel title: styled label plus grey details.

        Details are, in order and only when present: ``pid=...``, the worker
        suffix from ``_worker_label``, ``url=...``, and the elapsed/timeout
        suffix. Label, worker suffix and URL are escaped so square brackets in
        them are shown literally instead of being parsed as markup tags.
        """
        from rich.markup import escape

        process_type = getattr(self.process, 'process_type', 'process')
        worker_type = getattr(self.process, 'worker_type', '')
        pid = getattr(self.process, 'pid', None)

        label = process_type
        worker_suffix = ''
        if process_type == 'worker' and worker_type:
            label, worker_suffix = self._worker_label(worker_type)
        elif process_type == 'hook':
            hook_path = self._hook_path()
            hook_name = hook_path.name if hook_path else 'hook'
            plugin_name = (hook_path.parent.name if hook_path else '') or 'hook'
            label = f"{plugin_name}/{hook_name}"

        url = self._extract_url()
        detail_parts = (
            f"pid={pid}" if pid else '',
            escape(worker_suffix.strip()),
            escape(f"url={self._abbrev_url(url)}") if url else '',
            self._elapsed_suffix().strip(),
        )
        details = ' '.join(part for part in detail_parts if part)

        title_style = "grey53" if self._is_pending() else "bold white"
        title = f"[{title_style}]{escape(str(label))}[/{title_style}]"
        return f"{title} [grey53]{details}[/grey53]" if details else title

    def _is_background_hook(self) -> bool:
        """Return True for a hook process whose script name contains ``.bg.``."""
        if getattr(self.process, 'process_type', '') != 'hook':
            return False
        hook_path = self._hook_path()
        return hook_path is not None and '.bg.' in hook_path.name

    def _is_pending(self) -> bool:
        """Return True when status is queued/pending/backoff, or a hook has no pid."""
        if getattr(self.process, 'status', '') in ('queued', 'pending', 'backoff'):
            return True
        is_hook = getattr(self.process, 'process_type', '') == 'hook'
        return bool(is_hook and not getattr(self.process, 'pid', None))

    def _worker_label(self, worker_type: str) -> tuple[str, str]:
        """Return ``(label, suffix)`` for a worker process.

        For ``crawl`` and ``snapshot`` workers the suffix holds the last 8
        characters of the id taken from ``cmd`` and, when the database lookup
        succeeds, the abbreviated URL(s). Other types get ``worker:<type>``
        and an empty suffix.
        """
        cmd = getattr(self.process, 'cmd', []) or []

        if worker_type == 'crawl':
            crawl_id = self._extract_arg(cmd, '--crawl-id')
            suffix = ''
            if crawl_id:
                suffix = f" id={str(crawl_id)[-8:]}"
                try:
                    from archivebox.crawls.models import Crawl
                    crawl = Crawl.objects.filter(id=crawl_id).first()
                    urls = crawl.get_urls_list() if crawl else None
                    if urls:
                        suffix += f" urls={self._abbrev_urls(urls)}"
                except Exception:
                    # Best effort: the title still shows the id if the lookup fails.
                    pass
            return 'crawl', suffix

        if worker_type == 'snapshot':
            snapshot_id = self._extract_arg(cmd, '--snapshot-id')
            suffix = ''
            if snapshot_id:
                suffix = f" id={str(snapshot_id)[-8:]}"
                try:
                    from archivebox.core.models import Snapshot
                    snap = Snapshot.objects.filter(id=snapshot_id).first()
                    if snap and snap.url:
                        suffix += f" url={self._abbrev_url(snap.url, max_len=48)}"
                except Exception:
                    # Best effort: the title still shows the id if the lookup fails.
                    pass
            return 'snapshot', suffix

        return f"worker:{worker_type}", ''

    @staticmethod
    def _extract_arg(cmd: list[str], key: str) -> str | None:
        """Return the value of ``key`` from ``cmd`` (``key=value`` or ``key value``), else None."""
        for i, part in enumerate(cmd):
            if part.startswith(f'{key}='):
                return part.split('=', 1)[1]
            if part == key and i + 1 < len(cmd):
                return cmd[i + 1]
        return None

    def _abbrev_urls(self, urls: list[str], max_len: int = 48) -> str:
        """Return the first URL abbreviated, with ``,+N`` appended when N more follow."""
        if not urls:
            return ''
        first = self._abbrev_url(urls[0], max_len=max_len)
        if len(urls) == 1:
            return first
        return f"{first},+{len(urls) - 1}"

    def _extract_url(self) -> str:
        """Return ``process.url`` if set, else the ``--url`` value from ``cmd``, else ''."""
        url = getattr(self.process, 'url', None)
        if url:
            return str(url)
        cmd = getattr(self.process, 'cmd', []) or []
        value = self._extract_arg(cmd, '--url')
        return str(value).strip() if value else ''

    def _abbrev_url(self, url: str, max_len: int = 48) -> str:
        """Truncate ``url`` to at most ``max_len`` characters, ending in ``...`` when cut."""
        if not url:
            return ''
        if len(url) <= max_len:
            return url
        if max_len < 3:
            # No room for the ellipsis; hard-truncate so the result never
            # exceeds max_len.
            return url[:max(max_len, 0)]
        return f"{url[:max_len - 3]}..."

    @staticmethod
    def _format_chrome(pid: str, ws: str) -> str:
        """Return ``Chrome [pid=<pid>] [<ws>]``, or '' when both values are empty."""
        detail = ' '.join(part for part in (f"pid={pid}" if pid else '', ws) if part)
        return f"Chrome {detail}" if detail else ''

    def _chrome_launch_line(self, stderr_lines: list[str], stdout_lines: list[str]) -> str:
        """Return a one-line Chrome summary for ``chrome_launch`` hooks, else ''.

        The pid and CDP URL are taken from the first log lines containing
        ``PID:`` / ``CDP URL:``. If neither is found, ``cdp_url.txt`` and
        ``chrome.pid`` are read from the process ``pwd`` (resolved against
        DATA_DIR when relative). Any failure yields ''.
        """
        hook_path = self._hook_path()
        if hook_path is None or 'chrome_launch' not in hook_path.name:
            return ''

        try:
            pid = ''
            ws = ''
            for line in stderr_lines + stdout_lines:
                if not ws and 'CDP URL:' in line:
                    ws = line.split('CDP URL:', 1)[1].strip()
                if not pid and 'PID:' in line:
                    pid = line.split('PID:', 1)[1].strip()

            summary = self._format_chrome(pid, ws)
            if summary:
                return summary

            # Nothing in the logs: fall back to the files in the hook's directory.
            from archivebox import DATA_DIR
            pwd = getattr(self.process, 'pwd', None)
            if not pwd:
                return ''
            chrome_dir = Path(pwd)
            if not chrome_dir.is_absolute():
                chrome_dir = (Path(DATA_DIR) / chrome_dir).resolve()
            cdp_file = chrome_dir / 'cdp_url.txt'
            pid_file = chrome_dir / 'chrome.pid'
            if cdp_file.exists():
                ws = cdp_file.read_text().strip()
            if pid_file.exists():
                pid = pid_file.read_text().strip()
            return self._format_chrome(pid, ws)
        except Exception:
            # Purely informational line; never let it break rendering.
            return ''

    def _elapsed_suffix(self) -> str:
        """Return `` [<elapsed>/<timeout>s]``, or '' without ``started_at``/``timeout``."""
        started_at = getattr(self.process, 'started_at', None)
        timeout = getattr(self.process, 'timeout', None)
        if not started_at or not timeout:
            return ''
        try:
            # Compare like with like: aware start times against aware "now".
            now = datetime.now(timezone.utc) if started_at.tzinfo else datetime.now()
            elapsed = max(int((now - started_at).total_seconds()), 0)
            return f" [{elapsed}/{int(timeout)}s]"
        except Exception:
            return ''

    def _output_line(self) -> str:
        """Return the process ``pwd`` for display, relative to DATA_DIR when possible.

        Paths inside DATA_DIR (and relative paths) are shown as ``./<path>``.
        An absolute path outside DATA_DIR is shown unchanged. If anything
        fails, the raw ``pwd`` is returned.
        """
        pwd = getattr(self.process, 'pwd', None)
        if not pwd:
            return ''
        try:
            from archivebox import DATA_DIR
            path = Path(pwd)
            if path.is_absolute():
                try:
                    path = path.relative_to(Path(DATA_DIR))
                except ValueError:
                    # Not under DATA_DIR: prefixing "./" would give ".//abs/path".
                    return str(path)
            return f"./{path}"
        except Exception:
            return f"{pwd}"
|
|
|
|
|
|
|
|
|
-class CrawlWorkerLogPanel:
|
|
|
- """Display CrawlWorker logs by tailing stdout/stderr from Process."""
|
|
|
+class WorkerLogPanel:
|
|
|
+ """Display worker logs by tailing stdout/stderr from Process."""
|
|
|
|
|
|
- def __init__(self, max_lines: int = 8):
|
|
|
+ def __init__(self, title: str, empty_message: str, running_message: str, max_lines: int = 8):
|
|
|
+ self.title = title
|
|
|
+ self.empty_message = empty_message
|
|
|
+ self.running_message = running_message
|
|
|
self.log_lines: deque = deque(maxlen=max_lines * 2) # Allow more buffer
|
|
|
self.max_lines = max_lines
|
|
|
self.last_stdout_pos = 0 # Track file position for efficient tailing
|
|
|
self.last_stderr_pos = 0
|
|
|
+ self.last_process_running = False
|
|
|
|
|
|
def update_from_process(self, process: Any):
|
|
|
"""Update logs by tailing the Process stdout/stderr files."""
|
|
|
- from pathlib import Path
|
|
|
-
|
|
|
if not process:
|
|
|
+ self.last_process_running = False
|
|
|
return
|
|
|
|
|
|
- # Read new stdout lines since last read
|
|
|
+ # Use Process tail helpers for consistency
|
|
|
try:
|
|
|
- stdout_path = Path(process.stdout)
|
|
|
- if stdout_path.exists():
|
|
|
- with open(stdout_path, 'r') as f:
|
|
|
- # Seek to last read position
|
|
|
- f.seek(self.last_stdout_pos)
|
|
|
- new_lines = f.readlines()
|
|
|
-
|
|
|
- # Update position
|
|
|
- self.last_stdout_pos = f.tell()
|
|
|
-
|
|
|
- # Add new lines (up to max_lines to avoid overflow)
|
|
|
- for line in new_lines[-self.max_lines:]:
|
|
|
- line = line.rstrip('\n')
|
|
|
- if line and not line.startswith('['): # Skip Rich markup lines
|
|
|
- self.log_lines.append(('stdout', line))
|
|
|
+ self.last_process_running = bool(getattr(process, 'is_running', False))
|
|
|
+ stdout_lines = list(process.tail_stdout(lines=self.max_lines, follow=False))
|
|
|
+ stderr_lines = list(process.tail_stderr(lines=self.max_lines, follow=False))
|
|
|
except Exception:
|
|
|
- pass
|
|
|
+ return
|
|
|
|
|
|
- # Read new stderr lines since last read
|
|
|
- try:
|
|
|
- stderr_path = Path(process.stderr)
|
|
|
- if stderr_path.exists():
|
|
|
- with open(stderr_path, 'r') as f:
|
|
|
- f.seek(self.last_stderr_pos)
|
|
|
- new_lines = f.readlines()
|
|
|
-
|
|
|
- self.last_stderr_pos = f.tell()
|
|
|
-
|
|
|
- for line in new_lines[-self.max_lines:]:
|
|
|
- line = line.rstrip('\n')
|
|
|
- if line and not line.startswith('['): # Skip Rich markup lines
|
|
|
- self.log_lines.append(('stderr', line))
|
|
|
- except Exception:
|
|
|
- pass
|
|
|
+ self.log_lines.clear()
|
|
|
+
|
|
|
+ # Preserve ordering by showing stdout then stderr
|
|
|
+ for line in stdout_lines:
|
|
|
+ if line:
|
|
|
+ self.log_lines.append(('stdout', line))
|
|
|
+ for line in stderr_lines:
|
|
|
+ if line:
|
|
|
+ self.log_lines.append(('stderr', line))
|
|
|
|
|
|
def __rich__(self) -> Panel:
|
|
|
if not self.log_lines:
|
|
|
- content = Text("No CrawlWorker logs yet", style="grey53", justify="center")
|
|
|
+ message = self.running_message if self.last_process_running else self.empty_message
|
|
|
+ content = Text(message, style="grey53", justify="center")
|
|
|
else:
|
|
|
# Get the last max_lines for display
|
|
|
display_lines = list(self.log_lines)[-self.max_lines:]
|
|
|
@@ -236,9 +396,9 @@ class CrawlWorkerLogPanel:
|
|
|
|
|
|
return Panel(
|
|
|
content,
|
|
|
- title="[bold cyan]CrawlWorker Logs (stdout/stderr)",
|
|
|
+ title=f"[bold cyan]{self.title}",
|
|
|
border_style="cyan",
|
|
|
- box=box.ROUNDED,
|
|
|
+ box=box.HORIZONTALS,
|
|
|
)
|
|
|
|
|
|
|
|
|
@@ -270,10 +430,71 @@ class OrchestratorLogPanel:
|
|
|
content,
|
|
|
title="[bold white]Orchestrator / Daphne Logs",
|
|
|
border_style="white",
|
|
|
- box=box.ROUNDED,
|
|
|
+ box=box.HORIZONTALS,
|
|
|
)
|
|
|
|
|
|
|
|
|
class CrawlQueueTreePanel:
    """Display crawl queue with snapshots + hook summary in a tree view.

    Each crawl is a dict read with ``.get``: ``status``, ``label``, ``id`` and
    ``snapshots``. Each snapshot is a dict with ``status``, ``label`` and an
    optional ``hooks`` dict holding ``completed`` / ``running`` / ``pending``
    counts.
    """

    def __init__(self, max_crawls: int = 8, max_snapshots: int = 16):
        """Start with no crawls.

        Args:
            max_crawls: Maximum number of crawls kept by ``update_crawls``.
            max_snapshots: Maximum number of snapshots rendered per crawl.
        """
        self.crawls: list[dict[str, Any]] = []
        self.max_crawls = max_crawls
        self.max_snapshots = max_snapshots

    def update_crawls(self, crawls: list[dict[str, Any]]) -> None:
        """Update crawl tree data, keeping only the first ``max_crawls`` entries.

        ``None`` (or any empty value) clears the tree.
        """
        self.crawls = list(crawls or [])[:self.max_crawls]

    def __rich__(self) -> Panel:
        """Render one tree per crawl, or a placeholder when there are none."""
        if not self.crawls:
            content = Text("No active crawls", style="grey53", justify="center")
        else:
            content = Group(*(self._crawl_tree(crawl) for crawl in self.crawls))

        return Panel(
            content,
            title="[bold white]Crawl Queue",
            border_style="white",
            box=box.HORIZONTALS,
        )

    def _crawl_tree(self, crawl: dict[str, Any]) -> Tree:
        """Build the tree for one crawl: heading, snapshot rows, hook summaries."""
        # Coerce before slicing so a non-string or missing id cannot raise.
        crawl_id = str(crawl.get('id') or '')[:8]
        crawl_status = crawl.get('status', '')
        crawl_label = crawl.get('label', '')
        heading = Text(f"{self._status_icon(crawl_status)} {crawl_id} {crawl_label}", style="white")
        crawl_tree = Tree(heading, guide_style="grey53")

        for snap in list(crawl.get('snapshots') or [])[:self.max_snapshots]:
            snap_status = snap.get('status', '')
            snap_label = snap.get('label', '')
            snap_node = crawl_tree.add(Text(f"{self._status_icon(snap_status)} {snap_label}", style="white"))

            hooks = snap.get('hooks') or {}
            if hooks:
                completed = hooks.get('completed', 0)
                running = hooks.get('running', 0)
                pending = hooks.get('pending', 0)
                summary = f"✅ {completed} | ▶️ {running} | ⌛️ {pending}"
                snap_node.add(Text(summary, style="grey53"))
        return crawl_tree

    @staticmethod
    def _status_icon(status: str) -> str:
        """Map a status string to its icon; unrecognised statuses get a bullet."""
        if status in ('queued', 'pending'):
            return '⏳'
        if status in ('started', 'running'):
            return '▶'
        if status in ('sealed', 'done', 'completed'):
            return '✅'
        if status in ('failed', 'error'):
            return '✖'
        return '•'
|
|
|
+
|
|
|
+
|
|
|
class ArchiveBoxProgressLayout:
|
|
|
"""
|
|
|
Main layout manager for ArchiveBox orchestrator progress display.
|
|
|
@@ -281,15 +502,8 @@ class ArchiveBoxProgressLayout:
|
|
|
Layout structure:
|
|
|
┌─────────────────────────────────────────────────────────────┐
|
|
|
│ Crawl Queue (full width) │
|
|
|
- ├───────────────┬───────────────┬───────────────┬─────────────┤
|
|
|
- │ Snapshot │ Snapshot │ Snapshot │ Snapshot │
|
|
|
- │ Worker 1 │ Worker 2 │ Worker 3 │ Worker 4 │
|
|
|
- │ │ │ │ │
|
|
|
- │ Progress + │ Progress + │ Progress + │ Progress + │
|
|
|
- │ Stats + │ Stats + │ Stats + │ Stats + │
|
|
|
- │ Logs │ Logs │ Logs │ Logs │
|
|
|
- ├───────────────┴───────────────┴───────────────┴─────────────┤
|
|
|
- │ CrawlWorker Logs (stdout/stderr) │
|
|
|
+ ├─────────────────────────────────────────────────────────────┤
|
|
|
+ │ Running Process Logs (dynamic panels) │
|
|
|
├─────────────────────────────────────────────────────────────┤
|
|
|
│ Orchestrator / Daphne Logs │
|
|
|
└─────────────────────────────────────────────────────────────┘
|
|
|
@@ -303,51 +517,33 @@ class ArchiveBoxProgressLayout:
|
|
|
self.crawl_queue = CrawlQueuePanel()
|
|
|
self.crawl_queue.crawl_id = crawl_id
|
|
|
|
|
|
- # Create 4 worker panels
|
|
|
- self.worker_panels = [SnapshotWorkerPanel(i + 1) for i in range(MAX_WORKER_COLUMNS)]
|
|
|
-
|
|
|
- self.crawl_worker_log = CrawlWorkerLogPanel(max_lines=8)
|
|
|
+ self.process_panels: List[ProcessLogPanel] = []
|
|
|
self.orchestrator_log = OrchestratorLogPanel(max_events=8)
|
|
|
+ self.crawl_queue_tree = CrawlQueueTreePanel(max_crawls=8, max_snapshots=16)
|
|
|
|
|
|
# Create layout
|
|
|
self.layout = self._make_layout()
|
|
|
|
|
|
- # Track snapshot ID to worker panel mapping
|
|
|
- self.snapshot_to_worker: Dict[str, int] = {} # snapshot_id -> worker_panel_index
|
|
|
-
|
|
|
def _make_layout(self) -> Layout:
|
|
|
"""Define the layout structure."""
|
|
|
layout = Layout(name="root")
|
|
|
|
|
|
- # Top-level split: crawl_queue, workers, logs
|
|
|
+ # Top-level split: crawl_queue, workers, bottom
|
|
|
layout.split(
|
|
|
Layout(name="crawl_queue", size=3),
|
|
|
- Layout(name="workers", ratio=1),
|
|
|
- Layout(name="logs", size=20),
|
|
|
- )
|
|
|
-
|
|
|
- # Split workers into 4 columns
|
|
|
- layout["workers"].split_row(
|
|
|
- Layout(name="worker1"),
|
|
|
- Layout(name="worker2"),
|
|
|
- Layout(name="worker3"),
|
|
|
- Layout(name="worker4"),
|
|
|
- )
|
|
|
-
|
|
|
- # Split logs into crawl_worker_logs and orchestrator_logs
|
|
|
- layout["logs"].split(
|
|
|
- Layout(name="crawl_worker_logs", size=10),
|
|
|
- Layout(name="orchestrator_logs", size=10),
|
|
|
+ Layout(name="processes", ratio=1),
|
|
|
+ Layout(name="bottom", size=12),
|
|
|
)
|
|
|
|
|
|
# Assign components to layout sections
|
|
|
layout["crawl_queue"].update(self.crawl_queue)
|
|
|
- layout["worker1"].update(self.worker_panels[0])
|
|
|
- layout["worker2"].update(self.worker_panels[1])
|
|
|
- layout["worker3"].update(self.worker_panels[2])
|
|
|
- layout["worker4"].update(self.worker_panels[3])
|
|
|
- layout["crawl_worker_logs"].update(self.crawl_worker_log)
|
|
|
+ layout["processes"].update(Columns([]))
|
|
|
+ layout["bottom"].split_row(
|
|
|
+ Layout(name="orchestrator_logs", ratio=2),
|
|
|
+ Layout(name="crawl_tree", ratio=1),
|
|
|
+ )
|
|
|
layout["orchestrator_logs"].update(self.orchestrator_log)
|
|
|
+ layout["crawl_tree"].update(self.crawl_queue_tree)
|
|
|
|
|
|
return layout
|
|
|
|
|
|
@@ -356,82 +552,53 @@ class ArchiveBoxProgressLayout:
|
|
|
status: str,
|
|
|
crawl_queue_count: int = 0,
|
|
|
crawl_workers_count: int = 0,
|
|
|
+ binary_queue_count: int = 0,
|
|
|
+ binary_workers_count: int = 0,
|
|
|
max_crawl_workers: int = 8,
|
|
|
):
|
|
|
"""Update orchestrator status in the crawl queue panel."""
|
|
|
self.crawl_queue.orchestrator_status = status
|
|
|
self.crawl_queue.crawl_queue_count = crawl_queue_count
|
|
|
self.crawl_queue.crawl_workers_count = crawl_workers_count
|
|
|
+ self.crawl_queue.binary_queue_count = binary_queue_count
|
|
|
+ self.crawl_queue.binary_workers_count = binary_workers_count
|
|
|
self.crawl_queue.max_crawl_workers = max_crawl_workers
|
|
|
|
|
|
- def update_snapshot_worker(
|
|
|
- self,
|
|
|
- snapshot_id: str,
|
|
|
- url: str,
|
|
|
- total: int,
|
|
|
- completed: int,
|
|
|
- current_plugin: str = "",
|
|
|
- ):
|
|
|
- """Update or assign a snapshot to a worker panel."""
|
|
|
- # Find or assign worker panel for this snapshot
|
|
|
- if snapshot_id not in self.snapshot_to_worker:
|
|
|
- # Find first idle worker panel
|
|
|
- worker_idx = None
|
|
|
- for idx, panel in enumerate(self.worker_panels):
|
|
|
- if panel.status == "idle":
|
|
|
- worker_idx = idx
|
|
|
- break
|
|
|
-
|
|
|
- # If no idle worker, use round-robin (shouldn't happen often)
|
|
|
- if worker_idx is None:
|
|
|
- worker_idx = len(self.snapshot_to_worker) % MAX_WORKER_COLUMNS
|
|
|
-
|
|
|
- self.snapshot_to_worker[snapshot_id] = worker_idx
|
|
|
-
|
|
|
- # Get assigned worker panel
|
|
|
- worker_idx = self.snapshot_to_worker[snapshot_id]
|
|
|
- panel = self.worker_panels[worker_idx]
|
|
|
-
|
|
|
- # Update panel
|
|
|
- panel.snapshot_id = snapshot_id
|
|
|
- panel.snapshot_url = url
|
|
|
- panel.total_hooks = total
|
|
|
- panel.completed_hooks = completed
|
|
|
- panel.current_plugin = current_plugin
|
|
|
- panel.status = "working" if completed < total else "completed"
|
|
|
-
|
|
|
- def remove_snapshot_worker(self, snapshot_id: str):
|
|
|
- """Mark a snapshot worker as idle after completion."""
|
|
|
- if snapshot_id in self.snapshot_to_worker:
|
|
|
- worker_idx = self.snapshot_to_worker[snapshot_id]
|
|
|
- panel = self.worker_panels[worker_idx]
|
|
|
-
|
|
|
- # Mark as idle
|
|
|
- panel.status = "idle"
|
|
|
- panel.snapshot_id = None
|
|
|
- panel.snapshot_url = None
|
|
|
- panel.total_hooks = 0
|
|
|
- panel.completed_hooks = 0
|
|
|
- panel.current_plugin = None
|
|
|
- panel.recent_logs.clear()
|
|
|
-
|
|
|
- # Remove mapping
|
|
|
- del self.snapshot_to_worker[snapshot_id]
|
|
|
-
|
|
|
- def log_to_worker(self, snapshot_id: str, message: str, style: str = "white"):
|
|
|
- """Add a log message to a specific worker's panel."""
|
|
|
- if snapshot_id in self.snapshot_to_worker:
|
|
|
- worker_idx = self.snapshot_to_worker[snapshot_id]
|
|
|
- self.worker_panels[worker_idx].add_log(message, style)
|
|
|
-
|
|
|
- def log_event(self, message: str, style: str = "white"):
|
|
|
+ def update_process_panels(self, processes: List[Any], pending: Optional[List[Any]] = None) -> None:
|
|
|
+ """Update process panels to show all running processes."""
|
|
|
+ panels = []
|
|
|
+ all_processes = list(processes) + list(pending or [])
|
|
|
+ for process in all_processes:
|
|
|
+ is_hook = getattr(process, 'process_type', '') == 'hook'
|
|
|
+ is_bg = False
|
|
|
+ if is_hook:
|
|
|
+ try:
|
|
|
+ cmd = getattr(process, 'cmd', [])
|
|
|
+ hook_path = Path(cmd[1]) if len(cmd) > 1 else None
|
|
|
+ hook_name = hook_path.name if hook_path else ''
|
|
|
+ is_bg = '.bg.' in hook_name
|
|
|
+ except Exception:
|
|
|
+ is_bg = False
|
|
|
+ is_pending = getattr(process, 'status', '') in ('queued', 'pending', 'backoff') or (is_hook and not getattr(process, 'pid', None))
|
|
|
+ max_lines = 2 if is_pending else (4 if is_bg else 7)
|
|
|
+ panels.append(ProcessLogPanel(process, max_lines=max_lines, compact=is_bg))
|
|
|
+ if not panels:
|
|
|
+ self.layout["processes"].size = 0
|
|
|
+ self.layout["processes"].update(Text(""))
|
|
|
+ return
|
|
|
+
|
|
|
+ self.layout["processes"].size = None
|
|
|
+ self.layout["processes"].ratio = 1
|
|
|
+ self.layout["processes"].update(Columns(panels, equal=True, expand=True))
|
|
|
+
|
|
|
    def update_crawl_tree(self, crawls: list[dict[str, Any]]) -> None:
        """Update the crawl queue tree panel.

        Hands ``crawls`` to ``CrawlQueueTreePanel.update_crawls``, which keeps
        only the first ``max_crawls`` entries.
        """
        self.crawl_queue_tree.update_crawls(crawls)
|
|
|
+
|
|
|
    def log_event(self, message: str, style: str = "white") -> None:
        """Add an event to the orchestrator log.

        Args:
            message: Text of the event.
            style: Style name passed through unchanged to
                ``OrchestratorLogPanel.add_event`` (default ``"white"``).
        """
        self.orchestrator_log.add_event(message, style)
|
|
|
|
|
|
- def update_crawl_worker_logs(self, process: Any):
|
|
|
- """Update CrawlWorker logs by tailing the Process stdout/stderr files."""
|
|
|
- self.crawl_worker_log.update_from_process(process)
|
|
|
-
|
|
|
    def get_layout(self) -> Layout:
        """Get the Rich Layout object for rendering.

        Returns the layout stored on the instance, so the same object is
        returned on every call.
        """
        return self.layout
|