logging_util.py 26 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724
  1. __package__ = 'archivebox'
  2. # High-level logging functions for CLI output and progress tracking
  3. # Low-level primitives (Rich console, ANSI colors) are in logging.py
  4. import re
  5. import os
  6. import sys
  7. import time
  8. from math import log
  9. from multiprocessing import Process
  10. from pathlib import Path
  11. from datetime import datetime, timezone
  12. from dataclasses import dataclass
  13. from typing import Any, Optional, List, Dict, Union, Iterable, IO, TYPE_CHECKING
  14. if TYPE_CHECKING:
  15. from archivebox.core.models import Snapshot
  16. from rich import print
  17. from rich.panel import Panel
  18. from django.core.management.base import DjangoHelpFormatter
  19. from archivebox.config import CONSTANTS, DATA_DIR, VERSION
  20. from archivebox.config.common import SHELL_CONFIG
  21. from archivebox.misc.system import get_dir_size
  22. from archivebox.misc.util import enforce_types
  23. from archivebox.misc.logging import ANSI, stderr
@dataclass
class RuntimeStats:
    """mutable stats counter for logging archiving timing info to CLI output"""

    # Per-run tallies; log_snapshot_archiving_finished() bumps exactly one of
    # these for every snapshot it is called for.
    skipped: int = 0
    succeeded: int = 0
    failed: int = 0

    # Stage timestamps. Each one stays None until the matching log_* function
    # records it:
    #   parse_*      -> log_importing_started() / log_parsing_finished()
    #   index_*      -> log_indexing_process_started() / log_indexing_process_finished()
    #   archiving_*  -> log_archiving_started() / log_archiving_paused() / log_archiving_finished()
    parse_start_ts: Optional[datetime] = None
    parse_end_ts: Optional[datetime] = None

    index_start_ts: Optional[datetime] = None
    index_end_ts: Optional[datetime] = None

    archiving_start_ts: Optional[datetime] = None
    archiving_end_ts: Optional[datetime] = None
# globals are bad, mmkay
# Module-level RuntimeStats instance that the log_* functions in this module
# mutate to carry counters and timestamps from one logging call to the next.
_LAST_RUN_STATS = RuntimeStats()
class TimedProgress:
    """Show a progress bar and measure elapsed time until .end() is called"""

    def __init__(self, seconds, prefix=''):
        """
        Record the start timestamp and, when SHELL_CONFIG.SHOW_PROGRESS is truthy,
        start progress_bar(seconds, prefix, ANSI) in a separate Process.
        """
        self.SHOW_PROGRESS = SHELL_CONFIG.SHOW_PROGRESS
        self.ANSI = SHELL_CONFIG.ANSI
        if self.SHOW_PROGRESS:
            # self.p is only set when the progress bar is enabled; end() checks
            # the same flag before touching it
            self.p = Process(target=progress_bar, args=(seconds, prefix, self.ANSI))
            self.p.start()

        # end_ts stays None until end() is called
        self.stats = {'start_ts': datetime.now(timezone.utc), 'end_ts': None}

    def end(self):
        """immediately end progress, clear the progressbar line, and save end_ts"""

        end_ts = datetime.now(timezone.utc)
        self.stats['end_ts'] = end_ts

        if self.SHOW_PROGRESS:
            # terminate if we haven't already terminated
            try:
                # kill the progress bar subprocess
                try:
                    self.p.close()   # must be closed *before* it's terminated
                except (KeyboardInterrupt, SystemExit):
                    # leave the cursor on a fresh line, then let the interrupt propagate
                    print()
                    raise
                except BaseException:                                           # lgtm [py/catch-base-exception]
                    # any other failure to close is ignored on purpose
                    # NOTE(review): multiprocessing documents that close() raises
                    # ValueError while the child is still running, which would
                    # land here in the common case - confirm
                    pass
                self.p.terminate()
                time.sleep(0.1)
                # sometimes the timer doesn't terminate properly, then blocks at the join until
                # the full time has elapsed. sending a kill tries to avoid that.
                try:
                    self.p.kill()
                except Exception:
                    pass

                # clear whole terminal line
                try:
                    sys.stdout.write('\r{}{}\r'.format((' ' * SHELL_CONFIG.TERM_WIDTH), self.ANSI['reset']))
                except (IOError, BrokenPipeError):
                    # ignore when the parent proc has stopped listening to our stdout
                    pass
            except ValueError:
                # a ValueError from terminate()/kill()/the write above aborts the
                # rest of the cleanup silently (the line may be left uncleared)
                pass
  78. @enforce_types
  79. def progress_bar(seconds: int, prefix: str='', ANSI: Dict[str, str]=ANSI) -> None:
  80. """show timer in the form of progress bar, with percentage and seconds remaining"""
  81. output_buf = (sys.stdout or sys.__stdout__ or sys.stderr or sys.__stderr__)
  82. chunk = '█' if output_buf and output_buf.encoding.upper() == 'UTF-8' else '#'
  83. last_width = SHELL_CONFIG.TERM_WIDTH
  84. chunks = last_width - len(prefix) - 20 # number of progress chunks to show (aka max bar width)
  85. try:
  86. for s in range(seconds * chunks):
  87. max_width = SHELL_CONFIG.TERM_WIDTH
  88. if max_width < last_width:
  89. # when the terminal size is shrunk, we have to write a newline
  90. # otherwise the progress bar will keep wrapping incorrectly
  91. sys.stdout.write('\r\n')
  92. sys.stdout.flush()
  93. chunks = max_width - len(prefix) - 20
  94. pct_complete = s / chunks / seconds * 100
  95. log_pct = (log(pct_complete or 1, 10) / 2) * 100 # everyone likes faster progress bars ;)
  96. bar_width = round(log_pct/(100/chunks))
  97. last_width = max_width
  98. # ████████████████████ 0.9% (1/60sec)
  99. sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)'.format(
  100. prefix,
  101. ANSI['green' if pct_complete < 80 else 'lightyellow'],
  102. (chunk * bar_width).ljust(chunks),
  103. ANSI['reset'],
  104. round(pct_complete, 1),
  105. round(s/chunks),
  106. seconds,
  107. ))
  108. sys.stdout.flush()
  109. time.sleep(1 / chunks)
  110. # ██████████████████████████████████ 100.0% (60/60sec)
  111. sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)'.format(
  112. prefix,
  113. ANSI['red'],
  114. chunk * chunks,
  115. ANSI['reset'],
  116. 100.0,
  117. seconds,
  118. seconds,
  119. ))
  120. sys.stdout.flush()
  121. # uncomment to have it disappear when it hits 100% instead of staying full red:
  122. # time.sleep(0.5)
  123. # sys.stdout.write('\r{}{}\r'.format((' ' * SHELL_CONFIG.TERM_WIDTH), ANSI['reset']))
  124. # sys.stdout.flush()
  125. except (KeyboardInterrupt, BrokenPipeError):
  126. print()
  127. def log_cli_command(subcommand: str, subcommand_args: Iterable[str]=(), stdin: str | IO | None=None, pwd: str='.'):
  128. args = ' '.join(subcommand_args)
  129. version_msg = '[dark_magenta]\\[{now}][/dark_magenta] [dark_red]ArchiveBox[/dark_red] [dark_goldenrod]v{VERSION}[/dark_goldenrod]: [green4]archivebox [green3]{subcommand}[green2] {args}[/green2]'.format(
  130. now=datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S'),
  131. VERSION=VERSION,
  132. subcommand=subcommand,
  133. args=args,
  134. )
  135. # stderr()
  136. # stderr('[bright_black] > {pwd}[/]'.format(pwd=pwd, **ANSI))
  137. # stderr()
  138. print(Panel(version_msg), file=sys.stderr)
  139. ### Parsing Stage
  140. def log_importing_started(urls: Union[str, List[str]], depth: int, index_only: bool):
  141. _LAST_RUN_STATS.parse_start_ts = datetime.now(timezone.utc)
  142. print('[green][+] [{}] Adding {} links to index (crawl depth={}){}...[/]'.format(
  143. _LAST_RUN_STATS.parse_start_ts.strftime('%Y-%m-%d %H:%M:%S'),
  144. len(urls) if isinstance(urls, list) else len(urls.split('\n')),
  145. depth,
  146. ' (index only)' if index_only else '',
  147. ))
  148. def log_source_saved(source_file: str):
  149. print(' > Saved verbatim input to {}/{}'.format(CONSTANTS.SOURCES_DIR_NAME, source_file.rsplit('/', 1)[-1]))
  150. def log_parsing_finished(num_parsed: int, parser_name: str):
  151. _LAST_RUN_STATS.parse_end_ts = datetime.now(timezone.utc)
  152. print(' > Parsed {} URLs from input ({})'.format(num_parsed, parser_name))
  153. def log_deduping_finished(num_new_links: int):
  154. print(' > Found {} new URLs not already in index'.format(num_new_links))
  155. def log_crawl_started(new_links):
  156. print()
  157. print(f'[green][*] Starting crawl of {len(new_links)} sites 1 hop out from starting point[/]')
  158. ### Indexing Stage
  159. def log_indexing_process_started(num_links: int):
  160. start_ts = datetime.now(timezone.utc)
  161. _LAST_RUN_STATS.index_start_ts = start_ts
  162. print()
  163. print('[bright_black][*] [{}] Writing {} links to main index...[/]'.format(
  164. start_ts.strftime('%Y-%m-%d %H:%M:%S'),
  165. num_links,
  166. ))
  167. def log_indexing_process_finished():
  168. end_ts = datetime.now(timezone.utc)
  169. _LAST_RUN_STATS.index_end_ts = end_ts
  170. def log_indexing_started(out_path: str):
  171. if SHELL_CONFIG.IS_TTY:
  172. sys.stdout.write(f' > ./{Path(out_path).relative_to(DATA_DIR)}')
  173. def log_indexing_finished(out_path: str):
  174. print(f'\r √ ./{Path(out_path).relative_to(DATA_DIR)}')
  175. ### Archiving Stage
  176. def log_archiving_started(num_links: int, resume: Optional[float]=None):
  177. start_ts = datetime.now(timezone.utc)
  178. _LAST_RUN_STATS.archiving_start_ts = start_ts
  179. print()
  180. if resume:
  181. print('[green][▶] [{}] Resuming archive updating for {} pages starting from {}...[/]'.format(
  182. start_ts.strftime('%Y-%m-%d %H:%M:%S'),
  183. num_links,
  184. resume,
  185. ))
  186. else:
  187. print('[green][▶] [{}] Starting archiving of {} snapshots in index...[/]'.format(
  188. start_ts.strftime('%Y-%m-%d %H:%M:%S'),
  189. num_links,
  190. ))
  191. def log_archiving_paused(num_links: int, idx: int, timestamp: str):
  192. end_ts = datetime.now(timezone.utc)
  193. _LAST_RUN_STATS.archiving_end_ts = end_ts
  194. print()
  195. print('\n[yellow3][X] [{now}] Downloading paused on link {timestamp} ({idx}/{total})[/]'.format(
  196. now=end_ts.strftime('%Y-%m-%d %H:%M:%S'),
  197. idx=idx+1,
  198. timestamp=timestamp,
  199. total=num_links,
  200. ))
  201. print()
  202. print(' Continue archiving where you left off by running:')
  203. print(' archivebox update --resume={}'.format(timestamp))
  204. def log_archiving_finished(num_links: int):
  205. from archivebox.core.models import Snapshot
  206. end_ts = datetime.now(timezone.utc)
  207. _LAST_RUN_STATS.archiving_end_ts = end_ts
  208. assert _LAST_RUN_STATS.archiving_start_ts is not None
  209. seconds = end_ts.timestamp() - _LAST_RUN_STATS.archiving_start_ts.timestamp()
  210. if seconds > 60:
  211. duration = '{0:.2f} min'.format(seconds / 60)
  212. else:
  213. duration = '{0:.2f} sec'.format(seconds)
  214. print()
  215. print('[green][√] [{}] Update of {} pages complete ({})[/]'.format(
  216. end_ts.strftime('%Y-%m-%d %H:%M:%S'),
  217. num_links,
  218. duration,
  219. ))
  220. print(' - {} links skipped'.format(_LAST_RUN_STATS.skipped))
  221. print(' - {} links updated'.format(_LAST_RUN_STATS.succeeded + _LAST_RUN_STATS.failed))
  222. print(' - {} links had errors'.format(_LAST_RUN_STATS.failed))
  223. if Snapshot.objects.count() < 50:
  224. print()
  225. print(' [violet]Hint:[/] To manage your archive in a Web UI, run:')
  226. print(' archivebox server 0.0.0.0:8000')
  227. def log_snapshot_archiving_started(snapshot: "Snapshot", out_dir: str, is_new: bool):
  228. # [*] [2019-03-22 13:46:45] "Log Structured Merge Trees - ben stopford"
  229. # http://www.benstopford.com/2015/02/14/log-structured-merge-trees/
  230. # > output/archive/1478739709
  231. print('\n[[{symbol_color}]{symbol}[/]] [[{symbol_color}]{now}[/]] "{title}"'.format(
  232. symbol_color='green' if is_new else 'bright_black',
  233. symbol='+' if is_new else '√',
  234. now=datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S'),
  235. title=snapshot.title or snapshot.base_url,
  236. ))
  237. print(f' [sky_blue1]{snapshot.url}[/]')
  238. print(' {} {}'.format(
  239. '>' if is_new else '√',
  240. pretty_path(out_dir),
  241. ))
  242. def log_snapshot_archiving_finished(snapshot: "Snapshot", out_dir: str, is_new: bool, stats: dict, start_ts: datetime):
  243. total = sum(stats.values())
  244. if stats['failed'] > 0 :
  245. _LAST_RUN_STATS.failed += 1
  246. elif stats['skipped'] == total:
  247. _LAST_RUN_STATS.skipped += 1
  248. else:
  249. _LAST_RUN_STATS.succeeded += 1
  250. try:
  251. size = get_dir_size(out_dir)
  252. except FileNotFoundError:
  253. size = (0, None, '0')
  254. end_ts = datetime.now(timezone.utc)
  255. duration = str(end_ts - start_ts).split('.')[0]
  256. print(' [bright_black]{} files ({}) in {}s [/]'.format(size[2], printable_filesize(size[0]), duration))
  257. def log_archive_method_started(method: str):
  258. print(' > {}'.format(method))
  259. def log_archive_method_finished(result: dict):
  260. """
  261. quote the argument with whitespace in a command so the user can
  262. copy-paste the outputted string directly to run the cmd
  263. """
  264. # Prettify CMD string and make it safe to copy-paste by quoting arguments
  265. quoted_cmd = ' '.join(
  266. '"{}"'.format(arg) if (' ' in arg) or (':' in arg) else arg
  267. for arg in result['cmd']
  268. )
  269. if result['status'] == 'failed':
  270. output = result.get('output')
  271. if output and output.__class__.__name__ == 'TimeoutExpired':
  272. duration = (result['end_ts'] - result['start_ts']).seconds
  273. hint_header = [
  274. f'[yellow3]Extractor timed out after {duration}s.[/]',
  275. ]
  276. else:
  277. error_name = output.__class__.__name__.replace('ArchiveError', '') if output else 'Error'
  278. hint_header = [
  279. '[yellow3]Extractor failed:[/]',
  280. f' {error_name} [red1]{output}[/]',
  281. ]
  282. # Prettify error output hints string and limit to five lines
  283. hints = getattr(output, 'hints', None) or () if output else ()
  284. if hints:
  285. if isinstance(hints, (list, tuple, type(_ for _ in ()))):
  286. hints = [hint.decode() if isinstance(hint, bytes) else str(hint) for hint in hints]
  287. else:
  288. if isinstance(hints, bytes):
  289. hints = hints.decode()
  290. hints = hints.split('\n')
  291. hints = (
  292. f' [yellow1]{line.strip()}[/]'
  293. for line in list(hints)[:5] if line.strip()
  294. )
  295. docker_hints = ()
  296. if os.environ.get('IN_DOCKER') in ('1', 'true', 'True', 'TRUE', 'yes'):
  297. docker_hints = (
  298. ' docker run -it -v $PWD/data:/data archivebox/archivebox /bin/bash',
  299. )
  300. # Collect and prefix output lines with indentation
  301. output_lines = [
  302. *hint_header,
  303. *hints,
  304. '[violet]Run to see full output:[/]',
  305. *docker_hints,
  306. *([' cd {};'.format(result.get('pwd'))] if result.get('pwd') else []),
  307. ' {}'.format(quoted_cmd),
  308. ]
  309. print('\n'.join(
  310. ' {}'.format(line)
  311. for line in output_lines
  312. if line
  313. ))
  314. print()
  315. def log_list_started(filter_patterns: Optional[List[str]], filter_type: str):
  316. print(f'[green][*] Finding links in the archive index matching these {filter_type} patterns:[/]')
  317. print(' {}'.format(' '.join(filter_patterns or ())))
  318. def log_list_finished(snapshots):
  319. from archivebox.core.models import Snapshot
  320. print()
  321. print('---------------------------------------------------------------------------------------------------')
  322. print(Snapshot.objects.filter(pk__in=[s.pk for s in snapshots]).to_csv(cols=['timestamp', 'is_archived', 'num_outputs', 'url'], header=True, ljust=16, separator=' | '))
  323. print('---------------------------------------------------------------------------------------------------')
  324. print()
  325. def log_removal_started(snapshots, yes: bool, delete: bool):
  326. count = snapshots.count() if hasattr(snapshots, 'count') else len(snapshots)
  327. print(f'[yellow3][i] Found {count} matching URLs to remove.[/]')
  328. if delete:
  329. file_counts = [s.num_outputs for s in snapshots if os.access(s.output_dir, os.R_OK)]
  330. print(
  331. f' {count} Links will be de-listed from the main index, and their archived content folders will be deleted from disk.\n'
  332. f' ({len(file_counts)} data folders with {sum(file_counts)} archived files will be deleted!)'
  333. )
  334. else:
  335. print(
  336. ' Matching links will be de-listed from the main index, but their archived content folders will remain in place on disk.\n'
  337. ' (Pass --delete if you also want to permanently delete the data folders)'
  338. )
  339. if not yes:
  340. print()
  341. print(f'[yellow3][?] Do you want to proceed with removing these {count} links?[/]')
  342. try:
  343. assert input(' y/[n]: ').lower() == 'y'
  344. except (KeyboardInterrupt, EOFError, AssertionError):
  345. raise SystemExit(0)
  346. def log_removal_finished(all_links: int, to_remove: int):
  347. if all_links == 0:
  348. print()
  349. print('[red1][X] No matching links found.[/]')
  350. else:
  351. print()
  352. print(f'[red1][√] Removed {to_remove} out of {all_links} links from the archive index.[/]')
  353. print(f' Index now contains {all_links - to_remove} links.')
  354. ### Search Indexing Stage
  355. def log_index_started(url: str):
  356. print('[green][*] Indexing url: {} in the search index[/]'.format(url))
  357. print()
  358. ### Helpers
  359. @enforce_types
  360. def pretty_path(path: Union[Path, str], pwd: Union[Path, str]=DATA_DIR, color: bool=True) -> str:
  361. """convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc"""
  362. pwd = str(Path(pwd)) # .resolve()
  363. path = str(path)
  364. if not path:
  365. return path
  366. # replace long absolute paths with ./ relative ones to save on terminal output width
  367. if path.startswith(pwd) and (pwd != '/') and path != pwd:
  368. if color:
  369. path = path.replace(pwd, '[light_slate_blue].[/light_slate_blue]', 1)
  370. else:
  371. path = path.replace(pwd, '.', 1)
  372. # quote paths containing spaces
  373. if ' ' in path:
  374. path = f'"{path}"'
  375. # replace home directory with ~ for shorter output
  376. path = path.replace(str(Path('~').expanduser()), '~')
  377. return path
  378. @enforce_types
  379. def printable_filesize(num_bytes: Union[int, float]) -> str:
  380. for count in ['Bytes','KB','MB','GB']:
  381. if num_bytes > -1024.0 and num_bytes < 1024.0:
  382. return '%3.1f %s' % (num_bytes, count)
  383. num_bytes /= 1024.0
  384. return '%3.1f %s' % (num_bytes, 'TB')
  385. @enforce_types
  386. def format_duration(seconds: float) -> str:
  387. """Format duration in human-readable form."""
  388. if seconds < 1:
  389. return f'{seconds*1000:.0f}ms'
  390. elif seconds < 60:
  391. return f'{seconds:.1f}s'
  392. elif seconds < 3600:
  393. minutes = int(seconds // 60)
  394. secs = int(seconds % 60)
  395. return f'{minutes}min {secs}s' if secs else f'{minutes}min'
  396. else:
  397. hours = int(seconds // 3600)
  398. minutes = int((seconds % 3600) // 60)
  399. return f'{hours}hr {minutes}min' if minutes else f'{hours}hr'
  400. @enforce_types
  401. def truncate_url(url: str, max_length: int = 60) -> str:
  402. """Truncate URL to max_length, keeping domain and adding ellipsis."""
  403. if len(url) <= max_length:
  404. return url
  405. # Try to keep the domain and beginning of path
  406. if '://' in url:
  407. protocol, rest = url.split('://', 1)
  408. if '/' in rest:
  409. domain, path = rest.split('/', 1)
  410. available = max_length - len(protocol) - len(domain) - 6 # for "://", "/", "..."
  411. if available > 10:
  412. return f'{protocol}://{domain}/{path[:available]}...'
  413. # Fallback: just truncate
  414. return url[:max_length-3] + '...'
  415. @enforce_types
  416. def log_worker_event(
  417. worker_type: str,
  418. event: str,
  419. indent_level: int = 0,
  420. pid: Optional[int] = None,
  421. worker_id: Optional[str] = None,
  422. url: Optional[str] = None,
  423. plugin: Optional[str] = None,
  424. metadata: Optional[Dict[str, Any]] = None,
  425. error: Optional[Exception] = None,
  426. ) -> None:
  427. """
  428. Log a worker event with structured metadata and indentation.
  429. Args:
  430. worker_type: Type of worker (Orchestrator, CrawlWorker, SnapshotWorker, etc.)
  431. event: Event name (Starting, Completed, Failed, etc.)
  432. indent_level: Indentation level (0=Orchestrator, 1=CrawlWorker, 2=SnapshotWorker, 3=ArchiveResultWorker)
  433. pid: Process ID
  434. worker_id: Worker ID (UUID for CrawlWorker, url for SnapshotWorker, plugin for ArchiveResultWorker)
  435. url: URL being processed (for SnapshotWorker/ArchiveResultWorker)
  436. plugin: Plugin name (for ArchiveResultWorker)
  437. metadata: Dict of metadata to show in curly braces
  438. error: Exception if event is an error
  439. """
  440. indent = ' ' * indent_level
  441. from rich.markup import escape
  442. # Build worker identifier (without URL/plugin)
  443. worker_parts = [worker_type]
  444. # Don't add pid/worker_id for DB operations (they happen in whatever process is running)
  445. if pid and worker_type != 'DB':
  446. worker_parts.append(f'pid={pid}')
  447. if worker_id and worker_type in ('CrawlWorker', 'Orchestrator') and worker_type != 'DB':
  448. worker_parts.append(f'id={worker_id}')
  449. # Build worker label parts for brackets (shown inside brackets)
  450. worker_label_base = worker_parts[0]
  451. worker_bracket_content = ", ".join(worker_parts[1:]) if len(worker_parts) > 1 else None
  452. # Build URL/plugin display (shown AFTER the label, outside brackets)
  453. url_extractor_parts = []
  454. if url:
  455. url_extractor_parts.append(f'url: {escape(url)}')
  456. if plugin:
  457. url_extractor_parts.append(f'extractor: {escape(plugin)}')
  458. url_extractor_str = ' | '.join(url_extractor_parts) if url_extractor_parts else ''
  459. # Build metadata string
  460. metadata_str = ''
  461. if metadata:
  462. # Format metadata nicely
  463. meta_parts = []
  464. for k, v in metadata.items():
  465. if isinstance(v, float):
  466. # Format floats nicely (durations, sizes)
  467. if 'duration' in k.lower():
  468. meta_parts.append(f'{k}: {format_duration(v)}')
  469. elif 'size' in k.lower():
  470. meta_parts.append(f'{k}: {printable_filesize(int(v))}')
  471. else:
  472. meta_parts.append(f'{k}: {v:.2f}')
  473. elif isinstance(v, int):
  474. # Format integers - check if it's a size
  475. if 'size' in k.lower() or 'bytes' in k.lower():
  476. meta_parts.append(f'{k}: {printable_filesize(v)}')
  477. else:
  478. meta_parts.append(f'{k}: {v}')
  479. elif isinstance(v, (list, tuple)):
  480. meta_parts.append(f'{k}: {len(v)}')
  481. else:
  482. meta_parts.append(f'{k}: {v}')
  483. metadata_str = ' | '.join(meta_parts)
  484. # Determine color based on event
  485. color = 'white'
  486. if event in ('Starting...', 'Started', 'STARTED', 'Started in background'):
  487. color = 'green'
  488. elif event.startswith('Created'):
  489. color = 'cyan' # DB creation events
  490. elif event in ('Completed', 'COMPLETED', 'All work complete'):
  491. color = 'blue'
  492. elif event in ('Failed', 'ERROR', 'Failed to spawn worker'):
  493. color = 'red'
  494. elif event in ('Shutting down', 'SHUTDOWN'):
  495. color = 'grey53'
  496. # Build final message
  497. error_str = f' {type(error).__name__}: {error}' if error else ''
  498. from archivebox.misc.logging import CONSOLE
  499. from rich.text import Text
  500. # Create a Rich Text object for proper formatting
  501. # Text.append() treats content as literal (no markup parsing)
  502. text = Text()
  503. text.append(indent)
  504. text.append(worker_label_base, style=color)
  505. # Add bracketed content if present (using Text.append to avoid markup issues)
  506. if worker_bracket_content:
  507. text.append('[', style=color)
  508. text.append(worker_bracket_content, style=color)
  509. text.append(']', style=color)
  510. text.append(f' {event}{error_str}', style=color)
  511. # Add URL/plugin info first (more important)
  512. if url_extractor_str:
  513. text.append(f' | {url_extractor_str}')
  514. # Then add other metadata
  515. if metadata_str:
  516. text.append(f' | {metadata_str}')
  517. CONSOLE.print(text, soft_wrap=True)
  518. @enforce_types
  519. def printable_folders(folders: Dict[str, Optional["Snapshot"]], with_headers: bool=False) -> str:
  520. return '\n'.join(
  521. f'{folder} {snapshot and snapshot.url} "{snapshot and snapshot.title}"'
  522. for folder, snapshot in folders.items()
  523. )
  524. @enforce_types
  525. def printable_config(config: dict, prefix: str='') -> str:
  526. return f'\n{prefix}'.join(
  527. f'{key}={val}'
  528. for key, val in config.items()
  529. if not (isinstance(val, dict) or callable(val))
  530. )
  531. @enforce_types
  532. def printable_folder_status(name: str, folder: Dict) -> str:
  533. if folder['enabled']:
  534. if folder['is_valid']:
  535. color, symbol, note, num_files = 'green', '√', 'valid', ''
  536. else:
  537. color, symbol, note, num_files = 'red', 'X', 'invalid', '?'
  538. else:
  539. color, symbol, note, num_files = 'grey53', '-', 'unused', '-'
  540. if folder['path']:
  541. if os.access(folder['path'], os.R_OK):
  542. try:
  543. num_files = (
  544. f'{len(os.listdir(folder["path"]))} files'
  545. if os.path.isdir(folder['path']) else
  546. printable_filesize(Path(folder['path']).stat().st_size)
  547. )
  548. except PermissionError:
  549. num_files = 'error'
  550. else:
  551. num_files = 'missing'
  552. if folder.get('is_mount'):
  553. # add symbol @ next to filecount if path is a remote filesystem mount
  554. num_files = f'{num_files} @' if num_files else '@'
  555. path = pretty_path(folder['path'])
  556. return ' '.join((
  557. f'[{color}]',
  558. symbol,
  559. '[/]',
  560. name.ljust(21).replace('DATA_DIR', '[light_slate_blue]DATA_DIR[/light_slate_blue]'),
  561. num_files.ljust(14).replace('missing', '[grey53]missing[/grey53]'),
  562. f'[{color}]',
  563. note.ljust(8),
  564. '[/]',
  565. path.ljust(76),
  566. ))
  567. @enforce_types
  568. def printable_dependency_version(name: str, dependency: Dict) -> str:
  569. color, symbol, note, version = 'red', 'X', 'invalid', '?'
  570. if dependency['enabled']:
  571. if dependency['is_valid']:
  572. color, symbol, note = 'green', '√', 'valid'
  573. parsed_version_num = re.search(r'[\d\.]+', dependency['version'])
  574. if parsed_version_num:
  575. version = f'v{parsed_version_num[0]}'
  576. else:
  577. color, symbol, note, version = 'lightyellow', '-', 'disabled', '-'
  578. path = pretty_path(dependency['path'])
  579. return ' '.join((
  580. ANSI[color],
  581. symbol,
  582. ANSI['reset'],
  583. name.ljust(21),
  584. version.ljust(14),
  585. ANSI[color],
  586. note.ljust(8),
  587. ANSI['reset'],
  588. path.ljust(76),
  589. ))