logging_util.py 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593
  1. __package__ = 'archivebox'
  2. import re
  3. import os
  4. import sys
  5. import time
  6. import platform
  7. import argparse
  8. from math import log
  9. from multiprocessing import Process
  10. from pathlib import Path
  11. from datetime import datetime
  12. from dataclasses import dataclass
  13. from typing import Optional, List, Dict, Union, IO, TYPE_CHECKING
  14. if TYPE_CHECKING:
  15. from .index.schema import Link, ArchiveResult
  16. from .util import enforce_types
  17. from .config import (
  18. ConfigDict,
  19. PYTHON_ENCODING,
  20. ANSI,
  21. IS_TTY,
  22. TERM_WIDTH,
  23. SHOW_PROGRESS,
  24. SOURCES_DIR_NAME,
  25. stderr,
  26. )
@dataclass
class RuntimeStats:
    """mutable stats counter for logging archiving timing info to CLI output"""

    # per-link outcome counters, all starting at zero
    skipped: int = 0
    succeeded: int = 0
    failed: int = 0

    # start/end timestamps of the parsing stage (None until assigned)
    parse_start_ts: Optional[datetime] = None
    parse_end_ts: Optional[datetime] = None

    # start/end timestamps of the indexing stage (None until assigned)
    index_start_ts: Optional[datetime] = None
    index_end_ts: Optional[datetime] = None

    # start/end timestamps of the archiving stage (None until assigned)
    archiving_start_ts: Optional[datetime] = None
    archiving_end_ts: Optional[datetime] = None
# globals are bad, mmkay
# Single module-level RuntimeStats instance; the log_* functions in this
# module store their timestamps and per-link counters on it.
_LAST_RUN_STATS = RuntimeStats()
  41. class SmartFormatter(argparse.HelpFormatter):
  42. """Patched formatter that prints newlines in argparse help strings"""
  43. def _split_lines(self, text, width):
  44. if '\n' in text:
  45. return text.splitlines()
  46. return argparse.HelpFormatter._split_lines(self, text, width)
  47. def reject_stdin(caller: str, stdin: Optional[IO]=sys.stdin) -> None:
  48. """Tell the user they passed stdin to a command that doesn't accept it"""
  49. if stdin and not stdin.isatty():
  50. stdin_raw_text = stdin.read().strip()
  51. if stdin_raw_text:
  52. stderr(f'[X] The "{caller}" command does not accept stdin.', color='red')
  53. stderr(f' Run archivebox "{caller} --help" to see usage and examples.')
  54. stderr()
  55. raise SystemExit(1)
  56. def accept_stdin(stdin: Optional[IO]=sys.stdin) -> Optional[str]:
  57. """accept any standard input and return it as a string or None"""
  58. if not stdin:
  59. return None
  60. elif stdin and not stdin.isatty():
  61. stdin_str = stdin.read().strip()
  62. return stdin_str or None
  63. return None
  64. class TimedProgress:
  65. """Show a progress bar and measure elapsed time until .end() is called"""
  66. def __init__(self, seconds, prefix=''):
  67. self.SHOW_PROGRESS = SHOW_PROGRESS
  68. if self.SHOW_PROGRESS:
  69. self.p = Process(target=progress_bar, args=(seconds, prefix))
  70. self.p.start()
  71. self.stats = {'start_ts': datetime.now(), 'end_ts': None}
  72. def end(self):
  73. """immediately end progress, clear the progressbar line, and save end_ts"""
  74. end_ts = datetime.now()
  75. self.stats['end_ts'] = end_ts
  76. if self.SHOW_PROGRESS:
  77. # terminate if we havent already terminated
  78. try:
  79. # WARNING: HACKY
  80. # I've spent over 15 hours trying to get rid of this stupid macOS-only
  81. # intermittent (but harmeless) warning when the progress bars end sometimes
  82. # Exception ignored in: <_io.TextIOWrapper name='<stdout>' mode='w' encoding='utf-8'>
  83. # BrokenPipeError: [Errno 32] Broken pipe
  84. # In the end, this is the only thing I found that makes it
  85. # happen slightly less often:
  86. if platform.system() == 'Darwin':
  87. time.sleep(0.1)
  88. # kill the progress bar subprocess
  89. self.p.terminate()
  90. self.p.join()
  91. self.p.close()
  92. if platform.system() == 'Darwin':
  93. time.sleep(0.1)
  94. # clear whole terminal line
  95. try:
  96. sys.stdout.write('\r{}{}\r'.format((' ' * TERM_WIDTH()), ANSI['reset']))
  97. except (IOError, BrokenPipeError):
  98. # ignore when the parent proc has stopped listening to our stdout
  99. pass
  100. except ValueError:
  101. pass
  102. @enforce_types
  103. def progress_bar(seconds: int, prefix: str='') -> None:
  104. """show timer in the form of progress bar, with percentage and seconds remaining"""
  105. chunk = '█' if PYTHON_ENCODING == 'UTF-8' else '#'
  106. last_width = TERM_WIDTH()
  107. chunks = last_width - len(prefix) - 20 # number of progress chunks to show (aka max bar width)
  108. try:
  109. for s in range(seconds * chunks):
  110. max_width = TERM_WIDTH()
  111. if max_width < last_width:
  112. # when the terminal size is shrunk, we have to write a newline
  113. # otherwise the progress bar will keep wrapping incorrectly
  114. sys.stdout.write('\r\n')
  115. sys.stdout.flush()
  116. chunks = max_width - len(prefix) - 20
  117. pct_complete = s / chunks / seconds * 100
  118. log_pct = (log(pct_complete or 1, 10) / 2) * 100 # everyone likes faster progress bars ;)
  119. bar_width = round(log_pct/(100/chunks))
  120. last_width = max_width
  121. # ████████████████████ 0.9% (1/60sec)
  122. sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)'.format(
  123. prefix,
  124. ANSI['green' if pct_complete < 80 else 'lightyellow'],
  125. (chunk * bar_width).ljust(chunks),
  126. ANSI['reset'],
  127. round(pct_complete, 1),
  128. round(s/chunks),
  129. seconds,
  130. ))
  131. sys.stdout.flush()
  132. time.sleep(1 / chunks)
  133. # ██████████████████████████████████ 100.0% (60/60sec)
  134. sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)'.format(
  135. prefix,
  136. ANSI['red'],
  137. chunk * chunks,
  138. ANSI['reset'],
  139. 100.0,
  140. seconds,
  141. seconds,
  142. ))
  143. sys.stdout.flush()
  144. # uncomment to have it disappear when it hits 100% instead of staying full red:
  145. # time.sleep(0.5)
  146. # sys.stdout.write('\r{}{}\r'.format((' ' * TERM_WIDTH()), ANSI['reset']))
  147. # sys.stdout.flush()
  148. except (KeyboardInterrupt, BrokenPipeError):
  149. print()
  150. pass
  151. def log_cli_command(subcommand: str, subcommand_args: List[str], stdin: Optional[str], pwd: str):
  152. from .config import VERSION, ANSI
  153. cmd = ' '.join(('archivebox', subcommand, *subcommand_args))
  154. stderr('{black}[i] [{now}] ArchiveBox v{VERSION}: {cmd}{reset}'.format(
  155. now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
  156. VERSION=VERSION,
  157. cmd=cmd,
  158. **ANSI,
  159. ))
  160. stderr('{black} > {pwd}{reset}'.format(pwd=pwd, **ANSI))
  161. stderr()
  162. ### Parsing Stage
  163. def log_importing_started(urls: Union[str, List[str]], depth: int, index_only: bool):
  164. _LAST_RUN_STATS.parse_start_ts = datetime.now()
  165. print('{green}[+] [{}] Adding {} links to index (crawl depth={}){}...{reset}'.format(
  166. _LAST_RUN_STATS.parse_start_ts.strftime('%Y-%m-%d %H:%M:%S'),
  167. len(urls) if isinstance(urls, list) else len(urls.split('\n')),
  168. depth,
  169. ' (index only)' if index_only else '',
  170. **ANSI,
  171. ))
  172. def log_source_saved(source_file: str):
  173. print(' > Saved verbatim input to {}/{}'.format(SOURCES_DIR_NAME, source_file.rsplit('/', 1)[-1]))
  174. def log_parsing_finished(num_parsed: int, parser_name: str):
  175. _LAST_RUN_STATS.parse_end_ts = datetime.now()
  176. print(' > Parsed {} URLs from input ({})'.format(num_parsed, parser_name))
  177. def log_deduping_finished(num_new_links: int):
  178. print(' > Found {} new URLs not already in index'.format(num_new_links))
  179. def log_crawl_started(new_links):
  180. print()
  181. print('{green}[*] Starting crawl of {} sites 1 hop out from starting point{reset}'.format(len(new_links), **ANSI))
  182. ### Indexing Stage
  183. def log_indexing_process_started(num_links: int):
  184. start_ts = datetime.now()
  185. _LAST_RUN_STATS.index_start_ts = start_ts
  186. print()
  187. print('{black}[*] [{}] Writing {} links to main index...{reset}'.format(
  188. start_ts.strftime('%Y-%m-%d %H:%M:%S'),
  189. num_links,
  190. **ANSI,
  191. ))
  192. def log_indexing_process_finished():
  193. end_ts = datetime.now()
  194. _LAST_RUN_STATS.index_end_ts = end_ts
  195. def log_indexing_started(out_path: str):
  196. if IS_TTY:
  197. sys.stdout.write(f' > {out_path}')
  198. def log_indexing_finished(out_path: str):
  199. print(f'\r √ {out_path}')
  200. ### Archiving Stage
  201. def log_archiving_started(num_links: int, resume: Optional[float]=None):
  202. start_ts = datetime.now()
  203. _LAST_RUN_STATS.archiving_start_ts = start_ts
  204. print()
  205. if resume:
  206. print('{green}[▶] [{}] Resuming archive updating for {} pages starting from {}...{reset}'.format(
  207. start_ts.strftime('%Y-%m-%d %H:%M:%S'),
  208. num_links,
  209. resume,
  210. **ANSI,
  211. ))
  212. else:
  213. print('{green}[▶] [{}] Starting archiving of {} snapshots in index...{reset}'.format(
  214. start_ts.strftime('%Y-%m-%d %H:%M:%S'),
  215. num_links,
  216. **ANSI,
  217. ))
  218. def log_archiving_paused(num_links: int, idx: int, timestamp: str):
  219. end_ts = datetime.now()
  220. _LAST_RUN_STATS.archiving_end_ts = end_ts
  221. print()
  222. print('\n{lightyellow}[X] [{now}] Downloading paused on link {timestamp} ({idx}/{total}){reset}'.format(
  223. **ANSI,
  224. now=end_ts.strftime('%Y-%m-%d %H:%M:%S'),
  225. idx=idx+1,
  226. timestamp=timestamp,
  227. total=num_links,
  228. ))
  229. print()
  230. print(' {lightred}Hint:{reset} To view your archive index, run:'.format(**ANSI))
  231. print(' archivebox server # then visit http://127.0.0.1:8000')
  232. print(' Continue archiving where you left off by running:')
  233. print(' archivebox update --resume={}'.format(timestamp))
  234. def log_archiving_finished(num_links: int):
  235. end_ts = datetime.now()
  236. _LAST_RUN_STATS.archiving_end_ts = end_ts
  237. assert _LAST_RUN_STATS.archiving_start_ts is not None
  238. seconds = end_ts.timestamp() - _LAST_RUN_STATS.archiving_start_ts.timestamp()
  239. if seconds > 60:
  240. duration = '{0:.2f} min'.format(seconds / 60)
  241. else:
  242. duration = '{0:.2f} sec'.format(seconds)
  243. print()
  244. print('{}[√] [{}] Update of {} pages complete ({}){}'.format(
  245. ANSI['green'],
  246. end_ts.strftime('%Y-%m-%d %H:%M:%S'),
  247. num_links,
  248. duration,
  249. ANSI['reset'],
  250. ))
  251. print(' - {} links skipped'.format(_LAST_RUN_STATS.skipped))
  252. print(' - {} links updated'.format(_LAST_RUN_STATS.succeeded + _LAST_RUN_STATS.failed))
  253. print(' - {} links had errors'.format(_LAST_RUN_STATS.failed))
  254. print()
  255. print(' {lightred}Hint:{reset} To manage your archive in a Web UI, run:'.format(**ANSI))
  256. print(' archivebox server 0.0.0.0:8000')
  257. def log_link_archiving_started(link: "Link", link_dir: str, is_new: bool):
  258. # [*] [2019-03-22 13:46:45] "Log Structured Merge Trees - ben stopford"
  259. # http://www.benstopford.com/2015/02/14/log-structured-merge-trees/
  260. # > output/archive/1478739709
  261. print('\n[{symbol_color}{symbol}{reset}] [{symbol_color}{now}{reset}] "{title}"'.format(
  262. symbol_color=ANSI['green' if is_new else 'black'],
  263. symbol='+' if is_new else '√',
  264. now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
  265. title=link.title or link.base_url,
  266. **ANSI,
  267. ))
  268. print(' {blue}{url}{reset}'.format(url=link.url, **ANSI))
  269. print(' {} {}'.format(
  270. '>' if is_new else '√',
  271. pretty_path(link_dir),
  272. ))
  273. def log_link_archiving_finished(link: "Link", link_dir: str, is_new: bool, stats: dict):
  274. total = sum(stats.values())
  275. if stats['failed'] > 0 :
  276. _LAST_RUN_STATS.failed += 1
  277. elif stats['skipped'] == total:
  278. _LAST_RUN_STATS.skipped += 1
  279. else:
  280. _LAST_RUN_STATS.succeeded += 1
  281. def log_archive_method_started(method: str):
  282. print(' > {}'.format(method))
  283. def log_archive_method_finished(result: "ArchiveResult"):
  284. """quote the argument with whitespace in a command so the user can
  285. copy-paste the outputted string directly to run the cmd
  286. """
  287. # Prettify CMD string and make it safe to copy-paste by quoting arguments
  288. quoted_cmd = ' '.join(
  289. '"{}"'.format(arg) if ' ' in arg else arg
  290. for arg in result.cmd
  291. )
  292. if result.status == 'failed':
  293. # Prettify error output hints string and limit to five lines
  294. hints = getattr(result.output, 'hints', None) or ()
  295. if hints:
  296. hints = hints if isinstance(hints, (list, tuple)) else hints.split('\n')
  297. hints = (
  298. ' {}{}{}'.format(ANSI['lightyellow'], line.strip(), ANSI['reset'])
  299. for line in hints[:5] if line.strip()
  300. )
  301. # Collect and prefix output lines with indentation
  302. output_lines = [
  303. '{lightyellow}Extractor failed:{reset}'.format(**ANSI),
  304. ' {reset}{} {red}{}{reset}'.format(
  305. result.output.__class__.__name__.replace('ArchiveError', ''),
  306. result.output,
  307. **ANSI,
  308. ),
  309. *hints,
  310. '{}Run to see full output:{}'.format(ANSI['lightred'], ANSI['reset']),
  311. *([' cd {};'.format(result.pwd)] if result.pwd else []),
  312. ' {}'.format(quoted_cmd),
  313. ]
  314. print('\n'.join(
  315. ' {}'.format(line)
  316. for line in output_lines
  317. if line
  318. ))
  319. print()
  320. def log_list_started(filter_patterns: Optional[List[str]], filter_type: str):
  321. print('{green}[*] Finding links in the archive index matching these {} patterns:{reset}'.format(
  322. filter_type,
  323. **ANSI,
  324. ))
  325. print(' {}'.format(' '.join(filter_patterns or ())))
  326. def log_list_finished(links):
  327. from .index.csv import links_to_csv
  328. print()
  329. print('---------------------------------------------------------------------------------------------------')
  330. print(links_to_csv(links, cols=['timestamp', 'is_archived', 'num_outputs', 'url'], header=True, ljust=16, separator=' | '))
  331. print('---------------------------------------------------------------------------------------------------')
  332. print()
  333. def log_removal_started(links: List["Link"], yes: bool, delete: bool):
  334. print('{lightyellow}[i] Found {} matching URLs to remove.{reset}'.format(len(links), **ANSI))
  335. if delete:
  336. file_counts = [link.num_outputs for link in links if Path(link.link_dir).exists()]
  337. print(
  338. f' {len(links)} Links will be de-listed from the main index, and their archived content folders will be deleted from disk.\n'
  339. f' ({len(file_counts)} data folders with {sum(file_counts)} archived files will be deleted!)'
  340. )
  341. else:
  342. print(
  343. ' Matching links will be de-listed from the main index, but their archived content folders will remain in place on disk.\n'
  344. ' (Pass --delete if you also want to permanently delete the data folders)'
  345. )
  346. if not yes:
  347. print()
  348. print('{lightyellow}[?] Do you want to proceed with removing these {} links?{reset}'.format(len(links), **ANSI))
  349. try:
  350. assert input(' y/[n]: ').lower() == 'y'
  351. except (KeyboardInterrupt, EOFError, AssertionError):
  352. raise SystemExit(0)
  353. def log_removal_finished(all_links: int, to_remove: int):
  354. if all_links == 0:
  355. print()
  356. print('{red}[X] No matching links found.{reset}'.format(**ANSI))
  357. else:
  358. print()
  359. print('{red}[√] Removed {} out of {} links from the archive index.{reset}'.format(
  360. to_remove,
  361. all_links,
  362. **ANSI,
  363. ))
  364. print(' Index now contains {} links.'.format(all_links - to_remove))
  365. def log_shell_welcome_msg():
  366. from .cli import list_subcommands
  367. print('{green}# ArchiveBox Imports{reset}'.format(**ANSI))
  368. print('{green}from archivebox.core.models import Snapshot, User{reset}'.format(**ANSI))
  369. print('{green}from archivebox import *\n {}{reset}'.format("\n ".join(list_subcommands().keys()), **ANSI))
  370. print()
  371. print('[i] Welcome to the ArchiveBox Shell!')
  372. print(' https://github.com/pirate/ArchiveBox/wiki/Usage#Shell-Usage')
  373. print()
  374. print(' {lightred}Hint:{reset} Example use:'.format(**ANSI))
  375. print(' print(Snapshot.objects.filter(is_archived=True).count())')
  376. print(' Snapshot.objects.get(url="https://example.com").as_json()')
  377. print(' add("https://example.com/some/new/url")')
  378. ### Helpers
  379. @enforce_types
  380. def pretty_path(path: Union[Path, str]) -> str:
  381. """convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc"""
  382. pwd = Path('.').resolve()
  383. # parent = os.path.abspath(os.path.join(pwd, os.path.pardir))
  384. return str(path).replace(str(pwd) + '/', './')
  385. @enforce_types
  386. def printable_filesize(num_bytes: Union[int, float]) -> str:
  387. for count in ['Bytes','KB','MB','GB']:
  388. if num_bytes > -1024.0 and num_bytes < 1024.0:
  389. return '%3.1f %s' % (num_bytes, count)
  390. num_bytes /= 1024.0
  391. return '%3.1f %s' % (num_bytes, 'TB')
  392. @enforce_types
  393. def printable_folders(folders: Dict[str, Optional["Link"]],
  394. json: bool=False,
  395. html: bool=False,
  396. csv: Optional[str]=None,
  397. with_headers: bool=False) -> str:
  398. from .index.json import MAIN_INDEX_HEADER
  399. links = folders.values()
  400. if json:
  401. from .index.json import to_json
  402. if with_headers:
  403. output = {
  404. **MAIN_INDEX_HEADER,
  405. 'num_links': len(links),
  406. 'updated': datetime.now(),
  407. 'last_run_cmd': sys.argv,
  408. 'links': links,
  409. }
  410. else:
  411. output = links
  412. return to_json(output, indent=4, sort_keys=True)
  413. elif html:
  414. from .index.html import main_index_template
  415. if with_headers:
  416. output = main_index_template(links, True)
  417. else:
  418. from .index.html import MINIMAL_INDEX_TEMPLATE
  419. output = main_index_template(links, True, MINIMAL_INDEX_TEMPLATE)
  420. return output
  421. elif csv:
  422. from .index.csv import links_to_csv
  423. return links_to_csv(folders.values(), cols=csv.split(','), header=with_headers)
  424. return '\n'.join(
  425. f'{folder} {link and link.url} "{link and link.title}"'
  426. for folder, link in folders.items()
  427. )
  428. @enforce_types
  429. def printable_config(config: ConfigDict, prefix: str='') -> str:
  430. return f'\n{prefix}'.join(
  431. f'{key}={val}'
  432. for key, val in config.items()
  433. if not (isinstance(val, dict) or callable(val))
  434. )
  435. @enforce_types
  436. def printable_folder_status(name: str, folder: Dict) -> str:
  437. if folder['enabled']:
  438. if folder['is_valid']:
  439. color, symbol, note = 'green', '√', 'valid'
  440. else:
  441. color, symbol, note, num_files = 'red', 'X', 'invalid', '?'
  442. else:
  443. color, symbol, note, num_files = 'lightyellow', '-', 'disabled', '-'
  444. if folder['path']:
  445. if Path(folder['path']).exists():
  446. num_files = (
  447. f'{len(os.listdir(folder["path"]))} files'
  448. if Path(folder['path']).is_dir() else
  449. printable_filesize(Path(folder['path']).stat().st_size)
  450. )
  451. else:
  452. num_files = 'missing'
  453. if ' ' in str(folder['path']):
  454. folder['path'] = f'"{folder["path"]}"'
  455. return ' '.join((
  456. ANSI[color],
  457. symbol,
  458. ANSI['reset'],
  459. name.ljust(22),
  460. (str(folder["path"]) or '').ljust(76),
  461. num_files.ljust(14),
  462. ANSI[color],
  463. note,
  464. ANSI['reset'],
  465. ))
  466. @enforce_types
  467. def printable_dependency_version(name: str, dependency: Dict) -> str:
  468. version = None
  469. if dependency['enabled']:
  470. if dependency['is_valid']:
  471. color, symbol, note, version = 'green', '√', 'valid', ''
  472. parsed_version_num = re.search(r'[\d\.]+', dependency['version'])
  473. if parsed_version_num:
  474. version = f'v{parsed_version_num[0]}'
  475. if not version:
  476. color, symbol, note, version = 'red', 'X', 'invalid', '?'
  477. else:
  478. color, symbol, note, version = 'lightyellow', '-', 'disabled', '-'
  479. if ' ' in (dependency["path"] or ''):
  480. dependency["path"] = f'"{dependency["path"]}"'
  481. return ' '.join((
  482. ANSI[color],
  483. symbol,
  484. ANSI['reset'],
  485. name.ljust(22),
  486. (dependency["path"] or '').ljust(76),
  487. version.ljust(14),
  488. ANSI[color],
  489. note,
  490. ANSI['reset'],
  491. ))