__package__ = 'archivebox'

import re
import os
import sys
import stat
import time
import argparse

from math import log
from multiprocessing import Process
from pathlib import Path
from datetime import datetime, timezone
from dataclasses import dataclass
from typing import Any, Optional, List, Dict, Union, IO, TYPE_CHECKING

if TYPE_CHECKING:
    from .index.schema import Link, ArchiveResult

from .system import get_dir_size
from .util import enforce_types
from .config import (
    ConfigDict,
    OUTPUT_DIR,
    PYTHON_ENCODING,
    VERSION,
    ANSI,
    IS_TTY,
    IN_DOCKER,
    TERM_WIDTH,
    SHOW_PROGRESS,
    SOURCES_DIR_NAME,
    stderr,
)

@dataclass
class RuntimeStats:
    """mutable stats counter for logging archiving timing info to CLI output"""

    skipped: int = 0
    succeeded: int = 0
    failed: int = 0

    parse_start_ts: Optional[datetime] = None
    parse_end_ts: Optional[datetime] = None

    index_start_ts: Optional[datetime] = None
    index_end_ts: Optional[datetime] = None

    archiving_start_ts: Optional[datetime] = None
    archiving_end_ts: Optional[datetime] = None


# globals are bad, mmkay
_LAST_RUN_STATS = RuntimeStats()

def debug_dict_summary(obj: Dict[Any, Any]) -> None:
    stderr(' '.join(f'{key}={str(val).ljust(6)}' for key, val in obj.items()))


def get_fd_info(fd) -> Dict[str, Any]:
    NAME = fd.name[1:-1]                # strip the surrounding <> from stream names like '<stdin>'
    FILENO = fd.fileno()
    MODE = os.fstat(FILENO).st_mode
    IS_TTY = hasattr(fd, 'isatty') and fd.isatty()
    IS_PIPE = stat.S_ISFIFO(MODE)
    IS_FILE = stat.S_ISREG(MODE)
    IS_TERMINAL = not (IS_PIPE or IS_FILE)
    IS_LINE_BUFFERED = fd.line_buffering
    IS_READABLE = fd.readable()
    return {
        'NAME': NAME, 'FILENO': FILENO, 'MODE': MODE,
        'IS_TTY': IS_TTY, 'IS_PIPE': IS_PIPE, 'IS_FILE': IS_FILE,
        'IS_TERMINAL': IS_TERMINAL, 'IS_LINE_BUFFERED': IS_LINE_BUFFERED,
        'IS_READABLE': IS_READABLE,
    }


# # Log debug information about stdin, stdout, and stderr
# sys.stdout.write('[>&1] this is python stdout\n')
# sys.stderr.write('[>&2] this is python stderr\n')
# debug_dict_summary(get_fd_info(sys.stdin))
# debug_dict_summary(get_fd_info(sys.stdout))
# debug_dict_summary(get_fd_info(sys.stderr))

class SmartFormatter(argparse.HelpFormatter):
    """Patched formatter that prints newlines in argparse help strings"""

    def _split_lines(self, text, width):
        if '\n' in text:
            return text.splitlines()
        return argparse.HelpFormatter._split_lines(self, text, width)
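
# Example (illustrative only, not used by this module directly): pass SmartFormatter
# as the formatter_class when building a parser so multi-line help strings keep
# their newlines instead of being re-wrapped. Roughly:
#
#   parser = argparse.ArgumentParser(formatter_class=SmartFormatter)
#   parser.add_argument('--depth', help='Crawl depth:\n 0: just the URL\n 1: one hop out')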

def reject_stdin(caller: str, stdin: Optional[IO]=sys.stdin) -> None:
    """Tell the user they passed stdin to a command that doesn't accept it"""

    if not stdin:
        return None

    if IN_DOCKER:
        # when TTY is disabled in docker we can't tell if stdin is being piped in or not
        # if we try to read stdin when it's not piped we will hang indefinitely waiting for it
        return None

    if not stdin.isatty():
        # stderr('READING STDIN TO REJECT...')
        stdin_raw_text = stdin.read()
        if stdin_raw_text.strip():
            # stderr('GOT STDIN!', len(stdin_raw_text))
            stderr(f'[!] The "{caller}" command does not accept stdin (ignoring).', color='red')
            stderr(f'    Run "archivebox {caller} --help" to see usage and examples.')
            stderr()
            # raise SystemExit(1)
    return None

def accept_stdin(stdin: Optional[IO]=sys.stdin) -> Optional[str]:
    """accept any standard input and return it as a string or None"""

    if not stdin:
        return None

    if not stdin.isatty():
        # stderr('READING STDIN TO ACCEPT...')
        stdin_str = stdin.read()
        if stdin_str:
            # stderr('GOT STDIN...', len(stdin_str))
            return stdin_str

    return None
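
# Example (illustrative only): a subcommand either consumes piped input or rejects it,
# depending on whether stdin makes sense for that command. Roughly:
#
#   urls = accept_stdin(sys.stdin)       # e.g. `echo "https://example.com" | archivebox add`
#   reject_stdin('version', sys.stdin)   # e.g. `archivebox version` takes no stdin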

class TimedProgress:
    """Show a progress bar and measure elapsed time until .end() is called"""

    def __init__(self, seconds, prefix=''):
        self.SHOW_PROGRESS = SHOW_PROGRESS
        if self.SHOW_PROGRESS:
            self.p = Process(target=progress_bar, args=(seconds, prefix))
            self.p.start()

        self.stats = {'start_ts': datetime.now(timezone.utc), 'end_ts': None}

    def end(self):
        """immediately end progress, clear the progressbar line, and save end_ts"""

        end_ts = datetime.now(timezone.utc)
        self.stats['end_ts'] = end_ts

        if self.SHOW_PROGRESS:
            # terminate if we haven't already terminated
            try:
                # kill the progress bar subprocess
                try:
                    self.p.close()   # must be closed *before* it's terminated
                except (KeyboardInterrupt, SystemExit):
                    print()
                    raise
                except BaseException:    # lgtm [py/catch-base-exception]
                    pass
                self.p.terminate()
                self.p.join()

                # clear whole terminal line
                try:
                    sys.stdout.write('\r{}{}\r'.format((' ' * TERM_WIDTH()), ANSI['reset']))
                except (IOError, BrokenPipeError):
                    # ignore when the parent proc has stopped listening to our stdout
                    pass
            except ValueError:
                pass

@enforce_types
def progress_bar(seconds: int, prefix: str='') -> None:
    """show timer in the form of progress bar, with percentage and seconds remaining"""
    chunk = '█' if PYTHON_ENCODING == 'UTF-8' else '#'
    last_width = TERM_WIDTH()
    chunks = last_width - len(prefix) - 20  # number of progress chunks to show (aka max bar width)
    try:
        for s in range(seconds * chunks):
            max_width = TERM_WIDTH()
            if max_width < last_width:
                # when the terminal size is shrunk, we have to write a newline
                # otherwise the progress bar will keep wrapping incorrectly
                sys.stdout.write('\r\n')
                sys.stdout.flush()
            chunks = max_width - len(prefix) - 20
            pct_complete = s / chunks / seconds * 100
            log_pct = (log(pct_complete or 1, 10) / 2) * 100  # everyone likes faster progress bars ;)
            bar_width = round(log_pct/(100/chunks))
            last_width = max_width

            # ████████████████████           0.9% (1/60sec)
            sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)'.format(
                prefix,
                ANSI['green' if pct_complete < 80 else 'lightyellow'],
                (chunk * bar_width).ljust(chunks),
                ANSI['reset'],
                round(pct_complete, 1),
                round(s/chunks),
                seconds,
            ))
            sys.stdout.flush()
            time.sleep(1 / chunks)

        # ██████████████████████████████████ 100.0% (60/60sec)
        sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)'.format(
            prefix,
            ANSI['red'],
            chunk * chunks,
            ANSI['reset'],
            100.0,
            seconds,
            seconds,
        ))
        sys.stdout.flush()
        # uncomment to have it disappear when it hits 100% instead of staying full red:
        # time.sleep(0.5)
        # sys.stdout.write('\r{}{}\r'.format((' ' * TERM_WIDTH()), ANSI['reset']))
        # sys.stdout.flush()
    except (KeyboardInterrupt, BrokenPipeError):
        print()
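
# Example (illustrative only): progress_bar() is normally driven through TimedProgress,
# which runs it in a subprocess and clears the terminal line when the work finishes. Roughly:
#
#   timer = TimedProgress(60, prefix='      ')
#   try:
#       do_some_slow_archiving_work()    # hypothetical placeholder for the real work
#   finally:
#       timer.end()
#   print(timer.stats)                   # {'start_ts': datetime(...), 'end_ts': datetime(...)}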

def log_cli_command(subcommand: str, subcommand_args: List[str], stdin: Optional[str | IO], pwd: str):
    cmd = ' '.join(('archivebox', subcommand, *subcommand_args))
    stderr('{black}[i] [{now}] ArchiveBox v{VERSION}: {cmd}{reset}'.format(
        now=datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S'),
        VERSION=VERSION,
        cmd=cmd,
        **ANSI,
    ))
    stderr('{black}    > {pwd}{reset}'.format(pwd=pwd, **ANSI))
    stderr()

### Parsing Stage

def log_importing_started(urls: Union[str, List[str]], depth: int, index_only: bool):
    _LAST_RUN_STATS.parse_start_ts = datetime.now(timezone.utc)
    print('{green}[+] [{}] Adding {} links to index (crawl depth={}){}...{reset}'.format(
        _LAST_RUN_STATS.parse_start_ts.strftime('%Y-%m-%d %H:%M:%S'),
        len(urls) if isinstance(urls, list) else len(urls.split('\n')),
        depth,
        ' (index only)' if index_only else '',
        **ANSI,
    ))

def log_source_saved(source_file: str):
    print('    > Saved verbatim input to {}/{}'.format(SOURCES_DIR_NAME, source_file.rsplit('/', 1)[-1]))

def log_parsing_finished(num_parsed: int, parser_name: str):
    _LAST_RUN_STATS.parse_end_ts = datetime.now(timezone.utc)
    print('    > Parsed {} URLs from input ({})'.format(num_parsed, parser_name))

def log_deduping_finished(num_new_links: int):
    print('    > Found {} new URLs not already in index'.format(num_new_links))

def log_crawl_started(new_links):
    print()
    print('{green}[*] Starting crawl of {} sites 1 hop out from starting point{reset}'.format(len(new_links), **ANSI))

### Indexing Stage

def log_indexing_process_started(num_links: int):
    start_ts = datetime.now(timezone.utc)
    _LAST_RUN_STATS.index_start_ts = start_ts
    print()
    print('{black}[*] [{}] Writing {} links to main index...{reset}'.format(
        start_ts.strftime('%Y-%m-%d %H:%M:%S'),
        num_links,
        **ANSI,
    ))

def log_indexing_process_finished():
    end_ts = datetime.now(timezone.utc)
    _LAST_RUN_STATS.index_end_ts = end_ts

def log_indexing_started(out_path: str):
    if IS_TTY:
        sys.stdout.write(f'    > ./{Path(out_path).relative_to(OUTPUT_DIR)}')

def log_indexing_finished(out_path: str):
    print(f'\r    √ ./{Path(out_path).relative_to(OUTPUT_DIR)}')

### Archiving Stage

def log_archiving_started(num_links: int, resume: Optional[float]=None):
    start_ts = datetime.now(timezone.utc)
    _LAST_RUN_STATS.archiving_start_ts = start_ts
    print()
    if resume:
        print('{green}[▶] [{}] Resuming archive updating for {} pages starting from {}...{reset}'.format(
            start_ts.strftime('%Y-%m-%d %H:%M:%S'),
            num_links,
            resume,
            **ANSI,
        ))
    else:
        print('{green}[▶] [{}] Starting archiving of {} snapshots in index...{reset}'.format(
            start_ts.strftime('%Y-%m-%d %H:%M:%S'),
            num_links,
            **ANSI,
        ))

def log_archiving_paused(num_links: int, idx: int, timestamp: str):
    end_ts = datetime.now(timezone.utc)
    _LAST_RUN_STATS.archiving_end_ts = end_ts
    print()
    print('\n{lightyellow}[X] [{now}] Downloading paused on link {timestamp} ({idx}/{total}){reset}'.format(
        **ANSI,
        now=end_ts.strftime('%Y-%m-%d %H:%M:%S'),
        idx=idx+1,
        timestamp=timestamp,
        total=num_links,
    ))
    print()
    print('    Continue archiving where you left off by running:')
    print('        archivebox update --resume={}'.format(timestamp))

def log_archiving_finished(num_links: int):
    from core.models import Snapshot

    end_ts = datetime.now(timezone.utc)
    _LAST_RUN_STATS.archiving_end_ts = end_ts
    assert _LAST_RUN_STATS.archiving_start_ts is not None
    seconds = end_ts.timestamp() - _LAST_RUN_STATS.archiving_start_ts.timestamp()
    if seconds > 60:
        duration = '{0:.2f} min'.format(seconds / 60)
    else:
        duration = '{0:.2f} sec'.format(seconds)

    print()
    print('{}[√] [{}] Update of {} pages complete ({}){}'.format(
        ANSI['green'],
        end_ts.strftime('%Y-%m-%d %H:%M:%S'),
        num_links,
        duration,
        ANSI['reset'],
    ))
    print('    - {} links skipped'.format(_LAST_RUN_STATS.skipped))
    print('    - {} links updated'.format(_LAST_RUN_STATS.succeeded + _LAST_RUN_STATS.failed))
    print('    - {} links had errors'.format(_LAST_RUN_STATS.failed))

    if Snapshot.objects.count() < 50:
        print()
        print('    {lightred}Hint:{reset} To manage your archive in a Web UI, run:'.format(**ANSI))
        print('        archivebox server 0.0.0.0:8000')

def log_link_archiving_started(link: "Link", link_dir: str, is_new: bool):
    # [*] [2019-03-22 13:46:45] "Log Structured Merge Trees - ben stopford"
    #     http://www.benstopford.com/2015/02/14/log-structured-merge-trees/
    #     > output/archive/1478739709

    print('\n[{symbol_color}{symbol}{reset}] [{symbol_color}{now}{reset}] "{title}"'.format(
        symbol_color=ANSI['green' if is_new else 'black'],
        symbol='+' if is_new else '√',
        now=datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S'),
        title=link.title or link.base_url,
        **ANSI,
    ))
    print('    {blue}{url}{reset}'.format(url=link.url, **ANSI))
    print('    {} {}'.format(
        '>' if is_new else '√',
        pretty_path(link_dir),
    ))

def log_link_archiving_finished(link: "Link", link_dir: str, is_new: bool, stats: dict, start_ts: datetime):
    total = sum(stats.values())

    if stats['failed'] > 0:
        _LAST_RUN_STATS.failed += 1
    elif stats['skipped'] == total:
        _LAST_RUN_STATS.skipped += 1
    else:
        _LAST_RUN_STATS.succeeded += 1

    try:
        size = get_dir_size(link_dir)
    except FileNotFoundError:
        size = (0, None, '0')

    end_ts = datetime.now(timezone.utc)
    duration = str(end_ts - start_ts).split('.')[0]
    print('        {black}{} files ({}) in {}s {reset}'.format(size[2], printable_filesize(size[0]), duration, **ANSI))

def log_archive_method_started(method: str):
    print('      > {}'.format(method))

def log_archive_method_finished(result: "ArchiveResult"):
    """quote any arguments containing whitespace in the command so the user
       can copy-paste the printed string and run it directly
    """

    # Prettify CMD string and make it safe to copy-paste by quoting arguments
    quoted_cmd = ' '.join(
        '"{}"'.format(arg) if (' ' in arg) or (':' in arg) else arg
        for arg in result.cmd
    )

    if result.status == 'failed':
        if result.output.__class__.__name__ == 'TimeoutExpired':
            duration = (result.end_ts - result.start_ts).seconds
            hint_header = [
                '{lightyellow}Extractor timed out after {}s.{reset}'.format(duration, **ANSI),
            ]
        else:
            hint_header = [
                '{lightyellow}Extractor failed:{reset}'.format(**ANSI),
                '    {reset}{} {red}{}{reset}'.format(
                    result.output.__class__.__name__.replace('ArchiveError', ''),
                    result.output,
                    **ANSI,
                ),
            ]

        # import pudb; pudb.set_trace()

        # Prettify error output hints string and limit to five lines
        hints = getattr(result.output, 'hints', None) or ()
        if hints:
            if isinstance(hints, (list, tuple, type(_ for _ in ()))):   # type(_ for _ in ()) is the generator type
                hints = [hint.decode() if isinstance(hint, bytes) else str(hint) for hint in hints]
            else:
                if isinstance(hints, bytes):
                    hints = hints.decode()
                hints = hints.split('\n')

            hints = (
                '    {}{}{}'.format(ANSI['lightyellow'], line.strip(), ANSI['reset'])
                for line in list(hints)[:5] if line.strip()
            )

        docker_hints = ()
        if IN_DOCKER:
            docker_hints = (
                '    docker run -it -v $PWD/data:/data archivebox/archivebox /bin/bash',
            )

        # Collect and prefix output lines with indentation
        output_lines = [
            *hint_header,
            *hints,
            '{}Run to see full output:{}'.format(ANSI['lightred'], ANSI['reset']),
            *docker_hints,
            *(['    cd {};'.format(result.pwd)] if result.pwd else []),
            '    {}'.format(quoted_cmd),
        ]
        print('\n'.join(
            '        {}'.format(line)
            for line in output_lines
            if line
        ))
        print()

def log_list_started(filter_patterns: Optional[List[str]], filter_type: str):
    print('{green}[*] Finding links in the archive index matching these {} patterns:{reset}'.format(
        filter_type,
        **ANSI,
    ))
    print('    {}'.format(' '.join(filter_patterns or ())))

def log_list_finished(links):
    from .index.csv import links_to_csv
    print()
    print('---------------------------------------------------------------------------------------------------')
    print(links_to_csv(links, cols=['timestamp', 'is_archived', 'num_outputs', 'url'], header=True, ljust=16, separator=' | '))
    print('---------------------------------------------------------------------------------------------------')
    print()

def log_removal_started(links: List["Link"], yes: bool, delete: bool):
    print('{lightyellow}[i] Found {} matching URLs to remove.{reset}'.format(len(links), **ANSI))
    if delete:
        file_counts = [link.num_outputs for link in links if Path(link.link_dir).exists()]
        print(
            f'    {len(links)} Links will be de-listed from the main index, and their archived content folders will be deleted from disk.\n'
            f'    ({len(file_counts)} data folders with {sum(file_counts)} archived files will be deleted!)'
        )
    else:
        print(
            '    Matching links will be de-listed from the main index, but their archived content folders will remain in place on disk.\n'
            '    (Pass --delete if you also want to permanently delete the data folders)'
        )

    if not yes:
        print()
        print('{lightyellow}[?] Do you want to proceed with removing these {} links?{reset}'.format(len(links), **ANSI))
        try:
            assert input('    y/[n]: ').lower() == 'y'
        except (KeyboardInterrupt, EOFError, AssertionError):
            raise SystemExit(0)

def log_removal_finished(all_links: int, to_remove: int):
    if all_links == 0:
        print()
        print('{red}[X] No matching links found.{reset}'.format(**ANSI))
    else:
        print()
        print('{red}[√] Removed {} out of {} links from the archive index.{reset}'.format(
            to_remove,
            all_links,
            **ANSI,
        ))
        print('    Index now contains {} links.'.format(all_links - to_remove))

def log_shell_welcome_msg():
    from .cli import CLI_SUBCOMMANDS

    print('{green}# ArchiveBox Imports{reset}'.format(**ANSI))
    print('{green}from core.models import Snapshot, ArchiveResult, Tag, User{reset}'.format(**ANSI))
    print('{green}from cli import *\n  {}{reset}'.format("\n  ".join(CLI_SUBCOMMANDS.keys()), **ANSI))
    print()
    print('[i] Welcome to the ArchiveBox Shell!')
    print('    https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#Shell-Usage')
    print()
    print('    {lightred}Hint:{reset} Example use:'.format(**ANSI))
    print('        print(Snapshot.objects.filter(is_archived=True).count())')
    print('        Snapshot.objects.get(url="https://example.com").as_json()')
    print('        add("https://example.com/some/new/url")')

### Helpers

@enforce_types
def pretty_path(path: Union[Path, str], pwd: Union[Path, str]=OUTPUT_DIR) -> str:
    """convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc"""
    pwd = str(Path(pwd))  # .resolve()
    path = str(path)

    if not path:
        return path

    # replace long absolute paths with ./ relative ones to save on terminal output width
    if path.startswith(pwd) and (pwd != '/'):
        path = path.replace(pwd, '.', 1)

    # quote paths containing spaces
    if ' ' in path:
        path = f'"{path}"'

    # if path is just a plain dot, replace it back with the absolute path for clarity
    if path == '.':
        path = pwd

    return path
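
# Example (illustrative only, assuming OUTPUT_DIR is '/home/user/archive'):
#
#   pretty_path('/home/user/archive/archive/1478739709')   # -> './archive/1478739709'
#   pretty_path('/somewhere else/with spaces')             # -> '"/somewhere else/with spaces"'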

@enforce_types
def printable_filesize(num_bytes: Union[int, float]) -> str:
    for count in ['Bytes','KB','MB','GB']:
        if num_bytes > -1024.0 and num_bytes < 1024.0:
            return '%3.1f %s' % (num_bytes, count)
        num_bytes /= 1024.0
    return '%3.1f %s' % (num_bytes, 'TB')
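
# Example (illustrative only): each loop iteration divides by 1024 until the value
# fits within ±1024 of the current unit, so:
#
#   printable_filesize(512)          # -> '512.0 Bytes'
#   printable_filesize(1536)         # -> '1.5 KB'
#   printable_filesize(1024 ** 3)    # -> '1.0 GB'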

@enforce_types
def printable_folders(folders: Dict[str, Optional["Link"]],
                      with_headers: bool=False) -> str:
    return '\n'.join(
        f'{folder} {link and link.url} "{link and link.title}"'
        for folder, link in folders.items()
    )

@enforce_types
def printable_config(config: ConfigDict, prefix: str='') -> str:
    return f'\n{prefix}'.join(
        f'{key}={val}'
        for key, val in config.items()
        if not (isinstance(val, dict) or callable(val))
    )
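
# Example (illustrative only): printable_config flattens a config mapping into KEY=value
# lines, skipping nested dicts and callables. Roughly:
#
#   printable_config({'TIMEOUT': 60, 'USE_COLOR': True})   # -> 'TIMEOUT=60\nUSE_COLOR=True'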

@enforce_types
def printable_folder_status(name: str, folder: Dict) -> str:
    if folder['enabled']:
        if folder['is_valid']:
            color, symbol, note, num_files = 'green', '√', 'valid', ''
        else:
            color, symbol, note, num_files = 'red', 'X', 'invalid', '?'
    else:
        color, symbol, note, num_files = 'lightyellow', '-', 'disabled', '-'

    if folder['path']:
        if Path(folder['path']).exists():
            num_files = (
                f'{len(os.listdir(folder["path"]))} files'
                if Path(folder['path']).is_dir() else
                printable_filesize(Path(folder['path']).stat().st_size)
            )
        else:
            num_files = 'missing'

        if folder.get('is_mount'):
            # add symbol @ next to filecount if path is a remote filesystem mount
            num_files = f'{num_files} @' if num_files else '@'

    path = pretty_path(folder['path'])

    return ' '.join((
        ANSI[color],
        symbol,
        ANSI['reset'],
        name.ljust(21),
        num_files.ljust(14),
        ANSI[color],
        note.ljust(8),
        ANSI['reset'],
        path.ljust(76),
    ))

@enforce_types
def printable_dependency_version(name: str, dependency: Dict) -> str:
    color, symbol, note, version = 'red', 'X', 'invalid', '?'

    if dependency['enabled']:
        if dependency['is_valid']:
            color, symbol, note = 'green', '√', 'valid'

            parsed_version_num = re.search(r'[\d\.]+', dependency['version'])
            if parsed_version_num:
                version = f'v{parsed_version_num[0]}'
    else:
        color, symbol, note, version = 'lightyellow', '-', 'disabled', '-'

    path = pretty_path(dependency['path'])

    return ' '.join((
        ANSI[color],
        symbol,
        ANSI['reset'],
        name.ljust(21),
        version.ljust(14),
        ANSI[color],
        note.ljust(8),
        ANSI['reset'],
        path.ljust(76),
    ))