#!/usr/bin/env python3
"""
Archive a URL using wget.

Usage: on_Snapshot__06_wget.bg.py --url=<url> --snapshot-id=<uuid>
Output: Downloads files to $PWD

Environment variables:
    WGET_ENABLED: Enable wget archiving (default: True)
    WGET_WARC_ENABLED: Save WARC file (default: True)
    WGET_BINARY: Path to wget binary (default: wget)
    WGET_TIMEOUT: Timeout in seconds (x-fallback: TIMEOUT)
    WGET_USER_AGENT: User agent string (x-fallback: USER_AGENT)
    WGET_COOKIES_FILE: Path to cookies file (x-fallback: COOKIES_FILE)
    WGET_CHECK_SSL_VALIDITY: Whether to check SSL certificates (x-fallback: CHECK_SSL_VALIDITY)
    WGET_ARGS: Default wget arguments (JSON array)
    WGET_ARGS_EXTRA: Extra arguments to append (JSON array)
"""
import json
import os
import re
import subprocess
import sys
from datetime import datetime, timezone
from pathlib import Path

import rich_click as click

# Extractor metadata
PLUGIN_NAME = 'wget'
BIN_NAME = 'wget'
BIN_PROVIDERS = 'apt,brew,env'
OUTPUT_DIR = '.'

def get_env(name: str, default: str = '') -> str:
    return os.environ.get(name, default).strip()


def get_env_bool(name: str, default: bool = False) -> bool:
    val = get_env(name, '').lower()
    if val in ('true', '1', 'yes', 'on'):
        return True
    if val in ('false', '0', 'no', 'off'):
        return False
    return default


def get_env_int(name: str, default: int = 0) -> int:
    try:
        return int(get_env(name, str(default)))
    except ValueError:
        return default


def get_env_array(name: str, default: list[str] | None = None) -> list[str]:
    """Parse a JSON array from an environment variable."""
    val = get_env(name, '')
    if not val:
        return default if default is not None else []
    try:
        result = json.loads(val)
        if isinstance(result, list):
            return [str(item) for item in result]
        return default if default is not None else []
    except json.JSONDecodeError:
        return default if default is not None else []

STATICFILE_DIR = '../staticfile'


def has_staticfile_output() -> bool:
    """Check if the staticfile extractor already downloaded this URL."""
    staticfile_dir = Path(STATICFILE_DIR)
    if not staticfile_dir.exists():
        return False
    stdout_log = staticfile_dir / 'stdout.log'
    if not stdout_log.exists():
        return False
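    # Scan the JSONL log for a successful ArchiveResult record, i.e. a line like
    # (illustrative): {"type": "ArchiveResult", "status": "succeeded", ...}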
    for line in stdout_log.read_text(errors='ignore').splitlines():
        line = line.strip()
        if not line.startswith('{'):
            continue
        try:
            record = json.loads(line)
        except json.JSONDecodeError:
            continue
        if record.get('type') == 'ArchiveResult' and record.get('status') == 'succeeded':
            return True
    return False

def save_wget(url: str, binary: str) -> tuple[bool, str | None, str]:
    """
    Archive URL using wget.

    Returns: (success, output_path, error_message)
    """
    # Get config from env (WGET_-prefixed vars take precedence, with x-fallback
    # to the generic names)
    timeout = get_env_int('WGET_TIMEOUT') or get_env_int('TIMEOUT', 60)
    user_agent = get_env('WGET_USER_AGENT') or get_env('USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)')
    check_ssl = get_env_bool('WGET_CHECK_SSL_VALIDITY', get_env_bool('CHECK_SSL_VALIDITY', True))
    cookies_file = get_env('WGET_COOKIES_FILE') or get_env('COOKIES_FILE', '')
    wget_args = get_env_array('WGET_ARGS', [])
    wget_args_extra = get_env_array('WGET_ARGS_EXTRA', [])
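    # e.g. WGET_ARGS='["-e", "robots=off"]' in the environment yields
    # wget_args == ['-e', 'robots=off'] (flags shown are illustrative)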

    # Feature toggles
    warc_enabled = get_env_bool('WGET_WARC_ENABLED', True)

    # Build wget command (later options take precedence)
    cmd = [
        binary,
        *wget_args,
        f'--timeout={timeout}',
    ]
    if user_agent:
        cmd.append(f'--user-agent={user_agent}')
    if warc_enabled:
        warc_dir = Path('warc')
        warc_dir.mkdir(exist_ok=True)
        warc_path = warc_dir / str(int(datetime.now(timezone.utc).timestamp()))
        cmd.append(f'--warc-file={warc_path}')
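        # wget treats --warc-file as a filename prefix and appends .warc.gz itself
        # (assuming WARC compression is left at its default)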
    else:
        cmd.append('--timestamping')
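    # Note: wget refuses to combine --timestamping with --warc-file, which is
    # why only one of the two branches above adds its flag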
    if cookies_file and Path(cookies_file).is_file():
        cmd.extend(['--load-cookies', cookies_file])
    if not check_ssl:
        cmd.extend(['--no-check-certificate', '--no-hsts'])
    if wget_args_extra:
        cmd.extend(wget_args_extra)
    cmd.append(url)
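    # Example final command with all defaults (illustrative):
    #   wget --timeout=60 '--user-agent=Mozilla/5.0 (compatible; ArchiveBox/1.0)' \
    #       --warc-file=warc/1700000000 https://example.com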

    # Run wget
    try:
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=timeout * 2,  # Allow extra time for large downloads
        )

        # Find downloaded files
        downloaded_files = [
            f for f in Path('.').rglob('*')
            if f.is_file() and f.name != '.gitkeep' and not str(f).startswith('warc/')
        ]
        if not downloaded_files:
            if result.returncode != 0:
                return False, None, f'wget failed (exit={result.returncode})'
            return False, None, 'No files downloaded'

        # Find main HTML file
        html_files = [
            f for f in downloaded_files
            if re.search(r'\.s?html?$', f.name, re.IGNORECASE)
        ]
        output_path = str(html_files[0]) if html_files else str(downloaded_files[0])
        # Log the tail of wget's output and the file count for debugging
        stderr_text = result.stderr or ''
        if stderr_text:
            print('\n'.join(stderr_text.strip().split('\n')[-3:]), file=sys.stderr)
        print(f'wget downloaded {len(downloaded_files)} file(s)', file=sys.stderr)

        return True, output_path, ''
    except subprocess.TimeoutExpired:
        return False, None, f'Timed out after {timeout * 2} seconds'
    except Exception as e:
        return False, None, f'{type(e).__name__}: {e}'

@click.command()
@click.option('--url', required=True, help='URL to archive')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Archive a URL using wget."""
    try:
        # Check if wget is enabled
        if not get_env_bool('WGET_ENABLED', True):
            print('Skipping wget (WGET_ENABLED=False)', file=sys.stderr)
            # Disabled by config: exit 0 without emitting a JSONL record
            sys.exit(0)

        # Check if the staticfile extractor already handled this URL (permanent skip)
        if has_staticfile_output():
            print('Skipping wget - staticfile extractor already downloaded this', file=sys.stderr)
            print(json.dumps({'type': 'ArchiveResult', 'status': 'skipped', 'output_str': 'staticfile already exists'}))
            sys.exit(0)
        # Get binary from environment
        binary = get_env('WGET_BINARY', 'wget')

        # Run extraction
        success, output, error = save_wget(url, binary)

        if success:
            # Success - emit ArchiveResult
            result = {
                'type': 'ArchiveResult',
                'status': 'succeeded',
                'output_str': output or '',
            }
            print(json.dumps(result))
            sys.exit(0)
        else:
            # Transient error - emit no JSONL so the run can be retried
            print(f'ERROR: {error}', file=sys.stderr)
            sys.exit(1)
    except Exception as e:
        # Transient error - emit no JSONL so the run can be retried
        print(f'ERROR: {type(e).__name__}: {e}', file=sys.stderr)
        sys.exit(1)


if __name__ == '__main__':
    main()