  1. #!/usr/bin/env python3
  2. """
  3. Archive a URL using wget.
  4. Usage: on_Snapshot__06_wget.bg.py --url=<url> --snapshot-id=<uuid>
  5. Output: Downloads files to $PWD
  6. Environment variables:
  7. WGET_ENABLED: Enable wget archiving (default: True)
  8. WGET_WARC_ENABLED: Save WARC file (default: True)
  9. WGET_BINARY: Path to wget binary (default: wget)
  10. WGET_TIMEOUT: Timeout in seconds (x-fallback: TIMEOUT)
  11. WGET_USER_AGENT: User agent string (x-fallback: USER_AGENT)
  12. WGET_COOKIES_FILE: Path to cookies file (x-fallback: COOKIES_FILE)
  13. WGET_CHECK_SSL_VALIDITY: Whether to check SSL certificates (x-fallback: CHECK_SSL_VALIDITY)
  14. WGET_ARGS: Default wget arguments (JSON array)
  15. WGET_ARGS_EXTRA: Extra arguments to append (JSON array)
  16. """
  17. import json
  18. import os
  19. import re
  20. import subprocess
  21. import sys
  22. from datetime import datetime, timezone
  23. from pathlib import Path
  24. import rich_click as click
  25. # Extractor metadata
  26. PLUGIN_NAME = 'wget'
  27. BIN_NAME = 'wget'
  28. BIN_PROVIDERS = 'apt,brew,env'
  29. OUTPUT_DIR = '.'
  30. def get_env(name: str, default: str = '') -> str:
  31. return os.environ.get(name, default).strip()
  32. def get_env_bool(name: str, default: bool = False) -> bool:
  33. val = get_env(name, '').lower()
  34. if val in ('true', '1', 'yes', 'on'):
  35. return True
  36. if val in ('false', '0', 'no', 'off'):
  37. return False
  38. return default
  39. def get_env_int(name: str, default: int = 0) -> int:
  40. try:
  41. return int(get_env(name, str(default)))
  42. except ValueError:
  43. return default
  44. def get_env_array(name: str, default: list[str] | None = None) -> list[str]:
  45. """Parse a JSON array from environment variable."""
  46. val = get_env(name, '')
  47. if not val:
  48. return default if default is not None else []
  49. try:
  50. result = json.loads(val)
  51. if isinstance(result, list):
  52. return [str(item) for item in result]
  53. return default if default is not None else []
  54. except json.JSONDecodeError:
  55. return default if default is not None else []
  56. STATICFILE_DIR = '../staticfile'
  57. def has_staticfile_output() -> bool:
  58. """Check if staticfile extractor already downloaded this URL."""
  59. staticfile_dir = Path(STATICFILE_DIR)
  60. if not staticfile_dir.exists():
  61. return False
  62. stdout_log = staticfile_dir / 'stdout.log'
  63. if not stdout_log.exists():
  64. return False
  65. for line in stdout_log.read_text(errors='ignore').splitlines():
  66. line = line.strip()
  67. if not line.startswith('{'):
  68. continue
  69. try:
  70. record = json.loads(line)
  71. except json.JSONDecodeError:
  72. continue
  73. if record.get('type') == 'ArchiveResult' and record.get('status') == 'succeeded':
  74. return True
  75. return False
  76. def save_wget(url: str, binary: str) -> tuple[bool, str | None, str]:
  77. """
  78. Archive URL using wget.
  79. Returns: (success, output_path, error_message)
  80. """
  81. # Get config from env (with WGET_ prefix, x-fallback handled by config loader)
  82. timeout = get_env_int('WGET_TIMEOUT') or get_env_int('TIMEOUT', 60)
  83. user_agent = get_env('WGET_USER_AGENT') or get_env('USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)')
  84. check_ssl = get_env_bool('WGET_CHECK_SSL_VALIDITY', True) if get_env('WGET_CHECK_SSL_VALIDITY') else get_env_bool('CHECK_SSL_VALIDITY', True)
  85. cookies_file = get_env('WGET_COOKIES_FILE') or get_env('COOKIES_FILE', '')
  86. wget_args = get_env_array('WGET_ARGS', [])
  87. wget_args_extra = get_env_array('WGET_ARGS_EXTRA', [])
  88. # Feature toggles
  89. warc_enabled = get_env_bool('WGET_WARC_ENABLED', True)
  90. # Build wget command (later options take precedence)
  91. cmd = [
  92. binary,
  93. *wget_args,
  94. f'--timeout={timeout}',
  95. ]
  96. if user_agent:
  97. cmd.append(f'--user-agent={user_agent}')
  98. if warc_enabled:
  99. warc_dir = Path('warc')
  100. warc_dir.mkdir(exist_ok=True)
  101. warc_path = warc_dir / str(int(datetime.now(timezone.utc).timestamp()))
  102. cmd.append(f'--warc-file={warc_path}')
  103. else:
  104. cmd.append('--timestamping')
  105. if cookies_file and Path(cookies_file).is_file():
  106. cmd.extend(['--load-cookies', cookies_file])
  107. if not check_ssl:
  108. cmd.extend(['--no-check-certificate', '--no-hsts'])
  109. if wget_args_extra:
  110. cmd.extend(wget_args_extra)
  111. cmd.append(url)
  112. # Run wget
  113. try:
  114. result = subprocess.run(
  115. cmd,
  116. capture_output=True,
  117. text=True,
  118. timeout=timeout * 2, # Allow extra time for large downloads
  119. )
  120. # Find downloaded files
  121. downloaded_files = [
  122. f for f in Path('.').rglob('*')
  123. if f.is_file() and f.name != '.gitkeep' and not str(f).startswith('warc/')
  124. ]
  125. if not downloaded_files:
  126. if result.returncode != 0:
  127. return False, None, f'wget failed (exit={result.returncode})'
  128. return False, None, 'No files downloaded'
  129. # Find main HTML file
  130. html_files = [
  131. f for f in downloaded_files
  132. if re.search(r'\.[Ss]?[Hh][Tt][Mm][Ll]?$', str(f))
  133. ]
  134. output_path = str(html_files[0]) if html_files else str(downloaded_files[0])
  135. # Parse download stats from wget output
  136. stderr_text = (result.stderr or '')
  137. output_tail = stderr_text.strip().split('\n')[-3:] if stderr_text else []
  138. files_count = len(downloaded_files)
  139. return True, output_path, ''
  140. except subprocess.TimeoutExpired:
  141. return False, None, f'Timed out after {timeout * 2} seconds'
  142. except Exception as e:
  143. return False, None, f'{type(e).__name__}: {e}'
  144. @click.command()
  145. @click.option('--url', required=True, help='URL to archive')
  146. @click.option('--snapshot-id', required=True, help='Snapshot UUID')
  147. def main(url: str, snapshot_id: str):
  148. """Archive a URL using wget."""
  149. output = None
  150. status = 'failed'
  151. error = ''
  152. try:
  153. # Check if wget is enabled
  154. if not get_env_bool('WGET_ENABLED', True):
  155. print('Skipping wget (WGET_ENABLED=False)', file=sys.stderr)
  156. # Temporary failure (config disabled) - NO JSONL emission
  157. sys.exit(0)
  158. # Check if staticfile extractor already handled this (permanent skip)
  159. if has_staticfile_output():
  160. print('Skipping wget - staticfile extractor already downloaded this', file=sys.stderr)
  161. print(json.dumps({'type': 'ArchiveResult', 'status': 'skipped', 'output_str': 'staticfile already exists'}))
  162. sys.exit(0)
  163. # Get binary from environment
  164. binary = get_env('WGET_BINARY', 'wget')
  165. # Run extraction
  166. success, output, error = save_wget(url, binary)
  167. if success:
  168. # Success - emit ArchiveResult
  169. result = {
  170. 'type': 'ArchiveResult',
  171. 'status': 'succeeded',
  172. 'output_str': output or ''
  173. }
  174. print(json.dumps(result))
  175. sys.exit(0)
  176. else:
  177. # Transient error - emit NO JSONL
  178. print(f'ERROR: {error}', file=sys.stderr)
  179. sys.exit(1)
  180. except Exception as e:
  181. # Transient error - emit NO JSONL
  182. print(f'ERROR: {type(e).__name__}: {e}', file=sys.stderr)
  183. sys.exit(1)
  184. if __name__ == '__main__':
  185. main()