on_Snapshot__50_singlefile.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397
  1. #!/usr/bin/env python3
  2. """
  3. Archive a URL using SingleFile.
  4. Usage: on_Snapshot__singlefile.py --url=<url> --snapshot-id=<uuid>
  5. Output: Writes singlefile.html to $PWD
  6. Environment variables:
  7. SINGLEFILE_ENABLED: Enable SingleFile archiving (default: True)
  8. SINGLEFILE_BINARY: Path to SingleFile binary (default: single-file)
  9. SINGLEFILE_NODE_BINARY: Path to Node.js binary (x-fallback: NODE_BINARY)
  10. SINGLEFILE_CHROME_BINARY: Path to Chrome binary (x-fallback: CHROME_BINARY) [unused; shared Chrome session required]
  11. SINGLEFILE_TIMEOUT: Timeout in seconds (x-fallback: TIMEOUT)
  12. SINGLEFILE_USER_AGENT: User agent string (x-fallback: USER_AGENT)
  13. SINGLEFILE_COOKIES_FILE: Path to cookies file (x-fallback: COOKIES_FILE)
  14. SINGLEFILE_CHECK_SSL_VALIDITY: Whether to verify SSL certs (x-fallback: CHECK_SSL_VALIDITY)
  15. SINGLEFILE_CHROME_ARGS: Chrome command-line arguments (x-fallback: CHROME_ARGS) [unused; shared Chrome session required]
  16. SINGLEFILE_ARGS: Default SingleFile arguments (JSON array)
  17. SINGLEFILE_ARGS_EXTRA: Extra arguments to append (JSON array)
  18. """
  19. import json
  20. import os
  21. import subprocess
  22. import sys
  23. import threading
  24. import time
  25. from urllib.request import urlopen
  26. from pathlib import Path
  27. import shutil
  28. import rich_click as click
# Extractor metadata
PLUGIN_NAME = 'singlefile'
# Executable name; same value the module docstring gives as the SINGLEFILE_BINARY default.
BIN_NAME = 'single-file'
# NOTE(review): presumably the comma-separated sources the binary may be resolved from -- confirm against the hook runner.
BIN_PROVIDERS = 'npm,env'
# Output is written relative to the current working directory.
OUTPUT_DIR = '.'
OUTPUT_FILE = 'singlefile.html'
# Node helper script located next to this file; run by save_singlefile_with_extension().
EXTENSION_SAVE_SCRIPT = Path(__file__).parent / 'singlefile_extension_save.js'
  36. def get_env(name: str, default: str = '') -> str:
  37. return os.environ.get(name, default).strip()
  38. def get_env_bool(name: str, default: bool = False) -> bool:
  39. val = get_env(name, '').lower()
  40. if val in ('true', '1', 'yes', 'on'):
  41. return True
  42. if val in ('false', '0', 'no', 'off'):
  43. return False
  44. return default
  45. def get_env_int(name: str, default: int = 0) -> int:
  46. try:
  47. return int(get_env(name, str(default)))
  48. except ValueError:
  49. return default
  50. def get_env_array(name: str, default: list[str] | None = None) -> list[str]:
  51. """Parse a JSON array from environment variable."""
  52. val = get_env(name, '')
  53. if not val:
  54. return default if default is not None else []
  55. try:
  56. result = json.loads(val)
  57. if isinstance(result, list):
  58. return [str(item) for item in result]
  59. return default if default is not None else []
  60. except json.JSONDecodeError:
  61. return default if default is not None else []
  62. STATICFILE_DIR = '../staticfile'
  63. def has_staticfile_output() -> bool:
  64. """Check if staticfile extractor already downloaded this URL."""
  65. staticfile_dir = Path(STATICFILE_DIR)
  66. if not staticfile_dir.exists():
  67. return False
  68. stdout_log = staticfile_dir / 'stdout.log'
  69. if not stdout_log.exists():
  70. return False
  71. for line in stdout_log.read_text(errors='ignore').splitlines():
  72. line = line.strip()
  73. if not line.startswith('{'):
  74. continue
  75. try:
  76. record = json.loads(line)
  77. except json.JSONDecodeError:
  78. continue
  79. if record.get('type') == 'ArchiveResult' and record.get('status') == 'succeeded':
  80. return True
  81. return False
  82. # Chrome session directory (relative to extractor output dir)
  83. # Note: Chrome binary is obtained via CHROME_BINARY env var, not searched for.
  84. # The centralized Chrome binary search is in chrome_utils.js findChromium().
  85. CHROME_SESSION_DIR = '../chrome'
  86. def get_cdp_url(wait_seconds: float = 0.0) -> str | None:
  87. """Get CDP URL from chrome plugin if available."""
  88. cdp_file = Path(CHROME_SESSION_DIR) / 'cdp_url.txt'
  89. deadline = time.time() + max(wait_seconds, 0.0)
  90. while True:
  91. if cdp_file.exists():
  92. cdp_url = cdp_file.read_text().strip()
  93. return cdp_url or None
  94. if time.time() >= deadline:
  95. return None
  96. time.sleep(0.2)
  97. def get_port_from_cdp_url(cdp_url: str) -> str | None:
  98. """Extract port from CDP WebSocket URL (ws://127.0.0.1:PORT/...)."""
  99. import re
  100. match = re.search(r':(\d+)/', cdp_url)
  101. if match:
  102. return match.group(1)
  103. return None
  104. def is_cdp_server_available(cdp_remote_url: str) -> bool:
  105. try:
  106. with urlopen(f'{cdp_remote_url}/json/version', timeout=1) as resp:
  107. return resp.status == 200
  108. except Exception:
  109. return False
  110. def save_singlefile(url: str, binary: str) -> tuple[bool, str | None, str]:
  111. """
  112. Archive URL using SingleFile.
  113. Requires a Chrome session (from chrome plugin) and connects to it via CDP.
  114. Returns: (success, output_path, error_message)
  115. """
  116. print(f'[singlefile] CLI mode start url={url}', file=sys.stderr)
  117. # Get config from env (with SINGLEFILE_ prefix, x-fallback handled by config loader)
  118. timeout = get_env_int('SINGLEFILE_TIMEOUT') or get_env_int('TIMEOUT', 120)
  119. user_agent = get_env('SINGLEFILE_USER_AGENT') or get_env('USER_AGENT', '')
  120. check_ssl = get_env_bool('SINGLEFILE_CHECK_SSL_VALIDITY', True) if get_env('SINGLEFILE_CHECK_SSL_VALIDITY') else get_env_bool('CHECK_SSL_VALIDITY', True)
  121. cookies_file = get_env('SINGLEFILE_COOKIES_FILE') or get_env('COOKIES_FILE', '')
  122. singlefile_args = get_env_array('SINGLEFILE_ARGS', [])
  123. singlefile_args_extra = get_env_array('SINGLEFILE_ARGS_EXTRA', [])
  124. # Chrome args/binary are intentionally ignored because we require a shared Chrome session
  125. cmd = [binary, *singlefile_args]
  126. # Try to use existing Chrome session via CDP (prefer HTTP base URL)
  127. cdp_wait = min(10, max(1, timeout // 10))
  128. cdp_url = get_cdp_url(wait_seconds=cdp_wait)
  129. cdp_remote_url = None
  130. if cdp_url:
  131. if cdp_url.startswith(('http://', 'https://')):
  132. cdp_remote_url = cdp_url
  133. else:
  134. port = get_port_from_cdp_url(cdp_url)
  135. if port:
  136. cdp_remote_url = f'http://127.0.0.1:{port}'
  137. else:
  138. cdp_remote_url = cdp_url
  139. if cdp_remote_url and not is_cdp_server_available(cdp_remote_url):
  140. cdp_remote_url = None
  141. if cdp_remote_url:
  142. print(f'[singlefile] Using existing Chrome session: {cdp_remote_url}', file=sys.stderr)
  143. cmd.extend(['--browser-server', cdp_remote_url])
  144. else:
  145. return False, None, 'No Chrome session found (chrome plugin must run first)'
  146. # SSL handling
  147. if not check_ssl:
  148. cmd.append('--browser-ignore-insecure-certs')
  149. if user_agent:
  150. cmd.extend(['--user-agent', user_agent])
  151. if cookies_file and Path(cookies_file).is_file():
  152. cmd.extend(['--browser-cookies-file', cookies_file])
  153. # Add extra args from config
  154. if singlefile_args_extra:
  155. cmd.extend(singlefile_args_extra)
  156. # Output directory is current directory (hook already runs in output dir)
  157. output_dir = Path(OUTPUT_DIR)
  158. output_path = output_dir / OUTPUT_FILE
  159. cmd.extend([url, str(output_path)])
  160. print(f'[singlefile] CLI command: {" ".join(cmd[:6])} ...', file=sys.stderr)
  161. try:
  162. output_lines: list[str] = []
  163. process = subprocess.Popen(
  164. cmd,
  165. stdout=subprocess.PIPE,
  166. stderr=subprocess.STDOUT,
  167. text=True,
  168. bufsize=1,
  169. )
  170. def _read_output() -> None:
  171. if not process.stdout:
  172. return
  173. for line in process.stdout:
  174. output_lines.append(line)
  175. sys.stderr.write(line)
  176. reader = threading.Thread(target=_read_output, daemon=True)
  177. reader.start()
  178. try:
  179. process.wait(timeout=timeout)
  180. except subprocess.TimeoutExpired:
  181. process.kill()
  182. reader.join(timeout=1)
  183. return False, None, f'Timed out after {timeout} seconds'
  184. reader.join(timeout=1)
  185. combined_output = ''.join(output_lines)
  186. if output_path.exists() and output_path.stat().st_size > 0:
  187. return True, str(output_path), ''
  188. else:
  189. stderr = combined_output
  190. if 'ERR_NAME_NOT_RESOLVED' in stderr:
  191. return False, None, 'DNS resolution failed'
  192. if 'ERR_CONNECTION_REFUSED' in stderr:
  193. return False, None, 'Connection refused'
  194. detail = (stderr or '').strip()
  195. if len(detail) > 2000:
  196. detail = detail[:2000]
  197. cmd_preview = list(cmd)
  198. if '--browser-args' in cmd_preview:
  199. idx = cmd_preview.index('--browser-args')
  200. if idx + 1 < len(cmd_preview):
  201. cmd_preview[idx + 1] = '<json>'
  202. cmd_str = ' '.join(cmd_preview)
  203. return False, None, f'SingleFile failed (cmd={cmd_str}): {detail}'
  204. except subprocess.TimeoutExpired:
  205. return False, None, f'Timed out after {timeout} seconds'
  206. except Exception as e:
  207. return False, None, f'{type(e).__name__}: {e}'
  208. def save_singlefile_with_extension(url: str, timeout: int) -> tuple[bool, str | None, str]:
  209. """Save using the SingleFile Chrome extension via existing Chrome session."""
  210. print(f'[singlefile] Extension mode start url={url}', file=sys.stderr)
  211. # Only attempt if chrome session exists
  212. cdp_url = get_cdp_url(wait_seconds=min(5, max(1, timeout // 10)))
  213. if not cdp_url:
  214. print('[singlefile] No Chrome session found (chrome plugin must run first)', file=sys.stderr)
  215. return False, None, 'No Chrome session found (chrome plugin must run first)'
  216. if not EXTENSION_SAVE_SCRIPT.exists():
  217. print(f'[singlefile] Missing helper script: {EXTENSION_SAVE_SCRIPT}', file=sys.stderr)
  218. return False, None, 'SingleFile extension helper script missing'
  219. node_binary = get_env('SINGLEFILE_NODE_BINARY') or get_env('NODE_BINARY', 'node')
  220. downloads_dir = get_env('CHROME_DOWNLOADS_DIR', '')
  221. extensions_dir = get_env('CHROME_EXTENSIONS_DIR', '')
  222. cmd = [node_binary, str(EXTENSION_SAVE_SCRIPT), f'--url={url}']
  223. print(f'[singlefile] cdp_url={cdp_url}', file=sys.stderr)
  224. print(f'[singlefile] node={node_binary}', file=sys.stderr)
  225. node_resolved = shutil.which(node_binary) if node_binary else None
  226. print(f'[singlefile] node_resolved={node_resolved}', file=sys.stderr)
  227. print(f'[singlefile] PATH={os.environ.get("PATH","")}', file=sys.stderr)
  228. if downloads_dir:
  229. print(f'[singlefile] CHROME_DOWNLOADS_DIR={downloads_dir}', file=sys.stderr)
  230. if extensions_dir:
  231. print(f'[singlefile] CHROME_EXTENSIONS_DIR={extensions_dir}', file=sys.stderr)
  232. print(f'[singlefile] helper_cmd={" ".join(cmd)}', file=sys.stderr)
  233. try:
  234. output_lines: list[str] = []
  235. error_lines: list[str] = []
  236. process = subprocess.Popen(
  237. cmd,
  238. stdout=subprocess.PIPE,
  239. stderr=subprocess.PIPE,
  240. text=True,
  241. bufsize=1,
  242. )
  243. def _read_stream(stream, sink, label: str) -> None:
  244. if not stream:
  245. return
  246. for line in stream:
  247. sink.append(line)
  248. sys.stderr.write(line)
  249. sys.stderr.flush()
  250. stdout_thread = threading.Thread(target=_read_stream, args=(process.stdout, output_lines, 'stdout'), daemon=True)
  251. stderr_thread = threading.Thread(target=_read_stream, args=(process.stderr, error_lines, 'stderr'), daemon=True)
  252. stdout_thread.start()
  253. stderr_thread.start()
  254. try:
  255. process.wait(timeout=timeout)
  256. except subprocess.TimeoutExpired:
  257. process.kill()
  258. stdout_thread.join(timeout=1)
  259. stderr_thread.join(timeout=1)
  260. print(f'[singlefile] Extension helper timed out after {timeout}s', file=sys.stderr)
  261. return False, None, f'Timed out after {timeout} seconds'
  262. stdout_thread.join(timeout=1)
  263. stderr_thread.join(timeout=1)
  264. result_stdout = ''.join(output_lines).encode('utf-8', errors='replace')
  265. result_stderr = ''.join(error_lines).encode('utf-8', errors='replace')
  266. result_returncode = process.returncode
  267. except Exception as e:
  268. print(f'[singlefile] Extension helper error: {type(e).__name__}: {e}', file=sys.stderr)
  269. return False, None, f'{type(e).__name__}: {e}'
  270. print(f'[singlefile] helper_returncode={result_returncode}', file=sys.stderr)
  271. print(f'[singlefile] helper_stdout_len={len(result_stdout or b"")}', file=sys.stderr)
  272. print(f'[singlefile] helper_stderr_len={len(result_stderr or b"")}', file=sys.stderr)
  273. if result_returncode == 0:
  274. # Prefer explicit stdout path, fallback to local output file
  275. out_text = result_stdout.decode('utf-8', errors='replace').strip()
  276. if out_text and Path(out_text).exists():
  277. print(f'[singlefile] Extension output: {out_text}', file=sys.stderr)
  278. return True, out_text, ''
  279. output_path = Path(OUTPUT_DIR) / OUTPUT_FILE
  280. if output_path.exists() and output_path.stat().st_size > 0:
  281. print(f'[singlefile] Extension output: {output_path}', file=sys.stderr)
  282. return True, str(output_path), ''
  283. return False, None, 'SingleFile extension completed but no output file found'
  284. stderr = result_stderr.decode('utf-8', errors='replace').strip()
  285. stdout = result_stdout.decode('utf-8', errors='replace').strip()
  286. detail = stderr or stdout
  287. return False, None, detail or 'SingleFile extension failed'
  288. @click.command()
  289. @click.option('--url', required=True, help='URL to archive')
  290. @click.option('--snapshot-id', required=True, help='Snapshot UUID')
  291. def main(url: str, snapshot_id: str):
  292. """Archive a URL using SingleFile."""
  293. print(f'[singlefile] Hook starting pid={os.getpid()} url={url}', file=sys.stderr)
  294. output = None
  295. status = 'failed'
  296. error = ''
  297. try:
  298. # Check if SingleFile is enabled
  299. if not get_env_bool('SINGLEFILE_ENABLED', True):
  300. print('Skipping SingleFile (SINGLEFILE_ENABLED=False)', file=sys.stderr)
  301. # Feature disabled - no ArchiveResult, just exit
  302. sys.exit(0)
  303. # Check if staticfile extractor already handled this (permanent skip)
  304. if has_staticfile_output():
  305. print('Skipping SingleFile - staticfile extractor already downloaded this', file=sys.stderr)
  306. print(json.dumps({'type': 'ArchiveResult', 'status': 'skipped', 'output_str': 'staticfile already exists'}))
  307. sys.exit(0)
  308. # Prefer SingleFile extension via existing Chrome session
  309. timeout = get_env_int('SINGLEFILE_TIMEOUT') or get_env_int('TIMEOUT', 120)
  310. success, output, error = save_singlefile_with_extension(url, timeout)
  311. status = 'succeeded' if success else 'failed'
  312. except Exception as e:
  313. error = f'{type(e).__name__}: {e}'
  314. status = 'failed'
  315. if error:
  316. print(f'ERROR: {error}', file=sys.stderr)
  317. # Output clean JSONL (no RESULT_JSON= prefix)
  318. result = {
  319. 'type': 'ArchiveResult',
  320. 'status': status,
  321. 'output_str': output or error or '',
  322. }
  323. print(json.dumps(result))
  324. sys.exit(0 if status == 'succeeded' else 1)
# Run the click command only when this file is executed as a script.
if __name__ == '__main__':
    main()