hooks.py 43 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260
  1. """
  2. Hook discovery and execution system for ArchiveBox plugins.
  3. Hooks are standalone scripts that run as separate processes and communicate
  4. with ArchiveBox via CLI arguments and stdout JSON output. This keeps the plugin
  5. system simple and language-agnostic.
  6. Directory structure:
  7. archivebox/plugins/<plugin_name>/on_<Event>__<hook_name>.<ext> (built-in)
  8. data/plugins/<plugin_name>/on_<Event>__<hook_name>.<ext> (user)
  9. Hook contract:
  10. Input: --url=<url> (and other --key=value args)
  11. Output: JSON to stdout, files to $PWD
  12. Exit: 0 = success, non-zero = failure
  13. Execution order:
  14. - Hooks are named with two-digit prefixes (00-99) and sorted lexicographically by filename
  15. - Foreground hooks run sequentially in that order
  16. - Background hooks (.bg suffix) run concurrently and do not block foreground progress
  17. - After all foreground hooks complete, background hooks receive SIGTERM and must finalize
  18. - Failed extractors don't block subsequent extractors
  19. Hook Naming Convention:
  20. on_{ModelName}__{run_order}_{description}[.bg].{ext}
  21. Examples:
  22. on_Snapshot__00_setup.py # runs first
  23. on_Snapshot__10_chrome_tab.bg.js # background (doesn't block)
  24. on_Snapshot__50_screenshot.js # foreground (blocks)
  25. on_Snapshot__63_media.bg.py # background (long-running)
  26. Dependency handling:
  27. Extractor plugins that depend on other plugins' output should check at runtime:
  28. ```python
  29. # Example: screenshot plugin depends on chrome plugin
  30. chrome_dir = Path(os.environ.get('SNAPSHOT_DIR', '.')) / 'chrome'
  31. if not (chrome_dir / 'cdp_url.txt').exists():
  32. print('{"status": "skipped", "output": "chrome session not available"}')
  33. sys.exit(1) # Exit non-zero so it gets retried later
  34. ```
  35. On retry (Snapshot.retry_failed_archiveresults()):
  36. - Only FAILED/SKIPPED plugins reset to queued (SUCCEEDED stays)
  37. - Run in order again
  38. - If dependencies now succeed, dependents can run
  39. API (all hook logic lives here):
  40. discover_hooks(event) -> List[Path] Find hook scripts
  41. run_hook(script, ...) -> Process Execute a hook script (returns the Process record)
  42. run_hooks(event, ...) -> List[HookResult] Run all hooks for an event
  43. extract_step(hook_name) -> int Deprecated: get two-digit order prefix if present
  44. is_background_hook(name) -> bool Check if hook is background (.bg suffix)
  45. """
  46. __package__ = 'archivebox'
  47. import os
  48. import re
  49. import json
  50. import signal
  51. import time
  52. import subprocess
  53. from functools import lru_cache
  54. from pathlib import Path
  55. from typing import List, Dict, Any, Optional, TypedDict
  56. from django.conf import settings
  57. from django.utils import timezone
  58. from django.utils.safestring import mark_safe
  59. # Plugin directories
  60. BUILTIN_PLUGINS_DIR = Path(__file__).parent / 'plugins'
  61. USER_PLUGINS_DIR = Path(getattr(settings, 'DATA_DIR', Path.cwd())) / 'plugins'
  62. # =============================================================================
  63. # Hook Step Extraction
  64. # =============================================================================
  65. def extract_step(hook_name: str) -> int:
  66. """
  67. Deprecated: return the two-digit order prefix as an integer (00-99) if present.
  68. Hook execution is based on lexicographic ordering of filenames; callers should
  69. not rely on parsed numeric steps for ordering decisions.
  70. """
  71. match = re.search(r'__(\d{2})_', hook_name)
  72. if match:
  73. return int(match.group(1))
  74. import sys
  75. print(f"Warning: Hook '{hook_name}' has no order prefix (expected __XX_), defaulting to 99", file=sys.stderr)
  76. return 99
  77. def is_background_hook(hook_name: str) -> bool:
  78. """
  79. Check if a hook is a background hook (doesn't block foreground progression).
  80. Background hooks have '.bg.' in their filename before the extension.
  81. Args:
  82. hook_name: Hook filename (e.g., 'on_Snapshot__10_chrome_tab.bg.js')
  83. Returns:
  84. True if background hook, False if foreground.
  85. Examples:
  86. is_background_hook('on_Snapshot__10_chrome_tab.bg.js') -> True
  87. is_background_hook('on_Snapshot__50_wget.py') -> False
  88. is_background_hook('on_Snapshot__63_media.bg.py') -> True
  89. """
  90. return '.bg.' in hook_name or '__background' in hook_name
  91. class HookResult(TypedDict, total=False):
  92. """Raw result from run_hook()."""
  93. returncode: int
  94. stdout: str
  95. stderr: str
  96. output_json: Optional[Dict[str, Any]]
  97. output_files: List[str]
  98. duration_ms: int
  99. hook: str
  100. plugin: str # Plugin name (directory name, e.g., 'wget', 'screenshot')
  101. hook_name: str # Full hook filename (e.g., 'on_Snapshot__50_wget.py')
  102. # New fields for JSONL parsing
  103. records: List[Dict[str, Any]] # Parsed JSONL records with 'type' field
  104. def discover_hooks(
  105. event_name: str,
  106. filter_disabled: bool = True,
  107. config: Optional[Dict[str, Any]] = None
  108. ) -> List[Path]:
  109. """
  110. Find all hook scripts matching on_{event_name}__*.{sh,py,js} pattern.
  111. Searches both built-in and user plugin directories.
  112. Filters out hooks from disabled plugins by default (respects USE_/SAVE_ flags).
  113. Returns scripts sorted alphabetically by filename for deterministic execution order.
  114. Hook naming convention uses numeric prefixes to control order:
  115. on_Snapshot__10_title.py # runs first
  116. on_Snapshot__15_singlefile.py # runs second
  117. on_Snapshot__26_readability.py # runs later (depends on singlefile)
  118. Args:
  119. event_name: Event name (e.g., 'Snapshot', 'Binary', 'Crawl')
  120. filter_disabled: If True, skip hooks from disabled plugins (default: True)
  121. config: Optional config dict from get_config() (merges file, env, machine, crawl, snapshot)
  122. If None, will call get_config() with global scope
  123. Returns:
  124. Sorted list of hook script paths from enabled plugins only.
  125. Examples:
  126. # With proper config context (recommended):
  127. from archivebox.config.configset import get_config
  128. config = get_config(crawl=my_crawl, snapshot=my_snapshot)
  129. discover_hooks('Snapshot', config=config)
  130. # Returns: [Path('.../on_Snapshot__10_title.py'), ...] (wget excluded if SAVE_WGET=False)
  131. # Without config (uses global defaults):
  132. discover_hooks('Snapshot')
  133. # Returns: [Path('.../on_Snapshot__10_title.py'), ...]
  134. # Show all plugins regardless of enabled status:
  135. discover_hooks('Snapshot', filter_disabled=False)
  136. # Returns: [Path('.../on_Snapshot__10_title.py'), ..., Path('.../on_Snapshot__50_wget.py')]
  137. """
  138. hooks = []
  139. for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
  140. if not base_dir.exists():
  141. continue
  142. # Search for hook scripts in all subdirectories
  143. for ext in ('sh', 'py', 'js'):
  144. pattern = f'*/on_{event_name}__*.{ext}'
  145. hooks.extend(base_dir.glob(pattern))
  146. # Also check for hooks directly in the plugins directory
  147. pattern_direct = f'on_{event_name}__*.{ext}'
  148. hooks.extend(base_dir.glob(pattern_direct))
  149. # Filter by enabled plugins
  150. if filter_disabled:
  151. # Get merged config if not provided (lazy import to avoid circular dependency)
  152. if config is None:
  153. from archivebox.config.configset import get_config
  154. config = get_config()
  155. enabled_hooks = []
  156. for hook in hooks:
  157. # Get plugin name from parent directory
  158. # e.g., archivebox/plugins/wget/on_Snapshot__50_wget.py -> 'wget'
  159. plugin_name = hook.parent.name
  160. # Check if this is a plugin directory (not the root plugins dir)
  161. if plugin_name in ('plugins', '.'):
  162. # Hook is in root plugins directory, not a plugin subdir
  163. # Include it by default (no filtering for non-plugin hooks)
  164. enabled_hooks.append(hook)
  165. continue
  166. # Check if plugin is enabled
  167. plugin_config = get_plugin_special_config(plugin_name, config)
  168. if plugin_config['enabled']:
  169. enabled_hooks.append(hook)
  170. hooks = enabled_hooks
  171. # Sort by filename (not full path) to ensure numeric prefix ordering works
  172. # e.g., on_Snapshot__10_title.py sorts before on_Snapshot__26_readability.py
  173. return sorted(set(hooks), key=lambda p: p.name)
  174. def run_hook(
  175. script: Path,
  176. output_dir: Path,
  177. config: Dict[str, Any],
  178. timeout: Optional[int] = None,
  179. parent: Optional['Process'] = None,
  180. **kwargs: Any
  181. ) -> 'Process':
  182. """
  183. Execute a hook script with the given arguments using Process model.
  184. This is the low-level hook executor that creates a Process record and
  185. uses Process.launch() for subprocess management.
  186. Config is passed to hooks via environment variables. Caller MUST use
  187. get_config() to merge all sources (file, env, machine, crawl, snapshot).
  188. Args:
  189. script: Path to the hook script (.sh, .py, or .js)
  190. output_dir: Working directory for the script (where output files go)
  191. config: Merged config dict from get_config(crawl=..., snapshot=...) - REQUIRED
  192. timeout: Maximum execution time in seconds
  193. If None, auto-detects from PLUGINNAME_TIMEOUT config (fallback to TIMEOUT, default 300)
  194. parent: Optional parent Process (for tracking worker->hook hierarchy)
  195. **kwargs: Arguments passed to the script as --key=value
  196. Returns:
  197. Process model instance (use process.exit_code, process.stdout, process.get_records())
  198. Example:
  199. from archivebox.config.configset import get_config
  200. config = get_config(crawl=my_crawl, snapshot=my_snapshot)
  201. process = run_hook(hook_path, output_dir, config=config, url=url, snapshot_id=id)
  202. if process.status == 'exited':
  203. records = process.get_records() # Get parsed JSONL output
  204. """
  205. from archivebox.machine.models import Process, Machine
  206. from archivebox.config.constants import CONSTANTS
  207. import time
  208. import sys
  209. start_time = time.time()
  210. # Auto-detect timeout from plugin config if not explicitly provided
  211. if timeout is None:
  212. plugin_name = script.parent.name
  213. plugin_config = get_plugin_special_config(plugin_name, config)
  214. timeout = plugin_config['timeout']
  215. if timeout:
  216. timeout = min(int(timeout), int(CONSTANTS.MAX_HOOK_RUNTIME_SECONDS))
  217. # Get current machine
  218. machine = Machine.current()
  219. # Auto-detect parent process if not explicitly provided
  220. # This enables automatic hierarchy tracking: Worker -> Hook
  221. if parent is None:
  222. try:
  223. parent = Process.current()
  224. except Exception:
  225. # If Process.current() fails (e.g., not in a worker context), leave parent as None
  226. pass
  227. if not script.exists():
  228. # Create a failed Process record for hooks that don't exist
  229. process = Process.objects.create(
  230. machine=machine,
  231. parent=parent,
  232. process_type=Process.TypeChoices.HOOK,
  233. pwd=str(output_dir),
  234. cmd=['echo', f'Hook script not found: {script}'],
  235. timeout=timeout,
  236. status=Process.StatusChoices.EXITED,
  237. exit_code=1,
  238. stderr=f'Hook script not found: {script}',
  239. )
  240. return process
  241. # Determine the interpreter based on file extension
  242. ext = script.suffix.lower()
  243. if ext == '.sh':
  244. cmd = ['bash', str(script)]
  245. elif ext == '.py':
  246. cmd = [sys.executable, str(script)]
  247. elif ext == '.js':
  248. cmd = ['node', str(script)]
  249. else:
  250. # Try to execute directly (assumes shebang)
  251. cmd = [str(script)]
  252. # Build CLI arguments from kwargs
  253. for key, value in kwargs.items():
  254. # Skip keys that start with underscore (internal parameters)
  255. if key.startswith('_'):
  256. continue
  257. arg_key = f'--{key.replace("_", "-")}'
  258. if isinstance(value, bool):
  259. if value:
  260. cmd.append(arg_key)
  261. elif value is not None and value != '':
  262. # JSON-encode complex values, use str for simple ones
  263. # Skip empty strings to avoid --key= which breaks argument parsers
  264. if isinstance(value, (dict, list)):
  265. cmd.append(f'{arg_key}={json.dumps(value)}')
  266. else:
  267. # Ensure value is converted to string and strip whitespace
  268. str_value = str(value).strip()
  269. if str_value: # Only add if non-empty after stripping
  270. cmd.append(f'{arg_key}={str_value}')
  271. # Set up environment with base paths
  272. env = os.environ.copy()
  273. env['DATA_DIR'] = str(getattr(settings, 'DATA_DIR', Path.cwd()))
  274. env['ARCHIVE_DIR'] = str(getattr(settings, 'ARCHIVE_DIR', Path.cwd() / 'archive'))
  275. env.setdefault('MACHINE_ID', getattr(settings, 'MACHINE_ID', '') or os.environ.get('MACHINE_ID', ''))
  276. # Get LIB_DIR and LIB_BIN_DIR from config
  277. lib_dir = config.get('LIB_DIR', getattr(settings, 'LIB_DIR', None))
  278. lib_bin_dir = config.get('LIB_BIN_DIR', getattr(settings, 'LIB_BIN_DIR', None))
  279. if lib_dir:
  280. env['LIB_DIR'] = str(lib_dir)
  281. if not lib_bin_dir and lib_dir:
  282. # Derive LIB_BIN_DIR from LIB_DIR if not set
  283. lib_bin_dir = Path(lib_dir) / 'bin'
  284. # Build PATH with proper precedence:
  285. # 1. LIB_BIN_DIR (highest priority - local symlinked binaries)
  286. # 2. Machine.config.PATH (pip/npm bin dirs from providers)
  287. # 3. os.environ['PATH'] (system PATH)
  288. if lib_bin_dir:
  289. lib_bin_dir = str(lib_bin_dir)
  290. env['LIB_BIN_DIR'] = lib_bin_dir
  291. # Start with base PATH
  292. current_path = env.get('PATH', '')
  293. # Prepend Machine.config.PATH if it exists (treat as extra entries, not replacement)
  294. try:
  295. from archivebox.machine.models import Machine
  296. machine = Machine.current()
  297. if machine and machine.config:
  298. machine_path = machine.config.get('PATH')
  299. if machine_path:
  300. # Prepend machine_path to current PATH
  301. current_path = f'{machine_path}:{current_path}' if current_path else machine_path
  302. except Exception:
  303. pass
  304. # Finally prepend LIB_BIN_DIR to the front (highest priority)
  305. if lib_bin_dir:
  306. if not current_path.startswith(f'{lib_bin_dir}:'):
  307. env['PATH'] = f'{lib_bin_dir}:{current_path}' if current_path else lib_bin_dir
  308. else:
  309. env['PATH'] = current_path
  310. else:
  311. env['PATH'] = current_path
  312. # Set NODE_PATH for Node.js module resolution
  313. # Priority: config dict > Machine.config > derive from LIB_DIR
  314. node_path = config.get('NODE_PATH')
  315. if not node_path and lib_dir:
  316. # Derive from LIB_DIR/npm/node_modules (create if needed)
  317. node_modules_dir = Path(lib_dir) / 'npm' / 'node_modules'
  318. node_modules_dir.mkdir(parents=True, exist_ok=True)
  319. node_path = str(node_modules_dir)
  320. if not node_path:
  321. try:
  322. # Fallback to Machine.config
  323. node_path = machine.config.get('NODE_MODULES_DIR')
  324. except Exception:
  325. pass
  326. if node_path:
  327. env['NODE_PATH'] = node_path
  328. env['NODE_MODULES_DIR'] = node_path # For backwards compatibility
  329. # Export all config values to environment (already merged by get_config())
  330. # Skip keys we've already handled specially above (PATH, LIB_DIR, LIB_BIN_DIR, NODE_PATH, etc.)
  331. SKIP_KEYS = {'PATH', 'LIB_DIR', 'LIB_BIN_DIR', 'NODE_PATH', 'NODE_MODULES_DIR', 'DATA_DIR', 'ARCHIVE_DIR', 'MACHINE_ID'}
  332. for key, value in config.items():
  333. if key in SKIP_KEYS:
  334. continue # Already handled specially above, don't overwrite
  335. if value is None:
  336. continue
  337. elif isinstance(value, bool):
  338. env[key] = 'true' if value else 'false'
  339. elif isinstance(value, (list, dict)):
  340. env[key] = json.dumps(value)
  341. else:
  342. env[key] = str(value)
  343. # Create output directory if needed
  344. output_dir.mkdir(parents=True, exist_ok=True)
  345. # Detect if this is a background hook (long-running daemon)
  346. # New convention: .bg. suffix (e.g., on_Snapshot__21_consolelog.bg.js)
  347. # Old convention: __background in stem (for backwards compatibility)
  348. is_background = '.bg.' in script.name or '__background' in script.stem
  349. try:
  350. # Create Process record
  351. process = Process.objects.create(
  352. machine=machine,
  353. parent=parent,
  354. process_type=Process.TypeChoices.HOOK,
  355. pwd=str(output_dir),
  356. cmd=cmd,
  357. timeout=timeout,
  358. )
  359. # Copy the env dict we already built (includes os.environ + all customizations)
  360. process.env = env.copy()
  361. # Save env before launching
  362. process.save()
  363. # Launch subprocess using Process.launch()
  364. process.launch(background=is_background)
  365. # Return Process object (caller can use process.exit_code, process.stdout, process.get_records())
  366. return process
  367. except Exception as e:
  368. # Create a failed Process record for exceptions
  369. process = Process.objects.create(
  370. machine=machine,
  371. process_type=Process.TypeChoices.HOOK,
  372. pwd=str(output_dir),
  373. cmd=cmd,
  374. timeout=timeout,
  375. status=Process.StatusChoices.EXITED,
  376. exit_code=1,
  377. stderr=f'Failed to run hook: {type(e).__name__}: {e}',
  378. )
  379. return process
  380. def extract_records_from_process(process: 'Process') -> List[Dict[str, Any]]:
  381. """
  382. Extract JSONL records from a Process's stdout.
  383. Adds plugin metadata to each record.
  384. Args:
  385. process: Process model instance with stdout captured
  386. Returns:
  387. List of parsed JSONL records with plugin metadata
  388. """
  389. records = process.get_records()
  390. if not records:
  391. return []
  392. # Extract plugin metadata from process.pwd and process.cmd
  393. plugin_name = Path(process.pwd).name if process.pwd else 'unknown'
  394. hook_name = Path(process.cmd[1]).name if len(process.cmd) > 1 else 'unknown'
  395. plugin_hook = process.cmd[1] if len(process.cmd) > 1 else ''
  396. for record in records:
  397. # Add plugin metadata to record
  398. record.setdefault('plugin', plugin_name)
  399. record.setdefault('hook_name', hook_name)
  400. record.setdefault('plugin_hook', plugin_hook)
  401. return records
  402. def collect_urls_from_plugins(snapshot_dir: Path) -> List[Dict[str, Any]]:
  403. """
  404. Collect all urls.jsonl entries from parser plugin output subdirectories.
  405. Each parser plugin outputs urls.jsonl to its own subdir:
  406. snapshot_dir/parse_rss_urls/urls.jsonl
  407. snapshot_dir/parse_html_urls/urls.jsonl
  408. etc.
  409. This is not special handling - urls.jsonl is just a normal output file.
  410. This utility collects them all for the crawl system.
  411. """
  412. urls = []
  413. # Look in each immediate subdirectory for urls.jsonl
  414. if not snapshot_dir.exists():
  415. return urls
  416. for subdir in snapshot_dir.iterdir():
  417. if not subdir.is_dir():
  418. continue
  419. urls_file = subdir / 'urls.jsonl'
  420. if not urls_file.exists():
  421. continue
  422. try:
  423. from archivebox.machine.models import Process
  424. text = urls_file.read_text()
  425. for entry in Process.parse_records_from_text(text):
  426. if entry.get('url'):
  427. # Track which parser plugin found this URL
  428. entry['plugin'] = subdir.name
  429. urls.append(entry)
  430. except Exception:
  431. pass
  432. return urls
  433. def run_hooks(
  434. event_name: str,
  435. output_dir: Path,
  436. config: Dict[str, Any],
  437. timeout: Optional[int] = None,
  438. stop_on_failure: bool = False,
  439. **kwargs: Any
  440. ) -> List[HookResult]:
  441. """
  442. Run all hooks for a given event.
  443. Args:
  444. event_name: The event name to trigger (e.g., 'Snapshot', 'Crawl', 'Binary')
  445. output_dir: Working directory for hook scripts
  446. config: Merged config dict from get_config(crawl=..., snapshot=...) - REQUIRED
  447. timeout: Maximum execution time per hook (None = auto-detect from plugin config)
  448. stop_on_failure: If True, stop executing hooks after first failure
  449. **kwargs: Arguments passed to each hook script
  450. Returns:
  451. List of results from each hook execution
  452. Example:
  453. from archivebox.config.configset import get_config
  454. config = get_config(crawl=my_crawl, snapshot=my_snapshot)
  455. results = run_hooks('Snapshot', output_dir, config=config, url=url, snapshot_id=id)
  456. """
  457. hooks = discover_hooks(event_name, config=config)
  458. results = []
  459. for hook in hooks:
  460. result = run_hook(hook, output_dir, config=config, timeout=timeout, **kwargs)
  461. # Background hooks return None - skip adding to results
  462. if result is None:
  463. continue
  464. result['hook'] = str(hook)
  465. results.append(result)
  466. if stop_on_failure and result['returncode'] != 0:
  467. break
  468. return results
  469. @lru_cache(maxsize=1)
  470. def get_plugins() -> List[str]:
  471. """
  472. Get list of available plugins by discovering Snapshot hooks.
  473. Returns plugin names (directory names) that contain on_Snapshot hooks.
  474. The plugin name is the plugin directory name, not the hook script name.
  475. Example:
  476. archivebox/plugins/chrome/on_Snapshot__10_chrome_tab.bg.js
  477. -> plugin = 'chrome'
  478. Sorted alphabetically (plugins control their hook order via numeric prefixes in hook names).
  479. """
  480. plugins = []
  481. for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
  482. if not base_dir.exists():
  483. continue
  484. for ext in ('sh', 'py', 'js'):
  485. for hook_path in base_dir.glob(f'*/on_Snapshot__*.{ext}'):
  486. # Use plugin directory name as plugin name
  487. plugin_name = hook_path.parent.name
  488. plugins.append(plugin_name)
  489. return sorted(set(plugins))
  490. def get_parser_plugins() -> List[str]:
  491. """
  492. Get list of parser plugins by discovering parse_*_urls hooks.
  493. Parser plugins discover URLs from source files and output urls.jsonl.
  494. Returns plugin names like: ['50_parse_html_urls', '51_parse_rss_urls', ...]
  495. """
  496. return [e for e in get_plugins() if 'parse_' in e and '_urls' in e]
  497. def get_plugin_name(plugin: str) -> str:
  498. """
  499. Get the base plugin name without numeric prefix.
  500. Examples:
  501. '10_title' -> 'title'
  502. '26_readability' -> 'readability'
  503. '50_parse_html_urls' -> 'parse_html_urls'
  504. """
  505. # Split on first underscore after any leading digits
  506. parts = plugin.split('_', 1)
  507. if len(parts) == 2 and parts[0].isdigit():
  508. return parts[1]
  509. return plugin
  510. def is_parser_plugin(plugin: str) -> bool:
  511. """Check if a plugin is a parser plugin (discovers URLs)."""
  512. name = get_plugin_name(plugin)
  513. return name.startswith('parse_') and name.endswith('_urls')
  514. def get_enabled_plugins(config: Optional[Dict[str, Any]] = None) -> List[str]:
  515. """
  516. Get the list of enabled plugins based on config and available hooks.
  517. Filters plugins by USE_/SAVE_ flags. Only returns plugins that are enabled.
  518. Args:
  519. config: Merged config dict from get_config() - if None, uses global config
  520. Returns:
  521. Plugin names sorted alphabetically (numeric prefix controls order).
  522. Example:
  523. from archivebox.config.configset import get_config
  524. config = get_config(crawl=my_crawl, snapshot=my_snapshot)
  525. enabled = get_enabled_plugins(config) # ['wget', 'media', 'chrome', ...]
  526. """
  527. # Get merged config if not provided
  528. if config is None:
  529. from archivebox.config.configset import get_config
  530. config = get_config()
  531. # Support explicit ENABLED_PLUGINS override (legacy)
  532. if 'ENABLED_PLUGINS' in config:
  533. return config['ENABLED_PLUGINS']
  534. if 'ENABLED_EXTRACTORS' in config:
  535. return config['ENABLED_EXTRACTORS']
  536. # Filter all plugins by enabled status
  537. all_plugins = get_plugins()
  538. enabled = []
  539. for plugin in all_plugins:
  540. plugin_config = get_plugin_special_config(plugin, config)
  541. if plugin_config['enabled']:
  542. enabled.append(plugin)
  543. return enabled
  544. def discover_plugins_that_provide_interface(
  545. module_name: str,
  546. required_attrs: List[str],
  547. plugin_prefix: Optional[str] = None,
  548. ) -> Dict[str, Any]:
  549. """
  550. Discover plugins that provide a specific Python module with required interface.
  551. This enables dynamic plugin discovery for features like search backends,
  552. storage backends, etc. without hardcoding imports.
  553. Args:
  554. module_name: Name of the module to look for (e.g., 'search')
  555. required_attrs: List of attributes the module must have (e.g., ['search', 'flush'])
  556. plugin_prefix: Optional prefix to filter plugins (e.g., 'search_backend_')
  557. Returns:
  558. Dict mapping backend names to imported modules.
  559. Backend name is derived from plugin directory name minus the prefix.
  560. e.g., search_backend_sqlite -> 'sqlite'
  561. Example:
  562. backends = discover_plugins_that_provide_interface(
  563. module_name='search',
  564. required_attrs=['search', 'flush'],
  565. plugin_prefix='search_backend_',
  566. )
  567. # Returns: {'sqlite': <module>, 'sonic': <module>, 'ripgrep': <module>}
  568. """
  569. import importlib.util
  570. backends = {}
  571. for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
  572. if not base_dir.exists():
  573. continue
  574. for plugin_dir in base_dir.iterdir():
  575. if not plugin_dir.is_dir():
  576. continue
  577. plugin_name = plugin_dir.name
  578. # Filter by prefix if specified
  579. if plugin_prefix and not plugin_name.startswith(plugin_prefix):
  580. continue
  581. # Look for the module file
  582. module_path = plugin_dir / f'{module_name}.py'
  583. if not module_path.exists():
  584. continue
  585. try:
  586. # Import the module dynamically
  587. spec = importlib.util.spec_from_file_location(
  588. f'archivebox.plugins.{plugin_name}.{module_name}',
  589. module_path
  590. )
  591. if spec is None or spec.loader is None:
  592. continue
  593. module = importlib.util.module_from_spec(spec)
  594. spec.loader.exec_module(module)
  595. # Check for required attributes
  596. if not all(hasattr(module, attr) for attr in required_attrs):
  597. continue
  598. # Derive backend name from plugin directory name
  599. if plugin_prefix:
  600. backend_name = plugin_name[len(plugin_prefix):]
  601. else:
  602. backend_name = plugin_name
  603. backends[backend_name] = module
  604. except Exception:
  605. # Skip plugins that fail to import
  606. continue
  607. return backends
  608. def get_search_backends() -> Dict[str, Any]:
  609. """
  610. Discover all available search backend plugins.
  611. Search backends must provide a search.py module with:
  612. - search(query: str) -> List[str] (returns snapshot IDs)
  613. - flush(snapshot_ids: Iterable[str]) -> None
  614. Returns:
  615. Dict mapping backend names to their modules.
  616. e.g., {'sqlite': <module>, 'sonic': <module>, 'ripgrep': <module>}
  617. """
  618. return discover_plugins_that_provide_interface(
  619. module_name='search',
  620. required_attrs=['search', 'flush'],
  621. plugin_prefix='search_backend_',
  622. )
  623. def discover_plugin_configs() -> Dict[str, Dict[str, Any]]:
  624. """
  625. Discover all plugin config.json schemas.
  626. Each plugin can define a config.json file with JSONSchema defining
  627. its configuration options. This function discovers and loads all such schemas.
  628. The config.json files use JSONSchema draft-07 with custom extensions:
  629. - x-fallback: Global config key to use as fallback
  630. - x-aliases: List of old/alternative config key names
  631. Returns:
  632. Dict mapping plugin names to their parsed JSONSchema configs.
  633. e.g., {'wget': {...schema...}, 'chrome': {...schema...}}
  634. Example config.json:
  635. {
  636. "$schema": "http://json-schema.org/draft-07/schema#",
  637. "type": "object",
  638. "properties": {
  639. "SAVE_WGET": {"type": "boolean", "default": true},
  640. "WGET_TIMEOUT": {"type": "integer", "default": 60, "x-fallback": "TIMEOUT"}
  641. }
  642. }
  643. """
  644. configs = {}
  645. for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
  646. if not base_dir.exists():
  647. continue
  648. for plugin_dir in base_dir.iterdir():
  649. if not plugin_dir.is_dir():
  650. continue
  651. config_path = plugin_dir / 'config.json'
  652. if not config_path.exists():
  653. continue
  654. try:
  655. with open(config_path, 'r') as f:
  656. schema = json.load(f)
  657. # Basic validation: must be an object with properties
  658. if not isinstance(schema, dict):
  659. continue
  660. if schema.get('type') != 'object':
  661. continue
  662. if 'properties' not in schema:
  663. continue
  664. configs[plugin_dir.name] = schema
  665. except (json.JSONDecodeError, OSError) as e:
  666. # Log warning but continue - malformed config shouldn't break discovery
  667. import sys
  668. print(f"Warning: Failed to load config.json from {plugin_dir.name}: {e}", file=sys.stderr)
  669. continue
  670. return configs
  671. def get_config_defaults_from_plugins() -> Dict[str, Any]:
  672. """
  673. Get default values for all plugin config options.
  674. Returns:
  675. Dict mapping config keys to their default values.
  676. e.g., {'SAVE_WGET': True, 'WGET_TIMEOUT': 60, ...}
  677. """
  678. plugin_configs = discover_plugin_configs()
  679. defaults = {}
  680. for plugin_name, schema in plugin_configs.items():
  681. properties = schema.get('properties', {})
  682. for key, prop_schema in properties.items():
  683. if 'default' in prop_schema:
  684. defaults[key] = prop_schema['default']
  685. return defaults
  686. def get_plugin_special_config(plugin_name: str, config: Dict[str, Any]) -> Dict[str, Any]:
  687. """
  688. Extract special config keys for a plugin following naming conventions.
  689. ArchiveBox recognizes 3 special config key patterns per plugin:
  690. - {PLUGIN}_ENABLED: Enable/disable toggle (default True)
  691. - {PLUGIN}_TIMEOUT: Plugin-specific timeout (fallback to TIMEOUT, default 300)
  692. - {PLUGIN}_BINARY: Primary binary path (default to plugin_name)
  693. These allow ArchiveBox to:
  694. - Skip disabled plugins (optimization)
  695. - Enforce plugin-specific timeouts automatically
  696. - Discover plugin binaries for validation
  697. Args:
  698. plugin_name: Plugin name (e.g., 'wget', 'media', 'chrome')
  699. config: Merged config dict from get_config() (properly merges file, env, machine, crawl, snapshot)
  700. Returns:
  701. Dict with standardized keys:
  702. {
  703. 'enabled': True, # bool
  704. 'timeout': 60, # int, seconds
  705. 'binary': 'wget', # str, path or name
  706. }
  707. Examples:
  708. >>> from archivebox.config.configset import get_config
  709. >>> config = get_config(crawl=my_crawl, snapshot=my_snapshot)
  710. >>> get_plugin_special_config('wget', config)
  711. {'enabled': True, 'timeout': 120, 'binary': '/usr/bin/wget'}
  712. """
  713. plugin_upper = plugin_name.upper()
  714. # 1. Enabled: Check PLUGINS whitelist first, then PLUGINNAME_ENABLED (default True)
  715. # Old names (USE_*, SAVE_*) are aliased in config.json via x-aliases
  716. # Check if PLUGINS whitelist is specified (e.g., --plugins=wget,favicon)
  717. plugins_whitelist = config.get('PLUGINS', '')
  718. if plugins_whitelist:
  719. # PLUGINS whitelist is specified - only enable plugins in the list
  720. plugin_names = [p.strip().lower() for p in plugins_whitelist.split(',') if p.strip()]
  721. if plugin_name.lower() not in plugin_names:
  722. # Plugin not in whitelist - explicitly disabled
  723. enabled = False
  724. else:
  725. # Plugin is in whitelist - check if explicitly disabled by PLUGINNAME_ENABLED
  726. enabled_key = f'{plugin_upper}_ENABLED'
  727. enabled = config.get(enabled_key)
  728. if enabled is None:
  729. enabled = True # Default to enabled if in whitelist
  730. elif isinstance(enabled, str):
  731. enabled = enabled.lower() not in ('false', '0', 'no', '')
  732. else:
  733. # No PLUGINS whitelist - use PLUGINNAME_ENABLED (default True)
  734. enabled_key = f'{plugin_upper}_ENABLED'
  735. enabled = config.get(enabled_key)
  736. if enabled is None:
  737. enabled = True
  738. elif isinstance(enabled, str):
  739. # Handle string values from config file ("true"/"false")
  740. enabled = enabled.lower() not in ('false', '0', 'no', '')
  741. # 2. Timeout: PLUGINNAME_TIMEOUT (fallback to TIMEOUT, default 300)
  742. timeout_key = f'{plugin_upper}_TIMEOUT'
  743. timeout = config.get(timeout_key) or config.get('TIMEOUT', 300)
  744. # 3. Binary: PLUGINNAME_BINARY (default to plugin_name)
  745. binary_key = f'{plugin_upper}_BINARY'
  746. binary = config.get(binary_key, plugin_name)
  747. return {
  748. 'enabled': bool(enabled),
  749. 'timeout': int(timeout),
  750. 'binary': str(binary),
  751. }
  752. # =============================================================================
  753. # Plugin Template Discovery
  754. # =============================================================================
  755. #
  756. # Plugins can provide custom templates for rendering their output in the UI.
  757. # Templates are discovered by filename convention inside each plugin's templates/ dir:
  758. #
  759. # archivebox/plugins/<plugin_name>/
  760. # templates/
  761. # icon.html # Icon for admin table view (small inline HTML)
  762. # card.html # Preview card for snapshot header
  763. # full.html # Fullscreen view template
  764. #
  765. # Template context variables available:
  766. # {{ result }} - ArchiveResult object
  767. # {{ snapshot }} - Parent Snapshot object
  768. # {{ output_path }} - Path to output file/dir relative to snapshot dir
  769. # {{ plugin }} - Plugin name (e.g., 'screenshot', 'singlefile')
  770. #
# Default templates used when plugin doesn't provide one.
# Keyed by template type ('icon', 'card', 'full'); get_plugin_template() returns
# these strings when no plugin-provided template file is found and fallback=True.
#   icon - inline <span> that renders {{ icon }} with {{ plugin }} as its tooltip
#   card - lazily-loaded <iframe> pointing at {{ output_path }}
#   full - full-viewport-height <iframe> pointing at {{ output_path }}
# NOTE(review): both iframes grant allow-same-origin together with allow-scripts;
# for same-origin content that combination is generally considered to defeat the
# sandbox - confirm this is intentional for archived output.
DEFAULT_TEMPLATES: Dict[str, str] = {
    'icon': '''
<span title="{{ plugin }}" style="display:inline-flex; width:20px; height:20px; align-items:center; justify-content:center;">
{{ icon }}
</span>
''',
    'card': '''
<iframe src="{{ output_path }}"
class="card-img-top"
style="width: 100%; height: 100%; border: none;"
sandbox="allow-same-origin allow-scripts allow-forms"
loading="lazy">
</iframe>
''',
    'full': '''
<iframe src="{{ output_path }}"
class="full-page-iframe"
style="width: 100%; height: 100vh; border: none;"
sandbox="allow-same-origin allow-scripts allow-forms">
</iframe>
''',
}
  794. def get_plugin_template(plugin: str, template_name: str, fallback: bool = True) -> Optional[str]:
  795. """
  796. Get a plugin template by plugin name and template type.
  797. Args:
  798. plugin: Plugin name (e.g., 'screenshot', '15_singlefile')
  799. template_name: One of 'icon', 'card', 'full'
  800. fallback: If True, return default template if plugin template not found
  801. Returns:
  802. Template content as string, or None if not found and fallback=False.
  803. """
  804. base_name = get_plugin_name(plugin)
  805. if base_name in ('yt-dlp', 'youtube-dl'):
  806. base_name = 'ytdlp'
  807. for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
  808. if not base_dir.exists():
  809. continue
  810. # Look for plugin directory matching plugin name
  811. for plugin_dir in base_dir.iterdir():
  812. if not plugin_dir.is_dir():
  813. continue
  814. # Match by directory name (exact or partial)
  815. if plugin_dir.name == base_name or plugin_dir.name.endswith(f'_{base_name}'):
  816. template_path = plugin_dir / 'templates' / f'{template_name}.html'
  817. if template_path.exists():
  818. return template_path.read_text()
  819. # Fall back to default template if requested
  820. if fallback:
  821. return DEFAULT_TEMPLATES.get(template_name, '')
  822. return None
  823. @lru_cache(maxsize=None)
  824. def get_plugin_icon(plugin: str) -> str:
  825. """
  826. Get the icon for a plugin from its icon.html template.
  827. Args:
  828. plugin: Plugin name (e.g., 'screenshot', '15_singlefile')
  829. Returns:
  830. Icon HTML/emoji string.
  831. """
  832. # Try plugin-provided icon template
  833. icon_template = get_plugin_template(plugin, 'icon', fallback=False)
  834. if icon_template:
  835. return mark_safe(icon_template.strip())
  836. # Fall back to generic folder icon
  837. return mark_safe('📁')
  838. def get_all_plugin_icons() -> Dict[str, str]:
  839. """
  840. Get icons for all discovered plugins.
  841. Returns:
  842. Dict mapping plugin base names to their icons.
  843. """
  844. icons = {}
  845. for plugin in get_plugins():
  846. base_name = get_plugin_name(plugin)
  847. icons[base_name] = get_plugin_icon(plugin)
  848. return icons
  849. def discover_plugin_templates() -> Dict[str, Dict[str, str]]:
  850. """
  851. Discover all plugin templates organized by plugin.
  852. Returns:
  853. Dict mapping plugin names to dicts of template_name -> template_path.
  854. e.g., {'screenshot': {'icon': '/path/to/icon.html', 'card': '/path/to/card.html'}}
  855. """
  856. templates: Dict[str, Dict[str, str]] = {}
  857. for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
  858. if not base_dir.exists():
  859. continue
  860. for plugin_dir in base_dir.iterdir():
  861. if not plugin_dir.is_dir():
  862. continue
  863. templates_dir = plugin_dir / 'templates'
  864. if not templates_dir.exists():
  865. continue
  866. plugin_templates = {}
  867. for template_file in templates_dir.glob('*.html'):
  868. template_name = template_file.stem # icon, card, full
  869. plugin_templates[template_name] = str(template_file)
  870. if plugin_templates:
  871. templates[plugin_dir.name] = plugin_templates
  872. return templates
  873. # =============================================================================
  874. # Hook Result Processing Helpers
  875. # =============================================================================
  876. def find_binary_for_cmd(cmd: List[str], machine_id: str) -> Optional[str]:
  877. """
  878. Find Binary for a command, trying abspath first then name.
  879. Only matches binaries on the current machine.
  880. Args:
  881. cmd: Command list (e.g., ['/usr/bin/wget', '-p', 'url'])
  882. machine_id: Current machine ID
  883. Returns:
  884. Binary ID as string if found, None otherwise
  885. """
  886. if not cmd:
  887. return None
  888. from archivebox.machine.models import Binary
  889. bin_path_or_name = cmd[0] if isinstance(cmd, list) else cmd
  890. # Try matching by absolute path first
  891. binary = Binary.objects.filter(
  892. abspath=bin_path_or_name,
  893. machine_id=machine_id
  894. ).first()
  895. if binary:
  896. return str(binary.id)
  897. # Fallback: match by binary name
  898. bin_name = Path(bin_path_or_name).name
  899. binary = Binary.objects.filter(
  900. name=bin_name,
  901. machine_id=machine_id
  902. ).first()
  903. return str(binary.id) if binary else None
  904. def create_model_record(record: Dict[str, Any]) -> Any:
  905. """
  906. Generic helper to create/update model instances from hook JSONL output.
  907. Args:
  908. record: Dict with 'type' field and model data
  909. Returns:
  910. Created/updated model instance, or None if type unknown
  911. """
  912. from archivebox.machine.models import Binary, Machine
  913. record_type = record.pop('type', None)
  914. if not record_type:
  915. return None
  916. # Remove plugin metadata (not model fields)
  917. record.pop('plugin', None)
  918. record.pop('plugin_hook', None)
  919. if record_type == 'Binary':
  920. # Binary requires machine FK
  921. machine = Machine.current()
  922. record.setdefault('machine', machine)
  923. # Required fields check
  924. name = record.get('name')
  925. abspath = record.get('abspath')
  926. if not name or not abspath:
  927. return None
  928. obj, created = Binary.objects.update_or_create(
  929. machine=machine,
  930. name=name,
  931. defaults={
  932. 'abspath': abspath,
  933. 'version': record.get('version', ''),
  934. 'sha256': record.get('sha256', ''),
  935. 'binprovider': record.get('binprovider', 'env'),
  936. }
  937. )
  938. return obj
  939. elif record_type == 'Machine':
  940. config_patch = record.get('config')
  941. if isinstance(config_patch, dict) and config_patch:
  942. machine = Machine.current()
  943. if not machine.config:
  944. machine.config = {}
  945. machine.config.update(config_patch)
  946. machine.save(update_fields=['config'])
  947. return machine
  948. return None
  949. # Add more types as needed (Dependency, Snapshot, etc.)
  950. else:
  951. # Unknown type - log warning but don't fail
  952. import sys
  953. print(f"Warning: Unknown record type '{record_type}' from hook output", file=sys.stderr)
  954. return None
  955. def process_hook_records(records: List[Dict[str, Any]], overrides: Dict[str, Any] = None) -> Dict[str, int]:
  956. """
  957. Process JSONL records from hook output.
  958. Dispatches to Model.from_json() for each record type.
  959. Args:
  960. records: List of JSONL record dicts from result['records']
  961. overrides: Dict with 'snapshot', 'crawl', 'dependency', 'created_by_id', etc.
  962. Returns:
  963. Dict with counts by record type
  964. """
  965. stats = {}
  966. overrides = overrides or {}
  967. for record in records:
  968. record_type = record.get('type')
  969. if not record_type:
  970. continue
  971. # Skip ArchiveResult records (they update the calling ArchiveResult, not create new ones)
  972. if record_type == 'ArchiveResult':
  973. continue
  974. try:
  975. # Dispatch to appropriate model's from_json() method
  976. if record_type == 'Snapshot':
  977. from archivebox.core.models import Snapshot
  978. # Check if discovered snapshot exceeds crawl max_depth
  979. snapshot_depth = record.get('depth', 0)
  980. crawl = overrides.get('crawl')
  981. if crawl and snapshot_depth > crawl.max_depth:
  982. # Skip - this URL was discovered but exceeds max crawl depth
  983. continue
  984. obj = Snapshot.from_json(record.copy(), overrides)
  985. if obj:
  986. stats['Snapshot'] = stats.get('Snapshot', 0) + 1
  987. elif record_type == 'Tag':
  988. from archivebox.core.models import Tag
  989. obj = Tag.from_json(record.copy(), overrides)
  990. if obj:
  991. stats['Tag'] = stats.get('Tag', 0) + 1
  992. elif record_type == 'Binary':
  993. from archivebox.machine.models import Binary
  994. obj = Binary.from_json(record.copy(), overrides)
  995. if obj:
  996. stats['Binary'] = stats.get('Binary', 0) + 1
  997. elif record_type == 'Machine':
  998. from archivebox.machine.models import Machine
  999. obj = Machine.from_json(record.copy(), overrides)
  1000. if obj:
  1001. stats['Machine'] = stats.get('Machine', 0) + 1
  1002. else:
  1003. import sys
  1004. print(f"Warning: Unknown record type '{record_type}' from hook output", file=sys.stderr)
  1005. except Exception as e:
  1006. import sys
  1007. print(f"Warning: Failed to create {record_type}: {e}", file=sys.stderr)
  1008. continue
  1009. return stats