# hooks.py (43 KB)
"""
Hook discovery and execution system for ArchiveBox plugins.

Hooks are standalone scripts that run as separate processes and communicate
with ArchiveBox via CLI arguments and stdout JSON output. This keeps the plugin
system simple and language-agnostic.

Directory structure:
    archivebox/plugins/<plugin_name>/on_<Event>__<hook_name>.<ext>  (built-in)
    data/plugins/<plugin_name>/on_<Event>__<hook_name>.<ext>        (user)

Hook contract:
    Input:  --url=<url> (and other --key=value args)
    Output: JSON to stdout, files to $PWD
    Exit:   0 = success, non-zero = failure

Execution order:
    - Hooks are numbered 00-99 with first digit determining step (0-9)
    - All hooks in a step can run in parallel
    - Steps execute sequentially (step 0 → step 1 → ... → step 9)
    - Background hooks (.bg suffix) don't block step advancement
    - Failed extractors don't block subsequent extractors

Hook Naming Convention:
    on_{ModelName}__{run_order}_{description}[.bg].{ext}

    Examples:
        on_Snapshot__00_setup.py           # Step 0, runs first
        on_Snapshot__20_chrome_tab.bg.js   # Step 2, background (doesn't block)
        on_Snapshot__50_screenshot.js      # Step 5, foreground (blocks step)
        on_Snapshot__63_media.bg.py        # Step 6, background (long-running)

Dependency handling:
    Extractor plugins that depend on other plugins' output should check at runtime:

    ```python
    # Example: screenshot plugin depends on chrome plugin
    chrome_session_dir = Path(os.environ.get('SNAPSHOT_DIR', '.')) / 'chrome_session'
    if not (chrome_session_dir / 'session.json').exists():
        print('{"status": "skipped", "output": "chrome_session not available"}')
        sys.exit(1)  # Exit non-zero so it gets retried later
    ```

    On retry (Snapshot.retry_failed_archiveresults()):
    - Only FAILED/SKIPPED plugins reset to queued (SUCCEEDED stays)
    - Run in order again
    - If dependencies now succeed, dependents can run

API (all hook logic lives here):
    discover_hooks(event)     -> List[Path]        Find hook scripts
    run_hook(script, ...)     -> HookResult        Execute a hook script
    run_hooks(event, ...)     -> List[HookResult]  Run all hooks for an event
    extract_step(hook_name)   -> int               Get step number (0-9) from hook name
    is_background_hook(name)  -> bool              Check if hook is background (.bg suffix)
"""
  46. __package__ = 'archivebox'
  47. import os
  48. import re
  49. import json
  50. import signal
  51. import time
  52. import subprocess
  53. from pathlib import Path
  54. from typing import List, Dict, Any, Optional, TypedDict
  55. from django.conf import settings
  56. from django.utils import timezone
  57. # Plugin directories
  58. BUILTIN_PLUGINS_DIR = Path(__file__).parent / 'plugins'
  59. USER_PLUGINS_DIR = Path(getattr(settings, 'DATA_DIR', Path.cwd())) / 'plugins'
  60. # =============================================================================
  61. # Hook Step Extraction
  62. # =============================================================================
  63. def extract_step(hook_name: str) -> int:
  64. """
  65. Extract step number (0-9) from hook name.
  66. Hooks are numbered 00-99 with the first digit determining the step.
  67. Pattern: on_{Model}__{XX}_{description}[.bg].{ext}
  68. Args:
  69. hook_name: Hook filename (e.g., 'on_Snapshot__50_wget.py')
  70. Returns:
  71. Step number 0-9, or 9 (default) for unnumbered hooks.
  72. Examples:
  73. extract_step('on_Snapshot__05_chrome.py') -> 0
  74. extract_step('on_Snapshot__50_wget.py') -> 5
  75. extract_step('on_Snapshot__63_media.bg.py') -> 6
  76. extract_step('on_Snapshot__99_cleanup.sh') -> 9
  77. extract_step('on_Snapshot__unnumbered.py') -> 9 (default)
  78. """
  79. # Pattern matches __XX_ where XX is two digits
  80. match = re.search(r'__(\d{2})_', hook_name)
  81. if match:
  82. two_digit = int(match.group(1))
  83. step = two_digit // 10 # First digit is the step (0-9)
  84. return step
  85. # Log warning for unnumbered hooks and default to step 9
  86. import sys
  87. print(f"Warning: Hook '{hook_name}' has no step number (expected __XX_), defaulting to step 9", file=sys.stderr)
  88. return 9
  89. def is_background_hook(hook_name: str) -> bool:
  90. """
  91. Check if a hook is a background hook (doesn't block step advancement).
  92. Background hooks have '.bg.' in their filename before the extension.
  93. Args:
  94. hook_name: Hook filename (e.g., 'on_Snapshot__20_chrome_tab.bg.js')
  95. Returns:
  96. True if background hook, False if foreground.
  97. Examples:
  98. is_background_hook('on_Snapshot__20_chrome_tab.bg.js') -> True
  99. is_background_hook('on_Snapshot__50_wget.py') -> False
  100. is_background_hook('on_Snapshot__63_media.bg.py') -> True
  101. """
  102. return '.bg.' in hook_name or '__background' in hook_name
  103. class HookResult(TypedDict, total=False):
  104. """Raw result from run_hook()."""
  105. returncode: int
  106. stdout: str
  107. stderr: str
  108. output_json: Optional[Dict[str, Any]]
  109. output_files: List[str]
  110. duration_ms: int
  111. hook: str
  112. plugin: str # Plugin name (directory name, e.g., 'wget', 'screenshot')
  113. hook_name: str # Full hook filename (e.g., 'on_Snapshot__50_wget.py')
  114. # New fields for JSONL parsing
  115. records: List[Dict[str, Any]] # Parsed JSONL records with 'type' field
  116. def discover_hooks(
  117. event_name: str,
  118. filter_disabled: bool = True,
  119. config: Optional[Dict[str, Any]] = None
  120. ) -> List[Path]:
  121. """
  122. Find all hook scripts matching on_{event_name}__*.{sh,py,js} pattern.
  123. Searches both built-in and user plugin directories.
  124. Filters out hooks from disabled plugins by default (respects USE_/SAVE_ flags).
  125. Returns scripts sorted alphabetically by filename for deterministic execution order.
  126. Hook naming convention uses numeric prefixes to control order:
  127. on_Snapshot__10_title.py # runs first
  128. on_Snapshot__15_singlefile.py # runs second
  129. on_Snapshot__26_readability.py # runs later (depends on singlefile)
  130. Args:
  131. event_name: Event name (e.g., 'Snapshot', 'Binary', 'Crawl')
  132. filter_disabled: If True, skip hooks from disabled plugins (default: True)
  133. config: Optional config dict from get_config() (merges file, env, machine, crawl, snapshot)
  134. If None, will call get_config() with global scope
  135. Returns:
  136. Sorted list of hook script paths from enabled plugins only.
  137. Examples:
  138. # With proper config context (recommended):
  139. from archivebox.config.configset import get_config
  140. config = get_config(crawl=my_crawl, snapshot=my_snapshot)
  141. discover_hooks('Snapshot', config=config)
  142. # Returns: [Path('.../on_Snapshot__10_title.py'), ...] (wget excluded if SAVE_WGET=False)
  143. # Without config (uses global defaults):
  144. discover_hooks('Snapshot')
  145. # Returns: [Path('.../on_Snapshot__10_title.py'), ...]
  146. # Show all plugins regardless of enabled status:
  147. discover_hooks('Snapshot', filter_disabled=False)
  148. # Returns: [Path('.../on_Snapshot__10_title.py'), ..., Path('.../on_Snapshot__50_wget.py')]
  149. """
  150. hooks = []
  151. for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
  152. if not base_dir.exists():
  153. continue
  154. # Search for hook scripts in all subdirectories
  155. for ext in ('sh', 'py', 'js'):
  156. pattern = f'*/on_{event_name}__*.{ext}'
  157. hooks.extend(base_dir.glob(pattern))
  158. # Also check for hooks directly in the plugins directory
  159. pattern_direct = f'on_{event_name}__*.{ext}'
  160. hooks.extend(base_dir.glob(pattern_direct))
  161. # Filter by enabled plugins
  162. if filter_disabled:
  163. # Get merged config if not provided (lazy import to avoid circular dependency)
  164. if config is None:
  165. from archivebox.config.configset import get_config
  166. config = get_config(scope='global')
  167. enabled_hooks = []
  168. for hook in hooks:
  169. # Get plugin name from parent directory
  170. # e.g., archivebox/plugins/wget/on_Snapshot__50_wget.py -> 'wget'
  171. plugin_name = hook.parent.name
  172. # Check if this is a plugin directory (not the root plugins dir)
  173. if plugin_name in ('plugins', '.'):
  174. # Hook is in root plugins directory, not a plugin subdir
  175. # Include it by default (no filtering for non-plugin hooks)
  176. enabled_hooks.append(hook)
  177. continue
  178. # Check if plugin is enabled
  179. plugin_config = get_plugin_special_config(plugin_name, config)
  180. if plugin_config['enabled']:
  181. enabled_hooks.append(hook)
  182. hooks = enabled_hooks
  183. # Sort by filename (not full path) to ensure numeric prefix ordering works
  184. # e.g., on_Snapshot__10_title.py sorts before on_Snapshot__26_readability.py
  185. return sorted(set(hooks), key=lambda p: p.name)
  186. def run_hook(
  187. script: Path,
  188. output_dir: Path,
  189. config: Dict[str, Any],
  190. timeout: Optional[int] = None,
  191. **kwargs: Any
  192. ) -> HookResult:
  193. """
  194. Execute a hook script with the given arguments.
  195. This is the low-level hook executor. For running extractors with proper
  196. metadata handling, use call_extractor() instead.
  197. Config is passed to hooks via environment variables. Caller MUST use
  198. get_config() to merge all sources (file, env, machine, crawl, snapshot).
  199. Args:
  200. script: Path to the hook script (.sh, .py, or .js)
  201. output_dir: Working directory for the script (where output files go)
  202. config: Merged config dict from get_config(crawl=..., snapshot=...) - REQUIRED
  203. timeout: Maximum execution time in seconds
  204. If None, auto-detects from PLUGINNAME_TIMEOUT config (fallback to TIMEOUT, default 300)
  205. **kwargs: Arguments passed to the script as --key=value
  206. Returns:
  207. HookResult with 'returncode', 'stdout', 'stderr', 'output_json', 'output_files', 'duration_ms'
  208. Example:
  209. from archivebox.config.configset import get_config
  210. config = get_config(crawl=my_crawl, snapshot=my_snapshot)
  211. result = run_hook(hook_path, output_dir, config=config, url=url, snapshot_id=id)
  212. """
  213. import time
  214. start_time = time.time()
  215. # Auto-detect timeout from plugin config if not explicitly provided
  216. if timeout is None:
  217. plugin_name = script.parent.name
  218. plugin_config = get_plugin_special_config(plugin_name, config)
  219. timeout = plugin_config['timeout']
  220. if not script.exists():
  221. return HookResult(
  222. returncode=1,
  223. stdout='',
  224. stderr=f'Hook script not found: {script}',
  225. output_json=None,
  226. output_files=[],
  227. duration_ms=0,
  228. hook=str(script),
  229. plugin=script.parent.name,
  230. hook_name=script.name,
  231. )
  232. # Determine the interpreter based on file extension
  233. ext = script.suffix.lower()
  234. if ext == '.sh':
  235. cmd = ['bash', str(script)]
  236. elif ext == '.py':
  237. cmd = ['python3', str(script)]
  238. elif ext == '.js':
  239. cmd = ['node', str(script)]
  240. else:
  241. # Try to execute directly (assumes shebang)
  242. cmd = [str(script)]
  243. # Build CLI arguments from kwargs
  244. for key, value in kwargs.items():
  245. # Skip keys that start with underscore (internal parameters)
  246. if key.startswith('_'):
  247. continue
  248. arg_key = f'--{key.replace("_", "-")}'
  249. if isinstance(value, bool):
  250. if value:
  251. cmd.append(arg_key)
  252. elif value is not None and value != '':
  253. # JSON-encode complex values, use str for simple ones
  254. # Skip empty strings to avoid --key= which breaks argument parsers
  255. if isinstance(value, (dict, list)):
  256. cmd.append(f'{arg_key}={json.dumps(value)}')
  257. else:
  258. # Ensure value is converted to string and strip whitespace
  259. str_value = str(value).strip()
  260. if str_value: # Only add if non-empty after stripping
  261. cmd.append(f'{arg_key}={str_value}')
  262. # Set up environment with base paths
  263. env = os.environ.copy()
  264. env['DATA_DIR'] = str(getattr(settings, 'DATA_DIR', Path.cwd()))
  265. env['ARCHIVE_DIR'] = str(getattr(settings, 'ARCHIVE_DIR', Path.cwd() / 'archive'))
  266. env.setdefault('MACHINE_ID', getattr(settings, 'MACHINE_ID', '') or os.environ.get('MACHINE_ID', ''))
  267. # Use Machine.config.PATH if set (includes pip/npm bin dirs from providers)
  268. try:
  269. from archivebox.machine.models import Machine
  270. machine = Machine.current()
  271. if machine and machine.config:
  272. machine_path = machine.config.get('config/PATH')
  273. if machine_path:
  274. env['PATH'] = machine_path
  275. # Also set NODE_MODULES_DIR if configured
  276. node_modules_dir = machine.config.get('config/NODE_MODULES_DIR')
  277. if node_modules_dir:
  278. env['NODE_MODULES_DIR'] = node_modules_dir
  279. except Exception:
  280. pass # Fall back to system PATH if Machine not available
  281. # Export all config values to environment (already merged by get_config())
  282. for key, value in config.items():
  283. if value is None:
  284. continue
  285. elif isinstance(value, bool):
  286. env[key] = 'true' if value else 'false'
  287. elif isinstance(value, (list, dict)):
  288. env[key] = json.dumps(value)
  289. else:
  290. env[key] = str(value)
  291. # Create output directory if needed
  292. output_dir.mkdir(parents=True, exist_ok=True)
  293. # Capture files before execution to detect new output
  294. files_before = set(output_dir.rglob('*')) if output_dir.exists() else set()
  295. # Detect if this is a background hook (long-running daemon)
  296. # New convention: .bg. suffix (e.g., on_Snapshot__21_consolelog.bg.js)
  297. # Old convention: __background in stem (for backwards compatibility)
  298. is_background = '.bg.' in script.name or '__background' in script.stem
  299. # Set up output files for ALL hooks (useful for debugging)
  300. stdout_file = output_dir / 'stdout.log'
  301. stderr_file = output_dir / 'stderr.log'
  302. pid_file = output_dir / 'hook.pid'
  303. cmd_file = output_dir / 'cmd.sh'
  304. try:
  305. # Write command script for validation
  306. from archivebox.misc.process_utils import write_cmd_file
  307. write_cmd_file(cmd_file, cmd)
  308. # Open log files for writing
  309. with open(stdout_file, 'w') as out, open(stderr_file, 'w') as err:
  310. process = subprocess.Popen(
  311. cmd,
  312. cwd=str(output_dir),
  313. stdout=out,
  314. stderr=err,
  315. env=env,
  316. )
  317. # Write PID with mtime set to process start time for validation
  318. from archivebox.misc.process_utils import write_pid_file_with_mtime
  319. process_start_time = time.time()
  320. write_pid_file_with_mtime(pid_file, process.pid, process_start_time)
  321. if is_background:
  322. # Background hook - return None immediately, don't wait
  323. # Process continues running, writing to stdout.log
  324. # ArchiveResult will poll for completion later
  325. return None
  326. # Normal hook - wait for completion with timeout
  327. try:
  328. returncode = process.wait(timeout=timeout)
  329. except subprocess.TimeoutExpired:
  330. process.kill()
  331. process.wait() # Clean up zombie
  332. duration_ms = int((time.time() - start_time) * 1000)
  333. return HookResult(
  334. returncode=-1,
  335. stdout='',
  336. stderr=f'Hook timed out after {timeout} seconds',
  337. output_json=None,
  338. output_files=[],
  339. duration_ms=duration_ms,
  340. hook=str(script),
  341. )
  342. # Read output from files
  343. stdout = stdout_file.read_text() if stdout_file.exists() else ''
  344. stderr = stderr_file.read_text() if stderr_file.exists() else ''
  345. # Detect new files created by the hook
  346. files_after = set(output_dir.rglob('*')) if output_dir.exists() else set()
  347. new_files = [str(f.relative_to(output_dir)) for f in (files_after - files_before) if f.is_file()]
  348. # Exclude the log files themselves from new_files
  349. new_files = [f for f in new_files if f not in ('stdout.log', 'stderr.log', 'hook.pid')]
  350. # Parse JSONL output from stdout
  351. # Each line starting with { that has 'type' field is a record
  352. records = []
  353. plugin_name = script.parent.name # Plugin directory name (e.g., 'wget')
  354. hook_name = script.name # Full hook filename (e.g., 'on_Snapshot__50_wget.py')
  355. for line in stdout.splitlines():
  356. line = line.strip()
  357. if not line or not line.startswith('{'):
  358. continue
  359. try:
  360. data = json.loads(line)
  361. if 'type' in data:
  362. # Add plugin metadata to every record
  363. data['plugin'] = plugin_name
  364. data['hook_name'] = hook_name
  365. data['plugin_hook'] = str(script)
  366. records.append(data)
  367. except json.JSONDecodeError:
  368. pass
  369. duration_ms = int((time.time() - start_time) * 1000)
  370. # Clean up log files on success (keep on failure for debugging)
  371. if returncode == 0:
  372. stdout_file.unlink(missing_ok=True)
  373. stderr_file.unlink(missing_ok=True)
  374. pid_file.unlink(missing_ok=True)
  375. return HookResult(
  376. returncode=returncode,
  377. stdout=stdout,
  378. stderr=stderr,
  379. output_json=output_json,
  380. output_files=new_files,
  381. duration_ms=duration_ms,
  382. hook=str(script),
  383. plugin=plugin_name,
  384. hook_name=hook_name,
  385. records=records,
  386. )
  387. except Exception as e:
  388. duration_ms = int((time.time() - start_time) * 1000)
  389. return HookResult(
  390. returncode=-1,
  391. stdout='',
  392. stderr=f'Failed to run hook: {type(e).__name__}: {e}',
  393. output_json=None,
  394. output_files=[],
  395. duration_ms=duration_ms,
  396. hook=str(script),
  397. plugin=script.parent.name,
  398. hook_name=script.name,
  399. records=[],
  400. )
  401. def collect_urls_from_plugins(snapshot_dir: Path) -> List[Dict[str, Any]]:
  402. """
  403. Collect all urls.jsonl entries from parser plugin output subdirectories.
  404. Each parser plugin outputs urls.jsonl to its own subdir:
  405. snapshot_dir/parse_rss_urls/urls.jsonl
  406. snapshot_dir/parse_html_urls/urls.jsonl
  407. etc.
  408. This is not special handling - urls.jsonl is just a normal output file.
  409. This utility collects them all for the crawl system.
  410. """
  411. urls = []
  412. # Look in each immediate subdirectory for urls.jsonl
  413. if not snapshot_dir.exists():
  414. return urls
  415. for subdir in snapshot_dir.iterdir():
  416. if not subdir.is_dir():
  417. continue
  418. urls_file = subdir / 'urls.jsonl'
  419. if not urls_file.exists():
  420. continue
  421. try:
  422. with open(urls_file, 'r') as f:
  423. for line in f:
  424. line = line.strip()
  425. if line:
  426. try:
  427. entry = json.loads(line)
  428. if entry.get('url'):
  429. # Track which parser plugin found this URL
  430. entry['plugin'] = subdir.name
  431. urls.append(entry)
  432. except json.JSONDecodeError:
  433. continue
  434. except Exception:
  435. pass
  436. return urls
  437. def run_hooks(
  438. event_name: str,
  439. output_dir: Path,
  440. config: Dict[str, Any],
  441. timeout: Optional[int] = None,
  442. stop_on_failure: bool = False,
  443. **kwargs: Any
  444. ) -> List[HookResult]:
  445. """
  446. Run all hooks for a given event.
  447. Args:
  448. event_name: The event name to trigger (e.g., 'Snapshot', 'Crawl', 'Binary')
  449. output_dir: Working directory for hook scripts
  450. config: Merged config dict from get_config(crawl=..., snapshot=...) - REQUIRED
  451. timeout: Maximum execution time per hook (None = auto-detect from plugin config)
  452. stop_on_failure: If True, stop executing hooks after first failure
  453. **kwargs: Arguments passed to each hook script
  454. Returns:
  455. List of results from each hook execution
  456. Example:
  457. from archivebox.config.configset import get_config
  458. config = get_config(crawl=my_crawl, snapshot=my_snapshot)
  459. results = run_hooks('Snapshot', output_dir, config=config, url=url, snapshot_id=id)
  460. """
  461. hooks = discover_hooks(event_name, config=config)
  462. results = []
  463. for hook in hooks:
  464. result = run_hook(hook, output_dir, config=config, timeout=timeout, **kwargs)
  465. # Background hooks return None - skip adding to results
  466. if result is None:
  467. continue
  468. result['hook'] = str(hook)
  469. results.append(result)
  470. if stop_on_failure and result['returncode'] != 0:
  471. break
  472. return results
  473. def get_plugins() -> List[str]:
  474. """
  475. Get list of available plugins by discovering Snapshot hooks.
  476. Returns plugin names (directory names) that contain on_Snapshot hooks.
  477. The plugin name is the plugin directory name, not the hook script name.
  478. Example:
  479. archivebox/plugins/chrome_session/on_Snapshot__20_chrome_tab.bg.js
  480. -> plugin = 'chrome_session'
  481. Sorted alphabetically (plugins control their hook order via numeric prefixes in hook names).
  482. """
  483. plugins = []
  484. for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
  485. if not base_dir.exists():
  486. continue
  487. for ext in ('sh', 'py', 'js'):
  488. for hook_path in base_dir.glob(f'*/on_Snapshot__*.{ext}'):
  489. # Use plugin directory name as plugin name
  490. plugin_name = hook_path.parent.name
  491. plugins.append(plugin_name)
  492. return sorted(set(plugins))
  493. def get_parser_plugins() -> List[str]:
  494. """
  495. Get list of parser plugins by discovering parse_*_urls hooks.
  496. Parser plugins discover URLs from source files and output urls.jsonl.
  497. Returns plugin names like: ['50_parse_html_urls', '51_parse_rss_urls', ...]
  498. """
  499. return [e for e in get_plugins() if 'parse_' in e and '_urls' in e]
  500. def get_plugin_name(plugin: str) -> str:
  501. """
  502. Get the base plugin name without numeric prefix.
  503. Examples:
  504. '10_title' -> 'title'
  505. '26_readability' -> 'readability'
  506. '50_parse_html_urls' -> 'parse_html_urls'
  507. """
  508. # Split on first underscore after any leading digits
  509. parts = plugin.split('_', 1)
  510. if len(parts) == 2 and parts[0].isdigit():
  511. return parts[1]
  512. return plugin
  513. def is_parser_plugin(plugin: str) -> bool:
  514. """Check if a plugin is a parser plugin (discovers URLs)."""
  515. name = get_plugin_name(plugin)
  516. return name.startswith('parse_') and name.endswith('_urls')
  517. def get_enabled_plugins(config: Optional[Dict[str, Any]] = None) -> List[str]:
  518. """
  519. Get the list of enabled plugins based on config and available hooks.
  520. Filters plugins by USE_/SAVE_ flags. Only returns plugins that are enabled.
  521. Args:
  522. config: Merged config dict from get_config() - if None, uses global config
  523. Returns:
  524. Plugin names sorted alphabetically (numeric prefix controls order).
  525. Example:
  526. from archivebox.config.configset import get_config
  527. config = get_config(crawl=my_crawl, snapshot=my_snapshot)
  528. enabled = get_enabled_plugins(config) # ['wget', 'media', 'chrome', ...]
  529. """
  530. # Get merged config if not provided
  531. if config is None:
  532. from archivebox.config.configset import get_config
  533. config = get_config(scope='global')
  534. # Support explicit ENABLED_PLUGINS override (legacy)
  535. if 'ENABLED_PLUGINS' in config:
  536. return config['ENABLED_PLUGINS']
  537. if 'ENABLED_EXTRACTORS' in config:
  538. return config['ENABLED_EXTRACTORS']
  539. # Filter all plugins by enabled status
  540. all_plugins = get_plugins()
  541. enabled = []
  542. for plugin in all_plugins:
  543. plugin_config = get_plugin_special_config(plugin, config)
  544. if plugin_config['enabled']:
  545. enabled.append(plugin)
  546. return enabled
  547. def discover_plugins_that_provide_interface(
  548. module_name: str,
  549. required_attrs: List[str],
  550. plugin_prefix: Optional[str] = None,
  551. ) -> Dict[str, Any]:
  552. """
  553. Discover plugins that provide a specific Python module with required interface.
  554. This enables dynamic plugin discovery for features like search backends,
  555. storage backends, etc. without hardcoding imports.
  556. Args:
  557. module_name: Name of the module to look for (e.g., 'search')
  558. required_attrs: List of attributes the module must have (e.g., ['search', 'flush'])
  559. plugin_prefix: Optional prefix to filter plugins (e.g., 'search_backend_')
  560. Returns:
  561. Dict mapping backend names to imported modules.
  562. Backend name is derived from plugin directory name minus the prefix.
  563. e.g., search_backend_sqlite -> 'sqlite'
  564. Example:
  565. backends = discover_plugins_that_provide_interface(
  566. module_name='search',
  567. required_attrs=['search', 'flush'],
  568. plugin_prefix='search_backend_',
  569. )
  570. # Returns: {'sqlite': <module>, 'sonic': <module>, 'ripgrep': <module>}
  571. """
  572. import importlib.util
  573. backends = {}
  574. for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
  575. if not base_dir.exists():
  576. continue
  577. for plugin_dir in base_dir.iterdir():
  578. if not plugin_dir.is_dir():
  579. continue
  580. plugin_name = plugin_dir.name
  581. # Filter by prefix if specified
  582. if plugin_prefix and not plugin_name.startswith(plugin_prefix):
  583. continue
  584. # Look for the module file
  585. module_path = plugin_dir / f'{module_name}.py'
  586. if not module_path.exists():
  587. continue
  588. try:
  589. # Import the module dynamically
  590. spec = importlib.util.spec_from_file_location(
  591. f'archivebox.plugins.{plugin_name}.{module_name}',
  592. module_path
  593. )
  594. if spec is None or spec.loader is None:
  595. continue
  596. module = importlib.util.module_from_spec(spec)
  597. spec.loader.exec_module(module)
  598. # Check for required attributes
  599. if not all(hasattr(module, attr) for attr in required_attrs):
  600. continue
  601. # Derive backend name from plugin directory name
  602. if plugin_prefix:
  603. backend_name = plugin_name[len(plugin_prefix):]
  604. else:
  605. backend_name = plugin_name
  606. backends[backend_name] = module
  607. except Exception:
  608. # Skip plugins that fail to import
  609. continue
  610. return backends
  611. def get_search_backends() -> Dict[str, Any]:
  612. """
  613. Discover all available search backend plugins.
  614. Search backends must provide a search.py module with:
  615. - search(query: str) -> List[str] (returns snapshot IDs)
  616. - flush(snapshot_ids: Iterable[str]) -> None
  617. Returns:
  618. Dict mapping backend names to their modules.
  619. e.g., {'sqlite': <module>, 'sonic': <module>, 'ripgrep': <module>}
  620. """
  621. return discover_plugins_that_provide_interface(
  622. module_name='search',
  623. required_attrs=['search', 'flush'],
  624. plugin_prefix='search_backend_',
  625. )
  626. def discover_plugin_configs() -> Dict[str, Dict[str, Any]]:
  627. """
  628. Discover all plugin config.json schemas.
  629. Each plugin can define a config.json file with JSONSchema defining
  630. its configuration options. This function discovers and loads all such schemas.
  631. The config.json files use JSONSchema draft-07 with custom extensions:
  632. - x-fallback: Global config key to use as fallback
  633. - x-aliases: List of old/alternative config key names
  634. Returns:
  635. Dict mapping plugin names to their parsed JSONSchema configs.
  636. e.g., {'wget': {...schema...}, 'chrome_session': {...schema...}}
  637. Example config.json:
  638. {
  639. "$schema": "http://json-schema.org/draft-07/schema#",
  640. "type": "object",
  641. "properties": {
  642. "SAVE_WGET": {"type": "boolean", "default": true},
  643. "WGET_TIMEOUT": {"type": "integer", "default": 60, "x-fallback": "TIMEOUT"}
  644. }
  645. }
  646. """
  647. configs = {}
  648. for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
  649. if not base_dir.exists():
  650. continue
  651. for plugin_dir in base_dir.iterdir():
  652. if not plugin_dir.is_dir():
  653. continue
  654. config_path = plugin_dir / 'config.json'
  655. if not config_path.exists():
  656. continue
  657. try:
  658. with open(config_path, 'r') as f:
  659. schema = json.load(f)
  660. # Basic validation: must be an object with properties
  661. if not isinstance(schema, dict):
  662. continue
  663. if schema.get('type') != 'object':
  664. continue
  665. if 'properties' not in schema:
  666. continue
  667. configs[plugin_dir.name] = schema
  668. except (json.JSONDecodeError, OSError) as e:
  669. # Log warning but continue - malformed config shouldn't break discovery
  670. import sys
  671. print(f"Warning: Failed to load config.json from {plugin_dir.name}: {e}", file=sys.stderr)
  672. continue
  673. return configs
  674. def get_config_defaults_from_plugins() -> Dict[str, Any]:
  675. """
  676. Get default values for all plugin config options.
  677. Returns:
  678. Dict mapping config keys to their default values.
  679. e.g., {'SAVE_WGET': True, 'WGET_TIMEOUT': 60, ...}
  680. """
  681. plugin_configs = discover_plugin_configs()
  682. defaults = {}
  683. for plugin_name, schema in plugin_configs.items():
  684. properties = schema.get('properties', {})
  685. for key, prop_schema in properties.items():
  686. if 'default' in prop_schema:
  687. defaults[key] = prop_schema['default']
  688. return defaults
  689. def get_plugin_special_config(plugin_name: str, config: Dict[str, Any]) -> Dict[str, Any]:
  690. """
  691. Extract special config keys for a plugin following naming conventions.
  692. ArchiveBox recognizes 3 special config key patterns per plugin:
  693. - {PLUGIN}_ENABLED: Enable/disable toggle (default True)
  694. - {PLUGIN}_TIMEOUT: Plugin-specific timeout (fallback to TIMEOUT, default 300)
  695. - {PLUGIN}_BINARY: Primary binary path (default to plugin_name)
  696. These allow ArchiveBox to:
  697. - Skip disabled plugins (optimization)
  698. - Enforce plugin-specific timeouts automatically
  699. - Discover plugin binaries for validation
  700. Args:
  701. plugin_name: Plugin name (e.g., 'wget', 'media', 'chrome')
  702. config: Merged config dict from get_config() (properly merges file, env, machine, crawl, snapshot)
  703. Returns:
  704. Dict with standardized keys:
  705. {
  706. 'enabled': True, # bool
  707. 'timeout': 60, # int, seconds
  708. 'binary': 'wget', # str, path or name
  709. }
  710. Examples:
  711. >>> from archivebox.config.configset import get_config
  712. >>> config = get_config(crawl=my_crawl, snapshot=my_snapshot)
  713. >>> get_plugin_special_config('wget', config)
  714. {'enabled': True, 'timeout': 120, 'binary': '/usr/bin/wget'}
  715. """
  716. import os
  717. plugin_upper = plugin_name.upper()
  718. # 1. Enabled: PLUGINNAME_ENABLED (default True unless DISABLE_ALL_PLUGINS is set)
  719. # Old names (USE_*, SAVE_*) are aliased in config.json via x-aliases
  720. enabled_key = f'{plugin_upper}_ENABLED'
  721. # Check for global plugin disable flag (useful for tests)
  722. # When set, only plugins with EXPLICIT env var override are enabled
  723. global_disable = config.get('DISABLE_ALL_PLUGINS') or os.environ.get('DISABLE_ALL_PLUGINS', '')
  724. is_globally_disabled = global_disable is True or (isinstance(global_disable, str) and global_disable.lower() in ('true', '1', 'yes'))
  725. if is_globally_disabled:
  726. # When globally disabled, only check environment for explicit override
  727. # Ignore defaults from config.json
  728. env_val = os.environ.get(enabled_key, '')
  729. if env_val.lower() in ('true', '1', 'yes'):
  730. enabled = True
  731. else:
  732. enabled = False
  733. else:
  734. # Normal mode - check config (which includes defaults from config.json)
  735. enabled = config.get(enabled_key)
  736. if enabled is None:
  737. enabled = True
  738. elif isinstance(enabled, str):
  739. # Handle string values from config file ("true"/"false")
  740. enabled = enabled.lower() not in ('false', '0', 'no', '')
  741. # 2. Timeout: PLUGINNAME_TIMEOUT (fallback to TIMEOUT, default 300)
  742. timeout_key = f'{plugin_upper}_TIMEOUT'
  743. timeout = config.get(timeout_key) or config.get('TIMEOUT', 300)
  744. # 3. Binary: PLUGINNAME_BINARY (default to plugin_name)
  745. binary_key = f'{plugin_upper}_BINARY'
  746. binary = config.get(binary_key, plugin_name)
  747. return {
  748. 'enabled': bool(enabled),
  749. 'timeout': int(timeout),
  750. 'binary': str(binary),
  751. }
  752. # =============================================================================
  753. # Plugin Template Discovery
  754. # =============================================================================
#
# Plugins can provide custom templates for rendering their output in the UI.
# Templates are discovered by filename convention inside each plugin's templates/ dir:
#
#     archivebox/plugins/<plugin_name>/
#         templates/
#             icon.html        # Icon for admin table view (small inline HTML)
#             thumbnail.html   # Preview thumbnail for snapshot cards
#             embed.html       # Iframe embed content for main preview
#             fullscreen.html  # Fullscreen view template
#
# Template context variables available:
#     {{ result }}       - ArchiveResult object
#     {{ snapshot }}     - Parent Snapshot object
#     {{ output_path }}  - Path to output file/dir relative to snapshot dir
#     {{ plugin }}       - Plugin name (e.g., 'screenshot', 'singlefile')
#
# Default templates used when plugin doesn't provide one
# Keyed by template type ('icon', 'thumbnail', 'embed', 'fullscreen'); each value
# is a template source string using {{ ... }} placeholders. get_plugin_template()
# returns the matching entry when called with fallback=True and no plugin template
# file is found.
DEFAULT_TEMPLATES = {
    # Inline <span> whose tooltip is the plugin name.
    # NOTE(review): {{ icon }} is not among the context variables listed in the
    # section comment above - confirm the renderer supplies it.
    'icon': '''<span title="{{ plugin }}">{{ icon }}</span>''',
    # Size-capped <img> of the output; the onerror handler hides the element
    # if the image fails to load.
    'thumbnail': '''
<img src="{{ output_path }}"
alt="{{ plugin }} output"
style="max-width: 100%; max-height: 100px; object-fit: cover;"
onerror="this.style.display='none'">
''',
    # Sandboxed <iframe> pointing at the output, sized to fill its container.
    'embed': '''
<iframe src="{{ output_path }}"
style="width: 100%; height: 100%; border: none;"
sandbox="allow-same-origin allow-scripts">
</iframe>
''',
    # Same as 'embed' but full viewport height, with allow-forms added to the sandbox.
    'fullscreen': '''
<iframe src="{{ output_path }}"
style="width: 100%; height: 100vh; border: none;"
sandbox="allow-same-origin allow-scripts allow-forms">
</iframe>
''',
}
  794. def get_plugin_template(plugin: str, template_name: str, fallback: bool = True) -> Optional[str]:
  795. """
  796. Get a plugin template by plugin name and template type.
  797. Args:
  798. plugin: Plugin name (e.g., 'screenshot', '15_singlefile')
  799. template_name: One of 'icon', 'thumbnail', 'embed', 'fullscreen'
  800. fallback: If True, return default template if plugin template not found
  801. Returns:
  802. Template content as string, or None if not found and fallback=False.
  803. """
  804. base_name = get_plugin_name(plugin)
  805. for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
  806. if not base_dir.exists():
  807. continue
  808. # Look for plugin directory matching plugin name
  809. for plugin_dir in base_dir.iterdir():
  810. if not plugin_dir.is_dir():
  811. continue
  812. # Match by directory name (exact or partial)
  813. if plugin_dir.name == base_name or plugin_dir.name.endswith(f'_{base_name}'):
  814. template_path = plugin_dir / 'templates' / f'{template_name}.html'
  815. if template_path.exists():
  816. return template_path.read_text()
  817. # Fall back to default template if requested
  818. if fallback:
  819. return DEFAULT_TEMPLATES.get(template_name, '')
  820. return None
  821. def get_plugin_icon(plugin: str) -> str:
  822. """
  823. Get the icon for a plugin from its icon.html template.
  824. Args:
  825. plugin: Plugin name (e.g., 'screenshot', '15_singlefile')
  826. Returns:
  827. Icon HTML/emoji string.
  828. """
  829. # Try plugin-provided icon template
  830. icon_template = get_plugin_template(plugin, 'icon', fallback=False)
  831. if icon_template:
  832. return icon_template.strip()
  833. # Fall back to generic folder icon
  834. return '📁'
  835. def get_all_plugin_icons() -> Dict[str, str]:
  836. """
  837. Get icons for all discovered plugins.
  838. Returns:
  839. Dict mapping plugin base names to their icons.
  840. """
  841. icons = {}
  842. for plugin in get_plugins():
  843. base_name = get_plugin_name(plugin)
  844. icons[base_name] = get_plugin_icon(plugin)
  845. return icons
  846. def discover_plugin_templates() -> Dict[str, Dict[str, str]]:
  847. """
  848. Discover all plugin templates organized by plugin.
  849. Returns:
  850. Dict mapping plugin names to dicts of template_name -> template_path.
  851. e.g., {'screenshot': {'icon': '/path/to/icon.html', 'thumbnail': '/path/to/thumbnail.html'}}
  852. """
  853. templates: Dict[str, Dict[str, str]] = {}
  854. for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
  855. if not base_dir.exists():
  856. continue
  857. for plugin_dir in base_dir.iterdir():
  858. if not plugin_dir.is_dir():
  859. continue
  860. templates_dir = plugin_dir / 'templates'
  861. if not templates_dir.exists():
  862. continue
  863. plugin_templates = {}
  864. for template_file in templates_dir.glob('*.html'):
  865. template_name = template_file.stem # icon, thumbnail, embed, fullscreen
  866. plugin_templates[template_name] = str(template_file)
  867. if plugin_templates:
  868. templates[plugin_dir.name] = plugin_templates
  869. return templates
  870. # =============================================================================
  871. # Hook Result Processing Helpers
  872. # =============================================================================
  873. def find_binary_for_cmd(cmd: List[str], machine_id: str) -> Optional[str]:
  874. """
  875. Find Binary for a command, trying abspath first then name.
  876. Only matches binaries on the current machine.
  877. Args:
  878. cmd: Command list (e.g., ['/usr/bin/wget', '-p', 'url'])
  879. machine_id: Current machine ID
  880. Returns:
  881. Binary ID as string if found, None otherwise
  882. """
  883. if not cmd:
  884. return None
  885. from archivebox.machine.models import Binary
  886. bin_path_or_name = cmd[0] if isinstance(cmd, list) else cmd
  887. # Try matching by absolute path first
  888. binary = Binary.objects.filter(
  889. abspath=bin_path_or_name,
  890. machine_id=machine_id
  891. ).first()
  892. if binary:
  893. return str(binary.id)
  894. # Fallback: match by binary name
  895. bin_name = Path(bin_path_or_name).name
  896. binary = Binary.objects.filter(
  897. name=bin_name,
  898. machine_id=machine_id
  899. ).first()
  900. return str(binary.id) if binary else None
  901. def create_model_record(record: Dict[str, Any]) -> Any:
  902. """
  903. Generic helper to create/update model instances from hook JSONL output.
  904. Args:
  905. record: Dict with 'type' field and model data
  906. Returns:
  907. Created/updated model instance, or None if type unknown
  908. """
  909. from archivebox.machine.models import Binary, Machine
  910. record_type = record.pop('type', None)
  911. if not record_type:
  912. return None
  913. # Remove plugin metadata (not model fields)
  914. record.pop('plugin', None)
  915. record.pop('plugin_hook', None)
  916. if record_type == 'Binary':
  917. # Binary requires machine FK
  918. machine = Machine.current()
  919. record.setdefault('machine', machine)
  920. # Required fields check
  921. name = record.get('name')
  922. abspath = record.get('abspath')
  923. if not name or not abspath:
  924. return None
  925. obj, created = Binary.objects.update_or_create(
  926. machine=machine,
  927. name=name,
  928. defaults={
  929. 'abspath': abspath,
  930. 'version': record.get('version', ''),
  931. 'sha256': record.get('sha256', ''),
  932. 'binprovider': record.get('binprovider', 'env'),
  933. }
  934. )
  935. return obj
  936. elif record_type == 'Machine':
  937. # Machine config update (special _method handling)
  938. method = record.pop('_method', None)
  939. if method == 'update':
  940. key = record.get('key')
  941. value = record.get('value')
  942. if key and value:
  943. machine = Machine.current()
  944. if not machine.config:
  945. machine.config = {}
  946. machine.config[key] = value
  947. machine.save(update_fields=['config'])
  948. return machine
  949. return None
  950. # Add more types as needed (Dependency, Snapshot, etc.)
  951. else:
  952. # Unknown type - log warning but don't fail
  953. import sys
  954. print(f"Warning: Unknown record type '{record_type}' from hook output", file=sys.stderr)
  955. return None
  956. def process_hook_records(records: List[Dict[str, Any]], overrides: Dict[str, Any] = None) -> Dict[str, int]:
  957. """
  958. Process JSONL records from hook output.
  959. Dispatches to Model.from_jsonl() for each record type.
  960. Args:
  961. records: List of JSONL record dicts from result['records']
  962. overrides: Dict with 'snapshot', 'crawl', 'dependency', 'created_by_id', etc.
  963. Returns:
  964. Dict with counts by record type
  965. """
  966. stats = {}
  967. overrides = overrides or {}
  968. for record in records:
  969. record_type = record.get('type')
  970. if not record_type:
  971. continue
  972. # Skip ArchiveResult records (they update the calling ArchiveResult, not create new ones)
  973. if record_type == 'ArchiveResult':
  974. continue
  975. try:
  976. # Dispatch to appropriate model's from_jsonl() method
  977. if record_type == 'Snapshot':
  978. from archivebox.core.models import Snapshot
  979. obj = Snapshot.from_jsonl(record.copy(), overrides)
  980. if obj:
  981. stats['Snapshot'] = stats.get('Snapshot', 0) + 1
  982. elif record_type == 'Tag':
  983. from archivebox.core.models import Tag
  984. obj = Tag.from_jsonl(record.copy(), overrides)
  985. if obj:
  986. stats['Tag'] = stats.get('Tag', 0) + 1
  987. elif record_type == 'Binary':
  988. from archivebox.machine.models import Binary
  989. obj = Binary.from_jsonl(record.copy(), overrides)
  990. if obj:
  991. stats['Binary'] = stats.get('Binary', 0) + 1
  992. elif record_type == 'Machine':
  993. from archivebox.machine.models import Machine
  994. obj = Machine.from_jsonl(record.copy(), overrides)
  995. if obj:
  996. stats['Machine'] = stats.get('Machine', 0) + 1
  997. else:
  998. import sys
  999. print(f"Warning: Unknown record type '{record_type}' from hook output", file=sys.stderr)
  1000. except Exception as e:
  1001. import sys
  1002. print(f"Warning: Failed to create {record_type}: {e}", file=sys.stderr)
  1003. continue
  1004. return stats