archivebox_pluginmap.py 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356
  1. #!/usr/bin/env python3
  2. __package__ = 'archivebox.cli'
  3. from typing import Optional
  4. from pathlib import Path
  5. import rich_click as click
  6. from archivebox.misc.util import docstring, enforce_types
# State Machine ASCII Art Diagrams
#
# CRAWL_MACHINE_DIAGRAM is the box-drawn picture of CrawlMachine
# (QUEUED -> STARTED -> SEALED). pluginmap() attaches it to the 'Crawl' entry
# of model_events and renders it inside a rich Panel.
# NOTE(review): the interior rows are far narrower than the full-width border
# rows, so the column alignment appears to have been lost — confirm the
# spacing against the rendered panel.
CRAWL_MACHINE_DIAGRAM = """
┌─────────────────────────────────────────────────────────────────────────────┐
│ CrawlMachine │
├─────────────────────────────────────────────────────────────────────────────┤
│ │
│ ┌─────────────┐ │
│ │ QUEUED │◄────────────────┐ │
│ │ (initial) │ │ │
│ └──────┬──────┘ │ │
│ │ │ tick() unless can_start() │
│ │ tick() when │ │
│ │ can_start() │ │
│ ▼ │ │
│ ┌─────────────┐ │ │
│ │ STARTED │─────────────────┘ │
│ │ │◄────────────────┐ │
│ │ enter: │ │ │
│ │ crawl.run()│ │ tick() unless is_finished() │
│ │ (discover │ │ │
│ │ Crawl │─────────────────┘ │
│ │ hooks) │ │
│ └──────┬──────┘ │
│ │ │
│ │ tick() when is_finished() │
│ ▼ │
│ ┌─────────────┐ │
│ │ SEALED │ │
│ │ (final) │ │
│ │ │ │
│ │ enter: │ │
│ │ cleanup() │ │
│ └─────────────┘ │
│ │
│ Hooks triggered: on_Crawl__* (during STARTED.enter via crawl.run()) │
│ on_CrawlEnd__* (during SEALED.enter via cleanup()) │
└─────────────────────────────────────────────────────────────────────────────┘
"""
# Box-drawn picture of SnapshotMachine (QUEUED -> STARTED -> SEALED).
# pluginmap() attaches it to the 'Snapshot' entry of model_events and renders
# it inside a rich Panel.
# NOTE(review): interior rows are much narrower than the border rows, so the
# column alignment appears to have been lost — confirm against the rendered panel.
SNAPSHOT_MACHINE_DIAGRAM = """
┌─────────────────────────────────────────────────────────────────────────────┐
│ SnapshotMachine │
├─────────────────────────────────────────────────────────────────────────────┤
│ │
│ ┌─────────────┐ │
│ │ QUEUED │◄────────────────┐ │
│ │ (initial) │ │ │
│ └──────┬──────┘ │ │
│ │ │ tick() unless can_start() │
│ │ tick() when │ │
│ │ can_start() │ │
│ ▼ │ │
│ ┌─────────────┐ │ │
│ │ STARTED │─────────────────┘ │
│ │ │◄────────────────┐ │
│ │ enter: │ │ │
│ │ snapshot │ │ tick() unless is_finished() │
│ │ .run() │ │ │
│ │ (discover │─────────────────┘ │
│ │ Snapshot │ │
│ │ hooks, │ │
│ │ create │ │
│ │ pending │ │
│ │ results) │ │
│ └──────┬──────┘ │
│ │ │
│ │ tick() when is_finished() │
│ ▼ │
│ ┌─────────────┐ │
│ │ SEALED │ │
│ │ (final) │ │
│ │ │ │
│ │ enter: │ │
│ │ cleanup() │ │
│ └─────────────┘ │
│ │
│ Hooks triggered: on_Snapshot__* (creates ArchiveResults in STARTED.enter) │
└─────────────────────────────────────────────────────────────────────────────┘
"""
# Box-drawn picture of ArchiveResultMachine (QUEUED -> STARTED -> one of
# SUCCEEDED / FAILED / SKIPPED / BACKOFF). It has no entry in model_events;
# pluginmap() prints it in its own Panel before the per-model sections
# whenever quiet is False.
# NOTE(review): interior rows are much narrower than the border rows, so the
# column alignment appears to have been lost — confirm against the rendered panel.
ARCHIVERESULT_MACHINE_DIAGRAM = """
┌─────────────────────────────────────────────────────────────────────────────┐
│ ArchiveResultMachine │
├─────────────────────────────────────────────────────────────────────────────┤
│ │
│ ┌─────────────┐ │
│ │ QUEUED │◄────────────────┐ │
│ │ (initial) │ │ │
│ └──────┬──────┘ │ │
│ │ │ tick() unless can_start() │
│ │ tick() when │ │
│ │ can_start() │ │
│ ▼ │ │
│ ┌─────────────┐ │ │
│ │ STARTED │─────────────────┘ │
│ │ │◄────────────────┐ │
│ │ enter: │ │ tick() unless is_finished() │
│ │ result.run()│─────────────────┘ │
│ │ (execute │ │
│ │ hook via │ │
│ │ run_hook())│ │
│ └──────┬──────┘ │
│ │ │
│ │ tick() checks status set by hook output │
│ ├────────────────┬────────────────┬────────────────┐ │
│ ▼ ▼ ▼ ▼ │
│ ┌───────────┐ ┌───────────┐ ┌───────────┐ ┌───────────┐ │
│ │ SUCCEEDED │ │ FAILED │ │ SKIPPED │ │ BACKOFF │ │
│ │ (final) │ │ (final) │ │ (final) │ │ │ │
│ └───────────┘ └───────────┘ └───────────┘ └─────┬─────┘ │
│ │ │
│ can_start()───┘ │
│ loops back to STARTED │
│ │
│ Each ArchiveResult runs ONE specific hook (stored in .hook_name field) │
└─────────────────────────────────────────────────────────────────────────────┘
"""
# Box-drawn picture of BinaryMachine (QUEUED -> STARTED -> SUCCEEDED / FAILED).
# pluginmap() attaches it to the 'Binary' entry of model_events and renders it
# inside a rich Panel.
# NOTE(review): interior rows are much narrower than the border rows, so the
# column alignment appears to have been lost — confirm against the rendered panel.
BINARY_MACHINE_DIAGRAM = """
┌─────────────────────────────────────────────────────────────────────────────┐
│ BinaryMachine │
├─────────────────────────────────────────────────────────────────────────────┤
│ │
│ ┌─────────────┐ │
│ │ QUEUED │◄────────────────┐ │
│ │ (initial) │ │ │
│ └──────┬──────┘ │ │
│ │ │ tick() unless can_start() │
│ │ tick() when │ │
│ │ can_start() │ │
│ ▼ │ │
│ ┌─────────────┐ │ │
│ │ STARTED │─────────────────┘ │
│ │ │◄────────────────┐ │
│ │ enter: │ │ │
│ │ binary.run()│ │ tick() unless is_finished() │
│ │ (discover │─────────────────┘ │
│ │ Binary │ │
│ │ hooks, │ │
│ │ try each │ │
│ │ provider) │ │
│ └──────┬──────┘ │
│ │ │
│ │ tick() checks status set by hook output │
│ ├────────────────────────────────┐ │
│ ▼ ▼ │
│ ┌─────────────┐ ┌─────────────┐ │
│ │ SUCCEEDED │ │ FAILED │ │
│ │ (final) │ │ (final) │ │
│ │ │ │ │ │
│ │ abspath, │ │ no provider │ │
│ │ version set │ │ succeeded │ │
│ └─────────────┘ └─────────────┘ │
│ │
│ Hooks triggered: on_Binary__* (provider hooks during STARTED.enter) │
│ Providers tried in sequence until one succeeds: apt, brew, pip, npm, etc. │
└─────────────────────────────────────────────────────────────────────────────┘
"""
  162. @enforce_types
  163. def pluginmap(
  164. show_disabled: bool = False,
  165. model: Optional[str] = None,
  166. quiet: bool = False,
  167. ) -> dict:
  168. """
  169. Show a map of all state machines and their associated plugin hooks.
  170. Displays ASCII art diagrams of the core model state machines (Crawl, Snapshot,
  171. ArchiveResult, Binary) and lists all auto-detected on_Modelname_xyz hooks
  172. that will run for each model's transitions.
  173. """
  174. from rich.console import Console
  175. from rich.table import Table
  176. from rich.panel import Panel
  177. from rich import box
  178. from archivebox.hooks import (
  179. discover_hooks,
  180. extract_step,
  181. is_background_hook,
  182. BUILTIN_PLUGINS_DIR,
  183. USER_PLUGINS_DIR,
  184. )
  185. console = Console()
  186. prnt = console.print
  187. # Model event types that can have hooks
  188. model_events = {
  189. 'Crawl': {
  190. 'description': 'Hooks run when a Crawl starts (QUEUED→STARTED)',
  191. 'machine': 'CrawlMachine',
  192. 'diagram': CRAWL_MACHINE_DIAGRAM,
  193. },
  194. 'CrawlEnd': {
  195. 'description': 'Hooks run when a Crawl finishes (STARTED→SEALED)',
  196. 'machine': 'CrawlMachine',
  197. 'diagram': None, # Part of CrawlMachine
  198. },
  199. 'Snapshot': {
  200. 'description': 'Hooks run for each Snapshot (creates ArchiveResults)',
  201. 'machine': 'SnapshotMachine',
  202. 'diagram': SNAPSHOT_MACHINE_DIAGRAM,
  203. },
  204. 'Binary': {
  205. 'description': 'Hooks for installing binary dependencies (providers)',
  206. 'machine': 'BinaryMachine',
  207. 'diagram': BINARY_MACHINE_DIAGRAM,
  208. },
  209. }
  210. # Filter to specific model if requested
  211. if model:
  212. model = model.title()
  213. if model not in model_events:
  214. prnt(f'[red]Error: Unknown model "{model}". Available: {", ".join(model_events.keys())}[/red]')
  215. return {}
  216. model_events = {model: model_events[model]}
  217. result = {
  218. 'models': {},
  219. 'plugins_dir': str(BUILTIN_PLUGINS_DIR),
  220. 'user_plugins_dir': str(USER_PLUGINS_DIR),
  221. }
  222. if not quiet:
  223. prnt()
  224. prnt('[bold cyan]ArchiveBox Plugin Map[/bold cyan]')
  225. prnt(f'[dim]Built-in plugins: {BUILTIN_PLUGINS_DIR}[/dim]')
  226. prnt(f'[dim]User plugins: {USER_PLUGINS_DIR}[/dim]')
  227. prnt()
  228. # Show diagrams first (unless quiet mode)
  229. if not quiet:
  230. # Show ArchiveResult diagram separately since it's different
  231. prnt(Panel(
  232. ARCHIVERESULT_MACHINE_DIAGRAM,
  233. title='[bold green]ArchiveResultMachine[/bold green]',
  234. border_style='green',
  235. expand=False,
  236. ))
  237. prnt()
  238. for event_name, info in model_events.items():
  239. # Discover hooks for this event
  240. hooks = discover_hooks(event_name, filter_disabled=not show_disabled)
  241. # Build hook info list
  242. hook_infos = []
  243. for hook_path in hooks:
  244. # Get plugin name from parent directory (e.g., 'wget' from 'plugins/wget/on_Snapshot__61_wget.py')
  245. plugin_name = hook_path.parent.name
  246. step = extract_step(hook_path.name)
  247. is_bg = is_background_hook(hook_path.name)
  248. hook_infos.append({
  249. 'path': str(hook_path),
  250. 'name': hook_path.name,
  251. 'plugin': plugin_name,
  252. 'step': step,
  253. 'is_background': is_bg,
  254. 'extension': hook_path.suffix,
  255. })
  256. result['models'][event_name] = {
  257. 'description': info['description'],
  258. 'machine': info['machine'],
  259. 'hooks': hook_infos,
  260. 'hook_count': len(hook_infos),
  261. }
  262. if not quiet:
  263. # Show diagram if this model has one
  264. if info.get('diagram'):
  265. prnt(Panel(
  266. info['diagram'],
  267. title=f'[bold green]{info["machine"]}[/bold green]',
  268. border_style='green',
  269. expand=False,
  270. ))
  271. prnt()
  272. # Create hooks table
  273. table = Table(
  274. title=f'[bold yellow]on_{event_name}__* Hooks[/bold yellow] ({len(hooks)} found)',
  275. box=box.ROUNDED,
  276. show_header=True,
  277. header_style='bold magenta',
  278. )
  279. table.add_column('Step', justify='center', width=6)
  280. table.add_column('Plugin', style='cyan', width=20)
  281. table.add_column('Hook Name', style='green')
  282. table.add_column('BG', justify='center', width=4)
  283. table.add_column('Type', justify='center', width=5)
  284. # Sort by step then by name
  285. sorted_hooks = sorted(hook_infos, key=lambda h: (h['step'], h['name']))
  286. for hook in sorted_hooks:
  287. bg_marker = '[yellow]bg[/yellow]' if hook['is_background'] else ''
  288. ext = hook['extension'].lstrip('.')
  289. table.add_row(
  290. str(hook['step']),
  291. hook['plugin'],
  292. hook['name'],
  293. bg_marker,
  294. ext,
  295. )
  296. prnt(table)
  297. prnt()
  298. prnt(f'[dim]{info["description"]}[/dim]')
  299. prnt()
  300. # Summary
  301. if not quiet:
  302. total_hooks = sum(m['hook_count'] for m in result['models'].values())
  303. prnt(f'[bold]Total hooks discovered: {total_hooks}[/bold]')
  304. prnt()
  305. prnt('[dim]Hook naming convention: on_{Model}__{XX}_{description}[.bg].{ext}[/dim]')
  306. prnt('[dim] - XX: Two-digit order (first digit = step 0-9)[/dim]')
  307. prnt('[dim] - .bg: Background hook (non-blocking)[/dim]')
  308. prnt('[dim] - ext: py, sh, or js[/dim]')
  309. prnt()
  310. return result
  311. @click.command()
  312. @click.option('--show-disabled', '-a', is_flag=True, help='Show hooks from disabled plugins too')
  313. @click.option('--model', '-m', type=str, default=None, help='Filter to specific model (Crawl, Snapshot, Binary, CrawlEnd)')
  314. @click.option('--quiet', '-q', is_flag=True, help='Output JSON only, no ASCII diagrams')
  315. @docstring(pluginmap.__doc__)
  316. def main(**kwargs):
  317. import json
  318. result = pluginmap(**kwargs)
  319. if kwargs.get('quiet'):
  320. print(json.dumps(result, indent=2))
# Run the click command when this module is executed directly as a script.
if __name__ == '__main__':
    main()