on_Snapshot__39_accessibility.js 9.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288
  1. #!/usr/bin/env node
  2. /**
  3. * Extract accessibility tree and page outline from a URL.
  4. *
  5. * Extracts:
  6. * - Page outline (headings h1-h6, sections, articles)
  7. * - Iframe tree
  8. * - Accessibility snapshot
  9. * - ARIA labels and roles
  10. *
  11. * Usage: on_Snapshot__39_accessibility.js --url=<url> --snapshot-id=<uuid>
  12. * Output: Writes accessibility/accessibility.json
  13. *
  14. * Environment variables:
  15. * ACCESSIBILITY_ENABLED: Enable accessibility extraction (default: true)
  16. */
  17. const fs = require('fs');
  18. const path = require('path');
  19. // Add NODE_MODULES_DIR to module resolution paths if set
  20. if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
  21. const puppeteer = require('puppeteer-core');
// Extractor metadata
// Identifier of this extractor (not referenced elsewhere in the visible code).
const PLUGIN_NAME = 'accessibility';
// Output is written relative to the current working directory.
const OUTPUT_DIR = '.';
// Name of the JSON file produced by extractAccessibility().
const OUTPUT_FILE = 'accessibility.json';
// Directory in which the Chrome session files are looked up
// (cdp_url.txt, target_id.txt, chrome.pid, navigation.json).
const CHROME_SESSION_DIR = '../chrome';
// Single error message used for every "Chrome session missing or unusable" failure.
const CHROME_SESSION_REQUIRED_ERROR = 'No Chrome session found (chrome plugin must run first)';
  28. // Parse command line arguments
  29. function parseArgs() {
  30. const args = {};
  31. process.argv.slice(2).forEach(arg => {
  32. if (arg.startsWith('--')) {
  33. const [key, ...valueParts] = arg.slice(2).split('=');
  34. args[key.replace(/-/g, '_')] = valueParts.join('=') || true;
  35. }
  36. });
  37. return args;
  38. }
  39. // Get environment variable with default
  40. function getEnv(name, defaultValue = '') {
  41. return (process.env[name] || defaultValue).trim();
  42. }
  43. function getEnvBool(name, defaultValue = false) {
  44. const val = getEnv(name, '').toLowerCase();
  45. if (['true', '1', 'yes', 'on'].includes(val)) return true;
  46. if (['false', '0', 'no', 'off'].includes(val)) return false;
  47. return defaultValue;
  48. }
  49. // Wait for chrome tab to be fully loaded
  50. async function waitForChromeTabLoaded(timeoutMs = 60000) {
  51. const navigationFile = path.join(CHROME_SESSION_DIR, 'navigation.json');
  52. const startTime = Date.now();
  53. while (Date.now() - startTime < timeoutMs) {
  54. if (fs.existsSync(navigationFile)) {
  55. return true;
  56. }
  57. // Wait 100ms before checking again
  58. await new Promise(resolve => setTimeout(resolve, 100));
  59. }
  60. return false;
  61. }
  62. // Get CDP URL from chrome plugin
  63. function getCdpUrl() {
  64. const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
  65. if (fs.existsSync(cdpFile)) {
  66. return fs.readFileSync(cdpFile, 'utf8').trim();
  67. }
  68. return null;
  69. }
  70. function assertChromeSession() {
  71. const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
  72. const targetIdFile = path.join(CHROME_SESSION_DIR, 'target_id.txt');
  73. const pidFile = path.join(CHROME_SESSION_DIR, 'chrome.pid');
  74. if (!fs.existsSync(cdpFile) || !fs.existsSync(targetIdFile) || !fs.existsSync(pidFile)) {
  75. throw new Error(CHROME_SESSION_REQUIRED_ERROR);
  76. }
  77. try {
  78. const pid = parseInt(fs.readFileSync(pidFile, 'utf8').trim(), 10);
  79. if (!pid || Number.isNaN(pid)) throw new Error('Invalid pid');
  80. process.kill(pid, 0);
  81. } catch (e) {
  82. throw new Error(CHROME_SESSION_REQUIRED_ERROR);
  83. }
  84. const cdpUrl = getCdpUrl();
  85. if (!cdpUrl) {
  86. throw new Error(CHROME_SESSION_REQUIRED_ERROR);
  87. }
  88. return cdpUrl;
  89. }
  90. // Extract accessibility info
  91. async function extractAccessibility(url) {
  92. // Output directory is current directory (hook already runs in output dir)
  93. const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
  94. let browser = null;
  95. try {
  96. // Connect to existing Chrome session
  97. const cdpUrl = assertChromeSession();
  98. browser = await puppeteer.connect({
  99. browserWSEndpoint: cdpUrl,
  100. });
  101. // Get the page
  102. const pages = await browser.pages();
  103. const page = pages.find(p => p.url().startsWith('http')) || pages[0];
  104. if (!page) {
  105. return { success: false, error: 'No page found in Chrome session' };
  106. }
  107. // Get accessibility snapshot
  108. const accessibilityTree = await page.accessibility.snapshot({ interestingOnly: true });
  109. // Extract page outline (headings, sections, etc.)
  110. const outline = await page.evaluate(() => {
  111. const headings = [];
  112. const elements = document.querySelectorAll(
  113. 'h1, h2, h3, h4, h5, h6, a[name], header, footer, article, main, aside, nav, section, figure, summary, table, form, iframe'
  114. );
  115. elements.forEach(elem => {
  116. // Skip unnamed anchors
  117. if (elem.tagName.toLowerCase() === 'a' && !elem.name) return;
  118. const tagName = elem.tagName.toLowerCase();
  119. const elemId = elem.id || elem.name || elem.getAttribute('aria-label') || elem.role || '';
  120. const elemClasses = (elem.className || '').toString().trim().split(/\s+/).slice(0, 3).join(' .');
  121. const action = elem.action?.split('/').pop() || '';
  122. let summary = (elem.innerText || '').slice(0, 128);
  123. if (summary.length >= 128) summary += '...';
  124. let prefix = '';
  125. let title = '';
  126. // Format headings with # prefix
  127. const level = parseInt(tagName.replace('h', ''));
  128. if (!isNaN(level)) {
  129. prefix = '#'.repeat(level);
  130. title = elem.innerText || elemId || elemClasses;
  131. } else {
  132. // For other elements, create breadcrumb path
  133. const parents = [tagName];
  134. let node = elem.parentNode;
  135. while (node && parents.length < 5) {
  136. if (node.tagName) {
  137. const tag = node.tagName.toLowerCase();
  138. if (!['div', 'span', 'p', 'body', 'html'].includes(tag)) {
  139. parents.unshift(tag);
  140. } else {
  141. parents.unshift('');
  142. }
  143. }
  144. node = node.parentNode;
  145. }
  146. prefix = parents.join('>');
  147. title = elemId ? `#${elemId}` : '';
  148. if (!title && elemClasses) title = `.${elemClasses}`;
  149. if (action) title += ` /${action}`;
  150. if (summary && !title.includes(summary)) title += `: ${summary}`;
  151. }
  152. // Clean up title
  153. title = title.replace(/\s+/g, ' ').trim();
  154. if (prefix) {
  155. headings.push(`${prefix} ${title}`);
  156. }
  157. });
  158. return headings;
  159. });
  160. // Get iframe tree
  161. const iframes = [];
  162. function dumpFrameTree(frame, indent = '>') {
  163. iframes.push(indent + frame.url());
  164. for (const child of frame.childFrames()) {
  165. dumpFrameTree(child, indent + '>');
  166. }
  167. }
  168. dumpFrameTree(page.mainFrame(), '');
  169. const accessibilityData = {
  170. url,
  171. headings: outline,
  172. iframes,
  173. tree: accessibilityTree,
  174. };
  175. // Write output
  176. fs.writeFileSync(outputPath, JSON.stringify(accessibilityData, null, 2));
  177. return { success: true, output: outputPath, accessibilityData };
  178. } catch (e) {
  179. return { success: false, error: `${e.name}: ${e.message}` };
  180. } finally {
  181. if (browser) {
  182. browser.disconnect();
  183. }
  184. }
  185. }
  186. async function main() {
  187. const args = parseArgs();
  188. const url = args.url;
  189. const snapshotId = args.snapshot_id;
  190. if (!url || !snapshotId) {
  191. console.error('Usage: on_Snapshot__39_accessibility.js --url=<url> --snapshot-id=<uuid>');
  192. process.exit(1);
  193. }
  194. const startTs = new Date();
  195. let status = 'failed';
  196. let output = null;
  197. let error = '';
  198. try {
  199. // Check if enabled
  200. if (!getEnvBool('ACCESSIBILITY_ENABLED', true)) {
  201. console.log('Skipping accessibility (ACCESSIBILITY_ENABLED=False)');
  202. // Output clean JSONL (no RESULT_JSON= prefix)
  203. console.log(JSON.stringify({
  204. type: 'ArchiveResult',
  205. status: 'skipped',
  206. output_str: 'ACCESSIBILITY_ENABLED=False',
  207. }));
  208. process.exit(0);
  209. }
  210. // Check if Chrome session exists, then wait for page load
  211. assertChromeSession();
  212. const pageLoaded = await waitForChromeTabLoaded(60000);
  213. if (!pageLoaded) {
  214. throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
  215. }
  216. const result = await extractAccessibility(url);
  217. if (result.success) {
  218. status = 'succeeded';
  219. output = result.output;
  220. const headingCount = result.accessibilityData.headings.length;
  221. const iframeCount = result.accessibilityData.iframes.length;
  222. console.log(`Accessibility extracted: ${headingCount} headings, ${iframeCount} iframes`);
  223. } else {
  224. status = 'failed';
  225. error = result.error;
  226. }
  227. } catch (e) {
  228. error = `${e.name}: ${e.message}`;
  229. status = 'failed';
  230. }
  231. const endTs = new Date();
  232. if (error) console.error(`ERROR: ${error}`);
  233. // Output clean JSONL (no RESULT_JSON= prefix)
  234. console.log(JSON.stringify({
  235. type: 'ArchiveResult',
  236. status,
  237. output_str: output || error || '',
  238. }));
  239. process.exit(status === 'succeeded' ? 0 : 1);
  240. }
  241. main().catch(e => {
  242. console.error(`Fatal error: ${e.message}`);
  243. process.exit(1);
  244. });