on_Snapshot__27_headers.bg.js 7.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247
  1. #!/usr/bin/env node
  2. /**
  3. * Capture original request + response headers for the main navigation.
  4. *
  5. * This hook sets up CDP listeners BEFORE chrome_navigate loads the page,
  6. * then waits for navigation to complete. It records the first top-level
  7. * request headers and the corresponding response headers (with :status).
  8. *
  9. * Usage: on_Snapshot__27_headers.bg.js --url=<url> --snapshot-id=<uuid>
  10. * Output: Writes headers.json
  11. */
  12. const fs = require('fs');
  13. const path = require('path');
  14. // Add NODE_MODULES_DIR to module resolution paths if set
  15. if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
  16. const puppeteer = require('puppeteer-core');
  17. // Import shared utilities from chrome_utils.js
  18. const {
  19. getEnvBool,
  20. getEnvInt,
  21. parseArgs,
  22. connectToPage,
  23. waitForPageLoaded,
  24. } = require('../chrome/chrome_utils.js');
  // ---- Static configuration -------------------------------------------------
  const PLUGIN_NAME = 'headers';

  // headers.json is written to OUTPUT_DIR/OUTPUT_FILE (relative to the cwd).
  const OUTPUT_DIR = '.';
  const OUTPUT_FILE = 'headers.json';

  // Directory this script reads cdp_url.txt, target_id.txt, chrome.pid and
  // final_url.txt from.
  const CHROME_SESSION_DIR = '../chrome';
  const CHROME_SESSION_REQUIRED_ERROR = 'No Chrome session found (chrome plugin must run first)';

  // ---- Connection handles (assigned once the Chrome session is attached) ----
  let browser = null;
  let page = null;
  let client = null;

  // ---- Lifecycle flags ------------------------------------------------------
  // shuttingDown: set by emitResult() so the result line is printed only once.
  let shuttingDown = false;
  // headersWritten: set by writeHeadersFile() after headers.json is on disk.
  let headersWritten = false;

  // ---- Captured request (filled in by the Network.requestWillBeSent handler) -
  let requestId = null;
  let requestUrl = null;
  let requestHeaders = null;

  // ---- Captured response (filled in by whichever CDP handler sees it first) -
  let responseHeaders = null;
  let responseStatus = null;
  let responseStatusText = null;
  let responseUrl = null;

  // URL passed on the command line; used as a fallback when no request URL
  // was captured.
  let originalUrl = null;
  /**
   * Work out the URL to report as `final_url`.
   *
   * Prefers the contents of `final_url.txt` in the Chrome session directory
   * (trimmed). When that file does not exist, falls back to the connected
   * page's current URL, or `null` when no page has been attached yet.
   *
   * @returns {string|null} The final URL, or null if it cannot be determined.
   */
  function getFinalUrl() {
    const recordedUrlPath = path.join(CHROME_SESSION_DIR, 'final_url.txt');

    if (!fs.existsSync(recordedUrlPath)) {
      if (page) {
        return page.url();
      }
      return null;
    }

    const recordedUrl = fs.readFileSync(recordedUrlPath, 'utf8');
    return recordedUrl.trim();
  }
  /**
   * Persist the captured request/response headers to OUTPUT_DIR/OUTPUT_FILE.
   *
   * Does nothing when the file has already been written or when no response
   * headers have been captured yet. On success sets `headersWritten` so later
   * calls become no-ops. Errors from the file write propagate to the caller.
   */
  function writeHeadersFile() {
    if (headersWritten || !responseHeaders) return;

    const destination = path.join(OUTPUT_DIR, OUTPUT_FILE);

    // Copy the headers so the ':status' pseudo-header can be added without
    // touching the captured object.
    const headersWithStatus = { ...responseHeaders };
    const statusKnown = responseStatus !== null && responseStatus !== undefined;
    if (statusKnown && headersWithStatus[':status'] === undefined) {
      headersWithStatus[':status'] = String(responseStatus);
    }

    const record = {
      url: requestUrl || originalUrl,
      final_url: getFinalUrl(),
      status: responseStatus === undefined ? null : responseStatus,
      request_headers: requestHeaders || {},
      response_headers: headersWithStatus,
      headers: headersWithStatus, // backwards compatibility
    };

    // Optional fields are only present when a value was captured.
    if (responseStatusText) record.statusText = responseStatusText;
    if (responseUrl) record.response_url = responseUrl;

    const serialized = JSON.stringify(record, null, 2);
    fs.writeFileSync(destination, serialized);
    headersWritten = true;
  }
  /**
   * Attach to the existing Chrome session and register CDP network listeners
   * that capture the first document request and its response headers.
   *
   * Steps:
   *   1. Require cdp_url.txt, target_id.txt and chrome.pid in the session dir.
   *   2. Require chrome.pid to hold a positive integer pid that answers
   *      signal 0 (i.e. the process can be signalled by us).
   *   3. Connect via connectToPage(), open a CDP session on the page target
   *      and enable the Network domain.
   *   4. Record the first http(s) request whose type is 'Document' (or has no
   *      type), then the matching response - either a redirect response
   *      delivered on a follow-up requestWillBeSent, or a responseReceived
   *      event - and write headers.json as soon as the response is known.
   *
   * @param {string} url - The snapshot URL. Currently unused by this function;
   *   kept for interface compatibility.
   * @returns {Promise<{browser: object, page: object}>} The objects returned
   *   by connectToPage().
   * @throws {Error} CHROME_SESSION_REQUIRED_ERROR when the session files are
   *   missing or the recorded pid is invalid / not signallable. Errors from
   *   connectToPage() or the CDP session setup propagate unchanged.
   */
  async function setupListener(url) {
    const timeoutMs = getEnvInt('HEADERS_TIMEOUT', getEnvInt('TIMEOUT', 30)) * 1000;

    const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
    const targetIdFile = path.join(CHROME_SESSION_DIR, 'target_id.txt');
    const pidFile = path.join(CHROME_SESSION_DIR, 'chrome.pid');

    const sessionFilesPresent = [cdpFile, targetIdFile, pidFile].every((file) => fs.existsSync(file));
    if (!sessionFilesPresent) {
      throw new Error(CHROME_SESSION_REQUIRED_ERROR);
    }

    try {
      const pid = Number.parseInt(fs.readFileSync(pidFile, 'utf8').trim(), 10);
      // Only a positive integer identifies a single process. Zero and negative
      // values address process groups (or every process) in kill(2), so a
      // corrupt pid file such as "-1" must not pass the liveness probe.
      if (!Number.isInteger(pid) || pid <= 0) throw new Error('Invalid pid');
      // Signal 0 performs the existence/permission check without signalling.
      process.kill(pid, 0);
    } catch (e) {
      throw new Error(CHROME_SESSION_REQUIRED_ERROR);
    }

    const connection = await connectToPage({
      chromeSessionDir: CHROME_SESSION_DIR,
      timeoutMs,
      puppeteer,
    });
    const attachedBrowser = connection.browser;
    const attachedPage = connection.page;

    client = await attachedPage.target().createCDPSession();
    await client.send('Network.enable');

    // Listener failures stay non-fatal (capture is best-effort), but they are
    // reported on stderr instead of being dropped silently, so that e.g. a
    // failed headers.json write is visible in the logs.
    const guarded = (eventName, handler) => (params) => {
      try {
        handler(params);
      } catch (e) {
        const detail = (e && e.message) ? e.message : String(e);
        console.error(`WARN: ${eventName} handler error: ${detail}`);
      }
    };

    client.on('Network.requestWillBeSent', guarded('Network.requestWillBeSent', (params) => {
      // A follow-up event for the tracked request that carries
      // redirectResponse holds the response to the original request.
      if (requestId && !responseHeaders && params.redirectResponse && params.requestId === requestId) {
        const redirect = params.redirectResponse;
        responseHeaders = redirect.headers || {};
        responseStatus = redirect.status || null;
        responseStatusText = redirect.statusText || null;
        responseUrl = redirect.url || null;
        writeHeadersFile();
      }

      // Only the first qualifying request is tracked.
      if (requestId) return;
      if (params.type && params.type !== 'Document') return;
      if (!params.request || !params.request.url) return;
      if (!params.request.url.startsWith('http')) return;

      requestId = params.requestId;
      requestUrl = params.request.url;
      requestHeaders = params.request.headers || {};
    }));

    client.on('Network.responseReceived', guarded('Network.responseReceived', (params) => {
      // Ignore responses for other requests and anything after the first
      // captured response.
      if (!requestId || params.requestId !== requestId || responseHeaders) return;

      const response = params.response || {};
      responseHeaders = response.headers || {};
      responseStatus = response.status || null;
      responseStatusText = response.statusText || null;
      responseUrl = response.url || null;
      writeHeadersFile();
    }));

    return { browser: attachedBrowser, page: attachedPage };
  }
  /**
   * Print the final ArchiveResult JSON line on stdout, at most once.
   *
   * The first call sets `shuttingDown`; every later call is a no-op.
   *
   * @param {string} [status='succeeded'] - Result status to report.
   * @param {string} [outputStr=OUTPUT_FILE] - Value for the `output_str` field.
   */
  function emitResult(status = 'succeeded', outputStr = OUTPUT_FILE) {
    if (!shuttingDown) {
      shuttingDown = true;
      const payload = {
        type: 'ArchiveResult',
        status,
        output_str: outputStr,
      };
      console.log(JSON.stringify(payload));
    }
  }
  /**
   * Signal handler: flush whatever was captured, report the result and exit.
   *
   * Makes one last attempt to write headers.json if it has not been written,
   * emits a 'succeeded' result when the file exists or a 'failed' result
   * otherwise, disconnects from the browser (best-effort) and exits with
   * code 0 on success or 1 on failure.
   *
   * A failure of the final write is caught so that a result line is always
   * emitted and the exit code is always set by this handler.
   *
   * @param {string} signal - Name of the received signal (used for logging).
   * @returns {Promise<void>} Does not resolve in practice: the process exits.
   */
  async function handleShutdown(signal) {
    console.error(`\nReceived ${signal}, emitting final results...`);

    let writeError = null;
    if (!headersWritten) {
      try {
        writeHeadersFile();
      } catch (e) {
        // Without this guard the rejection would skip the result line below.
        writeError = e;
        const detail = (e && e.message) ? e.message : String(e);
        console.error(`ERROR: failed to write ${OUTPUT_FILE}: ${detail}`);
      }
    }

    if (headersWritten) {
      emitResult('succeeded', OUTPUT_FILE);
    } else if (writeError) {
      const detail = writeError.message ? writeError.message : String(writeError);
      emitResult('failed', `Failed to write ${OUTPUT_FILE}: ${detail}`);
    } else {
      emitResult('failed', 'No headers captured');
    }

    if (browser) {
      try {
        browser.disconnect();
      } catch (e) {
        // Best-effort cleanup; the process exits immediately afterwards.
      }
    }

    process.exit(headersWritten ? 0 : 1);
  }
  /**
   * Entry point: validate arguments, attach the header listeners, then stay
   * alive until a termination signal triggers handleShutdown().
   *
   * Exits with code 1 on bad usage or setup failure (after printing a
   * 'failed' ArchiveResult for the latter) and with code 0 after printing a
   * 'skipped' ArchiveResult when HEADERS_ENABLED is false.
   */
  async function main() {
    const { url, snapshot_id: snapshotId } = parseArgs();

    if (!url || !snapshotId) {
      console.error('Usage: on_Snapshot__27_headers.bg.js --url=<url> --snapshot-id=<uuid>');
      process.exit(1);
    }

    originalUrl = url;

    const enabled = getEnvBool('HEADERS_ENABLED', true);
    if (!enabled) {
      console.error('Skipping (HEADERS_ENABLED=False)');
      const skipped = { type: 'ArchiveResult', status: 'skipped', output_str: 'HEADERS_ENABLED=False' };
      console.log(JSON.stringify(skipped));
      process.exit(0);
    }

    try {
      // Listeners have to be in place before the navigation starts.
      const { browser: attachedBrowser, page: attachedPage } = await setupListener(url);
      browser = attachedBrowser;
      page = attachedPage;

      // Graceful shutdown on either termination signal.
      for (const signalName of ['SIGTERM', 'SIGINT']) {
        process.on(signalName, () => handleShutdown(signalName));
      }

      // Waiting for the page-loaded marker is advisory: a failure only warns.
      try {
        const baseTimeoutMs = getEnvInt('HEADERS_TIMEOUT', getEnvInt('TIMEOUT', 30)) * 1000;
        await waitForPageLoaded(CHROME_SESSION_DIR, baseTimeoutMs * 4, 200);
      } catch (waitError) {
        console.error(`WARN: ${waitError.message}`);
      }

      // Never settles: the process ends through handleShutdown().
      await new Promise(() => {});
    } catch (e) {
      let errorMessage;
      if (e && e.message) {
        errorMessage = `${e.name || 'Error'}: ${e.message}`;
      } else {
        errorMessage = String(e || 'Unknown error');
      }

      console.error(`ERROR: ${errorMessage}`);
      const failure = {
        type: 'ArchiveResult',
        status: 'failed',
        output_str: errorMessage,
      };
      console.log(JSON.stringify(failure));
      process.exit(1);
    }
  }
  // Run the hook; anything that escapes main() is fatal.
  main().catch((fatal) => {
    const reason = fatal.message;
    console.error(`Fatal error: ${reason}`);
    process.exit(1);
  });