on_Snapshot__52_pdf.js 5.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193
  1. #!/usr/bin/env node
  2. /**
  3. * Print a URL to PDF using Chrome/Puppeteer.
  4. *
  5. * Requires a Chrome session (from chrome plugin) and connects to it via CDP.
  6. *
  7. * Usage: on_Snapshot__52_pdf.js --url=<url> --snapshot-id=<uuid>
  8. * Output: Writes pdf/output.pdf
  9. *
  10. * Environment variables:
  11. * PDF_ENABLED: Enable PDF generation (default: true)
  12. */
  13. const fs = require('fs');
  14. const path = require('path');
  15. // Add NODE_MODULES_DIR to module resolution paths if set
  16. if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
  17. const {
  18. getEnvBool,
  19. parseArgs,
  20. readCdpUrl,
  21. } = require('../chrome/chrome_utils.js');
  22. // Check if PDF is enabled BEFORE requiring puppeteer
  23. if (!getEnvBool('PDF_ENABLED', true)) {
  24. console.error('Skipping PDF (PDF_ENABLED=False)');
  25. // Temporary failure (config disabled) - NO JSONL emission
  26. process.exit(0);
  27. }
  28. // Now safe to require puppeteer
  29. const puppeteer = require('puppeteer-core');
  30. // Extractor metadata
  31. const PLUGIN_NAME = 'pdf';
  32. const OUTPUT_DIR = '.';
  33. const OUTPUT_FILE = 'output.pdf';
  34. const CHROME_SESSION_DIR = '../chrome';
  35. // Check if staticfile extractor already downloaded this URL
  36. const STATICFILE_DIR = '../staticfile';
  37. function hasStaticFileOutput() {
  38. if (!fs.existsSync(STATICFILE_DIR)) return false;
  39. const stdoutPath = path.join(STATICFILE_DIR, 'stdout.log');
  40. if (!fs.existsSync(stdoutPath)) return false;
  41. const stdout = fs.readFileSync(stdoutPath, 'utf8');
  42. for (const line of stdout.split('\n')) {
  43. const trimmed = line.trim();
  44. if (!trimmed.startsWith('{')) continue;
  45. try {
  46. const record = JSON.parse(trimmed);
  47. if (record.type === 'ArchiveResult' && record.status === 'succeeded') {
  48. return true;
  49. }
  50. } catch (e) {}
  51. }
  52. return false;
  53. }
  54. // Wait for chrome tab to be fully loaded
  55. async function waitForChromeTabLoaded(timeoutMs = 60000) {
  56. const navigationFile = path.join(CHROME_SESSION_DIR, 'navigation.json');
  57. const startTime = Date.now();
  58. while (Date.now() - startTime < timeoutMs) {
  59. if (fs.existsSync(navigationFile)) {
  60. return true;
  61. }
  62. // Wait 100ms before checking again
  63. await new Promise(resolve => setTimeout(resolve, 100));
  64. }
  65. return false;
  66. }
  67. async function printToPdf(url) {
  68. // Output directory is current directory (hook already runs in output dir)
  69. const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
  70. let browser = null;
  71. let page = null;
  72. try {
  73. // Connect to existing Chrome session (required)
  74. const cdpUrl = readCdpUrl(CHROME_SESSION_DIR);
  75. if (!cdpUrl) {
  76. return { success: false, error: 'No Chrome session found (chrome plugin must run first)' };
  77. }
  78. browser = await puppeteer.connect({
  79. browserWSEndpoint: cdpUrl,
  80. defaultViewport: null,
  81. });
  82. // Get existing pages or create new one
  83. const pages = await browser.pages();
  84. page = pages.find(p => p.url().startsWith('http')) || pages[0];
  85. if (!page) {
  86. page = await browser.newPage();
  87. }
  88. // Print to PDF
  89. await page.pdf({
  90. path: outputPath,
  91. format: 'A4',
  92. printBackground: true,
  93. margin: {
  94. top: '0.5in',
  95. right: '0.5in',
  96. bottom: '0.5in',
  97. left: '0.5in',
  98. },
  99. });
  100. if (fs.existsSync(outputPath) && fs.statSync(outputPath).size > 0) {
  101. return { success: true, output: outputPath };
  102. } else {
  103. return { success: false, error: 'PDF file not created' };
  104. }
  105. } catch (e) {
  106. return { success: false, error: `${e.name}: ${e.message}` };
  107. } finally {
  108. if (browser) {
  109. browser.disconnect();
  110. }
  111. }
  112. }
  113. async function main() {
  114. const args = parseArgs();
  115. const url = args.url;
  116. const snapshotId = args.snapshot_id;
  117. if (!url || !snapshotId) {
  118. console.error('Usage: on_Snapshot__52_pdf.js --url=<url> --snapshot-id=<uuid>');
  119. process.exit(1);
  120. }
  121. try {
  122. // Check if staticfile extractor already handled this (permanent skip)
  123. if (hasStaticFileOutput()) {
  124. console.error(`Skipping PDF - staticfile extractor already downloaded this`);
  125. // Permanent skip - emit ArchiveResult
  126. console.log(JSON.stringify({
  127. type: 'ArchiveResult',
  128. status: 'skipped',
  129. output_str: 'staticfile already handled',
  130. }));
  131. process.exit(0);
  132. }
  133. const cdpUrl = readCdpUrl(CHROME_SESSION_DIR);
  134. if (!cdpUrl) {
  135. throw new Error('No Chrome session found (chrome plugin must run first)');
  136. }
  137. // Wait for page to be fully loaded
  138. const pageLoaded = await waitForChromeTabLoaded(60000);
  139. if (!pageLoaded) {
  140. throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
  141. }
  142. const result = await printToPdf(url);
  143. if (result.success) {
  144. // Success - emit ArchiveResult
  145. const size = fs.statSync(result.output).size;
  146. console.error(`PDF saved (${size} bytes)`);
  147. console.log(JSON.stringify({
  148. type: 'ArchiveResult',
  149. status: 'succeeded',
  150. output_str: result.output,
  151. }));
  152. process.exit(0);
  153. } else {
  154. // Transient error - emit NO JSONL
  155. console.error(`ERROR: ${result.error}`);
  156. process.exit(1);
  157. }
  158. } catch (e) {
  159. // Transient error - emit NO JSONL
  160. console.error(`ERROR: ${e.name}: ${e.message}`);
  161. process.exit(1);
  162. }
  163. }
  164. main().catch(e => {
  165. console.error(`Fatal error: ${e.message}`);
  166. process.exit(1);
  167. });