on_Snapshot__38_seo.js 4.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169
  1. #!/usr/bin/env node
  2. /**
  3. * Extract SEO metadata from a URL.
  4. *
  5. * Extracts all <meta> tags including:
  6. * - og:* (Open Graph)
  7. * - twitter:*
  8. * - description, keywords, author
  9. * - Any other meta tags
  10. *
  11. * Usage: on_Snapshot__38_seo.js --url=<url> --snapshot-id=<uuid>
  12. * Output: Writes seo/seo.json
  13. *
  14. * Environment variables:
  15. * SEO_ENABLED: Enable SEO extraction (default: true)
  16. */
  17. const fs = require('fs');
  18. const path = require('path');
  19. // Add NODE_MODULES_DIR to module resolution paths if set
  20. if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
  21. const puppeteer = require('puppeteer-core');
  22. // Import shared utilities from chrome_utils.js
  23. const {
  24. getEnvBool,
  25. getEnvInt,
  26. parseArgs,
  27. connectToPage,
  28. waitForPageLoaded,
  29. } = require('../chrome/chrome_utils.js');
  // ---- Extractor metadata ----

  // Directory of the shared Chrome session, relative to this hook's working directory.
  const CHROME_SESSION_DIR = '../chrome';

  // Result location: the hook already runs inside its own output directory,
  // so the JSON file is written straight into the current directory.
  const OUTPUT_FILE = 'seo.json';
  const OUTPUT_DIR = '.';

  // Short identifier of this plugin.
  const PLUGIN_NAME = 'seo';
  /**
   * Extract SEO metadata from the page of the shared Chrome session and save it as JSON.
   *
   * Connects through connectToPage() using CHROME_SESSION_DIR, then collects inside
   * the page: the current URL, the document title, every <meta> tag that has both
   * a key (its `name` or, failing that, `property` attribute) and a non-empty
   * `content`, the href of <link rel="canonical">, and the <html> lang attribute.
   * The result is written to OUTPUT_DIR/OUTPUT_FILE as pretty-printed JSON.
   *
   * @param {string} url - Snapshot URL. Currently unused: the metadata is read from
   *   whichever page connectToPage() returns.
   * @returns {Promise<{success: boolean, output?: string, seoData?: Object, error?: string}>}
   *   `{ success: true, output, seoData }` on success; `{ success: false, error }`
   *   when connecting, extracting or writing throws (the error is returned, not rethrown).
   */
  async function extractSeo(url) {
    // Output directory is the current directory (the hook already runs in its output dir)
    const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
    // SEO_TIMEOUT falls back to TIMEOUT, then to 30; the value is multiplied by
    // 1000 before being handed over as `timeoutMs`.
    const timeout = getEnvInt('SEO_TIMEOUT', getEnvInt('TIMEOUT', 30)) * 1000;
    let browser = null;

    try {
      // Connect to the existing Chrome session and get the target page
      const connection = await connectToPage({
        chromeSessionDir: CHROME_SESSION_DIR,
        timeoutMs: timeout,
        puppeteer,
      });
      browser = connection.browser;
      const page = connection.page;

      // This callback runs inside the page, so it must be self-contained and may
      // only touch browser globals (document, window).
      const seoData = await page.evaluate(() => {
        const seo = {
          url: window.location.href,
          title: document.title || '',
        };

        for (const tag of Array.from(document.querySelectorAll('meta'))) {
          // The key is the `name` attribute, or `property` (Open Graph style) if absent
          const key = tag.getAttribute('name') || tag.getAttribute('property') || '';
          const content = tag.getAttribute('content') || '';
          if (key && content) {
            // A later tag with the same key replaces an earlier one
            seo[key] = content;
          }
        }

        // Also record the canonical URL if the page declares one
        const canonical = document.querySelector('link[rel="canonical"]');
        if (canonical) {
          seo.canonical = canonical.getAttribute('href');
        }

        // Document language taken from <html lang="...">
        const htmlLang = document.documentElement.lang;
        if (htmlLang) {
          seo.language = htmlLang;
        }

        return seo;
      });

      fs.writeFileSync(outputPath, JSON.stringify(seoData, null, 2));
      return { success: true, output: outputPath, seoData };
    } catch (e) {
      return { success: false, error: `${e.name}: ${e.message}` };
    } finally {
      if (browser) {
        try {
          // disconnect() may return a Promise depending on the Puppeteer version;
          // awaiting it avoids a floating promise and is harmless when it is
          // synchronous. Only the connection is dropped, the browser keeps running.
          await browser.disconnect();
        } catch (e) {
          // Best-effort cleanup: a failed disconnect must not mask the result above.
          console.error(`Warning: failed to disconnect from browser: ${e.message}`);
        }
      }
    }
  }
  /**
   * CLI entry point: validate arguments, run the SEO extraction and report the outcome.
   *
   * Reads `url` and `snapshot_id` from parseArgs(); both are required (snapshot_id is
   * only validated, not otherwise used here). When SEO_ENABLED is false a "skipped"
   * ArchiveResult line is printed and the process exits with code 0. Otherwise it
   * waits for the page to finish loading, calls extractSeo() and prints exactly one
   * ArchiveResult JSON line on stdout (errors go to stderr).
   *
   * Always terminates the process: exit code 0 on success or skip, 1 on failure or
   * missing arguments.
   *
   * @returns {Promise<void>}
   */
  async function main() {
    const args = parseArgs();
    const url = args.url;
    const snapshotId = args.snapshot_id;

    if (!url || !snapshotId) {
      console.error('Usage: on_Snapshot__38_seo.js --url=<url> --snapshot-id=<uuid>');
      process.exit(1);
    }

    let status = 'failed';
    let output = null;
    let error = '';

    try {
      // Check if enabled
      if (!getEnvBool('SEO_ENABLED', true)) {
        console.log('Skipping SEO (SEO_ENABLED=False)');
        // Output clean JSONL (no RESULT_JSON= prefix)
        console.log(JSON.stringify({
          type: 'ArchiveResult',
          status: 'skipped',
          output_str: 'SEO_ENABLED=False',
        }));
        process.exit(0);
      }

      const timeout = getEnvInt('SEO_TIMEOUT', getEnvInt('TIMEOUT', 30)) * 1000;
      // Wait up to four times the extraction timeout for the page load to finish.
      // NOTE(review): the third argument (200) is presumably a polling interval in
      // milliseconds; confirm against chrome_utils.js.
      await waitForPageLoaded(CHROME_SESSION_DIR, timeout * 4, 200);

      const result = await extractSeo(url);
      if (result.success) {
        status = 'succeeded';
        output = result.output;
        // Keys that extractSeo() adds itself rather than copying from a <meta> tag;
        // they must not be counted as meta tags.
        const nonMetaKeys = new Set(['url', 'title', 'canonical', 'language']);
        const metaCount = Object.keys(result.seoData)
          .filter((key) => !nonMetaKeys.has(key))
          .length;
        console.log(`SEO metadata extracted: ${metaCount} meta tags`);
      } else {
        status = 'failed';
        error = result.error;
      }
    } catch (e) {
      error = `${e.name}: ${e.message}`;
      // Reset explicitly: the exception may have been raised after status was
      // already set to 'succeeded'.
      status = 'failed';
    }

    if (error) console.error(`ERROR: ${error}`);

    // Output clean JSONL (no RESULT_JSON= prefix)
    console.log(JSON.stringify({
      type: 'ArchiveResult',
      status,
      output_str: output || error || '',
    }));
    process.exit(status === 'succeeded' ? 0 : 1);
  }
  // Run the hook. main() normally exits the process itself, so this handler only
  // fires for a rejection that escapes it; report it and exit with a failure code.
  main().catch(({ message }) => {
    console.error(`Fatal error: ${message}`);
    process.exit(1);
  });