on_Snapshot__54_title.js 3.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139
  1. #!/usr/bin/env node
  2. /**
  3. * Extract the title of a URL.
  4. *
  5. * Requires a Chrome session (from chrome plugin) and connects to it via CDP
  6. * to get the page title (which includes JS-rendered content).
  7. *
  8. * Usage: on_Snapshot__54_title.js --url=<url> --snapshot-id=<uuid>
  9. * Output: Writes title/title.txt
  10. *
  11. * Environment variables:
  12. * TITLE_TIMEOUT: Timeout in seconds (falls back to TIMEOUT, then to 30)
  13. */
  14. const fs = require('fs');
  15. const path = require('path');
  16. const puppeteer = require('puppeteer-core');
  17. // Import shared utilities from chrome_utils.js
  18. const {
  19. getEnvInt,
  20. parseArgs,
  21. connectToPage,
  22. waitForPageLoaded,
  23. } = require('../chrome/chrome_utils.js');
  24. // Extractor metadata and the relative locations this hook reads from / writes to
  25. const PLUGIN_NAME = 'title'; // plugin identifier; not referenced in the visible part of this file
  26. const OUTPUT_DIR = '.'; // output goes to the current working directory
  27. const OUTPUT_FILE = 'title.txt'; // file name the extracted title is written to
  28. const CHROME_SESSION_DIR = '../chrome'; // passed to connectToPage/waitForPageLoaded; presumably the chrome plugin's session dir - confirm in chrome_utils.js
  /**
   * Collapse every run of whitespace to a single space and trim the ends.
   *
   * @param {*} raw - Candidate title; anything that is not a string counts as empty.
   * @returns {string} The cleaned title, or '' when there is nothing usable.
   */
  function normalizeTitle(raw) {
    if (typeof raw !== 'string') {
      return '';
    }
    return raw.replace(/\s+/g, ' ').trim();
  }

  /**
   * Read the title of the page in the existing Chrome session and write it to
   * OUTPUT_FILE inside OUTPUT_DIR.
   *
   * The title is taken from `page.title()`; when that is empty the DOM is
   * queried for, in order: `document.title`, the `og:title` meta tag, the
   * `twitter:title` meta tag and the first `<h1>`. Every candidate is
   * whitespace-normalized before it is considered, so a blank meta tag cannot
   * shadow a later, usable candidate.
   *
   * @param {string} url - URL being snapshotted. Not used by this function; the
   *   title is read from whatever page `connectToPage` returns.
   * @returns {Promise<{success: true, output: string, title: string, method: 'cdp'} |
   *   {success: false, error: string}>} Never rejects: any thrown error is
   *   reported through the `error` field.
   */
  async function extractTitle(url) {
    // Output directory is the current directory (the hook already runs in its output dir)
    const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
    const timeoutMs = getEnvInt('TITLE_TIMEOUT', getEnvInt('TIMEOUT', 30)) * 1000;
    let browser = null;

    try {
      const connection = await connectToPage({
        chromeSessionDir: CHROME_SESSION_DIR,
        timeoutMs,
        puppeteer,
      });
      browser = connection.browser;
      const page = connection.page;

      // NOTE(review): waits with 4x the configured timeout; the trailing 200 is
      // presumably a poll interval in ms - confirm against chrome_utils.js.
      await waitForPageLoaded(CHROME_SESSION_DIR, timeoutMs * 4, 200);

      let title = normalizeTitle(await page.title());
      if (!title) {
        // Collect the raw candidates in the page and pick the first usable one
        // here, because the callback runs in the browser and cannot call
        // normalizeTitle itself.
        const candidates = await page.evaluate(() => [
          document.title,
          document.querySelector('meta[property="og:title"]')?.content,
          document.querySelector('meta[name="twitter:title"]')?.content,
          document.querySelector('h1')?.textContent,
        ]);
        title = candidates.map(normalizeTitle).find(Boolean) ?? '';
      }

      if (!title) {
        return { success: false, error: 'No title found in Chrome session' };
      }

      fs.writeFileSync(outputPath, title, 'utf8');
      return { success: true, output: outputPath, title, method: 'cdp' };
    } catch (e) {
      return { success: false, error: e.message };
    } finally {
      if (browser) {
        // disconnect() may return a Promise; await it so a rejection is handled
        // here instead of surfacing as an unhandled rejection. A failed
        // disconnect is logged but must not change the extraction result.
        try {
          await browser.disconnect();
        } catch (disconnectError) {
          console.error(`Warning: failed to disconnect from Chrome: ${disconnectError.message}`);
        }
      }
    }
  }
  /**
   * CLI entry point for the title hook.
   *
   * Reads `url` and `snapshot_id` from the parsed arguments, runs extractTitle,
   * and reports the outcome:
   *   - human-readable progress and errors go to stderr;
   *   - on success a `Snapshot` JSON line carrying the title goes to stdout;
   *   - an `ArchiveResult` JSON line with the final status always goes to stdout.
   *
   * The process exits with code 0 when the title was extracted and 1 otherwise
   * (including when a required argument is missing).
   *
   * @returns {Promise<void>} In practice never settles: the function ends in process.exit().
   */
  async function main() {
    const args = parseArgs();
    const url = args.url;
    // presumably parseArgs maps `--snapshot-id` to the `snapshot_id` key - confirm in chrome_utils.js
    const snapshotId = args.snapshot_id;

    if (!url || !snapshotId) {
      // Derive the script name from the file itself so the usage text cannot go stale on rename.
      console.error(`Usage: ${path.basename(__filename)} --url=<url> --snapshot-id=<uuid>`);
      process.exit(1);
    }

    let status = 'failed';
    let output = null;
    let error = '';
    let extractedTitle = null;

    try {
      const result = await extractTitle(url);
      if (result.success) {
        status = 'succeeded';
        output = result.output;
        extractedTitle = result.title;
        console.error(`Title extracted (${result.method}): ${result.title}`);
      } else {
        // Never leave a failure without an explanation, even if the extractor gave none.
        error = result.error || 'Unknown error';
      }
    } catch (e) {
      error = `${e.name}: ${e.message}`;
    }

    if (error) {
      console.error(`ERROR: ${error}`);
    }

    // Update snapshot title via JSONL
    if (status === 'succeeded' && extractedTitle) {
      console.log(JSON.stringify({
        type: 'Snapshot',
        id: snapshotId,
        title: extractedTitle,
      }));
    }

    // Output ArchiveResult JSONL
    const archiveResult = {
      type: 'ArchiveResult',
      status,
      output_str: output || error || '',
    };
    console.log(JSON.stringify(archiveResult));

    process.exit(status === 'succeeded' ? 0 : 1);
  }
  /**
   * Last-resort handler: report a rejection that escaped main() on stderr and
   * terminate with a failing exit code.
   */
  function exitOnFatal(fatal) {
    console.error(`Fatal error: ${fatal.message}`);
    process.exit(1);
  }

  // Run the hook.
  main().catch(exitOnFatal);