on_Crawl__82_singlefile_install.js 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341
#!/usr/bin/env node
/**
 * SingleFile Extension Plugin
 *
 * Installs and uses the SingleFile Chrome extension for archiving complete web pages.
 * Falls back to single-file-cli if the extension is not available.
 *
 * Extension: https://chromewebstore.google.com/detail/mpiodijhokgodhhofbcjdecpffjipkle
 *
 * Priority: 82 - Must install before Chrome session starts at Crawl level
 * Hook: on_Crawl (runs once per crawl, not per snapshot)
 *
 * This extension automatically:
 * - Saves complete web pages as single HTML files
 * - Inlines all resources (CSS, JS, images, fonts)
 * - Preserves page fidelity better than wget/curl
 * - Works with SPAs and dynamically loaded content
 */
const path = require('path');
const fs = require('fs');
const { promisify } = require('util');
const { exec, execFile } = require('child_process');
const execAsync = promisify(exec);
  24. // Import extension utilities
  25. const extensionUtils = require('../chrome/chrome_utils.js');
  26. // Extension metadata
  27. const EXTENSION = {
  28. webstore_id: 'mpiodijhokgodhhofbcjdecpffjipkle',
  29. name: 'singlefile',
  30. };
  31. // Get extensions directory from environment or use default
  32. const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR ||
  33. path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_extensions');
  34. const CHROME_DOWNLOADS_DIR = process.env.CHROME_DOWNLOADS_DIR ||
  35. path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_downloads');
  36. const OUTPUT_DIR = '.';
  37. const OUTPUT_FILE = 'singlefile.html';
  38. /**
  39. * Install the SingleFile extension
  40. */
  41. async function installSinglefileExtension() {
  42. console.log('[*] Installing SingleFile extension...');
  43. // Install the extension
  44. const extension = await extensionUtils.loadOrInstallExtension(EXTENSION, EXTENSIONS_DIR);
  45. if (!extension) {
  46. console.error('[❌] Failed to install SingleFile extension');
  47. return null;
  48. }
  49. console.log('[+] SingleFile extension installed');
  50. console.log('[+] Web pages will be saved as single HTML files');
  51. return extension;
  52. }
  53. /**
  54. * Wait for a specified amount of time
  55. */
  56. function wait(ms) {
  57. return new Promise(resolve => setTimeout(resolve, ms));
  58. }
  59. /**
  60. * Save a page using the SingleFile extension
  61. *
  62. * @param {Object} page - Puppeteer page object
  63. * @param {Object} extension - Extension metadata with dispatchAction method
  64. * @param {Object} options - Additional options
  65. * @returns {Promise<string|null>} - Path to saved file or null on failure
  66. */
  67. async function saveSinglefileWithExtension(page, extension, options = {}) {
  68. if (!extension || !extension.version) {
  69. throw new Error('SingleFile extension not found or not loaded');
  70. }
  71. const url = await page.url();
  72. console.error(`[singlefile] Triggering extension for: ${url}`);
  73. // Check for unsupported URL schemes
  74. const URL_SCHEMES_IGNORED = ['about', 'chrome', 'chrome-extension', 'data', 'javascript', 'blob'];
  75. const scheme = url.split(':')[0];
  76. if (URL_SCHEMES_IGNORED.includes(scheme)) {
  77. console.log(`[⚠️] Skipping SingleFile for URL scheme: ${scheme}`);
  78. return null;
  79. }
  80. const downloadsDir = options.downloadsDir || CHROME_DOWNLOADS_DIR;
  81. console.error(`[singlefile] Watching downloads dir: ${downloadsDir}`);
  82. // Ensure downloads directory exists
  83. await fs.promises.mkdir(downloadsDir, { recursive: true });
  84. // Get list of existing files to ignore
  85. const files_before = new Set(
  86. (await fs.promises.readdir(downloadsDir))
  87. .filter(fn => fn.toLowerCase().endsWith('.html') || fn.toLowerCase().endsWith('.htm'))
  88. );
  89. // Output directory is current directory (hook already runs in output dir)
  90. const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE);
  91. console.error(`[singlefile] Saving via extension (${extension.id})...`);
  92. // Bring page to front (extension action button acts on foreground tab)
  93. await page.bringToFront();
  94. // Trigger the extension's action (toolbar button click)
  95. console.error('[singlefile] Dispatching extension action...');
  96. try {
  97. const actionTimeoutMs = options.actionTimeoutMs || 5000;
  98. const actionPromise = extension.dispatchAction();
  99. const actionResult = await Promise.race([
  100. actionPromise,
  101. wait(actionTimeoutMs).then(() => 'timeout'),
  102. ]);
  103. if (actionResult === 'timeout') {
  104. console.error(`[singlefile] Extension action did not resolve within ${actionTimeoutMs}ms, continuing...`);
  105. }
  106. } catch (err) {
  107. console.error(`[singlefile] Extension action error: ${err.message || err}`);
  108. }
  109. // Wait for file to appear in downloads directory
  110. const check_delay = 3000; // 3 seconds
  111. const max_tries = 10;
  112. let files_new = [];
  113. console.error(`[singlefile] Waiting up to ${(check_delay * max_tries) / 1000}s for download...`);
  114. for (let attempt = 0; attempt < max_tries; attempt++) {
  115. await wait(check_delay);
  116. const files_after = (await fs.promises.readdir(downloadsDir))
  117. .filter(fn => fn.toLowerCase().endsWith('.html') || fn.toLowerCase().endsWith('.htm'));
  118. files_new = files_after.filter(file => !files_before.has(file));
  119. if (files_new.length === 0) {
  120. console.error(`[singlefile] No new downloads yet (${attempt + 1}/${max_tries})`);
  121. continue;
  122. }
  123. console.error(`[singlefile] New download(s) detected: ${files_new.join(', ')}`);
  124. // Prefer files that match the URL or have SingleFile markers
  125. const url_variants = new Set([url]);
  126. if (url.endsWith('/')) {
  127. url_variants.add(url.slice(0, -1));
  128. } else {
  129. url_variants.add(`${url}/`);
  130. }
  131. const scored = [];
  132. for (const file of files_new) {
  133. const dl_path = path.join(downloadsDir, file);
  134. let header = '';
  135. try {
  136. const dl_text = await fs.promises.readFile(dl_path, 'utf-8');
  137. header = dl_text.slice(0, 200000);
  138. const stat = await fs.promises.stat(dl_path);
  139. console.error(`[singlefile] Download ${file} size=${stat.size} bytes`);
  140. } catch (err) {
  141. // Skip unreadable files
  142. continue;
  143. }
  144. const header_lower = header.toLowerCase();
  145. const has_url = Array.from(url_variants).some(v => header.includes(v));
  146. const has_singlefile_marker = header_lower.includes('singlefile') || header_lower.includes('single-file');
  147. const score = (has_url ? 2 : 0) + (has_singlefile_marker ? 1 : 0);
  148. scored.push({ file, dl_path, score });
  149. }
  150. scored.sort((a, b) => b.score - a.score);
  151. if (scored.length > 0) {
  152. const best = scored[0];
  153. if (best.score > 0 || files_new.length === 1) {
  154. console.error(`[singlefile] Moving download from ${best.file} -> ${out_path}`);
  155. await fs.promises.rename(best.dl_path, out_path);
  156. const out_stat = await fs.promises.stat(out_path);
  157. console.error(`[singlefile] Moved file size=${out_stat.size} bytes`);
  158. return out_path;
  159. }
  160. }
  161. if (files_new.length > 0) {
  162. // Fallback: move the newest file if no clear match found
  163. let newest = null;
  164. let newest_mtime = -1;
  165. for (const file of files_new) {
  166. const dl_path = path.join(downloadsDir, file);
  167. try {
  168. const stat = await fs.promises.stat(dl_path);
  169. if (stat.mtimeMs > newest_mtime) {
  170. newest_mtime = stat.mtimeMs;
  171. newest = { file, dl_path };
  172. }
  173. } catch (err) {}
  174. }
  175. if (newest) {
  176. console.error(`[singlefile] Moving newest download from ${newest.file} -> ${out_path}`);
  177. await fs.promises.rename(newest.dl_path, out_path);
  178. const out_stat = await fs.promises.stat(out_path);
  179. console.error(`[singlefile] Moved file size=${out_stat.size} bytes`);
  180. return out_path;
  181. }
  182. }
  183. }
  184. console.error(`[singlefile] Failed to find SingleFile HTML in ${downloadsDir} after ${(check_delay * max_tries) / 1000}s`);
  185. console.error(`[singlefile] New files seen: ${files_new.join(', ')}`);
  186. return null;
  187. }
  188. /**
  189. * Save a page using single-file-cli (fallback method)
  190. *
  191. * @param {string} url - URL to archive
  192. * @param {Object} options - Additional options
  193. * @returns {Promise<string|null>} - Path to saved file or null on failure
  194. */
  195. async function saveSinglefileWithCLI(url, options = {}) {
  196. console.log('[*] Falling back to single-file-cli...');
  197. // Find single-file binary
  198. let binary = null;
  199. try {
  200. const { stdout } = await execAsync('which single-file');
  201. binary = stdout.trim();
  202. } catch (err) {
  203. console.error('[❌] single-file-cli not found. Install with: npm install -g single-file-cli');
  204. return null;
  205. }
  206. // Output directory is current directory (hook already runs in output dir)
  207. const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE);
  208. // Build command
  209. const cmd = [
  210. binary,
  211. '--browser-headless',
  212. url,
  213. out_path,
  214. ];
  215. // Add optional args
  216. if (options.userAgent) {
  217. cmd.splice(2, 0, '--browser-user-agent', options.userAgent);
  218. }
  219. if (options.cookiesFile && fs.existsSync(options.cookiesFile)) {
  220. cmd.splice(2, 0, '--browser-cookies-file', options.cookiesFile);
  221. }
  222. if (options.ignoreSSL) {
  223. cmd.splice(2, 0, '--browser-ignore-insecure-certs');
  224. }
  225. // Execute
  226. try {
  227. const timeout = options.timeout || 120000;
  228. await execAsync(cmd.join(' '), { timeout });
  229. if (fs.existsSync(out_path) && fs.statSync(out_path).size > 0) {
  230. console.log(`[+] SingleFile saved via CLI: ${out_path}`);
  231. return out_path;
  232. }
  233. console.error('[❌] SingleFile CLI completed but no output file found');
  234. return null;
  235. } catch (err) {
  236. console.error(`[❌] SingleFile CLI error: ${err.message}`);
  237. return null;
  238. }
  239. }
  240. /**
  241. * Main entry point - install extension before archiving
  242. */
  243. async function main() {
  244. // Check if extension is already cached
  245. const cacheFile = path.join(EXTENSIONS_DIR, 'singlefile.extension.json');
  246. if (fs.existsSync(cacheFile)) {
  247. try {
  248. const cached = JSON.parse(fs.readFileSync(cacheFile, 'utf-8'));
  249. const manifestPath = path.join(cached.unpacked_path, 'manifest.json');
  250. if (fs.existsSync(manifestPath)) {
  251. console.log('[*] SingleFile extension already installed (using cache)');
  252. return cached;
  253. }
  254. } catch (e) {
  255. // Cache file corrupted, re-install
  256. console.warn('[⚠️] Extension cache corrupted, re-installing...');
  257. }
  258. }
  259. // Install extension
  260. const extension = await installSinglefileExtension();
  261. // Export extension metadata for chrome plugin to load
  262. if (extension) {
  263. // Write extension info to a cache file that chrome plugin can read
  264. await fs.promises.mkdir(EXTENSIONS_DIR, { recursive: true });
  265. await fs.promises.writeFile(
  266. cacheFile,
  267. JSON.stringify(extension, null, 2)
  268. );
  269. console.log(`[+] Extension metadata written to ${cacheFile}`);
  270. }
  271. return extension;
  272. }
  273. // Export functions for use by other plugins
  274. module.exports = {
  275. EXTENSION,
  276. installSinglefileExtension,
  277. saveSinglefileWithExtension,
  278. saveSinglefileWithCLI,
  279. };
  280. // Run if executed directly
  281. if (require.main === module) {
  282. main().then(() => {
  283. console.log('[✓] SingleFile extension setup complete');
  284. process.exit(0);
  285. }).catch(err => {
  286. console.error('[❌] SingleFile extension setup failed:', err);
  287. process.exit(1);
  288. });
  289. }