// example_js_extractor.js (~12 KB)
  1. import {getEnvironmentConfig} from 'archivebox/util/config.js'
  2. import {getScopeConfig} from 'archivebox/util/config.js'
  3. import {getPuppeteerPage} from 'archivebox/util/page.js'
  4. const env_config = await getEnvironmentConfig()
  5. const snapshot_page = await archivebox.getPuppeteerPage(url, config)
  6. async function extract(page) {
  7. const cwd = process.cwd()
  8. const config = await getScopeConfig(url=url)
  9. const page = await archivebox.getPuppeteerPage(url, config)
  10. const output_path = path.join(cwd, 'screenrecording.mp4')
  11. let recorder = null
  12. const {
  13. SCREENRECORDING_DURATION_LIMIT=60,
  14. SCREENRECORDING_CODEC='libx264',
  15. SCREENRECORDING_SAVE_GIF=true,
  16. } = config
  17. page.on('setup', async () => {
  18. recorder = new PuppeteerScreenRecorder(page, {
  19. followNewTab: false,
  20. recordDurationLimit: SCREENRECORDING_DURATION_LIMIT,
  21. // fps: 25,
  22. // ffmpeg_Path: '<path of ffmpeg_path>' || null,
  23. // videoFrame: {
  24. // width: 1024,
  25. // height: 768,
  26. // },
  27. // videoCrf: 18,
  28. videoCodec: SCREENRECORDING_CODEC,
  29. // videoPreset: 'ultrafast',
  30. // videoBitrate: 1000,
  31. // autopad: {
  32. // color: 'black' | '#35A5FF',
  33. // },
  34. // aspectRatio: '4:3',
  35. });
  36. await recorder.start(output_path)
  37. await archivebox.savePageState(page, {recorder})
  38. })
  39. await once(page, 'setup')
  40. await once(page, 'BEHAVIORS_STARTED')
  41. page.on('BEHAVIORS_FINISHED', async () => {
  42. if (!recorder) return
  43. await recorder.stop()
  44. // convert video to GIF
  45. if (SCREENRECORDING_SAVE_GIF) {
  46. try {
  47. const BIN_NAME = process.env.FFMPEG_BINARY || 'ffmpeg'
  48. const child = child_process.spawn(
  49. BIN_NAME,
  50. [
  51. '-hide_banner',
  52. '-loglevel', 'error',
  53. '-ss', '3',
  54. '-t', '10',
  55. '-y',
  56. '-i', output_path,
  57. '-vf', "fps=10,scale=1024:-1:flags=bicubic,split[s0][s1];[s0]palettegen[p];[s1][p]paletteuse",
  58. '-loop', '0',
  59. output_path.replace('.mp4', '.gif'),
  60. ],
  61. {
  62. cwd,
  63. timeout: 60_000,
  64. // stdio: [null, 'pipe', 'pipe'],
  65. stdio: 'ignore',
  66. detached: true, // run in background, don't block on response
  67. },
  68. )
  69. await blockUntilExists(output_path.replace('.mp4', '.gif'), {min_bytes: 100, timeout: 40_000})
  70. console.log(`[🎥] Saved screen-recording GIF with ffmpeg pid=${child.pid} (${duration/1000}s)...`.padEnd(82), prettyPath(output_path.replace('.mp4', '.gif')))
  71. } catch(err) {
  72. console.log('[❌] Failed to convert video to GIF:', err)
  73. }
  74. }
  75. })
  76. await once(page, 'BEHAVIORS_FINISHED')
  77. }
  78. async function botArchiveTask({page, data, url=''}) {
  79. url = url || data // puppeteer-cluster passes in the url value via the data: arg
  80. const is_unarchivable_url = URL_SCHEMES_IGNORED.includes(url.split(':')[0])
  81. const is_already_archived = ALREADY_ARCHIVED.has(url.slice(0, 4096))
  82. if (is_unarchivable_url || is_already_archived) return null
  83. ALREADY_ARCHIVED.add(url.slice(0, 4096))
  84. if (ALREADY_ARCHIVED.size > TASKS_PER_RUN_LIMIT) {
  85. console.warn('[❌] Hit maximum URLs archived per browser session, exiting to free memory.')
  86. console.warn(' Run this process again to continue with the next batch...')
  87. process.exit(21)
  88. }
  89. const browser = await page.browser()
  90. const client = await page.target().createCDPSession()
  91. const extensions = await getChromeExtensionsFromCache({browser})
  92. const browser_version = await browser.version()
  93. const original_url = url.toString()
  94. const start_time = (new Date())
  95. console.log('[0/4]-------------------------------------------------------------------------')
  96. const snapshot_dir = await setupSnapshotDir({original_url, start_time})
  97. const snapshot = await setupSnapshotDB({original_url, start_time, snapshot_dir})
  98. console.log('[1/4]-------------------------------------------------------------------------')
  99. console.log(`[🪟] Starting page & viewport setup (${browser_version} ${DEFAULT_VIEWPORT.isMobile ? 'mobile' : 'desktop'} ${DEFAULT_VIEWPORT.width}x${DEFAULT_VIEWPORT.height}px)...`)
  100. const page_state = {
  101. // global static state
  102. browser,
  103. client,
  104. browser_version,
  105. extensions,
  106. // per-page static metadata
  107. original_url,
  108. snapshot,
  109. snapshot_dir,
  110. start_time: start_time.toISOString(),
  111. start_ts: Number(start_time),
  112. version: versionStrFromDate(start_time),
  113. // per-page mutable archiving state
  114. main_response: null,
  115. recorder: null,
  116. console_log: [],
  117. traffic_log: {},
  118. redirects: {},
  119. }
  120. page._original_url = original_url
  121. try {
  122. // run all page setup functions in parallel
  123. const results = await Promise.allSettled([
  124. // loadAuthStorage(page, page_state, { apply: true }),
  125. startMetadataRecording(page, page_state),
  126. setupURLRewriting(page, page_state),
  127. // setupViewport(page, page_state),
  128. setupModalAutoClosing(page, page_state),
  129. loadCloudflareCookie(page, page_state),
  130. startResponseSaving(page, page_state),
  131. saveYTDLP(page, page_state),
  132. saveGALLERYDL(page, page_state),
  133. // saveSourceMaps(page, page_state),
  134. // TODO: someday setup https://github.com/osnr/TabFS ?
  135. ]);
  136. // run all page setup functions in parallel
  137. const rejected = results
  138. .filter(result => result.status === 'rejected')
  139. .map(result => (result as PromiseRejectedResult).reason);
  140. if (rejected.length) console.warn('[⚠️] Partial failures during page setup:', rejected);
  141. } catch(err) {
  142. console.error('[❌] PAGE SETUP ERROR', JSON.stringify(err, null, 4))
  143. return
  144. }
  145. console.log('[2/4]-------------------------------------------------------------------------')
  146. console.log('[➡️] NAVIGATION[INI]', ANSI.blue + url + ANSI.reset)
  147. const startrecording_promise = startScreenrecording(page, page_state)
  148. page_state.main_response = await page.goto(url, {waitUntil: 'load', timeout: 40_000})
  149. try {
  150. const results = await Promise.allSettled([
  151. startrecording_promise,
  152. page.bringToFront(),
  153. page.waitForNetworkIdle({concurrency: 0, idleTime: 900, timeout: 20_000}),
  154. ])
  155. const rejected = results
  156. .filter(result => result.status === 'rejected')
  157. .map(result => (result as PromiseRejectedResult).reason)
  158. if (rejected.length) console.warn('[⚠️] Parial failures during page load:', rejected)
  159. } catch(err) {
  160. console.error('[❌] ERROR DURING PAGE LOAD', JSON.stringify(err, null, 4))
  161. return
  162. }
  163. if (page_state.main_response === null) {
  164. page_state.main_response = await page.waitForResponse(() => true)
  165. }
  166. assert(page_state.main_response)
  167. if (page_state.main_response.status() == 429) {
  168. throw `[⚠️] Got 429 rate-limit response, skipping this URL for now...`
  169. }
  170. // emulate human browsing behavior
  171. // await disableAnimations(page, page_state);
  172. await jiggleMouse(page, page_state);
  173. await solveCaptchas(page, page_state);
  174. await blockRedirects(page, page_state);
  175. await scrollDown(page, page_state);
  176. // await expandComments(page, page_state);
  177. await submitForm(page, page_state);
  178. // await blockJSExecution(page, page_state);
  179. console.log('[3/4]-------------------------------------------------------------------------')
  180. // stop tampering with page requests & JS / recording metadata / traffic log
  181. await stopMetadataRecording(page, page_state)
  182. // do all synchonous archiving steps that need exclusive use of the whole page while doing stuff
  183. const saveScreenrecording_promise = saveScreenrecording(page, page_state);
  184. await saveScreenshot(page, page_state);
  185. await savePDF(page, page_state);
  186. console.log('[4/4]-------------------------------------------------------------------------')
  187. // do all async archiving steps that can be run at the same time
  188. await inlineShadowDOM(page, page_state);
  189. const results = await Promise.allSettled([
  190. saveTitle(page, page_state),
  191. saveSEO(page, page_state),
  192. saveFavicon(page, page_state),
  193. saveSSL(page, page_state),
  194. saveRequests(page, page_state),
  195. saveRedirects(page, page_state),
  196. saveHeaders(page, page_state),
  197. saveRaw(page, page_state),
  198. saveDOM(page, page_state),
  199. saveBodyText(page, page_state),
  200. // savePandoc(page, page_state),
  201. saveReadability(page, page_state),
  202. saveAccessibility(page, page_state),
  203. saveOutlinks(page, page_state),
  204. // saveAuthStorage(page, page_state),
  205. saveAIQualityAssuranceResult(page, page_state),
  206. ]);
  207. // do all sync archiving steps that require browser extensions at the very end (they are the buggiest)
  208. const bg_results = Promise.allSettled([
  209. saveScreenrecording_promise,
  210. saveSinglefile(page, page_state),
  211. // saveArchiveWebPage(page, page_state),
  212. // savePocket(page, page_state),
  213. ])
  214. const {duration} = await saveMetrics(page, page_state);
  215. const rejected = results
  216. .filter(result => result.status === 'rejected')
  217. .map(result => (result as PromiseRejectedResult).reason) // not sure why this has a ts-error, .reason does exist on rejected promises
  218. if (rejected.length)
  219. console.warn('[⚠️] Parial failures during archiving:', rejected)
  220. // Start an interactive REPL here with the `page` instance.
  221. // https://github.com/berstend/puppeteer-extra/tree/master/packages/puppeteer-extra-plugin-repl
  222. // await page.repl()
  223. // await page.browser().repl()
  224. console.log(`[✅] ${ANSI.blue}Finished archiving in ${duration/1000}s.${ANSI.reset}`)
  225. try {
  226. const rejected = (await bg_results)
  227. .filter(result => result.status === 'rejected')
  228. .map(result => (result as PromiseRejectedResult).reason) // not sure why this has a ts-error, .reason does exist on rejected promises
  229. if (rejected.length)
  230. console.warn('[⚠️] Parial failures during wrap-up tasks:', rejected)
  231. console.log('[🗑️] Resetting to about:blank to ensure memory is freed...')
  232. await page.goto('about:blank')
  233. await page.close()
  234. } catch(err) {
  235. console.log(err)
  236. }
  237. // symlink the best results from across all the versions/ into the snapshot dir root
  238. await symlinkBestSnapshotResults(snapshot_dir)
  239. // display latest version screenshot GIF
  240. console.log()
  241. try {
  242. const latest_version_gif = path.join(snapshot_dir, 'versions', page_state.version, path.basename(SCREENRECORDGIF_PATH(page)))
  243. const dirent = await blockUntilExists(latest_version_gif, {min_bytes: 100, timeout: 15_000})
  244. child_process.spawn('/Users/squash/.iterm2/imgcat', [dirent.abspath], {stdio: [null, 'inherit', 'inherit']})
  245. } catch(err) {
  246. console.warn('[⚠️] Failed to display screenrecording.gif...', err)
  247. console.log()
  248. }
  249. // determine whether task succeeded or failed based on AI QA score
  250. const latest_version_aiqa = path.join(snapshot_dir, 'versions', page_state.version, path.basename(AIQA_PATH(page)))
  251. const qa_results = JSON.parse((await fs.promises.readFile(latest_version_aiqa)).toString())
  252. if (qa_results.pct_visible < 50) {
  253. throw `[❌] Task completed with problems, got AI QA score of ${qa_results.pct_visible}%! ${qa_results.warnings.join(', ')} ${qa_results.error_text || ''}`
  254. } else {
  255. console.log(`[💫] Task completed succesfully: ${qa_results.pct_visible}% ${qa_results.warnings.join(', ') || ''}`)
  256. console.log(` Summary: ${(qa_results.main_content_title || qa_results.description || 'No title/description detected').substring(0, 80)}... ${qa_results.main_content_author || ''} ${qa_results.main_content_date || ''}`)
  257. return true
  258. }
  259. }