#!/usr/bin/env node --env-file .env
// https://gist.github.com/pirate/d9a350e83025a1e6cf452cddd815d0d4
// npm install request node-request minimist deepmerge mime-types decompress puppeteer-extra puppeteer-extra-plugin-repl puppeteer-extra-plugin-user-preferences puppeteer-extra-plugin-recaptcha puppeteer-extra-plugin-stealth puppeteer-screen-recorder puppeteer-cluster ghost-cursor @mozilla/readability jsdom unzip-crx-3 node-fetch@2

// ---- Node.js built-in modules ----
import assert from 'node:assert/strict';
import { Buffer } from 'node:buffer';
import child_process from 'node:child_process';
import crypto from 'node:crypto';
import fs from 'node:fs';
import { createServer } from 'node:http';
import os from 'node:os';
import path from 'node:path';
import querystring from 'node:querystring';
import { Readable } from 'node:stream';
import { finished } from 'node:stream/promises';
import { URL } from 'node:url';
import util from 'node:util';

// Promise-returning wrapper around child_process.exec, so shell commands can be awaited.
const exec = util.promisify(child_process.exec);

// ---- Third-party packages ----
import { Readability } from '@mozilla/readability';
import FileCookieStore from '@root/file-cookie-store';
import merge from 'deepmerge';
import { createCursor, getRandomPagePoint } from 'ghost-cursor';
import { JSDOM, VirtualConsole } from 'jsdom';
import mime from 'mime-types';
import ToughCookie from 'tough-cookie';
import unzip from 'unzip-crx-3';
import puppeteer from 'puppeteer';
import { Browser, Page, Cookie, HTTPResponse } from 'puppeteer';
import { Cluster } from 'puppeteer-cluster';
import PupeteerExtra from "puppeteer-extra";
import StealthPlugin from "puppeteer-extra-plugin-stealth";
import PrefsPlugin from 'puppeteer-extra-plugin-user-preferences';
import { PuppeteerScreenRecorder } from 'puppeteer-screen-recorder';
// import RecaptchaPlugin from 'puppeteer-extra-plugin-recaptcha';
// import ReplPlugin from 'puppeteer-extra-plugin-repl';

// Directory containing this module (ES modules have no implicit __dirname).
const __dirname = import.meta.dirname

// ---- Project-local modules ----
import { getDatabase } from './models/init-models.js';

// Open the index database at module load time (top-level await) and pull out the models used below.
const { Tag, Snapshot, ArchiveResult } = await getDatabase({ dbpath: './index.sqlite3' })

// move mitm CA cert into /usr/local/share/ca-certificates/mitmproxy-ca-cert.crt
// update-ca-certificates
  /**
   * ANSI terminal escape sequences, keyed by name.
   * `reset` is the SGR reset sequence; `blue` and `black` are the SGR 34 / SGR 30 color sequences.
   */
  const ANSI = {
      reset: "\x1b[0m",
      blue: "\x1b[34m",
      black: "\x1b[30m",
  }
  /************************* Main Input Arguments *******************************/
  // List of input URLs for this script. Entries that are commented out are kept as a
  // ready-made pool of test pages (bot-detection demos, social media posts, archive
  // mirrors, ...) that can be re-enabled by uncommenting them.
  // Declared with `let` (not `const`) as in the original, so later code may reassign it.
  let URLS = [
      // 'chrome://about',
      // 'chrome://system/#chrome_root_store',
      'https://facebook.com/815781663692514/?comment_id=1508571679703640',
      'https://www.instagram.com/p/CrTY1fENHr5/',
      'https://www.tiktok.com/@zemmour_eric/video/7342474065598319904?cid=7343316616878490400',
      'https://twitter.com/DZasken68678/status/1799833933271687304',
      'https://t.me/IONONMIARRENDOGROUP/13598',
      'https://www.youtube.com/watch?v=rpD0qgzlCms',
      'https://www.aap.com.au/factcheck/aboriginal-lands-claim-a-total-abdication-of-facts/',
      'https://gologin.com/check-browser',
      'https://arh.antoinevastel.com/bots/areyouheadless',
      'https://2captcha.com/demo/hcaptcha',
      'https://2captcha.com/demo/cloudflare-turnstile',
      'https://2captcha.com/demo/recaptcha-v3',
      'https://ipinfo.io/',
      // 'https://2captcha.com/demo/recaptcha-v2',
      // 'https://2captcha.com/demo/keycaptcha',
      // 'https://browserleaks.com/canvas',
      // 'https://bot.incolumitas.com/#botChallenge',
      // 'https://infosimples.github.io/detect-headless/',
      // 'https://coveryourtracks.eff.org/',
      // 'https://fingerprint.com/demo/',
      // 'https://nowsecure.nl',
      // 'https://abrahamjuliot.github.io/creepjs/',
      // 'https://scrapfly.io/web-scraping-tools/http2-fingerprint',
      // 'https://scrapfly.io/web-scraping-tools/browser-fingerprint',
      // 'https://scrapfly.io/web-scraping-tools/ja3-fingerprint',
      // 'https://scrapfly.io/web-scraping-tools/canvas-fingerprint',
      // 'https://scrapfly.io/web-scraping-tools/webgl-fingerprint',
      // 'https://scrapfly.io/web-scraping-tools/audio-fingerprint',
      // 'https://scrapfly.io/web-scraping-tools/screen-fingerprint',
      // 'https://web-scraping.dev/',
      // 'https://example.com',
      // 'https://www.okta.com/',
      // 'https://www.webflow.com/',
      // 'https://docker-compose.archivebox.io',
      // 'https://www.reddit.com/r/AskReddit/comments/1br0q9b/what_was_ok_10_years_ago_but_isnt_today/',
      // 'https://www.quora.com/Is-the-website-2Captcha-true-or-fake-with-paying-money-for-working-on-it',
      // 'https://x.com/yawnzzcalo7/status/1747853178849435894',
      // 'https://twitter.com/yawnzzcalo7/status/1747853178849435894',
      // 'https://rachdele.substack.com/p/is-the-job-market-dying',
      // 'https://www.flowradar.com/cloneables/mouse-image-trail-effect',
      // 'https://wrong.host.badssl.com/',
      // 'http://docker-compose.archivebox.io',
      // 'https://pptr.dev/api/puppeteer.page.setrequestinterception',
      // 'https://blog.sweeting.me#Writing',
      // 'https://github.com/yarnpkg/yarn/issues/9005',
      // 'https://archive.md/739Oc',
      // 'https://archive.md/Oc72d',
      // 'https://archive.vn/fPUBe',
      // 'https://archive.vn/mRz4P',
      // 'https://archive.vn/Qct6Y',
      // 'https://archive.vn/sv50h',
      // 'https://facebook.com/815781663692514/?comment_id=1508571679703640',
      // 'https://facebook.com/815781663692514/?comment_id=924451748966499',
      // 'https://www.facebook.com/wayne.brennan.528/posts/pfbid02fvxFppng2WsHMavhBa62cXizCBGdmPQRH3CMhac79qzS5C1ADaSNC587d3u6qVbkl',
      // 'https://www.facebook.com/wildeprods/posts/pfbid02YEPfoB7pZqMNzE4y2MpYSQbRAzASquvHyEMzHqrNngJCSL7onEg2jnsqS6epcQHWl',
      // 'https://t.me/aubontouite_francais/9493',
      // 'https://t.me/BC_BLACKMIROR/5044',
      // 'https://t.me/IONONMIARRENDOGROUP/14004',
      // 'https://t.me/newsfactory_pl/51014',
      // 'https://t.me/oliverjanich/132574',
      // 'https://t.me/tomaszgryguc/10449',
      // 'https://t.me/amigosDisidentes/123177',
      // 'https://twitter.com/1nfiltr4do_NN/status/1767238399943991389',
      // 'https://twitter.com/4lmondcookie/status/1748519205438111914',
      // 'https://twitter.com/4olll1ke/status/1753796944827199766',
      // 'https://twitter.com/yeokiloss/status/1754908226179502345',
      // 'https://twitter.com/YoungWaifLover/status/1735667278090297561',
      // 'https://twitter.com/Z_Pour_Demain/status/1766133730278605182',
      // 'https://www.aap.com.au/factcheck/aboriginal-lands-claim-a-total-abdication-of-facts/',
      // 'https://www.aap.com.au/factcheck/absurd-albanese-clip-fools-voice-voters/',
      // 'https://www.instagram.com/_the.forgotten.ones/p/CQQDyoqhsF6/',
      // 'https://www.instagram.com/p/CqSM_f9MR4b/',
      // 'https://www.instagram.com/p/CqSQgf1sv8B/',
      // 'https://instagram.com/p/B-Q22Z_pxyC/',
      // 'https://www.tiktok.com/@zitatezurzeit/photo/7342474065598319904?cid=7343316616878490400',
      // 'https://tiktok.com/@zitatezurzeit/photo/7342474065598319904?cid=7343316616878490400',
      // 'https://www.youtube.com/watch?v=rpD0qgzlCms',
  ]
  /**
   * Interpret an environment-variable style value as a boolean flag.
   *
   * Returns true only for '1', 'yes' or 'true' (case-insensitive, surrounding
   * whitespace ignored). Everything else -- including undefined, null and the
   * empty string -- is false. Non-string values are coerced with String() first,
   * so a numeric 1 or boolean true is accepted instead of throwing a TypeError.
   *
   * @param {*} env_value - raw value, typically process.env.SOMETHING
   * @returns {boolean} whether the value spells out an enabled flag
   */
  const isTruthy = (env_value) => {
      const normalized = String(env_value ?? '').trim().toLowerCase()
      return ['1', 'yes', 'true'].includes(normalized)
  }
  /********************** Config: General High-Level Options ********************/
  // Boolean switches read from the environment via isTruthy(); unset variables evaluate to false.
  const PASSIVE_ARCHIVING = isTruthy(process.env.PASSIVE_ARCHIVING)
  const CHROME_CLUSTER = isTruthy(process.env.CHROME_CLUSTER)
  // Hard-coded (not environment-configurable) settings.
  const CHROME_CLUSTER_WORKERS = 4
  const API_SERVER_HOST = '0.0.0.0'
  const API_SERVER_PORT = 9595
  const CHROME_DEBUG_PORT = 9222 // 9222 is default, or use 0 for random port
  /********************** Config: Keys & Secrets ********************************/
  // Each value comes from the environment, falling back to the literal default when
  // the variable is unset or empty (note: `||` also replaces an empty string).
  const API_KEY_2CAPTCHA = process.env.API_KEY_2CAPTCHA || 'YOUR_API_KEY_HERE'
  const FLARESOLVERR_API_ENDPOINT = process.env.FLARESOLVERR_API_ENDPOINT || "http://localhost:8191/v1"
  // Name of the persona sub-directory under DATA_DIR/personas used for browser state.
  const ACTIVE_PERSONA = process.env.ACTIVE_PERSONA || 'Default'
  const CHROME_PROFILE_USER = process.env.CHROME_PROFILE_USER || 'Default'
  // Boolean switches parsed by isTruthy(); both are false when unset.
  const LOAD_AUTH_STORAGE = isTruthy(process.env.LOAD_AUTH_STORAGE)
  const SAVE_AUTH_STORAGE = isTruthy(process.env.SAVE_AUTH_STORAGE)
  /********************** Config: Data Dir Locations ****************************/
  // Directory containing this script; the default data dir is resolved relative to it.
  const SRC_DIR = path.resolve(__dirname)
  // DATA_DIR from the environment wins; otherwise use the symlink-resolved ./data next to this script.
  const DATA_DIR = process.env.DATA_DIR || await fs.promises.realpath(path.join(SRC_DIR, 'data'))
  const INDEXES_DIR = path.join(DATA_DIR, 'index')
  const ARCHIVE_DIR = path.join(DATA_DIR, 'archive')
  // Refuse to run against a directory that does not already contain an archive/ folder.
  if (!fs.existsSync(ARCHIVE_DIR)) {
      throw new Error('Could not find data/archive, are you running in the right pwd?')
  }

  // Per-persona locations: everything lives under DATA_DIR/personas/<ACTIVE_PERSONA>/
  const PERSONA_DIR = path.join(DATA_DIR, 'personas', ACTIVE_PERSONA)
  const CHROME_PROFILE_PATH = path.join(PERSONA_DIR, 'chrome_profile')
  const CHROME_DOWNLOADS_DIR = path.join(PERSONA_DIR, 'chrome_downloads')
  const CHROME_EXTENSIONS_DIR = path.join(PERSONA_DIR, 'chrome_extensions')
  const CHROME_EXTENSIONS_JSON_PATH = path.join(CHROME_EXTENSIONS_DIR, 'extensions.json')
  const AUTH_JSON_PATH = path.join(PERSONA_DIR, 'auth.json')
  const COOKIES_TXT_PATH = path.join(PERSONA_DIR, 'cookies.txt')
  const SPEEDTESTS_DIR = path.join(PERSONA_DIR, 'speedtests')
  // const CHROME_PROFILE_IMPORT_USER = 'Profile 1'
  // const CHROME_PROFILE_IMPORT_PATH = '/Volumes/NVME/Users/squash/Library/Application Support/Google/Chrome'

  // chrome profile / persona directories
  // (created eagerly at load time; `recursive: true` makes these no-ops when they already exist)
  fs.mkdirSync(PERSONA_DIR, {recursive: true})
  fs.mkdirSync(SPEEDTESTS_DIR, {recursive: true})
  fs.mkdirSync(CHROME_PROFILE_PATH, {recursive: true})
  fs.mkdirSync(CHROME_EXTENSIONS_DIR, {recursive: true})
  fs.mkdirSync(CHROME_DOWNLOADS_DIR, {recursive: true})

  // cruft directories
  const ORPHANS_DIR = path.join(DATA_DIR, 'orphans')
  const PARTIALS_DIR = path.join(DATA_DIR, 'partials')
  const DUPLICATES_DIR = path.join(DATA_DIR, 'duplicates')
  await fs.promises.mkdir(ORPHANS_DIR, {recursive: true})
  await fs.promises.mkdir(PARTIALS_DIR, {recursive: true})
  await fs.promises.mkdir(DUPLICATES_DIR, {recursive: true})
  172. /********************** Config: Viewport Setup Opts ***************************/
  173. // Config: Viewport
  174. const DEFAULT_TIMEOUT = 20_000
  175. const DEFAULT_GEOLOCATION = {latitude: 59.95, longitude: 30.31667}
  176. const DEFAULT_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'
  177. const DEFAULT_ASPECT_RAIO = 16/9 // recommended: 16:9 (most common desktop window aspect ratio)
  178. const SCREENSHOT_ASPECT_RATIO = 4/3 // recommended: 4:3 (easier to use as thumbnails when square-ish)
  179. const DEFAULT_WINDOW_WIDTH = 1920 // recommended: 1920x1080p (1080p screenshots)
  180. const DEFAULT_WINDOW_HEIGHT = Math.floor(DEFAULT_WINDOW_WIDTH/DEFAULT_ASPECT_RAIO)
  181. const DEFAULT_VIEWPORT = {
  182. width: DEFAULT_WINDOW_WIDTH,
  183. height: DEFAULT_WINDOW_HEIGHT,
  184. deviceScaleFactor: 2, // 2 gives much sharper text in screenshots/pdfs/etc but uses more CPU/GPU
  185. isMobile: false,
  186. hasTouch: false,
  187. isLandscape: false,
  188. }
  189. const DEFAULT_COLOR_SCHEME = 'light'
  190. const DEFAULT_HEADERS = {
  191. // requires frequent tweaking to remain undetected by cloudflare/recaptcha/etc.
  192. // 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
  193. // 'accept-encoding': 'gzip, deflate, br, zstd',
  194. // 'accept-language': accept_language,
  195. // 'cache-Control': no_cache ? 'no-cache' : '',
  196. // 'dnt': '1',
  197. 'sec-ch-ua': '"Google Chrome";v="122", "Not:A-Brand";v="8", "Chromium";v="122"',
  198. 'sec-ch-ua-mobile': '?0',
  199. 'sec-ch-ua-platform': '"macOS"',
  200. 'connection-rtt': '50',
  201. // 'pragma': no_cache ? 'no-cache' : '',
  202. // 'sec-fetch-dest': 'document',
  203. // 'sec-fetch-mode': 'navigate',
  204. // 'sec-fetch-site': 'none',
  205. // 'sec-fetch-user': '?1',
  206. // // 'upgrade-insecure-requests': '1', // breaks some sites, e.g. https://www.flowradar.com/cloneables/mouse-image-trail-effect
  207. // 'user-agent': user_agent,
  208. }
  209. const DEFAULT_REFERRERS = ["https://www.google.com", "https://www.facebook.com", "https://www.instagram.com"]
  210. /****************** Config: Human Behavior Emulation **************************/
  211. const SCROLL_LIMIT = 20; // e.g. 30 = 30 * (1000px/2s) => 30,000px scrolled in 60sec
  212. const SCROLL_DELAY = 1350; // interval per scroll, e.g. 2000 = 2sec to travel 1 * SCROLL_DISTANCE
  213. const SCROLL_DISTANCE = DEFAULT_VIEWPORT.height - 100; // make sure this is slightly less than viewport height so there is some overlap to make stitching easier
  214. /********************** Config: URL Rewriting *********************************/
  215. const URL_REWRITES = [
  216. // replacements should come first
  217. // {
  218. // idx: 0,
  219. // pattern: /\/\/(www\.)?x\.com/gi,
  220. // replacement: '//$1twitter.com/',
  221. // // TODO: scope: 'hostname',
  222. // },
  223. // {
  224. // idx: 1,
  225. // pattern: /\/\/(www\.)?twitter\.com/gi,
  226. // replacement: '//$1nitter.net',
  227. // // TODO: scope: 'hostname',
  228. // },
  229. // // blocks should come at the end
  230. // {
  231. // idx: 999,
  232. // pattern: /\/\/(www\.)?notallowed\.com/gi,
  233. // replacement: '',
  234. // // TODO: scope: 'href',
  235. // },
  236. ]
  237. const URL_SCHEMES_IGNORED = [
  238. '', // no scheme is also invalid (e.g. opening a new tab page without any url yet)
  239. 'chrome',
  240. 'chrome-extension',
  241. 'chrome-untrusted',
  242. 'file',
  243. 'data',
  244. 'about',
  245. ]
  246. /**************** Load existing data/archive/<timestamp> snapshots *************/
  247. const snapshots = await Snapshot.findAll({ attributes: ['id', 'timestamp', 'url'] }) // include: { model: ArchiveResult, as: 'archiveresults' }, });
  248. const results = await ArchiveResult.findAll({ attributes: ['id', 'snapshot_id', 'extractor', 'start_ts'] }) // include: { model: Snapshot, as: 'snapshot' }, });
  249. globalThis.snapshots = snapshots
  250. globalThis.results = results
  251. console.log(`[💿] Found ${snapshots.length} existing snapshots in index.sqlite3...`)
  252. console.log(`[💿] Found ${results.length} existing results in index.sqlite3...`)
  253. // debugger;
  254. const locateExistingSnapshots = (archive_dir) => {
  255. const urls_to_dirs = {}
  256. // for each data/archive/<timestamp>/index.json found, store {url: data/archive/<timestamp>}
  257. for (const snapshot_dir of fs.readdirSync(archive_dir)) {
  258. const snapshot_json = path.join(archive_dir, snapshot_dir, 'index.json')
  259. if (fs.existsSync(snapshot_json)) {
  260. const {url, archive_path} = JSON.parse(fs.readFileSync(snapshot_json, 'utf-8'))
  261. if (!snapshot_dir.includes(archive_path.replace('archive/', '')))
  262. throw 'Found incorrect index.json inside snapshot dir' + snapshot_dir
  263. if (url && url.includes('://')) {
  264. urls_to_dirs[url] = path.join(archive_dir, snapshot_dir)
  265. }
  266. }
  267. }
  268. return urls_to_dirs
  269. }
  270. let SNAPSHOT_DIRS_BY_URL = locateExistingSnapshots(ARCHIVE_DIR)
  271. let all_snap_dirs = (await fs.promises.readdir(ARCHIVE_DIR))
  272. // const orphan_snap_dirs = all_snap_dirs.filter(dirname => dirname.startsWith('19999'))
  273. // // scan through existing snapshot dirs, move orphans to orphans/ or correct archive/<snapid>
  274. // for (const snap_id of orphan_snap_dirs) {
  275. // if (snap_id.startsWith('.')) continue
  276. // const src_dir = path.join(ARCHIVE_DIR, snap_id)
  277. // let src_path = src_dir
  278. // assert((await fs.promises.stat(src_dir)).isDirectory())
  279. // let dest_path = null
  280. // const orphan_metrics_path = path.join(src_dir, 'metrics.json')
  281. // if (fs.existsSync(orphan_metrics_path)) {
  282. // const orphan_metrics = JSON.parse(await fs.promises.readFile(orphan_metrics_path, 'utf-8'))
  283. // const url = orphan_metrics.url || orphan_metrics.URL
  284. // const version = orphan_metrics.VERSION || versionStrFromDate(orphan_metrics.start_time)
  285. // // move all bare files into ./versions/YYYYMMDD/* and symlink ./* to latest version
  286. // await symlinkBestSnapshotResults(src_dir)
  287. // dest_path = SNAPSHOT_DIRS_BY_URL[url]
  288. // const dest_id = dest_path?.split('/').at(-1)
  289. // if (dest_id && (dest_id != snap_id)) {
  290. // if (fs.existsSync(dest_path)) {
  291. // console.log(` - moving duplicate snap_dir ${src_dir} -> ${dest_path}`)
  292. // } else {
  293. // console.log(` - moving valid snap_dir ${src_dir} -> ${dest_path}`)
  294. // }
  295. // } else if (dest_id == snap_id) {
  296. // continue
  297. // } else {
  298. // dest_path = path.join(ORPHANS_DIR, snap_id)
  299. // console.log(` - moving orphan snap_dir ${src_dir} -> ${dest_path}`)
  300. // }
  301. // } else {
  302. // // corrupt/par
  303. // dest_path = path.join(PARTIALS_DIR, snap_id)
  304. // console.log(` - moving parial snap_dir ${src_dir} -> ${dest_path}`)
  305. // }
  306. // if (dest_path) {
  307. // for (const version_dir of (await fs.promises.readdir(path.join(src_path, 'versions')))) {
  308. // const version_src = path.join(src_path, 'versions', version_dir)
  309. // const version_dst = path.join(dest_path, 'versions', version_dir)
  310. // // move all bare files into ./versions/YYYYMMDD/* and symlink ./* to latest version
  311. // await symlinkBestSnapshotResults(dest_path)
  312. // assert(!fs.existsSync(version_dst))
  313. // await fs.promises.rename(version_src, version_dst)
  314. // console.log(' - ', version_src, '--->', version_dst)
  315. // }
  316. // await fs.promises.rename(src_dir, path.join(PARTIALS_DIR, snap_id))
  317. // await symlinkBestSnapshotResults(dest_path)
  318. // }
  319. // }
  320. // const duplicate_snap_dirs = (await fs.promises.readdir(DUPLICATES_DIR)).filter(dirname => dirname.startsWith('19999'))
  321. // for (const snap_id of duplicate_snap_dirs) {
  322. // const src_dir = path.join(DUPLICATES_DIR, snap_id)
  323. // const metrics = JSON.parse(await fs.promises.readFile(path.join(src_dir, 'metrics.json'), 'utf-8'))
  324. // }
  325. // all_snap_dirs = (await fs.promises.readdir(ARCHIVE_DIR))
  326. // for (const snap_id of all_snap_dirs) {
  327. // if (snap_id.startsWith('.')) continue
  328. // const snap_dir = path.join(ARCHIVE_DIR, snap_id)
  329. // const metrics_path = path.join(snap_dir, 'metrics.json')
  330. // if (fs.existsSync(metrics_path)) {
  331. // // console.log(' - updating snap_dir', snap_dir)
  332. // await symlinkBestSnapshotResults(snap_dir)
  333. // }
  334. // }
  335. // SNAPSHOT_DIRS_BY_URL = locateExistingSnapshots(ARCHIVE_DIR)
  336. fs.writeFileSync(path.join(DATA_DIR, 'queue.csv'), '')
  337. const snapIdFromDir = (dir_path) =>
  338. dir_path.split('/archive/').at(-1)
  339. const snapshot_dir_list = (
  340. Object.entries(SNAPSHOT_DIRS_BY_URL)
  341. .sort(([_ak, a], [_bk, b]) =>
  342. Number(snapIdFromDir(b)) - Number(snapIdFromDir(a)))
  343. .reverse())
  344. for (const [existing_url, snapshot_dir] of snapshot_dir_list) {
  345. // if (existing_url.startsWith('https://www.facebook.com/')) {
  346. const is_desired_url = !(existing_url.includes('facebook.com/') || existing_url.includes('instagram.com/'))
  347. const already_archived = false // fs.existsSync(path.join(SNAPSHOT_DIRS_BY_URL[existing_url], 'versions'))
  348. if (is_desired_url && !already_archived) {
  349. // URLS.push(existing_url)
  350. fs.appendFileSync(
  351. path.join(DATA_DIR, 'queue.csv'),
  352. `${SNAPSHOT_DIRS_BY_URL[existing_url]},${existing_url}\n`,
  353. 'utf-8',
  354. )
  355. }
  356. }
  357. URLS = [...new Set(URLS)]
  358. console.log('[+] Added', URLS.length, 'existing urls to queue...')
  359. /********************** Config: Output Paths **********************************/
  360. // const TASK_PATH = (url) => path.join(DATA_DIR, 'results', `${hashCode(url)}`)
  361. const TASK_PATH = (url) => SNAPSHOT_DIRS_BY_URL[url] || path.join(ARCHIVE_DIR, `1999999999.${hashCode(url)}`)
  362. // const TASK_PATH = (url) => {
  363. // const existing_snap_dir = SNAPSHOT_DIRS_BY_URL[url]
  364. // assert(existing_snap_dir, `Could not find existing snapshot dir for ${url}`)
  365. // return existing_snap_dir
  366. // }
  367. const OUTPUT_PATH = (page, filename, extname='') =>
  368. path.join(TASK_PATH(page._original_url), `${filename}${extname}`)
  369. const SSL_PATH = (page) => OUTPUT_PATH(page, 'ssl.json')
  370. const CONSOLELOG_PATH = (page) => OUTPUT_PATH(page, 'console.log')
  371. const HEADERS_PATH = (page) => OUTPUT_PATH(page, 'headers.json')
  372. const REDIRECTS_PATH = (page) => OUTPUT_PATH(page, 'redirects.json')
  373. const REQUESTS_PATH = (page) => OUTPUT_PATH(page, 'requests.json')
  374. const TRACE_PATH = (page) => OUTPUT_PATH(page, 'trace.json')
  375. const METRICS_PATH = (page) => OUTPUT_PATH(page, 'metrics.json')
  376. const OUTLINKS_PATH = (page) => OUTPUT_PATH(page, 'outlinks.json')
  377. const SEO_PATH = (page) => OUTPUT_PATH(page, 'seo.json')
  378. const FAVICON_PATH = (page) => OUTPUT_PATH(page, 'favicon.json')
  379. const TITLE_PATH = (page) => OUTPUT_PATH(page, 'title.txt')
  380. const BODYTEXT_PATH = (page) => OUTPUT_PATH(page, 'body.txt')
  381. const PANDOC_PATH = (page) => OUTPUT_PATH(page, 'pandoc.md')
  382. const READABILITY_PATH = (page) => OUTPUT_PATH(page, 'readability.json')
  383. const ACCESIBILITY_PATH = (page) => OUTPUT_PATH(page, 'accessibility.json')
  384. const DOM_PATH = (page) => OUTPUT_PATH(page, 'dom.html')
  385. const PDF_PATH = (page) => OUTPUT_PATH(page, 'output.pdf')
  386. const SCREENSHOT_PATH = (page) => OUTPUT_PATH(page, 'screenshot.png')
  387. const SCREENSHOT_JPG_PATH = (page) => OUTPUT_PATH(page, 'screenshot.jpg')
  388. const AIQA_PATH = (page) => OUTPUT_PATH(page, 'aiqa.json')
  389. const SINGLEFILE_PATH = (page) => OUTPUT_PATH(page, 'singlefile.html')
  390. const YTDLP_PATH = (page) => OUTPUT_PATH(page, 'media/')
  391. const GALLERYDL_PATH = (page) => OUTPUT_PATH(page, 'photos/')
  392. const SCREENRECORDING_PATH = (page) => OUTPUT_PATH(page, 'screenrecording.mp4')
  393. const SCREENRECORDGIF_PATH = (page) => OUTPUT_PATH(page, 'screenrecording.gif')
  394. const RESPONSES_PATH = (page) => OUTPUT_PATH(page, 'responses')
  395. const RAW_PATH = (page) => OUTPUT_PATH(page, 'raw')
  396. /********************** Config: Chrome Extensions *****************************/
  397. interface ChromeExtension {
  398. name: string
  399. webstore_id: string
  400. }
  401. interface LoadedChromeExtension extends ChromeExtension {
  402. id?: string
  403. webstore_url?: string
  404. crx_url?: string
  405. crx_path?: string
  406. unpacked_path?: string
  407. read_manifest?: () => any
  408. read_version?: () => string | null
  409. }
  410. const CHROME_EXTENSIONS: LoadedChromeExtension[] = [
  411. // Content access / unblocking / blocking plugins
  412. {webstore_id: 'ifibfemgeogfhoebkmokieepdoobkbpo', name: 'twocaptcha'}, // https://2captcha.com/blog/how-to-use-2captcha-solver-extension-in-puppeteer
  413. {webstore_id: 'edibdbjcniadpccecjdfdjjppcpchdlm', name: 'istilldontcareaboutcookies'},
  414. {webstore_id: 'cjpalhdlnbpafiamejdnhcphjbkeiagm', name: 'ublock'},
  415. // {webstore_id: 'mlomiejdfkolichcflejclcbmpeaniij', name: 'ghostery'},
  416. // {webstore_id: 'mnjggcdmjocbbbhaepdhchncahnbgone', name: 'sponsorblock'},
  417. // {webstore_id: 'iplffkdpngmdjhlpjmppncnlhomiipha', name: 'unpaywall'},
  418. // {webstore_id: 'gofocbepaccnkpphbgjpolififgcakhn', name: 'spaywallnews'},
  419. // Archiving plugins
  420. {webstore_id: 'mpiodijhokgodhhofbcjdecpffjipkle', name: 'singlefile'},
  421. // {webstore_id: 'fpeoodllldobpkbkabpblcfaogecpndd', name: 'archivewebpage'},
  422. // {webstore_id: 'niloccemoadcdkdjlinkgdfekeahmflj', name: 'pocket'},
  423. // {webstore_id: 'kenncghfghgolcbmckhiljgaabnpcaaa', name: 'warcreate'},
  424. // {webstore_id: 'jjndjgheafjngoipoacpjgeicjeomjli', name: 'puppeteerstream'},
  425. // Utilities for humans setting up/viewing/debugging the archiving session
  426. // {webstore_id: 'aeblfdkhhhdcdjpifhhbdiojplfjncoa', name: '1password'},
  427. // {webstore_id: 'fngmhnnpilhplaeedifhccceomclgfbg', name: 'editthiscookie'},
  428. // {webstore_id: 'cgfpgnepljlgenjclbekbjdlgcodfmjp', name: 'simpletabsorter'},
  429. // Scripting/automation plugins
  430. // {webstore_id: 'jinjaccalgkegednnccohejagnlnfdag', name: 'violentmonkey'},
  431. // {webstore_id: 'infppggnoaenmfagbfknfkancpbljcca', name: 'automa'},
  432. // {webstore_id: 'pfegffhjcgkneoemnlniggnhkfioidjg', name: 'screenscraper'},
  433. ]
  434. /******************** Config: Chrome Profile Preferences **********************/
  435. // https://niek.github.io/chrome-features/
  436. const CHROME_DISABLED_COMPONENTS = [
  437. 'Translate',
  438. 'AcceptCHFrame',
  439. 'OptimizationHints',
  440. 'ProcessPerSiteUpToMainFrameThreshold',
  441. 'InterestFeedContentSuggestions',
  442. 'CalculateNativeWinOcclusion',
  443. 'BackForwardCache',
  444. 'HeavyAdPrivacyMitigations',
  445. 'LazyFrameLoading',
  446. 'ImprovedCookieControls',
  447. 'PrivacySandboxSettings4',
  448. 'AutofillServerCommunication',
  449. 'CertificateTransparencyComponentUpdater',
  450. 'DestroyProfileOnBrowserClose',
  451. 'CrashReporting',
  452. 'OverscrollHistoryNavigation',
  453. 'InfiniteSessionRestore',
  454. //'LockProfileCookieDatabase', // disabling allows multiple chrome instances to concurrently modify profile, but might make chrome much slower https://github.com/yt-dlp/yt-dlp/issues/7271 https://issues.chromium.org/issues/40901624
  455. ]
  456. const CHROME_PREFERENCES_EXTRA = {}
  457. const CHROME_PREFERENCES_DEFAULT = {
  458. // https://chromium.googlesource.com/chromium/src/+/32352ad08ee673a4d43e8593ce988b224f6482d3/chrome/common/pref_names.cc
  459. homepage: 'about:blank', // doesn't work here, managed by Secure Preferences
  460. homepage_is_newtabpage: false, // doesn't work here, managed by Secure Preferences
  461. session: { // doesn't work here, managed by Secure Preferences
  462. restore_on_startup: 4, // doesn't work here, managed by Secure Preferences
  463. startup_urls: 'about:blank', // doesn't work here, managed by Secure Preferences
  464. },
  465. default_apps: 'noinstall',
  466. browser: {
  467. confirm_to_quit: false,
  468. enable_spellchecking: false,
  469. check_default_browser: false,
  470. show_update_promotion_info_bar: false,
  471. },
  472. profile: {
  473. // name: 'ArchiveBox Persona: Default', // doesnt work to change display name, not sure why
  474. // using_default_name: false,
  475. exited_cleanly: true,
  476. default_content_setting_values: {
  477. automatic_downloads: 1,
  478. },
  479. },
  480. bookmark_bar: {show_on_all_tabs: false},
  481. safebrowsing: {enabled: false},
  482. search: {suggest_enabled: false},
  483. download: {
  484. prompt_for_download: false,
  485. open_pdf_in_system_reader: true,
  486. // default_directory: CHROME_DOWNLOADS_DIR || path.join(__dirname, 'downloads'),
  487. },
  488. select_file_dialogs: {allowed: false},
  489. autofill: {save_data: false},
  490. printing: {enabled: false},
  491. message_center: {welcome_notification_dismissed_local: true},
  492. extensions: {
  493. ui: {
  494. developer_mode: true,
  495. dismissed_adt_promo: true,
  496. },
  497. // pinned_extensions: CHROME_EXTENSIONS?.map(({id}) => id) || [],
  498. },
  499. webkit: {
  500. webprefs: {
  501. javascript_enabled: true,
  502. minimum_font_size: 9,
  503. // default_font_size: 12,
  504. // web_security_enabled: false,
  505. // allow_displaying_insecure_content: true,
  506. // allow_running_insecure_content: true,
  507. java_enabled: true,
  508. loads_images_automatically: true,
  509. },
  510. },
  511. settings: {
  512. multi_profile_never_show_intro: true,
  513. multi_profile_warning_show_dismissed: true,
  514. first_run_tutorial_shown: true,
  515. },
  516. plugins: {
  517. always_open_pdf_externally: true,
  518. },
  519. }
  520. const CHROME_PREFERENCES_PATH = path.join(CHROME_PROFILE_PATH, 'Default', 'Preferences')
  521. const getChromePreferences = ({CHROME_PREFERENCES_DEFAULT, CHROME_PREFERENCES_EXTRA, CHROME_EXTENSIONS, CHROME_DOWNLOADS_DIR}) =>
  522. merge.all([CHROME_PREFERENCES_DEFAULT, CHROME_PREFERENCES_EXTRA, {
  523. extensions: {
  524. pinned_extensions: CHROME_EXTENSIONS?.map(({id}) => id) || [],
  525. },
  526. download: {
  527. default_directory: CHROME_DOWNLOADS_DIR || path.join(__dirname, 'downloads'),
  528. },
  529. }])
  530. function applyChromePreferences(puppeteer, prefs_path, preferences) {
  531. if (fs.existsSync(prefs_path)) {
  532. const preferences_existing = JSON.parse(fs.readFileSync(prefs_path, 'utf-8'))
  533. const preferences_merged = merge(preferences_existing, preferences)
  534. // console.log(JSON.stringify(preferences_merged, null, 4))
  535. fs.writeFileSync(prefs_path, JSON.stringify(preferences_merged))
  536. } else {
  537. // otherwise profile has not been created yet, use plugin instead (plugin only works on first creation)
  538. puppeteer.use(PrefsPlugin({userPrefs: preferences}))
  539. }
  540. return puppeteer
  541. }
  542. /******************** Config: Chrome Launch Args ******************************/
  543. const CHROME_ARGS_DEFAULT = [
  544. // Headless behavior tuning, determinstic behavior settings
  545. // '--headless=new',
  546. '--test-type',
  547. '--test-type=gpu', // https://github.com/puppeteer/puppeteer/issues/10516
  548. '--deterministic-mode',
  549. '--js-flags=--random-seed=1157259159', // make all JS random numbers deterministic by providing a seed
  550. '--allow-pre-commit-input', // allow JS mutations before page rendering is complete
  551. '--disable-blink-features=AutomationControlled', // hide the signatures that announce browser is being remote-controlled
  552. '--enable-automation', // <- DONT USE THIS, it makes you easily detectable / blocked by cloudflare
  553. // `--proxy-server=https://43.159.28.126:2334:u7ce652b7568805c4-zone-custom-region-us-session-szGWq3FRU-sessTime-60:u7ce652b7568805c4`, // send all network traffic through a proxy https://2captcha.com/proxy
  554. // `--proxy-bypass-list=127.0.0.1`,
  555. // Docker-specific options
  556. // https://github.com/GoogleChrome/lighthouse-ci/tree/main/docs/recipes/docker-client#--no-sandbox-issues-explained
  557. // '--no-sandbox', // rely on docker sandboxing in docker, otherwise we need cap_add: SYS_ADM to use host sandboxing
  558. // '--disable-gpu-sandbox',
  559. // '--disable-setuid-sandbox',
  560. // '--disable-dev-shm-usage', // docker 75mb default shm size is not big enough, disabling just uses /tmp instead
  561. // '--no-xshm',
  562. // Profile data dir setup
  563. // chrome://profile-internals
  564. `--user-data-dir=${CHROME_PROFILE_PATH}`,
  565. `--profile-directory=${CHROME_PROFILE_USER}`,
  566. '--password-store=basic', // use mock keychain instead of OS-provided keychain (we manage auth.json instead)
  567. '--use-mock-keychain',
  568. '--disable-cookie-encryption', // we need to be able to write unencrypted cookies to save/load auth.json
  569. // '--disable-sync', // don't try to use Google account sync features
  570. // Extensions
  571. // chrome://inspect/#extensions
  572. // `--load-extension=${CHROME_EXTENSIONS.map(({unpacked_path}) => unpacked_path).join(',')}`, // not needed when using existing profile that already has extensions installed
  573. `--allowlisted-extension-id=${CHROME_EXTENSIONS.map(({ webstore_id }) => webstore_id).join(',')}`,
  574. '--allow-legacy-extension-manifests',
  575. // Browser window and viewport setup
  576. // chrome://version
  577. // `--user-agent="${DEFAULT_USER_AGENT}"`,
  578. // `--window-size=${DEFAULT_VIEWPORT.width},${DEFAULT_VIEWPORT.height}`,
  579. '--window-position=0,0',
  580. '--hide-scrollbars', // hide scrollbars because otherwise they show up in screenshots
  581. '--install-autogenerated-theme=169,32,85', // red border makes it easier to see which chrome window is archivebox's
  582. '--autoplay-policy=no-user-gesture-required', // auto-start videos so they trigger network requests + show up in outputs
  583. '--disable-gesture-requirement-for-media-playback',
  584. '--lang=en-US,en;q=0.9',
  585. // DANGER: JS isolation security features (to allow easier tampering with pages during archiving)
  586. // chrome://net-internals
  587. // '--disable-web-security', // <- WARNING, breaks some sites that expect/enforce strict CORS headers (try webflow.com)
  588. // '--disable-features=IsolateOrigins,site-per-process', // useful for injecting JS, but some very strict sites can panic / show error pages when isolation is disabled (e.g. webflow.com)
  589. // '--allow-running-insecure-content', // Breaks CORS/CSRF/HSTS etc., useful sometimes but very easy to detect
  590. // '--allow-file-access-from-files', // <- WARNING, dangerous, allows JS to read filesystem using file:// URLs
  591. // // DANGER: Disable HTTPS verification
  592. // '--ignore-certificate-errors',
  593. // '--ignore-ssl-errors',
  594. // '--ignore-certificate-errors-spki-list',
  595. // '--allow-insecure-localhost',
  596. // IO: stdin/stdout, debug port config
  597. // chrome://inspect
  598. '--log-level=2', // 1=DEBUG 2=WARNING 3=ERROR
  599. '--enable-logging=stderr',
  600. '--remote-debugging-address=0.0.0.0',
  601. `--remote-debugging-port=${CHROME_DEBUG_PORT}`,
  602. // GPU, canvas, text, and pdf rendering config
  603. // chrome://gpu
  604. '--enable-webgl', // enable web-gl graphics support
  605. '--font-render-hinting=none', // make rendering more deterministic by ignoring OS font hints, may also need css override, try: * {text-rendering: geometricprecision !important; -webkit-font-smoothing: antialiased;}
  606. '--force-color-profile=srgb', // make rendering more deterministic by using consitent color profile, if browser looks weird, try: generic-rgb
  607. '--disable-partial-raster', // make rendering more deterministic (TODO: verify if still needed)
  608. '--disable-skia-runtime-opts', // make rendering more deterministic by avoiding Skia hot path runtime optimizations
  609. '--disable-2d-canvas-clip-aa', // make rendering more deterministic by disabling antialiasing on 2d canvas clips
  610. // '--disable-gpu', // falls back to more consistent software renderer
  611. // // '--use-gl=swiftshader', <- DO NOT USE, breaks M1 ARM64. it makes rendering more deterministic by using simpler CPU renderer instead of OS GPU renderer bug: https://groups.google.com/a/chromium.org/g/chromium-dev/c/8eR2GctzGuw
  612. // // '--disable-software-rasterizer', <- DO NOT USE, harmless, used in tandem with --disable-gpu
  613. // // '--run-all-compositor-stages-before-draw', <- DO NOT USE, makes headful chrome hang on startup (tested v121 Google Chrome.app on macOS)
  614. // // '--disable-gl-drawing-for-tests', <- DO NOT USE, disables gl output (makes tests run faster if you dont care about canvas)
  615. // // '--blink-settings=imagesEnabled=false', <- DO NOT USE, disables images entirely (only sometimes useful to speed up loading)
  616. // Process management & performance tuning
  617. // chrome://process-internals
  618. '--disable-lazy-loading', // make rendering more deterministic by loading all content up-front instead of on-focus
  619. '--disable-renderer-backgrounding', // dont throttle tab rendering based on focus/visibility
  620. '--disable-background-networking', // dont throttle tab networking based on focus/visibility
  621. '--disable-background-timer-throttling', // dont throttle tab timers based on focus/visibility
  622. '--disable-backgrounding-occluded-windows', // dont throttle tab window based on focus/visibility
  623. '--disable-ipc-flooding-protection', // dont throttle ipc traffic or accessing big request/response/buffer/etc. objects will fail
  624. '--disable-extensions-http-throttling', // dont throttle http traffic based on runtime heuristics
  625. '--disable-field-trial-config', // disable shared field trial state between browser processes
  626. '--disable-back-forward-cache', // disable browsing navigation cache
  627. // '--in-process-gpu', <- DONT USE THIS, makes headful startup time ~5-10s slower (tested v121 Google Chrome.app on macOS)
  628. // '--disable-component-extensions-with-background-pages', // TODO: check this, disables chrome components that only run in background (could lower startup time)
  629. // uncomment to disable hardware camera/mic/speaker access + present fake devices to websites
  630. // (faster to disable, but disabling breaks recording browser audio in puppeteer-stream screenrecordings)
  631. // '--use-fake-device-for-media-stream',
  632. // '--use-fake-ui-for-media-stream',
  633. // '--disable-features=GlobalMediaControls,MediaRouter,DialMediaRouteProvider',
  634. // // Output format options (PDF, screenshot, etc.)
  635. '--export-tagged-pdf', // include table on contents and tags in printed PDFs
  636. '--generate-pdf-document-outline',
  637. // Suppress first-run features, popups, hints, updates, etc.
  638. // chrome://system
  639. '--no-pings',
  640. '--no-first-run',
  641. '--no-default-browser-check',
  642. '--disable-default-apps',
  643. '--ash-no-nudges',
  644. '--disable-infobars',
  645. '--disable-search-engine-choice-screen',
  646. '--disable-session-crashed-bubble',
  647. '--simulate-outdated-no-au="Tue, 31 Dec 2099 23:59:59 GMT"',
  648. '--hide-crash-restore-bubble',
  649. '--suppress-message-center-popups',
  650. '--disable-client-side-phishing-detection',
  651. '--disable-domain-reliability',
  652. '--disable-component-update',
  653. '--disable-datasaver-prompt',
  654. '--disable-hang-monitor',
  655. '--disable-session-crashed-bubble',
  656. '--disable-speech-synthesis-api',
  657. '--disable-speech-api',
  658. '--disable-print-preview',
  659. '--safebrowsing-disable-auto-update',
  660. '--deny-permission-prompts',
  661. '--disable-external-intent-requests',
  662. '--disable-notifications',
  663. '--disable-desktop-notifications',
  664. '--noerrdialogs',
  665. '--disable-popup-blocking',
  666. '--disable-prompt-on-repost',
  667. '--silent-debugger-extension-api',
  668. '--block-new-web-contents',
  669. '--metrics-recording-only',
  670. '--disable-breakpad',
  671. // other feature flags
  672. // chrome://flags chrome://components
  673. `--disable-features=${CHROME_DISABLED_COMPONENTS.join(',')}`,
  674. '--enable-features=NetworkService',
  675. ]
  676. const CHROME_ARGS_EXTRA = []
  677. const CHROME_LAUNCH_OPTIONS = {
  678. CHROME_PROFILE_PATH,
  679. CHROME_PROFILE_USER,
  680. CHROME_EXTENSIONS,
  681. CHROME_DEBUG_PORT,
  682. CHROME_DISABLED_COMPONENTS,
  683. DEFAULT_VIEWPORT,
  684. CHROME_ARGS_DEFAULT,
  685. CHROME_ARGS_EXTRA,
  686. }
  687. /* Chrome CLI Args Documentation
  688. - https://github.com/GoogleChrome/chrome-launcher/blob/main/docs/chrome-flags-for-tools.md
  689. - https://chromium.googlesource.com/chromium/chromium/+/master/content/public/common/content_switches.cc
  690. - https://jtway.co/optimize-your-chrome-options-for-testing-to-get-x1-25-impact-4f19f071bf45
  691. - https://peter.sh/experiments/chromium-command-line-switches/
  692. - https://www.chromium.org/developers/how-tos/run-chromium-with-flags/
  693. - https://github.com/manoj9788/Chrome-Driver-arguments/blob/master/README.md
  694. */
  /**
   * Build the full list of CLI flags used to launch Chrome.
   *
   * Any key missing from `options` falls back to the matching entry of
   * CHROME_LAUNCH_OPTIONS, so a caller can override a single setting
   * (e.g. `getChromeArgs({CHROME_DEBUG_PORT: 9333})`) without re-supplying the rest.
   *
   * @param options  overrides for CHROME_LAUNCH_OPTIONS (all keys optional)
   * @returns array of `--flag[=value]` strings: defaults first, then the
   *          profile/extension/window/debug-port flags, then CHROME_ARGS_EXTRA
   */
  const getChromeArgs = (options = CHROME_LAUNCH_OPTIONS) => {
      const {
          CHROME_ARGS_DEFAULT,
          CHROME_ARGS_EXTRA,
          CHROME_PROFILE_PATH,
          CHROME_PROFILE_USER,
          CHROME_EXTENSIONS,
          CHROME_DEBUG_PORT,
          CHROME_DISABLED_COMPONENTS,
          DEFAULT_VIEWPORT,
      } = {...CHROME_LAUNCH_OPTIONS, ...options}

      // extensions that are not installed have no id yet (null); drop empty
      // entries so the comma-separated lists never contain blank items
      const extension_paths = CHROME_EXTENSIONS.map(({unpacked_path}) => unpacked_path).filter(Boolean)
      const extension_ids = CHROME_EXTENSIONS.map(({id}) => id).filter(Boolean)

      return [
          ...CHROME_ARGS_DEFAULT,
          `--user-data-dir=${CHROME_PROFILE_PATH}`,
          `--profile-directory=${CHROME_PROFILE_USER}`,
          `--load-extension=${extension_paths.join(',')}`,
          `--allowlisted-extension-id=${extension_ids.join(',')}`,
          `--window-size=${DEFAULT_VIEWPORT.width},${DEFAULT_VIEWPORT.height}`,
          `--remote-debugging-port=${CHROME_DEBUG_PORT}`,
          // NOTE(review): the module's CHROME_ARGS_DEFAULT already carries a --disable-features
          // entry, so the switch appears twice; presumably Chrome honors this later one — confirm
          `--disable-features=${CHROME_DISABLED_COMPONENTS.join(',')}`,
          ...CHROME_ARGS_EXTRA,
      ]
  }
  712. /******************** Chrome Extension Management *****************************/
  /**
   * Derive the dynamic id Chrome assigns to an unpacked extension.
   *
   * @param unpacked_path  directory of the unpacked extension
   * @returns 32-character id made of the letters 'a'-'p', or null when the
   *          directory contains no manifest.json
   */
  function getExtensionId(unpacked_path) {
      const has_manifest = fs.existsSync(path.join(unpacked_path, 'manifest.json'))
      if (!has_manifest) return null

      // chrome uses a SHA256 hash of the unpacked extension directory path to compute a dynamic id
      const path_digest = crypto
          .createHash('sha256')
          .update(Buffer.from(unpacked_path, 'utf-8'))
          .digest('hex')

      // map each of the first 32 hex digits (0-f) onto the letters 'a'-'p'
      const first_letter = 'a'.charCodeAt(0)
      let detected_extension_id = ''
      for (const hex_digit of path_digest.slice(0, 32)) {
          detected_extension_id += String.fromCharCode(first_letter + parseInt(hex_digit, 16))
      }
      return detected_extension_id
  }
  /**
   * Download (when missing) and unpack a Chrome extension.
   *
   * The .crx is fetched from extension.crx_url into extension.crx_path only when
   * neither the unpacked manifest.json nor the .crx file exists yet. It is then
   * unzipped into extension.unpacked_path, first with /usr/bin/unzip and, if that
   * fails, with the file's unzip() helper.
   *
   * @param extension  object with name, webstore_id, crx_url, crx_path, unpacked_path
   * @returns true when manifest.json exists in unpacked_path afterwards, otherwise false
   */
  async function installExtension(extension) {
      const manifest_path = path.join(extension.unpacked_path, 'manifest.json')

      // Download extensions using:
      // curl -fsSL 'https://clients2.google.com/service/update2/crx?response=redirect&prodversion=1230&acceptformat=crx3&x=id%3D$EXTENSION_ID%26uc' > extensionname.crx
      // unzip -d extensionname extensionname.zip
      if (!fs.existsSync(manifest_path) && !fs.existsSync(extension.crx_path)) {
          console.log("[🛠️] Downloading missing extension", extension.name, extension.webstore_id, '->', extension.crx_path);
          try {
              // Download crx file from ext.crx_url -> ext.crx_path
              const response = await fetch(extension.crx_url)
              if (!(response.ok && response.headers.get("content-length") && response.body)) {
                  throw new Error(`unexpected response (HTTP ${response.status})`)
              }
              // @ts-ignore
              const crx_stream = Readable.fromWeb(response.body)
              // only open the destination file once the response is known to be usable
              await finished(crx_stream.pipe(fs.createWriteStream(extension.crx_path)))
          } catch(err) {
              console.warn('[⚠️] Failed to download extension', extension.name, extension.webstore_id, err)
              // never leave an empty/partial .crx behind: its mere existence would
              // make the next run skip the download
              await fs.promises.rm(extension.crx_path, {force: true})
              return false
          }
      }

      // Unzip crx file from ext.crx_path -> ext.unpacked_path
      await fs.promises.mkdir(extension.unpacked_path, {recursive: true})
      let stdout = ''
      let stderr = ''
      const unzip_errors = []
      try {
          // execFile passes the paths as separate argv entries (no shell), so spaces
          // or shell metacharacters in the paths cannot break the command
          await new Promise((resolve, reject) => {
              child_process.execFile(
                  '/usr/bin/unzip',
                  [extension.crx_path, '-d', extension.unpacked_path],
                  (err, out, err_out) => {
                      stdout = out
                      stderr = err_out
                      if (err) reject(err)
                      else resolve(null)
                  },
              )
          })
      } catch(err1) {
          unzip_errors.push(err1)
          try {
              await unzip(extension.crx_path, extension.unpacked_path)
          } catch(err2) {
              // best-effort: success is decided by the manifest.json check below
              unzip_errors.push(err2)
          }
      }

      const installed = fs.existsSync(manifest_path)
      if (!installed)
          console.error(`[❌] Failed to install ${extension.crx_path}: could not find manifest.json in unpacked_path`, stdout, stderr, ...unzip_errors)
      return installed
  }
  /**
   * Fill in the derived metadata of an extension config object, installing the
   * extension first when no readable version is found in its unpacked directory.
   *
   * Mutates and returns `ext`: sets webstore_id, name, webstore_url, crx_url,
   * crx_path, unpacked_path, read_manifest(), read_version(), id and version.
   *
   * @param ext  extension config; needs at least webstore_id or unpacked_path
   * @returns the same `ext` object (version is null when installation failed)
   * @throws Error when neither webstore_id nor unpacked_path is given
   */
  async function loadOrInstallExtension(ext) {
      if (!(ext.webstore_id || ext.unpacked_path))
          throw new Error('Extension must have either {webstore_id} or {unpacked_path}')

      // Set statically computable extension metadata
      ext.webstore_id = ext.webstore_id || ext.id
      ext.name = ext.name || ext.webstore_id
      ext.webstore_url = ext.webstore_url || `https://chromewebstore.google.com/detail/${ext.webstore_id}`
      ext.crx_url = ext.crx_url || `https://clients2.google.com/service/update2/crx?response=redirect&prodversion=1230&acceptformat=crx3&x=id%3D${ext.webstore_id}%26uc`
      ext.crx_path = ext.crx_path || path.join(CHROME_EXTENSIONS_DIR, `${ext.webstore_id}__${ext.name}.crx`)
      ext.unpacked_path = ext.unpacked_path || path.join(CHROME_EXTENSIONS_DIR, `${ext.webstore_id}__${ext.name}`)

      const manifest_path = path.join(ext.unpacked_path, 'manifest.json')
      ext.read_manifest = () => JSON.parse(fs.readFileSync(manifest_path, 'utf-8'))
      ext.read_version = () => {
          if (!fs.existsSync(manifest_path)) return null
          try {
              return ext.read_manifest()?.version || null
          } catch(err) {
              // unreadable or corrupt manifest.json: report "no version" so the
              // extension is treated as not installed instead of crashing here
              return null
          }
      }

      // if extension is not installed, download and unpack it
      if (!ext.read_version()) {
          await installExtension(ext)
      }

      // autodetect id from filesystem path (unpacked extensions dont have stable IDs)
      ext.id = getExtensionId(ext.unpacked_path)
      ext.version = ext.read_version()
      if (!ext.version) {
          console.warn('[❌] Unable to detect ID and version of installed extension', prettyPath(ext.unpacked_path))
      } else {
          console.log(`[➕] Installed extension ${ext.name} (${ext.version})...`.padEnd(82), prettyPath(ext.unpacked_path))
      }
      return ext
  }
  /**
   * Inspect a browser target and describe whether it belongs to an extension.
   *
   * Despite the name this returns a description object, not a boolean:
   * {target_type, target_ctx, target_url, target_is_bg, target_is_extension,
   *  extension_id, manifest_version}.
   *
   * @param target  object exposing type(), worker(), page() and url()
   * @throws rethrows any inspection error other than "No target with given id found"
   */
  async function isTargetExtension(target) {
      let target_type
      let target_ctx
      let target_url

      try {
          target_type = target.type()
          const worker = await target.worker()
          target_ctx = worker || (await target.page()) || null
          target_url = target.url() || target_ctx?.url() || null
      } catch(err) {
          const target_already_closed = String(err).includes('No target with given id found')
          if (!target_already_closed) throw err

          // because this runs on initial browser startup, we sometimes race with closing the initial
          // new tab page. checking a target that is already closed throws a harmless error, so
          // describe it as a closed non-extension target instead of failing
          target_type = 'closed'
          target_ctx = null
          target_url = 'about:closed'
      }

      const BG_TARGET_TYPES = ['service_worker', 'background_page']
      const target_is_bg = BG_TARGET_TYPES.includes(target_type)
      const target_is_extension = target_url?.startsWith('chrome-extension://')

      // the extension id is the host part of chrome-extension://<id>/...
      let extension_id = null
      if (target_is_extension) {
          const [, after_scheme] = target_url.split('://')
          extension_id = after_scheme.split('/')[0] || null
      }

      // only service workers are reported as manifest v3, every other target type as v2
      const manifest_version = (target_type === 'service_worker') ? '3' : '2'

      return {
          target_type,
          target_ctx,
          target_url,
          target_is_bg,
          target_is_extension,
          extension_id,
          manifest_version,
      }
  }
  /**
   * If `target` is an extension background context (service worker or background
   * page), read its manifest and build an extension object with helper functions
   * for driving the extension from the automation side.
   *
   * The entry of `extensions` whose id matches is updated in place with the new
   * fields; extensions that are not listed are still returned but not added to
   * the list.
   *
   * @param extensions  list of known extension objects (matched by `id`)
   * @param target      browser target to inspect (passed to isTargetExtension)
   * @returns the merged extension object, or null when the target is not an
   *          extension background context or its manifest has no version
   */
  async function loadExtensionFromTarget(extensions, target) {
      const {
          target_is_bg,
          target_type,
          target_ctx,
          target_url,
          extension_id,
          manifest_version,
      } = await isTargetExtension(target)

      if (!(target_is_bg && extension_id && target_ctx))
          return null

      const manifest = await target_ctx.evaluate(() =>
          // @ts-ignore
          chrome.runtime.getManifest())

      const { name, version, homepage_url, options_page, options_ui } = manifest
      if (!version)
          return null

      const options_url = await target_ctx.evaluate(
          (options_page) => chrome.runtime.getURL(options_page),
          options_page || options_ui?.page || 'options.html',
      )

      // run some JS in the extension's background context
      const dispatchEval = async (...args) =>
          await target_ctx.evaluate(...args)
      // open the extension popup (falls back to opening popup.html in a tab)
      const dispatchPopup = async () =>
          await target_ctx.evaluate('chrome.action?.openPopup() || chrome.tabs.create({url: chrome.runtime.getURL("popup.html")})')

      let dispatchAction
      let dispatchMessage
      let dispatchCommand

      if (manifest_version === '3') {
          dispatchAction = async (tab) => {
              // https://developer.chrome.com/docs/extensions/reference/api/action#event-onClicked
              return await target_ctx.evaluate(async (tab) => {
                  tab = tab || (await new Promise((resolve) =>
                      chrome.tabs.query({currentWindow: true, active: true}, ([tab]) => resolve(tab))))
                  // @ts-ignore
                  return await chrome.action.onClicked.dispatch(tab)
              }, tab)
          }
          dispatchMessage = async (message, options) => {
              // https://developer.chrome.com/docs/extensions/reference/api/runtime
              return await target_ctx.evaluate(async (extension_id, message, options) => {
                  return await chrome.runtime.sendMessage(extension_id, message, options)
              }, extension_id, message, options)
          }
          dispatchCommand = async (command, tab) => {
              // https://developer.chrome.com/docs/extensions/reference/api/commands#event-onCommand
              return await target_ctx.evaluate(async (command, tab) => {
                  // @ts-ignore
                  return await chrome.commands.onCommand.dispatch(command, tab)
              }, command, tab)
          }
      } else {
          // isTargetExtension() only ever reports '3' or '2', so this is the manifest v2 branch
          dispatchAction = async (tab) => {
              // https://developer.chrome.com/docs/extensions/mv2/reference/browserAction#event-onClicked
              return await target_ctx.evaluate(async (tab) => {
                  tab = tab || (await new Promise((resolve) =>
                      chrome.tabs.query({currentWindow: true, active: true}, ([tab]) => resolve(tab))))
                  // @ts-ignore
                  return await chrome.browserAction.onClicked.dispatch(tab)
              }, tab)
          }
          dispatchMessage = async (message, options) => {
              // https://developer.chrome.com/docs/extensions/mv2/reference/runtime#method-sendMessage
              return await target_ctx.evaluate(async (extension_id, message, options) => {
                  return await new Promise((resolve) =>
                      chrome.runtime.sendMessage(extension_id, message, options, resolve)
                  )
              }, extension_id, message, options)
          }
          dispatchCommand = async (command, tab) => {
              // https://developer.chrome.com/docs/extensions/mv2/reference/commands#event-onCommand
              return await target_ctx.evaluate(async (command, tab) => {
                  return await new Promise((resolve) =>
                      // @ts-ignore
                      chrome.commands.onCommand.dispatch(command, tab, resolve)
                  )
              }, command, tab)
          }
      }

      const existing_extension = extensions.find(({id}) => id === extension_id) || {}
      const new_extension = {
          ...existing_extension,
          id: extension_id,
          webstore_name: name,

          target,
          target_ctx,
          target_type,
          target_url,

          manifest_version,
          manifest,
          version,
          homepage_url,
          options_url,

          dispatchEval,      // run some JS in the extension's service worker context
          dispatchPopup,     // open the extension popup
          dispatchAction,    // trigger an extension menubar icon click
          dispatchMessage,   // send a chrome runtime message in the service worker context
          dispatchCommand,   // trigger an extension keyboard shortcut command
      }

      // fall back to the id so a manifest without a name cannot crash the log line
      console.log(`[➕] Loaded extension ${String(name ?? extension_id).substring(0, 32)} (${version}) ${target_type}...`.padEnd(82), target_url)
      Object.assign(existing_extension, new_extension)

      return new_extension
  }
  /**
   * Load metadata for every configured extension from the filesystem, installing
   * missing ones, then dump the resulting list to `<extensions>.present.json`
   * for debugging.
   *
   * Failures are logged and never thrown: each extension is handled on its own,
   * so one broken extension does not stop the remaining ones from loading.
   *
   * @param CHROME_EXTENSIONS      list of extension config objects (updated in place)
   * @param CHROME_EXTENSIONS_DIR  accepted for interface compatibility; not read by this function
   * @returns the same CHROME_EXTENSIONS array
   */
  async function getChromeExtensionsFromPersona({CHROME_EXTENSIONS, CHROME_EXTENSIONS_DIR}) {
      console.log('*************************************************************************')
      console.log(`[⚙️] Installing ${CHROME_EXTENSIONS.length} chrome extensions from CHROME_EXTENSIONS...`)

      // read extension metadata from filesystem (installing from Chrome webstore if extension is missing)
      for (const extension of CHROME_EXTENSIONS) {
          try {
              Object.assign(extension, await loadOrInstallExtension(extension))
          } catch(err) {
              console.error(err)
          }
      }

      // for easier debugging, write parsed extension info to filesystem
      try {
          await overwriteFile(
              CHROME_EXTENSIONS_JSON_PATH.replace('.json', '.present.json'),
              CHROME_EXTENSIONS,
          )
      } catch(err) {
          console.error(err)
      }

      console.log('*************************************************************************')
      return CHROME_EXTENSIONS
  }
  // Extension list produced by the first getChromeExtensionsFromCache() call (null until then)
  let _EXTENSIONS_CACHE = null

  /**
   * Return the extensions detected in the running browser, doing the detection
   * only on the first call and serving the cached list afterwards.
   *
   * On the first call every open browser target is passed to
   * loadExtensionFromTarget(), and the resulting list is written to
   * `<extensions>.loaded.json` with CHROME_EXTENSIONS_JSON_PATH symlinked to it.
   *
   * @param browser         browser whose targets() are inspected
   * @param extensions      list to enrich in place (defaults to CHROME_EXTENSIONS)
   * @param extensions_dir  accepted for interface compatibility; not read by this function
   * @returns the cached extensions array
   */
  async function getChromeExtensionsFromCache({browser, extensions=CHROME_EXTENSIONS, extensions_dir=CHROME_EXTENSIONS_DIR}) {
      if (_EXTENSIONS_CACHE === null) {
          // report the size of the list actually being loaded, which may differ from the global default
          console.log(`[⚙️] Loading ${extensions.length} chrome extensions from CHROME_EXTENSIONS...`)

          // find loaded Extensions at runtime / browser launch time & connect handlers
          // looks at all the open targets for extension service workers / bg pages
          for (const target of browser.targets()) {
              // mutates extensions object in-place to add metadata loaded from filesystem persona dir
              await loadExtensionFromTarget(extensions, target)
          }
          _EXTENSIONS_CACHE = extensions

          // write installed extension metadata to filesystem extensions.json for easier debugging
          await overwriteFile(
              CHROME_EXTENSIONS_JSON_PATH.replace('.json', '.loaded.json'),
              extensions,
          )
          await overwriteSymlink(
              CHROME_EXTENSIONS_JSON_PATH.replace('.json', '.loaded.json'),
              CHROME_EXTENSIONS_JSON_PATH,
          )
      }

      return _EXTENSIONS_CACHE
  }
  /**
   * Configure the 2captcha extension through its options page: fill in
   * API_KEY_2CAPTCHA, switch on its toggles, click "connect" and accept the
   * confirmation dialog. Any failure is logged as a warning instead of thrown,
   * and the temporary tab is closed afterwards.
   *
   * @param browser     browser used to open the options page in a new tab
   * @param extensions  loaded extension list; must contain the one named 'twocaptcha'
   */
  async function setup2CaptchaExtension({browser, extensions}) {
      let page = null

      try {
          // open a new tab to finish setting up the 2captcha extension manually using its extension options page
          page = await browser.newPage()
          const { options_url } = extensions.filter(ext => ext.name === 'twocaptcha')[0]
          await page.goto(options_url)
          await wait(2_500)
          await page.bringToFront()

          // clear whatever key is already in the field, then type in the API key
          await page.evaluate(() => {
              const api_key_input = document.querySelector("input[name=apiKey]") as HTMLInputElement
              api_key_input.value = ""
          })
          await page.type('input[name=apiKey]', API_KEY_2CAPTCHA, { delay: 25 })

          // toggle all the important switches to ON
          await page.evaluate(() => {
              document
                  .querySelectorAll<HTMLInputElement>('input#isPluginEnabled, input[name*=enabledFor], input[name*=autoSolve]')
                  .forEach((toggle) => {
                      if (!toggle.checked) toggle.click()
                  })
          })

          // clicking the Login button pops up a confirmation dialog; accept it
          // half a second after it appears and remember that it did show up
          let confirmation_accepted = false
          const acceptDialogLater = (dialog) => {
              setTimeout(async () => {
                  await dialog.accept();
                  confirmation_accepted = true
              }, 500);
          }
          page.on('dialog', acceptDialogLater)

          await page.click('button#connect')
          await wait(2_500)
          if (!confirmation_accepted) {
              throw `2captcha extension login confirmation dialog never opened, please check its options page manually: ${options_url}`
          }
          console.log('[🔑] Configured the 2captcha extension using its options page...')
      } catch(err) {
          console.warn(`[❌] Failed to configure the 2captcha extension using its options page!`, err)
      }

      if (page) await page.close()
  }
  /**
   * Run a network speed test on fast.com and cache the result on disk.
   *
   * One result file `speedtest_<date>.json` is kept per day in SPEEDTESTS_DIR;
   * when today's file is already readable it is returned immediately and no tab
   * is opened. Otherwise the page is polled every 500ms (at most 100 times)
   * until the test reports completion, the tab is closed and the last reading
   * is saved and returned.
   *
   * @param browser        browser to open a tab in (derived from `page` when omitted)
   * @param page           tab to run the test in (a new one is opened when omitted); closed afterwards
   * @param measureUpload  when false, stop as soon as an upload speed shows up instead of waiting for the upload test to finish
   * @param timeout        ms allowed for loading fast.com and for its speed readout to appear
   * @returns the scraped result object (parsed from the cache file on a cache hit)
   */
  async function speedtest({browser, page, measureUpload=true, timeout=25000}: {browser?: Browser, page?: Page, measureUpload?: boolean, timeout?: number}) {
      // save one speedtest_<date>.json result per day
      const today = versionStrFromDate(new Date(), {withDate: true, withTime: false})
      const SPEEDTEST_PATH = path.join(SPEEDTESTS_DIR, `speedtest_${today}.json`)

      // check if we've already run one today, if so return earlier results and skip running again
      // (done before any tab is opened so that a cache hit cannot leak a blank tab)
      try {
          return JSON.parse(await fs.promises.readFile(SPEEDTEST_PATH, 'utf-8'))
      } catch(err) {
          // otherwise speedtest does not exist yet for today, continue onwards...
      }

      // run a speedtest using fast.com
      browser = browser || await page.browser()
      page = page || await browser.newPage()

      console.log('[🚤] Running Speedtest using Fast.com...'.padEnd(82), prettyPath(SPEEDTEST_PATH))

      let result = null
      try {
          await page.goto('https://fast.com', {timeout, waitUntil: 'domcontentloaded'});
          await page.waitForSelector('#speed-value', {timeout})

          for (let loop_idx = 0; loop_idx < 100; loop_idx++) {
              result = await page.evaluate(() => {
                  const $ = document.querySelector.bind(document);

                  return {
                      downloadSpeed: Number($('#speed-value').textContent),
                      downloadUnit: $('#speed-units').textContent.trim(),
                      downloaded: Number($('#down-mb-value').textContent.trim()),
                      uploadSpeed: Number($('#upload-value').textContent),
                      uploadUnit: $('#upload-units').textContent.trim(),
                      uploaded: Number($('#up-mb-value').textContent.trim()),
                      latency: Number($('#latency-value').textContent.trim()),
                      bufferBloat: Number($('#bufferbloat-value').textContent.trim()),
                      userLocation: $('#user-location').textContent.trim(),
                      userIp: $('#user-ip').textContent.trim(),
                      isDone: Boolean($('#speed-value.succeeded') && $('#upload-value.succeeded')),
                  };
              })

              if (result.isDone || (!measureUpload && result.uploadSpeed)) {
                  break
              }

              await wait(500)
          }
      } finally {
          // always release the tab, even when navigation or scraping failed
          await Promise.allSettled([page.close()])
      }

      // only reached when the polling loop finished without throwing, so a
      // failed run is never cached as today's result
      await Promise.allSettled([overwriteFile(SPEEDTEST_PATH, result)])

      return result
  }
  1066. /******************************************************************************/
  1067. /******************************************************************************/
  // URLs (truncated to 4096 chars by the archive tasks) that must not be archived again.
  // Pre-seeded with blank / browser-internal pages that are never worth archiving.
  const ALREADY_ARCHIVED = new Set([
      '',
      'about:blank',
      'chrome://newtab',
      'chrome://version',
  ])

  // botArchiveTask() exits the process once ALREADY_ARCHIVED grows past this size.
  // The pre-seeded entries above count towards the limit.
  const TASKS_PER_RUN_LIMIT = 200
  /**
   * Archive a single URL in the given tab (used as a puppeteer-cluster task).
   *
   * Steps: set up the snapshot dir/DB and page hooks, navigate to the URL,
   * emulate some human browsing, then run every save* extractor and finally
   * judge the result by the AI QA score written by saveAIQualityAssuranceResult.
   *
   * @param page  tab to archive in; it is reset to about:blank and closed at the end
   * @param data  URL passed in by puppeteer-cluster (used when `url` is empty)
   * @param url   URL to archive
   * @returns null when the URL scheme is ignored or the URL was already archived,
   *          undefined when setup or page load bails out early, true on success
   * @throws a string when the main response is HTTP 429 or the QA score is below 50
   *
   * Side effect: calls process.exit(21) once more than TASKS_PER_RUN_LIMIT URLs
   * are tracked in ALREADY_ARCHIVED.
   */
  async function botArchiveTask({page, data, url=''}) {
      url = url || data  // puppeteer-cluster passes in the url value via the data: arg

      const is_unarchivable_url = URL_SCHEMES_IGNORED.includes(url.split(':')[0])
      const is_already_archived = ALREADY_ARCHIVED.has(url.slice(0, 4096))
      if (is_unarchivable_url || is_already_archived) return null
      ALREADY_ARCHIVED.add(url.slice(0, 4096))

      if (ALREADY_ARCHIVED.size > TASKS_PER_RUN_LIMIT) {
          console.warn('[❌] Hit maximum URLs archived per browser session, exiting to free memory.')
          console.warn(' Run this process again to continue with the next batch...')
          process.exit(21)
      }

      // pull the rejection reasons out of a Promise.allSettled() result list
      const rejectedReasons = (settled: PromiseSettledResult<unknown>[]) =>
          settled
              .filter(result => result.status === 'rejected')
              .map(result => (result as PromiseRejectedResult).reason)

      const browser = await page.browser()
      const client = await page.target().createCDPSession()
      const extensions = await getChromeExtensionsFromCache({browser})
      const browser_version = await browser.version()
      const original_url = url.toString()
      const start_time = (new Date())

      console.log('[0/4]-------------------------------------------------------------------------')
      const snapshot_dir = await setupSnapshotDir({original_url, start_time})
      const snapshot = await setupSnapshotDB({original_url, start_time, snapshot_dir})

      console.log('[1/4]-------------------------------------------------------------------------')
      console.log(`[🪟] Starting page & viewport setup (${browser_version} ${DEFAULT_VIEWPORT.isMobile ? 'mobile' : 'desktop'} ${DEFAULT_VIEWPORT.width}x${DEFAULT_VIEWPORT.height}px)...`)

      const page_state = {
          // global static state
          browser,
          client,
          browser_version,
          extensions,

          // per-page static metadata
          original_url,
          snapshot,
          snapshot_dir,
          start_time: start_time.toISOString(),
          start_ts: Number(start_time),
          version: versionStrFromDate(start_time),

          // per-page mutable archiving state
          main_response: null,
          recorder: null,
          console_log: [],
          traffic_log: {},
          redirects: {},
      }
      page._original_url = original_url

      try {
          // run all page setup functions in parallel
          const results = await Promise.allSettled([
              // loadAuthStorage(page, page_state, { apply: true }),
              startMetadataRecording(page, page_state),
              setupURLRewriting(page, page_state),
              // setupViewport(page, page_state),
              setupModalAutoClosing(page, page_state),
              loadCloudflareCookie(page, page_state),
              startResponseSaving(page, page_state),
              saveYTDLP(page, page_state),
              saveGALLERYDL(page, page_state),
              // saveSourceMaps(page, page_state),
              // TODO: someday setup https://github.com/osnr/TabFS ?
          ]);
          const rejected = rejectedReasons(results)
          if (rejected.length) console.warn('[⚠️] Partial failures during page setup:', rejected);
      } catch(err) {
          // log the error itself: JSON.stringify() of an Error yields "{}" and hides the cause
          console.error('[❌] PAGE SETUP ERROR', err)
          return
      }

      console.log('[2/4]-------------------------------------------------------------------------')

      console.log('[➡️] NAVIGATION[INI]', ANSI.blue + url + ANSI.reset)
      const startrecording_promise = startScreenrecording(page, page_state)
      page_state.main_response = await page.goto(url, {waitUntil: 'load', timeout: 40_000})
      try {
          const results = await Promise.allSettled([
              startrecording_promise,
              page.bringToFront(),
              page.waitForNetworkIdle({concurrency: 0, idleTime: 900, timeout: 20_000}),
          ])
          const rejected = rejectedReasons(results)
          if (rejected.length) console.warn('[⚠️] Parial failures during page load:', rejected)
      } catch(err) {
          console.error('[❌] ERROR DURING PAGE LOAD', err)
          return
      }

      if (page_state.main_response === null) {
          page_state.main_response = await page.waitForResponse(() => true)
      }
      assert(page_state.main_response)
      if (page_state.main_response.status() == 429) {
          throw `[⚠️] Got 429 rate-limit response, skipping this URL for now...`
      }

      // emulate human browsing behavior
      // await disableAnimations(page, page_state);
      await jiggleMouse(page, page_state);
      await solveCaptchas(page, page_state);
      await blockRedirects(page, page_state);
      await scrollDown(page, page_state);
      // await expandComments(page, page_state);
      await submitForm(page, page_state);
      // await blockJSExecution(page, page_state);

      console.log('[3/4]-------------------------------------------------------------------------')

      // stop tampering with page requests & JS / recording metadata / traffic log
      await stopMetadataRecording(page, page_state)

      // do all synchonous archiving steps that need exclusive use of the whole page while doing stuff
      const saveScreenrecording_promise = saveScreenrecording(page, page_state);
      await saveScreenshot(page, page_state);
      await savePDF(page, page_state);

      console.log('[4/4]-------------------------------------------------------------------------')

      // do all async archiving steps that can be run at the same time
      await inlineShadowDOM(page, page_state);
      const results = await Promise.allSettled([
          saveTitle(page, page_state),
          saveSEO(page, page_state),
          saveFavicon(page, page_state),
          saveSSL(page, page_state),
          saveRequests(page, page_state),
          saveRedirects(page, page_state),
          saveHeaders(page, page_state),
          saveRaw(page, page_state),
          saveDOM(page, page_state),
          saveBodyText(page, page_state),
          // savePandoc(page, page_state),
          saveReadability(page, page_state),
          saveAccessibility(page, page_state),
          saveOutlinks(page, page_state),
          // saveAuthStorage(page, page_state),
          saveAIQualityAssuranceResult(page, page_state),
      ]);

      // do all sync archiving steps that require browser extensions at the very end (they are the buggiest)
      // (deliberately not awaited yet: these keep running while the metrics are saved below)
      const bg_results = Promise.allSettled([
          saveScreenrecording_promise,
          saveSinglefile(page, page_state),
          // saveArchiveWebPage(page, page_state),
          // savePocket(page, page_state),
      ])

      const {duration} = await saveMetrics(page, page_state);

      const rejected = rejectedReasons(results)
      if (rejected.length)
          console.warn('[⚠️] Parial failures during archiving:', rejected)

      // Start an interactive REPL here with the `page` instance.
      // https://github.com/berstend/puppeteer-extra/tree/master/packages/puppeteer-extra-plugin-repl
      // await page.repl()
      // await page.browser().repl()

      console.log(`[✅] ${ANSI.blue}Finished archiving in ${duration/1000}s.${ANSI.reset}`)

      try {
          const bg_rejected = rejectedReasons(await bg_results)
          if (bg_rejected.length)
              console.warn('[⚠️] Parial failures during wrap-up tasks:', bg_rejected)

          console.log('[🗑️] Resetting to about:blank to ensure memory is freed...')
          await page.goto('about:blank')
          await page.close()
      } catch(err) {
          console.log(err)
      }

      // symlink the best results from across all the versions/ into the snapshot dir root
      await symlinkBestSnapshotResults(snapshot_dir)

      // display latest version screenshot GIF
      console.log()
      try {
          const latest_version_gif = path.join(snapshot_dir, 'versions', page_state.version, path.basename(SCREENRECORDGIF_PATH(page)))
          const dirent = await blockUntilExists(latest_version_gif, {min_bytes: 100, timeout: 15_000})
          // NOTE(review): hard-coded, machine-specific imgcat location
          const imgcat = child_process.spawn('/Users/squash/.iterm2/imgcat', [dirent.abspath], {stdio: [null, 'inherit', 'inherit']})
          // spawn failures (e.g. the binary is missing) arrive as an async 'error' event rather
          // than a throw; without a listener that event would crash the whole process
          imgcat.on('error', (err) => {
              console.warn('[⚠️] Failed to display screenrecording.gif...', err)
              console.log()
          })
      } catch(err) {
          console.warn('[⚠️] Failed to display screenrecording.gif...', err)
          console.log()
      }

      // determine whether task succeeded or failed based on AI QA score
      const latest_version_aiqa = path.join(snapshot_dir, 'versions', page_state.version, path.basename(AIQA_PATH(page)))
      const qa_results = JSON.parse((await fs.promises.readFile(latest_version_aiqa)).toString())
      if (qa_results.pct_visible < 50) {
          throw `[❌] Task completed with problems, got AI QA score of ${qa_results.pct_visible}%! ${qa_results.warnings.join(', ')} ${qa_results.error_text || ''}`
      } else {
          console.log(`[💫] Task completed succesfully: ${qa_results.pct_visible}% ${qa_results.warnings.join(', ') || ''}`)
          console.log(` Summary: ${(qa_results.main_content_title || qa_results.description || 'No title/description detected').substring(0, 80)}... ${qa_results.main_content_author || ''} ${qa_results.main_content_date || ''}`)
          return true
      }
  }
  1252. async function passiveArchiveTask({browser, page, url}) {
  1253. // archive passively (e.g. a tab that was opened already by a human), without changing the active page
  1254. const is_unarchivable_url = URL_SCHEMES_IGNORED.includes(url.split(':')[0])
  1255. const is_already_archived = ALREADY_ARCHIVED.has(url.slice(0, 4096))
  1256. if (is_unarchivable_url || is_already_archived) return null
  1257. ALREADY_ARCHIVED.add(url.slice(0, 4096))
  1258. // these have to be as early as possible because we're racing with the page load (we might even be too late)
  1259. // jk nevermind, we now re-open a new bg tab for every tab that's created to re-capture the initial request
  1260. // await page.setRequestInterception(true);
  1261. // await page.setCacheEnabled(false);
  1262. const original_url = url.toString()
  1263. const start_time = (new Date())
  1264. const browser_version = await browser.version()
  1265. console.log('------------------------------------------------------------------------------')
  1266. console.log('[➕] Starting archive of new tab opened in driver browser...', await browser.version())
  1267. const snapshot_dir = await setupSnapshotDir({original_url, start_time})
  1268. const snapshot = await setupSnapshotDB({ original_url, start_time, snapshot_dir })
  1269. console.log('------------------------------------------------------------------------------')
  1270. console.log(`[🪟] Starting page & viewport setup (${browser_version} ${DEFAULT_VIEWPORT.isMobile ? 'mobile' : 'desktop'} ${DEFAULT_VIEWPORT.width}x${DEFAULT_VIEWPORT.height}px)...`)
  1271. // create a new page in the background for archiving
  1272. const old_page = page
  1273. page = await browser.newPage()
  1274. await old_page.bringToFront()
  1275. const client = await page.target().createCDPSession()
  1276. const extensions = await getChromeExtensionsFromCache({ browser })
  1277. const page_state = {
  1278. // global static state
  1279. browser,
  1280. client,
  1281. browser_version,
  1282. extensions,
  1283. // per-page static metadata
  1284. original_url,
  1285. snapshot,
  1286. snapshot_dir,
  1287. start_time: start_time.toISOString(),
  1288. start_ts: Number(start_time),
  1289. version: versionStrFromDate(start_time),
  1290. // per-page mutable archiving state
  1291. main_response: null,
  1292. recorder: null,
  1293. console_log: [],
  1294. traffic_log: {},
  1295. redirects: {},
  1296. }
  1297. page._original_url = original_url
  1298. try {
  1299. // run all page setup functions in parallel
  1300. const results = await Promise.allSettled([
  1301. // loadAuthStorage(page, page_state, {apply: true}),
  1302. startMetadataRecording(page, page_state),
  1303. setupURLRewriting(page, page_state),
  1304. startResponseSaving(page, page_state),
  1305. saveYTDLP(page, page_state),
  1306. saveGALLERYDL(page, page_state),
  1307. // saveSourceMaps(page, page_state),
  1308. ]);
  1309. const rejected = results
  1310. .filter(result => result.status === 'rejected')
  1311. .map(result => (result as PromiseRejectedResult).reason)
  1312. if (rejected.length) console.warn('[⚠️] Parial failures during page setup:', rejected)
  1313. } catch(err) {
  1314. console.warn('[❌] ERROR DURING PAGE SETUP', JSON.stringify(err, null, 4))
  1315. return
  1316. }
  1317. // load the url in the background page, then switch to it once its loaded and close the original tab
  1318. console.log('[➡️] NAVIGATION[INI]', ANSI.blue + url + ANSI.reset)
  1319. const startrecording_promise = startScreenrecording(page, page_state)
  1320. page_state.main_response = await page.goto(url, {waitUntil: 'load', timeout: 40_000})
  1321. // for debugging
  1322. globalThis.page = page
  1323. globalThis.page_state = page_state
  1324. // start loading the page, start screenrecording, close the old page, and wait for loading to finish (all at once, fine for these to race)
  1325. try {
  1326. const results = await Promise.allSettled([
  1327. startrecording_promise,
  1328. page.bringToFront(),
  1329. old_page.close(),
  1330. page.waitForNetworkIdle({concurrency: 0, idleTime: 900, timeout: 20_000}),
  1331. ])
  1332. const rejected = results
  1333. .filter(result => result.status === 'rejected')
  1334. .map(result => (result as PromiseRejectedResult).reason)
  1335. if (rejected.length) console.warn('[⚠️] Parial failures during [age load:', rejected)
  1336. } catch(err) {
  1337. console.warn('[❌] ERROR DURING PAGE LOAD', JSON.stringify(err, null, 4))
  1338. return
  1339. }
  1340. if (page_state.main_response === null) {
  1341. page_state.main_response = await page.waitForResponse(() => true)
  1342. }
  1343. assert(page_state.main_response)
  1344. if (page_state.main_response.status() == 429) {
  1345. throw `[⚠️] Got 429 rate-limit response, skipping this URL for now...`
  1346. }
  1347. // resume page if paused by waitForDebuggerOnStart/dev tools debugger/backgrounding
  1348. try {
  1349. await client.send('Page.enable');
  1350. await client.send('Page.setWebLifecycleState', {state: 'active'});
  1351. await client.send('Runtime.runIfWaitingForDebugger')
  1352. } catch(err) { /* console.warn(err) */ }
  1353. // wait a couple seconds for page to finish loading
  1354. await wait(5_000)
  1355. // emulate human browsing behavior
  1356. // await disableAnimations(page, page_state);
  1357. // await jiggleMouse(page, page_state);
  1358. await solveCaptchas(page, page_state);
  1359. // await blockRedirects(page, page_state);
  1360. // await scrollDown(page, page_state);
  1361. // await expandComments(page, page_state);
  1362. await submitForm(page, page_state);
  1363. // await blockJSExecution(page, page_state);
  1364. await stopMetadataRecording(page, page_state) // stop tampering with page requests & JS
  1365. console.log('[3/4]-------------------------------------------------------------------------')
  1366. // do all synchonous archiving steps that need exclusive use of the whole page while doing stuff
  1367. const saveScreenrecording_promise = saveScreenrecording(page, page_state);
  1368. await saveScreenshot(page, page_state);
  1369. await savePDF(page, page_state);
  1370. console.log('[4/4]-------------------------------------------------------------------------')
  1371. // do all async archiving steps that can be run at the same time
  1372. await inlineShadowDOM(page, page_state);
  1373. const results = await Promise.allSettled([
  1374. saveTitle(page, page_state),
  1375. saveSEO(page, page_state),
  1376. saveFavicon(page, page_state),
  1377. saveSSL(page, page_state),
  1378. saveRequests(page, page_state),
  1379. saveRedirects(page, page_state),
  1380. saveHeaders(page, page_state),
  1381. saveRaw(page, page_state),
  1382. saveDOM(page, page_state),
  1383. saveBodyText(page, page_state),
  1384. // savePandoc(page, page_state),
  1385. saveReadability(page, page_state),
  1386. saveAccessibility(page, page_state),
  1387. saveOutlinks(page, page_state),
  1388. // saveAuthStorage(page, page_state),
  1389. saveAIQualityAssuranceResult(page, page_state),
  1390. ]);
  1391. // do all sync archiving steps that require browser extensions at the very end (they are the buggiest)
  1392. const bg_results = Promise.allSettled([
  1393. saveScreenrecording_promise,
  1394. saveSinglefile(page, page_state),
  1395. // saveArchiveWebPage(page, page_state),
  1396. // savePocket(page, page_state),
  1397. ])
  1398. const {duration} = await saveMetrics(page, page_state);
  1399. const rejected = results
  1400. .filter(result => result.status === 'rejected')
  1401. .map(result => (result as PromiseRejectedResult).reason)
  1402. if (rejected.length)
  1403. console.warn('[⚠️] Parial failures during page archiving:', rejected)
  1404. // Start an interactive REPL here with the `page` instance.
  1405. // https://github.com/berstend/puppeteer-extra/tree/master/packages/puppeteer-extra-plugin-repl
  1406. // await page.repl()
  1407. // await page.browser().repl()
  1408. console.log(`[✅] Finished archiving in ${duration/1000}s.`,)
  1409. // await page.tracing.stop();
  1410. try {
  1411. const rejected = (await bg_results)
  1412. .filter(result => result.status === 'rejected')
  1413. .map(result => (result as PromiseRejectedResult).reason)
  1414. if (rejected.length)
  1415. console.warn('[⚠️] Parial failures during page wrap-up tasks:', rejected)
  1416. } catch(err) {
  1417. console.log(err)
  1418. }
  1419. await symlinkBestSnapshotResults(snapshot_dir)
  1420. }
  1421. /******************************************************************************/
  1422. /************************* Page Setup Tasks ***********************************/
  /**
   * Prepare the archive/<snap_id>/ output folder for a new snapshot run.
   *
   * Picks the target dir (explicit `snapshot_dir`, otherwise `TASK_PATH(original_url)`),
   * refuses to continue when that path is not one of the two recognized locations or when
   * the *other* recognized location already exists on disk, then:
   *   mkdir -p ./versions, chdir into the snapshot dir, clear stale root symlinks,
   *   move loose files from a prior run into ./versions/<versionid>/, update the indexes,
   *   and assert the dir is valid and empty.
   *
   * Side effect: changes the process working directory to the snapshot dir.
   *
   * @param {object} options
   * @param {string} options.original_url - URL being archived
   * @param {*} options.start_time - forwarded to updateSnapshotDirIndexes()
   * @param {string|null} [options.snapshot_dir=null] - explicit output dir, if already known
   * @returns {Promise<string>} the snapshot dir that was set up
   * @throws {string} when a potential duplicate dir exists or the path is not recognized
   */
  async function setupSnapshotDir({original_url, start_time, snapshot_dir=null}) {
      const snap_dir = snapshot_dir || TASK_PATH(original_url)

      console.log()
      console.log()
      console.log(ANSI.blue + original_url + ANSI.reset)
      console.log(ANSI.black + snap_dir + ANSI.reset)
      console.log()
      console.log('[📂] Setting up Snapshot output directory...'.padEnd(82), prettyPath(snap_dir))

      // the two places a snapshot for this URL is allowed to live:
      //   - the legacy path, e.g. ./data/archive/1999999999.1723425
      //   - the dir already recorded for this URL in SNAPSHOT_DIRS_BY_URL
      const hacky_dir = path.join(ARCHIVE_DIR, `1999999999.${hashCode(original_url)}`)
      const known_dir = SNAPSHOT_DIRS_BY_URL[original_url]
      const known_dir_exists = fs.existsSync(known_dir)
      const hacky_dir_exists = fs.existsSync(hacky_dir)

      const duplicateError = (other_dir) =>
          `Tried to create snapshot in ${snap_dir} but potential duplicate exists in ${other_dir}!`

      const targets_hacky_dir = (snap_dir == hacky_dir)
      const targets_known_dir = (snap_dir == known_dir)

      if (targets_hacky_dir) {
          // writing to the legacy path is only ok if the known dir is not also present
          if (known_dir_exists) throw duplicateError(known_dir)
      } else if (targets_known_dir) {
          // writing to the known dir is only ok if the legacy path is not also present
          if (hacky_dir_exists) throw duplicateError(hacky_dir)
      } else {
          // any other path is always rejected, the message depends on what exists on disk
          if (known_dir_exists) throw duplicateError(known_dir)
          if (hacky_dir_exists) throw duplicateError(hacky_dir)
          throw `Tried to create snapshot in ${snap_dir} but its not a recognized snapshot dir path:\n - ${known_dir}\n - ${hacky_dir}`
      }

      // mkdir -p ./data/archive/<snap_id>/versions && cd ./data/archive/<snap_id>
      const versions_dir = path.join(snap_dir, 'versions')
      await fs.promises.mkdir(versions_dir, {recursive: true})
      process.chdir(snap_dir)

      // 1. drop root-level symlinks pointing at older ./versions/<versionid>/* files
      await clearSnapshotDirSymlinks(snap_dir)
      // 2. move loose output files from any prior run into ./versions/<versionid>/*
      await collectSnapshotDirVersionFiles(snap_dir)
      // 3. make sure /data/indexes/<index_name>/* reference this snapshot dir
      await updateSnapshotDirIndexes(snap_dir, {original_url, start_time})
      // 4. the dir must now be valid and empty, ready to receive new files
      await assertSnapshotDirIsValid(snap_dir, {is_empty: true})

      return snap_dir
  }
  // Registry of snapshot indexes.
  // ./index/<index_name> : index_getter(page_state) => "<index_key_str>"
  const INDEXES = {
      // bucket by the start_time of the run (date only, no time component requested)
      snapshots_by_day: (page_state) => {
          const {start_time} = page_state
          return versionStrFromDate(start_time, {withDate: true, withTime: false})
      },
      // bucket by hostname of the archived URL (hostname does not include :port)
      snapshots_by_domain: (page_state) => {
          const {hostname} = new URL(page_state.original_url)
          return hostname || ''
      },
  }
  /**
   * Link a snapshot dir into every configured index, one index at a time.
   *
   * @param {string} snap_dir - snapshot dir to reference from the indexes
   * @param {object} page_state - passed to each index key getter (via indexSnapshotDir)
   * @param {object} [indexes=INDEXES] - map of index_name => index_key_getter(page_state)
   * @param {string} [indexes_dir=INDEXES_DIR] - root dir holding all indexes
   * @returns {Promise<void>}
   */
  async function updateSnapshotDirIndexes(snap_dir, page_state, indexes=INDEXES, indexes_dir=INDEXES_DIR) {
      assert(indexes)

      const index_names = Object.keys(indexes)
      console.log(`[🔎] Linking Snapshot in indexes (${index_names.join(', ')})...`)

      // sequential on purpose: each index is fully linked before the next one starts
      for (const index_name of index_names) {
          const index_key_getter = indexes[index_name]
          await indexSnapshotDir(snap_dir, {index_name, index_key_getter, indexes_dir}, page_state)
      }
  }
  /**
   * Place a symlink to this snapshot inside one index:
   *   <indexes_dir>/<index_name>/<index_key>/<snap_id> -> <snap_dir>
   * e.g. /data/index/snapshots_by_day/20010131/19999999.23423523 -> ./archive/19999999.23423523
   *
   * All inputs are validated BEFORE anything is created on disk, so a bad getter or
   * missing page_state no longer leaves empty index folders behind. A getter that returns
   * null/undefined is rejected instead of being filed under a literal "undefined"/"null" key.
   *
   * @param {string} snap_dir - snapshot dir the index entry should point to
   * @param {object} options
   * @param {string} options.index_name - name of the index, e.g. "snapshots_by_day"
   * @param {function} options.index_key_getter - (page_state) => index key, e.g. "20010131" or "example.com"
   * @param {string} [options.indexes_dir=INDEXES_DIR] - root dir holding all indexes
   * @param {object} page_state - passed to index_key_getter
   * @returns {Promise<void>}
   * @throws AssertionError when any input is missing or the computed key / snapshot id is empty
   */
  async function indexSnapshotDir(snap_dir, {index_name, index_key_getter, indexes_dir=INDEXES_DIR}, page_state) {
      assert(index_name && index_key_getter)
      assert(page_state)

      // calculate the index key, e.g. "20010131" or "example.com"
      const raw_index_key = index_key_getter(page_state)
      assert(
          raw_index_key !== undefined && raw_index_key !== null,
          `Index ${index_name} did not produce a key for snapshot ${snap_dir}`,
      )
      const index_key = String(raw_index_key)
      assert(index_key)

      const snap_id = path.parse(snap_dir).base                   // '19999999.23423523'
      assert(snap_id)

      // mkdir -p /data/index/snapshots_by_day/20010131 (also creates the index dir itself)
      const index_entries_dir = path.join(indexes_dir, index_name, index_key)
      await fs.promises.mkdir(index_entries_dir, {recursive: true})

      // index/snapshots_by_day/<YYYYMMDD>/<snap id> -> ./archive/<snap_id>
      const symlink_path = path.join(index_entries_dir, snap_id)
      await overwriteSymlink(snap_dir, symlink_path, {relative: true, mkdirs: false})
  }
  /**
   * Move loose archive/<id>/* output files from a previous run into a dated
   * archive/<id>/versions/<versionid>/ folder.
   *
   * The version folder name comes from the previous run's metrics.json:
   *   1. its VERSION field, if present
   *   2. otherwise versionStrFromDate(start_time), if start_time is present
   *   3. otherwise the unix-epoch version '19700101000000' (missing/broken/empty metrics.json)
   *
   * Hidden entries, the versions/ folder itself, symlinks, and anything that is not a
   * regular file or directory are left in place. An entry whose name already exists in
   * the destination version folder is NOT moved or deleted; if the two copies differ
   * (by sha256 from getDirInfo) both summaries are printed as a warning.
   *
   * @param {string} snap_dir - snapshot dir to tidy up
   * @returns {Promise<void>}
   */
  async function collectSnapshotDirVersionFiles(snap_dir) {
      const snap_id = snap_dir.split('/archive/').at(-1)
      const existing_metrics = path.join(snap_dir, 'metrics.json')

      // detect start time / version info from the previous result's metrics.json,
      // keeping the epoch fallbacks unless the file actually provides one of the fields
      let start_time = '1970-01-01T00:00:00.000Z'
      let VERSION = '19700101000000'
      try {
          const metrics = JSON.parse(await fs.promises.readFile(existing_metrics, 'utf-8'))
          if (metrics.VERSION || metrics.start_time) {
              VERSION = metrics.VERSION
              start_time = metrics.start_time
          }
      } catch(err) {
          // continue normally, overwriting existing files is fine if they're broken to begin with
      }

      // create new version folder based on metrics.json (or epoch time as fallback for legacy output)
      const version_dir_name = VERSION || versionStrFromDate(start_time)
      const version_dir = path.join(snap_dir, 'versions', version_dir_name)
      await fs.promises.mkdir(version_dir, {recursive: true})

      // find all result files in the snapshot_dir root that should be versioned
      const existing_snapshot_files =
          (await fs.promises.readdir(snap_dir, {withFileTypes: true}))
              .filter(dirent => {
                  if (dirent.name.startsWith('.')) return false      // ignore hidden files, dont version them
                  if (dirent.name == 'versions') return false        // dont try to move versions folder into itself
                  if (dirent.isSymbolicLink()) return false          // skip existing symbolic links
                  return (dirent.isFile() || dirent.isDirectory())   // dont try to version sockets/FIFOs/devs etc.
              })

      if (existing_snapshot_files.length) {
          // log the folder actually used (version_dir_name), which may differ from VERSION
          console.log(`[📅] Moving snapshot results into version dir: ./data/archive/${snap_id}/* ->`.padEnd(82), `./data/archive/${snap_id}/versions/${version_dir_name}/`)
      }

      const snapshot_files = await getDirInfo(snap_dir, {withRoot: false, filter: ({relpath}) => !relpath.startsWith('versions')})
      const version_files = await getDirInfo(version_dir, {withRoot: false})

      for (const {name} of existing_snapshot_files) {
          const snapdir_entry_abspath = path.join(snap_dir, name)
          const versioned_entry_abspath = path.join(version_dir, name)

          const snapshot_entry = snapshot_files[name]
          const version_entry = version_files[name]

          if (snapshot_entry && version_entry) {
              // a conflicting file/dir already exists in the destination path: leave both alone.
              // identical copies are harmless; differing copies are reported so they can be
              // resolved by hand (TODO: merge them or keep the larger one automatically)
              if (snapshot_entry.sha256 != version_entry.sha256) {
                  console.warn(' ', snapshot_entry.summary)
                  console.warn(' ', version_entry.summary)
              }
              continue
          }

          // mv ./data/archive/<snap_id>/example.txt -> ./data/archive/<snap_id>/versions/<version_id>/example.txt
          await fs.promises.rename(snapdir_entry_abspath, versioned_entry_abspath)
          console.log(` ↣ ${prettyPath(snapdir_entry_abspath)} ->`.padEnd(82), prettyPath(versioned_entry_abspath))
      }
  }
  1563. // Extractor definition
  1564. // {
  1565. // phase: setup | load | sync1 | async1 | sync2 | close
  1566. // name: 'media' | 'photos', 'wget', 'singlefile'
  1567. //
  1568. // shouldRun(page, page_state)
  1569. // pageSetup
  1570. // pageLoad
  1571. // pageInteraction clicking around/scrolling
  1572. // archivePhase1 sync
  1573. // archivePhase2 async
  1574. // archivePhase3 async
  1575. // pageClose
  1576. // execute(page, page_state)
  1577. // validateResult(page, page_state)
  1578. // }
  /**
   * Delete every non-hidden symlink in the root of archive/<id>/ so that new snapshot
   * output can be written there.
   *
   * If symlinks are not cleared before starting, outputs can end up written into previous
   * version folders, e.g. the screenrecording saves to ./media which could still be pointing
   * at a previous run's ./versions/<olddate>/media.
   *
   * @param {string} snap_dir - snapshot dir whose root-level symlinks should be removed
   * @returns {Promise<void>}
   */
  async function clearSnapshotDirSymlinks(snap_dir) {
      const root_entries = await fs.promises.readdir(snap_dir, {withFileTypes: true})

      const stale_symlinks = []
      for (const dirent of root_entries) {
          const is_hidden = dirent.name.startsWith('.')          // hidden entries are never touched
          const is_versions = (dirent.name == 'versions')        // the versions folder is never touched
          if (is_hidden || is_versions) continue
          if (dirent.isSymbolicLink()) stale_symlinks.push(dirent.name)
      }

      // unlink removes only the link itself, never the file it points to
      for (const symlink_name of stale_symlinks) {
          await fs.promises.unlink(path.join(snap_dir, symlink_name))
      }
  }
  /**
   * Finalize a snapshot dir after a run: move loose output into versions/<versionid>/,
   * then rebuild the root-level symlinks so each filename points at one versioned copy.
   *
   * Version folders are visited oldest -> newest (lexicographic sort of their names).
   * For every filename, the root symlink is created on first sight; when a later version
   * has the same filename, getLargestPath() decides which copy the symlink should point to
   * (largest file = most likely to be highest quality capture data).
   *
   * Side effect: changes the process working directory to the snapshot dir.
   *
   * @param {string} snap_dir - snapshot dir to finalize
   * @returns {Promise<string>} snap_dir
   * @throws {string} when a non-symlink entry is found in the snapshot root
   */
  async function symlinkBestSnapshotResults(snap_dir) {
      const versions_root = path.join(snap_dir, 'versions')
      await fs.promises.mkdir(versions_root, {recursive: true})
      process.chdir(snap_dir)

      // move output files into versioned folder
      await collectSnapshotDirVersionFiles(snap_dir)
      // clear any existing symlinks
      await clearSnapshotDirSymlinks(snap_dir)
      // assert task dir is empty and contains no bare files that might get overwritten, also asserts version dirs are valid
      await assertSnapshotDirIsValid(snap_dir, {is_empty: true})

      const version_names = (await fs.promises.readdir(versions_root)).sort()   // earliest to latest

      for (const version_name of version_names) {
          if (version_name.startsWith('.')) continue

          const version_abspath = path.join(snap_dir, 'versions', version_name)
          const visible_filenames = (await fs.promises.readdir(version_abspath))
              .filter(filename => !filename.startsWith('.'))

          for (const filename of visible_filenames) {
              const root_entry = path.join(snap_dir, filename)              // ./data/archive/<snapid>/filename
              const candidate = path.join(version_abspath, filename)        // ./data/archive/<snapid>/versions/<versionid>/filename

              // nothing in the snapshot root for this filename yet: link straight to this version's copy
              if (!fs.existsSync(root_entry)) {
                  await overwriteSymlink(candidate, root_entry, {search_limit: snap_dir})
                  continue
              }

              // something is already there, it must be a symlink we created earlier in this loop
              const root_entry_stat = await fs.promises.lstat(root_entry)
              if (!root_entry_stat.isSymbolicLink()) {
                  // clearSnapshotDirSymlinks() should have already cleared these files out!
                  throw `Non-symlink file found in root of snapshot dir! Refusing to overwrite: ${prettyPath(root_entry)}`
              }

              // already pointing at this version's copy: nothing to change
              const linked_abspath = await fs.promises.realpath(root_entry)
              if (linked_abspath == candidate) continue

              // keep whichever copy getLargestPath() prefers
              const largest_path = await getLargestPath(linked_abspath, candidate)
              if (largest_path == (await fs.promises.realpath(linked_abspath))) continue

              const larger_version = path.basename(path.dirname(largest_path))
              const larger_abspath = path.join(snap_dir, 'versions', larger_version, filename)
              await overwriteSymlink(larger_abspath, root_entry, {search_limit: snap_dir})
          }
      }

      return snap_dir
  }
  /**
   * Assert that archive/<snapshot_id>/ is in a consistent state, then write a file listing
   * (from getDirInfo) to <snap_dir>/.files.json.
   *
   * Checks, in order:
   *   - ./versions exists and is not a symlink
   *   - if `is_empty`: the root holds nothing except hidden entries and ./versions
   *   - every remaining root entry is a symlink whose target exists
   *   - every non-hidden entry in ./versions passes assertVersionDirIsValid()
   *
   * NOTE: the root-entry filter previously never returned a truthy value, so the entry
   * list was always empty and the `is_empty` / symlink checks could never fail. The filter
   * now keeps every non-hidden, non-"versions" entry, so those checks are enforced.
   *
   * Side effect: changes the process working directory to the snapshot dir.
   *
   * @param {string} snap_dir - snapshot dir to validate
   * @param {object} [options]
   * @param {boolean} [options.is_empty=false] - also require that no root entries exist
   * @returns {Promise<void>}
   * @throws AssertionError when any check fails
   */
  async function assertSnapshotDirIsValid(snap_dir, {is_empty=false}={}) {
      process.chdir(snap_dir)
      console.log()
      console.log(`[☑️] Checking that snapshot records are valid...`)

      // get all visible directory entries in archive/<snapshot_id>/* other than ./versions
      const snapshot_dir_entries =
          (await fs.promises.readdir(snap_dir, {withFileTypes: true}))
              .filter(dirent => !dirent.name.startsWith('.') && dirent.name != 'versions')

      // assert versions folder exists and is not a symbolic link
      const versions_dir = path.join(snap_dir, 'versions')
      assert(fs.existsSync(versions_dir))
      assert(!(await fs.promises.lstat(versions_dir)).isSymbolicLink())

      // if it should be empty, check that no loose files exist
      if (is_empty) {
          assert(!snapshot_dir_entries.length, `Found loose files in snapshot-dir that shouldn't be there! ${snap_dir}`)
      }

      // assert all non-hidden files in snapshot dir are symbolic links to actual data in versions/<date>/*
      for (const snapshot_dir_entry of snapshot_dir_entries) {
          // resolve against snap_dir explicitly instead of relying on the current working directory
          const entry_abspath = path.join(snap_dir, snapshot_dir_entry.name)
          assert(snapshot_dir_entry.isSymbolicLink(), `Found non-symbolic link in root of snapshot dir! ${snap_dir}/${snapshot_dir_entry.name}`)
          // existsSync follows the link, so a dangling symlink reports false here
          assert(fs.existsSync(entry_abspath), `Found broken symbolic link in root of snapshot dir! ${snap_dir}/${snapshot_dir_entry.name}`)
      }

      const version_entries = (
          (await fs.promises.readdir(versions_dir))
              .filter(foldername => !foldername.startsWith('.'))
              .sort())

      console.log(` √ ${prettyPath(versions_dir)}`, version_entries.length)

      for (const version_dir of version_entries) {
          await assertVersionDirIsValid(path.join(versions_dir, version_dir))
      }

      // write snapshot dir file listing w/ sizes & hashes to .files.json
      const directory_info = await getDirInfo(snap_dir, {withRoot: true, withHelpers: false, maxdepth: 3})
      await overwriteFile(path.join(snap_dir, '.files.json'), directory_info)
  }
  /**
   * Assert that one archive/<snapshot_id>/versions/<version_id>/ folder is well-formed, then
   * write its file listing (from getDirInfo) to <version_dir>/.files.json.
   *
   * Checks, in order:
   *   - the path exists and is a real directory (not a symlink)
   *   - the folder name is exactly 14 digits, starts with '2' or equals the unix-epoch
   *     version '19700101000000', and parseVersionDateStr() returns a truthy value for it
   *   - no visible entry inside is named "versions" or is a symlink
   *
   * The full folder name (path.parse().base) is validated. Using `.name` would drop
   * anything after the last dot and let e.g. "20251231235959.bak" pass as valid.
   *
   * @param {string} version_dir - path of the version folder to validate
   * @returns {Promise<void>}
   * @throws AssertionError when any check fails
   */
  async function assertVersionDirIsValid(version_dir) {
      const dirname = path.parse(version_dir).base

      assert(fs.existsSync(version_dir), `Version dir does not exist: ${prettyPath(version_dir)}`)

      // lstat (not stat) so that a symlink to a directory is detected rather than followed
      const version_dir_stat = await fs.promises.lstat(version_dir)
      assert(version_dir_stat.isDirectory() && !version_dir_stat.isSymbolicLink(), `Found non-directory in versions dir! ${prettyPath(version_dir)}`)

      const unix_epoch = '19700101000000'
      const is_name_valid_datestr = /^\d+$/.test(dirname) && (dirname.length == 14) && (dirname.startsWith('2') || dirname == unix_epoch) && parseVersionDateStr(dirname)
      assert(is_name_valid_datestr, `Version directories must be a 14-character long date string like 20251231235959! ${dirname}`)

      // get all visible directory entries in archive/<snapshot_id>/versions/<version_id>/*
      const version_dir_entries = (
          (await fs.promises.readdir(version_dir, {withFileTypes: true}))
              .filter((dirent) => !dirent.name.startsWith('.')))

      // assert version dir contains only actual snapshot output files (not-symbolic links or other version dirs)
      for (const version_dir_entry of version_dir_entries) {
          assert(version_dir_entry.name != 'versions', `Version dir cannot contain another versions folder! ${prettyPath(version_dir)}/versions`)
          assert(!version_dir_entry.isSymbolicLink(), `Version dir cannot contain symbolic link! ${prettyPath(version_dir)}/${version_dir_entry.name}`)
      }

      // color highlight the unix epoch version in black, and any version created today in blue
      let pretty_dirname = dirname
      if (dirname == unix_epoch) {
          pretty_dirname = ANSI.black + unix_epoch + ANSI.reset
      }
      const today = versionStrFromDate(new Date(), {withDate: true, withTime: false})
      if (dirname.startsWith(today)) {
          pretty_dirname = ANSI.blue + dirname + ANSI.reset
      }

      // write version dir file listing w/ sizes & hashes to .files.json
      const directory_info = await getDirInfo(version_dir, { withRoot: true, withHelpers: false, maxdepth: 3 })
      await overwriteFile(path.join(version_dir, '.files.json'), directory_info)

      console.log(` √ ./versions/${pretty_dirname} contains`, version_dir_entries.length, 'results')
  }
  /**
   * Look up the Snapshot database row for this URL.
   *
   * Tries an exact match on {url, timestamp} first (timestamp = last path segment of
   * snapshot_dir), then falls back to matching on url alone. Nothing is written to the DB:
   * when no row is found, a "Creating new DB Snapshot" line is logged but creation itself
   * is currently disabled, so the result of the second lookup is returned as-is.
   *
   * @param {object} options
   * @param {string} options.original_url - URL being archived
   * @param {*} options.start_time - currently unused
   * @param {string} options.snapshot_dir - snapshot dir, its last '/' segment is the timestamp
   * @returns {Promise<*>} the matching Snapshot row, or the empty result of the last lookup
   */
  async function setupSnapshotDB({ original_url, start_time, snapshot_dir }) {
      const timestamp = snapshot_dir.split('/').at(-1)

      // 1. exact match on url + timestamp
      const exact_match = await Snapshot.findOne({ where: { url: original_url, timestamp } });
      if (exact_match) return exact_match

      // 2. any row for this url, even if its timestamp differs from the dir on disk
      const url_match = await Snapshot.findOne({ where: {url: original_url} });
      if (!url_match) {
          console.log(`[+] Creating new DB Snapshot [${timestamp}](${original_url.substring(0, 30)}...) for ${prettyPath(snapshot_dir)}...`)
          // row creation is intentionally disabled here:
          // ;([snapshot, created] = await Snapshot.findOrCreate({where: search_attrs, defaults: update_attrs }));
      }

      return url_match
  }
  /**
   * Apply the default browsing environment to a page: viewport, geolocation, default
   * timeout, media-feature emulation, extra HTTP headers, and a bot-sniffing detector.
   *
   * The referrer is picked deterministically from DEFAULT_REFERRERS using hashCode(page.url()).
   * The index is normalized into [0, length) so a negative hash can no longer select an
   * undefined entry, and the header is skipped entirely when no referrer is available.
   *
   * @param {object} page - puppeteer page to configure
   * @param {object} _page_state - unused
   * @returns {Promise<object>} the same page
   */
  async function setupViewport(page, _page_state) {
      // setup viewport
      await page.setViewport(DEFAULT_VIEWPORT);
      await page.setGeolocation(DEFAULT_GEOLOCATION);
      // await page.setBypassCSP(true); // bypass CSP restrictions (requires --disable-web-security)
      page.setDefaultTimeout(DEFAULT_TIMEOUT);

      // Optional: emulate a mobile device
      // await page.emulate(puppeteer.devices['iPhone 6']);

      // Configure light mode/dark mode & accessibility reduced motion preferences
      await page.emulateMediaFeatures([
          {name: 'prefers-color-scheme', value: DEFAULT_COLOR_SCHEME},
          {name: 'prefers-reduced-motion', value: 'reduce'},
      ]);

      // Setup headers & deterministically choose a referrer based on URL
      const extra_headers = {...DEFAULT_HEADERS}
      const num_referrers = DEFAULT_REFERRERS.length
      if (num_referrers > 0) {
          // JS `%` keeps the sign of the left operand, so fold negative results back into range
          const url_hash = hashCode(await page.url())
          const rand_idx = ((url_hash % num_referrers) + num_referrers) % num_referrers
          const chosen_referrer = DEFAULT_REFERRERS[rand_idx]
          if (chosen_referrer) {
              // NOTE(review): the standard HTTP request header is spelled "Referer"; a header named
              // "referrer" is presumably ignored by servers. Left unchanged because switching it would
              // alter every request this page makes. Confirm intent before renaming.
              extra_headers.referrer = chosen_referrer
          }
      }
      await page.setExtraHTTPHeaders(extra_headers)

      // Setup alert to trigger if site tries to sniff whether we are a bot
      function sniffDetector() {
          const userAgent = window.navigator.userAgent;
          const platform = window.navigator.platform;
          // @ts-ignore
          window.navigator.__defineGetter__('userAgent', function () {
              // @ts-ignore
              window.navigator.sniffed = true;
              return userAgent;
          });
          // @ts-ignore
          window.navigator.__defineGetter__('platform', function () {
              // @ts-ignore
              window.navigator.sniffed = true;
              return platform;
          });
      }
      await page.evaluateOnNewDocument(sniffDetector);

      // NOTE(review): evaluateOnNewDocument only installs the detector for documents created after
      // this call, so this reads the flag from whatever document is currently loaded. It looks like
      // this check would need to run again after navigation to be meaningful. Confirm.
      // @ts-ignore
      const was_sniffed = await page.evaluate(() => (!!window.navigator.sniffed))
      if (was_sniffed) {
          console.warn('[⚠️] Site tried to sniff if we are a bot! Site may be difficult to archive.')
      }

      return page
  }
  /**
   * Automatically accept any alert/confirm/prompt dialog the page opens, after a short delay.
   *
   * dialog.accept() is awaited inside the timer so that both a synchronous throw and an
   * asynchronous rejection are swallowed (a bare try/catch around the un-awaited call only
   * catches the former and lets the latter surface as an unhandled rejection). The dialog
   * listener is detached by reference when the page closes, so other 'dialog' listeners on
   * the same page are not affected.
   *
   * @param {object} page - puppeteer page to watch for dialogs
   * @param {object} page_state - unused
   * @param {object} [options]
   * @param {number} [options.timeout=1250] - milliseconds to wait before accepting a dialog
   * @returns {Promise<void>}
   */
  async function setupModalAutoClosing(page, page_state, {timeout=1_250}={}) {
      const closeDialog = (dialog) => {
          console.log(`[👆] Auto-closing modal that popped up: ${dialog.message()}...`)
          setTimeout(async () => {
              try {
                  await dialog.accept()
              } catch(err) {
                  // best-effort: accepting can fail once the dialog is gone, nothing left to do then
              }
          }, timeout);
      }
      page.on('dialog', closeDialog)

      // if you expect a file-upload dialog, use this to catch it instead:
      // const [fileChooser] = await Promise.all([
      //     page.waitForFileChooser(),
      // ]);
      // await fileChooser.accept(['/tmp/myfile.pdf']);

      page.on('close', () => {
          try {
              page.off('dialog', closeDialog)
          } catch(err) {}
      })
  }
  /**
   * Start a screen recording of the page using puppeteer-screen-recorder and make sure it
   * gets saved when the page closes.
   *
   * The output path is resolved once up front, so the folder that gets created and the file
   * that gets recorded always match. A failure while saving on page close is caught and
   * logged instead of escaping as an unhandled rejection from the event listener.
   *
   * Other recording approaches that were considered:
   *   - puppeteer's own screencast API: https://pptr.dev/api/puppeteer.page.screencast
   *       const recorder = await page.screencast({path: SCREENRECORDING_PATH(page)});
   *   - puppeteer-stream for .webm/.mp4 recordings with tab audio included; works sometimes
   *     but has a few issues, e.g.: https://github.com/SamuelScheit/puppeteer-stream/issues/8
   *   - puppeteer-screen-recorder (used here): most compatible/stable but doesn't include tab audio output
   *
   * @param {object} page - puppeteer page to record
   * @param {object} page_state - receives the recorder instance as page_state.recorder
   * @param {object} [options]
   * @param {number} [options.duration_limit=60] - passed to the recorder as recordDurationLimit
   * @param {string} [options.codec='libx264'] - passed to the recorder as videoCodec
   * @returns {Promise<object>} page_state
   */
  async function startScreenrecording(page, page_state, {duration_limit=60, codec='libx264'}={}) {
      const recording_path = SCREENRECORDING_PATH(page)
      await fs.promises.mkdir(path.dirname(recording_path), {recursive: true})
      // console.log(`[🎬] Starting screen-recording stream...`.padEnd(82), prettyPath(recording_path))

      const recorder = new PuppeteerScreenRecorder(page, {
          followNewTab: false,
          recordDurationLimit: duration_limit,
          // other options left at the recorder's defaults:
          // fps: 25,
          // ffmpeg_Path: '<path of ffmpeg_path>' || null,
          // videoFrame: {
          //     width: 1024,
          //     height: 768,
          // },
          // videoCrf: 18,
          videoCodec: codec,
          // videoPreset: 'ultrafast',
          // videoBitrate: 1000,
          // autopad: {
          //     color: 'black' | '#35A5FF',
          // },
          // aspectRatio: '4:3',
      });
      page_state.recorder = recorder
      await recorder.start(recording_path)

      // nobody awaits an event listener, so errors have to be handled right here
      page.on('close', async () => {
          try {
              await saveScreenrecording(page, page_state)
          } catch(err) {
              console.warn('[⚠️] Failed to save screen-recording on page close:', err)
          }
      });

      return page_state
  }
  1842. async function startResponseSaving(page, page_state) {
  1843. const dir = RESPONSES_PATH(page)
  1844. await fs.promises.mkdir(dir, {recursive: true})
  1845. console.log(`[🌄] Starting raw response bytes recording...`.padEnd(82), prettyPath(dir) + '/')
  1846. // Document, Stylesheet, Image, Media, Font, Script, TextTrack, XHR, Fetch, Prefetch, EventSource, WebSocket, Manifest, SignedExchange, Ping, CSPViolationReport, Preflight, Other
  1847. const types_to_save = [
  1848. // 'document',
  1849. 'script',
  1850. 'stylesheet',
  1851. 'font',
  1852. 'image',
  1853. 'media',
  1854. 'xhr',
  1855. 'websocket',
  1856. ]
  1857. // reset responses index file to empty
  1858. const responses_log_path = path.join(dir, 'index.jsonl')
  1859. await overwriteFile(responses_log_path, '')
  1860. // add handler to save all image repsonses into output directory
  1861. page.on('response', async (response) => {
  1862. try {
  1863. const timestamp = versionStrFromDate(new Date(), {withDate: true, withTime: true, withSeconds: true, withMilliseconds: true})
  1864. if (!page_state.main_response && (response.request().url() == page_state.original_url)) {
  1865. // save first response as main page response (if we havent already caught it earlier)
  1866. page_state.main_response = response
  1867. }
  1868. const status = response.status()
  1869. if ((status >= 300) && (status < 500)) {
  1870. // console.log('Got bad response from', response.url(), 'to', response.headers()['location'])
  1871. return
  1872. }
  1873. const request = response.request()
  1874. const resourceType = request.resourceType()
  1875. const url_scheme = (response.url() || request.url()).split(':')[0].toLowerCase()
  1876. const method = (url_scheme === 'data') ? 'DATA' : request.method()
  1877. // console.log(' ', resourceType, response.url())
  1878. if (types_to_save.includes(resourceType)) {
  1879. // create ./responses/xhr/www.facebook.com/static/images/icons/ subdir based on hostname + path
  1880. const resource_type_dir = path.join(dir, resourceType)
  1881. const url = new URL(response.url())
  1882. let subdir = resource_type_dir
  1883. const url_path = (url.pathname || '').slice(0, 250).endsWith('/')
  1884. ? (url.pathname || '').slice(0, 250)
  1885. : path.dirname((url.pathname || '').slice(0, 250))
  1886. // determine subdirectory based on url type (handles http:,https:,file:,data:,chrome-extension:,about:,etc.)
  1887. if (!URL_SCHEMES_IGNORED.includes(url_scheme)) {
  1888. // is a normal http:// or https:// url, use the domain + path to construct subdirectory
  1889. subdir = path.join(resource_type_dir, (url.hostname || 'data').slice(0, 250), url_path)
  1890. } else if (url_scheme == 'data') {
  1891. // is a data:... url, store in ./data subdirectory
  1892. subdir = path.join(resource_type_dir, 'data')
  1893. } else {
  1894. // is a chrome-extension:// or other special url, use the extension id + path to construct subdirectory
  1895. const url_path = path.dirname((url.pathname || '').slice(0, 999))
  1896. subdir = path.join(resource_type_dir, url_scheme, (url.hostname || 'data').slice(0, 250), url_path)
  1897. }
  1898. // write response to responses/all/1716861056899__https%3A%2F%2Fwww.instagram.com%2Fgraphql%2Fquery.json
  1899. let abspath = null
  1900. let resp_mimetype = null
  1901. let extension = ''
  1902. let uniq_filename = null
  1903. let uniq_abspath = null
  1904. let symlink_abspath = null
  1905. let responseSha256 = null
  1906. try {
  1907. await fs.promises.mkdir(path.join(dir, 'all'), {recursive: true})
  1908. try {
  1909. await fs.promises.mkdir(subdir, {recursive: true})
  1910. } catch(err) {
  1911. subdir = subdir + '.dir' // TODO: apply this workaround to parent path entries too
  1912. try {
  1913. await fs.promises.mkdir(subdir, {recursive: true})
  1914. } catch(err) {
  1915. subdir = path.join(resource_type_dir, 'data')
  1916. await fs.promises.mkdir(subdir, {recursive: true})
  1917. }
  1918. }
  1919. ;({abspath: symlink_abspath, resp_mimetype, extension} = await detectFilename({page, response, dir: subdir, resourceType}))
  1920. // responses/all/1716861056899__https%3A%2F%2Fwww.instagram.com%2Fgraphql%2Fquery.json
  1921. uniq_filename = `${timestamp}__${method}__` + [encodeURIComponent(url.href).slice(0, 64).replaceAll('/', '_').replace(new RegExp(`.${extension}$`), ''), extension].filter(s => s.length).join('.')
  1922. uniq_abspath = path.join(dir, 'all', uniq_filename)
  1923. let bytesBuffer = null
  1924. try {
  1925. bytesBuffer = await response.buffer()
  1926. } catch(err) {
  1927. if (String(err).includes("Cannot read properties of undefined (reading 'body')")) {
  1928. // not sure why it's happening but seems to be too late to caputre body sometimes? possible race condition
  1929. } else {
  1930. console.warn('[⚠️] Failed to save response bytes for:', response.request().url(), err)
  1931. }
  1932. }
  1933. if (bytesBuffer) {
  1934. // write response data into ./all/<TS>__<METHOD>__<URL>.<EXT>
  1935. await overwriteFile(uniq_abspath, bytesBuffer)
  1936. responseSha256 = crypto.createHash('sha256').update(bytesBuffer).digest('hex')
  1937. // write symlink file to ./<TYPE>/<DOMAIN>/...<PATH>/<FILENAME>.<EXT> -> ./all/<TS>__<METHOD>__<URL>.<EXT>
  1938. await overwriteSymlink(uniq_abspath, symlink_abspath, {relative: dir, mkdirs: true, search_limit: dir})
  1939. }
  1940. // console.log(' ->', symlink_abspath)
  1941. } catch(err) {
  1942. // dont do anything for redirectresponses, error responses, etc.
  1943. console.warn(err)
  1944. }
  1945. const urlSha256 = crypto.createHash('sha256').update(String(request.url())).digest('hex')
  1946. // const headersSha256 = crypto.createHash('sha256').update(String(request.headers())) // someday we may want to save headers hashes too
  1947. const truncated_url = (method == 'DATA') ? request.url().slice(0, 128) : request.url() // don't duplicate bytes in data: urls (we already saved them in the file)
  1948. // this is essentially replicating the functionality of a WARC file, but in directory + index.jsonl form
  1949. await fs.promises.appendFile(
  1950. responses_log_path,
  1951. JSON.stringify({
  1952. ts: timestamp,
  1953. method,
  1954. url: truncated_url,
  1955. urlSha256,
  1956. postData: request.postData(),
  1957. response_url: ((method != 'DATA') && (url.href != request.url())) ? url.href : undefined,
  1958. status,
  1959. resourceType,
  1960. mimeType: resp_mimetype,
  1961. responseSha256,
  1962. path: uniq_abspath?.replace(dir, '.'),
  1963. symlink_path: symlink_abspath?.replace(dir, '.'),
  1964. extension,
  1965. }) + '\n',
  1966. 'utf-8',
  1967. )
  1968. }
  1969. } catch(err) {
  1970. // we should never throw hard errors here because there's nothing above us to catch it
  1971. // and we dont want to crash the entire CDP session / browser / main node process
  1972. console.warn('[❌] Error in response handler (set in startResponseSaving):', err)
  1973. }
  1974. });
  1975. // handled by stopMetadataRecording():
  1976. // page.on('close', () => {
  1977. // page.off('response')
  1978. // })
  1979. }
  /**
   * Collapse a list of cookies to one entry per (domain, path, name) and relax
   * each surviving entry for use by archiving scripts.
   *
   * Later cookies override earlier ones that share the same key. For every entry:
   *   - `expires` is forced to 2147483640, `secure` and `httpOnly` to false
   *   - names starting with `__` get `secure: true` and `sourceScheme: 'Secure'`
   *   - a leading `.` is stripped from `domain`
   *   - attributes not listed in `allowed_cookie_attrs` are removed
   * A cookie whose merged `value` is empty removes the entry for its key.
   * The input objects are copied, never mutated.
   *
   * @param cookies list of cookie objects (each must have a string `domain` and `name`)
   * @returns the deduplicated, normalized cookies
   * @throws rethrows any error raised while processing a cookie, after logging that cookie
   */
  function dedupeCookies(cookies) {
    const allowed_cookie_attrs = ['domain', 'path', 'name', 'value', 'expires', 'sameSite', 'sourceScheme', 'url', 'priority', 'secure', 'httpOnly']
    const deduped_cookies = {}

    for (const cookie of cookies) {
      try {
        // JSON-encode the key parts so the boundaries stay unambiguous:
        // plain concatenation made path="/b" name="c" collide with path="/" name="bc"
        const unique_id = JSON.stringify([cookie.domain, cookie.path, cookie.name])

        deduped_cookies[unique_id] = {
          ...(deduped_cookies[unique_id] || {}),
          ...cookie,
          expires: 2147483640,    // fixed far-future expiry (January 2038, just below the signed 32-bit limit)
          session: false,         // make sure cookies dont expire at browser close time
          secure: false,          // make cookie restrictions more lax (for archiving scripts)
          httpOnly: false,        // make it easier to tamper with cookies from JS (for archiving scripts)
          // other attributes that may be present on the incoming cookie:
          //   "path", "size", "priority", "sameParty", "sourceScheme", "sourcePort", ...
          //   and more... https://pptr.dev/api/puppeteer.cookieparam
        } as Cookie

        // an empty value means "no cookie": drop the whole entry for this key
        if (!deduped_cookies[unique_id].value) {
          delete deduped_cookies[unique_id]
          continue
        }

        if (deduped_cookies[unique_id].name.startsWith('__')) {
          // cookies that start with __ must be secure, see https://github.com/puppeteer/puppeteer/issues/6806
          deduped_cookies[unique_id].secure = true
          deduped_cookies[unique_id].sourceScheme = 'Secure'
        }

        if (deduped_cookies[unique_id].domain.startsWith('.')) {
          deduped_cookies[unique_id].sameParty = false
          deduped_cookies[unique_id].domain = deduped_cookies[unique_id].domain.slice(1)
        }

        // whitelist pass: anything not explicitly allowed (session, sameParty, size, ...) is stripped
        for (const key of Object.keys(deduped_cookies[unique_id])) {
          if (!allowed_cookie_attrs.includes(key)) {
            delete deduped_cookies[unique_id][key]
          }
        }
      } catch(err) {
        console.error('[❌] Failed to parse cookie during deduping', cookie)
        throw err
      }
    }

    // console.log(`[🍪] Deduped ${cookies.length} cookies to ${Object.keys(deduped_cookies).length}...`)
    return Object.values(deduped_cookies) as Cookie[]
  }
  /**
   * Read cookies back from the cookies.txt file at COOKIES_TXT_PATH.
   *
   * cookies.txt is currently write-only (chrome -> file), so by default this
   * returns an empty list without touching the filesystem. Previously the
   * reader below sat after an unconditional `return` (unreachable) and had no
   * return value of its own; it is now an explicit opt-in.
   *
   * @param options.enabled set to true to actually parse cookies.txt (default: false)
   * @returns cookies converted to puppeteer's Cookie shape (empty when disabled or file missing)
   */
  async function loadCookiesTxt({enabled=false}={}) {
    const cookies = [] as Cookie[]

    // write-only from chrome -> files for now
    if (!enabled || !fs.existsSync(COOKIES_TXT_PATH)) {
      return cookies
    }

    // console.log(`[🍪] Loading cookies/localStorage/sessionStorage from ${COOKIES_TXT_PATH}...`)
    // Read from cookies.txt file using tough-cookie + @root/file-cookie-store
    const cookies_store = new FileCookieStore(COOKIES_TXT_PATH, {auto_sync: false, lockfile: false})
    cookies_store.getAllCookiesAsync = util.promisify(cookies_store.getAllCookies)
    const exported_cookies = await cookies_store.getAllCookiesAsync()

    for (const cookie of exported_cookies) {
      const cookie_from_tough = cookie.toJSON()
      // NOTE(review): this prefixes "." when hostOnly is true, which looks inverted
      // (host-only cookies normally have no leading dot) - confirm before enabling
      const domain = cookie_from_tough.hostOnly ? `.${cookie_from_tough.domain}` : cookie_from_tough.domain
      const cookie_for_puppeteer: Cookie = {
        domain,
        name: cookie_from_tough.key,
        path: cookie_from_tough.path,
        value: cookie_from_tough.value,
        secure: cookie_from_tough.secure || false,
        httpOnly: cookie_from_tough.httpOnly || false,
        session: false,
        expires: (new Date(cookie_from_tough.expires)).valueOf()/1000,   // ms -> seconds
        size: undefined,
      }
      // console.log('COOKIE_FROM_TOUGH_TXT', cookie_from_tough, cookie_for_puppeteer)
      cookies.push(cookie_for_puppeteer)
    }

    return cookies
  }
  // Shape of the data parsed from auth.json and returned by loadAuthStorage().
  type AuthJSON = {
    // cookies to merge with the ones read from cookies.txt
    cookies: Cookie[];
    // saved sessionStorage entries; loadAuthStorage() looks them up by page origin
    sessionStorage: any;
    // saved localStorage entries; loadAuthStorage() looks them up by page origin
    localStorage: any;
  }
  /**
   * Load saved cookies / sessionStorage / localStorage from cookies.txt and
   * auth.json, and optionally apply the storage entries to the page.
   *
   * - Returns empty values immediately when LOAD_AUTH_STORAGE is falsy.
   * - A file that fails to load is renamed to `<name>.corrupted` and skipped.
   * - Keys missing from auth.json default to empty values, so a partial file is
   *   neither flagged as corrupted nor able to crash the apply step.
   * - When `apply` is true, the storage entries for the current origin are
   *   written into the page now, and a script is registered so entries for
   *   whatever origin is loaded next get written on every new document.
   *   Cookies are NOT set on the page here; they are only returned.
   *
   * @param page puppeteer page to apply storage to
   * @param param1 accepted for signature parity with other helpers; `client` is unused here
   * @param options.apply whether to write the storage entries into the page (default: true)
   * @returns {cookies, sessionStorage, localStorage} as loaded (cookies are deduped)
   */
  async function loadAuthStorage(page, {client}, {apply=true}={}) {
    let {
      cookies,
      sessionStorage,
      localStorage,
    }: AuthJSON = {cookies: [], sessionStorage: {}, localStorage: {}}

    if (!LOAD_AUTH_STORAGE) {
      // dont read auth from filesystem auth.json/cookies.txt, just rely on existing cookies in chrome profile
      return {cookies, sessionStorage, localStorage}
    }

    if (fs.existsSync(COOKIES_TXT_PATH)) {
      try {
        cookies = (await loadCookiesTxt()) || []
      } catch(err) {
        console.warn('[⚠️] Loaded invalid cookies.txt, moved it to cookies.txt.corrupted (did two processes try to change it at the same time?)')
        await fs.promises.rename(COOKIES_TXT_PATH, COOKIES_TXT_PATH + '.corrupted')
      }
      // console.log(`[🍪] Loading cookies from cookies.txt...`, cookies.length)
    }

    if (fs.existsSync(AUTH_JSON_PATH)) {
      try {
        const auth_json = JSON.parse(await fs.promises.readFile(AUTH_JSON_PATH, 'utf-8'))
        if (!auth_json || typeof auth_json !== 'object' || Array.isArray(auth_json)) {
          throw new Error('auth.json does not contain a JSON object')
        }
        // build the merged list first: if it throws, none of the three values has been touched yet
        const merged_cookies = [...cookies, ...(auth_json.cookies ?? [])]
        cookies = merged_cookies
        sessionStorage = auth_json.sessionStorage ?? {}
        localStorage = auth_json.localStorage ?? {}
        // console.log(`[🍪] Loading cookies from auth.json...`, merged_cookies.length)
      } catch(err) {
        console.warn('[⚠️] Loaded invalid auth.json, moved it to auth.json.corrupted (did two processes try to change it at the same time?)')
        await fs.promises.rename(AUTH_JSON_PATH, AUTH_JSON_PATH + '.corrupted')
      }
    }

    cookies = dedupeCookies(cookies)

    if (apply) {
      console.log(`[🍪] Loading stored cookies/localStorage/sessionStorage into session...`, cookies.length)

      // cookie application is currently disabled, kept for reference:
      // if (cookies?.length) {
      //     try {
      //         // try setting all at once first (much faster)
      //         await page.setCookie(...cookies)
      //     } catch(err) {
      //         // if any errors, fall back to setting one-by-one so that individual error can be caught
      //         for (const cookie of cookies) {
      //             try {
      //                 await page.setCookie(cookie);
      //             } catch(err) {
      //                 console.error('[❌] Failed to set cookie', cookie)
      //                 throw err
      //             }
      //         }
      //     }
      // }

      const origin = await page.evaluate(() => window.location.origin)

      // NOTE: inside these callbacks `sessionStorage` / `localStorage` are the browser globals,
      // because puppeteer serializes the function and runs it in the page
      await page.evaluate((savedSessionStorage) => {
        for (const [key, value] of Object.entries(savedSessionStorage)) {
          sessionStorage[key] = value;
        }
      }, sessionStorage[origin] || {});

      await page.evaluate((savedLocalStorage) => {
        for (const [key, value] of Object.entries(savedLocalStorage)) {
          localStorage[key] = value;
        }
      }, localStorage[origin] || {});

      // origin/auth context changes when we do page.goto so we have to hook pageload and apply it then as well
      // https://stackoverflow.com/questions/51789038/set-localstorage-items-before-page-loads-in-puppeteer
      await page.evaluateOnNewDocument(({sessionStorage, localStorage}) => {
        const origin = window.location.origin;

        for (const [key, value] of Object.entries(sessionStorage[origin] || {})) {
          window.sessionStorage.setItem(key, value as string)
        }
        for (const [key, value] of Object.entries(localStorage[origin] || {})) {
          window.localStorage.setItem(key, value as string)
        }
      }, {sessionStorage, localStorage});
    }

    return {cookies, sessionStorage, localStorage}
  }
  /**
   * Ask a FlareSolverr server to solve the Cloudflare challenge for `original_url`
   * and install the cookies it returns on the page.
   *
   * Never throws: every failure (network error, bad response, no cookies) is
   * logged and reported as an empty list.
   *
   * @param page puppeteer page to set the cookies on
   * @param param1.original_url URL to request through FlareSolverr
   * @param options.timeout sent to FlareSolverr as `maxTimeout`, in ms (default: 20000)
   * @returns the cookies that were set on the page, or [] when none were obtained
   */
  async function loadCloudflareCookie(page, {original_url}, {timeout=20_000}={}) {
    // make request to FlareSolverr server to get magic cookies that let us bypass cloudflare bot detection
    // docker run -p 8191:8191 -e LOG_LEVEL=info ghcr.io/flaresolverr/flaresolverr

    // alternatives if this stops working:
    // - https://github.com/omkarcloud/botasaurus
    // - https://github.com/ultrafunkamsterdam/nodriver
    // - https://github.com/Akmal-CloudFreed/CloudFreed-CloudFlare-bypass
    // - https://github.com/VeNoMouS/cloudscraper

    const query = { url: original_url, cmd: "request.get", maxTimeout: timeout }
    try {
      const response = await fetch(FLARESOLVERR_API_ENDPOINT, {
        method: 'POST',
        headers: {'Content-Type': 'application/json'},
        body: JSON.stringify(query),
      })
      const data = await response.json()

      const new_cookies = (data?.solution?.cookies || []).map(cookie => ({
        ...cookie,
        'expires': 2147483640,   // overwrite expiration with a far-future timestamp (January 2038, just below the signed 32-bit limit)
        'secure': false,         // relax the Secure restriction, same as dedupeCookies() does for stored cookies
      }))

      if (new_cookies.length) {
        console.log(`[☑️] Got Cloudflare bypass cookies (${new_cookies.length}) from FlareSolverr API...`)
        await page.setCookie(...new_cookies)
        return new_cookies
      }

      // no cookies came back: decide between "nothing to solve" and a real failure by
      // inspecting the response directly (this used to throw a string and re-parse it in the catch)
      const error_str = JSON.stringify(data?.message || data, null, 4)
      const failure = `Bad FlareSolverr Response: ${error_str}`
      if (failure.includes('Challenge not detected')) {
        console.log('[☑️] Page is accessible without FlareSolverr Cloudflare bypass.')
      } else {
        console.warn('[❌] Failed to get Cloudflare bypass cookies from FlareSolverr API.', failure)
      }
    } catch (error) {
      // network errors, non-JSON responses, page.setCookie failures, ...
      console.warn('[❌] Failed to get Cloudflare bypass cookies from FlareSolverr API.', error)
    }
    return []
  }
  /**
   * Enable request interception on the page and apply the URL_REWRITES rules to
   * every outgoing request.
   *
   * Rules run in ascending `idx` order, each on the output of the previous one:
   *   - a rule that rewrites the URL to an empty string aborts the request
   *   - if the final URL differs from the original, the request is answered with
   *     a 302 redirect to the rewritten URL
   *   - otherwise the request continues untouched
   * Requests already resolved by another interception handler are left alone.
   *
   * @param page puppeteer page to intercept requests on
   * @param page_state unused here
   */
  async function setupURLRewriting(page, page_state) {
    await page.setRequestInterception(true)

    // sort a copy: sorting URL_REWRITES itself would reorder the shared list for every other user of it
    const rewrites = [...URL_REWRITES].sort((a, b) => (a.idx || 0) - (b.idx || 0))

    page.on('request', async (interceptedRequest) => {
      // nothing above this handler can catch a rejection, so abort()/continue()/respond()
      // are awaited inside a try/catch instead of being left as floating promises
      try {
        if (interceptedRequest.isInterceptResolutionHandled()) return

        const original_url = interceptedRequest.url()

        // apply all the rewrites in order to the request URL
        let url = original_url
        for (const rewrite of rewrites) {
          const new_url = url.replace(rewrite.pattern, rewrite.replacement)
          // console.log(rewrite, url, new_url)

          // if url is rewritten to an emptystring, abort the request
          if (!new_url) {
            console.warn('[🟥] Request blocked', rewrite.pattern, ':', url)
            await interceptedRequest.abort()
            return
          }
          if (new_url != url) {
            // console.warn('[📳] Request rewritten', rewrite.pattern, rewrite.replacement, ':', url, '->', new_url)
            console.warn('[📳] Request rewritten', rewrite.pattern, ':', new_url)
            url = new_url
          }
        }

        if (url == original_url) {
          // if url is unchanged, continue request flow as-is
          await interceptedRequest.continue()
        } else {
          // otherwise redirect the browser to our rewritten version
          await interceptedRequest.respond({
            status: 302,
            headers: {
              location: url,
              'x-redirect-by': 'ArchiveBox.setupURLRewriting',
            },
          })
        }
      } catch(err) {
        console.warn('[⚠️] Error in request handler (set in setupURLRewriting):', err)
      }
    })
    // handled by stopMetadataRecording():
    // page.on('close', () => {
    //     page.off('request')
    //     page.setRequestInterception(false)
    // })
  }
  /**
   * Start recording navigation, network and console metadata for a page.
   *
   * - stores the URL being archived on `page._original_url`
   * - seeds `redirects` with an 'Initial' entry, then appends an entry for every
   *   new document URL reached by a main-frame navigation or an HTTP 3xx redirect
   * - stores every CDP Network.requestWillBeSent / responseReceived (+ExtraInfo)
   *   event in `traffic_log`, keyed by requestId and then by event name
   * - resets the console log file and appends page console messages, page errors
   *   and failed requests to both `console_log` and that file
   * - enables the CDP Network domain, clears any device metrics override and
   *   allows downloads into CHROME_DOWNLOADS_DIR
   *
   * The listeners are removed later by stopMetadataRecording().
   *
   * @param page puppeteer page to record
   * @param page_state {original_url, version, client, traffic_log, console_log, redirects};
   *        `traffic_log`, `console_log` and `redirects` are mutated in place
   * @returns {original_url, client, redirects, traffic_log, console_log}
   */
  async function startMetadataRecording(page, {original_url, version, client, traffic_log, console_log, redirects}) {
    // update helper state on page
    page._original_url = (original_url || (await page.url())).toString()

    // DEBUGGING: helpers for repl() debugging, dont rely on these (global state is badd mmkay)
    // page._client = client || page._client || await page.target().createCDPSession()
    // page._redirects = redirects
    // page._traffic_log = traffic_log

    // add initial entry to page redirect log
    redirects[original_url] = {
      idx: 0,
      url: original_url,
      src: null,
      type: 'Initial',
      wallTime: Date.now()/1000,
      frameId: page.mainFrame()._id,
      requestId: null,
      initiator: {type: "user"},
      isMainFrame: true,
    }

    // DEBUGGING: record optional chrome debug trace with screenshots (heavy)
    // try {
    //     await page.tracing.stop()
    //     await wait(200)
    // } catch(err) {}
    // try {
    //     await page.tracing.start({path: TRACE_PATH(page), screenshots: true});
    // } catch(err) {}

    let last_main_frame_url = original_url

    // setup network request intercepts handler
    const addCDPRequestDataListener = (eventName) => {
      client.on(eventName, event => {
        try {
          // save any HTTP/JS redirects to redirects for saveRedirects(page) to use later on
          const new_url = event.documentURL
          const http_status = event.redirectResponse?.status || 0
          const is_new_url = (new_url !== original_url) && !redirects[new_url]
          const is_main_frame_navigation = (event.frameId == page.mainFrame()._id)
          // the whole 3xx class counts, including 300 itself (previously excluded by `300 <`)
          const is_http_redirect = (300 <= http_status) && (http_status < 400)

          if (new_url && is_new_url && (is_main_frame_navigation || is_http_redirect) && event.type == 'Document') {
            const new_redirect_entry = {
              url: new_url,
              src: event.redirectResponse?.url || last_main_frame_url,
              type: http_status || 'JS',
              wallTime: Date.now()/1000,
              frameId: event.frameId,
              requestId: event.requestId,
              initiator: event.initiator,
              idx: Object.keys(redirects).length,
              isMainFrame: is_main_frame_navigation,
            }
            redirects[new_url] = new_redirect_entry
            if (is_main_frame_navigation) {
              ALREADY_ARCHIVED.add(new_redirect_entry.url.slice(0, 4096)) // we're already archiving this tab as it redirects, dont create a duplicate archive for the destination
              console.warn(`[➡️] NAVIGATION[${new_redirect_entry.type}]${ANSI.blue} ${last_main_frame_url} ${ANSI.reset}\n ->${ANSI.blue} ${new_redirect_entry.url} ${ANSI.reset}`)
              last_main_frame_url = new_url
            }
          }

          if (event.loaderId) {
            traffic_log[event.loaderId] = traffic_log[event.loaderId] || {} // make sure loader is also in requests list first
            // sometimes it's not in the list if we start archiving too late / after a page's initial request was already made
          }

          // save to traffic_log as {8BC2087A2CCEF28017099C0E10E87440: {Network.eventWillBeSent: {eventId,loaderId, request|response, ...}}
          // https://stackoverflow.com/questions/47078655/missing-request-headers-in-puppeteer?noredirect=1&lq=1
          traffic_log[event.requestId] = traffic_log[event.requestId] || {}
          Object.assign(traffic_log[event.requestId], { [eventName]: event })

          // DEBUGGING: log page visits and navigation events to console
          // if (event?.response?.status) {
          //     // if we're expecting an HTML response, then we assume it's a page visit & log it to console
          //     const acceptMimeType = traffic_log[event.requestId]['Network.requestWillBeSentExtraInfo']?.headers?.accept
          //     if (acceptMimeType && acceptMimeType.includes('text/html')) {
          //         // log any HTML page responses (less noisy)
          //         console.log(`[>] GOT ${event.documentURL}: ${event.response.status} ${event.response.url} (${event.response.mimeType})`)
          //     } else {
          //         // log ALL responses, inclusing JS,CSS,Images,etc. (very noisy)
          //         // console.log(` > ${event.response.status} ${event.response.url} (${event.response.mimeType})`)
          //     }
          // }
        } catch(err) {
          console.warn('[X] Error during request/response handler (startMetadataRecording.addCDPRequestDataListener)')
          console.warn(err)
        }
      })
    }
    addCDPRequestDataListener('Network.requestWillBeSent')
    addCDPRequestDataListener('Network.requestWillBeSentExtraInfo')
    addCDPRequestDataListener('Network.responseReceived')
    addCDPRequestDataListener('Network.responseReceivedExtraInfo')

    // clear any existing log entries
    const consolelog_info = {
      TYPE: 'console',
      VERSION: version,
      URL: original_url,
    }
    await overwriteFile(CONSOLELOG_PATH(page), JSON.stringify(consolelog_info) + '\n')

    // message.location() is an object ({url, lineNumber, columnNumber}); interpolating it
    // directly printed "[object Object]", so render it as url:line:column instead
    const formatLocation = (location) => {
      const {url, lineNumber, columnNumber} = location || {}
      return [url, lineNumber, columnNumber].filter(part => part !== undefined && part !== '').join(':')
    }

    // record console logs from page
    const appendConsoleLog = async (line) => {
      if (!line) return
      console_log.push(line)
      try {
        await fs.promises.appendFile(
          CONSOLELOG_PATH(page),
          line + '\n',
          'utf-8',
        )
      } catch(err) {
        // this runs inside event handlers with nothing above to catch a rejection,
        // so a failed write is reported instead of becoming an unhandled rejection
        console.warn('[⚠️] Failed to append to console log file (startMetadataRecording.appendConsoleLog):', err)
      }
    }

    page.on('console', async (message) =>
      await appendConsoleLog(`${message.type().toUpperCase()} ${formatLocation(message.location())} ${JSON.stringify(message.text())}`))
    page.on('pageerror', async (error) =>
      await appendConsoleLog(error.message || JSON.stringify(error)))
    page.on('requestfailed', async (request) =>
      await appendConsoleLog(`${request.failure()?.errorText} ${request.url() || JSON.stringify(request)}`))

    // set puppeteer options on page
    await client.send('Network.enable')                         // start delivering CDP Network.* events to the listeners above
    await client.send('Emulation.clearDeviceMetricsOverride')   // drop any overridden device metrics
    await client.send('Page.setDownloadBehavior', {
      behavior: 'allow',
      downloadPath: CHROME_DOWNLOADS_DIR,
    })

    // handled by stopMetadataRecording():
    // page.on('close', () => {
    //     try {
    //         page.off('request')
    //         page.off('console')
    //         page.off('pageerror')
    //         page.off('requestfailed')
    //         page.setRequestInterception(false)
    //     } catch(err) {
    //         // some versions of puppeteer have had race conditions here where page is already closed by now
    //         console.warn('[X] Error in page close handler', err)
    //     }
    // })

    return {original_url, client, redirects, traffic_log, console_log}
  }
  /**
   * Remove the page event listeners installed by the recording / interception
   * helpers and turn request interception back off.
   *
   * Best-effort: any error (e.g. the page is already closed) is logged, not thrown.
   * `setRequestInterception(false)` is awaited so that its rejection is actually
   * caught by the try/catch instead of escaping as an unhandled rejection.
   *
   * @param page puppeteer page to detach from
   * @param _page_state unused
   */
  async function stopMetadataRecording(page, _page_state) {
    console.log('[🪝] Stopping CDP event hooks and request interception...')
    const hooked_events = ['request', 'response', 'console', 'pageerror', 'requestfailed', 'hashchange']
    try {
      for (const event_name of hooked_events) {
        page.off(event_name)
      }
      await page.setRequestInterception(false)
      // page.tracing.stop()
    } catch(err) {
      // some versions of puppeteer have had race conditions here where page is already closed by now
      console.warn('[X] Error in page close handler', err)
    }
  }
  2371. /********************** Human Behavior Emulation ******************************/
  /**
   * Click the `.captcha-solver` element on the page (if one appears within 5s)
   * and wait for it to reach `data-state="solved"`.
   *
   * Never throws. "No captcha appeared" and "captcha appeared but was not solved"
   * are reported separately; previously a solve failure or timeout was also
   * logged as "no challenges found".
   *
   * @param page puppeteer page to check
   * @param page_state unused
   * @param options.timeout max time in ms to wait for the solved state (default: 90000)
   */
  async function solveCaptchas(page, page_state, {timeout=90_000}={}) {
    // using puppeteer-extra-plugin-recaptcha auto-solver
    // await page.solveRecaptchas()

    // using 2captcha-solver extension auto-solver
    try {
      // console.log('[🕑] Waiting for CAPTCHA to appear...')
      await page.waitForSelector('.captcha-solver', {timeout: 5_000})
    } catch(err) {
      console.log('[☑️] No CATPCHA challenges found, site thinks we are human.')
      return
    }

    try {
      console.log('[🤖] CAPTCHA challenge found, submitting to 2Captcha for solving...')
      await page.click('.captcha-solver')

      console.log(`[🧠] Waiting up to ${timeout/1000}s for CAPTCHA to be solved...`)
      await page.waitForSelector(`.captcha-solver[data-state="solved"]`, {timeout})

      console.log('[🔓] CAPTCHA solution retrieved from 2captcha.')
    } catch(err) {
      console.warn('[⚠️] CAPTCHA challenge found but it could not be solved:', err)
    }
  }
  /**
   * Move the mouse around the page for roughly `timeout` ms: turn on the
   * cursor's random-move mode, wait half the time, move to the center of
   * DEFAULT_VIEWPORT, wait the other half, then turn random-move mode off.
   *
   * Random-move mode is switched off in a `finally`, so it does not keep running
   * when the move or a wait throws.
   *
   * @param page puppeteer page to move the mouse on
   * @param page_state unused
   * @param options.timeout total duration in ms (default: 600)
   */
  async function jiggleMouse(page, page_state, {timeout=600}={}) {
    console.log(`[🐁] Moving mouse around randomly for ${timeout/1000}s...`)

    const randomPoint = await getRandomPagePoint(page)
    const cursor = createCursor(page, randomPoint, true)

    cursor.toggleRandomMove(true)
    try {
      await wait(timeout/2)
      await cursor.moveTo({x: DEFAULT_VIEWPORT.width/2, y: DEFAULT_VIEWPORT.height/2})
      await wait(timeout/2)
    } finally {
      cursor.toggleRandomMove(false)
    }
  }
  /**
   * Enable request interception and abort any main-frame navigation request
   * whose URL differs from `original_url`; every other request continues.
   *
   * Requests already resolved by another interception handler are skipped.
   * abort()/continue() are awaited inside a try/catch because nothing above an
   * event handler can catch their rejections.
   *
   * @param page puppeteer page to guard
   * @param param1.original_url the only URL the main frame is allowed to navigate to
   */
  async function blockRedirects(page, {original_url}) {
    page.on('request', async (req) => {
      try {
        if (req.isInterceptResolutionHandled()) return

        // if it's a top-level navigation event to a new url
        const is_new_top_level_navigation = req.isNavigationRequest() && req.frame() === page.mainFrame() && req.url() !== original_url
        if (is_new_top_level_navigation) {
          await req.abort('aborted')
          console.warn('[🟥] Blocked page attempt to naviage to new URL', req.url())
        } else {
          await req.continue()
        }
      } catch(err) {
        console.warn('[⚠️] Error in request handler (set in blockRedirects):', err)
      }
    })
    // handled by stopMetadataRecording():
    // page.on('close', () => {
    //     page.off('request')
    //     page.setRequestInterception(false)
    // })
    await page.setRequestInterception(true)
  }
  /**
   * Run a bare `debugger` statement inside the page.
   * NOTE(review): this presumably only pauses page JS while a debugger is
   * attached to the page - confirm against how the browser is launched.
   *
   * @param page puppeteer page to pause
   * @param _page_state unused
   */
  async function blockJSExecution(page, _page_state) {
    console.warn('[🟥] Stopping all JS execution on page...')

    // serialized by puppeteer and executed in the page context
    const hitDebuggerStatement = () => {
      debugger;
    }
    await page.evaluate(hitDebuggerStatement)

    // OR alternatively this (more buggy, breaks many sites):
    // const html = await page.content();
    // page.setJavaScriptEnabled(false);
    // await page.setContent(html, { waitUntil: 'networkidle0' }); // 4
  }
  /**
   * Scroll the page down step by step so lazy-loaded content gets a chance to
   * load, then scroll back to the top.
   *
   * Each step scrolls to `scroll_count * scroll_distance`, re-measures
   * `document.body.scrollHeight` and waits `scroll_delay` ms. The loop ends when:
   *   - `scroll_limit` steps were taken, or `scroll_delay * steps` reaches `timeout`
   *   - the scroll position is a full step past the page height, the height did
   *     not grow in that step, and more than 2 steps were taken
   *   - the page is a facebook.com/watch/?v... URL and more than 3 steps were taken
   *
   * @param page puppeteer page to scroll (`page._original_url` is read if present)
   * @param _page_state unused
   * @param options.timeout upper bound in ms on the accumulated scroll delays (default: 120000)
   * @param options.scroll_delay ms to wait after each step (default: SCROLL_DELAY)
   * @param options.scroll_distance px to advance per step (default: SCROLL_DISTANCE)
   * @param options.scroll_limit max number of steps (default: SCROLL_LIMIT)
   * @returns the last measured `document.body.scrollHeight`
   */
  async function scrollDown(page, _page_state, {timeout=120_000, scroll_delay=SCROLL_DELAY, scroll_distance=SCROLL_DISTANCE, scroll_limit=SCROLL_LIMIT}={}) {
    let last_height = await page.evaluate('document.body.scrollHeight')
    let scroll_count = 0
    let scroll_position = scroll_count * scroll_distance
    // page._original_url is only set once startMetadataRecording() has run, so dont assume it exists
    const original_url = String(page._original_url || '')

    // await page.bringToFront()

    // scroll to top
    await page.evaluate(() => { window.scrollTo({ top: 0, left: 0, behavior: 'smooth' }); })

    while ((scroll_count < scroll_limit) && ((scroll_delay * scroll_count) < timeout)) {
      // report the actual step size (this used to always say 1000px)
      console.log(`[⬇️] Scrolling down ${scroll_count}x ${scroll_distance}px... (${scroll_position}/${last_height})`)
      await page.evaluate((y_offset) => { window.scrollTo({ top: y_offset, left: 0, behavior: 'smooth' }); }, scroll_position)
      scroll_count++
      scroll_position = scroll_count * scroll_distance

      // check if any new content was added / if we are infiniscrolling
      const new_height = await page.evaluate('document.body.scrollHeight')
      const added_px = new_height - last_height
      if (added_px > 0) {
        console.log('[✚] Detected infini-scrolling...', `${last_height}+${added_px} => ${new_height}`)
      } else if (scroll_position >= new_height + scroll_distance) {
        // we've reached the bottom, condition isn't true until we've tried to go n+1 past the end (which is fine)
        if (scroll_count > 2) break
      }
      last_height = new_height

      // give the smooth scroll / lazy loading time to happen before the next step
      await wait(scroll_delay)

      // facebook watch pages infiniscroll (more and more recommendations forever), stop them after 3 pages
      if (original_url.startsWith('https://www.facebook.com/watch/?v') && scroll_count > 3) break
    }

    // scroll to bottom
    if (scroll_position < last_height) {
      await page.evaluate(() => { window.scrollTo({ top: document.body.scrollHeight, left: 0, behavior: 'smooth' }); })
      await wait(scroll_delay)
      await page.evaluate(() => { window.scrollTo({ top: document.body.scrollHeight, left: 0, behavior: 'smooth' }); })
    }

    // Always wait one more scroll_delay at the end for scroll animations / loading / rendering to settle down
    console.log('[📉] Reached bottom of the page.', `(${scroll_position}/${last_height})`)
    await wait(scroll_delay)
    await page.evaluate(() => { window.scrollTo({ top: 0, left: 0, behavior: 'smooth' }); })
    await wait(scroll_delay)

    return last_height
  }
  /**
   * Disable CSS animations and transitions (and hide the text caret) by
   * injecting an overriding stylesheet into the current page and into every
   * document the page loads afterwards.
   *
   * The new-document script can run before the document has a <head>, so it
   * appends to whichever of <head> / <html> exists, or waits for
   * DOMContentLoaded when neither does. The previous version dereferenced
   * `getElementsByTagName('head')[0]` unconditionally and threw in that case.
   *
   * @param page puppeteer page to patch
   * @param _page_state unused
   */
  async function disableAnimations(page, _page_state) {
    console.log(`[⛄️] Disabling all animations using CSS override...`)

    // https://stackoverflow.com/questions/53167644/injecting-css-into-site-with-puppeteer
    const css_override = `*, *::before, *::after {
      -moz-animation: none !important;
      -moz-transition: none !important;
      animation: none !important;
      transition: none !important;
      caret-color: transparent !important;
    }`

    // inject override into current page
    await page.addStyleTag({content: css_override})

    // inject override into any subsequently navigated pages
    await page.evaluateOnNewDocument((css_override) => {
      const injectStyle = () => {
        const style_tag = document.createElement('style')
        style_tag.type = 'text/css'
        style_tag.textContent = css_override
        const parent = document.head || document.documentElement
        parent.appendChild(style_tag)
      }

      if (document.head || document.documentElement) {
        injectStyle()
      } else {
        // nothing to attach to yet: retry once the document has been parsed
        document.addEventListener('DOMContentLoaded', injectStyle, {once: true})
      }
    }, css_override)
  }
  /**
   * Expand collapsed content on the page: open <details> sections, then keep
   * clicking "load more" / "show replies" style links until none are left or a
   * limit is reached. If anything was expanded, scroll to the bottom and back up.
   *
   * Fixes over the previous version:
   *   - `$$eval` passes an ARRAY of elements to its callback, so `.open = true`
   *     was being set on the array and no <details> element was ever opened
   *   - when every candidate link was skipped (slot == 'children') the in-page
   *     loop never clicked, never waited and never exited; it now stops after a
   *     pass that clicks nothing
   *   - the temporary 'hashchange' listener is removed even if the in-page
   *     script throws
   *
   * @param page puppeteer page to expand
   * @param _page_state unused
   * @param options.timeout max accumulated click delay in ms (default: 120000)
   * @param options.limit max number of links to click (default: 15000)
   * @param options.delay ms to wait after each click (default: 650)
   */
  async function expandComments(page, _page_state, {timeout=120_000, limit=15_000, delay=650}={}) {
    console.log(`[🗃️] Expanding up to ${limit} comments every ${delay}ms...`)

    // expand all <details> sections in Github READMEs, HedgeDoc pages, etc.
    const openAll = (elems) => { for (const elem of elems) { elem.open = true } }
    await page.$$eval('pierce/article details', openAll)                                       // expand Github README details sections
    await page.$$eval('pierce/div.js-discussion details:not(.details-overlay)', openAll)       // expand Github issue discussion hidden comments
    await page.$$eval('pierce/.markdown-body details', openAll)                                // expand HedgeDoc Markdown details sections

    // forward in-page hashchange events to node so navigation attempts caused by clicks get logged
    await page.exposeFunction('onHashChange', url => page.emit('hashchange', url))
    await page.evaluateOnNewDocument(() => {
      // @ts-ignore
      addEventListener('hashchange', (e) => onHashChange(location.href))
    })

    // Listen for hashchange events in node Puppeteer code.
    page.on('hashchange', url => console.log('Page tried to navigate to:', new URL(url)))

    let num_expanded = 0
    try {
      num_expanded = await page.evaluate(async ({timeout, limit, delay}) => {
        function getElementsByXPath(xpath, ctx = document) {
          const results = []
          const xpathResult = document.evaluate(
            xpath,                  // e.g. //*[text()='"+text+"']
            ctx,
            null,
            XPathResult.ORDERED_NODE_ITERATOR_TYPE,
            null
          )
          let node
          while ((node = xpathResult.iterateNext()) != null) {
            results.push(node)
          }
          return results
        }

        let num_expanded = 0
        const getLoadMoreLinks = () => [
          // find all the buttons/links to expand collapsed/hidden/lazy-loaded content
          ...document.querySelectorAll('faceplate-partial[loading=action]'),     // new reddit
          ...document.querySelectorAll('a[onclick^="return morechildren"]'),     // old reddit show more replies
          ...document.querySelectorAll('a[onclick^="return togglecomment"]'),    // old reddit show hidden replies
          // ...document.querySelectorAll('a.js-show-link'),                     // stack overflow comments show more (TODO: make this only work on SO)
          // ...document.querySelectorAll('a.morelink'),                         // HackerNews profile show more (TODO: make this only work on HN)
          // ...getElementsByXPath("//*[text()~='View \d+ replies']"),           // facebook comment expander
          ...getElementsByXPath("//*[text()='Show more replies']"),              // twitter infiniscroll expander
          ...getElementsByXPath("//*[text()='Show replies']"),                   // twitter replies expander
        ]
        const wait = (ms) => new Promise(res => setTimeout(res, ms))

        let load_more_links = getLoadMoreLinks()
        while (load_more_links.length) {
          console.log('Expanding comments...', load_more_links.length)
          let clicked_this_pass = 0

          for (const link of load_more_links) {
            link.scrollIntoView({behavior: 'smooth'})
            if (link.slot == 'children') {
              continue
              // patch new reddit "More replies" links that would open in a new window to display inline instead
              // const comment_id = link.src.split('?')[0].split('/').at(-1)
              // link.slot = `children-${comment_id}-0`
              // link.__alwaysShowSlot = false
            }
            // click the "More replies" button
            link.click()
            num_expanded++
            clicked_this_pass++
            await wait(delay)
            const time_elapsed = num_expanded * delay
            if ((num_expanded > limit) || (time_elapsed > timeout)) {
              return num_expanded
            }
          }

          // every remaining link was skipped: another pass would find the same links again
          // and spin forever without awaiting anything, so stop here
          if (!clicked_this_pass) break

          load_more_links = getLoadMoreLinks()
        }
        return num_expanded
      }, {timeout, limit, delay})
    } finally {
      page.off('hashchange')
    }

    if (num_expanded) {
      console.log(`[🗃️] Expanded ${num_expanded} comments...`)

      // scroll to bottom, then back up to top
      const final_height = await page.evaluate('document.body.scrollHeight')
      await page.evaluate((top) => { window.scrollTo({ top, left: 0, behavior: 'smooth' }); }, final_height + 1000)
      await wait(delay)
      await page.evaluate(() => { window.scrollTo({ top: 0, left: 0, behavior: 'smooth' }); })
      await wait(delay)
    }
  }
  2565. async function submitForm(page, _page_state, {timeout=5_000}={}) {
  2566. try {
  2567. await page.waitForSelector('form button[type=submit]', {timeout: 1_500});
  2568. console.log('[☑️] Submitting form...')
  2569. await page.click('form button[type=submit]')
  2570. await page.waitForNavigation({timeout});
  2571. await page.goBack();
  2572. } catch (err) {
  2573. // no form found
  2574. }
  2575. }
  2576. // TODO: add an evasion to set navigator.connection.rtt = 365 (0 = detectable as headless)
  2577. /******************************************************************************/
  2578. /******************************************************************************/
  2579. /**************** Extension-Based Archive Output Tasks ************************/
  2580. async function saveSinglefile(page, {main_response, extensions}) {
  2581. const extension = extensions.filter(({name}) => name === 'singlefile')[0]
  2582. if (!extension.version) throw 'Could not find Singlefile extension ID, is it installed?'
  2583. const url = await page.url() || main_response.url()
  2584. if (URL_SCHEMES_IGNORED.includes(url.split(':')[0])) return null
  2585. // get list of existing past files in downloads/* to ignore
  2586. const files_before = new Set(
  2587. (await fs.promises.readdir(CHROME_DOWNLOADS_DIR))
  2588. .filter(fn => fn.endsWith('.html'))
  2589. );
  2590. const out_path = SINGLEFILE_PATH(page)
  2591. console.log(`[🛠️] Saving Singlefile HTML using extension (${extension.id})...`.padEnd(82+1), prettyPath(CHROME_DOWNLOADS_DIR))
  2592. await page.bringToFront() // action button acts on the foreground tab, so it has to be in front :(
  2593. await extension.dispatchAction()
  2594. let files_new = []
  2595. const check_delay = 3_000
  2596. for (const _try in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) {
  2597. await wait(check_delay)
  2598. const files_after = (await fs.promises.readdir(CHROME_DOWNLOADS_DIR)).filter(fn => fn.endsWith('.html'));
  2599. files_new = files_after.filter(file => !files_before.has(file))
  2600. if (files_new.length == 0) {
  2601. // console.warn(` ...waiting for Singlefile to write HTML into ${CHROME_DOWNLOADS_DIR}...`)
  2602. continue
  2603. }
  2604. // iterate through new downloads and find a matching .html containing our page's URL in the header
  2605. for (const file of files_new) {
  2606. const dl_path = path.join(CHROME_DOWNLOADS_DIR, file)
  2607. const dl_text = await fs.promises.readFile(dl_path, 'utf-8')
  2608. const dl_header = dl_text.split('meta charset')[0]
  2609. if (dl_header.includes(`url: ${url}`)) {
  2610. /// dont need this check anymore as now all output is versioned:
  2611. // if (fs.existsSync(out_path)) {
  2612. // const {size: existingSize} = await fs.promises.stat(out_path)
  2613. // const {size: newFileSize} = await fs.promises.stat(dl_path)
  2614. // if (newFileSize < existingSize) {
  2615. // console.log(`[🗑️] Discarding singlefile output (${file}) as it's smaller than existing ${out_path}...`)
  2616. // await fs.promises.rm(dl_path)
  2617. // return out_path
  2618. // }
  2619. // }
  2620. console.log(`[✍️] Moving Singlefile download from ${file}...`.padEnd(82), prettyPath(out_path))
  2621. await fs.promises.rename(dl_path, out_path)
  2622. return out_path
  2623. }
  2624. }
  2625. }
  2626. console.warn(`[❌] Couldn't find matching Singlefile HTML in ${CHROME_DOWNLOADS_DIR} after waiting ${(check_delay*10)/1000}s:`, files_new.join(', '))
  2627. return null
  2628. }
  2629. async function saveArchiveWebPage(page, {extensions}, {timeout=30_000}={}) {
  2630. // TODO: waiting on them to expose commands so we can generate .wacz easily
  2631. // https://github.com/webrecorder/archiveweb.page/issues/207
  2632. // ...
  2633. const browser = await page.browser()
  2634. const extension = extensions.filter(({name}) => name === 'archivewebpage')[0]
  2635. await page.bringToFront()
  2636. await extension.dispatchPopup()
  2637. await extension.dispatchAction()
  2638. const popup = await browser.waitForTarget(
  2639. target => target.url().toString().startsWith(`chrome-extension://${extension.id}/popup.html`),
  2640. {timeout: 5_000},
  2641. )
  2642. await page.bringToFront()
  2643. // await puppeteer.Locator.race([
  2644. // popup.locator('::-p-aria(Start With Autopilot)'),
  2645. // popup.locator('wr-popup-viewer >>>> input'),
  2646. // popup.locator(':scope >>> input')
  2647. // ])
  2648. // .setTimeout(timeout)
  2649. // .click({
  2650. // offset: {
  2651. // x: 7.7265625,
  2652. // y: 7.203125,
  2653. // },
  2654. // });
  2655. // @ts-ignore
  2656. await puppeteer.Locator.race([
  2657. popup.locator('wr-popup-viewer >>>> div.status-row > p'),
  2658. popup.locator(':scope >>> div.status-row > p'),
  2659. popup.locator('::-p-text(Recording: \n)')
  2660. ]).setTimeout(timeout).click({
  2661. delay: 733.3000000007451,
  2662. offset: {
  2663. x: 293,
  2664. y: 13.5,
  2665. },
  2666. })
  2667. await wait(8_000)
  2668. // @ts-ignore
  2669. await puppeteer.Locator.race([
  2670. popup.locator('wr-popup-viewer >>>> div:nth-of-type(2) > button > span:nth-of-type(2)'),
  2671. popup.locator(':scope >>> div:nth-of-type(2) > button > span:nth-of-type(2)'),
  2672. popup.locator('::-p-text(Stop)')
  2673. ]).setTimeout(timeout).click({
  2674. offset: {
  2675. x: 7.859375,
  2676. y: 23.203125,
  2677. },
  2678. });
  2679. return null
  2680. }
  2681. async function savePocket(page, {extensions}) {
  2682. const browser = await page.browser()
  2683. const extension = extensions.filter(({name}) => name === 'pocket')[0]
  2684. if (!extension.version) throw 'Could not find Pocket extension ID, is it installed?'
  2685. console.log(`[🛠️] Saving URL to Pocket API using extension (${extension.id})...`, 'https://getpocket.com/saves')
  2686. await page.bringToFront() // action button acts on the foreground tab, so it has to be in front
  2687. await extension.dispatchAction()
  2688. try {
  2689. const login_window = await browser.waitForTarget(
  2690. target => target.url().toString().startsWith('https://getpocket.com/'),
  2691. {timeout: 3_000},
  2692. )
  2693. // login window will open if pocket is not signed-in
  2694. if (login_window) return false
  2695. } catch(e) {
  2696. // no new window should open if it saves correctly
  2697. return true
  2698. }
  2699. }
  2700. /***************** Synchronous Archive Output Tasks ***************************/
  2701. async function saveScreenrecording(page, page_state, {save_gif=true}={}) {
  2702. if (page_state.recorder) {
  2703. const duration = Date.now() - page_state.start_ts
  2704. console.log(`[🎥] Saving screen-recording video (${duration/1000}s)...`.padEnd(82), prettyPath(SCREENRECORDING_PATH(page)))
  2705. const recorder = page_state.recorder
  2706. page_state.recorder = null
  2707. await recorder.stop()
  2708. // create symlink for legacy path
  2709. const snap_dir = page_state.snapshot_dir
  2710. const legacy_path = path.join(snap_dir, 'media', 'screenrecording.mp4')
  2711. await overwriteSymlink(SCREENRECORDING_PATH(page), legacy_path, {relative: snap_dir, search_limit: snap_dir})
  2712. // // remove duplicate frames (white frames at start while it loads + static image at end)
  2713. // const video_path = SCREENRECORDING_PATH(page)
  2714. // const short_path = video_path.replace('.mp4', '.short.mp4')
  2715. // try {
  2716. // await exec(
  2717. // // create a shortened video starting from 0:02s to 0:01s with duplicate frames removed (can look jumpy sometimes)
  2718. // `ffmpeg -ss 2 -sseof -1 -y -i ${video_path} -vf mpdecimate,setpts=N/FRAME_RATE/TB ${short_path}`
  2719. // )
  2720. // } catch(err) {
  2721. // console.log('[❌] Failed to shorten screenrecording.mp4')
  2722. // }
  2723. // convert video to GIF
  2724. if (save_gif) {
  2725. try {
  2726. const BIN_NAME = '/Volumes/NVME/Users/squash/bin/ffmpeg'
  2727. const child = child_process.spawn(
  2728. BIN_NAME,
  2729. [
  2730. '-hide_banner',
  2731. '-loglevel', 'error',
  2732. '-ss', '3',
  2733. '-t', '10',
  2734. '-y',
  2735. '-i', SCREENRECORDING_PATH(page),
  2736. '-vf', "fps=10,scale=1024:-1:flags=bicubic,split[s0][s1];[s0]palettegen[p];[s1][p]paletteuse",
  2737. '-loop', '0',
  2738. SCREENRECORDGIF_PATH(page),
  2739. ],
  2740. {
  2741. cwd: path.dirname(SCREENRECORDING_PATH(page)),
  2742. timeout: 60_000,
  2743. // stdio: [null, 'pipe', 'pipe'],
  2744. stdio: 'ignore',
  2745. detached: true, // run in background, don't block on response
  2746. },
  2747. )
  2748. await blockUntilExists(SCREENRECORDGIF_PATH(page), {min_bytes: 100, timeout: 40_000})
  2749. console.log(`[🎥] Saved screen-recording GIF with ffmpeg pid=${child.pid} (${duration/1000}s)...`.padEnd(82), prettyPath(SCREENRECORDGIF_PATH(page)))
  2750. const snap_dir = page_state.snapshot_dir
  2751. const legacy_path = path.join(snap_dir, 'media', 'screenrecording.gif')
  2752. await overwriteSymlink(SCREENRECORDGIF_PATH(page), legacy_path, {relative: snap_dir, search_limit: snap_dir})
  2753. } catch(err) {
  2754. console.log('[❌] Failed to convert video to GIF:', err)
  2755. }
  2756. }
  2757. return SCREENRECORDING_PATH(page)
  2758. }
  2759. return null
  2760. }
  2761. async function saveScreenshot(page, _page_state, {aspect_ratio=SCREENSHOT_ASPECT_RATIO, width=null, height=null, jpg_width=1440, jpg_quality=90, timeout=30_000}={}) {
  2762. try {await fs.promises.unlink(SCREENSHOT_PATH(page))} catch(err) {}
  2763. // setup width and height
  2764. width = width || DEFAULT_VIEWPORT.width
  2765. assert((typeof width === 'number') && width > 200)
  2766. height = height || Math.floor(width/aspect_ratio)
  2767. assert((typeof height === 'number') && height > 200)
  2768. console.log(`[📸] Saving full-page screenshot (${width}x${height}px)...`.padEnd(82), prettyPath(SCREENSHOT_PATH(page)))
  2769. // set width, height, and deviceScale factor: https://github.com/puppeteer/puppeteer/issues/1576
  2770. await page.setViewport({ ...DEFAULT_VIEWPORT, width, height, deviceScaleFactor: 2})
  2771. await page.bringToFront()
  2772. await wait(1_250) // page takes a sec settle after foregrounding and viewport update
  2773. // take lossless fullpage screenshot of 1920x1440+px (4:3+) -> ./screenshot.png
  2774. await page.screenshot({ path: SCREENSHOT_PATH(page), fullPage: true, type: 'png' })
  2775. // wait for the screenshot to be created, then set the viewport to the next size
  2776. await blockUntilExists(SCREENSHOT_PATH(page), {min_bytes: 100, timeout})
  2777. await wait(6_000) // puppeteer takes a while to finish writing png data when fullPage: true
  2778. const jpg_height = Math.floor(jpg_width/aspect_ratio)
  2779. await page.setViewport({ ...DEFAULT_VIEWPORT, width: jpg_width, height: jpg_height, deviceScaleFactor: 2})
  2780. await wait(1_250) // page takes a sec settle after foregrounding and viewport update
  2781. // WARNING: make sure you never try to create two screenshots at the same time (especially not fullpage screenshots)
  2782. // thats why there are all these delays here.
  2783. // screenshot creation messes up the whole viewport while it's running,
  2784. // and it writes bad/white empty screenshots if you try to make more than one concurrently
  2785. // take compressed screenshot of jpg_width*jpg_height (4:3) -> ./screenshot.jpg
  2786. await page.screenshot({
  2787. path: SCREENSHOT_JPG_PATH(page),
  2788. type: 'jpeg',
  2789. quality: jpg_quality,
  2790. clip: {
  2791. x: 0,
  2792. y: 0,
  2793. width: jpg_width,
  2794. height: jpg_height,
  2795. },
  2796. captureBeyondViewport: false,
  2797. });
  2798. await blockUntilExists(SCREENSHOT_JPG_PATH(page), {min_bytes: 100, timeout: timeout/2})
  2799. console.log(`[📸] Saved screenshot as screenshot.jpg (${jpg_width}x${jpg_height}px)...`.padEnd(82), prettyPath(SCREENSHOT_JPG_PATH(page)))
  2800. // reset viewport back to defaults
  2801. await wait(1_250)
  2802. await page.setViewport(DEFAULT_VIEWPORT)
  2803. // ALTERNATIVE METHOD based on cropping fullpage png and converting to jpg manually:
  2804. // import {PNG} from 'pngjs';
  2805. // import jpeg from 'jpeg-js';
  2806. // setTimeout(async () => {
  2807. // try {
  2808. // const screenshot_png = SCREENSHOT_PATH(page);
  2809. // const screenshot_jpg = SCREENSHOT_JPG_PATH(page)
  2810. // const jpg_max_height = height
  2811. // const jpg_quality = quality; // Adjust the quality as needed (0-100)
  2812. // fs.createReadStream(screenshot_png)
  2813. // .pipe(new PNG())
  2814. // .on('parsed', function () {
  2815. // const width = this.width;
  2816. // const height = this.height;
  2817. // let cropped_height = height;
  2818. // if (height > jpg_max_height) {
  2819. // cropped_height = jpg_max_height;
  2820. // }
  2821. // const cropped_bytes = new Uint8Array(width * cropped_height * 4);
  2822. // for (let y = 0; y < cropped_height; y++) {
  2823. // for (let x = 0; x < width; x++) {
  2824. // const idx = (width * y + x) << 2;
  2825. // cropped_bytes[idx] = this.data[idx];
  2826. // cropped_bytes[idx + 1] = this.data[idx + 1];
  2827. // cropped_bytes[idx + 2] = this.data[idx + 2];
  2828. // cropped_bytes[idx + 3] = this.data[idx + 3];
  2829. // }
  2830. // }
  2831. // const jpeg_obj = {
  2832. // data: cropped_bytes,
  2833. // width: width,
  2834. // height: cropped_height,
  2835. // };
  2836. // const jpeg_bytes = jpeg.encode(jpeg_obj, jpg_quality);
  2837. // fs.writeFileSync(screenshot_jpg, jpeg_bytes.data);
  2838. // console.log(`[📸] Saved screenshot as screenshot.jpg (${width}x${jpg_max_height}px)...`.padEnd(82), prettyPath(SCREENSHOT_JPG_PATH(page)))
  2839. // });
  2840. // } catch(err) {
  2841. // console.error('[X] Error while generating JPG screenshot', SCREENSHOT_JPG_PATH(page), err)
  2842. // }
  2843. // }, DELAY_BEFORE_JPG_CONVERSION)
  2844. // ALTERNATIVE METHOD TO WRITE SCREENSHOT JPG:
  2845. // await wait(5_000) // puppeteer takes a while to finish writing png data when fullPage: true
  2846. // if ((await page.evaluate('document.body.scrollHeight')) > max_height) {
  2847. // // if page exceeds max_height, save additional cropped screenshot as screenshot.top.png
  2848. // // (needed b.c. uncropped screenshot may have insane 1:20+ aspect ratio that is hard to use elsewhere)
  2849. // await page.screenshot({ path: SCREENSHOT_JPG_PATH(page), type: 'jpg', quality: 100})
  2850. // await wait(1_000) // page takes a sec settle after a screenshot
  2851. // }
  2852. return SCREENSHOT_PATH(page)
  2853. }
  2854. async function savePDF(page, _page_state, {timeout=30_000}={}) {
  2855. const url = page.url() || 'about:blank'
  2856. if (URL_SCHEMES_IGNORED.includes(url.split(':')[0])) return null
  2857. const out_path = PDF_PATH(page)
  2858. console.log(`[📓] Saving print-as-PDF export...`.padEnd(82), prettyPath(out_path))
  2859. await page.bringToFront()
  2860. try {await fs.promises.unlink(PDF_PATH(page))} catch(err) {}
  2861. // await page.emulateMediaType('screen') // print as "@media(screen) instead of @media(print)"
  2862. // page.createPDFStream lets us to save larger PDFs than page.pdf() before crashing
  2863. // (streams to disk in chunks instead of all at once)
  2864. const pdf_stream = await page.createPDFStream({
  2865. timeout: timeout,
  2866. printBackground: true,
  2867. outline: true,
  2868. tagged: true,
  2869. format: 'A4',
  2870. displayHeaderFooter: false,
  2871. // margin: { top: '0.5cm', right: '1cm', bottom: '0.8cm', left: '1cm' },
  2872. })
  2873. const reader = pdf_stream.getReader()
  2874. // iterate through reader and append chunks to out_path
  2875. await fs.promises.rm(out_path, {force: true})
  2876. let num_bytes = 0
  2877. let error = '0 bytes written'
  2878. try {
  2879. while (true) {
  2880. const {done, value} = await reader.read()
  2881. if (done) break;
  2882. await fs.promises.appendFile(out_path, value)
  2883. num_bytes += value.length;
  2884. }
  2885. } catch(error) {
  2886. num_bytes = 0
  2887. }
  2888. if (!num_bytes) {
  2889. console.warn('[❌] Failed to save PDF', JSON.stringify(error, null, 4))
  2890. await fs.promises.rm(out_path, {force: true})
  2891. return null
  2892. }
  2893. return out_path
  2894. }
  2895. async function inlineShadowDOM(page, _page_state, {limit=100_000}={}) {
  2896. console.log(`[😎] Replacing Shadow DOM elements with inline HTML...`)
  2897. try {
  2898. const num_replaced = await page.evaluate((limit) => {
  2899. let num_replaced = 0
  2900. // Returns HTML of given shadow DOM.
  2901. const getShadowDomHtml = (shadowRoot) => {
  2902. let shadowHTML = '';
  2903. for (const el of shadowRoot.childNodes) {
  2904. shadowHTML += el.nodeValue || el.outerHTML;
  2905. }
  2906. return shadowHTML;
  2907. };
  2908. // Recursively replaces shadow DOMs with their HTML.
  2909. const replaceShadowDomsWithHtml = (rootElement) => {
  2910. if (num_replaced > limit) return
  2911. for (const el of rootElement.querySelectorAll('*')) {
  2912. if (el.shadowRoot) {
  2913. replaceShadowDomsWithHtml(el.shadowRoot);
  2914. el.innerHTML += getShadowDomHtml(el.shadowRoot);
  2915. }
  2916. }
  2917. num_replaced++
  2918. };
  2919. replaceShadowDomsWithHtml(document.body);
  2920. return num_replaced
  2921. }, limit)
  2922. // console.log(' √ replaced', num_replaced, 'Shadow DOM trees')
  2923. } catch(err) {
  2924. console.log('[⚠️] Inlining Shadow DOM failed', err)
  2925. }
  2926. }
  2927. async function saveAIQualityAssuranceResult(page, {original_url, version}) {
  2928. console.log(`[🧠] Analyzing screenshot with GPT-4o for QA checks...`.padEnd(82), prettyPath(AIQA_PATH(page)))
  2929. let screenshot_path = SCREENSHOT_PATH(page)
  2930. const screenshot_cropped_path = SCREENSHOT_JPG_PATH(page)
  2931. if (fs.existsSync(screenshot_cropped_path)) {
  2932. // screenshot is too tall to pass to openai, send cropped version instead
  2933. screenshot_path = screenshot_cropped_path
  2934. }
  2935. try {
  2936. await blockUntilExists(screenshot_path, {min_bytes: 100, timeout: 7_500})
  2937. } catch (err) {
  2938. console.warn('[❌] Failed to send screenshot to GTP-4o for analysis, no screenshot.{png,jpg} exists', err)
  2939. return null
  2940. }
  2941. var stdout = ''
  2942. var stderr = ''
  2943. let result = null
  2944. const PYTHON_BIN = path.join(__dirname, '.venv/bin/python')
  2945. const SCRIPT_PATH = path.join(__dirname, 'ai_qa.py')
  2946. await blockUntilExists(PYTHON_BIN, {min_bytes: 1, timeout: 250})
  2947. await blockUntilExists(SCRIPT_PATH, {min_bytes: 1, timeout: 250})
  2948. try {
  2949. var {stdout, stderr} = await exec(
  2950. `${PYTHON_BIN} ${SCRIPT_PATH} --attach '${screenshot_path}'`
  2951. )
  2952. result = JSON.parse(stdout.toString())
  2953. if (!result) throw 'Got empty result!'
  2954. result = {
  2955. TYPE: 'aiqa',
  2956. VERSION: version,
  2957. URL: original_url,
  2958. ...result,
  2959. }
  2960. } catch(parse_err) {
  2961. console.warn('[❌] Failed to get OpenAI analysis for screenshot.png', parse_err, stderr)
  2962. }
  2963. if (!(result || stdout)) {
  2964. return null
  2965. }
  2966. await overwriteFile(
  2967. AIQA_PATH(page),
  2968. result || stdout.toString(),
  2969. )
  2970. return result
  2971. }
  2972. async function saveYTDLP(page, {original_url, version}, {max_size='750m'}={}) {
  2973. console.log(`[🎥] Saving media with YT-DLP (<=${max_size})...`.padEnd(82), prettyPath(YTDLP_PATH(page)))
  2974. await fs.promises.mkdir(YTDLP_PATH(page), {recursive: true})
  2975. const cwd = YTDLP_PATH(page)
  2976. const bin_name = 'yt-dlp'
  2977. const timeout = 300_000 // 5min timeout
  2978. const args = [
  2979. '--restrict-filenames',
  2980. '--trim-filenames', '128',
  2981. '--write-description',
  2982. '--write-info-json',
  2983. '--write-annotations',
  2984. '--write-thumbnail',
  2985. '--no-call-home',
  2986. '--write-sub',
  2987. '--write-auto-subs',
  2988. '--convert-subs=srt',
  2989. '--yes-playlist',
  2990. '--continue',
  2991. '--no-abort-on-error',
  2992. '--ignore-errors',
  2993. '--geo-bypass',
  2994. '--add-metadata',
  2995. `--format=(bv*+ba/b)[filesize<=${max_size}][filesize_approx<=?${max_size}]/(bv*+ba/b)`,
  2996. '--no-check-certificate',
  2997. '--no-progress',
  2998. // `--cookies=${COOKIES_TXT_PATH}`, // using logged in cookies actually makes it fail more often, not sure why
  2999. original_url,
  3000. ]
  3001. const {getResult, ...exec_info} = await saveExecResult(bin_name, args, {original_url, version}, {cwd, timeout})
  3002. return {getResult, ...exec_info}
  3003. }
  3004. async function saveGALLERYDL(page, {original_url, version}) {
  3005. console.log(`[🎥] Saving photos with gallery-dl...`.padEnd(82), prettyPath(GALLERYDL_PATH(page)))
  3006. await fs.promises.mkdir(GALLERYDL_PATH(page), {recursive: true})
  3007. const cwd = GALLERYDL_PATH(page)
  3008. const bin_name = 'gallery-dl'
  3009. const timeout = 300_000 // 5min timeout
  3010. const args = [
  3011. '--verbose',
  3012. '--write-metadata',
  3013. '--write-infojson',
  3014. '--write-tags',
  3015. '--sleep=1.5-2.5',
  3016. `--cookies=${COOKIES_TXT_PATH}`,
  3017. // '--no-check-certificate',
  3018. // `--directory=media`,
  3019. original_url,
  3020. ]
  3021. const {getResult, ...exec_info} = await saveExecResult(bin_name, args, {original_url, version}, {cwd, timeout})
  3022. return {getResult, ...exec_info}
  3023. }
  3024. // async function saveWget(page, {original_url, version}) {
  3025. // console.log(`[⎒] Saving wget site clone...`.padEnd(82), prettyPath(WGET_PATH(page)))
  3026. // const args = [
  3027. // // ...
  3028. // ]
  3029. // spawn(
  3030. // 'wget',
  3031. // [
  3032. // ...args,
  3033. // original_url,
  3034. // ],
  3035. // {
  3036. // cwd: WGET_PATH(page),
  3037. // detached: true, // run in background, don't block on response
  3038. // stdio: 'ignore',
  3039. // timeout: 300_000, // 5min timeout
  3040. // },
  3041. // )
  3042. // return {path: WGET_PATH(page)}
  3043. // }
  3044. /**************** Asynchronous Archive Output Tasks ***************************/
  3045. type FaviconCandidate = {
  3046. url: string,
  3047. basename: string,
  3048. extension: string,
  3049. expected_mimetype: string,
  3050. }
  3051. const faviconFromDomain = (url) => {
  3052. // https://auth:[email protected]:1234/a/bc123 -> https://auth:[email protected]:1234/favicon.ico
  3053. const url_origin = (new URL(url)).origin
  3054. return {
  3055. url: url_origin ? `${url_origin}/favicon.ico` : null,
  3056. basename: 'favicon',
  3057. extension: undefined, // auto-detect extension at download time in case it redirects us to a png
  3058. expected_mimetype: 'image/', // only accept image/* to avoid saving html/txt error reponses as icon
  3059. } as FaviconCandidate
  3060. }
  3061. const faviconFromGoogle = (url, size=256) => {
  3062. // https://auth:[email protected]:1234/a/bc123 -> https://www.google.com/s2.favicons?domain=t.co
  3063. const domain = url && (new URL(url)).hostname
  3064. return {
  3065. url: domain?.includes('.') ? `https://www.google.com/s2/favicons?sz=${size},domain=${domain}` : null,
  3066. basename: 'google_favicon',
  3067. extension: 'png',
  3068. expected_mimetype: 'image/png', // google always provides PNGs in response
  3069. } as FaviconCandidate
  3070. }
  3071. const faviconFromHtml = async (page) => {
  3072. // <link rel="icon" src="https://example.com/static/images/favicon.png"/> -> https://example.com/static/images/favicon.png
  3073. let url
  3074. try {
  3075. url = await page.$eval('link[rel*="icon"]', (elem) => elem?.href)
  3076. if (!url || !url.includes('://'))
  3077. url = null
  3078. } catch(err) {
  3079. url = null
  3080. // console.warn('Failed to find favicon tag in html', JSON.stringify(err, null, 4))
  3081. }
  3082. return {
  3083. url,
  3084. basename: 'favicon',
  3085. extension: undefined, // auto-detect extension at download time
  3086. expected_mimetype: 'image/', // accept any image/* mimetype at download time
  3087. } as FaviconCandidate
  3088. }
  3089. type FaviconResult = {
  3090. url: string,
  3091. num_bytes: number,
  3092. abspath?: string,
  3093. dir?: string,
  3094. filename?: string,
  3095. mimeType?: string,
  3096. }
  3097. async function saveFavicon(page, {original_url, main_response, version}) {
  3098. const dir = path.dirname(FAVICON_PATH(page))
  3099. const response_url = main_response?.url()
  3100. const favicon_downloads_to_try: {[key: string]: FaviconCandidate} = unique([
  3101. await faviconFromHtml(page),
  3102. faviconFromDomain(response_url),
  3103. faviconFromDomain(original_url),
  3104. faviconFromGoogle(response_url),
  3105. faviconFromGoogle(original_url),
  3106. ].filter(({url}) => url), 'url')
  3107. const browser = await page.browser()
  3108. // let logs = []
  3109. // let errors = []
  3110. let output_files: {[key: string]: FaviconResult} = {}
  3111. for (const download_options of Object.values(favicon_downloads_to_try)) {
  3112. let result: FaviconResult = {num_bytes: 0, url: download_options.url}
  3113. // {url, num_bytes, abspath, dir, filename, basename, extension, mimeType}
  3114. try {
  3115. // try getting it with node-fetch first
  3116. const response = await fetch(download_options.url) as Response
  3117. const file_options = await detectFilename({...download_options, response, dir})
  3118. if (response.headers.get("content-length")) {
  3119. const favicon_stream = Readable.fromWeb(response.body as any)
  3120. await overwriteFile(file_options.abspath, favicon_stream)
  3121. result = {
  3122. ...file_options,
  3123. num_bytes: parseInt(response.headers.get("content-length") || '0'),
  3124. mimeType: response.headers.get("content-type"),
  3125. }
  3126. } else {
  3127. throw 'Failed to download favicon with fetch()'
  3128. }
  3129. } catch(err) {
  3130. // console.warn('[!] Failed to get favicon with node-fetch', err)
  3131. // fallback to getting it by opening a new browser tab
  3132. result = await download({...download_options, browser, dir, page})
  3133. }
  3134. // logs.push(...(result.logs || []))
  3135. // errors.push(...(result.errors || []))
  3136. if (result.num_bytes) {
  3137. console.log(`[🌠] Saving page favicon (${result.url.substring(0, 35)}... ${result.mimeType})...`.padEnd(82), prettyPath(result.abspath))
  3138. output_files[result.filename] = result
  3139. break // break here stops after the first successful download, comment out to keep going instead
  3140. }
  3141. }
  3142. const output_file = Object.values(output_files).sort(file => file.num_bytes).at(-1)
  3143. const favicon_info = {
  3144. TYPE: 'favicon',
  3145. VERSION: version,
  3146. URL: original_url,
  3147. succeeded: !!output_file,
  3148. // stdout: JSON.stringify(logs),
  3149. // stderr: JSON.stringify(errors),
  3150. favicon_url: output_file?.url,
  3151. favicon_urls: Object.keys(favicon_downloads_to_try),
  3152. favicon_files: Object.keys(output_files).map(fname => fname.replace(dir, '.')),
  3153. favicon_filename: output_file?.filename,
  3154. favicon_num_bytes: output_file?.num_bytes,
  3155. }
  3156. await overwriteFile(FAVICON_PATH(page), favicon_info)
  3157. return favicon_info
  3158. }
  3159. async function saveTitle(page, {original_url, version}) {
  3160. const title_from_browser = (await page.title()) || null
  3161. const title_from_js = await page.evaluate(() => document?.title || null)
  3162. const title_from_html = await page.evaluate(() => document?.querySelector('title')?.innerText || null)
  3163. const title_from_og = await page.evaluate(() => document?.querySelector('meta[property="og:title"]')?.getAttribute('content') || null)
  3164. // best guess at best title = longest title
  3165. const title = ([title_from_html, title_from_og, title_from_js, title_from_browser]
  3166. .filter(title => title)
  3167. .sort((a, b) => b.length - a.length)[0] || '')
  3168. .replaceAll('\n', ' ')
  3169. if (title?.length) {
  3170. console.log(`[📗] Saving page title (${title.substring(0, 40)})...`.padEnd(82), prettyPath(TITLE_PATH(page)))
  3171. await overwriteFile(TITLE_PATH(page), title)
  3172. }
  3173. const title_info = {
  3174. TYPE: 'title',
  3175. VERSION: version,
  3176. URL: original_url,
  3177. title,
  3178. title_from_html,
  3179. title_from_og,
  3180. title_from_js,
  3181. title_from_browser,
  3182. }
  3183. const title_json_path = TITLE_PATH(page).replace('.txt', '.json')
  3184. await overwriteFile(title_json_path, title_info)
  3185. return title_info
  3186. }
  3187. async function saveRaw(page, {main_response}) {
  3188. const response = main_response
  3189. if (!response) {
  3190. console.warn('[⚠️] Failed to save page RAW bytes, main_response is null', response)
  3191. }
  3192. const dir = RAW_PATH(page)
  3193. await fs.promises.mkdir(dir, {recursive: true})
  3194. const {url, abspath, mimeType} = await detectFilename({page, response, dir})
  3195. console.log(`[🔟] Saving raw response bytes (${mimeType})...`.padEnd(82), prettyPath(abspath))
  3196. await download({page, response, abspath})
  3197. return abspath
  3198. }
  3199. async function saveSourceMaps(page, {original_url, version}) {
  3200. console.log(`[🐛] Saving source maps to ./responses/all/*.{js,css}.map...`)
  3201. const response_index_path = path.join(RESPONSES_PATH(page), 'index.jsonl')
  3202. const response_index = await fs.promises.readFile(response_index_path, 'utf-8')
  3203. const urls_to_download = []
  3204. for (const response of response_index.split('\n')) {
  3205. try {
  3206. const {url, extension} = JSON.parse(response)
  3207. if (['css', 'js'].includes(extension?.toLowerCase())) {
  3208. urls_to_download.push(url + '.map')
  3209. }
  3210. } catch(err) { continue }
  3211. }
  3212. // TODO: fix this, it needs to both after stopSavingMetadata and before stopSavingMetadata
  3213. // fix is to use traffic_log to get response url list instead of waiting for index.jsonl to be created
  3214. await page.evaluate(async (urls_to_download) => {
  3215. const promises = []
  3216. for (const sourcemap_url in urls_to_download) {
  3217. promises.push(fetch(sourcemap_url))
  3218. }
  3219. return Promise.allSettled(promises)
  3220. }, urls_to_download)
  3221. return {
  3222. TYPE: 'sourcemaps',
  3223. URL: original_url,
  3224. VERSION: version,
  3225. sourcemaps: urls_to_download,
  3226. }
  3227. }
  3228. async function saveRequests(page, {original_url, version, traffic_log}) {
  3229. console.log(`[📼] Saving requests log (${Object.keys(traffic_log).length})...`.padEnd(82), prettyPath(REQUESTS_PATH(page)))
  3230. const requests_info = {
  3231. TYPE: 'requests',
  3232. VERSION: version,
  3233. URL: original_url,
  3234. requests: traffic_log,
  3235. }
  3236. await overwriteFile(REQUESTS_PATH(page), requests_info)
  3237. return requests_info
  3238. }
  /**
   * Record how the page got to its final URL: the URL as seen by the browser, by the
   * main request, by the main response, plus every HTTP redirect found in the traffic log.
   *
   * @param page - page whose current URL is read and whose REDIRECTS_PATH is written
   * @param original_url - recorded as URL in the output
   * @param main_response - main navigation response (may be null/undefined)
   * @param traffic_log - object keyed by request id holding the recorded Network.* events
   * @param redirects - redirects observed by the browser, stored as-is
   * @param version - recorded as VERSION in the output
   * @returns the redirects info object that was handed to overwriteFile
   */
  async function saveRedirects(page, {original_url, main_response, traffic_log, redirects, version}) {
      // the first request id without a '.' is treated as the main request
      // NOTE(review): presumably ids containing '.' belong to sub-requests -- confirm against the traffic logger
      const main_request_id = Object.keys(traffic_log).filter(id => !id.includes('.'))[0]
      const main_response_traffic = traffic_log[main_request_id] || {}

      const url_from_browser = await page.url() || null
      const url_from_request = (
          main_response?.request()?.url()
          || main_response_traffic['Network.requestWillBeSent']?.request?.url
          || null)
      // the Network.responseReceived event keeps its payload under `.response`
      // (the same field saveHeaders/saveSSL read), not `.main_response`
      const url_from_response = (
          main_response?.url()
          || main_response_traffic['Network.responseReceived']?.response?.url
          || null)

      const http_redirects =
          Object.values(traffic_log)
              .filter(event => event['Network.requestWillBeSent']?.redirectResponse)
              .map(event => event['Network.requestWillBeSent'])
              .map(requestWillBeSent => ({
                  url: requestWillBeSent.request.url,
                  src: requestWillBeSent.redirectResponse.url,
                  status: requestWillBeSent.redirectResponse.status,
                  loaderId: requestWillBeSent.loaderId,
                  requestId: requestWillBeSent.requestId,
                  wallTime: requestWillBeSent.wallTime,
                  initiator: requestWillBeSent.initiator,
                  isMainFrame: (requestWillBeSent.loaderId == main_request_id),
              }))

      // prefer the most authoritative URL available; new URL(null) would throw, so fall back to null
      const final_url = url_from_response || url_from_request || url_from_browser
      const url_parsed = final_url ? new URL(final_url) : null

      const redirects_info = {
          TYPE: 'redirects',
          VERSION: version,
          URL: original_url,
          url_parsed,
          url_from_request,
          url_from_response,
          url_from_browser,
          redirects_from_browser: redirects,
          redirects_from_http: http_redirects,
      }
      console.log(`[🔗] Saving page redirects log (${http_redirects.length})...`.padEnd(82), prettyPath(REDIRECTS_PATH(page)))
      await overwriteFile(REDIRECTS_PATH(page), redirects_info)
      return redirects_info
  }
  /**
   * Save the main request's and main response's headers, each merged with the
   * browser-added "ExtraInfo" headers recorded in the traffic log.
   * The file is only written when at least one header was found.
   *
   * @param page - page used to resolve HEADERS_PATH
   * @param original_url - recorded as URL in the output
   * @param version - recorded as VERSION in the output
   * @param traffic_log - object keyed by request id holding the recorded Network.* events
   * @returns the headers info object (returned even when nothing was written)
   */
  async function saveHeaders(page, {original_url, version, traffic_log}) {
      const [main_request_id] = Object.keys(traffic_log).filter(id => !id.includes('.'))
      const main_traffic = traffic_log[main_request_id] || {}

      // base request + headers the browser added on the wire (extra headers win on conflict)
      const base_request = main_traffic['Network.requestWillBeSent']?.request
      const extra_request_headers = main_traffic['Network.requestWillBeSentExtraInfo']?.headers || {}
      const request = {
          ...base_request,
          headers: {...base_request?.headers, ...extra_request_headers},
      }

      // base response + headers the browser added on the wire (extra headers win on conflict)
      const base_response = main_traffic['Network.responseReceived']?.response
      const extra_response_headers = main_traffic['Network.responseReceivedExtraInfo']?.headers || {}
      const response = {
          ...base_response,
          headers: {...base_response?.headers, ...extra_response_headers},
      }

      const headers_info = {
          TYPE: 'headers',
          VERSION: version,
          URL: original_url,
          request,
          response,
      }

      // count distinct header names across request + response
      const num_headers = Object.keys({...request.headers, ...response.headers}).length
      if (!num_headers) return headers_info

      console.log(`[👾] Saving main request & response headers (${num_headers})...`.padEnd(82), prettyPath(HEADERS_PATH(page)))
      await overwriteFile(HEADERS_PATH(page), headers_info)
      return headers_info
  }
  /**
   * Save the connection/security-related fields of the main response
   * (status, remote address, protocol, securityDetails, ...) picked from the traffic log.
   * The file is only written when at least one of those fields is present.
   *
   * @param page - page used to resolve SSL_PATH
   * @param original_url - recorded as URL in the output
   * @param version - recorded as VERSION in the output
   * @param traffic_log - object keyed by request id holding the recorded Network.* events
   * @returns the ssl info object (returned even when nothing was written)
   */
  async function saveSSL(page, {original_url, version, traffic_log}) {
      const [main_request_id] = Object.keys(traffic_log).filter(id => !id.includes('.'))
      const main_traffic = traffic_log[main_request_id] || {}

      const relevant_response_keys = [
          'url',
          'status',
          'mimeType',
          'connectionReused',
          'remoteIPAddress',
          'remotePort',
          'fromServiceWorker',
          'encodedDataLength',
          'protocol',
          'alternateProtocolUsage',
          'securityState',
          'securityDetails',
      ]

      // keep only the whitelisted keys, in the order they appear on the response
      const main_response = main_traffic['Network.responseReceived']?.response || {}
      const picked_fields = {}
      for (const [key, val] of Object.entries(main_response)) {
          if (!relevant_response_keys.includes(key)) continue
          picked_fields[key] = val
      }

      // TODO: parse SSL certificate sha256 hash from chrome://system/#chrome_root_store
      // const ssl_certificate = await client.send('Network.getCertificate', {origin: original_url})
      // ssl_info.sslCertSha256 = '<unknown>'

      const ssl_info: any = {
          TYPE: 'ssl',
          VERSION: version,
          URL: original_url,
          ...picked_fields,
      }

      // 3 keys == only TYPE/VERSION/URL, i.e. nothing useful to save
      const has_details = Object.keys(ssl_info).length !== 3
      if (has_details) {
          console.log(`[🔏] Saving page SSL details (${ssl_info?.securityDetails?.protocol})...`.padEnd(82), prettyPath(SSL_PATH(page)))
          await overwriteFile(SSL_PATH(page), ssl_info)
      }
      return ssl_info
  }
  /**
   * Dump the page's current HTML (page.content()) to DOM_PATH, prefixed with a
   * one-line HTML comment recording the snapshot version and original URL.
   *
   * @param page - page whose content is dumped
   * @param original_url - embedded in the header comment
   * @param version - embedded in the header comment
   * @returns the path the DOM dump was written to
   */
  async function saveDOM(page, {original_url, version}) {
      const html = await page.content()
      console.log(`[📖] Saving DOM dump (${html.length})...`.padEnd(82), prettyPath(DOM_PATH(page)))

      const header = `<!-- Saved by ArchiveBox TYPE=dom VERSION=${version} URL=${original_url} -->`
      await overwriteFile(DOM_PATH(page), `${header}\n${html}`)

      return DOM_PATH(page)
  }
  /**
   * Save the visible text of the page (document.body.innerText) to BODYTEXT_PATH.
   * Nothing is written when the page has no body text.
   *
   * @param page - page to read the text from
   * @param _page_state - unused
   * @returns the body text that was read (possibly empty or undefined)
   */
  async function saveBodyText(page, _page_state) {
      const innerText = await page.evaluate(() => document?.body?.innerText)

      // nothing to save for empty / missing body text
      if (!innerText?.length) return innerText

      console.log(`[📃] Saving body text (${innerText.length})...`.padEnd(82), prettyPath(BODYTEXT_PATH(page)))
      await overwriteFile(BODYTEXT_PATH(page), innerText)

      // // alternative method: emulate Ctrl+A, Ctrl+C (sometimes gets more than body.innerText)
      // const innerText = await page.$eval('*', (el) => {
      //     const selection = window.getSelection();
      //     const range = document.createRange();
      //     range.selectNode(el);
      //     selection.removeAllRanges();
      //     selection.addRange(range);
      //     return window.getSelection().toString();
      // });

      return innerText
  }
  /**
   * Convert the saved DOM HTML (or the SingleFile HTML as a fallback) to markdown with
   * pandoc, then convert that markdown back to HTML next to it.
   *
   * Both conversions pass `--output=<file>`, so pandoc writes the files itself and prints
   * nothing on stdout; success is therefore judged by the command exiting cleanly and the
   * output file existing, not by stdout content.
   *
   * @param page - page used to resolve DOM_PATH / SINGLEFILE_PATH / PANDOC_PATH
   * @param original_url - recorded as URL in the returned info
   * @param version - recorded as VERSION in the returned info
   * @returns info about the generated files (`html_file` only when the reverse conversion
   *          succeeded), or null when no input HTML exists or the markdown conversion failed
   */
  async function savePandoc(page, { original_url, version }) {
      console.log(`[📒] Converting DOM HTML to markdown with Pandoc...`.padEnd(82), prettyPath(PANDOC_PATH(page)))

      const dom_paths = [DOM_PATH(page), SINGLEFILE_PATH(page)].filter(candidate => fs.existsSync(candidate))
      if (!dom_paths.length) return null      // an empty array is truthy, so check its length
      const dom_path = dom_paths[0]

      const markdown_path = PANDOC_PATH(page)
      const html_path = markdown_path.replace('.md', '.html')
      const BIN_NAME = 'pandoc'

      // the command is run through a shell, so single-quote every path (escaping embedded quotes)
      const shellQuote = (arg) => `'${String(arg).replaceAll("'", `'\\''`)}'`

      // run one pandoc command; returns true only if it exited cleanly and produced its output file
      const runPandoc = async (args, output_path, failure_msg) => {
          try {
              await exec(args.join(' '))
              if (!fs.existsSync(output_path)) throw 'Got empty result!'
              return true
          } catch (err) {
              console.warn(failure_msg, err, err?.stderr || '')
              return false
          }
      }

      // pandoc --from html --to markdown_github --citeproc --wrap=none --highlight-style=kate
      const args = [
          BIN_NAME,
          '--from=html',
          '--to=markdown_github',
          '--wrap=none',
          '--citeproc',
          '--highlight-style=kate',
          `--output=${shellQuote(markdown_path)}`,
          shellQuote(dom_path),
      ]
      const markdown_ok = await runPandoc(args, markdown_path, '[❌] Failed to run Pandoc HTML to MD conversion')
      if (!markdown_ok) return null

      let result: any = {
          TYPE: 'pandoc',
          VERSION: version,
          URL: original_url,
          cmd: args,
          markdown_file: markdown_path,
      }

      // pandoc --from markdown_github --to html --citeproc --wrap=none --highlight-style=kate
      const reverse_conversion_args = [
          BIN_NAME,
          '--from=markdown_github',
          '--to=html',
          '--wrap=none',
          '--citeproc',
          '--highlight-style=kate',
          `--output=${shellQuote(html_path)}`,
          shellQuote(markdown_path),
      ]
      const html_ok = await runPandoc(reverse_conversion_args, html_path, '[❌] Failed to run Pandoc MD to HTML conversion')
      if (html_ok) {
          result = {
              ...result,
              html_file: html_path,
          }
      }

      // the generated files are left exactly as pandoc wrote them (they are not overwritten here)
      return result
  }
  /**
   * Extract the main article from the page with Readability (run on a JSDOM copy of the
   * page HTML) and save it as .html (content), .txt (text content) and .json (metadata).
   *
   * @param page - page whose HTML is parsed
   * @param original_url - recorded as URL in the metadata
   * @param version - recorded as VERSION in the metadata
   * @returns the metadata object that was saved, or null when parsing failed or
   *          no article was found
   */
  async function saveReadability(page, {original_url, version}) {
      const url = await page.url()

      let article = null
      try {
          let html = await page.content()
          const is_too_long = html.length > 14_000_000
          if (is_too_long) {
              console.warn('[⚠️] Truncating readability article text because html is too long...', html.length)
              html = html.substring(0, 13_900_000)
          }
          const virtualConsole = new VirtualConsole()
          const {window} = new JSDOM(html, {url, virtualConsole})
          article = new Readability(window.document).parse()
      } catch(err) {
          console.warn(`[❌] Failed to get readability article text`)
          return null
      }

      if (!article) return null

      console.log(`[📜] Saving readability article text (${article.textContent?.length})...`.padEnd(82), prettyPath(READABILITY_PATH(page)))

      // the two big text bodies go to their own files, everything else is metadata
      const {content, textContent, ...metadata} = article
      if (content.trim()) {
          const html_path = READABILITY_PATH(page).replace('.json', '.html')
          await overwriteFile(html_path, content)
      }
      if (textContent.trim()) {
          const txt_path = READABILITY_PATH(page).replace('.json', '.txt')
          await overwriteFile(txt_path, textContent)
      }

      const readability_info = {
          TYPE: 'readability',
          VERSION: version,
          URL: original_url,
          ...metadata,
      }
      await overwriteFile(READABILITY_PATH(page), readability_info)
      return readability_info
  }
  /**
   * Save an accessibility summary of the page: the accessibility tree snapshot, the
   * (nested) iframe URL tree, and a table-of-contents style outline of key elements.
   *
   * @param page - page to inspect
   * @param original_url - recorded as URL in the output
   * @param version - recorded as VERSION in the output
   * @returns the accessibility info object that was handed to overwriteFile
   */
  async function saveAccessibility(page, {original_url, version}) {
      // get accessibility tree (the snapshot may be null, e.g. when there is nothing to report)
      const accessibility_tree = await page.accessibility.snapshot({interestingOnly: true});

      // get iframe tree: one entry per frame, prefixed with one '>' per nesting level
      const iframes = []
      function dumpFrameTree(frame, indent='>') {
          iframes.push(indent + frame.url());
          for (const child of frame.childFrames()) {
              dumpFrameTree(child, indent + '>');
          }
      }
      dumpFrameTree(page.mainFrame(), '');

      // generate simple table-of-contents of all the key html elements (e.g. h1, h2, h3, article, main, etc.)
      const outline = await page.evaluate(() => {
          const headings = []

          for (const elem of [...document.querySelectorAll("h1, h2, h3, h4, h5, h6, a, header, footer, article, main, aside, nav, section, figure, summary, table, form, iframe")] as HTMLElement[]) {

              // skip a tags that aren't named anchors
              if (elem.tagName.toLowerCase() == 'a' && !(elem as HTMLAnchorElement).name) continue

              // e.g. article #main-article
              const elem_id = ((typeof elem.id === 'string' && elem.id) || (elem as HTMLAnchorElement).name || elem.ariaLabel || elem.role || '')
              const elem_classes = elem.className.trim().split(' ').slice(0, 3).join(' .') || ''
              const elem_action = (elem as any).action?.split('/')?.slice(-1)?.join('/')
              const summary = elem.innerText.length > 128
                  ? `${elem.innerText?.slice(0, 128)}...`
                  : elem.innerText

              let prefix = ''
              let title = (elem_id ? `#${elem_id}` : '')
              if (!title && elem_classes) title = `.${elem_classes}`
              if (elem_action) title = `${title} /${elem_action}`
              if (summary) title = `${title}: ${summary}`

              // if elem is a header, prepend a #### prefix based on its level
              // (only h1-h6 leave a number once the 'h' is stripped; every other tag gives NaN)
              const level = Number(elem.tagName.toLowerCase().replace('h', ''))
              if (!Number.isNaN(level)) {
                  prefix = '#'.repeat(level)
                  title = elem.innerText || elem_id || elem_classes
              } else {
                  // set prefix to element's breadcrumb path
                  let node = elem
                  const parents = [elem.tagName?.toLowerCase().trim()]
                  while (node) {
                      // add each parent element's name to the path
                      // const elem_type = node.tagName?.toLowerCase().trim() || ''
                      // if (elem_type && !['div', 'span', 'p', 'body', 'html'].includes(elem_type)) {
                      //     parents.unshift(elem_type);
                      // }
                      parents.unshift('')   // add emptystring to abbreviate path as >>>> instead of main>article>header>div>...
                      node = node.parentNode as HTMLElement
                  }
                  prefix = parents.join('>')
              }
              // strip all repeated whitespace and newlines
              title = title.replaceAll('\n', ' ').replace(/\s+/g, ' ').trim()

              if (prefix) {
                  headings.push(`${prefix} ${title}`)
              }
          }
          return headings
      })

      // a null snapshot must not crash the log line below
      const num_tree_keys = Object.keys(accessibility_tree || {}).length
      console.log(`[🩼] Saving accessibility outline (${num_tree_keys})...`.padEnd(82), prettyPath(ACCESIBILITY_PATH(page)))

      const accessibility_info = {
          TYPE: 'accessibility',
          VERSION: version,
          URL: original_url,
          iframes,
          headings: outline,
          tree: accessibility_tree,
      }

      await overwriteFile(
          ACCESIBILITY_PATH(page),
          accessibility_info,
      )

      return accessibility_info
  }
  /**
   * Collect every <meta name|property=... content=...> tag of the page into a flat
   * key/value object and save it to SEO_PATH. Nothing is written when no tag was found.
   *
   * When several tags share a key, the one with the longest content wins.
   *
   * @param page - page to read the meta tags from
   * @param original_url - recorded as URL in the output
   * @param version - recorded as VERSION in the output
   * @returns the seo info object (returned even when nothing was written)
   */
  async function saveSEO(page, {original_url, version}) {
      // e.g. <meta name="title" property="og:title" content="Page Title for SEO | Somesite.com">
      const seo_vars = await page.evaluate(() => {
          const pairs = []
          for (const tag of [...document.querySelectorAll('meta')]) {
              const key = tag.getAttribute('name') || tag.getAttribute('property') || ''
              const value = tag.getAttribute('content') || ''
              if (key && value) pairs.push({key, value})
          }

          // shortest values first, so that longer values overwrite shorter ones for the same key
          pairs.sort((a, b) => a.value.length - b.value.length)

          const vars = {}
          for (const {key, value} of pairs) {
              vars[key] = value
          }
          return vars
      })

      const seo_info = {
          TYPE: 'seo',
          VERSION: version,
          URL: original_url,
          ...seo_vars,
      }

      const num_vars = Object.keys(seo_vars).length
      if (!num_vars) return seo_info

      console.log(`[🔎] Saving page SEO metadata (${num_vars})...`.padEnd(82), prettyPath(SEO_PATH(page)))
      await overwriteFile(SEO_PATH(page), seo_info)
      return seo_info
  }
  /**
   * Collect every outgoing URL referenced by the page and save them grouped by source:
   * raw URLs regex-matched in the HTML, <a href>, <link href> (non-stylesheet), <iframe src>,
   * <img src>, CSS background images, stylesheets and scripts.
   * Nothing is written when no raw URL, href, link or iframe was found.
   *
   * @param page - page to scan (selectors use the "pierce/" prefix)
   * @param original_url - recorded as URL in the output
   * @param version - recorded as VERSION in the output
   * @returns the outlinks info object (returned even when nothing was written)
   */
  async function saveOutlinks(page, {original_url, version}) {
      // TODO: slow to iterate over all elements so many times, perhaps we can collapse everything down into one loop

      // Regular expression that matches syntax for a link (https://stackoverflow.com/a/3809435/117030):
      const LINK_REGEX = /https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)/gi;

      // drop empty entries and w3.org namespace URLs
      const filterW3Urls = (urls) =>
          urls.filter(url =>
              url && !url.startsWith('http://www.w3.org/'))
      // drop empty entries and inline data: URLs
      const filterDataUrls = (urls) =>
          urls.filter(url =>
              url && !url.startsWith('data:'))

      const html = await page.content();

      const raw = html?.match(LINK_REGEX) || [];

      const hrefs = await page.$$eval(
          "pierce/a[href]",
          elems => elems
              .map(elem => elem.href)
              .filter(url => url),
      );

      // object keyed by href; for duplicate hrefs the first non-empty rel is kept
      const links = await page.$$eval(
          "pierce/link[href]",
          elems => elems
              .map(({rel, href}) => ({rel, href}))
              .filter(({rel, href}) => rel !== 'stylesheet')
              .reduce((collection, entry) => {
                  const {rel, href} = entry
                  const non_empty_rel = collection[href]?.rel || rel
                  collection[href] = {rel: non_empty_rel, href}
                  return collection
              }, {})
      );

      const iframes = await page.$$eval(
          "pierce/iframe[src]",
          elems => elems.map(iframe => iframe.src).filter(url => url)
      );

      const images = await page.$$eval(
          "pierce/img[src]",
          elems => elems.map(img => img.src).filter(url => url && !url.startsWith('data:'))
      );

      // one entry per element: the url(...) of its computed background-image, or null
      const css_images = await page.$$eval(
          "pierce/*",
          elems => elems
              .map(elem => {
                  const css_url_ptn = /url\(\s*?['"]?\s*?(\S+?)\s*?["']?\s*?\)/i;
                  const bg_img = window.getComputedStyle(elem, null).getPropertyValue('background-image')
                  const bg_url = css_url_ptn.exec(bg_img)
                  return bg_url ? bg_url[1] : null
              })
      )

      const css_stylesheets = await page.$$eval(
          "pierce/link[rel=stylesheet]",
          elems => elems.map(elem => elem.href).filter(url => url)
      );

      const js_scripts = await page.$$eval(
          "pierce/script[src]",
          elems => elems.map(elem => elem.src).filter(url => url)
      );

      const outlinks_info = {
          TYPE: 'outlinks',
          VERSION: version,
          URL: original_url,
          raw: [...new Set(filterDataUrls(filterW3Urls(raw)))],
          hrefs: [...new Set(filterDataUrls(hrefs))],
          links: [...Object.values(links)],
          iframes: [...new Set(iframes)],
          images: [...new Set(filterDataUrls(images))],
          css_images: [...new Set(filterDataUrls(css_images))],
          css_stylesheets: [...new Set(filterDataUrls(css_stylesheets))],
          js_scripts: [...new Set(filterDataUrls(js_scripts))],
      }

      // `links` is an object (keyed by href), so count its keys instead of reading `.length`
      const num_links = Object.keys(links || {}).length
      if (raw?.length || hrefs?.length || num_links || iframes?.length) {
          console.log(`[🖇️] Saving page outgoing links (${raw?.length || hrefs?.length})...`.padEnd(82+1), prettyPath(OUTLINKS_PATH(page)))
          await overwriteFile(OUTLINKS_PATH(page), outlinks_info)
      }
      return outlinks_info
  }
  /**
   * Merge the browser's cookies, localStorage and sessionStorage with the auth data
   * already on disk, then save the result to AUTH_JSON_PATH and to cookies.txt.
   *
   * Does nothing (returns null) for ignored URL schemes or when SAVE_AUTH_STORAGE is off.
   *
   * @param page - page whose origin storage is read
   * @param client - session used to send 'Network.getAllCookies'
   * @param version - recorded as VERSION in the output
   * @param original_url - recorded as URL in the output; also used for the scheme check
   * @returns the merged auth info object, or null when saving was skipped
   * @throws a string when localStorage or sessionStorage cannot be read from the page
   */
  async function saveAuthStorage(page, {client, version, original_url}) {
      const url = original_url || await page.url()
      const scheme = url.split(':')[0]
      if (URL_SCHEMES_IGNORED.includes(scheme)) return null
      if (!SAVE_AUTH_STORAGE) return null

      // read window.localStorage / window.sessionStorage as a plain {origin: {key: value}} object
      // (may fail in some cases https://github.com/puppeteer/puppeteer/issues/921)
      const readWebStorage = async (storage_name) => {
          try {
              return await page.evaluate(
                  (storage_name) => JSON.parse(JSON.stringify({[window.location.origin]: window[storage_name]})),
                  storage_name,
              )
          } catch(err) {
              throw `Failed to get page window.${storage_name}! ${err}`
          }
      }

      // const cookies = JSON.stringify(await page.cookies());  // doesnt include httponly cookies
      const browser_cookies = (await client.send('Network.getAllCookies')).cookies
      const browser_local_storage = await readWebStorage('localStorage')
      const browser_session_storage = await readWebStorage('sessionStorage')

      // WARNING: small TOCTTOU gap between this read-before-write and the write below
      // can possibly overwrite changes made by other processes in this gap
      const auth_on_disk = await loadAuthStorage(page, {client}, {apply: false})

      const cookies = dedupeCookies([...auth_on_disk.cookies, ...browser_cookies])

      const auth_info = {
          TYPE: 'auth',
          VERSION: version,
          URL: original_url,
          cookies: cookies,
          sessionStorage: merge(auth_on_disk.sessionStorage, browser_session_storage),
          localStorage: merge(auth_on_disk.localStorage, browser_local_storage),
      }
      // console.log(`[⛙] Merged ${auth_on_disk.cookies.length} existing + ${browser_cookies.length} new -> ${auth_info.cookies.length} cookies`)

      console.log(`[🍪] Saving cookies/localStorage/sessionStorage (${auth_info.cookies.length})...`.padEnd(82), prettyPath(AUTH_JSON_PATH));
      await overwriteFile(AUTH_JSON_PATH, auth_info);

      // Write to cookies.txt file using tough-cookie + @root/file-cookie-store
      await saveCookiesTxt(cookies)

      return auth_info
  }
  /**
   * Write the given browser cookies to COOKIES_TXT_PATH through a tough-cookie jar
   * backed by a FileCookieStore.
   *
   * Cookies that cannot be converted or stored are logged and skipped; the rest are saved.
   *
   * @param cookies - array of cookie objects; the fields read here are
   *                  domain, path, name, value, expires (unix seconds), secure, sourcePort
   */
  async function saveCookiesTxt(cookies) {
      const cookies_store = new FileCookieStore(COOKIES_TXT_PATH, {auto_sync: false, lockfile: false})
      const cookie_jar = new ToughCookie.CookieJar(cookies_store)
      cookie_jar.setCookieAsync = util.promisify(cookie_jar.setCookie)
      cookies_store.saveAsync = util.promisify(cookies_store.save)

      for (const cookie of cookies) {
          const domain = String(cookie.domain || '')
          // a leading '.' marks a domain cookie (valid for subdomains too); without it the cookie is host-only
          const is_domain_cookie = domain.startsWith('.')
          // a missing/non-finite/non-positive expiry is treated as "no expiry" instead of an already
          // expired 1969 date (or a RangeError from toISOString on an invalid Date)
          // NOTE(review): assumes the browser reports session cookies with expires <= 0 -- confirm
          const has_expiry = Number.isFinite(cookie.expires) && cookie.expires > 0

          const cookie_for_tough = {
              domain: cookie.domain,
              path: cookie.path,
              key: cookie.name,
              value: cookie.value,
              expires: has_expiry ? (new Date(cookie.expires * 1000)).toISOString() : 'Infinity',
              hostOnly: !is_domain_cookie,
              secure: cookie.secure,
          }

          let parsed_cookie = null
          try {
              parsed_cookie = ToughCookie.Cookie.fromJSON(cookie_for_tough)

              // assemble a fake URL just to satisfy ToughCookieJar's requirement of having a URL at set time
              let url = cookie.secure ? 'https://' : 'http://'
              url = url + (is_domain_cookie ? domain.slice(1) : domain)
              if (cookie.sourcePort && ![80, 443].includes(cookie.sourcePort)) {
                  url = `${url}:${cookie.sourcePort}`
              }
              url = `${url}${cookie.path || ''}`
              await cookie_jar.setCookieAsync(parsed_cookie, url, {ignoreError: true})
          } catch(err) {
              console.error('[❌] Failed to dump browser cookie for cookies.txt...', cookie_for_tough, '->', parsed_cookie, err)
          }
      }
      console.log(`[🍪] Saving cookies TXT (${cookies.length})...`.padEnd(82), prettyPath(COOKIES_TXT_PATH));
      await cookies_store.saveAsync()
  }
  /**
   * Save the final timing/summary metrics of the archiving run: the page's own
   * metrics (page.metrics()) plus start/end times, duration and request/redirect counts.
   *
   * @param page - page to read metrics from; also used to resolve METRICS_PATH
   * @param original_url - recorded as URL in the output
   * @param version - recorded as VERSION in the output
   * @param start_time - start time, stored as-is
   * @param start_ts - start timestamp in milliseconds, used to compute `duration`
   * @param traffic_log - object of traffic entries; its key count becomes num_requests
   * @param redirects - object of redirects; its key count minus one becomes num_redirects
   * @returns the metrics info object that was handed to overwriteFile
   */
  async function saveMetrics(page, {original_url, version, start_time, start_ts, traffic_log, redirects}) {
      const end_time = (new Date()).toISOString()
      const end_ts = Date.now()
      const metrics_info = {
          TYPE: 'metrics',
          VERSION: version,
          URL: original_url,
          ...(await page.metrics()),
          start_time,
          start_ts,
          end_time,
          end_ts,
          duration: (end_ts - start_ts),
          // traffic_log is an object keyed by request id, so `.length` would be undefined
          num_requests: Object.keys(traffic_log).length,
          num_redirects: Object.keys(redirects).length -1,
      }
      console.log(`[🏎️] Saving final summary + timing metrics...`.padEnd(82+1), prettyPath(METRICS_PATH(page)))
      await overwriteFile(METRICS_PATH(page), metrics_info)
      return metrics_info
  }
  3750. /******************************************************************************/
  3751. /******************************************************************************/
  3752. /**************************** Utility Helpers *********************************/
  /**
   * Get a simple non-negative integer hash for a string (based on java String#hashCode).
   * Useful only for throwaway nonces / easy deterministic identifiers, not a
   * replacement for sha256.
   *
   * @param str - string to hash
   * @returns the absolute value of the accumulated hash (0 for the empty string)
   */
  function hashCode(str) {
      const char_codes = Array.from({length: str.length}, (_unused, idx) => str.charCodeAt(idx))
      // hash * 31 + code, written as a shift so the multiplication is done on a 32-bit integer
      const signed_hash = char_codes.reduce(
          (hash, code) => code + ((hash << 5) - hash),
          0,
      )
      return Math.abs(signed_hash)
  }
  /**
   * Uniqueify a collection of objects by a value within them; the first entry seen
   * for each unique id wins.
   *
   * Works on arrays and on plain objects (it iterates Object.entries(iter)):
   *   unique([{id: 1}, {id: 2}, {id: 1}], 'id')                => {1: {id: 1}, 2: {id: 2}}
   *   unique({a1: {id: 1}, b2: {id: 2}, a3: {id: 1}}, 'id')    => {1: {id: 1}, 2: {id: 2}}
   *
   * @param iter - array or object whose values are the entries to de-duplicate
   * @param key - attribute name to read from each entry, or a getter (entry, idx) => unique_id
   *              (idx is the entry's key from Object.entries, i.e. a string)
   * @returns object mapping each unique id to the first entry that produced it
   * @throws a string when key is neither a string nor a function
   */
  function unique(iter, key: string | ((any, number) => string)='id') {
      let getUniqueId
      switch (typeof key) {
          case 'string':
              getUniqueId = (entry_obj, idx) => entry_obj[(key as string)]
              break
          case 'function':
              getUniqueId = (entry_obj, idx) => (key as Function)(entry_obj, idx)
              break
          default:
              throw 'key must be either a string lookup key or a function (obj, idx) => return unique_id'
      }

      const first_seen = {}
      Object.entries(iter).forEach(([idx, entry_obj]) => {
          const unique_id = getUniqueId(entry_obj, idx)
          const already_seen = first_seen[unique_id] !== undefined
          if (!already_seen) {
              first_seen[unique_id] = entry_obj
          }
      })
      return first_seen
  }
  /**
   * Sleep for the given number of milliseconds.
   * Waits longer than 10 seconds are announced with a debug log line.
   *
   * @param ms - how long to wait, in milliseconds
   * @returns promise that resolves (with no value) once the time has passed
   */
  const wait = (ms: number) => new Promise(resolve => {
      const is_long_wait = ms > 10_000
      if (is_long_wait) {
          console.debug(`[⏲️] Waiting ${Math.round(ms/1000)}s...`)
      }
      setTimeout(resolve, ms)
  })
  // unique sentinel used as the rejection value when withTimeout() runs out of time
  const TimeoutError = Symbol()

  /**
   * Run a promise with a time limit.
   *
   * @param promise - the promise to wait for
   * @param ms - time limit in milliseconds
   * @returns promise that settles like `promise`, or rejects with the TimeoutError
   *          symbol if `promise` has not settled within `ms`
   */
  const withTimeout = (promise, ms) => {
      let timer
      const deadline = new Promise((_resolve, reject) => {
          timer = setTimeout(() => reject(TimeoutError), ms)
      })
      // whichever settles first wins; the timer is always cleaned up afterwards
      return Promise
          .race([promise, deadline])
          .finally(() => clearTimeout(timer))
  }
  // dates outside (MIN_VALID_DATE, MAX_VALID_DATE) are rejected by validateDate,
  // except the unix epoch itself which is always accepted
  const MAX_VALID_DATE = new Date('2150-01-01T00:00:00.000Z')
  const MIN_VALID_DATE = new Date('2010-01-01T00:00:00.000Z')
  const UNIX_EPOCH_DATE = new Date(0)

  /**
   * Assert that a value is a usable Date and return it unchanged.
   *
   * @param date - value to check
   * @param min - exclusive lower bound (default MIN_VALID_DATE)
   * @param max - exclusive upper bound (default MAX_VALID_DATE)
   * @param singleton - a date that is always accepted regardless of bounds (default unix epoch)
   * @returns the same `date` that was passed in
   * @throws assertion error when date is not a Date, is an Invalid Date, or is out of bounds
   */
  const validateDate = (date, {min=MIN_VALID_DATE, max=MAX_VALID_DATE, singleton=UNIX_EPOCH_DATE}={}) => {
      const is_date_instance = (date instanceof Date)
      assert(is_date_instance, `Got invalid type for Date: ${typeof date} ${date} (expected Date)`)

      const is_parseable = String(date) !== 'Invalid Date'
      assert(is_parseable, `Got invalid value for Date: ${typeof date} ${date}`)

      // epoch singleton is always valid
      const is_singleton = Number(date) === Number(singleton)
      if (!is_singleton) {
          assert(date < max, `Got Date that was higher than MAX_VALID_DATE=${max}`)
          assert(date > min, `Got Date that was lower than MIN_VALID_DATE=${min}`)
      }
      return date
  }
  /**
   * Parse a version date string into a Date (interpreted as UTC):
   *   YYYYMMDDhhmmssxxx or YYYYMMDDhhmmss or YYYYMMDDhhmm or YYYYMMDD -> Date
   *
   * @param yyyymmddtime - the version date (string, or anything String() turns into one)
   * @returns the parsed Date, after it passed validateDate
   * @throws assertion error when the input has non-numeric characters, an unsupported
   *         number of digits, does not match the expected layout, or is not a valid date
   */
  const parseVersionDateStr = (yyyymmddtime) => {
      const date_str = String(yyyymmddtime)

      const is_only_numbers = /^\d+$/.test(date_str.replace('.', ''))
      assert(is_only_numbers, `Non-numeric characters in YYYYMMDD date are not allowed: ${yyyymmddtime} (while trying YYYYMMDDhhmmssxxx format)`)

      const num_digits = date_str.split('.')[0].length
      assert([17, 14, 12, 8].includes(num_digits), `Got invalid number of digits (${num_digits}) in YYYYMMDD date: ${yyyymmddtime} (while trying YYYYMMDDhhmmssxxx format)`)

      // fail with a readable assertion (instead of a destructuring TypeError) when the layout doesn't match,
      // e.g. when the string contains a '.'
      const match = /^(\d{4})(\d{2})(\d{2})(\d{2})?(\d{2})?(\d{2})?(\d{3})?$/.exec(date_str)
      assert(match, `Could not find YYYYMMDD`)
      const [_all, yyyy, mm, dd, hr, min, sec, ms] = match
      assert(yyyy && mm && dd, `Could not find YYYYMMDD`)

      // each finer time component requires all the coarser ones to be present
      const time_error_msg = `Detected YYYYMMDD[hhmm[ss[xxxx]]] but time segment is invalid ${hr || '__'}:${min || '__'}:${sec || '__'}.${ms || '___'}`
      if (ms) assert(hr && min && sec, time_error_msg)
      if (sec) assert(hr && min, time_error_msg)
      if (min) assert(hr, time_error_msg)
      if (hr) assert (min, time_error_msg)

      const iso_str = `${yyyy}-${mm}-${dd}T${hr || '00'}:${min || '00'}:${sec || '00'}.${ms || '000'}Z`
      const parsed_date = new Date(iso_str)

      return validateDate(parsed_date)                   // 1970-01-01T00:00:00.000Z (ISO format)
  }
  /**
   * Parse a unix timestamp into a Date:
   *   1709724291000 or 1709724291000.000 or 1709724291 or 1709724291.000 -> Date
   * A single '0' means the unix epoch.
   *
   * @param timestamp - timestamp in milliseconds (13 digits) or seconds (10 digits), or 0
   * @returns the parsed Date, after it passed validateDate
   * @throws assertion error on non-numeric input, an unsupported number of digits,
   *         or a date that fails validateDate
   */
  const parseTimestampDateStr = (timestamp) => {
      const timestamp_str = String(timestamp)

      const is_only_numbers = /^\d+$/.test(timestamp_str.replace('.', ''))
      assert(is_only_numbers, `Got invalid characters in timstamp: ${timestamp_str} (while trying xxxxxxxxxxxxx format)`)

      // only the integer part decides which unit the timestamp is in
      const num_digits = timestamp_str.split('.')[0].length
      assert([13, 10, 1].includes(num_digits), `Got invalid number of digits (${num_digits}) in timestamp: ${timestamp_str} (while trying xxxxxxxxxxxxx format)`)

      let parsed_date = null
      switch (num_digits) {
          case 13:
              parsed_date = new Date(Number(timestamp_str))           // 1709724291000 (unix timestamp w/ milliseconds)
              break
          case 10:
              parsed_date = new Date(Number(timestamp_str) * 1000)    // 1709724291 (unix timestamp w/ seconds)
              break
          case 1:
              assert(timestamp_str === '0', `Got invalid single-digit timestamp: ${timestamp_str} (while trying xxxxxxxxxxxxx format or 0 for UNIX epoch)`)
              parsed_date = UNIX_EPOCH_DATE
              break
      }
      return validateDate(parsed_date)
  }
  /**
   * Parse an ISO date string into a Date:
   *   1970-01-01T00:00:00.000Z -> Date
   * Only strings that are 24, 19, 16 or 10 characters long are accepted.
   *
   * @param iso_str - ISO formatted date string
   * @returns the parsed Date, after it passed validateDate
   * @throws assertion error on an unsupported length or a date that fails validateDate
   */
  const parseISODateStr = (iso_str) => {
      const ACCEPTED_LENGTHS = [24, 19, 16, 10]
      const num_digits = String(iso_str).length
      const has_accepted_length = ACCEPTED_LENGTHS.includes(num_digits)
      assert(has_accepted_length, `Got invalid number of digits (${num_digits}) in ISO date: ${iso_str} (while trying 1970-01-01T00:00:00.000Z format)`)

      return validateDate(new Date(iso_str))
  }
  /**
   * Turn any supported date representation into a Date.
   *
   *   date === undefined          => use today/now
   *   date === null (or == 0)     => use unix epoch 0 aka 1970-01-01T00:00:00.000Z
   *   date instanceof Date        => validated and returned as-is
   *   date *= YYYYMMDDHHMMSS      => use a version date string (e.g. 20010131235958)
   *   date *= 1234567...          => use a timestamp (e.g. 1709724291000)
   *   date *= 1970-01-01T...      => use iso datetime (e.g. 1970-01-01T00:00:00.000Z)
   *
   * @param date - undefined, null, Date, number or string (see above)
   * @returns the resulting Date
   * @throws assertion error on unsupported types; a newline-joined string of every
   *         parser's error when no string format matches
   */
  const parseDate = (date) => {
      if (date === undefined) {
          return (new Date())                   // today (2024-05-29T22:02:34.682Z) aka timestamp=1717020154682
      }
      // loose comparison on purpose: also matches '0', 0 and other values that coerce to 0
      if (date === null || date == 0) {
          return UNIX_EPOCH_DATE                // unix epoch (1970-01-01T00:00:00.000Z) aka timestamp=0
      }
      if (date instanceof Date) {
          return validateDate(date)             // JS date Date('1970-01-01T00:00:00.000Z')
      }

      // unix timestamp e.g. 1717020154682 is parsed through its string form
      const date_str = ((typeof date) === 'number') ? String(date) : date
      assert((typeof date_str) === 'string', `Tried to parse date but got unsupported type ${(typeof date_str)}: ${date_str}`)

      // try every string format in turn, collecting the failures for the final error
      const errors = [`Failed to parse Date from string: ${date_str}`]
      const parsers = [parseVersionDateStr, parseTimestampDateStr, parseISODateStr]
      for (const parser of parsers) {
          try {
              return parser(date_str)
          } catch(err) {
              errors.push(err)
          }
      }

      throw errors.join('\n')
  }
  /**
   * Format a date as a compact UTC version string:
   *   YYYYMMDDHHMMSSXXX or YYYYMMDDHHMMSS or YYYYMMDDHHMM or YYYYMMDD
   *   (or only the time part, e.g. HHMMSS, when withDate is false)
   *
   * @param date - anything parseDate accepts (undefined means now)
   * @param withDate - include the YYYYMMDD part
   * @param withTime - include the HHMM part
   * @param withSeconds - also include SS (only when withTime)
   * @param withMilliseconds - also include XXX (only when withTime and withSeconds)
   * @returns the version string
   * @throws assertion error when neither withDate nor withTime is set, or when the
   *         generated string does not parse back as a version date
   */
  const versionStrFromDate = (date, {withDate=true, withTime=true, withSeconds=true, withMilliseconds=false}={}) => {
      const parsed_date = parseDate(date)

      const [date_iso, time_iso] = parsed_date.toISOString().split('T')  // ['2001-01-31', '23:59:58.090Z']

      const components_to_use = []
      if (withDate) {
          components_to_use.push(date_iso.replaceAll('-', ''))       // '20010131'
      }
      if (withTime) {
          const [hr, min, sec, ms] = time_iso.replace('Z', '').replace('.', ':').split(':')  // ['23', '59', '58', '090']
          components_to_use.push(hr)
          components_to_use.push(min)
          if (withSeconds) {
              components_to_use.push(sec)
              if (withMilliseconds) {
                  components_to_use.push(ms)
              }
          }
      }
      assert(components_to_use.length, 'At least one of {withDate, withTime} must be set.')

      const final_str = components_to_use.join('')                   // 20010131235958

      // sanity check to make sure it parses correctly; a time-only string has no date part
      // and can never parse as a version date, so it is only checked when the date is included
      if (withDate) {
          assert(parseVersionDateStr(final_str))
      }

      return final_str
  }
  3918. // test date functions:
  3919. // console.log(parseDate('20120131'))
  3920. // console.log(versionStrFromDate(parseDate('20120131')))
  3921. // console.log(versionStrFromDate(parseDate('0')))
  3922. // console.log(versionStrFromDate(parseDate(0)))
  3923. // console.log(versionStrFromDate(parseDate(null)))
  3924. // console.log(versionStrFromDate())
  3925. // console.log(versionStrFromDate(parseDate('20120131235859090')))
  3926. // console.log(versionStrFromDate(parseDate('1970-01-01T00:00:00.000Z')))
  3927. // console.log(versionStrFromDate(parseDate('2024-12-01T00:00')))
  3928. // console.log(versionStrFromDate(parseDate('2024-12-01'), {withTime: false}))
  // Make a path printable: the first occurrence of the DATA_DIR abspath is swapped for './data' (brevity/privacy).
  const prettyPath = (path) => path.replace(DATA_DIR, './data')
  const pathIsHidden = (relpath) => {
      // Check whether a path, or any directory above it, is hidden (name starts with '.'),
      // e.g. ./some/.dir/abc or ./.DS_Store
      //   relpath: relative or absolute path string
      // returns true if any path segment is a dotfile/dotdir, false otherwise
      //
      // Anchor the path at '/' so it behaves like an abspath. path.join() also normalizes it,
      // which collapses '', '.', './' and any '.' / '..' segments. Without that normalization an
      // absolute input such as '/a/./b' would be reported as hidden because of its '.' segment.
      let test_path = path.join('/', relpath)

      // walk up through the parents until the root is reached (root is the only path that is its own parent)
      while (true) {
          const parent_path = path.dirname(test_path)
          if (parent_path === test_path) return false
          if (path.basename(test_path).startsWith('.')) return true
          test_path = parent_path
      }
  }
  const pathDepth = (child_path, relative_to='.') => {
      // Count how many path segments child_path is below relative_to (defaults to '.').
      // Both paths must be of the same kind before they can be compared: when exactly one of them
      // is absolute, the other one is resolved with fs.realpathSync (so it has to exist on disk),
      // otherwise the depth would be measured all the way up to the / root.
      const child_is_abs = child_path.startsWith('/')
      const base_is_abs = relative_to.startsWith('/')

      let child = child_path
      let base = relative_to
      if (child_is_abs && !base_is_abs) {
          base = fs.realpathSync(relative_to)
      } else if (base_is_abs && !child_is_abs) {
          child = fs.realpathSync(child_path)
      }

      // number of '/'-separated segments in the relative path between the two
      return path.relative(base, child).split('/').length
  }
  // fs.Dirent decorated with the extra path info that getDirEntries attaches to each entry it keeps
  interface DirentWithExtras extends fs.Dirent {
      relpath: string;     // entry path relative to the pwd used for the listing
      abspath: string;     // entry path joined onto the listing base directory
      reldepth: number;    // pathDepth(relpath)
  }
  async function getDirEntries(dir_path, {pwd=null, recursive=true, includeHidden=false, includeFiles=true, includeDirs=true, includeLinks=false, filter=null, maxdepth=-1}={}) {
      // List the sub-paths under a directory (recursively by default).
      //   dir_path:       directory to list; joined onto pwd when it does not already start with pwd
      //   pwd:            base directory that returned paths are relative to (defaults to dir_path)
      //   recursive:      passed through to fs.promises.readdir
      //   includeHidden:  keep entries for which pathIsHidden(relpath) is true
      //   includeFiles / includeDirs / includeLinks:  keep regular files / directories / symlinks
      //   filter:         optional ({abspath, relpath, basename, dirent}) => truthy to keep the entry
      //   maxdepth:       when >= 0, drop entries whose (reldepth - 1) exceeds it (0 keeps direct children of pwd only)
      // returns a sorted array of paths relative to pwd
      // throws (assert) if the directory does not exist
      pwd = pwd || dir_path
      let dir_abspath = dir_path
      if (!dir_abspath.startsWith(pwd)) {
          dir_abspath = path.join(pwd, dir_abspath)
      }
      assert(fs.existsSync(dir_abspath), `Tried to get directory listing for dir that doesn't exist! ${prettyPath(dir_abspath)}`)

      const dirents = await fs.promises.readdir(dir_abspath, { recursive, withFileTypes: true })
      return dirents
          .map((dirent: DirentWithExtras) => {
              // filter combined with map because relpath is re-used in both operations
              const relpath = path.join(path.relative(pwd, dirent.parentPath), dirent.name)
              // relpath is relative to pwd, so it must be joined back onto pwd (not onto dir_abspath):
              // when pwd is an ancestor of dir_abspath, joining onto dir_abspath would repeat the
              // intermediate directories and produce a path that does not exist
              const abspath = path.join(pwd, relpath)
              const basename = path.basename(dirent.name)

              if (!includeLinks && dirent.isSymbolicLink()) return null
              if (!includeFiles && dirent.isFile()) return null
              if (!includeDirs && dirent.isDirectory()) return null
              if (!includeHidden && pathIsHidden(relpath)) return null

              // decorate the dirent so that filter callbacks can use the extra info
              dirent.relpath = relpath
              dirent.abspath = abspath
              dirent.reldepth = pathDepth(relpath)

              if (maxdepth >= 0) {
                  if ((dirent.reldepth-1) > maxdepth) return null
              }

              if ((typeof filter) === 'function') {
                  const should_keep = filter({abspath, relpath, basename, dirent})
                  if (!should_keep) return null
              }
              return relpath
          })
          .filter(Boolean)
          .sort() as string[]
  }
  async function getTotalSize(dir_or_file_path, {pwd=null, _cache=null, filter=null, subfiles=null}={}) {
      // Get the total size in bytes of a file, or of all the files inside a directory (recursively).
      //   dir_or_file_path:  path to measure; joined onto pwd when it does not already start with pwd
      //   pwd:               base directory (defaults to the parent directory of dir_or_file_path)
      //   _cache:            optional {path: bytes} lookup consulted (by the path exactly as passed) before touching disk
      //   filter:            optional filter func forwarded to getDirEntries
      //   subfiles:          optional pre-computed list of file paths to use instead of listing the directory
      // returns a number of bytes (0 for special files such as FIFOs/sockets)

      // cached answers win over everything else
      if (_cache && (dir_or_file_path in _cache)) {
          return _cache[dir_or_file_path]
      }

      // resolve the path under pwd
      const base_dir = pwd || path.dirname(dir_or_file_path)
      const abspath = dir_or_file_path.startsWith(base_dir)
          ? dir_or_file_path
          : path.join(base_dir, dir_or_file_path)

      const stats = await fs.promises.stat(abspath)

      // plain file: its own size
      if (stats.isFile()) return stats.size

      // neither file nor directory: dont try to size special files
      if (!stats.isDirectory()) return 0

      // directory: sum of the sizes of all the files within
      const files_within = subfiles || await getDirEntries(dir_or_file_path, {
          pwd: base_dir,
          recursive: true,
          includeDirs: false,
          includeFiles: true,
          filter,
      })
      let total_bytes = 0
      for (const subpath of files_within) {
          const subpath_bytes = await getTotalSize(subpath, {pwd: base_dir, _cache, filter})
          total_bytes += subpath_bytes
      }
      return total_bytes
  }
  async function getDirSizes(dir_path, {pwd=null, subfiles=null, withRoot=true, filter=null, maxdepth=-1}={}) {
      // Build a {relpath: bytes} map for a directory, its files, and its sub-directories (recursively).
      //   dir_path:   path      absolute or relative path of the directory you want size info for
      //   pwd:        path      (optional) directory that dir_path and the result keys are relative to (defaults to dir_path)
      //   subfiles:   string[]  (optional) instead of reading disk, provide a getDirEntries results list to use
      //   withRoot:   bool      include a summary entry for the root dir_path dir in the result as '.'
      //   filter:     function  (optional) filter func for dir entries ({abspath, relpath, basename, dirent}) => true/false
      //   maxdepth:   number    (optional) does not affect the calculations, only hides deeper entries in the returned output
      // returns e.g. {'example.json': 123, 'example2.txt': 123, '.': 246}
      const root_stat = await fs.promises.stat(dir_path)
      assert(root_stat.isDirectory(), `Tried to calculate directory sizes but path is not a directory! ${dir_path}`)
      pwd = pwd || dir_path

      const sizes = {}

      // 1. size every file individually
      //    (maxdepth is deliberately NOT passed here: the entire file listing is needed for accurate parent dir sizes,
      //     filter is allowed though, useful to calculate the size of some subset of files that match a pattern)
      const files_within = subfiles || await getDirEntries(dir_path, {
          pwd,
          recursive: true,
          includeDirs: false,
          includeFiles: true,
          filter,
      })
      for (const file_relpath of files_within) {
          sizes[file_relpath] = await getTotalSize(file_relpath, {pwd, _cache: sizes, filter})
      }

      // 2. the root total is the sum of all the file sizes collected so far
      let total_size = 0
      for (const file_bytes of Object.values(sizes)) {
          total_size += file_bytes
      }

      // 3. subtotals for each sub-directory (file sizes are served from the sizes map passed as _cache)
      const subdirs_within = await getDirEntries(dir_path, {
          pwd,
          recursive: true,
          includeDirs: true,
          includeFiles: false,
          filter,
          maxdepth,
      })
      for (const dir_relpath of subdirs_within) {
          sizes[dir_relpath] = await getTotalSize(dir_relpath, {pwd, _cache: sizes, filter})
      }

      // 4. hide entries deeper than maxdepth from the output
      if (maxdepth >= 0) {
          Object.keys(sizes)
              .filter((entry_relpath) => pathDepth(entry_relpath) > maxdepth)
              .forEach((entry_relpath) => { delete sizes[entry_relpath] })
      }

      // 5. root entry goes in last so it shows up at the bottom of the object in logs
      if (withRoot) {
          sizes['.'] = total_size
      }
      return sizes
  }
  async function getLargestPath(path_a, path_b) {
      // Compare two files/directories by total size (computed recursively via getTotalSize)
      // and return the realpath of the larger one. On a tie, path_b's realpath is returned.
      const real_a = await fs.promises.realpath(path_a)
      const real_b = await fs.promises.realpath(path_b)

      const bytes_a = await getTotalSize(real_a)
      const bytes_b = await getTotalSize(real_b)

      return (bytes_a > bytes_b) ? real_a : real_b
  }
  async function findCommonAncestor(target_abspath, symlink_abspath, {relative=true, search_limit=DATA_DIR}: {relative?: boolean | string, search_limit?: string}={}) {
      // Given a symlink target path and a symlink path, find the closest directory they both live under
      // and the relative paths needed to express the link through that directory.
      //   target_abspath:   path the symlink should point to
      //   symlink_abspath:  path where the symlink lives
      //   relative:         true   => start searching from the symlink's own directory
      //                     false  => start (and stop) at search_limit
      //                     string => start searching from that path (its parent dir if it is a file)
      //   search_limit:     directory at which the upward search stops (defaults to DATA_DIR)
      // returns {closest_common_ancestor, search_dir_abspath, target_abspath, target_filename,
      //          target_from_ancestor_relpath, symlink_abspath, symlink_filename,
      //          symlink_to_ancestor_relpath, symlink_to_target_relpath}
      // throws if relative has an unsupported type, if the search dir is not a directory,
      //        or if no common ancestor is found
      search_limit = await fs.promises.realpath(search_limit)

      // pick the directory to start searching from
      let relative_dir = search_limit
      if ((typeof relative) === 'boolean') {
          relative_dir = relative ? path.dirname(symlink_abspath) : search_limit
      } else if ((typeof relative) === 'string') {
          relative_dir = relative as string
      } else {
          throw `Got invalid type for relative path during common ancestor search: ${relative}`
      }

      if ((await fs.promises.stat(relative_dir)).isFile()) {
          // if start dir is a file, set it to its parent dir path
          relative_dir = path.dirname(relative_dir)
      }
      assert(
          (await fs.promises.stat(relative_dir)).isDirectory(),
          `Tried to find common ancestor starting from invalid search directory:\n 🔗 ${prettyPath(symlink_abspath)}\n -> ${prettyPath(target_abspath)}\n Error: search dir does not exist or is not a directory: ❌ ${prettyPath(relative_dir)}`,
      )

      const symlink_filename = path.basename(symlink_abspath)
      const target_filename = path.basename(target_abspath)
      const symlink_parent_abspath = await fs.promises.realpath(path.dirname(symlink_abspath))
      const target_parent_abspath = await fs.promises.realpath(path.dirname(target_abspath))
      const search_dir_abspath = await fs.promises.realpath(relative_dir)

      // true only if child is the ancestor itself or sits below it on a path-segment boundary
      // (a bare string prefix check would wrongly treat /data/archive2 as being under /data/archive)
      const isWithin = (child, ancestor) => {
          if (child === ancestor) return true
          const ancestor_prefix = ancestor.endsWith(path.sep) ? ancestor : (ancestor + path.sep)
          return child.startsWith(ancestor_prefix)
      }
      const isAncestorCommon = (ancestor) => (
          isWithin(target_parent_abspath, ancestor)
          && isWithin(symlink_parent_abspath, ancestor))

      // walk upwards from the search dir until both paths are under the candidate, or the limit is hit
      let closest_common_ancestor = search_dir_abspath
      while (closest_common_ancestor !== search_limit) {
          if (isAncestorCommon(closest_common_ancestor)) break
          const parent_dir = path.dirname(closest_common_ancestor)
          if (parent_dir === closest_common_ancestor) break   // reached the filesystem root, nowhere left to go
          closest_common_ancestor = parent_dir
      }
      assert(
          isAncestorCommon(closest_common_ancestor),
          `Tried to create relative symlink but could not find common ancestor:\n 🔗 ${prettyPath(symlink_abspath)}\n -> ${prettyPath(target_abspath)}\n Error: target path and symlink path are not both under:\n ❌ ${prettyPath(closest_common_ancestor)}`,
      )

      const symlink_to_ancestor_relpath = path.relative(symlink_parent_abspath, closest_common_ancestor)                               // ../../..
      const target_from_ancestor_relpath = path.join(path.relative(closest_common_ancestor, target_parent_abspath), target_filename)   // 'archive/19999999.23423523'
      const symlink_to_target_relpath = path.join(symlink_to_ancestor_relpath, target_from_ancestor_relpath)                           // '../../../archive/19999999.23423523'

      return {
          closest_common_ancestor,
          search_dir_abspath,

          target_abspath,
          target_filename,
          target_from_ancestor_relpath,

          symlink_abspath,
          symlink_filename,
          symlink_to_ancestor_relpath,
          symlink_to_target_relpath,
      }
  }
  // fs.Stats decorated with path info (blockUntilExists sets abspath on the stats object it returns)
  interface StatsWithExtras extends fs.Stats {
      abspath: string;      // resolved real path of the file that was stat'ed
      relpath?: string;     // optional; not populated by blockUntilExists
      reldepth?: number;    // optional; not populated by blockUntilExists
  }
  async function blockUntilExists(file_path, {timeout=7_500, min_bytes=0}={}) {
      // Poll until a file we expect to exist shows up on the filesystem, then return its stats.
      // (used to handle eventual consistency in network filesystems where we need a delay after writing before reads show up)
      //   file_path:  path to wait for
      //   timeout:    total time to keep polling, in milliseconds (polled every 250ms)
      //   min_bytes:  when set and the file is smaller than this, the file must at least be non-empty
      //               to be accepted (an empty file counts as "not there yet" and is retried)
      // returns the fs.Stats of the file with an extra .abspath (its realpath)
      // throws a string if the file has not appeared before the timeout
      const interval = 250
      const max_tries = timeout / interval

      for (let tries = 0; tries < max_tries; tries++) {
          try {
              const abspath = await fs.promises.realpath(file_path)
              assert(fs.existsSync(abspath))

              const dirent = await fs.promises.stat(abspath) as StatsWithExtras
              dirent.abspath = abspath

              const is_too_small = min_bytes && (dirent.size < min_bytes)
              if (is_too_small) {
                  assert(dirent.size >= 1)
                  // this is a valid warning but unfortunately its too common to bother showing:
                  // console.warn(`[⚠️] Expected file to be >=${Math.round(min_bytes/1000)}kb but was only ${dirent.size/1000}kb:`, prettyPath(file_path))
              }
              return dirent
          } catch(err) {
              // any failure above means "not ready yet": warn once at the 5s mark, then sleep and retry
              const waited = (tries * interval)
              if (waited === 5_000) {
                  console.warn(`[⚠️] Waited >${waited/1000}s for file to appear (is filesystem or bg task running slow?):`, prettyPath(file_path))
              }
              await wait(interval)
          }
      }
      throw `Expected file does not exist after ${timeout/1000}s: ${prettyPath(file_path)}`
  }
  async function overwriteSymlink(target_path, symlink_path, {relative=true, mkdirs=false, search_limit=DATA_DIR, timeout=5_000}: {relative?: boolean | string, mkdirs?: boolean, search_limit?: string, timeout?: number}={}) {
      // Create (or replace) a symlink at symlink_path pointing to target_path.
      //   relative:      truthy => the link is written as a relative path (via the closest common ancestor dir, often DATA_DIR)
      //                  false  => the link is written with target_path verbatim
      //   mkdirs:        true   => create the symlink's parent directories first
      //   search_limit:  passed to findCommonAncestor as the highest directory to search up to
      //   timeout:       ms to wait for the target (and afterwards the new symlink) to show up on disk
      // returns an object describing the symlink and target paths that were used
      // throws a string if the target never appears, is not a file/dir, the symlink's parent dir is missing,
      //        or the created symlink cannot be resolved afterwards

      // the target has to exist before we link to it
      const target_dirent = await blockUntilExists(target_path, {timeout}).catch((err) => {
          throw `Tried to create symlink pointing to file that does not exist:\n 🔗 ${prettyPath(symlink_path)}\n -> ❌ ${prettyPath(target_path)}\n ${err}`
      })
      const target_abspath = target_dirent.abspath

      // only regular files and directories are valid targets (no character/block devices or other weird files)
      const target_is_dir = target_dirent.isDirectory()
      const target_is_file = target_dirent.isFile()
      assert(target_is_dir || target_is_file, `Tried to create symlink to an unsupported file type:\n 🔗 ${prettyPath(symlink_path)}\n -> ❌ ${prettyPath(target_path)} (expected file or directory)`)

      // the directory that will contain the symlink must exist (optionally create it)
      const symlink_parent_dir = path.dirname(symlink_path)
      const symlink_filename = path.basename(symlink_path)
      if (mkdirs) {
          await fs.promises.mkdir(symlink_parent_dir, {recursive: true})
      }
      try {
          const parent_stat = await fs.promises.stat(symlink_parent_dir)
          assert(parent_stat.isDirectory())
      } catch(err) {
          throw `Tried to create symlink in a directory that doesn't exist:\n 🔗 ${symlink_parent_dir}❌/${symlink_filename}\n -> ${target_path}\n ${err}`
      }
      const symlink_parent_abspath = await fs.promises.realpath(symlink_parent_dir)
      const symlink_abspath = path.join(symlink_parent_abspath, symlink_filename)

      // work out how the symlink dir and the target dir relate to each other
      const ancestor_info = await findCommonAncestor(target_abspath, symlink_abspath, {relative, search_limit})

      // relative link => path through the common ancestor, otherwise the target_path exactly as passed
      const target_path_final = relative ? ancestor_info.symlink_to_target_relpath : target_path

      // clear out anything already sitting at the destination or at the temp path
      const random_nonce = crypto.randomBytes(16).toString('hex').substring(0, 8)
      const symlink_temp_path = `${symlink_abspath}.${random_nonce}.dup`
      for (const stale_path of [symlink_abspath, symlink_temp_path]) {
          try { await fs.promises.unlink(stale_path) } catch(err) {}
      }

      // create the link under the temp name, then rename it into place
      // (created_symlink tracks whichever path the link is expected to be at if a step fails)
      let created_symlink = symlink_temp_path
      try {
          await fs.promises.symlink(target_path_final, symlink_temp_path)
          created_symlink = symlink_abspath
          await fs.promises.rename(symlink_temp_path, symlink_abspath)
      } catch(err) {
          // EISDIR = tried to create the symlink on top of an existing directory:
          // no real recourse in this situation, and its too noisy to log every time this happens.
          // it's also not always safe to move the dir out of the way, so fail silently, leaving:
          //     ${symlink_abspath}.${random_nonce}.dup
          const is_dir_conflict = String(err).includes('EISDIR')
          if (!is_dir_conflict) {
              console.warn('[⚠️] Failed to create symlink', prettyPath(created_symlink), err)
          }
      }

      // best we can do here is check that the link resolves to something that exists. asserting that it
      // resolves to exactly target_abspath is racy, it is often already overwritten by later activity
      let dirent
      try {
          dirent = await blockUntilExists(created_symlink, {timeout, min_bytes: 0})
      } catch(err) {
          throw `Symlink created but does not seem to resolve to intended file:\n 🔗 ${symlink_path}\n -> ❌ ${target_path}\n actual=${dirent?.abspath}\n expected=${target_abspath}\n ${err}`
      }

      return {
          symlink_path,
          symlink_abspath: created_symlink,
          symlink_filename: path.basename(created_symlink),
          symlink_parent_abspath,
          symlink_to_ancestor_relpath: ancestor_info.symlink_to_ancestor_relpath,
          symlink_to_target_relpath: ancestor_info.symlink_to_target_relpath,

          target_path,
          target_abspath,
          target_filename: path.basename(target_abspath),
          target_parent_abspath: path.dirname(target_abspath),
          target_from_ancestor_relpath: ancestor_info.target_from_ancestor_relpath,
          target_path_final,
          target_is_dir,
          target_is_file,
          target_is_relative: Boolean(relative),

          closest_common_ancestor: ancestor_info.closest_common_ancestor,
      }
  }
  4298. // test symlink and common ancestor finding
  4299. // console.log(await findCommonAncestor('/Volumes/NVME/Users/squash/Local/Code/archiveboxes/archivebox-spreadsheet-bot/data/archive/1709724410.19269/seo.json', '/Volumes/NVME/Users/squash/Local/Code/archiveboxes/archivebox-spreadsheet-bot/data/archive/1709724410.19269/seo2.json'))
  4300. // console.log(await findCommonAncestor('/Volumes/NVME/Users/squash/Local/Code/archiveboxes/archivebox-spreadsheet-bot/data/archive/1709724410.19269', '/Volumes/NVME/Users/squash/Local/Code/archiveboxes/archivebox-spreadsheet-bot/data/index/snapshots_by_domain/twitter.com/1709724410.19269', {relative: true, search_limit: '/Volumes/NVME/Users/squash/Local/Code/archiveboxes/archivebox-spreadsheet-bot/data/'}))
  4301. // console.log(await overwriteSymlink('/Volumes/NVME/Users/squash/Local/Code/archiveboxes/archivebox-spreadsheet-bot/data/archive/1709724410.19269', '/Volumes/NVME/Users/squash/Local/Code/archiveboxes/archivebox-spreadsheet-bot/data/index/snapshots_by_domain/twitter.com/1709724410.19269'))
  4302. // console.log(await overwriteSymlink('/Volumes/NVME/Users/squash/Local/Code/archiveboxes/archivebox-spreadsheet-bot/data/index/snapshots_by_domain/twitter.com/1709724410.19269', '/Volumes/NVME/Users/squash/Local/Code/archiveboxes/archivebox-spreadsheet-bot/data/index/favorite_snapshots/1709724410.19269', {relative: false, mkdirs: true, search_limit: '/Volumes/NVME/Users/squash/Local/Code/archiveboxes/archivebox-spreadsheet-bot/data/'}))
  async function overwriteDir(path) {
      // Replace whatever is at the given path with a fresh empty directory and return the path.
      // Anything already there is removed first (important otherwise we may create a folder
      // inside an existing folder/symlink); removal failures are ignored, mkdir failures are not.
      const removal_options = { recursive: true, force: true }
      try {
          await fs.promises.rm(path, removal_options)
      } catch {
          // best-effort cleanup only
      }

      await fs.promises.mkdir(path, { recursive: true })
      return path
  }
  async function overwriteFile(path, contents, options={encoding: 'utf8', flag: 'w', flush: false, block: true}) {
      // Write a JS value to a fresh file and return the path that was actually written.
      //   path:      destination path; if a directory already occupies it, '<path>.1', '<path>.2', ... is used instead
      //   contents:  number/bigint/boolean (written as String(value)), string, Buffer,
      //              a stream with .pipe() (piped into the file), or an object/array (written as indented JSON)
      //   options:   fs.writeFile options, plus block: wait (via blockUntilExists) until the written file is visible (default true)
      // returns the final path
      // throws a string for null/undefined/function contents or any other unsupported type

      // split our own `block` flag from the real writeFile options without touching the caller's object
      const {block, ...write_options} = options
      const block_until_created = block ?? true

      try {
          // delete any existing symlink/file present at the destination path
          // (important otherwise we may write into an existing symlink by accident)
          await fs.promises.unlink(path)
      } catch(err) {}

      try {
          // if a directory already lives at the destination, fall back to <path>.1, <path>.2, ...
          // (common when trying to write response JSON e.g. http://www.instagram.com/api/graphql returns json and www.instagram.com/api/graphql/abc returns json)
          // the suffix is always appended to the ORIGINAL path: stripping the previous suffix with a
          // string replace would also eat an unrelated '.0' elsewhere in the path (e.g. in a timestamp dir name)
          const base_path = path
          let nonce = 1
          while ((await fs.promises.stat(path)).isDirectory()) {
              path = `${base_path}.${nonce}`
              nonce++;
              // NOTE: this throw lands in the catch below, so it is only logged and the write is still attempted
              if (nonce > 20) throw `Too many conflicting files while trying to write to ${prettyPath(path)}`
          }
      } catch(err) {
          // stat failing because nothing exists at the path is the normal, expected way out of the loop
          if (!String(err).includes('no such file or directory')) {
              console.warn('[⚠️] Warning: Problem with conflicting directory at while trying to write file', err)
          }
      }

      // refuse writing undefined/null/function because its likely an error and not intended
      const content_is_null = (contents === null) || (contents === undefined)
      const content_is_func = (typeof contents === 'function')
      if (content_is_null || content_is_func) {
          throw `Cannot write ${typeof contents} ${contents} to file: ${path}`
      }

      // shared tail for everything that ends up as a string or Buffer
      const writeAndConfirm = async (data) => {
          await fs.promises.writeFile(path, data, write_options as any)
          if (block_until_created) await blockUntilExists(path, {min_bytes: Buffer.byteLength(data)})
          return path
      }

      // Numbers, BigInts, and Booleans can be cast to strings, then written
      const content_is_primitive = ['number', 'bigint', 'boolean'].includes(typeof contents)
      if (content_is_primitive) {
          return await writeAndConfirm(String(contents))
      }

      // Strings and Buffers can be written directly to file
      // (boxed String objects are unwrapped first, Buffer.byteLength does not accept them)
      if (contents instanceof String) {
          contents = String(contents)
      }
      if ((typeof contents === 'string') || Buffer.isBuffer(contents)) {
          return await writeAndConfirm(contents)
      }

      // stream objects (anything with .pipe) are piped into the file
      const content_is_stream = (contents?.pipe)
      if (content_is_stream) {
          const stream_byte_length = contents.writableLength
          const dest_file = fs.createWriteStream(path);
          await finished(contents.pipe(dest_file))
          if (block_until_created) await blockUntilExists(path, {min_bytes: stream_byte_length})
          return path
      }

      // Objects and Arrays can be JSON-stringified then written into file
      const content_is_obj = (Array.isArray(contents) || typeof contents === 'object')
      if (content_is_obj) {
          return await writeAndConfirm(JSON.stringify(contents, null, 4))
      }

      throw `Cannot write contents of type ${typeof contents} to file: ${path} < ${contents}`
  }
  async function saveExecResult(bin, args=null, {original_url, version}, {cwd='.', timeout=60_000, ...spawn_options}={}) {
      // Spawn a command in cwd, capturing its output and metadata next to it:
      //   cmd.sh (the command as a bash script + run log), stdout.log, stderr.log, and index.json (written once the process closes)
      //   bin:            name/path of the binary to run, e.g. 'yt-dlp'
      //   args:           array of arguments, e.g. ['--some-arg', '--some-other-arg']
      //   original_url:   URL this run is for (must contain '://')
      //   version:        version string recorded in the logs
      //   cwd:            working directory for the process and its log files (created if missing)
      //   timeout:        NOTE: accepted but not applied, the process timeout is hard-coded to 5min below
      //   spawn_options:  any other options are passed through to child_process.spawn
      // returns a snapshot of the exec info taken right after spawning, plus getResult(timeout) which
      //         resolves with the live exec info object once the process has closed and index.json has been written
      assert(bin)
      assert(original_url && original_url.includes('://'))
      assert(version)

      const BIN_NAME = bin                    // 'yt-dlp'
      const ARGS = args || []                 // ['--some-arg', '--some-other-arg']
      const CWD = cwd || process.cwd()        // '.'
      const TIMEOUT = 300_000                 // 5min timeout
      const PATH = process.env.PATH

      // use the resolved CWD everywhere so the log files always land in the dir the process runs in
      await fs.promises.mkdir(CWD, {recursive: true})

      // quick-n-dirty dump of cmd to bash script, but this might be better: https://github.com/nodejs/node/issues/34840#issuecomment-677402567
      const cmd_log_str = [
          '#!/usr/bin/env bash',
          `TYPE="${BIN_NAME}"`,
          `URL="${original_url}"`,
          `VERSION="${version}"`,
          `TIMEOUT=${TIMEOUT}`,
          `CWD="${CWD}"`,
          `PATH="${PATH}:$PATH"`,
          `${BIN_NAME} ${ARGS.map(arg => JSON.stringify(arg)).join(' ')}`,
          '',
      ].join('\n')
      const cmd_log = path.join(CWD, 'cmd.sh')
      await overwriteFile(cmd_log, cmd_log_str)

      const stdout_log = fs.createWriteStream(path.join(CWD, 'stdout.log'))
      const stderr_log = fs.createWriteStream(path.join(CWD, 'stderr.log'))

      const start_date = new Date()
      const start_ts = Number(start_date)
      const start_time = start_date.toISOString()

      const child = child_process.spawn(
          BIN_NAME,
          ARGS,
          {
              cwd: CWD,
              timeout: TIMEOUT,                          // 5min timeout
              stdio: [null, 'pipe', 'pipe'],             // </dev/null >./stdout.log 2>./stderr.log
              // detached: true,                         // run in background, don't block on response
              ...spawn_options,
          },
      )
      child.stdout.setEncoding('utf8')
      child.stdout.pipe(stdout_log)
      child.stderr.setEncoding('utf8')
      child.stderr.pipe(stderr_log)

      const exec_info = {
          TYPE: BIN_NAME,
          URL: original_url,
          VERSION: version,
          bin_name: BIN_NAME,
          args: ARGS,
          timeout: TIMEOUT,
          hostname: os.hostname(),
          bin_paths: PATH,
          ppid: process.pid,
          pid: child.pid,
          start_ts,
          start_time,
          end_time: null,
          end_ts: null,
          duration: null,
          returncode: null,
          log_files: {},
          output_files: {},
      }

      // flipped only after the close handler has finished collecting files and writing index.json,
      // so getResult never hands out a half-populated exec_info
      let close_handled = false

      // promise that resolves when the command is finished executing
      // TODO: refactor to use withTimeout
      const getResult = (timeout=TIMEOUT) =>
          new Promise((resolve, reject) => {
              const loop = setInterval(() => {
                  if (close_handled) {
                      clearInterval(loop)
                      clearTimeout(timer)
                      resolve(exec_info)
                  }
              }, 100)

              const timer = setTimeout(() => {
                  clearInterval(loop)
                  if (close_handled) {
                      // finished in the gap between the last poll and this timer firing
                      resolve(exec_info)
                  } else {
                      reject(new Error(`Process ${BIN_NAME} did not finish within TIMEOUT=${timeout}`))
                  }
              }, timeout);
          })

      const logFilesFilter = ({relpath}) =>
          ['cmd.sh', 'stdout.log', 'stderr.log'].includes(relpath)

      const outputFilesFilter = ({relpath}) =>
          !['cmd.sh', 'stdout.log', 'stderr.log', 'index.json'].includes(relpath)

      const getOutputFiles = async (filter=outputFilesFilter) => {
          return await getDirInfo(CWD, {filter, withHelpers: false, withRoot: false, maxdepth: 6})
      }

      child.on('close', async (returncode) => {
          try {
              const end_date = new Date()
              exec_info.returncode = returncode
              exec_info.pid = child.pid
              exec_info.end_ts = Number(end_date)
              exec_info.end_time = end_date.toISOString()
              exec_info.duration = exec_info.end_ts - exec_info.start_ts
              exec_info.log_files = await getOutputFiles(logFilesFilter)
              exec_info.output_files = await getOutputFiles(outputFilesFilter)

              const end_metadata = [
                  '',
                  `# END_TIME="${exec_info.end_time}"`,
                  `# DURATION=${exec_info.duration}`,
                  `# RETURNCODE=${exec_info.returncode}`,
                  '',
              ].join('\n')
              await fs.promises.appendFile(cmd_log, end_metadata)

              // write exec_info json (which includes file list) to CWD/index.json
              await overwriteFile(path.join(CWD, 'index.json'), exec_info)
          } finally {
              // always release getResult waiters, even if collecting/writing the results failed
              close_handled = true
          }
      })
      // child.unref() // dont wait for child process to close

      const start_metadata = [
          '',
          '#################### LAST RUN LOG ####################',
          `# HOSTNAME="${exec_info.hostname}"`,
          `# PPID=${exec_info.ppid}`,
          `# PID=${exec_info.pid}`,
          `# START_TIME="${exec_info.start_time}"`,
          '',
      ].join('\n')
      await fs.promises.appendFile(cmd_log, start_metadata)

      return {
          ...exec_info,
          getResult,
      }
  }
  // in-memory cache of file hashes, keyed by PATH:SIZE:LAST_MODIFIED_TIME
  const HASH_CACHE = {}

  async function sha256File(file_path: string, {pwd=null}: {pwd?: string}={}) {
      // Compute the sha256 hex digest of a file's contents.
      //   file_path:  path of the file; joined onto pwd when it does not already start with pwd
      //   pwd:        base directory (defaults to the parent directory of file_path)
      // returns the hex digest; results are memoized in HASH_CACHE until the file's realpath, size or mtime changes
      // rejects if the file cannot be stat'ed, resolved, or read
      pwd = pwd || path.dirname(file_path);
      if (!file_path.startsWith(pwd)) {
          file_path = path.join(pwd, file_path);
      }

      const dirent = fs.statSync(file_path);
      const abspath = fs.realpathSync(file_path);
      const cache_key = `${abspath}:${dirent.size}:${dirent.mtimeMs}`;  // PATH:SIZE:LAST_MODIFIED_TIME

      // cache hit: answer immediately and do NOT fall through to re-reading the whole file
      if (cache_key in HASH_CACHE) {
          return HASH_CACHE[cache_key] as string;
      }

      return new Promise((resolve, reject) => {
          const hash = crypto.createHash('sha256');
          const rs = fs.createReadStream(abspath);
          rs.on('error', reject);
          rs.on('data', chunk => hash.update(chunk));
          rs.on('end', () => {
              const final_hash = hash.digest('hex');
              HASH_CACHE[cache_key] = final_hash;
              resolve(final_hash);
          });
      }) as Promise<string>
  }
  async function getDirSha256(dir_path, {pwd=null, withRoot=true, filter=null, maxdepth=-1, subfiles=null}={}) {
      // Compute the sha256 of every file under dir_path, plus a merkle-style hash for each
      // subdirectory and (optionally) for dir_path itself.
      //
      // dir_path:  path       absolute or relative path of the directory you want the merkle sha256 for
      // pwd:       path       (optional) absolute path of the directory you want to interpret dir_path relative to (defaults to dir_path)
      // withRoot:  bool       include a summary entry for the root dir_path dir in the result as '.'
      // filter:    function   (optional) provide a filter func for dir entries ({abspath, relpath, basename, dirent}) => true/false
      // maxdepth:  number     (optional) does not affect actual calculations, but hides entries below a certain depth in the returned output for brevity
      // subfiles:  dirent[]   (optional) instead of reading disk, you can manually provide a getDirEntries results list to use
      //
      // returns:   {[path]: sha256 hex}, e.g.
      //            {'versions/20240413144307/screen recording.mp4': '1df4d9c3aca8b36f1f73e327d56038f80a35db407a298edb16c72576d7dd894e', ...}
      // throws:    if dir_path cannot be stat'ed or is not a directory, or if maxdepth < -1

      pwd = pwd || dir_path
      if (!dir_path.startsWith(pwd)) {
          dir_path = path.join(pwd, dir_path)
      }

      const dirent = await fs.promises.stat(dir_path)
      assert(dirent.isDirectory(), `Tried to compute sha256 of path but missing or not a directory! ${dir_path}`)
      assert((maxdepth >= -1), `maxdepth must be -1, 0, or 1, 2, 3, etc... (got ${maxdepth})`)
      // NOTE: a custom filter changes which files feed into the root hash, so root hashes computed
      // with different filters are not comparable with each other.

      // get the sha256 of every file in a directory recursively (excluding hidden files and symlinks)
      // EQUIVALENT TO: find . -type f -not -path '*/.*' -print0 | sort -z | xargs -0 sha256sum
      const all_subfiles = (subfiles as string[]) || await getDirEntries(dir_path, {
          pwd,
          recursive: true,
          includeFiles: true,
          includeDirs: false,
          // dont pass maxdepth here, we need the entire file listing to accurately calculate parent dir hashes.
          // it never makes sense to ignore subfiles beyond a certain depth for hash calculation. Hashes are
          // only useful IDs if they are consistent+repeatable, hashing to an arbitrary depth will produce
          // many different hashes for the same directory, which is not something we need/want polluting the hash space.
          filter,   // we do however allow passing a manual filter func, which does actually affect the hash
                    // this is useful to allow quick checks to see whether a certain subset of files has changed or not
      })

      const hashes: {[key: string]: string} = {}
      let hashable_summary_str = ''

      // the real path of the root dir is the same for every file, so resolve it once up front
      // instead of re-resolving it on every loop iteration
      const real_dir_path = await fs.promises.realpath(dir_path)

      for (const subfile of all_subfiles) {
          // entries are interpreted relative to pwd, both for hashing and for building the summary line
          hashes[subfile] = await sha256File(subfile, {pwd})
          const relpath = path.relative(real_dir_path, await fs.promises.realpath(path.join(pwd, subfile)))
          hashable_summary_str += `${hashes[subfile]} ./${relpath}\n`
      }

      // get list of subdirectories and recursively hash every subdirectory
      // EQUIVALENT TO: find . -type d -not -path '*/.*' -maxdepth ${maxdepth} -print | sort
      const subdirs = await getDirEntries(dir_path, {pwd, recursive: true, includeHidden: false, includeDirs: true, includeFiles: false, filter, maxdepth})

      // for each subdirectory, get its hash recursively and store it in the hash list
      for (const subdir of subdirs) {
          // a directory's hash is defined as the hash of all the *files* within (excluding dirs/symlinks/hidden)
          // maxdepth: 0 keeps the nested call's output minimal, we only need its root '.' entry
          const subdir_hashes = await getDirSha256(
              subdir,
              {pwd, withRoot: true, filter, maxdepth: 0},
          )
          hashes[subdir] = subdir_hashes['.']
      }

      // filter results if maxdepth is provided (only hides entries, the hashes above are already final)
      if (maxdepth >= 0) {
          for (const subpath of Object.keys(hashes)) {
              if (pathDepth(subpath) > maxdepth) {
                  delete hashes[subpath]
              }
          }
      }

      // calculate the hash of the root '.' folder by hashing all of hashes of its contents
      // EQUIVALENT TO: find . -type f -not -path '*/.*' -print0 | sort -z | xargs -0 sha256sum | sha256sum
      if (withRoot) {
          // pass the first command's output containing the file list + hashes into another sha256
          // to get the final hash of the whole directory combined
          hashes['.'] = crypto.createHash('sha256').update(hashable_summary_str).digest('hex') as string
      }

      return hashes
  }
  async function getDirInfo(dir_path, {pwd=null, withRoot=true, withHelpers=true, filter=null, maxdepth=-1, subfiles=null}={}) {
      // Build a detailed, dumpable index of a directory's contents, w/ merkle sha256's, sizes, and mimeTypes.
      //
      // dir_path:    path       absolute or relative path of the directory you want info for
      // pwd:         path       (optional) absolute path of the directory you want to interpret dir_path relative to
      // withRoot:    bool       include a summary entry for the root dir_path dir in the result as '.'
      // withHelpers: bool       attach many extra helper attrs to results (beyond the JSON-serializable core data)
      // filter:      function   (optional) filter func for dir entries ({abspath, relpath, basename, dirent}) => true/false
      //                         (it is forwarded to the hash/size walks AND applied again to each finished entry below)
      // maxdepth:    number     (optional) does not affect actual calculations, but hides entries below a certain depth in the returned output
      // subfiles:    dirent[]   (optional) instead of reading disk, you can manually provide a getDirEntries results list to use
      //
      // returns an object keyed by path, e.g.:
      // {
      //     ...
      //     'example.txt': { ... },
      //     'foobar/example.mp3': { ... },
      //     '.': {                                      // this is the fully augmented result when withHelpers=true
      //         is_file: false,
      //         is_dir: true,
      //         filename: '.',
      //         basename: '1709039915.378868',
      //         mimeType: 'inode/directory'
      //         extension: undefined,
      //         num_bytes: 11540961,
      //         num_subpaths: 15,
      //         sha256: '9fc58b3ed887e7139338062ebd49bd6795373759e8acb73d2f7a40f1413789da',
      //         reldepth: 1,
      //         relpath: './',
      //         cwd: '/opt/archivebox/data/archive/1709039915.378868/',
      //         dirname: '/opt/archivebox/data/archive',
      //         abspath: '/opt/archivebox/data/archive/1709039915.378868',
      //         dirent: Stats { dev: 16777240, mode: 16895, uid: 501, ..., mtimeMs: 1717160622956.1357, ctimeMs: 1717160622956.1357 },
      //         created: '2024-05-31T13:03:42.956Z',
      //         modified: '2024-05-31T13:03:42.956Z',
      //         summary: './data/archive/1709039915.378868 (inode/directory 11541kb 9fc58b3e)',
      //         helptext: 'Verify these hashes by running:\n' +
      //                   ' cd /opt/archivebox/data/archive/1709039915.378868 \n' +
      //                   " find . -type f -not -path '*/.*' -print0 | sort -z | xargs -0 sha256sum | sha256sum",
      //     },
      // }

      pwd = pwd || dir_path
      if (!dir_path.startsWith(pwd)) {
          dir_path = path.join(pwd, dir_path)
      }

      // walk the directory twice: once for hashes, once for sizes (both keyed by the same paths)
      const hashes = await getDirSha256(dir_path, {pwd, withRoot, filter, maxdepth, subfiles})
      const sizes = await getDirSizes(dir_path, {pwd, withRoot, filter, maxdepth, subfiles})

      const hashed_paths = Object.keys(hashes)
      const num_total_subpaths = hashed_paths.filter(name => name !== '.').length

      const details = {}
      for (const filename of hashed_paths) {
          if (filename === '.' && !withRoot) continue
          const sha256 = hashes[filename]

          // NOTE(review): the keys appear to be produced relative to pwd by getDirSha256, so joining
          // them onto dir_path presumably assumes pwd === dir_path -- confirm against callers
          const abspath = await fs.promises.realpath(path.join(dir_path, filename))
          const dirent = await fs.promises.stat(abspath)
          const is_file = dirent.isFile()
          const is_dir = dirent.isDirectory()

          // type-specific fields stay undefined for anything that is neither a dir nor a regular file
          let mimeType = undefined
          let extension = undefined
          let num_subpaths = undefined
          if (is_dir) {
              mimeType = 'inode/directory'
              // the root counts every listed path, other dirs count only the listed paths nested beneath them
              num_subpaths = (filename === '.')
                  ? num_total_subpaths
                  : hashed_paths.filter(subpath => subpath.startsWith(filename + '/')).length
          } else if (is_file) {
              mimeType = mime.lookup(abspath) || null
              extension = path.extname(filename)
          }

          // bare-bones info suitable for JSON dumps/exports
          const basic_info = {
              sha256,
              num_bytes: sizes[filename],
              created: (new Date(dirent.ctimeMs)).toISOString(),   // NOTE: taken from ctimeMs (inode change time), not birthtime
              mimeType,
              extension,
              num_subpaths,
          }

          // extra helpers suitable for usage in other areas of the codebase
          const info_with_helpers = {
              ...basic_info,
              filename,
              basename: path.basename(abspath),
              dirname: path.dirname(abspath),
              cwd: dir_path,
              relpath: is_dir ? (filename + '/') : filename,
              reldepth: pathDepth(filename),
              abspath,
              is_file,
              is_dir,
              dirent,
              modified: (new Date(dirent.mtimeMs)).toISOString(),
              summary: `${prettyPath(abspath)} (${mimeType} ${Math.round(basic_info.num_bytes/1000)}kb ${sha256.substring(0, 8)})`,
              // only the root entry explains how to reproduce the hashes by hand
              helptext: (filename === '.')
                  ? `Verify these hashes by running:\n cd ${prettyPath(abspath)} \n find . -type f -not -path '*/.*' -print0 | sort -z | xargs -0 sha256sum | sha256sum`
                  : undefined,
          }

          // the filter always sees the fully augmented entry, even when only basic info is returned
          if (typeof filter === 'function' && !filter(info_with_helpers)) continue

          details[filename] = withHelpers ? info_with_helpers : basic_info
      }
      return details
  }
  4698. // console.log(await getDirSha256(
  4699. // '/opt/archivebox/data/archive/1709039915.378868/',
  4700. // {
  4701. // withRoot: true,
  4702. // maxdepth: -1,
  4703. // filter: ({relpath}) => relpath.startsWith('versions'),
  4704. // },
  4705. // ))
  4706. // console.log(await getDirSizes(
  4707. // '/opt/archivebox/data/archive/1709039915.378868/',
  4708. // {
  4709. // withRoot: false,
  4710. // maxdepth: 2,
  4711. // filter: ({relpath}) => !relpath.startsWith('versions'),
  4712. // },
  4713. // ))
  4714. // console.log(await getDirInfo(
  4715. // '/opt/archivebox/data/archive/1709039915.378868/',
  4716. // {
  4717. // withRoot: true,
  4718. // withHelpers: true,
  4719. // maxdepth: 1,
  4720. // // filter: ({relpath}) => relpath.startsWith('versions'),
  4721. // },
  4722. // ))
  // Options accepted by detectFilename(). Everything is optional, but at least one of
  // {response, page} must be provided, and abspath cannot be combined with dir/filename/basename/extension.
  type DetectFilenameOptions = {
      url?: string;                          // URL to name the file after (detected from response/page when omitted)
      response?: HTTPResponse | Response;    // response whose url/headers are inspected (puppeteer or fetch-style)
      page?: Page;                           // page to read the URL from when no url/response url is available
      dir?: string;                          // output directory (defaults to the current working directory)
      abspath?: string;                      // full output path, parsed into dir/filename/basename/extension
      filename?: string;                     // force the filename instead of deriving it from the URL path
      basename?: string;                     // force the filename stem (the part without the extension)
      extension?: string;                    // force the extension (a leading '.' is stripped)
      mimeType?: string;                     // force the mimeType instead of detecting it
      resourceType?: string;                 // request resource type, used to guess a mimeType
  };
  async function detectFilename({ url, response, page, dir, abspath, filename, basename, extension, mimeType, resourceType }: DetectFilenameOptions) {
      // this function takes a url (and/or response/page), and detects the abspath,dir,filename,basename,extension,mimeType
      // from the URL (+ any enforced path components passed in via args)
      // example: detectFilename({url: 'https://example.com/favicon.png', extension: 'ico'}) outputs 'favicon.ico'
      //
      // returns: {url, dir, abspath, filename, basename, extension, mimeType, resourceType, resp_mimetype}
      // throws (strings): when neither response nor page is given, when no URL can be detected,
      //                   when abspath is combined with dir/filename/basename/extension, or when no filename can be detected
      //
      // it has some quirks that are specific to archiving and may not behave as you expect
      // e.g. if visiting the url https://example.com/error.zip returns a 500 text/html error page
      // this may still save it as a .zip with mimeType=application/x-zip and ignore the response mimeType the url ends in .zip
      // however, if the url has no extension, e.g. https://example.com/error it will
      // auto-detect the mimeType based on the response and append an extension, saving as error.html
      //
      // ⚠️ SECURITY WARNING: think carefully about the permissions, shell injection, and RCE implications of any changes made here ⚠️
      // this function writes untrusted web content to the filesystem using auto-detected mimetype to co-erce the extension,
      // which can be dangerous (e.g. what if one of these downloads is a malicious ransomware .exe, do we really want to give it .exe?
      // if we do, how do we make sure it never gets executed? (without damaging the integrity of the copy)

      if (!(response || page)) throw 'Either a page or a response must be provided in order to detect mimeType & URL'

      // remember which path components the *caller* supplied, before any of them get auto-detected below,
      // so that the {abspath} conflict check further down is not tripped by our own guesses
      const caller_passed_path_parts = Boolean(dir || filename || basename || extension)

      if (response && (typeof response.headers !== 'function')) {
          // adapt a fetch-style response (url + headers are properties) to the puppeteer-style
          // interface used below (url() + headers() are methods)
          const node_fetch_response: Response = response as Response

          // fetch-style Headers objects only expose their values via .get()/.forEach(), indexing them
          // with ['content-type'] always yields undefined, so copy them into a plain lowercase-keyed object
          const raw_headers: any = node_fetch_response.headers
          let header_map = raw_headers || {}
          if (typeof raw_headers?.forEach === 'function') {
              header_map = {}
              raw_headers.forEach((value, name) => { header_map[String(name).toLowerCase()] = value })
          }

          response = {
              url: () => node_fetch_response.url,
              headers: () => header_map,
          } as unknown as HTTPResponse
      }
      response = response as HTTPResponse

      // page may be absent when only a response was provided, so guard it instead of crashing with a TypeError
      url = url || response?.url() || (await page?.url())
      if (!url) throw 'URL was not provided and could not be detected from {response, page}'

      // Document, Stylesheet, Image, Media, Font, Script, TextTrack, XHR, Fetch, Prefetch, EventSource, WebSocket, Manifest, SignedExchange, Ping, CSPViolationReport, Preflight, Other
      try {
          resourceType = resourceType || response?.request()?.resourceType()
      } catch(err) {
          // ignore, sometimes response is null/not available
      }

      // NOTE(review): these keys are capitalized, but puppeteer's HTTPRequest.resourceType() returns
      // lowercase names ('stylesheet', 'script', ...) -- confirm whether this lookup should be case-insensitive
      const resourceTypeToMimeType = {
          'Stylesheet': 'text/css',
          'Script': 'application/x-javascript',
          'WebSocket': 'application/json',
          'Website': 'text/html',
      }

      mimeType = mimeType || resourceTypeToMimeType[resourceType]              // guess extension based on request resourceType
      extension = extension || (mimeType ? mime.extension(mimeType) : null)

      // handle special url cases (e.g. schemes in URL_SCHEMES_IGNORED)
      if (url.startsWith('about:blank')) {
          filename = 'about_blank'
          mimeType = 'text/html'
      }
      else if (url.startsWith('data:')) {
          filename = `data__${hashCode(url)}`
      }

      if (abspath) {
          if (caller_passed_path_parts) {
              throw '{abspath} should not be passed with other options (e.g. dir, filename, basename, extension)'
          }
          // path.parse('/home/user/dir/file.txt') returns:
          // { root: '/',
          //   dir: '/home/user/dir',
          //   base: 'file.txt',
          //   ext: '.txt',
          //   name: 'file' }
          ({dir, base: filename, ext: extension, name: basename} = path.parse(abspath))
      } else {
          dir = dir || path.resolve(process.cwd())

          filename = filename                                     // https://example.com/a.1.zip?e.pdf=2#g.h=3  => a.1.zip
              || (new URL(url)).pathname.split('/').at(-1)        // https://example.com/file124.rss            => file124.rss   prefers last component of path with no query/hash
              || 'index'                                          // https://example.com/abc/def/               => index.html
              //|| (new URL(url)).hostname.replaceAll('.', '_')   // https://example.com                        => example_com   (but if disabled, this would be index.html)
      }
      if (!filename) throw 'filename/abspath were not passed and could not be detected from url'

      const path_extname = path.extname(filename)
      const resp_mimetype = response && (
          (response as any).mimeType
          || response.headers()['content-type']?.split(';')[0]
          || resourceTypeToMimeType[resourceType]
          || 'application/octet-stream'
      )

      mimeType = mimeType                                         // https://example.com/a.1.zip?e.pdf=2#g.h=3  => application/x-zip   prefers mimetype based on extension in path, falls back to response mimeType
          || (path_extname && mime.lookup(path_extname))          // https://example.com/file124.rss            => application/rss+xml
          || resp_mimetype                                        // https://example.com/get?type=png           => image/png

      extension = extension
          || (path_extname && path_extname.replace('.', ''))      // https://example.com/a.1.zip?e.pdf=2#g.h=3  => zip    prefers extension in path, falls back to response mimeType's suggested extension
          || (resp_mimetype && mime.extension(resp_mimetype))     // https://example.com                        => html
          || ''                                                   // https://example.com/websocket.1            =>
      if (extension.startsWith('.'))
          extension = extension.slice(1)

      basename = basename                                         // https://example.com/a.1.zip?e.pdf=2#g.h=3  => a.1    prefers the filename in path (without extension)
          || (path.parse(filename).name)

      basename = basename.slice(0, 120)                               // truncate at 120 characters (leaving 8 chars for .ext)
      basename = basename.replace(/[^a-zA-Z0-9%+?&=@;_ \.-]/g, '')    // strip characters not allowed in filenames

      // re-assemble the final filename from the sanitized basename + the detected extension
      filename = basename + '.' + extension
      if (filename.endsWith('.'))
          filename = filename.slice(0, -1)                            // no extension detected, dont leave a dangling '.'

      // an explicitly passed abspath always wins over the re-assembled dir + filename
      abspath = abspath || path.join(dir, filename)

      return {
          url,
          dir,
          abspath,
          filename,
          basename,
          extension,
          mimeType,
          resourceType,
          resp_mimetype,
      }
  }
  // Options accepted by download(): everything detectFilename() takes, plus the fetching-related extras below.
  interface DowloadOptions extends DetectFilenameOptions {
      browser?: Browser;             // browser used to open a temporary page when no response is passed in
      expected_mimetype?: string;    // prefix that the response content-type must start with (default '' matches anything)
      timeout?: number;              // navigation timeout in ms for the temporary page (default 120_000)
  }
  async function download({ url, browser, page, response, dir, abspath, filename, basename, extension, expected_mimetype, timeout }: DowloadOptions) {
      // Save the body of a URL/response to disk, using detectFilename() to decide where it should go.
      //
      // If no response is passed in, the url is fetched in a temporary new page (created from browser,
      // or from page's browser), which is always closed again before returning or throwing.
      //
      // returns: {url, response, errors, dir, abspath, filename, basename, extension, mimeType, bytesBuffer, num_bytes}
      //          (a mimetype mismatch or failed write is reported via errors[] rather than thrown)
      // throws:  when no URL or no browser/page is available, or when navigation / filename detection fails

      url = url || (response as HTTPResponse)?.url() || (await page?.url())
      if (!url) throw 'URL was not provided and could not be detected from {response, page}'
      ALREADY_ARCHIVED.add(url.slice(0, 4096))  // prevent running whole archive task on tabs we create for just for downloading

      browser = browser || (page && (await page.browser()))
      timeout = timeout || 120_000
      expected_mimetype = expected_mimetype || ''

      let newPage = null
      let mimeType
      const errors = []
      let num_bytes = 0
      let bytesBuffer = null

      try {
          // if we need to fetch the url (i.e. it's not already been requested)
          if (!response) {
              if (!browser) throw 'No {browser} or {page} was provided to download with'
              newPage = await browser.newPage()
              if (page) await page.bringToFront()  // if origin page is provided, make sure it stays in foreground
              response = await newPage.goto(url, {timeout: timeout, waitUntil: 'networkidle0'})
              if (page) await page.bringToFront()  // if origin page is provided, make sure it stays in foreground
          }
          url = url || (response as HTTPResponse)?.url() || (await newPage?.url()) || (await page?.url());
          const response_mimetype = (response as HTTPResponse).headers()['content-type']?.split(';')[0] || 'text/html'

          // detect the filename we should write to based on provided url/response/page/filename/extension suggestions
          ({
              dir,
              abspath,
              filename,
              basename,
              extension,
              mimeType,
          } = await detectFilename({url, page, response, dir, abspath, filename, basename, extension}))

          // if expected_mimetype is passed, make sure response matches it, otherwise consider download a failure
          if (!response_mimetype.startsWith(expected_mimetype)) {
              errors.push(`Expected ${expected_mimetype} but got ${response_mimetype}`)
          } else {
              // download the file using puppeteer's response.buffer()
              try {
                  // write the response bytes into the output file
                  bytesBuffer = await (response as HTTPResponse).buffer()
                  await overwriteFile(abspath, bytesBuffer)
                  num_bytes = bytesBuffer.length
              } catch(err) {
                  errors.push(err)
              }

              // security check to make sure downloaded file is not executable (random binaries downloaded off the internet = dangerous)
              // (best-effort + non-blocking: it only warns, and stays silent if the file is missing or not executable)
              fs.access(abspath, fs.constants.X_OK, (err) => {
                  if (!err) console.warn(
                      '[⚠️] SECURITY WARNING: Downloaded file appears to be executable:', prettyPath(abspath),
                      '\n (be careful running untrusted programs downloaded from the internet!)'
                  )
              })
          }
      } finally {
          // if we opened a dedicated page for downloading, always close it (even when navigation or
          // filename detection threw), otherwise every failed download leaks an open tab
          if (newPage) {
              try { await newPage.close() } catch(err) { /* page may already be closed by now, which is fine */ }
          }
      }

      // failures are reported to the caller via errors[] instead of being logged here
      if (!errors.length) {
          console.log(`[💾] Downloaded ${url.substring(0, 40)} (${num_bytes} ${mimeType})...`.padEnd(82), prettyPath(abspath))
      }

      return {
          url, response, errors,
          dir, abspath, filename, basename, extension, mimeType,
          bytesBuffer, num_bytes,
      }
  }
  4909. /************************** Puppeteer Launching *******************************/
  async function startCluster(puppeteer, args=CHROME_ARGS_DEFAULT) {
      // Launch a puppeteer-cluster of CHROME_CLUSTER_WORKERS browsers and return the cluster.
      //
      // puppeteer:  the puppeteer instance the cluster should launch browsers with
      // args:       chrome launch CLI args (puppeteer's own default args are disabled entirely)
      const launch_banner = `[🎭] Launching ${CHROME_CLUSTER_WORKERS}x Chromium browsers with puppeteer-cluster:`
      console.log(launch_banner.padEnd(82), prettyPath(CHROME_PROFILE_PATH))

      const puppeteerOptions = {
          args,                                         // all the chrome launch CLI args
          ignoreDefaultArgs: true,                      // trust me, we have enough args already...
          // dumpio: true,                              // full debug log output, super noisy
      }

      const cluster = await Cluster.launch({
          puppeteer,
          monitor: true,
          maxConcurrency: CHROME_CLUSTER_WORKERS,
          sameDomainDelay: 2550,
          workerCreationDelay: 250,
          timeout: 300_000,                             // total ms timeout for an entire task (1000ms * 60s * 5m)
          concurrency: Cluster.CONCURRENCY_PAGE,        // share cookies between all tabs in a given browser
          puppeteerOptions,
      })

      console.log('*************************************************************************')
      return cluster
  }
  async function remoteBrowser(puppeteer, {browserURL, browserWSEndpoint}) {
      // Attach puppeteer to an already-running Chromium (via its browserURL or its websocket endpoint)
      // and return the connected browser.
      console.log('[🎭] Connecting Puppeteer to existing Chromium browser via:', browserURL || browserWSEndpoint)

      // the target filter rejects every target until connect() has resolved, and accepts all targets afterwards
      // (presumably so the initial connection doesn't attach to every pre-existing tab -- TODO confirm)
      let is_attached = false
      const targetFilter = () => is_attached

      const browser = await puppeteer.connect({browserURL, browserWSEndpoint, defaultViewport: null, targetFilter})
      is_attached = true

      console.log('*************************************************************************')
      return browser
  }
  async function startBrowser(puppeteer, args=CHROME_ARGS_DEFAULT) {
      // Launch a single Chromium browser with puppeteer, wire up some debugging globals,
      // prime the extensions cache, and return the browser with a placeholder tab left open.
      //
      // puppeteer:  the puppeteer instance to launch with
      // args:       chrome launch CLI args (puppeteer's own default args are disabled entirely)
      const divider = '*************************************************************************'

      console.log('[🎭] Launching Puppeteer Chromium browser...'.padEnd(82+1), prettyPath(CHROME_PROFILE_PATH))
      const browser = await puppeteer.launch({ignoreDefaultArgs: true, args, dumpio: true})
      globalThis.browser = browser
      console.log(divider)

      // store all active tabs on global var by url for easier vscode interactive debugging
      // (globalThis.tabs maps url => page, globalThis.tab is the most recently seen open page)
      async function storeTabForDebugger(target) {
          try {
              globalThis.tabs = globalThis.tabs || {}
              const url = target.url()
              const page = await target.page()
              const is_gone = !page || page?.isClosed()
              if (is_gone) {
                  delete globalThis.tabs[url]
                  return
              }
              globalThis.tab = page
              globalThis.tabs[url] = page
          } catch(err) {console.warn(err)}
      }
      for (const event_name of ['targetcreated', 'targetchanged', 'targetdestroyed']) {
          browser.on(event_name, storeTabForDebugger)
      }

      // wait for initial extension background.js/service worker targets to load
      await wait(3_000)

      // prime the extensions cache
      const extensions = await getChromeExtensionsFromCache({browser})
      globalThis.extensions = extensions              // for easier debugging only

      // give the user 2min to check any issues with the initial startup pages (bot profile pages),
      // solve captchas, re-login, etc. then close them after that to save resources
      // (the list is captured here, so pages opened later -- like the placeholder below -- are not closed)
      const startup_pages = (await browser.pages())
      const startup_page_close_delay = 120_000
      const closeStartupPages = async () => {
          for (const page of startup_pages) {
              try { await page.close() } catch(err) { /* page may already be closed by now, which is fine */ }
          }
      }
      setTimeout(closeStartupPages, startup_page_close_delay)

      // setup any extensions that need final runtime configuration using their options pages
      // await setup2CaptchaExtension({browser, extensions})

      // open a placeholder page so browser window stays open when there are no active archiving pages
      // (it's annoying to have the entire window open/close/open/close/etc every time an archive task runs)
      const empty_page = await browser.newPage()
      await wait(250)
      await empty_page.goto('chrome://version')
      await wait(500)

      console.log(divider)
      return browser
  }
  async function startAPIServer(port=API_SERVER_PORT, host=API_SERVER_HOST, taskCallback=null) {
      // Start a tiny HTTP API for submitting URLs to archive, and return the server
      // (it is returned immediately, it may not have finished binding to the port yet).
      //
      // port, host:    where to listen
      // taskCallback:  async function that takes ({url}) => and does something with it (required)
      //
      // routes:
      //   GET  /?url=https://...   awaits taskCallback({url}), then responds 200 with the url + its task path
      //                            (a failing task still responds 200, with the error appended as a third line)
      //   GET  (missing/bad url)   responds 500 with a usage hint
      //   POST (JSON body)         parses + logs the body, responds 200 (or 400 if it is not valid JSON)
      //   anything else            responds 405
      assert(taskCallback && (typeof taskCallback === 'function'))

      const sendJSON = (res, status, payload) => {
          res.writeHead(status, { 'Content-Type': 'application/json' });
          res.end(JSON.stringify(payload));
      }

      const handlePost = (req, res) => {
          console.log(`[API][POST] ${req.url}`)
          // buffer the whole request body, then try to parse it once the request has ended
          let body = '';
          req.on('data', (chunk) => {
              body += chunk;
          });
          req.on('end', () => {
              try {
                  const jsonData = JSON.parse(body);
                  // Process the JSON data
                  console.log(jsonData);
                  sendJSON(res, 200, { message: 'JSON data received' });
              } catch (error) {
                  sendJSON(res, 400, { error: 'Invalid JSON data' });
              }
          });
      }

      const handleGet = async (req, res) => {
          console.log(`[API][GET] ${req.url}`)
          const parsedUrl = new URL(`http://${host}:${port}${req.url}`)
          const url = (new URLSearchParams(parsedUrl.search)).get('url');

          const looks_like_url = url && url.includes('://')
          if (!looks_like_url) {
              res.writeHead(500, { 'Content-Type': 'text/plain' });
              res.end(`Bad URL: ${url}\n\nExpected: /?url=https://example.com/url/to/archive`);
              return
          }

          // headers go out before the task runs, so the status is 200 whether or not the task succeeds
          res.writeHead(200, { 'Content-Type': 'text/plain' });
          try {
              await taskCallback({url})
              res.end(`${url}\n${TASK_PATH(url)}`);
          } catch(err) {
              res.end(`${url}\n${TASK_PATH(url)}\n${err}`);
          }
      }

      const server = createServer(async (req, res) => {
          switch (req.method) {
              case 'POST':
                  handlePost(req, res)
                  break
              case 'GET':
                  await handleGet(req, res)
                  break
              default:
                  sendJSON(res, 405, { error: 'Method not allowed' })
          }
      })

      server.listen(port, host, () => {
          console.log(`[🎰] API Server listening for requests on http://${host}:${port}/?url=...`);
      })
      console.log('*************************************************************************')
      return server
  }
  5034. async function main(urls, cluster=CHROME_CLUSTER) {
  5035. process.chdir(DATA_DIR)
  5036. const extensions = await getChromeExtensionsFromPersona({CHROME_EXTENSIONS, CHROME_EXTENSIONS_DIR})
  5037. const args = getChromeArgs({...CHROME_LAUNCH_OPTIONS, CHROME_EXTENSIONS: extensions})
  5038. const preferences = getChromePreferences({CHROME_PREFERENCES_DEFAULT, CHROME_PREFERENCES_EXTRA, CHROME_DOWNLOADS_DIR, CHROME_EXTENSIONS: extensions})
  5039. const Puppeteer = applyChromePreferences(PupeteerExtra, CHROME_PREFERENCES_PATH, preferences)
  5040. Puppeteer.use(StealthPlugin());
  5041. // Puppeteer.use(ReplPlugin());
  5042. // handled by uBlock Origin & ReCaptcha browser extensions, probably not needed here anymore:
  5043. // Puppeteer.use(RecaptchaPlugin({
  5044. // provider: {id: '2captcha', token: API_KEY_2CAPTCHA},
  5045. // visualFeedback: true,
  5046. // }))
  5047. // const AdblockerPlugin = require('puppeteer-extra-plugin-adblocker')
  5048. // puppeteer.use(AdblockerPlugin({ blockTrackers: true }))
  5049. if (cluster) {
  5050. // launch browser with multiple tabs w/ puppeteer
  5051. const cluster = await startCluster(Puppeteer, args)
  5052. const handleTask = async ({url}) => cluster.queue(url, botArchiveTask)
  5053. const server = await startAPIServer(API_SERVER_PORT, API_SERVER_HOST, handleTask)
  5054. console.log('[📋] Running tasks in parallel with puppeteer cluster...')
  5055. for (const url of urls) {
  5056. if (fs.existsSync(path.join(TASK_PATH(url), 'aiqa.json'))) {
  5057. try {
  5058. JSON.parse((await fs.promises.readFile(path.join(TASK_PATH(url), 'aiqa.json'))).toString())
  5059. console.log(' skipping (already present):', TASK_PATH(url), url)
  5060. continue
  5061. } catch(err) {
  5062. // pass
  5063. }
  5064. }
  5065. cluster.queue(url, botArchiveTask)
  5066. await wait(3_000)
  5067. }
  5068. await cluster.idle();
  5069. await cluster.close();
  5070. } else {
  5071. // launch single new browser w/ puppeter / connect to remote CDP browser w/ puppeteer
  5072. const browser = await startBrowser(Puppeteer, args)
  5073. // const browser = await remoteBrowser(Puppeteer, {browserURL, browserWSEndpoint})
  5074. // run speedtest in the background
  5075. speedtest({browser})
  5076. const handleTask = async ({url}) => await botArchiveTask({page: (await browser.newPage()), data: url})
  5077. const server = await startAPIServer(API_SERVER_PORT, API_SERVER_HOST, handleTask)
  5078. // wait for any pre-run setup tasks or server requests
  5079. await wait(5_000)
  5080. let num_succeeded = 0
  5081. let num_failed = 0
  5082. console.log(`[📋] Running ${urls.length} tasks sequentially with puppeteer browser...`)
  5083. for (const url of urls) {
  5084. const run_count = (num_succeeded + num_failed) || 1
  5085. // check if task should be run or skipped based on existing snapshot data present in directory
  5086. const metrics_path = path.join(TASK_PATH(url), 'metrics.json')
  5087. const screenshot_path = path.join(TASK_PATH(url), 'screenrecording.gif')
  5088. const aiqa_path = path.join(TASK_PATH(url), 'aiqa.json')
  5089. const versions_path = path.join(TASK_PATH(url), 'versions')
  5090. if (fs.existsSync(metrics_path) && fs.existsSync(screenshot_path) && fs.existsSync(aiqa_path) && fs.existsSync(versions_path)) {
  5091. try {
  5092. const ai_qa_result = JSON.parse(await fs.promises.readFile(aiqa_path, 'utf-8'))
  5093. console.log(prettyPath(TASK_PATH(url)), `${ai_qa_result.pct_visible}%`, ai_qa_result.website_brand_name, url.substring(0, 80))
  5094. assert(ai_qa_result.website_brand_name)
  5095. continue
  5096. } catch(err) {
  5097. // pass
  5098. }
  5099. }
  5100. let delay = 0
  5101. // create a new browser page and run the archiving task
  5102. const page = (await browser.newPage())
  5103. try {
  5104. console.log(ANSI.black + `◤==============================================================================[${String(run_count).padStart(3)}]/[${urls.length}]◥` + ANSI.reset)
  5105. await botArchiveTask({page, data: url})
  5106. delay = 1_000
  5107. num_succeeded += 1
  5108. } catch(err) {
  5109. console.error('[❌] Archiving task failed!', url)
  5110. console.error(err)
  5111. num_failed += 1
  5112. delay = 15_000 // extra delay if there are errors
  5113. }
  5114. console.log(ANSI.black + `◣==============================================================================[☑ ${num_succeeded}][🆇 ${num_failed}]◢` + ANSI.reset)
  5115. // check for abnormally high failure rates and exit early if needed
  5116. const failure_pct = Math.round((num_failed/run_count) * 100)
  5117. if (failure_pct > 50) {
  5118. if (run_count > 5) {
  5119. console.warn(`[⚠️] ${failure_pct}% Task failure rate is very high! Will self-cancel after 10 URLs if >50% continue to fail...`)
  5120. }
  5121. if (run_count > 10) {
  5122. throw `Too many tasks failed in a row! Quitting early after ${run_count}/${urls.length} tasks.`
  5123. }
  5124. }
  5125. // increase the delay between tasks based on the ratio of how many are failing:succeeding
  5126. delay = Math.pow(4, (num_failed/(num_succeeded + 3))) * delay
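  // multiplier == 4^(num_failed / (num_succeeded + 3)), applied to the base delay (1s after a success, 15s after a failure)
  // e.g. 0 failed              ==  1 * delay == 1s after a success
  //      3 failed, 0 succeeded ==  4 * delay == 1m after a failure
  //      6 failed, 0 succeeded == 16 * delay == 4m after a failure
  //      9 failed, 0 succeeded == 64 * delay == 16m after a failure
  // etc... (every success grows the divisor, which shrinks the multiplier again)
  // clamped just below to stay between 1s and 1hr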
  5133. delay = Math.min(delay, 3_600_000) // 1hr maximum delay between tasks
  5134. delay = Math.max(delay, 1_000) // 1s minimum delay between tasks
  5135. if (delay > 2_500) {
  5136. console.log('... waiting', Math.round(delay/1000), 'seconds (self rate-limit)...')
  5137. }
  5138. await wait(delay) // base ratelimit
  5139. console.log()
  5140. }
  5141. if (PASSIVE_ARCHIVING) {
  5142. // replace these as-needed:
  5143. const browserURL = 'http://localhost:9222/'
  5144. const browserWSEndpoint = 'ws://localhost:9222/devtools/browser'
  5145. const driver_browser = browser || await remoteBrowser(Puppeteer, {browserURL, browserWSEndpoint})
  5146. const archiver_browser = {} //await startBrowser(Puppeteer, args)
  5147. const extensions = await getChromeExtensionsFromCache({browser: driver_browser})
  5148. // close both browsers if either one is closed
  5149. let browser_is_open = true
  5150. driver_browser.on('disconnected', async () => {browser_is_open = false}) // await archiver_browser.close()
  5151. // archiver_browser.on('disconnected', async () => {browser_is_open = false; await driver_browser.close()})
  5152. // handle any tab navigation to a new URL in the driver browser
  /**
   * Archive a tab of the driver browser after it is opened or navigates to a new URL.
   *
   * Used as the listener for the driver browser's 'targetcreated' and
   * 'targetchanged' events. Only targets of type 'page' that have a URL and
   * resolve to a page object are archived; every other target is ignored.
   *
   * Nothing awaits the promise returned by an event listener, so both the page
   * lookup and the archiving task are wrapped in try/catch: a failure is logged
   * instead of surfacing as an unhandled promise rejection.
   *
   * @param target - the target object emitted with the browser event
   * @returns a promise that resolves once the archive attempt and its cool-down wait are done
   */
  const handleUserNavigation = async (target) => {
    // cheap synchronous checks first, so non-page targets never trigger a page lookup
    if (target.type() !== 'page') return
    const url = target.url()
    if (!url) return

    let page = null
    try {
      page = await target.page()
    } catch(err) {
      console.error('[❌] Could not get page for navigated target!', url)
      console.error(err)
      return
    }
    if (!page) return

    const separator = ANSI.black + '==============================================================================' + ANSI.reset
    console.log(separator)
    console.warn('[➕] DRIVER BROWSER NAVIGATED:', ANSI.blue, url, ANSI.reset)
    try {
      await passiveArchiveTask({browser: driver_browser, page, url})
      await wait(3_000)
    } catch(err) {
      console.error('[❌] Archiving task failed!', url)
      console.error(err)
      await wait(10_000) // base ratelimit, longer after a failure
    }
    console.log(separator)

    // disabled experiment, kept for reference:
    // const client = await target.createCDPSession()
    // await client.send('Page.enable')
    // await client.send('Page.setWebLifecycleState', {state: 'active'})
    // await client.send('Runtime.runIfWaitingForDebugger')
  }
  5174. // setup handler to archive new page whenever one is opened
  5175. driver_browser.on('targetcreated', handleUserNavigation)
  5176. driver_browser.on('targetchanged', handleUserNavigation)
  5177. console.log('------------------------------------------------------')
  5178. console.log('[👀] Waiting for browser tabs to be opened by human...')
  5179. while (browser_is_open) {
  5180. await wait(2_000)
  5181. }
  5182. } else {
  5183. while (true) {
  5184. await wait(2_000)
  5185. }
  5186. }
  5187. await browser.close()
  5188. }
  5189. console.log('[✅] Finished all tasks and stopped browsers.')
  5190. process.exit(0);
  5191. }
  5192. /******************************************************************************/
  // Entry point: only start archiving when import.meta.main is set,
  // i.e. when the runtime flags this module as the program's entry module.
  if (import.meta.main) {
    main(URLS).catch((err) => {
      console.error(err);
      // main() calls process.exit(0) itself when it finishes normally; a rejected
      // run must not keep the default status 0, or callers would see it as a success.
      process.exitCode = 1;
    });
  }
  5196. /******************************************************************************/
  5197. // if we want to handle CLI args in the future, minimist is great:
  5198. // var argv = require('minimist')(process.argv.slice(2));
  5199. // console.log(argv); // --url=https://example.com --binpath=/browsers/chromium-1047/bin/chromium --datadir=/Chromium
  5200. // const {url, binpath, datadir} = argv;
  5201. // OLD CODE, may be useful in the future if we need audio in screenrecordings:
  5202. // async function setupScreenrecordingWithAudio(page, wss) {
  5203. // console.log('[🎬] Setting up screen-recording plugin...');
  5204. // const stream_port = (await wss).options.port;
  5205. // // streamPage = await (page.browser()).newPage()
  5206. // await page.goto(`chrome-extension://jjndjgheafjngoipoacpjgeicjeomjli/options.html#${stream_port}`)
  5207. //
  5208. // // puppeteer-stream recording start
  5209. // streamFile = fs.createWriteStream(SCREENRECORDING_PATH(page))
  5210. // stream = await getStream(page, {
  5211. // audio: true,
  5212. // video: true,
  5213. // bitsPerSecond: 8000000, // 1080p video
  5214. // });
  5215. // stream.pipe(streamFile);
  5216. // return {stream, streamFile}
  5217. //
  5218. // // puppeteer-stream recording stop & cleanup
  5219. // if (stream && streamFile) {
  5220. // await stream?.destroy();
  5221. // streamFile?.close();
  5222. // // await streamPage.close();
  5223. // }
  5224. // }