sphinx.cpp 789 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
74278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477
74778477947804781478247834784478547864787478847894790479147924793479447954796479747984799480048014802480348044805480648074808480948104811481248134814481548164817481848194820482148224823482448254826482748284829483048314832483348344835483648374838483948404841484248434844484548464847484848494850485148524853485448554856485748584859486048614862486348644865486648674868486948704871487248734874487548764877487848794880488148824883488448854886488748884889489048914892489348944895489648974898489949004901490249034904490549064907490849094910491149124913491449154916491749184919492049214922492349244925492649274928492949304931493249334934493549364937493849394940494149424943494449454946494749484949495049514952495349544955495649574958495949604961496249634964496549664967496849694970497149724973497449754976497749784979498049814982498349844985498649874988498949904991499249934994499549964997499849995000500150025003500450055006500750085009501050115012501350145015501650175018501950205021502250235024502550265027502850295030503150325033503450355036503750385039504050415042504350445045504650475048504950505051505250535054505550565057505850595060506150625063506450655066506750685069507050715072507350745075507650775078507950805081508250835084508550865087508850895090509150925093509450955096509750985099510051015102510351045105510651075108510951105111511251135114511551165117511851195120512151225123512451255126512751285129513051315132513351345135513651375138513951405141514251435144514551465147514851495150515151525153515451555156515751585159516051615162516351645165516651675168516951705171517251735174517551765177517851795180518151825183518451855186518751885189519051915192519351945195519651975198519952005201520252035204520552065207520852095210521152125213521452155216521752185219522052215222522352245225522652275228522952305231523252335234523552365237523852395240524152425243524452455246524752485249525052515252525352545255525652575258525952605261526252635264526552665267526852695270527152725273527452755276527
75278527952805281528252835284528552865287528852895290529152925293529452955296529752985299530053015302530353045305530653075308530953105311531253135314531553165317531853195320532153225323532453255326532753285329533053315332533353345335533653375338533953405341534253435344534553465347534853495350535153525353535453555356535753585359536053615362536353645365536653675368536953705371537253735374537553765377537853795380538153825383538453855386538753885389539053915392539353945395539653975398539954005401540254035404540554065407540854095410541154125413541454155416541754185419542054215422542354245425542654275428542954305431543254335434543554365437543854395440544154425443544454455446544754485449545054515452545354545455545654575458545954605461546254635464546554665467546854695470547154725473547454755476547754785479548054815482548354845485548654875488548954905491549254935494549554965497549854995500550155025503550455055506550755085509551055115512551355145515551655175518551955205521552255235524552555265527552855295530553155325533553455355536553755385539554055415542554355445545554655475548554955505551555255535554555555565557555855595560556155625563556455655566556755685569557055715572557355745575557655775578557955805581558255835584558555865587558855895590559155925593559455955596559755985599560056015602560356045605560656075608560956105611561256135614561556165617561856195620562156225623562456255626562756285629563056315632563356345635563656375638563956405641564256435644564556465647564856495650565156525653565456555656565756585659566056615662566356645665566656675668566956705671567256735674567556765677567856795680568156825683568456855686568756885689569056915692569356945695569656975698569957005701570257035704570557065707570857095710571157125713571457155716571757185719572057215722572357245725572657275728572957305731573257335734573557365737573857395740574157425743574457455746574757485749575057515752575357545755575657575758575957605761576257635764576557665767576857695770577157725773577457755776577
75778577957805781578257835784578557865787578857895790579157925793579457955796579757985799580058015802580358045805580658075808580958105811581258135814581558165817581858195820582158225823582458255826582758285829583058315832583358345835583658375838583958405841584258435844584558465847584858495850585158525853585458555856585758585859586058615862586358645865586658675868586958705871587258735874587558765877587858795880588158825883588458855886588758885889589058915892589358945895589658975898589959005901590259035904590559065907590859095910591159125913591459155916591759185919592059215922592359245925592659275928592959305931593259335934593559365937593859395940594159425943594459455946594759485949595059515952595359545955595659575958595959605961596259635964596559665967596859695970597159725973597459755976597759785979598059815982598359845985598659875988598959905991599259935994599559965997599859996000600160026003600460056006600760086009601060116012601360146015601660176018601960206021602260236024602560266027602860296030603160326033603460356036603760386039604060416042604360446045604660476048604960506051605260536054605560566057605860596060606160626063606460656066606760686069607060716072607360746075607660776078607960806081608260836084608560866087608860896090609160926093609460956096609760986099610061016102610361046105610661076108610961106111611261136114611561166117611861196120612161226123612461256126612761286129613061316132613361346135613661376138613961406141614261436144614561466147614861496150615161526153615461556156615761586159616061616162616361646165616661676168616961706171617261736174617561766177617861796180618161826183618461856186618761886189619061916192619361946195619661976198619962006201620262036204620562066207620862096210621162126213621462156216621762186219622062216222622362246225622662276228622962306231623262336234623562366237623862396240624162426243624462456246624762486249625062516252625362546255625662576258625962606261626262636264626562666267626862696270627162726273627462756276627
76278627962806281628262836284628562866287628862896290629162926293629462956296629762986299630063016302630363046305630663076308630963106311631263136314631563166317631863196320632163226323632463256326632763286329633063316332633363346335633663376338633963406341634263436344634563466347634863496350635163526353635463556356635763586359636063616362636363646365636663676368636963706371637263736374637563766377637863796380638163826383638463856386638763886389639063916392639363946395639663976398639964006401640264036404640564066407640864096410641164126413641464156416641764186419642064216422642364246425642664276428642964306431643264336434643564366437643864396440644164426443644464456446644764486449645064516452645364546455645664576458645964606461646264636464646564666467646864696470647164726473647464756476647764786479648064816482648364846485648664876488648964906491649264936494649564966497649864996500650165026503650465056506650765086509651065116512651365146515651665176518651965206521652265236524652565266527652865296530653165326533653465356536653765386539654065416542654365446545654665476548654965506551655265536554655565566557655865596560656165626563656465656566656765686569657065716572657365746575657665776578657965806581658265836584658565866587658865896590659165926593659465956596659765986599660066016602660366046605660666076608660966106611661266136614661566166617661866196620662166226623662466256626662766286629663066316632663366346635663666376638663966406641664266436644664566466647664866496650665166526653665466556656665766586659666066616662666366646665666666676668666966706671667266736674667566766677667866796680668166826683668466856686668766886689669066916692669366946695669666976698669967006701670267036704670567066707670867096710671167126713671467156716671767186719672067216722672367246725672667276728672967306731673267336734673567366737673867396740674167426743674467456746674767486749675067516752675367546755675667576758675967606761676267636764676567666767676867696770677167726773677467756776677
76778677967806781678267836784678567866787678867896790679167926793679467956796679767986799680068016802680368046805680668076808680968106811681268136814681568166817681868196820682168226823682468256826682768286829683068316832683368346835683668376838683968406841684268436844684568466847684868496850685168526853685468556856685768586859686068616862686368646865686668676868686968706871687268736874687568766877687868796880688168826883688468856886688768886889689068916892689368946895689668976898689969006901690269036904690569066907690869096910691169126913691469156916691769186919692069216922692369246925692669276928692969306931693269336934693569366937693869396940694169426943694469456946694769486949695069516952695369546955695669576958695969606961696269636964696569666967696869696970697169726973697469756976697769786979698069816982698369846985698669876988698969906991699269936994699569966997699869997000700170027003700470057006700770087009701070117012701370147015701670177018701970207021702270237024702570267027702870297030703170327033703470357036703770387039704070417042704370447045704670477048704970507051705270537054705570567057705870597060706170627063706470657066706770687069707070717072707370747075707670777078707970807081708270837084708570867087708870897090709170927093709470957096709770987099710071017102710371047105710671077108710971107111711271137114711571167117711871197120712171227123712471257126712771287129713071317132713371347135713671377138713971407141714271437144714571467147714871497150715171527153715471557156715771587159716071617162716371647165716671677168716971707171717271737174717571767177717871797180718171827183718471857186718771887189719071917192719371947195719671977198719972007201720272037204720572067207720872097210721172127213721472157216721772187219722072217222722372247225722672277228722972307231723272337234723572367237723872397240724172427243724472457246724772487249725072517252725372547255725672577258725972607261726272637264726572667267726872697270727172727273727472757276727
77278727972807281728272837284728572867287728872897290729172927293729472957296729772987299730073017302730373047305730673077308730973107311731273137314731573167317731873197320732173227323732473257326732773287329733073317332733373347335733673377338733973407341734273437344734573467347734873497350735173527353735473557356735773587359736073617362736373647365736673677368736973707371737273737374737573767377737873797380738173827383738473857386738773887389739073917392739373947395739673977398739974007401740274037404740574067407740874097410741174127413741474157416741774187419742074217422742374247425742674277428742974307431743274337434743574367437743874397440744174427443744474457446744774487449745074517452745374547455745674577458745974607461746274637464746574667467746874697470747174727473747474757476747774787479748074817482748374847485748674877488748974907491749274937494749574967497749874997500750175027503750475057506750775087509751075117512751375147515751675177518751975207521752275237524752575267527752875297530753175327533753475357536753775387539754075417542754375447545754675477548754975507551755275537554755575567557755875597560756175627563756475657566756775687569757075717572757375747575757675777578757975807581758275837584758575867587758875897590759175927593759475957596759775987599760076017602760376047605760676077608760976107611761276137614761576167617761876197620762176227623762476257626762776287629763076317632763376347635763676377638763976407641764276437644764576467647764876497650765176527653765476557656765776587659766076617662766376647665766676677668766976707671767276737674767576767677767876797680768176827683768476857686768776887689769076917692769376947695769676977698769977007701770277037704770577067707770877097710771177127713771477157716771777187719772077217722772377247725772677277728772977307731773277337734773577367737773877397740774177427743774477457746774777487749775077517752775377547755775677577758775977607761776277637764776577667767776877697770777177727773777477757776777
77778777977807781778277837784778577867787778877897790779177927793779477957796779777987799780078017802780378047805780678077808780978107811781278137814781578167817781878197820782178227823782478257826782778287829783078317832783378347835783678377838783978407841784278437844784578467847784878497850785178527853785478557856785778587859786078617862786378647865786678677868786978707871787278737874787578767877787878797880788178827883788478857886788778887889789078917892789378947895789678977898789979007901790279037904790579067907790879097910791179127913791479157916791779187919792079217922792379247925792679277928792979307931793279337934793579367937793879397940794179427943794479457946794779487949795079517952795379547955795679577958795979607961796279637964796579667967796879697970797179727973797479757976797779787979798079817982798379847985798679877988798979907991799279937994799579967997799879998000800180028003800480058006800780088009801080118012801380148015801680178018801980208021802280238024802580268027802880298030803180328033803480358036803780388039804080418042804380448045804680478048804980508051805280538054805580568057805880598060806180628063806480658066806780688069807080718072807380748075807680778078807980808081808280838084808580868087808880898090809180928093809480958096809780988099810081018102810381048105810681078108810981108111811281138114811581168117811881198120812181228123812481258126812781288129813081318132813381348135813681378138813981408141814281438144814581468147814881498150815181528153815481558156815781588159816081618162816381648165816681678168816981708171817281738174817581768177817881798180818181828183818481858186818781888189819081918192819381948195819681978198819982008201820282038204820582068207820882098210821182128213821482158216821782188219822082218222822382248225822682278228822982308231823282338234823582368237823882398240824182428243824482458246824782488249825082518252825382548255825682578258825982608261826282638264826582668267826882698270827182728273827482758276827
78278827982808281828282838284828582868287828882898290829182928293829482958296829782988299830083018302830383048305830683078308830983108311831283138314831583168317831883198320832183228323832483258326832783288329833083318332833383348335833683378338833983408341834283438344834583468347834883498350835183528353835483558356835783588359836083618362836383648365836683678368836983708371837283738374837583768377837883798380838183828383838483858386838783888389839083918392839383948395839683978398839984008401840284038404840584068407840884098410841184128413841484158416841784188419842084218422842384248425842684278428842984308431843284338434843584368437843884398440844184428443844484458446844784488449845084518452845384548455845684578458845984608461846284638464846584668467846884698470847184728473847484758476847784788479848084818482848384848485848684878488848984908491849284938494849584968497849884998500850185028503850485058506850785088509851085118512851385148515851685178518851985208521852285238524852585268527852885298530853185328533853485358536853785388539854085418542854385448545854685478548854985508551855285538554855585568557855885598560856185628563856485658566856785688569857085718572857385748575857685778578857985808581858285838584858585868587858885898590859185928593859485958596859785988599860086018602860386048605860686078608860986108611861286138614861586168617861886198620862186228623862486258626862786288629863086318632863386348635863686378638863986408641864286438644864586468647864886498650865186528653865486558656865786588659866086618662866386648665866686678668866986708671867286738674867586768677867886798680868186828683868486858686868786888689869086918692869386948695869686978698869987008701870287038704870587068707870887098710871187128713871487158716871787188719872087218722872387248725872687278728872987308731873287338734873587368737873887398740874187428743874487458746874787488749875087518752875387548755875687578758875987608761876287638764876587668767876887698770877187728773877487758776877
78778877987808781878287838784878587868787878887898790879187928793879487958796879787988799880088018802880388048805880688078808880988108811881288138814881588168817881888198820882188228823882488258826882788288829883088318832883388348835883688378838883988408841884288438844884588468847884888498850885188528853885488558856885788588859886088618862886388648865886688678868886988708871887288738874887588768877887888798880888188828883888488858886888788888889889088918892889388948895889688978898889989008901890289038904890589068907890889098910891189128913891489158916891789188919892089218922892389248925892689278928892989308931893289338934893589368937893889398940894189428943894489458946894789488949895089518952895389548955895689578958895989608961896289638964896589668967896889698970897189728973897489758976897789788979898089818982898389848985898689878988898989908991899289938994899589968997899889999000900190029003900490059006900790089009901090119012901390149015901690179018901990209021902290239024902590269027902890299030903190329033903490359036903790389039904090419042904390449045904690479048904990509051905290539054905590569057905890599060906190629063906490659066906790689069907090719072907390749075907690779078907990809081908290839084908590869087908890899090909190929093909490959096909790989099910091019102910391049105910691079108910991109111911291139114911591169117911891199120912191229123912491259126912791289129913091319132913391349135913691379138913991409141914291439144914591469147914891499150915191529153915491559156915791589159916091619162916391649165916691679168916991709171917291739174917591769177917891799180918191829183918491859186918791889189919091919192919391949195919691979198919992009201920292039204920592069207920892099210921192129213921492159216921792189219922092219222922392249225922692279228922992309231923292339234923592369237923892399240924192429243924492459246924792489249925092519252925392549255925692579258925992609261926292639264926592669267926892699270927192729273927492759276927
79278927992809281928292839284928592869287928892899290929192929293929492959296929792989299930093019302930393049305930693079308930993109311931293139314931593169317931893199320932193229323932493259326932793289329933093319332933393349335933693379338933993409341934293439344934593469347934893499350935193529353935493559356935793589359936093619362936393649365936693679368936993709371937293739374937593769377937893799380938193829383938493859386938793889389939093919392939393949395939693979398939994009401940294039404940594069407940894099410941194129413941494159416941794189419942094219422942394249425942694279428942994309431943294339434943594369437943894399440944194429443944494459446944794489449945094519452945394549455945694579458945994609461946294639464946594669467946894699470947194729473947494759476947794789479948094819482948394849485948694879488948994909491949294939494949594969497949894999500950195029503950495059506950795089509951095119512951395149515951695179518951995209521952295239524952595269527952895299530953195329533953495359536953795389539954095419542954395449545954695479548954995509551955295539554955595569557955895599560956195629563956495659566956795689569957095719572957395749575957695779578957995809581958295839584958595869587958895899590959195929593959495959596959795989599960096019602960396049605960696079608960996109611961296139614961596169617961896199620962196229623962496259626962796289629963096319632963396349635963696379638963996409641964296439644964596469647964896499650965196529653965496559656965796589659966096619662966396649665966696679668966996709671967296739674967596769677967896799680968196829683968496859686968796889689969096919692969396949695969696979698969997009701970297039704970597069707970897099710971197129713971497159716971797189719972097219722972397249725972697279728972997309731973297339734973597369737973897399740974197429743974497459746974797489749975097519752975397549755975697579758975997609761976297639764976597669767976897699770977197729773977497759776977
79778977997809781978297839784978597869787978897899790979197929793979497959796979797989799980098019802980398049805980698079808980998109811981298139814981598169817981898199820982198229823982498259826982798289829983098319832983398349835983698379838983998409841984298439844984598469847984898499850985198529853985498559856985798589859986098619862986398649865986698679868986998709871987298739874987598769877987898799880988198829883988498859886988798889889989098919892989398949895989698979898989999009901990299039904990599069907990899099910991199129913991499159916991799189919992099219922992399249925992699279928992999309931993299339934993599369937993899399940994199429943994499459946994799489949995099519952995399549955995699579958995999609961996299639964996599669967996899699970997199729973997499759976997799789979998099819982998399849985998699879988998999909991999299939994999599969997999899991000010001100021000310004100051000610007100081000910010100111001210013100141001510016100171001810019100201002110022100231002410025100261002710028100291003010031100321003310034100351003610037100381003910040100411004210043100441004510046100471004810049100501005110052100531005410055100561005710058100591006010061100621006310064100651006610067100681006910070100711007210073100741007510076100771007810079100801008110082100831008410085100861008710088100891009010091100921009310094100951009610097100981009910100101011010210103101041010510106101071010810109101101011110112101131011410115101161011710118101191012010121101221012310124101251012610127101281012910130101311013210133101341013510136101371013810139101401014110142101431014410145101461014710148101491015010151101521015310154101551015610157101581015910160101611016210163101641016510166101671016810169101701017110172101731017410175101761017710178101791018010181101821018310184101851018610187101881018910190101911019210193101941019510196101971019810199102001020110202102031020410205102061020710208102091021010211102121021310214102151021610217102181021910220102211
02221022310224102251022610227102281022910230102311023210233102341023510236102371023810239102401024110242102431024410245102461024710248102491025010251102521025310254102551025610257102581025910260102611026210263102641026510266102671026810269102701027110272102731027410275102761027710278102791028010281102821028310284102851028610287102881028910290102911029210293102941029510296102971029810299103001030110302103031030410305103061030710308103091031010311103121031310314103151031610317103181031910320103211032210323103241032510326103271032810329103301033110332103331033410335103361033710338103391034010341103421034310344103451034610347103481034910350103511035210353103541035510356103571035810359103601036110362103631036410365103661036710368103691037010371103721037310374103751037610377103781037910380103811038210383103841038510386103871038810389103901039110392103931039410395103961039710398103991040010401104021040310404104051040610407104081040910410104111041210413104141041510416104171041810419104201042110422104231042410425104261042710428104291043010431104321043310434104351043610437104381043910440104411044210443104441044510446104471044810449104501045110452104531045410455104561045710458104591046010461104621046310464104651046610467104681046910470104711047210473104741047510476104771047810479104801048110482104831048410485104861048710488104891049010491104921049310494104951049610497104981049910500105011050210503105041050510506105071050810509105101051110512105131051410515105161051710518105191052010521105221052310524105251052610527105281052910530105311053210533105341053510536105371053810539105401054110542105431054410545105461054710548105491055010551105521055310554105551055610557105581055910560105611056210563105641056510566105671056810569105701057110572105731057410575105761057710578105791058010581105821058310584105851058610587105881058910590105911059210593105941059510596105971059810599106001060110602106031060410605106061060710608106091061010611106121061310614106151061610617106181061910620106211
06221062310624106251062610627106281062910630106311063210633106341063510636106371063810639106401064110642106431064410645106461064710648106491065010651106521065310654106551065610657106581065910660106611066210663106641066510666106671066810669106701067110672106731067410675106761067710678106791068010681106821068310684106851068610687106881068910690106911069210693106941069510696106971069810699107001070110702107031070410705107061070710708107091071010711107121071310714107151071610717107181071910720107211072210723107241072510726107271072810729107301073110732107331073410735107361073710738107391074010741107421074310744107451074610747107481074910750107511075210753107541075510756107571075810759107601076110762107631076410765107661076710768107691077010771107721077310774107751077610777107781077910780107811078210783107841078510786107871078810789107901079110792107931079410795107961079710798107991080010801108021080310804108051080610807108081080910810108111081210813108141081510816108171081810819108201082110822108231082410825108261082710828108291083010831108321083310834108351083610837108381083910840108411084210843108441084510846108471084810849108501085110852108531085410855108561085710858108591086010861108621086310864108651086610867108681086910870108711087210873108741087510876108771087810879108801088110882108831088410885108861088710888108891089010891108921089310894108951089610897108981089910900109011090210903109041090510906109071090810909109101091110912109131091410915109161091710918109191092010921109221092310924109251092610927109281092910930109311093210933109341093510936109371093810939109401094110942109431094410945109461094710948109491095010951109521095310954109551095610957109581095910960109611096210963109641096510966109671096810969109701097110972109731097410975109761097710978109791098010981109821098310984109851098610987109881098910990109911099210993109941099510996109971099810999110001100111002110031100411005110061100711008110091101011011110121101311014110151101611017110181101911020110211
10221102311024110251102611027110281102911030110311103211033110341103511036110371103811039110401104111042110431104411045110461104711048110491105011051110521105311054110551105611057110581105911060110611106211063110641106511066110671106811069110701107111072110731107411075110761107711078110791108011081110821108311084110851108611087110881108911090110911109211093110941109511096110971109811099111001110111102111031110411105111061110711108111091111011111111121111311114111151111611117111181111911120111211112211123111241112511126111271112811129111301113111132111331113411135111361113711138111391114011141111421114311144111451114611147111481114911150111511115211153111541115511156111571115811159111601116111162111631116411165111661116711168111691117011171111721117311174111751117611177111781117911180111811118211183111841118511186111871118811189111901119111192111931119411195111961119711198111991120011201112021120311204112051120611207112081120911210112111121211213112141121511216112171121811219112201122111222112231122411225112261122711228112291123011231112321123311234112351123611237112381123911240112411124211243112441124511246112471124811249112501125111252112531125411255112561125711258112591126011261112621126311264112651126611267112681126911270112711127211273112741127511276112771127811279112801128111282112831128411285112861128711288112891129011291112921129311294112951129611297112981129911300113011130211303113041130511306113071130811309113101131111312113131131411315113161131711318113191132011321113221132311324113251132611327113281132911330113311133211333113341133511336113371133811339113401134111342113431134411345113461134711348113491135011351113521135311354113551135611357113581135911360113611136211363113641136511366113671136811369113701137111372113731137411375113761137711378113791138011381113821138311384113851138611387113881138911390113911139211393113941139511396113971139811399114001140111402114031140411405114061140711408114091141011411114121141311414114151141611417114181141911420114211
14221142311424114251142611427114281142911430114311143211433114341143511436114371143811439114401144111442114431144411445114461144711448114491145011451114521145311454114551145611457114581145911460114611146211463114641146511466114671146811469114701147111472114731147411475114761147711478114791148011481114821148311484114851148611487114881148911490114911149211493114941149511496114971149811499115001150111502115031150411505115061150711508115091151011511115121151311514115151151611517115181151911520115211152211523115241152511526115271152811529115301153111532115331153411535115361153711538115391154011541115421154311544115451154611547115481154911550115511155211553115541155511556115571155811559115601156111562115631156411565115661156711568115691157011571115721157311574115751157611577115781157911580115811158211583115841158511586115871158811589115901159111592115931159411595115961159711598115991160011601116021160311604116051160611607116081160911610116111161211613116141161511616116171161811619116201162111622116231162411625116261162711628116291163011631116321163311634116351163611637116381163911640116411164211643116441164511646116471164811649116501165111652116531165411655116561165711658116591166011661116621166311664116651166611667116681166911670116711167211673116741167511676116771167811679116801168111682116831168411685116861168711688116891169011691116921169311694116951169611697116981169911700117011170211703117041170511706117071170811709117101171111712117131171411715117161171711718117191172011721117221172311724117251172611727117281172911730117311173211733117341173511736117371173811739117401174111742117431174411745117461174711748117491175011751117521175311754117551175611757117581175911760117611176211763117641176511766117671176811769117701177111772117731177411775117761177711778117791178011781117821178311784117851178611787117881178911790117911179211793117941179511796117971179811799118001180111802118031180411805118061180711808118091181011811118121181311814118151181611817118181181911820118211
18221182311824118251182611827118281182911830118311183211833118341183511836118371183811839118401184111842118431184411845118461184711848118491185011851118521185311854118551185611857118581185911860118611186211863118641186511866118671186811869118701187111872118731187411875118761187711878118791188011881118821188311884118851188611887118881188911890118911189211893118941189511896118971189811899119001190111902119031190411905119061190711908119091191011911119121191311914119151191611917119181191911920119211192211923119241192511926119271192811929119301193111932119331193411935119361193711938119391194011941119421194311944119451194611947119481194911950119511195211953119541195511956119571195811959119601196111962119631196411965119661196711968119691197011971119721197311974119751197611977119781197911980119811198211983119841198511986119871198811989119901199111992119931199411995119961199711998119991200012001120021200312004120051200612007120081200912010120111201212013120141201512016120171201812019120201202112022120231202412025120261202712028120291203012031120321203312034120351203612037120381203912040120411204212043120441204512046120471204812049120501205112052120531205412055120561205712058120591206012061120621206312064120651206612067120681206912070120711207212073120741207512076120771207812079120801208112082120831208412085120861208712088120891209012091120921209312094120951209612097120981209912100121011210212103121041210512106121071210812109121101211112112121131211412115121161211712118121191212012121121221212312124121251212612127121281212912130121311213212133121341213512136121371213812139121401214112142121431214412145121461214712148121491215012151121521215312154121551215612157121581215912160121611216212163121641216512166121671216812169121701217112172121731217412175121761217712178121791218012181121821218312184121851218612187121881218912190121911219212193121941219512196121971219812199122001220112202122031220412205122061220712208122091221012211122121221312214122151221612217122181221912220122211
22221222312224122251222612227122281222912230122311223212233122341223512236122371223812239122401224112242122431224412245122461224712248122491225012251122521225312254122551225612257122581225912260122611226212263122641226512266122671226812269122701227112272122731227412275122761227712278122791228012281122821228312284122851228612287122881228912290122911229212293122941229512296122971229812299123001230112302123031230412305123061230712308123091231012311123121231312314123151231612317123181231912320123211232212323123241232512326123271232812329123301233112332123331233412335123361233712338123391234012341123421234312344123451234612347123481234912350123511235212353123541235512356123571235812359123601236112362123631236412365123661236712368123691237012371123721237312374123751237612377123781237912380123811238212383123841238512386123871238812389123901239112392123931239412395123961239712398123991240012401124021240312404124051240612407124081240912410124111241212413124141241512416124171241812419124201242112422124231242412425124261242712428124291243012431124321243312434124351243612437124381243912440124411244212443124441244512446124471244812449124501245112452124531245412455124561245712458124591246012461124621246312464124651246612467124681246912470124711247212473124741247512476124771247812479124801248112482124831248412485124861248712488124891249012491124921249312494124951249612497124981249912500125011250212503125041250512506125071250812509125101251112512125131251412515125161251712518125191252012521125221252312524125251252612527125281252912530125311253212533125341253512536125371253812539125401254112542125431254412545125461254712548125491255012551125521255312554125551255612557125581255912560125611256212563125641256512566125671256812569125701257112572125731257412575125761257712578125791258012581125821258312584125851258612587125881258912590125911259212593125941259512596125971259812599126001260112602126031260412605126061260712608126091261012611126121261312614126151261612617126181261912620126211
26221262312624126251262612627126281262912630126311263212633126341263512636126371263812639126401264112642126431264412645126461264712648126491265012651126521265312654126551265612657126581265912660126611266212663126641266512666126671266812669126701267112672126731267412675126761267712678126791268012681126821268312684126851268612687126881268912690126911269212693126941269512696126971269812699127001270112702127031270412705127061270712708127091271012711127121271312714127151271612717127181271912720127211272212723127241272512726127271272812729127301273112732127331273412735127361273712738127391274012741127421274312744127451274612747127481274912750127511275212753127541275512756127571275812759127601276112762127631276412765127661276712768127691277012771127721277312774127751277612777127781277912780127811278212783127841278512786127871278812789127901279112792127931279412795127961279712798127991280012801128021280312804128051280612807128081280912810128111281212813128141281512816128171281812819128201282112822128231282412825128261282712828128291283012831128321283312834128351283612837128381283912840128411284212843128441284512846128471284812849128501285112852128531285412855128561285712858128591286012861128621286312864128651286612867128681286912870128711287212873128741287512876128771287812879128801288112882128831288412885128861288712888128891289012891128921289312894128951289612897128981289912900129011290212903129041290512906129071290812909129101291112912129131291412915129161291712918129191292012921129221292312924129251292612927129281292912930129311293212933129341293512936129371293812939129401294112942129431294412945129461294712948129491295012951129521295312954129551295612957129581295912960129611296212963129641296512966129671296812969129701297112972129731297412975129761297712978129791298012981129821298312984129851298612987129881298912990129911299212993129941299512996129971299812999130001300113002130031300413005130061300713008130091301013011130121301313014130151301613017130181301913020130211
30221302313024130251302613027130281302913030130311303213033130341303513036130371303813039130401304113042130431304413045130461304713048130491305013051130521305313054130551305613057130581305913060130611306213063130641306513066130671306813069130701307113072130731307413075130761307713078130791308013081130821308313084130851308613087130881308913090130911309213093130941309513096130971309813099131001310113102131031310413105131061310713108131091311013111131121311313114131151311613117131181311913120131211312213123131241312513126131271312813129131301313113132131331313413135131361313713138131391314013141131421314313144131451314613147131481314913150131511315213153131541315513156131571315813159131601316113162131631316413165131661316713168131691317013171131721317313174131751317613177131781317913180131811318213183131841318513186131871318813189131901319113192131931319413195131961319713198131991320013201132021320313204132051320613207132081320913210132111321213213132141321513216132171321813219132201322113222132231322413225132261322713228132291323013231132321323313234132351323613237132381323913240132411324213243132441324513246132471324813249132501325113252132531325413255132561325713258132591326013261132621326313264132651326613267132681326913270132711327213273132741327513276132771327813279132801328113282132831328413285132861328713288132891329013291132921329313294132951329613297132981329913300133011330213303133041330513306133071330813309133101331113312133131331413315133161331713318133191332013321133221332313324133251332613327133281332913330133311333213333133341333513336133371333813339133401334113342133431334413345133461334713348133491335013351133521335313354133551335613357133581335913360133611336213363133641336513366133671336813369133701337113372133731337413375133761337713378133791338013381133821338313384133851338613387133881338913390133911339213393133941339513396133971339813399134001340113402134031340413405134061340713408134091341013411134121341313414134151341613417134181341913420134211
34221342313424134251342613427134281342913430134311343213433134341343513436134371343813439134401344113442134431344413445134461344713448134491345013451134521345313454134551345613457134581345913460134611346213463134641346513466134671346813469134701347113472134731347413475134761347713478134791348013481134821348313484134851348613487134881348913490134911349213493134941349513496134971349813499135001350113502135031350413505135061350713508135091351013511135121351313514135151351613517135181351913520135211352213523135241352513526135271352813529135301353113532135331353413535135361353713538135391354013541135421354313544135451354613547135481354913550135511355213553135541355513556135571355813559135601356113562135631356413565135661356713568135691357013571135721357313574135751357613577135781357913580135811358213583135841358513586135871358813589135901359113592135931359413595135961359713598135991360013601136021360313604136051360613607136081360913610136111361213613136141361513616136171361813619136201362113622136231362413625136261362713628136291363013631136321363313634136351363613637136381363913640136411364213643136441364513646136471364813649136501365113652136531365413655136561365713658136591366013661136621366313664136651366613667136681366913670136711367213673136741367513676136771367813679136801368113682136831368413685136861368713688136891369013691136921369313694136951369613697136981369913700137011370213703137041370513706137071370813709137101371113712137131371413715137161371713718137191372013721137221372313724137251372613727137281372913730137311373213733137341373513736137371373813739137401374113742137431374413745137461374713748137491375013751137521375313754137551375613757137581375913760137611376213763137641376513766137671376813769137701377113772137731377413775137761377713778137791378013781137821378313784137851378613787137881378913790137911379213793137941379513796137971379813799138001380113802138031380413805138061380713808138091381013811138121381313814138151381613817138181381913820138211
38221382313824138251382613827138281382913830138311383213833138341383513836138371383813839138401384113842138431384413845138461384713848138491385013851138521385313854138551385613857138581385913860138611386213863138641386513866138671386813869138701387113872138731387413875138761387713878138791388013881138821388313884138851388613887138881388913890138911389213893138941389513896138971389813899139001390113902139031390413905139061390713908139091391013911139121391313914139151391613917139181391913920139211392213923139241392513926139271392813929139301393113932139331393413935139361393713938139391394013941139421394313944139451394613947139481394913950139511395213953139541395513956139571395813959139601396113962139631396413965139661396713968139691397013971139721397313974139751397613977139781397913980139811398213983139841398513986139871398813989139901399113992139931399413995139961399713998139991400014001140021400314004140051400614007140081400914010140111401214013140141401514016140171401814019140201402114022140231402414025140261402714028140291403014031140321403314034140351403614037140381403914040140411404214043140441404514046140471404814049140501405114052140531405414055140561405714058140591406014061140621406314064140651406614067140681406914070140711407214073140741407514076140771407814079140801408114082140831408414085140861408714088140891409014091140921409314094140951409614097140981409914100141011410214103141041410514106141071410814109141101411114112141131411414115141161411714118141191412014121141221412314124141251412614127141281412914130141311413214133141341413514136141371413814139141401414114142141431414414145141461414714148141491415014151141521415314154141551415614157141581415914160141611416214163141641416514166141671416814169141701417114172141731417414175141761417714178141791418014181141821418314184141851418614187141881418914190141911419214193141941419514196141971419814199142001420114202142031420414205142061420714208142091421014211142121421314214142151421614217142181421914220142211
42221422314224142251422614227142281422914230142311423214233142341423514236142371423814239142401424114242142431424414245142461424714248142491425014251142521425314254142551425614257142581425914260142611426214263142641426514266142671426814269142701427114272142731427414275142761427714278142791428014281142821428314284142851428614287142881428914290142911429214293142941429514296142971429814299143001430114302143031430414305143061430714308143091431014311143121431314314143151431614317143181431914320143211432214323143241432514326143271432814329143301433114332143331433414335143361433714338143391434014341143421434314344143451434614347143481434914350143511435214353143541435514356143571435814359143601436114362143631436414365143661436714368143691437014371143721437314374143751437614377143781437914380143811438214383143841438514386143871438814389143901439114392143931439414395143961439714398143991440014401144021440314404144051440614407144081440914410144111441214413144141441514416144171441814419144201442114422144231442414425144261442714428144291443014431144321443314434144351443614437144381443914440144411444214443144441444514446144471444814449144501445114452144531445414455144561445714458144591446014461144621446314464144651446614467144681446914470144711447214473144741447514476144771447814479144801448114482144831448414485144861448714488144891449014491144921449314494144951449614497144981449914500145011450214503145041450514506145071450814509145101451114512145131451414515145161451714518145191452014521145221452314524145251452614527145281452914530145311453214533145341453514536145371453814539145401454114542145431454414545145461454714548145491455014551145521455314554145551455614557145581455914560145611456214563145641456514566145671456814569145701457114572145731457414575145761457714578145791458014581145821458314584145851458614587145881458914590145911459214593145941459514596145971459814599146001460114602146031460414605146061460714608146091461014611146121461314614146151461614617146181461914620146211
46221462314624146251462614627146281462914630146311463214633146341463514636146371463814639146401464114642146431464414645146461464714648146491465014651146521465314654146551465614657146581465914660146611466214663146641466514666146671466814669146701467114672146731467414675146761467714678146791468014681146821468314684146851468614687146881468914690146911469214693146941469514696146971469814699147001470114702147031470414705147061470714708147091471014711147121471314714147151471614717147181471914720147211472214723147241472514726147271472814729147301473114732147331473414735147361473714738147391474014741147421474314744147451474614747147481474914750147511475214753147541475514756147571475814759147601476114762147631476414765147661476714768147691477014771147721477314774147751477614777147781477914780147811478214783147841478514786147871478814789147901479114792147931479414795147961479714798147991480014801148021480314804148051480614807148081480914810148111481214813148141481514816148171481814819148201482114822148231482414825148261482714828148291483014831148321483314834148351483614837148381483914840148411484214843148441484514846148471484814849148501485114852148531485414855148561485714858148591486014861148621486314864148651486614867148681486914870148711487214873148741487514876148771487814879148801488114882148831488414885148861488714888148891489014891148921489314894148951489614897148981489914900149011490214903149041490514906149071490814909149101491114912149131491414915149161491714918149191492014921149221492314924149251492614927149281492914930149311493214933149341493514936149371493814939149401494114942149431494414945149461494714948149491495014951149521495314954149551495614957149581495914960149611496214963149641496514966149671496814969149701497114972149731497414975149761497714978149791498014981149821498314984149851498614987149881498914990149911499214993149941499514996149971499814999150001500115002150031500415005150061500715008150091501015011150121501315014150151501615017150181501915020150211
50221502315024150251502615027150281502915030150311503215033150341503515036150371503815039150401504115042150431504415045150461504715048150491505015051150521505315054150551505615057150581505915060150611506215063150641506515066150671506815069150701507115072150731507415075150761507715078150791508015081150821508315084150851508615087150881508915090150911509215093150941509515096150971509815099151001510115102151031510415105151061510715108151091511015111151121511315114151151511615117151181511915120151211512215123151241512515126151271512815129151301513115132151331513415135151361513715138151391514015141151421514315144151451514615147151481514915150151511515215153151541515515156151571515815159151601516115162151631516415165151661516715168151691517015171151721517315174151751517615177151781517915180151811518215183151841518515186151871518815189151901519115192151931519415195151961519715198151991520015201152021520315204152051520615207152081520915210152111521215213152141521515216152171521815219152201522115222152231522415225152261522715228152291523015231152321523315234152351523615237152381523915240152411524215243152441524515246152471524815249152501525115252152531525415255152561525715258152591526015261152621526315264152651526615267152681526915270152711527215273152741527515276152771527815279152801528115282152831528415285152861528715288152891529015291152921529315294152951529615297152981529915300153011530215303153041530515306153071530815309153101531115312153131531415315153161531715318153191532015321153221532315324153251532615327153281532915330153311533215333153341533515336153371533815339153401534115342153431534415345153461534715348153491535015351153521535315354153551535615357153581535915360153611536215363153641536515366153671536815369153701537115372153731537415375153761537715378153791538015381153821538315384153851538615387153881538915390153911539215393153941539515396153971539815399154001540115402154031540415405154061540715408154091541015411154121541315414154151541615417154181541915420154211
54221542315424154251542615427154281542915430154311543215433154341543515436154371543815439154401544115442154431544415445154461544715448154491545015451154521545315454154551545615457154581545915460154611546215463154641546515466154671546815469154701547115472154731547415475154761547715478154791548015481154821548315484154851548615487154881548915490154911549215493154941549515496154971549815499155001550115502155031550415505155061550715508155091551015511155121551315514155151551615517155181551915520155211552215523155241552515526155271552815529155301553115532155331553415535155361553715538155391554015541155421554315544155451554615547155481554915550155511555215553155541555515556155571555815559155601556115562155631556415565155661556715568155691557015571155721557315574155751557615577155781557915580155811558215583155841558515586155871558815589155901559115592155931559415595155961559715598155991560015601156021560315604156051560615607156081560915610156111561215613156141561515616156171561815619156201562115622156231562415625156261562715628156291563015631156321563315634156351563615637156381563915640156411564215643156441564515646156471564815649156501565115652156531565415655156561565715658156591566015661156621566315664156651566615667156681566915670156711567215673156741567515676156771567815679156801568115682156831568415685156861568715688156891569015691156921569315694156951569615697156981569915700157011570215703157041570515706157071570815709157101571115712157131571415715157161571715718157191572015721157221572315724157251572615727157281572915730157311573215733157341573515736157371573815739157401574115742157431574415745157461574715748157491575015751157521575315754157551575615757157581575915760157611576215763157641576515766157671576815769157701577115772157731577415775157761577715778157791578015781157821578315784157851578615787157881578915790157911579215793157941579515796157971579815799158001580115802158031580415805158061580715808158091581015811158121581315814158151581615817158181581915820158211
58221582315824158251582615827158281582915830158311583215833158341583515836158371583815839158401584115842158431584415845158461584715848158491585015851158521585315854158551585615857158581585915860158611586215863158641586515866158671586815869158701587115872158731587415875158761587715878158791588015881158821588315884158851588615887158881588915890158911589215893158941589515896158971589815899159001590115902159031590415905159061590715908159091591015911159121591315914159151591615917159181591915920159211592215923159241592515926159271592815929159301593115932159331593415935159361593715938159391594015941159421594315944159451594615947159481594915950159511595215953159541595515956159571595815959159601596115962159631596415965159661596715968159691597015971159721597315974159751597615977159781597915980159811598215983159841598515986159871598815989159901599115992159931599415995159961599715998159991600016001160021600316004160051600616007160081600916010160111601216013160141601516016160171601816019160201602116022160231602416025160261602716028160291603016031160321603316034160351603616037160381603916040160411604216043160441604516046160471604816049160501605116052160531605416055160561605716058160591606016061160621606316064160651606616067160681606916070160711607216073160741607516076160771607816079160801608116082160831608416085160861608716088160891609016091160921609316094160951609616097160981609916100161011610216103161041610516106161071610816109161101611116112161131611416115161161611716118161191612016121161221612316124161251612616127161281612916130161311613216133161341613516136161371613816139161401614116142161431614416145161461614716148161491615016151161521615316154161551615616157161581615916160161611616216163161641616516166161671616816169161701617116172161731617416175161761617716178161791618016181161821618316184161851618616187161881618916190161911619216193161941619516196161971619816199162001620116202162031620416205162061620716208162091621016211162121621316214162151621616217162181621916220162211
62221622316224162251622616227162281622916230162311623216233162341623516236162371623816239162401624116242162431624416245162461624716248162491625016251162521625316254162551625616257162581625916260162611626216263162641626516266162671626816269162701627116272162731627416275162761627716278162791628016281162821628316284162851628616287162881628916290162911629216293162941629516296162971629816299163001630116302163031630416305163061630716308163091631016311163121631316314163151631616317163181631916320163211632216323163241632516326163271632816329163301633116332163331633416335163361633716338163391634016341163421634316344163451634616347163481634916350163511635216353163541635516356163571635816359163601636116362163631636416365163661636716368163691637016371163721637316374163751637616377163781637916380163811638216383163841638516386163871638816389163901639116392163931639416395163961639716398163991640016401164021640316404164051640616407164081640916410164111641216413164141641516416164171641816419164201642116422164231642416425164261642716428164291643016431164321643316434164351643616437164381643916440164411644216443164441644516446164471644816449164501645116452164531645416455164561645716458164591646016461164621646316464164651646616467164681646916470164711647216473164741647516476164771647816479164801648116482164831648416485164861648716488164891649016491164921649316494164951649616497164981649916500165011650216503165041650516506165071650816509165101651116512165131651416515165161651716518165191652016521165221652316524165251652616527165281652916530165311653216533165341653516536165371653816539165401654116542165431654416545165461654716548165491655016551165521655316554165551655616557165581655916560165611656216563165641656516566165671656816569165701657116572165731657416575165761657716578165791658016581165821658316584165851658616587165881658916590165911659216593165941659516596165971659816599166001660116602166031660416605166061660716608166091661016611166121661316614166151661616617166181661916620166211
66221662316624166251662616627166281662916630166311663216633166341663516636166371663816639166401664116642166431664416645166461664716648166491665016651166521665316654166551665616657166581665916660166611666216663166641666516666166671666816669166701667116672166731667416675166761667716678166791668016681166821668316684166851668616687166881668916690166911669216693166941669516696166971669816699167001670116702167031670416705167061670716708167091671016711167121671316714167151671616717167181671916720167211672216723167241672516726167271672816729167301673116732167331673416735167361673716738167391674016741167421674316744167451674616747167481674916750167511675216753167541675516756167571675816759167601676116762167631676416765167661676716768167691677016771167721677316774167751677616777167781677916780167811678216783167841678516786167871678816789167901679116792167931679416795167961679716798167991680016801168021680316804168051680616807168081680916810168111681216813168141681516816168171681816819168201682116822168231682416825168261682716828168291683016831168321683316834168351683616837168381683916840168411684216843168441684516846168471684816849168501685116852168531685416855168561685716858168591686016861168621686316864168651686616867168681686916870168711687216873168741687516876168771687816879168801688116882168831688416885168861688716888168891689016891168921689316894168951689616897168981689916900169011690216903169041690516906169071690816909169101691116912169131691416915169161691716918169191692016921169221692316924169251692616927169281692916930169311693216933169341693516936169371693816939169401694116942169431694416945169461694716948169491695016951169521695316954169551695616957169581695916960169611696216963169641696516966169671696816969169701697116972169731697416975169761697716978169791698016981169821698316984169851698616987169881698916990169911699216993169941699516996169971699816999170001700117002170031700417005170061700717008170091701017011170121701317014170151701617017170181701917020170211
70221702317024170251702617027170281702917030170311703217033170341703517036170371703817039170401704117042170431704417045170461704717048170491705017051170521705317054170551705617057170581705917060170611706217063170641706517066170671706817069170701707117072170731707417075170761707717078170791708017081170821708317084170851708617087170881708917090170911709217093170941709517096170971709817099171001710117102171031710417105171061710717108171091711017111171121711317114171151711617117171181711917120171211712217123171241712517126171271712817129171301713117132171331713417135171361713717138171391714017141171421714317144171451714617147171481714917150171511715217153171541715517156171571715817159171601716117162171631716417165171661716717168171691717017171171721717317174171751717617177171781717917180171811718217183171841718517186171871718817189171901719117192171931719417195171961719717198171991720017201172021720317204172051720617207172081720917210172111721217213172141721517216172171721817219172201722117222172231722417225172261722717228172291723017231172321723317234172351723617237172381723917240172411724217243172441724517246172471724817249172501725117252172531725417255172561725717258172591726017261172621726317264172651726617267172681726917270172711727217273172741727517276172771727817279172801728117282172831728417285172861728717288172891729017291172921729317294172951729617297172981729917300173011730217303173041730517306173071730817309173101731117312173131731417315173161731717318173191732017321173221732317324173251732617327173281732917330173311733217333173341733517336173371733817339173401734117342173431734417345173461734717348173491735017351173521735317354173551735617357173581735917360173611736217363173641736517366173671736817369173701737117372173731737417375173761737717378173791738017381173821738317384173851738617387173881738917390173911739217393173941739517396173971739817399174001740117402174031740417405174061740717408174091741017411174121741317414174151741617417174181741917420174211
74221742317424174251742617427174281742917430174311743217433174341743517436174371743817439174401744117442174431744417445174461744717448174491745017451174521745317454174551745617457174581745917460174611746217463174641746517466174671746817469174701747117472174731747417475174761747717478174791748017481174821748317484174851748617487174881748917490174911749217493174941749517496174971749817499175001750117502175031750417505175061750717508175091751017511175121751317514175151751617517175181751917520175211752217523175241752517526175271752817529175301753117532175331753417535175361753717538175391754017541175421754317544175451754617547175481754917550175511755217553175541755517556175571755817559175601756117562175631756417565175661756717568175691757017571175721757317574175751757617577175781757917580175811758217583175841758517586175871758817589175901759117592175931759417595175961759717598175991760017601176021760317604176051760617607176081760917610176111761217613176141761517616176171761817619176201762117622176231762417625176261762717628176291763017631176321763317634176351763617637176381763917640176411764217643176441764517646176471764817649176501765117652176531765417655176561765717658176591766017661176621766317664176651766617667176681766917670176711767217673176741767517676176771767817679176801768117682176831768417685176861768717688176891769017691176921769317694176951769617697176981769917700177011770217703177041770517706177071770817709177101771117712177131771417715177161771717718177191772017721177221772317724177251772617727177281772917730177311773217733177341773517736177371773817739177401774117742177431774417745177461774717748177491775017751177521775317754177551775617757177581775917760177611776217763177641776517766177671776817769177701777117772177731777417775177761777717778177791778017781177821778317784177851778617787177881778917790177911779217793177941779517796177971779817799178001780117802178031780417805178061780717808178091781017811178121781317814178151781617817178181781917820178211
78221782317824178251782617827178281782917830178311783217833178341783517836178371783817839178401784117842178431784417845178461784717848178491785017851178521785317854178551785617857178581785917860178611786217863178641786517866178671786817869178701787117872178731787417875178761787717878178791788017881178821788317884178851788617887178881788917890178911789217893178941789517896178971789817899179001790117902179031790417905179061790717908179091791017911179121791317914179151791617917179181791917920179211792217923179241792517926179271792817929179301793117932179331793417935179361793717938179391794017941179421794317944179451794617947179481794917950179511795217953179541795517956179571795817959179601796117962179631796417965179661796717968179691797017971179721797317974179751797617977179781797917980179811798217983179841798517986179871798817989179901799117992179931799417995179961799717998179991800018001180021800318004180051800618007180081800918010180111801218013180141801518016180171801818019180201802118022180231802418025180261802718028180291803018031180321803318034180351803618037180381803918040180411804218043180441804518046180471804818049180501805118052180531805418055180561805718058180591806018061180621806318064180651806618067180681806918070180711807218073180741807518076180771807818079180801808118082180831808418085180861808718088180891809018091180921809318094180951809618097180981809918100181011810218103181041810518106181071810818109181101811118112181131811418115181161811718118181191812018121181221812318124181251812618127181281812918130181311813218133181341813518136181371813818139181401814118142181431814418145181461814718148181491815018151181521815318154181551815618157181581815918160181611816218163181641816518166181671816818169181701817118172181731817418175181761817718178181791818018181181821818318184181851818618187181881818918190181911819218193181941819518196181971819818199182001820118202182031820418205182061820718208182091821018211182121821318214182151821618217182181821918220182211
82221822318224182251822618227182281822918230182311823218233182341823518236182371823818239182401824118242182431824418245182461824718248182491825018251182521825318254182551825618257182581825918260182611826218263182641826518266182671826818269182701827118272182731827418275182761827718278182791828018281182821828318284182851828618287182881828918290182911829218293182941829518296182971829818299183001830118302183031830418305183061830718308183091831018311183121831318314183151831618317183181831918320183211832218323183241832518326183271832818329183301833118332183331833418335183361833718338183391834018341183421834318344183451834618347183481834918350183511835218353183541835518356183571835818359183601836118362183631836418365183661836718368183691837018371183721837318374183751837618377183781837918380183811838218383183841838518386183871838818389183901839118392183931839418395183961839718398183991840018401184021840318404184051840618407184081840918410184111841218413184141841518416184171841818419184201842118422184231842418425184261842718428184291843018431184321843318434184351843618437184381843918440184411844218443184441844518446184471844818449184501845118452184531845418455184561845718458184591846018461184621846318464184651846618467184681846918470184711847218473184741847518476184771847818479184801848118482184831848418485184861848718488184891849018491184921849318494184951849618497184981849918500185011850218503185041850518506185071850818509185101851118512185131851418515185161851718518185191852018521185221852318524185251852618527185281852918530185311853218533185341853518536185371853818539185401854118542185431854418545185461854718548185491855018551185521855318554185551855618557185581855918560185611856218563185641856518566185671856818569185701857118572185731857418575185761857718578185791858018581185821858318584185851858618587185881858918590185911859218593185941859518596185971859818599186001860118602186031860418605186061860718608186091861018611186121861318614186151861618617186181861918620186211
86221862318624186251862618627186281862918630186311863218633186341863518636186371863818639186401864118642186431864418645186461864718648186491865018651186521865318654186551865618657186581865918660186611866218663186641866518666186671866818669186701867118672186731867418675186761867718678186791868018681186821868318684186851868618687186881868918690186911869218693186941869518696186971869818699187001870118702187031870418705187061870718708187091871018711187121871318714187151871618717187181871918720187211872218723187241872518726187271872818729187301873118732187331873418735187361873718738187391874018741187421874318744187451874618747187481874918750187511875218753187541875518756187571875818759187601876118762187631876418765187661876718768187691877018771187721877318774187751877618777187781877918780187811878218783187841878518786187871878818789187901879118792187931879418795187961879718798187991880018801188021880318804188051880618807188081880918810188111881218813188141881518816188171881818819188201882118822188231882418825188261882718828188291883018831188321883318834188351883618837188381883918840188411884218843188441884518846188471884818849188501885118852188531885418855188561885718858188591886018861188621886318864188651886618867188681886918870188711887218873188741887518876188771887818879188801888118882188831888418885188861888718888188891889018891188921889318894188951889618897188981889918900189011890218903189041890518906189071890818909189101891118912189131891418915189161891718918189191892018921189221892318924189251892618927189281892918930189311893218933189341893518936189371893818939189401894118942189431894418945189461894718948189491895018951189521895318954189551895618957189581895918960189611896218963189641896518966189671896818969189701897118972189731897418975189761897718978189791898018981189821898318984189851898618987189881898918990189911899218993189941899518996189971899818999190001900119002190031900419005190061900719008190091901019011190121901319014190151901619017190181901919020190211
90221902319024190251902619027190281902919030190311903219033190341903519036190371903819039190401904119042190431904419045190461904719048190491905019051190521905319054190551905619057190581905919060190611906219063190641906519066190671906819069190701907119072190731907419075190761907719078190791908019081190821908319084190851908619087190881908919090190911909219093190941909519096190971909819099191001910119102191031910419105191061910719108191091911019111191121911319114191151911619117191181911919120191211912219123191241912519126191271912819129191301913119132191331913419135191361913719138191391914019141191421914319144191451914619147191481914919150191511915219153191541915519156191571915819159191601916119162191631916419165191661916719168191691917019171191721917319174191751917619177191781917919180191811918219183191841918519186191871918819189191901919119192191931919419195191961919719198191991920019201192021920319204192051920619207192081920919210192111921219213192141921519216192171921819219192201922119222192231922419225192261922719228192291923019231192321923319234192351923619237192381923919240192411924219243192441924519246192471924819249192501925119252192531925419255192561925719258192591926019261192621926319264192651926619267192681926919270192711927219273192741927519276192771927819279192801928119282192831928419285192861928719288192891929019291192921929319294192951929619297192981929919300193011930219303193041930519306193071930819309193101931119312193131931419315193161931719318193191932019321193221932319324193251932619327193281932919330193311933219333193341933519336193371933819339193401934119342193431934419345193461934719348193491935019351193521935319354193551935619357193581935919360193611936219363193641936519366193671936819369193701937119372193731937419375193761937719378193791938019381193821938319384193851938619387193881938919390193911939219393193941939519396193971939819399194001940119402194031940419405194061940719408194091941019411194121941319414194151941619417194181941919420194211
94221942319424194251942619427194281942919430194311943219433194341943519436194371943819439194401944119442194431944419445194461944719448194491945019451194521945319454194551945619457194581945919460194611946219463194641946519466194671946819469194701947119472194731947419475194761947719478194791948019481194821948319484194851948619487194881948919490194911949219493194941949519496194971949819499195001950119502195031950419505195061950719508195091951019511195121951319514195151951619517195181951919520195211952219523195241952519526195271952819529195301953119532195331953419535195361953719538195391954019541195421954319544195451954619547195481954919550195511955219553195541955519556195571955819559195601956119562195631956419565195661956719568195691957019571195721957319574195751957619577195781957919580195811958219583195841958519586195871958819589195901959119592195931959419595195961959719598195991960019601196021960319604196051960619607196081960919610196111961219613196141961519616196171961819619196201962119622196231962419625196261962719628196291963019631196321963319634196351963619637196381963919640196411964219643196441964519646196471964819649196501965119652196531965419655196561965719658196591966019661196621966319664196651966619667196681966919670196711967219673196741967519676196771967819679196801968119682196831968419685196861968719688196891969019691196921969319694196951969619697196981969919700197011970219703197041970519706197071970819709197101971119712197131971419715197161971719718197191972019721197221972319724197251972619727197281972919730197311973219733197341973519736197371973819739197401974119742197431974419745197461974719748197491975019751197521975319754197551975619757197581975919760197611976219763197641976519766197671976819769197701977119772197731977419775197761977719778197791978019781197821978319784197851978619787197881978919790197911979219793197941979519796197971979819799198001980119802198031980419805198061980719808198091981019811198121981319814198151981619817198181981919820198211
98221982319824198251982619827198281982919830198311983219833198341983519836198371983819839198401984119842198431984419845198461984719848198491985019851198521985319854198551985619857198581985919860198611986219863198641986519866198671986819869198701987119872198731987419875198761987719878198791988019881198821988319884198851988619887198881988919890198911989219893198941989519896198971989819899199001990119902199031990419905199061990719908199091991019911199121991319914199151991619917199181991919920199211992219923199241992519926199271992819929199301993119932199331993419935199361993719938199391994019941199421994319944199451994619947199481994919950199511995219953199541995519956199571995819959199601996119962199631996419965199661996719968199691997019971199721997319974199751997619977199781997919980199811998219983199841998519986199871998819989199901999119992199931999419995199961999719998199992000020001200022000320004200052000620007200082000920010200112001220013200142001520016200172001820019200202002120022200232002420025200262002720028200292003020031200322003320034200352003620037200382003920040200412004220043200442004520046200472004820049200502005120052200532005420055200562005720058200592006020061200622006320064200652006620067200682006920070200712007220073200742007520076200772007820079200802008120082200832008420085200862008720088200892009020091200922009320094200952009620097200982009920100201012010220103201042010520106201072010820109201102011120112201132011420115201162011720118201192012020121201222012320124201252012620127201282012920130201312013220133201342013520136201372013820139201402014120142201432014420145201462014720148201492015020151201522015320154201552015620157201582015920160201612016220163201642016520166201672016820169201702017120172201732017420175201762017720178201792018020181201822018320184201852018620187201882018920190201912019220193201942019520196201972019820199202002020120202202032020420205202062020720208202092021020211202122021320214202152021620217202182021920220202212
02222022320224202252022620227202282022920230202312023220233202342023520236202372023820239202402024120242202432024420245202462024720248202492025020251202522025320254202552025620257202582025920260202612026220263202642026520266202672026820269202702027120272202732027420275202762027720278202792028020281202822028320284202852028620287202882028920290202912029220293202942029520296202972029820299203002030120302203032030420305203062030720308203092031020311203122031320314203152031620317203182031920320203212032220323203242032520326203272032820329203302033120332203332033420335203362033720338203392034020341203422034320344203452034620347203482034920350203512035220353203542035520356203572035820359203602036120362203632036420365203662036720368203692037020371203722037320374203752037620377203782037920380203812038220383203842038520386203872038820389203902039120392203932039420395203962039720398203992040020401204022040320404204052040620407204082040920410204112041220413204142041520416204172041820419204202042120422204232042420425204262042720428204292043020431204322043320434204352043620437204382043920440204412044220443204442044520446204472044820449204502045120452204532045420455204562045720458204592046020461204622046320464204652046620467204682046920470204712047220473204742047520476204772047820479204802048120482204832048420485204862048720488204892049020491204922049320494204952049620497204982049920500205012050220503205042050520506205072050820509205102051120512205132051420515205162051720518205192052020521205222052320524205252052620527205282052920530205312053220533205342053520536205372053820539205402054120542205432054420545205462054720548205492055020551205522055320554205552055620557205582055920560205612056220563205642056520566205672056820569205702057120572205732057420575205762057720578205792058020581205822058320584205852058620587205882058920590205912059220593205942059520596205972059820599206002060120602206032060420605206062060720608206092061020611206122061320614206152061620617206182061920620206212
06222062320624206252062620627206282062920630206312063220633206342063520636206372063820639206402064120642206432064420645206462064720648206492065020651206522065320654206552065620657206582065920660206612066220663206642066520666206672066820669206702067120672206732067420675206762067720678206792068020681206822068320684206852068620687206882068920690206912069220693206942069520696206972069820699207002070120702207032070420705207062070720708207092071020711207122071320714207152071620717207182071920720207212072220723207242072520726207272072820729207302073120732207332073420735207362073720738207392074020741207422074320744207452074620747207482074920750207512075220753207542075520756207572075820759207602076120762207632076420765207662076720768207692077020771207722077320774207752077620777207782077920780207812078220783207842078520786207872078820789207902079120792207932079420795207962079720798207992080020801208022080320804208052080620807208082080920810208112081220813208142081520816208172081820819208202082120822208232082420825208262082720828208292083020831208322083320834208352083620837208382083920840208412084220843208442084520846208472084820849208502085120852208532085420855208562085720858208592086020861208622086320864208652086620867208682086920870208712087220873208742087520876208772087820879208802088120882208832088420885208862088720888208892089020891208922089320894208952089620897208982089920900209012090220903209042090520906209072090820909209102091120912209132091420915209162091720918209192092020921209222092320924209252092620927209282092920930209312093220933209342093520936209372093820939209402094120942209432094420945209462094720948209492095020951209522095320954209552095620957209582095920960209612096220963209642096520966209672096820969209702097120972209732097420975209762097720978209792098020981209822098320984209852098620987209882098920990209912099220993209942099520996209972099820999210002100121002210032100421005210062100721008210092101021011210122101321014210152101621017210182101921020210212
10222102321024210252102621027210282102921030210312103221033210342103521036210372103821039210402104121042210432104421045210462104721048210492105021051210522105321054210552105621057210582105921060210612106221063210642106521066210672106821069210702107121072210732107421075210762107721078210792108021081210822108321084210852108621087210882108921090210912109221093210942109521096210972109821099211002110121102211032110421105211062110721108211092111021111211122111321114211152111621117211182111921120211212112221123211242112521126211272112821129211302113121132211332113421135211362113721138211392114021141211422114321144211452114621147211482114921150211512115221153211542115521156211572115821159211602116121162211632116421165211662116721168211692117021171211722117321174211752117621177211782117921180211812118221183211842118521186211872118821189211902119121192211932119421195211962119721198211992120021201212022120321204212052120621207212082120921210212112121221213212142121521216212172121821219212202122121222212232122421225212262122721228212292123021231212322123321234212352123621237212382123921240212412124221243212442124521246212472124821249212502125121252212532125421255212562125721258212592126021261212622126321264212652126621267212682126921270212712127221273212742127521276212772127821279212802128121282212832128421285212862128721288212892129021291212922129321294212952129621297212982129921300213012130221303213042130521306213072130821309213102131121312213132131421315213162131721318213192132021321213222132321324213252132621327213282132921330213312133221333213342133521336213372133821339213402134121342213432134421345213462134721348213492135021351213522135321354213552135621357213582135921360213612136221363213642136521366213672136821369213702137121372213732137421375213762137721378213792138021381213822138321384213852138621387213882138921390213912139221393213942139521396213972139821399214002140121402214032140421405214062140721408214092141021411214122141321414214152141621417214182141921420214212
14222142321424214252142621427214282142921430214312143221433214342143521436214372143821439214402144121442214432144421445214462144721448214492145021451214522145321454214552145621457214582145921460214612146221463214642146521466214672146821469214702147121472214732147421475214762147721478214792148021481214822148321484214852148621487214882148921490214912149221493214942149521496214972149821499215002150121502215032150421505215062150721508215092151021511215122151321514215152151621517215182151921520215212152221523215242152521526215272152821529215302153121532215332153421535215362153721538215392154021541215422154321544215452154621547215482154921550215512155221553215542155521556215572155821559215602156121562215632156421565215662156721568215692157021571215722157321574215752157621577215782157921580215812158221583215842158521586215872158821589215902159121592215932159421595215962159721598215992160021601216022160321604216052160621607216082160921610216112161221613216142161521616216172161821619216202162121622216232162421625216262162721628216292163021631216322163321634216352163621637216382163921640216412164221643216442164521646216472164821649216502165121652216532165421655216562165721658216592166021661216622166321664216652166621667216682166921670216712167221673216742167521676216772167821679216802168121682216832168421685216862168721688216892169021691216922169321694216952169621697216982169921700217012170221703217042170521706217072170821709217102171121712217132171421715217162171721718217192172021721217222172321724217252172621727217282172921730217312173221733217342173521736217372173821739217402174121742217432174421745217462174721748217492175021751217522175321754217552175621757217582175921760217612176221763217642176521766217672176821769217702177121772217732177421775217762177721778217792178021781217822178321784217852178621787217882178921790217912179221793217942179521796217972179821799218002180121802218032180421805218062180721808218092181021811218122181321814218152181621817218182181921820218212
18222182321824218252182621827218282182921830218312183221833218342183521836218372183821839218402184121842218432184421845218462184721848218492185021851218522185321854218552185621857218582185921860218612186221863218642186521866218672186821869218702187121872218732187421875218762187721878218792188021881218822188321884218852188621887218882188921890218912189221893218942189521896218972189821899219002190121902219032190421905219062190721908219092191021911219122191321914219152191621917219182191921920219212192221923219242192521926219272192821929219302193121932219332193421935219362193721938219392194021941219422194321944219452194621947219482194921950219512195221953219542195521956219572195821959219602196121962219632196421965219662196721968219692197021971219722197321974219752197621977219782197921980219812198221983219842198521986219872198821989219902199121992219932199421995219962199721998219992200022001220022200322004220052200622007220082200922010220112201222013220142201522016220172201822019220202202122022220232202422025220262202722028220292203022031220322203322034220352203622037220382203922040220412204222043220442204522046220472204822049220502205122052220532205422055220562205722058220592206022061220622206322064220652206622067220682206922070220712207222073220742207522076220772207822079220802208122082220832208422085220862208722088220892209022091220922209322094220952209622097220982209922100221012210222103221042210522106221072210822109221102211122112221132211422115221162211722118221192212022121221222212322124221252212622127221282212922130221312213222133221342213522136221372213822139221402214122142221432214422145221462214722148221492215022151221522215322154221552215622157221582215922160221612216222163221642216522166221672216822169221702217122172221732217422175221762217722178221792218022181221822218322184221852218622187221882218922190221912219222193221942219522196221972219822199222002220122202222032220422205222062220722208222092221022211222122221322214222152221622217222182221922220222212
22222222322224222252222622227222282222922230222312223222233222342223522236222372223822239222402224122242222432224422245222462224722248222492225022251222522225322254222552225622257222582225922260222612226222263222642226522266222672226822269222702227122272222732227422275222762227722278222792228022281222822228322284222852228622287222882228922290222912229222293222942229522296222972229822299223002230122302223032230422305223062230722308223092231022311223122231322314223152231622317223182231922320223212232222323223242232522326223272232822329223302233122332223332233422335223362233722338223392234022341223422234322344223452234622347223482234922350223512235222353223542235522356223572235822359223602236122362223632236422365223662236722368223692237022371223722237322374223752237622377223782237922380223812238222383223842238522386223872238822389223902239122392223932239422395223962239722398223992240022401224022240322404224052240622407224082240922410224112241222413224142241522416224172241822419224202242122422224232242422425224262242722428224292243022431224322243322434224352243622437224382243922440224412244222443224442244522446224472244822449224502245122452224532245422455224562245722458224592246022461224622246322464224652246622467224682246922470224712247222473224742247522476224772247822479224802248122482224832248422485224862248722488224892249022491224922249322494224952249622497224982249922500225012250222503225042250522506225072250822509225102251122512225132251422515225162251722518225192252022521225222252322524225252252622527225282252922530225312253222533225342253522536225372253822539225402254122542225432254422545225462254722548225492255022551225522255322554225552255622557225582255922560225612256222563225642256522566225672256822569225702257122572225732257422575225762257722578225792258022581225822258322584225852258622587225882258922590225912259222593225942259522596225972259822599226002260122602226032260422605226062260722608226092261022611226122261322614226152261622617226182261922620226212
26222262322624226252262622627226282262922630226312263222633226342263522636226372263822639226402264122642226432264422645226462264722648226492265022651226522265322654226552265622657226582265922660226612266222663226642266522666226672266822669226702267122672226732267422675226762267722678226792268022681226822268322684226852268622687226882268922690226912269222693226942269522696226972269822699227002270122702227032270422705227062270722708227092271022711227122271322714227152271622717227182271922720227212272222723227242272522726227272272822729227302273122732227332273422735227362273722738227392274022741227422274322744227452274622747227482274922750227512275222753227542275522756227572275822759227602276122762227632276422765227662276722768227692277022771227722277322774227752277622777227782277922780227812278222783227842278522786227872278822789227902279122792227932279422795227962279722798227992280022801228022280322804228052280622807228082280922810228112281222813228142281522816228172281822819228202282122822228232282422825228262282722828228292283022831228322283322834228352283622837228382283922840228412284222843228442284522846228472284822849228502285122852228532285422855228562285722858228592286022861228622286322864228652286622867228682286922870228712287222873228742287522876228772287822879228802288122882228832288422885228862288722888228892289022891228922289322894228952289622897228982289922900229012290222903229042290522906229072290822909229102291122912229132291422915229162291722918229192292022921229222292322924229252292622927229282292922930229312293222933229342293522936229372293822939229402294122942229432294422945229462294722948229492295022951229522295322954229552295622957229582295922960229612296222963229642296522966229672296822969229702297122972229732297422975229762297722978229792298022981229822298322984229852298622987229882298922990229912299222993229942299522996229972299822999230002300123002230032300423005230062300723008230092301023011230122301323014230152301623017230182301923020230212
30222302323024230252302623027230282302923030230312303223033230342303523036230372303823039230402304123042230432304423045230462304723048230492305023051230522305323054230552305623057230582305923060230612306223063230642306523066230672306823069230702307123072230732307423075230762307723078230792308023081230822308323084230852308623087230882308923090230912309223093230942309523096230972309823099231002310123102231032310423105231062310723108231092311023111231122311323114231152311623117231182311923120231212312223123231242312523126231272312823129231302313123132231332313423135231362313723138231392314023141231422314323144231452314623147231482314923150231512315223153231542315523156231572315823159231602316123162231632316423165231662316723168231692317023171231722317323174231752317623177231782317923180231812318223183231842318523186231872318823189231902319123192231932319423195231962319723198231992320023201232022320323204232052320623207232082320923210232112321223213232142321523216232172321823219232202322123222232232322423225232262322723228232292323023231232322323323234232352323623237232382323923240232412324223243232442324523246232472324823249232502325123252232532325423255232562325723258232592326023261232622326323264232652326623267232682326923270232712327223273232742327523276232772327823279232802328123282232832328423285232862328723288232892329023291232922329323294232952329623297232982329923300233012330223303233042330523306233072330823309233102331123312233132331423315233162331723318233192332023321233222332323324233252332623327233282332923330233312333223333233342333523336233372333823339233402334123342233432334423345233462334723348233492335023351233522335323354233552335623357233582335923360233612336223363233642336523366233672336823369233702337123372233732337423375233762337723378233792338023381233822338323384233852338623387233882338923390233912339223393233942339523396233972339823399234002340123402234032340423405234062340723408234092341023411234122341323414234152341623417234182341923420234212
34222342323424234252342623427234282342923430234312343223433234342343523436234372343823439234402344123442234432344423445234462344723448234492345023451234522345323454234552345623457234582345923460234612346223463234642346523466234672346823469234702347123472234732347423475234762347723478234792348023481234822348323484234852348623487234882348923490234912349223493234942349523496234972349823499235002350123502235032350423505235062350723508235092351023511235122351323514235152351623517235182351923520235212352223523235242352523526235272352823529235302353123532235332353423535235362353723538235392354023541235422354323544235452354623547235482354923550235512355223553235542355523556235572355823559235602356123562235632356423565235662356723568235692357023571235722357323574235752357623577235782357923580235812358223583235842358523586235872358823589235902359123592235932359423595235962359723598235992360023601236022360323604236052360623607236082360923610236112361223613236142361523616236172361823619236202362123622236232362423625236262362723628236292363023631236322363323634236352363623637236382363923640236412364223643236442364523646236472364823649236502365123652236532365423655236562365723658236592366023661236622366323664236652366623667236682366923670236712367223673236742367523676236772367823679236802368123682236832368423685236862368723688236892369023691236922369323694236952369623697236982369923700237012370223703237042370523706237072370823709237102371123712237132371423715237162371723718237192372023721237222372323724237252372623727237282372923730237312373223733237342373523736237372373823739237402374123742237432374423745237462374723748237492375023751237522375323754237552375623757237582375923760237612376223763237642376523766237672376823769237702377123772237732377423775237762377723778237792378023781237822378323784237852378623787237882378923790237912379223793237942379523796237972379823799238002380123802238032380423805238062380723808238092381023811238122381323814238152381623817238182381923820238212
38222382323824238252382623827238282382923830238312383223833238342383523836238372383823839238402384123842238432384423845238462384723848238492385023851238522385323854238552385623857238582385923860238612386223863238642386523866238672386823869238702387123872238732387423875238762387723878238792388023881238822388323884238852388623887238882388923890238912389223893238942389523896238972389823899239002390123902239032390423905239062390723908239092391023911239122391323914239152391623917239182391923920239212392223923239242392523926239272392823929239302393123932239332393423935239362393723938239392394023941239422394323944239452394623947239482394923950239512395223953239542395523956239572395823959239602396123962239632396423965239662396723968239692397023971239722397323974239752397623977239782397923980239812398223983239842398523986239872398823989239902399123992239932399423995239962399723998239992400024001240022400324004240052400624007240082400924010240112401224013240142401524016240172401824019240202402124022240232402424025240262402724028240292403024031240322403324034240352403624037240382403924040240412404224043240442404524046240472404824049240502405124052240532405424055240562405724058240592406024061240622406324064240652406624067240682406924070240712407224073240742407524076240772407824079240802408124082240832408424085240862408724088240892409024091240922409324094240952409624097240982409924100241012410224103241042410524106241072410824109241102411124112241132411424115241162411724118241192412024121241222412324124241252412624127241282412924130241312413224133241342413524136241372413824139241402414124142241432414424145241462414724148241492415024151241522415324154241552415624157241582415924160241612416224163241642416524166241672416824169241702417124172241732417424175241762417724178241792418024181241822418324184241852418624187241882418924190241912419224193241942419524196241972419824199242002420124202242032420424205242062420724208242092421024211242122421324214242152421624217242182421924220242212
42222422324224242252422624227242282422924230242312423224233242342423524236242372423824239242402424124242242432424424245242462424724248242492425024251242522425324254242552425624257242582425924260242612426224263242642426524266242672426824269242702427124272242732427424275242762427724278242792428024281242822428324284242852428624287242882428924290242912429224293242942429524296242972429824299243002430124302243032430424305243062430724308243092431024311243122431324314243152431624317243182431924320243212432224323243242432524326243272432824329243302433124332243332433424335243362433724338243392434024341243422434324344243452434624347243482434924350243512435224353243542435524356243572435824359243602436124362243632436424365243662436724368243692437024371243722437324374243752437624377243782437924380243812438224383243842438524386243872438824389243902439124392243932439424395243962439724398243992440024401244022440324404244052440624407244082440924410244112441224413244142441524416244172441824419244202442124422244232442424425244262442724428244292443024431244322443324434244352443624437244382443924440244412444224443244442444524446244472444824449244502445124452244532445424455244562445724458244592446024461244622446324464244652446624467244682446924470244712447224473244742447524476244772447824479244802448124482244832448424485244862448724488244892449024491244922449324494244952449624497244982449924500245012450224503245042450524506245072450824509245102451124512245132451424515245162451724518245192452024521245222452324524245252452624527245282452924530245312453224533245342453524536245372453824539245402454124542245432454424545245462454724548245492455024551245522455324554245552455624557245582455924560245612456224563245642456524566245672456824569245702457124572245732457424575245762457724578245792458024581245822458324584245852458624587245882458924590245912459224593245942459524596245972459824599246002460124602246032460424605246062460724608246092461024611246122461324614246152461624617246182461924620246212
46222462324624246252462624627246282462924630246312463224633246342463524636246372463824639246402464124642246432464424645246462464724648246492465024651246522465324654246552465624657246582465924660246612466224663246642466524666246672466824669246702467124672246732467424675246762467724678246792468024681246822468324684246852468624687246882468924690246912469224693246942469524696246972469824699247002470124702247032470424705247062470724708247092471024711247122471324714247152471624717247182471924720247212472224723247242472524726247272472824729247302473124732247332473424735247362473724738247392474024741247422474324744247452474624747247482474924750247512475224753247542475524756247572475824759247602476124762247632476424765247662476724768247692477024771247722477324774247752477624777247782477924780247812478224783247842478524786247872478824789247902479124792247932479424795247962479724798247992480024801248022480324804248052480624807248082480924810248112481224813248142481524816248172481824819248202482124822248232482424825248262482724828248292483024831248322483324834248352483624837248382483924840248412484224843248442484524846248472484824849248502485124852248532485424855248562485724858248592486024861248622486324864248652486624867248682486924870248712487224873248742487524876248772487824879248802488124882248832488424885248862488724888248892489024891248922489324894248952489624897248982489924900249012490224903249042490524906249072490824909249102491124912249132491424915249162491724918249192492024921249222492324924249252492624927249282492924930249312493224933249342493524936249372493824939249402494124942249432494424945249462494724948249492495024951249522495324954249552495624957249582495924960249612496224963249642496524966249672496824969249702497124972249732497424975249762497724978249792498024981249822498324984249852498624987249882498924990249912499224993249942499524996249972499824999250002500125002250032500425005250062500725008250092501025011250122501325014250152501625017250182501925020250212
50222502325024250252502625027250282502925030250312503225033250342503525036250372503825039250402504125042250432504425045250462504725048250492505025051250522505325054250552505625057250582505925060250612506225063250642506525066250672506825069250702507125072250732507425075250762507725078250792508025081250822508325084250852508625087250882508925090250912509225093250942509525096250972509825099251002510125102251032510425105251062510725108251092511025111251122511325114251152511625117251182511925120251212512225123251242512525126251272512825129251302513125132251332513425135251362513725138251392514025141251422514325144251452514625147251482514925150251512515225153251542515525156251572515825159251602516125162251632516425165251662516725168251692517025171251722517325174251752517625177251782517925180251812518225183251842518525186251872518825189251902519125192251932519425195251962519725198251992520025201252022520325204252052520625207252082520925210252112521225213252142521525216252172521825219252202522125222252232522425225252262522725228252292523025231252322523325234252352523625237252382523925240252412524225243252442524525246252472524825249252502525125252252532525425255252562525725258252592526025261252622526325264252652526625267252682526925270252712527225273252742527525276252772527825279252802528125282252832528425285252862528725288252892529025291252922529325294252952529625297252982529925300253012530225303253042530525306253072530825309253102531125312253132531425315253162531725318253192532025321253222532325324253252532625327253282532925330253312533225333253342533525336253372533825339253402534125342253432534425345253462534725348253492535025351253522535325354253552535625357253582535925360253612536225363253642536525366253672536825369253702537125372253732537425375253762537725378253792538025381253822538325384253852538625387253882538925390253912539225393253942539525396253972539825399254002540125402254032540425405254062540725408254092541025411254122541325414254152541625417254182541925420254212
54222542325424254252542625427254282542925430254312543225433254342543525436254372543825439254402544125442254432544425445254462544725448254492545025451254522545325454254552545625457254582545925460254612546225463254642546525466254672546825469254702547125472254732547425475254762547725478254792548025481254822548325484254852548625487254882548925490254912549225493254942549525496254972549825499255002550125502255032550425505255062550725508255092551025511255122551325514255152551625517255182551925520255212552225523255242552525526255272552825529255302553125532255332553425535255362553725538255392554025541255422554325544255452554625547255482554925550255512555225553255542555525556255572555825559255602556125562255632556425565255662556725568255692557025571255722557325574255752557625577255782557925580255812558225583255842558525586255872558825589255902559125592255932559425595255962559725598255992560025601256022560325604256052560625607256082560925610256112561225613256142561525616256172561825619256202562125622256232562425625256262562725628256292563025631256322563325634256352563625637256382563925640256412564225643256442564525646256472564825649256502565125652256532565425655256562565725658256592566025661256622566325664256652566625667256682566925670256712567225673256742567525676256772567825679256802568125682256832568425685256862568725688256892569025691256922569325694256952569625697256982569925700257012570225703257042570525706257072570825709257102571125712257132571425715257162571725718257192572025721257222572325724257252572625727257282572925730257312573225733257342573525736257372573825739257402574125742257432574425745257462574725748257492575025751257522575325754257552575625757257582575925760257612576225763257642576525766257672576825769257702577125772257732577425775257762577725778257792578025781257822578325784257852578625787257882578925790257912579225793257942579525796257972579825799258002580125802258032580425805258062580725808258092581025811258122581325814258152581625817258182581925820258212
58222582325824258252582625827258282582925830258312583225833258342583525836258372583825839258402584125842258432584425845258462584725848258492585025851258522585325854258552585625857258582585925860258612586225863258642586525866258672586825869258702587125872258732587425875258762587725878258792588025881258822588325884258852588625887258882588925890258912589225893258942589525896258972589825899259002590125902259032590425905259062590725908259092591025911259122591325914259152591625917259182591925920259212592225923259242592525926259272592825929259302593125932259332593425935259362593725938259392594025941259422594325944259452594625947259482594925950259512595225953259542595525956259572595825959259602596125962259632596425965259662596725968259692597025971259722597325974259752597625977259782597925980259812598225983259842598525986259872598825989259902599125992259932599425995259962599725998259992600026001260022600326004260052600626007260082600926010260112601226013260142601526016260172601826019260202602126022260232602426025260262602726028260292603026031260322603326034260352603626037260382603926040260412604226043260442604526046260472604826049260502605126052260532605426055260562605726058260592606026061260622606326064260652606626067260682606926070260712607226073260742607526076260772607826079260802608126082260832608426085260862608726088260892609026091260922609326094260952609626097260982609926100261012610226103261042610526106261072610826109261102611126112261132611426115261162611726118261192612026121261222612326124261252612626127261282612926130261312613226133261342613526136261372613826139261402614126142261432614426145261462614726148261492615026151261522615326154261552615626157261582615926160261612616226163261642616526166261672616826169261702617126172261732617426175261762617726178261792618026181261822618326184261852618626187261882618926190261912619226193261942619526196261972619826199262002620126202262032620426205262062620726208262092621026211262122621326214262152621626217262182621926220262212
62222622326224262252622626227262282622926230262312623226233262342623526236262372623826239262402624126242262432624426245262462624726248262492625026251262522625326254262552625626257262582625926260262612626226263262642626526266262672626826269262702627126272262732627426275262762627726278262792628026281262822628326284262852628626287262882628926290262912629226293262942629526296262972629826299263002630126302263032630426305263062630726308263092631026311263122631326314263152631626317263182631926320263212632226323263242632526326263272632826329263302633126332263332633426335263362633726338263392634026341263422634326344263452634626347263482634926350263512635226353263542635526356263572635826359263602636126362263632636426365263662636726368263692637026371263722637326374263752637626377263782637926380263812638226383263842638526386263872638826389263902639126392263932639426395263962639726398263992640026401264022640326404264052640626407264082640926410264112641226413264142641526416264172641826419264202642126422264232642426425264262642726428264292643026431264322643326434264352643626437264382643926440264412644226443264442644526446264472644826449264502645126452264532645426455264562645726458264592646026461264622646326464264652646626467264682646926470264712647226473264742647526476264772647826479264802648126482264832648426485264862648726488264892649026491264922649326494264952649626497264982649926500265012650226503265042650526506265072650826509265102651126512265132651426515265162651726518265192652026521265222652326524265252652626527265282652926530265312653226533265342653526536265372653826539265402654126542265432654426545265462654726548265492655026551265522655326554265552655626557265582655926560265612656226563265642656526566265672656826569265702657126572265732657426575265762657726578265792658026581265822658326584265852658626587265882658926590265912659226593265942659526596265972659826599266002660126602266032660426605266062660726608266092661026611266122661326614266152661626617266182661926620266212
66222662326624266252662626627266282662926630266312663226633266342663526636266372663826639266402664126642266432664426645266462664726648266492665026651266522665326654266552665626657266582665926660266612666226663266642666526666266672666826669266702667126672266732667426675266762667726678266792668026681266822668326684266852668626687266882668926690266912669226693266942669526696266972669826699267002670126702267032670426705267062670726708267092671026711267122671326714267152671626717267182671926720267212672226723267242672526726267272672826729267302673126732267332673426735267362673726738267392674026741267422674326744267452674626747267482674926750267512675226753267542675526756267572675826759267602676126762267632676426765267662676726768267692677026771267722677326774267752677626777267782677926780267812678226783267842678526786267872678826789267902679126792267932679426795267962679726798267992680026801268022680326804268052680626807268082680926810268112681226813268142681526816268172681826819268202682126822268232682426825268262682726828268292683026831268322683326834268352683626837268382683926840268412684226843268442684526846268472684826849268502685126852268532685426855268562685726858268592686026861268622686326864268652686626867268682686926870268712687226873268742687526876268772687826879268802688126882268832688426885268862688726888268892689026891268922689326894268952689626897268982689926900269012690226903269042690526906269072690826909269102691126912269132691426915269162691726918269192692026921269222692326924269252692626927269282692926930269312693226933269342693526936269372693826939269402694126942269432694426945269462694726948269492695026951269522695326954269552695626957269582695926960269612696226963269642696526966269672696826969269702697126972269732697426975269762697726978269792698026981269822698326984269852698626987269882698926990269912699226993269942699526996269972699826999270002700127002270032700427005270062700727008270092701027011270122701327014270152701627017270182701927020270212
70222702327024270252702627027270282702927030270312703227033270342703527036270372703827039270402704127042270432704427045270462704727048270492705027051270522705327054270552705627057270582705927060270612706227063270642706527066270672706827069270702707127072270732707427075270762707727078270792708027081270822708327084270852708627087270882708927090270912709227093270942709527096270972709827099271002710127102271032710427105271062710727108271092711027111271122711327114271152711627117271182711927120271212712227123271242712527126271272712827129271302713127132271332713427135271362713727138271392714027141271422714327144271452714627147271482714927150271512715227153271542715527156271572715827159271602716127162271632716427165271662716727168271692717027171271722717327174271752717627177271782717927180271812718227183271842718527186271872718827189271902719127192271932719427195271962719727198271992720027201272022720327204272052720627207272082720927210272112721227213272142721527216272172721827219272202722127222272232722427225272262722727228272292723027231272322723327234272352723627237272382723927240272412724227243272442724527246272472724827249272502725127252272532725427255272562725727258272592726027261272622726327264272652726627267272682726927270272712727227273272742727527276272772727827279272802728127282272832728427285272862728727288272892729027291272922729327294272952729627297272982729927300273012730227303273042730527306273072730827309273102731127312273132731427315273162731727318273192732027321273222732327324273252732627327273282732927330273312733227333273342733527336273372733827339273402734127342273432734427345273462734727348273492735027351273522735327354273552735627357273582735927360273612736227363273642736527366273672736827369273702737127372273732737427375273762737727378273792738027381273822738327384273852738627387273882738927390273912739227393273942739527396273972739827399274002740127402274032740427405274062740727408274092741027411274122741327414274152741627417274182741927420274212
74222742327424274252742627427274282742927430274312743227433274342743527436274372743827439274402744127442274432744427445274462744727448274492745027451274522745327454274552745627457274582745927460274612746227463274642746527466274672746827469274702747127472274732747427475274762747727478274792748027481274822748327484274852748627487274882748927490274912749227493274942749527496274972749827499275002750127502275032750427505275062750727508275092751027511275122751327514275152751627517275182751927520275212752227523275242752527526275272752827529275302753127532275332753427535275362753727538275392754027541275422754327544275452754627547275482754927550275512755227553275542755527556275572755827559275602756127562275632756427565275662756727568275692757027571275722757327574275752757627577275782757927580275812758227583275842758527586275872758827589275902759127592275932759427595275962759727598275992760027601276022760327604276052760627607276082760927610276112761227613276142761527616276172761827619276202762127622276232762427625276262762727628276292763027631276322763327634276352763627637276382763927640276412764227643276442764527646276472764827649276502765127652276532765427655276562765727658276592766027661276622766327664276652766627667276682766927670276712767227673276742767527676276772767827679276802768127682276832768427685276862768727688276892769027691276922769327694276952769627697276982769927700277012770227703277042770527706277072770827709277102771127712277132771427715277162771727718277192772027721277222772327724277252772627727277282772927730277312773227733277342773527736277372773827739277402774127742277432774427745277462774727748277492775027751277522775327754277552775627757277582775927760277612776227763277642776527766277672776827769277702777127772277732777427775277762777727778277792778027781277822778327784277852778627787277882778927790277912779227793277942779527796277972779827799278002780127802278032780427805278062780727808278092781027811278122781327814278152781627817278182781927820278212
78222782327824278252782627827278282782927830278312783227833278342783527836278372783827839278402784127842278432784427845278462784727848278492785027851278522785327854278552785627857278582785927860278612786227863278642786527866278672786827869278702787127872278732787427875278762787727878278792788027881278822788327884278852788627887278882788927890278912789227893278942789527896278972789827899279002790127902279032790427905279062790727908279092791027911279122791327914279152791627917279182791927920279212792227923279242792527926279272792827929279302793127932279332793427935279362793727938279392794027941279422794327944279452794627947279482794927950279512795227953279542795527956279572795827959279602796127962279632796427965279662796727968279692797027971279722797327974279752797627977279782797927980279812798227983279842798527986279872798827989279902799127992279932799427995279962799727998279992800028001280022800328004280052800628007280082800928010280112801228013280142801528016280172801828019280202802128022280232802428025280262802728028280292803028031280322803328034280352803628037280382803928040280412804228043280442804528046280472804828049280502805128052280532805428055280562805728058280592806028061280622806328064280652806628067280682806928070280712807228073280742807528076280772807828079280802808128082280832808428085280862808728088280892809028091280922809328094280952809628097280982809928100281012810228103281042810528106281072810828109281102811128112281132811428115281162811728118281192812028121281222812328124281252812628127281282812928130281312813228133281342813528136281372813828139281402814128142281432814428145281462814728148281492815028151281522815328154281552815628157281582815928160281612816228163281642816528166281672816828169281702817128172281732817428175281762817728178281792818028181281822818328184281852818628187281882818928190281912819228193281942819528196281972819828199282002820128202282032820428205282062820728208282092821028211282122821328214282152821628217282182821928220282212
82222822328224282252822628227282282822928230282312823228233282342823528236282372823828239282402824128242282432824428245282462824728248282492825028251282522825328254282552825628257282582825928260282612826228263282642826528266282672826828269282702827128272282732827428275282762827728278282792828028281282822828328284282852828628287282882828928290282912829228293282942829528296282972829828299283002830128302283032830428305283062830728308283092831028311283122831328314283152831628317283182831928320283212832228323283242832528326283272832828329283302833128332283332833428335283362833728338283392834028341283422834328344283452834628347283482834928350283512835228353283542835528356283572835828359283602836128362283632836428365283662836728368283692837028371283722837328374283752837628377283782837928380283812838228383283842838528386283872838828389283902839128392283932839428395283962839728398283992840028401284022840328404284052840628407284082840928410284112841228413284142841528416284172841828419284202842128422284232842428425284262842728428284292843028431284322843328434284352843628437284382843928440284412844228443284442844528446284472844828449284502845128452284532845428455284562845728458284592846028461284622846328464284652846628467284682846928470284712847228473284742847528476284772847828479284802848128482284832848428485284862848728488284892849028491284922849328494284952849628497284982849928500285012850228503285042850528506285072850828509285102851128512285132851428515285162851728518285192852028521285222852328524285252852628527285282852928530285312853228533285342853528536285372853828539285402854128542285432854428545285462854728548285492855028551285522855328554285552855628557285582855928560285612856228563285642856528566285672856828569285702857128572285732857428575285762857728578285792858028581285822858328584285852858628587285882858928590285912859228593285942859528596285972859828599286002860128602286032860428605286062860728608286092861028611286122861328614286152861628617286182861928620286212
86222862328624286252862628627286282862928630286312863228633286342863528636286372863828639286402864128642286432864428645286462864728648286492865028651286522865328654286552865628657286582865928660286612866228663286642866528666286672866828669286702867128672286732867428675286762867728678286792868028681286822868328684286852868628687286882868928690286912869228693286942869528696286972869828699287002870128702287032870428705287062870728708287092871028711287122871328714287152871628717287182871928720287212872228723287242872528726287272872828729287302873128732287332873428735287362873728738287392874028741287422874328744287452874628747287482874928750287512875228753287542875528756287572875828759287602876128762287632876428765287662876728768287692877028771287722877328774287752877628777287782877928780287812878228783287842878528786287872878828789287902879128792287932879428795287962879728798287992880028801288022880328804288052880628807288082880928810288112881228813288142881528816288172881828819288202882128822288232882428825288262882728828288292883028831288322883328834288352883628837288382883928840288412884228843288442884528846288472884828849288502885128852288532885428855288562885728858288592886028861288622886328864288652886628867288682886928870288712887228873288742887528876288772887828879288802888128882288832888428885288862888728888288892889028891288922889328894288952889628897288982889928900289012890228903289042890528906289072890828909289102891128912289132891428915289162891728918289192892028921289222892328924289252892628927289282892928930289312893228933289342893528936289372893828939289402894128942289432894428945289462894728948289492895028951289522895328954289552895628957289582895928960289612896228963289642896528966289672896828969289702897128972289732897428975289762897728978289792898028981289822898328984289852898628987289882898928990289912899228993289942899528996289972899828999290002900129002290032900429005290062900729008290092901029011290122901329014290152901629017290182901929020290212
90222902329024290252902629027290282902929030290312903229033290342903529036290372903829039290402904129042290432904429045290462904729048290492905029051290522905329054290552905629057290582905929060290612906229063290642906529066290672906829069290702907129072290732907429075290762907729078290792908029081290822908329084290852908629087290882908929090290912909229093290942909529096290972909829099291002910129102291032910429105291062910729108291092911029111291122911329114291152911629117291182911929120291212912229123291242912529126291272912829129291302913129132291332913429135291362913729138291392914029141291422914329144291452914629147291482914929150291512915229153291542915529156291572915829159291602916129162291632916429165291662916729168291692917029171291722917329174291752917629177291782917929180291812918229183291842918529186291872918829189291902919129192291932919429195291962919729198291992920029201292022920329204292052920629207292082920929210292112921229213292142921529216292172921829219292202922129222292232922429225292262922729228292292923029231292322923329234292352923629237292382923929240292412924229243292442924529246292472924829249292502925129252292532925429255292562925729258292592926029261292622926329264292652926629267292682926929270292712927229273292742927529276292772927829279292802928129282292832928429285292862928729288292892929029291292922929329294292952929629297292982929929300293012930229303293042930529306293072930829309293102931129312293132931429315293162931729318293192932029321293222932329324293252932629327293282932929330293312933229333293342933529336293372933829339293402934129342293432934429345293462934729348293492935029351293522935329354293552935629357293582935929360293612936229363293642936529366293672936829369293702937129372293732937429375293762937729378293792938029381293822938329384293852938629387293882938929390293912939229393293942939529396293972939829399294002940129402294032940429405294062940729408294092941029411294122941329414294152941629417294182941929420294212
94222942329424294252942629427294282942929430294312943229433294342943529436294372943829439294402944129442294432944429445294462944729448294492945029451294522945329454294552945629457294582945929460294612946229463294642946529466294672946829469294702947129472294732947429475294762947729478294792948029481294822948329484294852948629487294882948929490294912949229493294942949529496294972949829499295002950129502295032950429505295062950729508295092951029511295122951329514295152951629517295182951929520295212952229523295242952529526295272952829529295302953129532295332953429535295362953729538295392954029541295422954329544295452954629547295482954929550295512955229553295542955529556295572955829559295602956129562295632956429565295662956729568295692957029571295722957329574295752957629577295782957929580295812958229583295842958529586295872958829589295902959129592295932959429595295962959729598295992960029601296022960329604296052960629607296082960929610296112961229613296142961529616296172961829619296202962129622296232962429625296262962729628296292963029631296322963329634296352963629637296382963929640296412964229643
  1. //
  2. // $Id$
  3. //
  4. //
  5. // Copyright (c) 2001-2012, Andrew Aksyonoff
  6. // Copyright (c) 2008-2012, Sphinx Technologies Inc
  7. // All rights reserved
  8. //
  9. // This program is free software; you can redistribute it and/or modify
  10. // it under the terms of the GNU General Public License. You should have
  11. // received a copy of the GPL license along with this program; if you
  12. // did not, you can find it at http://www.gnu.org/
  13. //
  14. #include "sphinx.h"
  15. #include "sphinxstem.h"
  16. #include "sphinxquery.h"
  17. #include "sphinxutils.h"
  18. #include "sphinxexpr.h"
  19. #include "sphinxfilter.h"
  20. #include "sphinxint.h"
  21. #include "sphinxsearch.h"
  22. #include "sphinxjson.h"
  23. #include <ctype.h>
  24. #include <fcntl.h>
  25. #include <stdio.h>
  26. #include <stdlib.h>
  27. #include <stdarg.h>
  28. #include <sys/types.h>
  29. #include <sys/stat.h>
  30. #include <limits.h>
  31. #include <time.h>
  32. #include <math.h>
  33. #include <float.h>
  34. #define SPH_UNPACK_BUFFER_SIZE 4096
  35. #define SPH_READ_PROGRESS_CHUNK (8192*1024)
  36. #define SPH_READ_NOPROGRESS_CHUNK (32768*1024)
  37. #if USE_LIBSTEMMER
  38. #include <libstemmer.h>
  39. #endif
  40. #if USE_LIBEXPAT
  41. #define XMLIMPORT
  42. #include "expat.h"
  43. // workaround for expat versions prior to 1.95.7
  44. #ifndef XMLCALL
  45. #define XMLCALL
  46. #endif
  47. #endif
  48. #if USE_LIBXML
  49. #include <libxml/xmlreader.h>
  50. #endif
  51. #if USE_LIBICONV
  52. #include "iconv.h"
  53. #endif
  54. #if USE_ZLIB
  55. #include <zlib.h>
  56. #endif
  57. #if USE_ODBC
  58. #include <sql.h>
  59. #endif
  60. #if USE_RE2
  61. #include <string>
  62. #include <re2/re2.h>
  63. #endif
  64. #if USE_WINDOWS
  65. #include <io.h> // for open()
  66. // workaround Windows quirks
  67. #define popen _popen
  68. #define pclose _pclose
  69. #define snprintf _snprintf
  70. #define sphSeek _lseeki64
  71. #define stat _stat64
  72. #define fstat _fstat64
  73. #if _MSC_VER<1400
  74. #define struct_stat __stat64
  75. #else
  76. #define struct_stat struct _stat64
  77. #endif
  78. #define ICONV_INBUF_CONST 1
  79. #else
  80. #include <unistd.h>
  81. #include <sys/time.h>
  82. #define sphSeek lseek
  83. #define struct_stat struct stat
  84. #endif
  85. #if ( USE_WINDOWS && USE_MYSQL )
  86. #pragma comment(linker, "/defaultlib:libmysql.lib")
  87. #pragma message("Automatically linking with libmysql.lib")
  88. #endif
  89. #if ( USE_WINDOWS && USE_PGSQL )
  90. #pragma comment(linker, "/defaultlib:libpq.lib")
  91. #pragma message("Automatically linking with libpq.lib")
  92. #endif
  93. #if ( USE_WINDOWS && USE_LIBSTEMMER )
  94. #pragma comment(linker, "/defaultlib:libstemmer_c.lib")
  95. #pragma message("Automatically linking with libstemmer_c.lib")
  96. #endif
  97. #if ( USE_WINDOWS && USE_LIBEXPAT )
  98. #pragma comment(linker, "/defaultlib:libexpat.lib")
  99. #pragma message("Automatically linking with libexpat.lib")
  100. #endif
  101. #if ( USE_WINDOWS && USE_LIBICONV )
  102. #pragma comment(linker, "/defaultlib:iconv.lib")
  103. #pragma message("Automatically linking with iconv.lib")
  104. #endif
  105. #if ( USE_WINDOWS && USE_LIBXML )
  106. #pragma comment(linker, "/defaultlib:libxml.lib")
  107. #pragma message("Automatically linking with libxml.lib")
  108. #endif
  109. #if ( USE_WINDOWS && USE_RE2 )
  110. #pragma comment(linker, "/defaultlib:re2.lib")
  111. #pragma message("Automatically linking with re2.lib")
  112. #endif
  113. /////////////////////////////////////////////////////////////////////////////
  114. typedef Hitman_c<8> HITMAN;
  115. // logf() is not there sometimes (eg. Solaris 9)
  116. #if !USE_WINDOWS && !HAVE_LOGF
  117. static inline float logf ( float v )
  118. {
  119. return (float) log ( v );
  120. }
  121. #endif
  #if USE_WINDOWS
  /// Minimal localtime_r() stand-in for Windows builds.
  /// Copies the broken-down local time for *clock into *res.
  /// localtime() may return NULL for a value it cannot convert; in that case
  /// *res is zero-filled instead of dereferencing the NULL result.
  void localtime_r ( const time_t * clock, struct tm * res )
  {
      const struct tm * pLocal = localtime ( clock );
      if ( pLocal )
      {
          *res = *pLocal;
      } else
      {
          const struct tm tZero = { 0 };
          *res = tZero;
      }
  }
  #endif
  128. // forward decl
  129. void sphWarn ( const char * sTemplate, ... ) __attribute__ ( ( format ( printf, 1, 2 ) ) );
  130. static bool sphTruncate ( int iFD );
  131. /////////////////////////////////////////////////////////////////////////////
  132. // GLOBALS
  133. /////////////////////////////////////////////////////////////////////////////
  134. const char * SPHINX_DEFAULT_SBCS_TABLE = "0..9, A..Z->a..z, _, a..z, U+A8->U+B8, U+B8, U+C0..U+DF->U+E0..U+FF, U+E0..U+FF";
  135. const char * SPHINX_DEFAULT_UTF8_TABLE = "0..9, A..Z->a..z, _, a..z, U+410..U+42F->U+430..U+44F, U+430..U+44F, U+401->U+451, U+451";
  136. const char * MAGIC_WORD_SENTENCE = "\3sentence"; // emitted from source on sentence boundary, stored in dictionary
  137. const char * MAGIC_WORD_PARAGRAPH = "\3paragraph"; // emitted from source on paragraph boundary, stored in dictionary
  138. bool g_bJsonStrict = false;
  139. bool g_bJsonAutoconvNumbers = false;
  140. bool g_bJsonKeynamesToLowercase = false;
  141. static const int DEFAULT_READ_BUFFER = 262144;
  142. static const int DEFAULT_READ_UNHINTED = 32768;
  143. static const int MIN_READ_BUFFER = 8192;
  144. static const int MIN_READ_UNHINTED = 1024;
  145. #define READ_NO_SIZE_HINT 0
  146. static bool g_bSphQuiet = false;
  147. static int g_iReadBuffer = DEFAULT_READ_BUFFER;
  148. static int g_iReadUnhinted = DEFAULT_READ_UNHINTED;
  149. #ifndef SHAREDIR
  150. #define SHAREDIR "."
  151. #endif
  152. CSphString g_sLemmatizerBase = SHAREDIR;
  153. // quick hack for indexer crash reporting
  154. // one day, these might turn into a callback or something
  155. int64_t g_iIndexerCurrentDocID = 0;
  156. int64_t g_iIndexerCurrentHits = 0;
  157. int64_t g_iIndexerCurrentRangeMin = 0;
  158. int64_t g_iIndexerCurrentRangeMax = 0;
  159. int64_t g_iIndexerPoolStartDocID = 0;
  160. int64_t g_iIndexerPoolStartHit = 0;
  161. /// global IDF
  162. class CSphGlobalIDF
  163. {
  164. public:
  165. CSphGlobalIDF ()
  166. : m_iTotalDocuments ( 0 )
  167. , m_iTotalWords ( 0 )
  168. {}
  169. bool Touch ( const CSphString & sFilename );
  170. bool Preread ( const CSphString & sFilename, CSphString & sError );
  171. const DWORD GetDocs ( const CSphString & sWord ) const;
  172. float GetIDF ( const CSphString & sWord, int iDocsLocal, int iQwords, bool bPlainIDF );
  173. protected:
  174. #pragma pack(push,4)
  175. struct IDFWord_t
  176. {
  177. uint64_t m_uWordID;
  178. DWORD m_iDocs;
  179. };
  180. #pragma pack(pop)
  181. STATIC_SIZE_ASSERT ( IDFWord_t, 12 );
  182. static const int HASH_BITS = 16;
  183. int64_t m_iTotalDocuments;
  184. int64_t m_iTotalWords;
  185. SphOffset_t m_uMTime;
  186. CSphSharedBuffer<IDFWord_t> m_pWords;
  187. CSphSharedBuffer<int64_t> m_pHash;
  188. };
  189. /// global idf definitions hash
  190. static SmallStringHash_T <CSphGlobalIDF * > g_hGlobalIDFs;
  191. static CSphStaticMutex g_tGlobalIDFLock;
  192. /////////////////////////////////////////////////////////////////////////////
  193. // COMPILE-TIME CHECKS
  194. /////////////////////////////////////////////////////////////////////////////
  195. STATIC_SIZE_ASSERT ( SphOffset_t, 8 );
  196. /////////////////////////////////////////////////////////////////////////////
  #if !USE_WINDOWS
  // head process flag; true until sphSetProcessInfo() says otherwise
  bool g_bHeadProcess = true;

  /// store whether the current process is the head one
  void sphSetProcessInfo ( bool bHead )
  {
      g_bHeadProcess = bHead;
  }
  #endif // !USE_WINDOWS
  204. // whatever to collect IO stats
  205. static bool g_bCollectIOStats = false;
  206. static SphThreadKey_t g_tIOStatsTls;
  207. bool sphInitIOStats ()
  208. {
  209. if ( !sphThreadKeyCreate ( &g_tIOStatsTls ) )
  210. return false;
  211. g_bCollectIOStats = true;
  212. return true;
  213. }
  214. void sphDoneIOStats ()
  215. {
  216. sphThreadKeyDelete ( g_tIOStatsTls );
  217. g_bCollectIOStats = false;
  218. }
  219. CSphIOStats::CSphIOStats ()
  220. : m_iReadTime ( 0 )
  221. , m_iReadOps ( 0 )
  222. , m_iReadBytes ( 0 )
  223. , m_iWriteTime ( 0 )
  224. , m_iWriteOps ( 0 )
  225. , m_iWriteBytes ( 0 )
  226. , m_pPrev ( NULL )
  227. {}
  228. CSphIOStats::~CSphIOStats ()
  229. {
  230. Stop();
  231. }
  232. void CSphIOStats::Start()
  233. {
  234. if ( !g_bCollectIOStats )
  235. return;
  236. m_pPrev = (CSphIOStats *)sphThreadGet ( g_tIOStatsTls );
  237. sphThreadSet ( g_tIOStatsTls, this );
  238. m_bEnabled = true;
  239. }
  240. void CSphIOStats::Stop()
  241. {
  242. if ( !g_bCollectIOStats )
  243. return;
  244. m_bEnabled = false;
  245. sphThreadSet ( g_tIOStatsTls, m_pPrev );
  246. }
  247. void CSphIOStats::Add ( const CSphIOStats & b )
  248. {
  249. m_iReadTime += b.m_iReadTime;
  250. m_iReadOps += b.m_iReadOps;
  251. m_iReadBytes += b.m_iReadBytes;
  252. m_iWriteTime += b.m_iWriteTime;
  253. m_iWriteOps += b.m_iWriteOps;
  254. m_iWriteBytes += b.m_iWriteBytes;
  255. }
  256. static CSphIOStats * GetIOStats ()
  257. {
  258. if ( !g_bCollectIOStats )
  259. return NULL;
  260. CSphIOStats * pIOStats = (CSphIOStats *)sphThreadGet ( g_tIOStatsTls );
  261. if ( !pIOStats || !pIOStats->IsEnabled() )
  262. return NULL;
  263. else
  264. return pIOStats;
  265. }
  266. static size_t sphRead ( int iFD, void * pBuf, size_t iCount )
  267. {
  268. CSphIOStats * pIOStats = GetIOStats();
  269. int64_t tmStart = 0;
  270. if ( pIOStats )
  271. tmStart = sphMicroTimer();
  272. size_t uRead = (size_t) ::read ( iFD, pBuf, iCount );
  273. if ( pIOStats )
  274. {
  275. pIOStats->m_iReadTime += sphMicroTimer() - tmStart;
  276. pIOStats->m_iReadOps++;
  277. pIOStats->m_iReadBytes += iCount;
  278. }
  279. return uRead;
  280. }
  281. static bool GetFileStats ( const char * szFilename, CSphSavedFile & tInfo );
  282. /////////////////////////////////////////////////////////////////////////////
  283. // INTERNAL SPHINX CLASSES DECLARATIONS
  284. /////////////////////////////////////////////////////////////////////////////
  285. CSphAutofile::CSphAutofile ()
  286. : m_iFD ( -1 )
  287. , m_bTemporary ( false )
  288. , m_bWouldTemporary ( false )
  289. , m_pStat ( NULL )
  290. {
  291. }
  292. CSphAutofile::CSphAutofile ( const CSphString & sName, int iMode, CSphString & sError, bool bTemp )
  293. : m_iFD ( -1 )
  294. , m_bTemporary ( false )
  295. , m_bWouldTemporary ( false )
  296. , m_pStat ( NULL )
  297. {
  298. Open ( sName, iMode, sError, bTemp );
  299. }
  300. CSphAutofile::~CSphAutofile ()
  301. {
  302. Close ();
  303. }
  304. int CSphAutofile::Open ( const CSphString & sName, int iMode, CSphString & sError, bool bTemp )
  305. {
  306. assert ( m_iFD==-1 && m_sFilename.IsEmpty () );
  307. assert ( !sName.IsEmpty() );
  308. #if USE_WINDOWS
  309. if ( iMode==SPH_O_READ )
  310. {
  311. intptr_t tFD = (intptr_t)CreateFile ( sName.cstr(), GENERIC_READ , FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL );
  312. m_iFD = _open_osfhandle ( tFD, 0 );
  313. } else
  314. m_iFD = ::open ( sName.cstr(), iMode, 0644 );
  315. #else
  316. m_iFD = ::open ( sName.cstr(), iMode, 0644 );
  317. #endif
  318. m_sFilename = sName; // not exactly sure why is this uncoditional. for error reporting later, i suppose
  319. if ( m_iFD<0 )
  320. sError.SetSprintf ( "failed to open %s: %s", sName.cstr(), strerror(errno) );
  321. else
  322. {
  323. m_bTemporary = bTemp; // only if we managed to actually open it
  324. m_bWouldTemporary = true; // if a shit happen - we could delete the file.
  325. }
  326. return m_iFD;
  327. }
  328. void CSphAutofile::Close ()
  329. {
  330. if ( m_iFD>=0 )
  331. {
  332. ::close ( m_iFD );
  333. if ( m_bTemporary )
  334. ::unlink ( m_sFilename.cstr() );
  335. }
  336. m_iFD = -1;
  337. m_sFilename = "";
  338. m_bTemporary = false;
  339. m_bWouldTemporary = false;
  340. }
  341. void CSphAutofile::SetTemporary()
  342. {
  343. m_bTemporary = m_bWouldTemporary;
  344. }
  345. const char * CSphAutofile::GetFilename () const
  346. {
  347. assert ( m_sFilename.cstr() );
  348. return m_sFilename.cstr();
  349. }
  350. SphOffset_t CSphAutofile::GetSize ( SphOffset_t iMinSize, bool bCheckSizeT, CSphString & sError )
  351. {
  352. struct_stat st;
  353. if ( stat ( GetFilename(), &st )<0 )
  354. {
  355. sError.SetSprintf ( "failed to stat %s: %s", GetFilename(), strerror(errno) );
  356. return -1;
  357. }
  358. if ( st.st_size<iMinSize )
  359. {
  360. sError.SetSprintf ( "failed to load %s: bad size "INT64_FMT" (at least "INT64_FMT" bytes expected)",
  361. GetFilename(), (int64_t)st.st_size, (int64_t)iMinSize );
  362. return -1;
  363. }
  364. if ( bCheckSizeT )
  365. {
  366. size_t sCheck = (size_t)st.st_size;
  367. if ( st.st_size!=SphOffset_t(sCheck) )
  368. {
  369. sError.SetSprintf ( "failed to load %s: bad size "INT64_FMT" (out of size_t; 4 GB limit on 32-bit machine hit?)",
  370. GetFilename(), (int64_t)st.st_size );
  371. return -1;
  372. }
  373. }
  374. return st.st_size;
  375. }
  376. SphOffset_t CSphAutofile::GetSize ()
  377. {
  378. CSphString sTmp;
  379. return GetSize ( 0, false, sTmp );
  380. }
  381. bool CSphAutofile::Read ( void * pBuf, int64_t iCount, CSphString & sError )
  382. {
  383. int64_t iToRead = iCount;
  384. BYTE * pCur = (BYTE *)pBuf;
  385. while ( iToRead>0 )
  386. {
  387. int64_t iToReadOnce = ( m_pStat )
  388. ? Min ( SPH_READ_PROGRESS_CHUNK, iToRead )
  389. : Min ( SPH_READ_NOPROGRESS_CHUNK, iToRead );
  390. int64_t iGot = (int64_t) sphRead ( GetFD(), pCur, (size_t)iToReadOnce );
  391. if ( iGot<=0 )
  392. break;
  393. iToRead -= iGot;
  394. pCur += iGot;
  395. if ( m_pStat )
  396. {
  397. m_pStat->m_iBytes += iGot;
  398. m_pStat->Show ( false );
  399. }
  400. }
  401. if ( iToRead!=0 )
  402. {
  403. sError.SetSprintf ( "read error in %s; "INT64_FMT" of "INT64_FMT" bytes read",
  404. GetFilename(), iCount-iToRead, iCount );
  405. return false;
  406. }
  407. return true;
  408. }
  409. void CSphAutofile::SetProgressCallback ( CSphIndexProgress * pStat )
  410. {
  411. m_pStat = pStat;
  412. }
  413. /////////////////////////////////////////////////////////////////////////////
  /// generic stateless priority queue
  /// a fixed-capacity binary heap; the root is the least entry according to COMP::IsLess
  template < typename T, typename COMP > class CSphQueue
  {
  protected:
      T * m_pData; ///< heap storage, m_iSize entries
      int m_iUsed; ///< entries currently in the heap
      int m_iSize; ///< capacity

  public:
      /// ctor, allocates storage for iSize entries
      explicit CSphQueue ( int iSize )
          : m_iUsed ( 0 )
          , m_iSize ( iSize )
      {
          assert ( iSize>0 );
          m_pData = new T [ iSize ];
          assert ( m_pData );
      }

      /// dtor, frees the storage
      virtual ~CSphQueue ()
      {
          SafeDeleteArray ( m_pData );
      }

      /// add entry to the queue
      /// when the queue is full, an entry that is less than the root is dropped,
      /// otherwise the root is popped to make room
      /// returns true in every case
      virtual bool Push ( const T & tEntry )
      {
          if ( m_iUsed==m_iSize )
          {
              if ( COMP::IsLess ( tEntry, m_pData[0] ) )
                  return true;
              Pop ();
          }

          // append at the bottom
          int iCur = m_iUsed;
          m_pData [ iCur ] = tEntry;
          m_iUsed++;

          // sift up for as long as the entry is less than its parent
          while ( iCur>0 )
          {
              const int iUp = ( iCur-1 ) >> 1;
              if ( !COMP::IsLess ( m_pData[iCur], m_pData[iUp] ) )
                  break;
              Swap ( m_pData[iCur], m_pData[iUp] );
              iCur = iUp;
          }
          return true;
      }

      /// remove root (ie. top priority) entry
      virtual void Pop ()
      {
          assert ( m_iUsed );
          m_iUsed--;
          if ( m_iUsed==0 )
              return; // that was the last entry

          // the last entry becomes the new root
          m_pData[0] = m_pData[m_iUsed];

          // sift it down
          int iCur = 0;
          int iKid = 1;
          while ( iKid<m_iUsed )
          {
              // pick the smaller of the two children
              if ( iKid+1<m_iUsed && COMP::IsLess ( m_pData[iKid+1], m_pData[iKid] ) )
                  iKid++;

              // stop once no child is less than the entry
              if ( !COMP::IsLess ( m_pData[iKid], m_pData[iCur] ) )
                  break;

              Swap ( m_pData[iKid], m_pData[iCur] );
              iCur = iKid;
              iKid = ( iCur<<1 ) + 1;
          }
      }

      /// get entries count
      inline int GetLength () const
      {
          return m_iUsed;
      }

      /// get current root; the queue must not be empty
      inline const T & Root () const
      {
          assert ( m_iUsed );
          return m_pData[0];
      }
  };
  504. //////////////////////////////////////////////////////////////////////////
  /// possible bin states
  /// negative values are end/error conditions, non-negative ones say what the bin expects next
  enum ESphBinState
  {
      BIN_ERR_READ = -2, ///< bin read error
      BIN_ERR_END = -1, ///< bin end
      BIN_POS = 0, ///< "expects pos delta" state
      BIN_DOC = 1, ///< "expects doc delta" state
      BIN_WORD = 2 ///< "expects word delta" state
  };
  /// bin read and precache result codes
  enum ESphBinRead
  {
      BIN_READ_OK = 0, ///< bin read ok
      BIN_READ_EOF = 1, ///< bin end
      BIN_READ_ERROR = 2, ///< bin read error
      BIN_PRECACHE_OK = 3, ///< precache ok
      BIN_PRECACHE_ERROR = 4 ///< precache failed
  };
  522. /// aggregated hit info
  523. struct CSphAggregateHit
  524. {
  525. SphDocID_t m_iDocID; ///< document ID
  526. SphWordID_t m_iWordID; ///< word ID in current dictionary
  527. BYTE * m_sKeyword; ///< word itself (in keywords dictionary case only)
  528. Hitpos_t m_iWordPos; ///< word position in current document, or hit count in case of aggregate hit
  529. CSphSmallBitvec m_dFieldMask; ///< mask of fields containing this word, 0 for regular hits, non-0 for aggregate hits
  530. CSphAggregateHit()
  531. : m_iDocID ( 0 )
  532. , m_iWordID ( 0 )
  533. , m_sKeyword ( NULL )
  534. {}
  535. int GetAggrCount () const
  536. {
  537. assert ( !m_dFieldMask.TestAll ( false ) );
  538. return m_iWordPos;
  539. }
  540. void SetAggrCount ( int iVal )
  541. {
  542. m_iWordPos = iVal;
  543. }
  544. };
  545. static const int MAX_KEYWORD_BYTES = SPH_MAX_WORD_LEN*3+4;
  546. /// bin, block input buffer
  547. struct CSphBin
  548. {
  549. static const int MIN_SIZE = 8192;
  550. static const int WARN_SIZE = 262144;
  551. protected:
  552. ESphHitless m_eMode;
  553. int m_iSize;
  554. BYTE * m_dBuffer;
  555. BYTE * m_pCurrent;
  556. int m_iLeft;
  557. int m_iDone;
  558. ESphBinState m_eState;
  559. bool m_bWordDict;
  560. bool m_bError; // FIXME? sort of redundant, but states are a mess
  561. CSphAggregateHit m_tHit; ///< currently decoded hit
  562. BYTE m_sKeyword [ MAX_KEYWORD_BYTES ]; ///< currently decoded hit keyword (in keywords dict mode)
  563. #ifndef NDEBUG
  564. SphWordID_t m_iLastWordID;
  565. BYTE m_sLastKeyword [ MAX_KEYWORD_BYTES ];
  566. #endif
  567. int m_iFile; ///< my file
  568. SphOffset_t * m_pFilePos; ///< shared current offset in file
  569. ThrottleState_t * m_pThrottle;
  570. public:
  571. SphOffset_t m_iFilePos; ///< my current offset in file
  572. int m_iFileLeft; ///< how much data is still unread from the file
  573. public:
  574. explicit CSphBin ( ESphHitless eMode = SPH_HITLESS_NONE, bool bWordDict = false );
  575. ~CSphBin ();
  576. static int CalcBinSize ( int iMemoryLimit, int iBlocks, const char * sPhase, bool bWarn = true );
  577. void Init ( int iFD, SphOffset_t * pSharedOffset, const int iBinSize );
  578. SphWordID_t ReadVLB ();
  579. int ReadByte ();
  580. ESphBinRead ReadBytes ( void * pDest, int iBytes );
  581. int ReadHit ( CSphAggregateHit * pHit, int iRowitems, CSphRowitem * pRowitems );
  582. DWORD UnzipInt ();
  583. SphOffset_t UnzipOffset ();
  584. bool IsEOF () const;
  585. bool IsDone () const;
  586. bool IsError () const { return m_bError; }
  587. ESphBinRead Precache ();
  588. void SetThrottle ( ThrottleState_t * pState ) { m_pThrottle = pState; }
  589. };
  590. /////////////////////////////////////////////////////////////////////////////
  591. class CSphIndex_VLN;
  592. /// everything required to setup search term
  593. class DiskIndexQwordSetup_c : public ISphQwordSetup
  594. {
  595. public:
  596. const CSphAutofile & m_tDoclist;
  597. const CSphAutofile & m_tHitlist;
  598. const CSphAutofile & m_tWordlist;
  599. bool m_bSetupReaders;
  600. const BYTE * m_pSkips;
  601. BYTE * m_pDictBuf;
  602. CSphQueryProfile * m_pProfile;
  603. public:
  604. DiskIndexQwordSetup_c ( const CSphAutofile & tDoclist, const CSphAutofile & tHitlist, const CSphAutofile & tWordlist, int iDictBufSize, const BYTE * pSkips, CSphQueryProfile * pProfile )
  605. : m_tDoclist ( tDoclist )
  606. , m_tHitlist ( tHitlist )
  607. , m_tWordlist ( tWordlist )
  608. , m_bSetupReaders ( false )
  609. , m_pSkips ( pSkips )
  610. , m_pDictBuf ( NULL )
  611. , m_pProfile ( pProfile )
  612. {
  613. if ( iDictBufSize>0 )
  614. m_pDictBuf = new BYTE [iDictBufSize];
  615. }
  616. virtual ~DiskIndexQwordSetup_c()
  617. {
  618. SafeDeleteArray ( m_pDictBuf );
  619. }
  620. virtual ISphQword * QwordSpawn ( const XQKeyword_t & tWord ) const;
  621. virtual bool QwordSetup ( ISphQword * ) const;
  622. protected:
  623. template < class T > bool Setup ( ISphQword * ) const;
  624. };
  625. #if USE_WINDOWS
  626. #pragma warning(disable:4127) // conditional expr is const for MSVC
  627. #endif
  628. /// query word from the searcher's point of view
  629. class DiskIndexQwordTraits_c : public ISphQword
  630. {
  631. static const int MINIBUFFER_LEN = 1024;
  632. public:
  633. /// tricky bit
  634. /// m_uHitPosition is always a current position in the .spp file
  635. /// base ISphQword::m_iHitlistPos carries the inlined hit data when m_iDocs==1
  636. /// but this one is always a real position, used for delta coding
  637. SphOffset_t m_uHitPosition;
  638. Hitpos_t m_uInlinedHit;
  639. DWORD m_uHitState;
  640. bool m_bDupe; ///< whether the word occurs only once in current query
  641. CSphMatch m_tDoc; ///< current match (partial)
  642. Hitpos_t m_iHitPos; ///< current hit postition, from hitlist
  643. BYTE m_dDoclistBuf [ MINIBUFFER_LEN ];
  644. BYTE m_dHitlistBuf [ MINIBUFFER_LEN ];
  645. CSphReader m_rdDoclist; ///< my doclist reader
  646. CSphReader m_rdHitlist; ///< my hitlist reader
  647. SphDocID_t m_iMinID; ///< min ID to fixup
  648. int m_iInlineAttrs; ///< inline attributes count
  649. const CSphRowitem * m_pInlineFixup; ///< inline attributes fixup (POINTER TO EXTERNAL DATA, NOT MANAGED BY THIS CLASS!)
  650. #ifndef NDEBUG
  651. bool m_bHitlistOver;
  652. #endif
  653. public:
  654. explicit DiskIndexQwordTraits_c ( bool bUseMini, bool bExcluded )
  655. : m_uHitPosition ( 0 )
  656. , m_uHitState ( 0 )
  657. , m_bDupe ( false )
  658. , m_iHitPos ()
  659. , m_rdDoclist ( bUseMini ? m_dDoclistBuf : NULL, bUseMini ? MINIBUFFER_LEN : 0 )
  660. , m_rdHitlist ( bUseMini ? m_dHitlistBuf : NULL, bUseMini ? MINIBUFFER_LEN : 0 )
  661. , m_iMinID ( 0 )
  662. , m_iInlineAttrs ( 0 )
  663. , m_pInlineFixup ( NULL )
  664. #ifndef NDEBUG
  665. , m_bHitlistOver ( true )
  666. #endif
  667. {
  668. m_iHitPos = EMPTY_HIT;
  669. m_bExcluded = bExcluded;
  670. }
  671. };
  672. bool operator < ( const SkiplistEntry_t & a, SphDocID_t b ) { return a.m_iBaseDocid<b; }
  673. bool operator == ( const SkiplistEntry_t & a, SphDocID_t b ) { return a.m_iBaseDocid==b; }
  674. bool operator < ( SphDocID_t a, const SkiplistEntry_t & b ) { return a<b.m_iBaseDocid; }
  675. /// query word from the searcher's point of view
  676. template < bool INLINE_HITS, bool INLINE_DOCINFO, bool DISABLE_HITLIST_SEEK >
  677. class DiskIndexQword_c : public DiskIndexQwordTraits_c
  678. {
  679. public:
  680. explicit DiskIndexQword_c ( bool bUseMinibuffer, bool bExcluded )
  681. : DiskIndexQwordTraits_c ( bUseMinibuffer, bExcluded )
  682. {}
  683. virtual void Reset ()
  684. {
  685. m_uHitPosition = 0;
  686. m_uHitState = 0;
  687. m_rdDoclist.Reset ();
  688. m_rdDoclist.Reset ();
  689. ISphQword::Reset();
  690. m_iHitPos = EMPTY_HIT;
  691. m_iInlineAttrs = 0;
  692. }
  693. void GetHitlistEntry ()
  694. {
  695. assert ( !m_bHitlistOver );
  696. DWORD iDelta = m_rdHitlist.UnzipInt ();
  697. if ( iDelta )
  698. {
  699. m_iHitPos += iDelta;
  700. } else
  701. {
  702. m_iHitPos = EMPTY_HIT;
  703. #ifndef NDEBUG
  704. m_bHitlistOver = true;
  705. #endif
  706. }
  707. }
  708. virtual void HintDocid ( SphDocID_t uMinID )
  709. {
  710. // tricky bit
  711. // FindSpan() will match a block where BaseDocid is >= RefValue
  712. // meaning that the subsequent ids decoded will be strictly > RefValue
  713. // meaning that if previous (!) blocks end with uMinID exactly,
  714. // and we use uMinID itself as RefValue, that document gets lost!
  715. // OPTIMIZE? keep last matched block index maybe?
  716. int iBlock = FindSpan ( m_dSkiplist, uMinID - m_iMinID - 1 );
  717. if ( iBlock<0 )
  718. return;
  719. const SkiplistEntry_t & t = m_dSkiplist [ iBlock ];
  720. if ( t.m_iOffset<=m_rdDoclist.GetPos() )
  721. return;
  722. m_rdDoclist.SeekTo ( t.m_iOffset, -1 );
  723. m_tDoc.m_iDocID = t.m_iBaseDocid + m_iMinID;
  724. m_uHitPosition = m_iHitlistPos = t.m_iBaseHitlistPos;
  725. }
  726. virtual const CSphMatch & GetNextDoc ( DWORD * pDocinfo )
  727. {
  728. SphDocID_t iDelta = m_rdDoclist.UnzipDocid();
  729. if ( iDelta )
  730. {
  731. m_bAllFieldsKnown = false;
  732. m_tDoc.m_iDocID += iDelta;
  733. if ( INLINE_DOCINFO )
  734. {
  735. assert ( pDocinfo );
  736. for ( int i=0; i<m_iInlineAttrs; i++ )
  737. pDocinfo[i] = m_rdDoclist.UnzipInt() + m_pInlineFixup[i];
  738. }
  739. if ( INLINE_HITS )
  740. {
  741. m_uMatchHits = m_rdDoclist.UnzipInt();
  742. const DWORD uFirst = m_rdDoclist.UnzipInt();
  743. if ( m_uMatchHits==1 && m_bHasHitlist )
  744. {
  745. const DWORD uField = m_rdDoclist.UnzipInt(); // field and end marker
  746. m_iHitlistPos = uFirst | ( uField << 23 ) | ( U64C(1)<<63 );
  747. m_dQwordFields.Unset();
  748. m_dQwordFields.Set ( uField >> 1 );
  749. m_bAllFieldsKnown = true;
  750. } else
  751. {
  752. m_dQwordFields.Assign32 ( uFirst );
  753. m_uHitPosition += m_rdDoclist.UnzipOffset();
  754. m_iHitlistPos = m_uHitPosition;
  755. }
  756. } else
  757. {
  758. SphOffset_t iDeltaPos = m_rdDoclist.UnzipOffset();
  759. assert ( iDeltaPos>=0 );
  760. m_iHitlistPos += iDeltaPos;
  761. m_dQwordFields.Assign32 ( m_rdDoclist.UnzipInt() );
  762. m_uMatchHits = m_rdDoclist.UnzipInt();
  763. }
  764. } else
  765. {
  766. m_tDoc.m_iDocID = 0;
  767. }
  768. return m_tDoc;
  769. }
  770. virtual void SeekHitlist ( SphOffset_t uOff )
  771. {
  772. if ( uOff >> 63 )
  773. {
  774. m_uHitState = 1;
  775. m_uInlinedHit = (DWORD)uOff; // truncate high dword
  776. } else
  777. {
  778. m_uHitState = 0;
  779. m_iHitPos = EMPTY_HIT;
  780. if ( DISABLE_HITLIST_SEEK )
  781. assert ( m_rdHitlist.GetPos()==uOff ); // make sure we're where caller thinks we are.
  782. else
  783. m_rdHitlist.SeekTo ( uOff, READ_NO_SIZE_HINT );
  784. }
  785. #ifndef NDEBUG
  786. m_bHitlistOver = false;
  787. #endif
  788. }
  789. virtual Hitpos_t GetNextHit ()
  790. {
  791. assert ( m_bHasHitlist );
  792. switch ( m_uHitState )
  793. {
  794. case 0: // read hit from hitlist
  795. GetHitlistEntry ();
  796. return m_iHitPos;
  797. case 1: // return inlined hit
  798. m_uHitState = 2;
  799. return m_uInlinedHit;
  800. case 2: // return end-of-hitlist marker after inlined hit
  801. #ifndef NDEBUG
  802. m_bHitlistOver = true;
  803. #endif
  804. m_uHitState = 0;
  805. return EMPTY_HIT;
  806. }
  807. sphDie ( "INTERNAL ERROR: impossible hit emitter state" );
  808. return EMPTY_HIT;
  809. }
  810. };
  811. #if USE_WINDOWS
  812. #pragma warning(default:4127) // conditional expr is const for MSVC
  813. #endif
  814. //////////////////////////////////////////////////////////////////////////////
  /// run ACTION with NAME typedef'd to the DiskIndexQword_c instantiation
  /// that matches the hit format and docinfo storage settings of INDEX;
  /// NO_SEEK is forwarded as the DISABLE_HITLIST_SEEK template argument
  #define WITH_QWORD(INDEX, NO_SEEK, NAME, ACTION) \
  { \
      CSphIndex_VLN * pIndex = (CSphIndex_VLN *)INDEX; \
      DWORD uInlineHits = pIndex->m_tSettings.m_eHitFormat==SPH_HIT_FORMAT_INLINE; \
      DWORD uInlineDocinfo = pIndex->m_tSettings.m_eDocinfo==SPH_DOCINFO_INLINE; \
      \
      switch ( ( uInlineHits<<1 ) | uInlineDocinfo ) \
      { \
          case 0: \
          { \
              typedef DiskIndexQword_c < false, false, NO_SEEK > NAME; \
              ACTION; \
              break; \
          } \
          case 1: \
          { \
              typedef DiskIndexQword_c < false, true, NO_SEEK > NAME; \
              ACTION; \
              break; \
          } \
          case 2: \
          { \
              typedef DiskIndexQword_c < true, false, NO_SEEK > NAME; \
              ACTION; \
              break; \
          } \
          case 3: \
          { \
              typedef DiskIndexQword_c < true, true, NO_SEEK > NAME; \
              ACTION; \
              break; \
          } \
          default: \
              sphDie ( "INTERNAL ERROR: impossible qword settings" ); \
      } \
  }
  831. /////////////////////////////////////////////////////////////////////////////
  832. struct CSphWordlistCheckpoint
  833. {
  834. union
  835. {
  836. SphWordID_t m_iWordID;
  837. const char * m_sWord;
  838. };
  839. SphOffset_t m_iWordlistOffset;
  840. };
  841. // pre-v11 wordlist checkpoint
  842. struct CSphWordlistCheckpoint_v10
  843. {
  844. SphWordID_t m_iWordID;
  845. DWORD m_iWordlistOffset;
  846. };
  847. /////////////////////////////////////////////////////////////////////////////
  848. /// ordinals accumulation and sorting
  849. struct Ordinal_t
  850. {
  851. SphDocID_t m_uDocID; ///< doc id
  852. CSphString m_sValue; ///< string value
  853. };
  854. struct OrdinalEntry_t : public Ordinal_t
  855. {
  856. int m_iTag;
  857. };
  858. struct OrdinalId_t
  859. {
  860. SphDocID_t m_uDocID;
  861. DWORD m_uId;
  862. };
  863. struct OrdinalIdEntry_t : public OrdinalId_t
  864. {
  865. int m_iTag;
  866. };
  867. void Swap ( Ordinal_t & a, Ordinal_t & b )
  868. {
  869. Swap ( a.m_uDocID, b.m_uDocID );
  870. Swap ( a.m_sValue, b.m_sValue );
  871. }
  872. void Swap ( OrdinalEntry_t & a, OrdinalEntry_t & b )
  873. {
  874. Swap ( a.m_uDocID, b.m_uDocID );
  875. Swap ( a.m_sValue, b.m_sValue );
  876. Swap ( a.m_iTag, b.m_iTag );
  877. }
  878. //////////////////////////////////////////////////////////////////////////
  879. static void ReadFileInfo ( CSphReader & tReader, const char * szFilename, CSphSavedFile & tFile, CSphString * sWarning )
  880. {
  881. tFile.m_uSize = tReader.GetOffset ();
  882. tFile.m_uCTime = tReader.GetOffset ();
  883. tFile.m_uMTime = tReader.GetOffset ();
  884. tFile.m_uCRC32 = tReader.GetDword ();
  885. tFile.m_sFilename = szFilename;
  886. if ( szFilename && *szFilename && sWarning )
  887. {
  888. struct_stat tFileInfo;
  889. if ( stat ( szFilename, &tFileInfo ) < 0 )
  890. sWarning->SetSprintf ( "failed to stat %s: %s", szFilename, strerror(errno) );
  891. else
  892. {
  893. DWORD uMyCRC32 = 0;
  894. if ( !sphCalcFileCRC32 ( szFilename, uMyCRC32 ) )
  895. sWarning->SetSprintf ( "failed to calculate CRC32 for %s", szFilename );
  896. else
  897. if ( uMyCRC32!=tFile.m_uCRC32 || tFileInfo.st_size!=tFile.m_uSize
  898. || tFileInfo.st_ctime!=tFile.m_uCTime || tFileInfo.st_mtime!=tFile.m_uMTime )
  899. sWarning->SetSprintf ( "'%s' differs from the original", szFilename );
  900. }
  901. }
  902. }
  903. static void WriteFileInfo ( CSphWriter & tWriter, const CSphSavedFile & tInfo )
  904. {
  905. tWriter.PutOffset ( tInfo.m_uSize );
  906. tWriter.PutOffset ( tInfo.m_uCTime );
  907. tWriter.PutOffset ( tInfo.m_uMTime );
  908. tWriter.PutDword ( tInfo.m_uCRC32 );
  909. }
  910. /// dict=keywords block reader
  911. class KeywordsBlockReader_c : public CSphDictEntry
  912. {
  913. private:
  914. const BYTE * m_pBuf;
  915. BYTE m_sWord [ MAX_KEYWORD_BYTES ];
  916. int m_iLen;
  917. BYTE m_uHint;
  918. bool m_bHaveSkips;
  919. public:
  920. explicit KeywordsBlockReader_c ( const BYTE * pBuf, bool bHaveSkiplists );
  921. bool UnpackWord();
  922. const char * GetWord() const { return (const char*)m_sWord; }
  923. int GetWordLen() const { return m_iLen; }
  924. };
  925. // dictionary header
  926. struct DictHeader_t
  927. {
  928. int m_iDictCheckpoints; ///< how many dict checkpoints (keyword blocks) are there
  929. SphOffset_t m_iDictCheckpointsOffset; ///< dict checkpoints file position
  930. int m_iInfixCodepointBytes; ///< max bytes per infix codepoint (0 means no infixes)
  931. int m_iInfixBlocksOffset; ///< infix blocks file position (32bit as keywords dictionary is pretty small)
  932. int m_iInfixBlocksWordsSize; ///< infix checkpoints size
  933. DictHeader_t()
  934. : m_iDictCheckpoints ( 0 )
  935. , m_iDictCheckpointsOffset ( 0 )
  936. , m_iInfixCodepointBytes ( 0 )
  937. , m_iInfixBlocksOffset ( 0 )
  938. , m_iInfixBlocksWordsSize ( 0 )
  939. {}
  940. };
  941. // !COMMIT eliminate this, move it to proper dict impls
  942. class CWordlist : public ISphWordlist, public DictHeader_t
  943. {
  944. public:
  945. CSphFixedVector<CSphWordlistCheckpoint> m_dCheckpoints; ///< checkpoint offsets
  946. CSphVector<InfixBlock_t> m_dInfixBlocks;
  947. CSphAutofile m_tFile; ///< file
  948. int64_t m_iSize; ///< file size
  949. CSphSharedBuffer<BYTE> m_pBuf; ///< my cache
  950. int m_iMaxChunk; ///< max size of entry between checkpoints
  951. SphOffset_t m_iWordsEnd; ///< end of wordlist
  952. bool m_bHaveSkips; ///< whether there are skiplists
  953. BYTE * m_pWords; ///< arena for checkpoint's words
  954. BYTE * m_pInfixBlocksWords; ///< arena for infix checkpoint's words
  955. public:
  956. explicit CWordlist ();
  957. ~CWordlist ();
  958. void Reset ();
  959. bool ReadCP ( CSphAutofile & tFile, DWORD uVersion, bool bWordDict, CSphString & sError );
  960. const CSphWordlistCheckpoint * FindCheckpoint ( const char * sWord, int iWordLen, SphWordID_t iWordID, bool bStarMode ) const;
  961. bool GetWord ( const BYTE * pBuf, SphWordID_t iWordID, CSphDictEntry & tWord ) const;
  962. const BYTE * AcquireDict ( const CSphWordlistCheckpoint * pCheckpoint, int iFD, BYTE * pDictBuf ) const;
  963. virtual void GetPrefixedWords ( const char * sPrefix, int iPrefixLen, const char * sWildcard, CSphVector<CSphNamedInt> & dExpanded, BYTE * pDictBuf, int iFD ) const;
  964. virtual void GetInfixedWords ( const char * sInfix, int iInfix, const char * sWildcard, CSphVector<CSphNamedInt> & dPrefixedWords ) const;
  965. private:
  966. bool m_bWordDict;
  967. };
  968. class CSphHitBuilder;
  969. struct BuildHeader_t : public CSphSourceStats, public DictHeader_t
  970. {
  971. explicit BuildHeader_t ( const CSphSourceStats & tStat )
  972. : m_sHeaderExtension ( NULL )
  973. , m_pThrottle ( NULL )
  974. , m_pMinRow ( NULL )
  975. , m_iMinDocid ( 0 )
  976. , m_iKillListSize ( 0 )
  977. , m_uMinMaxIndex ( 0 )
  978. {
  979. m_iTotalDocuments = tStat.m_iTotalDocuments;
  980. m_iTotalBytes = tStat.m_iTotalBytes;
  981. }
  982. const char * m_sHeaderExtension;
  983. ThrottleState_t * m_pThrottle;
  984. const CSphRowitem * m_pMinRow;
  985. SphDocID_t m_iMinDocid;
  986. DWORD m_iKillListSize;
  987. int64_t m_uMinMaxIndex;
  988. };
  989. /// this is my actual VLN-compressed phrase index implementation
  990. class CSphIndex_VLN : public CSphIndex
  991. {
  992. friend class DiskIndexQwordSetup_c;
  993. friend class CSphMerger;
  994. friend class AttrIndexBuilder_t<SphDocID_t>;
  995. public:
  996. explicit CSphIndex_VLN ( const char* sIndexName, const char * sFilename );
  997. ~CSphIndex_VLN ();
  998. virtual int Build ( const CSphVector<CSphSource*> & dSources, int iMemoryLimit, int iWriteBuffer );
  999. virtual void SetProgressCallback ( CSphIndexProgress::IndexingProgress_fn pfnProgress ) { m_tProgress.m_fnProgress = pfnProgress; }
  1000. virtual bool LoadHeader ( const char * sHeaderName, bool bStripPath, CSphString & sWarning );
  1001. virtual bool WriteHeader ( const BuildHeader_t & tBuildHeader, CSphWriter & fdInfo ) const;
  1002. virtual void DebugDumpHeader ( FILE * fp, const char * sHeaderName, bool bConfig );
  1003. virtual void DebugDumpDocids ( FILE * fp );
  1004. virtual void DebugDumpHitlist ( FILE * fp, const char * sKeyword, bool bID );
  1005. virtual void DebugDumpDict ( FILE * fp );
  1006. virtual int DebugCheck ( FILE * fp );
  1007. template <class Qword> void DumpHitlist ( FILE * fp, const char * sKeyword, bool bID );
  1008. virtual bool Prealloc ( bool bMlock, bool bStripPath, CSphString & sWarning );
  1009. virtual bool Mlock ();
  1010. virtual void Dealloc ();
  1011. virtual bool Preread ();
  1012. template<typename T> bool PrereadSharedBuffer ( CSphSharedBuffer<T> & pBuffer, const char * sExt, int64_t iExpected=0, int64_t iOffset=0 );
  1013. virtual void SetBase ( const char * sNewBase );
  1014. virtual bool Rename ( const char * sNewBase );
  1015. virtual bool Lock ();
  1016. virtual void Unlock ();
  1017. virtual void PostSetup() {}
  1018. virtual bool MultiQuery ( const CSphQuery * pQuery, CSphQueryResult * pResult, int iSorters, ISphMatchSorter ** ppSorters, const CSphVector<CSphFilterSettings> * pExtraFilters, int iTag, bool bFactors ) const;
  1019. virtual bool MultiQueryEx ( int iQueries, const CSphQuery * pQueries, CSphQueryResult ** ppResults, ISphMatchSorter ** ppSorters, const CSphVector<CSphFilterSettings> * pExtraFilters, int iTag, bool bFactors ) const;
  1020. virtual bool GetKeywords ( CSphVector <CSphKeywordInfo> & dKeywords, const char * szQuery, bool bGetStats, CSphString & sError ) const;
  1021. template <class Qword> bool DoGetKeywords ( CSphVector <CSphKeywordInfo> & dKeywords, const char * szQuery, bool bGetStats, bool bFillOnly, CSphString & sError ) const;
  1022. virtual bool FillKeywords ( CSphVector <CSphKeywordInfo> & dKeywords, CSphString & sError ) const;
  1023. virtual bool Merge ( CSphIndex * pSource, const CSphVector<CSphFilterSettings> & dFilters, bool bMergeKillLists );
  1024. template <class QWORDDST, class QWORDSRC>
  1025. static bool MergeWords ( const CSphIndex_VLN * pDstIndex, const CSphIndex_VLN * pSrcIndex, const ISphFilter * pFilter, SphDocID_t iMinID, CSphHitBuilder * pHitBuilder, CSphString & sError, CSphSourceStats & tStat, CSphIndexProgress & tProgress, ThrottleState_t * pThrottle );
  1026. static bool DoMerge ( const CSphIndex_VLN * pDstIndex, const CSphIndex_VLN * pSrcIndex, bool bMergeKillLists, ISphFilter * pFilter, CSphString & sError, CSphIndexProgress & tProgress, ThrottleState_t * pThrottle );
  1027. virtual int UpdateAttributes ( const CSphAttrUpdate & tUpd, int iIndex, CSphString & sError );
  1028. virtual bool SaveAttributes ( CSphString & sError ) const;
  1029. virtual DWORD GetAttributeStatus () const;
  1030. bool EarlyReject ( CSphQueryContext * pCtx, CSphMatch & tMatch ) const;
  1031. virtual void SetKeepAttrs ( bool bKeepAttrs ) { m_bKeepAttrs = bKeepAttrs; }
  1032. virtual SphAttr_t * GetKillList () const;
  1033. virtual int GetKillListSize () const { return m_iKillListSize; }
  1034. virtual bool HasDocid ( SphDocID_t uDocid ) const;
  1035. virtual const CSphSourceStats & GetStats () const { return m_tStats; }
  1036. virtual int64_t * GetFieldLens() const { return m_tSettings.m_bIndexFieldLens ? m_dFieldLens.Begin() : NULL; }
  1037. virtual CSphIndexStatus GetStatus () const;
  1038. private:
  1039. static const int MIN_WRITE_BUFFER = 262144; ///< min write buffer size
  1040. static const int DEFAULT_WRITE_BUFFER = 1048576; ///< default write buffer size
  1041. private:
  1042. // common stuff
  1043. int m_iLockFD;
  1044. CSphSourceStats m_tStats; ///< my stats
  1045. CSphFixedVector<CSphRowitem> m_dMinRow;
  1046. SphDocID_t m_iMinDocid;
  1047. CSphFixedVector<int64_t> m_dFieldLens; ///< total per-field lengths summed over entire indexed data, in tokens
  1048. private:
  1049. CSphIndexProgress m_tProgress;
  1050. bool LoadHitlessWords ( CSphVector<SphWordID_t> & dHitlessWords );
  1051. private:
  1052. // searching-only, per-index
  1053. static const int DOCINFO_HASH_BITS = 18; // FIXME! make this configurable
  1054. CSphSharedBuffer<DWORD> m_pDocinfo; ///< my docinfo cache
  1055. int64_t m_iDocinfo; ///< my docinfo cache size
  1056. CSphSharedBuffer<DWORD> m_pDocinfoHash; ///< hashed ids, to accelerate lookups
  1057. int64_t m_iDocinfoIndex; ///< docinfo "index" entries count (each entry is 2x docinfo rows, for min/max)
  1058. DWORD * m_pDocinfoIndex; ///< docinfo "index", to accelerate filtering during full-scan (2x rows for each block, and 2x rows for the whole index, 1+m_uDocinfoIndex entries)
  1059. CSphSharedBuffer<DWORD> m_pMva; ///< my multi-valued attrs cache
  1060. CSphSharedBuffer<BYTE> m_pStrings; ///< my in-RAM strings cache
  1061. CWordlist m_tWordlist; ///< my wordlist
  1062. bool m_bKeepAttrs; ///< retain attributes on reindexing
  1063. CSphSharedBuffer<SphAttr_t> m_pKillList; ///< killlist
  1064. DWORD m_iKillListSize; ///< killlist size (in elements)
  1065. CSphSharedBuffer<BYTE> m_pSkiplists; ///< (compressed) skiplists data
  1066. int64_t m_uMinMaxIndex; ///< stored min/max cache offset (counted in DWORDs)
  1067. CSphAutofile m_tDoclistFile; ///< doclist file
  1068. CSphAutofile m_tHitlistFile; ///< hitlist file
  1069. #define SPH_SHARED_VARS_COUNT 2
  1070. DWORD * m_pPreread;
  1071. DWORD * m_pAttrsStatus;
  1072. CSphSharedBuffer<DWORD> m_dShared; ///< are we ready to search
  1073. bool m_bPreallocated; ///< are we ready to preread
  1074. DWORD m_uVersion; ///< data files version
  1075. bool m_bUse64; ///< whether the header is id64
  1076. bool m_bHaveSkips; ///< whether we have skiplists
  1077. int m_iIndexTag; ///< my ids for MVA updates pool
  1078. static int m_iIndexTagSeq; ///< static ids sequence
  1079. bool m_bIsEmpty; ///< do we have actually indexed documents (m_iTotalDocuments is just fetched documents, not indexed!)
  1080. private:
  1081. CSphString GetIndexFileName ( const char * sExt ) const;
  1082. bool ParsedMultiQuery ( const CSphQuery * pQuery, CSphQueryResult * pResult, int iSorters, ISphMatchSorter ** ppSorters, const XQQuery_t & tXQ, CSphDict * pDict, const CSphVector<CSphFilterSettings> * pExtraFilters, CSphQueryNodeCache * pNodeCache, int iTag, bool bFactors ) const;
  1083. bool MultiScan ( const CSphQuery * pQuery, CSphQueryResult * pResult, int iSorters, ISphMatchSorter ** ppSorters, const CSphVector<CSphFilterSettings> * pExtraFilters, int iTag, bool bFactors ) const;
  1084. void MatchExtended ( CSphQueryContext * pCtx, const CSphQuery * pQuery, int iSorters, ISphMatchSorter ** ppSorters, ISphRanker * pRanker, int iTag ) const;
  1085. const DWORD * FindDocinfo ( SphDocID_t uDocID ) const;
  1086. void CopyDocinfo ( CSphQueryContext * pCtx, CSphMatch & tMatch, const DWORD * pFound ) const;
  1087. bool BuildMVA ( const CSphVector<CSphSource*> & dSources, CSphFixedVector<CSphWordHit> & dHits, int iArenaSize, int iFieldFD, int nFieldMVAs, int iFieldMVAInPool, CSphIndex_VLN * pPrevIndex );
  1088. bool IsStarDict() const;
  1089. CSphDict * SetupStarDict ( CSphScopedPtr<CSphDict> & tContainer, CSphDict * pPrevDict ) const;
  1090. CSphDict * SetupExactDict ( CSphScopedPtr<CSphDict> & tContainer, CSphDict * pPrevDict ) const;
  1091. bool RelocateBlock ( int iFile, BYTE * pBuffer, int iRelocationSize, SphOffset_t * pFileSize, CSphBin * pMinBin, SphOffset_t * pSharedOffset );
  1092. bool PrecomputeMinMax();
  1093. private:
  1094. static const int MAX_ORDINAL_STR_LEN = 4096; ///< maximum ordinal string length in bytes
  1095. static const int ORDINAL_READ_SIZE = 262144; ///< sorted ordinal id read buffer size in bytes
  1096. ESphBinRead ReadOrdinal ( CSphBin & Reader, Ordinal_t & Ordinal );
  1097. SphOffset_t DumpOrdinals ( CSphWriter & Writer, CSphVector<Ordinal_t> & dOrdinals );
  1098. bool SortOrdinals ( const char * szToFile, int iFromFD, int iArenaSize, int iOrdinalsInPool, CSphVector< CSphVector<SphOffset_t> > & dOrdBlockSize, bool bWarnOfMem );
  1099. bool SortOrdinalIds ( const char * szToFile, int iFromFD, int iArenaSize, CSphVector < CSphVector < SphOffset_t > > & dOrdBlockSize, bool bWarnOfMem );
  1100. const DWORD * GetMVAPool () const { return m_pMva.GetWritePtr(); }
  1101. bool LoadPersistentMVA ( CSphString & sError );
  1102. bool JuggleFile ( const char* szExt, CSphString & sError, bool bNeedOrigin=true ) const;
  1103. XQNode_t * ExpandPrefix ( XQNode_t * pNode, CSphString & sError, CSphQueryResultMeta * pResult ) const;
  1104. bool BuildDone ( const BuildHeader_t & tBuildHeader, CSphString & sError ) const;
  1105. };
  1106. int CSphIndex_VLN::m_iIndexTagSeq = 0;
  1107. /////////////////////////////////////////////////////////////////////////////
  1108. // UTILITY FUNCTIONS
  1109. /////////////////////////////////////////////////////////////////////////////
  /// indexer warning
  /// (printf-style message, printed to stdout between a "WARNING: " prefix and a newline)
  void sphWarn ( const char * sTemplate, ... )
  {
      fputs ( "WARNING: ", stdout );

      va_list tArgs;
      va_start ( tArgs, sTemplate );
      vfprintf ( stdout, sTemplate, tArgs );
      va_end ( tArgs );

      fputs ( "\n", stdout );
  }
  1120. //////////////////////////////////////////////////////////////////////////
  1121. static ThrottleState_t g_tThrottle;
  1122. void sphSetThrottling ( int iMaxIOps, int iMaxIOSize )
  1123. {
  1124. g_tThrottle.m_iMaxIOps = iMaxIOps;
  1125. g_tThrottle.m_iMaxIOSize = iMaxIOSize;
  1126. }
  1127. static inline void sphThrottleSleep ( ThrottleState_t * pState )
  1128. {
  1129. assert ( pState );
  1130. if ( pState->m_iMaxIOps>0 )
  1131. {
  1132. int64_t tmTimer = sphMicroTimer();
  1133. int64_t tmSleep = Max ( 0, pState->m_tmLastIOTime + 1000000/pState->m_iMaxIOps - tmTimer );
  1134. sphSleepMsec ( (int)(tmSleep/1000) );
  1135. pState->m_tmLastIOTime = tmTimer + tmSleep;
  1136. }
  1137. }
  1138. bool sphWriteThrottled ( int iFD, const void * pBuf, int64_t iCount, const char * sName, CSphString & sError, ThrottleState_t * pThrottle )
  1139. {
  1140. assert ( pThrottle );
  1141. if ( iCount<=0 )
  1142. return true;
  1143. // by default, slice ios by at most 1 GB
  1144. int iChunkSize = ( 1UL<<30 );
  1145. // when there's a sane max_iosize (4K to 1GB), use it
  1146. if ( pThrottle->m_iMaxIOSize>=4096 )
  1147. iChunkSize = Min ( iChunkSize, pThrottle->m_iMaxIOSize );
  1148. CSphIOStats * pIOStats = GetIOStats();
  1149. // while there's data, write it chunk by chunk
  1150. const BYTE * p = (const BYTE*) pBuf;
  1151. while ( iCount>0 )
  1152. {
  1153. // wait for a timely occasion
  1154. sphThrottleSleep ( pThrottle );
  1155. // write (and maybe time)
  1156. int64_t tmTimer = 0;
  1157. if ( pIOStats )
  1158. tmTimer = sphMicroTimer();
  1159. int iToWrite = iChunkSize;
  1160. if ( iCount<iChunkSize )
  1161. iToWrite = (int)iCount;
  1162. int iWritten = ::write ( iFD, p, iToWrite );
  1163. if ( pIOStats )
  1164. {
  1165. pIOStats->m_iWriteTime += sphMicroTimer() - tmTimer;
  1166. pIOStats->m_iWriteOps++;
  1167. pIOStats->m_iWriteBytes += iToWrite;
  1168. }
  1169. // success? rinse, repeat
  1170. if ( iWritten==iToWrite )
  1171. {
  1172. iCount -= iToWrite;
  1173. p += iToWrite;
  1174. continue;
  1175. }
  1176. // failure? report, bailout
  1177. if ( iWritten<0 )
  1178. sError.SetSprintf ( "%s: write error: %s", sName, strerror(errno) );
  1179. else
  1180. sError.SetSprintf ( "%s: write error: %d of %d bytes written", sName, iWritten, iToWrite );
  1181. return false;
  1182. }
  1183. return true;
  1184. }
  1185. size_t sphReadThrottled ( int iFD, void * pBuf, size_t iCount, ThrottleState_t * pThrottle )
  1186. {
  1187. assert ( pThrottle );
  1188. if ( pThrottle->m_iMaxIOSize && int(iCount) > pThrottle->m_iMaxIOSize )
  1189. {
  1190. size_t nChunks = iCount / pThrottle->m_iMaxIOSize;
  1191. size_t nBytesLeft = iCount % pThrottle->m_iMaxIOSize;
  1192. size_t nBytesRead = 0;
  1193. size_t iRead = 0;
  1194. for ( size_t i=0; i<nChunks; i++ )
  1195. {
  1196. iRead = sphReadThrottled ( iFD, (char *)pBuf + i*pThrottle->m_iMaxIOSize, pThrottle->m_iMaxIOSize, pThrottle );
  1197. nBytesRead += iRead;
  1198. if ( iRead!=(size_t)pThrottle->m_iMaxIOSize )
  1199. return nBytesRead;
  1200. }
  1201. if ( nBytesLeft > 0 )
  1202. {
  1203. iRead = sphReadThrottled ( iFD, (char *)pBuf + nChunks*pThrottle->m_iMaxIOSize, nBytesLeft, pThrottle );
  1204. nBytesRead += iRead;
  1205. if ( iRead!=nBytesLeft )
  1206. return nBytesRead;
  1207. }
  1208. return nBytesRead;
  1209. }
  1210. sphThrottleSleep ( pThrottle );
  1211. return sphRead ( iFD, pBuf, iCount );
  1212. }
  /// close the descriptor if it is non-negative; always leave -1 behind
  void SafeClose ( int & iFD )
  {
      const bool bOpened = !( iFD<0 );
      if ( bOpened )
          ::close ( iFD );

      iFD = -1;
  }
  1219. //////////////////////////////////////////////////////////////////////////
  1220. #if !USE_WINDOWS
  /// lowercase a NUL-terminated string in place
  /// @return s itself, i.e. the start of the string (the previous code returned
  ///         a pointer to the terminating NUL, which is useless to callers)
  char * strlwr ( char * s )
  {
      for ( char * p = s; *p; p++ )
      {
          // tolower() requires an unsigned char value (or EOF); a plain char may be negative
          *p = (char) tolower ( (unsigned char)*p );
      }
      return s;
  }
  1230. #endif
  1231. char * sphStrMacro ( const char * sTemplate, const char * sMacro, SphDocID_t uValue )
  1232. {
  1233. // expand macro
  1234. char sExp[32];
  1235. snprintf ( sExp, sizeof(sExp), DOCID_FMT, uValue );
  1236. // calc lengths
  1237. int iExp = strlen ( sExp );
  1238. int iMacro = strlen ( sMacro );
  1239. int iDelta = iExp-iMacro;
  1240. // calc result length
  1241. int iRes = strlen ( sTemplate );
  1242. const char * sCur = sTemplate;
  1243. while ( ( sCur = strstr ( sCur, sMacro ) )!=NULL )
  1244. {
  1245. iRes += iDelta;
  1246. sCur++;
  1247. }
  1248. // build result
  1249. char * sRes = new char [ iRes+1 ];
  1250. char * sOut = sRes;
  1251. const char * sLast = sTemplate;
  1252. sCur = sTemplate;
  1253. while ( ( sCur = strstr ( sCur, sMacro ) )!=NULL )
  1254. {
  1255. strncpy ( sOut, sLast, sCur-sLast ); sOut += sCur-sLast;
  1256. strcpy ( sOut, sExp ); sOut += iExp; // NOLINT
  1257. sCur += iMacro;
  1258. sLast = sCur;
  1259. }
  1260. if ( *sLast )
  1261. strcpy ( sOut, sLast ); // NOLINT
  1262. assert ( (int)strlen(sRes)==iRes );
  1263. return sRes;
  1264. }
  /// parse a float with strtod(); a NULL string yields 0
  float sphToFloat ( const char * s )
  {
      return s ? (float)strtod ( s, NULL ) : 0.0f;
  }
  1270. DWORD sphToDword ( const char * s )
  1271. {
  1272. if ( !s ) return 0;
  1273. return strtoul ( s, NULL, 10 );
  1274. }
  /// parse a base-10 unsigned 64-bit value with strtoull(); a NULL string yields 0
  uint64_t sphToUint64 ( const char * s )
  {
      return s ? strtoull ( s, NULL, 10 ) : 0;
  }
  /// parse a base-10 signed 64-bit value with strtoll(); a NULL string yields 0
  int64_t sphToInt64 ( const char * s )
  {
      return s ? strtoll ( s, NULL, 10 ) : 0;
  }
  1285. #if USE_64BIT
  1286. #define sphToDocid sphToUint64
  1287. #else
  1288. #define sphToDocid sphToDword
  1289. #endif
  1290. #if USE_WINDOWS
  1291. bool sphLockEx ( int iFile, bool bWait )
  1292. {
  1293. HANDLE hHandle = (HANDLE) _get_osfhandle ( iFile );
  1294. if ( hHandle!=INVALID_HANDLE_VALUE )
  1295. {
  1296. OVERLAPPED tOverlapped;
  1297. memset ( &tOverlapped, 0, sizeof ( tOverlapped ) );
  1298. return !!LockFileEx ( hHandle, LOCKFILE_EXCLUSIVE_LOCK | ( bWait ? 0 : LOCKFILE_FAIL_IMMEDIATELY ), 0, 1, 0, &tOverlapped );
  1299. }
  1300. return false;
  1301. }
  1302. void sphLockUn ( int iFile )
  1303. {
  1304. HANDLE hHandle = (HANDLE) _get_osfhandle ( iFile );
  1305. if ( hHandle!=INVALID_HANDLE_VALUE )
  1306. {
  1307. OVERLAPPED tOverlapped;
  1308. memset ( &tOverlapped, 0, sizeof ( tOverlapped ) );
  1309. UnlockFileEx ( hHandle, 0, 1, 0, &tOverlapped );
  1310. }
  1311. }
  1312. #else
  /// request an fcntl() write lock on the given descriptor
  /// @param bWait selects F_SETLKW over F_SETLK
  /// @return whether fcntl() succeeded
  bool sphLockEx ( int iFile, bool bWait )
  {
      struct flock tLock;
      tLock.l_type = F_WRLCK;
      tLock.l_whence = SEEK_SET;
      tLock.l_start = 0;
      tLock.l_len = 0;

      // FIXME! check for HAVE_F_SETLKW?
      if ( bWait )
          return fcntl ( iFile, F_SETLKW, &tLock )!=-1;
      return fcntl ( iFile, F_SETLK, &tLock )!=-1;
  }
  /// issue an fcntl() F_UNLCK request for the given descriptor;
  /// the fcntl() result is ignored
  void sphLockUn ( int iFile )
  {
      struct flock tUnlock;
      tUnlock.l_type = F_UNLCK;
      tUnlock.l_whence = SEEK_SET;
      tUnlock.l_start = 0;
      tUnlock.l_len = 0;

      fcntl ( iFile, F_SETLK, &tUnlock );
  }
  1332. #endif
  /// sleep for the given number of milliseconds; negative values return at once
  void sphSleepMsec ( int iMsec )
  {
      if ( iMsec<0 )
          return;

  #if USE_WINDOWS
      Sleep ( iMsec );
  #else
      // no descriptors to watch, so select() only waits out the timeout
      struct timeval tvWait;
      tvWait.tv_sec = iMsec / 1000; // full seconds
      tvWait.tv_usec = ( iMsec % 1000 ) * 1000; // remainder is msec, so *1000 for usec
      select ( 0, NULL, NULL, NULL, &tvWait ); // FIXME? could handle EINTR
  #endif
  }
  1346. bool sphIsReadable ( const char * sPath, CSphString * pError )
  1347. {
  1348. int iFD = ::open ( sPath, O_RDONLY );
  1349. if ( iFD<0 )
  1350. {
  1351. if ( pError )
  1352. pError->SetSprintf ( "%s unreadable: %s", sPath, strerror(errno) );
  1353. return false;
  1354. }
  1355. close ( iFD );
  1356. return true;
  1357. }
  1358. void sphSetReadBuffers ( int iReadBuffer, int iReadUnhinted )
  1359. {
  1360. if ( iReadBuffer<=0 )
  1361. iReadBuffer = DEFAULT_READ_BUFFER;
  1362. g_iReadBuffer = Max ( iReadBuffer, MIN_READ_BUFFER );
  1363. if ( iReadUnhinted<=0 )
  1364. iReadUnhinted = DEFAULT_READ_UNHINTED;
  1365. g_iReadUnhinted = Max ( iReadUnhinted, MIN_READ_UNHINTED );
  1366. }
  1367. //////////////////////////////////////////////////////////////////////////
  1368. // DOCINFO
  1369. //////////////////////////////////////////////////////////////////////////
  1370. static DWORD * g_pMvaArena = NULL; ///< initialized by sphArenaInit()
  1371. // OPTIMIZE! try to inline or otherwise simplify maybe
  1372. const DWORD * CSphMatch::GetAttrMVA ( const CSphAttrLocator & tLoc, const DWORD * pPool ) const
  1373. {
  1374. DWORD uIndex = MVA_DOWNSIZE ( GetAttr ( tLoc ) );
  1375. if ( !uIndex )
  1376. return NULL;
  1377. if ( uIndex & MVA_ARENA_FLAG )
  1378. return g_pMvaArena + ( uIndex & MVA_OFFSET_MASK );
  1379. assert ( pPool );
  1380. return pPool + uIndex;
  1381. }
  1382. /////////////////////////////////////////////////////////////////////////////
  1383. // TOKENIZERS
  1384. /////////////////////////////////////////////////////////////////////////////
  1385. #if USE_WINDOWS
  1386. #pragma warning(disable:4127) // conditional expr is const for MSVC
  1387. #endif
  1388. inline int sphUTF8Decode ( BYTE * & pBuf ); // forward ref for GCC
  1389. inline int sphUTF8Encode ( BYTE * pBuf, int iCode ); // forward ref for GCC
  1390. /// synonym list entry
  1391. struct CSphSynonym
  1392. {
  1393. CSphString m_sFrom; ///< specially packed list of map-from tokens
  1394. CSphString m_sTo; ///< map-to string
  1395. int m_iFromLen; ///< cached m_sFrom length
  1396. int m_iToLen; ///< cached m_sTo length
  1397. inline bool operator < ( const CSphSynonym & rhs ) const
  1398. {
  1399. return strcmp ( m_sFrom.cstr(), rhs.m_sFrom.cstr() ) < 0;
  1400. }
  1401. };
  1402. /// base that is completely identical in both SBCS and UTF8 tokenizers
  1403. class CSphTokenizerBase : public ISphTokenizer
  1404. {
  1405. public:
  1406. CSphTokenizerBase ();
  1407. virtual bool SetCaseFolding ( const char * sConfig, CSphString & sError );
  1408. virtual bool LoadSynonyms ( const char * sFilename, const CSphEmbeddedFiles * pFiles, CSphString & sError );
  1409. virtual void WriteSynonyms ( CSphWriter & tWriter );
  1410. virtual void CloneBase ( const CSphTokenizerBase * pFrom, ESphTokenizerClone eMode );
  1411. virtual const char * GetTokenStart () const { return (const char *) m_pTokenStart; }
  1412. virtual const char * GetTokenEnd () const { return (const char *) m_pTokenEnd; }
  1413. virtual const char * GetBufferPtr () const { return (const char *) m_pCur; }
  1414. virtual const char * GetBufferEnd () const { return (const char *) m_pBufferMax; }
  1415. virtual void SetBufferPtr ( const char * sNewPtr );
  1416. virtual bool SetBlendChars ( const char * sConfig, CSphString & sError );
  1417. public:
  1418. // lightweight clones must impose a lockdown on some methods
  1419. // (specifically those that change the lowercaser data table)
  1420. virtual void AddPlainChar ( char c )
  1421. {
  1422. assert ( m_eMode!=SPH_CLONE_QUERY_LIGHTWEIGHT );
  1423. ISphTokenizer::AddPlainChar ( c );
  1424. }
  1425. virtual void AddSpecials ( const char * sSpecials )
  1426. {
  1427. assert ( m_eMode!=SPH_CLONE_QUERY_LIGHTWEIGHT );
  1428. ISphTokenizer::AddSpecials ( sSpecials );
  1429. }
  1430. virtual void Setup ( const CSphTokenizerSettings & tSettings )
  1431. {
  1432. assert ( m_eMode!=SPH_CLONE_QUERY_LIGHTWEIGHT );
  1433. ISphTokenizer::Setup ( tSettings );
  1434. }
  1435. virtual bool RemapCharacters ( const char * sConfig, DWORD uFlags, const char * sSource, bool bCanRemap, CSphString & sError )
  1436. {
  1437. assert ( m_eMode!=SPH_CLONE_QUERY_LIGHTWEIGHT );
  1438. return ISphTokenizer::RemapCharacters ( sConfig, uFlags, sSource, bCanRemap, sError );
  1439. }
  1440. protected:
  1441. bool BlendAdjust ( BYTE * pPosition );
  1442. int CodepointArbitrationI ( int iCodepoint );
  1443. int CodepointArbitrationQ ( int iCodepoint, bool bWasEscaped, BYTE uNextByte );
  1444. typedef CSphOrderedHash <int, int, IdentityHash_fn, 4096> CSphSynonymHash;
  1445. bool LoadSynonym ( char * sBuffer, const char * szFilename, int iLine, CSphSynonymHash & tHash, CSphString & sError );
  1446. protected:
  1447. BYTE * m_pBuffer; ///< my buffer
  1448. BYTE * m_pBufferMax; ///< max buffer ptr, exclusive (ie. this ptr is invalid, but every ptr below is ok)
  1449. BYTE * m_pCur; ///< current position
  1450. BYTE * m_pTokenStart; ///< last token start point
  1451. BYTE * m_pTokenEnd; ///< last token end point
  1452. BYTE m_sAccum [ 3*SPH_MAX_WORD_LEN+3 ]; ///< folded token accumulator
  1453. BYTE * m_pAccum; ///< current accumulator position
  1454. int m_iAccum; ///< boundary token size
  1455. BYTE m_sAccumBlend [ 3*SPH_MAX_WORD_LEN+3 ]; ///< blend-acc, an accumulator copy for additional blended variants
  1456. int m_iBlendNormalStart; ///< points to first normal char in the accumulators (might be NULL)
  1457. int m_iBlendNormalEnd; ///< points just past (!) last normal char in the accumulators (might be NULL)
  1458. CSphVector<CSphSynonym> m_dSynonyms; ///< active synonyms
  1459. CSphVector<int> m_dSynStart; ///< map 1st byte to candidate range start
  1460. CSphVector<int> m_dSynEnd; ///< map 1st byte to candidate range end
  1461. bool m_bHasBlend;
  1462. BYTE * m_pBlendStart;
  1463. BYTE * m_pBlendEnd;
  1464. ESphTokenizerClone m_eMode;
  1465. };
  1466. /// methods taht get specialized with regards to charset type
  1467. /// aka GetCodepoint() decoder and everything that depends on it
  1468. template < bool IS_UTF8 >
  1469. class CSphTokenizerBase2 : public CSphTokenizerBase
  1470. {
  1471. protected:
  1472. /// get codepoint
  1473. inline int GetCodepoint ()
  1474. {
  1475. if ( IS_UTF8 )
  1476. {
  1477. while ( m_pCur<m_pBufferMax )
  1478. {
  1479. int iCode = sphUTF8Decode ( m_pCur );
  1480. if ( iCode>=0 )
  1481. return iCode; // succesful decode
  1482. }
  1483. return -1; // eof
  1484. } else
  1485. {
  1486. return m_pCur>=m_pBufferMax
  1487. ? -1
  1488. : int ( *m_pCur++ );
  1489. }
  1490. }
  1491. /// accum codepoint
  1492. inline void AccumCodepoint ( int iCode )
  1493. {
  1494. assert ( iCode>0 );
  1495. assert ( m_iAccum>=0 );
  1496. // throw away everything which is over the token size
  1497. if ( m_iAccum<SPH_MAX_WORD_LEN )
  1498. {
  1499. if ( IS_UTF8 )
  1500. m_pAccum += sphUTF8Encode ( m_pAccum, iCode );
  1501. else
  1502. *m_pAccum++ = BYTE(iCode);
  1503. assert ( m_pAccum>=m_sAccum && m_pAccum<m_sAccum+sizeof(m_sAccum) );
  1504. m_iAccum++;
  1505. }
  1506. }
  1507. protected:
  1508. BYTE * GetTokenSyn ( bool bQueryMode );
  1509. BYTE * GetBlendedVariant ();
  1510. public:
  1511. virtual int SkipBlended ();
  1512. };
  1513. /// single-byte charset tokenizer
  1514. template < bool IS_QUERY >
  1515. class CSphTokenizer_SBCS : public CSphTokenizerBase2<false>
  1516. {
  1517. public:
  1518. CSphTokenizer_SBCS ();
  1519. virtual void SetBuffer ( BYTE * sBuffer, int iLength );
  1520. virtual BYTE * GetToken ();
  1521. virtual ISphTokenizer * Clone ( ESphTokenizerClone eMode ) const;
  1522. virtual bool IsUtf8 () const { return false; }
  1523. virtual int GetCodepointLength ( int ) const { return 1; }
  1524. virtual int GetMaxCodepointLength () const { return 1; }
  1525. };
  1526. /// templated UTF-8 implementation of GetToken
  1527. class CSphTokenizer_UTF8_Base : public CSphTokenizerBase2<true>
  1528. {
  1529. protected:
  1530. template < bool IS_QUERY, bool IS_BLEND >
  1531. BYTE * DoGetToken();
  1532. void FlushAccum ();
  1533. };
  1534. /// UTF-8 tokenizer
  1535. template < bool IS_QUERY >
  1536. class CSphTokenizer_UTF8 : public CSphTokenizer_UTF8_Base
  1537. {
  1538. public:
  1539. CSphTokenizer_UTF8 ();
  1540. virtual void SetBuffer ( BYTE * sBuffer, int iLength );
  1541. virtual BYTE * GetToken ();
  1542. virtual ISphTokenizer * Clone ( ESphTokenizerClone eMode ) const;
  1543. virtual bool IsUtf8 () const { return true; }
  1544. virtual int GetCodepointLength ( int iCode ) const;
  1545. virtual int GetMaxCodepointLength () const { return m_tLC.GetMaxCodepointLength(); }
  1546. };
  1547. /// UTF-8 tokenizer with n-grams
  1548. template < bool IS_QUERY >
  1549. class CSphTokenizer_UTF8Ngram : public CSphTokenizer_UTF8<IS_QUERY>
  1550. {
  1551. public:
  1552. CSphTokenizer_UTF8Ngram () : m_iNgramLen ( 1 ) {}
  1553. public:
  1554. virtual bool SetNgramChars ( const char * sConfig, CSphString & sError );
  1555. virtual void SetNgramLen ( int iLen );
  1556. virtual BYTE * GetToken ();
  1557. protected:
  1558. int m_iNgramLen;
  1559. CSphString m_sNgramCharsStr;
  1560. };
  1561. struct CSphMultiform
  1562. {
  1563. CSphString m_sNormalForm;
  1564. int m_iNormalTokenLen;
  1565. CSphVector<CSphString> m_dTokens;
  1566. };
  1567. struct CSphMultiforms
  1568. {
  1569. int m_iMinTokens;
  1570. int m_iMaxTokens;
  1571. CSphVector<CSphMultiform*> m_pForms; // OPTIMIZE? blobify?
  1572. };
  1573. struct CSphMultiformContainer
  1574. {
  1575. CSphMultiformContainer () : m_iMaxTokens ( 0 ) {}
  1576. int m_iMaxTokens;
  1577. typedef CSphOrderedHash < CSphMultiforms *, CSphString, CSphStrHashFunc, 131072 > CSphMultiformHash;
  1578. CSphMultiformHash m_Hash;
  1579. };
  1580. /// token filter for multiforms support
  1581. class CSphMultiformTokenizer : public CSphTokenFilter
  1582. {
  1583. public:
  1584. CSphMultiformTokenizer ( ISphTokenizer * pTokenizer, const CSphMultiformContainer * pContainer );
  1585. ~CSphMultiformTokenizer ();
  1586. public:
  1587. virtual void SetBuffer ( BYTE * sBuffer, int iLength );
  1588. virtual BYTE * GetToken ();
  1589. virtual void EnableTokenizedMultiformTracking () { m_bBuildMultiform = true; }
  1590. virtual int GetLastTokenLen () const { return m_pLastToken->m_iTokenLen; }
  1591. virtual bool GetBoundary () { return m_pLastToken->m_bBoundary; }
  1592. virtual bool WasTokenSpecial () { return m_pLastToken->m_bSpecial; }
  1593. virtual int GetOvershortCount () { return m_pLastToken->m_iOvershortCount; }
  1594. virtual BYTE * GetTokenizedMultiform () { return m_sTokenizedMultiform[0] ? m_sTokenizedMultiform : NULL; }
  1595. public:
  1596. virtual ISphTokenizer * Clone ( ESphTokenizerClone eMode ) const;
  1597. virtual const char * GetTokenStart () const { return m_pLastToken->m_szTokenStart; }
  1598. virtual const char * GetTokenEnd () const { return m_pLastToken->m_szTokenEnd; }
  1599. virtual const char * GetBufferPtr () const { return m_pLastToken ? m_pLastToken->m_pBufferPtr : m_pTokenizer->GetBufferPtr(); }
  1600. virtual void SetBufferPtr ( const char * sNewPtr );
  1601. private:
  1602. const CSphMultiformContainer * m_pMultiWordforms;
  1603. int m_iStoredStart;
  1604. int m_iStoredLen;
  1605. bool m_bBuildMultiform;
  1606. BYTE m_sTokenizedMultiform [ 3*SPH_MAX_WORD_LEN+4 ];
  1607. BYTE m_sOutMultiform [ 3*SPH_MAX_WORD_LEN+4 ];
  1608. struct StoredToken_t
  1609. {
  1610. BYTE m_sToken [3*SPH_MAX_WORD_LEN+4];
  1611. int m_iTokenLen;
  1612. bool m_bBoundary;
  1613. bool m_bSpecial;
  1614. int m_iOvershortCount;
  1615. const char * m_szTokenStart;
  1616. const char * m_szTokenEnd;
  1617. const char * m_pBufferPtr;
  1618. };
  1619. CSphVector<StoredToken_t> m_dStoredTokens;
  1620. StoredToken_t m_tLastToken;
  1621. StoredToken_t * m_pLastToken;
  1622. void FillTokenInfo ( StoredToken_t * pToken );
  1623. };
  1624. #if USE_WINDOWS
  1625. #pragma warning(default:4127) // conditional expr is const
  1626. #endif
  1627. /// token filter for bigram indexing
  1628. ///
  1629. /// passes tokens through until an eligible pair is found
  1630. /// then buffers and returns that pair as a blended token
  1631. /// then returns the first token as a regular one
  1632. /// then pops the first one and cycles again
  1633. ///
  1634. /// pair (aka bigram) eligibility depends on bigram_index value
  1635. /// "all" means that all token pairs gets indexed
  1636. /// "first_freq" means that 1st token must be from bigram_freq_words
  1637. /// "both_freq" means that both tokens must be from bigram_freq_words
  1638. class CSphBigramTokenizer : public CSphTokenFilter
  1639. {
  1640. protected:
  1641. enum
  1642. {
  1643. BIGRAM_CLEAN, ///< clean slate, nothing accumulated
  1644. BIGRAM_PAIR, ///< just returned a pair from m_sBuf, and m_iFirst/m_pSecond are correct
  1645. BIGRAM_FIRST ///< just returned a first token from m_sBuf, so m_iFirst/m_pSecond are still good
  1646. } m_eState;
  1647. BYTE m_sBuf [ MAX_KEYWORD_BYTES ]; ///< pair buffer
  1648. BYTE * m_pSecond; ///< second token pointer
  1649. int m_iFirst; ///< first token length, bytes
  1650. ESphBigram m_eMode; ///< bigram indexing mode
  1651. int m_iMaxLen; ///< max bigram_freq_words length
  1652. int m_dWordsHash[256]; ///< offsets into m_dWords hashed by 1st byte
  1653. CSphVector<BYTE> m_dWords; ///< case-folded, sorted bigram_freq_words
  1654. public:
  1655. CSphBigramTokenizer ( ISphTokenizer * pTok, ESphBigram eMode, CSphVector<CSphString> & dWords )
  1656. : CSphTokenFilter ( pTok )
  1657. {
  1658. assert ( pTok );
  1659. assert ( eMode!=SPH_BIGRAM_NONE );
  1660. assert ( eMode==SPH_BIGRAM_ALL || dWords.GetLength() );
  1661. m_sBuf[0] = 0;
  1662. m_pSecond = NULL;
  1663. m_eState = BIGRAM_CLEAN;
  1664. memset ( m_dWordsHash, 0, sizeof(m_dWordsHash) );
  1665. m_eMode = eMode;
  1666. m_iMaxLen = 0;
  1667. // only keep unique, real, short enough words
  1668. dWords.Uniq();
  1669. ARRAY_FOREACH ( i, dWords )
  1670. {
  1671. int iLen = Min ( dWords[i].Length(), 255 );
  1672. if ( !iLen )
  1673. continue;
  1674. m_iMaxLen = Max ( m_iMaxLen, iLen );
  1675. // hash word blocks by the first letter
  1676. BYTE uFirst = *(BYTE*)( dWords[i].cstr() );
  1677. if ( !m_dWordsHash [ uFirst ] )
  1678. {
  1679. m_dWords.Add ( 0 ); // end marker for the previous block
  1680. m_dWordsHash [ uFirst ] = m_dWords.GetLength(); // hash new block
  1681. }
  1682. // store that word
  1683. int iPos = m_dWords.GetLength();
  1684. m_dWords.Resize ( iPos+iLen+1 );
  1685. m_dWords[iPos] = (BYTE)iLen;
  1686. memcpy ( &m_dWords [ iPos+1 ], dWords[i].cstr(), iLen );
  1687. }
  1688. m_dWords.Add ( 0 );
  1689. }
  1690. CSphBigramTokenizer ( ISphTokenizer * pTok, const CSphBigramTokenizer * pBase )
  1691. : CSphTokenFilter ( pTok )
  1692. {
  1693. m_sBuf[0] = 0;
  1694. m_pSecond = NULL;
  1695. m_eState = BIGRAM_CLEAN;
  1696. m_eMode = pBase->m_eMode;
  1697. m_iMaxLen = pBase->m_iMaxLen;
  1698. memcpy ( m_dWordsHash, pBase->m_dWordsHash, sizeof(m_dWordsHash) );
  1699. m_dWords = pBase->m_dWords;
  1700. }
  1701. ISphTokenizer * Clone ( ESphTokenizerClone eMode ) const
  1702. {
  1703. ISphTokenizer * pTok = m_pTokenizer->Clone ( eMode );
  1704. return new CSphBigramTokenizer ( pTok, this );
  1705. }
  1706. void SetBuffer ( BYTE * sBuffer, int iLength )
  1707. {
  1708. m_pTokenizer->SetBuffer ( sBuffer, iLength );
  1709. }
  1710. bool TokenIsBlended() const
  1711. {
  1712. if ( m_eState==BIGRAM_PAIR )
  1713. return true;
  1714. if ( m_eState==BIGRAM_FIRST )
  1715. return false;
  1716. return m_pTokenizer->TokenIsBlended();
  1717. }
  1718. bool IsFreq ( int iLen, BYTE * sWord )
  1719. {
  1720. // early check
  1721. if ( iLen>m_iMaxLen )
  1722. return false;
  1723. // hash lookup, then linear scan
  1724. int iPos = m_dWordsHash [ *sWord ];
  1725. if ( !iPos )
  1726. return false;
  1727. while ( m_dWords[iPos] )
  1728. {
  1729. if ( m_dWords[iPos]==iLen && !memcmp ( sWord, &m_dWords[iPos+1], iLen ) )
  1730. break;
  1731. iPos += 1+m_dWords[iPos];
  1732. }
  1733. return m_dWords[iPos]!=0;
  1734. }
  1735. BYTE * GetToken()
  1736. {
  1737. if ( m_eState==BIGRAM_FIRST || m_eState==BIGRAM_CLEAN )
  1738. {
  1739. BYTE * pFirst;
  1740. if ( m_eState==BIGRAM_FIRST )
  1741. {
  1742. // first out, clean slate again, actually
  1743. // and second will now become our next first
  1744. assert ( m_pSecond );
  1745. m_eState = BIGRAM_CLEAN;
  1746. pFirst = m_pSecond;
  1747. m_pSecond = NULL;
  1748. } else
  1749. {
  1750. // just clean slate
  1751. // assure we're, well, clean
  1752. assert ( !m_pSecond );
  1753. pFirst = m_pTokenizer->GetToken();
  1754. }
  1755. // clean slate
  1756. // get first non-blended token
  1757. if ( !pFirst )
  1758. return NULL;
  1759. // pass through blended
  1760. // could handle them as first too, but.. cumbersome
  1761. if ( m_pTokenizer->TokenIsBlended() )
  1762. return pFirst;
  1763. // check pair
  1764. // in first_freq and both_freq modes, 1st token must be listed
  1765. m_iFirst = strlen ( (const char*)pFirst );
  1766. if ( m_eMode!=SPH_BIGRAM_ALL && !IsFreq ( m_iFirst, pFirst ) )
  1767. return pFirst;
  1768. // copy it
  1769. // subsequent calls can and will override token accumulator
  1770. memcpy ( m_sBuf, pFirst, m_iFirst+1 );
  1771. // grow a pair!
  1772. // get a second one (lookahead, in a sense)
  1773. BYTE * pSecond = m_pTokenizer->GetToken();
  1774. // eof? oi
  1775. if ( !pSecond )
  1776. return m_sBuf;
  1777. // got a pair!
  1778. // check combined length
  1779. m_pSecond = pSecond;
  1780. int iSecond = strlen ( (const char*)pSecond );
  1781. if ( m_iFirst+iSecond+1 > SPH_MAX_WORD_LEN )
  1782. {
  1783. // too long pair
  1784. // return first token as is
  1785. m_eState = BIGRAM_FIRST;
  1786. return m_sBuf;
  1787. }
  1788. // check pair
  1789. // in freq2 mode, both tokens must be listed
  1790. if ( m_eMode==SPH_BIGRAM_BOTHFREQ && !IsFreq ( iSecond, m_pSecond ) )
  1791. {
  1792. m_eState = BIGRAM_FIRST;
  1793. return m_sBuf;
  1794. }
  1795. // ok, this is a eligible pair
  1796. // begin with returning first+second pair (as blended)
  1797. m_eState = BIGRAM_PAIR;
  1798. m_sBuf [ m_iFirst ] = MAGIC_WORD_BIGRAM;
  1799. assert ( m_iFirst + strlen ( (const char*)pSecond ) < sizeof(m_sBuf) );
  1800. strcpy ( (char*)m_sBuf+m_iFirst+1, (const char*)pSecond ); //NOLINT
  1801. return m_sBuf;
  1802. } else if ( m_eState==BIGRAM_PAIR )
  1803. {
  1804. // pair (aka bigram) out, return first token as a regular token
  1805. m_eState = BIGRAM_FIRST;
  1806. m_sBuf [ m_iFirst ] = 0;
  1807. return m_sBuf;
  1808. }
  1809. assert ( 0 && "unhandled bigram tokenizer internal state" );
  1810. return NULL;
  1811. }
  1812. };
  1813. /////////////////////////////////////////////////////////////////////////////
  1814. ISphTokenizer * sphCreateSBCSTokenizer ()
  1815. {
  1816. return new CSphTokenizer_SBCS<false> ();
  1817. }
  1818. ISphTokenizer * sphCreateUTF8Tokenizer ()
  1819. {
  1820. return new CSphTokenizer_UTF8<false> ();
  1821. }
  1822. ISphTokenizer * sphCreateUTF8NgramTokenizer ()
  1823. {
  1824. return new CSphTokenizer_UTF8Ngram<false> ();
  1825. }
  1826. /////////////////////////////////////////////////////////////////////////////
  /// layout of a lowercaser table entry
  /// the low 24 bits hold the remapped codepoint, the high 8 bits hold the flags
  enum
  {
      MASK_CODEPOINT          = 0x00ffffffUL, // mask off codepoint flags
      MASK_FLAGS              = 0xffUL << 24, // mask off codepoint value

      FLAG_CODEPOINT_SPECIAL  = 1UL << 24,    // this codepoint is special
      FLAG_CODEPOINT_DUAL     = 1UL << 25,    // this codepoint is special but also a valid word part
      FLAG_CODEPOINT_NGRAM    = 1UL << 26,    // this codepoint is n-gram indexed
      FLAG_CODEPOINT_SYNONYM  = 1UL << 27,    // this codepoint is used in synonym tokens only
      FLAG_CODEPOINT_BOUNDARY = 1UL << 28,    // this codepoint is phrase boundary
      FLAG_CODEPOINT_IGNORE   = 1UL << 29,    // this codepoint is ignored
      FLAG_CODEPOINT_BLEND    = 1UL << 30     // this codepoint is "blended" (indexed both as a character, and as a separator)
  };
  1839. CSphLowercaser::CSphLowercaser ()
  1840. : m_pData ( NULL )
  1841. {
  1842. }
  1843. void CSphLowercaser::Reset()
  1844. {
  1845. SafeDeleteArray ( m_pData );
  1846. m_pData = new int [ CHUNK_SIZE ];
  1847. memset ( m_pData, 0, CHUNK_SIZE*sizeof(int) );
  1848. m_iChunks = 1;
  1849. m_pChunk[0] = m_pData; // chunk 0 must always be allocated, for utf-8 tokenizer shortcut to work
  1850. for ( int i=1; i<CHUNK_COUNT; i++ )
  1851. m_pChunk[i] = NULL;
  1852. }
  1853. CSphLowercaser::~CSphLowercaser ()
  1854. {
  1855. SafeDeleteArray ( m_pData );
  1856. }
  1857. void CSphLowercaser::SetRemap ( const CSphLowercaser * pLC )
  1858. {
  1859. if ( !pLC )
  1860. return;
  1861. SafeDeleteArray ( m_pData );
  1862. m_iChunks = pLC->m_iChunks;
  1863. m_pData = new int [ m_iChunks*CHUNK_SIZE ];
  1864. memcpy ( m_pData, pLC->m_pData, sizeof(int)*m_iChunks*CHUNK_SIZE ); // NOLINT sizeof(int)
  1865. for ( int i=0; i<CHUNK_COUNT; i++ )
  1866. m_pChunk[i] = pLC->m_pChunk[i]
  1867. ? pLC->m_pChunk[i] - pLC->m_pData + m_pData
  1868. : NULL;
  1869. }
  1870. void CSphLowercaser::AddRemaps ( const CSphVector<CSphRemapRange> & dRemaps, DWORD uFlags )
  1871. {
  1872. if ( !dRemaps.GetLength() )
  1873. return;
  1874. // build new chunks map
  1875. // 0 means "was unused"
  1876. // 1 means "was used"
  1877. // 2 means "is used now"
  1878. int dUsed [ CHUNK_COUNT ];
  1879. for ( int i=0; i<CHUNK_COUNT; i++ )
  1880. dUsed[i] = m_pChunk[i] ? 1 : 0;
  1881. int iNewChunks = m_iChunks;
  1882. ARRAY_FOREACH ( i, dRemaps )
  1883. {
  1884. const CSphRemapRange & tRemap = dRemaps[i];
  1885. #define LOC_CHECK_RANGE(_a) assert ( (_a)>=0 && (_a)<MAX_CODE );
  1886. LOC_CHECK_RANGE ( tRemap.m_iStart );
  1887. LOC_CHECK_RANGE ( tRemap.m_iEnd );
  1888. LOC_CHECK_RANGE ( tRemap.m_iRemapStart );
  1889. LOC_CHECK_RANGE ( tRemap.m_iRemapStart + tRemap.m_iEnd - tRemap.m_iStart );
  1890. #undef LOC_CHECK_RANGE
  1891. for ( int iChunk=( tRemap.m_iStart >> CHUNK_BITS ); iChunk<=( tRemap.m_iEnd >> CHUNK_BITS ); iChunk++ )
  1892. if ( dUsed[iChunk]==0 )
  1893. {
  1894. dUsed[iChunk] = 2;
  1895. iNewChunks++;
  1896. }
  1897. }
  1898. // alloc new tables and copy, if necessary
  1899. if ( iNewChunks>m_iChunks )
  1900. {
  1901. int * pData = new int [ iNewChunks*CHUNK_SIZE ];
  1902. memset ( pData, 0, sizeof(int)*iNewChunks*CHUNK_SIZE ); // NOLINT sizeof(int)
  1903. int * pChunk = pData;
  1904. for ( int i=0; i<CHUNK_COUNT; i++ )
  1905. {
  1906. int * pOldChunk = m_pChunk[i];
  1907. // build new ptr
  1908. if ( dUsed[i] )
  1909. {
  1910. m_pChunk[i] = pChunk;
  1911. pChunk += CHUNK_SIZE;
  1912. }
  1913. // copy old data
  1914. if ( dUsed[i]==1 )
  1915. memcpy ( m_pChunk[i], pOldChunk, sizeof(int)*CHUNK_SIZE ); // NOLINT sizeof(int)
  1916. }
  1917. assert ( pChunk-pData==iNewChunks*CHUNK_SIZE );
  1918. SafeDeleteArray ( m_pData );
  1919. m_pData = pData;
  1920. m_iChunks = iNewChunks;
  1921. }
  1922. // fill new stuff
  1923. ARRAY_FOREACH ( i, dRemaps )
  1924. {
  1925. const CSphRemapRange & tRemap = dRemaps[i];
  1926. DWORD iRemapped = tRemap.m_iRemapStart;
  1927. for ( int j=tRemap.m_iStart; j<=tRemap.m_iEnd; j++, iRemapped++ )
  1928. {
  1929. assert ( m_pChunk [ j >> CHUNK_BITS ] );
  1930. int & iCodepoint = m_pChunk [ j >> CHUNK_BITS ] [ j & CHUNK_MASK ];
  1931. bool bWordPart = ( iCodepoint & MASK_CODEPOINT ) && !( iCodepoint & FLAG_CODEPOINT_SYNONYM );
  1932. int iNew = iRemapped | uFlags | ( iCodepoint & MASK_FLAGS );
  1933. if ( bWordPart && ( uFlags & FLAG_CODEPOINT_SPECIAL ) )
  1934. iNew |= FLAG_CODEPOINT_DUAL;
  1935. iCodepoint = iNew;
  1936. // new code-point flag removes SYNONYM
  1937. if ( ( iCodepoint & FLAG_CODEPOINT_SYNONYM ) && uFlags==0 && iRemapped!=0 )
  1938. iCodepoint &= ~FLAG_CODEPOINT_SYNONYM;
  1939. }
  1940. }
  1941. }
  1942. void CSphLowercaser::AddSpecials ( const char * sSpecials )
  1943. {
  1944. assert ( sSpecials );
  1945. int iSpecials = strlen(sSpecials);
  1946. CSphVector<CSphRemapRange> dRemaps;
  1947. dRemaps.Resize ( iSpecials );
  1948. ARRAY_FOREACH ( i, dRemaps )
  1949. dRemaps[i].m_iStart = dRemaps[i].m_iEnd = dRemaps[i].m_iRemapStart = sSpecials[i];
  1950. AddRemaps ( dRemaps, FLAG_CODEPOINT_SPECIAL );
  1951. }
  1952. const CSphLowercaser & CSphLowercaser::operator = ( const CSphLowercaser & rhs )
  1953. {
  1954. SetRemap ( &rhs );
  1955. return * this;
  1956. }
  1957. uint64_t CSphLowercaser::GetFNV () const
  1958. {
  1959. int iLen = ( sizeof(int) * m_iChunks * CHUNK_SIZE ) / sizeof(BYTE); // NOLINT
  1960. return sphFNV64 ( (BYTE *)m_pData, iLen );
  1961. }
  1962. int CSphLowercaser::GetMaxCodepointLength () const
  1963. {
  1964. int iMax = 0;
  1965. for ( int iChunk=0; iChunk<CHUNK_COUNT; iChunk++ )
  1966. {
  1967. int * pChunk = m_pChunk[iChunk];
  1968. if ( !pChunk )
  1969. continue;
  1970. int * pMax = pChunk + CHUNK_SIZE;
  1971. while ( pChunk<pMax )
  1972. {
  1973. int iCode = *pChunk++ & MASK_CODEPOINT;
  1974. iMax = Max ( iMax, iCode );
  1975. }
  1976. }
  1977. if ( iMax<0x80 )
  1978. return 1;
  1979. if ( iMax<0x800 )
  1980. return 2;
  1981. return 3; // actually, 4 once we hit 0x10000
  1982. }
  1983. /////////////////////////////////////////////////////////////////////////////
  1984. /// parser to build lowercaser from textual config
  1985. class CSphCharsetDefinitionParser
  1986. {
  1987. public:
  1988. CSphCharsetDefinitionParser () : m_bError ( false ) {}
  1989. bool Parse ( const char * sConfig, CSphVector<CSphRemapRange> & dRanges );
  1990. const char * GetLastError ();
  1991. protected:
  1992. bool m_bError;
  1993. char m_sError [ 1024 ];
  1994. const char * m_pCurrent;
  1995. bool Error ( const char * sMessage );
  1996. void SkipSpaces ();
  1997. bool IsEof ();
  1998. bool CheckEof ();
  1999. int HexDigit ( int c );
  2000. int ParseCharsetCode ();
  2001. bool AddRange ( const CSphRemapRange & tRange, CSphVector<CSphRemapRange> & dRanges );
  2002. };
  2003. const char * CSphCharsetDefinitionParser::GetLastError ()
  2004. {
  2005. return m_bError ? m_sError : NULL;
  2006. }
  2007. bool CSphCharsetDefinitionParser::IsEof ()
  2008. {
  2009. return ( *m_pCurrent )==0;
  2010. }
  2011. bool CSphCharsetDefinitionParser::CheckEof ()
  2012. {
  2013. if ( IsEof() )
  2014. {
  2015. Error ( "unexpected end of line" );
  2016. return true;
  2017. } else
  2018. {
  2019. return false;
  2020. }
  2021. }
  2022. bool CSphCharsetDefinitionParser::Error ( const char * sMessage )
  2023. {
  2024. char sErrorBuffer[32];
  2025. strncpy ( sErrorBuffer, m_pCurrent, sizeof(sErrorBuffer) );
  2026. sErrorBuffer [ sizeof(sErrorBuffer)-1 ] = '\0';
  2027. snprintf ( m_sError, sizeof(m_sError), "%s near '%s'",
  2028. sMessage, sErrorBuffer );
  2029. m_sError [ sizeof(m_sError)-1 ] = '\0';
  2030. m_bError = true;
  2031. return false;
  2032. }
  2033. int CSphCharsetDefinitionParser::HexDigit ( int c )
  2034. {
  2035. if ( c>='0' && c<='9' ) return c-'0';
  2036. if ( c>='a' && c<='f' ) return c-'a'+10;
  2037. if ( c>='A' && c<='F' ) return c-'A'+10;
  2038. return 0;
  2039. }
  2040. void CSphCharsetDefinitionParser::SkipSpaces ()
  2041. {
  2042. while ( ( *m_pCurrent ) && isspace ( (BYTE)*m_pCurrent ) )
  2043. m_pCurrent++;
  2044. }
  2045. int CSphCharsetDefinitionParser::ParseCharsetCode ()
  2046. {
  2047. const char * p = m_pCurrent;
  2048. int iCode = 0;
  2049. if ( p[0]=='U' && p[1]=='+' )
  2050. {
  2051. p += 2;
  2052. while ( isxdigit(*p) )
  2053. {
  2054. iCode = iCode*16 + HexDigit ( *p++ );
  2055. }
  2056. while ( isspace(*p) )
  2057. p++;
  2058. } else
  2059. {
  2060. if ( (*(BYTE*)p)<32 || (*(BYTE*)p)>127 )
  2061. {
  2062. Error ( "non-ASCII characters not allowed, use 'U+00AB' syntax" );
  2063. return -1;
  2064. }
  2065. iCode = *p++;
  2066. while ( isspace(*p) )
  2067. p++;
  2068. }
  2069. m_pCurrent = p;
  2070. return iCode;
  2071. }
  2072. bool CSphCharsetDefinitionParser::AddRange ( const CSphRemapRange & tRange, CSphVector<CSphRemapRange> & dRanges )
  2073. {
  2074. if ( tRange.m_iRemapStart>=0x20 )
  2075. {
  2076. dRanges.Add ( tRange );
  2077. return true;
  2078. }
  2079. CSphString sError;
  2080. sError.SetSprintf ( "dest range (U+%x) below U+20, not allowed", tRange.m_iRemapStart );
  2081. Error ( sError.cstr() );
  2082. return false;
  2083. }
  2084. bool CSphCharsetDefinitionParser::Parse ( const char * sConfig, CSphVector<CSphRemapRange> & dRanges )
  2085. {
  2086. m_pCurrent = sConfig;
  2087. dRanges.Reset ();
  2088. // do parse
  2089. while ( *m_pCurrent )
  2090. {
  2091. SkipSpaces ();
  2092. if ( IsEof () )
  2093. break;
  2094. // check for stray comma
  2095. if ( *m_pCurrent==',' )
  2096. return Error ( "stray ',' not allowed, use 'U+002C' instead" );
  2097. // parse char code
  2098. const char * pStart = m_pCurrent;
  2099. int iStart = ParseCharsetCode();
  2100. if ( iStart<0 )
  2101. return false;
  2102. // stray char?
  2103. if ( !*m_pCurrent || *m_pCurrent==',' )
  2104. {
  2105. // stray char
  2106. if ( !AddRange ( CSphRemapRange ( iStart, iStart, iStart ), dRanges ) )
  2107. return false;
  2108. if ( IsEof () )
  2109. break;
  2110. m_pCurrent++;
  2111. continue;
  2112. }
  2113. // stray remap?
  2114. if ( m_pCurrent[0]=='-' && m_pCurrent[1]=='>' )
  2115. {
  2116. // parse and add
  2117. m_pCurrent += 2;
  2118. int iDest = ParseCharsetCode ();
  2119. if ( iDest<0 )
  2120. return false;
  2121. if ( !AddRange ( CSphRemapRange ( iStart, iStart, iDest ), dRanges ) )
  2122. return false;
  2123. // it's either end of line now, or must be followed by comma
  2124. if ( *m_pCurrent )
  2125. if ( *m_pCurrent++!=',' )
  2126. return Error ( "syntax error" );
  2127. continue;
  2128. }
  2129. // range start?
  2130. if (!( m_pCurrent[0]=='.' && m_pCurrent[1]=='.' ))
  2131. return Error ( "syntax error" );
  2132. m_pCurrent += 2;
  2133. SkipSpaces ();
  2134. if ( CheckEof () )
  2135. return false;
  2136. // parse range end char code
  2137. int iEnd = ParseCharsetCode ();
  2138. if ( iEnd<0 )
  2139. return false;
  2140. if ( iStart>iEnd )
  2141. {
  2142. m_pCurrent = pStart;
  2143. return Error ( "range end less than range start" );
  2144. }
  2145. // stray range?
  2146. if ( !*m_pCurrent || *m_pCurrent==',' )
  2147. {
  2148. if ( !AddRange ( CSphRemapRange ( iStart, iEnd, iStart ), dRanges ) )
  2149. return false;
  2150. if ( IsEof () )
  2151. break;
  2152. m_pCurrent++;
  2153. continue;
  2154. }
  2155. // "checkerboard" range?
  2156. if ( m_pCurrent[0]=='/' && m_pCurrent[1]=='2' )
  2157. {
  2158. for ( int i=iStart; i<iEnd; i+=2 )
  2159. {
  2160. if ( !AddRange ( CSphRemapRange ( i, i, i+1 ), dRanges ) )
  2161. return false;
  2162. if ( !AddRange ( CSphRemapRange ( i+1, i+1, i+1 ), dRanges ) )
  2163. return false;
  2164. }
  2165. // skip "/2", expect ","
  2166. m_pCurrent += 2;
  2167. SkipSpaces ();
  2168. if ( *m_pCurrent )
  2169. if ( *m_pCurrent++!=',' )
  2170. return Error ( "expected end of line or ','" );
  2171. continue;
  2172. }
  2173. // remapped range?
  2174. if (!( m_pCurrent[0]=='-' && m_pCurrent[1]=='>' ))
  2175. return Error ( "expected end of line, ',' or '-><char>'" );
  2176. m_pCurrent += 2;
  2177. SkipSpaces ();
  2178. if ( CheckEof () )
  2179. return false;
  2180. // parse dest start
  2181. const char * pRemapStart = m_pCurrent;
  2182. int iRemapStart = ParseCharsetCode ();
  2183. if ( iRemapStart<0 )
  2184. return false;
  2185. // expect '..'
  2186. if ( CheckEof () )
  2187. return false;
  2188. if (!( m_pCurrent[0]=='.' && m_pCurrent[1]=='.' ))
  2189. return Error ( "expected '..'" );
  2190. m_pCurrent += 2;
  2191. // parse dest end
  2192. int iRemapEnd = ParseCharsetCode ();
  2193. if ( iRemapEnd<0 )
  2194. return false;
  2195. // check dest range
  2196. if ( iRemapStart>iRemapEnd )
  2197. {
  2198. m_pCurrent = pRemapStart;
  2199. return Error ( "dest range end less than dest range start" );
  2200. }
  2201. // check for length mismatch
  2202. if ( ( iRemapEnd-iRemapStart )!=( iEnd-iStart ) )
  2203. {
  2204. m_pCurrent = pStart;
  2205. return Error ( "dest range length must match src range length" );
  2206. }
  2207. // remapped ok
  2208. if ( !AddRange ( CSphRemapRange ( iStart, iEnd, iRemapStart ), dRanges ) )
  2209. return false;
  2210. if ( IsEof () )
  2211. break;
  2212. if ( *m_pCurrent!=',' )
  2213. return Error ( "expected ','" );
  2214. m_pCurrent++;
  2215. }
  2216. dRanges.Sort ();
  2217. for ( int i=0; i<dRanges.GetLength()-1; i++ )
  2218. {
  2219. if ( dRanges[i].m_iEnd>=dRanges[i+1].m_iStart )
  2220. {
  2221. // FIXME! add an ambiguity check
  2222. dRanges[i].m_iEnd = Max ( dRanges[i].m_iEnd, dRanges[i+1].m_iEnd );
  2223. dRanges.Remove ( i+1 );
  2224. i--;
  2225. }
  2226. }
  2227. return true;
  2228. }
  2229. //////////////////////////////////////////////////////////////////////////
  2230. bool sphParseCharset ( const char * sCharset, CSphVector<CSphRemapRange> & dRemaps )
  2231. {
  2232. CSphCharsetDefinitionParser tParser;
  2233. return tParser.Parse ( sCharset, dRemaps );
  2234. }
  2235. /////////////////////////////////////////////////////////////////////////////
  2236. CSphSavedFile::CSphSavedFile ()
  2237. : m_uSize ( 0 )
  2238. , m_uCTime ( 0 )
  2239. , m_uMTime ( 0 )
  2240. , m_uCRC32 ( 0 )
  2241. {
  2242. }
  2243. CSphEmbeddedFiles::CSphEmbeddedFiles ()
  2244. : m_bEmbeddedSynonyms ( false )
  2245. , m_bEmbeddedStopwords ( false )
  2246. , m_bEmbeddedWordforms ( false )
  2247. {
  2248. }
  2249. CSphTokenizerSettings::CSphTokenizerSettings ()
  2250. : m_iType ( TOKENIZER_SBCS )
  2251. , m_iMinWordLen ( 1 )
  2252. , m_iNgramLen ( 0 )
  2253. {
  2254. }
  2255. void LoadTokenizerSettings ( CSphReader & tReader, CSphTokenizerSettings & tSettings,
  2256. CSphEmbeddedFiles & tEmbeddedFiles, DWORD uVersion, CSphString & sWarning )
  2257. {
  2258. if ( uVersion<9 )
  2259. return;
  2260. tSettings.m_iType = tReader.GetByte ();
  2261. tSettings.m_sCaseFolding = tReader.GetString ();
  2262. tSettings.m_iMinWordLen = tReader.GetDword ();
  2263. tEmbeddedFiles.m_bEmbeddedSynonyms = false;
  2264. if ( uVersion>=30 )
  2265. {
  2266. tEmbeddedFiles.m_bEmbeddedSynonyms = !!tReader.GetByte();
  2267. if ( tEmbeddedFiles.m_bEmbeddedSynonyms )
  2268. {
  2269. int nSynonyms = (int)tReader.GetDword();
  2270. tEmbeddedFiles.m_dSynonyms.Resize ( nSynonyms );
  2271. ARRAY_FOREACH ( i, tEmbeddedFiles.m_dSynonyms )
  2272. tEmbeddedFiles.m_dSynonyms[i] = tReader.GetString();
  2273. }
  2274. }
  2275. tSettings.m_sSynonymsFile = tReader.GetString ();
  2276. ReadFileInfo ( tReader, tSettings.m_sSynonymsFile.cstr (),
  2277. tEmbeddedFiles.m_tSynonymFile, tEmbeddedFiles.m_bEmbeddedSynonyms ? NULL : &sWarning );
  2278. tSettings.m_sBoundary = tReader.GetString ();
  2279. tSettings.m_sIgnoreChars = tReader.GetString ();
  2280. tSettings.m_iNgramLen = tReader.GetDword ();
  2281. tSettings.m_sNgramChars = tReader.GetString ();
  2282. if ( uVersion>=15 )
  2283. tSettings.m_sBlendChars = tReader.GetString ();
  2284. if ( uVersion>=24 )
  2285. tSettings.m_sBlendMode = tReader.GetString();
  2286. }
  2287. /// gets called from and MUST be in sync with RtIndex_t::SaveDiskHeader()!
  2288. /// note that SaveDiskHeader() occasionaly uses some PREVIOUS format version!
  2289. void SaveTokenizerSettings ( CSphWriter & tWriter, ISphTokenizer * pTokenizer, int iEmbeddedLimit )
  2290. {
  2291. assert ( pTokenizer );
  2292. const CSphTokenizerSettings & tSettings = pTokenizer->GetSettings ();
  2293. tWriter.PutByte ( tSettings.m_iType );
  2294. tWriter.PutString ( tSettings.m_sCaseFolding.cstr () );
  2295. tWriter.PutDword ( tSettings.m_iMinWordLen );
  2296. bool bEmbedSynonyms = pTokenizer->GetSynFileInfo ().m_uSize<=(SphOffset_t)iEmbeddedLimit;
  2297. tWriter.PutByte ( bEmbedSynonyms ? 1 : 0 );
  2298. if ( bEmbedSynonyms )
  2299. pTokenizer->WriteSynonyms ( tWriter );
  2300. tWriter.PutString ( tSettings.m_sSynonymsFile.cstr () );
  2301. WriteFileInfo ( tWriter, pTokenizer->GetSynFileInfo () );
  2302. tWriter.PutString ( tSettings.m_sBoundary.cstr () );
  2303. tWriter.PutString ( tSettings.m_sIgnoreChars.cstr () );
  2304. tWriter.PutDword ( tSettings.m_iNgramLen );
  2305. tWriter.PutString ( tSettings.m_sNgramChars.cstr () );
  2306. tWriter.PutString ( tSettings.m_sBlendChars.cstr () );
  2307. tWriter.PutString ( tSettings.m_sBlendMode.cstr () );
  2308. }
  2309. void LoadDictionarySettings ( CSphReader & tReader, CSphDictSettings & tSettings,
  2310. CSphEmbeddedFiles & tEmbeddedFiles, DWORD uVersion, CSphString & sWarning )
  2311. {
  2312. if ( uVersion<9 )
  2313. return;
  2314. tSettings.m_sMorphology = tReader.GetString ();
  2315. tEmbeddedFiles.m_bEmbeddedStopwords = false;
  2316. if ( uVersion>=30 )
  2317. {
  2318. tEmbeddedFiles.m_bEmbeddedStopwords = !!tReader.GetByte();
  2319. if ( tEmbeddedFiles.m_bEmbeddedStopwords )
  2320. {
  2321. int nStopwords = (int)tReader.GetDword();
  2322. tEmbeddedFiles.m_dStopwords.Resize ( nStopwords );
  2323. ARRAY_FOREACH ( i, tEmbeddedFiles.m_dStopwords )
  2324. tEmbeddedFiles.m_dStopwords[i] = (SphWordID_t)tReader.UnzipOffset();
  2325. }
  2326. }
  2327. tSettings.m_sStopwords = tReader.GetString ();
  2328. int nFiles = tReader.GetDword ();
  2329. CSphString sFile;
  2330. tEmbeddedFiles.m_dStopwordFiles.Resize ( nFiles );
  2331. for ( int i = 0; i < nFiles; i++ )
  2332. {
  2333. sFile = tReader.GetString ();
  2334. ReadFileInfo ( tReader, sFile.cstr (), tEmbeddedFiles.m_dStopwordFiles[i], tEmbeddedFiles.m_bEmbeddedSynonyms ? NULL : &sWarning );
  2335. }
  2336. tEmbeddedFiles.m_bEmbeddedWordforms = false;
  2337. if ( uVersion>=30 )
  2338. {
  2339. tEmbeddedFiles.m_bEmbeddedWordforms = !!tReader.GetByte();
  2340. if ( tEmbeddedFiles.m_bEmbeddedWordforms )
  2341. {
  2342. int nWordforms = (int)tReader.GetDword();
  2343. tEmbeddedFiles.m_dWordforms.Resize ( nWordforms );
  2344. ARRAY_FOREACH ( i, tEmbeddedFiles.m_dWordforms )
  2345. tEmbeddedFiles.m_dWordforms[i] = tReader.GetString();
  2346. }
  2347. }
  2348. if ( uVersion>=29 )
  2349. tSettings.m_dWordforms.Resize ( tReader.GetDword() );
  2350. else
  2351. tSettings.m_dWordforms.Resize(1);
  2352. tEmbeddedFiles.m_dWordformFiles.Resize ( tSettings.m_dWordforms.GetLength() );
  2353. ARRAY_FOREACH ( i, tSettings.m_dWordforms )
  2354. {
  2355. tSettings.m_dWordforms[i] = tReader.GetString();
  2356. ReadFileInfo ( tReader, tSettings.m_dWordforms[i].cstr(),
  2357. tEmbeddedFiles.m_dWordformFiles[i], tEmbeddedFiles.m_bEmbeddedWordforms ? NULL : &sWarning );
  2358. }
  2359. if ( uVersion>=13 )
  2360. tSettings.m_iMinStemmingLen = tReader.GetDword ();
  2361. tSettings.m_bWordDict = false; // default to crc for old indexes
  2362. if ( uVersion>=21 )
  2363. tSettings.m_bWordDict = ( tReader.GetByte()!=0 );
  2364. if ( uVersion>=36 )
  2365. tSettings.m_bStopwordsStem = ( tReader.GetByte()!=0 );
  2366. if ( uVersion>=37 )
  2367. tSettings.m_sMorphFingerprint = tReader.GetString();
  2368. }
  2369. /// gets called from and MUST be in sync with RtIndex_t::SaveDiskHeader()!
  2370. /// note that SaveDiskHeader() occasionaly uses some PREVIOUS format version!
  2371. void SaveDictionarySettings ( CSphWriter & tWriter, CSphDict * pDict, bool bForceWordDict, int iEmbeddedLimit )
  2372. {
  2373. assert ( pDict );
  2374. const CSphDictSettings & tSettings = pDict->GetSettings ();
  2375. tWriter.PutString ( tSettings.m_sMorphology.cstr () );
  2376. const CSphVector <CSphSavedFile> & dSWFileInfos = pDict->GetStopwordsFileInfos ();
  2377. SphOffset_t uTotalSize = 0;
  2378. ARRAY_FOREACH ( i, dSWFileInfos )
  2379. uTotalSize += dSWFileInfos[i].m_uSize;
  2380. bool bEmbedStopwords = uTotalSize<=(SphOffset_t)iEmbeddedLimit;
  2381. tWriter.PutByte ( bEmbedStopwords ? 1 : 0 );
  2382. if ( bEmbedStopwords )
  2383. pDict->WriteStopwords ( tWriter );
  2384. tWriter.PutString ( tSettings.m_sStopwords.cstr () );
  2385. tWriter.PutDword ( dSWFileInfos.GetLength () );
  2386. ARRAY_FOREACH ( i, dSWFileInfos )
  2387. {
  2388. tWriter.PutString ( dSWFileInfos[i].m_sFilename.cstr () );
  2389. WriteFileInfo ( tWriter, dSWFileInfos[i] );
  2390. }
  2391. const CSphVector <CSphSavedFile> & dWFFileInfos = pDict->GetWordformsFileInfos ();
  2392. uTotalSize = 0;
  2393. ARRAY_FOREACH ( i, dWFFileInfos )
  2394. uTotalSize += dWFFileInfos[i].m_uSize;
  2395. bool bEmbedWordforms = uTotalSize<=(SphOffset_t)iEmbeddedLimit;
  2396. tWriter.PutByte ( bEmbedWordforms ? 1 : 0 );
  2397. if ( bEmbedWordforms )
  2398. pDict->WriteWordforms ( tWriter );
  2399. tWriter.PutDword ( dWFFileInfos.GetLength() );
  2400. ARRAY_FOREACH ( i, dWFFileInfos )
  2401. {
  2402. tWriter.PutString ( dWFFileInfos[i].m_sFilename.cstr() );
  2403. WriteFileInfo ( tWriter, dWFFileInfos[i] );
  2404. }
  2405. tWriter.PutDword ( tSettings.m_iMinStemmingLen );
  2406. tWriter.PutByte ( tSettings.m_bWordDict || bForceWordDict );
  2407. tWriter.PutByte ( tSettings.m_bStopwordsStem );
  2408. tWriter.PutString ( pDict->GetMorphDataFingerprint() );
  2409. }
  2410. void LoadFieldFilterSettings ( CSphReader & tReader, CSphFieldFilterSettings & tFieldFilterSettings )
  2411. {
  2412. int nRegexps = tReader.GetDword();
  2413. if ( !nRegexps )
  2414. return;
  2415. tFieldFilterSettings.m_dRegexps.Resize ( nRegexps );
  2416. ARRAY_FOREACH ( i, tFieldFilterSettings.m_dRegexps )
  2417. tFieldFilterSettings.m_dRegexps[i] = tReader.GetString();
  2418. tFieldFilterSettings.m_bUTF8 = !!tReader.GetByte();
  2419. }
  2420. void SaveFieldFilterSettings ( CSphWriter & tWriter, ISphFieldFilter * pFieldFilter )
  2421. {
  2422. if ( !pFieldFilter )
  2423. {
  2424. tWriter.PutDword ( 0 );
  2425. return;
  2426. }
  2427. CSphFieldFilterSettings tSettings;
  2428. pFieldFilter->GetSettings ( tSettings );
  2429. tWriter.PutDword ( tSettings.m_dRegexps.GetLength() );
  2430. ARRAY_FOREACH ( i, tSettings.m_dRegexps )
  2431. tWriter.PutString ( tSettings.m_dRegexps[i] );
  2432. tWriter.PutByte ( tSettings.m_bUTF8 ? 1 : 0 );
  2433. }
  2434. static inline bool ShortTokenFilter ( BYTE * pToken, int iLen )
  2435. {
  2436. return pToken[0]=='*' || ( iLen > 0 && pToken[iLen-1]=='*' );
  2437. }
  2438. /////////////////////////////////////////////////////////////////////////////
  2439. ISphTokenizer::ISphTokenizer ()
  2440. : m_iLastTokenLen ( 0 )
  2441. , m_bTokenBoundary ( false )
  2442. , m_bBoundary ( false )
  2443. , m_bWasSpecial ( false )
  2444. , m_iOvershortCount ( 0 )
  2445. , m_bBlended ( false )
  2446. , m_bNonBlended ( true )
  2447. , m_bBlendedPart ( false )
  2448. , m_bBlendAdd ( false )
  2449. , m_uBlendVariants ( BLEND_TRIM_NONE )
  2450. , m_uBlendVariantsPending ( 0 )
  2451. , m_bBlendSkipPure ( false )
  2452. , m_bShortTokenFilter ( false )
  2453. , m_bDetectSentences ( false )
  2454. , m_bPhrase ( false )
  2455. {}
  2456. bool ISphTokenizer::SetCaseFolding ( const char * sConfig, CSphString & sError )
  2457. {
  2458. CSphVector<CSphRemapRange> dRemaps;
  2459. CSphCharsetDefinitionParser tParser;
  2460. if ( !tParser.Parse ( sConfig, dRemaps ) )
  2461. {
  2462. sError = tParser.GetLastError();
  2463. return false;
  2464. }
  2465. const int MIN_CODE = 0x21;
  2466. ARRAY_FOREACH ( i, dRemaps )
  2467. {
  2468. CSphRemapRange & tMap = dRemaps[i];
  2469. if ( tMap.m_iStart<MIN_CODE || tMap.m_iStart>=m_tLC.MAX_CODE )
  2470. {
  2471. sphWarning ( "wrong character mapping start specified: U+%x, should be between U+%x and U+%x (inclusive); CLAMPED",
  2472. tMap.m_iStart, MIN_CODE, m_tLC.MAX_CODE-1 );
  2473. tMap.m_iStart = Min ( Max ( tMap.m_iStart, MIN_CODE ), m_tLC.MAX_CODE-1 );
  2474. }
  2475. if ( tMap.m_iEnd<MIN_CODE || tMap.m_iEnd>=m_tLC.MAX_CODE )
  2476. {
  2477. sphWarning ( "wrong character mapping end specified: U+%x, should be between U+%x and U+%x (inclusive); CLAMPED",
  2478. tMap.m_iEnd, MIN_CODE, m_tLC.MAX_CODE-1 );
  2479. tMap.m_iEnd = Min ( Max ( tMap.m_iEnd, MIN_CODE ), m_tLC.MAX_CODE-1 );
  2480. }
  2481. if ( tMap.m_iRemapStart<MIN_CODE || tMap.m_iRemapStart>=m_tLC.MAX_CODE )
  2482. {
  2483. sphWarning ( "wrong character remapping start specified: U+%x, should be between U+%x and U+%x (inclusive); CLAMPED",
  2484. tMap.m_iRemapStart, MIN_CODE, m_tLC.MAX_CODE-1 );
  2485. tMap.m_iRemapStart = Min ( Max ( tMap.m_iRemapStart, MIN_CODE ), m_tLC.MAX_CODE-1 );
  2486. }
  2487. int iRemapEnd = tMap.m_iRemapStart+tMap.m_iEnd-tMap.m_iStart;
  2488. if ( iRemapEnd<MIN_CODE || iRemapEnd>=m_tLC.MAX_CODE )
  2489. {
  2490. sphWarning ( "wrong character remapping end specified: U+%x, should be between U+%x and U+%x (inclusive); IGNORED",
  2491. iRemapEnd, MIN_CODE, m_tLC.MAX_CODE-1 );
  2492. dRemaps.Remove(i);
  2493. i--;
  2494. }
  2495. }
  2496. m_tLC.Reset ();
  2497. m_tLC.AddRemaps ( dRemaps, 0 );
  2498. return true;
  2499. }
  2500. void ISphTokenizer::AddPlainChar ( char c )
  2501. {
  2502. CSphVector<CSphRemapRange> dTmp ( 1 );
  2503. dTmp[0].m_iStart = dTmp[0].m_iEnd = dTmp[0].m_iRemapStart = c;
  2504. m_tLC.AddRemaps ( dTmp, 0 );
  2505. }
  2506. void ISphTokenizer::AddSpecials ( const char * sSpecials )
  2507. {
  2508. m_tLC.AddSpecials ( sSpecials );
  2509. }
  2510. static int TokenizeOnWhitespace ( CSphVector<CSphString> & dTokens, BYTE * sFrom, bool bUtf8 )
  2511. {
  2512. BYTE sAccum [ 3*SPH_MAX_WORD_LEN+16 ];
  2513. BYTE * pAccum = sAccum;
  2514. int iAccum = 0;
  2515. for ( ;; )
  2516. {
  2517. int iCode = bUtf8 ? sphUTF8Decode(sFrom) : *sFrom++;
  2518. // eof or whitespace?
  2519. if ( !iCode || sphIsSpace(iCode) )
  2520. {
  2521. // flush accum
  2522. if ( iAccum )
  2523. {
  2524. *pAccum = '\0';
  2525. dTokens.Add ( (char*)sAccum );
  2526. pAccum = sAccum;
  2527. iAccum = 0;
  2528. }
  2529. // break on eof
  2530. if ( !iCode )
  2531. break;
  2532. } else
  2533. {
  2534. // accumulate everything else
  2535. if ( iAccum<SPH_MAX_WORD_LEN )
  2536. {
  2537. if ( bUtf8 )
  2538. {
  2539. pAccum += sphUTF8Encode ( pAccum, iCode );
  2540. iAccum++;
  2541. } else
  2542. {
  2543. *pAccum++ = BYTE(iCode);
  2544. iAccum++;
  2545. }
  2546. }
  2547. }
  2548. }
  2549. return dTokens.GetLength();
  2550. }
  2551. static BYTE * sphTrim ( BYTE * s )
  2552. {
  2553. // skip to first non-whitespace from start
  2554. while ( *s && sphIsSpace(*s) )
  2555. s++;
  2556. if ( !*s )
  2557. return s;
  2558. // find the end
  2559. BYTE * sEnd = s;
  2560. while ( *sEnd )
  2561. sEnd++;
  2562. sEnd--;
  2563. // skip to first non-whitespace from end
  2564. while ( sEnd>s && sphIsSpace(*sEnd) )
  2565. sEnd--;
  2566. *++sEnd = '\0';
  2567. return s;
  2568. }
  2569. void ISphTokenizer::Setup ( const CSphTokenizerSettings & tSettings )
  2570. {
  2571. m_tSettings = tSettings;
  2572. }
  2573. ISphTokenizer * ISphTokenizer::Create ( const CSphTokenizerSettings & tSettings, const CSphEmbeddedFiles * pFiles, CSphString & sError )
  2574. {
  2575. CSphScopedPtr<ISphTokenizer> pTokenizer ( NULL );
  2576. switch ( tSettings.m_iType )
  2577. {
  2578. case TOKENIZER_SBCS: pTokenizer = sphCreateSBCSTokenizer (); break;
  2579. case TOKENIZER_UTF8: pTokenizer = sphCreateUTF8Tokenizer (); break;
  2580. case TOKENIZER_NGRAM: pTokenizer = sphCreateUTF8NgramTokenizer (); break;
  2581. default:
  2582. sError.SetSprintf ( "failed to create tokenizer (unknown charset type '%d')", tSettings.m_iType );
  2583. return NULL;
  2584. }
  2585. pTokenizer->Setup ( tSettings );
  2586. if ( !tSettings.m_sCaseFolding.IsEmpty () && !pTokenizer->SetCaseFolding ( tSettings.m_sCaseFolding.cstr (), sError ) )
  2587. {
  2588. sError.SetSprintf ( "'charset_table': %s", sError.cstr() );
  2589. return NULL;
  2590. }
  2591. if ( !tSettings.m_sSynonymsFile.IsEmpty () && !pTokenizer->LoadSynonyms ( tSettings.m_sSynonymsFile.cstr (),
  2592. pFiles && pFiles->m_bEmbeddedSynonyms ? pFiles : NULL, sError ) )
  2593. {
  2594. sError.SetSprintf ( "'synonyms': %s", sError.cstr() );
  2595. return NULL;
  2596. }
  2597. if ( !tSettings.m_sBoundary.IsEmpty () && !pTokenizer->SetBoundary ( tSettings.m_sBoundary.cstr (), sError ) )
  2598. {
  2599. sError.SetSprintf ( "'phrase_boundary': %s", sError.cstr() );
  2600. return NULL;
  2601. }
  2602. if ( !tSettings.m_sIgnoreChars.IsEmpty () && !pTokenizer->SetIgnoreChars ( tSettings.m_sIgnoreChars.cstr (), sError ) )
  2603. {
  2604. sError.SetSprintf ( "'ignore_chars': %s", sError.cstr() );
  2605. return NULL;
  2606. }
  2607. if ( !tSettings.m_sBlendChars.IsEmpty () && !pTokenizer->SetBlendChars ( tSettings.m_sBlendChars.cstr (), sError ) )
  2608. {
  2609. sError.SetSprintf ( "'blend_chars': %s", sError.cstr() );
  2610. return NULL;
  2611. }
  2612. if ( !pTokenizer->SetBlendMode ( tSettings.m_sBlendMode.cstr (), sError ) )
  2613. {
  2614. sError.SetSprintf ( "'blend_mode': %s", sError.cstr() );
  2615. return NULL;
  2616. }
  2617. pTokenizer->SetNgramLen ( tSettings.m_iNgramLen );
  2618. if ( !tSettings.m_sNgramChars.IsEmpty () && !pTokenizer->SetNgramChars ( tSettings.m_sNgramChars.cstr (), sError ) )
  2619. {
  2620. sError.SetSprintf ( "'ngram_chars': %s", sError.cstr() );
  2621. return NULL;
  2622. }
  2623. return pTokenizer.LeakPtr ();
  2624. }
  2625. ISphTokenizer * ISphTokenizer::CreateMultiformFilter ( ISphTokenizer * pTokenizer, const CSphMultiformContainer * pContainer )
  2626. {
  2627. if ( !pContainer )
  2628. return pTokenizer;
  2629. return new CSphMultiformTokenizer ( pTokenizer, pContainer );
  2630. }
  2631. ISphTokenizer * ISphTokenizer::CreateBigramFilter ( ISphTokenizer * pTokenizer, ESphBigram eBigramIndex, const CSphString & sBigramWords, CSphString & sError )
  2632. {
  2633. assert ( pTokenizer );
  2634. if ( eBigramIndex==SPH_BIGRAM_NONE )
  2635. return pTokenizer;
  2636. CSphVector<CSphString> dFreq;
  2637. if ( eBigramIndex!=SPH_BIGRAM_ALL )
  2638. {
  2639. const BYTE * pTok = NULL;
  2640. pTokenizer->SetBuffer ( (BYTE*)const_cast<char*> ( sBigramWords.cstr() ), sBigramWords.Length() );
  2641. while ( ( pTok = pTokenizer->GetToken() )!=NULL )
  2642. dFreq.Add ( (const char*)pTok );
  2643. if ( !dFreq.GetLength() )
  2644. {
  2645. SafeDelete ( pTokenizer );
  2646. sError.SetSprintf ( "bigram_freq_words does not contain any valid words" );
  2647. return NULL;
  2648. }
  2649. }
  2650. return new CSphBigramTokenizer ( pTokenizer, eBigramIndex, dFreq );
  2651. }
  2652. bool ISphTokenizer::AddSpecialsSPZ ( const char * sSpecials, const char * sDirective, CSphString & sError )
  2653. {
  2654. for ( int i=0; sSpecials[i]; i++ )
  2655. {
  2656. int iCode = m_tLC.ToLower ( sSpecials[i] );
  2657. if ( iCode & ( FLAG_CODEPOINT_NGRAM | FLAG_CODEPOINT_BOUNDARY | FLAG_CODEPOINT_IGNORE ) )
  2658. {
  2659. sError.SetSprintf ( "%s requires that character '%c' is not in ngram_chars, phrase_boundary, or ignore_chars",
  2660. sDirective, sSpecials[i] );
  2661. return false;
  2662. }
  2663. }
  2664. AddSpecials ( sSpecials );
  2665. return true;
  2666. }
  2667. bool ISphTokenizer::EnableSentenceIndexing ( CSphString & sError )
  2668. {
  2669. const char sSpecials[] = { '.', '?', '!', MAGIC_CODE_PARAGRAPH, 0 };
  2670. if ( !AddSpecialsSPZ ( sSpecials, "index_sp", sError ) )
  2671. return false;
  2672. m_bDetectSentences = true;
  2673. return true;
  2674. }
  2675. bool ISphTokenizer::EnableZoneIndexing ( CSphString & sError )
  2676. {
  2677. const char sSpecials[] = { MAGIC_CODE_ZONE, 0 };
  2678. return AddSpecialsSPZ ( sSpecials, "index_zones", sError );
  2679. }
  2680. //////////////////////////////////////////////////////////////////////////
  2681. CSphTokenizerBase::CSphTokenizerBase ()
  2682. : m_pBuffer ( NULL )
  2683. , m_pBufferMax ( NULL )
  2684. , m_pCur ( NULL )
  2685. , m_pTokenStart ( NULL )
  2686. , m_pTokenEnd ( NULL )
  2687. , m_iAccum ( 0 )
  2688. , m_bHasBlend ( false )
  2689. , m_pBlendStart ( NULL )
  2690. , m_pBlendEnd ( NULL )
  2691. , m_eMode ( SPH_CLONE_INDEX )
  2692. {
  2693. m_pAccum = m_sAccum;
  2694. }
  2695. bool CSphTokenizerBase::SetCaseFolding ( const char * sConfig, CSphString & sError )
  2696. {
  2697. assert ( m_eMode!=SPH_CLONE_QUERY_LIGHTWEIGHT );
  2698. if ( m_dSynonyms.GetLength() )
  2699. {
  2700. sError = "SetCaseFolding() must not be called after LoadSynonyms()";
  2701. return false;
  2702. }
  2703. m_bHasBlend = false;
  2704. return ISphTokenizer::SetCaseFolding ( sConfig, sError );
  2705. }
  2706. bool CSphTokenizerBase::SetBlendChars ( const char * sConfig, CSphString & sError )
  2707. {
  2708. assert ( m_eMode!=SPH_CLONE_QUERY_LIGHTWEIGHT );
  2709. bool bRes = ISphTokenizer::SetBlendChars ( sConfig, sError );
  2710. if ( bRes )
  2711. m_bHasBlend = true;
  2712. return bRes;
  2713. }
  2714. bool CSphTokenizerBase::LoadSynonym ( char * sBuffer, const char * sFilename,
  2715. int iLine, CSphSynonymHash & tHash, CSphString & sError )
  2716. {
  2717. assert ( m_eMode!=SPH_CLONE_QUERY_LIGHTWEIGHT );
  2718. CSphVector<CSphString> dFrom;
  2719. // extract map-from and map-to parts
  2720. char * sSplit = strstr ( sBuffer, "=>" );
  2721. if ( !sSplit )
  2722. {
  2723. sError.SetSprintf ( "%s line %d: mapping token (=>) not found", sFilename, iLine );
  2724. return false;
  2725. }
  2726. BYTE * sFrom = (BYTE *) sBuffer;
  2727. BYTE * sTo = (BYTE *)( sSplit + strlen ( "=>" ) );
  2728. *sSplit = '\0';
  2729. // tokenize map-from
  2730. if ( !TokenizeOnWhitespace ( dFrom, sFrom, IsUtf8() ) )
  2731. {
  2732. sError.SetSprintf ( "%s line %d: empty map-from part", sFilename, iLine );
  2733. return false;
  2734. }
  2735. // trim map-to
  2736. sTo = sphTrim ( sTo );
  2737. if ( !*sTo )
  2738. {
  2739. sError.SetSprintf ( "%s line %d: empty map-to part", sFilename, iLine );
  2740. return false;
  2741. }
  2742. // check lengths
  2743. ARRAY_FOREACH ( i, dFrom )
  2744. {
  2745. int iFromLen = IsUtf8() ? sphUTF8Len ( dFrom[i].cstr() ) : strlen ( dFrom[i].cstr() );
  2746. if ( iFromLen>SPH_MAX_WORD_LEN )
  2747. {
  2748. sError.SetSprintf ( "%s line %d: map-from token too long (over %d bytes)", sFilename, iLine, SPH_MAX_WORD_LEN );
  2749. return false;
  2750. }
  2751. }
  2752. int iToLen = IsUtf8() ? sphUTF8Len ( (const char*)sTo ) : strlen ( (const char*)sTo );
  2753. if ( iToLen>SPH_MAX_WORD_LEN )
  2754. {
  2755. sError.SetSprintf ( "%s line %d: map-to token too long (over %d bytes)", sFilename, iLine, SPH_MAX_WORD_LEN );
  2756. return false;
  2757. }
  2758. // pack and store it
  2759. int iFromLen = 1;
  2760. ARRAY_FOREACH ( i, dFrom )
  2761. iFromLen += strlen ( dFrom[i].cstr() ) + 1;
  2762. if ( iFromLen>MAX_SYNONYM_LEN )
  2763. {
  2764. sError.SetSprintf ( "%s line %d: map-from part too long (over %d bytes)", sFilename, iLine, MAX_SYNONYM_LEN );
  2765. return false;
  2766. }
  2767. CSphSynonym & tSyn = m_dSynonyms.Add ();
  2768. tSyn.m_sFrom.Reserve ( iFromLen );
  2769. tSyn.m_iFromLen = iFromLen;
  2770. tSyn.m_sTo = (char*)sTo;
  2771. tSyn.m_iToLen = iToLen;
  2772. char * sCur = const_cast<char*> ( tSyn.m_sFrom.cstr() );
  2773. ARRAY_FOREACH ( i, dFrom )
  2774. {
  2775. int iLen = strlen ( dFrom[i].cstr() );
  2776. memcpy ( sCur, dFrom[i].cstr(), iLen );
  2777. sCur[iLen] = MAGIC_SYNONYM_WHITESPACE;
  2778. sCur += iLen+1;
  2779. }
  2780. *sCur++ = '\0';
  2781. assert ( sCur-tSyn.m_sFrom.cstr()==iFromLen );
  2782. // track synonym-only codepoints in map-from
  2783. for ( ;; )
  2784. {
  2785. int iCode = IsUtf8() ? sphUTF8Decode(sFrom) : *sFrom++;
  2786. if ( !iCode )
  2787. break;
  2788. if ( iCode>0 && !sphIsSpace(iCode) && !m_tLC.ToLower(iCode) )
  2789. tHash.Add ( 1, iCode );
  2790. }
  2791. return true;
  2792. }
  2793. bool CSphTokenizerBase::LoadSynonyms ( const char * sFilename, const CSphEmbeddedFiles * pFiles, CSphString & sError )
  2794. {
  2795. assert ( m_eMode!=SPH_CLONE_QUERY_LIGHTWEIGHT );
  2796. m_dSynonyms.Reset ();
  2797. CSphSynonymHash hSynonymOnly;
  2798. if ( pFiles )
  2799. {
  2800. m_tSynFileInfo = pFiles->m_tSynonymFile;
  2801. ARRAY_FOREACH ( i, pFiles->m_dSynonyms )
  2802. {
  2803. if ( !LoadSynonym ( (char*)pFiles->m_dSynonyms[i].cstr(), pFiles->m_tSynonymFile.m_sFilename.cstr(), i, hSynonymOnly, sError ) )
  2804. sphWarning ( "%s", sError.cstr() );
  2805. }
  2806. } else
  2807. {
  2808. if ( !sFilename || !*sFilename )
  2809. return true;
  2810. GetFileStats ( sFilename, m_tSynFileInfo );
  2811. CSphAutoreader tReader;
  2812. if ( !tReader.Open ( sFilename, sError ) )
  2813. return NULL;
  2814. char sBuffer[1024];
  2815. int iLine = 0;
  2816. while ( tReader.GetLine ( sBuffer, sizeof(sBuffer) )>=0 )
  2817. {
  2818. iLine++;
  2819. if ( !LoadSynonym ( sBuffer, sFilename, iLine, hSynonymOnly, sError ) )
  2820. sphWarning ( "%s", sError.cstr() );
  2821. }
  2822. // sort the list
  2823. m_dSynonyms.Sort ();
  2824. }
  2825. // build simple lookup table
  2826. m_dSynStart.Resize ( 256 );
  2827. m_dSynEnd.Resize ( 256 );
  2828. for ( int i=0; i<256; i++ )
  2829. {
  2830. m_dSynStart[i] = INT_MAX;
  2831. m_dSynEnd[i] = -INT_MAX;
  2832. }
  2833. ARRAY_FOREACH ( i, m_dSynonyms )
  2834. {
  2835. int iCh = *(BYTE*)( m_dSynonyms[i].m_sFrom.cstr() );
  2836. m_dSynStart[iCh] = Min ( m_dSynStart[iCh], i );
  2837. m_dSynEnd[iCh] = Max ( m_dSynEnd[iCh], i );
  2838. }
  2839. // add synonym-only remaps
  2840. CSphVector<CSphRemapRange> dRemaps;
  2841. dRemaps.Reserve ( hSynonymOnly.GetLength() );
  2842. hSynonymOnly.IterateStart ();
  2843. while ( hSynonymOnly.IterateNext() )
  2844. {
  2845. CSphRemapRange & tRange = dRemaps.Add ();
  2846. tRange.m_iStart = tRange.m_iEnd = tRange.m_iRemapStart = hSynonymOnly.IterateGetKey();
  2847. }
  2848. m_tLC.AddRemaps ( dRemaps, FLAG_CODEPOINT_SYNONYM );
  2849. return true;
  2850. }
  2851. void CSphTokenizerBase::WriteSynonyms ( CSphWriter & tWriter )
  2852. {
  2853. tWriter.PutDword ( m_dSynonyms.GetLength() );
  2854. ARRAY_FOREACH ( i, m_dSynonyms )
  2855. {
  2856. CSphString sFrom, sLine;
  2857. sFrom = m_dSynonyms[i].m_sFrom;
  2858. char * pFrom = (char*)sFrom.cstr();
  2859. while ( pFrom && *pFrom )
  2860. {
  2861. if ( *pFrom==MAGIC_SYNONYM_WHITESPACE )
  2862. *pFrom = ' ';
  2863. pFrom++;
  2864. }
  2865. sFrom.Trim();
  2866. sLine.SetSprintf ( "%s => %s", sFrom.cstr(), m_dSynonyms[i].m_sTo.cstr() );
  2867. tWriter.PutString ( sLine );
  2868. }
  2869. }
  2870. void CSphTokenizerBase::CloneBase ( const CSphTokenizerBase * pFrom, ESphTokenizerClone eMode )
  2871. {
  2872. m_eMode = eMode;
  2873. m_dSynonyms = pFrom->m_dSynonyms;
  2874. m_dSynStart = pFrom->m_dSynStart;
  2875. m_dSynEnd = pFrom->m_dSynEnd;
  2876. m_tSettings = pFrom->m_tSettings;
  2877. m_bHasBlend = pFrom->m_bHasBlend;
  2878. m_uBlendVariants = pFrom->m_uBlendVariants;
  2879. m_bBlendSkipPure = pFrom->m_bBlendSkipPure;
  2880. m_bShortTokenFilter = ( eMode!=SPH_CLONE_INDEX );
  2881. switch ( eMode )
  2882. {
  2883. case SPH_CLONE_INDEX:
  2884. m_tLC = pFrom->m_tLC;
  2885. break;
  2886. case SPH_CLONE_QUERY:
  2887. {
  2888. m_tLC = pFrom->m_tLC;
  2889. CSphVector<CSphRemapRange> dRemaps;
  2890. CSphRemapRange Range;
  2891. Range.m_iStart = Range.m_iEnd = Range.m_iRemapStart = '\\';
  2892. dRemaps.Add ( Range );
  2893. m_tLC.AddRemaps ( dRemaps, FLAG_CODEPOINT_SPECIAL );
  2894. m_uBlendVariants = BLEND_TRIM_NONE;
  2895. break;
  2896. }
  2897. case SPH_CLONE_QUERY_LIGHTWEIGHT:
  2898. {
  2899. // FIXME? avoid double lightweight clones, too?
  2900. assert ( pFrom->m_eMode!=SPH_CLONE_INDEX );
  2901. assert ( pFrom->m_tLC.ToLower('\\') & FLAG_CODEPOINT_SPECIAL );
  2902. // lightweight tokenizer clone
  2903. // copy 3 KB of lowercaser chunk pointers, but do NOT copy the table data
  2904. SafeDelete ( m_tLC.m_pData );
  2905. m_tLC.m_iChunks = 0;
  2906. m_tLC.m_pData = NULL;
  2907. for ( int i=0; i<CSphLowercaser::CHUNK_COUNT; i++ )
  2908. m_tLC.m_pChunk[i] = pFrom->m_tLC.m_pChunk[i];
  2909. break;
  2910. }
  2911. }
  2912. }
  2913. void CSphTokenizerBase::SetBufferPtr ( const char * sNewPtr )
  2914. {
  2915. assert ( (BYTE*)sNewPtr>=m_pBuffer && (BYTE*)sNewPtr<=m_pBufferMax );
  2916. m_pCur = Min ( m_pBufferMax, Max ( m_pBuffer, (BYTE*)sNewPtr ) );
  2917. m_iAccum = 0;
  2918. m_pAccum = m_sAccum;
  2919. m_pTokenStart = m_pTokenEnd = NULL;
  2920. m_pBlendStart = m_pBlendEnd = NULL;
  2921. }
  2922. template < bool IS_UTF8 >
  2923. int CSphTokenizerBase2<IS_UTF8>::SkipBlended()
  2924. {
  2925. if ( !m_pBlendEnd )
  2926. return 0;
  2927. BYTE * pMax = m_pBufferMax;
  2928. m_pBufferMax = m_pBlendEnd;
  2929. // loop until the blended token end
  2930. int iBlended = 0; // how many blended subtokens we have seen so far
  2931. int iAccum = 0; // how many non-blended chars in a row we have seen so far
  2932. while ( m_pCur < m_pBufferMax )
  2933. {
  2934. int iCode = GetCodepoint();
  2935. if ( iCode=='\\' )
  2936. iCode = GetCodepoint(); // no boundary check, GetCP does it
  2937. iCode = m_tLC.ToLower ( iCode ); // no -1 check, ToLower does it
  2938. if ( iCode<0 )
  2939. iCode = 0;
  2940. if ( iCode & FLAG_CODEPOINT_BLEND )
  2941. iCode = 0;
  2942. if ( iCode & MASK_CODEPOINT )
  2943. {
  2944. iAccum++;
  2945. continue;
  2946. }
  2947. if ( iAccum>=m_tSettings.m_iMinWordLen )
  2948. iBlended++;
  2949. iAccum = 0;
  2950. }
  2951. if ( iAccum>=m_tSettings.m_iMinWordLen )
  2952. iBlended++;
  2953. m_pBufferMax = pMax;
  2954. return iBlended;
  2955. }
  2956. /// adjusts blending magic when we're about to return a token (any token)
  2957. /// returns false if current token should be skipped, true otherwise
  2958. bool CSphTokenizerBase::BlendAdjust ( BYTE * pCur )
  2959. {
  2960. // check if all we got is a bunch of blended characters (pure-blended case)
  2961. if ( m_bBlended && !m_bNonBlended )
  2962. {
  2963. // we either skip this token, or pretend it was normal
  2964. // in both cases, clear the flag
  2965. m_bBlended = false;
  2966. // do we need to skip it?
  2967. if ( m_bBlendSkipPure )
  2968. {
  2969. m_pBlendStart = NULL;
  2970. return false;
  2971. }
  2972. }
  2973. m_bNonBlended = false;
  2974. // adjust buffer pointers
  2975. if ( m_bBlended && m_pBlendStart )
  2976. {
  2977. // called once per blended token, on processing start
  2978. // at this point, full blended token is in the accumulator
  2979. // and we're about to return it
  2980. m_pCur = m_pBlendStart;
  2981. m_pBlendEnd = pCur;
  2982. m_pBlendStart = NULL;
  2983. m_bBlendedPart = true;
  2984. } else if ( pCur>=m_pBlendEnd )
  2985. {
  2986. // tricky bit, as at this point, token we're about to return
  2987. // can either be a blended subtoken, or the next one
  2988. m_bBlendedPart = ( m_pTokenStart!=NULL ) && ( m_pTokenStart<m_pBlendEnd );
  2989. m_pBlendEnd = NULL;
  2990. m_pBlendStart = NULL;
  2991. } else if ( !m_pBlendEnd )
  2992. {
  2993. // we aren't re-parsing blended; so clear the "blended subtoken" flag
  2994. m_bBlendedPart = false;
  2995. }
  2996. return true;
  2997. }
  2998. static inline void CopySubstring ( BYTE * pDst, const BYTE * pSrc, int iLen )
  2999. {
  3000. while ( iLen-->0 && *pSrc )
  3001. *pDst++ = *pSrc++;
  3002. *pDst++ = '\0';
  3003. }
  3004. template < bool IS_UTF8 >
  3005. BYTE * CSphTokenizerBase2<IS_UTF8>::GetBlendedVariant ()
  3006. {
  3007. // we can get called on several occasions
  3008. // case 1, a new blended token was just accumulated
  3009. if ( m_bBlended && !m_bBlendAdd )
  3010. {
  3011. // fast path for the default case (trim_none)
  3012. if ( m_uBlendVariants==BLEND_TRIM_NONE )
  3013. return m_sAccum;
  3014. // analyze the full token, find non-blended bounds
  3015. m_iBlendNormalStart = -1;
  3016. m_iBlendNormalEnd = -1;
  3017. // OPTIMIZE? we can skip this based on non-blended flag from adjust
  3018. BYTE * p = m_sAccum;
  3019. while ( *p )
  3020. {
  3021. int iLast = (int)( p-m_sAccum );
  3022. int iCode = IS_UTF8
  3023. ? sphUTF8Decode ( p )
  3024. : *p++;
  3025. if (!( m_tLC.ToLower ( iCode ) & FLAG_CODEPOINT_BLEND ))
  3026. {
  3027. m_iBlendNormalEnd = (int)( p-m_sAccum );
  3028. if ( m_iBlendNormalStart<0 )
  3029. m_iBlendNormalStart = iLast;
  3030. }
  3031. }
  3032. // build todo mask
  3033. // check and revert a few degenerate cases
  3034. m_uBlendVariantsPending = m_uBlendVariants;
  3035. if ( m_uBlendVariantsPending & BLEND_TRIM_BOTH )
  3036. {
  3037. if ( m_iBlendNormalStart<0 )
  3038. {
  3039. // no heading blended; revert BOTH to TAIL
  3040. m_uBlendVariantsPending &= ~BLEND_TRIM_BOTH;
  3041. m_uBlendVariantsPending |= BLEND_TRIM_TAIL;
  3042. } else if ( m_iBlendNormalEnd<0 )
  3043. {
  3044. // no trailing blended; revert BOTH to HEAD
  3045. m_uBlendVariantsPending &= ~BLEND_TRIM_BOTH;
  3046. m_uBlendVariantsPending |= BLEND_TRIM_HEAD;
  3047. }
  3048. }
  3049. if ( m_uBlendVariantsPending & BLEND_TRIM_HEAD )
  3050. {
  3051. // either no heading blended, or pure blended; revert HEAD to NONE
  3052. if ( m_iBlendNormalStart<=0 )
  3053. {
  3054. m_uBlendVariantsPending &= ~BLEND_TRIM_HEAD;
  3055. m_uBlendVariantsPending |= BLEND_TRIM_NONE;
  3056. }
  3057. }
  3058. if ( m_uBlendVariantsPending & BLEND_TRIM_TAIL )
  3059. {
  3060. // either no trailing blended, or pure blended; revert TAIL to NONE
  3061. if ( m_iBlendNormalEnd<=0 || m_sAccum[m_iBlendNormalEnd]==0 )
  3062. {
  3063. m_uBlendVariantsPending &= ~BLEND_TRIM_TAIL;
  3064. m_uBlendVariantsPending |= BLEND_TRIM_NONE;
  3065. }
  3066. }
  3067. // ok, we are going to return a few variants after all, flag that
  3068. // OPTIMIZE? add fast path for "single" variants?
  3069. m_bBlendAdd = true;
  3070. assert ( m_uBlendVariantsPending );
  3071. // we also have to stash the original blended token
  3072. // because accumulator contents may get trashed by caller (say, when stemming)
  3073. strncpy ( (char*)m_sAccumBlend, (char*)m_sAccum, sizeof(m_sAccumBlend) );
  3074. }
  3075. // case 2, caller is checking for pending variants, have we even got any?
  3076. if ( !m_bBlendAdd )
  3077. return false;
  3078. // handle trim_none
  3079. // this MUST be the first handler, so that we could avoid copying below, and just return the original accumulator
  3080. if ( m_uBlendVariantsPending & BLEND_TRIM_NONE )
  3081. {
  3082. m_uBlendVariantsPending &= ~BLEND_TRIM_NONE;
  3083. m_bBlended = true;
  3084. return m_sAccum;
  3085. }
  3086. // handle trim_both
  3087. if ( m_uBlendVariantsPending & BLEND_TRIM_BOTH )
  3088. {
  3089. m_uBlendVariantsPending &= ~BLEND_TRIM_BOTH;
  3090. if ( m_iBlendNormalStart<0 )
  3091. m_uBlendVariantsPending |= BLEND_TRIM_TAIL; // no heading blended; revert BOTH to TAIL
  3092. else if ( m_iBlendNormalEnd<0 )
  3093. m_uBlendVariantsPending |= BLEND_TRIM_HEAD; // no trailing blended; revert BOTH to HEAD
  3094. else
  3095. {
  3096. assert ( m_iBlendNormalStart<m_iBlendNormalEnd );
  3097. CopySubstring ( m_sAccum, m_sAccumBlend+m_iBlendNormalStart, m_iBlendNormalEnd-m_iBlendNormalStart );
  3098. m_bBlended = true;
  3099. return m_sAccum;
  3100. }
  3101. }
  3102. // handle TRIM_HEAD
  3103. if ( m_uBlendVariantsPending & BLEND_TRIM_HEAD )
  3104. {
  3105. m_uBlendVariantsPending &= ~BLEND_TRIM_HEAD;
  3106. if ( m_iBlendNormalStart>=0 )
  3107. {
  3108. // FIXME! need we check for overshorts?
  3109. CopySubstring ( m_sAccum, m_sAccumBlend+m_iBlendNormalStart, sizeof(m_sAccum) );
  3110. m_bBlended = true;
  3111. return m_sAccum;
  3112. }
  3113. }
  3114. // handle TRIM_TAIL
  3115. if ( m_uBlendVariantsPending & BLEND_TRIM_TAIL )
  3116. {
  3117. m_uBlendVariantsPending &= ~BLEND_TRIM_TAIL;
  3118. if ( m_iBlendNormalEnd>0 )
  3119. {
  3120. // FIXME! need we check for overshorts?
  3121. CopySubstring ( m_sAccum, m_sAccumBlend, m_iBlendNormalEnd );
  3122. m_bBlended = true;
  3123. return m_sAccum;
  3124. }
  3125. }
  3126. // all clear, no more variants to go
  3127. m_bBlendAdd = false;
  3128. return NULL;
  3129. }
  /// tell whether the given code is an uppercase ASCII Latin letter ('A' to 'Z')
  static inline bool IsCapital ( int iCh )
  {
      // anything below 'A' is out of range right away
      if ( iCh<'A' )
          return false;
      return iCh<='Z';
  }
  3134. static inline bool IsWhitespace ( BYTE c )
  3135. {
  3136. return ( c=='\0' || c==' ' || c=='\t' || c=='\r' || c=='\n' );
  3137. }
  /// tell whether a code is whitespace for tokenizing purposes
  /// (space, tab, CR, LF; zero is reported as whitespace too)
  static inline bool IsWhitespace ( int c )
  {
      if ( c==' ' || c=='\0' )
          return true;
      return ( c=='\t' || c=='\n' || c=='\r' );
  }
  3142. static inline bool IsBoundary ( BYTE c, bool bPhrase )
  3143. {
  3144. // FIXME? sorta intersects with specials
  3145. // then again, a shortened-down list (more strict syntax) is reasonble here too
  3146. return IsWhitespace(c) || c=='"' || ( !bPhrase && ( c=='(' || c==')' || c=='|' ) );
  3147. }
  3148. int CSphTokenizerBase::CodepointArbitrationI ( int iCode )
  3149. {
  3150. if ( !m_bDetectSentences )
  3151. return iCode;
  3152. // detect sentence boundaries
  3153. // FIXME! should use charset_table (or add a new directive) and support languages other than English
  3154. int iSymbol = iCode & MASK_CODEPOINT;
  3155. if ( iSymbol=='?' || iSymbol=='!' )
  3156. {
  3157. // definitely a sentence boundary
  3158. return MAGIC_CODE_SENTENCE | FLAG_CODEPOINT_SPECIAL;
  3159. }
  3160. if ( iSymbol=='.' )
  3161. {
  3162. // inline dot ("in the U.K and"), not a boundary
  3163. bool bInwordDot = ( sphIsAlpha ( m_pCur[0] ) || m_pCur[0]==',' );
  3164. // followed by a small letter or an opening paren, not a boundary
  3165. // FIXME? might want to scan for more than one space
  3166. // Yoyodine Inc. exists ...
  3167. // Yoyodine Inc. (the company) ..
  3168. bool bInphraseDot = ( sphIsSpace ( m_pCur[0] )
  3169. && ( ( 'a'<=m_pCur[1] && m_pCur[1]<='z' )
  3170. || ( m_pCur[1]=='(' && 'a'<=m_pCur[2] && m_pCur[2]<='z' ) ) );
  3171. // preceded by something that looks like a middle name, opening first name, salutation
  3172. bool bMiddleName = false;
  3173. switch ( m_iAccum )
  3174. {
  3175. case 1:
  3176. // 1-char capital letter
  3177. // example: J. R. R. Tolkien, who wrote Hobbit ...
  3178. // example: John D. Doe ...
  3179. bMiddleName = IsCapital ( m_pCur[-2] );
  3180. break;
  3181. case 2:
  3182. // 2-char token starting with a capital
  3183. if ( IsCapital ( m_pCur[-3] ) )
  3184. {
  3185. // capital+small
  3186. // example: Known as Mr. Doe ...
  3187. if ( !IsCapital ( m_pCur[-2] ) )
  3188. bMiddleName = true;
  3189. // known capital+capital (MR, DR, MS)
  3190. if (
  3191. ( m_pCur[-3]=='M' && m_pCur[-2]=='R' ) ||
  3192. ( m_pCur[-3]=='M' && m_pCur[-2]=='S' ) ||
  3193. ( m_pCur[-3]=='D' && m_pCur[-2]=='R' ) )
  3194. bMiddleName = true;
  3195. }
  3196. break;
  3197. case 3:
  3198. // preceded by a known 3-byte token (MRS, DRS)
  3199. // example: Survived by Mrs. Doe ...
  3200. if ( ( m_sAccum[0]=='m' || m_sAccum[0]=='d' ) && m_sAccum[1]=='r' && m_sAccum[2]=='s' )
  3201. bMiddleName = true;
  3202. break;
  3203. }
  3204. if ( !bInwordDot && !bInphraseDot && !bMiddleName )
  3205. {
  3206. // sentence boundary
  3207. return MAGIC_CODE_SENTENCE | FLAG_CODEPOINT_SPECIAL;
  3208. } else
  3209. {
  3210. // just a character
  3211. if ( ( iCode & MASK_FLAGS )==FLAG_CODEPOINT_SPECIAL )
  3212. return 0; // special only, not dual? then in this context, it is a separator
  3213. else
  3214. return iCode & ~( FLAG_CODEPOINT_SPECIAL | FLAG_CODEPOINT_DUAL ); // perhaps it was blended, so return the original code
  3215. }
  3216. }
  3217. // pass-through
  3218. return iCode;
  3219. }
  3220. int CSphTokenizerBase::CodepointArbitrationQ ( int iCode, bool bWasEscaped, BYTE uNextByte )
  3221. {
  3222. if ( iCode & FLAG_CODEPOINT_NGRAM )
  3223. return iCode; // ngrams are handled elsewhere
  3224. int iSymbol = iCode & MASK_CODEPOINT;
  3225. // codepoints can't be blended and special at the same time
  3226. if ( ( iCode & FLAG_CODEPOINT_BLEND ) && ( iCode & FLAG_CODEPOINT_SPECIAL ) )
  3227. {
  3228. bool bBlend =
  3229. bWasEscaped || // escaped characters should always act as blended
  3230. ( m_bPhrase && !sphIsModifier ( iSymbol ) && iSymbol!='"' ) || // non-modifier special inside phrase
  3231. ( m_iAccum && ( iSymbol=='@' || iSymbol=='/' || iSymbol=='-' ) ); // some specials in the middle of a token
  3232. // clear special or blend flags
  3233. iCode &= bBlend
  3234. ? ~( FLAG_CODEPOINT_DUAL | FLAG_CODEPOINT_SPECIAL )
  3235. : ~( FLAG_CODEPOINT_DUAL | FLAG_CODEPOINT_BLEND );
  3236. }
  3237. // escaped specials are not special
  3238. // dash and dollar inside the word are not special (however, single opening modifier is not a word!)
  3239. // non-modifier specials within phrase are not special
  3240. bool bDashInside = ( m_iAccum && iSymbol=='-' && !( m_iAccum==1 && sphIsModifier ( m_sAccum[0] ) ));
  3241. if ( iCode & FLAG_CODEPOINT_SPECIAL )
  3242. if ( bWasEscaped
  3243. || bDashInside
  3244. || ( m_iAccum && iSymbol=='$' && !IsBoundary ( uNextByte, m_bPhrase ) )
  3245. || ( m_bPhrase && iSymbol!='"' && !sphIsModifier ( iSymbol ) ) )
  3246. {
  3247. if ( iCode & FLAG_CODEPOINT_DUAL )
  3248. iCode &= ~( FLAG_CODEPOINT_SPECIAL | FLAG_CODEPOINT_DUAL );
  3249. else if ( bDashInside && ( iCode & FLAG_CODEPOINT_SYNONYM ) )
  3250. // if we return zero here, we will break the tokens like 'Ms-Dos'
  3251. iCode &= ~( FLAG_CODEPOINT_SPECIAL );
  3252. else
  3253. iCode = 0;
  3254. }
  3255. // if we didn't remove special by now, it must win
  3256. if ( iCode & FLAG_CODEPOINT_DUAL )
  3257. {
  3258. assert ( iCode & FLAG_CODEPOINT_SPECIAL );
  3259. iCode = iSymbol | FLAG_CODEPOINT_SPECIAL;
  3260. }
  3261. // ideally, all conflicts must be resolved here
  3262. // well, at least most
  3263. assert ( sphBitCount ( iCode & MASK_FLAGS )<=1
  3264. || ( iCode & FLAG_CODEPOINT_SYNONYM ) );
  3265. return iCode;
  3266. }
  /// outcome of matching the current input bytes against one synonym candidate (see SynCheckPrefix)
  enum SynCheck_e
  {
      SYNCHECK_LESS       = 0,    ///< candidate sorts before the input at the first mismatching byte
      SYNCHECK_PARTIAL    = 1,    ///< all checked bytes match, but the candidate has more bytes to go
      SYNCHECK_EXACT      = 2,    ///< candidate is fully matched
      SYNCHECK_GREATER    = 3     ///< candidate sorts after the input at the first mismatching byte
  };
  3274. static inline SynCheck_e SynCheckPrefix ( const CSphSynonym & tCandidate, int iOff, const BYTE * sCur, int iBytes, bool bMaybeSeparator )
  3275. {
  3276. const BYTE * sCand = ( (const BYTE*)tCandidate.m_sFrom.cstr() ) + iOff;
  3277. while ( iBytes-->0 )
  3278. {
  3279. if ( *sCand!=*sCur )
  3280. {
  3281. // incoming synonym-only char vs. ending sequence (eg. 2nd slash in "OS/2/3"); we actually have a match
  3282. if ( bMaybeSeparator && sCand[0]==MAGIC_SYNONYM_WHITESPACE && sCand[1]=='\0' )
  3283. return SYNCHECK_EXACT;
  3284. // otherwise, it is a mismatch
  3285. return ( *sCand<*sCur ) ? SYNCHECK_LESS : SYNCHECK_GREATER;
  3286. }
  3287. sCand++;
  3288. sCur++;
  3289. }
  3290. // full match after a full separator
  3291. if ( sCand[0]=='\0' )
  3292. return SYNCHECK_EXACT;
  3293. // full match after my last synonym-only char
  3294. if ( bMaybeSeparator && sCand[0]==MAGIC_SYNONYM_WHITESPACE && sCand[1]=='\0' )
  3295. return SYNCHECK_EXACT;
  3296. // otherwise, partial match so far
  3297. return SYNCHECK_PARTIAL;
  3298. }
  3299. #if !USE_WINDOWS
  3300. #define __forceinline inline
  3301. #endif
  3302. static __forceinline bool IsSeparator ( int iFolded, bool bFirst )
  3303. {
  3304. // eternal separator
  3305. if ( iFolded<0 || ( iFolded & MASK_CODEPOINT )==0 )
  3306. return true;
  3307. // just a codepoint
  3308. if (!( iFolded & MASK_FLAGS ))
  3309. return false;
  3310. // any magic flag, besides dual
  3311. if (!( iFolded & FLAG_CODEPOINT_DUAL ))
  3312. return true;
  3313. // FIXME? n-grams currently also set dual
  3314. if ( iFolded & FLAG_CODEPOINT_NGRAM )
  3315. return true;
  3316. // dual depends on position
  3317. return bFirst;
  3318. }
  3319. // handles escaped specials that are not in the character set
  3320. // returns true if the codepoint should be processed as a simple codepoint,
  3321. // returns false if it should be processed as a whitespace
  3322. // for example: aaa\!bbb => aaa bbb
  3323. static inline bool Special2Simple ( int & iCodepoint )
  3324. {
  3325. if ( ( iCodepoint & FLAG_CODEPOINT_DUAL ) || !( iCodepoint & FLAG_CODEPOINT_SPECIAL ) )
  3326. {
  3327. iCodepoint &= ~( FLAG_CODEPOINT_SPECIAL | FLAG_CODEPOINT_DUAL );
  3328. return true;
  3329. }
  3330. return false;
  3331. }
  3332. #if USE_WINDOWS
  3333. #pragma warning(disable:4127) // conditional expr is const for MSVC
  3334. #endif
  3335. template < bool IS_UTF8 >
  3336. BYTE * CSphTokenizerBase2<IS_UTF8>::GetTokenSyn ( bool bQueryMode )
  3337. {
  3338. assert ( m_dSynonyms.GetLength() );
  3339. BYTE * pCur;
  3340. m_bTokenBoundary = false;
  3341. for ( ;; )
  3342. {
  3343. // initialize accumulators and range
  3344. BYTE * pFirstSeparator = NULL;
  3345. m_iAccum = 0;
  3346. m_pAccum = m_sAccum;
  3347. int iSynStart = 0;
  3348. int iSynEnd = m_dSynonyms.GetLength()-1;
  3349. int iSynOff = 0;
  3350. int iLastCodepoint = 0;
  3351. int iLastFolded = 0;
  3352. BYTE * pRescan = NULL;
  3353. int iExact = -1;
  3354. BYTE * pExact = NULL;
  3355. // main refinement loop
  3356. for ( ;; )
  3357. {
  3358. // store current position (to be able to restart from it on folded boundary)
  3359. pCur = m_pCur;
  3360. // get next codepoint, fold it, lookup flags
  3361. int iCode;
  3362. int iFolded;
  3363. if ( pCur<m_pBufferMax && *pCur<128 )
  3364. {
  3365. // fastpath, ascii7 is identical in both SBCS and UTF8 encodings
  3366. iCode = *m_pCur++;
  3367. iFolded = m_tLC.m_pChunk[0][iCode];
  3368. } else
  3369. {
  3370. iCode = GetCodepoint(); // advances m_pCur
  3371. iFolded = m_tLC.ToLower ( iCode );
  3372. }
  3373. // handle early-out
  3374. if ( iCode<0 )
  3375. {
  3376. // eof at token start? we're done
  3377. if ( iSynOff==0 )
  3378. return NULL;
  3379. // eof after whitespace? we already checked the candidate last time, so break
  3380. if ( iLastFolded==0 )
  3381. break;
  3382. }
  3383. // handle boundaries
  3384. if ( m_bBoundary && ( iFolded==0 ) ) m_bTokenBoundary = true;
  3385. m_bBoundary = ( iFolded & FLAG_CODEPOINT_BOUNDARY )!=0;
  3386. // handle escapes
  3387. if ( bQueryMode )
  3388. {
  3389. if ( iCode=='\\' && iLastCodepoint!='\\' )
  3390. {
  3391. iLastCodepoint = iCode;
  3392. continue;
  3393. } else if ( iLastCodepoint=='\\' && !Special2Simple ( iFolded ) )
  3394. {
  3395. iLastCodepoint = 0;
  3396. continue;
  3397. }
  3398. iLastCodepoint = iCode;
  3399. }
  3400. // skip continuous whitespace
  3401. // (must be here, because boundaries and escapes might fold to whitespace)
  3402. if ( iLastFolded==0 && iFolded==0 )
  3403. continue;
  3404. if ( bQueryMode )
  3405. iFolded = CodepointArbitrationQ ( iFolded, false, *m_pCur );
  3406. else if ( m_bDetectSentences )
  3407. iFolded = CodepointArbitrationI ( iFolded );
  3408. iLastFolded = iFolded;
  3409. if ( m_iAccum==0 )
  3410. m_pTokenStart = pCur;
  3411. // handle specials at the very word start
  3412. if ( ( iFolded & FLAG_CODEPOINT_SPECIAL ) && m_iAccum==0 )
  3413. {
  3414. m_bWasSpecial = !( iFolded & FLAG_CODEPOINT_NGRAM );
  3415. AccumCodepoint ( iFolded & MASK_CODEPOINT );
  3416. *m_pAccum = '\0';
  3417. m_iLastTokenLen = 1;
  3418. m_pTokenStart = pCur;
  3419. m_pTokenEnd = m_pCur;
  3420. return m_sAccum;
  3421. }
  3422. // handle specials
  3423. bool bJustSpecial = ( iFolded & FLAG_CODEPOINT_SPECIAL )
  3424. && !( iFolded & FLAG_CODEPOINT_DUAL ) // OPTIMIZE?
  3425. && !( iFolded & FLAG_CODEPOINT_SYNONYM ); // OPTIMIZE?
  3426. // if candidate starts with something special, and turns out to be not a synonym,
  3427. // we will need to rescan from current position later
  3428. if ( iSynOff==0 )
  3429. pRescan = IsSeparator ( iFolded, true ) ? m_pCur : NULL;
  3430. // accumulate folded token
  3431. if ( !pFirstSeparator )
  3432. {
  3433. if ( IsSeparator ( iFolded, m_iAccum==0 ) )
  3434. {
  3435. if ( m_iAccum )
  3436. pFirstSeparator = pCur;
  3437. } else
  3438. {
  3439. if ( m_iAccum==0 )
  3440. m_pTokenStart = pCur;
  3441. AccumCodepoint ( iFolded & MASK_CODEPOINT );
  3442. }
  3443. }
  3444. // accumulate next raw synonym symbol to refine
  3445. // note that we need a special check for whitespace here, to avoid "MS*DOS" being treated as "MS DOS" synonym
  3446. BYTE sTest[4];
  3447. int iTest;
  3448. int iMasked = ( iCode & MASK_CODEPOINT );
  3449. if ( iFolded<=0 || bJustSpecial )
  3450. {
  3451. sTest[0] = MAGIC_SYNONYM_WHITESPACE;
  3452. iTest = 1;
  3453. if (!( iMasked==' ' || iMasked=='\t' ))
  3454. {
  3455. sTest[1] = '\0';
  3456. iTest = 2;
  3457. }
  3458. } else
  3459. {
  3460. if ( IsUtf8() )
  3461. {
  3462. iTest = sphUTF8Encode ( sTest, iMasked );
  3463. } else
  3464. {
  3465. iTest = 1;
  3466. sTest[0] = BYTE(iMasked);
  3467. }
  3468. }
  3469. // refine synonyms range
  3470. #define LOC_RETURN_SYNONYM(_idx) \
  3471. { \
  3472. m_pTokenEnd = m_iAccum ? pCur : m_pCur; \
  3473. if ( bJustSpecial || ( iFolded & FLAG_CODEPOINT_SPECIAL )!=0 ) m_pCur = pCur; \
  3474. strncpy ( (char*)m_sAccum, m_dSynonyms[_idx].m_sTo.cstr(), sizeof(m_sAccum) ); \
  3475. m_iLastTokenLen = m_dSynonyms[_idx].m_iToLen; \
  3476. return m_sAccum; \
  3477. }
  3478. #define LOC_REFINE_BREAK() \
  3479. { \
  3480. if ( iExact>=0 ) { m_pCur = pCur = pExact; LOC_RETURN_SYNONYM ( iExact ); } \
  3481. break; \
  3482. }
  3483. // if this is the first symbol, use prebuilt lookup table to speedup initial range search
  3484. if ( iSynOff==0 )
  3485. {
  3486. iSynStart = m_dSynStart[sTest[0]];
  3487. iSynEnd = m_dSynEnd[sTest[0]];
  3488. if ( iSynStart>iSynEnd )
  3489. break;
  3490. }
  3491. // this is to catch intermediate separators (eg. "OS/2/3")
  3492. bool bMaybeSeparator = ( iFolded & FLAG_CODEPOINT_SYNONYM )!=0 || ( iFolded<0 );
  3493. SynCheck_e eStart = SynCheckPrefix ( m_dSynonyms[iSynStart], iSynOff, sTest, iTest, bMaybeSeparator );
  3494. if ( eStart==SYNCHECK_EXACT )
  3495. {
  3496. if ( iSynStart==iSynEnd ) LOC_RETURN_SYNONYM ( iSynStart );
  3497. iExact = iSynStart;
  3498. pExact = pCur;
  3499. }
  3500. if ( eStart==SYNCHECK_GREATER || ( iSynStart==iSynEnd && eStart!=SYNCHECK_PARTIAL ) )
  3501. LOC_REFINE_BREAK();
  3502. SynCheck_e eEnd = SynCheckPrefix ( m_dSynonyms[iSynEnd], iSynOff, sTest, iTest, bMaybeSeparator );
  3503. if ( eEnd==SYNCHECK_LESS )
  3504. LOC_REFINE_BREAK();
  3505. if ( eEnd==SYNCHECK_EXACT )
  3506. {
  3507. iExact = iSynEnd;
  3508. pExact = pCur;
  3509. }
  3510. // refine left boundary
  3511. if ( eStart!=SYNCHECK_PARTIAL && eStart!=SYNCHECK_EXACT )
  3512. {
  3513. assert ( eStart==SYNCHECK_LESS );
  3514. int iL = iSynStart;
  3515. int iR = iSynEnd;
  3516. SynCheck_e eL = eStart;
  3517. SynCheck_e eR = eEnd;
  3518. while ( iR-iL>1 )
  3519. {
  3520. int iM = iL + (iR-iL)/2;
  3521. SynCheck_e eMid = SynCheckPrefix ( m_dSynonyms[iM], iSynOff, sTest, iTest, bMaybeSeparator );
  3522. if ( eMid==SYNCHECK_LESS )
  3523. {
  3524. iL = iM;
  3525. eL = eMid;
  3526. } else
  3527. {
  3528. iR = iM;
  3529. eR = eMid;
  3530. }
  3531. }
  3532. assert ( eL==SYNCHECK_LESS );
  3533. assert ( eR!=SYNCHECK_LESS );
  3534. assert ( iR-iL==1 );
  3535. if ( eR==SYNCHECK_GREATER ) LOC_REFINE_BREAK();
  3536. if ( eR==SYNCHECK_EXACT && iR==iSynEnd ) LOC_RETURN_SYNONYM ( iR );
  3537. assert ( eR==SYNCHECK_PARTIAL || eR==SYNCHECK_EXACT );
  3538. iSynStart = iR;
  3539. eStart = eR;
  3540. }
  3541. // refine right boundary
  3542. if ( eEnd!=SYNCHECK_PARTIAL && eEnd!=SYNCHECK_EXACT )
  3543. {
  3544. assert ( eEnd==SYNCHECK_GREATER );
  3545. int iL = iSynStart;
  3546. int iR = iSynEnd;
  3547. SynCheck_e eL = eStart;
  3548. SynCheck_e eR = eEnd;
  3549. while ( iR-iL>1 )
  3550. {
  3551. int iM = iL + (iR-iL)/2;
  3552. SynCheck_e eMid = SynCheckPrefix ( m_dSynonyms[iM], iSynOff, sTest, iTest, bMaybeSeparator );
  3553. if ( eMid==SYNCHECK_GREATER )
  3554. {
  3555. iR = iM;
  3556. eR = eMid;
  3557. } else
  3558. {
  3559. iL = iM;
  3560. eL = eMid;
  3561. }
  3562. }
  3563. assert ( eR==SYNCHECK_GREATER );
  3564. assert ( eL!=SYNCHECK_GREATER );
  3565. assert ( iR-iL==1 );
  3566. if ( eL==SYNCHECK_LESS ) LOC_REFINE_BREAK();
  3567. if ( eL==SYNCHECK_EXACT && iL==iSynStart ) LOC_RETURN_SYNONYM ( iL );
  3568. assert ( eL==SYNCHECK_PARTIAL || eL==SYNCHECK_EXACT );
  3569. iSynEnd = iL;
  3570. eEnd = eL;
  3571. }
  3572. // handle eof
  3573. if ( iCode<0 )
  3574. break;
  3575. // we still have a partial synonym match, continue;
  3576. iSynOff += iTest;
  3577. }
  3578. // at this point, that was not a synonym
  3579. if ( pRescan )
  3580. {
  3581. m_pCur = pRescan;
  3582. continue;
  3583. }
  3584. // at this point, it also started with a valid char
  3585. assert ( m_iAccum>0 );
  3586. // find the proper separator
  3587. if ( !pFirstSeparator )
  3588. {
  3589. // if there was none, scan until found
  3590. for ( ;; )
  3591. {
  3592. pCur = m_pCur;
  3593. int iCode;
  3594. int iFolded;
  3595. if ( pCur<m_pBufferMax && *pCur<128 )
  3596. {
  3597. // fastpath, ascii7 is identical in both SBCS and UTF8 encodings
  3598. iCode = *m_pCur++;
  3599. iFolded = m_tLC.m_pChunk[0][iCode];
  3600. } else
  3601. {
  3602. iCode = GetCodepoint(); // advances m_pCur
  3603. iFolded = m_tLC.ToLower ( iCode );
  3604. }
  3605. if ( iFolded<0 )
  3606. break; // eof
  3607. if ( bQueryMode && iCode=='\\' )
  3608. {
  3609. iCode = GetCodepoint(); // advances m_pCur
  3610. iFolded = m_tLC.ToLower ( iCode );
  3611. if ( iFolded<0 )
  3612. break;
  3613. if ( !Special2Simple ( iFolded ) )
  3614. break;
  3615. }
  3616. if ( bQueryMode )
  3617. iFolded = CodepointArbitrationQ ( iFolded, false, *m_pCur );
  3618. else if ( m_bDetectSentences )
  3619. iFolded = CodepointArbitrationI ( iFolded );
  3620. if ( IsSeparator ( iFolded, false ) )
  3621. {
  3622. if ( iFolded!=0 )
  3623. m_pCur = pCur; // force rescan
  3624. break;
  3625. }
  3626. // the hottest accumulation point
  3627. // so do this manually, no function calls, that is quickest
  3628. if ( m_iAccum<SPH_MAX_WORD_LEN )
  3629. {
  3630. m_iAccum++;
  3631. if ( IS_UTF8 )
  3632. {
  3633. iFolded &= MASK_CODEPOINT;
  3634. SPH_UTF8_ENCODE ( m_pAccum, iFolded );
  3635. } else
  3636. *m_pAccum++ = BYTE(iFolded);
  3637. }
  3638. }
  3639. } else
  3640. {
  3641. // if there was, token is ready but we should restart from that separator
  3642. m_pCur = pFirstSeparator;
  3643. pCur = m_pCur;
  3644. }
  3645. // return accumulated token
  3646. if ( m_iAccum<m_tSettings.m_iMinWordLen )
  3647. {
  3648. if ( m_bShortTokenFilter )
  3649. {
  3650. *m_pAccum = '\0';
  3651. if ( ShortTokenFilter ( m_sAccum, m_iAccum ) )
  3652. {
  3653. m_iLastTokenLen = m_iAccum;
  3654. m_pTokenEnd = pCur;
  3655. m_iAccum = 0;
  3656. return m_sAccum;
  3657. }
  3658. }
  3659. if ( m_iAccum )
  3660. m_iOvershortCount++;
  3661. m_iAccum = 0;
  3662. continue;
  3663. }
  3664. *m_pAccum = '\0';
  3665. m_iLastTokenLen = m_iAccum;
  3666. m_pTokenEnd = pCur;
  3667. return m_sAccum;
  3668. }
  3669. }
  3670. #if USE_WINDOWS
  3671. #pragma warning(default:4127) // conditional expr is const for MSVC
  3672. #endif
  3673. bool ISphTokenizer::RemapCharacters ( const char * sConfig, DWORD uFlags, const char * sSource, bool bCanRemap, CSphString & sError )
  3674. {
  3675. // parse
  3676. CSphVector<CSphRemapRange> dRemaps;
  3677. CSphCharsetDefinitionParser tParser;
  3678. if ( !tParser.Parse ( sConfig, dRemaps ) )
  3679. {
  3680. sError = tParser.GetLastError();
  3681. return false;
  3682. }
  3683. // check
  3684. ARRAY_FOREACH ( i, dRemaps )
  3685. {
  3686. const CSphRemapRange & r = dRemaps[i];
  3687. if ( !bCanRemap && r.m_iStart!=r.m_iRemapStart )
  3688. {
  3689. sError.SetSprintf ( "%s characters must not be remapped (map-from=U+%x, map-to=U+%x)",
  3690. sSource, r.m_iStart, r.m_iRemapStart );
  3691. return false;
  3692. }
  3693. for ( int j=r.m_iStart; j<=r.m_iEnd; j++ )
  3694. if ( m_tLC.ToLower(j) )
  3695. {
  3696. sError.SetSprintf ( "%s characters must not be referenced anywhere else (code=U+%x)", sSource, j );
  3697. return false;
  3698. }
  3699. if ( bCanRemap )
  3700. for ( int j=r.m_iRemapStart; j<=r.m_iRemapStart + r.m_iEnd - r.m_iStart; j++ )
  3701. if ( m_tLC.ToLower(j) )
  3702. {
  3703. sError.SetSprintf ( "%s characters must not be referenced anywhere else (code=U+%x)", sSource, j );
  3704. return false;
  3705. }
  3706. }
  3707. // add mapping
  3708. m_tLC.AddRemaps ( dRemaps, uFlags );
  3709. return true;
  3710. }
  3711. bool ISphTokenizer::SetBoundary ( const char * sConfig, CSphString & sError )
  3712. {
  3713. return RemapCharacters ( sConfig, FLAG_CODEPOINT_BOUNDARY, "phrase boundary", false, sError );
  3714. }
  3715. bool ISphTokenizer::SetIgnoreChars ( const char * sConfig, CSphString & sError )
  3716. {
  3717. return RemapCharacters ( sConfig, FLAG_CODEPOINT_IGNORE, "ignored", false, sError );
  3718. }
  3719. bool ISphTokenizer::SetBlendChars ( const char * sConfig, CSphString & sError )
  3720. {
  3721. return RemapCharacters ( sConfig, FLAG_CODEPOINT_BLEND, "blend", true, sError );
  3722. }
  /// check whether the first iCheck bytes of sCheck are exactly the zero-terminated string sRef
  /// NOTE, unlike strncmp(), this returns true on a match
  static bool sphStrncmp ( const char * sCheck, int iCheck, const char * sRef )
  {
      // lengths must agree before the contents are even looked at
      const int iRefLen = (int)strlen(sRef);
      if ( iCheck!=iRefLen )
          return false;

      return memcmp ( sCheck, sRef, iCheck )==0;
  }
  3727. bool ISphTokenizer::SetBlendMode ( const char * sMode, CSphString & sError )
  3728. {
  3729. if ( !sMode || !*sMode )
  3730. {
  3731. m_uBlendVariants = BLEND_TRIM_NONE;
  3732. m_bBlendSkipPure = false;
  3733. return true;
  3734. }
  3735. m_uBlendVariants = 0;
  3736. const char * p = sMode;
  3737. while ( *p )
  3738. {
  3739. while ( !sphIsAlpha(*p) )
  3740. p++;
  3741. if ( !*p )
  3742. break;
  3743. const char * sTok = p;
  3744. while ( sphIsAlpha(*p) )
  3745. p++;
  3746. if ( sphStrncmp ( sTok, p-sTok, "trim_none" ) )
  3747. m_uBlendVariants |= BLEND_TRIM_NONE;
  3748. else if ( sphStrncmp ( sTok, p-sTok, "trim_head" ) )
  3749. m_uBlendVariants |= BLEND_TRIM_HEAD;
  3750. else if ( sphStrncmp ( sTok, p-sTok, "trim_tail" ) )
  3751. m_uBlendVariants |= BLEND_TRIM_TAIL;
  3752. else if ( sphStrncmp ( sTok, p-sTok, "trim_both" ) )
  3753. m_uBlendVariants |= BLEND_TRIM_BOTH;
  3754. else if ( sphStrncmp ( sTok, p-sTok, "skip_pure" ) )
  3755. m_bBlendSkipPure = true;
  3756. else
  3757. {
  3758. sError.SetSprintf ( "unknown blend_mode option near '%s'", sTok );
  3759. return false;
  3760. }
  3761. }
  3762. if ( !m_uBlendVariants )
  3763. {
  3764. sError.SetSprintf ( "blend_mode must define at least one variant to index" );
  3765. m_uBlendVariants = BLEND_TRIM_NONE;
  3766. m_bBlendSkipPure = false;
  3767. return false;
  3768. }
  3769. return true;
  3770. }
  3771. /////////////////////////////////////////////////////////////////////////////
  3772. template < bool IS_QUERY >
  3773. CSphTokenizer_SBCS<IS_QUERY>::CSphTokenizer_SBCS ()
  3774. {
  3775. CSphString sTmp;
  3776. SetCaseFolding ( SPHINX_DEFAULT_SBCS_TABLE, sTmp );
  3777. }
  3778. template < bool IS_QUERY >
  3779. void CSphTokenizer_SBCS<IS_QUERY>::SetBuffer ( BYTE * sBuffer, int iLength )
  3780. {
  3781. // check that old one is over and that new length is sane
  3782. assert ( iLength>=0 );
  3783. // set buffer
  3784. m_pBuffer = sBuffer;
  3785. m_pBufferMax = sBuffer + iLength;
  3786. m_pCur = sBuffer;
  3787. m_pTokenStart = m_pTokenEnd = NULL;
  3788. m_pBlendStart = m_pBlendEnd = NULL;
  3789. m_iOvershortCount = 0;
  3790. m_bBoundary = m_bTokenBoundary = false;
  3791. }
  3792. #if USE_WINDOWS
  3793. #pragma warning(disable:4127) // conditional expr is const for MSVC
  3794. #endif
  3795. template < bool IS_QUERY >
  3796. BYTE * CSphTokenizer_SBCS<IS_QUERY>::GetToken ()
  3797. {
  3798. m_bWasSpecial = false;
  3799. m_bBlended = false;
  3800. m_iOvershortCount = 0;
  3801. m_bTokenBoundary = false;
  3802. if ( m_dSynonyms.GetLength() )
  3803. return GetTokenSyn ( IS_QUERY );
  3804. // return pending blending variants
  3805. BYTE * pVar = GetBlendedVariant ();
  3806. if ( pVar )
  3807. return pVar;
  3808. m_bBlendedPart = ( m_pBlendEnd!=NULL );
  3809. for ( ;; )
  3810. {
  3811. // memorize buffer start
  3812. BYTE * pCur = m_pCur;
  3813. // get next codepoint, real or virtual
  3814. int iCodepoint = 0;
  3815. int iCode = 0;
  3816. bool bWasEscaped = false; // whether current char was escaped
  3817. if ( m_pCur<m_pBufferMax )
  3818. {
  3819. // get next codepoint
  3820. iCodepoint = *m_pCur++;
  3821. iCode = m_tLC.ToLower ( iCodepoint );
  3822. // handle escaping
  3823. if ( IS_QUERY && iCodepoint=='\\' )
  3824. {
  3825. if ( m_pCur<m_pBufferMax )
  3826. {
  3827. // fetch, fold, and then forcibly demote special
  3828. iCodepoint = *m_pCur++;
  3829. iCode = m_tLC.ToLower ( iCodepoint );
  3830. if ( !Special2Simple ( iCode ) )
  3831. iCode = 0;
  3832. bWasEscaped = true;
  3833. } else
  3834. {
  3835. // stray slash on a buffer end
  3836. // handle it as a separator
  3837. iCode = 0;
  3838. }
  3839. }
  3840. } else
  3841. {
  3842. // out of buffer
  3843. // but still need to handle short tokens
  3844. if ( m_iAccum<m_tSettings.m_iMinWordLen )
  3845. {
  3846. bool bShortToken = false;
  3847. if ( m_bShortTokenFilter )
  3848. {
  3849. m_sAccum[m_iAccum] = '\0';
  3850. if ( ShortTokenFilter ( m_sAccum, m_iAccum ) )
  3851. bShortToken = true;
  3852. }
  3853. if ( !bShortToken )
  3854. {
  3855. if ( m_iAccum )
  3856. m_iOvershortCount++;
  3857. m_iAccum = 0;
  3858. m_iLastTokenLen = 0;
  3859. BlendAdjust ( pCur );
  3860. return NULL;
  3861. }
  3862. }
  3863. }
  3864. // handle all the flags..
  3865. if ( IS_QUERY )
  3866. iCode = CodepointArbitrationQ ( iCode, bWasEscaped, *m_pCur );
  3867. else if ( m_bDetectSentences )
  3868. iCode = CodepointArbitrationI ( iCode );
  3869. // handle ignored chars
  3870. if ( iCode & FLAG_CODEPOINT_IGNORE )
  3871. continue;
  3872. // handle blended characters
  3873. if ( iCode & FLAG_CODEPOINT_BLEND )
  3874. {
  3875. if ( m_pBlendEnd )
  3876. iCode = 0;
  3877. else
  3878. {
  3879. m_bBlended = true;
  3880. m_pBlendStart = m_iAccum ? m_pTokenStart : pCur;
  3881. }
  3882. }
  3883. // handle whitespace and boundary
  3884. if ( m_bBoundary && ( iCode==0 ) )
  3885. {
  3886. m_bTokenBoundary = true;
  3887. m_iBoundaryOffset = pCur - m_pBuffer - 1;
  3888. }
  3889. m_bBoundary = ( iCode & FLAG_CODEPOINT_BOUNDARY )!=0;
  3890. if ( iCode==0 || m_bBoundary )
  3891. {
  3892. if ( m_iAccum<m_tSettings.m_iMinWordLen )
  3893. {
  3894. bool bShortToken = false;
  3895. if ( m_bShortTokenFilter )
  3896. {
  3897. m_sAccum[m_iAccum] = '\0';
  3898. if ( ShortTokenFilter ( m_sAccum, m_iAccum ) )
  3899. bShortToken = true;
  3900. }
  3901. if ( !bShortToken )
  3902. {
  3903. if ( m_iAccum )
  3904. m_iOvershortCount++;
  3905. m_iAccum = 0;
  3906. BlendAdjust ( pCur );
  3907. continue;
  3908. }
  3909. }
  3910. m_iLastTokenLen = m_iAccum;
  3911. m_sAccum[m_iAccum] = '\0';
  3912. m_iAccum = 0;
  3913. m_pTokenEnd = pCur>=m_pBufferMax ? m_pCur : pCur;
  3914. if ( !BlendAdjust ( pCur ) )
  3915. continue;
  3916. if ( m_bBlended )
  3917. return GetBlendedVariant();
  3918. return m_sAccum;
  3919. }
  3920. // handle specials
  3921. bool bSpecial = ( iCode & FLAG_CODEPOINT_SPECIAL )!=0;
  3922. bool bNoBlend = !( iCode & FLAG_CODEPOINT_BLEND );
  3923. iCode &= MASK_CODEPOINT;
  3924. if ( bSpecial )
  3925. {
  3926. // skip short words
  3927. if ( m_iAccum<m_tSettings.m_iMinWordLen )
  3928. {
  3929. if ( m_iAccum )
  3930. m_iOvershortCount++;
  3931. bool bShortToken = false;
  3932. if ( m_bShortTokenFilter )
  3933. {
  3934. m_sAccum[m_iAccum] = '\0';
  3935. if ( ShortTokenFilter ( m_sAccum, m_iAccum ) )
  3936. bShortToken = true;
  3937. }
  3938. if ( !bShortToken )
  3939. {
  3940. if ( m_iAccum )
  3941. m_iOvershortCount++;
  3942. m_iAccum = 0;
  3943. }
  3944. }
  3945. m_pTokenEnd = m_pCur;
  3946. if ( m_iAccum==0 )
  3947. {
  3948. // nice standalone special
  3949. m_iLastTokenLen = 1;
  3950. m_sAccum[0] = (BYTE)iCode;
  3951. m_sAccum[1] = '\0';
  3952. m_pTokenStart = pCur;
  3953. m_bWasSpecial = true;
  3954. } else
  3955. {
  3956. // flush prev accum and redo this special
  3957. m_iLastTokenLen = m_iAccum;
  3958. m_sAccum[m_iAccum] = '\0';
  3959. m_pCur--;
  3960. m_pTokenEnd--;
  3961. }
  3962. m_iAccum = 0;
  3963. if ( !BlendAdjust ( pCur ) )
  3964. continue;
  3965. if ( m_bBlended )
  3966. return GetBlendedVariant();
  3967. return m_sAccum;
  3968. }
  3969. // just accumulate
  3970. assert ( iCode>0 );
  3971. if ( m_iAccum<SPH_MAX_WORD_LEN )
  3972. {
  3973. if ( m_iAccum==0 )
  3974. m_pTokenStart = pCur;
  3975. // tricky bit
  3976. // heading modifiers must not (!) affected blended status
  3977. // eg. we want stuff like '=-' (w/o apostrophes) thrown away when pure_blend is on
  3978. if (!( IS_QUERY && !m_iAccum && sphIsModifier(iCode) ) )
  3979. m_bNonBlended = m_bNonBlended || bNoBlend;
  3980. m_sAccum[m_iAccum++] = (BYTE)iCode;
  3981. }
  3982. }
  3983. }
  3984. #if USE_WINDOWS
  3985. #pragma warning(default:4127) // conditional expr is const for MSVC
  3986. #endif
  3987. template < bool IS_QUERY >
  3988. ISphTokenizer * CSphTokenizer_SBCS<IS_QUERY>::Clone ( ESphTokenizerClone eMode ) const
  3989. {
  3990. CSphTokenizerBase * pClone;
  3991. if ( eMode!=SPH_CLONE_INDEX )
  3992. pClone = new CSphTokenizer_SBCS<true>();
  3993. else
  3994. pClone = new CSphTokenizer_SBCS<false>();
  3995. pClone->CloneBase ( this, eMode );
  3996. return pClone;
  3997. }
  3998. /////////////////////////////////////////////////////////////////////////////
  3999. template < bool IS_QUERY >
  4000. CSphTokenizer_UTF8<IS_QUERY>::CSphTokenizer_UTF8 ()
  4001. {
  4002. CSphString sTmp;
  4003. SetCaseFolding ( SPHINX_DEFAULT_UTF8_TABLE, sTmp );
  4004. m_bHasBlend = false;
  4005. }
  4006. template < bool IS_QUERY >
  4007. void CSphTokenizer_UTF8<IS_QUERY>::SetBuffer ( BYTE * sBuffer, int iLength )
  4008. {
  4009. // check that old one is over and that new length is sane
  4010. assert ( iLength>=0 );
  4011. // set buffer
  4012. m_pBuffer = sBuffer;
  4013. m_pBufferMax = sBuffer + iLength;
  4014. m_pCur = sBuffer;
  4015. m_pTokenStart = m_pTokenEnd = NULL;
  4016. m_pBlendStart = m_pBlendEnd = NULL;
  4017. // fixup embedded zeroes with spaces
  4018. for ( BYTE * p = m_pBuffer; p < m_pBufferMax; p++ )
  4019. if ( !*p )
  4020. *p = ' ';
  4021. m_iOvershortCount = 0;
  4022. m_bBoundary = m_bTokenBoundary = false;
  4023. }
  4024. #if USE_WINDOWS
  4025. #pragma warning(disable:4127) // conditional expr is const for MSVC
  4026. #endif
  4027. template < bool IS_QUERY >
  4028. BYTE * CSphTokenizer_UTF8<IS_QUERY>::GetToken ()
  4029. {
  4030. m_bWasSpecial = false;
  4031. m_bBlended = false;
  4032. m_iOvershortCount = 0;
  4033. m_bTokenBoundary = false;
  4034. if ( m_dSynonyms.GetLength() )
  4035. return GetTokenSyn ( IS_QUERY );
  4036. return m_bHasBlend
  4037. ? DoGetToken<IS_QUERY,true>()
  4038. : DoGetToken<IS_QUERY,false>();
  4039. }
  4040. template < bool IS_QUERY, bool IS_BLEND >
  4041. BYTE * CSphTokenizer_UTF8_Base::DoGetToken ()
  4042. {
  4043. // return pending blending variants
  4044. if ( IS_BLEND )
  4045. {
  4046. BYTE * pVar = GetBlendedVariant ();
  4047. if ( pVar )
  4048. return pVar;
  4049. m_bBlendedPart = ( m_pBlendEnd!=NULL );
  4050. }
  4051. // in query mode, lets capture (soft-whitespace hard-whitespace) sequences and adjust overshort counter
  4052. // sample queries would be (one NEAR $$$) or (one | $$$ two) where $ is not a valid character
  4053. bool bGotNonToken = ( !IS_QUERY || m_bPhrase ); // only do this in query mode, never in indexing mode, never within phrases
  4054. bool bGotSoft = false; // hey Beavis he said soft huh huhhuh
  4055. for ( ;; )
  4056. {
  4057. // get next codepoint
  4058. BYTE * pCur = m_pCur; // to redo special char, if there's a token already
  4059. int iCodePoint;
  4060. int iCode;
  4061. if ( pCur<m_pBufferMax && *pCur<128 )
  4062. {
  4063. iCodePoint = *m_pCur++;
  4064. iCode = m_tLC.m_pChunk[0][iCodePoint];
  4065. } else
  4066. {
  4067. iCodePoint = GetCodepoint(); // advances m_pCur
  4068. iCode = m_tLC.ToLower ( iCodePoint );
  4069. }
  4070. // handle escaping
  4071. bool bWasEscaped = ( IS_QUERY && iCodePoint=='\\' ); // whether current codepoint was escaped
  4072. if ( bWasEscaped )
  4073. {
  4074. iCodePoint = GetCodepoint();
  4075. iCode = m_tLC.ToLower ( iCodePoint );
  4076. if ( !Special2Simple ( iCode ) )
  4077. iCode = 0;
  4078. }
  4079. // handle eof
  4080. if ( iCode<0 )
  4081. {
  4082. // skip trailing short word
  4083. FlushAccum ();
  4084. if ( m_iLastTokenLen<m_tSettings.m_iMinWordLen )
  4085. {
  4086. if ( !m_bShortTokenFilter || !ShortTokenFilter ( m_sAccum, m_iLastTokenLen ) )
  4087. {
  4088. if ( m_iLastTokenLen )
  4089. m_iOvershortCount++;
  4090. m_iLastTokenLen = 0;
  4091. if ( IS_BLEND )
  4092. BlendAdjust ( pCur );
  4093. return NULL;
  4094. }
  4095. }
  4096. // return trailing word
  4097. if ( IS_BLEND && !BlendAdjust ( pCur ) )
  4098. return NULL;
  4099. m_pTokenEnd = m_pCur;
  4100. if ( IS_BLEND && m_bBlended )
  4101. return GetBlendedVariant();
  4102. return m_sAccum;
  4103. }
  4104. // handle all the flags..
  4105. if ( IS_QUERY )
  4106. iCode = CodepointArbitrationQ ( iCode, bWasEscaped, *m_pCur );
  4107. else if ( m_bDetectSentences )
  4108. iCode = CodepointArbitrationI ( iCode );
  4109. // handle ignored chars
  4110. if ( iCode & FLAG_CODEPOINT_IGNORE )
  4111. continue;
  4112. // handle blended characters
  4113. if ( IS_BLEND && ( iCode & FLAG_CODEPOINT_BLEND ) )
  4114. {
  4115. if ( m_pBlendEnd )
  4116. iCode = 0;
  4117. else
  4118. {
  4119. m_bBlended = true;
  4120. m_pBlendStart = m_iAccum ? m_pTokenStart : pCur;
  4121. }
  4122. }
  4123. // handle soft-whitespace-only tokens
  4124. if ( !bGotNonToken && !m_iAccum )
  4125. {
  4126. if ( !bGotSoft )
  4127. {
  4128. // detect opening soft whitespace
  4129. if ( ( iCode==0 && !( iCode & MASK_FLAGS ) && !IsWhitespace ( iCodePoint ) )
  4130. || ( ( iCode & FLAG_CODEPOINT_BLEND ) && !m_iAccum ) )
  4131. {
  4132. bGotSoft = true;
  4133. }
  4134. } else
  4135. {
  4136. // detect closing hard whitespace or special
  4137. // (if there was anything meaningful in the meantime, we must never get past the outer if!)
  4138. if ( IsWhitespace ( iCodePoint ) || ( iCode & FLAG_CODEPOINT_SPECIAL ) )
  4139. {
  4140. m_iOvershortCount++;
  4141. bGotNonToken = true;
  4142. }
  4143. }
  4144. }
  4145. // handle whitespace and boundary
  4146. if ( m_bBoundary && ( iCode==0 ) )
  4147. {
  4148. m_bTokenBoundary = true;
  4149. m_iBoundaryOffset = pCur - m_pBuffer - 1;
  4150. }
  4151. m_bBoundary = ( iCode & FLAG_CODEPOINT_BOUNDARY )!=0;
  4152. if ( iCode==0 || m_bBoundary )
  4153. {
  4154. FlushAccum ();
  4155. if ( IS_BLEND && !BlendAdjust ( pCur ) )
  4156. continue;
  4157. if ( m_iLastTokenLen<m_tSettings.m_iMinWordLen
  4158. && !( m_bShortTokenFilter && ShortTokenFilter ( m_sAccum, m_iLastTokenLen ) ) )
  4159. {
  4160. if ( m_iLastTokenLen )
  4161. m_iOvershortCount++;
  4162. continue;
  4163. } else
  4164. {
  4165. m_pTokenEnd = pCur;
  4166. if ( IS_BLEND && m_bBlended )
  4167. return GetBlendedVariant();
  4168. return m_sAccum;
  4169. }
  4170. }
  4171. // handle specials
  4172. if ( iCode & FLAG_CODEPOINT_SPECIAL )
  4173. {
  4174. // skip short words preceding specials
  4175. if ( m_iAccum<m_tSettings.m_iMinWordLen )
  4176. {
  4177. m_sAccum[m_iAccum] = '\0';
  4178. if ( !m_bShortTokenFilter || !ShortTokenFilter ( m_sAccum, m_iAccum ) )
  4179. {
  4180. if ( m_iAccum )
  4181. m_iOvershortCount++;
  4182. FlushAccum ();
  4183. }
  4184. }
  4185. if ( m_iAccum==0 )
  4186. {
  4187. m_bNonBlended = m_bNonBlended || ( !( iCode & FLAG_CODEPOINT_BLEND ) && !( iCode & FLAG_CODEPOINT_SPECIAL ) );
  4188. m_bWasSpecial = !( iCode & FLAG_CODEPOINT_NGRAM );
  4189. m_pTokenStart = pCur;
  4190. m_pTokenEnd = m_pCur;
  4191. AccumCodepoint ( iCode & MASK_CODEPOINT ); // handle special as a standalone token
  4192. } else
  4193. {
  4194. m_pCur = pCur; // we need to flush current accum and then redo special char again
  4195. m_pTokenEnd = pCur;
  4196. }
  4197. FlushAccum ();
  4198. if ( IS_BLEND )
  4199. {
  4200. if ( !BlendAdjust ( pCur ) )
  4201. continue;
  4202. if ( m_bBlended )
  4203. return GetBlendedVariant();
  4204. }
  4205. return m_sAccum;
  4206. }
  4207. if ( m_iAccum==0 )
  4208. m_pTokenStart = pCur;
  4209. // tricky bit
  4210. // heading modifiers must not (!) affected blended status
  4211. // eg. we want stuff like '=-' (w/o apostrophes) thrown away when pure_blend is on
  4212. if ( IS_BLEND )
  4213. if (!( IS_QUERY && !m_iAccum && sphIsModifier ( iCode & MASK_CODEPOINT ) ) )
  4214. m_bNonBlended = m_bNonBlended || !( iCode & FLAG_CODEPOINT_BLEND );
  4215. // just accumulate
  4216. // manual inlining of utf8 encoder gives us a few extra percent
  4217. // which is important here, this is a hotspot
  4218. if ( m_iAccum<SPH_MAX_WORD_LEN )
  4219. {
  4220. iCode &= MASK_CODEPOINT;
  4221. m_iAccum++;
  4222. SPH_UTF8_ENCODE ( m_pAccum, iCode );
  4223. }
  4224. }
  4225. }
  4226. #if USE_WINDOWS
  4227. #pragma warning(default:4127) // conditional expr is const for MSVC
  4228. #endif
  4229. void CSphTokenizer_UTF8_Base::FlushAccum ()
  4230. {
  4231. assert ( m_pAccum-m_sAccum < (int)sizeof(m_sAccum) );
  4232. m_iLastTokenLen = m_iAccum;
  4233. *m_pAccum = 0;
  4234. m_iAccum = 0;
  4235. m_pAccum = m_sAccum;
  4236. }
  4237. template < bool IS_QUERY >
  4238. ISphTokenizer * CSphTokenizer_UTF8<IS_QUERY>::Clone ( ESphTokenizerClone eMode ) const
  4239. {
  4240. CSphTokenizerBase * pClone;
  4241. if ( eMode!=SPH_CLONE_INDEX )
  4242. pClone = new CSphTokenizer_UTF8<true>();
  4243. else
  4244. pClone = new CSphTokenizer_UTF8<false>();
  4245. pClone->CloneBase ( this, eMode );
  4246. return pClone;
  4247. }
  4248. template < bool IS_QUERY >
  4249. int CSphTokenizer_UTF8<IS_QUERY>::GetCodepointLength ( int iCode ) const
  4250. {
  4251. if ( iCode<128 )
  4252. return 1;
  4253. int iBytes = 0;
  4254. while ( iCode & 0x80 )
  4255. {
  4256. iBytes++;
  4257. iCode <<= 1;
  4258. }
  4259. assert ( iBytes>=2 && iBytes<=4 );
  4260. return iBytes;
  4261. }
  4262. /////////////////////////////////////////////////////////////////////////////
  4263. template < bool IS_QUERY >
  4264. bool CSphTokenizer_UTF8Ngram<IS_QUERY>::SetNgramChars ( const char * sConfig, CSphString & sError )
  4265. {
  4266. assert ( this->m_eMode!=SPH_CLONE_QUERY_LIGHTWEIGHT );
  4267. CSphVector<CSphRemapRange> dRemaps;
  4268. CSphCharsetDefinitionParser tParser;
  4269. if ( !tParser.Parse ( sConfig, dRemaps ) )
  4270. {
  4271. sError = tParser.GetLastError();
  4272. return false;
  4273. }
  4274. // gcc braindamage requires this
  4275. this->m_tLC.AddRemaps ( dRemaps, FLAG_CODEPOINT_NGRAM | FLAG_CODEPOINT_SPECIAL ); // !COMMIT support other n-gram lengths than 1
  4276. m_sNgramCharsStr = sConfig;
  4277. return true;
  4278. }
  4279. template < bool IS_QUERY >
  4280. void CSphTokenizer_UTF8Ngram<IS_QUERY>::SetNgramLen ( int iLen )
  4281. {
  4282. assert ( this->m_eMode!=SPH_CLONE_QUERY_LIGHTWEIGHT );
  4283. assert ( iLen>0 );
  4284. m_iNgramLen = iLen;
  4285. }
  4286. template < bool IS_QUERY >
  4287. BYTE * CSphTokenizer_UTF8Ngram<IS_QUERY>::GetToken ()
  4288. {
  4289. // !COMMIT support other n-gram lengths than 1
  4290. assert ( m_iNgramLen==1 );
  4291. return CSphTokenizer_UTF8<IS_QUERY>::GetToken ();
  4292. }
  4293. //////////////////////////////////////////////////////////////////////////
  4294. CSphMultiformTokenizer::CSphMultiformTokenizer ( ISphTokenizer * pTokenizer, const CSphMultiformContainer * pContainer )
  4295. : CSphTokenFilter ( pTokenizer )
  4296. , m_pMultiWordforms ( pContainer )
  4297. , m_iStoredStart ( 0 )
  4298. , m_iStoredLen ( 0 )
  4299. , m_bBuildMultiform ( false )
  4300. , m_pLastToken ( NULL )
  4301. {
  4302. assert ( pTokenizer && pContainer );
  4303. m_dStoredTokens.Resize ( pContainer->m_iMaxTokens + 1 );
  4304. m_sTokenizedMultiform[0] = '\0';
  4305. }
  4306. CSphMultiformTokenizer::~CSphMultiformTokenizer ()
  4307. {
  4308. SafeDelete ( m_pTokenizer );
  4309. }
  4310. void CSphMultiformTokenizer::FillTokenInfo ( StoredToken_t * pToken )
  4311. {
  4312. pToken->m_bBoundary = m_pTokenizer->GetBoundary ();
  4313. pToken->m_bSpecial = m_pTokenizer->WasTokenSpecial ();
  4314. pToken->m_iOvershortCount = m_pTokenizer->GetOvershortCount ();
  4315. pToken->m_iTokenLen = m_pTokenizer->GetLastTokenLen ();
  4316. pToken->m_szTokenStart = m_pTokenizer->GetTokenStart ();
  4317. pToken->m_szTokenEnd = m_pTokenizer->GetTokenEnd ();
  4318. pToken->m_pBufferPtr = m_pTokenizer->GetBufferPtr ();
  4319. }
  4320. BYTE * CSphMultiformTokenizer::GetToken ()
  4321. {
  4322. m_sTokenizedMultiform[0] = '\0';
  4323. BYTE * pToken = ( m_iStoredLen>0 )
  4324. ? m_dStoredTokens [m_iStoredStart].m_sToken
  4325. : m_pTokenizer->GetToken ();
  4326. if ( !pToken )
  4327. {
  4328. memset ( &m_tLastToken, 0, sizeof ( m_tLastToken ) );
  4329. m_pLastToken = &m_tLastToken;
  4330. return NULL;
  4331. }
  4332. int iSize = m_dStoredTokens.GetLength ();
  4333. CSphMultiforms ** pWordforms = m_pMultiWordforms->m_Hash ( (const char *)pToken );
  4334. if ( !pWordforms )
  4335. {
  4336. if ( m_iStoredLen )
  4337. {
  4338. m_pLastToken = &(m_dStoredTokens[m_iStoredStart]);
  4339. m_iStoredLen--;
  4340. m_iStoredStart = (m_iStoredStart + 1) % iSize;
  4341. } else
  4342. {
  4343. FillTokenInfo ( &m_tLastToken );
  4344. m_pLastToken = &m_tLastToken;
  4345. bool bBlended = m_pTokenizer->TokenIsBlended();
  4346. m_bBlended = bBlended;
  4347. m_bNonBlended = !bBlended;
  4348. }
  4349. return pToken;
  4350. }
  4351. if ( !m_iStoredLen )
  4352. {
  4353. FillTokenInfo ( &m_dStoredTokens[m_iStoredStart] );
  4354. strcpy ( (char *)m_dStoredTokens[m_iStoredStart].m_sToken, (const char *)pToken ); // NOLINT
  4355. m_iStoredLen++;
  4356. }
  4357. int iTokensNeeded = (*pWordforms)->m_iMaxTokens - m_iStoredLen + 1;
  4358. for ( int i = 0; i < iTokensNeeded; i++ )
  4359. {
  4360. pToken = m_pTokenizer->GetToken ();
  4361. if ( !pToken )
  4362. break;
  4363. int iIndex = (m_iStoredStart+m_iStoredLen) % iSize;
  4364. FillTokenInfo ( &(m_dStoredTokens[iIndex]) );
  4365. strcpy ( (char *)m_dStoredTokens[iIndex].m_sToken, (const char *)pToken ); // NOLINT
  4366. m_iStoredLen++;
  4367. }
  4368. if ( !m_iStoredLen )
  4369. return NULL;
  4370. if ( m_iStoredLen<=(*pWordforms)->m_iMinTokens )
  4371. {
  4372. m_pLastToken = &(m_dStoredTokens [m_iStoredStart]);
  4373. m_iStoredLen--;
  4374. m_iStoredStart = (m_iStoredStart + 1) % iSize;
  4375. return m_pLastToken->m_sToken;
  4376. }
  4377. for ( int i = (*pWordforms)->m_pForms.GetLength()-1; i>=0; i-- )
  4378. {
  4379. CSphMultiform * pCurForm = (*pWordforms)->m_pForms[i];
  4380. if ( m_iStoredLen<=pCurForm->m_dTokens.GetLength () )
  4381. continue;
  4382. bool bFound = true;
  4383. for ( int j = 0; j < pCurForm->m_dTokens.GetLength (); j++ )
  4384. {
  4385. int iIndex = ( m_iStoredStart + j + 1 ) % iSize;
  4386. const char * szStored = (const char*)m_dStoredTokens[iIndex].m_sToken;
  4387. const char * szNormal = pCurForm->m_dTokens[j].cstr ();
  4388. if ( *szNormal!=*szStored || strcasecmp ( szNormal, szStored ) )
  4389. {
  4390. bFound = false;
  4391. break;
  4392. }
  4393. }
  4394. if ( bFound )
  4395. {
  4396. int iTokensPerForm = 1+pCurForm->m_dTokens.GetLength();
  4397. m_tLastToken.m_bBoundary = false;
  4398. m_tLastToken.m_bSpecial = false;
  4399. m_tLastToken.m_iOvershortCount = m_dStoredTokens[m_iStoredStart].m_iOvershortCount;
  4400. m_tLastToken.m_iTokenLen = pCurForm->m_iNormalTokenLen;
  4401. m_tLastToken.m_szTokenStart = m_dStoredTokens[m_iStoredStart].m_szTokenStart;
  4402. m_tLastToken.m_szTokenEnd = m_dStoredTokens[ ( m_iStoredStart+iTokensPerForm-1 ) % iSize ].m_szTokenEnd;
  4403. m_tLastToken.m_pBufferPtr = m_dStoredTokens[ ( m_iStoredStart+iTokensPerForm-1 ) % iSize ].m_pBufferPtr;
  4404. m_pLastToken = &m_tLastToken;
  4405. if ( m_bBuildMultiform )
  4406. {
  4407. BYTE * pOut = m_sTokenizedMultiform;
  4408. BYTE * pMax = pOut + sizeof(m_sTokenizedMultiform);
  4409. for ( int i=0; i<iTokensPerForm && pOut<pMax; i++ )
  4410. {
  4411. const BYTE * sTok = m_dStoredTokens [ ( m_iStoredStart+i ) % iSize ].m_sToken;
  4412. if ( i && pOut<pMax )
  4413. *pOut++ = ' ';
  4414. while ( *sTok && pOut<pMax )
  4415. *pOut++ = *sTok++;
  4416. }
  4417. if ( pOut<pMax )
  4418. *pOut++ = '\0';
  4419. else
  4420. pMax[-1] = '\0';
  4421. }
  4422. m_iStoredStart = ( m_iStoredStart+iTokensPerForm ) % iSize;
  4423. m_iStoredLen -= iTokensPerForm;
  4424. assert ( m_iStoredLen>=0 );
  4425. strcpy ( (char *)m_sOutMultiform, pCurForm->m_sNormalForm.cstr () ); // NOLINT
  4426. return m_sOutMultiform;
  4427. }
  4428. }
  4429. pToken = m_dStoredTokens[m_iStoredStart].m_sToken;
  4430. m_pLastToken = &(m_dStoredTokens[m_iStoredStart]);
  4431. m_iStoredStart = (m_iStoredStart + 1) % iSize;
  4432. m_iStoredLen--;
  4433. return pToken;
  4434. }
  4435. ISphTokenizer * CSphMultiformTokenizer::Clone ( ESphTokenizerClone eMode ) const
  4436. {
  4437. ISphTokenizer * pClone = m_pTokenizer->Clone ( eMode );
  4438. return CreateMultiformFilter ( pClone, m_pMultiWordforms );
  4439. }
  4440. void CSphMultiformTokenizer::SetBufferPtr ( const char * sNewPtr )
  4441. {
  4442. m_pLastToken = NULL;
  4443. m_iStoredLen = 0;
  4444. m_iStoredStart = 0;
  4445. m_pTokenizer->SetBufferPtr ( sNewPtr );
  4446. }
  4447. void CSphMultiformTokenizer::SetBuffer ( BYTE * sBuffer, int iLength )
  4448. {
  4449. m_pTokenizer->SetBuffer ( sBuffer, iLength );
  4450. SetBufferPtr ( (const char *)sBuffer );
  4451. }
  4452. /////////////////////////////////////////////////////////////////////////////
  4453. // FILTER
  4454. /////////////////////////////////////////////////////////////////////////////
  4455. CSphFilterSettings::CSphFilterSettings ()
  4456. : m_sAttrName ( "" )
  4457. , m_bExclude ( false )
  4458. , m_bHasEqual ( true )
  4459. , m_iMinValue ( LLONG_MIN )
  4460. , m_iMaxValue ( LLONG_MAX )
  4461. , m_pValues ( NULL )
  4462. , m_nValues ( 0 )
  4463. {}
  4464. CSphFilterSettings::CSphFilterSettings ( const CSphFilterSettings & rhs )
  4465. {
  4466. assert ( 0 );
  4467. (*this) = rhs;
  4468. }
  4469. void CSphFilterSettings::SetExternalValues ( const SphAttr_t * pValues, int nValues )
  4470. {
  4471. m_pValues = pValues;
  4472. m_nValues = nValues;
  4473. }
  4474. bool CSphFilterSettings::operator == ( const CSphFilterSettings & rhs ) const
  4475. {
  4476. // check name, mode, type
  4477. if ( m_sAttrName!=rhs.m_sAttrName || m_bExclude!=rhs.m_bExclude || m_eType!=rhs.m_eType )
  4478. return false;
  4479. switch ( m_eType )
  4480. {
  4481. case SPH_FILTER_RANGE:
  4482. return m_iMinValue==rhs.m_iMinValue && m_iMaxValue==rhs.m_iMaxValue;
  4483. case SPH_FILTER_FLOATRANGE:
  4484. return m_fMinValue==rhs.m_fMinValue && m_fMaxValue==rhs.m_fMaxValue;
  4485. case SPH_FILTER_VALUES:
  4486. if ( m_dValues.GetLength()!=rhs.m_dValues.GetLength() )
  4487. return false;
  4488. ARRAY_FOREACH ( i, m_dValues )
  4489. if ( m_dValues[i]!=rhs.m_dValues[i] )
  4490. return false;
  4491. return true;
  4492. default:
  4493. assert ( 0 && "internal error: unhandled filter type in comparison" );
  4494. return false;
  4495. }
  4496. }
  4497. /////////////////////////////////////////////////////////////////////////////
  4498. // QUERY
  4499. /////////////////////////////////////////////////////////////////////////////
  4500. CSphQuery::CSphQuery ()
  4501. : m_sIndexes ( "*" )
  4502. , m_sQuery ( "" )
  4503. , m_sRawQuery ( "" )
  4504. , m_iOffset ( 0 )
  4505. , m_iLimit ( 20 )
  4506. , m_pWeights ( NULL )
  4507. , m_iWeights ( 0 )
  4508. , m_eMode ( SPH_MATCH_ALL )
  4509. , m_eRanker ( SPH_RANK_DEFAULT )
  4510. , m_eSort ( SPH_SORT_RELEVANCE )
  4511. , m_iMaxMatches ( 1000 )
  4512. , m_bSortKbuffer ( false )
  4513. , m_bZSlist ( false )
  4514. , m_bSimplify ( false )
  4515. , m_bPlainIDF ( false )
  4516. , m_bGlobalIDF ( false )
  4517. , m_eGroupFunc ( SPH_GROUPBY_ATTR )
  4518. , m_sGroupSortBy ( "@groupby desc" )
  4519. , m_sGroupDistinct ( "" )
  4520. , m_iCutoff ( 0 )
  4521. , m_iRetryCount ( 0 )
  4522. , m_iRetryDelay ( 0 )
  4523. , m_iAgentQueryTimeout ( 0 )
  4524. , m_bGeoAnchor ( false )
  4525. , m_fGeoLatitude ( 0.0f )
  4526. , m_fGeoLongitude ( 0.0f )
  4527. , m_uMaxQueryMsec ( 0 )
  4528. , m_iMaxPredictedMsec ( 0 )
  4529. , m_sComment ( "" )
  4530. , m_sSelect ( "" )
  4531. , m_iOuterOffset ( 0 )
  4532. , m_iOuterLimit ( 0 )
  4533. , m_bHasOuter ( false )
  4534. , m_bReverseScan ( false )
  4535. , m_bIgnoreNonexistent ( false )
  4536. , m_bIgnoreNonexistentIndexes ( false )
  4537. , m_iSQLSelectStart ( -1 )
  4538. , m_iSQLSelectEnd ( -1 )
  4539. , m_iOldVersion ( 0 )
  4540. , m_iOldGroups ( 0 )
  4541. , m_pOldGroups ( NULL )
  4542. , m_iOldMinTS ( 0 )
  4543. , m_iOldMaxTS ( UINT_MAX )
  4544. , m_iOldMinGID ( 0 )
  4545. , m_iOldMaxGID ( UINT_MAX )
  4546. , m_eCollation ( SPH_COLLATION_DEFAULT )
  4547. , m_bAgent ( false )
  4548. {}
  4549. CSphQuery::~CSphQuery ()
  4550. {
  4551. }
  4552. int CSphQuery::GetIndexWeight ( const char * sName ) const
  4553. {
  4554. ARRAY_FOREACH ( i, m_dIndexWeights )
  4555. if ( m_dIndexWeights[i].m_sName==sName )
  4556. return m_dIndexWeights[i].m_iValue;
  4557. return 1;
  4558. }
  4559. //////////////////////////////////////////////////////////////////////////
  4560. struct SelectBounds_t
  4561. {
  4562. int m_iStart;
  4563. int m_iEnd;
  4564. };
  4565. #define YYSTYPE SelectBounds_t
  4566. #include "yysphinxselect.h"
  4567. class SelectParser_t
  4568. {
  4569. public:
  4570. int GetToken ( YYSTYPE * lvalp );
  4571. void AddItem ( YYSTYPE * pExpr, ESphAggrFunc eAggrFunc=SPH_AGGR_NONE, YYSTYPE * pStart=NULL, YYSTYPE * pEnd=NULL );
  4572. void AddItem ( const char * pToken, YYSTYPE * pStart=NULL, YYSTYPE * pEnd=NULL );
  4573. void AliasLastItem ( YYSTYPE * pAlias );
  4574. void AddOption ( YYSTYPE * pOpt, YYSTYPE * pVal );
  4575. private:
  4576. void AutoAlias ( CSphQueryItem & tItem, YYSTYPE * pStart, YYSTYPE * pEnd );
  4577. bool IsTokenEqual ( YYSTYPE * pTok, const char * sRef );
  4578. public:
  4579. CSphString m_sParserError;
  4580. const char * m_pLastTokenStart;
  4581. const char * m_pStart;
  4582. const char * m_pCur;
  4583. CSphQuery * m_pQuery;
  4584. };
  4585. int yylex ( YYSTYPE * lvalp, SelectParser_t * pParser )
  4586. {
  4587. return pParser->GetToken ( lvalp );
  4588. }
  4589. void yyerror ( SelectParser_t * pParser, const char * sMessage )
  4590. {
  4591. pParser->m_sParserError.SetSprintf ( "%s near '%s'", sMessage, pParser->m_pLastTokenStart );
  4592. }
  4593. #include "yysphinxselect.c"
  4594. int SelectParser_t::GetToken ( YYSTYPE * lvalp )
  4595. {
  4596. // skip whitespace, check eof
  4597. while ( isspace ( *m_pCur ) )
  4598. m_pCur++;
  4599. if ( !*m_pCur )
  4600. return 0;
  4601. // begin working that token
  4602. m_pLastTokenStart = m_pCur;
  4603. lvalp->m_iStart = m_pCur-m_pStart;
  4604. // check for constant
  4605. if ( isdigit ( *m_pCur ) )
  4606. {
  4607. char * pEnd = NULL;
  4608. double fDummy; // to avoid gcc unused result warning
  4609. fDummy = strtod ( m_pCur, &pEnd );
  4610. m_pCur = pEnd;
  4611. lvalp->m_iEnd = m_pCur-m_pStart;
  4612. return SEL_TOKEN;
  4613. }
  4614. // check for token
  4615. if ( sphIsAttr ( m_pCur[0] ) || ( m_pCur[0]=='@' && sphIsAttr ( m_pCur[1] ) && !isdigit ( m_pCur[1] ) ) )
  4616. {
  4617. m_pCur++;
  4618. while ( sphIsAttr ( *m_pCur ) || *m_pCur=='.' ) m_pCur++; // json.field is valid attribute name now
  4619. lvalp->m_iEnd = m_pCur-m_pStart;
  4620. #define LOC_CHECK(_str,_len,_ret) \
  4621. if ( lvalp->m_iEnd==_len+lvalp->m_iStart && strncasecmp ( m_pStart+lvalp->m_iStart, _str, _len )==0 ) return _ret;
  4622. LOC_CHECK ( "ID", 2, SEL_ID );
  4623. LOC_CHECK ( "AS", 2, SEL_AS );
  4624. LOC_CHECK ( "OR", 2, TOK_OR );
  4625. LOC_CHECK ( "AND", 3, TOK_AND );
  4626. LOC_CHECK ( "NOT", 3, TOK_NOT );
  4627. LOC_CHECK ( "AVG", 3, SEL_AVG );
  4628. LOC_CHECK ( "MIN", 3, SEL_MIN );
  4629. LOC_CHECK ( "MAX", 3, SEL_MAX );
  4630. LOC_CHECK ( "SUM", 3, SEL_SUM );
  4631. LOC_CHECK ( "COUNT", 5, SEL_COUNT );
  4632. LOC_CHECK ( "DISTINCT", 8, SEL_DISTINCT );
  4633. LOC_CHECK ( "WEIGHT", 6, SEL_WEIGHT );
  4634. LOC_CHECK ( "OPTION", 6, SEL_OPTION );
  4635. #undef LOC_CHECK
  4636. return SEL_TOKEN;
  4637. }
  4638. // check for equality checks
  4639. lvalp->m_iEnd = 1+lvalp->m_iStart;
  4640. switch ( *m_pCur )
  4641. {
  4642. case '<':
  4643. m_pCur++;
  4644. if ( *m_pCur=='>' ) { m_pCur++; lvalp->m_iEnd++; return TOK_NE; }
  4645. if ( *m_pCur=='=' ) { m_pCur++; lvalp->m_iEnd++; return TOK_LTE; }
  4646. return '<';
  4647. case '>':
  4648. m_pCur++;
  4649. if ( *m_pCur=='=' ) { m_pCur++; lvalp->m_iEnd++; return TOK_GTE; }
  4650. return '>';
  4651. case '=':
  4652. m_pCur++;
  4653. if ( *m_pCur=='=' ) { m_pCur++; lvalp->m_iEnd++; }
  4654. return TOK_EQ;
  4655. case '\'':
  4656. {
  4657. const char cEnd = *m_pCur;
  4658. for ( const char * s = m_pCur+1; *s; s++ )
  4659. {
  4660. if ( *s==cEnd )
  4661. {
  4662. m_pCur = s+1;
  4663. return TOK_CONST_STRING;
  4664. }
  4665. }
  4666. return -1;
  4667. }
  4668. }
  4669. // check for comment begin/end
  4670. if ( m_pCur[0]=='/' && m_pCur[1]=='*' )
  4671. {
  4672. m_pCur += 2;
  4673. lvalp->m_iEnd += 1;
  4674. return SEL_COMMENT_OPEN;
  4675. }
  4676. if ( m_pCur[0]=='*' && m_pCur[1]=='/' )
  4677. {
  4678. m_pCur += 2;
  4679. lvalp->m_iEnd += 1;
  4680. return SEL_COMMENT_CLOSE;
  4681. }
  4682. // return char as a token
  4683. return *m_pCur++;
  4684. }
  4685. void SelectParser_t::AutoAlias ( CSphQueryItem & tItem, YYSTYPE * pStart, YYSTYPE * pEnd )
  4686. {
  4687. if ( pStart && pEnd )
  4688. {
  4689. tItem.m_sAlias.SetBinary ( m_pStart + pStart->m_iStart, pEnd->m_iEnd - pStart->m_iStart );
  4690. tItem.m_sAlias.ToLower();
  4691. } else
  4692. tItem.m_sAlias = tItem.m_sExpr;
  4693. }
  4694. void SelectParser_t::AddItem ( YYSTYPE * pExpr, ESphAggrFunc eAggrFunc, YYSTYPE * pStart, YYSTYPE * pEnd )
  4695. {
  4696. CSphQueryItem & tItem = m_pQuery->m_dItems.Add();
  4697. tItem.m_sExpr.SetBinary ( m_pStart + pExpr->m_iStart, pExpr->m_iEnd - pExpr->m_iStart );
  4698. tItem.m_sExpr.ToLower();
  4699. tItem.m_eAggrFunc = eAggrFunc;
  4700. AutoAlias ( tItem, pStart, pEnd );
  4701. }
  4702. void SelectParser_t::AddItem ( const char * pToken, YYSTYPE * pStart, YYSTYPE * pEnd )
  4703. {
  4704. CSphQueryItem & tItem = m_pQuery->m_dItems.Add();
  4705. tItem.m_sExpr = pToken;
  4706. tItem.m_eAggrFunc = SPH_AGGR_NONE;
  4707. tItem.m_sExpr.ToLower();
  4708. AutoAlias ( tItem, pStart, pEnd );
  4709. }
  4710. void SelectParser_t::AliasLastItem ( YYSTYPE * pAlias )
  4711. {
  4712. if ( pAlias )
  4713. {
  4714. CSphQueryItem & tItem = m_pQuery->m_dItems.Last();
  4715. tItem.m_sAlias.SetBinary ( m_pStart + pAlias->m_iStart, pAlias->m_iEnd - pAlias->m_iStart );
  4716. tItem.m_sAlias.ToLower();
  4717. }
  4718. }
  4719. bool SelectParser_t::IsTokenEqual ( YYSTYPE * pTok, const char * sRef )
  4720. {
  4721. int iLen = strlen(sRef);
  4722. if ( iLen!=( pTok->m_iEnd - pTok->m_iStart ) )
  4723. return false;
  4724. return strncasecmp ( m_pStart + pTok->m_iStart, sRef, iLen )==0;
  4725. }
  4726. void SelectParser_t::AddOption ( YYSTYPE * pOpt, YYSTYPE * pVal )
  4727. {
  4728. if ( IsTokenEqual ( pOpt, "reverse_scan" ) )
  4729. {
  4730. if ( IsTokenEqual ( pVal, "1" ) )
  4731. m_pQuery->m_bReverseScan = true;
  4732. } else if ( IsTokenEqual ( pOpt, "sort_method" ) )
  4733. {
  4734. if ( IsTokenEqual ( pVal, "kbuffer" ) )
  4735. m_pQuery->m_bSortKbuffer = true;
  4736. }
  4737. }
  4738. bool CSphQuery::ParseSelectList ( CSphString & sError )
  4739. {
  4740. m_dItems.Reset ();
  4741. if ( m_sSelect.IsEmpty() )
  4742. return true; // empty is ok; will just return everything
  4743. SelectParser_t tParser;
  4744. tParser.m_pStart = m_sSelect.cstr();
  4745. tParser.m_pCur = m_sSelect.cstr();
  4746. tParser.m_pQuery = this;
  4747. yyparse ( &tParser );
  4748. sError = tParser.m_sParserError;
  4749. return sError.IsEmpty ();
  4750. }
  4751. /////////////////////////////////////////////////////////////////////////////
  4752. // SCHEMA
  4753. /////////////////////////////////////////////////////////////////////////////
  4754. static CSphString sphDumpAttr ( const CSphColumnInfo & tAttr )
  4755. {
  4756. CSphString sRes;
  4757. sRes.SetSprintf ( "%s %s:%d@%d", sphTypeName ( tAttr.m_eAttrType ), tAttr.m_sName.cstr(),
  4758. tAttr.m_tLocator.m_iBitCount, tAttr.m_tLocator.m_iBitOffset );
  4759. return sRes;
  4760. }
  4761. /// make string lowercase but keep case of JSON.field
  4762. void sphColumnToLowercase ( char * sVal )
  4763. {
  4764. if ( !sVal || !*sVal )
  4765. return;
  4766. // make all chars lowercase but only prior to '.' delimiter
  4767. for ( ; *sVal && *sVal!='.'; sVal++ )
  4768. *sVal = (char) tolower ( *sVal );
  4769. }
  4770. CSphColumnInfo::CSphColumnInfo ( const char * sName, ESphAttr eType )
  4771. : m_sName ( sName )
  4772. , m_eAttrType ( eType )
  4773. , m_eWordpart ( SPH_WORDPART_WHOLE )
  4774. , m_bIndexed ( false )
  4775. , m_iIndex ( -1 )
  4776. , m_eSrc ( SPH_ATTRSRC_NONE )
  4777. , m_pExpr ( NULL )
  4778. , m_eAggrFunc ( SPH_AGGR_NONE )
  4779. , m_eStage ( SPH_EVAL_STATIC )
  4780. , m_bPayload ( false )
  4781. , m_bFilename ( false )
  4782. , m_bWeight ( false )
  4783. {
  4784. sphColumnToLowercase ( const_cast<char *>( m_sName.cstr() ) );
  4785. }
  4786. bool CSphSchema::CompareTo ( const CSphSchema & rhs, CSphString & sError, bool bFullComparison ) const
  4787. {
  4788. // check attr count
  4789. if ( GetAttrsCount()!=rhs.GetAttrsCount() )
  4790. {
  4791. sError.SetSprintf ( "attribute count mismatch (me=%s, in=%s, myattrs=%d, inattrs=%d)",
  4792. m_sName.cstr(), rhs.m_sName.cstr(),
  4793. GetAttrsCount(), rhs.GetAttrsCount() );
  4794. return false;
  4795. }
  4796. // check attrs
  4797. ARRAY_FOREACH ( i, m_dAttrs )
  4798. {
  4799. const CSphColumnInfo & tAttr1 = rhs.m_dAttrs[i];
  4800. const CSphColumnInfo & tAttr2 = m_dAttrs[i];
  4801. bool bMismatch;
  4802. if ( bFullComparison )
  4803. bMismatch = !(tAttr1==tAttr2);
  4804. else
  4805. {
  4806. ESphAttr eAttr1 = tAttr1.m_eAttrType;
  4807. ESphAttr eAttr2 = tAttr2.m_eAttrType;
  4808. if ( eAttr1==SPH_ATTR_WORDCOUNT )
  4809. eAttr1 = SPH_ATTR_INTEGER;
  4810. if ( eAttr2==SPH_ATTR_WORDCOUNT )
  4811. eAttr2 = SPH_ATTR_INTEGER;
  4812. bMismatch = tAttr1.m_sName!=tAttr2.m_sName || eAttr1!=eAttr2 || tAttr1.m_eWordpart!=tAttr2.m_eWordpart ||
  4813. tAttr1.m_bIndexed!=tAttr2.m_bIndexed || tAttr1.m_tLocator.m_iBitCount!=tAttr2.m_tLocator.m_iBitCount ||
  4814. tAttr1.m_tLocator.m_iBitOffset!=tAttr2.m_tLocator.m_iBitOffset;
  4815. }
  4816. if ( bMismatch )
  4817. {
  4818. sError.SetSprintf ( "attribute mismatch (me=%s, in=%s, idx=%d, myattr=%s, inattr=%s)",
  4819. m_sName.cstr(), rhs.m_sName.cstr(), i, sphDumpAttr ( m_dAttrs[i] ).cstr(), sphDumpAttr ( rhs.m_dAttrs[i] ).cstr() );
  4820. return false;
  4821. }
  4822. }
  4823. // check field count
  4824. if ( rhs.m_dFields.GetLength()!=m_dFields.GetLength() )
  4825. {
  4826. sError.SetSprintf ( "fulltext fields count mismatch (me=%s, in=%s, myfields=%d, infields=%d)",
  4827. m_sName.cstr(), rhs.m_sName.cstr(),
  4828. m_dFields.GetLength(), rhs.m_dFields.GetLength() );
  4829. return false;
  4830. }
  4831. // check fulltext field names
  4832. ARRAY_FOREACH ( i, rhs.m_dFields )
  4833. if ( rhs.m_dFields[i].m_sName!=m_dFields[i].m_sName )
  4834. {
  4835. sError.SetSprintf ( "fulltext field mismatch (me=%s, myfield=%s, idx=%d, in=%s, infield=%s)",
  4836. m_sName.cstr(), rhs.m_sName.cstr(),
  4837. i, m_dFields[i].m_sName.cstr(), rhs.m_dFields[i].m_sName.cstr() );
  4838. return false;
  4839. }
  4840. return true;
  4841. }
  4842. int CSphSchema::GetFieldIndex ( const char * sName ) const
  4843. {
  4844. if ( !sName )
  4845. return -1;
  4846. ARRAY_FOREACH ( i, m_dFields )
  4847. if ( strcasecmp ( m_dFields[i].m_sName.cstr(), sName )==0 )
  4848. return i;
  4849. return -1;
  4850. }
  4851. int CSphSchema::GetAttrIndex ( const char * sName ) const
  4852. {
  4853. if ( !sName )
  4854. return -1;
  4855. ARRAY_FOREACH ( i, m_dAttrs )
  4856. if ( m_dAttrs[i].m_sName==sName )
  4857. return i;
  4858. return -1;
  4859. }
  4860. const CSphColumnInfo * CSphSchema::GetAttr ( const char * sName ) const
  4861. {
  4862. int iIndex = GetAttrIndex ( sName );
  4863. if ( iIndex>=0 )
  4864. return &m_dAttrs[iIndex];
  4865. return NULL;
  4866. }
  4867. void CSphSchema::Reset ()
  4868. {
  4869. m_dFields.Reset();
  4870. ResetAttrs ();
  4871. }
  4872. void CSphSchema::ResetAttrs ()
  4873. {
  4874. m_dAttrs.Reset();
  4875. m_dStaticUsed.Reset();
  4876. m_dDynamicUsed.Reset();
  4877. m_dPtrAttrs.Reset();
  4878. m_dFactorAttrs.Reset();
  4879. m_iStaticSize = 0;
  4880. }
  4881. void CSphSchema::AddAttr ( const CSphColumnInfo & tCol, bool bDynamic )
  4882. {
  4883. assert ( tCol.m_eAttrType!=SPH_ATTR_NONE );
  4884. if ( tCol.m_eAttrType==SPH_ATTR_NONE )
  4885. return;
  4886. m_dAttrs.Add ( tCol );
  4887. CSphAttrLocator & tLoc = m_dAttrs.Last().m_tLocator;
  4888. if ( tLoc.IsID() )
  4889. return;
  4890. int iBits = ROWITEM_BITS;
  4891. if ( tCol.m_tLocator.m_iBitCount>0 ) iBits = tCol.m_tLocator.m_iBitCount;
  4892. if ( tCol.m_eAttrType==SPH_ATTR_BOOL ) iBits = 1;
  4893. if ( tCol.m_eAttrType==SPH_ATTR_BIGINT || tCol.m_eAttrType==SPH_ATTR_JSON_FIELD ) iBits = 64;
  4894. tLoc.m_bDynamic = bDynamic;
  4895. CSphVector<int> & dUsed = bDynamic ? m_dDynamicUsed : m_dStaticUsed;
  4896. if ( tCol.m_eAttrType==SPH_ATTR_STRINGPTR )
  4897. {
  4898. iBits = ROWITEMPTR_BITS;
  4899. PtrAttr_t & tPtrAttr = m_dPtrAttrs.Add();
  4900. tPtrAttr.m_iOffset = dUsed.GetLength();
  4901. tPtrAttr.m_sName = tCol.m_sName;
  4902. }
  4903. if ( tCol.m_eAttrType==SPH_ATTR_FACTORS )
  4904. {
  4905. iBits = ROWITEMPTR_BITS;
  4906. PtrAttr_t & tPtrAttr = m_dFactorAttrs.Add();
  4907. tPtrAttr.m_iOffset = dUsed.GetLength();
  4908. tPtrAttr.m_sName = tCol.m_sName;
  4909. }
  4910. tLoc.m_iBitCount = iBits;
  4911. if ( iBits>=ROWITEM_BITS )
  4912. {
  4913. tLoc.m_iBitOffset = dUsed.GetLength()*ROWITEM_BITS;
  4914. int iItems = (iBits+ROWITEM_BITS-1) / ROWITEM_BITS;
  4915. for ( int i=0; i<iItems; i++ )
  4916. {
  4917. dUsed.Add ( ROWITEM_BITS );
  4918. if ( !bDynamic )
  4919. m_iStaticSize++;
  4920. }
  4921. } else
  4922. {
  4923. int iItem;
  4924. for ( iItem=0; iItem<dUsed.GetLength(); iItem++ )
  4925. if ( dUsed[iItem]+iBits<=ROWITEM_BITS )
  4926. break;
  4927. if ( iItem==dUsed.GetLength() )
  4928. {
  4929. dUsed.Add ( 0 );
  4930. if ( !bDynamic )
  4931. m_iStaticSize++;
  4932. }
  4933. tLoc.m_iBitOffset = iItem*ROWITEM_BITS + dUsed[iItem];
  4934. dUsed[iItem] += iBits;
  4935. }
  4936. }
  4937. void CSphSchema::RemoveAttr ( int iIndex )
  4938. {
  4939. // adjust size
  4940. CSphAttrLocator & tLoc = m_dAttrs[iIndex].m_tLocator;
  4941. assert ( !tLoc.m_bDynamic );
  4942. int iItem = tLoc.m_iBitOffset / ROWITEM_BITS;
  4943. if ( tLoc.m_iBitCount>=ROWITEM_BITS )
  4944. {
  4945. for ( int i=0; i<tLoc.m_iBitCount/ROWITEM_BITS; i++ )
  4946. {
  4947. m_dStaticUsed[i+iItem] = 0;
  4948. m_iStaticSize--;
  4949. }
  4950. } else
  4951. {
  4952. m_dStaticUsed[iItem] -= tLoc.m_iBitCount;
  4953. assert ( m_dStaticUsed[iItem]>=0 );
  4954. if ( m_dStaticUsed[iItem]<=0 )
  4955. m_iStaticSize--;
  4956. }
  4957. // do remove
  4958. m_dAttrs.Remove ( iIndex );
  4959. ARRAY_FOREACH ( i, m_dPtrAttrs )
  4960. if ( m_dPtrAttrs[i].m_iOffset==iItem )
  4961. {
  4962. m_dPtrAttrs.Remove(i);
  4963. break;
  4964. }
  4965. ARRAY_FOREACH ( i, m_dFactorAttrs )
  4966. if ( m_dFactorAttrs[i].m_iOffset==iItem )
  4967. {
  4968. m_dFactorAttrs.Remove(i);
  4969. break;
  4970. }
  4971. }
  4972. void FixupPtrAttrs ( const CSphVector<CSphSchema::PtrAttr_t> & dSrcPtrAttrs, const CSphVector<CSphColumnInfo> & dDstAttrs, CSphVector<CSphSchema::PtrAttr_t> & dDstPtrAttrs )
  4973. {
  4974. dDstPtrAttrs.Reset();
  4975. ARRAY_FOREACH ( iSrcPtrAttr, dSrcPtrAttrs )
  4976. ARRAY_FOREACH ( iDstAttr, dDstAttrs )
  4977. if ( dSrcPtrAttrs[iSrcPtrAttr].m_sName==dDstAttrs[iDstAttr].m_sName )
  4978. {
  4979. CSphSchema::PtrAttr_t & tPtrAttr = dDstPtrAttrs.Add();
  4980. tPtrAttr.m_iOffset = dDstAttrs[iDstAttr].m_tLocator.m_iBitOffset / ROWITEM_BITS;
  4981. tPtrAttr.m_sName = dDstAttrs[iDstAttr].m_sName;
  4982. break;
  4983. }
  4984. }
  4985. void CSphSchema::AdoptPtrAttrs ( const CSphSchema & tSrc )
  4986. {
  4987. FixupPtrAttrs ( tSrc.m_dPtrAttrs, m_dAttrs, m_dPtrAttrs );
  4988. FixupPtrAttrs ( tSrc.m_dFactorAttrs, m_dAttrs, m_dFactorAttrs );
  4989. }
  4990. void CSphSchema::CloneMatch ( CSphMatch * pDst, const CSphMatch & rhs ) const
  4991. {
  4992. assert ( pDst );
  4993. FreeStringPtrs ( pDst );
  4994. pDst->Clone ( rhs, GetDynamicSize() );
  4995. CopyStrings ( pDst, rhs );
  4996. };
  4997. void CSphSchema::CloneWholeMatch ( CSphMatch * pDst, const CSphMatch & rhs ) const
  4998. {
  4999. assert ( pDst );
  5000. FreeStringPtrs ( pDst );
  5001. pDst->Clone ( rhs, GetRowSize() );
  5002. CopyStrings ( pDst, rhs );
  5003. };
  5004. void CSphSchema::CopyStrings ( CSphMatch * pDst, const CSphMatch & rhs, int iUpBound ) const
  5005. {
  5006. if ( iUpBound<0 )
  5007. {
  5008. ARRAY_FOREACH ( i, m_dPtrAttrs )
  5009. *(const char**) (pDst->m_pDynamic+m_dPtrAttrs[i].m_iOffset) = CSphString (*(const char**)(rhs.m_pDynamic+m_dPtrAttrs[i].m_iOffset)).Leak();
  5010. } else
  5011. {
  5012. ARRAY_FOREACH ( i, m_dPtrAttrs )
  5013. if ( m_dPtrAttrs[i].m_iOffset < iUpBound )
  5014. *(const char**) (pDst->m_pDynamic+m_dPtrAttrs[i].m_iOffset) = CSphString (*(const char**)(rhs.m_pDynamic+m_dPtrAttrs[i].m_iOffset)).Leak();
  5015. else
  5016. break;
  5017. }
  5018. // not immediately obvious: this is not needed while pushing matches to sorters; factors are held in an outer hash table
  5019. // but it is necessary to copy factors when combining results from several indexes via a sorter because at this moment matches are the owners of factor data
  5020. ARRAY_FOREACH ( i, m_dFactorAttrs )
  5021. {
  5022. int iOffset = m_dFactorAttrs[i].m_iOffset;
  5023. BYTE * pData = *(BYTE**)(rhs.m_pDynamic+iOffset);
  5024. if ( pData )
  5025. {
  5026. DWORD uDataSize = *(DWORD*)pData;
  5027. assert ( uDataSize );
  5028. BYTE * pCopy = new BYTE[uDataSize];
  5029. memcpy ( pCopy, pData, uDataSize );
  5030. *(BYTE**)(pDst->m_pDynamic+iOffset) = pCopy;
  5031. }
  5032. }
  5033. }
  5034. void CSphSchema::FreeStringPtrs ( CSphMatch * pMatch, int iUpBound ) const
  5035. {
  5036. assert ( pMatch );
  5037. if ( !pMatch->m_pDynamic )
  5038. return;
  5039. if ( m_dPtrAttrs.GetLength() )
  5040. {
  5041. CSphString sStr;
  5042. if ( iUpBound<0 )
  5043. {
  5044. ARRAY_FOREACH ( i, m_dPtrAttrs )
  5045. sStr.Adopt ( (char**) (pMatch->m_pDynamic+m_dPtrAttrs[i].m_iOffset));
  5046. } else
  5047. {
  5048. ARRAY_FOREACH ( i, m_dPtrAttrs )
  5049. if ( m_dPtrAttrs[i].m_iOffset < iUpBound )
  5050. sStr.Adopt ( (char**) (pMatch->m_pDynamic+m_dPtrAttrs[i].m_iOffset));
  5051. else
  5052. break;
  5053. }
  5054. }
  5055. ARRAY_FOREACH ( i, m_dFactorAttrs )
  5056. {
  5057. int iOffset = m_dFactorAttrs[i].m_iOffset;
  5058. BYTE * pData = *(BYTE**)(pMatch->m_pDynamic+iOffset);
  5059. if ( pData )
  5060. {
  5061. delete [] pData;
  5062. *(BYTE**)(pMatch->m_pDynamic+iOffset) = NULL;
  5063. }
  5064. }
  5065. }
  5066. ///////////////////////////////////////////////////////////////////////////////
  5067. // BIT-ENCODED FILE OUTPUT
  5068. ///////////////////////////////////////////////////////////////////////////////
  5069. CSphWriter::CSphWriter ()
  5070. : m_sName ( "" )
  5071. , m_iPos ( -1 )
  5072. , m_iWritten ( 0 )
  5073. , m_iFD ( -1 )
  5074. , m_iPoolUsed ( 0 )
  5075. , m_pBuffer ( NULL )
  5076. , m_pPool ( NULL )
  5077. , m_bOwnFile ( false )
  5078. , m_pSharedOffset ( NULL )
  5079. , m_iBufferSize ( 262144 )
  5080. , m_bError ( false )
  5081. , m_pError ( NULL )
  5082. {
  5083. m_pThrottle = &g_tThrottle;
  5084. }
  5085. void CSphWriter::SetBufferSize ( int iBufferSize )
  5086. {
  5087. if ( iBufferSize!=m_iBufferSize )
  5088. {
  5089. m_iBufferSize = Max ( iBufferSize, 262144 );
  5090. if ( m_pBuffer )
  5091. SafeDeleteArray ( m_pBuffer );
  5092. }
  5093. }
  5094. bool CSphWriter::OpenFile ( const CSphString & sName, CSphString & sErrorBuffer )
  5095. {
  5096. assert ( !sName.IsEmpty() );
  5097. assert ( m_iFD<0 && "already open" );
  5098. m_bOwnFile = true;
  5099. m_sName = sName;
  5100. m_pError = &sErrorBuffer;
  5101. if ( !m_pBuffer )
  5102. m_pBuffer = new BYTE [ m_iBufferSize ];
  5103. m_iFD = ::open ( m_sName.cstr(), SPH_O_NEW, 0644 );
  5104. m_pPool = m_pBuffer;
  5105. m_iPoolUsed = 0;
  5106. m_iPos = 0;
  5107. m_iWritten = 0;
  5108. m_bError = ( m_iFD<0 );
  5109. if ( m_bError )
  5110. m_pError->SetSprintf ( "failed to create %s: %s" , sName.cstr(), strerror(errno) );
  5111. return !m_bError;
  5112. }
  5113. void CSphWriter::SetFile ( CSphAutofile & tAuto, SphOffset_t * pSharedOffset, CSphString & sError )
  5114. {
  5115. assert ( m_iFD<0 && "already open" );
  5116. m_bOwnFile = false;
  5117. if ( !m_pBuffer )
  5118. m_pBuffer = new BYTE [ m_iBufferSize ];
  5119. m_iFD = tAuto.GetFD();
  5120. m_sName = tAuto.GetFilename();
  5121. m_pPool = m_pBuffer;
  5122. m_iPoolUsed = 0;
  5123. m_iPos = 0;
  5124. m_iWritten = 0;
  5125. m_pSharedOffset = pSharedOffset;
  5126. m_pError = &sError;
  5127. assert ( m_pError );
  5128. }
  5129. CSphWriter::~CSphWriter ()
  5130. {
  5131. CloseFile ();
  5132. SafeDeleteArray ( m_pBuffer );
  5133. }
  5134. void CSphWriter::CloseFile ( bool bTruncate )
  5135. {
  5136. if ( m_iFD>=0 )
  5137. {
  5138. Flush ();
  5139. if ( bTruncate )
  5140. sphTruncate ( m_iFD );
  5141. if ( m_bOwnFile )
  5142. ::close ( m_iFD );
  5143. m_iFD = -1;
  5144. }
  5145. }
  5146. void CSphWriter::UnlinkFile()
  5147. {
  5148. if ( m_bOwnFile )
  5149. {
  5150. if ( m_iFD>=0 )
  5151. ::close ( m_iFD );
  5152. m_iFD = -1;
  5153. ::unlink ( m_sName.cstr() );
  5154. m_sName = "";
  5155. }
  5156. SafeDeleteArray ( m_pBuffer );
  5157. }
  5158. void CSphWriter::PutByte ( int data )
  5159. {
  5160. assert ( m_pPool );
  5161. if ( m_iPoolUsed==m_iBufferSize )
  5162. Flush ();
  5163. *m_pPool++ = BYTE ( data & 0xff );
  5164. m_iPoolUsed++;
  5165. m_iPos++;
  5166. }
  5167. void CSphWriter::PutBytes ( const void * pData, int64_t iSize )
  5168. {
  5169. assert ( m_pPool );
  5170. const BYTE * pBuf = (const BYTE *) pData;
  5171. while ( iSize>0 )
  5172. {
  5173. int iPut = ( iSize<m_iBufferSize ? int(iSize) : m_iBufferSize ); // comparison int64 to int32
  5174. if ( m_iPoolUsed+iPut>m_iBufferSize )
  5175. Flush ();
  5176. assert ( m_iPoolUsed+iPut<=m_iBufferSize );
  5177. memcpy ( m_pPool, pBuf, iPut );
  5178. m_pPool += iPut;
  5179. m_iPoolUsed += iPut;
  5180. m_iPos += iPut;
  5181. pBuf += iPut;
  5182. iSize -= iPut;
  5183. }
  5184. }
  5185. void CSphWriter::ZipInt ( DWORD uValue )
  5186. {
  5187. int iBytes = 1;
  5188. DWORD u = ( uValue>>7 );
  5189. while ( u )
  5190. {
  5191. u >>= 7;
  5192. iBytes++;
  5193. }
  5194. while ( iBytes-- )
  5195. PutByte (
  5196. ( 0x7f & ( uValue >> (7*iBytes) ) )
  5197. | ( iBytes ? 0x80 : 0 ) );
  5198. }
  5199. void CSphWriter::ZipOffset ( SphOffset_t uValue )
  5200. {
  5201. int iBytes = 1;
  5202. uint64_t u = ((uint64_t)uValue)>>7;
  5203. while ( u )
  5204. {
  5205. u >>= 7;
  5206. iBytes++;
  5207. }
  5208. while ( iBytes-- )
  5209. PutByte (
  5210. ( 0x7f & (DWORD)( uValue >> (7*iBytes) ) )
  5211. | ( iBytes ? 0x80 : 0 ) );
  5212. }
  5213. void CSphWriter::ZipOffsets ( CSphVector<SphOffset_t> * pData )
  5214. {
  5215. assert ( pData );
  5216. SphOffset_t * pValue = &((*pData)[0]);
  5217. int n = pData->GetLength ();
  5218. while ( n-->0 )
  5219. {
  5220. SphOffset_t uValue = *pValue++;
  5221. int iBytes = 1;
  5222. uint64_t u = ((uint64_t)uValue)>>7;
  5223. while ( u )
  5224. {
  5225. u >>= 7;
  5226. iBytes++;
  5227. }
  5228. while ( iBytes-- )
  5229. PutByte (
  5230. ( 0x7f & (DWORD)( uValue >> (7*iBytes) ) )
  5231. | ( iBytes ? 0x80 : 0 ) );
  5232. }
  5233. }
  5234. void CSphWriter::Flush ()
  5235. {
  5236. // PROFILE ( write_hits );
  5237. if ( m_pSharedOffset && *m_pSharedOffset!=m_iWritten )
  5238. sphSeek ( m_iFD, m_iWritten, SEEK_SET );
  5239. if ( !sphWriteThrottled ( m_iFD, m_pBuffer, m_iPoolUsed, m_sName.cstr(), *m_pError, m_pThrottle ) )
  5240. m_bError = true;
  5241. m_iWritten += m_iPoolUsed;
  5242. m_iPoolUsed = 0;
  5243. m_pPool = m_pBuffer;
  5244. if ( m_pSharedOffset )
  5245. *m_pSharedOffset = m_iWritten;
  5246. }
  5247. void CSphWriter::PutString ( const char * szString )
  5248. {
  5249. int iLen = szString ? strlen ( szString ) : 0;
  5250. PutDword ( iLen );
  5251. if ( iLen )
  5252. PutBytes ( szString, iLen );
  5253. }
  5254. void CSphWriter::PutString ( const CSphString & sString )
  5255. {
  5256. int iLen = sString.Length();
  5257. PutDword ( iLen );
  5258. if ( iLen )
  5259. PutBytes ( sString.cstr(), iLen );
  5260. }
  5261. void CSphWriter::Tag ( const char * sTag )
  5262. {
  5263. assert ( sTag && *sTag ); // empty tags are nonsense
  5264. assert ( strlen(sTag)<64 ); // huge tags are nonsense
  5265. PutBytes ( sTag, strlen(sTag) );
  5266. }
  5267. void CSphWriter::SeekTo ( SphOffset_t iPos )
  5268. {
  5269. assert ( iPos>=0 );
  5270. if ( iPos>=m_iWritten && iPos<=( m_iWritten + m_iPoolUsed ) )
  5271. {
  5272. // seeking inside the buffer
  5273. m_iPoolUsed = (int)( iPos - m_iWritten );
  5274. m_pPool = m_pBuffer + m_iPoolUsed;
  5275. } else
  5276. {
  5277. assert ( iPos<m_iWritten ); // seeking forward in a writer, we don't support it
  5278. sphSeek ( m_iFD, iPos, SEEK_SET );
  5279. // seeking outside the buffer; so the buffer must be discarded
  5280. // also, current write position must be adjusted
  5281. m_pPool = m_pBuffer;
  5282. m_iPoolUsed = 0;
  5283. m_iWritten = iPos;
  5284. }
  5285. m_iPos = iPos;
  5286. }
  5287. ///////////////////////////////////////////////////////////////////////////////
  5288. // BIT-ENCODED FILE INPUT
  5289. ///////////////////////////////////////////////////////////////////////////////
  5290. CSphReader::CSphReader ( BYTE * pBuf, int iSize )
  5291. : m_pProfile ( NULL )
  5292. , m_eProfileState ( SPH_QSTATE_IO )
  5293. , m_iFD ( -1 )
  5294. , m_iPos ( 0 )
  5295. , m_iBuffPos ( 0 )
  5296. , m_iBuffUsed ( 0 )
  5297. , m_pBuff ( pBuf )
  5298. , m_iSizeHint ( 0 )
  5299. , m_iBufSize ( iSize )
  5300. , m_bBufOwned ( false )
  5301. , m_iReadUnhinted ( DEFAULT_READ_UNHINTED )
  5302. , m_bError ( false )
  5303. {
  5304. assert ( pBuf==NULL || iSize>0 );
  5305. m_pThrottle = &g_tThrottle;
  5306. }
  5307. CSphReader::~CSphReader ()
  5308. {
  5309. if ( m_bBufOwned )
  5310. SafeDeleteArray ( m_pBuff );
  5311. }
  5312. void CSphReader::SetBuffers ( int iReadBuffer, int iReadUnhinted )
  5313. {
  5314. if ( !m_pBuff )
  5315. m_iBufSize = iReadBuffer;
  5316. m_iReadUnhinted = iReadUnhinted;
  5317. }
  5318. void CSphReader::SetFile ( int iFD, const char * sFilename )
  5319. {
  5320. m_iFD = iFD;
  5321. m_iPos = 0;
  5322. m_iBuffPos = 0;
  5323. m_iBuffUsed = 0;
  5324. m_sFilename = sFilename;
  5325. }
  5326. void CSphReader::SetFile ( const CSphAutofile & tFile )
  5327. {
  5328. SetFile ( tFile.GetFD(), tFile.GetFilename() );
  5329. }
  5330. void CSphReader::Reset ()
  5331. {
  5332. SetFile ( -1, "" );
  5333. }
  5334. /// sizehint > 0 means we expect to read approx that much bytes
  5335. /// sizehint == 0 means no hint, use default (happens later in UpdateCache())
  5336. /// sizehint == -1 means reposition and adjust current hint
  5337. void CSphReader::SeekTo ( SphOffset_t iPos, int iSizeHint )
  5338. {
  5339. assert ( iPos>=0 );
  5340. assert ( iSizeHint>=-1 );
  5341. #ifndef NDEBUG
  5342. #if PARANOID
  5343. struct_stat tStat;
  5344. fstat ( m_iFD, &tStat );
  5345. if ( iPos > tStat.st_size )
  5346. sphDie ( "INTERNAL ERROR: seeking past the end of file" );
  5347. #endif
  5348. #endif
  5349. if ( iPos>=m_iPos && iPos<m_iPos+m_iBuffUsed )
  5350. {
  5351. m_iBuffPos = (int)( iPos-m_iPos ); // reposition to proper byte
  5352. m_iSizeHint = iSizeHint - ( m_iBuffUsed - m_iBuffPos ); // we already have some bytes cached, so let's adjust size hint
  5353. assert ( m_iBuffPos<m_iBuffUsed );
  5354. } else
  5355. {
  5356. m_iPos = iPos;
  5357. m_iBuffPos = 0; // for GetPos() to work properly, aaaargh
  5358. m_iBuffUsed = 0;
  5359. if ( iSizeHint==-1 )
  5360. {
  5361. // the adjustment bureau
  5362. // we need to seek but still keep the current hint
  5363. // happens on a skiplist jump, for instance
  5364. int64_t iHintLeft = m_iPos + m_iSizeHint - iPos;
  5365. if ( iHintLeft>0 && iHintLeft<INT_MAX )
  5366. iSizeHint = (int)iHintLeft;
  5367. else
  5368. iSizeHint = 0;
  5369. }
  5370. // get that hint
  5371. assert ( iSizeHint>=0 );
  5372. m_iSizeHint = iSizeHint;
  5373. }
  5374. }
  5375. void CSphReader::SkipBytes ( int iCount )
  5376. {
  5377. // 0 means "no hint", so this clamp works alright
  5378. SeekTo ( m_iPos+m_iBuffPos+iCount, Max ( m_iSizeHint-m_iBuffPos-iCount, 0 ) );
  5379. }
  5380. #if USE_WINDOWS
  5381. // atomic seek+read for Windows
  5382. int sphPread ( int iFD, void * pBuf, int iBytes, SphOffset_t iOffset )
  5383. {
  5384. if ( iBytes==0 )
  5385. return 0;
  5386. CSphIOStats * pIOStats = GetIOStats();
  5387. int64_t tmStart = 0;
  5388. if ( pIOStats )
  5389. tmStart = sphMicroTimer();
  5390. HANDLE hFile;
  5391. hFile = (HANDLE) _get_osfhandle ( iFD );
  5392. if ( hFile==INVALID_HANDLE_VALUE )
  5393. return -1;
  5394. STATIC_SIZE_ASSERT ( SphOffset_t, 8 );
  5395. OVERLAPPED tOverlapped = { 0 };
  5396. tOverlapped.Offset = (DWORD)( iOffset & I64C(0xffffffff) );
  5397. tOverlapped.OffsetHigh = (DWORD)( iOffset>>32 );
  5398. DWORD uRes;
  5399. if ( !ReadFile ( hFile, pBuf, iBytes, &uRes, &tOverlapped ) )
  5400. {
  5401. DWORD uErr = GetLastError();
  5402. if ( uErr==ERROR_HANDLE_EOF )
  5403. return 0;
  5404. errno = uErr; // FIXME! should remap from Win to POSIX
  5405. return -1;
  5406. }
  5407. if ( pIOStats )
  5408. {
  5409. pIOStats->m_iReadTime += sphMicroTimer() - tmStart;
  5410. pIOStats->m_iReadOps++;
  5411. pIOStats->m_iReadBytes += iBytes;
  5412. }
  5413. return uRes;
  5414. }
  5415. #else
  5416. #if HAVE_PREAD
  5417. // atomic seek+read for non-Windows systems with pread() call
  5418. int sphPread ( int iFD, void * pBuf, int iBytes, SphOffset_t iOffset )
  5419. {
  5420. CSphIOStats * pIOStats = GetIOStats();
  5421. if ( !pIOStats )
  5422. return ::pread ( iFD, pBuf, iBytes, iOffset );
  5423. int64_t tmStart = sphMicroTimer();
  5424. int iRes = (int) ::pread ( iFD, pBuf, iBytes, iOffset );
  5425. if ( pIOStats )
  5426. {
  5427. pIOStats->m_iReadTime += sphMicroTimer() - tmStart;
  5428. pIOStats->m_iReadOps++;
  5429. pIOStats->m_iReadBytes += iBytes;
  5430. }
  5431. return iRes;
  5432. }
  5433. #else
  5434. // generic fallback; prone to races between seek and read
  5435. int sphPread ( int iFD, void * pBuf, int iBytes, SphOffset_t iOffset )
  5436. {
  5437. if ( sphSeek ( iFD, iOffset, SEEK_SET )==-1 )
  5438. return -1;
  5439. return sphReadThrottled ( iFD, pBuf, iBytes, &g_tThrottle );
  5440. }
  5441. #endif // HAVE_PREAD
  5442. #endif // USE_WINDOWS
  5443. void CSphReader::UpdateCache ()
  5444. {
  5445. ESphQueryState eOld = SPH_QSTATE_TOTAL;
  5446. if ( m_pProfile )
  5447. eOld = m_pProfile->Switch ( m_eProfileState );
  5448. assert ( m_iFD>=0 );
  5449. // alloc buf on first actual read
  5450. if ( !m_pBuff )
  5451. {
  5452. if ( m_iBufSize<=0 )
  5453. m_iBufSize = DEFAULT_READ_BUFFER;
  5454. m_bBufOwned = true;
  5455. m_pBuff = new BYTE [ m_iBufSize ];
  5456. }
  5457. // stream position could be changed externally
  5458. // so let's just hope that the OS optimizes redundant seeks
  5459. SphOffset_t iNewPos = m_iPos + Min ( m_iBuffPos, m_iBuffUsed );
  5460. if ( m_iSizeHint<=0 )
  5461. m_iSizeHint = ( m_iReadUnhinted>0 ) ? m_iReadUnhinted : DEFAULT_READ_UNHINTED;
  5462. int iReadLen = Min ( m_iSizeHint, m_iBufSize );
  5463. m_iBuffPos = 0;
  5464. m_iBuffUsed = sphPread ( m_iFD, m_pBuff, iReadLen, iNewPos ); // FIXME! what about throttling?
  5465. if ( m_iBuffUsed<0 )
  5466. {
  5467. m_iBuffUsed = m_iBuffPos = 0;
  5468. m_bError = true;
  5469. m_sError.SetSprintf ( "pread error in %s: pos="INT64_FMT", len=%d, code=%d, msg=%s",
  5470. m_sFilename.cstr(), (int64_t)iNewPos, iReadLen, errno, strerror(errno) );
  5471. if ( m_pProfile )
  5472. m_pProfile->Switch ( eOld );
  5473. return;
  5474. }
  5475. // all fine, adjust offset and hint
  5476. m_iSizeHint -= m_iBuffUsed;
  5477. m_iPos = iNewPos;
  5478. if ( m_pProfile )
  5479. m_pProfile->Switch ( eOld );
  5480. }
  5481. int CSphReader::GetByte ()
  5482. {
  5483. if ( m_iBuffPos>=m_iBuffUsed )
  5484. {
  5485. UpdateCache ();
  5486. if ( m_iBuffPos>=m_iBuffUsed )
  5487. return 0; // unexpected io failure
  5488. }
  5489. assert ( m_iBuffPos<m_iBuffUsed );
  5490. return m_pBuff [ m_iBuffPos++ ];
  5491. }
  5492. void CSphReader::GetBytes ( void * pData, int iSize )
  5493. {
  5494. BYTE * pOut = (BYTE*) pData;
  5495. while ( iSize>m_iBufSize )
  5496. {
  5497. int iLen = m_iBuffUsed - m_iBuffPos;
  5498. assert ( iLen<=m_iBufSize );
  5499. memcpy ( pOut, m_pBuff+m_iBuffPos, iLen );
  5500. m_iBuffPos += iLen;
  5501. pOut += iLen;
  5502. iSize -= iLen;
  5503. m_iSizeHint = iSize; // FIXME!
  5504. if ( iSize>0 )
  5505. {
  5506. UpdateCache ();
  5507. if ( !m_iBuffUsed )
  5508. {
  5509. memset ( pData, 0, iSize );
  5510. return; // unexpected io failure
  5511. }
  5512. }
  5513. }
  5514. if ( m_iBuffPos+iSize>m_iBuffUsed )
  5515. {
  5516. // move old buffer tail to buffer head to avoid losing the data
  5517. const int iLen = m_iBuffUsed - m_iBuffPos;
  5518. if ( iLen>0 )
  5519. {
  5520. memcpy ( pOut, m_pBuff+m_iBuffPos, iLen );
  5521. m_iBuffPos += iLen;
  5522. pOut += iLen;
  5523. iSize -= iLen;
  5524. }
  5525. m_iSizeHint = iSize - m_iBuffUsed + m_iBuffPos; // FIXME!
  5526. UpdateCache ();
  5527. if ( m_iBuffPos+iSize>m_iBuffUsed )
  5528. {
  5529. memset ( pData, 0, iSize ); // unexpected io failure
  5530. return;
  5531. }
  5532. }
  5533. assert ( (m_iBuffPos+iSize)<=m_iBuffUsed );
  5534. memcpy ( pOut, m_pBuff+m_iBuffPos, iSize );
  5535. m_iBuffPos += iSize;
  5536. }
  5537. int CSphReader::GetBytesZerocopy ( const BYTE ** ppData, int iMax )
  5538. {
  5539. if ( m_iBuffPos>=m_iBuffUsed )
  5540. {
  5541. UpdateCache ();
  5542. if ( m_iBuffPos>=m_iBuffUsed )
  5543. return 0; // unexpected io failure
  5544. }
  5545. int iChunk = Min ( m_iBuffUsed-m_iBuffPos, iMax );
  5546. *ppData = m_pBuff + m_iBuffPos;
  5547. m_iBuffPos += iChunk;
  5548. return iChunk;
  5549. }
  5550. int CSphReader::GetLine ( char * sBuffer, int iMaxLen )
  5551. {
  5552. int iOutPos = 0;
  5553. iMaxLen--; // reserve space for trailing '\0'
  5554. // grab as many chars as we can
  5555. while ( iOutPos<iMaxLen )
  5556. {
  5557. // read next chunk if necessary
  5558. if ( m_iBuffPos>=m_iBuffUsed )
  5559. {
  5560. UpdateCache ();
  5561. if ( m_iBuffPos>=m_iBuffUsed )
  5562. {
  5563. if ( iOutPos==0 ) return -1; // current line is empty; indicate eof
  5564. break; // return current line; will return eof next time
  5565. }
  5566. }
  5567. // break on CR or LF
  5568. if ( m_pBuff[m_iBuffPos]=='\r' || m_pBuff[m_iBuffPos]=='\n' )
  5569. break;
  5570. // one more valid char
  5571. sBuffer[iOutPos++] = m_pBuff[m_iBuffPos++];
  5572. }
  5573. // skip everything until the newline or eof
  5574. for ( ;; )
  5575. {
  5576. // read next chunk if necessary
  5577. if ( m_iBuffPos>=m_iBuffUsed )
  5578. UpdateCache ();
  5579. // eof?
  5580. if ( m_iBuffPos>=m_iBuffUsed )
  5581. break;
  5582. // newline?
  5583. if ( m_pBuff[m_iBuffPos++]=='\n' )
  5584. break;
  5585. }
  5586. // finalize
  5587. sBuffer[iOutPos] = '\0';
  5588. return iOutPos;
  5589. }
  5590. /////////////////////////////////////////////////////////////////////////////
  5591. #if PARANOID
  5592. #define SPH_VARINT_DECODE(_type,_getexpr) \
  5593. register DWORD b = 0; \
  5594. register _type v = 0; \
  5595. int it = 0; \
  5596. do { b = _getexpr; v = ( v<<7 ) + ( b&0x7f ); it++; } while ( b&0x80 ); \
  5597. assert ( (it-1)*7<=sizeof(_type)*8 ); \
  5598. return v;
  5599. #else
  5600. #define SPH_VARINT_DECODE(_type,_getexpr) \
  5601. register DWORD b = _getexpr; \
  5602. register _type res = 0; \
  5603. while ( b & 0x80 ) \
  5604. { \
  5605. res = ( res<<7 ) + ( b & 0x7f ); \
  5606. b = _getexpr; \
  5607. } \
  5608. res = ( res<<7 ) + b; \
  5609. return res;
  5610. #endif // PARANOID
  5611. DWORD sphUnzipInt ( const BYTE * & pBuf ) { SPH_VARINT_DECODE ( DWORD, *pBuf++ ); }
  5612. SphOffset_t sphUnzipOffset ( const BYTE * & pBuf ) { SPH_VARINT_DECODE ( SphOffset_t, *pBuf++ ); }
  5613. DWORD CSphReader::UnzipInt () { SPH_VARINT_DECODE ( DWORD, GetByte() ); }
  5614. SphOffset_t CSphReader::UnzipOffset () { SPH_VARINT_DECODE ( SphOffset_t, GetByte() ); }
  5615. #if USE_64BIT
  5616. #define sphUnzipWordid sphUnzipOffset
  5617. #else
  5618. #define sphUnzipWordid sphUnzipInt
  5619. #endif
  5620. /////////////////////////////////////////////////////////////////////////////
  5621. const CSphReader & CSphReader::operator = ( const CSphReader & rhs )
  5622. {
  5623. SetFile ( rhs.m_iFD, rhs.m_sFilename.cstr() );
  5624. SeekTo ( rhs.m_iPos + rhs.m_iBuffPos, rhs.m_iSizeHint );
  5625. return *this;
  5626. }
  5627. DWORD CSphReader::GetDword ()
  5628. {
  5629. DWORD uRes = 0;
  5630. GetBytes ( &uRes, sizeof(DWORD) );
  5631. return uRes;
  5632. }
  5633. SphOffset_t CSphReader::GetOffset ()
  5634. {
  5635. SphOffset_t uRes = 0;
  5636. GetBytes ( &uRes, sizeof(SphOffset_t) );
  5637. return uRes;
  5638. }
  5639. CSphString CSphReader::GetString ()
  5640. {
  5641. CSphString sRes;
  5642. DWORD iLen = GetDword ();
  5643. if ( iLen )
  5644. {
  5645. char * sBuf = new char [ iLen ];
  5646. GetBytes ( sBuf, iLen );
  5647. sRes.SetBinary ( sBuf, iLen );
  5648. SafeDeleteArray ( sBuf );
  5649. }
  5650. return sRes;
  5651. }
  5652. bool CSphReader::Tag ( const char * sTag )
  5653. {
  5654. if ( m_bError )
  5655. return false;
  5656. assert ( sTag && *sTag ); // empty tags are nonsense
  5657. assert ( strlen(sTag)<64 ); // huge tags are nonsense
  5658. int iLen = strlen(sTag);
  5659. char sBuf[64];
  5660. GetBytes ( sBuf, iLen );
  5661. if ( !memcmp ( sBuf, sTag, iLen ) )
  5662. return true;
  5663. m_bError = true;
  5664. m_sError.SetSprintf ( "expected tag %s was not found", sTag );
  5665. return false;
  5666. }
  5667. //////////////////////////////////////////////////////////////////////////
  5668. CSphAutoreader::~CSphAutoreader ()
  5669. {
  5670. Close ();
  5671. }
  5672. bool CSphAutoreader::Open ( const CSphString & sFilename, CSphString & sError )
  5673. {
  5674. assert ( m_iFD<0 );
  5675. assert ( !sFilename.IsEmpty() );
  5676. m_iFD = ::open ( sFilename.cstr(), SPH_O_READ, 0644 );
  5677. m_iPos = 0;
  5678. m_iBuffPos = 0;
  5679. m_iBuffUsed = 0;
  5680. m_sFilename = sFilename;
  5681. if ( m_iFD<0 )
  5682. sError.SetSprintf ( "failed to open %s: %s", sFilename.cstr(), strerror(errno) );
  5683. return ( m_iFD>=0 );
  5684. }
  5685. void CSphAutoreader::Close ()
  5686. {
  5687. if ( m_iFD>=0 )
  5688. ::close ( m_iFD );
  5689. m_iFD = -1;
  5690. }
  5691. SphOffset_t CSphAutoreader::GetFilesize ()
  5692. {
  5693. assert ( m_iFD>=0 );
  5694. struct_stat st;
  5695. if ( m_iFD<0 || fstat ( m_iFD, &st )<0 )
  5696. return -1;
  5697. return st.st_size;
  5698. }
  5699. /////////////////////////////////////////////////////////////////////////////
  5700. // QUERY RESULT
  5701. /////////////////////////////////////////////////////////////////////////////
  5702. CSphQueryResult::CSphQueryResult ()
  5703. : m_tSchema ( "query_result" )
  5704. {
  5705. m_iQueryTime = 0;
  5706. m_iRealQueryTime = 0;
  5707. m_iCpuTime = 0;
  5708. m_iMultiplier = 1;
  5709. m_iTotalMatches = 0;
  5710. m_pMva = NULL;
  5711. m_pStrings = NULL;
  5712. m_iOffset = 0;
  5713. m_iCount = 0;
  5714. m_iSuccesses = 0;
  5715. m_pProfile = NULL;
  5716. }
  5717. CSphQueryResult::~CSphQueryResult ()
  5718. {
  5719. ARRAY_FOREACH ( i, m_dStorage2Free )
  5720. {
  5721. SafeDeleteArray ( m_dStorage2Free[i] );
  5722. }
  5723. ARRAY_FOREACH ( i, m_dMatches )
  5724. m_tSchema.FreeStringPtrs ( &m_dMatches[i] );
  5725. }
  5726. void CSphQueryResult::LeakStorages ( CSphQueryResult & tDst )
  5727. {
  5728. ARRAY_FOREACH ( i, m_dStorage2Free )
  5729. tDst.m_dStorage2Free.Add ( m_dStorage2Free[i] );
  5730. m_dStorage2Free.Reset();
  5731. }
  5732. /////////////////////////////////////////////////////////////////////////////
  5733. // CHUNK READER
  5734. /////////////////////////////////////////////////////////////////////////////
  5735. CSphBin::CSphBin ( ESphHitless eMode, bool bWordDict )
  5736. : m_eMode ( eMode )
  5737. , m_dBuffer ( NULL )
  5738. , m_pCurrent ( NULL )
  5739. , m_iLeft ( 0 )
  5740. , m_iDone ( 0 )
  5741. , m_eState ( BIN_POS )
  5742. , m_bWordDict ( bWordDict )
  5743. , m_bError ( false )
  5744. , m_iFile ( -1 )
  5745. , m_pFilePos ( NULL )
  5746. , m_iFilePos ( 0 )
  5747. , m_iFileLeft ( 0 )
  5748. {
  5749. m_tHit.m_sKeyword = bWordDict ? m_sKeyword : NULL;
  5750. m_sKeyword[0] = '\0';
  5751. m_pThrottle = &g_tThrottle;
  5752. #ifndef NDEBUG
  5753. m_iLastWordID = 0;
  5754. m_sLastKeyword[0] = '\0';
  5755. #endif
  5756. }
  5757. int CSphBin::CalcBinSize ( int iMemoryLimit, int iBlocks, const char * sPhase, bool bWarn )
  5758. {
  5759. if ( iBlocks<=0 )
  5760. return CSphBin::MIN_SIZE;
  5761. int iBinSize = ( ( iMemoryLimit/iBlocks + 2048 ) >> 12 ) << 12; // round to 4k
  5762. if ( iBinSize<CSphBin::MIN_SIZE )
  5763. {
  5764. iBinSize = CSphBin::MIN_SIZE;
  5765. sphWarn ( "%s: mem_limit=%d kb extremely low, increasing to %d kb",
  5766. sPhase, iMemoryLimit/1024, iBinSize*iBlocks/1024 );
  5767. }
  5768. if ( iBinSize<CSphBin::WARN_SIZE && bWarn )
  5769. {
  5770. sphWarn ( "%s: merge_block_size=%d kb too low, increasing mem_limit may improve performance",
  5771. sPhase, iBinSize/1024 );
  5772. }
  5773. return iBinSize;
  5774. }
  5775. void CSphBin::Init ( int iFD, SphOffset_t * pSharedOffset, const int iBinSize )
  5776. {
  5777. assert ( !m_dBuffer );
  5778. assert ( iBinSize>=MIN_SIZE );
  5779. assert ( pSharedOffset );
  5780. m_iFile = iFD;
  5781. m_pFilePos = pSharedOffset;
  5782. m_iSize = iBinSize;
  5783. m_dBuffer = new BYTE [ iBinSize ];
  5784. m_pCurrent = m_dBuffer;
  5785. m_tHit.m_iDocID = 0;
  5786. m_tHit.m_iWordID = 0;
  5787. m_tHit.m_iWordPos = EMPTY_HIT;
  5788. m_tHit.m_dFieldMask.Unset();
  5789. m_bError = false;
  5790. }
  5791. CSphBin::~CSphBin ()
  5792. {
  5793. SafeDeleteArray ( m_dBuffer );
  5794. }
  5795. int CSphBin::ReadByte ()
  5796. {
  5797. BYTE r;
  5798. if ( !m_iLeft )
  5799. {
  5800. // PROFILE ( read_hits );
  5801. if ( *m_pFilePos!=m_iFilePos )
  5802. {
  5803. sphSeek ( m_iFile, m_iFilePos, SEEK_SET );
  5804. *m_pFilePos = m_iFilePos;
  5805. }
  5806. int n = m_iFileLeft > m_iSize
  5807. ? m_iSize
  5808. : (int)m_iFileLeft;
  5809. if ( n==0 )
  5810. {
  5811. m_iDone = 1;
  5812. m_iLeft = 1;
  5813. } else
  5814. {
  5815. assert ( m_dBuffer );
  5816. if ( sphReadThrottled ( m_iFile, m_dBuffer, n, m_pThrottle )!=(size_t)n )
  5817. {
  5818. m_bError = true;
  5819. return -2;
  5820. }
  5821. m_iLeft = n;
  5822. m_iFilePos += n;
  5823. m_iFileLeft -= n;
  5824. m_pCurrent = m_dBuffer;
  5825. *m_pFilePos += n;
  5826. }
  5827. }
  5828. if ( m_iDone )
  5829. {
  5830. m_bError = true; // unexpected (!) eof
  5831. return -1;
  5832. }
  5833. m_iLeft--;
  5834. r = *(m_pCurrent);
  5835. m_pCurrent++;
  5836. return r;
  5837. }
  5838. ESphBinRead CSphBin::ReadBytes ( void * pDest, int iBytes )
  5839. {
  5840. assert ( iBytes>0 );
  5841. assert ( iBytes<=m_iSize );
  5842. if ( m_iDone )
  5843. return BIN_READ_EOF;
  5844. if ( m_iLeft<iBytes )
  5845. {
  5846. if ( *m_pFilePos!=m_iFilePos )
  5847. {
  5848. sphSeek ( m_iFile, m_iFilePos, SEEK_SET );
  5849. *m_pFilePos = m_iFilePos;
  5850. }
  5851. int n = Min ( m_iFileLeft, m_iSize - m_iLeft );
  5852. if ( n==0 )
  5853. {
  5854. m_iDone = 1;
  5855. m_bError = true; // unexpected (!) eof
  5856. return BIN_READ_EOF;
  5857. }
  5858. assert ( m_dBuffer );
  5859. memmove ( m_dBuffer, m_pCurrent, m_iLeft );
  5860. if ( sphReadThrottled ( m_iFile, m_dBuffer + m_iLeft, n, m_pThrottle )!=(size_t)n )
  5861. {
  5862. m_bError = true;
  5863. return BIN_READ_ERROR;
  5864. }
  5865. m_iLeft += n;
  5866. m_iFilePos += n;
  5867. m_iFileLeft -= n;
  5868. m_pCurrent = m_dBuffer;
  5869. *m_pFilePos += n;
  5870. }
  5871. assert ( m_iLeft>=iBytes );
  5872. m_iLeft -= iBytes;
  5873. memcpy ( pDest, m_pCurrent, iBytes );
  5874. m_pCurrent += iBytes;
  5875. return BIN_READ_OK;
  5876. }
  5877. SphWordID_t CSphBin::ReadVLB ()
  5878. {
  5879. SphWordID_t uValue = 0;
  5880. int iByte, iOffset = 0;
  5881. do
  5882. {
  5883. if ( ( iByte = ReadByte() )<0 )
  5884. return 0;
  5885. uValue += ( ( SphWordID_t ( iByte & 0x7f ) ) << iOffset );
  5886. iOffset += 7;
  5887. }
  5888. while ( iByte & 0x80 );
  5889. return uValue;
  5890. }
  5891. DWORD CSphBin::UnzipInt ()
  5892. {
  5893. register int b = 0;
  5894. register DWORD v = 0;
  5895. do
  5896. {
  5897. b = ReadByte();
  5898. if ( b<0 )
  5899. b = 0;
  5900. v = ( v<<7 ) + ( b & 0x7f );
  5901. } while ( b & 0x80 );
  5902. return v;
  5903. }
  5904. SphOffset_t CSphBin::UnzipOffset ()
  5905. {
  5906. register int b = 0;
  5907. register SphOffset_t v = 0;
  5908. do
  5909. {
  5910. b = ReadByte();
  5911. if ( b<0 )
  5912. b = 0;
  5913. v = ( v<<7 ) + ( b & 0x7f );
  5914. } while ( b & 0x80 );
  5915. return v;
  5916. }
  /// decode the next hit from the bin into pOut
  /// the stream is delta-coded and walked by a three-state machine (BIN_WORD, BIN_DOC, BIN_POS);
  /// a non-zero value is a delta for the current state, a zero value pops one level
  /// (pos to doc, doc to word, and a zero in word state ends the bin)
  /// on every new document, iRowitems values are read into pRowitems
  /// always returns 1; end of bin is signalled by pOut->m_iWordID==0
  /// NOTE(review): ReadVLB() also returns 0 on a read failure, so callers presumably
  /// have to check the error flag separately; confirm against the callers
  int CSphBin::ReadHit ( CSphAggregateHit * pOut, int iRowitems, CSphRowitem * pRowitems )
  {
      // expected EOB
      if ( m_iDone )
      {
          pOut->m_iWordID = 0;
          return 1;
      }

      CSphAggregateHit & tHit = m_tHit; // shortcut
      for ( ;; )
      {
          // SPH_MAX_WORD_LEN is now 42 only to keep ReadVLB() below
          // technically, we can just use different functions on different paths, if ever needed
          STATIC_ASSERT ( SPH_MAX_WORD_LEN*3<=127, KEYWORD_TOO_LONG );
          SphWordID_t uDelta = ReadVLB();

          if ( uDelta )
          {
              switch ( m_eState )
              {
                  case BIN_WORD:
                      if ( m_bWordDict )
                      {
                          // in keywords mode the value is the keyword length, followed by the keyword bytes
  #ifdef NDEBUG
                          // FIXME?! move this under PARANOID or something?
                          // or just introduce an assert() checked release build?
                          if ( uDelta>=sizeof(m_sKeyword) )
                              sphDie ( "INTERNAL ERROR: corrupted keyword length (len="UINT64_FMT", deltapos="UINT64_FMT")",
                                  (uint64_t)uDelta, (uint64_t)(m_iFilePos-m_iLeft) );
  #else
                          assert ( uDelta>0 && uDelta<sizeof(m_sKeyword)-1 );
  #endif

                          ReadBytes ( m_sKeyword, (int)uDelta );
                          m_sKeyword[uDelta] = '\0';
                          tHit.m_iWordID = sphCRC32 ( m_sKeyword ); // must be in sync with dict!

  #ifndef NDEBUG
                          // debug-only check that keywords arrive in ascending (id, text) order
                          assert ( ( m_iLastWordID<tHit.m_iWordID )
                              || ( m_iLastWordID==tHit.m_iWordID && strcmp ( (char*)m_sLastKeyword, (char*)m_sKeyword )<0 ) );
                          strncpy ( (char*)m_sLastKeyword, (char*)m_sKeyword, sizeof(m_sLastKeyword) );
  #endif

                      } else
                      {
                          // otherwise the value is a plain word id delta
                          tHit.m_iWordID += uDelta;
                      }
                      // new word; restart document and position tracking
                      tHit.m_iDocID = 0;
                      tHit.m_iWordPos = EMPTY_HIT;
                      tHit.m_dFieldMask.Unset();
                      m_eState = BIN_DOC;
                      break;

                  case BIN_DOC:
                      // doc id
                      m_eState = BIN_POS;
                      tHit.m_iDocID += uDelta;
                      tHit.m_iWordPos = EMPTY_HIT;
                      for ( int i=0; i<iRowitems; i++, pRowitems++ )
                          *pRowitems = (DWORD)ReadVLB(); // FIXME? check range?
                      break;

                  case BIN_POS:
                      if ( m_eMode==SPH_HITLESS_ALL )
                      {
                          // a field mask follows the value, and the next value belongs to a new document
                          tHit.m_dFieldMask.Assign32 ( (DWORD)ReadVLB() );
                          m_eState = BIN_DOC;

                      } else if ( m_eMode==SPH_HITLESS_SOME )
                      {
                          // the low bit tells whether a field mask follows; the rest is the delta
                          if ( uDelta & 1 )
                          {
                              tHit.m_dFieldMask.Assign32 ( (DWORD)ReadVLB() );
                              m_eState = BIN_DOC;
                          }
                          uDelta >>= 1;
                      }
                      tHit.m_iWordPos += (DWORD)uDelta;
                      *pOut = tHit;
                      return 1;

                  default:
                      sphDie ( "INTERNAL ERROR: unknown bin state (state=%d)", m_eState );
              }
          } else
          {
              // zero value; pop one level of the state machine
              switch ( m_eState )
              {
                  case BIN_POS: m_eState = BIN_DOC; break;
                  case BIN_DOC: m_eState = BIN_WORD; break;
                  case BIN_WORD: m_iDone = 1; pOut->m_iWordID = 0; return 1;
                  default: sphDie ( "INTERNAL ERROR: unknown bin state (state=%d)", m_eState );
              }
          }
      }
  }
  6005. bool CSphBin::IsEOF () const
  6006. {
  6007. return m_iDone!=0 || m_iFileLeft<=0;
  6008. }
  6009. bool CSphBin::IsDone () const
  6010. {
  6011. return m_iDone!=0 || ( m_iFileLeft<=0 && m_iLeft<=0 );
  6012. }
  6013. ESphBinRead CSphBin::Precache ()
  6014. {
  6015. if ( m_iFileLeft > m_iSize-m_iLeft )
  6016. {
  6017. m_bError = true;
  6018. return BIN_PRECACHE_ERROR;
  6019. }
  6020. if ( !m_iFileLeft )
  6021. return BIN_PRECACHE_OK;
  6022. if ( *m_pFilePos!=m_iFilePos )
  6023. {
  6024. sphSeek ( m_iFile, m_iFilePos, SEEK_SET );
  6025. *m_pFilePos = m_iFilePos;
  6026. }
  6027. assert ( m_dBuffer );
  6028. memmove ( m_dBuffer, m_pCurrent, m_iLeft );
  6029. if ( sphReadThrottled ( m_iFile, m_dBuffer+m_iLeft, m_iFileLeft, m_pThrottle )!=(size_t)m_iFileLeft )
  6030. {
  6031. m_bError = true;
  6032. return BIN_READ_ERROR;
  6033. }
  6034. m_iLeft += m_iFileLeft;
  6035. m_iFilePos += m_iFileLeft;
  6036. m_iFileLeft -= m_iFileLeft;
  6037. m_pCurrent = m_dBuffer;
  6038. *m_pFilePos += m_iFileLeft;
  6039. return BIN_PRECACHE_OK;
  6040. }
  6041. //////////////////////////////////////////////////////////////////////////
  6042. // INDEX SETTINGS
  6043. //////////////////////////////////////////////////////////////////////////
  /// ctor; sets the defaults: no docinfo, plain hit format, no HTML stripping,
  /// no hitless words, zero embedded limit, no bigram indexing, AOT filter off
  CSphIndexSettings::CSphIndexSettings ()
      : m_eDocinfo ( SPH_DOCINFO_NONE )
      , m_eHitFormat ( SPH_HIT_FORMAT_PLAIN )
      , m_bHtmlStrip ( false )
      , m_eHitless ( SPH_HITLESS_NONE )
      , m_iEmbeddedLimit ( 0 )
      , m_eBigramIndex ( SPH_BIGRAM_NONE )
      , m_bAotFilter ( false )
  {
  }
  6054. //////////////////////////////////////////////////////////////////////////
  6055. // GLOBAL MVA STORAGE ARENA
  6056. //////////////////////////////////////////////////////////////////////////
  /// visitor interface; CSphArena::ExamineTag() calls Reset() once,
  /// then TestData() for every allocation logged under the tag
  class tTester : public ISphNoncopyable
  {
  public:
      /// called once before any data is reported
      virtual void Reset() = 0;

      /// called per logged allocation; iData is the dword index of that allocation
      virtual void TestData ( int iData ) = 0;

      virtual ~tTester() {}
  };
  6064. /// shared-memory arena allocator
  6065. /// manages small tagged dword strings, upto 4096 bytes in size
  /// shared-memory arena allocator
  /// manages small tagged dword strings, up to 4096 bytes in size
  /// allocations are identified by dword indexes relative to the base pointer returned by ReInit()
  class CSphArena
  {
  public:
      CSphArena ();
      ~CSphArena ();

      /// (re)allocate the arena; returns the base data pointer, or NULL on failure (see GetError())
      DWORD * ReInit ( int uMaxBytes );
      const char * GetError () const { return m_sError.cstr(); }

      /// allocate iBytes under the given tag; returns a dword index, or -1 on failure
      int TaggedAlloc ( int iTag, int iBytes );
      /// free a single allocation made under the given tag
      void TaggedFreeIndex ( int iTag, int iIndex );
      /// free every allocation made under the given tag
      void TaggedFreeTag ( int iTag );

      /// report every allocation logged under the given tag to pTest
      void ExamineTag ( tTester* pTest, int iTag );

  protected:
      static const int MIN_BITS = 4; ///< log2 of the smallest chunk size
      static const int MAX_BITS = 12; ///< log2 of the biggest chunk size (and of the page size)
      static const int NUM_SIZES = MAX_BITS-MIN_BITS+2; ///< one for 0 (empty pages), and one for each size from min to max
      static const int PAGE_SIZE = 1<<MAX_BITS;
      static const int PAGE_ALLOCS = 1<<( MAX_BITS-MIN_BITS); ///< max chunks per page
      static const int PAGE_BITMAP = ( PAGE_ALLOCS+8*sizeof(DWORD)-1 )/( 8*sizeof(DWORD) ); ///< usage bitmap size, in dwords

      static const int MAX_TAGS = 1024;
      static const int MAX_LOGENTRIES = 29; ///< alloc indexes per log entry

      /// page descriptor
      struct PageDesc_t
      {
          int m_iSizeBits; ///< alloc size
          int m_iPrev; ///< prev free page of this size
          int m_iNext; ///< next free page of this size
          int m_iUsed; ///< usage count
          DWORD m_uBitmap[PAGE_BITMAP]; ///< usage bitmap
      };

      /// tag descriptor
      struct TagDesc_t
      {
          int m_iTag; ///< tag value
          int m_iAllocs; ///< active allocs
          int m_iLogHead; ///< pointer to head allocs log entry
      };

      /// allocs log entry
      struct AllocsLogEntry_t
      {
          int m_iUsed; ///< entries in use
          int m_iNext; ///< next log entry in the chain, -1 at the tail
          int m_dEntries[MAX_LOGENTRIES]; ///< dword indexes of the logged allocs
      };
      STATIC_SIZE_ASSERT ( AllocsLogEntry_t, 124 );

  protected:
      DWORD * Init ( int uMaxBytes );
      int RawAlloc ( int iBytes );
      void RawFree ( int iIndex );
      void RemoveTag ( TagDesc_t * pTag );

  protected:
      CSphProcessSharedMutex m_tProcMutex;
      CSphMutex m_tThdMutex;

      int m_iPages; ///< max pages count
      CSphSharedBuffer<DWORD> m_pArena; ///< arena that stores everything (all other pointers point here)

      PageDesc_t * m_pPages; ///< page descriptors
      int * m_pFreelistHeads; ///< free-list heads
      int * m_pTagCount; ///< number of tag descriptors in use
      TagDesc_t * m_pTags; ///< tag descriptors

      DWORD * m_pBasePtr; ///< base data storage pointer
      CSphString m_sError;

  #if ARENADEBUG
  protected:
      int * m_pTotalAllocs;
      int * m_pTotalBytes;

  public:
      void CheckFreelists ();
  #else
      inline void CheckFreelists () {}
  #endif // ARENADEBUG
  };
  6136. class tDocCollector : public tTester
  6137. {
  6138. CSphVector<SphDocID_t> * m_dCollection;
  6139. public:
  6140. explicit tDocCollector ( CSphVector<SphDocID_t> & dCollection )
  6141. : m_dCollection ( &dCollection )
  6142. {}
  6143. virtual void Reset()
  6144. {
  6145. m_dCollection->Reset();
  6146. }
  6147. virtual void TestData ( int iData )
  6148. {
  6149. if ( !g_pMvaArena )
  6150. return;
  6151. m_dCollection->Add ( *(SphDocID_t*)(g_pMvaArena + iData) );
  6152. }
  6153. };
  6154. //////////////////////////////////////////////////////////////////////////
  6155. CSphArena::CSphArena ()
  6156. : m_iPages ( 0 )
  6157. {
  6158. m_tThdMutex.Init();
  6159. }
  /// dtor; clears the global arena pointer and shuts the thread mutex down
  CSphArena::~CSphArena ()
  {
      // notify callers that arena no longer exists
      g_pMvaArena = NULL;
      m_tThdMutex.Done();
  }
  6166. DWORD * CSphArena::ReInit ( int uMaxBytes )
  6167. {
  6168. if ( m_iPages!=0 )
  6169. {
  6170. m_pArena.Reset();
  6171. m_iPages = 0;
  6172. }
  6173. return Init ( uMaxBytes );
  6174. }
  /// allocate the shared buffer and lay the internal structures out in it
  /// buffer layout, in order: page descriptors, free-list heads, tag count, tag descriptors,
  /// (debug counters under ARENADEBUG), then the data pages themselves
  /// uMaxBytes is rounded up to a whole number of pages
  /// returns the base data pointer, or NULL on failure (with m_sError set and m_iPages zeroed)
  DWORD * CSphArena::Init ( int uMaxBytes )
  {
      m_iPages = ( uMaxBytes+PAGE_SIZE-1 ) / PAGE_SIZE;

      int iData = m_iPages*PAGE_SIZE; // data size, bytes
      int iMyTaglist = sizeof(int) + MAX_TAGS*sizeof(TagDesc_t); // int length, TagDesc_t[] tags; NOLINT
      int iMy = m_iPages*sizeof(PageDesc_t) + NUM_SIZES*sizeof(int) + iMyTaglist; // my internal structures size, bytes; NOLINT
  #if ARENADEBUG
      iMy += 2*sizeof(int); // debugging counters; NOLINT
  #endif

      assert ( iData%sizeof(DWORD)==0 );
      assert ( iMy%sizeof(DWORD)==0 );

      CSphString sError, sWarning;
      if ( m_tProcMutex.GetError() || !m_pArena.Alloc ( (iData+iMy)/sizeof(DWORD), sError, sWarning ) )
      {
          m_iPages = 0;
          if ( m_tProcMutex.GetError() )
              m_sError = m_tProcMutex.GetError();
          else
              m_sError.SetSprintf ( "alloc, error='%s', warning='%s'", sError.cstr(), sWarning.cstr() );
          return NULL;
      }

      // setup internal pointers
      // pCur walks the buffer in dwords; every structure is carved off in layout order
      DWORD * pCur = m_pArena.GetWritePtr();

      m_pPages = (PageDesc_t*) pCur;
      pCur += sizeof(PageDesc_t)*m_iPages/sizeof(DWORD);

      m_pFreelistHeads = (int*) pCur;
      pCur += NUM_SIZES; // one for each size, and one extra for zero

      m_pTagCount = (int*) pCur++;
      m_pTags = (TagDesc_t*) pCur;
      pCur += sizeof(TagDesc_t)*MAX_TAGS/sizeof(DWORD);

  #if ARENADEBUG
      m_pTotalAllocs = (int*) pCur++;
      m_pTotalBytes = (int*) pCur++;
      *m_pTotalAllocs = 0;
      *m_pTotalBytes = 0;
  #endif

      // data pages start right after the internal structures
      m_pBasePtr = m_pArena.GetWritePtr() + iMy/sizeof(DWORD);
      assert ( m_pBasePtr==pCur );

      // setup initial state
      // all the pages are empty, and chained into the slot 0 free-list in index order
      for ( int i=0; i<m_iPages; i++ )
      {
          m_pPages[i].m_iSizeBits = 0; // fully empty
          m_pPages[i].m_iPrev = ( i>0 ) ? i-1 : -1;
          m_pPages[i].m_iNext = ( i<m_iPages-1 ) ? i+1 : -1;
      }

      m_pFreelistHeads[0] = 0;
      for ( int i=1; i<NUM_SIZES; i++ )
          m_pFreelistHeads[i] = -1;

      *m_pTagCount = 0;

      return m_pBasePtr;
  }
  6226. int CSphArena::RawAlloc ( int iBytes )
  6227. {
  6228. CheckFreelists ();
  6229. if ( iBytes<=0 || iBytes>( ( 1 << MAX_BITS ) - (int)sizeof(int) ) )
  6230. return -1;
  6231. int iSizeBits = sphLog2 ( iBytes+2*sizeof(int)-1 ); // always reserve sizeof(int) for the tag and AllocsLogEntry_t backtrack; NOLINT
  6232. iSizeBits = Max ( iSizeBits, MIN_BITS );
  6233. assert ( iSizeBits>=MIN_BITS && iSizeBits<=MAX_BITS );
  6234. int iSizeSlot = iSizeBits-MIN_BITS+1;
  6235. assert ( iSizeSlot>=1 && iSizeSlot<NUM_SIZES );
  6236. // get semi-free page for this size
  6237. PageDesc_t * pPage = NULL;
  6238. if ( m_pFreelistHeads[iSizeSlot]>=0 )
  6239. {
  6240. // got something in the free-list
  6241. pPage = m_pPages + m_pFreelistHeads[iSizeSlot];
  6242. } else
  6243. {
  6244. // nothing in free-list, alloc next empty one
  6245. if ( m_pFreelistHeads[0]<0 )
  6246. return -1; // out of memory
  6247. // update the page
  6248. pPage = m_pPages + m_pFreelistHeads[0];
  6249. assert ( pPage->m_iPrev==-1 );
  6250. m_pFreelistHeads[iSizeSlot] = m_pFreelistHeads[0];
  6251. m_pFreelistHeads[0] = pPage->m_iNext;
  6252. if ( pPage->m_iNext>=0 )
  6253. m_pPages[pPage->m_iNext].m_iPrev = -1;
  6254. pPage->m_iSizeBits = iSizeBits;
  6255. pPage->m_iUsed = 0;
  6256. pPage->m_iNext = -1;
  6257. CheckFreelists ();
  6258. // setup bitmap
  6259. int iUsedBits = ( 1<<(MAX_BITS-iSizeBits) ); // max-used-bits = page-size/alloc-size = ( 1<<page-bitsize )/( 1<<alloc-bitsize )
  6260. assert ( iUsedBits>0 && iUsedBits<=(PAGE_BITMAP<<5) );
  6261. for ( int i=0; i<PAGE_BITMAP; i++ )
  6262. pPage->m_uBitmap[i] = ( ( i<<5 )>=iUsedBits ) ? 0xffffffffUL : 0;
  6263. if ( iUsedBits<32 )
  6264. pPage->m_uBitmap[0] = ( 0xffffffffUL<<iUsedBits );
  6265. }
  6266. // get free alloc slot and use it
  6267. assert ( pPage );
  6268. assert ( pPage->m_iSizeBits==iSizeBits );
  6269. for ( int i=0; i<PAGE_BITMAP; i++ ) // FIXME! optimize, can scan less
  6270. {
  6271. if ( pPage->m_uBitmap[i]==0xffffffffUL )
  6272. continue;
  6273. int iFree = FindBit ( pPage->m_uBitmap[i] );
  6274. pPage->m_uBitmap[i] |= ( 1<<iFree );
  6275. pPage->m_iUsed++;
  6276. if ( pPage->m_iUsed==( PAGE_SIZE >> pPage->m_iSizeBits ) )
  6277. {
  6278. // this page is full now, unchain from the free-list
  6279. assert ( m_pFreelistHeads[iSizeSlot]==pPage-m_pPages );
  6280. m_pFreelistHeads[iSizeSlot] = pPage->m_iNext;
  6281. if ( pPage->m_iNext>=0 )
  6282. {
  6283. assert ( m_pPages[pPage->m_iNext].m_iPrev==pPage-m_pPages );
  6284. m_pPages[pPage->m_iNext].m_iPrev = -1;
  6285. }
  6286. pPage->m_iNext = -1;
  6287. }
  6288. #if ARENADEBUG
  6289. (*m_pTotalAllocs)++;
  6290. (*m_pTotalBytes) += ( 1<<iSizeBits );
  6291. #endif
  6292. CheckFreelists ();
  6293. int iOffset = ( pPage-m_pPages )*PAGE_SIZE + ( i*32+iFree )*( 1<<iSizeBits ); // raw internal byte offset (FIXME! optimize with shifts?)
  6294. int iIndex = 2 + ( iOffset/sizeof(DWORD) ); // dword index with tag and backtrack fixup
  6295. m_pBasePtr[iIndex-1] = DWORD(-1); // untagged by default
  6296. m_pBasePtr[iIndex-2] = DWORD(-1); // backtrack nothere
  6297. return iIndex;
  6298. }
  6299. assert ( 0 && "internal error, no free slots in free page" );
  6300. return -1;
  6301. }
  6302. void CSphArena::RawFree ( int iIndex )
  6303. {
  6304. CheckFreelists ();
  6305. int iOffset = (iIndex-2)*sizeof(DWORD); // remove tag fixup, and go to raw internal byte offset
  6306. int iPage = iOffset / PAGE_SIZE;
  6307. if ( iPage<0 || iPage>m_iPages )
  6308. {
  6309. assert ( 0 && "internal error, freed index out of arena" );
  6310. return;
  6311. }
  6312. PageDesc_t * pPage = m_pPages + iPage;
  6313. int iBit = ( iOffset % PAGE_SIZE ) >> pPage->m_iSizeBits;
  6314. assert ( ( iOffset % PAGE_SIZE )==( iBit << pPage->m_iSizeBits ) && "internal error, freed offset is unaligned" );
  6315. if (!( pPage->m_uBitmap[iBit>>5] & ( 1UL<<(iBit & 31) ) ))
  6316. {
  6317. assert ( 0 && "internal error, freed index already freed" );
  6318. return;
  6319. }
  6320. pPage->m_uBitmap[iBit>>5] &= ~( 1UL << ( iBit & 31 ) );
  6321. pPage->m_iUsed--;
  6322. #if ARENADEBUG
  6323. (*m_pTotalAllocs)--;
  6324. (*m_pTotalBytes) -= ( 1<<pPage->m_iSizeBits );
  6325. #endif
  6326. CheckFreelists ();
  6327. int iSizeSlot = pPage->m_iSizeBits-MIN_BITS+1;
  6328. if ( pPage->m_iUsed==( PAGE_SIZE >> pPage->m_iSizeBits )-1 )
  6329. {
  6330. // this page was full, but it's semi-free now
  6331. // chain to free-list
  6332. assert ( pPage->m_iPrev==-1 ); // full pages must not be in any list
  6333. assert ( pPage->m_iNext==-1 );
  6334. pPage->m_iNext = m_pFreelistHeads[iSizeSlot];
  6335. if ( pPage->m_iNext>=0 )
  6336. {
  6337. assert ( m_pPages[pPage->m_iNext].m_iPrev==-1 );
  6338. assert ( m_pPages[pPage->m_iNext].m_iSizeBits==pPage->m_iSizeBits );
  6339. m_pPages[pPage->m_iNext].m_iPrev = iPage;
  6340. }
  6341. m_pFreelistHeads[iSizeSlot] = iPage;
  6342. }
  6343. if ( pPage->m_iUsed==0 )
  6344. {
  6345. // this page is empty now
  6346. // unchain from free-list
  6347. if ( pPage->m_iPrev>=0 )
  6348. {
  6349. // non-head page
  6350. assert ( m_pPages[pPage->m_iPrev].m_iNext==iPage );
  6351. m_pPages[pPage->m_iPrev].m_iNext = pPage->m_iNext;
  6352. if ( pPage->m_iNext>=0 )
  6353. {
  6354. assert ( m_pPages[pPage->m_iNext].m_iPrev==iPage );
  6355. m_pPages[pPage->m_iNext].m_iPrev = pPage->m_iPrev;
  6356. }
  6357. } else
  6358. {
  6359. // head page
  6360. assert ( m_pFreelistHeads[iSizeSlot]==iPage );
  6361. assert ( pPage->m_iPrev==-1 );
  6362. if ( pPage->m_iNext>=0 )
  6363. {
  6364. assert ( m_pPages[pPage->m_iNext].m_iPrev==iPage );
  6365. m_pPages[pPage->m_iNext].m_iPrev = -1;
  6366. }
  6367. m_pFreelistHeads[iSizeSlot] = pPage->m_iNext;
  6368. }
  6369. pPage->m_iSizeBits = 0;
  6370. pPage->m_iPrev = -1;
  6371. pPage->m_iNext = m_pFreelistHeads[0];
  6372. if ( pPage->m_iNext>=0 )
  6373. {
  6374. assert ( m_pPages[pPage->m_iNext].m_iPrev==-1 );
  6375. assert ( m_pPages[pPage->m_iNext].m_iSizeBits==0 );
  6376. m_pPages[pPage->m_iNext].m_iPrev = iPage;
  6377. }
  6378. m_pFreelistHeads[0] = iPage;
  6379. }
  6380. CheckFreelists ();
  6381. }
  6382. int CSphArena::TaggedAlloc ( int iTag, int iBytes )
  6383. {
  6384. if ( !m_iPages )
  6385. return -1; // uninitialized
  6386. assert ( iTag>=0 );
  6387. CSphScopedLock<CSphMutex> tThdLock ( m_tThdMutex );
  6388. CSphScopedLock<CSphProcessSharedMutex> tProcLock ( m_tProcMutex );
  6389. // find that tag first
  6390. TagDesc_t * pTag = sphBinarySearch ( m_pTags, m_pTags+(*m_pTagCount)-1, bind ( &TagDesc_t::m_iTag ), iTag );
  6391. if ( !pTag )
  6392. {
  6393. if ( *m_pTagCount==MAX_TAGS )
  6394. return -1; // out of tags
  6395. int iLogHead = RawAlloc ( sizeof(AllocsLogEntry_t) );
  6396. if ( iLogHead<0 )
  6397. return -1; // out of memory
  6398. assert ( iLogHead>=2 );
  6399. AllocsLogEntry_t * pLog = (AllocsLogEntry_t*) ( m_pBasePtr + iLogHead );
  6400. pLog->m_iUsed = 0;
  6401. pLog->m_iNext = -1;
  6402. // add new tag
  6403. pTag = m_pTags + (*m_pTagCount)++;
  6404. pTag->m_iTag = iTag;
  6405. pTag->m_iAllocs = 0;
  6406. pTag->m_iLogHead = iLogHead;
  6407. // re-sort
  6408. // OPTIMIZE! full-blown sort is overkill here
  6409. sphSort ( m_pTags, *m_pTagCount, sphMemberLess ( &TagDesc_t::m_iTag ) );
  6410. // we must be able to find it now
  6411. pTag = sphBinarySearch ( m_pTags, m_pTags+(*m_pTagCount)-1, bind ( &TagDesc_t::m_iTag ), iTag );
  6412. assert ( pTag && "internal error, fresh tag not found in TaggedAlloc()" );
  6413. if ( !pTag )
  6414. return -1; // internal error
  6415. }
  6416. // grow the log if needed
  6417. int iLogEntry = pTag->m_iLogHead;
  6418. AllocsLogEntry_t * pLog = (AllocsLogEntry_t*) ( m_pBasePtr + pTag->m_iLogHead );
  6419. if ( pLog->m_iUsed==MAX_LOGENTRIES )
  6420. {
  6421. int iNewEntry = RawAlloc ( sizeof(AllocsLogEntry_t) );
  6422. if ( iNewEntry<0 )
  6423. return -1; // out of memory
  6424. assert ( iNewEntry>=2 );
  6425. iLogEntry = iNewEntry;
  6426. AllocsLogEntry_t * pNew = (AllocsLogEntry_t*) ( m_pBasePtr + iNewEntry );
  6427. pNew->m_iUsed = 0;
  6428. pNew->m_iNext = pTag->m_iLogHead;
  6429. pTag->m_iLogHead = iNewEntry;
  6430. pLog = pNew;
  6431. }
  6432. // do the alloc itself
  6433. int iIndex = RawAlloc ( iBytes );
  6434. if ( iIndex<0 )
  6435. return -1; // out of memory
  6436. assert ( iIndex>=2 );
  6437. // tag it
  6438. m_pBasePtr[iIndex-1] = iTag;
  6439. // set data->AllocsLogEntry_t backtrack
  6440. m_pBasePtr[iIndex-2] = iLogEntry;
  6441. // log it
  6442. assert ( pLog->m_iUsed<MAX_LOGENTRIES );
  6443. pLog->m_dEntries [ pLog->m_iUsed++ ] = iIndex;
  6444. pTag->m_iAllocs++;
  6445. // and we're done
  6446. return iIndex;
  6447. }
  /// free a single allocation made under the given tag
  /// the chunk is untagged and freed, then removed from the log entry it backtracks to;
  /// a log entry that becomes empty is unchained and freed,
  /// and a tag that has no allocs left is removed altogether
  /// does nothing when the arena is not initialized, or the tag is unknown
  void CSphArena::TaggedFreeIndex ( int iTag, int iIndex )
  {
      if ( !m_iPages )
          return; // uninitialized

      assert ( iTag>=0 );
      CSphScopedLock<CSphProcessSharedMutex> tProcLock ( m_tProcMutex );
      CSphScopedLock<CSphMutex> tThdLock ( m_tThdMutex );

      // find that tag
      TagDesc_t * pTag = sphBinarySearch ( m_pTags, m_pTags+(*m_pTagCount)-1, bind ( &TagDesc_t::m_iTag ), iTag );
      assert ( pTag && "internal error, unknown tag in TaggedFreeIndex()" );
      assert ( m_pBasePtr[iIndex-1]==DWORD(iTag) && "internal error, tag mismatch in TaggedFreeIndex()" );

      // defence against internal errors
      if ( !pTag )
          return;

      // untag it
      m_pBasePtr[iIndex-1] = DWORD(-1);

      // free it
      RawFree ( iIndex );

      // update AllocsLogEntry_t
      // (the backtrack dword is read after the chunk was freed; RawFree() does not write to chunk data)
      int iLogEntry = m_pBasePtr[iIndex-2];
      assert ( iLogEntry>=2 );
      m_pBasePtr[iIndex-2] = DWORD(-1);
      AllocsLogEntry_t * pLogEntry = (AllocsLogEntry_t*) ( m_pBasePtr + iLogEntry );
      for ( int i = 0; i<MAX_LOGENTRIES; i++ )
      {
          if ( pLogEntry->m_dEntries[i]!=iIndex )
              continue;

          pLogEntry->m_dEntries[i] = pLogEntry->m_dEntries[pLogEntry->m_iUsed-1]; // RemoveFast
          pLogEntry->m_iUsed--;
          break;
      }
      assert ( pLogEntry->m_iUsed>=0 );

      // remove from tag entries list
      if ( pLogEntry->m_iUsed==0 )
      {
          if ( pTag->m_iLogHead==iLogEntry )
          {
              pTag->m_iLogHead = pLogEntry->m_iNext;
          } else
          {
              // walk the chain to find the predecessor of the emptied entry, and link over it
              int iLog = pTag->m_iLogHead;
              while ( iLog>=0 )
              {
                  AllocsLogEntry_t * pLog = (AllocsLogEntry_t*) ( m_pBasePtr + iLog );
                  if ( iLogEntry!=pLog->m_iNext )
                  {
                      iLog = pLog->m_iNext;
                      continue;
                  } else
                  {
                      pLog->m_iNext = pLogEntry->m_iNext;
                      break;
                  }
              }
          }
          RawFree ( iLogEntry );
      }

      // update the tag descriptor
      pTag->m_iAllocs--;
      assert ( pTag->m_iAllocs>=0 );

      // remove the descriptor if its empty now
      if ( pTag->m_iAllocs==0 )
          RemoveTag ( pTag );
  }
  6512. void CSphArena::TaggedFreeTag ( int iTag )
  6513. {
  6514. if ( !m_iPages )
  6515. return; // uninitialized
  6516. assert ( iTag>=0 );
  6517. CSphScopedLock<CSphProcessSharedMutex> tProcLock ( m_tProcMutex );
  6518. CSphScopedLock<CSphMutex> tThdLock ( m_tThdMutex );
  6519. // find that tag
  6520. TagDesc_t * pTag = sphBinarySearch ( m_pTags, m_pTags+(*m_pTagCount)-1, bind ( &TagDesc_t::m_iTag ), iTag );
  6521. if ( !pTag )
  6522. return;
  6523. // walk the log and free it
  6524. int iLog = pTag->m_iLogHead;
  6525. while ( iLog>=0 )
  6526. {
  6527. AllocsLogEntry_t * pLog = (AllocsLogEntry_t*) ( m_pBasePtr + iLog );
  6528. iLog = pLog->m_iNext;
  6529. // free each alloc if tag still matches
  6530. for ( int i=0; i<pLog->m_iUsed; i++ )
  6531. {
  6532. int iIndex = pLog->m_dEntries[i];
  6533. if ( m_pBasePtr[iIndex-1]==DWORD(iTag) )
  6534. {
  6535. m_pBasePtr[iIndex-1] = DWORD(-1); // avoid double free
  6536. RawFree ( iIndex );
  6537. pTag->m_iAllocs--;
  6538. }
  6539. }
  6540. }
  6541. // check for mismatches
  6542. assert ( pTag->m_iAllocs==0 );
  6543. // remove the descriptor
  6544. RemoveTag ( pTag );
  6545. }
  6546. void CSphArena::ExamineTag ( tTester* pTest, int iTag )
  6547. {
  6548. if ( !pTest )
  6549. return;
  6550. pTest->Reset();
  6551. if ( !m_iPages )
  6552. return; // uninitialized
  6553. assert ( iTag>=0 );
  6554. CSphScopedLock<CSphProcessSharedMutex> tProcLock ( m_tProcMutex );
  6555. CSphScopedLock<CSphMutex> tThdLock ( m_tThdMutex );
  6556. // find that tag
  6557. TagDesc_t * pTag = sphBinarySearch ( m_pTags, m_pTags+(*m_pTagCount)-1, bind ( &TagDesc_t::m_iTag ), iTag );
  6558. if ( !pTag )
  6559. return;
  6560. // walk the log and tick it's chunks
  6561. int iLog = pTag->m_iLogHead;
  6562. while ( iLog>=0 )
  6563. {
  6564. AllocsLogEntry_t * pLog = (AllocsLogEntry_t*) ( m_pBasePtr + iLog );
  6565. iLog = pLog->m_iNext;
  6566. // tick each alloc
  6567. for ( int i=0; i<pLog->m_iUsed; i++ )
  6568. pTest->TestData ( pLog->m_dEntries[i] );
  6569. }
  6570. }
  6571. void CSphArena::RemoveTag ( TagDesc_t * pTag )
  6572. {
  6573. assert ( pTag );
  6574. assert ( pTag->m_iAllocs==0 );
  6575. // dealloc log chain
  6576. int iLog = pTag->m_iLogHead;
  6577. while ( iLog>=0 )
  6578. {
  6579. AllocsLogEntry_t * pLog = (AllocsLogEntry_t*) ( m_pBasePtr + iLog );
  6580. int iNext = pLog->m_iNext;
  6581. RawFree ( iLog );
  6582. iLog = iNext;
  6583. }
  6584. // remove tag from the list
  6585. int iTail = m_pTags + (*m_pTagCount) - pTag - 1;
  6586. memmove ( pTag, pTag+1, iTail*sizeof(TagDesc_t) );
  6587. (*m_pTagCount)--;
  6588. }
  6589. #if ARENADEBUG
  6590. void CSphArena::CheckFreelists ()
  6591. {
  6592. assert ( m_pFreelistHeads[0]==-1 || m_pPages[m_pFreelistHeads[0]].m_iSizeBits==0 );
  6593. for ( int iSizeSlot=1; iSizeSlot<NUM_SIZES; iSizeSlot++ )
  6594. assert ( m_pFreelistHeads[iSizeSlot]==-1 || m_pPages[m_pFreelistHeads[iSizeSlot]].m_iSizeBits-MIN_BITS+1==iSizeSlot );
  6595. }
  6596. #endif // ARENADEBUG
  6597. //////////////////////////////////////////////////////////////////////////
  6598. static CSphArena g_MvaArena; // global mega-arena
  6599. const char * sphArenaInit ( int iMaxBytes )
  6600. {
  6601. if ( !g_pMvaArena )
  6602. g_pMvaArena = g_MvaArena.ReInit ( iMaxBytes );
  6603. const char * sError = g_MvaArena.GetError();
  6604. return sError;
  6605. }
  6606. /////////////////////////////////////////////////////////////////////////////
  6607. // INDEX
  6608. /////////////////////////////////////////////////////////////////////////////
  /// ctor; stores the index name and file name, and sets every other member to its default
  /// (no field filter, tokenizers, or dictionary attached; wordlist preload enabled)
  /// note that the schema is constructed from sFilename
  CSphIndex::CSphIndex ( const char * sIndexName, const char * sFilename )
      : m_iTID ( 0 )
      , m_bExpandKeywords ( false )
      , m_iExpansionLimit ( 0 )
      , m_tSchema ( sFilename )
      , m_bInplaceSettings ( false )
      , m_iHitGap ( 0 )
      , m_iDocinfoGap ( 0 )
      , m_fRelocFactor ( 0.0f )
      , m_fWriteFactor ( 0.0f )
      , m_bKeepFilesOpen ( false )
      , m_bPreloadWordlist ( true )
      , m_bStripperInited ( true )
      , m_bEnableStar ( false )
      , m_bId32to64 ( false )
      , m_pFieldFilter ( NULL )
      , m_pTokenizer ( NULL )
      , m_pQueryTokenizer ( NULL )
      , m_pDict ( NULL )
      , m_iMaxCachedDocs ( 0 )
      , m_iMaxCachedHits ( 0 )
      , m_sIndexName ( sIndexName )
      , m_sFilename ( sFilename )
  {
  }
  6634. CSphIndex::~CSphIndex ()
  6635. {
  6636. SafeDelete ( m_pFieldFilter );
  6637. SafeDelete ( m_pTokenizer );
  6638. SafeDelete ( m_pDict );
  6639. }
  6640. void CSphIndex::SetInplaceSettings ( int iHitGap, int iDocinfoGap, float fRelocFactor, float fWriteFactor )
  6641. {
  6642. m_iHitGap = iHitGap;
  6643. m_iDocinfoGap = iDocinfoGap;
  6644. m_fRelocFactor = fRelocFactor;
  6645. m_fWriteFactor = fWriteFactor;
  6646. m_bInplaceSettings = true;
  6647. }
  6648. void CSphIndex::SetFieldFilter ( ISphFieldFilter * pFieldFilter )
  6649. {
  6650. if ( m_pFieldFilter!=pFieldFilter )
  6651. SafeDelete ( m_pFieldFilter );
  6652. m_pFieldFilter = pFieldFilter;
  6653. }
  6654. void CSphIndex::SetTokenizer ( ISphTokenizer * pTokenizer )
  6655. {
  6656. if ( m_pTokenizer!=pTokenizer )
  6657. SafeDelete ( m_pTokenizer );
  6658. m_pTokenizer = pTokenizer;
  6659. }
  /// (re)build the query-time tokenizer as a SPH_CLONE_QUERY clone of the indexing tokenizer,
  /// extended with the query syntax characters
  /// NOTE(review): m_pTokenizer is dereferenced unchecked, so presumably SetTokenizer()
  /// must have been called before; confirm against the callers
  void CSphIndex::SetupQueryTokenizer()
  {
      // create and setup a master copy of query time tokenizer
      // that we can then use to create lightweight clones
      SafeDelete ( m_pQueryTokenizer );
      m_pQueryTokenizer = m_pTokenizer->Clone ( SPH_CLONE_QUERY );

      // star is only added when IsStarDict() says so
      if ( IsStarDict() )
          m_pQueryTokenizer->AddPlainChar ( '*' );

      // equals sign is only added when exact words are indexed
      if ( m_tSettings.m_bIndexExactWords )
          m_pQueryTokenizer->AddPlainChar ( '=' );

      m_pQueryTokenizer->AddSpecials ( "()|-!@~\"/^$<" );
      m_pQueryTokenizer->AddPlainChar ( '?' );
      m_pQueryTokenizer->AddPlainChar ( '%' );
  }
  6674. ISphTokenizer * CSphIndex::LeakTokenizer ()
  6675. {
  6676. ISphTokenizer * pTokenizer = m_pTokenizer;
  6677. m_pTokenizer = NULL;
  6678. return pTokenizer;
  6679. }
  6680. void CSphIndex::SetDictionary ( CSphDict * pDict )
  6681. {
  6682. if ( m_pDict!=pDict )
  6683. SafeDelete ( m_pDict );
  6684. m_pDict = pDict;
  6685. }
  6686. CSphDict * CSphIndex::LeakDictionary ()
  6687. {
  6688. CSphDict * pDict = m_pDict;
  6689. m_pDict = NULL;
  6690. return pDict;
  6691. }
  6692. void CSphIndex::Setup ( const CSphIndexSettings & tSettings )
  6693. {
  6694. m_bStripperInited = true;
  6695. m_tSettings = tSettings;
  6696. }
  6697. void CSphIndex::SetCacheSize ( int iMaxCachedDocs, int iMaxCachedHits )
  6698. {
  6699. m_iMaxCachedDocs = iMaxCachedDocs;
  6700. m_iMaxCachedHits = iMaxCachedHits;
  6701. }
  6702. float CSphIndex::GetGlobalIDF ( const CSphString & sWord, int iDocsLocal, int iQwords, bool bPlainIDF ) const
  6703. {
  6704. g_tGlobalIDFLock.Lock ();
  6705. CSphGlobalIDF ** ppGlobalIDF = g_hGlobalIDFs ( m_sGlobalIDFPath );
  6706. float fIDF = ppGlobalIDF && *ppGlobalIDF ? ( *ppGlobalIDF )->GetIDF ( sWord, iDocsLocal, iQwords, bPlainIDF ) : 0.0f;
  6707. g_tGlobalIDFLock.Unlock ();
  6708. return fIDF;
  6709. }
  6710. /////////////////////////////////////////////////////////////////////////////
  /// factory; creates a new CSphIndex_VLN with the given index name and file name
  /// the index is allocated with new, and returned as a plain pointer
  CSphIndex * sphCreateIndexPhrase ( const char* szIndexName, const char * sFilename )
  {
      return new CSphIndex_VLN ( szIndexName, sFilename );
  }
  /// ctor; creates an empty, not yet preallocated index of the current format version
  /// with no lock file descriptor, no arena tag, and zeroed field lengths
  CSphIndex_VLN::CSphIndex_VLN ( const char* sIndexName, const char * sFilename )
      : CSphIndex ( sIndexName, sFilename )
      , m_iLockFD ( -1 )
      , m_dMinRow ( 0 )
      , m_dFieldLens ( SPH_MAX_FIELDS )
      , m_bKeepAttrs ( false )
  {
      m_sFilename = sFilename; // the base class ctor receives the very same value

      m_iDocinfo = 0;
      m_iDocinfoIndex = 0;
      m_pDocinfoIndex = NULL;

      m_bPreallocated = false;
      m_uVersion = INDEX_FORMAT_VERSION;

      m_iKillListSize = 0;
      m_uMinMaxIndex = 0;

      m_iIndexTag = -1; // negative tag; the dtor only frees arena allocs for tags >= 0
      m_bIsEmpty = true;

      m_pPreread = NULL;
      m_pAttrsStatus = NULL;

      m_iMinDocid = 0;

      ARRAY_FOREACH ( i, m_dFieldLens )
          m_dFieldLens[i] = 0;
  }
  /// dtor; frees this index's allocations in the global MVA arena (if it has a tag,
  /// and the arena still exists), and unlocks the index
  /// on non-Windows builds, both steps only happen when g_bHeadProcess is set
  CSphIndex_VLN::~CSphIndex_VLN ()
  {
      // the preprocessor picks the condition; the statement guarded by it is the TaggedFreeTag() call
  #if USE_WINDOWS
      if ( m_iIndexTag>=0 && g_pMvaArena )
  #else
      if ( m_iIndexTag>=0 && g_bHeadProcess && g_pMvaArena )
  #endif
          g_MvaArena.TaggedFreeTag ( m_iIndexTag );

      // Unlock() is unconditional on Windows, and guarded by g_bHeadProcess elsewhere
  #if !USE_WINDOWS
      if ( g_bHeadProcess )
  #endif
          Unlock();
  }
  6751. /////////////////////////////////////////////////////////////////////////////
  6752. int CSphIndex_VLN::UpdateAttributes ( const CSphAttrUpdate & tUpd, int iIndex, CSphString & sError )
  6753. {
  6754. // check if we can
  6755. if ( m_tSettings.m_eDocinfo!=SPH_DOCINFO_EXTERN )
  6756. {
  6757. sError.SetSprintf ( "docinfo=extern required for updates" );
  6758. return -1;
  6759. }
  6760. assert ( tUpd.m_dDocids.GetLength()==0 || tUpd.m_dRows.GetLength()==0 );
  6761. DWORD uRows = Max ( tUpd.m_dDocids.GetLength(), tUpd.m_dRows.GetLength() );
  6762. bool bRaw = tUpd.m_dDocids.GetLength()==0;
  6763. // check if we have to
  6764. assert ( (int)uRows==tUpd.m_dRowOffset.GetLength() );
  6765. if ( !m_iDocinfo || !uRows )
  6766. return 0;
  6767. if ( g_pBinlog )
  6768. g_pBinlog->BinlogUpdateAttributes ( &m_iTID, m_sIndexName.cstr(), tUpd );
  6769. // remap update schema to index schema
  6770. CSphVector<CSphAttrLocator> dLocators;
  6771. CSphVector<int> dIndexes;
  6772. CSphVector<bool> dFloats;
  6773. CSphVector<bool> dBigints;
  6774. dLocators.Reserve ( tUpd.m_dAttrs.GetLength() );
  6775. dIndexes.Reserve ( tUpd.m_dAttrs.GetLength() );
  6776. dFloats.Reserve ( tUpd.m_dAttrs.GetLength() );
  6777. dBigints.Reserve ( tUpd.m_dAttrs.GetLength() ); // bigint flags for *source* schema.
  6778. uint64_t uDst64 = 0;
  6779. ARRAY_FOREACH ( i, tUpd.m_dAttrs )
  6780. {
  6781. int iIndex = m_tSchema.GetAttrIndex ( tUpd.m_dAttrs[i].m_sName.cstr() );
  6782. if ( iIndex>=0 )
  6783. {
  6784. // forbid updates on non-int columns
  6785. const CSphColumnInfo & tCol = m_tSchema.GetAttr(iIndex);
  6786. if (!( tCol.m_eAttrType==SPH_ATTR_BOOL || tCol.m_eAttrType==SPH_ATTR_INTEGER || tCol.m_eAttrType==SPH_ATTR_TIMESTAMP
  6787. || tCol.m_eAttrType==SPH_ATTR_UINT32SET || tCol.m_eAttrType==SPH_ATTR_INT64SET
  6788. || tCol.m_eAttrType==SPH_ATTR_BIGINT || tCol.m_eAttrType==SPH_ATTR_FLOAT ))
  6789. {
  6790. sError.SetSprintf ( "attribute '%s' can not be updated "
  6791. "(must be boolean, integer, bigint, float, timestamp, or MVA)",
  6792. tUpd.m_dAttrs[i].m_sName.cstr() );
  6793. return -1;
  6794. }
  6795. // forbid updates on MVA columns if there's no arena
  6796. if ( ( tCol.m_eAttrType==SPH_ATTR_UINT32SET || tCol.m_eAttrType==SPH_ATTR_INT64SET ) && !g_pMvaArena )
  6797. {
  6798. sError.SetSprintf ( "MVA attribute '%s' can not be updated (MVA arena not initialized)", tCol.m_sName.cstr() );
  6799. return -1;
  6800. }
  6801. bool bSrcMva = ( tCol.m_eAttrType==SPH_ATTR_UINT32SET || tCol.m_eAttrType==SPH_ATTR_INT64SET );
  6802. bool bDstMva = ( tUpd.m_dAttrs[i].m_eAttrType==SPH_ATTR_UINT32SET || tUpd.m_dAttrs[i].m_eAttrType==SPH_ATTR_INT64SET );
  6803. if ( bSrcMva!=bDstMva )
  6804. {
  6805. sError.SetSprintf ( "attribute '%s' MVA flag mismatch", tUpd.m_dAttrs[i].m_sName.cstr() );
  6806. return -1;
  6807. }
  6808. if ( tCol.m_eAttrType==SPH_ATTR_UINT32SET && tUpd.m_dAttrs[i].m_eAttrType==SPH_ATTR_INT64SET )
  6809. {
  6810. sError.SetSprintf ( "attribute '%s' MVA bits (dst=%d, src=%d) mismatch", tUpd.m_dAttrs[i].m_sName.cstr(),
  6811. tCol.m_eAttrType, tUpd.m_dAttrs[i].m_eAttrType );
  6812. return -1;
  6813. }
  6814. if ( tCol.m_eAttrType==SPH_ATTR_INT64SET )
  6815. uDst64 |= ( U64C(1)<<i );
  6816. dFloats.Add ( tCol.m_eAttrType==SPH_ATTR_FLOAT );
  6817. dLocators.Add ( tCol.m_tLocator );
  6818. } else if ( !tUpd.m_bIgnoreNonexistent )
  6819. {
  6820. sError.SetSprintf ( "attribute '%s' not found", tUpd.m_dAttrs[i].m_sName.cstr() );
  6821. return -1;
  6822. }
  6823. dBigints.Add ( tUpd.m_dAttrs[i].m_eAttrType==SPH_ATTR_BIGINT );
  6824. // find dupes to optimize
  6825. ARRAY_FOREACH ( i, dIndexes )
  6826. if ( dIndexes[i]==iIndex )
  6827. {
  6828. dIndexes[i] = -1;
  6829. break;
  6830. }
  6831. dIndexes.Add ( iIndex );
  6832. }
  6833. assert ( tUpd.m_bIgnoreNonexistent || ( dLocators.GetLength()==tUpd.m_dAttrs.GetLength() ) );
  6834. // FIXME! FIXME! FIXME! overwriting just-freed blocks might hurt concurrent searchers;
  6835. // should implement a simplistic MVCC-style delayed-free to avoid that
  6836. // do the update
  6837. const int iFirst = ( iIndex<0 ) ? 0 : iIndex;
  6838. const int iLast = ( iIndex<0 ) ? uRows : iIndex+1;
  6839. // row update must leave it in cosistent state; so let's preallocate all the needed MVA
  6840. // storage upfront to avoid suddenly having to rollback if allocation fails later
  6841. int iNumMVA = 0;
  6842. ARRAY_FOREACH ( i, tUpd.m_dAttrs )
  6843. if ( dIndexes[i]>=0 && ( tUpd.m_dAttrs[i].m_eAttrType==SPH_ATTR_UINT32SET || tUpd.m_dAttrs[i].m_eAttrType==SPH_ATTR_INT64SET ) )
  6844. iNumMVA++;
  6845. // OPTIMIZE! execute the code below conditionally
  6846. CSphVector<DWORD*> dRowPtrs;
  6847. CSphVector<int> dMvaPtrs;
  6848. dRowPtrs.Resize ( uRows );
  6849. dMvaPtrs.Resize ( uRows*iNumMVA );
  6850. dMvaPtrs.Fill ( -1 );
  6851. // preallocate
  6852. bool bFailed = false;
  6853. for ( int iUpd=iFirst; iUpd<iLast && !bFailed; iUpd++ )
  6854. {
  6855. dRowPtrs[iUpd] = const_cast < DWORD * > ( bRaw ? tUpd.m_dRows[iUpd] : FindDocinfo ( tUpd.m_dDocids[iUpd] ) );
  6856. if ( !dRowPtrs[iUpd] )
  6857. continue; // no such id
  6858. int iPoolPos = tUpd.m_dRowOffset[iUpd];
  6859. int iMvaPtr = iUpd*iNumMVA;
  6860. ARRAY_FOREACH_COND ( iCol, tUpd.m_dAttrs, !bFailed )
  6861. {
  6862. bool bSrcMva32 = ( tUpd.m_dAttrs[iCol].m_eAttrType==SPH_ATTR_UINT32SET );
  6863. bool bSrcMva64 = ( tUpd.m_dAttrs[iCol].m_eAttrType==SPH_ATTR_INT64SET );
  6864. if (!( bSrcMva32 || bSrcMva64 )) // FIXME! optimize using a prebuilt dword mask?
  6865. {
  6866. iPoolPos++;
  6867. if ( dBigints[iCol] )
  6868. iPoolPos++;
  6869. continue;
  6870. }
  6871. // get the requested new count
  6872. int iNewCount = (int)tUpd.m_dPool[iPoolPos++];
  6873. iPoolPos += iNewCount;
  6874. // try to alloc
  6875. if ( dIndexes[iCol]>=0 )
  6876. {
  6877. int iAlloc = -1;
  6878. if ( iNewCount )
  6879. {
  6880. bool bDst64 = ( uDst64 & ( U64C(1) << iCol ) )!=0;
  6881. assert ( (iNewCount%2)==0 );
  6882. int iLen = ( bDst64 ? iNewCount : iNewCount/2 );
  6883. iAlloc = g_MvaArena.TaggedAlloc ( m_iIndexTag, (1+iLen)*sizeof(DWORD)+sizeof(SphDocID_t) );
  6884. if ( iAlloc<0 )
  6885. bFailed = true;
  6886. }
  6887. // whatever the outcome, move the pointer
  6888. dMvaPtrs[iMvaPtr++] = iAlloc;
  6889. }
  6890. }
  6891. }
  6892. // if there were any allocation failures, rollback everything
  6893. if ( bFailed )
  6894. {
  6895. ARRAY_FOREACH ( i, dMvaPtrs )
  6896. if ( dMvaPtrs[i]>=0 )
  6897. g_MvaArena.TaggedFreeIndex ( m_iIndexTag, dMvaPtrs[i] );
  6898. sError.SetSprintf ( "out of pool memory on MVA update" );
  6899. return -1;
  6900. }
  6901. // preallocation went OK; do the actual update
  6902. int iRowStride = DOCINFO_IDSIZE + m_tSchema.GetRowSize();
  6903. int iUpdated = 0;
  6904. DWORD uUpdateMask = 0;
  6905. for ( int iUpd=iFirst; iUpd<iLast; iUpd++ )
  6906. {
  6907. DWORD * pEntry = dRowPtrs[iUpd];
  6908. if ( !pEntry )
  6909. continue; // no such id
  6910. int64_t iBlock = int64_t ( pEntry-m_pDocinfo.GetWritePtr() ) / ( iRowStride*DOCINFO_INDEX_FREQ );
  6911. DWORD * pBlockRanges = const_cast < DWORD * > ( &m_pDocinfoIndex[iBlock*iRowStride*2] );
  6912. DWORD * pIndexRanges = const_cast < DWORD * > ( &m_pDocinfoIndex[m_iDocinfoIndex*iRowStride*2] );
  6913. assert ( iBlock>=0 && iBlock<m_iDocinfoIndex );
  6914. assert ( bRaw || ( DOCINFO2ID(pEntry)==tUpd.m_dDocids[iUpd] ) );
  6915. pEntry = DOCINFO2ATTRS(pEntry);
  6916. int iPos = tUpd.m_dRowOffset[iUpd];
  6917. int iMvaPtr = iUpd*iNumMVA;
  6918. ARRAY_FOREACH ( iCol, tUpd.m_dAttrs )
  6919. {
  6920. bool bSrcMva32 = ( tUpd.m_dAttrs[iCol].m_eAttrType==SPH_ATTR_UINT32SET );
  6921. bool bSrcMva64 = ( tUpd.m_dAttrs[iCol].m_eAttrType==SPH_ATTR_INT64SET );
  6922. if (!( bSrcMva32 || bSrcMva64 )) // FIXME! optimize using a prebuilt dword mask?
  6923. {
  6924. // plain update
  6925. if ( dIndexes[iCol]>=0 )
  6926. {
  6927. SphAttr_t uValue = dBigints[iCol] ? MVA_UPSIZE ( &tUpd.m_dPool[iPos] ) : tUpd.m_dPool[iPos];
  6928. sphSetRowAttr ( pEntry, dLocators[iCol], uValue );
  6929. // update block and index ranges
  6930. for ( int i=0; i<2; i++ )
  6931. {
  6932. DWORD * pBlock = i ? pBlockRanges : pIndexRanges;
  6933. SphAttr_t uMin = sphGetRowAttr ( DOCINFO2ATTRS ( pBlock ), dLocators[iCol] );
  6934. SphAttr_t uMax = sphGetRowAttr ( DOCINFO2ATTRS ( pBlock+iRowStride ) , dLocators[iCol] );
  6935. if ( dFloats[iCol] ) // update float's indexes assumes float comparision
  6936. {
  6937. float fValue = sphDW2F ( (DWORD) uValue );
  6938. float fMin = sphDW2F ( (DWORD) uMin );
  6939. float fMax = sphDW2F ( (DWORD) uMax );
  6940. if ( fValue<fMin )
  6941. sphSetRowAttr ( DOCINFO2ATTRS ( pBlock ), dLocators[iCol], sphF2DW ( fValue ) );
  6942. if ( fValue>fMax )
  6943. sphSetRowAttr ( DOCINFO2ATTRS ( pBlock+iRowStride ), dLocators[iCol], sphF2DW ( fValue ) );
  6944. } else // update usual integers
  6945. {
  6946. if ( uValue<uMin )
  6947. sphSetRowAttr ( DOCINFO2ATTRS ( pBlock ), dLocators[iCol], uValue );
  6948. if ( uValue>uMax )
  6949. sphSetRowAttr ( DOCINFO2ATTRS ( pBlock+iRowStride ), dLocators[iCol], uValue );
  6950. }
  6951. }
  6952. uUpdateMask |= ATTRS_UPDATED;
  6953. }
  6954. iPos += dBigints[iCol]?2:1;
  6955. continue;
  6956. }
  6957. // MVA update
  6958. DWORD uOldIndex = MVA_DOWNSIZE ( sphGetRowAttr ( pEntry, dLocators[iCol] ) );
  6959. // get new count, store new data if needed
  6960. DWORD uNew = tUpd.m_dPool[iPos++];
  6961. const DWORD * pSrc = tUpd.m_dPool.Begin() + iPos;
  6962. iPos += uNew;
  6963. if ( dIndexes[iCol]>=0 )
  6964. {
  6965. int64_t iNewMin = LLONG_MAX, iNewMax = LLONG_MIN;
  6966. int iNewIndex = dMvaPtrs[iMvaPtr++];
  6967. if ( uNew )
  6968. {
  6969. assert ( iNewIndex>=0 );
  6970. SphDocID_t* pDocid = (SphDocID_t *)(g_pMvaArena + iNewIndex);
  6971. *pDocid++ = ( bRaw ? DOCINFO2ID ( tUpd.m_dRows[iUpd] ) : tUpd.m_dDocids[iUpd] );
  6972. iNewIndex = (DWORD *)pDocid - g_pMvaArena;
  6973. assert ( iNewIndex>=0 );
  6974. DWORD * pDst = g_pMvaArena + iNewIndex;
  6975. bool bDst64 = ( uDst64 & ( U64C(1) << iCol ) )!=0;
  6976. assert ( ( uNew%2 )==0 );
  6977. int iLen = ( bDst64 ? uNew : uNew/2 );
  6978. // setup new value (flagged index) to store within row
  6979. uNew = DWORD(iNewIndex) | MVA_ARENA_FLAG;
  6980. // MVA values counter first
  6981. *pDst++ = iLen;
  6982. if ( bDst64 )
  6983. {
  6984. while ( iLen )
  6985. {
  6986. int64_t uValue = MVA_UPSIZE ( pSrc );
  6987. iNewMin = Min ( iNewMin, uValue );
  6988. iNewMax = Max ( iNewMax, uValue );
  6989. *pDst++ = *pSrc++;
  6990. *pDst++ = *pSrc++;
  6991. iLen -= 2;
  6992. }
  6993. } else
  6994. {
  6995. while ( iLen-- )
  6996. {
  6997. DWORD uValue = *pSrc;
  6998. pSrc += 2;
  6999. *pDst++ = uValue;
  7000. iNewMin = Min ( iNewMin, uValue );
  7001. iNewMax = Max ( iNewMax, uValue );
  7002. }
  7003. }
  7004. }
  7005. // store new value
  7006. sphSetRowAttr ( pEntry, dLocators[iCol], uNew );
  7007. // update block and index ranges
  7008. if ( uNew )
  7009. for ( int i=0; i<2; i++ )
  7010. {
  7011. DWORD * pBlock = i ? pBlockRanges : pIndexRanges;
  7012. int64_t iMin = sphGetRowAttr ( DOCINFO2ATTRS ( pBlock ), dLocators[iCol] );
  7013. int64_t iMax = sphGetRowAttr ( DOCINFO2ATTRS ( pBlock+iRowStride ), dLocators[iCol] );
  7014. if ( iNewMin<iMin || iNewMax>iMax )
  7015. {
  7016. sphSetRowAttr ( DOCINFO2ATTRS ( pBlock ), dLocators[iCol], Min ( iMin, iNewMin ) );
  7017. sphSetRowAttr ( DOCINFO2ATTRS ( pBlock+iRowStride ), dLocators[iCol], Max ( iMax, iNewMax ) );
  7018. }
  7019. }
  7020. // free old storage if needed
  7021. if ( uOldIndex & MVA_ARENA_FLAG )
  7022. {
  7023. uOldIndex = ((DWORD*)((SphDocID_t*)(g_pMvaArena + (uOldIndex & MVA_OFFSET_MASK))-1))-g_pMvaArena;
  7024. g_MvaArena.TaggedFreeIndex ( m_iIndexTag, uOldIndex );
  7025. }
  7026. uUpdateMask |= ATTRS_MVA_UPDATED;
  7027. }
  7028. }
  7029. iUpdated++;
  7030. }
  7031. *m_pAttrsStatus |= uUpdateMask; // FIXME! add lock/atomic?
  7032. return iUpdated;
  7033. }
  7034. bool CSphIndex_VLN::LoadPersistentMVA ( CSphString & sError )
  7035. {
  7036. // prepare the file to load
  7037. CSphAutoreader fdReader;
  7038. if ( !fdReader.Open ( GetIndexFileName("mvp"), m_sLastError ) )
  7039. {
  7040. // no mvp means no saved attributes.
  7041. m_sLastError = "";
  7042. return true;
  7043. }
  7044. // check if we can
  7045. if ( m_tSettings.m_eDocinfo!=SPH_DOCINFO_EXTERN )
  7046. {
  7047. sError.SetSprintf ( "docinfo=extern required for updates" );
  7048. return false;
  7049. }
  7050. DWORD uDocs = fdReader.GetDword();
  7051. // if we have docs to update
  7052. if ( !uDocs )
  7053. return false;
  7054. CSphVector<SphDocID_t> dAffected ( uDocs );
  7055. fdReader.GetBytes ( &dAffected[0], uDocs*sizeof(SphDocID_t) );
  7056. // collect the indexes of MVA schema attributes
  7057. CSphVector<CSphAttrLocator> dMvaLocators;
  7058. for ( int i=0; i<m_tSchema.GetAttrsCount(); i++ )
  7059. {
  7060. const CSphColumnInfo & tAttr = m_tSchema.GetAttr(i);
  7061. if ( tAttr.m_eAttrType==SPH_ATTR_UINT32SET )
  7062. dMvaLocators.Add ( tAttr.m_tLocator );
  7063. }
  7064. #ifndef NDEBUG
  7065. int iMva64 = dMvaLocators.GetLength();
  7066. #endif
  7067. for ( int i=0; i<m_tSchema.GetAttrsCount(); i++ )
  7068. {
  7069. const CSphColumnInfo & tAttr = m_tSchema.GetAttr(i);
  7070. if ( tAttr.m_eAttrType==SPH_ATTR_INT64SET )
  7071. dMvaLocators.Add ( tAttr.m_tLocator );
  7072. }
  7073. assert ( dMvaLocators.GetLength()!=0 );
  7074. if ( g_MvaArena.GetError() ) // have to reset affected MVA in case of ( persistent MVA + no MVA arena )
  7075. {
  7076. ARRAY_FOREACH ( iDoc, dAffected )
  7077. {
  7078. DWORD * pDocinfo = const_cast<DWORD*> ( FindDocinfo ( dAffected[iDoc] ) );
  7079. assert ( pDocinfo );
  7080. DWORD * pAttrs = DOCINFO2ATTRS ( pDocinfo );
  7081. ARRAY_FOREACH ( iMva, dMvaLocators )
  7082. {
  7083. // reset MVA from arena
  7084. if ( MVA_DOWNSIZE ( sphGetRowAttr ( pAttrs, dMvaLocators[iMva] ) ) & MVA_ARENA_FLAG )
  7085. sphSetRowAttr ( pAttrs, dMvaLocators[iMva], 0 );
  7086. }
  7087. }
  7088. sphWarning ( "index '%s' forced to reset persistent MVAs ( %s )", m_sIndexName.cstr(), g_MvaArena.GetError() );
  7089. fdReader.Close();
  7090. return true;
  7091. }
  7092. CSphVector<DWORD*> dRowPtrs ( uDocs );
  7093. CSphVector<int> dAllocs;
  7094. dAllocs.Reserve ( uDocs );
  7095. // prealloc values (and also preload)
  7096. bool bFailed = false;
  7097. ARRAY_FOREACH ( i, dAffected )
  7098. {
  7099. DWORD* pDocinfo = const_cast<DWORD*> ( FindDocinfo ( dAffected[i] ) );
  7100. assert ( pDocinfo );
  7101. pDocinfo = DOCINFO2ATTRS ( pDocinfo );
  7102. ARRAY_FOREACH_COND ( j, dMvaLocators, !bFailed )
  7103. {
  7104. // if this MVA was updated
  7105. if ( MVA_DOWNSIZE ( sphGetRowAttr ( pDocinfo, dMvaLocators[j] ) ) & MVA_ARENA_FLAG )
  7106. {
  7107. DWORD uCount = fdReader.GetDword();
  7108. if ( uCount )
  7109. {
  7110. assert ( j<iMva64 || ( uCount%2 )==0 );
  7111. int iAlloc = g_MvaArena.TaggedAlloc ( m_iIndexTag, (1+uCount)*sizeof(DWORD)+sizeof(SphDocID_t) );
  7112. if ( iAlloc<0 )
  7113. bFailed = true;
  7114. else
  7115. {
  7116. SphDocID_t *pDocid = (SphDocID_t*)(g_pMvaArena + iAlloc);
  7117. *pDocid++ = dAffected[i];
  7118. DWORD * pData = (DWORD*)pDocid;
  7119. *pData++ = uCount;
  7120. fdReader.GetBytes ( pData, uCount*sizeof(DWORD) );
  7121. dAllocs.Add ( iAlloc );
  7122. }
  7123. }
  7124. }
  7125. }
  7126. if ( bFailed )
  7127. break;
  7128. dRowPtrs[i] = pDocinfo;
  7129. }
  7130. fdReader.Close();
  7131. if ( bFailed )
  7132. {
  7133. ARRAY_FOREACH ( i, dAllocs )
  7134. g_MvaArena.TaggedFreeIndex ( m_iIndexTag, dAllocs[i] );
  7135. sError.SetSprintf ( "out of pool memory on loading persistent MVA values" );
  7136. return false;
  7137. }
  7138. // prealloc && load ok, fix the attributes now
  7139. int iAllocIndex = 0;
  7140. ARRAY_FOREACH ( i, dAffected )
  7141. {
  7142. DWORD* pDocinfo = dRowPtrs[i];
  7143. assert ( pDocinfo );
  7144. ARRAY_FOREACH_COND ( j, dMvaLocators, !bFailed )
  7145. // if this MVA was updated
  7146. if ( MVA_DOWNSIZE ( sphGetRowAttr ( pDocinfo, dMvaLocators[j] ) ) & MVA_ARENA_FLAG )
  7147. sphSetRowAttr ( pDocinfo, dMvaLocators[j],
  7148. ((DWORD*)(((SphDocID_t*)(g_pMvaArena + dAllocs[iAllocIndex++]))+1) - g_pMvaArena) | MVA_ARENA_FLAG );
  7149. }
  7150. return true;
  7151. }
  7152. //////////////////////////////////////////////////////////////////////////
  7153. bool CSphIndex_VLN::PrecomputeMinMax()
  7154. {
  7155. if ( !m_iDocinfo )
  7156. return true;
  7157. AttrIndexBuilder_c tBuilder ( m_tSchema );
  7158. tBuilder.Prepare ( m_pDocinfoIndex, m_pDocinfoIndex + ( m_iDocinfoIndex+1 ) * 2 * ( DOCINFO_IDSIZE + m_tSchema.GetRowSize() ) );
  7159. int iStride = DOCINFO_IDSIZE + m_tSchema.GetRowSize();
  7160. m_tProgress.m_ePhase = CSphIndexProgress::PHASE_PRECOMPUTE;
  7161. m_tProgress.m_iDone = 0;
  7162. m_uMinMaxIndex = 0;
  7163. for ( int64_t iIndexEntry=0; iIndexEntry<m_iDocinfo; iIndexEntry++ )
  7164. {
  7165. if ( !tBuilder.Collect ( m_pDocinfo.GetWritePtr() + iIndexEntry * iStride, m_pMva.GetWritePtr(),
  7166. (int64_t)m_pMva.GetNumEntries(), m_sLastError, true ) )
  7167. return false;
  7168. m_uMinMaxIndex += iStride;
  7169. // show progress
  7170. int64_t iDone = (iIndexEntry+1)*1000/m_iDocinfoIndex;
  7171. if ( iDone!=m_tProgress.m_iDone )
  7172. {
  7173. m_tProgress.m_iDone = (int)iDone;
  7174. m_tProgress.Show ( m_tProgress.m_iDone==1000 );
  7175. }
  7176. }
  7177. tBuilder.FinishCollect();
  7178. return true;
  7179. }
  7180. // safely rename an index file
  7181. bool CSphIndex_VLN::JuggleFile ( const char* szExt, CSphString & sError, bool bNeedOrigin ) const
  7182. {
  7183. CSphString sExt = GetIndexFileName ( szExt );
  7184. CSphString sExtNew, sExtOld;
  7185. sExtNew.SetSprintf ( "%s.tmpnew", sExt.cstr() );
  7186. sExtOld.SetSprintf ( "%s.tmpold", sExt.cstr() );
  7187. if ( ::rename ( sExt.cstr(), sExtOld.cstr() ) )
  7188. {
  7189. if ( bNeedOrigin )
  7190. {
  7191. sError.SetSprintf ( "rename '%s' to '%s' failed: %s", sExt.cstr(), sExtOld.cstr(), strerror(errno) );
  7192. return false;
  7193. }
  7194. }
  7195. if ( ::rename ( sExtNew.cstr(), sExt.cstr() ) )
  7196. {
  7197. if ( bNeedOrigin && !::rename ( sExtOld.cstr(), sExt.cstr() ) )
  7198. {
  7199. // rollback failed too!
  7200. sError.SetSprintf ( "rollback rename to '%s' failed: %s; INDEX UNUSABLE; FIX FILE NAMES MANUALLY", sExt.cstr(), strerror(errno) );
  7201. } else
  7202. {
  7203. // rollback went ok
  7204. sError.SetSprintf ( "rename '%s' to '%s' failed: %s", sExtNew.cstr(), sExt.cstr(), strerror(errno) );
  7205. }
  7206. return false;
  7207. }
  7208. // all done
  7209. ::unlink ( sExtOld.cstr() );
  7210. return true;
  7211. }
  7212. bool CSphIndex_VLN::SaveAttributes ( CSphString & sError ) const
  7213. {
  7214. if ( !m_pAttrsStatus || !*m_pAttrsStatus || !m_iDocinfo )
  7215. return true;
  7216. DWORD uAttrStatus = *m_pAttrsStatus;
  7217. sphLogDebugvv ( "index '%s' attrs (%d) saving...", m_sIndexName.cstr(), uAttrStatus );
  7218. assert ( m_tSettings.m_eDocinfo==SPH_DOCINFO_EXTERN && m_iDocinfo && m_pDocinfo.GetWritePtr() );
  7219. for ( ; uAttrStatus & ATTRS_MVA_UPDATED ; )
  7220. {
  7221. // collect the indexes of MVA schema attributes
  7222. CSphVector<CSphAttrLocator> dMvaLocators;
  7223. for ( int i=0; i<m_tSchema.GetAttrsCount(); i++ )
  7224. {
  7225. const CSphColumnInfo & tAttr = m_tSchema.GetAttr(i);
  7226. if ( tAttr.m_eAttrType==SPH_ATTR_UINT32SET )
  7227. dMvaLocators.Add ( tAttr.m_tLocator );
  7228. }
  7229. #ifndef NDEBUG
  7230. int iMva64 = dMvaLocators.GetLength();
  7231. #endif
  7232. for ( int i=0; i<m_tSchema.GetAttrsCount(); i++ )
  7233. {
  7234. const CSphColumnInfo & tAttr = m_tSchema.GetAttr(i);
  7235. if ( tAttr.m_eAttrType==SPH_ATTR_INT64SET )
  7236. dMvaLocators.Add ( tAttr.m_tLocator );
  7237. }
  7238. assert ( dMvaLocators.GetLength()!=0 );
  7239. // collect the list of all docids with changed MVA attributes
  7240. CSphVector<SphDocID_t> dAffected;
  7241. {
  7242. tDocCollector dCollect ( dAffected );
  7243. g_MvaArena.ExamineTag ( &dCollect, m_iIndexTag );
  7244. }
  7245. dAffected.Uniq();
  7246. if ( !dAffected.GetLength() )
  7247. break;
  7248. // prepare the file to save into;
  7249. CSphWriter fdFlushMVA;
  7250. fdFlushMVA.OpenFile ( GetIndexFileName("mvp.tmpnew"), sError );
  7251. if ( fdFlushMVA.IsError() )
  7252. return false;
  7253. // save the vector of affected docids
  7254. DWORD uPos = dAffected.GetLength();
  7255. fdFlushMVA.PutDword ( uPos );
  7256. fdFlushMVA.PutBytes ( &dAffected[0], uPos*sizeof(SphDocID_t) );
  7257. // save the updated MVA vectors
  7258. ARRAY_FOREACH ( i, dAffected )
  7259. {
  7260. DWORD* pDocinfo = const_cast<DWORD*> ( FindDocinfo ( dAffected[i] ) );
  7261. assert ( pDocinfo );
  7262. pDocinfo = DOCINFO2ATTRS ( pDocinfo );
  7263. ARRAY_FOREACH ( j, dMvaLocators )
  7264. {
  7265. DWORD uOldIndex = MVA_DOWNSIZE ( sphGetRowAttr ( pDocinfo, dMvaLocators[j] ) );
  7266. // if this MVA was updated
  7267. if ( uOldIndex & MVA_ARENA_FLAG )
  7268. {
  7269. DWORD * pMva = g_pMvaArena + ( uOldIndex & MVA_OFFSET_MASK );
  7270. DWORD uCount = *pMva;
  7271. assert ( j<iMva64 || ( uCount%2 )==0 );
  7272. fdFlushMVA.PutDword ( uCount );
  7273. fdFlushMVA.PutBytes ( pMva+1, uCount*sizeof(DWORD) );
  7274. }
  7275. }
  7276. }
  7277. fdFlushMVA.CloseFile();
  7278. if ( !JuggleFile ( "mvp", sError, false ) )
  7279. return false;
  7280. break;
  7281. }
  7282. if ( m_bId32to64 )
  7283. {
  7284. sError.SetSprintf ( "id32 index loaded by id64 binary; saving is not (yet) possible" );
  7285. return false;
  7286. }
  7287. assert ( m_tSettings.m_eDocinfo==SPH_DOCINFO_EXTERN && m_iDocinfo && m_pDocinfo.GetWritePtr() );
  7288. // save current state
  7289. CSphAutofile fdTmpnew ( GetIndexFileName("spa.tmpnew"), SPH_O_NEW, sError );
  7290. if ( fdTmpnew.GetFD()<0 )
  7291. return false;
  7292. int uStride = DOCINFO_IDSIZE + m_tSchema.GetRowSize();
  7293. int64_t iSize = m_iDocinfo*sizeof(DWORD)*uStride;
  7294. if ( m_uVersion>=20 )
  7295. iSize += (m_iDocinfoIndex+1)*uStride*sizeof(CSphRowitem)*2;
  7296. if ( !sphWriteThrottled ( fdTmpnew.GetFD(), m_pDocinfo.GetWritePtr(), iSize, "docinfo", sError, &g_tThrottle ) )
  7297. return false;
  7298. fdTmpnew.Close ();
  7299. if ( !JuggleFile ( "spa", sError ) )
  7300. return false;
  7301. if ( g_pBinlog )
  7302. g_pBinlog->NotifyIndexFlush ( m_sIndexName.cstr(), m_iTID, false );
  7303. if ( *m_pAttrsStatus==uAttrStatus )
  7304. *m_pAttrsStatus = 0;
  7305. sphLogDebugvv ( "index '%s' attrs (%d) saved", m_sIndexName.cstr(), *m_pAttrsStatus );
  7306. return true;
  7307. }
  7308. DWORD CSphIndex_VLN::GetAttributeStatus () const
  7309. {
  7310. assert ( m_pAttrsStatus );
  7311. return *m_pAttrsStatus;
  7312. }
  7313. /////////////////////////////////////////////////////////////////////////////
  // Strict "less than" over two hits: ordered by word id, then document id, then word position.
  // Arguments are parenthesized so that any expression (for instance, *pHit) can be passed;
  // note that each argument is still evaluated several times.
  #define SPH_CMPHIT_LESS(a,b) \
      ( (a).m_iWordID<(b).m_iWordID || \
      ( (a).m_iWordID==(b).m_iWordID && (a).m_iDocID<(b).m_iDocID ) || \
      ( (a).m_iWordID==(b).m_iWordID && (a).m_iDocID==(b).m_iDocID && (a).m_iWordPos<(b).m_iWordPos ) )
  7318. struct CmpHit_fn
  7319. {
  7320. inline bool IsLess ( const CSphWordHit & a, const CSphWordHit & b ) const
  7321. {
  7322. return SPH_CMPHIT_LESS ( a, b );
  7323. }
  7324. };
  7325. /// sort baked docinfos by document ID
  7326. struct DocinfoSort_fn
  7327. {
  7328. typedef SphDocID_t MEDIAN_TYPE;
  7329. int m_iStride;
  7330. explicit DocinfoSort_fn ( int iStride )
  7331. : m_iStride ( iStride )
  7332. {}
  7333. SphDocID_t Key ( DWORD * pData ) const
  7334. {
  7335. return DOCINFO2ID(pData);
  7336. }
  7337. void CopyKey ( SphDocID_t * pMed, DWORD * pVal ) const
  7338. {
  7339. *pMed = Key(pVal);
  7340. }
  7341. bool IsLess ( SphDocID_t a, SphDocID_t b ) const
  7342. {
  7343. return a < b;
  7344. }
  7345. void Swap ( DWORD * a, DWORD * b ) const
  7346. {
  7347. for ( int i=0; i<m_iStride; i++ )
  7348. ::Swap ( a[i], b[i] );
  7349. }
  7350. DWORD * Add ( DWORD * p, int i ) const
  7351. {
  7352. return p+i*m_iStride;
  7353. }
  7354. int Sub ( DWORD * b, DWORD * a ) const
  7355. {
  7356. return (int)((b-a)/m_iStride);
  7357. }
  7358. };
  7359. void sphSortDocinfos ( DWORD * pBuf, int iCount, int iStride )
  7360. {
  7361. DocinfoSort_fn fnSort ( iStride );
  7362. sphSort ( pBuf, iCount, fnSort, fnSort );
  7363. }
  7364. CSphString CSphIndex_VLN::GetIndexFileName ( const char * sExt ) const
  7365. {
  7366. CSphString sRes;
  7367. sRes.SetSprintf ( "%s.%s", m_sFilename.cstr(), sExt );
  7368. return sRes;
  7369. }
  7370. class CSphHitBuilder
  7371. {
  7372. public:
  7373. CSphHitBuilder ( const CSphIndexSettings & tSettings, const CSphVector<SphWordID_t> & dHitless, bool bMerging, int iBufSize, CSphDict * pDict, CSphString * sError );
  7374. ~CSphHitBuilder () {}
  7375. bool CreateIndexFiles ( const char * sDocName, const char * sHitName, const char * sSkipName, bool bInplace, int iWriteBuffer, CSphAutofile & tHit, SphOffset_t * pSharedOffset );
  7376. void HitReset ();
  7377. void DoclistBeginEntry ( SphDocID_t uDocid, const DWORD * pAttrs );
  7378. void DoclistEndEntry ( Hitpos_t uLastPos );
  7379. void DoclistEndList ();
  7380. void cidxHit ( CSphAggregateHit * pHit, const CSphRowitem * pAttrs );
  7381. bool cidxDone ( int iMemLimit, int iMinInfixLen, int iMaxCodepointLen, DictHeader_t * pDictHeader );
  7382. int cidxWriteRawVLB ( int fd, CSphWordHit * pHit, int iHits, DWORD * pDocinfo, int iDocinfos, int iStride );
  7383. SphOffset_t GetHitfilePos () const { return m_wrHitlist.GetPos (); }
  7384. void CloseHitlist () { m_wrHitlist.CloseFile (); }
  7385. bool IsError () const { return ( m_pDict->DictIsError() || m_wrDoclist.IsError() || m_wrHitlist.IsError() ); }
  7386. void SetMin ( const CSphRowitem * pDynamic, int iDynamic );
  7387. void HitblockBegin () { m_pDict->HitblockBegin(); }
  7388. bool IsWordDict () const { return m_pDict->GetSettings().m_bWordDict; }
  7389. void SetThrottle ( ThrottleState_t * pState ) { m_pThrottle = pState; }
  7390. private:
  7391. CSphWriter m_wrDoclist; ///< wordlist writer
  7392. CSphWriter m_wrHitlist; ///< hitlist writer
  7393. CSphWriter m_wrSkiplist; ///< skiplist writer
  7394. CSphFixedVector<BYTE> m_dWriteBuffer; ///< my write buffer (for temp files)
  7395. ThrottleState_t * m_pThrottle;
  7396. CSphFixedVector<CSphRowitem> m_dMinRow;
  7397. CSphAggregateHit m_tLastHit; ///< hitlist entry
  7398. BYTE m_sLastKeyword [ MAX_KEYWORD_BYTES ];
  7399. const CSphVector<SphWordID_t> & m_dHitlessWords;
  7400. CSphDict * m_pDict;
  7401. CSphString * m_pLastError;
  7402. SphOffset_t m_iLastHitlistPos; ///< doclist entry
  7403. SphOffset_t m_iLastHitlistDelta; ///< doclist entry
  7404. CSphSmallBitvec m_dLastDocFields; ///< doclist entry
  7405. DWORD m_uLastDocHits; ///< doclist entry
  7406. CSphDictEntry m_tWord; ///< dictionary entry
  7407. ESphHitFormat m_eHitFormat;
  7408. ESphHitless m_eHitless;
  7409. bool m_bMerging;
  7410. CSphVector<SkiplistEntry_t> m_dSkiplist;
  7411. };
  7412. CSphHitBuilder::CSphHitBuilder ( const CSphIndexSettings & tSettings,
  7413. const CSphVector<SphWordID_t> & dHitless, bool bMerging, int iBufSize,
  7414. CSphDict * pDict, CSphString * sError )
  7415. : m_dWriteBuffer ( iBufSize )
  7416. , m_dMinRow ( 0 )
  7417. , m_dHitlessWords ( dHitless )
  7418. , m_pDict ( pDict )
  7419. , m_pLastError ( sError )
  7420. , m_eHitFormat ( tSettings.m_eHitFormat )
  7421. , m_eHitless ( tSettings.m_eHitless )
  7422. , m_bMerging ( bMerging )
  7423. {
  7424. m_sLastKeyword[0] = '\0';
  7425. HitReset();
  7426. m_iLastHitlistPos = 0;
  7427. m_iLastHitlistDelta = 0;
  7428. m_dLastDocFields.Unset();
  7429. m_uLastDocHits = 0;
  7430. m_tWord.m_iDoclistOffset = 0;
  7431. m_tWord.m_iDocs = 0;
  7432. m_tWord.m_iHits = 0;
  7433. assert ( m_pDict );
  7434. assert ( m_pLastError );
  7435. m_pThrottle = &g_tThrottle;
  7436. }
  7437. void CSphHitBuilder::SetMin ( const CSphRowitem * pDynamic, int iDynamic )
  7438. {
  7439. assert ( !iDynamic || pDynamic );
  7440. m_dMinRow.Reset ( iDynamic );
  7441. ARRAY_FOREACH ( i, m_dMinRow )
  7442. {
  7443. m_dMinRow[i] = pDynamic[i];
  7444. }
  7445. }
  7446. bool CSphHitBuilder::CreateIndexFiles ( const char * sDocName, const char * sHitName, const char * sSkipName,
  7447. bool bInplace, int iWriteBuffer, CSphAutofile & tHit, SphOffset_t * pSharedOffset )
  7448. {
  7449. // doclist and hitlist files
  7450. m_wrDoclist.CloseFile();
  7451. m_wrHitlist.CloseFile();
  7452. m_wrSkiplist.CloseFile();
  7453. m_wrDoclist.SetBufferSize ( m_dWriteBuffer.GetLength() );
  7454. m_wrHitlist.SetBufferSize ( bInplace ? iWriteBuffer : m_dWriteBuffer.GetLength() );
  7455. m_wrDoclist.SetThrottle ( m_pThrottle );
  7456. m_wrHitlist.SetThrottle ( m_pThrottle );
  7457. if ( !m_wrDoclist.OpenFile ( sDocName, *m_pLastError ) )
  7458. return false;
  7459. if ( bInplace )
  7460. {
  7461. sphSeek ( tHit.GetFD(), 0, SEEK_SET );
  7462. m_wrHitlist.SetFile ( tHit, pSharedOffset, *m_pLastError );
  7463. } else
  7464. {
  7465. if ( !m_wrHitlist.OpenFile ( sHitName, *m_pLastError ) )
  7466. return false;
  7467. }
  7468. if ( !m_wrSkiplist.OpenFile ( sSkipName, *m_pLastError ) )
  7469. return false;
  7470. // put dummy byte (otherwise offset would start from 0, first delta would be 0
  7471. // and VLB encoding of offsets would fuckup)
  7472. BYTE bDummy = 1;
  7473. m_wrDoclist.PutBytes ( &bDummy, 1 );
  7474. m_wrHitlist.PutBytes ( &bDummy, 1 );
  7475. m_wrSkiplist.PutBytes ( &bDummy, 1 );
  7476. return true;
  7477. }
  7478. void CSphHitBuilder::HitReset()
  7479. {
  7480. m_tLastHit.m_iDocID = 0;
  7481. m_tLastHit.m_iWordID = 0;
  7482. m_tLastHit.m_iWordPos = EMPTY_HIT;
  7483. m_tLastHit.m_sKeyword = m_sLastKeyword;
  7484. }
  7485. // doclist entry format
  7486. // (with the new and shiny "inline hit" format, that is)
  7487. //
  7488. // zint docid_delta
  7489. // zint[] inline_attrs
  7490. // zint doc_hits
  7491. // if doc_hits==1:
  7492. // zint field_pos
  7493. // zint field_no
  7494. // else:
  7495. // zint field_mask
  7496. // zint hlist_offset_delta
  7497. //
  7498. // so 4 bytes/doc minimum
  7499. // avg 4-6 bytes/doc according to our tests
  7500. void CSphHitBuilder::DoclistBeginEntry ( SphDocID_t uDocid, const DWORD * pAttrs )
  7501. {
  7502. // build skiplist
  7503. // that is, save decoder state and doclist position per every 128 documents
  7504. if ( ( m_tWord.m_iDocs & ( SPH_SKIPLIST_BLOCK-1 ) )==0 )
  7505. {
  7506. SkiplistEntry_t & tBlock = m_dSkiplist.Add();
  7507. tBlock.m_iBaseDocid = m_tLastHit.m_iDocID;
  7508. tBlock.m_iOffset = m_wrDoclist.GetPos();
  7509. tBlock.m_iBaseHitlistPos = m_iLastHitlistPos;
  7510. }
  7511. // begin doclist entry
  7512. m_wrDoclist.ZipOffset ( uDocid - m_tLastHit.m_iDocID );
  7513. assert ( !pAttrs || m_dMinRow.GetLength() );
  7514. if ( pAttrs )
  7515. {
  7516. ARRAY_FOREACH ( i, m_dMinRow )
  7517. m_wrDoclist.ZipInt ( pAttrs[i] - m_dMinRow[i] );
  7518. }
  7519. }
  7520. void CSphHitBuilder::DoclistEndEntry ( Hitpos_t uLastPos )
  7521. {
  7522. // end doclist entry
  7523. if ( m_eHitFormat==SPH_HIT_FORMAT_INLINE )
  7524. {
  7525. bool bIgnoreHits =
  7526. ( m_eHitless==SPH_HITLESS_ALL ) ||
  7527. ( m_eHitless==SPH_HITLESS_SOME && ( m_tWord.m_iDocs & 0x80000000 ) );
  7528. // inline the only hit into doclist (unless it is completely discarded)
  7529. // and finish doclist entry
  7530. m_wrDoclist.ZipInt ( m_uLastDocHits );
  7531. if ( m_uLastDocHits==1 && !bIgnoreHits )
  7532. {
  7533. m_wrHitlist.SeekTo ( m_iLastHitlistPos );
  7534. m_wrDoclist.ZipInt ( uLastPos & 0x7FFFFF );
  7535. m_wrDoclist.ZipInt ( uLastPos >> 23 );
  7536. m_iLastHitlistPos -= m_iLastHitlistDelta;
  7537. assert ( m_iLastHitlistPos>=0 );
  7538. } else
  7539. {
  7540. m_wrDoclist.ZipInt ( m_dLastDocFields.GetMask32() );
  7541. m_wrDoclist.ZipOffset ( m_iLastHitlistDelta );
  7542. }
  7543. } else // plain format - finish doclist entry
  7544. {
  7545. assert ( m_eHitFormat==SPH_HIT_FORMAT_PLAIN );
  7546. m_wrDoclist.ZipOffset ( m_iLastHitlistDelta );
  7547. m_wrDoclist.ZipInt ( m_dLastDocFields.GetMask32() );
  7548. m_wrDoclist.ZipInt ( m_uLastDocHits );
  7549. }
  7550. m_dLastDocFields.Unset();
  7551. m_uLastDocHits = 0;
  7552. // update keyword stats
  7553. m_tWord.m_iDocs++;
  7554. }
  7555. void CSphHitBuilder::DoclistEndList ()
  7556. {
  7557. // emit eof marker
  7558. m_wrDoclist.ZipInt ( 0 );
  7559. // emit skiplist
  7560. // OPTIMIZE? placing it after doclist means an extra seek on searching
  7561. // however placing it before means some (longer) doclist data moves while indexing
  7562. if ( m_tWord.m_iDocs>SPH_SKIPLIST_BLOCK )
  7563. {
  7564. assert ( m_dSkiplist.GetLength() );
  7565. assert ( m_dSkiplist[0].m_iOffset==m_tWord.m_iDoclistOffset );
  7566. assert ( m_dSkiplist[0].m_iBaseDocid==0 );
  7567. assert ( m_dSkiplist[0].m_iBaseHitlistPos==0 );
  7568. m_tWord.m_iSkiplistOffset = m_wrSkiplist.GetPos();
  7569. // delta coding, but with a couple of skiplist specific tricks
  7570. // 1) first entry is omitted, it gets reconstructed from dict itself
  7571. // both base values are zero, and offset equals doclist offset
  7572. // 2) docids are at least SKIPLIST_BLOCK apart
  7573. // doclist entries are at least 4*SKIPLIST_BLOCK bytes apart
  7574. // so we additionally subtract that to improve delta coding
  7575. // 3) zero deltas are allowed and *not* used as any markers,
  7576. // as we know the exact skiplist entry count anyway
  7577. SkiplistEntry_t tLast = m_dSkiplist[0];
  7578. for ( int i=1; i<m_dSkiplist.GetLength(); i++ )
  7579. {
  7580. const SkiplistEntry_t & t = m_dSkiplist[i];
  7581. assert ( t.m_iBaseDocid - tLast.m_iBaseDocid>=SPH_SKIPLIST_BLOCK );
  7582. assert ( t.m_iOffset - tLast.m_iOffset>=4*SPH_SKIPLIST_BLOCK );
  7583. m_wrSkiplist.ZipOffset ( t.m_iBaseDocid - tLast.m_iBaseDocid - SPH_SKIPLIST_BLOCK );
  7584. m_wrSkiplist.ZipOffset ( t.m_iOffset - tLast.m_iOffset - 4*SPH_SKIPLIST_BLOCK );
  7585. m_wrSkiplist.ZipOffset ( t.m_iBaseHitlistPos - tLast.m_iBaseHitlistPos );
  7586. tLast = t;
  7587. }
  7588. }
  7589. // in any event, reset skiplist
  7590. m_dSkiplist.Resize ( 0 );
  7591. }
  7592. void CSphHitBuilder::cidxHit ( CSphAggregateHit * pHit, const CSphRowitem * pAttrs )
  7593. {
  7594. assert (
  7595. ( pHit->m_iWordID!=0 && pHit->m_iWordPos!=EMPTY_HIT && pHit->m_iDocID!=0 ) || // it's either ok hit
  7596. ( pHit->m_iWordID==0 && pHit->m_iWordPos==EMPTY_HIT ) ); // or "flush-hit"
  7597. /////////////
  7598. // next word
  7599. /////////////
  7600. bool bNextWord = ( m_tLastHit.m_iWordID!=pHit->m_iWordID ||
  7601. ( m_pDict->GetSettings().m_bWordDict && strcmp ( (char*)m_tLastHit.m_sKeyword, (char*)pHit->m_sKeyword ) ) ); // OPTIMIZE?
  7602. bool bNextDoc = bNextWord || ( m_tLastHit.m_iDocID!=pHit->m_iDocID );
  7603. if ( bNextDoc )
  7604. {
  7605. // finish hitlist, if any
  7606. Hitpos_t uLastPos = m_tLastHit.m_iWordPos;
  7607. if ( m_tLastHit.m_iWordPos!=EMPTY_HIT )
  7608. {
  7609. m_wrHitlist.ZipInt ( 0 );
  7610. m_tLastHit.m_iWordPos = EMPTY_HIT;
  7611. }
  7612. // finish doclist entry, if any
  7613. if ( m_tLastHit.m_iDocID )
  7614. DoclistEndEntry ( uLastPos );
  7615. }
  7616. if ( bNextWord )
  7617. {
  7618. // finish doclist, if any
  7619. if ( m_tLastHit.m_iDocID )
  7620. {
  7621. // emit end-of-doclist marker
  7622. DoclistEndList ();
  7623. // emit dict entry
  7624. m_tWord.m_uWordID = m_tLastHit.m_iWordID;
  7625. m_tWord.m_sKeyword = m_tLastHit.m_sKeyword;
  7626. m_tWord.m_iDoclistLength = m_wrDoclist.GetPos() - m_tWord.m_iDoclistOffset;
  7627. m_pDict->DictEntry ( m_tWord );
  7628. // reset trackers
  7629. m_tWord.m_iDocs = 0;
  7630. m_tWord.m_iHits = 0;
  7631. m_tLastHit.m_iDocID = 0;
  7632. m_iLastHitlistPos = 0;
  7633. }
  7634. // flush wordlist, if this is the end
  7635. if ( pHit->m_iWordPos==EMPTY_HIT )
  7636. {
  7637. m_pDict->DictEndEntries ( m_wrDoclist.GetPos() );
  7638. return;
  7639. }
  7640. assert ( pHit->m_iWordID > m_tLastHit.m_iWordID
  7641. || ( m_pDict->GetSettings().m_bWordDict &&
  7642. pHit->m_iWordID==m_tLastHit.m_iWordID && strcmp ( (char*)pHit->m_sKeyword, (char*)m_tLastHit.m_sKeyword )>0 )
  7643. || m_bMerging );
  7644. m_tWord.m_iDoclistOffset = m_wrDoclist.GetPos();
  7645. m_tLastHit.m_iWordID = pHit->m_iWordID;
  7646. if ( m_pDict->GetSettings().m_bWordDict )
  7647. {
  7648. assert ( strlen ( (char *)pHit->m_sKeyword )<sizeof(m_sLastKeyword)-1 );
  7649. strncpy ( (char*)m_tLastHit.m_sKeyword, (char*)pHit->m_sKeyword, sizeof(m_sLastKeyword) ); // OPTIMIZE?
  7650. }
  7651. }
  7652. if ( bNextDoc )
  7653. {
  7654. // begin new doclist entry for new doc id
  7655. assert ( pHit->m_iDocID>m_tLastHit.m_iDocID );
  7656. assert ( m_wrHitlist.GetPos()>=m_iLastHitlistPos );
  7657. DoclistBeginEntry ( pHit->m_iDocID, pAttrs );
  7658. m_iLastHitlistDelta = m_wrHitlist.GetPos() - m_iLastHitlistPos;
  7659. m_tLastHit.m_iDocID = pHit->m_iDocID;
  7660. m_iLastHitlistPos = m_wrHitlist.GetPos();
  7661. }
  7662. ///////////
  7663. // the hit
  7664. ///////////
  7665. if ( !pHit->m_dFieldMask.TestAll(false) ) // merge aggregate hits into the current hit
  7666. {
  7667. int iHitCount = pHit->GetAggrCount();
  7668. assert ( m_eHitless );
  7669. assert ( iHitCount );
  7670. assert ( !pHit->m_dFieldMask.TestAll(false) );
  7671. m_uLastDocHits += iHitCount;
  7672. m_dLastDocFields |= pHit->m_dFieldMask;
  7673. m_tWord.m_iHits += iHitCount;
  7674. if ( m_eHitless==SPH_HITLESS_SOME )
  7675. m_tWord.m_iDocs |= 0x80000000;
  7676. } else // handle normal hits
  7677. {
  7678. // add hit delta
  7679. if ( pHit->m_iWordPos==m_tLastHit.m_iWordPos )
  7680. return;
  7681. assert ( m_tLastHit.m_iWordPos < pHit->m_iWordPos );
  7682. m_wrHitlist.ZipInt ( pHit->m_iWordPos - m_tLastHit.m_iWordPos );
  7683. m_tLastHit.m_iWordPos = pHit->m_iWordPos;
  7684. m_tWord.m_iHits++;
  7685. // update matched fields mask
  7686. m_dLastDocFields.Set ( HITMAN::GetField ( pHit->m_iWordPos ) );
  7687. m_uLastDocHits++;
  7688. }
  7689. }
  7690. static void ReadSchemaColumn ( CSphReader & rdInfo, CSphColumnInfo & tCol, DWORD uVersion )
  7691. {
  7692. tCol.m_sName = rdInfo.GetString ();
  7693. if ( tCol.m_sName.IsEmpty () )
  7694. tCol.m_sName = "@emptyname";
  7695. tCol.m_sName.ToLower ();
  7696. tCol.m_eAttrType = (ESphAttr) rdInfo.GetDword (); // FIXME? check/fixup?
  7697. if ( uVersion>=5 ) // m_uVersion for searching
  7698. {
  7699. rdInfo.GetDword (); // ignore rowitem
  7700. tCol.m_tLocator.m_iBitOffset = rdInfo.GetDword ();
  7701. tCol.m_tLocator.m_iBitCount = rdInfo.GetDword ();
  7702. } else
  7703. {
  7704. tCol.m_tLocator.m_iBitOffset = -1;
  7705. tCol.m_tLocator.m_iBitCount = -1;
  7706. }
  7707. if ( uVersion>=16 ) // m_uVersion for searching
  7708. tCol.m_bPayload = ( rdInfo.GetByte()!=0 );
  7709. // WARNING! max version used here must be in sync with RtIndex_t::Prealloc
  7710. }
  7711. void ReadSchema ( CSphReader & rdInfo, CSphSchema & m_tSchema, DWORD uVersion, bool bDynamic )
  7712. {
  7713. m_tSchema.Reset ();
  7714. m_tSchema.m_dFields.Resize ( rdInfo.GetDword() );
  7715. ARRAY_FOREACH ( i, m_tSchema.m_dFields )
  7716. ReadSchemaColumn ( rdInfo, m_tSchema.m_dFields[i], uVersion );
  7717. int iNumAttrs = rdInfo.GetDword();
  7718. for ( int i=0; i<iNumAttrs; i++ )
  7719. {
  7720. CSphColumnInfo tCol;
  7721. ReadSchemaColumn ( rdInfo, tCol, uVersion );
  7722. m_tSchema.AddAttr ( tCol, bDynamic );
  7723. }
  7724. }
  7725. static void WriteSchemaColumn ( CSphWriter & fdInfo, const CSphColumnInfo & tCol )
  7726. {
  7727. int iLen = strlen ( tCol.m_sName.cstr() );
  7728. fdInfo.PutDword ( iLen );
  7729. fdInfo.PutBytes ( tCol.m_sName.cstr(), iLen );
  7730. ESphAttr eAttrType = tCol.m_eAttrType;
  7731. if ( eAttrType==SPH_ATTR_WORDCOUNT )
  7732. eAttrType = SPH_ATTR_INTEGER;
  7733. fdInfo.PutDword ( eAttrType );
  7734. fdInfo.PutDword ( tCol.m_tLocator.CalcRowitem() ); // for backwards compatibility
  7735. fdInfo.PutDword ( tCol.m_tLocator.m_iBitOffset );
  7736. fdInfo.PutDword ( tCol.m_tLocator.m_iBitCount );
  7737. fdInfo.PutByte ( tCol.m_bPayload );
  7738. }
  7739. void WriteSchema ( CSphWriter & fdInfo, const CSphSchema & tSchema )
  7740. {
  7741. // schema
  7742. fdInfo.PutDword ( tSchema.m_dFields.GetLength() );
  7743. ARRAY_FOREACH ( i, tSchema.m_dFields )
  7744. WriteSchemaColumn ( fdInfo, tSchema.m_dFields[i] );
  7745. fdInfo.PutDword ( tSchema.GetAttrsCount() );
  7746. for ( int i=0; i<tSchema.GetAttrsCount(); i++ )
  7747. WriteSchemaColumn ( fdInfo, tSchema.GetAttr(i) );
  7748. }
  7749. void SaveIndexSettings ( CSphWriter & tWriter, const CSphIndexSettings & tSettings )
  7750. {
  7751. tWriter.PutDword ( tSettings.m_iMinPrefixLen );
  7752. tWriter.PutDword ( tSettings.m_iMinInfixLen );
  7753. tWriter.PutDword ( tSettings.m_iMaxSubstringLen );
  7754. tWriter.PutByte ( tSettings.m_bHtmlStrip ? 1 : 0 );
  7755. tWriter.PutString ( tSettings.m_sHtmlIndexAttrs.cstr () );
  7756. tWriter.PutString ( tSettings.m_sHtmlRemoveElements.cstr () );
  7757. tWriter.PutByte ( tSettings.m_bIndexExactWords ? 1 : 0 );
  7758. tWriter.PutDword ( tSettings.m_eHitless );
  7759. tWriter.PutDword ( tSettings.m_eHitFormat );
  7760. tWriter.PutByte ( tSettings.m_bIndexSP );
  7761. tWriter.PutString ( tSettings.m_sZones );
  7762. tWriter.PutDword ( tSettings.m_iBoundaryStep );
  7763. tWriter.PutDword ( tSettings.m_iStopwordStep );
  7764. tWriter.PutDword ( tSettings.m_iOvershortStep );
  7765. tWriter.PutDword ( tSettings.m_iEmbeddedLimit );
  7766. tWriter.PutByte ( tSettings.m_eBigramIndex );
  7767. tWriter.PutString ( tSettings.m_sBigramWords );
  7768. tWriter.PutByte ( tSettings.m_bIndexFieldLens );
  7769. }
  7770. bool CSphIndex_VLN::WriteHeader ( const BuildHeader_t & tBuildHeader, CSphWriter & fdInfo ) const
  7771. {
  7772. // version
  7773. fdInfo.PutDword ( INDEX_MAGIC_HEADER );
  7774. fdInfo.PutDword ( INDEX_FORMAT_VERSION );
  7775. // bits
  7776. fdInfo.PutDword ( USE_64BIT );
  7777. // docinfo
  7778. fdInfo.PutDword ( m_tSettings.m_eDocinfo );
  7779. // schema
  7780. WriteSchema ( fdInfo, m_tSchema );
  7781. // min doc
  7782. fdInfo.PutOffset ( tBuildHeader.m_iMinDocid ); // was dword in v.1
  7783. if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_INLINE )
  7784. fdInfo.PutBytes ( tBuildHeader.m_pMinRow, m_tSchema.GetRowSize()*sizeof(CSphRowitem) );
  7785. // wordlist checkpoints
  7786. fdInfo.PutOffset ( tBuildHeader.m_iDictCheckpointsOffset );
  7787. fdInfo.PutDword ( tBuildHeader.m_iDictCheckpoints );
  7788. fdInfo.PutByte ( tBuildHeader.m_iInfixCodepointBytes );
  7789. fdInfo.PutDword ( tBuildHeader.m_iInfixBlocksOffset );
  7790. fdInfo.PutDword ( tBuildHeader.m_iInfixBlocksWordsSize );
  7791. // index stats
  7792. fdInfo.PutDword ( (DWORD)tBuildHeader.m_iTotalDocuments ); // FIXME? we don't expect over 4G docs per just 1 local index
  7793. fdInfo.PutOffset ( tBuildHeader.m_iTotalBytes );
  7794. // index settings
  7795. SaveIndexSettings ( fdInfo, m_tSettings );
  7796. // tokenizer info
  7797. assert ( m_pTokenizer );
  7798. SaveTokenizerSettings ( fdInfo, m_pTokenizer, m_tSettings.m_iEmbeddedLimit );
  7799. // dictionary info
  7800. assert ( m_pDict );
  7801. SaveDictionarySettings ( fdInfo, m_pDict, false, m_tSettings.m_iEmbeddedLimit );
  7802. fdInfo.PutDword ( tBuildHeader.m_iKillListSize );
  7803. fdInfo.PutOffset ( tBuildHeader.m_uMinMaxIndex );
  7804. // field filter info
  7805. SaveFieldFilterSettings ( fdInfo, m_pFieldFilter );
  7806. // average field lengths
  7807. if ( m_tSettings.m_bIndexFieldLens )
  7808. ARRAY_FOREACH ( i, m_tSchema.m_dFields )
  7809. fdInfo.PutOffset ( m_dFieldLens[i] );
  7810. return true;
  7811. }
  7812. bool CSphIndex_VLN::BuildDone ( const BuildHeader_t & tBuildHeader, CSphString & sError ) const
  7813. {
  7814. CSphWriter fdInfo;
  7815. fdInfo.SetThrottle ( tBuildHeader.m_pThrottle );
  7816. fdInfo.OpenFile ( GetIndexFileName ( tBuildHeader.m_sHeaderExtension ), sError );
  7817. if ( fdInfo.IsError() )
  7818. return false;
  7819. if ( !WriteHeader ( tBuildHeader, fdInfo ) )
  7820. return false;
  7821. // close header
  7822. fdInfo.CloseFile ();
  7823. return !fdInfo.IsError();
  7824. }
  7825. bool CSphHitBuilder::cidxDone ( int iMemLimit, int iMinInfixLen, int iMaxCodepointLen, DictHeader_t * pDictHeader )
  7826. {
  7827. assert ( pDictHeader );
  7828. // finalize dictionary
  7829. // in dict=crc mode, just flushes wordlist checkpoints
  7830. // in dict=keyword mode, also creates infix index, if needed
  7831. if ( iMinInfixLen>0 && m_pDict->GetSettings().m_bWordDict )
  7832. pDictHeader->m_iInfixCodepointBytes = iMaxCodepointLen;
  7833. if ( !m_pDict->DictEnd ( pDictHeader, iMemLimit, *m_pLastError, m_pThrottle ) )
  7834. return false;
  7835. // close all data files
  7836. m_wrDoclist.CloseFile ();
  7837. m_wrHitlist.CloseFile ( true );
  7838. return !IsError();
  7839. }
  7840. inline int encodeVLB ( BYTE * buf, DWORD v )
  7841. {
  7842. register BYTE b;
  7843. register int n = 0;
  7844. do
  7845. {
  7846. b = (BYTE)(v & 0x7f);
  7847. v >>= 7;
  7848. if ( v )
  7849. b |= 0x80;
  7850. *buf++ = b;
  7851. n++;
  7852. } while ( v );
  7853. return n;
  7854. }
  7855. inline int encodeVLB8 ( BYTE * buf, uint64_t v )
  7856. {
  7857. register BYTE b;
  7858. register int n = 0;
  7859. do {
  7860. b = (BYTE)(v & 0x7f);
  7861. v >>= 7;
  7862. if ( v )
  7863. b |= 0x80;
  7864. *buf++ = b;
  7865. n++;
  7866. } while ( v );
  7867. return n;
  7868. }
  7869. inline int encodeKeyword ( BYTE * pBuf, const char * pKeyword )
  7870. {
  7871. int iLen = strlen ( pKeyword ); // OPTIMIZE! remove this and memcpy and check if thats faster
  7872. assert ( iLen>0 && iLen<128 ); // so that ReadVLB()
  7873. *pBuf = (BYTE) iLen;
  7874. memcpy ( pBuf+1, pKeyword, iLen );
  7875. return 1+iLen;
  7876. }
  7877. int CSphHitBuilder::cidxWriteRawVLB ( int fd, CSphWordHit * pHit, int iHits, DWORD * pDocinfo, int iDocinfos, int iStride )
  7878. {
  7879. // PROFILE ( write_hits );
  7880. assert ( pHit );
  7881. assert ( iHits>0 );
  7882. /////////////////////////////
  7883. // do simple bitwise hashing
  7884. /////////////////////////////
  7885. static const int HBITS = 11;
  7886. static const int HSIZE = ( 1 << HBITS );
  7887. SphDocID_t iStartID = 0;
  7888. int dHash [ HSIZE+1 ];
  7889. int iShift = 0;
  7890. if ( pDocinfo )
  7891. {
  7892. iStartID = DOCINFO2ID ( pDocinfo );
  7893. int iBits = sphLog2 ( DOCINFO2ID ( pDocinfo + (iDocinfos-1)*iStride ) - iStartID );
  7894. iShift = ( iBits<HBITS ) ? 0 : ( iBits-HBITS );
  7895. #ifndef NDEBUG
  7896. for ( int i=0; i<=HSIZE; i++ )
  7897. dHash[i] = -1;
  7898. #endif
  7899. dHash[0] = 0;
  7900. int iHashed = 0;
  7901. for ( int i=0; i<iDocinfos; i++ )
  7902. {
  7903. int iHash = (int)( ( DOCINFO2ID ( pDocinfo+i*iStride ) - iStartID ) >> iShift );
  7904. assert ( iHash>=0 && iHash<HSIZE );
  7905. if ( iHash>iHashed )
  7906. {
  7907. dHash [ iHashed+1 ] = i-1; // right boundary for prev hash value
  7908. dHash [ iHash ] = i; // left boundary for next hash value
  7909. iHashed = iHash;
  7910. }
  7911. }
  7912. dHash [ iHashed+1 ] = iDocinfos-1; // right boundary for last hash value
  7913. }
  7914. ///////////////////////////////////////
  7915. // encode through a small write buffer
  7916. ///////////////////////////////////////
  7917. BYTE *pBuf, *maxP;
  7918. int n = 0, w;
  7919. SphWordID_t d1, l1 = 0;
  7920. SphDocID_t d2, l2 = 0;
  7921. DWORD d3, l3 = 0; // !COMMIT must be wide enough
  7922. int iGap = Max ( 128, 16*sizeof(DWORD) + iStride*sizeof(DWORD) + ( m_pDict->GetSettings().m_bWordDict ? MAX_KEYWORD_BYTES : 0 ) );
  7923. pBuf = m_dWriteBuffer.Begin();
  7924. maxP = m_dWriteBuffer.Begin() + m_dWriteBuffer.GetLength() - iGap;
  7925. SphDocID_t iAttrID = 0; // current doc id
  7926. DWORD * pAttrs = NULL; // current doc attrs
  7927. // hit aggregation state
  7928. DWORD uHitCount = 0;
  7929. DWORD uHitFieldMask = 0;
  7930. const int iPositionShift = m_eHitless==SPH_HITLESS_SOME ? 1 : 0;
  7931. while ( iHits-- )
  7932. {
  7933. // find attributes by id
  7934. if ( pDocinfo && iAttrID!=pHit->m_iDocID )
  7935. {
  7936. int iHash = (int)( ( pHit->m_iDocID - iStartID ) >> iShift );
  7937. assert ( iHash>=0 && iHash<HSIZE );
  7938. int iStart = dHash[iHash];
  7939. int iEnd = dHash[iHash+1];
  7940. if ( pHit->m_iDocID==DOCINFO2ID ( pDocinfo + iStart*iStride ) )
  7941. {
  7942. pAttrs = DOCINFO2ATTRS ( pDocinfo + iStart*iStride );
  7943. } else if ( pHit->m_iDocID==DOCINFO2ID ( pDocinfo + iEnd*iStride ) )
  7944. {
  7945. pAttrs = DOCINFO2ATTRS ( pDocinfo + iEnd*iStride );
  7946. } else
  7947. {
  7948. pAttrs = NULL;
  7949. while ( iEnd-iStart>1 )
  7950. {
  7951. // check if nothing found
  7952. if (
  7953. pHit->m_iDocID < DOCINFO2ID ( pDocinfo + iStart*iStride ) ||
  7954. pHit->m_iDocID > DOCINFO2ID ( pDocinfo + iEnd*iStride ) )
  7955. break;
  7956. assert ( pHit->m_iDocID > DOCINFO2ID ( pDocinfo + iStart*iStride ) );
  7957. assert ( pHit->m_iDocID < DOCINFO2ID ( pDocinfo + iEnd*iStride ) );
  7958. int iMid = iStart + (iEnd-iStart)/2;
  7959. if ( pHit->m_iDocID==DOCINFO2ID ( pDocinfo + iMid*iStride ) )
  7960. {
  7961. pAttrs = DOCINFO2ATTRS ( pDocinfo + iMid*iStride );
  7962. break;
  7963. }
  7964. if ( pHit->m_iDocID<DOCINFO2ID ( pDocinfo + iMid*iStride ) )
  7965. iEnd = iMid;
  7966. else
  7967. iStart = iMid;
  7968. }
  7969. }
  7970. if ( !pAttrs )
  7971. sphDie ( "INTERNAL ERROR: failed to lookup attributes while saving collected hits" );
  7972. assert ( DOCINFO2ID ( pAttrs - DOCINFO_IDSIZE )==pHit->m_iDocID );
  7973. iAttrID = pHit->m_iDocID;
  7974. }
  7975. // calc deltas
  7976. d1 = pHit->m_iWordID - l1;
  7977. d2 = pHit->m_iDocID - l2;
  7978. d3 = pHit->m_iWordPos - l3;
  7979. // ignore duplicate hits
  7980. if ( d1==0 && d2==0 && d3==0 ) // OPTIMIZE? check if ( 0==(d1|d2|d3) ) is faster
  7981. {
  7982. pHit++;
  7983. continue;
  7984. }
  7985. // non-zero delta restarts all the fields after it
  7986. // because their deltas might now be negative
  7987. if ( d1 ) d2 = pHit->m_iDocID;
  7988. if ( d2 ) d3 = pHit->m_iWordPos;
  7989. // when we moved to the next word or document
  7990. bool bFlushed = false;
  7991. if ( d1 || d2 )
  7992. {
  7993. // flush previous aggregate hit
  7994. if ( uHitCount )
  7995. {
  7996. // we either skip all hits or the high bit must be available for marking
  7997. // failing that, we can't produce a consistent index
  7998. assert ( m_eHitless!=SPH_HITLESS_NONE );
  7999. assert ( m_eHitless==SPH_HITLESS_ALL || !( uHitCount & 0x80000000UL ) );
  8000. if ( m_eHitless!=SPH_HITLESS_ALL )
  8001. uHitCount = ( uHitCount << 1 ) | 1;
  8002. pBuf += encodeVLB ( pBuf, uHitCount );
  8003. pBuf += encodeVLB ( pBuf, uHitFieldMask );
  8004. assert ( pBuf<m_dWriteBuffer.Begin() + m_dWriteBuffer.GetLength() );
  8005. uHitCount = 0;
  8006. uHitFieldMask = 0;
  8007. bFlushed = true;
  8008. }
  8009. // start aggregating if we're skipping all hits or this word is in a list of ignored words
  8010. if ( ( m_eHitless==SPH_HITLESS_ALL ) ||
  8011. ( m_eHitless==SPH_HITLESS_SOME && m_dHitlessWords.BinarySearch ( pHit->m_iWordID ) ) )
  8012. {
  8013. uHitCount = 1;
  8014. uHitFieldMask |= 1 << HITMAN::GetField ( pHit->m_iWordPos );
  8015. }
  8016. } else if ( uHitCount ) // next hit for the same word/doc pair, update state if we need it
  8017. {
  8018. uHitCount++;
  8019. uHitFieldMask |= 1 << HITMAN::GetField ( pHit->m_iWordPos );
  8020. }
  8021. // encode enough restart markers
  8022. if ( d1 ) pBuf += encodeVLB ( pBuf, 0 );
  8023. if ( d2 && !bFlushed ) pBuf += encodeVLB ( pBuf, 0 );
  8024. assert ( pBuf<m_dWriteBuffer.Begin() + m_dWriteBuffer.GetLength() );
  8025. // encode deltas
  8026. #if USE_64BIT
  8027. #define LOC_ENCODE encodeVLB8
  8028. #else
  8029. #define LOC_ENCODE encodeVLB
  8030. #endif
  8031. // encode keyword
  8032. if ( d1 )
  8033. {
  8034. if ( m_pDict->GetSettings().m_bWordDict )
  8035. pBuf += encodeKeyword ( pBuf, m_pDict->HitblockGetKeyword ( pHit->m_iWordID ) ); // keyword itself in case of keywords dict
  8036. else
  8037. pBuf += LOC_ENCODE ( pBuf, d1 ); // delta in case of CRC dict
  8038. assert ( pBuf<m_dWriteBuffer.Begin() + m_dWriteBuffer.GetLength() );
  8039. }
  8040. // encode docid delta
  8041. if ( d2 )
  8042. {
  8043. pBuf += LOC_ENCODE ( pBuf, d2 );
  8044. assert ( pBuf<m_dWriteBuffer.Begin() + m_dWriteBuffer.GetLength() );
  8045. }
  8046. #undef LOC_ENCODE
  8047. // encode attrs
  8048. if ( d2 && pAttrs )
  8049. {
  8050. for ( int i=0; i<iStride-DOCINFO_IDSIZE; i++ )
  8051. {
  8052. pBuf += encodeVLB ( pBuf, pAttrs[i] );
  8053. assert ( pBuf<m_dWriteBuffer.Begin() + m_dWriteBuffer.GetLength() );
  8054. }
  8055. }
  8056. assert ( d3 );
  8057. if ( !uHitCount ) // encode position delta, unless accumulating hits
  8058. {
  8059. pBuf += encodeVLB ( pBuf, d3 << iPositionShift );
  8060. assert ( pBuf<m_dWriteBuffer.Begin() + m_dWriteBuffer.GetLength() );
  8061. }
  8062. // update current state
  8063. l1 = pHit->m_iWordID;
  8064. l2 = pHit->m_iDocID;
  8065. l3 = pHit->m_iWordPos;
  8066. pHit++;
  8067. if ( pBuf>maxP )
  8068. {
  8069. w = (int)(pBuf - m_dWriteBuffer.Begin());
  8070. assert ( w<m_dWriteBuffer.GetLength() );
  8071. if ( !sphWriteThrottled ( fd, m_dWriteBuffer.Begin(), w, "raw_hits", *m_pLastError, m_pThrottle ) )
  8072. return -1;
  8073. n += w;
  8074. pBuf = m_dWriteBuffer.Begin();
  8075. }
  8076. }
  8077. // flush last aggregate
  8078. if ( uHitCount )
  8079. {
  8080. assert ( m_eHitless!=SPH_HITLESS_NONE );
  8081. assert ( m_eHitless==SPH_HITLESS_ALL || !( uHitCount & 0x80000000UL ) );
  8082. if ( m_eHitless!=SPH_HITLESS_ALL )
  8083. uHitCount = ( uHitCount << 1 ) | 1;
  8084. pBuf += encodeVLB ( pBuf, uHitCount );
  8085. pBuf += encodeVLB ( pBuf, uHitFieldMask );
  8086. assert ( pBuf<m_dWriteBuffer.Begin() + m_dWriteBuffer.GetLength() );
  8087. }
  8088. pBuf += encodeVLB ( pBuf, 0 );
  8089. pBuf += encodeVLB ( pBuf, 0 );
  8090. pBuf += encodeVLB ( pBuf, 0 );
  8091. assert ( pBuf<m_dWriteBuffer.Begin() + m_dWriteBuffer.GetLength() );
  8092. w = (int)(pBuf - m_dWriteBuffer.Begin());
  8093. assert ( w<m_dWriteBuffer.GetLength() );
  8094. if ( !sphWriteThrottled ( fd, m_dWriteBuffer.Begin(), w, "raw_hits", *m_pLastError, m_pThrottle ) )
  8095. return -1;
  8096. n += w;
  8097. return n;
  8098. }
  8099. /////////////////////////////////////////////////////////////////////////////
  8100. // OPTIMIZE?
  8101. inline bool SPH_CMPAGGRHIT_LESS ( const CSphAggregateHit & a, const CSphAggregateHit & b )
  8102. {
  8103. if ( a.m_iWordID < b.m_iWordID )
  8104. return true;
  8105. if ( a.m_iWordID > b.m_iWordID )
  8106. return false;
  8107. if ( a.m_sKeyword )
  8108. {
  8109. int iCmp = strcmp ( (char*)a.m_sKeyword, (char*)b.m_sKeyword ); // OPTIMIZE?
  8110. if ( iCmp!=0 )
  8111. return ( iCmp<0 );
  8112. }
  8113. return
  8114. ( a.m_iDocID < b.m_iDocID ) ||
  8115. ( a.m_iDocID==b.m_iDocID && a.m_iWordPos<b.m_iWordPos );
  8116. }
  8117. /// hit priority queue entry
  8118. struct CSphHitQueueEntry : public CSphAggregateHit
  8119. {
  8120. int m_iBin;
  8121. };
  8122. /// hit priority queue
  8123. struct CSphHitQueue
  8124. {
  8125. public:
  8126. CSphHitQueueEntry * m_pData;
  8127. int m_iSize;
  8128. int m_iUsed;
  8129. public:
  8130. /// create queue
  8131. explicit CSphHitQueue ( int iSize )
  8132. {
  8133. assert ( iSize>0 );
  8134. m_iSize = iSize;
  8135. m_iUsed = 0;
  8136. m_pData = new CSphHitQueueEntry [ iSize ];
  8137. }
  8138. /// destroy queue
  8139. ~CSphHitQueue ()
  8140. {
  8141. SafeDeleteArray ( m_pData );
  8142. }
  8143. /// add entry to the queue
  8144. void Push ( CSphAggregateHit & tHit, int iBin )
  8145. {
  8146. // check for overflow and do add
  8147. assert ( m_iUsed<m_iSize );
  8148. m_pData [ m_iUsed ].m_iDocID = tHit.m_iDocID;
  8149. m_pData [ m_iUsed ].m_iWordID = tHit.m_iWordID;
  8150. m_pData [ m_iUsed ].m_sKeyword = tHit.m_sKeyword; // bin must hold the actual data for the queue
  8151. m_pData [ m_iUsed ].m_iWordPos = tHit.m_iWordPos;
  8152. m_pData [ m_iUsed ].m_dFieldMask = tHit.m_dFieldMask;
  8153. m_pData [ m_iUsed ].m_iBin = iBin;
  8154. int iEntry = m_iUsed++;
  8155. // sift up if needed
  8156. while ( iEntry )
  8157. {
  8158. int iParent = ( iEntry-1 ) >> 1;
  8159. if ( SPH_CMPAGGRHIT_LESS ( m_pData[iEntry], m_pData[iParent] ) )
  8160. {
  8161. // entry is less than parent, should float to the top
  8162. Swap ( m_pData[iEntry], m_pData[iParent] );
  8163. iEntry = iParent;
  8164. } else
  8165. {
  8166. break;
  8167. }
  8168. }
  8169. }
  8170. /// remove root (ie. top priority) entry
  8171. void Pop ()
  8172. {
  8173. assert ( m_iUsed );
  8174. if ( !(--m_iUsed) ) // empty queue? just return
  8175. return;
  8176. // make the last entry my new root
  8177. m_pData[0] = m_pData[m_iUsed];
  8178. // sift down if needed
  8179. int iEntry = 0;
  8180. for ( ;; )
  8181. {
  8182. // select child
  8183. int iChild = (iEntry<<1) + 1;
  8184. if ( iChild>=m_iUsed )
  8185. break;
  8186. // select smallest child
  8187. if ( iChild+1<m_iUsed )
  8188. if ( SPH_CMPAGGRHIT_LESS ( m_pData[iChild+1], m_pData[iChild] ) )
  8189. iChild++;
  8190. // if smallest child is less than entry, do float it to the top
  8191. if ( SPH_CMPAGGRHIT_LESS ( m_pData[iChild], m_pData[iEntry] ) )
  8192. {
  8193. Swap ( m_pData[iChild], m_pData[iEntry] );
  8194. iEntry = iChild;
  8195. continue;
  8196. }
  8197. break;
  8198. }
  8199. }
  8200. };
  8201. struct CmpQueuedDocinfo_fn
  8202. {
  8203. static DWORD * m_pStorage;
  8204. static int m_iStride;
  8205. static inline bool IsLess ( const int a, const int b )
  8206. {
  8207. return DOCINFO2ID ( m_pStorage + a*m_iStride ) < DOCINFO2ID ( m_pStorage + b*m_iStride );
  8208. };
  8209. };
  8210. DWORD * CmpQueuedDocinfo_fn::m_pStorage = NULL;
  8211. int CmpQueuedDocinfo_fn::m_iStride = 1;
// NOTE(review): no user of MAX_SOURCE_HITS is visible in this part of the file;
// judging by the name it limits hits taken from a source at once - confirm against callers
#define MAX_SOURCE_HITS 32768

// 4*1048576 bytes, ie. 4 MB
static const int MIN_KEYWORDS_DICT = 4*1048576; // FIXME! ideally must be in sync with impl (ENTRY_CHUNKS, KEYWORD_CHUNKS)
  8214. /////////////////////////////////////////////////////////////////////////////
  8215. struct MvaEntry_t
  8216. {
  8217. SphDocID_t m_uDocID;
  8218. int m_iAttr;
  8219. int64_t m_iValue;
  8220. inline bool operator < ( const MvaEntry_t & rhs ) const
  8221. {
  8222. if ( m_uDocID!=rhs.m_uDocID ) return m_uDocID<rhs.m_uDocID;
  8223. if ( m_iAttr!=rhs.m_iAttr ) return m_iAttr<rhs.m_iAttr;
  8224. return m_iValue<rhs.m_iValue;
  8225. }
  8226. };
  8227. struct MvaEntryTag_t : public MvaEntry_t
  8228. {
  8229. int m_iTag;
  8230. };
  8231. struct MvaEntryCmp_fn
  8232. {
  8233. static inline bool IsLess ( const MvaEntry_t & a, const MvaEntry_t & b )
  8234. {
  8235. return a<b;
  8236. };
  8237. };
  8238. bool CSphIndex_VLN::BuildMVA ( const CSphVector<CSphSource*> & dSources, CSphFixedVector<CSphWordHit> & dHits,
  8239. int iArenaSize, int iFieldFD, int nFieldMVAs, int iFieldMVAInPool, CSphIndex_VLN * pPrevIndex )
  8240. {
  8241. // initialize writer (data file must always exist)
  8242. CSphWriter wrMva;
  8243. if ( !wrMva.OpenFile ( GetIndexFileName("spm"), m_sLastError ) )
  8244. return false;
  8245. // calcs and checks
  8246. bool bOnlyFieldMVAs = true;
  8247. CSphVector<int> dMvaIndexes;
  8248. for ( int i=0; i<m_tSchema.GetAttrsCount(); i++ )
  8249. {
  8250. const CSphColumnInfo & tAttr = m_tSchema.GetAttr(i);
  8251. if ( tAttr.m_eAttrType==SPH_ATTR_UINT32SET )
  8252. {
  8253. dMvaIndexes.Add ( i );
  8254. if ( tAttr.m_eSrc!=SPH_ATTRSRC_FIELD )
  8255. bOnlyFieldMVAs = false;
  8256. }
  8257. }
  8258. int iMva64 = dMvaIndexes.GetLength();
  8259. // mva32 first
  8260. for ( int i=0; i<m_tSchema.GetAttrsCount(); i++ )
  8261. {
  8262. const CSphColumnInfo & tAttr = m_tSchema.GetAttr(i);
  8263. if ( tAttr.m_eAttrType==SPH_ATTR_INT64SET )
  8264. {
  8265. dMvaIndexes.Add ( i );
  8266. if ( tAttr.m_eSrc!=SPH_ATTRSRC_FIELD )
  8267. bOnlyFieldMVAs = false;
  8268. }
  8269. }
  8270. if ( dMvaIndexes.GetLength()<=0 )
  8271. return true;
  8272. // reuse hits pool
  8273. MvaEntry_t * pMvaPool = (MvaEntry_t*) dHits.Begin();
  8274. MvaEntry_t * pMvaMax = pMvaPool + ( iArenaSize/sizeof(MvaEntry_t) );
  8275. MvaEntry_t * pMva = pMvaPool;
  8276. // create temp file
  8277. CSphAutofile fdTmpMva ( GetIndexFileName("tmp3"), SPH_O_NEW, m_sLastError, true );
  8278. if ( fdTmpMva.GetFD()<0 )
  8279. return false;
  8280. //////////////////////////////
  8281. // collect and partially sort
  8282. //////////////////////////////
  8283. CSphVector<int> dBlockLens;
  8284. dBlockLens.Reserve ( 1024 );
  8285. m_tProgress.m_ePhase = CSphIndexProgress::PHASE_COLLECT_MVA;
  8286. if ( !bOnlyFieldMVAs )
  8287. {
  8288. ARRAY_FOREACH ( iSource, dSources )
  8289. {
  8290. CSphSource * pSource = dSources[iSource];
  8291. if ( !pSource->Connect ( m_sLastError ) )
  8292. return false;
  8293. ARRAY_FOREACH ( i, dMvaIndexes )
  8294. {
  8295. int iAttr = dMvaIndexes[i];
  8296. const CSphColumnInfo & tAttr = m_tSchema.GetAttr(iAttr);
  8297. if ( tAttr.m_eSrc==SPH_ATTRSRC_FIELD )
  8298. continue;
  8299. if ( !pSource->IterateMultivaluedStart ( iAttr, m_sLastError ) )
  8300. return false;
  8301. while ( pSource->IterateMultivaluedNext () )
  8302. {
  8303. if ( pPrevIndex && pPrevIndex->FindDocinfo ( pSource->m_tDocInfo.m_iDocID ) )
  8304. continue;
  8305. pMva->m_uDocID = pSource->m_tDocInfo.m_iDocID;
  8306. pMva->m_iAttr = i;
  8307. if ( tAttr.m_eAttrType==SPH_ATTR_UINT32SET )
  8308. {
  8309. pMva->m_iValue = pSource->m_dMva[0];
  8310. } else
  8311. {
  8312. pMva->m_iValue = MVA_UPSIZE ( pSource->m_dMva.Begin() );
  8313. }
  8314. if ( ++pMva>=pMvaMax )
  8315. {
  8316. sphSort ( pMvaPool, pMva-pMvaPool );
  8317. if ( !sphWriteThrottled ( fdTmpMva.GetFD(), pMvaPool, (pMva-pMvaPool)*sizeof(MvaEntry_t), "temp_mva", m_sLastError, &g_tThrottle ) )
  8318. return false;
  8319. dBlockLens.Add ( pMva-pMvaPool );
  8320. m_tProgress.m_iAttrs += pMva-pMvaPool;
  8321. pMva = pMvaPool;
  8322. m_tProgress.Show ( false );
  8323. }
  8324. }
  8325. }
  8326. pSource->Disconnect ();
  8327. }
  8328. if ( pMva>pMvaPool )
  8329. {
  8330. sphSort ( pMvaPool, pMva-pMvaPool );
  8331. if ( !sphWriteThrottled ( fdTmpMva.GetFD(), pMvaPool, (pMva-pMvaPool)*sizeof(MvaEntry_t), "temp_mva", m_sLastError, &g_tThrottle ) )
  8332. return false;
  8333. dBlockLens.Add ( pMva-pMvaPool );
  8334. m_tProgress.m_iAttrs += pMva-pMvaPool;
  8335. }
  8336. }
  8337. m_tProgress.Show ( true );
  8338. ///////////////////////////
  8339. // free memory for sorting
  8340. ///////////////////////////
  8341. dHits.Reset ( 0 );
  8342. //////////////
  8343. // fully sort
  8344. //////////////
  8345. m_tProgress.m_ePhase = CSphIndexProgress::PHASE_SORT_MVA;
  8346. m_tProgress.m_iAttrs = m_tProgress.m_iAttrs + nFieldMVAs;
  8347. m_tProgress.m_iAttrsTotal = m_tProgress.m_iAttrs;
  8348. m_tProgress.Show ( false );
  8349. int nLastBlockFieldMVAs = iFieldMVAInPool ? ( nFieldMVAs % iFieldMVAInPool ) : 0;
  8350. int nFieldBlocks = iFieldMVAInPool ? ( nFieldMVAs / iFieldMVAInPool + ( nLastBlockFieldMVAs ? 1 : 0 ) ) : 0;
  8351. // initialize readers
  8352. CSphVector<CSphBin*> dBins;
  8353. dBins.Reserve ( dBlockLens.GetLength() + nFieldBlocks );
  8354. int iBinSize = CSphBin::CalcBinSize ( iArenaSize, dBlockLens.GetLength() + nFieldBlocks, "sort_mva" );
  8355. SphOffset_t iSharedOffset = -1;
  8356. ARRAY_FOREACH ( i, dBlockLens )
  8357. {
  8358. dBins.Add ( new CSphBin() );
  8359. dBins[i]->m_iFileLeft = dBlockLens[i]*sizeof(MvaEntry_t);
  8360. dBins[i]->m_iFilePos = ( i==0 ) ? 0 : dBins[i-1]->m_iFilePos + dBins[i-1]->m_iFileLeft;
  8361. dBins[i]->Init ( fdTmpMva.GetFD(), &iSharedOffset, iBinSize );
  8362. }
  8363. SphOffset_t iSharedFieldOffset = -1;
  8364. SphOffset_t uStart = 0;
  8365. for ( int i = 0; i < nFieldBlocks; i++ )
  8366. {
  8367. dBins.Add ( new CSphBin() );
  8368. int iBin = dBins.GetLength () - 1;
  8369. dBins[iBin]->m_iFileLeft = sizeof(MvaEntry_t)*( i==nFieldBlocks-1
  8370. ? ( nLastBlockFieldMVAs ? nLastBlockFieldMVAs : iFieldMVAInPool )
  8371. : iFieldMVAInPool );
  8372. dBins[iBin]->m_iFilePos = uStart;
  8373. dBins[iBin]->Init ( iFieldFD, &iSharedFieldOffset, iBinSize );
  8374. uStart += dBins [iBin]->m_iFileLeft;
  8375. }
  8376. // do the sort
  8377. CSphQueue < MvaEntryTag_t, MvaEntryCmp_fn > qMva ( Max ( 1, dBins.GetLength() ) );
  8378. ARRAY_FOREACH ( i, dBins )
  8379. {
  8380. MvaEntryTag_t tEntry;
  8381. if ( dBins[i]->ReadBytes ( (MvaEntry_t*) &tEntry, sizeof(MvaEntry_t) )!=BIN_READ_OK )
  8382. {
  8383. m_sLastError.SetSprintf ( "sort_mva: warmup failed (io error?)" );
  8384. return false;
  8385. }
  8386. tEntry.m_iTag = i;
  8387. qMva.Push ( tEntry );
  8388. }
  8389. // spm-file := info-list [ 0+ ]
  8390. // info-list := docid, values-list [ index.schema.mva-count ]
  8391. // values-list := values-count, value [ values-count ]
  8392. // note that mva32 come first then mva64
  8393. SphDocID_t uCurID = 0;
  8394. CSphVector < CSphVector<int64_t> > dCurInfo;
  8395. dCurInfo.Resize ( dMvaIndexes.GetLength() );
  8396. for ( ;; )
  8397. {
  8398. // flush previous per-document info-list
  8399. if ( !qMva.GetLength() || qMva.Root().m_uDocID!=uCurID )
  8400. {
  8401. if ( uCurID )
  8402. {
  8403. wrMva.PutDocid ( uCurID );
  8404. ARRAY_FOREACH ( i, dCurInfo )
  8405. {
  8406. int iLen = dCurInfo[i].GetLength();
  8407. if ( i>=iMva64 )
  8408. {
  8409. wrMva.PutDword ( iLen*2 );
  8410. wrMva.PutBytes ( dCurInfo[i].Begin(), sizeof(int64_t)*iLen );
  8411. } else
  8412. {
  8413. wrMva.PutDword ( iLen );
  8414. ARRAY_FOREACH ( iVal, dCurInfo[i] )
  8415. {
  8416. wrMva.PutDword ( (DWORD)dCurInfo[i][iVal] );
  8417. }
  8418. }
  8419. }
  8420. }
  8421. if ( !qMva.GetLength() )
  8422. break;
  8423. uCurID = qMva.Root().m_uDocID;
  8424. ARRAY_FOREACH ( i, dCurInfo )
  8425. dCurInfo[i].Resize ( 0 );
  8426. }
  8427. // accumulate this entry
  8428. #if PARANOID
  8429. assert ( dCurInfo [ qMva.Root().m_iAttr ].GetLength()==0
  8430. || dCurInfo [ qMva.Root().m_iAttr ].Last()<=qMva.Root().m_iValue );
  8431. #endif
  8432. dCurInfo [ qMva.Root().m_iAttr ].AddUnique ( qMva.Root().m_iValue );
  8433. // get next entry
  8434. int iBin = qMva.Root().m_iTag;
  8435. qMva.Pop ();
  8436. MvaEntryTag_t tEntry;
  8437. ESphBinRead iRes = dBins[iBin]->ReadBytes ( (MvaEntry_t*)&tEntry, sizeof(MvaEntry_t) );
  8438. tEntry.m_iTag = iBin;
  8439. if ( iRes==BIN_READ_OK )
  8440. qMva.Push ( tEntry );
  8441. if ( iRes==BIN_READ_ERROR )
  8442. {
  8443. m_sLastError.SetSprintf ( "sort_mva: read error" );
  8444. return false;
  8445. }
  8446. }
  8447. // clean up readers
  8448. ARRAY_FOREACH ( i, dBins )
  8449. SafeDelete ( dBins[i] );
  8450. wrMva.CloseFile ();
  8451. if ( wrMva.IsError() )
  8452. return false;
  8453. m_tProgress.Show ( true );
  8454. return true;
  8455. }
  8456. struct CmpOrdinalsValue_fn
  8457. {
  8458. inline bool IsLess ( const Ordinal_t & a, const Ordinal_t & b ) const
  8459. {
  8460. return strcmp ( a.m_sValue.cstr(), b.m_sValue.cstr() )<0;
  8461. }
  8462. };
  8463. struct CmpOrdinalsEntry_fn
  8464. {
  8465. static inline bool IsLess ( const OrdinalEntry_t & a, const OrdinalEntry_t & b )
  8466. {
  8467. return strcmp ( a.m_sValue.cstr(), b.m_sValue.cstr() )<0;
  8468. }
  8469. };
  8470. struct CmpOrdinalsDocid_fn
  8471. {
  8472. inline bool IsLess ( const OrdinalId_t & a, const OrdinalId_t & b ) const
  8473. {
  8474. return a.m_uDocID < b.m_uDocID;
  8475. }
  8476. };
  8477. struct CmpMvaEntries_fn
  8478. {
  8479. inline bool IsLess ( const MvaEntry_t & a, const MvaEntry_t & b ) const
  8480. {
  8481. return a<b;
  8482. }
  8483. };
  8484. struct CmpOrdinalIdEntry_fn
  8485. {
  8486. static inline bool IsLess ( const OrdinalIdEntry_t & a, const OrdinalIdEntry_t & b )
  8487. {
  8488. return a.m_uDocID < b.m_uDocID;
  8489. }
  8490. };
  8491. SphOffset_t CSphIndex_VLN::DumpOrdinals ( CSphWriter & Writer, CSphVector<Ordinal_t> & dOrdinals )
  8492. {
  8493. SphOffset_t uSize = ( sizeof ( SphDocID_t ) + sizeof ( DWORD ) ) * dOrdinals.GetLength ();
  8494. ARRAY_FOREACH ( i, dOrdinals )
  8495. {
  8496. Ordinal_t & Ord = dOrdinals[i];
  8497. DWORD uValueLen = Ord.m_sValue.cstr () ? strlen ( Ord.m_sValue.cstr () ) : 0;
  8498. Writer.PutBytes ( &(Ord.m_uDocID), sizeof ( Ord.m_uDocID ) );
  8499. Writer.PutBytes ( &uValueLen, sizeof ( uValueLen ) );
  8500. Writer.PutBytes ( Ord.m_sValue.cstr (), uValueLen );
  8501. uSize += uValueLen;
  8502. if ( Writer.IsError () )
  8503. return 0;
  8504. }
  8505. return uSize;
  8506. }
  8507. ESphBinRead CSphIndex_VLN::ReadOrdinal ( CSphBin & Reader, Ordinal_t & Ordinal )
  8508. {
  8509. ESphBinRead eRes = Reader.ReadBytes ( &Ordinal.m_uDocID, sizeof ( Ordinal.m_uDocID ) );
  8510. if ( eRes!=BIN_READ_OK )
  8511. return eRes;
  8512. DWORD uStrLen;
  8513. eRes = Reader.ReadBytes ( &uStrLen, sizeof ( DWORD ) );
  8514. if ( eRes!=BIN_READ_OK )
  8515. return eRes;
  8516. if ( uStrLen>=(DWORD)MAX_ORDINAL_STR_LEN )
  8517. return BIN_READ_ERROR;
  8518. char dBuffer [MAX_ORDINAL_STR_LEN];
  8519. if ( uStrLen > 0 )
  8520. {
  8521. eRes = Reader.ReadBytes ( dBuffer, uStrLen );
  8522. if ( eRes!=BIN_READ_OK )
  8523. return eRes;
  8524. }
  8525. dBuffer [uStrLen] = '\0';
  8526. Ordinal.m_sValue = dBuffer;
  8527. return BIN_READ_OK;
  8528. }
  8529. bool CSphIndex_VLN::SortOrdinals ( const char * szToFile, int iFromFD, int iArenaSize,
  8530. int iOrdinalsInPool, CSphVector < CSphVector < SphOffset_t > > & dOrdBlockSize, bool bWarnOfMem )
  8531. {
  8532. int nAttrs = dOrdBlockSize.GetLength ();
  8533. int nBlocks = dOrdBlockSize[0].GetLength ();
  8534. CSphWriter Writer;
  8535. if ( !Writer.OpenFile ( szToFile, m_sLastError ) )
  8536. return false;
  8537. int iBinSize = CSphBin::CalcBinSize ( iArenaSize, nBlocks, "ordinals", bWarnOfMem );
  8538. SphOffset_t iSharedOffset = -1;
  8539. CSphQueue < OrdinalEntry_t, CmpOrdinalsEntry_fn > qOrdinals ( Max ( 1, nBlocks ) );
  8540. OrdinalEntry_t tOrdinalEntry;
  8541. DWORD uOrdinalId = 0;
  8542. CSphVector < OrdinalId_t > dOrdinalIdPool;
  8543. dOrdinalIdPool.Reserve ( nBlocks );
  8544. CSphVector < CSphVector < SphOffset_t > > dStarts;
  8545. dStarts.Resize ( nAttrs );
  8546. ARRAY_FOREACH ( i, dStarts )
  8547. dStarts[i].Resize ( nBlocks );
  8548. SphOffset_t uStart = 0;
  8549. for ( int iBlock = 0; iBlock < nBlocks; iBlock++ )
  8550. for ( int iAttr = 0; iAttr < nAttrs; iAttr++ )
  8551. {
  8552. dStarts [iAttr][iBlock] = uStart;
  8553. uStart += dOrdBlockSize [iAttr][iBlock];
  8554. }
  8555. for ( int iAttr = 0; iAttr < nAttrs; iAttr++ )
  8556. {
  8557. CSphVector < CSphBin > dBins;
  8558. dBins.Resize ( nBlocks );
  8559. ARRAY_FOREACH ( i, dBins )
  8560. {
  8561. dBins[i].m_iFileLeft = (int)dOrdBlockSize[iAttr][i];
  8562. dBins[i].m_iFilePos = dStarts[iAttr][i];
  8563. dBins[i].Init ( iFromFD, &iSharedOffset, iBinSize );
  8564. }
  8565. dOrdBlockSize [iAttr].Resize ( 0 );
  8566. for ( int iBlock = 0; iBlock < nBlocks; iBlock++ )
  8567. {
  8568. if ( ReadOrdinal ( dBins [iBlock], tOrdinalEntry )!=BIN_READ_OK )
  8569. {
  8570. m_sLastError = "sort_ordinals: warmup failed (io error?)";
  8571. return false;
  8572. }
  8573. tOrdinalEntry.m_iTag = iBlock;
  8574. qOrdinals.Push ( tOrdinalEntry );
  8575. }
  8576. SphDocID_t uCurID = 0;
  8577. CSphString sLastOrdValue;
  8578. int iMyBlock = 0;
  8579. for ( ;; )
  8580. {
  8581. if ( !qOrdinals.GetLength () || qOrdinals.Root ().m_uDocID!=uCurID )
  8582. {
  8583. if ( uCurID )
  8584. {
  8585. OrdinalId_t tId;
  8586. tId.m_uDocID = uCurID;
  8587. tId.m_uId = uOrdinalId;
  8588. dOrdinalIdPool.Add ( tId );
  8589. if ( qOrdinals.GetLength () > 0 )
  8590. {
  8591. if ( sLastOrdValue.cstr()[0]!=qOrdinals.Root ().m_sValue.cstr()[0] )
  8592. uOrdinalId++;
  8593. else
  8594. if ( strcmp ( sLastOrdValue.cstr (), qOrdinals.Root ().m_sValue.cstr () ) )
  8595. uOrdinalId++;
  8596. }
  8597. if ( dOrdinalIdPool.GetLength()==iOrdinalsInPool )
  8598. {
  8599. dOrdinalIdPool.Sort ( CmpOrdinalsDocid_fn () );
  8600. Writer.PutBytes ( &dOrdinalIdPool[0], sizeof(OrdinalId_t)*dOrdinalIdPool.GetLength() );
  8601. if ( Writer.IsError () )
  8602. {
  8603. m_sLastError = "sort_ordinals: io error";
  8604. return false;
  8605. }
  8606. dOrdBlockSize [iAttr].Add ( dOrdinalIdPool.GetLength () * sizeof ( OrdinalId_t ) );
  8607. dOrdinalIdPool.Resize ( 0 );
  8608. }
  8609. }
  8610. if ( !qOrdinals.GetLength () )
  8611. break;
  8612. uCurID = qOrdinals.Root().m_uDocID;
  8613. const_cast < CSphString & > ( qOrdinals.Root ().m_sValue ).Swap ( sLastOrdValue );
  8614. }
  8615. // get next entry
  8616. iMyBlock = qOrdinals.Root().m_iTag;
  8617. qOrdinals.Pop ();
  8618. ESphBinRead eRes = ReadOrdinal ( dBins [iMyBlock], tOrdinalEntry );
  8619. tOrdinalEntry.m_iTag = iMyBlock;
  8620. if ( eRes==BIN_READ_OK )
  8621. qOrdinals.Push ( tOrdinalEntry );
  8622. if ( eRes==BIN_READ_ERROR )
  8623. {
  8624. m_sLastError = "sort_ordinals: read error";
  8625. return false;
  8626. }
  8627. }
  8628. // flush last ordinal ids
  8629. if ( dOrdinalIdPool.GetLength () )
  8630. {
  8631. dOrdinalIdPool.Sort ( CmpOrdinalsDocid_fn () );
  8632. Writer.PutBytes ( &dOrdinalIdPool[0], sizeof(OrdinalId_t)*dOrdinalIdPool.GetLength () );
  8633. if ( Writer.IsError () )
  8634. {
  8635. m_sLastError = "sort_ordinals: io error";
  8636. return false;
  8637. }
  8638. dOrdBlockSize [iAttr].Add ( dOrdinalIdPool.GetLength()*sizeof(OrdinalId_t) );
  8639. dOrdinalIdPool.Resize ( 0 );
  8640. }
  8641. }
  8642. Writer.CloseFile ();
  8643. if ( Writer.IsError () )
  8644. return false;
  8645. return true;
  8646. }
  8647. bool CSphIndex_VLN::SortOrdinalIds ( const char * szToFile, int iFromFD, int iArenaSize,
  8648. CSphVector < CSphVector < SphOffset_t > > & dOrdBlockSize, bool bWarnOfMem )
  8649. {
  8650. int nAttrs = dOrdBlockSize.GetLength ();
  8651. int nMaxBlocks = 0;
  8652. ARRAY_FOREACH ( i, dOrdBlockSize )
  8653. if ( dOrdBlockSize[i].GetLength () > nMaxBlocks )
  8654. nMaxBlocks = dOrdBlockSize[i].GetLength ();
  8655. CSphWriter Writer;
  8656. if ( !Writer.OpenFile ( szToFile, m_sLastError ) )
  8657. return false;
  8658. int iBinSize = CSphBin::CalcBinSize ( iArenaSize, nMaxBlocks, "ordinals", bWarnOfMem );
  8659. SphOffset_t uStart = 0;
  8660. OrdinalIdEntry_t tOrdinalIdEntry;
  8661. OrdinalId_t tOrdinalId;
  8662. for ( int iAttr = 0; iAttr < nAttrs; ++iAttr )
  8663. {
  8664. int nBlocks = dOrdBlockSize [iAttr].GetLength ();
  8665. CSphQueue < OrdinalIdEntry_t, CmpOrdinalIdEntry_fn > qOrdinalIds ( Max ( 1, nBlocks ) );
  8666. CSphVector < CSphBin > dBins;
  8667. dBins.Resize ( nBlocks );
  8668. SphOffset_t iSharedOffset = -1;
  8669. ARRAY_FOREACH ( i, dBins )
  8670. {
  8671. dBins[i].m_iFileLeft = (int)dOrdBlockSize [iAttr][i];
  8672. dBins[i].m_iFilePos = uStart;
  8673. dBins[i].Init ( iFromFD, &iSharedOffset, iBinSize );
  8674. uStart += dBins[i].m_iFileLeft;
  8675. }
  8676. for ( int iBlock = 0; iBlock < nBlocks; iBlock++ )
  8677. {
  8678. if ( dBins[iBlock].ReadBytes ( &tOrdinalId, sizeof ( tOrdinalId ) )!=BIN_READ_OK )
  8679. {
  8680. m_sLastError = "sort_ordinals: warmup failed (io error?)";
  8681. return false;
  8682. }
  8683. tOrdinalIdEntry.m_uDocID = tOrdinalId.m_uDocID;
  8684. tOrdinalIdEntry.m_uId = tOrdinalId.m_uId;
  8685. tOrdinalIdEntry.m_iTag = iBlock;
  8686. qOrdinalIds.Push ( tOrdinalIdEntry );
  8687. }
  8688. OrdinalId_t tCachedId;
  8689. tCachedId.m_uDocID = 0;
  8690. SphOffset_t uResultSize = 0;
  8691. for ( ;; )
  8692. {
  8693. if ( !qOrdinalIds.GetLength () || qOrdinalIds.Root ().m_uDocID!=tCachedId.m_uDocID )
  8694. {
  8695. if ( tCachedId.m_uDocID )
  8696. {
  8697. uResultSize += sizeof ( OrdinalId_t );
  8698. Writer.PutBytes ( &tCachedId, sizeof ( OrdinalId_t ) );
  8699. if ( Writer.IsError () )
  8700. {
  8701. m_sLastError = "sort_ordinals: io error";
  8702. return false;
  8703. }
  8704. }
  8705. if ( !qOrdinalIds.GetLength () )
  8706. break;
  8707. tCachedId.m_uDocID = qOrdinalIds.Root().m_uDocID;
  8708. tCachedId.m_uId = qOrdinalIds.Root ().m_uId;
  8709. }
  8710. // get next entry
  8711. int iBlock = qOrdinalIds.Root().m_iTag;
  8712. qOrdinalIds.Pop ();
  8713. ESphBinRead eRes = dBins [iBlock].ReadBytes ( &tOrdinalId, sizeof ( tOrdinalId ) );
  8714. tOrdinalIdEntry.m_uDocID = tOrdinalId.m_uDocID;
  8715. tOrdinalIdEntry.m_uId = tOrdinalId.m_uId;
  8716. tOrdinalIdEntry.m_iTag = iBlock;
  8717. if ( eRes==BIN_READ_OK )
  8718. qOrdinalIds.Push ( tOrdinalIdEntry );
  8719. if ( eRes==BIN_READ_ERROR )
  8720. {
  8721. m_sLastError = "sort_ordinals: read error";
  8722. return false;
  8723. }
  8724. }
  8725. dOrdBlockSize [iAttr].Resize ( 0 );
  8726. dOrdBlockSize [iAttr].Add ( uResultSize );
  8727. }
  8728. return true;
  8729. }
  8730. struct FieldMVARedirect_t
  8731. {
  8732. CSphAttrLocator m_tLocator;
  8733. int m_iAttr;
  8734. int m_iMVAAttr;
  8735. bool m_bMva64;
  8736. };
  8737. bool CSphIndex_VLN::RelocateBlock ( int iFile, BYTE * pBuffer, int iRelocationSize,
  8738. SphOffset_t * pFileSize, CSphBin * pMinBin, SphOffset_t * pSharedOffset )
  8739. {
  8740. assert ( pBuffer && pFileSize && pMinBin && pSharedOffset );
  8741. SphOffset_t iBlockStart = pMinBin->m_iFilePos;
  8742. SphOffset_t iBlockLeft = pMinBin->m_iFileLeft;
  8743. ESphBinRead eRes = pMinBin->Precache ();
  8744. switch ( eRes )
  8745. {
  8746. case BIN_PRECACHE_OK:
  8747. return true;
  8748. case BIN_READ_ERROR:
  8749. m_sLastError = "block relocation: preread error";
  8750. return false;
  8751. default:
  8752. break;
  8753. }
  8754. int nTransfers = (int)( ( iBlockLeft+iRelocationSize-1) / iRelocationSize );
  8755. SphOffset_t uTotalRead = 0;
  8756. SphOffset_t uNewBlockStart = *pFileSize;
  8757. for ( int i = 0; i < nTransfers; i++ )
  8758. {
  8759. sphSeek ( iFile, iBlockStart + uTotalRead, SEEK_SET );
  8760. int iToRead = i==nTransfers-1 ? (int)( iBlockLeft % iRelocationSize ) : iRelocationSize;
  8761. size_t iRead = sphReadThrottled ( iFile, pBuffer, iToRead, &g_tThrottle );
  8762. if ( iRead!=size_t(iToRead) )
  8763. {
  8764. m_sLastError.SetSprintf ( "block relocation: read error (%d of %d bytes read): %s", (int)iRead, iToRead, strerror(errno) );
  8765. return false;
  8766. }
  8767. sphSeek ( iFile, *pFileSize, SEEK_SET );
  8768. uTotalRead += iToRead;
  8769. if ( !sphWriteThrottled ( iFile, pBuffer, iToRead, "block relocation", m_sLastError, &g_tThrottle ) )
  8770. return false;
  8771. *pFileSize += iToRead;
  8772. }
  8773. assert ( uTotalRead==iBlockLeft );
  8774. // update block pointers
  8775. pMinBin->m_iFilePos = uNewBlockStart;
  8776. *pSharedOffset = *pFileSize;
  8777. return true;
  8778. }
  8779. static int CountWords ( const CSphString & sData, ISphTokenizer * pTokenizer )
  8780. {
  8781. BYTE * sField = (BYTE*) sData.cstr();
  8782. if ( !sField )
  8783. return 0;
  8784. int iCount = 0;
  8785. pTokenizer->SetBuffer ( sField, (int)strlen ( (char*)sField ) );
  8786. while ( pTokenizer->GetToken() )
  8787. iCount++;
  8788. return iCount;
  8789. }
  8790. bool CSphIndex_VLN::LoadHitlessWords ( CSphVector<SphWordID_t> & dHitlessWords )
  8791. {
  8792. assert ( dHitlessWords.GetLength()==0 );
  8793. if ( m_tSettings.m_sHitlessFiles.IsEmpty() )
  8794. return true;
  8795. const char * szStart = m_tSettings.m_sHitlessFiles.cstr();
  8796. while ( *szStart )
  8797. {
  8798. while ( *szStart && ( sphIsSpace ( *szStart ) || *szStart==',' ) )
  8799. ++szStart;
  8800. if ( !*szStart )
  8801. break;
  8802. const char * szWordStart = szStart;
  8803. while ( *szStart && !sphIsSpace ( *szStart ) && *szStart!=',' )
  8804. ++szStart;
  8805. if ( szStart - szWordStart > 0 )
  8806. {
  8807. CSphString sFilename;
  8808. sFilename.SetBinary ( szWordStart, szStart-szWordStart );
  8809. CSphAutofile tFile ( sFilename.cstr(), SPH_O_READ, m_sLastError );
  8810. if ( tFile.GetFD()==-1 )
  8811. return false;
  8812. CSphVector<BYTE> dBuffer ( (int)tFile.GetSize() );
  8813. if ( !tFile.Read ( &dBuffer[0], dBuffer.GetLength(), m_sLastError ) )
  8814. return false;
  8815. // FIXME!!! dict=keywords + hitless_words=some
  8816. m_pTokenizer->SetBuffer ( &dBuffer[0], dBuffer.GetLength() );
  8817. while ( BYTE * sToken = m_pTokenizer->GetToken() )
  8818. dHitlessWords.Add ( m_pDict->GetWordID ( sToken ) );
  8819. }
  8820. }
  8821. dHitlessWords.Uniq();
  8822. return true;
  8823. }
  8824. static bool sphTruncate ( int iFD )
  8825. {
  8826. #if USE_WINDOWS
  8827. return SetEndOfFile ( (HANDLE) _get_osfhandle(iFD) )!=0;
  8828. #else
  8829. return ::ftruncate ( iFD, ::lseek ( iFD, 0, SEEK_CUR ) )==0;
  8830. #endif
  8831. }
  8832. class DeleteOnFail : public ISphNoncopyable
  8833. {
  8834. public:
  8835. DeleteOnFail() : m_bShitHappened ( true )
  8836. {}
  8837. inline ~DeleteOnFail()
  8838. {
  8839. if ( m_bShitHappened )
  8840. {
  8841. ARRAY_FOREACH ( i, m_dWriters )
  8842. m_dWriters[i]->UnlinkFile();
  8843. ARRAY_FOREACH ( i, m_dAutofiles )
  8844. m_dAutofiles[i]->SetTemporary();
  8845. }
  8846. }
  8847. inline void AddWriter ( CSphWriter* pWr )
  8848. {
  8849. if ( pWr )
  8850. m_dWriters.Add ( pWr );
  8851. }
  8852. inline void AddAutofile ( CSphAutofile* pAf )
  8853. {
  8854. if ( pAf )
  8855. m_dAutofiles.Add ( pAf );
  8856. }
  8857. inline void AllIsDone()
  8858. {
  8859. m_bShitHappened = false;
  8860. }
  8861. private:
  8862. bool m_bShitHappened;
  8863. CSphVector<CSphWriter*> m_dWriters;
  8864. CSphVector<CSphAutofile*> m_dAutofiles;
  8865. };
  8866. int CSphIndex_VLN::Build ( const CSphVector<CSphSource*> & dSources, int iMemoryLimit, int iWriteBuffer )
  8867. {
  8868. assert ( dSources.GetLength() );
  8869. CSphVector<SphWordID_t> dHitlessWords;
  8870. if ( !LoadHitlessWords ( dHitlessWords ) )
  8871. return 0;
  8872. int iHitBuilderBufferSize = ( iWriteBuffer>0 )
  8873. ? Max ( iWriteBuffer, MIN_WRITE_BUFFER )
  8874. : DEFAULT_WRITE_BUFFER;
  8875. // vars shared between phases
  8876. CSphVector<CSphBin*> dBins;
  8877. SphOffset_t iSharedOffset = -1;
  8878. m_pDict->HitblockBegin();
  8879. // setup sources
  8880. ARRAY_FOREACH ( iSource, dSources )
  8881. {
  8882. CSphSource * pSource = dSources[iSource];
  8883. assert ( pSource );
  8884. pSource->SetDict ( m_pDict );
  8885. pSource->Setup ( m_tSettings );
  8886. }
  8887. // connect 1st source and fetch its schema
  8888. if ( !dSources[0]->Connect ( m_sLastError )
  8889. || !dSources[0]->IterateStart ( m_sLastError )
  8890. || !dSources[0]->UpdateSchema ( &m_tSchema, m_sLastError ) )
  8891. {
  8892. return 0;
  8893. }
  8894. if ( m_tSchema.m_dFields.GetLength()==0 )
  8895. {
  8896. m_sLastError.SetSprintf ( "No fields in schema - will not index" );
  8897. return 0;
  8898. }
  8899. // check docinfo
  8900. if ( m_tSchema.GetAttrsCount()==0 && m_tSettings.m_eDocinfo!=SPH_DOCINFO_NONE )
  8901. {
  8902. sphWarning ( "Attribute count is 0: switching to none docinfo" );
  8903. m_tSettings.m_eDocinfo = SPH_DOCINFO_NONE;
  8904. }
  8905. if ( dSources[0]->HasJoinedFields() && m_tSettings.m_eDocinfo==SPH_DOCINFO_INLINE )
  8906. {
  8907. m_sLastError.SetSprintf ( "got joined fields, but docinfo is 'inline' (fix your config file)" );
  8908. return 0;
  8909. }
  8910. if ( m_tSchema.GetAttrsCount()>0 && m_tSettings.m_eDocinfo==SPH_DOCINFO_NONE )
  8911. {
  8912. m_sLastError.SetSprintf ( "got attributes, but docinfo is 'none' (fix your config file)" );
  8913. return 0;
  8914. }
  8915. bool bHaveFieldMVAs = false;
  8916. int iFieldLens = -1;
  8917. CSphVector<int> dMvaIndexes;
  8918. CSphVector<CSphAttrLocator> dMvaLocators;
  8919. // ordinals and strings storage
  8920. CSphVector<int> dOrdinalAttrs;
  8921. CSphVector<int> dStringAttrs;
  8922. CSphVector<int> dWordcountAttrs;
  8923. for ( int i=0; i<m_tSchema.GetAttrsCount(); i++ )
  8924. {
  8925. const CSphColumnInfo & tCol = m_tSchema.GetAttr(i);
  8926. switch ( tCol.m_eAttrType )
  8927. {
  8928. case SPH_ATTR_UINT32SET:
  8929. if ( tCol.m_eSrc==SPH_ATTRSRC_FIELD )
  8930. bHaveFieldMVAs = true;
  8931. dMvaIndexes.Add ( i );
  8932. dMvaLocators.Add ( tCol.m_tLocator );
  8933. break;
  8934. case SPH_ATTR_ORDINAL:
  8935. if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_EXTERN )
  8936. dOrdinalAttrs.Add ( i );
  8937. break;
  8938. case SPH_ATTR_STRING:
  8939. case SPH_ATTR_JSON:
  8940. dStringAttrs.Add ( i );
  8941. break;
  8942. case SPH_ATTR_WORDCOUNT:
  8943. dWordcountAttrs.Add ( i );
  8944. break;
  8945. case SPH_ATTR_TOKENCOUNT:
  8946. if ( iFieldLens<0 )
  8947. iFieldLens = i;
  8948. break;
  8949. default:
  8950. break;
  8951. }
  8952. }
  8953. // no field lengths for docinfo=inline
  8954. assert ( m_tSettings.m_eDocinfo==SPH_DOCINFO_EXTERN || iFieldLens==-1 );
  8955. // this loop must NOT be merged with the previous one;
  8956. // mva64 must intentionally be after all the mva32
  8957. for ( int i=0; i<m_tSchema.GetAttrsCount(); i++ )
  8958. {
  8959. const CSphColumnInfo & tCol = m_tSchema.GetAttr(i);
  8960. if ( tCol.m_eAttrType!=SPH_ATTR_INT64SET )
  8961. continue;
  8962. if ( tCol.m_eSrc==SPH_ATTRSRC_FIELD )
  8963. bHaveFieldMVAs = true;
  8964. dMvaIndexes.Add ( i );
  8965. dMvaLocators.Add ( tCol.m_tLocator );
  8966. }
  8967. bool bGotMVA = ( dMvaIndexes.GetLength()!=0 );
  8968. if ( bGotMVA && m_tSettings.m_eDocinfo!=SPH_DOCINFO_EXTERN )
  8969. {
  8970. m_sLastError.SetSprintf ( "multi-valued attributes require docinfo=extern (fix your config file)" );
  8971. return 0;
  8972. }
  8973. bool bHaveOrdinals = ( dOrdinalAttrs.GetLength() > 0 );
  8974. if ( bHaveOrdinals && m_tSettings.m_eDocinfo!=SPH_DOCINFO_EXTERN )
  8975. {
  8976. m_sLastError.SetSprintf ( "ordinal string attributes require docinfo=extern (fix your config file)" );
  8977. return 0;
  8978. }
  8979. if ( dStringAttrs.GetLength() && m_tSettings.m_eDocinfo!=SPH_DOCINFO_EXTERN )
  8980. {
  8981. m_sLastError.SetSprintf ( "string attributes require docinfo=extern (fix your config file)" );
  8982. return 0;
  8983. }
  8984. CSphHitBuilder tHitBuilder ( m_tSettings, dHitlessWords, false, iHitBuilderBufferSize, m_pDict, &m_sLastError );
  8985. ////////////////////////////////////////////////
  8986. // collect and partially sort hits and docinfos
  8987. ////////////////////////////////////////////////
  8988. // killlist storage
  8989. CSphVector <SphAttr_t> dKillList;
  8990. // adjust memory requirements
  8991. int iOldLimit = iMemoryLimit;
  8992. // book memory to store at least 64K attribute rows
  8993. const int iDocinfoStride = DOCINFO_IDSIZE + m_tSchema.GetRowSize();
  8994. int iDocinfoMax = Max ( 65536, iMemoryLimit/16/iDocinfoStride/sizeof(DWORD) );
  8995. if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_NONE )
  8996. iDocinfoMax = 1;
  8997. // book at least 32 KB for ordinals, if needed
  8998. int iOrdinalPoolSize = Max ( 32768, iMemoryLimit/8 );
  8999. if ( !bHaveOrdinals )
  9000. iOrdinalPoolSize = 0;
  9001. // book at least 32 KB for field MVAs, if needed
  9002. int iFieldMVAPoolSize = Max ( 32768, iMemoryLimit/16 );
  9003. if ( bHaveFieldMVAs==0 )
  9004. iFieldMVAPoolSize = 0;
  9005. // book at least 2 MB for keywords dict, if needed
  9006. int iDictSize = 0;
  9007. if ( m_pDict->GetSettings().m_bWordDict )
  9008. iDictSize = Max ( MIN_KEYWORDS_DICT, iMemoryLimit/8 );
  9009. // do we have enough left for hits?
  9010. int iHitsMax = 1048576;
  9011. iMemoryLimit -= iDocinfoMax*iDocinfoStride*sizeof(DWORD) + iOrdinalPoolSize + iFieldMVAPoolSize + iDictSize;
  9012. if ( iMemoryLimit < iHitsMax*(int)sizeof(CSphWordHit) )
  9013. {
  9014. iMemoryLimit = iOldLimit + iHitsMax*sizeof(CSphWordHit) - iMemoryLimit;
  9015. sphWarn ( "collect_hits: mem_limit=%d kb too low, increasing to %d kb",
  9016. iOldLimit/1024, iMemoryLimit/1024 );
  9017. } else
  9018. {
  9019. iHitsMax = iMemoryLimit / sizeof(CSphWordHit);
  9020. }
  9021. // allocate raw hits block
  9022. CSphFixedVector<CSphWordHit> dHits ( iHitsMax + MAX_SOURCE_HITS );
  9023. CSphWordHit * pHits = dHits.Begin();
  9024. CSphWordHit * pHitsMax = dHits.Begin() + iHitsMax;
  9025. // allocate docinfos buffer
  9026. CSphFixedVector<DWORD> dDocinfos ( iDocinfoMax*iDocinfoStride );
  9027. DWORD * pDocinfo = dDocinfos.Begin();
  9028. const DWORD * pDocinfoMax = dDocinfos.Begin() + iDocinfoMax*iDocinfoStride;
  9029. if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_NONE )
  9030. {
  9031. pDocinfo = NULL;
  9032. pDocinfoMax = NULL;
  9033. }
  9034. int nOrdinals = 0;
  9035. SphOffset_t uMaxOrdinalAttrBlockSize = 0;
  9036. int iCurrentBlockSize = 0;
  9037. CSphVector < CSphVector < Ordinal_t > > dOrdinals;
  9038. dOrdinals.Resize ( dOrdinalAttrs.GetLength() );
  9039. ARRAY_FOREACH ( i, dOrdinals )
  9040. dOrdinals[i].Reserve ( 65536 );
  9041. CSphVector < CSphVector<SphOffset_t> > dOrdBlockSize;
  9042. dOrdBlockSize.Resize ( dOrdinalAttrs.GetLength () );
  9043. ARRAY_FOREACH ( i, dOrdBlockSize )
  9044. dOrdBlockSize[i].Reserve ( 8192 );
  9045. int iMaxOrdLen = 0;
  9046. CSphVector < MvaEntry_t > dFieldMVAs;
  9047. dFieldMVAs.Reserve ( 16384 );
  9048. CSphVector < SphOffset_t > dFieldMVABlocks;
  9049. dFieldMVABlocks.Reserve ( 4096 );
  9050. CSphVector < FieldMVARedirect_t > dFieldMvaIndexes;
  9051. if ( bHaveFieldMVAs )
  9052. dFieldMvaIndexes.Reserve ( 8 );
  9053. int iMaxPoolFieldMVAs = iFieldMVAPoolSize / sizeof ( MvaEntry_t );
  9054. int nFieldMVAs = 0;
  9055. CSphScopedPtr<CSphIndex_VLN> pPrevIndex(NULL);
  9056. if ( m_bKeepAttrs )
  9057. {
  9058. CSphString sWarning;
  9059. pPrevIndex = dynamic_cast<CSphIndex_VLN *>( sphCreateIndexPhrase ( NULL, m_sFilename.cstr() ) );
  9060. pPrevIndex->SetWordlistPreload ( false );
  9061. if ( !pPrevIndex->Prealloc ( false, false, sWarning ) || !pPrevIndex->Preread() )
  9062. pPrevIndex.Reset();
  9063. else
  9064. {
  9065. // check schemas
  9066. CSphString sError;
  9067. if ( !m_tSchema.CompareTo ( pPrevIndex->m_tSchema, sError, false ) )
  9068. {
  9069. sphWarn ( "schemas are different (%s); ignoring --keep-attrs", sError.cstr() );
  9070. pPrevIndex.Reset();
  9071. }
  9072. }
  9073. }
  9074. // create temp files
  9075. CSphAutofile fdLock ( GetIndexFileName("tmp0"), SPH_O_NEW, m_sLastError, true );
  9076. CSphAutofile fdHits ( GetIndexFileName ( m_bInplaceSettings ? "spp" : "tmp1" ), SPH_O_NEW, m_sLastError, !m_bInplaceSettings );
  9077. CSphAutofile fdDocinfos ( GetIndexFileName ( m_bInplaceSettings ? "spa" : "tmp2" ), SPH_O_NEW, m_sLastError, !m_bInplaceSettings );
  9078. CSphAutofile fdTmpFieldMVAs ( GetIndexFileName("tmp7"), SPH_O_NEW, m_sLastError, true );
  9079. CSphWriter tOrdWriter;
  9080. CSphWriter tStrWriter;
  9081. CSphString sRawOrdinalsFile = GetIndexFileName("tmp4");
  9082. if ( bHaveOrdinals && !tOrdWriter.OpenFile ( sRawOrdinalsFile.cstr (), m_sLastError ) )
  9083. return 0;
  9084. if ( !tStrWriter.OpenFile ( GetIndexFileName("sps"), m_sLastError ) )
  9085. return 0;
  9086. tStrWriter.PutByte ( 0 ); // dummy byte, to reserve magic zero offset
  9087. DeleteOnFail dFileWatchdog;
  9088. if ( m_bInplaceSettings )
  9089. {
  9090. dFileWatchdog.AddAutofile ( &fdHits );
  9091. dFileWatchdog.AddAutofile ( &fdDocinfos );
  9092. }
  9093. dFileWatchdog.AddWriter ( &tStrWriter );
  9094. if ( fdLock.GetFD()<0 || fdHits.GetFD()<0 || fdDocinfos.GetFD()<0 || fdTmpFieldMVAs.GetFD ()<0 )
  9095. return 0;
  9096. SphOffset_t iHitsGap = 0;
  9097. SphOffset_t iDocinfosGap = 0;
  9098. if ( m_bInplaceSettings )
  9099. {
  9100. const int HIT_SIZE_AVG = 4;
  9101. const float HIT_BLOCK_FACTOR = 1.0f;
  9102. const float DOCINFO_BLOCK_FACTOR = 1.0f;
  9103. if ( m_iHitGap )
  9104. iHitsGap = (SphOffset_t) m_iHitGap;
  9105. else
  9106. iHitsGap = (SphOffset_t)( iHitsMax*HIT_BLOCK_FACTOR*HIT_SIZE_AVG );
  9107. iHitsGap = Max ( iHitsGap, 1 );
  9108. sphSeek ( fdHits.GetFD (), iHitsGap, SEEK_SET );
  9109. if ( m_iDocinfoGap )
  9110. iDocinfosGap = (SphOffset_t) m_iDocinfoGap;
  9111. else
  9112. iDocinfosGap = (SphOffset_t)( iDocinfoMax*DOCINFO_BLOCK_FACTOR*iDocinfoStride*sizeof(DWORD) );
  9113. iDocinfosGap = Max ( iDocinfosGap, 1 );
  9114. sphSeek ( fdDocinfos.GetFD (), iDocinfosGap, SEEK_SET );
  9115. }
  9116. if ( !sphLockEx ( fdLock.GetFD(), false ) )
  9117. {
  9118. m_sLastError.SetSprintf ( "failed to lock '%s': another indexer running?", fdLock.GetFilename() );
  9119. return 0;
  9120. }
  9121. // setup accumulating docinfo IDs range
  9122. m_dMinRow.Reset ( m_tSchema.GetRowSize() );
  9123. m_iMinDocid = DOCID_MAX;
  9124. ARRAY_FOREACH ( i, m_dMinRow )
  9125. m_dMinRow[i] = ROWITEM_MAX;
  9126. // build raw log
  9127. // PROFILE_BEGIN ( collect_hits );
  9128. m_tStats.Reset ();
  9129. m_tProgress.m_ePhase = CSphIndexProgress::PHASE_COLLECT;
  9130. m_tProgress.m_iAttrs = 0;
  9131. CSphVector<int> dHitBlocks;
  9132. dHitBlocks.Reserve ( 1024 );
  9133. int iDocinfoBlocks = 0;
  9134. ARRAY_FOREACH ( iSource, dSources )
  9135. {
  9136. // connect and check schema, if it's not the first one
  9137. CSphSource * pSource = dSources[iSource];
  9138. if ( iSource )
  9139. {
  9140. if ( !pSource->Connect ( m_sLastError )
  9141. || !pSource->IterateStart ( m_sLastError )
  9142. || !pSource->UpdateSchema ( &m_tSchema, m_sLastError ) )
  9143. {
  9144. return 0;
  9145. }
  9146. if ( pSource->HasJoinedFields() && m_tSettings.m_eDocinfo==SPH_DOCINFO_INLINE )
  9147. {
  9148. m_sLastError.SetSprintf ( "got joined fields, but docinfo is 'inline' (fix your config file)" );
  9149. return 0;
  9150. }
  9151. }
  9152. dFieldMvaIndexes.Resize ( 0 );
  9153. ARRAY_FOREACH ( i, dMvaIndexes )
  9154. {
  9155. int iAttr = dMvaIndexes[i];
  9156. const CSphColumnInfo & tCol = m_tSchema.GetAttr ( iAttr );
  9157. if ( tCol.m_eSrc==SPH_ATTRSRC_FIELD )
  9158. {
  9159. FieldMVARedirect_t & tRedirect = dFieldMvaIndexes.Add();
  9160. tRedirect.m_tLocator = tCol.m_tLocator;
  9161. tRedirect.m_iAttr = iAttr;
  9162. tRedirect.m_iMVAAttr = i;
  9163. tRedirect.m_bMva64 = ( tCol.m_eAttrType==SPH_ATTR_INT64SET );
  9164. }
  9165. }
  9166. // joined filter
  9167. bool bGotJoined = ( m_tSettings.m_eDocinfo!=SPH_DOCINFO_INLINE ) && pSource->HasJoinedFields();
  9168. CSphVector<SphDocID_t> dAllIds; // FIXME! unlimited RAM use..
  9169. // fetch documents
  9170. for ( ;; )
  9171. {
  9172. // get next doc, and handle errors
  9173. bool bGotDoc = pSource->IterateDocument ( m_sLastError );
  9174. if ( !bGotDoc )
  9175. return 0;
  9176. // ensure docid is sane
  9177. if ( pSource->m_tDocInfo.m_iDocID==DOCID_MAX )
  9178. {
  9179. m_sLastError.SetSprintf ( "docid==DOCID_MAX (source broken?)" );
  9180. return 0;
  9181. }
  9182. // check for eof
  9183. if ( !pSource->m_tDocInfo.m_iDocID )
  9184. break;
  9185. if ( bGotJoined )
  9186. dAllIds.Add ( pSource->m_tDocInfo.m_iDocID );
  9187. // show progress bar
  9188. if ( ( pSource->GetStats().m_iTotalDocuments % 1000 )==0 )
  9189. {
  9190. m_tProgress.m_iDocuments = m_tStats.m_iTotalDocuments + pSource->GetStats().m_iTotalDocuments;
  9191. m_tProgress.m_iBytes = m_tStats.m_iTotalBytes + pSource->GetStats().m_iTotalBytes;
  9192. m_tProgress.Show ( false );
  9193. }
  9194. // update crashdump
  9195. g_iIndexerCurrentDocID = pSource->m_tDocInfo.m_iDocID;
  9196. g_iIndexerCurrentHits = pHits-dHits.Begin();
  9197. DWORD * pPrevDocinfo = NULL;
  9198. if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_EXTERN && pPrevIndex.Ptr() )
  9199. pPrevDocinfo = const_cast<DWORD*>( pPrevIndex->FindDocinfo ( pSource->m_tDocInfo.m_iDocID ) );
  9200. if ( dMvaIndexes.GetLength() && pPrevDocinfo && pPrevIndex->GetMVAPool() )
  9201. {
  9202. // fetch old mva values
  9203. ARRAY_FOREACH ( i, dMvaIndexes )
  9204. {
  9205. const CSphColumnInfo & tCol = m_tSchema.GetAttr ( dMvaIndexes[i] );
  9206. SphAttr_t uOff = sphGetRowAttr ( DOCINFO2ATTRS ( pPrevDocinfo ), tCol.m_tLocator );
  9207. if ( !uOff )
  9208. continue;
  9209. const DWORD * pMVA = pPrevIndex->GetMVAPool()+uOff;
  9210. int nMVAs = *pMVA++;
  9211. for ( int iMVA = 0; iMVA < nMVAs; iMVA++ )
  9212. {
  9213. MvaEntry_t & tMva = dFieldMVAs.Add();
  9214. tMva.m_uDocID = pSource->m_tDocInfo.m_iDocID;
  9215. tMva.m_iAttr = i;
  9216. if ( tCol.m_eAttrType==SPH_ATTR_INT64SET )
  9217. {
  9218. tMva.m_iValue = MVA_UPSIZE(pMVA);
  9219. pMVA++;
  9220. } else
  9221. tMva.m_iValue = *pMVA;
  9222. pMVA++;
  9223. int iLength = dFieldMVAs.GetLength ();
  9224. if ( iLength==iMaxPoolFieldMVAs )
  9225. {
  9226. dFieldMVAs.Sort ( CmpMvaEntries_fn () );
  9227. if ( !sphWriteThrottled ( fdTmpFieldMVAs.GetFD (), &dFieldMVAs[0],
  9228. iLength*sizeof(MvaEntry_t), "temp_field_mva", m_sLastError, &g_tThrottle ) )
  9229. return 0;
  9230. dFieldMVAs.Resize ( 0 );
  9231. nFieldMVAs += iMaxPoolFieldMVAs;
  9232. }
  9233. }
  9234. }
  9235. } else if ( bHaveFieldMVAs )
  9236. {
  9237. // store field MVAs
  9238. ARRAY_FOREACH ( i, dFieldMvaIndexes )
  9239. {
  9240. int iAttr = dFieldMvaIndexes[i].m_iAttr;
  9241. int iMVA = dFieldMvaIndexes[i].m_iMVAAttr;
  9242. bool bMva64 = dFieldMvaIndexes[i].m_bMva64;
  9243. int iStep = ( bMva64 ? 2 : 1 );
  9244. // store per-document MVAs
  9245. SphRange_t tFieldMva = pSource->IterateFieldMVAStart ( iAttr );
  9246. m_tProgress.m_iAttrs += ( tFieldMva.m_iLength / iStep );
  9247. assert ( ( tFieldMva.m_iStart + tFieldMva.m_iLength )<=pSource->m_dMva.GetLength() );
  9248. for ( int i=tFieldMva.m_iStart; i<( tFieldMva.m_iStart+tFieldMva.m_iLength); i+=iStep )
  9249. {
  9250. MvaEntry_t & tMva = dFieldMVAs.Add();
  9251. tMva.m_uDocID = pSource->m_tDocInfo.m_iDocID;
  9252. tMva.m_iAttr = iMVA;
  9253. if ( bMva64 )
  9254. {
  9255. tMva.m_iValue = MVA_UPSIZE ( pSource->m_dMva.Begin() + i );
  9256. } else
  9257. {
  9258. tMva.m_iValue = pSource->m_dMva[i];
  9259. }
  9260. int iLength = dFieldMVAs.GetLength ();
  9261. if ( iLength==iMaxPoolFieldMVAs )
  9262. {
  9263. dFieldMVAs.Sort ( CmpMvaEntries_fn () );
  9264. if ( !sphWriteThrottled ( fdTmpFieldMVAs.GetFD (), &dFieldMVAs[0],
  9265. iLength*sizeof(MvaEntry_t), "temp_field_mva", m_sLastError, &g_tThrottle ) )
  9266. return 0;
  9267. dFieldMVAs.Resize ( 0 );
  9268. nFieldMVAs += iMaxPoolFieldMVAs;
  9269. }
  9270. }
  9271. }
  9272. }
  9273. // store ordinals
  9274. iCurrentBlockSize += ( sizeof ( SphOffset_t ) + sizeof ( DWORD ) ) * dOrdinalAttrs.GetLength ();
  9275. ARRAY_FOREACH ( i, dOrdinalAttrs )
  9276. {
  9277. CSphVector<Ordinal_t> & dCol = dOrdinals[i];
  9278. dCol.Add();
  9279. Ordinal_t & tLastOrd = dCol.Last();
  9280. tLastOrd.m_uDocID = pSource->m_tDocInfo.m_iDocID;
  9281. Swap ( tLastOrd.m_sValue, pSource->m_dStrAttrs[dOrdinalAttrs[i]] );
  9282. int iOrdStrLen = strlen ( tLastOrd.m_sValue.cstr () );
  9283. if ( iOrdStrLen > MAX_ORDINAL_STR_LEN )
  9284. {
  9285. iMaxOrdLen = iOrdStrLen;
  9286. // truncate
  9287. iOrdStrLen = MAX_ORDINAL_STR_LEN;
  9288. tLastOrd.m_sValue = tLastOrd.m_sValue.SubString ( 0, iOrdStrLen - 1 );
  9289. }
  9290. iCurrentBlockSize += iOrdStrLen;
  9291. }
  9292. if ( bHaveOrdinals )
  9293. {
  9294. if ( iCurrentBlockSize>=iOrdinalPoolSize )
  9295. {
  9296. iCurrentBlockSize = 0;
  9297. nOrdinals += dOrdinals[0].GetLength ();
  9298. ARRAY_FOREACH ( i, dOrdinalAttrs )
  9299. {
  9300. CSphVector<Ordinal_t> & dCol = dOrdinals[i];
  9301. dCol.Sort ( CmpOrdinalsValue_fn() );
  9302. SphOffset_t uSize = DumpOrdinals ( tOrdWriter, dCol );
  9303. if ( !uSize )
  9304. {
  9305. m_sLastError = "dump ordinals: io error";
  9306. return 0;
  9307. }
  9308. if ( uSize > uMaxOrdinalAttrBlockSize )
  9309. uMaxOrdinalAttrBlockSize = uSize;
  9310. dOrdBlockSize[i].Add ( uSize );
  9311. dCol.Resize ( 0 );
  9312. }
  9313. }
  9314. }
  9315. // store strings and JSON blobs
  9316. if ( pPrevDocinfo )
  9317. {
  9318. CSphRowitem * pPrevAttrs = DOCINFO2ATTRS ( pPrevDocinfo );
  9319. ARRAY_FOREACH ( i, dStringAttrs )
  9320. {
  9321. const CSphAttrLocator & tLoc = m_tSchema.GetAttr ( dStringAttrs[i] ).m_tLocator;
  9322. SphAttr_t uPrevOff = sphGetRowAttr ( pPrevAttrs, tLoc );
  9323. BYTE * pBase = pPrevIndex->m_pStrings.GetWritePtr();
  9324. if ( !uPrevOff || !pBase )
  9325. sphSetRowAttr ( pPrevAttrs, tLoc, 0 );
  9326. else
  9327. {
  9328. const BYTE * pStr = NULL;
  9329. int iLen = sphUnpackStr ( pBase+uPrevOff, &pStr );
  9330. if ( !iLen )
  9331. sphSetRowAttr ( pPrevAttrs, tLoc, 0 );
  9332. else
  9333. {
  9334. SphOffset_t uOff = tStrWriter.GetPos();
  9335. if ( uint64_t(uOff)>>32 )
  9336. {
  9337. m_sLastError.SetSprintf ( "too many string attributes (current index format allows up to 4 GB)" );
  9338. return 0;
  9339. }
  9340. sphSetRowAttr ( pPrevAttrs, tLoc, DWORD(uOff) );
  9341. BYTE dPackedLen[4];
  9342. int iLenLen = sphPackStrlen ( dPackedLen, iLen );
  9343. tStrWriter.PutBytes ( &dPackedLen, iLenLen );
  9344. tStrWriter.PutBytes ( pStr, iLen );
  9345. }
  9346. }
  9347. }
  9348. } else
  9349. {
  9350. ARRAY_FOREACH ( i, dStringAttrs )
  9351. {
  9352. // FIXME! optimize locators etc?
  9353. // FIXME! support binary strings w/embedded zeroes?
  9354. // get data, calc length
  9355. const char * sData = pSource->m_dStrAttrs[dStringAttrs[i]].cstr();
  9356. int iLen = sData ? strlen ( sData ) : 0;
  9357. // no data
  9358. if ( !iLen )
  9359. {
  9360. pSource->m_tDocInfo.SetAttr ( m_tSchema.GetAttr ( dStringAttrs[i] ).m_tLocator, 0 );
  9361. continue;
  9362. }
  9363. // handle JSON
  9364. CSphVector<BYTE> dBuf; // FIXME? optimize?
  9365. if ( m_tSchema.GetAttr ( dStringAttrs[i] ).m_eAttrType==SPH_ATTR_JSON ) // FIXME? optimize?
  9366. {
  9367. // WARNING, tricky bit
  9368. // flex lexer needs last two (!) bytes to be zeroes
  9369. // asciiz string supplies one, and we fill out the extra one
  9370. // and that works, because CSphString always allocates a small extra gap
  9371. char * pData = const_cast<char*>(sData);
  9372. pData[iLen+1] = '\0';
  9373. if ( !sphJsonParse ( dBuf, pData, g_bJsonAutoconvNumbers, g_bJsonKeynamesToLowercase, m_sLastError ) )
  9374. {
  9375. m_sLastError.SetSprintf ( "document " DOCID_FMT ", attribute %s: JSON error: %s",
  9376. pSource->m_tDocInfo.m_iDocID, m_tSchema.GetAttr ( dStringAttrs[i] ).m_sName.cstr(),
  9377. m_sLastError.cstr() );
  9378. // bail?
  9379. if ( g_bJsonStrict )
  9380. return 0;
  9381. // warn and ignore
  9382. sphWarning ( "%s", m_sLastError.cstr() );
  9383. m_sLastError = "";
  9384. pSource->m_tDocInfo.SetAttr ( m_tSchema.GetAttr ( dStringAttrs[i] ).m_tLocator, 0 );
  9385. continue;
  9386. }
  9387. if ( !dBuf.GetLength() )
  9388. {
  9389. // empty SphinxBSON, need not save any data
  9390. pSource->m_tDocInfo.SetAttr ( m_tSchema.GetAttr ( dStringAttrs[i] ).m_tLocator, 0 );
  9391. continue;
  9392. }
  9393. // let's go save the newly built SphinxBSON blob
  9394. sData = (const char*)dBuf.Begin();
  9395. iLen = dBuf.GetLength();
  9396. }
  9397. // calc offset, do sanity checks
  9398. SphOffset_t uOff = tStrWriter.GetPos();
  9399. if ( uint64_t(uOff)>>32 )
  9400. {
  9401. m_sLastError.SetSprintf ( "too many string attributes (current index format allows up to 4 GB)" );
  9402. return 0;
  9403. }
  9404. pSource->m_tDocInfo.SetAttr ( m_tSchema.GetAttr ( dStringAttrs[i] ).m_tLocator, DWORD(uOff) );
  9405. // pack length, emit it, emit data
  9406. BYTE dPackedLen[4];
  9407. int iLenLen = sphPackStrlen ( dPackedLen, iLen );
  9408. tStrWriter.PutBytes ( &dPackedLen, iLenLen );
  9409. tStrWriter.PutBytes ( sData, iLen );
  9410. }
  9411. }
  9412. // count words
  9413. if ( !pPrevDocinfo )
  9414. ARRAY_FOREACH ( i, dWordcountAttrs )
  9415. {
  9416. int iAttr = dWordcountAttrs[i];
  9417. int iNumWords = CountWords ( pSource->m_dStrAttrs[iAttr], m_pTokenizer );
  9418. pSource->m_tDocInfo.SetAttr ( m_tSchema.GetAttr(iAttr).m_tLocator, iNumWords );
  9419. }
  9420. // docinfo=inline might be flushed while collecting hits
  9421. if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_INLINE )
  9422. {
  9423. // store next entry
  9424. DOCINFOSETID ( pDocinfo, pSource->m_tDocInfo.m_iDocID );
  9425. memcpy ( DOCINFO2ATTRS ( pDocinfo ), pSource->m_tDocInfo.m_pDynamic, sizeof(CSphRowitem)*m_tSchema.GetRowSize() );
  9426. pDocinfo += iDocinfoStride;
  9427. // update min docinfo
  9428. assert ( pSource->m_tDocInfo.m_iDocID );
  9429. m_iMinDocid = Min ( m_iMinDocid, pSource->m_tDocInfo.m_iDocID );
  9430. ARRAY_FOREACH ( i, m_dMinRow )
  9431. m_dMinRow[i] = Min ( m_dMinRow[i], pSource->m_tDocInfo.m_pDynamic[i] );
  9432. }
  9433. // store hits
  9434. while ( const ISphHits * pDocHits = pSource->IterateHits ( m_sLastWarning ) )
  9435. {
  9436. int iDocHits = pDocHits->Length();
  9437. #if PARANOID
  9438. for ( int i=0; i<iDocHits; i++ )
  9439. {
  9440. assert ( pDocHits->m_dData[i].m_iDocID==pSource->m_tDocInfo.m_iDocID );
  9441. assert ( pDocHits->m_dData[i].m_iWordID );
  9442. assert ( pDocHits->m_dData[i].m_iWordPos );
  9443. }
  9444. #endif
  9445. assert ( ( pHits+iDocHits )<=( pHitsMax+MAX_SOURCE_HITS ) );
  9446. memcpy ( pHits, pDocHits->First(), iDocHits*sizeof(CSphWordHit) );
  9447. pHits += iDocHits;
  9448. // check if we need to flush
  9449. if ( pHits<pHitsMax
  9450. && !( m_tSettings.m_eDocinfo==SPH_DOCINFO_INLINE && pDocinfo>=pDocinfoMax )
  9451. && !( iDictSize && m_pDict->HitblockGetMemUse() > iDictSize ) )
  9452. {
  9453. continue;
  9454. }
  9455. // update crashdump
  9456. g_iIndexerPoolStartDocID = pSource->m_tDocInfo.m_iDocID;
  9457. g_iIndexerPoolStartHit = pHits-dHits.Begin();
  9458. // sort hits
  9459. int iHits = pHits - dHits.Begin();
  9460. {
  9461. // PROFILE ( sort_hits );
  9462. sphSort ( dHits.Begin(), iHits, CmpHit_fn() );
  9463. m_pDict->HitblockPatch ( dHits.Begin(), iHits );
  9464. }
  9465. pHits = dHits.Begin();
  9466. if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_INLINE )
  9467. {
  9468. // we're inlining, so let's flush both hits and docs
  9469. int iDocs = ( pDocinfo - dDocinfos.Begin() ) / iDocinfoStride;
  9470. pDocinfo = dDocinfos.Begin();
  9471. sphSortDocinfos ( dDocinfos.Begin(), iDocs, iDocinfoStride );
  9472. dHitBlocks.Add ( tHitBuilder.cidxWriteRawVLB ( fdHits.GetFD(), dHits.Begin(), iHits,
  9473. dDocinfos.Begin(), iDocs, iDocinfoStride ) );
  9474. // we are inlining, so if there are more hits in this document,
  9475. // we'll need to know its info on the next flush
  9476. if ( iDocHits )
  9477. {
  9478. DOCINFOSETID ( pDocinfo, pSource->m_tDocInfo.m_iDocID );
  9479. memcpy ( DOCINFO2ATTRS ( pDocinfo ), pSource->m_tDocInfo.m_pDynamic, sizeof(CSphRowitem)*m_tSchema.GetRowSize() );
  9480. pDocinfo += iDocinfoStride;
  9481. }
  9482. } else
  9483. {
  9484. // we're not inlining, so only flush hits, docs are flushed independently
  9485. dHitBlocks.Add ( tHitBuilder.cidxWriteRawVLB ( fdHits.GetFD(), dHits.Begin(), iHits,
  9486. NULL, 0, 0 ) );
  9487. }
  9488. m_pDict->HitblockReset ();
  9489. if ( dHitBlocks.Last()<0 )
  9490. return 0;
  9491. // progress bar
  9492. m_tProgress.m_iHitsTotal += iHits;
  9493. m_tProgress.m_iDocuments = m_tStats.m_iTotalDocuments + pSource->GetStats().m_iTotalDocuments;
  9494. m_tProgress.m_iBytes = m_tStats.m_iTotalBytes + pSource->GetStats().m_iTotalBytes;
  9495. m_tProgress.Show ( false );
  9496. }
  9497. // update min docinfo
  9498. assert ( pSource->m_tDocInfo.m_iDocID );
  9499. m_iMinDocid = Min ( m_iMinDocid, pSource->m_tDocInfo.m_iDocID );
  9500. if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_INLINE )
  9501. {
  9502. ARRAY_FOREACH ( i, m_dMinRow )
  9503. m_dMinRow[i] = Min ( m_dMinRow[i], pSource->m_tDocInfo.m_pDynamic[i] );
  9504. }
  9505. // update total field lengths
  9506. if ( iFieldLens>=0 )
  9507. {
  9508. ARRAY_FOREACH ( i, m_tSchema.m_dFields )
  9509. m_dFieldLens[i] += pSource->m_tDocInfo.GetAttr ( m_tSchema.GetAttr ( i+iFieldLens ).m_tLocator );
  9510. }
  9511. // store docinfo
  9512. // with the advent of SPH_ATTR_TOKENCOUNT, now MUST be done AFTER iterating the hits
  9513. // because field lengths are computed during that iterating
  9514. if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_EXTERN )
  9515. {
  9516. // store next entry
  9517. DOCINFOSETID ( pDocinfo, pSource->m_tDocInfo.m_iDocID );
  9518. // old docinfo found, use it instead of the new one
  9519. const DWORD * pSrc = pPrevDocinfo ? DOCINFO2ATTRS ( pPrevDocinfo ) : pSource->m_tDocInfo.m_pDynamic;
  9520. memcpy ( DOCINFO2ATTRS ( pDocinfo ), pSrc, sizeof(CSphRowitem)*m_tSchema.GetRowSize() );
  9521. pDocinfo += iDocinfoStride;
  9522. // if not inlining, flush buffer if it's full
  9523. // (if inlining, it will be flushed later, along with the hits)
  9524. if ( pDocinfo>=pDocinfoMax )
  9525. {
  9526. assert ( pDocinfo==pDocinfoMax );
  9527. int iLen = iDocinfoMax*iDocinfoStride*sizeof(DWORD);
  9528. sphSortDocinfos ( dDocinfos.Begin(), iDocinfoMax, iDocinfoStride );
  9529. if ( !sphWriteThrottled ( fdDocinfos.GetFD(), dDocinfos.Begin(), iLen, "raw_docinfos", m_sLastError, &g_tThrottle ) )
  9530. return 0;
  9531. pDocinfo = dDocinfos.Begin();
  9532. iDocinfoBlocks++;
  9533. }
  9534. }
  9535. // go on, loop next document
  9536. }
  9537. // FIXME! uncontrolled memory usage; add checks and/or diskbased sort in the future?
  9538. if ( pSource->IterateKillListStart ( m_sLastError ) )
  9539. {
  9540. SphDocID_t tDocId;
  9541. while ( pSource->IterateKillListNext ( tDocId ) )
  9542. dKillList.Add ( tDocId );
  9543. }
  9544. // fetch joined fields
  9545. if ( bGotJoined )
  9546. {
  9547. dAllIds.Uniq();
  9548. SphDocID_t uLastID = 0;
  9549. bool bLastFound = 0;
  9550. for ( ;; )
  9551. {
  9552. // get next doc, and handle errors
  9553. ISphHits * pJoinedHits = pSource->IterateJoinedHits ( m_sLastError );
  9554. if ( !pJoinedHits )
  9555. return 0;
  9556. // ensure docid is sane
  9557. if ( pSource->m_tDocInfo.m_iDocID==DOCID_MAX )
  9558. {
  9559. m_sLastError.SetSprintf ( "joined_docid==DOCID_MAX (source broken?)" );
  9560. return 0;
  9561. }
  9562. // check for eof
  9563. if ( !pSource->m_tDocInfo.m_iDocID )
  9564. break;
  9565. // filter and store hits
  9566. for ( const CSphWordHit * pHit = pJoinedHits->First(); pHit<=pJoinedHits->Last(); pHit++ )
  9567. {
  9568. // flush if needed
  9569. if ( pHits>=pHitsMax )
  9570. {
  9571. // sort hits
  9572. int iHits = pHits - dHits.Begin();
  9573. {
  9574. // PROFILE ( sort_hits );
  9575. sphSort ( dHits.Begin(), iHits, CmpHit_fn() );
  9576. m_pDict->HitblockPatch ( dHits.Begin(), iHits );
  9577. }
  9578. pHits = dHits.Begin();
  9579. m_tProgress.m_iHitsTotal += iHits;
  9580. // we're not inlining, so only flush hits, docs are flushed independently
  9581. dHitBlocks.Add ( tHitBuilder.cidxWriteRawVLB ( fdHits.GetFD(), dHits.Begin(), iHits,
  9582. NULL, 0, 0 ) );
  9583. m_pDict->HitblockReset ();
  9584. if ( dHitBlocks.Last()<0 )
  9585. return 0;
  9586. }
  9587. // filter
  9588. SphDocID_t uHitID = pHit->m_iDocID;
  9589. if ( uHitID!=uLastID )
  9590. {
  9591. uLastID = uHitID;
  9592. bLastFound = ( dAllIds.BinarySearch ( uHitID )!=NULL );
  9593. }
  9594. // copy next hit
  9595. if ( bLastFound )
  9596. *pHits++ = *pHit;
  9597. }
  9598. }
  9599. }
  9600. // this source is over, disconnect and update stats
  9601. pSource->Disconnect ();
  9602. m_tStats.m_iTotalDocuments += pSource->GetStats().m_iTotalDocuments;
  9603. m_tStats.m_iTotalBytes += pSource->GetStats().m_iTotalBytes;
  9604. }
  9605. if ( m_tStats.m_iTotalDocuments>=INT_MAX )
  9606. {
  9607. m_sLastError.SetSprintf ( "index over %d documents not supported (got documents count="INT64_FMT")", INT_MAX, m_tStats.m_iTotalDocuments );
  9608. return 0;
  9609. }
  9610. // flush last docinfo block
  9611. int iDocinfoLastBlockSize = 0;
  9612. if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_EXTERN && pDocinfo>dDocinfos.Begin() )
  9613. {
  9614. iDocinfoLastBlockSize = ( pDocinfo - dDocinfos.Begin() ) / iDocinfoStride;
  9615. assert ( pDocinfo==( dDocinfos.Begin() + iDocinfoLastBlockSize*iDocinfoStride ) );
  9616. int iLen = iDocinfoLastBlockSize*iDocinfoStride*sizeof(DWORD);
  9617. sphSortDocinfos ( dDocinfos.Begin(), iDocinfoLastBlockSize, iDocinfoStride );
  9618. if ( !sphWriteThrottled ( fdDocinfos.GetFD(), dDocinfos.Begin(), iLen, "raw_docinfos", m_sLastError, &g_tThrottle ) )
  9619. return 0;
  9620. iDocinfoBlocks++;
  9621. }
  9622. // flush last hit block
  9623. if ( pHits>dHits.Begin() )
  9624. {
  9625. int iHits = pHits - dHits.Begin();
  9626. {
  9627. // PROFILE ( sort_hits );
  9628. sphSort ( dHits.Begin(), iHits, CmpHit_fn() );
  9629. m_pDict->HitblockPatch ( dHits.Begin(), iHits );
  9630. }
  9631. m_tProgress.m_iHitsTotal += iHits;
  9632. if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_INLINE )
  9633. {
  9634. int iDocs = ( pDocinfo - dDocinfos.Begin() ) / iDocinfoStride;
  9635. sphSortDocinfos ( dDocinfos.Begin(), iDocs, iDocinfoStride );
  9636. dHitBlocks.Add ( tHitBuilder.cidxWriteRawVLB ( fdHits.GetFD(), dHits.Begin(), iHits,
  9637. dDocinfos.Begin(), iDocs, iDocinfoStride ) );
  9638. } else
  9639. {
  9640. dHitBlocks.Add ( tHitBuilder.cidxWriteRawVLB ( fdHits.GetFD(), dHits.Begin(), iHits, NULL, 0, 0 ) );
  9641. }
  9642. m_pDict->HitblockReset ();
  9643. if ( dHitBlocks.Last()<0 )
  9644. return 0;
  9645. }
  9646. // flush last field MVA block
  9647. if ( bHaveFieldMVAs && dFieldMVAs.GetLength () )
  9648. {
  9649. int iLength = dFieldMVAs.GetLength ();
  9650. nFieldMVAs += iLength;
  9651. dFieldMVAs.Sort ( CmpMvaEntries_fn () );
  9652. if ( !sphWriteThrottled ( fdTmpFieldMVAs.GetFD (), &dFieldMVAs[0],
  9653. iLength*sizeof(MvaEntry_t), "temp_field_mva", m_sLastError, &g_tThrottle ) )
  9654. return 0;
  9655. dFieldMVAs.Reset ();
  9656. }
  9657. // flush last ordinals block
  9658. if ( bHaveOrdinals && dOrdinals[0].GetLength () )
  9659. {
  9660. nOrdinals += dOrdinals[0].GetLength ();
  9661. ARRAY_FOREACH ( i, dOrdinalAttrs )
  9662. {
  9663. CSphVector<Ordinal_t> & dCol = dOrdinals[i];
  9664. dCol.Sort ( CmpOrdinalsValue_fn() );
  9665. SphOffset_t uSize = DumpOrdinals ( tOrdWriter, dCol );
  9666. if ( !uSize )
  9667. {
  9668. m_sLastError = "dump ordinals: io error";
  9669. return 0;
  9670. }
  9671. if ( uSize > uMaxOrdinalAttrBlockSize )
  9672. uMaxOrdinalAttrBlockSize = uSize;
  9673. dOrdBlockSize[i].Add ( uSize );
  9674. dCol.Reset ();
  9675. }
  9676. }
  9677. m_tProgress.m_iDocuments = m_tStats.m_iTotalDocuments;
  9678. m_tProgress.m_iBytes = m_tStats.m_iTotalBytes;
  9679. m_tProgress.Show ( true );
  9680. // PROFILE_END ( collect_hits );
  9681. ///////////////////////////////////////
  9682. // collect and sort multi-valued attrs
  9683. ///////////////////////////////////////
  9684. if ( !BuildMVA ( dSources, dHits, iHitsMax*sizeof(CSphWordHit), fdTmpFieldMVAs.GetFD (), nFieldMVAs, iMaxPoolFieldMVAs, pPrevIndex.Ptr() ) )
  9685. return 0;
  9686. // reset persistent mva update pool
  9687. ::unlink ( GetIndexFileName("mvp").cstr() );
  9688. // reset hits pool
  9689. dHits.Reset ( 0 );
  9690. CSphString sFieldMVAFile = fdTmpFieldMVAs.GetFilename ();
  9691. fdTmpFieldMVAs.Close ();
  9692. ::unlink ( sFieldMVAFile.cstr () );
  9693. /////////////////
  9694. // sort docinfos
  9695. /////////////////
  9696. tOrdWriter.CloseFile ();
  9697. if ( tOrdWriter.IsError () )
  9698. return 0;
  9699. CSphString sSortedOrdinalIdFile = GetIndexFileName("tmp6");
  9700. // sort ordinals
  9701. if ( bHaveOrdinals && !dOrdBlockSize[0].GetLength () )
  9702. {
  9703. bHaveOrdinals = false;
  9704. ::unlink ( sRawOrdinalsFile.cstr () );
  9705. }
  9706. if ( bHaveOrdinals )
  9707. {
  9708. if ( iMaxOrdLen > MAX_ORDINAL_STR_LEN )
  9709. sphWarn ( "some ordinal attributes are too long (len=%d,max=%d)", iMaxOrdLen, MAX_ORDINAL_STR_LEN );
  9710. CSphString sUnsortedIdFile = GetIndexFileName("tmp5");
  9711. CSphAutofile fdRawOrdinals ( sRawOrdinalsFile.cstr (), SPH_O_READ, m_sLastError, true );
  9712. if ( fdRawOrdinals.GetFD () < 0 )
  9713. return 0;
  9714. const float ARENA_PERCENT = 0.5f;
  9715. int nBlocks = dOrdBlockSize[0].GetLength ();
  9716. SphOffset_t uMemNeededForReaders = SphOffset_t ( nBlocks ) * uMaxOrdinalAttrBlockSize;
  9717. SphOffset_t uMemNeededForSorting = sizeof ( OrdinalId_t ) * nOrdinals;
  9718. int iArenaSize = (int) Min ( SphOffset_t ( iMemoryLimit * ARENA_PERCENT ), uMemNeededForReaders );
  9719. iArenaSize = Max ( CSphBin::MIN_SIZE * nBlocks, iArenaSize );
  9720. int iOrdinalsInPool = (int)Min ( (SphOffset_t)( iMemoryLimit*( 1.0f-ARENA_PERCENT ) ), uMemNeededForSorting )/sizeof(OrdinalId_t);
  9721. if ( !SortOrdinals ( sUnsortedIdFile.cstr (), fdRawOrdinals.GetFD(),
  9722. iArenaSize, iOrdinalsInPool, dOrdBlockSize, iArenaSize < uMemNeededForReaders ) )
  9723. return 0;
  9724. CSphAutofile fdUnsortedId ( sUnsortedIdFile.cstr (), SPH_O_READ, m_sLastError, true );
  9725. if ( fdUnsortedId.GetFD () < 0 )
  9726. return 0;
  9727. iArenaSize = Min ( iMemoryLimit, (int)uMemNeededForSorting );
  9728. iArenaSize = Max ( CSphBin::MIN_SIZE * ( nOrdinals / iOrdinalsInPool + 1 ), iArenaSize );
  9729. if ( !SortOrdinalIds ( sSortedOrdinalIdFile.cstr (), fdUnsortedId.GetFD(),
  9730. iArenaSize, dOrdBlockSize, iArenaSize < uMemNeededForSorting ) )
  9731. return 0;
  9732. }
  9733. // initialize MVA reader
  9734. CSphAutoreader rdMva;
  9735. if ( !rdMva.Open ( GetIndexFileName("spm"), m_sLastError ) )
  9736. return 0;
  9737. SphDocID_t uMvaID = rdMva.GetDocid();
  9738. // initialize writer
  9739. int iDocinfoFD = -1;
  9740. SphOffset_t iDocinfoWritePos = 0;
  9741. CSphScopedPtr<CSphAutofile> pfdDocinfoFinal ( NULL );
  9742. if ( m_bInplaceSettings )
  9743. iDocinfoFD = fdDocinfos.GetFD ();
  9744. else
  9745. {
  9746. pfdDocinfoFinal = new CSphAutofile ( GetIndexFileName("spa"), SPH_O_NEW, m_sLastError );
  9747. iDocinfoFD = pfdDocinfoFinal->GetFD();
  9748. if ( iDocinfoFD < 0 )
  9749. return 0;
  9750. }
  9751. int iDupes = 0;
  9752. int iMinBlock = -1;
  9753. if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_EXTERN && iDocinfoBlocks )
  9754. {
  9755. // initialize readers
  9756. assert ( dBins.GetLength()==0 );
  9757. dBins.Reserve ( iDocinfoBlocks );
  9758. float fReadFactor = 1.0f;
  9759. float fRelocFactor = 0.0f;
  9760. if ( m_bInplaceSettings )
  9761. {
  9762. assert ( m_fRelocFactor > 0.005f && m_fRelocFactor < 0.95f );
  9763. fRelocFactor = m_fRelocFactor;
  9764. fReadFactor -= fRelocFactor;
  9765. }
  9766. int iBinSize = CSphBin::CalcBinSize ( int ( iMemoryLimit * fReadFactor ), iDocinfoBlocks, "sort_docinfos" );
  9767. int iRelocationSize = m_bInplaceSettings ? int ( iMemoryLimit * fRelocFactor ) : 0;
  9768. CSphFixedVector<BYTE> dRelocationBuffer ( iRelocationSize );
  9769. iSharedOffset = -1;
  9770. for ( int i=0; i<iDocinfoBlocks; i++ )
  9771. {
  9772. dBins.Add ( new CSphBin() );
  9773. dBins[i]->m_iFileLeft = ( ( i==iDocinfoBlocks-1 ) ? iDocinfoLastBlockSize : iDocinfoMax )*iDocinfoStride*sizeof(DWORD);
  9774. dBins[i]->m_iFilePos = ( i==0 ) ? iDocinfosGap : dBins[i-1]->m_iFilePos + dBins[i-1]->m_iFileLeft;
  9775. dBins[i]->Init ( fdDocinfos.GetFD(), &iSharedOffset, iBinSize );
  9776. }
  9777. SphOffset_t iDocinfoFileSize = 0;
  9778. if ( iDocinfoBlocks )
  9779. iDocinfoFileSize = dBins [iDocinfoBlocks-1]->m_iFilePos + dBins [iDocinfoBlocks-1]->m_iFileLeft;
  9780. // docinfo queue
  9781. CSphFixedVector<DWORD> dDocinfoQueue ( iDocinfoBlocks*iDocinfoStride );
  9782. CSphQueue < int, CmpQueuedDocinfo_fn > qDocinfo ( iDocinfoBlocks );
  9783. CmpQueuedDocinfo_fn::m_pStorage = dDocinfoQueue.Begin();
  9784. CmpQueuedDocinfo_fn::m_iStride = iDocinfoStride;
  9785. pDocinfo = dDocinfoQueue.Begin();
  9786. for ( int i=0; i<iDocinfoBlocks; i++ )
  9787. {
  9788. if ( dBins[i]->ReadBytes ( pDocinfo, iDocinfoStride*sizeof(DWORD) )!=BIN_READ_OK )
  9789. {
  9790. m_sLastError.SetSprintf ( "sort_docinfos: warmup failed (io error?)" );
  9791. return 0;
  9792. }
  9793. pDocinfo += iDocinfoStride;
  9794. qDocinfo.Push ( i );
  9795. }
  9796. CSphVector < CSphBin > dOrdReaders;
  9797. SphOffset_t iSharedOrdOffset = -1;
  9798. CSphAutofile fdTmpSortedIds ( sSortedOrdinalIdFile.cstr (), SPH_O_READ, m_sLastError, true );
  9799. if ( bHaveOrdinals )
  9800. {
  9801. if ( fdTmpSortedIds.GetFD () < 0 )
  9802. return 0;
  9803. dOrdReaders.Resize ( dOrdinalAttrs.GetLength () );
  9804. SphOffset_t uStart = 0;
  9805. ARRAY_FOREACH ( i, dOrdReaders )
  9806. {
  9807. dOrdReaders[i].m_iFileLeft = (int)dOrdBlockSize [i][0];
  9808. dOrdReaders[i].m_iFilePos = uStart;
  9809. dOrdReaders[i].Init ( fdTmpSortedIds.GetFD(), &iSharedOrdOffset, ORDINAL_READ_SIZE );
  9810. uStart += dOrdReaders[i].m_iFileLeft;
  9811. }
  9812. }
  9813. // while the queue has data for us
  9814. int iOrd = 0;
  9815. pDocinfo = dDocinfos.Begin();
  9816. SphDocID_t uLastId = 0;
  9817. m_uMinMaxIndex = 0;
  9818. // prepare the collector for min/max of attributes
  9819. AttrIndexBuilder_c tMinMax ( m_tSchema );
  9820. int64_t iMinMaxSize = tMinMax.GetExpectedSize ( m_tStats.m_iTotalDocuments );
  9821. if ( iMinMaxSize>INT_MAX || m_tStats.m_iTotalDocuments>INT_MAX )
  9822. {
  9823. m_sLastError.SetSprintf ( "attribute files (.spa) over 128 GB are not supported (min-max approximate="INT64_FMT", documents count="INT64_FMT")",
  9824. iMinMaxSize, m_tStats.m_iTotalDocuments );
  9825. return 0;
  9826. }
  9827. CSphFixedVector<DWORD> dMinMaxBuffer ( (int)iMinMaxSize );
  9828. // { fixed row + dummy value ( zero offset elimination ) + mva data for that row } fixed row - for MinMaxBuilder
  9829. CSphVector < DWORD > dMvaPool;
  9830. tMinMax.Prepare ( dMinMaxBuffer.Begin(), dMinMaxBuffer.Begin() + dMinMaxBuffer.GetLength() ); // FIXME!!! for over INT_MAX blocks
  9831. SphDocID_t uLastDupe = 0;
  9832. while ( qDocinfo.GetLength() )
  9833. {
  9834. // obtain bin index and next entry
  9835. int iBin = qDocinfo.Root();
  9836. DWORD * pEntry = dDocinfoQueue.Begin() + iBin*iDocinfoStride;
  9837. if ( DOCINFO2ID ( pEntry )<uLastId )
  9838. {
  9839. m_sLastError.SetSprintf ( "descending document prev id="DOCID_FMT", curr="DOCID_FMT" bin=%d", uLastId, DOCINFO2ID ( pEntry ), iBin );
  9840. return 0;
  9841. }
  9842. // skip duplicates
  9843. if ( DOCINFO2ID ( pEntry )==uLastId )
  9844. {
  9845. // dupe, report it
  9846. if ( m_tSettings.m_bVerbose && uLastDupe!=uLastId )
  9847. sphWarn ( "duplicated document id="DOCID_FMT, uLastId );
  9848. uLastDupe = uLastId;
  9849. iDupes++;
  9850. } else
  9851. {
  9852. // new unique document, handle it
  9853. // update ordinals
  9854. ARRAY_FOREACH ( i, dOrdinalAttrs )
  9855. {
  9856. OrdinalId_t Id;
  9857. if ( dOrdReaders[i].ReadBytes ( &Id, sizeof(Id) )!=BIN_READ_OK )
  9858. {
  9859. m_sLastError = "update ordinals: io error";
  9860. return 0;
  9861. }
  9862. assert ( Id.m_uDocID==DOCINFO2ID(pEntry) );
  9863. sphSetRowAttr ( DOCINFO2ATTRS(pEntry), m_tSchema.GetAttr(dOrdinalAttrs[i]).m_tLocator, Id.m_uId );
  9864. }
  9865. iOrd++;
  9866. m_uMinMaxIndex += iDocinfoStride;
  9867. CSphRowitem * pCollectibleRow = pEntry;
  9868. // update MVA
  9869. if ( bGotMVA )
  9870. {
  9871. // go to next id
  9872. while ( uMvaID<DOCINFO2ID(pEntry) )
  9873. {
  9874. ARRAY_FOREACH ( i, dMvaIndexes )
  9875. {
  9876. int iCount = rdMva.GetDword();
  9877. rdMva.SkipBytes ( iCount*sizeof(DWORD) );
  9878. }
  9879. uMvaID = rdMva.GetDocid();
  9880. if ( !uMvaID )
  9881. uMvaID = DOCID_MAX;
  9882. }
  9883. assert ( uMvaID>=DOCINFO2ID(pEntry) );
  9884. if ( uMvaID==DOCINFO2ID(pEntry) )
  9885. {
  9886. // fixed row + dummy value ( zero offset elimination )
  9887. dMvaPool.Resize ( iDocinfoStride+1 );
  9888. memcpy ( dMvaPool.Begin(), pEntry, iDocinfoStride * sizeof(DWORD) );
  9889. CSphRowitem * pAttr = DOCINFO2ATTRS ( pEntry );
  9890. ARRAY_FOREACH ( i, dMvaIndexes )
  9891. {
  9892. SphOffset_t iMvaOff = rdMva.GetPos()/sizeof(DWORD);
  9893. assert ( iMvaOff<UINT_MAX );
  9894. int iPoolOff = dMvaPool.GetLength();
  9895. sphSetRowAttr ( pAttr, dMvaLocators[i], iMvaOff );
  9896. // there is the cloned row at the beginning of MVA pool, lets skip it
  9897. sphSetRowAttr ( dMvaPool.Begin()+DOCINFO_IDSIZE, dMvaLocators[i], iPoolOff - iDocinfoStride );
  9898. DWORD iMvaCount = rdMva.GetDword();
  9899. dMvaPool.Resize ( iPoolOff+iMvaCount+1 );
  9900. dMvaPool[iPoolOff] = iMvaCount;
  9901. rdMva.GetBytes ( dMvaPool.Begin()+iPoolOff+1, sizeof(DWORD)*iMvaCount );
  9902. }
  9903. pCollectibleRow = dMvaPool.Begin();
  9904. uMvaID = rdMva.GetDocid();
  9905. if ( !uMvaID )
  9906. uMvaID = DOCID_MAX;
  9907. }
  9908. }
  9909. if ( !tMinMax.Collect ( pCollectibleRow, dMvaPool.Begin()+iDocinfoStride, dMvaPool.GetLength()-iDocinfoStride, m_sLastError, false ) )
  9910. return 0;
  9911. dMvaPool.Resize ( iDocinfoStride );
  9912. // emit it
  9913. memcpy ( pDocinfo, pEntry, iDocinfoStride*sizeof(DWORD) );
  9914. pDocinfo += iDocinfoStride;
  9915. uLastId = DOCINFO2ID(pEntry);
  9916. if ( pDocinfo>=pDocinfoMax )
  9917. {
  9918. int iLen = iDocinfoMax*iDocinfoStride*sizeof(DWORD);
  9919. if ( m_bInplaceSettings )
  9920. {
  9921. if ( iMinBlock==-1 || dBins[iMinBlock]->IsEOF () )
  9922. {
  9923. iMinBlock = -1;
  9924. ARRAY_FOREACH ( i, dBins )
  9925. if ( !dBins[i]->IsEOF () && ( iMinBlock==-1 || dBins [i]->m_iFilePos<dBins [iMinBlock]->m_iFilePos ) )
  9926. iMinBlock = i;
  9927. }
  9928. if ( iMinBlock!=-1 && ( iDocinfoWritePos + iLen ) > dBins[iMinBlock]->m_iFilePos )
  9929. {
  9930. if ( !RelocateBlock ( iDocinfoFD, dRelocationBuffer.Begin(), iRelocationSize, &iDocinfoFileSize, dBins[iMinBlock], &iSharedOffset ) )
  9931. return 0;
  9932. iMinBlock = (iMinBlock+1) % dBins.GetLength ();
  9933. }
  9934. sphSeek ( iDocinfoFD, iDocinfoWritePos, SEEK_SET );
  9935. iSharedOffset = iDocinfoWritePos;
  9936. }
  9937. if ( !sphWriteThrottled ( iDocinfoFD, dDocinfos.Begin(), iLen, "sort_docinfo", m_sLastError, &g_tThrottle ) )
  9938. return 0;
  9939. iDocinfoWritePos += iLen;
  9940. pDocinfo = dDocinfos.Begin();
  9941. }
  9942. }
  9943. // pop its index, update it, push its index again
  9944. qDocinfo.Pop ();
  9945. ESphBinRead eRes = dBins[iBin]->ReadBytes ( pEntry, iDocinfoStride*sizeof(DWORD) );
  9946. if ( eRes==BIN_READ_ERROR )
  9947. {
  9948. m_sLastError.SetSprintf ( "sort_docinfo: failed to read entry" );
  9949. return 0;
  9950. }
  9951. if ( eRes==BIN_READ_OK )
  9952. qDocinfo.Push ( iBin );
  9953. }
  9954. if ( pDocinfo>dDocinfos.Begin() )
  9955. {
  9956. assert ( 0==( pDocinfo-dDocinfos.Begin() ) % iDocinfoStride );
  9957. int iLen = ( pDocinfo - dDocinfos.Begin() )*sizeof(DWORD);
  9958. if ( m_bInplaceSettings )
  9959. sphSeek ( iDocinfoFD, iDocinfoWritePos, SEEK_SET );
  9960. if ( !sphWriteThrottled ( iDocinfoFD, dDocinfos.Begin(), iLen, "sort_docinfo", m_sLastError, &g_tThrottle ) )
  9961. return 0;
  9962. if ( m_bInplaceSettings )
  9963. if ( !sphTruncate ( iDocinfoFD ) )
  9964. sphWarn ( "failed to truncate %s", fdDocinfos.GetFilename() );
  9965. }
  9966. tMinMax.FinishCollect();
  9967. int64_t iMinMaxRealSize = tMinMax.GetActualSize() * sizeof(DWORD);
  9968. if ( !sphWriteThrottled ( iDocinfoFD, dMinMaxBuffer.Begin(), iMinMaxRealSize, "minmax_docinfo", m_sLastError, &g_tThrottle ) )
  9969. return 0;
  9970. // clean up readers
  9971. ARRAY_FOREACH ( i, dBins )
  9972. SafeDelete ( dBins[i] );
  9973. dBins.Reset ();
  9974. }
  9975. dDocinfos.Reset ( 0 );
  9976. pDocinfo = NULL;
  9977. // it might be zero-length, but it must exist
  9978. if ( m_bInplaceSettings )
  9979. fdDocinfos.Close ();
  9980. else
  9981. {
  9982. assert ( pfdDocinfoFinal.Ptr () );
  9983. pfdDocinfoFinal->Close ();
  9984. }
  9985. // dump killlist
  9986. CSphAutofile fdKillList ( GetIndexFileName("spk"), SPH_O_NEW, m_sLastError );
  9987. if ( fdKillList.GetFD()<0 )
  9988. return 0;
  9989. if ( dKillList.GetLength () )
  9990. {
  9991. dKillList.Uniq ();
  9992. m_iKillListSize = dKillList.GetLength ();
  9993. if ( !sphWriteThrottled ( fdKillList.GetFD(), &dKillList[0],
  9994. m_iKillListSize*sizeof(SphAttr_t), "kill list", m_sLastError, &g_tThrottle ) )
  9995. return 0;
  9996. }
  9997. fdKillList.Close ();
  9998. ///////////////////////////////////
  9999. // sort and write compressed index
  10000. ///////////////////////////////////
  10001. // PROFILE_BEGIN ( invert_hits );
  10002. // initialize readers
  10003. assert ( dBins.GetLength()==0 );
  10004. dBins.Reserve ( dHitBlocks.GetLength() );
  10005. iSharedOffset = -1;
  10006. float fReadFactor = 1.0f;
  10007. int iRelocationSize = 0;
  10008. iWriteBuffer = iHitBuilderBufferSize;
  10009. if ( m_bInplaceSettings )
  10010. {
  10011. assert ( m_fRelocFactor > 0.005f && m_fRelocFactor < 0.95f );
  10012. assert ( m_fWriteFactor > 0.005f && m_fWriteFactor < 0.95f );
  10013. assert ( m_fWriteFactor+m_fRelocFactor < 1.0f );
  10014. fReadFactor -= m_fRelocFactor + m_fWriteFactor;
  10015. iRelocationSize = int ( iMemoryLimit * m_fRelocFactor );
  10016. iWriteBuffer = int ( iMemoryLimit * m_fWriteFactor );
  10017. }
  10018. int iBinSize = CSphBin::CalcBinSize ( int ( iMemoryLimit * fReadFactor ),
  10019. dHitBlocks.GetLength() + m_pDict->GetSettings().m_bWordDict, "sort_hits" );
  10020. CSphFixedVector <BYTE> dRelocationBuffer ( iRelocationSize );
  10021. iSharedOffset = -1;
  10022. ARRAY_FOREACH ( i, dHitBlocks )
  10023. {
  10024. dBins.Add ( new CSphBin ( m_tSettings.m_eHitless, m_pDict->GetSettings().m_bWordDict ) );
  10025. dBins[i]->m_iFileLeft = dHitBlocks[i];
  10026. dBins[i]->m_iFilePos = ( i==0 ) ? iHitsGap : dBins[i-1]->m_iFilePos + dBins[i-1]->m_iFileLeft;
  10027. dBins[i]->Init ( fdHits.GetFD(), &iSharedOffset, iBinSize );
  10028. }
  10029. // if there were no hits, create zero-length index files
  10030. int iRawBlocks = dBins.GetLength();
  10031. //////////////////////////////
  10032. // create new index files set
  10033. //////////////////////////////
  10034. tHitBuilder.CreateIndexFiles ( GetIndexFileName("spd").cstr(), GetIndexFileName("spp").cstr(),
  10035. GetIndexFileName("spe").cstr(), m_bInplaceSettings, iWriteBuffer, fdHits, &iSharedOffset );
  10036. // dict files
  10037. CSphAutofile fdTmpDict ( GetIndexFileName("tmp8"), SPH_O_NEW, m_sLastError, true );
  10038. CSphAutofile fdDict ( GetIndexFileName("spi"), SPH_O_NEW, m_sLastError, false );
  10039. if ( fdTmpDict.GetFD()<0 || fdDict.GetFD()<0 )
  10040. return 0;
  10041. m_pDict->DictBegin ( fdTmpDict, fdDict, iBinSize, &g_tThrottle );
  10042. // adjust min IDs, and fill header
  10043. assert ( m_iMinDocid>0 );
  10044. m_iMinDocid--;
  10045. if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_INLINE )
  10046. ARRAY_FOREACH ( i, m_dMinRow )
  10047. m_dMinRow[i]--;
  10048. tHitBuilder.SetMin ( m_dMinRow.Begin(), m_dMinRow.GetLength() );
  10049. //////////////
  10050. // final sort
  10051. //////////////
  10052. if ( iRawBlocks )
  10053. {
  10054. int iLastBin = dBins.GetLength () - 1;
  10055. SphOffset_t iHitFileSize = dBins[iLastBin]->m_iFilePos + dBins [iLastBin]->m_iFileLeft;
  10056. CSphHitQueue tQueue ( iRawBlocks );
  10057. CSphAggregateHit tHit;
  10058. // initialize hitlist encoder state
  10059. tHitBuilder.HitReset();
  10060. // initial fill
  10061. int iRowitems = ( m_tSettings.m_eDocinfo==SPH_DOCINFO_INLINE ) ? m_tSchema.GetRowSize() : 0;
  10062. CSphFixedVector<CSphRowitem> dInlineAttrs ( iRawBlocks*iRowitems );
  10063. CSphFixedVector<BYTE> dActive ( iRawBlocks );
  10064. for ( int i=0; i<iRawBlocks; i++ )
  10065. {
  10066. if ( !dBins[i]->ReadHit ( &tHit, iRowitems, dInlineAttrs.Begin() + i * iRowitems ) )
  10067. {
  10068. m_sLastError.SetSprintf ( "sort_hits: warmup failed (io error?)" );
  10069. return 0;
  10070. }
  10071. dActive[i] = ( tHit.m_iWordID!=0 );
  10072. if ( dActive[i] )
  10073. tQueue.Push ( tHit, i );
  10074. }
  10075. // init progress meter
  10076. m_tProgress.m_ePhase = CSphIndexProgress::PHASE_SORT;
  10077. m_tProgress.m_iHits = 0;
  10078. // while the queue has data for us
  10079. // FIXME! analyze binsRead return code
  10080. int iHitsSorted = 0;
  10081. iMinBlock = -1;
  10082. while ( tQueue.m_iUsed )
  10083. {
  10084. int iBin = tQueue.m_pData->m_iBin;
  10085. // pack and emit queue root
  10086. tQueue.m_pData->m_iDocID -= m_iMinDocid;
  10087. if ( m_bInplaceSettings )
  10088. {
  10089. if ( iMinBlock==-1 || dBins[iMinBlock]->IsEOF () || !dActive[iMinBlock] )
  10090. {
  10091. iMinBlock = -1;
  10092. ARRAY_FOREACH ( i, dBins )
  10093. if ( !dBins[i]->IsEOF () && dActive[i] && ( iMinBlock==-1 || dBins[i]->m_iFilePos < dBins[iMinBlock]->m_iFilePos ) )
  10094. iMinBlock = i;
  10095. }
  10096. int iToWriteMax = 3*sizeof(DWORD);
  10097. if ( iMinBlock!=-1 && ( tHitBuilder.GetHitfilePos() + iToWriteMax ) > dBins[iMinBlock]->m_iFilePos )
  10098. {
  10099. if ( !RelocateBlock ( fdHits.GetFD (), dRelocationBuffer.Begin(), iRelocationSize, &iHitFileSize, dBins[iMinBlock], &iSharedOffset ) )
  10100. return 0;
  10101. iMinBlock = (iMinBlock+1) % dBins.GetLength ();
  10102. }
  10103. }
  10104. tHitBuilder.cidxHit ( tQueue.m_pData, iRowitems ? dInlineAttrs.Begin() + iBin * iRowitems : NULL );
  10105. if ( tHitBuilder.IsError() )
  10106. return 0;
  10107. // pop queue root and push next hit from popped bin
  10108. tQueue.Pop ();
  10109. if ( dActive[iBin] )
  10110. {
  10111. dBins[iBin]->ReadHit ( &tHit, iRowitems, dInlineAttrs.Begin() + iBin * iRowitems );
  10112. dActive[iBin] = ( tHit.m_iWordID!=0 );
  10113. if ( dActive[iBin] )
  10114. tQueue.Push ( tHit, iBin );
  10115. }
  10116. // progress
  10117. if ( ++iHitsSorted==1000000 )
  10118. {
  10119. m_tProgress.m_iHits += iHitsSorted;
  10120. m_tProgress.Show ( false );
  10121. iHitsSorted = 0;
  10122. }
  10123. }
  10124. m_tProgress.m_iHits = m_tProgress.m_iHitsTotal; // sum might be less than total because of dupes!
  10125. m_tProgress.Show ( true );
  10126. ARRAY_FOREACH ( i, dBins )
  10127. SafeDelete ( dBins[i] );
  10128. dBins.Reset ();
  10129. CSphAggregateHit tFlush;
  10130. tFlush.m_iDocID = 0;
  10131. tFlush.m_iWordID = 0;
  10132. tFlush.m_sKeyword = NULL;
  10133. tFlush.m_iWordPos = EMPTY_HIT;
  10134. tFlush.m_dFieldMask.Unset();
  10135. tHitBuilder.cidxHit ( &tFlush, NULL );
  10136. if ( m_bInplaceSettings )
  10137. {
  10138. tHitBuilder.CloseHitlist();
  10139. if ( !sphTruncate ( fdHits.GetFD () ) )
  10140. sphWarn ( "failed to truncate %s", fdHits.GetFilename() );
  10141. }
  10142. }
  10143. if ( iDupes )
  10144. sphWarn ( "%d duplicate document id pairs found", iDupes );
  10145. // PROFILE_END ( invert_hits );
  10146. BuildHeader_t tBuildHeader ( m_tStats );
  10147. if ( !tHitBuilder.cidxDone ( iMemoryLimit, m_tSettings.m_iMinInfixLen, m_pTokenizer->GetMaxCodepointLength(), &tBuildHeader ) )
  10148. return 0;
  10149. tBuildHeader.m_sHeaderExtension = "sph";
  10150. tBuildHeader.m_pMinRow = m_dMinRow.Begin();
  10151. tBuildHeader.m_iMinDocid = m_iMinDocid;
  10152. tBuildHeader.m_pThrottle = &g_tThrottle;
  10153. tBuildHeader.m_iKillListSize = m_iKillListSize;
  10154. tBuildHeader.m_uMinMaxIndex = m_uMinMaxIndex;
  10155. // we're done
  10156. if ( !BuildDone ( tBuildHeader, m_sLastError ) )
  10157. return 0;
  10158. // when the party's over..
  10159. ARRAY_FOREACH ( i, dSources )
  10160. dSources[i]->PostIndex ();
  10161. dFileWatchdog.AllIsDone();
  10162. return 1;
  10163. } // NOLINT function length
  10164. /////////////////////////////////////////////////////////////////////////////
  10165. // MERGER HELPERS
  10166. /////////////////////////////////////////////////////////////////////////////
  10167. static bool CopyFile ( const char * sSrc, const char * sDst, CSphString & sErrStr, ThrottleState_t * pThrottle )
  10168. {
  10169. assert ( sSrc );
  10170. assert ( sDst );
  10171. const DWORD iMaxBufSize = 1024 * 1024;
  10172. CSphAutofile tSrcFile ( sSrc, SPH_O_READ, sErrStr );
  10173. CSphAutofile tDstFile ( sDst, SPH_O_NEW, sErrStr );
  10174. if ( tSrcFile.GetFD()<0 || tDstFile.GetFD()<0 )
  10175. return false;
  10176. SphOffset_t iFileSize = tSrcFile.GetSize();
  10177. DWORD iBufSize = (DWORD) Min ( iFileSize, (SphOffset_t)iMaxBufSize );
  10178. if ( iFileSize )
  10179. {
  10180. BYTE * pData = new BYTE[iBufSize];
  10181. if ( !pData )
  10182. {
  10183. sErrStr.SetSprintf ( "memory allocation error" );
  10184. return false;
  10185. }
  10186. bool bError = true;
  10187. while ( iFileSize > 0 )
  10188. {
  10189. DWORD iSize = (DWORD) Min ( iFileSize, (SphOffset_t)iBufSize );
  10190. size_t iRead = sphReadThrottled ( tSrcFile.GetFD(), pData, iSize, pThrottle );
  10191. if ( iRead!=iSize )
  10192. {
  10193. sErrStr.SetSprintf ( "read error in %s; "INT64_FMT" of %d bytes read", sSrc, (int64_t)iRead, iSize );
  10194. break;
  10195. }
  10196. if ( !sphWriteThrottled ( tDstFile.GetFD(), pData, iSize, "CopyFile", sErrStr, pThrottle ) )
  10197. break;
  10198. iFileSize -= iSize;
  10199. if ( !iFileSize )
  10200. bError = false;
  10201. }
  10202. SafeDeleteArray ( pData );
  10203. return ( bError==false );
  10204. }
  10205. return true;
  10206. }
  10207. static void CopyRowString ( const BYTE * pBase, const CSphVector<CSphAttrLocator> & dString, CSphRowitem * pRow, CSphWriter & wrTo )
  10208. {
  10209. if ( !dString.GetLength() )
  10210. return;
  10211. CSphRowitem * pAttr = DOCINFO2ATTRS ( pRow );
  10212. ARRAY_FOREACH ( i, dString )
  10213. {
  10214. SphAttr_t uOff = sphGetRowAttr ( pAttr, dString[i] );
  10215. // magic offset? do nothing
  10216. if ( !uOff )
  10217. continue;
  10218. const BYTE * pStr = NULL;
  10219. int iLen = sphUnpackStr ( pBase + uOff, &pStr );
  10220. // no data? do nothing
  10221. if ( !iLen )
  10222. continue;
  10223. // copy bytes
  10224. uOff = (SphAttr_t)wrTo.GetPos();
  10225. assert ( uOff<UINT_MAX );
  10226. sphSetRowAttr ( pAttr, dString[i], uOff );
  10227. BYTE dPackedLen[4];
  10228. int iLenLen = sphPackStrlen ( dPackedLen, iLen );
  10229. wrTo.PutBytes ( &dPackedLen, iLenLen );
  10230. wrTo.PutBytes ( pStr, iLen );
  10231. }
  10232. }
  10233. static void CopyRowMVA ( const DWORD * pBase, const CSphVector<CSphAttrLocator> & dMva,
  10234. SphDocID_t iDocid, CSphRowitem * pRow, CSphWriter & wrTo )
  10235. {
  10236. if ( !dMva.GetLength() )
  10237. return;
  10238. CSphRowitem * pAttr = DOCINFO2ATTRS ( pRow );
  10239. bool bDocidWriten = false;
  10240. ARRAY_FOREACH ( i, dMva )
  10241. {
  10242. SphAttr_t uOff = sphGetRowAttr ( pAttr, dMva[i] );
  10243. if ( !uOff )
  10244. continue;
  10245. assert ( pBase );
  10246. if ( !bDocidWriten )
  10247. {
  10248. assert ( DOCINFO2ID ( pBase + uOff - DOCINFO_IDSIZE )==iDocid ); // there is DocID prior to 1st MVA
  10249. wrTo.PutDocid ( iDocid );
  10250. bDocidWriten = true;
  10251. }
  10252. assert ( wrTo.GetPos()/sizeof(DWORD)<=UINT_MAX );
  10253. SphAttr_t uNewOff = ( DWORD )wrTo.GetPos() / sizeof( DWORD );
  10254. sphSetRowAttr ( pAttr, dMva[i], uNewOff );
  10255. DWORD iValues = pBase[uOff];
  10256. wrTo.PutBytes ( pBase + uOff, ( iValues+1 )*sizeof(DWORD) );
  10257. }
  10258. }
  10259. static const int DOCLIST_HINT_THRESH = 256;
  10260. static int DoclistHintUnpack ( int iDocs, BYTE uHint )
  10261. {
  10262. if ( iDocs<DOCLIST_HINT_THRESH )
  10263. return 8*iDocs;
  10264. else
  10265. return 4*iDocs + (int)( int64_t(iDocs)*uHint/64 );
  10266. }
  10267. BYTE sphDoclistHintPack ( SphOffset_t iDocs, SphOffset_t iLen )
  10268. {
  10269. // we won't really store a hint for small lists
  10270. if ( iDocs<DOCLIST_HINT_THRESH )
  10271. return 0;
  10272. // for bigger lists len/docs varies 4x-6x on test indexes
  10273. // so lets assume that 4x-8x should be enough for everybody
  10274. SphOffset_t iDelta = Min ( Max ( iLen-4*iDocs, 0 ), 4*iDocs-1 ); // len delta over 4x, clamped to [0x..4x) range
  10275. BYTE uHint = (BYTE)( 64*iDelta/iDocs ); // hint now must be in [0..256) range
  10276. while ( uHint<255 && ( iDocs*uHint/64 )<iDelta ) // roundoff (suddenly, my guru math skillz failed me)
  10277. uHint++;
  10278. return uHint;
  10279. }
  10280. // !COMMIT eliminate this, move to dict (or at least couple with CWordlist)
  10281. class CSphDictReader
  10282. {
  10283. public:
  10284. // current word
  10285. SphWordID_t m_iWordID;
  10286. SphOffset_t m_iDoclistOffset;
  10287. int m_iDocs;
  10288. int m_iHits;
  10289. bool m_bHasHitlist;
  10290. int m_iHint;
  10291. private:
  10292. ESphHitless m_eHitless;
  10293. CSphAutoreader m_tMyReader;
  10294. CSphReader * m_pReader;
  10295. SphOffset_t m_iMaxPos;
  10296. bool m_bWordDict;
  10297. char m_sWord[MAX_KEYWORD_BYTES];
  10298. int m_iCheckpoint;
  10299. bool m_bHasSkips;
  10300. public:
  10301. CSphDictReader()
  10302. : m_iWordID ( 0 )
  10303. , m_iDoclistOffset ( 0 )
  10304. , m_iHint ( 0 )
  10305. , m_iMaxPos ( 0 )
  10306. , m_bWordDict ( false )
  10307. , m_iCheckpoint ( 1 )
  10308. , m_bHasSkips ( false )
  10309. {
  10310. m_sWord[0] = '\0';
  10311. }
  10312. bool Setup ( const CSphString & sFilename, SphOffset_t iMaxPos, ESphHitless eHitless,
  10313. CSphString & sError, bool bWordDict, ThrottleState_t * pThrottle, bool bHasSkips )
  10314. {
  10315. if ( !m_tMyReader.Open ( sFilename, sError ) )
  10316. return false;
  10317. Setup ( &m_tMyReader, iMaxPos, eHitless, bWordDict, pThrottle, bHasSkips );
  10318. return true;
  10319. }
  10320. void Setup ( CSphReader * pReader, SphOffset_t iMaxPos, ESphHitless eHitless, bool bWordDict, ThrottleState_t * pThrottle, bool bHasSkips )
  10321. {
  10322. m_pReader = pReader;
  10323. m_pReader->SetThrottle ( pThrottle );
  10324. m_pReader->SeekTo ( 1, READ_NO_SIZE_HINT );
  10325. m_iMaxPos = iMaxPos;
  10326. m_eHitless = eHitless;
  10327. m_bWordDict = bWordDict;
  10328. m_sWord[0] = '\0';
  10329. m_iCheckpoint = 1;
  10330. m_bHasSkips = bHasSkips;
  10331. }
  10332. bool Read()
  10333. {
  10334. if ( m_pReader->GetPos()>=m_iMaxPos )
  10335. return false;
  10336. // get leading value
  10337. SphWordID_t iWord0 = m_bWordDict ? m_pReader->GetByte() : m_pReader->UnzipWordid();
  10338. if ( !iWord0 )
  10339. {
  10340. // handle checkpoint
  10341. m_iCheckpoint++;
  10342. m_pReader->UnzipOffset();
  10343. m_iWordID = 0;
  10344. m_iDoclistOffset = 0;
  10345. m_sWord[0] = '\0';
  10346. if ( m_pReader->GetPos()>=m_iMaxPos )
  10347. return false;
  10348. iWord0 = m_bWordDict ? m_pReader->GetByte() : m_pReader->UnzipWordid(); // get next word
  10349. }
  10350. if ( !iWord0 )
  10351. return false; // some failure
  10352. // get word entry
  10353. if ( m_bWordDict )
  10354. {
  10355. // unpack next word
  10356. // must be in sync with DictEnd()!
  10357. assert ( iWord0<=255 );
  10358. BYTE uPack = (BYTE) iWord0;
  10359. int iMatch, iDelta;
  10360. if ( uPack & 0x80 )
  10361. {
  10362. iDelta = ( ( uPack>>4 ) & 7 ) + 1;
  10363. iMatch = uPack & 15;
  10364. } else
  10365. {
  10366. iDelta = uPack & 127;
  10367. iMatch = m_pReader->GetByte();
  10368. }
  10369. assert ( iMatch+iDelta<(int)sizeof(m_sWord)-1 );
  10370. assert ( iMatch<=(int)strlen(m_sWord) );
  10371. m_pReader->GetBytes ( m_sWord + iMatch, iDelta );
  10372. m_sWord [ iMatch+iDelta ] = '\0';
  10373. m_iDoclistOffset = m_pReader->UnzipOffset();
  10374. m_iDocs = m_pReader->UnzipInt();
  10375. m_iHits = m_pReader->UnzipInt();
  10376. m_iHint = 0;
  10377. if ( m_iDocs>=DOCLIST_HINT_THRESH )
  10378. m_iHint = m_pReader->GetByte();
  10379. DoclistHintUnpack ( m_iDocs, (BYTE) m_iHint );
  10380. if ( m_bHasSkips && ( m_iDocs > SPH_SKIPLIST_BLOCK ) )
  10381. m_pReader->UnzipInt();
  10382. m_iWordID = (SphWordID_t) sphCRC32 ( GetWord() ); // set wordID for indexing
  10383. } else
  10384. {
  10385. m_iWordID += iWord0;
  10386. m_iDoclistOffset += m_pReader->UnzipOffset();
  10387. m_iDocs = m_pReader->UnzipInt();
  10388. m_iHits = m_pReader->UnzipInt();
  10389. if ( m_bHasSkips && ( m_iDocs > SPH_SKIPLIST_BLOCK ) )
  10390. m_pReader->UnzipOffset();
  10391. }
  10392. m_bHasHitlist =
  10393. ( m_eHitless==SPH_HITLESS_NONE ) ||
  10394. ( m_eHitless==SPH_HITLESS_SOME && !( m_iDocs & 0x80000000 ) );
  10395. m_iDocs = m_eHitless==SPH_HITLESS_SOME ? ( m_iDocs & 0x7FFFFFFF ) : m_iDocs;
  10396. return true; // FIXME? errorflag?
  10397. }
  10398. int CmpWord ( const CSphDictReader & tOther ) const
  10399. {
  10400. if ( m_bWordDict )
  10401. return strcmp ( m_sWord, tOther.m_sWord );
  10402. int iRes = 0;
  10403. iRes = m_iWordID<tOther.m_iWordID ? -1 : iRes;
  10404. iRes = m_iWordID>tOther.m_iWordID ? 1 : iRes;
  10405. return iRes;
  10406. }
  10407. BYTE * GetWord() const { return (BYTE *)m_sWord; }
  10408. int GetCheckpoint() const { return m_iCheckpoint; }
  10409. };
  10410. static ISphFilter * CreateMergeFilters ( const CSphVector<CSphFilterSettings> & dSettings,
  10411. const CSphSchema & tSchema, const DWORD * pMvaPool, const BYTE * pStrings )
  10412. {
  10413. CSphString sError;
  10414. ISphFilter * pResult = NULL;
  10415. ARRAY_FOREACH ( i, dSettings )
  10416. {
  10417. ISphFilter * pFilter = sphCreateFilter ( dSettings[i], tSchema, pMvaPool, pStrings, sError );
  10418. if ( pFilter )
  10419. pResult = sphJoinFilters ( pResult, pFilter );
  10420. }
  10421. return pResult;
  10422. }
  10423. static bool CheckDocsCount ( int64_t iDocs, CSphString & sError )
  10424. {
  10425. if ( iDocs<INT_MAX )
  10426. return true;
  10427. sError.SetSprintf ( "index over %d documents not supported (got "INT64_FMT" documents)", INT_MAX, iDocs );
  10428. return false;
  10429. }
  10430. class CSphMerger
  10431. {
  10432. private:
  10433. CSphFixedVector<CSphRowitem> m_dInlineRow;
  10434. CSphHitBuilder * m_pHitBuilder;
  10435. SphDocID_t m_iMinID;
  10436. public:
  10437. explicit CSphMerger ( CSphHitBuilder * pHitBuilder, int iInlineCount, SphDocID_t iMinID )
  10438. : m_dInlineRow ( iInlineCount )
  10439. , m_pHitBuilder ( pHitBuilder )
  10440. , m_iMinID ( iMinID )
  10441. {
  10442. }
  10443. template < typename QWORD > static inline
  10444. void PrepareQword ( QWORD & tQword, const CSphDictReader & tReader, SphDocID_t iMinID, bool bWordDict ) //NOLINT
  10445. {
  10446. tQword.m_iMinID = iMinID;
  10447. tQword.m_tDoc.m_iDocID = iMinID;
  10448. tQword.m_iDocs = tReader.m_iDocs;
  10449. tQword.m_iHits = tReader.m_iHits;
  10450. tQword.m_bHasHitlist = tReader.m_bHasHitlist;
  10451. tQword.m_uHitPosition = 0;
  10452. tQword.m_iHitlistPos = 0;
  10453. if ( bWordDict )
  10454. tQword.m_rdDoclist.SeekTo ( tReader.m_iDoclistOffset, tReader.m_iHint );
  10455. }
  10456. template < typename QWORD >
  10457. inline bool NextDocument ( QWORD & tQword, const CSphIndex_VLN * pSourceIndex, const ISphFilter * pFilter )
  10458. {
  10459. for ( ;; )
  10460. {
  10461. tQword.GetNextDoc ( m_dInlineRow.Begin() );
  10462. if ( tQword.m_tDoc.m_iDocID )
  10463. {
  10464. tQword.SeekHitlist ( tQword.m_iHitlistPos );
  10465. if ( pFilter )
  10466. {
  10467. CSphMatch tMatch;
  10468. tMatch.m_iDocID = tQword.m_tDoc.m_iDocID;
  10469. if ( pFilter->UsesAttrs() )
  10470. {
  10471. if ( m_dInlineRow.GetLength() )
  10472. tMatch.m_pDynamic = m_dInlineRow.Begin();
  10473. else
  10474. {
  10475. const DWORD * pInfo = pSourceIndex->FindDocinfo ( tQword.m_tDoc.m_iDocID );
  10476. tMatch.m_pStatic = pInfo?DOCINFO2ATTRS ( pInfo ):NULL;
  10477. }
  10478. }
  10479. bool bResult = pFilter->Eval ( tMatch );
  10480. tMatch.m_pDynamic = NULL;
  10481. if ( !bResult )
  10482. {
  10483. while ( tQword.m_bHasHitlist && tQword.GetNextHit()!=EMPTY_HIT );
  10484. continue;
  10485. }
  10486. }
  10487. return true;
  10488. } else
  10489. return false;
  10490. }
  10491. }
  10492. template < typename QWORD >
  10493. inline void TransferData ( QWORD & tQword, SphWordID_t iWordID, BYTE * sWord,
  10494. const CSphIndex_VLN * pSourceIndex, const ISphFilter * pFilter )
  10495. {
  10496. CSphAggregateHit tHit;
  10497. tHit.m_iWordID = iWordID;
  10498. tHit.m_sKeyword = sWord;
  10499. tHit.m_dFieldMask.Unset();
  10500. while ( CSphMerger::NextDocument ( tQword, pSourceIndex, pFilter ) )
  10501. {
  10502. if ( tQword.m_bHasHitlist )
  10503. TransferHits ( tQword, tHit );
  10504. else
  10505. {
  10506. // convert to aggregate if there is no hit-list
  10507. tHit.m_iDocID = tQword.m_tDoc.m_iDocID - m_iMinID;
  10508. tHit.m_dFieldMask = tQword.m_dQwordFields;
  10509. tHit.SetAggrCount ( tQword.m_uMatchHits );
  10510. m_pHitBuilder->cidxHit ( &tHit, m_dInlineRow.Begin() );
  10511. }
  10512. }
  10513. }
  10514. template < typename QWORD >
  10515. inline void TransferHits ( QWORD & tQword, CSphAggregateHit & tHit )
  10516. {
  10517. assert ( tQword.m_bHasHitlist );
  10518. tHit.m_iDocID = tQword.m_tDoc.m_iDocID - m_iMinID;
  10519. for ( Hitpos_t uHit = tQword.GetNextHit(); uHit!=EMPTY_HIT; uHit = tQword.GetNextHit() )
  10520. {
  10521. tHit.m_iWordPos = uHit;
  10522. m_pHitBuilder->cidxHit ( &tHit, m_dInlineRow.Begin() );
  10523. }
  10524. }
  10525. template < typename QWORD >
  10526. static inline void ConfigureQword ( QWORD & tQword, CSphAutofile & tHits, CSphAutofile & tDocs,
  10527. int iDynamic, int iInline, const CSphRowitem * pMin, ThrottleState_t * pThrottle )
  10528. {
  10529. tQword.m_iInlineAttrs = iInline;
  10530. tQword.m_pInlineFixup = iInline ? pMin : NULL;
  10531. tQword.m_rdHitlist.SetThrottle ( pThrottle );
  10532. tQword.m_rdHitlist.SetFile ( tHits );
  10533. tQword.m_rdHitlist.GetByte();
  10534. tQword.m_rdDoclist.SetThrottle ( pThrottle );
  10535. tQword.m_rdDoclist.SetFile ( tDocs );
  10536. tQword.m_rdDoclist.GetByte();
  10537. tQword.m_tDoc.Reset ( iDynamic );
  10538. }
  10539. const CSphRowitem * GetInline () const { return m_dInlineRow.Begin(); }
  10540. CSphRowitem * AcquireInline () const { return m_dInlineRow.Begin(); }
  10541. };
  10542. template < typename QWORDDST, typename QWORDSRC >
  10543. bool CSphIndex_VLN::MergeWords ( const CSphIndex_VLN * pDstIndex, const CSphIndex_VLN * pSrcIndex,
  10544. const ISphFilter * pFilter, SphDocID_t iMinID, CSphHitBuilder * pHitBuilder, CSphString & sError,
  10545. CSphSourceStats & tStat, CSphIndexProgress & tProgress, ThrottleState_t * pThrottle )
  10546. {
  10547. CSphAutofile tDummy;
  10548. pHitBuilder->CreateIndexFiles ( pDstIndex->GetIndexFileName("tmp.spd").cstr(),
  10549. pDstIndex->GetIndexFileName("tmp.spp").cstr(),
  10550. pDstIndex->GetIndexFileName("tmp.spe").cstr(),
  10551. false, 0, tDummy, NULL );
  10552. CSphDictReader tDstReader;
  10553. CSphDictReader tSrcReader;
  10554. bool bWordDict = pHitBuilder->IsWordDict();
  10555. if ( !tDstReader.Setup ( pDstIndex->GetIndexFileName("spi"), pDstIndex->m_tWordlist.m_iWordsEnd,
  10556. pDstIndex->m_tSettings.m_eHitless, sError, bWordDict, pThrottle, pDstIndex->m_tWordlist.m_bHaveSkips ) )
  10557. return false;
  10558. if ( !tSrcReader.Setup ( pSrcIndex->GetIndexFileName("spi"), pSrcIndex->m_tWordlist.m_iWordsEnd,
  10559. pSrcIndex->m_tSettings.m_eHitless, sError, bWordDict, pThrottle, pSrcIndex->m_tWordlist.m_bHaveSkips ) )
  10560. return false;
  10561. const SphDocID_t iDstMinID = pDstIndex->m_iMinDocid;
  10562. const SphDocID_t iSrcMinID = pSrcIndex->m_iMinDocid;
  10563. /// prepare for indexing
  10564. pHitBuilder->HitblockBegin();
  10565. pHitBuilder->HitReset();
  10566. pHitBuilder->SetMin ( pDstIndex->m_dMinRow.Begin(), pDstIndex->m_dMinRow.GetLength() );
  10567. /// setup qwords
  10568. QWORDDST tDstQword ( false, false );
  10569. QWORDSRC tSrcQword ( false, false );
  10570. CSphAutofile fSrcDocs, fSrcHits;
  10571. fSrcDocs.Open ( pSrcIndex->GetIndexFileName("spd"), SPH_O_READ, sError );
  10572. fSrcHits.Open ( pSrcIndex->GetIndexFileName("spp"), SPH_O_READ, sError );
  10573. CSphAutofile fDstDocs, fDstHits;
  10574. fDstDocs.Open ( pDstIndex->GetIndexFileName("spd"), SPH_O_READ, sError );
  10575. fDstHits.Open ( pDstIndex->GetIndexFileName("spp"), SPH_O_READ, sError );
  10576. if ( !sError.IsEmpty() )
  10577. return false;
  10578. int iDstInlineSize = pDstIndex->m_tSettings.m_eDocinfo==SPH_DOCINFO_INLINE ? pDstIndex->m_tSchema.GetRowSize() : 0;
  10579. int iSrcInlineSize = pSrcIndex->m_tSettings.m_eDocinfo==SPH_DOCINFO_INLINE ? pSrcIndex->m_tSchema.GetRowSize() : 0;
  10580. CSphMerger tMerger ( pHitBuilder, Max ( iDstInlineSize, iSrcInlineSize ), iMinID );
  10581. CSphMerger::ConfigureQword<QWORDDST> ( tDstQword, fDstHits, fDstDocs,
  10582. pDstIndex->m_tSchema.GetDynamicSize(), iDstInlineSize,
  10583. pDstIndex->m_dMinRow.Begin(), pThrottle );
  10584. CSphMerger::ConfigureQword<QWORDSRC> ( tSrcQword, fSrcHits, fSrcDocs,
  10585. pSrcIndex->m_tSchema.GetDynamicSize(), iSrcInlineSize,
  10586. pSrcIndex->m_dMinRow.Begin(), pThrottle );
  10587. /// merge
  10588. bool bDstWord = tDstReader.Read();
  10589. bool bSrcWord = tSrcReader.Read();
  10590. tProgress.m_ePhase = CSphIndexProgress::PHASE_MERGE;
  10591. tProgress.Show ( false );
  10592. int iWords = 0;
  10593. int iHitlistsDiscarded = 0;
  10594. while ( bDstWord || bSrcWord )
  10595. {
  10596. if ( iWords==1000 )
  10597. {
  10598. tProgress.m_iWords += 1000;
  10599. tProgress.Show ( false );
  10600. iWords = 0;
  10601. }
  10602. const int iCmp = tDstReader.CmpWord ( tSrcReader );
  10603. if ( !bSrcWord || ( bDstWord && iCmp<0 ) )
  10604. {
  10605. // transfer documents and hits from destination
  10606. CSphMerger::PrepareQword<QWORDDST> ( tDstQword, tDstReader, iDstMinID, bWordDict );
  10607. tMerger.TransferData<QWORDDST> ( tDstQword, tDstReader.m_iWordID, tDstReader.GetWord(), pDstIndex, pFilter );
  10608. iWords++;
  10609. bDstWord = tDstReader.Read();
  10610. } else if ( !bDstWord || ( bSrcWord && iCmp>0 ) )
  10611. {
  10612. // transfer documents and hits from source
  10613. CSphMerger::PrepareQword<QWORDSRC> ( tSrcQword, tSrcReader, iSrcMinID, bWordDict );
  10614. tMerger.TransferData<QWORDSRC> ( tSrcQword, tSrcReader.m_iWordID, tSrcReader.GetWord(), pSrcIndex, NULL );
  10615. iWords++;
  10616. bSrcWord = tSrcReader.Read();
  10617. } else // merge documents and hits inside the word
  10618. {
  10619. assert ( iCmp==0 );
  10620. bool bHitless = !tDstReader.m_bHasHitlist;
  10621. if ( tDstReader.m_bHasHitlist!=tSrcReader.m_bHasHitlist )
  10622. {
  10623. iHitlistsDiscarded++;
  10624. bHitless = true;
  10625. }
  10626. CSphMerger::PrepareQword<QWORDDST> ( tDstQword, tDstReader, iDstMinID, bWordDict );
  10627. CSphMerger::PrepareQword<QWORDSRC> ( tSrcQword, tSrcReader, iSrcMinID, bWordDict );
  10628. CSphAggregateHit tHit;
  10629. tHit.m_iWordID = tDstReader.m_iWordID; // !COMMIT m_sKeyword anyone?
  10630. tHit.m_sKeyword = tDstReader.GetWord();
  10631. tHit.m_dFieldMask.Unset();
  10632. bool bDstDocs = tMerger.NextDocument ( tDstQword, pDstIndex, pFilter );
  10633. bool bSrcDocs = true;
  10634. tSrcQword.GetNextDoc ( tMerger.AcquireInline() );
  10635. tSrcQword.SeekHitlist ( tSrcQword.m_iHitlistPos );
  10636. while ( bDstDocs || bSrcDocs )
  10637. {
  10638. if ( !bSrcDocs || ( bDstDocs && tDstQword.m_tDoc.m_iDocID < tSrcQword.m_tDoc.m_iDocID ) )
  10639. {
  10640. // transfer hits from destination
  10641. if ( bHitless )
  10642. {
  10643. while ( tDstQword.m_bHasHitlist && tDstQword.GetNextHit()!=EMPTY_HIT );
  10644. tHit.m_iDocID = tDstQword.m_tDoc.m_iDocID - iMinID;
  10645. tHit.m_dFieldMask = tDstQword.m_dQwordFields;
  10646. tHit.SetAggrCount ( tDstQword.m_uMatchHits );
  10647. pHitBuilder->cidxHit ( &tHit, tMerger.GetInline() );
  10648. } else
  10649. tMerger.TransferHits ( tDstQword, tHit );
  10650. bDstDocs = tMerger.NextDocument ( tDstQword, pDstIndex, pFilter );
  10651. } else if ( !bDstDocs || ( bSrcDocs && tDstQword.m_tDoc.m_iDocID > tSrcQword.m_tDoc.m_iDocID ) )
  10652. {
  10653. // transfer hits from source
  10654. if ( bHitless )
  10655. {
  10656. while ( tSrcQword.m_bHasHitlist && tSrcQword.GetNextHit()!=EMPTY_HIT );
  10657. tHit.m_iDocID = tSrcQword.m_tDoc.m_iDocID - iMinID;
  10658. tHit.m_dFieldMask = tSrcQword.m_dQwordFields;
  10659. tHit.SetAggrCount ( tSrcQword.m_uMatchHits );
  10660. pHitBuilder->cidxHit ( &tHit, tMerger.GetInline() );
  10661. } else
  10662. tMerger.TransferHits ( tSrcQword, tHit );
  10663. bSrcDocs = tMerger.NextDocument ( tSrcQword, pSrcIndex, NULL );
  10664. } else
  10665. {
  10666. // merge hits inside the document
  10667. assert ( bDstDocs );
  10668. assert ( bSrcDocs );
  10669. assert ( tDstQword.m_tDoc.m_iDocID==tSrcQword.m_tDoc.m_iDocID );
  10670. tHit.m_iDocID = tDstQword.m_tDoc.m_iDocID - iMinID;
  10671. if ( bHitless )
  10672. {
  10673. while ( tDstQword.m_bHasHitlist && tDstQword.GetNextHit()!=EMPTY_HIT );
  10674. while ( tSrcQword.m_bHasHitlist && tSrcQword.GetNextHit()!=EMPTY_HIT );
  10675. tHit.m_dFieldMask = tDstQword.m_dQwordFields | tSrcQword.m_dQwordFields;
  10676. tHit.SetAggrCount ( tDstQword.m_uMatchHits + tSrcQword.m_uMatchHits );
  10677. pHitBuilder->cidxHit ( &tHit, tMerger.GetInline() );
  10678. } else
  10679. {
  10680. Hitpos_t uDstHit = tDstQword.GetNextHit();
  10681. Hitpos_t uSrcHit = tSrcQword.GetNextHit();
  10682. while ( uDstHit!=EMPTY_HIT || uSrcHit!=EMPTY_HIT )
  10683. {
  10684. if ( uSrcHit==EMPTY_HIT || ( uDstHit!=EMPTY_HIT && uDstHit<uSrcHit ) )
  10685. {
  10686. tHit.m_iWordPos = uDstHit;
  10687. pHitBuilder->cidxHit ( &tHit, tMerger.GetInline() );
  10688. uDstHit = tDstQword.GetNextHit();
  10689. } else if ( uDstHit==EMPTY_HIT || ( uSrcHit!=EMPTY_HIT && uSrcHit<uDstHit ) )
  10690. {
  10691. tHit.m_iWordPos = uSrcHit;
  10692. pHitBuilder->cidxHit ( &tHit, tMerger.GetInline() );
  10693. uSrcHit = tSrcQword.GetNextHit();
  10694. } else
  10695. {
  10696. assert ( uDstHit==uSrcHit );
  10697. tHit.m_iWordPos = uDstHit;
  10698. pHitBuilder->cidxHit ( &tHit, tMerger.GetInline() );
  10699. uDstHit = tDstQword.GetNextHit();
  10700. uSrcHit = tSrcQword.GetNextHit();
  10701. }
  10702. }
  10703. }
  10704. // next document
  10705. bDstDocs = tMerger.NextDocument ( tDstQword, pDstIndex, pFilter );
  10706. bSrcDocs = tMerger.NextDocument ( tSrcQword, pSrcIndex, NULL );
  10707. }
  10708. }
  10709. // next word
  10710. bDstWord = tDstReader.Read();
  10711. bSrcWord = tSrcReader.Read();
  10712. iWords++;
  10713. }
  10714. }
  10715. tStat.m_iTotalDocuments += pSrcIndex->m_tStats.m_iTotalDocuments;
  10716. tStat.m_iTotalBytes += pSrcIndex->m_tStats.m_iTotalBytes;
  10717. tProgress.m_iWords += iWords;
  10718. tProgress.Show ( false );
  10719. if ( iHitlistsDiscarded )
  10720. sphWarning ( "discarded hitlists for %u words", iHitlistsDiscarded );
  10721. return true;
  10722. }
  10723. bool CSphIndex_VLN::Merge ( CSphIndex * pSource, const CSphVector<CSphFilterSettings> & dFilters, bool bMergeKillLists )
  10724. {
  10725. CSphString sWarning;
  10726. if ( !Prealloc ( false, false, sWarning ) || !Preread() )
  10727. return false;
  10728. if ( !pSource->Prealloc ( false, false, sWarning ) || !pSource->Preread() )
  10729. {
  10730. m_sLastError.SetSprintf ( "source index preload failed: %s", pSource->GetLastError().cstr() );
  10731. return false;
  10732. }
  10733. // create filters
  10734. CSphScopedPtr<ISphFilter> pFilter ( CreateMergeFilters ( dFilters, m_tSchema, GetMVAPool(), m_pStrings.GetWritePtr() ) );
  10735. DWORD nKillListSize = pSource->GetKillListSize ();
  10736. if ( nKillListSize )
  10737. {
  10738. CSphFilterSettings tKillListFilter;
  10739. const SphAttr_t * pKillList = pSource->GetKillList ();
  10740. tKillListFilter.m_bExclude = true;
  10741. tKillListFilter.m_eType = SPH_FILTER_VALUES;
  10742. tKillListFilter.m_iMinValue = pKillList[0];
  10743. tKillListFilter.m_iMaxValue = pKillList[nKillListSize -1];
  10744. tKillListFilter.m_sAttrName = "@id";
  10745. tKillListFilter.SetExternalValues ( pKillList, nKillListSize );
  10746. ISphFilter * pKillListFilter = sphCreateFilter ( tKillListFilter, m_tSchema, GetMVAPool(), m_pStrings.GetWritePtr(), m_sLastError );
  10747. pFilter = sphJoinFilters ( pFilter.LeakPtr(), pKillListFilter );
  10748. }
  10749. return CSphIndex_VLN::DoMerge ( this, dynamic_cast<const CSphIndex_VLN *>( pSource ),
  10750. bMergeKillLists, pFilter.Ptr(), m_sLastError, m_tProgress, &g_tThrottle );
  10751. }
  /// Merge two indexes into a set of temporary files named after the destination index.
  ///
  /// Checks that both indexes are compatible (schema, hitless settings, docinfo storage,
  /// dictionary type), then writes, in this order: attribute files (tmp.spa, tmp.spm, tmp.sps),
  /// the dictionary with doclists and hitlists (through tmp8.spi and tmp.spi), the kill-list
  /// file (tmp.spk), and finally the header (extension "tmp.sph").
  ///
  /// pDstIndex        destination index; its rows rejected by pFilter are not copied
  /// pSrcIndex        source index; when both indexes hold the same docid, the source row is kept
  /// bMergeKillLists  when true, both kill-lists are combined and written to tmp.spk;
  ///                  otherwise tmp.spk is created but nothing is written to it
  /// pFilter          optional filter evaluated against destination rows (may be NULL)
  /// sError           receives the error message
  /// tProgress        progress reporter
  /// pThrottle        throttle settings handed to every writer used here
  ///
  /// Returns true on success, false on failure.
  10752. bool CSphIndex_VLN::DoMerge ( const CSphIndex_VLN * pDstIndex, const CSphIndex_VLN * pSrcIndex,
  10753. bool bMergeKillLists, ISphFilter * pFilter, CSphString & sError,
  10754. CSphIndexProgress & tProgress, ThrottleState_t * pThrottle )
  10755. {
  10756. assert ( pDstIndex && pSrcIndex );
  // compatibility checks; any mismatch aborts the merge before a single file is created
  10757. const CSphSchema & tDstSchema = pDstIndex->m_tSchema;
  10758. const CSphSchema & tSrcSchema = pSrcIndex->m_tSchema;
  10759. if ( !tDstSchema.CompareTo ( tSrcSchema, sError ) )
  10760. return false;
  10761. if ( pDstIndex->m_tSettings.m_eHitless!=pSrcIndex->m_tSettings.m_eHitless )
  10762. {
  10763. sError = "hitless settings must be the same on merged indices";
  10764. return false;
  10765. }
  10766. // FIXME!
  // differing docinfo storage is only tolerated when at least one of the indexes is empty
  10767. if ( pDstIndex->m_tSettings.m_eDocinfo!=pSrcIndex->m_tSettings.m_eDocinfo && !( pDstIndex->m_bIsEmpty || pSrcIndex->m_bIsEmpty ) )
  10768. {
  10769. sError.SetSprintf ( "docinfo storage on non-empty indexes must be the same (dst docinfo %d, empty %d, src docinfo %d, empty %d",
  10770. pDstIndex->m_tSettings.m_eDocinfo, pDstIndex->m_bIsEmpty, pSrcIndex->m_tSettings.m_eDocinfo, pSrcIndex->m_bIsEmpty );
  10771. return false;
  10772. }
  10773. if ( pDstIndex->m_pDict->GetSettings().m_bWordDict!=pSrcIndex->m_pDict->GetSettings().m_bWordDict )
  10774. {
  10775. sError.SetSprintf ( "dictionary types must be the same (dst dict=%s, src dict=%s )",
  10776. pDstIndex->m_pDict->GetSettings().m_bWordDict ? "keywords" : "crc",
  10777. pSrcIndex->m_pDict->GetSettings().m_bWordDict ? "keywords" : "crc" );
  10778. return false;
  10779. }
  // the header under construction starts from the destination index stats
  10780. BuildHeader_t tBuildHeader ( pDstIndex->m_tStats );
  10781. /////////////////////////////////////////
  10782. // merging attributes (.spa, .spm, .sps)
  10783. /////////////////////////////////////////
  10784. CSphWriter tSPMWriter, tSPSWriter;
  10785. tSPMWriter.SetThrottle ( pThrottle );
  10786. tSPSWriter.SetThrottle ( pThrottle );
  10787. if ( !tSPMWriter.OpenFile ( pDstIndex->GetIndexFileName("tmp.spm"), sError )
  10788. || !tSPSWriter.OpenFile ( pDstIndex->GetIndexFileName("tmp.sps"), sError ) )
  10789. {
  10790. return false;
  10791. }
  10792. tSPSWriter.PutByte ( 0 ); // dummy byte, to reserve magic zero offset
  10793. /// merging
  // collect attribute locators that need their payload copied row by row:
  // UINT32SET MVAs first, INT64SET MVAs after them (second pass), strings and JSON separately
  10794. CSphVector<CSphAttrLocator> dMvaLocators;
  10795. CSphVector<CSphAttrLocator> dStringLocators;
  10796. for ( int i=0; i<tDstSchema.GetAttrsCount(); i++ )
  10797. {
  10798. const CSphColumnInfo & tInfo = tDstSchema.GetAttr(i);
  10799. if ( tInfo.m_eAttrType==SPH_ATTR_UINT32SET )
  10800. dMvaLocators.Add ( tInfo.m_tLocator );
  10801. if ( tInfo.m_eAttrType==SPH_ATTR_STRING || tInfo.m_eAttrType==SPH_ATTR_JSON )
  10802. dStringLocators.Add ( tInfo.m_tLocator );
  10803. }
  10804. for ( int i=0; i<tDstSchema.GetAttrsCount(); i++ )
  10805. {
  10806. const CSphColumnInfo & tInfo = tDstSchema.GetAttr(i);
  10807. if ( tInfo.m_eAttrType==SPH_ATTR_INT64SET )
  10808. dMvaLocators.Add ( tInfo.m_tLocator );
  10809. }
  // docids present in both indexes; their destination copies are filtered out of the word merge below
  10810. CSphVector<SphAttr_t> dPhantomKiller;
  10811. int64_t iTotalDocuments = 0;
  10812. bool bNeedInfinum = true;
  10813. // minimal docid-1 for merging
  10814. SphDocID_t iMergeInfinum = 0;
  10815. if ( pDstIndex->m_tSettings.m_eDocinfo==SPH_DOCINFO_EXTERN && pSrcIndex->m_tSettings.m_eDocinfo==SPH_DOCINFO_EXTERN )
  10816. {
  // both indexes keep extern docinfo: merge the two row arrays into tmp.spa
  10817. int iStride = DOCINFO_IDSIZE + pDstIndex->m_tSchema.GetRowSize();
  10818. CSphFixedVector<CSphRowitem> dRow ( iStride );
  10819. CSphWriter wrRows;
  10820. wrRows.SetThrottle ( pThrottle );
  10821. if ( !wrRows.OpenFile ( pDstIndex->GetIndexFileName("tmp.spa"), sError ) )
  10822. return false;
  10823. int64_t iExpectedDocs = pDstIndex->m_tStats.m_iTotalDocuments + pSrcIndex->GetStats().m_iTotalDocuments;
  10824. AttrIndexBuilder_c tMinMax ( pDstIndex->m_tSchema );
  10825. int64_t iMinMaxSize = tMinMax.GetExpectedSize ( iExpectedDocs );
  10826. if ( iMinMaxSize>INT_MAX || iExpectedDocs>INT_MAX )
  10827. {
  10828. sError.SetSprintf ( "attribute files (.spa) over 128 GB are not supported (min-max approximate="INT64_FMT", documents count="INT64_FMT")",
  10829. iMinMaxSize, iExpectedDocs );
  10830. return false;
  10831. }
  10832. CSphFixedVector<DWORD> dMinMaxBuffer ( (int)iMinMaxSize );
  10833. tMinMax.Prepare ( dMinMaxBuffer.Begin(), dMinMaxBuffer.Begin() + dMinMaxBuffer.GetLength() ); // FIXME!!! for over INT_MAX blocks
  10834. const DWORD * pSrcRow = pSrcIndex->m_pDocinfo.GetWritePtr(); // they *can* be null if the respective index is empty
  10835. const DWORD * pDstRow = pDstIndex->m_pDocinfo.GetWritePtr();
  10836. int64_t iSrcCount = 0;
  10837. int64_t iDstCount = 0;
  10838. CSphMatch tMatch;
  // two-way merge over both row arrays; a docid of 0 marks an exhausted side
  10839. while ( iSrcCount < pSrcIndex->m_iDocinfo || iDstCount < pDstIndex->m_iDocinfo )
  10840. {
  10841. SphDocID_t iDstDocID, iSrcDocID;
  10842. if ( iDstCount < pDstIndex->m_iDocinfo )
  10843. {
  10844. iDstDocID = DOCINFO2ID ( pDstRow );
  10845. if ( pFilter )
  10846. {
  // destination rows rejected by the filter are skipped without being written
  10847. tMatch.m_iDocID = iDstDocID;
  10848. tMatch.m_pStatic = DOCINFO2ATTRS ( pDstRow );
  10849. tMatch.m_pDynamic = NULL;
  10850. if ( !pFilter->Eval ( tMatch ) )
  10851. {
  10852. pDstRow += iStride;
  10853. iDstCount++;
  10854. continue;
  10855. }
  10856. }
  10857. } else
  10858. iDstDocID = 0;
  10859. if ( iSrcCount < pSrcIndex->m_iDocinfo )
  10860. iSrcDocID = DOCINFO2ID ( pSrcRow );
  10861. else
  10862. iSrcDocID = 0;
  // destination row goes first when its docid is smaller, or when the source side is exhausted
  10863. if ( ( iDstDocID && iDstDocID < iSrcDocID ) || ( iDstDocID && !iSrcDocID ) )
  10864. {
  10865. Verify ( tMinMax.Collect ( pDstRow, pDstIndex->m_pMva.GetWritePtr(), pDstIndex->m_pMva.GetNumEntries(), sError, true ) );
  10866. if ( dMvaLocators.GetLength() || dStringLocators.GetLength() )
  10867. {
  // work on a private copy of the row, then hand it to the MVA and string copy helpers before writing it
  10868. memcpy ( dRow.Begin(), pDstRow, iStride * sizeof ( CSphRowitem ) );
  10869. CopyRowMVA ( pDstIndex->m_pMva.GetWritePtr(), dMvaLocators, iDstDocID, dRow.Begin(), tSPMWriter );
  10870. CopyRowString ( pDstIndex->m_pStrings.GetWritePtr(), dStringLocators, dRow.Begin(), tSPSWriter );
  10871. wrRows.PutBytes ( dRow.Begin(), sizeof(DWORD)*iStride );
  10872. } else
  10873. {
  10874. wrRows.PutBytes ( pDstRow, sizeof(DWORD)*iStride );
  10875. }
  10876. tBuildHeader.m_uMinMaxIndex += iStride;
  10877. pDstRow += iStride;
  10878. iDstCount++;
  10879. iTotalDocuments++;
  10880. if ( bNeedInfinum )
  10881. {
  // the first row written defines the merge infinum (its docid minus one)
  10882. bNeedInfinum = false;
  10883. iMergeInfinum = iDstDocID - 1;
  10884. }
  10885. } else if ( iSrcDocID )
  10886. {
  // source row goes out; this branch also handles equal docids on both sides
  10887. Verify ( tMinMax.Collect ( pSrcRow, pSrcIndex->m_pMva.GetWritePtr(), pSrcIndex->m_pMva.GetNumEntries(), sError, true ) );
  10888. if ( dMvaLocators.GetLength() || dStringLocators.GetLength() )
  10889. {
  10890. memcpy ( dRow.Begin(), pSrcRow, iStride * sizeof ( CSphRowitem ) );
  10891. CopyRowMVA ( pSrcIndex->m_pMva.GetWritePtr(), dMvaLocators, iSrcDocID, dRow.Begin(), tSPMWriter );
  10892. CopyRowString ( pSrcIndex->m_pStrings.GetWritePtr(), dStringLocators, dRow.Begin(), tSPSWriter );
  10893. wrRows.PutBytes ( dRow.Begin(), sizeof(DWORD)*iStride );
  10894. } else
  10895. {
  10896. wrRows.PutBytes ( pSrcRow, sizeof(DWORD)*iStride );
  10897. }
  10898. tBuildHeader.m_uMinMaxIndex += iStride;
  10899. pSrcRow += iStride;
  10900. iSrcCount++;
  10901. iTotalDocuments++;
  10902. if ( bNeedInfinum )
  10903. {
  10904. bNeedInfinum = false;
  10905. iMergeInfinum = iSrcDocID - 1;
  10906. }
  10907. if ( iDstDocID==iSrcDocID )
  10908. {
  // same docid on both sides: the source row was kept, so drop the destination row and remember the docid
  10909. dPhantomKiller.Add ( iSrcDocID );
  10910. pDstRow += iStride;
  10911. iDstCount++;
  10912. }
  10913. }
  10914. }
  10915. if ( iTotalDocuments )
  10916. {
  // append the collected min-max data right after the rows
  10917. tMinMax.FinishCollect();
  10918. int64_t iMinMaxSize = tMinMax.GetActualSize() * sizeof(DWORD);
  10919. wrRows.PutBytes ( dMinMaxBuffer.Begin(), iMinMaxSize );
  10920. }
  10921. wrRows.CloseFile();
  10922. if ( wrRows.IsError() )
  10923. return false;
  10924. } else if ( pDstIndex->m_bIsEmpty || pSrcIndex->m_bIsEmpty )
  10925. {
  10926. // one of the indexes has no documents; copy the .spa file from the other one
  10927. CSphString sSrc = !pDstIndex->m_bIsEmpty ? pDstIndex->GetIndexFileName("spa") : pSrcIndex->GetIndexFileName("spa");
  10928. CSphString sDst = pDstIndex->GetIndexFileName("tmp.spa");
  10929. if ( !CopyFile ( sSrc.cstr(), sDst.cstr(), sError, pThrottle ) )
  10930. return false;
  10931. } else
  10932. {
  10933. // storage is not extern; create dummy .spa file
  10934. CSphAutofile fdSpa ( pDstIndex->GetIndexFileName("tmp.spa"), SPH_O_NEW, sError );
  10935. fdSpa.Close();
  10936. }
  10937. if ( !CheckDocsCount ( iTotalDocuments, sError ) )
  10938. return false;
  10939. // create phantom killlist filter
  10940. if ( dPhantomKiller.GetLength() )
  10941. {
  10942. CSphFilterSettings tKLF;
  10943. tKLF.m_bExclude = true;
  10944. tKLF.m_eType = SPH_FILTER_VALUES;
  10945. tKLF.m_iMinValue = dPhantomKiller[0];
  10946. tKLF.m_iMaxValue = dPhantomKiller.Last();
  10947. tKLF.m_sAttrName = "@id";
  10948. tKLF.SetExternalValues ( &dPhantomKiller[0], dPhantomKiller.GetLength() );
  10949. ISphFilter * pSpaFilter = sphCreateFilter ( tKLF, pDstIndex->m_tSchema, pDstIndex->GetMVAPool(), pDstIndex->m_pStrings.GetWritePtr(), sError );
  // NOTE(review): only the local pFilter pointer is replaced here and nothing in this function
  // releases the joined filter; confirm who owns the sphJoinFilters() result
  10950. pFilter = sphJoinFilters ( pFilter, pSpaFilter );
  10951. }
  // dictionary output files; both must open cleanly before the word merge starts
  10952. CSphAutofile fdTmpDict ( pDstIndex->GetIndexFileName("tmp8.spi"), SPH_O_NEW, sError, true );
  10953. CSphAutofile fdDict ( pDstIndex->GetIndexFileName("tmp.spi"), SPH_O_NEW, sError );
  10954. if ( !sError.IsEmpty() || fdTmpDict.GetFD()<0 || fdDict.GetFD()<0 )
  10955. return false;
  // the merge works on a clone of the destination dictionary
  10956. CSphScopedPtr<CSphDict> pDict ( pDstIndex->m_pDict->Clone() );
  10957. int iHitBufferSize = 8 * 1024 * 1024;
  10958. CSphVector<SphWordID_t> dDummy;
  10959. CSphHitBuilder tHitBuilder ( pDstIndex->m_tSettings, dDummy, true, iHitBufferSize, pDict.Ptr(), &sError );
  10960. tHitBuilder.SetThrottle ( pThrottle );
  // private copy of the destination min row, referenced by the header being built
  10961. CSphFixedVector<CSphRowitem> dMinRow ( pDstIndex->m_dMinRow.GetLength() );
  10962. memcpy ( dMinRow.Begin(), pDstIndex->m_dMinRow.Begin(), sizeof(CSphRowitem)*dMinRow.GetLength() );
  10963. // correct infinum might be already set during spa merging.
  10964. SphDocID_t iMinDocid = ( !iMergeInfinum ) ? Min ( pDstIndex->m_iMinDocid, pSrcIndex->m_iMinDocid ) : iMergeInfinum;
  10965. tBuildHeader.m_iMinDocid = iMinDocid;
  10966. tBuildHeader.m_pMinRow = dMinRow.Begin();
  10967. // FIXME? is this magic dict block constant any good?..
  10968. pDict->DictBegin ( fdTmpDict, fdDict, iHitBufferSize, pThrottle );
  10969. // merge dictionaries, doclists and hitlists
  // the two branches differ only in the boolean handed to WITH_QWORD, chosen by the dictionary type
  10970. if ( pDict->GetSettings().m_bWordDict )
  10971. {
  10972. WITH_QWORD ( pDstIndex, false, QwordDst,
  10973. WITH_QWORD ( pSrcIndex, false, QwordSrc,
  10974. {
  10975. if ( !CSphIndex_VLN::MergeWords < QwordDst, QwordSrc > ( pDstIndex, pSrcIndex, pFilter, iMinDocid,
  10976. &tHitBuilder, sError, tBuildHeader, tProgress, pThrottle ) )
  10977. return false;
  10978. } ) );
  10979. } else
  10980. {
  10981. WITH_QWORD ( pDstIndex, true, QwordDst,
  10982. WITH_QWORD ( pSrcIndex, true, QwordSrc,
  10983. {
  10984. if ( !CSphIndex_VLN::MergeWords < QwordDst, QwordSrc > ( pDstIndex, pSrcIndex, pFilter, iMinDocid,
  10985. &tHitBuilder, sError, tBuildHeader, tProgress, pThrottle ) )
  10986. return false;
  10987. } ) );
  10988. }
  // a non-zero row count from the attribute merge overrides the document total in the header
  10989. if ( iTotalDocuments )
  10990. tBuildHeader.m_iTotalDocuments = iTotalDocuments;
  10991. // merge kill-lists
  10992. CSphAutofile fdKillList ( pDstIndex->GetIndexFileName("tmp.spk"), SPH_O_NEW, sError );
  10993. if ( fdKillList.GetFD () < 0 )
  10994. return false;
  10995. if ( bMergeKillLists )
  10996. {
  10997. // merge spk
  10998. CSphVector<SphAttr_t> dKillList;
  10999. dKillList.Reserve ( pDstIndex->GetKillListSize() + pSrcIndex->GetKillListSize() );
  11000. for ( int i = 0; i < pSrcIndex->GetKillListSize (); i++ )
  11001. dKillList.Add ( pSrcIndex->GetKillList () [i] );
  11002. for ( int i = 0; i < pDstIndex->GetKillListSize (); i++ )
  11003. dKillList.Add ( pDstIndex->GetKillList () [i] );
  11004. dKillList.Uniq ();
  11005. tBuildHeader.m_iKillListSize = dKillList.GetLength ();
  11006. if ( dKillList.GetLength() )
  11007. {
  11008. if ( !sphWriteThrottled ( fdKillList.GetFD(), &dKillList[0], dKillList.GetLength()*sizeof(SphAttr_t), "kill_list", sError, pThrottle ) )
  11009. return false;
  11010. }
  11011. }
  11012. fdKillList.Close ();
  11013. // finalize
  // an all-zero hit with an empty keyword is pushed as the last hit before cidxDone()
  11014. CSphAggregateHit tFlush;
  11015. tFlush.m_iDocID = 0;
  11016. tFlush.m_iWordID = 0;
  11017. tFlush.m_sKeyword = (BYTE*)""; // tricky: assertion in cidxHit calls strcmp on this in case of empty index!
  11018. tFlush.m_iWordPos = EMPTY_HIT;
  11019. tFlush.m_dFieldMask.Unset();
  11020. tHitBuilder.cidxHit ( &tFlush, NULL );
  11021. if ( !tHitBuilder.cidxDone ( iHitBufferSize, pDstIndex->m_tSettings.m_iMinInfixLen,
  11022. pDstIndex->m_pTokenizer->GetMaxCodepointLength(), &tBuildHeader ) )
  11023. return false;
  11024. tBuildHeader.m_sHeaderExtension = "tmp.sph";
  11025. tBuildHeader.m_pThrottle = pThrottle;
  // NOTE(review): the result of BuildDone() is not examined; if it reports failure through
  // its return value, this function still returns true - confirm against BuildDone()
  11026. pDstIndex->BuildDone ( tBuildHeader, sError ); // FIXME? is this magic dict block constant any good?..
  11027. // we're done
  11028. tProgress.Show ( true );
  11029. return true;
  11030. }
  11031. bool sphMerge ( const CSphIndex * pDst, const CSphIndex * pSrc, ISphFilter * pFilter,
  11032. CSphString & sError, CSphIndexProgress & tProgress, ThrottleState_t * pThrottle )
  11033. {
  11034. const CSphIndex_VLN * pDstIndex = dynamic_cast<const CSphIndex_VLN *>( pDst );
  11035. const CSphIndex_VLN * pSrcIndex = dynamic_cast<const CSphIndex_VLN *> ( pSrc );
  11036. assert ( pDstIndex && pSrcIndex );
  11037. return CSphIndex_VLN::DoMerge ( pDstIndex, pSrcIndex, false, pFilter, sError, tProgress, pThrottle );
  11038. }
  11039. /////////////////////////////////////////////////////////////////////////////
  11040. // THE SEARCHER
  11041. /////////////////////////////////////////////////////////////////////////////
  11042. SphWordID_t CSphDictStar::GetWordID ( BYTE * pWord )
  11043. {
  11044. char sBuf [ 16+3*SPH_MAX_WORD_LEN ];
  11045. assert ( strlen ( (const char*)pWord ) < 16+3*SPH_MAX_WORD_LEN );
  11046. if ( m_pDict->GetSettings().m_bStopwordsStem && m_pDict->IsStopWord ( pWord ) )
  11047. return 0;
  11048. m_pDict->ApplyStemmers ( pWord );
  11049. int iLen = strlen ( (const char*)pWord );
  11050. assert ( iLen < 16+3*SPH_MAX_WORD_LEN - 1 );
  11051. memcpy ( sBuf, pWord, iLen+1 );
  11052. if ( iLen )
  11053. {
  11054. if ( sBuf[iLen-1]=='*' )
  11055. {
  11056. iLen--;
  11057. sBuf[iLen] = '\0';
  11058. } else
  11059. {
  11060. sBuf[iLen] = MAGIC_WORD_TAIL;
  11061. iLen++;
  11062. sBuf[iLen] = '\0';
  11063. }
  11064. }
  11065. return m_pDict->GetWordID ( (BYTE*)sBuf, iLen, !m_pDict->GetSettings().m_bStopwordsStem );
  11066. }
  /// Non-stemmed word id lookup; forwarded to the wrapped dictionary unchanged
  /// (no star handling is applied to pWord here).
  11067. SphWordID_t CSphDictStar::GetWordIDNonStemmed ( BYTE * pWord )
  11068. {
  11069. return m_pDict->GetWordIDNonStemmed ( pWord );
  11070. }
  11071. //////////////////////////////////////////////////////////////////////////
  /// Construct a star-aware wrapper over pDict.
  /// bPrefixes and bInfixes are stored as is; GetWordID() uses them to choose
  /// between the infix (or mixed) mangling and the prefix-only mangling.
  11072. CSphDictStarV8::CSphDictStarV8 ( CSphDict * pDict, bool bPrefixes, bool bInfixes )
  11073. : CSphDictStar ( pDict )
  11074. , m_bPrefixes ( bPrefixes )
  11075. , m_bInfixes ( bInfixes )
  11076. {
  11077. }
  11078. SphWordID_t CSphDictStarV8::GetWordID ( BYTE * pWord )
  11079. {
  11080. char sBuf [ 16+3*SPH_MAX_WORD_LEN ];
  11081. int iLen = strlen ( (const char*)pWord );
  11082. iLen = Min ( iLen, 16+3*SPH_MAX_WORD_LEN - 1 );
  11083. if ( !iLen )
  11084. return 0;
  11085. bool bHeadStar = ( pWord[0]=='*' );
  11086. bool bTailStar = ( pWord[iLen-1]=='*' ) && ( iLen>1 );
  11087. if ( !bHeadStar && !bTailStar )
  11088. {
  11089. if ( m_pDict->GetSettings().m_bStopwordsStem && IsStopWord ( pWord ) )
  11090. return 0;
  11091. m_pDict->ApplyStemmers ( pWord );
  11092. if ( !m_pDict->GetSettings().m_bStopwordsStem && IsStopWord ( pWord ) )
  11093. return 0;
  11094. }
  11095. iLen = strlen ( (const char*)pWord );
  11096. assert ( iLen < 16+3*SPH_MAX_WORD_LEN - 2 );
  11097. if ( !iLen || ( bHeadStar && iLen==1 ) )
  11098. return 0;
  11099. if ( m_bInfixes )
  11100. {
  11101. ////////////////////////////////////
  11102. // infix or mixed infix+prefix mode
  11103. ////////////////////////////////////
  11104. // handle head star
  11105. if ( bHeadStar )
  11106. {
  11107. memcpy ( sBuf, pWord+1, iLen-- ); // chops star, copies trailing zero, updates iLen
  11108. } else
  11109. {
  11110. sBuf[0] = MAGIC_WORD_HEAD;
  11111. memcpy ( sBuf+1, pWord, ++iLen ); // copies everything incl trailing zero, updates iLen
  11112. }
  11113. // handle tail star
  11114. if ( bTailStar )
  11115. {
  11116. sBuf[--iLen] = '\0'; // got star, just chop it away
  11117. } else
  11118. {
  11119. sBuf[iLen] = MAGIC_WORD_TAIL; // no star, add tail marker
  11120. sBuf[++iLen] = '\0';
  11121. }
  11122. } else
  11123. {
  11124. ////////////////////
  11125. // prefix-only mode
  11126. ////////////////////
  11127. assert ( m_bPrefixes );
  11128. // always ignore head star in prefix mode
  11129. if ( bHeadStar )
  11130. {
  11131. pWord++;
  11132. iLen--;
  11133. }
  11134. // handle tail star
  11135. if ( !bTailStar )
  11136. {
  11137. // exact word search request, always (ie. both in infix/prefix mode) mangles to "\1word\1" in v.8+
  11138. sBuf[0] = MAGIC_WORD_HEAD;
  11139. memcpy ( sBuf+1, pWord, iLen );
  11140. sBuf[iLen+1] = MAGIC_WORD_TAIL;
  11141. sBuf[iLen+2] = '\0';
  11142. iLen += 2;
  11143. } else
  11144. {
  11145. // prefix search request, mangles to word itself (just chop away the star)
  11146. memcpy ( sBuf, pWord, iLen );
  11147. sBuf[--iLen] = '\0';
  11148. }
  11149. }
  11150. // calc id for mangled word
  11151. return m_pDict->GetWordID ( (BYTE*)sBuf, iLen, !bHeadStar && !bTailStar );
  11152. }
  11153. //////////////////////////////////////////////////////////////////////////
  11154. SphWordID_t CSphDictExact::GetWordID ( BYTE * pWord )
  11155. {
  11156. int iLen = strlen ( (const char*)pWord );
  11157. iLen = Min ( iLen, 16+3*SPH_MAX_WORD_LEN - 1 );
  11158. if ( !iLen )
  11159. return 0;
  11160. if ( pWord[0]=='=' )
  11161. pWord[0] = MAGIC_WORD_HEAD_NONSTEMMED;
  11162. if ( pWord[0]<' ' )
  11163. return m_pDict->GetWordIDNonStemmed ( pWord );
  11164. return m_pDict->GetWordID ( pWord );
  11165. }
  11166. /////////////////////////////////////////////////////////////////////////////
  11167. inline bool sphGroupMatch ( SphAttr_t iGroup, const SphAttr_t * pGroups, int iGroups )
  11168. {
  11169. if ( !pGroups ) return true;
  11170. const SphAttr_t * pA = pGroups;
  11171. const SphAttr_t * pB = pGroups+iGroups-1;
  11172. if ( iGroup==*pA || iGroup==*pB ) return true;
  11173. if ( iGroup<(*pA) || iGroup>(*pB) ) return false;
  11174. while ( pB-pA>1 )
  11175. {
  11176. const SphAttr_t * pM = pA + ((pB-pA)/2);
  11177. if ( iGroup==(*pM) )
  11178. return true;
  11179. if ( iGroup<(*pM) )
  11180. pB = pM;
  11181. else
  11182. pA = pM;
  11183. }
  11184. return false;
  11185. }
  11186. bool CSphIndex_VLN::EarlyReject ( CSphQueryContext * pCtx, CSphMatch & tMatch ) const
  11187. {
  11188. // might be needed even when we do not have a filter
  11189. if ( pCtx->m_bLookupFilter )
  11190. CopyDocinfo ( pCtx, tMatch, FindDocinfo ( tMatch.m_iDocID ) );
  11191. pCtx->CalcFilter ( tMatch ); // FIXME!!! leak of filtered STRING_PTR
  11192. return pCtx->m_pFilter ? !pCtx->m_pFilter->Eval ( tMatch ) : false;
  11193. }
  /// Returns the write pointer of the kill-list buffer (m_pKillList).
  /// DoMerge() indexes the returned array up to GetKillListSize() entries.
  11194. SphAttr_t * CSphIndex_VLN::GetKillList () const
  11195. {
  11196. return m_pKillList.GetWritePtr ();
  11197. }
  /// Returns true when FindDocinfo() locates a docinfo row for uDocid.
  11198. bool CSphIndex_VLN::HasDocid ( SphDocID_t uDocid ) const
  11199. {
  11200. return FindDocinfo ( uDocid )!=NULL;
  11201. }
  11202. const DWORD * CSphIndex_VLN::FindDocinfo ( SphDocID_t uDocID ) const
  11203. {
  11204. if ( m_iDocinfo<=0 )
  11205. return NULL;
  11206. assert ( m_tSettings.m_eDocinfo==SPH_DOCINFO_EXTERN );
  11207. assert ( !m_pDocinfo.IsEmpty() );
  11208. assert ( m_tSchema.GetAttrsCount() );
  11209. int iStride = DOCINFO_IDSIZE + m_tSchema.GetRowSize();
  11210. int64_t iStart = 0;
  11211. int64_t iEnd = m_iDocinfo-1;
  11212. #define LOC_ROW(_index) &m_pDocinfo [ _index*iStride ]
  11213. #define LOC_ID(_index) DOCINFO2ID(LOC_ROW(_index))
  11214. if ( m_pDocinfoHash.GetLength() )
  11215. {
  11216. SphDocID_t uFirst = LOC_ID(0);
  11217. SphDocID_t uLast = LOC_ID(iEnd);
  11218. if ( uDocID<uFirst || uDocID>uLast )
  11219. return NULL;
  11220. int64_t iHash = ( ( uDocID - uFirst ) >> m_pDocinfoHash[0] );
  11221. if ( iHash > ( 1 << DOCINFO_HASH_BITS ) ) // possible in case of broken data, for instance
  11222. return NULL;
  11223. iStart = m_pDocinfoHash [ iHash+1 ];
  11224. iEnd = m_pDocinfoHash [ iHash+2 ] - 1;
  11225. }
  11226. if ( uDocID==LOC_ID(iStart) )
  11227. return LOC_ROW(iStart);
  11228. if ( uDocID==LOC_ID(iEnd) )
  11229. return LOC_ROW(iEnd);
  11230. while ( iEnd-iStart>1 )
  11231. {
  11232. // check if nothing found
  11233. if ( uDocID<LOC_ID(iStart) || uDocID>LOC_ID(iEnd) )
  11234. return NULL;
  11235. assert ( uDocID > LOC_ID(iStart) );
  11236. assert ( uDocID < LOC_ID(iEnd) );
  11237. int64_t iMid = iStart + (iEnd-iStart)/2;
  11238. if ( uDocID==LOC_ID(iMid) )
  11239. return LOC_ROW(iMid);
  11240. else if ( uDocID<LOC_ID(iMid) )
  11241. iEnd = iMid;
  11242. else
  11243. iStart = iMid;
  11244. }
  11245. #undef LOC_ID
  11246. #undef LOC_ROW
  11247. return NULL;
  11248. }
  11249. void CSphIndex_VLN::CopyDocinfo ( CSphQueryContext * pCtx, CSphMatch & tMatch, const DWORD * pFound ) const
  11250. {
  11251. if ( !pFound )
  11252. return;
  11253. // setup static pointer
  11254. assert ( DOCINFO2ID(pFound)==tMatch.m_iDocID );
  11255. tMatch.m_pStatic = DOCINFO2ATTRS(pFound);
  11256. // patch if necessary
  11257. if ( pCtx->m_pOverrides )
  11258. ARRAY_FOREACH ( i, (*pCtx->m_pOverrides) )
  11259. {
  11260. const CSphAttrOverride & tOverride = (*pCtx->m_pOverrides)[i]; // shortcut
  11261. const CSphAttrOverride::IdValuePair_t * pEntry = tOverride.m_dValues.BinarySearch (
  11262. bind ( &CSphAttrOverride::IdValuePair_t::m_uDocID ), tMatch.m_iDocID );
  11263. tMatch.SetAttr ( pCtx->m_dOverrideOut[i], pEntry
  11264. ? pEntry->m_uValue
  11265. : sphGetRowAttr ( tMatch.m_pStatic, pCtx->m_dOverrideIn[i] ) );
  11266. }
  11267. }
  11268. static inline void CalcContextItems ( CSphMatch & tMatch, const CSphVector<CSphQueryContext::CalcItem_t> & dItems )
  11269. {
  11270. ARRAY_FOREACH ( i, dItems )
  11271. {
  11272. const CSphQueryContext::CalcItem_t & tCalc = dItems[i];
  11273. if ( tCalc.m_eType==SPH_ATTR_INTEGER )
  11274. tMatch.SetAttr ( tCalc.m_tLoc, tCalc.m_pExpr->IntEval(tMatch) );
  11275. else if ( tCalc.m_eType==SPH_ATTR_BIGINT || tCalc.m_eType==SPH_ATTR_JSON_FIELD )
  11276. tMatch.SetAttr ( tCalc.m_tLoc, tCalc.m_pExpr->Int64Eval(tMatch) );
  11277. else if ( tCalc.m_eType==SPH_ATTR_STRINGPTR )
  11278. {
  11279. const BYTE * pStr = NULL;
  11280. tCalc.m_pExpr->StringEval ( tMatch, &pStr );
  11281. tMatch.SetAttr ( tCalc.m_tLoc, (SphAttr_t) pStr ); // FIXME! a potential leak of *previous* value?
  11282. } else if ( tCalc.m_eType==SPH_ATTR_FACTORS )
  11283. tMatch.SetAttr ( tCalc.m_tLoc, (SphAttr_t)tCalc.m_pExpr->FactorEval(tMatch) );
  11284. else
  11285. tMatch.SetAttrFloat ( tCalc.m_tLoc, tCalc.m_pExpr->Eval(tMatch) );
  11286. }
  11287. }
  /// Evaluate the filter-stage expressions (m_dCalcFilter) and store their results into tMatch.
  11288. void CSphQueryContext::CalcFilter ( CSphMatch & tMatch ) const
  11289. {
  11290. CalcContextItems ( tMatch, m_dCalcFilter );
  11291. }
  /// Evaluate the sort-stage expressions (m_dCalcSort) and store their results into tMatch.
  11292. void CSphQueryContext::CalcSort ( CSphMatch & tMatch ) const
  11293. {
  11294. CalcContextItems ( tMatch, m_dCalcSort );
  11295. }
  /// Evaluate the final-stage expressions (m_dCalcFinal) and store their results into tMatch.
  11296. void CSphQueryContext::CalcFinal ( CSphMatch & tMatch ) const
  11297. {
  11298. CalcContextItems ( tMatch, m_dCalcFinal );
  11299. }
  11300. static inline void FreeStrItems ( CSphMatch & tMatch, const CSphVector<CSphQueryContext::CalcItem_t> & dItems )
  11301. {
  11302. if ( !tMatch.m_pDynamic )
  11303. return;
  11304. ARRAY_FOREACH ( i, dItems )
  11305. {
  11306. const CSphQueryContext::CalcItem_t & tCalc = dItems[i];
  11307. switch ( tCalc.m_eType )
  11308. {
  11309. case SPH_ATTR_STRINGPTR:
  11310. {
  11311. CSphString sStr;
  11312. sStr.Adopt ( (char**) (tMatch.m_pDynamic+tCalc.m_tLoc.m_iBitOffset/ROWITEM_BITS));
  11313. }
  11314. break;
  11315. case SPH_ATTR_FACTORS:
  11316. {
  11317. BYTE * pData = (BYTE *)tMatch.GetAttr ( tCalc.m_tLoc );
  11318. delete [] pData;
  11319. tMatch.SetAttr ( tCalc.m_tLoc, 0 );
  11320. }
  11321. break;
  11322. default:
  11323. break;
  11324. }
  11325. }
  11326. }
  /// Release string pointers and factors blobs produced by the filter-stage expressions (m_dCalcFilter).
  11327. void CSphQueryContext::FreeStrFilter ( CSphMatch & tMatch ) const
  11328. {
  11329. FreeStrItems ( tMatch, m_dCalcFilter );
  11330. }
  /// Release string pointers and factors blobs produced by the sort-stage expressions (m_dCalcSort).
  11331. void CSphQueryContext::FreeStrSort ( CSphMatch & tMatch ) const
  11332. {
  11333. FreeStrItems ( tMatch, m_dCalcSort );
  11334. }
  /// Release string pointers and factors blobs produced by the final-stage expressions (m_dCalcFinal).
  11335. void CSphQueryContext::FreeStrFinal ( CSphMatch & tMatch ) const
  11336. {
  11337. FreeStrItems ( tMatch, m_dCalcFinal );
  11338. }
  11339. void CSphQueryContext::ExprCommand ( ESphExprCommand eCmd, void * pArg )
  11340. {
  11341. ARRAY_FOREACH ( i, m_dCalcFilter )
  11342. m_dCalcFilter[i].m_pExpr->Command ( eCmd, pArg );
  11343. ARRAY_FOREACH ( i, m_dCalcSort )
  11344. m_dCalcSort[i].m_pExpr->Command ( eCmd, pArg );
  11345. ARRAY_FOREACH ( i, m_dCalcFinal )
  11346. m_dCalcFinal[i].m_pExpr->Command ( eCmd, pArg );
  11347. }
  11348. void CSphQueryContext::SetStringPool ( const BYTE * pStrings )
  11349. {
  11350. ExprCommand ( SPH_EXPR_SET_STRING_POOL, (void*)pStrings );
  11351. if ( m_pFilter )
  11352. m_pFilter->SetStringStorage ( pStrings );
  11353. if ( m_pWeightFilter )
  11354. m_pWeightFilter->SetStringStorage ( pStrings );
  11355. }
  11356. void CSphQueryContext::SetMVAPool ( const DWORD * pMva )
  11357. {
  11358. ExprCommand ( SPH_EXPR_SET_MVA_POOL, (void*)pMva );
  11359. if ( m_pFilter )
  11360. m_pFilter->SetMVAStorage ( pMva );
  11361. if ( m_pWeightFilter )
  11362. m_pWeightFilter->SetMVAStorage ( pMva );
  11363. }
  /// Pass pData to every expression of this context via the SPH_EXPR_SET_EXTRA_DATA command.
  11364. void CSphQueryContext::SetupExtraData ( ISphExtra * pData )
  11365. {
  11366. ExprCommand ( SPH_EXPR_SET_EXTRA_DATA, pData );
  11367. }
  11368. void CSphIndex_VLN::MatchExtended ( CSphQueryContext * pCtx, const CSphQuery * pQuery,
  11369. int iSorters, ISphMatchSorter ** ppSorters, ISphRanker * pRanker, int iTag ) const
  11370. {
  11371. CSphQueryProfile * pProfile = pCtx->m_pProfile;
  11372. int iCutoff = pQuery->m_iCutoff;
  11373. if ( iCutoff<=0 )
  11374. iCutoff = -1;
  11375. // do searching
  11376. CSphMatch * pMatch = pRanker->GetMatchesBuffer();
  11377. for ( ;; )
  11378. {
  11379. // ranker does profile switches internally
  11380. int iMatches = pRanker->GetMatches();
  11381. if ( iMatches<=0 )
  11382. break;
  11383. if ( pProfile )
  11384. pProfile->Switch ( SPH_QSTATE_SORT );
  11385. for ( int i=0; i<iMatches; i++ )
  11386. {
  11387. if ( pCtx->m_bLookupSort )
  11388. CopyDocinfo ( pCtx, pMatch[i], FindDocinfo ( pMatch[i].m_iDocID ) );
  11389. pCtx->CalcSort ( pMatch[i] );
  11390. if ( pCtx->m_pWeightFilter && !pCtx->m_pWeightFilter->Eval ( pMatch[i] ) )
  11391. {
  11392. pCtx->FreeStrSort ( pMatch[i] );
  11393. continue;
  11394. }
  11395. pMatch[i].m_iTag = iTag;
  11396. bool bRand = false;
  11397. bool bNewMatch = false;
  11398. for ( int iSorter=0; iSorter<iSorters; iSorter++ )
  11399. {
  11400. // all non-random sorters are in the beginning,
  11401. // so we can avoid the simple 'first-element' assertion
  11402. if ( !bRand && ppSorters[iSorter]->m_bRandomize )
  11403. {
  11404. bRand = true;
  11405. pMatch[i].m_iWeight = ( sphRand() & 0xffff );
  11406. if ( pCtx->m_pWeightFilter && !pCtx->m_pWeightFilter->Eval ( pMatch[i] ) )
  11407. break;
  11408. }
  11409. bNewMatch |= ppSorters[iSorter]->Push ( pMatch[i] );
  11410. if ( pCtx->m_bPackedFactors )
  11411. {
  11412. pRanker->ExtraData ( EXTRA_SET_MATCHPUSHED, (void**)&(ppSorters[iSorter]->m_iJustPushed) );
  11413. pRanker->ExtraData ( EXTRA_SET_MATCHPOPPED, (void**)&(ppSorters[iSorter]->m_dJustPopped) );
  11414. }
  11415. }
  11416. pCtx->FreeStrSort ( pMatch[i] );
  11417. if ( bNewMatch )
  11418. if ( --iCutoff==0 )
  11419. break;
  11420. }
  11421. if ( iCutoff==0 )
  11422. break;
  11423. }
  11424. if ( pProfile )
  11425. pProfile->Switch ( SPH_QSTATE_UNKNOWN );
  11426. }
  11427. //////////////////////////////////////////////////////////////////////////
  11428. bool CSphIndex_VLN::MultiScan ( const CSphQuery * pQuery, CSphQueryResult * pResult,
  11429. int iSorters, ISphMatchSorter ** ppSorters, const CSphVector<CSphFilterSettings> * pExtraFilters, int iTag, bool bFactors ) const
  11430. {
  11431. assert ( pQuery->m_sQuery.IsEmpty() );
  11432. assert ( iTag>=0 );
  11433. // check if index is ready
  11434. if ( !m_pPreread || !*m_pPreread )
  11435. {
  11436. pResult->m_sError = "index not preread";
  11437. return false;
  11438. }
  11439. // check if index supports scans
  11440. if ( m_tSettings.m_eDocinfo!=SPH_DOCINFO_EXTERN || !m_tSchema.GetAttrsCount() )
  11441. {
  11442. pResult->m_sError = "fullscan requires extern docinfo";
  11443. return false;
  11444. }
  11445. if ( bFactors )
  11446. pResult->m_sWarning.SetSprintf ( "packedfactors() will not work with a fullscan; you need to specify a query" );
  11447. // check if index has data
  11448. if ( m_bIsEmpty || m_iDocinfo<=0 || m_pDocinfo.IsEmpty() )
  11449. return true;
  11450. // start counting
  11451. int64_t tmQueryStart = sphMicroTimer();
  11452. // select the sorter with max schema
  11453. int iMaxSchemaSize = -1;
  11454. int iMaxSchemaIndex = -1;
  11455. for ( int i=0; i<iSorters; i++ )
  11456. if ( ppSorters[i]->GetSchema().GetRowSize() > iMaxSchemaSize )
  11457. {
  11458. iMaxSchemaSize = ppSorters[i]->GetSchema().GetRowSize();
  11459. iMaxSchemaIndex = i;
  11460. }
  11461. // setup calculations and result schema
  11462. CSphQueryContext tCtx;
  11463. if ( !tCtx.SetupCalc ( pResult, ppSorters[iMaxSchemaIndex]->GetSchema(), m_tSchema, GetMVAPool() ) )
  11464. return false;
  11465. // set string pool for string on_sort expression fix up
  11466. tCtx.SetStringPool ( m_pStrings.GetWritePtr() );
  11467. // setup filters
  11468. if ( !tCtx.CreateFilters ( true, &pQuery->m_dFilters, pResult->m_tSchema, GetMVAPool(), m_pStrings.GetWritePtr(), pResult->m_sError ) )
  11469. return false;
  11470. if ( !tCtx.CreateFilters ( true, pExtraFilters, pResult->m_tSchema, GetMVAPool(), m_pStrings.GetWritePtr(), pResult->m_sError ) )
  11471. return false;
  11472. // check if we can early reject the whole index
  11473. if ( tCtx.m_pFilter && m_iDocinfoIndex )
  11474. {
  11475. DWORD uStride = DOCINFO_IDSIZE + m_tSchema.GetRowSize();
  11476. DWORD * pMinEntry = const_cast<DWORD*> ( &m_pDocinfoIndex [ m_iDocinfoIndex*uStride*2 ] );
  11477. DWORD * pMaxEntry = pMinEntry + uStride;
  11478. if ( !tCtx.m_pFilter->EvalBlock ( pMinEntry, pMaxEntry ) )
  11479. {
  11480. pResult->m_iQueryTime += (int)( ( sphMicroTimer()-tmQueryStart )/1000 );
  11481. return true;
  11482. }
  11483. }
  11484. // setup lookup
  11485. tCtx.m_bLookupFilter = false;
  11486. tCtx.m_bLookupSort = true;
  11487. // setup sorters vs. MVA
  11488. for ( int i=0; i<iSorters; i++ )
  11489. {
  11490. (ppSorters[i])->SetMVAPool ( m_pMva.GetWritePtr() );
  11491. (ppSorters[i])->SetStringPool ( m_pStrings.GetWritePtr() );
  11492. }
  11493. // setup overrides
  11494. if ( !tCtx.SetupOverrides ( pQuery, pResult, m_tSchema ) )
  11495. return false;
  11496. // prepare to work them rows
  11497. bool bRandomize = ppSorters[0]->m_bRandomize;
  11498. CSphMatch tMatch;
  11499. tMatch.Reset ( pResult->m_tSchema.GetDynamicSize() );
  11500. tMatch.m_iWeight = pQuery->GetIndexWeight ( m_sIndexName.cstr() );
  11501. tMatch.m_iTag = tCtx.m_dCalcFinal.GetLength() ? -1 : iTag;
  11502. // optimize direct lookups by id
  11503. // run full scan with block and row filtering for everything else
  11504. if ( pQuery->m_dFilters.GetLength()==1
  11505. && pQuery->m_dFilters[0].m_eType==SPH_FILTER_VALUES
  11506. && pQuery->m_dFilters[0].m_bExclude==false
  11507. && pQuery->m_dFilters[0].m_sAttrName=="@id"
  11508. && !pExtraFilters )
  11509. {
  11510. // run id lookups
  11511. for ( int i=0; i<pQuery->m_dFilters[0].GetNumValues(); i++ )
  11512. {
  11513. SphDocID_t uDocid = (SphDocID_t) pQuery->m_dFilters[0].GetValue(i);
  11514. const DWORD * pRow = FindDocinfo ( uDocid );
  11515. if ( !pRow )
  11516. continue;
  11517. assert ( uDocid==DOCINFO2ID(pRow) );
  11518. tMatch.m_iDocID = uDocid;
  11519. CopyDocinfo ( &tCtx, tMatch, pRow );
  11520. // submit match to sorters
  11521. tCtx.CalcSort ( tMatch );
  11522. if ( bRandomize )
  11523. tMatch.m_iWeight = ( sphRand() & 0xffff );
  11524. for ( int iSorter=0; iSorter<iSorters; iSorter++ )
  11525. ppSorters[iSorter]->Push ( tMatch );
  11526. // stringptr expressions should be duplicated (or taken over) at this point
  11527. tCtx.FreeStrSort ( tMatch );
  11528. }
  11529. } else
  11530. {
  11531. // do scan
  11532. DWORD uStride = DOCINFO_IDSIZE + m_tSchema.GetRowSize();
  11533. int64_t iStart = pQuery->m_bReverseScan ? ( m_iDocinfoIndex-1 ) : 0;
  11534. int iStep = pQuery->m_bReverseScan ? -1 : 1;
  11535. int iCutoff = pQuery->m_iCutoff;
  11536. if ( iCutoff<=0 )
  11537. iCutoff = -1;
  11538. for ( int64_t iIndexEntry=iStart; iIndexEntry<m_iDocinfoIndex; iIndexEntry+=iStep )
  11539. {
  11540. // block-level filtering
  11541. const DWORD * pMin = &m_pDocinfoIndex[ iIndexEntry*uStride*2 ];
  11542. const DWORD * pMax = pMin + uStride;
  11543. // check applicable filters
  11544. if ( tCtx.m_pFilter && !tCtx.m_pFilter->EvalBlock ( pMin, pMax ) )
  11545. continue;
  11546. // row-level filtering
  11547. const DWORD * pBlockStart = &m_pDocinfo [ iIndexEntry*uStride*DOCINFO_INDEX_FREQ ];
  11548. const DWORD * pBlockEnd = &m_pDocinfo [ ( Min ( ( iIndexEntry+1 )*DOCINFO_INDEX_FREQ, m_iDocinfo ) - 1 ) * uStride ];
  11549. if ( !tCtx.m_pOverrides && tCtx.m_pFilter && !pQuery->m_iCutoff
  11550. && !tCtx.m_dCalcFilter.GetLength() && !tCtx.m_dCalcSort.GetLength() )
  11551. {
  11552. // kinda fastpath
  11553. for ( const DWORD * pDocinfo=pBlockStart; pDocinfo<=pBlockEnd; pDocinfo+=uStride )
  11554. {
  11555. tMatch.m_iDocID = DOCINFO2ID ( pDocinfo );
  11556. tMatch.m_pStatic = DOCINFO2ATTRS ( pDocinfo );
  11557. if ( !tCtx.m_pFilter->Eval ( tMatch ) )
  11558. {
  11559. tCtx.FreeStrFilter ( tMatch );
  11560. continue;
  11561. }
  11562. if ( bRandomize )
  11563. tMatch.m_iWeight = ( sphRand() & 0xffff );
  11564. for ( int iSorter=0; iSorter<iSorters; iSorter++ )
  11565. ppSorters[iSorter]->Push ( tMatch );
  11566. // stringptr expressions should be duplicated (or taken over) at this point
  11567. tCtx.FreeStrFilter ( tMatch );
  11568. }
  11569. } else
  11570. {
  11571. // generic path
  11572. for ( const DWORD * pDocinfo=pBlockStart; pDocinfo<=pBlockEnd; pDocinfo+=uStride )
  11573. {
  11574. tMatch.m_iDocID = DOCINFO2ID ( pDocinfo );
  11575. if ( !tCtx.m_pOverrides )
  11576. tMatch.m_pStatic = DOCINFO2ATTRS ( pDocinfo );
  11577. else
  11578. CopyDocinfo ( &tCtx, tMatch, pDocinfo );
  11579. // early filter only (no late filters in full-scan because of no @weight)
  11580. tCtx.CalcFilter ( tMatch );
  11581. if ( tCtx.m_pFilter && !tCtx.m_pFilter->Eval ( tMatch ) )
  11582. {
  11583. tCtx.FreeStrFilter ( tMatch );
  11584. continue;
  11585. }
  11586. // submit match to sorters
  11587. tCtx.CalcSort ( tMatch );
  11588. if ( bRandomize )
  11589. tMatch.m_iWeight = ( sphRand() & 0xffff );
  11590. bool bNewMatch = false;
  11591. for ( int iSorter=0; iSorter<iSorters; iSorter++ )
  11592. bNewMatch |= ppSorters[iSorter]->Push ( tMatch );
  11593. // stringptr expressions should be duplicated (or taken over) at this point
  11594. tCtx.FreeStrFilter ( tMatch );
  11595. tCtx.FreeStrSort ( tMatch );
  11596. // handle cutoff
  11597. if ( bNewMatch && --iCutoff==0 )
  11598. {
  11599. iIndexEntry = m_iDocinfoIndex; // outer break
  11600. break;
  11601. }
  11602. }
  11603. }
  11604. }
  11605. }
  11606. // do final expression calculations
  11607. if ( tCtx.m_dCalcFinal.GetLength() )
  11608. for ( int iSorter=0; iSorter<iSorters; iSorter++ )
  11609. {
  11610. ISphMatchSorter * pTop = ppSorters[iSorter];
  11611. CSphMatch * const pHead = pTop->Finalize();
  11612. const int iCount = pTop->GetLength ();
  11613. if ( !iCount )
  11614. continue;
  11615. CSphMatch * const pTail = pHead + iCount;
  11616. for ( CSphMatch * pCur=pHead; pCur<pTail; pCur++ )
  11617. {
  11618. if ( pCur->m_iTag<0 )
  11619. {
  11620. tCtx.CalcFinal ( *pCur );
  11621. pCur->m_iTag = iTag;
  11622. }
  11623. }
  11624. }
  11625. // done
  11626. pResult->m_pMva = m_pMva.GetWritePtr();
  11627. pResult->m_pStrings = m_pStrings.GetWritePtr();
  11628. pResult->m_iQueryTime += (int)( ( sphMicroTimer()-tmQueryStart )/1000 );
  11629. return true;
  11630. }
  11631. //////////////////////////////////////////////////////////////////////////////
  11632. ISphQword * DiskIndexQwordSetup_c::QwordSpawn ( const XQKeyword_t & tWord ) const
  11633. {
  11634. WITH_QWORD ( m_pIndex, false, Qword, return new Qword ( tWord.m_bExpanded, tWord.m_bExcluded ) );
  11635. return NULL;
  11636. }
  11637. bool DiskIndexQwordSetup_c::QwordSetup ( ISphQword * pWord ) const
  11638. {
  11639. WITH_QWORD ( m_pIndex, false, Qword, return Setup<Qword> ( pWord ) );
  11640. return false;
  11641. }
  11642. template < class Qword >
  11643. bool DiskIndexQwordSetup_c::Setup ( ISphQword * pWord ) const
  11644. {
  11645. Qword * pMyWord = dynamic_cast<Qword*> ( pWord );
  11646. if ( !pMyWord )
  11647. return false;
  11648. Qword & tWord = *pMyWord;
  11649. // setup attrs
  11650. tWord.m_tDoc.Reset ( m_iDynamicRowitems );
  11651. tWord.m_iMinID = m_iMinDocid;
  11652. tWord.m_tDoc.m_iDocID = m_iMinDocid;
  11653. if ( m_eDocinfo==SPH_DOCINFO_INLINE )
  11654. {
  11655. tWord.m_iInlineAttrs = m_iInlineRowitems;
  11656. tWord.m_pInlineFixup = m_pMinRow;
  11657. } else
  11658. {
  11659. tWord.m_iInlineAttrs = 0;
  11660. tWord.m_pInlineFixup = NULL;
  11661. }
  11662. // setup stats
  11663. tWord.m_iDocs = 0;
  11664. tWord.m_iHits = 0;
  11665. CSphIndex_VLN * pIndex = (CSphIndex_VLN *)m_pIndex;
  11666. // !COMMIT FIXME!
  11667. // the below stuff really belongs in wordlist
  11668. // which in turn really belongs in dictreader
  11669. // which in turn might or might not be a part of dict
  11670. // binary search through checkpoints for a one whose range matches word ID
  11671. assert ( pIndex->m_pPreread && *pIndex->m_pPreread );
  11672. assert ( !pIndex->m_bPreloadWordlist || !pIndex->m_tWordlist.m_pBuf.IsEmpty() );
  11673. // empty index?
  11674. if ( !pIndex->m_tWordlist.m_dCheckpoints.GetLength() )
  11675. return false;
  11676. const char * sWord = tWord.m_sDictWord.cstr();
  11677. const bool bWordDict = pIndex->m_pDict->GetSettings().m_bWordDict;
  11678. int iWordLen = sWord ? strlen ( sWord ) : 0;
  11679. if ( pIndex->m_bEnableStar && bWordDict && tWord.m_sWord.Ends("*") )
  11680. {
  11681. iWordLen = Max ( iWordLen-1, 0 );
  11682. // might match either infix or prefix
  11683. int iMinLen = Max ( pIndex->m_tSettings.m_iMinPrefixLen, pIndex->m_tSettings.m_iMinInfixLen );
  11684. if ( pIndex->m_tSettings.m_iMinPrefixLen )
  11685. iMinLen = Min ( iMinLen, pIndex->m_tSettings.m_iMinPrefixLen );
  11686. if ( pIndex->m_tSettings.m_iMinInfixLen )
  11687. iMinLen = Min ( iMinLen, pIndex->m_tSettings.m_iMinInfixLen );
  11688. // bail out term shorter than prefix or infix allowed
  11689. if ( iWordLen<iMinLen )
  11690. return false;
  11691. }
  11692. // leading special symbols trimming
  11693. if ( pIndex->m_bEnableStar && bWordDict && tWord.m_sDictWord.Begins("*") )
  11694. {
  11695. sWord++;
  11696. iWordLen = Max ( iWordLen-1, 0 );
  11697. // bail out term shorter than infix allowed
  11698. if ( iWordLen<pIndex->m_tSettings.m_iMinInfixLen )
  11699. return false;
  11700. }
  11701. const CSphWordlistCheckpoint * pCheckpoint = pIndex->m_tWordlist.FindCheckpoint ( sWord, iWordLen, tWord.m_iWordID, false );
  11702. if ( !pCheckpoint )
  11703. return false;
  11704. // decode wordlist chunk
  11705. const BYTE * pBuf = pIndex->m_tWordlist.AcquireDict ( pCheckpoint, m_tWordlist.GetFD(), m_pDictBuf );
  11706. assert ( pBuf );
  11707. CSphDictEntry tRes;
  11708. if ( bWordDict )
  11709. {
  11710. KeywordsBlockReader_c tCtx ( pBuf, m_pSkips!=NULL );
  11711. while ( tCtx.UnpackWord() )
  11712. {
  11713. // block is sorted
  11714. // so once keywords are greater than the reference word, no more matches
  11715. assert ( tCtx.GetWordLen()>0 );
  11716. int iCmp = sphDictCmpStrictly ( sWord, iWordLen, tCtx.GetWord(), tCtx.GetWordLen() );
  11717. if ( iCmp<0 )
  11718. return false;
  11719. if ( iCmp==0 )
  11720. break;
  11721. }
  11722. if ( tCtx.GetWordLen()<=0 )
  11723. return false;
  11724. tRes = tCtx;
  11725. } else
  11726. {
  11727. if ( !pIndex->m_tWordlist.GetWord ( pBuf, tWord.m_iWordID, tRes ) )
  11728. return false;
  11729. }
  11730. const ESphHitless eMode = pIndex->m_tSettings.m_eHitless;
  11731. tWord.m_iDocs = eMode==SPH_HITLESS_SOME ? ( tRes.m_iDocs & 0x7FFFFFFF ) : tRes.m_iDocs;
  11732. tWord.m_iHits = tRes.m_iHits;
  11733. tWord.m_bHasHitlist =
  11734. ( eMode==SPH_HITLESS_NONE ) ||
  11735. ( eMode==SPH_HITLESS_SOME && !( tRes.m_iDocs & 0x80000000 ) );
  11736. if ( m_bSetupReaders )
  11737. {
  11738. tWord.m_rdDoclist.SetBuffers ( g_iReadBuffer, g_iReadUnhinted );
  11739. tWord.m_rdDoclist.SetFile ( m_tDoclist );
  11740. tWord.m_rdDoclist.m_pProfile = m_pProfile;
  11741. tWord.m_rdDoclist.m_eProfileState = SPH_QSTATE_READ_DOCS;
  11742. // read in skiplist
  11743. // OPTIMIZE? maybe cache hot decompressed lists?
  11744. // OPTIMIZE? maybe add an option to decompress on preload instead?
  11745. if ( m_pSkips && tRes.m_iDocs>SPH_SKIPLIST_BLOCK )
  11746. {
  11747. const BYTE * pSkip = m_pSkips + tRes.m_iSkiplistOffset;
  11748. SkiplistEntry_t & t = tWord.m_dSkiplist.Add();
  11749. t.m_iBaseDocid = 0;
  11750. t.m_iOffset = tRes.m_iDoclistOffset;
  11751. t.m_iBaseHitlistPos = 0;
  11752. for ( int i=1; i<( tWord.m_iDocs/SPH_SKIPLIST_BLOCK ); i++ )
  11753. {
  11754. SkiplistEntry_t & t = tWord.m_dSkiplist.Add();
  11755. SkiplistEntry_t & p = tWord.m_dSkiplist [ tWord.m_dSkiplist.GetLength()-2 ];
  11756. t.m_iBaseDocid = p.m_iBaseDocid + SPH_SKIPLIST_BLOCK + (SphDocID_t) sphUnzipOffset ( pSkip );
  11757. t.m_iOffset = p.m_iOffset + 4*SPH_SKIPLIST_BLOCK + sphUnzipOffset ( pSkip );
  11758. t.m_iBaseHitlistPos = p.m_iBaseHitlistPos + sphUnzipOffset ( pSkip );
  11759. }
  11760. }
  11761. tWord.m_rdDoclist.SeekTo ( tRes.m_iDoclistOffset, tRes.m_iDoclistHint );
  11762. tWord.m_rdHitlist.SetBuffers ( g_iReadBuffer, g_iReadUnhinted );
  11763. tWord.m_rdHitlist.SetFile ( m_tHitlist );
  11764. tWord.m_rdHitlist.m_pProfile = m_pProfile;
  11765. tWord.m_rdHitlist.m_eProfileState = SPH_QSTATE_READ_HITS;
  11766. }
  11767. return true;
  11768. }
  11769. //////////////////////////////////////////////////////////////////////////////
  11770. bool CSphIndex_VLN::Lock ()
  11771. {
  11772. CSphString sName = GetIndexFileName("spl");
  11773. sphLogDebug ( "Locking the index via file %s", sName.cstr() );
  11774. if ( m_iLockFD<0 )
  11775. {
  11776. m_iLockFD = ::open ( sName.cstr(), SPH_O_NEW, 0644 );
  11777. if ( m_iLockFD<0 )
  11778. {
  11779. m_sLastError.SetSprintf ( "failed to open %s: %s", sName.cstr(), strerror(errno) );
  11780. sphLogDebug ( "failed to open %s: %s", sName.cstr(), strerror(errno) );
  11781. return false;
  11782. }
  11783. }
  11784. if ( !sphLockEx ( m_iLockFD, false ) )
  11785. {
  11786. m_sLastError.SetSprintf ( "failed to lock %s: %s", sName.cstr(), strerror(errno) );
  11787. ::close ( m_iLockFD );
  11788. m_iLockFD = -1;
  11789. return false;
  11790. }
  11791. sphLogDebug ( "lock %s success", sName.cstr() );
  11792. return true;
  11793. }
  11794. void CSphIndex_VLN::Unlock()
  11795. {
  11796. CSphString sName = GetIndexFileName("spl");
  11797. sphLogDebug ( "Unlocking the index (lock %s)", sName.cstr() );
  11798. if ( m_iLockFD>=0 )
  11799. {
  11800. sphLogDebug ( "File ID ok, closing lock FD %d, unlinking %s", m_iLockFD, sName.cstr() );
  11801. sphLockUn ( m_iLockFD );
  11802. ::close ( m_iLockFD );
  11803. ::unlink ( sName.cstr() );
  11804. m_iLockFD = -1;
  11805. }
  11806. }
  11807. bool CSphIndex_VLN::Mlock ()
  11808. {
  11809. bool bRes = true;
  11810. bRes &= m_pDocinfo.Mlock ( "docinfo", m_sLastError );
  11811. if ( m_bPreloadWordlist )
  11812. bRes &= m_tWordlist.m_pBuf.Mlock ( "wordlist", m_sLastError );
  11813. bRes &= m_pMva.Mlock ( "mva", m_sLastError );
  11814. bRes &= m_pStrings.Mlock ( "strings", m_sLastError );
  11815. return bRes;
  11816. }
  11817. void CSphIndex_VLN::Dealloc ()
  11818. {
  11819. if ( !m_bPreallocated )
  11820. return;
  11821. m_tDoclistFile.Close ();
  11822. m_tHitlistFile.Close ();
  11823. m_pDocinfo.Reset ();
  11824. m_pDocinfoHash.Reset ();
  11825. m_pMva.Reset ();
  11826. m_pStrings.Reset ();
  11827. m_pKillList.Reset ();
  11828. m_tWordlist.Reset ();
  11829. m_pSkiplists.Reset ();
  11830. m_iDocinfo = 0;
  11831. m_uMinMaxIndex = 0;
  11832. m_tSettings.m_eDocinfo = SPH_DOCINFO_NONE;
  11833. m_bPreallocated = false;
  11834. SafeDelete ( m_pTokenizer );
  11835. SafeDelete ( m_pDict );
  11836. if ( m_iIndexTag>=0 && g_pMvaArena )
  11837. g_MvaArena.TaggedFreeTag ( m_iIndexTag );
  11838. m_iIndexTag = -1;
  11839. m_pPreread = NULL;
  11840. m_pAttrsStatus = NULL;
  11841. #ifndef NDEBUG
  11842. m_dShared.Reset ();
  11843. #endif
  11844. }
  11845. void LoadIndexSettings ( CSphIndexSettings & tSettings, CSphReader & tReader, DWORD uVersion )
  11846. {
  11847. if ( uVersion>=8 )
  11848. {
  11849. tSettings.m_iMinPrefixLen = tReader.GetDword ();
  11850. tSettings.m_iMinInfixLen = tReader.GetDword ();
  11851. } else if ( uVersion>=6 )
  11852. {
  11853. bool bPrefixesOnly = ( tReader.GetByte ()!=0 );
  11854. tSettings.m_iMinPrefixLen = tReader.GetDword ();
  11855. tSettings.m_iMinInfixLen = 0;
  11856. if ( !bPrefixesOnly )
  11857. Swap ( tSettings.m_iMinPrefixLen, tSettings.m_iMinInfixLen );
  11858. }
  11859. if ( uVersion>=38 )
  11860. tSettings.m_iMaxSubstringLen = tReader.GetDword();
  11861. if ( uVersion>=9 )
  11862. {
  11863. tSettings.m_bHtmlStrip = !!tReader.GetByte ();
  11864. tSettings.m_sHtmlIndexAttrs = tReader.GetString ();
  11865. tSettings.m_sHtmlRemoveElements = tReader.GetString ();
  11866. }
  11867. if ( uVersion>=12 )
  11868. tSettings.m_bIndexExactWords = !!tReader.GetByte ();
  11869. if ( uVersion>=18 )
  11870. tSettings.m_eHitless = (ESphHitless)tReader.GetDword();
  11871. if ( uVersion>=19 )
  11872. tSettings.m_eHitFormat = (ESphHitFormat)tReader.GetDword();
  11873. else // force plain format for old indices
  11874. tSettings.m_eHitFormat = SPH_HIT_FORMAT_PLAIN;
  11875. if ( uVersion>=21 )
  11876. tSettings.m_bIndexSP = !!tReader.GetByte();
  11877. if ( uVersion>=22 )
  11878. {
  11879. tSettings.m_sZones = tReader.GetString();
  11880. if ( uVersion<25 && !tSettings.m_sZones.IsEmpty() )
  11881. tSettings.m_sZones.SetSprintf ( "%s*", tSettings.m_sZones.cstr() );
  11882. }
  11883. if ( uVersion>=23 )
  11884. {
  11885. tSettings.m_iBoundaryStep = (int)tReader.GetDword();
  11886. tSettings.m_iStopwordStep = (int)tReader.GetDword();
  11887. }
  11888. if ( uVersion>=28 )
  11889. tSettings.m_iOvershortStep = (int)tReader.GetDword();
  11890. if ( uVersion>=30 )
  11891. tSettings.m_iEmbeddedLimit = (int)tReader.GetDword();
  11892. if ( uVersion>=32 )
  11893. {
  11894. tSettings.m_eBigramIndex = (ESphBigram)tReader.GetByte();
  11895. tSettings.m_sBigramWords = tReader.GetString();
  11896. }
  11897. if ( uVersion>=35 )
  11898. tSettings.m_bIndexFieldLens = ( tReader.GetByte()!=0 );
  11899. }
  11900. bool CSphIndex_VLN::LoadHeader ( const char * sHeaderName, bool bStripPath, CSphString & sWarning )
  11901. {
  11902. const int MAX_HEADER_SIZE = 32768;
  11903. CSphFixedVector<BYTE> dCacheInfo ( MAX_HEADER_SIZE );
  11904. CSphAutoreader rdInfo ( dCacheInfo.Begin(), MAX_HEADER_SIZE ); // to avoid mallocs
  11905. if ( !rdInfo.Open ( sHeaderName, m_sLastError ) )
  11906. return false;
  11907. // version
  11908. DWORD uHeader = rdInfo.GetDword ();
  11909. if ( uHeader!=INDEX_MAGIC_HEADER )
  11910. {
  11911. m_sLastError.SetSprintf ( "%s is invalid header file (too old index version?)", sHeaderName );
  11912. return false;
  11913. }
  11914. m_uVersion = rdInfo.GetDword();
  11915. if ( m_uVersion==0 || m_uVersion>INDEX_FORMAT_VERSION )
  11916. {
  11917. m_sLastError.SetSprintf ( "%s is v.%d, binary is v.%d", sHeaderName, m_uVersion, INDEX_FORMAT_VERSION );
  11918. return false;
  11919. }
  11920. // bits
  11921. m_bUse64 = false;
  11922. if ( m_uVersion>=2 )
  11923. m_bUse64 = ( rdInfo.GetDword ()!=0 );
  11924. if ( m_bUse64!=USE_64BIT )
  11925. {
  11926. #if USE_64BIT
  11927. // TODO: may be do this param conditional and push it into the config?
  11928. m_bId32to64 = true;
  11929. #else
  11930. m_sLastError.SetSprintf ( "'%s' is id%d, and this binary is id%d",
  11931. GetIndexFileName("sph").cstr(),
  11932. m_bUse64 ? 64 : 32, USE_64BIT ? 64 : 32 );
  11933. return false;
  11934. #endif
  11935. }
  11936. // skiplists
  11937. m_bHaveSkips = ( m_uVersion>=31 );
  11938. // docinfo
  11939. m_tSettings.m_eDocinfo = (ESphDocinfo) rdInfo.GetDword();
  11940. // schema
  11941. // 4th arg means that inline attributes need be dynamic in searching time too
  11942. ReadSchema ( rdInfo, m_tSchema, m_uVersion, m_tSettings.m_eDocinfo==SPH_DOCINFO_INLINE );
  11943. // check schema for dupes
  11944. for ( int iAttr=1; iAttr<m_tSchema.GetAttrsCount(); iAttr++ )
  11945. {
  11946. const CSphColumnInfo & tCol = m_tSchema.GetAttr(iAttr);
  11947. for ( int i=0; i<iAttr; i++ )
  11948. if ( m_tSchema.GetAttr(i).m_sName==tCol.m_sName )
  11949. sWarning.SetSprintf ( "duplicate attribute name: %s", tCol.m_sName.cstr() );
  11950. }
  11951. // in case of *fork rotation we reuse min match from 1st rotated index ( it could be less than my size and inline ( m_pDynamic ) )
  11952. // min doc
  11953. m_dMinRow.Reset ( m_tSchema.GetRowSize() );
  11954. if ( m_uVersion>=2 )
  11955. m_iMinDocid = (SphDocID_t) rdInfo.GetOffset (); // v2+; losing high bits when !USE_64 is intentional, check is performed on bUse64 above
  11956. else
  11957. m_iMinDocid = rdInfo.GetDword(); // v1
  11958. if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_INLINE )
  11959. rdInfo.GetBytes ( m_dMinRow.Begin(), sizeof(CSphRowitem)*m_tSchema.GetRowSize() );
  11960. // dictionary header (wordlist checkpoints, infix blocks, etc)
  11961. m_tWordlist.m_iDictCheckpointsOffset = rdInfo.GetOffset();
  11962. m_tWordlist.m_iDictCheckpoints = rdInfo.GetDword();
  11963. if ( m_uVersion>=27 )
  11964. {
  11965. m_tWordlist.m_iInfixCodepointBytes = rdInfo.GetByte();
  11966. m_tWordlist.m_iInfixBlocksOffset = rdInfo.GetDword();
  11967. }
  11968. if ( m_uVersion>=34 )
  11969. m_tWordlist.m_iInfixBlocksWordsSize = rdInfo.GetDword();
  11970. m_tWordlist.m_dCheckpoints.Reset ( m_tWordlist.m_iDictCheckpoints );
  11971. // index stats
  11972. m_tStats.m_iTotalDocuments = rdInfo.GetDword ();
  11973. m_tStats.m_iTotalBytes = rdInfo.GetOffset ();
  11974. LoadIndexSettings ( m_tSettings, rdInfo, m_uVersion );
  11975. if ( m_uVersion<9 )
  11976. m_bStripperInited = false;
  11977. if ( m_uVersion>=9 )
  11978. {
  11979. CSphEmbeddedFiles tEmbeddedFiles;
  11980. // tokenizer stuff
  11981. CSphTokenizerSettings tSettings;
  11982. LoadTokenizerSettings ( rdInfo, tSettings, tEmbeddedFiles, m_uVersion, sWarning );
  11983. if ( bStripPath )
  11984. StripPath ( tSettings.m_sSynonymsFile );
  11985. ISphTokenizer * pTokenizer = ISphTokenizer::Create ( tSettings, &tEmbeddedFiles, m_sLastError );
  11986. if ( !pTokenizer )
  11987. return false;
  11988. // dictionary stuff
  11989. CSphDictSettings tDictSettings;
  11990. LoadDictionarySettings ( rdInfo, tDictSettings, tEmbeddedFiles, m_uVersion, sWarning );
  11991. if ( m_bId32to64 )
  11992. tDictSettings.m_bCrc32 = true;
  11993. if ( bStripPath )
  11994. {
  11995. StripPath ( tDictSettings.m_sStopwords );
  11996. ARRAY_FOREACH ( i, tDictSettings.m_dWordforms )
  11997. StripPath ( tDictSettings.m_dWordforms[i] );
  11998. }
  11999. CSphDict * pDict = tDictSettings.m_bWordDict
  12000. ? sphCreateDictionaryKeywords ( tDictSettings, &tEmbeddedFiles, pTokenizer, m_sIndexName.cstr(), m_sLastError )
  12001. : sphCreateDictionaryCRC ( tDictSettings, &tEmbeddedFiles, pTokenizer, m_sIndexName.cstr(), m_sLastError );
  12002. if ( !pDict )
  12003. return false;
  12004. if ( tDictSettings.m_sMorphFingerprint!=pDict->GetMorphDataFingerprint() )
  12005. sWarning.SetSprintf ( "different lemmatizer dictionaries (index='%s', current='%s')",
  12006. tDictSettings.m_sMorphFingerprint.cstr(),
  12007. pDict->GetMorphDataFingerprint().cstr() );
  12008. SetDictionary ( pDict );
  12009. pTokenizer = ISphTokenizer::CreateMultiformFilter ( pTokenizer, pDict->GetMultiWordforms () );
  12010. SetTokenizer ( pTokenizer );
  12011. SetupQueryTokenizer();
  12012. // initialize AOT if needed
  12013. CSphVector<CSphString> dMorphs;
  12014. sphSplit ( dMorphs, tDictSettings.m_sMorphology.cstr() );
  12015. m_tSettings.m_bAotFilter = ARRAY_ANY ( m_tSettings.m_bAotFilter, dMorphs,
  12016. dMorphs[_any]=="lemmatize_ru_all" );
  12017. if ( m_tSettings.m_bAotFilter )
  12018. {
  12019. CSphString sDictFile;
  12020. sDictFile.SetSprintf ( "%s/ru.pak", g_sLemmatizerBase.cstr() );
  12021. if ( !sphAotInitRu ( sDictFile, m_sLastError ) )
  12022. return false;
  12023. }
  12024. } else
  12025. {
  12026. if ( m_bId32to64 )
  12027. {
  12028. m_sLastError.SetSprintf ( "too old id32 index; can not be loaded by this id64 binary" );
  12029. return false;
  12030. }
  12031. }
  12032. if ( m_uVersion>=10 )
  12033. m_iKillListSize = rdInfo.GetDword ();
  12034. if ( m_uVersion>=33 )
  12035. m_uMinMaxIndex = rdInfo.GetOffset ();
  12036. else if ( m_uVersion>=20 )
  12037. m_uMinMaxIndex = rdInfo.GetDword ();
  12038. if ( m_uVersion>=28 )
  12039. {
  12040. CSphFieldFilterSettings tFieldFilterSettings;
  12041. LoadFieldFilterSettings ( rdInfo, tFieldFilterSettings );
  12042. SetFieldFilter ( sphCreateFieldFilter ( tFieldFilterSettings, sWarning ) );
  12043. }
  12044. if ( m_uVersion>=35 && m_tSettings.m_bIndexFieldLens )
  12045. ARRAY_FOREACH ( i, m_tSchema.m_dFields )
  12046. m_dFieldLens[i] = rdInfo.GetOffset(); // FIXME? ideally 64bit even when off is 32bit..
  12047. // post-load stuff.. for now, bigrams
  12048. CSphIndexSettings & s = m_tSettings;
  12049. if ( s.m_eBigramIndex!=SPH_BIGRAM_NONE && s.m_eBigramIndex!=SPH_BIGRAM_ALL )
  12050. {
  12051. BYTE * pTok;
  12052. m_pTokenizer->SetBuffer ( (BYTE*)s.m_sBigramWords.cstr(), s.m_sBigramWords.Length() );
  12053. while ( ( pTok = m_pTokenizer->GetToken() )!=NULL )
  12054. s.m_dBigramWords.Add() = (const char*)pTok;
  12055. s.m_dBigramWords.Sort();
  12056. }
  12057. if ( rdInfo.GetErrorFlag() )
  12058. m_sLastError.SetSprintf ( "%s: failed to parse header (unexpected eof)", sHeaderName );
  12059. return !rdInfo.GetErrorFlag();
  12060. }
  12061. void CSphIndex_VLN::DebugDumpHeader ( FILE * fp, const char * sHeaderName, bool bConfig )
  12062. {
  12063. CSphString sWarning;
  12064. if ( !LoadHeader ( sHeaderName, false, sWarning ) )
  12065. {
  12066. fprintf ( fp, "FATAL: failed to load header: %s.\n", m_sLastError.cstr() );
  12067. return;
  12068. }
  12069. if ( !sWarning.IsEmpty () )
  12070. fprintf ( fp, "WARNING: %s\n", sWarning.cstr () );
  12071. ///////////////////////////////////////////////
  12072. // print header in index config section format
  12073. ///////////////////////////////////////////////
  12074. if ( bConfig )
  12075. {
  12076. fprintf ( fp, "\nsource $dump\n{\n" );
  12077. fprintf ( fp, "\tsql_query = SELECT id \\\n" );
  12078. ARRAY_FOREACH ( i, m_tSchema.m_dFields )
  12079. fprintf ( fp, "\t, %s \\\n", m_tSchema.m_dFields[i].m_sName.cstr() );
  12080. for ( int i=0; i<m_tSchema.GetAttrsCount(); i++ )
  12081. {
  12082. const CSphColumnInfo & tAttr = m_tSchema.GetAttr(i);
  12083. fprintf ( fp, "\t, %s \\\n", tAttr.m_sName.cstr() );
  12084. }
  12085. fprintf ( fp, "\tFROM documents\n" );
  12086. if ( m_tSchema.GetAttrsCount() )
  12087. fprintf ( fp, "\n" );
  12088. for ( int i=0; i<m_tSchema.GetAttrsCount(); i++ )
  12089. {
  12090. const CSphColumnInfo & tAttr = m_tSchema.GetAttr(i);
  12091. if ( tAttr.m_eAttrType==SPH_ATTR_UINT32SET )
  12092. fprintf ( fp, "\tsql_attr_multi = uint %s from field\n", tAttr.m_sName.cstr() );
  12093. else if ( tAttr.m_eAttrType==SPH_ATTR_INT64SET )
  12094. fprintf ( fp, "\tsql_attr_multi = bigint %s from field\n", tAttr.m_sName.cstr() );
  12095. else if ( tAttr.m_eAttrType==SPH_ATTR_INTEGER && tAttr.m_tLocator.IsBitfield() )
  12096. fprintf ( fp, "\tsql_attr_uint = %s:%d\n", tAttr.m_sName.cstr(), tAttr.m_tLocator.m_iBitCount );
  12097. else
  12098. fprintf ( fp, "\t%s = %s\n", sphTypeDirective ( tAttr.m_eAttrType ), tAttr.m_sName.cstr() );
  12099. }
  12100. fprintf ( fp, "}\n\nindex $dump\n{\n\tsource = $dump\n\tpath = $dump\n" );
  12101. if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_INLINE )
  12102. fprintf ( fp, "\tdocinfo = inline\n" );
  12103. if ( m_tSettings.m_iMinPrefixLen )
  12104. fprintf ( fp, "\tmin_prefix_len = %d\n", m_tSettings.m_iMinPrefixLen );
  12105. if ( m_tSettings.m_iMinInfixLen )
  12106. fprintf ( fp, "\tmin_prefix_len = %d\n", m_tSettings.m_iMinInfixLen );
  12107. if ( m_tSettings.m_iMaxSubstringLen )
  12108. fprintf ( fp, "\tmax_substring_len = %d\n", m_tSettings.m_iMaxSubstringLen );
  12109. if ( m_tSettings.m_bIndexExactWords )
  12110. fprintf ( fp, "\tindex_exact_words = %d\n", m_tSettings.m_bIndexExactWords ? 1 : 0 );
  12111. if ( m_tSettings.m_bHtmlStrip )
  12112. fprintf ( fp, "\thtml_strip = 1\n" );
  12113. if ( !m_tSettings.m_sHtmlIndexAttrs.IsEmpty() )
  12114. fprintf ( fp, "\thtml_index_attrs = %s\n", m_tSettings.m_sHtmlIndexAttrs.cstr () );
  12115. if ( !m_tSettings.m_sHtmlRemoveElements.IsEmpty() )
  12116. fprintf ( fp, "\thtml_remove_elements = %s\n", m_tSettings.m_sHtmlRemoveElements.cstr () );
  12117. if ( m_tSettings.m_sZones.cstr() )
  12118. fprintf ( fp, "\tindex_zones = %s\n", m_tSettings.m_sZones.cstr() );
  12119. if ( m_pTokenizer )
  12120. {
  12121. const CSphTokenizerSettings & tSettings = m_pTokenizer->GetSettings ();
  12122. fprintf ( fp, "\tcharset_type = %s\n", tSettings.m_iType==TOKENIZER_SBCS ? "sbcs" : "utf-8" );
  12123. fprintf ( fp, "\tcharset_table = %s\n", tSettings.m_sCaseFolding.cstr () );
  12124. if ( tSettings.m_iMinWordLen>1 )
  12125. fprintf ( fp, "\tmin_word_len = %d\n", tSettings.m_iMinWordLen );
  12126. if ( tSettings.m_iNgramLen && !tSettings.m_sNgramChars.IsEmpty() )
  12127. fprintf ( fp, "\tngram_len = %d\nngram_chars = %s\n",
  12128. tSettings.m_iNgramLen, tSettings.m_sNgramChars.cstr () );
  12129. if ( !tSettings.m_sSynonymsFile.IsEmpty() )
  12130. fprintf ( fp, "\texceptions = %s\n", tSettings.m_sSynonymsFile.cstr () );
  12131. if ( !tSettings.m_sBoundary.IsEmpty() )
  12132. fprintf ( fp, "\tphrase_boundary = %s\n", tSettings.m_sBoundary.cstr () );
  12133. if ( !tSettings.m_sIgnoreChars.IsEmpty() )
  12134. fprintf ( fp, "\tignore_chars = %s\n", tSettings.m_sIgnoreChars.cstr () );
  12135. if ( !tSettings.m_sBlendChars.IsEmpty() )
  12136. fprintf ( fp, "\tblend_chars = %s\n", tSettings.m_sBlendChars.cstr () );
  12137. if ( !tSettings.m_sBlendMode.IsEmpty() )
  12138. fprintf ( fp, "\tblend_mode = %s\n", tSettings.m_sBlendMode.cstr () );
  12139. }
  12140. if ( m_pDict )
  12141. {
  12142. const CSphDictSettings & tSettings = m_pDict->GetSettings ();
  12143. if ( tSettings.m_bWordDict )
  12144. fprintf ( fp, "\tdict = keywords\n" );
  12145. if ( !tSettings.m_sMorphology.IsEmpty() )
  12146. fprintf ( fp, "\tmorphology = %s\n", tSettings.m_sMorphology.cstr () );
  12147. if ( !tSettings.m_sStopwords.IsEmpty() )
  12148. fprintf ( fp, "\tstopwords = %s\n", tSettings.m_sStopwords.cstr () );
  12149. if ( tSettings.m_dWordforms.GetLength() )
  12150. {
  12151. fprintf ( fp, "\twordforms =" );
  12152. ARRAY_FOREACH ( i, tSettings.m_dWordforms )
  12153. fprintf ( fp, " %s", tSettings.m_dWordforms[i].cstr () );
  12154. fprintf ( fp, "\n" );
  12155. }
  12156. if ( tSettings.m_iMinStemmingLen>1 )
  12157. fprintf ( fp, "\tmin_stemming_len = %d\n", tSettings.m_iMinStemmingLen );
  12158. }
  12159. fprintf ( fp, "}\n" );
  12160. return;
  12161. }
  12162. ///////////////////////////////////////////////
  12163. // print header and stats in "readable" format
  12164. ///////////////////////////////////////////////
  12165. fprintf ( fp, "version: %d\n", m_uVersion );
  12166. fprintf ( fp, "idbits: %d\n", m_bUse64 ? 64 : 32 );
  12167. fprintf ( fp, "docinfo: " );
  12168. switch ( m_tSettings.m_eDocinfo )
  12169. {
  12170. case SPH_DOCINFO_NONE: fprintf ( fp, "none\n" ); break;
  12171. case SPH_DOCINFO_INLINE: fprintf ( fp, "inline\n" ); break;
  12172. case SPH_DOCINFO_EXTERN: fprintf ( fp, "extern\n" ); break;
  12173. default: fprintf ( fp, "unknown (value=%d)\n", m_tSettings.m_eDocinfo ); break;
  12174. }
  12175. fprintf ( fp, "fields: %d\n", m_tSchema.m_dFields.GetLength() );
  12176. ARRAY_FOREACH ( i, m_tSchema.m_dFields )
  12177. fprintf ( fp, " field %d: %s\n", i, m_tSchema.m_dFields[i].m_sName.cstr() );
  12178. fprintf ( fp, "attrs: %d\n", m_tSchema.GetAttrsCount() );
  12179. for ( int i=0; i<m_tSchema.GetAttrsCount(); i++ )
  12180. {
  12181. const CSphColumnInfo & tAttr = m_tSchema.GetAttr(i);
  12182. fprintf ( fp, " attr %d: %s, %s", i, tAttr.m_sName.cstr(), sphTypeName ( tAttr.m_eAttrType ) );
  12183. if ( tAttr.m_eAttrType==SPH_ATTR_INTEGER && tAttr.m_tLocator.m_iBitCount!=32 )
  12184. fprintf ( fp, ", bits %d", tAttr.m_tLocator.m_iBitCount );
  12185. fprintf ( fp, ", bitoff %d\n", tAttr.m_tLocator.m_iBitOffset );
  12186. }
  12187. // skipped min doc, wordlist checkpoints
  12188. fprintf ( fp, "total-documents: "INT64_FMT"\n", m_tStats.m_iTotalDocuments );
  12189. fprintf ( fp, "total-bytes: "INT64_FMT"\n", int64_t(m_tStats.m_iTotalBytes) );
  12190. fprintf ( fp, "min-prefix-len: %d\n", m_tSettings.m_iMinPrefixLen );
  12191. fprintf ( fp, "min-infix-len: %d\n", m_tSettings.m_iMinInfixLen );
  12192. fprintf ( fp, "max-substring-len: %d\n", m_tSettings.m_iMaxSubstringLen );
  12193. fprintf ( fp, "exact-words: %d\n", m_tSettings.m_bIndexExactWords ? 1 : 0 );
  12194. fprintf ( fp, "html-strip: %d\n", m_tSettings.m_bHtmlStrip ? 1 : 0 );
  12195. fprintf ( fp, "html-index-attrs: %s\n", m_tSettings.m_sHtmlIndexAttrs.cstr () );
  12196. fprintf ( fp, "html-remove-elements: %s\n", m_tSettings.m_sHtmlRemoveElements.cstr () );
  12197. fprintf ( fp, "index-zones: %s\n", m_tSettings.m_sZones.cstr() );
  12198. if ( m_pTokenizer )
  12199. {
  12200. const CSphTokenizerSettings & tSettings = m_pTokenizer->GetSettings ();
  12201. fprintf ( fp, "tokenizer-type: %d\n", tSettings.m_iType );
  12202. fprintf ( fp, "tokenizer-case-folding: %s\n", tSettings.m_sCaseFolding.cstr () );
  12203. fprintf ( fp, "tokenizer-min-word-len: %d\n", tSettings.m_iMinWordLen );
  12204. fprintf ( fp, "tokenizer-ngram-chars: %s\n", tSettings.m_sNgramChars.cstr () );
  12205. fprintf ( fp, "tokenizer-ngram-len: %d\n", tSettings.m_iNgramLen );
  12206. fprintf ( fp, "tokenizer-exceptions: %s\n", tSettings.m_sSynonymsFile.cstr () );
  12207. fprintf ( fp, "tokenizer-phrase-boundary: %s\n", tSettings.m_sBoundary.cstr () );
  12208. fprintf ( fp, "tokenizer-ignore-chars: %s\n", tSettings.m_sIgnoreChars.cstr () );
  12209. fprintf ( fp, "tokenizer-blend-chars: %s\n", tSettings.m_sBlendChars.cstr () );
  12210. fprintf ( fp, "tokenizer-blend-mode: %s\n", tSettings.m_sBlendMode.cstr () );
  12211. }
  12212. if ( m_pDict )
  12213. {
  12214. const CSphDictSettings & tSettings = m_pDict->GetSettings ();
  12215. fprintf ( fp, "dictionary-morphology: %s\n", tSettings.m_sMorphology.cstr () );
  12216. fprintf ( fp, "dictionary-stopwords: %s\n", tSettings.m_sStopwords.cstr () );
  12217. ARRAY_FOREACH ( i, tSettings.m_dWordforms )
  12218. fprintf ( fp, "\tdictionary-wordforms [%d]: %s\n", i, tSettings.m_dWordforms[i].cstr () );
  12219. fprintf ( fp, "min-stemming-len: %d\n", tSettings.m_iMinStemmingLen );
  12220. }
  12221. fprintf ( fp, "killlist-size: %d\n", m_iKillListSize );
  12222. fprintf ( fp, "min-max-index: "UINT64_FMT"\n", m_uMinMaxIndex );
  12223. if ( m_pFieldFilter )
  12224. {
  12225. CSphFieldFilterSettings tSettings;
  12226. m_pFieldFilter->GetSettings ( tSettings );
  12227. fprintf ( fp, "field-filter-utf8: %d\n", tSettings.m_bUTF8 ? 1 : 0 );
  12228. ARRAY_FOREACH ( i, tSettings.m_dRegexps )
  12229. fprintf ( fp, "field-filter-regexp [%d]: %s\n", i, tSettings.m_dRegexps[i].cstr() );
  12230. }
  12231. }
  12232. void CSphIndex_VLN::DebugDumpDocids ( FILE * fp )
  12233. {
  12234. if ( m_tSettings.m_eDocinfo!=SPH_DOCINFO_EXTERN )
  12235. {
  12236. fprintf ( fp, "FATAL: docids dump only supported for docinfo=extern\n" );
  12237. return;
  12238. }
  12239. const int iRowStride = DOCINFO_IDSIZE + m_tSchema.GetRowSize();
  12240. const int64_t iNumMinMaxRow = ( m_uVersion>=20 ) ? ( (m_iDocinfoIndex+1)*iRowStride*2 ) : 0;
  12241. const int64_t iNumRows = (m_pDocinfo.GetNumEntries()-iNumMinMaxRow) / iRowStride;
  12242. const int64_t iDocinfoSize = iRowStride*m_iDocinfo*sizeof(DWORD);
  12243. const int64_t iMinmaxSize = iNumMinMaxRow*sizeof(CSphRowitem);
  12244. fprintf ( fp, "docinfo-bytes: docinfo="INT64_FMT", min-max="INT64_FMT", total="UINT64_FMT"\n"
  12245. , iDocinfoSize, iMinmaxSize, (uint64_t)m_pDocinfo.GetLength() );
  12246. fprintf ( fp, "docinfo-stride: %d\n", (int)(iRowStride*sizeof(DWORD)) );
  12247. fprintf ( fp, "docinfo-rows: "INT64_FMT"\n", iNumRows );
  12248. if ( !m_pDocinfo.GetNumEntries() )
  12249. return;
  12250. DWORD * pDocinfo = m_pDocinfo.GetWritePtr();
  12251. for ( int64_t iRow=0; iRow<iNumRows; iRow++, pDocinfo+=iRowStride )
  12252. printf ( INT64_FMT". id=" DOCID_FMT "\n", iRow+1, DOCINFO2ID ( pDocinfo ) );
  12253. printf ( "--- min-max="INT64_FMT" ---\n", iNumMinMaxRow );
  12254. for ( int64_t iRow=0; iRow<(m_iDocinfoIndex+1)*2; iRow++, pDocinfo+=iRowStride )
  12255. printf ( "id=" DOCID_FMT "\n", DOCINFO2ID ( pDocinfo ) );
  12256. }
  /// Dump the hitlist of one keyword to fp.
  /// Forwards fp, sKeyword and bID unchanged to DumpHitlist<Qword> through the WITH_QWORD macro.
  /// NOTE(review): WITH_QWORD presumably picks the concrete Qword type for this index - confirm against the macro definition.
  12257. void CSphIndex_VLN::DebugDumpHitlist ( FILE * fp, const char * sKeyword, bool bID )
  12258. {
  12259. WITH_QWORD ( this, false, Qword, DumpHitlist<Qword> ( fp, sKeyword, bID ) );
  12260. }
  12261. template < class Qword >
  12262. void CSphIndex_VLN::DumpHitlist ( FILE * fp, const char * sKeyword, bool bID )
  12263. {
  12264. // get keyword id
  12265. SphWordID_t uWordID = 0;
  12266. BYTE * sTok = NULL;
  12267. if ( !bID )
  12268. {
  12269. CSphString sBuf ( sKeyword );
  12270. m_pTokenizer->SetBuffer ( (BYTE*)sBuf.cstr(), strlen ( sBuf.cstr() ) );
  12271. sTok = m_pTokenizer->GetToken();
  12272. if ( !sTok )
  12273. sphDie ( "keyword=%s, no token (too short?)", sKeyword );
  12274. uWordID = m_pDict->GetWordID ( sTok );
  12275. if ( !uWordID )
  12276. sphDie ( "keyword=%s, tok=%s, no wordid (stopped?)", sKeyword, sTok );
  12277. fprintf ( fp, "keyword=%s, tok=%s, wordid="UINT64_FMT"\n", sKeyword, sTok, uint64_t(uWordID) );
  12278. } else
  12279. {
  12280. uWordID = (SphWordID_t) strtoull ( sKeyword, NULL, 10 );
  12281. if ( !uWordID )
  12282. sphDie ( "failed to convert keyword=%s to id (must be integer)", sKeyword );
  12283. fprintf ( fp, "wordid="UINT64_FMT"\n", uint64_t(uWordID) );
  12284. }
  12285. // open files
  12286. CSphAutofile tDoclist, tHitlist, tWordlist;
  12287. if ( tDoclist.Open ( GetIndexFileName("spd"), SPH_O_READ, m_sLastError ) < 0 )
  12288. sphDie ( "failed to open doclist: %s", m_sLastError.cstr() );
  12289. if ( tHitlist.Open ( GetIndexFileName ( m_uVersion>=3 ? "spp" : "spd" ), SPH_O_READ, m_sLastError ) < 0 )
  12290. sphDie ( "failed to open hitlist: %s", m_sLastError.cstr() );
  12291. if ( tWordlist.Open ( GetIndexFileName ( "spi" ), SPH_O_READ, m_sLastError ) < 0 )
  12292. sphDie ( "failed to open wordlist: %s", m_sLastError.cstr() );
  12293. // aim
  12294. DiskIndexQwordSetup_c tTermSetup ( tDoclist, tHitlist, tWordlist, m_bPreloadWordlist ? 0 : m_tWordlist.m_iMaxChunk, m_pSkiplists.GetWritePtr(), NULL );
  12295. tTermSetup.m_pDict = m_pDict;
  12296. tTermSetup.m_pIndex = this;
  12297. tTermSetup.m_eDocinfo = m_tSettings.m_eDocinfo;
  12298. tTermSetup.m_iMinDocid = m_iMinDocid;
  12299. tTermSetup.m_pMinRow = m_dMinRow.Begin();
  12300. tTermSetup.m_bSetupReaders = true;
  12301. Qword tKeyword ( false, false );
  12302. tKeyword.m_tDoc.m_iDocID = m_iMinDocid;
  12303. tKeyword.m_iWordID = uWordID;
  12304. tKeyword.m_sWord = sKeyword;
  12305. tKeyword.m_sDictWord = (const char *)sTok;
  12306. if ( !tTermSetup.QwordSetup ( &tKeyword ) )
  12307. sphDie ( "failed to setup keyword" );
  12308. int iSize = m_tSchema.GetRowSize();
  12309. CSphVector<CSphRowitem> dAttrs ( iSize );
  12310. // press play on tape
  12311. for ( ;; )
  12312. {
  12313. tKeyword.GetNextDoc ( iSize ? &dAttrs[0] : NULL );
  12314. if ( !tKeyword.m_tDoc.m_iDocID )
  12315. break;
  12316. tKeyword.SeekHitlist ( tKeyword.m_iHitlistPos );
  12317. int iHits = 0;
  12318. if ( tKeyword.m_bHasHitlist )
  12319. for ( Hitpos_t uHit = tKeyword.GetNextHit(); uHit!=EMPTY_HIT; uHit = tKeyword.GetNextHit() )
  12320. {
  12321. fprintf ( fp, "doc="DOCID_FMT", hit=0x%08x\n", tKeyword.m_tDoc.m_iDocID, uHit ); // FIXME?
  12322. iHits++;
  12323. }
  12324. if ( !iHits )
  12325. {
  12326. uint64_t uOff = tKeyword.m_iHitlistPos;
  12327. fprintf ( fp, "doc="DOCID_FMT", NO HITS, inline=%d, off="UINT64_FMT"\n",
  12328. tKeyword.m_tDoc.m_iDocID, (int)(uOff>>63), (uOff<<1)>>1 );
  12329. }
  12330. }
  12331. }
  12332. void CSphIndex_VLN::DebugDumpDict ( FILE * fp )
  12333. {
  12334. if ( !m_pDict->GetSettings().m_bWordDict )
  12335. {
  12336. fprintf ( fp, "sorry, DebugDumpDict() only supports dict=keywords for now\n" );
  12337. return;
  12338. }
  12339. // thread safe outer storage for dictionaries chunks and file
  12340. // FIXME! cut-n-paste
  12341. CSphString sError;
  12342. BYTE * pBuf = NULL;
  12343. int iFD = -1;
  12344. CSphAutofile rdWordlist;
  12345. if ( !m_bPreloadWordlist )
  12346. {
  12347. if ( m_bKeepFilesOpen )
  12348. iFD = m_tWordlist.m_tFile.GetFD();
  12349. else
  12350. {
  12351. iFD = rdWordlist.Open ( GetIndexFileName ( "spi" ), SPH_O_READ, sError );
  12352. if ( iFD<0 )
  12353. {
  12354. fprintf ( fp, "ERROR: %s\n", sError.cstr() );
  12355. return;
  12356. }
  12357. }
  12358. if ( m_tWordlist.m_iMaxChunk>0 )
  12359. pBuf = new BYTE [ m_tWordlist.m_iMaxChunk ];
  12360. }
  12361. fprintf ( fp, "keyword,docs,hits,offset\n" );
  12362. ARRAY_FOREACH ( i, m_tWordlist.m_dCheckpoints )
  12363. {
  12364. KeywordsBlockReader_c tCtx ( m_tWordlist.AcquireDict ( &m_tWordlist.m_dCheckpoints[i], iFD, pBuf ), m_bHaveSkips );
  12365. while ( tCtx.UnpackWord() )
  12366. printf ( "%s,%d,%d," INT64_FMT "\n", tCtx.GetWord(), tCtx.m_iDocs, tCtx.m_iHits, int64_t(tCtx.m_iDoclistOffset) );
  12367. }
  12368. }
  12369. //////////////////////////////////////////////////////////////////////////
  /// Load the index header, verify that the data files can be opened, and allocate the shared buffers
  /// for docinfo, docinfo hash, MVA, strings, kill-list, skiplists and (if m_bPreloadWordlist) the wordlist.
  /// The buffers are only sized and allocated here; Preread() fills them. The one thing actually read
  /// is the wordlist checkpoint data, via m_tWordlist.ReadCP().
  ///
  /// bMlock is handed to SetMlock() on every shared buffer, bStripPath is forwarded to LoadHeader(),
  /// and sWarning is passed to LoadHeader() and to each Alloc() call.
  /// Returns false at the first failing step (the failing callee is given m_sLastError to report into);
  /// on success sets m_bPreallocated and assigns a fresh m_iIndexTag.
  12370. bool CSphIndex_VLN::Prealloc ( bool bMlock, bool bStripPath, CSphString & sWarning )
  12371. {
  12372. MEMORY ( SPH_MEM_IDX_DISK );
  12373. // reset
  12374. Dealloc ();
  12375. // always keep shared variables flag
  12376. if ( m_dShared.IsEmpty() )
  12377. {
  12378. if ( !m_dShared.Alloc ( SPH_SHARED_VARS_COUNT, m_sLastError, sWarning ) )
  12379. return false;
  12380. }
  // zero the shared block; slot 0 is the marker that Preread() checks and sets, slot 1 backs m_pAttrsStatus
  12381. memset ( m_dShared.GetWritePtr(), 0, m_dShared.GetLength() );
  12382. m_pPreread = m_dShared.GetWritePtr()+0;
  12383. m_pAttrsStatus = m_dShared.GetWritePtr()+1;
  12384. // set new locking flag
  12385. m_pDocinfo.SetMlock ( bMlock );
  12386. m_tWordlist.m_pBuf.SetMlock ( bMlock );
  12387. m_pMva.SetMlock ( bMlock );
  12388. m_pStrings.SetMlock ( bMlock );
  12389. m_pKillList.SetMlock ( bMlock );
  12390. m_pSkiplists.SetMlock ( bMlock );
  12391. // preload schema
  12392. if ( !LoadHeader ( GetIndexFileName("sph").cstr(), bStripPath, sWarning ) )
  12393. return false;
  12394. // verify that data files are readable
  12395. if ( !sphIsReadable ( GetIndexFileName("spd").cstr(), &m_sLastError ) )
  12396. return false;
  12397. if ( m_uVersion>=3 && !sphIsReadable ( GetIndexFileName("spp").cstr(), &m_sLastError ) )
  12398. return false;
  12399. if ( m_bHaveSkips && !sphIsReadable ( GetIndexFileName("spe").cstr(), &m_sLastError ) )
  12400. return false;
  12401. /////////////////////
  12402. // prealloc wordlist
  12403. /////////////////////
  12404. // try to open wordlist file in all cases
  12405. CSphAutofile tWordlist ( GetIndexFileName("spi"), SPH_O_READ, m_sLastError );
  12406. if ( tWordlist.GetFD()<0 )
  12407. return false;
  12408. m_tWordlist.m_iSize = tWordlist.GetSize ( 1, true, m_sLastError );
  12409. if ( m_tWordlist.m_iSize<0 )
  12410. return false;
  // emptiness: with docinfo=extern it is decided by a zero-size .spa, otherwise by a wordlist of at most 1 byte
  12411. if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_EXTERN )
  12412. {
  12413. CSphAutofile tDocinfo ( GetIndexFileName("spa"), SPH_O_READ, m_sLastError );
  12414. if ( tDocinfo.GetFD()<0 )
  12415. return false;
  12416. m_bIsEmpty = ( tDocinfo.GetSize ( 0, false, m_sLastError )==0 );
  12417. } else
  12418. m_bIsEmpty = ( m_tWordlist.m_iSize<=1 );
  // sanity check only: a mismatch between "tiny wordlist" and "no checkpoints" is warned about, not treated as an error
  12419. if ( ( m_tWordlist.m_iSize<=1 )!=( m_tWordlist.m_dCheckpoints.GetLength()==0 ) )
  12420. sphWarning ( "wordlist size mismatch (size="INT64_FMT", checkpoints=%d)", m_tWordlist.m_iSize, m_tWordlist.m_dCheckpoints.GetLength() );
  12421. // make sure checkpoints are loadable
  12422. // pre-11 indices use different offset type (this is fixed up later during the loading)
  12423. assert ( m_tWordlist.m_iDictCheckpointsOffset>0 );
  12424. // prealloc wordlist upto checkpoints
  12425. // (keyword blocks aka checkpoints, infix blocks etc will be loaded separately)
  12426. if ( m_bPreloadWordlist )
  12427. if ( !m_tWordlist.m_pBuf.Alloc ( m_tWordlist.m_iDictCheckpointsOffset, m_sLastError, sWarning ) )
  12428. return false;
  12429. // preopen
  12430. if ( m_bKeepFilesOpen )
  12431. {
  12432. if ( m_tDoclistFile.Open ( GetIndexFileName("spd"), SPH_O_READ, m_sLastError ) < 0 )
  12433. return false;
  12434. if ( m_tHitlistFile.Open ( GetIndexFileName ( m_uVersion>=3 ? "spp" : "spd" ), SPH_O_READ, m_sLastError ) < 0 )
  12435. return false;
  12436. if ( !m_bPreloadWordlist && m_tWordlist.m_tFile.Open ( GetIndexFileName("spi"), SPH_O_READ, m_sLastError ) < 0 )
  12437. return false;
  12438. }
  12439. /////////////////////
  12440. // prealloc docinfos
  12441. /////////////////////
  12442. if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_EXTERN && !m_bIsEmpty )
  12443. {
  12444. /////////////
  12445. // attr data
  12446. /////////////
  12447. int iStride = DOCINFO_IDSIZE + m_tSchema.GetRowSize();
  12448. int iStride2 = iStride-1; // id64 - 1 DWORD = id32
  12449. int iEntrySize = sizeof(DWORD)*iStride;
  12450. CSphAutofile tDocinfo ( GetIndexFileName("spa"), SPH_O_READ, m_sLastError );
  12451. if ( tDocinfo.GetFD()<0 )
  12452. return false;
  12453. int64_t iDocinfoSize = tDocinfo.GetSize ( iEntrySize, true, m_sLastError );
  12454. if ( iDocinfoSize<0 )
  12455. return false;
  // from here on iDocinfoSize is counted in DWORDs
  12456. iDocinfoSize = iDocinfoSize / sizeof(DWORD);
  // a non-zero m_uMinMaxIndex is taken as the size of the row data; otherwise the whole file is rows
  12457. int64_t iRealDocinfoSize = m_uMinMaxIndex ? m_uMinMaxIndex : iDocinfoSize;
  12458. m_iDocinfo = iRealDocinfoSize / iStride;
  12459. if ( m_bId32to64 )
  12460. {
  12461. // check also the case of id32 here, and correct m_iDocinfo for it
  12462. m_iDocinfo = iRealDocinfoSize / iStride2;
  12463. m_uMinMaxIndex = m_uMinMaxIndex / iStride2 * iStride;
  12464. }
  12465. if ( !CheckDocsCount ( m_iDocinfo, m_sLastError ) )
  12466. return false;
  12467. if ( m_uVersion < 20 )
  12468. {
  12469. if ( m_bId32to64 )
  12470. iDocinfoSize = iDocinfoSize / iStride2 * iStride;
  12471. m_iDocinfoIndex = ( m_iDocinfo+DOCINFO_INDEX_FREQ-1 ) / DOCINFO_INDEX_FREQ;
  12472. // prealloc docinfo
  // rows, plus (m_iDocinfoIndex+1)*2 extra rows right after them (where m_pDocinfoIndex points),
  // plus one spare DWORD per row when ids are widened from 32 to 64 bits
  12473. if ( !m_pDocinfo.Alloc ( iDocinfoSize + (m_iDocinfoIndex+1)*iStride*2 + ( m_bId32to64 ? m_iDocinfo : 0 ), m_sLastError, sWarning ) )
  12474. return false;
  12475. m_pDocinfoIndex = m_pDocinfo.GetWritePtr()+iDocinfoSize;
  12476. } else
  12477. {
  12478. if ( iDocinfoSize < iRealDocinfoSize )
  12479. {
  12480. m_sLastError.SetSprintf ( "precomputed chunk size check mismatch" );
  12481. sphLogDebug ( "precomputed chunk size check mismatch (size="INT64_FMT", real="INT64_FMT", min-max="INT64_FMT", count="INT64_FMT")",
  12482. iDocinfoSize, iRealDocinfoSize, m_uMinMaxIndex, m_iDocinfo );
  12483. return false;
  12484. }
  // everything after the row data is treated as (m_iDocinfoIndex+1)*2 trailing rows, hence the /2 and -1
  12485. m_iDocinfoIndex = ( ( iDocinfoSize - iRealDocinfoSize ) / (m_bId32to64?iStride2:iStride) / 2 ) - 1;
  12486. // prealloc docinfo
  12487. if ( !m_pDocinfo.Alloc ( iDocinfoSize + ( m_bId32to64 ? ( m_iDocinfo + m_iDocinfoIndex*2 + 2 ) : 0 ), m_sLastError, sWarning ) )
  12488. return false;
  12489. #if PARANOID
  12490. int64_t uDocinfoIndex = ( m_iDocinfo+DOCINFO_INDEX_FREQ-1 ) / DOCINFO_INDEX_FREQ;
  12491. assert ( uDocinfoIndex==m_iDocinfoIndex );
  12492. #endif
  12493. m_pDocinfoIndex = m_pDocinfo.GetWritePtr()+m_uMinMaxIndex;
  12494. }
  12495. // prealloc docinfo hash but only if docinfo is big enough (in other words if hash is 8x+ less in size)
  12496. if ( m_pDocinfoHash.IsEmpty() && m_pDocinfo.GetLength() > ( 32 << DOCINFO_HASH_BITS ) )
  12497. if ( !m_pDocinfoHash.Alloc ( ( 1 << DOCINFO_HASH_BITS )+4, m_sLastError, sWarning ) )
  12498. return false;
  12499. ////////////
  12500. // MVA data
  12501. ////////////
  12502. if ( m_uVersion>=4 )
  12503. {
  12504. // if index is v4, .spm must always exist, even though length could be 0
  12505. CSphAutofile fdMva ( GetIndexFileName("spm"), SPH_O_READ, m_sLastError );
  12506. if ( fdMva.GetFD()<0 )
  12507. return false;
  12508. SphOffset_t iMvaSize = fdMva.GetSize ( 0, true, m_sLastError );
  12509. if ( iMvaSize<0 )
  12510. return false;
  12511. // prealloc
  12512. if ( iMvaSize>0 )
  12513. if ( !m_pMva.Alloc ( DWORD(iMvaSize/sizeof(DWORD)), m_sLastError, sWarning ) )
  12514. return false;
  12515. }
  12516. ///////////////
  12517. // string data
  12518. ///////////////
  12519. if ( m_uVersion>=17 )
  12520. {
  12521. CSphAutofile fdStrings ( GetIndexFileName("sps"), SPH_O_READ, m_sLastError );
  12522. if ( fdStrings.GetFD()<0 )
  12523. return false;
  12524. SphOffset_t iStringsSize = fdStrings.GetSize ( 0, true, m_sLastError );
  12525. if ( iStringsSize<0 )
  12526. return false;
  12527. // prealloc
  // NOTE(review): the size is narrowed to DWORD here - presumably .sps is expected to stay below 4 GB; confirm
  12528. if ( iStringsSize>0 )
  12529. if ( !m_pStrings.Alloc ( DWORD(iStringsSize), m_sLastError, sWarning ) )
  12530. return false;
  12531. }
  12532. }
  12533. // prealloc killlist
  12534. if ( m_uVersion>=10 )
  12535. {
  12536. CSphAutofile fdKillList ( GetIndexFileName("spk"), SPH_O_READ, m_sLastError );
  12537. if ( fdKillList.GetFD()<0 )
  12538. return false;
  12539. SphOffset_t iSize = fdKillList.GetSize ( 0, true, m_sLastError );
  12540. if ( iSize<0 )
  12541. return false;
  // the .spk size must match exactly the entry count recorded in m_iKillListSize
  12542. if ( iSize!=(SphOffset_t)( m_iKillListSize*sizeof(SphAttr_t) ) )
  12543. {
  12544. m_sLastError.SetSprintf ( "header k-list size does not match .spk size (klist=" INT64_FMT ", spk=" INT64_FMT ")",
  12545. (int64_t)( m_iKillListSize*sizeof(SphAttr_t) ),
  12546. (int64_t) iSize );
  12547. return false;
  12548. }
  12549. // prealloc
  12550. if ( iSize>0 && !m_pKillList.Alloc ( m_iKillListSize, m_sLastError, sWarning ) )
  12551. return false;
  12552. }
  12553. // prealloc skiplist
  12554. if ( m_bHaveSkips )
  12555. {
  12556. CSphAutofile fdSkips ( GetIndexFileName("spe"), SPH_O_READ, m_sLastError );
  12557. if ( fdSkips.GetFD()<0 )
  12558. return false;
  12559. SphOffset_t iSize = fdSkips.GetSize ( 0, true, m_sLastError );
  12560. if ( iSize<0 )
  12561. return false;
  12562. if ( iSize>0 && !m_pSkiplists.Alloc ( iSize, m_sLastError, sWarning ) )
  12563. return false;
  12564. }
  12565. bool bWordDict = false;
  12566. if ( m_pDict )
  12567. bWordDict = m_pDict->GetSettings().m_bWordDict;
  12568. // preload checkpoints (must be done here as they are not shared)
  12569. if ( !m_tWordlist.ReadCP ( tWordlist, m_uVersion, bWordDict, m_sLastError ) )
  12570. {
  // the message written by ReadCP() is wrapped with the file name
  12571. m_sLastError.SetSprintf ( "failed to read %s: %s", GetIndexFileName("spi").cstr(), m_sLastError.cstr () );
  12572. return false;
  12573. }
  12574. // all done
  12575. m_bPreallocated = true;
  12576. m_iIndexTag = ++m_iIndexTagSeq;
  12577. return true;
  12578. }
  12579. template < typename T > bool CSphIndex_VLN::PrereadSharedBuffer ( CSphSharedBuffer<T> & pBuffer,
  12580. const char * sExt, int64_t iExpected, int64_t iOffset )
  12581. {
  12582. sphLogDebug ( "prereading .%s", sExt );
  12583. if ( !pBuffer.GetLength() )
  12584. return true;
  12585. CSphAutofile fdBuf ( GetIndexFileName(sExt), SPH_O_READ, m_sLastError );
  12586. if ( fdBuf.GetFD()<0 )
  12587. return false;
  12588. fdBuf.SetProgressCallback ( &m_tProgress );
  12589. if ( iExpected==0 )
  12590. iExpected = int64_t ( pBuffer.GetLength() ) - iOffset*sizeof(T);
  12591. return fdBuf.Read ( pBuffer.GetWritePtr() + iOffset, iExpected, m_sLastError );
  12592. }
  /// Fill the buffers allocated by Prealloc() from disk and build the derived in-memory structures.
  ///
  /// Requires m_bPreallocated and refuses to run when *m_pPreread is already set.
  /// Steps, in order: read .spa/.spm/.sps/.spk/.spe (and .spi when the wordlist is preloaded),
  /// widen id32 docinfo rows to id64 if m_bId32to64, build the docinfo hash, load persistent MVA,
  /// and for indexes older than v20 call PrecomputeMinMax().
  /// Returns false on the first failing step; sets *m_pPreread to 1 on success.
  12593. bool CSphIndex_VLN::Preread ()
  12594. {
  12595. MEMORY ( SPH_MEM_IDX_DISK );
  12596. sphLogDebug ( "CSphIndex_VLN::Preread invoked" );
  12597. if ( !m_bPreallocated )
  12598. {
  12599. m_sLastError = "INTERNAL ERROR: not preallocated";
  12600. return false;
  12601. }
  12602. if ( !m_pPreread || *m_pPreread )
  12603. {
  12604. m_sLastError = "INTERNAL ERROR: already preread";
  12605. return false;
  12606. }
  12607. ///////////////////
  12608. // read everything
  12609. ///////////////////
  12610. m_tProgress.m_ePhase = CSphIndexProgress::PHASE_PREREAD;
  12611. m_tProgress.m_iBytes = 0;
  // NOTE(review): the skiplist buffer is read below but is not counted in this total - confirm whether that is intended
  12612. m_tProgress.m_iBytesTotal = m_pDocinfo.GetLength() + m_pMva.GetLength() + m_pStrings.GetLength() + m_pKillList.GetLength();
  12613. if ( m_bPreloadWordlist )
  12614. m_tProgress.m_iBytesTotal += m_tWordlist.m_pBuf.GetLength();
  // pre-v20: read only the rows (sized with the on-disk id width); v20+: 0 lets PrereadSharedBuffer() pick the size
  12615. int64_t iExpected = ( m_uVersion<20 ? m_iDocinfo * ( ( m_bId32to64 ? 1 : DOCINFO_IDSIZE ) + m_tSchema.GetRowSize() ) * sizeof(DWORD) : 0 );
  // for id32 data, load at an offset into the buffer; the widening loop below reads from that same offset
  12616. int64_t iOffset = ( m_bId32to64 ? ( m_iDocinfo + 2 + m_iDocinfoIndex * 2 ) : 0 );
  12617. if ( !PrereadSharedBuffer ( m_pDocinfo, "spa", iExpected, iOffset ) )
  12618. return false;
  12619. if ( !PrereadSharedBuffer ( m_pMva, "spm" ) )
  12620. return false;
  12621. if ( !PrereadSharedBuffer ( m_pStrings, "sps" ) )
  12622. return false;
  12623. if ( !PrereadSharedBuffer ( m_pKillList, "spk" ) )
  12624. return false;
  12625. if ( !PrereadSharedBuffer ( m_pSkiplists, "spe" ) )
  12626. return false;
  12627. #if PARANOID
  12628. for ( int i = 1; i < (int)m_iKillListSize; i++ )
  12629. assert ( m_pKillList[i-1] < m_pKillList[i] );
  12630. #endif
  12631. // preload wordlist
  12632. // FIXME! OPTIMIZE! can skip checkpoints
  12633. if ( m_bPreloadWordlist )
  12634. {
  12635. sphLogDebug ( "Prereading .spi" );
  12636. if ( !PrereadSharedBuffer ( m_tWordlist.m_pBuf, "spi" ) )
  12637. return false;
  12638. }
  12639. m_tProgress.Show ( true );
  12640. //////////////////////
  12641. // precalc everything
  12642. //////////////////////
  12643. // convert id32 to id64
  12644. if ( m_pDocinfo.GetLength() && m_bId32to64 )
  12645. {
  // rows are rewritten front to back inside the same buffer: pSource starts at the load offset used above,
  // pTarget at the buffer start, and each row moves iStride+1 DWORDs into iStride+DOCINFO_IDSIZE DWORDs
  12646. DWORD * pTarget = m_pDocinfo.GetWritePtr();
  12647. const DWORD * pSource = pTarget + m_iDocinfo + 2 + m_iDocinfoIndex * 2;
  12648. int iStride = m_tSchema.GetRowSize();
  12649. SphDocID_t uDoc;
  // v20+ files also carry (m_iDocinfoIndex+1)*2 trailing rows that need the same widening
  12650. int64_t iLimit = m_iDocinfo + ( ( m_uVersion < 20 ) ? 0 : m_iDocinfoIndex * 2 + 2 );
  12651. for ( int64_t i=0; i<iLimit; i++ )
  12652. {
  12653. uDoc = *pSource; ///< wide id32 to id64
  12654. DOCINFOSETID ( pTarget, uDoc );
  12655. memcpy ( pTarget + DOCINFO_IDSIZE, pSource + 1, iStride * sizeof(DWORD) );
  12656. pSource += iStride+1;
  12657. pTarget += iStride+DOCINFO_IDSIZE;
  12658. }
  12659. sphWarning ( "id32 index loaded by id64 binary; attributes converted" );
  12660. }
  12661. // build attributes hash
  12662. if ( m_pDocinfo.GetLength() && m_pDocinfoHash.GetLength() )
  12663. {
  12664. sphLogDebug ( "Hashing docinfo" );
  12665. assert ( CheckDocsCount ( m_iDocinfo, m_sLastError ) );
  12666. int iStride = DOCINFO_IDSIZE + m_tSchema.GetRowSize();
  12667. SphDocID_t uFirst = DOCINFO2ID ( &m_pDocinfo[0] );
  12668. SphDocID_t uRange = DOCINFO2ID ( &m_pDocinfo[ ( m_iDocinfo-1)*iStride ] ) - uFirst;
  // pick the smallest shift that squeezes (last id - first id) below 1<<DOCINFO_HASH_BITS
  12669. DWORD iShift = 0;
  12670. while ( uRange>=( 1 << DOCINFO_HASH_BITS ) )
  12671. {
  12672. iShift++;
  12673. uRange >>= 1;
  12674. }
  // layout: the first DWORD stores the shift; after it, slot h holds the index of the first row whose
  // hashed id is >= h, and one closing slot past the last used hash holds m_iDocinfo
  12675. DWORD * pHash = m_pDocinfoHash.GetWritePtr();
  12676. *pHash++ = iShift;
  12677. *pHash = 0;
  12678. DWORD uLastHash = 0;
  12679. for ( int64_t i=1; i<m_iDocinfo; i++ )
  12680. {
  12681. assert ( DOCINFO2ID ( &m_pDocinfo[ i*iStride ] )>uFirst
  12682. && DOCINFO2ID ( &m_pDocinfo[ ( i-1 )*iStride ] ) < DOCINFO2ID ( &m_pDocinfo[ i*iStride ] )
  12683. && "descending document ID found" );
  12684. DWORD uHash = (DWORD)( ( DOCINFO2ID ( &m_pDocinfo[ i*iStride ] ) - uFirst ) >> iShift );
  12685. if ( uHash==uLastHash )
  12686. continue;
  // every slot skipped between the previous hash and this one starts at the current row
  12687. while ( uLastHash<uHash )
  12688. pHash [ ++uLastHash ] = (DWORD)i;
  12689. uLastHash = uHash;
  12690. }
  12691. pHash [ ++uLastHash ] = (DWORD)m_iDocinfo;
  12692. }
  12693. // persist MVA needs valid DocinfoHash
  12694. sphLogDebug ( "Prereading .mvp" );
  12695. if ( !LoadPersistentMVA ( m_sLastError ) )
  12696. return false;
  12697. // build "indexes" for full-scan
  12698. if ( m_uVersion < 20 && !PrecomputeMinMax() )
  12699. return false;
  12700. // paranoid MVA verification
  12701. #if PARANOID
  12702. // find out what attrs are MVA
  // all 32-bit set attributes are collected first, then all 64-bit ones
  12703. CSphVector<int> dMvaRowitem;
  12704. for ( int i=0; i<m_tSchema.GetAttrsCount(); i++ )
  12705. {
  12706. const CSphColumnInfo & tCol = m_tSchema.GetAttr(i);
  12707. if ( tCol.m_eAttrType==SPH_ATTR_UINT32SET )
  12708. dMvaRowitem.Add ( tCol.m_tLocator.m_iBitOffset/ROWITEM_BITS );
  12709. }
  12710. for ( int i=0; i<m_tSchema.GetAttrsCount(); i++ )
  12711. {
  12712. const CSphColumnInfo & tCol = m_tSchema.GetAttr(i);
  12713. if ( tCol.m_eAttrType==SPH_ATTR_INT64SET )
  12714. dMvaRowitem.Add ( tCol.m_tLocator.m_iBitOffset/ROWITEM_BITS );
  12715. }
  12716. // for each docinfo entry, verify that MVA attrs point to right storage location
  12717. int iStride = DOCINFO_IDSIZE + m_tSchema.GetRowSize();
  12718. for ( int64_t iDoc=0; iDoc<m_iDocinfo && dMvaRowitem.GetLength(); iDoc++ )
  12719. {
  12720. CSphRowitem * pRow = m_pDocinfo.GetWritePtr() + ( iDoc*iStride );
  12721. CSphRowitem * pAttrs = DOCINFO2ATTRS(pRow);
  12722. SphDocID_t uDocID = DOCINFO2ID(pRow);
  12723. DWORD uOff = pAttrs[ dMvaRowitem[0] ];
  12724. if ( !uOff )
  12725. {
  12726. // its either all or nothing
  12727. ARRAY_FOREACH ( i, dMvaRowitem )
  12728. assert ( pAttrs[ dMvaRowitem[i] ]==0 );
  12729. } else if ( !( uOff & MVA_ARENA_FLAG ) )
  12730. {
  // the docid is expected to sit in the MVA pool immediately before the row's first value list
  12731. assert ( uDocID==DOCINFO2ID ( m_pMva.GetWritePtr() + uOff - DOCINFO_IDSIZE ) );
  12732. // walk the trail
  12733. ARRAY_FOREACH ( i, dMvaRowitem )
  12734. {
  12735. assert ( pAttrs[ dMvaRowitem[i] ]==uOff );
  12736. int iCount = m_pMva[uOff];
  12737. uOff += 1+iCount;
  12738. }
  12739. }
  12740. }
  12741. #endif // PARANOID
  12742. *m_pPreread = 1;
  12743. sphLogDebug ( "Preread successfully finished" );
  12744. return true;
  12745. }
  /// Replace the stored index base name (m_sFilename) with sNewBase.
  /// Only the member is updated; no file is renamed or opened here.
  12746. void CSphIndex_VLN::SetBase ( const char * sNewBase )
  12747. {
  12748. m_sFilename = sNewBase;
  12749. }
  12750. bool CSphIndex_VLN::Rename ( const char * sNewBase )
  12751. {
  12752. if ( m_sFilename==sNewBase )
  12753. return true;
  12754. // try to rename everything
  12755. char sFrom [ SPH_MAX_FILENAME_LEN ];
  12756. char sTo [ SPH_MAX_FILENAME_LEN ];
  12757. const int EXT_COUNT = 10;
  12758. const char * sExts[EXT_COUNT] = { "spa", "spd", "sph", "spi", "spl", "spm", "spp", "spk", "sps", "spe" };
  12759. DWORD uMask = 0;
  12760. int iExt;
  12761. for ( iExt=0; iExt<EXT_COUNT; iExt++ )
  12762. {
  12763. const char * sExt = sExts[iExt];
  12764. if ( !strcmp ( sExt, "spp" ) && m_uVersion<3 ) // .spp files are v3+
  12765. continue;
  12766. if ( !strcmp ( sExt, "spm" ) && m_uVersion<4 ) // .spm files are v4+
  12767. continue;
  12768. if ( !strcmp ( sExt, "spk" ) && m_uVersion<10 ) // .spk files are v10+
  12769. continue;
  12770. if ( !strcmp ( sExt, "sps" ) && m_uVersion<17 ) // .spk files are v17+
  12771. continue;
  12772. if ( !strcmp ( sExt, "spe" ) && m_uVersion<31 ) // .spe files are v31+
  12773. continue;
  12774. #if !USE_WINDOWS
  12775. if ( !strcmp ( sExt, "spl" ) && m_iLockFD<0 ) // .spl files are locks
  12776. continue;
  12777. #else
  12778. if ( !strcmp ( sExt, "spl" ) )
  12779. {
  12780. if ( m_iLockFD>=0 )
  12781. {
  12782. ::close ( m_iLockFD );
  12783. ::unlink ( GetIndexFileName("spl").cstr() );
  12784. sphLogDebug ( "lock %s unlinked, file with ID %d closed", GetIndexFileName("spl").cstr(), m_iLockFD );
  12785. m_iLockFD = -1;
  12786. }
  12787. continue;
  12788. }
  12789. #endif
  12790. snprintf ( sFrom, sizeof(sFrom), "%s.%s", m_sFilename.cstr(), sExt );
  12791. snprintf ( sTo, sizeof(sTo), "%s.%s", sNewBase, sExt );
  12792. #if USE_WINDOWS
  12793. ::unlink ( sTo );
  12794. sphLogDebug ( "%s unlinked", sTo );
  12795. #endif
  12796. if ( ::rename ( sFrom, sTo ) )
  12797. {
  12798. m_sLastError.SetSprintf ( "rename %s to %s failed: %s", sFrom, sTo, strerror(errno) );
  12799. // this is no reason to fail if spl is missing, since it is only lock and no data.
  12800. if ( strcmp ( sExt, "spl" ) )
  12801. break;
  12802. }
  12803. uMask |= ( 1UL << iExt );
  12804. }
  12805. // are we good?
  12806. if ( iExt==EXT_COUNT )
  12807. {
  12808. SetBase ( sNewBase );
  12809. sphLogDebug ( "Base set to %s", sNewBase );
  12810. return true;
  12811. }
  12812. // if there were errors, rollback
  12813. for ( iExt=0; iExt<EXT_COUNT; iExt++ )
  12814. {
  12815. if (!( uMask & ( 1UL << iExt ) ))
  12816. continue;
  12817. const char * sExt = sExts[iExt];
  12818. snprintf ( sFrom, sizeof(sFrom), "%s.%s", sNewBase, sExt );
  12819. snprintf ( sTo, sizeof(sTo), "%s.%s", m_sFilename.cstr(), sExt );
  12820. if ( ::rename ( sFrom, sTo ) )
  12821. {
  12822. sphLogDebug ( "Rollback failure when renaming %s to %s", sFrom, sTo );
  12823. // !COMMIT should handle rollback failures somehow
  12824. }
  12825. }
  12826. return false;
  12827. }
  12828. //////////////////////////////////////////////////////////////////////////
  12829. CSphQueryContext::CSphQueryContext ()
  12830. {
  12831. m_iWeights = 0;
  12832. m_bLookupFilter = false;
  12833. m_bLookupSort = false;
  12834. m_bPackedFactors = false;
  12835. m_pFilter = NULL;
  12836. m_pWeightFilter = NULL;
  12837. m_pIndexData = NULL;
  12838. m_pProfile = NULL;
  12839. }
  /// Release the two filter objects through SafeDelete().
  /// NOTE(review): m_pIndexData and m_pProfile are not released here - presumably they are not owned by the context; confirm.
  12840. CSphQueryContext::~CSphQueryContext ()
  12841. {
  12842. SafeDelete ( m_pFilter );
  12843. SafeDelete ( m_pWeightFilter );
  12844. }
  12845. void CSphQueryContext::BindWeights ( const CSphQuery * pQuery, const CSphSchema & tSchema, int iIndexWeight )
  12846. {
  12847. const int MIN_WEIGHT = 1;
  12848. // const int HEAVY_FIELDS = 32;
  12849. const int HEAVY_FIELDS = SPH_MAX_FIELDS;
  12850. // defaults
  12851. m_iWeights = Min ( tSchema.m_dFields.GetLength(), HEAVY_FIELDS );
  12852. for ( int i=0; i<m_iWeights; i++ )
  12853. m_dWeights[i] = MIN_WEIGHT * iIndexWeight;
  12854. // name-bound weights
  12855. if ( pQuery->m_dFieldWeights.GetLength() )
  12856. {
  12857. ARRAY_FOREACH ( i, pQuery->m_dFieldWeights )
  12858. {
  12859. int j = tSchema.GetFieldIndex ( pQuery->m_dFieldWeights[i].m_sName.cstr() );
  12860. if ( j>=0 && j<HEAVY_FIELDS )
  12861. m_dWeights[j] = Max ( MIN_WEIGHT, pQuery->m_dFieldWeights[i].m_iValue ) * iIndexWeight;
  12862. }
  12863. return;
  12864. }
  12865. // order-bound weights
  12866. if ( pQuery->m_pWeights )
  12867. {
  12868. for ( int i=0; i<Min ( m_iWeights, pQuery->m_iWeights ); i++ )
  12869. m_dWeights[i] = Max ( MIN_WEIGHT, (int)pQuery->m_pWeights[i] ) * iIndexWeight;
  12870. }
  12871. }
  12872. bool CSphQueryContext::SetupCalc ( CSphQueryResult * pResult, const CSphSchema & tInSchema,
  12873. const CSphSchema & tSchema, const DWORD * pMvaPool )
  12874. {
  12875. m_dCalcFilter.Resize ( 0 );
  12876. m_dCalcSort.Resize ( 0 );
  12877. m_dCalcFinal.Resize ( 0 );
  12878. // quickly verify that all my real attributes can be stashed there
  12879. if ( tInSchema.GetAttrsCount() < tSchema.GetAttrsCount() )
  12880. {
  12881. pResult->m_sError.SetSprintf ( "INTERNAL ERROR: incoming-schema mismatch (incount=%d, mycount=%d)",
  12882. tInSchema.GetAttrsCount(), tSchema.GetAttrsCount() );
  12883. return false;
  12884. }
  12885. // now match everyone
  12886. for ( int iIn=0; iIn<tInSchema.GetAttrsCount(); iIn++ )
  12887. {
  12888. const CSphColumnInfo & tIn = tInSchema.GetAttr(iIn);
  12889. switch ( tIn.m_eStage )
  12890. {
  12891. case SPH_EVAL_STATIC:
  12892. case SPH_EVAL_OVERRIDE:
  12893. {
  12894. const CSphColumnInfo * pMy = tSchema.GetAttr ( tIn.m_sName.cstr() );
  12895. if ( !pMy )
  12896. {
  12897. pResult->m_sError.SetSprintf ( "INTERNAL ERROR: incoming-schema attr missing from index-schema (in=%s)",
  12898. sphDumpAttr(tIn).cstr() );
  12899. return false;
  12900. }
  12901. if ( tIn.m_eStage==SPH_EVAL_OVERRIDE )
  12902. {
  12903. // override; check for type/size match and dynamic part
  12904. if ( tIn.m_eAttrType!=pMy->m_eAttrType
  12905. || tIn.m_tLocator.m_iBitCount!=pMy->m_tLocator.m_iBitCount
  12906. || !tIn.m_tLocator.m_bDynamic )
  12907. {
  12908. pResult->m_sError.SetSprintf ( "INTERNAL ERROR: incoming-schema override mismatch (in=%s, my=%s)",
  12909. sphDumpAttr(tIn).cstr(), sphDumpAttr(*pMy).cstr() );
  12910. return false;
  12911. }
  12912. } else
  12913. {
  12914. // static; check for full match
  12915. if (!( tIn==*pMy ))
  12916. {
  12917. pResult->m_sError.SetSprintf ( "INTERNAL ERROR: incoming-schema mismatch (in=%s, my=%s)",
  12918. sphDumpAttr(tIn).cstr(), sphDumpAttr(*pMy).cstr() );
  12919. return false;
  12920. }
  12921. }
  12922. break;
  12923. }
  12924. case SPH_EVAL_PREFILTER:
  12925. case SPH_EVAL_PRESORT:
  12926. case SPH_EVAL_FINAL:
  12927. {
  12928. ISphExpr * pExpr = tIn.m_pExpr.Ptr();
  12929. if ( !pExpr )
  12930. {
  12931. pResult->m_sError.SetSprintf ( "INTERNAL ERROR: incoming-schema expression missing evaluator (stage=%d, in=%s)",
  12932. (int)tIn.m_eStage, sphDumpAttr(tIn).cstr() );
  12933. return false;
  12934. }
  12935. // an expression that index/searcher should compute
  12936. CalcItem_t tCalc;
  12937. tCalc.m_eType = tIn.m_eAttrType;
  12938. tCalc.m_tLoc = tIn.m_tLocator;
  12939. tCalc.m_pExpr = pExpr;
  12940. tCalc.m_pExpr->Command ( SPH_EXPR_SET_MVA_POOL, (void*)pMvaPool );
  12941. switch ( tIn.m_eStage )
  12942. {
  12943. case SPH_EVAL_PREFILTER: m_dCalcFilter.Add ( tCalc ); break;
  12944. case SPH_EVAL_PRESORT: m_dCalcSort.Add ( tCalc ); break;
  12945. case SPH_EVAL_FINAL: m_dCalcFinal.Add ( tCalc ); break;
  12946. default: break;
  12947. }
  12948. break;
  12949. }
  12950. case SPH_EVAL_SORTER:
  12951. // sorter tells it will compute itself; so just skip it
  12952. case SPH_EVAL_POSTLIMIT:
  12953. break;
  12954. default:
  12955. pResult->m_sError.SetSprintf ( "INTERNAL ERROR: unhandled eval stage=%d", (int)tIn.m_eStage );
  12956. return false;
  12957. }
  12958. }
  12959. // ok, we can emit matches in this schema (incoming for sorter, outgoing for index/searcher)
  12960. pResult->m_tSchema = tInSchema;
  12961. return true;
  12962. }
  12963. bool CSphIndex_VLN::IsStarDict () const
  12964. {
  12965. return (
  12966. ( m_uVersion>=7 && ( m_tSettings.m_iMinPrefixLen>0 || m_tSettings.m_iMinInfixLen>0 ) && m_bEnableStar ) || // v.7 added mangling to infixes
  12967. ( m_uVersion==6 && ( m_tSettings.m_iMinPrefixLen>0 ) && m_bEnableStar ) ); // v.6 added mangling to prefixes
  12968. }
  12969. CSphDict * CSphIndex_VLN::SetupStarDict ( CSphScopedPtr<CSphDict> & tContainer, CSphDict * pPrevDict ) const
  12970. {
  12971. // spawn wrapper, and put it in the box
  12972. // wrapper type depends on version; v.8 introduced new mangling rules
  12973. if ( !IsStarDict() )
  12974. return pPrevDict;
  12975. if ( m_uVersion>=8 )
  12976. tContainer = new CSphDictStarV8 ( pPrevDict, m_tSettings.m_iMinPrefixLen>0, m_tSettings.m_iMinInfixLen>0 );
  12977. else
  12978. tContainer = new CSphDictStar ( pPrevDict );
  12979. // FIXME? might wanna verify somehow that the tokenizer has '*' as a character
  12980. return tContainer.Ptr();
  12981. }
  12982. CSphDict * CSphIndex_VLN::SetupExactDict ( CSphScopedPtr<CSphDict> & tContainer, CSphDict * pPrevDict ) const
  12983. {
  12984. if ( m_uVersion<12 || !m_tSettings.m_bIndexExactWords )
  12985. return pPrevDict;
  12986. tContainer = new CSphDictExact ( pPrevDict );
  12987. return tContainer.Ptr();
  12988. }
  12989. bool CSphIndex_VLN::GetKeywords ( CSphVector <CSphKeywordInfo> & dKeywords,
  12990. const char * szQuery, bool bGetStats, CSphString & sError ) const
  12991. {
  12992. WITH_QWORD ( this, false, Qword, return DoGetKeywords<Qword> ( dKeywords, szQuery, bGetStats, false, sError ) );
  12993. return false;
  12994. }
  12995. template < class Qword >
  12996. bool CSphIndex_VLN::DoGetKeywords ( CSphVector <CSphKeywordInfo> & dKeywords,
  12997. const char * szQuery, bool bGetStats, bool bFillOnly, CSphString & sError ) const
  12998. {
  12999. if ( !m_pPreread || !*m_pPreread )
  13000. {
  13001. sError = "index not preread";
  13002. return false;
  13003. }
  13004. // short-cut if no query or keywords to fill
  13005. if ( ( bFillOnly && !dKeywords.GetLength() ) || ( !bFillOnly && ( !szQuery || !szQuery[0] ) ) )
  13006. return true;
  13007. CSphScopedPtr <CSphAutofile> pDoclist ( NULL );
  13008. CSphScopedPtr <CSphAutofile> pHitlist ( NULL );
  13009. CSphScopedPtr<ISphTokenizer> pTokenizer ( m_pTokenizer->Clone ( SPH_CLONE_INDEX ) ); // avoid race
  13010. pTokenizer->EnableTokenizedMultiformTracking ();
  13011. // need to support '*' and '=' but not the other specials
  13012. // so m_pQueryTokenizer does not work for us, gotta clone and setup one manually
  13013. if ( IsStarDict() )
  13014. pTokenizer->AddPlainChar ( '*' );
  13015. if ( m_tSettings.m_bIndexExactWords )
  13016. pTokenizer->AddPlainChar ( '=' );
  13017. CSphScopedPtr<CSphDict> tDictCloned ( NULL );
  13018. CSphDict * pDictBase = m_pDict;
  13019. if ( pDictBase->HasState() )
  13020. tDictCloned = pDictBase = pDictBase->Clone();
  13021. CSphScopedPtr<CSphDict> tDict ( NULL );
  13022. CSphDict * pDict = SetupStarDict ( tDict, pDictBase );
  13023. CSphScopedPtr<CSphDict> tDict2 ( NULL );
  13024. pDict = SetupExactDict ( tDict2, pDict );
  13025. // prepare for setup
  13026. CSphAutofile tDummy1, tDummy2, tDummy3, tWordlist;
  13027. if ( !m_bKeepFilesOpen )
  13028. if ( tWordlist.Open ( GetIndexFileName ( "spi" ), SPH_O_READ, sError ) < 0 )
  13029. return false;
  13030. DiskIndexQwordSetup_c tTermSetup ( tDummy1, tDummy2
  13031. , m_bPreloadWordlist ? tDummy3 : ( m_bKeepFilesOpen ? m_tWordlist.m_tFile : tWordlist )
  13032. , m_bPreloadWordlist ? 0 : m_tWordlist.m_iMaxChunk, m_pSkiplists.GetWritePtr(), NULL );
  13033. tTermSetup.m_pDict = pDict;
  13034. tTermSetup.m_pIndex = this;
  13035. tTermSetup.m_eDocinfo = m_tSettings.m_eDocinfo;
  13036. Qword QueryWord ( false, false );
  13037. if ( !bFillOnly )
  13038. {
  13039. dKeywords.Resize ( 0 );
  13040. CSphString sTokenized;
  13041. BYTE * sWord;
  13042. CSphString sQbuf ( szQuery );
  13043. pTokenizer->SetBuffer ( (BYTE*)sQbuf.cstr(), strlen(szQuery) );
  13044. while ( ( sWord = pTokenizer->GetToken() )!=NULL )
  13045. {
  13046. BYTE * sMultiform = pTokenizer->GetTokenizedMultiform();
  13047. if ( sMultiform )
  13048. sTokenized = (const char*)sMultiform;
  13049. else
  13050. sTokenized = (const char*)sWord;
  13051. SphWordID_t iWord = pDict->GetWordID ( sWord );
  13052. if ( iWord )
  13053. {
  13054. if ( bGetStats )
  13055. {
  13056. QueryWord.Reset ();
  13057. QueryWord.m_sWord = (const char*)sWord;
  13058. QueryWord.m_sDictWord = (const char*)sWord;
  13059. QueryWord.m_iWordID = iWord;
  13060. tTermSetup.QwordSetup ( &QueryWord );
  13061. }
  13062. CSphKeywordInfo & tInfo = dKeywords.Add();
  13063. Swap ( tInfo.m_sTokenized, sTokenized );
  13064. tInfo.m_sNormalized = (const char*)sWord;
  13065. tInfo.m_iDocs = bGetStats ? QueryWord.m_iDocs : 0;
  13066. tInfo.m_iHits = bGetStats ? QueryWord.m_iHits : 0;
  13067. if ( tInfo.m_sNormalized.cstr()[0]==MAGIC_WORD_HEAD_NONSTEMMED )
  13068. *(char *)tInfo.m_sNormalized.cstr() = '=';
  13069. }
  13070. }
  13071. } else
  13072. {
  13073. BYTE sWord[MAX_KEYWORD_BYTES];
  13074. ARRAY_FOREACH ( i, dKeywords )
  13075. {
  13076. CSphKeywordInfo & tInfo = dKeywords[i];
  13077. int iLen = tInfo.m_sTokenized.Length();
  13078. memcpy ( sWord, tInfo.m_sTokenized.cstr(), iLen );
  13079. sWord[iLen] = '\0';
  13080. SphWordID_t iWord = pDict->GetWordID ( sWord );
  13081. if ( iWord )
  13082. {
  13083. QueryWord.Reset ();
  13084. QueryWord.m_sWord = tInfo.m_sTokenized;
  13085. QueryWord.m_sDictWord = (const char*)sWord;
  13086. QueryWord.m_iWordID = iWord;
  13087. tTermSetup.QwordSetup ( &QueryWord );
  13088. tInfo.m_iDocs = QueryWord.m_iDocs;
  13089. tInfo.m_iHits = QueryWord.m_iHits;
  13090. }
  13091. }
  13092. }
  13093. return true;
  13094. }
  13095. bool CSphIndex_VLN::FillKeywords ( CSphVector <CSphKeywordInfo> & dKeywords, CSphString & sError ) const
  13096. {
  13097. WITH_QWORD ( this, false, Qword, return DoGetKeywords<Qword> ( dKeywords, NULL, true, true, sError ) );
  13098. return false;
  13099. }
  13100. // fix MSVC 2005 fuckup, template DoGetKeywords() just above somehow resets forScope
  13101. #if USE_WINDOWS
  13102. #pragma conform(forScope,on)
  13103. #endif
  13104. static bool IsWeightColumn ( const CSphString & sAttr, const CSphSchema & tSchema )
  13105. {
  13106. if ( sAttr=="@weight" )
  13107. return true;
  13108. const CSphColumnInfo * pCol = tSchema.GetAttr ( sAttr.cstr() );
  13109. return ( pCol && pCol->m_bWeight );
  13110. }
  13111. bool CSphQueryContext::CreateFilters ( bool bFullscan,
  13112. const CSphVector<CSphFilterSettings> * pdFilters, const CSphSchema & tSchema,
  13113. const DWORD * pMvaPool, const BYTE * pStrings, CSphString & sError )
  13114. {
  13115. if ( !pdFilters )
  13116. return true;
  13117. ARRAY_FOREACH ( i, (*pdFilters) )
  13118. {
  13119. const CSphFilterSettings & tFilter = (*pdFilters)[i];
  13120. if ( tFilter.m_sAttrName.IsEmpty() )
  13121. continue;
  13122. bool bWeight = IsWeightColumn ( tFilter.m_sAttrName, tSchema );
  13123. if ( bFullscan && bWeight )
  13124. continue; // @weight is not avaiable in fullscan mode
  13125. ISphFilter * pFilter = sphCreateFilter ( tFilter, tSchema, pMvaPool, pStrings, sError );
  13126. if ( !pFilter )
  13127. return false;
  13128. ISphFilter ** pGroup = bWeight ? &m_pWeightFilter : &m_pFilter;
  13129. *pGroup = sphJoinFilters ( *pGroup, pFilter );
  13130. }
  13131. if ( m_pFilter )
  13132. m_pFilter = m_pFilter->Optimize();
  13133. return true;
  13134. }
  13135. bool CSphQueryContext::SetupOverrides ( const CSphQuery * pQuery, CSphQueryResult * pResult, const CSphSchema & tIndexSchema )
  13136. {
  13137. m_pOverrides = NULL;
  13138. m_dOverrideIn.Resize ( pQuery->m_dOverrides.GetLength() );
  13139. m_dOverrideOut.Resize ( pQuery->m_dOverrides.GetLength() );
  13140. ARRAY_FOREACH ( i, pQuery->m_dOverrides )
  13141. {
  13142. const char * sAttr = pQuery->m_dOverrides[i].m_sAttr.cstr(); // shortcut
  13143. const CSphColumnInfo * pCol = tIndexSchema.GetAttr ( sAttr );
  13144. if ( !pCol )
  13145. {
  13146. pResult->m_sError.SetSprintf ( "attribute override: unknown attribute name '%s'", sAttr );
  13147. return false;
  13148. }
  13149. if ( pCol->m_eAttrType!=pQuery->m_dOverrides[i].m_eAttrType )
  13150. {
  13151. pResult->m_sError.SetSprintf ( "attribute override: attribute '%s' type mismatch (index=%d, query=%d)",
  13152. sAttr, pCol->m_eAttrType, pQuery->m_dOverrides[i].m_eAttrType );
  13153. return false;
  13154. }
  13155. const CSphColumnInfo * pOutCol = pResult->m_tSchema.GetAttr ( pQuery->m_dOverrides[i].m_sAttr.cstr() );
  13156. if ( !pOutCol )
  13157. {
  13158. pResult->m_sError.SetSprintf ( "attribute override: unknown attribute name '%s' in outgoing schema", sAttr );
  13159. return false;
  13160. }
  13161. m_dOverrideIn[i] = pCol->m_tLocator;
  13162. m_dOverrideOut[i] = pOutCol->m_tLocator;
  13163. #ifndef NDEBUG
  13164. // check that the values are actually sorted
  13165. const CSphVector<CSphAttrOverride::IdValuePair_t> & dValues = pQuery->m_dOverrides[i].m_dValues;
  13166. for ( int j=1; j<dValues.GetLength(); j++ )
  13167. assert ( dValues[j-1] < dValues[j] );
  13168. #endif
  13169. }
  13170. if ( pQuery->m_dOverrides.GetLength() )
  13171. m_pOverrides = &pQuery->m_dOverrides;
  13172. return true;
  13173. }
  13174. static int sphQueryHeightCalc ( const XQNode_t * pNode )
  13175. {
  13176. if ( !pNode->m_dChildren.GetLength() )
  13177. {
  13178. // exception, pre-cached OR of tiny (rare) keywords is just one node
  13179. if ( pNode->GetOp()==SPH_QUERY_OR )
  13180. {
  13181. #ifndef NDEBUG
  13182. // sanity checks
  13183. // this node must be only created for a huge OR of tiny expansions
  13184. assert ( pNode->m_dWords.GetLength() );
  13185. ARRAY_FOREACH ( i, pNode->m_dWords )
  13186. {
  13187. assert ( pNode->m_dWords[i].m_iAtomPos==pNode->m_dWords[0].m_iAtomPos );
  13188. assert ( pNode->m_dWords[i].m_bExpanded );
  13189. }
  13190. #endif
  13191. return 1;
  13192. }
  13193. return pNode->m_dWords.GetLength();
  13194. }
  13195. if ( pNode->GetOp()==SPH_QUERY_BEFORE )
  13196. return 1;
  13197. int iMaxChild = 0;
  13198. int iHeight = 0;
  13199. ARRAY_FOREACH ( i, pNode->m_dChildren )
  13200. {
  13201. int iBottom = sphQueryHeightCalc ( pNode->m_dChildren[i] );
  13202. int iTop = pNode->m_dChildren.GetLength()-i-1;
  13203. if ( iBottom+iTop>=iMaxChild+iHeight )
  13204. {
  13205. iMaxChild = iBottom;
  13206. iHeight = iTop;
  13207. }
  13208. }
  13209. return iMaxChild+iHeight;
  13210. }
  13211. #define SPH_EXTNODE_STACK_SIZE 160
  13212. bool sphCheckQueryHeight ( const XQNode_t * pRoot, CSphString & sError )
  13213. {
  13214. int iHeight = 0;
  13215. if ( pRoot )
  13216. iHeight = sphQueryHeightCalc ( pRoot );
  13217. int64_t iQueryStack = sphGetStackUsed() + iHeight*SPH_EXTNODE_STACK_SIZE;
  13218. bool bValid = ( g_iThreadStackSize>=iQueryStack );
  13219. if ( !bValid )
  13220. sError.SetSprintf ( "query too complex, not enough stack (thread_stack=%dK or higher required)",
  13221. (int)( ( iQueryStack + 1024 - ( iQueryStack%1024 ) ) / 1024 ) );
  13222. return bValid;
  13223. }
  13224. static XQNode_t * CloneKeyword ( const XQNode_t * pNode )
  13225. {
  13226. assert ( pNode );
  13227. XQNode_t * pRes = new XQNode_t ( pNode->m_dSpec );
  13228. pRes->m_dWords = pNode->m_dWords;
  13229. return pRes;
  13230. }
  13231. static XQNode_t * ExpandKeyword ( XQNode_t * pNode, const CSphIndexSettings & tSettings, bool bStarEnabled )
  13232. {
  13233. assert ( pNode );
  13234. XQNode_t * pExpand = new XQNode_t ( pNode->m_dSpec );
  13235. pExpand->SetOp ( SPH_QUERY_OR, pNode );
  13236. if ( tSettings.m_iMinInfixLen>0 && bStarEnabled )
  13237. {
  13238. assert ( pNode->m_dChildren.GetLength()==0 );
  13239. assert ( pNode->m_dWords.GetLength()==1 );
  13240. XQNode_t * pInfix = CloneKeyword ( pNode );
  13241. pInfix->m_dWords[0].m_sWord.SetSprintf ( "*%s*", pNode->m_dWords[0].m_sWord.cstr() );
  13242. pInfix->m_pParent = pExpand;
  13243. pExpand->m_dChildren.Add ( pInfix );
  13244. }
  13245. if ( tSettings.m_bIndexExactWords )
  13246. {
  13247. assert ( pNode->m_dChildren.GetLength()==0 );
  13248. assert ( pNode->m_dWords.GetLength()==1 );
  13249. XQNode_t * pExact = CloneKeyword ( pNode );
  13250. pExact->m_dWords[0].m_sWord.SetSprintf ( "=%s", pNode->m_dWords[0].m_sWord.cstr() );
  13251. pExact->m_pParent = pExpand;
  13252. pExpand->m_dChildren.Add ( pExact );
  13253. }
  13254. return pExpand;
  13255. }
  13256. XQNode_t * sphQueryExpandKeywords ( XQNode_t * pNode, const CSphIndexSettings & tSettings, bool bStarEnabled )
  13257. {
  13258. // only if expansion makes sense at all
  13259. if ( tSettings.m_iMinInfixLen<=0 && !tSettings.m_bIndexExactWords )
  13260. return pNode;
  13261. // process children for composite nodes
  13262. if ( pNode->m_dChildren.GetLength() )
  13263. {
  13264. ARRAY_FOREACH ( i, pNode->m_dChildren )
  13265. {
  13266. pNode->m_dChildren[i] = sphQueryExpandKeywords ( pNode->m_dChildren[i], tSettings, bStarEnabled );
  13267. pNode->m_dChildren[i]->m_pParent = pNode;
  13268. }
  13269. return pNode;
  13270. }
  13271. // if that's a phrase/proximity node, create a very special, magic phrase/proximity node
  13272. if ( pNode->GetOp()==SPH_QUERY_PHRASE || pNode->GetOp()==SPH_QUERY_PROXIMITY || pNode->GetOp()==SPH_QUERY_QUORUM )
  13273. {
  13274. assert ( pNode->m_dWords.GetLength()>1 );
  13275. ARRAY_FOREACH ( i, pNode->m_dWords )
  13276. {
  13277. XQNode_t * pWord = new XQNode_t ( pNode->m_dSpec );
  13278. pWord->m_dWords.Add ( pNode->m_dWords[i] );
  13279. pNode->m_dChildren.Add ( ExpandKeyword ( pWord, tSettings, bStarEnabled ) );
  13280. pNode->m_dChildren.Last()->m_iAtomPos = pNode->m_dWords[i].m_iAtomPos;
  13281. pNode->m_dChildren.Last()->m_pParent = pNode;
  13282. }
  13283. pNode->m_dWords.Reset();
  13284. pNode->m_bVirtuallyPlain = true;
  13285. return pNode;
  13286. }
  13287. // skip empty plain nodes
  13288. if ( pNode->m_dWords.GetLength()<=0 )
  13289. return pNode;
  13290. // process keywords for plain nodes
  13291. assert ( pNode->m_dWords.GetLength()==1 );
  13292. XQKeyword_t & tKeyword = pNode->m_dWords[0];
  13293. if ( tKeyword.m_sWord.Begins("=")
  13294. || tKeyword.m_sWord.Begins("*")
  13295. || tKeyword.m_sWord.Ends("*") )
  13296. {
  13297. return pNode;
  13298. }
  13299. // do the expansion
  13300. return ExpandKeyword ( pNode, tSettings, bStarEnabled );
  13301. }
  13302. void sphQueryAdjustStars ( XQNode_t * pNode, const CSphIndexSettings & tSettings )
  13303. {
  13304. if ( pNode->m_dChildren.GetLength() )
  13305. {
  13306. ARRAY_FOREACH ( i, pNode->m_dChildren )
  13307. sphQueryAdjustStars ( pNode->m_dChildren[i], tSettings );
  13308. return;
  13309. }
  13310. ARRAY_FOREACH ( i, pNode->m_dWords )
  13311. {
  13312. CSphString & sWord = pNode->m_dWords[i].m_sWord;
  13313. // trim all wildcards
  13314. const char * s = sWord.cstr();
  13315. int iLen = sWord.Length();
  13316. while ( iLen>0 && sphIsWild ( s[iLen-1] ) )
  13317. iLen--;
  13318. while ( iLen>0 && sphIsWild(*s) )
  13319. {
  13320. s++;
  13321. iLen--;
  13322. }
  13323. sWord = sWord.SubString ( (int)( s-sWord.cstr() ), iLen );
  13324. // and now append stars if needed
  13325. if ( tSettings.m_iMinPrefixLen>0 && iLen>=tSettings.m_iMinPrefixLen )
  13326. sWord = sWord.SetSprintf ( "%s*", sWord.cstr() );
  13327. else if ( tSettings.m_iMinInfixLen>0 && iLen>=tSettings.m_iMinInfixLen )
  13328. sWord = sWord.SetSprintf ( "*%s*", sWord.cstr() );
  13329. }
  13330. }
  13331. // transform the "one two three"/1 quorum into one|two|three (~40% faster)
  13332. static void TransformQuorum ( XQNode_t ** ppNode )
  13333. {
  13334. XQNode_t *& pNode = *ppNode;
  13335. if ( pNode->GetOp()!=SPH_QUERY_QUORUM || pNode->m_iOpArg!=1 )
  13336. return;
  13337. assert ( pNode->m_dChildren.GetLength()==0 );
  13338. CSphVector<XQNode_t*> dArgs;
  13339. ARRAY_FOREACH ( i, pNode->m_dWords )
  13340. {
  13341. XQNode_t * pAnd = new XQNode_t ( pNode->m_dSpec );
  13342. pAnd->m_dWords.Add ( pNode->m_dWords[i] );
  13343. dArgs.Add ( pAnd );
  13344. }
  13345. pNode->m_dWords.Reset();
  13346. pNode->SetOp ( SPH_QUERY_OR, dArgs );
  13347. }
  13348. struct BinaryNode_t
  13349. {
  13350. int m_iLo;
  13351. int m_iHi;
  13352. };
  13353. static void BuildExpandedTree ( const XQKeyword_t & tRootWord, CSphVector<CSphNamedInt> & dWordSrc, XQNode_t * pRoot, bool bMergeSingles )
  13354. {
  13355. assert ( dWordSrc.GetLength() );
  13356. pRoot->m_dWords.Reset();
  13357. // put all tiny enough expansions in a single node
  13358. int iTinyStart = 0;
  13359. if ( pRoot->m_dSpec.m_dZones.GetLength() || !bMergeSingles )
  13360. {
  13361. // OPTIMIZE
  13362. // ExtCached_c only supports field filtering but not zone filtering for now
  13363. // so we skip tiny expansions optimizations in that case; we also do that in RT case
  13364. // FIXME!!! why not in RT case??? check that case and perf
  13365. iTinyStart = dWordSrc.GetLength();
  13366. } else
  13367. {
  13368. // lookup where those start, relying on that dWordSrc should be reverse sorted
  13369. while ( iTinyStart<dWordSrc.GetLength() && dWordSrc[iTinyStart].m_iValue>1 )
  13370. iTinyStart++;
  13371. }
  13372. XQNode_t * pTiny = NULL;
  13373. if ( iTinyStart!=dWordSrc.GetLength() )
  13374. {
  13375. if ( iTinyStart==0 )
  13376. pTiny = pRoot;
  13377. else
  13378. pTiny = new XQNode_t ( pRoot->m_dSpec );
  13379. pTiny->SetOp ( SPH_QUERY_OR );
  13380. for ( int i=iTinyStart; i<dWordSrc.GetLength(); i++ )
  13381. {
  13382. XQKeyword_t & tWord = pTiny->m_dWords.Add();
  13383. tWord.m_sWord = dWordSrc[i].m_sName;
  13384. tWord.m_iAtomPos = tRootWord.m_iAtomPos;
  13385. tWord.m_bExpanded = true;
  13386. // bFieldStart, bFieldEnd?
  13387. }
  13388. // if we created a new node, we have to propagate field/zone specs there
  13389. if ( pTiny!=pRoot )
  13390. pTiny->CopySpecs ( pRoot );
  13391. if ( iTinyStart==0 )
  13392. return;
  13393. dWordSrc.Resize ( iTinyStart );
  13394. }
  13395. // build a binary tree from all the other expansions
  13396. CSphVector<BinaryNode_t> dNodes;
  13397. dNodes.Reserve ( dWordSrc.GetLength() );
  13398. XQNode_t * pCur = pRoot;
  13399. dNodes.Add();
  13400. dNodes.Last().m_iLo = 0;
  13401. dNodes.Last().m_iHi = ( dWordSrc.GetLength()-1 );
  13402. while ( dNodes.GetLength() )
  13403. {
  13404. BinaryNode_t tNode = dNodes.Pop();
  13405. if ( tNode.m_iHi<tNode.m_iLo )
  13406. {
  13407. pCur = pCur->m_pParent;
  13408. continue;
  13409. }
  13410. int iMid = ( tNode.m_iLo+tNode.m_iHi ) / 2;
  13411. dNodes.Add ();
  13412. dNodes.Last().m_iLo = tNode.m_iLo;
  13413. dNodes.Last().m_iHi = iMid-1;
  13414. dNodes.Add ();
  13415. dNodes.Last().m_iLo = iMid+1;
  13416. dNodes.Last().m_iHi = tNode.m_iHi;
  13417. if ( pCur->m_dWords.GetLength() )
  13418. {
  13419. assert ( pCur->m_dWords.GetLength()==1 );
  13420. XQNode_t * pTerm = CloneKeyword ( pRoot );
  13421. Swap ( pTerm->m_dWords, pCur->m_dWords );
  13422. pCur->m_dChildren.Add ( pTerm );
  13423. pTerm->m_pParent = pCur;
  13424. }
  13425. XQNode_t * pChild = CloneKeyword ( pRoot );
  13426. pChild->m_dWords.Add ( tRootWord );
  13427. pChild->m_dWords.Last().m_sWord.Swap ( dWordSrc[iMid].m_sName );
  13428. pChild->m_dWords.Last().m_bExpanded = true;
  13429. pChild->m_bNotWeighted = ( dWordSrc[iMid].m_iValue==0 );
  13430. pChild->m_pParent = pCur;
  13431. pCur->m_dChildren.Add ( pChild );
  13432. pCur->SetOp ( SPH_QUERY_OR );
  13433. pCur = pChild;
  13434. }
  13435. if ( pTiny )
  13436. {
  13437. assert ( pRoot->GetOp()==SPH_QUERY_OR );
  13438. assert ( pRoot->m_dChildren.GetLength() );
  13439. assert ( pRoot!=pTiny );
  13440. pRoot->m_dChildren.Add ( pTiny );
  13441. pTiny->m_pParent = pRoot;
  13442. }
  13443. }
  13444. void Swap ( CSphNamedInt & a, CSphNamedInt & b )
  13445. {
  13446. a.m_sName.Swap ( b.m_sName );
  13447. Swap ( a.m_iValue, b.m_iValue );
  13448. }
  13449. struct WordDocsGreaterOp_t
  13450. {
  13451. inline bool IsLess ( const CSphNamedInt & a, const CSphNamedInt & b )
  13452. {
  13453. return a.m_iValue > b.m_iValue;
  13454. }
  13455. };
  13456. /// do wildcard expansion for keywords dictionary
  13457. /// (including prefix and infix expansion)
  13458. XQNode_t * sphExpandXQNode ( XQNode_t * pNode, ExpansionContext_t & tCtx )
  13459. {
  13460. assert ( pNode );
  13461. assert ( tCtx.m_pResult );
  13462. // process children for composite nodes
  13463. if ( pNode->m_dChildren.GetLength() )
  13464. {
  13465. ARRAY_FOREACH ( i, pNode->m_dChildren )
  13466. {
  13467. pNode->m_dChildren[i] = sphExpandXQNode ( pNode->m_dChildren[i], tCtx );
  13468. pNode->m_dChildren[i]->m_pParent = pNode;
  13469. }
  13470. return pNode;
  13471. }
  13472. // if that's a phrase/proximity node, create a very special, magic phrase/proximity node
  13473. if ( pNode->GetOp()==SPH_QUERY_PHRASE || pNode->GetOp()==SPH_QUERY_PROXIMITY || pNode->GetOp()==SPH_QUERY_QUORUM )
  13474. {
  13475. assert ( pNode->m_dWords.GetLength()>1 );
  13476. ARRAY_FOREACH ( i, pNode->m_dWords )
  13477. {
  13478. XQNode_t * pWord = new XQNode_t ( pNode->m_dSpec );
  13479. pWord->m_dWords.Add ( pNode->m_dWords[i] );
  13480. pNode->m_dChildren.Add ( sphExpandXQNode ( pWord, tCtx ) );
  13481. pNode->m_dChildren.Last()->m_iAtomPos = pNode->m_dWords[i].m_iAtomPos;
  13482. pNode->m_dChildren.Last()->m_pParent = pNode;
  13483. // tricky part
  13484. // current node may have field/zone limits attached
  13485. // normally those get pushed down during query parsing
  13486. // but here we create nodes manually and have to push down limits too
  13487. pWord->CopySpecs ( pNode );
  13488. }
  13489. pNode->m_dWords.Reset();
  13490. pNode->m_bVirtuallyPlain = true;
  13491. return pNode;
  13492. }
  13493. // skip empty plain nodes
  13494. if ( pNode->m_dWords.GetLength()<=0 )
  13495. return pNode;
  13496. // process keywords for plain nodes
  13497. assert ( pNode->m_dChildren.GetLength()==0 );
  13498. assert ( pNode->m_dWords.GetLength()==1 );
  13499. // check the wildcards
  13500. const char * sFull = pNode->m_dWords[0].m_sWord.cstr();
  13501. const int iLen = strlen ( sFull );
  13502. int iWilds = 0;
  13503. for ( const char * s = sFull; *s; s++ )
  13504. if ( sphIsWild(*s) )
  13505. iWilds++;
  13506. // no wildcards, or just wildcards? do not expand
  13507. if ( !iWilds || iWilds==iLen )
  13508. return pNode;
  13509. CSphVector<CSphNamedInt> dExpanded;
  13510. if ( !sphIsWild(*sFull) || tCtx.m_iMinInfixLen==0 )
  13511. {
  13512. // do prefix expansion
  13513. // remove exact form modifier, if any
  13514. const char * sPrefix = sFull;
  13515. if ( *sPrefix=='=' )
  13516. sPrefix++;
  13517. // skip leading wildcards
  13518. // (in case we got here on non-infixed index path)
  13519. const char * sWildcard = sPrefix;
  13520. while ( sphIsWild ( *sPrefix ) )
  13521. {
  13522. sPrefix++;
  13523. sWildcard++;
  13524. }
  13525. // compute non-wildcard prefix length
  13526. int iPrefix = 0;
  13527. for ( const char * s = sPrefix; *s && !sphIsWild(*s); s++ )
  13528. iPrefix++;
  13529. // do not expand prefixes under min length
  13530. int iMinLen = Max ( tCtx.m_iMinPrefixLen, tCtx.m_iMinInfixLen );
  13531. if ( iPrefix<iMinLen )
  13532. return pNode;
  13533. // prefix expansion should work on nonstemmed words only
  13534. char sFixed [ MAX_KEYWORD_BYTES ];
  13535. if ( tCtx.m_bHasMorphology )
  13536. {
  13537. sFixed[0] = MAGIC_WORD_HEAD_NONSTEMMED;
  13538. memcpy ( sFixed+1, sPrefix, iPrefix );
  13539. sPrefix = sFixed;
  13540. iPrefix++;
  13541. }
  13542. tCtx.m_pWordlist->GetPrefixedWords ( sPrefix, iPrefix, sWildcard, dExpanded, tCtx.m_pBuf, tCtx.m_iFD );
  13543. } else
  13544. {
  13545. // do infix expansion
  13546. assert ( sphIsWild(*sFull) );
  13547. assert ( tCtx.m_iMinInfixLen>0 );
  13548. // find the longest substring of non-wildcards
  13549. const char * sMaxInfix = NULL;
  13550. int iMaxInfix = 0;
  13551. int iCur = 0;
  13552. for ( const char * s = sFull; *s; s++ )
  13553. {
  13554. if ( sphIsWild(*s) )
  13555. {
  13556. iCur = 0;
  13557. } else if ( ++iCur > iMaxInfix )
  13558. {
  13559. sMaxInfix = s-iCur+1;
  13560. iMaxInfix = iCur;
  13561. }
  13562. }
  13563. // do not expand infixes under min_infix_len
  13564. if ( iMaxInfix < tCtx.m_iMinInfixLen )
  13565. return pNode;
  13566. // ignore heading star
  13567. tCtx.m_pWordlist->GetInfixedWords ( sMaxInfix, iMaxInfix, sFull, dExpanded );
  13568. }
  13569. // no real expansions?
  13570. // mark source word as expanded to prevent warning on terms mismatch in statistics
  13571. if ( !dExpanded.GetLength() )
  13572. {
  13573. pNode->m_dWords.Begin()->m_bExpanded = true;
  13574. return pNode;
  13575. }
  13576. // sort expansions by frequency desc
  13577. // clip the less frequent ones if needed, as they are likely misspellings
  13578. dExpanded.Sort ( WordDocsGreaterOp_t() );
  13579. if ( tCtx.m_iExpansionLimit && tCtx.m_iExpansionLimit<dExpanded.GetLength() )
  13580. dExpanded.Resize ( tCtx.m_iExpansionLimit );
  13581. // mark new words as expanded to skip theirs check on merge
  13582. // (expanded words differ across indexes)
  13583. ARRAY_FOREACH ( i, dExpanded )
  13584. tCtx.m_pResult->AddStat ( dExpanded[i].m_sName, 0, 0, true );
  13585. // replace MAGIC_WORD_HEAD_NONSTEMMED symbol to '='
  13586. if ( tCtx.m_bHasMorphology )
  13587. ARRAY_FOREACH ( i, dExpanded )
  13588. if ( dExpanded[i].m_sName.cstr()[0]==MAGIC_WORD_HEAD_NONSTEMMED )
  13589. ( (char *)dExpanded[i].m_sName.cstr() )[0] = '=';
  13590. // copy the original word (iirc it might get overwritten),
  13591. // and build a binary tree of all the expansions
  13592. const XQKeyword_t tRootWord = pNode->m_dWords[0];
  13593. BuildExpandedTree ( tRootWord, dExpanded, pNode, tCtx.m_bMergeSingles );
  13594. return pNode;
  13595. }
  13596. XQNode_t * CSphIndex_VLN::ExpandPrefix ( XQNode_t * pNode, CSphString & sError, CSphQueryResultMeta * pResult ) const
  13597. {
  13598. if ( !pNode || !m_pDict->GetSettings().m_bWordDict || ( m_tSettings.m_iMinPrefixLen<=0 && m_tSettings.m_iMinInfixLen<=0 ) )
  13599. return pNode;
  13600. // thread safe outer storage for dictionaries chunks and file
  13601. BYTE * pBuf = NULL;
  13602. int iFD = -1;
  13603. CSphAutofile rdWordlist;
  13604. if ( !m_bPreloadWordlist )
  13605. {
  13606. if ( m_bKeepFilesOpen )
  13607. iFD = m_tWordlist.m_tFile.GetFD();
  13608. else
  13609. {
  13610. iFD = rdWordlist.Open ( GetIndexFileName ( "spi" ), SPH_O_READ, sError );
  13611. if ( iFD<0 )
  13612. return NULL;
  13613. }
  13614. if ( m_tWordlist.m_iMaxChunk>0 )
  13615. pBuf = new BYTE [ m_tWordlist.m_iMaxChunk ];
  13616. }
  13617. assert ( m_pPreread && *m_pPreread );
  13618. assert ( !m_bPreloadWordlist || !m_tWordlist.m_pBuf.IsEmpty() );
  13619. ExpansionContext_t tCtx;
  13620. tCtx.m_pWordlist = &m_tWordlist;
  13621. tCtx.m_pBuf = pBuf;
  13622. tCtx.m_pResult = pResult;
  13623. tCtx.m_iFD = iFD;
  13624. tCtx.m_iMinPrefixLen = m_tSettings.m_iMinPrefixLen;
  13625. tCtx.m_iMinInfixLen = m_tSettings.m_iMinInfixLen;
  13626. tCtx.m_iExpansionLimit = m_iExpansionLimit;
  13627. tCtx.m_bHasMorphology = m_pDict->HasMorphology();
  13628. tCtx.m_bMergeSingles = ( m_tSettings.m_eDocinfo!=SPH_DOCINFO_INLINE );
  13629. pNode = sphExpandXQNode ( pNode, tCtx );
  13630. pNode->Check ( true );
  13631. SafeDeleteArray ( pBuf );
  13632. return pNode;
  13633. }
  13634. // transform the (A B) NEAR C into A NEAR B NEAR C
  13635. static void TransformNear ( XQNode_t ** ppNode )
  13636. {
  13637. XQNode_t *& pNode = *ppNode;
  13638. if ( pNode->GetOp()==SPH_QUERY_NEAR )
  13639. {
  13640. assert ( pNode->m_dWords.GetLength()==0 );
  13641. CSphVector<XQNode_t*> dArgs;
  13642. int iStartFrom;
  13643. // transform all (A B C) NEAR D into A NEAR B NEAR C NEAR D
  13644. do
  13645. {
  13646. dArgs.Reset();
  13647. iStartFrom = 0;
  13648. ARRAY_FOREACH ( i, pNode->m_dChildren )
  13649. {
  13650. XQNode_t * pChild = pNode->m_dChildren[i]; ///< shortcut
  13651. if ( pChild->GetOp()==SPH_QUERY_AND && pChild->m_dChildren.GetLength()>0 )
  13652. {
  13653. ARRAY_FOREACH ( j, pChild->m_dChildren )
  13654. {
  13655. if ( j==0 && iStartFrom==0 )
  13656. {
  13657. // we will remove the node anyway, so just replace it with 1-st child instead
  13658. pNode->m_dChildren[i] = pChild->m_dChildren[j];
  13659. pNode->m_dChildren[i]->m_pParent = pNode;
  13660. iStartFrom = i+1;
  13661. } else
  13662. {
  13663. dArgs.Add ( pChild->m_dChildren[j] );
  13664. }
  13665. }
  13666. pChild->m_dChildren.Reset();
  13667. SafeDelete ( pChild );
  13668. } else if ( iStartFrom!=0 )
  13669. {
  13670. dArgs.Add ( pChild );
  13671. }
  13672. }
  13673. if ( iStartFrom!=0 )
  13674. {
  13675. pNode->m_dChildren.Resize ( iStartFrom + dArgs.GetLength() );
  13676. ARRAY_FOREACH ( i, dArgs )
  13677. {
  13678. pNode->m_dChildren [ i + iStartFrom ] = dArgs[i];
  13679. pNode->m_dChildren [ i + iStartFrom ]->m_pParent = pNode;
  13680. }
  13681. }
  13682. } while ( iStartFrom!=0 );
  13683. }
  13684. ARRAY_FOREACH ( i, pNode->m_dChildren )
  13685. TransformNear ( &pNode->m_dChildren[i] );
  13686. }
  13687. /// tag excluded keywords (rvals to operator NOT)
  13688. static void TagExcluded ( XQNode_t * pNode, bool bNot )
  13689. {
  13690. if ( pNode->GetOp()==SPH_QUERY_ANDNOT )
  13691. {
  13692. assert ( pNode->m_dChildren.GetLength()==2 );
  13693. assert ( pNode->m_dWords.GetLength()==0 );
  13694. TagExcluded ( pNode->m_dChildren[0], bNot );
  13695. TagExcluded ( pNode->m_dChildren[1], !bNot );
  13696. } else if ( pNode->m_dChildren.GetLength() )
  13697. {
  13698. // FIXME? check if this works okay with "virtually plain" stuff?
  13699. ARRAY_FOREACH ( i, pNode->m_dChildren )
  13700. TagExcluded ( pNode->m_dChildren[i], bNot );
  13701. } else
  13702. {
  13703. // tricky bit
  13704. // no assert on length here and that is intended
  13705. // we have fully empty nodes (0 children, 0 words) sometimes!
  13706. ARRAY_FOREACH ( i, pNode->m_dWords )
  13707. pNode->m_dWords[i].m_bExcluded = bNot;
  13708. }
  13709. }
  13710. /// optimize phrase queries if we have bigrams
  13711. static void TransformBigrams ( XQNode_t * pNode, const CSphIndexSettings & tSettings )
  13712. {
  13713. assert ( tSettings.m_eBigramIndex!=SPH_BIGRAM_NONE );
  13714. assert ( tSettings.m_eBigramIndex==SPH_BIGRAM_ALL || tSettings.m_dBigramWords.GetLength() );
  13715. if ( pNode->GetOp()!=SPH_QUERY_PHRASE )
  13716. {
  13717. ARRAY_FOREACH ( i, pNode->m_dChildren )
  13718. TransformBigrams ( pNode->m_dChildren[i], tSettings );
  13719. return;
  13720. }
  13721. CSphBitvec bmRemove;
  13722. bmRemove.Init ( pNode->m_dWords.GetLength() );
  13723. for ( int i=0; i<pNode->m_dWords.GetLength()-1; i++ )
  13724. {
  13725. // check whether this pair was indexed
  13726. bool bBigram = false;
  13727. switch ( tSettings.m_eBigramIndex )
  13728. {
  13729. case SPH_BIGRAM_NONE:
  13730. break;
  13731. case SPH_BIGRAM_ALL:
  13732. bBigram = true;
  13733. break;
  13734. case SPH_BIGRAM_FIRSTFREQ:
  13735. bBigram = tSettings.m_dBigramWords.BinarySearch ( pNode->m_dWords[i].m_sWord )!=NULL;
  13736. break;
  13737. case SPH_BIGRAM_BOTHFREQ:
  13738. bBigram =
  13739. ( tSettings.m_dBigramWords.BinarySearch ( pNode->m_dWords[i].m_sWord )!=NULL ) &&
  13740. ( tSettings.m_dBigramWords.BinarySearch ( pNode->m_dWords[i+1].m_sWord )!=NULL );
  13741. break;
  13742. }
  13743. if ( !bBigram )
  13744. continue;
  13745. // replace the pair with a bigram keyword
  13746. // FIXME!!! set phrase weight for this "word" here
  13747. pNode->m_dWords[i].m_sWord.SetSprintf ( "%s%c%s",
  13748. pNode->m_dWords[i].m_sWord.cstr(),
  13749. MAGIC_WORD_BIGRAM,
  13750. pNode->m_dWords[i+1].m_sWord.cstr() );
  13751. // only mark for removal now, we will sweep later
  13752. // so that [a b c] would convert to ["a b" "b c"], not just ["a b" c]
  13753. bmRemove.BitClear ( i );
  13754. bmRemove.BitSet ( i+1 );
  13755. }
  13756. // remove marked words
  13757. int iOut = 0;
  13758. ARRAY_FOREACH ( i, pNode->m_dWords )
  13759. if ( !bmRemove.BitGet(i) )
  13760. pNode->m_dWords[iOut++] = pNode->m_dWords[i];
  13761. pNode->m_dWords.Resize ( iOut );
  13762. // fixup nodes that are not real phrases any more
  13763. if ( pNode->m_dWords.GetLength()==1 )
  13764. pNode->SetOp ( SPH_QUERY_AND );
  13765. }
  13766. /// create a node from a set of lemmas
  13767. /// WARNING, tKeyword might or might not be pointing to pNode->m_dWords[0]
  13768. static void TransformAotFilter ( XQNode_t * pNode, const XQKeyword_t & tKeyword, bool bUtf8, const CSphWordforms * pWordforms )
  13769. {
  13770. assert ( pNode->m_dWords.GetLength()<=1 );
  13771. assert ( pNode->m_dChildren.GetLength()==0 );
  13772. if ( pWordforms )
  13773. {
  13774. // do a copy, because patching in place is not an option
  13775. // short => longlonglong wordform mapping would crash
  13776. // OPTIMIZE? forms that are not found will (?) get looked up again in the dict
  13777. char sBuf [ MAX_KEYWORD_BYTES ];
  13778. strncpy ( sBuf, tKeyword.m_sWord.cstr(), sizeof(sBuf) );
  13779. if ( pWordforms->ToNormalForm ( (BYTE*)sBuf, true ) )
  13780. {
  13781. pNode->m_dWords[0].m_sWord = sBuf;
  13782. pNode->m_dWords[0].m_bMorphed = true;
  13783. return;
  13784. }
  13785. }
  13786. CSphVector<CSphString> dLemmas;
  13787. sphAotLemmatizeRu ( dLemmas, (BYTE*)tKeyword.m_sWord.cstr(), bUtf8 );
  13788. // post-morph wordforms
  13789. if ( pWordforms && pWordforms->m_bHavePostMorphNF )
  13790. {
  13791. char sBuf [ MAX_KEYWORD_BYTES ];
  13792. ARRAY_FOREACH ( i, dLemmas )
  13793. {
  13794. strncpy ( sBuf, dLemmas[i].cstr(), sizeof(sBuf) );
  13795. if ( pWordforms->ToNormalForm ( (BYTE*)sBuf, false ) )
  13796. dLemmas[i] = sBuf;
  13797. }
  13798. }
  13799. if ( dLemmas.GetLength()<=1 )
  13800. {
  13801. // zero or one lemmas, update node in-place
  13802. if ( !pNode->m_dWords.GetLength() )
  13803. pNode->m_dWords.Add ( tKeyword );
  13804. if ( dLemmas.GetLength() )
  13805. {
  13806. pNode->m_dWords[0].m_sWord = dLemmas[0];
  13807. pNode->m_dWords[0].m_bMorphed = true;
  13808. }
  13809. } else
  13810. {
  13811. // multiple lemmas, create an OR node
  13812. pNode->SetOp ( SPH_QUERY_OR );
  13813. ARRAY_FOREACH ( i, dLemmas )
  13814. {
  13815. pNode->m_dChildren.Add ( new XQNode_t ( pNode->m_dSpec ) );
  13816. pNode->m_dChildren.Last()->m_pParent = pNode;
  13817. XQKeyword_t & tLemma = pNode->m_dChildren.Last()->m_dWords.Add();
  13818. tLemma.m_sWord = dLemmas[i];
  13819. tLemma.m_iAtomPos = tKeyword.m_iAtomPos;
  13820. tLemma.m_bFieldStart = tKeyword.m_bFieldStart;
  13821. tLemma.m_bFieldEnd = tKeyword.m_bFieldEnd;
  13822. tLemma.m_bMorphed = true;
  13823. }
  13824. pNode->m_dWords.Reset();
  13825. }
  13826. }
  13827. /// AOT morph guesses transform
  13828. /// replaces tokens with their respective morph guesses subtrees
  13829. /// used in lemmatize_ru_all morphology processing mode that can generate multiple guesses
  13830. /// in other modes, there is always exactly one morph guess, and the dictionary handles it
  13831. void TransformAotFilter ( XQNode_t * pNode, bool bUtf8, const CSphWordforms * pWordforms )
  13832. {
  13833. // case one, regular operator (and empty nodes)
  13834. ARRAY_FOREACH ( i, pNode->m_dChildren )
  13835. TransformAotFilter ( pNode->m_dChildren[i], bUtf8, pWordforms );
  13836. if ( pNode->m_dChildren.GetLength() || pNode->m_dWords.GetLength()==0 )
  13837. return;
  13838. // case two, operator on a bag of words
  13839. // FIXME? check phrase vs expand_keywords vs lemmatize_ru_all?
  13840. if ( pNode->m_dWords.GetLength()
  13841. && ( pNode->GetOp()==SPH_QUERY_PHRASE || pNode->GetOp()==SPH_QUERY_PROXIMITY || pNode->GetOp()==SPH_QUERY_QUORUM ) )
  13842. {
  13843. assert ( pNode->m_dWords.GetLength() );
  13844. ARRAY_FOREACH ( i, pNode->m_dWords )
  13845. {
  13846. XQNode_t * pNew = new XQNode_t ( pNode->m_dSpec );
  13847. pNew->m_pParent = pNode;
  13848. pNew->m_iAtomPos = pNode->m_dWords[i].m_iAtomPos;
  13849. pNode->m_dChildren.Add ( pNew );
  13850. TransformAotFilter ( pNew, pNode->m_dWords[i], bUtf8, pWordforms );
  13851. }
  13852. pNode->m_dWords.Reset();
  13853. pNode->m_bVirtuallyPlain = true;
  13854. return;
  13855. }
  13856. // case three, plain old single keyword
  13857. assert ( pNode->m_dWords.GetLength()==1 );
  13858. TransformAotFilter ( pNode, pNode->m_dWords[0], bUtf8, pWordforms );
  13859. }
  13860. void sphTransformExtendedQuery ( XQNode_t ** ppNode, const CSphIndexSettings & tSettings, bool bHasBooleanOptimization, const ISphKeywordsStat * pKeywords )
  13861. {
  13862. TransformQuorum ( ppNode );
  13863. ( *ppNode )->Check ( true );
  13864. TransformNear ( ppNode );
  13865. ( *ppNode )->Check ( true );
  13866. if ( tSettings.m_eBigramIndex!=SPH_BIGRAM_NONE )
  13867. TransformBigrams ( *ppNode, tSettings );
  13868. TagExcluded ( *ppNode, false );
  13869. ( *ppNode )->Check ( true );
  13870. // boolean optimization
  13871. if ( bHasBooleanOptimization )
  13872. sphOptimizeBoolean ( ppNode, pKeywords );
  13873. }
  13874. struct CmpPSortersByRandom_fn
  13875. {
  13876. inline bool IsLess ( const ISphMatchSorter * a, const ISphMatchSorter * b ) const
  13877. {
  13878. assert ( a );
  13879. assert ( b );
  13880. return a->m_bRandomize < b->m_bRandomize;
  13881. }
  13882. };
  13883. /// one regular query vs many sorters
  13884. bool CSphIndex_VLN::MultiQuery ( const CSphQuery * pQuery, CSphQueryResult * pResult,
  13885. int iSorters, ISphMatchSorter ** ppSorters, const CSphVector<CSphFilterSettings> * pExtraFilters,
  13886. int iTag, bool bFactors ) const
  13887. {
  13888. assert ( pQuery );
  13889. CSphQueryProfile * pProfile = pResult->m_pProfile;
  13890. MEMORY ( SPH_MEM_IDX_DISK_MULTY_QUERY );
  13891. // to avoid the checking of a ppSorters's element for NULL on every next step, just filter out all nulls right here
  13892. CSphVector<ISphMatchSorter*> dSorters;
  13893. dSorters.Reserve ( iSorters );
  13894. for ( int i=0; i<iSorters; i++ )
  13895. if ( ppSorters[i] )
  13896. dSorters.Add ( ppSorters[i] );
  13897. iSorters = dSorters.GetLength();
  13898. // if we have anything to work with
  13899. if ( iSorters==0 )
  13900. return false;
  13901. // non-random at the start, random at the end
  13902. dSorters.Sort ( CmpPSortersByRandom_fn() );
  13903. // fast path for scans
  13904. if ( pQuery->m_sQuery.IsEmpty() )
  13905. return MultiScan ( pQuery, pResult, iSorters, &dSorters[0], pExtraFilters, iTag, bFactors );
  13906. if ( pProfile )
  13907. pProfile->Switch ( SPH_QSTATE_DICT_SETUP );
  13908. CSphScopedPtr<CSphDict> tDictCloned ( NULL );
  13909. CSphDict * pDictBase = m_pDict;
  13910. if ( pDictBase->HasState() )
  13911. tDictCloned = pDictBase = pDictBase->Clone();
  13912. CSphScopedPtr<CSphDict> tDict ( NULL );
  13913. CSphDict * pDict = SetupStarDict ( tDict, pDictBase );
  13914. CSphScopedPtr<CSphDict> tDict2 ( NULL );
  13915. pDict = SetupExactDict ( tDict2, pDict );
  13916. const BYTE * sModifiedQuery = (BYTE *)pQuery->m_sQuery.cstr();
  13917. if ( m_pFieldFilter )
  13918. sModifiedQuery = m_pFieldFilter->Apply ( sModifiedQuery );
  13919. // parse query
  13920. if ( pProfile )
  13921. pProfile->Switch ( SPH_QSTATE_PARSE );
  13922. XQQuery_t tParsed;
  13923. if ( !sphParseExtendedQuery ( tParsed, (const char*)sModifiedQuery, m_pQueryTokenizer, &m_tSchema, pDict, m_tSettings ) )
  13924. {
  13925. // FIXME? might wanna reset profile to unknown state
  13926. pResult->m_sError = tParsed.m_sParseError;
  13927. return false;
  13928. }
  13929. // transform query if needed (quorum transform, etc.)
  13930. if ( pProfile )
  13931. pProfile->Switch ( SPH_QSTATE_TRANSFORMS );
  13932. sphTransformExtendedQuery ( &tParsed.m_pRoot, m_tSettings, pQuery->m_bSimplify, this );
  13933. // adjust stars in keywords for dict=keywords, enable_star=0 case
  13934. if ( pDict->GetSettings().m_bWordDict && !m_bEnableStar && ( m_tSettings.m_iMinPrefixLen>0 || m_tSettings.m_iMinInfixLen>0 ) )
  13935. sphQueryAdjustStars ( tParsed.m_pRoot, m_tSettings );
  13936. if ( m_bExpandKeywords )
  13937. {
  13938. tParsed.m_pRoot = sphQueryExpandKeywords ( tParsed.m_pRoot, m_tSettings, m_bEnableStar );
  13939. tParsed.m_pRoot->Check ( true );
  13940. }
  13941. // this should be after keyword expansion
  13942. if ( m_tSettings.m_bAotFilter )
  13943. TransformAotFilter ( tParsed.m_pRoot, m_pQueryTokenizer->IsUtf8(), pDict->GetWordforms() );
  13944. // expanding prefix in word dictionary case
  13945. XQNode_t * pPrefixed = ExpandPrefix ( tParsed.m_pRoot, pResult->m_sError, pResult );
  13946. if ( !pPrefixed )
  13947. return false;
  13948. tParsed.m_pRoot = pPrefixed;
  13949. if ( !sphCheckQueryHeight ( tParsed.m_pRoot, pResult->m_sError ) )
  13950. return false;
  13951. // flag common subtrees
  13952. int iCommonSubtrees = 0;
  13953. if ( m_iMaxCachedDocs && m_iMaxCachedHits )
  13954. iCommonSubtrees = sphMarkCommonSubtrees ( 1, &tParsed );
  13955. tParsed.m_bNeedSZlist = pQuery->m_bZSlist;
  13956. CSphQueryNodeCache tNodeCache ( iCommonSubtrees, m_iMaxCachedDocs, m_iMaxCachedHits );
  13957. bool bResult = ParsedMultiQuery ( pQuery, pResult, iSorters, &dSorters[0], tParsed, pDict, pExtraFilters, &tNodeCache, iTag, bFactors );
  13958. return bResult;
  13959. }
  13960. /// many regular queries with one sorter attached to each query.
  13961. /// returns true if at least one query succeeded. The failed queries indicated with pResult->m_iMultiplier==-1
  13962. bool CSphIndex_VLN::MultiQueryEx ( int iQueries, const CSphQuery * pQueries,
  13963. CSphQueryResult ** ppResults, ISphMatchSorter ** ppSorters,
  13964. const CSphVector<CSphFilterSettings> * pExtraFilters, int iTag, bool bFactors ) const
  13965. {
  13966. // ensure we have multiple queries
  13967. if ( iQueries==1 )
  13968. return MultiQuery ( pQueries, ppResults[0], 1, ppSorters, pExtraFilters, iTag, bFactors );
  13969. MEMORY ( SPH_MEM_IDX_DISK_MULTY_QUERY_EX );
  13970. assert ( pQueries );
  13971. assert ( ppResults );
  13972. assert ( ppSorters );
  13973. CSphScopedPtr<CSphDict> tDictCloned ( NULL );
  13974. CSphDict * pDictBase = m_pDict;
  13975. if ( pDictBase->HasState() )
  13976. tDictCloned = pDictBase = pDictBase->Clone();
  13977. CSphScopedPtr<CSphDict> tDict ( NULL );
  13978. CSphDict * pDict = SetupStarDict ( tDict, pDictBase );
  13979. CSphScopedPtr<CSphDict> tDict2 ( NULL );
  13980. pDict = SetupExactDict ( tDict2, pDict );
  13981. CSphFixedVector<XQQuery_t> dXQ ( iQueries );
  13982. bool bResult = false;
  13983. bool bResultScan = false;
  13984. for ( int i=0; i<iQueries; i++ )
  13985. {
  13986. // nothing to do without a sorter
  13987. if ( !ppSorters[i] )
  13988. {
  13989. ppResults[i]->m_iMultiplier = -1; ///< show that this particular query failed
  13990. continue;
  13991. }
  13992. // fast path for scans
  13993. if ( pQueries[i].m_sQuery.IsEmpty() )
  13994. {
  13995. if ( MultiScan ( pQueries + i, ppResults[i], 1, &ppSorters[i], pExtraFilters, iTag, bFactors ) )
  13996. bResultScan = true;
  13997. else
  13998. ppResults[i]->m_iMultiplier = -1; ///< show that this particular query failed
  13999. continue;
  14000. }
  14001. ppResults[i]->m_tIOStats.Start();
  14002. // parse query
  14003. if ( sphParseExtendedQuery ( dXQ[i], pQueries[i].m_sQuery.cstr(), m_pQueryTokenizer, &m_tSchema, pDict, m_tSettings ) )
  14004. {
  14005. // transform query if needed (quorum transform, keyword expansion, etc.)
  14006. sphTransformExtendedQuery ( &dXQ[i].m_pRoot, m_tSettings, pQueries[i].m_bSimplify, this );
  14007. // adjust stars in keywords for dict=keywords, enable_star=0 case
  14008. if ( pDict->GetSettings().m_bWordDict && !m_bEnableStar && ( m_tSettings.m_iMinPrefixLen>0 || m_tSettings.m_iMinInfixLen>0 ) )
  14009. sphQueryAdjustStars ( dXQ[i].m_pRoot, m_tSettings );
  14010. if ( m_bExpandKeywords )
  14011. {
  14012. dXQ[i].m_pRoot = sphQueryExpandKeywords ( dXQ[i].m_pRoot, m_tSettings, m_bEnableStar );
  14013. dXQ[i].m_pRoot->Check ( true );
  14014. }
  14015. // this should be after keyword expansion
  14016. if ( m_tSettings.m_bAotFilter )
  14017. TransformAotFilter ( dXQ[i].m_pRoot, m_pQueryTokenizer->IsUtf8(), pDict->GetWordforms() );
  14018. // expanding prefix in word dictionary case
  14019. XQNode_t * pPrefixed = ExpandPrefix ( dXQ[i].m_pRoot, ppResults[i]->m_sError, ppResults[i] );
  14020. if ( pPrefixed )
  14021. {
  14022. dXQ[i].m_pRoot = pPrefixed;
  14023. if ( sphCheckQueryHeight ( dXQ[i].m_pRoot, ppResults[i]->m_sError ) )
  14024. {
  14025. bResult = true;
  14026. } else
  14027. {
  14028. ppResults[i]->m_iMultiplier = -1;
  14029. SafeDelete ( dXQ[i].m_pRoot );
  14030. }
  14031. } else
  14032. {
  14033. ppResults[i]->m_iMultiplier = -1;
  14034. SafeDelete ( dXQ[i].m_pRoot );
  14035. }
  14036. } else
  14037. {
  14038. ppResults[i]->m_sError = dXQ[i].m_sParseError;
  14039. ppResults[i]->m_iMultiplier = -1;
  14040. }
  14041. ppResults[i]->m_tIOStats.Stop();
  14042. }
  14043. // continue only if we have at least one non-failed
  14044. if ( bResult )
  14045. {
  14046. int iCommonSubtrees = 0;
  14047. if ( m_iMaxCachedDocs && m_iMaxCachedHits )
  14048. iCommonSubtrees = sphMarkCommonSubtrees ( iQueries, &dXQ[0] );
  14049. CSphQueryNodeCache tNodeCache ( iCommonSubtrees, m_iMaxCachedDocs, m_iMaxCachedHits );
  14050. bResult = false;
  14051. for ( int j=0; j<iQueries; j++ )
  14052. {
  14053. // fullscan case
  14054. if ( pQueries[j].m_sQuery.IsEmpty() )
  14055. continue;
  14056. ppResults[j]->m_tIOStats.Start();
  14057. if ( dXQ[j].m_pRoot && ppSorters[j]
  14058. && ParsedMultiQuery ( &pQueries[j], ppResults[j], 1, &ppSorters[j], dXQ[j], pDict, pExtraFilters, &tNodeCache, iTag, bFactors ) )
  14059. {
  14060. bResult = true;
  14061. ppResults[j]->m_iMultiplier = iCommonSubtrees ? iQueries : 1;
  14062. } else
  14063. {
  14064. ppResults[j]->m_iMultiplier = -1;
  14065. }
  14066. ppResults[j]->m_tIOStats.Stop();
  14067. }
  14068. }
  14069. return bResult | bResultScan;
  14070. }
  14071. bool CSphIndex_VLN::ParsedMultiQuery ( const CSphQuery * pQuery, CSphQueryResult * pResult,
  14072. int iSorters, ISphMatchSorter ** ppSorters, const XQQuery_t & tXQ, CSphDict * pDict,
  14073. const CSphVector<CSphFilterSettings> * pExtraFilters, CSphQueryNodeCache * pNodeCache, int iTag, bool bFactors ) const
  14074. {
  14075. assert ( pQuery );
  14076. assert ( pResult );
  14077. assert ( ppSorters );
  14078. assert ( !pQuery->m_sQuery.IsEmpty() && pQuery->m_eMode!=SPH_MATCH_FULLSCAN ); // scans must go through MultiScan()
  14079. assert ( iTag>=0 );
  14080. // start counting
  14081. int64_t tmQueryStart = sphMicroTimer();
  14082. CSphQueryProfile * pProfile = pResult->m_pProfile;
  14083. if ( pProfile)
  14084. pProfile->Switch ( SPH_QSTATE_INIT );
  14085. ///////////////////
  14086. // setup searching
  14087. ///////////////////
  14088. // non-ready index, empty response!
  14089. if ( !m_pPreread || !*m_pPreread )
  14090. {
  14091. pResult->m_sError = "index not preread";
  14092. return false;
  14093. }
  14094. // select the sorter with max schema
  14095. int iMaxSchemaSize = -1;
  14096. int iMaxSchemaIndex = -1;
  14097. for ( int i=0; i<iSorters; i++ )
  14098. if ( ppSorters[i]->GetSchema().GetRowSize() > iMaxSchemaSize )
  14099. {
  14100. iMaxSchemaSize = ppSorters[i]->GetSchema().GetRowSize();
  14101. iMaxSchemaIndex = i;
  14102. }
  14103. // setup calculations and result schema
  14104. CSphQueryContext tCtx;
  14105. tCtx.m_pProfile = pProfile;
  14106. if ( !tCtx.SetupCalc ( pResult, ppSorters[iMaxSchemaIndex]->GetSchema(), m_tSchema, GetMVAPool() ) )
  14107. return false;
  14108. // set string pool for string on_sort expression fix up
  14109. tCtx.SetStringPool ( m_pStrings.GetWritePtr() );
  14110. tCtx.m_bPackedFactors = bFactors;
  14111. // open files
  14112. CSphAutofile tDoclist, tHitlist, tWordlist, tDummy;
  14113. if ( !m_bKeepFilesOpen )
  14114. {
  14115. if ( pProfile)
  14116. pProfile->Switch ( SPH_QSTATE_OPEN );
  14117. if ( tDoclist.Open ( GetIndexFileName("spd"), SPH_O_READ, pResult->m_sError ) < 0 )
  14118. return false;
  14119. if ( tHitlist.Open ( GetIndexFileName ( m_uVersion>=3 ? "spp" : "spd" ), SPH_O_READ, pResult->m_sError ) < 0 )
  14120. return false;
  14121. if ( tWordlist.Open ( GetIndexFileName ( "spi" ), SPH_O_READ, pResult->m_sError ) < 0 )
  14122. return false;
  14123. }
  14124. if ( pProfile)
  14125. pProfile->Switch ( SPH_QSTATE_INIT );
  14126. // setup search terms
  14127. DiskIndexQwordSetup_c tTermSetup ( m_bKeepFilesOpen ? m_tDoclistFile : tDoclist,
  14128. m_bKeepFilesOpen ? m_tHitlistFile : tHitlist,
  14129. m_bPreloadWordlist ? tDummy : ( m_bKeepFilesOpen ? m_tWordlist.m_tFile : tWordlist ),
  14130. m_bPreloadWordlist ? 0 : m_tWordlist.m_iMaxChunk,
  14131. m_pSkiplists.GetWritePtr(), pProfile );
  14132. tTermSetup.m_pDict = pDict;
  14133. tTermSetup.m_pIndex = this;
  14134. tTermSetup.m_eDocinfo = m_tSettings.m_eDocinfo;
  14135. tTermSetup.m_iMinDocid = m_iMinDocid;
  14136. if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_INLINE )
  14137. {
  14138. tTermSetup.m_iInlineRowitems = m_tSchema.GetRowSize();
  14139. tTermSetup.m_pMinRow = m_dMinRow.Begin();
  14140. }
  14141. tTermSetup.m_iDynamicRowitems = pResult->m_tSchema.GetDynamicSize();
  14142. if ( pQuery->m_uMaxQueryMsec>0 )
  14143. tTermSetup.m_iMaxTimer = sphMicroTimer() + pQuery->m_uMaxQueryMsec*1000; // max_query_time
  14144. tTermSetup.m_pWarning = &pResult->m_sWarning;
  14145. tTermSetup.m_bSetupReaders = true;
  14146. tTermSetup.m_pCtx = &tCtx;
  14147. tTermSetup.m_pNodeCache = pNodeCache;
  14148. // setup prediction constrain
  14149. CSphQueryStats tQueryStats;
  14150. int64_t iNanoBudget = pQuery->m_iMaxPredictedMsec * 1000000; // from milliseconds to nanoseconds
  14151. tQueryStats.m_pNanoBudget = &iNanoBudget;
  14152. if ( pQuery->m_iMaxPredictedMsec>0 )
  14153. tTermSetup.m_pStats = &tQueryStats;
  14154. int iIndexWeight = pQuery->GetIndexWeight ( m_sIndexName.cstr() );
  14155. // bind weights
  14156. tCtx.BindWeights ( pQuery, m_tSchema, iIndexWeight );
  14157. SmallStringHash_T<CSphQueryResultMeta::WordStat_t> hPrevWordStat = pResult->m_hWordStats;
  14158. // setup query
  14159. // must happen before index-level reject, in order to build proper keyword stats
  14160. CSphScopedPtr<ISphRanker> pRanker ( sphCreateRanker ( tXQ, pQuery, pResult, tTermSetup, tCtx ) );
  14161. if ( !pRanker.Ptr() )
  14162. return false;
  14163. if ( bFactors && pQuery->m_eRanker!=SPH_RANK_EXPR )
  14164. pResult->m_sWarning.SetSprintf ( "packedfactors() requires using an expression ranker" );
  14165. sphCheckWordStats ( hPrevWordStat, pResult->m_hWordStats, m_sIndexName.cstr(), pResult->m_sWarning );
  14166. tCtx.SetupExtraData ( pRanker.Ptr() );
  14167. pRanker->ExtraData ( EXTRA_SET_MVAPOOL, (void**)m_pMva.GetWritePtr() );
  14168. pRanker->ExtraData ( EXTRA_SET_STRINGPOOL, (void**)m_pStrings.GetWritePtr() );
  14169. int iMatchPoolSize = 0;
  14170. for ( int i=0; i<iSorters; i++ )
  14171. iMatchPoolSize += ppSorters[i]->GetDataLength();
  14172. pRanker->ExtraData ( EXTRA_SET_MAXMATCHES, (void**)&iMatchPoolSize );
  14173. // empty index, empty response!
  14174. if ( m_bIsEmpty )
  14175. return true;
  14176. assert ( m_tSettings.m_eDocinfo!=SPH_DOCINFO_EXTERN || !m_pDocinfo.IsEmpty() ); // check that docinfo is preloaded
  14177. // setup filters
  14178. if ( !tCtx.CreateFilters ( pQuery->m_sQuery.IsEmpty(), &pQuery->m_dFilters, pResult->m_tSchema, GetMVAPool(), m_pStrings.GetWritePtr(), pResult->m_sError ) )
  14179. return false;
  14180. if ( !tCtx.CreateFilters ( pQuery->m_sQuery.IsEmpty(), pExtraFilters, pResult->m_tSchema, GetMVAPool(), m_pStrings.GetWritePtr(), pResult->m_sError ) )
  14181. return false;
  14182. // check if we can early reject the whole index
  14183. if ( tCtx.m_pFilter && m_iDocinfoIndex )
  14184. {
  14185. DWORD uStride = DOCINFO_IDSIZE + m_tSchema.GetRowSize();
  14186. DWORD * pMinEntry = const_cast<DWORD*> ( &m_pDocinfoIndex [ m_iDocinfoIndex*uStride*2 ] );
  14187. DWORD * pMaxEntry = pMinEntry + uStride;
  14188. if ( !tCtx.m_pFilter->EvalBlock ( pMinEntry, pMaxEntry ) )
  14189. return true;
  14190. }
  14191. // setup lookup
  14192. tCtx.m_bLookupFilter = ( m_tSettings.m_eDocinfo==SPH_DOCINFO_EXTERN ) && pQuery->m_dFilters.GetLength();
  14193. if ( tCtx.m_dCalcFilter.GetLength() || pQuery->m_eRanker==SPH_RANK_EXPR || pQuery->m_eRanker==SPH_RANK_EXPORT )
  14194. tCtx.m_bLookupFilter = true; // suboptimal in case of attr-independent expressions, but we don't care
  14195. tCtx.m_bLookupSort = false;
  14196. if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_EXTERN && !tCtx.m_bLookupFilter )
  14197. for ( int iSorter=0; iSorter<iSorters && !tCtx.m_bLookupSort; iSorter++ )
  14198. if ( ppSorters[iSorter]->UsesAttrs() )
  14199. tCtx.m_bLookupSort = true;
  14200. if ( tCtx.m_dCalcSort.GetLength() )
  14201. tCtx.m_bLookupSort = true; // suboptimal in case of attr-independent expressions, but we don't care
  14202. // setup sorters vs. MVA
  14203. for ( int i=0; i<iSorters; i++ )
  14204. {
  14205. (ppSorters[i])->SetMVAPool ( m_pMva.GetWritePtr() );
  14206. (ppSorters[i])->SetStringPool ( m_pStrings.GetWritePtr() );
  14207. }
  14208. // setup overrides
  14209. if ( !tCtx.SetupOverrides ( pQuery, pResult, m_tSchema ) )
  14210. return false;
  14211. //////////////////////////////////////
  14212. // find and weight matching documents
  14213. //////////////////////////////////////
  14214. bool bFinalLookup = !tCtx.m_bLookupFilter && !tCtx.m_bLookupSort;
  14215. bool bFinalPass = bFinalLookup || tCtx.m_dCalcFinal.GetLength();
  14216. int iMyTag = bFinalPass ? -1 : iTag;
  14217. switch ( pQuery->m_eMode )
  14218. {
  14219. case SPH_MATCH_ALL:
  14220. case SPH_MATCH_PHRASE:
  14221. case SPH_MATCH_ANY:
  14222. case SPH_MATCH_EXTENDED:
  14223. case SPH_MATCH_EXTENDED2:
  14224. case SPH_MATCH_BOOLEAN:
  14225. MatchExtended ( &tCtx, pQuery, iSorters, ppSorters, pRanker.Ptr(), iMyTag );
  14226. break;
  14227. default:
  14228. sphDie ( "INTERNAL ERROR: unknown matching mode (mode=%d)", pQuery->m_eMode );
  14229. }
  14230. ////////////////////
  14231. // cook result sets
  14232. ////////////////////
  14233. if ( pProfile)
  14234. pProfile->Switch ( SPH_QSTATE_FINALIZE );
  14235. // adjust result sets
  14236. for ( int iSorter=0; iSorter<iSorters; iSorter++ )
  14237. {
  14238. ISphMatchSorter * pTop = ppSorters[iSorter];
  14239. if ( pTop->GetLength() && bFinalPass )
  14240. {
  14241. CSphMatch * const pHead = pTop->Finalize();
  14242. const int iCount = pTop->GetLength ();
  14243. CSphMatch * const pTail = pHead + iCount;
  14244. bool bGotUDF = false;
  14245. ARRAY_FOREACH_COND ( i, tCtx.m_dCalcFinal, !bGotUDF )
  14246. tCtx.m_dCalcFinal[i].m_pExpr->Command ( SPH_EXPR_GET_UDF, &bGotUDF );
  14247. CSphVector<int> dIndexes;
  14248. if ( bGotUDF )
  14249. {
  14250. pTop->BuildFlatIndexes ( dIndexes );
  14251. bGotUDF = ( dIndexes.GetLength()!=0 );
  14252. }
  14253. if ( bGotUDF )
  14254. {
  14255. // we now promise to UDFs that final-stage calls will be evaluated
  14256. // a) over the final, pre-limit result set
  14257. // b) in the final result set order
  14258. ARRAY_FOREACH ( i, dIndexes )
  14259. {
  14260. assert ( dIndexes[i]>=0 && dIndexes[i]<iCount );
  14261. CSphMatch * pCur = pHead + dIndexes[i];
  14262. if ( pCur->m_iTag>=0 )
  14263. continue;
  14264. if ( bFinalLookup )
  14265. CopyDocinfo ( &tCtx, *pCur, FindDocinfo ( pCur->m_iDocID ) );
  14266. tCtx.CalcFinal ( *pCur );
  14267. pCur->m_iTag = iTag;
  14268. }
  14269. } else
  14270. {
  14271. // just evaluate in heap order
  14272. for ( CSphMatch * pCur=pHead; pCur<pTail; pCur++ )
  14273. if ( pCur->m_iTag<0 )
  14274. {
  14275. if ( bFinalLookup )
  14276. CopyDocinfo ( &tCtx, *pCur, FindDocinfo ( pCur->m_iDocID ) );
  14277. tCtx.CalcFinal ( *pCur );
  14278. pCur->m_iTag = iTag;
  14279. }
  14280. }
  14281. }
  14282. // mva and string pools ptrs
  14283. pResult->m_pMva = m_pMva.GetWritePtr();
  14284. pResult->m_pStrings = m_pStrings.GetWritePtr();
  14285. }
  14286. // query timer
  14287. int64_t tmWall = sphMicroTimer() - tmQueryStart;
  14288. pResult->m_iQueryTime += (int)( tmWall/1000 );
  14289. #if 0
  14290. printf ( "qtm %d, %d, %d, %d, %d\n", int(tmWall), tQueryStats.m_iFetchedDocs,
  14291. tQueryStats.m_iFetchedHits, tQueryStats.m_iSkips, ppSorters[0]->GetTotalCount() );
  14292. #endif
  14293. if ( pProfile)
  14294. pProfile->Switch ( SPH_QSTATE_UNKNOWN );
  14295. return true;
  14296. }
  14297. //////////////////////////////////////////////////////////////////////////
  14298. // INDEX STATUS
  14299. //////////////////////////////////////////////////////////////////////////
  14300. CSphIndexStatus CSphIndex_VLN::GetStatus () const
  14301. {
  14302. CSphIndexStatus tRes;
  14303. tRes.m_iRamUse = sizeof(CSphIndex_VLN)
  14304. + m_dMinRow.GetSizeBytes()
  14305. + m_dFieldLens.GetSizeBytes()
  14306. + m_pDocinfo.GetLength()
  14307. + m_pDocinfoHash.GetLength()
  14308. + m_pMva.GetLength()
  14309. + m_pStrings.GetLength()
  14310. + m_tWordlist.m_pBuf.GetLength()
  14311. + m_pKillList.GetLength()
  14312. + m_pSkiplists.GetLength()
  14313. + m_dShared.GetLength();
  14314. return tRes;
  14315. }
  14316. //////////////////////////////////////////////////////////////////////////
  14317. // INDEX CHECKING
  14318. //////////////////////////////////////////////////////////////////////////
  /// Report one check failure.
  ///
  /// Expects fp, iFails, iFailsPrinted and FAILS_THRESH to be visible where the
  /// macro is expanded. _args is a complete parenthesized fprintf() argument
  /// list, stream included, hence the double parentheses at the call sites.
  /// Every call bumps iFails; only the first FAILS_THRESH failures are printed,
  /// and a notice is printed once the threshold is hit.
  #define LOC_FAIL(_args) \
      if ( ++iFails<=FAILS_THRESH ) \
      { \
          fprintf ( fp, "FAILED, " ); \
          fprintf _args; \
          fprintf ( fp, "\n" ); \
          ++iFailsPrinted; \
          if ( iFails==FAILS_THRESH ) \
              fprintf ( fp, "(threshold reached; suppressing further output)\n" ); \
      }
  14330. int CSphIndex_VLN::DebugCheck ( FILE * fp )
  14331. {
  14332. int64_t tmCheck = sphMicroTimer();
  14333. int iFails = 0;
  14334. int iFailsPrinted = 0;
  14335. const int FAILS_THRESH = 100;
  14336. // check if index is ready
  14337. if ( m_dShared.GetNumEntries()!=SPH_SHARED_VARS_COUNT || !m_pPreread || !*m_pPreread )
  14338. LOC_FAIL(( fp, "index not preread" ));
  14339. bool bProgress = isatty ( fileno ( fp ) )!=0;
  14340. //////////////
  14341. // open files
  14342. //////////////
  14343. CSphString sError;
  14344. CSphAutoreader rdDict, rdDocs, rdHits;
  14345. if ( !rdDict.Open ( GetIndexFileName("spi"), sError ) )
  14346. LOC_FAIL(( fp, "unable to open dictionary: %s", sError.cstr() ));
  14347. if ( !rdDocs.Open ( GetIndexFileName("spd"), sError ) )
  14348. LOC_FAIL(( fp, "unable to open doclist: %s", sError.cstr() ));
  14349. if ( !rdHits.Open ( GetIndexFileName("spp"), sError ) )
  14350. LOC_FAIL(( fp, "unable to open hitlist: %s", sError.cstr() ));
  14351. CSphVector<SphWordID_t> dHitlessWords;
  14352. if ( !LoadHitlessWords ( dHitlessWords ) )
  14353. LOC_FAIL(( fp, "unable to load hitless words: %s", m_sLastError.cstr() ));
  14354. ////////////////////
  14355. // check dictionary
  14356. ////////////////////
  14357. fprintf ( fp, "checking dictionary...\n" );
  14358. SphWordID_t uWordid = 0;
  14359. int64_t iDoclistOffset = 0;
  14360. int iWordsTotal = 0;
  14361. char sWord[MAX_KEYWORD_BYTES], sLastWord[MAX_KEYWORD_BYTES];
  14362. memset ( sWord, 0, sizeof(sWord) );
  14363. memset ( sLastWord, 0, sizeof(sLastWord) );
  14364. const int iWordPerCP = m_uVersion>=21 ? SPH_WORDLIST_CHECKPOINT : 1024;
  14365. const bool bWordDict = m_pDict->GetSettings().m_bWordDict;
  14366. CSphVector<CSphWordlistCheckpoint> dCheckpoints;
  14367. if ( bWordDict && m_uVersion<21 )
  14368. LOC_FAIL(( fp, "dictionary needed index version not less then 21 (readed=%d)"
  14369. , m_uVersion ));
  14370. int iLastSkipsOffset = 0;
  14371. rdDict.SeekTo ( 1, READ_NO_SIZE_HINT );
  14372. SphOffset_t iWordsEnd = m_tWordlist.m_iWordsEnd;
  14373. bool bCheckInfixes = bWordDict && m_tWordlist.m_iInfixCodepointBytes && m_tWordlist.m_dInfixBlocks.GetLength();
  14374. bool bUtf8 = ( m_pTokenizer && m_pTokenizer->IsUtf8() );
  14375. CSphVector<int> dInfix2CP;
  14376. while ( rdDict.GetPos()!=iWordsEnd && !m_bIsEmpty )
  14377. {
  14378. // sanity checks
  14379. if ( rdDict.GetPos()>=iWordsEnd )
  14380. {
  14381. LOC_FAIL(( fp, "reading past checkpoints" ));
  14382. break;
  14383. }
  14384. // store current entry pos (for checkpointing later), read next delta
  14385. const int64_t iDictPos = rdDict.GetPos();
  14386. const SphWordID_t iDeltaWord = bWordDict ? rdDict.GetByte() : rdDict.UnzipWordid();
  14387. // checkpoint encountered, handle it
  14388. if ( !iDeltaWord )
  14389. {
  14390. rdDict.UnzipOffset();
  14391. if ( ( iWordsTotal%iWordPerCP )!=0 && rdDict.GetPos()!=iWordsEnd )
  14392. LOC_FAIL(( fp, "unexpected checkpoint (pos="INT64_FMT", word=%d, words=%d, expected=%d)",
  14393. iDictPos, iWordsTotal, ( iWordsTotal%iWordPerCP ), iWordPerCP ));
  14394. uWordid = 0;
  14395. iDoclistOffset = 0;
  14396. continue;
  14397. }
  14398. SphWordID_t uNewWordid = 0;
  14399. SphOffset_t iNewDoclistOffset = 0;
  14400. int iDocs = 0;
  14401. int iHits = 0;
  14402. if ( bWordDict )
  14403. {
  14404. // unpack next word
  14405. // must be in sync with DictEnd()!
  14406. BYTE uPack = (BYTE)iDeltaWord;
  14407. int iMatch, iDelta;
  14408. if ( uPack & 0x80 )
  14409. {
  14410. iDelta = ( ( uPack>>4 ) & 7 ) + 1;
  14411. iMatch = uPack & 15;
  14412. } else
  14413. {
  14414. iDelta = uPack & 127;
  14415. iMatch = rdDict.GetByte();
  14416. }
  14417. const int iLastWordLen = strlen(sLastWord);
  14418. if ( iMatch+iDelta>=(int)sizeof(sLastWord)-1 || iMatch>iLastWordLen )
  14419. {
  14420. LOC_FAIL(( fp, "wrong word-delta (pos="INT64_FMT", word=%s, len=%d, begin=%d, delta=%d)",
  14421. iDictPos, sLastWord, iLastWordLen, iMatch, iDelta ));
  14422. rdDict.SkipBytes ( iDelta );
  14423. } else
  14424. {
  14425. rdDict.GetBytes ( sWord + iMatch, iDelta );
  14426. sWord [ iMatch+iDelta ] = '\0';
  14427. }
  14428. iNewDoclistOffset = rdDict.UnzipOffset();
  14429. iDocs = rdDict.UnzipInt();
  14430. iHits = rdDict.UnzipInt();
  14431. int iHint = ( iDocs>=DOCLIST_HINT_THRESH ) ? rdDict.GetByte() : 0;
  14432. iHint = DoclistHintUnpack ( iDocs, (BYTE)iHint );
  14433. const int iNewWordLen = strlen(sWord);
  14434. if ( iNewWordLen==0 )
  14435. LOC_FAIL(( fp, "empty word in dictionary (pos="INT64_FMT")",
  14436. iDictPos ));
  14437. if ( iLastWordLen && iNewWordLen )
  14438. if ( sphDictCmpStrictly ( sWord, iNewWordLen, sLastWord, iLastWordLen )<=0 )
  14439. LOC_FAIL(( fp, "word order decreased (pos="INT64_FMT", word=%s, prev=%s)",
  14440. iDictPos, sLastWord, sWord ));
  14441. if ( iHint<0 )
  14442. LOC_FAIL(( fp, "invalid word hint (pos="INT64_FMT", word=%s, hint=%d)",
  14443. iDictPos, sWord, iHint ));
  14444. if ( iDocs<=0 || iHits<=0 || iHits<iDocs )
  14445. LOC_FAIL(( fp, "invalid docs/hits (pos="INT64_FMT", word=%s, docs="INT64_FMT", hits="INT64_FMT")",
  14446. (int64_t)iDictPos, sWord, (int64_t)iDocs, (int64_t)iHits ));
  14447. memcpy ( sLastWord, sWord, sizeof(sLastWord) );
  14448. } else
  14449. {
  14450. // finish reading the entire entry
  14451. uNewWordid = uWordid + iDeltaWord;
  14452. iNewDoclistOffset = iDoclistOffset + rdDict.UnzipOffset();
  14453. iDocs = rdDict.UnzipInt();
  14454. iHits = rdDict.UnzipInt();
  14455. bool bHitless = ( dHitlessWords.BinarySearch ( uNewWordid )!=NULL );
  14456. if ( bHitless )
  14457. iDocs &= 0x7fffffff;
  14458. if ( uNewWordid<=uWordid )
  14459. LOC_FAIL(( fp, "wordid decreased (pos="INT64_FMT", wordid="UINT64_FMT", previd="UINT64_FMT")",
  14460. (int64_t)iDictPos, (uint64_t)uNewWordid, (uint64_t)uWordid ));
  14461. if ( iNewDoclistOffset<=iDoclistOffset )
  14462. LOC_FAIL(( fp, "doclist offset decreased (pos="INT64_FMT", wordid="UINT64_FMT")",
  14463. (int64_t)iDictPos, (uint64_t)uNewWordid ));
  14464. if ( iDocs<=0 || iHits<=0 || iHits<iDocs )
  14465. LOC_FAIL(( fp, "invalid docs/hits (pos="INT64_FMT", wordid="UINT64_FMT", docs="INT64_FMT", hits="INT64_FMT", hitless=%s)",
  14466. (int64_t)iDictPos, (uint64_t)uNewWordid, (int64_t)iDocs, (int64_t)iHits, ( bHitless?"true":"false" ) ));
  14467. }
  14468. // skiplist
  14469. if ( m_bHaveSkips && iDocs>SPH_SKIPLIST_BLOCK )
  14470. {
  14471. int iSkipsOffset = rdDict.UnzipInt();
  14472. if ( !bWordDict && iSkipsOffset<iLastSkipsOffset )
  14473. LOC_FAIL(( fp, "descending skiplist pos (last=%d, cur=%d, wordid=%llu)",
  14474. iLastSkipsOffset, iSkipsOffset, UINT64 ( uNewWordid ) ));
  14475. iLastSkipsOffset = iSkipsOffset;
  14476. }
  14477. // update stats, add checkpoint
  14478. if ( ( iWordsTotal%iWordPerCP )==0 )
  14479. {
  14480. CSphWordlistCheckpoint & tCP = dCheckpoints.Add();
  14481. tCP.m_iWordlistOffset = iDictPos;
  14482. if ( bWordDict )
  14483. {
  14484. const int iLen = strlen ( sWord );
  14485. char * sWordChecked = new char [iLen+1];
  14486. strncpy ( sWordChecked, sWord, iLen+1 );
  14487. tCP.m_sWord = sWordChecked;
  14488. } else
  14489. tCP.m_iWordID = uNewWordid;
  14490. }
  14491. // check infixes
  14492. if ( bCheckInfixes )
  14493. {
  14494. int iWordBytes = strnlen ( sWord, sizeof(sWord) );
  14495. int iWordCodepoints = bUtf8 ? sphUTF8Len ( sWord ) : iWordBytes;
  14496. if ( iWordCodepoints>=m_tSettings.m_iMinInfixLen )
  14497. {
  14498. dInfix2CP.Resize ( 0 );
  14499. int iInfixBytes = sphGetInfixLength ( sWord, iWordBytes, m_tWordlist.m_iInfixCodepointBytes );
  14500. sphLookupInfixCheckpoints ( sWord, iInfixBytes, m_tWordlist.m_pBuf.GetWritePtr(), m_tWordlist.m_dInfixBlocks,
  14501. m_tWordlist.m_iInfixCodepointBytes, dInfix2CP );
  14502. if ( !dInfix2CP.BinarySearch ( dCheckpoints.GetLength() ) )
  14503. {
  14504. LOC_FAIL(( fp, "infix not found for word '%s' (%d), checkpoint %d, readpos="INT64_FMT,
  14505. sWord, iWordsTotal, dCheckpoints.GetLength(), (int64_t)iDictPos ));
  14506. }
  14507. }
  14508. }
  14509. uWordid = uNewWordid;
  14510. iDoclistOffset = iNewDoclistOffset;
  14511. iWordsTotal++;
  14512. }
  14513. // check the checkpoints
  14514. if ( dCheckpoints.GetLength()!=m_tWordlist.m_dCheckpoints.GetLength() )
  14515. LOC_FAIL(( fp, "checkpoint count mismatch (read=%d, calc=%d)",
  14516. m_tWordlist.m_dCheckpoints.GetLength(), dCheckpoints.GetLength() ));
  14517. for ( int i=0; i < Min ( dCheckpoints.GetLength(), m_tWordlist.m_dCheckpoints.GetLength() ); i++ )
  14518. {
  14519. const CSphWordlistCheckpoint & tRefCP = dCheckpoints[i];
  14520. const CSphWordlistCheckpoint & tCP = m_tWordlist.m_dCheckpoints[i];
  14521. const int iLen = bWordDict ? strlen ( tCP.m_sWord ) : 0;
  14522. if ( bWordDict && ( strlen ( tRefCP.m_sWord )==0 || strlen ( tCP.m_sWord )==0 ) )
  14523. {
  14524. LOC_FAIL(( fp, "empty checkpoint %d (read_word=%s, read_len=%u, readpos="INT64_FMT", calc_word=%s, calc_len=%u, calcpos="INT64_FMT")",
  14525. i, tCP.m_sWord, (DWORD)strlen ( tCP.m_sWord ), (int64_t)tCP.m_iWordlistOffset,
  14526. tRefCP.m_sWord, (DWORD)strlen ( tRefCP.m_sWord ), (int64_t)tRefCP.m_iWordlistOffset ));
  14527. } else if ( sphCheckpointCmpStrictly ( tCP.m_sWord, iLen, tCP.m_iWordID, bWordDict, tRefCP )
  14528. || tRefCP.m_iWordlistOffset!=tCP.m_iWordlistOffset )
  14529. {
  14530. if ( bWordDict )
  14531. {
  14532. LOC_FAIL(( fp, "checkpoint %d differs (read_word=%s, readpos="INT64_FMT", calc_word=%s, calcpos="INT64_FMT")",
  14533. i,
  14534. tCP.m_sWord,
  14535. (int64_t)tCP.m_iWordlistOffset,
  14536. tRefCP.m_sWord,
  14537. (int64_t)tRefCP.m_iWordlistOffset ));
  14538. } else
  14539. {
  14540. LOC_FAIL(( fp, "checkpoint %d differs (readid="UINT64_FMT", readpos="INT64_FMT", calcid="UINT64_FMT", calcpos="INT64_FMT")",
  14541. i,
  14542. (uint64_t)tCP.m_iWordID,
  14543. (int64_t)tCP.m_iWordlistOffset,
  14544. (uint64_t)tRefCP.m_iWordID,
  14545. (int64_t)tRefCP.m_iWordlistOffset ));
  14546. }
  14547. }
  14548. }
  14549. if ( bWordDict )
  14550. ARRAY_FOREACH ( i, dCheckpoints )
  14551. SafeDeleteArray ( dCheckpoints[i].m_sWord );
  14552. dCheckpoints.Reset ();
  14553. ///////////////////////
  14554. // check docs and hits
  14555. ///////////////////////
  14556. fprintf ( fp, "checking data...\n" );
  14557. int64_t iDocsSize = rdDocs.GetFilesize();
  14558. rdDict.SeekTo ( 1, READ_NO_SIZE_HINT );
  14559. rdDocs.SeekTo ( 1, READ_NO_SIZE_HINT );
  14560. rdHits.SeekTo ( 1, READ_NO_SIZE_HINT );
  14561. uWordid = 0;
  14562. iDoclistOffset = 0;
  14563. int iDictDocs, iDictHits;
  14564. bool bHitless = false;
  14565. int iWordsChecked = 0;
  14566. while ( rdDict.GetPos()<iWordsEnd )
  14567. {
  14568. const SphWordID_t iDeltaWord = bWordDict ? rdDict.GetByte() : rdDict.UnzipWordid();
  14569. if ( !iDeltaWord )
  14570. {
  14571. rdDict.UnzipOffset();
  14572. uWordid = 0;
  14573. iDoclistOffset = 0;
  14574. continue;
  14575. }
  14576. if ( bWordDict )
  14577. {
  14578. // unpack next word
  14579. // must be in sync with DictEnd()!
  14580. BYTE uPack = (BYTE)iDeltaWord;
  14581. int iMatch, iDelta;
  14582. if ( uPack & 0x80 )
  14583. {
  14584. iDelta = ( ( uPack>>4 ) & 7 ) + 1;
  14585. iMatch = uPack & 15;
  14586. } else
  14587. {
  14588. iDelta = uPack & 127;
  14589. iMatch = rdDict.GetByte();
  14590. }
  14591. const int iLastWordLen = strlen(sWord);
  14592. if ( iMatch+iDelta>=(int)sizeof(sWord)-1 || iMatch>iLastWordLen )
  14593. rdDict.SkipBytes ( iDelta );
  14594. else
  14595. {
  14596. rdDict.GetBytes ( sWord + iMatch, iDelta );
  14597. sWord [ iMatch+iDelta ] = '\0';
  14598. }
  14599. iDoclistOffset = rdDict.UnzipOffset();
  14600. iDictDocs = rdDict.UnzipInt();
  14601. iDictHits = rdDict.UnzipInt();
  14602. int iHint = ( iDictDocs>=DOCLIST_HINT_THRESH ) ? rdDict.GetByte() : 0;
  14603. DoclistHintUnpack ( iDictDocs, (BYTE)iHint );
  14604. } else
  14605. {
  14606. // finish reading the entire entry
  14607. uWordid = uWordid + iDeltaWord;
  14608. bHitless = ( dHitlessWords.BinarySearch ( uWordid )!=NULL );
  14609. iDoclistOffset = iDoclistOffset + rdDict.UnzipOffset();
  14610. iDictDocs = rdDict.UnzipInt();
  14611. if ( bHitless )
  14612. iDictDocs &= 0x7fffffff;
  14613. iDictHits = rdDict.UnzipInt();
  14614. }
  14615. // FIXME? verify skiplist content too
  14616. int iSkipsOffset = 0;
  14617. if ( m_bHaveSkips && iDictDocs>SPH_SKIPLIST_BLOCK )
  14618. iSkipsOffset = rdDict.UnzipInt();
  14619. // check whether the offset is as expected
  14620. if ( iDoclistOffset!=rdDocs.GetPos() )
  14621. {
  14622. if ( !bWordDict )
  14623. LOC_FAIL(( fp, "unexpected doclist offset (wordid="UINT64_FMT"(%s)(%d), dictpos="INT64_FMT", doclistpos="INT64_FMT")",
  14624. (uint64_t)uWordid, sWord, iWordsChecked, iDoclistOffset, (int64_t)rdDocs.GetPos() ));
  14625. if ( iDoclistOffset>=iDocsSize || iDoclistOffset<0 )
  14626. {
  14627. LOC_FAIL(( fp, "unexpected doclist offset, off the file (wordid="UINT64_FMT"(%s)(%d), dictpos="INT64_FMT", doclistsize="INT64_FMT")",
  14628. (uint64_t)uWordid, sWord, iWordsChecked, iDoclistOffset, iDocsSize ));
  14629. iWordsChecked++;
  14630. continue;
  14631. } else
  14632. rdDocs.SeekTo ( iDoclistOffset, READ_NO_SIZE_HINT );
  14633. }
  14634. // create and manually setup doclist reader
  14635. DiskIndexQwordTraits_c * pQword = NULL;
  14636. WITH_QWORD ( this, false, T, pQword = new T ( false, false ) );
  14637. pQword->m_tDoc.Reset ( m_tSchema.GetDynamicSize() );
  14638. pQword->m_iMinID = m_iMinDocid;
  14639. pQword->m_tDoc.m_iDocID = m_iMinDocid;
  14640. if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_INLINE )
  14641. {
  14642. pQword->m_iInlineAttrs = m_tSchema.GetDynamicSize();
  14643. pQword->m_pInlineFixup = m_dMinRow.Begin();
  14644. } else
  14645. {
  14646. pQword->m_iInlineAttrs = 0;
  14647. pQword->m_pInlineFixup = NULL;
  14648. }
  14649. pQword->m_iDocs = 0;
  14650. pQword->m_iHits = 0;
  14651. pQword->m_rdDoclist.SetFile ( rdDocs.GetFD(), rdDocs.GetFilename().cstr() );
  14652. pQword->m_rdDoclist.SeekTo ( rdDocs.GetPos(), READ_NO_SIZE_HINT );
  14653. pQword->m_rdHitlist.SetFile ( rdHits.GetFD(), rdHits.GetFilename().cstr() );
  14654. pQword->m_rdHitlist.SeekTo ( rdHits.GetPos(), READ_NO_SIZE_HINT );
  14655. CSphRowitem * pInlineStorage = NULL;
  14656. if ( pQword->m_iInlineAttrs )
  14657. pInlineStorage = new CSphRowitem [ pQword->m_iInlineAttrs ];
  14658. // loop the doclist
  14659. SphDocID_t uLastDocid = 0;
  14660. int iDoclistDocs = 0;
  14661. int iDoclistHits = 0;
  14662. int iHitlistHits = 0;
  14663. // FIXME!!! dict=keywords + hitless_words=some
  14664. bHitless = ( m_tSettings.m_eHitless==SPH_HITLESS_ALL ||
  14665. ( m_tSettings.m_eHitless==SPH_HITLESS_SOME && dHitlessWords.BinarySearch ( uWordid ) ) );
  14666. pQword->m_bHasHitlist = !bHitless;
  14667. CSphVector<SkiplistEntry_t> dDoclistSkips;
  14668. for ( ;; )
  14669. {
  14670. // skiplist state is saved just *before* decoding those boundary entries
  14671. if ( m_bHaveSkips && ( iDoclistDocs & ( SPH_SKIPLIST_BLOCK-1 ) )==0 )
  14672. {
  14673. SkiplistEntry_t & tBlock = dDoclistSkips.Add();
  14674. tBlock.m_iBaseDocid = pQword->m_tDoc.m_iDocID;
  14675. tBlock.m_iOffset = pQword->m_rdDoclist.GetPos();
  14676. tBlock.m_iBaseHitlistPos = pQword->m_uHitPosition;
  14677. }
  14678. // FIXME? this can fail on a broken entry (eg fieldid over 256)
  14679. const CSphMatch & tDoc = pQword->GetNextDoc ( pInlineStorage );
  14680. if ( !tDoc.m_iDocID )
  14681. break;
  14682. // checks!
  14683. if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_EXTERN )
  14684. {
  14685. const CSphRowitem * pFound = FindDocinfo ( tDoc.m_iDocID );
  14686. if ( !pFound )
  14687. LOC_FAIL(( fp, "row not found (wordid="UINT64_FMT"(%s), docid="DOCID_FMT")",
  14688. uint64_t(uWordid), sWord, tDoc.m_iDocID ));
  14689. if ( pFound )
  14690. if ( tDoc.m_iDocID!=DOCINFO2ID(pFound) )
  14691. LOC_FAIL(( fp, "row found but id mismatches (wordid="UINT64_FMT"(%s), docid="DOCID_FMT", found="DOCID_FMT")",
  14692. uint64_t(uWordid), sWord, tDoc.m_iDocID, DOCINFO2ID(pFound) ));
  14693. }
  14694. if ( tDoc.m_iDocID<=uLastDocid )
  14695. LOC_FAIL(( fp, "docid decreased (wordid="UINT64_FMT"(%s), docid="DOCID_FMT", lastid="DOCID_FMT")",
  14696. uint64_t(uWordid), sWord, tDoc.m_iDocID, uLastDocid ));
  14697. uLastDocid = tDoc.m_iDocID;
  14698. iDoclistDocs++;
  14699. iDoclistHits += pQword->m_uMatchHits;
  14700. // check position in case of regular (not-inline) hit
  14701. if (!( pQword->m_iHitlistPos>>63 ))
  14702. {
  14703. if ( !bWordDict && pQword->m_iHitlistPos!=pQword->m_rdHitlist.GetPos() )
  14704. LOC_FAIL(( fp, "unexpected hitlist offset (wordid="UINT64_FMT"(%s), docid="DOCID_FMT", expected="INT64_FMT", actual="INT64_FMT")",
  14705. (uint64_t)uWordid, sWord, pQword->m_tDoc.m_iDocID,
  14706. (int64_t)pQword->m_iHitlistPos, (int64_t)pQword->m_rdHitlist.GetPos() ));
  14707. }
  14708. // aim
  14709. pQword->SeekHitlist ( pQword->m_iHitlistPos );
  14710. // loop the hitlist
  14711. int iDocHits = 0;
  14712. CSphSmallBitvec dFieldMask;
  14713. dFieldMask.Unset();
  14714. Hitpos_t uLastHit = EMPTY_HIT;
  14715. while ( !bHitless )
  14716. {
  14717. Hitpos_t uHit = pQword->GetNextHit();
  14718. if ( uHit==EMPTY_HIT )
  14719. break;
  14720. if (!( uLastHit<uHit ))
  14721. LOC_FAIL(( fp, "hit decreased (wordid="UINT64_FMT"(%s), docid="DOCID_FMT", hit=%u, last=%u)",
  14722. (uint64_t)uWordid, sWord, pQword->m_tDoc.m_iDocID, uHit, uLastHit ));
  14723. uLastHit = uHit;
  14724. int iField = HITMAN::GetField ( uHit );
  14725. if ( iField<0 || iField>=SPH_MAX_FIELDS )
  14726. {
  14727. LOC_FAIL(( fp, "hit field out of bounds (wordid="UINT64_FMT"(%s), docid="DOCID_FMT", field=%d)",
  14728. (uint64_t)uWordid, sWord, pQword->m_tDoc.m_iDocID, iField ));
  14729. } else if ( iField>=m_tSchema.m_dFields.GetLength() )
  14730. {
  14731. LOC_FAIL(( fp, "hit field out of schema (wordid="UINT64_FMT"(%s), docid="DOCID_FMT", field=%d)",
  14732. (uint64_t)uWordid, sWord, pQword->m_tDoc.m_iDocID, iField ));
  14733. }
  14734. dFieldMask.Set(iField);
  14735. iDocHits++; // to check doclist entry
  14736. iHitlistHits++; // to check dictionary entry
  14737. }
  14738. // check hit count
  14739. if ( iDocHits!=(int)pQword->m_uMatchHits && !bHitless )
  14740. LOC_FAIL(( fp, "doc hit count mismatch (wordid="UINT64_FMT"(%s), docid="DOCID_FMT", doclist=%d, hitlist=%d)",
  14741. (uint64_t)uWordid, sWord, pQword->m_tDoc.m_iDocID, pQword->m_uMatchHits, iDocHits ));
  14742. // check the mask
  14743. if ( dFieldMask!=pQword->m_dQwordFields && !bHitless )
  14744. LOC_FAIL(( fp, "field mask mismatch (wordid="UINT64_FMT"(%s), docid="DOCID_FMT")",
  14745. (uint64_t)uWordid, sWord, pQword->m_tDoc.m_iDocID ));
  14746. // update my hitlist reader
  14747. rdHits.SeekTo ( pQword->m_rdHitlist.GetPos(), READ_NO_SIZE_HINT );
  14748. }
  14749. // do checks
  14750. if ( iDictDocs!=iDoclistDocs )
  14751. LOC_FAIL(( fp, "doc count mismatch (wordid="UINT64_FMT"(%s), dict=%d, doclist=%d, hitless=%s)",
  14752. uint64_t(uWordid), sWord, iDictDocs, iDoclistDocs, ( bHitless?"true":false ) ));
  14753. if ( ( iDictHits!=iDoclistHits || iDictHits!=iHitlistHits ) && !bHitless )
  14754. LOC_FAIL(( fp, "hit count mismatch (wordid="UINT64_FMT"(%s), dict=%d, doclist=%d, hitlist=%d)",
  14755. uint64_t(uWordid), sWord, iDictHits, iDoclistHits, iHitlistHits ));
  14756. while ( m_bHaveSkips && iDoclistDocs>SPH_SKIPLIST_BLOCK )
  14757. {
  14758. if ( iSkipsOffset<=0 || iSkipsOffset>(int)m_pSkiplists.GetLength() )
  14759. {
  14760. LOC_FAIL(( fp, "invalid skiplist offset (wordid=%llu(%s), off=%d, max=%d)",
  14761. UINT64 ( uWordid ), sWord, iSkipsOffset, (int)m_pSkiplists.GetLength() ));
  14762. break;
  14763. }
  14764. // boundary adjustment
  14765. if ( ( iDoclistDocs & ( SPH_SKIPLIST_BLOCK-1 ) )==0 )
  14766. dDoclistSkips.Pop();
  14767. SkiplistEntry_t t;
  14768. t.m_iBaseDocid = m_iMinDocid;
  14769. t.m_iOffset = iDoclistOffset;
  14770. t.m_iBaseHitlistPos = 0;
  14771. const BYTE * pSkip = m_pSkiplists.GetWritePtr() + iSkipsOffset;
  14772. const BYTE * pMax = m_pSkiplists.GetWritePtr() + m_pSkiplists.GetLength();
  14773. int i = 0;
  14774. while ( pSkip<pMax && ++i<dDoclistSkips.GetLength() )
  14775. {
  14776. const SkiplistEntry_t & r = dDoclistSkips[i];
  14777. t.m_iBaseDocid += SPH_SKIPLIST_BLOCK + (SphDocID_t) sphUnzipOffset ( pSkip );
  14778. t.m_iOffset += 4*SPH_SKIPLIST_BLOCK + sphUnzipOffset ( pSkip );
  14779. t.m_iBaseHitlistPos += sphUnzipOffset ( pSkip );
  14780. if ( t.m_iBaseDocid!=r.m_iBaseDocid
  14781. || t.m_iOffset!=r.m_iOffset ||
  14782. t.m_iBaseHitlistPos!=r.m_iBaseHitlistPos )
  14783. {
  14784. LOC_FAIL(( fp, "skiplist entry %d mismatch (wordid=%llu(%s), exp={%llu, %llu, %llu}, got={%llu, %llu, %llu})",
  14785. i, UINT64 ( uWordid ), sWord,
  14786. UINT64 ( r.m_iBaseDocid ), UINT64 ( r.m_iOffset ), UINT64 ( r.m_iBaseHitlistPos ),
  14787. UINT64 ( t.m_iBaseDocid ), UINT64 ( t.m_iOffset ), UINT64 ( t.m_iBaseHitlistPos ) ));
  14788. break;
  14789. }
  14790. if ( pSkip>pMax )
  14791. LOC_FAIL(( fp, "skiplist length mismatch (wordid=%llu(%s), exp=%d, got=%d)",
  14792. UINT64 ( uWordid ), sWord, i, dDoclistSkips.GetLength() ));
  14793. }
  14794. break;
  14795. }
  14796. // move my reader instance forward too
  14797. rdDocs.SeekTo ( pQword->m_rdDoclist.GetPos(), READ_NO_SIZE_HINT );
  14798. // cleanup
  14799. SafeDelete ( pInlineStorage );
  14800. SafeDelete ( pQword );
  14801. // progress bar
  14802. if ( (++iWordsChecked)%1000==0 && bProgress )
  14803. {
  14804. fprintf ( fp, "%d/%d\r", iWordsChecked, iWordsTotal );
  14805. fflush ( fp );
  14806. }
  14807. }
  14808. ///////////////////////////
  14809. // check rows (attributes)
  14810. ///////////////////////////
  14811. if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_EXTERN && !m_pDocinfo.IsEmpty() )
  14812. {
  14813. fprintf ( fp, "checking rows...\n" );
  14814. // sizes and counts
  14815. int64_t iRowsTotal = m_iDocinfo;
  14816. DWORD uStride = DOCINFO_IDSIZE + m_tSchema.GetRowSize();
  14817. int64_t iAllRowsTotal = iRowsTotal;
  14818. iAllRowsTotal += (m_iDocinfoIndex+1)*2; // should had been fixed up to v.20 by the loader
  14819. if ( iAllRowsTotal*uStride!=(int64_t)m_pDocinfo.GetNumEntries() )
  14820. LOC_FAIL(( fp, "rowitems count mismatch (expected="INT64_FMT", loaded="INT64_FMT")",
  14821. iAllRowsTotal*uStride, (int64_t)m_pDocinfo.GetNumEntries() ));
  14822. // extract rowitem indexes for MVAs etc
  14823. // (ie. attr types that we can and will run additional checks on)
  14824. CSphVector<int> dMvaItems;
  14825. CSphVector<CSphAttrLocator> dFloatItems;
  14826. CSphVector<CSphAttrLocator> dStrItems;
  14827. for ( int i=0; i<m_tSchema.GetAttrsCount(); i++ )
  14828. {
  14829. const CSphColumnInfo & tAttr = m_tSchema.GetAttr(i);
  14830. if ( tAttr.m_eAttrType==SPH_ATTR_UINT32SET || tAttr.m_eAttrType==SPH_ATTR_INT64SET )
  14831. {
  14832. if ( tAttr.m_tLocator.m_iBitCount!=ROWITEM_BITS )
  14833. {
  14834. LOC_FAIL(( fp, "unexpected MVA bitcount (attr=%d, expected=%d, got=%d)",
  14835. i, ROWITEM_BITS, tAttr.m_tLocator.m_iBitCount ));
  14836. continue;
  14837. }
  14838. if ( ( tAttr.m_tLocator.m_iBitOffset % ROWITEM_BITS )!=0 )
  14839. {
  14840. LOC_FAIL(( fp, "unaligned MVA bitoffset (attr=%d, bitoffset=%d)",
  14841. i, tAttr.m_tLocator.m_iBitOffset ));
  14842. continue;
  14843. }
  14844. if ( tAttr.m_eAttrType==SPH_ATTR_UINT32SET )
  14845. dMvaItems.Add ( tAttr.m_tLocator.m_iBitOffset/ROWITEM_BITS );
  14846. } else if ( tAttr.m_eAttrType==SPH_ATTR_FLOAT )
  14847. dFloatItems.Add ( tAttr.m_tLocator );
  14848. else if ( tAttr.m_eAttrType==SPH_ATTR_STRING || tAttr.m_eAttrType==SPH_ATTR_JSON )
  14849. dStrItems.Add ( tAttr.m_tLocator );
  14850. }
  14851. int iMva64 = dMvaItems.GetLength();
  14852. for ( int i=0; i<m_tSchema.GetAttrsCount(); i++ )
  14853. {
  14854. const CSphColumnInfo & tAttr = m_tSchema.GetAttr(i);
  14855. if ( tAttr.m_eAttrType==SPH_ATTR_INT64SET )
  14856. dMvaItems.Add ( tAttr.m_tLocator.m_iBitOffset/ROWITEM_BITS );
  14857. }
  14858. // walk string data, build a list of acceptable start offsets
  14859. // must be sorted by construction
  14860. CSphVector<DWORD> dStringOffsets;
  14861. if ( m_pStrings.GetNumEntries()>1 )
  14862. {
  14863. const BYTE * pBase = m_pStrings.GetWritePtr();
  14864. const BYTE * pCur = pBase + 1;
  14865. const BYTE * pMax = pBase + m_pStrings.GetNumEntries();
  14866. while ( pCur<pMax )
  14867. {
  14868. const BYTE * pStr = NULL;
  14869. const int iLen = sphUnpackStr ( pCur, &pStr );
  14870. // 4 bytes must be enough to encode string length, hence pCur+4
  14871. if ( pStr+iLen>pMax || pStr<pCur || pStr>pCur+4 )
  14872. {
  14873. LOC_FAIL(( fp, "string length out of bounds (offset=%u, len=%d)", (DWORD)(pCur-pBase), iLen ));
  14874. break;
  14875. }
  14876. dStringOffsets.Add ( (DWORD)(pCur-pBase) );
  14877. pCur = pStr + iLen;
  14878. }
  14879. }
  14880. // loop the rows
  14881. const CSphRowitem * pRow = m_pDocinfo.GetWritePtr();
  14882. const DWORD * pMvaBase = m_pMva.GetWritePtr();
  14883. const DWORD * pMvaMax = pMvaBase + m_pMva.GetNumEntries();
  14884. const DWORD * pMva = pMvaBase;
  14885. int iOrphan = 0;
  14886. SphDocID_t uLastID = 0;
  14887. for ( int64_t iRow=0; iRow<iRowsTotal; iRow++, pRow+=uStride )
  14888. {
  14889. // check that ids are ascending
  14890. bool bIsSpaValid = uLastID < DOCINFO2ID(pRow);
  14891. if ( !bIsSpaValid )
  14892. LOC_FAIL(( fp, "docid decreased (row="INT64_FMT", id="DOCID_FMT", lastid="DOCID_FMT")",
  14893. iRow, DOCINFO2ID(pRow), uLastID ));
  14894. uLastID = DOCINFO2ID(pRow);
  14895. ///////////////////////////
  14896. // check MVAs
  14897. ///////////////////////////
  14898. if ( dMvaItems.GetLength() )
  14899. {
  14900. const DWORD * pMvaSpaFixed = NULL;
  14901. const CSphRowitem * pAttrs = DOCINFO2ATTRS(pRow);
  14902. bool bHasValues = false;
  14903. ARRAY_FOREACH ( iItem, dMvaItems )
  14904. {
  14905. const DWORD uOffset = pAttrs[dMvaItems[iItem]];
  14906. bHasValues |= uOffset!=0;
  14907. if ( uOffset && pMvaBase+uOffset>=pMvaMax )
  14908. {
  14909. bIsSpaValid = false;
  14910. LOC_FAIL(( fp, "MVA index out of bounds (row="INT64_FMT", mvaattr=%d, docid="DOCID_FMT", index=%u)",
  14911. iRow, iItem, uLastID, uOffset ));
  14912. }
  14913. if ( uOffset && pMvaBase+uOffset<pMvaMax && !pMvaSpaFixed )
  14914. pMvaSpaFixed = pMvaBase + uOffset - sizeof(SphDocID_t) / sizeof(DWORD);
  14915. }
  14916. // MVAs ptr recovery from previous errors only if current spa record is valid
  14917. if ( pMva!=pMvaSpaFixed && bIsSpaValid && pMvaSpaFixed )
  14918. pMva = pMvaSpaFixed;
  14919. bool bLastIDChecked = false;
  14920. SphDocID_t uLastMvaID = 0;
  14921. while ( pMva<pMvaMax && DOCINFO2ID(pMva)<=uLastID )
  14922. {
  14923. const SphDocID_t uMvaID = DOCINFO2ID(pMva);
  14924. pMva = DOCINFO2ATTRS(pMva);
  14925. if ( bLastIDChecked && uLastID==uMvaID )
  14926. LOC_FAIL(( fp, "duplicate docid found (row="INT64_FMT", docid expected="DOCID_FMT", got="DOCID_FMT", index=%u)",
  14927. iRow, uLastID, uMvaID, (DWORD)(pMva-pMvaBase) ));
  14928. if ( uMvaID<uLastMvaID )
  14929. LOC_FAIL(( fp, "MVA docid decreased (row="INT64_FMT", spa docid="DOCID_FMT", last MVA docid="DOCID_FMT", MVA docid="DOCID_FMT", index=%u)",
  14930. iRow, uLastID, uLastMvaID, uMvaID, (DWORD)(pMva-pMvaBase) ));
  14931. bool bIsMvaCorrect = uLastMvaID<=uMvaID && uMvaID<=uLastID;
  14932. uLastMvaID = uMvaID;
  14933. // loop MVAs
  14934. ARRAY_FOREACH_COND ( iItem, dMvaItems, bIsMvaCorrect )
  14935. {
  14936. const DWORD uSpaOffset = pAttrs[dMvaItems[iItem]];
  14937. // check offset (index)
  14938. if ( uMvaID==uLastID && uSpaOffset && bIsSpaValid && pMva!=pMvaBase+uSpaOffset )
  14939. {
  14940. LOC_FAIL(( fp, "unexpected MVA docid (row="INT64_FMT", mvaattr=%d, docid expected="DOCID_FMT", got="DOCID_FMT", expected=%u, got=%u)",
  14941. iRow, iItem, uLastID, uMvaID, (DWORD)(pMva-pMvaBase), uSpaOffset ));
  14942. // it's unexpected but it's our best guess
  14943. // but do fix up only once, to prevent infinite loop
  14944. if ( !bLastIDChecked )
  14945. pMva = pMvaBase+uSpaOffset;
  14946. }
  14947. if ( pMva>=pMvaMax )
  14948. {
  14949. LOC_FAIL(( fp, "MVA index out of bounds (row="INT64_FMT", mvaattr=%d, docid expected="DOCID_FMT", got="DOCID_FMT", index=%u)",
  14950. iRow, iItem, uLastID, uMvaID, (DWORD)(pMva-pMvaBase) ));
  14951. bIsMvaCorrect = false;
  14952. continue;
  14953. }
  14954. // check values
  14955. DWORD uValues = *pMva++;
  14956. if ( pMva+uValues-1>=pMvaMax )
  14957. {
  14958. LOC_FAIL(( fp, "MVA count out of bounds (row="INT64_FMT", mvaattr=%d, docid expected="DOCID_FMT", got="DOCID_FMT", count=%u)",
  14959. iRow, iItem, uLastID, uMvaID, uValues ));
  14960. pMva += uValues;
  14961. bIsMvaCorrect = false;
  14962. continue;
  14963. }
  14964. // check that values are ascending
  14965. for ( DWORD uVal=(iItem>=iMva64 ? 2 : 1); uVal<uValues && bIsMvaCorrect; )
  14966. {
  14967. int64_t iPrev, iCur;
  14968. if ( iItem>=iMva64 )
  14969. {
  14970. iPrev = MVA_UPSIZE ( pMva+uVal-2 );
  14971. iCur = MVA_UPSIZE ( pMva+uVal );
  14972. uVal += 2;
  14973. } else
  14974. {
  14975. iPrev = pMva[uVal-1];
  14976. iCur = pMva[uVal];
  14977. uVal++;
  14978. }
  14979. if ( iCur<=iPrev )
  14980. {
  14981. LOC_FAIL(( fp, "unsorted MVA values (row="INT64_FMT", mvaattr=%d, docid expected="DOCID_FMT", got="DOCID_FMT", val[%u]=%u, val[%u]=%u)",
  14982. iRow, iItem, uLastID, uMvaID, ( iItem>=iMva64 ? uVal-2 : uVal-1 ), (unsigned int)iPrev, uVal, (unsigned int)iCur ));
  14983. bIsMvaCorrect = false;
  14984. }
  14985. uVal += ( iItem>=iMva64 ? 2 : 1 );
  14986. }
  14987. pMva += uValues;
  14988. }
  14989. if ( !bIsMvaCorrect )
  14990. break;
  14991. // count an orphan only if there were no errors && ( ids did not match || ids matched multiple times )
  14992. if ( bIsMvaCorrect && ( uMvaID!=uLastID || ( uMvaID==uLastID && bLastIDChecked ) ) )
  14993. iOrphan++;
  14994. bLastIDChecked |= uLastID==uMvaID;
  14995. }
  14996. if ( !bLastIDChecked && bHasValues )
  14997. LOC_FAIL(( fp, "missed or damaged MVA (row="INT64_FMT", docid expected="DOCID_FMT")",
  14998. iRow, uLastID ));
  14999. }
  15000. ///////////////////////////
  15001. // check floats
  15002. ///////////////////////////
  15003. ARRAY_FOREACH ( iItem, dFloatItems )
  15004. {
  15005. const CSphRowitem * pAttrs = DOCINFO2ATTRS(pRow);
  15006. const DWORD uValue = (DWORD)sphGetRowAttr ( pAttrs, dFloatItems[ iItem ] );
  15007. const DWORD uExp = ( uValue >> 23 ) & 0xff;
  15008. const DWORD uMantissa = uValue & 0x003fffff;
  15009. // check normalized
  15010. if ( uExp==0 && uMantissa!=0 )
  15011. LOC_FAIL(( fp, "float attribute value is unnormalized (row="INT64_FMT", attr=%d, id="DOCID_FMT", raw=0x%x, value=%f)",
  15012. iRow, iItem, uLastID, uValue, sphDW2F ( uValue ) ));
  15013. // check +-inf
  15014. if ( uExp==0xff && uMantissa==0 )
  15015. LOC_FAIL(( fp, "float attribute is infinity (row="INT64_FMT", attr=%d, id="DOCID_FMT", raw=0x%x, value=%f)",
  15016. iRow, iItem, uLastID, uValue, sphDW2F ( uValue ) ));
  15017. }
  15018. /////////////////
  15019. // check strings
  15020. /////////////////
  15021. ARRAY_FOREACH ( iItem, dStrItems )
  15022. {
  15023. const CSphRowitem * pAttrs = DOCINFO2ATTRS(pRow);
  15024. const DWORD uOffset = (DWORD)sphGetRowAttr ( pAttrs, dStrItems[ iItem ] );
  15025. if ( uOffset>=m_pStrings.GetNumEntries() )
  15026. {
  15027. LOC_FAIL(( fp, "string offset out of bounds (row="INT64_FMT", stringattr=%d, docid="DOCID_FMT", index=%u)",
  15028. iRow, iItem, uLastID, uOffset ));
  15029. continue;
  15030. }
  15031. if ( !uOffset )
  15032. continue;
  15033. const BYTE * pStr = NULL;
  15034. const int iLen = sphUnpackStr ( m_pStrings.GetWritePtr() + uOffset, &pStr );
  15035. // check that length is sane
  15036. if ( pStr+iLen-1>=m_pStrings.GetWritePtr()+m_pStrings.GetLength() )
  15037. {
  15038. LOC_FAIL(( fp, "string length out of bounds (row="INT64_FMT", stringattr=%d, docid="DOCID_FMT", index=%u)",
  15039. iRow, iItem, uLastID, (unsigned int)( pStr-m_pStrings.GetWritePtr()+iLen-1 ) ));
  15040. continue;
  15041. }
  15042. // check that offset is one of the good ones
  15043. // (that is, that we don't point in the middle of some other data)
  15044. if ( !dStringOffsets.BinarySearch ( uOffset ) )
  15045. {
  15046. LOC_FAIL(( fp, "string offset is not a string start (row="INT64_FMT", stringattr=%d, docid="DOCID_FMT", offset=%u)",
  15047. iRow, iItem, uLastID, uOffset ));
  15048. }
  15049. }
  15050. // progress bar
  15051. if ( iRow%1000==0 && bProgress )
  15052. {
  15053. fprintf ( fp, INT64_FMT"/"INT64_FMT"\r", iRow, iRowsTotal );
  15054. fflush ( fp );
  15055. }
  15056. }
  15057. if ( iOrphan )
  15058. fprintf ( fp, "WARNING: %d orphaned MVA entries were found\n", iOrphan );
  15059. ///////////////////////////
  15060. // check blocks index
  15061. ///////////////////////////
  15062. fprintf ( fp, "checking attribute blocks index...\n" );
  15063. // check size
  15064. const int64_t iTempDocinfoIndex = ( m_iDocinfo+DOCINFO_INDEX_FREQ-1 ) / DOCINFO_INDEX_FREQ;
  15065. if ( iTempDocinfoIndex!=m_iDocinfoIndex )
  15066. LOC_FAIL(( fp, "block count differs (expected="INT64_FMT", got="INT64_FMT")",
  15067. iTempDocinfoIndex, m_iDocinfoIndex ));
  15068. const DWORD uMinMaxStride = DOCINFO_IDSIZE + m_tSchema.GetRowSize();
  15069. const DWORD * pDocinfoIndexMax = m_pDocinfoIndex + ( m_iDocinfoIndex+1 )*uMinMaxStride*2;
  15070. for ( int64_t iIndexEntry=0; iIndexEntry<m_iDocinfo; iIndexEntry++ )
  15071. {
  15072. const int64_t iBlock = iIndexEntry / DOCINFO_INDEX_FREQ;
  15073. // we have to do some checks in border cases, for example: when moving from the 1st to the 2nd block
  15074. const int64_t iPrevEntryBlock = ( iIndexEntry-1 )/DOCINFO_INDEX_FREQ;
  15075. const bool bIsBordersCheckTime = ( iPrevEntryBlock!=iBlock );
  15076. const DWORD * pAttr = m_pDocinfo.GetWritePtr() + iIndexEntry * uMinMaxStride;
  15077. const SphDocID_t uDocID = DOCINFO2ID(pAttr);
  15078. const DWORD * pMinEntry = m_pDocinfoIndex + iBlock * uMinMaxStride * 2;
  15079. const DWORD * pMaxEntry = pMinEntry + uMinMaxStride;
  15080. const DWORD * pMinAttrs = DOCINFO2ATTRS ( pMinEntry );
  15081. const DWORD * pMaxAttrs = pMinAttrs + uMinMaxStride;
  15082. // check docid vs global range
  15083. if ( pMaxEntry+uMinMaxStride > pDocinfoIndexMax )
  15084. LOC_FAIL(( fp, "unexpected block index end (row="INT64_FMT", docid="DOCID_FMT", block="INT64_FMT", max="INT64_FMT", cur="INT64_FMT")",
  15085. iIndexEntry, uDocID, iBlock, int64_t ( pDocinfoIndexMax-m_pDocinfoIndex ), int64_t ( pMaxEntry+uMinMaxStride-m_pDocinfoIndex ) ));
  15086. // check attribute location vs global range
  15087. if ( pMaxAttrs+uMinMaxStride > pDocinfoIndexMax )
  15088. LOC_FAIL(( fp, "attribute position out of blocks index (row="INT64_FMT", docid="DOCID_FMT", block="INT64_FMT", expected<"INT64_FMT", got="INT64_FMT")",
  15089. iIndexEntry, uDocID, iBlock, int64_t ( pDocinfoIndexMax-m_pDocinfoIndex ), int64_t ( pMaxAttrs+uMinMaxStride-m_pDocinfoIndex ) ));
  15090. const SphDocID_t uMinDocID = DOCINFO2ID ( pMinEntry );
  15091. const SphDocID_t uMaxDocID = DOCINFO2ID ( pMaxEntry );
  15092. // checks is docid min max range valid
  15093. if ( uMinDocID > uMaxDocID && bIsBordersCheckTime )
  15094. LOC_FAIL(( fp, "invalid docid range (row="INT64_FMT", block="INT64_FMT", min="DOCID_FMT", max="DOCID_FMT")",
  15095. iIndexEntry, iBlock, uMinDocID, uMaxDocID ));
  15096. // checks docid vs blocks range
  15097. if ( uDocID < uMinDocID || uDocID > uMaxDocID )
  15098. LOC_FAIL(( fp, "unexpected docid range (row="INT64_FMT", docid="DOCID_FMT", block="INT64_FMT", min="DOCID_FMT", max="DOCID_FMT")",
  15099. iIndexEntry, uDocID, iBlock, uMinDocID, uMaxDocID ));
  15100. bool bIsFirstMva = true;
  15101. // check values vs blocks range
  15102. const DWORD * pSpaRow = DOCINFO2ATTRS(pAttr);
  15103. for ( int iItem=0; iItem<m_tSchema.GetAttrsCount(); iItem++ )
  15104. {
  15105. const CSphColumnInfo & tCol = m_tSchema.GetAttr(iItem);
  15106. switch ( tCol.m_eAttrType )
  15107. {
  15108. case SPH_ATTR_INTEGER:
  15109. case SPH_ATTR_TIMESTAMP:
  15110. case SPH_ATTR_BOOL:
  15111. case SPH_ATTR_BIGINT:
  15112. {
  15113. const SphAttr_t uVal = sphGetRowAttr ( pSpaRow, tCol.m_tLocator );
  15114. const SphAttr_t uMin = sphGetRowAttr ( pMinAttrs, tCol.m_tLocator );
  15115. const SphAttr_t uMax = sphGetRowAttr ( pMaxAttrs, tCol.m_tLocator );
  15116. // checks is attribute min max range valid
  15117. if ( uMin > uMax && bIsBordersCheckTime )
  15118. LOC_FAIL(( fp, "invalid attribute range (row="INT64_FMT", block="INT64_FMT", min="INT64_FMT", max="INT64_FMT")",
  15119. iIndexEntry, iBlock, uMin, uMax ));
  15120. if ( uVal < uMin || uVal > uMax )
  15121. LOC_FAIL(( fp, "unexpected attribute value (row="INT64_FMT", attr=%u, docid="DOCID_FMT", block="INT64_FMT", value=0x"UINT64_FMT", min=0x"UINT64_FMT", max=0x"UINT64_FMT")",
  15122. iIndexEntry, iItem, uDocID, iBlock, uint64_t(uVal), uint64_t(uMin), uint64_t(uMax) ));
  15123. }
  15124. break;
  15125. case SPH_ATTR_FLOAT:
  15126. {
  15127. const float fVal = sphDW2F ( (DWORD)sphGetRowAttr ( pSpaRow, tCol.m_tLocator ) );
  15128. const float fMin = sphDW2F ( (DWORD)sphGetRowAttr ( pMinAttrs, tCol.m_tLocator ) );
  15129. const float fMax = sphDW2F ( (DWORD)sphGetRowAttr ( pMaxAttrs, tCol.m_tLocator ) );
  15130. // checks is attribute min max range valid
  15131. if ( fMin > fMax && bIsBordersCheckTime )
  15132. LOC_FAIL(( fp, "invalid attribute range (row="INT64_FMT", block="INT64_FMT", min=%f, max=%f)",
  15133. iIndexEntry, iBlock, fMin, fMax ));
  15134. if ( fVal < fMin || fVal > fMax )
  15135. LOC_FAIL(( fp, "unexpected attribute value (row="INT64_FMT", attr=%u, docid="DOCID_FMT", block="INT64_FMT", value=%f, min=%f, max=%f)",
  15136. iIndexEntry, iItem, uDocID, iBlock, fVal, fMin, fMax ));
  15137. }
  15138. break;
  15139. case SPH_ATTR_UINT32SET:
  15140. {
  15141. const DWORD uMin = (DWORD)sphGetRowAttr ( pMinAttrs, tCol.m_tLocator );
  15142. const DWORD uMax = (DWORD)sphGetRowAttr ( pMaxAttrs, tCol.m_tLocator );
  15143. // checks is MVA attribute min max range valid
  15144. if ( uMin > uMax && bIsBordersCheckTime && uMin!=0xffffffff && uMax!=0 )
  15145. LOC_FAIL(( fp, "invalid MVA range (row="INT64_FMT", block="INT64_FMT", min=0x%x, max=0x%x)",
  15146. iIndexEntry, iBlock, uMin, uMax ));
  15147. SphAttr_t uOff = sphGetRowAttr ( pSpaRow, tCol.m_tLocator );
  15148. if ( !uOff )
  15149. break;
  15150. const DWORD * pMva = m_pMva.GetWritePtr() + uOff;
  15151. const DWORD * pMvaDocID = bIsFirstMva ? ( pMva - sizeof(SphDocID_t) / sizeof(DWORD) ) : NULL;
  15152. bIsFirstMva = false;
  15153. if ( uOff>=(SphAttr_t)m_pMva.GetNumEntries() )
  15154. break;
  15155. if ( pMvaDocID && DOCINFO2ID ( pMvaDocID )!=uDocID )
  15156. {
  15157. LOC_FAIL(( fp, "unexpected MVA docid (row="INT64_FMT", mvaattr=%d, expected="DOCID_FMT", got="DOCID_FMT", block="INT64_FMT", index=%u)",
  15158. iIndexEntry, iItem, uDocID, DOCINFO2ID ( pMvaDocID ), iBlock, (DWORD)uOff ));
  15159. break;
  15160. }
  15161. // check values
  15162. const DWORD uValues = *pMva++;
  15163. if ( uOff+uValues>(SphAttr_t)m_pMva.GetNumEntries() )
  15164. break;
  15165. for ( DWORD iVal=0; iVal<uValues; iVal++ )
  15166. {
  15167. const DWORD uVal = *pMva++;
  15168. if ( uVal < uMin || uVal > uMax )
  15169. LOC_FAIL(( fp, "unexpected MVA value (row="INT64_FMT", attr=%u, docid="DOCID_FMT", block="INT64_FMT", index=%u, value=0x%x, min=0x%x, max=0x%x)",
  15170. iIndexEntry, iItem, uDocID, iBlock, iVal, (DWORD)uVal, (DWORD)uMin, (DWORD)uMax ));
  15171. }
  15172. }
  15173. break;
  15174. default:
  15175. break;
  15176. }
  15177. }
  15178. // progress bar
  15179. if ( iIndexEntry%1000==0 && bProgress )
  15180. {
  15181. fprintf ( fp, INT64_FMT"/"INT64_FMT"\r", iIndexEntry, m_iDocinfo );
  15182. fflush ( fp );
  15183. }
  15184. }
  15185. }
  15186. ///////////////////////////
  15187. // check kill-list
  15188. ///////////////////////////
  15189. fprintf ( fp, "checking kill-list...\n" );
  15190. // check size
  15191. if ( m_pKillList.GetNumEntries()!=m_iKillListSize )
  15192. LOC_FAIL(( fp, "kill-list size differs (expected=%d, got="INT64_FMT")",
  15193. m_iKillListSize, (int64_t)m_pKillList.GetNumEntries() ));
  15194. // check that ids are ascending
  15195. for ( DWORD uID=1; uID<m_pKillList.GetNumEntries(); uID++ )
  15196. if ( m_pKillList[uID]<=m_pKillList[uID-1] )
  15197. LOC_FAIL(( fp, "unsorted kill-list values (val[%d]=%d, val[%d]=%d)",
  15198. uID-1, (DWORD)m_pKillList[uID-1], uID, (DWORD)m_pKillList[uID] ));
  15199. ///////////////////////////
  15200. // all finished
  15201. ///////////////////////////
  15202. // well, no known kinds of failures, maybe some unknown ones
  15203. tmCheck = sphMicroTimer() - tmCheck;
  15204. if ( !iFails )
  15205. fprintf ( fp, "check passed" );
  15206. else if ( iFails!=iFailsPrinted )
  15207. fprintf ( fp, "check FAILED, %d of %d failures reported", iFailsPrinted, iFails );
  15208. else
  15209. fprintf ( fp, "check FAILED, %d failures reported", iFails );
  15210. fprintf ( fp, ", %d.%d sec elapsed\n", (int)(tmCheck/1000000), (int)((tmCheck/100000)%10) );
  15211. return Min ( iFails, 255 ); // this is the exitcode; so cap it
  15212. } // NOLINT function length
  15213. //////////////////////////////////////////////////////////////////////////
  15214. /// morphology
  /// Identifiers of the morphology processors a dictionary can apply.
  /// InitMorph() maps configuration names (e.g. "stem_en", "soundex") onto these values.
  15215. enum
  15216. {
  15217. SPH_MORPH_STEM_EN,
  15218. SPH_MORPH_STEM_RU_CP1251,
  15219. SPH_MORPH_STEM_RU_UTF8,
  15220. SPH_MORPH_STEM_CZ,
  15221. SPH_MORPH_STEM_AR_UTF8,
  15222. SPH_MORPH_SOUNDEX,
  15223. SPH_MORPH_METAPHONE_SBCS,
  15224. SPH_MORPH_METAPHONE_UTF8,
  15225. SPH_MORPH_AOTLEMMER_RU_CP1251,
  15226. SPH_MORPH_AOTLEMMER_RU_UTF8,
  15227. SPH_MORPH_AOTLEMMER_RU_ALL,
  // libstemmer-based stemmers are numbered from here: InitMorph() registers the n-th one as FIRST+n
  15228. SPH_MORPH_LIBSTEMMER_FIRST,
  15229. SPH_MORPH_LIBSTEMMER_LAST = SPH_MORPH_LIBSTEMMER_FIRST + 64
  15230. };
  15231. /////////////////////////////////////////////////////////////////////////////
  15232. // BASE DICTIONARY INTERFACE
  15233. /////////////////////////////////////////////////////////////////////////////
  15234. void CSphDict::DictBegin ( CSphAutofile &, CSphAutofile &, int, ThrottleState_t * ) {}
  15235. void CSphDict::DictEntry ( const CSphDictEntry & ) {}
  15236. void CSphDict::DictEndEntries ( SphOffset_t ) {}
  15237. bool CSphDict::DictEnd ( DictHeader_t *, int, CSphString &, ThrottleState_t * ) { return true; }
  15238. bool CSphDict::DictIsError () const { return true; }
  15239. /////////////////////////////////////////////////////////////////////////////
  15240. // CRC32/64 DICTIONARIES
  15241. /////////////////////////////////////////////////////////////////////////////
  15242. /// common CRC32/64 dictionary stuff
  /// Shared base of the hashing keyword dictionaries (see CSphDictCRC).
  /// Holds the list of morphology processors, the stopword ID list, the wordforms
  /// container and the state of the dictionary file writer.
  15243. struct CSphDictCRCTraits : CSphDict
  15244. {
  15245. CSphDictCRCTraits ();
  15246. virtual ~CSphDictCRCTraits ();
  15247. virtual void LoadStopwords ( const char * sFiles, const ISphTokenizer * pTokenizer );
  15248. virtual void LoadStopwords ( const CSphVector<SphWordID_t> & dStopwords );
  15249. virtual void WriteStopwords ( CSphWriter & tWriter );
  15250. virtual bool LoadWordforms ( const CSphVector<CSphString> & dFiles, const CSphEmbeddedFiles * pEmbedded, const ISphTokenizer * pTokenizer, const char * sIndex );
  15251. virtual void WriteWordforms ( CSphWriter & tWriter );
  15252. virtual const CSphWordforms * GetWordforms() { return m_pWordforms; }
  15253. virtual void DisableWordforms() { m_bDisableWordforms = true; }
  15254. virtual int SetMorphology ( const char * szMorph, bool bUseUTF8, CSphString & sMessage );
  15255. virtual bool HasMorphology() const;
  15256. virtual void ApplyStemmers ( BYTE * pWord );
  15257. virtual void Setup ( const CSphDictSettings & tSettings ) { m_tSettings = tSettings; }
  15258. virtual const CSphDictSettings & GetSettings () const { return m_tSettings; }
  15259. virtual const CSphVector <CSphSavedFile> & GetStopwordsFileInfos () { return m_dSWFileInfos; }
  15260. virtual const CSphVector <CSphSavedFile> & GetWordformsFileInfos () { return m_dWFFileInfos; }
  15261. virtual const CSphMultiformContainer * GetMultiWordforms () const;
  // deletes cached wordform containers that are unreferenced and do not match dFiles
  15262. static void SweepWordformContainers ( const CSphVector<CSphSavedFile> & dFiles );
  15263. virtual void DictBegin ( CSphAutofile & tTempDict, CSphAutofile & tDict, int iDictLimit, ThrottleState_t * pThrottle );
  15264. virtual void DictEntry ( const CSphDictEntry & tEntry );
  15265. virtual void DictEndEntries ( SphOffset_t iDoclistOffset );
  15266. virtual bool DictEnd ( DictHeader_t * pHeader, int iMemLimit, CSphString & sError, ThrottleState_t * );
  15267. virtual bool DictIsError () const { return m_wrDict.IsError(); }
  15268. protected:
  // morphology processor ids (SPH_MORPH_xxx), tried in this order by ApplyStemmers()
  15269. CSphVector < int > m_dMorph;
  15270. #if USE_LIBSTEMMER
  // libstemmer handles; kept the same length as m_dDescStemmers (CloneBase() asserts this)
  15271. CSphVector < sb_stemmer * > m_dStemmers;
  // algorithm and encoding names needed to re-create a stemmer handle in a cloned dictionary
  15272. struct DescStemmer_t
  15273. {
  15274. CSphString m_sAlgo;
  15275. CSphString m_sEnc;
  15276. };
  15277. CSphVector<DescStemmer_t> m_dDescStemmers;
  15278. #endif
  15279. int m_iStopwords; ///< stopwords count
  15280. SphWordID_t * m_pStopwords; ///< stopwords ID list
  // storage that m_pStopwords points into once LoadStopwords() has run on this dictionary
  15281. CSphFixedVector<SphWordID_t> m_dStopwordContainer;
  15282. protected:
  15283. int ParseMorphology ( const char * szMorph, bool bUseUTF8, CSphString & sError );
  15284. SphWordID_t FilterStopword ( SphWordID_t uID ) const; ///< filter ID against stopwords list
  // copies settings, stopwords, wordforms and morphology into pDict and returns it
  15285. CSphDict * CloneBase ( CSphDictCRCTraits * pDict ) const;
  15286. virtual bool HasState () const;
  15287. CSphTightVector<CSphWordlistCheckpoint> m_dCheckpoints; ///< checkpoint offsets
  15288. CSphWriter m_wrDict; ///< final dict file writer
  15289. CSphString m_sWriterError; ///< writer error message storage
  15290. int m_iEntries; ///< dictionary entries stored
  15291. SphOffset_t m_iLastDoclistPos;
  15292. SphWordID_t m_iLastWordID;
  // set by DisableWordforms(); makes ApplyStemmers() skip wordform substitution
  15293. bool m_bDisableWordforms;
  15294. private:
  // wordforms container; its m_iRefCount is bumped by CloneBase() and dropped by the destructor
  15295. CSphWordforms * m_pWordforms;
  15296. CSphVector<CSphSavedFile> m_dSWFileInfos;
  15297. CSphVector<CSphSavedFile> m_dWFFileInfos;
  15298. CSphDictSettings m_tSettings;
  // wordform containers shared by all dictionaries; pruned by SweepWordformContainers()
  15299. static CSphVector<CSphWordforms*> m_dWordformContainers;
  15300. CSphWordforms * GetWordformContainer ( const CSphVector<CSphSavedFile> & dFileInfos, const CSphVector<CSphString> * pEmbeddedWordforms, const ISphTokenizer * pTokenizer, const char * sIndex );
  15301. CSphWordforms * LoadWordformContainer ( const CSphVector<CSphSavedFile> & dFileInfos, const CSphVector<CSphString> * pEmbeddedWordforms, const ISphTokenizer * pTokenizer, const char * sIndex );
  15302. int InitMorph ( const char * szMorph, int iLength, bool bUseUTF8, CSphString & sError );
  15303. int AddMorph ( int iMorph ); ///< helper that always returns ST_OK
  15304. bool StemById ( BYTE * pWord, int iStemmer );
  15305. void AddWordform ( CSphWordforms * pContainer, char * sBuffer, int iLen, ISphTokenizer * pTokenizer, const char * szFile );
  15306. };
  // storage for the static container list declared in the class
  15307. CSphVector<CSphWordforms*> CSphDictCRCTraits::m_dWordformContainers;
  15308. /// specialized CRC32/64 implementations
  /// Dictionary that turns words into IDs by hashing them.
  /// The DoCrc() specializations further down use sphCRC32() for CRC32DICT=true
  /// and sphFNV64() for CRC32DICT=false.
  15309. template < bool CRC32DICT >
  15310. struct CSphDictCRC : public CSphDictCRCTraits
  15311. {
  // raw hash of a word, without any stemming or stopword filtering
  15312. inline SphWordID_t DoCrc ( const BYTE * pWord ) const;
  15313. inline SphWordID_t DoCrc ( const BYTE * pWord, int iLen ) const;
  15314. virtual SphWordID_t GetWordID ( BYTE * pWord );
  15315. virtual SphWordID_t GetWordID ( const BYTE * pWord, int iLen, bool bFilterStops );
  15316. virtual SphWordID_t GetWordIDWithMarkers ( BYTE * pWord );
  15317. virtual SphWordID_t GetWordIDNonStemmed ( BYTE * pWord );
  15318. virtual bool IsStopWord ( const BYTE * pWord ) const;
  // a clone is a fresh dictionary of the same flavour, filled in by CloneBase()
  15319. virtual CSphDict * Clone () const { return CloneBase ( new CSphDictCRC<CRC32DICT>() ); }
  15320. };
  15321. /////////////////////////////////////////////////////////////////////////////
  15322. uint64_t sphFNV64 ( const BYTE * s )
  15323. {
  15324. return sphFNV64cont ( s, SPH_FNV64_SEED );
  15325. }
  15326. uint64_t sphFNV64 ( const BYTE * s, int iLen, uint64_t uPrev )
  15327. {
  15328. uint64_t hval = uPrev;
  15329. for ( ; iLen>0; iLen-- )
  15330. {
  15331. // xor the bottom with the current octet
  15332. hval ^= (uint64_t)*s++;
  15333. // multiply by the 64 bit FNV magic prime mod 2^64
  15334. hval += (hval << 1) + (hval << 4) + (hval << 5) + (hval << 7) + (hval << 8) + (hval << 40); // gcc optimization
  15335. }
  15336. return hval;
  15337. }
  15338. uint64_t sphFNV64cont ( const BYTE * s, uint64_t uPrev )
  15339. {
  15340. uint64_t hval = uPrev;
  15341. while ( *s )
  15342. {
  15343. // xor the bottom with the current octet
  15344. hval ^= (uint64_t)*s++;
  15345. // multiply by the 64 bit FNV magic prime mod 2^64
  15346. hval += (hval << 1) + (hval << 4) + (hval << 5) + (hval << 7) + (hval << 8) + (hval << 40); // gcc optimization
  15347. }
  15348. return hval;
  15349. }
  15350. /////////////////////////////////////////////////////////////////////////////
  15351. extern DWORD g_dSphinxCRC32 [ 256 ];
  15352. bool sphCalcFileCRC32 ( const char * szFilename, DWORD & uCRC32 )
  15353. {
  15354. uCRC32 = 0;
  15355. if ( !szFilename )
  15356. return false;
  15357. FILE * pFile = fopen ( szFilename, "rb" );
  15358. if ( !pFile )
  15359. return false;
  15360. DWORD crc = ~((DWORD)0);
  15361. const int BUFFER_SIZE = 131072;
  15362. static BYTE * pBuffer = NULL;
  15363. if ( !pBuffer )
  15364. pBuffer = new BYTE [ BUFFER_SIZE ];
  15365. int iBytesRead;
  15366. while ( ( iBytesRead = fread ( pBuffer, 1, BUFFER_SIZE, pFile ) )!=0 )
  15367. {
  15368. for ( int i=0; i<iBytesRead; i++ )
  15369. crc = (crc >> 8) ^ g_dSphinxCRC32 [ (crc ^ pBuffer[i]) & 0xff ];
  15370. }
  15371. fclose ( pFile );
  15372. uCRC32 = ~crc;
  15373. return true;
  15374. }
  15375. static bool GetFileStats ( const char * szFilename, CSphSavedFile & tInfo )
  15376. {
  15377. if ( !szFilename )
  15378. {
  15379. memset ( &tInfo, 0, sizeof ( tInfo ) );
  15380. return false;
  15381. }
  15382. tInfo.m_sFilename = szFilename;
  15383. struct_stat tStat;
  15384. memset ( &tStat, 0, sizeof ( tStat ) );
  15385. if ( stat ( szFilename, &tStat ) < 0 )
  15386. memset ( &tStat, 0, sizeof ( tStat ) );
  15387. tInfo.m_uSize = tStat.st_size;
  15388. tInfo.m_uCTime = tStat.st_ctime;
  15389. tInfo.m_uMTime = tStat.st_mtime;
  15390. DWORD uCRC32 = 0;
  15391. if ( !sphCalcFileCRC32 ( szFilename, uCRC32 ) )
  15392. return false;
  15393. tInfo.m_uCRC32 = uCRC32;
  15394. return true;
  15395. }
  15396. /////////////////////////////////////////////////////////////////////////////
  15397. CSphWordforms::CSphWordforms()
  15398. : m_iRefCount ( 0 )
  15399. , m_uTokenizerFNV ( 0 )
  15400. , m_bHavePostMorphNF ( false )
  15401. , m_pMultiWordforms ( NULL )
  15402. {
  15403. }
  15404. CSphWordforms::~CSphWordforms()
  15405. {
  15406. if ( m_pMultiWordforms )
  15407. {
  15408. m_pMultiWordforms->m_Hash.IterateStart ();
  15409. while ( m_pMultiWordforms->m_Hash.IterateNext () )
  15410. {
  15411. CSphMultiforms * pWordforms = m_pMultiWordforms->m_Hash.IterateGet ();
  15412. ARRAY_FOREACH ( i, pWordforms->m_pForms )
  15413. SafeDelete ( pWordforms->m_pForms[i] );
  15414. SafeDelete ( pWordforms );
  15415. }
  15416. SafeDelete ( m_pMultiWordforms );
  15417. }
  15418. }
  15419. bool CSphWordforms::IsEqual ( const CSphVector<CSphSavedFile> & dFiles )
  15420. {
  15421. if ( m_dFiles.GetLength()!=dFiles.GetLength() )
  15422. return false;
  15423. ARRAY_FOREACH ( i, m_dFiles )
  15424. {
  15425. const CSphSavedFile & tF1 = m_dFiles[i];
  15426. const CSphSavedFile & tF2 = dFiles[i];
  15427. if ( tF1.m_sFilename!=tF2.m_sFilename || tF1.m_uCRC32!=tF2.m_uCRC32 || tF1.m_uSize!=tF2.m_uSize ||
  15428. tF1.m_uCTime!=tF2.m_uCTime || tF1.m_uMTime!=tF2.m_uMTime )
  15429. return false;
  15430. }
  15431. return true;
  15432. }
  15433. bool CSphWordforms::ToNormalForm ( BYTE * pWord, bool bBefore ) const
  15434. {
  15435. int * pIndex = m_dHash ( (char *)pWord );
  15436. if ( !pIndex )
  15437. return false;
  15438. if ( *pIndex<0 || *pIndex>=m_dNormalForms.GetLength () )
  15439. return false;
  15440. if ( bBefore==m_dNormalForms[*pIndex].m_bAfterMorphology )
  15441. return false;
  15442. if ( m_dNormalForms [*pIndex].m_sWord.IsEmpty () )
  15443. return false;
  15444. strcpy ( (char *)pWord, m_dNormalForms[*pIndex].m_sWord.cstr() ); // NOLINT
  15445. return true;
  15446. }
  15447. /////////////////////////////////////////////////////////////////////////////
  15448. CSphDictCRCTraits::CSphDictCRCTraits ()
  15449. : m_iStopwords ( 0 )
  15450. , m_pStopwords ( NULL )
  15451. , m_dStopwordContainer ( 0 )
  15452. , m_iEntries ( 0 )
  15453. , m_iLastDoclistPos ( 0 )
  15454. , m_iLastWordID ( 0 )
  15455. , m_bDisableWordforms ( false )
  15456. , m_pWordforms ( NULL )
  15457. {
  15458. }
  15459. CSphDictCRCTraits::~CSphDictCRCTraits ()
  15460. {
  15461. #if USE_LIBSTEMMER
  15462. ARRAY_FOREACH ( i, m_dStemmers )
  15463. sb_stemmer_delete ( m_dStemmers[i] );
  15464. #endif
  15465. if ( m_pWordforms )
  15466. --m_pWordforms->m_iRefCount;
  15467. }
  15468. SphWordID_t CSphDictCRCTraits::FilterStopword ( SphWordID_t uID ) const
  15469. {
  15470. if ( !m_iStopwords )
  15471. return uID;
  15472. // OPTIMIZE: binary search is not too good, could do some hashing instead
  15473. SphWordID_t * pStart = m_pStopwords;
  15474. SphWordID_t * pEnd = m_pStopwords + m_iStopwords - 1;
  15475. do
  15476. {
  15477. if ( uID==*pStart || uID==*pEnd )
  15478. return 0;
  15479. if ( uID<*pStart || uID>*pEnd )
  15480. return uID;
  15481. SphWordID_t * pMid = pStart + (pEnd-pStart)/2;
  15482. if ( uID==*pMid )
  15483. return 0;
  15484. if ( uID<*pMid )
  15485. pEnd = pMid;
  15486. else
  15487. pStart = pMid;
  15488. } while ( pEnd-pStart>1 );
  15489. return uID;
  15490. }
  15491. int CSphDictCRCTraits::ParseMorphology ( const char * sMorph, bool bUseUTF8, CSphString & sMessage )
  15492. {
  15493. int iRes = ST_OK;
  15494. for ( const char * sStart=sMorph; ; )
  15495. {
  15496. while ( *sStart && ( sphIsSpace ( *sStart ) || *sStart==',' ) )
  15497. ++sStart;
  15498. if ( !*sStart )
  15499. break;
  15500. const char * sWordStart = sStart;
  15501. while ( *sStart && !sphIsSpace ( *sStart ) && *sStart!=',' )
  15502. ++sStart;
  15503. if ( sStart > sWordStart )
  15504. {
  15505. switch ( InitMorph ( sWordStart, sStart - sWordStart, bUseUTF8, sMessage ) )
  15506. {
  15507. case ST_ERROR: return ST_ERROR;
  15508. case ST_WARNING: iRes = ST_WARNING;
  15509. default: break;
  15510. }
  15511. }
  15512. }
  15513. return iRes;
  15514. }
  15515. int CSphDictCRCTraits::InitMorph ( const char * szMorph, int iLength, bool bUseUTF8, CSphString & sMessage )
  15516. {
  15517. if ( iLength==0 )
  15518. return ST_OK;
  15519. if ( iLength==4 && !strncmp ( szMorph, "none", iLength ) )
  15520. return ST_OK;
  15521. if ( iLength==7 && !strncmp ( szMorph, "stem_en", iLength ) )
  15522. {
  15523. stem_en_init ();
  15524. return AddMorph ( SPH_MORPH_STEM_EN );
  15525. }
  15526. if ( iLength==7 && !strncmp ( szMorph, "stem_ru", iLength ) )
  15527. {
  15528. if ( m_dMorph.Contains ( SPH_MORPH_AOTLEMMER_RU_CP1251 ) || m_dMorph.Contains ( SPH_MORPH_AOTLEMMER_RU_UTF8 ) )
  15529. {
  15530. sMessage.SetSprintf ( "stem_ru and lemmatize_ru clash" );
  15531. return ST_ERROR;
  15532. }
  15533. if ( m_dMorph.Contains ( SPH_MORPH_AOTLEMMER_RU_ALL ) )
  15534. {
  15535. sMessage.SetSprintf ( "stem_ru and lemmatize_ru_all clash" );
  15536. return ST_ERROR;
  15537. }
  15538. stem_ru_init ();
  15539. return AddMorph ( bUseUTF8 ? SPH_MORPH_STEM_RU_UTF8 : SPH_MORPH_STEM_RU_CP1251 );
  15540. }
  15541. if ( iLength==12 && !strncmp ( szMorph, "lemmatize_ru", iLength ) )
  15542. {
  15543. if ( m_dMorph.Contains ( SPH_MORPH_STEM_RU_CP1251 ) || m_dMorph.Contains ( SPH_MORPH_STEM_RU_UTF8 ) )
  15544. {
  15545. sMessage.SetSprintf ( "stem_ru and lemmatize_ru clash" );
  15546. return ST_ERROR;
  15547. }
  15548. if ( m_dMorph.Contains ( SPH_MORPH_AOTLEMMER_RU_ALL ) )
  15549. {
  15550. sMessage.SetSprintf ( "lemmatize_ru and lemmatize_ru_all clash" );
  15551. return ST_ERROR;
  15552. }
  15553. CSphString sDictFile;
  15554. sDictFile.SetSprintf ( "%s/ru.pak", g_sLemmatizerBase.cstr() );
  15555. if ( !sphAotInitRu ( sDictFile, sMessage ) )
  15556. return ST_ERROR;
  15557. // add manually instead of AddMorph(), because we need to update that fingerprint
  15558. int iMorph = bUseUTF8 ? SPH_MORPH_AOTLEMMER_RU_UTF8 : SPH_MORPH_AOTLEMMER_RU_CP1251;
  15559. if ( !m_dMorph.Contains ( iMorph ) )
  15560. {
  15561. assert ( m_sMorphFingerprint.IsEmpty() ); // otherwise, append a command and dictionfo
  15562. m_sMorphFingerprint.SetSprintf ( "%s:%08x", sphAotDictinfoRu().m_sName.cstr(), sphAotDictinfoRu().m_iValue );
  15563. m_dMorph.Add ( iMorph );
  15564. }
  15565. return ST_OK;
  15566. }
  15567. if ( iLength==16 && !strncmp ( szMorph, "lemmatize_ru_all", iLength ) )
  15568. {
  15569. if ( m_dMorph.Contains ( SPH_MORPH_STEM_RU_CP1251 ) || m_dMorph.Contains ( SPH_MORPH_STEM_RU_UTF8 ) )
  15570. {
  15571. sMessage.SetSprintf ( "stem_ru and lemmatize_ru_all clash" );
  15572. return ST_ERROR;
  15573. }
  15574. if ( m_dMorph.Contains ( SPH_MORPH_AOTLEMMER_RU_CP1251 ) || m_dMorph.Contains ( SPH_MORPH_AOTLEMMER_RU_UTF8 ) )
  15575. {
  15576. sMessage.SetSprintf ( "lemmatize_ru and lemmatize_ru_all clash" );
  15577. return ST_ERROR;
  15578. }
  15579. return AddMorph ( SPH_MORPH_AOTLEMMER_RU_ALL );
  15580. }
  15581. if ( iLength==7 && !strncmp ( szMorph, "stem_cz", iLength ) )
  15582. {
  15583. stem_cz_init ();
  15584. return AddMorph ( SPH_MORPH_STEM_CZ );
  15585. }
  15586. if ( iLength==7 && !strncmp ( szMorph, "stem_ar", iLength ) )
  15587. {
  15588. if ( !bUseUTF8 )
  15589. {
  15590. sMessage.SetSprintf ( "stem_ar only supports charset_type = utf-8" );
  15591. return ST_ERROR;
  15592. }
  15593. return AddMorph ( SPH_MORPH_STEM_AR_UTF8 );
  15594. }
  15595. if ( iLength==9 && !strncmp ( szMorph, "stem_enru", iLength ) )
  15596. {
  15597. stem_en_init ();
  15598. stem_ru_init ();
  15599. AddMorph ( SPH_MORPH_STEM_EN );
  15600. return AddMorph ( bUseUTF8 ? SPH_MORPH_STEM_RU_UTF8 : SPH_MORPH_STEM_RU_CP1251 );
  15601. }
  15602. if ( iLength==7 && !strncmp ( szMorph, "soundex", iLength ) )
  15603. return AddMorph ( SPH_MORPH_SOUNDEX );
  15604. if ( iLength==9 && !strncmp ( szMorph, "metaphone", iLength ) )
  15605. return AddMorph ( bUseUTF8 ? SPH_MORPH_METAPHONE_UTF8 : SPH_MORPH_METAPHONE_SBCS );
  15606. #if USE_LIBSTEMMER
  15607. const int LIBSTEMMER_LEN = 11;
  15608. const int MAX_ALGO_LENGTH = 64;
  15609. if ( iLength > LIBSTEMMER_LEN && iLength - LIBSTEMMER_LEN < MAX_ALGO_LENGTH && !strncmp ( szMorph, "libstemmer_", LIBSTEMMER_LEN ) )
  15610. {
  15611. CSphString sAlgo;
  15612. CSphString sEnc;
  15613. sAlgo.SetBinary ( szMorph+LIBSTEMMER_LEN, iLength - LIBSTEMMER_LEN );
  15614. sb_stemmer * pStemmer = NULL;
  15615. if ( bUseUTF8 )
  15616. {
  15617. sEnc = "UTF_8";
  15618. pStemmer = sb_stemmer_new ( sAlgo.cstr(), sEnc.cstr() );
  15619. } else
  15620. {
  15621. sEnc = "ISO_8859_1";
  15622. pStemmer = sb_stemmer_new ( sAlgo.cstr(), sEnc.cstr() );
  15623. if ( !pStemmer )
  15624. {
  15625. sEnc = "ISO_8859_2";
  15626. pStemmer = sb_stemmer_new ( sAlgo.cstr(), sEnc.cstr() );
  15627. }
  15628. if ( !pStemmer )
  15629. {
  15630. sEnc = "KOI8_R";
  15631. pStemmer = sb_stemmer_new ( sAlgo.cstr(), sEnc.cstr() );
  15632. }
  15633. }
  15634. if ( !pStemmer )
  15635. {
  15636. sError.SetSprintf ( "unknown %s stemmer libstemmer_%s; skipped",
  15637. bUseUTF8 ? "UTF-8" : "SBCS", sAlgo.cstr(), );
  15638. return ST_WARNING;
  15639. }
  15640. AddMorph ( SPH_MORPH_LIBSTEMMER_FIRST + m_dStemmers.GetLength () );
  15641. ARRAY_FOREACH ( i, m_dStemmers )
  15642. {
  15643. if ( m_dStemmers[i]==pStemmer )
  15644. {
  15645. sb_stemmer_delete ( pStemmer );
  15646. return ST_OK;
  15647. }
  15648. }
  15649. m_dStemmers.Add ( pStemmer );
  15650. DescStemmer_t & tDesc = m_dDescStemmers.Add();
  15651. tDesc.m_sAlgo.Swap ( sAlgo );
  15652. tDesc.m_sEnc.Swap ( sEnc );
  15653. return ST_OK;
  15654. }
  15655. #endif
  15656. sMessage.SetBinary ( szMorph, iLength );
  15657. sMessage.SetSprintf ( "unknown stemmer %s; skipped", sMessage.cstr() );
  15658. return ST_WARNING;
  15659. }
  15660. int CSphDictCRCTraits::AddMorph ( int iMorph )
  15661. {
  15662. if ( !m_dMorph.Contains ( iMorph ) )
  15663. m_dMorph.Add ( iMorph );
  15664. return ST_OK;
  15665. }
  15666. void CSphDictCRCTraits::ApplyStemmers ( BYTE * pWord )
  15667. {
  15668. // try wordforms
  15669. if ( !m_bDisableWordforms && m_pWordforms && m_pWordforms->ToNormalForm ( pWord, true ) )
  15670. return;
  15671. // check length
  15672. if ( m_tSettings.m_iMinStemmingLen<=1 || sphUTF8Len ( (const char*)pWord )>=m_tSettings.m_iMinStemmingLen )
  15673. {
  15674. // try stemmers
  15675. ARRAY_FOREACH ( i, m_dMorph )
  15676. if ( StemById ( pWord, m_dMorph[i] ) )
  15677. break;
  15678. }
  15679. if ( !m_bDisableWordforms && m_pWordforms && m_pWordforms->m_bHavePostMorphNF )
  15680. m_pWordforms->ToNormalForm ( pWord, false );
  15681. }
  15682. const CSphMultiformContainer * CSphDictCRCTraits::GetMultiWordforms () const
  15683. {
  15684. return m_pWordforms ? m_pWordforms->m_pMultiWordforms : NULL;
  15685. }
  15686. CSphDict * CSphDictCRCTraits::CloneBase ( CSphDictCRCTraits * pDict ) const
  15687. {
  15688. assert ( pDict );
  15689. pDict->m_tSettings = m_tSettings;
  15690. pDict->m_iStopwords = m_iStopwords;
  15691. pDict->m_pStopwords = m_pStopwords;
  15692. pDict->m_pWordforms = m_pWordforms;
  15693. if ( m_pWordforms )
  15694. m_pWordforms->m_iRefCount++;
  15695. pDict->m_dMorph = m_dMorph;
  15696. #if USE_LIBSTEMMER
  15697. assert ( m_dDescStemmers.GetLength()==m_dStemmers.GetLength() );
  15698. pDict->m_dDescStemmers = m_dDescStemmers;
  15699. ARRAY_FOREACH ( i, m_dDescStemmers )
  15700. {
  15701. pDict->m_dStemmers.Add ( sb_stemmer_new ( m_dDescStemmers[i].m_sAlgo.cstr(), m_dDescStemmers[i].m_sEnc.cstr() ) );
  15702. assert ( pDict->m_dStemmers.Last() );
  15703. }
  15704. #endif
  15705. return pDict;
  15706. }
  15707. bool CSphDictCRCTraits::HasState() const
  15708. {
  15709. #if !USE_LIBSTEMMER
  15710. return false;
  15711. #else
  15712. return ( m_dDescStemmers.GetLength()>0 );
  15713. #endif
  15714. }
  15715. /////////////////////////////////////////////////////////////////////////////
  15716. template<>
  15717. SphWordID_t CSphDictCRC<true>::DoCrc ( const BYTE * pWord ) const
  15718. {
  15719. return sphCRC32 ( pWord );
  15720. }
  15721. template<>
  15722. SphWordID_t CSphDictCRC<false>::DoCrc ( const BYTE * pWord ) const
  15723. {
  15724. return (SphWordID_t) sphFNV64 ( pWord );
  15725. }
  15726. template<>
  15727. SphWordID_t CSphDictCRC<true>::DoCrc ( const BYTE * pWord, int iLen ) const
  15728. {
  15729. return sphCRC32 ( pWord, iLen );
  15730. }
  15731. template<>
  15732. SphWordID_t CSphDictCRC<false>::DoCrc ( const BYTE * pWord, int iLen ) const
  15733. {
  15734. return (SphWordID_t) sphFNV64 ( pWord, iLen );
  15735. }
  15736. template < bool CRC32DICT >
  15737. SphWordID_t CSphDictCRC<CRC32DICT>::GetWordID ( BYTE * pWord )
  15738. {
  15739. // apply stopword filter before stemmers
  15740. if ( GetSettings().m_bStopwordsStem && !FilterStopword ( DoCrc ( pWord ) ) )
  15741. return 0;
  15742. // skip stemmers for magic words
  15743. if ( pWord[0]>=0x20 )
  15744. ApplyStemmers ( pWord );
  15745. return GetSettings().m_bStopwordsStem ? DoCrc ( pWord ) : FilterStopword ( DoCrc ( pWord ) );
  15746. }
  15747. template < bool CRC32DICT >
  15748. SphWordID_t CSphDictCRC<CRC32DICT>::GetWordID ( const BYTE * pWord, int iLen, bool bFilterStops )
  15749. {
  15750. SphWordID_t uId = DoCrc ( pWord, iLen );
  15751. return bFilterStops ? FilterStopword ( uId ) : uId;
  15752. }
  15753. template < bool CRC32DICT >
  15754. SphWordID_t CSphDictCRC<CRC32DICT>::GetWordIDWithMarkers ( BYTE * pWord )
  15755. {
  15756. ApplyStemmers ( pWord + 1 );
  15757. SphWordID_t uWordId = DoCrc ( pWord + 1 );
  15758. int iLength = strlen ( (const char *)(pWord + 1) );
  15759. pWord [iLength + 1] = MAGIC_WORD_TAIL;
  15760. pWord [iLength + 2] = '\0';
  15761. return FilterStopword ( uWordId ) ? DoCrc ( pWord ) : 0;
  15762. }
  15763. template < bool CRC32DICT >
  15764. SphWordID_t CSphDictCRC<CRC32DICT>::GetWordIDNonStemmed ( BYTE * pWord )
  15765. {
  15766. SphWordID_t uWordId = DoCrc ( pWord + 1 );
  15767. if ( !FilterStopword ( uWordId ) )
  15768. return 0;
  15769. return DoCrc ( pWord );
  15770. }
  15771. template < bool CRC32DICT >
  15772. bool CSphDictCRC<CRC32DICT>::IsStopWord ( const BYTE * pWord ) const
  15773. {
  15774. return FilterStopword ( DoCrc ( pWord ) )==0;
  15775. }
  15776. //////////////////////////////////////////////////////////////////////////
  15777. void CSphDictCRCTraits::LoadStopwords ( const char * sFiles, const ISphTokenizer * pTokenizer )
  15778. {
  15779. assert ( !m_pStopwords );
  15780. assert ( !m_iStopwords );
  15781. // tokenize file list
  15782. if ( !sFiles || !*sFiles )
  15783. return;
  15784. m_dSWFileInfos.Resize ( 0 );
  15785. CSphScopedPtr<ISphTokenizer> tTokenizer ( pTokenizer->Clone ( SPH_CLONE_INDEX ) );
  15786. CSphFixedVector<char> dList ( 1+strlen(sFiles) );
  15787. strcpy ( dList.Begin(), sFiles ); // NOLINT
  15788. char * pCur = dList.Begin();
  15789. char * sName = NULL;
  15790. CSphVector<SphWordID_t> dStop;
  15791. for ( ;; )
  15792. {
  15793. // find next name start
  15794. while ( *pCur && isspace(*pCur) ) pCur++;
  15795. if ( !*pCur ) break;
  15796. sName = pCur;
  15797. // find next name end
  15798. while ( *pCur && !isspace(*pCur) ) pCur++;
  15799. if ( *pCur ) *pCur++ = '\0';
  15800. BYTE * pBuffer = NULL;
  15801. CSphSavedFile tInfo;
  15802. tInfo.m_sFilename = sName;
  15803. GetFileStats ( sName, tInfo );
  15804. m_dSWFileInfos.Add ( tInfo );
  15805. // open file
  15806. struct_stat st;
  15807. if ( stat ( sName, &st )==0 )
  15808. pBuffer = new BYTE [(size_t)st.st_size];
  15809. else
  15810. {
  15811. sphWarn ( "stopwords: failed to get file size for '%s'", sName );
  15812. continue;
  15813. }
  15814. FILE * fp = fopen ( sName, "rb" );
  15815. if ( !fp )
  15816. {
  15817. sphWarn ( "failed to load stopwords from '%s'", sName );
  15818. SafeDeleteArray ( pBuffer );
  15819. continue;
  15820. }
  15821. // tokenize file
  15822. int iLength = (int)fread ( pBuffer, 1, (size_t)st.st_size, fp );
  15823. BYTE * pToken;
  15824. tTokenizer->SetBuffer ( pBuffer, iLength );
  15825. while ( ( pToken = tTokenizer->GetToken() )!=NULL )
  15826. dStop.Add ( GetWordID ( pToken ) );
  15827. // close file
  15828. fclose ( fp );
  15829. SafeDeleteArray ( pBuffer );
  15830. }
  15831. // sort stopwords
  15832. dStop.Uniq();
  15833. // store IDs
  15834. if ( dStop.GetLength() )
  15835. {
  15836. m_dStopwordContainer.Reset ( dStop.GetLength() );
  15837. ARRAY_FOREACH ( i, dStop )
  15838. m_dStopwordContainer[i] = dStop[i];
  15839. m_iStopwords = m_dStopwordContainer.GetLength ();
  15840. m_pStopwords = m_dStopwordContainer.Begin();
  15841. }
  15842. }
  15843. void CSphDictCRCTraits::LoadStopwords ( const CSphVector<SphWordID_t> & dStopwords )
  15844. {
  15845. m_dStopwordContainer.Reset ( dStopwords.GetLength() );
  15846. ARRAY_FOREACH ( i, dStopwords )
  15847. m_dStopwordContainer[i] = dStopwords[i];
  15848. m_iStopwords = m_dStopwordContainer.GetLength ();
  15849. m_pStopwords = m_dStopwordContainer.Begin();
  15850. }
  15851. void CSphDictCRCTraits::WriteStopwords ( CSphWriter & tWriter )
  15852. {
  15853. tWriter.PutDword ( (DWORD)m_iStopwords );
  15854. for ( int i = 0; i < m_iStopwords; i++ )
  15855. tWriter.ZipOffset ( m_pStopwords[i] );
  15856. }
  15857. void CSphDictCRCTraits::SweepWordformContainers ( const CSphVector<CSphSavedFile> & dFiles )
  15858. {
  15859. for ( int i = 0; i < m_dWordformContainers.GetLength (); )
  15860. {
  15861. CSphWordforms * WC = m_dWordformContainers[i];
  15862. if ( WC->m_iRefCount==0 && !WC->IsEqual ( dFiles ) )
  15863. {
  15864. delete WC;
  15865. m_dWordformContainers.Remove ( i );
  15866. } else
  15867. ++i;
  15868. }
  15869. }
  15870. static const char * ConcatReportStrings ( const CSphVector<CSphString> & dStrings )
  15871. {
  15872. const int MAX_REPORT_LEN = 1024;
  15873. static char szReport[MAX_REPORT_LEN];
  15874. szReport[0] = '\0';
  15875. ARRAY_FOREACH ( i, dStrings )
  15876. {
  15877. int iLen = strlen ( szReport );
  15878. if ( iLen + dStrings[i].Length() + 2 > MAX_REPORT_LEN )
  15879. break;
  15880. strcat ( szReport, dStrings[i].cstr() ); // NOLINT
  15881. iLen += dStrings[i].Length();
  15882. if ( i < dStrings.GetLength()-1 )
  15883. {
  15884. szReport[iLen] = ' ';
  15885. szReport[iLen+1] = '\0';
  15886. } else
  15887. szReport[iLen] = '\0';
  15888. }
  15889. return szReport;
  15890. }
  15891. CSphWordforms * CSphDictCRCTraits::GetWordformContainer ( const CSphVector<CSphSavedFile> & dFileInfos,
  15892. const CSphVector<CSphString> * pEmbedded, const ISphTokenizer * pTokenizer, const char * sIndex )
  15893. {
  15894. ARRAY_FOREACH ( i, m_dWordformContainers )
  15895. if ( m_dWordformContainers[i]->IsEqual ( dFileInfos ) )
  15896. {
  15897. CSphWordforms * pContainer = m_dWordformContainers[i];
  15898. if ( pTokenizer->GetSettingsFNV()==pContainer->m_uTokenizerFNV )
  15899. return pContainer;
  15900. CSphVector<CSphString> dErrorReport;
  15901. ARRAY_FOREACH ( j, dFileInfos )
  15902. dErrorReport.Add ( dFileInfos[j].m_sFilename );
  15903. const char * szAllFiles = ConcatReportStrings ( dErrorReport );
  15904. sphWarning ( "index '%s': wordforms file '%s' is shared with index '%s', "
  15905. "but tokenizer settings are different; IGNORING wordforms",
  15906. sIndex, szAllFiles, pContainer->m_sIndexName.cstr() );
  15907. return NULL;
  15908. }
  15909. CSphWordforms * pContainer = LoadWordformContainer ( dFileInfos, pEmbedded, pTokenizer, sIndex );
  15910. if ( pContainer )
  15911. m_dWordformContainers.Add ( pContainer );
  15912. return pContainer;
  15913. }
  /// Parse one wordforms line and register it in pContainer.
  ///
  /// Accepted shapes, as far as this parser shows: "[~]src > dst", "[~]src => dst",
  /// and multi-token sources "key tok2 ... tokN > dst"; '#' starts a comment.
  /// Single-source forms go to pContainer->m_dHash / m_dNormalForms, multi-token
  /// ones to pContainer->m_pMultiWordforms keyed by the first source token.
  /// Malformed lines, stopwords and duplicates are reported through sphWarning().
  ///
  /// @param pContainer  container receiving the parsed wordform
  /// @param sBuffer     the line to parse (also quoted verbatim in warnings)
  /// @param iLen        length of sBuffer
  /// @param pTokenizer  tokenizer used to split the line; its buffer gets replaced
  /// @param szFile      file name, used for warnings only
  15914. void CSphDictCRCTraits::AddWordform ( CSphWordforms * pContainer, char * sBuffer, int iLen,
  15915. ISphTokenizer * pTokenizer, const char * szFile )
  15916. {
  15917. CSphString sFrom;
  15918. bool bSeparatorFound = false;
  // skip leading whitespace; a leading '~' flags the form as "after morphology"
  15919. const char * pStart = sBuffer;
  15920. while ( *pStart && sphIsSpace(*pStart) )
  15921. pStart++;
  15922. bool bAfterMorphology = *pStart=='~';
  15923. if ( bAfterMorphology )
  15924. pStart++;
  15925. // parse the line
  15926. pTokenizer->SetBuffer ( (BYTE*)pStart, iLen-(pStart-sBuffer) );
  15927. CSphScopedPtr<CSphMultiform> tMultiWordform ( NULL );
  15928. CSphString sKey;
  15929. bool bStopwordsPresent = false;
  15930. BYTE * pFrom = NULL;
  // collect source tokens until the separator, a comment, or the end of line
  15931. while ( ( pFrom = pTokenizer->GetToken () )!=NULL )
  15932. {
  // a lone '#' token starts a comment
  15933. if ( *pFrom=='#' && pTokenizer->GetLastTokenLen()==1 )
  15934. break;
  // peek at the raw buffer right after the token to see what follows it
  15935. const BYTE * pCur = (const BYTE *) pTokenizer->GetBufferPtr ();
  15936. while ( isspace(*pCur) ) pCur++;
  15937. if ( *pCur=='>' || ( *pCur=='=' && *(pCur+1)=='>' ) )
  15938. {
  // separator follows: this token is the last (or the only) source token
  15939. sFrom = (const char*)pFrom;
  15940. bSeparatorFound = true;
  // step over ">" (1 char) or "=>" (2 chars)
  15941. pTokenizer->SetBufferPtr ( (const char*) pCur+(*pCur=='=' ? 2 : 1) );
  15942. break;
  15943. } else if ( *pCur=='#' )
  15944. break;
  15945. else
  15946. {
  // more source tokens follow, so this is a multi-token wordform:
  // the first token becomes the key, the following ones go to m_dTokens
  15947. if ( !tMultiWordform.Ptr() )
  15948. {
  15949. tMultiWordform = new CSphMultiform;
  15950. sKey = (const char*)pFrom;
  15951. } else
  15952. {
  15953. tMultiWordform->m_dTokens.Add ( (const char*)pFrom );
  // a zero word ID is treated as "this token is a stopword"
  15954. if ( !bStopwordsPresent && !GetWordID ( pFrom, tMultiWordform->m_dTokens.Last().Length(), true ) )
  15955. bStopwordsPresent = true;
  15956. }
  15957. }
  15958. }
  // empty line or comment-only line: nothing to add, no warning
  15959. if ( !pFrom || *pFrom=='#' )
  15960. return;
  15961. if ( !bSeparatorFound )
  15962. {
  15963. sphWarning ( "index '%s': no wordform separator found ( wordform='%s' ). Fix your wordforms file '%s'.",
  15964. pContainer->m_sIndexName.cstr(), sBuffer, szFile );
  15965. return;
  15966. }
  // the token after the separator is the destination (normal) form
  15967. BYTE * pTo = pTokenizer->GetToken ();
  15968. if ( !pTo )
  15969. {
  15970. sphWarning ( "index '%s': no destination token found ( wordform='%s' ). Fix your wordforms file '%s'.",
  15971. pContainer->m_sIndexName.cstr(), sBuffer, szFile );
  15972. return;
  15973. }
  15974. if ( *pTo=='#' )
  15975. {
  15976. sphWarning ( "index '%s': misplaced comment ( wordform='%s' ). Fix your wordforms file '%s'.",
  15977. pContainer->m_sIndexName.cstr(), sBuffer, szFile );
  15978. return;
  15979. }
  15980. CSphString sTo ( (const char *)pTo );
  15981. if ( tMultiWordform.Ptr() )
  15982. {
  15983. if ( bAfterMorphology )
  15984. {
  15985. sphWarning ( "index '%s': '~' modifier is incompatible with wordforms "
  15986. "that have several source words ( wordform='%s' ). Fix your wordforms file '%s'.",
  15987. pContainer->m_sIndexName.cstr(), sBuffer, szFile );
  15988. return;
  15989. }
  // the token right before the separator is the last source token
  15990. tMultiWordform->m_dTokens.Add ( sFrom );
  15991. bool bToIsStopword = !GetWordID ( pTo, sTo.Length(), true );
  15992. bool bKeyIsStopword = !GetWordID ( (BYTE *)sKey.cstr(), sKey.Length(), true );
  15993. if ( bToIsStopword || bStopwordsPresent || bKeyIsStopword )
  15994. {
  15995. const char * szStopwordReport = ConcatReportStrings ( tMultiWordform->m_dTokens );
  15996. sphWarning ( "index '%s': wordforms contain stopwords ( wordform='%s %s> %s' ). Fix your wordforms file '%s'.",
  15997. pContainer->m_sIndexName.cstr(), sKey.cstr(), szStopwordReport, sTo.cstr(), szFile );
  15998. }
  // a stopword destination makes the whole form unusable
  15999. if ( bToIsStopword )
  16000. return;
  // otherwise strip the stopwords out of the source token list
  // (i-- compensates for the element shift caused by Remove)
  16001. if ( bStopwordsPresent )
  16002. ARRAY_FOREACH ( i, tMultiWordform->m_dTokens )
  16003. if ( !GetWordID ( (BYTE *)( tMultiWordform->m_dTokens[i].cstr() ), tMultiWordform->m_dTokens[i].Length(), true ) )
  16004. {
  16005. tMultiWordform->m_dTokens.Remove(i);
  16006. i--;
  16007. }
  // a stopword key is replaced with the first surviving token, if there is one
  16008. if ( bKeyIsStopword )
  16009. {
  16010. if ( tMultiWordform->m_dTokens.GetLength() )
  16011. {
  16012. sKey = tMultiWordform->m_dTokens[0];
  16013. tMultiWordform->m_dTokens.Remove(0);
  16014. } else
  16015. return;
  16016. }
  // only the key is left: degrade to a regular single-token wordform "sKey > sTo"
  16017. if ( !tMultiWordform->m_dTokens.GetLength() )
  16018. {
  16019. tMultiWordform.Reset();
  16020. sFrom = sKey;
  16021. }
  16022. } else
  16023. {
  // single-token form: both the source and the destination must be non-stopwords
  16024. if ( !GetWordID ( (BYTE *)sFrom.cstr(), sFrom.Length(), true ) || !GetWordID ( pTo, sTo.Length(), true ) )
  16025. {
  16026. sphWarning ( "index '%s': wordforms contain stopwords ( wordform='%s' ). Fix your wordforms file '%s'.",
  16027. pContainer->m_sIndexName.cstr(), sBuffer, szFile );
  16028. return;
  16029. }
  16030. }
  // NOTE(review): for multi-token forms the lookup below uses sTo, but its result
  // is only consumed by the assert in that case; single-token forms look up sFrom
  16031. const CSphString & sSourceWordform = tMultiWordform.Ptr() ? sTo : sFrom;
  16032. // check wordform that source token is a new token or has same destination token
  16033. int * pRefTo = pContainer->m_dHash ( sSourceWordform );
  16034. assert ( !pRefTo || ( *pRefTo>=0 && *pRefTo<pContainer->m_dNormalForms.GetLength() ) );
  // single-token form with an already known source: warn, override if it differs
  16035. if ( !tMultiWordform.Ptr() && pRefTo )
  16036. {
  16037. // replace with a new wordform
  16038. if ( pContainer->m_dNormalForms[*pRefTo].m_sWord!=sTo || pContainer->m_dNormalForms[*pRefTo].m_bAfterMorphology!=bAfterMorphology )
  16039. {
  16040. CSphStoredNF & tRefTo = pContainer->m_dNormalForms[*pRefTo];
  16041. sphWarning ( "index '%s': duplicate wordform found - overridden ( current='%s', old='%s%s > %s' ). Fix your wordforms file '%s'.",
  16042. pContainer->m_sIndexName.cstr(), sBuffer, tRefTo.m_bAfterMorphology ? "~" : "", sSourceWordform.cstr(), tRefTo.m_sWord.cstr(), szFile );
  16043. tRefTo.m_sWord = sTo;
  16044. tRefTo.m_bAfterMorphology = bAfterMorphology;
  16045. pContainer->m_bHavePostMorphNF |= bAfterMorphology;
  16046. } else
  16047. sphWarning ( "index '%s': duplicate wordform found ( '%s' ). Fix your wordforms file '%s'.",
  16048. pContainer->m_sIndexName.cstr(), sBuffer, szFile );
  16049. return;
  16050. }
  // single-token form with a new source: store it
  16051. if ( !pRefTo && !tMultiWordform.Ptr() )
  16052. {
  16053. CSphStoredNF tForm;
  16054. tForm.m_sWord = sTo;
  16055. tForm.m_bAfterMorphology = bAfterMorphology;
  16056. pContainer->m_bHavePostMorphNF |= bAfterMorphology;
  // reuse the most recently added normal form when it is identical,
  // so consecutive lines with the same destination share one entry
  16057. if ( !pContainer->m_dNormalForms.GetLength()
  16058. || pContainer->m_dNormalForms.Last().m_sWord!=sTo
  16059. || pContainer->m_dNormalForms.Last().m_bAfterMorphology!=bAfterMorphology)
  16060. pContainer->m_dNormalForms.Add ( tForm );
  // the hash maps the source word to the index of its normal form
  16061. pContainer->m_dHash.Add ( pContainer->m_dNormalForms.GetLength()-1, sSourceWordform );
  16062. }
  16063. if ( tMultiWordform.Ptr() )
  16064. {
  // from here on the multiform is owned manually: it is either stored in the
  // container or deleted below
  16065. CSphMultiform * pMultiWordform = tMultiWordform.LeakPtr();
  16066. pMultiWordform->m_sNormalForm = sTo;
  // the last token fetched from the tokenizer was the destination (pTo)
  16067. pMultiWordform->m_iNormalTokenLen = pTokenizer->GetLastTokenLen ();
  16068. if ( !pContainer->m_pMultiWordforms )
  16069. pContainer->m_pMultiWordforms = new CSphMultiformContainer;
  16070. CSphMultiforms ** pWordforms = pContainer->m_pMultiWordforms->m_Hash ( sKey );
  16071. if ( pWordforms )
  16072. {
  // the key already has forms: look for one with exactly the same token sequence
  16073. ARRAY_FOREACH ( iMultiform, (*pWordforms)->m_pForms )
  16074. {
  16075. CSphMultiform * pStoredMF = (*pWordforms)->m_pForms[iMultiform];
  16076. if ( pStoredMF->m_dTokens.GetLength()==pMultiWordform->m_dTokens.GetLength() )
  16077. {
  16078. bool bSameTokens = true;
  16079. ARRAY_FOREACH_COND ( iToken, pStoredMF->m_dTokens, bSameTokens )
  16080. if ( pStoredMF->m_dTokens[iToken]!=pMultiWordform->m_dTokens[iToken] )
  16081. bSameTokens = false;
  16082. if ( bSameTokens )
  16083. {
  // duplicate: override the stored destination and drop the new object
  16084. const char * szStoredTokens = ConcatReportStrings ( pStoredMF->m_dTokens );
  16085. sphWarning ( "index '%s': duplicate wordform found - overridden ( current='%s', old='%s %s > %s' ). Fix your wordforms file '%s'.",
  16086. pContainer->m_sIndexName.cstr(), sBuffer, sKey.cstr(), szStoredTokens, pStoredMF->m_sNormalForm.cstr(), szFile );
  16087. pStoredMF->m_iNormalTokenLen = pMultiWordform->m_iNormalTokenLen;
  16088. pStoredMF->m_sNormalForm = pMultiWordform->m_sNormalForm;
  16089. SafeDelete ( pMultiWordform );
  16090. break; // otherwise, we crash next turn
  16091. }
  16092. }
  16093. }
  // still non-NULL means no duplicate was found: append and refresh token count bounds
  16094. if ( pMultiWordform )
  16095. {
  16096. (*pWordforms)->m_pForms.Add ( pMultiWordform );
  16097. (*pWordforms)->m_iMinTokens = Min ( (*pWordforms)->m_iMinTokens, pMultiWordform->m_dTokens.GetLength () );
  16098. (*pWordforms)->m_iMaxTokens = Max ( (*pWordforms)->m_iMaxTokens, pMultiWordform->m_dTokens.GetLength () );
  16099. pContainer->m_pMultiWordforms->m_iMaxTokens = Max ( pContainer->m_pMultiWordforms->m_iMaxTokens, (*pWordforms)->m_iMaxTokens );
  16100. }
  16101. } else
  16102. {
  // first form for this key: create a new group holding just this form
  16103. CSphMultiforms * pNewWordforms = new CSphMultiforms;
  16104. pNewWordforms->m_pForms.Add ( pMultiWordform );
  16105. pNewWordforms->m_iMinTokens = pMultiWordform->m_dTokens.GetLength ();
  16106. pNewWordforms->m_iMaxTokens = pMultiWordform->m_dTokens.GetLength ();
  16107. pContainer->m_pMultiWordforms->m_iMaxTokens = Max ( pContainer->m_pMultiWordforms->m_iMaxTokens, pNewWordforms->m_iMaxTokens );
  16108. pContainer->m_pMultiWordforms->m_Hash.Add ( pNewWordforms, sKey );
  16109. }
  16110. }
  16111. }
  16112. CSphWordforms * CSphDictCRCTraits::LoadWordformContainer ( const CSphVector<CSphSavedFile> & dFileInfos,
  16113. const CSphVector<CSphString> * pEmbeddedWordforms, const ISphTokenizer * pTokenizer, const char * sIndex )
  16114. {
  16115. // allocate it
  16116. CSphWordforms * pContainer = new CSphWordforms();
  16117. pContainer->m_dFiles = dFileInfos;
  16118. pContainer->m_uTokenizerFNV = pTokenizer->GetSettingsFNV();
  16119. pContainer->m_sIndexName = sIndex;
  16120. // my tokenizer
  16121. CSphScopedPtr<ISphTokenizer> pMyTokenizer ( pTokenizer->Clone ( SPH_CLONE_INDEX ) );
  16122. pMyTokenizer->AddSpecials ( "#=>" );
  16123. if ( pEmbeddedWordforms )
  16124. {
  16125. CSphVector<CSphString> dFilenames;
  16126. dFilenames.Resize ( dFileInfos.GetLength() );
  16127. ARRAY_FOREACH ( i, dFileInfos )
  16128. dFilenames[i] = dFileInfos[i].m_sFilename;
  16129. CSphString sAllFiles = ConcatReportStrings ( dFilenames );
  16130. ARRAY_FOREACH ( i, (*pEmbeddedWordforms) )
  16131. AddWordform ( pContainer, (char*)(*pEmbeddedWordforms)[i].cstr(),
  16132. (*pEmbeddedWordforms)[i].Length(), pMyTokenizer.Ptr(), sAllFiles.cstr() );
  16133. } else
  16134. {
  16135. char sBuffer [ 6*SPH_MAX_WORD_LEN + 512 ]; // enough to hold 2 UTF-8 words, plus some whitespace overhead
  16136. ARRAY_FOREACH ( i, dFileInfos )
  16137. {
  16138. CSphAutoreader rdWordforms;
  16139. const char * szFile = dFileInfos[i].m_sFilename.cstr();
  16140. CSphString sError;
  16141. if ( !rdWordforms.Open ( szFile, sError ) )
  16142. {
  16143. sphWarning ( "index '%s': %s", sIndex, sError.cstr() );
  16144. return NULL;
  16145. }
  16146. int iLen;
  16147. while ( ( iLen = rdWordforms.GetLine ( sBuffer, sizeof(sBuffer) ) )>=0 )
  16148. AddWordform ( pContainer, sBuffer, iLen, pMyTokenizer.Ptr(), szFile );
  16149. }
  16150. }
  16151. return pContainer;
  16152. }
  16153. bool CSphDictCRCTraits::LoadWordforms ( const CSphVector<CSphString> & dFiles,
  16154. const CSphEmbeddedFiles * pEmbedded, const ISphTokenizer * pTokenizer, const char * sIndex )
  16155. {
  16156. if ( pEmbedded )
  16157. {
  16158. m_dWFFileInfos.Resize ( pEmbedded->m_dWordformFiles.GetLength() );
  16159. ARRAY_FOREACH ( i, m_dWFFileInfos )
  16160. m_dWFFileInfos[i] = pEmbedded->m_dWordformFiles[i];
  16161. } else
  16162. {
  16163. m_dWFFileInfos.Reserve ( dFiles.GetLength() );
  16164. CSphSavedFile tFile;
  16165. ARRAY_FOREACH ( i, dFiles )
  16166. if ( !dFiles[i].IsEmpty() )
  16167. {
  16168. if ( GetFileStats ( dFiles[i].cstr(), tFile ) )
  16169. m_dWFFileInfos.Add ( tFile );
  16170. else
  16171. sphWarning ( "index '%s': wordforms file '%s' not found", sIndex, dFiles[i].cstr() );
  16172. }
  16173. }
  16174. if ( !m_dWFFileInfos.GetLength() )
  16175. return false;
  16176. SweepWordformContainers ( m_dWFFileInfos );
  16177. m_pWordforms = GetWordformContainer ( m_dWFFileInfos, pEmbedded ? &(pEmbedded->m_dWordforms) : NULL, pTokenizer, sIndex );
  16178. if ( m_pWordforms )
  16179. {
  16180. m_pWordforms->m_iRefCount++;
  16181. if ( m_pWordforms->m_bHavePostMorphNF && !m_dMorph.GetLength() )
  16182. sphWarning ( "index '%s': wordforms contain post-morphology normal forms, but no morphology was specified", sIndex );
  16183. }
  16184. return !!m_pWordforms;
  16185. }
  16186. void CSphDictCRCTraits::WriteWordforms ( CSphWriter & tWriter )
  16187. {
  16188. if ( !m_pWordforms )
  16189. {
  16190. tWriter.PutDword(0);
  16191. return;
  16192. }
  16193. int nMultiforms = 0;
  16194. if ( m_pWordforms->m_pMultiWordforms )
  16195. {
  16196. CSphMultiformContainer::CSphMultiformHash & tHash = m_pWordforms->m_pMultiWordforms->m_Hash;
  16197. tHash.IterateStart();
  16198. while ( tHash.IterateNext() )
  16199. {
  16200. CSphMultiforms * pMF = tHash.IterateGet();
  16201. nMultiforms += pMF ? pMF->m_pForms.GetLength() : 0;
  16202. }
  16203. }
  16204. tWriter.PutDword ( m_pWordforms->m_dHash.GetLength()+nMultiforms );
  16205. m_pWordforms->m_dHash.IterateStart();
  16206. while ( m_pWordforms->m_dHash.IterateNext() )
  16207. {
  16208. const CSphString & sKey = m_pWordforms->m_dHash.IterateGetKey();
  16209. int iIndex = m_pWordforms->m_dHash.IterateGet();
  16210. CSphString sLine;
  16211. sLine.SetSprintf ( "%s%s > %s", m_pWordforms->m_dNormalForms[iIndex].m_bAfterMorphology ? "~" : "",
  16212. sKey.cstr(), m_pWordforms->m_dNormalForms[iIndex].m_sWord.cstr() );
  16213. tWriter.PutString ( sLine );
  16214. }
  16215. if ( m_pWordforms->m_pMultiWordforms )
  16216. {
  16217. CSphMultiformContainer::CSphMultiformHash & tHash = m_pWordforms->m_pMultiWordforms->m_Hash;
  16218. tHash.IterateStart();
  16219. while ( tHash.IterateNext() )
  16220. {
  16221. const CSphString & sKey = tHash.IterateGetKey();
  16222. CSphMultiforms * pMF = tHash.IterateGet();
  16223. if ( !pMF )
  16224. continue;
  16225. ARRAY_FOREACH ( i, pMF->m_pForms )
  16226. {
  16227. CSphString sLine;
  16228. const char * szTokens = ConcatReportStrings ( pMF->m_pForms[i]->m_dTokens );
  16229. sLine.SetSprintf ( "%s %s > %s", sKey.cstr(), szTokens, pMF->m_pForms[i]->m_sNormalForm.cstr() );
  16230. tWriter.PutString ( sLine );
  16231. }
  16232. }
  16233. }
  16234. }
  16235. int CSphDictCRCTraits::SetMorphology ( const char * szMorph, bool bUseUTF8, CSphString & sMessage )
  16236. {
  16237. m_dMorph.Reset ();
  16238. #if USE_LIBSTEMMER
  16239. ARRAY_FOREACH ( i, m_dStemmers )
  16240. sb_stemmer_delete ( m_dStemmers[i] );
  16241. m_dStemmers.Reset ();
  16242. #endif
  16243. if ( !szMorph )
  16244. return ST_OK;
  16245. CSphString sOption = szMorph;
  16246. sOption.ToLower ();
  16247. CSphString sError;
  16248. int iRes = ParseMorphology ( sOption.cstr(), bUseUTF8, sMessage );
  16249. if ( iRes==ST_WARNING && sMessage.IsEmpty() )
  16250. sMessage.SetSprintf ( "invalid morphology option %s; skipped", sOption.cstr() );
  16251. return iRes;
  16252. }
  16253. bool CSphDictCRCTraits::HasMorphology() const
  16254. {
  16255. return ( m_dMorph.GetLength()>0 );
  16256. }
  16257. /// common id-based stemmer
  16258. bool CSphDictCRCTraits::StemById ( BYTE * pWord, int iStemmer )
  16259. {
  16260. char szBuf [ MAX_KEYWORD_BYTES ];
  16261. // safe quick strncpy without (!) padding and with a side of strlen
  16262. char * p = szBuf;
  16263. char * pMax = szBuf + sizeof(szBuf) - 1;
  16264. BYTE * pLastSBS = NULL;
  16265. while ( *pWord && p<pMax )
  16266. {
  16267. pLastSBS = ( *pWord )<0x80 ? pWord : pLastSBS;
  16268. *p++ = *pWord++;
  16269. }
  16270. int iLen = p - szBuf;
  16271. *p = '\0';
  16272. pWord -= iLen;
  16273. switch ( iStemmer )
  16274. {
  16275. case SPH_MORPH_STEM_EN:
  16276. stem_en ( pWord, iLen );
  16277. break;
  16278. case SPH_MORPH_STEM_RU_CP1251:
  16279. stem_ru_cp1251 ( pWord );
  16280. break;
  16281. case SPH_MORPH_STEM_RU_UTF8:
  16282. // skip stemming in case of SBC at the end of the word
  16283. if ( pLastSBS && ( pLastSBS-pWord+1 )>=iLen )
  16284. break;
  16285. // stem only UTF8 tail
  16286. if ( !pLastSBS )
  16287. {
  16288. stem_ru_utf8 ( (WORD*)pWord );
  16289. } else
  16290. {
  16291. stem_ru_utf8 ( (WORD *)( pLastSBS+1 ) );
  16292. }
  16293. break;
  16294. case SPH_MORPH_STEM_CZ:
  16295. stem_cz ( pWord );
  16296. break;
  16297. case SPH_MORPH_STEM_AR_UTF8:
  16298. stem_ar_utf8 ( pWord );
  16299. break;
  16300. case SPH_MORPH_SOUNDEX:
  16301. stem_soundex ( pWord );
  16302. break;
  16303. case SPH_MORPH_METAPHONE_SBCS:
  16304. stem_dmetaphone ( pWord, false );
  16305. break;
  16306. case SPH_MORPH_METAPHONE_UTF8:
  16307. stem_dmetaphone ( pWord, true );
  16308. break;
  16309. case SPH_MORPH_AOTLEMMER_RU_CP1251:
  16310. sphAotLemmatizeRu1251 ( pWord );
  16311. break;
  16312. case SPH_MORPH_AOTLEMMER_RU_UTF8:
  16313. sphAotLemmatizeRuUTF8 ( pWord );
  16314. break;
  16315. case SPH_MORPH_AOTLEMMER_RU_ALL:
  16316. // do the real work somewhere else
  16317. // this is mostly for warning suppressing and making some features like
  16318. // index_exact_words=1 vs expand_keywords=1 work
  16319. break;
  16320. default:
  16321. #if USE_LIBSTEMMER
  16322. if ( iStemmer>=SPH_MORPH_LIBSTEMMER_FIRST && iStemmer<SPH_MORPH_LIBSTEMMER_LAST )
  16323. {
  16324. sb_stemmer * pStemmer = m_dStemmers [iStemmer - SPH_MORPH_LIBSTEMMER_FIRST];
  16325. assert ( pStemmer );
  16326. const sb_symbol * sStemmed = sb_stemmer_stem ( pStemmer, (sb_symbol*)pWord, strlen ( (const char*)pWord ) );
  16327. int iLen = sb_stemmer_length ( pStemmer );
  16328. memcpy ( pWord, sStemmed, iLen );
  16329. pWord[iLen] = '\0';
  16330. } else
  16331. return false;
  16332. break;
  16333. #else
  16334. return false;
  16335. #endif
  16336. }
  16337. return strcmp ( (char *)pWord, szBuf )!=0;
  16338. }
  /// Start writing a dictionary file.
  /// Closes whatever m_wrDict had open, attaches it to tDict (writer errors go
  /// to m_sWriterError), applies the throttle and emits one leading byte.
  /// The first and the third parameter are unused here.
  16339. void CSphDictCRCTraits::DictBegin ( CSphAutofile & , CSphAutofile & tDict, int, ThrottleState_t * pThrottle )
  16340. {
  16341. m_wrDict.CloseFile ();
  16342. m_wrDict.SetFile ( tDict, NULL, m_sWriterError );
  16343. m_wrDict.SetThrottle ( pThrottle );
  // NOTE(review): presumably a dummy byte so that no wordlist entry can start
  // at offset 0 (DictEnd asserts non-zero checkpoint offsets) - confirm
  16344. m_wrDict.PutByte ( 1 );
  16345. }
  16346. bool CSphDictCRCTraits::DictEnd ( DictHeader_t * pHeader, int, CSphString & sError, ThrottleState_t * )
  16347. {
  16348. // flush wordlist checkpoints
  16349. pHeader->m_iDictCheckpointsOffset = m_wrDict.GetPos();
  16350. pHeader->m_iDictCheckpoints = m_dCheckpoints.GetLength();
  16351. ARRAY_FOREACH ( i, m_dCheckpoints )
  16352. {
  16353. assert ( m_dCheckpoints[i].m_iWordlistOffset );
  16354. m_wrDict.PutOffset ( m_dCheckpoints[i].m_iWordID );
  16355. m_wrDict.PutOffset ( m_dCheckpoints[i].m_iWordlistOffset );
  16356. }
  16357. // done
  16358. m_wrDict.CloseFile ();
  16359. if ( m_wrDict.IsError() )
  16360. sError = m_sWriterError;
  16361. return !m_wrDict.IsError();
  16362. }
  16363. void CSphDictCRCTraits::DictEntry ( const CSphDictEntry & tEntry )
  16364. {
  16365. // insert wordlist checkpoint
  16366. if ( ( m_iEntries % SPH_WORDLIST_CHECKPOINT )==0 )
  16367. {
  16368. if ( m_iEntries ) // but not the 1st entry
  16369. {
  16370. assert ( tEntry.m_iDoclistOffset > m_iLastDoclistPos );
  16371. m_wrDict.ZipInt ( 0 ); // indicate checkpoint
  16372. m_wrDict.ZipOffset ( tEntry.m_iDoclistOffset - m_iLastDoclistPos ); // store last length
  16373. }
  16374. // restart delta coding, once per SPH_WORDLIST_CHECKPOINT entries
  16375. m_iLastWordID = 0;
  16376. m_iLastDoclistPos = 0;
  16377. // begin new wordlist entry
  16378. assert ( m_wrDict.GetPos()<=UINT_MAX );
  16379. CSphWordlistCheckpoint & tCheckpoint = m_dCheckpoints.Add();
  16380. tCheckpoint.m_iWordID = tEntry.m_uWordID;
  16381. tCheckpoint.m_iWordlistOffset = m_wrDict.GetPos();
  16382. }
  16383. assert ( tEntry.m_iDoclistOffset>m_iLastDoclistPos );
  16384. m_wrDict.ZipOffset ( tEntry.m_uWordID - m_iLastWordID ); // FIXME! slow with 32bit wordids
  16385. m_wrDict.ZipOffset ( tEntry.m_iDoclistOffset - m_iLastDoclistPos );
  16386. m_iLastWordID = tEntry.m_uWordID;
  16387. m_iLastDoclistPos = tEntry.m_iDoclistOffset;
  16388. assert ( tEntry.m_iDocs );
  16389. assert ( tEntry.m_iHits );
  16390. m_wrDict.ZipInt ( tEntry.m_iDocs );
  16391. m_wrDict.ZipInt ( tEntry.m_iHits );
  16392. // write skiplist location info, if any
  16393. if ( tEntry.m_iDocs > SPH_SKIPLIST_BLOCK )
  16394. m_wrDict.ZipOffset ( tEntry.m_iSkiplistOffset );
  16395. m_iEntries++;
  16396. }
  /// Terminate the last wordlist block.
  /// Writes the same trailer DictEntry() emits before starting a new block:
  /// a zero marker, then the distance from the last doclist position to
  /// iDoclistOffset.
  16397. void CSphDictCRCTraits::DictEndEntries ( SphOffset_t iDoclistOffset )
  16398. {
  16399. assert ( iDoclistOffset>=m_iLastDoclistPos );
  16400. m_wrDict.ZipInt ( 0 ); // indicate checkpoint
  16401. m_wrDict.ZipOffset ( iDoclistOffset - m_iLastDoclistPos ); // store last doclist length
  16402. }
  16403. //////////////////////////////////////////////////////////////////////////
  16404. // KEYWORDS STORING DICTIONARY, INFIX HASH BUILDER
  16405. //////////////////////////////////////////////////////////////////////////
  16406. template < int SIZE >
  16407. struct Infix_t
  16408. {
  16409. DWORD m_Data[SIZE];
  16410. #ifndef NDEBUG
  16411. BYTE m_TrailingZero;
  16412. Infix_t ()
  16413. : m_TrailingZero ( 0 )
  16414. {}
  16415. #endif
  16416. void Reset ()
  16417. {
  16418. for ( int i=0; i<SIZE; i++ )
  16419. m_Data[i] = 0;
  16420. }
  16421. bool operator == ( const Infix_t<SIZE> & rhs ) const;
  16422. #if 0
  16423. bool operator == ( const Infix_t<SIZE> & rhs ) const
  16424. {
  16425. for ( int i=0; i<SIZE; i++ )
  16426. if ( m_Data[i]!=rhs.m_Data[i] )
  16427. return false;
  16428. return true;
  16429. }
  16430. #endif
  16431. };
  16432. template<>
  16433. bool Infix_t<2>::operator == ( const Infix_t<2> & rhs ) const
  16434. {
  16435. return m_Data[0]==rhs.m_Data[0] && m_Data[1]==rhs.m_Data[1];
  16436. };
  16437. template<>
  16438. bool Infix_t<3>::operator == ( const Infix_t<3> & rhs ) const
  16439. {
  16440. return m_Data[0]==rhs.m_Data[0] && m_Data[1]==rhs.m_Data[1] && m_Data[2]==rhs.m_Data[2];
  16441. };
  16442. template<>
  16443. bool Infix_t<5>::operator == ( const Infix_t<5> & rhs ) const
  16444. {
  16445. return m_Data[0]==rhs.m_Data[0] && m_Data[1]==rhs.m_Data[1] && m_Data[2]==rhs.m_Data[2]
  16446. && m_Data[3]==rhs.m_Data[3] && m_Data[4]==rhs.m_Data[4];
  16447. };
  /// Compact list of dword values with a small inline mode and a heap mode.
  ///
  /// Inline ("static") mode: m_dData[0] packs the entry count in its top byte
  /// (bits 24 and up) and the first value in its low 24 bits; m_dData[1..3]
  /// hold up to three more values.
  /// Heap ("dynamic") mode: the top bit of the first dword is set, and the
  /// overlaid fields hold the length (low 24 bits of m_iDynLen), the allocated
  /// capacity (m_iDynLimit) and the heap array (m_pDynData).
  /// NOTE(review): no copy constructor or assignment is declared in this struct,
  /// while the destructor frees m_pDynData; presumably instances are only ever
  /// moved around with Swap() - confirm.
  16448. struct InfixIntvec_t
  16449. {
  16450. public:
  16451. union
  16452. {
  // inline storage; m_dData[0] doubles as the header in both modes
  16453. DWORD m_dData[4];
  16454. struct
  16455. {
  // dynamic-mode view, overlaid on top of m_dData
  16456. int m_iDynLen;
  16457. int m_iDynLimit;
  16458. DWORD * m_pDynData;
  16459. };
  16460. };
  16461. public:
  /// start out empty, in inline mode
  16462. InfixIntvec_t()
  16463. {
  16464. m_dData[0] = 0;
  16465. m_dData[1] = 0;
  16466. m_dData[2] = 0;
  16467. m_dData[3] = 0;
  16468. }
  /// free the heap array, if there is one
  16469. ~InfixIntvec_t()
  16470. {
  16471. if ( IsDynamic() )
  16472. SafeDeleteArray ( m_pDynData );
  16473. }
  /// heap mode is flagged by the top bit of the first dword
  16474. bool IsDynamic() const
  16475. {
  16476. return ( m_dData[0] & 0x80000000UL )!=0;
  16477. }
  /// Append a value, unless it equals the most recently stored one.
  /// NOTE(review): the first inline value shares a dword with the length byte,
  /// so uVal is presumably expected to fit into 24 bits - confirm.
  16478. void Add ( DWORD uVal )
  16479. {
  16480. if ( !m_dData[0] )
  16481. {
  16482. // empty
  16483. m_dData[0] = uVal | ( 1UL<<24 );
  16484. } else if ( !IsDynamic() )
  16485. {
  16486. // 1..4 static entries
  16487. int iLen = m_dData[0] >> 24;
  16488. DWORD uLast = m_dData [ iLen-1 ] & 0xffffffUL;
  16489. // redundant
  16490. if ( uVal==uLast )
  16491. return;
  16492. // grow static part
  16493. if ( iLen<4 )
  16494. {
  16495. m_dData[iLen] = uVal;
  // keep the first value, bump the length stored in the top byte
  16496. m_dData[0] = ( m_dData[0] & 0xffffffUL ) | ( ++iLen<<24 );
  16497. return;
  16498. }
  16499. // dynamize
  // copy the four inline values (the first one without its length byte),
  // then add the new one; only after that overwrite the inline storage
  16500. DWORD * pDyn = new DWORD[16];
  16501. pDyn[0] = m_dData[0] & 0xffffffUL;
  16502. pDyn[1] = m_dData[1];
  16503. pDyn[2] = m_dData[2];
  16504. pDyn[3] = m_dData[3];
  16505. pDyn[4] = uVal;
  16506. m_iDynLen = 0x80000005UL; // dynamic flag, len=5
  16507. m_iDynLimit = 16; // limit=16
  16508. m_pDynData = pDyn;
  16509. } else
  16510. {
  16511. // N dynamic entries
  16512. int iLen = m_iDynLen & 0xffffffUL;
  16513. if ( uVal==m_pDynData[iLen-1] )
  16514. return;
  // out of room: double the capacity and move the data over
  16515. if ( iLen>=m_iDynLimit )
  16516. {
  16517. m_iDynLimit *= 2;
  16518. DWORD * pNew = new DWORD [ m_iDynLimit ];
  16519. for ( int i=0; i<iLen; i++ )
  16520. pNew[i] = m_pDynData[i];
  16521. SafeDeleteArray ( m_pDynData );
  16522. m_pDynData = pNew;
  16523. }
  16524. m_pDynData[iLen] = uVal;
  // the flag bit sits in the same field, incrementing only touches the length part
  16525. m_iDynLen++;
  16526. }
  16527. }
  /// element-wise comparison; lists in different modes never compare equal
  16528. bool operator == ( const InfixIntvec_t & rhs ) const
  16529. {
  16530. // check dynflag, length, maybe first element
  16531. if ( m_dData[0]!=rhs.m_dData[0] )
  16532. return false;
  16533. // check static data
  16534. if ( !IsDynamic() )
  16535. {
  16536. for ( int i=1; i<(int)(m_dData[0]>>24); i++ )
  16537. if ( m_dData[i]!=rhs.m_dData[i] )
  16538. return false;
  16539. return true;
  16540. }
  16541. // check dynamic data
  // lengths are known to match at this point, so one bound is enough
  16542. const DWORD * a = m_pDynData;
  16543. const DWORD * b = rhs.m_pDynData;
  16544. const DWORD * m = a + ( m_iDynLen & 0xffffffUL );
  16545. while ( a<m )
  16546. if ( *a++!=*b++ )
  16547. return false;
  16548. return true;
  16549. }
  16550. public:
  /// number of stored values
  16551. int GetLength() const
  16552. {
  16553. if ( !IsDynamic() )
  16554. return m_dData[0] >> 24;
  16555. return m_iDynLen & 0xffffffUL;
  16556. }
  /// Value at iIndex; no bounds checking is done.
  /// In inline mode every entry is masked to its low 24 bits.
  16557. DWORD operator[] ( int iIndex )const
  16558. {
  16559. if ( !IsDynamic() )
  16560. return m_dData[iIndex] & 0xffffffUL;
  16561. return m_pDynData[iIndex];
  16562. }
  16563. };
  16564. void Swap ( InfixIntvec_t & a, InfixIntvec_t & b )
  16565. {
  16566. ::Swap ( a.m_dData[0], b.m_dData[0] );
  16567. ::Swap ( a.m_dData[1], b.m_dData[1] );
  16568. ::Swap ( a.m_dData[2], b.m_dData[2] );
  16569. ::Swap ( a.m_dData[3], b.m_dData[3] );
  16570. }
  /// One entry of the infix hash: an infix key, the list of values collected
  /// for it, and the link to the next entry of the same bucket chain.
  16571. template < int SIZE >
  16572. struct InfixHashEntry_t
  16573. {
  16574. Infix_t<SIZE> m_tKey; ///< key, owned by the hash
  16575. InfixIntvec_t m_tValue; ///< data, owned by the hash
  16576. int m_iNext; ///< next entry in hash arena
  16577. };
  16578. template < int SIZE >
  16579. class InfixBuilder_c : public ISphInfixBuilder
  16580. {
  16581. protected:
  16582. static const int LENGTH = 1048576;
  16583. protected:
  16584. int m_dHash [ LENGTH ]; ///< all the hash entries
  16585. CSphSwapVector < InfixHashEntry_t<SIZE> > m_dArena;
  16586. CSphVector<InfixBlock_t> m_dBlocks;
  16587. CSphTightVector<BYTE> m_dBlocksWords;
  16588. public:
  16589. InfixBuilder_c();
  16590. virtual void AddWord ( const BYTE * pWord, int iWordLength, int iCheckpoint );
  16591. virtual void SaveEntries ( CSphWriter & wrDict );
  16592. virtual int SaveEntryBlocks ( CSphWriter & wrDict );
  16593. virtual int GetBlocksWordsSize () const { return m_dBlocksWords.GetLength(); }
  16594. protected:
  16595. /// add new entry
  16596. void AddEntry ( const Infix_t<SIZE> & tKey, DWORD uHash, int iCheckpoint )
  16597. {
  16598. uHash &= ( LENGTH-1 );
  16599. int iEntry = m_dArena.GetLength();
  16600. InfixHashEntry_t<SIZE> & tNew = m_dArena.Add();
  16601. tNew.m_tKey = tKey;
  16602. tNew.m_tValue.m_dData[0] = 0x1000000UL | iCheckpoint; // len=1, data=iCheckpoint
  16603. tNew.m_iNext = m_dHash[uHash];
  16604. m_dHash[uHash] = iEntry;
  16605. }
  16606. /// get value pointer by key
  16607. InfixIntvec_t * LookupEntry ( const Infix_t<SIZE> & tKey, DWORD uHash )
  16608. {
  16609. uHash &= ( LENGTH-1 );
  16610. int iEntry = m_dHash [ uHash ];
  16611. int iiEntry = 0;
  16612. while ( iEntry )
  16613. {
  16614. if ( m_dArena[iEntry].m_tKey==tKey )
  16615. {
  16616. // mtf it, if needed
  16617. if ( iiEntry )
  16618. {
  16619. m_dArena[iiEntry].m_iNext = m_dArena[iEntry].m_iNext;
  16620. m_dArena[iEntry].m_iNext = m_dHash[uHash];
  16621. m_dHash[uHash] = iEntry;
  16622. }
  16623. return &m_dArena[iEntry].m_tValue;
  16624. }
  16625. iiEntry = iEntry;
  16626. iEntry = m_dArena[iEntry].m_iNext;
  16627. }
  16628. return NULL;
  16629. }
  16630. };
  16631. template < int SIZE >
  16632. InfixBuilder_c<SIZE>::InfixBuilder_c()
  16633. {
  16634. // init the hash
  16635. for ( int i=0; i<LENGTH; i++ )
  16636. m_dHash[i] = 0;
  16637. m_dArena.Reserve ( 1048576 );
  16638. m_dArena.Resize ( 1 ); // 0 is a reserved index
  16639. }
  16640. /// single-byte case, 2-dword infixes
  16641. template<>
  16642. void InfixBuilder_c<2>::AddWord ( const BYTE * pWord, int iWordLength, int iCheckpoint )
  16643. {
  16644. Infix_t<2> sKey;
  16645. for ( int p=0; p<=iWordLength-2; p++ )
  16646. {
  16647. sKey.Reset();
  16648. BYTE * pKey = (BYTE*)sKey.m_Data;
  16649. const BYTE * s = pWord + p;
  16650. const BYTE * sMax = s + Min ( 6, iWordLength-p );
  16651. DWORD uHash = 0xffffffUL ^ g_dSphinxCRC32 [ 0xff ^ *s ];
  16652. *pKey++ = *s++; // copy first infix byte
  16653. while ( s<sMax )
  16654. {
  16655. uHash = (uHash >> 8) ^ g_dSphinxCRC32 [ (uHash ^ *s) & 0xff ];
  16656. *pKey++ = *s++; // copy another infix byte
  16657. InfixIntvec_t * pVal = LookupEntry ( sKey, uHash );
  16658. if ( pVal )
  16659. pVal->Add ( iCheckpoint );
  16660. else
  16661. AddEntry ( sKey, uHash, iCheckpoint );
  16662. }
  16663. }
  16664. }
  16665. /// UTF-8 case, 3/5-dword infixes
  16666. template < int SIZE >
  16667. void InfixBuilder_c<SIZE>::AddWord ( const BYTE * pWord, int iWordLength, int iCheckpoint )
  16668. {
  16669. int iCodes = 0; // codepoints in current word
  16670. BYTE dBytes[SPH_MAX_WORD_LEN+1]; // byte offset for each codepoints
  16671. // build an offsets table into the bytestring
  16672. dBytes[0] = 0;
  16673. for ( const BYTE * p = (const BYTE*)pWord; p<pWord+iWordLength; )
  16674. {
  16675. int iLen = 0;
  16676. BYTE uVal = *p;
  16677. while ( uVal & 0x80 )
  16678. {
  16679. uVal <<= 1;
  16680. iLen++;
  16681. }
  16682. if ( !iLen )
  16683. iLen = 1;
  16684. assert ( iLen>=1 && iLen<=3 );
  16685. p += iLen;
  16686. dBytes[iCodes+1] = dBytes[iCodes] + (BYTE)iLen;
  16687. iCodes++;
  16688. }
  16689. assert ( pWord[dBytes[iCodes]]==0 );
  16690. // generate infixes
  16691. Infix_t<SIZE> sKey;
  16692. for ( int p=0; p<=iCodes-2; p++ )
  16693. {
  16694. sKey.Reset();
  16695. BYTE * pKey = (BYTE*)sKey.m_Data;
  16696. const BYTE * s = pWord + dBytes[p];
  16697. const BYTE * sMax = pWord + dBytes[ p+Min ( 6, iCodes-p ) ];
  16698. // copy first infix codepoint
  16699. DWORD uHash = 0xffffffffUL;
  16700. do
  16701. {
  16702. uHash = (uHash >> 8) ^ g_dSphinxCRC32 [ (uHash ^ *s) & 0xff ];
  16703. *pKey++ = *s++;
  16704. } while ( ( *s & 0xC0 )==0x80 );
  16705. while ( s<sMax )
  16706. {
  16707. // copy next infix codepoint
  16708. do
  16709. {
  16710. uHash = (uHash >> 8) ^ g_dSphinxCRC32 [ (uHash ^ *s) & 0xff ];
  16711. *pKey++ = *s++;
  16712. } while ( ( *s & 0xC0 )==0x80 );
  16713. InfixIntvec_t * pVal = LookupEntry ( sKey, uHash );
  16714. if ( pVal )
  16715. pVal->Add ( iCheckpoint );
  16716. else
  16717. AddEntry ( sKey, uHash, iCheckpoint );
  16718. }
  16719. }
  16720. }
  16721. template < int SIZE >
  16722. struct InfixHashCmp_fn
  16723. {
  16724. InfixHashEntry_t<SIZE> * m_pBase;
  16725. explicit InfixHashCmp_fn ( InfixHashEntry_t<SIZE> * pBase )
  16726. : m_pBase ( pBase )
  16727. {}
  16728. bool IsLess ( int a, int b ) const
  16729. {
  16730. return strncmp ( (const char*)m_pBase[a].m_tKey.m_Data, (const char*)m_pBase[b].m_tKey.m_Data, sizeof(DWORD)*SIZE )<0;
  16731. }
  16732. };
  /// is first arg a (non-empty) prefix of second arg
  /// both arguments must be NUL-terminated; an empty first arg is never a prefix
  /// identical strings count as a prefix match
  static inline bool IsPrefix ( const char * a, const char * b )
  {
      if ( !*a )
          return false;

      // must also stop at the end of a: for identical strings the two terminators
      // compare equal, and an unchecked loop would keep reading past both buffers
      while ( *a && *a==*b )
      {
          a++;
          b++;
      }

      // the whole of a was consumed only if every byte of it matched
      return !*a;
  }
  /// number of bytes (1 to 5) needed for a value at 7 payload bits per byte
  static inline int ZippedIntSize ( DWORD v )
  {
      int iBytes = 1;
      // thresholds are 2^7, 2^14, 2^21, 2^28; each one the value reaches costs one more byte
      for ( int iBits=7; iBits<=28; iBits+=7 )
      {
          if ( v < (1UL<<iBits) )
              break;
          iBytes++;
      }
      return iBytes;
  }
  16757. #if USE_WINDOWS
  16758. #pragma warning(disable:4127) // conditional expr is const for MSVC
  16759. #endif
  16760. static const char * g_sTagInfixEntries = "infix-entries";
  /// Write the "infix-entries" tag followed by every arena entry, ordered by key.
  /// Per entry the output is: one edit code byte, the key bytes not shared with the
  /// previous key of the same block, the zipped byte length of the value data, and the
  /// value data itself (first value, then differences between consecutive values).
  /// Entries are grouped INFIX_BLOCK_SIZE at a time. For the first entry of every group
  /// the full key is appended (NUL-terminated) to m_dBlocksWords, and a record holding
  /// that key's offset plus the current writer position is appended to m_dBlocks.
  /// A zero byte is written after each completed group.
  16761. template < int SIZE >
  16762. void InfixBuilder_c<SIZE>::SaveEntries ( CSphWriter & wrDict )
  16763. {
  16764. // intentionally local to this function
  16765. // we mark the block end with an editcode of 0
  16766. const int INFIX_BLOCK_SIZE = 64;
  16767. wrDict.PutBytes ( g_sTagInfixEntries, strlen ( g_sTagInfixEntries ) );
  // build a list of arena indexes 1..GetLength()-1 (element 0 is left out) and sort it by key
  16768. CSphVector<int> dIndex;
  16769. dIndex.Resize ( m_dArena.GetLength()-1 );
  16770. for ( int i=0; i<m_dArena.GetLength()-1; i++ )
  16771. dIndex[i] = i+1;
  16772. InfixHashCmp_fn<SIZE> fnCmp ( m_dArena.Begin() );
  16773. dIndex.Sort ( fnCmp );
  // last key and value seen for every key length in chars
  // these are written below but only read by the disabled (#if 0) section
  16774. const int iMaxChars = 1+sizeof ( Infix_t<SIZE> );
  16775. const BYTE * sLast[iMaxChars];
  16776. InfixIntvec_t * pLast[iMaxChars];
  16777. for ( int i=0; i<iMaxChars; i++ )
  16778. {
  16779. sLast[i] = (const BYTE*) "";
  16780. pLast[i] = NULL;
  16781. }
  16782. m_dBlocksWords.Reserve ( m_dArena.GetLength()/INFIX_BLOCK_SIZE*sizeof(DWORD)*SIZE );
  // iBlock counts entries written into the current block; iPrevKey is the dIndex position
  // of the previous entry in this block, or -1 when the current entry starts a block
  16783. int iBlock = 0;
  16784. int iPrevKey = -1;
  16785. ARRAY_FOREACH ( iIndex, dIndex )
  16786. {
  16787. InfixIntvec_t & dData = m_dArena[dIndex[iIndex]].m_tValue;
  16788. const BYTE * sKey = (const BYTE*) m_dArena[dIndex[iIndex]].m_tKey.m_Data;
  // key length in characters: bytes for SIZE==2, UTF-8 codepoints otherwise
  // both calls are bounded by the key size, so a key that fills the buffer needs no NUL
  16789. int iChars = ( SIZE==2 )
  16790. ? strnlen ( (const char*)sKey, sizeof(DWORD)*SIZE )
  16791. : sphUTF8Len ( (const char*)sKey, sizeof(DWORD)*SIZE );
  16792. assert ( iChars>=2 && iChars<iMaxChars );
  // NOTE(review): the section below is compiled out; it refers to iLen,
  // which is not declared anywhere in this function
  16793. #if 0
  16794. // fight them redundancies
  16795. // FIXME! is this right, or is it better to save everyone, so that nonexistent (!) lookups are instant?
  16796. bool bSkip = false;
  16797. for ( int i=iLen-1; i>=2 && !bSkip; i-- )
  16798. if ( IsPrefix ( sLast[i], sKey ) && dData==*pLast[i] )
  16799. bSkip = true;
  16800. if ( bSkip )
  16801. continue;
  16802. #endif
  16803. sLast[iChars] = sKey;
  16804. pLast[iChars] = &dData;
  16805. // keep track of N-infix blocks
  // iAppendBytes starts as the full key length in bytes; it shrinks below once a shared prefix is found
  16806. int iAppendBytes = strnlen ( (const char*)sKey, sizeof(DWORD)*SIZE );
  16807. if ( !iBlock )
  16808. {
  // first entry of a block: remember its full key and where the block starts in the file
  16809. int iOff = m_dBlocksWords.GetLength();
  16810. m_dBlocksWords.Resize ( iOff+iAppendBytes+1 );
  16811. InfixBlock_t & tBlock = m_dBlocks.Add();
  16812. tBlock.m_iInfixOffset = iOff;
  16813. tBlock.m_iOffset = (int)wrDict.GetPos();
  16814. memcpy ( m_dBlocksWords.Begin()+iOff, sKey, iAppendBytes );
  16815. m_dBlocksWords[iOff+iAppendBytes] = '\0';
  16816. }
  16817. // compute max common prefix
  16818. // edit_code = ( num_keep_chars<<4 ) + num_append_chars
  // default (no previous key in this block): keep nothing, append all iChars
  16819. int iEditCode = iChars;
  16820. if ( iPrevKey>=0 )
  16821. {
  16822. const BYTE * sPrev = (const BYTE*) m_dArena[dIndex[iPrevKey]].m_tKey.m_Data;
  16823. const BYTE * sCur = (const BYTE*) sKey;
  16824. const BYTE * sMax = sCur + iAppendBytes;
  16825. int iKeepChars = 0;
  16826. if ( SIZE==2 )
  16827. {
  16828. // SBCS path
  // one byte is one char here, so the matched byte count is the kept char count
  16829. while ( sCur<sMax && *sCur && *sCur==*sPrev )
  16830. {
  16831. sCur++;
  16832. sPrev++;
  16833. }
  16834. iKeepChars = (int)( sCur- ( const BYTE* ) sKey );
  16835. assert ( iKeepChars>=0 && iKeepChars<16 );
  16836. assert ( iChars-iKeepChars>=0 );
  16837. assert ( iChars-iKeepChars<16 );
  16838. iEditCode = ( iKeepChars<<4 ) + ( iChars-iKeepChars );
  16839. iAppendBytes = ( iChars-iKeepChars );
  16840. sKey = sCur;
  16841. } else
  16842. {
  16843. // UTF-8 path
  16844. const BYTE * sKeyMax = sCur; // track max matching sPrev prefix in [sKey,sKeyMax)
  16845. while ( sCur<sMax && *sCur && *sCur==*sPrev )
  16846. {
  16847. // current byte matches, move the pointer
  16848. sCur++;
  16849. sPrev++;
  16850. // tricky bit
  16851. // if the next (!) byte is a valid UTF-8 char start (or eof!)
  16852. // then we just matched not just a byte, but a full char
  16853. // so bump the matching prefix boundary and length
  16854. if ( sCur>=sMax || ( *sCur & 0xC0 )!=0x80 )
  16855. {
  16856. sKeyMax = sCur;
  16857. iKeepChars++;
  16858. }
  16859. }
  16860. assert ( iKeepChars>=0 && iKeepChars<16 );
  16861. assert ( iChars-iKeepChars>=0 );
  16862. assert ( iChars-iKeepChars<16 );
  16863. iEditCode = ( iKeepChars<<4 ) + ( iChars-iKeepChars );
  // only whole matched chars are dropped; a partially matched char is written out in full
  16864. iAppendBytes -= (int)( sKeyMax-sKey );
  16865. sKey = sKeyMax;
  16866. }
  16867. }
  16868. // write edit code, postfix
  16869. wrDict.PutByte ( iEditCode );
  16870. wrDict.PutBytes ( sKey, iAppendBytes );
  16871. // compute data length
  16872. int iDataLen = ZippedIntSize ( dData[0] );
  16873. for ( int j=1; j<dData.GetLength(); j++ )
  16874. iDataLen += ZippedIntSize ( dData[j] - dData[j-1] );
  16875. // write data length, data
  16876. wrDict.ZipInt ( iDataLen );
  16877. wrDict.ZipInt ( dData[0] );
  16878. for ( int j=1; j<dData.GetLength(); j++ )
  16879. wrDict.ZipInt ( dData[j] - dData[j-1] );
  16880. // mark block end, restart deltas
  // the 0 end marker is only written when a block fills up completely
  16881. iPrevKey = iIndex;
  16882. if ( ++iBlock==INFIX_BLOCK_SIZE )
  16883. {
  16884. iBlock = 0;
  16885. iPrevKey = -1;
  16886. wrDict.PutByte ( 0 );
  16887. }
  16888. }
  // resolve the block key pointers only now, from the final m_dBlocksWords buffer
  // (presumably because the Resize() calls above may move that buffer; verify against CSphVector)
  16889. const char * pBlockWords = (const char *)m_dBlocksWords.Begin();
  16890. ARRAY_FOREACH ( i, m_dBlocks )
  16891. m_dBlocks[i].m_sInfix = pBlockWords+m_dBlocks[i].m_iInfixOffset;
  16892. }
  16893. #if USE_WINDOWS
  16894. #pragma warning(default:4127) // conditional expr is const for MSVC
  16895. #endif
  16896. static const char * g_sTagInfixBlocks = "infix-blocks";
  16897. template < int SIZE >
  16898. int InfixBuilder_c<SIZE>::SaveEntryBlocks ( CSphWriter & wrDict )
  16899. {
  16900. // save the blocks
  16901. wrDict.PutBytes ( g_sTagInfixBlocks, strlen ( g_sTagInfixBlocks ) );
  16902. SphOffset_t iInfixBlocksOffset = wrDict.GetPos();
  16903. assert ( iInfixBlocksOffset<=INT_MAX );
  16904. wrDict.ZipInt ( m_dBlocks.GetLength() );
  16905. ARRAY_FOREACH ( i, m_dBlocks )
  16906. {
  16907. int iBytes = strlen ( m_dBlocks[i].m_sInfix );
  16908. wrDict.PutByte ( iBytes );
  16909. wrDict.PutBytes ( m_dBlocks[i].m_sInfix, iBytes );
  16910. wrDict.ZipInt ( m_dBlocks[i].m_iOffset ); // maybe delta these on top?
  16911. }
  16912. return (int)iInfixBlocksOffset;
  16913. }
  16914. ISphInfixBuilder * sphCreateInfixBuilder ( int iCodepointBytes, CSphString * pError )
  16915. {
  16916. assert ( pError );
  16917. *pError = CSphString();
  16918. switch ( iCodepointBytes )
  16919. {
  16920. case 0: return NULL;
  16921. case 1: return new InfixBuilder_c<2>(); // upto 6x1 bytes, 2 dwords, sbcs
  16922. case 2: return new InfixBuilder_c<3>(); // upto 6x2 bytes, 3 dwords, utf-8
  16923. case 3: return new InfixBuilder_c<5>(); // upto 6x3 bytes, 5 dwords, utf-8
  16924. default: pError->SetSprintf ( "unhandled max infix codepoint size %d", iCodepointBytes ); return NULL;
  16925. }
  16926. }
  16927. //////////////////////////////////////////////////////////////////////////
  16928. // KEYWORDS STORING DICTIONARY
  16929. //////////////////////////////////////////////////////////////////////////
  /// Dictionary that stores the keywords themselves, not just their ids.
  /// The Hitblock* methods keep an in-memory hash of keywords by wordid, with a list of
  /// exceptions for keywords whose wordid had to be moved away from their crc.
  /// The Dict* methods collect dictionary entries, flush them as sorted blocks
  /// to a temp file, and merge those blocks into the final dictionary.
  16930. class CSphDictKeywords : public CSphDictCRC<true>
  16931. {
  16932. private:
  // SLOTS is the size of m_dHash; wordids are reduced modulo SLOTS to pick a slot
  16933. static const int SLOTS = 65536;
  // number of HitblockKeyword_t elements allocated per entry chunk
  16934. static const int ENTRY_CHUNK = 65536;
  // number of bytes allocated per keyword chunk
  16935. static const int KEYWORD_CHUNK = 1048576;
  // number of DictKeyword_t elements per dict chunk
  16936. static const int DICT_CHUNK = 65536;
  16937. public:
  16938. // OPTIMIZE? change pointers to 8:24 locators to save RAM on x64 gear?
  /// one hashed keyword; chained through m_pNextHash within its m_dHash slot
  16939. struct HitblockKeyword_t
  16940. {
  16941. SphWordID_t m_uWordid; // locally unique word id (crc value, adjusted in case of collision)
  16942. HitblockKeyword_t * m_pNextHash; // next hashed entry
  16943. char * m_pKeyword; // keyword
  16944. };
  /// a keyword involved in a crc collision; ordered by the entry's wordid
  16945. struct HitblockException_t
  16946. {
  16947. HitblockKeyword_t * m_pEntry; // hash entry
  16948. SphWordID_t m_uCRC; // original unadjusted crc
  16949. bool operator < ( const HitblockException_t & rhs ) const
  16950. {
  16951. return m_pEntry->m_uWordid < rhs.m_pEntry->m_uWordid;
  16952. }
  16953. };
  /// one dictionary entry, as written to and read back from the temp dict file
  16954. struct DictKeyword_t
  16955. {
  16956. char * m_sKeyword;
  16957. SphOffset_t m_uOff;
  16958. int m_iDocs;
  16959. int m_iHits;
  16960. BYTE m_uHint;
  16961. int m_iSkiplistPos; ///< position in .spe file; not exactly likely to hit 2B
  16962. };
  /// position and byte length of one sorted block inside the temp dict file
  16963. struct DictBlock_t
  16964. {
  16965. SphOffset_t m_iPos;
  16966. int m_iLen;
  16967. };
  16968. private:
  16969. HitblockKeyword_t * m_dHash [ SLOTS ]; ///< hash by wordid (!)
  16970. CSphVector<HitblockException_t> m_dExceptions;
  16971. bool m_bHitblock; ///< should we store words on GetWordID or not
  16972. int m_iMemUse; ///< current memory use by all the chunks
  16973. int m_iDictLimit; ///< allowed memory limit for dict block collection
  16974. CSphVector<HitblockKeyword_t*> m_dEntryChunks; ///< hash chunks, only used when indexing hitblocks
  16975. HitblockKeyword_t * m_pEntryChunk;
  16976. int m_iEntryChunkFree;
  16977. CSphVector<BYTE*> m_dKeywordChunks; ///< keyword storage
  16978. BYTE * m_pKeywordChunk;
  16979. int m_iKeywordChunkFree;
  16980. CSphVector<DictKeyword_t*> m_dDictChunks; ///< dict entry chunks, only used when sorting final dict
  16981. DictKeyword_t * m_pDictChunk;
  16982. int m_iDictChunkFree;
  16983. int m_iTmpFD; ///< temp dict file descriptor
  16984. CSphWriter m_wrTmpDict; ///< temp dict writer
  16985. CSphVector<DictBlock_t> m_dDictBlocks; ///< on-disk locations of dict entry blocks
  16986. char m_sClippedWord[MAX_KEYWORD_BYTES]; ///< keyword storage for clipped word
  16987. private:
  16988. SphWordID_t HitblockGetID ( const char * pWord, int iLen, SphWordID_t uCRC );
  16989. HitblockKeyword_t * HitblockAddKeyword ( DWORD uHash, const char * pWord, int iLen, SphWordID_t uID );
  16990. public:
  16991. explicit CSphDictKeywords ();
  16992. virtual ~CSphDictKeywords ();
  16993. virtual void HitblockBegin () { m_bHitblock = true; }
  16994. virtual void HitblockPatch ( CSphWordHit * pHits, int iHits );
  16995. virtual const char * HitblockGetKeyword ( SphWordID_t uWordID );
  16996. virtual int HitblockGetMemUse () { return m_iMemUse; }
  16997. virtual void HitblockReset ();
  16998. virtual void DictBegin ( CSphAutofile & tTempDict, CSphAutofile & tDict, int iDictLimit, ThrottleState_t * pThrottle );
  16999. virtual void DictEntry ( const CSphDictEntry & tEntry );
  17000. virtual void DictEndEntries ( SphOffset_t ) {}
  17001. virtual bool DictEnd ( DictHeader_t * pHeader, int iMemLimit, CSphString & sError, ThrottleState_t * pThrottle );
  17002. virtual SphWordID_t GetWordID ( BYTE * pWord );
  17003. virtual SphWordID_t GetWordIDWithMarkers ( BYTE * pWord );
  17004. virtual SphWordID_t GetWordIDNonStemmed ( BYTE * pWord );
  17005. virtual SphWordID_t GetWordID ( const BYTE * pWord, int iLen, bool bFilterStops );
  // the clone is created through the default ctor and then handed to CloneBase()
  17006. virtual CSphDict * Clone () const { return CloneBase ( new CSphDictKeywords() ); }
  17007. private:
  /// sort the collected dict entries and write them to the temp dict as one block
  17008. void DictFlush ();
  17009. };
  17010. //////////////////////////////////////////////////////////////////////////
  17011. CSphDictKeywords::CSphDictKeywords ()
  17012. : m_bHitblock ( false )
  17013. , m_iMemUse ( 0 )
  17014. , m_iDictLimit ( 0 )
  17015. , m_pEntryChunk ( NULL )
  17016. , m_iEntryChunkFree ( 0 )
  17017. , m_pKeywordChunk ( NULL )
  17018. , m_iKeywordChunkFree ( 0 )
  17019. , m_pDictChunk ( NULL )
  17020. , m_iDictChunkFree ( 0 )
  17021. {
  17022. memset ( m_dHash, 0, sizeof(m_dHash) );
  17023. }
  17024. CSphDictKeywords::~CSphDictKeywords ()
  17025. {
  17026. HitblockReset();
  17027. }
  17028. void CSphDictKeywords::HitblockReset()
  17029. {
  17030. m_dExceptions.Resize ( 0 );
  17031. ARRAY_FOREACH ( i, m_dEntryChunks )
  17032. SafeDeleteArray ( m_dEntryChunks[i] );
  17033. m_dEntryChunks.Resize ( 0 );
  17034. m_pEntryChunk = NULL;
  17035. m_iEntryChunkFree = 0;
  17036. ARRAY_FOREACH ( i, m_dKeywordChunks )
  17037. SafeDeleteArray ( m_dKeywordChunks[i] );
  17038. m_dKeywordChunks.Resize ( 0 );
  17039. m_pKeywordChunk = NULL;
  17040. m_iKeywordChunkFree = 0;
  17041. m_iMemUse = 0;
  17042. memset ( m_dHash, 0, sizeof(m_dHash) );
  17043. }
  17044. CSphDictKeywords::HitblockKeyword_t * CSphDictKeywords::HitblockAddKeyword ( DWORD uHash, const char * sWord, int iLen, SphWordID_t uID )
  17045. {
  17046. assert ( iLen<MAX_KEYWORD_BYTES );
  17047. // alloc entry
  17048. if ( !m_iEntryChunkFree )
  17049. {
  17050. m_pEntryChunk = new HitblockKeyword_t [ ENTRY_CHUNK ];
  17051. m_iEntryChunkFree = ENTRY_CHUNK;
  17052. m_dEntryChunks.Add ( m_pEntryChunk );
  17053. m_iMemUse += sizeof(HitblockKeyword_t)*ENTRY_CHUNK;
  17054. }
  17055. HitblockKeyword_t * pEntry = m_pEntryChunk++;
  17056. m_iEntryChunkFree--;
  17057. // alloc keyword
  17058. iLen++;
  17059. if ( m_iKeywordChunkFree < iLen )
  17060. {
  17061. m_pKeywordChunk = new BYTE [ KEYWORD_CHUNK ];
  17062. m_iKeywordChunkFree = KEYWORD_CHUNK;
  17063. m_dKeywordChunks.Add ( m_pKeywordChunk );
  17064. m_iMemUse += KEYWORD_CHUNK;
  17065. }
  17066. // fill it
  17067. memcpy ( m_pKeywordChunk, sWord, iLen );
  17068. m_pKeywordChunk[iLen-1] = '\0';
  17069. pEntry->m_pKeyword = (char*)m_pKeywordChunk;
  17070. pEntry->m_uWordid = uID;
  17071. m_pKeywordChunk += iLen;
  17072. m_iKeywordChunkFree -= iLen;
  17073. // mtf it
  17074. pEntry->m_pNextHash = m_dHash [ uHash ];
  17075. m_dHash [ uHash ] = pEntry;
  17076. return pEntry;
  17077. }
  /// Map a keyword to its wordid within the current hitblock, adding it if it is new.
  /// @param sWord  keyword bytes
  /// @param iLen   keyword length in bytes
  /// @param uCRC   crc of the keyword, used as the preferred wordid
  /// @return the wordid: uCRC itself when that id is free or already belongs to this
  ///         keyword, otherwise a different id picked by incrementing past taken ones
  /// Words of MAX_KEYWORD_BYTES-4 bytes or more are clipped to MAX_KEYWORD_BYTES-4 bytes
  /// (with a warning) and their crc is recomputed over the clipped bytes.
  17078. SphWordID_t CSphDictKeywords::HitblockGetID ( const char * sWord, int iLen, SphWordID_t uCRC )
  17079. {
  17080. if ( iLen>=MAX_KEYWORD_BYTES-4 ) // fix of very long word (zones)
  17081. {
  17082. memcpy ( m_sClippedWord, sWord, MAX_KEYWORD_BYTES-4 );
  17083. memset ( m_sClippedWord+MAX_KEYWORD_BYTES-4, 0, 4 );
  17084. CSphString sOrig;
  17085. sOrig.SetBinary ( sWord, iLen );
  17086. sphWarn ( "word overrun buffer, clipped!!!\n"
  17087. "clipped (len=%d, word='%s')\noriginal (len=%d, word='%s')",
  17088. MAX_KEYWORD_BYTES-4, m_sClippedWord, iLen, sOrig.cstr() );
  // from here on work with the clipped copy held in the member buffer
  17089. sWord = m_sClippedWord;
  17090. iLen = MAX_KEYWORD_BYTES-4;
  17091. uCRC = sphCRC32 ( (const BYTE *)m_sClippedWord, MAX_KEYWORD_BYTES-4 );
  17092. }
  17093. // is this a known one? find it
  17094. // OPTIMIZE? in theory we could use something faster than crc32; but quick lookup3 test did not show any improvements
  17095. const DWORD uHash = (DWORD)( uCRC % SLOTS );
  17096. HitblockKeyword_t * pEntry = m_dHash [ uHash ];
  // ppEntry always points at the link that leads to pEntry, so pEntry can be unlinked in place
  17097. HitblockKeyword_t ** ppEntry = &m_dHash [ uHash ];
  17098. while ( pEntry )
  17099. {
  17100. // check crc
  17101. if ( pEntry->m_uWordid!=uCRC )
  17102. {
  17103. // crc mismatch, try next entry
  17104. ppEntry = &pEntry->m_pNextHash;
  17105. pEntry = pEntry->m_pNextHash;
  17106. continue;
  17107. }
  17108. // crc matches, check keyword
  // the comparison covers at most iLen bytes; it succeeds either on a NUL
  // that matched in both strings, or once iLen bytes have all matched
  17109. register int iWordLen = iLen;
  17110. register const char * a = pEntry->m_pKeyword;
  17111. register const char * b = sWord;
  17112. while ( *a==*b && iWordLen-- )
  17113. {
  17114. if ( !*a || !iWordLen )
  17115. {
  17116. // known word, mtf it, and return id
  17117. (*ppEntry) = pEntry->m_pNextHash;
  17118. pEntry->m_pNextHash = m_dHash [ uHash ];
  17119. m_dHash [ uHash ] = pEntry;
  17120. return pEntry->m_uWordid;
  17121. }
  17122. a++;
  17123. b++;
  17124. }
  17125. // collision detected!
  17126. // our crc is taken as a wordid, but keyword does not match
  17127. // welcome to the land of very tricky magic
  17128. //
  17129. // pEntry might either be a known exception, or a regular keyword
  17130. // sWord might either be a known exception, or a new one
  17131. // if they are not known, they needed to be added as exceptions now
  17132. //
  17133. // in case sWord is new, we need to assign a new unique wordid
  17134. // for that, we keep incrementing the crc until it is unique
  17135. // a starting point for wordid search loop would be handy
  17136. //
  17137. // let's scan the exceptions vector and work on all this
  17138. //
  17139. // NOTE, beware of the order, it is wordid asc, which does NOT guarantee crc asc
  17140. // example, assume crc(w1)==X, crc(w2)==X+1, crc(w3)==X (collides with w1)
  17141. // wordids will be X, X+1, X+2 but crcs will be X, X+1, X
  17142. //
  17143. // OPTIMIZE, might make sense to use binary search
  17144. // OPTIMIZE, add early out somehow
  17145. SphWordID_t uWordid = uCRC + 1;
  // iExcLen is a snapshot taken before any Add() below; the search loop
  // further down only ever looks at exceptions in [0,iExcLen)
  17146. const int iExcLen = m_dExceptions.GetLength();
  17147. int iExc = m_dExceptions.GetLength();
  17148. ARRAY_FOREACH ( i, m_dExceptions )
  17149. {
  17150. const HitblockKeyword_t * pExcWord = m_dExceptions[i].m_pEntry;
  17151. // incoming word is a known exception? just return the pre-assigned wordid
  17152. if ( m_dExceptions[i].m_uCRC==uCRC && strncmp ( pExcWord->m_pKeyword, sWord, iLen )==0 )
  17153. return pExcWord->m_uWordid;
  17154. // incoming word collided into a known exception? clear the matched entry; no need to re-add it (see below)
  17155. if ( pExcWord==pEntry )
  17156. pEntry = NULL;
  17157. // find first exception with wordid greater or equal to our candidate
  17158. if ( pExcWord->m_uWordid>=uWordid && iExc==iExcLen )
  17159. iExc = i;
  17160. }
  17161. // okay, this is a new collision
  17162. // if entry was a regular word, we have to add it
  17163. if ( pEntry )
  17164. {
  17165. m_dExceptions.Add();
  17166. m_dExceptions.Last().m_pEntry = pEntry;
  17167. m_dExceptions.Last().m_uCRC = uCRC;
  17168. }
  17169. // need to assign a new unique wordid now
  17170. // keep scanning both exceptions and keywords for collisions
  17171. for ( ;; )
  17172. {
  17173. // iExc must be either the first exception greater or equal to current candidate, or out of bounds
  17174. assert ( iExc==iExcLen || m_dExceptions[iExc].m_pEntry->m_uWordid>=uWordid );
  17175. assert ( iExc==0 || m_dExceptions[iExc-1].m_pEntry->m_uWordid<uWordid );
  17176. // candidate collides with a known exception? increment it, and keep looking
  17177. if ( iExc<iExcLen && m_dExceptions[iExc].m_pEntry->m_uWordid==uWordid )
  17178. {
  17179. uWordid++;
  17180. while ( iExc<iExcLen && m_dExceptions[iExc].m_pEntry->m_uWordid<uWordid )
  17181. iExc++;
  17182. continue;
  17183. }
  17184. // candidate collides with a keyword? must be a regular one; add it as an exception, and keep looking
  17185. HitblockKeyword_t * pCheck = m_dHash [ (DWORD)( uWordid % SLOTS ) ];
  17186. while ( pCheck )
  17187. {
  17188. if ( pCheck->m_uWordid==uWordid )
  17189. break;
  17190. pCheck = pCheck->m_pNextHash;
  17191. }
  17192. // no collisions; we've found our unique wordid!
  17193. if ( !pCheck )
  17194. break;
  17195. // got a collision; add it
  17196. HitblockException_t & tColl = m_dExceptions.Add();
  17197. tColl.m_pEntry = pCheck;
  17198. tColl.m_uCRC = pCheck->m_uWordid; // not a known exception; hence, wordid must equal crc
  17199. // and keep looking
  17200. uWordid++;
  17201. continue;
  17202. }
  17203. // and finally, we have that precious new wordid
  17204. // so hash our new unique under its new unique adjusted wordid
  17205. pEntry = HitblockAddKeyword ( (DWORD)( uWordid % SLOTS ), sWord, iLen, uWordid );
  17206. // add it as a collision too
  17207. m_dExceptions.Add();
  17208. m_dExceptions.Last().m_pEntry = pEntry;
  17209. m_dExceptions.Last().m_uCRC = uCRC;
  17210. // keep exceptions list sorted by wordid
  17211. m_dExceptions.Sort();
  17212. return pEntry->m_uWordid;
  17213. }
  17214. // new keyword with unique crc
  // no entry in this slot carries uCRC as its wordid, so the crc is used as the wordid as is
  17215. pEntry = HitblockAddKeyword ( uHash, sWord, iLen, uCRC );
  17216. return pEntry->m_uWordid;
  17217. }
  17218. struct DictKeywordTagged_t : public CSphDictKeywords::DictKeyword_t
  17219. {
  17220. int m_iBlock;
  17221. };
  17222. struct DictKeywordTaggedCmp_fn
  17223. {
  17224. static inline bool IsLess ( const DictKeywordTagged_t & a, const DictKeywordTagged_t & b )
  17225. {
  17226. return strcmp ( a.m_sKeyword, b.m_sKeyword ) < 0;
  17227. }
  17228. };
  17229. static void DictReadEntry ( CSphBin * pBin, DictKeywordTagged_t & tEntry, BYTE * pKeyword )
  17230. {
  17231. int iKeywordLen = pBin->ReadByte ();
  17232. if ( iKeywordLen<0 )
  17233. {
  17234. // early eof or read error; flag must be raised
  17235. assert ( pBin->IsError() );
  17236. return;
  17237. }
  17238. assert ( iKeywordLen>0 && iKeywordLen<MAX_KEYWORD_BYTES-1 );
  17239. if ( pBin->ReadBytes ( pKeyword, iKeywordLen )<0 )
  17240. {
  17241. assert ( pBin->IsError() );
  17242. return;
  17243. }
  17244. pKeyword[iKeywordLen] = '\0';
  17245. tEntry.m_sKeyword = (char*)pKeyword;
  17246. tEntry.m_uOff = pBin->UnzipOffset();
  17247. tEntry.m_iDocs = pBin->UnzipInt();
  17248. tEntry.m_iHits = pBin->UnzipInt();
  17249. tEntry.m_uHint = (BYTE) pBin->ReadByte();
  17250. if ( tEntry.m_iDocs > SPH_SKIPLIST_BLOCK )
  17251. tEntry.m_iSkiplistPos = pBin->UnzipInt();
  17252. else
  17253. tEntry.m_iSkiplistPos = 0;
  17254. }
  17255. void CSphDictKeywords::DictBegin ( CSphAutofile & tTempDict, CSphAutofile & tDict, int iDictLimit, ThrottleState_t * pThrottle )
  17256. {
  17257. m_iTmpFD = tTempDict.GetFD();
  17258. m_wrTmpDict.CloseFile ();
  17259. m_wrTmpDict.SetFile ( tTempDict, NULL, m_sWriterError );
  17260. m_wrTmpDict.SetThrottle ( pThrottle );
  17261. m_wrDict.CloseFile ();
  17262. m_wrDict.SetFile ( tDict, NULL, m_sWriterError );
  17263. m_wrDict.SetThrottle ( pThrottle );
  17264. m_wrDict.PutByte ( 1 );
  17265. m_iDictLimit = Max ( iDictLimit, KEYWORD_CHUNK + DICT_CHUNK*(int)sizeof(DictKeyword_t) ); // can't use less than 1 chunk
  17266. }
  /// Finish the dictionary: flush pending entries, then merge all the sorted blocks
  /// from the temp dict file into the final dictionary file.
  /// The final file gets the merged entries (delta-coded keywords, with a checkpoint
  /// started every SPH_WORDLIST_CHECKPOINT words), then the infix entries (if infixes
  /// are enabled), the checkpoints, the infix blocks, and a "dict-header" trailer.
  /// @param pHeader    receives checkpoint count/offset and infix block info;
  ///                   m_iInfixCodepointBytes is read from it to pick the infix builder
  /// @param iMemLimit  memory budget used for sizing the block readers
  /// @param sError     receives the error message on failure
  /// @param pThrottle  throttle handed to every block reader
  /// @return true on success, false on read/write errors or a bad infix setting
  17267. bool CSphDictKeywords::DictEnd ( DictHeader_t * pHeader, int iMemLimit, CSphString & sError, ThrottleState_t * pThrottle )
  17268. {
  17269. DictFlush ();
  17270. m_wrTmpDict.CloseFile (); // tricky: file is not owned, so it won't get closed, and iTmpFD won't get invalidated
  // with no blocks at all there is nothing to merge, so the final dict can be closed right away
  17271. if ( !m_dDictBlocks.GetLength() )
  17272. m_wrDict.CloseFile();
  17273. if ( m_wrTmpDict.IsError() || m_wrDict.IsError() )
  17274. {
  17275. sError.SetSprintf ( "dictionary write error (out of space?)" );
  17276. return false;
  17277. }
  // empty dictionary: report zero checkpoints and succeed
  17278. if ( !m_dDictBlocks.GetLength() )
  17279. {
  17280. pHeader->m_iDictCheckpointsOffset = m_wrDict.GetPos ();
  17281. pHeader->m_iDictCheckpoints = 0;
  17282. return true;
  17283. }
  17284. // infix builder, if needed
  // a NULL builder with an empty sError just means that infixes are disabled
  17285. ISphInfixBuilder * pInfixer = sphCreateInfixBuilder ( pHeader->m_iInfixCodepointBytes, &sError );
  17286. if ( !sError.IsEmpty() )
  17287. {
  17288. SafeDelete ( pInfixer );
  17289. return false;
  17290. }
  17291. // initialize readers
  // one reader per temp dict block; the limit is raised so that it is at least
  // (largest block length) x (number of blocks)
  17292. CSphVector<CSphBin*> dBins ( m_dDictBlocks.GetLength() );
  17293. int iMaxBlock = 0;
  17294. ARRAY_FOREACH ( i, m_dDictBlocks )
  17295. iMaxBlock = Max ( iMaxBlock, m_dDictBlocks[i].m_iLen );
  17296. iMemLimit = Max ( iMemLimit, iMaxBlock*m_dDictBlocks.GetLength() );
  17297. int iBinSize = CSphBin::CalcBinSize ( iMemLimit, m_dDictBlocks.GetLength(), "sort_dict" );
  17298. SphOffset_t iSharedOffset = -1;
  17299. ARRAY_FOREACH ( i, m_dDictBlocks )
  17300. {
  17301. dBins[i] = new CSphBin();
  17302. dBins[i]->m_iFileLeft = m_dDictBlocks[i].m_iLen;
  17303. dBins[i]->m_iFilePos = m_dDictBlocks[i].m_iPos;
  17304. dBins[i]->Init ( m_iTmpFD, &iSharedOffset, iBinSize );
  17305. dBins[i]->SetThrottle ( pThrottle );
  17306. }
  17307. // keywords storage
  // one MAX_KEYWORD_BYTES slot per reader, holding the keyword of that reader's current entry
  17308. BYTE * pKeywords = new BYTE [ MAX_KEYWORD_BYTES*dBins.GetLength() ];
  // LOC_CLEANUP releases the readers, the keyword storage, and the infix builder;
  // it is invoked on every exit path from here on
  17309. #define LOC_CLEANUP() \
  17310. { \
  17311. ARRAY_FOREACH ( i, dBins ) \
  17312. SafeDelete ( dBins[i] ); \
  17313. SafeDeleteArray ( pKeywords ); \
  17314. SafeDelete ( pInfixer ); \
  17315. }
  17316. // do the sort
  // seed the queue with the first entry of every block
  17317. CSphQueue < DictKeywordTagged_t, DictKeywordTaggedCmp_fn > qWords ( dBins.GetLength() );
  17318. DictKeywordTagged_t tEntry;
  17319. ARRAY_FOREACH ( i, dBins )
  17320. {
  17321. DictReadEntry ( dBins[i], tEntry, pKeywords + i*MAX_KEYWORD_BYTES );
  17322. if ( dBins[i]->IsError() )
  17323. {
  17324. sError.SetSprintf ( "entry read error in dictionary sort (bin %d of %d)", i, dBins.GetLength() );
  17325. LOC_CLEANUP();
  17326. return false;
  17327. }
  17328. tEntry.m_iBlock = i;
  17329. qWords.Push ( tEntry );
  17330. }
  17331. CSphKeywordDeltaWriter tLastKeyword;
  17332. int iWords = 0;
  // merge loop: write the queue root, then refill the queue from the block it came from
  17333. while ( qWords.GetLength() )
  17334. {
  17335. const DictKeywordTagged_t & tWord = qWords.Root();
  17336. const int iLen = strlen ( tWord.m_sKeyword ); // OPTIMIZE?
  17337. // store checkpoints as needed
  17338. if ( ( iWords % SPH_WORDLIST_CHECKPOINT )==0 )
  17339. {
  17340. // emit a checkpoint, unless we're at the very dict beginning
  // (two zipped zeroes close the previous run of entries)
  17341. if ( iWords )
  17342. {
  17343. m_wrDict.ZipInt ( 0 );
  17344. m_wrDict.ZipInt ( 0 );
  17345. }
  // the checkpoint keeps its own copy of the keyword; it is freed when checkpoints are flushed below
  17346. BYTE * sClone = new BYTE [ iLen+1 ]; // OPTIMIZE? pool these?
  17347. memcpy ( sClone, tWord.m_sKeyword, iLen+1 );
  17348. sClone[iLen] = '\0';
  17349. CSphWordlistCheckpoint & tCheckpoint = m_dCheckpoints.Add ();
  17350. tCheckpoint.m_sWord = (char*) sClone;
  17351. tCheckpoint.m_iWordlistOffset = m_wrDict.GetPos();
  // keyword deltas restart at every checkpoint
  17352. tLastKeyword.Reset();
  17353. }
  17354. iWords++;
  17355. // write final dict entry
  17356. assert ( iLen );
  17357. assert ( tWord.m_uOff );
  17358. assert ( tWord.m_iDocs );
  17359. assert ( tWord.m_iHits );
  17360. tLastKeyword.PutDelta ( m_wrDict, (const BYTE *)tWord.m_sKeyword, iLen );
  17361. m_wrDict.ZipOffset ( tWord.m_uOff );
  17362. m_wrDict.ZipInt ( tWord.m_iDocs );
  17363. m_wrDict.ZipInt ( tWord.m_iHits );
  // unlike the temp dict, the final dict carries the hint byte only when it is nonzero
  17364. if ( tWord.m_uHint )
  17365. m_wrDict.PutByte ( tWord.m_uHint );
  17366. if ( tWord.m_iDocs > SPH_SKIPLIST_BLOCK )
  17367. m_wrDict.ZipInt ( tWord.m_iSkiplistPos );
  17368. // build infixes
  // the checkpoint passed along is the count of checkpoints so far,
  // that is, the 1-based number of the checkpoint this word falls under
  17369. if ( pInfixer )
  17370. pInfixer->AddWord ( (const BYTE*)tWord.m_sKeyword, iLen, m_dCheckpoints.GetLength() );
  17371. // next
  // grab the block index first; tWord refers to the queue root, which Pop() is about to remove
  17372. int iBin = tWord.m_iBlock;
  17373. qWords.Pop ();
  17374. if ( !dBins[iBin]->IsDone() )
  17375. {
  17376. DictReadEntry ( dBins[iBin], tEntry, pKeywords + iBin*MAX_KEYWORD_BYTES );
  17377. if ( dBins[iBin]->IsError() )
  17378. {
  17379. sError.SetSprintf ( "entry read error in dictionary sort (bin %d of %d)", iBin, dBins.GetLength() );
  17380. LOC_CLEANUP();
  17381. return false;
  17382. }
  17383. tEntry.m_iBlock = iBin;
  17384. qWords.Push ( tEntry );
  17385. }
  17386. }
  17387. // end of dictionary block
  17388. m_wrDict.ZipInt ( 0 );
  17389. m_wrDict.ZipInt ( 0 );
  17390. // flush infix hash entries, if any
  17391. if ( pInfixer )
  17392. pInfixer->SaveEntries ( m_wrDict );
  17393. // flush wordlist checkpoints (blocks)
  // per checkpoint: keyword length as a dword, the keyword bytes, and the wordlist offset
  17394. pHeader->m_iDictCheckpointsOffset = m_wrDict.GetPos();
  17395. pHeader->m_iDictCheckpoints = m_dCheckpoints.GetLength();
  17396. ARRAY_FOREACH ( i, m_dCheckpoints )
  17397. {
  17398. const int iLen = strlen ( m_dCheckpoints[i].m_sWord );
  17399. assert ( m_dCheckpoints[i].m_iWordlistOffset>0 );
  17400. assert ( iLen>0 && iLen<MAX_KEYWORD_BYTES );
  17401. m_wrDict.PutDword ( iLen );
  17402. m_wrDict.PutBytes ( m_dCheckpoints[i].m_sWord, iLen );
  17403. m_wrDict.PutOffset ( m_dCheckpoints[i].m_iWordlistOffset );
  17404. SafeDeleteArray ( m_dCheckpoints[i].m_sWord );
  17405. }
  17406. // flush infix hash blocks
  17407. if ( pInfixer )
  17408. {
  17409. pHeader->m_iInfixBlocksOffset = pInfixer->SaveEntryBlocks ( m_wrDict );
  17410. pHeader->m_iInfixBlocksWordsSize = pInfixer->GetBlocksWordsSize();
  17411. }
  17412. // flush header
  17413. // mostly for debugging convenience
  17414. // primary storage is in the index wide header
  // NOTE(review): without an infix builder, m_iInfixBlocksOffset is written
  // exactly as the caller passed it in; confirm the caller initializes it
  17415. m_wrDict.PutBytes ( "dict-header", 11 );
  17416. m_wrDict.ZipInt ( pHeader->m_iDictCheckpoints );
  17417. m_wrDict.ZipOffset ( pHeader->m_iDictCheckpointsOffset );
  17418. m_wrDict.ZipInt ( pHeader->m_iInfixCodepointBytes );
  17419. m_wrDict.ZipInt ( pHeader->m_iInfixBlocksOffset );
  17420. // about it
  17421. LOC_CLEANUP();
  17422. #undef LOC_CLEANUP
  17423. m_wrDict.CloseFile ();
  17424. if ( m_wrDict.IsError() )
  17425. sError.SetSprintf ( "dictionary write error (out of space?)" );
  17426. return !m_wrDict.IsError();
  17427. }
  17428. struct DictKeywordCmp_fn
  17429. {
  17430. inline bool IsLess ( CSphDictKeywords::DictKeyword_t * a, CSphDictKeywords::DictKeyword_t * b ) const
  17431. {
  17432. return strcmp ( a->m_sKeyword, b->m_sKeyword ) < 0;
  17433. }
  17434. };
  17435. void CSphDictKeywords::DictFlush ()
  17436. {
  17437. if ( !m_dDictChunks.GetLength() )
  17438. return;
  17439. assert ( m_dDictChunks.GetLength() && m_dKeywordChunks.GetLength() );
  17440. // sort em
  17441. int iTotalWords = m_dDictChunks.GetLength()*DICT_CHUNK - m_iDictChunkFree;
  17442. CSphVector<DictKeyword_t*> dWords ( iTotalWords );
  17443. int iIdx = 0;
  17444. ARRAY_FOREACH ( i, m_dDictChunks )
  17445. {
  17446. int iWords = DICT_CHUNK;
  17447. if ( i==m_dDictChunks.GetLength()-1 )
  17448. iWords -= m_iDictChunkFree;
  17449. DictKeyword_t * pWord = m_dDictChunks[i];
  17450. for ( int j=0; j<iWords; j++ )
  17451. dWords[iIdx++] = pWord++;
  17452. }
  17453. dWords.Sort ( DictKeywordCmp_fn() );
  17454. // write em
  17455. DictBlock_t & tBlock = m_dDictBlocks.Add();
  17456. tBlock.m_iPos = m_wrTmpDict.GetPos ();
  17457. ARRAY_FOREACH ( i, dWords )
  17458. {
  17459. const DictKeyword_t * pWord = dWords[i];
  17460. int iLen = strlen ( pWord->m_sKeyword );
  17461. m_wrTmpDict.PutByte ( iLen );
  17462. m_wrTmpDict.PutBytes ( pWord->m_sKeyword, iLen );
  17463. m_wrTmpDict.ZipOffset ( pWord->m_uOff );
  17464. m_wrTmpDict.ZipInt ( pWord->m_iDocs );
  17465. m_wrTmpDict.ZipInt ( pWord->m_iHits );
  17466. m_wrTmpDict.PutByte ( pWord->m_uHint );
  17467. assert ( ( pWord->m_iDocs > SPH_SKIPLIST_BLOCK )==( pWord->m_iSkiplistPos!=0 ) );
  17468. if ( pWord->m_iDocs > SPH_SKIPLIST_BLOCK )
  17469. m_wrTmpDict.ZipInt ( pWord->m_iSkiplistPos );
  17470. }
  17471. tBlock.m_iLen = (int)( m_wrTmpDict.GetPos() - tBlock.m_iPos );
  17472. // clean up buffers
  17473. ARRAY_FOREACH ( i, m_dDictChunks )
  17474. SafeDeleteArray ( m_dDictChunks[i] );
  17475. m_dDictChunks.Resize ( 0 );
  17476. m_pDictChunk = NULL;
  17477. m_iDictChunkFree = 0;
  17478. ARRAY_FOREACH ( i, m_dKeywordChunks )
  17479. SafeDeleteArray ( m_dKeywordChunks[i] );
  17480. m_dKeywordChunks.Resize ( 0 );
  17481. m_pKeywordChunk = NULL;
  17482. m_iKeywordChunkFree = 0;
  17483. m_iMemUse = 0;
  17484. }
  17485. void CSphDictKeywords::DictEntry ( const CSphDictEntry & tEntry )
  17486. {
  17487. // they say, this might just happen during merge
  17488. // FIXME! can we make merge avoid sending such keywords to dict and assert here?
  17489. if ( !tEntry.m_iDocs )
  17490. return;
  17491. assert ( tEntry.m_iHits );
  17492. assert ( tEntry.m_iDoclistLength>0 );
  17493. DictKeyword_t * pWord = NULL;
  17494. int iLen = strlen ( (char*)tEntry.m_sKeyword ) + 1;
  17495. for ( ;; )
  17496. {
  17497. // alloc dict entry
  17498. if ( !m_iDictChunkFree )
  17499. {
  17500. if ( m_iDictLimit && ( m_iMemUse + (int)sizeof(DictKeyword_t)*DICT_CHUNK )>m_iDictLimit )
  17501. DictFlush ();
  17502. m_pDictChunk = new DictKeyword_t [ DICT_CHUNK ];
  17503. m_iDictChunkFree = DICT_CHUNK;
  17504. m_dDictChunks.Add ( m_pDictChunk );
  17505. m_iMemUse += sizeof(DictKeyword_t)*DICT_CHUNK;
  17506. }
  17507. // alloc keyword
  17508. if ( m_iKeywordChunkFree < iLen )
  17509. {
  17510. if ( m_iDictLimit && ( m_iMemUse + KEYWORD_CHUNK )>m_iDictLimit )
  17511. {
  17512. DictFlush ();
  17513. continue; // because we just flushed pWord
  17514. }
  17515. m_pKeywordChunk = new BYTE [ KEYWORD_CHUNK ];
  17516. m_iKeywordChunkFree = KEYWORD_CHUNK;
  17517. m_dKeywordChunks.Add ( m_pKeywordChunk );
  17518. m_iMemUse += KEYWORD_CHUNK;
  17519. }
  17520. // aw kay
  17521. break;
  17522. }
  17523. pWord = m_pDictChunk++;
  17524. m_iDictChunkFree--;
  17525. pWord->m_sKeyword = (char*)m_pKeywordChunk;
  17526. memcpy ( m_pKeywordChunk, tEntry.m_sKeyword, iLen );
  17527. m_pKeywordChunk[iLen-1] = '\0';
  17528. m_pKeywordChunk += iLen;
  17529. m_iKeywordChunkFree -= iLen;
  17530. pWord->m_uOff = tEntry.m_iDoclistOffset;
  17531. pWord->m_iDocs = tEntry.m_iDocs;
  17532. pWord->m_iHits = tEntry.m_iHits;
  17533. pWord->m_uHint = sphDoclistHintPack ( tEntry.m_iDocs, tEntry.m_iDoclistLength );
  17534. pWord->m_iSkiplistPos = 0;
  17535. if ( tEntry.m_iDocs > SPH_SKIPLIST_BLOCK )
  17536. pWord->m_iSkiplistPos = (int)( tEntry.m_iSkiplistOffset );
  17537. }
  17538. SphWordID_t CSphDictKeywords::GetWordID ( BYTE * pWord )
  17539. {
  17540. SphWordID_t uCRC = CSphDictCRC<true>::GetWordID ( pWord );
  17541. if ( !uCRC || !m_bHitblock )
  17542. return uCRC;
  17543. int iLen = strlen ( (const char *)pWord );
  17544. return HitblockGetID ( (const char *)pWord, iLen, uCRC );
  17545. }
  17546. SphWordID_t CSphDictKeywords::GetWordIDWithMarkers ( BYTE * pWord )
  17547. {
  17548. SphWordID_t uCRC = CSphDictCRC<true>::GetWordIDWithMarkers ( pWord );
  17549. if ( !uCRC || !m_bHitblock )
  17550. return uCRC;
  17551. int iLen = strlen ( (const char *)pWord );
  17552. return HitblockGetID ( (const char *)pWord, iLen, uCRC );
  17553. }
  17554. SphWordID_t CSphDictKeywords::GetWordIDNonStemmed ( BYTE * pWord )
  17555. {
  17556. SphWordID_t uCRC = CSphDictCRC<true>::GetWordIDNonStemmed ( pWord );
  17557. if ( !uCRC || !m_bHitblock )
  17558. return uCRC;
  17559. int iLen = strlen ( (const char *)pWord );
  17560. return HitblockGetID ( (const char *)pWord, iLen, uCRC );
  17561. }
  17562. SphWordID_t CSphDictKeywords::GetWordID ( const BYTE * pWord, int iLen, bool bFilterStops )
  17563. {
  17564. SphWordID_t uCRC = CSphDictCRC<true>::GetWordID ( pWord, iLen, bFilterStops );
  17565. if ( !uCRC || !m_bHitblock )
  17566. return uCRC;
  17567. return HitblockGetID ( (const char *)pWord, iLen, uCRC ); // !COMMIT would break, we kind of strcmp inside; but must never get called?
  17568. }
  17569. /// binary search for the first hit with wordid greater than or equal to reference
  17570. static CSphWordHit * FindFirstGte ( CSphWordHit * pHits, int iHits, SphWordID_t uID )
  17571. {
  17572. if ( pHits->m_iWordID==uID )
  17573. return pHits;
  17574. CSphWordHit * pL = pHits;
  17575. CSphWordHit * pR = pHits + iHits - 1;
  17576. if ( pL->m_iWordID > uID || pR->m_iWordID < uID )
  17577. return NULL;
  17578. while ( pR-pL!=1 )
  17579. {
  17580. CSphWordHit * pM = pL + ( pR-pL )/2;
  17581. if ( pM->m_iWordID < uID )
  17582. pL = pM;
  17583. else
  17584. pR = pM;
  17585. }
  17586. assert ( pR-pL==1 );
  17587. assert ( pL->m_iWordID<uID );
  17588. assert ( pR->m_iWordID>=uID );
  17589. return pR;
  17590. }
  17591. /// full crc and keyword check
  17592. static inline bool FullIsLess ( const CSphDictKeywords::HitblockException_t & a, const CSphDictKeywords::HitblockException_t & b )
  17593. {
  17594. if ( a.m_uCRC!=b.m_uCRC )
  17595. return a.m_uCRC < b.m_uCRC;
  17596. return strcmp ( a.m_pEntry->m_pKeyword, b.m_pEntry->m_pKeyword ) < 0;
  17597. }
  17598. /// sort functor to compute collided hits reordering
  17599. struct HitblockPatchSort_fn
  17600. {
  17601. const CSphDictKeywords::HitblockException_t * m_pExc;
  17602. explicit HitblockPatchSort_fn ( const CSphDictKeywords::HitblockException_t * pExc )
  17603. : m_pExc ( pExc )
  17604. {}
  17605. bool IsLess ( int a, int b ) const
  17606. {
  17607. return FullIsLess ( m_pExc[a], m_pExc[b] );
  17608. }
  17609. };
  17610. /// do hit block patching magic
  17611. void CSphDictKeywords::HitblockPatch ( CSphWordHit * pHits, int iHits )
  17612. {
  17613. if ( !pHits || iHits<=0 )
  17614. return;
  17615. const CSphVector<HitblockException_t> & dExc = m_dExceptions; // shortcut
  17616. CSphVector<CSphWordHit*> dChunk;
  17617. // reorder hit chunks for exceptions (aka crc collisions)
  17618. for ( int iFirst = 0; iFirst < dExc.GetLength()-1; )
  17619. {
  17620. // find next span of collisions, iFirst inclusive, iMax exclusive ie. [iFirst,iMax)
  17621. // (note that exceptions array is always sorted)
  17622. SphWordID_t uFirstWordid = dExc[iFirst].m_pEntry->m_uWordid;
  17623. assert ( dExc[iFirst].m_uCRC==uFirstWordid );
  17624. int iMax = iFirst+1;
  17625. SphWordID_t uSpan = uFirstWordid+1;
  17626. while ( iMax < dExc.GetLength() && dExc[iMax].m_pEntry->m_uWordid==uSpan )
  17627. {
  17628. iMax++;
  17629. uSpan++;
  17630. }
  17631. // check whether they are in proper order already
  17632. bool bSorted = true;
  17633. for ( int i=iFirst; i<iMax-1 && bSorted; i++ )
  17634. if ( FullIsLess ( dExc[i+1], dExc[i] ) )
  17635. bSorted = false;
  17636. // order is ok; skip this span
  17637. if ( bSorted )
  17638. {
  17639. iFirst = iMax;
  17640. continue;
  17641. }
  17642. // we need to fix up these collision hits
  17643. // convert them from arbitrary "wordid asc" to strict "crc asc, keyword asc" order
  17644. // lets begin with looking up hit chunks for every wordid
  17645. dChunk.Resize ( iMax-iFirst+1 );
  17646. // find the end
  17647. dChunk.Last() = FindFirstGte ( pHits, iHits, uFirstWordid+iMax-iFirst );
  17648. if ( !dChunk.Last() )
  17649. {
  17650. assert ( iMax==dExc.GetLength() && pHits[iHits-1].m_iWordID==uFirstWordid+iMax-1-iFirst );
  17651. dChunk.Last() = pHits+iHits;
  17652. }
  17653. // find the start
  17654. dChunk[0] = FindFirstGte ( pHits, dChunk.Last()-pHits, uFirstWordid );
  17655. assert ( dChunk[0] && dChunk[0]->m_iWordID==uFirstWordid );
  17656. // find the chunk starts
  17657. for ( int i=1; i<dChunk.GetLength()-1; i++ )
  17658. {
  17659. dChunk[i] = FindFirstGte ( dChunk[i-1], dChunk.Last()-dChunk[i-1], uFirstWordid+i );
  17660. assert ( dChunk[i] && dChunk[i]->m_iWordID==uFirstWordid+i );
  17661. }
  17662. CSphWordHit * pTemp;
  17663. if ( iMax-iFirst==2 )
  17664. {
  17665. // most frequent case, just two collisions
  17666. // OPTIMIZE? allocate buffer for the smaller chunk, not just first chunk
  17667. pTemp = new CSphWordHit [ dChunk[1]-dChunk[0] ];
  17668. memcpy ( pTemp, dChunk[0], ( dChunk[1]-dChunk[0] )*sizeof(CSphWordHit) );
  17669. memmove ( dChunk[0], dChunk[1], ( dChunk[2]-dChunk[1] )*sizeof(CSphWordHit) );
  17670. memcpy ( dChunk[0] + ( dChunk[2]-dChunk[1] ), pTemp, ( dChunk[1]-dChunk[0] )*sizeof(CSphWordHit) );
  17671. } else
  17672. {
  17673. // generic case, more than two
  17674. CSphVector<int> dReorder ( iMax-iFirst );
  17675. ARRAY_FOREACH ( i, dReorder )
  17676. dReorder[i] = i;
  17677. HitblockPatchSort_fn fnSort ( &dExc[iFirst] );
  17678. dReorder.Sort ( fnSort );
  17679. // OPTIMIZE? could skip heading and trailing blocks that are already in position
  17680. pTemp = new CSphWordHit [ dChunk.Last()-dChunk[0] ];
  17681. CSphWordHit * pOut = pTemp;
  17682. ARRAY_FOREACH ( i, dReorder )
  17683. {
  17684. int iChunk = dReorder[i];
  17685. int iHits = dChunk[iChunk+1] - dChunk[iChunk];
  17686. memcpy ( pOut, dChunk[iChunk], iHits*sizeof(CSphWordHit) );
  17687. pOut += iHits;
  17688. }
  17689. assert ( ( pOut-pTemp )==( dChunk.Last()-dChunk[0] ) );
  17690. memcpy ( dChunk[0], pTemp, ( dChunk.Last()-dChunk[0] )*sizeof(CSphWordHit) );
  17691. }
  17692. // patching done
  17693. SafeDeleteArray ( pTemp );
  17694. iFirst = iMax;
  17695. }
  17696. }
  17697. const char * CSphDictKeywords::HitblockGetKeyword ( SphWordID_t uWordID )
  17698. {
  17699. const DWORD uHash = (DWORD)( uWordID % SLOTS );
  17700. HitblockKeyword_t * pEntry = m_dHash [ uHash ];
  17701. while ( pEntry )
  17702. {
  17703. // check crc
  17704. if ( pEntry->m_uWordid!=uWordID )
  17705. {
  17706. // crc mismatch, try next entry
  17707. pEntry = pEntry->m_pNextHash;
  17708. continue;
  17709. }
  17710. return pEntry->m_pKeyword;
  17711. }
  17712. ARRAY_FOREACH ( i, m_dExceptions )
  17713. if ( m_dExceptions[i].m_pEntry->m_uWordid==uWordID )
  17714. return m_dExceptions[i].m_pEntry->m_pKeyword;
  17715. assert ( "hash missing value in operator []" );
  17716. return "\31oops";
  17717. }
  17718. //////////////////////////////////////////////////////////////////////////
  17719. // KEYWORDS STORING DICTIONARY
  17720. //////////////////////////////////////////////////////////////////////////
  17721. class CRtDictKeywords : public ISphRtDictWraper
  17722. {
  17723. private:
  17724. CSphDict * m_pBase;
  17725. SmallStringHash_T<int> m_hKeywords;
  17726. CSphVector<BYTE> m_dPackedKeywords;
  17727. CSphString m_sWarning;
  17728. int m_iKeywordsOverrun;
  17729. public:
  17730. explicit CRtDictKeywords ( CSphDict * pBase )
  17731. : m_pBase ( pBase )
  17732. , m_iKeywordsOverrun ( 0 )
  17733. {
  17734. m_dPackedKeywords.Add ( 0 ); // avoid zero offset at all costs
  17735. }
  17736. virtual ~CRtDictKeywords() {}
  17737. virtual SphWordID_t GetWordID ( BYTE * pWord )
  17738. {
  17739. SphWordID_t uCRC = m_pBase->GetWordID ( pWord );
  17740. if ( uCRC )
  17741. return AddKeyword ( pWord );
  17742. else
  17743. return 0;
  17744. }
  17745. virtual SphWordID_t GetWordIDWithMarkers ( BYTE * pWord )
  17746. {
  17747. SphWordID_t uCRC = m_pBase->GetWordIDWithMarkers ( pWord );
  17748. if ( uCRC )
  17749. return AddKeyword ( pWord );
  17750. else
  17751. return 0;
  17752. }
  17753. virtual SphWordID_t GetWordIDNonStemmed ( BYTE * pWord )
  17754. {
  17755. SphWordID_t uCRC = m_pBase->GetWordIDNonStemmed ( pWord );
  17756. if ( uCRC )
  17757. return AddKeyword ( pWord );
  17758. else
  17759. return 0;
  17760. }
  17761. virtual SphWordID_t GetWordID ( const BYTE * pWord, int iLen, bool bFilterStops )
  17762. {
  17763. SphWordID_t uCRC = m_pBase->GetWordID ( pWord, iLen, bFilterStops );
  17764. if ( uCRC )
  17765. return AddKeyword ( pWord );
  17766. else
  17767. return 0;
  17768. }
  17769. virtual const BYTE * GetPackedKeywords () { return m_dPackedKeywords.Begin(); }
  17770. virtual int GetPackedLen () { return m_dPackedKeywords.GetLength(); }
  17771. virtual void ResetKeywords()
  17772. {
  17773. m_dPackedKeywords.Resize ( 0 );
  17774. m_dPackedKeywords.Add ( 0 ); // avoid zero offset at all costs
  17775. m_hKeywords.Reset();
  17776. }
  17777. SphWordID_t AddKeyword ( const BYTE * pWord )
  17778. {
  17779. CSphString sWord;
  17780. int iLen = strlen ( (const char *)pWord );
  17781. // fix of very long word (zones)
  17782. if ( iLen>=( SPH_MAX_WORD_LEN*3 ) )
  17783. {
  17784. int iClippedLen = SPH_MAX_WORD_LEN*3;
  17785. sWord.SetBinary ( (const char *)pWord, iClippedLen );
  17786. if ( m_iKeywordsOverrun )
  17787. {
  17788. m_sWarning.SetSprintf ( "word overrun buffer, clipped!!! clipped='%s', length=%d(%d)", sWord.cstr(), iClippedLen, iLen );
  17789. } else
  17790. {
  17791. m_sWarning.SetSprintf ( ", clipped='%s', length=%d(%d)", sWord.cstr(), iClippedLen, iLen );
  17792. }
  17793. iLen = iClippedLen;
  17794. m_iKeywordsOverrun++;
  17795. } else
  17796. {
  17797. sWord.SetBinary ( (const char *)pWord, iLen );
  17798. }
  17799. int * pOff = m_hKeywords ( sWord );
  17800. if ( pOff )
  17801. {
  17802. return *pOff;
  17803. }
  17804. int iOff = m_dPackedKeywords.GetLength();
  17805. m_dPackedKeywords.Resize ( iOff+iLen+1 );
  17806. m_dPackedKeywords[iOff] = (BYTE)( iLen & 0xFF );
  17807. memcpy ( m_dPackedKeywords.Begin()+iOff+1, pWord, iLen );
  17808. m_hKeywords.Add ( iOff, sWord );
  17809. return iOff;
  17810. }
  17811. virtual void LoadStopwords ( const char * sFiles, const ISphTokenizer * pTokenizer ) { m_pBase->LoadStopwords ( sFiles, pTokenizer ); }
  17812. virtual void LoadStopwords ( const CSphVector<SphWordID_t> & dStopwords ) { m_pBase->LoadStopwords ( dStopwords ); }
  17813. virtual void WriteStopwords ( CSphWriter & tWriter ) { m_pBase->WriteStopwords ( tWriter ); }
  17814. virtual bool LoadWordforms ( const CSphVector<CSphString> & dFiles, const CSphEmbeddedFiles * pEmbedded, const ISphTokenizer * pTokenizer, const char * sIndex ) { return m_pBase->LoadWordforms ( dFiles, pEmbedded, pTokenizer, sIndex ); }
  17815. virtual void WriteWordforms ( CSphWriter & tWriter ) { m_pBase->WriteWordforms ( tWriter ); }
  17816. virtual int SetMorphology ( const char * szMorph, bool bUseUTF8, CSphString & sMessage ) { return m_pBase->SetMorphology ( szMorph, bUseUTF8, sMessage ); }
  17817. virtual void Setup ( const CSphDictSettings & tSettings ) { m_pBase->Setup ( tSettings ); }
  17818. virtual const CSphDictSettings & GetSettings () const { return m_pBase->GetSettings(); }
  17819. virtual const CSphVector <CSphSavedFile> & GetStopwordsFileInfos () { return m_pBase->GetStopwordsFileInfos(); }
  17820. virtual const CSphVector <CSphSavedFile> & GetWordformsFileInfos () { return m_pBase->GetWordformsFileInfos(); }
  17821. virtual const CSphMultiformContainer * GetMultiWordforms () const { return m_pBase->GetMultiWordforms(); }
  17822. virtual bool IsStopWord ( const BYTE * pWord ) const { return m_pBase->IsStopWord ( pWord ); }
  17823. virtual const char * GetLastWarning() const { return m_iKeywordsOverrun ? m_sWarning.cstr() : NULL; }
  17824. virtual void ResetWarning () { m_iKeywordsOverrun = 0; }
  17825. };
  17826. ISphRtDictWraper * sphCreateRtKeywordsDictionaryWrapper ( CSphDict * pBase )
  17827. {
  17828. return new CRtDictKeywords ( pBase );
  17829. }
  17830. //////////////////////////////////////////////////////////////////////////
  17831. // DICTIONARY FACTORIES
  17832. //////////////////////////////////////////////////////////////////////////
  17833. static CSphDict * SetupDictionary ( CSphDict * pDict, const CSphDictSettings & tSettings,
  17834. const CSphEmbeddedFiles * pFiles, const ISphTokenizer * pTokenizer, const char * sIndex,
  17835. CSphString & sError )
  17836. {
  17837. assert ( pTokenizer );
  17838. assert ( pDict );
  17839. pDict->Setup ( tSettings );
  17840. int iRet = pDict->SetMorphology ( tSettings.m_sMorphology.cstr (), pTokenizer->IsUtf8(), sError );
  17841. if ( iRet==CSphDict::ST_ERROR )
  17842. {
  17843. SafeDelete ( pDict );
  17844. return NULL;
  17845. }
  17846. if ( pFiles && pFiles->m_bEmbeddedStopwords )
  17847. pDict->LoadStopwords ( pFiles->m_dStopwords );
  17848. else
  17849. pDict->LoadStopwords ( tSettings.m_sStopwords.cstr (), pTokenizer );
  17850. pDict->LoadWordforms ( tSettings.m_dWordforms, pFiles && pFiles->m_bEmbeddedWordforms ? pFiles : NULL, pTokenizer, sIndex );
  17851. return pDict;
  17852. }
  17853. CSphDict * sphCreateDictionaryCRC ( const CSphDictSettings & tSettings,
  17854. const CSphEmbeddedFiles * pFiles, const ISphTokenizer * pTokenizer, const char * sIndex,
  17855. CSphString & sError )
  17856. {
  17857. CSphDict * pDict = NULL;
  17858. if ( tSettings.m_bCrc32 )
  17859. pDict = new CSphDictCRC<true> ();
  17860. else
  17861. pDict = new CSphDictCRC<false> ();
  17862. if ( !pDict )
  17863. return NULL;
  17864. return SetupDictionary ( pDict, tSettings, pFiles, pTokenizer, sIndex, sError );
  17865. }
  17866. CSphDict * sphCreateDictionaryKeywords ( const CSphDictSettings & tSettings,
  17867. const CSphEmbeddedFiles * pFiles, ISphTokenizer * pTokenizer, const char * sIndex,
  17868. CSphString & sError )
  17869. {
  17870. CSphDict * pDict = new CSphDictKeywords();
  17871. return SetupDictionary ( pDict, tSettings, pFiles, pTokenizer, sIndex, sError );
  17872. }
  17873. void sphShutdownWordforms ()
  17874. {
  17875. CSphVector<CSphSavedFile> dEmptyFiles;
  17876. CSphDictCRCTraits::SweepWordformContainers ( dEmptyFiles );
  17877. }
  17878. /////////////////////////////////////////////////////////////////////////////
  17879. // HTML STRIPPER
  17880. /////////////////////////////////////////////////////////////////////////////
  17881. static inline int sphIsTag ( int c )
  17882. {
  17883. return sphIsAlpha(c) || c=='.' || c==':';
  17884. }
  /// Check whether a character may start a tag name:
  /// ASCII letters, underscore, dot, and colon.
  /// @return 1 if so, 0 otherwise
  static inline int sphIsTagStart ( int c )
  {
      if ( c>='a' && c<='z' )
          return 1;
      if ( c>='A' && c<='Z' )
          return 1;
      return ( c=='_' || c=='.' || c==':' ) ? 1 : 0;
  }
  17889. CSphHTMLStripper::CSphHTMLStripper ( bool bDefaultTags )
  17890. {
  17891. if ( bDefaultTags )
  17892. {
  17893. // known inline tags
  17894. const char * dKnown[] =
  17895. {
  17896. "a", "b", "i", "s", "u",
  17897. "basefont", "big", "em", "font", "img",
  17898. "label", "small", "span", "strike", "strong",
  17899. "sub\0", "sup\0", // fix gcc 3.4.3 on solaris10 compiler bug
  17900. "tt"
  17901. };
  17902. m_dTags.Resize ( sizeof(dKnown)/sizeof(dKnown[0]) );
  17903. ARRAY_FOREACH ( i, m_dTags )
  17904. {
  17905. m_dTags[i].m_sTag = dKnown[i];
  17906. m_dTags[i].m_iTagLen = strlen ( dKnown[i] );
  17907. m_dTags[i].m_bInline = true;
  17908. }
  17909. }
  17910. UpdateTags ();
  17911. }
  17912. int CSphHTMLStripper::GetCharIndex ( int iCh ) const
  17913. {
  17914. if ( iCh>='a' && iCh<='z' ) return iCh-'a';
  17915. if ( iCh>='A' && iCh<='Z' ) return iCh-'A';
  17916. if ( iCh=='_' ) return 26;
  17917. if ( iCh==':' ) return 27;
  17918. return -1;
  17919. }
  17920. void CSphHTMLStripper::UpdateTags ()
  17921. {
  17922. m_dTags.Sort ();
  17923. for ( int i=0; i<MAX_CHAR_INDEX; i++ )
  17924. {
  17925. m_dStart[i] = INT_MAX;
  17926. m_dEnd[i] = -1;
  17927. }
  17928. ARRAY_FOREACH ( i, m_dTags )
  17929. {
  17930. int iIdx = GetCharIndex ( m_dTags[i].m_sTag.cstr()[0] );
  17931. if ( iIdx<0 )
  17932. continue;
  17933. m_dStart[iIdx] = Min ( m_dStart[iIdx], i );
  17934. m_dEnd[iIdx] = Max ( m_dEnd[iIdx], i );
  17935. }
  17936. }
  17937. bool CSphHTMLStripper::SetIndexedAttrs ( const char * sConfig, CSphString & sError )
  17938. {
  17939. if ( !sConfig || !*sConfig )
  17940. return true;
  17941. char sTag[256], sAttr[256];
  17942. const char * p = sConfig, * s;
  17943. #define LOC_ERROR(_msg,_pos) { sError.SetSprintf ( "SetIndexedAttrs(): %s near '%s'", _msg, _pos ); return false; }
  17944. while ( *p )
  17945. {
  17946. // skip spaces
  17947. while ( *p && isspace(*p) ) p++;
  17948. if ( !*p ) break;
  17949. // check tag name
  17950. s = p; while ( sphIsTag(*p) ) p++;
  17951. if ( s==p ) LOC_ERROR ( "invalid character in tag name", s );
  17952. // get tag name
  17953. if ( p-s>=(int)sizeof(sTag) ) LOC_ERROR ( "tag name too long", s );
  17954. strncpy ( sTag, s, p-s );
  17955. sTag[p-s] = '\0';
  17956. // skip spaces
  17957. while ( *p && isspace(*p) ) p++;
  17958. if ( *p++!='=' ) LOC_ERROR ( "'=' expected", p-1 );
  17959. // add indexed tag entry, if not there yet
  17960. strlwr ( sTag );
  17961. int iIndexTag = -1;
  17962. ARRAY_FOREACH ( i, m_dTags )
  17963. if ( m_dTags[i].m_sTag==sTag )
  17964. {
  17965. iIndexTag = i;
  17966. break;
  17967. }
  17968. if ( iIndexTag<0 )
  17969. {
  17970. m_dTags.Add();
  17971. m_dTags.Last().m_sTag = sTag;
  17972. m_dTags.Last().m_iTagLen = strlen ( sTag );
  17973. iIndexTag = m_dTags.GetLength()-1;
  17974. }
  17975. m_dTags[iIndexTag].m_bIndexAttrs = true;
  17976. CSphVector<CSphString> & dAttrs = m_dTags[iIndexTag].m_dAttrs;
  17977. // scan attributes
  17978. while ( *p )
  17979. {
  17980. // skip spaces
  17981. while ( *p && isspace(*p) ) p++;
  17982. if ( !*p ) break;
  17983. // check attr name
  17984. s = p; while ( sphIsTag(*p) ) p++;
  17985. if ( s==p ) LOC_ERROR ( "invalid character in attribute name", s );
  17986. // get attr name
  17987. if ( p-s>=(int)sizeof(sAttr) ) LOC_ERROR ( "attribute name too long", s );
  17988. strncpy ( sAttr, s, p-s );
  17989. sAttr[p-s] = '\0';
  17990. // add attr, if not there yet
  17991. int iAttr;
  17992. for ( iAttr=0; iAttr<dAttrs.GetLength(); iAttr++ )
  17993. if ( dAttrs[iAttr]==sAttr )
  17994. break;
  17995. if ( iAttr==dAttrs.GetLength() )
  17996. dAttrs.Add ( sAttr );
  17997. // skip spaces
  17998. while ( *p && isspace(*p) ) p++;
  17999. if ( !*p ) break;
  18000. // check if there's next attr or tag
  18001. if ( *p==',' ) { p++; continue; } // next attr
  18002. if ( *p==';' ) { p++; break; } // next tag
  18003. LOC_ERROR ( "',' or ';' or end of line expected", p );
  18004. }
  18005. }
  18006. #undef LOC_ERROR
  18007. UpdateTags ();
  18008. return true;
  18009. }
  18010. bool CSphHTMLStripper::SetRemovedElements ( const char * sConfig, CSphString & )
  18011. {
  18012. if ( !sConfig || !*sConfig )
  18013. return true;
  18014. const char * p = sConfig;
  18015. while ( *p )
  18016. {
  18017. // skip separators
  18018. while ( *p && !sphIsTag(*p) ) p++;
  18019. if ( !*p ) break;
  18020. // get tag name
  18021. const char * s = p;
  18022. while ( sphIsTag(*p) ) p++;
  18023. CSphString sTag;
  18024. sTag.SetBinary ( s, p-s );
  18025. sTag.ToLower ();
  18026. // mark it
  18027. int iTag;
  18028. for ( iTag=0; iTag<m_dTags.GetLength(); iTag++ )
  18029. if ( m_dTags[iTag].m_sTag==sTag )
  18030. {
  18031. m_dTags[iTag].m_bRemove = true;
  18032. break;
  18033. }
  18034. if ( iTag==m_dTags.GetLength() )
  18035. {
  18036. m_dTags.Add();
  18037. m_dTags.Last().m_sTag = sTag;
  18038. m_dTags.Last().m_iTagLen = strlen ( sTag.cstr() );
  18039. m_dTags.Last().m_bRemove = true;
  18040. }
  18041. }
  18042. UpdateTags ();
  18043. return true;
  18044. }
  18045. void CSphHTMLStripper::EnableParagraphs ()
  18046. {
  18047. // known block-level elements
  18048. const char * dBlock[] = { "address", "blockquote", "caption", "center",
  18049. "dd", "div", "dl", "dt", "h1", "h2", "h3", "h4", "h5", "li", "menu",
  18050. "ol", "p", "pre", "table", "tbody", "td", "tfoot", "th", "thead",
  18051. "tr", "ul", NULL };
  18052. for ( int iBlock=0; dBlock[iBlock]; iBlock++ )
  18053. {
  18054. const char * sTag = dBlock[iBlock];
  18055. // mark if known already
  18056. int iTag;
  18057. for ( iTag=0; iTag<m_dTags.GetLength(); iTag++ )
  18058. if ( m_dTags[iTag].m_sTag==sTag )
  18059. {
  18060. m_dTags[iTag].m_bPara = true;
  18061. break;
  18062. }
  18063. // add if not known yet
  18064. if ( iTag==m_dTags.GetLength() )
  18065. {
  18066. m_dTags.Add();
  18067. m_dTags.Last().m_sTag = sTag;
  18068. m_dTags.Last().m_iTagLen = strlen(sTag);
  18069. m_dTags.Last().m_bPara = true;
  18070. }
  18071. }
  18072. UpdateTags ();
  18073. }
  18074. bool CSphHTMLStripper::SetZones ( const char * sZones, CSphString & sError )
  18075. {
  18076. // yet another mini parser!
  18077. // index_zones = {tagname | prefix*} [, ...]
  18078. if ( !sZones || !*sZones )
  18079. return true;
  18080. const char * s = sZones;
  18081. while ( *s )
  18082. {
  18083. // skip spaces
  18084. while ( sphIsSpace(*s) )
  18085. s++;
  18086. if ( !*s )
  18087. break;
  18088. // expect ident
  18089. if ( !sphIsTagStart(*s) )
  18090. {
  18091. sError.SetSprintf ( "unexpected char near '%s' in index_zones", s );
  18092. return false;
  18093. }
  18094. // get ident (either tagname or prefix*)
  18095. const char * sTag = s;
  18096. while ( sphIsTag(*s) )
  18097. s++;
  18098. const char * sTagEnd = s;
  18099. bool bPrefix = false;
  18100. if ( *s=='*' )
  18101. {
  18102. s++;
  18103. bPrefix = true;
  18104. }
  18105. // skip spaces
  18106. while ( sphIsSpace(*s) )
  18107. s++;
  18108. // expect eof or comma after ident
  18109. if ( *s && *s!=',' )
  18110. {
  18111. sError.SetSprintf ( "unexpected char near '%s' in index_zones", s );
  18112. return false;
  18113. }
  18114. if ( *s==',' )
  18115. s++;
  18116. // got valid entry, handle it
  18117. CSphHTMLStripper::StripperTag_t & tTag = m_dTags.Add();
  18118. tTag.m_sTag.SetBinary ( sTag, sTagEnd-sTag );
  18119. tTag.m_iTagLen = (int)( sTagEnd-sTag );
  18120. tTag.m_bZone = true;
  18121. tTag.m_bZonePrefix = bPrefix;
  18122. }
  18123. UpdateTags ();
  18124. return true;
  18125. }
  18126. const BYTE * SkipQuoted ( const BYTE * p )
  18127. {
  18128. const BYTE * pMax = p + 512; // 512 bytes should be enough for a reasonable HTML attribute value, right?!
  18129. const BYTE * pProbEnd = NULL; // (most) probable end location in case we don't find a matching quote
  18130. BYTE cEnd = *p++; // either apostrophe or quote
  18131. while ( p<pMax && *p && *p!=cEnd )
  18132. {
  18133. if ( !pProbEnd )
  18134. if ( *p=='>' || *p=='\r' )
  18135. pProbEnd = p;
  18136. p++;
  18137. }
  18138. if ( *p==cEnd )
  18139. return p+1;
  18140. if ( pProbEnd )
  18141. return pProbEnd;
  18142. return p;
  18143. }
  /// One named HTML entity: a name paired with an integer code.
  /// Kept as a plain aggregate so that static tables can brace-initialize it.
  struct HtmlEntity_t
  {
      const char *    m_sName;    ///< entity name
      int             m_iCode;    ///< integer code paired with the name
  };
  18149. static inline DWORD HtmlEntityHash ( const BYTE * str, int len )
  18150. {
  18151. static const unsigned short asso_values[] =
  18152. {
  18153. 421, 421, 421, 421, 421, 421, 421, 421, 421, 421,
  18154. 421, 421, 421, 421, 421, 421, 421, 421, 421, 421,
  18155. 421, 421, 421, 421, 421, 421, 421, 421, 421, 421,
  18156. 421, 421, 421, 421, 421, 421, 421, 421, 421, 421,
  18157. 421, 421, 421, 421, 421, 421, 421, 421, 421, 4,
  18158. 6, 22, 1, 421, 421, 421, 421, 421, 421, 421,
  18159. 421, 421, 421, 421, 421, 170, 48, 0, 5, 44,
  18160. 0, 10, 10, 86, 421, 7, 0, 1, 42, 93,
  18161. 41, 421, 0, 5, 8, 14, 421, 421, 5, 11,
  18162. 8, 421, 421, 421, 421, 421, 421, 1, 25, 27,
  18163. 9, 2, 113, 82, 14, 3, 179, 1, 81, 91,
  18164. 12, 0, 1, 180, 56, 17, 5, 31, 60, 7,
  18165. 3, 161, 2, 3, 421, 421, 421, 421, 421, 421,
  18166. 421, 421, 421, 421, 421, 421, 421, 421, 421, 421,
  18167. 421, 421, 421, 421, 421, 421, 421, 421, 421, 421,
  18168. 421, 421, 421, 421, 421, 421, 421, 421, 421, 421,
  18169. 421, 421, 421, 421, 421, 421, 421, 421, 421, 421,
  18170. 421, 421, 421, 421, 421, 421, 421, 421, 421, 421,
  18171. 421, 421, 421, 421, 421, 421, 421, 421, 421, 421,
  18172. 421, 421, 421, 421, 421, 421, 421, 421, 421, 421,
  18173. 421, 421, 421, 421, 421, 421, 421, 421, 421, 421,
  18174. 421, 421, 421, 421, 421, 421, 421, 421, 421, 421,
  18175. 421, 421, 421, 421, 421, 421, 421, 421, 421, 421,
  18176. 421, 421, 421, 421, 421, 421, 421, 421, 421, 421,
  18177. 421, 421, 421, 421, 421, 421, 421, 421, 421, 421,
  18178. 421, 421, 421, 421, 421, 421, 421
  18179. };
  18180. register int hval = len;
  18181. switch ( hval )
  18182. {
  18183. default: hval += asso_values [ str[4] ];
  18184. case 4:
  18185. case 3: hval += asso_values [ str[2] ];
  18186. case 2: hval += asso_values [ str[1]+1 ];
  18187. case 1: hval += asso_values [ str[0] ];
  18188. break;
  18189. }
  18190. return hval + asso_values [ str[len-1] ];
  18191. }
  18192. static inline int HtmlEntityLookup ( const BYTE * str, int len )
  18193. {
  18194. static const unsigned char lengthtable[] =
  18195. {
  18196. 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 3, 3,
  18197. 4, 3, 3, 5, 3, 6, 5, 5, 3, 4, 4, 5, 3, 4,
  18198. 4, 0, 5, 4, 5, 6, 5, 6, 4, 5, 3, 3, 5, 0,
  18199. 0, 0, 0, 6, 0, 5, 5, 0, 5, 6, 6, 3, 0, 3,
  18200. 5, 3, 0, 6, 0, 4, 3, 6, 3, 6, 6, 6, 6, 5,
  18201. 5, 5, 5, 5, 5, 2, 6, 4, 0, 6, 3, 3, 3, 0,
  18202. 4, 5, 4, 4, 4, 3, 7, 4, 3, 6, 2, 3, 6, 4,
  18203. 3, 6, 5, 6, 5, 5, 4, 2, 0, 0, 4, 6, 8, 0,
  18204. 0, 0, 5, 5, 0, 6, 6, 2, 2, 4, 4, 6, 6, 4,
  18205. 4, 5, 6, 2, 3, 4, 6, 5, 0, 2, 0, 0, 6, 6,
  18206. 6, 6, 6, 4, 6, 5, 0, 6, 4, 5, 4, 6, 6, 0,
  18207. 0, 4, 6, 5, 6, 0, 6, 4, 5, 6, 5, 6, 4, 0,
  18208. 3, 6, 0, 4, 4, 4, 5, 4, 6, 0, 4, 4, 6, 5,
  18209. 6, 7, 2, 2, 6, 2, 5, 2, 5, 0, 0, 0, 4, 4,
  18210. 2, 4, 2, 2, 4, 0, 4, 4, 4, 5, 5, 0, 3, 7,
  18211. 5, 0, 5, 6, 5, 0, 6, 0, 6, 0, 4, 6, 4, 6,
  18212. 6, 2, 6, 0, 5, 5, 4, 6, 6, 0, 5, 6, 4, 4,
  18213. 4, 4, 0, 5, 0, 5, 0, 4, 5, 4, 0, 4, 4, 4,
  18214. 0, 0, 0, 4, 0, 0, 0, 5, 6, 5, 3, 0, 0, 6,
  18215. 5, 4, 5, 5, 5, 5, 0, 5, 5, 0, 5, 0, 0, 0,
  18216. 4, 6, 0, 3, 0, 5, 5, 0, 0, 3, 6, 5, 0, 4,
  18217. 0, 0, 0, 0, 5, 7, 5, 3, 5, 3, 0, 0, 6, 0,
  18218. 6, 0, 0, 7, 0, 0, 5, 0, 5, 0, 0, 0, 0, 5,
  18219. 4, 0, 0, 0, 0, 0, 7, 4, 0, 0, 3, 0, 0, 0,
  18220. 3, 0, 6, 0, 0, 7, 5, 5, 0, 3, 0, 0, 0, 0,
  18221. 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 5,
  18222. 5, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  18223. 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  18224. 0, 0, 0, 0, 0, 4, 6, 0, 0, 0, 0, 0, 0, 0,
  18225. 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  18226. 5
  18227. };
  18228. static const struct HtmlEntity_t wordlist[] =
  18229. {
  18230. {""}, {""}, {""}, {""}, {""}, {""},
  18231. {"Rho", 929},
  18232. {""}, {""}, {""}, {""}, {""},
  18233. {"Chi", 935},
  18234. {"phi", 966},
  18235. {"iota", 953},
  18236. {"psi", 968},
  18237. {"int", 8747},
  18238. {"theta", 952},
  18239. {"amp", 38},
  18240. {"there4", 8756},
  18241. {"Theta", 920},
  18242. {"omega", 969},
  18243. {"and", 8743},
  18244. {"prop", 8733},
  18245. {"ensp", 8194},
  18246. {"image", 8465},
  18247. {"not", 172},
  18248. {"isin", 8712},
  18249. {"sdot", 8901},
  18250. {""},
  18251. {"prime", 8242},
  18252. {"prod", 8719},
  18253. {"trade", 8482},
  18254. {"Scaron", 352},
  18255. {"kappa", 954},
  18256. {"thinsp", 8201},
  18257. {"emsp", 8195},
  18258. {"thorn", 254},
  18259. {"eta", 951},
  18260. {"chi", 967},
  18261. {"Kappa", 922},
  18262. {""}, {""}, {""}, {""},
  18263. {"scaron", 353},
  18264. {""},
  18265. {"notin", 8713},
  18266. {"ndash", 8211},
  18267. {""},
  18268. {"acute", 180},
  18269. {"otilde", 245},
  18270. {"atilde", 227},
  18271. {"Phi", 934},
  18272. {""},
  18273. {"Psi", 936},
  18274. {"pound", 163},
  18275. {"cap", 8745},
  18276. {""},
  18277. {"otimes", 8855},
  18278. {""},
  18279. {"nbsp", 32},
  18280. {"rho", 961},
  18281. {"ntilde", 241},
  18282. {"eth", 240},
  18283. {"oacute", 243},
  18284. {"aacute", 225},
  18285. {"eacute", 233},
  18286. {"iacute", 237},
  18287. {"nabla", 8711},
  18288. {"Prime", 8243},
  18289. {"ocirc", 244},
  18290. {"acirc", 226},
  18291. {"ecirc", 234},
  18292. {"icirc", 238},
  18293. {"or", 8744},
  18294. {"Yacute", 221},
  18295. {"nsub", 8836},
  18296. {""},
  18297. {"Uacute", 218},
  18298. {"Eta", 919},
  18299. {"ETH", 208},
  18300. {"sup", 8835},
  18301. {""},
  18302. {"supe", 8839},
  18303. {"Ucirc", 219},
  18304. {"sup1", 185},
  18305. {"para", 182},
  18306. {"sup2", 178},
  18307. {"loz", 9674},
  18308. {"omicron", 959},
  18309. {"part", 8706},
  18310. {"cup", 8746},
  18311. {"Ntilde", 209},
  18312. {"Mu", 924},
  18313. {"tau", 964},
  18314. {"uacute", 250},
  18315. {"Iota", 921},
  18316. {"Tau", 932},
  18317. {"rsaquo", 8250},
  18318. {"alpha", 945},
  18319. {"Ccedil", 199},
  18320. {"ucirc", 251},
  18321. {"oline", 8254},
  18322. {"sup3", 179},
  18323. {"nu", 957},
  18324. {""}, {""},
  18325. {"sube", 8838},
  18326. {"Eacute", 201},
  18327. {"thetasym", 977},
  18328. {""}, {""}, {""},
  18329. {"Omega", 937},
  18330. {"Ecirc", 202},
  18331. {""},
  18332. {"lowast", 8727},
  18333. {"iquest", 191},
  18334. {"lt", 60},
  18335. {"gt", 62},
  18336. {"ordm", 186},
  18337. {"euro", 8364},
  18338. {"oslash", 248},
  18339. {"lsaquo", 8249},
  18340. {"zeta", 950},
  18341. {"cong", 8773},
  18342. {"mdash", 8212},
  18343. {"ccedil", 231},
  18344. {"ne", 8800},
  18345. {"sub", 8834},
  18346. {"Zeta", 918},
  18347. {"Lambda", 923},
  18348. {"Gamma", 915},
  18349. {""},
  18350. {"Nu", 925},
  18351. {""}, {""},
  18352. {"ograve", 242},
  18353. {"agrave", 224},
  18354. {"egrave", 232},
  18355. {"igrave", 236},
  18356. {"frac14", 188},
  18357. {"ordf", 170},
  18358. {"Otilde", 213},
  18359. {"infin", 8734},
  18360. {""},
  18361. {"frac12", 189},
  18362. {"beta", 946},
  18363. {"radic", 8730},
  18364. {"darr", 8595},
  18365. {"Iacute", 205},
  18366. {"Ugrave", 217},
  18367. {""}, {""},
  18368. {"harr", 8596},
  18369. {"hearts", 9829},
  18370. {"Icirc", 206},
  18371. {"Oacute", 211},
  18372. {""},
  18373. {"frac34", 190},
  18374. {"cent", 162},
  18375. {"crarr", 8629},
  18376. {"curren", 164},
  18377. {"Ocirc", 212},
  18378. {"brvbar", 166},
  18379. {"sect", 167},
  18380. {""},
  18381. {"ang", 8736},
  18382. {"ugrave", 249},
  18383. {""},
  18384. {"Beta", 914},
  18385. {"uarr", 8593},
  18386. {"dArr", 8659},
  18387. {"asymp", 8776},
  18388. {"perp", 8869},
  18389. {"Dagger", 8225},
  18390. {""},
  18391. {"hArr", 8660},
  18392. {"rang", 9002},
  18393. {"dagger", 8224},
  18394. {"exist", 8707},
  18395. {"Egrave", 200},
  18396. {"Omicron", 927},
  18397. {"mu", 956},
  18398. {"pi", 960},
  18399. {"weierp", 8472},
  18400. {"xi", 958},
  18401. {"clubs", 9827},
  18402. {"Xi", 926},
  18403. {"aring", 229},
  18404. {""}, {""}, {""},
  18405. {"copy", 169},
  18406. {"uArr", 8657},
  18407. {"ni", 8715},
  18408. {"rarr", 8594},
  18409. {"le", 8804},
  18410. {"ge", 8805},
  18411. {"zwnj", 8204},
  18412. {""},
  18413. {"apos", 39},
  18414. {"macr", 175},
  18415. {"lang", 9001},
  18416. {"gamma", 947},
  18417. {"Delta", 916},
  18418. {""},
  18419. {"uml", 168},
  18420. {"alefsym", 8501},
  18421. {"delta", 948},
  18422. {""},
  18423. {"bdquo", 8222},
  18424. {"lambda", 955},
  18425. {"equiv", 8801},
  18426. {""},
  18427. {"Oslash", 216},
  18428. {""},
  18429. {"hellip", 8230},
  18430. {""},
  18431. {"rArr", 8658},
  18432. {"Atilde", 195},
  18433. {"larr", 8592},
  18434. {"spades", 9824},
  18435. {"Igrave", 204},
  18436. {"Pi", 928},
  18437. {"yacute", 253},
  18438. {""},
  18439. {"diams", 9830},
  18440. {"sbquo", 8218},
  18441. {"fnof", 402},
  18442. {"Ograve", 210},
  18443. {"plusmn", 177},
  18444. {""},
  18445. {"rceil", 8969},
  18446. {"Aacute", 193},
  18447. {"ouml", 246},
  18448. {"auml", 228},
  18449. {"euml", 235},
  18450. {"iuml", 239},
  18451. {""},
  18452. {"Acirc", 194},
  18453. {""},
  18454. {"rdquo", 8221},
  18455. {""},
  18456. {"lArr", 8656},
  18457. {"rsquo", 8217},
  18458. {"Yuml", 376},
  18459. {""},
  18460. {"quot", 34},
  18461. {"Uuml", 220},
  18462. {"bull", 8226},
  18463. {""}, {""}, {""},
  18464. {"real", 8476},
  18465. {""}, {""}, {""},
  18466. {"lceil", 8968},
  18467. {"permil", 8240},
  18468. {"upsih", 978},
  18469. {"sum", 8721},
  18470. {""}, {""},
  18471. {"divide", 247},
  18472. {"raquo", 187},
  18473. {"uuml", 252},
  18474. {"ldquo", 8220},
  18475. {"Alpha", 913},
  18476. {"szlig", 223},
  18477. {"lsquo", 8216},
  18478. {""},
  18479. {"Sigma", 931},
  18480. {"tilde", 732},
  18481. {""},
  18482. {"THORN", 222},
  18483. {""}, {""}, {""},
  18484. {"Euml", 203},
  18485. {"rfloor", 8971},
  18486. {""},
  18487. {"lrm", 8206},
  18488. {""},
  18489. {"sigma", 963},
  18490. {"iexcl", 161},
  18491. {""}, {""},
  18492. {"deg", 176},
  18493. {"middot", 183},
  18494. {"laquo", 171},
  18495. {""},
  18496. {"circ", 710},
  18497. {""}, {""}, {""}, {""},
  18498. {"frasl", 8260},
  18499. {"epsilon", 949},
  18500. {"oplus", 8853},
  18501. {"yen", 165},
  18502. {"micro", 181},
  18503. {"piv", 982},
  18504. {""}, {""},
  18505. {"lfloor", 8970},
  18506. {""},
  18507. {"Agrave", 192},
  18508. {""}, {""},
  18509. {"Upsilon", 933},
  18510. {""}, {""},
  18511. {"times", 215},
  18512. {""},
  18513. {"cedil", 184},
  18514. {""}, {""}, {""}, {""},
  18515. {"minus", 8722},
  18516. {"Iuml", 207},
  18517. {""}, {""}, {""}, {""}, {""},
  18518. {"upsilon", 965},
  18519. {"Ouml", 214},
  18520. {""}, {""},
  18521. {"rlm", 8207},
  18522. {""}, {""}, {""},
  18523. {"reg", 174},
  18524. {""},
  18525. {"forall", 8704},
  18526. {""}, {""},
  18527. {"Epsilon", 917},
  18528. {"empty", 8709},
  18529. {"OElig", 338},
  18530. {""},
  18531. {"shy", 173},
  18532. {""}, {""}, {""}, {""}, {""}, {""}, {""}, {""}, {""},
  18533. {""}, {""}, {""}, {""},
  18534. {"Aring", 197},
  18535. {""}, {""}, {""},
  18536. {"oelig", 339},
  18537. {"aelig", 230},
  18538. {""},
  18539. {"zwj", 8205},
  18540. {""}, {""}, {""}, {""}, {""}, {""}, {""}, {""}, {""},
  18541. {""}, {""}, {""}, {""}, {""},
  18542. {"sim", 8764},
  18543. {""}, {""}, {""}, {""}, {""}, {""}, {""}, {""}, {""},
  18544. {""}, {""}, {""}, {""}, {""}, {""},
  18545. {"yuml", 255},
  18546. {"sigmaf", 962},
  18547. {""}, {""}, {""}, {""}, {""}, {""}, {""},
  18548. {"Auml", 196},
  18549. {""}, {""}, {""}, {""}, {""}, {""}, {""}, {""}, {""},
  18550. {""}, {""}, {""}, {""},
  18551. {"AElig", 198}
  18552. };
  18553. const int MIN_WORD_LENGTH = 2;
  18554. const int MAX_WORD_LENGTH = 8;
  18555. const int MAX_HASH_VALUE = 420;
  18556. if ( len<=MAX_WORD_LENGTH && len>=MIN_WORD_LENGTH )
  18557. {
  18558. register int key = HtmlEntityHash ( str, len );
  18559. if ( key<=MAX_HASH_VALUE && key>=0 )
  18560. if ( len==lengthtable[key] )
  18561. {
  18562. register const char * s = wordlist[key].m_sName;
  18563. if ( *str==*s && !memcmp ( str+1, s+1, len-1 ) )
  18564. return wordlist[key].m_iCode;
  18565. }
  18566. }
  18567. return 0;
  18568. }
  18569. void CSphHTMLStripper::Strip ( BYTE * sData ) const
  18570. {
  18571. const BYTE * s = sData;
  18572. BYTE * d = sData;
  18573. for ( ;; )
  18574. {
  18575. /////////////////////////////////////
  18576. // scan until eof, or tag, or entity
  18577. /////////////////////////////////////
  18578. while ( *s && *s!='<' && *s!='&' )
  18579. {
  18580. if ( *s>=0x20 )
  18581. *d++ = *s;
  18582. else
  18583. *d++ = ' ';
  18584. s++;
  18585. }
  18586. if ( !*s )
  18587. break;
  18588. /////////////////
  18589. // handle entity
  18590. /////////////////
  18591. if ( *s=='&' )
  18592. {
  18593. if ( s[1]=='#' )
  18594. {
  18595. // handle "&#number;" form
  18596. int iCode = 0;
  18597. s += 2;
  18598. while ( isdigit(*s) )
  18599. iCode = iCode*10 + (*s++) - '0';
  18600. if ( ( iCode>=0 && iCode<=0x1f ) || *s!=';' ) // 0-31 are reserved codes
  18601. continue;
  18602. d += sphUTF8Encode ( d, iCode );
  18603. s++;
  18604. } else
  18605. {
  18606. // skip until ';' or max length
  18607. if ( ( s[1]>='a' && s[1]<='z' ) || ( s[1]>='A' && s[1]<='Z' ) )
  18608. {
  18609. const int MAX_ENTITY_LEN = 8;
  18610. const BYTE * sStart = s+1;
  18611. while ( *s && *s!=';' && s-sStart<=MAX_ENTITY_LEN )
  18612. s++;
  18613. if ( *s==';' )
  18614. {
  18615. int iCode = HtmlEntityLookup ( sStart, (int)(s-sStart) );
  18616. if ( iCode>0 )
  18617. {
  18618. // this is a known entity; encode it
  18619. d += sphUTF8Encode ( d, iCode );
  18620. s++;
  18621. continue;
  18622. }
  18623. }
  18624. // rollback
  18625. s = sStart-1;
  18626. }
  18627. // if we're here, it's not an entity; pass the leading ampersand and rescan
  18628. *d++ = *s++;
  18629. }
  18630. continue;
  18631. }
  18632. //////////////
  18633. // handle tag
  18634. //////////////
  18635. assert ( *s=='<' );
  18636. if ( GetCharIndex(s[1])<0 )
  18637. {
  18638. if ( s[1]=='/' )
  18639. {
  18640. // check if it's valid closing tag
  18641. if ( GetCharIndex(s[2])<0 )
  18642. {
  18643. *d++ = *s++;
  18644. continue;
  18645. }
  18646. } else if ( s[1]=='!' )
  18647. {
  18648. if ( s[2]=='-' && s[3]=='-' )
  18649. {
  18650. // it's valid comment; scan until comment end
  18651. s += 4; // skip opening '<!--'
  18652. while ( *s )
  18653. {
  18654. if ( s[0]=='-' && s[1]=='-' && s[2]=='>' )
  18655. break;
  18656. s++;
  18657. }
  18658. if ( !*s )
  18659. break;
  18660. s += 3; // skip closing '-->'
  18661. continue;
  18662. } else if ( isalpha(s[2]) )
  18663. {
  18664. // it's <!doctype> style PI; scan until PI end
  18665. s += 2;
  18666. while ( *s && *s!='>' )
  18667. {
  18668. if ( *s=='\'' || *s=='"' )
  18669. {
  18670. s = SkipQuoted ( s );
  18671. while ( isspace(*s) ) s++;
  18672. } else
  18673. {
  18674. s++;
  18675. }
  18676. }
  18677. if ( *s=='>' )
  18678. s++;
  18679. continue;
  18680. } else
  18681. {
  18682. // it's something malformed; just ignore
  18683. *d++ = *s++;
  18684. continue;
  18685. }
  18686. } else if ( s[1]=='?' )
  18687. {
  18688. // scan until PI end
  18689. s += 2; // skip opening '<?'
  18690. while ( *s )
  18691. {
  18692. if ( s[0]=='?' && s[1]=='>' )
  18693. break;
  18694. s++;
  18695. }
  18696. if ( !*s )
  18697. break;
  18698. s += 2; // skip closing '?>'
  18699. continue;
  18700. } else
  18701. {
  18702. // simply malformed
  18703. *d++ = *s++;
  18704. continue;
  18705. }
  18706. }
  18707. s++; // skip '<'
  18708. //////////////////////////////////////
  18709. // lookup this tag in known tags list
  18710. //////////////////////////////////////
  18711. const StripperTag_t * pTag = NULL;
  18712. int iZoneNameLen = 0;
  18713. const BYTE * sZoneName = NULL;
  18714. s = FindTag ( s, &pTag, &sZoneName, &iZoneNameLen );
  18715. /////////////////////////////////////
  18716. // process tag contents
  18717. // index attributes if needed
  18718. // gracefully handle malformed stuff
  18719. /////////////////////////////////////
  18720. #define LOC_SKIP_SPACES() { while ( sphIsSpace(*s) ) s++; if ( !*s || *s=='>' ) break; }
  18721. bool bIndexAttrs = ( pTag && pTag->m_bIndexAttrs );
  18722. while ( *s && *s!='>' )
  18723. {
  18724. LOC_SKIP_SPACES();
  18725. if ( sphIsTagStart(*s) )
  18726. {
  18727. // skip attribute name while it's valid
  18728. const BYTE * sAttr = s;
  18729. while ( sphIsTag(*s) )
  18730. s++;
  18731. // blanks or a value after a valid attribute name?
  18732. if ( sphIsSpace(*s) || *s=='=' )
  18733. {
  18734. const int iAttrLen = (int)( s - sAttr );
  18735. LOC_SKIP_SPACES();
  18736. // a valid name but w/o a value; keep scanning
  18737. if ( *s!='=' )
  18738. continue;
  18739. // got value!
  18740. s++;
  18741. LOC_SKIP_SPACES();
  18742. // check attribute name
  18743. // OPTIMIZE! remove linear search
  18744. int iAttr = -1;
  18745. if ( bIndexAttrs )
  18746. {
  18747. for ( iAttr=0; iAttr<pTag->m_dAttrs.GetLength(); iAttr++ )
  18748. {
  18749. int iLen = strlen ( pTag->m_dAttrs[iAttr].cstr() );
  18750. if ( iLen==iAttrLen && !strncasecmp ( pTag->m_dAttrs[iAttr].cstr(), (const char*)sAttr, iLen ) )
  18751. break;
  18752. }
  18753. if ( iAttr==pTag->m_dAttrs.GetLength() )
  18754. iAttr = -1;
  18755. }
  18756. // process the value
  18757. const BYTE * sVal = s;
  18758. if ( *s=='\'' || *s=='"' )
  18759. {
  18760. // skip quoted value until a matching quote
  18761. s = SkipQuoted ( s );
  18762. } else
  18763. {
  18764. // skip unquoted value until tag end or whitespace
  18765. while ( *s && *s!='>' && !sphIsSpace(*s) )
  18766. s++;
  18767. }
  18768. // if this one is to be indexed, copy it
  18769. if ( iAttr>=0 )
  18770. {
  18771. const BYTE * sMax = s;
  18772. if ( *sVal=='\'' || *sVal=='"' )
  18773. {
  18774. if ( sMax[-1]==sVal[0] )
  18775. sMax--;
  18776. sVal++;
  18777. }
  18778. while ( sVal<sMax )
  18779. *d++ = *sVal++;
  18780. *d++ = ' ';
  18781. }
  18782. // handled the value; keep scanning
  18783. continue;
  18784. }
  18785. // nope, got an invalid character in the sequence (or maybe eof)
  18786. // fall through to an invalid name handler
  18787. }
  18788. // keep skipping until tag end or whitespace
  18789. while ( *s && *s!='>' && !sphIsSpace(*s) )
  18790. s++;
  18791. }
  18792. #undef LOC_SKIP_SPACES
  18793. // skip closing angle bracket, if any
  18794. if ( *s )
  18795. s++;
  18796. // unknown tag is done; others might require a bit more work
  18797. if ( !pTag )
  18798. {
  18799. *d++ = ' '; // unknown tags are *not* inline by default
  18800. continue;
  18801. }
  18802. // handle zones
  18803. if ( pTag->m_bZone )
  18804. {
  18805. // should be at tag's end
  18806. assert ( s[0]=='\0' || s[-1]=='>' );
  18807. // emit secret codes
  18808. *d++ = MAGIC_CODE_ZONE;
  18809. for ( int i=0; i<iZoneNameLen; i++ )
  18810. *d++ = (BYTE) tolower ( sZoneName[i] );
  18811. *d++ = MAGIC_CODE_ZONE;
  18812. if ( !*s )
  18813. break;
  18814. continue;
  18815. }
  18816. // handle paragraph boundaries
  18817. if ( pTag->m_bPara )
  18818. {
  18819. *d++ = MAGIC_CODE_PARAGRAPH;
  18820. continue;
  18821. }
  18822. // in all cases, the tag must be fully processed at this point
  18823. // not a remove-tag? we're done
  18824. if ( !pTag->m_bRemove )
  18825. {
  18826. if ( !pTag->m_bInline )
  18827. *d++ = ' ';
  18828. continue;
  18829. }
  18830. // sudden eof? bail out
  18831. if ( !*s )
  18832. break;
  18833. // must be a proper remove-tag end, then
  18834. assert ( pTag->m_bRemove && s[-1]=='>' );
  18835. // short-form? we're done
  18836. if ( s[-2]=='/' )
  18837. continue;
  18838. // skip everything until the closing tag
  18839. // FIXME! should we handle insane cases with quoted closing tag within tag?
  18840. for ( ;; )
  18841. {
  18842. while ( *s && ( s[0]!='<' || s[1]!='/' ) ) s++;
  18843. if ( !*s ) break;
  18844. s += 2; // skip </
  18845. if ( strncasecmp ( pTag->m_sTag.cstr(), (const char*)s, pTag->m_iTagLen )!=0 ) continue;
  18846. if ( !sphIsTag ( s[pTag->m_iTagLen] ) )
  18847. {
  18848. s += pTag->m_iTagLen; // skip tag
  18849. if ( *s=='>' ) s++;
  18850. break;
  18851. }
  18852. }
  18853. if ( !pTag->m_bInline ) *d++ = ' ';
  18854. }
  18855. *d++ = '\0';
  18856. // space, paragraph sequences elimination pass
  18857. s = sData;
  18858. d = sData;
  18859. bool bSpaceOut = false;
  18860. bool bParaOut = false;
  18861. bool bZoneOut = false;
  18862. while ( const char c = *s++ )
  18863. {
  18864. assert ( d<=s-1 );
  18865. // handle different character classes
  18866. if ( sphIsSpace(c) )
  18867. {
  18868. // handle whitespace, skip dupes
  18869. if ( !bSpaceOut )
  18870. *d++ = ' ';
  18871. bSpaceOut = true;
  18872. continue;
  18873. } else if ( c==MAGIC_CODE_PARAGRAPH )
  18874. {
  18875. // handle paragraph marker, skip dupes
  18876. if ( !bParaOut && !bZoneOut )
  18877. {
  18878. *d++ = c;
  18879. bParaOut = true;
  18880. }
  18881. bSpaceOut = true;
  18882. continue;
  18883. } else if ( c==MAGIC_CODE_ZONE )
  18884. {
  18885. // zone marker
  18886. // rewind preceding paragraph, if any, it is redundant
  18887. if ( bParaOut )
  18888. {
  18889. assert ( d>sData && d[-1]==MAGIC_CODE_PARAGRAPH );
  18890. d--;
  18891. }
  18892. // copy \4zoneid\4
  18893. *d++ = c;
  18894. while ( *s && *s!=MAGIC_CODE_ZONE )
  18895. *d++ = *s++;
  18896. if ( *s )
  18897. *d++ = *s++;
  18898. // update state
  18899. // no spaces paragraphs allowed
  18900. bSpaceOut = bZoneOut = true;
  18901. bParaOut = false;
  18902. continue;
  18903. } else
  18904. {
  18905. *d++ = c;
  18906. bSpaceOut = bParaOut = bZoneOut = false;
  18907. }
  18908. }
  18909. *d++ = '\0';
  18910. }
  18911. const BYTE * CSphHTMLStripper::FindTag ( const BYTE * sSrc, const StripperTag_t ** ppTag,
  18912. const BYTE ** ppZoneName, int * pZoneNameLen ) const
  18913. {
  18914. assert ( sSrc && ppTag && ppZoneName && pZoneNameLen );
  18915. assert ( sSrc[0]!='/' || sSrc[1]!='\0' );
  18916. const BYTE * sTagName = ( sSrc[0]=='/' ) ? sSrc+1 : sSrc;
  18917. *ppZoneName = sSrc;
  18918. *pZoneNameLen = 0;
  18919. int iIdx = GetCharIndex ( sTagName[0] );
  18920. assert ( iIdx>=0 && iIdx<MAX_CHAR_INDEX );
  18921. if ( m_dEnd[iIdx]>=0 )
  18922. {
  18923. int iStart = m_dStart[iIdx];
  18924. int iEnd = m_dEnd[iIdx];
  18925. for ( int i=iStart; i<=iEnd; i++ )
  18926. {
  18927. int iLen = m_dTags[i].m_iTagLen;
  18928. int iCmp = strncasecmp ( m_dTags[i].m_sTag.cstr(), (const char*)sTagName, iLen );
  18929. // the tags are sorted; so if current candidate is already greater, rest can be skipped
  18930. if ( iCmp>0 )
  18931. break;
  18932. // do we have a match?
  18933. if ( iCmp==0 )
  18934. {
  18935. // got exact match?
  18936. if ( !sphIsTag ( sTagName[iLen] ) )
  18937. {
  18938. *ppTag = m_dTags.Begin() + i;
  18939. sSrc = sTagName + iLen; // skip tag name
  18940. if ( m_dTags[i].m_bZone )
  18941. *pZoneNameLen = sSrc - *ppZoneName;
  18942. break;
  18943. }
  18944. // got wildcard match?
  18945. if ( m_dTags[i].m_bZonePrefix )
  18946. {
  18947. *ppTag = m_dTags.Begin() + i;
  18948. sSrc = sTagName + iLen;
  18949. while ( sphIsTag(*sSrc) )
  18950. sSrc++;
  18951. *pZoneNameLen = sSrc - *ppZoneName;
  18952. break;
  18953. }
  18954. }
  18955. }
  18956. }
  18957. return sSrc;
  18958. }
  18959. bool CSphHTMLStripper::IsValidTagStart ( int iCh ) const
  18960. {
  18961. int i = GetCharIndex ( iCh );
  18962. return ( i>=0 && i<MAX_CHAR_INDEX );
  18963. }
  18964. //////////////////////////////////////////////////////////////////////////
  18965. #if USE_RE2
  18966. class CSphFieldRegExps : public ISphFieldFilter
  18967. {
  18968. public:
  18969. explicit CSphFieldRegExps ( bool bUTF8 );
  18970. virtual ~CSphFieldRegExps ();
  18971. virtual const BYTE * Apply ( const BYTE * sField, int iLength = 0 );
  18972. virtual int GetResultLength () const;
  18973. virtual void GetSettings ( CSphFieldFilterSettings & tSettings ) const;
  18974. bool AddRegExp ( const char * sRegExp, CSphString & sError );
  18975. private:
  18976. struct RegExp_t
  18977. {
  18978. CSphString m_sFrom;
  18979. CSphString m_sTo;
  18980. RE2 * m_pRE2;
  18981. };
  18982. CSphVector<RegExp_t> m_dRegexps;
  18983. bool m_bUTF8;
  18984. std::string m_sField;
  18985. };
  18986. CSphFieldRegExps::CSphFieldRegExps ( bool bUTF8 )
  18987. : m_bUTF8 ( bUTF8 )
  18988. {
  18989. }
  18990. CSphFieldRegExps::~CSphFieldRegExps ()
  18991. {
  18992. ARRAY_FOREACH ( i, m_dRegexps )
  18993. SafeDelete ( m_dRegexps[i].m_pRE2 );
  18994. }
  18995. const BYTE * CSphFieldRegExps::Apply ( const BYTE * sField, int iLength )
  18996. {
  18997. if ( !sField || !*sField )
  18998. return sField;
  18999. bool bReplaced = false;
  19000. m_sField = iLength ? std::string ( (char *) sField, iLength ) : (char *) sField;
  19001. ARRAY_FOREACH ( i, m_dRegexps )
  19002. {
  19003. assert ( m_dRegexps[i].m_pRE2 );
  19004. if ( RE2::GlobalReplace ( &m_sField, *m_dRegexps[i].m_pRE2, m_dRegexps[i].m_sTo.cstr() ) )
  19005. bReplaced = true;
  19006. }
  19007. return bReplaced ? (const BYTE *)m_sField.c_str () : sField;
  19008. }
  19009. int CSphFieldRegExps::GetResultLength () const
  19010. {
  19011. return m_sField.length();
  19012. }
  19013. void CSphFieldRegExps::GetSettings ( CSphFieldFilterSettings & tSettings ) const
  19014. {
  19015. tSettings.m_bUTF8 = m_bUTF8;
  19016. tSettings.m_dRegexps.Resize ( m_dRegexps.GetLength() );
  19017. ARRAY_FOREACH ( i, m_dRegexps )
  19018. tSettings.m_dRegexps[i].SetSprintf ( "%s => %s", m_dRegexps[i].m_sFrom.cstr(), m_dRegexps[i].m_sTo.cstr() );
  19019. }
  19020. bool CSphFieldRegExps::AddRegExp ( const char * sRegExp, CSphString & sError )
  19021. {
  19022. const char sSplitter [] = "=>";
  19023. const char * sSplit = strstr ( sRegExp, sSplitter );
  19024. if ( !sSplit )
  19025. {
  19026. sError = "mapping token (=>) not found";
  19027. return false;
  19028. } else if ( strstr ( sSplit + strlen ( sSplitter ), sSplitter ) )
  19029. {
  19030. sError = "mapping token (=>) found more than once";
  19031. return false;
  19032. }
  19033. m_dRegexps.Resize ( m_dRegexps.GetLength () + 1 );
  19034. RegExp_t & tRegExp = m_dRegexps.Last();
  19035. tRegExp.m_sFrom.SetBinary ( sRegExp, sSplit-sRegExp );
  19036. tRegExp.m_sTo = sSplit + strlen ( sSplitter );
  19037. tRegExp.m_sFrom.Trim();
  19038. tRegExp.m_sTo.Trim();
  19039. RE2::Options tOptions;
  19040. tOptions.set_utf8 ( m_bUTF8 );
  19041. tRegExp.m_pRE2 = new RE2 ( tRegExp.m_sFrom.cstr(), tOptions );
  19042. std::string sRE2Error;
  19043. if ( !tRegExp.m_pRE2->CheckRewriteString ( tRegExp.m_sTo.cstr(), &sRE2Error ) )
  19044. {
  19045. sError.SetSprintf ( "\"%s => %s\" is not a valid mapping: %s", tRegExp.m_sFrom.cstr(), tRegExp.m_sTo.cstr(), sRE2Error.c_str() );
  19046. SafeDelete ( tRegExp.m_pRE2 );
  19047. m_dRegexps.Remove ( m_dRegexps.GetLength() - 1 );
  19048. return false;
  19049. }
  19050. return true;
  19051. }
  19052. #endif
  19053. #if USE_RE2
  19054. ISphFieldFilter * sphCreateFieldFilter ( const CSphFieldFilterSettings & tFilterSettings, CSphString & sError )
  19055. {
  19056. CSphFieldRegExps * pFilter = new CSphFieldRegExps ( tFilterSettings.m_bUTF8 );
  19057. ARRAY_FOREACH ( i, tFilterSettings.m_dRegexps )
  19058. pFilter->AddRegExp ( tFilterSettings.m_dRegexps[i].cstr(), sError );
  19059. return pFilter;
  19060. }
  19061. #else
  19062. ISphFieldFilter * sphCreateFieldFilter ( const CSphFieldFilterSettings &, CSphString & )
  19063. {
  19064. return NULL;
  19065. }
  19066. #endif
  19067. /////////////////////////////////////////////////////////////////////////////
  19068. // GENERIC SOURCE
  19069. /////////////////////////////////////////////////////////////////////////////
  19070. CSphSourceSettings::CSphSourceSettings ()
  19071. : m_iMinPrefixLen ( 0 )
  19072. , m_iMinInfixLen ( 0 )
  19073. , m_iMaxSubstringLen ( 0 )
  19074. , m_iBoundaryStep ( 0 )
  19075. , m_bIndexExactWords ( false )
  19076. , m_iOvershortStep ( 1 )
  19077. , m_iStopwordStep ( 1 )
  19078. , m_bIndexSP ( false )
  19079. , m_bIndexFieldLens ( false )
  19080. {}
  19081. ESphWordpart CSphSourceSettings::GetWordpart ( const char * sField, bool bWordDict )
  19082. {
  19083. if ( bWordDict )
  19084. return SPH_WORDPART_WHOLE;
  19085. bool bPrefix = ( m_iMinPrefixLen>0 ) && ( m_dPrefixFields.GetLength()==0 || m_dPrefixFields.Contains ( sField ) );
  19086. bool bInfix = ( m_iMinInfixLen>0 ) && ( m_dInfixFields.GetLength()==0 || m_dInfixFields.Contains ( sField ) );
  19087. assert ( !( bPrefix && bInfix ) ); // no field must be marked both prefix and infix
  19088. if ( bPrefix )
  19089. return SPH_WORDPART_PREFIX;
  19090. if ( bInfix )
  19091. return SPH_WORDPART_INFIX;
  19092. return SPH_WORDPART_WHOLE;
  19093. }
  19094. //////////////////////////////////////////////////////////////////////////
  19095. CSphSource::CSphSource ( const char * sName )
  19096. : m_pTokenizer ( NULL )
  19097. , m_pDict ( NULL )
  19098. , m_pFieldFilter ( NULL )
  19099. , m_tSchema ( sName )
  19100. , m_bStripHTML ( false )
  19101. , m_iNullIds ( 0 )
  19102. , m_iMaxIds ( 0 )
  19103. {
  19104. m_pStripper = new CSphHTMLStripper ( true );
  19105. }
  19106. CSphSource::~CSphSource()
  19107. {
  19108. delete m_pStripper;
  19109. }
  19110. void CSphSource::SetDict ( CSphDict * pDict )
  19111. {
  19112. assert ( pDict );
  19113. m_pDict = pDict;
  19114. }
  19115. const CSphSourceStats & CSphSource::GetStats ()
  19116. {
  19117. return m_tStats;
  19118. }
  19119. bool CSphSource::SetStripHTML ( const char * sExtractAttrs, const char * sRemoveElements,
  19120. bool bDetectParagraphs, const char * sZones, CSphString & sError )
  19121. {
  19122. if ( !m_pStripper->SetIndexedAttrs ( sExtractAttrs, sError ) )
  19123. return false;
  19124. if ( !m_pStripper->SetRemovedElements ( sRemoveElements, sError ) )
  19125. return false;
  19126. if ( bDetectParagraphs )
  19127. m_pStripper->EnableParagraphs ();
  19128. if ( !m_pStripper->SetZones ( sZones, sError ) )
  19129. return false;
  19130. m_bStripHTML = true;
  19131. return true;
  19132. }
  19133. void CSphSource::SetFieldFilter ( ISphFieldFilter * pFilter )
  19134. {
  19135. m_pFieldFilter = pFilter;
  19136. }
  19137. void CSphSource::SetTokenizer ( ISphTokenizer * pTokenizer )
  19138. {
  19139. assert ( pTokenizer );
  19140. m_pTokenizer = pTokenizer;
  19141. }
  19142. bool CSphSource::UpdateSchema ( CSphSchema * pInfo, CSphString & sError )
  19143. {
  19144. assert ( pInfo );
  19145. // fill it
  19146. if ( pInfo->m_dFields.GetLength()==0 && pInfo->GetAttrsCount()==0 )
  19147. {
  19148. *pInfo = m_tSchema;
  19149. return true;
  19150. }
  19151. // check it
  19152. return m_tSchema.CompareTo ( *pInfo, sError );
  19153. }
  19154. void CSphSource::Setup ( const CSphSourceSettings & tSettings )
  19155. {
  19156. m_iMinPrefixLen = Max ( tSettings.m_iMinPrefixLen, 0 );
  19157. m_iMinInfixLen = Max ( tSettings.m_iMinInfixLen, 0 );
  19158. m_iMaxSubstringLen = Max ( tSettings.m_iMaxSubstringLen, 0 );
  19159. m_iBoundaryStep = Max ( tSettings.m_iBoundaryStep, -1 );
  19160. m_bIndexExactWords = tSettings.m_bIndexExactWords;
  19161. m_iOvershortStep = Min ( Max ( tSettings.m_iOvershortStep, 0 ), 1 );
  19162. m_iStopwordStep = Min ( Max ( tSettings.m_iStopwordStep, 0 ), 1 );
  19163. m_bIndexSP = tSettings.m_bIndexSP;
  19164. m_dPrefixFields = tSettings.m_dPrefixFields;
  19165. m_dInfixFields = tSettings.m_dInfixFields;
  19166. m_bIndexFieldLens = tSettings.m_bIndexFieldLens;
  19167. }
  19168. SphDocID_t CSphSource::VerifyID ( SphDocID_t uID )
  19169. {
  19170. if ( uID==0 )
  19171. {
  19172. m_iNullIds++;
  19173. return 0;
  19174. }
  19175. if ( uID==DOCID_MAX )
  19176. {
  19177. m_iMaxIds++;
  19178. return 0;
  19179. }
  19180. return uID;
  19181. }
  19182. ISphHits * CSphSource::IterateJoinedHits ( CSphString & )
  19183. {
  19184. static ISphHits dDummy;
  19185. m_tDocInfo.m_iDocID = 0; // pretend that's an eof
  19186. return &dDummy;
  19187. }
  19188. /////////////////////////////////////////////////////////////////////////////
  19189. // DOCUMENT SOURCE
  19190. /////////////////////////////////////////////////////////////////////////////
  /// Write sLine to fp as a single-quoted literal.
  ///
  /// Tab, single quote and backslash are each preceded by a backslash; every
  /// other byte is written as is. A NULL or empty line is written as '' (two quotes).
  /// The escaped text is assembled in memory first and written with a single fwrite().
  ///
  /// @param fp     output stream
  /// @param sLine  NUL-terminated text to escape; may be NULL
  static void FormatEscaped ( FILE * fp, const char * sLine )
  {
      // handle empty lines
      if ( !sLine || !*sLine )
      {
          fprintf ( fp, "''" );
          return;
      }

      // pass one, count the needed buffer size
      // sizes are kept in size_t: an int counter would narrow strlen() and could
      // wrap on a huge line, leaving the buffer below too small for pass two
      const size_t uLen = strlen ( sLine );
      size_t uOut = 2; // quotes
      for ( size_t i=0; i<uLen; i++ )
          switch ( sLine[i] )
          {
              case '\t':
              case '\'':
              case '\\':
                  uOut += 2;
                  break;

              default:
                  uOut++;
                  break;
          }

      // allocate the buffer; short lines fit on the stack
      char sMinibuffer[8192];
      char * sMaxibuffer = NULL;
      char * sBuffer = sMinibuffer;

      if ( uOut>sizeof(sMinibuffer) )
      {
          sMaxibuffer = new char [ uOut+4 ]; // 4 is just a safety gap
          sBuffer = sMaxibuffer;
      }

      // pass two, escape it
      char * sOut = sBuffer;
      *sOut++ = '\'';

      for ( size_t i=0; i<uLen; i++ )
          switch ( sLine[i] )
          {
              case '\t':
              case '\'':
              case '\\': *sOut++ = '\\'; // no break intended
              default: *sOut++ = sLine[i];
          }
      *sOut++ = '\'';

      // print!
      assert ( sOut==sBuffer+uOut );
      fwrite ( sBuffer, 1, uOut, fp );

      // cleanup; deleting NULL is a no-op, so the stack-buffer case needs no check
      delete [] sMaxibuffer;
  }
  19242. CSphSource_Document::CSphBuildHitsState_t::CSphBuildHitsState_t ()
  19243. : m_bProcessingHits ( false )
  19244. , m_bDocumentDone ( false )
  19245. , m_dFields ( NULL )
  19246. , m_iStartPos ( 0 )
  19247. , m_iHitPos ( 0 )
  19248. , m_iField ( 0 )
  19249. , m_iStartField ( 0 )
  19250. , m_iEndField ( 0 )
  19251. , m_iBuildLastStep ( 1 )
  19252. {
  19253. }
  19254. CSphSource_Document::CSphBuildHitsState_t::~CSphBuildHitsState_t ()
  19255. {
  19256. ARRAY_FOREACH ( i, m_dTmpFieldStorage )
  19257. SafeDeleteArray ( m_dTmpFieldStorage[i] );
  19258. }
  19259. CSphSource_Document::CSphSource_Document ( const char * sName )
  19260. : CSphSource ( sName )
  19261. , m_pReadFileBuffer ( NULL )
  19262. , m_iReadFileBufferSize ( 256 * 1024 )
  19263. , m_iMaxFileBufferSize ( 2 * 1024 * 1024 )
  19264. , m_eOnFileFieldError ( FFE_IGNORE_FIELD )
  19265. , m_fpDumpRows ( NULL )
  19266. , m_iPlainFieldsLength ( 0 )
  19267. , m_pFieldLengthAttrs ( NULL )
  19268. , m_iMaxHits ( MAX_SOURCE_HITS )
  19269. {
  19270. }
  19271. bool CSphSource_Document::IterateDocument ( CSphString & sError )
  19272. {
  19273. assert ( m_pTokenizer );
  19274. assert ( !m_tState.m_bProcessingHits );
  19275. // PROFILE ( src_document );
  19276. m_tHits.m_dData.Resize ( 0 );
  19277. m_tState = CSphBuildHitsState_t();
  19278. m_tState.m_iEndField = m_iPlainFieldsLength;
  19279. m_tState.m_dTmpFieldPtrs.Resize ( m_tState.m_iEndField );
  19280. m_tState.m_dTmpFieldStorage.Resize ( m_tState.m_iEndField );
  19281. ARRAY_FOREACH ( i, m_tState.m_dTmpFieldPtrs )
  19282. {
  19283. m_tState.m_dTmpFieldPtrs[i] = NULL;
  19284. m_tState.m_dTmpFieldStorage[i] = NULL;
  19285. }
  19286. m_dMva.Resize ( 1 ); // must not have zero offset
  19287. // fetch next document
  19288. for ( ;; )
  19289. {
  19290. m_tState.m_dFields = NextDocument ( sError );
  19291. if ( m_tDocInfo.m_iDocID==0 )
  19292. return true;
  19293. if ( !m_tState.m_dFields )
  19294. return false;
  19295. // tricky bit
  19296. // we can only skip document indexing from here, IterateHits() is too late
  19297. // so in case the user chose to skip documents with file field problems
  19298. // we need to check for those here
  19299. if ( m_eOnFileFieldError==FFE_SKIP_DOCUMENT || m_eOnFileFieldError==FFE_FAIL_INDEX )
  19300. {
  19301. bool bOk = true;
  19302. for ( int iField=0; iField<m_tState.m_iEndField && bOk; iField++ )
  19303. {
  19304. const BYTE * sFilename = m_tState.m_dFields[iField];
  19305. if ( m_tSchema.m_dFields[iField].m_bFilename )
  19306. bOk &= CheckFileField ( sFilename );
  19307. if ( !bOk && m_eOnFileFieldError==FFE_FAIL_INDEX )
  19308. {
  19309. sError.SetSprintf ( "error reading file field data (docid=" DOCID_FMT ", filename=%s)",
  19310. m_tDocInfo.m_iDocID, sFilename );
  19311. return false;
  19312. }
  19313. }
  19314. if ( !bOk && m_eOnFileFieldError==FFE_SKIP_DOCUMENT )
  19315. continue;
  19316. }
  19317. if ( m_pFieldFilter )
  19318. {
  19319. // new field strings may be longer than original, that's why we need temporary storage
  19320. ARRAY_FOREACH ( i, m_tState.m_dTmpFieldStorage )
  19321. SafeDeleteArray ( m_tState.m_dTmpFieldStorage[i] );
  19322. bool bHaveModifiedFields = false;
  19323. for ( int iField=0; iField<m_tState.m_iEndField; iField++ )
  19324. {
  19325. if ( m_tSchema.m_dFields[iField].m_bFilename )
  19326. {
  19327. m_tState.m_dTmpFieldPtrs[iField] = m_tState.m_dFields[iField];
  19328. continue;
  19329. }
  19330. BYTE * sValue = m_tState.m_dFields[iField];
  19331. const BYTE * sResult = m_pFieldFilter->Apply ( sValue );
  19332. if ( sResult!=sValue )
  19333. {
  19334. // emulate CString's safety gap
  19335. const int FAKE_SAFETY_GAP = 4;
  19336. int iResultLen = m_pFieldFilter->GetResultLength();
  19337. m_tState.m_dTmpFieldStorage[iField] = new BYTE [iResultLen + 1 + FAKE_SAFETY_GAP];
  19338. memcpy ( m_tState.m_dTmpFieldStorage[iField], sResult, iResultLen );
  19339. m_tState.m_dTmpFieldStorage[iField][iResultLen] = '\0';
  19340. m_tState.m_dTmpFieldPtrs[iField] = m_tState.m_dTmpFieldStorage[iField];
  19341. bHaveModifiedFields = true;
  19342. } else
  19343. m_tState.m_dTmpFieldPtrs[iField] = m_tState.m_dFields[iField];
  19344. }
  19345. if ( bHaveModifiedFields )
  19346. m_tState.m_dFields = (BYTE **)&( m_tState.m_dTmpFieldPtrs[0] );
  19347. }
  19348. // we're good
  19349. break;
  19350. }
  19351. m_tStats.m_iTotalDocuments++;
  19352. return true;
  19353. }
  19354. ISphHits * CSphSource_Document::IterateHits ( CSphString & sError )
  19355. {
  19356. if ( m_tState.m_bDocumentDone )
  19357. return NULL;
  19358. m_tHits.m_dData.Resize ( 0 );
  19359. BuildHits ( sError, false );
  19360. return &m_tHits;
  19361. }
  19362. bool CSphSource_Document::CheckFileField ( const BYTE * sField )
  19363. {
  19364. CSphAutofile tFileSource;
  19365. CSphString sError;
  19366. if ( tFileSource.Open ( (const char *)sField, SPH_O_READ, sError )==-1 )
  19367. {
  19368. sphWarning ( "docid=" DOCID_FMT ": %s", m_tDocInfo.m_iDocID, sError.cstr() );
  19369. return false;
  19370. }
  19371. int64_t iFileSize = tFileSource.GetSize();
  19372. if ( iFileSize+16 > m_iMaxFileBufferSize )
  19373. {
  19374. sphWarning ( "docid=" DOCID_FMT ": file '%s' too big for a field (size="INT64_FMT", max_file_field_buffer=%d)",
  19375. m_tDocInfo.m_iDocID, (const char *)sField, iFileSize, m_iMaxFileBufferSize );
  19376. return false;
  19377. }
  19378. return true;
  19379. }
  19380. /// returns file size on success, and replaces *ppField with a pointer to data
  19381. /// returns -1 on failure (and emits a warning)
  19382. int CSphSource_Document::LoadFileField ( BYTE ** ppField, CSphString & sError )
  19383. {
  19384. CSphAutofile tFileSource;
  19385. BYTE * sField = *ppField;
  19386. if ( tFileSource.Open ( (const char *)sField, SPH_O_READ, sError )==-1 )
  19387. {
  19388. sphWarning ( "docid=" DOCID_FMT ": %s", m_tDocInfo.m_iDocID, sError.cstr() );
  19389. return -1;
  19390. }
  19391. int64_t iFileSize = tFileSource.GetSize();
  19392. if ( iFileSize+16 > m_iMaxFileBufferSize )
  19393. {
  19394. sphWarning ( "docid=" DOCID_FMT ": file '%s' too big for a field (size="INT64_FMT", max_file_field_buffer=%d)",
  19395. m_tDocInfo.m_iDocID, (const char *)sField, iFileSize, m_iMaxFileBufferSize );
  19396. return -1;
  19397. }
  19398. int iFieldBytes = (int)iFileSize;
  19399. if ( !iFieldBytes )
  19400. return 0;
  19401. int iBufSize = Max ( m_iReadFileBufferSize, 1 << sphLog2 ( iFieldBytes+15 ) );
  19402. if ( m_iReadFileBufferSize < iBufSize )
  19403. SafeDeleteArray ( m_pReadFileBuffer );
  19404. if ( !m_pReadFileBuffer )
  19405. {
  19406. m_pReadFileBuffer = new char [ iBufSize ];
  19407. m_iReadFileBufferSize = iBufSize;
  19408. }
  19409. if ( !tFileSource.Read ( m_pReadFileBuffer, iFieldBytes, sError ) )
  19410. {
  19411. sphWarning ( "docid=" DOCID_FMT ": read failed: %s", m_tDocInfo.m_iDocID, sError.cstr() );
  19412. return -1;
  19413. }
  19414. m_pReadFileBuffer[iFieldBytes] = '\0';
  19415. *ppField = (BYTE*)m_pReadFileBuffer;
  19416. return iFieldBytes;
  19417. }
  19418. bool CSphSource_Document::AddAutoAttrs ( CSphString & sError )
  19419. {
  19420. // auto-computed length attributes
  19421. if ( m_bIndexFieldLens )
  19422. {
  19423. ARRAY_FOREACH ( i, m_tSchema.m_dFields )
  19424. {
  19425. CSphColumnInfo tCol;
  19426. tCol.m_sName.SetSprintf ( "%s_len", m_tSchema.m_dFields[i].m_sName.cstr() );
  19427. int iGot = m_tSchema.GetAttrIndex ( tCol.m_sName.cstr() );
  19428. if ( iGot>=0 )
  19429. {
  19430. if ( m_tSchema.GetAttr(iGot).m_eAttrType==SPH_ATTR_TOKENCOUNT )
  19431. {
  19432. // looks like we already added these
  19433. assert ( m_tSchema.GetAttr(iGot).m_sName==tCol.m_sName );
  19434. return true;
  19435. }
  19436. sError.SetSprintf ( "attribute %s conflicts with index_field_lengths=1; remove it", tCol.m_sName.cstr() );
  19437. return false;
  19438. }
  19439. tCol.m_eAttrType = SPH_ATTR_TOKENCOUNT;
  19440. m_tSchema.AddAttr ( tCol, true ); // everything's dynamic at indexing time
  19441. }
  19442. }
  19443. return true;
  19444. }
  19445. void CSphSource_Document::AllocDocinfo()
  19446. {
  19447. // tricky bit
  19448. // with in-config schema, attr storage gets allocated in Setup() when source is initially created
  19449. // so when this AddAutoAttrs() additionally changes the count, we have to change the number of attributes
  19450. // but Reset() prohibits that, because that is usually a programming mistake, hence the Swap() dance
  19451. CSphMatch tNew;
  19452. tNew.Reset ( m_tSchema.GetRowSize() );
  19453. Swap ( m_tDocInfo, tNew );
  19454. m_dStrAttrs.Resize ( m_tSchema.GetAttrsCount() );
  19455. if ( m_bIndexFieldLens )
  19456. {
  19457. int iFirst = m_tSchema.GetAttrsCount() - m_tSchema.m_dFields.GetLength();
  19458. assert ( m_tSchema.GetAttr ( iFirst ).m_eAttrType==SPH_ATTR_TOKENCOUNT );
  19459. assert ( m_tSchema.GetAttr ( iFirst+m_tSchema.m_dFields.GetLength()-1 ).m_eAttrType==SPH_ATTR_TOKENCOUNT );
  19460. m_pFieldLengthAttrs = m_tDocInfo.m_pDynamic + ( m_tSchema.GetAttr ( iFirst ).m_tLocator.m_iBitOffset / 32 );
  19461. }
  19462. }
  19463. //////////////////////////////////////////////////////////////////////////
  19464. // HIT GENERATORS
  19465. //////////////////////////////////////////////////////////////////////////
  19466. bool CSphSource_Document::BuildZoneHits ( SphDocID_t uDocid, BYTE * sWord )
  19467. {
  19468. if ( *sWord==MAGIC_CODE_SENTENCE || *sWord==MAGIC_CODE_PARAGRAPH || *sWord==MAGIC_CODE_ZONE )
  19469. {
  19470. m_tHits.AddHit ( uDocid, m_pDict->GetWordID ( (BYTE*)MAGIC_WORD_SENTENCE ), m_tState.m_iHitPos );
  19471. if ( *sWord==MAGIC_CODE_PARAGRAPH || *sWord==MAGIC_CODE_ZONE )
  19472. m_tHits.AddHit ( uDocid, m_pDict->GetWordID ( (BYTE*)MAGIC_WORD_PARAGRAPH ), m_tState.m_iHitPos );
  19473. if ( *sWord==MAGIC_CODE_ZONE )
  19474. {
  19475. BYTE * pZone = (BYTE*) m_pTokenizer->GetBufferPtr();
  19476. BYTE * pEnd = pZone;
  19477. while ( *pEnd && *pEnd!=MAGIC_CODE_ZONE )
  19478. {
  19479. pEnd++;
  19480. }
  19481. if ( *pEnd && *pEnd==MAGIC_CODE_ZONE )
  19482. {
  19483. *pEnd = '\0';
  19484. m_tHits.AddHit ( uDocid, m_pDict->GetWordID ( pZone-1 ), m_tState.m_iHitPos );
  19485. m_pTokenizer->SetBufferPtr ( (const char*) pEnd+1 );
  19486. }
  19487. }
  19488. m_tState.m_iBuildLastStep = 1;
  19489. return true;
  19490. }
  19491. return false;
  19492. }
  19493. // track blended start and reset on not blended token
  19494. static int TrackBlendedStart ( const ISphTokenizer * pTokenizer, int iBlendedHitsStart, int iHitsCount )
  19495. {
  19496. iBlendedHitsStart = ( ( pTokenizer->TokenIsBlended() || pTokenizer->TokenIsBlendedPart() ) ? iBlendedHitsStart : -1 );
  19497. if ( pTokenizer->TokenIsBlended() )
  19498. iBlendedHitsStart = iHitsCount;
  19499. return iBlendedHitsStart;
  19500. }
  19501. #define BUILD_SUBSTRING_HITS_COUNT 4
  19502. void CSphSource_Document::BuildSubstringHits ( SphDocID_t uDocid, bool bPayload, ESphWordpart eWordpart, bool bSkipEndMarker )
  19503. {
  19504. bool bPrefixField = ( eWordpart==SPH_WORDPART_PREFIX );
  19505. bool bInfixMode = m_iMinInfixLen > 0;
  19506. int iMinInfixLen = bPrefixField ? m_iMinPrefixLen : m_iMinInfixLen;
  19507. if ( !m_tState.m_bProcessingHits )
  19508. m_tState.m_iBuildLastStep = 1;
  19509. BYTE * sWord = NULL;
  19510. BYTE sBuf [ 16+3*SPH_MAX_WORD_LEN ];
  19511. int iIterHitCount = BUILD_SUBSTRING_HITS_COUNT;
  19512. if ( bPrefixField )
  19513. iIterHitCount += SPH_MAX_WORD_LEN - m_iMinPrefixLen;
  19514. else
  19515. iIterHitCount += ( ( m_iMinInfixLen+SPH_MAX_WORD_LEN ) * ( SPH_MAX_WORD_LEN-m_iMinInfixLen ) / 2 );
  19516. // FIELDEND_MASK at blended token stream should be set for HEAD token too
  19517. int iBlendedHitsStart = -1;
  19518. // index all infixes
  19519. while ( ( m_iMaxHits==0 || m_tHits.m_dData.GetLength()+iIterHitCount<m_iMaxHits )
  19520. && ( sWord = m_pTokenizer->GetToken() )!=NULL )
  19521. {
  19522. iBlendedHitsStart = TrackBlendedStart ( m_pTokenizer, iBlendedHitsStart, m_tHits.Length() );
  19523. if ( !bPayload )
  19524. {
  19525. HITMAN::AddPos ( &m_tState.m_iHitPos, m_tState.m_iBuildLastStep + m_pTokenizer->GetOvershortCount()*m_iOvershortStep );
  19526. if ( m_pTokenizer->GetBoundary() )
  19527. HITMAN::AddPos ( &m_tState.m_iHitPos, m_iBoundaryStep );
  19528. m_tState.m_iBuildLastStep = 1;
  19529. }
  19530. if ( BuildZoneHits ( uDocid, sWord ) )
  19531. continue;
  19532. int iLen = m_pTokenizer->GetLastTokenLen ();
  19533. // always index full word (with magic head/tail marker(s))
  19534. int iBytes = strlen ( (const char*)sWord );
  19535. if ( m_bIndexExactWords )
  19536. {
  19537. int iBytes = strlen ( (const char*)sWord );
  19538. memcpy ( sBuf + 1, sWord, iBytes );
  19539. sBuf[0] = MAGIC_WORD_HEAD_NONSTEMMED;
  19540. sBuf[iBytes+1] = '\0';
  19541. m_tHits.AddHit ( uDocid, m_pDict->GetWordIDNonStemmed ( sBuf ), m_tState.m_iHitPos );
  19542. }
  19543. memcpy ( sBuf + 1, sWord, iBytes );
  19544. sBuf[0] = MAGIC_WORD_HEAD;
  19545. sBuf[iBytes+1] = '\0';
  19546. // stemmed word w/markers
  19547. SphWordID_t iWord = m_pDict->GetWordIDWithMarkers ( sBuf );
  19548. if ( !iWord )
  19549. {
  19550. m_tState.m_iBuildLastStep = m_iStopwordStep;
  19551. continue;
  19552. }
  19553. m_tHits.AddHit ( uDocid, iWord, m_tState.m_iHitPos );
  19554. m_tState.m_iBuildLastStep = m_pTokenizer->TokenIsBlended() ? 0 : 1;
  19555. // restore stemmed word
  19556. int iStemmedLen = strlen ( ( const char *)sBuf );
  19557. sBuf [iStemmedLen - 1] = '\0';
  19558. // stemmed word w/o markers
  19559. if ( strcmp ( (const char *)sBuf + 1, (const char *)sWord ) )
  19560. m_tHits.AddHit ( uDocid, m_pDict->GetWordID ( sBuf + 1, iStemmedLen - 2, true ), m_tState.m_iHitPos );
  19561. // restore word
  19562. memcpy ( sBuf + 1, sWord, iBytes );
  19563. sBuf[iBytes+1] = MAGIC_WORD_TAIL;
  19564. sBuf[iBytes+2] = '\0';
  19565. // if there are no infixes, that's it
  19566. if ( iMinInfixLen > iLen )
  19567. {
  19568. // index full word
  19569. m_tHits.AddHit ( uDocid, m_pDict->GetWordID ( sWord ), m_tState.m_iHitPos );
  19570. continue;
  19571. }
  19572. // process all infixes
  19573. int iMaxStart = bPrefixField ? 0 : ( iLen - iMinInfixLen );
  19574. BYTE * sInfix = sBuf + 1;
  19575. for ( int iStart=0; iStart<=iMaxStart; iStart++ )
  19576. {
  19577. BYTE * sInfixEnd = sInfix;
  19578. for ( int i = 0; i < iMinInfixLen; i++ )
  19579. sInfixEnd += m_pTokenizer->GetCodepointLength ( *sInfixEnd );
  19580. int iMaxSubLen = ( iLen-iStart );
  19581. if ( m_iMaxSubstringLen )
  19582. iMaxSubLen = Min ( m_iMaxSubstringLen, iMaxSubLen );
  19583. for ( int i=iMinInfixLen; i<=iMaxSubLen; i++ )
  19584. {
  19585. m_tHits.AddHit ( uDocid, m_pDict->GetWordID ( sInfix, sInfixEnd-sInfix, false ), m_tState.m_iHitPos );
  19586. // word start: add magic head
  19587. if ( bInfixMode && iStart==0 )
  19588. m_tHits.AddHit ( uDocid, m_pDict->GetWordID ( sInfix - 1, sInfixEnd-sInfix + 1, false ), m_tState.m_iHitPos );
  19589. // word end: add magic tail
  19590. if ( bInfixMode && i==iLen-iStart )
  19591. m_tHits.AddHit ( uDocid, m_pDict->GetWordID ( sInfix, sInfixEnd-sInfix+1, false ), m_tState.m_iHitPos );
  19592. sInfixEnd += m_pTokenizer->GetCodepointLength ( *sInfixEnd );
  19593. }
  19594. sInfix += m_pTokenizer->GetCodepointLength ( *sInfix );
  19595. }
  19596. }
  19597. m_tState.m_bProcessingHits = ( sWord!=NULL );
  19598. // mark trailing hits
  19599. // and compute fields lengths
  19600. if ( !bSkipEndMarker && !m_tState.m_bProcessingHits && m_tHits.Length() )
  19601. {
  19602. CSphWordHit * pHit = const_cast < CSphWordHit * > ( m_tHits.Last() );
  19603. Hitpos_t uRefPos = pHit->m_iWordPos;
  19604. if ( m_pFieldLengthAttrs )
  19605. m_pFieldLengthAttrs [ HITMAN::GetField ( pHit->m_iWordPos ) ] = HITMAN::GetPos ( pHit->m_iWordPos );
  19606. for ( ; pHit>=m_tHits.First() && pHit->m_iWordPos==uRefPos; pHit-- )
  19607. HITMAN::SetEndMarker ( &pHit->m_iWordPos );
  19608. // mark blended HEAD as trailing too
  19609. if ( iBlendedHitsStart>=0 )
  19610. {
  19611. assert ( iBlendedHitsStart>=0 && iBlendedHitsStart<m_tHits.Length() );
  19612. pHit = const_cast < CSphWordHit * > ( m_tHits.First()+iBlendedHitsStart );
  19613. uRefPos = pHit->m_iWordPos;
  19614. const CSphWordHit * pEnd = m_tHits.First()+m_tHits.Length();
  19615. for ( ; pHit<pEnd && pHit->m_iWordPos==uRefPos; pHit++ )
  19616. HITMAN::SetEndMarker ( &pHit->m_iWordPos );
  19617. }
  19618. }
  19619. }
  19620. #define BUILD_REGULAR_HITS_COUNT 6
  19621. void CSphSource_Document::BuildRegularHits ( SphDocID_t uDocid, bool bPayload, bool bSkipEndMarker )
  19622. {
  19623. bool bWordDict = m_pDict->GetSettings().m_bWordDict;
  19624. bool bGlobalPartialMatch = !bWordDict && ( m_iMinPrefixLen > 0 || m_iMinInfixLen > 0 );
  19625. if ( !m_tState.m_bProcessingHits )
  19626. m_tState.m_iBuildLastStep = 1;
  19627. BYTE * sWord = NULL;
  19628. BYTE sBuf [ 16+3*SPH_MAX_WORD_LEN ];
  19629. // FIELDEND_MASK at blended token stream should be set for HEAD token too
  19630. int iBlendedHitsStart = -1;
  19631. // index words only
  19632. while ( ( m_iMaxHits==0 || m_tHits.m_dData.GetLength()+BUILD_REGULAR_HITS_COUNT<m_iMaxHits )
  19633. && ( sWord = m_pTokenizer->GetToken() )!=NULL )
  19634. {
  19635. iBlendedHitsStart = TrackBlendedStart ( m_pTokenizer, iBlendedHitsStart, m_tHits.Length() );
  19636. if ( !bPayload )
  19637. {
  19638. HITMAN::AddPos ( &m_tState.m_iHitPos, m_tState.m_iBuildLastStep + m_pTokenizer->GetOvershortCount()*m_iOvershortStep );
  19639. if ( m_pTokenizer->GetBoundary() )
  19640. HITMAN::AddPos ( &m_tState.m_iHitPos, m_iBoundaryStep );
  19641. }
  19642. if ( BuildZoneHits ( uDocid, sWord ) )
  19643. continue;
  19644. if ( bGlobalPartialMatch )
  19645. {
  19646. int iBytes = strlen ( (const char*)sWord );
  19647. memcpy ( sBuf + 1, sWord, iBytes );
  19648. sBuf[0] = MAGIC_WORD_HEAD;
  19649. sBuf[iBytes+1] = '\0';
  19650. m_tHits.AddHit ( uDocid, m_pDict->GetWordIDWithMarkers ( sBuf ), m_tState.m_iHitPos );
  19651. }
  19652. if ( m_bIndexExactWords )
  19653. {
  19654. int iBytes = strlen ( (const char*)sWord );
  19655. memcpy ( sBuf + 1, sWord, iBytes );
  19656. sBuf[0] = MAGIC_WORD_HEAD_NONSTEMMED;
  19657. sBuf[iBytes+1] = '\0';
  19658. m_tHits.AddHit ( uDocid, m_pDict->GetWordIDNonStemmed ( sBuf ), m_tState.m_iHitPos );
  19659. }
  19660. SphWordID_t iWord = m_pDict->GetWordID ( sWord );
  19661. if ( iWord )
  19662. {
  19663. #if 0
  19664. if ( HITMAN::GetPos ( m_tState.m_iHitPos )==1 )
  19665. printf ( "\n" );
  19666. printf ( "doc %d. pos %d. %s\n", uDocid, HITMAN::GetPos ( m_tState.m_iHitPos ), sWord );
  19667. #endif
  19668. m_tHits.AddHit ( uDocid, iWord, m_tState.m_iHitPos );
  19669. m_tState.m_iBuildLastStep = m_pTokenizer->TokenIsBlended() ? 0 : 1;
  19670. } else
  19671. m_tState.m_iBuildLastStep = m_iStopwordStep;
  19672. }
  19673. m_tState.m_bProcessingHits = ( sWord!=NULL );
  19674. // mark trailing hit
  19675. // and compute field lengths
  19676. if ( !bSkipEndMarker && !m_tState.m_bProcessingHits && m_tHits.Length() )
  19677. {
  19678. CSphWordHit * pHit = const_cast < CSphWordHit * > ( m_tHits.Last() );
  19679. HITMAN::SetEndMarker ( &pHit->m_iWordPos );
  19680. if ( m_pFieldLengthAttrs )
  19681. m_pFieldLengthAttrs [ HITMAN::GetField ( pHit->m_iWordPos ) ] = HITMAN::GetPos ( pHit->m_iWordPos );
  19682. // mark blended HEAD as trailing too
  19683. if ( iBlendedHitsStart>=0 )
  19684. {
  19685. assert ( iBlendedHitsStart>=0 && iBlendedHitsStart<m_tHits.Length() );
  19686. CSphWordHit * pBlendedHit = const_cast < CSphWordHit * > ( m_tHits.First() + iBlendedHitsStart );
  19687. HITMAN::SetEndMarker ( &pBlendedHit->m_iWordPos );
  19688. }
  19689. }
  19690. }
  19691. void CSphSource_Document::BuildHits ( CSphString & sError, bool bSkipEndMarker )
  19692. {
  19693. SphDocID_t uDocid = m_tDocInfo.m_iDocID;
  19694. for ( ; m_tState.m_iField<m_tState.m_iEndField; m_tState.m_iField++ )
  19695. {
  19696. if ( !m_tState.m_bProcessingHits )
  19697. {
  19698. // get that field
  19699. BYTE * sField = m_tState.m_dFields[m_tState.m_iField-m_tState.m_iStartField];
  19700. if ( !sField || !(*sField) )
  19701. continue;
  19702. // load files
  19703. int iFieldBytes;
  19704. const BYTE * sTextToIndex;
  19705. if ( m_tSchema.m_dFields[m_tState.m_iField].m_bFilename )
  19706. {
  19707. LoadFileField ( &sField, sError );
  19708. sTextToIndex = sField;
  19709. if ( m_pFieldFilter )
  19710. sTextToIndex = m_pFieldFilter->Apply ( sTextToIndex );
  19711. iFieldBytes = sTextToIndex!=sField ? m_pFieldFilter->GetResultLength() : (int) strlen ( (char*)sField );
  19712. } else
  19713. {
  19714. iFieldBytes = (int) strlen ( (char*)sField );
  19715. sTextToIndex = sField;
  19716. }
  19717. if ( iFieldBytes<=0 )
  19718. continue;
  19719. // strip html
  19720. if ( m_bStripHTML )
  19721. {
  19722. m_pStripper->Strip ( (BYTE*)sTextToIndex );
  19723. iFieldBytes = (int) strlen ( (char*)sTextToIndex );
  19724. }
  19725. // tokenize and build hits
  19726. m_tStats.m_iTotalBytes += iFieldBytes;
  19727. m_pTokenizer->SetBuffer ( (BYTE*)sTextToIndex, iFieldBytes );
  19728. m_tState.m_iHitPos = HITMAN::Create ( m_tState.m_iField, m_tState.m_iStartPos );
  19729. }
  19730. const CSphColumnInfo & tField = m_tSchema.m_dFields[m_tState.m_iField];
  19731. if ( tField.m_eWordpart!=SPH_WORDPART_WHOLE )
  19732. BuildSubstringHits ( uDocid, tField.m_bPayload, tField.m_eWordpart, bSkipEndMarker );
  19733. else
  19734. BuildRegularHits ( uDocid, tField.m_bPayload, bSkipEndMarker );
  19735. if ( m_tState.m_bProcessingHits )
  19736. break;
  19737. }
  19738. m_tState.m_bDocumentDone = !m_tState.m_bProcessingHits;
  19739. }
  19740. //////////////////////////////////////////////////////////////////////////
  19741. SphRange_t CSphSource_Document::IterateFieldMVAStart ( int iAttr )
  19742. {
  19743. SphRange_t tRange;
  19744. tRange.m_iStart = tRange.m_iLength = 0;
  19745. if ( iAttr<0 || iAttr>=m_tSchema.GetAttrsCount() )
  19746. return tRange;
  19747. const CSphColumnInfo & tMva = m_tSchema.GetAttr ( iAttr );
  19748. int uOff = MVA_DOWNSIZE ( m_tDocInfo.GetAttr ( tMva.m_tLocator ) );
  19749. if ( !uOff )
  19750. return tRange;
  19751. int iCount = m_dMva[uOff];
  19752. assert ( iCount );
  19753. tRange.m_iStart = uOff+1;
  19754. tRange.m_iLength = iCount;
  19755. return tRange;
  19756. }
  19757. static int sphAddMva64 ( CSphVector<DWORD> & dStorage, int64_t iVal )
  19758. {
  19759. int uOff = dStorage.GetLength();
  19760. dStorage.Resize ( uOff+2 );
  19761. dStorage[uOff] = MVA_DOWNSIZE ( iVal );
  19762. dStorage[uOff+1] = MVA_DOWNSIZE ( ( iVal>>32 ) & 0xffffffff );
  19763. return uOff;
  19764. }
  19765. int CSphSource_Document::ParseFieldMVA ( CSphVector < DWORD > & dMva, const char * szValue, bool bMva64 )
  19766. {
  19767. if ( !szValue )
  19768. return 0;
  19769. const char * pPtr = szValue;
  19770. const char * pDigit = NULL;
  19771. const int MAX_NUMBER_LEN = 64;
  19772. char szBuf [MAX_NUMBER_LEN];
  19773. assert ( dMva.GetLength() ); // must not have zero offset
  19774. int uOff = dMva.GetLength();
  19775. dMva.Add ( 0 ); // reserve value for count
  19776. while ( *pPtr )
  19777. {
  19778. if ( ( *pPtr>='0' && *pPtr<='9' ) || ( bMva64 && *pPtr=='-' ) )
  19779. {
  19780. if ( !pDigit )
  19781. pDigit = pPtr;
  19782. } else
  19783. {
  19784. if ( pDigit )
  19785. {
  19786. if ( pPtr - pDigit < MAX_NUMBER_LEN )
  19787. {
  19788. strncpy ( szBuf, pDigit, pPtr - pDigit );
  19789. szBuf [pPtr - pDigit] = '\0';
  19790. if ( !bMva64 )
  19791. dMva.Add ( sphToDword ( szBuf ) );
  19792. else
  19793. sphAddMva64 ( dMva, sphToInt64 ( szBuf ) );
  19794. }
  19795. pDigit = NULL;
  19796. }
  19797. }
  19798. pPtr++;
  19799. }
  19800. if ( pDigit )
  19801. {
  19802. if ( !bMva64 )
  19803. dMva.Add ( sphToDword ( pDigit ) );
  19804. else
  19805. sphAddMva64 ( dMva, sphToInt64 ( pDigit ) );
  19806. }
  19807. int iCount = dMva.GetLength()-uOff-1;
  19808. if ( !iCount )
  19809. {
  19810. dMva.Pop(); // remove reserved value for count in case of 0 MVAs
  19811. return 0;
  19812. } else
  19813. {
  19814. dMva[uOff] = iCount;
  19815. return uOff; // return offset to ( count, [value] )
  19816. }
  19817. }
  19818. /////////////////////////////////////////////////////////////////////////////
  19819. // GENERIC SQL SOURCE
  19820. /////////////////////////////////////////////////////////////////////////////
  19821. CSphSourceParams_SQL::CSphSourceParams_SQL ()
  19822. : m_iRangeStep ( 1024 )
  19823. , m_iRefRangeStep ( 1024 )
  19824. , m_bPrintQueries ( false )
  19825. , m_iRangedThrottle ( 0 )
  19826. , m_iMaxFileBufferSize ( 0 )
  19827. , m_eOnFileFieldError ( FFE_IGNORE_FIELD )
  19828. , m_iPort ( 0 )
  19829. {
  19830. }
  19831. const char * const CSphSource_SQL::MACRO_VALUES [ CSphSource_SQL::MACRO_COUNT ] =
  19832. {
  19833. "$start",
  19834. "$end"
  19835. };
  19836. CSphSource_SQL::CSphSource_SQL ( const char * sName )
  19837. : CSphSource_Document ( sName )
  19838. , m_bSqlConnected ( false )
  19839. , m_uMinID ( 0 )
  19840. , m_uMaxID ( 0 )
  19841. , m_uCurrentID ( 0 )
  19842. , m_uMaxFetchedID ( 0 )
  19843. , m_iMultiAttr ( -1 )
  19844. , m_iSqlFields ( 0 )
  19845. , m_bCanUnpack ( false )
  19846. , m_bUnpackFailed ( false )
  19847. , m_bUnpackOverflow ( false )
  19848. , m_iJoinedHitField ( -1 )
  19849. , m_iJoinedHitID ( 0 )
  19850. , m_iJoinedHitPos ( 0 )
  19851. {
  19852. }
  19853. bool CSphSource_SQL::Setup ( const CSphSourceParams_SQL & tParams )
  19854. {
  19855. // checks
  19856. assert ( !tParams.m_sQuery.IsEmpty() );
  19857. m_tParams = tParams;
  19858. // defaults
  19859. #define LOC_FIX_NULL(_arg) if ( !m_tParams._arg.cstr() ) m_tParams._arg = "";
  19860. LOC_FIX_NULL ( m_sHost );
  19861. LOC_FIX_NULL ( m_sUser );
  19862. LOC_FIX_NULL ( m_sPass );
  19863. LOC_FIX_NULL ( m_sDB );
  19864. #undef LOC_FIX_NULL
  19865. #define LOC_FIX_QARRAY(_arg) \
  19866. ARRAY_FOREACH ( i, m_tParams._arg ) \
  19867. if ( m_tParams._arg[i].IsEmpty() ) \
  19868. m_tParams._arg.Remove ( i-- );
  19869. LOC_FIX_QARRAY ( m_dQueryPre );
  19870. LOC_FIX_QARRAY ( m_dQueryPost );
  19871. LOC_FIX_QARRAY ( m_dQueryPostIndex );
  19872. #undef LOC_FIX_QARRAY
  19873. // build and store default DSN for error reporting
  19874. char sBuf [ 1024 ];
  19875. snprintf ( sBuf, sizeof(sBuf), "sql://%s:***@%s:%d/%s",
  19876. m_tParams.m_sUser.cstr(), m_tParams.m_sHost.cstr(),
  19877. m_tParams.m_iPort, m_tParams.m_sDB.cstr() );
  19878. m_sSqlDSN = sBuf;
  19879. if ( m_tParams.m_iMaxFileBufferSize > 0 )
  19880. m_iMaxFileBufferSize = m_tParams.m_iMaxFileBufferSize;
  19881. m_eOnFileFieldError = m_tParams.m_eOnFileFieldError;
  19882. return true;
  19883. }
  /// Return the index of the first macro whose name sText starts with, or -1.
  static int FindLeadingMacro ( const char * sText, const char * const * dMacroses, int iMcount )
  {
      for ( int iMacro=0; iMacro<iMcount; iMacro++ )
          if ( strncmp ( dMacroses[iMacro], sText, strlen ( dMacroses[iMacro] ) )==0 )
              return iMacro;
      return -1;
  }

  /// Replace every occurrence of the macros dMacroses[i] in sQuery with dValues[i].
  /// Only text starting with '$' is checked against the macro names; the first
  /// matching macro wins. Returns a new[]-allocated string owned by the caller.
  const char * SubstituteParams ( const char * sQuery, const char * const * dMacroses, const char ** dValues, int iMcount )
  {
      // OPTIMIZE? things can be precalculated

      // pass 1: measure the result
      int iLen = 1; // trailing zero
      for ( const char * sCur = sQuery; *sCur; )
      {
          const int iMacro = ( *sCur=='$' ) ? FindLeadingMacro ( sCur, dMacroses, iMcount ) : -1;
          if ( iMacro>=0 )
          {
              sCur += strlen ( dMacroses[iMacro] );
              iLen += strlen ( dValues[iMacro] );
          } else
          {
              sCur++;
              iLen++;
          }
      }

      // pass 2: do interpolation
      char * sRes = new char [ iLen ];
      char * sDst = sRes;
      for ( const char * sCur = sQuery; *sCur; )
      {
          const int iMacro = ( *sCur=='$' ) ? FindLeadingMacro ( sCur, dMacroses, iMcount ) : -1;
          if ( iMacro>=0 )
          {
              const size_t uValueLen = strlen ( dValues[iMacro] );
              memcpy ( sDst, dValues[iMacro], uValueLen );
              sDst += uValueLen;
              sCur += strlen ( dMacroses[iMacro] );
          } else
          {
              *sDst++ = *sCur++;
          }
      }
      *sDst++ = '\0';

      assert ( sDst-sRes==iLen );
      return sRes;
  }
  19934. bool CSphSource_SQL::RunQueryStep ( const char * sQuery, CSphString & sError )
  19935. {
  19936. sError = "";
  19937. if ( m_tParams.m_iRangeStep<=0 )
  19938. return false;
  19939. if ( m_uCurrentID>m_uMaxID )
  19940. return false;
  19941. static const int iBufSize = 32;
  19942. const char * sRes = NULL;
  19943. sphSleepMsec ( m_tParams.m_iRangedThrottle );
  19944. //////////////////////////////////////////////
  19945. // range query with $start/$end interpolation
  19946. //////////////////////////////////////////////
  19947. assert ( m_uMinID>0 );
  19948. assert ( m_uMaxID>0 );
  19949. assert ( m_uMinID<=m_uMaxID );
  19950. assert ( sQuery );
  19951. char sValues [ MACRO_COUNT ] [ iBufSize ];
  19952. const char * pValues [ MACRO_COUNT ];
  19953. SphDocID_t uNextID = Min ( m_uCurrentID + (SphDocID_t)m_tParams.m_iRangeStep - 1, m_uMaxID );
  19954. snprintf ( sValues[0], iBufSize, DOCID_FMT, m_uCurrentID );
  19955. snprintf ( sValues[1], iBufSize, DOCID_FMT, uNextID );
  19956. pValues[0] = sValues[0];
  19957. pValues[1] = sValues[1];
  19958. g_iIndexerCurrentRangeMin = m_uCurrentID;
  19959. g_iIndexerCurrentRangeMax = uNextID;
  19960. m_uCurrentID = 1 + uNextID;
  19961. sRes = SubstituteParams ( sQuery, MACRO_VALUES, pValues, MACRO_COUNT );
  19962. // run query
  19963. SqlDismissResult ();
  19964. bool bRes = SqlQuery ( sRes );
  19965. if ( !bRes )
  19966. sError.SetSprintf ( "sql_range_query: %s (DSN=%s)", SqlError(), m_sSqlDSN.cstr() );
  19967. SafeDeleteArray ( sRes );
  19968. return bRes;
  19969. }
  19970. static void HookConnect ( const char* szCommand )
  19971. {
  19972. FILE * pPipe = popen ( szCommand, "r" );
  19973. if ( !pPipe )
  19974. return;
  19975. const int MAX_BUF_SIZE = 1024;
  19976. BYTE dBuf [MAX_BUF_SIZE];
  19977. fread ( dBuf, 1, MAX_BUF_SIZE, pPipe );
  19978. pclose ( pPipe );
  19979. }
  /// Advance pBuf over leading whitespace, never moving past pBufEnd.
  /// Returns the first non-whitespace position, or pBufEnd if there is none.
  inline static const char* skipspace ( const char* pBuf, const char* pBufEnd )
  {
      assert ( pBuf );
      assert ( pBufEnd );

      // isspace() is only defined for unsigned char values (and EOF); a plain
      // char with the high bit set would be passed as a negative int otherwise
      while ( (pBuf<pBufEnd) && isspace ( (unsigned char)*pBuf ) )
          ++pBuf;

      return pBuf;
  }
  19988. inline static const char* scannumber ( const char* pBuf, const char* pBufEnd, SphDocID_t* pRes )
  19989. {
  19990. assert ( pBuf );
  19991. assert ( pBufEnd );
  19992. assert ( pRes );
  19993. if ( pBuf<pBufEnd )
  19994. {
  19995. *pRes = 0;
  19996. // FIXME! could check for overflow
  19997. while ( isdigit ( *pBuf ) && pBuf<pBufEnd )
  19998. (*pRes) = 10*(*pRes) + (int)( (*pBuf++)-'0' );
  19999. }
  20000. return pBuf;
  20001. }
  20002. static void HookQueryRange ( const char* szCommand, SphDocID_t* pMin, SphDocID_t* pMax )
  20003. {
  20004. FILE * pPipe = popen ( szCommand, "r" );
  20005. if ( !pPipe )
  20006. return;
  20007. const int MAX_BUF_SIZE = 1024;
  20008. char dBuf [MAX_BUF_SIZE];
  20009. int iRead = (int)fread ( dBuf, 1, MAX_BUF_SIZE, pPipe );
  20010. pclose ( pPipe );
  20011. const char* pStart = dBuf;
  20012. const char* pEnd = pStart + iRead;
  20013. // leading whitespace and 1-st number
  20014. pStart = skipspace ( pStart, pEnd );
  20015. pStart = scannumber ( pStart, pEnd, pMin );
  20016. // whitespace and 2-nd number
  20017. pStart = skipspace ( pStart, pEnd );
  20018. pStart = scannumber ( pStart, pEnd, pMax );
  20019. }
  20020. static void HookPostIndex ( const char* szCommand, SphDocID_t uLastIndexed )
  20021. {
  20022. const char * sMacro = "$maxid";
  20023. char sValue[32];
  20024. const char* pValue = sValue;
  20025. snprintf ( sValue, sizeof(sValue), DOCID_FMT, uLastIndexed );
  20026. const char * pCmd = SubstituteParams ( szCommand, &sMacro, &pValue, 1 );
  20027. FILE * pPipe = popen ( pCmd, "r" );
  20028. SafeDeleteArray ( pCmd );
  20029. if ( !pPipe )
  20030. return;
  20031. const int MAX_BUF_SIZE = 1024;
  20032. BYTE dBuf [MAX_BUF_SIZE];
  20033. fread ( dBuf, 1, MAX_BUF_SIZE, pPipe );
  20034. pclose ( pPipe );
  20035. }
  20036. /// connect to SQL server
  20037. bool CSphSource_SQL::Connect ( CSphString & sError )
  20038. {
  20039. // do not connect twice
  20040. if ( m_bSqlConnected )
  20041. return true;
  20042. // try to connect
  20043. if ( !SqlConnect() )
  20044. {
  20045. sError.SetSprintf ( "sql_connect: %s (DSN=%s)", SqlError(), m_sSqlDSN.cstr() );
  20046. return false;
  20047. }
  20048. m_tHits.m_dData.Reserve ( m_iMaxHits );
  20049. // all good
  20050. m_bSqlConnected = true;
  20051. if ( !m_tParams.m_sHookConnect.IsEmpty() )
  20052. HookConnect ( m_tParams.m_sHookConnect.cstr() );
  20053. return true;
  20054. }
  // error helpers: format the message into sError and return false from the
  // enclosing function (one- and two-argument variants)
  #define LOC_ERROR(_msg,_arg) \
      { \
          sError.SetSprintf ( _msg, _arg ); \
          return false; \
      }
  #define LOC_ERROR2(_msg,_arg,_arg2) \
      { \
          sError.SetSprintf ( _msg, _arg, _arg2 ); \
          return false; \
      }
  20057. /// setup them ranges (called both for document range-queries and MVA range-queries)
  20058. bool CSphSource_SQL::SetupRanges ( const char * sRangeQuery, const char * sQuery, const char * sPrefix, CSphString & sError )
  20059. {
  20060. // check step
  20061. if ( m_tParams.m_iRangeStep<=0 )
  20062. LOC_ERROR ( "sql_range_step="INT64_FMT": must be non-zero positive", m_tParams.m_iRangeStep );
  20063. if ( m_tParams.m_iRangeStep<128 )
  20064. sphWarn ( "sql_range_step="INT64_FMT": too small; might hurt indexing performance!", m_tParams.m_iRangeStep );
  20065. // check query for macros
  20066. for ( int i=0; i<MACRO_COUNT; i++ )
  20067. if ( !strstr ( sQuery, MACRO_VALUES[i] ) )
  20068. LOC_ERROR2 ( "%s: macro '%s' not found in match fetch query", sPrefix, MACRO_VALUES[i] );
  20069. // run query
  20070. if ( !SqlQuery ( sRangeQuery ) )
  20071. {
  20072. sError.SetSprintf ( "%s: range-query failed: %s (DSN=%s)", sPrefix, SqlError(), m_sSqlDSN.cstr() );
  20073. return false;
  20074. }
  20075. // fetch min/max
  20076. int iCols = SqlNumFields ();
  20077. if ( iCols!=2 )
  20078. LOC_ERROR2 ( "%s: expected 2 columns (min_id/max_id), got %d", sPrefix, iCols );
  20079. if ( !SqlFetchRow() )
  20080. {
  20081. sError.SetSprintf ( "%s: range-query fetch failed: %s (DSN=%s)", sPrefix, SqlError(), m_sSqlDSN.cstr() );
  20082. return false;
  20083. }
  20084. if ( ( SqlColumn(0)==NULL || !SqlColumn(0)[0] ) && ( SqlColumn(1)==NULL || !SqlColumn(1)[0] ) )
  20085. {
  20086. // the source seems to be empty; workaround
  20087. m_uMinID = 1;
  20088. m_uMaxID = 1;
  20089. } else
  20090. {
  20091. // get and check min/max id
  20092. const char * sCol0 = SqlColumn(0);
  20093. const char * sCol1 = SqlColumn(1);
  20094. m_uMinID = sphToDocid ( sCol0 );
  20095. m_uMaxID = sphToDocid ( sCol1 );
  20096. if ( !sCol0 ) sCol0 = "(null)";
  20097. if ( !sCol1 ) sCol1 = "(null)";
  20098. if ( m_uMinID<=0 )
  20099. LOC_ERROR ( "sql_query_range: min_id='%s': must be positive 32/64-bit unsigned integer", sCol0 );
  20100. if ( m_uMaxID<=0 )
  20101. LOC_ERROR ( "sql_query_range: max_id='%s': must be positive 32/64-bit unsigned integer", sCol1 );
  20102. if ( m_uMinID>m_uMaxID )
  20103. LOC_ERROR2 ( "sql_query_range: min_id='%s', max_id='%s': min_id must be less than max_id", sCol0, sCol1 );
  20104. }
  20105. SqlDismissResult ();
  20106. if ( !m_tParams.m_sHookQueryRange.IsEmpty() )
  20107. {
  20108. HookQueryRange ( m_tParams.m_sHookQueryRange.cstr(), &m_uMinID, &m_uMaxID );
  20109. if ( m_uMinID<=0 )
  20110. LOC_ERROR ( "hook_query_range: min_id="DOCID_FMT": must be positive 32/64-bit unsigned integer", m_uMinID );
  20111. if ( m_uMaxID<=0 )
  20112. LOC_ERROR ( "hook_query_range: max_id="DOCID_FMT": must be positive 32/64-bit unsigned integer", m_uMaxID );
  20113. if ( m_uMinID>m_uMaxID )
  20114. LOC_ERROR2 ( "hook_query_range: min_id="DOCID_FMT", max_id="DOCID_FMT": min_id must be less than max_id", m_uMinID, m_uMaxID );
  20115. }
  20116. return true;
  20117. }
  20118. /// issue main rows fetch query
  20119. bool CSphSource_SQL::IterateStart ( CSphString & sError )
  20120. {
  20121. assert ( m_bSqlConnected );
  20122. m_iNullIds = false;
  20123. m_iMaxIds = false;
  20124. // run pre-queries
  20125. ARRAY_FOREACH ( i, m_tParams.m_dQueryPre )
  20126. {
  20127. if ( !SqlQuery ( m_tParams.m_dQueryPre[i].cstr() ) )
  20128. {
  20129. sError.SetSprintf ( "sql_query_pre[%d]: %s (DSN=%s)", i, SqlError(), m_sSqlDSN.cstr() );
  20130. SqlDisconnect ();
  20131. return false;
  20132. }
  20133. SqlDismissResult ();
  20134. }
  20135. for ( ;; )
  20136. {
  20137. m_tParams.m_iRangeStep = 0;
  20138. // issue first fetch query
  20139. if ( !m_tParams.m_sQueryRange.IsEmpty() )
  20140. {
  20141. m_tParams.m_iRangeStep = m_tParams.m_iRefRangeStep;
  20142. // run range-query; setup ranges
  20143. if ( !SetupRanges ( m_tParams.m_sQueryRange.cstr(), m_tParams.m_sQuery.cstr(), "sql_query_range: ", sError ) )
  20144. return false;
  20145. // issue query
  20146. m_uCurrentID = m_uMinID;
  20147. if ( !RunQueryStep ( m_tParams.m_sQuery.cstr(), sError ) )
  20148. return false;
  20149. } else
  20150. {
  20151. // normal query; just issue
  20152. if ( !SqlQuery ( m_tParams.m_sQuery.cstr() ) )
  20153. {
  20154. sError.SetSprintf ( "sql_query: %s (DSN=%s)", SqlError(), m_sSqlDSN.cstr() );
  20155. return false;
  20156. }
  20157. }
  20158. break;
  20159. }
  20160. // some post-query setup
  20161. m_tSchema.Reset();
  20162. for ( int i=0; i<SPH_MAX_FIELDS; i++ )
  20163. m_dUnpack[i] = SPH_UNPACK_NONE;
  20164. m_iSqlFields = SqlNumFields(); // for rowdump
  20165. int iCols = SqlNumFields() - 1; // skip column 0, which must be the id
  20166. CSphVector<bool> dFound;
  20167. dFound.Resize ( m_tParams.m_dAttrs.GetLength() );
  20168. ARRAY_FOREACH ( i, dFound )
  20169. dFound[i] = false;
  20170. const bool bWordDict = m_pDict->GetSettings().m_bWordDict;
  20171. // map plain attrs from SQL
  20172. for ( int i=0; i<iCols; i++ )
  20173. {
  20174. const char * sName = SqlFieldName ( i+1 );
  20175. if ( !sName )
  20176. LOC_ERROR ( "column number %d has no name", i+1 );
  20177. CSphColumnInfo tCol ( sName );
  20178. ARRAY_FOREACH ( j, m_tParams.m_dAttrs )
  20179. if ( !strcasecmp ( tCol.m_sName.cstr(), m_tParams.m_dAttrs[j].m_sName.cstr() ) )
  20180. {
  20181. const CSphColumnInfo & tAttr = m_tParams.m_dAttrs[j];
  20182. tCol.m_eAttrType = tAttr.m_eAttrType;
  20183. assert ( tCol.m_eAttrType!=SPH_ATTR_NONE );
  20184. if ( ( tAttr.m_eAttrType==SPH_ATTR_UINT32SET || tAttr.m_eAttrType==SPH_ATTR_INT64SET ) && tAttr.m_eSrc!=SPH_ATTRSRC_FIELD )
  20185. LOC_ERROR ( "multi-valued attribute '%s' of wrong source-type found in query; must be 'field'", tAttr.m_sName.cstr() );
  20186. tCol = tAttr;
  20187. dFound[j] = true;
  20188. break;
  20189. }
  20190. ARRAY_FOREACH ( j, m_tParams.m_dFileFields )
  20191. {
  20192. if ( !strcasecmp ( tCol.m_sName.cstr(), m_tParams.m_dFileFields[j].cstr() ) )
  20193. tCol.m_bFilename = true;
  20194. }
  20195. tCol.m_iIndex = i+1;
  20196. tCol.m_eWordpart = GetWordpart ( tCol.m_sName.cstr(), bWordDict );
  20197. if ( tCol.m_eAttrType==SPH_ATTR_NONE || tCol.m_bIndexed )
  20198. {
  20199. m_tSchema.m_dFields.Add ( tCol );
  20200. ARRAY_FOREACH ( k, m_tParams.m_dUnpack )
  20201. {
  20202. CSphUnpackInfo & tUnpack = m_tParams.m_dUnpack[k];
  20203. if ( tUnpack.m_sName==tCol.m_sName )
  20204. {
  20205. if ( !m_bCanUnpack )
  20206. {
  20207. sError.SetSprintf ( "this source does not support column unpacking" );
  20208. return false;
  20209. }
  20210. int iIndex = m_tSchema.m_dFields.GetLength() - 1;
  20211. if ( iIndex < SPH_MAX_FIELDS )
  20212. {
  20213. m_dUnpack[iIndex] = tUnpack.m_eFormat;
  20214. m_dUnpackBuffers[iIndex].Resize ( SPH_UNPACK_BUFFER_SIZE );
  20215. }
  20216. break;
  20217. }
  20218. }
  20219. }
  20220. if ( tCol.m_eAttrType!=SPH_ATTR_NONE )
  20221. m_tSchema.AddAttr ( tCol, true ); // all attributes are dynamic at indexing time
  20222. }
  20223. // map multi-valued attrs
  20224. ARRAY_FOREACH ( i, m_tParams.m_dAttrs )
  20225. {
  20226. const CSphColumnInfo & tAttr = m_tParams.m_dAttrs[i];
  20227. if ( ( tAttr.m_eAttrType==SPH_ATTR_UINT32SET || tAttr.m_eAttrType==SPH_ATTR_INT64SET ) && tAttr.m_eSrc!=SPH_ATTRSRC_FIELD )
  20228. {
  20229. m_tSchema.AddAttr ( tAttr, true ); // all attributes are dynamic at indexing time
  20230. dFound[i] = true;
  20231. }
  20232. }
  20233. // warn if some attrs went unmapped
  20234. ARRAY_FOREACH ( i, dFound )
  20235. if ( !dFound[i] )
  20236. sphWarn ( "attribute '%s' not found - IGNORING", m_tParams.m_dAttrs[i].m_sName.cstr() );
  20237. // joined fields
  20238. m_iPlainFieldsLength = m_tSchema.m_dFields.GetLength();
  20239. CSphColumnInfo tCol;
  20240. tCol.m_iIndex = -1;
  20241. ARRAY_FOREACH ( i, m_tParams.m_dJoinedFields )
  20242. {
  20243. tCol.m_sName = m_tParams.m_dJoinedFields[i].m_sName;
  20244. tCol.m_sQuery = m_tParams.m_dJoinedFields[i].m_sQuery;
  20245. tCol.m_bPayload = m_tParams.m_dJoinedFields[i].m_bPayload;
  20246. tCol.m_eSrc = m_tParams.m_dJoinedFields[i].m_sRanged.IsEmpty() ? SPH_ATTRSRC_QUERY : SPH_ATTRSRC_RANGEDQUERY;
  20247. tCol.m_sQueryRange = m_tParams.m_dJoinedFields[i].m_sRanged;
  20248. tCol.m_eWordpart = GetWordpart ( tCol.m_sName.cstr(), bWordDict );
  20249. m_tSchema.m_dFields.Add ( tCol );
  20250. }
  20251. // auto-computed length attributes
  20252. if ( !AddAutoAttrs ( sError ) )
  20253. return false;
  20254. // alloc storage
  20255. AllocDocinfo();
  20256. // check it
  20257. if ( m_tSchema.m_dFields.GetLength()>SPH_MAX_FIELDS )
  20258. LOC_ERROR2 ( "too many fields (fields=%d, max=%d)",
  20259. m_tSchema.m_dFields.GetLength(), SPH_MAX_FIELDS );
  20260. // log it
  20261. if ( m_fpDumpRows )
  20262. {
  20263. const char * sTable = m_tSchema.m_sName.cstr();
  20264. time_t iNow = time ( NULL );
  20265. fprintf ( m_fpDumpRows, "#\n# === source %s ts %d\n# %s#\n", sTable, (int)iNow, ctime ( &iNow ) );
  20266. ARRAY_FOREACH ( i, m_tSchema.m_dFields )
  20267. fprintf ( m_fpDumpRows, "# field %d: %s\n", i, m_tSchema.m_dFields[i].m_sName.cstr() );
  20268. for ( int i=0; i<m_tSchema.GetAttrsCount(); i++ )
  20269. {
  20270. const CSphColumnInfo & tCol = m_tSchema.GetAttr(i);
  20271. fprintf ( m_fpDumpRows, "# %s = %s # attr %d\n", sphTypeDirective ( tCol.m_eAttrType ), tCol.m_sName.cstr(), i );
  20272. }
  20273. fprintf ( m_fpDumpRows, "#\n\nDROP TABLE IF EXISTS rows_%s;\nCREATE TABLE rows_%s (\n id VARCHAR(32) NOT NULL,\n",
  20274. sTable, sTable );
  20275. for ( int i=1; i<m_iSqlFields; i++ )
  20276. fprintf ( m_fpDumpRows, " %s VARCHAR(4096) NOT NULL,\n", SqlFieldName(i) );
  20277. fprintf ( m_fpDumpRows, " KEY(id) );\n\n" );
  20278. }
  20279. return true;
  20280. }
  20281. #undef LOC_ERROR
  20282. #undef LOC_ERROR2
  20283. #undef LOC_SQL_ERROR
  20284. void CSphSource_SQL::Disconnect ()
  20285. {
  20286. SafeDeleteArray ( m_pReadFileBuffer );
  20287. m_tHits.m_dData.Reset();
  20288. if ( m_iNullIds )
  20289. sphWarn ( "source %s: skipped %d document(s) with zero/NULL ids", m_tSchema.m_sName.cstr(), m_iNullIds );
  20290. if ( m_iMaxIds )
  20291. sphWarn ( "source %s: skipped %d document(s) with DOCID_MAX ids", m_tSchema.m_sName.cstr(), m_iMaxIds );
  20292. m_iNullIds = 0;
  20293. m_iMaxIds = 0;
  20294. if ( m_bSqlConnected )
  20295. SqlDisconnect ();
  20296. m_bSqlConnected = false;
  20297. }
  20298. BYTE ** CSphSource_SQL::NextDocument ( CSphString & sError )
  20299. {
  20300. // PROFILE ( src_sql );
  20301. assert ( m_bSqlConnected );
  20302. // get next non-zero-id row
  20303. do
  20304. {
  20305. // try to get next row
  20306. bool bGotRow = SqlFetchRow ();
  20307. // when the party's over...
  20308. while ( !bGotRow )
  20309. {
  20310. // is that an error?
  20311. if ( SqlIsError() )
  20312. {
  20313. sError.SetSprintf ( "sql_fetch_row: %s", SqlError() );
  20314. m_tDocInfo.m_iDocID = 1; // 0 means legal eof
  20315. return NULL;
  20316. }
  20317. // maybe we can do next step yet?
  20318. if ( !RunQueryStep ( m_tParams.m_sQuery.cstr(), sError ) )
  20319. {
  20320. // if there's a message, there's an error
  20321. // otherwise, we're just over
  20322. if ( !sError.IsEmpty() )
  20323. {
  20324. m_tDocInfo.m_iDocID = 1; // 0 means legal eof
  20325. return NULL;
  20326. }
  20327. } else
  20328. {
  20329. // step went fine; try to fetch
  20330. bGotRow = SqlFetchRow ();
  20331. continue;
  20332. }
  20333. SqlDismissResult ();
  20334. // ok, we're over
  20335. ARRAY_FOREACH ( i, m_tParams.m_dQueryPost )
  20336. {
  20337. if ( !SqlQuery ( m_tParams.m_dQueryPost[i].cstr() ) )
  20338. {
  20339. sphWarn ( "sql_query_post[%d]: error=%s, query=%s",
  20340. i, SqlError(), m_tParams.m_dQueryPost[i].cstr() );
  20341. break;
  20342. }
  20343. SqlDismissResult ();
  20344. }
  20345. m_tDocInfo.m_iDocID = 0; // 0 means legal eof
  20346. return NULL;
  20347. }
  20348. // get him!
  20349. m_tDocInfo.m_iDocID = VerifyID ( sphToDocid ( SqlColumn(0) ) );
  20350. m_uMaxFetchedID = Max ( m_uMaxFetchedID, m_tDocInfo.m_iDocID );
  20351. } while ( !m_tDocInfo.m_iDocID );
  20352. // cleanup attrs
  20353. for ( int i=0; i<m_tSchema.GetRowSize(); i++ )
  20354. m_tDocInfo.m_pDynamic[i] = 0;
  20355. // split columns into fields and attrs
  20356. for ( int i=0; i<m_iPlainFieldsLength; i++ )
  20357. {
  20358. // get that field
  20359. #if USE_ZLIB
  20360. if ( m_dUnpack[i]!=SPH_UNPACK_NONE )
  20361. {
  20362. m_dFields[i] = (BYTE*) SqlUnpackColumn ( i, m_dUnpack[i] );
  20363. continue;
  20364. }
  20365. #endif
  20366. m_dFields[i] = (BYTE*) SqlColumn ( m_tSchema.m_dFields[i].m_iIndex );
  20367. }
  20368. for ( int i=0; i<m_tSchema.GetAttrsCount(); i++ )
  20369. {
  20370. const CSphColumnInfo & tAttr = m_tSchema.GetAttr(i); // shortcut
  20371. if ( tAttr.m_eAttrType==SPH_ATTR_UINT32SET || tAttr.m_eAttrType==SPH_ATTR_INT64SET )
  20372. {
  20373. int uOff = 0;
  20374. if ( tAttr.m_eSrc==SPH_ATTRSRC_FIELD )
  20375. {
  20376. uOff = ParseFieldMVA ( m_dMva, SqlColumn ( tAttr.m_iIndex ), tAttr.m_eAttrType==SPH_ATTR_INT64SET );
  20377. }
  20378. m_tDocInfo.SetAttr ( tAttr.m_tLocator, uOff );
  20379. continue;
  20380. }
  20381. switch ( tAttr.m_eAttrType )
  20382. {
  20383. case SPH_ATTR_ORDINAL:
  20384. case SPH_ATTR_STRING:
  20385. case SPH_ATTR_JSON:
  20386. case SPH_ATTR_WORDCOUNT:
  20387. // memorize string, fixup NULLs
  20388. m_dStrAttrs[i] = SqlColumn ( tAttr.m_iIndex );
  20389. if ( !m_dStrAttrs[i].cstr() )
  20390. m_dStrAttrs[i] = "";
  20391. m_tDocInfo.SetAttr ( tAttr.m_tLocator, 0 );
  20392. break;
  20393. case SPH_ATTR_FLOAT:
  20394. m_tDocInfo.SetAttrFloat ( tAttr.m_tLocator, sphToFloat ( SqlColumn ( tAttr.m_iIndex ) ) ); // FIXME? report conversion errors maybe?
  20395. break;
  20396. case SPH_ATTR_BIGINT:
  20397. m_tDocInfo.SetAttr ( tAttr.m_tLocator, sphToInt64 ( SqlColumn ( tAttr.m_iIndex ) ) ); // FIXME? report conversion errors maybe?
  20398. break;
  20399. case SPH_ATTR_TOKENCOUNT:
  20400. // reset, and the value will be filled by IterateHits()
  20401. m_tDocInfo.SetAttr ( tAttr.m_tLocator, 0 );
  20402. break;
  20403. default:
  20404. // just store as uint by default
  20405. m_tDocInfo.SetAttr ( tAttr.m_tLocator, sphToDword ( SqlColumn ( tAttr.m_iIndex ) ) ); // FIXME? report conversion errors maybe?
  20406. break;
  20407. }
  20408. }
  20409. // log it
  20410. if ( m_fpDumpRows )
  20411. {
  20412. fprintf ( m_fpDumpRows, "INSERT INTO rows_%s VALUES (", m_tSchema.m_sName.cstr() );
  20413. for ( int i=0; i<m_iSqlFields; i++ )
  20414. {
  20415. if ( i )
  20416. fprintf ( m_fpDumpRows, ", " );
  20417. FormatEscaped ( m_fpDumpRows, SqlColumn(i) );
  20418. }
  20419. fprintf ( m_fpDumpRows, ");\n" );
  20420. }
  20421. return m_dFields;
  20422. }
  20423. void CSphSource_SQL::PostIndex ()
  20424. {
  20425. if ( ( !m_tParams.m_dQueryPostIndex.GetLength() ) && m_tParams.m_sHookPostIndex.IsEmpty() )
  20426. return;
  20427. assert ( !m_bSqlConnected );
  20428. const char * sSqlError = NULL;
  20429. if ( m_tParams.m_dQueryPostIndex.GetLength() )
  20430. {
  20431. #define LOC_SQL_ERROR(_msg) { sSqlError = _msg; break; }
  20432. for ( ;; )
  20433. {
  20434. if ( !SqlConnect () )
  20435. LOC_SQL_ERROR ( "mysql_real_connect" );
  20436. ARRAY_FOREACH ( i, m_tParams.m_dQueryPostIndex )
  20437. {
  20438. char * sQuery = sphStrMacro ( m_tParams.m_dQueryPostIndex[i].cstr(), "$maxid", m_uMaxFetchedID );
  20439. bool bRes = SqlQuery ( sQuery );
  20440. delete [] sQuery;
  20441. if ( !bRes )
  20442. LOC_SQL_ERROR ( "sql_query_post_index" );
  20443. SqlDismissResult ();
  20444. }
  20445. break;
  20446. }
  20447. if ( sSqlError )
  20448. sphWarn ( "%s: %s (DSN=%s)", sSqlError, SqlError(), m_sSqlDSN.cstr() );
  20449. #undef LOC_SQL_ERROR
  20450. SqlDisconnect ();
  20451. }
  20452. if ( !m_tParams.m_sHookPostIndex.IsEmpty() )
  20453. {
  20454. HookPostIndex ( m_tParams.m_sHookPostIndex.cstr(), m_uMaxFetchedID );
  20455. }
  20456. }
  20457. bool CSphSource_SQL::IterateMultivaluedStart ( int iAttr, CSphString & sError )
  20458. {
  20459. if ( iAttr<0 || iAttr>=m_tSchema.GetAttrsCount() )
  20460. return false;
  20461. m_iMultiAttr = iAttr;
  20462. const CSphColumnInfo & tAttr = m_tSchema.GetAttr(iAttr);
  20463. if ( !(tAttr.m_eAttrType==SPH_ATTR_UINT32SET || tAttr.m_eAttrType==SPH_ATTR_INT64SET ) )
  20464. return false;
  20465. CSphString sPrefix;
  20466. switch ( tAttr.m_eSrc )
  20467. {
  20468. case SPH_ATTRSRC_FIELD:
  20469. return false;
  20470. case SPH_ATTRSRC_QUERY:
  20471. // run simple query
  20472. if ( !SqlQuery ( tAttr.m_sQuery.cstr() ) )
  20473. {
  20474. sError.SetSprintf ( "multi-valued attr '%s' query failed: %s", tAttr.m_sName.cstr(), SqlError() );
  20475. return false;
  20476. }
  20477. break;
  20478. case SPH_ATTRSRC_RANGEDQUERY:
  20479. m_tParams.m_iRangeStep = m_tParams.m_iRefRangeStep;
  20480. // setup ranges
  20481. sPrefix.SetSprintf ( "multi-valued attr '%s' ranged query: ", tAttr.m_sName.cstr() );
  20482. if ( !SetupRanges ( tAttr.m_sQueryRange.cstr(), tAttr.m_sQuery.cstr(), sPrefix.cstr(), sError ) )
  20483. return false;
  20484. // run first step (in order to report errors)
  20485. m_uCurrentID = m_uMinID;
  20486. if ( !RunQueryStep ( tAttr.m_sQuery.cstr(), sError ) )
  20487. return false;
  20488. break;
  20489. default:
  20490. sError.SetSprintf ( "INTERNAL ERROR: unknown multi-valued attr source type %d", tAttr.m_eSrc );
  20491. return false;
  20492. }
  20493. // check fields count
  20494. if ( SqlNumFields()!=2 )
  20495. {
  20496. sError.SetSprintf ( "multi-valued attr '%s' query returned %d fields (expected 2)", tAttr.m_sName.cstr(), SqlNumFields() );
  20497. SqlDismissResult ();
  20498. return false;
  20499. }
  20500. return true;
  20501. }
  20502. bool CSphSource_SQL::IterateMultivaluedNext ()
  20503. {
  20504. const CSphColumnInfo & tAttr = m_tSchema.GetAttr ( m_iMultiAttr );
  20505. assert ( m_bSqlConnected );
  20506. assert ( tAttr.m_eAttrType==SPH_ATTR_UINT32SET || tAttr.m_eAttrType==SPH_ATTR_INT64SET );
  20507. // fetch next row
  20508. bool bGotRow = SqlFetchRow ();
  20509. while ( !bGotRow )
  20510. {
  20511. if ( SqlIsError() )
  20512. sphDie ( "sql_fetch_row: %s", SqlError() ); // FIXME! this should be reported
  20513. if ( tAttr.m_eSrc!=SPH_ATTRSRC_RANGEDQUERY )
  20514. return false;
  20515. CSphString sTmp;
  20516. if ( !RunQueryStep ( tAttr.m_sQuery.cstr(), sTmp ) ) // FIXME! this should be reported
  20517. return false;
  20518. bGotRow = SqlFetchRow ();
  20519. continue;
  20520. }
  20521. // return that tuple or offset to storage for MVA64 value
  20522. m_tDocInfo.m_iDocID = sphToDocid ( SqlColumn(0) );
  20523. m_dMva.Resize ( 0 );
  20524. if ( tAttr.m_eAttrType==SPH_ATTR_UINT32SET )
  20525. m_dMva.Add ( sphToDword ( SqlColumn(1) ) );
  20526. else
  20527. sphAddMva64 ( m_dMva, sphToInt64 ( SqlColumn(1) ) );
  20528. return true;
  20529. }
  20530. bool CSphSource_SQL::IterateKillListStart ( CSphString & sError )
  20531. {
  20532. if ( m_tParams.m_sQueryKilllist.IsEmpty () )
  20533. return false;
  20534. if ( !SqlQuery ( m_tParams.m_sQueryKilllist.cstr () ) )
  20535. {
  20536. sError.SetSprintf ( "killlist query failed: %s", SqlError() );
  20537. return false;
  20538. }
  20539. return true;
  20540. }
  20541. bool CSphSource_SQL::IterateKillListNext ( SphDocID_t & tDocId )
  20542. {
  20543. if ( SqlFetchRow () )
  20544. tDocId = sphToDocid ( SqlColumn(0) );
  20545. else
  20546. {
  20547. if ( SqlIsError() )
  20548. sphDie ( "sql_query_killlist: %s", SqlError() ); // FIXME! this should be reported
  20549. else
  20550. {
  20551. SqlDismissResult ();
  20552. return false;
  20553. }
  20554. }
  20555. return true;
  20556. }
  20557. void CSphSource_SQL::ReportUnpackError ( int iIndex, int iError )
  20558. {
  20559. if ( !m_bUnpackFailed )
  20560. {
  20561. m_bUnpackFailed = true;
  20562. sphWarn ( "failed to unpack column '%s', error=%d, docid=" DOCID_FMT, SqlFieldName(iIndex), iError, m_tDocInfo.m_iDocID );
  20563. }
  20564. }
  20565. #if !USE_ZLIB
  20566. const char * CSphSource_SQL::SqlUnpackColumn ( int iFieldIndex, ESphUnpackFormat )
  20567. {
  20568. return SqlColumn ( m_tSchema.m_dFields[iFieldIndex].m_iIndex );
  20569. }
  20570. #else
  20571. const char * CSphSource_SQL::SqlUnpackColumn ( int iFieldIndex, ESphUnpackFormat eFormat )
  20572. {
  20573. int iIndex = m_tSchema.m_dFields[iFieldIndex].m_iIndex;
  20574. const char * pData = SqlColumn(iIndex);
  20575. if ( pData==NULL )
  20576. return NULL;
  20577. int iPackedLen = SqlColumnLength(iIndex);
  20578. if ( iPackedLen<=0 )
  20579. return NULL;
  20580. CSphVector<char> & tBuffer = m_dUnpackBuffers[iFieldIndex];
  20581. switch ( eFormat )
  20582. {
  20583. case SPH_UNPACK_MYSQL_COMPRESS:
  20584. {
  20585. if ( iPackedLen<=4 )
  20586. {
  20587. if ( !m_bUnpackFailed )
  20588. {
  20589. m_bUnpackFailed = true;
  20590. sphWarn ( "failed to unpack '%s', invalid column size (size=%d), "
  20591. "docid="DOCID_FMT, SqlFieldName(iIndex), iPackedLen, m_tDocInfo.m_iDocID );
  20592. }
  20593. return NULL;
  20594. }
  20595. unsigned long uSize = 0;
  20596. for ( int i=0; i<4; i++ )
  20597. uSize += ((unsigned long)((BYTE)pData[i])) << ( 8*i );
  20598. uSize &= 0x3FFFFFFF;
  20599. if ( uSize > m_tParams.m_uUnpackMemoryLimit )
  20600. {
  20601. if ( !m_bUnpackOverflow )
  20602. {
  20603. m_bUnpackOverflow = true;
  20604. sphWarn ( "failed to unpack '%s', column size limit exceeded (size=%d),"
  20605. " docid="DOCID_FMT, SqlFieldName(iIndex), (int)uSize, m_tDocInfo.m_iDocID );
  20606. }
  20607. return NULL;
  20608. }
  20609. int iResult;
  20610. tBuffer.Resize ( uSize + 1 );
  20611. unsigned long uLen = iPackedLen-4;
  20612. iResult = uncompress ( (Bytef *)tBuffer.Begin(), &uSize, (Bytef *)pData + 4, uLen );
  20613. if ( iResult==Z_OK )
  20614. {
  20615. tBuffer[uSize] = 0;
  20616. return &tBuffer[0];
  20617. } else
  20618. ReportUnpackError ( iIndex, iResult );
  20619. return NULL;
  20620. }
  20621. case SPH_UNPACK_ZLIB:
  20622. {
  20623. char * sResult = 0;
  20624. int iBufferOffset = 0;
  20625. int iResult;
  20626. z_stream tStream;
  20627. tStream.zalloc = Z_NULL;
  20628. tStream.zfree = Z_NULL;
  20629. tStream.opaque = Z_NULL;
  20630. tStream.avail_in = iPackedLen;
  20631. tStream.next_in = (Bytef *)SqlColumn(iIndex);
  20632. iResult = inflateInit ( &tStream );
  20633. if ( iResult!=Z_OK )
  20634. return NULL;
  20635. for ( ;; )
  20636. {
  20637. tStream.next_out = (Bytef *)&tBuffer[iBufferOffset];
  20638. tStream.avail_out = tBuffer.GetLength() - iBufferOffset - 1;
  20639. iResult = inflate ( &tStream, Z_NO_FLUSH );
  20640. if ( iResult==Z_STREAM_END )
  20641. {
  20642. tBuffer [ tStream.total_out ] = 0;
  20643. sResult = &tBuffer[0];
  20644. break;
  20645. } else if ( iResult==Z_OK )
  20646. {
  20647. assert ( tStream.avail_out==0 );
  20648. tBuffer.Resize ( tBuffer.GetLength()*2 );
  20649. iBufferOffset = tStream.total_out;
  20650. } else
  20651. {
  20652. ReportUnpackError ( iIndex, iResult );
  20653. break;
  20654. }
  20655. }
  20656. inflateEnd ( &tStream );
  20657. return sResult;
  20658. }
  20659. case SPH_UNPACK_NONE:
  20660. return pData;
  20661. }
  20662. return NULL;
  20663. }
  20664. #endif // USE_ZLIB
  20665. ISphHits * CSphSource_SQL::IterateJoinedHits ( CSphString & sError )
  20666. {
  20667. m_tHits.m_dData.Resize ( 0 );
  20668. // eof check
  20669. if ( m_iJoinedHitField>=m_tSchema.m_dFields.GetLength() )
  20670. {
  20671. m_tDocInfo.m_iDocID = 0;
  20672. return &m_tHits;
  20673. }
  20674. bool bProcessingRanged = true;
  20675. // my fetch loop
  20676. while ( m_iJoinedHitField<m_tSchema.m_dFields.GetLength() )
  20677. {
  20678. if ( m_tState.m_bProcessingHits || SqlFetchRow() )
  20679. {
  20680. // next row
  20681. m_tDocInfo.m_iDocID = sphToDocid ( SqlColumn(0) ); // FIXME! handle conversion errors and zero/max values?
  20682. // field start? restart ids
  20683. if ( !m_iJoinedHitID )
  20684. m_iJoinedHitID = m_tDocInfo.m_iDocID;
  20685. // docid asc requirement violated? report an error
  20686. if ( m_iJoinedHitID>m_tDocInfo.m_iDocID )
  20687. {
  20688. sError.SetSprintf ( "joined field '%s': query MUST return document IDs in ASC order",
  20689. m_tSchema.m_dFields[m_iJoinedHitField].m_sName.cstr() );
  20690. return NULL;
  20691. }
  20692. // next document? update tracker, reset position
  20693. if ( m_iJoinedHitID<m_tDocInfo.m_iDocID )
  20694. {
  20695. m_iJoinedHitID = m_tDocInfo.m_iDocID;
  20696. m_iJoinedHitPos = 0;
  20697. }
  20698. if ( !m_tState.m_bProcessingHits )
  20699. {
  20700. m_tState = CSphBuildHitsState_t();
  20701. m_tState.m_iField = m_iJoinedHitField;
  20702. m_tState.m_iStartField = m_iJoinedHitField;
  20703. m_tState.m_iEndField = m_iJoinedHitField+1;
  20704. if ( m_tSchema.m_dFields[m_iJoinedHitField].m_bPayload )
  20705. m_tState.m_iStartPos = sphToDword ( SqlColumn(2) );
  20706. else
  20707. m_tState.m_iStartPos = m_iJoinedHitPos;
  20708. }
  20709. // build those hits
  20710. BYTE * dText[] = { (BYTE *)SqlColumn(1) };
  20711. m_tState.m_dFields = dText;
  20712. BuildHits ( sError, true );
  20713. // update current position
  20714. if ( !m_tSchema.m_dFields[m_iJoinedHitField].m_bPayload && !m_tState.m_bProcessingHits && m_tHits.Length() )
  20715. m_iJoinedHitPos = HITMAN::GetPos ( m_tHits.Last()->m_iWordPos );
  20716. if ( m_tState.m_bProcessingHits )
  20717. break;
  20718. } else if ( SqlIsError() )
  20719. {
  20720. // error while fetching row
  20721. sError = SqlError();
  20722. return NULL;
  20723. } else
  20724. {
  20725. int iLastField = m_iJoinedHitField;
  20726. bool bRanged = ( m_iJoinedHitField>=m_iPlainFieldsLength && m_iJoinedHitField<m_tSchema.m_dFields.GetLength()
  20727. && m_tSchema.m_dFields[m_iJoinedHitField].m_eSrc==SPH_ATTRSRC_RANGEDQUERY );
  20728. // current field is over, continue to next field
  20729. if ( m_iJoinedHitField<0 )
  20730. m_iJoinedHitField = m_iPlainFieldsLength;
  20731. else if ( !bRanged || !bProcessingRanged )
  20732. m_iJoinedHitField++;
  20733. // eof check
  20734. if ( m_iJoinedHitField>=m_tSchema.m_dFields.GetLength() )
  20735. {
  20736. m_tDocInfo.m_iDocID = ( m_tHits.Length() ? 1 : 0 ); // to eof or not to eof
  20737. return &m_tHits;
  20738. }
  20739. SqlDismissResult ();
  20740. bProcessingRanged = false;
  20741. bool bCheckNumFields = true;
  20742. CSphColumnInfo & tJoined = m_tSchema.m_dFields[m_iJoinedHitField];
  20743. // start fetching next field
  20744. if ( tJoined.m_eSrc!=SPH_ATTRSRC_RANGEDQUERY )
  20745. {
  20746. if ( !SqlQuery ( tJoined.m_sQuery.cstr() ) )
  20747. {
  20748. sError = SqlError();
  20749. return NULL;
  20750. }
  20751. } else
  20752. {
  20753. m_tParams.m_iRangeStep = m_tParams.m_iRefRangeStep;
  20754. // setup ranges for next field
  20755. if ( iLastField!=m_iJoinedHitField )
  20756. {
  20757. CSphString sPrefix;
  20758. sPrefix.SetSprintf ( "joined field '%s' ranged query: ", tJoined.m_sName.cstr() );
  20759. if ( !SetupRanges ( tJoined.m_sQueryRange.cstr(), tJoined.m_sQuery.cstr(), sPrefix.cstr(), sError ) )
  20760. return NULL;
  20761. m_uCurrentID = m_uMinID;
  20762. }
  20763. // run first step (in order to report errors)
  20764. bool bRes = RunQueryStep ( tJoined.m_sQuery.cstr(), sError );
  20765. bProcessingRanged = bRes; // select next documents in range or loop once to process next field
  20766. bCheckNumFields = bRes;
  20767. if ( !sError.IsEmpty() )
  20768. return NULL;
  20769. }
  20770. const int iExpected = m_tSchema.m_dFields[m_iJoinedHitField].m_bPayload ? 3 : 2;
  20771. if ( bCheckNumFields && SqlNumFields()!=iExpected )
  20772. {
  20773. const char * sName = m_tSchema.m_dFields[m_iJoinedHitField].m_sName.cstr();
  20774. sError.SetSprintf ( "joined field '%s': query MUST return exactly %d columns, got %d", sName, iExpected, SqlNumFields() );
  20775. return NULL;
  20776. }
  20777. m_iJoinedHitID = 0;
  20778. m_iJoinedHitPos = 0;
  20779. }
  20780. }
  20781. return &m_tHits;
  20782. }
  20783. /////////////////////////////////////////////////////////////////////////////
  20784. // MYSQL SOURCE
  20785. /////////////////////////////////////////////////////////////////////////////
  20786. #if USE_MYSQL
  20787. CSphSourceParams_MySQL::CSphSourceParams_MySQL ()
  20788. : m_iFlags ( 0 )
  20789. {
  20790. m_iPort = 3306;
  20791. }
  20792. CSphSource_MySQL::CSphSource_MySQL ( const char * sName )
  20793. : CSphSource_SQL ( sName )
  20794. , m_pMysqlResult ( NULL )
  20795. , m_pMysqlFields ( NULL )
  20796. , m_tMysqlRow ( NULL )
  20797. , m_pMysqlLengths ( NULL )
  20798. {
  20799. m_bCanUnpack = true;
  20800. }
  20801. void CSphSource_MySQL::SqlDismissResult ()
  20802. {
  20803. if ( !m_pMysqlResult )
  20804. return;
  20805. while ( m_pMysqlResult )
  20806. {
  20807. mysql_free_result ( m_pMysqlResult );
  20808. m_pMysqlResult = NULL;
  20809. // stored procedures might return multiple result sets
  20810. // FIXME? we might want to index all of them
  20811. // but for now, let's simply dismiss additional result sets
  20812. if ( mysql_next_result ( &m_tMysqlDriver )==0 )
  20813. {
  20814. m_pMysqlResult = mysql_use_result ( &m_tMysqlDriver );
  20815. static bool bOnce = false;
  20816. if ( !bOnce && m_pMysqlResult && mysql_num_rows ( m_pMysqlResult ) )
  20817. {
  20818. sphWarn ( "indexing of multiple result sets is not supported yet; some results sets were dismissed!" );
  20819. bOnce = true;
  20820. }
  20821. }
  20822. }
  20823. m_pMysqlFields = NULL;
  20824. m_pMysqlLengths = NULL;
  20825. }
  20826. bool CSphSource_MySQL::SqlQuery ( const char * sQuery )
  20827. {
  20828. if ( mysql_query ( &m_tMysqlDriver, sQuery ) )
  20829. {
  20830. if ( m_tParams.m_bPrintQueries )
  20831. fprintf ( stdout, "SQL-QUERY: %s: FAIL\n", sQuery );
  20832. return false;
  20833. }
  20834. if ( m_tParams.m_bPrintQueries )
  20835. fprintf ( stdout, "SQL-QUERY: %s: ok\n", sQuery );
  20836. m_pMysqlResult = mysql_use_result ( &m_tMysqlDriver );
  20837. m_pMysqlFields = NULL;
  20838. return true;
  20839. }
  20840. bool CSphSource_MySQL::SqlIsError ()
  20841. {
  20842. return mysql_errno ( &m_tMysqlDriver )!=0;
  20843. }
  20844. const char * CSphSource_MySQL::SqlError ()
  20845. {
  20846. return mysql_error ( &m_tMysqlDriver );
  20847. }
  20848. bool CSphSource_MySQL::SqlConnect ()
  20849. {
  20850. mysql_init ( &m_tMysqlDriver );
  20851. if ( !m_sSslKey.IsEmpty() || !m_sSslCert.IsEmpty() || !m_sSslCA.IsEmpty() )
  20852. mysql_ssl_set ( &m_tMysqlDriver, m_sSslKey.cstr(), m_sSslCert.cstr(), m_sSslCA.cstr(), NULL, NULL );
  20853. m_iMysqlConnectFlags |= CLIENT_MULTI_RESULTS; // we now know how to handle this
  20854. bool bRes = ( NULL!=mysql_real_connect ( &m_tMysqlDriver,
  20855. m_tParams.m_sHost.cstr(), m_tParams.m_sUser.cstr(), m_tParams.m_sPass.cstr(),
  20856. m_tParams.m_sDB.cstr(), m_tParams.m_iPort, m_sMysqlUsock.cstr(), m_iMysqlConnectFlags ) );
  20857. if ( m_tParams.m_bPrintQueries )
  20858. fprintf ( stdout, bRes ? "SQL-CONNECT: ok\n" : "SQL-CONNECT: FAIL\n" );
  20859. return bRes;
  20860. }
  20861. void CSphSource_MySQL::SqlDisconnect ()
  20862. {
  20863. if ( m_tParams.m_bPrintQueries )
  20864. fprintf ( stdout, "SQL-DISCONNECT\n" );
  20865. mysql_close ( &m_tMysqlDriver );
  20866. }
  20867. int CSphSource_MySQL::SqlNumFields ()
  20868. {
  20869. if ( !m_pMysqlResult )
  20870. return -1;
  20871. return mysql_num_fields ( m_pMysqlResult );
  20872. }
  20873. bool CSphSource_MySQL::SqlFetchRow ()
  20874. {
  20875. if ( !m_pMysqlResult )
  20876. return false;
  20877. m_tMysqlRow = mysql_fetch_row ( m_pMysqlResult );
  20878. return m_tMysqlRow!=NULL;
  20879. }
  20880. const char * CSphSource_MySQL::SqlColumn ( int iIndex )
  20881. {
  20882. if ( !m_pMysqlResult )
  20883. return NULL;
  20884. return m_tMysqlRow[iIndex];
  20885. }
  20886. const char * CSphSource_MySQL::SqlFieldName ( int iIndex )
  20887. {
  20888. if ( !m_pMysqlResult )
  20889. return NULL;
  20890. if ( !m_pMysqlFields )
  20891. m_pMysqlFields = mysql_fetch_fields ( m_pMysqlResult );
  20892. return m_pMysqlFields[iIndex].name;
  20893. }
  20894. DWORD CSphSource_MySQL::SqlColumnLength ( int iIndex )
  20895. {
  20896. if ( !m_pMysqlResult )
  20897. return 0;
  20898. if ( !m_pMysqlLengths )
  20899. m_pMysqlLengths = mysql_fetch_lengths ( m_pMysqlResult );
  20900. return m_pMysqlLengths[iIndex];
  20901. }
  20902. bool CSphSource_MySQL::Setup ( const CSphSourceParams_MySQL & tParams )
  20903. {
  20904. if ( !CSphSource_SQL::Setup ( tParams ) )
  20905. return false;
  20906. m_sMysqlUsock = tParams.m_sUsock;
  20907. m_iMysqlConnectFlags = tParams.m_iFlags;
  20908. m_sSslKey = tParams.m_sSslKey;
  20909. m_sSslCert = tParams.m_sSslCert;
  20910. m_sSslCA = tParams.m_sSslCA;
  20911. // build and store DSN for error reporting
  20912. char sBuf [ 1024 ];
  20913. snprintf ( sBuf, sizeof(sBuf), "mysql%s", m_sSqlDSN.cstr()+3 );
  20914. m_sSqlDSN = sBuf;
  20915. return true;
  20916. }
  20917. #endif // USE_MYSQL
  20918. /////////////////////////////////////////////////////////////////////////////
  20919. // PGSQL SOURCE
  20920. /////////////////////////////////////////////////////////////////////////////
  20921. #if USE_PGSQL
  20922. CSphSourceParams_PgSQL::CSphSourceParams_PgSQL ()
  20923. {
  20924. m_iRangeStep = 1024;
  20925. m_iPort = 5432;
  20926. }
  20927. CSphSource_PgSQL::CSphSource_PgSQL ( const char * sName )
  20928. : CSphSource_SQL ( sName )
  20929. , m_pPgResult ( NULL )
  20930. , m_iPgRows ( 0 )
  20931. , m_iPgRow ( 0 )
  20932. {
  20933. }
  20934. bool CSphSource_PgSQL::SqlIsError ()
  20935. {
  20936. return ( m_iPgRow<m_iPgRows ); // if we're over, it's just last row
  20937. }
  20938. const char * CSphSource_PgSQL::SqlError ()
  20939. {
  20940. return PQerrorMessage ( m_tPgDriver );
  20941. }
  20942. bool CSphSource_PgSQL::Setup ( const CSphSourceParams_PgSQL & tParams )
  20943. {
  20944. // checks
  20945. CSphSource_SQL::Setup ( tParams );
  20946. m_sPgClientEncoding = tParams.m_sClientEncoding;
  20947. if ( !m_sPgClientEncoding.cstr() )
  20948. m_sPgClientEncoding = "";
  20949. // build and store DSN for error reporting
  20950. char sBuf [ 1024 ];
  20951. snprintf ( sBuf, sizeof(sBuf), "pgsql%s", m_sSqlDSN.cstr()+3 );
  20952. m_sSqlDSN = sBuf;
  20953. return true;
  20954. }
  20955. bool CSphSource_PgSQL::IterateStart ( CSphString & sError )
  20956. {
  20957. bool bResult = CSphSource_SQL::IterateStart ( sError );
  20958. if ( !bResult )
  20959. return false;
  20960. int iMaxIndex = 0;
  20961. for ( int i = 0; i < m_tSchema.GetAttrsCount(); i++ )
  20962. iMaxIndex = Max ( iMaxIndex, m_tSchema.GetAttr(i).m_iIndex );
  20963. ARRAY_FOREACH ( i, m_tSchema.m_dFields )
  20964. iMaxIndex = Max ( iMaxIndex, m_tSchema.m_dFields[i].m_iIndex );
  20965. m_dIsColumnBool.Resize ( iMaxIndex + 1 );
  20966. ARRAY_FOREACH ( i, m_dIsColumnBool )
  20967. m_dIsColumnBool[i] = false;
  20968. for ( int i = 0; i < m_tSchema.GetAttrsCount(); i++ )
  20969. m_dIsColumnBool [ m_tSchema.GetAttr(i).m_iIndex ] = m_tSchema.GetAttr(i).m_eAttrType==SPH_ATTR_BOOL;
  20970. return true;
  20971. }
  20972. bool CSphSource_PgSQL::SqlConnect ()
  20973. {
  20974. char sPort[64];
  20975. snprintf ( sPort, sizeof(sPort), "%d", m_tParams.m_iPort );
  20976. m_tPgDriver = PQsetdbLogin ( m_tParams.m_sHost.cstr(), sPort, NULL, NULL,
  20977. m_tParams.m_sDB.cstr(), m_tParams.m_sUser.cstr(), m_tParams.m_sPass.cstr() );
  20978. if ( PQstatus ( m_tPgDriver )==CONNECTION_BAD )
  20979. {
  20980. if ( m_tParams.m_bPrintQueries )
  20981. fprintf ( stdout, "SQL-CONNECT: FAIL\n" );
  20982. return false;
  20983. }
  20984. // set client encoding
  20985. if ( !m_sPgClientEncoding.IsEmpty() )
  20986. if ( -1==PQsetClientEncoding ( m_tPgDriver, m_sPgClientEncoding.cstr() ) )
  20987. {
  20988. SqlDisconnect ();
  20989. if ( m_tParams.m_bPrintQueries )
  20990. fprintf ( stdout, "SQL-CONNECT: FAIL\n" );
  20991. return false;
  20992. }
  20993. if ( m_tParams.m_bPrintQueries )
  20994. fprintf ( stdout, "SQL-CONNECT: ok\n" );
  20995. return true;
  20996. }
  20997. void CSphSource_PgSQL::SqlDisconnect ()
  20998. {
  20999. if ( m_tParams.m_bPrintQueries )
  21000. fprintf ( stdout, "SQL-DISCONNECT\n" );
  21001. PQfinish ( m_tPgDriver );
  21002. }
  21003. bool CSphSource_PgSQL::SqlQuery ( const char * sQuery )
  21004. {
  21005. m_iPgRow = -1;
  21006. m_iPgRows = 0;
  21007. m_pPgResult = PQexec ( m_tPgDriver, sQuery );
  21008. ExecStatusType eRes = PQresultStatus ( m_pPgResult );
  21009. if ( ( eRes!=PGRES_COMMAND_OK ) && ( eRes!=PGRES_TUPLES_OK ) )
  21010. {
  21011. if ( m_tParams.m_bPrintQueries )
  21012. fprintf ( stdout, "SQL-QUERY: %s: FAIL\n", sQuery );
  21013. return false;
  21014. }
  21015. if ( m_tParams.m_bPrintQueries )
  21016. fprintf ( stdout, "SQL-QUERY: %s: ok\n", sQuery );
  21017. m_iPgRows = PQntuples ( m_pPgResult );
  21018. return true;
  21019. }
  21020. void CSphSource_PgSQL::SqlDismissResult ()
  21021. {
  21022. if ( !m_pPgResult )
  21023. return;
  21024. PQclear ( m_pPgResult );
  21025. m_pPgResult = NULL;
  21026. }
  21027. int CSphSource_PgSQL::SqlNumFields ()
  21028. {
  21029. if ( !m_pPgResult )
  21030. return -1;
  21031. return PQnfields ( m_pPgResult );
  21032. }
  21033. const char * CSphSource_PgSQL::SqlColumn ( int iIndex )
  21034. {
  21035. if ( !m_pPgResult )
  21036. return NULL;
  21037. const char * szValue = PQgetvalue ( m_pPgResult, m_iPgRow, iIndex );
  21038. if ( m_dIsColumnBool.GetLength() && m_dIsColumnBool[iIndex] && szValue[0]=='t' && !szValue[1] )
  21039. return "1";
  21040. return szValue;
  21041. }
  21042. const char * CSphSource_PgSQL::SqlFieldName ( int iIndex )
  21043. {
  21044. if ( !m_pPgResult )
  21045. return NULL;
  21046. return PQfname ( m_pPgResult, iIndex );
  21047. }
  21048. bool CSphSource_PgSQL::SqlFetchRow ()
  21049. {
  21050. if ( !m_pPgResult )
  21051. return false;
  21052. return ( ++m_iPgRow<m_iPgRows );
  21053. }
  21054. DWORD CSphSource_PgSQL::SqlColumnLength ( int iIndex )
  21055. {
  21056. return 0;
  21057. }
  21058. #endif // USE_PGSQL
  21059. /////////////////////////////////////////////////////////////////////////////
  21060. // XMLPIPE
  21061. /////////////////////////////////////////////////////////////////////////////
  21062. CSphSource_XMLPipe::CSphSource_XMLPipe ( BYTE * dInitialBuf, int iBufLen, const char * sName )
  21063. : CSphSource ( sName )
  21064. , m_iBufferSize ( 1048576 )
  21065. , m_bEOF ( false )
  21066. , m_bWarned ( false )
  21067. , m_iInitialBufLen ( iBufLen )
  21068. , m_bHitsReady ( false )
  21069. {
  21070. assert ( m_iBufferSize > iBufLen );
  21071. m_pTag = NULL;
  21072. m_iTagLength = 0;
  21073. m_pPipe = NULL;
  21074. m_pBuffer = NULL;
  21075. m_pBufferEnd = NULL;
  21076. m_sBuffer = new BYTE [m_iBufferSize];
  21077. if ( iBufLen )
  21078. memcpy ( m_sBuffer, dInitialBuf, iBufLen );
  21079. }
  21080. CSphSource_XMLPipe::~CSphSource_XMLPipe ()
  21081. {
  21082. Disconnect ();
  21083. SafeDeleteArray ( m_sBuffer );
  21084. }
  21085. void CSphSource_XMLPipe::Disconnect ()
  21086. {
  21087. m_iInitialBufLen = 0;
  21088. m_tHits.m_dData.Reset();
  21089. m_tSchema.Reset ();
  21090. if ( m_pPipe )
  21091. {
  21092. pclose ( m_pPipe );
  21093. m_pPipe = NULL;
  21094. }
  21095. }
  21096. bool CSphSource_XMLPipe::Setup ( FILE * pPipe, const char * sCommand )
  21097. {
  21098. assert ( sCommand );
  21099. m_pPipe = pPipe;
  21100. m_sCommand = sCommand;
  21101. return true;
  21102. }
  21103. bool CSphSource_XMLPipe::Connect ( CSphString & )
  21104. {
  21105. m_bEOF = false;
  21106. m_bWarned = false;
  21107. m_tSchema.m_dFields.Reset ();
  21108. m_tSchema.m_dFields.Add ( CSphColumnInfo ( "title" ) );
  21109. m_tSchema.m_dFields.Add ( CSphColumnInfo ( "body" ) );
  21110. CSphColumnInfo tGid ( "gid", SPH_ATTR_INTEGER );
  21111. CSphColumnInfo tTs ( "ts", SPH_ATTR_TIMESTAMP );
  21112. m_tSchema.AddAttr ( tGid, true ); // all attributes are dynamic at indexing time
  21113. m_tSchema.AddAttr ( tTs, true ); // all attributes are dynamic at indexing time
  21114. m_tDocInfo.Reset ( m_tSchema.GetRowSize() );
  21115. m_pBuffer = m_iInitialBufLen > 0 ? m_sBuffer : NULL;
  21116. m_pBufferEnd = m_pBuffer ? m_pBuffer + m_iInitialBufLen : NULL;
  21117. char sBuf [ 1024 ];
  21118. snprintf ( sBuf, sizeof(sBuf), "xmlpipe(%s)", m_sCommand.cstr() );
  21119. m_tSchema.m_sName = sBuf;
  21120. m_tHits.m_dData.Reserve ( MAX_SOURCE_HITS );
  21121. return true;
  21122. }
  // Parses one <document> element from the pipe and collects its hits into m_tHits.
  //
  // Sequence: <document>, <id>, optional <group> and <timestamp> (both default to 1 when
  // their scan fails), <title> (tokenized as field 0), <body> (tokenized as field 1),
  // then the closing </body> and </document> tags.
  //
  // Returns true with m_tDocInfo.m_iDocID set to 0 when no more non-whitespace input is
  // available. Returns false when a required tag or value cannot be scanned; the helpers
  // called here (SkipTag/ScanInt/ScanStr) fill sError in that case.
  21123. bool CSphSource_XMLPipe::IterateDocument ( CSphString & sError )
  21124. {
  21125. // PROFILE ( src_xmlpipe );
  // the title is limited to this fixed-size buffer (sizeof is passed to ScanStr below)
  21126. char sTitle [ 1024 ]; // FIXME?
  21127. assert ( m_pPipe );
  21128. assert ( m_pTokenizer );
  // start the document with an empty hit list that is not yet ready for IterateHits()
  21129. m_tHits.m_dData.Resize ( 0 );
  21130. m_bHitsReady = false;
  21131. /////////////////////////
  21132. // parse document header
  21133. /////////////////////////
  21134. // check for eof
  21135. if ( !SkipWhitespace() )
  21136. {
  // docid 0 together with a true result signals the end of input
  21137. m_tDocInfo.m_iDocID = 0;
  21138. return true;
  21139. }
  21140. // look for opening '<document>' tag
  21141. SetTag ( "document" );
  21142. if ( !SkipTag ( true, sError ) )
  21143. return false;
  21144. if ( !ScanInt ( "id", &m_tDocInfo.m_iDocID, sError ) )
  21145. return false;
  21146. m_tStats.m_iTotalDocuments++;
  // group goes to attribute 0 and timestamp to attribute 1; a failed scan is not fatal
  // and stores the value 1 instead
  21147. SphAttr_t uVal;
  21148. if ( !ScanInt ( "group", &uVal, sError ) ) uVal = 1; m_tDocInfo.SetAttr ( m_tSchema.GetAttr(0).m_tLocator, uVal );
  21149. if ( !ScanInt ( "timestamp", &uVal, sError ) ) uVal = 1; m_tDocInfo.SetAttr ( m_tSchema.GetAttr(1).m_tLocator, uVal );
  21150. if ( !ScanStr ( "title", sTitle, sizeof(sTitle), sError ) )
  21151. return false;
  21152. // index title
  21153. {
  21154. const BYTE * sTextToIndex = (BYTE *)sTitle;
  // iLen stays -1 unless the field filter returned a different buffer with its own length
  21155. int iLen = -1;
  21156. if ( m_pFieldFilter )
  21157. {
  21158. sTextToIndex = m_pFieldFilter->Apply ( sTextToIndex );
  21159. if ( sTextToIndex!=(BYTE *)sTitle )
  21160. iLen = m_pFieldFilter->GetResultLength();
  21161. }
  21162. if ( iLen==-1 )
  21163. iLen = (int)strlen ( (char *)sTextToIndex );
  // title hits are created for field 0, starting at position 1
  21164. Hitpos_t iPos = HITMAN::Create ( 0, 1 );
  21165. BYTE * sWord;
  21166. m_pTokenizer->SetBuffer ( (BYTE *)sTextToIndex, iLen );
  // stop at the end of the text or once MAX_SOURCE_HITS hits were collected
  21167. while ( ( sWord = m_pTokenizer->GetToken() )!=NULL && m_tHits.Length()<MAX_SOURCE_HITS )
  21168. {
  21169. m_tHits.AddHit ( m_tDocInfo.m_iDocID, m_pDict->GetWordID ( sWord ), iPos );
  21170. HITMAN::AddPos ( &iPos, 1 );
  21171. }
  21172. }
  // warns if the hit limit clipped the title
  21173. CheckHitsCount ( "title" );
  21174. SetTag ( "body" );
  21175. if ( !SkipTag ( true, sError ) )
  21176. return false;
  21177. m_iWordPos = 0;
  21178. /////////////////////////////
  21179. // parse body chunk by chunk
  21180. /////////////////////////////
  21181. // check for body tag end in this buffer
  21182. const char * szBodyEnd = "</body>";
  21183. bool bFirstPass = true;
  21184. bool bBodyEnd = false;
  21185. BYTE * p = m_pBuffer;
  // at most two passes: scan the current window, and if '</body>' is not in it,
  // refill the buffer once and scan again
  21186. while ( !bBodyEnd )
  21187. {
  21188. p = m_pBuffer;
  21189. while ( p<m_pBufferEnd && !bBodyEnd )
  21190. {
  // compare the bytes at p against "</body>"; on a full match p is left at the '<'
  21191. BYTE * pBufTemp = p;
  21192. BYTE * pEndTemp = (BYTE *)szBodyEnd;
  21193. while ( pBufTemp < m_pBufferEnd && *pEndTemp && *pBufTemp==*pEndTemp )
  21194. {
  21195. ++pBufTemp;
  21196. ++pEndTemp;
  21197. }
  21198. if ( !*pEndTemp )
  21199. bBodyEnd = true;
  21200. else
  21201. p++;
  21202. }
  21203. if ( !bFirstPass )
  21204. break;
  21205. bFirstPass = false;
  21206. if ( !bBodyEnd )
  21207. UpdateBuffer ();
  21208. }
  // no end tag within the buffered data: warn once per connection (m_bWarned)
  21209. if ( !bBodyEnd )
  21210. {
  21211. if ( !m_bWarned )
  21212. {
  21213. sphWarn ( "xmlpipe: encountered body larger than %d bytes while scanning docid=" DOCID_FMT " body", m_iBufferSize, m_tDocInfo.m_iDocID );
  21214. m_bWarned = true;
  21215. }
  21216. }
  // tokenize the range [m_pBuffer, p), routed through the field filter when one is set
  21217. const BYTE * sTextToIndex = m_pFieldFilter ? m_pFieldFilter->Apply ( m_pBuffer, p-m_pBuffer ) : m_pBuffer;
  21218. if ( sTextToIndex!=m_pBuffer )
  21219. m_pTokenizer->SetBuffer ( (BYTE*)sTextToIndex, m_pFieldFilter->GetResultLength() );
  21220. else
  21221. m_pTokenizer->SetBuffer ( m_pBuffer, p-m_pBuffer );
  21222. // tokenize
  21223. BYTE * sWord;
  // body hits are created for field 1 with positions counted by m_iWordPos
  21224. while ( ( sWord = m_pTokenizer->GetToken () )!=NULL && m_tHits.Length()<MAX_SOURCE_HITS )
  21225. m_tHits.AddHit ( m_tDocInfo.m_iDocID, m_pDict->GetWordID ( sWord ), HITMAN::Create ( 1, ++m_iWordPos ) );
  21226. CheckHitsCount ( "body" );
  // continue reading right after the tokenized part
  21227. m_pBuffer = p;
  21228. SetTag ( "body" );
  21229. // some tag was found
  21230. if ( bBodyEnd )
  21231. {
  21232. // let's check if it's '</body>' which is the only allowed tag at this point
  21233. if ( !SkipTag ( false, sError ) )
  21234. return false;
  21235. } else
  21236. {
  21237. // search for '</body>' tag
  // the remainder of an oversized body is skipped without indexing: move to each '<'
  // and test whether '</body>' starts there
  21238. bool bFound = false;
  21239. while ( !bFound )
  21240. {
  21241. while ( m_pBuffer < m_pBufferEnd && *m_pBuffer!='<' )
  21242. ++m_pBuffer;
  21243. BYTE * pBufferTmp = m_pBuffer;
  21244. if ( m_pBuffer < m_pBufferEnd )
  21245. {
  21246. if ( !SkipTag ( false, sError ) )
  21247. {
  // EOF is fatal; any other mismatch means some other '<', so step past it
  21248. if ( m_bEOF )
  21249. return false;
  21250. else
  21251. {
  21252. if ( m_pBuffer==pBufferTmp )
  21253. m_pBuffer = pBufferTmp + 1;
  21254. }
  21255. } else
  21256. bFound = true;
  21257. } else
  // window exhausted; fail when no more data can be read
  21258. if ( !UpdateBuffer () )
  21259. return false;
  21260. }
  21261. }
  21262. // let's check if it's '</document>' which is the only allowed tag at this point
  21263. SetTag ( "document" );
  21264. if ( !SkipTag ( false, sError ) )
  21265. return false;
  21266. // if it was all correct, we have to flush our hits
  21267. m_bHitsReady = m_tHits.Length()>0;
  21268. return true;
  21269. }
  21270. ISphHits * CSphSource_XMLPipe::IterateHits ( CSphString & )
  21271. {
  21272. if ( !m_bHitsReady )
  21273. return NULL;
  21274. m_bHitsReady = false;
  21275. return &m_tHits;
  21276. }
  21277. SphRange_t CSphSource_XMLPipe::IterateFieldMVAStart ( int )
  21278. {
  21279. SphRange_t tRange;
  21280. tRange.m_iStart = tRange.m_iLength = 0;
  21281. return tRange;
  21282. }
  21283. void CSphSource_XMLPipe::SetTag ( const char * sTag )
  21284. {
  21285. m_pTag = sTag;
  21286. m_iTagLength = (int)strlen ( sTag );
  21287. }
  21288. bool CSphSource_XMLPipe::UpdateBuffer ()
  21289. {
  21290. assert ( m_pBuffer!=m_sBuffer );
  21291. int iLeft = Max ( m_pBufferEnd-m_pBuffer, 0 );
  21292. if ( iLeft>0 )
  21293. memmove ( m_sBuffer, m_pBuffer, iLeft );
  21294. size_t iLen = fread ( &m_sBuffer [ iLeft ], 1, m_iBufferSize-iLeft, m_pPipe );
  21295. m_tStats.m_iTotalBytes += iLen;
  21296. m_pBuffer = m_sBuffer;
  21297. m_pBufferEnd = m_pBuffer+iLeft+iLen;
  21298. return ( iLen!=0 );
  21299. }
  21300. bool CSphSource_XMLPipe::SkipWhitespace ()
  21301. {
  21302. for ( ;; )
  21303. {
  21304. // suck in some data if needed
  21305. if ( m_pBuffer>=m_pBufferEnd )
  21306. if ( !UpdateBuffer() )
  21307. return false;
  21308. // skip whitespace
  21309. while ( (m_pBuffer<m_pBufferEnd) && isspace ( *m_pBuffer ) )
  21310. m_pBuffer++;
  21311. // did we anything non-whitspace?
  21312. if ( m_pBuffer<m_pBufferEnd )
  21313. break;
  21314. }
  21315. assert ( m_pBuffer<m_pBufferEnd );
  21316. return true;
  21317. }
  21318. bool CSphSource_XMLPipe::CheckTag ( bool bOpen, CSphString & sError )
  21319. {
  21320. int iAdd = bOpen ? 2 : 3;
  21321. // if case the tag is at buffer boundary, try to suck in some more data
  21322. if ( m_pBufferEnd-m_pBuffer < m_iTagLength+iAdd )
  21323. UpdateBuffer ();
  21324. if ( m_pBufferEnd-m_pBuffer < m_iTagLength+iAdd )
  21325. {
  21326. m_bEOF = true;
  21327. sError.SetSprintf ( "xmlpipe: expected '<%s%s>', got EOF",
  21328. bOpen ? "" : "/", m_pTag );
  21329. return false;
  21330. }
  21331. // check tag
  21332. bool bOk = bOpen
  21333. ? ( ( m_pBuffer[0]=='<' )
  21334. && ( m_pBuffer[m_iTagLength+1]=='>' )
  21335. && strncmp ( (char*)(m_pBuffer+1), m_pTag, m_iTagLength )==0 )
  21336. : ( ( m_pBuffer[0]=='<' )
  21337. && ( m_pBuffer[1]=='/' )
  21338. && ( m_pBuffer[m_iTagLength+2]=='>' )
  21339. && strncmp ( (char*)(m_pBuffer+2), m_pTag, m_iTagLength )==0 );
  21340. if ( !bOk )
  21341. {
  21342. char sGot[64];
  21343. int iCopy = Min ( m_pBufferEnd-m_pBuffer, (int)sizeof(sGot)-1 );
  21344. strncpy ( sGot, (char*)m_pBuffer, iCopy );
  21345. sGot [ iCopy ] = '\0';
  21346. sError.SetSprintf ( "xmlpipe: expected '<%s%s>', got '%s'",
  21347. bOpen ? "" : "/", m_pTag, sGot );
  21348. return false;
  21349. }
  21350. // got tag
  21351. m_pBuffer += iAdd+m_iTagLength;
  21352. assert ( m_pBuffer<=m_pBufferEnd );
  21353. return true;
  21354. }
  21355. bool CSphSource_XMLPipe::SkipTag ( bool bOpen, CSphString & sError )
  21356. {
  21357. if ( !SkipWhitespace() )
  21358. {
  21359. m_bEOF = true;
  21360. sError.SetSprintf ( "xmlpipe: expected '<%s%s>', got EOF",
  21361. bOpen ? "" : "/", m_pTag );
  21362. return false;
  21363. }
  21364. return CheckTag ( bOpen, sError );
  21365. }
  21366. bool CSphSource_XMLPipe::ScanInt ( const char * sTag, DWORD * pRes, CSphString & sError )
  21367. {
  21368. uint64_t uRes;
  21369. if ( !ScanInt ( sTag, &uRes, sError ) )
  21370. return false;
  21371. (*pRes) = (DWORD)uRes;
  21372. return true;
  21373. }
  21374. bool CSphSource_XMLPipe::ScanInt ( const char * sTag, uint64_t * pRes, CSphString & sError )
  21375. {
  21376. assert ( sTag );
  21377. assert ( pRes );
  21378. // scan for <sTag>
  21379. SetTag ( sTag );
  21380. if ( !SkipTag ( true, sError ) )
  21381. return false;
  21382. if ( !SkipWhitespace() )
  21383. {
  21384. sError.SetSprintf ( "xmlpipe: expected <%s> data, got EOF", m_pTag );
  21385. return false;
  21386. }
  21387. *pRes = 0;
  21388. while ( m_pBuffer<m_pBufferEnd )
  21389. {
  21390. // FIXME! could check for overflow
  21391. while ( isdigit ( *m_pBuffer ) && m_pBuffer<m_pBufferEnd )
  21392. (*pRes) = 10*(*pRes) + (int)( (*m_pBuffer++)-'0' );
  21393. if ( m_pBuffer<m_pBufferEnd )
  21394. break;
  21395. else
  21396. UpdateBuffer ();
  21397. }
  21398. // scan for </sTag>
  21399. if ( !SkipTag ( false, sError ) )
  21400. return false;
  21401. return true;
  21402. }
  21403. bool CSphSource_XMLPipe::ScanStr ( const char * sTag, char * pRes, int iMaxLength, CSphString & sError )
  21404. {
  21405. assert ( sTag );
  21406. assert ( pRes );
  21407. char * pEnd = pRes+iMaxLength-1;
  21408. // scan for <sTag>
  21409. SetTag ( sTag );
  21410. if ( !SkipTag ( true, sError ) )
  21411. return false;
  21412. if ( !SkipWhitespace() )
  21413. {
  21414. sError.SetSprintf ( "xmlpipe: expected <%s> data, got EOF", m_pTag );
  21415. return false;
  21416. }
  21417. while ( m_pBuffer<m_pBufferEnd )
  21418. {
  21419. while ( (*m_pBuffer)!='<' && pRes<pEnd && m_pBuffer<m_pBufferEnd )
  21420. *pRes++ = *m_pBuffer++;
  21421. if ( m_pBuffer<m_pBufferEnd )
  21422. break;
  21423. else
  21424. UpdateBuffer ();
  21425. }
  21426. *pRes++ = '\0';
  21427. // scan for </sTag>
  21428. if ( !SkipTag ( false, sError ) )
  21429. return false;
  21430. return true;
  21431. }
  21432. void CSphSource_XMLPipe::CheckHitsCount ( const char * sField )
  21433. {
  21434. if ( m_tHits.Length()>=MAX_SOURCE_HITS && m_pTokenizer->GetTokenEnd()!=m_pTokenizer->GetBufferEnd() )
  21435. sphWarn ( "xmlpipe: collected hits larger than %d(MAX_SOURCE_HITS) "
  21436. "while scanning docid=" DOCID_FMT " %s - clipped!!!",
  21437. MAX_SOURCE_HITS, m_tDocInfo.m_iDocID, sField );
  21438. }
  21439. /////////////////////////////////////////////////////////////////////////////
  21440. // XMLPIPE (v2)
  21441. /////////////////////////////////////////////////////////////////////////////
  21442. #if USE_LIBEXPAT || USE_LIBXML
  21443. /// XML pipe source implementation (v2)
  // Document source fed by an XML stream read from a pipe. The stream is parsed with
  // expat (USE_LIBEXPAT) or libxml (USE_LIBXML); the parser glue calls back into
  // StartElement()/EndElement()/Characters().
  21444. class CSphSource_XMLPipe2 : public CSphSource_Document
  21445. {
  21446. public:
  // dInitialBuf/iBufLen: bytes already read from the stream, copied into the read buffer;
  // iFieldBufferMax: requested field buffer size (the constructor uses at least 65536);
  // bFixupUTF8: stored in m_bFixupUTF8
  21447. CSphSource_XMLPipe2 ( BYTE * dInitialBuf, int iBufLen, const char * sName, int iFieldBufferMax, bool bFixupUTF8 );
  21448. ~CSphSource_XMLPipe2 ();
  21449. bool Setup ( FILE * pPipe, const CSphConfigSection & hSource ); ///< memorize the command
  21450. virtual bool Connect ( CSphString & sError ); ///< run the command and open the pipe
  21451. virtual void Disconnect (); ///< close the pipe
  21452. virtual bool IterateStart ( CSphString & ) { m_iPlainFieldsLength = m_tSchema.m_dFields.GetLength(); return true; } ///< Connect() starts getting documents automatically, so this one is empty
  21453. virtual BYTE ** NextDocument ( CSphString & sError ); ///< parse incoming chunk and emit some hits
  21454. virtual bool HasAttrsConfigured () { return true; } ///< xmlpipe always has some attrs for now
  // multi-valued attribute iteration is not provided by this source (both return false)
  21455. virtual bool IterateMultivaluedStart ( int, CSphString & ) { return false; }
  21456. virtual bool IterateMultivaluedNext () { return false; }
  21457. virtual bool IterateKillListStart ( CSphString & );
  21458. virtual bool IterateKillListNext ( SphDocID_t & tDocId );
  // parser event handlers, invoked by the expat callbacks (and presumably by ProcessNode()
  // in libxml builds - TODO confirm)
  21459. void StartElement ( const char * szName, const char ** pAttrs );
  21460. void EndElement ( const char * pName );
  21461. void Characters ( const char * pCharacters, int iLen );
  21462. #if USE_LIBXML
  // input callback target for libxml (see xmlReadBuffers) and per-node processing
  21463. int ReadBuffer ( BYTE * pBuffer, int iLen );
  21464. void ProcessNode ( xmlTextReaderPtr pReader );
  21465. #endif
  // records a decorated error message; only the first error is kept
  21466. void Error ( const char * sTemplate, ... ) __attribute__ ( ( format ( printf, 2, 3 ) ) );
  21467. private:
  // one parsed document: its id plus raw field and attribute values
  21468. struct Document_t
  21469. {
  21470. SphDocID_t m_iDocID;
  21471. CSphVector < CSphVector<BYTE> > m_dFields;
  21472. CSphVector<CSphString> m_dAttrs;
  21473. };
  21474. Document_t * m_pCurDocument;
  // owned: the destructor deletes every entry
  21475. CSphVector<Document_t *> m_dParsedDocuments;
  21476. FILE * m_pPipe; ///< incoming stream
  21477. CSphString m_sCommand; ///< my command
  // first error reported through Error(); empty when there was none
  21478. CSphString m_sError;
  21479. CSphVector<CSphString> m_dDefaultAttrs;
  21480. CSphVector<CSphString> m_dInvalid;
  21481. CSphVector<CSphString> m_dWarned;
  21482. int m_iElementDepth;
  // read buffer of m_iBufferSize bytes, allocated by the constructor
  21483. BYTE * m_pBuffer;
  21484. int m_iBufferSize;
  21485. CSphVector<BYTE*>m_dFieldPtrs;
  21486. bool m_bRemoveParsed;
  // parser state flags; all start out false
  21487. bool m_bInDocset;
  21488. bool m_bInSchema;
  21489. bool m_bInDocument;
  21490. bool m_bInKillList;
  21491. bool m_bInId;
  21492. bool m_bInIgnoredTag;
  21493. bool m_bFirstTagAfterDocset;
  // kill-list contents and the iteration cursor over them
  21494. int m_iKillListIterator;
  21495. CSphVector < SphDocID_t > m_dKillList;
  21496. int m_iMVA;
  21497. int m_iMVAIterator;
  21498. CSphVector < CSphVector <DWORD> > m_dFieldMVAs;
  21499. CSphVector < int > m_dAttrToMVA;
  // initialized to -1 by the constructor
  21500. int m_iCurField;
  21501. int m_iCurAttr;
  21502. #if USE_LIBEXPAT
  21503. XML_Parser m_pParser;
  21504. #endif
  21505. #if USE_LIBXML
  21506. xmlTextReaderPtr m_pParser;
  21507. BYTE * m_pBufferPtr;
  21508. BYTE * m_pBufferEnd;
  21509. bool m_bPassedBufferEnd;
  21510. CSphVector <const char *> m_dAttrs;
  21511. #endif
  // number of preloaded bytes in m_pBuffer; reset to 0 by Disconnect()
  21512. int m_iInitialBufSize;
  // field buffer of m_iFieldBufferMax bytes, allocated by the constructor
  21513. int m_iFieldBufferMax;
  21514. BYTE * m_pFieldBuffer;
  21515. int m_iFieldBufferLen;
  21516. bool m_bFixupUTF8; ///< whether to replace invalid utf-8 codepoints with spaces
  21517. int m_iReparseStart; ///< utf-8 fixerupper might need to postpone a few bytes, starting at this offset
  21518. int m_iReparseLen; ///< and this much bytes (under 4)
  // message formatting helpers; both return a pointer to a static buffer that the
  // next call overwrites
  21519. const char * DecorateMessage ( const char * sTemplate, ... ) __attribute__ ( ( format ( printf, 2, 3 ) ) );
  21520. const char * DecorateMessageVA ( const char * sTemplate, va_list ap );
  // schema configuration from lists of config values
  21521. void ConfigureAttrs ( const CSphVariant * pHead, ESphAttr eAttrType );
  21522. void ConfigureFields ( const CSphVariant * pHead );
  21523. void AddFieldToSchema ( const char * szName );
  21524. void UnexpectedCharaters ( const char * pCharacters, int iLen, const char * szComment );
  21525. #if USE_LIBEXPAT
  21526. bool ParseNextChunk ( int iBufferLen, CSphString & sError );
  21527. #endif
  21528. #if USE_LIBXML
  21529. int ParseNextChunk ( CSphString & sError );
  21530. #endif
  // reports a <sphinx:document> element found in a place (sWhere) where it is not allowed
  21531. void DocumentError ( const char * sWhere )
  21532. {
  21533. Error ( "malformed source, <sphinx:document> found inside %s", sWhere );
  21534. // Ideally I'd like to display a notice on the next line that
  21535. // would say where exactly it's allowed. E.g.:
  21536. //
  21537. // <sphinx:document> must be contained in <sphinx:docset>
  21538. }
  21539. };
  21540. #if USE_LIBEXPAT
  21541. // callbacks
  21542. static void XMLCALL xmlStartElement ( void * user_data, const XML_Char * name, const XML_Char ** attrs )
  21543. {
  21544. CSphSource_XMLPipe2 * pSource = (CSphSource_XMLPipe2 *) user_data;
  21545. pSource->StartElement ( name, attrs );
  21546. }
  21547. static void XMLCALL xmlEndElement ( void * user_data, const XML_Char * name )
  21548. {
  21549. CSphSource_XMLPipe2 * pSource = (CSphSource_XMLPipe2 *) user_data;
  21550. pSource->EndElement ( name );
  21551. }
  21552. static void XMLCALL xmlCharacters ( void * user_data, const XML_Char * ch, int len )
  21553. {
  21554. CSphSource_XMLPipe2 * pSource = (CSphSource_XMLPipe2 *) user_data;
  21555. pSource->Characters ( ch, len );
  21556. }
  21557. #if USE_LIBICONV
  21558. static int XMLCALL xmlUnknownEncoding ( void *, const XML_Char * name, XML_Encoding * info )
  21559. {
  21560. iconv_t pDesc = iconv_open ( "UTF-16", name );
  21561. if ( !pDesc )
  21562. return XML_STATUS_ERROR;
  21563. for ( size_t i = 0; i < 256; i++ )
  21564. {
  21565. char cIn = (char) i;
  21566. char dOut[4];
  21567. memset ( dOut, 0, sizeof ( dOut ) );
  21568. #if ICONV_INBUF_CONST
  21569. const char * pInbuf = &cIn;
  21570. #else
  21571. char * pInbuf = &cIn;
  21572. #endif
  21573. char * pOutbuf = dOut;
  21574. size_t iInBytesLeft = 1;
  21575. size_t iOutBytesLeft = 4;
  21576. if ( iconv ( pDesc, &pInbuf, &iInBytesLeft, &pOutbuf, &iOutBytesLeft )!=size_t(-1) )
  21577. info->map[i] = int ( BYTE ( dOut[0] ) ) << 8 | int ( BYTE ( dOut[1] ) );
  21578. else
  21579. info->map[i] = 0;
  21580. }
  21581. iconv_close ( pDesc );
  21582. return XML_STATUS_OK;
  21583. }
  21584. #endif
  21585. #endif
  21586. #if USE_LIBXML
  21587. int xmlReadBuffers ( void * context, char * buffer, int len )
  21588. {
  21589. CSphSource_XMLPipe2 * pSource = (CSphSource_XMLPipe2 *) context;
  21590. return pSource->ReadBuffer ( (BYTE*)buffer, len );
  21591. }
  21592. void xmlErrorHandler ( void * arg, const char * msg, xmlParserSeverities severity, xmlTextReaderLocatorPtr locator )
  21593. {
  21594. if ( severity==XML_PARSER_SEVERITY_ERROR )
  21595. {
  21596. int iLine = xmlTextReaderLocatorLineNumber ( locator );
  21597. CSphSource_XMLPipe2 * pSource = (CSphSource_XMLPipe2 *) arg;
  21598. pSource->Error ( "%s (line=%d)", msg, iLine );
  21599. }
  21600. }
  21601. #endif
  21602. CSphSource_XMLPipe2::CSphSource_XMLPipe2 ( BYTE * dInitialBuf, int iBufLen, const char * sName, int iFieldBufferMax, bool bFixupUTF8 )
  21603. : CSphSource_Document ( sName )
  21604. , m_pCurDocument ( NULL )
  21605. , m_pPipe ( NULL )
  21606. , m_iElementDepth ( 0 )
  21607. , m_iBufferSize ( 1048576 )
  21608. , m_bRemoveParsed ( false )
  21609. , m_bInDocset ( false )
  21610. , m_bInSchema ( false )
  21611. , m_bInDocument ( false )
  21612. , m_bInKillList ( false )
  21613. , m_bInId ( false )
  21614. , m_bInIgnoredTag ( false )
  21615. , m_bFirstTagAfterDocset ( false )
  21616. , m_iKillListIterator ( 0 )
  21617. , m_iMVA ( 0 )
  21618. , m_iMVAIterator ( 0 )
  21619. , m_iCurField ( -1 )
  21620. , m_iCurAttr ( -1 )
  21621. , m_pParser ( NULL )
  21622. #if USE_LIBXML
  21623. , m_pBufferPtr ( NULL )
  21624. , m_pBufferEnd ( NULL )
  21625. , m_bPassedBufferEnd ( false )
  21626. #endif
  21627. , m_iInitialBufSize ( iBufLen )
  21628. , m_iFieldBufferLen ( 0 )
  21629. , m_bFixupUTF8 ( bFixupUTF8 )
  21630. , m_iReparseStart ( 0 )
  21631. , m_iReparseLen ( 0 )
  21632. {
  21633. assert ( m_iBufferSize > iBufLen );
  21634. m_pBuffer = new BYTE [m_iBufferSize];
  21635. m_iFieldBufferMax = Max ( iFieldBufferMax, 65536 );
  21636. m_pFieldBuffer = new BYTE [ m_iFieldBufferMax ];
  21637. if ( iBufLen )
  21638. memcpy ( m_pBuffer, dInitialBuf, iBufLen );
  21639. m_iInitialBufSize = iBufLen;
  21640. }
  21641. CSphSource_XMLPipe2::~CSphSource_XMLPipe2 ()
  21642. {
  21643. Disconnect ();
  21644. SafeDeleteArray ( m_pBuffer );
  21645. SafeDeleteArray ( m_pFieldBuffer );
  21646. ARRAY_FOREACH ( i, m_dParsedDocuments )
  21647. SafeDelete ( m_dParsedDocuments[i] );
  21648. }
  21649. void CSphSource_XMLPipe2::Disconnect ()
  21650. {
  21651. if ( m_pPipe )
  21652. {
  21653. pclose ( m_pPipe );
  21654. m_pPipe = NULL;
  21655. }
  21656. #if USE_LIBEXPAT
  21657. if ( m_pParser )
  21658. {
  21659. XML_ParserFree ( m_pParser );
  21660. m_pParser = NULL;
  21661. }
  21662. #endif
  21663. #if USE_LIBXML
  21664. if ( m_pParser )
  21665. {
  21666. xmlFreeTextReader ( m_pParser );
  21667. m_pParser = NULL;
  21668. }
  21669. #endif
  21670. m_tHits.m_dData.Reset();
  21671. m_iInitialBufSize = 0;
  21672. }
  21673. void CSphSource_XMLPipe2::Error ( const char * sTemplate, ... )
  21674. {
  21675. if ( !m_sError.IsEmpty() )
  21676. return;
  21677. va_list ap;
  21678. va_start ( ap, sTemplate );
  21679. m_sError = DecorateMessageVA ( sTemplate, ap );
  21680. va_end ( ap );
  21681. }
  21682. const char * CSphSource_XMLPipe2::DecorateMessage ( const char * sTemplate, ... )
  21683. {
  21684. va_list ap;
  21685. va_start ( ap, sTemplate );
  21686. const char * sRes = DecorateMessageVA ( sTemplate, ap );
  21687. va_end ( ap );
  21688. return sRes;
  21689. }
  21690. const char * CSphSource_XMLPipe2::DecorateMessageVA ( const char * sTemplate, va_list ap )
  21691. {
  21692. static char sBuf[1024];
  21693. snprintf ( sBuf, sizeof(sBuf), "source '%s': ", m_tSchema.m_sName.cstr() );
  21694. int iBufLen = strlen ( sBuf );
  21695. int iLeft = sizeof(sBuf) - iBufLen;
  21696. char * szBufStart = sBuf + iBufLen;
  21697. vsnprintf ( szBufStart, iLeft, sTemplate, ap );
  21698. iBufLen = strlen ( sBuf );
  21699. iLeft = sizeof(sBuf) - iBufLen;
  21700. szBufStart = sBuf + iBufLen;
  21701. #if USE_LIBEXPAT
  21702. if ( m_pParser )
  21703. {
  21704. SphDocID_t uFailedID = 0;
  21705. if ( m_dParsedDocuments.GetLength() )
  21706. uFailedID = m_dParsedDocuments.Last()->m_iDocID;
  21707. snprintf ( szBufStart, iLeft, " (line=%d, pos=%d, docid=" DOCID_FMT ")",
  21708. (int)XML_GetCurrentLineNumber ( m_pParser ), (int)XML_GetCurrentColumnNumber ( m_pParser ),
  21709. uFailedID );
  21710. }
  21711. #endif
  21712. #if USE_LIBXML
  21713. if ( m_pParser )
  21714. {
  21715. SphDocID_t uFailedID = 0;
  21716. if ( m_dParsedDocuments.GetLength() )
  21717. uFailedID = m_dParsedDocuments.Last()->m_iDocID;
  21718. snprintf ( szBufStart, iLeft, " (docid=" DOCID_FMT ")", uFailedID );
  21719. }
  21720. #endif
  21721. return sBuf;
  21722. }
  21723. void CSphSource_XMLPipe2::AddFieldToSchema ( const char * szName )
  21724. {
  21725. CSphColumnInfo tCol ( szName );
  21726. tCol.m_eWordpart = GetWordpart ( tCol.m_sName.cstr(), m_pDict && m_pDict->GetSettings().m_bWordDict );
  21727. m_tSchema.m_dFields.Add ( tCol );
  21728. }
  21729. void CSphSource_XMLPipe2::ConfigureAttrs ( const CSphVariant * pHead, ESphAttr eAttrType )
  21730. {
  21731. for ( const CSphVariant * pCur = pHead; pCur; pCur= pCur->m_pNext )
  21732. {
  21733. CSphColumnInfo tCol ( pCur->cstr(), eAttrType );
  21734. char * pColon = strchr ( const_cast<char*> ( tCol.m_sName.cstr() ), ':' );
  21735. if ( pColon )
  21736. {
  21737. *pColon = '\0';
  21738. if ( eAttrType==SPH_ATTR_INTEGER )
  21739. {
  21740. int iBits = strtol ( pColon+1, NULL, 10 );
  21741. if ( iBits<=0 || iBits>ROWITEM_BITS )
  21742. {
  21743. sphWarn ( "%s", DecorateMessage ( "attribute '%s': invalid bitcount=%d (bitcount ignored)", tCol.m_sName.cstr(), iBits ) );
  21744. iBits = -1;
  21745. }
  21746. tCol.m_tLocator.m_iBitCount = iBits;
  21747. } else
  21748. sphWarn ( "%s", DecorateMessage ( "attribute '%s': bitcount is only supported for integer types", tCol.m_sName.cstr() ) );
  21749. }
  21750. tCol.m_iIndex = m_tSchema.GetAttrsCount ();
  21751. if ( eAttrType==SPH_ATTR_UINT32SET || eAttrType==SPH_ATTR_INT64SET )
  21752. {
  21753. tCol.m_eAttrType = eAttrType;
  21754. tCol.m_eSrc = SPH_ATTRSRC_FIELD;
  21755. }
  21756. m_tSchema.AddAttr ( tCol, true ); // all attributes are dynamic at indexing time
  21757. }
  21758. }
  21759. void CSphSource_XMLPipe2::ConfigureFields ( const CSphVariant * pHead )
  21760. {
  21761. for ( const CSphVariant * pCur = pHead; pCur; pCur= pCur->m_pNext )
  21762. {
  21763. CSphString sFieldName = pCur->cstr ();
  21764. bool bFound = false;
  21765. for ( int i = 0; i < m_tSchema.m_dFields.GetLength () && !bFound; i++ )
  21766. bFound = m_tSchema.m_dFields[i].m_sName==sFieldName;
  21767. if ( bFound )
  21768. sphWarn ( "%s", DecorateMessage ( "duplicate field '%s'", sFieldName.cstr () ) );
  21769. else
  21770. AddFieldToSchema ( sFieldName.cstr () );
  21771. }
  21772. }
  21773. bool CSphSource_XMLPipe2::Setup ( FILE * pPipe, const CSphConfigSection & hSource )
  21774. {
  21775. m_pPipe = pPipe;
  21776. m_tSchema.Reset ();
  21777. m_sCommand = hSource["xmlpipe_command"].cstr ();
  21778. ConfigureAttrs ( hSource("xmlpipe_attr_uint"), SPH_ATTR_INTEGER );
  21779. ConfigureAttrs ( hSource("xmlpipe_attr_timestamp"), SPH_ATTR_TIMESTAMP );
  21780. ConfigureAttrs ( hSource("xmlpipe_attr_str2ordinal"), SPH_ATTR_ORDINAL );
  21781. ConfigureAttrs ( hSource("xmlpipe_attr_bool"), SPH_ATTR_BOOL );
  21782. ConfigureAttrs ( hSource("xmlpipe_attr_float"), SPH_ATTR_FLOAT );
  21783. ConfigureAttrs ( hSource("xmlpipe_attr_bigint"), SPH_ATTR_BIGINT );
  21784. ConfigureAttrs ( hSource("xmlpipe_attr_multi"), SPH_ATTR_UINT32SET );
  21785. ConfigureAttrs ( hSource("xmlpipe_attr_multi_64"), SPH_ATTR_INT64SET );
  21786. ConfigureAttrs ( hSource("xmlpipe_attr_string"), SPH_ATTR_STRING );
  21787. ConfigureAttrs ( hSource("xmlpipe_attr_json"), SPH_ATTR_JSON );
  21788. ConfigureAttrs ( hSource("xmlpipe_attr_wordcount"), SPH_ATTR_WORDCOUNT );
  21789. ConfigureAttrs ( hSource("xmlpipe_field_string"), SPH_ATTR_STRING );
  21790. ConfigureAttrs ( hSource("xmlpipe_field_wordcount"), SPH_ATTR_WORDCOUNT );
  21791. ConfigureFields ( hSource("xmlpipe_field") );
  21792. ConfigureFields ( hSource("xmlpipe_field_string") );
  21793. ConfigureFields ( hSource("xmlpipe_field_wordcount") );
  21794. AllocDocinfo();
  21795. return true;
  21796. }
  21797. bool CSphSource_XMLPipe2::Connect ( CSphString & sError )
  21798. {
  21799. ARRAY_FOREACH ( i, m_tSchema.m_dFields )
  21800. {
  21801. CSphColumnInfo & tCol = m_tSchema.m_dFields[i];
  21802. tCol.m_eWordpart = GetWordpart ( tCol.m_sName.cstr(), m_pDict && m_pDict->GetSettings().m_bWordDict );
  21803. }
  21804. if ( !AddAutoAttrs ( sError ) )
  21805. return false;
  21806. AllocDocinfo();
  21807. #if USE_LIBEXPAT
  21808. m_pParser = XML_ParserCreate(NULL);
  21809. if ( !m_pParser )
  21810. {
  21811. sError.SetSprintf ( "xmlpipe: failed to create XML parser" );
  21812. return false;
  21813. }
  21814. XML_SetUserData ( m_pParser, this );
  21815. XML_SetElementHandler ( m_pParser, xmlStartElement, xmlEndElement );
  21816. XML_SetCharacterDataHandler ( m_pParser, xmlCharacters );
  21817. #if USE_LIBICONV
  21818. XML_SetUnknownEncodingHandler ( m_pParser, xmlUnknownEncoding, NULL );
  21819. #endif
  21820. #endif
  21821. #if USE_LIBXML
  21822. m_pBufferPtr = m_pBuffer;
  21823. m_pBufferEnd = m_pBuffer + m_iInitialBufSize;
  21824. m_bPassedBufferEnd = false;
  21825. m_dAttrs.Reserve ( 16 );
  21826. m_dAttrs.Resize ( 0 );
  21827. m_pParser = xmlReaderForIO ( (xmlInputReadCallback)xmlReadBuffers, NULL, this, NULL, NULL, 0 );
  21828. if ( !m_pParser )
  21829. {
  21830. sError.SetSprintf ( "xmlpipe: failed to create XML parser" );
  21831. return false;
  21832. }
  21833. xmlTextReaderSetErrorHandler ( m_pParser, xmlErrorHandler, this );
  21834. #endif
  21835. m_dKillList.Reserve ( 1024 );
  21836. m_dKillList.Resize ( 0 );
  21837. m_bRemoveParsed = false;
  21838. m_bInDocset = false;
  21839. m_bInSchema = false;
  21840. m_bInDocument = false;
  21841. m_bInKillList = false;
  21842. m_bInId = false;
  21843. m_bFirstTagAfterDocset = false;
  21844. m_iCurField = -1;
  21845. m_iCurAttr = -1;
  21846. m_iElementDepth = 0;
  21847. m_dParsedDocuments.Reset ();
  21848. m_dDefaultAttrs.Reset ();
  21849. m_dInvalid.Reset ();
  21850. m_dWarned.Reset ();
  21851. m_dParsedDocuments.Reserve ( 1024 );
  21852. m_dParsedDocuments.Resize ( 0 );
  21853. m_iKillListIterator = 0;
  21854. m_iMVA = 0;
  21855. m_iMVAIterator = 0;
  21856. m_sError = "";
  21857. #if USE_LIBEXPAT
  21858. int iBytesRead = m_iInitialBufSize;
  21859. iBytesRead += fread ( m_pBuffer + m_iInitialBufSize, 1, m_iBufferSize - m_iInitialBufSize, m_pPipe );
  21860. if ( !ParseNextChunk ( iBytesRead, sError ) )
  21861. return false;
  21862. #endif
  21863. #if USE_LIBXML
  21864. if ( ParseNextChunk ( sError )==-1 )
  21865. return false;
  21866. #endif
  21867. m_dAttrToMVA.Resize ( 0 );
  21868. int iFieldMVA = 0;
  21869. for ( int i = 0; i < m_tSchema.GetAttrsCount (); i++ )
  21870. {
  21871. const CSphColumnInfo & tCol = m_tSchema.GetAttr ( i );
  21872. if ( ( tCol.m_eAttrType==SPH_ATTR_UINT32SET || tCol.m_eAttrType==SPH_ATTR_INT64SET ) && tCol.m_eSrc==SPH_ATTRSRC_FIELD )
  21873. m_dAttrToMVA.Add ( iFieldMVA++ );
  21874. else
  21875. m_dAttrToMVA.Add ( -1 );
  21876. }
  21877. m_dFieldMVAs.Resize ( iFieldMVA );
  21878. ARRAY_FOREACH ( i, m_dFieldMVAs )
  21879. m_dFieldMVAs[i].Reserve ( 16 );
  21880. m_tHits.m_dData.Reserve ( m_iMaxHits );
  21881. return true;
  21882. }
  21883. #if USE_LIBXML
  21884. int CSphSource_XMLPipe2::ParseNextChunk ( CSphString & sError )
  21885. {
  21886. int iRet = xmlTextReaderRead ( m_pParser );
  21887. while ( iRet==1 )
  21888. {
  21889. ProcessNode ( m_pParser );
  21890. if ( !m_sError.IsEmpty () )
  21891. {
  21892. sError = m_sError;
  21893. m_tDocInfo.m_iDocID = 1;
  21894. return false;
  21895. }
  21896. if ( m_bPassedBufferEnd )
  21897. break;
  21898. iRet = xmlTextReaderRead ( m_pParser );
  21899. }
  21900. m_bPassedBufferEnd = false;
  21901. if ( !m_sError.IsEmpty () || iRet==-1 )
  21902. {
  21903. sError = m_sError;
  21904. m_tDocInfo.m_iDocID = 1;
  21905. return -1;
  21906. }
  21907. return iRet;
  21908. }
  21909. #endif
  21910. #if USE_LIBEXPAT
  21911. bool CSphSource_XMLPipe2::ParseNextChunk ( int iBufferLen, CSphString & sError )
  21912. {
  21913. if ( !iBufferLen )
  21914. return true;
  21915. bool bLast = ( iBufferLen!=m_iBufferSize );
  21916. m_iReparseLen = 0;
  21917. if ( m_bFixupUTF8 )
  21918. {
  21919. BYTE * p = m_pBuffer;
  21920. BYTE * pMax = m_pBuffer + iBufferLen;
  21921. while ( p<pMax )
  21922. {
  21923. BYTE v = *p;
  21924. // fix control codes
  21925. if ( v<0x20 && v!=0x0D && v!=0x0A )
  21926. {
  21927. *p++ = ' ';
  21928. continue;
  21929. }
  21930. // accept ascii7 codes
  21931. if ( v<128 )
  21932. {
  21933. p++;
  21934. continue;
  21935. }
  21936. // remove invalid start bytes
  21937. if ( v<0xC2 )
  21938. {
  21939. *p++ = ' ';
  21940. continue;
  21941. }
  21942. // get and check byte count
  21943. int iBytes = 0;
  21944. while ( v & 0x80 )
  21945. {
  21946. iBytes++;
  21947. v <<= 1;
  21948. }
  21949. if ( iBytes<2 || iBytes>3 )
  21950. {
  21951. *p++ = ' ';
  21952. continue;
  21953. }
  21954. // if we're on a boundary, save these few bytes for the future
  21955. if ( p+iBytes>pMax )
  21956. {
  21957. m_iReparseStart = (int)(p-m_pBuffer);
  21958. m_iReparseLen = (int)(pMax-p);
  21959. iBufferLen -= m_iReparseLen;
  21960. break;
  21961. }
  21962. // otherwise (not a boundary), check them all
  21963. int i = 1;
  21964. int iVal = ( v >> iBytes );
  21965. for ( ; i<iBytes; i++ )
  21966. {
  21967. if ( ( p[i] & 0xC0 )!=0x80 )
  21968. break;
  21969. iVal = ( iVal<<6 ) + ( p[i] & 0x3f );
  21970. }
  21971. if ( i!=iBytes // remove invalid sequences
  21972. || ( iVal>=0xd800 && iVal<=0xdfff ) // and utf-16 surrogate pairs
  21973. || ( iBytes==3 && iVal<0x800 ) // and overlong 3-byte codes
  21974. || ( iVal>=0xfff0 && iVal<=0xffff ) ) // and kinda-valid specials expat chokes on anyway
  21975. {
  21976. iBytes = i;
  21977. for ( i=0; i<iBytes; i++ )
  21978. p[i] = ' ';
  21979. }
  21980. // only move forward by the amount of succesfully processed bytes!
  21981. p += i;
  21982. }
  21983. }
  21984. if ( XML_Parse ( m_pParser, (const char*) m_pBuffer, iBufferLen, bLast )!=XML_STATUS_OK )
  21985. {
  21986. SphDocID_t uFailedID = 0;
  21987. if ( m_dParsedDocuments.GetLength() )
  21988. uFailedID = m_dParsedDocuments.Last()->m_iDocID;
  21989. sError.SetSprintf ( "source '%s': XML parse error: %s (line=%d, pos=%d, docid=" DOCID_FMT ")",
  21990. m_tSchema.m_sName.cstr(), XML_ErrorString ( XML_GetErrorCode ( m_pParser ) ),
  21991. (int)XML_GetCurrentLineNumber ( m_pParser ), (int)XML_GetCurrentColumnNumber ( m_pParser ),
  21992. uFailedID );
  21993. m_tDocInfo.m_iDocID = 1;
  21994. return false;
  21995. }
  21996. if ( !m_sError.IsEmpty () )
  21997. {
  21998. sError = m_sError;
  21999. m_tDocInfo.m_iDocID = 1;
  22000. return false;
  22001. }
  22002. return true;
  22003. }
  22004. #endif
  22005. BYTE ** CSphSource_XMLPipe2::NextDocument ( CSphString & sError )
  22006. {
  22007. if ( m_bRemoveParsed )
  22008. {
  22009. SafeDelete ( m_dParsedDocuments[0] );
  22010. m_dParsedDocuments.RemoveFast ( 0 );
  22011. m_bRemoveParsed = false;
  22012. }
  22013. int iReadResult = 0;
  22014. #if USE_LIBEXPAT
  22015. while ( m_dParsedDocuments.GetLength()==0 )
  22016. {
  22017. // saved bytes to the front!
  22018. if ( m_iReparseLen )
  22019. memmove ( m_pBuffer, m_pBuffer+m_iReparseStart, m_iReparseLen );
  22020. // read more data
  22021. iReadResult = fread ( m_pBuffer+m_iReparseLen, 1, m_iBufferSize-m_iReparseLen, m_pPipe );
  22022. if ( iReadResult==0 )
  22023. break;
  22024. // and parse it
  22025. if ( !ParseNextChunk ( iReadResult+m_iReparseLen, sError ) )
  22026. return NULL;
  22027. }
  22028. #endif
  22029. #if USE_LIBXML
  22030. while ( m_dParsedDocuments.GetLength()==0 && ( iReadResult = ParseNextChunk ( sError ) )==1 );
  22031. #endif
  22032. while ( m_dParsedDocuments.GetLength()!=0 )
  22033. {
  22034. Document_t * pDocument = m_dParsedDocuments[0];
  22035. int nAttrs = m_tSchema.GetAttrsCount ();
  22036. // docid
  22037. m_tDocInfo.m_iDocID = VerifyID ( pDocument->m_iDocID );
  22038. if ( m_tDocInfo.m_iDocID==0 )
  22039. {
  22040. SafeDelete ( m_dParsedDocuments[0] );
  22041. m_dParsedDocuments.RemoveFast ( 0 );
  22042. continue;
  22043. }
  22044. // attributes
  22045. for ( int i = 0; i < nAttrs; i++ )
  22046. {
  22047. const CSphString & sAttrValue = pDocument->m_dAttrs[i].IsEmpty () && m_dDefaultAttrs.GetLength ()
  22048. ? m_dDefaultAttrs[i]
  22049. : pDocument->m_dAttrs[i];
  22050. const CSphColumnInfo & tAttr = m_tSchema.GetAttr ( i );
  22051. if ( tAttr.m_eAttrType==SPH_ATTR_UINT32SET || tAttr.m_eAttrType==SPH_ATTR_INT64SET )
  22052. {
  22053. m_tDocInfo.SetAttr ( tAttr.m_tLocator, ParseFieldMVA ( m_dMva, sAttrValue.cstr (), tAttr.m_eAttrType==SPH_ATTR_INT64SET ) );
  22054. continue;
  22055. }
  22056. switch ( tAttr.m_eAttrType )
  22057. {
  22058. case SPH_ATTR_ORDINAL:
  22059. case SPH_ATTR_STRING:
  22060. case SPH_ATTR_JSON:
  22061. case SPH_ATTR_WORDCOUNT:
  22062. m_dStrAttrs[i] = sAttrValue.cstr ();
  22063. if ( !m_dStrAttrs[i].cstr() )
  22064. m_dStrAttrs[i] = "";
  22065. m_tDocInfo.SetAttr ( tAttr.m_tLocator, 0 );
  22066. break;
  22067. case SPH_ATTR_FLOAT:
  22068. m_tDocInfo.SetAttrFloat ( tAttr.m_tLocator, sphToFloat ( sAttrValue.cstr () ) );
  22069. break;
  22070. case SPH_ATTR_BIGINT:
  22071. m_tDocInfo.SetAttr ( tAttr.m_tLocator, sphToInt64 ( sAttrValue.cstr () ) );
  22072. break;
  22073. default:
  22074. m_tDocInfo.SetAttr ( tAttr.m_tLocator, sphToDword ( sAttrValue.cstr () ) );
  22075. break;
  22076. }
  22077. }
  22078. m_bRemoveParsed = true;
  22079. int nFields = m_tSchema.m_dFields.GetLength ();
  22080. if ( !nFields )
  22081. {
  22082. m_tDocInfo.m_iDocID = 0;
  22083. return NULL;
  22084. }
  22085. m_dFieldPtrs.Resize ( nFields );
  22086. for ( int i = 0; i < nFields; ++i )
  22087. m_dFieldPtrs[i] = pDocument->m_dFields[i].Begin();
  22088. return (BYTE **)&( m_dFieldPtrs[0] );
  22089. }
  22090. if ( !iReadResult )
  22091. m_tDocInfo.m_iDocID = 0;
  22092. return NULL;
  22093. }
  22094. bool CSphSource_XMLPipe2::IterateKillListStart ( CSphString & )
  22095. {
  22096. m_iKillListIterator = 0;
  22097. return true;
  22098. }
  22099. bool CSphSource_XMLPipe2::IterateKillListNext ( SphDocID_t & tDocId )
  22100. {
  22101. if ( m_iKillListIterator>=m_dKillList.GetLength () )
  22102. return false;
  22103. tDocId = m_dKillList [ m_iKillListIterator++ ];
  22104. return true;
  22105. }
  22106. void CSphSource_XMLPipe2::StartElement ( const char * szName, const char ** pAttrs )
  22107. {
  22108. if ( !strcmp ( szName, "sphinx:docset" ) )
  22109. {
  22110. m_bInDocset = true;
  22111. m_bFirstTagAfterDocset = true;
  22112. return;
  22113. }
  22114. if ( !strcmp ( szName, "sphinx:schema" ) )
  22115. {
  22116. if ( !m_bInDocset || !m_bFirstTagAfterDocset )
  22117. {
  22118. Error ( "<sphinx:schema> is allowed immediately after <sphinx:docset> only" );
  22119. return;
  22120. }
  22121. if ( m_tSchema.m_dFields.GetLength () > 0 || m_tSchema.GetAttrsCount () > 0 )
  22122. {
  22123. sphWarn ( "%s", DecorateMessage ( "both embedded and configured schemas found; using embedded" ) );
  22124. m_tSchema.Reset ();
  22125. CSphMatch tDocInfo;
  22126. Swap ( m_tDocInfo, tDocInfo );
  22127. }
  22128. m_bFirstTagAfterDocset = false;
  22129. m_bInSchema = true;
  22130. return;
  22131. }
  22132. if ( !strcmp ( szName, "sphinx:field" ) )
  22133. {
  22134. if ( !m_bInDocset || !m_bInSchema )
  22135. {
  22136. Error ( "<sphinx:field> is allowed inside <sphinx:schema> only" );
  22137. return;
  22138. }
  22139. const char ** dAttrs = pAttrs;
  22140. CSphColumnInfo Info;
  22141. CSphString sDefault;
  22142. bool bIsAttr = false;
  22143. while ( dAttrs[0] && dAttrs[1] && dAttrs[0][0] && dAttrs[1][0] )
  22144. {
  22145. if ( !strcmp ( *dAttrs, "name" ) )
  22146. {
  22147. AddFieldToSchema ( dAttrs[1] );
  22148. Info.m_sName = dAttrs[1];
  22149. } else if ( !strcmp ( *dAttrs, "attr" ) )
  22150. {
  22151. bIsAttr = true;
  22152. if ( !strcmp ( dAttrs[1], "string" ) )
  22153. Info.m_eAttrType = SPH_ATTR_STRING;
  22154. else if ( !strcmp ( dAttrs[1], "json" ) )
  22155. Info.m_eAttrType = SPH_ATTR_JSON;
  22156. else if ( !strcmp ( dAttrs[1], "wordcount" ) )
  22157. Info.m_eAttrType = SPH_ATTR_WORDCOUNT;
  22158. } else if ( !strcmp ( *dAttrs, "default" ) )
  22159. sDefault = dAttrs[1];
  22160. dAttrs += 2;
  22161. }
  22162. if ( bIsAttr )
  22163. {
  22164. Info.m_iIndex = m_tSchema.GetAttrsCount ();
  22165. m_tSchema.AddAttr ( Info, true ); // all attributes are dynamic at indexing time
  22166. m_dDefaultAttrs.Add ( sDefault );
  22167. }
  22168. return;
  22169. }
  22170. if ( !strcmp ( szName, "sphinx:attr" ) )
  22171. {
  22172. if ( !m_bInDocset || !m_bInSchema )
  22173. {
  22174. Error ( "<sphinx:attr> is allowed inside <sphinx:schema> only" );
  22175. return;
  22176. }
  22177. bool bError = false;
  22178. CSphString sDefault;
  22179. CSphColumnInfo Info;
  22180. Info.m_eAttrType = SPH_ATTR_INTEGER;
  22181. const char ** dAttrs = pAttrs;
  22182. while ( dAttrs[0] && dAttrs[1] && dAttrs[0][0] && dAttrs[1][0] && !bError )
  22183. {
  22184. if ( !strcmp ( *dAttrs, "name" ) )
  22185. Info.m_sName = dAttrs[1];
  22186. else if ( !strcmp ( *dAttrs, "bits" ) )
  22187. Info.m_tLocator.m_iBitCount = strtol ( dAttrs[1], NULL, 10 );
  22188. else if ( !strcmp ( *dAttrs, "default" ) )
  22189. sDefault = dAttrs[1];
  22190. else if ( !strcmp ( *dAttrs, "type" ) )
  22191. {
  22192. const char * szType = dAttrs[1];
  22193. if ( !strcmp ( szType, "int" ) ) Info.m_eAttrType = SPH_ATTR_INTEGER;
  22194. else if ( !strcmp ( szType, "timestamp" ) ) Info.m_eAttrType = SPH_ATTR_TIMESTAMP;
  22195. else if ( !strcmp ( szType, "str2ordinal" ) ) Info.m_eAttrType = SPH_ATTR_ORDINAL;
  22196. else if ( !strcmp ( szType, "bool" ) ) Info.m_eAttrType = SPH_ATTR_BOOL;
  22197. else if ( !strcmp ( szType, "float" ) ) Info.m_eAttrType = SPH_ATTR_FLOAT;
  22198. else if ( !strcmp ( szType, "bigint" ) ) Info.m_eAttrType = SPH_ATTR_BIGINT;
  22199. else if ( !strcmp ( szType, "string" ) ) Info.m_eAttrType = SPH_ATTR_STRING;
  22200. else if ( !strcmp ( szType, "json" ) ) Info.m_eAttrType = SPH_ATTR_JSON;
  22201. else if ( !strcmp ( szType, "wordcount" ) ) Info.m_eAttrType = SPH_ATTR_WORDCOUNT;
  22202. else if ( !strcmp ( szType, "multi" ) )
  22203. {
  22204. Info.m_eAttrType = SPH_ATTR_UINT32SET;
  22205. Info.m_eSrc = SPH_ATTRSRC_FIELD;
  22206. } else if ( !strcmp ( szType, "multi_64" ) )
  22207. {
  22208. Info.m_eAttrType = SPH_ATTR_INT64SET;
  22209. Info.m_eSrc = SPH_ATTRSRC_FIELD;
  22210. } else
  22211. {
  22212. Error ( "unknown column type '%s'", szType );
  22213. bError = true;
  22214. }
  22215. }
  22216. dAttrs += 2;
  22217. }
  22218. if ( !bError )
  22219. {
  22220. Info.m_iIndex = m_tSchema.GetAttrsCount ();
  22221. m_tSchema.AddAttr ( Info, true ); // all attributes are dynamic at indexing time
  22222. m_dDefaultAttrs.Add ( sDefault );
  22223. }
  22224. return;
  22225. }
  22226. if ( !strcmp ( szName, "sphinx:document" ) )
  22227. {
  22228. if ( !m_bInDocset || m_bInSchema )
  22229. return DocumentError ( "<sphinx:schema>" );
  22230. if ( m_bInKillList )
  22231. return DocumentError ( "<sphinx:killlist>" );
  22232. if ( m_bInDocument )
  22233. return DocumentError ( "<sphinx:document>" );
  22234. if ( m_tSchema.m_dFields.GetLength()==0 && m_tSchema.GetAttrsCount()==0 )
  22235. {
  22236. Error ( "no schema configured, and no embedded schema found" );
  22237. return;
  22238. }
  22239. m_bInDocument = true;
  22240. assert ( !m_pCurDocument );
  22241. m_pCurDocument = new Document_t;
  22242. m_pCurDocument->m_iDocID = 0;
  22243. m_pCurDocument->m_dFields.Resize ( m_tSchema.m_dFields.GetLength () );
  22244. // for safety
  22245. ARRAY_FOREACH ( i, m_pCurDocument->m_dFields )
  22246. m_pCurDocument->m_dFields[i].Add ( '\0' );
  22247. m_pCurDocument->m_dAttrs.Resize ( m_tSchema.GetAttrsCount () );
  22248. if ( pAttrs[0] && pAttrs[1] && pAttrs[0][0] && pAttrs[1][0] )
  22249. if ( !strcmp ( pAttrs[0], "id" ) )
  22250. m_pCurDocument->m_iDocID = sphToDocid ( pAttrs[1] );
  22251. if ( m_pCurDocument->m_iDocID==0 )
  22252. Error ( "attribute 'id' required in <sphinx:document>" );
  22253. return;
  22254. }
  22255. if ( !strcmp ( szName, "sphinx:killlist" ) )
  22256. {
  22257. if ( !m_bInDocset || m_bInDocument || m_bInSchema )
  22258. {
  22259. Error ( "<sphinx:killlist> is not allowed inside <sphinx:schema> or <sphinx:document>" );
  22260. return;
  22261. }
  22262. m_bInKillList = true;
  22263. return;
  22264. }
  22265. if ( m_bInKillList )
  22266. {
  22267. if ( !m_bInId )
  22268. {
  22269. if ( strcmp ( szName, "id" ) )
  22270. {
  22271. Error ( "only 'id' is allowed inside <sphinx:killlist>" );
  22272. return;
  22273. }
  22274. m_bInId = true;
  22275. } else
  22276. ++m_iElementDepth;
  22277. }
  22278. if ( m_bInDocument )
  22279. {
  22280. if ( m_iCurField==-1 && m_iCurAttr==-1 )
  22281. {
  22282. for ( int i = 0; i < m_tSchema.m_dFields.GetLength () && m_iCurField==-1; i++ )
  22283. if ( m_tSchema.m_dFields[i].m_sName==szName )
  22284. m_iCurField = i;
  22285. for ( int i = 0; i < m_tSchema.GetAttrsCount () && m_iCurAttr==-1; i++ )
  22286. if ( m_tSchema.GetAttr(i).m_sName==szName )
  22287. m_iCurAttr = i;
  22288. if ( m_iCurAttr==-1 && m_iCurField==-1 )
  22289. {
  22290. m_bInIgnoredTag = true;
  22291. bool bInvalidFound = false;
  22292. for ( int i = 0; i < m_dInvalid.GetLength () && !bInvalidFound; i++ )
  22293. bInvalidFound = m_dInvalid[i]==szName;
  22294. if ( !bInvalidFound )
  22295. {
  22296. sphWarn ( "%s", DecorateMessage ( "unknown field/attribute '%s'; ignored", szName ) );
  22297. m_dInvalid.Add ( szName );
  22298. }
  22299. }
  22300. } else
  22301. m_iElementDepth++;
  22302. }
  22303. }
  22304. void CSphSource_XMLPipe2::EndElement ( const char * szName )
  22305. {
  22306. m_bInIgnoredTag = false;
  22307. if ( !strcmp ( szName, "sphinx:docset" ) )
  22308. m_bInDocset = false;
  22309. else if ( !strcmp ( szName, "sphinx:schema" ) )
  22310. {
  22311. m_bInSchema = false;
  22312. AddAutoAttrs ( m_sError );
  22313. AllocDocinfo();
  22314. } else if ( !strcmp ( szName, "sphinx:document" ) )
  22315. {
  22316. m_bInDocument = false;
  22317. if ( m_pCurDocument )
  22318. m_dParsedDocuments.Add ( m_pCurDocument );
  22319. m_pCurDocument = NULL;
  22320. } else if ( !strcmp ( szName, "sphinx:killlist" ) )
  22321. {
  22322. m_bInKillList = false;
  22323. } else if ( m_bInKillList )
  22324. {
  22325. if ( m_iElementDepth==0 )
  22326. {
  22327. if ( m_bInId )
  22328. {
  22329. m_pFieldBuffer [ Min ( m_iFieldBufferLen, m_iFieldBufferMax-1 ) ] = '\0';
  22330. m_dKillList.Add ( sphToDocid ( (const char *)m_pFieldBuffer ) );
  22331. m_iFieldBufferLen = 0;
  22332. m_bInId = false;
  22333. }
  22334. } else
  22335. m_iElementDepth--;
  22336. } else if ( m_bInDocument && ( m_iCurAttr!=-1 || m_iCurField!=-1 ) )
  22337. {
  22338. if ( m_iElementDepth==0 )
  22339. {
  22340. if ( m_iCurField!=-1 )
  22341. {
  22342. assert ( m_pCurDocument );
  22343. CSphVector<BYTE> & dBuf = m_pCurDocument->m_dFields [ m_iCurField ];
  22344. dBuf.Last() = ' ';
  22345. dBuf.Reserve ( dBuf.GetLength() + m_iFieldBufferLen + 6 ); // 6 is a safety gap
  22346. memcpy ( dBuf.Begin()+dBuf.GetLength(), m_pFieldBuffer, m_iFieldBufferLen );
  22347. dBuf.Resize ( dBuf.GetLength()+m_iFieldBufferLen );
  22348. dBuf.Add ( '\0' );
  22349. }
  22350. if ( m_iCurAttr!=-1 )
  22351. {
  22352. assert ( m_pCurDocument );
  22353. if ( !m_pCurDocument->m_dAttrs [ m_iCurAttr ].IsEmpty () )
  22354. sphWarn ( "duplicate attribute node <%s> - using first value", m_tSchema.GetAttr ( m_iCurAttr ).m_sName.cstr() );
  22355. else
  22356. m_pCurDocument->m_dAttrs [ m_iCurAttr ].SetBinary ( (char*)m_pFieldBuffer, m_iFieldBufferLen );
  22357. }
  22358. m_iFieldBufferLen = 0;
  22359. m_iCurAttr = -1;
  22360. m_iCurField = -1;
  22361. } else
  22362. m_iElementDepth--;
  22363. }
  22364. }
  22365. void CSphSource_XMLPipe2::UnexpectedCharaters ( const char * pCharacters, int iLen, const char * szComment )
  22366. {
  22367. const int MAX_WARNING_LENGTH = 64;
  22368. bool bSpaces = true;
  22369. for ( int i = 0; i < iLen && bSpaces; i++ )
  22370. if ( !sphIsSpace ( pCharacters[i] ) )
  22371. bSpaces = false;
  22372. if ( !bSpaces )
  22373. {
  22374. CSphString sWarning;
  22375. #if USE_LIBEXPAT
  22376. sWarning.SetBinary ( pCharacters, Min ( iLen, MAX_WARNING_LENGTH ) );
  22377. sphWarn ( "source '%s': unexpected string '%s' (line=%d, pos=%d) %s",
  22378. m_tSchema.m_sName.cstr(), sWarning.cstr (),
  22379. (int)XML_GetCurrentLineNumber ( m_pParser ), (int)XML_GetCurrentColumnNumber ( m_pParser ), szComment );
  22380. #endif
  22381. #if USE_LIBXML
  22382. int i = 0;
  22383. for ( i=0; i<iLen && sphIsSpace ( pCharacters[i] ); i++ );
  22384. sWarning.SetBinary ( pCharacters + i, Min ( iLen - i, MAX_WARNING_LENGTH ) );
  22385. for ( i=iLen-i-1; i>=0 && sphIsSpace ( sWarning.cstr()[i] ); i-- );
  22386. if ( i>=0 )
  22387. ( (char *)sWarning.cstr() )[i+1] = '\0';
  22388. sphWarn ( "source '%s': unexpected string '%s' %s", m_tSchema.m_sName.cstr(), sWarning.cstr(), szComment );
  22389. #endif
  22390. }
  22391. }
  22392. void CSphSource_XMLPipe2::Characters ( const char * pCharacters, int iLen )
  22393. {
  22394. if ( m_bInIgnoredTag )
  22395. return;
  22396. if ( !m_bInDocset )
  22397. {
  22398. UnexpectedCharaters ( pCharacters, iLen, "outside of <sphinx:docset>" );
  22399. return;
  22400. }
  22401. if ( !m_bInSchema && !m_bInDocument && !m_bInKillList )
  22402. {
  22403. UnexpectedCharaters ( pCharacters, iLen, "outside of <sphinx:schema> and <sphinx:document>" );
  22404. return;
  22405. }
  22406. if ( m_iCurAttr==-1 && m_iCurField==-1 && !m_bInKillList )
  22407. {
  22408. UnexpectedCharaters ( pCharacters, iLen, m_bInDocument ? "inside <sphinx:document>" : ( m_bInSchema ? "inside <sphinx:schema>" : "" ) );
  22409. return;
  22410. }
  22411. if ( iLen + m_iFieldBufferLen < m_iFieldBufferMax )
  22412. {
  22413. memcpy ( m_pFieldBuffer + m_iFieldBufferLen, pCharacters, iLen );
  22414. m_iFieldBufferLen += iLen;
  22415. } else
  22416. {
  22417. const CSphString & sName = ( m_iCurField!=-1 ) ? m_tSchema.m_dFields[m_iCurField].m_sName : m_tSchema.GetAttr ( m_iCurAttr ).m_sName;
  22418. bool bWarned = false;
  22419. for ( int i = 0; i < m_dWarned.GetLength () && !bWarned; i++ )
  22420. bWarned = m_dWarned[i]==sName;
  22421. if ( !bWarned )
  22422. {
  22423. #if USE_LIBEXPAT
  22424. sphWarn ( "source '%s': field/attribute '%s' length exceeds max length (line=%d, pos=%d, docid=" DOCID_FMT ")",
  22425. m_tSchema.m_sName.cstr(), sName.cstr(),
  22426. (int)XML_GetCurrentLineNumber ( m_pParser ), (int)XML_GetCurrentColumnNumber ( m_pParser ),
  22427. m_pCurDocument->m_iDocID );
  22428. #endif
  22429. #if USE_LIBXML
  22430. sphWarn ( "source '%s': field/attribute '%s' length exceeds max length (docid=" DOCID_FMT ")",
  22431. m_tSchema.m_sName.cstr(), sName.cstr(), m_pCurDocument->m_iDocID );
  22432. #endif
  22433. m_dWarned.Add ( sName );
  22434. }
  22435. }
  22436. }
  22437. #if USE_LIBXML
  22438. int CSphSource_XMLPipe2::ReadBuffer ( BYTE * pBuffer, int iLen )
  22439. {
  22440. int iLeft = Max ( m_pBufferEnd - m_pBufferPtr, 0 );
  22441. if ( iLeft < iLen )
  22442. {
  22443. memmove ( m_pBuffer, m_pBufferPtr, iLeft );
  22444. size_t iRead = fread ( m_pBuffer + iLeft, 1, m_iBufferSize - iLeft, m_pPipe );
  22445. m_bPassedBufferEnd = ( ( m_iBufferSize - iLeft )==int(iRead) );
  22446. m_pBufferPtr = m_pBuffer;
  22447. m_pBufferEnd = m_pBuffer + iLeft + iRead;
  22448. iLeft = Max ( m_pBufferEnd - m_pBuffer, 0 );
  22449. }
  22450. int iToCopy = Min ( iLen, iLeft );
  22451. memcpy ( pBuffer, m_pBufferPtr, iToCopy );
  22452. m_pBufferPtr += iToCopy;
  22453. return iToCopy;
  22454. }
  22455. void CSphSource_XMLPipe2::ProcessNode ( xmlTextReaderPtr pReader )
  22456. {
  22457. int iType = xmlTextReaderNodeType ( pReader );
  22458. switch ( iType )
  22459. {
  22460. case XML_READER_TYPE_ELEMENT:
  22461. {
  22462. const char * szName = (char*)xmlTextReaderName ( pReader );
  22463. m_dAttrs.Resize ( 0 );
  22464. if ( xmlTextReaderHasAttributes ( pReader ) )
  22465. {
  22466. if ( xmlTextReaderMoveToFirstAttribute ( pReader )!=1 )
  22467. return;
  22468. do
  22469. {
  22470. int iLen = m_dAttrs.GetLength ();
  22471. m_dAttrs.Resize ( iLen + 2 );
  22472. m_dAttrs[iLen] = (char*)xmlTextReaderName ( pReader );
  22473. m_dAttrs[iLen+1] = (char*)xmlTextReaderValue ( pReader );
  22474. }
  22475. while ( xmlTextReaderMoveToNextAttribute ( pReader )==1 );
  22476. }
  22477. int iLen = m_dAttrs.GetLength ();
  22478. m_dAttrs.Resize ( iLen + 2 );
  22479. m_dAttrs[iLen] = NULL;
  22480. m_dAttrs[iLen+1] = NULL;
  22481. StartElement ( szName, &m_dAttrs[0] );
  22482. }
  22483. break;
  22484. case XML_READER_TYPE_END_ELEMENT:
  22485. EndElement ( (char*)xmlTextReaderName ( pReader ) );
  22486. break;
  22487. case XML_TEXT_NODE:
  22488. {
  22489. const char * szText = (char*)xmlTextReaderValue ( pReader );
  22490. Characters ( szText, strlen ( szText ) );
  22491. }
  22492. break;
  22493. }
  22494. }
  22495. #endif
  22496. CSphSource * sphCreateSourceXmlpipe2 ( const CSphConfigSection * pSource, FILE * pPipe,
  22497. BYTE * dInitialBuf, int iBufLen, const char * szSourceName, int iMaxFieldLen )
  22498. {
  22499. CSphSource_XMLPipe2 * pXMLPipe = new CSphSource_XMLPipe2 ( dInitialBuf, iBufLen,
  22500. szSourceName, iMaxFieldLen, pSource->GetInt ( "xmlpipe_fixup_utf8", 0 )!=0 );
  22501. if ( !pXMLPipe->Setup ( pPipe, *pSource ) )
  22502. SafeDelete ( pXMLPipe );
  22503. return pXMLPipe;
  22504. }
  22505. #endif
  22506. FILE * sphDetectXMLPipe ( const char * szCommand, BYTE * dBuf, int & iBufSize, int iMaxBufSize, bool & bUsePipe2 )
  22507. {
  22508. bUsePipe2 = true; // default is xmlpipe2
  22509. FILE * pPipe = popen ( szCommand, "r" );
  22510. if ( !pPipe )
  22511. return NULL;
  22512. BYTE * pStart = dBuf;
  22513. iBufSize = (int)fread ( dBuf, 1, iMaxBufSize, pPipe );
  22514. BYTE * pEnd = pStart + iBufSize;
  22515. // BOM
  22516. if ( iBufSize>=3 )
  22517. if ( !strncmp ( (char*)pStart, "\xEF\xBB\xBF", 3 ) )
  22518. pStart += 3;
  22519. while ( isspace ( *pStart ) && pStart < pEnd )
  22520. pStart++;
  22521. if ( ( pEnd - pStart)>=5 )
  22522. bUsePipe2 = !strncasecmp ( (char *)pStart, "<?xml", 5 );
  22523. return pPipe;
  22524. }
  22525. #if USE_ODBC
  22526. CSphSourceParams_ODBC::CSphSourceParams_ODBC ()
  22527. : m_bWinAuth ( false )
  22528. , m_bUnicode ( false )
  22529. {
  22530. }
  22531. CSphSource_ODBC::CSphSource_ODBC ( const char * sName )
  22532. : CSphSource_SQL ( sName )
  22533. , m_bWinAuth ( false )
  22534. , m_bUnicode ( false )
  22535. , m_hEnv ( NULL )
  22536. , m_hDBC ( NULL )
  22537. , m_hStmt ( NULL )
  22538. , m_nResultCols ( 0 )
  22539. {
  22540. }
  22541. void CSphSource_ODBC::SqlDismissResult ()
  22542. {
  22543. if ( m_hStmt )
  22544. {
  22545. SQLCloseCursor ( m_hStmt );
  22546. SQLFreeHandle ( SQL_HANDLE_STMT, m_hStmt );
  22547. m_hStmt = NULL;
  22548. }
  22549. }
  22550. #define MS_SQL_BUFFER_GAP 16
  22551. bool CSphSource_ODBC::SqlQuery ( const char * sQuery )
  22552. {
  22553. if ( SQLAllocHandle ( SQL_HANDLE_STMT, m_hDBC, &m_hStmt )==SQL_ERROR )
  22554. {
  22555. if ( m_tParams.m_bPrintQueries )
  22556. fprintf ( stdout, "SQL-QUERY: %s: FAIL (SQLAllocHandle failed)\n", sQuery );
  22557. return false;
  22558. }
  22559. if ( SQLExecDirect ( m_hStmt, (SQLCHAR *)sQuery, SQL_NTS )==SQL_ERROR )
  22560. {
  22561. GetSqlError ( SQL_HANDLE_STMT, m_hStmt );
  22562. if ( m_tParams.m_bPrintQueries )
  22563. fprintf ( stdout, "SQL-QUERY: %s: FAIL\n", sQuery );
  22564. return false;
  22565. }
  22566. if ( m_tParams.m_bPrintQueries )
  22567. fprintf ( stdout, "SQL-QUERY: %s: ok\n", sQuery );
  22568. SQLSMALLINT nCols = 0;
  22569. m_nResultCols = 0;
  22570. if ( SQLNumResultCols ( m_hStmt, &nCols )==SQL_ERROR )
  22571. return false;
  22572. m_nResultCols = nCols;
  22573. const int MAX_NAME_LEN = 512;
  22574. char szColumnName[MAX_NAME_LEN];
  22575. m_dColumns.Resize ( m_nResultCols );
  22576. int iTotalBuffer = 0;
  22577. ARRAY_FOREACH ( i, m_dColumns )
  22578. {
  22579. QueryColumn_t & tCol = m_dColumns[i];
  22580. SQLULEN uColSize = 0;
  22581. SQLSMALLINT iNameLen = 0;
  22582. SQLSMALLINT iDataType = 0;
  22583. if ( SQLDescribeCol ( m_hStmt, (SQLUSMALLINT)(i+1), (SQLCHAR*)szColumnName,
  22584. MAX_NAME_LEN, &iNameLen, &iDataType, &uColSize, NULL, NULL )==SQL_ERROR )
  22585. return false;
  22586. tCol.m_sName = szColumnName;
  22587. tCol.m_sName.ToLower();
  22588. // deduce buffer size
  22589. // use a small buffer by default, and a bigger one for varchars
  22590. int iBuffLen = DEFAULT_COL_SIZE;
  22591. if ( iDataType==SQL_WCHAR || iDataType==SQL_WVARCHAR || iDataType==SQL_WLONGVARCHAR|| iDataType==SQL_VARCHAR )
  22592. iBuffLen = VARCHAR_COL_SIZE;
  22593. if ( m_hColBuffers ( tCol.m_sName ) )
  22594. iBuffLen = m_hColBuffers [ tCol.m_sName ]; // got explicit user override
  22595. else if ( uColSize )
  22596. iBuffLen = Min ( uColSize+1, (SQLULEN) MAX_COL_SIZE ); // got data from driver
  22597. tCol.m_dContents.Resize ( iBuffLen + MS_SQL_BUFFER_GAP );
  22598. tCol.m_dRaw.Resize ( iBuffLen + MS_SQL_BUFFER_GAP );
  22599. tCol.m_iInd = 0;
  22600. tCol.m_iBufferSize = iBuffLen;
  22601. tCol.m_bUnicode = m_bUnicode && ( iDataType==SQL_WCHAR || iDataType==SQL_WVARCHAR || iDataType==SQL_WLONGVARCHAR );
  22602. tCol.m_bTruncated = false;
  22603. iTotalBuffer += iBuffLen;
  22604. if ( SQLBindCol ( m_hStmt, (SQLUSMALLINT)(i+1),
  22605. tCol.m_bUnicode ? SQL_UNICODE : SQL_C_CHAR,
  22606. tCol.m_bUnicode ? tCol.m_dRaw.Begin() : tCol.m_dContents.Begin(),
  22607. iBuffLen, &(tCol.m_iInd) )==SQL_ERROR )
  22608. return false;
  22609. }
  22610. if ( iTotalBuffer>WARN_ROW_SIZE )
  22611. sphWarn ( "row buffer is over %d bytes; consider revising sql_column_buffers", iTotalBuffer );
  22612. return true;
  22613. }
  22614. bool CSphSource_ODBC::SqlIsError ()
  22615. {
  22616. return !m_sError.IsEmpty ();
  22617. }
  22618. const char * CSphSource_ODBC::SqlError ()
  22619. {
  22620. return m_sError.cstr();
  22621. }
  22622. bool CSphSource_ODBC::SqlConnect ()
  22623. {
  22624. if ( SQLAllocHandle ( SQL_HANDLE_ENV, NULL, &m_hEnv )==SQL_ERROR )
  22625. {
  22626. if ( m_tParams.m_bPrintQueries )
  22627. fprintf ( stdout, "SQL-CONNECT: FAIL\n" );
  22628. return false;
  22629. }
  22630. SQLSetEnvAttr ( m_hEnv, SQL_ATTR_ODBC_VERSION, (void*) SQL_OV_ODBC3, SQL_IS_INTEGER );
  22631. if ( SQLAllocHandle ( SQL_HANDLE_DBC, m_hEnv, &m_hDBC )==SQL_ERROR )
  22632. {
  22633. if ( m_tParams.m_bPrintQueries )
  22634. fprintf ( stdout, "SQL-CONNECT: FAIL\n" );
  22635. return false;
  22636. }
  22637. OdbcPostConnect ();
  22638. char szOutConn [2048];
  22639. SQLSMALLINT iOutConn = 0;
  22640. if ( SQLDriverConnect ( m_hDBC, NULL, (SQLTCHAR*) m_sOdbcDSN.cstr(), SQL_NTS,
  22641. (SQLCHAR*)szOutConn, sizeof(szOutConn), &iOutConn, SQL_DRIVER_NOPROMPT )==SQL_ERROR )
  22642. {
  22643. GetSqlError ( SQL_HANDLE_DBC, m_hDBC );
  22644. if ( m_tParams.m_bPrintQueries )
  22645. fprintf ( stdout, "SQL-CONNECT: FAIL\n" );
  22646. return false;
  22647. }
  22648. if ( m_tParams.m_bPrintQueries )
  22649. fprintf ( stdout, "SQL-CONNECT: ok\n" );
  22650. return true;
  22651. }
  22652. void CSphSource_ODBC::SqlDisconnect ()
  22653. {
  22654. if ( m_tParams.m_bPrintQueries )
  22655. fprintf ( stdout, "SQL-DISCONNECT\n" );
  22656. if ( m_hStmt!=NULL )
  22657. SQLFreeHandle ( SQL_HANDLE_STMT, m_hStmt );
  22658. if ( m_hDBC )
  22659. {
  22660. SQLDisconnect ( m_hDBC );
  22661. SQLFreeHandle ( SQL_HANDLE_DBC, m_hDBC );
  22662. }
  22663. if ( m_hEnv )
  22664. SQLFreeHandle ( SQL_HANDLE_ENV, m_hEnv );
  22665. }
  22666. int CSphSource_ODBC::SqlNumFields ()
  22667. {
  22668. if ( !m_hStmt )
  22669. return -1;
  22670. return m_nResultCols;
  22671. }
  22672. bool CSphSource_ODBC::SqlFetchRow ()
  22673. {
  22674. if ( !m_hStmt )
  22675. return false;
  22676. SQLRETURN iRet = SQLFetch ( m_hStmt );
  22677. if ( iRet==SQL_ERROR || iRet==SQL_INVALID_HANDLE || iRet==SQL_NO_DATA )
  22678. {
  22679. GetSqlError ( SQL_HANDLE_STMT, m_hStmt );
  22680. return false;
  22681. }
  22682. ARRAY_FOREACH ( i, m_dColumns )
  22683. {
  22684. QueryColumn_t & tCol = m_dColumns[i];
  22685. switch ( tCol.m_iInd )
  22686. {
  22687. case SQL_NULL_DATA:
  22688. tCol.m_dContents[0] = '\0';
  22689. tCol.m_dContents[0] = '\0';
  22690. break;
  22691. default:
  22692. #if USE_WINDOWS // FIXME! support UCS-2 columns on Unix too
  22693. if ( tCol.m_bUnicode )
  22694. {
  22695. // WideCharToMultiByte should get NULL terminated string
  22696. memset ( tCol.m_dRaw.Begin()+tCol.m_iBufferSize, 0, MS_SQL_BUFFER_GAP );
  22697. int iConv = WideCharToMultiByte ( CP_UTF8, 0, LPCWSTR ( tCol.m_dRaw.Begin() ), tCol.m_iInd/sizeof(WCHAR),
  22698. LPSTR ( tCol.m_dContents.Begin() ), tCol.m_iBufferSize-1, NULL, NULL );
  22699. if ( iConv==0 )
  22700. if ( GetLastError()==ERROR_INSUFFICIENT_BUFFER )
  22701. iConv = tCol.m_iBufferSize-1;
  22702. tCol.m_dContents[iConv] = '\0';
  22703. } else
  22704. #endif
  22705. {
  22706. if ( tCol.m_iInd>=0 && tCol.m_iInd<tCol.m_iBufferSize )
  22707. {
  22708. // data fetched ok; add trailing zero
  22709. tCol.m_dContents[tCol.m_iInd] = '\0';
  22710. } else if ( tCol.m_iInd>=tCol.m_iBufferSize && !tCol.m_bTruncated )
  22711. {
  22712. // out of buffer; warn about that (once)
  22713. tCol.m_bTruncated = true;
  22714. sphWarn ( "'%s' column truncated (buffer=%d, got=%d); consider revising sql_column_buffers",
  22715. tCol.m_sName.cstr(), tCol.m_iBufferSize-1, (int) tCol.m_iInd );
  22716. }
  22717. }
  22718. break;
  22719. }
  22720. }
  22721. return iRet!=SQL_NO_DATA;
  22722. }
  22723. const char * CSphSource_ODBC::SqlColumn ( int iIndex )
  22724. {
  22725. if ( !m_hStmt )
  22726. return NULL;
  22727. return &(m_dColumns [iIndex].m_dContents[0]);
  22728. }
  22729. const char * CSphSource_ODBC::SqlFieldName ( int iIndex )
  22730. {
  22731. return m_dColumns[iIndex].m_sName.cstr();
  22732. }
  22733. DWORD CSphSource_ODBC::SqlColumnLength ( int )
  22734. {
  22735. return 0;
  22736. }
  22737. bool CSphSource_ODBC::Setup ( const CSphSourceParams_ODBC & tParams )
  22738. {
  22739. if ( !CSphSource_SQL::Setup ( tParams ) )
  22740. return false;
  22741. // parse column buffers spec, if any
  22742. if ( !tParams.m_sColBuffers.IsEmpty() )
  22743. {
  22744. const char * p = tParams.m_sColBuffers.cstr();
  22745. while ( *p )
  22746. {
  22747. // skip space
  22748. while ( sphIsSpace(*p) )
  22749. p++;
  22750. // expect eof or ident
  22751. if ( !*p )
  22752. break;
  22753. if ( !sphIsAlpha(*p) )
  22754. {
  22755. m_sError.SetSprintf ( "identifier expected in sql_column_buffers near '%s'", p );
  22756. return false;
  22757. }
  22758. // get ident
  22759. CSphString sCol;
  22760. const char * pIdent = p;
  22761. while ( sphIsAlpha(*p) )
  22762. p++;
  22763. sCol.SetBinary ( pIdent, p-pIdent );
  22764. // skip space
  22765. while ( sphIsSpace(*p) )
  22766. p++;
  22767. // expect assignment
  22768. if ( *p!='=' )
  22769. {
  22770. m_sError.SetSprintf ( "'=' expected in sql_column_buffers near '%s'", p );
  22771. return false;
  22772. }
  22773. p++;
  22774. // skip space
  22775. while ( sphIsSpace(*p) )
  22776. p++;
  22777. // expect number
  22778. if (!( *p>='0' && *p<='9' ))
  22779. {
  22780. m_sError.SetSprintf ( "number expected in sql_column_buffers near '%s'", p );
  22781. return false;
  22782. }
  22783. // get value
  22784. int iSize = 0;
  22785. while ( *p>='0' && *p<='9' )
  22786. {
  22787. iSize = 10*iSize + ( *p-'0' );
  22788. p++;
  22789. }
  22790. if ( *p=='K' )
  22791. {
  22792. iSize *= 1024;
  22793. p++;
  22794. } else if ( *p=='M' )
  22795. {
  22796. iSize *= 1048576;
  22797. p++;
  22798. }
  22799. // hash value
  22800. sCol.ToLower();
  22801. m_hColBuffers.Add ( iSize, sCol );
  22802. // skip space
  22803. while ( sphIsSpace(*p) )
  22804. p++;
  22805. // expect eof or comma
  22806. if ( !*p )
  22807. break;
  22808. if ( *p!=',' )
  22809. {
  22810. m_sError.SetSprintf ( "comma expected in sql_column_buffers near '%s'", p );
  22811. return false;
  22812. }
  22813. p++;
  22814. }
  22815. }
  22816. // ODBC specific params
  22817. m_sOdbcDSN = tParams.m_sOdbcDSN;
  22818. m_bWinAuth = tParams.m_bWinAuth;
  22819. m_bUnicode = tParams.m_bUnicode;
  22820. // build and store DSN for error reporting
  22821. char sBuf [ 1024 ];
  22822. snprintf ( sBuf, sizeof(sBuf), "odbc%s", m_sSqlDSN.cstr()+3 );
  22823. m_sSqlDSN = sBuf;
  22824. return true;
  22825. }
  22826. void CSphSource_ODBC::GetSqlError ( SQLSMALLINT iHandleType, SQLHANDLE hHandle )
  22827. {
  22828. if ( !hHandle )
  22829. {
  22830. m_sError.SetSprintf ( "invalid handle" );
  22831. return;
  22832. }
  22833. char szState[16] = "";
  22834. char szMessageText[1024] = "";
  22835. SQLINTEGER iError;
  22836. SQLSMALLINT iLen;
  22837. SQLGetDiagRec ( iHandleType, hHandle, 1, (SQLCHAR*)szState, &iError, (SQLCHAR*)szMessageText, 1024, &iLen );
  22838. m_sError = szMessageText;
  22839. }
  22840. //////////////////////////////////////////////////////////////////////////
  22841. void CSphSource_MSSQL::OdbcPostConnect ()
  22842. {
  22843. const int MAX_LEN = 1024;
  22844. char szDriver[MAX_LEN];
  22845. char szDriverAttrs[MAX_LEN];
  22846. SQLSMALLINT iDescLen = 0;
  22847. SQLSMALLINT iAttrLen = 0;
  22848. SQLSMALLINT iDir = SQL_FETCH_FIRST;
  22849. CSphString sDriver;
  22850. for ( ;; )
  22851. {
  22852. SQLRETURN iRet = SQLDrivers ( m_hEnv, iDir, (SQLCHAR*)szDriver, MAX_LEN, &iDescLen, (SQLCHAR*)szDriverAttrs, MAX_LEN, &iAttrLen );
  22853. if ( iRet==SQL_NO_DATA )
  22854. break;
  22855. iDir = SQL_FETCH_NEXT;
  22856. if ( !strcmp ( szDriver, "SQL Native Client" )
  22857. || !strncmp ( szDriver, "SQL Server Native Client", strlen("SQL Server Native Client") ) )
  22858. {
  22859. sDriver = szDriver;
  22860. break;
  22861. }
  22862. }
  22863. if ( sDriver.IsEmpty() )
  22864. sDriver = "SQL Server";
  22865. if ( m_bWinAuth && m_tParams.m_sUser.IsEmpty () )
  22866. {
  22867. m_sOdbcDSN.SetSprintf ( "DRIVER={%s};SERVER={%s};Database={%s};Trusted_Connection=yes",
  22868. sDriver.cstr (), m_tParams.m_sHost.cstr (), m_tParams.m_sDB.cstr () );
  22869. } else if ( m_bWinAuth )
  22870. {
  22871. m_sOdbcDSN.SetSprintf ( "DRIVER={%s};SERVER={%s};UID={%s};PWD={%s};Database={%s};Trusted_Connection=yes",
  22872. sDriver.cstr (), m_tParams.m_sHost.cstr (), m_tParams.m_sUser.cstr (), m_tParams.m_sPass.cstr (), m_tParams.m_sDB.cstr () );
  22873. } else
  22874. {
  22875. m_sOdbcDSN.SetSprintf ( "DRIVER={%s};SERVER={%s};UID={%s};PWD={%s};Database={%s}",
  22876. sDriver.cstr (), m_tParams.m_sHost.cstr (), m_tParams.m_sUser.cstr (), m_tParams.m_sPass.cstr (), m_tParams.m_sDB.cstr () );
  22877. }
  22878. }
  22879. #endif
  22880. /////////////////////////////////////////////////////////////////////////////
  22881. void sphSetQuiet ( bool bQuiet )
  22882. {
  22883. g_bSphQuiet = bQuiet;
  22884. }
  22885. void sphSetJsonOptions ( bool bStrict, bool bAutoconvNumbers, bool bKeynamesToLowercase )
  22886. {
  22887. g_bJsonStrict = bStrict;
  22888. g_bJsonAutoconvNumbers = bAutoconvNumbers;
  22889. g_bJsonKeynamesToLowercase = bKeynamesToLowercase;
  22890. }
  /// Returns a as a percentage of b, truncated to three decimal places;
  /// a zero b is reported as 100 percent.
  static inline float GetPercent ( int64_t a, int64_t b )
  {
      if ( b!=0 )
      {
          // integer math in thousandths of a percent, then scale back
          const int64_t iThousandths = a*100000/b;
          return float(iThousandths)/1000;
      }
      return 100.0f;
  }
  22898. const char * CSphIndexProgress::BuildMessage() const
  22899. {
  22900. static char sBuf[256];
  22901. switch ( m_ePhase )
  22902. {
  22903. case PHASE_COLLECT:
  22904. snprintf ( sBuf, sizeof(sBuf), "collected "INT64_FMT" docs, %.1f MB", m_iDocuments,
  22905. float(m_iBytes)/1000000.0f );
  22906. break;
  22907. case PHASE_SORT:
  22908. snprintf ( sBuf, sizeof(sBuf), "sorted %.1f Mhits, %.1f%% done", float(m_iHits)/1000000,
  22909. GetPercent ( m_iHits, m_iHitsTotal ) );
  22910. break;
  22911. case PHASE_COLLECT_MVA:
  22912. snprintf ( sBuf, sizeof(sBuf), "collected "INT64_FMT" attr values", m_iAttrs );
  22913. break;
  22914. case PHASE_SORT_MVA:
  22915. snprintf ( sBuf, sizeof(sBuf), "sorted %.1f Mvalues, %.1f%% done", float(m_iAttrs)/1000000,
  22916. GetPercent ( m_iAttrs, m_iAttrsTotal ) );
  22917. break;
  22918. case PHASE_MERGE:
  22919. snprintf ( sBuf, sizeof(sBuf), "merged %.1f Kwords", float(m_iWords)/1000 );
  22920. break;
  22921. case PHASE_PREREAD:
  22922. snprintf ( sBuf, sizeof(sBuf), "read %.1f of %.1f MB, %.1f%% done",
  22923. float(m_iBytes)/1000000.0f, float(m_iBytesTotal)/1000000.0f,
  22924. GetPercent ( m_iBytes, m_iBytesTotal ) );
  22925. break;
  22926. case PHASE_PRECOMPUTE:
  22927. snprintf ( sBuf, sizeof(sBuf), "indexing attributes, %d.%d%% done", m_iDone/10, m_iDone%10 );
  22928. break;
  22929. default:
  22930. assert ( 0 && "internal error: unhandled progress phase" );
  22931. snprintf ( sBuf, sizeof(sBuf), "(progress-phase-%d)", m_ePhase );
  22932. break;
  22933. }
  22934. sBuf[sizeof(sBuf)-1] = '\0';
  22935. return sBuf;
  22936. }
  22937. void CSphIndexProgress::Show ( bool bPhaseEnd ) const
  22938. {
  22939. if ( m_fnProgress )
  22940. m_fnProgress ( this, bPhaseEnd );
  22941. }
  22942. /////////////////////////////////////////////////////////////////////////////
  /// Compare two strings over the length of the shorter one only.
  /// Returns the strncmp() result; a string that is a prefix of the other compares equal.
  int sphDictCmp ( const char * pStr1, int iLen1, const char * pStr2, int iLen2 )
  {
      assert ( pStr1 && pStr2 );
      assert ( iLen1 && iLen2 );

      const int iShorter = ( iLen1<iLen2 ) ? iLen1 : iLen2;
      return strncmp ( pStr1, pStr2, iShorter );
  }
  /// Compare two strings over the length of the shorter one; when that common part
  /// is equal, the length difference (iLen1-iLen2) decides.
  int sphDictCmpStrictly ( const char * pStr1, int iLen1, const char * pStr2, int iLen2 )
  {
      assert ( pStr1 && pStr2 );
      assert ( iLen1 && iLen2 );

      const int iShorter = ( iLen1<iLen2 ) ? iLen1 : iLen2;
      const int iCommon = strncmp ( pStr1, pStr2, iShorter );
      if ( iCommon!=0 )
          return iCommon;
      return iLen1-iLen2;
  }
  22958. CWordlist::CWordlist ()
  22959. : m_dCheckpoints ( 0 )
  22960. , m_dInfixBlocks ( 0 )
  22961. {
  22962. m_iDictCheckpointsOffset = 0;
  22963. m_iSize = 0;
  22964. m_iMaxChunk = 0;
  22965. m_bWordDict = false;
  22966. m_pWords = NULL;
  22967. m_pInfixBlocksWords = NULL;
  22968. }
  22969. CWordlist::~CWordlist ()
  22970. {
  22971. Reset();
  22972. }
  22973. void CWordlist::Reset ()
  22974. {
  22975. m_tFile.Close ();
  22976. m_pBuf.Reset ();
  22977. m_dCheckpoints.Reset ( 0 );
  22978. SafeDeleteArray ( m_pWords );
  22979. SafeDeleteArray ( m_pInfixBlocksWords );
  22980. }
  22981. bool CWordlist::ReadCP ( CSphAutofile & tFile, DWORD uVersion, bool bWordDict, CSphString & sError )
  22982. {
  22983. assert ( ( uVersion>=21 && bWordDict ) || !bWordDict );
  22984. assert ( m_iDictCheckpointsOffset>0 );
  22985. assert ( m_iSize-m_iDictCheckpointsOffset<UINT_MAX );
  22986. m_bHaveSkips = ( uVersion>=31 );
  22987. ////////////////////////////
  22988. // preload word checkpoints
  22989. ////////////////////////////
  22990. int iCheckpointOnlySize = (int)(m_iSize-m_iDictCheckpointsOffset);
  22991. if ( m_iInfixCodepointBytes && m_iInfixBlocksOffset )
  22992. iCheckpointOnlySize = (int)(m_iInfixBlocksOffset - strlen ( g_sTagInfixBlocks ) - m_iDictCheckpointsOffset);
  22993. CSphReader tReader;
  22994. tReader.SetFile ( tFile );
  22995. tReader.SeekTo ( m_iDictCheckpointsOffset, iCheckpointOnlySize );
  22996. m_bWordDict = bWordDict;
  22997. if ( m_bWordDict )
  22998. {
  22999. int iArenaSize = iCheckpointOnlySize
  23000. - (sizeof(DWORD)+sizeof(SphOffset_t))*m_dCheckpoints.GetLength()
  23001. + sizeof(BYTE)*m_dCheckpoints.GetLength();
  23002. assert ( iArenaSize>=0 );
  23003. m_pWords = new BYTE[iArenaSize];
  23004. assert ( m_pWords );
  23005. BYTE * pWord = m_pWords;
  23006. ARRAY_FOREACH ( i, m_dCheckpoints )
  23007. {
  23008. m_dCheckpoints[i].m_sWord = (char *)pWord;
  23009. const int iLen = tReader.GetDword();
  23010. assert ( iLen>0 );
  23011. assert ( iLen+1+(pWord-m_pWords)<=iArenaSize );
  23012. tReader.GetBytes ( pWord, iLen );
  23013. pWord[iLen] = '\0';
  23014. pWord += iLen+1;
  23015. m_dCheckpoints[i].m_iWordlistOffset = tReader.GetOffset();
  23016. }
  23017. } else if ( uVersion>=11 )
  23018. {
  23019. // read v.14 checkpoints
  23020. ARRAY_FOREACH ( i, m_dCheckpoints )
  23021. {
  23022. m_dCheckpoints[i].m_iWordID = (SphWordID_t)tReader.GetOffset();
  23023. m_dCheckpoints[i].m_iWordlistOffset = tReader.GetOffset();
  23024. }
  23025. } else
  23026. {
  23027. // convert v.10 checkpoints
  23028. ARRAY_FOREACH ( i, m_dCheckpoints )
  23029. {
  23030. #if USE_64BIT
  23031. m_dCheckpoints[i].m_iWordID = tReader.GetOffset();
  23032. #else
  23033. m_dCheckpoints[i].m_iWordID = tReader.GetDword();
  23034. #endif
  23035. m_dCheckpoints[i].m_iWordlistOffset = tReader.GetDword();
  23036. }
  23037. }
  23038. ////////////////////////
  23039. // preload infix blocks
  23040. ////////////////////////
  23041. if ( m_iInfixCodepointBytes && m_iInfixBlocksOffset )
  23042. {
  23043. // reading to vector as old version doesn't store total infix words length
  23044. CSphTightVector<BYTE> dInfixWords;
  23045. dInfixWords.Reserve ( (int)m_iInfixBlocksWordsSize );
  23046. tReader.SeekTo ( m_iInfixBlocksOffset, (int)(m_iSize-m_iInfixBlocksOffset) );
  23047. m_dInfixBlocks.Resize ( tReader.UnzipInt() );
  23048. ARRAY_FOREACH ( i, m_dInfixBlocks )
  23049. {
  23050. int iBytes = tReader.UnzipInt();
  23051. int iOff = dInfixWords.GetLength();
  23052. m_dInfixBlocks[i].m_iInfixOffset = iOff;
  23053. dInfixWords.Resize ( iOff+iBytes+1 );
  23054. tReader.GetBytes ( dInfixWords.Begin()+iOff, iBytes );
  23055. dInfixWords[iOff+iBytes] = '\0';
  23056. m_dInfixBlocks[i].m_iOffset = tReader.UnzipInt();
  23057. }
  23058. // fix-up offset to pointer
  23059. m_pInfixBlocksWords = dInfixWords.LeakData();
  23060. ARRAY_FOREACH ( i, m_dInfixBlocks )
  23061. m_dInfixBlocks[i].m_sInfix = (const char *)m_pInfixBlocksWords + m_dInfixBlocks[i].m_iInfixOffset;
  23062. }
  23063. // set wordlist end
  23064. assert ( !m_iInfixCodepointBytes || !m_iInfixBlocksOffset || m_dInfixBlocks.GetLength() );
  23065. m_iWordsEnd = m_iDictCheckpointsOffset;
  23066. if ( m_iInfixCodepointBytes && m_iInfixBlocksOffset )
  23067. {
  23068. m_iWordsEnd = m_dInfixBlocks.Begin()->m_iOffset - strlen ( g_sTagInfixEntries );
  23069. }
  23070. // TODO: count m_dInfixBlocks too while make on_disk_dict work with dict=keywords + infix
  23071. SphOffset_t uMaxChunk = 0;
  23072. if ( m_dCheckpoints.GetLength() )
  23073. {
  23074. uMaxChunk = m_iWordsEnd - m_dCheckpoints.Last().m_iWordlistOffset;
  23075. SphOffset_t uPrev = m_dCheckpoints.Begin()->m_iWordlistOffset;
  23076. for ( int i=1; i<m_dCheckpoints.GetLength(); i++ )
  23077. {
  23078. SphOffset_t uOff = m_dCheckpoints[i].m_iWordlistOffset;
  23079. uMaxChunk = Max ( uMaxChunk, uOff-uPrev );
  23080. uPrev = uOff;
  23081. }
  23082. }
  23083. assert ( uMaxChunk<UINT_MAX );
  23084. m_iMaxChunk = (int)uMaxChunk;
  23085. ////////
  23086. // done
  23087. ////////
  23088. if ( tReader.GetErrorFlag() )
  23089. sError = tReader.GetErrorMessage();
  23090. return !tReader.GetErrorFlag();
  23091. }
  23092. const CSphWordlistCheckpoint * CWordlist::FindCheckpoint ( const char * sWord, int iWordLen, SphWordID_t iWordID, bool bStarMode ) const
  23093. {
  23094. return sphSearchCheckpoint ( sWord, iWordLen, iWordID, bStarMode, m_bWordDict, m_dCheckpoints.Begin(), &m_dCheckpoints.Last() );
  23095. }
  23096. KeywordsBlockReader_c::KeywordsBlockReader_c ( const BYTE * pBuf, bool bSkips )
  23097. {
  23098. m_pBuf = pBuf;
  23099. m_sWord[0] = '\0';
  23100. m_iLen = 0;
  23101. m_bHaveSkips = bSkips;
  23102. }
  23103. bool KeywordsBlockReader_c::UnpackWord()
  23104. {
  23105. if ( !m_pBuf )
  23106. return false;
  23107. // unpack next word
  23108. // must be in sync with DictEnd()!
  23109. BYTE uPack = *m_pBuf++;
  23110. if ( !uPack )
  23111. {
  23112. // ok, this block is over
  23113. m_pBuf = NULL;
  23114. m_iLen = 0;
  23115. return false;
  23116. }
  23117. int iMatch, iDelta;
  23118. if ( uPack & 0x80 )
  23119. {
  23120. iDelta = ( ( uPack>>4 ) & 7 ) + 1;
  23121. iMatch = uPack & 15;
  23122. } else
  23123. {
  23124. iDelta = uPack & 127;
  23125. iMatch = *m_pBuf++;
  23126. }
  23127. assert ( iMatch+iDelta<(int)sizeof(m_sWord)-1 );
  23128. assert ( iMatch<=(int)strlen ( (char *)m_sWord ) );
  23129. memcpy ( m_sWord + iMatch, m_pBuf, iDelta );
  23130. m_pBuf += iDelta;
  23131. m_iLen = iMatch + iDelta;
  23132. m_sWord[m_iLen] = '\0';
  23133. m_iDoclistOffset = sphUnzipOffset ( m_pBuf );
  23134. m_iDocs = sphUnzipInt ( m_pBuf );
  23135. m_iHits = sphUnzipInt ( m_pBuf );
  23136. m_uHint = ( m_iDocs>=DOCLIST_HINT_THRESH ) ? *m_pBuf++ : 0;
  23137. m_iDoclistHint = DoclistHintUnpack ( m_iDocs, m_uHint );
  23138. if ( m_bHaveSkips && ( m_iDocs > SPH_SKIPLIST_BLOCK ) )
  23139. m_iSkiplistOffset = sphUnzipInt ( m_pBuf );
  23140. else
  23141. m_iSkiplistOffset = 0;
  23142. assert ( m_iLen>0 );
  23143. return true;
  23144. }
  23145. bool CWordlist::GetWord ( const BYTE * pBuf, SphWordID_t iWordID, CSphDictEntry & tWord ) const
  23146. {
  23147. SphWordID_t iLastID = 0;
  23148. SphOffset_t uLastOff = 0;
  23149. for ( ;; )
  23150. {
  23151. // unpack next word ID
  23152. const SphWordID_t iDeltaWord = sphUnzipWordid ( pBuf ); // FIXME! slow with 32bit wordids
  23153. if ( iDeltaWord==0 ) // wordlist chunk is over
  23154. return false;
  23155. iLastID += iDeltaWord;
  23156. // list is sorted, so if there was no match, there's no such word
  23157. if ( iLastID>iWordID )
  23158. return false;
  23159. // unpack next offset
  23160. const SphOffset_t iDeltaOffset = sphUnzipOffset ( pBuf );
  23161. uLastOff += iDeltaOffset;
  23162. // unpack doc/hit count
  23163. const int iDocs = sphUnzipInt ( pBuf );
  23164. const int iHits = sphUnzipInt ( pBuf );
  23165. SphOffset_t iSkiplistPos = 0;
  23166. if ( m_bHaveSkips && ( iDocs > SPH_SKIPLIST_BLOCK ) )
  23167. iSkiplistPos = sphUnzipOffset ( pBuf );
  23168. assert ( iDeltaOffset );
  23169. assert ( iDocs );
  23170. assert ( iHits );
  23171. // it matches?!
  23172. if ( iLastID==iWordID )
  23173. {
  23174. sphUnzipWordid ( pBuf ); // might be 0 at checkpoint
  23175. const SphOffset_t iDoclistLen = sphUnzipOffset ( pBuf );
  23176. tWord.m_iDoclistOffset = uLastOff;
  23177. tWord.m_iDocs = iDocs;
  23178. tWord.m_iHits = iHits;
  23179. tWord.m_iDoclistHint = (int)iDoclistLen;
  23180. tWord.m_iSkiplistOffset = iSkiplistPos;
  23181. return true;
  23182. }
  23183. }
  23184. }
  23185. const BYTE * CWordlist::AcquireDict ( const CSphWordlistCheckpoint * pCheckpoint, int iFD, BYTE * pDictBuf ) const
  23186. {
  23187. assert ( pCheckpoint );
  23188. assert ( m_dCheckpoints.GetLength() );
  23189. assert ( pCheckpoint>=m_dCheckpoints.Begin() && pCheckpoint<=&m_dCheckpoints.Last() );
  23190. assert ( pCheckpoint->m_iWordlistOffset>0 && pCheckpoint->m_iWordlistOffset<=m_iSize );
  23191. assert ( m_pBuf.IsEmpty() || pCheckpoint->m_iWordlistOffset<(int64_t)m_pBuf.GetLength() );
  23192. // TODO: implement on_disk_dict = 1 for dict=keywords + infix
  23193. const BYTE * pBuf = NULL;
  23194. if ( !m_pBuf.IsEmpty() )
  23195. pBuf = m_pBuf.GetWritePtr()+pCheckpoint->m_iWordlistOffset;
  23196. else
  23197. {
  23198. assert ( pDictBuf );
  23199. SphOffset_t iChunkLength = 0;
  23200. // not the end?
  23201. if ( pCheckpoint < &m_dCheckpoints.Last() )
  23202. iChunkLength = pCheckpoint[1].m_iWordlistOffset - pCheckpoint->m_iWordlistOffset;
  23203. else
  23204. iChunkLength = m_iWordsEnd - pCheckpoint->m_iWordlistOffset;
  23205. assert ( iChunkLength<=m_iMaxChunk );
  23206. if ( (int)sphPread ( iFD, pDictBuf, (size_t)iChunkLength, pCheckpoint->m_iWordlistOffset )==iChunkLength )
  23207. pBuf = pDictBuf;
  23208. }
  23209. return pBuf;
  23210. }
  /// Weight of an expanded keyword: 1 for words with at most 256 hits,
  /// document count plus one otherwise.
  int sphGetExpansionMagic ( int iDocs, int iHits )
  {
      // magic threshold; mb make this configurable?
      const bool bRareWord = ( iHits<=256 );
      return bRareWord ? 1 : iDocs + 1;
  }
  23218. static inline void AddExpansion ( CSphVector<CSphNamedInt> & dExpanded, const KeywordsBlockReader_c & tCtx )
  23219. {
  23220. assert ( tCtx.GetWordLen() );
  23221. CSphNamedInt & tRes = dExpanded.Add();
  23222. tRes.m_sName = tCtx.GetWord();
  23223. tRes.m_iValue = sphGetExpansionMagic ( tCtx.m_iDocs, tCtx.m_iHits );
  23224. }
  23225. void CWordlist::GetPrefixedWords ( const char * sPrefix, int iPrefixLen, const char * sWildcard,
  23226. CSphVector<CSphNamedInt> & dExpanded, BYTE * pDictBuf, int iFD ) const
  23227. {
  23228. assert ( sPrefix && *sPrefix && iPrefixLen>0 );
  23229. assert ( sWildcard && *sWildcard );
  23230. // empty index?
  23231. if ( !m_dCheckpoints.GetLength() )
  23232. return;
  23233. const CSphWordlistCheckpoint * pCheckpoint = FindCheckpoint ( sPrefix, iPrefixLen, 0, true );
  23234. const int iSkipMagic = ( BYTE(*sPrefix)<0x20 ); // whether to skip heading magic chars in the prefix, like NONSTEMMED maker
  23235. while ( pCheckpoint )
  23236. {
  23237. // decode wordlist chunk
  23238. KeywordsBlockReader_c tCtx ( AcquireDict ( pCheckpoint, iFD, pDictBuf ), m_bHaveSkips );
  23239. while ( tCtx.UnpackWord() )
  23240. {
  23241. // block is sorted
  23242. // so once keywords are greater than the prefix, no more matches
  23243. int iCmp = sphDictCmp ( sPrefix, iPrefixLen, tCtx.GetWord(), tCtx.GetWordLen() );
  23244. if ( iCmp<0 )
  23245. break;
  23246. // does it match the prefix *and* the entire wildcard?
  23247. if ( iCmp==0 && sphWildcardMatch ( tCtx.GetWord() + iSkipMagic, sWildcard ) )
  23248. AddExpansion ( dExpanded, tCtx );
  23249. }
  23250. pCheckpoint++;
  23251. if ( pCheckpoint > &m_dCheckpoints.Last() )
  23252. break;
  23253. if ( sphDictCmp ( sPrefix, iPrefixLen, pCheckpoint->m_sWord, strlen ( pCheckpoint->m_sWord ) )<0 )
  23254. break;
  23255. }
  23256. }
  23257. bool operator < ( const InfixBlock_t & a, const char * b )
  23258. {
  23259. return strcmp ( a.m_sInfix, b )<0;
  23260. }
  23261. bool operator == ( const InfixBlock_t & a, const char * b )
  23262. {
  23263. return strcmp ( a.m_sInfix, b )==0;
  23264. }
  23265. bool operator < ( const char * a, const InfixBlock_t & b )
  23266. {
  23267. return strcmp ( a, b.m_sInfix )<0;
  23268. }
  23269. bool sphLookupInfixCheckpoints ( const char * sInfix, int iBytes, const BYTE * pInfixes, const CSphVector<InfixBlock_t> & dInfixBlocks, int iInfixCodepointBytes, CSphVector<int> & dCheckpoints )
  23270. {
  23271. assert ( pInfixes );
  23272. dCheckpoints.Resize ( 0 );
  23273. // lookup block
  23274. int iBlock = FindSpan ( dInfixBlocks, sInfix );
  23275. if ( iBlock<0 )
  23276. return false;
  23277. const BYTE * pBlock = pInfixes + dInfixBlocks[iBlock].m_iOffset;
  23278. // decode block and check for exact infix match
  23279. // block entry is { byte edit_code, byte[] key_append, zint data_len, zint data_deltas[] }
  23280. // zero edit_code marks block end
  23281. BYTE sKey[32];
  23282. for ( ;; )
  23283. {
  23284. // unpack next key
  23285. int iCode = *pBlock++;
  23286. if ( !iCode )
  23287. break;
  23288. BYTE * pOut = sKey;
  23289. if ( iInfixCodepointBytes==1 )
  23290. {
  23291. pOut = sKey + ( iCode>>4 );
  23292. iCode &= 15;
  23293. while ( iCode-- )
  23294. *pOut++ = *pBlock++;
  23295. } else
  23296. {
  23297. int iKeep = ( iCode>>4 );
  23298. while ( iKeep-- )
  23299. pOut += sphUtf8CharBytes ( *pOut ); ///< wtf? *pOut (=sKey) is NOT initialized?
  23300. assert ( pOut-sKey<=(int)sizeof(sKey) );
  23301. iCode &= 15;
  23302. while ( iCode-- )
  23303. {
  23304. int i = sphUtf8CharBytes ( *pBlock );
  23305. while ( i-- )
  23306. *pOut++ = *pBlock++;
  23307. }
  23308. assert ( pOut-sKey<=(int)sizeof(sKey) );
  23309. }
  23310. assert ( pOut-sKey<(int)sizeof(sKey) );
  23311. #ifndef NDEBUG
  23312. *pOut = '\0'; // handy for debugging, but not used for real matching
  23313. #endif
  23314. if ( pOut==sKey+iBytes && memcmp ( sKey, sInfix, iBytes )==0 )
  23315. {
  23316. // found you! decompress the data
  23317. int iLast = 0;
  23318. int iPackedLen = sphUnzipInt ( pBlock );
  23319. const BYTE * pMax = pBlock + iPackedLen;
  23320. while ( pBlock<pMax )
  23321. {
  23322. iLast += sphUnzipInt ( pBlock );
  23323. dCheckpoints.Add ( iLast );
  23324. }
  23325. return true;
  23326. }
  23327. int iSkip = sphUnzipInt ( pBlock );
  23328. pBlock += iSkip;
  23329. }
  23330. return false;
  23331. }
  23332. // calculate length, upto iInfixCodepointBytes chars from infix start
  23333. int sphGetInfixLength ( const char * sInfix, int iBytes, int iInfixCodepointBytes )
  23334. {
  23335. int iBytes1 = Min ( 6, iBytes );
  23336. if ( iInfixCodepointBytes!=1 )
  23337. {
  23338. int iCharsLeft = 6;
  23339. const char * s = sInfix;
  23340. const char * sMax = sInfix + iBytes;
  23341. while ( iCharsLeft-- && s<sMax )
  23342. s += sphUtf8CharBytes(*s);
  23343. iBytes1 = (int)( s - sInfix );
  23344. }
  23345. return iBytes1;
  23346. }
  23347. void CWordlist::GetInfixedWords ( const char * sInfix, int iBytes, const char * sWildcard, CSphVector<CSphNamedInt> & dExpanded ) const
  23348. {
  23349. // dict must be of keywords type, and fully cached
  23350. // mmap()ed in the worst case, should we ever banish it to disk again
  23351. if ( m_pBuf.IsEmpty() || !m_dCheckpoints.GetLength() )
  23352. return;
  23353. // extract key1, upto 6 chars from infix start
  23354. int iBytes1 = sphGetInfixLength ( sInfix, iBytes, m_iInfixCodepointBytes );
  23355. // lookup key1
  23356. // OPTIMIZE? maybe lookup key2 and reduce checkpoint set size, if possible?
  23357. CSphVector<int> dPoints;
  23358. if ( !sphLookupInfixCheckpoints ( sInfix, iBytes1, m_pBuf.GetWritePtr(), m_dInfixBlocks, m_iInfixCodepointBytes, dPoints ) )
  23359. return;
  23360. // walk those checkpoints, check all their words
  23361. ARRAY_FOREACH ( i, dPoints )
  23362. {
  23363. // OPTIMIZE? add a quicker path than a generic wildcard for "*infix*" case?
  23364. KeywordsBlockReader_c tCtx ( m_pBuf.GetWritePtr() + m_dCheckpoints[dPoints[i]-1].m_iWordlistOffset, m_bHaveSkips );
  23365. while ( tCtx.UnpackWord() )
  23366. if ( sphWildcardMatch ( tCtx.GetWord(), sWildcard ) )
  23367. AddExpansion ( dExpanded, tCtx );
  23368. }
  23369. }
  23370. void sphCheckWordStats ( const SmallStringHash_T<CSphQueryResultMeta::WordStat_t> & hDst, const SmallStringHash_T<CSphQueryResultMeta::WordStat_t> & hSrc, const char * sIndex, CSphString & sWarning )
  23371. {
  23372. if ( !hDst.GetLength() )
  23373. return;
  23374. bool bHasHead = false;
  23375. hSrc.IterateStart();
  23376. while ( hSrc.IterateNext() )
  23377. {
  23378. const CSphQueryResultMeta::WordStat_t * pDstStat = hDst ( hSrc.IterateGetKey() );
  23379. const CSphQueryResultMeta::WordStat_t & tSrcStat = hSrc.IterateGet();
  23380. // all indexes should produce same terms for same query
  23381. if ( !pDstStat && !tSrcStat.m_bExpanded )
  23382. {
  23383. if ( !bHasHead )
  23384. {
  23385. sWarning.SetSprintf ( "index '%s': query word(s) mismatch: %s", sIndex, hSrc.IterateGetKey().cstr() );
  23386. bHasHead = true;
  23387. } else
  23388. {
  23389. sWarning.SetSprintf ( "%s, %s", sWarning.cstr(), hSrc.IterateGetKey().cstr() );
  23390. }
  23391. }
  23392. }
  23393. }
  23394. //////////////////////////////////////////////////////////////////////////
  23395. // CSphQueryResultMeta
  23396. //////////////////////////////////////////////////////////////////////////
  23397. CSphQueryResultMeta::CSphQueryResultMeta ()
  23398. : m_iQueryTime ( 0 )
  23399. , m_iRealQueryTime ( 0 )
  23400. , m_iCpuTime ( 0 )
  23401. , m_iMultiplier ( 1 )
  23402. , m_iMatches ( 0 )
  23403. , m_iTotalMatches ( 0 )
  23404. , m_iAgentCpuTime ( 0 )
  23405. {
  23406. }
  23407. void CSphQueryResultMeta::AddStat ( const CSphString & sWord, int64_t iDocs, int64_t iHits, bool bExpanded )
  23408. {
  23409. CSphString sFixed;
  23410. const CSphString * pFixed = &sWord;
  23411. if ( sWord.cstr()[0]==MAGIC_WORD_HEAD )
  23412. {
  23413. sFixed = sWord;
  23414. *(char *)( sFixed.cstr() ) = '*';
  23415. pFixed = &sFixed;
  23416. } else if ( sWord.cstr()[0]==MAGIC_WORD_HEAD_NONSTEMMED )
  23417. {
  23418. if ( !bExpanded )
  23419. {
  23420. sFixed = sWord;
  23421. *(char *)( sFixed.cstr() ) = '=';
  23422. pFixed = &sFixed;
  23423. } else
  23424. {
  23425. sFixed = sWord.SubString ( 1, sWord.Length()-1 );
  23426. pFixed = &sFixed;
  23427. }
  23428. } else
  23429. {
  23430. const char * p = strchr ( sWord.cstr(), MAGIC_WORD_BIGRAM );
  23431. if ( p )
  23432. {
  23433. sFixed.SetSprintf ( "\"%s\"", sWord.cstr() );
  23434. *( (char*)sFixed.cstr() + ( p - sWord.cstr() ) + 1 ) = ' ';
  23435. pFixed = &sFixed;
  23436. }
  23437. }
  23438. WordStat_t * pStats = m_hWordStats ( *pFixed );
  23439. if ( !pStats )
  23440. {
  23441. CSphQueryResultMeta::WordStat_t tStats;
  23442. tStats.m_iDocs = iDocs;
  23443. tStats.m_iHits = iHits;
  23444. tStats.m_bExpanded = bExpanded;
  23445. m_hWordStats.Add ( tStats, *pFixed );
  23446. } else
  23447. {
  23448. pStats->m_iDocs += iDocs;
  23449. pStats->m_iHits += iHits;
  23450. pStats->m_bExpanded |= bExpanded;
  23451. }
  23452. }
  23453. CSphQueryResultMeta::CSphQueryResultMeta ( const CSphQueryResultMeta & tMeta )
  23454. {
  23455. *this = tMeta;
  23456. }
  23457. CSphQueryResultMeta & CSphQueryResultMeta::operator= ( const CSphQueryResultMeta & tMeta )
  23458. {
  23459. m_iQueryTime = tMeta.m_iQueryTime;
  23460. m_iRealQueryTime = tMeta.m_iRealQueryTime;
  23461. m_iCpuTime = tMeta.m_iCpuTime;
  23462. m_iMultiplier = tMeta.m_iMultiplier;
  23463. m_iMatches = tMeta.m_iMatches;
  23464. m_iTotalMatches = tMeta.m_iTotalMatches;
  23465. m_tIOStats = tMeta.m_tIOStats;
  23466. m_iAgentCpuTime = tMeta.m_iAgentCpuTime;
  23467. m_tAgentIOStats = tMeta.m_tAgentIOStats;
  23468. m_sError = tMeta.m_sError;
  23469. m_sWarning = tMeta.m_sWarning;
  23470. m_hWordStats = tMeta.m_hWordStats;
  23471. return *this;
  23472. }
  23473. //////////////////////////////////////////////////////////////////////////
  23474. // CONVERSION TOOLS HELPERS
  23475. //////////////////////////////////////////////////////////////////////////
  23476. static void CopyBytes ( CSphWriter & wrTo, CSphReader & rdFrom, int iBytes )
  23477. {
  23478. const int BUFSIZE = 65536;
  23479. BYTE * pBuf = new BYTE [ BUFSIZE ];
  23480. int iCopied = 0;
  23481. while ( iCopied < iBytes )
  23482. {
  23483. int iToCopy = Min ( iBytes - iCopied, BUFSIZE );
  23484. rdFrom.GetBytes ( pBuf, iToCopy );
  23485. wrTo.PutBytes ( pBuf, iToCopy );
  23486. iCopied += iToCopy;
  23487. }
  23488. SafeDeleteArray ( pBuf );
  23489. }
  23490. /// post-conversion chores
  23491. /// rename the files, show elapsed time
  23492. static void FinalizeUpgrade ( const char ** sRenames, const char * sBanner, const char * sPath, int64_t tmStart )
  23493. {
  23494. while ( *sRenames )
  23495. {
  23496. CSphString sFrom, sTo;
  23497. sFrom.SetSprintf ( "%s%s", sPath, sRenames[0] );
  23498. sTo.SetSprintf ( "%s%s", sPath, sRenames[1] );
  23499. sRenames += 2;
  23500. if ( ::rename ( sFrom.cstr(), sTo.cstr() ) )
  23501. sphDie ( "%s: rename %s to %s failed: %s\n", sBanner,
  23502. sFrom.cstr(), sTo.cstr(), strerror(errno) );
  23503. }
  23504. // all done! yay
  23505. int64_t tmWall = sphMicroTimer() - tmStart;
  23506. fprintf ( stdout, "%s: elapsed %d.%d sec\n", sBanner,
  23507. (int)(tmWall/1000000), (int)((tmWall/100000)%10) );
  23508. fprintf ( stdout, "%s: done!\n", sBanner );
  23509. }
  23510. #if USE_WINDOWS
  23511. #pragma warning(disable:4127) // conditional expr is const for MSVC
  23512. #endif
  23513. //////////////////////////////////////////////////////////////////////////
  23514. // V.26 TO V.27 CONVERSION TOOL, INFIX BUILDER
  23515. //////////////////////////////////////////////////////////////////////////
  23516. void sphDictBuildInfixes ( const char * sPath )
  23517. {
  23518. CSphString sFilename, sError;
  23519. int64_t tmStart = sphMicroTimer();
  23520. if ( INDEX_FORMAT_VERSION!=27 )
  23521. sphDie ( "infix upgrade: only works in v.27 builds for now; get an older indextool or contact support" );
  23522. //////////////////////////////////////////////////
  23523. // load (interesting parts from) the index header
  23524. //////////////////////////////////////////////////
  23525. CSphAutoreader rdHeader;
  23526. sFilename.SetSprintf ( "%s.sph", sPath );
  23527. if ( !rdHeader.Open ( sFilename.cstr(), sError ) )
  23528. sphDie ( "infix upgrade: %s", sError.cstr() );
  23529. // version
  23530. DWORD uHeader = rdHeader.GetDword ();
  23531. DWORD uVersion = rdHeader.GetDword();
  23532. bool bUse64 = ( rdHeader.GetDword()!=0 );
  23533. ESphDocinfo eDocinfo = (ESphDocinfo) rdHeader.GetDword();
  23534. if ( uHeader!=INDEX_MAGIC_HEADER )
  23535. sphDie ( "infix upgrade: invalid header file" );
  23536. if ( uVersion<21 || uVersion>26 )
  23537. sphDie ( "infix upgrade: got v.%d header, v.21 to v.26 required", uVersion );
  23538. if ( eDocinfo==SPH_DOCINFO_INLINE )
  23539. sphDie ( "infix upgrade: docinfo=inline is not supported" );
  23540. CSphSchema tSchema;
  23541. DictHeader_t tDictHeader;
  23542. CSphSourceStats tStats;
  23543. CSphIndexSettings tIndexSettings;
  23544. CSphTokenizerSettings tTokenizerSettings;
  23545. CSphDictSettings tDictSettings;
  23546. CSphEmbeddedFiles tEmbeddedFiles;
  23547. ReadSchema ( rdHeader, tSchema, uVersion, eDocinfo==SPH_DOCINFO_INLINE );
  23548. SphOffset_t iMinDocid = rdHeader.GetOffset();
  23549. tDictHeader.m_iDictCheckpointsOffset = rdHeader.GetOffset ();
  23550. tDictHeader.m_iDictCheckpoints = rdHeader.GetDword ();
  23551. tDictHeader.m_iInfixCodepointBytes = 0;
  23552. tDictHeader.m_iInfixBlocksOffset = 0;
  23553. tDictHeader.m_iInfixBlocksWordsSize = 0;
  23554. tStats.m_iTotalDocuments = rdHeader.GetDword ();
  23555. tStats.m_iTotalBytes = rdHeader.GetOffset ();
  23556. LoadIndexSettings ( tIndexSettings, rdHeader, uVersion );
  23557. LoadTokenizerSettings ( rdHeader, tTokenizerSettings, tEmbeddedFiles, uVersion, sError );
  23558. LoadDictionarySettings ( rdHeader, tDictSettings, tEmbeddedFiles, uVersion, sError );
  23559. int iKillListSize = rdHeader.GetDword();
  23560. DWORD uMinMaxIndex = rdHeader.GetDword();
  23561. if ( rdHeader.GetErrorFlag() )
  23562. sphDie ( "infix upgrade: failed to parse header" );
  23563. rdHeader.Close();
  23564. ////////////////////
  23565. // generate infixes
  23566. ////////////////////
  23567. if ( !tDictSettings.m_bWordDict )
  23568. sphDie ( "infix upgrade: dict=keywords required" );
  23569. tIndexSettings.m_iMinPrefixLen = 0;
  23570. tIndexSettings.m_iMinInfixLen = 2;
  23571. ISphTokenizer * pTokenizer = ISphTokenizer::Create ( tTokenizerSettings, &tEmbeddedFiles, sError );
  23572. if ( !pTokenizer )
  23573. sphDie ( "infix upgrade: %s", sError.cstr() );
  23574. tDictHeader.m_iInfixCodepointBytes = pTokenizer->GetMaxCodepointLength();
  23575. ISphInfixBuilder * pInfixer = sphCreateInfixBuilder ( tDictHeader.m_iInfixCodepointBytes, &sError );
  23576. if ( !pInfixer )
  23577. sphDie ( "infix upgrade: %s", sError.cstr() );
  23578. // scan all dict entries, generate infixes
  23579. // (in a separate block, so that tDictReader gets destroyed, and file closed)
  23580. {
  23581. CSphDictReader tDictReader;
  23582. if ( !tDictReader.Setup ( sFilename.SetSprintf ( "%s.spi", sPath ),
  23583. tDictHeader.m_iDictCheckpointsOffset, tIndexSettings.m_eHitless, sError, true, &g_tThrottle, uVersion>=31 ) )
  23584. sphDie ( "infix upgrade: %s", sError.cstr() );
  23585. while ( tDictReader.Read() )
  23586. {
  23587. const BYTE * sWord = tDictReader.GetWord();
  23588. int iLen = strlen ( (const char *)sWord );
  23589. pInfixer->AddWord ( sWord, iLen, tDictReader.GetCheckpoint() );
  23590. }
  23591. }
  23592. /////////////////////////////
  23593. // write new dictionary file
  23594. /////////////////////////////
  23595. // ready to party
  23596. // open all the cans!
  23597. CSphAutofile fdDict;
  23598. fdDict.Open ( sFilename, SPH_O_READ, sError );
  23599. CSphReader rdDict;
  23600. rdDict.SetFile ( fdDict );
  23601. rdDict.SeekTo ( 0, READ_NO_SIZE_HINT );
  23602. CSphWriter wrDict;
  23603. sFilename.SetSprintf ( "%s.spi.upgrade", sPath );
  23604. if ( !wrDict.OpenFile ( sFilename, sError ) )
  23605. sphDie ( "infix upgrade: failed to open %s", sFilename.cstr() );
  23606. // copy the keyword entries until checkpoints
  23607. CopyBytes ( wrDict, rdDict, (int)tDictHeader.m_iDictCheckpointsOffset );
  23608. // write newly generated infix hash entries
  23609. pInfixer->SaveEntries ( wrDict );
  23610. // copy checkpoints
  23611. int iCheckpointsSize = (int)( fdDict.GetSize() - tDictHeader.m_iDictCheckpointsOffset );
  23612. tDictHeader.m_iDictCheckpointsOffset = wrDict.GetPos();
  23613. CopyBytes ( wrDict, rdDict, iCheckpointsSize );
  23614. // write newly generated infix hash blocks
  23615. tDictHeader.m_iInfixBlocksOffset = pInfixer->SaveEntryBlocks ( wrDict );
  23616. tDictHeader.m_iInfixBlocksWordsSize = pInfixer->GetBlocksWordsSize();
  23617. // flush header
  23618. // mostly for debugging convenience
  23619. // primary storage is in the index wide header
  23620. wrDict.PutBytes ( "dict-header", 11 );
  23621. wrDict.ZipInt ( tDictHeader.m_iDictCheckpoints );
  23622. wrDict.ZipOffset ( tDictHeader.m_iDictCheckpointsOffset );
  23623. wrDict.ZipInt ( tDictHeader.m_iInfixCodepointBytes );
  23624. wrDict.ZipInt ( tDictHeader.m_iInfixBlocksOffset );
  23625. wrDict.CloseFile ();
  23626. if ( wrDict.IsError() )
  23627. sphDie ( "infix upgrade: dictionary write error (out of space?)" );
  23628. if ( rdDict.GetErrorFlag() )
  23629. sphDie ( "infix upgrade: dictionary read error" );
  23630. fdDict.Close();
  23631. ////////////////////
  23632. // write new header
  23633. ////////////////////
  23634. assert ( tDictSettings.m_bWordDict );
  23635. CSphDict * pDict = sphCreateDictionaryKeywords ( tDictSettings, &tEmbeddedFiles, pTokenizer, "$indexname", sError );
  23636. if ( !pDict )
  23637. sphDie ( "infix upgrade: %s", sError.cstr() );
  23638. CSphWriter wrHeader;
  23639. sFilename.SetSprintf ( "%s.sph.upgrade", sPath );
  23640. if ( !wrHeader.OpenFile ( sFilename, sError ) )
  23641. sphDie ( "infix upgrade: %s", sError.cstr() );
  23642. wrHeader.PutDword ( INDEX_MAGIC_HEADER );
  23643. wrHeader.PutDword ( INDEX_FORMAT_VERSION );
  23644. wrHeader.PutDword ( bUse64 );
  23645. wrHeader.PutDword ( eDocinfo );
  23646. WriteSchema ( wrHeader, tSchema );
  23647. wrHeader.PutOffset ( iMinDocid );
  23648. wrHeader.PutOffset ( tDictHeader.m_iDictCheckpointsOffset );
  23649. wrHeader.PutDword ( tDictHeader.m_iDictCheckpoints );
  23650. wrHeader.PutByte ( tDictHeader.m_iInfixCodepointBytes );
  23651. wrHeader.PutDword ( tDictHeader.m_iInfixBlocksOffset );
  23652. wrHeader.PutDword ( tDictHeader.m_iInfixBlocksWordsSize );
  23653. wrHeader.PutDword ( (DWORD)tStats.m_iTotalDocuments ); // FIXME? we don't expect over 4G docs per just 1 local index
  23654. wrHeader.PutOffset ( tStats.m_iTotalBytes );
  23655. SaveIndexSettings ( wrHeader, tIndexSettings );
  23656. SaveTokenizerSettings ( wrHeader, pTokenizer, tIndexSettings.m_iEmbeddedLimit );
  23657. SaveDictionarySettings ( wrHeader, pDict, false, tIndexSettings.m_iEmbeddedLimit );
  23658. wrHeader.PutDword ( iKillListSize );
  23659. wrHeader.PutDword ( uMinMaxIndex );
  23660. wrHeader.PutDword ( 0 ); // no field filter
  23661. wrHeader.CloseFile ();
  23662. if ( wrHeader.IsError() )
  23663. sphDie ( "infix upgrade: header write error (out of space?)" );
  23664. // all done!
  23665. const char * sRenames[] = {
  23666. ".sph", ".sph.bak",
  23667. ".spi", ".spi.bak",
  23668. ".sph.upgrade", ".sph",
  23669. ".spi.upgrade", ".spi",
  23670. NULL };
  23671. FinalizeUpgrade ( sRenames, "infix upgrade", sPath, tmStart );
  23672. }
  23673. //////////////////////////////////////////////////////////////////////////
  23674. // V.12 TO V.31 CONVERSION TOOL, SKIPLIST BUILDER
  23675. //////////////////////////////////////////////////////////////////////////
  23676. struct EntrySkips_t
  23677. {
  23678. DWORD m_uEntry; ///< sequential index in dict
  23679. SphOffset_t m_iDoclist; ///< doclist offset from dict
  23680. int m_iSkiplist; ///< generated skiplist offset
  23681. };
  23682. void sphDictBuildSkiplists ( const char * sPath )
  23683. {
  23684. CSphString sFilename, sError;
  23685. int64_t tmStart = sphMicroTimer();
  23686. if ( INDEX_FORMAT_VERSION<31 || INDEX_FORMAT_VERSION>35 )
  23687. sphDie ( "skiplists upgrade: ony works in v.31 to v.35 builds for now; get an older indextool or contact support" );
  23688. // load (interesting parts from) the index header
  23689. CSphAutoreader rdHeader;
  23690. sFilename.SetSprintf ( "%s.sph", sPath );
  23691. if ( !rdHeader.Open ( sFilename.cstr(), sError ) )
  23692. sphDie ( "skiplists upgrade: %s", sError.cstr() );
  23693. // version
  23694. DWORD uHeader = rdHeader.GetDword ();
  23695. DWORD uVersion = rdHeader.GetDword();
  23696. bool bUse64 = ( rdHeader.GetDword()!=0 );
  23697. bool bConvertCheckpoints = ( uVersion<=21 );
  23698. ESphDocinfo eDocinfo = (ESphDocinfo) rdHeader.GetDword();
  23699. const DWORD uLowestVersion = 12;
  23700. if ( bUse64!=USE_64BIT )
  23701. sphDie ( "skiplists upgrade: USE_64BIT differs, index %s, binary %s",
  23702. bUse64 ? "enabled" : "disabled", USE_64BIT ? "enabled" : "disabled" );
  23703. if ( uHeader!=INDEX_MAGIC_HEADER )
  23704. sphDie ( "skiplists upgrade: invalid header file" );
  23705. if ( uVersion<uLowestVersion )
  23706. sphDie ( "skiplists upgrade: got v.%d header, v.%d to v.30 required", uVersion, uLowestVersion );
  23707. if ( eDocinfo==SPH_DOCINFO_INLINE )
  23708. sphDie ( "skiplists upgrade: docinfo=inline is not supported yet" );
  23709. CSphSchema tSchema;
  23710. DictHeader_t tDictHeader;
  23711. CSphSourceStats tStats;
  23712. CSphIndexSettings tIndexSettings;
  23713. CSphTokenizerSettings tTokenizerSettings;
  23714. CSphDictSettings tDictSettings;
  23715. CSphEmbeddedFiles tEmbeddedFiles;
  23716. ReadSchema ( rdHeader, tSchema, uVersion, eDocinfo==SPH_DOCINFO_INLINE );
  23717. SphOffset_t iMinDocid = rdHeader.GetOffset();
  23718. tDictHeader.m_iDictCheckpointsOffset = rdHeader.GetOffset ();
  23719. tDictHeader.m_iDictCheckpoints = rdHeader.GetDword ();
  23720. tDictHeader.m_iInfixCodepointBytes = 0;
  23721. tDictHeader.m_iInfixBlocksOffset = 0;
  23722. if ( uVersion>=27 )
  23723. {
  23724. tDictHeader.m_iInfixCodepointBytes = rdHeader.GetByte();
  23725. tDictHeader.m_iInfixBlocksOffset = rdHeader.GetDword();
  23726. }
  23727. if ( uVersion>=34 )
  23728. tDictHeader.m_iInfixBlocksWordsSize = rdHeader.GetDword();
  23729. tStats.m_iTotalDocuments = rdHeader.GetDword ();
  23730. tStats.m_iTotalBytes = rdHeader.GetOffset ();
  23731. LoadIndexSettings ( tIndexSettings, rdHeader, uVersion );
  23732. LoadTokenizerSettings ( rdHeader, tTokenizerSettings, tEmbeddedFiles, uVersion, sError );
  23733. LoadDictionarySettings ( rdHeader, tDictSettings, tEmbeddedFiles, uVersion, sError );
  23734. int iKillListSize = rdHeader.GetDword();
  23735. SphOffset_t uMinMaxIndex = 0;
  23736. if ( uVersion>=33 )
  23737. uMinMaxIndex = rdHeader.GetOffset ();
  23738. else if ( uVersion>=20 )
  23739. uMinMaxIndex = rdHeader.GetDword ();
  23740. ISphFieldFilter * pFieldFilter = NULL;
  23741. if ( uVersion>=28 )
  23742. {
  23743. CSphFieldFilterSettings tFieldFilterSettings;
  23744. LoadFieldFilterSettings ( rdHeader, tFieldFilterSettings );
  23745. pFieldFilter = sphCreateFieldFilter ( tFieldFilterSettings, sError );
  23746. }
  23747. CSphFixedVector<uint64_t> dFieldLens ( tSchema.m_dFields.GetLength() );
  23748. if ( uVersion>=35 && tIndexSettings.m_bIndexFieldLens )
  23749. ARRAY_FOREACH ( i, tSchema.m_dFields )
  23750. dFieldLens[i] = rdHeader.GetOffset(); // FIXME? ideally 64bit even when off is 32bit..
  23751. if ( rdHeader.GetErrorFlag() )
  23752. sphDie ( "skiplists upgrade: failed to parse header" );
  23753. rdHeader.Close();
  23754. //////////////////////
  23755. // generate skiplists
  23756. //////////////////////
  23757. // keywords on disk might be in a different order than dictionary
  23758. // and random accesses on a plain disk would be extremely slow
  23759. // so we load the dictionary, sort by doclist offset
  23760. // then we walk doclists, generate skiplists, sort back by entry number
  23761. // then walk the disk dictionary again, lookup skiplist offset, and patch
  23762. // load the dictionary
  23763. CSphVector<EntrySkips_t> dSkips;
  23764. const bool bWordDict = tDictSettings.m_bWordDict;
  23765. CSphAutoreader rdDict;
  23766. if ( !rdDict.Open ( sFilename.SetSprintf ( "%s.spi", sPath ), sError ) )
  23767. sphDie ( "skiplists upgrade: %s", sError.cstr() );
  23768. // compute actual keyword data length
  23769. SphOffset_t iWordsEnd = tDictHeader.m_iDictCheckpointsOffset;
  23770. if ( bWordDict && tDictHeader.m_iInfixCodepointBytes )
  23771. {
  23772. rdDict.SeekTo ( tDictHeader.m_iInfixBlocksOffset, 32 ); // need just 1 entry, 32 bytes should be ok
  23773. rdDict.UnzipInt(); // skip block count
  23774. int iInfixLen = rdDict.GetByte();
  23775. rdDict.SkipBytes ( iInfixLen );
  23776. iWordsEnd = rdDict.UnzipInt() - strlen ( g_sTagInfixEntries );
  23777. rdDict.SeekTo ( 0, READ_NO_SIZE_HINT );
  23778. }
  23779. CSphDictReader * pReader = new CSphDictReader();
  23780. pReader->Setup ( &rdDict, iWordsEnd, tIndexSettings.m_eHitless, bWordDict, &g_tThrottle, uVersion>=31 );
  23781. DWORD uEntry = 0;
  23782. while ( pReader->Read() )
  23783. {
  23784. if ( pReader->m_iDocs > SPH_SKIPLIST_BLOCK )
  23785. {
  23786. EntrySkips_t & t = dSkips.Add();
  23787. t.m_uEntry = uEntry;
  23788. t.m_iDoclist = pReader->m_iDoclistOffset;
  23789. t.m_iSkiplist = -1;
  23790. }
  23791. if ( ++uEntry==0 )
  23792. sphDie ( "skiplists upgrade: dictionaries over 4B entries are not supported yet!" );
  23793. }
  23794. // sort by doclist offset
  23795. dSkips.Sort ( sphMemberLess ( &EntrySkips_t::m_iDoclist ) );
  23796. // walk doclists, create skiplists
  23797. CSphAutoreader rdDocs;
  23798. if ( !rdDocs.Open ( sFilename.SetSprintf ( "%s.spd", sPath ), sError ) )
  23799. sphDie ( "skiplists upgrade: %s", sError.cstr() );
  23800. CSphWriter wrSkips;
  23801. if ( !wrSkips.OpenFile ( sFilename.SetSprintf ( "%s.spe.tmp", sPath ), sError ) )
  23802. sphDie ( "skiplists upgrade: failed to create %s", sFilename.cstr() );
  23803. wrSkips.PutByte ( 1 );
  23804. int iDone = -1;
  23805. CSphVector<SkiplistEntry_t> dSkiplist;
  23806. ARRAY_FOREACH ( i, dSkips )
  23807. {
  23808. // seek to that keyword
  23809. // OPTIMIZE? use length hint from dict too?
  23810. rdDocs.SeekTo ( dSkips[i].m_iDoclist, READ_NO_SIZE_HINT );
  23811. // decode interesting bits of doclist
  23812. SphDocID_t uDocid = SphDocID_t ( iMinDocid );
  23813. SphOffset_t uHitPosition = 0;
  23814. DWORD uDocs = 0;
  23815. for ( ;; )
  23816. {
  23817. // save current entry position
  23818. SphOffset_t uPos = rdDocs.GetPos();
  23819. // decode next entry
  23820. SphDocID_t uDelta = rdDocs.UnzipDocid();
  23821. if ( !uDelta )
  23822. break;
  23823. // build skiplist, aka save decoder state as needed
  23824. if ( ( uDocs & ( SPH_SKIPLIST_BLOCK-1 ) )==0 )
  23825. {
  23826. SkiplistEntry_t & t = dSkiplist.Add();
  23827. t.m_iBaseDocid = uDocid;
  23828. t.m_iOffset = uPos;
  23829. t.m_iBaseHitlistPos = uHitPosition;
  23830. }
  23831. uDocs++;
  23832. // do decode
  23833. uDocid += uDelta; // track delta-encoded docid
  23834. if ( tIndexSettings.m_eHitFormat==SPH_HIT_FORMAT_INLINE )
  23835. {
  23836. DWORD uHits = rdDocs.UnzipInt();
  23837. rdDocs.UnzipInt(); // skip hit field mask/data
  23838. if ( uHits==1 )
  23839. {
  23840. rdDocs.UnzipInt(); // skip inlined field id
  23841. } else
  23842. {
  23843. uHitPosition += rdDocs.UnzipOffset(); // track delta-encoded hitlist offset
  23844. }
  23845. } else
  23846. {
  23847. uHitPosition += rdDocs.UnzipOffset(); // track delta-encoded hitlist offset
  23848. rdDocs.UnzipInt(); // skip hit field mask/data
  23849. rdDocs.UnzipInt(); // skip hit count
  23850. }
  23851. }
  23852. // alright, we built it, so save it
  23853. assert ( uDocs>SPH_SKIPLIST_BLOCK );
  23854. assert ( dSkiplist.GetLength() );
  23855. dSkips[i].m_iSkiplist = (int)wrSkips.GetPos();
  23856. SkiplistEntry_t tLast = dSkiplist[0];
  23857. for ( int j=1; j<dSkiplist.GetLength(); j++ )
  23858. {
  23859. const SkiplistEntry_t & t = dSkiplist[j];
  23860. assert ( t.m_iBaseDocid - tLast.m_iBaseDocid>=SPH_SKIPLIST_BLOCK );
  23861. assert ( t.m_iOffset - tLast.m_iOffset>=4*SPH_SKIPLIST_BLOCK );
  23862. wrSkips.ZipOffset ( t.m_iBaseDocid - tLast.m_iBaseDocid - SPH_SKIPLIST_BLOCK );
  23863. wrSkips.ZipOffset ( t.m_iOffset - tLast.m_iOffset - 4*SPH_SKIPLIST_BLOCK );
  23864. wrSkips.ZipOffset ( t.m_iBaseHitlistPos - tLast.m_iBaseHitlistPos );
  23865. tLast = t;
  23866. }
  23867. dSkiplist.Resize ( 0 );
  23868. // progress bar
  23869. int iDone2 = (1+i)*100 / dSkips.GetLength();
  23870. if ( iDone2!=iDone )
  23871. {
  23872. iDone = iDone2;
  23873. fprintf ( stdout, "skiplists upgrade: building skiplists, %d%% done\r", iDone );
  23874. }
  23875. }
  23876. fprintf ( stdout, "skiplists upgrade: building skiplists, 100%% done\n" );
  23877. // finalize
  23878. wrSkips.CloseFile ();
  23879. if ( wrSkips.IsError() )
  23880. sphDie ( "skiplists upgrade: write error (out of space?)" );
  23881. if ( rdDocs.GetErrorFlag() )
  23882. sphDie ( "skiplists upgrade: doclist read error: %s", rdDocs.GetErrorMessage().cstr() );
  23883. // sort by entry id again
  23884. dSkips.Sort ( sphMemberLess ( &EntrySkips_t::m_uEntry ) );
  23885. /////////////////////////////
  23886. // write new dictionary file
  23887. /////////////////////////////
  23888. // converted dict writer
  23889. CSphWriter wrDict;
  23890. sFilename.SetSprintf ( "%s.spi.upgrade", sPath );
  23891. if ( !wrDict.OpenFile ( sFilename, sError ) )
  23892. sphDie ( "skiplists upgrade: failed to create %s", sFilename.cstr() );
  23893. wrDict.PutByte ( 1 );
  23894. // handy entry iterator
  23895. // we will use this one to decode entries, and rdDict for other raw access
  23896. pReader->Setup ( &rdDict, iWordsEnd, tIndexSettings.m_eHitless, bWordDict, &g_tThrottle, uVersion>=31 );
  23897. // we have to adjust some of the entries
  23898. // thus we also have to recompute the offset in the checkpoints too
  23899. //
  23900. // infix hashes (if any) in dict=keywords refer to checkpoints by numbers
  23901. // so infix data can simply be copied around
  23902. // new checkpoints
  23903. CSphVector<CSphWordlistCheckpoint> dNewCP;
  23904. int iLastCheckpoint = 0;
  23905. // skiplist lookup
  23906. EntrySkips_t * pSkips = dSkips.Begin();
  23907. // dict encoder state
  23908. SphWordID_t uLastWordid = 0; // crc case
  23909. SphOffset_t iLastDoclist = 0; // crc case
  23910. CSphKeywordDeltaWriter tLastKeyword; // keywords case
  23911. DWORD uWordCount = 0;
  23912. // read old entries, write new entries
  23913. while ( pReader->Read() )
  23914. {
  23915. // update or regenerate checkpoint
  23916. if ( ( !bConvertCheckpoints && iLastCheckpoint!=pReader->GetCheckpoint() )
  23917. || ( bConvertCheckpoints && ( uWordCount % SPH_WORDLIST_CHECKPOINT )==0 ) )
  23918. {
  23919. // FIXME? GetCheckpoint() is for some reason 1-based
  23920. if ( uWordCount )
  23921. {
  23922. wrDict.ZipInt ( 0 );
  23923. if ( bWordDict )
  23924. wrDict.ZipInt ( 0 );
  23925. else
  23926. wrDict.ZipOffset ( pReader->m_iDoclistOffset - iLastDoclist );
  23927. }
  23928. uLastWordid = 0;
  23929. iLastDoclist = 0;
  23930. CSphWordlistCheckpoint & tCP = dNewCP.Add();
  23931. if ( bWordDict )
  23932. {
  23933. tCP.m_sWord = strdup ( (const char*)pReader->GetWord() );
  23934. tLastKeyword.Reset();
  23935. } else
  23936. {
  23937. tCP.m_iWordID = pReader->m_iWordID;
  23938. }
  23939. tCP.m_iWordlistOffset = wrDict.GetPos();
  23940. iLastCheckpoint = pReader->GetCheckpoint();
  23941. }
  23942. // resave entry
  23943. if ( bWordDict )
  23944. {
  23945. // keywords dict path
  23946. const int iLen = strlen ( (const char*)pReader->GetWord() );
  23947. tLastKeyword.PutDelta ( wrDict, pReader->GetWord(), iLen );
  23948. wrDict.ZipOffset ( pReader->m_iDoclistOffset );
  23949. wrDict.ZipInt ( pReader->m_iDocs );
  23950. wrDict.ZipInt ( pReader->m_iHits );
  23951. if ( pReader->m_iDocs>=DOCLIST_HINT_THRESH )
  23952. wrDict.PutByte ( pReader->m_iHint );
  23953. } else
  23954. {
  23955. // crc dict path
  23956. assert ( pReader->m_iWordID > uLastWordid );
  23957. assert ( pReader->m_iDoclistOffset > iLastDoclist );
  23958. wrDict.ZipOffset ( pReader->m_iWordID - uLastWordid );
  23959. wrDict.ZipOffset ( pReader->m_iDoclistOffset - iLastDoclist );
  23960. wrDict.ZipInt ( pReader->m_iDocs );
  23961. wrDict.ZipInt ( pReader->m_iHits );
  23962. uLastWordid = pReader->m_iWordID;
  23963. iLastDoclist = pReader->m_iDoclistOffset;
  23964. }
  23965. // emit skiplist pointer
  23966. if ( pReader->m_iDocs > SPH_SKIPLIST_BLOCK )
  23967. {
  23968. // lots of checks
  23969. if ( uWordCount!=pSkips->m_uEntry )
  23970. sphDie ( "skiplist upgrade: internal error, entry mismatch (expected %d, got %d)",
  23971. uWordCount, pSkips->m_uEntry );
  23972. if ( pReader->m_iDoclistOffset!=pSkips->m_iDoclist )
  23973. sphDie ( "skiplist upgrade: internal error, offset mismatch (expected %lld, got %lld)",
  23974. INT64 ( pReader->m_iDoclistOffset ), INT64 ( pSkips->m_iDoclist ) );
  23975. if ( pSkips->m_iSkiplist<0 )
  23976. sphDie ( "skiplist upgrade: internal error, bad skiplist offset %d",
  23977. pSkips->m_iSkiplist );
  23978. // and a bit of work
  23979. wrDict.ZipInt ( pSkips->m_iSkiplist );
  23980. pSkips++;
  23981. }
  23982. // next entry
  23983. uWordCount++;
  23984. }
  23985. // finalize last keywords block
  23986. wrDict.ZipInt ( 0 );
  23987. if ( bWordDict )
  23988. wrDict.ZipInt ( 0 );
  23989. else
  23990. wrDict.ZipOffset ( rdDocs.GetFilesize() - iLastDoclist );
  23991. rdDocs.Close();
  23992. SafeDelete ( pReader );
  23993. // copy infix hash entries, if any
  23994. int iDeltaInfix = 0;
  23995. if ( bWordDict && tDictHeader.m_iInfixCodepointBytes )
  23996. {
  23997. if ( iWordsEnd!=rdDict.GetPos() )
  23998. sphDie ( "skiplist upgrade: internal error, infix hash position mismatch (expected=%lld, got=%lld)",
  23999. INT64 ( iWordsEnd ), INT64 ( rdDict.GetPos() ) );
  24000. iDeltaInfix = (int)( wrDict.GetPos() - rdDict.GetPos() );
  24001. CopyBytes ( wrDict, rdDict, (int)( tDictHeader.m_iDictCheckpointsOffset - iWordsEnd ) );
  24002. }
  24003. // write new checkpoints
  24004. if ( tDictHeader.m_iDictCheckpointsOffset!=rdDict.GetPos() )
  24005. sphDie ( "skiplist upgrade: internal error, checkpoints position mismatch (expected=%lld, got=%lld)",
  24006. INT64 ( tDictHeader.m_iDictCheckpointsOffset ), INT64 ( rdDict.GetPos() ) );
  24007. if ( !bConvertCheckpoints && tDictHeader.m_iDictCheckpoints!=dNewCP.GetLength() )
  24008. sphDie ( "skiplist upgrade: internal error, checkpoint count mismatch (old=%d, new=%d)",
  24009. tDictHeader.m_iDictCheckpoints, dNewCP.GetLength() );
  24010. tDictHeader.m_iDictCheckpoints = dNewCP.GetLength();
  24011. tDictHeader.m_iDictCheckpointsOffset = wrDict.GetPos();
  24012. ARRAY_FOREACH ( i, dNewCP )
  24013. {
  24014. if ( bWordDict )
  24015. {
  24016. wrDict.PutString ( dNewCP[i].m_sWord );
  24017. SafeDeleteArray ( dNewCP[i].m_sWord );
  24018. } else
  24019. {
  24020. wrDict.PutOffset ( dNewCP[i].m_iWordID );
  24021. }
  24022. wrDict.PutOffset ( dNewCP[i].m_iWordlistOffset );
  24023. }
  24024. // update infix hash blocks, if any
  24025. // (they store direct offsets to infix hash, which just got moved)
  24026. if ( bWordDict && tDictHeader.m_iInfixCodepointBytes )
  24027. {
  24028. rdDict.SeekTo ( tDictHeader.m_iInfixBlocksOffset, READ_NO_SIZE_HINT );
  24029. int iBlocks = rdDict.UnzipInt();
  24030. wrDict.PutBytes ( g_sTagInfixBlocks, strlen ( g_sTagInfixBlocks ) );
  24031. tDictHeader.m_iInfixBlocksOffset = (int)wrDict.GetPos();
  24032. wrDict.ZipInt ( iBlocks );
  24033. for ( int i=0; i<iBlocks; i++ )
  24034. {
  24035. char sInfix[256];
  24036. int iBytes = rdDict.GetByte();
  24037. rdDict.GetBytes ( sInfix, iBytes );
  24038. wrDict.PutByte ( iBytes );
  24039. wrDict.PutBytes ( sInfix, iBytes );
  24040. wrDict.ZipInt ( rdDict.UnzipInt() + iDeltaInfix );
  24041. }
  24042. }
  24043. // emit new aux tail header
  24044. if ( bWordDict )
  24045. {
  24046. wrDict.PutBytes ( "dict-header", 11 );
  24047. wrDict.ZipInt ( tDictHeader.m_iDictCheckpoints );
  24048. wrDict.ZipOffset ( tDictHeader.m_iDictCheckpointsOffset );
  24049. wrDict.ZipInt ( tDictHeader.m_iInfixCodepointBytes );
  24050. wrDict.ZipInt ( tDictHeader.m_iInfixBlocksOffset );
  24051. }
  24052. wrDict.CloseFile();
  24053. if ( wrDict.IsError() )
  24054. sphDie ( "skiplists upgrade: dict write error (out of space?)" );
  24055. rdDict.Close();
  24056. ////////////////////
  24057. // build min-max attribute index
  24058. ////////////////////
  24059. bool bShuffleAttributes = false;
  24060. if ( uVersion<20 )
  24061. {
  24062. int iStride = DOCINFO_IDSIZE + tSchema.GetRowSize();
  24063. int iEntrySize = sizeof(DWORD)*iStride;
  24064. sFilename.SetSprintf ( "%s.spa", sPath );
  24065. CSphAutofile rdDocinfo ( sFilename.cstr(), SPH_O_READ, sError );
  24066. if ( rdDocinfo.GetFD()<0 )
  24067. sphDie ( "skiplists upgrade: %s", sError.cstr() );
  24068. sFilename.SetSprintf ( "%s.spa.upgrade", sPath );
  24069. CSphWriter wrDocinfo;
  24070. if ( !wrDocinfo.OpenFile ( sFilename.cstr(), sError ) )
  24071. sphDie ( "skiplists upgrade: %s", sError.cstr() );
  24072. CSphFixedVector<DWORD> dMva ( 0 );
  24073. CSphAutofile tMvaFile ( sFilename.cstr(), SPH_O_READ, sError );
  24074. if ( tMvaFile.GetFD()>=0 && tMvaFile.GetSize()>0 )
  24075. {
  24076. uint64_t uMvaSize = tMvaFile.GetSize();
  24077. assert ( uMvaSize/sizeof(DWORD)<=UINT_MAX );
  24078. dMva.Reset ( (int)( uMvaSize/sizeof(DWORD) ) );
  24079. tMvaFile.Read ( dMva.Begin(), uMvaSize, sError );
  24080. }
  24081. tMvaFile.Close();
  24082. int64_t iDocinfoSize = rdDocinfo.GetSize ( iEntrySize, true, sError ) / sizeof(CSphRowitem);
  24083. assert ( iDocinfoSize / iStride < UINT_MAX );
  24084. int iRows = (int)(iDocinfoSize/iStride);
  24085. AttrIndexBuilder_c tBuilder ( tSchema );
  24086. int64_t iMinMaxSize = tBuilder.GetExpectedSize ( tStats.m_iTotalDocuments );
  24087. if ( iMinMaxSize>INT_MAX )
  24088. sphDie ( "attribute files (.spa) over 128 GB are not supported" );
  24089. CSphFixedVector<CSphRowitem> dMinMax ( (int)iMinMaxSize );
  24090. tBuilder.Prepare ( dMinMax.Begin(), dMinMax.Begin() + dMinMax.GetLength() ); // FIXME!!! for over INT_MAX blocks
  24091. CSphFixedVector<CSphRowitem> dRow ( iStride );
  24092. uMinMaxIndex = 0;
  24093. for ( int i=0; i<iRows; i++ )
  24094. {
  24095. rdDocinfo.Read ( dRow.Begin(), iStride*sizeof(CSphRowitem), sError );
  24096. wrDocinfo.PutBytes ( dRow.Begin(), iStride*sizeof(CSphRowitem) );
  24097. if ( !tBuilder.Collect ( dRow.Begin(), dMva.Begin(), dMva.GetLength(), sError, true ) )
  24098. sphDie ( "skiplists upgrade: %s", sError.cstr() );
  24099. uMinMaxIndex += iStride;
  24100. int iDone1 = ( 1+i ) * 100 / iRows;
  24101. int iDone2 = ( 2+i ) * 100 / iRows;
  24102. if ( iDone1!=iDone2 )
  24103. fprintf ( stdout, "skiplists upgrade: building attribute min-max, %d%% done\r", iDone1 );
  24104. }
  24105. fprintf ( stdout, "skiplists upgrade: building attribute min-max, 100%% done\n" );
  24106. tBuilder.FinishCollect();
  24107. rdDocinfo.Close();
  24108. wrDocinfo.PutBytes ( dMinMax.Begin(), dMinMax.GetLength()*sizeof(CSphRowitem) );
  24109. wrDocinfo.CloseFile();
  24110. if ( wrDocinfo.IsError() )
  24111. sphDie ( "skiplists upgrade: attribute write error (out of space?)" );
  24112. bShuffleAttributes = true;
  24113. }
  24114. ////////////////////
  24115. // write new header
  24116. ////////////////////
  24117. ISphTokenizer * pTokenizer = ISphTokenizer::Create ( tTokenizerSettings, &tEmbeddedFiles, sError );
  24118. if ( !pTokenizer )
  24119. sphDie ( "skiplists upgrade: %s", sError.cstr() );
  24120. CSphDict * pDict = bWordDict
  24121. ? sphCreateDictionaryKeywords ( tDictSettings, &tEmbeddedFiles, pTokenizer, "$indexname", sError )
  24122. : sphCreateDictionaryCRC ( tDictSettings, &tEmbeddedFiles, pTokenizer, "$indexname", sError );
  24123. if ( !pDict )
  24124. sphDie ( "skiplists upgrade: %s", sError.cstr() );
  24125. CSphWriter wrHeader;
  24126. sFilename.SetSprintf ( "%s.sph.upgrade", sPath );
  24127. if ( !wrHeader.OpenFile ( sFilename, sError ) )
  24128. sphDie ( "skiplists upgrade: %s", sError.cstr() );
  24129. wrHeader.PutDword ( INDEX_MAGIC_HEADER );
  24130. wrHeader.PutDword ( INDEX_FORMAT_VERSION );
  24131. wrHeader.PutDword ( bUse64 );
  24132. wrHeader.PutDword ( eDocinfo );
  24133. WriteSchema ( wrHeader, tSchema );
  24134. wrHeader.PutOffset ( iMinDocid );
  24135. wrHeader.PutOffset ( tDictHeader.m_iDictCheckpointsOffset );
  24136. wrHeader.PutDword ( tDictHeader.m_iDictCheckpoints );
  24137. wrHeader.PutByte ( tDictHeader.m_iInfixCodepointBytes );
  24138. wrHeader.PutDword ( tDictHeader.m_iInfixBlocksOffset );
  24139. wrHeader.PutDword ( tDictHeader.m_iInfixBlocksWordsSize );
  24140. wrHeader.PutDword ( (DWORD)tStats.m_iTotalDocuments ); // FIXME? we don't expect over 4G docs per just 1 local index
  24141. wrHeader.PutOffset ( tStats.m_iTotalBytes );
  24142. SaveIndexSettings ( wrHeader, tIndexSettings );
  24143. SaveTokenizerSettings ( wrHeader, pTokenizer, tIndexSettings.m_iEmbeddedLimit );
  24144. SaveDictionarySettings ( wrHeader, pDict, false, tIndexSettings.m_iEmbeddedLimit );
  24145. wrHeader.PutDword ( iKillListSize );
  24146. wrHeader.PutOffset ( uMinMaxIndex );
  24147. SaveFieldFilterSettings ( wrHeader, pFieldFilter );
  24148. // average field lengths
  24149. if ( tIndexSettings.m_bIndexFieldLens )
  24150. ARRAY_FOREACH ( i, tSchema.m_dFields )
  24151. wrHeader.PutOffset ( dFieldLens[i] );
  24152. wrHeader.CloseFile ();
  24153. if ( wrHeader.IsError() )
  24154. sphDie ( "skiplists upgrade: header write error (out of space?)" );
  24155. sFilename.SetSprintf ( "%s.sps", sPath );
  24156. if ( !sphIsReadable ( sFilename.cstr(), NULL ) )
  24157. {
  24158. CSphWriter wrStrings;
  24159. if ( !wrStrings.OpenFile ( sFilename, sError ) )
  24160. sphDie ( "skiplists upgrade: %s", sError.cstr() );
  24161. wrStrings.PutByte ( 0 );
  24162. wrStrings.CloseFile();
  24163. if ( wrStrings.IsError() )
  24164. sphDie ( "skiplists upgrade: string write error (out of space?)" );
  24165. }
  24166. // all done!
  24167. const char * sRenames[] = {
  24168. ".spe.tmp", ".spe",
  24169. ".sph", ".sph.bak",
  24170. ".spi", ".spi.bak",
  24171. ".sph.upgrade", ".sph",
  24172. ".spi.upgrade", ".spi",
  24173. bShuffleAttributes ? ".spa" : NULL, ".spa.bak",
  24174. ".spa.upgrade", ".spa",
  24175. NULL };
  24176. FinalizeUpgrade ( sRenames, "skiplists upgrade", sPath, tmStart );
  24177. }
  24178. bool CSphGlobalIDF::Touch ( const CSphString & sFilename )
  24179. {
  24180. // update m_uMTime, return true if modified
  24181. struct_stat tStat;
  24182. memset ( &tStat, 0, sizeof ( tStat ) );
  24183. if ( stat ( sFilename.cstr(), &tStat ) < 0 )
  24184. memset ( &tStat, 0, sizeof ( tStat ) );
  24185. bool bModified = ( m_uMTime!=tStat.st_mtime );
  24186. m_uMTime = tStat.st_mtime;
  24187. return bModified;
  24188. }
  24189. bool CSphGlobalIDF::Preread ( const CSphString & sFilename, CSphString & sError )
  24190. {
  24191. Touch ( sFilename );
  24192. CSphAutoreader tReader;
  24193. if ( !tReader.Open ( sFilename, sError ) )
  24194. return false;
  24195. m_iTotalDocuments = tReader.GetOffset ();
  24196. const SphOffset_t iSize = tReader.GetFilesize () - sizeof(SphOffset_t);
  24197. m_iTotalWords = iSize/sizeof(IDFWord_t);
  24198. // allocate words cache
  24199. CSphString sWarning;
  24200. if ( !m_pWords.Alloc ( m_iTotalWords, sError, sWarning ) )
  24201. return false;
  24202. // allocate lookup table if needed
  24203. int iHashSize = (int)( U64C(1) << HASH_BITS );
  24204. if ( m_iTotalWords > iHashSize*8 )
  24205. {
  24206. if ( !m_pHash.Alloc ( iHashSize+2, sError, sWarning ) )
  24207. return false;
  24208. }
  24209. // read file into memory (may exceed 2GB)
  24210. const int iBlockSize = 10485760; // 10M block
  24211. for ( SphOffset_t iRead=0; iRead<iSize && !sphInterrupted(); iRead+=iBlockSize )
  24212. tReader.GetBytes ( (BYTE*)m_pWords.GetWritePtr()+iRead, iRead+iBlockSize>iSize ? (int)( iSize-iRead ) : iBlockSize );
  24213. if ( sphInterrupted() )
  24214. return false;
  24215. // build lookup table
  24216. if ( m_pHash.GetLength () )
  24217. {
  24218. int64_t * pHash = m_pHash.GetWritePtr();
  24219. uint64_t uFirst = m_pWords[0].m_uWordID;
  24220. uint64_t uRange = m_pWords[m_iTotalWords-1].m_uWordID - uFirst;
  24221. DWORD iShift = 0;
  24222. while ( uRange>=( U64C(1) << HASH_BITS ) )
  24223. {
  24224. iShift++;
  24225. uRange >>= 1;
  24226. }
  24227. pHash[0] = iShift;
  24228. pHash[1] = 0;
  24229. DWORD uLastHash = 0;
  24230. for ( int64_t i=1; i<m_iTotalWords; i++ )
  24231. {
  24232. // check for interrupt (throttled for speed)
  24233. if ( ( i&0xffff )==0 && sphInterrupted() )
  24234. return false;
  24235. DWORD uHash = (DWORD)( ( m_pWords[i].m_uWordID-uFirst ) >> iShift );
  24236. if ( uHash==uLastHash )
  24237. continue;
  24238. while ( uLastHash<uHash )
  24239. pHash [ ++uLastHash+1 ] = i;
  24240. uLastHash = uHash;
  24241. }
  24242. pHash [ ++uLastHash+1 ] = m_iTotalWords;
  24243. }
  24244. return true;
  24245. }
  24246. const DWORD CSphGlobalIDF::GetDocs ( const CSphString & sWord ) const
  24247. {
  24248. uint64_t uWordID = sphFNV64 ( (BYTE*)sWord.cstr() );
  24249. int64_t iStart = 0;
  24250. int64_t iEnd = m_iTotalWords-1;
  24251. const IDFWord_t * pWords = (IDFWord_t *)m_pWords.GetWritePtr ();
  24252. if ( m_pHash.GetLength () )
  24253. {
  24254. uint64_t uFirst = pWords[0].m_uWordID;
  24255. DWORD uHash = (DWORD)( ( uWordID-uFirst ) >> m_pHash[0] );
  24256. if ( uHash > ( U64C(1) << HASH_BITS ) )
  24257. return 0;
  24258. iStart = m_pHash [ uHash+1 ];
  24259. iEnd = m_pHash [ uHash+2 ] - 1;
  24260. }
  24261. const IDFWord_t * pWord = sphBinarySearch ( pWords+iStart, pWords+iEnd, bind ( &IDFWord_t::m_uWordID ), uWordID );
  24262. return pWord ? pWord->m_iDocs : 0;
  24263. }
  24264. float CSphGlobalIDF::GetIDF ( const CSphString & sWord, int iDocsLocal, int iQwords, bool bPlainIDF )
  24265. {
  24266. const int64_t iDocs = Max ( iDocsLocal, (int64_t)GetDocs ( sWord ) );
  24267. const int64_t iTotalClamped = Max ( m_iTotalDocuments, iDocs );
  24268. if ( bPlainIDF )
  24269. {
  24270. float fLogTotal = logf ( float ( 1+iTotalClamped ) );
  24271. return logf ( float ( iTotalClamped-iDocs+1 ) / float ( iDocs ) )
  24272. / ( 2*iQwords*fLogTotal );
  24273. } else
  24274. {
  24275. float fLogTotal = logf ( float ( 1+iTotalClamped ) );
  24276. return logf ( float ( iTotalClamped ) / float ( iDocs ) )
  24277. / ( 2*iQwords*fLogTotal );
  24278. }
  24279. }
  24280. bool sphPrereadGlobalIDF ( const CSphString & sPath, CSphString & sError )
  24281. {
  24282. g_tGlobalIDFLock.Lock ();
  24283. CSphGlobalIDF ** ppGlobalIDF = g_hGlobalIDFs ( sPath );
  24284. bool bExpired = ( ppGlobalIDF && *ppGlobalIDF && (*ppGlobalIDF)->Touch ( sPath ) );
  24285. if ( !ppGlobalIDF || bExpired )
  24286. {
  24287. if ( bExpired )
  24288. sphLogDebug ( "Reloading global IDF (%s)", sPath.cstr() );
  24289. else
  24290. sphLogDebug ( "Loading global IDF (%s)", sPath.cstr() );
  24291. // unlock while prereading
  24292. g_tGlobalIDFLock.Unlock ();
  24293. CSphGlobalIDF * pGlobalIDF = new CSphGlobalIDF ();
  24294. if ( !pGlobalIDF->Preread ( sPath, sError ) )
  24295. {
  24296. SafeDelete ( pGlobalIDF );
  24297. return false;
  24298. }
  24299. // lock while updating
  24300. g_tGlobalIDFLock.Lock ();
  24301. if ( bExpired )
  24302. {
  24303. ppGlobalIDF = g_hGlobalIDFs ( sPath );
  24304. if ( ppGlobalIDF )
  24305. {
  24306. CSphGlobalIDF * pOld = *ppGlobalIDF;
  24307. *ppGlobalIDF = pGlobalIDF;
  24308. SafeDelete ( pOld );
  24309. }
  24310. } else
  24311. {
  24312. if ( !g_hGlobalIDFs.Add ( pGlobalIDF, sPath ) )
  24313. SafeDelete ( pGlobalIDF );
  24314. }
  24315. }
  24316. g_tGlobalIDFLock.Unlock ();
  24317. return true;
  24318. }
  24319. void sphUpdateGlobalIDFs ( const CSphVector<CSphString> & dFiles )
  24320. {
  24321. // delete unlisted entries
  24322. g_tGlobalIDFLock.Lock ();
  24323. g_hGlobalIDFs.IterateStart ();
  24324. while ( g_hGlobalIDFs.IterateNext () )
  24325. {
  24326. const CSphString & sKey = g_hGlobalIDFs.IterateGetKey ();
  24327. if ( !dFiles.Contains ( sKey ) )
  24328. {
  24329. sphLogDebug ( "Unloading global IDF (%s)", sKey.cstr() );
  24330. SafeDelete ( g_hGlobalIDFs.IterateGet () );
  24331. g_hGlobalIDFs.Delete ( sKey );
  24332. }
  24333. }
  24334. g_tGlobalIDFLock.Unlock ();
  24335. // load/rotate remaining entries
  24336. CSphString sError;
  24337. ARRAY_FOREACH ( i, dFiles )
  24338. {
  24339. CSphString sPath = dFiles[i];
  24340. if ( !sphPrereadGlobalIDF ( sPath, sError ) )
  24341. sphLogDebug ( "Could not load global IDF (%s): %s", sPath.cstr(), sError.cstr() );
  24342. }
  24343. }
  24344. void sphShutdownGlobalIDFs ()
  24345. {
  24346. CSphVector<CSphString> dEmptyFiles;
  24347. sphUpdateGlobalIDFs ( dEmptyFiles );
  24348. }
  24349. #if USE_WINDOWS
  24350. #pragma warning(default:4127) // conditional expr is const for MSVC
  24351. #endif
  24352. //////////////////////////////////////////////////////////////////////////
  24353. //
  24354. // $Id$
  24355. //