regexpr.pas 202 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262226322642265226622672268226922702271227222732274227522762277227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367236823692370237123722373237423752376237723782379238023812382238323842385238623872388238923902391239223932394239523962397239823992400240124022403240424052406240724082409241024112412241324142415241624172418241924202421242224232424242524262427242824292430243124322433243424352436243724382439244024412442244324442445244624472448244924502451245224532454245524562457245824592460246124622463246424652466246724682469247024712472247324742475247624772478247924802481248224832484248524862487248824892490249124922493249424952496249724982499250025012502250325042505250625072508250925102511251225132514251525162517251825192520252125222523252425252526252725282529253025312532253325342535253625372538253925402541254225432544254525462547254825492550255125522553255425552556255725582559256025612562256325642565256625672568256925702571257225732574257525762577257825792580258125822583258425852586258725882589259025912592259325942595259625972598259926002601260226032604260526062607260826092610261126122613261426152616261726182619262026212622262326242625262626272628262926302631263226332634263526362637263826392640264126422643264426452646264726482649265026512652265326542655265626572658265926602661266226632664266526662667266826692670267126722673267426752676267726782679268026812682268326842685268626872688268926902691269226932694269526962697269826992700270127022703270427052706270727082709271027112712271327142715271627172718271927202721272227232724272527262727272827292730273127322733273427352736273727382739274027412742274327442745274627472748274927502751275227532754275527562757275827592760276127622763276427652766276727682769277027712772277327742775277627772778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327732783279328032813282328332843285328632873288328932903291329232933294329532963297329832993300330133023303330433053306330733083309331033113312331333143315331633173318331933203321332233233324332533263327332833293330333133323333333433353336333733383339334033413342334333443345334633473348334933503351335233533354335533563357335833593360336133623363336433653366336733683369337033713372337333743375337633773378337933803381338233833384338533863387338833893390339133923393339433953396339733983399340034013402340334043405340634073408340934103411341234133414341534163417341834193420342134223423342434253426342734283429343034313432343334343435343634373438343934403441344234433444344534463447344834493450345134523453345434553456345734583459346034613462346334643465346634673468346934703471347234733474347534763477347834793480348134823483348434853486348734883489349034913492349334943495349634973498349935003501350235033504350535063507350835093510351135123513351435153516351735183519352035213522352335243525352635273528352935303531353235333534353535363537353835393540354135423543354435453546354735483549355035513552355335543555355635573558355935603561356235633564356535663567356835693570357135723573357435753576357735783579358035813582358335843585358635873588358935903591359235933594359535963597359835993600360136023603360436053606360736083609361036113612361336143615361636173618361936203621362236233624362536263627362836293630363136323633363436353636363736383639364036413642364336443645364636473648364936503651365236533654365536563657365836593660366136623663366436653666366736683669367036713672367336743675367636773678367936803681368236833684368536863687368836893690369136923693369436953696369736983699370037013702370337043705370637073708370937103711371237133714371537163717371837193720372137223723372437253726372737283729373037313732373337343735373637373738373937403741374237433744374537463747374837493750375137523753375437553756375737583759376037613762376337643765376637673768376937703771377237733774377537763777377837793780378137823783378437853786378737883789379037913792379337943795379637973798379938003801380238033804380538063807380838093810381138123813381438153816381738183819382038213822382338243825382638273828382938303831383238333834383538363837383838393840384138423843384438453846384738483849385038513852385338543855385638573858385938603861386238633864386538663867386838693870387138723873387438753876387738783879388038813882388338843885388638873888388938903891389238933894389538963897389838993900390139023903390439053906390739083909391039113912391339143915391639173918391939203921392239233924392539263927392839293930393139323933393439353936393739383939394039413942394339443945394639473948394939503951395239533954395539563957395839593960396139623963396439653966396739683969397039713972397339743975397639773978397939803981398239833984398539863987398839893990399139923993399439953996399739983999400040014002400340044005400640074008400940104011401240134014401540164017401840194020402140224023402440254026402740284029403040314032403340344035403640374038403940404041404240434044404540464047404840494050405140524053405440554056405740584059406040614062406340644065406640674068406940704071407240734074407540764077407840794080408140824083408440854086408740884089409040914092409340944095409640974098409941004101410241034104410541064107410841094110411141124113411441154116411741184119412041214122412341244125412641274128412941304131413241334134413541364137413841394140414141424143414441454146414741484149415041514152415341544155415641574158415941604161416241634164416541664167416841694170417141724173417441754176417741784179418041814182418341844185418641874188418941904191419241934194419541964197419841994200420142024203420442054206420742084209421042114212421342144215421642174218421942204221422242234224422542264227422842294230423142324233423442354236423742384239424042414242424342444245424642474248424942504251425242534254425542564257425842594260426142624263426442654266426742684269427042714272427342744275427642774278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477747784779478047814782478347844785478647874788478947904791479247934794479547964797479847994800480148024803480448054806480748084809481048114812481348144815481648174818481948204821482248234824482548264827482848294830483148324833483448354836483748384839484048414842484348444845484648474848484948504851485248534854485548564857485848594860486148624863486448654866486748684869487048714872487348744875487648774878487948804881488248834884488548864887488848894890489148924893489448954896489748984899490049014902490349044905490649074908490949104911491249134914491549164917491849194920492149224923492449254926492749284929493049314932493349344935493649374938493949404941494249434944494549464947494849494950495149524953495449554956495749584959496049614962496349644965496649674968496949704971497249734974497549764977497849794980498149824983498449854986498749884989499049914992499349944995499649974998499950005001500250035004500550065007500850095010501150125013501450155016501750185019502050215022502350245025502650275028502950305031503250335034503550365037503850395040504150425043504450455046504750485049505050515052505350545055505650575058505950605061506250635064506550665067506850695070507150725073507450755076507750785079508050815082508350845085508650875088508950905091509250935094509550965097509850995100510151025103510451055106510751085109511051115112511351145115511651175118511951205121512251235124512551265127512851295130513151325133513451355136513751385139514051415142514351445145514651475148514951505151515251535154515551565157515851595160516151625163516451655166516751685169517051715172517351745175517651775178517951805181518251835184518551865187518851895190519151925193519451955196519751985199520052015202520352045205520652075208520952105211521252135214521552165217521852195220522152225223522452255226522752285229523052315232523352345235523652375238523952405241524252435244524552465247524852495250525152525253525452555256525752585259526052615262526352645265526652675268526952705271527252735274527552765277527852795280528152825283528452855286528752885289529052915292529352945295529652975298529953005301530253035304530553065307530853095310531153125313531453155316531753185319532053215322532353245325532653275328532953305331533253335334533553365337533853395340534153425343534453455346534753485349535053515352535353545355535653575358535953605361536253635364536553665367536853695370537153725373537453755376537753785379538053815382538353845385538653875388538953905391539253935394539553965397539853995400540154025403540454055406540754085409541054115412541354145415541654175418541954205421542254235424542554265427542854295430543154325433543454355436543754385439544054415442544354445445544654475448544954505451545254535454545554565457545854595460546154625463546454655466546754685469547054715472547354745475547654775478547954805481548254835484548554865487548854895490549154925493549454955496549754985499550055015502550355045505550655075508550955105511551255135514551555165517551855195520552155225523552455255526552755285529553055315532553355345535553655375538553955405541554255435544554555465547554855495550555155525553555455555556555755585559556055615562556355645565556655675568556955705571557255735574557555765577557855795580558155825583558455855586558755885589559055915592559355945595559655975598559956005601560256035604560556065607560856095610561156125613561456155616561756185619562056215622562356245625562656275628562956305631563256335634563556365637563856395640564156425643564456455646564756485649565056515652565356545655565656575658565956605661566256635664566556665667566856695670567156725673567456755676567756785679568056815682568356845685568656875688568956905691569256935694569556965697569856995700570157025703570457055706570757085709571057115712571357145715571657175718571957205721572257235724572557265727572857295730573157325733573457355736573757385739574057415742574357445745574657475748574957505751575257535754575557565757575857595760576157625763576457655766576757685769577057715772577357745775577657775778577957805781578257835784578557865787578857895790579157925793579457955796579757985799580058015802580358045805580658075808580958105811581258135814581558165817581858195820582158225823582458255826582758285829583058315832583358345835583658375838583958405841584258435844584558465847584858495850585158525853585458555856585758585859586058615862586358645865586658675868586958705871587258735874587558765877587858795880588158825883588458855886588758885889589058915892589358945895589658975898589959005901590259035904590559065907590859095910591159125913591459155916591759185919592059215922592359245925592659275928592959305931593259335934593559365937593859395940594159425943594459455946594759485949595059515952595359545955595659575958595959605961596259635964596559665967596859695970597159725973597459755976597759785979598059815982598359845985598659875988598959905991599259935994599559965997599859996000600160026003600460056006600760086009601060116012601360146015601660176018601960206021602260236024602560266027602860296030603160326033603460356036603760386039604060416042604360446045604660476048604960506051605260536054605560566057605860596060606160626063606460656066606760686069607060716072607360746075607660776078607960806081608260836084608560866087608860896090609160926093609460956096609760986099610061016102610361046105610661076108610961106111611261136114611561166117611861196120612161226123612461256126612761286129613061316132613361346135613661376138613961406141614261436144614561466147614861496150615161526153615461556156615761586159616061616162616361646165616661676168616961706171617261736174617561766177617861796180618161826183618461856186618761886189619061916192619361946195619661976198619962006201620262036204620562066207620862096210621162126213621462156216621762186219622062216222622362246225622662276228622962306231623262336234623562366237623862396240624162426243624462456246624762486249625062516252625362546255625662576258625962606261626262636264626562666267626862696270627162726273627462756276627762786279628062816282628362846285628662876288628962906291629262936294629562966297629862996300630163026303630463056306630763086309631063116312631363146315631663176318631963206321632263236324632563266327632863296330633163326333633463356336633763386339634063416342634363446345634663476348634963506351635263536354635563566357635863596360636163626363636463656366636763686369637063716372637363746375637663776378637963806381638263836384638563866387638863896390639163926393639463956396639763986399640064016402640364046405640664076408640964106411641264136414641564166417641864196420642164226423642464256426642764286429643064316432643364346435643664376438643964406441644264436444644564466447644864496450645164526453645464556456645764586459646064616462646364646465646664676468646964706471647264736474647564766477647864796480648164826483648464856486648764886489649064916492649364946495649664976498649965006501650265036504650565066507650865096510651165126513651465156516651765186519652065216522652365246525652665276528652965306531653265336534653565366537653865396540654165426543654465456546654765486549655065516552655365546555655665576558655965606561656265636564656565666567656865696570657165726573657465756576657765786579658065816582658365846585658665876588658965906591659265936594659565966597659865996600660166026603660466056606660766086609661066116612661366146615661666176618661966206621662266236624662566266627662866296630663166326633663466356636663766386639664066416642664366446645664666476648664966506651665266536654665566566657665866596660666166626663666466656666666766686669667066716672667366746675667666776678667966806681668266836684668566866687668866896690669166926693669466956696669766986699670067016702670367046705670667076708670967106711671267136714671567166717671867196720672167226723672467256726672767286729673067316732673367346735673667376738673967406741674267436744674567466747674867496750675167526753675467556756675767586759676067616762676367646765676667676768676967706771677267736774677567766777677867796780678167826783678467856786678767886789679067916792679367946795679667976798679968006801680268036804680568066807680868096810681168126813681468156816681768186819682068216822682368246825682668276828682968306831683268336834683568366837683868396840684168426843684468456846684768486849685068516852685368546855685668576858685968606861686268636864686568666867686868696870687168726873687468756876687768786879688068816882688368846885688668876888688968906891689268936894689568966897689868996900690169026903690469056906690769086909691069116912691369146915691669176918691969206921692269236924692569266927692869296930693169326933693469356936693769386939694069416942694369446945694669476948694969506951695269536954695569566957695869596960696169626963696469656966696769686969697069716972697369746975697669776978697969806981698269836984698569866987698869896990699169926993
  1. {$IFNDEF FPC_DOTTEDUNITS}
  2. unit regexpr;
  3. {$ENDIF FPC_DOTTEDUNITS}
  4. {
  5. TRegExpr class library
  6. Delphi Regular Expressions
  7. Copyright (c) 1999-2004 Andrey V. Sorokin, St.Petersburg, Russia
  8. You can choose to use this Pascal unit in one of the two following licenses:
  9. Option 1>
  10. You may use this software in any kind of development,
  11. including comercial, redistribute, and modify it freely,
  12. under the following restrictions :
  13. 1. This software is provided as it is, without any kind of
  14. warranty given. Use it at Your own risk.The author is not
  15. responsible for any consequences of use of this software.
  16. 2. The origin of this software may not be mispresented, You
  17. must not claim that You wrote the original software. If
  18. You use this software in any kind of product, it would be
  19. appreciated that there in a information box, or in the
  20. documentation would be an acknowledgement like
  21. Partial Copyright (c) 2004 Andrey V. Sorokin
  22. https://sorokin.engineer/
  23. [email protected]
  24. 3. You may not have any income from distributing this source
  25. (or altered version of it) to other developers. When You
  26. use this product in a comercial package, the source may
  27. not be charged seperatly.
  28. 4. Altered versions must be plainly marked as such, and must
  29. not be misrepresented as being the original software.
  30. 5. RegExp Studio application and all the visual components as
  31. well as documentation is not part of the TRegExpr library
  32. and is not free for usage.
  33. https://sorokin.engineer/
  34. [email protected]
  35. Option 2>
  36. The same modified LGPL with static linking exception as the Free Pascal RTL
  37. }
  38. {
  39. program is essentially a linear encoding
  40. of a nondeterministic finite-state machine (aka syntax charts or
  41. "railroad normal form" in parsing technology). Each node is an opcode
  42. plus a "next" pointer, possibly plus an operand. "Next" pointers of
  43. all nodes except BRANCH implement concatenation; a "next" pointer with
  44. a BRANCH on both ends of it connects two alternatives. (Here we
  45. have one of the subtle syntax dependencies: an individual BRANCH (as
  46. opposed to a collection of them) is never concatenated with anything
  47. because of operator precedence.) The operand of some types of node is
  48. a literal string; for others, it is a node leading into a sub-FSM. In
  49. particular, the operand of a BRANCH node is the first node of the branch.
  50. (NB this is *not* a tree structure: the tail of the branch connects
  51. to the thing following the set of BRANCHes.)
  52. }
  53. interface
  54. { off $DEFINE DebugSynRegExpr }
  55. // ======== Determine compiler
  56. // ======== Define base compiler options
  57. {$BOOLEVAL OFF}
  58. {$EXTENDEDSYNTAX ON}
  59. {$LONGSTRINGS ON}
  60. {$MODE DELPHI} // Delphi-compatible mode in FreePascal
  61. {$INLINE ON}
  62. {$DEFINE COMPAT}
  63. // ======== Define options for TRegExpr engine
  64. { off $DEFINE UnicodeRE} // Use WideChar for characters and UnicodeString/WideString for strings
  65. { off $DEFINE UnicodeEx} // Support Unicode >0xFFFF, e.g. emoji, e.g. "." must find 2 WideChars of 1 emoji
  66. {$DEFINE UseWordChars} // Use WordChars property, otherwise fixed list 'a'..'z','A'..'Z','0'..'9','_'
  67. {$DEFINE UseSpaceChars} // Use SpaceChars property, otherwise fixed list
  68. {$DEFINE UseLineSep} // Use LineSeparators property, otherwise fixed line-break chars
  69. { off $DEFINE FastUnicodeData} // Use arrays for UpperCase/LowerCase/IsWordChar, they take 320K more memory
  70. {$DEFINE UseFirstCharSet} // Enable optimization, which finds possible first chars of input string
  71. {$DEFINE RegExpPCodeDump} // Enable method Dump() to show opcode as string
  72. {$IFNDEF FPC} // Not supported in FreePascal
  73. {$DEFINE reRealExceptionAddr} // Exceptions will point to appropriate source line, not to Error procedure
  74. {$ENDIF}
  75. {$DEFINE ComplexBraces} // Support braces in complex cases
  76. {$IFNDEF UnicodeRE}
  77. {$UNDEF UnicodeEx}
  78. {$UNDEF FastUnicodeData}
  79. {$ENDIF}
  80. {.$DEFINE Compat} // Enable compatability methods/properties for forked version in Free Pascal 3.0
  81. // ======== Define Pascal-language options
  82. // Define 'UseAsserts' option (do not edit this definitions).
  83. // Asserts used to catch 'strange bugs' in TRegExpr implementation (when something goes
  84. // completely wrong). You can swith asserts on/off with help of {$C+}/{$C-} compiler options.
  85. {$IFDEF D3} {$DEFINE UseAsserts} {$ENDIF}
  86. {$IFDEF FPC} {$DEFINE UseAsserts} {$ENDIF}
  87. // Define 'use subroutine parameters default values' option (do not edit this definition).
  88. {$IFDEF D4} {$DEFINE DefParam} {$ENDIF}
  89. {$IFDEF FPC} {$DEFINE DefParam} {$ENDIF}
  90. // Define 'OverMeth' options, to use method overloading (do not edit this definitions).
  91. {$IFDEF D5} {$DEFINE OverMeth} {$ENDIF}
  92. {$IFDEF FPC} {$DEFINE OverMeth} {$ENDIF}
  93. // Define 'InlineFuncs' options, to use inline keyword (do not edit this definitions).
  94. {$IFDEF D8} {$DEFINE InlineFuncs} {$ENDIF}
  95. {$IFDEF FPC} {$DEFINE InlineFuncs} {$ENDIF}
  96. {$IFDEF FPC_DOTTEDUNITS}
  97. uses
  98. System.SysUtils, // Exception
  99. {$IFDEF D2009}
  100. {$IFDEF D_XE2}
  101. System.System.Character,
  102. {$ELSE}
  103. System.Character,
  104. {$ENDIF}
  105. {$ENDIF}
  106. System.Classes; // TStrings in Split method
  107. {$ELSE FPC_DOTTEDUNITS}
  108. uses
  109. SysUtils, // Exception
  110. {$IFDEF D2009}
  111. {$IFDEF D_XE2}
  112. System.Character,
  113. {$ELSE}
  114. Character,
  115. {$ENDIF}
  116. {$ENDIF}
  117. Classes; // TStrings in Split method
  118. {$ENDIF FPC_DOTTEDUNITS}
  119. type
  120. {$IFNDEF FPC}
  121. // Delphi doesn't have PtrInt but has NativeInt
  122. // but unfortunately NativeInt is declared wrongly in several versions
  123. {$IF SizeOf(Pointer)=4}
  124. PtrInt = Integer;
  125. PtrUInt = Cardinal;
  126. {$ELSE}
  127. PtrInt = Int64;
  128. PtrUInt = UInt64;
  129. {$IFEND}
  130. {$ENDIF}
  131. {$IFDEF UnicodeRE}
  132. PRegExprChar = PWideChar;
  133. {$IFDEF FPC}
  134. RegExprString = UnicodeString;
  135. {$ELSE}
  136. {$IFDEF D2009}
  137. RegExprString = UnicodeString;
  138. {$ELSE}
  139. RegExprString = WideString;
  140. {$ENDIF}
  141. {$ENDIF}
  142. REChar = WideChar;
  143. {$ELSE}
  144. PRegExprChar = PAnsiChar;
  145. RegExprString = AnsiString;
  146. REChar = AnsiChar;
  147. {$ENDIF}
  148. TREOp = REChar; // internal opcode type
  149. PREOp = ^TREOp;
  150. type
  151. TRegExprCharset = set of byte;
  152. const
  153. // Escape AnsiChar ('\' in common r.e.) used for escaping metachars (\w, \d etc)
  154. EscChar = '\';
  155. // Substitute method: prefix of group reference: $1 .. $9 and $<name>
  156. SubstituteGroupChar = '$';
  157. RegExprModifierI: boolean = False; // default value for ModifierI
  158. RegExprModifierR: boolean = True; // default value for ModifierR
  159. RegExprModifierS: boolean = True; // default value for ModifierS
  160. RegExprModifierG: boolean = True; // default value for ModifierG
  161. RegExprModifierM: boolean = False; // default value for ModifierM
  162. RegExprModifierX: boolean = False; // default value for ModifierX
  163. {$IFDEF UseSpaceChars}
  164. // default value for SpaceChars
  165. RegExprSpaceChars: RegExprString = ' '#$9#$A#$D#$C;
  166. {$ENDIF}
  167. {$IFDEF UseWordChars}
  168. // default value for WordChars
  169. RegExprWordChars: RegExprString = '0123456789'
  170. + 'abcdefghijklmnopqrstuvwxyz'
  171. + 'ABCDEFGHIJKLMNOPQRSTUVWXYZ_';
  172. {$ENDIF}
  173. {$IFDEF UseLineSep}
  174. // default value for LineSeparators
  175. RegExprLineSeparators: RegExprString = #$d#$a#$b#$c
  176. {$IFDEF UnicodeRE}
  177. + #$2028#$2029#$85
  178. {$ENDIF};
  179. {$ENDIF}
  180. // Tab and Unicode category "Space Separator":
  181. // https://www.compart.com/en/unicode/category/Zs
  182. RegExprHorzSeparators: RegExprString = #9#$20#$A0
  183. {$IFDEF UnicodeRE}
  184. + #$1680#$2000#$2001#$2002#$2003#$2004#$2005#$2006#$2007#$2008#$2009#$200A#$202F#$205F#$3000
  185. {$ENDIF};
  186. RegExprUsePairedBreak: boolean = True;
  187. RegExprReplaceLineBreak: RegExprString = sLineBreak;
  188. RegExprLookaheadIsAtomic: boolean = False;
  189. RegExprLookbehindIsAtomic: boolean = True;
  190. const
  191. // Max number of groups.
  192. // Be carefull - don't use values which overflow OP_CLOSE* opcode
  193. // (in this case you'll get compiler error).
  194. // Big value causes slower work and more stack required.
  195. RegexMaxGroups = 90;
  196. // Max possible value for RegexMaxGroups.
  197. // Don't change it! It's defined by internal TRegExpr design.
  198. RegexMaxMaxGroups = 255;
  199. // Max depth of recursion for (?R) and (?1)..(?9)
  200. RegexMaxRecursion = 20;
  201. {$IFDEF ComplexBraces}
  202. const
  203. LoopStackMax = 10; // max depth of loops stack //###0.925
  204. type
  205. TRegExprLoopStack = array [1 .. LoopStackMax] of integer;
  206. {$ENDIF}
  207. type
  208. TRegExprModifiers = record
  209. I: boolean;
  210. // Case-insensitive.
  211. R: boolean;
  212. // Extended syntax for Russian ranges in [].
  213. // If True, then а-я additionally includes letter 'ё',
  214. // А-Я additionally includes 'Ё', and а-Я includes all Russian letters.
  215. // Turn it off if it interferes with your national alphabet.
  216. S: boolean;
  217. // Dot '.' matches any AnsiChar, otherwise only [^\n].
  218. G: boolean;
  219. // Greedy. Switching it off switches all operators to non-greedy style,
  220. // so if G=False, then '*' works like '*?', '+' works like '+?' and so on.
  221. M: boolean;
  222. // Treat string as multiple lines. It changes `^' and `$' from
  223. // matching at only the very start/end of the string to the start/end
  224. // of any line anywhere within the string.
  225. X: boolean;
  226. // Allow comments in regex using # AnsiChar.
  227. end;
  228. function IsModifiersEqual(const A, B: TRegExprModifiers): boolean;
  229. type
  230. TRegExpr = class;
  231. TRegExprReplaceFunction = function(ARegExpr: TRegExpr): RegExprString of object;
  232. TRegExprCharChecker = function(ch: REChar): boolean of object;
  233. TRegExprCharCheckerArray = array[0 .. 30] of TRegExprCharChecker;
  234. TRegExprCharCheckerInfo = record
  235. CharBegin, CharEnd: REChar;
  236. CheckerIndex: integer;
  237. end;
  238. TRegExprCharCheckerInfos = array of TRegExprCharCheckerInfo;
  239. {$IFDEF Compat}
  240. TRegExprInvertCaseFunction = function(const Ch: REChar): REChar of object;
  241. {$ENDIF}
  242. TRegExprBounds = record
  243. GrpStart: array [0 .. RegexMaxGroups - 1] of PRegExprChar; // pointer to group start in InputString
  244. GrpEnd: array [0 .. RegexMaxGroups - 1] of PRegExprChar; // pointer to group end in InputString
  245. end;
  246. TRegExprBoundsArray = array[0 .. RegexMaxRecursion] of TRegExprBounds;
  247. { TRegExpr }
  248. TRegExpr = class
  249. private
  250. GrpBounds: TRegExprBoundsArray;
  251. GrpIndexes: array [0 .. RegexMaxGroups - 1] of integer; // map global group index to _capturing_ group index
  252. GrpNames: array [0 .. RegexMaxGroups - 1] of RegExprString; // names of groups, if non-empty
  253. GrpAtomic: array [0 .. RegexMaxGroups - 1] of boolean; // group[i] is atomic (filled in Compile)
  254. GrpAtomicDone: array [0 .. RegexMaxGroups - 1] of boolean; // atomic group[i] is "done" (used in Exec* only)
  255. GrpOpCodes: array [0 .. RegexMaxGroups - 1] of PRegExprChar; // pointer to opcode of group[i] (used by OP_SUBCALL*)
  256. GrpSubCalled: array [0 .. RegexMaxGroups - 1] of boolean; // group[i] is called by OP_SUBCALL*
  257. GrpCount: integer;
  258. {$IFDEF ComplexBraces}
  259. LoopStack: TRegExprLoopStack; // state before entering loop
  260. LoopStackIdx: integer; // 0 - out of all loops
  261. {$ENDIF}
  262. // The "internal use only" fields to pass info from compile
  263. // to execute that permits the execute phase to run lots faster on
  264. // simple cases.
  265. regAnchored: REChar; // is the match anchored (at beginning-of-line only)?
  266. // regAnchored permits very fast decisions on suitable starting points
  267. // for a match, cutting down the work a lot. regMust permits fast rejection
  268. // of lines that cannot possibly match. The regMust tests are costly enough
  269. // that regcomp() supplies a regMust only if the r.e. contains something
  270. // potentially expensive (at present, the only such thing detected is * or +
  271. // at the start of the r.e., which can involve a lot of backup). regMustLen is
  272. // supplied because the test in regexec() needs it and regcomp() is computing
  273. // it anyway.
  274. regMust: PRegExprChar; // string (pointer into program) that match must include, or nil
  275. regMustLen: integer; // length of regMust string
  276. regMustString: RegExprString; // string which must occur in match (got from regMust/regMustLen)
  277. regLookahead: boolean; // regex has _some_ lookahead
  278. regLookaheadNeg: boolean; // regex has _nagative_ lookahead
  279. regLookaheadGroup: integer; // index of group for lookahead
  280. regLookbehind: boolean; // regex has positive lookbehind
  281. regNestedCalls: integer; // some attempt to prevent 'catastrophic backtracking' but not used
  282. {$IFDEF UseFirstCharSet}
  283. FirstCharSet: TRegExprCharset;
  284. FirstCharArray: array[byte] of boolean;
  285. {$ENDIF}
  286. // work variables for Exec routines - save stack in recursion
  287. regInput: PRegExprChar; // pointer to currently handling AnsiChar of input string
  288. fInputStart: PRegExprChar; // pointer to first AnsiChar of input string
  289. fInputEnd: PRegExprChar; // pointer after last AnsiChar of input string
  290. fRegexStart: PRegExprChar; // pointer to first AnsiChar of regex
  291. fRegexEnd: PRegExprChar; // pointer after last AnsiChar of regex
  292. regCurrentGrp: integer; // index of group handling by OP_OPEN* opcode
  293. regRecursion: integer; // current level of recursion (?R) (?1); always 0 if no recursion is used
  294. // work variables for compiler's routines
  295. regParse: PRegExprChar; // pointer to currently handling AnsiChar of regex
  296. regNumBrackets: integer; // count of () brackets
  297. regDummy: REChar; // dummy pointer, used to detect 1st/2nd pass of Compile
  298. // if p=@regDummy, it is pass-1: opcode memory is not yet allocated
  299. programm: PRegExprChar; // pointer to opcode, =nil in pass-1
  300. regCode: PRegExprChar; // pointer to last emitted opcode; changing in pass-2, but =@regDummy in pass-1
  301. regCodeSize: integer; // total opcode size in REChars
  302. regCodeWork: PRegExprChar; // pointer to opcode, to first code after MAGIC
  303. regExactlyLen: PLongInt; // pointer to length of substring of OP_EXACTLY* inside opcode
  304. fSecondPass: boolean; // true inside pass-2 of Compile
  305. fExpression: RegExprString; // regex string
  306. fInputString: RegExprString; // input string
  307. fLastError: integer; // Error call sets code of LastError
  308. fLastErrorOpcode: TREOp;
  309. fLastErrorSymbol: REChar;
  310. fModifiers: TRegExprModifiers; // regex modifiers
  311. fCompModifiers: TRegExprModifiers; // compiler's copy of modifiers
  312. fProgModifiers: TRegExprModifiers; // modifiers values from last programm compilation
  313. {$IFDEF UseSpaceChars}
  314. fSpaceChars: RegExprString;
  315. {$ENDIF}
  316. {$IFDEF UseWordChars}
  317. fWordChars: RegExprString;
  318. {$ENDIF}
  319. {$IFDEF UseLineSep}
  320. fLineSeparators: RegExprString;
  321. {$ENDIF}
  322. fUsePairedBreak: boolean;
  323. fReplaceLineEnd: RegExprString; // string to use for "\n" in Substitute method
  324. fSlowChecksSizeMax: integer;
  325. // Exec() param ASlowChecks is set to True, when Length(InputString)<SlowChecksSizeMax
  326. // This ASlowChecks enables to use regMustString optimization
  327. {$IFDEF UseLineSep}
  328. {$IFNDEF UnicodeRE}
  329. fLineSepArray: array[byte] of boolean;
  330. {$ENDIF}
  331. {$ENDIF}
  332. CharCheckers: TRegExprCharCheckerArray;
  333. CharCheckerInfos: TRegExprCharCheckerInfos;
  334. CheckerIndex_Word: byte;
  335. CheckerIndex_NotWord: byte;
  336. CheckerIndex_Digit: byte;
  337. CheckerIndex_NotDigit: byte;
  338. CheckerIndex_Space: byte;
  339. CheckerIndex_NotSpace: byte;
  340. CheckerIndex_HorzSep: byte;
  341. CheckerIndex_NotHorzSep: byte;
  342. CheckerIndex_VertSep: byte;
  343. CheckerIndex_NotVertSep: byte;
  344. CheckerIndex_LowerAZ: byte;
  345. CheckerIndex_UpperAZ: byte;
  346. fHelper: TRegExpr;
  347. fHelperLen: integer;
  348. {$IFDEF Compat}
  349. fUseUnicodeWordDetection: boolean;
  350. fInvertCase: TRegExprInvertCaseFunction;
  351. fEmptyInputRaisesError: boolean;
  352. fUseOsLineEndOnReplace: boolean;
  353. function OldInvertCase(const Ch: REChar): REChar;
  354. function GetLinePairedSeparator: RegExprString;
  355. procedure SetLinePairedSeparator(const AValue: RegExprString);
  356. procedure SetUseOsLineEndOnReplace(AValue: boolean);
  357. {$ENDIF}
  358. procedure InitCharCheckers;
  359. function CharChecker_Word(ch: REChar): boolean;
  360. function CharChecker_NotWord(ch: REChar): boolean;
  361. function CharChecker_Space(ch: REChar): boolean;
  362. function CharChecker_NotSpace(ch: REChar): boolean;
  363. function CharChecker_Digit(ch: REChar): boolean;
  364. function CharChecker_NotDigit(ch: REChar): boolean;
  365. function CharChecker_HorzSep(ch: REChar): boolean;
  366. function CharChecker_NotHorzSep(ch: REChar): boolean;
  367. function CharChecker_VertSep(ch: REChar): boolean;
  368. function CharChecker_NotVertSep(ch: REChar): boolean;
  369. function CharChecker_LowerAZ(ch: REChar): boolean;
  370. function CharChecker_UpperAZ(ch: REChar): boolean;
  371. function DumpCheckerIndex(N: byte): RegExprString;
  372. function DumpCategoryChars(ch, ch2: REChar; Positive: boolean): RegExprString;
  373. procedure ClearMatches; {$IFDEF InlineFuncs}inline;{$ENDIF}
  374. procedure ClearInternalIndexes; {$IFDEF InlineFuncs}inline;{$ENDIF}
  375. function FindInCharClass(ABuffer: PRegExprChar; AChar: REChar; AIgnoreCase: boolean): boolean;
  376. procedure GetCharSetFromCharClass(ABuffer: PRegExprChar; AIgnoreCase: boolean; var ARes: TRegExprCharset);
  377. procedure GetCharSetFromSpaceChars(var ARes: TRegExprCharset); {$IFDEF InlineFuncs}inline;{$ENDIF}
  378. procedure GetCharSetFromWordChars(var ARes: TRegExprCharSet); {$IFDEF InlineFuncs}inline;{$ENDIF}
  379. function IsWordChar(AChar: REChar): boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
  380. function IsSpaceChar(AChar: REChar): boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
  381. function IsCustomLineSeparator(AChar: REChar): boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
  382. {$IFDEF UseLineSep}
  383. procedure InitLineSepArray;
  384. {$ENDIF}
  385. procedure FindGroupName(APtr, AEndPtr: PRegExprChar; AEndChar: REChar; var AName: RegExprString);
  386. // Mark programm as having to be [re]compiled
  387. procedure InvalidateProgramm;
  388. // Check if we can use compiled regex, compile it if something changed
  389. function IsProgrammOk: boolean;
  390. procedure SetExpression(const AStr: RegExprString);
  391. function GetModifierStr: RegExprString;
  392. procedure SetModifierStr(const AStr: RegExprString);
  393. function GetModifierG: boolean;
  394. function GetModifierI: boolean;
  395. function GetModifierM: boolean;
  396. function GetModifierR: boolean;
  397. function GetModifierS: boolean;
  398. function GetModifierX: boolean;
  399. procedure SetModifierG(AValue: boolean);
  400. procedure SetModifierI(AValue: boolean);
  401. procedure SetModifierM(AValue: boolean);
  402. procedure SetModifierR(AValue: boolean);
  403. procedure SetModifierS(AValue: boolean);
  404. procedure SetModifierX(AValue: boolean);
  405. // Default handler raises exception ERegExpr with
  406. // Message = ErrorMsg (AErrorID), ErrorCode = AErrorID
  407. // and CompilerErrorPos = value of property CompilerErrorPos.
  408. procedure Error(AErrorID: integer); virtual; // error handler.
  409. { ==================== Compiler section =================== }
  410. // compile a regular expression into internal code
  411. function CompileRegExpr(ARegExp: PRegExprChar): boolean;
  412. // set the next-pointer at the end of a node chain
  413. procedure Tail(p: PRegExprChar; val: PRegExprChar);
  414. // regoptail - regtail on operand of first argument; nop if operandless
  415. procedure OpTail(p: PRegExprChar; val: PRegExprChar);
  416. // regnode - emit a node, return location
  417. function EmitNode(op: TREOp): PRegExprChar;
  418. // emit (if appropriate) a byte of code
  419. procedure EmitC(ch: REChar); {$IFDEF InlineFuncs}inline;{$ENDIF}
  420. // emit LongInt value
  421. procedure EmitInt(AValue: LongInt); {$IFDEF InlineFuncs}inline;{$ENDIF}
  422. // emit back-reference to group
  423. function EmitGroupRef(AIndex: integer; AIgnoreCase: boolean): PRegExprChar;
  424. {$IFDEF FastUnicodeData}
  425. procedure FindCategoryName(var scan: PRegExprChar; var ch1, ch2: REChar);
  426. function EmitCategoryMain(APositive: boolean): PRegExprChar;
  427. {$ENDIF}
  428. // insert an operator in front of already-emitted operand
  429. // Means relocating the operand.
  430. procedure InsertOperator(op: TREOp; opnd: PRegExprChar; sz: integer);
  431. // ###0.90
  432. // regular expression, i.e. main body or parenthesized thing
  433. function ParseReg(InBrackets: boolean; var FlagParse: integer): PRegExprChar;
  434. // one alternative of an | operator
  435. function ParseBranch(var FlagParse: integer): PRegExprChar;
  436. // something followed by possible [*+?]
  437. function ParsePiece(var FlagParse: integer): PRegExprChar;
  438. function HexDig(Ch: REChar): integer;
  439. function UnQuoteChar(var APtr, AEnd: PRegExprChar): REChar;
  440. // the lowest level
  441. function ParseAtom(var FlagParse: integer): PRegExprChar;
  442. // current pos in r.e. - for error hanling
  443. function GetCompilerErrorPos: PtrInt;
  444. {$IFDEF UseFirstCharSet} // ###0.929
  445. procedure FillFirstCharSet(prog: PRegExprChar);
  446. {$ENDIF}
  447. { ===================== Matching section =================== }
  448. // repeatedly match something simple, report how many
  449. function FindRepeated(p: PRegExprChar; AMax: integer): integer;
  450. // dig the "next" pointer out of a node
  451. function regNext(p: PRegExprChar): PRegExprChar;
  452. // recursively matching routine
  453. function MatchPrim(prog: PRegExprChar): boolean;
  454. // match at specific position only, called from ExecPrim
  455. function MatchAtOnePos(APos: PRegExprChar): boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
  456. // Exec for stored InputString
  457. function ExecPrim(AOffset: integer; ATryOnce, ASlowChecks, ABackward: boolean): boolean;
  458. function GetSubExprCount: integer;
  459. function GetMatchPos(Idx: integer): PtrInt;
  460. function GetMatchLen(Idx: integer): PtrInt;
  461. function GetMatch(Idx: integer): RegExprString;
  462. procedure SetInputString(const AInputString: RegExprString);
  463. procedure SetInputRange(AStart, AEnd: PRegExprChar);
  464. {$IFDEF UseLineSep}
  465. procedure SetLineSeparators(const AStr: RegExprString);
  466. {$ENDIF}
  467. procedure SetUsePairedBreak(AValue: boolean);
  468. public
  469. constructor Create; {$IFDEF OverMeth} overload;
  470. constructor Create(const AExpression: RegExprString); overload;
  471. {$ENDIF}
  472. destructor Destroy; override;
  473. class function VersionMajor: integer;
  474. class function VersionMinor: integer;
  475. // match a programm against a string AInputString
  476. // !!! Exec store AInputString into InputString property
  477. // For Delphi 5 and higher available overloaded versions - first without
  478. // parameter (uses already assigned to InputString property value)
  479. // and second that has int parameter and is same as ExecPos
  480. function Exec(const AInputString: RegExprString): boolean;
  481. {$IFDEF OverMeth} overload;
  482. function Exec: boolean; overload;
  483. function Exec(AOffset: integer): boolean; overload;
  484. {$ENDIF}
  485. // find next match:
  486. // ExecNext;
  487. // works the same as
  488. // if MatchLen [0] = 0 then ExecPos (MatchPos [0] + 1)
  489. // else ExecPos (MatchPos [0] + MatchLen [0]);
  490. // but it's more simpler !
  491. // Raises exception if used without preceeding SUCCESSFUL call to
  492. // Exec* (Exec, ExecPos, ExecNext). So You always must use something like
  493. // if Exec (InputString) then repeat { proceed results} until not ExecNext;
  494. function ExecNext(ABackward: boolean {$IFDEF DefParam} = False{$ENDIF}): boolean;
  495. // find match for InputString starting from AOffset position
  496. // (AOffset=1 - first AnsiChar of InputString)
  497. function ExecPos(AOffset: integer {$IFDEF DefParam} = 1{$ENDIF}): boolean;
  498. {$IFDEF OverMeth} overload;
  499. function ExecPos(AOffset: integer; ATryOnce, ABackward: boolean): boolean; overload;
  500. {$ENDIF}
  501. // Returns ATemplate with '$&' or '$0' replaced by whole r.e.
  502. // occurence and '$1'...'$nn' replaced by subexpression with given index.
  503. // Symbol '$' is used instead of '\' (for future extensions
  504. // and for more Perl-compatibility) and accepts more than one digit.
  505. // If you want to place into template raw '$' or '\', use prefix '\'.
  506. // Example: '1\$ is $2\\rub\\' -> '1$ is <Match[2]>\rub\'
  507. // If you want to place any number after '$' you must enclose it
  508. // with curly braces: '${12}'.
  509. // Example: 'a$12bc' -> 'a<Match[12]>bc'
  510. // 'a${1}2bc' -> 'a<Match[1]>2bc'.
  511. function Substitute(const ATemplate: RegExprString): RegExprString;
  512. // Splits AInputStr to list by positions of all r.e. occurencies.
  513. // Internally calls Exec, ExecNext.
  514. procedure Split(const AInputStr: RegExprString; APieces: TStrings);
  515. function Replace(const AInputStr: RegExprString;
  516. const AReplaceStr: RegExprString;
  517. AUseSubstitution: boolean{$IFDEF DefParam} = False{$ENDIF}) // ###0.946
  518. : RegExprString; {$IFDEF OverMeth} overload;
  519. function Replace(const AInputStr: RegExprString;
  520. AReplaceFunc: TRegExprReplaceFunction): RegExprString; overload;
  521. {$ENDIF}
  522. // Returns AInputStr with r.e. occurencies replaced by AReplaceStr.
  523. // If AUseSubstitution is true, then AReplaceStr will be used
  524. // as template for Substitution methods.
  525. // For example:
  526. // Expression := '({-i}block|var)\s*\(\s*([^ ]*)\s*\)\s*';
  527. // Replace ('BLOCK( test1)', 'def "$1" value "$2"', True);
  528. // will return: def 'BLOCK' value 'test1'
  529. // Replace ('BLOCK( test1)', 'def "$1" value "$2"')
  530. // will return: def "$1" value "$2"
  531. // Internally calls Exec, ExecNext.
  532. // Overloaded version and ReplaceEx operate with callback function,
  533. // so you can implement really complex functionality.
  534. function ReplaceEx(const AInputStr: RegExprString;
  535. AReplaceFunc: TRegExprReplaceFunction): RegExprString;
  536. {$IFDEF Compat}
  537. function ExecPos(AOffset: integer; ATryOnce: boolean): boolean; overload; deprecated 'Use modern form of ExecPos()';
  538. class function InvertCaseFunction(const Ch: REChar): REChar; deprecated 'This has no effect now';
  539. property InvertCase: TRegExprInvertCaseFunction read fInvertCase write fInvertCase; deprecated 'This has no effect now';
  540. property UseUnicodeWordDetection: boolean read fUseUnicodeWordDetection write fUseUnicodeWordDetection; deprecated 'This has no effect, use {$DEFINE UnicodeRE} instead';
  541. property LinePairedSeparator: RegExprString read GetLinePairedSeparator write SetLinePairedSeparator; deprecated 'This has no effect now';
  542. property EmptyInputRaisesError: boolean read fEmptyInputRaisesError write fEmptyInputRaisesError; deprecated 'This has no effect now';
  543. property UseOsLineEndOnReplace: boolean read fUseOsLineEndOnReplace write SetUseOsLineEndOnReplace; deprecated 'Use property ReplaceLineEnd instead';
  544. {$ENDIF}
  545. // Returns ID of last error, 0 if no errors (unusable if
  546. // Error method raises exception) and clear internal status
  547. // into 0 (no errors).
  548. function LastError: integer;
  549. // Returns Error message for error with ID = AErrorID.
  550. function ErrorMsg(AErrorID: integer): RegExprString; virtual;
  551. // Re-compile regex
  552. procedure Compile;
  553. {$IFDEF RegExpPCodeDump}
  554. // Show compiled regex in textual form
  555. function Dump: RegExprString;
  556. // Show single opcode in textual form
  557. function DumpOp(op: TREOp): RegExprString;
  558. {$ENDIF}
  559. function IsCompiled: boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
  560. // Opcode contains only operations for fixed match length: EXACTLY*, ANY*, etc
  561. function IsFixedLength(var op: TREOp; var ALen: integer): boolean;
  562. // Regular expression.
  563. // For optimization, TRegExpr will automatically compiles it into 'P-code'
  564. // (You can see it with help of Dump method) and stores in internal
  565. // structures. Real [re]compilation occures only when it really needed -
  566. // while calling Exec, ExecNext, Substitute, Dump, etc
  567. // and only if Expression or other P-code affected properties was changed
  568. // after last [re]compilation.
  569. // If any errors while [re]compilation occures, Error method is called
  570. // (by default Error raises exception - see below)
  571. property Expression: RegExprString read fExpression write SetExpression;
  572. // Set/get default values of r.e.syntax modifiers. Modifiers in
  573. // r.e. (?ismx-ismx) will replace this default values.
  574. // If you try to set unsupported modifier, Error will be called
  575. // (by defaul Error raises exception ERegExpr).
  576. property ModifierStr: RegExprString read GetModifierStr write SetModifierStr;
  577. property ModifierI: boolean read GetModifierI write SetModifierI;
  578. property ModifierR: boolean read GetModifierR write SetModifierR;
  579. property ModifierS: boolean read GetModifierS write SetModifierS;
  580. property ModifierG: boolean read GetModifierG write SetModifierG;
  581. property ModifierM: boolean read GetModifierM write SetModifierM;
  582. property ModifierX: boolean read GetModifierX write SetModifierX;
  583. // returns current input string (from last Exec call or last assign
  584. // to this property).
  585. // Any assignment to this property clear Match* properties !
  586. property InputString: RegExprString read fInputString write SetInputString;
  587. // Number of subexpressions has been found in last Exec* call.
  588. // If there are no subexpr. but whole expr was found (Exec* returned True),
  589. // then SubExprMatchCount=0, if no subexpressions nor whole
  590. // r.e. found (Exec* returned false) then SubExprMatchCount=-1.
  591. // Note, that some subexpr. may be not found and for such
  592. // subexpr. MathPos=MatchLen=-1 and Match=''.
  593. // For example: Expression := '(1)?2(3)?';
  594. // Exec ('123'): SubExprMatchCount=2, Match[0]='123', [1]='1', [2]='3'
  595. // Exec ('12'): SubExprMatchCount=1, Match[0]='12', [1]='1'
  596. // Exec ('23'): SubExprMatchCount=2, Match[0]='23', [1]='', [2]='3'
  597. // Exec ('2'): SubExprMatchCount=0, Match[0]='2'
  598. // Exec ('7') - return False: SubExprMatchCount=-1
  599. property SubExprMatchCount: integer read GetSubExprCount;
  600. // pos of entrance subexpr. #Idx into tested in last Exec*
  601. // string. First subexpr. has Idx=1, last - MatchCount,
  602. // whole r.e. has Idx=0.
  603. // Returns -1 if in r.e. no such subexpr. or this subexpr.
  604. // not found in input string.
  605. property MatchPos[Idx: integer]: PtrInt read GetMatchPos;
  606. // len of entrance subexpr. #Idx r.e. into tested in last Exec*
  607. // string. First subexpr. has Idx=1, last - MatchCount,
  608. // whole r.e. has Idx=0.
  609. // Returns -1 if in r.e. no such subexpr. or this subexpr.
  610. // not found in input string.
  611. // Remember - MatchLen may be 0 (if r.e. match empty string) !
  612. property MatchLen[Idx: integer]: PtrInt read GetMatchLen;
  613. // == copy (InputString, MatchPos [Idx], MatchLen [Idx])
  614. // Returns '' if in r.e. no such subexpr. or this subexpr.
  615. // not found in input string.
  616. property Match[Idx: integer]: RegExprString read GetMatch;
  617. // get index of group (subexpression) by name, to support named groups
  618. // like in Python: (?P<name>regex)
  619. function MatchIndexFromName(const AName: RegExprString): integer;
  620. function MatchFromName(const AName: RegExprString): RegExprString;
  621. // Returns position in r.e. where compiler stopped.
  622. // Useful for error diagnostics
  623. property CompilerErrorPos: PtrInt read GetCompilerErrorPos;
  624. {$IFDEF UseSpaceChars}
  625. // Contains chars, treated as /s (initially filled with RegExprSpaceChars
  626. // global constant)
  627. property SpaceChars: RegExprString read fSpaceChars write fSpaceChars;
  628. // ###0.927
  629. {$ENDIF}
  630. {$IFDEF UseWordChars}
  631. // Contains chars, treated as /w (initially filled with RegExprWordChars
  632. // global constant)
  633. property WordChars: RegExprString read fWordChars write fWordChars;
  634. // ###0.929
  635. {$ENDIF}
  636. {$IFDEF UseLineSep}
  637. // line separators (like \n in Unix)
  638. property LineSeparators: RegExprString read fLineSeparators write SetLineSeparators; // ###0.941
  639. {$ENDIF}
  640. // support paired line-break CR LF
  641. property UseLinePairedBreak: boolean read fUsePairedBreak write SetUsePairedBreak;
  642. property ReplaceLineEnd: RegExprString read fReplaceLineEnd write fReplaceLineEnd;
  643. property SlowChecksSizeMax: integer read fSlowChecksSizeMax write fSlowChecksSizeMax;
  644. end;
  645. type
  646. ERegExpr = class(Exception)
  647. public
  648. ErrorCode: integer;
  649. CompilerErrorPos: PtrInt;
  650. end;
  651. // true if string AInputString match regular expression ARegExpr
  652. // ! will raise exeption if syntax errors in ARegExpr
  653. function ExecRegExpr(const ARegExpr, AInputStr: RegExprString): boolean;
  654. // Split AInputStr into APieces by r.e. ARegExpr occurencies
  655. procedure SplitRegExpr(const ARegExpr, AInputStr: RegExprString;
  656. APieces: TStrings);
  657. // Returns AInputStr with r.e. occurencies replaced by AReplaceStr
  658. // If AUseSubstitution is true, then AReplaceStr will be used
  659. // as template for Substitution methods.
  660. // For example:
  661. // ReplaceRegExpr ('({-i}block|var)\s*\(\s*([^ ]*)\s*\)\s*',
  662. // 'BLOCK( test1)', 'def "$1" value "$2"', True)
  663. // will return: def 'BLOCK' value 'test1'
  664. // ReplaceRegExpr ('({-i}block|var)\s*\(\s*([^ ]*)\s*\)\s*',
  665. // 'BLOCK( test1)', 'def "$1" value "$2"')
  666. // will return: def "$1" value "$2"
  667. function ReplaceRegExpr(const ARegExpr, AInputStr, AReplaceStr: RegExprString;
  668. AUseSubstitution: boolean{$IFDEF DefParam} = False{$ENDIF}): RegExprString;
  669. {$IFDEF OverMeth}overload; // ###0.947
  670. // Alternate form allowing to set more parameters.
  671. type
  672. TRegexReplaceOption = (
  673. rroModifierI,
  674. rroModifierR,
  675. rroModifierS,
  676. rroModifierG,
  677. rroModifierM,
  678. rroModifierX,
  679. rroUseSubstitution,
  680. rroUseOsLineEnd
  681. );
  682. TRegexReplaceOptions = set of TRegexReplaceOption;
  683. function ReplaceRegExpr(const ARegExpr, AInputStr, AReplaceStr: RegExprString;
  684. Options: TRegexReplaceOptions): RegExprString; overload;
  685. {$ENDIF}
  686. // Replace all metachars with its safe representation,
  687. // for example 'abc$cd.(' converts into 'abc\$cd\.\('
  688. // This function useful for r.e. autogeneration from
  689. // user input
  690. function QuoteRegExprMetaChars(const AStr: RegExprString): RegExprString;
  691. // Makes list of subexpressions found in ARegExpr r.e.
  692. // In ASubExps every item represent subexpression,
  693. // from first to last, in format:
  694. // String - subexpression text (without '()')
  695. // low word of Object - starting position in ARegExpr, including '('
  696. // if exists! (first position is 1)
  697. // high word of Object - length, including starting '(' and ending ')'
  698. // if exist!
  699. // AExtendedSyntax - must be True if modifier /m will be On while
  700. // using the r.e.
  701. // Useful for GUI editors of r.e. etc (You can find example of using
  702. // in TestRExp.dpr project)
  703. // Returns
  704. // 0 Success. No unbalanced brackets was found;
  705. // -1 There are not enough closing brackets ')';
  706. // -(n+1) At position n was found opening '[' without //###0.942
  707. // corresponding closing ']';
  708. // n At position n was found closing bracket ')' without
  709. // corresponding opening '('.
  710. // If Result <> 0, then ASubExpr can contain empty items or illegal ones
  711. function RegExprSubExpressions(const ARegExpr: RegExprString; ASubExprs: TStrings;
  712. AExtendedSyntax: boolean{$IFDEF DefParam} = False{$ENDIF}): integer;
  713. implementation
  714. {$IFDEF FastUnicodeData}
  715. uses
  716. regexpr_unicodedata;
  717. {$ENDIF}
  718. const
  719. // TRegExpr.VersionMajor/Minor return values of these constants:
  720. REVersionMajor = 1;
  721. REVersionMinor = 158;
  722. OpKind_End = REChar(1);
  723. OpKind_MetaClass = REChar(2);
  724. OpKind_Range = REChar(3);
  725. OpKind_Char = REChar(4);
  726. OpKind_CategoryYes = REChar(5);
  727. OpKind_CategoryNo = REChar(6);
  728. RegExprAllSet = [0 .. 255];
  729. RegExprWordSet = [Ord('a') .. Ord('z'), Ord('A') .. Ord('Z'), Ord('0') .. Ord('9'), Ord('_')];
  730. RegExprDigitSet = [Ord('0') .. Ord('9')];
  731. RegExprLowerAzSet = [Ord('a') .. Ord('z')];
  732. RegExprUpperAzSet = [Ord('A') .. Ord('Z')];
  733. RegExprAllAzSet = RegExprLowerAzSet + RegExprUpperAzSet;
  734. RegExprSpaceSet = [Ord(' '), $9, $A, $D, $C];
  735. RegExprLineSeparatorsSet = [$d, $a, $b, $c] {$IFDEF UnicodeRE} + [$85] {$ENDIF};
  736. RegExprHorzSeparatorsSet = [9, $20, $A0];
  737. MaxBracesArg = $7FFFFFFF - 1; // max value for {n,m} arguments //###0.933
  738. type
  739. TRENextOff = PtrInt;
  740. // internal Next "pointer" (offset to current p-code) //###0.933
  741. PRENextOff = ^TRENextOff;
  742. // used for extracting Next "pointers" from compiled r.e. //###0.933
  743. TREBracesArg = integer; // type of {m,n} arguments
  744. PREBracesArg = ^TREBracesArg;
  745. TREGroupKind = (
  746. gkNormalGroup,
  747. gkNonCapturingGroup,
  748. gkNamedGroupReference,
  749. gkComment,
  750. gkModifierString,
  751. gkLookahead,
  752. gkLookaheadNeg,
  753. gkLookbehind,
  754. gkLookbehindNeg,
  755. gkRecursion,
  756. gkSubCall
  757. );
  758. // Alexey T.: handling of that define FPC_REQUIRES_PROPER_ALIGNMENT was present even 15 years ago,
  759. // but with it, we have failing of some RegEx tests, on ARM64 CPU.
  760. // If I undefine FPC_REQUIRES_PROPER_ALIGNMENT, all tests run OK on ARM64 again.
  761. {$undef FPC_REQUIRES_PROPER_ALIGNMENT}
  762. const
  763. REOpSz = SizeOf(TREOp) div SizeOf(REChar);
  764. // size of OP_ command in REChars
  765. {$IFDEF FPC_REQUIRES_PROPER_ALIGNMENT}
  766. // add space for aligning pointer
  767. // -1 is the correct max size but also needed for InsertOperator that needs a multiple of pointer size
  768. RENextOffSz = (2 * SizeOf(TRENextOff) div SizeOf(REChar)) - 1;
  769. REBracesArgSz = (2 * SizeOf(TREBracesArg) div SizeOf(REChar));
  770. // add space for aligning pointer
  771. {$ELSE}
  772. RENextOffSz = (SizeOf(TRENextOff) div SizeOf(REChar));
  773. // size of Next pointer in REChars
  774. REBracesArgSz = SizeOf(TREBracesArg) div SizeOf(REChar);
  775. // size of BRACES arguments in REChars
  776. {$ENDIF}
  777. RENumberSz = SizeOf(LongInt) div SizeOf(REChar);
  778. type
  779. PtrPair = {$IFDEF UnicodeRE} ^LongInt; {$ELSE} ^Word; {$ENDIF}
  780. function IsPairedBreak(p: PRegExprChar): boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
  781. const
  782. cBreak = {$IFDEF UnicodeRE} $000D000A; {$ELSE} $0D0A; {$ENDIF}
  783. begin
  784. Result := PtrPair(p)^ = cBreak;
  785. end;
  786. function _FindCharInBuffer(SBegin, SEnd: PRegExprChar; Ch: REChar): PRegExprChar; {$IFDEF InlineFuncs}inline;{$ENDIF}
  787. begin
  788. while SBegin < SEnd do
  789. begin
  790. if SBegin^ = Ch then
  791. begin
  792. Result := SBegin;
  793. Exit;
  794. end;
  795. Inc(SBegin);
  796. end;
  797. Result := nil;
  798. end;
  799. function IsIgnoredChar(AChar: REChar): boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
  800. begin
  801. case AChar of
  802. ' ', #9, #$d, #$a:
  803. Result := True
  804. else
  805. Result := False;
  806. end;
  807. end;
  808. function _IsMetaChar(AChar: REChar): boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
  809. begin
  810. case AChar of
  811. 'd', 'D',
  812. 's', 'S',
  813. 'w', 'W',
  814. 'v', 'V',
  815. 'h', 'H':
  816. Result := True
  817. else
  818. Result := False;
  819. end;
  820. end;
  821. function AlignToPtr(const p: Pointer): Pointer; {$IFDEF InlineFuncs}inline;{$ENDIF}
  822. begin
  823. {$IFDEF FPC_REQUIRES_PROPER_ALIGNMENT}
  824. Result := Align(p, SizeOf(Pointer));
  825. {$ELSE}
  826. Result := p;
  827. {$ENDIF}
  828. end;
  829. function AlignToInt(const p: Pointer): Pointer; {$IFDEF InlineFuncs}inline;{$ENDIF}
  830. begin
  831. {$IFDEF FPC_REQUIRES_PROPER_ALIGNMENT}
  832. Result := Align(p, SizeOf(integer));
  833. {$ELSE}
  834. Result := p;
  835. {$ENDIF}
  836. end;
  837. {$IFDEF FastUnicodeData}
  838. function _UpperCase(Ch: REChar): REChar; inline;
  839. begin
  840. Result := CharUpperArray[Ord(Ch)];
  841. end;
  842. function _LowerCase(Ch: REChar): REChar; inline;
  843. begin
  844. Result := CharLowerArray[Ord(Ch)];
  845. end;
  846. {$ELSE}
  847. function _UpperCase(Ch: REChar): REChar;
  848. begin
  849. Result := Ch;
  850. if (Ch >= 'a') and (Ch <= 'z') then
  851. begin
  852. Dec(Result, 32);
  853. Exit;
  854. end;
  855. if Ord(Ch) < 128 then
  856. Exit;
  857. {$IFDEF FPC}
  858. {$IFDEF UnicodeRE}
  859. Result := UnicodeUpperCase(Ch)[1];
  860. {$ELSE}
  861. Result := AnsiUpperCase(Ch)[1];
  862. {$ENDIF}
  863. {$ELSE}
  864. {$IFDEF UnicodeRE}
  865. {$IFDEF D_XE4}
  866. Result := Ch.ToUpper;
  867. {$ELSE}
  868. {$IFDEF D2009}
  869. Result := TCharacter.ToUpper(Ch);
  870. {$ENDIF}
  871. {$ENDIF}
  872. {$ELSE}
  873. Result := AnsiUpperCase(Ch)[1];
  874. {$ENDIF}
  875. {$ENDIF}
  876. end;
  877. function _LowerCase(Ch: REChar): REChar;
  878. begin
  879. Result := Ch;
  880. if (Ch >= 'A') and (Ch <= 'Z') then
  881. begin
  882. Inc(Result, 32);
  883. Exit;
  884. end;
  885. if Ord(Ch) < 128 then
  886. Exit;
  887. {$IFDEF FPC}
  888. {$IFDEF UnicodeRE}
  889. Result := UnicodeLowerCase(Ch)[1];
  890. {$ELSE}
  891. Result := AnsiLowerCase(Ch)[1];
  892. {$ENDIF}
  893. {$ELSE}
  894. {$IFDEF UnicodeRE}
  895. {$IFDEF D_XE4}
  896. Result := Ch.ToLower;
  897. {$ELSE}
  898. {$IFDEF D2009}
  899. Result := TCharacter.ToLower(Ch);
  900. {$ENDIF}
  901. {$ENDIF}
  902. {$ELSE}
  903. Result := AnsiLowerCase(Ch)[1];
  904. {$ENDIF}
  905. {$ENDIF}
  906. end;
  907. {$ENDIF}
  908. function InvertCase(const Ch: REChar): REChar; {$IFDEF InlineFuncs}inline;{$ENDIF}
  909. begin
  910. Result := _UpperCase(Ch);
  911. if Result = Ch then
  912. Result := _LowerCase(Ch);
  913. end;
  914. function _FindClosingBracket(P, PEnd: PRegExprChar): PRegExprChar;
  915. var
  916. Level: integer;
  917. begin
  918. Result := nil;
  919. Level := 1;
  920. repeat
  921. if P >= PEnd then Exit;
  922. case P^ of
  923. EscChar:
  924. Inc(P);
  925. '(':
  926. begin
  927. Inc(Level);
  928. end;
  929. ')':
  930. begin
  931. Dec(Level);
  932. if Level = 0 then
  933. begin
  934. Result := P;
  935. Exit;
  936. end;
  937. end;
  938. end;
  939. Inc(P);
  940. until False;
  941. end;
  942. {$IFDEF UNICODEEX}
  943. procedure IncUnicode(var p: PRegExprChar); {$IFDEF InlineFuncs}inline;{$ENDIF}
  944. // make additional increment if we are on low-surrogate AnsiChar
  945. // no need to check p<fInputEnd, at the end of string we have chr(0)
  946. var
  947. ch: REChar;
  948. begin
  949. Inc(p);
  950. ch := p^;
  951. if (Ord(ch) >= $DC00) and (Ord(ch) <= $DFFF) then
  952. Inc(p);
  953. end;
  954. procedure IncUnicode2(var p: PRegExprChar; var N: integer); {$IFDEF InlineFuncs}inline;{$ENDIF}
  955. var
  956. ch: REChar;
  957. begin
  958. Inc(p);
  959. Inc(N);
  960. ch := p^;
  961. if (Ord(ch) >= $DC00) and (Ord(ch) <= $DFFF) then
  962. begin
  963. Inc(p);
  964. Inc(N);
  965. end;
  966. end;
  967. {$ENDIF}
  968. { ============================================================= }
  969. { ===================== Global functions ====================== }
  970. { ============================================================= }
  971. function IsModifiersEqual(const A, B: TRegExprModifiers): boolean;
  972. begin
  973. Result :=
  974. (A.I = B.I) and
  975. (A.G = B.G) and
  976. (A.M = B.M) and
  977. (A.S = B.S) and
  978. (A.R = B.R) and
  979. (A.X = B.X);
  980. end;
  981. function ParseModifiers(const APtr: PRegExprChar;
  982. ALen: integer;
  983. var AValue: TRegExprModifiers): boolean;
  984. // Parse string and set AValue if it's in format 'ismxrg-ismxrg'
  985. var
  986. IsOn: boolean;
  987. i: integer;
  988. begin
  989. Result := True;
  990. IsOn := True;
  991. for i := 0 to ALen-1 do
  992. case APtr[i] of
  993. '-':
  994. IsOn := False;
  995. 'I', 'i':
  996. AValue.I := IsOn;
  997. 'R', 'r':
  998. AValue.R := IsOn;
  999. 'S', 's':
  1000. AValue.S := IsOn;
  1001. 'G', 'g':
  1002. AValue.G := IsOn;
  1003. 'M', 'm':
  1004. AValue.M := IsOn;
  1005. 'X', 'x':
  1006. AValue.X := IsOn;
  1007. else
  1008. begin
  1009. Result := False;
  1010. Exit;
  1011. end;
  1012. end;
  1013. end;
  1014. function ExecRegExpr(const ARegExpr, AInputStr: RegExprString): boolean;
  1015. var
  1016. r: TRegExpr;
  1017. begin
  1018. r := TRegExpr.Create;
  1019. try
  1020. r.Expression := ARegExpr;
  1021. Result := r.Exec(AInputStr);
  1022. finally
  1023. r.Free;
  1024. end;
  1025. end; { of function ExecRegExpr
  1026. -------------------------------------------------------------- }
  1027. procedure SplitRegExpr(const ARegExpr, AInputStr: RegExprString;
  1028. APieces: TStrings);
  1029. var
  1030. r: TRegExpr;
  1031. begin
  1032. APieces.Clear;
  1033. r := TRegExpr.Create;
  1034. try
  1035. r.Expression := ARegExpr;
  1036. r.Split(AInputStr, APieces);
  1037. finally
  1038. r.Free;
  1039. end;
  1040. end; { of procedure SplitRegExpr
  1041. -------------------------------------------------------------- }
  1042. function ReplaceRegExpr(const ARegExpr, AInputStr, AReplaceStr: RegExprString;
  1043. AUseSubstitution: boolean{$IFDEF DefParam} = False{$ENDIF}): RegExprString;
  1044. begin
  1045. with TRegExpr.Create do
  1046. try
  1047. Expression := ARegExpr;
  1048. Result := Replace(AInputStr, AReplaceStr, AUseSubstitution);
  1049. finally
  1050. Free;
  1051. end;
  1052. end; { of function ReplaceRegExpr
  1053. -------------------------------------------------------------- }
  1054. {$IFDEF OverMeth}
  1055. function ReplaceRegExpr(const ARegExpr, AInputStr, AReplaceStr: RegExprString;
  1056. Options: TRegexReplaceOptions): RegExprString; overload;
  1057. begin
  1058. with TRegExpr.Create do
  1059. try
  1060. ModifierI := (rroModifierI in Options);
  1061. ModifierR := (rroModifierR in Options);
  1062. ModifierS := (rroModifierS in Options);
  1063. ModifierG := (rroModifierG in Options);
  1064. ModifierM := (rroModifierM in Options);
  1065. ModifierX := (rroModifierX in Options);
  1066. // Set this after the above, if the regex contains modifiers, they will be applied.
  1067. Expression := ARegExpr;
  1068. if rroUseOsLineEnd in Options then
  1069. ReplaceLineEnd := sLineBreak
  1070. else
  1071. ReplaceLineEnd := #10;
  1072. Result := Replace(AInputStr, AReplaceStr, rroUseSubstitution in Options);
  1073. finally
  1074. Free;
  1075. end;
  1076. end;
  1077. {$ENDIF}
  1078. (*
  1079. const
  1080. MetaChars_Init = '^$.[()|?+*' + EscChar + '{';
  1081. MetaChars = MetaChars_Init; // not needed to be a variable, const is faster
  1082. MetaAll = MetaChars_Init + ']}'; // Very similar to MetaChars, but slighly changed.
  1083. *)
  1084. function _IsMetaSymbol1(ch: REChar): boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
  1085. begin
  1086. case ch of
  1087. '^', '$', '.', '[', '(', ')', '|', '?', '+', '*', EscChar, '{':
  1088. Result := True
  1089. else
  1090. Result := False
  1091. end;
  1092. end;
  1093. function _IsMetaSymbol2(ch: REChar): boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
  1094. begin
  1095. case ch of
  1096. '^', '$', '.', '[', '(', ')', '|', '?', '+', '*', EscChar, '{',
  1097. ']', '}':
  1098. Result := True
  1099. else
  1100. Result := False
  1101. end;
  1102. end;
  1103. function QuoteRegExprMetaChars(const AStr: RegExprString): RegExprString;
  1104. var
  1105. i, i0, Len: integer;
  1106. ch: REChar;
  1107. begin
  1108. Result := '';
  1109. Len := Length(AStr);
  1110. i := 1;
  1111. i0 := i;
  1112. while i <= Len do
  1113. begin
  1114. ch := AStr[i];
  1115. if _IsMetaSymbol2(ch) then
  1116. begin
  1117. Result := Result + System.Copy(AStr, i0, i - i0) + EscChar + ch;
  1118. i0 := i + 1;
  1119. end;
  1120. Inc(i);
  1121. end;
  1122. Result := Result + System.Copy(AStr, i0, MaxInt); // Tail
  1123. end; { of function QuoteRegExprMetaChars
  1124. -------------------------------------------------------------- }
  1125. function RegExprSubExpressions(const ARegExpr: RegExprString; ASubExprs: TStrings;
  1126. AExtendedSyntax: boolean{$IFDEF DefParam} = False{$ENDIF}): integer;
  1127. type
  1128. TStackItemRec = record // ###0.945
  1129. SubExprIdx: integer;
  1130. StartPos: PtrInt;
  1131. end;
  1132. TStackArray = packed array [0 .. RegexMaxMaxGroups - 1] of TStackItemRec;
  1133. var
  1134. Len, SubExprLen: integer;
  1135. i, i0: integer;
  1136. Modif: TRegExprModifiers;
  1137. Stack: ^TStackArray; // ###0.945
  1138. StackIdx, StackSz: integer;
  1139. begin
  1140. Result := 0; // no unbalanced brackets found at this very moment
  1141. FillChar(Modif, SizeOf(Modif), 0);
  1142. ASubExprs.Clear; // I don't think that adding to non empty list
  1143. // can be useful, so I simplified algorithm to work only with empty list
  1144. Len := Length(ARegExpr); // some optimization tricks
  1145. // first we have to calculate number of subexpression to reserve
  1146. // space in Stack array (may be we'll reserve more than needed, but
  1147. // it's faster then memory reallocation during parsing)
  1148. StackSz := 1; // add 1 for entire r.e.
  1149. for i := 1 to Len do
  1150. if ARegExpr[i] = '(' then
  1151. Inc(StackSz);
  1152. // SetLength (Stack, StackSz); //###0.945
  1153. GetMem(Stack, SizeOf(TStackItemRec) * StackSz);
  1154. try
  1155. StackIdx := 0;
  1156. i := 1;
  1157. while (i <= Len) do
  1158. begin
  1159. case ARegExpr[i] of
  1160. '(':
  1161. begin
  1162. if (i < Len) and (ARegExpr[i + 1] = '?') then
  1163. begin
  1164. // this is not subexpression, but comment or other
  1165. // Perl extension. We must check is it (?ismxrg-ismxrg)
  1166. // and change AExtendedSyntax if /x is changed.
  1167. Inc(i, 2); // skip '(?'
  1168. i0 := i;
  1169. while (i <= Len) and (ARegExpr[i] <> ')') do
  1170. Inc(i);
  1171. if i > Len then
  1172. Result := -1 // unbalansed '('
  1173. else
  1174. if ParseModifiers(@ARegExpr[i0], i - i0, Modif) then
  1175. // Alexey-T: original code had copy from i, not from i0
  1176. AExtendedSyntax := Modif.X;
  1177. end
  1178. else
  1179. begin // subexpression starts
  1180. ASubExprs.Add(''); // just reserve space
  1181. with Stack[StackIdx] do
  1182. begin
  1183. SubExprIdx := ASubExprs.Count - 1;
  1184. StartPos := i;
  1185. end;
  1186. Inc(StackIdx);
  1187. end;
  1188. end;
  1189. ')':
  1190. begin
  1191. if StackIdx = 0 then
  1192. Result := i // unbalanced ')'
  1193. else
  1194. begin
  1195. Dec(StackIdx);
  1196. with Stack[StackIdx] do
  1197. begin
  1198. SubExprLen := i - StartPos + 1;
  1199. ASubExprs.Objects[SubExprIdx] :=
  1200. TObject(StartPos or (SubExprLen ShL 16));
  1201. ASubExprs[SubExprIdx] := System.Copy(ARegExpr, StartPos + 1,
  1202. SubExprLen - 2); // add without brackets
  1203. end;
  1204. end;
  1205. end;
  1206. EscChar:
  1207. Inc(i); // skip quoted symbol
  1208. '[':
  1209. begin
  1210. // we have to skip character ranges at once, because they can
  1211. // contain '#', and '#' in it must NOT be recognized as eXtended
  1212. // comment beginning!
  1213. i0 := i;
  1214. Inc(i);
  1215. if ARegExpr[i] = ']' // first ']' inside [] treated as simple AnsiChar, no need to check '['
  1216. then
  1217. Inc(i);
  1218. while (i <= Len) and (ARegExpr[i] <> ']') do
  1219. if ARegExpr[i] = EscChar // ###0.942
  1220. then
  1221. Inc(i, 2) // skip 'escaped' AnsiChar to prevent stopping at '\]'
  1222. else
  1223. Inc(i);
  1224. if (i > Len) or (ARegExpr[i] <> ']') // ###0.942
  1225. then
  1226. Result := -(i0 + 1); // unbalansed '[' //###0.942
  1227. end;
  1228. '#':
  1229. if AExtendedSyntax then
  1230. begin
  1231. // skip eXtended comments
  1232. while (i <= Len) and (ARegExpr[i] <> #$d) and (ARegExpr[i] <> #$a)
  1233. // do not use [#$d, #$a] due to Unicode compatibility
  1234. do
  1235. Inc(i);
  1236. while (i + 1 <= Len) and
  1237. ((ARegExpr[i + 1] = #$d) or (ARegExpr[i + 1] = #$a)) do
  1238. Inc(i); // attempt to work with different kinds of line separators
  1239. // now we are at the line separator that must be skipped.
  1240. end;
  1241. // here is no 'else' clause - we simply skip ordinary chars
  1242. end; // of case
  1243. Inc(i); // skip scanned AnsiChar
  1244. // ! can move after Len due to skipping quoted symbol
  1245. end;
  1246. // check brackets balance
  1247. if StackIdx <> 0 then
  1248. Result := -1; // unbalansed '('
  1249. // check if entire r.e. added
  1250. if (ASubExprs.Count = 0) or ((PtrInt(ASubExprs.Objects[0]) and $FFFF) <> 1)
  1251. or (((PtrInt(ASubExprs.Objects[0]) ShR 16) and $FFFF) <> Len)
  1252. // whole r.e. wasn't added because it isn't bracketed
  1253. // well, we add it now:
  1254. then
  1255. ASubExprs.InsertObject(0, ARegExpr, TObject((Len ShL 16) or 1));
  1256. finally
  1257. FreeMem(Stack);
  1258. end;
  1259. end; { of function RegExprSubExpressions
  1260. -------------------------------------------------------------- }
  1261. const
  1262. OP_MAGIC = TREOp(216); // programm signature
  1263. // name opcode opnd? meaning
  1264. OP_EEND = TREOp(0); // - End of program
  1265. OP_BOL = TREOp(1); // - Match "" at beginning of line
  1266. OP_EOL = TREOp(2); // - Match "" at end of line
  1267. OP_ANY = TREOp(3); // - Match any one character
  1268. OP_ANYOF = TREOp(4); // Str Match any character in string Str
  1269. OP_ANYBUT = TREOp(5); // Str Match any AnsiChar. not in string Str
  1270. OP_BRANCH = TREOp(6); // Node Match this alternative, or the next
  1271. OP_BACK = TREOp(7); // - Jump backward (Next < 0)
  1272. OP_EXACTLY = TREOp(8); // Str Match string Str
  1273. OP_NOTHING = TREOp(9); // - Match empty string
  1274. OP_STAR = TREOp(10); // Node Match this (simple) thing 0 or more times
  1275. OP_PLUS = TREOp(11); // Node Match this (simple) thing 1 or more times
  1276. OP_ANYDIGIT = TREOp(12); // - Match any digit (equiv [0-9])
  1277. OP_NOTDIGIT = TREOp(13); // - Match not digit (equiv [0-9])
  1278. OP_ANYLETTER = TREOp(14); // - Match any letter from property WordChars
  1279. OP_NOTLETTER = TREOp(15); // - Match not letter from property WordChars
  1280. OP_ANYSPACE = TREOp(16); // - Match any space AnsiChar (see property SpaceChars)
  1281. OP_NOTSPACE = TREOp(17); // - Match not space AnsiChar (see property SpaceChars)
  1282. OP_BRACES = TREOp(18);
  1283. // Node,Min,Max Match this (simple) thing from Min to Max times.
  1284. // Min and Max are TREBracesArg
  1285. OP_COMMENT = TREOp(19); // - Comment ;)
  1286. OP_EXACTLYCI = TREOp(20); // Str Match string Str case insensitive
  1287. OP_ANYOFCI = TREOp(21);
  1288. // Str Match any character in string Str, case insensitive
  1289. OP_ANYBUTCI = TREOp(22);
  1290. // Str Match any AnsiChar. not in string Str, case insensitive
  1291. OP_LOOPENTRY = TREOp(23); // Node Start of loop (Node - LOOP for this loop)
  1292. OP_LOOP = TREOp(24); // Node,Min,Max,LoopEntryJmp - back jump for LOOPENTRY.
  1293. // Min and Max are TREBracesArg
  1294. // Node - next node in sequence,
  1295. // LoopEntryJmp - associated LOOPENTRY node addr
  1296. OP_EOL2 = TReOp(25); // like OP_EOL but also matches before final line-break
  1297. OP_BSUBEXP = TREOp(28);
  1298. // Idx Match previously matched subexpression #Idx (stored as REChar) //###0.936
  1299. OP_BSUBEXPCI = TREOp(29); // Idx -"- in case-insensitive mode
  1300. // Non-Greedy Style Ops //###0.940
  1301. OP_STARNG = TREOp(30); // Same as OP_START but in non-greedy mode
  1302. OP_PLUSNG = TREOp(31); // Same as OP_PLUS but in non-greedy mode
  1303. OP_BRACESNG = TREOp(32); // Same as OP_BRACES but in non-greedy mode
  1304. OP_LOOPNG = TREOp(33); // Same as OP_LOOP but in non-greedy mode
  1305. // Multiline mode \m
  1306. OP_BOLML = TREOp(34); // - Match "" at beginning of line
  1307. OP_EOLML = TREOp(35); // - Match "" at end of line
  1308. OP_ANYML = TREOp(36); // - Match any one character
  1309. // Word boundary
  1310. OP_BOUND = TREOp(37); // Match "" between words //###0.943
  1311. OP_NOTBOUND = TREOp(38); // Match "" not between words //###0.943
  1312. OP_ANYHORZSEP = TREOp(39); // Any horizontal whitespace \h
  1313. OP_NOTHORZSEP = TREOp(40); // Not horizontal whitespace \H
  1314. OP_ANYVERTSEP = TREOp(41); // Any vertical whitespace \v
  1315. OP_NOTVERTSEP = TREOp(42); // Not vertical whitespace \V
  1316. OP_ANYCATEGORY = TREOp(43); // \p{L}
  1317. OP_NOTCATEGORY = TREOp(44); // \P{L}
  1318. OP_STAR_POSS = TReOp(45);
  1319. OP_PLUS_POSS = TReOp(46);
  1320. OP_BRACES_POSS = TReOp(47);
  1321. OP_RECUR = TReOp(48);
  1322. // !!! Change OP_OPEN value if you add new opcodes !!!
  1323. OP_OPEN = TREOp(50); // Opening of group; OP_OPEN+i is for group i
  1324. OP_OPEN_FIRST = Succ(OP_OPEN);
  1325. OP_OPEN_LAST = TREOp(Ord(OP_OPEN) + RegexMaxGroups - 1);
  1326. OP_CLOSE = Succ(OP_OPEN_LAST); // Closing of group; OP_CLOSE+i is for group i
  1327. OP_CLOSE_FIRST = Succ(OP_CLOSE);
  1328. OP_CLOSE_LAST = TReOp(Ord(OP_CLOSE) + RegexMaxGroups - 1);
  1329. OP_SUBCALL = Succ(OP_CLOSE_LAST); // Call of subroutine; OP_SUBCALL+i is for group i
  1330. OP_SUBCALL_FIRST = Succ(OP_SUBCALL);
  1331. OP_SUBCALL_LAST =
  1332. {$IFDEF UnicodeRE}
  1333. TReOp(Ord(OP_SUBCALL) + RegexMaxGroups - 1);
  1334. {$ELSE}
  1335. High(REChar); // must fit to 0..255 range
  1336. {$ENDIF}
  1337. // We work with p-code through pointers, compatible with PRegExprChar.
  1338. // Note: all code components (TRENextOff, TREOp, TREBracesArg, etc)
  1339. // must have lengths that can be divided by SizeOf (REChar) !
  1340. // A node is TREOp of opcode followed Next "pointer" of TRENextOff type.
  1341. // The Next is a offset from the opcode of the node containing it.
  1342. // An operand, if any, simply follows the node. (Note that much of
  1343. // the code generation knows about this implicit relationship!)
  1344. // Using TRENextOff=PtrInt speed up p-code processing.
  1345. // Opcodes description:
  1346. //
  1347. // BRANCH The set of branches constituting a single choice are hooked
  1348. // together with their "next" pointers, since precedence prevents
  1349. // anything being concatenated to any individual branch. The
  1350. // "next" pointer of the last BRANCH in a choice points to the
  1351. // thing following the whole choice. This is also where the
  1352. // final "next" pointer of each individual branch points; each
  1353. // branch starts with the operand node of a BRANCH node.
  1354. // BACK Normal "next" pointers all implicitly point forward; BACK
  1355. // exists to make loop structures possible.
  1356. // STAR,PLUS,BRACES '?', and complex '*' and '+', are implemented as
  1357. // circular BRANCH structures using BACK. Complex '{min,max}'
  1358. // - as pair LOOPENTRY-LOOP (see below). Simple cases (one
  1359. // character per match) are implemented with STAR, PLUS and
  1360. // BRACES for speed and to minimize recursive plunges.
  1361. // LOOPENTRY,LOOP {min,max} are implemented as special pair
  1362. // LOOPENTRY-LOOP. Each LOOPENTRY initialize loopstack for
  1363. // current level.
  1364. // OPEN,CLOSE are numbered at compile time.
  1365. { ============================================================= }
  1366. { ================== Error handling section =================== }
  1367. { ============================================================= }
  1368. const
  1369. reeOk = 0;
  1370. reeCompNullArgument = 100;
  1371. reeUnknownMetaSymbol = 101;
  1372. reeCompParseRegTooManyBrackets = 102;
  1373. reeCompParseRegUnmatchedBrackets = 103;
  1374. reeCompParseRegUnmatchedBrackets2 = 104;
  1375. reeCompParseRegJunkOnEnd = 105;
  1376. reePlusStarOperandCouldBeEmpty = 106;
  1377. reeNestedQuantif = 107;
  1378. reeBadHexDigit = 108;
  1379. reeInvalidRange = 109;
  1380. reeParseAtomTrailingBackSlash = 110;
  1381. reeNoHexCodeAfterBSlashX = 111;
  1382. reeHexCodeAfterBSlashXTooBig = 112;
  1383. reeUnmatchedSqBrackets = 113;
  1384. reeInternalUrp = 114;
  1385. reeQuantifFollowsNothing = 115;
  1386. reeTrailingBackSlash = 116;
  1387. reeNoLetterAfterBSlashC = 117;
  1388. reeMetaCharAfterMinusInRange = 118;
  1389. reeRarseAtomInternalDisaster = 119;
  1390. reeIncorrectSpecialBrackets = 120;
  1391. reeIncorrectBraces = 121;
  1392. reeBRACESArgTooBig = 122;
  1393. reeUnknownOpcodeInFillFirst = 123;
  1394. reeBracesMinParamGreaterMax = 124;
  1395. reeUnclosedComment = 125;
  1396. reeComplexBracesNotImplemented = 126;
  1397. reeUnrecognizedModifier = 127;
  1398. reeBadLinePairedSeparator = 128;
  1399. reeBadUnicodeCategory = 129;
  1400. reeTooSmallCheckersArray = 130;
  1401. reePossessiveAfterComplexBraces = 131;
  1402. reeBadRecursion = 132;
  1403. reeBadSubCall = 133;
  1404. reeNamedGroupBad = 140;
  1405. reeNamedGroupBadName = 141;
  1406. reeNamedGroupBadRef = 142;
  1407. reeNamedGroupDupName = 143;
  1408. reeLookaheadBad = 150;
  1409. reeLookbehindBad = 152;
  1410. reeLookbehindTooComplex = 153;
  1411. reeLookaroundNotAtEdge = 154;
  1412. // Runtime errors must be >= reeFirstRuntimeCode
  1413. reeFirstRuntimeCode = 1000;
  1414. reeRegRepeatCalledInappropriately = 1000;
  1415. reeMatchPrimMemoryCorruption = 1001;
  1416. reeMatchPrimCorruptedPointers = 1002;
  1417. reeNoExpression = 1003;
  1418. reeCorruptedProgram = 1004;
  1419. reeOffsetMustBePositive = 1006;
  1420. reeExecNextWithoutExec = 1007;
  1421. reeBadOpcodeInCharClass = 1008;
  1422. reeDumpCorruptedOpcode = 1011;
  1423. reeModifierUnsupported = 1013;
  1424. reeLoopStackExceeded = 1014;
  1425. reeLoopWithoutEntry = 1015;
  1426. function TRegExpr.ErrorMsg(AErrorID: integer): RegExprString;
  1427. begin
  1428. case AErrorID of
  1429. reeOk:
  1430. Result := 'No errors';
  1431. reeCompNullArgument:
  1432. Result := 'TRegExpr compile: null argument';
  1433. reeUnknownMetaSymbol:
  1434. Result := 'TRegExpr compile: unknown meta-character: \' + fLastErrorSymbol;
  1435. reeCompParseRegTooManyBrackets:
  1436. Result := 'TRegExpr compile: ParseReg: too many ()';
  1437. reeCompParseRegUnmatchedBrackets:
  1438. Result := 'TRegExpr compile: ParseReg: unmatched ()';
  1439. reeCompParseRegUnmatchedBrackets2:
  1440. Result := 'TRegExpr compile: ParseReg: unmatched ()';
  1441. reeCompParseRegJunkOnEnd:
  1442. Result := 'TRegExpr compile: ParseReg: junk at end';
  1443. reePlusStarOperandCouldBeEmpty:
  1444. Result := 'TRegExpr compile: *+ operand could be empty';
  1445. reeNestedQuantif:
  1446. Result := 'TRegExpr compile: nested quantifier *?+';
  1447. reeBadHexDigit:
  1448. Result := 'TRegExpr compile: bad hex digit';
  1449. reeInvalidRange:
  1450. Result := 'TRegExpr compile: invalid [] range';
  1451. reeParseAtomTrailingBackSlash:
  1452. Result := 'TRegExpr compile: parse atom trailing \';
  1453. reeNoHexCodeAfterBSlashX:
  1454. Result := 'TRegExpr compile: no hex code after \x';
  1455. reeNoLetterAfterBSlashC:
  1456. Result := 'TRegExpr compile: no letter "A".."Z" after \c';
  1457. reeMetaCharAfterMinusInRange:
  1458. Result := 'TRegExpr compile: metachar after "-" in [] range';
  1459. reeHexCodeAfterBSlashXTooBig:
  1460. Result := 'TRegExpr compile: hex code after \x is too big';
  1461. reeUnmatchedSqBrackets:
  1462. Result := 'TRegExpr compile: unmatched []';
  1463. reeInternalUrp:
  1464. Result := 'TRegExpr compile: internal fail on AnsiChar "|", ")"';
  1465. reeQuantifFollowsNothing:
  1466. Result := 'TRegExpr compile: quantifier ?+*{ follows nothing';
  1467. reeTrailingBackSlash:
  1468. Result := 'TRegExpr compile: trailing \';
  1469. reeRarseAtomInternalDisaster:
  1470. Result := 'TRegExpr compile: RarseAtom internal disaster';
  1471. reeIncorrectSpecialBrackets:
  1472. Result := 'TRegExpr compile: incorrect expression in (?...) brackets';
  1473. reeIncorrectBraces:
  1474. Result := 'TRegExpr compile: incorrect {} braces';
  1475. reeBRACESArgTooBig:
  1476. Result := 'TRegExpr compile: braces {} argument too big';
  1477. reeUnknownOpcodeInFillFirst:
  1478. Result := 'TRegExpr compile: unknown opcode in FillFirstCharSet ('+DumpOp(fLastErrorOpcode)+')';
  1479. reeBracesMinParamGreaterMax:
  1480. Result := 'TRegExpr compile: braces {} min param greater then max';
  1481. reeUnclosedComment:
  1482. Result := 'TRegExpr compile: unclosed (?#comment)';
  1483. reeComplexBracesNotImplemented:
  1484. Result := 'TRegExpr compile: if you use braces {} and non-greedy ops *?, +?, ?? for complex cases, enable {$DEFINE ComplexBraces}';
  1485. reeUnrecognizedModifier:
  1486. Result := 'TRegExpr compile: incorrect modifier in (?...)';
  1487. reeBadLinePairedSeparator:
  1488. Result := 'TRegExpr compile: LinePairedSeparator must countain two different chars or be empty';
  1489. reeBadUnicodeCategory:
  1490. Result := 'TRegExpr compile: invalid category after \p or \P';
  1491. reeTooSmallCheckersArray:
  1492. Result := 'TRegExpr compile: too small CharCheckers array';
  1493. reePossessiveAfterComplexBraces:
  1494. Result := 'TRegExpr compile: possessive + after complex braces: (foo){n,m}+';
  1495. reeBadRecursion:
  1496. Result := 'TRegExpr compile: bad recursion (?R)';
  1497. reeBadSubCall:
  1498. Result := 'TRegExpr compile: bad subroutine call';
  1499. reeNamedGroupBad:
  1500. Result := 'TRegExpr compile: bad named group';
  1501. reeNamedGroupBadName:
  1502. Result := 'TRegExpr compile: bad identifier in named group';
  1503. reeNamedGroupBadRef:
  1504. Result := 'TRegExpr compile: bad back-reference to named group';
  1505. reeNamedGroupDupName:
  1506. Result := 'TRegExpr compile: named group defined more than once';
  1507. reeLookaheadBad:
  1508. Result := 'TRegExpr compile: bad lookahead';
  1509. reeLookbehindBad:
  1510. Result := 'TRegExpr compile: bad lookbehind';
  1511. reeLookbehindTooComplex:
  1512. Result := 'TRegExpr compile: lookbehind (?<!foo) must have fixed length';
  1513. reeLookaroundNotAtEdge:
  1514. Result := 'TRegExpr compile: lookaround brackets must be at the very beginning/ending';
  1515. reeRegRepeatCalledInappropriately:
  1516. Result := 'TRegExpr exec: RegRepeat called inappropriately';
  1517. reeMatchPrimMemoryCorruption:
  1518. Result := 'TRegExpr exec: MatchPrim memory corruption';
  1519. reeMatchPrimCorruptedPointers:
  1520. Result := 'TRegExpr exec: MatchPrim corrupted pointers';
  1521. reeNoExpression:
  1522. Result := 'TRegExpr exec: empty expression';
  1523. reeCorruptedProgram:
  1524. Result := 'TRegExpr exec: corrupted opcode (no magic byte)';
  1525. reeOffsetMustBePositive:
  1526. Result := 'TRegExpr exec: offset must be >0';
  1527. reeExecNextWithoutExec:
  1528. Result := 'TRegExpr exec: ExecNext without Exec(Pos)';
  1529. reeBadOpcodeInCharClass:
  1530. Result := 'TRegExpr exec: invalid opcode in AnsiChar class';
  1531. reeDumpCorruptedOpcode:
  1532. Result := 'TRegExpr dump: corrupted opcode';
  1533. reeLoopStackExceeded:
  1534. Result := 'TRegExpr exec: loop stack exceeded';
  1535. reeLoopWithoutEntry:
  1536. Result := 'TRegExpr exec: loop without loop entry';
  1537. else
  1538. Result := 'Unknown error';
  1539. end;
  1540. end; { of procedure TRegExpr.Error
  1541. -------------------------------------------------------------- }
  1542. function TRegExpr.LastError: integer;
  1543. begin
  1544. Result := fLastError;
  1545. fLastError := reeOk;
  1546. end; { of function TRegExpr.LastError
  1547. -------------------------------------------------------------- }
  1548. { ============================================================= }
  1549. { ===================== Common section ======================== }
  1550. { ============================================================= }
  1551. class function TRegExpr.VersionMajor: integer;
  1552. begin
  1553. Result := REVersionMajor;
  1554. end;
  1555. class function TRegExpr.VersionMinor: integer;
  1556. begin
  1557. Result := REVersionMinor;
  1558. end;
  1559. constructor TRegExpr.Create;
  1560. begin
  1561. inherited;
  1562. programm := nil;
  1563. fExpression := '';
  1564. fInputString := '';
  1565. FillChar(fModifiers, SizeOf(fModifiers), 0);
  1566. fModifiers.I := RegExprModifierI;
  1567. fModifiers.R := RegExprModifierR;
  1568. fModifiers.S := RegExprModifierS;
  1569. fModifiers.G := RegExprModifierG;
  1570. fModifiers.M := RegExprModifierM;
  1571. fModifiers.X := RegExprModifierX;
  1572. {$IFDEF UseSpaceChars}
  1573. SpaceChars := RegExprSpaceChars;
  1574. {$ENDIF}
  1575. {$IFDEF UseWordChars}
  1576. WordChars := RegExprWordChars;
  1577. {$ENDIF}
  1578. {$IFDEF UseLineSep}
  1579. fLineSeparators := RegExprLineSeparators;
  1580. {$ENDIF}
  1581. fUsePairedBreak := RegExprUsePairedBreak;
  1582. fReplaceLineEnd := RegExprReplaceLineBreak;
  1583. fSlowChecksSizeMax := 2000;
  1584. {$IFDEF UseLineSep}
  1585. InitLineSepArray;
  1586. {$ENDIF}
  1587. InitCharCheckers;
  1588. {$IFDEF Compat}
  1589. fInvertCase := OldInvertCase;
  1590. {$ENDIF}
  1591. end; { of constructor TRegExpr.Create
  1592. -------------------------------------------------------------- }
  1593. {$IFDEF OverMeth}
  1594. constructor TRegExpr.Create(const AExpression: RegExprString);
  1595. begin
  1596. Create;
  1597. Expression := AExpression;
  1598. end;
  1599. {$ENDIF}
  1600. destructor TRegExpr.Destroy;
  1601. begin
  1602. if programm <> nil then
  1603. begin
  1604. FreeMem(programm);
  1605. programm := nil;
  1606. end;
  1607. if Assigned(fHelper) then
  1608. FreeAndNil(fHelper);
  1609. end;
  1610. procedure TRegExpr.SetExpression(const AStr: RegExprString);
  1611. begin
  1612. if (AStr <> fExpression) or not IsCompiled then
  1613. begin
  1614. fExpression := AStr;
  1615. UniqueString(fExpression);
  1616. fRegexStart := PRegExprChar(fExpression);
  1617. fRegexEnd := fRegexStart + Length(fExpression);
  1618. InvalidateProgramm;
  1619. end;
  1620. end; { of procedure TRegExpr.SetExpression
  1621. -------------------------------------------------------------- }
  1622. function TRegExpr.GetSubExprCount: integer;
  1623. begin
  1624. // if nothing found, we must return -1 per TRegExpr docs
  1625. if GrpBounds[0].GrpStart[0] = nil then
  1626. Result := -1
  1627. else
  1628. Result := GrpCount;
  1629. end;
  1630. function TRegExpr.GetMatchPos(Idx: integer): PtrInt;
  1631. begin
  1632. Idx := GrpIndexes[Idx];
  1633. if (Idx >= 0) and (GrpBounds[0].GrpStart[Idx] <> nil) then
  1634. Result := GrpBounds[0].GrpStart[Idx] - fInputStart + 1
  1635. else
  1636. Result := -1;
  1637. end; { of function TRegExpr.GetMatchPos
  1638. -------------------------------------------------------------- }
  1639. function TRegExpr.GetMatchLen(Idx: integer): PtrInt;
  1640. begin
  1641. Idx := GrpIndexes[Idx];
  1642. if (Idx >= 0) and (GrpBounds[0].GrpStart[Idx] <> nil) then
  1643. Result := GrpBounds[0].GrpEnd[Idx] - GrpBounds[0].GrpStart[Idx]
  1644. else
  1645. Result := -1;
  1646. end; { of function TRegExpr.GetMatchLen
  1647. -------------------------------------------------------------- }
  1648. function TRegExpr.GetMatch(Idx: integer): RegExprString;
  1649. begin
  1650. Result := '';
  1651. Idx := GrpIndexes[Idx];
  1652. if (Idx >= 0) and (GrpBounds[0].GrpEnd[Idx] > GrpBounds[0].GrpStart[Idx]) then
  1653. SetString(Result, GrpBounds[0].GrpStart[Idx], GrpBounds[0].GrpEnd[Idx] - GrpBounds[0].GrpStart[Idx]);
  1654. end; { of function TRegExpr.GetMatch
  1655. -------------------------------------------------------------- }
  1656. function TRegExpr.MatchIndexFromName(const AName: RegExprString): integer;
  1657. var
  1658. i: integer;
  1659. begin
  1660. for i := 1 {not 0} to GrpCount do
  1661. if GrpNames[i] = AName then
  1662. begin
  1663. Result := i;
  1664. Exit;
  1665. end;
  1666. Result := -1;
  1667. end;
  1668. function TRegExpr.MatchFromName(const AName: RegExprString): RegExprString;
  1669. var
  1670. Idx: integer;
  1671. begin
  1672. Idx := MatchIndexFromName(AName);
  1673. if Idx >= 0 then
  1674. Result := GetMatch(Idx)
  1675. else
  1676. Result := '';
  1677. end;
  1678. function TRegExpr.GetModifierStr: RegExprString;
  1679. begin
  1680. Result := '-';
  1681. if ModifierI then
  1682. Result := 'i' + Result
  1683. else
  1684. Result := Result + 'i';
  1685. if ModifierR then
  1686. Result := 'r' + Result
  1687. else
  1688. Result := Result + 'r';
  1689. if ModifierS then
  1690. Result := 's' + Result
  1691. else
  1692. Result := Result + 's';
  1693. if ModifierG then
  1694. Result := 'g' + Result
  1695. else
  1696. Result := Result + 'g';
  1697. if ModifierM then
  1698. Result := 'm' + Result
  1699. else
  1700. Result := Result + 'm';
  1701. if ModifierX then
  1702. Result := 'x' + Result
  1703. else
  1704. Result := Result + 'x';
  1705. if Result[Length(Result)] = '-' // remove '-' if all modifiers are 'On'
  1706. then
  1707. System.Delete(Result, Length(Result), 1);
  1708. end; { of function TRegExpr.GetModifierStr
  1709. -------------------------------------------------------------- }
  1710. procedure TRegExpr.SetModifierG(AValue: boolean);
  1711. begin
  1712. if fModifiers.G <> AValue then
  1713. begin
  1714. fModifiers.G := AValue;
  1715. InvalidateProgramm;
  1716. end;
  1717. end;
  1718. procedure TRegExpr.SetModifierI(AValue: boolean);
  1719. begin
  1720. if fModifiers.I <> AValue then
  1721. begin
  1722. fModifiers.I := AValue;
  1723. InvalidateProgramm;
  1724. end;
  1725. end;
  1726. procedure TRegExpr.SetModifierM(AValue: boolean);
  1727. begin
  1728. if fModifiers.M <> AValue then
  1729. begin
  1730. fModifiers.M := AValue;
  1731. InvalidateProgramm;
  1732. end;
  1733. end;
  1734. procedure TRegExpr.SetModifierR(AValue: boolean);
  1735. begin
  1736. if fModifiers.R <> AValue then
  1737. begin
  1738. fModifiers.R := AValue;
  1739. InvalidateProgramm;
  1740. end;
  1741. end;
  1742. procedure TRegExpr.SetModifierS(AValue: boolean);
  1743. begin
  1744. if fModifiers.S <> AValue then
  1745. begin
  1746. fModifiers.S := AValue;
  1747. InvalidateProgramm;
  1748. end;
  1749. end;
  1750. procedure TRegExpr.SetModifierX(AValue: boolean);
  1751. begin
  1752. if fModifiers.X <> AValue then
  1753. begin
  1754. fModifiers.X := AValue;
  1755. InvalidateProgramm;
  1756. end;
  1757. end;
  1758. procedure TRegExpr.SetModifierStr(const AStr: RegExprString);
  1759. begin
  1760. if ParseModifiers(PRegExprChar(AStr), Length(AStr), fModifiers) then
  1761. InvalidateProgramm
  1762. else
  1763. Error(reeModifierUnsupported);
  1764. end;
  1765. { ============================================================= }
  1766. { ==================== Compiler section ======================= }
  1767. { ============================================================= }
  1768. {$IFDEF FastUnicodeData}
  1769. function TRegExpr.IsWordChar(AChar: REChar): boolean;
  1770. begin
  1771. // bit 7 in value: is word AnsiChar
  1772. Result := CharCategoryArray[Ord(AChar)] and 128 <> 0;
  1773. end;
  1774. (*
  1775. // Unicode General Category
  1776. UGC_UppercaseLetter = 0; Lu
  1777. UGC_LowercaseLetter = 1; Ll
  1778. UGC_TitlecaseLetter = 2; Lt
  1779. UGC_ModifierLetter = 3; Lm
  1780. UGC_OtherLetter = 4; Lo
  1781. UGC_NonSpacingMark = 5; Mn
  1782. UGC_CombiningMark = 6; Mc
  1783. UGC_EnclosingMark = 7; Me
  1784. UGC_DecimalNumber = 8; Nd
  1785. UGC_LetterNumber = 9; Nl
  1786. UGC_OtherNumber = 10; No
  1787. UGC_ConnectPunctuation = 11; Pc
  1788. UGC_DashPunctuation = 12; Pd
  1789. UGC_OpenPunctuation = 13; Ps
  1790. UGC_ClosePunctuation = 14; Pe
  1791. UGC_InitialPunctuation = 15; Pi
  1792. UGC_FinalPunctuation = 16; Pf
  1793. UGC_OtherPunctuation = 17; Po
  1794. UGC_MathSymbol = 18; Sm
  1795. UGC_CurrencySymbol = 19; Sc
  1796. UGC_ModifierSymbol = 20; Sk
  1797. UGC_OtherSymbol = 21; So
  1798. UGC_SpaceSeparator = 22; Zs
  1799. UGC_LineSeparator = 23; Zl
  1800. UGC_ParagraphSeparator = 24; Zp
  1801. UGC_Control = 25; Cc
  1802. UGC_Format = 26; Cf
  1803. UGC_Surrogate = 27; Cs
  1804. UGC_PrivateUse = 28; Co
  1805. UGC_Unassigned = 29; Cn
  1806. *)
  1807. const
  1808. CategoryNames: array[0..29] of array[0..1] of REChar = (
  1809. ('L', 'u'),
  1810. ('L', 'l'),
  1811. ('L', 't'),
  1812. ('L', 'm'),
  1813. ('L', 'o'),
  1814. ('M', 'n'),
  1815. ('M', 'c'),
  1816. ('M', 'e'),
  1817. ('N', 'd'),
  1818. ('N', 'l'),
  1819. ('N', 'o'),
  1820. ('P', 'c'),
  1821. ('P', 'd'),
  1822. ('P', 's'),
  1823. ('P', 'e'),
  1824. ('P', 'i'),
  1825. ('P', 'f'),
  1826. ('P', 'o'),
  1827. ('S', 'm'),
  1828. ('S', 'c'),
  1829. ('S', 'k'),
  1830. ('S', 'o'),
  1831. ('Z', 's'),
  1832. ('Z', 'l'),
  1833. ('Z', 'p'),
  1834. ('C', 'c'),
  1835. ('C', 'f'),
  1836. ('C', 's'),
  1837. ('C', 'o'),
  1838. ('C', 'n')
  1839. );
  1840. function IsCategoryFirstChar(AChar: REChar): boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
  1841. begin
  1842. case AChar of
  1843. 'L', 'M', 'N', 'P', 'S', 'C', 'Z':
  1844. Result := True;
  1845. else
  1846. Result := False;
  1847. end;
  1848. end;
  1849. function IsCategoryChars(AChar, AChar2: REChar): boolean;
  1850. var
  1851. i: integer;
  1852. begin
  1853. for i := Low(CategoryNames) to High(CategoryNames) do
  1854. if (AChar = CategoryNames[i][0]) then
  1855. if (AChar2 = CategoryNames[i][1]) then
  1856. begin
  1857. Result := True;
  1858. Exit
  1859. end;
  1860. Result := False;
  1861. end;
  1862. function CheckCharCategory(AChar: REChar; Ch0, Ch1: REChar): boolean;
  1863. // AChar: check this AnsiChar against opcode
  1864. // Ch0, Ch1: opcode operands after OP_*CATEGORY
  1865. var
  1866. N: byte;
  1867. Name0, Name1: REChar;
  1868. begin
  1869. Result := False;
  1870. // bits 0..6 are category
  1871. N := CharCategoryArray[Ord(AChar)] and 127;
  1872. if N <= High(CategoryNames) then
  1873. begin
  1874. Name0 := CategoryNames[N][0];
  1875. Name1 := CategoryNames[N][1];
  1876. if Ch0 <> Name0 then Exit;
  1877. if Ch1 <> #0 then
  1878. if Ch1 <> Name1 then Exit;
  1879. Result := True;
  1880. end;
  1881. end;
  1882. function MatchOneCharCategory(opnd, scan: PRegExprChar): boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
  1883. // opnd: points to opcode operands after OP_*CATEGORY
  1884. // scan: points into InputString
  1885. begin
  1886. Result := CheckCharCategory(scan^, opnd^, (opnd + 1)^);
  1887. end;
  1888. {$ELSE}
  1889. function TRegExpr.IsWordChar(AChar: REChar): boolean;
  1890. begin
  1891. {$IFDEF UseWordChars}
  1892. Result := Pos(AChar, fWordChars) > 0;
  1893. {$ELSE}
  1894. case AChar of
  1895. 'a' .. 'z',
  1896. 'A' .. 'Z',
  1897. '0' .. '9', '_':
  1898. Result := True
  1899. else
  1900. Result := False;
  1901. end;
  1902. {$ENDIF}
  1903. end;
  1904. {$ENDIF}
  1905. function TRegExpr.IsSpaceChar(AChar: REChar): boolean;
  1906. begin
  1907. {$IFDEF UseSpaceChars}
  1908. Result := Pos(AChar, fSpaceChars) > 0;
  1909. {$ELSE}
  1910. case AChar of
  1911. ' ', #$9, #$A, #$D, #$C:
  1912. Result := True
  1913. else
  1914. Result := False;
  1915. end;
  1916. {$ENDIF}
  1917. end;
  1918. function TRegExpr.IsCustomLineSeparator(AChar: REChar): boolean;
  1919. begin
  1920. {$IFDEF UseLineSep}
  1921. {$IFDEF UnicodeRE}
  1922. Result := Pos(AChar, fLineSeparators) > 0;
  1923. {$ELSE}
  1924. Result := fLineSepArray[byte(AChar)];
  1925. {$ENDIF}
  1926. {$ELSE}
  1927. case AChar of
  1928. #$d, #$a,
  1929. {$IFDEF UnicodeRE}
  1930. #$85, #$2028, #$2029,
  1931. {$ENDIF}
  1932. #$b, #$c:
  1933. Result := True;
  1934. else
  1935. Result := False;
  1936. end;
  1937. {$ENDIF}
  1938. end;
  1939. function IsDigitChar(AChar: REChar): boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
  1940. begin
  1941. case AChar of
  1942. '0' .. '9':
  1943. Result := True;
  1944. else
  1945. Result := False;
  1946. end;
  1947. end;
  1948. function IsHorzSeparator(AChar: REChar): boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
  1949. begin
  1950. // Tab and Unicode categoty "Space Separator": https://www.compart.com/en/unicode/category/Zs
  1951. case AChar of
  1952. #9, #$20, #$A0:
  1953. Result := True;
  1954. {$IFDEF UnicodeRE}
  1955. #$1680, #$2000 .. #$200A, #$202F, #$205F, #$3000:
  1956. Result := True;
  1957. {$ENDIF}
  1958. else
  1959. Result := False;
  1960. end;
  1961. end;
  1962. function IsVertLineSeparator(AChar: REChar): boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
  1963. begin
  1964. case AChar of
  1965. #$d, #$a, #$b, #$c:
  1966. Result := True;
  1967. {$IFDEF UnicodeRE}
  1968. #$2028, #$2029, #$85:
  1969. Result := True;
  1970. {$ENDIF}
  1971. else
  1972. Result := False;
  1973. end;
  1974. end;
  1975. procedure TRegExpr.InvalidateProgramm;
  1976. begin
  1977. if programm <> nil then
  1978. begin
  1979. FreeMem(programm);
  1980. programm := nil;
  1981. end;
  1982. end; { of procedure TRegExpr.InvalidateProgramm
  1983. -------------------------------------------------------------- }
  1984. procedure TRegExpr.Compile;
  1985. begin
  1986. if fExpression = '' then
  1987. begin
  1988. Error(reeNoExpression);
  1989. Exit;
  1990. end;
  1991. CompileRegExpr(fRegexStart);
  1992. end; { of procedure TRegExpr.Compile
  1993. -------------------------------------------------------------- }
  1994. {$IFDEF UseLineSep}
  1995. procedure TRegExpr.InitLineSepArray;
  1996. {$IFNDEF UnicodeRE}
  1997. var
  1998. i: integer;
  1999. {$ENDIF}
  2000. begin
  2001. {$IFNDEF UnicodeRE}
  2002. FillChar(fLineSepArray, SizeOf(fLineSepArray), 0);
  2003. for i := 1 to Length(fLineSeparators) do
  2004. fLineSepArray[byte(fLineSeparators[i])] := True;
  2005. {$ENDIF}
  2006. end;
  2007. {$ENDIF}
  2008. function TRegExpr.IsProgrammOk: boolean;
  2009. begin
  2010. Result := False;
  2011. // check modifiers
  2012. if not IsModifiersEqual(fModifiers, fProgModifiers) then
  2013. InvalidateProgramm;
  2014. // compile if needed
  2015. if programm = nil then
  2016. begin
  2017. Compile;
  2018. // Check compiled programm
  2019. if programm = nil then
  2020. Exit;
  2021. end;
  2022. if programm[0] <> OP_MAGIC then
  2023. Error(reeCorruptedProgram)
  2024. else
  2025. Result := True;
  2026. end; { of function TRegExpr.IsProgrammOk
  2027. -------------------------------------------------------------- }
  2028. procedure TRegExpr.Tail(p: PRegExprChar; val: PRegExprChar);
  2029. // set the next-pointer at the end of a node chain
  2030. var
  2031. scan: PRegExprChar;
  2032. temp: PRegExprChar;
  2033. begin
  2034. if p = @regDummy then
  2035. Exit;
  2036. // Find last node.
  2037. scan := p;
  2038. repeat
  2039. temp := regNext(scan);
  2040. if temp = nil then
  2041. Break;
  2042. scan := temp;
  2043. until False;
  2044. // Set Next 'pointer'
  2045. if val < scan then
  2046. PRENextOff(AlignToPtr(scan + REOpSz))^ := -(scan - val) // ###0.948
  2047. // work around PWideChar subtraction bug (Delphi uses
  2048. // shr after subtraction to calculate widechar distance %-( )
  2049. // so, if difference is negative we have .. the "feature" :(
  2050. // I could wrap it in $IFDEF UnicodeRE, but I didn't because
  2051. // "P – Q computes the difference between the address given
  2052. // by P (the higher address) and the address given by Q (the
  2053. // lower address)" - Delphi help quotation.
  2054. else
  2055. PRENextOff(AlignToPtr(scan + REOpSz))^ := val - scan; // ###0.933
  2056. end; { of procedure TRegExpr.Tail
  2057. -------------------------------------------------------------- }
  2058. procedure TRegExpr.OpTail(p: PRegExprChar; val: PRegExprChar);
  2059. // regtail on operand of first argument; nop if operandless
  2060. begin
  2061. // "Operandless" and "op != OP_BRANCH" are synonymous in practice.
  2062. if (p = nil) or (p = @regDummy) or (PREOp(p)^ <> OP_BRANCH) then
  2063. Exit;
  2064. Tail(p + REOpSz + RENextOffSz, val); // ###0.933
  2065. end; { of procedure TRegExpr.OpTail
  2066. -------------------------------------------------------------- }
  2067. function TRegExpr.EmitNode(op: TREOp): PRegExprChar; // ###0.933
  2068. // emit a node, return location
  2069. begin
  2070. Result := regCode;
  2071. if Result <> @regDummy then
  2072. begin
  2073. PREOp(regCode)^ := op;
  2074. Inc(regCode, REOpSz);
  2075. PRENextOff(AlignToPtr(regCode))^ := 0; // Next "pointer" := nil
  2076. Inc(regCode, RENextOffSz);
  2077. if (op = OP_EXACTLY) or (op = OP_EXACTLYCI) then
  2078. regExactlyLen := PLongInt(regCode)
  2079. else
  2080. regExactlyLen := nil;
  2081. {$IFDEF DebugSynRegExpr}
  2082. if regcode - programm > regsize then
  2083. raise Exception.Create('TRegExpr.EmitNode buffer overrun');
  2084. {$ENDIF}
  2085. end
  2086. else
  2087. Inc(regCodeSize, REOpSz + RENextOffSz);
  2088. // compute code size without code generation
  2089. end; { of function TRegExpr.EmitNode
  2090. -------------------------------------------------------------- }
  2091. procedure TRegExpr.EmitC(ch: REChar);
  2092. begin
  2093. if regCode <> @regDummy then
  2094. begin
  2095. regCode^ := ch;
  2096. Inc(regCode);
  2097. {$IFDEF DebugSynRegExpr}
  2098. if regcode - programm > regsize then
  2099. raise Exception.Create('TRegExpr.EmitC buffer overrun');
  2100. {$ENDIF}
  2101. end
  2102. else
  2103. Inc(regCodeSize, REOpSz); // Type of p-code pointer always is ^REChar
  2104. end; { of procedure TRegExpr.EmitC
  2105. -------------------------------------------------------------- }
  2106. procedure TRegExpr.EmitInt(AValue: LongInt);
  2107. begin
  2108. if regCode <> @regDummy then
  2109. begin
  2110. PLongInt(regCode)^ := AValue;
  2111. Inc(regCode, RENumberSz);
  2112. {$IFDEF DebugSynRegExpr}
  2113. if regcode - programm > regsize then
  2114. raise Exception.Create('TRegExpr.EmitInt buffer overrun');
  2115. {$ENDIF}
  2116. end
  2117. else
  2118. Inc(regCodeSize, RENumberSz);
  2119. end;
  2120. function TRegExpr.EmitGroupRef(AIndex: integer; AIgnoreCase: boolean): PRegExprChar;
  2121. begin
  2122. if AIgnoreCase then
  2123. Result := EmitNode(OP_BSUBEXPCI)
  2124. else
  2125. Result := EmitNode(OP_BSUBEXP);
  2126. EmitC(REChar(AIndex));
  2127. end;
  2128. {$IFDEF FastUnicodeData}
  2129. procedure TRegExpr.FindCategoryName(var scan: PRegExprChar; var ch1, ch2: REChar);
  2130. // scan: points into regex string after '\p', to find category name
  2131. // ch1, ch2: 2-AnsiChar name of category; ch2 can be #0
  2132. var
  2133. ch: REChar;
  2134. pos1, pos2, namePtr: PRegExprChar;
  2135. nameLen: integer;
  2136. begin
  2137. ch1 := #0;
  2138. ch2 := #0;
  2139. ch := scan^;
  2140. if IsCategoryFirstChar(ch) then
  2141. begin
  2142. ch1 := ch;
  2143. Exit;
  2144. end;
  2145. if ch = '{' then
  2146. begin
  2147. pos1 := scan;
  2148. pos2 := pos1;
  2149. while (pos2 < fRegexEnd) and (pos2^ <> '}') do
  2150. Inc(pos2);
  2151. if pos2 >= fRegexEnd then
  2152. Error(reeIncorrectBraces);
  2153. namePtr := pos1+1;
  2154. nameLen := pos2-pos1-1;
  2155. Inc(scan, nameLen+1);
  2156. if nameLen<1 then
  2157. Error(reeBadUnicodeCategory);
  2158. if nameLen>2 then
  2159. Error(reeBadUnicodeCategory);
  2160. if nameLen = 1 then
  2161. begin
  2162. ch1 := namePtr^;
  2163. ch2 := #0;
  2164. if not IsCategoryFirstChar(ch1) then
  2165. Error(reeBadUnicodeCategory);
  2166. Exit;
  2167. end;
  2168. if nameLen = 2 then
  2169. begin
  2170. ch1 := namePtr^;
  2171. ch2 := (namePtr+1)^;
  2172. if not IsCategoryChars(ch1, ch2) then
  2173. Error(reeBadUnicodeCategory);
  2174. Exit;
  2175. end;
  2176. end
  2177. else
  2178. Error(reeBadUnicodeCategory);
  2179. end;
  2180. function TRegExpr.EmitCategoryMain(APositive: boolean): PRegExprChar;
  2181. var
  2182. ch, ch2: REChar;
  2183. begin
  2184. Inc(regParse);
  2185. if regParse >= fRegexEnd then
  2186. Error(reeBadUnicodeCategory);
  2187. FindCategoryName(regParse, ch, ch2);
  2188. if APositive then
  2189. Result := EmitNode(OP_ANYCATEGORY)
  2190. else
  2191. Result := EmitNode(OP_NOTCATEGORY);
  2192. EmitC(ch);
  2193. EmitC(ch2);
  2194. end;
  2195. {$ENDIF}
  2196. procedure TRegExpr.InsertOperator(op: TREOp; opnd: PRegExprChar; sz: integer);
  2197. // insert an operator in front of already-emitted operand
  2198. // Means relocating the operand.
  2199. var
  2200. src, dst, place: PRegExprChar;
  2201. i: integer;
  2202. begin
  2203. if regCode = @regDummy then
  2204. begin
  2205. Inc(regCodeSize, sz);
  2206. Exit;
  2207. end;
  2208. // move code behind insert position
  2209. src := regCode;
  2210. Inc(regCode, sz);
  2211. {$IFDEF DebugSynRegExpr}
  2212. if regCode - programm > regCodeSize then
  2213. raise Exception.Create('TRegExpr.InsertOperator buffer overrun');
  2214. // if (opnd<regCode) or (opnd-regCode>regCodeSize) then
  2215. // raise Exception.Create('TRegExpr.InsertOperator invalid opnd');
  2216. {$ENDIF}
  2217. dst := regCode;
  2218. while src > opnd do
  2219. begin
  2220. Dec(dst);
  2221. Dec(src);
  2222. dst^ := src^;
  2223. end;
  2224. place := opnd; // Op node, where operand used to be.
  2225. PREOp(place)^ := op;
  2226. Inc(place, REOpSz);
  2227. for i := 1 + REOpSz to sz do
  2228. begin
  2229. place^ := #0;
  2230. Inc(place);
  2231. end;
  2232. end; { of procedure TRegExpr.InsertOperator
  2233. -------------------------------------------------------------- }
  2234. function FindSkippedMetaLen(PStart, PEnd: PRegExprChar): integer; {$IFDEF InlineFuncs}inline;{$ENDIF}
  2235. // find length of initial segment of PStart string consisting
  2236. // entirely of characters not from IsMetaSymbol1.
  2237. begin
  2238. Result := 0;
  2239. while PStart < PEnd do
  2240. begin
  2241. if _IsMetaSymbol1(PStart^) then
  2242. Exit;
  2243. Inc(Result);
  2244. Inc(PStart)
  2245. end;
  2246. end;
  2247. const
  2248. // Flags to be passed up and down.
  2249. FLAG_WORST = 0; // Worst case
  2250. FLAG_HASWIDTH = 1; // Cannot match empty string
  2251. FLAG_SIMPLE = 2; // Simple enough to be OP_STAR/OP_PLUS/OP_BRACES operand
  2252. FLAG_SPECSTART = 4; // Starts with * or +
  2253. {$IFDEF UnicodeRE}
  2254. RusRangeLoLow = #$430; // 'а'
  2255. RusRangeLoHigh = #$44F; // 'я'
  2256. RusRangeHiLow = #$410; // 'А'
  2257. RusRangeHiHigh = #$42F; // 'Я'
  2258. {$ELSE}
  2259. RusRangeLoLow = #$E0; // 'а' in cp1251
  2260. RusRangeLoHigh = #$FF; // 'я' in cp1251
  2261. RusRangeHiLow = #$C0; // 'А' in cp1251
  2262. RusRangeHiHigh = #$DF; // 'Я' in cp1251
  2263. {$ENDIF}
  2264. function TRegExpr.FindInCharClass(ABuffer: PRegExprChar; AChar: REChar; AIgnoreCase: boolean): boolean;
  2265. // Buffer contains AnsiChar pairs: (Kind, Data), where Kind is one of OpKind_ values,
  2266. // and Data depends on Kind
  2267. var
  2268. OpKind: REChar;
  2269. ch, ch2: REChar;
  2270. N, i: integer;
  2271. begin
  2272. if AIgnoreCase then
  2273. AChar := _UpperCase(AChar);
  2274. repeat
  2275. OpKind := ABuffer^;
  2276. case OpKind of
  2277. OpKind_End:
  2278. begin
  2279. Result := False;
  2280. Exit;
  2281. end;
  2282. OpKind_Range:
  2283. begin
  2284. Inc(ABuffer);
  2285. ch := ABuffer^;
  2286. Inc(ABuffer);
  2287. ch2 := ABuffer^;
  2288. Inc(ABuffer);
  2289. {
  2290. // if AIgnoreCase, ch, ch2 are upcased in opcode
  2291. if AIgnoreCase then
  2292. begin
  2293. ch := _UpperCase(ch);
  2294. ch2 := _UpperCase(ch2);
  2295. end;
  2296. }
  2297. if (AChar >= ch) and (AChar <= ch2) then
  2298. begin
  2299. Result := True;
  2300. Exit;
  2301. end;
  2302. end;
  2303. OpKind_MetaClass:
  2304. begin
  2305. Inc(ABuffer);
  2306. N := Ord(ABuffer^);
  2307. Inc(ABuffer);
  2308. if CharCheckers[N](AChar) then
  2309. begin
  2310. Result := True;
  2311. Exit
  2312. end;
  2313. end;
  2314. OpKind_Char:
  2315. begin
  2316. Inc(ABuffer);
  2317. N := PLongInt(ABuffer)^;
  2318. Inc(ABuffer, RENumberSz);
  2319. for i := 1 to N do
  2320. begin
  2321. ch := ABuffer^;
  2322. Inc(ABuffer);
  2323. {
  2324. // already upcased in opcode
  2325. if AIgnoreCase then
  2326. ch := _UpperCase(ch);
  2327. }
  2328. if ch = AChar then
  2329. begin
  2330. Result := True;
  2331. Exit;
  2332. end;
  2333. end;
  2334. end;
  2335. {$IFDEF FastUnicodeData}
  2336. OpKind_CategoryYes,
  2337. OpKind_CategoryNo:
  2338. begin
  2339. Inc(ABuffer);
  2340. ch := ABuffer^;
  2341. Inc(ABuffer);
  2342. ch2 := ABuffer^;
  2343. Inc(ABuffer);
  2344. Result := CheckCharCategory(AChar, ch, ch2);
  2345. if OpKind = OpKind_CategoryNo then
  2346. Result := not Result;
  2347. if Result then
  2348. Exit;
  2349. end;
  2350. {$ENDIF}
  2351. else
  2352. Error(reeBadOpcodeInCharClass);
  2353. end;
  2354. until False; // assume that Buffer is ended correctly
  2355. end;
  2356. procedure TRegExpr.GetCharSetFromWordChars(var ARes: TRegExprCharSet);
  2357. {$IFDEF UseWordChars}
  2358. var
  2359. i: integer;
  2360. ch: REChar;
  2361. {$ENDIF}
  2362. begin
  2363. {$IFDEF UseWordChars}
  2364. ARes := [];
  2365. for i := 1 to Length(fWordChars) do
  2366. begin
  2367. ch := fWordChars[i];
  2368. {$IFDEF UnicodeRE}
  2369. if Ord(ch) <= $FF then
  2370. {$ENDIF}
  2371. Include(ARes, byte(ch));
  2372. end;
  2373. {$ELSE}
  2374. ARes := RegExprWordSet;
  2375. {$ENDIF}
  2376. end;
  2377. procedure TRegExpr.GetCharSetFromSpaceChars(var ARes: TRegExprCharset);
  2378. {$IFDEF UseSpaceChars}
  2379. var
  2380. i: integer;
  2381. ch: REChar;
  2382. {$ENDIF}
  2383. begin
  2384. {$IFDEF UseSpaceChars}
  2385. ARes := [];
  2386. for i := 1 to Length(fSpaceChars) do
  2387. begin
  2388. ch := fSpaceChars[i];
  2389. {$IFDEF UnicodeRE}
  2390. if Ord(ch) <= $FF then
  2391. {$ENDIF}
  2392. Include(ARes, byte(ch));
  2393. end;
  2394. {$ELSE}
  2395. ARes := RegExprSpaceSet;
  2396. {$ENDIF}
  2397. end;
  2398. procedure TRegExpr.GetCharSetFromCharClass(ABuffer: PRegExprChar; AIgnoreCase: boolean; var ARes: TRegExprCharset);
  2399. var
  2400. ch, ch2: REChar;
  2401. TempSet: TRegExprCharSet;
  2402. N, i: integer;
  2403. begin
  2404. ARes := [];
  2405. TempSet := [];
  2406. repeat
  2407. case ABuffer^ of
  2408. OpKind_End:
  2409. Exit;
  2410. OpKind_Range:
  2411. begin
  2412. Inc(ABuffer);
  2413. ch := ABuffer^;
  2414. Inc(ABuffer);
  2415. ch2 := ABuffer^;
  2416. {$IFDEF UnicodeRE}
  2417. if Ord(ch2) > $FF then
  2418. ch2 := REChar($FF);
  2419. {$ENDIF}
  2420. Inc(ABuffer);
  2421. for i := Ord(ch) to Ord(ch2) do
  2422. begin
  2423. Include(ARes, byte(i));
  2424. if AIgnoreCase then
  2425. Include(ARes, byte(InvertCase(REChar(i))));
  2426. end;
  2427. end;
  2428. OpKind_MetaClass:
  2429. begin
  2430. Inc(ABuffer);
  2431. N := Ord(ABuffer^);
  2432. Inc(ABuffer);
  2433. if N = CheckerIndex_Word then
  2434. begin
  2435. GetCharSetFromWordChars(TempSet);
  2436. ARes := ARes + TempSet;
  2437. end
  2438. else
  2439. if N = CheckerIndex_NotWord then
  2440. begin
  2441. GetCharSetFromWordChars(TempSet);
  2442. ARes := ARes + (RegExprAllSet - TempSet);
  2443. end
  2444. else
  2445. if N = CheckerIndex_Space then
  2446. begin
  2447. GetCharSetFromSpaceChars(TempSet);
  2448. ARes := ARes + TempSet;
  2449. end
  2450. else
  2451. if N = CheckerIndex_NotSpace then
  2452. begin
  2453. GetCharSetFromSpaceChars(TempSet);
  2454. ARes := ARes + (RegExprAllSet - TempSet);
  2455. end
  2456. else
  2457. if N = CheckerIndex_Digit then
  2458. ARes := ARes + RegExprDigitSet
  2459. else
  2460. if N = CheckerIndex_NotDigit then
  2461. ARes := ARes + (RegExprAllSet - RegExprDigitSet)
  2462. else
  2463. if N = CheckerIndex_VertSep then
  2464. ARes := ARes + RegExprLineSeparatorsSet
  2465. else
  2466. if N = CheckerIndex_NotVertSep then
  2467. ARes := ARes + (RegExprAllSet - RegExprLineSeparatorsSet)
  2468. else
  2469. if N = CheckerIndex_HorzSep then
  2470. ARes := ARes + RegExprHorzSeparatorsSet
  2471. else
  2472. if N = CheckerIndex_NotHorzSep then
  2473. ARes := ARes + (RegExprAllSet - RegExprHorzSeparatorsSet)
  2474. else
  2475. if N = CheckerIndex_LowerAZ then
  2476. begin
  2477. if AIgnoreCase then
  2478. ARes := ARes + RegExprAllAzSet
  2479. else
  2480. ARes := ARes + RegExprLowerAzSet;
  2481. end
  2482. else
  2483. if N = CheckerIndex_UpperAZ then
  2484. begin
  2485. if AIgnoreCase then
  2486. ARes := ARes + RegExprAllAzSet
  2487. else
  2488. ARes := ARes + RegExprUpperAzSet;
  2489. end
  2490. else
  2491. Error(reeBadOpcodeInCharClass);
  2492. end;
  2493. OpKind_Char:
  2494. begin
  2495. Inc(ABuffer);
  2496. N := PLongInt(ABuffer)^;
  2497. Inc(ABuffer, RENumberSz);
  2498. for i := 1 to N do
  2499. begin
  2500. ch := ABuffer^;
  2501. Inc(ABuffer);
  2502. {$IFDEF UnicodeRE}
  2503. if Ord(ch) <= $FF then
  2504. {$ENDIF}
  2505. begin
  2506. Include(ARes, byte(ch));
  2507. if AIgnoreCase then
  2508. Include(ARes, byte(InvertCase(ch)));
  2509. end;
  2510. end;
  2511. end;
  2512. {$IFDEF FastUnicodeData}
  2513. OpKind_CategoryYes,
  2514. OpKind_CategoryNo:
  2515. begin
  2516. // usage of FirstCharSet makes no sense for regex with \p \P
  2517. ARes := RegExprAllSet;
  2518. Exit;
  2519. end;
  2520. {$ENDIF}
  2521. else
  2522. Error(reeBadOpcodeInCharClass);
  2523. end;
  2524. until False; // assume that Buffer is ended correctly
  2525. end;
  2526. function TRegExpr.GetModifierG: boolean;
  2527. begin
  2528. Result := fModifiers.G;
  2529. end;
  2530. function TRegExpr.GetModifierI: boolean;
  2531. begin
  2532. Result := fModifiers.I;
  2533. end;
  2534. function TRegExpr.GetModifierM: boolean;
  2535. begin
  2536. Result := fModifiers.M;
  2537. end;
  2538. function TRegExpr.GetModifierR: boolean;
  2539. begin
  2540. Result := fModifiers.R;
  2541. end;
  2542. function TRegExpr.GetModifierS: boolean;
  2543. begin
  2544. Result := fModifiers.S;
  2545. end;
  2546. function TRegExpr.GetModifierX: boolean;
  2547. begin
  2548. Result := fModifiers.X;
  2549. end;
  2550. function TRegExpr.CompileRegExpr(ARegExp: PRegExprChar): boolean;
  2551. // Compile a regular expression into internal code
  2552. // We can't allocate space until we know how big the compiled form will be,
  2553. // but we can't compile it (and thus know how big it is) until we've got a
  2554. // place to put the code. So we cheat: we compile it twice, once with code
  2555. // generation turned off and size counting turned on, and once "for real".
  2556. // This also means that we don't allocate space until we are sure that the
  2557. // thing really will compile successfully, and we never have to move the
  2558. // code and thus invalidate pointers into it. (Note that it has to be in
  2559. // one piece because free() must be able to free it all.)
  2560. // Beware that the optimization-preparation code in here knows about some
  2561. // of the structure of the compiled regexp.
  2562. var
  2563. scan, longest, longestTemp: PRegExprChar;
  2564. Len, LenTemp: integer;
  2565. FlagTemp: integer;
  2566. begin
  2567. Result := False;
  2568. FlagTemp := 0;
  2569. regParse := nil; // for correct error handling
  2570. regExactlyLen := nil;
  2571. ClearInternalIndexes;
  2572. fLastError := reeOk;
  2573. fLastErrorOpcode := TREOp(0);
  2574. if Assigned(fHelper) then
  2575. FreeAndNil(fHelper);
  2576. fHelperLen := 0;
  2577. try
  2578. if programm <> nil then
  2579. begin
  2580. FreeMem(programm);
  2581. programm := nil;
  2582. end;
  2583. if ARegExp = nil then
  2584. begin
  2585. Error(reeCompNullArgument);
  2586. Exit;
  2587. end;
  2588. fProgModifiers := fModifiers;
  2589. // well, may it's paranoia. I'll check it later.
  2590. // First pass: calculate opcode size, validate regex
  2591. fSecondPass := False;
  2592. fCompModifiers := fModifiers;
  2593. regParse := ARegExp;
  2594. regNumBrackets := 1;
  2595. regCodeSize := 0;
  2596. regCode := @regDummy;
  2597. regCodeWork := nil;
  2598. regLookahead := False;
  2599. regLookaheadNeg := False;
  2600. regLookaheadGroup := -1;
  2601. regLookbehind := False;
  2602. EmitC(OP_MAGIC);
  2603. if ParseReg(False, FlagTemp) = nil then
  2604. Exit;
  2605. // Allocate memory
  2606. GetMem(programm, regCodeSize * SizeOf(REChar));
  2607. // Second pass: emit opcode
  2608. fSecondPass := True;
  2609. fCompModifiers := fModifiers;
  2610. regParse := ARegExp;
  2611. regNumBrackets := 1;
  2612. regCode := programm;
  2613. regCodeWork := programm + REOpSz;
  2614. EmitC(OP_MAGIC);
  2615. if ParseReg(False, FlagTemp) = nil then
  2616. Exit;
  2617. // Dig out information for optimizations.
  2618. {$IFDEF UseFirstCharSet} // ###0.929
  2619. FirstCharSet := [];
  2620. FillFirstCharSet(regCodeWork);
  2621. for Len := 0 to 255 do
  2622. FirstCharArray[Len] := byte(Len) in FirstCharSet;
  2623. {$ENDIF}
  2624. regAnchored := #0;
  2625. regMust := nil;
  2626. regMustLen := 0;
  2627. regMustString := '';
  2628. scan := regCodeWork; // First OP_BRANCH.
  2629. if PREOp(regNext(scan))^ = OP_EEND then
  2630. begin // Only one top-level choice.
  2631. scan := scan + REOpSz + RENextOffSz;
  2632. // Starting-point info.
  2633. if PREOp(scan)^ = OP_BOL then
  2634. Inc(regAnchored);
  2635. // If there's something expensive in the r.e., find the longest
  2636. // literal string that must appear and make it the regMust. Resolve
  2637. // ties in favor of later strings, since the regstart check works
  2638. // with the beginning of the r.e. and avoiding duplication
  2639. // strengthens checking. Not a strong reason, but sufficient in the
  2640. // absence of others.
  2641. if (FlagTemp and FLAG_SPECSTART) <> 0 then
  2642. begin
  2643. longest := nil;
  2644. Len := 0;
  2645. while scan <> nil do
  2646. begin
  2647. if PREOp(scan)^ = OP_EXACTLY then
  2648. begin
  2649. longestTemp := scan + REOpSz + RENextOffSz + RENumberSz;
  2650. LenTemp := PLongInt(scan + REOpSz + RENextOffSz)^;
  2651. if LenTemp >= Len then
  2652. begin
  2653. longest := longestTemp;
  2654. Len := LenTemp;
  2655. end;
  2656. end;
  2657. scan := regNext(scan);
  2658. end;
  2659. regMust := longest;
  2660. regMustLen := Len;
  2661. if regMustLen > 1 then // don't use regMust if too short
  2662. SetString(regMustString, regMust, regMustLen);
  2663. end;
  2664. end;
  2665. Result := True;
  2666. finally
  2667. begin
  2668. if not Result then
  2669. InvalidateProgramm;
  2670. end;
  2671. end;
  2672. end; { of function TRegExpr.CompileRegExpr
  2673. -------------------------------------------------------------- }
  2674. function TRegExpr.ParseReg(InBrackets: boolean; var FlagParse: integer): PRegExprChar;
  2675. // regular expression, i.e. main body or parenthesized thing
  2676. // Caller must absorb opening parenthesis.
  2677. // Combining parenthesis handling with the base level of regular expression
  2678. // is a trifle forced, but the need to tie the tails of the branches to what
  2679. // follows makes it hard to avoid.
  2680. var
  2681. ret, br, ender: PRegExprChar;
  2682. NBrackets: integer;
  2683. FlagTemp: integer;
  2684. SavedModifiers: TRegExprModifiers;
  2685. begin
  2686. Result := nil;
  2687. FlagTemp := 0;
  2688. FlagParse := FLAG_HASWIDTH; // Tentatively.
  2689. NBrackets := 0;
  2690. SavedModifiers := fCompModifiers;
  2691. // Make an OP_OPEN node, if parenthesized.
  2692. if InBrackets then
  2693. begin
  2694. if regNumBrackets >= RegexMaxGroups then
  2695. begin
  2696. Error(reeCompParseRegTooManyBrackets);
  2697. Exit;
  2698. end;
  2699. NBrackets := regNumBrackets;
  2700. Inc(regNumBrackets);
  2701. ret := EmitNode(TREOp(Ord(OP_OPEN) + NBrackets));
  2702. GrpOpCodes[NBrackets] := ret;
  2703. end
  2704. else
  2705. ret := nil;
  2706. // Pick up the branches, linking them together.
  2707. br := ParseBranch(FlagTemp);
  2708. if br = nil then
  2709. begin
  2710. Result := nil;
  2711. Exit;
  2712. end;
  2713. if ret <> nil then
  2714. Tail(ret, br) // OP_OPEN -> first.
  2715. else
  2716. ret := br;
  2717. if (FlagTemp and FLAG_HASWIDTH) = 0 then
  2718. FlagParse := FlagParse and not FLAG_HASWIDTH;
  2719. FlagParse := FlagParse or FlagTemp and FLAG_SPECSTART;
  2720. while (regParse^ = '|') do
  2721. begin
  2722. Inc(regParse);
  2723. br := ParseBranch(FlagTemp);
  2724. if br = nil then
  2725. begin
  2726. Result := nil;
  2727. Exit;
  2728. end;
  2729. Tail(ret, br); // OP_BRANCH -> OP_BRANCH.
  2730. if (FlagTemp and FLAG_HASWIDTH) = 0 then
  2731. FlagParse := FlagParse and not FLAG_HASWIDTH;
  2732. FlagParse := FlagParse or FlagTemp and FLAG_SPECSTART;
  2733. end;
  2734. // Make a closing node, and hook it on the end.
  2735. if InBrackets then
  2736. ender := EmitNode(TREOp(Ord(OP_CLOSE) + NBrackets))
  2737. else
  2738. ender := EmitNode(OP_EEND);
  2739. Tail(ret, ender);
  2740. // Hook the tails of the branches to the closing node.
  2741. br := ret;
  2742. while br <> nil do
  2743. begin
  2744. OpTail(br, ender);
  2745. br := regNext(br);
  2746. end;
  2747. // Check for proper termination.
  2748. if InBrackets then
  2749. if regParse^ <> ')' then
  2750. begin
  2751. Error(reeCompParseRegUnmatchedBrackets);
  2752. Exit;
  2753. end
  2754. else
  2755. Inc(regParse); // skip trailing ')'
  2756. if (not InBrackets) and (regParse < fRegexEnd) then
  2757. begin
  2758. if regParse^ = ')' then
  2759. Error(reeCompParseRegUnmatchedBrackets2)
  2760. else
  2761. Error(reeCompParseRegJunkOnEnd);
  2762. Exit;
  2763. end;
  2764. fCompModifiers := SavedModifiers; // restore modifiers of parent
  2765. Result := ret;
  2766. end; { of function TRegExpr.ParseReg
  2767. -------------------------------------------------------------- }
  2768. function TRegExpr.ParseBranch(var FlagParse: integer): PRegExprChar;
  2769. // one alternative of an | operator
  2770. // Implements the concatenation operator.
  2771. var
  2772. ret, chain, latest: PRegExprChar;
  2773. FlagTemp: integer;
  2774. begin
  2775. FlagTemp := 0;
  2776. FlagParse := FLAG_WORST; // Tentatively.
  2777. ret := EmitNode(OP_BRANCH);
  2778. chain := nil;
  2779. while (regParse < fRegexEnd) and (regParse^ <> '|') and (regParse^ <> ')') do
  2780. begin
  2781. latest := ParsePiece(FlagTemp);
  2782. if latest = nil then
  2783. begin
  2784. Result := nil;
  2785. Exit;
  2786. end;
  2787. FlagParse := FlagParse or FlagTemp and FLAG_HASWIDTH;
  2788. if chain = nil // First piece.
  2789. then
  2790. FlagParse := FlagParse or FlagTemp and FLAG_SPECSTART
  2791. else
  2792. Tail(chain, latest);
  2793. chain := latest;
  2794. end;
  2795. if chain = nil // Loop ran zero times.
  2796. then
  2797. EmitNode(OP_NOTHING);
  2798. Result := ret;
  2799. end; { of function TRegExpr.ParseBranch
  2800. -------------------------------------------------------------- }
  2801. function TRegExpr.ParsePiece(var FlagParse: integer): PRegExprChar;
  2802. // something followed by possible [*+?{]
  2803. // Note that the branching code sequences used for ? and the general cases
  2804. // of * and + and { are somewhat optimized: they use the same OP_NOTHING node as
  2805. // both the endmarker for their branch list and the body of the last branch.
  2806. // It might seem that this node could be dispensed with entirely, but the
  2807. // endmarker role is not redundant.
  2808. function ParseNumber(AStart, AEnd: PRegExprChar): TREBracesArg;
  2809. begin
  2810. Result := 0;
  2811. if AEnd - AStart + 1 > 8 then
  2812. begin // prevent stupid scanning
  2813. Error(reeBRACESArgTooBig);
  2814. Exit;
  2815. end;
  2816. while AStart <= AEnd do
  2817. begin
  2818. Result := Result * 10 + (Ord(AStart^) - Ord('0'));
  2819. Inc(AStart);
  2820. end;
  2821. if (Result > MaxBracesArg) or (Result < 0) then
  2822. begin
  2823. Error(reeBRACESArgTooBig);
  2824. Exit;
  2825. end;
  2826. end;
  2827. var
  2828. TheOp: TREOp;
  2829. NextNode: PRegExprChar;
  2830. procedure EmitComplexBraces(ABracesMin, ABracesMax: TREBracesArg; ANonGreedyOp: boolean); // ###0.940
  2831. {$IFDEF ComplexBraces}
  2832. var
  2833. off: TRENextOff;
  2834. {$ENDIF}
  2835. begin
  2836. {$IFNDEF ComplexBraces}
  2837. Error(reeComplexBracesNotImplemented);
  2838. {$ELSE}
  2839. if ANonGreedyOp then
  2840. TheOp := OP_LOOPNG
  2841. else
  2842. TheOp := OP_LOOP;
  2843. InsertOperator(OP_LOOPENTRY, Result, REOpSz + RENextOffSz);
  2844. NextNode := EmitNode(TheOp);
  2845. if regCode <> @regDummy then
  2846. begin
  2847. off := (Result + REOpSz + RENextOffSz) - (regCode - REOpSz - RENextOffSz);
  2848. // back to Atom after OP_LOOPENTRY
  2849. PREBracesArg(AlignToInt(regCode))^ := ABracesMin;
  2850. Inc(regCode, REBracesArgSz);
  2851. PREBracesArg(AlignToInt(regCode))^ := ABracesMax;
  2852. Inc(regCode, REBracesArgSz);
  2853. PRENextOff(AlignToPtr(regCode))^ := off;
  2854. Inc(regCode, RENextOffSz);
  2855. {$IFDEF DebugSynRegExpr}
  2856. if regcode - programm > regsize then
  2857. raise Exception.Create
  2858. ('TRegExpr.ParsePiece.EmitComplexBraces buffer overrun');
  2859. {$ENDIF}
  2860. end
  2861. else
  2862. Inc(regCodeSize, REBracesArgSz * 2 + RENextOffSz);
  2863. Tail(Result, NextNode); // OP_LOOPENTRY -> OP_LOOP
  2864. if regCode <> @regDummy then
  2865. Tail(Result + REOpSz + RENextOffSz, NextNode); // Atom -> OP_LOOP
  2866. {$ENDIF}
  2867. end;
  2868. procedure EmitSimpleBraces(ABracesMin, ABracesMax: TREBracesArg; ANonGreedyOp, APossessive: boolean);
  2869. begin
  2870. if APossessive then
  2871. TheOp := OP_BRACES_POSS
  2872. else
  2873. if ANonGreedyOp then
  2874. TheOp := OP_BRACESNG
  2875. else
  2876. TheOp := OP_BRACES;
  2877. InsertOperator(TheOp, Result, REOpSz + RENextOffSz + REBracesArgSz * 2);
  2878. if regCode <> @regDummy then
  2879. begin
  2880. PREBracesArg(AlignToInt(Result + REOpSz + RENextOffSz))^ := ABracesMin;
  2881. PREBracesArg(AlignToInt(Result + REOpSz + RENextOffSz + REBracesArgSz))^ := ABracesMax;
  2882. end;
  2883. end;
  2884. var
  2885. op, nextch: REChar;
  2886. NonGreedyOp, NonGreedyCh, PossessiveCh: boolean;
  2887. FlagTemp: integer;
  2888. BracesMin, BracesMax: TREBracesArg;
  2889. p: PRegExprChar;
  2890. begin
  2891. FlagTemp := 0;
  2892. Result := ParseAtom(FlagTemp);
  2893. if Result = nil then
  2894. Exit;
  2895. op := regParse^;
  2896. if not ((op = '*') or (op = '+') or (op = '?') or (op = '{')) then
  2897. begin
  2898. FlagParse := FlagTemp;
  2899. Exit;
  2900. end;
  2901. if ((FlagTemp and FLAG_HASWIDTH) = 0) and (op <> '?') then
  2902. begin
  2903. Error(reePlusStarOperandCouldBeEmpty);
  2904. Exit;
  2905. end;
  2906. case op of
  2907. '*':
  2908. begin
  2909. FlagParse := FLAG_WORST or FLAG_SPECSTART;
  2910. nextch := (regParse + 1)^;
  2911. PossessiveCh := nextch = '+';
  2912. if PossessiveCh then
  2913. begin
  2914. NonGreedyCh := False;
  2915. NonGreedyOp := False;
  2916. end
  2917. else
  2918. begin
  2919. NonGreedyCh := nextch = '?';
  2920. NonGreedyOp := NonGreedyCh or not fCompModifiers.G;
  2921. end;
  2922. if (FlagTemp and FLAG_SIMPLE) = 0 then
  2923. begin
  2924. if NonGreedyOp then
  2925. EmitComplexBraces(0, MaxBracesArg, NonGreedyOp)
  2926. else
  2927. begin // Emit x* as (x&|), where & means "self".
  2928. InsertOperator(OP_BRANCH, Result, REOpSz + RENextOffSz); // Either x
  2929. OpTail(Result, EmitNode(OP_BACK)); // and loop
  2930. OpTail(Result, Result); // back
  2931. Tail(Result, EmitNode(OP_BRANCH)); // or
  2932. Tail(Result, EmitNode(OP_NOTHING)); // nil.
  2933. end
  2934. end
  2935. else
  2936. begin // Simple
  2937. if PossessiveCh then
  2938. TheOp := OP_STAR_POSS
  2939. else
  2940. if NonGreedyOp then
  2941. TheOp := OP_STARNG
  2942. else
  2943. TheOp := OP_STAR;
  2944. InsertOperator(TheOp, Result, REOpSz + RENextOffSz);
  2945. end;
  2946. if NonGreedyCh or PossessiveCh then
  2947. Inc(regParse); // Skip extra AnsiChar ('?')
  2948. end; { of case '*' }
  2949. '+':
  2950. begin
  2951. FlagParse := FLAG_WORST or FLAG_SPECSTART or FLAG_HASWIDTH;
  2952. nextch := (regParse + 1)^;
  2953. PossessiveCh := nextch = '+';
  2954. if PossessiveCh then
  2955. begin
  2956. NonGreedyCh := False;
  2957. NonGreedyOp := False;
  2958. end
  2959. else
  2960. begin
  2961. NonGreedyCh := nextch = '?';
  2962. NonGreedyOp := NonGreedyCh or not fCompModifiers.G;
  2963. end;
  2964. if (FlagTemp and FLAG_SIMPLE) = 0 then
  2965. begin
  2966. if NonGreedyOp then
  2967. EmitComplexBraces(1, MaxBracesArg, NonGreedyOp)
  2968. else
  2969. begin // Emit x+ as x(&|), where & means "self".
  2970. NextNode := EmitNode(OP_BRANCH); // Either
  2971. Tail(Result, NextNode);
  2972. Tail(EmitNode(OP_BACK), Result); // loop back
  2973. Tail(NextNode, EmitNode(OP_BRANCH)); // or
  2974. Tail(Result, EmitNode(OP_NOTHING)); // nil.
  2975. end
  2976. end
  2977. else
  2978. begin // Simple
  2979. if PossessiveCh then
  2980. TheOp := OP_PLUS_POSS
  2981. else
  2982. if NonGreedyOp then
  2983. TheOp := OP_PLUSNG
  2984. else
  2985. TheOp := OP_PLUS;
  2986. InsertOperator(TheOp, Result, REOpSz + RENextOffSz);
  2987. end;
  2988. if NonGreedyCh or PossessiveCh then
  2989. Inc(regParse); // Skip extra AnsiChar ('?')
  2990. end; { of case '+' }
  2991. '?':
  2992. begin
  2993. FlagParse := FLAG_WORST;
  2994. nextch := (regParse + 1)^;
  2995. PossessiveCh := nextch = '+';
  2996. if PossessiveCh then
  2997. begin
  2998. NonGreedyCh := False;
  2999. NonGreedyOp := False;
  3000. end
  3001. else
  3002. begin
  3003. NonGreedyCh := nextch = '?';
  3004. NonGreedyOp := NonGreedyCh or not fCompModifiers.G;
  3005. end;
  3006. if NonGreedyOp or PossessiveCh then
  3007. begin // ###0.940 // We emit x?? as x{0,1}?
  3008. if (FlagTemp and FLAG_SIMPLE) = 0 then
  3009. begin
  3010. if PossessiveCh then
  3011. Error(reePossessiveAfterComplexBraces);
  3012. EmitComplexBraces(0, 1, NonGreedyOp);
  3013. end
  3014. else
  3015. EmitSimpleBraces(0, 1, NonGreedyOp, PossessiveCh);
  3016. end
  3017. else
  3018. begin // greedy '?'
  3019. InsertOperator(OP_BRANCH, Result, REOpSz + RENextOffSz); // Either x
  3020. Tail(Result, EmitNode(OP_BRANCH)); // or
  3021. NextNode := EmitNode(OP_NOTHING); // nil.
  3022. Tail(Result, NextNode);
  3023. OpTail(Result, NextNode);
  3024. end;
  3025. if NonGreedyCh or PossessiveCh then
  3026. Inc(regParse); // Skip extra AnsiChar ('?')
  3027. end; { of case '?' }
  3028. '{':
  3029. begin
  3030. Inc(regParse);
  3031. p := regParse;
  3032. while IsDigitChar(regParse^) do // <min> MUST appear
  3033. Inc(regParse);
  3034. if (regParse^ <> '}') and (regParse^ <> ',') or (p = regParse) then
  3035. begin
  3036. Error(reeIncorrectBraces);
  3037. Exit;
  3038. end;
  3039. BracesMin := ParseNumber(p, regParse - 1);
  3040. if regParse^ = ',' then
  3041. begin
  3042. Inc(regParse);
  3043. p := regParse;
  3044. while IsDigitChar(regParse^) do
  3045. Inc(regParse);
  3046. if regParse^ <> '}' then
  3047. begin
  3048. Error(reeIncorrectBraces);
  3049. Exit;
  3050. end;
  3051. if p = regParse then
  3052. BracesMax := MaxBracesArg
  3053. else
  3054. BracesMax := ParseNumber(p, regParse - 1);
  3055. end
  3056. else
  3057. BracesMax := BracesMin; // {n} == {n,n}
  3058. if BracesMin > BracesMax then
  3059. begin
  3060. Error(reeBracesMinParamGreaterMax);
  3061. Exit;
  3062. end;
  3063. if BracesMin > 0 then
  3064. FlagParse := FLAG_WORST;
  3065. if BracesMax > 0 then
  3066. FlagParse := FlagParse or FLAG_HASWIDTH or FLAG_SPECSTART;
  3067. nextch := (regParse + 1)^;
  3068. PossessiveCh := nextch = '+';
  3069. if PossessiveCh then
  3070. begin
  3071. NonGreedyCh := False;
  3072. NonGreedyOp := False;
  3073. end
  3074. else
  3075. begin
  3076. NonGreedyCh := nextch = '?';
  3077. NonGreedyOp := NonGreedyCh or not fCompModifiers.G;
  3078. end;
  3079. if (FlagTemp and FLAG_SIMPLE) <> 0 then
  3080. EmitSimpleBraces(BracesMin, BracesMax, NonGreedyOp, PossessiveCh)
  3081. else
  3082. begin
  3083. if PossessiveCh then
  3084. Error(reePossessiveAfterComplexBraces);
  3085. EmitComplexBraces(BracesMin, BracesMax, NonGreedyOp);
  3086. end;
  3087. if NonGreedyCh or PossessiveCh then
  3088. Inc(regParse); // Skip extra AnsiChar '?'
  3089. end; // of case '{'
  3090. // else // here we can't be
  3091. end; { of case op }
  3092. Inc(regParse);
  3093. op := regParse^;
  3094. if (op = '*') or (op = '+') or (op = '?') or (op = '{') then
  3095. Error(reeNestedQuantif);
  3096. end; { of function TRegExpr.ParsePiece
  3097. -------------------------------------------------------------- }
  3098. function TRegExpr.HexDig(Ch: REChar): integer;
  3099. begin
  3100. case Ch of
  3101. '0' .. '9':
  3102. Result := Ord(Ch) - Ord('0');
  3103. 'a' .. 'f':
  3104. Result := Ord(Ch) - Ord('a') + 10;
  3105. 'A' .. 'F':
  3106. Result := Ord(Ch) - Ord('A') + 10;
  3107. else
  3108. begin
  3109. Result := 0;
  3110. Error(reeBadHexDigit);
  3111. end;
  3112. end;
  3113. end;
  3114. function TRegExpr.UnQuoteChar(var APtr, AEnd: PRegExprChar): REChar;
  3115. var
  3116. Ch: REChar;
  3117. begin
  3118. case APtr^ of
  3119. 't':
  3120. Result := #$9; // \t => tab (HT/TAB)
  3121. 'n':
  3122. Result := #$a; // \n => newline (NL)
  3123. 'r':
  3124. Result := #$d; // \r => carriage return (CR)
  3125. 'f':
  3126. Result := #$c; // \f => form feed (FF)
  3127. 'a':
  3128. Result := #$7; // \a => alarm (bell) (BEL)
  3129. 'e':
  3130. Result := #$1b; // \e => escape (ESC)
  3131. 'c':
  3132. begin // \cK => code for Ctrl+K
  3133. Result := #0;
  3134. Inc(APtr);
  3135. if APtr >= AEnd then
  3136. Error(reeNoLetterAfterBSlashC);
  3137. Ch := APtr^;
  3138. case Ch of
  3139. 'a' .. 'z':
  3140. Result := REChar(Ord(Ch) - Ord('a') + 1);
  3141. 'A' .. 'Z':
  3142. Result := REChar(Ord(Ch) - Ord('A') + 1);
  3143. else
  3144. Error(reeNoLetterAfterBSlashC);
  3145. end;
  3146. end;
  3147. 'x':
  3148. begin // \x: hex AnsiChar
  3149. Result := #0;
  3150. Inc(APtr);
  3151. if APtr >= AEnd then
  3152. begin
  3153. Error(reeNoHexCodeAfterBSlashX);
  3154. Exit;
  3155. end;
  3156. if APtr^ = '{' then
  3157. begin // \x{nnnn} //###0.936
  3158. repeat
  3159. Inc(APtr);
  3160. if APtr >= AEnd then
  3161. begin
  3162. Error(reeNoHexCodeAfterBSlashX);
  3163. Exit;
  3164. end;
  3165. if APtr^ <> '}' then
  3166. begin
  3167. if (Ord(Result) ShR (SizeOf(REChar) * 8 - 4)) and $F <> 0 then
  3168. begin
  3169. Error(reeHexCodeAfterBSlashXTooBig);
  3170. Exit;
  3171. end;
  3172. Result := REChar((Ord(Result) ShL 4) or HexDig(APtr^));
  3173. // HexDig will cause Error if bad hex digit found
  3174. end
  3175. else
  3176. Break;
  3177. until False;
  3178. end
  3179. else
  3180. begin
  3181. Result := REChar(HexDig(APtr^));
  3182. // HexDig will cause Error if bad hex digit found
  3183. Inc(APtr);
  3184. if APtr >= AEnd then
  3185. begin
  3186. Error(reeNoHexCodeAfterBSlashX);
  3187. Exit;
  3188. end;
  3189. Result := REChar((Ord(Result) ShL 4) or HexDig(APtr^));
  3190. // HexDig will cause Error if bad hex digit found
  3191. end;
  3192. end;
  3193. else
  3194. begin
  3195. Result := APtr^;
  3196. if (Result <> '_') and IsWordChar(Result) then
  3197. begin
  3198. fLastErrorSymbol := Result;
  3199. Error(reeUnknownMetaSymbol);
  3200. end;
  3201. end;
  3202. end;
  3203. end;
  3204. function TRegExpr.ParseAtom(var FlagParse: integer): PRegExprChar;
  3205. // the lowest level
  3206. // Optimization: gobbles an entire sequence of ordinary characters so that
  3207. // it can turn them into a single node, which is smaller to store and
  3208. // faster to run. Backslashed characters are exceptions, each becoming a
  3209. // separate node; the code is simpler that way and it's not worth fixing.
  3210. var
  3211. ret: PRegExprChar;
  3212. RangeBeg, RangeEnd: REChar;
  3213. CanBeRange: boolean;
  3214. AddrOfLen: PLongInt;
  3215. procedure EmitExactly(Ch: REChar);
  3216. begin
  3217. if fCompModifiers.I then
  3218. ret := EmitNode(OP_EXACTLYCI)
  3219. else
  3220. ret := EmitNode(OP_EXACTLY);
  3221. EmitInt(1);
  3222. EmitC(Ch);
  3223. FlagParse := FlagParse or FLAG_HASWIDTH or FLAG_SIMPLE;
  3224. end;
  3225. procedure EmitRangeChar(Ch: REChar; AStartOfRange: boolean);
  3226. begin
  3227. CanBeRange := AStartOfRange;
  3228. if fCompModifiers.I then
  3229. Ch := _UpperCase(Ch);
  3230. if AStartOfRange then
  3231. begin
  3232. AddrOfLen := nil;
  3233. RangeBeg := Ch;
  3234. end
  3235. else
  3236. begin
  3237. if AddrOfLen = nil then
  3238. begin
  3239. EmitC(OpKind_Char);
  3240. Pointer(AddrOfLen) := regCode;
  3241. EmitInt(0);
  3242. end;
  3243. Inc(AddrOfLen^);
  3244. EmitC(Ch);
  3245. end;
  3246. end;
  3247. procedure EmitRangePacked(ch1, ch2: REChar);
  3248. var
  3249. ChkIndex: integer;
  3250. begin
  3251. AddrOfLen := nil;
  3252. CanBeRange := False;
  3253. if fCompModifiers.I then
  3254. begin
  3255. ch1 := _UpperCase(ch1);
  3256. ch2 := _UpperCase(ch2);
  3257. end;
  3258. for ChkIndex := Low(CharCheckerInfos) to High(CharCheckerInfos) do
  3259. if (CharCheckerInfos[ChkIndex].CharBegin = ch1) and
  3260. (CharCheckerInfos[ChkIndex].CharEnd = ch2) then
  3261. begin
  3262. EmitC(OpKind_MetaClass);
  3263. EmitC(REChar(CharCheckerInfos[ChkIndex].CheckerIndex));
  3264. Exit;
  3265. end;
  3266. EmitC(OpKind_Range);
  3267. EmitC(ch1);
  3268. EmitC(ch2);
  3269. end;
  3270. {$IFDEF FastUnicodeData}
  3271. procedure EmitCategoryInCharClass(APositive: boolean);
  3272. var
  3273. ch, ch2: REChar;
  3274. begin
  3275. AddrOfLen := nil;
  3276. CanBeRange := False;
  3277. Inc(regParse);
  3278. FindCategoryName(regParse, ch, ch2);
  3279. if APositive then
  3280. EmitC(OpKind_CategoryYes)
  3281. else
  3282. EmitC(OpKind_CategoryNo);
  3283. EmitC(ch);
  3284. EmitC(ch2);
  3285. end;
  3286. {$ENDIF}
  3287. var
  3288. FlagTemp: integer;
  3289. Len: integer;
  3290. SavedPtr: PRegExprChar;
  3291. EnderChar, TempChar: REChar;
  3292. DashForRange: Boolean;
  3293. GrpKind: TREGroupKind;
  3294. GrpName: RegExprString;
  3295. GrpIndex: integer;
  3296. NextCh: REChar;
  3297. begin
  3298. Result := nil;
  3299. FlagTemp := 0;
  3300. FlagParse := FLAG_WORST;
  3301. AddrOfLen := nil;
  3302. GrpIndex := -1;
  3303. Inc(regParse);
  3304. case (regParse - 1)^ of
  3305. '^':
  3306. begin
  3307. if not fCompModifiers.M
  3308. {$IFDEF UseLineSep} or (fLineSeparators = '') {$ENDIF} then
  3309. ret := EmitNode(OP_BOL)
  3310. else
  3311. ret := EmitNode(OP_BOLML);
  3312. end;
  3313. '$':
  3314. begin
  3315. if not fCompModifiers.M
  3316. {$IFDEF UseLineSep} or (fLineSeparators = '') {$ENDIF} then
  3317. ret := EmitNode(OP_EOL)
  3318. else
  3319. ret := EmitNode(OP_EOLML);
  3320. end;
  3321. '.':
  3322. begin
  3323. if fCompModifiers.S then
  3324. begin
  3325. ret := EmitNode(OP_ANY);
  3326. FlagParse := FlagParse or FLAG_HASWIDTH or FLAG_SIMPLE;
  3327. end
  3328. else
  3329. begin // not /s, so emit [^:LineSeparators:]
  3330. ret := EmitNode(OP_ANYML);
  3331. FlagParse := FlagParse or FLAG_HASWIDTH; // not so simple ;)
  3332. end;
  3333. end;
  3334. '[':
  3335. begin
  3336. if regParse^ = '^' then
  3337. begin // Complement of range.
  3338. if fCompModifiers.I then
  3339. ret := EmitNode(OP_ANYBUTCI)
  3340. else
  3341. ret := EmitNode(OP_ANYBUT);
  3342. Inc(regParse);
  3343. end
  3344. else if fCompModifiers.I then
  3345. ret := EmitNode(OP_ANYOFCI)
  3346. else
  3347. ret := EmitNode(OP_ANYOF);
  3348. CanBeRange := False;
  3349. if regParse^ = ']' then
  3350. begin
  3351. // first ']' inside [] treated as simple AnsiChar, no need to check '['
  3352. EmitRangeChar(regParse^, (regParse + 1)^ = '-');
  3353. Inc(regParse);
  3354. end;
  3355. while (regParse < fRegexEnd) and (regParse^ <> ']') do
  3356. begin
  3357. // last '-' inside [] treated as simple dash
  3358. if (regParse^ = '-') and
  3359. ((regParse + 1) < fRegexEnd) and
  3360. ((regParse + 1)^ = ']') then
  3361. begin
  3362. EmitRangeChar('-', False);
  3363. Inc(regParse);
  3364. Break;
  3365. end;
  3366. // AnsiChar '-' which (maybe) makes a range
  3367. if (regParse^ = '-') and ((regParse + 1) < fRegexEnd) and CanBeRange then
  3368. begin
  3369. Inc(regParse);
  3370. RangeEnd := regParse^;
  3371. if RangeEnd = EscChar then
  3372. begin
  3373. if _IsMetaChar((regParse + 1)^) then
  3374. begin
  3375. Error(reeMetaCharAfterMinusInRange);
  3376. Exit;
  3377. end;
  3378. Inc(regParse);
  3379. RangeEnd := UnQuoteChar(regParse, fRegexEnd);
  3380. end;
  3381. // special handling for Russian range a-YA, add 2 ranges: a-ya and A-YA
  3382. if fCompModifiers.R and
  3383. (RangeBeg = RusRangeLoLow) and (RangeEnd = RusRangeHiHigh) then
  3384. begin
  3385. EmitRangePacked(RusRangeLoLow, RusRangeLoHigh);
  3386. EmitRangePacked(RusRangeHiLow, RusRangeHiHigh);
  3387. end
  3388. else
  3389. begin // standard r.e. handling
  3390. if RangeBeg > RangeEnd then
  3391. begin
  3392. Error(reeInvalidRange);
  3393. Exit;
  3394. end;
  3395. EmitRangePacked(RangeBeg, RangeEnd);
  3396. end;
  3397. Inc(regParse);
  3398. end
  3399. else
  3400. begin
  3401. if regParse^ = EscChar then
  3402. begin
  3403. Inc(regParse);
  3404. if regParse >= fRegexEnd then
  3405. begin
  3406. Error(reeParseAtomTrailingBackSlash);
  3407. Exit;
  3408. end;
  3409. if _IsMetaChar(regParse^) then
  3410. begin
  3411. AddrOfLen := nil;
  3412. CanBeRange := False;
  3413. EmitC(OpKind_MetaClass);
  3414. case regParse^ of
  3415. 'w':
  3416. EmitC(REChar(CheckerIndex_Word));
  3417. 'W':
  3418. EmitC(REChar(CheckerIndex_NotWord));
  3419. 's':
  3420. EmitC(REChar(CheckerIndex_Space));
  3421. 'S':
  3422. EmitC(REChar(CheckerIndex_NotSpace));
  3423. 'd':
  3424. EmitC(REChar(CheckerIndex_Digit));
  3425. 'D':
  3426. EmitC(REChar(CheckerIndex_NotDigit));
  3427. 'v':
  3428. EmitC(REChar(CheckerIndex_VertSep));
  3429. 'V':
  3430. EmitC(REChar(CheckerIndex_NotVertSep));
  3431. 'h':
  3432. EmitC(REChar(CheckerIndex_HorzSep));
  3433. 'H':
  3434. EmitC(REChar(CheckerIndex_NotHorzSep));
  3435. else
  3436. Error(reeBadOpcodeInCharClass);
  3437. end;
  3438. end
  3439. else
  3440. {$IFDEF FastUnicodeData}
  3441. if regParse^ = 'p' then
  3442. EmitCategoryInCharClass(True)
  3443. else
  3444. if regParse^ = 'P' then
  3445. EmitCategoryInCharClass(False)
  3446. else
  3447. {$ENDIF}
  3448. begin
  3449. TempChar := UnQuoteChar(regParse, fRegexEnd);
  3450. // False if '-' is last AnsiChar in []
  3451. DashForRange :=
  3452. (regParse + 2 < fRegexEnd) and
  3453. ((regParse + 1)^ = '-') and
  3454. ((regParse + 2)^ <> ']');
  3455. EmitRangeChar(TempChar, DashForRange);
  3456. end;
  3457. end
  3458. else
  3459. begin
  3460. // False if '-' is last AnsiChar in []
  3461. DashForRange :=
  3462. (regParse + 2 < fRegexEnd) and
  3463. ((regParse + 1)^ = '-') and
  3464. ((regParse + 2)^ <> ']');
  3465. EmitRangeChar(regParse^, DashForRange);
  3466. end;
  3467. Inc(regParse);
  3468. end;
  3469. end; { of while }
  3470. AddrOfLen := nil;
  3471. CanBeRange := False;
  3472. EmitC(OpKind_End);
  3473. if regParse^ <> ']' then
  3474. begin
  3475. Error(reeUnmatchedSqBrackets);
  3476. Exit;
  3477. end;
  3478. Inc(regParse);
  3479. FlagParse := FlagParse or FLAG_HASWIDTH or FLAG_SIMPLE;
  3480. end;
  3481. '(':
  3482. begin
  3483. GrpKind := gkNormalGroup;
  3484. GrpName := '';
  3485. // A: detect kind of expression in brackets
  3486. if regParse^ = '?' then
  3487. begin
  3488. NextCh := (regParse + 1)^;
  3489. case NextCh of
  3490. ':':
  3491. begin
  3492. // non-capturing group: (?:regex)
  3493. GrpKind := gkNonCapturingGroup;
  3494. Inc(regParse, 2);
  3495. end;
  3496. '>':
  3497. begin
  3498. // atomic group: (?>regex)
  3499. GrpKind := gkNonCapturingGroup;
  3500. Inc(regParse, 2);
  3501. GrpAtomic[regNumBrackets] := True;
  3502. end;
  3503. 'P':
  3504. begin
  3505. if (regParse + 4 >= fRegexEnd) then
  3506. Error(reeNamedGroupBad);
  3507. case (regParse + 2)^ of
  3508. '<':
  3509. begin
  3510. // named group: (?P<name>regex)
  3511. GrpKind := gkNormalGroup;
  3512. FindGroupName(regParse + 3, fRegexEnd, '>', GrpName);
  3513. Inc(regParse, Length(GrpName) + 4);
  3514. end;
  3515. '=':
  3516. begin
  3517. // back-reference to named group: (?P=name)
  3518. GrpKind := gkNamedGroupReference;
  3519. FindGroupName(regParse + 3, fRegexEnd, ')', GrpName);
  3520. Inc(regParse, Length(GrpName) + 4);
  3521. end;
  3522. '>':
  3523. begin
  3524. // subroutine call to named group: (?P>name)
  3525. GrpKind := gkSubCall;
  3526. FindGroupName(regParse + 3, fRegexEnd, ')', GrpName);
  3527. Inc(regParse, Length(GrpName) + 4);
  3528. GrpIndex := MatchIndexFromName(GrpName);
  3529. if GrpIndex < 1 then
  3530. Error(reeNamedGroupBadRef);
  3531. end;
  3532. else
  3533. Error(reeNamedGroupBad);
  3534. end;
  3535. end;
  3536. '<':
  3537. begin
  3538. // lookbehind: (?<=foo)bar
  3539. if (regParse + 4 >= fRegexEnd) then
  3540. Error(reeLookbehindBad);
  3541. case (regParse + 2)^ of
  3542. '=':
  3543. begin
  3544. // allow lookbehind only at the beginning
  3545. if regParse <> fRegexStart + 1 then
  3546. Error(reeLookaroundNotAtEdge);
  3547. GrpKind := gkLookbehind;
  3548. GrpAtomic[regNumBrackets] := RegExprLookbehindIsAtomic;
  3549. regLookbehind := True;
  3550. Inc(regParse, 3);
  3551. end;
  3552. '!':
  3553. begin
  3554. // allow lookbehind only at the beginning
  3555. if regParse <> fRegexStart + 1 then
  3556. Error(reeLookaroundNotAtEdge);
  3557. GrpKind := gkLookbehindNeg;
  3558. Inc(regParse, 3);
  3559. SavedPtr := _FindClosingBracket(regParse, fRegexEnd);
  3560. if SavedPtr = nil then
  3561. Error(reeCompParseRegUnmatchedBrackets);
  3562. // for '(?<!foo)bar', we make our regex 'bar' and make Helper object with 'foo'
  3563. if not fSecondPass then
  3564. begin
  3565. Len := SavedPtr - fRegexStart - 4;
  3566. if Len = 0 then
  3567. Error(reeLookbehindBad);
  3568. if fHelper = nil then
  3569. fHelper := TRegExpr.Create;
  3570. fHelper.Expression := Copy(fExpression, 5, Len);
  3571. try
  3572. fHelper.Compile;
  3573. except
  3574. Len := fHelper.LastError;
  3575. FreeAndNil(fHelper);
  3576. Error(Len);
  3577. end;
  3578. if fHelper.IsFixedLength(TempChar, Len) then
  3579. fHelperLen := Len
  3580. else
  3581. begin
  3582. FreeAndNil(fHelper);
  3583. Error(reeLookbehindTooComplex);
  3584. end;
  3585. end;
  3586. // jump to closing bracket, don't make opcode for (?<!foo)
  3587. regParse := SavedPtr + 1;
  3588. end;
  3589. else
  3590. Error(reeLookbehindBad);
  3591. end;
  3592. end;
  3593. '=', '!':
  3594. begin
  3595. // lookaheads: foo(?=bar) and foo(?!bar)
  3596. if (regParse + 3 >= fRegexEnd) then
  3597. Error(reeLookaheadBad);
  3598. regLookahead := True;
  3599. regLookaheadGroup := regNumBrackets;
  3600. if NextCh = '=' then
  3601. begin
  3602. GrpKind := gkLookahead;
  3603. end
  3604. else
  3605. begin
  3606. GrpKind := gkLookaheadNeg;
  3607. regLookaheadNeg := True;
  3608. end;
  3609. GrpAtomic[regNumBrackets] := RegExprLookaheadIsAtomic;
  3610. // check that these brackets are last in regex
  3611. SavedPtr := _FindClosingBracket(regParse + 1, fRegexEnd);
  3612. if (SavedPtr <> fRegexEnd - 1) then
  3613. Error(reeLookaroundNotAtEdge);
  3614. Inc(regParse, 2);
  3615. end;
  3616. '#':
  3617. begin
  3618. // (?#comment)
  3619. GrpKind := gkComment;
  3620. Inc(regParse, 2);
  3621. end;
  3622. 'a'..'z', '-':
  3623. begin
  3624. // modifiers string like (?mxr)
  3625. GrpKind := gkModifierString;
  3626. Inc(regParse);
  3627. end;
  3628. 'R', '0':
  3629. begin
  3630. // recursion (?R), (?0)
  3631. GrpKind := gkRecursion;
  3632. Inc(regParse, 2);
  3633. if regParse^ <> ')' then
  3634. Error(reeBadRecursion);
  3635. Inc(regParse);
  3636. end;
  3637. '1'..'9':
  3638. begin
  3639. // subroutine call (?1)..(?99)
  3640. GrpKind := gkSubCall;
  3641. GrpIndex := Ord(NextCh) - Ord('0');
  3642. Inc(regParse, 2);
  3643. // support 2-digit group numbers
  3644. case regParse^ of
  3645. ')':
  3646. begin
  3647. Inc(regParse);
  3648. end;
  3649. '0'..'9':
  3650. begin
  3651. GrpIndex := GrpIndex * 10 + Ord(regParse^) - Ord('0');
  3652. if GrpIndex >= RegexMaxGroups then
  3653. Error(reeBadSubCall);
  3654. Inc(regParse);
  3655. if regParse^ <> ')' then
  3656. Error(reeBadSubCall);
  3657. Inc(regParse);
  3658. end
  3659. else
  3660. Error(reeBadRecursion);
  3661. end;
  3662. end;
  3663. '''':
  3664. begin
  3665. // named group: (?'name'regex)
  3666. if (regParse + 4 >= fRegexEnd) then
  3667. Error(reeNamedGroupBad);
  3668. GrpKind := gkNormalGroup;
  3669. FindGroupName(regParse + 2, fRegexEnd, '''', GrpName);
  3670. Inc(regParse, Length(GrpName) + 3);
  3671. end;
  3672. '&':
  3673. begin
  3674. // subroutine call to named group: (?&name)
  3675. if (regParse + 2 >= fRegexEnd) then
  3676. Error(reeBadSubCall);
  3677. GrpKind := gkSubCall;
  3678. FindGroupName(regParse + 2, fRegexEnd, ')', GrpName);
  3679. Inc(regParse, Length(GrpName) + 3);
  3680. GrpIndex := MatchIndexFromName(GrpName);
  3681. if GrpIndex < 1 then
  3682. Error(reeNamedGroupBadRef);
  3683. end;
  3684. else
  3685. Error(reeIncorrectSpecialBrackets);
  3686. end;
  3687. end;
  3688. // B: process found kind of brackets
  3689. case GrpKind of
  3690. gkNormalGroup,
  3691. gkNonCapturingGroup,
  3692. gkLookahead,
  3693. gkLookaheadNeg,
  3694. gkLookbehind:
  3695. begin
  3696. // skip this block for one of passes, to not double groups count;
  3697. // must take first pass (we need GrpNames filled)
  3698. if (GrpKind = gkNormalGroup) and not fSecondPass then
  3699. if GrpCount < RegexMaxGroups - 1 then
  3700. begin
  3701. Inc(GrpCount);
  3702. GrpIndexes[GrpCount] := regNumBrackets;
  3703. if GrpName <> '' then
  3704. begin
  3705. if MatchIndexFromName(GrpName) >= 0 then
  3706. Error(reeNamedGroupDupName);
  3707. GrpNames[GrpCount] := GrpName;
  3708. end;
  3709. end;
  3710. ret := ParseReg(True, FlagTemp);
  3711. if ret = nil then
  3712. begin
  3713. Result := nil;
  3714. Exit;
  3715. end;
  3716. FlagParse := FlagParse or FlagTemp and (FLAG_HASWIDTH or FLAG_SPECSTART);
  3717. end;
  3718. gkLookbehindNeg:
  3719. begin
  3720. // don't make opcode
  3721. ret := EmitNode(OP_COMMENT);
  3722. FlagParse := FLAG_WORST;
  3723. end;
  3724. gkNamedGroupReference:
  3725. begin
  3726. Len := MatchIndexFromName(GrpName);
  3727. if Len < 0 then
  3728. Error(reeNamedGroupBadRef);
  3729. ret := EmitGroupRef(Len, fCompModifiers.I);
  3730. FlagParse := FlagParse or FLAG_HASWIDTH or FLAG_SIMPLE;
  3731. end;
  3732. gkModifierString:
  3733. begin
  3734. SavedPtr := regParse;
  3735. while (regParse < fRegexEnd) and (regParse^ <> ')') do
  3736. Inc(regParse);
  3737. if (regParse^ <> ')') or
  3738. not ParseModifiers(SavedPtr, regParse - SavedPtr, fCompModifiers) then
  3739. begin
  3740. Error(reeUnrecognizedModifier);
  3741. Exit;
  3742. end;
  3743. Inc(regParse); // skip ')'
  3744. ret := EmitNode(OP_COMMENT); // comment
  3745. // Error (reeQuantifFollowsNothing);
  3746. // Exit;
  3747. end;
  3748. gkComment:
  3749. begin
  3750. while (regParse < fRegexEnd) and (regParse^ <> ')') do
  3751. Inc(regParse);
  3752. if regParse^ <> ')' then
  3753. begin
  3754. Error(reeUnclosedComment);
  3755. Exit;
  3756. end;
  3757. Inc(regParse); // skip ')'
  3758. ret := EmitNode(OP_COMMENT); // comment
  3759. end;
  3760. gkRecursion:
  3761. begin
  3762. // set FLAG_HASWIDTH to allow compiling of such regex: b(?:m|(?R))*e
  3763. FlagParse := FlagParse or FLAG_HASWIDTH;
  3764. ret := EmitNode(OP_RECUR);
  3765. end;
  3766. gkSubCall:
  3767. begin
  3768. // set FLAG_HASWIDTH like for (?R)
  3769. FlagParse := FlagParse or FLAG_HASWIDTH;
  3770. ret := EmitNode(TReOp(Ord(OP_SUBCALL) + GrpIndex));
  3771. end;
  3772. end; // case GrpKind of
  3773. end;
  3774. '|', ')':
  3775. begin // Supposed to be caught earlier.
  3776. Error(reeInternalUrp);
  3777. Exit;
  3778. end;
  3779. '?', '+', '*':
  3780. begin
  3781. Error(reeQuantifFollowsNothing);
  3782. Exit;
  3783. end;
  3784. EscChar:
  3785. begin
  3786. if regParse >= fRegexEnd then
  3787. begin
  3788. Error(reeTrailingBackSlash);
  3789. Exit;
  3790. end;
  3791. case regParse^ of
  3792. 'b':
  3793. ret := EmitNode(OP_BOUND);
  3794. 'B':
  3795. ret := EmitNode(OP_NOTBOUND);
  3796. 'A':
  3797. ret := EmitNode(OP_BOL);
  3798. 'z':
  3799. ret := EmitNode(OP_EOL);
  3800. 'Z':
  3801. ret := EmitNode(OP_EOL2);
  3802. 'd':
  3803. begin // r.e.extension - any digit ('0' .. '9')
  3804. ret := EmitNode(OP_ANYDIGIT);
  3805. FlagParse := FlagParse or FLAG_HASWIDTH or FLAG_SIMPLE;
  3806. end;
  3807. 'D':
  3808. begin // r.e.extension - not digit ('0' .. '9')
  3809. ret := EmitNode(OP_NOTDIGIT);
  3810. FlagParse := FlagParse or FLAG_HASWIDTH or FLAG_SIMPLE;
  3811. end;
  3812. 's':
  3813. begin // r.e.extension - any space AnsiChar
  3814. ret := EmitNode(OP_ANYSPACE);
  3815. FlagParse := FlagParse or FLAG_HASWIDTH or FLAG_SIMPLE;
  3816. end;
  3817. 'S':
  3818. begin // r.e.extension - not space AnsiChar
  3819. ret := EmitNode(OP_NOTSPACE);
  3820. FlagParse := FlagParse or FLAG_HASWIDTH or FLAG_SIMPLE;
  3821. end;
  3822. 'w':
  3823. begin // r.e.extension - any english AnsiChar / digit / '_'
  3824. ret := EmitNode(OP_ANYLETTER);
  3825. FlagParse := FlagParse or FLAG_HASWIDTH or FLAG_SIMPLE;
  3826. end;
  3827. 'W':
  3828. begin // r.e.extension - not english AnsiChar / digit / '_'
  3829. ret := EmitNode(OP_NOTLETTER);
  3830. FlagParse := FlagParse or FLAG_HASWIDTH or FLAG_SIMPLE;
  3831. end;
  3832. 'v':
  3833. begin
  3834. ret := EmitNode(OP_ANYVERTSEP);
  3835. FlagParse := FlagParse or FLAG_HASWIDTH or FLAG_SIMPLE;
  3836. end;
  3837. 'V':
  3838. begin
  3839. ret := EmitNode(OP_NOTVERTSEP);
  3840. FlagParse := FlagParse or FLAG_HASWIDTH or FLAG_SIMPLE;
  3841. end;
  3842. 'h':
  3843. begin
  3844. ret := EmitNode(OP_ANYHORZSEP);
  3845. FlagParse := FlagParse or FLAG_HASWIDTH or FLAG_SIMPLE;
  3846. end;
  3847. 'H':
  3848. begin
  3849. ret := EmitNode(OP_NOTHORZSEP);
  3850. FlagParse := FlagParse or FLAG_HASWIDTH or FLAG_SIMPLE;
  3851. end;
  3852. '1' .. '9':
  3853. begin
  3854. ret := EmitGroupRef(Ord(regParse^) - Ord('0'), fCompModifiers.I);
  3855. FlagParse := FlagParse or FLAG_HASWIDTH or FLAG_SIMPLE;
  3856. end;
  3857. {$IFDEF FastUnicodeData}
  3858. 'p':
  3859. begin
  3860. ret := EmitCategoryMain(True);
  3861. FlagParse := FlagParse or FLAG_HASWIDTH or FLAG_SIMPLE;
  3862. end;
  3863. 'P':
  3864. begin
  3865. ret := EmitCategoryMain(False);
  3866. FlagParse := FlagParse or FLAG_HASWIDTH or FLAG_SIMPLE;
  3867. end;
  3868. {$ENDIF}
  3869. else
  3870. EmitExactly(UnQuoteChar(regParse, fRegexEnd));
  3871. end; { of case }
  3872. Inc(regParse);
  3873. end;
  3874. else
  3875. begin
  3876. Dec(regParse);
  3877. if fCompModifiers.X and // check for eXtended syntax
  3878. ((regParse^ = '#') or IsIgnoredChar(regParse^)) then
  3879. begin // ###0.941 \x
  3880. if regParse^ = '#' then
  3881. begin // Skip eXtended comment
  3882. // find comment terminator (group of \n and/or \r)
  3883. while (regParse < fRegexEnd) and (regParse^ <> #$d) and
  3884. (regParse^ <> #$a) do
  3885. Inc(regParse);
  3886. while (regParse^ = #$d) or (regParse^ = #$a)
  3887. // skip comment terminator
  3888. do
  3889. Inc(regParse);
  3890. // attempt to support different type of line separators
  3891. end
  3892. else
  3893. begin // Skip the blanks!
  3894. while IsIgnoredChar(regParse^) do
  3895. Inc(regParse);
  3896. end;
  3897. ret := EmitNode(OP_COMMENT); // comment
  3898. end
  3899. else
  3900. begin
  3901. Len := FindSkippedMetaLen(regParse, fRegexEnd);
  3902. if Len <= 0 then
  3903. if regParse^ <> '{' then
  3904. begin
  3905. Error(reeRarseAtomInternalDisaster);
  3906. Exit;
  3907. end
  3908. else
  3909. Len := FindSkippedMetaLen(regParse + 1, fRegexEnd) + 1;
  3910. // bad {n,m} - compile as EXACTLY
  3911. EnderChar := (regParse + Len)^;
  3912. if (Len > 1) and ((EnderChar = '*') or (EnderChar = '+') or (EnderChar = '?') or (EnderChar = '{')) then
  3913. Dec(Len); // back off clear of ?+*{ operand.
  3914. FlagParse := FlagParse or FLAG_HASWIDTH;
  3915. if Len = 1 then
  3916. FlagParse := FlagParse or FLAG_SIMPLE;
  3917. if fCompModifiers.I then
  3918. ret := EmitNode(OP_EXACTLYCI)
  3919. else
  3920. ret := EmitNode(OP_EXACTLY);
  3921. EmitInt(0);
  3922. while (Len > 0) and ((not fCompModifiers.X) or (regParse^ <> '#')) do
  3923. begin
  3924. if not fCompModifiers.X or not IsIgnoredChar(regParse^) then
  3925. begin
  3926. EmitC(regParse^);
  3927. if regCode <> @regDummy then
  3928. Inc(regExactlyLen^);
  3929. end;
  3930. Inc(regParse);
  3931. Dec(Len);
  3932. end;
  3933. end; { of if not comment }
  3934. end; { of case else }
  3935. end; { of case }
  3936. Result := ret;
  3937. end; { of function TRegExpr.ParseAtom
  3938. -------------------------------------------------------------- }
  3939. function TRegExpr.GetCompilerErrorPos: PtrInt;
  3940. begin
  3941. Result := 0;
  3942. if (fRegexStart = nil) or (regParse = nil) then
  3943. Exit; // not in compiling mode ?
  3944. Result := regParse - fRegexStart;
  3945. end; { of function TRegExpr.GetCompilerErrorPos
  3946. -------------------------------------------------------------- }
  3947. { ============================================================= }
  3948. { ===================== Matching section ====================== }
  3949. { ============================================================= }
  3950. procedure TRegExpr.FindGroupName(APtr, AEndPtr: PRegExprChar; AEndChar: REChar; var AName: RegExprString);
  3951. // check that group name is valid identifier, started from non-digit
  3952. // this is to be like in Python regex
  3953. var
  3954. P: PRegExprChar;
  3955. begin
  3956. P := APtr;
  3957. if IsDigitChar(P^) or not IsWordChar(P^) then
  3958. Error(reeNamedGroupBadName);
  3959. repeat
  3960. if P >= AEndPtr then
  3961. Error(reeNamedGroupBad);
  3962. if P^ = AEndChar then
  3963. Break;
  3964. if not IsWordChar(P^) then
  3965. Error(reeNamedGroupBadName);
  3966. Inc(P);
  3967. until False;
  3968. SetString(AName, APtr, P-APtr);
  3969. end;
  3970. function TRegExpr.FindRepeated(p: PRegExprChar; AMax: integer): integer;
  3971. // repeatedly match something simple, report how many
  3972. // p: points to current opcode
  3973. var
  3974. scan: PRegExprChar;
  3975. opnd: PRegExprChar;
  3976. TheMax: PtrInt; // PtrInt, gets diff of 2 pointers
  3977. InvChar: REChar;
  3978. CurStart, CurEnd: PRegExprChar;
  3979. ArrayIndex: integer;
  3980. {$IFDEF UnicodeEx}
  3981. i: integer;
  3982. {$ENDIF}
  3983. begin
  3984. Result := 0;
  3985. scan := regInput; // points into InputString
  3986. opnd := p + REOpSz + RENextOffSz; // points to operand of opcode (after OP_nnn code)
  3987. TheMax := fInputEnd - scan;
  3988. if TheMax > AMax then
  3989. TheMax := AMax;
  3990. case PREOp(p)^ of
  3991. OP_ANY:
  3992. begin
  3993. // note - OP_ANYML cannot be proceeded in FindRepeated because can skip
  3994. // more than one AnsiChar at once
  3995. {$IFDEF UnicodeEx}
  3996. for i := 1 to TheMax do
  3997. IncUnicode2(scan, Result);
  3998. {$ELSE}
  3999. Result := TheMax;
  4000. Inc(scan, Result);
  4001. {$ENDIF}
  4002. end;
  4003. OP_EXACTLY:
  4004. begin // in opnd can be only ONE AnsiChar !!!
  4005. {
  4006. // Alexey: commented because of https://github.com/andgineer/TRegExpr/issues/145
  4007. NLen := PLongInt(opnd)^;
  4008. if TheMax > NLen then
  4009. TheMax := NLen;
  4010. }
  4011. Inc(opnd, RENumberSz);
  4012. while (Result < TheMax) and (opnd^ = scan^) do
  4013. begin
  4014. Inc(Result);
  4015. Inc(scan);
  4016. end;
  4017. end;
  4018. OP_EXACTLYCI:
  4019. begin // in opnd can be only ONE AnsiChar !!!
  4020. {
  4021. // Alexey: commented because of https://github.com/andgineer/TRegExpr/issues/145
  4022. NLen := PLongInt(opnd)^;
  4023. if TheMax > NLen then
  4024. TheMax := NLen;
  4025. }
  4026. Inc(opnd, RENumberSz);
  4027. while (Result < TheMax) and (opnd^ = scan^) do
  4028. begin // prevent unneeded InvertCase //###0.931
  4029. Inc(Result);
  4030. Inc(scan);
  4031. end;
  4032. if Result < TheMax then
  4033. begin // ###0.931
  4034. InvChar := InvertCase(opnd^); // store in register
  4035. while (Result < TheMax) and ((opnd^ = scan^) or (InvChar = scan^)) do
  4036. begin
  4037. Inc(Result);
  4038. Inc(scan);
  4039. end;
  4040. end;
  4041. end;
  4042. OP_BSUBEXP:
  4043. begin // ###0.936
  4044. ArrayIndex := GrpIndexes[Ord(opnd^)];
  4045. if ArrayIndex < 0 then
  4046. Exit;
  4047. CurStart := GrpBounds[regRecursion].GrpStart[ArrayIndex];
  4048. if CurStart = nil then
  4049. Exit;
  4050. CurEnd := GrpBounds[regRecursion].GrpEnd[ArrayIndex];
  4051. if CurEnd = nil then
  4052. Exit;
  4053. repeat
  4054. opnd := CurStart;
  4055. while opnd < CurEnd do
  4056. begin
  4057. if (scan >= fInputEnd) or (scan^ <> opnd^) then
  4058. Exit;
  4059. Inc(scan);
  4060. Inc(opnd);
  4061. end;
  4062. Inc(Result);
  4063. regInput := scan;
  4064. until Result >= AMax;
  4065. end;
  4066. OP_BSUBEXPCI:
  4067. begin // ###0.936
  4068. ArrayIndex := GrpIndexes[Ord(opnd^)];
  4069. if ArrayIndex < 0 then
  4070. Exit;
  4071. CurStart := GrpBounds[regRecursion].GrpStart[ArrayIndex];
  4072. if CurStart = nil then
  4073. Exit;
  4074. CurEnd := GrpBounds[regRecursion].GrpEnd[ArrayIndex];
  4075. if CurEnd = nil then
  4076. Exit;
  4077. repeat
  4078. opnd := CurStart;
  4079. while opnd < CurEnd do
  4080. begin
  4081. if (scan >= fInputEnd) or
  4082. ((scan^ <> opnd^) and (scan^ <> InvertCase(opnd^))) then
  4083. Exit;
  4084. Inc(scan);
  4085. Inc(opnd);
  4086. end;
  4087. Inc(Result);
  4088. regInput := scan;
  4089. until Result >= AMax;
  4090. end;
  4091. OP_ANYDIGIT:
  4092. while (Result < TheMax) and IsDigitChar(scan^) do
  4093. begin
  4094. Inc(Result);
  4095. Inc(scan);
  4096. end;
  4097. OP_NOTDIGIT:
  4098. {$IFDEF UNICODEEX}
  4099. begin
  4100. i := 0;
  4101. while (i < TheMax) and not IsDigitChar(scan^) do
  4102. begin
  4103. Inc(i);
  4104. IncUnicode2(scan, Result);
  4105. end;
  4106. end;
  4107. {$ELSE}
  4108. while (Result < TheMax) and not IsDigitChar(scan^) do
  4109. begin
  4110. Inc(Result);
  4111. Inc(scan);
  4112. end;
  4113. {$ENDIF}
  4114. OP_ANYLETTER:
  4115. while (Result < TheMax) and IsWordChar(scan^) do // ###0.940
  4116. begin
  4117. Inc(Result);
  4118. Inc(scan);
  4119. end;
  4120. OP_NOTLETTER:
  4121. {$IFDEF UNICODEEX}
  4122. begin
  4123. i := 0;
  4124. while (i < TheMax) and not IsWordChar(scan^) do
  4125. begin
  4126. Inc(i);
  4127. IncUnicode2(scan, Result);
  4128. end;
  4129. end;
  4130. {$ELSE}
  4131. while (Result < TheMax) and not IsWordChar(scan^) do // ###0.940
  4132. begin
  4133. Inc(Result);
  4134. Inc(scan);
  4135. end;
  4136. {$ENDIF}
  4137. OP_ANYSPACE:
  4138. while (Result < TheMax) and IsSpaceChar(scan^) do
  4139. begin
  4140. Inc(Result);
  4141. Inc(scan);
  4142. end;
  4143. OP_NOTSPACE:
  4144. {$IFDEF UNICODEEX}
  4145. begin
  4146. i := 0;
  4147. while (i < TheMax) and not IsSpaceChar(scan^) do
  4148. begin
  4149. Inc(i);
  4150. IncUnicode2(scan, Result);
  4151. end;
  4152. end;
  4153. {$ELSE}
  4154. while (Result < TheMax) and not IsSpaceChar(scan^) do
  4155. begin
  4156. Inc(Result);
  4157. Inc(scan);
  4158. end;
  4159. {$ENDIF}
  4160. OP_ANYVERTSEP:
  4161. while (Result < TheMax) and IsVertLineSeparator(scan^) do
  4162. begin
  4163. Inc(Result);
  4164. Inc(scan);
  4165. end;
  4166. OP_NOTVERTSEP:
  4167. {$IFDEF UNICODEEX}
  4168. begin
  4169. i := 0;
  4170. while (i < TheMax) and not IsVertLineSeparator(scan^) do
  4171. begin
  4172. Inc(i);
  4173. IncUnicode2(scan, Result);
  4174. end;
  4175. end;
  4176. {$ELSE}
  4177. while (Result < TheMax) and not IsVertLineSeparator(scan^) do
  4178. begin
  4179. Inc(Result);
  4180. Inc(scan);
  4181. end;
  4182. {$ENDIF}
  4183. OP_ANYHORZSEP:
  4184. while (Result < TheMax) and IsHorzSeparator(scan^) do
  4185. begin
  4186. Inc(Result);
  4187. Inc(scan);
  4188. end;
  4189. OP_NOTHORZSEP:
  4190. {$IFDEF UNICODEEX}
  4191. begin
  4192. i := 0;
  4193. while (i < TheMax) and not IsHorzSeparator(scan^) do
  4194. begin
  4195. Inc(i);
  4196. IncUnicode2(scan, Result);
  4197. end;
  4198. end;
  4199. {$ELSE}
  4200. while (Result < TheMax) and not IsHorzSeparator(scan^) do
  4201. begin
  4202. Inc(Result);
  4203. Inc(scan);
  4204. end;
  4205. {$ENDIF}
  4206. OP_ANYOF:
  4207. {$IFDEF UNICODEEX}
  4208. begin
  4209. i := 0;
  4210. while (i < TheMax) and FindInCharClass(opnd, scan^, False) do
  4211. begin
  4212. Inc(i);
  4213. IncUnicode2(scan, Result);
  4214. end;
  4215. end;
  4216. {$ELSE}
  4217. while (Result < TheMax) and FindInCharClass(opnd, scan^, False) do
  4218. begin
  4219. Inc(Result);
  4220. Inc(scan);
  4221. end;
  4222. {$ENDIF}
  4223. OP_ANYBUT:
  4224. {$IFDEF UNICODEEX}
  4225. begin
  4226. i := 0;
  4227. while (i < TheMax) and not FindInCharClass(opnd, scan^, False) do
  4228. begin
  4229. Inc(i);
  4230. IncUnicode2(scan, Result);
  4231. end;
  4232. end;
  4233. {$ELSE}
  4234. while (Result < TheMax) and not FindInCharClass(opnd, scan^, False) do
  4235. begin
  4236. Inc(Result);
  4237. Inc(scan);
  4238. end;
  4239. {$ENDIF}
  4240. OP_ANYOFCI:
  4241. {$IFDEF UNICODEEX}
  4242. begin
  4243. i := 0;
  4244. while (i < TheMax) and FindInCharClass(opnd, scan^, True) do
  4245. begin
  4246. Inc(i);
  4247. IncUnicode2(scan, Result);
  4248. end;
  4249. end;
  4250. {$ELSE}
  4251. while (Result < TheMax) and FindInCharClass(opnd, scan^, True) do
  4252. begin
  4253. Inc(Result);
  4254. Inc(scan);
  4255. end;
  4256. {$ENDIF}
  4257. OP_ANYBUTCI:
  4258. {$IFDEF UNICODEEX}
  4259. begin
  4260. i := 0;
  4261. while (i < TheMax) and not FindInCharClass(opnd, scan^, True) do
  4262. begin
  4263. Inc(i);
  4264. IncUnicode2(scan, Result);
  4265. end;
  4266. end;
  4267. {$ELSE}
  4268. while (Result < TheMax) and not FindInCharClass(opnd, scan^, True) do
  4269. begin
  4270. Inc(Result);
  4271. Inc(scan);
  4272. end;
  4273. {$ENDIF}
  4274. {$IFDEF FastUnicodeData}
  4275. OP_ANYCATEGORY:
  4276. {$IFDEF UNICODEEX}
  4277. begin
  4278. i := 0;
  4279. while (i < TheMax) and MatchOneCharCategory(opnd, scan) do
  4280. begin
  4281. Inc(i);
  4282. IncUnicode2(scan, Result);
  4283. end;
  4284. end;
  4285. {$ELSE}
  4286. while (Result < TheMax) and MatchOneCharCategory(opnd, scan) do
  4287. begin
  4288. Inc(Result);
  4289. Inc(scan);
  4290. end;
  4291. {$ENDIF}
  4292. OP_NOTCATEGORY:
  4293. {$IFDEF UNICODEEX}
  4294. begin
  4295. i := 0;
  4296. while (i < TheMax) and not MatchOneCharCategory(opnd, scan) do
  4297. begin
  4298. Inc(i);
  4299. IncUnicode2(scan, Result);
  4300. end;
  4301. end;
  4302. {$ELSE}
  4303. while (Result < TheMax) and not MatchOneCharCategory(opnd, scan) do
  4304. begin
  4305. Inc(Result);
  4306. Inc(scan);
  4307. end;
  4308. {$ENDIF}
  4309. {$ENDIF}
  4310. else
  4311. begin // Oh dear. Called inappropriately.
  4312. Result := 0;
  4313. Error(reeRegRepeatCalledInappropriately);
  4314. Exit;
  4315. end;
  4316. end; { of case }
  4317. regInput := scan;
  4318. end; { of function TRegExpr.FindRepeated
  4319. -------------------------------------------------------------- }
  4320. function TRegExpr.regNext(p: PRegExprChar): PRegExprChar;
  4321. // dig the "next" pointer out of a node
  4322. var
  4323. offset: TRENextOff;
  4324. begin
  4325. if p = @regDummy then
  4326. begin
  4327. Result := nil;
  4328. Exit;
  4329. end;
  4330. offset := PRENextOff(AlignToPtr(p + REOpSz))^; // ###0.933 inlined NEXT
  4331. if offset = 0 then
  4332. Result := nil
  4333. else
  4334. Result := p + offset;
  4335. end; { of function TRegExpr.regNext
  4336. -------------------------------------------------------------- }
  4337. function TRegExpr.MatchPrim(prog: PRegExprChar): boolean;
  4338. // recursively matching routine
  4339. // Conceptually the strategy is simple: check to see whether the current
  4340. // node matches, call self recursively to see whether the rest matches,
  4341. // and then act accordingly. In practice we make some effort to avoid
  4342. // recursion, in particular by going through "ordinary" nodes (that don't
  4343. // need to know whether the rest of the match failed) by a loop instead of
  4344. // by recursion.
  4345. var
  4346. scan: PRegExprChar; // current node
  4347. next: PRegExprChar; // next node
  4348. Len: PtrInt;
  4349. opnd, opGrpEnd: PRegExprChar;
  4350. no: integer;
  4351. save: PRegExprChar;
  4352. saveCurrentGrp: integer;
  4353. nextch: REChar;
  4354. BracesMin, BracesMax: integer;
  4355. // we use integer instead of TREBracesArg to better support */+
  4356. {$IFDEF ComplexBraces}
  4357. SavedLoopStack: TRegExprLoopStack; // very bad for recursion
  4358. SavedLoopStackIdx: integer;
  4359. {$ENDIF}
  4360. bound1, bound2: boolean;
  4361. checkAtomicGroup: boolean;
  4362. begin
  4363. Result := False;
  4364. {
  4365. // Alexey: not sure it's ok for long searches in big texts, so disabled
  4366. if regNestedCalls > MaxRegexBackTracking then
  4367. Exit;
  4368. Inc(regNestedCalls);
  4369. }
  4370. scan := prog;
  4371. while scan <> nil do
  4372. begin
  4373. Len := PRENextOff(AlignToPtr(scan + 1))^; // ###0.932 inlined regNext
  4374. if Len = 0 then
  4375. next := nil
  4376. else
  4377. next := scan + Len;
  4378. case scan^ of
  4379. OP_BOUND:
  4380. begin
  4381. bound1 := (regInput = fInputStart) or not IsWordChar((regInput - 1)^);
  4382. bound2 := (regInput >= fInputEnd) or not IsWordChar(regInput^);
  4383. if bound1 = bound2 then
  4384. Exit;
  4385. end;
  4386. OP_NOTBOUND:
  4387. begin
  4388. bound1 := (regInput = fInputStart) or not IsWordChar((regInput - 1)^);
  4389. bound2 := (regInput >= fInputEnd) or not IsWordChar(regInput^);
  4390. if bound1 <> bound2 then
  4391. Exit;
  4392. end;
  4393. OP_BOL:
  4394. begin
  4395. if regInput <> fInputStart then
  4396. Exit;
  4397. end;
  4398. OP_EOL:
  4399. begin
  4400. // \z matches at the very end
  4401. if regInput < fInputEnd then
  4402. Exit;
  4403. end;
  4404. OP_EOL2:
  4405. begin
  4406. // \Z matches at the very and + before the final line-break (LF and CR LF)
  4407. if regInput < fInputEnd then
  4408. begin
  4409. if (regInput = fInputEnd - 1) and (regInput^ = #10) then
  4410. begin end
  4411. else
  4412. if (regInput = fInputEnd - 2) and (regInput^ = #13) and ((regInput + 1) ^ = #10) then
  4413. begin end
  4414. else
  4415. Exit;
  4416. end;
  4417. end;
  4418. OP_BOLML:
  4419. if regInput > fInputStart then
  4420. begin
  4421. if ((regInput - 1) <= fInputStart) or
  4422. not IsPairedBreak(regInput - 2) then
  4423. begin
  4424. // don't stop between paired separator
  4425. if IsPairedBreak(regInput - 1) then
  4426. Exit;
  4427. if not IsCustomLineSeparator((regInput - 1)^) then
  4428. Exit;
  4429. end;
  4430. end;
  4431. OP_EOLML:
  4432. if regInput < fInputEnd then
  4433. begin
  4434. if not IsPairedBreak(regInput) then
  4435. begin
  4436. // don't stop between paired separator
  4437. if (regInput > fInputStart) and IsPairedBreak(regInput - 1) then
  4438. Exit;
  4439. if not IsCustomLineSeparator(regInput^) then
  4440. Exit;
  4441. end;
  4442. end;
  4443. OP_ANY:
  4444. begin
  4445. if regInput >= fInputEnd then
  4446. Exit;
  4447. {$IFDEF UNICODEEX}
  4448. IncUnicode(regInput);
  4449. {$ELSE}
  4450. Inc(regInput);
  4451. {$ENDIF}
  4452. end;
  4453. OP_ANYML:
  4454. begin
  4455. if (regInput >= fInputEnd) or
  4456. IsPairedBreak(regInput) or
  4457. IsCustomLineSeparator(regInput^)
  4458. then
  4459. Exit;
  4460. {$IFDEF UNICODEEX}
  4461. IncUnicode(regInput);
  4462. {$ELSE}
  4463. Inc(regInput);
  4464. {$ENDIF}
  4465. end;
  4466. OP_ANYDIGIT:
  4467. begin
  4468. if (regInput >= fInputEnd) or not IsDigitChar(regInput^) then
  4469. Exit;
  4470. Inc(regInput);
  4471. end;
  4472. OP_NOTDIGIT:
  4473. begin
  4474. if (regInput >= fInputEnd) or IsDigitChar(regInput^) then
  4475. Exit;
  4476. {$IFDEF UNICODEEX}
  4477. IncUnicode(regInput);
  4478. {$ELSE}
  4479. Inc(regInput);
  4480. {$ENDIF}
  4481. end;
  4482. OP_ANYLETTER:
  4483. begin
  4484. if (regInput >= fInputEnd) or not IsWordChar(regInput^) then
  4485. Exit;
  4486. Inc(regInput);
  4487. end;
  4488. OP_NOTLETTER:
  4489. begin
  4490. if (regInput >= fInputEnd) or IsWordChar(regInput^) then
  4491. Exit;
  4492. {$IFDEF UNICODEEX}
  4493. IncUnicode(regInput);
  4494. {$ELSE}
  4495. Inc(regInput);
  4496. {$ENDIF}
  4497. end;
  4498. OP_ANYSPACE:
  4499. begin
  4500. if (regInput >= fInputEnd) or not IsSpaceChar(regInput^) then
  4501. Exit;
  4502. Inc(regInput);
  4503. end;
  4504. OP_NOTSPACE:
  4505. begin
  4506. if (regInput >= fInputEnd) or IsSpaceChar(regInput^) then
  4507. Exit;
  4508. {$IFDEF UNICODEEX}
  4509. IncUnicode(regInput);
  4510. {$ELSE}
  4511. Inc(regInput);
  4512. {$ENDIF}
  4513. end;
  4514. OP_ANYVERTSEP:
  4515. begin
  4516. if (regInput >= fInputEnd) or not IsVertLineSeparator(regInput^) then
  4517. Exit;
  4518. Inc(regInput);
  4519. end;
  4520. OP_NOTVERTSEP:
  4521. begin
  4522. if (regInput >= fInputEnd) or IsVertLineSeparator(regInput^) then
  4523. Exit;
  4524. {$IFDEF UNICODEEX}
  4525. IncUnicode(regInput);
  4526. {$ELSE}
  4527. Inc(regInput);
  4528. {$ENDIF}
  4529. end;
  4530. OP_ANYHORZSEP:
  4531. begin
  4532. if (regInput >= fInputEnd) or not IsHorzSeparator(regInput^) then
  4533. Exit;
  4534. Inc(regInput);
  4535. end;
  4536. OP_NOTHORZSEP:
  4537. begin
  4538. if (regInput >= fInputEnd) or IsHorzSeparator(regInput^) then
  4539. Exit;
  4540. {$IFDEF UNICODEEX}
  4541. IncUnicode(regInput);
  4542. {$ELSE}
  4543. Inc(regInput);
  4544. {$ENDIF}
  4545. end;
  4546. OP_EXACTLYCI:
  4547. begin
  4548. opnd := scan + REOpSz + RENextOffSz; // OPERAND
  4549. Len := PLongInt(opnd)^;
  4550. Inc(opnd, RENumberSz);
  4551. // Inline the first character, for speed.
  4552. if (opnd^ <> regInput^) and (InvertCase(opnd^) <> regInput^) then
  4553. Exit;
  4554. // ###0.929 begin
  4555. no := Len;
  4556. save := regInput;
  4557. while no > 1 do
  4558. begin
  4559. Inc(save);
  4560. Inc(opnd);
  4561. if (opnd^ <> save^) and (InvertCase(opnd^) <> save^) then
  4562. Exit;
  4563. Dec(no);
  4564. end;
  4565. // ###0.929 end
  4566. Inc(regInput, Len);
  4567. end;
  4568. OP_EXACTLY:
  4569. begin
  4570. opnd := scan + REOpSz + RENextOffSz; // OPERAND
  4571. Len := PLongInt(opnd)^;
  4572. Inc(opnd, RENumberSz);
  4573. // Inline the first character, for speed.
  4574. if opnd^ <> regInput^ then
  4575. Exit;
  4576. // ###0.929 begin
  4577. no := Len;
  4578. save := regInput;
  4579. while no > 1 do
  4580. begin
  4581. Inc(save);
  4582. Inc(opnd);
  4583. if opnd^ <> save^ then
  4584. Exit;
  4585. Dec(no);
  4586. end;
  4587. // ###0.929 end
  4588. Inc(regInput, Len);
  4589. end;
  4590. OP_BSUBEXP:
  4591. begin // ###0.936
  4592. no := Ord((scan + REOpSz + RENextOffSz)^);
  4593. no := GrpIndexes[no];
  4594. if no < 0 then
  4595. Exit;
  4596. opnd := GrpBounds[regRecursion].GrpStart[no];
  4597. if opnd = nil then
  4598. Exit;
  4599. opGrpEnd := GrpBounds[regRecursion].GrpEnd[no];
  4600. if opGrpEnd = nil then
  4601. Exit;
  4602. save := regInput;
  4603. while opnd < opGrpEnd do
  4604. begin
  4605. if (save >= fInputEnd) or (save^ <> opnd^) then
  4606. Exit;
  4607. Inc(save);
  4608. Inc(opnd);
  4609. end;
  4610. regInput := save;
  4611. end;
  4612. OP_BSUBEXPCI:
  4613. begin // ###0.936
  4614. no := Ord((scan + REOpSz + RENextOffSz)^);
  4615. no := GrpIndexes[no];
  4616. if no < 0 then
  4617. Exit;
  4618. opnd := GrpBounds[regRecursion].GrpStart[no];
  4619. if opnd = nil then
  4620. Exit;
  4621. opGrpEnd := GrpBounds[regRecursion].GrpEnd[no];
  4622. if opGrpEnd = nil then
  4623. Exit;
  4624. save := regInput;
  4625. while opnd < opGrpEnd do
  4626. begin
  4627. if (save >= fInputEnd) or
  4628. ((save^ <> opnd^) and (save^ <> InvertCase(opnd^))) then
  4629. Exit;
  4630. Inc(save);
  4631. Inc(opnd);
  4632. end;
  4633. regInput := save;
  4634. end;
  4635. OP_ANYOF:
  4636. begin
  4637. if (regInput >= fInputEnd) or
  4638. not FindInCharClass(scan + REOpSz + RENextOffSz, regInput^, False) then
  4639. Exit;
  4640. {$IFDEF UNICODEEX}
  4641. IncUnicode(regInput);
  4642. {$ELSE}
  4643. Inc(regInput);
  4644. {$ENDIF}
  4645. end;
  4646. OP_ANYBUT:
  4647. begin
  4648. if (regInput >= fInputEnd) or
  4649. FindInCharClass(scan + REOpSz + RENextOffSz, regInput^, False) then
  4650. Exit;
  4651. {$IFDEF UNICODEEX}
  4652. IncUnicode(regInput);
  4653. {$ELSE}
  4654. Inc(regInput);
  4655. {$ENDIF}
  4656. end;
  4657. OP_ANYOFCI:
  4658. begin
  4659. if (regInput >= fInputEnd) or
  4660. not FindInCharClass(scan + REOpSz + RENextOffSz, regInput^, True) then
  4661. Exit;
  4662. {$IFDEF UNICODEEX}
  4663. IncUnicode(regInput);
  4664. {$ELSE}
  4665. Inc(regInput);
  4666. {$ENDIF}
  4667. end;
  4668. OP_ANYBUTCI:
  4669. begin
  4670. if (regInput >= fInputEnd) or
  4671. FindInCharClass(scan + REOpSz + RENextOffSz, regInput^, True) then
  4672. Exit;
  4673. {$IFDEF UNICODEEX}
  4674. IncUnicode(regInput);
  4675. {$ELSE}
  4676. Inc(regInput);
  4677. {$ENDIF}
  4678. end;
  4679. OP_NOTHING:
  4680. ;
  4681. OP_COMMENT:
  4682. ;
  4683. OP_BACK:
  4684. ;
  4685. OP_OPEN_FIRST .. OP_OPEN_LAST:
  4686. begin
  4687. no := Ord(scan^) - Ord(OP_OPEN);
  4688. regCurrentGrp := no;
  4689. save := GrpBounds[regRecursion].GrpStart[no];
  4690. GrpBounds[regRecursion].GrpStart[no] := regInput;
  4691. Result := MatchPrim(next);
  4692. if not Result then
  4693. GrpBounds[regRecursion].GrpStart[no] := save;
  4694. // handle negative lookahead
  4695. if regLookaheadNeg then
  4696. if no = regLookaheadGroup then
  4697. begin
  4698. Result := not Result;
  4699. if Result then
  4700. begin
  4701. // we need zero length of "lookahead group",
  4702. // it is later used to adjust the match
  4703. GrpBounds[regRecursion].GrpStart[no] := regInput;
  4704. GrpBounds[regRecursion].GrpEnd[no]:= regInput;
  4705. end
  4706. else
  4707. GrpBounds[regRecursion].GrpStart[no] := save;
  4708. end;
  4709. Exit;
  4710. end;
  4711. OP_CLOSE_FIRST .. OP_CLOSE_LAST:
  4712. begin
  4713. no := Ord(scan^) - Ord(OP_CLOSE);
  4714. regCurrentGrp := -1;
  4715. // handle atomic group, mark it as "done"
  4716. // (we are here because some OP_BRANCH is matched)
  4717. if GrpAtomic[no] then
  4718. GrpAtomicDone[no] := True;
  4719. save := GrpBounds[regRecursion].GrpEnd[no];
  4720. GrpBounds[regRecursion].GrpEnd[no] := regInput;
  4721. // if we are in OP_SUBCALL* call, it called OP_OPEN*, so we must return
  4722. // in OP_CLOSE, without going to next opcode
  4723. if GrpSubCalled[no] then
  4724. begin
  4725. Result := True;
  4726. Exit;
  4727. end;
  4728. Result := MatchPrim(next);
  4729. if not Result then // ###0.936
  4730. GrpBounds[regRecursion].GrpEnd[no] := save;
  4731. Exit;
  4732. end;
  4733. OP_BRANCH:
  4734. begin
  4735. saveCurrentGrp := regCurrentGrp;
  4736. checkAtomicGroup := (regCurrentGrp >= 0) and GrpAtomic[regCurrentGrp];
  4737. if (next^ <> OP_BRANCH) // No next choice in group
  4738. then
  4739. next := scan + REOpSz + RENextOffSz // Avoid recursion
  4740. else
  4741. begin
  4742. repeat
  4743. save := regInput;
  4744. Result := MatchPrim(scan + REOpSz + RENextOffSz);
  4745. regCurrentGrp := saveCurrentGrp;
  4746. if Result then
  4747. Exit;
  4748. // if branch worked until OP_CLOSE, and marked atomic group as "done", then exit
  4749. if checkAtomicGroup then
  4750. if GrpAtomicDone[regCurrentGrp] then
  4751. Exit;
  4752. regInput := save;
  4753. scan := regNext(scan);
  4754. until (scan = nil) or (scan^ <> OP_BRANCH);
  4755. Exit;
  4756. end;
  4757. end;
  4758. {$IFDEF ComplexBraces}
  4759. OP_LOOPENTRY:
  4760. begin // ###0.925
  4761. no := LoopStackIdx;
  4762. Inc(LoopStackIdx);
  4763. if LoopStackIdx > LoopStackMax then
  4764. begin
  4765. Error(reeLoopStackExceeded);
  4766. Exit;
  4767. end;
  4768. save := regInput;
  4769. LoopStack[LoopStackIdx] := 0; // init loop counter
  4770. Result := MatchPrim(next); // execute loop
  4771. LoopStackIdx := no; // cleanup
  4772. if Result then
  4773. Exit;
  4774. regInput := save;
  4775. Exit;
  4776. end;
  4777. OP_LOOP, OP_LOOPNG:
  4778. begin // ###0.940
  4779. if LoopStackIdx <= 0 then
  4780. begin
  4781. Error(reeLoopWithoutEntry);
  4782. Exit;
  4783. end;
  4784. opnd := scan + PRENextOff(AlignToPtr(scan + REOpSz + RENextOffSz + 2 * REBracesArgSz))^;
  4785. BracesMin := PREBracesArg(AlignToInt(scan + REOpSz + RENextOffSz))^;
  4786. BracesMax := PREBracesArg(AlignToPtr(scan + REOpSz + RENextOffSz + REBracesArgSz))^;
  4787. save := regInput;
  4788. if LoopStack[LoopStackIdx] >= BracesMin then
  4789. begin // Min alredy matched - we can work
  4790. if scan^ = OP_LOOP then
  4791. begin
  4792. // greedy way - first try to max deep of greed ;)
  4793. if LoopStack[LoopStackIdx] < BracesMax then
  4794. begin
  4795. Inc(LoopStack[LoopStackIdx]);
  4796. no := LoopStackIdx;
  4797. Result := MatchPrim(opnd);
  4798. LoopStackIdx := no;
  4799. if Result then
  4800. Exit;
  4801. regInput := save;
  4802. end;
  4803. Dec(LoopStackIdx); // Fail. May be we are too greedy? ;)
  4804. Result := MatchPrim(next);
  4805. if not Result then
  4806. regInput := save;
  4807. Exit;
  4808. end
  4809. else
  4810. begin
  4811. // non-greedy - try just now
  4812. Result := MatchPrim(next);
  4813. if Result then
  4814. Exit
  4815. else
  4816. regInput := save; // failed - move next and try again
  4817. if LoopStack[LoopStackIdx] < BracesMax then
  4818. begin
  4819. Inc(LoopStack[LoopStackIdx]);
  4820. no := LoopStackIdx;
  4821. Result := MatchPrim(opnd);
  4822. LoopStackIdx := no;
  4823. if Result then
  4824. Exit;
  4825. regInput := save;
  4826. end;
  4827. Dec(LoopStackIdx); // Failed - back up
  4828. Exit;
  4829. end
  4830. end
  4831. else
  4832. begin // first match a min_cnt times
  4833. Inc(LoopStack[LoopStackIdx]);
  4834. no := LoopStackIdx;
  4835. Result := MatchPrim(opnd);
  4836. LoopStackIdx := no;
  4837. if Result then
  4838. Exit;
  4839. Dec(LoopStack[LoopStackIdx]);
  4840. regInput := save;
  4841. Exit;
  4842. end;
  4843. end;
  4844. {$ENDIF}
  4845. OP_STAR, OP_PLUS, OP_BRACES, OP_STARNG, OP_PLUSNG, OP_BRACESNG:
  4846. begin
  4847. // Lookahead to avoid useless match attempts when we know
  4848. // what character comes next.
  4849. nextch := #0;
  4850. if next^ = OP_EXACTLY then
  4851. nextch := (next + REOpSz + RENextOffSz + RENumberSz)^;
  4852. BracesMax := MaxInt; // infinite loop for * and + //###0.92
  4853. if (scan^ = OP_STAR) or (scan^ = OP_STARNG) then
  4854. BracesMin := 0 // star
  4855. else if (scan^ = OP_PLUS) or (scan^ = OP_PLUSNG) then
  4856. BracesMin := 1 // plus
  4857. else
  4858. begin // braces
  4859. BracesMin := PREBracesArg(AlignToPtr(scan + REOpSz + RENextOffSz))^;
  4860. BracesMax := PREBracesArg(AlignToPtr(scan + REOpSz + RENextOffSz + REBracesArgSz))^;
  4861. end;
  4862. save := regInput;
  4863. opnd := scan + REOpSz + RENextOffSz;
  4864. if (scan^ = OP_BRACES) or (scan^ = OP_BRACESNG) then
  4865. Inc(opnd, 2 * REBracesArgSz);
  4866. if (scan^ = OP_PLUSNG) or (scan^ = OP_STARNG) or (scan^ = OP_BRACESNG) then
  4867. begin
  4868. // non-greedy mode
  4869. BracesMax := FindRepeated(opnd, BracesMax);
  4870. // don't repeat more than BracesMax
  4871. // Now we know real Max limit to move forward (for recursion 'back up')
  4872. // In some cases it can be faster to check only Min positions first,
  4873. // but after that we have to check every position separtely instead
  4874. // of fast scannig in loop.
  4875. no := BracesMin;
  4876. while no <= BracesMax do
  4877. begin
  4878. regInput := save + no;
  4879. // If it could work, try it.
  4880. if (nextch = #0) or (regInput^ = nextch) then
  4881. begin
  4882. {$IFDEF ComplexBraces}
  4883. System.Move(LoopStack, SavedLoopStack, SizeOf(LoopStack));
  4884. // ###0.925
  4885. SavedLoopStackIdx := LoopStackIdx;
  4886. {$ENDIF}
  4887. if MatchPrim(next) then
  4888. begin
  4889. Result := True;
  4890. Exit;
  4891. end;
  4892. {$IFDEF ComplexBraces}
  4893. System.Move(SavedLoopStack, LoopStack, SizeOf(LoopStack));
  4894. LoopStackIdx := SavedLoopStackIdx;
  4895. {$ENDIF}
  4896. end;
  4897. Inc(no); // Couldn't or didn't - move forward.
  4898. end; { of while }
  4899. Exit;
  4900. end
  4901. else
  4902. begin // greedy mode
  4903. no := FindRepeated(opnd, BracesMax); // don't repeat more than max_cnt
  4904. while no >= BracesMin do
  4905. begin
  4906. // If it could work, try it.
  4907. if (nextch = #0) or (regInput^ = nextch) then
  4908. begin
  4909. {$IFDEF ComplexBraces}
  4910. System.Move(LoopStack, SavedLoopStack, SizeOf(LoopStack));
  4911. // ###0.925
  4912. SavedLoopStackIdx := LoopStackIdx;
  4913. {$ENDIF}
  4914. if MatchPrim(next) then
  4915. begin
  4916. Result := True;
  4917. Exit;
  4918. end;
  4919. {$IFDEF ComplexBraces}
  4920. System.Move(SavedLoopStack, LoopStack, SizeOf(LoopStack));
  4921. LoopStackIdx := SavedLoopStackIdx;
  4922. {$ENDIF}
  4923. end;
  4924. Dec(no); // Couldn't or didn't - back up.
  4925. regInput := save + no;
  4926. end; { of while }
  4927. Exit;
  4928. end;
  4929. end;
  4930. OP_STAR_POSS, OP_PLUS_POSS, OP_BRACES_POSS:
  4931. begin
  4932. // Lookahead to avoid useless match attempts when we know
  4933. // what character comes next.
  4934. nextch := #0;
  4935. if next^ = OP_EXACTLY then
  4936. nextch := (next + REOpSz + RENextOffSz + RENumberSz)^;
  4937. opnd := scan + REOpSz + RENextOffSz;
  4938. case scan^ of
  4939. OP_STAR_POSS:
  4940. begin
  4941. BracesMin := 0;
  4942. BracesMax := MaxInt;
  4943. end;
  4944. OP_PLUS_POSS:
  4945. begin
  4946. BracesMin := 1;
  4947. BracesMax := MaxInt;
  4948. end;
  4949. else
  4950. begin // braces
  4951. BracesMin := PREBracesArg(AlignToPtr(scan + REOpSz + RENextOffSz))^;
  4952. BracesMax := PREBracesArg(AlignToPtr(scan + REOpSz + RENextOffSz + REBracesArgSz))^;
  4953. Inc(opnd, 2 * REBracesArgSz);
  4954. end;
  4955. end;
  4956. no := FindRepeated(opnd, BracesMax);
  4957. if no >= BracesMin then
  4958. if (nextch = #0) or (regInput^ = nextch) then
  4959. Result := MatchPrim(next);
  4960. Exit;
  4961. end;
  4962. OP_EEND:
  4963. begin
  4964. Result := True; // Success!
  4965. Exit;
  4966. end;
  4967. {$IFDEF FastUnicodeData}
  4968. OP_ANYCATEGORY:
  4969. begin
  4970. if (regInput >= fInputEnd) then Exit;
  4971. if not MatchOneCharCategory(scan + REOpSz + RENextOffSz, regInput) then Exit;
  4972. {$IFDEF UNICODEEX}
  4973. IncUnicode(regInput);
  4974. {$ELSE}
  4975. Inc(regInput);
  4976. {$ENDIF}
  4977. end;
  4978. OP_NOTCATEGORY:
  4979. begin
  4980. if (regInput >= fInputEnd) then Exit;
  4981. if MatchOneCharCategory(scan + REOpSz + RENextOffSz, regInput) then Exit;
  4982. {$IFDEF UNICODEEX}
  4983. IncUnicode(regInput);
  4984. {$ELSE}
  4985. Inc(regInput);
  4986. {$ENDIF}
  4987. end;
  4988. {$ENDIF}
  4989. OP_RECUR:
  4990. begin
  4991. // call opcode start
  4992. if regRecursion < RegexMaxRecursion then
  4993. begin
  4994. Inc(regRecursion);
  4995. bound1 := MatchPrim(regCodeWork);
  4996. Dec(regRecursion);
  4997. end
  4998. else
  4999. bound1 := False;
  5000. if not bound1 then Exit;
  5001. end;
  5002. OP_SUBCALL_FIRST .. OP_SUBCALL_LAST:
  5003. begin
  5004. // call subroutine
  5005. no := GrpIndexes[Ord(scan^) - Ord(OP_SUBCALL)];
  5006. if no < 0 then Exit;
  5007. save := GrpOpCodes[no];
  5008. if save = nil then Exit;
  5009. if regRecursion < RegexMaxRecursion then
  5010. begin
  5011. // mark group in GrpSubCalled array so opcode can detect subcall
  5012. checkAtomicGroup := GrpSubCalled[no];
  5013. GrpSubCalled[no] := True;
  5014. Inc(regRecursion);
  5015. bound1 := MatchPrim(save);
  5016. Dec(regRecursion);
  5017. GrpSubCalled[no] := checkAtomicGroup;
  5018. end
  5019. else
  5020. bound1 := False;
  5021. if not bound1 then Exit;
  5022. end;
  5023. else
  5024. begin
  5025. Error(reeMatchPrimMemoryCorruption);
  5026. Exit;
  5027. end;
  5028. end; { of case scan^ }
  5029. scan := next;
  5030. end; { of while scan <> nil }
  5031. // We get here only if there's trouble -- normally "case EEND" is the
  5032. // terminating point.
  5033. Error(reeMatchPrimCorruptedPointers);
  5034. end; { of function TRegExpr.MatchPrim
  5035. -------------------------------------------------------------- }
  5036. function TRegExpr.Exec(const AInputString: RegExprString): boolean;
  5037. begin
  5038. InputString := AInputString;
  5039. Result := ExecPrim(1, False, False, False);
  5040. end; { of function TRegExpr.Exec
  5041. -------------------------------------------------------------- }
  5042. {$IFDEF OverMeth}
  5043. function TRegExpr.Exec: boolean;
  5044. var
  5045. SlowChecks: boolean;
  5046. begin
  5047. SlowChecks := Length(fInputString) < fSlowChecksSizeMax;
  5048. Result := ExecPrim(1, False, SlowChecks, False);
  5049. end; { of function TRegExpr.Exec
  5050. -------------------------------------------------------------- }
  5051. function TRegExpr.Exec(AOffset: integer): boolean;
  5052. begin
  5053. Result := ExecPrim(AOffset, False, False, False);
  5054. end; { of function TRegExpr.Exec
  5055. -------------------------------------------------------------- }
  5056. {$ENDIF}
  5057. function TRegExpr.ExecPos(AOffset: integer {$IFDEF DefParam} = 1{$ENDIF}): boolean;
  5058. begin
  5059. Result := ExecPrim(AOffset, False, False, False);
  5060. end; { of function TRegExpr.ExecPos
  5061. -------------------------------------------------------------- }
  5062. {$IFDEF OverMeth}
  5063. function TRegExpr.ExecPos(AOffset: integer; ATryOnce, ABackward: boolean): boolean;
  5064. begin
  5065. Result := ExecPrim(AOffset, ATryOnce, False, ABackward);
  5066. end;
  5067. {$ENDIF}
  5068. function TRegExpr.MatchAtOnePos(APos: PRegExprChar): boolean;
  5069. begin
  5070. // test for lookbehind '(?<!foo)bar' before running actual MatchPrim
  5071. if Assigned(fHelper) then
  5072. if (APos - fHelperLen) >= fInputStart then
  5073. begin
  5074. fHelper.SetInputRange(APos - fHelperLen, APos);
  5075. if fHelper.MatchAtOnePos(APos - fHelperLen) then
  5076. begin
  5077. Result := False;
  5078. Exit;
  5079. end;
  5080. end;
  5081. regInput := APos;
  5082. regCurrentGrp := -1;
  5083. regNestedCalls := 0;
  5084. regRecursion := 0;
  5085. Result := MatchPrim(regCodeWork);
  5086. if Result then
  5087. begin
  5088. GrpBounds[0].GrpStart[0] := APos;
  5089. GrpBounds[0].GrpEnd[0] := regInput;
  5090. // with lookbehind, increase found position by the len of group=1
  5091. if regLookbehind then
  5092. Inc(GrpBounds[0].GrpStart[0], GrpBounds[0].GrpEnd[1] - GrpBounds[0].GrpStart[1]);
  5093. // with lookahead, decrease ending by the len of group=regLookaheadGroup
  5094. if regLookahead and (regLookaheadGroup > 0) then
  5095. Dec(GrpBounds[0].GrpEnd[0], GrpBounds[0].GrpEnd[regLookaheadGroup] - GrpBounds[0].GrpStart[regLookaheadGroup]);
  5096. end;
  5097. end;
  5098. procedure TRegExpr.ClearMatches;
  5099. begin
  5100. FillChar(GrpBounds, SizeOf(GrpBounds), 0);
  5101. FillChar(GrpAtomicDone, SizeOf(GrpAtomicDone), 0);
  5102. FillChar(GrpSubCalled, SizeOf(GrpSubCalled), 0);
  5103. end;
  5104. procedure TRegExpr.ClearInternalIndexes;
  5105. var
  5106. i: integer;
  5107. begin
  5108. FillChar(GrpBounds, SizeOf(GrpBounds), 0);
  5109. FillChar(GrpAtomic, SizeOf(GrpAtomic), 0);
  5110. FillChar(GrpAtomicDone, SizeOf(GrpAtomicDone), 0);
  5111. FillChar(GrpSubCalled, SizeOf(GrpSubCalled), 0);
  5112. FillChar(GrpOpCodes, SizeOf(GrpOpCodes), 0);
  5113. for i := 0 to RegexMaxGroups - 1 do
  5114. begin
  5115. GrpIndexes[i] := -1;
  5116. GrpNames[i] := '';
  5117. end;
  5118. GrpIndexes[0] := 0;
  5119. GrpCount := 0;
  5120. end;
  5121. function TRegExpr.ExecPrim(AOffset: integer;
  5122. ATryOnce, ASlowChecks, ABackward: boolean): boolean;
  5123. var
  5124. Ptr: PRegExprChar;
  5125. begin
  5126. Result := False;
  5127. // Ensure that Match cleared either if optimization tricks or some error
  5128. // will lead to leaving ExecPrim without actual search. That is
  5129. // important for ExecNext logic and so on.
  5130. ClearMatches;
  5131. // Don't check IsProgrammOk here! it causes big slowdown in test_benchmark!
  5132. if programm = nil then
  5133. begin
  5134. Compile;
  5135. if programm = nil then
  5136. Exit;
  5137. end;
  5138. if fInputString = '' then
  5139. begin
  5140. // Empty string can match e.g. '^$'
  5141. if regMustLen > 0 then
  5142. Exit;
  5143. end;
  5144. // Check that the start position is not negative
  5145. if AOffset < 1 then
  5146. begin
  5147. Error(reeOffsetMustBePositive);
  5148. Exit;
  5149. end;
  5150. // Check that the start position is not longer than the line
  5151. if AOffset > (Length(fInputString) + 1) then
  5152. Exit;
  5153. Ptr := fInputStart + AOffset - 1;
  5154. // If there is a "must appear" string, look for it.
  5155. if ASlowChecks then
  5156. if regMustString <> '' then
  5157. if Pos(regMustString, fInputString) = 0 then Exit;
  5158. {$IFDEF ComplexBraces}
  5159. // no loops started
  5160. LoopStackIdx := 0; // ###0.925
  5161. {$ENDIF}
  5162. // ATryOnce or anchored match (it needs to be tried only once).
  5163. if ATryOnce or (regAnchored <> #0) then
  5164. begin
  5165. {$IFDEF UseFirstCharSet}
  5166. {$IFDEF UnicodeRE}
  5167. if Ord(Ptr^) <= $FF then
  5168. {$ENDIF}
  5169. if not FirstCharArray[byte(Ptr^)] then
  5170. Exit;
  5171. {$ENDIF}
  5172. Result := MatchAtOnePos(Ptr);
  5173. Exit;
  5174. end;
  5175. // Messy cases: unanchored match.
  5176. if ABackward then
  5177. Inc(Ptr, 2)
  5178. else
  5179. Dec(Ptr);
  5180. repeat
  5181. if ABackward then
  5182. begin
  5183. Dec(Ptr);
  5184. if Ptr < fInputStart then
  5185. Exit;
  5186. end
  5187. else
  5188. begin
  5189. Inc(Ptr);
  5190. if Ptr > fInputEnd then
  5191. Exit;
  5192. end;
  5193. {$IFDEF UseFirstCharSet}
  5194. {$IFDEF UnicodeRE}
  5195. if Ord(Ptr^) <= $FF then
  5196. {$ENDIF}
  5197. if not FirstCharArray[byte(Ptr^)] then
  5198. Continue;
  5199. {$ENDIF}
  5200. Result := MatchAtOnePos(Ptr);
  5201. // Exit on a match or after testing the end-of-string
  5202. if Result then
  5203. Exit;
  5204. until False;
  5205. end; { of function TRegExpr.ExecPrim
  5206. -------------------------------------------------------------- }
  5207. function TRegExpr.ExecNext(ABackward: boolean {$IFDEF DefParam} = False{$ENDIF}): boolean;
  5208. var
  5209. PtrBegin, PtrEnd: PRegExprChar;
  5210. Offset: PtrInt;
  5211. begin
  5212. PtrBegin := GrpBounds[0].GrpStart[0];
  5213. PtrEnd := GrpBounds[0].GrpEnd[0];
  5214. if (PtrBegin = nil) or (PtrEnd = nil) then
  5215. begin
  5216. Error(reeExecNextWithoutExec);
  5217. Result := False;
  5218. Exit;
  5219. end;
  5220. Offset := PtrEnd - fInputStart + 1;
  5221. // prevent infinite looping if empty string matches r.e.
  5222. if PtrBegin = PtrEnd then
  5223. Inc(Offset);
  5224. Result := ExecPrim(Offset, False, False, ABackward);
  5225. end; { of function TRegExpr.ExecNext
  5226. -------------------------------------------------------------- }
  5227. procedure TRegExpr.SetInputString(const AInputString: RegExprString);
  5228. begin
  5229. ClearMatches;
  5230. fInputString := AInputString;
  5231. UniqueString(fInputString);
  5232. fInputStart := PRegExprChar(fInputString);
  5233. fInputEnd := fInputStart + Length(fInputString);
  5234. end;
  5235. procedure TRegExpr.SetInputRange(AStart, AEnd: PRegExprChar);
  5236. begin
  5237. fInputString := '';
  5238. fInputStart := AStart;
  5239. fInputEnd := AEnd;
  5240. end;
  5241. {$IFDEF UseLineSep}
  5242. procedure TRegExpr.SetLineSeparators(const AStr: RegExprString);
  5243. begin
  5244. if AStr <> fLineSeparators then
  5245. begin
  5246. fLineSeparators := AStr;
  5247. InitLineSepArray;
  5248. InvalidateProgramm;
  5249. end;
  5250. end; { of procedure TRegExpr.SetLineSeparators
  5251. -------------------------------------------------------------- }
  5252. {$ENDIF}
  5253. procedure TRegExpr.SetUsePairedBreak(AValue: boolean);
  5254. begin
  5255. if AValue <> fUsePairedBreak then
  5256. begin
  5257. fUsePairedBreak := AValue;
  5258. InvalidateProgramm;
  5259. end;
  5260. end;
  5261. function TRegExpr.Substitute(const ATemplate: RegExprString): RegExprString;
  5262. // perform substitutions after a regexp match
  5263. var
  5264. TemplateBeg, TemplateEnd: PRegExprChar;
  5265. function ParseVarName(var APtr: PRegExprChar): integer;
  5266. // extract name of variable: $1 or ${1} or ${name}
  5267. // from APtr^, uses TemplateEnd
  5268. var
  5269. p: PRegExprChar;
  5270. Delimited: boolean;
  5271. GrpName: RegExprString;
  5272. begin
  5273. Result := 0;
  5274. GrpName := '';
  5275. p := APtr;
  5276. Delimited := (p < TemplateEnd) and (p^ = '{');
  5277. if Delimited then
  5278. Inc(p); // skip left curly brace
  5279. if (p < TemplateEnd) and (p^ = '&') then
  5280. Inc(p) // this is '$&' or '${&}'
  5281. else
  5282. begin
  5283. if IsDigitChar(p^) then
  5284. begin
  5285. while (p < TemplateEnd) and IsDigitChar(p^) do
  5286. begin
  5287. Result := Result * 10 + (Ord(p^) - Ord('0'));
  5288. Inc(p);
  5289. end
  5290. end
  5291. else
  5292. if Delimited then
  5293. begin
  5294. FindGroupName(p, TemplateEnd, '}', GrpName);
  5295. Result := MatchIndexFromName(GrpName);
  5296. Inc(p, Length(GrpName));
  5297. end;
  5298. end;
  5299. if Delimited then
  5300. if (p < TemplateEnd) and (p^ = '}') then
  5301. Inc(p) // skip right curly brace
  5302. else
  5303. p := APtr; // isn't properly terminated
  5304. if p = APtr then
  5305. Result := -1; // no valid digits found or no right curly brace
  5306. APtr := p;
  5307. end;
  5308. procedure FindSubstGroupIndex(var p: PRegExprChar; var Idx: integer; var NumberFound: boolean);
  5309. begin
  5310. Idx := ParseVarName(p);
  5311. NumberFound := (Idx >= 0) and (Idx <= High(GrpIndexes));
  5312. if NumberFound then
  5313. Idx := GrpIndexes[Idx];
  5314. end;
  5315. type
  5316. TSubstMode = (smodeNormal, smodeOneUpper, smodeOneLower, smodeAllUpper, smodeAllLower);
  5317. var
  5318. Mode: TSubstMode;
  5319. p, p0, p1, ResultPtr: PRegExprChar;
  5320. ResultLen, n: integer;
  5321. Ch, QuotedChar: REChar;
  5322. GroupFound: boolean;
  5323. begin
  5324. // Check programm and input string
  5325. if not IsProgrammOk then
  5326. Exit;
  5327. // Note: don't check for empty fInputString, it's valid case,
  5328. // e.g. user needs to replace regex "\b" to "_", it's zero match length
  5329. if ATemplate = '' then
  5330. begin
  5331. Result := '';
  5332. Exit;
  5333. end;
  5334. TemplateBeg := PRegExprChar(ATemplate);
  5335. TemplateEnd := TemplateBeg + Length(ATemplate);
  5336. // Count result length for speed optimization.
  5337. ResultLen := 0;
  5338. p := TemplateBeg;
  5339. while p < TemplateEnd do
  5340. begin
  5341. Ch := p^;
  5342. Inc(p);
  5343. n := -1;
  5344. GroupFound := False;
  5345. if Ch = SubstituteGroupChar then
  5346. FindSubstGroupIndex(p, n, GroupFound);
  5347. if GroupFound then
  5348. begin
  5349. if n >= 0 then
  5350. Inc(ResultLen, GrpBounds[0].GrpEnd[n] - GrpBounds[0].GrpStart[n]);
  5351. end
  5352. else
  5353. begin
  5354. if (Ch = EscChar) and (p < TemplateEnd) then
  5355. begin // quoted or special AnsiChar followed
  5356. Ch := p^;
  5357. Inc(p);
  5358. case Ch of
  5359. 'n':
  5360. Inc(ResultLen, Length(fReplaceLineEnd));
  5361. 'u', 'l', 'U', 'L': { nothing }
  5362. ;
  5363. 'x':
  5364. begin
  5365. Inc(ResultLen);
  5366. if (p^ = '{') then
  5367. begin // skip \x{....}
  5368. while ((p^ <> '}') and (p < TemplateEnd)) do
  5369. p := p + 1;
  5370. p := p + 1;
  5371. end
  5372. else
  5373. p := p + 2 // skip \x..
  5374. end;
  5375. else
  5376. Inc(ResultLen);
  5377. end;
  5378. end
  5379. else
  5380. Inc(ResultLen);
  5381. end;
  5382. end;
  5383. // Get memory. We do it once and it significant speed up work !
  5384. if ResultLen = 0 then
  5385. begin
  5386. Result := '';
  5387. Exit;
  5388. end;
  5389. SetLength(Result, ResultLen);
  5390. // Fill Result
  5391. ResultPtr := PRegExprChar(Result);
  5392. p := TemplateBeg;
  5393. Mode := smodeNormal;
  5394. while p < TemplateEnd do
  5395. begin
  5396. Ch := p^;
  5397. p0 := p;
  5398. Inc(p);
  5399. p1 := p;
  5400. n := -1;
  5401. GroupFound := False;
  5402. if Ch = SubstituteGroupChar then
  5403. FindSubstGroupIndex(p, n, GroupFound);
  5404. if GroupFound then
  5405. begin
  5406. if n >= 0 then
  5407. begin
  5408. p0 := GrpBounds[0].GrpStart[n];
  5409. p1 := GrpBounds[0].GrpEnd[n];
  5410. end
  5411. else
  5412. p1 := p0;
  5413. end
  5414. else
  5415. begin
  5416. if (Ch = EscChar) and (p < TemplateEnd) then
  5417. begin // quoted or special AnsiChar followed
  5418. Ch := p^;
  5419. Inc(p);
  5420. case Ch of
  5421. 'n':
  5422. begin
  5423. p0 := PRegExprChar(fReplaceLineEnd);
  5424. p1 := p0 + Length(fReplaceLineEnd);
  5425. end;
  5426. 'x', 't', 'r', 'f', 'a', 'e':
  5427. begin
  5428. p := p - 1;
  5429. // UnquoteChar expects the escaped AnsiChar under the pointer
  5430. QuotedChar := UnQuoteChar(p, TemplateEnd);
  5431. p := p + 1;
  5432. // Skip after last part of the escaped sequence - UnquoteChar stops on the last symbol of it
  5433. p0 := @QuotedChar;
  5434. p1 := p0 + 1;
  5435. end;
  5436. 'l':
  5437. begin
  5438. Mode := smodeOneLower;
  5439. p1 := p0;
  5440. end;
  5441. 'L':
  5442. begin
  5443. Mode := smodeAllLower;
  5444. p1 := p0;
  5445. end;
  5446. 'u':
  5447. begin
  5448. Mode := smodeOneUpper;
  5449. p1 := p0;
  5450. end;
  5451. 'U':
  5452. begin
  5453. Mode := smodeAllUpper;
  5454. p1 := p0;
  5455. end;
  5456. else
  5457. begin
  5458. Inc(p0);
  5459. Inc(p1);
  5460. end;
  5461. end;
  5462. end
  5463. end;
  5464. if p0 < p1 then
  5465. begin
  5466. while p0 < p1 do
  5467. begin
  5468. case Mode of
  5469. smodeOneLower:
  5470. begin
  5471. ResultPtr^ := _LowerCase(p0^);
  5472. Mode := smodeNormal;
  5473. end;
  5474. smodeAllLower:
  5475. begin
  5476. ResultPtr^ := _LowerCase(p0^);
  5477. end;
  5478. smodeOneUpper:
  5479. begin
  5480. ResultPtr^ := _UpperCase(p0^);
  5481. Mode := smodeNormal;
  5482. end;
  5483. smodeAllUpper:
  5484. begin
  5485. ResultPtr^ := _UpperCase(p0^);
  5486. end;
  5487. else
  5488. ResultPtr^ := p0^;
  5489. end;
  5490. Inc(ResultPtr);
  5491. Inc(p0);
  5492. end;
  5493. Mode := smodeNormal;
  5494. end;
  5495. end;
  5496. end; { of function TRegExpr.Substitute
  5497. -------------------------------------------------------------- }
  5498. procedure TRegExpr.Split(const AInputStr: RegExprString; APieces: TStrings);
  5499. var
  5500. PrevPos: PtrInt;
  5501. begin
  5502. PrevPos := 1;
  5503. if Exec(AInputStr) then
  5504. repeat
  5505. APieces.Add(System.Copy(AInputStr, PrevPos, MatchPos[0] - PrevPos));
  5506. PrevPos := MatchPos[0] + MatchLen[0];
  5507. until not ExecNext;
  5508. APieces.Add(System.Copy(AInputStr, PrevPos, MaxInt)); // Tail
  5509. end; { of procedure TRegExpr.Split
  5510. -------------------------------------------------------------- }
  5511. function TRegExpr.Replace(const AInputStr: RegExprString;
  5512. const AReplaceStr: RegExprString;
  5513. AUseSubstitution: boolean{$IFDEF DefParam} = False{$ENDIF}): RegExprString;
  5514. var
  5515. PrevPos: PtrInt;
  5516. begin
  5517. Result := '';
  5518. PrevPos := 1;
  5519. if Exec(AInputStr) then
  5520. repeat
  5521. Result := Result + System.Copy(AInputStr, PrevPos, MatchPos[0] - PrevPos);
  5522. if AUseSubstitution // ###0.946
  5523. then
  5524. Result := Result + Substitute(AReplaceStr)
  5525. else
  5526. Result := Result + AReplaceStr;
  5527. PrevPos := MatchPos[0] + MatchLen[0];
  5528. until not ExecNext;
  5529. Result := Result + System.Copy(AInputStr, PrevPos, MaxInt); // Tail
  5530. end; { of function TRegExpr.Replace
  5531. -------------------------------------------------------------- }
  5532. function TRegExpr.ReplaceEx(const AInputStr: RegExprString;
  5533. AReplaceFunc: TRegExprReplaceFunction): RegExprString;
  5534. var
  5535. PrevPos: PtrInt;
  5536. begin
  5537. Result := '';
  5538. PrevPos := 1;
  5539. if Exec(AInputStr) then
  5540. repeat
  5541. Result := Result + System.Copy(AInputStr, PrevPos, MatchPos[0] - PrevPos)
  5542. + AReplaceFunc(Self);
  5543. PrevPos := MatchPos[0] + MatchLen[0];
  5544. until not ExecNext;
  5545. Result := Result + System.Copy(AInputStr, PrevPos, MaxInt); // Tail
  5546. end; { of function TRegExpr.ReplaceEx
  5547. -------------------------------------------------------------- }
  5548. {$IFDEF OverMeth}
  5549. function TRegExpr.Replace(const AInputStr: RegExprString;
  5550. AReplaceFunc: TRegExprReplaceFunction): RegExprString;
  5551. begin
  5552. Result := ReplaceEx(AInputStr, AReplaceFunc);
  5553. end; { of function TRegExpr.Replace
  5554. -------------------------------------------------------------- }
  5555. {$ENDIF}
  5556. { ============================================================= }
  5557. { ====================== Debug section ======================== }
  5558. { ============================================================= }
  5559. {$IFDEF UseFirstCharSet}
  5560. procedure TRegExpr.FillFirstCharSet(prog: PRegExprChar);
  5561. var
  5562. scan: PRegExprChar; // Current node.
  5563. Next: PRegExprChar; // Next node.
  5564. opnd: PRegExprChar;
  5565. Oper: TREOp;
  5566. ch: REChar;
  5567. min_cnt: integer;
  5568. {$IFDEF UseLineSep}
  5569. i: integer;
  5570. {$ENDIF}
  5571. TempSet: TRegExprCharset;
  5572. begin
  5573. TempSet := [];
  5574. scan := prog;
  5575. while scan <> nil do
  5576. begin
  5577. Next := regNext(scan);
  5578. Oper := PREOp(scan)^;
  5579. case Oper of
  5580. OP_BSUBEXP,
  5581. OP_BSUBEXPCI:
  5582. begin
  5583. // we cannot optimize r.e. if it starts with back reference
  5584. FirstCharSet := RegExprAllSet; //###0.930
  5585. Exit;
  5586. end;
  5587. OP_BOL,
  5588. OP_BOLML:
  5589. ; // Exit; //###0.937
  5590. OP_EOL,
  5591. OP_EOL2,
  5592. OP_EOLML:
  5593. begin //###0.948 was empty in 0.947, was EXIT in 0.937
  5594. Include(FirstCharSet, 0);
  5595. if ModifierM then
  5596. begin
  5597. {$IFDEF UseLineSep}
  5598. for i := 1 to Length(LineSeparators) do
  5599. Include(FirstCharSet, byte(LineSeparators[i]));
  5600. {$ELSE}
  5601. FirstCharSet := FirstCharSet + RegExprLineSeparatorsSet;
  5602. {$ENDIF}
  5603. end;
  5604. Exit;
  5605. end;
  5606. OP_BOUND,
  5607. OP_NOTBOUND:
  5608. ; //###0.943 ?!!
  5609. OP_ANY,
  5610. OP_ANYML:
  5611. begin // we can better define ANYML !!!
  5612. FirstCharSet := RegExprAllSet; //###0.930
  5613. Exit;
  5614. end;
  5615. OP_ANYDIGIT:
  5616. begin
  5617. FirstCharSet := FirstCharSet + RegExprDigitSet;
  5618. Exit;
  5619. end;
  5620. OP_NOTDIGIT:
  5621. begin
  5622. FirstCharSet := FirstCharSet + (RegExprAllSet - RegExprDigitSet);
  5623. Exit;
  5624. end;
  5625. OP_ANYLETTER:
  5626. begin
  5627. GetCharSetFromWordChars(TempSet);
  5628. FirstCharSet := FirstCharSet + TempSet;
  5629. Exit;
  5630. end;
  5631. OP_NOTLETTER:
  5632. begin
  5633. GetCharSetFromWordChars(TempSet);
  5634. FirstCharSet := FirstCharSet + (RegExprAllSet - TempSet);
  5635. Exit;
  5636. end;
  5637. OP_ANYSPACE:
  5638. begin
  5639. GetCharSetFromSpaceChars(TempSet);
  5640. FirstCharSet := FirstCharSet + TempSet;
  5641. Exit;
  5642. end;
  5643. OP_NOTSPACE:
  5644. begin
  5645. GetCharSetFromSpaceChars(TempSet);
  5646. FirstCharSet := FirstCharSet + (RegExprAllSet - TempSet);
  5647. Exit;
  5648. end;
  5649. OP_ANYVERTSEP:
  5650. begin
  5651. FirstCharSet := FirstCharSet + RegExprLineSeparatorsSet;
  5652. Exit;
  5653. end;
  5654. OP_NOTVERTSEP:
  5655. begin
  5656. FirstCharSet := FirstCharSet + (RegExprAllSet - RegExprLineSeparatorsSet);
  5657. Exit;
  5658. end;
  5659. OP_ANYHORZSEP:
  5660. begin
  5661. FirstCharSet := FirstCharSet + RegExprHorzSeparatorsSet;
  5662. Exit;
  5663. end;
  5664. OP_NOTHORZSEP:
  5665. begin
  5666. FirstCharSet := FirstCharSet + (RegExprAllSet - RegExprHorzSeparatorsSet);
  5667. Exit;
  5668. end;
  5669. OP_EXACTLYCI:
  5670. begin
  5671. ch := (scan + REOpSz + RENextOffSz + RENumberSz)^;
  5672. {$IFDEF UnicodeRE}
  5673. if Ord(ch) <= $FF then
  5674. {$ENDIF}
  5675. begin
  5676. Include(FirstCharSet, byte(ch));
  5677. Include(FirstCharSet, byte(InvertCase(ch)));
  5678. end;
  5679. Exit;
  5680. end;
  5681. OP_EXACTLY:
  5682. begin
  5683. ch := (scan + REOpSz + RENextOffSz + RENumberSz)^;
  5684. {$IFDEF UnicodeRE}
  5685. if Ord(ch) <= $FF then
  5686. {$ENDIF}
  5687. Include(FirstCharSet, byte(ch));
  5688. Exit;
  5689. end;
  5690. OP_ANYOF:
  5691. begin
  5692. GetCharSetFromCharClass(scan + REOpSz + RENextOffSz, False, TempSet);
  5693. FirstCharSet := FirstCharSet + TempSet;
  5694. Exit;
  5695. end;
  5696. OP_ANYBUT:
  5697. begin
  5698. GetCharSetFromCharClass(scan + REOpSz + RENextOffSz, False, TempSet);
  5699. FirstCharSet := FirstCharSet + (RegExprAllSet - TempSet);
  5700. Exit;
  5701. end;
  5702. OP_ANYOFCI:
  5703. begin
  5704. GetCharSetFromCharClass(scan + REOpSz + RENextOffSz, True, TempSet);
  5705. FirstCharSet := FirstCharSet + TempSet;
  5706. Exit;
  5707. end;
  5708. OP_ANYBUTCI:
  5709. begin
  5710. GetCharSetFromCharClass(scan + REOpSz + RENextOffSz, True, TempSet);
  5711. FirstCharSet := FirstCharSet + (RegExprAllSet - TempSet);
  5712. Exit;
  5713. end;
  5714. OP_NOTHING:
  5715. ;
  5716. OP_COMMENT:
  5717. ;
  5718. OP_BACK:
  5719. ;
  5720. OP_OPEN_FIRST .. OP_OPEN_LAST:
  5721. begin
  5722. FillFirstCharSet(Next);
  5723. Exit;
  5724. end;
  5725. OP_CLOSE_FIRST .. OP_CLOSE_LAST:
  5726. begin
  5727. FillFirstCharSet(Next);
  5728. Exit;
  5729. end;
  5730. OP_BRANCH:
  5731. begin
  5732. if (PREOp(Next)^ <> OP_BRANCH) // No choice.
  5733. then
  5734. Next := scan + REOpSz + RENextOffSz // Avoid recursion.
  5735. else
  5736. begin
  5737. repeat
  5738. FillFirstCharSet(scan + REOpSz + RENextOffSz);
  5739. scan := regNext(scan);
  5740. until (scan = nil) or (PREOp(scan)^ <> OP_BRANCH);
  5741. Exit;
  5742. end;
  5743. end;
  5744. {$IFDEF ComplexBraces}
  5745. OP_LOOPENTRY:
  5746. begin //###0.925
  5747. //LoopStack [LoopStackIdx] := 0; //###0.940 line removed
  5748. FillFirstCharSet(Next); // execute LOOP
  5749. Exit;
  5750. end;
  5751. OP_LOOP,
  5752. OP_LOOPNG:
  5753. begin //###0.940
  5754. opnd := scan + PRENextOff(AlignToPtr(scan + REOpSz + RENextOffSz + REBracesArgSz * 2))^;
  5755. min_cnt := PREBracesArg(AlignToPtr(scan + REOpSz + RENextOffSz))^;
  5756. FillFirstCharSet(opnd);
  5757. if min_cnt = 0 then
  5758. FillFirstCharSet(Next);
  5759. Exit;
  5760. end;
  5761. {$ENDIF}
  5762. OP_STAR,
  5763. OP_STARNG,
  5764. OP_STAR_POSS: //###0.940
  5765. FillFirstCharSet(scan + REOpSz + RENextOffSz);
  5766. OP_PLUS,
  5767. OP_PLUSNG,
  5768. OP_PLUS_POSS:
  5769. begin //###0.940
  5770. FillFirstCharSet(scan + REOpSz + RENextOffSz);
  5771. Exit;
  5772. end;
  5773. OP_BRACES,
  5774. OP_BRACESNG,
  5775. OP_BRACES_POSS:
  5776. begin //###0.940
  5777. opnd := scan + REOpSz + RENextOffSz + REBracesArgSz * 2;
  5778. min_cnt := PREBracesArg(AlignToPtr(scan + REOpSz + RENextOffSz))^; // BRACES
  5779. FillFirstCharSet(opnd);
  5780. if min_cnt > 0 then
  5781. Exit;
  5782. end;
  5783. OP_EEND:
  5784. begin
  5785. FirstCharSet := RegExprAllSet; //###0.948
  5786. Exit;
  5787. end;
  5788. OP_ANYCATEGORY,
  5789. OP_NOTCATEGORY:
  5790. begin
  5791. FirstCharSet := RegExprAllSet;
  5792. Exit;
  5793. end;
  5794. OP_RECUR,
  5795. OP_SUBCALL_FIRST .. OP_SUBCALL_LAST:
  5796. begin
  5797. end
  5798. else
  5799. begin
  5800. fLastErrorOpcode := Oper;
  5801. Error(reeUnknownOpcodeInFillFirst);
  5802. Exit;
  5803. end;
  5804. end; { of case scan^}
  5805. scan := Next;
  5806. end; { of while scan <> nil}
  5807. end; { of procedure FillFirstCharSet
  5808. --------------------------------------------------------------}
  5809. {$ENDIF}
  5810. procedure TRegExpr.InitCharCheckers;
  5811. var
  5812. Cnt: integer;
  5813. //
  5814. function Add(AChecker: TRegExprCharChecker): byte;
  5815. begin
  5816. Inc(Cnt);
  5817. if Cnt > High(CharCheckers) then
  5818. Error(reeTooSmallCheckersArray);
  5819. CharCheckers[Cnt - 1] := AChecker;
  5820. Result := Cnt - 1;
  5821. end;
  5822. //
  5823. begin
  5824. Cnt := 0;
  5825. FillChar(CharCheckers, SizeOf(CharCheckers), 0);
  5826. CheckerIndex_Word := Add(CharChecker_Word);
  5827. CheckerIndex_NotWord := Add(CharChecker_NotWord);
  5828. CheckerIndex_Space := Add(CharChecker_Space);
  5829. CheckerIndex_NotSpace := Add(CharChecker_NotSpace);
  5830. CheckerIndex_Digit := Add(CharChecker_Digit);
  5831. CheckerIndex_NotDigit := Add(CharChecker_NotDigit);
  5832. CheckerIndex_VertSep := Add(CharChecker_VertSep);
  5833. CheckerIndex_NotVertSep := Add(CharChecker_NotVertSep);
  5834. CheckerIndex_HorzSep := Add(CharChecker_HorzSep);
  5835. CheckerIndex_NotHorzSep := Add(CharChecker_NotHorzSep);
  5836. //CheckerIndex_AllAZ := Add(CharChecker_AllAZ);
  5837. CheckerIndex_LowerAZ := Add(CharChecker_LowerAZ);
  5838. CheckerIndex_UpperAZ := Add(CharChecker_UpperAZ);
  5839. SetLength(CharCheckerInfos, 3);
  5840. with CharCheckerInfos[0] do
  5841. begin
  5842. CharBegin := 'a';
  5843. CharEnd:= 'z';
  5844. CheckerIndex := CheckerIndex_LowerAZ;
  5845. end;
  5846. with CharCheckerInfos[1] do
  5847. begin
  5848. CharBegin := 'A';
  5849. CharEnd := 'Z';
  5850. CheckerIndex := CheckerIndex_UpperAZ;
  5851. end;
  5852. with CharCheckerInfos[2] do
  5853. begin
  5854. CharBegin := '0';
  5855. CharEnd := '9';
  5856. CheckerIndex := CheckerIndex_Digit;
  5857. end;
  5858. end;
  5859. function TRegExpr.CharChecker_Word(ch: REChar): boolean;
  5860. begin
  5861. Result := IsWordChar(ch);
  5862. end;
  5863. function TRegExpr.CharChecker_NotWord(ch: REChar): boolean;
  5864. begin
  5865. Result := not IsWordChar(ch);
  5866. end;
  5867. function TRegExpr.CharChecker_Space(ch: REChar): boolean;
  5868. begin
  5869. Result := IsSpaceChar(ch);
  5870. end;
  5871. function TRegExpr.CharChecker_NotSpace(ch: REChar): boolean;
  5872. begin
  5873. Result := not IsSpaceChar(ch);
  5874. end;
  5875. function TRegExpr.CharChecker_Digit(ch: REChar): boolean;
  5876. begin
  5877. Result := IsDigitChar(ch);
  5878. end;
  5879. function TRegExpr.CharChecker_NotDigit(ch: REChar): boolean;
  5880. begin
  5881. Result := not IsDigitChar(ch);
  5882. end;
  5883. function TRegExpr.CharChecker_VertSep(ch: REChar): boolean;
  5884. begin
  5885. Result := IsVertLineSeparator(ch);
  5886. end;
  5887. function TRegExpr.CharChecker_NotVertSep(ch: REChar): boolean;
  5888. begin
  5889. Result := not IsVertLineSeparator(ch);
  5890. end;
  5891. function TRegExpr.CharChecker_HorzSep(ch: REChar): boolean;
  5892. begin
  5893. Result := IsHorzSeparator(ch);
  5894. end;
  5895. function TRegExpr.CharChecker_NotHorzSep(ch: REChar): boolean;
  5896. begin
  5897. Result := not IsHorzSeparator(ch);
  5898. end;
  5899. function TRegExpr.CharChecker_LowerAZ(ch: REChar): boolean;
  5900. begin
  5901. case ch of
  5902. 'a' .. 'z':
  5903. Result := True;
  5904. else
  5905. Result := False;
  5906. end;
  5907. end;
  5908. function TRegExpr.CharChecker_UpperAZ(ch: REChar): boolean;
  5909. begin
  5910. case ch of
  5911. 'A' .. 'Z':
  5912. Result := True;
  5913. else
  5914. Result := False;
  5915. end;
  5916. end;
  5917. {$IFDEF RegExpPCodeDump}
  5918. function TRegExpr.DumpOp(op: TREOp): RegExprString;
  5919. // printable representation of opcode
  5920. begin
  5921. case op of
  5922. OP_BOL:
  5923. Result := 'BOL';
  5924. OP_EOL:
  5925. Result := 'EOL';
  5926. OP_EOL2:
  5927. Result := 'EOL2';
  5928. OP_BOLML:
  5929. Result := 'BOLML';
  5930. OP_EOLML:
  5931. Result := 'EOLML';
  5932. OP_BOUND:
  5933. Result := 'BOUND';
  5934. OP_NOTBOUND:
  5935. Result := 'NOTBOUND';
  5936. OP_ANY:
  5937. Result := 'ANY';
  5938. OP_ANYML:
  5939. Result := 'ANYML';
  5940. OP_ANYLETTER:
  5941. Result := 'ANYLETTER';
  5942. OP_NOTLETTER:
  5943. Result := 'NOTLETTER';
  5944. OP_ANYDIGIT:
  5945. Result := 'ANYDIGIT';
  5946. OP_NOTDIGIT:
  5947. Result := 'NOTDIGIT';
  5948. OP_ANYSPACE:
  5949. Result := 'ANYSPACE';
  5950. OP_NOTSPACE:
  5951. Result := 'NOTSPACE';
  5952. OP_ANYHORZSEP:
  5953. Result := 'ANYHORZSEP';
  5954. OP_NOTHORZSEP:
  5955. Result := 'NOTHORZSEP';
  5956. OP_ANYVERTSEP:
  5957. Result := 'ANYVERTSEP';
  5958. OP_NOTVERTSEP:
  5959. Result := 'NOTVERTSEP';
  5960. OP_ANYOF:
  5961. Result := 'ANYOF';
  5962. OP_ANYBUT:
  5963. Result := 'ANYBUT';
  5964. OP_ANYOFCI:
  5965. Result := 'ANYOF/CI';
  5966. OP_ANYBUTCI:
  5967. Result := 'ANYBUT/CI';
  5968. OP_BRANCH:
  5969. Result := 'BRANCH';
  5970. OP_EXACTLY:
  5971. Result := 'EXACTLY';
  5972. OP_EXACTLYCI:
  5973. Result := 'EXACTLY/CI';
  5974. OP_NOTHING:
  5975. Result := 'NOTHING';
  5976. OP_COMMENT:
  5977. Result := 'COMMENT';
  5978. OP_BACK:
  5979. Result := 'BACK';
  5980. OP_EEND:
  5981. Result := 'END';
  5982. OP_BSUBEXP:
  5983. Result := 'BSUBEXP';
  5984. OP_BSUBEXPCI:
  5985. Result := 'BSUBEXP/CI';
  5986. OP_OPEN_FIRST .. OP_OPEN_LAST:
  5987. Result := Format('OPEN[%d]', [Ord(op) - Ord(OP_OPEN)]);
  5988. OP_CLOSE_FIRST .. OP_CLOSE_LAST:
  5989. Result := Format('CLOSE[%d]', [Ord(op) - Ord(OP_CLOSE)]);
  5990. OP_STAR:
  5991. Result := 'STAR';
  5992. OP_PLUS:
  5993. Result := 'PLUS';
  5994. OP_BRACES:
  5995. Result := 'BRACES';
  5996. {$IFDEF ComplexBraces}
  5997. OP_LOOPENTRY:
  5998. Result := 'LOOPENTRY';
  5999. OP_LOOP:
  6000. Result := 'LOOP';
  6001. OP_LOOPNG:
  6002. Result := 'LOOPNG';
  6003. {$ENDIF}
  6004. OP_STARNG:
  6005. Result := 'STARNG';
  6006. OP_PLUSNG:
  6007. Result := 'PLUSNG';
  6008. OP_BRACESNG:
  6009. Result := 'BRACESNG';
  6010. OP_STAR_POSS:
  6011. Result := 'STAR_POSS';
  6012. OP_PLUS_POSS:
  6013. Result := 'PLUS_POSS';
  6014. OP_BRACES_POSS:
  6015. Result := 'BRACES_POSS';
  6016. OP_ANYCATEGORY:
  6017. Result := 'ANYCATEG';
  6018. OP_NOTCATEGORY:
  6019. Result := 'NOTCATEG';
  6020. OP_RECUR:
  6021. Result := 'RECURSION';
  6022. OP_SUBCALL_FIRST .. OP_SUBCALL_LAST:
  6023. Result := Format('SUBCALL[%d]', [Ord(op) - Ord(OP_SUBCALL)]);
  6024. else
  6025. Error(reeDumpCorruptedOpcode);
  6026. end;
  6027. end; { of function TRegExpr.DumpOp
  6028. -------------------------------------------------------------- }
  6029. function TRegExpr.IsCompiled: boolean;
  6030. begin
  6031. Result := programm <> nil;
  6032. end;
  6033. function PrintableChar(AChar: REChar): RegExprString; {$IFDEF InlineFuncs}inline;{$ENDIF}
  6034. begin
  6035. if AChar < ' ' then
  6036. Result := '#' + IntToStr(Ord(AChar))
  6037. else
  6038. Result := AChar;
  6039. end;
  6040. function TRegExpr.DumpCheckerIndex(N: byte): RegExprString;
  6041. begin
  6042. Result := '?';
  6043. if N = CheckerIndex_Word then Result := '\w' else
  6044. if N = CheckerIndex_NotWord then Result := '\W' else
  6045. if N = CheckerIndex_Digit then Result := '\d' else
  6046. if N = CheckerIndex_NotDigit then Result := '\D' else
  6047. if N = CheckerIndex_Space then Result := '\s' else
  6048. if N = CheckerIndex_NotSpace then Result := '\S' else
  6049. if N = CheckerIndex_HorzSep then Result := '\h' else
  6050. if N = CheckerIndex_NotHorzSep then Result := '\H' else
  6051. if N = CheckerIndex_VertSep then Result := '\v' else
  6052. if N = CheckerIndex_NotVertSep then Result := '\V' else
  6053. if N = CheckerIndex_LowerAZ then Result := 'az' else
  6054. if N = CheckerIndex_UpperAZ then Result := 'AZ' else
  6055. ;
  6056. end;
  6057. function TRegExpr.DumpCategoryChars(ch, ch2: REChar; Positive: boolean): RegExprString;
  6058. const
  6059. S: array[boolean] of RegExprString = ('P', 'p');
  6060. begin
  6061. Result := '\' + S[Positive] + '{' + ch;
  6062. if ch2 <> #0 then
  6063. Result := Result + ch2;
  6064. Result := Result + '} ';
  6065. end;
  6066. function TRegExpr.Dump: RegExprString;
  6067. // dump a regexp in vaguely comprehensible form
  6068. var
  6069. s: PRegExprChar;
  6070. op: TREOp; // Arbitrary non-END op.
  6071. next: PRegExprChar;
  6072. i, NLen: integer;
  6073. Diff: PtrInt;
  6074. iByte: byte;
  6075. ch, ch2: REChar;
  6076. begin
  6077. if not IsProgrammOk then
  6078. Exit;
  6079. op := OP_EXACTLY;
  6080. Result := '';
  6081. s := regCodeWork;
  6082. while op <> OP_EEND do
  6083. begin // While that wasn't END last time...
  6084. op := s^;
  6085. Result := Result + Format('%2d: %s', [s - programm, DumpOp(s^)]);
  6086. // Where, what.
  6087. next := regNext(s);
  6088. if next = nil // Next ptr.
  6089. then
  6090. Result := Result + ' (0)'
  6091. else
  6092. begin
  6093. if next > s
  6094. // ###0.948 PWideChar subtraction workaround (see comments in Tail method for details)
  6095. then
  6096. Diff := next - s
  6097. else
  6098. Diff := -(s - next);
  6099. Result := Result + Format(' (%d) ', [(s - programm) + Diff]);
  6100. end;
  6101. Inc(s, REOpSz + RENextOffSz);
  6102. if (op = OP_ANYOF) or (op = OP_ANYOFCI) or (op = OP_ANYBUT) or (op = OP_ANYBUTCI) then
  6103. begin
  6104. repeat
  6105. case s^ of
  6106. OpKind_End:
  6107. begin
  6108. Inc(s);
  6109. Break;
  6110. end;
  6111. OpKind_Range:
  6112. begin
  6113. Result := Result + 'Rng(';
  6114. Inc(s);
  6115. Result := Result + PrintableChar(s^) + '-';
  6116. Inc(s);
  6117. Result := Result + PrintableChar(s^);
  6118. Result := Result + ') ';
  6119. Inc(s);
  6120. end;
  6121. OpKind_MetaClass:
  6122. begin
  6123. Inc(s);
  6124. Result := Result + DumpCheckerIndex(byte(s^)) + ' ';
  6125. Inc(s);
  6126. end;
  6127. OpKind_Char:
  6128. begin
  6129. Inc(s);
  6130. NLen := PLongInt(s)^;
  6131. Inc(s, RENumberSz);
  6132. Result := Result + 'Ch(';
  6133. for i := 1 to NLen do
  6134. begin
  6135. Result := Result + PrintableChar(s^);
  6136. Inc(s);
  6137. end;
  6138. Result := Result + ') ';
  6139. end;
  6140. OpKind_CategoryYes:
  6141. begin
  6142. Inc(s);
  6143. ch := s^;
  6144. Inc(s);
  6145. ch2 := s^;
  6146. Result := Result + DumpCategoryChars(ch, ch2, True);
  6147. Inc(s);
  6148. end;
  6149. OpKind_CategoryNo:
  6150. begin
  6151. Inc(s);
  6152. ch := s^;
  6153. Inc(s);
  6154. ch2 := s^;
  6155. Result := Result + DumpCategoryChars(ch, ch2, False);
  6156. Inc(s);
  6157. end;
  6158. else
  6159. Error(reeDumpCorruptedOpcode);
  6160. end;
  6161. until false;
  6162. end;
  6163. if (op = OP_EXACTLY) or (op = OP_EXACTLYCI) then
  6164. begin
  6165. // Literal string, where present.
  6166. NLen := PLongInt(s)^;
  6167. Inc(s, RENumberSz);
  6168. for i := 1 to NLen do
  6169. begin
  6170. Result := Result + PrintableChar(s^);
  6171. Inc(s);
  6172. end;
  6173. end;
  6174. if (op = OP_BSUBEXP) or (op = OP_BSUBEXPCI) then
  6175. begin
  6176. Result := Result + ' \' + IntToStr(Ord(s^));
  6177. Inc(s);
  6178. end;
  6179. if (op = OP_BRACES) or (op = OP_BRACESNG) or (op = OP_BRACES_POSS) then
  6180. begin // ###0.941
  6181. // show min/max argument of braces operator
  6182. Result := Result + Format('{%d,%d}', [PREBracesArg(AlignToInt(s))^,
  6183. PREBracesArg(AlignToInt(s + REBracesArgSz))^]);
  6184. Inc(s, REBracesArgSz * 2);
  6185. end;
  6186. {$IFDEF ComplexBraces}
  6187. if (op = OP_LOOP) or (op = OP_LOOPNG) then
  6188. begin // ###0.940
  6189. Result := Result + Format(' -> (%d) {%d,%d}',
  6190. [(s - programm - (REOpSz + RENextOffSz)) +
  6191. PRENextOff(AlignToPtr(s + 2 * REBracesArgSz))^,
  6192. PREBracesArg(AlignToInt(s))^,
  6193. PREBracesArg(AlignToInt(s + REBracesArgSz))^]);
  6194. Inc(s, 2 * REBracesArgSz + RENextOffSz);
  6195. end;
  6196. {$ENDIF}
  6197. if (op = OP_ANYCATEGORY) or (op = OP_NOTCATEGORY) then
  6198. begin
  6199. ch := s^;
  6200. Inc(s);
  6201. ch2 := s^;
  6202. Inc(s);
  6203. if ch2<>#0 then
  6204. Result := Result + '{' + ch + ch2 + '}'
  6205. else
  6206. Result := Result + '{' + ch + '}';
  6207. end;
  6208. Result := Result + #$d#$a;
  6209. end; { of while }
  6210. // Header fields of interest.
  6211. if regAnchored <> #0 then
  6212. Result := Result + 'Anchored; ';
  6213. if regMustString <> '' then
  6214. Result := Result + 'Must have: "' + regMustString + '"; ';
  6215. {$IFDEF UseFirstCharSet} // ###0.929
  6216. Result := Result + #$d#$a'First charset: ';
  6217. if FirstCharSet = [] then
  6218. Result := Result + '<empty set>'
  6219. else
  6220. if FirstCharSet = RegExprAllSet then
  6221. Result := Result + '<all chars>'
  6222. else
  6223. for iByte := 0 to 255 do
  6224. if iByte in FirstCharSet then
  6225. Result := Result + PrintableChar(REChar(iByte));
  6226. {$ENDIF}
  6227. Result := Result + #$d#$a;
  6228. end; { of function TRegExpr.Dump
  6229. -------------------------------------------------------------- }
  6230. {$ENDIF}
  6231. function TRegExpr.IsFixedLength(var op: TREOp; var ALen: integer): boolean;
  6232. var
  6233. s, next: PRegExprChar;
  6234. N, N2: integer;
  6235. begin
  6236. Result := False;
  6237. ALen := 0;
  6238. if not IsCompiled then Exit;
  6239. s := regCodeWork;
  6240. repeat
  6241. next := regNext(s);
  6242. op := s^;
  6243. Inc(s, REOpSz + RENextOffSz);
  6244. case op of
  6245. OP_EEND:
  6246. begin
  6247. Result := True;
  6248. Exit;
  6249. end;
  6250. OP_BRANCH:
  6251. begin
  6252. op := next^;
  6253. if op <> OP_EEND then Exit;
  6254. end;
  6255. OP_COMMENT,
  6256. OP_BOUND,
  6257. OP_NOTBOUND:
  6258. Continue;
  6259. OP_ANY,
  6260. OP_ANYML,
  6261. OP_ANYDIGIT,
  6262. OP_NOTDIGIT,
  6263. OP_ANYLETTER,
  6264. OP_NOTLETTER,
  6265. OP_ANYSPACE,
  6266. OP_NOTSPACE,
  6267. OP_ANYHORZSEP,
  6268. OP_NOTHORZSEP,
  6269. OP_ANYVERTSEP,
  6270. OP_NOTVERTSEP:
  6271. begin
  6272. Inc(ALen);
  6273. Continue;
  6274. end;
  6275. OP_ANYOF,
  6276. OP_ANYOFCI,
  6277. OP_ANYBUT,
  6278. OP_ANYBUTCI:
  6279. begin
  6280. Inc(ALen);
  6281. repeat
  6282. case s^ of
  6283. OpKind_End:
  6284. begin
  6285. Inc(s);
  6286. Break;
  6287. end;
  6288. OpKind_Range:
  6289. begin
  6290. Inc(s);
  6291. Inc(s);
  6292. Inc(s);
  6293. end;
  6294. OpKind_MetaClass:
  6295. begin
  6296. Inc(s);
  6297. Inc(s);
  6298. end;
  6299. OpKind_Char:
  6300. begin
  6301. Inc(s);
  6302. Inc(s, RENumberSz + PLongInt(s)^);
  6303. end;
  6304. OpKind_CategoryYes,
  6305. OpKind_CategoryNo:
  6306. begin
  6307. Inc(s);
  6308. Inc(s);
  6309. Inc(s);
  6310. end;
  6311. end;
  6312. until False;
  6313. end;
  6314. OP_EXACTLY,
  6315. OP_EXACTLYCI:
  6316. begin
  6317. N := PLongInt(s)^;
  6318. Inc(ALen, N);
  6319. Inc(s, RENumberSz + N);
  6320. Continue;
  6321. end;
  6322. OP_ANYCATEGORY,
  6323. OP_NOTCATEGORY:
  6324. begin
  6325. Inc(ALen);
  6326. Inc(s, 2);
  6327. Continue;
  6328. end;
  6329. OP_BRACES,
  6330. OP_BRACESNG,
  6331. OP_BRACES_POSS:
  6332. begin
  6333. // allow only d{n,n}
  6334. N := PREBracesArg(AlignToInt(s))^;
  6335. N2 := PREBracesArg(AlignToInt(s + REBracesArgSz))^;
  6336. if N <> N2 then
  6337. Exit;
  6338. Inc(ALen, N-1);
  6339. Inc(s, REBracesArgSz * 2);
  6340. end;
  6341. else
  6342. Exit;
  6343. end;
  6344. until False;
  6345. end;
  6346. {$IFDEF reRealExceptionAddr}
  6347. {$OPTIMIZATION ON}
  6348. // ReturnAddr works correctly only if compiler optimization is ON
  6349. // I placed this method at very end of unit because there are no
  6350. // way to restore compiler optimization flag ...
  6351. {$ENDIF}
  6352. procedure TRegExpr.Error(AErrorID: integer);
  6353. {$IFNDEF LINUX}
  6354. {$IFDEF reRealExceptionAddr}
  6355. function ReturnAddr: Pointer; // ###0.938
  6356. asm
  6357. mov eax,[ebp+4]
  6358. end;
  6359. {$ENDIF}
  6360. {$ENDIF}
  6361. var
  6362. e: ERegExpr;
  6363. Msg: string;
  6364. begin
  6365. fLastError := AErrorID; // dummy stub - useless because will raise exception
  6366. Msg := ErrorMsg(AErrorID);
  6367. // compilation error ?
  6368. if AErrorID < reeFirstRuntimeCode then
  6369. Msg := Msg + ' (pos ' + IntToStr(CompilerErrorPos) + ')';
  6370. e := ERegExpr.Create(Msg);
  6371. e.ErrorCode := AErrorID;
  6372. e.CompilerErrorPos := CompilerErrorPos;
  6373. raise e
  6374. {$IFNDEF LINUX}
  6375. {$IFDEF reRealExceptionAddr}
  6376. at ReturnAddr
  6377. {$ENDIF}
  6378. {$ENDIF};
  6379. end; { of procedure TRegExpr.Error
  6380. -------------------------------------------------------------- }
  6381. {$IFDEF Compat} // APIs needed only for users of old FPC 3.0
  6382. function TRegExpr.ExecPos(AOffset: integer; ATryOnce: boolean): boolean; overload;
  6383. begin
  6384. Result := ExecPrim(AOffset, ATryOnce, False, False);
  6385. end;
  6386. function TRegExpr.OldInvertCase(const Ch: REChar): REChar;
  6387. begin
  6388. Result := _UpperCase(Ch);
  6389. if Result = Ch then
  6390. Result := _LowerCase(Ch);
  6391. end;
  6392. class function TRegExpr.InvertCaseFunction(const Ch: REChar): REChar;
  6393. begin
  6394. Result := _UpperCase(Ch);
  6395. if Result = Ch then
  6396. Result := _LowerCase(Ch);
  6397. end;
  6398. function TRegExpr.GetLinePairedSeparator: RegExprString;
  6399. begin
  6400. // not supported anymore
  6401. Result := '';
  6402. end;
  6403. procedure TRegExpr.SetLinePairedSeparator(const AValue: RegExprString);
  6404. begin
  6405. // not supported anymore
  6406. end;
  6407. procedure TRegExpr.SetUseOsLineEndOnReplace(AValue: boolean);
  6408. begin
  6409. if fUseOsLineEndOnReplace = AValue then
  6410. Exit;
  6411. fUseOsLineEndOnReplace := AValue;
  6412. if fUseOsLineEndOnReplace then
  6413. fReplaceLineEnd := sLineBreak
  6414. else
  6415. fReplaceLineEnd := #10;
  6416. end;
  6417. {$ENDIF}
  6418. end.