regexpr.pas 157 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262226322642265226622672268226922702271227222732274227522762277227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367236823692370237123722373237423752376237723782379238023812382238323842385238623872388238923902391239223932394239523962397239823992400240124022403240424052406240724082409241024112412241324142415241624172418241924202421242224232424242524262427242824292430243124322433243424352436243724382439244024412442244324442445244624472448244924502451245224532454245524562457245824592460246124622463246424652466246724682469247024712472247324742475247624772478247924802481248224832484248524862487248824892490249124922493249424952496249724982499250025012502250325042505250625072508250925102511251225132514251525162517251825192520252125222523252425252526252725282529253025312532253325342535253625372538253925402541254225432544254525462547254825492550255125522553255425552556255725582559256025612562256325642565256625672568256925702571257225732574257525762577257825792580258125822583258425852586258725882589259025912592259325942595259625972598259926002601260226032604260526062607260826092610261126122613261426152616261726182619262026212622262326242625262626272628262926302631263226332634263526362637263826392640264126422643264426452646264726482649265026512652265326542655265626572658265926602661266226632664266526662667266826692670267126722673267426752676267726782679268026812682268326842685268626872688268926902691269226932694269526962697269826992700270127022703270427052706270727082709271027112712271327142715271627172718271927202721272227232724272527262727272827292730273127322733273427352736273727382739274027412742274327442745274627472748274927502751275227532754275527562757275827592760276127622763276427652766276727682769277027712772277327742775277627772778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327732783279328032813282328332843285328632873288328932903291329232933294329532963297329832993300330133023303330433053306330733083309331033113312331333143315331633173318331933203321332233233324332533263327332833293330333133323333333433353336333733383339334033413342334333443345334633473348334933503351335233533354335533563357335833593360336133623363336433653366336733683369337033713372337333743375337633773378337933803381338233833384338533863387338833893390339133923393339433953396339733983399340034013402340334043405340634073408340934103411341234133414341534163417341834193420342134223423342434253426342734283429343034313432343334343435343634373438343934403441344234433444344534463447344834493450345134523453345434553456345734583459346034613462346334643465346634673468346934703471347234733474347534763477347834793480348134823483348434853486348734883489349034913492349334943495349634973498349935003501350235033504350535063507350835093510351135123513351435153516351735183519352035213522352335243525352635273528352935303531353235333534353535363537353835393540354135423543354435453546354735483549355035513552355335543555355635573558355935603561356235633564356535663567356835693570357135723573357435753576357735783579358035813582358335843585358635873588358935903591359235933594359535963597359835993600360136023603360436053606360736083609361036113612361336143615361636173618361936203621362236233624362536263627362836293630363136323633363436353636363736383639364036413642364336443645364636473648364936503651365236533654365536563657365836593660366136623663366436653666366736683669367036713672367336743675367636773678367936803681368236833684368536863687368836893690369136923693369436953696369736983699370037013702370337043705370637073708370937103711371237133714371537163717371837193720372137223723372437253726372737283729373037313732373337343735373637373738373937403741374237433744374537463747374837493750375137523753375437553756375737583759376037613762376337643765376637673768376937703771377237733774377537763777377837793780378137823783378437853786378737883789379037913792379337943795379637973798379938003801380238033804380538063807380838093810381138123813381438153816381738183819382038213822382338243825382638273828382938303831383238333834383538363837383838393840384138423843384438453846384738483849385038513852385338543855385638573858385938603861386238633864386538663867386838693870387138723873387438753876387738783879388038813882388338843885388638873888388938903891389238933894389538963897389838993900390139023903390439053906390739083909391039113912391339143915391639173918391939203921392239233924392539263927392839293930393139323933393439353936393739383939394039413942394339443945394639473948394939503951395239533954395539563957395839593960396139623963396439653966396739683969397039713972397339743975397639773978397939803981398239833984398539863987398839893990399139923993399439953996399739983999400040014002400340044005400640074008400940104011401240134014401540164017401840194020402140224023402440254026402740284029403040314032403340344035403640374038403940404041404240434044404540464047404840494050405140524053405440554056405740584059406040614062406340644065406640674068406940704071407240734074407540764077407840794080408140824083408440854086408740884089409040914092409340944095409640974098409941004101410241034104410541064107410841094110411141124113411441154116411741184119412041214122412341244125412641274128412941304131413241334134413541364137413841394140414141424143414441454146414741484149415041514152415341544155415641574158415941604161416241634164416541664167416841694170417141724173417441754176417741784179418041814182418341844185418641874188418941904191419241934194419541964197419841994200420142024203420442054206420742084209421042114212421342144215421642174218421942204221422242234224422542264227422842294230423142324233423442354236423742384239424042414242424342444245424642474248424942504251425242534254425542564257425842594260426142624263426442654266426742684269427042714272427342744275427642774278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477747784779478047814782478347844785478647874788478947904791479247934794479547964797479847994800480148024803480448054806480748084809481048114812481348144815481648174818481948204821482248234824482548264827482848294830483148324833483448354836483748384839484048414842484348444845484648474848484948504851485248534854485548564857485848594860486148624863486448654866486748684869487048714872487348744875487648774878487948804881488248834884488548864887488848894890489148924893489448954896489748984899490049014902490349044905490649074908490949104911491249134914491549164917491849194920492149224923492449254926492749284929493049314932493349344935493649374938493949404941494249434944494549464947494849494950495149524953495449554956495749584959496049614962496349644965496649674968496949704971497249734974497549764977497849794980498149824983498449854986498749884989499049914992499349944995499649974998499950005001500250035004500550065007500850095010501150125013501450155016501750185019502050215022502350245025502650275028502950305031503250335034503550365037503850395040504150425043504450455046504750485049505050515052505350545055505650575058505950605061506250635064506550665067506850695070507150725073507450755076507750785079508050815082508350845085508650875088508950905091509250935094509550965097509850995100510151025103510451055106510751085109511051115112511351145115511651175118511951205121512251235124512551265127512851295130513151325133513451355136513751385139514051415142514351445145514651475148514951505151515251535154515551565157515851595160516151625163516451655166516751685169517051715172517351745175517651775178517951805181518251835184518551865187518851895190519151925193519451955196519751985199520052015202520352045205520652075208520952105211521252135214521552165217521852195220522152225223522452255226522752285229523052315232523352345235523652375238523952405241524252435244524552465247524852495250525152525253525452555256525752585259526052615262526352645265526652675268526952705271527252735274527552765277
  1. unit RegExpr;
  2. {
  3. TRegExpr class library
  4. Delphi Regular Expressions
  5. Copyright (c) 1999-2004 Andrey V. Sorokin, St.Petersburg, Russia
  6. You can choose to use this Pascal unit in one of the two following licenses:
  7. Option 1>
  8. You may use this software in any kind of development,
  9. including comercial, redistribute, and modify it freely,
  10. under the following restrictions :
  11. 1. This software is provided as it is, without any kind of
  12. warranty given. Use it at Your own risk.The author is not
  13. responsible for any consequences of use of this software.
  14. 2. The origin of this software may not be mispresented, You
  15. must not claim that You wrote the original software. If
  16. You use this software in any kind of product, it would be
  17. appreciated that there in a information box, or in the
  18. documentation would be an acknowledgement like
  19. Partial Copyright (c) 2004 Andrey V. Sorokin
  20. https://sorokin.engineer/
  21. [email protected]
  22. 3. You may not have any income from distributing this source
  23. (or altered version of it) to other developers. When You
  24. use this product in a comercial package, the source may
  25. not be charged seperatly.
  26. 4. Altered versions must be plainly marked as such, and must
  27. not be misrepresented as being the original software.
  28. 5. RegExp Studio application and all the visual components as
  29. well as documentation is not part of the TRegExpr library
  30. and is not free for usage.
  31. https://sorokin.engineer/
  32. [email protected]
  33. Option 2>
  34. The same modified LGPL with static linking exception as the Free Pascal RTL
  35. }
  36. interface
  37. { off $DEFINE DebugSynRegExpr }
  38. {$MODE DELPHI} // Delphi-compatible mode in FreePascal
  39. {$INLINE ON}
  40. // ======== Define base compiler options
  41. {$BOOLEVAL OFF}
  42. {$EXTENDEDSYNTAX ON}
  43. {$LONGSTRINGS ON}
  44. {$OPTIMIZATION ON}
  45. // ======== Define options for TRegExpr engine
  46. {$DEFINE UseFirstCharSet} // Enable optimization, which finds possible first chars of input string
  47. {$DEFINE RegExpPCodeDump} // Enable method Dump() to show opcode as string
  48. {$DEFINE ComplexBraces} // Support braces in complex cases
  49. {$IFNDEF UniCode}
  50. {$UNDEF UnicodeWordDetection}
  51. {$ELSE}
  52. {$DEFINE UnicodeWordDetection}
  53. {$ENDIF}
  54. uses
  55. Math, // Min
  56. Classes, // TStrings in Split method
  57. SysUtils; // Exception
  58. type
  59. {$IFDEF UniCode}
  60. PRegExprChar = PWideChar;
  61. RegExprString = UnicodeString;
  62. REChar = WideChar;
  63. {$ELSE}
  64. PRegExprChar = PChar;
  65. RegExprString = AnsiString; // ###0.952 was string
  66. REChar = Char;
  67. {$ENDIF}
  68. TREOp = REChar; // internal p-code type //###0.933
  69. PREOp = ^TREOp;
  70. type
  71. TRegExprInvertCaseFunction = function(const Ch: REChar): REChar of object;
  72. TRegExprCharset = set of byte;
  73. const
  74. // Escape char ('\' in common r.e.) used for escaping metachars (\w, \d etc)
  75. EscChar = '\';
  76. RegExprModifierI: boolean = False; // default value for ModifierI
  77. RegExprModifierR: boolean = True; // default value for ModifierR
  78. RegExprModifierS: boolean = True; // default value for ModifierS
  79. RegExprModifierG: boolean = True; // default value for ModifierG
  80. RegExprModifierM: boolean = False; // default value for ModifierM
  81. RegExprModifierX: boolean = False; // default value for ModifierX
  82. // default value for SpaceChars
  83. RegExprSpaceChars: RegExprString = ' '#$9#$A#$D#$C;
  84. // default value for WordChars
  85. RegExprWordChars: RegExprString = '0123456789'
  86. + 'abcdefghijklmnopqrstuvwxyz'
  87. + 'ABCDEFGHIJKLMNOPQRSTUVWXYZ_';
  88. // default value for LineSeparators
  89. RegExprLineSeparators: RegExprString = #$d#$a#$b#$c
  90. {$IFDEF UniCode}
  91. + #$2028#$2029#$85
  92. {$ENDIF};
  93. // default value for LinePairedSeparator
  94. RegExprLinePairedSeparator: RegExprString = #$d#$a;
  95. { if You need Unix-styled line separators (only \n), then use:
  96. RegExprLineSeparators = #$a;
  97. RegExprLinePairedSeparator = '';
  98. }
  99. // Tab and Unicode category "Space Separator":
  100. // https://www.compart.com/en/unicode/category/Zs
  101. RegExprHorzSeparators: RegExprString = #9#$20#$A0
  102. {$IFDEF UniCode}
  103. + #$1680#$2000#$2001#$2002#$2003#$2004#$2005#$2006#$2007#$2008#$2009#$200A#$202F#$205F#$3000
  104. {$ENDIF};
  105. const
  106. NSUBEXP = 90; // max number of subexpression //###0.929
  107. // Cannot be more than NSUBEXPMAX
  108. // Be carefull - don't use values which overflow CLOSE opcode
  109. // (in this case you'll get compiler error).
  110. // Big NSUBEXP will cause more slow work and more stack required
  111. NSUBEXPMAX = 255; // Max possible value for NSUBEXP. //###0.945
  112. // Don't change it! It's defined by internal TRegExpr design.
  113. {$IFDEF ComplexBraces}
  114. const
  115. LoopStackMax = 10; // max depth of loops stack //###0.925
  116. type
  117. TRegExprLoopStack = array [1 .. LoopStackMax] of integer;
  118. {$ENDIF}
  119. type
  120. TRegExprModifiers = record
  121. I: boolean;
  122. // Case-insensitive.
  123. R: boolean;
  124. // Extended syntax for Russian ranges in [].
  125. // If True, then а-я additionally includes letter 'ё',
  126. // А-Я additionally includes 'Ё', and а-Я includes all Russian letters.
  127. // Turn it off if it interferes with your national alphabet.
  128. S: boolean;
  129. // Dot '.' matches any char, otherwise only [^\n].
  130. G: boolean;
  131. // Greedy. Switching it off switches all operators to non-greedy style,
  132. // so if G=False, then '*' works like '*?', '+' works like '+?' and so on.
  133. M: boolean;
  134. // Treat string as multiple lines. It changes `^' and `$' from
  135. // matching at only the very start/end of the string to the start/end
  136. // of any line anywhere within the string.
  137. X: boolean;
  138. // Allow comments in regex using # char.
  139. end;
  140. function IsModifiersEqual(const A, B: TRegExprModifiers): boolean;
  141. type
  142. TRegExpr = class;
  143. TRegExprReplaceFunction = function(ARegExpr: TRegExpr): RegExprString of object;
  144. TRegExprCharChecker = function(ch: REChar): boolean of object;
  145. TRegExprCharCheckerArray = array[0 .. 30] of TRegExprCharChecker;
  146. TRegExprCharCheckerInfo = record
  147. CharBegin, CharEnd: REChar;
  148. CheckerIndex: integer;
  149. end;
  150. TRegExprCharCheckerInfos = array of TRegExprCharCheckerInfo;
  151. { TRegExpr }
  152. TRegExpr = class
  153. private
  154. startp: array [0 .. NSUBEXP - 1] of PRegExprChar; // found expr start points
  155. endp: array [0 .. NSUBEXP - 1] of PRegExprChar; // found expr end points
  156. GrpIndexes: array [0 .. NSUBEXP - 1] of integer;
  157. GrpCount: integer;
  158. {$IFDEF ComplexBraces}
  159. LoopStack: TRegExprLoopStack; // state before entering loop
  160. LoopStackIdx: integer; // 0 - out of all loops
  161. {$ENDIF}
  162. // The "internal use only" fields to pass info from compile
  163. // to execute that permits the execute phase to run lots faster on
  164. // simple cases.
  165. reganchored: REChar; // is the match anchored (at beginning-of-line only)?
  166. regmust: PRegExprChar; // string (pointer into program) that match must include, or nil
  167. regmustlen: integer; // length of regmust string
  168. regmustString: RegExprString;
  169. // reganchored permits very fast decisions on suitable starting points
  170. // for a match, cutting down the work a lot. Regmust permits fast rejection
  171. // of lines that cannot possibly match. The regmust tests are costly enough
  172. // that regcomp() supplies a regmust only if the r.e. contains something
  173. // potentially expensive (at present, the only such thing detected is * or +
  174. // at the start of the r.e., which can involve a lot of backup). regmustlen is
  175. // supplied because the test in regexec() needs it and regcomp() is computing
  176. // it anyway.
  177. {$IFDEF UseFirstCharSet}
  178. FirstCharSet: TRegExprCharset;
  179. FirstCharArray: array[byte] of boolean;
  180. {$ENDIF}
  181. // work variables for Exec routines - save stack in recursion
  182. reginput: PRegExprChar; // String-input pointer.
  183. fInputStart: PRegExprChar; // Pointer to first char of input string.
  184. fInputEnd: PRegExprChar; // Pointer to char AFTER last char of input string
  185. fRegexStart: PRegExprChar;
  186. fRegexEnd: PRegExprChar;
  187. // work variables for compiler's routines
  188. regparse: PRegExprChar; // Input-scan pointer.
  189. regnpar: integer; // Count of () brackets.
  190. regdummy: REChar;
  191. regcode: PRegExprChar; // Code-emit pointer; @regdummy = don't.
  192. regsize: integer; // Total programm size in REChars.
  193. regExactlyLen: PLongInt;
  194. regexpBegin: PRegExprChar; // only for error handling. Contains pointer to beginning of r.e. while compiling
  195. regexpIsCompiled: boolean; // true if r.e. successfully compiled
  196. fSecondPass: boolean;
  197. // programm is essentially a linear encoding
  198. // of a nondeterministic finite-state machine (aka syntax charts or
  199. // "railroad normal form" in parsing technology). Each node is an opcode
  200. // plus a "next" pointer, possibly plus an operand. "Next" pointers of
  201. // all nodes except BRANCH implement concatenation; a "next" pointer with
  202. // a BRANCH on both ends of it connects two alternatives. (Here we
  203. // have one of the subtle syntax dependencies: an individual BRANCH (as
  204. // opposed to a collection of them) is never concatenated with anything
  205. // because of operator precedence.) The operand of some types of node is
  206. // a literal string; for others, it is a node leading into a sub-FSM. In
  207. // particular, the operand of a BRANCH node is the first node of the branch.
  208. // (NB this is *not* a tree structure: the tail of the branch connects
  209. // to the thing following the set of BRANCHes.) The opcodes are:
  210. programm: PRegExprChar; // Unwarranted chumminess with compiler.
  211. fExpression: RegExprString; // source of compiled r.e.
  212. fInputString: RegExprString; // input string
  213. fLastError: integer; // see Error, LastError
  214. fLastErrorOpcode: TREOp;
  215. fModifiers: TRegExprModifiers; // modifiers
  216. fCompModifiers: TRegExprModifiers; // compiler's copy of modifiers
  217. fProgModifiers: TRegExprModifiers; // modifiers values from last programm compilation
  218. fSpaceChars: RegExprString;
  219. fWordChars: RegExprString;
  220. fInvertCase: TRegExprInvertCaseFunction;
  221. fLineSeparators: RegExprString;
  222. fLinePairedSeparatorAssigned: boolean;
  223. fLinePairedSeparatorHead, fLinePairedSeparatorTail: REChar;
  224. FReplaceLineEnd: RegExprString; // string to use for "\n" in Substitute method
  225. FUseOsLineEndOnReplace: boolean; // use OS LineBreak chars (LF or CRLF) for FReplaceLineEnd
  226. fSlowChecksSizeMax: integer;
  227. // use ASlowChecks=True in Exec() only when Length(InputString)<SlowChecksSizeMax
  228. // ASlowChecks enables to use regmustString optimization
  229. {$IFNDEF UniCode}
  230. fLineSepArray: array[byte] of boolean;
  231. {$ENDIF}
  232. {$IFDEF UnicodeWordDetection}
  233. FUseUnicodeWordDetection: boolean;
  234. {$ENDIF}
  235. CharCheckers: TRegExprCharCheckerArray;
  236. CharCheckerInfos: TRegExprCharCheckerInfos;
  237. CheckerIndex_Word: byte;
  238. CheckerIndex_NotWord: byte;
  239. CheckerIndex_Digit: byte;
  240. CheckerIndex_NotDigit: byte;
  241. CheckerIndex_Space: byte;
  242. CheckerIndex_NotSpace: byte;
  243. CheckerIndex_HorzSep: byte;
  244. CheckerIndex_NotHorzSep: byte;
  245. CheckerIndex_VertSep: byte;
  246. CheckerIndex_NotVertSep: byte;
  247. CheckerIndex_LowerAZ: byte;
  248. CheckerIndex_UpperAZ: byte;
  249. procedure InitCharCheckers;
  250. function CharChecker_Word(ch: REChar): boolean;
  251. function CharChecker_NotWord(ch: REChar): boolean;
  252. function CharChecker_Space(ch: REChar): boolean;
  253. function CharChecker_NotSpace(ch: REChar): boolean;
  254. function CharChecker_Digit(ch: REChar): boolean;
  255. function CharChecker_NotDigit(ch: REChar): boolean;
  256. function CharChecker_HorzSep(ch: REChar): boolean;
  257. function CharChecker_NotHorzSep(ch: REChar): boolean;
  258. function CharChecker_VertSep(ch: REChar): boolean;
  259. function CharChecker_NotVertSep(ch: REChar): boolean;
  260. function CharChecker_LowerAZ(ch: REChar): boolean;
  261. function CharChecker_UpperAZ(ch: REChar): boolean;
  262. procedure ClearMatches; {$IFDEF InlineFuncs}inline;{$ENDIF}
  263. procedure ClearInternalIndexes; {$IFDEF InlineFuncs}inline;{$ENDIF}
  264. function FindInCharClass(ABuffer: PRegExprChar; AChar: REChar; AIgnoreCase: boolean): boolean;
  265. procedure GetCharSetFromCharClass(ABuffer: PRegExprChar; AIgnoreCase: boolean; var ARes: TRegExprCharset);
  266. procedure GetCharSetFromSpaceChars(var ARes: TRegExprCharset);
  267. procedure GetCharSetFromWordChars(var ARes: TRegExprCharSet);
  268. function IsWordChar(AChar: REChar): boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
  269. function IsSpaceChar(AChar: REChar): boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
  270. function IsCustomLineSeparator(AChar: REChar): boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
  271. procedure InitLineSepArray;
  272. // Mark programm as having to be [re]compiled
  273. procedure InvalidateProgramm;
  274. // Check if we can use precompiled r.e. or
  275. // [re]compile it if something changed
  276. function IsProgrammOk: boolean; // ###0.941
  277. procedure SetExpression(const AStr: RegExprString);
  278. function GetModifierStr: RegExprString;
  279. procedure SetModifierStr(const AStr: RegExprString);
  280. function GetModifierG: boolean;
  281. function GetModifierI: boolean;
  282. function GetModifierM: boolean;
  283. function GetModifierR: boolean;
  284. function GetModifierS: boolean;
  285. function GetModifierX: boolean;
  286. procedure SetModifierG(AValue: boolean);
  287. procedure SetModifierI(AValue: boolean);
  288. procedure SetModifierM(AValue: boolean);
  289. procedure SetModifierR(AValue: boolean);
  290. procedure SetModifierS(AValue: boolean);
  291. procedure SetModifierX(AValue: boolean);
  292. // Default handler raises exception ERegExpr with
  293. // Message = ErrorMsg (AErrorID), ErrorCode = AErrorID
  294. // and CompilerErrorPos = value of property CompilerErrorPos.
  295. procedure Error(AErrorID: integer); virtual; // error handler.
  296. { ==================== Compiler section =================== }
  297. // compile a regular expression into internal code
  298. function CompileRegExpr(ARegExp: PRegExprChar): boolean;
  299. procedure SetUseOsLineEndOnReplace(AValue: boolean);
  300. // set the next-pointer at the end of a node chain
  301. procedure Tail(p: PRegExprChar; val: PRegExprChar);
  302. // regoptail - regtail on operand of first argument; nop if operandless
  303. procedure OpTail(p: PRegExprChar; val: PRegExprChar);
  304. // regnode - emit a node, return location
  305. function EmitNode(op: TREOp): PRegExprChar;
  306. // emit (if appropriate) a byte of code
  307. procedure EmitC(ch: REChar);
  308. // emit LongInt value
  309. procedure EmitInt(AValue: LongInt);
  310. // insert an operator in front of already-emitted operand
  311. // Means relocating the operand.
  312. procedure InsertOperator(op: TREOp; opnd: PRegExprChar; sz: integer);
  313. // ###0.90
  314. // regular expression, i.e. main body or parenthesized thing
  315. function ParseReg(paren: integer; var flagp: integer): PRegExprChar;
  316. // one alternative of an | operator
  317. function ParseBranch(var flagp: integer): PRegExprChar;
  318. // something followed by possible [*+?]
  319. function ParsePiece(var flagp: integer): PRegExprChar;
  320. function HexDig(Ch: REChar): integer;
  321. function UnQuoteChar(var APtr: PRegExprChar): REChar;
  322. // the lowest level
  323. function ParseAtom(var flagp: integer): PRegExprChar;
  324. // current pos in r.e. - for error hanling
  325. function GetCompilerErrorPos: PtrInt;
  326. {$IFDEF UseFirstCharSet} // ###0.929
  327. procedure FillFirstCharSet(prog: PRegExprChar);
  328. {$ENDIF}
  329. { ===================== Matching section =================== }
  330. // repeatedly match something simple, report how many
  331. function regrepeat(p: PRegExprChar; AMax: integer): integer;
  332. // dig the "next" pointer out of a node
  333. function regnext(p: PRegExprChar): PRegExprChar;
  334. // recursively matching routine
  335. function MatchPrim(prog: PRegExprChar): boolean;
  336. // match at specific position only, called from ExecPrim
  337. function MatchAtOnePos(APos: PRegExprChar): boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
  338. // Exec for stored InputString
  339. function ExecPrim(AOffset: integer; ATryOnce, ASlowChecks: boolean): boolean;
  340. {$IFDEF RegExpPCodeDump}
  341. function DumpOp(op: TREOp): RegExprString;
  342. {$ENDIF}
  343. function GetSubExprCount: integer;
  344. function GetMatchPos(Idx: integer): PtrInt;
  345. function GetMatchLen(Idx: integer): PtrInt;
  346. function GetMatch(Idx: integer): RegExprString;
  347. procedure SetInputString(const AInputString: RegExprString);
  348. procedure SetLineSeparators(const AStr: RegExprString);
  349. procedure SetLinePairedSeparator(const AStr: RegExprString);
  350. function GetLinePairedSeparator: RegExprString;
  351. public
  352. constructor Create; overload;
  353. constructor Create(const AExpression: RegExprString); overload;
  354. destructor Destroy; override;
  355. class function VersionMajor: integer;
  356. class function VersionMinor: integer;
  357. // match a programm against a string AInputString
  358. // !!! Exec store AInputString into InputString property
  359. // For Delphi 5 and higher available overloaded versions - first without
  360. // parameter (uses already assigned to InputString property value)
  361. // and second that has int parameter and is same as ExecPos
  362. function Exec(const AInputString: RegExprString): boolean; overload;
  363. function Exec: boolean; overload;
  364. function Exec(AOffset: integer): boolean; overload;
  365. // find next match:
  366. // ExecNext;
  367. // works the same as
  368. // if MatchLen [0] = 0 then ExecPos (MatchPos [0] + 1)
  369. // else ExecPos (MatchPos [0] + MatchLen [0]);
  370. // but it's more simpler !
  371. // Raises exception if used without preceeding SUCCESSFUL call to
  372. // Exec* (Exec, ExecPos, ExecNext). So You always must use something like
  373. // if Exec (InputString) then repeat { proceed results} until not ExecNext;
  374. function ExecNext: boolean;
  375. // find match for InputString starting from AOffset position
  376. // (AOffset=1 - first char of InputString)
  377. function ExecPos(AOffset: integer = 1): boolean; overload;
  378. function ExecPos(AOffset: integer; ATryOnce: boolean): boolean; overload;
  379. // Returns ATemplate with '$&' or '$0' replaced by whole r.e.
  380. // occurence and '$1'...'$nn' replaced by subexpression with given index.
  381. // Symbol '$' is used instead of '\' (for future extensions
  382. // and for more Perl-compatibility) and accepts more than one digit.
  383. // If you want to place into template raw '$' or '\', use prefix '\'.
  384. // Example: '1\$ is $2\\rub\\' -> '1$ is <Match[2]>\rub\'
  385. // If you want to place any number after '$' you must enclose it
  386. // with curly braces: '${12}'.
  387. // Example: 'a$12bc' -> 'a<Match[12]>bc'
  388. // 'a${1}2bc' -> 'a<Match[1]>2bc'.
  389. function Substitute(const ATemplate: RegExprString): RegExprString;
  390. // Splits AInputStr to list by positions of all r.e. occurencies.
  391. // Internally calls Exec, ExecNext.
  392. procedure Split(const AInputStr: RegExprString; APieces: TStrings);
  393. function Replace(const AInputStr: RegExprString;
  394. const AReplaceStr: RegExprString;
  395. AUseSubstitution: boolean = False) // ###0.946
  396. : RegExprString; overload;
  397. function Replace(const AInputStr: RegExprString;
  398. AReplaceFunc: TRegExprReplaceFunction): RegExprString; overload;
  399. // Returns AInputStr with r.e. occurencies replaced by AReplaceStr.
  400. // If AUseSubstitution is true, then AReplaceStr will be used
  401. // as template for Substitution methods.
  402. // For example:
  403. // Expression := '({-i}block|var)\s*\(\s*([^ ]*)\s*\)\s*';
  404. // Replace ('BLOCK( test1)', 'def "$1" value "$2"', True);
  405. // will return: def 'BLOCK' value 'test1'
  406. // Replace ('BLOCK( test1)', 'def "$1" value "$2"')
  407. // will return: def "$1" value "$2"
  408. // Internally calls Exec, ExecNext.
  409. // Overloaded version and ReplaceEx operate with callback function,
  410. // so you can implement really complex functionality.
  411. function ReplaceEx(const AInputStr: RegExprString;
  412. AReplaceFunc: TRegExprReplaceFunction): RegExprString;
  413. // Returns ID of last error, 0 if no errors (unusable if
  414. // Error method raises exception) and clear internal status
  415. // into 0 (no errors).
  416. function LastError: integer;
  417. // Returns Error message for error with ID = AErrorID.
  418. function ErrorMsg(AErrorID: integer): RegExprString; virtual;
  419. // Converts Ch into upper case if it in lower case or in lower
  420. // if it in upper (uses current system local setings)
  421. class function InvertCaseFunction(const Ch: REChar): REChar;
  422. // [Re]compile r.e. Useful for example for GUI r.e. editors (to check
  423. // all properties validity).
  424. procedure Compile; // ###0.941
  425. {$IFDEF RegExpPCodeDump}
  426. // dump a compiled regexp in vaguely comprehensible form
  427. function Dump: RegExprString;
  428. {$ENDIF}
  429. // Regular expression.
  430. // For optimization, TRegExpr will automatically compiles it into 'P-code'
  431. // (You can see it with help of Dump method) and stores in internal
  432. // structures. Real [re]compilation occures only when it really needed -
  433. // while calling Exec, ExecNext, Substitute, Dump, etc
  434. // and only if Expression or other P-code affected properties was changed
  435. // after last [re]compilation.
  436. // If any errors while [re]compilation occures, Error method is called
  437. // (by default Error raises exception - see below)
  438. property Expression: RegExprString read fExpression write SetExpression;
  439. // Set/get default values of r.e.syntax modifiers. Modifiers in
  440. // r.e. (?ismx-ismx) will replace this default values.
  441. // If you try to set unsupported modifier, Error will be called
  442. // (by defaul Error raises exception ERegExpr).
  443. property ModifierStr: RegExprString read GetModifierStr write SetModifierStr;
  444. property ModifierI: boolean read GetModifierI write SetModifierI;
  445. property ModifierR: boolean read GetModifierR write SetModifierR;
  446. property ModifierS: boolean read GetModifierS write SetModifierS;
  447. property ModifierG: boolean read GetModifierG write SetModifierG;
  448. property ModifierM: boolean read GetModifierM write SetModifierM;
  449. property ModifierX: boolean read GetModifierX write SetModifierX;
  450. // returns current input string (from last Exec call or last assign
  451. // to this property).
  452. // Any assignment to this property clear Match* properties !
  453. property InputString: RegExprString read fInputString write SetInputString;
  454. // Number of subexpressions has been found in last Exec* call.
  455. // If there are no subexpr. but whole expr was found (Exec* returned True),
  456. // then SubExprMatchCount=0, if no subexpressions nor whole
  457. // r.e. found (Exec* returned false) then SubExprMatchCount=-1.
  458. // Note, that some subexpr. may be not found and for such
  459. // subexpr. MathPos=MatchLen=-1 and Match=''.
  460. // For example: Expression := '(1)?2(3)?';
  461. // Exec ('123'): SubExprMatchCount=2, Match[0]='123', [1]='1', [2]='3'
  462. // Exec ('12'): SubExprMatchCount=1, Match[0]='12', [1]='1'
  463. // Exec ('23'): SubExprMatchCount=2, Match[0]='23', [1]='', [2]='3'
  464. // Exec ('2'): SubExprMatchCount=0, Match[0]='2'
  465. // Exec ('7') - return False: SubExprMatchCount=-1
  466. property SubExprMatchCount: integer read GetSubExprCount;
  467. // pos of entrance subexpr. #Idx into tested in last Exec*
  468. // string. First subexpr. has Idx=1, last - MatchCount,
  469. // whole r.e. has Idx=0.
  470. // Returns -1 if in r.e. no such subexpr. or this subexpr.
  471. // not found in input string.
  472. property MatchPos[Idx: integer]: PtrInt read GetMatchPos;
  473. // len of entrance subexpr. #Idx r.e. into tested in last Exec*
  474. // string. First subexpr. has Idx=1, last - MatchCount,
  475. // whole r.e. has Idx=0.
  476. // Returns -1 if in r.e. no such subexpr. or this subexpr.
  477. // not found in input string.
  478. // Remember - MatchLen may be 0 (if r.e. match empty string) !
  479. property MatchLen[Idx: integer]: PtrInt read GetMatchLen;
  480. // == copy (InputString, MatchPos [Idx], MatchLen [Idx])
  481. // Returns '' if in r.e. no such subexpr. or this subexpr.
  482. // not found in input string.
  483. property Match[Idx: integer]: RegExprString read GetMatch;
  484. // Returns position in r.e. where compiler stopped.
  485. // Useful for error diagnostics
  486. property CompilerErrorPos: PtrInt read GetCompilerErrorPos;
  487. // Contains chars, treated as /s (initially filled with RegExprSpaceChars
  488. // global constant)
  489. property SpaceChars: RegExprString read fSpaceChars write fSpaceChars;
  490. // ###0.927
  491. // Contains chars, treated as /w (initially filled with RegExprWordChars
  492. // global constant)
  493. property WordChars: RegExprString read fWordChars write fWordChars;
  494. // ###0.929
  495. {$IFDEF UnicodeWordDetection}
  496. // If set to true, in addition to using WordChars, a heuristic to detect unicode word letters is used for \w
  497. property UseUnicodeWordDetection: boolean read FUseUnicodeWordDetection write FUseUnicodeWordDetection;
  498. {$ENDIF}
  499. // line separators (like \n in Unix)
  500. property LineSeparators: RegExprString read fLineSeparators write SetLineSeparators; // ###0.941
  501. // paired line separator (like \r\n in DOS and Windows).
  502. // must contain exactly two chars or no chars at all
  503. property LinePairedSeparator: RegExprString read GetLinePairedSeparator write SetLinePairedSeparator; // ###0.941
  504. // Set this property if you want to override case-insensitive functionality.
  505. // Create set it to RegExprInvertCaseFunction (InvertCaseFunction by default)
  506. property InvertCase: TRegExprInvertCaseFunction read fInvertCase write fInvertCase; // ##0.935
  507. // Use OS line end on replace or not. Default is True for backwards compatibility.
  508. // Set to false to use #10.
  509. property UseOsLineEndOnReplace: boolean read FUseOsLineEndOnReplace write SetUseOsLineEndOnReplace;
  510. property SlowChecksSizeMax: integer read fSlowChecksSizeMax write fSlowChecksSizeMax;
  511. end;
  512. type
  513. ERegExpr = class(Exception)
  514. public
  515. ErrorCode: integer;
  516. CompilerErrorPos: PtrInt;
  517. end;
  518. const
  519. RegExprInvertCaseFunction: TRegExprInvertCaseFunction = nil;
  520. // true if string AInputString match regular expression ARegExpr
  521. // ! will raise exeption if syntax errors in ARegExpr
  522. function ExecRegExpr(const ARegExpr, AInputStr: RegExprString): boolean;
  523. // Split AInputStr into APieces by r.e. ARegExpr occurencies
  524. procedure SplitRegExpr(const ARegExpr, AInputStr: RegExprString;
  525. APieces: TStrings);
  526. // Returns AInputStr with r.e. occurencies replaced by AReplaceStr
  527. // If AUseSubstitution is true, then AReplaceStr will be used
  528. // as template for Substitution methods.
  529. // For example:
  530. // ReplaceRegExpr ('({-i}block|var)\s*\(\s*([^ ]*)\s*\)\s*',
  531. // 'BLOCK( test1)', 'def "$1" value "$2"', True)
  532. // will return: def 'BLOCK' value 'test1'
  533. // ReplaceRegExpr ('({-i}block|var)\s*\(\s*([^ ]*)\s*\)\s*',
  534. // 'BLOCK( test1)', 'def "$1" value "$2"')
  535. // will return: def "$1" value "$2"
  536. function ReplaceRegExpr(const ARegExpr, AInputStr, AReplaceStr: RegExprString;
  537. AUseSubstitution: boolean = False): RegExprString; overload; // ###0.947
  538. // Alternate form allowing to set more parameters.
  539. type
  540. TRegexReplaceOption = (
  541. rroModifierI,
  542. rroModifierR,
  543. rroModifierS,
  544. rroModifierG,
  545. rroModifierM,
  546. rroModifierX,
  547. rroUseSubstitution,
  548. rroUseOsLineEnd
  549. );
  550. TRegexReplaceOptions = set of TRegexReplaceOption;
  551. function ReplaceRegExpr(const ARegExpr, AInputStr, AReplaceStr: RegExprString;
  552. Options: TRegexReplaceOptions): RegExprString; overload;
  553. // Replace all metachars with its safe representation,
  554. // for example 'abc$cd.(' converts into 'abc\$cd\.\('
  555. // This function useful for r.e. autogeneration from
  556. // user input
  557. function QuoteRegExprMetaChars(const AStr: RegExprString): RegExprString;
  558. // Makes list of subexpressions found in ARegExpr r.e.
  559. // In ASubExps every item represent subexpression,
  560. // from first to last, in format:
  561. // String - subexpression text (without '()')
  562. // low word of Object - starting position in ARegExpr, including '('
  563. // if exists! (first position is 1)
  564. // high word of Object - length, including starting '(' and ending ')'
  565. // if exist!
  566. // AExtendedSyntax - must be True if modifier /m will be On while
  567. // using the r.e.
  568. // Useful for GUI editors of r.e. etc (You can find example of using
  569. // in TestRExp.dpr project)
  570. // Returns
  571. // 0 Success. No unbalanced brackets was found;
  572. // -1 There are not enough closing brackets ')';
  573. // -(n+1) At position n was found opening '[' without //###0.942
  574. // corresponding closing ']';
  575. // n At position n was found closing bracket ')' without
  576. // corresponding opening '('.
  577. // If Result <> 0, then ASubExpr can contain empty items or illegal ones
  578. function RegExprSubExpressions(const ARegExpr: string; ASubExprs: TStrings;
  579. AExtendedSyntax: boolean= False): integer;
  580. implementation
  581. {$IFDEF UnicodeWordDetection}
  582. uses
  583. UnicodeData;
  584. {$ENDIF}
  585. const
  586. // TRegExpr.VersionMajor/Minor return values of these constants:
  587. REVersionMajor = 0;
  588. REVersionMinor = 987;
  589. OpKind_End = REChar(1);
  590. OpKind_MetaClass = REChar(2);
  591. OpKind_Range = REChar(3);
  592. OpKind_Char = REChar(4);
  593. RegExprAllSet = [0 .. 255];
  594. RegExprDigitSet = [Ord('0') .. Ord('9')];
  595. RegExprLowerAzSet = [Ord('a') .. Ord('z')];
  596. RegExprUpperAzSet = [Ord('A') .. Ord('Z')];
  597. RegExprAllAzSet = RegExprLowerAzSet + RegExprUpperAzSet;
  598. RegExprLineSeparatorsSet = [$d, $a, $b, $c] {$IFDEF UniCode} + [$85] {$ENDIF};
  599. RegExprHorzSeparatorsSet = [9, $20, $A0];
  600. MaxBracesArg = $7FFFFFFF - 1; // max value for {n,m} arguments //###0.933
  601. type
  602. TRENextOff = PtrInt;
  603. // internal Next "pointer" (offset to current p-code) //###0.933
  604. PRENextOff = ^TRENextOff;
  605. // used for extracting Next "pointers" from compiled r.e. //###0.933
  606. TREBracesArg = integer; // type of {m,n} arguments
  607. PREBracesArg = ^TREBracesArg;
  608. const
  609. REOpSz = SizeOf(TREOp) div SizeOf(REChar);
  610. // size of OP_ command in REChars
  611. {$IFDEF FPC_REQUIRES_PROPER_ALIGNMENT}
  612. // add space for aligning pointer
  613. // -1 is the correct max size but also needed for InsertOperator that needs a multiple of pointer size
  614. RENextOffSz = (2 * SizeOf(TRENextOff) div SizeOf(REChar)) - 1;
  615. REBracesArgSz = (2 * SizeOf(TREBracesArg) div SizeOf(REChar));
  616. // add space for aligning pointer
  617. {$ELSE}
  618. RENextOffSz = (SizeOf(TRENextOff) div SizeOf(REChar));
  619. // size of Next pointer in REChars
  620. REBracesArgSz = SizeOf(TREBracesArg) div SizeOf(REChar);
  621. // size of BRACES arguments in REChars
  622. {$ENDIF}
  623. RENumberSz = SizeOf(LongInt) div SizeOf(REChar);
  624. function _FindCharInBuffer(SBegin, SEnd: PRegExprChar; Ch: REChar): PRegExprChar; {$IFDEF InlineFuncs}inline;{$ENDIF}
  625. begin
  626. while SBegin < SEnd do
  627. begin
  628. if SBegin^ = Ch then
  629. begin
  630. Result := SBegin;
  631. Exit;
  632. end;
  633. Inc(SBegin);
  634. end;
  635. Result := nil;
  636. end;
  637. function IsIgnoredChar(AChar: REChar): boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
  638. begin
  639. case AChar of
  640. ' ', #9, #$d, #$a:
  641. Result := True
  642. else
  643. Result := False;
  644. end;
  645. end;
  646. function _IsMetaChar(AChar: REChar): boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
  647. begin
  648. case AChar of
  649. 'd', 'D',
  650. 's', 'S',
  651. 'w', 'W',
  652. 'v', 'V',
  653. 'h', 'H':
  654. Result := True
  655. else
  656. Result := False;
  657. end;
  658. end;
  659. function AlignToPtr(const p: Pointer): Pointer; {$IFDEF InlineFuncs}inline;{$ENDIF}
  660. begin
  661. {$IFDEF FPC_REQUIRES_PROPER_ALIGNMENT}
  662. Result := Align(p, SizeOf(Pointer));
  663. {$ELSE}
  664. Result := p;
  665. {$ENDIF}
  666. end;
  667. function AlignToInt(const p: Pointer): Pointer; {$IFDEF InlineFuncs}inline;{$ENDIF}
  668. begin
  669. {$IFDEF FPC_REQUIRES_PROPER_ALIGNMENT}
  670. Result := Align(p, SizeOf(integer));
  671. {$ELSE}
  672. Result := p;
  673. {$ENDIF}
  674. end;
  675. function _UpperCase(Ch: REChar): REChar;
  676. begin
  677. Result := Ch;
  678. if (Ch >= 'a') and (Ch <= 'z') then
  679. begin
  680. Dec(Result, 32);
  681. Exit;
  682. end;
  683. if Ord(Ch) < 128 then
  684. Exit;
  685. {$IFDEF FPC}
  686. {$IFDEF UniCode}
  687. Result := UnicodeUpperCase(Ch)[1];
  688. {$ELSE}
  689. Result := AnsiUpperCase(Ch)[1];
  690. {$ENDIF}
  691. {$ELSE}
  692. {$IFDEF UniCode}
  693. {$IFDEF D2009}
  694. Result := TCharacter.ToUpper(Ch);
  695. {$ENDIF}
  696. {$ELSE}
  697. Result := AnsiUpperCase(Ch)[1];
  698. {$ENDIF}
  699. {$ENDIF}
  700. end;
  701. function _LowerCase(Ch: REChar): REChar;
  702. begin
  703. Result := Ch;
  704. if (Ch >= 'A') and (Ch <= 'Z') then
  705. begin
  706. Inc(Result, 32);
  707. Exit;
  708. end;
  709. if Ord(Ch) < 128 then
  710. Exit;
  711. {$IFDEF FPC}
  712. {$IFDEF UniCode}
  713. Result := UnicodeLowerCase(Ch)[1];
  714. {$ELSE}
  715. Result := AnsiLowerCase(Ch)[1];
  716. {$ENDIF}
  717. {$ELSE}
  718. {$IFDEF UniCode}
  719. {$IFDEF D2009}
  720. Result := TCharacter.ToLower(Ch);
  721. {$ENDIF}
  722. {$ELSE}
  723. Result := AnsiLowerCase(Ch)[1];
  724. {$ENDIF}
  725. {$ENDIF}
  726. end;
  727. { ============================================================= }
  728. { ===================== Global functions ====================== }
  729. { ============================================================= }
  730. function IsModifiersEqual(const A, B: TRegExprModifiers): boolean;
  731. begin
  732. Result :=
  733. (A.I = B.I) and
  734. (A.G = B.G) and
  735. (A.M = B.M) and
  736. (A.S = B.S) and
  737. (A.R = B.R) and
  738. (A.X = B.X);
  739. end;
  740. function ParseModifiers(const APtr: PRegExprChar;
  741. ALen: integer;
  742. var AValue: TRegExprModifiers): boolean;
  743. // Parse string and set AValue if it's in format 'ismxrg-ismxrg'
  744. var
  745. IsOn: boolean;
  746. i: integer;
  747. begin
  748. Result := True;
  749. IsOn := True;
  750. for i := 0 to ALen-1 do
  751. case APtr[i] of
  752. '-':
  753. IsOn := False;
  754. 'I', 'i':
  755. AValue.I := IsOn;
  756. 'R', 'r':
  757. AValue.R := IsOn;
  758. 'S', 's':
  759. AValue.S := IsOn;
  760. 'G', 'g':
  761. AValue.G := IsOn;
  762. 'M', 'm':
  763. AValue.M := IsOn;
  764. 'X', 'x':
  765. AValue.X := IsOn;
  766. else
  767. begin
  768. Result := False;
  769. Exit;
  770. end;
  771. end;
  772. end;
  773. function ExecRegExpr(const ARegExpr, AInputStr: RegExprString): boolean;
  774. var
  775. r: TRegExpr;
  776. begin
  777. r := TRegExpr.Create;
  778. try
  779. r.Expression := ARegExpr;
  780. Result := r.Exec(AInputStr);
  781. finally
  782. r.Free;
  783. end;
  784. end; { of function ExecRegExpr
  785. -------------------------------------------------------------- }
  786. procedure SplitRegExpr(const ARegExpr, AInputStr: RegExprString;
  787. APieces: TStrings);
  788. var
  789. r: TRegExpr;
  790. begin
  791. APieces.Clear;
  792. r := TRegExpr.Create;
  793. try
  794. r.Expression := ARegExpr;
  795. r.Split(AInputStr, APieces);
  796. finally
  797. r.Free;
  798. end;
  799. end; { of procedure SplitRegExpr
  800. -------------------------------------------------------------- }
  801. function ReplaceRegExpr(const ARegExpr, AInputStr, AReplaceStr: RegExprString;
  802. AUseSubstitution: boolean= False): RegExprString;
  803. begin
  804. with TRegExpr.Create do
  805. try
  806. Expression := ARegExpr;
  807. Result := Replace(AInputStr, AReplaceStr, AUseSubstitution);
  808. finally
  809. Free;
  810. end;
  811. end; { of function ReplaceRegExpr
  812. -------------------------------------------------------------- }
  813. function ReplaceRegExpr(const ARegExpr, AInputStr, AReplaceStr: RegExprString;
  814. Options: TRegexReplaceOptions): RegExprString; overload;
  815. begin
  816. with TRegExpr.Create do
  817. try
  818. ModifierI := (rroModifierI in Options);
  819. ModifierR := (rroModifierR in Options);
  820. ModifierS := (rroModifierS in Options);
  821. ModifierG := (rroModifierG in Options);
  822. ModifierM := (rroModifierM in Options);
  823. ModifierX := (rroModifierX in Options);
  824. // Set this after the above, if the regex contains modifiers, they will be applied.
  825. Expression := ARegExpr;
  826. UseOsLineEndOnReplace := (rroUseOsLineEnd in Options);
  827. Result := Replace(AInputStr, AReplaceStr, rroUseSubstitution in Options);
  828. finally
  829. Free;
  830. end;
  831. end;
  832. (*
  833. const
  834. MetaChars_Init = '^$.[()|?+*' + EscChar + '{';
  835. MetaChars = MetaChars_Init; // not needed to be a variable, const is faster
  836. MetaAll = MetaChars_Init + ']}'; // Very similar to MetaChars, but slighly changed.
  837. *)
  838. function _IsMetaSymbol1(ch: REChar): boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
  839. begin
  840. case ch of
  841. '^', '$', '.', '[', '(', ')', '|', '?', '+', '*', EscChar, '{':
  842. Result := True
  843. else
  844. Result := False
  845. end;
  846. end;
  847. function _IsMetaSymbol2(ch: REChar): boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
  848. begin
  849. case ch of
  850. '^', '$', '.', '[', '(', ')', '|', '?', '+', '*', EscChar, '{',
  851. ']', '}':
  852. Result := True
  853. else
  854. Result := False
  855. end;
  856. end;
  857. function QuoteRegExprMetaChars(const AStr: RegExprString): RegExprString;
  858. var
  859. i, i0, Len: integer;
  860. ch: REChar;
  861. begin
  862. Result := '';
  863. Len := Length(AStr);
  864. i := 1;
  865. i0 := i;
  866. while i <= Len do
  867. begin
  868. ch := AStr[i];
  869. if _IsMetaSymbol2(ch) then
  870. begin
  871. Result := Result + System.Copy(AStr, i0, i - i0) + EscChar + ch;
  872. i0 := i + 1;
  873. end;
  874. Inc(i);
  875. end;
  876. Result := Result + System.Copy(AStr, i0, MaxInt); // Tail
  877. end; { of function QuoteRegExprMetaChars
  878. -------------------------------------------------------------- }
  879. function RegExprSubExpressions(const ARegExpr: string; ASubExprs: TStrings;
  880. AExtendedSyntax: boolean = False): integer;
  881. type
  882. TStackItemRec = record // ###0.945
  883. SubExprIdx: integer;
  884. StartPos: PtrInt;
  885. end;
  886. TStackArray = packed array [0 .. NSUBEXPMAX - 1] of TStackItemRec;
  887. var
  888. Len, SubExprLen: integer;
  889. i, i0: integer;
  890. Modif: TRegExprModifiers;
  891. Stack: ^TStackArray; // ###0.945
  892. StackIdx, StackSz: integer;
  893. begin
  894. Result := 0; // no unbalanced brackets found at this very moment
  895. Modif:=Default(TRegExprModifiers);
  896. ASubExprs.Clear; // I don't think that adding to non empty list
  897. // can be useful, so I simplified algorithm to work only with empty list
  898. Len := Length(ARegExpr); // some optimization tricks
  899. // first we have to calculate number of subexpression to reserve
  900. // space in Stack array (may be we'll reserve more than needed, but
  901. // it's faster then memory reallocation during parsing)
  902. StackSz := 1; // add 1 for entire r.e.
  903. for i := 1 to Len do
  904. if ARegExpr[i] = '(' then
  905. Inc(StackSz);
  906. // SetLength (Stack, StackSz); //###0.945
  907. GetMem(Stack, SizeOf(TStackItemRec) * StackSz);
  908. try
  909. StackIdx := 0;
  910. i := 1;
  911. while (i <= Len) do
  912. begin
  913. case ARegExpr[i] of
  914. '(':
  915. begin
  916. if (i < Len) and (ARegExpr[i + 1] = '?') then
  917. begin
  918. // this is not subexpression, but comment or other
  919. // Perl extension. We must check is it (?ismxrg-ismxrg)
  920. // and change AExtendedSyntax if /x is changed.
  921. Inc(i, 2); // skip '(?'
  922. i0 := i;
  923. while (i <= Len) and (ARegExpr[i] <> ')') do
  924. Inc(i);
  925. if i > Len then
  926. Result := -1 // unbalansed '('
  927. else
  928. if ParseModifiers(@ARegExpr[i0], i - i0, Modif) then
  929. // Alexey-T: original code had copy from i, not from i0
  930. AExtendedSyntax := Modif.X;
  931. end
  932. else
  933. begin // subexpression starts
  934. ASubExprs.Add(''); // just reserve space
  935. with Stack[StackIdx] do
  936. begin
  937. SubExprIdx := ASubExprs.Count - 1;
  938. StartPos := i;
  939. end;
  940. Inc(StackIdx);
  941. end;
  942. end;
  943. ')':
  944. begin
  945. if StackIdx = 0 then
  946. Result := i // unbalanced ')'
  947. else
  948. begin
  949. Dec(StackIdx);
  950. with Stack[StackIdx] do
  951. begin
  952. SubExprLen := i - StartPos + 1;
  953. ASubExprs.Objects[SubExprIdx] :=
  954. TObject(StartPos or (SubExprLen ShL 16));
  955. ASubExprs[SubExprIdx] := System.Copy(ARegExpr, StartPos + 1,
  956. SubExprLen - 2); // add without brackets
  957. end;
  958. end;
  959. end;
  960. EscChar:
  961. Inc(i); // skip quoted symbol
  962. '[':
  963. begin
  964. // we have to skip character ranges at once, because they can
  965. // contain '#', and '#' in it must NOT be recognized as eXtended
  966. // comment beginning!
  967. i0 := i;
  968. Inc(i);
  969. if ARegExpr[i] = ']' // first ']' inside [] treated as simple char, no need to check '['
  970. then
  971. Inc(i);
  972. while (i <= Len) and (ARegExpr[i] <> ']') do
  973. if ARegExpr[i] = EscChar // ###0.942
  974. then
  975. Inc(i, 2) // skip 'escaped' char to prevent stopping at '\]'
  976. else
  977. Inc(i);
  978. if (i > Len) or (ARegExpr[i] <> ']') // ###0.942
  979. then
  980. Result := -(i0 + 1); // unbalansed '[' //###0.942
  981. end;
  982. '#':
  983. if AExtendedSyntax then
  984. begin
  985. // skip eXtended comments
  986. while (i <= Len) and (ARegExpr[i] <> #$d) and (ARegExpr[i] <> #$a)
  987. // do not use [#$d, #$a] due to UniCode compatibility
  988. do
  989. Inc(i);
  990. while (i + 1 <= Len) and
  991. ((ARegExpr[i + 1] = #$d) or (ARegExpr[i + 1] = #$a)) do
  992. Inc(i); // attempt to work with different kinds of line separators
  993. // now we are at the line separator that must be skipped.
  994. end;
  995. // here is no 'else' clause - we simply skip ordinary chars
  996. end; // of case
  997. Inc(i); // skip scanned char
  998. // ! can move after Len due to skipping quoted symbol
  999. end;
  1000. // check brackets balance
  1001. if StackIdx <> 0 then
  1002. Result := -1; // unbalansed '('
  1003. // check if entire r.e. added
  1004. if (ASubExprs.Count = 0) or ((PtrInt(ASubExprs.Objects[0]) and $FFFF) <> 1)
  1005. or (((PtrInt(ASubExprs.Objects[0]) ShR 16) and $FFFF) <> Len)
  1006. // whole r.e. wasn't added because it isn't bracketed
  1007. // well, we add it now:
  1008. then
  1009. ASubExprs.InsertObject(0, ARegExpr, TObject((Len ShL 16) or 1));
  1010. finally
  1011. FreeMem(Stack);
  1012. end;
  1013. end; { of function RegExprSubExpressions
  1014. -------------------------------------------------------------- }
  1015. const
  1016. OP_MAGIC = TREOp(216); // programm signature
  1017. // name opcode opnd? meaning
  1018. OP_EEND = TREOp(0); // - End of program
  1019. OP_BOL = TREOp(1); // - Match "" at beginning of line
  1020. OP_EOL = TREOp(2); // - Match "" at end of line
  1021. OP_ANY = TREOp(3); // - Match any one character
  1022. OP_ANYOF = TREOp(4); // Str Match any character in string Str
  1023. OP_ANYBUT = TREOp(5); // Str Match any char. not in string Str
  1024. OP_BRANCH = TREOp(6); // Node Match this alternative, or the next
  1025. OP_BACK = TREOp(7); // - Jump backward (Next < 0)
  1026. OP_EXACTLY = TREOp(8); // Str Match string Str
  1027. OP_NOTHING = TREOp(9); // - Match empty string
  1028. OP_STAR = TREOp(10); // Node Match this (simple) thing 0 or more times
  1029. OP_PLUS = TREOp(11); // Node Match this (simple) thing 1 or more times
  1030. OP_ANYDIGIT = TREOp(12); // - Match any digit (equiv [0-9])
  1031. OP_NOTDIGIT = TREOp(13); // - Match not digit (equiv [0-9])
  1032. OP_ANYLETTER = TREOp(14); // - Match any letter from property WordChars
  1033. OP_NOTLETTER = TREOp(15); // - Match not letter from property WordChars
  1034. OP_ANYSPACE = TREOp(16); // - Match any space char (see property SpaceChars)
  1035. OP_NOTSPACE = TREOp(17); // - Match not space char (see property SpaceChars)
  1036. OP_BRACES = TREOp(18);
  1037. // Node,Min,Max Match this (simple) thing from Min to Max times.
  1038. // Min and Max are TREBracesArg
  1039. OP_COMMENT = TREOp(19); // - Comment ;)
  1040. OP_EXACTLYCI = TREOp(20); // Str Match string Str case insensitive
  1041. OP_ANYOFCI = TREOp(21);
  1042. // Str Match any character in string Str, case insensitive
  1043. OP_ANYBUTCI = TREOp(22);
  1044. // Str Match any char. not in string Str, case insensitive
  1045. OP_LOOPENTRY = TREOp(23); // Node Start of loop (Node - LOOP for this loop)
  1046. OP_LOOP = TREOp(24); // Node,Min,Max,LoopEntryJmp - back jump for LOOPENTRY.
  1047. // Min and Max are TREBracesArg
  1048. // Node - next node in sequence,
  1049. // LoopEntryJmp - associated LOOPENTRY node addr
  1050. OP_BSUBEXP = TREOp(28);
  1051. // Idx Match previously matched subexpression #Idx (stored as REChar) //###0.936
  1052. OP_BSUBEXPCI = TREOp(29); // Idx -"- in case-insensitive mode
  1053. // Non-Greedy Style Ops //###0.940
  1054. OP_STARNG = TREOp(30); // Same as OP_START but in non-greedy mode
  1055. OP_PLUSNG = TREOp(31); // Same as OP_PLUS but in non-greedy mode
  1056. OP_BRACESNG = TREOp(32); // Same as OP_BRACES but in non-greedy mode
  1057. OP_LOOPNG = TREOp(33); // Same as OP_LOOP but in non-greedy mode
  1058. // Multiline mode \m
  1059. OP_BOLML = TREOp(34); // - Match "" at beginning of line
  1060. OP_EOLML = TREOp(35); // - Match "" at end of line
  1061. OP_ANYML = TREOp(36); // - Match any one character
  1062. // Word boundary
  1063. OP_BOUND = TREOp(37); // Match "" between words //###0.943
  1064. OP_NOTBOUND = TREOp(38); // Match "" not between words //###0.943
  1065. OP_ANYHORZSEP = TREOp(39); // Any horizontal whitespace \h
  1066. OP_NOTHORZSEP = TREOp(40); // Not horizontal whitespace \H
  1067. OP_ANYVERTSEP = TREOp(41); // Any vertical whitespace \v
  1068. OP_NOTVERTSEP = TREOp(42); // Not vertical whitespace \V
  1069. // !!! Change OP_OPEN value if you add new opcodes !!!
  1070. OP_OPEN = TREOp(43); // - Mark this point in input as start of \n
  1071. // OP_OPEN + 1 is \1, etc.
  1072. OP_CLOSE = TREOp(Ord(OP_OPEN) + NSUBEXP);
  1073. // - Analogous to OP_OPEN.
  1074. // !!! Don't add new OpCodes after CLOSE !!!
  1075. // We work with p-code through pointers, compatible with PRegExprChar.
  1076. // Note: all code components (TRENextOff, TREOp, TREBracesArg, etc)
  1077. // must have lengths that can be divided by SizeOf (REChar) !
  1078. // A node is TREOp of opcode followed Next "pointer" of TRENextOff type.
  1079. // The Next is a offset from the opcode of the node containing it.
  1080. // An operand, if any, simply follows the node. (Note that much of
  1081. // the code generation knows about this implicit relationship!)
  1082. // Using TRENextOff=PtrInt speed up p-code processing.
  1083. // Opcodes description:
  1084. //
  1085. // BRANCH The set of branches constituting a single choice are hooked
  1086. // together with their "next" pointers, since precedence prevents
  1087. // anything being concatenated to any individual branch. The
  1088. // "next" pointer of the last BRANCH in a choice points to the
  1089. // thing following the whole choice. This is also where the
  1090. // final "next" pointer of each individual branch points; each
  1091. // branch starts with the operand node of a BRANCH node.
  1092. // BACK Normal "next" pointers all implicitly point forward; BACK
  1093. // exists to make loop structures possible.
  1094. // STAR,PLUS,BRACES '?', and complex '*' and '+', are implemented as
  1095. // circular BRANCH structures using BACK. Complex '{min,max}'
  1096. // - as pair LOOPENTRY-LOOP (see below). Simple cases (one
  1097. // character per match) are implemented with STAR, PLUS and
  1098. // BRACES for speed and to minimize recursive plunges.
  1099. // LOOPENTRY,LOOP {min,max} are implemented as special pair
  1100. // LOOPENTRY-LOOP. Each LOOPENTRY initialize loopstack for
  1101. // current level.
  1102. // OPEN,CLOSE are numbered at compile time.
  1103. { ============================================================= }
  1104. { ================== Error handling section =================== }
  1105. { ============================================================= }
  1106. const
  1107. reeOk = 0;
  1108. reeCompNullArgument = 100;
  1109. reeCompParseRegTooManyBrackets = 102;
  1110. reeCompParseRegUnmatchedBrackets = 103;
  1111. reeCompParseRegUnmatchedBrackets2 = 104;
  1112. reeCompParseRegJunkOnEnd = 105;
  1113. reePlusStarOperandCouldBeEmpty = 106;
  1114. reeNestedSQP = 107;
  1115. reeBadHexDigit = 108;
  1116. reeInvalidRange = 109;
  1117. reeParseAtomTrailingBackSlash = 110;
  1118. reeNoHexCodeAfterBSlashX = 111;
  1119. reeHexCodeAfterBSlashXTooBig = 112;
  1120. reeUnmatchedSqBrackets = 113;
  1121. reeInternalUrp = 114;
  1122. reeQPSBFollowsNothing = 115;
  1123. reeTrailingBackSlash = 116;
  1124. reeNoLetterAfterBSlashC = 117;
  1125. reeMetaCharAfterMinusInRange = 118;
  1126. reeRarseAtomInternalDisaster = 119;
  1127. reeIncorrectBraces = 121;
  1128. reeBRACESArgTooBig = 122;
  1129. reeUnknownOpcodeInFillFirst = 123;
  1130. reeBracesMinParamGreaterMax = 124;
  1131. reeUnclosedComment = 125;
  1132. reeComplexBracesNotImplemented = 126;
  1133. reeUnrecognizedModifier = 127;
  1134. reeBadLinePairedSeparator = 128;
  1135. // Runtime errors must be >= 1000
  1136. reeRegRepeatCalledInappropriately = 1000;
  1137. reeMatchPrimMemoryCorruption = 1001;
  1138. reeMatchPrimCorruptedPointers = 1002;
  1139. reeNoExpression = 1003;
  1140. reeCorruptedProgram = 1004;
  1141. reeNoInputStringSpecified = 1005;
  1142. reeOffsetMustBePositive = 1006;
  1143. reeExecNextWithoutExec = 1007;
  1144. reeBadOpcodeInCharClass = 1008;
  1145. reeDumpCorruptedOpcode = 1011;
  1146. reeModifierUnsupported = 1013;
  1147. reeLoopStackExceeded = 1014;
  1148. reeLoopWithoutEntry = 1015;
  1149. function TRegExpr.ErrorMsg(AErrorID: integer): RegExprString;
  1150. begin
  1151. case AErrorID of
  1152. reeOk:
  1153. Result := 'No errors';
  1154. reeCompNullArgument:
  1155. Result := 'TRegExpr compile: null argument';
  1156. reeCompParseRegTooManyBrackets:
  1157. Result := 'TRegExpr compile: ParseReg: too many ()';
  1158. reeCompParseRegUnmatchedBrackets:
  1159. Result := 'TRegExpr compile: ParseReg: unmatched ()';
  1160. reeCompParseRegUnmatchedBrackets2:
  1161. Result := 'TRegExpr compile: ParseReg: unmatched ()';
  1162. reeCompParseRegJunkOnEnd:
  1163. Result := 'TRegExpr compile: ParseReg: junk at end';
  1164. reePlusStarOperandCouldBeEmpty:
  1165. Result := 'TRegExpr compile: *+ operand could be empty';
  1166. reeNestedSQP:
  1167. Result := 'TRegExpr compile: nested *?+';
  1168. reeBadHexDigit:
  1169. Result := 'TRegExpr compile: bad hex digit';
  1170. reeInvalidRange:
  1171. Result := 'TRegExpr compile: invalid [] range';
  1172. reeParseAtomTrailingBackSlash:
  1173. Result := 'TRegExpr compile: parse atom trailing \';
  1174. reeNoHexCodeAfterBSlashX:
  1175. Result := 'TRegExpr compile: no hex code after \x';
  1176. reeNoLetterAfterBSlashC:
  1177. Result := 'TRegExpr compile: no letter "A".."Z" after \c';
  1178. reeMetaCharAfterMinusInRange:
  1179. Result := 'TRegExpr compile: metachar after "-" in [] range';
  1180. reeHexCodeAfterBSlashXTooBig:
  1181. Result := 'TRegExpr compile: hex code after \x is too big';
  1182. reeUnmatchedSqBrackets:
  1183. Result := 'TRegExpr compile: unmatched []';
  1184. reeInternalUrp:
  1185. Result := 'TRegExpr compile: internal fail on char "|", ")"';
  1186. reeQPSBFollowsNothing:
  1187. Result := 'TRegExpr compile: ?+*{ follows nothing';
  1188. reeTrailingBackSlash:
  1189. Result := 'TRegExpr compile: trailing \';
  1190. reeRarseAtomInternalDisaster:
  1191. Result := 'TRegExpr compile: RarseAtom internal disaster';
  1192. reeIncorrectBraces:
  1193. Result := 'TRegExpr compile: incorrect {} braces';
  1194. reeBRACESArgTooBig:
  1195. Result := 'TRegExpr compile: braces {} argument too big';
  1196. reeUnknownOpcodeInFillFirst:
  1197. Result := 'TRegExpr compile: unknown opcode in FillFirstCharSet ('+DumpOp(fLastErrorOpcode)+')';
  1198. reeBracesMinParamGreaterMax:
  1199. Result := 'TRegExpr compile: braces {} min param greater then max';
  1200. reeUnclosedComment:
  1201. Result := 'TRegExpr compile: unclosed (?#comment)';
  1202. reeComplexBracesNotImplemented:
  1203. Result := 'TRegExpr compile: if you use braces {} and non-greedy ops *?, +?, ?? for complex cases, enable {$DEFINE ComplexBraces}';
  1204. reeUnrecognizedModifier:
  1205. Result := 'TRegExpr compile: unrecognized modifier';
  1206. reeBadLinePairedSeparator:
  1207. Result := 'TRegExpr compile: LinePairedSeparator must countain two different chars or be empty';
  1208. reeRegRepeatCalledInappropriately:
  1209. Result := 'TRegExpr exec: RegRepeat called inappropriately';
  1210. reeMatchPrimMemoryCorruption:
  1211. Result := 'TRegExpr exec: MatchPrim memory corruption';
  1212. reeMatchPrimCorruptedPointers:
  1213. Result := 'TRegExpr exec: MatchPrim corrupted pointers';
  1214. reeNoExpression:
  1215. Result := 'TRegExpr exec: empty expression';
  1216. reeCorruptedProgram:
  1217. Result := 'TRegExpr exec: corrupted opcode (no magic byte)';
  1218. reeNoInputStringSpecified:
  1219. Result := 'TRegExpr exec: empty input string';
  1220. reeOffsetMustBePositive:
  1221. Result := 'TRegExpr exec: offset must be >0';
  1222. reeExecNextWithoutExec:
  1223. Result := 'TRegExpr exec: ExecNext without Exec(Pos)';
  1224. reeBadOpcodeInCharClass:
  1225. Result := 'TRegExpr exec: invalid opcode in char class';
  1226. reeDumpCorruptedOpcode:
  1227. Result := 'TRegExpr dump: corrupted opcode';
  1228. reeLoopStackExceeded:
  1229. Result := 'TRegExpr exec: loop stack exceeded';
  1230. reeLoopWithoutEntry:
  1231. Result := 'TRegExpr exec: loop without loop entry';
  1232. else
  1233. Result := 'Unknown error';
  1234. end;
  1235. end; { of procedure TRegExpr.Error
  1236. -------------------------------------------------------------- }
  1237. function TRegExpr.LastError: integer;
  1238. begin
  1239. Result := fLastError;
  1240. fLastError := reeOk;
  1241. end; { of function TRegExpr.LastError
  1242. -------------------------------------------------------------- }
  1243. { ============================================================= }
  1244. { ===================== Common section ======================== }
  1245. { ============================================================= }
  1246. class function TRegExpr.VersionMajor: integer;
  1247. begin
  1248. Result := REVersionMajor;
  1249. end;
  1250. class function TRegExpr.VersionMinor: integer;
  1251. begin
  1252. Result := REVersionMinor;
  1253. end;
  1254. constructor TRegExpr.Create;
  1255. begin
  1256. inherited;
  1257. programm := nil;
  1258. fExpression := '';
  1259. fInputString := '';
  1260. regexpBegin := nil;
  1261. regexpIsCompiled := False;
  1262. FillChar(fModifiers, SIzeOf(fModifiers), 0);
  1263. ModifierI := RegExprModifierI;
  1264. ModifierR := RegExprModifierR;
  1265. ModifierS := RegExprModifierS;
  1266. ModifierG := RegExprModifierG;
  1267. ModifierM := RegExprModifierM;
  1268. ModifierX := RegExprModifierX;
  1269. SpaceChars := RegExprSpaceChars; // ###0.927
  1270. WordChars := RegExprWordChars; // ###0.929
  1271. fInvertCase := RegExprInvertCaseFunction; // ###0.927
  1272. fLineSeparators := RegExprLineSeparators; // ###0.941
  1273. LinePairedSeparator := RegExprLinePairedSeparator; // ###0.941
  1274. FUseOsLineEndOnReplace := True;
  1275. FReplaceLineEnd := sLineBreak;
  1276. {$IFDEF UnicodeWordDetection}
  1277. FUseUnicodeWordDetection := True;
  1278. {$ENDIF}
  1279. fSlowChecksSizeMax := 2000;
  1280. InitLineSepArray;
  1281. InitCharCheckers;
  1282. end; { of constructor TRegExpr.Create
  1283. -------------------------------------------------------------- }
  1284. constructor TRegExpr.Create(const AExpression: RegExprString);
  1285. begin
  1286. Create;
  1287. Expression := AExpression;
  1288. end;
  1289. destructor TRegExpr.Destroy;
  1290. begin
  1291. if programm <> nil then
  1292. begin
  1293. FreeMem(programm);
  1294. programm := nil;
  1295. end;
  1296. end; { of destructor TRegExpr.Destroy
  1297. -------------------------------------------------------------- }
  1298. class function TRegExpr.InvertCaseFunction(const Ch: REChar): REChar;
  1299. begin
  1300. Result := Ch;
  1301. if (Ch >= 'a') and (Ch <= 'z') then
  1302. begin
  1303. Dec(Result, 32);
  1304. Exit;
  1305. end;
  1306. if (Ch >= 'A') and (Ch <= 'Z') then
  1307. begin
  1308. Inc(Result, 32);
  1309. Exit;
  1310. end;
  1311. if Ord(Ch) < 128 then
  1312. Exit;
  1313. Result := _UpperCase(Ch);
  1314. if Result = Ch then
  1315. Result := _LowerCase(Ch);
  1316. Result := _UpperCase(Ch);
  1317. if Result = Ch then
  1318. Result := _LowerCase(Ch);
  1319. end; { of function TRegExpr.InvertCaseFunction
  1320. -------------------------------------------------------------- }
  1321. procedure TRegExpr.SetExpression(const AStr: RegExprString);
  1322. begin
  1323. if (AStr <> fExpression) or not regexpIsCompiled then
  1324. begin
  1325. regexpIsCompiled := False;
  1326. fExpression := AStr;
  1327. UniqueString(fExpression);
  1328. fRegexStart := PRegExprChar(fExpression);
  1329. fRegexEnd := fRegexStart + Length(fExpression);
  1330. InvalidateProgramm; // ###0.941
  1331. end;
  1332. end; { of procedure TRegExpr.SetExpression
  1333. -------------------------------------------------------------- }
  1334. function TRegExpr.GetSubExprCount: integer;
  1335. begin
  1336. // if nothing found, we must return -1 per TRegExpr docs
  1337. if startp[0] = nil then
  1338. Result := -1
  1339. else
  1340. Result := GrpCount;
  1341. end;
  1342. function TRegExpr.GetMatchPos(Idx: integer): PtrInt;
  1343. begin
  1344. Idx := GrpIndexes[Idx];
  1345. if (Idx >= 0) and (startp[Idx] <> nil) then
  1346. Result := startp[Idx] - fInputStart + 1
  1347. else
  1348. Result := -1;
  1349. end; { of function TRegExpr.GetMatchPos
  1350. -------------------------------------------------------------- }
  1351. function TRegExpr.GetMatchLen(Idx: integer): PtrInt;
  1352. begin
  1353. Idx := GrpIndexes[Idx];
  1354. if (Idx >= 0) and (startp[Idx] <> nil) then
  1355. Result := endp[Idx] - startp[Idx]
  1356. else
  1357. Result := -1;
  1358. end; { of function TRegExpr.GetMatchLen
  1359. -------------------------------------------------------------- }
  1360. function TRegExpr.GetMatch(Idx: integer): RegExprString;
  1361. begin
  1362. Result := '';
  1363. Idx := GrpIndexes[Idx];
  1364. if (Idx >= 0) and (endp[Idx] > startp[Idx]) then
  1365. SetString(Result, startp[Idx], endp[Idx] - startp[Idx]);
  1366. {
  1367. // then Result := copy (fInputString, MatchPos [Idx], MatchLen [Idx]) //###0.929
  1368. then
  1369. begin
  1370. SetLength(Result, endp[Idx] - startp[Idx]);
  1371. System.Move(startp[Idx]^, Result[1], Length(Result) * SizeOf(REChar));
  1372. end;
  1373. }
  1374. end; { of function TRegExpr.GetMatch
  1375. -------------------------------------------------------------- }
  1376. function TRegExpr.GetModifierStr: RegExprString;
  1377. begin
  1378. Result := '-';
  1379. if ModifierI then
  1380. Result := 'i' + Result
  1381. else
  1382. Result := Result + 'i';
  1383. if ModifierR then
  1384. Result := 'r' + Result
  1385. else
  1386. Result := Result + 'r';
  1387. if ModifierS then
  1388. Result := 's' + Result
  1389. else
  1390. Result := Result + 's';
  1391. if ModifierG then
  1392. Result := 'g' + Result
  1393. else
  1394. Result := Result + 'g';
  1395. if ModifierM then
  1396. Result := 'm' + Result
  1397. else
  1398. Result := Result + 'm';
  1399. if ModifierX then
  1400. Result := 'x' + Result
  1401. else
  1402. Result := Result + 'x';
  1403. if Result[Length(Result)] = '-' // remove '-' if all modifiers are 'On'
  1404. then
  1405. System.Delete(Result, Length(Result), 1);
  1406. end; { of function TRegExpr.GetModifierStr
  1407. -------------------------------------------------------------- }
  1408. procedure TRegExpr.SetModifierG(AValue: boolean);
  1409. begin
  1410. fModifiers.G := AValue;
  1411. end;
  1412. procedure TRegExpr.SetModifierI(AValue: boolean);
  1413. begin
  1414. fModifiers.I := AValue;
  1415. end;
  1416. procedure TRegExpr.SetModifierM(AValue: boolean);
  1417. begin
  1418. fModifiers.M := AValue;
  1419. end;
  1420. procedure TRegExpr.SetModifierR(AValue: boolean);
  1421. begin
  1422. fModifiers.R := AValue;
  1423. end;
  1424. procedure TRegExpr.SetModifierS(AValue: boolean);
  1425. begin
  1426. fModifiers.S := AValue;
  1427. end;
  1428. procedure TRegExpr.SetModifierX(AValue: boolean);
  1429. begin
  1430. fModifiers.X := AValue;
  1431. end;
  1432. procedure TRegExpr.SetModifierStr(const AStr: RegExprString);
  1433. begin
  1434. if not ParseModifiers(PRegExprChar(AStr), Length(AStr), fModifiers) then
  1435. Error(reeModifierUnsupported);
  1436. end; { of procedure TRegExpr.SetModifierStr
  1437. -------------------------------------------------------------- }
  1438. { ============================================================= }
  1439. { ==================== Compiler section ======================= }
  1440. { ============================================================= }
  1441. {$IFDEF UnicodeWordDetection}
  1442. {$IFDEF FPC}
  1443. function IsUnicodeWordChar(AChar: WideChar): boolean; inline;
  1444. var
  1445. NType: byte;
  1446. begin
  1447. if Ord(AChar) >= LOW_SURROGATE_BEGIN then
  1448. Exit(False);
  1449. NType := GetProps(Ord(AChar))^.Category;
  1450. Result := (NType <= UGC_OtherNumber);
  1451. end;
  1452. {$ELSE}
  1453. function IsUnicodeWordChar(AChar: WideChar): boolean; inline;
  1454. begin
  1455. Result := System.Character.IsLetterOrDigit(AChar);
  1456. end;
  1457. {$ENDIF}
  1458. {$ENDIF}
  1459. function TRegExpr.IsWordChar(AChar: REChar): boolean;
  1460. begin
  1461. Result := Pos(AChar, fWordChars) > 0;
  1462. {$IFDEF UnicodeWordDetection}
  1463. if not Result and (Ord(AChar) >= 128) and UseUnicodeWordDetection then
  1464. Result := IsUnicodeWordChar(AChar);
  1465. {$ENDIF}
  1466. end;
  1467. function TRegExpr.IsSpaceChar(AChar: REChar): boolean;
  1468. begin
  1469. Result := Pos(AChar, fSpaceChars) > 0;
  1470. end;
  1471. function TRegExpr.IsCustomLineSeparator(AChar: REChar): boolean;
  1472. begin
  1473. {$IFDEF UniCode}
  1474. Result := Pos(AChar, fLineSeparators) > 0;
  1475. {$ELSE}
  1476. Result := fLineSepArray[byte(AChar)];
  1477. {$ENDIF}
  1478. end;
  1479. function IsDigitChar(AChar: REChar): boolean; inline;
  1480. begin
  1481. case AChar of
  1482. '0' .. '9':
  1483. Result := True;
  1484. else
  1485. Result := False;
  1486. end;
  1487. end;
  1488. function IsHorzSeparator(AChar: REChar): boolean; inline;
  1489. begin
  1490. // Tab and Unicode categoty "Space Separator": https://www.compart.com/en/unicode/category/Zs
  1491. case AChar of
  1492. #9, #$20, #$A0:
  1493. Result := True;
  1494. {$IFDEF UniCode}
  1495. #$1680, #$2000 .. #$200A, #$202F, #$205F, #$3000:
  1496. Result := True;
  1497. {$ENDIF}
  1498. else
  1499. Result := False;
  1500. end;
  1501. end;
  1502. function IsLineSeparator(AChar: REChar): boolean; inline;
  1503. begin
  1504. case AChar of
  1505. #$d, #$a, #$b, #$c:
  1506. Result := True;
  1507. {$IFDEF UniCode}
  1508. #$2028, #$2029, #$85:
  1509. Result := True;
  1510. {$ENDIF}
  1511. else
  1512. Result := False;
  1513. end;
  1514. end;
  1515. procedure TRegExpr.InvalidateProgramm;
  1516. begin
  1517. if programm <> nil then
  1518. begin
  1519. FreeMem(programm);
  1520. programm := nil;
  1521. end;
  1522. end; { of procedure TRegExpr.InvalidateProgramm
  1523. -------------------------------------------------------------- }
  1524. procedure TRegExpr.Compile;
  1525. begin
  1526. if fExpression = '' then
  1527. begin
  1528. Error(reeNoExpression);
  1529. Exit;
  1530. end;
  1531. CompileRegExpr(PRegExprChar(fExpression));
  1532. end; { of procedure TRegExpr.Compile
  1533. -------------------------------------------------------------- }
  1534. procedure TRegExpr.InitLineSepArray;
  1535. {$IFNDEF UniCode}
  1536. var
  1537. i: integer;
  1538. {$ENDIF}
  1539. begin
  1540. {$IFNDEF UniCode}
  1541. FillChar(fLineSepArray, SizeOf(fLineSepArray), 0);
  1542. for i := 1 to Length(fLineSeparators) do
  1543. fLineSepArray[byte(fLineSeparators[i])] := True;
  1544. {$ENDIF}
  1545. end;
  1546. function TRegExpr.IsProgrammOk: boolean;
  1547. begin
  1548. Result := False;
  1549. // check modifiers
  1550. if not IsModifiersEqual(fModifiers, fProgModifiers) // ###0.941
  1551. then
  1552. InvalidateProgramm;
  1553. // [Re]compile if needed
  1554. if programm = nil then
  1555. begin
  1556. Compile; // ###0.941
  1557. // Check [re]compiled programm
  1558. if programm = nil then
  1559. Exit; // error was set/raised by Compile (was reeExecAfterCompErr)
  1560. end;
  1561. if programm[0] <> OP_MAGIC // Program corrupted.
  1562. then
  1563. Error(reeCorruptedProgram)
  1564. else
  1565. Result := True;
  1566. end; { of function TRegExpr.IsProgrammOk
  1567. -------------------------------------------------------------- }
  1568. procedure TRegExpr.Tail(p: PRegExprChar; val: PRegExprChar);
  1569. // set the next-pointer at the end of a node chain
  1570. var
  1571. scan: PRegExprChar;
  1572. temp: PRegExprChar;
  1573. begin
  1574. if p = @regdummy then
  1575. Exit;
  1576. // Find last node.
  1577. scan := p;
  1578. repeat
  1579. temp := regnext(scan);
  1580. if temp = nil then
  1581. Break;
  1582. scan := temp;
  1583. until False;
  1584. // Set Next 'pointer'
  1585. if val < scan then
  1586. PRENextOff(AlignToPtr(scan + REOpSz))^ := -(scan - val) // ###0.948
  1587. // work around PWideChar subtraction bug (Delphi uses
  1588. // shr after subtraction to calculate widechar distance %-( )
  1589. // so, if difference is negative we have .. the "feature" :(
  1590. // I could wrap it in $IFDEF UniCode, but I didn't because
  1591. // "P – Q computes the difference between the address given
  1592. // by P (the higher address) and the address given by Q (the
  1593. // lower address)" - Delphi help quotation.
  1594. else
  1595. PRENextOff(AlignToPtr(scan + REOpSz))^ := val - scan; // ###0.933
  1596. end; { of procedure TRegExpr.Tail
  1597. -------------------------------------------------------------- }
  1598. procedure TRegExpr.OpTail(p: PRegExprChar; val: PRegExprChar);
  1599. // regtail on operand of first argument; nop if operandless
  1600. begin
  1601. // "Operandless" and "op != OP_BRANCH" are synonymous in practice.
  1602. if (p = nil) or (p = @regdummy) or (PREOp(p)^ <> OP_BRANCH) then
  1603. Exit;
  1604. Tail(p + REOpSz + RENextOffSz, val); // ###0.933
  1605. end; { of procedure TRegExpr.OpTail
  1606. -------------------------------------------------------------- }
  1607. function TRegExpr.EmitNode(op: TREOp): PRegExprChar; // ###0.933
  1608. // emit a node, return location
  1609. begin
  1610. Result := regcode;
  1611. if Result <> @regdummy then
  1612. begin
  1613. PREOp(regcode)^ := op;
  1614. Inc(regcode, REOpSz);
  1615. PRENextOff(AlignToPtr(regcode))^ := 0; // Next "pointer" := nil
  1616. Inc(regcode, RENextOffSz);
  1617. if (op = OP_EXACTLY) or (op = OP_EXACTLYCI) then
  1618. regExactlyLen := PLongInt(regcode)
  1619. else
  1620. regExactlyLen := nil;
  1621. {$IFDEF DebugSynRegExpr}
  1622. if regcode - programm > regsize then
  1623. raise Exception.Create('TRegExpr.EmitNode buffer overrun');
  1624. {$ENDIF}
  1625. end
  1626. else
  1627. Inc(regsize, REOpSz + RENextOffSz);
  1628. // compute code size without code generation
  1629. end; { of function TRegExpr.EmitNode
  1630. -------------------------------------------------------------- }
  1631. procedure TRegExpr.EmitC(ch: REChar); {$IFDEF InlineFuncs}inline;{$ENDIF}
  1632. begin
  1633. if regcode <> @regdummy then
  1634. begin
  1635. regcode^ := ch;
  1636. Inc(regcode);
  1637. {$IFDEF DebugSynRegExpr}
  1638. if regcode - programm > regsize then
  1639. raise Exception.Create('TRegExpr.EmitC buffer overrun');
  1640. {$ENDIF}
  1641. end
  1642. else
  1643. Inc(regsize, REOpSz); // Type of p-code pointer always is ^REChar
  1644. end; { of procedure TRegExpr.EmitC
  1645. -------------------------------------------------------------- }
  1646. procedure TRegExpr.EmitInt(AValue: LongInt); {$IFDEF InlineFuncs}inline;{$ENDIF}
  1647. begin
  1648. if regcode <> @regdummy then
  1649. begin
  1650. PLongInt(regcode)^ := AValue;
  1651. Inc(regcode, RENumberSz);
  1652. {$IFDEF DebugSynRegExpr}
  1653. if regcode - programm > regsize then
  1654. raise Exception.Create('TRegExpr.EmitInt buffer overrun');
  1655. {$ENDIF}
  1656. end
  1657. else
  1658. Inc(regsize, RENumberSz);
  1659. end;
  1660. procedure TRegExpr.InsertOperator(op: TREOp; opnd: PRegExprChar; sz: integer);
  1661. // insert an operator in front of already-emitted operand
  1662. // Means relocating the operand.
  1663. var
  1664. src, dst, place: PRegExprChar;
  1665. i: integer;
  1666. begin
  1667. if regcode = @regdummy then
  1668. begin
  1669. Inc(regsize, sz);
  1670. Exit;
  1671. end;
  1672. // move code behind insert position
  1673. src := regcode;
  1674. Inc(regcode, sz);
  1675. {$IFDEF DebugSynRegExpr}
  1676. if regcode - programm > regsize then
  1677. raise Exception.Create('TRegExpr.InsertOperator buffer overrun');
  1678. // if (opnd<regcode) or (opnd-regcode>regsize) then
  1679. // raise Exception.Create('TRegExpr.InsertOperator invalid opnd');
  1680. {$ENDIF}
  1681. dst := regcode;
  1682. while src > opnd do
  1683. begin
  1684. Dec(dst);
  1685. Dec(src);
  1686. dst^ := src^;
  1687. end;
  1688. place := opnd; // Op node, where operand used to be.
  1689. PREOp(place)^ := op;
  1690. Inc(place, REOpSz);
  1691. for i := 1 + REOpSz to sz do
  1692. begin
  1693. place^ := #0;
  1694. Inc(place);
  1695. end;
  1696. end; { of procedure TRegExpr.InsertOperator
  1697. -------------------------------------------------------------- }
  1698. function FindSkippedMetaLen(PStart, PEnd: PRegExprChar): integer; {$IFDEF InlineFuncs}inline;{$ENDIF}
  1699. // find length of initial segment of PStart string consisting
  1700. // entirely of characters not from IsMetaSymbol1.
  1701. begin
  1702. Result := 0;
  1703. while PStart < PEnd do
  1704. begin
  1705. if _IsMetaSymbol1(PStart^) then
  1706. Exit;
  1707. Inc(Result);
  1708. Inc(PStart)
  1709. end;
  1710. end;
  1711. const
  1712. // Flags to be passed up and down.
  1713. flag_HasWidth = 01; // Known never to match nil string.
  1714. flag_Simple = 02; // Simple enough to be OP_STAR/OP_PLUS/OP_BRACES operand.
  1715. flag_SpecStart = 04; // Starts with * or +.
  1716. flag_Worst = 0; // Worst case.
  1717. {$IFDEF UniCode}
  1718. RusRangeLoLow = #$430; // 'а'
  1719. RusRangeLoHigh = #$44F; // 'я'
  1720. RusRangeHiLow = #$410; // 'А'
  1721. RusRangeHiHigh = #$42F; // 'Я'
  1722. {$ELSE}
  1723. RusRangeLoLow = #$E0; // 'а' in cp1251
  1724. RusRangeLoHigh = #$FF; // 'я' in cp1251
  1725. RusRangeHiLow = #$C0; // 'А' in cp1251
  1726. RusRangeHiHigh = #$DF; // 'Я' in cp1251
  1727. {$ENDIF}
  1728. function TRegExpr.FindInCharClass(ABuffer: PRegExprChar; AChar: REChar; AIgnoreCase: boolean): boolean;
  1729. // Buffer contains char pairs: (Kind, Data), where Kind is one of OpKind_ values,
  1730. // and Data depends on Kind
  1731. var
  1732. ch, ch2: REChar;
  1733. N, i: integer;
  1734. begin
  1735. if AIgnoreCase then
  1736. AChar := _UpperCase(AChar);
  1737. repeat
  1738. case ABuffer^ of
  1739. OpKind_End:
  1740. begin
  1741. Result := False;
  1742. Exit;
  1743. end;
  1744. OpKind_Range:
  1745. begin
  1746. Inc(ABuffer);
  1747. ch := ABuffer^;
  1748. Inc(ABuffer);
  1749. ch2 := ABuffer^;
  1750. Inc(ABuffer);
  1751. {
  1752. // if AIgnoreCase, ch, ch2 are upcased in opcode
  1753. if AIgnoreCase then
  1754. begin
  1755. ch := _UpperCase(ch);
  1756. ch2 := _UpperCase(ch2);
  1757. end;
  1758. }
  1759. if (AChar >= ch) and (AChar <= ch2) then
  1760. begin
  1761. Result := True;
  1762. Exit;
  1763. end;
  1764. end;
  1765. OpKind_MetaClass:
  1766. begin
  1767. Inc(ABuffer);
  1768. N := Ord(ABuffer^);
  1769. Inc(ABuffer);
  1770. if CharCheckers[N](AChar) then
  1771. begin
  1772. Result := True;
  1773. Exit
  1774. end;
  1775. end;
  1776. OpKind_Char:
  1777. begin
  1778. Inc(ABuffer);
  1779. N := PLongInt(ABuffer)^;
  1780. Inc(ABuffer, RENumberSz);
  1781. for i := 1 to N do
  1782. begin
  1783. ch := ABuffer^;
  1784. Inc(ABuffer);
  1785. {
  1786. // already upcased in opcode
  1787. if AIgnoreCase then
  1788. ch := _UpperCase(ch);
  1789. }
  1790. if ch = AChar then
  1791. begin
  1792. Result := True;
  1793. Exit;
  1794. end;
  1795. end;
  1796. end;
  1797. else
  1798. Error(reeBadOpcodeInCharClass);
  1799. end;
  1800. until False; // assume that Buffer is ended correctly
  1801. end;
  1802. procedure TRegExpr.GetCharSetFromWordChars(var ARes: TRegExprCharset);
  1803. var
  1804. i: integer;
  1805. ch: REChar;
  1806. begin
  1807. ARes := [];
  1808. for i := 1 to Length(fWordChars) do
  1809. begin
  1810. ch := fWordChars[i];
  1811. {$IFDEF UniCode}
  1812. if Ord(ch) <= $FF then
  1813. {$ENDIF}
  1814. Include(ARes, byte(ch));
  1815. end;
  1816. end;
  1817. procedure TRegExpr.GetCharSetFromSpaceChars(var ARes: TRegExprCharset);
  1818. var
  1819. i: integer;
  1820. ch: REChar;
  1821. begin
  1822. ARes := [];
  1823. for i := 1 to Length(fSpaceChars) do
  1824. begin
  1825. ch := fSpaceChars[i];
  1826. {$IFDEF UniCode}
  1827. if Ord(ch) <= $FF then
  1828. {$ENDIF}
  1829. Include(ARes, byte(ch));
  1830. end;
  1831. end;
  1832. procedure TRegExpr.GetCharSetFromCharClass(ABuffer: PRegExprChar; AIgnoreCase: boolean; var ARes: TRegExprCharset);
  1833. var
  1834. ch, ch2: REChar;
  1835. TempSet: TRegExprCharSet;
  1836. N, i: integer;
  1837. begin
  1838. ARes := [];
  1839. TempSet := [];
  1840. repeat
  1841. case ABuffer^ of
  1842. OpKind_End:
  1843. Exit;
  1844. OpKind_Range:
  1845. begin
  1846. Inc(ABuffer);
  1847. ch := ABuffer^;
  1848. Inc(ABuffer);
  1849. ch2 := ABuffer^;
  1850. Inc(ABuffer);
  1851. for i := Ord(ch) to
  1852. {$IFDEF UniCode} Min(Ord(ch2), $FF) {$ELSE} Ord(ch2) {$ENDIF} do
  1853. begin
  1854. Include(ARes, byte(i));
  1855. if AIgnoreCase then
  1856. Include(ARes, byte(InvertCase(REChar(i))));
  1857. end;
  1858. end;
  1859. OpKind_MetaClass:
  1860. begin
  1861. Inc(ABuffer);
  1862. N := Ord(ABuffer^);
  1863. Inc(ABuffer);
  1864. if N = CheckerIndex_Word then
  1865. begin
  1866. GetCharSetFromWordChars(TempSet);
  1867. ARes := ARes + TempSet;
  1868. end
  1869. else
  1870. if N = CheckerIndex_NotWord then
  1871. begin
  1872. GetCharSetFromWordChars(TempSet);
  1873. ARes := ARes + (RegExprAllSet - TempSet);
  1874. end
  1875. else
  1876. if N = CheckerIndex_Space then
  1877. begin
  1878. GetCharSetFromSpaceChars(TempSet);
  1879. ARes := ARes + TempSet;
  1880. end
  1881. else
  1882. if N = CheckerIndex_NotSpace then
  1883. begin
  1884. GetCharSetFromSpaceChars(TempSet);
  1885. ARes := ARes + (RegExprAllSet - TempSet);
  1886. end
  1887. else
  1888. if N = CheckerIndex_Digit then
  1889. ARes := ARes + RegExprDigitSet
  1890. else
  1891. if N = CheckerIndex_NotDigit then
  1892. ARes := ARes + (RegExprAllSet - RegExprDigitSet)
  1893. else
  1894. if N = CheckerIndex_VertSep then
  1895. ARes := ARes + RegExprLineSeparatorsSet
  1896. else
  1897. if N = CheckerIndex_NotVertSep then
  1898. ARes := ARes + (RegExprAllSet - RegExprLineSeparatorsSet)
  1899. else
  1900. if N = CheckerIndex_HorzSep then
  1901. ARes := ARes + RegExprHorzSeparatorsSet
  1902. else
  1903. if N = CheckerIndex_NotHorzSep then
  1904. ARes := ARes + (RegExprAllSet - RegExprHorzSeparatorsSet)
  1905. else
  1906. if N = CheckerIndex_LowerAZ then
  1907. begin
  1908. if AIgnoreCase then
  1909. ARes := ARes + RegExprAllAzSet
  1910. else
  1911. ARes := ARes + RegExprLowerAzSet;
  1912. end
  1913. else
  1914. if N = CheckerIndex_UpperAZ then
  1915. begin
  1916. if AIgnoreCase then
  1917. ARes := ARes + RegExprAllAzSet
  1918. else
  1919. ARes := ARes + RegExprUpperAzSet;
  1920. end
  1921. else
  1922. Error(reeBadOpcodeInCharClass);
  1923. end;
  1924. OpKind_Char:
  1925. begin
  1926. Inc(ABuffer);
  1927. N := PLongInt(ABuffer)^;
  1928. Inc(ABuffer, RENumberSz);
  1929. for i := 1 to N do
  1930. begin
  1931. ch := ABuffer^;
  1932. Inc(ABuffer);
  1933. {$IFDEF UniCode}
  1934. if Ord(ch) <= $FF then
  1935. {$ENDIF}
  1936. begin
  1937. Include(ARes, byte(ch));
  1938. if AIgnoreCase then
  1939. Include(ARes, byte(InvertCase(ch)));
  1940. end;
  1941. end;
  1942. end;
  1943. else
  1944. Error(reeBadOpcodeInCharClass);
  1945. end;
  1946. until False; // assume that Buffer is ended correctly
  1947. end;
  1948. function TRegExpr.GetModifierG: boolean;
  1949. begin
  1950. Result := fModifiers.G;
  1951. end;
  1952. function TRegExpr.GetModifierI: boolean;
  1953. begin
  1954. Result := fModifiers.I;
  1955. end;
  1956. function TRegExpr.GetModifierM: boolean;
  1957. begin
  1958. Result := fModifiers.M;
  1959. end;
  1960. function TRegExpr.GetModifierR: boolean;
  1961. begin
  1962. Result := fModifiers.R;
  1963. end;
  1964. function TRegExpr.GetModifierS: boolean;
  1965. begin
  1966. Result := fModifiers.S;
  1967. end;
  1968. function TRegExpr.GetModifierX: boolean;
  1969. begin
  1970. Result := fModifiers.X;
  1971. end;
  1972. function TRegExpr.CompileRegExpr(ARegExp: PRegExprChar): boolean;
  1973. // Compile a regular expression into internal code
  1974. // We can't allocate space until we know how big the compiled form will be,
  1975. // but we can't compile it (and thus know how big it is) until we've got a
  1976. // place to put the code. So we cheat: we compile it twice, once with code
  1977. // generation turned off and size counting turned on, and once "for real".
  1978. // This also means that we don't allocate space until we are sure that the
  1979. // thing really will compile successfully, and we never have to move the
  1980. // code and thus invalidate pointers into it. (Note that it has to be in
  1981. // one piece because free() must be able to free it all.)
  1982. // Beware that the optimization-preparation code in here knows about some
  1983. // of the structure of the compiled regexp.
  1984. var
  1985. scan, longest, longestTemp: PRegExprChar;
  1986. Len, LenTemp: integer;
  1987. flags: integer;
  1988. begin
  1989. Result := False; // life too dark
  1990. flags := 0;
  1991. regparse := nil; // for correct error handling
  1992. regexpBegin := ARegExp;
  1993. regExactlyLen := nil;
  1994. ClearInternalIndexes;
  1995. fLastError := reeOk;
  1996. fLastErrorOpcode := TREOp(0);
  1997. try
  1998. if programm <> nil then
  1999. begin
  2000. FreeMem(programm);
  2001. programm := nil;
  2002. end;
  2003. if ARegExp = nil then
  2004. begin
  2005. Error(reeCompNullArgument);
  2006. Exit;
  2007. end;
  2008. fProgModifiers := fModifiers;
  2009. // well, may it's paranoia. I'll check it later... !!!!!!!!
  2010. // First pass: determine size, legality.
  2011. fSecondPass := False;
  2012. fCompModifiers := fModifiers;
  2013. regparse := ARegExp;
  2014. regnpar := 1;
  2015. regsize := 0;
  2016. regcode := @regdummy;
  2017. EmitC(OP_MAGIC);
  2018. if ParseReg(0, flags) = nil then
  2019. Exit;
  2020. // Allocate space.
  2021. GetMem(programm, regsize * SizeOf(REChar));
  2022. // Second pass: emit code.
  2023. fSecondPass := True;
  2024. fCompModifiers := fModifiers;
  2025. regparse := ARegExp;
  2026. regnpar := 1;
  2027. regcode := programm;
  2028. EmitC(OP_MAGIC);
  2029. if ParseReg(0, flags) = nil then
  2030. Exit;
  2031. // Dig out information for optimizations.
  2032. {$IFDEF UseFirstCharSet} // ###0.929
  2033. FirstCharSet := [];
  2034. FillFirstCharSet(programm + REOpSz);
  2035. for Len := 0 to 255 do
  2036. FirstCharArray[Len] := byte(Len) in FirstCharSet;
  2037. {$ENDIF}
  2038. reganchored := #0;
  2039. regmust := nil;
  2040. regmustlen := 0;
  2041. regmustString := '';
  2042. scan := programm + REOpSz; // First OP_BRANCH.
  2043. if PREOp(regnext(scan))^ = OP_EEND then
  2044. begin // Only one top-level choice.
  2045. scan := scan + REOpSz + RENextOffSz;
  2046. // Starting-point info.
  2047. if PREOp(scan)^ = OP_BOL then
  2048. Inc(reganchored);
  2049. // If there's something expensive in the r.e., find the longest
  2050. // literal string that must appear and make it the regmust. Resolve
  2051. // ties in favor of later strings, since the regstart check works
  2052. // with the beginning of the r.e. and avoiding duplication
  2053. // strengthens checking. Not a strong reason, but sufficient in the
  2054. // absence of others.
  2055. if (flags and flag_SpecStart) <> 0 then
  2056. begin
  2057. longest := nil;
  2058. Len := 0;
  2059. while scan <> nil do
  2060. begin
  2061. if PREOp(scan)^ = OP_EXACTLY then
  2062. begin
  2063. longestTemp := scan + REOpSz + RENextOffSz + RENumberSz;
  2064. LenTemp := PLongInt(scan + REOpSz + RENextOffSz)^;
  2065. if LenTemp >= Len then
  2066. begin
  2067. longest := longestTemp;
  2068. Len := LenTemp;
  2069. end;
  2070. end;
  2071. scan := regnext(scan);
  2072. end;
  2073. regmust := longest;
  2074. regmustlen := Len;
  2075. if regmustlen > 1 then // don't use regmust if too short
  2076. SetString(regmustString, regmust, regmustlen);
  2077. end;
  2078. end;
  2079. Result := True;
  2080. finally
  2081. begin
  2082. if not Result then
  2083. InvalidateProgramm;
  2084. regexpBegin := nil;
  2085. regexpIsCompiled := Result; // ###0.944
  2086. end;
  2087. end;
  2088. end; { of function TRegExpr.CompileRegExpr
  2089. -------------------------------------------------------------- }
  2090. procedure TRegExpr.SetUseOsLineEndOnReplace(AValue: boolean);
  2091. begin
  2092. if FUseOsLineEndOnReplace = AValue then
  2093. Exit;
  2094. FUseOsLineEndOnReplace := AValue;
  2095. if FUseOsLineEndOnReplace then
  2096. FReplaceLineEnd := sLineBreak
  2097. else
  2098. FReplaceLineEnd := #10;
  2099. end;
  2100. function TRegExpr.ParseReg(paren: integer; var flagp: integer): PRegExprChar;
  2101. // regular expression, i.e. main body or parenthesized thing
  2102. // Caller must absorb opening parenthesis.
  2103. // Combining parenthesis handling with the base level of regular expression
  2104. // is a trifle forced, but the need to tie the tails of the branches to what
  2105. // follows makes it hard to avoid.
  2106. var
  2107. ret, br, ender: PRegExprChar;
  2108. parno: integer;
  2109. flags: integer;
  2110. SavedModifiers: TRegExprModifiers;
  2111. begin
  2112. flags := 0;
  2113. Result := nil;
  2114. flagp := flag_HasWidth; // Tentatively.
  2115. parno := 0; // eliminate compiler stupid warning
  2116. SavedModifiers := fCompModifiers;
  2117. // Make an OP_OPEN node, if parenthesized.
  2118. if paren <> 0 then
  2119. begin
  2120. if regnpar >= NSUBEXP then
  2121. begin
  2122. Error(reeCompParseRegTooManyBrackets);
  2123. Exit;
  2124. end;
  2125. parno := regnpar;
  2126. Inc(regnpar);
  2127. ret := EmitNode(TREOp(Ord(OP_OPEN) + parno));
  2128. end
  2129. else
  2130. ret := nil;
  2131. // Pick up the branches, linking them together.
  2132. br := ParseBranch(flags);
  2133. if br = nil then
  2134. begin
  2135. Result := nil;
  2136. Exit;
  2137. end;
  2138. if ret <> nil then
  2139. Tail(ret, br) // OP_OPEN -> first.
  2140. else
  2141. ret := br;
  2142. if (flags and flag_HasWidth) = 0 then
  2143. flagp := flagp and not flag_HasWidth;
  2144. flagp := flagp or flags and flag_SpecStart;
  2145. while (regparse^ = '|') do
  2146. begin
  2147. Inc(regparse);
  2148. br := ParseBranch(flags);
  2149. if br = nil then
  2150. begin
  2151. Result := nil;
  2152. Exit;
  2153. end;
  2154. Tail(ret, br); // OP_BRANCH -> OP_BRANCH.
  2155. if (flags and flag_HasWidth) = 0 then
  2156. flagp := flagp and not flag_HasWidth;
  2157. flagp := flagp or flags and flag_SpecStart;
  2158. end;
  2159. // Make a closing node, and hook it on the end.
  2160. if paren <> 0 then
  2161. ender := EmitNode(TREOp(Ord(OP_CLOSE) + parno))
  2162. else
  2163. ender := EmitNode(OP_EEND);
  2164. Tail(ret, ender);
  2165. // Hook the tails of the branches to the closing node.
  2166. br := ret;
  2167. while br <> nil do
  2168. begin
  2169. OpTail(br, ender);
  2170. br := regnext(br);
  2171. end;
  2172. // Check for proper termination.
  2173. if paren <> 0 then
  2174. if regparse^ <> ')' then
  2175. begin
  2176. Error(reeCompParseRegUnmatchedBrackets);
  2177. Exit;
  2178. end
  2179. else
  2180. Inc(regparse); // skip trailing ')'
  2181. if (paren = 0) and (regparse < fRegexEnd) then
  2182. begin
  2183. if regparse^ = ')' then
  2184. Error(reeCompParseRegUnmatchedBrackets2)
  2185. else
  2186. Error(reeCompParseRegJunkOnEnd);
  2187. Exit;
  2188. end;
  2189. fCompModifiers := SavedModifiers; // restore modifiers of parent
  2190. Result := ret;
  2191. end; { of function TRegExpr.ParseReg
  2192. -------------------------------------------------------------- }
  2193. function TRegExpr.ParseBranch(var flagp: integer): PRegExprChar;
  2194. // one alternative of an | operator
  2195. // Implements the concatenation operator.
  2196. var
  2197. ret, chain, latest: PRegExprChar;
  2198. flags: integer;
  2199. begin
  2200. flags := 0;
  2201. flagp := flag_Worst; // Tentatively.
  2202. ret := EmitNode(OP_BRANCH);
  2203. chain := nil;
  2204. while (regparse < fRegexEnd) and (regparse^ <> '|') and (regparse^ <> ')') do
  2205. begin
  2206. latest := ParsePiece(flags);
  2207. if latest = nil then
  2208. begin
  2209. Result := nil;
  2210. Exit;
  2211. end;
  2212. flagp := flagp or flags and flag_HasWidth;
  2213. if chain = nil // First piece.
  2214. then
  2215. flagp := flagp or flags and flag_SpecStart
  2216. else
  2217. Tail(chain, latest);
  2218. chain := latest;
  2219. end;
  2220. if chain = nil // Loop ran zero times.
  2221. then
  2222. EmitNode(OP_NOTHING);
  2223. Result := ret;
  2224. end; { of function TRegExpr.ParseBranch
  2225. -------------------------------------------------------------- }
  2226. function TRegExpr.ParsePiece(var flagp: integer): PRegExprChar;
  2227. // something followed by possible [*+?{]
  2228. // Note that the branching code sequences used for ? and the general cases
  2229. // of * and + and { are somewhat optimized: they use the same OP_NOTHING node as
  2230. // both the endmarker for their branch list and the body of the last branch.
  2231. // It might seem that this node could be dispensed with entirely, but the
  2232. // endmarker role is not redundant.
  2233. function ParseNumber(AStart, AEnd: PRegExprChar): TREBracesArg;
  2234. begin
  2235. Result := 0;
  2236. if AEnd - AStart + 1 > 8 then
  2237. begin // prevent stupid scanning
  2238. Error(reeBRACESArgTooBig);
  2239. Exit;
  2240. end;
  2241. while AStart <= AEnd do
  2242. begin
  2243. Result := Result * 10 + (Ord(AStart^) - Ord('0'));
  2244. Inc(AStart);
  2245. end;
  2246. if (Result > MaxBracesArg) or (Result < 0) then
  2247. begin
  2248. Error(reeBRACESArgTooBig);
  2249. Exit;
  2250. end;
  2251. end;
  2252. var
  2253. TheOp: TREOp;
  2254. NextNode: PRegExprChar;
  2255. procedure EmitComplexBraces(ABracesMin, ABracesMax: TREBracesArg; ANonGreedyOp: boolean); // ###0.940
  2256. {$IFDEF ComplexBraces}
  2257. var
  2258. off: TRENextOff;
  2259. {$ENDIF}
  2260. begin
  2261. {$IFNDEF ComplexBraces}
  2262. Error(reeComplexBracesNotImplemented);
  2263. {$ELSE}
  2264. if ANonGreedyOp then
  2265. TheOp := OP_LOOPNG
  2266. else
  2267. TheOp := OP_LOOP;
  2268. InsertOperator(OP_LOOPENTRY, Result, REOpSz + RENextOffSz);
  2269. NextNode := EmitNode(TheOp);
  2270. if regcode <> @regdummy then
  2271. begin
  2272. off := (Result + REOpSz + RENextOffSz) - (regcode - REOpSz - RENextOffSz);
  2273. // back to Atom after OP_LOOPENTRY
  2274. PREBracesArg(AlignToInt(regcode))^ := ABracesMin;
  2275. Inc(regcode, REBracesArgSz);
  2276. PREBracesArg(AlignToInt(regcode))^ := ABracesMax;
  2277. Inc(regcode, REBracesArgSz);
  2278. PRENextOff(AlignToPtr(regcode))^ := off;
  2279. Inc(regcode, RENextOffSz);
  2280. {$IFDEF DebugSynRegExpr}
  2281. if regcode - programm > regsize then
  2282. raise Exception.Create
  2283. ('TRegExpr.ParsePiece.EmitComplexBraces buffer overrun');
  2284. {$ENDIF}
  2285. end
  2286. else
  2287. Inc(regsize, REBracesArgSz * 2 + RENextOffSz);
  2288. Tail(Result, NextNode); // OP_LOOPENTRY -> OP_LOOP
  2289. if regcode <> @regdummy then
  2290. Tail(Result + REOpSz + RENextOffSz, NextNode); // Atom -> OP_LOOP
  2291. {$ENDIF}
  2292. end;
  2293. procedure EmitSimpleBraces(ABracesMin, ABracesMax: TREBracesArg; ANonGreedyOp: boolean); // ###0.940
  2294. begin
  2295. if ANonGreedyOp // ###0.940
  2296. then
  2297. TheOp := OP_BRACESNG
  2298. else
  2299. TheOp := OP_BRACES;
  2300. InsertOperator(TheOp, Result, REOpSz + RENextOffSz + REBracesArgSz * 2);
  2301. if regcode <> @regdummy then
  2302. begin
  2303. PREBracesArg(AlignToInt(Result + REOpSz + RENextOffSz))^ := ABracesMin;
  2304. PREBracesArg(AlignToInt(Result + REOpSz + RENextOffSz + REBracesArgSz))^ := ABracesMax;
  2305. end;
  2306. end;
  2307. var
  2308. op: REChar;
  2309. NonGreedyOp, NonGreedyCh: boolean; // ###0.940
  2310. flags: integer;
  2311. BracesMin, Bracesmax: TREBracesArg;
  2312. p: PRegExprChar;
  2313. begin
  2314. flags := 0;
  2315. Result := ParseAtom(flags);
  2316. if Result = nil then
  2317. Exit;
  2318. op := regparse^;
  2319. if not ((op = '*') or (op = '+') or (op = '?') or (op = '{')) then
  2320. begin
  2321. flagp := flags;
  2322. Exit;
  2323. end;
  2324. if ((flags and flag_HasWidth) = 0) and (op <> '?') then
  2325. begin
  2326. Error(reePlusStarOperandCouldBeEmpty);
  2327. Exit;
  2328. end;
  2329. case op of
  2330. '*':
  2331. begin
  2332. flagp := flag_Worst or flag_SpecStart;
  2333. NonGreedyCh := (regparse + 1)^ = '?'; // ###0.940
  2334. NonGreedyOp := NonGreedyCh or not fCompModifiers.G;
  2335. // ###0.940
  2336. if (flags and flag_Simple) = 0 then
  2337. begin
  2338. if NonGreedyOp // ###0.940
  2339. then
  2340. EmitComplexBraces(0, MaxBracesArg, NonGreedyOp)
  2341. else
  2342. begin // Emit x* as (x&|), where & means "self".
  2343. InsertOperator(OP_BRANCH, Result, REOpSz + RENextOffSz); // Either x
  2344. OpTail(Result, EmitNode(OP_BACK)); // and loop
  2345. OpTail(Result, Result); // back
  2346. Tail(Result, EmitNode(OP_BRANCH)); // or
  2347. Tail(Result, EmitNode(OP_NOTHING)); // nil.
  2348. end
  2349. end
  2350. else
  2351. begin // Simple
  2352. if NonGreedyOp // ###0.940
  2353. then
  2354. TheOp := OP_STARNG
  2355. else
  2356. TheOp := OP_STAR;
  2357. InsertOperator(TheOp, Result, REOpSz + RENextOffSz);
  2358. end;
  2359. if NonGreedyCh // ###0.940
  2360. then
  2361. Inc(regparse); // Skip extra char ('?')
  2362. end; { of case '*' }
  2363. '+':
  2364. begin
  2365. flagp := flag_Worst or flag_SpecStart or flag_HasWidth;
  2366. NonGreedyCh := (regparse + 1)^ = '?'; // ###0.940
  2367. NonGreedyOp := NonGreedyCh or not fCompModifiers.G;
  2368. // ###0.940
  2369. if (flags and flag_Simple) = 0 then
  2370. begin
  2371. if NonGreedyOp // ###0.940
  2372. then
  2373. EmitComplexBraces(1, MaxBracesArg, NonGreedyOp)
  2374. else
  2375. begin // Emit x+ as x(&|), where & means "self".
  2376. NextNode := EmitNode(OP_BRANCH); // Either
  2377. Tail(Result, NextNode);
  2378. Tail(EmitNode(OP_BACK), Result); // loop back
  2379. Tail(NextNode, EmitNode(OP_BRANCH)); // or
  2380. Tail(Result, EmitNode(OP_NOTHING)); // nil.
  2381. end
  2382. end
  2383. else
  2384. begin // Simple
  2385. if NonGreedyOp // ###0.940
  2386. then
  2387. TheOp := OP_PLUSNG
  2388. else
  2389. TheOp := OP_PLUS;
  2390. InsertOperator(TheOp, Result, REOpSz + RENextOffSz);
  2391. end;
  2392. if NonGreedyCh // ###0.940
  2393. then
  2394. Inc(regparse); // Skip extra char ('?')
  2395. end; { of case '+' }
  2396. '?':
  2397. begin
  2398. flagp := flag_Worst;
  2399. NonGreedyCh := (regparse + 1)^ = '?'; // ###0.940
  2400. NonGreedyOp := NonGreedyCh or not fCompModifiers.G;
  2401. // ###0.940
  2402. if NonGreedyOp then
  2403. begin // ###0.940 // We emit x?? as x{0,1}?
  2404. if (flags and flag_Simple) = 0 then
  2405. EmitComplexBraces(0, 1, NonGreedyOp)
  2406. else
  2407. EmitSimpleBraces(0, 1, NonGreedyOp);
  2408. end
  2409. else
  2410. begin // greedy '?'
  2411. InsertOperator(OP_BRANCH, Result, REOpSz + RENextOffSz); // Either x
  2412. Tail(Result, EmitNode(OP_BRANCH)); // or
  2413. NextNode := EmitNode(OP_NOTHING); // nil.
  2414. Tail(Result, NextNode);
  2415. OpTail(Result, NextNode);
  2416. end;
  2417. if NonGreedyCh // ###0.940
  2418. then
  2419. Inc(regparse); // Skip extra char ('?')
  2420. end; { of case '?' }
  2421. '{':
  2422. begin
  2423. Inc(regparse);
  2424. p := regparse;
  2425. while IsDigitChar(regparse^) do // <min> MUST appear
  2426. Inc(regparse);
  2427. if (regparse^ <> '}') and (regparse^ <> ',') or (p = regparse) then
  2428. begin
  2429. Error(reeIncorrectBraces);
  2430. Exit;
  2431. end;
  2432. BracesMin := ParseNumber(p, regparse - 1);
  2433. if regparse^ = ',' then
  2434. begin
  2435. Inc(regparse);
  2436. p := regparse;
  2437. while IsDigitChar(regparse^) do
  2438. Inc(regparse);
  2439. if regparse^ <> '}' then
  2440. begin
  2441. Error(reeIncorrectBraces);
  2442. Exit;
  2443. end;
  2444. if p = regparse then
  2445. Bracesmax := MaxBracesArg
  2446. else
  2447. Bracesmax := ParseNumber(p, regparse - 1);
  2448. end
  2449. else
  2450. Bracesmax := BracesMin; // {n} == {n,n}
  2451. if BracesMin > Bracesmax then
  2452. begin
  2453. Error(reeBracesMinParamGreaterMax);
  2454. Exit;
  2455. end;
  2456. if BracesMin > 0 then
  2457. flagp := flag_Worst;
  2458. if Bracesmax > 0 then
  2459. flagp := flagp or flag_HasWidth or flag_SpecStart;
  2460. NonGreedyCh := (regparse + 1)^ = '?'; // ###0.940
  2461. NonGreedyOp := NonGreedyCh or not fCompModifiers.G;
  2462. // ###0.940
  2463. if (flags and flag_Simple) <> 0 then
  2464. EmitSimpleBraces(BracesMin, Bracesmax, NonGreedyOp)
  2465. else
  2466. EmitComplexBraces(BracesMin, Bracesmax, NonGreedyOp);
  2467. if NonGreedyCh // ###0.940
  2468. then
  2469. Inc(regparse); // Skip extra char '?'
  2470. end; // of case '{'
  2471. // else // here we can't be
  2472. end; { of case op }
  2473. Inc(regparse);
  2474. op := regparse^;
  2475. if (op = '*') or (op = '+') or (op = '?') or (op = '{') then
  2476. Error(reeNestedSQP);
  2477. end; { of function TRegExpr.ParsePiece
  2478. -------------------------------------------------------------- }
  2479. function TRegExpr.HexDig(Ch: REChar): integer;
  2480. begin
  2481. case Ch of
  2482. '0' .. '9':
  2483. Result := Ord(Ch) - Ord('0');
  2484. 'a' .. 'f':
  2485. Result := Ord(Ch) - Ord('a') + 10;
  2486. 'A' .. 'F':
  2487. Result := Ord(Ch) - Ord('A') + 10;
  2488. else
  2489. begin
  2490. Result := 0;
  2491. Error(reeBadHexDigit);
  2492. end;
  2493. end;
  2494. end;
  2495. function TRegExpr.UnQuoteChar(var APtr: PRegExprChar): REChar;
  2496. var
  2497. Ch: REChar;
  2498. begin
  2499. case APtr^ of
  2500. 't':
  2501. Result := #$9; // \t => tab (HT/TAB)
  2502. 'n':
  2503. Result := #$a; // \n => newline (NL)
  2504. 'r':
  2505. Result := #$d; // \r => carriage return (CR)
  2506. 'f':
  2507. Result := #$c; // \f => form feed (FF)
  2508. 'a':
  2509. Result := #$7; // \a => alarm (bell) (BEL)
  2510. 'e':
  2511. Result := #$1b; // \e => escape (ESC)
  2512. 'c':
  2513. begin // \cK => code for Ctrl+K
  2514. Inc(APtr);
  2515. if APtr >= fRegexEnd then
  2516. Error(reeNoLetterAfterBSlashC);
  2517. Ch := APtr^;
  2518. case Ch of
  2519. 'a' .. 'z':
  2520. Result := REChar(Ord(Ch) - Ord('a') + 1);
  2521. 'A' .. 'Z':
  2522. Result := REChar(Ord(Ch) - Ord('A') + 1);
  2523. else
  2524. Error(reeNoLetterAfterBSlashC);
  2525. end;
  2526. end;
  2527. 'x':
  2528. begin // \x: hex char
  2529. Result := #0;
  2530. Inc(APtr);
  2531. if APtr >= fRegexEnd then
  2532. begin
  2533. Error(reeNoHexCodeAfterBSlashX);
  2534. Exit;
  2535. end;
  2536. if APtr^ = '{' then
  2537. begin // \x{nnnn} //###0.936
  2538. repeat
  2539. Inc(APtr);
  2540. if APtr >= fRegexEnd then
  2541. begin
  2542. Error(reeNoHexCodeAfterBSlashX);
  2543. Exit;
  2544. end;
  2545. if APtr^ <> '}' then
  2546. begin
  2547. if (Ord(Result) ShR (SizeOf(REChar) * 8 - 4)) and $F <> 0 then
  2548. begin
  2549. Error(reeHexCodeAfterBSlashXTooBig);
  2550. Exit;
  2551. end;
  2552. Result := REChar((Ord(Result) ShL 4) or HexDig(APtr^));
  2553. // HexDig will cause Error if bad hex digit found
  2554. end
  2555. else
  2556. Break;
  2557. until False;
  2558. end
  2559. else
  2560. begin
  2561. Result := REChar(HexDig(APtr^));
  2562. // HexDig will cause Error if bad hex digit found
  2563. Inc(APtr);
  2564. if APtr >= fRegexEnd then
  2565. begin
  2566. Error(reeNoHexCodeAfterBSlashX);
  2567. Exit;
  2568. end;
  2569. Result := REChar((Ord(Result) ShL 4) or HexDig(APtr^));
  2570. // HexDig will cause Error if bad hex digit found
  2571. end;
  2572. end;
  2573. else
  2574. Result := APtr^;
  2575. end;
  2576. end;
  2577. function TRegExpr.ParseAtom(var flagp: integer): PRegExprChar;
  2578. // the lowest level
  2579. // Optimization: gobbles an entire sequence of ordinary characters so that
  2580. // it can turn them into a single node, which is smaller to store and
  2581. // faster to run. Backslashed characters are exceptions, each becoming a
  2582. // separate node; the code is simpler that way and it's not worth fixing.
  2583. var
  2584. ret: PRegExprChar;
  2585. RangeBeg, RangeEnd: REChar;
  2586. CanBeRange: boolean;
  2587. AddrOfLen: PLongInt;
  2588. procedure EmitExactly(Ch: REChar); {$IFDEF InlineFuncs}inline;{$ENDIF}
  2589. begin
  2590. if fCompModifiers.I then
  2591. ret := EmitNode(OP_EXACTLYCI)
  2592. else
  2593. ret := EmitNode(OP_EXACTLY);
  2594. EmitInt(1);
  2595. EmitC(Ch);
  2596. flagp := flagp or flag_HasWidth or flag_Simple;
  2597. end;
  2598. procedure EmitRangeChar(Ch: REChar; AStartOfRange: boolean); {$IFDEF InlineFuncs}inline;{$ENDIF}
  2599. begin
  2600. CanBeRange := AStartOfRange;
  2601. if fCompModifiers.I then
  2602. Ch := _UpperCase(Ch);
  2603. if AStartOfRange then
  2604. begin
  2605. AddrOfLen := nil;
  2606. RangeBeg := Ch;
  2607. end
  2608. else
  2609. begin
  2610. if AddrOfLen = nil then
  2611. begin
  2612. EmitC(OpKind_Char);
  2613. Pointer(AddrOfLen) := regcode;
  2614. EmitInt(0);
  2615. end;
  2616. Inc(AddrOfLen^);
  2617. EmitC(Ch);
  2618. end;
  2619. end;
  2620. procedure EmitRangePacked(ch1, ch2: REChar); {$IFDEF InlineFuncs}inline;{$ENDIF}
  2621. var
  2622. ChkIndex: integer;
  2623. begin
  2624. AddrOfLen := nil;
  2625. CanBeRange := False;
  2626. if fCompModifiers.I then
  2627. begin
  2628. ch1 := _UpperCase(ch1);
  2629. ch2 := _UpperCase(ch2);
  2630. end;
  2631. for ChkIndex := Low(CharCheckerInfos) to High(CharCheckerInfos) do
  2632. if (CharCheckerInfos[ChkIndex].CharBegin = ch1) and
  2633. (CharCheckerInfos[ChkIndex].CharEnd = ch2) then
  2634. begin
  2635. EmitC(OpKind_MetaClass);
  2636. EmitC(REChar(CharCheckerInfos[ChkIndex].CheckerIndex));
  2637. Exit;
  2638. end;
  2639. EmitC(OpKind_Range);
  2640. EmitC(ch1);
  2641. EmitC(ch2);
  2642. end;
  2643. var
  2644. flags: integer;
  2645. Len: integer;
  2646. SavedPtr: PRegExprChar;
  2647. EnderChar, TempChar: REChar;
  2648. begin
  2649. Result := nil;
  2650. flags := 0;
  2651. flagp := flag_Worst;
  2652. AddrOfLen := nil;
  2653. Inc(regparse);
  2654. case (regparse - 1)^ of
  2655. '^':
  2656. if not fCompModifiers.M or
  2657. ((fLineSeparators = '') and not fLinePairedSeparatorAssigned) then
  2658. ret := EmitNode(OP_BOL)
  2659. else
  2660. ret := EmitNode(OP_BOLML);
  2661. '$':
  2662. if not fCompModifiers.M or
  2663. ((fLineSeparators = '') and not fLinePairedSeparatorAssigned) then
  2664. ret := EmitNode(OP_EOL)
  2665. else
  2666. ret := EmitNode(OP_EOLML);
  2667. '.':
  2668. if fCompModifiers.S then
  2669. begin
  2670. ret := EmitNode(OP_ANY);
  2671. flagp := flagp or flag_HasWidth or flag_Simple;
  2672. end
  2673. else
  2674. begin // not /s, so emit [^:LineSeparators:]
  2675. ret := EmitNode(OP_ANYML);
  2676. flagp := flagp or flag_HasWidth; // not so simple ;)
  2677. end;
  2678. '[':
  2679. begin
  2680. if regparse^ = '^' then
  2681. begin // Complement of range.
  2682. if fCompModifiers.I then
  2683. ret := EmitNode(OP_ANYBUTCI)
  2684. else
  2685. ret := EmitNode(OP_ANYBUT);
  2686. Inc(regparse);
  2687. end
  2688. else if fCompModifiers.I then
  2689. ret := EmitNode(OP_ANYOFCI)
  2690. else
  2691. ret := EmitNode(OP_ANYOF);
  2692. CanBeRange := False;
  2693. if regparse^ = ']' then
  2694. begin
  2695. // first ']' inside [] treated as simple char, no need to check '['
  2696. EmitRangeChar(regparse^, (regparse + 1)^ = '-');
  2697. Inc(regparse);
  2698. end;
  2699. while (regparse < fRegexEnd) and (regparse^ <> ']') do
  2700. begin
  2701. if (regparse^ = '-') and ((regparse + 1) < fRegexEnd) and
  2702. ((regparse + 1)^ <> ']') and CanBeRange then
  2703. begin
  2704. Inc(regparse);
  2705. RangeEnd := regparse^;
  2706. if RangeEnd = EscChar then
  2707. begin
  2708. if _IsMetaChar((regparse + 1)^) then
  2709. begin
  2710. Error(reeMetaCharAfterMinusInRange);
  2711. Exit;
  2712. end;
  2713. Inc(regparse);
  2714. RangeEnd := UnQuoteChar(regparse);
  2715. end;
  2716. // special handling for Russian range a-YA, add 2 ranges: a-ya and A-YA
  2717. if fCompModifiers.R and
  2718. (RangeBeg = RusRangeLoLow) and (RangeEnd = RusRangeHiHigh) then
  2719. begin
  2720. EmitRangePacked(RusRangeLoLow, RusRangeLoHigh);
  2721. EmitRangePacked(RusRangeHiLow, RusRangeHiHigh);
  2722. end
  2723. else
  2724. begin // standard r.e. handling
  2725. if RangeBeg > RangeEnd then
  2726. begin
  2727. Error(reeInvalidRange);
  2728. Exit;
  2729. end;
  2730. EmitRangePacked(RangeBeg, RangeEnd);
  2731. end;
  2732. Inc(regparse);
  2733. end
  2734. else
  2735. begin
  2736. if regparse^ = EscChar then
  2737. begin
  2738. Inc(regparse);
  2739. if regparse >= fRegexEnd then
  2740. begin
  2741. Error(reeParseAtomTrailingBackSlash);
  2742. Exit;
  2743. end;
  2744. if _IsMetaChar(regparse^) then
  2745. begin
  2746. AddrOfLen := nil;
  2747. CanBeRange := False;
  2748. EmitC(OpKind_MetaClass);
  2749. case regparse^ of
  2750. 'w':
  2751. EmitC(REChar(CheckerIndex_Word));
  2752. 'W':
  2753. EmitC(REChar(CheckerIndex_NotWord));
  2754. 's':
  2755. EmitC(REChar(CheckerIndex_Space));
  2756. 'S':
  2757. EmitC(REChar(CheckerIndex_NotSpace));
  2758. 'd':
  2759. EmitC(REChar(CheckerIndex_Digit));
  2760. 'D':
  2761. EmitC(REChar(CheckerIndex_NotDigit));
  2762. 'v':
  2763. EmitC(REChar(CheckerIndex_VertSep));
  2764. 'V':
  2765. EmitC(REChar(CheckerIndex_NotVertSep));
  2766. 'h':
  2767. EmitC(REChar(CheckerIndex_HorzSep));
  2768. 'H':
  2769. EmitC(REChar(CheckerIndex_NotHorzSep));
  2770. else
  2771. Error(reeBadOpcodeInCharClass);
  2772. end;
  2773. end
  2774. else
  2775. begin
  2776. TempChar := UnQuoteChar(regparse);
  2777. EmitRangeChar(TempChar, (regparse + 1)^ = '-');
  2778. end;
  2779. end
  2780. else
  2781. begin
  2782. EmitRangeChar(regparse^, (regparse + 1)^ = '-');
  2783. end;
  2784. Inc(regparse);
  2785. end;
  2786. end; { of while }
  2787. AddrOfLen := nil;
  2788. CanBeRange := False;
  2789. EmitC(OpKind_End);
  2790. if regparse^ <> ']' then
  2791. begin
  2792. Error(reeUnmatchedSqBrackets);
  2793. Exit;
  2794. end;
  2795. Inc(regparse);
  2796. flagp := flagp or flag_HasWidth or flag_Simple;
  2797. end;
  2798. '(':
  2799. begin
  2800. if regparse^ = '?' then
  2801. begin
  2802. // check for non-capturing group: (?:text)
  2803. if (regparse + 1)^ = ':' then
  2804. begin
  2805. Inc(regparse, 2);
  2806. ret := ParseReg(1, flags);
  2807. if ret = nil then
  2808. begin
  2809. Result := nil;
  2810. Exit;
  2811. end;
  2812. flagp := flagp or flags and (flag_HasWidth or flag_SpecStart);
  2813. end
  2814. else
  2815. // check for extended Perl syntax : (?..)
  2816. if (regparse + 1)^ = '#' then
  2817. begin // (?#comment)
  2818. Inc(regparse, 2); // find closing ')'
  2819. while (regparse < fRegexEnd) and (regparse^ <> ')') do
  2820. Inc(regparse);
  2821. if regparse^ <> ')' then
  2822. begin
  2823. Error(reeUnclosedComment);
  2824. Exit;
  2825. end;
  2826. Inc(regparse); // skip ')'
  2827. ret := EmitNode(OP_COMMENT); // comment
  2828. end
  2829. else
  2830. begin // modifiers ?
  2831. Inc(regparse); // skip '?'
  2832. SavedPtr := regparse;
  2833. while (regparse < fRegexEnd) and (regparse^ <> ')') do
  2834. Inc(regparse);
  2835. if (regparse^ <> ')') or
  2836. not ParseModifiers(SavedPtr, regparse - SavedPtr, fCompModifiers) then
  2837. begin
  2838. Error(reeUnrecognizedModifier);
  2839. Exit;
  2840. end;
  2841. Inc(regparse); // skip ')'
  2842. ret := EmitNode(OP_COMMENT); // comment
  2843. // Error (reeQPSBFollowsNothing);
  2844. // Exit;
  2845. end;
  2846. end
  2847. else
  2848. begin
  2849. // normal (capturing) group
  2850. if fSecondPass then
  2851. // must skip this block for one of passes, to not double groups count
  2852. if GrpCount < NSUBEXP - 1 then
  2853. begin
  2854. Inc(GrpCount);
  2855. GrpIndexes[GrpCount] := regnpar;
  2856. end;
  2857. ret := ParseReg(1, flags);
  2858. if ret = nil then
  2859. begin
  2860. Result := nil;
  2861. Exit;
  2862. end;
  2863. flagp := flagp or flags and (flag_HasWidth or flag_SpecStart);
  2864. end;
  2865. end;
  2866. '|', ')':
  2867. begin // Supposed to be caught earlier.
  2868. Error(reeInternalUrp);
  2869. Exit;
  2870. end;
  2871. '?', '+', '*':
  2872. begin
  2873. Error(reeQPSBFollowsNothing);
  2874. Exit;
  2875. end;
  2876. EscChar:
  2877. begin
  2878. if regparse >= fRegexEnd then
  2879. begin
  2880. Error(reeTrailingBackSlash);
  2881. Exit;
  2882. end;
  2883. case regparse^ of // r.e.extensions
  2884. 'b':
  2885. ret := EmitNode(OP_BOUND); // ###0.943
  2886. 'B':
  2887. ret := EmitNode(OP_NOTBOUND); // ###0.943
  2888. 'A':
  2889. ret := EmitNode(OP_BOL); // ###0.941
  2890. 'Z':
  2891. ret := EmitNode(OP_EOL); // ###0.941
  2892. 'd':
  2893. begin // r.e.extension - any digit ('0' .. '9')
  2894. ret := EmitNode(OP_ANYDIGIT);
  2895. flagp := flagp or flag_HasWidth or flag_Simple;
  2896. end;
  2897. 'D':
  2898. begin // r.e.extension - not digit ('0' .. '9')
  2899. ret := EmitNode(OP_NOTDIGIT);
  2900. flagp := flagp or flag_HasWidth or flag_Simple;
  2901. end;
  2902. 's':
  2903. begin // r.e.extension - any space char
  2904. ret := EmitNode(OP_ANYSPACE);
  2905. flagp := flagp or flag_HasWidth or flag_Simple;
  2906. end;
  2907. 'S':
  2908. begin // r.e.extension - not space char
  2909. ret := EmitNode(OP_NOTSPACE);
  2910. flagp := flagp or flag_HasWidth or flag_Simple;
  2911. end;
  2912. 'w':
  2913. begin // r.e.extension - any english char / digit / '_'
  2914. ret := EmitNode(OP_ANYLETTER);
  2915. flagp := flagp or flag_HasWidth or flag_Simple;
  2916. end;
  2917. 'W':
  2918. begin // r.e.extension - not english char / digit / '_'
  2919. ret := EmitNode(OP_NOTLETTER);
  2920. flagp := flagp or flag_HasWidth or flag_Simple;
  2921. end;
  2922. 'v':
  2923. begin
  2924. ret := EmitNode(OP_ANYVERTSEP);
  2925. flagp := flagp or flag_HasWidth or flag_Simple;
  2926. end;
  2927. 'V':
  2928. begin
  2929. ret := EmitNode(OP_NOTVERTSEP);
  2930. flagp := flagp or flag_HasWidth or flag_Simple;
  2931. end;
  2932. 'h':
  2933. begin
  2934. ret := EmitNode(OP_ANYHORZSEP);
  2935. flagp := flagp or flag_HasWidth or flag_Simple;
  2936. end;
  2937. 'H':
  2938. begin
  2939. ret := EmitNode(OP_NOTHORZSEP);
  2940. flagp := flagp or flag_HasWidth or flag_Simple;
  2941. end;
  2942. '1' .. '9':
  2943. begin // ###0.936
  2944. if fCompModifiers.I then
  2945. ret := EmitNode(OP_BSUBEXPCI)
  2946. else
  2947. ret := EmitNode(OP_BSUBEXP);
  2948. EmitC(REChar(Ord(regparse^) - Ord('0')));
  2949. flagp := flagp or flag_HasWidth or flag_Simple;
  2950. end;
  2951. else
  2952. EmitExactly(UnQuoteChar(regparse));
  2953. end; { of case }
  2954. Inc(regparse);
  2955. end;
  2956. else
  2957. begin
  2958. Dec(regparse);
  2959. if fCompModifiers.X and // check for eXtended syntax
  2960. ((regparse^ = '#') or IsIgnoredChar(regparse^)) then
  2961. begin // ###0.941 \x
  2962. if regparse^ = '#' then
  2963. begin // Skip eXtended comment
  2964. // find comment terminator (group of \n and/or \r)
  2965. while (regparse < fRegexEnd) and (regparse^ <> #$d) and
  2966. (regparse^ <> #$a) do
  2967. Inc(regparse);
  2968. while (regparse^ = #$d) or (regparse^ = #$a)
  2969. // skip comment terminator
  2970. do
  2971. Inc(regparse);
  2972. // attempt to support different type of line separators
  2973. end
  2974. else
  2975. begin // Skip the blanks!
  2976. while IsIgnoredChar(regparse^) do
  2977. Inc(regparse);
  2978. end;
  2979. ret := EmitNode(OP_COMMENT); // comment
  2980. end
  2981. else
  2982. begin
  2983. Len := FindSkippedMetaLen(regparse, fRegexEnd);
  2984. if Len <= 0 then
  2985. if regparse^ <> '{' then
  2986. begin
  2987. Error(reeRarseAtomInternalDisaster);
  2988. Exit;
  2989. end
  2990. else
  2991. Len := FindSkippedMetaLen(regparse + 1, fRegexEnd) + 1;
  2992. // bad {n,m} - compile as EXACTLY
  2993. EnderChar := (regparse + Len)^;
  2994. if (Len > 1) and ((EnderChar = '*') or (EnderChar = '+') or (EnderChar = '?') or (EnderChar = '{')) then
  2995. Dec(Len); // back off clear of ?+*{ operand.
  2996. flagp := flagp or flag_HasWidth;
  2997. if Len = 1 then
  2998. flagp := flagp or flag_Simple;
  2999. if fCompModifiers.I then
  3000. ret := EmitNode(OP_EXACTLYCI)
  3001. else
  3002. ret := EmitNode(OP_EXACTLY);
  3003. EmitInt(0);
  3004. while (Len > 0) and ((not fCompModifiers.X) or (regparse^ <> '#')) do
  3005. begin
  3006. if not fCompModifiers.X or not IsIgnoredChar(regparse^) then
  3007. begin
  3008. EmitC(regparse^);
  3009. if regcode <> @regdummy then
  3010. Inc(regExactlyLen^);
  3011. end;
  3012. Inc(regparse);
  3013. Dec(Len);
  3014. end;
  3015. end; { of if not comment }
  3016. end; { of case else }
  3017. end; { of case }
  3018. Result := ret;
  3019. end; { of function TRegExpr.ParseAtom
  3020. -------------------------------------------------------------- }
  3021. function TRegExpr.GetCompilerErrorPos: PtrInt;
  3022. begin
  3023. Result := 0;
  3024. if (regexpBegin = nil) or (regparse = nil) then
  3025. Exit; // not in compiling mode ?
  3026. Result := regparse - regexpBegin;
  3027. end; { of function TRegExpr.GetCompilerErrorPos
  3028. -------------------------------------------------------------- }
  3029. { ============================================================= }
  3030. { ===================== Matching section ====================== }
  3031. { ============================================================= }
  3032. function TRegExpr.regrepeat(p: PRegExprChar; AMax: integer): integer;
  3033. // repeatedly match something simple, report how many
  3034. var
  3035. scan: PRegExprChar;
  3036. opnd: PRegExprChar;
  3037. TheMax, NLen: integer;
  3038. InvChar: REChar; // ###0.931
  3039. GrpStart, GrpEnd: PRegExprChar; // ###0.936
  3040. ArrayIndex: integer;
  3041. begin
  3042. Result := 0;
  3043. scan := reginput;
  3044. opnd := p + REOpSz + RENextOffSz; // OPERAND
  3045. TheMax := fInputEnd - scan;
  3046. if TheMax > AMax then
  3047. TheMax := AMax;
  3048. case PREOp(p)^ of
  3049. OP_ANY:
  3050. begin
  3051. // note - OP_ANYML cannot be proceeded in regrepeat because can skip
  3052. // more than one char at once
  3053. Result := TheMax;
  3054. Inc(scan, Result);
  3055. end;
  3056. OP_EXACTLY:
  3057. begin // in opnd can be only ONE char !!!
  3058. NLen := PLongInt(opnd)^;
  3059. if TheMax > NLen then
  3060. TheMax := NLen;
  3061. Inc(opnd, RENumberSz);
  3062. while (Result < TheMax) and (opnd^ = scan^) do
  3063. begin
  3064. Inc(Result);
  3065. Inc(scan);
  3066. end;
  3067. end;
  3068. OP_EXACTLYCI:
  3069. begin // in opnd can be only ONE char !!!
  3070. NLen := PLongInt(opnd)^;
  3071. if TheMax > NLen then
  3072. TheMax := NLen;
  3073. Inc(opnd, RENumberSz);
  3074. while (Result < TheMax) and (opnd^ = scan^) do
  3075. begin // prevent unneeded InvertCase //###0.931
  3076. Inc(Result);
  3077. Inc(scan);
  3078. end;
  3079. if Result < TheMax then
  3080. begin // ###0.931
  3081. InvChar := InvertCase(opnd^); // store in register
  3082. while (Result < TheMax) and ((opnd^ = scan^) or (InvChar = scan^)) do
  3083. begin
  3084. Inc(Result);
  3085. Inc(scan);
  3086. end;
  3087. end;
  3088. end;
  3089. OP_BSUBEXP:
  3090. begin // ###0.936
  3091. ArrayIndex := GrpIndexes[Ord(opnd^)];
  3092. if ArrayIndex < 0 then
  3093. Exit;
  3094. GrpStart := startp[ArrayIndex];
  3095. if GrpStart = nil then
  3096. Exit;
  3097. GrpEnd := endp[ArrayIndex];
  3098. if GrpEnd = nil then
  3099. Exit;
  3100. repeat
  3101. opnd := GrpStart;
  3102. while opnd < GrpEnd do
  3103. begin
  3104. if (scan >= fInputEnd) or (scan^ <> opnd^) then
  3105. Exit;
  3106. Inc(scan);
  3107. Inc(opnd);
  3108. end;
  3109. Inc(Result);
  3110. reginput := scan;
  3111. until Result >= AMax;
  3112. end;
  3113. OP_BSUBEXPCI:
  3114. begin // ###0.936
  3115. ArrayIndex := GrpIndexes[Ord(opnd^)];
  3116. if ArrayIndex < 0 then
  3117. Exit;
  3118. GrpStart := startp[ArrayIndex];
  3119. if GrpStart = nil then
  3120. Exit;
  3121. GrpEnd := endp[ArrayIndex];
  3122. if GrpEnd = nil then
  3123. Exit;
  3124. repeat
  3125. opnd := GrpStart;
  3126. while opnd < GrpEnd do
  3127. begin
  3128. if (scan >= fInputEnd) or
  3129. ((scan^ <> opnd^) and (scan^ <> InvertCase(opnd^))) then
  3130. Exit;
  3131. Inc(scan);
  3132. Inc(opnd);
  3133. end;
  3134. Inc(Result);
  3135. reginput := scan;
  3136. until Result >= AMax;
  3137. end;
  3138. OP_ANYDIGIT:
  3139. while (Result < TheMax) and IsDigitChar(scan^) do
  3140. begin
  3141. Inc(Result);
  3142. Inc(scan);
  3143. end;
  3144. OP_NOTDIGIT:
  3145. while (Result < TheMax) and not IsDigitChar(scan^) do
  3146. begin
  3147. Inc(Result);
  3148. Inc(scan);
  3149. end;
  3150. OP_ANYLETTER:
  3151. while (Result < TheMax) and IsWordChar(scan^) do // ###0.940
  3152. begin
  3153. Inc(Result);
  3154. Inc(scan);
  3155. end;
  3156. OP_NOTLETTER:
  3157. while (Result < TheMax) and not IsWordChar(scan^) do // ###0.940
  3158. begin
  3159. Inc(Result);
  3160. Inc(scan);
  3161. end;
  3162. OP_ANYSPACE:
  3163. while (Result < TheMax) and IsSpaceChar(scan^) do
  3164. begin
  3165. Inc(Result);
  3166. Inc(scan);
  3167. end;
  3168. OP_NOTSPACE:
  3169. while (Result < TheMax) and not IsSpaceChar(scan^) do
  3170. begin
  3171. Inc(Result);
  3172. Inc(scan);
  3173. end;
  3174. OP_ANYVERTSEP:
  3175. while (Result < TheMax) and IsLineSeparator(scan^) do
  3176. begin
  3177. Inc(Result);
  3178. Inc(scan);
  3179. end;
  3180. OP_NOTVERTSEP:
  3181. while (Result < TheMax) and not IsLineSeparator(scan^) do
  3182. begin
  3183. Inc(Result);
  3184. Inc(scan);
  3185. end;
  3186. OP_ANYHORZSEP:
  3187. while (Result < TheMax) and IsHorzSeparator(scan^) do
  3188. begin
  3189. Inc(Result);
  3190. Inc(scan);
  3191. end;
  3192. OP_NOTHORZSEP:
  3193. while (Result < TheMax) and not IsHorzSeparator(scan^) do
  3194. begin
  3195. Inc(Result);
  3196. Inc(scan);
  3197. end;
  3198. OP_ANYOF:
  3199. while (Result < TheMax) and FindInCharClass(opnd, scan^, False) do
  3200. begin
  3201. Inc(Result);
  3202. Inc(scan);
  3203. end;
  3204. OP_ANYBUT:
  3205. while (Result < TheMax) and not FindInCharClass(opnd, scan^, False) do
  3206. begin
  3207. Inc(Result);
  3208. Inc(scan);
  3209. end;
  3210. OP_ANYOFCI:
  3211. while (Result < TheMax) and FindInCharClass(opnd, scan^, True) do
  3212. begin
  3213. Inc(Result);
  3214. Inc(scan);
  3215. end;
  3216. OP_ANYBUTCI:
  3217. while (Result < TheMax) and not FindInCharClass(opnd, scan^, True) do
  3218. begin
  3219. Inc(Result);
  3220. Inc(scan);
  3221. end;
  3222. else
  3223. begin // Oh dear. Called inappropriately.
  3224. Result := 0; // Best compromise.
  3225. Error(reeRegRepeatCalledInappropriately);
  3226. Exit;
  3227. end;
  3228. end; { of case }
  3229. reginput := scan;
  3230. end; { of function TRegExpr.regrepeat
  3231. -------------------------------------------------------------- }
  3232. function TRegExpr.regnext(p: PRegExprChar): PRegExprChar;
  3233. // dig the "next" pointer out of a node
  3234. var
  3235. offset: TRENextOff;
  3236. begin
  3237. if p = @regdummy then
  3238. begin
  3239. Result := nil;
  3240. Exit;
  3241. end;
  3242. offset := PRENextOff(AlignToPtr(p + REOpSz))^; // ###0.933 inlined NEXT
  3243. if offset = 0 then
  3244. Result := nil
  3245. else
  3246. Result := p + offset;
  3247. end; { of function TRegExpr.regnext
  3248. -------------------------------------------------------------- }
  3249. function TRegExpr.MatchPrim(prog: PRegExprChar): boolean;
  3250. // recursively matching routine
  3251. // Conceptually the strategy is simple: check to see whether the current
  3252. // node matches, call self recursively to see whether the rest matches,
  3253. // and then act accordingly. In practice we make some effort to avoid
  3254. // recursion, in particular by going through "ordinary" nodes (that don't
  3255. // need to know whether the rest of the match failed) by a loop instead of
  3256. // by recursion.
  3257. var
  3258. scan: PRegExprChar; // Current node.
  3259. next: PRegExprChar; // Next node.
  3260. Len: PtrInt;
  3261. opnd: PRegExprChar;
  3262. no: integer;
  3263. save: PRegExprChar;
  3264. nextch: REChar;
  3265. BracesMin, Bracesmax: integer;
  3266. // we use integer instead of TREBracesArg for better support */+
  3267. {$IFDEF ComplexBraces}
  3268. SavedLoopStack: TRegExprLoopStack; // :(( very bad for recursion
  3269. SavedLoopStackIdx: integer; // ###0.925
  3270. {$ENDIF}
  3271. bound1, bound2: boolean;
  3272. begin
  3273. Result := False;
  3274. {$IFDEF ComplexBraces}
  3275. SavedLoopStack:=Default(TRegExprLoopStack);
  3276. SavedLoopStackIdx:=0;
  3277. {$ENDIF}
  3278. scan := prog;
  3279. while scan <> nil do
  3280. begin
  3281. Len := PRENextOff(AlignToPtr(scan + 1))^; // ###0.932 inlined regnext
  3282. if Len = 0 then
  3283. next := nil
  3284. else
  3285. next := scan + Len;
  3286. case scan^ of
  3287. OP_NOTBOUND,
  3288. OP_BOUND:
  3289. begin
  3290. bound1 := (reginput = fInputStart) or not IsWordChar((reginput - 1)^);
  3291. bound2 := (reginput = fInputEnd) or not IsWordChar(reginput^);
  3292. if (scan^ = OP_BOUND) xor (bound1 <> bound2) then
  3293. Exit;
  3294. end;
  3295. OP_BOL:
  3296. begin
  3297. if reginput <> fInputStart then
  3298. Exit;
  3299. end;
  3300. OP_EOL:
  3301. begin
  3302. if reginput < fInputEnd then
  3303. Exit;
  3304. end;
  3305. OP_BOLML:
  3306. if reginput > fInputStart then
  3307. begin
  3308. nextch := (reginput - 1)^;
  3309. if (nextch <> fLinePairedSeparatorTail) or
  3310. ((reginput - 1) <= fInputStart) or
  3311. ((reginput - 2)^ <> fLinePairedSeparatorHead) then
  3312. begin
  3313. if (nextch = fLinePairedSeparatorHead) and
  3314. (reginput^ = fLinePairedSeparatorTail) then
  3315. Exit; // don't stop between paired separator
  3316. if not IsCustomLineSeparator(nextch) then
  3317. Exit;
  3318. end;
  3319. end;
  3320. OP_EOLML:
  3321. if reginput < fInputEnd then
  3322. begin
  3323. nextch := reginput^;
  3324. if (nextch <> fLinePairedSeparatorHead) or
  3325. ((reginput + 1)^ <> fLinePairedSeparatorTail) then
  3326. begin
  3327. if (nextch = fLinePairedSeparatorTail) and (reginput > fInputStart)
  3328. and ((reginput - 1)^ = fLinePairedSeparatorHead) then
  3329. Exit; // don't stop between paired separator
  3330. if not IsCustomLineSeparator(nextch) then
  3331. Exit;
  3332. end;
  3333. end;
  3334. OP_ANY:
  3335. begin
  3336. if reginput = fInputEnd then
  3337. Exit;
  3338. Inc(reginput);
  3339. end;
  3340. OP_ANYML:
  3341. begin // ###0.941
  3342. if (reginput = fInputEnd) or
  3343. ((reginput^ = fLinePairedSeparatorHead) and
  3344. ((reginput + 1)^ = fLinePairedSeparatorTail)) or
  3345. IsCustomLineSeparator(reginput^)
  3346. then
  3347. Exit;
  3348. Inc(reginput);
  3349. end;
  3350. OP_ANYDIGIT:
  3351. begin
  3352. if (reginput = fInputEnd) or not IsDigitChar(reginput^) then
  3353. Exit;
  3354. Inc(reginput);
  3355. end;
  3356. OP_NOTDIGIT:
  3357. begin
  3358. if (reginput = fInputEnd) or IsDigitChar(reginput^) then
  3359. Exit;
  3360. Inc(reginput);
  3361. end;
  3362. OP_ANYLETTER:
  3363. begin
  3364. if (reginput = fInputEnd) or not IsWordChar(reginput^) // ###0.943
  3365. then
  3366. Exit;
  3367. Inc(reginput);
  3368. end;
  3369. OP_NOTLETTER:
  3370. begin
  3371. if (reginput = fInputEnd) or IsWordChar(reginput^) // ###0.943
  3372. then
  3373. Exit;
  3374. Inc(reginput);
  3375. end;
  3376. OP_ANYSPACE:
  3377. begin
  3378. if (reginput = fInputEnd) or not IsSpaceChar(reginput^) // ###0.943
  3379. then
  3380. Exit;
  3381. Inc(reginput);
  3382. end;
  3383. OP_NOTSPACE:
  3384. begin
  3385. if (reginput = fInputEnd) or IsSpaceChar(reginput^) // ###0.943
  3386. then
  3387. Exit;
  3388. Inc(reginput);
  3389. end;
  3390. OP_ANYVERTSEP:
  3391. begin
  3392. if (reginput = fInputEnd) or not IsLineSeparator(reginput^) then
  3393. Exit;
  3394. Inc(reginput);
  3395. end;
  3396. OP_NOTVERTSEP:
  3397. begin
  3398. if (reginput = fInputEnd) or IsLineSeparator(reginput^) then
  3399. Exit;
  3400. Inc(reginput);
  3401. end;
  3402. OP_ANYHORZSEP:
  3403. begin
  3404. if (reginput = fInputEnd) or not IsHorzSeparator(reginput^) then
  3405. Exit;
  3406. Inc(reginput);
  3407. end;
  3408. OP_NOTHORZSEP:
  3409. begin
  3410. if (reginput = fInputEnd) or IsHorzSeparator(reginput^) then
  3411. Exit;
  3412. Inc(reginput);
  3413. end;
  3414. OP_EXACTLYCI:
  3415. begin
  3416. opnd := scan + REOpSz + RENextOffSz; // OPERAND
  3417. Len := PLongInt(opnd)^;
  3418. Inc(opnd, RENumberSz);
  3419. // Inline the first character, for speed.
  3420. if (opnd^ <> reginput^) and (InvertCase(opnd^) <> reginput^) then
  3421. Exit;
  3422. // ###0.929 begin
  3423. no := Len;
  3424. save := reginput;
  3425. while no > 1 do
  3426. begin
  3427. Inc(save);
  3428. Inc(opnd);
  3429. if (opnd^ <> save^) and (InvertCase(opnd^) <> save^) then
  3430. Exit;
  3431. Dec(no);
  3432. end;
  3433. // ###0.929 end
  3434. Inc(reginput, Len);
  3435. end;
  3436. OP_EXACTLY:
  3437. begin
  3438. opnd := scan + REOpSz + RENextOffSz; // OPERAND
  3439. Len := PLongInt(opnd)^;
  3440. Inc(opnd, RENumberSz);
  3441. // Inline the first character, for speed.
  3442. if opnd^ <> reginput^ then
  3443. Exit;
  3444. // ###0.929 begin
  3445. no := Len;
  3446. save := reginput;
  3447. while no > 1 do
  3448. begin
  3449. Inc(save);
  3450. Inc(opnd);
  3451. if opnd^ <> save^ then
  3452. Exit;
  3453. Dec(no);
  3454. end;
  3455. // ###0.929 end
  3456. Inc(reginput, Len);
  3457. end;
  3458. OP_BSUBEXP:
  3459. begin // ###0.936
  3460. no := Ord((scan + REOpSz + RENextOffSz)^);
  3461. no := GrpIndexes[no];
  3462. if no < 0 then
  3463. Exit;
  3464. if startp[no] = nil then
  3465. Exit;
  3466. if endp[no] = nil then
  3467. Exit;
  3468. save := reginput;
  3469. opnd := startp[no];
  3470. while opnd < endp[no] do
  3471. begin
  3472. if (save >= fInputEnd) or (save^ <> opnd^) then
  3473. Exit;
  3474. Inc(save);
  3475. Inc(opnd);
  3476. end;
  3477. reginput := save;
  3478. end;
  3479. OP_BSUBEXPCI:
  3480. begin // ###0.936
  3481. no := Ord((scan + REOpSz + RENextOffSz)^);
  3482. no := GrpIndexes[no];
  3483. if no < 0 then
  3484. Exit;
  3485. if startp[no] = nil then
  3486. Exit;
  3487. if endp[no] = nil then
  3488. Exit;
  3489. save := reginput;
  3490. opnd := startp[no];
  3491. while opnd < endp[no] do
  3492. begin
  3493. if (save >= fInputEnd) or
  3494. ((save^ <> opnd^) and (save^ <> InvertCase(opnd^))) then
  3495. Exit;
  3496. Inc(save);
  3497. Inc(opnd);
  3498. end;
  3499. reginput := save;
  3500. end;
  3501. OP_ANYOF:
  3502. begin
  3503. if (reginput = fInputEnd) or
  3504. not FindInCharClass(scan + REOpSz + RENextOffSz, reginput^, False) then
  3505. Exit;
  3506. Inc(reginput);
  3507. end;
  3508. OP_ANYBUT:
  3509. begin
  3510. if (reginput = fInputEnd) or
  3511. FindInCharClass(scan + REOpSz + RENextOffSz, reginput^, False) then
  3512. Exit;
  3513. Inc(reginput);
  3514. end;
  3515. OP_ANYOFCI:
  3516. begin
  3517. if (reginput = fInputEnd) or
  3518. not FindInCharClass(scan + REOpSz + RENextOffSz, reginput^, True) then
  3519. Exit;
  3520. Inc(reginput);
  3521. end;
  3522. OP_ANYBUTCI:
  3523. begin
  3524. if (reginput = fInputEnd) or
  3525. FindInCharClass(scan + REOpSz + RENextOffSz, reginput^, True) then
  3526. Exit;
  3527. Inc(reginput);
  3528. end;
  3529. OP_NOTHING:
  3530. ;
  3531. OP_COMMENT:
  3532. ;
  3533. OP_BACK:
  3534. ;
  3535. Succ(OP_OPEN) .. TREOp(Ord(OP_OPEN) + NSUBEXP - 1):
  3536. begin // ###0.929
  3537. no := Ord(scan^) - Ord(OP_OPEN);
  3538. // save := reginput;
  3539. save := startp[no]; // ###0.936
  3540. startp[no] := reginput; // ###0.936
  3541. Result := MatchPrim(next);
  3542. if not Result // ###0.936
  3543. then
  3544. startp[no] := save;
  3545. // if Result and (startp [no] = nil)
  3546. // then startp [no] := save;
  3547. // Don't set startp if some later invocation of the same
  3548. // parentheses already has.
  3549. Exit;
  3550. end;
  3551. Succ(OP_CLOSE) .. TREOp(Ord(OP_CLOSE) + NSUBEXP - 1):
  3552. begin // ###0.929
  3553. no := Ord(scan^) - Ord(OP_CLOSE);
  3554. // save := reginput;
  3555. save := endp[no]; // ###0.936
  3556. endp[no] := reginput; // ###0.936
  3557. Result := MatchPrim(next);
  3558. if not Result // ###0.936
  3559. then
  3560. endp[no] := save;
  3561. // if Result and (endp [no] = nil)
  3562. // then endp [no] := save;
  3563. // Don't set endp if some later invocation of the same
  3564. // parentheses already has.
  3565. Exit;
  3566. end;
  3567. OP_BRANCH:
  3568. begin
  3569. if (next^ <> OP_BRANCH) // No choice.
  3570. then
  3571. next := scan + REOpSz + RENextOffSz // Avoid recursion
  3572. else
  3573. begin
  3574. repeat
  3575. save := reginput;
  3576. Result := MatchPrim(scan + REOpSz + RENextOffSz);
  3577. if Result then
  3578. Exit;
  3579. reginput := save;
  3580. scan := regnext(scan);
  3581. until (scan = nil) or (scan^ <> OP_BRANCH);
  3582. Exit;
  3583. end;
  3584. end;
  3585. {$IFDEF ComplexBraces}
  3586. OP_LOOPENTRY:
  3587. begin // ###0.925
  3588. no := LoopStackIdx;
  3589. Inc(LoopStackIdx);
  3590. if LoopStackIdx > LoopStackMax then
  3591. begin
  3592. Error(reeLoopStackExceeded);
  3593. Exit;
  3594. end;
  3595. save := reginput;
  3596. LoopStack[LoopStackIdx] := 0; // init loop counter
  3597. Result := MatchPrim(next); // execute loop
  3598. LoopStackIdx := no; // cleanup
  3599. if Result then
  3600. Exit;
  3601. reginput := save;
  3602. Exit;
  3603. end;
  3604. OP_LOOP, OP_LOOPNG:
  3605. begin // ###0.940
  3606. if LoopStackIdx <= 0 then
  3607. begin
  3608. Error(reeLoopWithoutEntry);
  3609. Exit;
  3610. end;
  3611. opnd := scan + PRENextOff(AlignToPtr(scan + REOpSz + RENextOffSz + 2 * REBracesArgSz))^;
  3612. BracesMin := PREBracesArg(AlignToInt(scan + REOpSz + RENextOffSz))^;
  3613. Bracesmax := PREBracesArg(AlignToPtr(scan + REOpSz + RENextOffSz + REBracesArgSz))^;
  3614. save := reginput;
  3615. if LoopStack[LoopStackIdx] >= BracesMin then
  3616. begin // Min alredy matched - we can work
  3617. if scan^ = OP_LOOP then
  3618. begin
  3619. // greedy way - first try to max deep of greed ;)
  3620. if LoopStack[LoopStackIdx] < Bracesmax then
  3621. begin
  3622. Inc(LoopStack[LoopStackIdx]);
  3623. no := LoopStackIdx;
  3624. Result := MatchPrim(opnd);
  3625. LoopStackIdx := no;
  3626. if Result then
  3627. Exit;
  3628. reginput := save;
  3629. end;
  3630. Dec(LoopStackIdx); // Fail. May be we are too greedy? ;)
  3631. Result := MatchPrim(next);
  3632. if not Result then
  3633. reginput := save;
  3634. Exit;
  3635. end
  3636. else
  3637. begin
  3638. // non-greedy - try just now
  3639. Result := MatchPrim(next);
  3640. if Result then
  3641. Exit
  3642. else
  3643. reginput := save; // failed - move next and try again
  3644. if LoopStack[LoopStackIdx] < Bracesmax then
  3645. begin
  3646. Inc(LoopStack[LoopStackIdx]);
  3647. no := LoopStackIdx;
  3648. Result := MatchPrim(opnd);
  3649. LoopStackIdx := no;
  3650. if Result then
  3651. Exit;
  3652. reginput := save;
  3653. end;
  3654. Dec(LoopStackIdx); // Failed - back up
  3655. Exit;
  3656. end
  3657. end
  3658. else
  3659. begin // first match a min_cnt times
  3660. Inc(LoopStack[LoopStackIdx]);
  3661. no := LoopStackIdx;
  3662. Result := MatchPrim(opnd);
  3663. LoopStackIdx := no;
  3664. if Result then
  3665. Exit;
  3666. Dec(LoopStack[LoopStackIdx]);
  3667. reginput := save;
  3668. Exit;
  3669. end;
  3670. end;
  3671. {$ENDIF}
  3672. OP_STAR, OP_PLUS, OP_BRACES, OP_STARNG, OP_PLUSNG, OP_BRACESNG:
  3673. begin
  3674. // Lookahead to avoid useless match attempts when we know
  3675. // what character comes next.
  3676. nextch := #0;
  3677. if next^ = OP_EXACTLY then
  3678. nextch := (next + REOpSz + RENextOffSz + RENumberSz)^;
  3679. Bracesmax := MaxInt; // infinite loop for * and + //###0.92
  3680. if (scan^ = OP_STAR) or (scan^ = OP_STARNG) then
  3681. BracesMin := 0 // star
  3682. else if (scan^ = OP_PLUS) or (scan^ = OP_PLUSNG) then
  3683. BracesMin := 1 // plus
  3684. else
  3685. begin // braces
  3686. BracesMin := PREBracesArg(AlignToPtr(scan + REOpSz + RENextOffSz))^;
  3687. Bracesmax := PREBracesArg(AlignToPtr(scan + REOpSz + RENextOffSz + REBracesArgSz))^;
  3688. end;
  3689. save := reginput;
  3690. opnd := scan + REOpSz + RENextOffSz;
  3691. if (scan^ = OP_BRACES) or (scan^ = OP_BRACESNG) then
  3692. Inc(opnd, 2 * REBracesArgSz);
  3693. if (scan^ = OP_PLUSNG) or (scan^ = OP_STARNG) or (scan^ = OP_BRACESNG) then
  3694. begin
  3695. // non-greedy mode
  3696. Bracesmax := regrepeat(opnd, Bracesmax);
  3697. // don't repeat more than BracesMax
  3698. // Now we know real Max limit to move forward (for recursion 'back up')
  3699. // In some cases it can be faster to check only Min positions first,
  3700. // but after that we have to check every position separtely instead
  3701. // of fast scannig in loop.
  3702. no := BracesMin;
  3703. while no <= Bracesmax do
  3704. begin
  3705. reginput := save + no;
  3706. // If it could work, try it.
  3707. if (nextch = #0) or (reginput^ = nextch) then
  3708. begin
  3709. {$IFDEF ComplexBraces}
  3710. System.Move(LoopStack, SavedLoopStack, SizeOf(LoopStack));
  3711. // ###0.925
  3712. SavedLoopStackIdx := LoopStackIdx;
  3713. {$ENDIF}
  3714. if MatchPrim(next) then
  3715. begin
  3716. Result := True;
  3717. Exit;
  3718. end;
  3719. {$IFDEF ComplexBraces}
  3720. System.Move(SavedLoopStack, LoopStack, SizeOf(LoopStack));
  3721. LoopStackIdx := SavedLoopStackIdx;
  3722. {$ENDIF}
  3723. end;
  3724. Inc(no); // Couldn't or didn't - move forward.
  3725. end; { of while }
  3726. Exit;
  3727. end
  3728. else
  3729. begin // greedy mode
  3730. no := regrepeat(opnd, Bracesmax); // don't repeat more than max_cnt
  3731. while no >= BracesMin do
  3732. begin
  3733. // If it could work, try it.
  3734. if (nextch = #0) or (reginput^ = nextch) then
  3735. begin
  3736. {$IFDEF ComplexBraces}
  3737. System.Move(LoopStack, SavedLoopStack, SizeOf(LoopStack));
  3738. // ###0.925
  3739. SavedLoopStackIdx := LoopStackIdx;
  3740. {$ENDIF}
  3741. if MatchPrim(next) then
  3742. begin
  3743. Result := True;
  3744. Exit;
  3745. end;
  3746. {$IFDEF ComplexBraces}
  3747. System.Move(SavedLoopStack, LoopStack, SizeOf(LoopStack));
  3748. LoopStackIdx := SavedLoopStackIdx;
  3749. {$ENDIF}
  3750. end;
  3751. Dec(no); // Couldn't or didn't - back up.
  3752. reginput := save + no;
  3753. end; { of while }
  3754. Exit;
  3755. end;
  3756. end;
  3757. OP_EEND:
  3758. begin
  3759. Result := True; // Success!
  3760. Exit;
  3761. end;
  3762. else
  3763. begin
  3764. Error(reeMatchPrimMemoryCorruption);
  3765. Exit;
  3766. end;
  3767. end; { of case scan^ }
  3768. scan := next;
  3769. end; { of while scan <> nil }
  3770. // We get here only if there's trouble -- normally "case EEND" is the
  3771. // terminating point.
  3772. Error(reeMatchPrimCorruptedPointers);
  3773. end; { of function TRegExpr.MatchPrim
  3774. -------------------------------------------------------------- }
  3775. function TRegExpr.Exec(const AInputString: RegExprString): boolean;
  3776. begin
  3777. InputString := AInputString;
  3778. Result := ExecPrim(1, False, False);
  3779. end; { of function TRegExpr.Exec
  3780. -------------------------------------------------------------- }
  3781. function TRegExpr.Exec: boolean;
  3782. var
  3783. SlowChecks: boolean;
  3784. begin
  3785. SlowChecks := Length(fInputString) < fSlowChecksSizeMax;
  3786. Result := ExecPrim(1, False, SlowChecks);
  3787. end; { of function TRegExpr.Exec
  3788. -------------------------------------------------------------- }
  3789. function TRegExpr.Exec(AOffset: integer): boolean;
  3790. begin
  3791. Result := ExecPrim(AOffset, False, False);
  3792. end; { of function TRegExpr.Exec
  3793. -------------------------------------------------------------- }
  3794. function TRegExpr.ExecPos(AOffset: integer = 1): boolean;
  3795. begin
  3796. Result := ExecPrim(AOffset, False, False);
  3797. end; { of function TRegExpr.ExecPos
  3798. -------------------------------------------------------------- }
  3799. function TRegExpr.ExecPos(AOffset: integer; ATryOnce: boolean): boolean;
  3800. begin
  3801. Result := ExecPrim(AOffset, ATryOnce, False);
  3802. end;
  3803. function TRegExpr.MatchAtOnePos(APos: PRegExprChar): boolean;
  3804. begin
  3805. reginput := APos;
  3806. Result := MatchPrim(programm + REOpSz);
  3807. if Result then
  3808. begin
  3809. startp[0] := APos;
  3810. endp[0] := reginput;
  3811. end;
  3812. end;
  3813. procedure TRegExpr.ClearMatches;
  3814. begin
  3815. FillChar(startp, SizeOf(startp), 0);
  3816. FillChar(endp, SizeOf(endp), 0);
  3817. end;
  3818. procedure TRegExpr.ClearInternalIndexes;
  3819. var
  3820. i: integer;
  3821. begin
  3822. FillChar(startp, SizeOf(startp), 0);
  3823. FillChar(endp, SizeOf(endp), 0);
  3824. for i := 0 to NSUBEXP - 1 do
  3825. GrpIndexes[i] := -1;
  3826. GrpIndexes[0] := 0;
  3827. GrpCount := 0;
  3828. end;
  3829. function TRegExpr.ExecPrim(AOffset: integer; ATryOnce, ASlowChecks: boolean): boolean;
  3830. var
  3831. Ptr: PRegExprChar;
  3832. begin
  3833. Result := False;
  3834. // Ensure that Match cleared either if optimization tricks or some error
  3835. // will lead to leaving ExecPrim without actual search. That is
  3836. // important for ExecNext logic and so on.
  3837. ClearMatches;
  3838. // Don't check IsProgrammOk here! it causes big slowdown in test_benchmark!
  3839. if programm = nil then
  3840. begin
  3841. Compile;
  3842. if programm = nil then
  3843. Exit;
  3844. end;
  3845. // Check InputString presence
  3846. if fInputString = '' then
  3847. begin
  3848. Error(reeNoInputStringSpecified);
  3849. Exit;
  3850. end;
  3851. // Check that the start position is not negative
  3852. if AOffset < 1 then
  3853. begin
  3854. Error(reeOffsetMustBePositive);
  3855. Exit;
  3856. end;
  3857. // Check that the start position is not longer than the line
  3858. // If so then exit with nothing found
  3859. if AOffset > (Length(fInputString) + 1) // for matching empty string after last char.
  3860. then
  3861. Exit;
  3862. Ptr := fInputStart + AOffset - 1;
  3863. // If there is a "must appear" string, look for it.
  3864. if ASlowChecks then
  3865. if regmustString <> '' then
  3866. if Pos(regmustString, fInputString) = 0 then Exit;
  3867. {$IFDEF ComplexBraces}
  3868. // no loops started
  3869. LoopStackIdx := 0; // ###0.925
  3870. {$ENDIF}
  3871. // ATryOnce or anchored match (it needs to be tried only once).
  3872. if ATryOnce or (reganchored <> #0) then
  3873. begin
  3874. {$IFDEF UseFirstCharSet}
  3875. {$IFDEF UniCode}
  3876. if Ord(Ptr^) <= $FF then
  3877. {$ENDIF}
  3878. if not FirstCharArray[byte(Ptr^)] then
  3879. Exit;
  3880. {$ENDIF}
  3881. Result := MatchAtOnePos(Ptr);
  3882. Exit;
  3883. end;
  3884. // Messy cases: unanchored match.
  3885. Dec(Ptr);
  3886. repeat
  3887. Inc(Ptr);
  3888. if Ptr > fInputEnd then
  3889. Exit;
  3890. {$IFDEF UseFirstCharSet}
  3891. {$IFDEF UniCode}
  3892. if Ord(Ptr^) <= $FF then
  3893. {$ENDIF}
  3894. if not FirstCharArray[byte(Ptr^)] then
  3895. Continue;
  3896. {$ENDIF}
  3897. Result := MatchAtOnePos(Ptr);
  3898. // Exit on a match or after testing the end-of-string
  3899. if Result then
  3900. Exit;
  3901. until False;
  3902. end; { of function TRegExpr.ExecPrim
  3903. -------------------------------------------------------------- }
  3904. function TRegExpr.ExecNext: boolean;
  3905. var
  3906. PtrBegin, PtrEnd: PRegExprChar;
  3907. Offset: PtrInt;
  3908. begin
  3909. PtrBegin := startp[0];
  3910. PtrEnd := endp[0];
  3911. if (PtrBegin = nil) or (PtrEnd = nil) then
  3912. begin
  3913. Error(reeExecNextWithoutExec);
  3914. Result := False;
  3915. Exit;
  3916. end;
  3917. Offset := PtrEnd - fInputStart + 1;
  3918. // prevent infinite looping if empty string matches r.e.
  3919. if PtrBegin = PtrEnd then
  3920. Inc(Offset);
  3921. Result := ExecPrim(Offset, False, False);
  3922. end; { of function TRegExpr.ExecNext
  3923. -------------------------------------------------------------- }
  3924. procedure TRegExpr.SetInputString(const AInputString: RegExprString);
  3925. begin
  3926. ClearMatches;
  3927. fInputString := AInputString;
  3928. UniqueString(fInputString);
  3929. fInputStart := PRegExprChar(fInputString);
  3930. fInputEnd := fInputStart + Length(fInputString);
  3931. end; { of procedure TRegExpr.SetInputString
  3932. -------------------------------------------------------------- }
  3933. procedure TRegExpr.SetLineSeparators(const AStr: RegExprString);
  3934. begin
  3935. if AStr <> fLineSeparators then
  3936. begin
  3937. fLineSeparators := AStr;
  3938. InitLineSepArray;
  3939. InvalidateProgramm;
  3940. end;
  3941. end; { of procedure TRegExpr.SetLineSeparators
  3942. -------------------------------------------------------------- }
  3943. procedure TRegExpr.SetLinePairedSeparator(const AStr: RegExprString);
  3944. begin
  3945. if Length(AStr) = 2 then
  3946. begin
  3947. if AStr[1] = AStr[2] then
  3948. begin
  3949. // it's impossible for our 'one-point' checking to support
  3950. // two chars separator for identical chars
  3951. Error(reeBadLinePairedSeparator);
  3952. Exit;
  3953. end;
  3954. if not fLinePairedSeparatorAssigned or (AStr[1] <> fLinePairedSeparatorHead)
  3955. or (AStr[2] <> fLinePairedSeparatorTail) then
  3956. begin
  3957. fLinePairedSeparatorAssigned := True;
  3958. fLinePairedSeparatorHead := AStr[1];
  3959. fLinePairedSeparatorTail := AStr[2];
  3960. InvalidateProgramm;
  3961. end;
  3962. end
  3963. else if Length(AStr) = 0 then
  3964. begin
  3965. if fLinePairedSeparatorAssigned then
  3966. begin
  3967. fLinePairedSeparatorAssigned := False;
  3968. InvalidateProgramm;
  3969. end;
  3970. end
  3971. else
  3972. Error(reeBadLinePairedSeparator);
  3973. end; { of procedure TRegExpr.SetLinePairedSeparator
  3974. -------------------------------------------------------------- }
  3975. function TRegExpr.GetLinePairedSeparator: RegExprString;
  3976. begin
  3977. if fLinePairedSeparatorAssigned then
  3978. begin
  3979. {$IFDEF UniCode}
  3980. // Here is some UniCode 'magic'
  3981. // If You do know better decision to concatenate
  3982. // two WideChars, please, let me know!
  3983. Result := fLinePairedSeparatorHead; // ###0.947
  3984. Result := Result + fLinePairedSeparatorTail;
  3985. {$ELSE}
  3986. Result := fLinePairedSeparatorHead + fLinePairedSeparatorTail;
  3987. {$ENDIF}
  3988. end
  3989. else
  3990. Result := '';
  3991. end; { of function TRegExpr.GetLinePairedSeparator
  3992. -------------------------------------------------------------- }
  3993. function TRegExpr.Substitute(const ATemplate: RegExprString): RegExprString;
  3994. // perform substitutions after a regexp match
  3995. var
  3996. TemplateBeg, TemplateEnd: PRegExprChar;
  3997. function ParseVarName(var APtr: PRegExprChar): integer;
  3998. // extract name of variable (digits, may be enclosed with
  3999. // curly braces) from APtr^, uses TemplateEnd !!!
  4000. var
  4001. p: PRegExprChar;
  4002. Delimited: boolean;
  4003. begin
  4004. Result := 0;
  4005. p := APtr;
  4006. Delimited := (p < TemplateEnd) and (p^ = '{');
  4007. if Delimited then
  4008. Inc(p); // skip left curly brace
  4009. if (p < TemplateEnd) and (p^ = '&') then
  4010. Inc(p) // this is '$&' or '${&}'
  4011. else
  4012. while (p < TemplateEnd) and IsDigitChar(p^) do
  4013. begin
  4014. Result := Result * 10 + (Ord(p^) - Ord('0')); // ###0.939
  4015. Inc(p);
  4016. end;
  4017. if Delimited then
  4018. if (p < TemplateEnd) and (p^ = '}') then
  4019. Inc(p) // skip right curly brace
  4020. else
  4021. p := APtr; // isn't properly terminated
  4022. if p = APtr then
  4023. Result := -1; // no valid digits found or no right curly brace
  4024. APtr := p;
  4025. end;
  4026. type
  4027. TSubstMode = (smodeNormal, smodeOneUpper, smodeOneLower, smodeAllUpper, smodeAllLower);
  4028. var
  4029. Mode: TSubstMode;
  4030. p, p0, p1, ResultPtr: PRegExprChar;
  4031. ResultLen, n: integer;
  4032. Ch, QuotedChar: REChar;
  4033. begin
  4034. // Check programm and input string
  4035. if not IsProgrammOk then
  4036. Exit;
  4037. if fInputString = '' then
  4038. begin
  4039. Error(reeNoInputStringSpecified);
  4040. Exit;
  4041. end;
  4042. // Prepare for working
  4043. if ATemplate = '' then
  4044. begin // prevent nil pointers
  4045. Result := '';
  4046. Exit;
  4047. end;
  4048. TemplateBeg := PRegExprChar(ATemplate);
  4049. TemplateEnd := TemplateBeg + Length(ATemplate);
  4050. // Count result length for speed optimization.
  4051. ResultLen := 0;
  4052. p := TemplateBeg;
  4053. while p < TemplateEnd do
  4054. begin
  4055. Ch := p^;
  4056. Inc(p);
  4057. if Ch = '$' then
  4058. n := GrpIndexes[ParseVarName(p)]
  4059. else
  4060. n := -1;
  4061. if n >= 0 then
  4062. begin
  4063. Inc(ResultLen, endp[n] - startp[n]);
  4064. end
  4065. else
  4066. begin
  4067. if (Ch = EscChar) and (p < TemplateEnd) then
  4068. begin // quoted or special char followed
  4069. Ch := p^;
  4070. Inc(p);
  4071. case Ch of
  4072. 'n':
  4073. Inc(ResultLen, Length(FReplaceLineEnd));
  4074. 'u', 'l', 'U', 'L': { nothing }
  4075. ;
  4076. 'x':
  4077. begin
  4078. Inc(ResultLen);
  4079. if (p^ = '{') then
  4080. begin // skip \x{....}
  4081. while ((p^ <> '}') and (p < TemplateEnd)) do
  4082. p := p + 1;
  4083. p := p + 1;
  4084. end
  4085. else
  4086. p := p + 2 // skip \x..
  4087. end;
  4088. else
  4089. Inc(ResultLen);
  4090. end;
  4091. end
  4092. else
  4093. Inc(ResultLen);
  4094. end;
  4095. end;
  4096. // Get memory. We do it once and it significant speed up work !
  4097. if ResultLen = 0 then
  4098. begin
  4099. Result := '';
  4100. Exit;
  4101. end;
  4102. SetLength(Result, ResultLen);
  4103. // Fill Result
  4104. ResultPtr := Pointer(Result);
  4105. p := TemplateBeg;
  4106. Mode := smodeNormal;
  4107. while p < TemplateEnd do
  4108. begin
  4109. Ch := p^;
  4110. p0 := p;
  4111. Inc(p);
  4112. p1 := p;
  4113. if Ch = '$' then
  4114. n := GrpIndexes[ParseVarName(p)]
  4115. else
  4116. n := -1;
  4117. if (n >= 0) then
  4118. begin
  4119. p0 := startp[n];
  4120. p1 := endp[n];
  4121. end
  4122. else
  4123. begin
  4124. if (Ch = EscChar) and (p < TemplateEnd) then
  4125. begin // quoted or special char followed
  4126. Ch := p^;
  4127. Inc(p);
  4128. case Ch of
  4129. 'n':
  4130. begin
  4131. p0 := PRegExprChar(FReplaceLineEnd);
  4132. p1 := p0 + Length(FReplaceLineEnd);
  4133. end;
  4134. 'x', 't', 'r', 'f', 'a', 'e':
  4135. begin
  4136. p := p - 1;
  4137. // UnquoteChar expects the escaped char under the pointer
  4138. QuotedChar := UnQuoteChar(p);
  4139. p := p + 1;
  4140. // Skip after last part of the escaped sequence - UnquoteChar stops on the last symbol of it
  4141. p0 := @QuotedChar;
  4142. p1 := p0 + 1;
  4143. end;
  4144. 'l':
  4145. begin
  4146. Mode := smodeOneLower;
  4147. p1 := p0;
  4148. end;
  4149. 'L':
  4150. begin
  4151. Mode := smodeAllLower;
  4152. p1 := p0;
  4153. end;
  4154. 'u':
  4155. begin
  4156. Mode := smodeOneUpper;
  4157. p1 := p0;
  4158. end;
  4159. 'U':
  4160. begin
  4161. Mode := smodeAllUpper;
  4162. p1 := p0;
  4163. end;
  4164. else
  4165. begin
  4166. Inc(p0);
  4167. Inc(p1);
  4168. end;
  4169. end;
  4170. end
  4171. end;
  4172. if p0 < p1 then
  4173. begin
  4174. while p0 < p1 do
  4175. begin
  4176. case Mode of
  4177. smodeOneLower:
  4178. begin
  4179. ResultPtr^ := _LowerCase(p0^);
  4180. Mode := smodeNormal;
  4181. end;
  4182. smodeAllLower:
  4183. begin
  4184. ResultPtr^ := _LowerCase(p0^);
  4185. end;
  4186. smodeOneUpper:
  4187. begin
  4188. ResultPtr^ := _UpperCase(p0^);
  4189. Mode := smodeNormal;
  4190. end;
  4191. smodeAllUpper:
  4192. begin
  4193. ResultPtr^ := _UpperCase(p0^);
  4194. end;
  4195. else
  4196. ResultPtr^ := p0^;
  4197. end;
  4198. Inc(ResultPtr);
  4199. Inc(p0);
  4200. end;
  4201. Mode := smodeNormal;
  4202. end;
  4203. end;
  4204. end; { of function TRegExpr.Substitute
  4205. -------------------------------------------------------------- }
  4206. procedure TRegExpr.Split(const AInputStr: RegExprString; APieces: TStrings);
  4207. var
  4208. PrevPos: PtrInt;
  4209. begin
  4210. PrevPos := 1;
  4211. if Exec(AInputStr) then
  4212. repeat
  4213. APieces.Add(System.Copy(AInputStr, PrevPos, MatchPos[0] - PrevPos));
  4214. PrevPos := MatchPos[0] + MatchLen[0];
  4215. until not ExecNext;
  4216. APieces.Add(System.Copy(AInputStr, PrevPos, MaxInt)); // Tail
  4217. end; { of procedure TRegExpr.Split
  4218. -------------------------------------------------------------- }
  4219. function TRegExpr.Replace(const AInputStr: RegExprString;
  4220. const AReplaceStr: RegExprString;
  4221. AUseSubstitution: boolean = False): RegExprString;
  4222. var
  4223. PrevPos: PtrInt;
  4224. begin
  4225. Result := '';
  4226. PrevPos := 1;
  4227. if Exec(AInputStr) then
  4228. repeat
  4229. Result := Result + System.Copy(AInputStr, PrevPos, MatchPos[0] - PrevPos);
  4230. if AUseSubstitution // ###0.946
  4231. then
  4232. Result := Result + Substitute(AReplaceStr)
  4233. else
  4234. Result := Result + AReplaceStr;
  4235. PrevPos := MatchPos[0] + MatchLen[0];
  4236. until not ExecNext;
  4237. Result := Result + System.Copy(AInputStr, PrevPos, MaxInt); // Tail
  4238. end; { of function TRegExpr.Replace
  4239. -------------------------------------------------------------- }
  4240. function TRegExpr.ReplaceEx(const AInputStr: RegExprString;
  4241. AReplaceFunc: TRegExprReplaceFunction): RegExprString;
  4242. var
  4243. PrevPos: PtrInt;
  4244. begin
  4245. Result := '';
  4246. PrevPos := 1;
  4247. if Exec(AInputStr) then
  4248. repeat
  4249. Result := Result + System.Copy(AInputStr, PrevPos, MatchPos[0] - PrevPos)
  4250. + AReplaceFunc(Self);
  4251. PrevPos := MatchPos[0] + MatchLen[0];
  4252. until not ExecNext;
  4253. Result := Result + System.Copy(AInputStr, PrevPos, MaxInt); // Tail
  4254. end; { of function TRegExpr.ReplaceEx
  4255. -------------------------------------------------------------- }
  4256. function TRegExpr.Replace(const AInputStr: RegExprString;
  4257. AReplaceFunc: TRegExprReplaceFunction): RegExprString;
  4258. begin
  4259. Result := ReplaceEx(AInputStr, AReplaceFunc);
  4260. end; { of function TRegExpr.Replace
  4261. -------------------------------------------------------------- }
  4262. { ============================================================= }
  4263. { ====================== Debug section ======================== }
  4264. { ============================================================= }
  4265. {$IFDEF UseFirstCharSet}
  4266. procedure TRegExpr.FillFirstCharSet(prog: PRegExprChar);
  4267. var
  4268. scan: PRegExprChar; // Current node.
  4269. Next: PRegExprChar; // Next node.
  4270. opnd: PRegExprChar;
  4271. Oper: TREOp;
  4272. ch: REChar;
  4273. min_cnt, i: integer;
  4274. TempSet: TRegExprCharset;
  4275. begin
  4276. TempSet := [];
  4277. scan := prog;
  4278. while scan <> nil do
  4279. begin
  4280. Next := regnext(scan);
  4281. Oper := PREOp(scan)^;
  4282. case Oper of
  4283. OP_BSUBEXP,
  4284. OP_BSUBEXPCI:
  4285. begin
  4286. // we cannot optimize r.e. if it starts with back reference
  4287. FirstCharSet := RegExprAllSet; //###0.930
  4288. Exit;
  4289. end;
  4290. OP_BOL,
  4291. OP_BOLML:
  4292. ; // Exit; //###0.937
  4293. OP_EOL,
  4294. OP_EOLML:
  4295. begin //###0.948 was empty in 0.947, was EXIT in 0.937
  4296. Include(FirstCharSet, 0);
  4297. if ModifierM then
  4298. for i := 1 to Length(LineSeparators) do
  4299. Include(FirstCharSet, byte(LineSeparators[i]));
  4300. Exit;
  4301. end;
  4302. OP_BOUND,
  4303. OP_NOTBOUND:
  4304. ; //###0.943 ?!!
  4305. OP_ANY,
  4306. OP_ANYML:
  4307. begin // we can better define ANYML !!!
  4308. FirstCharSet := RegExprAllSet; //###0.930
  4309. Exit;
  4310. end;
  4311. OP_ANYDIGIT:
  4312. begin
  4313. FirstCharSet := FirstCharSet + RegExprDigitSet;
  4314. Exit;
  4315. end;
  4316. OP_NOTDIGIT:
  4317. begin
  4318. FirstCharSet := FirstCharSet + (RegExprAllSet - RegExprDigitSet);
  4319. Exit;
  4320. end;
  4321. OP_ANYLETTER:
  4322. begin
  4323. GetCharSetFromWordChars(TempSet);
  4324. FirstCharSet := FirstCharSet + TempSet;
  4325. Exit;
  4326. end;
  4327. OP_NOTLETTER:
  4328. begin
  4329. GetCharSetFromWordChars(TempSet);
  4330. FirstCharSet := FirstCharSet + (RegExprAllSet - TempSet);
  4331. Exit;
  4332. end;
  4333. OP_ANYSPACE:
  4334. begin
  4335. GetCharSetFromSpaceChars(TempSet);
  4336. FirstCharSet := FirstCharSet + TempSet;
  4337. Exit;
  4338. end;
  4339. OP_NOTSPACE:
  4340. begin
  4341. GetCharSetFromSpaceChars(TempSet);
  4342. FirstCharSet := FirstCharSet + (RegExprAllSet - TempSet);
  4343. Exit;
  4344. end;
  4345. OP_ANYVERTSEP:
  4346. begin
  4347. FirstCharSet := FirstCharSet + RegExprLineSeparatorsSet;
  4348. Exit;
  4349. end;
  4350. OP_NOTVERTSEP:
  4351. begin
  4352. FirstCharSet := FirstCharSet + (RegExprAllSet - RegExprLineSeparatorsSet);
  4353. Exit;
  4354. end;
  4355. OP_ANYHORZSEP:
  4356. begin
  4357. FirstCharSet := FirstCharSet + RegExprHorzSeparatorsSet;
  4358. Exit;
  4359. end;
  4360. OP_NOTHORZSEP:
  4361. begin
  4362. FirstCharSet := FirstCharSet + (RegExprAllSet - RegExprHorzSeparatorsSet);
  4363. Exit;
  4364. end;
  4365. OP_EXACTLYCI:
  4366. begin
  4367. ch := (scan + REOpSz + RENextOffSz + RENumberSz)^;
  4368. {$IFDEF UniCode}
  4369. if Ord(ch) <= $FF then
  4370. {$ENDIF}
  4371. begin
  4372. Include(FirstCharSet, byte(ch));
  4373. Include(FirstCharSet, byte(InvertCase(ch)));
  4374. end;
  4375. Exit;
  4376. end;
  4377. OP_EXACTLY:
  4378. begin
  4379. ch := (scan + REOpSz + RENextOffSz + RENumberSz)^;
  4380. {$IFDEF UniCode}
  4381. if Ord(ch) <= $FF then
  4382. {$ENDIF}
  4383. Include(FirstCharSet, byte(ch));
  4384. Exit;
  4385. end;
  4386. OP_ANYOF:
  4387. begin
  4388. GetCharSetFromCharClass(scan + REOpSz + RENextOffSz, False, TempSet);
  4389. FirstCharSet := FirstCharSet + TempSet;
  4390. Exit;
  4391. end;
  4392. OP_ANYBUT:
  4393. begin
  4394. GetCharSetFromCharClass(scan + REOpSz + RENextOffSz, False, TempSet);
  4395. FirstCharSet := FirstCharSet + (RegExprAllSet - TempSet);
  4396. Exit;
  4397. end;
  4398. OP_ANYOFCI:
  4399. begin
  4400. GetCharSetFromCharClass(scan + REOpSz + RENextOffSz, True, TempSet);
  4401. FirstCharSet := FirstCharSet + TempSet;
  4402. Exit;
  4403. end;
  4404. OP_ANYBUTCI:
  4405. begin
  4406. GetCharSetFromCharClass(scan + REOpSz + RENextOffSz, True, TempSet);
  4407. FirstCharSet := FirstCharSet + (RegExprAllSet - TempSet);
  4408. Exit;
  4409. end;
  4410. OP_NOTHING:
  4411. ;
  4412. OP_COMMENT:
  4413. ;
  4414. OP_BACK:
  4415. ;
  4416. Succ(OP_OPEN) .. TREOp(Ord(OP_OPEN) + NSUBEXP - 1):
  4417. begin //###0.929
  4418. FillFirstCharSet(Next);
  4419. Exit;
  4420. end;
  4421. Succ(OP_CLOSE) .. TREOp(Ord(OP_CLOSE) + NSUBEXP - 1):
  4422. begin //###0.929
  4423. FillFirstCharSet(Next);
  4424. Exit;
  4425. end;
  4426. OP_BRANCH:
  4427. begin
  4428. if (PREOp(Next)^ <> OP_BRANCH) // No choice.
  4429. then
  4430. Next := scan + REOpSz + RENextOffSz // Avoid recursion.
  4431. else
  4432. begin
  4433. repeat
  4434. FillFirstCharSet(scan + REOpSz + RENextOffSz);
  4435. scan := regnext(scan);
  4436. until (scan = nil) or (PREOp(scan)^ <> OP_BRANCH);
  4437. Exit;
  4438. end;
  4439. end;
  4440. {$IFDEF ComplexBraces}
  4441. OP_LOOPENTRY:
  4442. begin //###0.925
  4443. //LoopStack [LoopStackIdx] := 0; //###0.940 line removed
  4444. FillFirstCharSet(Next); // execute LOOP
  4445. Exit;
  4446. end;
  4447. OP_LOOP,
  4448. OP_LOOPNG:
  4449. begin //###0.940
  4450. opnd := scan + PRENextOff(AlignToPtr(scan + REOpSz + RENextOffSz + REBracesArgSz * 2))^;
  4451. min_cnt := PREBracesArg(AlignToPtr(scan + REOpSz + RENextOffSz))^;
  4452. FillFirstCharSet(opnd);
  4453. if min_cnt = 0 then
  4454. FillFirstCharSet(Next);
  4455. Exit;
  4456. end;
  4457. {$ENDIF}
  4458. OP_STAR,
  4459. OP_STARNG: //###0.940
  4460. FillFirstCharSet(scan + REOpSz + RENextOffSz);
  4461. OP_PLUS,
  4462. OP_PLUSNG:
  4463. begin //###0.940
  4464. FillFirstCharSet(scan + REOpSz + RENextOffSz);
  4465. Exit;
  4466. end;
  4467. OP_BRACES,
  4468. OP_BRACESNG:
  4469. begin //###0.940
  4470. opnd := scan + REOpSz + RENextOffSz + REBracesArgSz * 2;
  4471. min_cnt := PREBracesArg(AlignToPtr(scan + REOpSz + RENextOffSz))^; // BRACES
  4472. FillFirstCharSet(opnd);
  4473. if min_cnt > 0 then
  4474. Exit;
  4475. end;
  4476. OP_EEND:
  4477. begin
  4478. FirstCharSet := RegExprAllSet; //###0.948
  4479. Exit;
  4480. end;
  4481. else
  4482. begin
  4483. fLastErrorOpcode := Oper;
  4484. Error(reeUnknownOpcodeInFillFirst);
  4485. Exit;
  4486. end;
  4487. end; { of case scan^}
  4488. scan := Next;
  4489. end; { of while scan <> nil}
  4490. end; { of procedure FillFirstCharSet
  4491. --------------------------------------------------------------}
  4492. {$ENDIF}
  4493. procedure TRegExpr.InitCharCheckers;
  4494. var
  4495. Cnt: integer;
  4496. //
  4497. function Add(AChecker: TRegExprCharChecker): byte;
  4498. begin
  4499. Inc(Cnt);
  4500. if Cnt > High(CharCheckers) then
  4501. raise Exception.Create('Too small CharCheckers array');
  4502. CharCheckers[Cnt - 1] := AChecker;
  4503. Result := Cnt - 1;
  4504. end;
  4505. //
  4506. begin
  4507. Cnt := 0;
  4508. FillChar(CharCheckers, SizeOf(CharCheckers), 0);
  4509. CheckerIndex_Word := Add(CharChecker_Word);
  4510. CheckerIndex_NotWord := Add(CharChecker_NotWord);
  4511. CheckerIndex_Space := Add(CharChecker_Space);
  4512. CheckerIndex_NotSpace := Add(CharChecker_NotSpace);
  4513. CheckerIndex_Digit := Add(CharChecker_Digit);
  4514. CheckerIndex_NotDigit := Add(CharChecker_NotDigit);
  4515. CheckerIndex_VertSep := Add(CharChecker_VertSep);
  4516. CheckerIndex_NotVertSep := Add(CharChecker_NotVertSep);
  4517. CheckerIndex_HorzSep := Add(CharChecker_HorzSep);
  4518. CheckerIndex_NotHorzSep := Add(CharChecker_NotHorzSep);
  4519. //CheckerIndex_AllAZ := Add(CharChecker_AllAZ);
  4520. CheckerIndex_LowerAZ := Add(CharChecker_LowerAZ);
  4521. CheckerIndex_UpperAZ := Add(CharChecker_UpperAZ);
  4522. SetLength(CharCheckerInfos, 3);
  4523. with CharCheckerInfos[0] do
  4524. begin
  4525. CharBegin := 'a';
  4526. CharEnd:= 'z';
  4527. CheckerIndex := CheckerIndex_LowerAZ;
  4528. end;
  4529. with CharCheckerInfos[1] do
  4530. begin
  4531. CharBegin := 'A';
  4532. CharEnd := 'Z';
  4533. CheckerIndex := CheckerIndex_UpperAZ;
  4534. end;
  4535. with CharCheckerInfos[2] do
  4536. begin
  4537. CharBegin := '0';
  4538. CharEnd := '9';
  4539. CheckerIndex := CheckerIndex_Digit;
  4540. end;
  4541. end;
  4542. function TRegExpr.CharChecker_Word(ch: REChar): boolean;
  4543. begin
  4544. Result := IsWordChar(ch);
  4545. end;
  4546. function TRegExpr.CharChecker_NotWord(ch: REChar): boolean;
  4547. begin
  4548. Result := not IsWordChar(ch);
  4549. end;
  4550. function TRegExpr.CharChecker_Space(ch: REChar): boolean;
  4551. begin
  4552. Result := IsSpaceChar(ch);
  4553. end;
  4554. function TRegExpr.CharChecker_NotSpace(ch: REChar): boolean;
  4555. begin
  4556. Result := not IsSpaceChar(ch);
  4557. end;
  4558. function TRegExpr.CharChecker_Digit(ch: REChar): boolean;
  4559. begin
  4560. Result := IsDigitChar(ch);
  4561. end;
  4562. function TRegExpr.CharChecker_NotDigit(ch: REChar): boolean;
  4563. begin
  4564. Result := not IsDigitChar(ch);
  4565. end;
  4566. function TRegExpr.CharChecker_VertSep(ch: REChar): boolean;
  4567. begin
  4568. Result := IsLineSeparator(ch);
  4569. end;
  4570. function TRegExpr.CharChecker_NotVertSep(ch: REChar): boolean;
  4571. begin
  4572. Result := not IsLineSeparator(ch);
  4573. end;
  4574. function TRegExpr.CharChecker_HorzSep(ch: REChar): boolean;
  4575. begin
  4576. Result := IsHorzSeparator(ch);
  4577. end;
  4578. function TRegExpr.CharChecker_NotHorzSep(ch: REChar): boolean;
  4579. begin
  4580. Result := not IsHorzSeparator(ch);
  4581. end;
  4582. function TRegExpr.CharChecker_LowerAZ(ch: REChar): boolean;
  4583. begin
  4584. case ch of
  4585. 'a' .. 'z':
  4586. Result := True;
  4587. else
  4588. Result := False;
  4589. end;
  4590. end;
  4591. function TRegExpr.CharChecker_UpperAZ(ch: REChar): boolean;
  4592. begin
  4593. case ch of
  4594. 'A' .. 'Z':
  4595. Result := True;
  4596. else
  4597. Result := False;
  4598. end;
  4599. end;
  4600. {$IFDEF RegExpPCodeDump}
  4601. function TRegExpr.DumpOp(op: TREOp): RegExprString;
  4602. // printable representation of opcode
  4603. begin
  4604. case op of
  4605. OP_BOL:
  4606. Result := 'BOL';
  4607. OP_EOL:
  4608. Result := 'EOL';
  4609. OP_BOLML:
  4610. Result := 'BOLML';
  4611. OP_EOLML:
  4612. Result := 'EOLML';
  4613. OP_BOUND:
  4614. Result := 'BOUND'; // ###0.943
  4615. OP_NOTBOUND:
  4616. Result := 'NOTBOUND'; // ###0.943
  4617. OP_ANY:
  4618. Result := 'ANY';
  4619. OP_ANYML:
  4620. Result := 'ANYML'; // ###0.941
  4621. OP_ANYLETTER:
  4622. Result := 'ANYLETTER';
  4623. OP_NOTLETTER:
  4624. Result := 'NOTLETTER';
  4625. OP_ANYDIGIT:
  4626. Result := 'ANYDIGIT';
  4627. OP_NOTDIGIT:
  4628. Result := 'NOTDIGIT';
  4629. OP_ANYSPACE:
  4630. Result := 'ANYSPACE';
  4631. OP_NOTSPACE:
  4632. Result := 'NOTSPACE';
  4633. OP_ANYHORZSEP:
  4634. Result := 'ANYHORZSEP';
  4635. OP_NOTHORZSEP:
  4636. Result := 'NOTHORZSEP';
  4637. OP_ANYVERTSEP:
  4638. Result := 'ANYVERTSEP';
  4639. OP_NOTVERTSEP:
  4640. Result := 'NOTVERTSEP';
  4641. OP_ANYOF:
  4642. Result := 'ANYOF';
  4643. OP_ANYBUT:
  4644. Result := 'ANYBUT';
  4645. OP_ANYOFCI:
  4646. Result := 'ANYOF/CI';
  4647. OP_ANYBUTCI:
  4648. Result := 'ANYBUT/CI';
  4649. OP_BRANCH:
  4650. Result := 'BRANCH';
  4651. OP_EXACTLY:
  4652. Result := 'EXACTLY';
  4653. OP_EXACTLYCI:
  4654. Result := 'EXACTLY/CI';
  4655. OP_NOTHING:
  4656. Result := 'NOTHING';
  4657. OP_COMMENT:
  4658. Result := 'COMMENT';
  4659. OP_BACK:
  4660. Result := 'BACK';
  4661. OP_EEND:
  4662. Result := 'END';
  4663. OP_BSUBEXP:
  4664. Result := 'BSUBEXP';
  4665. OP_BSUBEXPCI:
  4666. Result := 'BSUBEXP/CI';
  4667. Succ(OP_OPEN) .. TREOp(Ord(OP_OPEN) + NSUBEXP - 1): // ###0.929
  4668. Result := Format('OPEN[%d]', [Ord(op) - Ord(OP_OPEN)]);
  4669. Succ(OP_CLOSE) .. TREOp(Ord(OP_CLOSE) + NSUBEXP - 1): // ###0.929
  4670. Result := Format('CLOSE[%d]', [Ord(op) - Ord(OP_CLOSE)]);
  4671. OP_STAR:
  4672. Result := 'STAR';
  4673. OP_PLUS:
  4674. Result := 'PLUS';
  4675. OP_BRACES:
  4676. Result := 'BRACES';
  4677. {$IFDEF ComplexBraces}
  4678. OP_LOOPENTRY:
  4679. Result := 'LOOPENTRY'; // ###0.925
  4680. OP_LOOP:
  4681. Result := 'LOOP'; // ###0.925
  4682. OP_LOOPNG:
  4683. Result := 'LOOPNG'; // ###0.940
  4684. {$ENDIF}
  4685. OP_STARNG:
  4686. Result := 'STARNG'; // ###0.940
  4687. OP_PLUSNG:
  4688. Result := 'PLUSNG'; // ###0.940
  4689. OP_BRACESNG:
  4690. Result := 'BRACESNG'; // ###0.940
  4691. else
  4692. Error(reeDumpCorruptedOpcode);
  4693. end; { of case op }
  4694. Result := ':' + Result;
  4695. end; { of function TRegExpr.DumpOp
  4696. -------------------------------------------------------------- }
  4697. function TRegExpr.Dump: RegExprString;
  4698. // dump a regexp in vaguely comprehensible form
  4699. var
  4700. s: PRegExprChar;
  4701. op: TREOp; // Arbitrary non-END op.
  4702. next: PRegExprChar;
  4703. i, NLen: integer;
  4704. Diff: PtrInt;
  4705. Ch: AnsiChar;
  4706. function PrintableChar(AChar: REChar): string; {$IFDEF InlineFuncs}inline;{$ENDIF}
  4707. begin
  4708. if AChar < ' ' then
  4709. Result := '#' + IntToStr(Ord(AChar))
  4710. else
  4711. Result := AChar;
  4712. end;
  4713. begin
  4714. if not IsProgrammOk then
  4715. Exit;
  4716. op := OP_EXACTLY;
  4717. Result := '';
  4718. s := programm + REOpSz;
  4719. while op <> OP_EEND do
  4720. begin // While that wasn't END last time...
  4721. op := s^;
  4722. Result := Result + Format('%2d%s', [s - programm, DumpOp(s^)]);
  4723. // Where, what.
  4724. next := regnext(s);
  4725. if next = nil // Next ptr.
  4726. then
  4727. Result := Result + ' (0)'
  4728. else
  4729. begin
  4730. if next > s
  4731. // ###0.948 PWideChar subtraction workaround (see comments in Tail method for details)
  4732. then
  4733. Diff := next - s
  4734. else
  4735. Diff := -(s - next);
  4736. Result := Result + Format(' (%d) ', [(s - programm) + Diff]);
  4737. end;
  4738. Inc(s, REOpSz + RENextOffSz);
  4739. if (op = OP_ANYOF) or (op = OP_ANYOFCI) or (op = OP_ANYBUT) or (op = OP_ANYBUTCI) then
  4740. begin
  4741. repeat
  4742. case s^ of
  4743. OpKind_End:
  4744. begin
  4745. Inc(s);
  4746. Break;
  4747. end;
  4748. OpKind_Range:
  4749. begin
  4750. Result := Result + 'Rng(';
  4751. Inc(s);
  4752. Result := Result + PrintableChar(s^) + '-';
  4753. Inc(s);
  4754. Result := Result + PrintableChar(s^);
  4755. Result := Result + ') ';
  4756. Inc(s);
  4757. end;
  4758. OpKind_MetaClass:
  4759. begin
  4760. Inc(s);
  4761. Result := Result + '\' + PrintableChar(s^) + ' ';
  4762. Inc(s);
  4763. end;
  4764. OpKind_Char:
  4765. begin
  4766. Inc(s);
  4767. NLen := PLongInt(s)^;
  4768. Inc(s, RENumberSz);
  4769. Result := Result + 'Ch(';
  4770. for i := 1 to NLen do
  4771. begin
  4772. Result := Result + PrintableChar(s^);
  4773. Inc(s);
  4774. end;
  4775. Result := Result + ') ';
  4776. end;
  4777. else
  4778. Error(reeDumpCorruptedOpcode);
  4779. end;
  4780. until false;
  4781. end;
  4782. if (op = OP_EXACTLY) or (op = OP_EXACTLYCI) then
  4783. begin
  4784. // Literal string, where present.
  4785. NLen := PLongInt(s)^;
  4786. Inc(s, RENumberSz);
  4787. for i := 1 to NLen do
  4788. begin
  4789. Result := Result + PrintableChar(s^);
  4790. Inc(s);
  4791. end;
  4792. end;
  4793. if (op = OP_BSUBEXP) or (op = OP_BSUBEXPCI) then
  4794. begin
  4795. Result := Result + ' \' + IntToStr(Ord(s^));
  4796. Inc(s);
  4797. end;
  4798. if (op = OP_BRACES) or (op = OP_BRACESNG) then
  4799. begin // ###0.941
  4800. // show min/max argument of braces operator
  4801. Result := Result + Format('{%d,%d}', [PREBracesArg(AlignToInt(s))^,
  4802. PREBracesArg(AlignToInt(s + REBracesArgSz))^]);
  4803. Inc(s, REBracesArgSz * 2);
  4804. end;
  4805. {$IFDEF ComplexBraces}
  4806. if (op = OP_LOOP) or (op = OP_LOOPNG) then
  4807. begin // ###0.940
  4808. Result := Result + Format(' -> (%d) {%d,%d}',
  4809. [(s - programm - (REOpSz + RENextOffSz)) +
  4810. PRENextOff(AlignToPtr(s + 2 * REBracesArgSz))^,
  4811. PREBracesArg(AlignToInt(s))^,
  4812. PREBracesArg(AlignToInt(s + REBracesArgSz))^]);
  4813. Inc(s, 2 * REBracesArgSz + RENextOffSz);
  4814. end;
  4815. {$ENDIF}
  4816. Result := Result + #$d#$a;
  4817. end; { of while }
  4818. // Header fields of interest.
  4819. if reganchored <> #0 then
  4820. Result := Result + 'Anchored; ';
  4821. if regmustString <> '' then
  4822. Result := Result + 'Must have: "' + regmustString + '"; ';
  4823. {$IFDEF UseFirstCharSet} // ###0.929
  4824. Result := Result + #$d#$a'First charset: ';
  4825. if FirstCharSet = [] then
  4826. Result := Result + '<empty set>'
  4827. else
  4828. if FirstCharSet = RegExprAllSet then
  4829. Result := Result + '<all chars>'
  4830. else
  4831. for Ch := #0 to #255 do
  4832. if byte(Ch) in FirstCharSet then
  4833. begin
  4834. if Ch < ' ' then
  4835. Result := Result + PrintableChar(Ch) // ###0.948
  4836. else
  4837. Result := Result + Ch;
  4838. end;
  4839. {$ENDIF}
  4840. Result := Result + #$d#$a;
  4841. end; { of function TRegExpr.Dump
  4842. -------------------------------------------------------------- }
  4843. {$ENDIF}
  4844. {$IFDEF reRealExceptionAddr}
  4845. {$OPTIMIZATION ON}
  4846. // ReturnAddr works correctly only if compiler optimization is ON
  4847. // I placed this method at very end of unit because there are no
  4848. // way to restore compiler optimization flag ...
  4849. {$ENDIF}
  4850. procedure TRegExpr.Error(AErrorID: integer);
  4851. {$IFDEF reRealExceptionAddr}
  4852. function ReturnAddr: Pointer; // ###0.938
  4853. asm
  4854. mov eax,[ebp+4]
  4855. end;
  4856. {$ENDIF}
  4857. var
  4858. e: ERegExpr;
  4859. begin
  4860. fLastError := AErrorID; // dummy stub - useless because will raise exception
  4861. if AErrorID < 1000 // compilation error ?
  4862. then
  4863. e := ERegExpr.Create(ErrorMsg(AErrorID) // yes - show error pos
  4864. + ' (pos ' + IntToStr(CompilerErrorPos) + ')')
  4865. else
  4866. e := ERegExpr.Create(ErrorMsg(AErrorID));
  4867. e.ErrorCode := AErrorID;
  4868. e.CompilerErrorPos := CompilerErrorPos;
  4869. raise e
  4870. {$IFDEF reRealExceptionAddr}
  4871. at ReturnAddr; // ###0.938
  4872. {$ENDIF}
  4873. end; { of procedure TRegExpr.Error
  4874. -------------------------------------------------------------- }
  4875. (*
  4876. PCode persistence:
  4877. FirstCharSet
  4878. programm, regsize
  4879. reganchored // -> programm
  4880. regmust, regmustlen // -> programm
  4881. fExprIsCompiled
  4882. *)
  4883. // be carefull - placed here code will be always compiled with
  4884. // compiler optimization flag
  4885. initialization
  4886. RegExprInvertCaseFunction := TRegExpr.InvertCaseFunction;
  4887. end.