regexpr.pas 157 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724172517261727172817291730173117321733173417351736173717381739174017411742174317441745174617471748174917501751175217531754175517561757175817591760176117621763176417651766176717681769177017711772177317741775177617771778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351235223532354235523562357235823592360236123622363236423652366236723682369237023712372237323742375237623772378237923802381238223832384238523862387238823892390239123922393239423952396239723982399240024012402240324042405240624072408240924102411241224132414241524162417241824192420242124222423242424252426242724282429243024312432243324342435243624372438243924402441244224432444244524462447244824492450245124522453245424552456245724582459246024612462246324642465246624672468246924702471247224732474247524762477247824792480248124822483248424852486248724882489249024912492249324942495249624972498249925002501250225032504250525062507250825092510251125122513251425152516251725182519252025212522252325242525252625272528252925302531253225332534253525362537253825392540254125422543254425452546254725482549255025512552255325542555255625572558255925602561256225632564256525662567256825692570257125722573257425752576257725782579258025812582258325842585258625872588258925902591259225932594259525962597259825992600260126022603260426052606260726082609261026112612261326142615261626172618261926202621262226232624262526262627262826292630263126322633263426352636263726382639264026412642264326442645264626472648264926502651265226532654265526562657265826592660266126622663266426652666266726682669267026712672267326742675267626772678267926802681268226832684268526862687268826892690269126922693269426952696269726982699270027012702270327042705270627072708270927102711271227132714271527162717271827192720272127222723272427252726272727282729273027312732273327342735273627372738273927402741274227432744274527462747274827492750275127522753275427552756275727582759276027612762276327642765276627672768276927702771277227732774277527762777277827792780278127822783278427852786278727882789279027912792279327942795279627972798279928002801280228032804280528062807280828092810281128122813281428152816281728182819282028212822282328242825282628272828282928302831283228332834283528362837283828392840284128422843284428452846284728482849285028512852285328542855285628572858285928602861286228632864286528662867286828692870287128722873287428752876287728782879288028812882288328842885288628872888288928902891289228932894289528962897289828992900290129022903290429052906290729082909291029112912291329142915291629172918291929202921292229232924292529262927292829292930293129322933293429352936293729382939294029412942294329442945294629472948294929502951295229532954295529562957295829592960296129622963296429652966296729682969297029712972297329742975297629772978297929802981298229832984298529862987298829892990299129922993299429952996299729982999300030013002300330043005300630073008300930103011301230133014301530163017301830193020302130223023302430253026302730283029303030313032303330343035303630373038303930403041304230433044304530463047304830493050305130523053305430553056305730583059306030613062306330643065306630673068306930703071307230733074307530763077307830793080308130823083308430853086308730883089309030913092309330943095309630973098309931003101310231033104310531063107310831093110311131123113311431153116311731183119312031213122312331243125312631273128312931303131313231333134313531363137313831393140314131423143314431453146314731483149315031513152315331543155315631573158315931603161316231633164316531663167316831693170317131723173317431753176317731783179318031813182318331843185318631873188318931903191319231933194319531963197319831993200320132023203320432053206320732083209321032113212321332143215321632173218321932203221322232233224322532263227322832293230323132323233323432353236323732383239324032413242324332443245324632473248324932503251325232533254325532563257325832593260326132623263326432653266326732683269327032713272327332743275327632773278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377737783779378037813782378337843785378637873788378937903791379237933794379537963797379837993800380138023803380438053806380738083809381038113812381338143815381638173818381938203821382238233824382538263827382838293830383138323833383438353836383738383839384038413842384338443845384638473848384938503851385238533854385538563857385838593860386138623863386438653866386738683869387038713872387338743875387638773878387938803881388238833884388538863887388838893890389138923893389438953896389738983899390039013902390339043905390639073908390939103911391239133914391539163917391839193920392139223923392439253926392739283929393039313932393339343935393639373938393939403941394239433944394539463947394839493950395139523953395439553956395739583959396039613962396339643965396639673968396939703971397239733974397539763977397839793980398139823983398439853986398739883989399039913992399339943995399639973998399940004001400240034004400540064007400840094010401140124013401440154016401740184019402040214022402340244025402640274028402940304031403240334034403540364037403840394040404140424043404440454046404740484049405040514052405340544055405640574058405940604061406240634064406540664067406840694070407140724073407440754076407740784079408040814082408340844085408640874088408940904091409240934094409540964097409840994100410141024103410441054106410741084109411041114112411341144115411641174118411941204121412241234124412541264127412841294130413141324133413441354136413741384139414041414142414341444145414641474148414941504151415241534154415541564157415841594160416141624163416441654166416741684169417041714172417341744175417641774178417941804181418241834184418541864187418841894190419141924193419441954196419741984199420042014202420342044205420642074208420942104211421242134214421542164217421842194220422142224223422442254226422742284229423042314232423342344235423642374238423942404241424242434244424542464247424842494250425142524253425442554256425742584259426042614262426342644265426642674268426942704271427242734274427542764277427842794280428142824283428442854286428742884289429042914292429342944295429642974298429943004301430243034304430543064307430843094310431143124313431443154316431743184319432043214322432343244325432643274328432943304331433243334334433543364337433843394340434143424343434443454346434743484349435043514352435343544355435643574358435943604361436243634364436543664367436843694370437143724373437443754376437743784379438043814382438343844385438643874388438943904391439243934394439543964397439843994400440144024403440444054406440744084409441044114412441344144415441644174418441944204421442244234424442544264427442844294430443144324433443444354436443744384439444044414442444344444445444644474448444944504451445244534454445544564457445844594460446144624463446444654466446744684469447044714472447344744475447644774478447944804481448244834484448544864487448844894490449144924493449444954496449744984499450045014502450345044505450645074508450945104511451245134514451545164517451845194520452145224523452445254526452745284529453045314532453345344535453645374538453945404541454245434544454545464547454845494550455145524553455445554556455745584559456045614562456345644565456645674568456945704571457245734574457545764577457845794580458145824583458445854586458745884589459045914592459345944595459645974598459946004601460246034604460546064607460846094610461146124613461446154616461746184619462046214622462346244625462646274628462946304631463246334634463546364637463846394640464146424643464446454646464746484649465046514652465346544655465646574658465946604661466246634664466546664667466846694670467146724673467446754676467746784679468046814682468346844685468646874688468946904691469246934694469546964697469846994700470147024703470447054706470747084709471047114712471347144715471647174718471947204721472247234724472547264727472847294730473147324733473447354736473747384739474047414742474347444745474647474748474947504751475247534754475547564757475847594760476147624763476447654766476747684769477047714772477347744775477647774778477947804781478247834784478547864787478847894790479147924793479447954796479747984799480048014802480348044805480648074808480948104811481248134814481548164817481848194820482148224823482448254826482748284829483048314832483348344835483648374838483948404841484248434844484548464847484848494850485148524853485448554856485748584859486048614862486348644865486648674868486948704871487248734874487548764877487848794880488148824883488448854886488748884889489048914892489348944895489648974898489949004901490249034904490549064907490849094910491149124913491449154916491749184919492049214922492349244925492649274928492949304931493249334934493549364937493849394940494149424943494449454946494749484949495049514952495349544955495649574958495949604961496249634964496549664967496849694970497149724973497449754976497749784979498049814982498349844985498649874988498949904991499249934994499549964997499849995000500150025003500450055006500750085009501050115012501350145015501650175018501950205021502250235024502550265027502850295030503150325033503450355036503750385039504050415042504350445045504650475048504950505051505250535054505550565057505850595060506150625063506450655066506750685069507050715072507350745075507650775078507950805081508250835084508550865087508850895090509150925093509450955096509750985099510051015102510351045105510651075108510951105111511251135114511551165117511851195120512151225123512451255126512751285129513051315132513351345135513651375138513951405141514251435144514551465147514851495150515151525153515451555156515751585159516051615162516351645165516651675168516951705171517251735174517551765177517851795180518151825183518451855186518751885189519051915192519351945195519651975198519952005201520252035204520552065207520852095210521152125213521452155216521752185219522052215222522352245225522652275228522952305231523252335234523552365237523852395240524152425243524452455246524752485249525052515252525352545255525652575258525952605261526252635264526552665267526852695270527152725273527452755276527752785279528052815282528352845285
  1. unit RegExpr;
  2. {
  3. TRegExpr class library
  4. Delphi Regular Expressions
  5. Copyright (c) 1999-2004 Andrey V. Sorokin, St.Petersburg, Russia
  6. You can choose to use this Pascal unit in one of the two following licenses:
  7. Option 1>
  8. You may use this software in any kind of development,
  9. including comercial, redistribute, and modify it freely,
  10. under the following restrictions :
  11. 1. This software is provided as it is, without any kind of
  12. warranty given. Use it at Your own risk.The author is not
  13. responsible for any consequences of use of this software.
  14. 2. The origin of this software may not be mispresented, You
  15. must not claim that You wrote the original software. If
  16. You use this software in any kind of product, it would be
  17. appreciated that there in a information box, or in the
  18. documentation would be an acknowledgement like
  19. Partial Copyright (c) 2004 Andrey V. Sorokin
  20. https://sorokin.engineer/
  21. [email protected]
  22. 3. You may not have any income from distributing this source
  23. (or altered version of it) to other developers. When You
  24. use this product in a comercial package, the source may
  25. not be charged seperatly.
  26. 4. Altered versions must be plainly marked as such, and must
  27. not be misrepresented as being the original software.
  28. 5. RegExp Studio application and all the visual components as
  29. well as documentation is not part of the TRegExpr library
  30. and is not free for usage.
  31. https://sorokin.engineer/
  32. [email protected]
  33. Option 2>
  34. The same modified LGPL with static linking exception as the Free Pascal RTL
  35. }
  36. interface
  37. { off $DEFINE DebugSynRegExpr }
  38. {$MODE DELPHI} // Delphi-compatible mode in FreePascal
  39. // Disabling for now, seems to cause bug in Lazarus (bug ID 36603)
  40. {$INLINE ON}
  41. // ======== Define base compiler options
  42. {$BOOLEVAL OFF}
  43. {$EXTENDEDSYNTAX ON}
  44. {$LONGSTRINGS ON}
  45. {$OPTIMIZATION ON}
  46. // ======== Define options for TRegExpr engine
  47. {$DEFINE UseFirstCharSet} // Enable optimization, which finds possible first chars of input string
  48. {$DEFINE RegExpPCodeDump} // Enable method Dump() to show opcode as string
  49. {$DEFINE ComplexBraces} // Support braces in complex cases
  50. {$IFNDEF UniCode}
  51. {$UNDEF UnicodeWordDetection}
  52. {$ELSE}
  53. {$DEFINE UnicodeWordDetection}
  54. {$ENDIF}
  55. uses
  56. Math, // Min
  57. Classes, // TStrings in Split method
  58. SysUtils; // Exception
  59. type
  60. {$IFDEF UniCode}
  61. PRegExprChar = PWideChar;
  62. RegExprString = UnicodeString;
  63. REChar = WideChar;
  64. {$ELSE}
  65. PRegExprChar = PChar;
  66. RegExprString = AnsiString; // ###0.952 was string
  67. REChar = Char;
  68. {$ENDIF}
  69. TREOp = REChar; // internal p-code type //###0.933
  70. PREOp = ^TREOp;
  71. type
  72. TRegExprInvertCaseFunction = function(const Ch: REChar): REChar of object;
  73. TRegExprCharset = set of byte;
  74. const
  75. // Escape char ('\' in common r.e.) used for escaping metachars (\w, \d etc)
  76. EscChar = '\';
  77. RegExprModifierI: boolean = False; // default value for ModifierI
  78. RegExprModifierR: boolean = True; // default value for ModifierR
  79. RegExprModifierS: boolean = True; // default value for ModifierS
  80. RegExprModifierG: boolean = True; // default value for ModifierG
  81. RegExprModifierM: boolean = False; // default value for ModifierM
  82. RegExprModifierX: boolean = False; // default value for ModifierX
  83. // default value for SpaceChars
  84. RegExprSpaceChars: RegExprString = ' '#$9#$A#$D#$C;
  85. // default value for WordChars
  86. RegExprWordChars: RegExprString = '0123456789'
  87. + 'abcdefghijklmnopqrstuvwxyz'
  88. + 'ABCDEFGHIJKLMNOPQRSTUVWXYZ_';
  89. // default value for LineSeparators
  90. RegExprLineSeparators: RegExprString = #$d#$a#$b#$c
  91. {$IFDEF UniCode}
  92. + #$2028#$2029#$85
  93. {$ENDIF};
  94. // default value for LinePairedSeparator
  95. RegExprLinePairedSeparator: RegExprString = #$d#$a;
  96. { if You need Unix-styled line separators (only \n), then use:
  97. RegExprLineSeparators = #$a;
  98. RegExprLinePairedSeparator = '';
  99. }
  100. // Tab and Unicode category "Space Separator":
  101. // https://www.compart.com/en/unicode/category/Zs
  102. RegExprHorzSeparators: RegExprString = #9#$20#$A0
  103. {$IFDEF UniCode}
  104. + #$1680#$2000#$2001#$2002#$2003#$2004#$2005#$2006#$2007#$2008#$2009#$200A#$202F#$205F#$3000
  105. {$ENDIF};
  106. const
  107. NSUBEXP = 90; // max number of subexpression //###0.929
  108. // Cannot be more than NSUBEXPMAX
  109. // Be carefull - don't use values which overflow CLOSE opcode
  110. // (in this case you'll get compiler error).
  111. // Big NSUBEXP will cause more slow work and more stack required
  112. NSUBEXPMAX = 255; // Max possible value for NSUBEXP. //###0.945
  113. // Don't change it! It's defined by internal TRegExpr design.
  114. {$IFDEF ComplexBraces}
  115. const
  116. LoopStackMax = 10; // max depth of loops stack //###0.925
  117. type
  118. TRegExprLoopStack = array [1 .. LoopStackMax] of integer;
  119. {$ENDIF}
  120. type
  121. TRegExprModifiers = record
  122. I: boolean;
  123. // Case-insensitive.
  124. R: boolean;
  125. // Extended syntax for Russian ranges in [].
  126. // If True, then а-я additionally includes letter 'ё',
  127. // А-Я additionally includes 'Ё', and а-Я includes all Russian letters.
  128. // Turn it off if it interferes with your national alphabet.
  129. S: boolean;
  130. // Dot '.' matches any char, otherwise only [^\n].
  131. G: boolean;
  132. // Greedy. Switching it off switches all operators to non-greedy style,
  133. // so if G=False, then '*' works like '*?', '+' works like '+?' and so on.
  134. M: boolean;
  135. // Treat string as multiple lines. It changes `^' and `$' from
  136. // matching at only the very start/end of the string to the start/end
  137. // of any line anywhere within the string.
  138. X: boolean;
  139. // Allow comments in regex using # char.
  140. end;
  141. function IsModifiersEqual(const A, B: TRegExprModifiers): boolean;
  142. type
  143. TRegExpr = class;
  144. TRegExprReplaceFunction = function(ARegExpr: TRegExpr): RegExprString of object;
  145. TRegExprCharChecker = function(ch: REChar): boolean of object;
  146. TRegExprCharCheckerArray = array[0 .. 30] of TRegExprCharChecker;
  147. TRegExprCharCheckerInfo = record
  148. CharBegin, CharEnd: REChar;
  149. CheckerIndex: integer;
  150. end;
  151. TRegExprCharCheckerInfos = array of TRegExprCharCheckerInfo;
  152. { TRegExpr }
  153. TRegExpr = class
  154. private
  155. startp: array [0 .. NSUBEXP - 1] of PRegExprChar; // found expr start points
  156. endp: array [0 .. NSUBEXP - 1] of PRegExprChar; // found expr end points
  157. GrpIndexes: array [0 .. NSUBEXP - 1] of integer;
  158. GrpCount: integer;
  159. {$IFDEF ComplexBraces}
  160. LoopStack: TRegExprLoopStack; // state before entering loop
  161. LoopStackIdx: integer; // 0 - out of all loops
  162. {$ENDIF}
  163. // The "internal use only" fields to pass info from compile
  164. // to execute that permits the execute phase to run lots faster on
  165. // simple cases.
  166. reganchored: REChar; // is the match anchored (at beginning-of-line only)?
  167. regmust: PRegExprChar; // string (pointer into program) that match must include, or nil
  168. regmustlen: integer; // length of regmust string
  169. regmustString: RegExprString;
  170. // reganchored permits very fast decisions on suitable starting points
  171. // for a match, cutting down the work a lot. Regmust permits fast rejection
  172. // of lines that cannot possibly match. The regmust tests are costly enough
  173. // that regcomp() supplies a regmust only if the r.e. contains something
  174. // potentially expensive (at present, the only such thing detected is * or +
  175. // at the start of the r.e., which can involve a lot of backup). regmustlen is
  176. // supplied because the test in regexec() needs it and regcomp() is computing
  177. // it anyway.
  178. {$IFDEF UseFirstCharSet}
  179. FirstCharSet: TRegExprCharset;
  180. FirstCharArray: array[byte] of boolean;
  181. {$ENDIF}
  182. // work variables for Exec routines - save stack in recursion
  183. reginput: PRegExprChar; // String-input pointer.
  184. fInputStart: PRegExprChar; // Pointer to first char of input string.
  185. fInputEnd: PRegExprChar; // Pointer to char AFTER last char of input string
  186. fRegexStart: PRegExprChar;
  187. fRegexEnd: PRegExprChar;
  188. // work variables for compiler's routines
  189. regparse: PRegExprChar; // Input-scan pointer.
  190. regnpar: integer; // Count of () brackets.
  191. regdummy: REChar;
  192. regcode: PRegExprChar; // Code-emit pointer; @regdummy = don't.
  193. regsize: integer; // Total programm size in REChars.
  194. regExactlyLen: PLongInt;
  195. regexpBegin: PRegExprChar; // only for error handling. Contains pointer to beginning of r.e. while compiling
  196. regexpIsCompiled: boolean; // true if r.e. successfully compiled
  197. fSecondPass: boolean;
  198. // programm is essentially a linear encoding
  199. // of a nondeterministic finite-state machine (aka syntax charts or
  200. // "railroad normal form" in parsing technology). Each node is an opcode
  201. // plus a "next" pointer, possibly plus an operand. "Next" pointers of
  202. // all nodes except BRANCH implement concatenation; a "next" pointer with
  203. // a BRANCH on both ends of it connects two alternatives. (Here we
  204. // have one of the subtle syntax dependencies: an individual BRANCH (as
  205. // opposed to a collection of them) is never concatenated with anything
  206. // because of operator precedence.) The operand of some types of node is
  207. // a literal string; for others, it is a node leading into a sub-FSM. In
  208. // particular, the operand of a BRANCH node is the first node of the branch.
  209. // (NB this is *not* a tree structure: the tail of the branch connects
  210. // to the thing following the set of BRANCHes.) The opcodes are:
  211. programm: PRegExprChar; // Unwarranted chumminess with compiler.
  212. fExpression: RegExprString; // source of compiled r.e.
  213. fInputString: RegExprString; // input string
  214. fLastError: integer; // see Error, LastError
  215. fLastErrorOpcode: TREOp;
  216. fModifiers: TRegExprModifiers; // modifiers
  217. fCompModifiers: TRegExprModifiers; // compiler's copy of modifiers
  218. fProgModifiers: TRegExprModifiers; // modifiers values from last programm compilation
  219. fSpaceChars: RegExprString;
  220. fWordChars: RegExprString;
  221. fInvertCase: TRegExprInvertCaseFunction;
  222. fLineSeparators: RegExprString;
  223. fLinePairedSeparatorAssigned: boolean;
  224. fLinePairedSeparatorHead, fLinePairedSeparatorTail: REChar;
  225. FReplaceLineEnd: RegExprString; // string to use for "\n" in Substitute method
  226. FUseOsLineEndOnReplace: boolean; // use OS LineBreak chars (LF or CRLF) for FReplaceLineEnd
  227. fSlowChecksSizeMax: integer;
  228. // use ASlowChecks=True in Exec() only when Length(InputString)<SlowChecksSizeMax
  229. // ASlowChecks enables to use regmustString optimization
  230. {$IFNDEF UniCode}
  231. fLineSepArray: array[byte] of boolean;
  232. {$ENDIF}
  233. {$IFDEF UnicodeWordDetection}
  234. FUseUnicodeWordDetection: boolean;
  235. {$ENDIF}
  236. FEmptyInputRaisesError : Boolean;
  237. CharCheckers: TRegExprCharCheckerArray;
  238. CharCheckerInfos: TRegExprCharCheckerInfos;
  239. CheckerIndex_Word: byte;
  240. CheckerIndex_NotWord: byte;
  241. CheckerIndex_Digit: byte;
  242. CheckerIndex_NotDigit: byte;
  243. CheckerIndex_Space: byte;
  244. CheckerIndex_NotSpace: byte;
  245. CheckerIndex_HorzSep: byte;
  246. CheckerIndex_NotHorzSep: byte;
  247. CheckerIndex_VertSep: byte;
  248. CheckerIndex_NotVertSep: byte;
  249. CheckerIndex_LowerAZ: byte;
  250. CheckerIndex_UpperAZ: byte;
  251. procedure InitCharCheckers;
  252. function CharChecker_Word(ch: REChar): boolean;
  253. function CharChecker_NotWord(ch: REChar): boolean;
  254. function CharChecker_Space(ch: REChar): boolean;
  255. function CharChecker_NotSpace(ch: REChar): boolean;
  256. function CharChecker_Digit(ch: REChar): boolean;
  257. function CharChecker_NotDigit(ch: REChar): boolean;
  258. function CharChecker_HorzSep(ch: REChar): boolean;
  259. function CharChecker_NotHorzSep(ch: REChar): boolean;
  260. function CharChecker_VertSep(ch: REChar): boolean;
  261. function CharChecker_NotVertSep(ch: REChar): boolean;
  262. function CharChecker_LowerAZ(ch: REChar): boolean;
  263. function CharChecker_UpperAZ(ch: REChar): boolean;
  264. procedure ClearMatches; {$IFDEF InlineFuncs}inline;{$ENDIF}
  265. procedure ClearInternalIndexes; {$IFDEF InlineFuncs}inline;{$ENDIF}
  266. function FindInCharClass(ABuffer: PRegExprChar; AChar: REChar; AIgnoreCase: boolean): boolean;
  267. procedure GetCharSetFromCharClass(ABuffer: PRegExprChar; AIgnoreCase: boolean; var ARes: TRegExprCharset);
  268. procedure GetCharSetFromSpaceChars(var ARes: TRegExprCharset);
  269. procedure GetCharSetFromWordChars(var ARes: TRegExprCharSet);
  270. function IsWordChar(AChar: REChar): boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
  271. function IsSpaceChar(AChar: REChar): boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
  272. function IsCustomLineSeparator(AChar: REChar): boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
  273. procedure InitLineSepArray;
  274. // Mark programm as having to be [re]compiled
  275. procedure InvalidateProgramm;
  276. // Check if we can use precompiled r.e. or
  277. // [re]compile it if something changed
  278. function IsProgrammOk: boolean; // ###0.941
  279. procedure SetExpression(const AStr: RegExprString);
  280. function GetModifierStr: RegExprString;
  281. procedure SetModifierStr(const AStr: RegExprString);
  282. function GetModifierG: boolean;
  283. function GetModifierI: boolean;
  284. function GetModifierM: boolean;
  285. function GetModifierR: boolean;
  286. function GetModifierS: boolean;
  287. function GetModifierX: boolean;
  288. procedure SetModifierG(AValue: boolean);
  289. procedure SetModifierI(AValue: boolean);
  290. procedure SetModifierM(AValue: boolean);
  291. procedure SetModifierR(AValue: boolean);
  292. procedure SetModifierS(AValue: boolean);
  293. procedure SetModifierX(AValue: boolean);
  294. // Default handler raises exception ERegExpr with
  295. // Message = ErrorMsg (AErrorID), ErrorCode = AErrorID
  296. // and CompilerErrorPos = value of property CompilerErrorPos.
  297. procedure Error(AErrorID: integer); virtual; // error handler.
  298. { ==================== Compiler section =================== }
  299. // compile a regular expression into internal code
  300. function CompileRegExpr(ARegExp: PRegExprChar): boolean;
  301. procedure SetUseOsLineEndOnReplace(AValue: boolean);
  302. // set the next-pointer at the end of a node chain
  303. procedure Tail(p: PRegExprChar; val: PRegExprChar);
  304. // regoptail - regtail on operand of first argument; nop if operandless
  305. procedure OpTail(p: PRegExprChar; val: PRegExprChar);
  306. // regnode - emit a node, return location
  307. function EmitNode(op: TREOp): PRegExprChar;
  308. // emit (if appropriate) a byte of code
  309. procedure EmitC(ch: REChar);
  310. // emit LongInt value
  311. procedure EmitInt(AValue: LongInt);
  312. // insert an operator in front of already-emitted operand
  313. // Means relocating the operand.
  314. procedure InsertOperator(op: TREOp; opnd: PRegExprChar; sz: integer);
  315. // ###0.90
  316. // regular expression, i.e. main body or parenthesized thing
  317. function ParseReg(paren: integer; var flagp: integer): PRegExprChar;
  318. // one alternative of an | operator
  319. function ParseBranch(var flagp: integer): PRegExprChar;
  320. // something followed by possible [*+?]
  321. function ParsePiece(var flagp: integer): PRegExprChar;
  322. function HexDig(Ch: REChar): integer;
  323. function UnQuoteChar(var APtr: PRegExprChar): REChar;
  324. // the lowest level
  325. function ParseAtom(var flagp: integer): PRegExprChar;
  326. // current pos in r.e. - for error hanling
  327. function GetCompilerErrorPos: PtrInt;
  328. {$IFDEF UseFirstCharSet} // ###0.929
  329. procedure FillFirstCharSet(prog: PRegExprChar);
  330. {$ENDIF}
  331. { ===================== Matching section =================== }
  332. // repeatedly match something simple, report how many
  333. function regrepeat(p: PRegExprChar; AMax: integer): integer;
  334. // dig the "next" pointer out of a node
  335. function regnext(p: PRegExprChar): PRegExprChar;
  336. // recursively matching routine
  337. function MatchPrim(prog: PRegExprChar): boolean;
  338. // match at specific position only, called from ExecPrim
  339. function MatchAtOnePos(APos: PRegExprChar): boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
  340. // Exec for stored InputString
  341. function ExecPrim(AOffset: integer; ATryOnce, ASlowChecks: boolean): boolean;
  342. {$IFDEF RegExpPCodeDump}
  343. function DumpOp(op: TREOp): RegExprString;
  344. {$ENDIF}
  345. function GetSubExprCount: integer;
  346. function GetMatchPos(Idx: integer): PtrInt;
  347. function GetMatchLen(Idx: integer): PtrInt;
  348. function GetMatch(Idx: integer): RegExprString;
  349. procedure SetInputString(const AInputString: RegExprString);
  350. procedure SetLineSeparators(const AStr: RegExprString);
  351. procedure SetLinePairedSeparator(const AStr: RegExprString);
  352. function GetLinePairedSeparator: RegExprString;
  353. public
  354. constructor Create; overload;
  355. constructor Create(const AExpression: RegExprString); overload;
  356. destructor Destroy; override;
  357. class function VersionMajor: integer;
  358. class function VersionMinor: integer;
  359. // match a programm against a string AInputString
  360. // !!! Exec store AInputString into InputString property
  361. // For Delphi 5 and higher available overloaded versions - first without
  362. // parameter (uses already assigned to InputString property value)
  363. // and second that has int parameter and is same as ExecPos
  364. function Exec(const AInputString: RegExprString): boolean; overload;
  365. function Exec: boolean; overload;
  366. function Exec(AOffset: integer): boolean; overload;
  367. // find next match:
  368. // ExecNext;
  369. // works the same as
  370. // if MatchLen [0] = 0 then ExecPos (MatchPos [0] + 1)
  371. // else ExecPos (MatchPos [0] + MatchLen [0]);
  372. // but it's more simpler !
  373. // Raises exception if used without preceeding SUCCESSFUL call to
  374. // Exec* (Exec, ExecPos, ExecNext). So You always must use something like
  375. // if Exec (InputString) then repeat { proceed results} until not ExecNext;
  376. function ExecNext: boolean;
  377. // find match for InputString starting from AOffset position
  378. // (AOffset=1 - first char of InputString)
  379. function ExecPos(AOffset: integer = 1): boolean; overload;
  380. function ExecPos(AOffset: integer; ATryOnce: boolean): boolean; overload;
  381. // Returns ATemplate with '$&' or '$0' replaced by whole r.e.
  382. // occurence and '$1'...'$nn' replaced by subexpression with given index.
  383. // Symbol '$' is used instead of '\' (for future extensions
  384. // and for more Perl-compatibility) and accepts more than one digit.
  385. // If you want to place into template raw '$' or '\', use prefix '\'.
  386. // Example: '1\$ is $2\\rub\\' -> '1$ is <Match[2]>\rub\'
  387. // If you want to place any number after '$' you must enclose it
  388. // with curly braces: '${12}'.
  389. // Example: 'a$12bc' -> 'a<Match[12]>bc'
  390. // 'a${1}2bc' -> 'a<Match[1]>2bc'.
  391. function Substitute(const ATemplate: RegExprString): RegExprString;
  392. // Splits AInputStr to list by positions of all r.e. occurencies.
  393. // Internally calls Exec, ExecNext.
  394. procedure Split(const AInputStr: RegExprString; APieces: TStrings);
  395. function Replace(const AInputStr: RegExprString;
  396. const AReplaceStr: RegExprString;
  397. AUseSubstitution: boolean = False) // ###0.946
  398. : RegExprString; overload;
  399. function Replace(const AInputStr: RegExprString;
  400. AReplaceFunc: TRegExprReplaceFunction): RegExprString; overload;
  401. // Returns AInputStr with r.e. occurencies replaced by AReplaceStr.
  402. // If AUseSubstitution is true, then AReplaceStr will be used
  403. // as template for Substitution methods.
  404. // For example:
  405. // Expression := '({-i}block|var)\s*\(\s*([^ ]*)\s*\)\s*';
  406. // Replace ('BLOCK( test1)', 'def "$1" value "$2"', True);
  407. // will return: def 'BLOCK' value 'test1'
  408. // Replace ('BLOCK( test1)', 'def "$1" value "$2"')
  409. // will return: def "$1" value "$2"
  410. // Internally calls Exec, ExecNext.
  411. // Overloaded version and ReplaceEx operate with callback function,
  412. // so you can implement really complex functionality.
  413. function ReplaceEx(const AInputStr: RegExprString;
  414. AReplaceFunc: TRegExprReplaceFunction): RegExprString;
  415. // Returns ID of last error, 0 if no errors (unusable if
  416. // Error method raises exception) and clear internal status
  417. // into 0 (no errors).
  418. function LastError: integer;
  419. // Returns Error message for error with ID = AErrorID.
  420. function ErrorMsg(AErrorID: integer): RegExprString; virtual;
  421. // Converts Ch into upper case if it in lower case or in lower
  422. // if it in upper (uses current system local setings)
  423. class function InvertCaseFunction(const Ch: REChar): REChar;
  424. // [Re]compile r.e. Useful for example for GUI r.e. editors (to check
  425. // all properties validity).
  426. procedure Compile; // ###0.941
  427. {$IFDEF RegExpPCodeDump}
  428. // dump a compiled regexp in vaguely comprehensible form
  429. function Dump: RegExprString;
  430. {$ENDIF}
  431. // Regular expression.
  432. // For optimization, TRegExpr will automatically compiles it into 'P-code'
  433. // (You can see it with help of Dump method) and stores in internal
  434. // structures. Real [re]compilation occures only when it really needed -
  435. // while calling Exec, ExecNext, Substitute, Dump, etc
  436. // and only if Expression or other P-code affected properties was changed
  437. // after last [re]compilation.
  438. // If any errors while [re]compilation occures, Error method is called
  439. // (by default Error raises exception - see below)
  440. property Expression: RegExprString read fExpression write SetExpression;
  441. // Set/get default values of r.e.syntax modifiers. Modifiers in
  442. // r.e. (?ismx-ismx) will replace this default values.
  443. // If you try to set unsupported modifier, Error will be called
  444. // (by defaul Error raises exception ERegExpr).
  445. property ModifierStr: RegExprString read GetModifierStr write SetModifierStr;
  446. property ModifierI: boolean read GetModifierI write SetModifierI;
  447. property ModifierR: boolean read GetModifierR write SetModifierR;
  448. property ModifierS: boolean read GetModifierS write SetModifierS;
  449. property ModifierG: boolean read GetModifierG write SetModifierG;
  450. property ModifierM: boolean read GetModifierM write SetModifierM;
  451. property ModifierX: boolean read GetModifierX write SetModifierX;
  452. // returns current input string (from last Exec call or last assign
  453. // to this property).
  454. // Any assignment to this property clear Match* properties !
  455. property InputString: RegExprString read fInputString write SetInputString;
  456. // Number of subexpressions has been found in last Exec* call.
  457. // If there are no subexpr. but whole expr was found (Exec* returned True),
  458. // then SubExprMatchCount=0, if no subexpressions nor whole
  459. // r.e. found (Exec* returned false) then SubExprMatchCount=-1.
  460. // Note, that some subexpr. may be not found and for such
  461. // subexpr. MathPos=MatchLen=-1 and Match=''.
  462. // For example: Expression := '(1)?2(3)?';
  463. // Exec ('123'): SubExprMatchCount=2, Match[0]='123', [1]='1', [2]='3'
  464. // Exec ('12'): SubExprMatchCount=1, Match[0]='12', [1]='1'
  465. // Exec ('23'): SubExprMatchCount=2, Match[0]='23', [1]='', [2]='3'
  466. // Exec ('2'): SubExprMatchCount=0, Match[0]='2'
  467. // Exec ('7') - return False: SubExprMatchCount=-1
  468. property SubExprMatchCount: integer read GetSubExprCount;
  469. // pos of entrance subexpr. #Idx into tested in last Exec*
  470. // string. First subexpr. has Idx=1, last - MatchCount,
  471. // whole r.e. has Idx=0.
  472. // Returns -1 if in r.e. no such subexpr. or this subexpr.
  473. // not found in input string.
  474. property MatchPos[Idx: integer]: PtrInt read GetMatchPos;
  475. // len of entrance subexpr. #Idx r.e. into tested in last Exec*
  476. // string. First subexpr. has Idx=1, last - MatchCount,
  477. // whole r.e. has Idx=0.
  478. // Returns -1 if in r.e. no such subexpr. or this subexpr.
  479. // not found in input string.
  480. // Remember - MatchLen may be 0 (if r.e. match empty string) !
  481. property MatchLen[Idx: integer]: PtrInt read GetMatchLen;
  482. // == copy (InputString, MatchPos [Idx], MatchLen [Idx])
  483. // Returns '' if in r.e. no such subexpr. or this subexpr.
  484. // not found in input string.
  485. property Match[Idx: integer]: RegExprString read GetMatch;
  486. // Returns position in r.e. where compiler stopped.
  487. // Useful for error diagnostics
  488. property CompilerErrorPos: PtrInt read GetCompilerErrorPos;
  489. // Contains chars, treated as /s (initially filled with RegExprSpaceChars
  490. // global constant)
  491. property SpaceChars: RegExprString read fSpaceChars write fSpaceChars;
  492. // ###0.927
  493. // Contains chars, treated as /w (initially filled with RegExprWordChars
  494. // global constant)
  495. property WordChars: RegExprString read fWordChars write fWordChars;
  496. // ###0.929
  497. {$IFDEF UnicodeWordDetection}
  498. // If set to true, in addition to using WordChars, a heuristic to detect unicode word letters is used for \w
  499. property UseUnicodeWordDetection: boolean read FUseUnicodeWordDetection write FUseUnicodeWordDetection;
  500. {$ENDIF}
  501. // line separators (like \n in Unix)
  502. property LineSeparators: RegExprString read fLineSeparators write SetLineSeparators; // ###0.941
  503. // paired line separator (like \r\n in DOS and Windows).
  504. // must contain exactly two chars or no chars at all
  505. property LinePairedSeparator: RegExprString read GetLinePairedSeparator write SetLinePairedSeparator; // ###0.941
  506. // Set this property if you want to override case-insensitive functionality.
  507. // Create set it to RegExprInvertCaseFunction (InvertCaseFunction by default)
  508. property InvertCase: TRegExprInvertCaseFunction read fInvertCase write fInvertCase; // ##0.935
  509. // Use OS line end on replace or not. Default is True for backwards compatibility.
  510. // Set to false to use #10.
  511. property UseOsLineEndOnReplace: boolean read FUseOsLineEndOnReplace write SetUseOsLineEndOnReplace;
  512. property SlowChecksSizeMax: integer read fSlowChecksSizeMax write fSlowChecksSizeMax;
  513. // Raise error when input string is empty
  514. Property EmptyInputRaisesError : Boolean Read FEmptyInputRaisesError Write FEmptyInputRaisesError;
  515. end;
  516. type
  517. ERegExpr = class(Exception)
  518. public
  519. ErrorCode: integer;
  520. CompilerErrorPos: PtrInt;
  521. end;
  522. const
  523. RegExprInvertCaseFunction: TRegExprInvertCaseFunction = nil;
  524. // true if string AInputString match regular expression ARegExpr
  525. // ! will raise exeption if syntax errors in ARegExpr
  526. function ExecRegExpr(const ARegExpr, AInputStr: RegExprString): boolean;
  527. // Split AInputStr into APieces by r.e. ARegExpr occurencies
  528. procedure SplitRegExpr(const ARegExpr, AInputStr: RegExprString;
  529. APieces: TStrings);
  530. // Returns AInputStr with r.e. occurencies replaced by AReplaceStr
  531. // If AUseSubstitution is true, then AReplaceStr will be used
  532. // as template for Substitution methods.
  533. // For example:
  534. // ReplaceRegExpr ('({-i}block|var)\s*\(\s*([^ ]*)\s*\)\s*',
  535. // 'BLOCK( test1)', 'def "$1" value "$2"', True)
  536. // will return: def 'BLOCK' value 'test1'
  537. // ReplaceRegExpr ('({-i}block|var)\s*\(\s*([^ ]*)\s*\)\s*',
  538. // 'BLOCK( test1)', 'def "$1" value "$2"')
  539. // will return: def "$1" value "$2"
  540. function ReplaceRegExpr(const ARegExpr, AInputStr, AReplaceStr: RegExprString;
  541. AUseSubstitution: boolean = False): RegExprString; overload; // ###0.947
  542. // Alternate form allowing to set more parameters.
  543. type
  544. TRegexReplaceOption = (
  545. rroModifierI,
  546. rroModifierR,
  547. rroModifierS,
  548. rroModifierG,
  549. rroModifierM,
  550. rroModifierX,
  551. rroUseSubstitution,
  552. rroUseOsLineEnd
  553. );
  554. TRegexReplaceOptions = set of TRegexReplaceOption;
  555. function ReplaceRegExpr(const ARegExpr, AInputStr, AReplaceStr: RegExprString;
  556. Options: TRegexReplaceOptions): RegExprString; overload;
  557. // Replace all metachars with its safe representation,
  558. // for example 'abc$cd.(' converts into 'abc\$cd\.\('
  559. // This function useful for r.e. autogeneration from
  560. // user input
  561. function QuoteRegExprMetaChars(const AStr: RegExprString): RegExprString;
  562. // Makes list of subexpressions found in ARegExpr r.e.
  563. // In ASubExps every item represent subexpression,
  564. // from first to last, in format:
  565. // String - subexpression text (without '()')
  566. // low word of Object - starting position in ARegExpr, including '('
  567. // if exists! (first position is 1)
  568. // high word of Object - length, including starting '(' and ending ')'
  569. // if exist!
  570. // AExtendedSyntax - must be True if modifier /m will be On while
  571. // using the r.e.
  572. // Useful for GUI editors of r.e. etc (You can find example of using
  573. // in TestRExp.dpr project)
  574. // Returns
  575. // 0 Success. No unbalanced brackets was found;
  576. // -1 There are not enough closing brackets ')';
  577. // -(n+1) At position n was found opening '[' without //###0.942
  578. // corresponding closing ']';
  579. // n At position n was found closing bracket ')' without
  580. // corresponding opening '('.
  581. // If Result <> 0, then ASubExpr can contain empty items or illegal ones
  582. function RegExprSubExpressions(const ARegExpr: string; ASubExprs: TStrings;
  583. AExtendedSyntax: boolean= False): integer;
  584. implementation
  585. {$IFDEF UnicodeWordDetection}
  586. uses
  587. UnicodeData;
  588. {$ENDIF}
  589. const
  590. // TRegExpr.VersionMajor/Minor return values of these constants:
  591. REVersionMajor = 0;
  592. REVersionMinor = 987;
  593. OpKind_End = REChar(1);
  594. OpKind_MetaClass = REChar(2);
  595. OpKind_Range = REChar(3);
  596. OpKind_Char = REChar(4);
  597. RegExprAllSet = [0 .. 255];
  598. RegExprDigitSet = [Ord('0') .. Ord('9')];
  599. RegExprLowerAzSet = [Ord('a') .. Ord('z')];
  600. RegExprUpperAzSet = [Ord('A') .. Ord('Z')];
  601. RegExprAllAzSet = RegExprLowerAzSet + RegExprUpperAzSet;
  602. RegExprLineSeparatorsSet = [$d, $a, $b, $c] {$IFDEF UniCode} + [$85] {$ENDIF};
  603. RegExprHorzSeparatorsSet = [9, $20, $A0];
  604. MaxBracesArg = $7FFFFFFF - 1; // max value for {n,m} arguments //###0.933
  605. type
  606. TRENextOff = PtrInt;
  607. // internal Next "pointer" (offset to current p-code) //###0.933
  608. PRENextOff = ^TRENextOff;
  609. // used for extracting Next "pointers" from compiled r.e. //###0.933
  610. TREBracesArg = integer; // type of {m,n} arguments
  611. PREBracesArg = ^TREBracesArg;
  612. const
  613. REOpSz = SizeOf(TREOp) div SizeOf(REChar);
  614. // size of OP_ command in REChars
  615. {$IFDEF FPC_REQUIRES_PROPER_ALIGNMENT}
  616. // add space for aligning pointer
  617. // -1 is the correct max size but also needed for InsertOperator that needs a multiple of pointer size
  618. RENextOffSz = (2 * SizeOf(TRENextOff) div SizeOf(REChar)) - 1;
  619. REBracesArgSz = (2 * SizeOf(TREBracesArg) div SizeOf(REChar));
  620. // add space for aligning pointer
  621. {$ELSE}
  622. RENextOffSz = (SizeOf(TRENextOff) div SizeOf(REChar));
  623. // size of Next pointer in REChars
  624. REBracesArgSz = SizeOf(TREBracesArg) div SizeOf(REChar);
  625. // size of BRACES arguments in REChars
  626. {$ENDIF}
  627. RENumberSz = SizeOf(LongInt) div SizeOf(REChar);
  628. function _FindCharInBuffer(SBegin, SEnd: PRegExprChar; Ch: REChar): PRegExprChar; {$IFDEF InlineFuncs}inline;{$ENDIF}
  629. begin
  630. while SBegin < SEnd do
  631. begin
  632. if SBegin^ = Ch then
  633. begin
  634. Result := SBegin;
  635. Exit;
  636. end;
  637. Inc(SBegin);
  638. end;
  639. Result := nil;
  640. end;
  641. function IsIgnoredChar(AChar: REChar): boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
  642. begin
  643. case AChar of
  644. ' ', #9, #$d, #$a:
  645. Result := True
  646. else
  647. Result := False;
  648. end;
  649. end;
  650. function _IsMetaChar(AChar: REChar): boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
  651. begin
  652. case AChar of
  653. 'd', 'D',
  654. 's', 'S',
  655. 'w', 'W',
  656. 'v', 'V',
  657. 'h', 'H':
  658. Result := True
  659. else
  660. Result := False;
  661. end;
  662. end;
  663. function AlignToPtr(const p: Pointer): Pointer; {$IFDEF InlineFuncs}inline;{$ENDIF}
  664. begin
  665. {$IFDEF FPC_REQUIRES_PROPER_ALIGNMENT}
  666. Result := Align(p, SizeOf(Pointer));
  667. {$ELSE}
  668. Result := p;
  669. {$ENDIF}
  670. end;
  671. function AlignToInt(const p: Pointer): Pointer; {$IFDEF InlineFuncs}inline;{$ENDIF}
  672. begin
  673. {$IFDEF FPC_REQUIRES_PROPER_ALIGNMENT}
  674. Result := Align(p, SizeOf(integer));
  675. {$ELSE}
  676. Result := p;
  677. {$ENDIF}
  678. end;
  679. function _UpperCase(Ch: REChar): REChar;
  680. begin
  681. Result := Ch;
  682. if (Ch >= 'a') and (Ch <= 'z') then
  683. begin
  684. Dec(Result, 32);
  685. Exit;
  686. end;
  687. if Ord(Ch) < 128 then
  688. Exit;
  689. {$IFDEF FPC}
  690. {$IFDEF UniCode}
  691. Result := UnicodeUpperCase(Ch)[1];
  692. {$ELSE}
  693. Result := AnsiUpperCase(Ch)[1];
  694. {$ENDIF}
  695. {$ELSE}
  696. {$IFDEF UniCode}
  697. {$IFDEF D2009}
  698. Result := TCharacter.ToUpper(Ch);
  699. {$ENDIF}
  700. {$ELSE}
  701. Result := AnsiUpperCase(Ch)[1];
  702. {$ENDIF}
  703. {$ENDIF}
  704. end;
  705. function _LowerCase(Ch: REChar): REChar;
  706. begin
  707. Result := Ch;
  708. if (Ch >= 'A') and (Ch <= 'Z') then
  709. begin
  710. Inc(Result, 32);
  711. Exit;
  712. end;
  713. if Ord(Ch) < 128 then
  714. Exit;
  715. {$IFDEF FPC}
  716. {$IFDEF UniCode}
  717. Result := UnicodeLowerCase(Ch)[1];
  718. {$ELSE}
  719. Result := AnsiLowerCase(Ch)[1];
  720. {$ENDIF}
  721. {$ELSE}
  722. {$IFDEF UniCode}
  723. {$IFDEF D2009}
  724. Result := TCharacter.ToLower(Ch);
  725. {$ENDIF}
  726. {$ELSE}
  727. Result := AnsiLowerCase(Ch)[1];
  728. {$ENDIF}
  729. {$ENDIF}
  730. end;
  731. { ============================================================= }
  732. { ===================== Global functions ====================== }
  733. { ============================================================= }
  734. function IsModifiersEqual(const A, B: TRegExprModifiers): boolean;
  735. begin
  736. Result :=
  737. (A.I = B.I) and
  738. (A.G = B.G) and
  739. (A.M = B.M) and
  740. (A.S = B.S) and
  741. (A.R = B.R) and
  742. (A.X = B.X);
  743. end;
  744. function ParseModifiers(const APtr: PRegExprChar;
  745. ALen: integer;
  746. var AValue: TRegExprModifiers): boolean;
  747. // Parse string and set AValue if it's in format 'ismxrg-ismxrg'
  748. var
  749. IsOn: boolean;
  750. i: integer;
  751. begin
  752. Result := True;
  753. IsOn := True;
  754. for i := 0 to ALen-1 do
  755. case APtr[i] of
  756. '-':
  757. IsOn := False;
  758. 'I', 'i':
  759. AValue.I := IsOn;
  760. 'R', 'r':
  761. AValue.R := IsOn;
  762. 'S', 's':
  763. AValue.S := IsOn;
  764. 'G', 'g':
  765. AValue.G := IsOn;
  766. 'M', 'm':
  767. AValue.M := IsOn;
  768. 'X', 'x':
  769. AValue.X := IsOn;
  770. else
  771. begin
  772. Result := False;
  773. Exit;
  774. end;
  775. end;
  776. end;
  777. function ExecRegExpr(const ARegExpr, AInputStr: RegExprString): boolean;
  778. var
  779. r: TRegExpr;
  780. begin
  781. r := TRegExpr.Create;
  782. try
  783. r.Expression := ARegExpr;
  784. Result := r.Exec(AInputStr);
  785. finally
  786. r.Free;
  787. end;
  788. end; { of function ExecRegExpr
  789. -------------------------------------------------------------- }
  790. procedure SplitRegExpr(const ARegExpr, AInputStr: RegExprString;
  791. APieces: TStrings);
  792. var
  793. r: TRegExpr;
  794. begin
  795. APieces.Clear;
  796. r := TRegExpr.Create;
  797. try
  798. r.Expression := ARegExpr;
  799. r.Split(AInputStr, APieces);
  800. finally
  801. r.Free;
  802. end;
  803. end; { of procedure SplitRegExpr
  804. -------------------------------------------------------------- }
  805. function ReplaceRegExpr(const ARegExpr, AInputStr, AReplaceStr: RegExprString;
  806. AUseSubstitution: boolean= False): RegExprString;
  807. begin
  808. with TRegExpr.Create do
  809. try
  810. Expression := ARegExpr;
  811. Result := Replace(AInputStr, AReplaceStr, AUseSubstitution);
  812. finally
  813. Free;
  814. end;
  815. end; { of function ReplaceRegExpr
  816. -------------------------------------------------------------- }
  817. function ReplaceRegExpr(const ARegExpr, AInputStr, AReplaceStr: RegExprString;
  818. Options: TRegexReplaceOptions): RegExprString; overload;
  819. begin
  820. with TRegExpr.Create do
  821. try
  822. ModifierI := (rroModifierI in Options);
  823. ModifierR := (rroModifierR in Options);
  824. ModifierS := (rroModifierS in Options);
  825. ModifierG := (rroModifierG in Options);
  826. ModifierM := (rroModifierM in Options);
  827. ModifierX := (rroModifierX in Options);
  828. // Set this after the above, if the regex contains modifiers, they will be applied.
  829. Expression := ARegExpr;
  830. UseOsLineEndOnReplace := (rroUseOsLineEnd in Options);
  831. Result := Replace(AInputStr, AReplaceStr, rroUseSubstitution in Options);
  832. finally
  833. Free;
  834. end;
  835. end;
  836. (*
  837. const
  838. MetaChars_Init = '^$.[()|?+*' + EscChar + '{';
  839. MetaChars = MetaChars_Init; // not needed to be a variable, const is faster
  840. MetaAll = MetaChars_Init + ']}'; // Very similar to MetaChars, but slighly changed.
  841. *)
  842. function _IsMetaSymbol1(ch: REChar): boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
  843. begin
  844. case ch of
  845. '^', '$', '.', '[', '(', ')', '|', '?', '+', '*', EscChar, '{':
  846. Result := True
  847. else
  848. Result := False
  849. end;
  850. end;
  851. function _IsMetaSymbol2(ch: REChar): boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
  852. begin
  853. case ch of
  854. '^', '$', '.', '[', '(', ')', '|', '?', '+', '*', EscChar, '{',
  855. ']', '}':
  856. Result := True
  857. else
  858. Result := False
  859. end;
  860. end;
  861. function QuoteRegExprMetaChars(const AStr: RegExprString): RegExprString;
  862. var
  863. i, i0, Len: integer;
  864. ch: REChar;
  865. begin
  866. Result := '';
  867. Len := Length(AStr);
  868. i := 1;
  869. i0 := i;
  870. while i <= Len do
  871. begin
  872. ch := AStr[i];
  873. if _IsMetaSymbol2(ch) then
  874. begin
  875. Result := Result + System.Copy(AStr, i0, i - i0) + EscChar + ch;
  876. i0 := i + 1;
  877. end;
  878. Inc(i);
  879. end;
  880. Result := Result + System.Copy(AStr, i0, MaxInt); // Tail
  881. end; { of function QuoteRegExprMetaChars
  882. -------------------------------------------------------------- }
  883. function RegExprSubExpressions(const ARegExpr: string; ASubExprs: TStrings;
  884. AExtendedSyntax: boolean = False): integer;
  885. type
  886. TStackItemRec = record // ###0.945
  887. SubExprIdx: integer;
  888. StartPos: PtrInt;
  889. end;
  890. TStackArray = packed array [0 .. NSUBEXPMAX - 1] of TStackItemRec;
  891. var
  892. Len, SubExprLen: integer;
  893. i, i0: integer;
  894. Modif: TRegExprModifiers;
  895. Stack: ^TStackArray; // ###0.945
  896. StackIdx, StackSz: integer;
  897. begin
  898. Result := 0; // no unbalanced brackets found at this very moment
  899. Modif:=Default(TRegExprModifiers);
  900. ASubExprs.Clear; // I don't think that adding to non empty list
  901. // can be useful, so I simplified algorithm to work only with empty list
  902. Len := Length(ARegExpr); // some optimization tricks
  903. // first we have to calculate number of subexpression to reserve
  904. // space in Stack array (may be we'll reserve more than needed, but
  905. // it's faster then memory reallocation during parsing)
  906. StackSz := 1; // add 1 for entire r.e.
  907. for i := 1 to Len do
  908. if ARegExpr[i] = '(' then
  909. Inc(StackSz);
  910. // SetLength (Stack, StackSz); //###0.945
  911. GetMem(Stack, SizeOf(TStackItemRec) * StackSz);
  912. try
  913. StackIdx := 0;
  914. i := 1;
  915. while (i <= Len) do
  916. begin
  917. case ARegExpr[i] of
  918. '(':
  919. begin
  920. if (i < Len) and (ARegExpr[i + 1] = '?') then
  921. begin
  922. // this is not subexpression, but comment or other
  923. // Perl extension. We must check is it (?ismxrg-ismxrg)
  924. // and change AExtendedSyntax if /x is changed.
  925. Inc(i, 2); // skip '(?'
  926. i0 := i;
  927. while (i <= Len) and (ARegExpr[i] <> ')') do
  928. Inc(i);
  929. if i > Len then
  930. Result := -1 // unbalansed '('
  931. else
  932. if ParseModifiers(@ARegExpr[i0], i - i0, Modif) then
  933. // Alexey-T: original code had copy from i, not from i0
  934. AExtendedSyntax := Modif.X;
  935. end
  936. else
  937. begin // subexpression starts
  938. ASubExprs.Add(''); // just reserve space
  939. with Stack[StackIdx] do
  940. begin
  941. SubExprIdx := ASubExprs.Count - 1;
  942. StartPos := i;
  943. end;
  944. Inc(StackIdx);
  945. end;
  946. end;
  947. ')':
  948. begin
  949. if StackIdx = 0 then
  950. Result := i // unbalanced ')'
  951. else
  952. begin
  953. Dec(StackIdx);
  954. with Stack[StackIdx] do
  955. begin
  956. SubExprLen := i - StartPos + 1;
  957. ASubExprs.Objects[SubExprIdx] :=
  958. TObject(StartPos or (SubExprLen ShL 16));
  959. ASubExprs[SubExprIdx] := System.Copy(ARegExpr, StartPos + 1,
  960. SubExprLen - 2); // add without brackets
  961. end;
  962. end;
  963. end;
  964. EscChar:
  965. Inc(i); // skip quoted symbol
  966. '[':
  967. begin
  968. // we have to skip character ranges at once, because they can
  969. // contain '#', and '#' in it must NOT be recognized as eXtended
  970. // comment beginning!
  971. i0 := i;
  972. Inc(i);
  973. if ARegExpr[i] = ']' // first ']' inside [] treated as simple char, no need to check '['
  974. then
  975. Inc(i);
  976. while (i <= Len) and (ARegExpr[i] <> ']') do
  977. if ARegExpr[i] = EscChar // ###0.942
  978. then
  979. Inc(i, 2) // skip 'escaped' char to prevent stopping at '\]'
  980. else
  981. Inc(i);
  982. if (i > Len) or (ARegExpr[i] <> ']') // ###0.942
  983. then
  984. Result := -(i0 + 1); // unbalansed '[' //###0.942
  985. end;
  986. '#':
  987. if AExtendedSyntax then
  988. begin
  989. // skip eXtended comments
  990. while (i <= Len) and (ARegExpr[i] <> #$d) and (ARegExpr[i] <> #$a)
  991. // do not use [#$d, #$a] due to UniCode compatibility
  992. do
  993. Inc(i);
  994. while (i + 1 <= Len) and
  995. ((ARegExpr[i + 1] = #$d) or (ARegExpr[i + 1] = #$a)) do
  996. Inc(i); // attempt to work with different kinds of line separators
  997. // now we are at the line separator that must be skipped.
  998. end;
  999. // here is no 'else' clause - we simply skip ordinary chars
  1000. end; // of case
  1001. Inc(i); // skip scanned char
  1002. // ! can move after Len due to skipping quoted symbol
  1003. end;
  1004. // check brackets balance
  1005. if StackIdx <> 0 then
  1006. Result := -1; // unbalansed '('
  1007. // check if entire r.e. added
  1008. if (ASubExprs.Count = 0) or ((PtrInt(ASubExprs.Objects[0]) and $FFFF) <> 1)
  1009. or (((PtrInt(ASubExprs.Objects[0]) ShR 16) and $FFFF) <> Len)
  1010. // whole r.e. wasn't added because it isn't bracketed
  1011. // well, we add it now:
  1012. then
  1013. ASubExprs.InsertObject(0, ARegExpr, TObject((Len ShL 16) or 1));
  1014. finally
  1015. FreeMem(Stack);
  1016. end;
  1017. end; { of function RegExprSubExpressions
  1018. -------------------------------------------------------------- }
  1019. const
  1020. OP_MAGIC = TREOp(216); // programm signature
  1021. // name opcode opnd? meaning
  1022. OP_EEND = TREOp(0); // - End of program
  1023. OP_BOL = TREOp(1); // - Match "" at beginning of line
  1024. OP_EOL = TREOp(2); // - Match "" at end of line
  1025. OP_ANY = TREOp(3); // - Match any one character
  1026. OP_ANYOF = TREOp(4); // Str Match any character in string Str
  1027. OP_ANYBUT = TREOp(5); // Str Match any char. not in string Str
  1028. OP_BRANCH = TREOp(6); // Node Match this alternative, or the next
  1029. OP_BACK = TREOp(7); // - Jump backward (Next < 0)
  1030. OP_EXACTLY = TREOp(8); // Str Match string Str
  1031. OP_NOTHING = TREOp(9); // - Match empty string
  1032. OP_STAR = TREOp(10); // Node Match this (simple) thing 0 or more times
  1033. OP_PLUS = TREOp(11); // Node Match this (simple) thing 1 or more times
  1034. OP_ANYDIGIT = TREOp(12); // - Match any digit (equiv [0-9])
  1035. OP_NOTDIGIT = TREOp(13); // - Match not digit (equiv [0-9])
  1036. OP_ANYLETTER = TREOp(14); // - Match any letter from property WordChars
  1037. OP_NOTLETTER = TREOp(15); // - Match not letter from property WordChars
  1038. OP_ANYSPACE = TREOp(16); // - Match any space char (see property SpaceChars)
  1039. OP_NOTSPACE = TREOp(17); // - Match not space char (see property SpaceChars)
  1040. OP_BRACES = TREOp(18);
  1041. // Node,Min,Max Match this (simple) thing from Min to Max times.
  1042. // Min and Max are TREBracesArg
  1043. OP_COMMENT = TREOp(19); // - Comment ;)
  1044. OP_EXACTLYCI = TREOp(20); // Str Match string Str case insensitive
  1045. OP_ANYOFCI = TREOp(21);
  1046. // Str Match any character in string Str, case insensitive
  1047. OP_ANYBUTCI = TREOp(22);
  1048. // Str Match any char. not in string Str, case insensitive
  1049. OP_LOOPENTRY = TREOp(23); // Node Start of loop (Node - LOOP for this loop)
  1050. OP_LOOP = TREOp(24); // Node,Min,Max,LoopEntryJmp - back jump for LOOPENTRY.
  1051. // Min and Max are TREBracesArg
  1052. // Node - next node in sequence,
  1053. // LoopEntryJmp - associated LOOPENTRY node addr
  1054. OP_BSUBEXP = TREOp(28);
  1055. // Idx Match previously matched subexpression #Idx (stored as REChar) //###0.936
  1056. OP_BSUBEXPCI = TREOp(29); // Idx -"- in case-insensitive mode
  1057. // Non-Greedy Style Ops //###0.940
  1058. OP_STARNG = TREOp(30); // Same as OP_START but in non-greedy mode
  1059. OP_PLUSNG = TREOp(31); // Same as OP_PLUS but in non-greedy mode
  1060. OP_BRACESNG = TREOp(32); // Same as OP_BRACES but in non-greedy mode
  1061. OP_LOOPNG = TREOp(33); // Same as OP_LOOP but in non-greedy mode
  1062. // Multiline mode \m
  1063. OP_BOLML = TREOp(34); // - Match "" at beginning of line
  1064. OP_EOLML = TREOp(35); // - Match "" at end of line
  1065. OP_ANYML = TREOp(36); // - Match any one character
  1066. // Word boundary
  1067. OP_BOUND = TREOp(37); // Match "" between words //###0.943
  1068. OP_NOTBOUND = TREOp(38); // Match "" not between words //###0.943
  1069. OP_ANYHORZSEP = TREOp(39); // Any horizontal whitespace \h
  1070. OP_NOTHORZSEP = TREOp(40); // Not horizontal whitespace \H
  1071. OP_ANYVERTSEP = TREOp(41); // Any vertical whitespace \v
  1072. OP_NOTVERTSEP = TREOp(42); // Not vertical whitespace \V
  1073. // !!! Change OP_OPEN value if you add new opcodes !!!
  1074. OP_OPEN = TREOp(43); // - Mark this point in input as start of \n
  1075. // OP_OPEN + 1 is \1, etc.
  1076. OP_CLOSE = TREOp(Ord(OP_OPEN) + NSUBEXP);
  1077. // - Analogous to OP_OPEN.
  1078. // !!! Don't add new OpCodes after CLOSE !!!
  1079. // We work with p-code through pointers, compatible with PRegExprChar.
  1080. // Note: all code components (TRENextOff, TREOp, TREBracesArg, etc)
  1081. // must have lengths that can be divided by SizeOf (REChar) !
  1082. // A node is TREOp of opcode followed Next "pointer" of TRENextOff type.
  1083. // The Next is a offset from the opcode of the node containing it.
  1084. // An operand, if any, simply follows the node. (Note that much of
  1085. // the code generation knows about this implicit relationship!)
  1086. // Using TRENextOff=PtrInt speed up p-code processing.
  1087. // Opcodes description:
  1088. //
  1089. // BRANCH The set of branches constituting a single choice are hooked
  1090. // together with their "next" pointers, since precedence prevents
  1091. // anything being concatenated to any individual branch. The
  1092. // "next" pointer of the last BRANCH in a choice points to the
  1093. // thing following the whole choice. This is also where the
  1094. // final "next" pointer of each individual branch points; each
  1095. // branch starts with the operand node of a BRANCH node.
  1096. // BACK Normal "next" pointers all implicitly point forward; BACK
  1097. // exists to make loop structures possible.
  1098. // STAR,PLUS,BRACES '?', and complex '*' and '+', are implemented as
  1099. // circular BRANCH structures using BACK. Complex '{min,max}'
  1100. // - as pair LOOPENTRY-LOOP (see below). Simple cases (one
  1101. // character per match) are implemented with STAR, PLUS and
  1102. // BRACES for speed and to minimize recursive plunges.
  1103. // LOOPENTRY,LOOP {min,max} are implemented as special pair
  1104. // LOOPENTRY-LOOP. Each LOOPENTRY initialize loopstack for
  1105. // current level.
  1106. // OPEN,CLOSE are numbered at compile time.
  1107. { ============================================================= }
  1108. { ================== Error handling section =================== }
  1109. { ============================================================= }
  1110. const
  1111. reeOk = 0;
  1112. reeCompNullArgument = 100;
  1113. reeCompParseRegTooManyBrackets = 102;
  1114. reeCompParseRegUnmatchedBrackets = 103;
  1115. reeCompParseRegUnmatchedBrackets2 = 104;
  1116. reeCompParseRegJunkOnEnd = 105;
  1117. reePlusStarOperandCouldBeEmpty = 106;
  1118. reeNestedSQP = 107;
  1119. reeBadHexDigit = 108;
  1120. reeInvalidRange = 109;
  1121. reeParseAtomTrailingBackSlash = 110;
  1122. reeNoHexCodeAfterBSlashX = 111;
  1123. reeHexCodeAfterBSlashXTooBig = 112;
  1124. reeUnmatchedSqBrackets = 113;
  1125. reeInternalUrp = 114;
  1126. reeQPSBFollowsNothing = 115;
  1127. reeTrailingBackSlash = 116;
  1128. reeNoLetterAfterBSlashC = 117;
  1129. reeMetaCharAfterMinusInRange = 118;
  1130. reeRarseAtomInternalDisaster = 119;
  1131. reeIncorrectBraces = 121;
  1132. reeBRACESArgTooBig = 122;
  1133. reeUnknownOpcodeInFillFirst = 123;
  1134. reeBracesMinParamGreaterMax = 124;
  1135. reeUnclosedComment = 125;
  1136. reeComplexBracesNotImplemented = 126;
  1137. reeUnrecognizedModifier = 127;
  1138. reeBadLinePairedSeparator = 128;
  1139. // Runtime errors must be >= 1000
  1140. reeRegRepeatCalledInappropriately = 1000;
  1141. reeMatchPrimMemoryCorruption = 1001;
  1142. reeMatchPrimCorruptedPointers = 1002;
  1143. reeNoExpression = 1003;
  1144. reeCorruptedProgram = 1004;
  1145. reeNoInputStringSpecified = 1005;
  1146. reeOffsetMustBePositive = 1006;
  1147. reeExecNextWithoutExec = 1007;
  1148. reeBadOpcodeInCharClass = 1008;
  1149. reeDumpCorruptedOpcode = 1011;
  1150. reeModifierUnsupported = 1013;
  1151. reeLoopStackExceeded = 1014;
  1152. reeLoopWithoutEntry = 1015;
  1153. function TRegExpr.ErrorMsg(AErrorID: integer): RegExprString;
  1154. begin
  1155. case AErrorID of
  1156. reeOk:
  1157. Result := 'No errors';
  1158. reeCompNullArgument:
  1159. Result := 'TRegExpr compile: null argument';
  1160. reeCompParseRegTooManyBrackets:
  1161. Result := 'TRegExpr compile: ParseReg: too many ()';
  1162. reeCompParseRegUnmatchedBrackets:
  1163. Result := 'TRegExpr compile: ParseReg: unmatched ()';
  1164. reeCompParseRegUnmatchedBrackets2:
  1165. Result := 'TRegExpr compile: ParseReg: unmatched ()';
  1166. reeCompParseRegJunkOnEnd:
  1167. Result := 'TRegExpr compile: ParseReg: junk at end';
  1168. reePlusStarOperandCouldBeEmpty:
  1169. Result := 'TRegExpr compile: *+ operand could be empty';
  1170. reeNestedSQP:
  1171. Result := 'TRegExpr compile: nested *?+';
  1172. reeBadHexDigit:
  1173. Result := 'TRegExpr compile: bad hex digit';
  1174. reeInvalidRange:
  1175. Result := 'TRegExpr compile: invalid [] range';
  1176. reeParseAtomTrailingBackSlash:
  1177. Result := 'TRegExpr compile: parse atom trailing \';
  1178. reeNoHexCodeAfterBSlashX:
  1179. Result := 'TRegExpr compile: no hex code after \x';
  1180. reeNoLetterAfterBSlashC:
  1181. Result := 'TRegExpr compile: no letter "A".."Z" after \c';
  1182. reeMetaCharAfterMinusInRange:
  1183. Result := 'TRegExpr compile: metachar after "-" in [] range';
  1184. reeHexCodeAfterBSlashXTooBig:
  1185. Result := 'TRegExpr compile: hex code after \x is too big';
  1186. reeUnmatchedSqBrackets:
  1187. Result := 'TRegExpr compile: unmatched []';
  1188. reeInternalUrp:
  1189. Result := 'TRegExpr compile: internal fail on char "|", ")"';
  1190. reeQPSBFollowsNothing:
  1191. Result := 'TRegExpr compile: ?+*{ follows nothing';
  1192. reeTrailingBackSlash:
  1193. Result := 'TRegExpr compile: trailing \';
  1194. reeRarseAtomInternalDisaster:
  1195. Result := 'TRegExpr compile: RarseAtom internal disaster';
  1196. reeIncorrectBraces:
  1197. Result := 'TRegExpr compile: incorrect {} braces';
  1198. reeBRACESArgTooBig:
  1199. Result := 'TRegExpr compile: braces {} argument too big';
  1200. reeUnknownOpcodeInFillFirst:
  1201. Result := 'TRegExpr compile: unknown opcode in FillFirstCharSet ('+DumpOp(fLastErrorOpcode)+')';
  1202. reeBracesMinParamGreaterMax:
  1203. Result := 'TRegExpr compile: braces {} min param greater then max';
  1204. reeUnclosedComment:
  1205. Result := 'TRegExpr compile: unclosed (?#comment)';
  1206. reeComplexBracesNotImplemented:
  1207. Result := 'TRegExpr compile: if you use braces {} and non-greedy ops *?, +?, ?? for complex cases, enable {$DEFINE ComplexBraces}';
  1208. reeUnrecognizedModifier:
  1209. Result := 'TRegExpr compile: unrecognized modifier';
  1210. reeBadLinePairedSeparator:
  1211. Result := 'TRegExpr compile: LinePairedSeparator must countain two different chars or be empty';
  1212. reeRegRepeatCalledInappropriately:
  1213. Result := 'TRegExpr exec: RegRepeat called inappropriately';
  1214. reeMatchPrimMemoryCorruption:
  1215. Result := 'TRegExpr exec: MatchPrim memory corruption';
  1216. reeMatchPrimCorruptedPointers:
  1217. Result := 'TRegExpr exec: MatchPrim corrupted pointers';
  1218. reeNoExpression:
  1219. Result := 'TRegExpr exec: empty expression';
  1220. reeCorruptedProgram:
  1221. Result := 'TRegExpr exec: corrupted opcode (no magic byte)';
  1222. reeNoInputStringSpecified:
  1223. Result := 'TRegExpr exec: empty input string';
  1224. reeOffsetMustBePositive:
  1225. Result := 'TRegExpr exec: offset must be >0';
  1226. reeExecNextWithoutExec:
  1227. Result := 'TRegExpr exec: ExecNext without Exec(Pos)';
  1228. reeBadOpcodeInCharClass:
  1229. Result := 'TRegExpr exec: invalid opcode in char class';
  1230. reeDumpCorruptedOpcode:
  1231. Result := 'TRegExpr dump: corrupted opcode';
  1232. reeLoopStackExceeded:
  1233. Result := 'TRegExpr exec: loop stack exceeded';
  1234. reeLoopWithoutEntry:
  1235. Result := 'TRegExpr exec: loop without loop entry';
  1236. else
  1237. Result := 'Unknown error';
  1238. end;
  1239. end; { of procedure TRegExpr.Error
  1240. -------------------------------------------------------------- }
  1241. function TRegExpr.LastError: integer;
  1242. begin
  1243. Result := fLastError;
  1244. fLastError := reeOk;
  1245. end; { of function TRegExpr.LastError
  1246. -------------------------------------------------------------- }
  1247. { ============================================================= }
  1248. { ===================== Common section ======================== }
  1249. { ============================================================= }
  1250. class function TRegExpr.VersionMajor: integer;
  1251. begin
  1252. Result := REVersionMajor;
  1253. end;
  1254. class function TRegExpr.VersionMinor: integer;
  1255. begin
  1256. Result := REVersionMinor;
  1257. end;
  1258. constructor TRegExpr.Create;
  1259. begin
  1260. inherited;
  1261. programm := nil;
  1262. fExpression := '';
  1263. fInputString := '';
  1264. FEmptyInputRaisesError := False;
  1265. regexpBegin := nil;
  1266. regexpIsCompiled := False;
  1267. FillChar(fModifiers, SIzeOf(fModifiers), 0);
  1268. ModifierI := RegExprModifierI;
  1269. ModifierR := RegExprModifierR;
  1270. ModifierS := RegExprModifierS;
  1271. ModifierG := RegExprModifierG;
  1272. ModifierM := RegExprModifierM;
  1273. ModifierX := RegExprModifierX;
  1274. SpaceChars := RegExprSpaceChars; // ###0.927
  1275. WordChars := RegExprWordChars; // ###0.929
  1276. fInvertCase := RegExprInvertCaseFunction; // ###0.927
  1277. fLineSeparators := RegExprLineSeparators; // ###0.941
  1278. LinePairedSeparator := RegExprLinePairedSeparator; // ###0.941
  1279. FUseOsLineEndOnReplace := True;
  1280. FReplaceLineEnd := sLineBreak;
  1281. {$IFDEF UnicodeWordDetection}
  1282. FUseUnicodeWordDetection := True;
  1283. {$ENDIF}
  1284. fSlowChecksSizeMax := 2000;
  1285. InitLineSepArray;
  1286. InitCharCheckers;
  1287. end; { of constructor TRegExpr.Create
  1288. -------------------------------------------------------------- }
  1289. constructor TRegExpr.Create(const AExpression: RegExprString);
  1290. begin
  1291. Create;
  1292. Expression := AExpression;
  1293. end;
  1294. destructor TRegExpr.Destroy;
  1295. begin
  1296. if programm <> nil then
  1297. begin
  1298. FreeMem(programm);
  1299. programm := nil;
  1300. end;
  1301. end; { of destructor TRegExpr.Destroy
  1302. -------------------------------------------------------------- }
  1303. class function TRegExpr.InvertCaseFunction(const Ch: REChar): REChar;
  1304. begin
  1305. Result := Ch;
  1306. if (Ch >= 'a') and (Ch <= 'z') then
  1307. begin
  1308. Dec(Result, 32);
  1309. Exit;
  1310. end;
  1311. if (Ch >= 'A') and (Ch <= 'Z') then
  1312. begin
  1313. Inc(Result, 32);
  1314. Exit;
  1315. end;
  1316. if Ord(Ch) < 128 then
  1317. Exit;
  1318. Result := _UpperCase(Ch);
  1319. if Result = Ch then
  1320. Result := _LowerCase(Ch);
  1321. Result := _UpperCase(Ch);
  1322. if Result = Ch then
  1323. Result := _LowerCase(Ch);
  1324. end; { of function TRegExpr.InvertCaseFunction
  1325. -------------------------------------------------------------- }
  1326. procedure TRegExpr.SetExpression(const AStr: RegExprString);
  1327. begin
  1328. if (AStr <> fExpression) or not regexpIsCompiled then
  1329. begin
  1330. regexpIsCompiled := False;
  1331. fExpression := AStr;
  1332. UniqueString(fExpression);
  1333. fRegexStart := PRegExprChar(fExpression);
  1334. fRegexEnd := fRegexStart + Length(fExpression);
  1335. InvalidateProgramm; // ###0.941
  1336. end;
  1337. end; { of procedure TRegExpr.SetExpression
  1338. -------------------------------------------------------------- }
  1339. function TRegExpr.GetSubExprCount: integer;
  1340. begin
  1341. // if nothing found, we must return -1 per TRegExpr docs
  1342. if startp[0] = nil then
  1343. Result := -1
  1344. else
  1345. Result := GrpCount;
  1346. end;
  1347. function TRegExpr.GetMatchPos(Idx: integer): PtrInt;
  1348. begin
  1349. Idx := GrpIndexes[Idx];
  1350. if (Idx >= 0) and (startp[Idx] <> nil) then
  1351. Result := startp[Idx] - fInputStart + 1
  1352. else
  1353. Result := -1;
  1354. end; { of function TRegExpr.GetMatchPos
  1355. -------------------------------------------------------------- }
  1356. function TRegExpr.GetMatchLen(Idx: integer): PtrInt;
  1357. begin
  1358. Idx := GrpIndexes[Idx];
  1359. if (Idx >= 0) and (startp[Idx] <> nil) then
  1360. Result := endp[Idx] - startp[Idx]
  1361. else
  1362. Result := -1;
  1363. end; { of function TRegExpr.GetMatchLen
  1364. -------------------------------------------------------------- }
  1365. function TRegExpr.GetMatch(Idx: integer): RegExprString;
  1366. begin
  1367. Result := '';
  1368. Idx := GrpIndexes[Idx];
  1369. if (Idx >= 0) and (endp[Idx] > startp[Idx]) then
  1370. SetString(Result, startp[Idx], endp[Idx] - startp[Idx]);
  1371. {
  1372. // then Result := copy (fInputString, MatchPos [Idx], MatchLen [Idx]) //###0.929
  1373. then
  1374. begin
  1375. SetLength(Result, endp[Idx] - startp[Idx]);
  1376. System.Move(startp[Idx]^, Result[1], Length(Result) * SizeOf(REChar));
  1377. end;
  1378. }
  1379. end; { of function TRegExpr.GetMatch
  1380. -------------------------------------------------------------- }
  1381. function TRegExpr.GetModifierStr: RegExprString;
  1382. begin
  1383. Result := '-';
  1384. if ModifierI then
  1385. Result := 'i' + Result
  1386. else
  1387. Result := Result + 'i';
  1388. if ModifierR then
  1389. Result := 'r' + Result
  1390. else
  1391. Result := Result + 'r';
  1392. if ModifierS then
  1393. Result := 's' + Result
  1394. else
  1395. Result := Result + 's';
  1396. if ModifierG then
  1397. Result := 'g' + Result
  1398. else
  1399. Result := Result + 'g';
  1400. if ModifierM then
  1401. Result := 'm' + Result
  1402. else
  1403. Result := Result + 'm';
  1404. if ModifierX then
  1405. Result := 'x' + Result
  1406. else
  1407. Result := Result + 'x';
  1408. if Result[Length(Result)] = '-' // remove '-' if all modifiers are 'On'
  1409. then
  1410. System.Delete(Result, Length(Result), 1);
  1411. end; { of function TRegExpr.GetModifierStr
  1412. -------------------------------------------------------------- }
  1413. procedure TRegExpr.SetModifierG(AValue: boolean);
  1414. begin
  1415. fModifiers.G := AValue;
  1416. end;
  1417. procedure TRegExpr.SetModifierI(AValue: boolean);
  1418. begin
  1419. fModifiers.I := AValue;
  1420. end;
  1421. procedure TRegExpr.SetModifierM(AValue: boolean);
  1422. begin
  1423. fModifiers.M := AValue;
  1424. end;
  1425. procedure TRegExpr.SetModifierR(AValue: boolean);
  1426. begin
  1427. fModifiers.R := AValue;
  1428. end;
  1429. procedure TRegExpr.SetModifierS(AValue: boolean);
  1430. begin
  1431. fModifiers.S := AValue;
  1432. end;
  1433. procedure TRegExpr.SetModifierX(AValue: boolean);
  1434. begin
  1435. fModifiers.X := AValue;
  1436. end;
  1437. procedure TRegExpr.SetModifierStr(const AStr: RegExprString);
  1438. begin
  1439. if not ParseModifiers(PRegExprChar(AStr), Length(AStr), fModifiers) then
  1440. Error(reeModifierUnsupported);
  1441. end; { of procedure TRegExpr.SetModifierStr
  1442. -------------------------------------------------------------- }
  1443. { ============================================================= }
  1444. { ==================== Compiler section ======================= }
  1445. { ============================================================= }
  1446. {$IFDEF UnicodeWordDetection}
  1447. {$IFDEF FPC}
  1448. function IsUnicodeWordChar(AChar: WideChar): boolean; inline;
  1449. var
  1450. NType: byte;
  1451. begin
  1452. if Ord(AChar) >= LOW_SURROGATE_BEGIN then
  1453. Exit(False);
  1454. NType := GetProps(Ord(AChar))^.Category;
  1455. Result := (NType <= UGC_OtherNumber);
  1456. end;
  1457. {$ELSE}
  1458. function IsUnicodeWordChar(AChar: WideChar): boolean; inline;
  1459. begin
  1460. Result := System.Character.IsLetterOrDigit(AChar);
  1461. end;
  1462. {$ENDIF}
  1463. {$ENDIF}
  1464. function TRegExpr.IsWordChar(AChar: REChar): boolean;
  1465. begin
  1466. Result := Pos(AChar, fWordChars) > 0;
  1467. {$IFDEF UnicodeWordDetection}
  1468. if not Result and (Ord(AChar) >= 128) and UseUnicodeWordDetection then
  1469. Result := IsUnicodeWordChar(AChar);
  1470. {$ENDIF}
  1471. end;
  1472. function TRegExpr.IsSpaceChar(AChar: REChar): boolean;
  1473. begin
  1474. Result := Pos(AChar, fSpaceChars) > 0;
  1475. end;
  1476. function TRegExpr.IsCustomLineSeparator(AChar: REChar): boolean;
  1477. begin
  1478. {$IFDEF UniCode}
  1479. Result := Pos(AChar, fLineSeparators) > 0;
  1480. {$ELSE}
  1481. Result := fLineSepArray[byte(AChar)];
  1482. {$ENDIF}
  1483. end;
  1484. function IsDigitChar(AChar: REChar): boolean; inline;
  1485. begin
  1486. case AChar of
  1487. '0' .. '9':
  1488. Result := True;
  1489. else
  1490. Result := False;
  1491. end;
  1492. end;
  1493. function IsHorzSeparator(AChar: REChar): boolean; inline;
  1494. begin
  1495. // Tab and Unicode categoty "Space Separator": https://www.compart.com/en/unicode/category/Zs
  1496. case AChar of
  1497. #9, #$20, #$A0:
  1498. Result := True;
  1499. {$IFDEF UniCode}
  1500. #$1680, #$2000 .. #$200A, #$202F, #$205F, #$3000:
  1501. Result := True;
  1502. {$ENDIF}
  1503. else
  1504. Result := False;
  1505. end;
  1506. end;
  1507. function IsLineSeparator(AChar: REChar): boolean; inline;
  1508. begin
  1509. case AChar of
  1510. #$d, #$a, #$b, #$c:
  1511. Result := True;
  1512. {$IFDEF UniCode}
  1513. #$2028, #$2029, #$85:
  1514. Result := True;
  1515. {$ENDIF}
  1516. else
  1517. Result := False;
  1518. end;
  1519. end;
  1520. procedure TRegExpr.InvalidateProgramm;
  1521. begin
  1522. if programm <> nil then
  1523. begin
  1524. FreeMem(programm);
  1525. programm := nil;
  1526. end;
  1527. end; { of procedure TRegExpr.InvalidateProgramm
  1528. -------------------------------------------------------------- }
  1529. procedure TRegExpr.Compile;
  1530. begin
  1531. if fExpression = '' then
  1532. begin
  1533. Error(reeNoExpression);
  1534. Exit;
  1535. end;
  1536. CompileRegExpr(PRegExprChar(fExpression));
  1537. end; { of procedure TRegExpr.Compile
  1538. -------------------------------------------------------------- }
  1539. procedure TRegExpr.InitLineSepArray;
  1540. {$IFNDEF UniCode}
  1541. var
  1542. i: integer;
  1543. {$ENDIF}
  1544. begin
  1545. {$IFNDEF UniCode}
  1546. FillChar(fLineSepArray, SizeOf(fLineSepArray), 0);
  1547. for i := 1 to Length(fLineSeparators) do
  1548. fLineSepArray[byte(fLineSeparators[i])] := True;
  1549. {$ENDIF}
  1550. end;
  1551. function TRegExpr.IsProgrammOk: boolean;
  1552. begin
  1553. Result := False;
  1554. // check modifiers
  1555. if not IsModifiersEqual(fModifiers, fProgModifiers) // ###0.941
  1556. then
  1557. InvalidateProgramm;
  1558. // [Re]compile if needed
  1559. if programm = nil then
  1560. begin
  1561. Compile; // ###0.941
  1562. // Check [re]compiled programm
  1563. if programm = nil then
  1564. Exit; // error was set/raised by Compile (was reeExecAfterCompErr)
  1565. end;
  1566. if programm[0] <> OP_MAGIC // Program corrupted.
  1567. then
  1568. Error(reeCorruptedProgram)
  1569. else
  1570. Result := True;
  1571. end; { of function TRegExpr.IsProgrammOk
  1572. -------------------------------------------------------------- }
  1573. procedure TRegExpr.Tail(p: PRegExprChar; val: PRegExprChar);
  1574. // set the next-pointer at the end of a node chain
  1575. var
  1576. scan: PRegExprChar;
  1577. temp: PRegExprChar;
  1578. begin
  1579. if p = @regdummy then
  1580. Exit;
  1581. // Find last node.
  1582. scan := p;
  1583. repeat
  1584. temp := regnext(scan);
  1585. if temp = nil then
  1586. Break;
  1587. scan := temp;
  1588. until False;
  1589. // Set Next 'pointer'
  1590. if val < scan then
  1591. PRENextOff(AlignToPtr(scan + REOpSz))^ := -(scan - val) // ###0.948
  1592. // work around PWideChar subtraction bug (Delphi uses
  1593. // shr after subtraction to calculate widechar distance %-( )
  1594. // so, if difference is negative we have .. the "feature" :(
  1595. // I could wrap it in $IFDEF UniCode, but I didn't because
  1596. // "P – Q computes the difference between the address given
  1597. // by P (the higher address) and the address given by Q (the
  1598. // lower address)" - Delphi help quotation.
  1599. else
  1600. PRENextOff(AlignToPtr(scan + REOpSz))^ := val - scan; // ###0.933
  1601. end; { of procedure TRegExpr.Tail
  1602. -------------------------------------------------------------- }
  1603. procedure TRegExpr.OpTail(p: PRegExprChar; val: PRegExprChar);
  1604. // regtail on operand of first argument; nop if operandless
  1605. begin
  1606. // "Operandless" and "op != OP_BRANCH" are synonymous in practice.
  1607. if (p = nil) or (p = @regdummy) or (PREOp(p)^ <> OP_BRANCH) then
  1608. Exit;
  1609. Tail(p + REOpSz + RENextOffSz, val); // ###0.933
  1610. end; { of procedure TRegExpr.OpTail
  1611. -------------------------------------------------------------- }
  1612. function TRegExpr.EmitNode(op: TREOp): PRegExprChar; // ###0.933
  1613. // emit a node, return location
  1614. begin
  1615. Result := regcode;
  1616. if Result <> @regdummy then
  1617. begin
  1618. PREOp(regcode)^ := op;
  1619. Inc(regcode, REOpSz);
  1620. PRENextOff(AlignToPtr(regcode))^ := 0; // Next "pointer" := nil
  1621. Inc(regcode, RENextOffSz);
  1622. if (op = OP_EXACTLY) or (op = OP_EXACTLYCI) then
  1623. regExactlyLen := PLongInt(regcode)
  1624. else
  1625. regExactlyLen := nil;
  1626. {$IFDEF DebugSynRegExpr}
  1627. if regcode - programm > regsize then
  1628. raise Exception.Create('TRegExpr.EmitNode buffer overrun');
  1629. {$ENDIF}
  1630. end
  1631. else
  1632. Inc(regsize, REOpSz + RENextOffSz);
  1633. // compute code size without code generation
  1634. end; { of function TRegExpr.EmitNode
  1635. -------------------------------------------------------------- }
  1636. procedure TRegExpr.EmitC(ch: REChar); {$IFDEF InlineFuncs}inline;{$ENDIF}
  1637. begin
  1638. if regcode <> @regdummy then
  1639. begin
  1640. regcode^ := ch;
  1641. Inc(regcode);
  1642. {$IFDEF DebugSynRegExpr}
  1643. if regcode - programm > regsize then
  1644. raise Exception.Create('TRegExpr.EmitC buffer overrun');
  1645. {$ENDIF}
  1646. end
  1647. else
  1648. Inc(regsize, REOpSz); // Type of p-code pointer always is ^REChar
  1649. end; { of procedure TRegExpr.EmitC
  1650. -------------------------------------------------------------- }
  1651. procedure TRegExpr.EmitInt(AValue: LongInt); {$IFDEF InlineFuncs}inline;{$ENDIF}
  1652. begin
  1653. if regcode <> @regdummy then
  1654. begin
  1655. PLongInt(regcode)^ := AValue;
  1656. Inc(regcode, RENumberSz);
  1657. {$IFDEF DebugSynRegExpr}
  1658. if regcode - programm > regsize then
  1659. raise Exception.Create('TRegExpr.EmitInt buffer overrun');
  1660. {$ENDIF}
  1661. end
  1662. else
  1663. Inc(regsize, RENumberSz);
  1664. end;
  1665. procedure TRegExpr.InsertOperator(op: TREOp; opnd: PRegExprChar; sz: integer);
  1666. // insert an operator in front of already-emitted operand
  1667. // Means relocating the operand.
  1668. var
  1669. src, dst, place: PRegExprChar;
  1670. i: integer;
  1671. begin
  1672. if regcode = @regdummy then
  1673. begin
  1674. Inc(regsize, sz);
  1675. Exit;
  1676. end;
  1677. // move code behind insert position
  1678. src := regcode;
  1679. Inc(regcode, sz);
  1680. {$IFDEF DebugSynRegExpr}
  1681. if regcode - programm > regsize then
  1682. raise Exception.Create('TRegExpr.InsertOperator buffer overrun');
  1683. // if (opnd<regcode) or (opnd-regcode>regsize) then
  1684. // raise Exception.Create('TRegExpr.InsertOperator invalid opnd');
  1685. {$ENDIF}
  1686. dst := regcode;
  1687. while src > opnd do
  1688. begin
  1689. Dec(dst);
  1690. Dec(src);
  1691. dst^ := src^;
  1692. end;
  1693. place := opnd; // Op node, where operand used to be.
  1694. PREOp(place)^ := op;
  1695. Inc(place, REOpSz);
  1696. for i := 1 + REOpSz to sz do
  1697. begin
  1698. place^ := #0;
  1699. Inc(place);
  1700. end;
  1701. end; { of procedure TRegExpr.InsertOperator
  1702. -------------------------------------------------------------- }
  1703. function FindSkippedMetaLen(PStart, PEnd: PRegExprChar): integer; {$IFDEF InlineFuncs}inline;{$ENDIF}
  1704. // find length of initial segment of PStart string consisting
  1705. // entirely of characters not from IsMetaSymbol1.
  1706. begin
  1707. Result := 0;
  1708. while PStart < PEnd do
  1709. begin
  1710. if _IsMetaSymbol1(PStart^) then
  1711. Exit;
  1712. Inc(Result);
  1713. Inc(PStart)
  1714. end;
  1715. end;
  1716. const
  1717. // Flags to be passed up and down.
  1718. flag_HasWidth = 01; // Known never to match nil string.
  1719. flag_Simple = 02; // Simple enough to be OP_STAR/OP_PLUS/OP_BRACES operand.
  1720. flag_SpecStart = 04; // Starts with * or +.
  1721. flag_Worst = 0; // Worst case.
  1722. {$IFDEF UniCode}
  1723. RusRangeLoLow = #$430; // 'а'
  1724. RusRangeLoHigh = #$44F; // 'я'
  1725. RusRangeHiLow = #$410; // 'А'
  1726. RusRangeHiHigh = #$42F; // 'Я'
  1727. {$ELSE}
  1728. RusRangeLoLow = #$E0; // 'а' in cp1251
  1729. RusRangeLoHigh = #$FF; // 'я' in cp1251
  1730. RusRangeHiLow = #$C0; // 'А' in cp1251
  1731. RusRangeHiHigh = #$DF; // 'Я' in cp1251
  1732. {$ENDIF}
  1733. function TRegExpr.FindInCharClass(ABuffer: PRegExprChar; AChar: REChar; AIgnoreCase: boolean): boolean;
  1734. // Buffer contains char pairs: (Kind, Data), where Kind is one of OpKind_ values,
  1735. // and Data depends on Kind
  1736. var
  1737. ch, ch2: REChar;
  1738. N, i: integer;
  1739. begin
  1740. if AIgnoreCase then
  1741. AChar := _UpperCase(AChar);
  1742. repeat
  1743. case ABuffer^ of
  1744. OpKind_End:
  1745. begin
  1746. Result := False;
  1747. Exit;
  1748. end;
  1749. OpKind_Range:
  1750. begin
  1751. Inc(ABuffer);
  1752. ch := ABuffer^;
  1753. Inc(ABuffer);
  1754. ch2 := ABuffer^;
  1755. Inc(ABuffer);
  1756. {
  1757. // if AIgnoreCase, ch, ch2 are upcased in opcode
  1758. if AIgnoreCase then
  1759. begin
  1760. ch := _UpperCase(ch);
  1761. ch2 := _UpperCase(ch2);
  1762. end;
  1763. }
  1764. if (AChar >= ch) and (AChar <= ch2) then
  1765. begin
  1766. Result := True;
  1767. Exit;
  1768. end;
  1769. end;
  1770. OpKind_MetaClass:
  1771. begin
  1772. Inc(ABuffer);
  1773. N := Ord(ABuffer^);
  1774. Inc(ABuffer);
  1775. if CharCheckers[N](AChar) then
  1776. begin
  1777. Result := True;
  1778. Exit
  1779. end;
  1780. end;
  1781. OpKind_Char:
  1782. begin
  1783. Inc(ABuffer);
  1784. N := PLongInt(ABuffer)^;
  1785. Inc(ABuffer, RENumberSz);
  1786. for i := 1 to N do
  1787. begin
  1788. ch := ABuffer^;
  1789. Inc(ABuffer);
  1790. {
  1791. // already upcased in opcode
  1792. if AIgnoreCase then
  1793. ch := _UpperCase(ch);
  1794. }
  1795. if ch = AChar then
  1796. begin
  1797. Result := True;
  1798. Exit;
  1799. end;
  1800. end;
  1801. end;
  1802. else
  1803. Error(reeBadOpcodeInCharClass);
  1804. end;
  1805. until False; // assume that Buffer is ended correctly
  1806. end;
  1807. procedure TRegExpr.GetCharSetFromWordChars(var ARes: TRegExprCharset);
  1808. var
  1809. i: integer;
  1810. ch: REChar;
  1811. begin
  1812. ARes := [];
  1813. for i := 1 to Length(fWordChars) do
  1814. begin
  1815. ch := fWordChars[i];
  1816. {$IFDEF UniCode}
  1817. if Ord(ch) <= $FF then
  1818. {$ENDIF}
  1819. Include(ARes, byte(ch));
  1820. end;
  1821. end;
  1822. procedure TRegExpr.GetCharSetFromSpaceChars(var ARes: TRegExprCharset);
  1823. var
  1824. i: integer;
  1825. ch: REChar;
  1826. begin
  1827. ARes := [];
  1828. for i := 1 to Length(fSpaceChars) do
  1829. begin
  1830. ch := fSpaceChars[i];
  1831. {$IFDEF UniCode}
  1832. if Ord(ch) <= $FF then
  1833. {$ENDIF}
  1834. Include(ARes, byte(ch));
  1835. end;
  1836. end;
  1837. procedure TRegExpr.GetCharSetFromCharClass(ABuffer: PRegExprChar; AIgnoreCase: boolean; var ARes: TRegExprCharset);
  1838. var
  1839. ch, ch2: REChar;
  1840. TempSet: TRegExprCharSet;
  1841. N, i: integer;
  1842. begin
  1843. ARes := [];
  1844. TempSet := [];
  1845. repeat
  1846. case ABuffer^ of
  1847. OpKind_End:
  1848. Exit;
  1849. OpKind_Range:
  1850. begin
  1851. Inc(ABuffer);
  1852. ch := ABuffer^;
  1853. Inc(ABuffer);
  1854. ch2 := ABuffer^;
  1855. Inc(ABuffer);
  1856. for i := Ord(ch) to
  1857. {$IFDEF UniCode} Min(Ord(ch2), $FF) {$ELSE} Ord(ch2) {$ENDIF} do
  1858. begin
  1859. Include(ARes, byte(i));
  1860. if AIgnoreCase then
  1861. Include(ARes, byte(InvertCase(REChar(i))));
  1862. end;
  1863. end;
  1864. OpKind_MetaClass:
  1865. begin
  1866. Inc(ABuffer);
  1867. N := Ord(ABuffer^);
  1868. Inc(ABuffer);
  1869. if N = CheckerIndex_Word then
  1870. begin
  1871. GetCharSetFromWordChars(TempSet);
  1872. ARes := ARes + TempSet;
  1873. end
  1874. else
  1875. if N = CheckerIndex_NotWord then
  1876. begin
  1877. GetCharSetFromWordChars(TempSet);
  1878. ARes := ARes + (RegExprAllSet - TempSet);
  1879. end
  1880. else
  1881. if N = CheckerIndex_Space then
  1882. begin
  1883. GetCharSetFromSpaceChars(TempSet);
  1884. ARes := ARes + TempSet;
  1885. end
  1886. else
  1887. if N = CheckerIndex_NotSpace then
  1888. begin
  1889. GetCharSetFromSpaceChars(TempSet);
  1890. ARes := ARes + (RegExprAllSet - TempSet);
  1891. end
  1892. else
  1893. if N = CheckerIndex_Digit then
  1894. ARes := ARes + RegExprDigitSet
  1895. else
  1896. if N = CheckerIndex_NotDigit then
  1897. ARes := ARes + (RegExprAllSet - RegExprDigitSet)
  1898. else
  1899. if N = CheckerIndex_VertSep then
  1900. ARes := ARes + RegExprLineSeparatorsSet
  1901. else
  1902. if N = CheckerIndex_NotVertSep then
  1903. ARes := ARes + (RegExprAllSet - RegExprLineSeparatorsSet)
  1904. else
  1905. if N = CheckerIndex_HorzSep then
  1906. ARes := ARes + RegExprHorzSeparatorsSet
  1907. else
  1908. if N = CheckerIndex_NotHorzSep then
  1909. ARes := ARes + (RegExprAllSet - RegExprHorzSeparatorsSet)
  1910. else
  1911. if N = CheckerIndex_LowerAZ then
  1912. begin
  1913. if AIgnoreCase then
  1914. ARes := ARes + RegExprAllAzSet
  1915. else
  1916. ARes := ARes + RegExprLowerAzSet;
  1917. end
  1918. else
  1919. if N = CheckerIndex_UpperAZ then
  1920. begin
  1921. if AIgnoreCase then
  1922. ARes := ARes + RegExprAllAzSet
  1923. else
  1924. ARes := ARes + RegExprUpperAzSet;
  1925. end
  1926. else
  1927. Error(reeBadOpcodeInCharClass);
  1928. end;
  1929. OpKind_Char:
  1930. begin
  1931. Inc(ABuffer);
  1932. N := PLongInt(ABuffer)^;
  1933. Inc(ABuffer, RENumberSz);
  1934. for i := 1 to N do
  1935. begin
  1936. ch := ABuffer^;
  1937. Inc(ABuffer);
  1938. {$IFDEF UniCode}
  1939. if Ord(ch) <= $FF then
  1940. {$ENDIF}
  1941. begin
  1942. Include(ARes, byte(ch));
  1943. if AIgnoreCase then
  1944. Include(ARes, byte(InvertCase(ch)));
  1945. end;
  1946. end;
  1947. end;
  1948. else
  1949. Error(reeBadOpcodeInCharClass);
  1950. end;
  1951. until False; // assume that Buffer is ended correctly
  1952. end;
  1953. function TRegExpr.GetModifierG: boolean;
  1954. begin
  1955. Result := fModifiers.G;
  1956. end;
  1957. function TRegExpr.GetModifierI: boolean;
  1958. begin
  1959. Result := fModifiers.I;
  1960. end;
  1961. function TRegExpr.GetModifierM: boolean;
  1962. begin
  1963. Result := fModifiers.M;
  1964. end;
  1965. function TRegExpr.GetModifierR: boolean;
  1966. begin
  1967. Result := fModifiers.R;
  1968. end;
  1969. function TRegExpr.GetModifierS: boolean;
  1970. begin
  1971. Result := fModifiers.S;
  1972. end;
  1973. function TRegExpr.GetModifierX: boolean;
  1974. begin
  1975. Result := fModifiers.X;
  1976. end;
  1977. function TRegExpr.CompileRegExpr(ARegExp: PRegExprChar): boolean;
  1978. // Compile a regular expression into internal code
  1979. // We can't allocate space until we know how big the compiled form will be,
  1980. // but we can't compile it (and thus know how big it is) until we've got a
  1981. // place to put the code. So we cheat: we compile it twice, once with code
  1982. // generation turned off and size counting turned on, and once "for real".
  1983. // This also means that we don't allocate space until we are sure that the
  1984. // thing really will compile successfully, and we never have to move the
  1985. // code and thus invalidate pointers into it. (Note that it has to be in
  1986. // one piece because free() must be able to free it all.)
  1987. // Beware that the optimization-preparation code in here knows about some
  1988. // of the structure of the compiled regexp.
  1989. var
  1990. scan, longest, longestTemp: PRegExprChar;
  1991. Len, LenTemp: integer;
  1992. flags: integer;
  1993. begin
  1994. Result := False; // life too dark
  1995. flags := 0;
  1996. regparse := nil; // for correct error handling
  1997. regexpBegin := ARegExp;
  1998. regExactlyLen := nil;
  1999. ClearInternalIndexes;
  2000. fLastError := reeOk;
  2001. fLastErrorOpcode := TREOp(0);
  2002. try
  2003. if programm <> nil then
  2004. begin
  2005. FreeMem(programm);
  2006. programm := nil;
  2007. end;
  2008. if ARegExp = nil then
  2009. begin
  2010. Error(reeCompNullArgument);
  2011. Exit;
  2012. end;
  2013. fProgModifiers := fModifiers;
  2014. // well, may it's paranoia. I'll check it later... !!!!!!!!
  2015. // First pass: determine size, legality.
  2016. fSecondPass := False;
  2017. fCompModifiers := fModifiers;
  2018. regparse := ARegExp;
  2019. regnpar := 1;
  2020. regsize := 0;
  2021. regcode := @regdummy;
  2022. EmitC(OP_MAGIC);
  2023. if ParseReg(0, flags) = nil then
  2024. Exit;
  2025. // Allocate space.
  2026. GetMem(programm, regsize * SizeOf(REChar));
  2027. // Second pass: emit code.
  2028. fSecondPass := True;
  2029. fCompModifiers := fModifiers;
  2030. regparse := ARegExp;
  2031. regnpar := 1;
  2032. regcode := programm;
  2033. EmitC(OP_MAGIC);
  2034. if ParseReg(0, flags) = nil then
  2035. Exit;
  2036. // Dig out information for optimizations.
  2037. {$IFDEF UseFirstCharSet} // ###0.929
  2038. FirstCharSet := [];
  2039. FillFirstCharSet(programm + REOpSz);
  2040. for Len := 0 to 255 do
  2041. FirstCharArray[Len] := byte(Len) in FirstCharSet;
  2042. {$ENDIF}
  2043. reganchored := #0;
  2044. regmust := nil;
  2045. regmustlen := 0;
  2046. regmustString := '';
  2047. scan := programm + REOpSz; // First OP_BRANCH.
  2048. if PREOp(regnext(scan))^ = OP_EEND then
  2049. begin // Only one top-level choice.
  2050. scan := scan + REOpSz + RENextOffSz;
  2051. // Starting-point info.
  2052. if PREOp(scan)^ = OP_BOL then
  2053. Inc(reganchored);
  2054. // If there's something expensive in the r.e., find the longest
  2055. // literal string that must appear and make it the regmust. Resolve
  2056. // ties in favor of later strings, since the regstart check works
  2057. // with the beginning of the r.e. and avoiding duplication
  2058. // strengthens checking. Not a strong reason, but sufficient in the
  2059. // absence of others.
  2060. if (flags and flag_SpecStart) <> 0 then
  2061. begin
  2062. longest := nil;
  2063. Len := 0;
  2064. while scan <> nil do
  2065. begin
  2066. if PREOp(scan)^ = OP_EXACTLY then
  2067. begin
  2068. longestTemp := scan + REOpSz + RENextOffSz + RENumberSz;
  2069. LenTemp := PLongInt(scan + REOpSz + RENextOffSz)^;
  2070. if LenTemp >= Len then
  2071. begin
  2072. longest := longestTemp;
  2073. Len := LenTemp;
  2074. end;
  2075. end;
  2076. scan := regnext(scan);
  2077. end;
  2078. regmust := longest;
  2079. regmustlen := Len;
  2080. if regmustlen > 1 then // don't use regmust if too short
  2081. SetString(regmustString, regmust, regmustlen);
  2082. end;
  2083. end;
  2084. Result := True;
  2085. finally
  2086. begin
  2087. if not Result then
  2088. InvalidateProgramm;
  2089. regexpBegin := nil;
  2090. regexpIsCompiled := Result; // ###0.944
  2091. end;
  2092. end;
  2093. end; { of function TRegExpr.CompileRegExpr
  2094. -------------------------------------------------------------- }
  2095. procedure TRegExpr.SetUseOsLineEndOnReplace(AValue: boolean);
  2096. begin
  2097. if FUseOsLineEndOnReplace = AValue then
  2098. Exit;
  2099. FUseOsLineEndOnReplace := AValue;
  2100. if FUseOsLineEndOnReplace then
  2101. FReplaceLineEnd := sLineBreak
  2102. else
  2103. FReplaceLineEnd := #10;
  2104. end;
  2105. function TRegExpr.ParseReg(paren: integer; var flagp: integer): PRegExprChar;
  2106. // regular expression, i.e. main body or parenthesized thing
  2107. // Caller must absorb opening parenthesis.
  2108. // Combining parenthesis handling with the base level of regular expression
  2109. // is a trifle forced, but the need to tie the tails of the branches to what
  2110. // follows makes it hard to avoid.
  2111. var
  2112. ret, br, ender: PRegExprChar;
  2113. parno: integer;
  2114. flags: integer;
  2115. SavedModifiers: TRegExprModifiers;
  2116. begin
  2117. flags := 0;
  2118. Result := nil;
  2119. flagp := flag_HasWidth; // Tentatively.
  2120. parno := 0; // eliminate compiler stupid warning
  2121. SavedModifiers := fCompModifiers;
  2122. // Make an OP_OPEN node, if parenthesized.
  2123. if paren <> 0 then
  2124. begin
  2125. if regnpar >= NSUBEXP then
  2126. begin
  2127. Error(reeCompParseRegTooManyBrackets);
  2128. Exit;
  2129. end;
  2130. parno := regnpar;
  2131. Inc(regnpar);
  2132. ret := EmitNode(TREOp(Ord(OP_OPEN) + parno));
  2133. end
  2134. else
  2135. ret := nil;
  2136. // Pick up the branches, linking them together.
  2137. br := ParseBranch(flags);
  2138. if br = nil then
  2139. begin
  2140. Result := nil;
  2141. Exit;
  2142. end;
  2143. if ret <> nil then
  2144. Tail(ret, br) // OP_OPEN -> first.
  2145. else
  2146. ret := br;
  2147. if (flags and flag_HasWidth) = 0 then
  2148. flagp := flagp and not flag_HasWidth;
  2149. flagp := flagp or flags and flag_SpecStart;
  2150. while (regparse^ = '|') do
  2151. begin
  2152. Inc(regparse);
  2153. br := ParseBranch(flags);
  2154. if br = nil then
  2155. begin
  2156. Result := nil;
  2157. Exit;
  2158. end;
  2159. Tail(ret, br); // OP_BRANCH -> OP_BRANCH.
  2160. if (flags and flag_HasWidth) = 0 then
  2161. flagp := flagp and not flag_HasWidth;
  2162. flagp := flagp or flags and flag_SpecStart;
  2163. end;
  2164. // Make a closing node, and hook it on the end.
  2165. if paren <> 0 then
  2166. ender := EmitNode(TREOp(Ord(OP_CLOSE) + parno))
  2167. else
  2168. ender := EmitNode(OP_EEND);
  2169. Tail(ret, ender);
  2170. // Hook the tails of the branches to the closing node.
  2171. br := ret;
  2172. while br <> nil do
  2173. begin
  2174. OpTail(br, ender);
  2175. br := regnext(br);
  2176. end;
  2177. // Check for proper termination.
  2178. if paren <> 0 then
  2179. if regparse^ <> ')' then
  2180. begin
  2181. Error(reeCompParseRegUnmatchedBrackets);
  2182. Exit;
  2183. end
  2184. else
  2185. Inc(regparse); // skip trailing ')'
  2186. if (paren = 0) and (regparse < fRegexEnd) then
  2187. begin
  2188. if regparse^ = ')' then
  2189. Error(reeCompParseRegUnmatchedBrackets2)
  2190. else
  2191. Error(reeCompParseRegJunkOnEnd);
  2192. Exit;
  2193. end;
  2194. fCompModifiers := SavedModifiers; // restore modifiers of parent
  2195. Result := ret;
  2196. end; { of function TRegExpr.ParseReg
  2197. -------------------------------------------------------------- }
  2198. function TRegExpr.ParseBranch(var flagp: integer): PRegExprChar;
  2199. // one alternative of an | operator
  2200. // Implements the concatenation operator.
  2201. var
  2202. ret, chain, latest: PRegExprChar;
  2203. flags: integer;
  2204. begin
  2205. flags := 0;
  2206. flagp := flag_Worst; // Tentatively.
  2207. ret := EmitNode(OP_BRANCH);
  2208. chain := nil;
  2209. while (regparse < fRegexEnd) and (regparse^ <> '|') and (regparse^ <> ')') do
  2210. begin
  2211. latest := ParsePiece(flags);
  2212. if latest = nil then
  2213. begin
  2214. Result := nil;
  2215. Exit;
  2216. end;
  2217. flagp := flagp or flags and flag_HasWidth;
  2218. if chain = nil // First piece.
  2219. then
  2220. flagp := flagp or flags and flag_SpecStart
  2221. else
  2222. Tail(chain, latest);
  2223. chain := latest;
  2224. end;
  2225. if chain = nil // Loop ran zero times.
  2226. then
  2227. EmitNode(OP_NOTHING);
  2228. Result := ret;
  2229. end; { of function TRegExpr.ParseBranch
  2230. -------------------------------------------------------------- }
  2231. function TRegExpr.ParsePiece(var flagp: integer): PRegExprChar;
  2232. // something followed by possible [*+?{]
  2233. // Note that the branching code sequences used for ? and the general cases
  2234. // of * and + and { are somewhat optimized: they use the same OP_NOTHING node as
  2235. // both the endmarker for their branch list and the body of the last branch.
  2236. // It might seem that this node could be dispensed with entirely, but the
  2237. // endmarker role is not redundant.
  2238. function ParseNumber(AStart, AEnd: PRegExprChar): TREBracesArg;
  2239. begin
  2240. Result := 0;
  2241. if AEnd - AStart + 1 > 8 then
  2242. begin // prevent stupid scanning
  2243. Error(reeBRACESArgTooBig);
  2244. Exit;
  2245. end;
  2246. while AStart <= AEnd do
  2247. begin
  2248. Result := Result * 10 + (Ord(AStart^) - Ord('0'));
  2249. Inc(AStart);
  2250. end;
  2251. if (Result > MaxBracesArg) or (Result < 0) then
  2252. begin
  2253. Error(reeBRACESArgTooBig);
  2254. Exit;
  2255. end;
  2256. end;
  2257. var
  2258. TheOp: TREOp;
  2259. NextNode: PRegExprChar;
  2260. procedure EmitComplexBraces(ABracesMin, ABracesMax: TREBracesArg; ANonGreedyOp: boolean); // ###0.940
  2261. {$IFDEF ComplexBraces}
  2262. var
  2263. off: TRENextOff;
  2264. {$ENDIF}
  2265. begin
  2266. {$IFNDEF ComplexBraces}
  2267. Error(reeComplexBracesNotImplemented);
  2268. {$ELSE}
  2269. if ANonGreedyOp then
  2270. TheOp := OP_LOOPNG
  2271. else
  2272. TheOp := OP_LOOP;
  2273. InsertOperator(OP_LOOPENTRY, Result, REOpSz + RENextOffSz);
  2274. NextNode := EmitNode(TheOp);
  2275. if regcode <> @regdummy then
  2276. begin
  2277. off := (Result + REOpSz + RENextOffSz) - (regcode - REOpSz - RENextOffSz);
  2278. // back to Atom after OP_LOOPENTRY
  2279. PREBracesArg(AlignToInt(regcode))^ := ABracesMin;
  2280. Inc(regcode, REBracesArgSz);
  2281. PREBracesArg(AlignToInt(regcode))^ := ABracesMax;
  2282. Inc(regcode, REBracesArgSz);
  2283. PRENextOff(AlignToPtr(regcode))^ := off;
  2284. Inc(regcode, RENextOffSz);
  2285. {$IFDEF DebugSynRegExpr}
  2286. if regcode - programm > regsize then
  2287. raise Exception.Create
  2288. ('TRegExpr.ParsePiece.EmitComplexBraces buffer overrun');
  2289. {$ENDIF}
  2290. end
  2291. else
  2292. Inc(regsize, REBracesArgSz * 2 + RENextOffSz);
  2293. Tail(Result, NextNode); // OP_LOOPENTRY -> OP_LOOP
  2294. if regcode <> @regdummy then
  2295. Tail(Result + REOpSz + RENextOffSz, NextNode); // Atom -> OP_LOOP
  2296. {$ENDIF}
  2297. end;
  2298. procedure EmitSimpleBraces(ABracesMin, ABracesMax: TREBracesArg; ANonGreedyOp: boolean); // ###0.940
  2299. begin
  2300. if ANonGreedyOp // ###0.940
  2301. then
  2302. TheOp := OP_BRACESNG
  2303. else
  2304. TheOp := OP_BRACES;
  2305. InsertOperator(TheOp, Result, REOpSz + RENextOffSz + REBracesArgSz * 2);
  2306. if regcode <> @regdummy then
  2307. begin
  2308. PREBracesArg(AlignToInt(Result + REOpSz + RENextOffSz))^ := ABracesMin;
  2309. PREBracesArg(AlignToInt(Result + REOpSz + RENextOffSz + REBracesArgSz))^ := ABracesMax;
  2310. end;
  2311. end;
  2312. var
  2313. op: REChar;
  2314. NonGreedyOp, NonGreedyCh: boolean; // ###0.940
  2315. flags: integer;
  2316. BracesMin, Bracesmax: TREBracesArg;
  2317. p: PRegExprChar;
  2318. begin
  2319. flags := 0;
  2320. Result := ParseAtom(flags);
  2321. if Result = nil then
  2322. Exit;
  2323. op := regparse^;
  2324. if not ((op = '*') or (op = '+') or (op = '?') or (op = '{')) then
  2325. begin
  2326. flagp := flags;
  2327. Exit;
  2328. end;
  2329. if ((flags and flag_HasWidth) = 0) and (op <> '?') then
  2330. begin
  2331. Error(reePlusStarOperandCouldBeEmpty);
  2332. Exit;
  2333. end;
  2334. case op of
  2335. '*':
  2336. begin
  2337. flagp := flag_Worst or flag_SpecStart;
  2338. NonGreedyCh := (regparse + 1)^ = '?'; // ###0.940
  2339. NonGreedyOp := NonGreedyCh or not fCompModifiers.G;
  2340. // ###0.940
  2341. if (flags and flag_Simple) = 0 then
  2342. begin
  2343. if NonGreedyOp // ###0.940
  2344. then
  2345. EmitComplexBraces(0, MaxBracesArg, NonGreedyOp)
  2346. else
  2347. begin // Emit x* as (x&|), where & means "self".
  2348. InsertOperator(OP_BRANCH, Result, REOpSz + RENextOffSz); // Either x
  2349. OpTail(Result, EmitNode(OP_BACK)); // and loop
  2350. OpTail(Result, Result); // back
  2351. Tail(Result, EmitNode(OP_BRANCH)); // or
  2352. Tail(Result, EmitNode(OP_NOTHING)); // nil.
  2353. end
  2354. end
  2355. else
  2356. begin // Simple
  2357. if NonGreedyOp // ###0.940
  2358. then
  2359. TheOp := OP_STARNG
  2360. else
  2361. TheOp := OP_STAR;
  2362. InsertOperator(TheOp, Result, REOpSz + RENextOffSz);
  2363. end;
  2364. if NonGreedyCh // ###0.940
  2365. then
  2366. Inc(regparse); // Skip extra char ('?')
  2367. end; { of case '*' }
  2368. '+':
  2369. begin
  2370. flagp := flag_Worst or flag_SpecStart or flag_HasWidth;
  2371. NonGreedyCh := (regparse + 1)^ = '?'; // ###0.940
  2372. NonGreedyOp := NonGreedyCh or not fCompModifiers.G;
  2373. // ###0.940
  2374. if (flags and flag_Simple) = 0 then
  2375. begin
  2376. if NonGreedyOp // ###0.940
  2377. then
  2378. EmitComplexBraces(1, MaxBracesArg, NonGreedyOp)
  2379. else
  2380. begin // Emit x+ as x(&|), where & means "self".
  2381. NextNode := EmitNode(OP_BRANCH); // Either
  2382. Tail(Result, NextNode);
  2383. Tail(EmitNode(OP_BACK), Result); // loop back
  2384. Tail(NextNode, EmitNode(OP_BRANCH)); // or
  2385. Tail(Result, EmitNode(OP_NOTHING)); // nil.
  2386. end
  2387. end
  2388. else
  2389. begin // Simple
  2390. if NonGreedyOp // ###0.940
  2391. then
  2392. TheOp := OP_PLUSNG
  2393. else
  2394. TheOp := OP_PLUS;
  2395. InsertOperator(TheOp, Result, REOpSz + RENextOffSz);
  2396. end;
  2397. if NonGreedyCh // ###0.940
  2398. then
  2399. Inc(regparse); // Skip extra char ('?')
  2400. end; { of case '+' }
  2401. '?':
  2402. begin
  2403. flagp := flag_Worst;
  2404. NonGreedyCh := (regparse + 1)^ = '?'; // ###0.940
  2405. NonGreedyOp := NonGreedyCh or not fCompModifiers.G;
  2406. // ###0.940
  2407. if NonGreedyOp then
  2408. begin // ###0.940 // We emit x?? as x{0,1}?
  2409. if (flags and flag_Simple) = 0 then
  2410. EmitComplexBraces(0, 1, NonGreedyOp)
  2411. else
  2412. EmitSimpleBraces(0, 1, NonGreedyOp);
  2413. end
  2414. else
  2415. begin // greedy '?'
  2416. InsertOperator(OP_BRANCH, Result, REOpSz + RENextOffSz); // Either x
  2417. Tail(Result, EmitNode(OP_BRANCH)); // or
  2418. NextNode := EmitNode(OP_NOTHING); // nil.
  2419. Tail(Result, NextNode);
  2420. OpTail(Result, NextNode);
  2421. end;
  2422. if NonGreedyCh // ###0.940
  2423. then
  2424. Inc(regparse); // Skip extra char ('?')
  2425. end; { of case '?' }
  2426. '{':
  2427. begin
  2428. Inc(regparse);
  2429. p := regparse;
  2430. while IsDigitChar(regparse^) do // <min> MUST appear
  2431. Inc(regparse);
  2432. if (regparse^ <> '}') and (regparse^ <> ',') or (p = regparse) then
  2433. begin
  2434. Error(reeIncorrectBraces);
  2435. Exit;
  2436. end;
  2437. BracesMin := ParseNumber(p, regparse - 1);
  2438. if regparse^ = ',' then
  2439. begin
  2440. Inc(regparse);
  2441. p := regparse;
  2442. while IsDigitChar(regparse^) do
  2443. Inc(regparse);
  2444. if regparse^ <> '}' then
  2445. begin
  2446. Error(reeIncorrectBraces);
  2447. Exit;
  2448. end;
  2449. if p = regparse then
  2450. Bracesmax := MaxBracesArg
  2451. else
  2452. Bracesmax := ParseNumber(p, regparse - 1);
  2453. end
  2454. else
  2455. Bracesmax := BracesMin; // {n} == {n,n}
  2456. if BracesMin > Bracesmax then
  2457. begin
  2458. Error(reeBracesMinParamGreaterMax);
  2459. Exit;
  2460. end;
  2461. if BracesMin > 0 then
  2462. flagp := flag_Worst;
  2463. if Bracesmax > 0 then
  2464. flagp := flagp or flag_HasWidth or flag_SpecStart;
  2465. NonGreedyCh := (regparse + 1)^ = '?'; // ###0.940
  2466. NonGreedyOp := NonGreedyCh or not fCompModifiers.G;
  2467. // ###0.940
  2468. if (flags and flag_Simple) <> 0 then
  2469. EmitSimpleBraces(BracesMin, Bracesmax, NonGreedyOp)
  2470. else
  2471. EmitComplexBraces(BracesMin, Bracesmax, NonGreedyOp);
  2472. if NonGreedyCh // ###0.940
  2473. then
  2474. Inc(regparse); // Skip extra char '?'
  2475. end; // of case '{'
  2476. // else // here we can't be
  2477. end; { of case op }
  2478. Inc(regparse);
  2479. op := regparse^;
  2480. if (op = '*') or (op = '+') or (op = '?') or (op = '{') then
  2481. Error(reeNestedSQP);
  2482. end; { of function TRegExpr.ParsePiece
  2483. -------------------------------------------------------------- }
  2484. function TRegExpr.HexDig(Ch: REChar): integer;
  2485. begin
  2486. case Ch of
  2487. '0' .. '9':
  2488. Result := Ord(Ch) - Ord('0');
  2489. 'a' .. 'f':
  2490. Result := Ord(Ch) - Ord('a') + 10;
  2491. 'A' .. 'F':
  2492. Result := Ord(Ch) - Ord('A') + 10;
  2493. else
  2494. begin
  2495. Result := 0;
  2496. Error(reeBadHexDigit);
  2497. end;
  2498. end;
  2499. end;
  2500. function TRegExpr.UnQuoteChar(var APtr: PRegExprChar): REChar;
  2501. var
  2502. Ch: REChar;
  2503. begin
  2504. case APtr^ of
  2505. 't':
  2506. Result := #$9; // \t => tab (HT/TAB)
  2507. 'n':
  2508. Result := #$a; // \n => newline (NL)
  2509. 'r':
  2510. Result := #$d; // \r => carriage return (CR)
  2511. 'f':
  2512. Result := #$c; // \f => form feed (FF)
  2513. 'a':
  2514. Result := #$7; // \a => alarm (bell) (BEL)
  2515. 'e':
  2516. Result := #$1b; // \e => escape (ESC)
  2517. 'c':
  2518. begin // \cK => code for Ctrl+K
  2519. Inc(APtr);
  2520. if APtr >= fRegexEnd then
  2521. Error(reeNoLetterAfterBSlashC);
  2522. Ch := APtr^;
  2523. case Ch of
  2524. 'a' .. 'z':
  2525. Result := REChar(Ord(Ch) - Ord('a') + 1);
  2526. 'A' .. 'Z':
  2527. Result := REChar(Ord(Ch) - Ord('A') + 1);
  2528. else
  2529. Error(reeNoLetterAfterBSlashC);
  2530. end;
  2531. end;
  2532. 'x':
  2533. begin // \x: hex char
  2534. Result := #0;
  2535. Inc(APtr);
  2536. if APtr >= fRegexEnd then
  2537. begin
  2538. Error(reeNoHexCodeAfterBSlashX);
  2539. Exit;
  2540. end;
  2541. if APtr^ = '{' then
  2542. begin // \x{nnnn} //###0.936
  2543. repeat
  2544. Inc(APtr);
  2545. if APtr >= fRegexEnd then
  2546. begin
  2547. Error(reeNoHexCodeAfterBSlashX);
  2548. Exit;
  2549. end;
  2550. if APtr^ <> '}' then
  2551. begin
  2552. if (Ord(Result) ShR (SizeOf(REChar) * 8 - 4)) and $F <> 0 then
  2553. begin
  2554. Error(reeHexCodeAfterBSlashXTooBig);
  2555. Exit;
  2556. end;
  2557. Result := REChar((Ord(Result) ShL 4) or HexDig(APtr^));
  2558. // HexDig will cause Error if bad hex digit found
  2559. end
  2560. else
  2561. Break;
  2562. until False;
  2563. end
  2564. else
  2565. begin
  2566. Result := REChar(HexDig(APtr^));
  2567. // HexDig will cause Error if bad hex digit found
  2568. Inc(APtr);
  2569. if APtr >= fRegexEnd then
  2570. begin
  2571. Error(reeNoHexCodeAfterBSlashX);
  2572. Exit;
  2573. end;
  2574. Result := REChar((Ord(Result) ShL 4) or HexDig(APtr^));
  2575. // HexDig will cause Error if bad hex digit found
  2576. end;
  2577. end;
  2578. else
  2579. Result := APtr^;
  2580. end;
  2581. end;
  2582. function TRegExpr.ParseAtom(var flagp: integer): PRegExprChar;
  2583. // the lowest level
  2584. // Optimization: gobbles an entire sequence of ordinary characters so that
  2585. // it can turn them into a single node, which is smaller to store and
  2586. // faster to run. Backslashed characters are exceptions, each becoming a
  2587. // separate node; the code is simpler that way and it's not worth fixing.
  2588. var
  2589. ret: PRegExprChar;
  2590. RangeBeg, RangeEnd: REChar;
  2591. CanBeRange: boolean;
  2592. AddrOfLen: PLongInt;
  2593. procedure EmitExactly(Ch: REChar); {$IFDEF InlineFuncs}inline;{$ENDIF}
  2594. begin
  2595. if fCompModifiers.I then
  2596. ret := EmitNode(OP_EXACTLYCI)
  2597. else
  2598. ret := EmitNode(OP_EXACTLY);
  2599. EmitInt(1);
  2600. EmitC(Ch);
  2601. flagp := flagp or flag_HasWidth or flag_Simple;
  2602. end;
  2603. procedure EmitRangeChar(Ch: REChar; AStartOfRange: boolean); {$IFDEF InlineFuncs}inline;{$ENDIF}
  2604. begin
  2605. CanBeRange := AStartOfRange;
  2606. if fCompModifiers.I then
  2607. Ch := _UpperCase(Ch);
  2608. if AStartOfRange then
  2609. begin
  2610. AddrOfLen := nil;
  2611. RangeBeg := Ch;
  2612. end
  2613. else
  2614. begin
  2615. if AddrOfLen = nil then
  2616. begin
  2617. EmitC(OpKind_Char);
  2618. Pointer(AddrOfLen) := regcode;
  2619. EmitInt(0);
  2620. end;
  2621. Inc(AddrOfLen^);
  2622. EmitC(Ch);
  2623. end;
  2624. end;
  2625. procedure EmitRangePacked(ch1, ch2: REChar); {$IFDEF InlineFuncs}inline;{$ENDIF}
  2626. var
  2627. ChkIndex: integer;
  2628. begin
  2629. AddrOfLen := nil;
  2630. CanBeRange := False;
  2631. if fCompModifiers.I then
  2632. begin
  2633. ch1 := _UpperCase(ch1);
  2634. ch2 := _UpperCase(ch2);
  2635. end;
  2636. for ChkIndex := Low(CharCheckerInfos) to High(CharCheckerInfos) do
  2637. if (CharCheckerInfos[ChkIndex].CharBegin = ch1) and
  2638. (CharCheckerInfos[ChkIndex].CharEnd = ch2) then
  2639. begin
  2640. EmitC(OpKind_MetaClass);
  2641. EmitC(REChar(CharCheckerInfos[ChkIndex].CheckerIndex));
  2642. Exit;
  2643. end;
  2644. EmitC(OpKind_Range);
  2645. EmitC(ch1);
  2646. EmitC(ch2);
  2647. end;
  2648. var
  2649. flags: integer;
  2650. Len: integer;
  2651. SavedPtr: PRegExprChar;
  2652. EnderChar, TempChar: REChar;
  2653. begin
  2654. Result := nil;
  2655. flags := 0;
  2656. flagp := flag_Worst;
  2657. AddrOfLen := nil;
  2658. Inc(regparse);
  2659. case (regparse - 1)^ of
  2660. '^':
  2661. if not fCompModifiers.M or
  2662. ((fLineSeparators = '') and not fLinePairedSeparatorAssigned) then
  2663. ret := EmitNode(OP_BOL)
  2664. else
  2665. ret := EmitNode(OP_BOLML);
  2666. '$':
  2667. if not fCompModifiers.M or
  2668. ((fLineSeparators = '') and not fLinePairedSeparatorAssigned) then
  2669. ret := EmitNode(OP_EOL)
  2670. else
  2671. ret := EmitNode(OP_EOLML);
  2672. '.':
  2673. if fCompModifiers.S then
  2674. begin
  2675. ret := EmitNode(OP_ANY);
  2676. flagp := flagp or flag_HasWidth or flag_Simple;
  2677. end
  2678. else
  2679. begin // not /s, so emit [^:LineSeparators:]
  2680. ret := EmitNode(OP_ANYML);
  2681. flagp := flagp or flag_HasWidth; // not so simple ;)
  2682. end;
  2683. '[':
  2684. begin
  2685. if regparse^ = '^' then
  2686. begin // Complement of range.
  2687. if fCompModifiers.I then
  2688. ret := EmitNode(OP_ANYBUTCI)
  2689. else
  2690. ret := EmitNode(OP_ANYBUT);
  2691. Inc(regparse);
  2692. end
  2693. else if fCompModifiers.I then
  2694. ret := EmitNode(OP_ANYOFCI)
  2695. else
  2696. ret := EmitNode(OP_ANYOF);
  2697. CanBeRange := False;
  2698. if regparse^ = ']' then
  2699. begin
  2700. // first ']' inside [] treated as simple char, no need to check '['
  2701. EmitRangeChar(regparse^, (regparse + 1)^ = '-');
  2702. Inc(regparse);
  2703. end;
  2704. while (regparse < fRegexEnd) and (regparse^ <> ']') do
  2705. begin
  2706. if (regparse^ = '-') and ((regparse + 1) < fRegexEnd) and
  2707. ((regparse + 1)^ <> ']') and CanBeRange then
  2708. begin
  2709. Inc(regparse);
  2710. RangeEnd := regparse^;
  2711. if RangeEnd = EscChar then
  2712. begin
  2713. if _IsMetaChar((regparse + 1)^) then
  2714. begin
  2715. Error(reeMetaCharAfterMinusInRange);
  2716. Exit;
  2717. end;
  2718. Inc(regparse);
  2719. RangeEnd := UnQuoteChar(regparse);
  2720. end;
  2721. // special handling for Russian range a-YA, add 2 ranges: a-ya and A-YA
  2722. if fCompModifiers.R and
  2723. (RangeBeg = RusRangeLoLow) and (RangeEnd = RusRangeHiHigh) then
  2724. begin
  2725. EmitRangePacked(RusRangeLoLow, RusRangeLoHigh);
  2726. EmitRangePacked(RusRangeHiLow, RusRangeHiHigh);
  2727. end
  2728. else
  2729. begin // standard r.e. handling
  2730. if RangeBeg > RangeEnd then
  2731. begin
  2732. Error(reeInvalidRange);
  2733. Exit;
  2734. end;
  2735. EmitRangePacked(RangeBeg, RangeEnd);
  2736. end;
  2737. Inc(regparse);
  2738. end
  2739. else
  2740. begin
  2741. if regparse^ = EscChar then
  2742. begin
  2743. Inc(regparse);
  2744. if regparse >= fRegexEnd then
  2745. begin
  2746. Error(reeParseAtomTrailingBackSlash);
  2747. Exit;
  2748. end;
  2749. if _IsMetaChar(regparse^) then
  2750. begin
  2751. AddrOfLen := nil;
  2752. CanBeRange := False;
  2753. EmitC(OpKind_MetaClass);
  2754. case regparse^ of
  2755. 'w':
  2756. EmitC(REChar(CheckerIndex_Word));
  2757. 'W':
  2758. EmitC(REChar(CheckerIndex_NotWord));
  2759. 's':
  2760. EmitC(REChar(CheckerIndex_Space));
  2761. 'S':
  2762. EmitC(REChar(CheckerIndex_NotSpace));
  2763. 'd':
  2764. EmitC(REChar(CheckerIndex_Digit));
  2765. 'D':
  2766. EmitC(REChar(CheckerIndex_NotDigit));
  2767. 'v':
  2768. EmitC(REChar(CheckerIndex_VertSep));
  2769. 'V':
  2770. EmitC(REChar(CheckerIndex_NotVertSep));
  2771. 'h':
  2772. EmitC(REChar(CheckerIndex_HorzSep));
  2773. 'H':
  2774. EmitC(REChar(CheckerIndex_NotHorzSep));
  2775. else
  2776. Error(reeBadOpcodeInCharClass);
  2777. end;
  2778. end
  2779. else
  2780. begin
  2781. TempChar := UnQuoteChar(regparse);
  2782. EmitRangeChar(TempChar, (regparse + 1)^ = '-');
  2783. end;
  2784. end
  2785. else
  2786. begin
  2787. EmitRangeChar(regparse^, (regparse + 1)^ = '-');
  2788. end;
  2789. Inc(regparse);
  2790. end;
  2791. end; { of while }
  2792. AddrOfLen := nil;
  2793. CanBeRange := False;
  2794. EmitC(OpKind_End);
  2795. if regparse^ <> ']' then
  2796. begin
  2797. Error(reeUnmatchedSqBrackets);
  2798. Exit;
  2799. end;
  2800. Inc(regparse);
  2801. flagp := flagp or flag_HasWidth or flag_Simple;
  2802. end;
  2803. '(':
  2804. begin
  2805. if regparse^ = '?' then
  2806. begin
  2807. // check for non-capturing group: (?:text)
  2808. if (regparse + 1)^ = ':' then
  2809. begin
  2810. Inc(regparse, 2);
  2811. ret := ParseReg(1, flags);
  2812. if ret = nil then
  2813. begin
  2814. Result := nil;
  2815. Exit;
  2816. end;
  2817. flagp := flagp or flags and (flag_HasWidth or flag_SpecStart);
  2818. end
  2819. else
  2820. // check for extended Perl syntax : (?..)
  2821. if (regparse + 1)^ = '#' then
  2822. begin // (?#comment)
  2823. Inc(regparse, 2); // find closing ')'
  2824. while (regparse < fRegexEnd) and (regparse^ <> ')') do
  2825. Inc(regparse);
  2826. if regparse^ <> ')' then
  2827. begin
  2828. Error(reeUnclosedComment);
  2829. Exit;
  2830. end;
  2831. Inc(regparse); // skip ')'
  2832. ret := EmitNode(OP_COMMENT); // comment
  2833. end
  2834. else
  2835. begin // modifiers ?
  2836. Inc(regparse); // skip '?'
  2837. SavedPtr := regparse;
  2838. while (regparse < fRegexEnd) and (regparse^ <> ')') do
  2839. Inc(regparse);
  2840. if (regparse^ <> ')') or
  2841. not ParseModifiers(SavedPtr, regparse - SavedPtr, fCompModifiers) then
  2842. begin
  2843. Error(reeUnrecognizedModifier);
  2844. Exit;
  2845. end;
  2846. Inc(regparse); // skip ')'
  2847. ret := EmitNode(OP_COMMENT); // comment
  2848. // Error (reeQPSBFollowsNothing);
  2849. // Exit;
  2850. end;
  2851. end
  2852. else
  2853. begin
  2854. // normal (capturing) group
  2855. if fSecondPass then
  2856. // must skip this block for one of passes, to not double groups count
  2857. if GrpCount < NSUBEXP - 1 then
  2858. begin
  2859. Inc(GrpCount);
  2860. GrpIndexes[GrpCount] := regnpar;
  2861. end;
  2862. ret := ParseReg(1, flags);
  2863. if ret = nil then
  2864. begin
  2865. Result := nil;
  2866. Exit;
  2867. end;
  2868. flagp := flagp or flags and (flag_HasWidth or flag_SpecStart);
  2869. end;
  2870. end;
  2871. '|', ')':
  2872. begin // Supposed to be caught earlier.
  2873. Error(reeInternalUrp);
  2874. Exit;
  2875. end;
  2876. '?', '+', '*':
  2877. begin
  2878. Error(reeQPSBFollowsNothing);
  2879. Exit;
  2880. end;
  2881. EscChar:
  2882. begin
  2883. if regparse >= fRegexEnd then
  2884. begin
  2885. Error(reeTrailingBackSlash);
  2886. Exit;
  2887. end;
  2888. case regparse^ of // r.e.extensions
  2889. 'b':
  2890. ret := EmitNode(OP_BOUND); // ###0.943
  2891. 'B':
  2892. ret := EmitNode(OP_NOTBOUND); // ###0.943
  2893. 'A':
  2894. ret := EmitNode(OP_BOL); // ###0.941
  2895. 'Z':
  2896. ret := EmitNode(OP_EOL); // ###0.941
  2897. 'd':
  2898. begin // r.e.extension - any digit ('0' .. '9')
  2899. ret := EmitNode(OP_ANYDIGIT);
  2900. flagp := flagp or flag_HasWidth or flag_Simple;
  2901. end;
  2902. 'D':
  2903. begin // r.e.extension - not digit ('0' .. '9')
  2904. ret := EmitNode(OP_NOTDIGIT);
  2905. flagp := flagp or flag_HasWidth or flag_Simple;
  2906. end;
  2907. 's':
  2908. begin // r.e.extension - any space char
  2909. ret := EmitNode(OP_ANYSPACE);
  2910. flagp := flagp or flag_HasWidth or flag_Simple;
  2911. end;
  2912. 'S':
  2913. begin // r.e.extension - not space char
  2914. ret := EmitNode(OP_NOTSPACE);
  2915. flagp := flagp or flag_HasWidth or flag_Simple;
  2916. end;
  2917. 'w':
  2918. begin // r.e.extension - any english char / digit / '_'
  2919. ret := EmitNode(OP_ANYLETTER);
  2920. flagp := flagp or flag_HasWidth or flag_Simple;
  2921. end;
  2922. 'W':
  2923. begin // r.e.extension - not english char / digit / '_'
  2924. ret := EmitNode(OP_NOTLETTER);
  2925. flagp := flagp or flag_HasWidth or flag_Simple;
  2926. end;
  2927. 'v':
  2928. begin
  2929. ret := EmitNode(OP_ANYVERTSEP);
  2930. flagp := flagp or flag_HasWidth or flag_Simple;
  2931. end;
  2932. 'V':
  2933. begin
  2934. ret := EmitNode(OP_NOTVERTSEP);
  2935. flagp := flagp or flag_HasWidth or flag_Simple;
  2936. end;
  2937. 'h':
  2938. begin
  2939. ret := EmitNode(OP_ANYHORZSEP);
  2940. flagp := flagp or flag_HasWidth or flag_Simple;
  2941. end;
  2942. 'H':
  2943. begin
  2944. ret := EmitNode(OP_NOTHORZSEP);
  2945. flagp := flagp or flag_HasWidth or flag_Simple;
  2946. end;
  2947. '1' .. '9':
  2948. begin // ###0.936
  2949. if fCompModifiers.I then
  2950. ret := EmitNode(OP_BSUBEXPCI)
  2951. else
  2952. ret := EmitNode(OP_BSUBEXP);
  2953. EmitC(REChar(Ord(regparse^) - Ord('0')));
  2954. flagp := flagp or flag_HasWidth or flag_Simple;
  2955. end;
  2956. else
  2957. EmitExactly(UnQuoteChar(regparse));
  2958. end; { of case }
  2959. Inc(regparse);
  2960. end;
  2961. else
  2962. begin
  2963. Dec(regparse);
  2964. if fCompModifiers.X and // check for eXtended syntax
  2965. ((regparse^ = '#') or IsIgnoredChar(regparse^)) then
  2966. begin // ###0.941 \x
  2967. if regparse^ = '#' then
  2968. begin // Skip eXtended comment
  2969. // find comment terminator (group of \n and/or \r)
  2970. while (regparse < fRegexEnd) and (regparse^ <> #$d) and
  2971. (regparse^ <> #$a) do
  2972. Inc(regparse);
  2973. while (regparse^ = #$d) or (regparse^ = #$a)
  2974. // skip comment terminator
  2975. do
  2976. Inc(regparse);
  2977. // attempt to support different type of line separators
  2978. end
  2979. else
  2980. begin // Skip the blanks!
  2981. while IsIgnoredChar(regparse^) do
  2982. Inc(regparse);
  2983. end;
  2984. ret := EmitNode(OP_COMMENT); // comment
  2985. end
  2986. else
  2987. begin
  2988. Len := FindSkippedMetaLen(regparse, fRegexEnd);
  2989. if Len <= 0 then
  2990. if regparse^ <> '{' then
  2991. begin
  2992. Error(reeRarseAtomInternalDisaster);
  2993. Exit;
  2994. end
  2995. else
  2996. Len := FindSkippedMetaLen(regparse + 1, fRegexEnd) + 1;
  2997. // bad {n,m} - compile as EXACTLY
  2998. EnderChar := (regparse + Len)^;
  2999. if (Len > 1) and ((EnderChar = '*') or (EnderChar = '+') or (EnderChar = '?') or (EnderChar = '{')) then
  3000. Dec(Len); // back off clear of ?+*{ operand.
  3001. flagp := flagp or flag_HasWidth;
  3002. if Len = 1 then
  3003. flagp := flagp or flag_Simple;
  3004. if fCompModifiers.I then
  3005. ret := EmitNode(OP_EXACTLYCI)
  3006. else
  3007. ret := EmitNode(OP_EXACTLY);
  3008. EmitInt(0);
  3009. while (Len > 0) and ((not fCompModifiers.X) or (regparse^ <> '#')) do
  3010. begin
  3011. if not fCompModifiers.X or not IsIgnoredChar(regparse^) then
  3012. begin
  3013. EmitC(regparse^);
  3014. if regcode <> @regdummy then
  3015. Inc(regExactlyLen^);
  3016. end;
  3017. Inc(regparse);
  3018. Dec(Len);
  3019. end;
  3020. end; { of if not comment }
  3021. end; { of case else }
  3022. end; { of case }
  3023. Result := ret;
  3024. end; { of function TRegExpr.ParseAtom
  3025. -------------------------------------------------------------- }
  3026. function TRegExpr.GetCompilerErrorPos: PtrInt;
  3027. begin
  3028. Result := 0;
  3029. if (regexpBegin = nil) or (regparse = nil) then
  3030. Exit; // not in compiling mode ?
  3031. Result := regparse - regexpBegin;
  3032. end; { of function TRegExpr.GetCompilerErrorPos
  3033. -------------------------------------------------------------- }
  3034. { ============================================================= }
  3035. { ===================== Matching section ====================== }
  3036. { ============================================================= }
  3037. function TRegExpr.regrepeat(p: PRegExprChar; AMax: integer): integer;
  3038. // repeatedly match something simple, report how many
  3039. var
  3040. scan: PRegExprChar;
  3041. opnd: PRegExprChar;
  3042. TheMax, NLen: integer;
  3043. InvChar: REChar; // ###0.931
  3044. GrpStart, GrpEnd: PRegExprChar; // ###0.936
  3045. ArrayIndex: integer;
  3046. begin
  3047. Result := 0;
  3048. scan := reginput;
  3049. opnd := p + REOpSz + RENextOffSz; // OPERAND
  3050. TheMax := fInputEnd - scan;
  3051. if TheMax > AMax then
  3052. TheMax := AMax;
  3053. case PREOp(p)^ of
  3054. OP_ANY:
  3055. begin
  3056. // note - OP_ANYML cannot be proceeded in regrepeat because can skip
  3057. // more than one char at once
  3058. Result := TheMax;
  3059. Inc(scan, Result);
  3060. end;
  3061. OP_EXACTLY:
  3062. begin // in opnd can be only ONE char !!!
  3063. NLen := PLongInt(opnd)^;
  3064. if TheMax > NLen then
  3065. TheMax := NLen;
  3066. Inc(opnd, RENumberSz);
  3067. while (Result < TheMax) and (opnd^ = scan^) do
  3068. begin
  3069. Inc(Result);
  3070. Inc(scan);
  3071. end;
  3072. end;
  3073. OP_EXACTLYCI:
  3074. begin // in opnd can be only ONE char !!!
  3075. NLen := PLongInt(opnd)^;
  3076. if TheMax > NLen then
  3077. TheMax := NLen;
  3078. Inc(opnd, RENumberSz);
  3079. while (Result < TheMax) and (opnd^ = scan^) do
  3080. begin // prevent unneeded InvertCase //###0.931
  3081. Inc(Result);
  3082. Inc(scan);
  3083. end;
  3084. if Result < TheMax then
  3085. begin // ###0.931
  3086. InvChar := InvertCase(opnd^); // store in register
  3087. while (Result < TheMax) and ((opnd^ = scan^) or (InvChar = scan^)) do
  3088. begin
  3089. Inc(Result);
  3090. Inc(scan);
  3091. end;
  3092. end;
  3093. end;
  3094. OP_BSUBEXP:
  3095. begin // ###0.936
  3096. ArrayIndex := GrpIndexes[Ord(opnd^)];
  3097. if ArrayIndex < 0 then
  3098. Exit;
  3099. GrpStart := startp[ArrayIndex];
  3100. if GrpStart = nil then
  3101. Exit;
  3102. GrpEnd := endp[ArrayIndex];
  3103. if GrpEnd = nil then
  3104. Exit;
  3105. repeat
  3106. opnd := GrpStart;
  3107. while opnd < GrpEnd do
  3108. begin
  3109. if (scan >= fInputEnd) or (scan^ <> opnd^) then
  3110. Exit;
  3111. Inc(scan);
  3112. Inc(opnd);
  3113. end;
  3114. Inc(Result);
  3115. reginput := scan;
  3116. until Result >= AMax;
  3117. end;
  3118. OP_BSUBEXPCI:
  3119. begin // ###0.936
  3120. ArrayIndex := GrpIndexes[Ord(opnd^)];
  3121. if ArrayIndex < 0 then
  3122. Exit;
  3123. GrpStart := startp[ArrayIndex];
  3124. if GrpStart = nil then
  3125. Exit;
  3126. GrpEnd := endp[ArrayIndex];
  3127. if GrpEnd = nil then
  3128. Exit;
  3129. repeat
  3130. opnd := GrpStart;
  3131. while opnd < GrpEnd do
  3132. begin
  3133. if (scan >= fInputEnd) or
  3134. ((scan^ <> opnd^) and (scan^ <> InvertCase(opnd^))) then
  3135. Exit;
  3136. Inc(scan);
  3137. Inc(opnd);
  3138. end;
  3139. Inc(Result);
  3140. reginput := scan;
  3141. until Result >= AMax;
  3142. end;
  3143. OP_ANYDIGIT:
  3144. while (Result < TheMax) and IsDigitChar(scan^) do
  3145. begin
  3146. Inc(Result);
  3147. Inc(scan);
  3148. end;
  3149. OP_NOTDIGIT:
  3150. while (Result < TheMax) and not IsDigitChar(scan^) do
  3151. begin
  3152. Inc(Result);
  3153. Inc(scan);
  3154. end;
  3155. OP_ANYLETTER:
  3156. while (Result < TheMax) and IsWordChar(scan^) do // ###0.940
  3157. begin
  3158. Inc(Result);
  3159. Inc(scan);
  3160. end;
  3161. OP_NOTLETTER:
  3162. while (Result < TheMax) and not IsWordChar(scan^) do // ###0.940
  3163. begin
  3164. Inc(Result);
  3165. Inc(scan);
  3166. end;
  3167. OP_ANYSPACE:
  3168. while (Result < TheMax) and IsSpaceChar(scan^) do
  3169. begin
  3170. Inc(Result);
  3171. Inc(scan);
  3172. end;
  3173. OP_NOTSPACE:
  3174. while (Result < TheMax) and not IsSpaceChar(scan^) do
  3175. begin
  3176. Inc(Result);
  3177. Inc(scan);
  3178. end;
  3179. OP_ANYVERTSEP:
  3180. while (Result < TheMax) and IsLineSeparator(scan^) do
  3181. begin
  3182. Inc(Result);
  3183. Inc(scan);
  3184. end;
  3185. OP_NOTVERTSEP:
  3186. while (Result < TheMax) and not IsLineSeparator(scan^) do
  3187. begin
  3188. Inc(Result);
  3189. Inc(scan);
  3190. end;
  3191. OP_ANYHORZSEP:
  3192. while (Result < TheMax) and IsHorzSeparator(scan^) do
  3193. begin
  3194. Inc(Result);
  3195. Inc(scan);
  3196. end;
  3197. OP_NOTHORZSEP:
  3198. while (Result < TheMax) and not IsHorzSeparator(scan^) do
  3199. begin
  3200. Inc(Result);
  3201. Inc(scan);
  3202. end;
  3203. OP_ANYOF:
  3204. while (Result < TheMax) and FindInCharClass(opnd, scan^, False) do
  3205. begin
  3206. Inc(Result);
  3207. Inc(scan);
  3208. end;
  3209. OP_ANYBUT:
  3210. while (Result < TheMax) and not FindInCharClass(opnd, scan^, False) do
  3211. begin
  3212. Inc(Result);
  3213. Inc(scan);
  3214. end;
  3215. OP_ANYOFCI:
  3216. while (Result < TheMax) and FindInCharClass(opnd, scan^, True) do
  3217. begin
  3218. Inc(Result);
  3219. Inc(scan);
  3220. end;
  3221. OP_ANYBUTCI:
  3222. while (Result < TheMax) and not FindInCharClass(opnd, scan^, True) do
  3223. begin
  3224. Inc(Result);
  3225. Inc(scan);
  3226. end;
  3227. else
  3228. begin // Oh dear. Called inappropriately.
  3229. Result := 0; // Best compromise.
  3230. Error(reeRegRepeatCalledInappropriately);
  3231. Exit;
  3232. end;
  3233. end; { of case }
  3234. reginput := scan;
  3235. end; { of function TRegExpr.regrepeat
  3236. -------------------------------------------------------------- }
  3237. function TRegExpr.regnext(p: PRegExprChar): PRegExprChar;
  3238. // dig the "next" pointer out of a node
  3239. var
  3240. offset: TRENextOff;
  3241. begin
  3242. if p = @regdummy then
  3243. begin
  3244. Result := nil;
  3245. Exit;
  3246. end;
  3247. offset := PRENextOff(AlignToPtr(p + REOpSz))^; // ###0.933 inlined NEXT
  3248. if offset = 0 then
  3249. Result := nil
  3250. else
  3251. Result := p + offset;
  3252. end; { of function TRegExpr.regnext
  3253. -------------------------------------------------------------- }
  3254. function TRegExpr.MatchPrim(prog: PRegExprChar): boolean;
  3255. // recursively matching routine
  3256. // Conceptually the strategy is simple: check to see whether the current
  3257. // node matches, call self recursively to see whether the rest matches,
  3258. // and then act accordingly. In practice we make some effort to avoid
  3259. // recursion, in particular by going through "ordinary" nodes (that don't
  3260. // need to know whether the rest of the match failed) by a loop instead of
  3261. // by recursion.
  3262. var
  3263. scan: PRegExprChar; // Current node.
  3264. next: PRegExprChar; // Next node.
  3265. Len: PtrInt;
  3266. opnd: PRegExprChar;
  3267. no: integer;
  3268. save: PRegExprChar;
  3269. nextch: REChar;
  3270. BracesMin, Bracesmax: integer;
  3271. // we use integer instead of TREBracesArg for better support */+
  3272. {$IFDEF ComplexBraces}
  3273. SavedLoopStack: TRegExprLoopStack; // :(( very bad for recursion
  3274. SavedLoopStackIdx: integer; // ###0.925
  3275. {$ENDIF}
  3276. bound1, bound2: boolean;
  3277. begin
  3278. Result := False;
  3279. {$IFDEF ComplexBraces}
  3280. SavedLoopStack:=Default(TRegExprLoopStack);
  3281. SavedLoopStackIdx:=0;
  3282. {$ENDIF}
  3283. scan := prog;
  3284. while scan <> nil do
  3285. begin
  3286. Len := PRENextOff(AlignToPtr(scan + 1))^; // ###0.932 inlined regnext
  3287. if Len = 0 then
  3288. next := nil
  3289. else
  3290. next := scan + Len;
  3291. case scan^ of
  3292. OP_NOTBOUND,
  3293. OP_BOUND:
  3294. begin
  3295. bound1 := (reginput = fInputStart) or not IsWordChar((reginput - 1)^);
  3296. bound2 := (reginput = fInputEnd) or not IsWordChar(reginput^);
  3297. if (scan^ = OP_BOUND) xor (bound1 <> bound2) then
  3298. Exit;
  3299. end;
  3300. OP_BOL:
  3301. begin
  3302. if reginput <> fInputStart then
  3303. Exit;
  3304. end;
  3305. OP_EOL:
  3306. begin
  3307. if reginput < fInputEnd then
  3308. Exit;
  3309. end;
  3310. OP_BOLML:
  3311. if reginput > fInputStart then
  3312. begin
  3313. nextch := (reginput - 1)^;
  3314. if (nextch <> fLinePairedSeparatorTail) or
  3315. ((reginput - 1) <= fInputStart) or
  3316. ((reginput - 2)^ <> fLinePairedSeparatorHead) then
  3317. begin
  3318. if (nextch = fLinePairedSeparatorHead) and
  3319. (reginput^ = fLinePairedSeparatorTail) then
  3320. Exit; // don't stop between paired separator
  3321. if not IsCustomLineSeparator(nextch) then
  3322. Exit;
  3323. end;
  3324. end;
  3325. OP_EOLML:
  3326. if reginput < fInputEnd then
  3327. begin
  3328. nextch := reginput^;
  3329. if (nextch <> fLinePairedSeparatorHead) or
  3330. ((reginput + 1)^ <> fLinePairedSeparatorTail) then
  3331. begin
  3332. if (nextch = fLinePairedSeparatorTail) and (reginput > fInputStart)
  3333. and ((reginput - 1)^ = fLinePairedSeparatorHead) then
  3334. Exit; // don't stop between paired separator
  3335. if not IsCustomLineSeparator(nextch) then
  3336. Exit;
  3337. end;
  3338. end;
  3339. OP_ANY:
  3340. begin
  3341. if reginput = fInputEnd then
  3342. Exit;
  3343. Inc(reginput);
  3344. end;
  3345. OP_ANYML:
  3346. begin // ###0.941
  3347. if (reginput = fInputEnd) or
  3348. ((reginput^ = fLinePairedSeparatorHead) and
  3349. ((reginput + 1)^ = fLinePairedSeparatorTail)) or
  3350. IsCustomLineSeparator(reginput^)
  3351. then
  3352. Exit;
  3353. Inc(reginput);
  3354. end;
  3355. OP_ANYDIGIT:
  3356. begin
  3357. if (reginput = fInputEnd) or not IsDigitChar(reginput^) then
  3358. Exit;
  3359. Inc(reginput);
  3360. end;
  3361. OP_NOTDIGIT:
  3362. begin
  3363. if (reginput = fInputEnd) or IsDigitChar(reginput^) then
  3364. Exit;
  3365. Inc(reginput);
  3366. end;
  3367. OP_ANYLETTER:
  3368. begin
  3369. if (reginput = fInputEnd) or not IsWordChar(reginput^) // ###0.943
  3370. then
  3371. Exit;
  3372. Inc(reginput);
  3373. end;
  3374. OP_NOTLETTER:
  3375. begin
  3376. if (reginput = fInputEnd) or IsWordChar(reginput^) // ###0.943
  3377. then
  3378. Exit;
  3379. Inc(reginput);
  3380. end;
  3381. OP_ANYSPACE:
  3382. begin
  3383. if (reginput = fInputEnd) or not IsSpaceChar(reginput^) // ###0.943
  3384. then
  3385. Exit;
  3386. Inc(reginput);
  3387. end;
  3388. OP_NOTSPACE:
  3389. begin
  3390. if (reginput = fInputEnd) or IsSpaceChar(reginput^) // ###0.943
  3391. then
  3392. Exit;
  3393. Inc(reginput);
  3394. end;
  3395. OP_ANYVERTSEP:
  3396. begin
  3397. if (reginput = fInputEnd) or not IsLineSeparator(reginput^) then
  3398. Exit;
  3399. Inc(reginput);
  3400. end;
  3401. OP_NOTVERTSEP:
  3402. begin
  3403. if (reginput = fInputEnd) or IsLineSeparator(reginput^) then
  3404. Exit;
  3405. Inc(reginput);
  3406. end;
  3407. OP_ANYHORZSEP:
  3408. begin
  3409. if (reginput = fInputEnd) or not IsHorzSeparator(reginput^) then
  3410. Exit;
  3411. Inc(reginput);
  3412. end;
  3413. OP_NOTHORZSEP:
  3414. begin
  3415. if (reginput = fInputEnd) or IsHorzSeparator(reginput^) then
  3416. Exit;
  3417. Inc(reginput);
  3418. end;
  3419. OP_EXACTLYCI:
  3420. begin
  3421. opnd := scan + REOpSz + RENextOffSz; // OPERAND
  3422. Len := PLongInt(opnd)^;
  3423. Inc(opnd, RENumberSz);
  3424. // Inline the first character, for speed.
  3425. if (opnd^ <> reginput^) and (InvertCase(opnd^) <> reginput^) then
  3426. Exit;
  3427. // ###0.929 begin
  3428. no := Len;
  3429. save := reginput;
  3430. while no > 1 do
  3431. begin
  3432. Inc(save);
  3433. Inc(opnd);
  3434. if (opnd^ <> save^) and (InvertCase(opnd^) <> save^) then
  3435. Exit;
  3436. Dec(no);
  3437. end;
  3438. // ###0.929 end
  3439. Inc(reginput, Len);
  3440. end;
  3441. OP_EXACTLY:
  3442. begin
  3443. opnd := scan + REOpSz + RENextOffSz; // OPERAND
  3444. Len := PLongInt(opnd)^;
  3445. Inc(opnd, RENumberSz);
  3446. // Inline the first character, for speed.
  3447. if opnd^ <> reginput^ then
  3448. Exit;
  3449. // ###0.929 begin
  3450. no := Len;
  3451. save := reginput;
  3452. while no > 1 do
  3453. begin
  3454. Inc(save);
  3455. Inc(opnd);
  3456. if opnd^ <> save^ then
  3457. Exit;
  3458. Dec(no);
  3459. end;
  3460. // ###0.929 end
  3461. Inc(reginput, Len);
  3462. end;
  3463. OP_BSUBEXP:
  3464. begin // ###0.936
  3465. no := Ord((scan + REOpSz + RENextOffSz)^);
  3466. no := GrpIndexes[no];
  3467. if no < 0 then
  3468. Exit;
  3469. if startp[no] = nil then
  3470. Exit;
  3471. if endp[no] = nil then
  3472. Exit;
  3473. save := reginput;
  3474. opnd := startp[no];
  3475. while opnd < endp[no] do
  3476. begin
  3477. if (save >= fInputEnd) or (save^ <> opnd^) then
  3478. Exit;
  3479. Inc(save);
  3480. Inc(opnd);
  3481. end;
  3482. reginput := save;
  3483. end;
  3484. OP_BSUBEXPCI:
  3485. begin // ###0.936
  3486. no := Ord((scan + REOpSz + RENextOffSz)^);
  3487. no := GrpIndexes[no];
  3488. if no < 0 then
  3489. Exit;
  3490. if startp[no] = nil then
  3491. Exit;
  3492. if endp[no] = nil then
  3493. Exit;
  3494. save := reginput;
  3495. opnd := startp[no];
  3496. while opnd < endp[no] do
  3497. begin
  3498. if (save >= fInputEnd) or
  3499. ((save^ <> opnd^) and (save^ <> InvertCase(opnd^))) then
  3500. Exit;
  3501. Inc(save);
  3502. Inc(opnd);
  3503. end;
  3504. reginput := save;
  3505. end;
  3506. OP_ANYOF:
  3507. begin
  3508. if (reginput = fInputEnd) or
  3509. not FindInCharClass(scan + REOpSz + RENextOffSz, reginput^, False) then
  3510. Exit;
  3511. Inc(reginput);
  3512. end;
  3513. OP_ANYBUT:
  3514. begin
  3515. if (reginput = fInputEnd) or
  3516. FindInCharClass(scan + REOpSz + RENextOffSz, reginput^, False) then
  3517. Exit;
  3518. Inc(reginput);
  3519. end;
  3520. OP_ANYOFCI:
  3521. begin
  3522. if (reginput = fInputEnd) or
  3523. not FindInCharClass(scan + REOpSz + RENextOffSz, reginput^, True) then
  3524. Exit;
  3525. Inc(reginput);
  3526. end;
  3527. OP_ANYBUTCI:
  3528. begin
  3529. if (reginput = fInputEnd) or
  3530. FindInCharClass(scan + REOpSz + RENextOffSz, reginput^, True) then
  3531. Exit;
  3532. Inc(reginput);
  3533. end;
  3534. OP_NOTHING:
  3535. ;
  3536. OP_COMMENT:
  3537. ;
  3538. OP_BACK:
  3539. ;
  3540. Succ(OP_OPEN) .. TREOp(Ord(OP_OPEN) + NSUBEXP - 1):
  3541. begin // ###0.929
  3542. no := Ord(scan^) - Ord(OP_OPEN);
  3543. // save := reginput;
  3544. save := startp[no]; // ###0.936
  3545. startp[no] := reginput; // ###0.936
  3546. Result := MatchPrim(next);
  3547. if not Result // ###0.936
  3548. then
  3549. startp[no] := save;
  3550. // if Result and (startp [no] = nil)
  3551. // then startp [no] := save;
  3552. // Don't set startp if some later invocation of the same
  3553. // parentheses already has.
  3554. Exit;
  3555. end;
  3556. Succ(OP_CLOSE) .. TREOp(Ord(OP_CLOSE) + NSUBEXP - 1):
  3557. begin // ###0.929
  3558. no := Ord(scan^) - Ord(OP_CLOSE);
  3559. // save := reginput;
  3560. save := endp[no]; // ###0.936
  3561. endp[no] := reginput; // ###0.936
  3562. Result := MatchPrim(next);
  3563. if not Result // ###0.936
  3564. then
  3565. endp[no] := save;
  3566. // if Result and (endp [no] = nil)
  3567. // then endp [no] := save;
  3568. // Don't set endp if some later invocation of the same
  3569. // parentheses already has.
  3570. Exit;
  3571. end;
  3572. OP_BRANCH:
  3573. begin
  3574. if (next^ <> OP_BRANCH) // No choice.
  3575. then
  3576. next := scan + REOpSz + RENextOffSz // Avoid recursion
  3577. else
  3578. begin
  3579. repeat
  3580. save := reginput;
  3581. Result := MatchPrim(scan + REOpSz + RENextOffSz);
  3582. if Result then
  3583. Exit;
  3584. reginput := save;
  3585. scan := regnext(scan);
  3586. until (scan = nil) or (scan^ <> OP_BRANCH);
  3587. Exit;
  3588. end;
  3589. end;
  3590. {$IFDEF ComplexBraces}
  3591. OP_LOOPENTRY:
  3592. begin // ###0.925
  3593. no := LoopStackIdx;
  3594. Inc(LoopStackIdx);
  3595. if LoopStackIdx > LoopStackMax then
  3596. begin
  3597. Error(reeLoopStackExceeded);
  3598. Exit;
  3599. end;
  3600. save := reginput;
  3601. LoopStack[LoopStackIdx] := 0; // init loop counter
  3602. Result := MatchPrim(next); // execute loop
  3603. LoopStackIdx := no; // cleanup
  3604. if Result then
  3605. Exit;
  3606. reginput := save;
  3607. Exit;
  3608. end;
  3609. OP_LOOP, OP_LOOPNG:
  3610. begin // ###0.940
  3611. if LoopStackIdx <= 0 then
  3612. begin
  3613. Error(reeLoopWithoutEntry);
  3614. Exit;
  3615. end;
  3616. opnd := scan + PRENextOff(AlignToPtr(scan + REOpSz + RENextOffSz + 2 * REBracesArgSz))^;
  3617. BracesMin := PREBracesArg(AlignToInt(scan + REOpSz + RENextOffSz))^;
  3618. Bracesmax := PREBracesArg(AlignToPtr(scan + REOpSz + RENextOffSz + REBracesArgSz))^;
  3619. save := reginput;
  3620. if LoopStack[LoopStackIdx] >= BracesMin then
  3621. begin // Min alredy matched - we can work
  3622. if scan^ = OP_LOOP then
  3623. begin
  3624. // greedy way - first try to max deep of greed ;)
  3625. if LoopStack[LoopStackIdx] < Bracesmax then
  3626. begin
  3627. Inc(LoopStack[LoopStackIdx]);
  3628. no := LoopStackIdx;
  3629. Result := MatchPrim(opnd);
  3630. LoopStackIdx := no;
  3631. if Result then
  3632. Exit;
  3633. reginput := save;
  3634. end;
  3635. Dec(LoopStackIdx); // Fail. May be we are too greedy? ;)
  3636. Result := MatchPrim(next);
  3637. if not Result then
  3638. reginput := save;
  3639. Exit;
  3640. end
  3641. else
  3642. begin
  3643. // non-greedy - try just now
  3644. Result := MatchPrim(next);
  3645. if Result then
  3646. Exit
  3647. else
  3648. reginput := save; // failed - move next and try again
  3649. if LoopStack[LoopStackIdx] < Bracesmax then
  3650. begin
  3651. Inc(LoopStack[LoopStackIdx]);
  3652. no := LoopStackIdx;
  3653. Result := MatchPrim(opnd);
  3654. LoopStackIdx := no;
  3655. if Result then
  3656. Exit;
  3657. reginput := save;
  3658. end;
  3659. Dec(LoopStackIdx); // Failed - back up
  3660. Exit;
  3661. end
  3662. end
  3663. else
  3664. begin // first match a min_cnt times
  3665. Inc(LoopStack[LoopStackIdx]);
  3666. no := LoopStackIdx;
  3667. Result := MatchPrim(opnd);
  3668. LoopStackIdx := no;
  3669. if Result then
  3670. Exit;
  3671. Dec(LoopStack[LoopStackIdx]);
  3672. reginput := save;
  3673. Exit;
  3674. end;
  3675. end;
  3676. {$ENDIF}
  3677. OP_STAR, OP_PLUS, OP_BRACES, OP_STARNG, OP_PLUSNG, OP_BRACESNG:
  3678. begin
  3679. // Lookahead to avoid useless match attempts when we know
  3680. // what character comes next.
  3681. nextch := #0;
  3682. if next^ = OP_EXACTLY then
  3683. nextch := (next + REOpSz + RENextOffSz + RENumberSz)^;
  3684. Bracesmax := MaxInt; // infinite loop for * and + //###0.92
  3685. if (scan^ = OP_STAR) or (scan^ = OP_STARNG) then
  3686. BracesMin := 0 // star
  3687. else if (scan^ = OP_PLUS) or (scan^ = OP_PLUSNG) then
  3688. BracesMin := 1 // plus
  3689. else
  3690. begin // braces
  3691. BracesMin := PREBracesArg(AlignToPtr(scan + REOpSz + RENextOffSz))^;
  3692. Bracesmax := PREBracesArg(AlignToPtr(scan + REOpSz + RENextOffSz + REBracesArgSz))^;
  3693. end;
  3694. save := reginput;
  3695. opnd := scan + REOpSz + RENextOffSz;
  3696. if (scan^ = OP_BRACES) or (scan^ = OP_BRACESNG) then
  3697. Inc(opnd, 2 * REBracesArgSz);
  3698. if (scan^ = OP_PLUSNG) or (scan^ = OP_STARNG) or (scan^ = OP_BRACESNG) then
  3699. begin
  3700. // non-greedy mode
  3701. Bracesmax := regrepeat(opnd, Bracesmax);
  3702. // don't repeat more than BracesMax
  3703. // Now we know real Max limit to move forward (for recursion 'back up')
  3704. // In some cases it can be faster to check only Min positions first,
  3705. // but after that we have to check every position separtely instead
  3706. // of fast scannig in loop.
  3707. no := BracesMin;
  3708. while no <= Bracesmax do
  3709. begin
  3710. reginput := save + no;
  3711. // If it could work, try it.
  3712. if (nextch = #0) or (reginput^ = nextch) then
  3713. begin
  3714. {$IFDEF ComplexBraces}
  3715. System.Move(LoopStack, SavedLoopStack, SizeOf(LoopStack));
  3716. // ###0.925
  3717. SavedLoopStackIdx := LoopStackIdx;
  3718. {$ENDIF}
  3719. if MatchPrim(next) then
  3720. begin
  3721. Result := True;
  3722. Exit;
  3723. end;
  3724. {$IFDEF ComplexBraces}
  3725. System.Move(SavedLoopStack, LoopStack, SizeOf(LoopStack));
  3726. LoopStackIdx := SavedLoopStackIdx;
  3727. {$ENDIF}
  3728. end;
  3729. Inc(no); // Couldn't or didn't - move forward.
  3730. end; { of while }
  3731. Exit;
  3732. end
  3733. else
  3734. begin // greedy mode
  3735. no := regrepeat(opnd, Bracesmax); // don't repeat more than max_cnt
  3736. while no >= BracesMin do
  3737. begin
  3738. // If it could work, try it.
  3739. if (nextch = #0) or (reginput^ = nextch) then
  3740. begin
  3741. {$IFDEF ComplexBraces}
  3742. System.Move(LoopStack, SavedLoopStack, SizeOf(LoopStack));
  3743. // ###0.925
  3744. SavedLoopStackIdx := LoopStackIdx;
  3745. {$ENDIF}
  3746. if MatchPrim(next) then
  3747. begin
  3748. Result := True;
  3749. Exit;
  3750. end;
  3751. {$IFDEF ComplexBraces}
  3752. System.Move(SavedLoopStack, LoopStack, SizeOf(LoopStack));
  3753. LoopStackIdx := SavedLoopStackIdx;
  3754. {$ENDIF}
  3755. end;
  3756. Dec(no); // Couldn't or didn't - back up.
  3757. reginput := save + no;
  3758. end; { of while }
  3759. Exit;
  3760. end;
  3761. end;
  3762. OP_EEND:
  3763. begin
  3764. Result := True; // Success!
  3765. Exit;
  3766. end;
  3767. else
  3768. begin
  3769. Error(reeMatchPrimMemoryCorruption);
  3770. Exit;
  3771. end;
  3772. end; { of case scan^ }
  3773. scan := next;
  3774. end; { of while scan <> nil }
  3775. // We get here only if there's trouble -- normally "case EEND" is the
  3776. // terminating point.
  3777. Error(reeMatchPrimCorruptedPointers);
  3778. end; { of function TRegExpr.MatchPrim
  3779. -------------------------------------------------------------- }
  3780. function TRegExpr.Exec(const AInputString: RegExprString): boolean;
  3781. begin
  3782. InputString := AInputString;
  3783. Result := ExecPrim(1, False, False);
  3784. end; { of function TRegExpr.Exec
  3785. -------------------------------------------------------------- }
  3786. function TRegExpr.Exec: boolean;
  3787. var
  3788. SlowChecks: boolean;
  3789. begin
  3790. SlowChecks := Length(fInputString) < fSlowChecksSizeMax;
  3791. Result := ExecPrim(1, False, SlowChecks);
  3792. end; { of function TRegExpr.Exec
  3793. -------------------------------------------------------------- }
  3794. function TRegExpr.Exec(AOffset: integer): boolean;
  3795. begin
  3796. Result := ExecPrim(AOffset, False, False);
  3797. end; { of function TRegExpr.Exec
  3798. -------------------------------------------------------------- }
  3799. function TRegExpr.ExecPos(AOffset: integer = 1): boolean;
  3800. begin
  3801. Result := ExecPrim(AOffset, False, False);
  3802. end; { of function TRegExpr.ExecPos
  3803. -------------------------------------------------------------- }
  3804. function TRegExpr.ExecPos(AOffset: integer; ATryOnce: boolean): boolean;
  3805. begin
  3806. Result := ExecPrim(AOffset, ATryOnce, False);
  3807. end;
  3808. function TRegExpr.MatchAtOnePos(APos: PRegExprChar): boolean;
  3809. begin
  3810. reginput := APos;
  3811. Result := MatchPrim(programm + REOpSz);
  3812. if Result then
  3813. begin
  3814. startp[0] := APos;
  3815. endp[0] := reginput;
  3816. end;
  3817. end;
  3818. procedure TRegExpr.ClearMatches;
  3819. begin
  3820. FillChar(startp, SizeOf(startp), 0);
  3821. FillChar(endp, SizeOf(endp), 0);
  3822. end;
  3823. procedure TRegExpr.ClearInternalIndexes;
  3824. var
  3825. i: integer;
  3826. begin
  3827. FillChar(startp, SizeOf(startp), 0);
  3828. FillChar(endp, SizeOf(endp), 0);
  3829. for i := 0 to NSUBEXP - 1 do
  3830. GrpIndexes[i] := -1;
  3831. GrpIndexes[0] := 0;
  3832. GrpCount := 0;
  3833. end;
  3834. function TRegExpr.ExecPrim(AOffset: integer; ATryOnce, ASlowChecks: boolean): boolean;
  3835. var
  3836. Ptr: PRegExprChar;
  3837. begin
  3838. Result := False;
  3839. // Ensure that Match cleared either if optimization tricks or some error
  3840. // will lead to leaving ExecPrim without actual search. That is
  3841. // important for ExecNext logic and so on.
  3842. ClearMatches;
  3843. // Don't check IsProgrammOk here! it causes big slowdown in test_benchmark!
  3844. if programm = nil then
  3845. begin
  3846. Compile;
  3847. if programm = nil then
  3848. Exit;
  3849. end;
  3850. // Check InputString presence
  3851. if fInputString = '' then
  3852. begin
  3853. if EmptyInputRaisesError then
  3854. Error(reeNoInputStringSpecified);
  3855. Exit;
  3856. end;
  3857. // Check that the start position is not negative
  3858. if AOffset < 1 then
  3859. begin
  3860. Error(reeOffsetMustBePositive);
  3861. Exit;
  3862. end;
  3863. // Check that the start position is not longer than the line
  3864. // If so then exit with nothing found
  3865. if AOffset > (Length(fInputString) + 1) // for matching empty string after last char.
  3866. then
  3867. Exit;
  3868. Ptr := fInputStart + AOffset - 1;
  3869. // If there is a "must appear" string, look for it.
  3870. if ASlowChecks then
  3871. if regmustString <> '' then
  3872. if Pos(regmustString, fInputString) = 0 then Exit;
  3873. {$IFDEF ComplexBraces}
  3874. // no loops started
  3875. LoopStackIdx := 0; // ###0.925
  3876. {$ENDIF}
  3877. // ATryOnce or anchored match (it needs to be tried only once).
  3878. if ATryOnce or (reganchored <> #0) then
  3879. begin
  3880. {$IFDEF UseFirstCharSet}
  3881. {$IFDEF UniCode}
  3882. if Ord(Ptr^) <= $FF then
  3883. {$ENDIF}
  3884. if not FirstCharArray[byte(Ptr^)] then
  3885. Exit;
  3886. {$ENDIF}
  3887. Result := MatchAtOnePos(Ptr);
  3888. Exit;
  3889. end;
  3890. // Messy cases: unanchored match.
  3891. Dec(Ptr);
  3892. repeat
  3893. Inc(Ptr);
  3894. if Ptr > fInputEnd then
  3895. Exit;
  3896. {$IFDEF UseFirstCharSet}
  3897. {$IFDEF UniCode}
  3898. if Ord(Ptr^) <= $FF then
  3899. {$ENDIF}
  3900. if not FirstCharArray[byte(Ptr^)] then
  3901. Continue;
  3902. {$ENDIF}
  3903. Result := MatchAtOnePos(Ptr);
  3904. // Exit on a match or after testing the end-of-string
  3905. if Result then
  3906. Exit;
  3907. until False;
  3908. end; { of function TRegExpr.ExecPrim
  3909. -------------------------------------------------------------- }
  3910. function TRegExpr.ExecNext: boolean;
  3911. var
  3912. PtrBegin, PtrEnd: PRegExprChar;
  3913. Offset: PtrInt;
  3914. begin
  3915. PtrBegin := startp[0];
  3916. PtrEnd := endp[0];
  3917. if (PtrBegin = nil) or (PtrEnd = nil) then
  3918. begin
  3919. Error(reeExecNextWithoutExec);
  3920. Result := False;
  3921. Exit;
  3922. end;
  3923. Offset := PtrEnd - fInputStart + 1;
  3924. // prevent infinite looping if empty string matches r.e.
  3925. if PtrBegin = PtrEnd then
  3926. Inc(Offset);
  3927. Result := ExecPrim(Offset, False, False);
  3928. end; { of function TRegExpr.ExecNext
  3929. -------------------------------------------------------------- }
  3930. procedure TRegExpr.SetInputString(const AInputString: RegExprString);
  3931. begin
  3932. ClearMatches;
  3933. fInputString := AInputString;
  3934. UniqueString(fInputString);
  3935. fInputStart := PRegExprChar(fInputString);
  3936. fInputEnd := fInputStart + Length(fInputString);
  3937. end; { of procedure TRegExpr.SetInputString
  3938. -------------------------------------------------------------- }
  3939. procedure TRegExpr.SetLineSeparators(const AStr: RegExprString);
  3940. begin
  3941. if AStr <> fLineSeparators then
  3942. begin
  3943. fLineSeparators := AStr;
  3944. InitLineSepArray;
  3945. InvalidateProgramm;
  3946. end;
  3947. end; { of procedure TRegExpr.SetLineSeparators
  3948. -------------------------------------------------------------- }
  3949. procedure TRegExpr.SetLinePairedSeparator(const AStr: RegExprString);
  3950. begin
  3951. if Length(AStr) = 2 then
  3952. begin
  3953. if AStr[1] = AStr[2] then
  3954. begin
  3955. // it's impossible for our 'one-point' checking to support
  3956. // two chars separator for identical chars
  3957. Error(reeBadLinePairedSeparator);
  3958. Exit;
  3959. end;
  3960. if not fLinePairedSeparatorAssigned or (AStr[1] <> fLinePairedSeparatorHead)
  3961. or (AStr[2] <> fLinePairedSeparatorTail) then
  3962. begin
  3963. fLinePairedSeparatorAssigned := True;
  3964. fLinePairedSeparatorHead := AStr[1];
  3965. fLinePairedSeparatorTail := AStr[2];
  3966. InvalidateProgramm;
  3967. end;
  3968. end
  3969. else if Length(AStr) = 0 then
  3970. begin
  3971. if fLinePairedSeparatorAssigned then
  3972. begin
  3973. fLinePairedSeparatorAssigned := False;
  3974. InvalidateProgramm;
  3975. end;
  3976. end
  3977. else
  3978. Error(reeBadLinePairedSeparator);
  3979. end; { of procedure TRegExpr.SetLinePairedSeparator
  3980. -------------------------------------------------------------- }
  3981. function TRegExpr.GetLinePairedSeparator: RegExprString;
  3982. begin
  3983. if fLinePairedSeparatorAssigned then
  3984. begin
  3985. {$IFDEF UniCode}
  3986. // Here is some UniCode 'magic'
  3987. // If You do know better decision to concatenate
  3988. // two WideChars, please, let me know!
  3989. Result := fLinePairedSeparatorHead; // ###0.947
  3990. Result := Result + fLinePairedSeparatorTail;
  3991. {$ELSE}
  3992. Result := fLinePairedSeparatorHead + fLinePairedSeparatorTail;
  3993. {$ENDIF}
  3994. end
  3995. else
  3996. Result := '';
  3997. end; { of function TRegExpr.GetLinePairedSeparator
  3998. -------------------------------------------------------------- }
  3999. function TRegExpr.Substitute(const ATemplate: RegExprString): RegExprString;
  4000. // perform substitutions after a regexp match
  4001. var
  4002. TemplateBeg, TemplateEnd: PRegExprChar;
  4003. function ParseVarName(var APtr: PRegExprChar): integer;
  4004. // extract name of variable (digits, may be enclosed with
  4005. // curly braces) from APtr^, uses TemplateEnd !!!
  4006. var
  4007. p: PRegExprChar;
  4008. Delimited: boolean;
  4009. begin
  4010. Result := 0;
  4011. p := APtr;
  4012. Delimited := (p < TemplateEnd) and (p^ = '{');
  4013. if Delimited then
  4014. Inc(p); // skip left curly brace
  4015. if (p < TemplateEnd) and (p^ = '&') then
  4016. Inc(p) // this is '$&' or '${&}'
  4017. else
  4018. while (p < TemplateEnd) and IsDigitChar(p^) do
  4019. begin
  4020. Result := Result * 10 + (Ord(p^) - Ord('0')); // ###0.939
  4021. Inc(p);
  4022. end;
  4023. if Delimited then
  4024. if (p < TemplateEnd) and (p^ = '}') then
  4025. Inc(p) // skip right curly brace
  4026. else
  4027. p := APtr; // isn't properly terminated
  4028. if p = APtr then
  4029. Result := -1; // no valid digits found or no right curly brace
  4030. APtr := p;
  4031. end;
  4032. type
  4033. TSubstMode = (smodeNormal, smodeOneUpper, smodeOneLower, smodeAllUpper, smodeAllLower);
  4034. var
  4035. Mode: TSubstMode;
  4036. p, p0, p1, ResultPtr: PRegExprChar;
  4037. ResultLen, n: integer;
  4038. Ch, QuotedChar: REChar;
  4039. begin
  4040. // Check programm and input string
  4041. if not IsProgrammOk then
  4042. Exit;
  4043. if fInputString = '' then
  4044. begin
  4045. if EmptyInputRaisesError then
  4046. Error(reeNoInputStringSpecified);
  4047. Exit;
  4048. end;
  4049. // Prepare for working
  4050. if ATemplate = '' then
  4051. begin // prevent nil pointers
  4052. Result := '';
  4053. Exit;
  4054. end;
  4055. TemplateBeg := PRegExprChar(ATemplate);
  4056. TemplateEnd := TemplateBeg + Length(ATemplate);
  4057. // Count result length for speed optimization.
  4058. ResultLen := 0;
  4059. p := TemplateBeg;
  4060. while p < TemplateEnd do
  4061. begin
  4062. Ch := p^;
  4063. Inc(p);
  4064. if Ch = '$' then
  4065. n := GrpIndexes[ParseVarName(p)]
  4066. else
  4067. n := -1;
  4068. if n >= 0 then
  4069. begin
  4070. Inc(ResultLen, endp[n] - startp[n]);
  4071. end
  4072. else
  4073. begin
  4074. if (Ch = EscChar) and (p < TemplateEnd) then
  4075. begin // quoted or special char followed
  4076. Ch := p^;
  4077. Inc(p);
  4078. case Ch of
  4079. 'n':
  4080. Inc(ResultLen, Length(FReplaceLineEnd));
  4081. 'u', 'l', 'U', 'L': { nothing }
  4082. ;
  4083. 'x':
  4084. begin
  4085. Inc(ResultLen);
  4086. if (p^ = '{') then
  4087. begin // skip \x{....}
  4088. while ((p^ <> '}') and (p < TemplateEnd)) do
  4089. p := p + 1;
  4090. p := p + 1;
  4091. end
  4092. else
  4093. p := p + 2 // skip \x..
  4094. end;
  4095. else
  4096. Inc(ResultLen);
  4097. end;
  4098. end
  4099. else
  4100. Inc(ResultLen);
  4101. end;
  4102. end;
  4103. // Get memory. We do it once and it significant speed up work !
  4104. if ResultLen = 0 then
  4105. begin
  4106. Result := '';
  4107. Exit;
  4108. end;
  4109. SetLength(Result, ResultLen);
  4110. // Fill Result
  4111. ResultPtr := Pointer(Result);
  4112. p := TemplateBeg;
  4113. Mode := smodeNormal;
  4114. while p < TemplateEnd do
  4115. begin
  4116. Ch := p^;
  4117. p0 := p;
  4118. Inc(p);
  4119. p1 := p;
  4120. if Ch = '$' then
  4121. n := GrpIndexes[ParseVarName(p)]
  4122. else
  4123. n := -1;
  4124. if (n >= 0) then
  4125. begin
  4126. p0 := startp[n];
  4127. p1 := endp[n];
  4128. end
  4129. else
  4130. begin
  4131. if (Ch = EscChar) and (p < TemplateEnd) then
  4132. begin // quoted or special char followed
  4133. Ch := p^;
  4134. Inc(p);
  4135. case Ch of
  4136. 'n':
  4137. begin
  4138. p0 := PRegExprChar(FReplaceLineEnd);
  4139. p1 := p0 + Length(FReplaceLineEnd);
  4140. end;
  4141. 'x', 't', 'r', 'f', 'a', 'e':
  4142. begin
  4143. p := p - 1;
  4144. // UnquoteChar expects the escaped char under the pointer
  4145. QuotedChar := UnQuoteChar(p);
  4146. p := p + 1;
  4147. // Skip after last part of the escaped sequence - UnquoteChar stops on the last symbol of it
  4148. p0 := @QuotedChar;
  4149. p1 := p0 + 1;
  4150. end;
  4151. 'l':
  4152. begin
  4153. Mode := smodeOneLower;
  4154. p1 := p0;
  4155. end;
  4156. 'L':
  4157. begin
  4158. Mode := smodeAllLower;
  4159. p1 := p0;
  4160. end;
  4161. 'u':
  4162. begin
  4163. Mode := smodeOneUpper;
  4164. p1 := p0;
  4165. end;
  4166. 'U':
  4167. begin
  4168. Mode := smodeAllUpper;
  4169. p1 := p0;
  4170. end;
  4171. else
  4172. begin
  4173. Inc(p0);
  4174. Inc(p1);
  4175. end;
  4176. end;
  4177. end
  4178. end;
  4179. if p0 < p1 then
  4180. begin
  4181. while p0 < p1 do
  4182. begin
  4183. case Mode of
  4184. smodeOneLower:
  4185. begin
  4186. ResultPtr^ := _LowerCase(p0^);
  4187. Mode := smodeNormal;
  4188. end;
  4189. smodeAllLower:
  4190. begin
  4191. ResultPtr^ := _LowerCase(p0^);
  4192. end;
  4193. smodeOneUpper:
  4194. begin
  4195. ResultPtr^ := _UpperCase(p0^);
  4196. Mode := smodeNormal;
  4197. end;
  4198. smodeAllUpper:
  4199. begin
  4200. ResultPtr^ := _UpperCase(p0^);
  4201. end;
  4202. else
  4203. ResultPtr^ := p0^;
  4204. end;
  4205. Inc(ResultPtr);
  4206. Inc(p0);
  4207. end;
  4208. Mode := smodeNormal;
  4209. end;
  4210. end;
  4211. end; { of function TRegExpr.Substitute
  4212. -------------------------------------------------------------- }
  4213. procedure TRegExpr.Split(const AInputStr: RegExprString; APieces: TStrings);
  4214. var
  4215. PrevPos: PtrInt;
  4216. begin
  4217. PrevPos := 1;
  4218. if Exec(AInputStr) then
  4219. repeat
  4220. APieces.Add(System.Copy(AInputStr, PrevPos, MatchPos[0] - PrevPos));
  4221. PrevPos := MatchPos[0] + MatchLen[0];
  4222. until not ExecNext;
  4223. APieces.Add(System.Copy(AInputStr, PrevPos, MaxInt)); // Tail
  4224. end; { of procedure TRegExpr.Split
  4225. -------------------------------------------------------------- }
  4226. function TRegExpr.Replace(const AInputStr: RegExprString;
  4227. const AReplaceStr: RegExprString;
  4228. AUseSubstitution: boolean = False): RegExprString;
  4229. var
  4230. PrevPos: PtrInt;
  4231. begin
  4232. Result := '';
  4233. PrevPos := 1;
  4234. if Exec(AInputStr) then
  4235. repeat
  4236. Result := Result + System.Copy(AInputStr, PrevPos, MatchPos[0] - PrevPos);
  4237. if AUseSubstitution // ###0.946
  4238. then
  4239. Result := Result + Substitute(AReplaceStr)
  4240. else
  4241. Result := Result + AReplaceStr;
  4242. PrevPos := MatchPos[0] + MatchLen[0];
  4243. until not ExecNext;
  4244. Result := Result + System.Copy(AInputStr, PrevPos, MaxInt); // Tail
  4245. end; { of function TRegExpr.Replace
  4246. -------------------------------------------------------------- }
  4247. function TRegExpr.ReplaceEx(const AInputStr: RegExprString;
  4248. AReplaceFunc: TRegExprReplaceFunction): RegExprString;
  4249. var
  4250. PrevPos: PtrInt;
  4251. begin
  4252. Result := '';
  4253. PrevPos := 1;
  4254. if Exec(AInputStr) then
  4255. repeat
  4256. Result := Result + System.Copy(AInputStr, PrevPos, MatchPos[0] - PrevPos)
  4257. + AReplaceFunc(Self);
  4258. PrevPos := MatchPos[0] + MatchLen[0];
  4259. until not ExecNext;
  4260. Result := Result + System.Copy(AInputStr, PrevPos, MaxInt); // Tail
  4261. end; { of function TRegExpr.ReplaceEx
  4262. -------------------------------------------------------------- }
  4263. function TRegExpr.Replace(const AInputStr: RegExprString;
  4264. AReplaceFunc: TRegExprReplaceFunction): RegExprString;
  4265. begin
  4266. Result := ReplaceEx(AInputStr, AReplaceFunc);
  4267. end; { of function TRegExpr.Replace
  4268. -------------------------------------------------------------- }
  4269. { ============================================================= }
  4270. { ====================== Debug section ======================== }
  4271. { ============================================================= }
  4272. {$IFDEF UseFirstCharSet}
  4273. procedure TRegExpr.FillFirstCharSet(prog: PRegExprChar);
  4274. var
  4275. scan: PRegExprChar; // Current node.
  4276. Next: PRegExprChar; // Next node.
  4277. opnd: PRegExprChar;
  4278. Oper: TREOp;
  4279. ch: REChar;
  4280. min_cnt, i: integer;
  4281. TempSet: TRegExprCharset;
  4282. begin
  4283. TempSet := [];
  4284. scan := prog;
  4285. while scan <> nil do
  4286. begin
  4287. Next := regnext(scan);
  4288. Oper := PREOp(scan)^;
  4289. case Oper of
  4290. OP_BSUBEXP,
  4291. OP_BSUBEXPCI:
  4292. begin
  4293. // we cannot optimize r.e. if it starts with back reference
  4294. FirstCharSet := RegExprAllSet; //###0.930
  4295. Exit;
  4296. end;
  4297. OP_BOL,
  4298. OP_BOLML:
  4299. ; // Exit; //###0.937
  4300. OP_EOL,
  4301. OP_EOLML:
  4302. begin //###0.948 was empty in 0.947, was EXIT in 0.937
  4303. Include(FirstCharSet, 0);
  4304. if ModifierM then
  4305. for i := 1 to Length(LineSeparators) do
  4306. Include(FirstCharSet, byte(LineSeparators[i]));
  4307. Exit;
  4308. end;
  4309. OP_BOUND,
  4310. OP_NOTBOUND:
  4311. ; //###0.943 ?!!
  4312. OP_ANY,
  4313. OP_ANYML:
  4314. begin // we can better define ANYML !!!
  4315. FirstCharSet := RegExprAllSet; //###0.930
  4316. Exit;
  4317. end;
  4318. OP_ANYDIGIT:
  4319. begin
  4320. FirstCharSet := FirstCharSet + RegExprDigitSet;
  4321. Exit;
  4322. end;
  4323. OP_NOTDIGIT:
  4324. begin
  4325. FirstCharSet := FirstCharSet + (RegExprAllSet - RegExprDigitSet);
  4326. Exit;
  4327. end;
  4328. OP_ANYLETTER:
  4329. begin
  4330. GetCharSetFromWordChars(TempSet);
  4331. FirstCharSet := FirstCharSet + TempSet;
  4332. Exit;
  4333. end;
  4334. OP_NOTLETTER:
  4335. begin
  4336. GetCharSetFromWordChars(TempSet);
  4337. FirstCharSet := FirstCharSet + (RegExprAllSet - TempSet);
  4338. Exit;
  4339. end;
  4340. OP_ANYSPACE:
  4341. begin
  4342. GetCharSetFromSpaceChars(TempSet);
  4343. FirstCharSet := FirstCharSet + TempSet;
  4344. Exit;
  4345. end;
  4346. OP_NOTSPACE:
  4347. begin
  4348. GetCharSetFromSpaceChars(TempSet);
  4349. FirstCharSet := FirstCharSet + (RegExprAllSet - TempSet);
  4350. Exit;
  4351. end;
  4352. OP_ANYVERTSEP:
  4353. begin
  4354. FirstCharSet := FirstCharSet + RegExprLineSeparatorsSet;
  4355. Exit;
  4356. end;
  4357. OP_NOTVERTSEP:
  4358. begin
  4359. FirstCharSet := FirstCharSet + (RegExprAllSet - RegExprLineSeparatorsSet);
  4360. Exit;
  4361. end;
  4362. OP_ANYHORZSEP:
  4363. begin
  4364. FirstCharSet := FirstCharSet + RegExprHorzSeparatorsSet;
  4365. Exit;
  4366. end;
  4367. OP_NOTHORZSEP:
  4368. begin
  4369. FirstCharSet := FirstCharSet + (RegExprAllSet - RegExprHorzSeparatorsSet);
  4370. Exit;
  4371. end;
  4372. OP_EXACTLYCI:
  4373. begin
  4374. ch := (scan + REOpSz + RENextOffSz + RENumberSz)^;
  4375. {$IFDEF UniCode}
  4376. if Ord(ch) <= $FF then
  4377. {$ENDIF}
  4378. begin
  4379. Include(FirstCharSet, byte(ch));
  4380. Include(FirstCharSet, byte(InvertCase(ch)));
  4381. end;
  4382. Exit;
  4383. end;
  4384. OP_EXACTLY:
  4385. begin
  4386. ch := (scan + REOpSz + RENextOffSz + RENumberSz)^;
  4387. {$IFDEF UniCode}
  4388. if Ord(ch) <= $FF then
  4389. {$ENDIF}
  4390. Include(FirstCharSet, byte(ch));
  4391. Exit;
  4392. end;
  4393. OP_ANYOF:
  4394. begin
  4395. GetCharSetFromCharClass(scan + REOpSz + RENextOffSz, False, TempSet);
  4396. FirstCharSet := FirstCharSet + TempSet;
  4397. Exit;
  4398. end;
  4399. OP_ANYBUT:
  4400. begin
  4401. GetCharSetFromCharClass(scan + REOpSz + RENextOffSz, False, TempSet);
  4402. FirstCharSet := FirstCharSet + (RegExprAllSet - TempSet);
  4403. Exit;
  4404. end;
  4405. OP_ANYOFCI:
  4406. begin
  4407. GetCharSetFromCharClass(scan + REOpSz + RENextOffSz, True, TempSet);
  4408. FirstCharSet := FirstCharSet + TempSet;
  4409. Exit;
  4410. end;
  4411. OP_ANYBUTCI:
  4412. begin
  4413. GetCharSetFromCharClass(scan + REOpSz + RENextOffSz, True, TempSet);
  4414. FirstCharSet := FirstCharSet + (RegExprAllSet - TempSet);
  4415. Exit;
  4416. end;
  4417. OP_NOTHING:
  4418. ;
  4419. OP_COMMENT:
  4420. ;
  4421. OP_BACK:
  4422. ;
  4423. Succ(OP_OPEN) .. TREOp(Ord(OP_OPEN) + NSUBEXP - 1):
  4424. begin //###0.929
  4425. FillFirstCharSet(Next);
  4426. Exit;
  4427. end;
  4428. Succ(OP_CLOSE) .. TREOp(Ord(OP_CLOSE) + NSUBEXP - 1):
  4429. begin //###0.929
  4430. FillFirstCharSet(Next);
  4431. Exit;
  4432. end;
  4433. OP_BRANCH:
  4434. begin
  4435. if (PREOp(Next)^ <> OP_BRANCH) // No choice.
  4436. then
  4437. Next := scan + REOpSz + RENextOffSz // Avoid recursion.
  4438. else
  4439. begin
  4440. repeat
  4441. FillFirstCharSet(scan + REOpSz + RENextOffSz);
  4442. scan := regnext(scan);
  4443. until (scan = nil) or (PREOp(scan)^ <> OP_BRANCH);
  4444. Exit;
  4445. end;
  4446. end;
  4447. {$IFDEF ComplexBraces}
  4448. OP_LOOPENTRY:
  4449. begin //###0.925
  4450. //LoopStack [LoopStackIdx] := 0; //###0.940 line removed
  4451. FillFirstCharSet(Next); // execute LOOP
  4452. Exit;
  4453. end;
  4454. OP_LOOP,
  4455. OP_LOOPNG:
  4456. begin //###0.940
  4457. opnd := scan + PRENextOff(AlignToPtr(scan + REOpSz + RENextOffSz + REBracesArgSz * 2))^;
  4458. min_cnt := PREBracesArg(AlignToPtr(scan + REOpSz + RENextOffSz))^;
  4459. FillFirstCharSet(opnd);
  4460. if min_cnt = 0 then
  4461. FillFirstCharSet(Next);
  4462. Exit;
  4463. end;
  4464. {$ENDIF}
  4465. OP_STAR,
  4466. OP_STARNG: //###0.940
  4467. FillFirstCharSet(scan + REOpSz + RENextOffSz);
  4468. OP_PLUS,
  4469. OP_PLUSNG:
  4470. begin //###0.940
  4471. FillFirstCharSet(scan + REOpSz + RENextOffSz);
  4472. Exit;
  4473. end;
  4474. OP_BRACES,
  4475. OP_BRACESNG:
  4476. begin //###0.940
  4477. opnd := scan + REOpSz + RENextOffSz + REBracesArgSz * 2;
  4478. min_cnt := PREBracesArg(AlignToPtr(scan + REOpSz + RENextOffSz))^; // BRACES
  4479. FillFirstCharSet(opnd);
  4480. if min_cnt > 0 then
  4481. Exit;
  4482. end;
  4483. OP_EEND:
  4484. begin
  4485. FirstCharSet := RegExprAllSet; //###0.948
  4486. Exit;
  4487. end;
  4488. else
  4489. begin
  4490. fLastErrorOpcode := Oper;
  4491. Error(reeUnknownOpcodeInFillFirst);
  4492. Exit;
  4493. end;
  4494. end; { of case scan^}
  4495. scan := Next;
  4496. end; { of while scan <> nil}
  4497. end; { of procedure FillFirstCharSet
  4498. --------------------------------------------------------------}
  4499. {$ENDIF}
  4500. procedure TRegExpr.InitCharCheckers;
  4501. var
  4502. Cnt: integer;
  4503. //
  4504. function Add(AChecker: TRegExprCharChecker): byte;
  4505. begin
  4506. Inc(Cnt);
  4507. if Cnt > High(CharCheckers) then
  4508. raise Exception.Create('Too small CharCheckers array');
  4509. CharCheckers[Cnt - 1] := AChecker;
  4510. Result := Cnt - 1;
  4511. end;
  4512. //
  4513. begin
  4514. Cnt := 0;
  4515. FillChar(CharCheckers, SizeOf(CharCheckers), 0);
  4516. CheckerIndex_Word := Add(CharChecker_Word);
  4517. CheckerIndex_NotWord := Add(CharChecker_NotWord);
  4518. CheckerIndex_Space := Add(CharChecker_Space);
  4519. CheckerIndex_NotSpace := Add(CharChecker_NotSpace);
  4520. CheckerIndex_Digit := Add(CharChecker_Digit);
  4521. CheckerIndex_NotDigit := Add(CharChecker_NotDigit);
  4522. CheckerIndex_VertSep := Add(CharChecker_VertSep);
  4523. CheckerIndex_NotVertSep := Add(CharChecker_NotVertSep);
  4524. CheckerIndex_HorzSep := Add(CharChecker_HorzSep);
  4525. CheckerIndex_NotHorzSep := Add(CharChecker_NotHorzSep);
  4526. //CheckerIndex_AllAZ := Add(CharChecker_AllAZ);
  4527. CheckerIndex_LowerAZ := Add(CharChecker_LowerAZ);
  4528. CheckerIndex_UpperAZ := Add(CharChecker_UpperAZ);
  4529. SetLength(CharCheckerInfos, 3);
  4530. with CharCheckerInfos[0] do
  4531. begin
  4532. CharBegin := 'a';
  4533. CharEnd:= 'z';
  4534. CheckerIndex := CheckerIndex_LowerAZ;
  4535. end;
  4536. with CharCheckerInfos[1] do
  4537. begin
  4538. CharBegin := 'A';
  4539. CharEnd := 'Z';
  4540. CheckerIndex := CheckerIndex_UpperAZ;
  4541. end;
  4542. with CharCheckerInfos[2] do
  4543. begin
  4544. CharBegin := '0';
  4545. CharEnd := '9';
  4546. CheckerIndex := CheckerIndex_Digit;
  4547. end;
  4548. end;
  4549. function TRegExpr.CharChecker_Word(ch: REChar): boolean;
  4550. begin
  4551. Result := IsWordChar(ch);
  4552. end;
  4553. function TRegExpr.CharChecker_NotWord(ch: REChar): boolean;
  4554. begin
  4555. Result := not IsWordChar(ch);
  4556. end;
  4557. function TRegExpr.CharChecker_Space(ch: REChar): boolean;
  4558. begin
  4559. Result := IsSpaceChar(ch);
  4560. end;
  4561. function TRegExpr.CharChecker_NotSpace(ch: REChar): boolean;
  4562. begin
  4563. Result := not IsSpaceChar(ch);
  4564. end;
  4565. function TRegExpr.CharChecker_Digit(ch: REChar): boolean;
  4566. begin
  4567. Result := IsDigitChar(ch);
  4568. end;
  4569. function TRegExpr.CharChecker_NotDigit(ch: REChar): boolean;
  4570. begin
  4571. Result := not IsDigitChar(ch);
  4572. end;
  4573. function TRegExpr.CharChecker_VertSep(ch: REChar): boolean;
  4574. begin
  4575. Result := IsLineSeparator(ch);
  4576. end;
  4577. function TRegExpr.CharChecker_NotVertSep(ch: REChar): boolean;
  4578. begin
  4579. Result := not IsLineSeparator(ch);
  4580. end;
  4581. function TRegExpr.CharChecker_HorzSep(ch: REChar): boolean;
  4582. begin
  4583. Result := IsHorzSeparator(ch);
  4584. end;
  4585. function TRegExpr.CharChecker_NotHorzSep(ch: REChar): boolean;
  4586. begin
  4587. Result := not IsHorzSeparator(ch);
  4588. end;
  4589. function TRegExpr.CharChecker_LowerAZ(ch: REChar): boolean;
  4590. begin
  4591. case ch of
  4592. 'a' .. 'z':
  4593. Result := True;
  4594. else
  4595. Result := False;
  4596. end;
  4597. end;
  4598. function TRegExpr.CharChecker_UpperAZ(ch: REChar): boolean;
  4599. begin
  4600. case ch of
  4601. 'A' .. 'Z':
  4602. Result := True;
  4603. else
  4604. Result := False;
  4605. end;
  4606. end;
  4607. {$IFDEF RegExpPCodeDump}
  4608. function TRegExpr.DumpOp(op: TREOp): RegExprString;
  4609. // printable representation of opcode
  4610. begin
  4611. case op of
  4612. OP_BOL:
  4613. Result := 'BOL';
  4614. OP_EOL:
  4615. Result := 'EOL';
  4616. OP_BOLML:
  4617. Result := 'BOLML';
  4618. OP_EOLML:
  4619. Result := 'EOLML';
  4620. OP_BOUND:
  4621. Result := 'BOUND'; // ###0.943
  4622. OP_NOTBOUND:
  4623. Result := 'NOTBOUND'; // ###0.943
  4624. OP_ANY:
  4625. Result := 'ANY';
  4626. OP_ANYML:
  4627. Result := 'ANYML'; // ###0.941
  4628. OP_ANYLETTER:
  4629. Result := 'ANYLETTER';
  4630. OP_NOTLETTER:
  4631. Result := 'NOTLETTER';
  4632. OP_ANYDIGIT:
  4633. Result := 'ANYDIGIT';
  4634. OP_NOTDIGIT:
  4635. Result := 'NOTDIGIT';
  4636. OP_ANYSPACE:
  4637. Result := 'ANYSPACE';
  4638. OP_NOTSPACE:
  4639. Result := 'NOTSPACE';
  4640. OP_ANYHORZSEP:
  4641. Result := 'ANYHORZSEP';
  4642. OP_NOTHORZSEP:
  4643. Result := 'NOTHORZSEP';
  4644. OP_ANYVERTSEP:
  4645. Result := 'ANYVERTSEP';
  4646. OP_NOTVERTSEP:
  4647. Result := 'NOTVERTSEP';
  4648. OP_ANYOF:
  4649. Result := 'ANYOF';
  4650. OP_ANYBUT:
  4651. Result := 'ANYBUT';
  4652. OP_ANYOFCI:
  4653. Result := 'ANYOF/CI';
  4654. OP_ANYBUTCI:
  4655. Result := 'ANYBUT/CI';
  4656. OP_BRANCH:
  4657. Result := 'BRANCH';
  4658. OP_EXACTLY:
  4659. Result := 'EXACTLY';
  4660. OP_EXACTLYCI:
  4661. Result := 'EXACTLY/CI';
  4662. OP_NOTHING:
  4663. Result := 'NOTHING';
  4664. OP_COMMENT:
  4665. Result := 'COMMENT';
  4666. OP_BACK:
  4667. Result := 'BACK';
  4668. OP_EEND:
  4669. Result := 'END';
  4670. OP_BSUBEXP:
  4671. Result := 'BSUBEXP';
  4672. OP_BSUBEXPCI:
  4673. Result := 'BSUBEXP/CI';
  4674. Succ(OP_OPEN) .. TREOp(Ord(OP_OPEN) + NSUBEXP - 1): // ###0.929
  4675. Result := Format('OPEN[%d]', [Ord(op) - Ord(OP_OPEN)]);
  4676. Succ(OP_CLOSE) .. TREOp(Ord(OP_CLOSE) + NSUBEXP - 1): // ###0.929
  4677. Result := Format('CLOSE[%d]', [Ord(op) - Ord(OP_CLOSE)]);
  4678. OP_STAR:
  4679. Result := 'STAR';
  4680. OP_PLUS:
  4681. Result := 'PLUS';
  4682. OP_BRACES:
  4683. Result := 'BRACES';
  4684. {$IFDEF ComplexBraces}
  4685. OP_LOOPENTRY:
  4686. Result := 'LOOPENTRY'; // ###0.925
  4687. OP_LOOP:
  4688. Result := 'LOOP'; // ###0.925
  4689. OP_LOOPNG:
  4690. Result := 'LOOPNG'; // ###0.940
  4691. {$ENDIF}
  4692. OP_STARNG:
  4693. Result := 'STARNG'; // ###0.940
  4694. OP_PLUSNG:
  4695. Result := 'PLUSNG'; // ###0.940
  4696. OP_BRACESNG:
  4697. Result := 'BRACESNG'; // ###0.940
  4698. else
  4699. Error(reeDumpCorruptedOpcode);
  4700. end; { of case op }
  4701. Result := ':' + Result;
  4702. end; { of function TRegExpr.DumpOp
  4703. -------------------------------------------------------------- }
  4704. function TRegExpr.Dump: RegExprString;
  4705. // dump a regexp in vaguely comprehensible form
  4706. var
  4707. s: PRegExprChar;
  4708. op: TREOp; // Arbitrary non-END op.
  4709. next: PRegExprChar;
  4710. i, NLen: integer;
  4711. Diff: PtrInt;
  4712. Ch: AnsiChar;
  4713. function PrintableChar(AChar: REChar): string; {$IFDEF InlineFuncs}inline;{$ENDIF}
  4714. begin
  4715. if AChar < ' ' then
  4716. Result := '#' + IntToStr(Ord(AChar))
  4717. else
  4718. Result := AChar;
  4719. end;
  4720. begin
  4721. if not IsProgrammOk then
  4722. Exit;
  4723. op := OP_EXACTLY;
  4724. Result := '';
  4725. s := programm + REOpSz;
  4726. while op <> OP_EEND do
  4727. begin // While that wasn't END last time...
  4728. op := s^;
  4729. Result := Result + Format('%2d%s', [s - programm, DumpOp(s^)]);
  4730. // Where, what.
  4731. next := regnext(s);
  4732. if next = nil // Next ptr.
  4733. then
  4734. Result := Result + ' (0)'
  4735. else
  4736. begin
  4737. if next > s
  4738. // ###0.948 PWideChar subtraction workaround (see comments in Tail method for details)
  4739. then
  4740. Diff := next - s
  4741. else
  4742. Diff := -(s - next);
  4743. Result := Result + Format(' (%d) ', [(s - programm) + Diff]);
  4744. end;
  4745. Inc(s, REOpSz + RENextOffSz);
  4746. if (op = OP_ANYOF) or (op = OP_ANYOFCI) or (op = OP_ANYBUT) or (op = OP_ANYBUTCI) then
  4747. begin
  4748. repeat
  4749. case s^ of
  4750. OpKind_End:
  4751. begin
  4752. Inc(s);
  4753. Break;
  4754. end;
  4755. OpKind_Range:
  4756. begin
  4757. Result := Result + 'Rng(';
  4758. Inc(s);
  4759. Result := Result + PrintableChar(s^) + '-';
  4760. Inc(s);
  4761. Result := Result + PrintableChar(s^);
  4762. Result := Result + ') ';
  4763. Inc(s);
  4764. end;
  4765. OpKind_MetaClass:
  4766. begin
  4767. Inc(s);
  4768. Result := Result + '\' + PrintableChar(s^) + ' ';
  4769. Inc(s);
  4770. end;
  4771. OpKind_Char:
  4772. begin
  4773. Inc(s);
  4774. NLen := PLongInt(s)^;
  4775. Inc(s, RENumberSz);
  4776. Result := Result + 'Ch(';
  4777. for i := 1 to NLen do
  4778. begin
  4779. Result := Result + PrintableChar(s^);
  4780. Inc(s);
  4781. end;
  4782. Result := Result + ') ';
  4783. end;
  4784. else
  4785. Error(reeDumpCorruptedOpcode);
  4786. end;
  4787. until false;
  4788. end;
  4789. if (op = OP_EXACTLY) or (op = OP_EXACTLYCI) then
  4790. begin
  4791. // Literal string, where present.
  4792. NLen := PLongInt(s)^;
  4793. Inc(s, RENumberSz);
  4794. for i := 1 to NLen do
  4795. begin
  4796. Result := Result + PrintableChar(s^);
  4797. Inc(s);
  4798. end;
  4799. end;
  4800. if (op = OP_BSUBEXP) or (op = OP_BSUBEXPCI) then
  4801. begin
  4802. Result := Result + ' \' + IntToStr(Ord(s^));
  4803. Inc(s);
  4804. end;
  4805. if (op = OP_BRACES) or (op = OP_BRACESNG) then
  4806. begin // ###0.941
  4807. // show min/max argument of braces operator
  4808. Result := Result + Format('{%d,%d}', [PREBracesArg(AlignToInt(s))^,
  4809. PREBracesArg(AlignToInt(s + REBracesArgSz))^]);
  4810. Inc(s, REBracesArgSz * 2);
  4811. end;
  4812. {$IFDEF ComplexBraces}
  4813. if (op = OP_LOOP) or (op = OP_LOOPNG) then
  4814. begin // ###0.940
  4815. Result := Result + Format(' -> (%d) {%d,%d}',
  4816. [(s - programm - (REOpSz + RENextOffSz)) +
  4817. PRENextOff(AlignToPtr(s + 2 * REBracesArgSz))^,
  4818. PREBracesArg(AlignToInt(s))^,
  4819. PREBracesArg(AlignToInt(s + REBracesArgSz))^]);
  4820. Inc(s, 2 * REBracesArgSz + RENextOffSz);
  4821. end;
  4822. {$ENDIF}
  4823. Result := Result + #$d#$a;
  4824. end; { of while }
  4825. // Header fields of interest.
  4826. if reganchored <> #0 then
  4827. Result := Result + 'Anchored; ';
  4828. if regmustString <> '' then
  4829. Result := Result + 'Must have: "' + regmustString + '"; ';
  4830. {$IFDEF UseFirstCharSet} // ###0.929
  4831. Result := Result + #$d#$a'First charset: ';
  4832. if FirstCharSet = [] then
  4833. Result := Result + '<empty set>'
  4834. else
  4835. if FirstCharSet = RegExprAllSet then
  4836. Result := Result + '<all chars>'
  4837. else
  4838. for Ch := #0 to #255 do
  4839. if byte(Ch) in FirstCharSet then
  4840. begin
  4841. if Ch < ' ' then
  4842. Result := Result + PrintableChar(Ch) // ###0.948
  4843. else
  4844. Result := Result + Ch;
  4845. end;
  4846. {$ENDIF}
  4847. Result := Result + #$d#$a;
  4848. end; { of function TRegExpr.Dump
  4849. -------------------------------------------------------------- }
  4850. {$ENDIF}
  4851. {$IFDEF reRealExceptionAddr}
  4852. {$OPTIMIZATION ON}
  4853. // ReturnAddr works correctly only if compiler optimization is ON
  4854. // I placed this method at very end of unit because there are no
  4855. // way to restore compiler optimization flag ...
  4856. {$ENDIF}
  4857. procedure TRegExpr.Error(AErrorID: integer);
  4858. {$IFDEF reRealExceptionAddr}
  4859. function ReturnAddr: Pointer; // ###0.938
  4860. asm
  4861. mov eax,[ebp+4]
  4862. end;
  4863. {$ENDIF}
  4864. var
  4865. e: ERegExpr;
  4866. begin
  4867. fLastError := AErrorID; // dummy stub - useless because will raise exception
  4868. if AErrorID < 1000 // compilation error ?
  4869. then
  4870. e := ERegExpr.Create(ErrorMsg(AErrorID) // yes - show error pos
  4871. + ' (pos ' + IntToStr(CompilerErrorPos) + ')')
  4872. else
  4873. e := ERegExpr.Create(ErrorMsg(AErrorID));
  4874. e.ErrorCode := AErrorID;
  4875. e.CompilerErrorPos := CompilerErrorPos;
  4876. raise e
  4877. {$IFDEF reRealExceptionAddr}
  4878. at ReturnAddr; // ###0.938
  4879. {$ENDIF}
  4880. end; { of procedure TRegExpr.Error
  4881. -------------------------------------------------------------- }
  4882. (*
  4883. PCode persistence:
  4884. FirstCharSet
  4885. programm, regsize
  4886. reganchored // -> programm
  4887. regmust, regmustlen // -> programm
  4888. fExprIsCompiled
  4889. *)
  4890. // be carefull - placed here code will be always compiled with
  4891. // compiler optimization flag
  4892. initialization
  4893. RegExprInvertCaseFunction := TRegExpr.InvertCaseFunction;
  4894. end.