regexpr.pas 159 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
74278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477
74778477947804781478247834784478547864787478847894790479147924793479447954796479747984799480048014802480348044805480648074808480948104811481248134814481548164817481848194820482148224823482448254826482748284829483048314832483348344835483648374838483948404841484248434844484548464847484848494850485148524853485448554856485748584859486048614862486348644865486648674868486948704871487248734874487548764877487848794880488148824883488448854886488748884889489048914892489348944895489648974898489949004901490249034904490549064907490849094910491149124913491449154916491749184919492049214922492349244925492649274928492949304931493249334934493549364937493849394940494149424943494449454946494749484949495049514952495349544955495649574958495949604961496249634964496549664967496849694970497149724973497449754976497749784979498049814982498349844985498649874988498949904991499249934994499549964997499849995000500150025003500450055006500750085009501050115012501350145015501650175018501950205021502250235024502550265027502850295030503150325033503450355036503750385039504050415042504350445045504650475048504950505051505250535054505550565057505850595060506150625063506450655066506750685069507050715072507350745075507650775078507950805081508250835084508550865087508850895090509150925093509450955096509750985099510051015102510351045105510651075108510951105111511251135114511551165117511851195120512151225123512451255126512751285129513051315132513351345135513651375138513951405141514251435144514551465147514851495150515151525153515451555156515751585159516051615162516351645165516651675168516951705171517251735174517551765177517851795180518151825183518451855186518751885189519051915192519351945195519651975198519952005201520252035204520552065207520852095210521152125213521452155216521752185219522052215222522352245225522652275228522952305231523252335234523552365237523852395240524152425243524452455246524752485249525052515252525352545255525652575258525952605261526252635264526552665267526852695270527152725273527452755276527
752785279528052815282528352845285528652875288528952905291529252935294529552965297529852995300530153025303530453055306530753085309531053115312531353145315531653175318531953205321532253235324532553265327532853295330533153325333533453355336533753385339534053415342534353445345534653475348
  1. unit RegExpr;
  2. {
  3. TRegExpr class library
  4. Delphi Regular Expressions
  5. Copyright (c) 1999-2004 Andrey V. Sorokin, St.Petersburg, Russia
  6. You can choose to use this Pascal unit in one of the two following licenses:
  7. Option 1>
  8. You may use this software in any kind of development,
  9. including comercial, redistribute, and modify it freely,
  10. under the following restrictions :
  11. 1. This software is provided as it is, without any kind of
  12. warranty given. Use it at Your own risk.The author is not
  13. responsible for any consequences of use of this software.
  14. 2. The origin of this software may not be mispresented, You
  15. must not claim that You wrote the original software. If
  16. You use this software in any kind of product, it would be
  17. appreciated that there in a information box, or in the
  18. documentation would be an acknowledgement like
  19. Partial Copyright (c) 2004 Andrey V. Sorokin
  20. https://sorokin.engineer/
  21. [email protected]
  22. 3. You may not have any income from distributing this source
  23. (or altered version of it) to other developers. When You
  24. use this product in a comercial package, the source may
  25. not be charged seperatly.
  26. 4. Altered versions must be plainly marked as such, and must
  27. not be misrepresented as being the original software.
  28. 5. RegExp Studio application and all the visual components as
  29. well as documentation is not part of the TRegExpr library
  30. and is not free for usage.
  31. https://sorokin.engineer/
  32. [email protected]
  33. Option 2>
  34. The same modified LGPL with static linking exception as the Free Pascal RTL
  35. }
  36. interface
  37. { off $DEFINE DebugSynRegExpr }
  38. {$MODE DELPHI} // Delphi-compatible mode in FreePascal
  39. // Disabling for now, seems to cause bug in Lazarus (bug ID 36603)
  40. {$INLINE ON}
  41. // ======== Define base compiler options
  42. {$BOOLEVAL OFF}
  43. {$EXTENDEDSYNTAX ON}
  44. {$LONGSTRINGS ON}
  45. { use optimization settings passed via fpmake/make }
  46. {OPTIMIZATION ON}
  47. // ======== Define options for TRegExpr engine
  48. {$DEFINE UseFirstCharSet} // Enable optimization, which finds possible first chars of input string
  49. {$DEFINE RegExpPCodeDump} // Enable method Dump() to show opcode as string
  50. {$DEFINE ComplexBraces} // Support braces in complex cases
  51. {$IFNDEF UniCode}
  52. {$UNDEF UnicodeWordDetection}
  53. {$ELSE}
  54. {$DEFINE UnicodeWordDetection}
  55. {$ENDIF}
  56. uses
  57. Math, // Min
  58. Classes, // TStrings in Split method
  59. SysUtils; // Exception
  60. type
  61. {$IFDEF UniCode}
  62. PRegExprChar = PWideChar; // pointer-to-character type used by the engine (UniCode build)
  63. RegExprString = UnicodeString; // string type for expressions and input (UniCode build)
  64. REChar = WideChar; // single character type (UniCode build)
  65. {$ELSE}
  66. PRegExprChar = PChar; // pointer-to-character type used by the engine (non-UniCode build)
  67. RegExprString = AnsiString; // ###0.952 was string
  68. REChar = Char; // single character type (non-UniCode build)
  69. {$ENDIF}
  70. TREOp = REChar; // internal p-code type (an opcode has the size of one REChar) //###0.933
  71. PREOp = ^TREOp; // pointer to a p-code opcode
  72. type
  73. TRegExprInvertCaseFunction = function(const Ch: REChar): REChar of object; // method callback: receives a char, returns a char (type of TRegExpr.fInvertCase)
  74. TRegExprCharset = set of byte; // set of byte values, so it can only hold ordinals 0..255
  75. const
  76. // Escape char ('\' in common r.e.) used for escaping metachars (\w, \d etc)
  77. EscChar = '\';
  78. RegExprModifierI: boolean = False; // default value for ModifierI
  79. RegExprModifierR: boolean = True; // default value for ModifierR
  80. RegExprModifierS: boolean = True; // default value for ModifierS
  81. RegExprModifierG: boolean = True; // default value for ModifierG
  82. RegExprModifierM: boolean = False; // default value for ModifierM
  83. RegExprModifierX: boolean = False; // default value for ModifierX
  84. // default value for SpaceChars: space, TAB (#$9), LF (#$A), CR (#$D), FF (#$C)
  85. RegExprSpaceChars: RegExprString = ' '#$9#$A#$D#$C;
  86. // default value for WordChars: ASCII digits, ASCII letters and underscore
  87. RegExprWordChars: RegExprString = '0123456789'
  88. + 'abcdefghijklmnopqrstuvwxyz'
  89. + 'ABCDEFGHIJKLMNOPQRSTUVWXYZ_';
  90. // default value for LineSeparators: CR, LF, VT (#$b), FF (#$c); UniCode builds also add #$2028, #$2029 and #$85
  91. RegExprLineSeparators: RegExprString = #$d#$a#$b#$c
  92. {$IFDEF UniCode}
  93. + #$2028#$2029#$85
  94. {$ENDIF};
  95. // default value for LinePairedSeparator: the two-char sequence CR LF
  96. RegExprLinePairedSeparator: RegExprString = #$d#$a;
  97. { If you need Unix-style line separators (only \n), then use:
  98. RegExprLineSeparators = #$a;
  99. RegExprLinePairedSeparator = '';
  100. }
  101. // Tab, space, no-break space (#$A0) and, in UniCode builds, the rest of Unicode category "Space Separator":
  102. // https://www.compart.com/en/unicode/category/Zs
  103. RegExprHorzSeparators: RegExprString = #9#$20#$A0
  104. {$IFDEF UniCode}
  105. + #$1680#$2000#$2001#$2002#$2003#$2004#$2005#$2006#$2007#$2008#$2009#$200A#$202F#$205F#$3000
  106. {$ENDIF};
  107. const
  108. NSUBEXP = 90; // max number of subexpressions //###0.929
  109. // Cannot be more than NSUBEXPMAX.
  110. // Be careful - don't use values which overflow the CLOSE opcode
  111. // (in this case you'll get a compiler error).
  112. // A larger NSUBEXP makes the engine slower and requires more stack.
  113. NSUBEXPMAX = 255; // Max possible value for NSUBEXP. //###0.945
  114. // Don't change it! It's defined by internal TRegExpr design.
  115. {$IFDEF ComplexBraces}
  116. const
  117. LoopStackMax = 10; // max depth of the loop stack //###0.925
  118. type
  119. TRegExprLoopStack = array [1 .. LoopStackMax] of integer; // 1-based array with LoopStackMax integer slots
  120. {$ENDIF}
  121. type
  122. TRegExprModifiers = record // one boolean flag per regex modifier; each field's description follows it
  123. I: boolean;
  124. // Case-insensitive matching.
  125. R: boolean;
  126. // Extended syntax for Russian ranges in [].
  127. // If True, then а-я additionally includes letter 'ё',
  128. // А-Я additionally includes 'Ё', and а-Я includes all Russian letters.
  129. // Turn it off if it interferes with your national alphabet.
  130. S: boolean;
  131. // If True, dot '.' matches any char; otherwise it matches only [^\n].
  132. G: boolean;
  133. // Greedy. Switching it off switches all operators to non-greedy style,
  134. // so if G=False, then '*' works like '*?', '+' works like '+?' and so on.
  135. M: boolean;
  136. // Treat string as multiple lines. It changes '^' and '$' from
  137. // matching at only the very start/end of the string to the start/end
  138. // of any line anywhere within the string.
  139. X: boolean;
  140. // Allow comments in the regex, introduced by the '#' char.
  141. end;
  142. function IsModifiersEqual(const A, B: TRegExprModifiers): boolean; // NOTE(review): presumably True when every flag of A equals the same flag of B - implementation is not in this section, confirm
  143. type
  144. TRegExpr = class; // forward declaration, needed by the callback types below
  145. TRegExprReplaceFunction = function(ARegExpr: TRegExpr): RegExprString of object; // method callback: receives a TRegExpr, returns a string
  146. TRegExprCharChecker = function(ch: REChar): boolean of object; // method callback: boolean test of a single char
  147. TRegExprCharCheckerArray = array[0 .. 30] of TRegExprCharChecker; // fixed table with 31 checker slots (indexes 0..30)
  148. TRegExprCharCheckerInfo = record
  149. CharBegin, CharEnd: REChar; // NOTE(review): presumably the bounds of a char range - confirm against usage
  150. CheckerIndex: integer; // NOTE(review): presumably an index into TRegExprCharCheckerArray - confirm against usage
  151. end;
  152. TRegExprCharCheckerInfos = array of TRegExprCharCheckerInfo; // dynamic array of checker info records
  153. { TRegExpr }
  154. TRegExpr = class
  155. private
  156. startp: array [0 .. NSUBEXP - 1] of PRegExprChar; // found expr start points
  157. endp: array [0 .. NSUBEXP - 1] of PRegExprChar; // found expr end points
  158. GrpIndexes: array [0 .. NSUBEXP - 1] of integer;
  159. GrpCount: integer;
  160. {$IFDEF ComplexBraces}
  161. LoopStack: TRegExprLoopStack; // state before entering loop
  162. LoopStackIdx: integer; // 0 - out of all loops
  163. {$ENDIF}
  164. // The "internal use only" fields to pass info from compile
  165. // to execute that permits the execute phase to run lots faster on
  166. // simple cases.
  167. reganchored: REChar; // is the match anchored (at beginning-of-line only)?
  168. regmust: PRegExprChar; // string (pointer into program) that match must include, or nil
  169. regmustlen: integer; // length of regmust string
  170. regmustString: RegExprString;
  171. // reganchored permits very fast decisions on suitable starting points
  172. // for a match, cutting down the work a lot. Regmust permits fast rejection
  173. // of lines that cannot possibly match. The regmust tests are costly enough
  174. // that regcomp() supplies a regmust only if the r.e. contains something
  175. // potentially expensive (at present, the only such thing detected is * or +
  176. // at the start of the r.e., which can involve a lot of backup). regmustlen is
  177. // supplied because the test in regexec() needs it and regcomp() is computing
  178. // it anyway.
  179. {$IFDEF UseFirstCharSet}
  180. FirstCharSet: TRegExprCharset;
  181. FirstCharArray: array[byte] of boolean;
  182. {$ENDIF}
  183. // work variables for Exec routines - save stack in recursion
  184. reginput: PRegExprChar; // String-input pointer.
  185. fInputStart: PRegExprChar; // Pointer to first char of input string.
  186. fInputEnd: PRegExprChar; // Pointer to char AFTER last char of input string
  187. fRegexStart: PRegExprChar;
  188. fRegexEnd: PRegExprChar;
  189. // work variables for compiler's routines
  190. regparse: PRegExprChar; // Input-scan pointer.
  191. regnpar: integer; // Count of () brackets.
  192. regdummy: REChar;
  193. regcode: PRegExprChar; // Code-emit pointer; @regdummy = don't.
  194. regsize: integer; // Total programm size in REChars.
  195. regExactlyLen: PLongInt;
  196. regexpBegin: PRegExprChar; // only for error handling. Contains pointer to beginning of r.e. while compiling
  197. regexpIsCompiled: boolean; // true if r.e. successfully compiled
  198. fSecondPass: boolean;
  199. // programm is essentially a linear encoding
  200. // of a nondeterministic finite-state machine (aka syntax charts or
  201. // "railroad normal form" in parsing technology). Each node is an opcode
  202. // plus a "next" pointer, possibly plus an operand. "Next" pointers of
  203. // all nodes except BRANCH implement concatenation; a "next" pointer with
  204. // a BRANCH on both ends of it connects two alternatives. (Here we
  205. // have one of the subtle syntax dependencies: an individual BRANCH (as
  206. // opposed to a collection of them) is never concatenated with anything
  207. // because of operator precedence.) The operand of some types of node is
  208. // a literal string; for others, it is a node leading into a sub-FSM. In
  209. // particular, the operand of a BRANCH node is the first node of the branch.
  210. // (NB this is *not* a tree structure: the tail of the branch connects
  211. // to the thing following the set of BRANCHes.) The opcodes are:
  212. programm: PRegExprChar; // Unwarranted chumminess with compiler.
  213. fExpression: RegExprString; // source of compiled r.e.
  214. fInputString: RegExprString; // input string
  215. fLastError: integer; // see Error, LastError
  216. fLastErrorOpcode: TREOp;
  217. fModifiers: TRegExprModifiers; // modifiers
  218. fCompModifiers: TRegExprModifiers; // compiler's copy of modifiers
  219. fProgModifiers: TRegExprModifiers; // modifiers values from last programm compilation
  220. fSpaceChars: RegExprString;
  221. fWordChars: RegExprString;
  222. fInvertCase: TRegExprInvertCaseFunction;
  223. fLineSeparators: RegExprString;
  224. fLinePairedSeparatorAssigned: boolean;
  225. fLinePairedSeparatorHead, fLinePairedSeparatorTail: REChar;
  226. FReplaceLineEnd: RegExprString; // string to use for "\n" in Substitute method
  227. FUseOsLineEndOnReplace: boolean; // use OS LineBreak chars (LF or CRLF) for FReplaceLineEnd
  228. fSlowChecksSizeMax: integer;
  229. // use ASlowChecks=True in Exec() only when Length(InputString)<SlowChecksSizeMax
  230. // ASlowChecks enables to use regmustString optimization
  231. {$IFNDEF UniCode}
  232. fLineSepArray: array[byte] of boolean;
  233. {$ENDIF}
  234. {$IFDEF UnicodeWordDetection}
  235. FUseUnicodeWordDetection: boolean;
  236. {$ENDIF}
  237. FEmptyInputRaisesError : Boolean;
  238. CharCheckers: TRegExprCharCheckerArray;
  239. CharCheckerInfos: TRegExprCharCheckerInfos;
  240. CheckerIndex_Word: byte;
  241. CheckerIndex_NotWord: byte;
  242. CheckerIndex_Digit: byte;
  243. CheckerIndex_NotDigit: byte;
  244. CheckerIndex_Space: byte;
  245. CheckerIndex_NotSpace: byte;
  246. CheckerIndex_HorzSep: byte;
  247. CheckerIndex_NotHorzSep: byte;
  248. CheckerIndex_VertSep: byte;
  249. CheckerIndex_NotVertSep: byte;
  250. CheckerIndex_LowerAZ: byte;
  251. CheckerIndex_UpperAZ: byte;
  252. procedure InitCharCheckers;
  253. function CharChecker_Word(ch: REChar): boolean;
  254. function CharChecker_NotWord(ch: REChar): boolean;
  255. function CharChecker_Space(ch: REChar): boolean;
  256. function CharChecker_NotSpace(ch: REChar): boolean;
  257. function CharChecker_Digit(ch: REChar): boolean;
  258. function CharChecker_NotDigit(ch: REChar): boolean;
  259. function CharChecker_HorzSep(ch: REChar): boolean;
  260. function CharChecker_NotHorzSep(ch: REChar): boolean;
  261. function CharChecker_VertSep(ch: REChar): boolean;
  262. function CharChecker_NotVertSep(ch: REChar): boolean;
  263. function CharChecker_LowerAZ(ch: REChar): boolean;
  264. function CharChecker_UpperAZ(ch: REChar): boolean;
  265. procedure ClearMatches; {$IFDEF InlineFuncs}inline;{$ENDIF}
  266. procedure ClearInternalIndexes; {$IFDEF InlineFuncs}inline;{$ENDIF}
  267. function FindInCharClass(ABuffer: PRegExprChar; AChar: REChar; AIgnoreCase: boolean): boolean;
  268. procedure GetCharSetFromCharClass(ABuffer: PRegExprChar; AIgnoreCase: boolean; var ARes: TRegExprCharset);
  269. procedure GetCharSetFromSpaceChars(var ARes: TRegExprCharset);
  270. procedure GetCharSetFromWordChars(var ARes: TRegExprCharSet);
  271. function IsWordChar(AChar: REChar): boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
  272. function IsSpaceChar(AChar: REChar): boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
  273. function IsCustomLineSeparator(AChar: REChar): boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
  274. procedure InitLineSepArray;
  275. // Mark programm as having to be [re]compiled
  276. procedure InvalidateProgramm;
  277. // Check if we can use precompiled r.e. or
  278. // [re]compile it if something changed
  279. function IsProgrammOk: boolean; // ###0.941
  280. procedure SetExpression(const AStr: RegExprString);
  281. function GetModifierStr: RegExprString;
  282. procedure SetModifierStr(const AStr: RegExprString);
  283. function GetModifierG: boolean;
  284. function GetModifierI: boolean;
  285. function GetModifierM: boolean;
  286. function GetModifierR: boolean;
  287. function GetModifierS: boolean;
  288. function GetModifierX: boolean;
  289. procedure SetModifierG(AValue: boolean);
  290. procedure SetModifierI(AValue: boolean);
  291. procedure SetModifierM(AValue: boolean);
  292. procedure SetModifierR(AValue: boolean);
  293. procedure SetModifierS(AValue: boolean);
  294. procedure SetModifierX(AValue: boolean);
  295. // Default handler raises exception ERegExpr with
  296. // Message = ErrorMsg (AErrorID), ErrorCode = AErrorID
  297. // and CompilerErrorPos = value of property CompilerErrorPos.
  298. procedure Error(AErrorID: integer); virtual; // error handler.
  299. { ==================== Compiler section =================== }
  300. // compile a regular expression into internal code
  301. function CompileRegExpr(ARegExp: PRegExprChar): boolean;
  302. procedure SetUseOsLineEndOnReplace(AValue: boolean);
  303. // set the next-pointer at the end of a node chain
  304. procedure Tail(p: PRegExprChar; val: PRegExprChar);
  305. // regoptail - regtail on operand of first argument; nop if operandless
  306. procedure OpTail(p: PRegExprChar; val: PRegExprChar);
  307. // regnode - emit a node, return location
  308. function EmitNode(op: TREOp): PRegExprChar;
  309. // emit (if appropriate) a byte of code
  310. procedure EmitC(ch: REChar);
  311. // emit LongInt value
  312. procedure EmitInt(AValue: LongInt);
  313. // insert an operator in front of already-emitted operand
  314. // Means relocating the operand.
  315. procedure InsertOperator(op: TREOp; opnd: PRegExprChar; sz: integer);
  316. // ###0.90
  317. // regular expression, i.e. main body or parenthesized thing
  318. function ParseReg(paren: integer; var flagp: integer): PRegExprChar;
  319. // one alternative of an | operator
  320. function ParseBranch(var flagp: integer): PRegExprChar;
  321. // something followed by possible [*+?]
  322. function ParsePiece(var flagp: integer): PRegExprChar;
  323. function HexDig(Ch: REChar): integer;
  324. function UnQuoteChar(var APtr: PRegExprChar): REChar;
  325. // the lowest level
  326. function ParseAtom(var flagp: integer): PRegExprChar;
  327. // current pos in r.e. - for error handling
  328. function GetCompilerErrorPos: PtrInt;
  329. {$IFDEF UseFirstCharSet} // ###0.929
  330. procedure FillFirstCharSet(prog: PRegExprChar);
  331. {$ENDIF}
  332. { ===================== Matching section =================== }
  333. // repeatedly match something simple, report how many
  334. function regrepeat(p: PRegExprChar; AMax: integer): integer;
  335. // dig the "next" pointer out of a node
  336. function regnext(p: PRegExprChar): PRegExprChar;
  337. // recursively matching routine
  338. function MatchPrim(prog: PRegExprChar): boolean;
  339. // match at specific position only, called from ExecPrim
  340. function MatchAtOnePos(APos: PRegExprChar): boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
  341. // Exec for stored InputString
  342. function ExecPrim(AOffset: integer; ATryOnce, ASlowChecks, ABackward: boolean): boolean;
  343. {$IFDEF RegExpPCodeDump}
  344. function DumpOp(op: TREOp): RegExprString;
  345. {$ENDIF}
  346. function GetSubExprCount: integer;
  347. function GetMatchPos(Idx: integer): PtrInt;
  348. function GetMatchLen(Idx: integer): PtrInt;
  349. function GetMatch(Idx: integer): RegExprString;
  350. procedure SetInputString(const AInputString: RegExprString);
  351. procedure SetLineSeparators(const AStr: RegExprString);
  352. procedure SetLinePairedSeparator(const AStr: RegExprString);
  353. function GetLinePairedSeparator: RegExprString;
  354. public
  355. constructor Create; overload;
  356. constructor Create(const AExpression: RegExprString); overload;
  357. destructor Destroy; override;
  358. class function VersionMajor: integer;
  359. class function VersionMinor: integer;
  360. // match a programm against a string AInputString
  361. // !!! Exec store AInputString into InputString property
  362. // For Delphi 5 and higher available overloaded versions - first without
  363. // parameter (uses already assigned to InputString property value)
  364. // and second that has int parameter and is same as ExecPos
  365. function Exec(const AInputString: RegExprString): boolean; overload;
  366. function Exec: boolean; overload;
  367. function Exec(AOffset: integer): boolean; overload;
  368. // find next match:
  369. // ExecNext;
  370. // works the same as
  371. // if MatchLen [0] = 0 then ExecPos (MatchPos [0] + 1)
  372. // else ExecPos (MatchPos [0] + MatchLen [0]);
  373. // but it is simpler!
  374. // Raises an exception if used without a preceding SUCCESSFUL call to
  375. // Exec* (Exec, ExecPos, ExecNext). So you must always use something like
  376. // if Exec (InputString) then repeat { process results } until not ExecNext;
  377. function ExecNext: boolean; overload;
  378. function ExecNext(ABackward: boolean): boolean; overload;
  379. // find match for InputString starting from AOffset position
  380. // (AOffset=1 - first char of InputString)
  381. function ExecPos(AOffset: integer = 1): boolean; overload;
  382. function ExecPos(AOffset: integer; ATryOnce: boolean): boolean; overload;
  383. function ExecPos(AOffset: integer; ATryOnce, ABackward: boolean): boolean; overload;
  384. // Returns ATemplate with '$&' or '$0' replaced by the whole r.e.
  385. // occurrence and '$1'...'$nn' replaced by the subexpression with the given index.
  386. // Symbol '$' is used instead of '\' (for future extensions
  387. // and for more Perl-compatibility) and accepts more than one digit.
  388. // If you want to place into template raw '$' or '\', use prefix '\'.
  389. // Example: '1\$ is $2\\rub\\' -> '1$ is <Match[2]>\rub\'
  390. // If you want to place any number after '$' you must enclose it
  391. // with curly braces: '${12}'.
  392. // Example: 'a$12bc' -> 'a<Match[12]>bc'
  393. // 'a${1}2bc' -> 'a<Match[1]>2bc'.
  394. function Substitute(const ATemplate: RegExprString): RegExprString;
  395. // Splits AInputStr into a list at the positions of all r.e. occurrences.
  396. // Internally calls Exec, ExecNext.
  397. procedure Split(const AInputStr: RegExprString; APieces: TStrings);
  398. function Replace(const AInputStr: RegExprString;
  399. const AReplaceStr: RegExprString;
  400. AUseSubstitution: boolean = False) // ###0.946
  401. : RegExprString; overload;
  402. function Replace(const AInputStr: RegExprString;
  403. AReplaceFunc: TRegExprReplaceFunction): RegExprString; overload;
  404. // Returns AInputStr with r.e. occurrences replaced by AReplaceStr.
  405. // If AUseSubstitution is true, then AReplaceStr will be used
  406. // as template for Substitution methods.
  407. // For example:
  408. // Expression := '({-i}block|var)\s*\(\s*([^ ]*)\s*\)\s*';
  409. // Replace ('BLOCK( test1)', 'def "$1" value "$2"', True);
  410. // will return: def 'BLOCK' value 'test1'
  411. // Replace ('BLOCK( test1)', 'def "$1" value "$2"')
  412. // will return: def "$1" value "$2"
  413. // Internally calls Exec, ExecNext.
  414. // Overloaded version and ReplaceEx operate with callback function,
  415. // so you can implement really complex functionality.
  416. function ReplaceEx(const AInputStr: RegExprString;
  417. AReplaceFunc: TRegExprReplaceFunction): RegExprString;
  418. // Returns ID of last error, 0 if no errors (unusable if
  419. // Error method raises exception) and clear internal status
  420. // into 0 (no errors).
  421. function LastError: integer;
  422. // Returns Error message for error with ID = AErrorID.
  423. function ErrorMsg(AErrorID: integer): RegExprString; virtual;
  424. // Converts Ch to upper case if it is in lower case, or to lower case
  425. // if it is in upper case (uses the current system locale settings)
  426. class function InvertCaseFunction(const Ch: REChar): REChar;
  427. // [Re]compile r.e. Useful for example for GUI r.e. editors (to check
  428. // all properties validity).
  429. procedure Compile; // ###0.941
  430. {$IFDEF RegExpPCodeDump}
  431. // dump a compiled regexp in vaguely comprehensible form
  432. function Dump: RegExprString;
  433. {$ENDIF}
  434. // Regular expression.
  435. // For optimization, TRegExpr automatically compiles it into 'P-code'
  436. // (you can see it with the help of the Dump method) and stores it in internal
  437. // structures. Real [re]compilation occurs only when it is really needed -
  438. // while calling Exec, ExecNext, Substitute, Dump, etc.
  439. // and only if Expression or another P-code-affecting property was changed
  440. // after the last [re]compilation.
  441. // If any error occurs during [re]compilation, the Error method is called
  442. // (by default Error raises an exception - see below)
  443. property Expression: RegExprString read fExpression write SetExpression;
  444. // Set/get default values of r.e.syntax modifiers. Modifiers in
  445. // r.e. (?ismx-ismx) will replace this default values.
  446. // If you try to set an unsupported modifier, Error will be called
  447. // (by default Error raises exception ERegExpr).
  448. property ModifierStr: RegExprString read GetModifierStr write SetModifierStr;
  449. property ModifierI: boolean read GetModifierI write SetModifierI;
  450. property ModifierR: boolean read GetModifierR write SetModifierR;
  451. property ModifierS: boolean read GetModifierS write SetModifierS;
  452. property ModifierG: boolean read GetModifierG write SetModifierG;
  453. property ModifierM: boolean read GetModifierM write SetModifierM;
  454. property ModifierX: boolean read GetModifierX write SetModifierX;
  455. // returns current input string (from last Exec call or last assign
  456. // to this property).
  457. // Any assignment to this property clear Match* properties !
  458. property InputString: RegExprString read fInputString write SetInputString;
  459. // Number of subexpressions has been found in last Exec* call.
  460. // If there are no subexpr. but whole expr was found (Exec* returned True),
  461. // then SubExprMatchCount=0, if no subexpressions nor whole
  462. // r.e. found (Exec* returned false) then SubExprMatchCount=-1.
  463. // Note that some subexpr. may not be found, and for such
  464. // subexpr. MatchPos=MatchLen=-1 and Match=''.
  465. // For example: Expression := '(1)?2(3)?';
  466. // Exec ('123'): SubExprMatchCount=2, Match[0]='123', [1]='1', [2]='3'
  467. // Exec ('12'): SubExprMatchCount=1, Match[0]='12', [1]='1'
  468. // Exec ('23'): SubExprMatchCount=2, Match[0]='23', [1]='', [2]='3'
  469. // Exec ('2'): SubExprMatchCount=0, Match[0]='2'
  470. // Exec ('7') - return False: SubExprMatchCount=-1
  471. property SubExprMatchCount: integer read GetSubExprCount;
  472. // pos of entrance subexpr. #Idx into tested in last Exec*
  473. // string. First subexpr. has Idx=1, last - MatchCount,
  474. // whole r.e. has Idx=0.
  475. // Returns -1 if in r.e. no such subexpr. or this subexpr.
  476. // not found in input string.
  477. property MatchPos[Idx: integer]: PtrInt read GetMatchPos;
  478. // len of entrance subexpr. #Idx r.e. into tested in last Exec*
  479. // string. First subexpr. has Idx=1, last - MatchCount,
  480. // whole r.e. has Idx=0.
  481. // Returns -1 if in r.e. no such subexpr. or this subexpr.
  482. // not found in input string.
  483. // Remember - MatchLen may be 0 (if r.e. match empty string) !
  484. property MatchLen[Idx: integer]: PtrInt read GetMatchLen;
  485. // == copy (InputString, MatchPos [Idx], MatchLen [Idx])
  486. // Returns '' if in r.e. no such subexpr. or this subexpr.
  487. // not found in input string.
  488. property Match[Idx: integer]: RegExprString read GetMatch;
  489. // Returns position in r.e. where compiler stopped.
  490. // Useful for error diagnostics
  491. property CompilerErrorPos: PtrInt read GetCompilerErrorPos;
  492. // Contains chars, treated as /s (initially filled with RegExprSpaceChars
  493. // global constant)
  494. property SpaceChars: RegExprString read fSpaceChars write fSpaceChars;
  495. // ###0.927
  496. // Contains chars, treated as /w (initially filled with RegExprWordChars
  497. // global constant)
  498. property WordChars: RegExprString read fWordChars write fWordChars;
  499. // ###0.929
  500. {$IFDEF UnicodeWordDetection}
  501. // If set to true, in addition to using WordChars, a heuristic to detect unicode word letters is used for \w
  502. property UseUnicodeWordDetection: boolean read FUseUnicodeWordDetection write FUseUnicodeWordDetection;
  503. {$ENDIF}
  504. // line separators (like \n in Unix)
  505. property LineSeparators: RegExprString read fLineSeparators write SetLineSeparators; // ###0.941
  506. // paired line separator (like \r\n in DOS and Windows).
  507. // must contain exactly two chars or no chars at all
  508. property LinePairedSeparator: RegExprString read GetLinePairedSeparator write SetLinePairedSeparator; // ###0.941
  509. // Set this property if you want to override case-insensitive functionality.
  510. // Create set it to RegExprInvertCaseFunction (InvertCaseFunction by default)
  511. property InvertCase: TRegExprInvertCaseFunction read fInvertCase write fInvertCase; // ##0.935
  512. // Use OS line end on replace or not. Default is True for backwards compatibility.
  513. // Set to false to use #10.
  514. property UseOsLineEndOnReplace: boolean read FUseOsLineEndOnReplace write SetUseOsLineEndOnReplace;
  515. property SlowChecksSizeMax: integer read fSlowChecksSizeMax write fSlowChecksSizeMax;
  516. // Raise error when input string is empty
  517. Property EmptyInputRaisesError : Boolean Read FEmptyInputRaisesError Write FEmptyInputRaisesError;
  518. end;
  type
    // Exception class used for regular-expression errors. According to the
    // comment on TRegExpr.Error, the default handler raises it with
    // Message = ErrorMsg(AErrorID) and fills the two fields below.
    ERegExpr = class(Exception)
    public
      // ID of the error (the AErrorID passed to TRegExpr.Error).
      ErrorCode: integer;
      // Value of the TRegExpr.CompilerErrorPos property at the time of the
      // error, i.e. the position in the r.e. where the compiler stopped.
      CompilerErrorPos: PtrInt;
    end;
  const
    // Global default for the TRegExpr.InvertCase callback; initialised to nil
    // here. NOTE(review): the InvertCase property comment says Create copies
    // this value into each new instance - confirm against the constructor.
    RegExprInvertCaseFunction: TRegExprInvertCaseFunction = nil;
  527. // True if the string AInputStr matches the regular expression ARegExpr.
  528. // ! Raises an exception if there are syntax errors in ARegExpr.
  529. function ExecRegExpr(const ARegExpr, AInputStr: RegExprString): boolean;
  530. // Split AInputStr into APieces at the occurrences of r.e. ARegExpr
  531. procedure SplitRegExpr(const ARegExpr, AInputStr: RegExprString;
  532. APieces: TStrings);
  533. // Returns AInputStr with r.e. occurrences replaced by AReplaceStr
  534. // If AUseSubstitution is true, then AReplaceStr will be used
  535. // as template for Substitution methods.
  536. // For example:
  537. // ReplaceRegExpr ('({-i}block|var)\s*\(\s*([^ ]*)\s*\)\s*',
  538. // 'BLOCK( test1)', 'def "$1" value "$2"', True)
  539. // will return: def 'BLOCK' value 'test1'
  540. // ReplaceRegExpr ('({-i}block|var)\s*\(\s*([^ ]*)\s*\)\s*',
  541. // 'BLOCK( test1)', 'def "$1" value "$2"')
  542. // will return: def "$1" value "$2"
  543. function ReplaceRegExpr(const ARegExpr, AInputStr, AReplaceStr: RegExprString;
  544. AUseSubstitution: boolean = False): RegExprString; overload; // ###0.947
  545. // Alternate form allowing to set more parameters.
  type
    // Flags accepted by the Options overload of ReplaceRegExpr. Each
    // rroModifierX flag switches the TRegExpr modifier property of the same
    // name on when present in the set and off when absent.
    TRegexReplaceOption = (
      rroModifierI, // sets ModifierI
      rroModifierR, // sets ModifierR
      rroModifierS, // sets ModifierS
      rroModifierG, // sets ModifierG
      rroModifierM, // sets ModifierM
      rroModifierX, // sets ModifierX
      rroUseSubstitution, // passed as AUseSubstitution to TRegExpr.Replace
      rroUseOsLineEnd // sets UseOsLineEndOnReplace
    );
    // Set of replace options; the empty set turns every flag above off.
    TRegexReplaceOptions = set of TRegexReplaceOption;
  558. function ReplaceRegExpr(const ARegExpr, AInputStr, AReplaceStr: RegExprString;
  559. Options: TRegexReplaceOptions): RegExprString; overload;
  560. // Replace all metachars with its safe representation,
  561. // for example 'abc$cd.(' converts into 'abc\$cd\.\('
  562. // This function useful for r.e. autogeneration from
  563. // user input
  564. function QuoteRegExprMetaChars(const AStr: RegExprString): RegExprString;
  565. // Makes list of subexpressions found in ARegExpr r.e.
  566. // In ASubExps every item represent subexpression,
  567. // from first to last, in format:
  568. // String - subexpression text (without '()')
  569. // low word of Object - starting position in ARegExpr, including '('
  570. // if exists! (first position is 1)
  571. // high word of Object - length, including starting '(' and ending ')'
  572. // if exist!
  573. // AExtendedSyntax - must be True if modifier /x will be On while
  574. // using the r.e.
  575. // Useful for GUI editors of r.e. etc (You can find example of using
  576. // in TestRExp.dpr project)
  577. // Returns
  578. // 0 Success. No unbalanced brackets was found;
  579. // -1 There are not enough closing brackets ')';
  580. // -(n+1) At position n was found opening '[' without //###0.942
  581. // corresponding closing ']';
  582. // n At position n was found closing bracket ')' without
  583. // corresponding opening '('.
  584. // If Result <> 0, then ASubExprs can contain empty items or illegal ones
  585. function RegExprSubExpressions(const ARegExpr: string; ASubExprs: TStrings;
  586. AExtendedSyntax: boolean= False): integer;
  587. implementation
  588. {$IFDEF UnicodeWordDetection}
  589. uses
  590. UnicodeData;
  591. {$ENDIF}
  const
    // TRegExpr.VersionMajor/Minor return values of these constants:
    REVersionMajor = 0;
    REVersionMinor = 987;

    // Small REChar tag values.
    // NOTE(review): presumably these mark the item kinds inside a compiled
    // character-class buffer (see FindInCharClass) - confirm at the use sites.
    OpKind_End = REChar(1);
    OpKind_MetaClass = REChar(2);
    OpKind_Range = REChar(3);
    OpKind_Char = REChar(4);

    // Sets of character codes (ordinals 0..255).
    RegExprAllSet = [0 .. 255]; // every byte value
    RegExprDigitSet = [Ord('0') .. Ord('9')]; // ASCII digits
    RegExprLowerAzSet = [Ord('a') .. Ord('z')]; // ASCII lower-case letters
    RegExprUpperAzSet = [Ord('A') .. Ord('Z')]; // ASCII upper-case letters
    RegExprAllAzSet = RegExprLowerAzSet + RegExprUpperAzSet; // both cases
    // CR, LF, VT, FF, plus $85 in UniCode builds.
    RegExprLineSeparatorsSet = [$d, $a, $b, $c] {$IFDEF UniCode} + [$85] {$ENDIF};
    // Tab, space and $A0.
    RegExprHorzSeparatorsSet = [9, $20, $A0];

    MaxBracesArg = $7FFFFFFF - 1; // max value for {n,m} arguments //###0.933
  type
    // Internal Next "pointer": stored as an offset relative to the current
    // p-code rather than as a real address. //###0.933
    TRENextOff = PtrInt;
    // Used for extracting Next "pointers" from the compiled r.e. //###0.933
    PRENextOff = ^TRENextOff;
    // Type of the {m,n} arguments.
    TREBracesArg = integer;
    // Pointer to a {m,n} argument.
    PREBracesArg = ^TREBracesArg;
  const
    // All sizes below are expressed in REChar units, not bytes.

    // size of OP_ command in REChars
    REOpSz = SizeOf(TREOp) div SizeOf(REChar);
    {$IFDEF FPC_REQUIRES_PROPER_ALIGNMENT}
    // Targets that require aligned access: reserve roughly twice the raw
    // size so the stored value can be aligned (see AlignToPtr / AlignToInt).
    // add space for aligning pointer
    // -1 is the correct max size but also needed for InsertOperator that needs a multiple of pointer size
    RENextOffSz = (2 * SizeOf(TRENextOff) div SizeOf(REChar)) - 1;
    // add space for aligning pointer
    REBracesArgSz = (2 * SizeOf(TREBracesArg) div SizeOf(REChar));
    {$ELSE}
    // size of Next pointer in REChars
    RENextOffSz = (SizeOf(TRENextOff) div SizeOf(REChar));
    // size of BRACES arguments in REChars
    REBracesArgSz = SizeOf(TREBracesArg) div SizeOf(REChar);
    {$ENDIF}
    // size of a LongInt value in REChars
    RENumberSz = SizeOf(LongInt) div SizeOf(REChar);
  function _FindCharInBuffer(SBegin, SEnd: PRegExprChar; Ch: REChar): PRegExprChar; {$IFDEF InlineFuncs}inline;{$ENDIF}
  // Linear search for Ch in the half-open range [SBegin, SEnd).
  // Returns a pointer to the first char equal to Ch, or nil when the range
  // is empty or contains no such char.
  var
    Cursor: PRegExprChar;
  begin
    Result := nil;
    Cursor := SBegin;
    while Cursor < SEnd do
    begin
      if Cursor^ = Ch then
      begin
        Result := Cursor; // first hit wins
        Break;
      end;
      Inc(Cursor);
    end;
  end;
  function IsIgnoredChar(AChar: REChar): boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
  // True when AChar is a space, a tab (#9), CR (#$d) or LF (#$a);
  // False for every other char.
  begin
    Result := (AChar = ' ') or (AChar = #9) or (AChar = #$d) or (AChar = #$a);
  end;
  function _IsMetaChar(AChar: REChar): boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
  // True when AChar is one of the letters d, s, w, v, h in either case;
  // False otherwise.
  begin
    Result := False; // default: not one of the listed letters
    case AChar of
      'd', 's', 'w', 'v', 'h',
      'D', 'S', 'W', 'V', 'H':
        Result := True;
    end;
  end;
  function AlignToPtr(const p: Pointer): Pointer; {$IFDEF InlineFuncs}inline;{$ENDIF}
  // Returns p aligned to SizeOf(Pointer) when FPC_REQUIRES_PROPER_ALIGNMENT
  // is defined; otherwise returns p unchanged.
  begin
    Result := p;
    {$IFDEF FPC_REQUIRES_PROPER_ALIGNMENT}
    Result := Align(Result, SizeOf(Pointer));
    {$ENDIF}
  end;
  function AlignToInt(const p: Pointer): Pointer; {$IFDEF InlineFuncs}inline;{$ENDIF}
  // Returns p aligned to SizeOf(integer) when FPC_REQUIRES_PROPER_ALIGNMENT
  // is defined; otherwise returns p unchanged.
  begin
    Result := p;
    {$IFDEF FPC_REQUIRES_PROPER_ALIGNMENT}
    Result := Align(Result, SizeOf(integer));
    {$ENDIF}
  end;
  function _UpperCase(Ch: REChar): REChar;
  // Upper-cases a single char.
  //  - 'a'..'z' are converted arithmetically (ordinal minus 32);
  //  - any other char with ordinal below 128 is returned unchanged;
  //  - chars from 128 up are delegated to the RTL routine selected by the
  //    FPC / UniCode / D2009 defines (returned unchanged when none applies).
  begin
    if (Ch >= 'a') and (Ch <= 'z') then
      Result := REChar(Ord(Ch) - 32)
    else
    begin
      Result := Ch;
      if Ord(Ch) >= 128 then
      begin
        {$IFDEF FPC}
          {$IFDEF UniCode}
        Result := UnicodeUpperCase(Ch)[1];
          {$ELSE}
        Result := AnsiUpperCase(Ch)[1];
          {$ENDIF}
        {$ELSE}
          {$IFDEF UniCode}
            {$IFDEF D2009}
        Result := TCharacter.ToUpper(Ch);
            {$ENDIF}
          {$ELSE}
        Result := AnsiUpperCase(Ch)[1];
          {$ENDIF}
        {$ENDIF}
      end;
    end;
  end;
  function _LowerCase(Ch: REChar): REChar;
  // Lower-cases a single char.
  //  - 'A'..'Z' are converted arithmetically (ordinal plus 32);
  //  - any other char with ordinal below 128 is returned unchanged;
  //  - chars from 128 up are delegated to the RTL routine selected by the
  //    FPC / UniCode / D2009 defines (returned unchanged when none applies).
  begin
    if (Ch >= 'A') and (Ch <= 'Z') then
      Result := REChar(Ord(Ch) + 32)
    else
    begin
      Result := Ch;
      if Ord(Ch) >= 128 then
      begin
        {$IFDEF FPC}
          {$IFDEF UniCode}
        Result := UnicodeLowerCase(Ch)[1];
          {$ELSE}
        Result := AnsiLowerCase(Ch)[1];
          {$ENDIF}
        {$ELSE}
          {$IFDEF UniCode}
            {$IFDEF D2009}
        Result := TCharacter.ToLower(Ch);
            {$ENDIF}
          {$ELSE}
        Result := AnsiLowerCase(Ch)[1];
          {$ENDIF}
        {$ENDIF}
      end;
    end;
  end;
  734. { ============================================================= }
  735. { ===================== Global functions ====================== }
  736. { ============================================================= }
  function IsModifiersEqual(const A, B: TRegExprModifiers): boolean;
  // True only when all six modifier flags (I, G, M, S, R, X) have the same
  // value in A and in B.
  begin
    Result := not (
      (A.I <> B.I) or
      (A.G <> B.G) or
      (A.M <> B.M) or
      (A.S <> B.S) or
      (A.R <> B.R) or
      (A.X <> B.X)
    );
  end;
  function ParseModifiers(const APtr: PRegExprChar;
    ALen: integer;
    var AValue: TRegExprModifiers): boolean;
  // Parse ALen chars starting at APtr as a modifier string in the format
  // 'ismxrg-ismxrg' and apply them to AValue.
  //
  // Letters (either case) seen before a '-' switch the matching modifier on;
  // letters seen after a '-' switch it off. Modifiers that are not mentioned
  // keep the value they had in AValue on entry.
  //
  // Returns True when every char was recognised (an empty string is valid and
  // changes nothing). Returns False as soon as an unsupported char is met; in
  // that case AValue is left exactly as it was on entry, so a rejected string
  // can never leave the caller with half-applied modifiers.
  var
    Parsed: TRegExprModifiers;
    IsOn: boolean;
    i: integer;
  begin
    Result := False;
    Parsed := AValue; // work on a copy, commit only on success
    IsOn := True;
    for i := 0 to ALen - 1 do
      case APtr[i] of
        '-':
          IsOn := False;
        'I', 'i':
          Parsed.I := IsOn;
        'R', 'r':
          Parsed.R := IsOn;
        'S', 's':
          Parsed.S := IsOn;
        'G', 'g':
          Parsed.G := IsOn;
        'M', 'm':
          Parsed.M := IsOn;
        'X', 'x':
          Parsed.X := IsOn;
      else
        Exit; // unsupported char: report failure, AValue untouched
      end;
    AValue := Parsed;
    Result := True;
  end;
  function ExecRegExpr(const ARegExpr, AInputStr: RegExprString): boolean;
  // One-shot helper: creates a temporary TRegExpr, assigns ARegExpr to its
  // Expression and returns the result of Exec(AInputStr).
  // The temporary object is freed even if an exception is raised.
  begin
    with TRegExpr.Create do
      try
        Expression := ARegExpr;
        Result := Exec(AInputStr);
      finally
        Free;
      end;
  end; { of function ExecRegExpr
    -------------------------------------------------------------- }
  procedure SplitRegExpr(const ARegExpr, AInputStr: RegExprString;
    APieces: TStrings);
  // One-shot helper: clears APieces, then lets a temporary TRegExpr with
  // Expression = ARegExpr split AInputStr into it.
  // The temporary object is freed even if an exception is raised.
  begin
    APieces.Clear; // emptied before the regex object is even created
    with TRegExpr.Create do
      try
        Expression := ARegExpr;
        Split(AInputStr, APieces);
      finally
        Free;
      end;
  end; { of procedure SplitRegExpr
    -------------------------------------------------------------- }
  function ReplaceRegExpr(const ARegExpr, AInputStr, AReplaceStr: RegExprString;
    AUseSubstitution: boolean= False): RegExprString;
  // One-shot helper: creates a temporary TRegExpr with Expression = ARegExpr
  // and returns Replace(AInputStr, AReplaceStr, AUseSubstitution).
  // The temporary object is freed even if an exception is raised.
  var
    Engine: TRegExpr;
  begin
    Engine := TRegExpr.Create;
    try
      Engine.Expression := ARegExpr;
      Result := Engine.Replace(AInputStr, AReplaceStr, AUseSubstitution);
    finally
      Engine.Free;
    end;
  end; { of function ReplaceRegExpr
    -------------------------------------------------------------- }
  function ReplaceRegExpr(const ARegExpr, AInputStr, AReplaceStr: RegExprString;
    Options: TRegexReplaceOptions): RegExprString; overload;
  // Variant of ReplaceRegExpr configured through a set of options.
  // Each rroModifierX flag sets the TRegExpr modifier of the same name,
  // rroUseOsLineEnd sets UseOsLineEndOnReplace, and rroUseSubstitution is
  // passed to Replace as its AUseSubstitution argument.
  // The temporary object is freed even if an exception is raised.
  var
    Engine: TRegExpr;
  begin
    Engine := TRegExpr.Create;
    try
      Engine.ModifierI := rroModifierI in Options;
      Engine.ModifierR := rroModifierR in Options;
      Engine.ModifierS := rroModifierS in Options;
      Engine.ModifierG := rroModifierG in Options;
      Engine.ModifierM := rroModifierM in Options;
      Engine.ModifierX := rroModifierX in Options;
      // Set this after the above, if the regex contains modifiers, they will be applied.
      Engine.Expression := ARegExpr;
      Engine.UseOsLineEndOnReplace := rroUseOsLineEnd in Options;
      Result := Engine.Replace(AInputStr, AReplaceStr,
        rroUseSubstitution in Options);
    finally
      Engine.Free;
    end;
  end;
  839. (*
  840. const
  841. MetaChars_Init = '^$.[()|?+*' + EscChar + '{';
  842. MetaChars = MetaChars_Init; // not needed to be a variable, const is faster
  843. MetaAll = MetaChars_Init + ']}'; // Very similar to MetaChars, but slighly changed.
  844. *)
  function _IsMetaSymbol1(ch: REChar): boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
  // True when ch is one of: ^ $ . [ ( ) | ? + * { or EscChar.
  begin
    Result := False; // default: ordinary char
    case ch of
      '^', '$', '.', '|', '?', '+', '*',
      '(', ')', '[', '{',
      EscChar:
        Result := True;
    end;
  end;
  function _IsMetaSymbol2(ch: REChar): boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
  // True when ch is one of: ^ $ . [ ( ) | ? + * { ] } or EscChar.
  begin
    Result :=
      (ch = '^') or (ch = '$') or (ch = '.') or
      (ch = '[') or (ch = ']') or
      (ch = '(') or (ch = ')') or
      (ch = '{') or (ch = '}') or
      (ch = '|') or (ch = '?') or (ch = '+') or (ch = '*') or
      (ch = EscChar);
  end;
  function QuoteRegExprMetaChars(const AStr: RegExprString): RegExprString;
  // Returns a copy of AStr in which every char accepted by _IsMetaSymbol2 is
  // prefixed with EscChar. An empty AStr yields an empty result.
  var
    Idx, RunStart: integer;
  begin
    Result := '';
    RunStart := 1; // start of the pending run of ordinary chars
    for Idx := 1 to Length(AStr) do
      if _IsMetaSymbol2(AStr[Idx]) then
      begin
        // flush the ordinary run, then emit the escaped metachar
        Result := Result + System.Copy(AStr, RunStart, Idx - RunStart)
          + EscChar + AStr[Idx];
        RunStart := Idx + 1;
      end;
    Result := Result + System.Copy(AStr, RunStart, MaxInt); // Tail
  end; { of function QuoteRegExprMetaChars
    -------------------------------------------------------------- }
  function RegExprSubExpressions(const ARegExpr: string; ASubExprs: TStrings;
    AExtendedSyntax: boolean = False): integer;
  // Fills ASubExprs (cleared first) with the subexpressions found in ARegExpr.
  //
  // Every item holds the subexpression text without its brackets; the item's
  // Object packs the 1-based start position (low 16 bits, including '(') and
  // the length (next 16 bits, including both brackets). Item 0 is always the
  // whole r.e.: it is inserted at the end unless the expression is already
  // one bracketed group spanning the entire string.
  //
  // '(?...)' groups are not subexpressions; when their content parses as a
  // modifier string, the /x state carried in AExtendedSyntax is updated.
  // Character classes '[...]' are skipped as a unit, escaped chars are
  // skipped, and '#' comments are skipped while extended syntax is on.
  //
  // Returns
  //   0       no unbalanced brackets were found;
  //   -1      not enough closing brackets ')';
  //   -(n+1)  opening '[' at position n has no matching ']';
  //   n       closing ')' at position n has no matching '('.
  // When several problems exist, the value reflects the last one detected.
  type
    TStackItemRec = record // ###0.945
      SubExprIdx: integer;
      StartPos: PtrInt;
    end;

    TStackArray = packed array [0 .. NSUBEXPMAX - 1] of TStackItemRec;
  var
    Len, SubExprLen: integer;
    i, i0: integer;
    Modif: TRegExprModifiers;
    Stack: ^TStackArray; // ###0.945
    StackIdx, StackSz: integer;
  begin
    Result := 0; // no unbalanced brackets found at this very moment
    Modif := Default(TRegExprModifiers);
    ASubExprs.Clear; // I don't think that adding to non empty list
    // can be useful, so I simplified algorithm to work only with empty list

    Len := Length(ARegExpr); // some optimization tricks

    // first we have to calculate number of subexpression to reserve
    // space in Stack array (may be we'll reserve more than needed, but
    // it's faster then memory reallocation during parsing)
    StackSz := 1; // add 1 for entire r.e.
    for i := 1 to Len do
      if ARegExpr[i] = '(' then
        Inc(StackSz);

    GetMem(Stack, SizeOf(TStackItemRec) * StackSz);
    try
      StackIdx := 0;
      i := 1;
      while (i <= Len) do
      begin
        case ARegExpr[i] of
          '(':
            begin
              if (i < Len) and (ARegExpr[i + 1] = '?') then
              begin
                // this is not subexpression, but comment or other
                // Perl extension. We must check is it (?ismxrg-ismxrg)
                // and change AExtendedSyntax if /x is changed.
                Inc(i, 2); // skip '(?'
                i0 := i;
                while (i <= Len) and (ARegExpr[i] <> ')') do
                  Inc(i);
                if i > Len then
                  Result := -1 // unbalanced '('
                else
                if ParseModifiers(@ARegExpr[i0], i - i0, Modif) then
                  // Alexey-T: original code had copy from i, not from i0
                  AExtendedSyntax := Modif.X;
              end
              else
              begin // subexpression starts
                ASubExprs.Add(''); // just reserve space
                with Stack[StackIdx] do
                begin
                  SubExprIdx := ASubExprs.Count - 1;
                  StartPos := i;
                end;
                Inc(StackIdx);
              end;
            end;
          ')':
            begin
              if StackIdx = 0 then
                Result := i // unbalanced ')'
              else
              begin
                Dec(StackIdx);
                with Stack[StackIdx] do
                begin
                  SubExprLen := i - StartPos + 1;
                  ASubExprs.Objects[SubExprIdx] :=
                    TObject(StartPos or (SubExprLen ShL 16));
                  ASubExprs[SubExprIdx] := System.Copy(ARegExpr, StartPos + 1,
                    SubExprLen - 2); // add without brackets
                end;
              end;
            end;
          EscChar:
            Inc(i); // skip quoted symbol
          '[':
            begin
              // we have to skip character ranges at once, because they can
              // contain '#', and '#' in it must NOT be recognized as eXtended
              // comment beginning!
              i0 := i;
              Inc(i);
              // first ']' inside [] treated as simple char, no need to check '['.
              // The bounds check matters when '[' is the last char of the
              // pattern: without it ARegExpr[Len + 1] would be read.
              if (i <= Len) and (ARegExpr[i] = ']') then
                Inc(i);
              while (i <= Len) and (ARegExpr[i] <> ']') do
                if ARegExpr[i] = EscChar // ###0.942
                then
                  Inc(i, 2) // skip 'escaped' char to prevent stopping at '\]'
                else
                  Inc(i);
              if (i > Len) or (ARegExpr[i] <> ']') // ###0.942
              then
                Result := -(i0 + 1); // unbalanced '[' //###0.942
            end;
          '#':
            if AExtendedSyntax then
            begin
              // skip eXtended comments
              while (i <= Len) and (ARegExpr[i] <> #$d) and (ARegExpr[i] <> #$a)
              // do not use [#$d, #$a] due to UniCode compatibility
              do
                Inc(i);
              while (i + 1 <= Len) and
                ((ARegExpr[i + 1] = #$d) or (ARegExpr[i + 1] = #$a)) do
                Inc(i); // attempt to work with different kinds of line separators
              // now we are at the line separator that must be skipped.
            end;
          // here is no 'else' clause - we simply skip ordinary chars
        end; // of case
        Inc(i); // skip scanned char
        // ! can move after Len due to skipping quoted symbol
      end;

      // check brackets balance
      if StackIdx <> 0 then
        Result := -1; // unbalanced '('

      // check if entire r.e. added
      if (ASubExprs.Count = 0) or ((PtrInt(ASubExprs.Objects[0]) and $FFFF) <> 1)
        or (((PtrInt(ASubExprs.Objects[0]) ShR 16) and $FFFF) <> Len)
      // whole r.e. wasn't added because it isn't bracketed
      // well, we add it now:
      then
        ASubExprs.InsertObject(0, ARegExpr, TObject((Len ShL 16) or 1));
    finally
      FreeMem(Stack);
    end;
  end; { of function RegExprSubExpressions
    -------------------------------------------------------------- }
  const
    OP_MAGIC = TREOp(216); // program signature
    // Legend for the comments below:
    // name opcode opnd? meaning
    OP_EEND = TREOp(0); // - End of program
    OP_BOL = TREOp(1); // - Match "" at beginning of line
    OP_EOL = TREOp(2); // - Match "" at end of line
    OP_ANY = TREOp(3); // - Match any one character
    OP_ANYOF = TREOp(4); // Str Match any character in string Str
    OP_ANYBUT = TREOp(5); // Str Match any char. not in string Str
    OP_BRANCH = TREOp(6); // Node Match this alternative, or the next
    OP_BACK = TREOp(7); // - Jump backward (Next < 0)
    OP_EXACTLY = TREOp(8); // Str Match string Str
    OP_NOTHING = TREOp(9); // - Match empty string
    OP_STAR = TREOp(10); // Node Match this (simple) thing 0 or more times
    OP_PLUS = TREOp(11); // Node Match this (simple) thing 1 or more times
    OP_ANYDIGIT = TREOp(12); // - Match any digit (equiv [0-9])
    OP_NOTDIGIT = TREOp(13); // - Match any non-digit (equiv [^0-9])
    OP_ANYLETTER = TREOp(14); // - Match any letter from property WordChars
    OP_NOTLETTER = TREOp(15); // - Match not letter from property WordChars
    OP_ANYSPACE = TREOp(16); // - Match any space char (see property SpaceChars)
    OP_NOTSPACE = TREOp(17); // - Match not space char (see property SpaceChars)
    OP_BRACES = TREOp(18);
    // Node,Min,Max Match this (simple) thing from Min to Max times.
    // Min and Max are TREBracesArg
    OP_COMMENT = TREOp(19); // - Comment ;)
    OP_EXACTLYCI = TREOp(20); // Str Match string Str case insensitive
    OP_ANYOFCI = TREOp(21);
    // Str Match any character in string Str, case insensitive
    OP_ANYBUTCI = TREOp(22);
    // Str Match any char. not in string Str, case insensitive
    OP_LOOPENTRY = TREOp(23); // Node Start of loop (Node - LOOP for this loop)
    OP_LOOP = TREOp(24); // Node,Min,Max,LoopEntryJmp - back jump for LOOPENTRY.
    // Min and Max are TREBracesArg
    // Node - next node in sequence,
    // LoopEntryJmp - associated LOOPENTRY node addr
    // NOTE(review): values 25..27 are not assigned to any opcode in this table.
    OP_BSUBEXP = TREOp(28);
    // Idx Match previously matched subexpression #Idx (stored as REChar) //###0.936
    OP_BSUBEXPCI = TREOp(29); // Idx -"- in case-insensitive mode
    // Non-Greedy Style Ops //###0.940
    OP_STARNG = TREOp(30); // Same as OP_STAR but in non-greedy mode
    OP_PLUSNG = TREOp(31); // Same as OP_PLUS but in non-greedy mode
    OP_BRACESNG = TREOp(32); // Same as OP_BRACES but in non-greedy mode
    OP_LOOPNG = TREOp(33); // Same as OP_LOOP but in non-greedy mode
    // Multiline-mode variants of BOL / EOL / ANY
    OP_BOLML = TREOp(34); // - Match "" at beginning of line
    OP_EOLML = TREOp(35); // - Match "" at end of line
    OP_ANYML = TREOp(36); // - Match any one character
    // Word boundary
    OP_BOUND = TREOp(37); // Match "" between words //###0.943
    OP_NOTBOUND = TREOp(38); // Match "" not between words //###0.943
    OP_ANYHORZSEP = TREOp(39); // Any horizontal whitespace \h
    OP_NOTHORZSEP = TREOp(40); // Not horizontal whitespace \H
    OP_ANYVERTSEP = TREOp(41); // Any vertical whitespace \v
    OP_NOTVERTSEP = TREOp(42); // Not vertical whitespace \V
    // !!! Change OP_OPEN value if you add new opcodes !!!
    OP_OPEN = TREOp(43); // - Mark this point in input as start of \n
    // OP_OPEN + 1 is \1, etc.
    OP_CLOSE = TREOp(Ord(OP_OPEN) + NSUBEXP);
    // - Analogous to OP_OPEN.
    // !!! Don't add new OpCodes after CLOSE !!!

    // We work with p-code through pointers, compatible with PRegExprChar.
    // Note: all code components (TRENextOff, TREOp, TREBracesArg, etc)
    // must have lengths that can be divided by SizeOf (REChar) !
    // A node is a TREOp opcode followed by a Next "pointer" of TRENextOff type.
    // The Next is an offset from the opcode of the node containing it.
    // An operand, if any, simply follows the node. (Note that much of
    // the code generation knows about this implicit relationship!)
    // Using TRENextOff=PtrInt speeds up p-code processing.
    // Opcodes description:
    //
    // BRANCH The set of branches constituting a single choice are hooked
    // together with their "next" pointers, since precedence prevents
    // anything being concatenated to any individual branch. The
    // "next" pointer of the last BRANCH in a choice points to the
    // thing following the whole choice. This is also where the
    // final "next" pointer of each individual branch points; each
    // branch starts with the operand node of a BRANCH node.
    // BACK Normal "next" pointers all implicitly point forward; BACK
    // exists to make loop structures possible.
    // STAR,PLUS,BRACES '?', and complex '*' and '+', are implemented as
    // circular BRANCH structures using BACK. Complex '{min,max}'
    // - as pair LOOPENTRY-LOOP (see below). Simple cases (one
    // character per match) are implemented with STAR, PLUS and
    // BRACES for speed and to minimize recursive plunges.
    // LOOPENTRY,LOOP {min,max} are implemented as the special pair
    // LOOPENTRY-LOOP. Each LOOPENTRY initializes the loop stack for
    // the current level.
    // OPEN,CLOSE are numbered at compile time.
  1110. { ============================================================= }
  1111. { ================== Error handling section =================== }
  1112. { ============================================================= }
  // Error identifiers used by this unit; TRegExpr.ErrorMsg maps them to text.
  const
    reeOk = 0;
    // Codes below 1000: their ErrorMsg texts are prefixed 'TRegExpr compile:'
    reeCompNullArgument = 100;
    reeCompParseRegTooManyBrackets = 102;
    reeCompParseRegUnmatchedBrackets = 103;
    reeCompParseRegUnmatchedBrackets2 = 104;
    reeCompParseRegJunkOnEnd = 105;
    reePlusStarOperandCouldBeEmpty = 106;
    reeNestedSQP = 107;
    reeBadHexDigit = 108;
    reeInvalidRange = 109;
    reeParseAtomTrailingBackSlash = 110;
    reeNoHexCodeAfterBSlashX = 111;
    reeHexCodeAfterBSlashXTooBig = 112;
    reeUnmatchedSqBrackets = 113;
    reeInternalUrp = 114;
    reeQPSBFollowsNothing = 115;
    reeTrailingBackSlash = 116;
    reeNoLetterAfterBSlashC = 117;
    reeMetaCharAfterMinusInRange = 118;
    // NOTE(review): "Rarse" looks like a typo for "Parse"; the identifier is
    // kept as-is because other code refers to it by this name.
    reeRarseAtomInternalDisaster = 119;
    reeIncorrectBraces = 121;
    reeBRACESArgTooBig = 122;
    reeUnknownOpcodeInFillFirst = 123;
    reeBracesMinParamGreaterMax = 124;
    reeUnclosedComment = 125;
    reeComplexBracesNotImplemented = 126;
    reeUnrecognizedModifier = 127;
    reeBadLinePairedSeparator = 128;
    // Runtime errors must be >= 1000
    reeRegRepeatCalledInappropriately = 1000;
    reeMatchPrimMemoryCorruption = 1001;
    reeMatchPrimCorruptedPointers = 1002;
    reeNoExpression = 1003;
    reeCorruptedProgram = 1004;
    reeNoInputStringSpecified = 1005;
    reeOffsetMustBePositive = 1006;
    reeExecNextWithoutExec = 1007;
    reeBadOpcodeInCharClass = 1008;
    reeDumpCorruptedOpcode = 1011;
    reeModifierUnsupported = 1013;
    reeLoopStackExceeded = 1014;
    reeLoopWithoutEntry = 1015;
  function TRegExpr.ErrorMsg(AErrorID: integer): RegExprString;
  // Returns the human-readable text for the error code AErrorID (one of the
  // ree* constants). Codes without a dedicated text yield 'Unknown error'.
  begin
    case AErrorID of
      reeOk:
        Result := 'No errors';
      reeCompNullArgument:
        Result := 'TRegExpr compile: null argument';
      reeCompParseRegTooManyBrackets:
        Result := 'TRegExpr compile: ParseReg: too many ()';
      reeCompParseRegUnmatchedBrackets:
        Result := 'TRegExpr compile: ParseReg: unmatched ()';
      reeCompParseRegUnmatchedBrackets2:
        Result := 'TRegExpr compile: ParseReg: unmatched ()';
      reeCompParseRegJunkOnEnd:
        Result := 'TRegExpr compile: ParseReg: junk at end';
      reePlusStarOperandCouldBeEmpty:
        Result := 'TRegExpr compile: *+ operand could be empty';
      reeNestedSQP:
        Result := 'TRegExpr compile: nested *?+';
      reeBadHexDigit:
        Result := 'TRegExpr compile: bad hex digit';
      reeInvalidRange:
        Result := 'TRegExpr compile: invalid [] range';
      reeParseAtomTrailingBackSlash:
        Result := 'TRegExpr compile: parse atom trailing \';
      reeNoHexCodeAfterBSlashX:
        Result := 'TRegExpr compile: no hex code after \x';
      reeNoLetterAfterBSlashC:
        Result := 'TRegExpr compile: no letter "A".."Z" after \c';
      reeMetaCharAfterMinusInRange:
        Result := 'TRegExpr compile: metachar after "-" in [] range';
      reeHexCodeAfterBSlashXTooBig:
        Result := 'TRegExpr compile: hex code after \x is too big';
      reeUnmatchedSqBrackets:
        Result := 'TRegExpr compile: unmatched []';
      reeInternalUrp:
        Result := 'TRegExpr compile: internal fail on char "|", ")"';
      reeQPSBFollowsNothing:
        Result := 'TRegExpr compile: ?+*{ follows nothing';
      reeTrailingBackSlash:
        Result := 'TRegExpr compile: trailing \';
      reeRarseAtomInternalDisaster:
        Result := 'TRegExpr compile: RarseAtom internal disaster';
      reeIncorrectBraces:
        Result := 'TRegExpr compile: incorrect {} braces';
      reeBRACESArgTooBig:
        Result := 'TRegExpr compile: braces {} argument too big';
      reeUnknownOpcodeInFillFirst:
        // the offending opcode is taken from fLastErrorOpcode and dumped into the text
        Result := 'TRegExpr compile: unknown opcode in FillFirstCharSet ('+DumpOp(fLastErrorOpcode)+')';
      reeBracesMinParamGreaterMax:
        Result := 'TRegExpr compile: braces {} min param greater then max';
      reeUnclosedComment:
        Result := 'TRegExpr compile: unclosed (?#comment)';
      reeComplexBracesNotImplemented:
        Result := 'TRegExpr compile: if you use braces {} and non-greedy ops *?, +?, ?? for complex cases, enable {$DEFINE ComplexBraces}';
      reeUnrecognizedModifier:
        Result := 'TRegExpr compile: unrecognized modifier';
      reeBadLinePairedSeparator:
        Result := 'TRegExpr compile: LinePairedSeparator must countain two different chars or be empty';
      reeRegRepeatCalledInappropriately:
        Result := 'TRegExpr exec: RegRepeat called inappropriately';
      reeMatchPrimMemoryCorruption:
        Result := 'TRegExpr exec: MatchPrim memory corruption';
      reeMatchPrimCorruptedPointers:
        Result := 'TRegExpr exec: MatchPrim corrupted pointers';
      reeNoExpression:
        Result := 'TRegExpr exec: empty expression';
      reeCorruptedProgram:
        Result := 'TRegExpr exec: corrupted opcode (no magic byte)';
      reeNoInputStringSpecified:
        Result := 'TRegExpr exec: empty input string';
      reeOffsetMustBePositive:
        Result := 'TRegExpr exec: offset must be >0';
      reeExecNextWithoutExec:
        Result := 'TRegExpr exec: ExecNext without Exec(Pos)';
      reeBadOpcodeInCharClass:
        Result := 'TRegExpr exec: invalid opcode in char class';
      reeDumpCorruptedOpcode:
        Result := 'TRegExpr dump: corrupted opcode';
      reeModifierUnsupported:
        // raised by SetModifierStr when the modifier string cannot be parsed;
        // previously this code fell through to 'Unknown error'
        Result := 'TRegExpr exec: unsupported modifier in ModifierStr';
      reeLoopStackExceeded:
        Result := 'TRegExpr exec: loop stack exceeded';
      reeLoopWithoutEntry:
        Result := 'TRegExpr exec: loop without loop entry';
    else
      Result := 'Unknown error';
    end;
  end; { of function TRegExpr.ErrorMsg
  -------------------------------------------------------------- }
  function TRegExpr.LastError: integer;
  // Returns the stored error code and resets it to reeOk, so a second
  // call without a new error in between reports reeOk.
  begin
    Result := fLastError;
    fLastError := reeOk; // reading the code clears it
  end; { of function TRegExpr.LastError
  -------------------------------------------------------------- }
  1250. { ============================================================= }
  1251. { ===================== Common section ======================== }
  1252. { ============================================================= }
  class function TRegExpr.VersionMajor: integer;
  // Major part of the library version, taken from REVersionMajor.
  begin
    Result := REVersionMajor;
  end;
  class function TRegExpr.VersionMinor: integer;
  // Minor part of the library version, taken from REVersionMinor.
  begin
    Result := REVersionMinor;
  end;
  constructor TRegExpr.Create;
  // Creates an instance with no expression and no compiled program, and
  // initializes modifiers, character classes and line-separator settings
  // from the unit-level RegExpr* defaults.
  begin
    inherited;
    // nothing compiled yet
    programm := nil;
    fExpression := '';
    fInputString := '';
    FEmptyInputRaisesError := False;
    regexpBegin := nil;
    regexpIsCompiled := False;
    // clear all modifier flags, then apply the defaults one by one
    FillChar(fModifiers, SIzeOf(fModifiers), 0);
    ModifierI := RegExprModifierI;
    ModifierR := RegExprModifierR;
    ModifierS := RegExprModifierS;
    ModifierG := RegExprModifierG;
    ModifierM := RegExprModifierM;
    ModifierX := RegExprModifierX;
    SpaceChars := RegExprSpaceChars; // ###0.927
    WordChars := RegExprWordChars; // ###0.929
    fInvertCase := RegExprInvertCaseFunction; // ###0.927
    fLineSeparators := RegExprLineSeparators; // ###0.941
    LinePairedSeparator := RegExprLinePairedSeparator; // ###0.941
    FUseOsLineEndOnReplace := True;
    FReplaceLineEnd := sLineBreak;
    {$IFDEF UnicodeWordDetection}
    FUseUnicodeWordDetection := True;
    {$ENDIF}
    fSlowChecksSizeMax := 2000;
    // fLineSeparators is assigned directly above (field, not property), so
    // the lookup array is built explicitly here
    InitLineSepArray;
    InitCharCheckers;
  end; { of constructor TRegExpr.Create
  -------------------------------------------------------------- }
  constructor TRegExpr.Create(const AExpression: RegExprString);
  // Convenience constructor: performs the default initialization and then
  // assigns AExpression through the Expression property.
  begin
    Create; // parameterless overload sets up all defaults
    Expression := AExpression;
  end;
  destructor TRegExpr.Destroy;
  // Releases the compiled p-code buffer (if any) and then runs the
  // ancestor's destructor.
  begin
    if programm <> nil then
    begin
      FreeMem(programm);
      programm := nil;
    end;
    // always chain to the ancestor so its cleanup is not skipped
    inherited;
  end; { of destructor TRegExpr.Destroy
  -------------------------------------------------------------- }
  class function TRegExpr.InvertCaseFunction(const Ch: REChar): REChar;
  // Returns Ch with its letter case inverted.
  // 'a'..'z' and 'A'..'Z' are switched by the fixed offset 32; any other
  // character with code below 128 is returned unchanged. For codes >= 128
  // the upper-case form is tried first and, if that equals Ch, the
  // lower-case form is returned.
  begin
    Result := Ch;
    if (Ch >= 'a') and (Ch <= 'z') then
    begin
      Dec(Result, 32);
      Exit;
    end;
    if (Ch >= 'A') and (Ch <= 'Z') then
    begin
      Inc(Result, 32);
      Exit;
    end;
    if Ord(Ch) < 128 then
      Exit;
    // The original repeated the following three lines twice; the second
    // copy recomputed exactly the same value, so it is done once here.
    Result := _UpperCase(Ch);
    if Result = Ch then
      Result := _LowerCase(Ch);
  end; { of function TRegExpr.InvertCaseFunction
  -------------------------------------------------------------- }
  procedure TRegExpr.SetExpression(const AStr: RegExprString);
  // Stores a new expression text and discards the compiled program.
  // Nothing happens when the same text is assigned again while the
  // expression is already marked as compiled.
  begin
    if (AStr = fExpression) and regexpIsCompiled then
      Exit;

    regexpIsCompiled := False;
    fExpression := AStr;
    // presumably so that the cached start/end pointers below refer to a
    // private copy of the string - TODO confirm
    UniqueString(fExpression);
    fRegexStart := PRegExprChar(fExpression);
    fRegexEnd := fRegexStart + Length(fExpression);
    InvalidateProgramm; // ###0.941
  end; { of procedure TRegExpr.SetExpression
  -------------------------------------------------------------- }
  function TRegExpr.GetSubExprCount: integer;
  // Returns GrpCount, or -1 when startp[0] is nil (nothing was found;
  // -1 is what the TRegExpr docs require in that case).
  begin
    Result := -1;
    if startp[0] <> nil then
      Result := GrpCount;
  end;
  function TRegExpr.GetMatchPos(Idx: integer): PtrInt;
  // Returns the 1-based position, relative to fInputStart, at which group
  // Idx starts, or -1 when the group maps to a negative internal index or
  // has no start pointer.
  begin
    Result := -1;
    Idx := GrpIndexes[Idx]; // translate to the internal slot number
    if Idx < 0 then
      Exit;
    if startp[Idx] <> nil then
      Result := startp[Idx] - fInputStart + 1;
  end; { of function TRegExpr.GetMatchPos
  -------------------------------------------------------------- }
  function TRegExpr.GetMatchLen(Idx: integer): PtrInt;
  // Returns the length (endp - startp) of group Idx, or -1 when the group
  // maps to a negative internal index or has no start pointer.
  begin
    Result := -1;
    Idx := GrpIndexes[Idx]; // translate to the internal slot number
    if Idx < 0 then
      Exit;
    if startp[Idx] <> nil then
      Result := endp[Idx] - startp[Idx];
  end; { of function TRegExpr.GetMatchLen
  -------------------------------------------------------------- }
  function TRegExpr.GetMatch(Idx: integer): RegExprString;
  // Returns the text of group Idx, or '' when the group maps to a negative
  // internal index or its end pointer is not above its start pointer.
  begin
    Result := '';
    Idx := GrpIndexes[Idx]; // translate to the internal slot number
    if Idx < 0 then
      Exit;
    // compare the pointers directly instead of testing their difference
    if endp[Idx] > startp[Idx] then
      SetString(Result, startp[Idx], endp[Idx] - startp[Idx]);
  end; { of function TRegExpr.GetMatch
  -------------------------------------------------------------- }
  function TRegExpr.GetModifierStr: RegExprString;
  // Builds the modifier string: letters of enabled modifiers, then '-' and
  // the letters of disabled ones. The '-' is omitted when nothing is
  // disabled. Modifiers are visited in the order i, r, s, g, m, x; each
  // enabled letter is put in FRONT of the previous ones, each disabled
  // letter is appended after the previous ones.
  var
    Enabled, Disabled: RegExprString;

    procedure Put(AIsOn: boolean; ALetter: REChar);
    begin
      if AIsOn then
        Enabled := ALetter + Enabled
      else
        Disabled := Disabled + ALetter;
    end;

  begin
    Enabled := '';
    Disabled := '';
    Put(ModifierI, 'i');
    Put(ModifierR, 'r');
    Put(ModifierS, 's');
    Put(ModifierG, 'g');
    Put(ModifierM, 'm');
    Put(ModifierX, 'x');
    Result := Enabled;
    if Disabled <> '' then
      Result := Result + '-' + Disabled;
  end; { of function TRegExpr.GetModifierStr
  -------------------------------------------------------------- }
  procedure TRegExpr.SetModifierG(AValue: boolean);
  // Stores AValue in the G flag of fModifiers.
  begin
    fModifiers.G := AValue;
  end;
  procedure TRegExpr.SetModifierI(AValue: boolean);
  // Stores AValue in the I flag of fModifiers.
  begin
    fModifiers.I := AValue;
  end;
  procedure TRegExpr.SetModifierM(AValue: boolean);
  // Stores AValue in the M flag of fModifiers.
  begin
    fModifiers.M := AValue;
  end;
  procedure TRegExpr.SetModifierR(AValue: boolean);
  // Stores AValue in the R flag of fModifiers.
  begin
    fModifiers.R := AValue;
  end;
  procedure TRegExpr.SetModifierS(AValue: boolean);
  // Stores AValue in the S flag of fModifiers.
  begin
    fModifiers.S := AValue;
  end;
  procedure TRegExpr.SetModifierX(AValue: boolean);
  // Stores AValue in the X flag of fModifiers.
  begin
    fModifiers.X := AValue;
  end;
  procedure TRegExpr.SetModifierStr(const AStr: RegExprString);
  // Parses AStr into fModifiers via ParseModifiers; reports
  // reeModifierUnsupported through Error when parsing fails.
  var
    Parsed: boolean;
  begin
    Parsed := ParseModifiers(PRegExprChar(AStr), Length(AStr), fModifiers);
    if not Parsed then
      Error(reeModifierUnsupported);
  end; { of procedure TRegExpr.SetModifierStr
  -------------------------------------------------------------- }
  1446. { ============================================================= }
  1447. { ==================== Compiler section ======================= }
  1448. { ============================================================= }
  {$IFDEF UnicodeWordDetection}
  {$IFDEF FPC}
  function IsUnicodeWordChar(AChar: WideChar): boolean; inline;
  // FPC variant: False for any code unit at or above LOW_SURROGATE_BEGIN;
  // otherwise True when the character's Unicode general category is not
  // greater than UGC_OtherNumber.
  begin
    Result := False;
    if Ord(AChar) < LOW_SURROGATE_BEGIN then
      Result := GetProps(Ord(AChar))^.Category <= UGC_OtherNumber;
  end;
  {$ELSE}
  function IsUnicodeWordChar(AChar: WideChar): boolean; inline;
  // Non-FPC variant: delegates to System.Character.IsLetterOrDigit.
  begin
    Result := System.Character.IsLetterOrDigit(AChar);
  end;
  {$ENDIF}
  {$ENDIF}
  function TRegExpr.IsWordChar(AChar: REChar): boolean;
  // True when AChar occurs in fWordChars. With UnicodeWordDetection
  // compiled in and enabled, characters with code >= 128 that are not in
  // fWordChars are additionally checked with IsUnicodeWordChar.
  begin
    if Pos(AChar, fWordChars) > 0 then
    begin
      Result := True;
      Exit;
    end;
    Result := False;
    {$IFDEF UnicodeWordDetection}
    if (Ord(AChar) >= 128) and UseUnicodeWordDetection then
      Result := IsUnicodeWordChar(AChar);
    {$ENDIF}
  end;
  function TRegExpr.IsSpaceChar(AChar: REChar): boolean;
  // True when AChar occurs in fSpaceChars.
  begin
    Result := Pos(AChar, fSpaceChars) > 0;
  end;
  function TRegExpr.IsCustomLineSeparator(AChar: REChar): boolean;
  // True when AChar is one of the configured line separators.
  // Unicode builds search fLineSeparators; other builds use the lookup
  // table fLineSepArray indexed by the character's byte value.
  begin
    {$IFDEF UniCode}
    Result := Pos(AChar, fLineSeparators) > 0;
    {$ELSE}
    Result := fLineSepArray[byte(AChar)];
    {$ENDIF}
  end;
  function IsDigitChar(AChar: REChar): boolean; inline;
  // True for the ASCII digits '0'..'9' only.
  begin
    Result := (AChar >= '0') and (AChar <= '9');
  end;
  function IsHorzSeparator(AChar: REChar): boolean; inline;
  // True for Tab and the characters of the Unicode category
  // "Space Separator": https://www.compart.com/en/unicode/category/Zs
  // Code points above $FF are only recognized in Unicode builds.
  begin
    Result := False;
    case AChar of
      #9, #$20, #$A0:
        Result := True;
      {$IFDEF UniCode}
      #$1680, #$2000 .. #$200A, #$202F, #$205F, #$3000:
        Result := True;
      {$ENDIF}
    end;
  end;
  function IsLineSeparator(AChar: REChar): boolean; inline;
  // True for #$d, #$a, #$b and #$c; Unicode builds additionally accept
  // #$2028, #$2029 and #$85.
  begin
    Result := False;
    case AChar of
      #$d, #$a, #$b, #$c:
        Result := True;
      {$IFDEF UniCode}
      #$2028, #$2029, #$85:
        Result := True;
      {$ENDIF}
    end;
  end;
  procedure TRegExpr.InvalidateProgramm;
  // Frees the compiled p-code buffer, if there is one, and sets
  // programm to nil.
  begin
    if programm = nil then
      Exit;
    FreeMem(programm);
    programm := nil;
  end; { of procedure TRegExpr.InvalidateProgramm
  -------------------------------------------------------------- }
  procedure TRegExpr.Compile;
  // Compiles fExpression via CompileRegExpr; an empty expression is
  // reported as reeNoExpression and nothing is compiled.
  begin
    if fExpression <> '' then
      CompileRegExpr(PRegExprChar(fExpression))
    else
      Error(reeNoExpression);
  end; { of procedure TRegExpr.Compile
  -------------------------------------------------------------- }
  procedure TRegExpr.InitLineSepArray;
  // Non-Unicode builds: rebuilds the fLineSepArray lookup table so that
  // the entry of every character in fLineSeparators is True and all other
  // entries are False. Unicode builds: does nothing.
  {$IFNDEF UniCode}
  var
    SepIdx: integer;
  {$ENDIF}
  begin
    {$IFNDEF UniCode}
    FillChar(fLineSepArray, SizeOf(fLineSepArray), 0);
    for SepIdx := 1 to Length(fLineSeparators) do
      fLineSepArray[byte(fLineSeparators[SepIdx])] := True;
    {$ENDIF}
  end;
  function TRegExpr.IsProgrammOk: boolean;
  // Makes sure a usable compiled program exists and returns True if so.
  // The program is dropped when the current modifiers differ from the ones
  // it was compiled with, and (re)compiled when missing. A program whose
  // first element is not OP_MAGIC is reported as reeCorruptedProgram.
  begin
    Result := False;

    // modifiers changed since compilation -> program is stale
    if not IsModifiersEqual(fModifiers, fProgModifiers) then // ###0.941
      InvalidateProgramm;

    // [re]compile if needed
    if programm = nil then
    begin
      Compile; // ###0.941
      if programm = nil then
        Exit; // error was set/raised by Compile (was reeExecAfterCompErr)
    end;

    Result := programm[0] = OP_MAGIC;
    if not Result then
      Error(reeCorruptedProgram); // program corrupted
  end; { of function TRegExpr.IsProgrammOk
  -------------------------------------------------------------- }
  procedure TRegExpr.Tail(p: PRegExprChar; val: PRegExprChar);
  // Sets the next-"pointer" at the end of the node chain starting at p so
  // that it refers to val. Does nothing when p is @regdummy.
  var
    LastNode, Follower: PRegExprChar;
    OffsetSlot: PRENextOff;
  begin
    if p = @regdummy then
      Exit;

    // walk the chain until regnext reports no successor
    LastNode := p;
    Follower := regnext(LastNode);
    while Follower <> nil do
    begin
      LastNode := Follower;
      Follower := regnext(LastNode);
    end;

    // the offset is stored right after the opcode of the last node
    OffsetSlot := PRENextOff(AlignToPtr(LastNode + REOpSz));
    if val < LastNode then
      // ###0.948: always subtract the lower address from the higher one and
      // negate afterwards. This works around the PWideChar subtraction bug
      // (Delphi uses shr after subtraction to calculate widechar distance,
      // which breaks for negative differences). Delphi help: "P - Q computes
      // the difference between the address given by P (the higher address)
      // and the address given by Q (the lower address)".
      OffsetSlot^ := -(LastNode - val)
    else
      OffsetSlot^ := val - LastNode; // ###0.933
  end; { of procedure TRegExpr.Tail
  -------------------------------------------------------------- }
  procedure TRegExpr.OpTail(p: PRegExprChar; val: PRegExprChar);
  // Tail applied to the operand of p; a no-op unless p is a real
  // OP_BRANCH node ("operandless" and "op != OP_BRANCH" are synonymous
  // in practice).
  begin
    if (p <> nil) and (p <> @regdummy) and (PREOp(p)^ = OP_BRANCH) then
      Tail(p + REOpSz + RENextOffSz, val); // ###0.933
  end; { of procedure TRegExpr.OpTail
  -------------------------------------------------------------- }
  function TRegExpr.EmitNode(op: TREOp): PRegExprChar; // ###0.933
  // Emits a node (opcode + zeroed Next offset) at regcode and returns the
  // node's location. When regcode is @regdummy no code is generated; only
  // regsize is increased to compute the code size.
  begin
    Result := regcode;
    if Result = @regdummy then
    begin
      // compute code size without code generation
      Inc(regsize, REOpSz + RENextOffSz);
      Exit;
    end;

    PREOp(regcode)^ := op;
    Inc(regcode, REOpSz);
    PRENextOff(AlignToPtr(regcode))^ := 0; // Next "pointer" := nil
    Inc(regcode, RENextOffSz);

    // for EXACTLY nodes remember where the operand begins (kept as PLongInt)
    if (op <> OP_EXACTLY) and (op <> OP_EXACTLYCI) then
      regExactlyLen := nil
    else
      regExactlyLen := PLongInt(regcode);

    {$IFDEF DebugSynRegExpr}
    if regcode - programm > regsize then
      raise Exception.Create('TRegExpr.EmitNode buffer overrun');
    {$ENDIF}
  end; { of function TRegExpr.EmitNode
  -------------------------------------------------------------- }
  procedure TRegExpr.EmitC(ch: REChar); {$IFDEF InlineFuncs}inline;{$ENDIF}
  // Appends one character to the p-code. When regcode is @regdummy only
  // regsize is increased.
  begin
    if regcode = @regdummy then
    begin
      Inc(regsize, REOpSz); // Type of p-code pointer always is ^REChar
      Exit;
    end;

    regcode^ := ch;
    Inc(regcode);
    {$IFDEF DebugSynRegExpr}
    if regcode - programm > regsize then
      raise Exception.Create('TRegExpr.EmitC buffer overrun');
    {$ENDIF}
  end; { of procedure TRegExpr.EmitC
  -------------------------------------------------------------- }
  procedure TRegExpr.EmitInt(AValue: LongInt); {$IFDEF InlineFuncs}inline;{$ENDIF}
  // Appends a LongInt to the p-code, advancing regcode by RENumberSz.
  // When regcode is @regdummy only regsize is increased.
  begin
    if regcode = @regdummy then
    begin
      Inc(regsize, RENumberSz);
      Exit;
    end;

    PLongInt(regcode)^ := AValue;
    Inc(regcode, RENumberSz);
    {$IFDEF DebugSynRegExpr}
    if regcode - programm > regsize then
      raise Exception.Create('TRegExpr.EmitInt buffer overrun');
    {$ENDIF}
  end;
  procedure TRegExpr.InsertOperator(op: TREOp; opnd: PRegExprChar; sz: integer);
  // Inserts an operator of size sz in front of the already-emitted operand
  // at opnd, which means relocating the operand by sz elements. The opcode
  // is written at opnd and the rest of the inserted area is zero-filled.
  // When regcode is @regdummy only regsize is increased.
  var
    MoveSrc, MoveDst, FillPos: PRegExprChar;
    Remaining: integer;
  begin
    if regcode = @regdummy then
    begin
      Inc(regsize, sz);
      Exit;
    end;

    // shift everything from opnd up to the old regcode by sz elements,
    // copying from the end so the overlapping ranges are handled correctly
    MoveSrc := regcode;
    Inc(regcode, sz);
    {$IFDEF DebugSynRegExpr}
    if regcode - programm > regsize then
      raise Exception.Create('TRegExpr.InsertOperator buffer overrun');
    // if (opnd<regcode) or (opnd-regcode>regsize) then
    // raise Exception.Create('TRegExpr.InsertOperator invalid opnd');
    {$ENDIF}
    MoveDst := regcode;
    while MoveSrc > opnd do
    begin
      Dec(MoveSrc);
      Dec(MoveDst);
      MoveDst^ := MoveSrc^;
    end;

    // op node goes where the operand used to be
    PREOp(opnd)^ := op;
    FillPos := opnd + REOpSz;
    Remaining := sz - REOpSz;
    while Remaining > 0 do
    begin
      FillPos^ := #0;
      Inc(FillPos);
      Dec(Remaining);
    end;
  end; { of procedure TRegExpr.InsertOperator
  -------------------------------------------------------------- }
  function FindSkippedMetaLen(PStart, PEnd: PRegExprChar): integer; {$IFDEF InlineFuncs}inline;{$ENDIF}
  // Returns the length of the initial segment of [PStart, PEnd) that
  // consists entirely of characters for which _IsMetaSymbol1 is False.
  begin
    Result := 0;
    while PStart + Result < PEnd do
    begin
      if _IsMetaSymbol1(PStart[Result]) then
        Break;
      Inc(Result);
    end;
  end;
  const
    // Flags to be passed up and down.
    // NOTE(review): the values are written with a leading zero (01, 02, 04);
    // they read as distinct bits 1, 2 and 4 - confirm they are combined
    // bitwise by the users of these flags.
    flag_HasWidth = 01; // Known never to match nil string.
    flag_Simple = 02; // Simple enough to be OP_STAR/OP_PLUS/OP_BRACES operand.
    flag_SpecStart = 04; // Starts with * or +.
    flag_Worst = 0; // Worst case.
    // Bounds of the lower-case and upper-case Russian letter ranges,
    // as code points in Unicode builds and as cp1251 codes otherwise.
    {$IFDEF UniCode}
    RusRangeLoLow = #$430; // 'а'
    RusRangeLoHigh = #$44F; // 'я'
    RusRangeHiLow = #$410; // 'А'
    RusRangeHiHigh = #$42F; // 'Я'
    {$ELSE}
    RusRangeLoLow = #$E0; // 'а' in cp1251
    RusRangeLoHigh = #$FF; // 'я' in cp1251
    RusRangeHiLow = #$C0; // 'А' in cp1251
    RusRangeHiHigh = #$DF; // 'Я' in cp1251
    {$ENDIF}
  function TRegExpr.FindInCharClass(ABuffer: PRegExprChar; AChar: REChar; AIgnoreCase: boolean): boolean;
  // Tests whether AChar belongs to the character class encoded in ABuffer.
  // Buffer contains char pairs: (Kind, Data), where Kind is one of OpKind_
  // values, and Data depends on Kind:
  //   OpKind_Range     - two chars: low and high bound (inclusive)
  //   OpKind_MetaClass - one char: index into CharCheckers
  //   OpKind_Char      - LongInt count N followed by N chars
  //   OpKind_End       - end of the class
  // With AIgnoreCase the tested char is upper-cased; range bounds and
  // listed chars are expected to be upper-cased in the opcode already.
  // Returns True on the first matching item and False at OpKind_End.
  // An unknown Kind is reported as reeBadOpcodeInCharClass and the result
  // is False.
  var
    ch, ch2: REChar;
    N, i: integer;
  begin
    if AIgnoreCase then
      AChar := _UpperCase(AChar);
    repeat
      case ABuffer^ of
        OpKind_End:
          begin
            Result := False;
            Exit;
          end;
        OpKind_Range:
          begin
            Inc(ABuffer);
            ch := ABuffer^;
            Inc(ABuffer);
            ch2 := ABuffer^;
            Inc(ABuffer);
            // if AIgnoreCase, ch and ch2 are already upcased in opcode
            if (AChar >= ch) and (AChar <= ch2) then
            begin
              Result := True;
              Exit;
            end;
          end;
        OpKind_MetaClass:
          begin
            Inc(ABuffer);
            N := Ord(ABuffer^);
            Inc(ABuffer);
            if CharCheckers[N](AChar) then
            begin
              Result := True;
              Exit;
            end;
          end;
        OpKind_Char:
          begin
            Inc(ABuffer);
            N := PLongInt(ABuffer)^;
            Inc(ABuffer, RENumberSz);
            for i := 1 to N do
            begin
              ch := ABuffer^;
              Inc(ABuffer);
              // if AIgnoreCase, ch is already upcased in opcode
              if ch = AChar then
              begin
                Result := True;
                Exit;
              end;
            end;
          end;
      else
        begin
          Error(reeBadOpcodeInCharClass);
          // If Error returns instead of raising, stop here: ABuffer was not
          // advanced, so continuing would loop forever on the same item
          // with Result left unassigned.
          Result := False;
          Exit;
        end;
      end;
    until False; // assume that Buffer is ended correctly
  end;
  procedure TRegExpr.GetCharSetFromWordChars(var ARes: TRegExprCharset);
  // Rebuilds ARes from scratch as the set of byte codes of the characters
  // listed in fWordChars. In UniCode builds characters whose code is above
  // $FF are left out.
  var
    Idx: integer;
    Code: integer;
  begin
    ARes := [];
    for Idx := Length(fWordChars) downto 1 do
    begin
      Code := Ord(fWordChars[Idx]);
      {$IFDEF UniCode}
      if Code <= $FF then
      {$ENDIF}
        Include(ARes, byte(Code));
    end;
  end;
  procedure TRegExpr.GetCharSetFromSpaceChars(var ARes: TRegExprCharset);
  // Rebuilds ARes from scratch as the set of byte codes of the characters
  // listed in fSpaceChars. In UniCode builds characters whose code is above
  // $FF are left out.
  var
    Idx: integer;
    Code: integer;
  begin
    ARes := [];
    for Idx := Length(fSpaceChars) downto 1 do
    begin
      Code := Ord(fSpaceChars[Idx]);
      {$IFDEF UniCode}
      if Code <= $FF then
      {$ENDIF}
        Include(ARes, byte(Code));
    end;
  end;
  procedure TRegExpr.GetCharSetFromCharClass(ABuffer: PRegExprChar; AIgnoreCase: boolean; var ARes: TRegExprCharset);
  // Converts the compiled character class at ABuffer into a byte-indexed set.
  //
  // ABuffer uses the same item layout that FindInCharClass reads: a sequence
  // of OpKind_Range / OpKind_MetaClass / OpKind_Char items terminated by
  // OpKind_End.
  //
  // ABuffer     : start of the item sequence
  // AIgnoreCase : when True, the case counterpart (InvertCase) of every listed
  //               char is added as well; the A-Z meta classes expand to
  //               RegExprAllAzSet
  // ARes        : receives the resulting set; it is cleared first
  //
  // Only codes 0..$FF can be represented. In UniCode builds anything above
  // $FF is left out.
  // An unknown item kind or meta-class index is reported via
  // Error(reeBadOpcodeInCharClass).
  var
    ch, ch2: REChar;
    TempSet: TRegExprCharSet;
    N, i: integer;

    // Adds AChar to ARes when its code fits into a byte. Without this guard a
    // UniCode char above $FF would be truncated by byte() to an unrelated code.
    procedure AddChar(AChar: REChar);
    begin
      {$IFDEF UniCode}
      if Ord(AChar) <= $FF then
      {$ENDIF}
        Include(ARes, byte(AChar));
    end;

  begin
    ARes := [];
    TempSet := [];
    repeat
      case ABuffer^ of
        OpKind_End:
          Exit;
        OpKind_Range:
          begin
            Inc(ABuffer);
            ch := ABuffer^;
            Inc(ABuffer);
            ch2 := ABuffer^;
            Inc(ABuffer);
            // upper bound is clamped to $FF in UniCode builds
            for i := Ord(ch) to
              {$IFDEF UniCode} Min(Ord(ch2), $FF) {$ELSE} Ord(ch2) {$ENDIF} do
            begin
              Include(ARes, byte(i));
              if AIgnoreCase then
                AddChar(InvertCase(REChar(i))); // counterpart may be > $FF
            end;
          end;
        OpKind_MetaClass:
          begin
            Inc(ABuffer);
            N := Ord(ABuffer^);
            Inc(ABuffer);
            if N = CheckerIndex_Word then
            begin
              GetCharSetFromWordChars(TempSet);
              ARes := ARes + TempSet;
            end
            else
            if N = CheckerIndex_NotWord then
            begin
              GetCharSetFromWordChars(TempSet);
              ARes := ARes + (RegExprAllSet - TempSet);
            end
            else
            if N = CheckerIndex_Space then
            begin
              GetCharSetFromSpaceChars(TempSet);
              ARes := ARes + TempSet;
            end
            else
            if N = CheckerIndex_NotSpace then
            begin
              GetCharSetFromSpaceChars(TempSet);
              ARes := ARes + (RegExprAllSet - TempSet);
            end
            else
            if N = CheckerIndex_Digit then
              ARes := ARes + RegExprDigitSet
            else
            if N = CheckerIndex_NotDigit then
              ARes := ARes + (RegExprAllSet - RegExprDigitSet)
            else
            if N = CheckerIndex_VertSep then
              ARes := ARes + RegExprLineSeparatorsSet
            else
            if N = CheckerIndex_NotVertSep then
              ARes := ARes + (RegExprAllSet - RegExprLineSeparatorsSet)
            else
            if N = CheckerIndex_HorzSep then
              ARes := ARes + RegExprHorzSeparatorsSet
            else
            if N = CheckerIndex_NotHorzSep then
              ARes := ARes + (RegExprAllSet - RegExprHorzSeparatorsSet)
            else
            if N = CheckerIndex_LowerAZ then
            begin
              if AIgnoreCase then
                ARes := ARes + RegExprAllAzSet
              else
                ARes := ARes + RegExprLowerAzSet;
            end
            else
            if N = CheckerIndex_UpperAZ then
            begin
              if AIgnoreCase then
                ARes := ARes + RegExprAllAzSet
              else
                ARes := ARes + RegExprUpperAzSet;
            end
            else
            begin
              Error(reeBadOpcodeInCharClass);
              Exit; // same "Error; Exit" convention as the rest of the unit
            end;
          end;
        OpKind_Char:
          begin
            Inc(ABuffer);
            N := PLongInt(ABuffer)^;
            Inc(ABuffer, RENumberSz);
            for i := 1 to N do
            begin
              ch := ABuffer^;
              Inc(ABuffer);
              AddChar(ch);
              // The counterpart is checked independently of ch: a char above
              // $FF may still have a case counterpart inside 0..$FF, and the
              // counterpart of a char inside 0..$FF may lie above $FF.
              if AIgnoreCase then
                AddChar(InvertCase(ch));
            end;
          end;
      else
        begin
          Error(reeBadOpcodeInCharClass);
          // ABuffer was not advanced; looping again after a returning Error
          // would never terminate.
          Exit;
        end;
      end;
    until False; // assume that Buffer is ended correctly
  end;
  function TRegExpr.GetModifierG: boolean;
  // Read accessor for the G modifier: returns fModifiers.G.
  // The compiler treats quantifiers as non-greedy when this modifier is off.
  begin
    Result := fModifiers.G;
  end;
  function TRegExpr.GetModifierI: boolean;
  // Read accessor for the I modifier: returns fModifiers.I.
  // When on, the compiler emits the case-insensitive opcode variants.
  begin
    Result := fModifiers.I;
  end;
  function TRegExpr.GetModifierM: boolean;
  // Read accessor for the M modifier: returns fModifiers.M.
  // When on, '^' and '$' may compile to their multi-line opcode variants.
  begin
    Result := fModifiers.M;
  end;
  function TRegExpr.GetModifierR: boolean;
  // Read accessor for the R modifier: returns fModifiers.R.
  // When on, the compiler applies its special handling of Russian letter
  // ranges inside [...].
  begin
    Result := fModifiers.R;
  end;
  function TRegExpr.GetModifierS: boolean;
  // Read accessor for the S modifier: returns fModifiers.S.
  // When on, '.' compiles to OP_ANY; otherwise to OP_ANYML.
  begin
    Result := fModifiers.S;
  end;
  function TRegExpr.GetModifierX: boolean;
  // Read accessor for the X modifier: returns fModifiers.X.
  // NOTE(review): presumably the "extended syntax" switch - confirm against
  // the place where the modifier is consumed.
  begin
    Result := fModifiers.X;
  end;
  function TRegExpr.CompileRegExpr(ARegExp: PRegExprChar): boolean;
  // Compiles the pattern text at ARegExp into the internal program (programm).
  //
  // The size of the compiled form is not known before compiling, and the code
  // cannot be emitted without a buffer to emit into. So the pattern is parsed
  // twice: the first pass emits into a dummy location and only counts the size
  // (and validates the syntax), the second pass emits into the freshly
  // allocated buffer. Memory is therefore allocated only for patterns that are
  // known to compile, and the code never has to be moved once pointers into
  // it exist.
  //
  // After emitting, some facts used for optimization are extracted; that part
  // relies on knowledge of the compiled program's layout.
  //
  // Returns True on success. On any failure the program is invalidated and
  // False is returned (unless Error raised an exception on the way).
  var
    Node, BestLiteral: PRegExprChar;
    BestLen, CurLen: integer;
    ParseFlags: integer;
  begin
    Result := False; // pessimistic until everything succeeded
    ParseFlags := 0;
    regparse := nil; // for correct error handling
    regexpBegin := ARegExp;
    regExactlyLen := nil;
    ClearInternalIndexes;
    fLastError := reeOk;
    fLastErrorOpcode := TREOp(0);

    try
      // Drop the previously compiled program, if any.
      if programm <> nil then
      begin
        FreeMem(programm);
        programm := nil;
      end;

      if ARegExp = nil then
      begin
        Error(reeCompNullArgument);
        Exit;
      end;

      fProgModifiers := fModifiers;

      // Pass 1: emit into the dummy slot, just to learn the size and to
      // check that the pattern is legal.
      fSecondPass := False;
      fCompModifiers := fModifiers;
      regparse := ARegExp;
      regnpar := 1;
      regsize := 0;
      regcode := @regdummy;
      EmitC(OP_MAGIC);
      if ParseReg(0, ParseFlags) = nil then
        Exit;

      // Now the size is known.
      GetMem(programm, regsize * SizeOf(REChar));

      // Pass 2: same parse again, this time emitting real code.
      fSecondPass := True;
      fCompModifiers := fModifiers;
      regparse := ARegExp;
      regnpar := 1;
      regcode := programm;
      EmitC(OP_MAGIC);
      if ParseReg(0, ParseFlags) = nil then
        Exit;

      // Collect information for optimizations.
      {$IFDEF UseFirstCharSet} // ###0.929
      FirstCharSet := [];
      FillFirstCharSet(programm + REOpSz);
      // BestLen is only borrowed as a loop counter here
      for BestLen := 0 to 255 do
        FirstCharArray[BestLen] := byte(BestLen) in FirstCharSet;
      {$ENDIF}

      reganchored := #0;
      regmust := nil;
      regmustlen := 0;
      regmustString := '';

      Node := programm + REOpSz; // first OP_BRANCH
      if PREOp(regnext(Node))^ = OP_EEND then
      begin
        // There is a single top-level alternative; step to its first node.
        Node := Node + REOpSz + RENextOffSz;

        // Pattern starts with a begin-of-line anchor.
        if PREOp(Node)^ = OP_BOL then
          Inc(reganchored);

        // When the pattern contains something expensive, remember the longest
        // literal that has to occur in any match (regmust). On equal length
        // the later literal wins.
        if (ParseFlags and flag_SpecStart) <> 0 then
        begin
          BestLiteral := nil;
          BestLen := 0;
          while Node <> nil do
          begin
            if PREOp(Node)^ = OP_EXACTLY then
            begin
              CurLen := PLongInt(Node + REOpSz + RENextOffSz)^;
              if CurLen >= BestLen then
              begin
                BestLiteral := Node + REOpSz + RENextOffSz + RENumberSz;
                BestLen := CurLen;
              end;
            end;
            Node := regnext(Node);
          end;
          regmust := BestLiteral;
          regmustlen := BestLen;
          // a one-char literal is too short to be worth a string copy
          if regmustlen > 1 then
            SetString(regmustString, regmust, regmustlen);
        end;
      end;

      Result := True;
    finally
      if not Result then
        InvalidateProgramm;
      regexpBegin := nil;
      regexpIsCompiled := Result; // ###0.944
    end;
  end; { of function TRegExpr.CompileRegExpr }
  procedure TRegExpr.SetUseOsLineEndOnReplace(AValue: boolean);
  // Write accessor for FUseOsLineEndOnReplace. When the value actually
  // changes, FReplaceLineEnd is refreshed too: sLineBreak when the option is
  // on, a single #10 when it is off.
  begin
    if FUseOsLineEndOnReplace <> AValue then
    begin
      FUseOsLineEndOnReplace := AValue;
      if AValue then
        FReplaceLineEnd := sLineBreak
      else
        FReplaceLineEnd := #10;
    end;
  end;
  function TRegExpr.ParseReg(paren: integer; var flagp: integer): PRegExprChar;
  // Parses a whole regular expression: either the top level (paren = 0) or
  // the inside of a group (paren <> 0; the caller has already consumed the
  // opening parenthesis).
  //
  // Group handling and the top level share this routine because in both
  // cases the tails of all alternatives have to be tied to the node that
  // follows them.
  //
  // flagp receives flag_HasWidth only if every alternative has width, and
  // flag_SpecStart if any alternative reported it.
  // Returns the first emitted node, or nil on error.
  var
    Head, Branch, Closer: PRegExprChar;
    GroupNo: integer;
    BranchFlags: integer;
    OuterModifiers: TRegExprModifiers;
    IsGroup: boolean;
  begin
    BranchFlags := 0;
    Result := nil;
    flagp := flag_HasWidth; // tentatively
    GroupNo := 0; // keeps the compiler quiet
    OuterModifiers := fCompModifiers;
    IsGroup := paren <> 0;

    // A group starts with its OP_OPEN node.
    Head := nil;
    if IsGroup then
    begin
      if regnpar >= NSUBEXP then
      begin
        Error(reeCompParseRegTooManyBrackets);
        Exit;
      end;
      GroupNo := regnpar;
      Inc(regnpar);
      Head := EmitNode(TREOp(Ord(OP_OPEN) + GroupNo));
    end;

    // Collect the alternatives separated by '|' and chain them.
    repeat
      Branch := ParseBranch(BranchFlags);
      if Branch = nil then
        Exit; // Result is nil

      if Head = nil then
        Head := Branch // top level: the first branch is the head
      else
        Tail(Head, Branch); // OP_OPEN -> first, OP_BRANCH -> OP_BRANCH

      if (BranchFlags and flag_HasWidth) = 0 then
        flagp := flagp and not flag_HasWidth;
      flagp := flagp or (BranchFlags and flag_SpecStart);

      if regparse^ <> '|' then
        Break;
      Inc(regparse);
    until False;

    // Emit the closing node and append it to the chain.
    if IsGroup then
      Closer := EmitNode(TREOp(Ord(OP_CLOSE) + GroupNo))
    else
      Closer := EmitNode(OP_EEND);
    Tail(Head, Closer);

    // Let the tail of every alternative lead to the closing node.
    Branch := Head;
    while Branch <> nil do
    begin
      OpTail(Branch, Closer);
      Branch := regnext(Branch);
    end;

    // Verify that parsing stopped where it is supposed to.
    if IsGroup then
    begin
      if regparse^ <> ')' then
      begin
        Error(reeCompParseRegUnmatchedBrackets);
        Exit;
      end;
      Inc(regparse); // skip trailing ')'
    end
    else if regparse < fRegexEnd then
    begin
      if regparse^ = ')' then
        Error(reeCompParseRegUnmatchedBrackets2)
      else
        Error(reeCompParseRegJunkOnEnd);
      Exit;
    end;

    fCompModifiers := OuterModifiers; // restore modifiers of parent
    Result := Head;
  end; { of function TRegExpr.ParseReg }
  function TRegExpr.ParseBranch(var flagp: integer): PRegExprChar;
  // Parses one alternative of a '|' list, i.e. a concatenation of pieces,
  // up to the end of the pattern, a '|' or a ')'.
  //
  // flagp receives flag_HasWidth if any piece has width, and flag_SpecStart
  // if the first piece reported it.
  // Returns the OP_BRANCH node heading the alternative, or nil on error.
  var
    BranchNode, Prev, Piece: PRegExprChar;
    PieceFlags: integer;
  begin
    PieceFlags := 0;
    flagp := flag_Worst; // tentatively
    BranchNode := EmitNode(OP_BRANCH);
    Prev := nil;

    while (regparse < fRegexEnd) and (regparse^ <> '|') and (regparse^ <> ')') do
    begin
      Piece := ParsePiece(PieceFlags);
      if Piece = nil then
      begin
        Result := nil;
        Exit;
      end;

      flagp := flagp or (PieceFlags and flag_HasWidth);
      if Prev <> nil then
        Tail(Prev, Piece)
      else
        // only the first piece decides about flag_SpecStart
        flagp := flagp or (PieceFlags and flag_SpecStart);
      Prev := Piece;
    end;

    // An empty alternative still needs a body.
    if Prev = nil then
      EmitNode(OP_NOTHING);

    Result := BranchNode;
  end; { of function TRegExpr.ParseBranch }
  function TRegExpr.ParsePiece(var flagp: integer): PRegExprChar;
  // Parses an atom together with an optional quantifier: '*', '+', '?' or
  // '{min}' / '{min,}' / '{min,max}'. A '?' right after the quantifier asks
  // for the non-greedy form; quantifiers are also non-greedy whenever the G
  // modifier is off.
  //
  // The branching code emitted for greedy '?', '*' and '+' on non-simple
  // atoms reuses a single OP_NOTHING node both as end marker of the branch
  // list and as body of the last branch. The end-marker role is required.
  //
  // flagp receives the flags of the resulting piece.
  // Returns the first node of the piece, or nil when ParseAtom failed.
  var
    TheOp: TREOp;
    NextNode: PRegExprChar;
    op: REChar;
    NonGreedyOp, NonGreedyCh: boolean; // ###0.940
    AtomFlags: integer;
    IsSimpleAtom: boolean;
    BracesMin, BracesMax: TREBracesArg;
    NumStart: PRegExprChar;

    // Converts the decimal digits AFirst..ALast (inclusive) to a number.
    // Reports reeBRACESArgTooBig for more than 8 digits or for a value
    // outside 0..MaxBracesArg.
    function ParseNumber(AFirst, ALast: PRegExprChar): TREBracesArg;
    begin
      Result := 0;
      if ALast - AFirst + 1 > 8 then
      begin // do not even scan absurdly long numbers
        Error(reeBRACESArgTooBig);
        Exit;
      end;
      while AFirst <= ALast do
      begin
        Result := Result * 10 + (Ord(AFirst^) - Ord('0'));
        Inc(AFirst);
      end;
      if (Result > MaxBracesArg) or (Result < 0) then
        Error(reeBRACESArgTooBig);
    end;

    // Sets NonGreedyCh when the char after regparse^ is '?', and NonGreedyOp
    // when the quantifier has to be compiled as non-greedy.
    procedure DetectNonGreedy; // ###0.940
    begin
      NonGreedyCh := (regparse + 1)^ = '?';
      NonGreedyOp := NonGreedyCh or not fCompModifiers.G;
    end;

    // Wraps a non-simple atom as OP_LOOPENTRY atom OP_LOOP[NG](min, max, back).
    procedure EmitComplexBraces(ABracesMin, ABracesMax: TREBracesArg; ANonGreedyOp: boolean); // ###0.940
    {$IFDEF ComplexBraces}
    var
      BackOff: TRENextOff;
    {$ENDIF}
    begin
      {$IFNDEF ComplexBraces}
      Error(reeComplexBracesNotImplemented);
      {$ELSE}
      if ANonGreedyOp then
        TheOp := OP_LOOPNG
      else
        TheOp := OP_LOOP;
      InsertOperator(OP_LOOPENTRY, Result, REOpSz + RENextOffSz);
      NextNode := EmitNode(TheOp);
      if regcode = @regdummy then
        // size-counting pass: only reserve room for the arguments
        Inc(regsize, REBracesArgSz * 2 + RENextOffSz)
      else
      begin
        // offset leading back to the atom that follows OP_LOOPENTRY
        BackOff := (Result + REOpSz + RENextOffSz) - (regcode - REOpSz - RENextOffSz);
        PREBracesArg(AlignToInt(regcode))^ := ABracesMin;
        Inc(regcode, REBracesArgSz);
        PREBracesArg(AlignToInt(regcode))^ := ABracesMax;
        Inc(regcode, REBracesArgSz);
        PRENextOff(AlignToPtr(regcode))^ := BackOff;
        Inc(regcode, RENextOffSz);
        {$IFDEF DebugSynRegExpr}
        if regcode - programm > regsize then
          raise Exception.Create
            ('TRegExpr.ParsePiece.EmitComplexBraces buffer overrun');
        {$ENDIF}
      end;
      Tail(Result, NextNode); // OP_LOOPENTRY -> OP_LOOP
      if regcode <> @regdummy then
        Tail(Result + REOpSz + RENextOffSz, NextNode); // Atom -> OP_LOOP
      {$ENDIF}
    end;

    // Puts OP_BRACES[NG](min, max) in front of a simple atom.
    procedure EmitSimpleBraces(ABracesMin, ABracesMax: TREBracesArg; ANonGreedyOp: boolean); // ###0.940
    var
      ArgPos: PRegExprChar;
    begin
      if ANonGreedyOp then
        TheOp := OP_BRACESNG
      else
        TheOp := OP_BRACES;
      InsertOperator(TheOp, Result, REOpSz + RENextOffSz + REBracesArgSz * 2);
      if regcode <> @regdummy then
      begin
        ArgPos := Result + REOpSz + RENextOffSz;
        PREBracesArg(AlignToInt(ArgPos))^ := ABracesMin;
        PREBracesArg(AlignToInt(ArgPos + REBracesArgSz))^ := ABracesMax;
      end;
    end;

  begin
    AtomFlags := 0;
    Result := ParseAtom(AtomFlags);
    if Result = nil then
      Exit;

    // No quantifier: the piece is just the atom.
    op := regparse^;
    if (op <> '*') and (op <> '+') and (op <> '?') and (op <> '{') then
    begin
      flagp := AtomFlags;
      Exit;
    end;

    // Only '?' may be applied to an atom that can match the empty string.
    if ((AtomFlags and flag_HasWidth) = 0) and (op <> '?') then
    begin
      Error(reePlusStarOperandCouldBeEmpty);
      Exit;
    end;

    IsSimpleAtom := (AtomFlags and flag_Simple) <> 0;

    case op of
      '*':
        begin
          flagp := flag_Worst or flag_SpecStart;
          DetectNonGreedy;
          if IsSimpleAtom then
          begin
            if NonGreedyOp then
              TheOp := OP_STARNG
            else
              TheOp := OP_STAR;
            InsertOperator(TheOp, Result, REOpSz + RENextOffSz);
          end
          else if NonGreedyOp then
            EmitComplexBraces(0, MaxBracesArg, NonGreedyOp)
          else
          begin // Emit x* as (x&|), where & means "self".
            InsertOperator(OP_BRANCH, Result, REOpSz + RENextOffSz); // Either x
            OpTail(Result, EmitNode(OP_BACK)); // and loop
            OpTail(Result, Result); // back
            Tail(Result, EmitNode(OP_BRANCH)); // or
            Tail(Result, EmitNode(OP_NOTHING)); // nil.
          end;
        end; { of case '*' }
      '+':
        begin
          flagp := flag_Worst or flag_SpecStart or flag_HasWidth;
          DetectNonGreedy;
          if IsSimpleAtom then
          begin
            if NonGreedyOp then
              TheOp := OP_PLUSNG
            else
              TheOp := OP_PLUS;
            InsertOperator(TheOp, Result, REOpSz + RENextOffSz);
          end
          else if NonGreedyOp then
            EmitComplexBraces(1, MaxBracesArg, NonGreedyOp)
          else
          begin // Emit x+ as x(&|), where & means "self".
            NextNode := EmitNode(OP_BRANCH); // Either
            Tail(Result, NextNode);
            Tail(EmitNode(OP_BACK), Result); // loop back
            Tail(NextNode, EmitNode(OP_BRANCH)); // or
            Tail(Result, EmitNode(OP_NOTHING)); // nil.
          end;
        end; { of case '+' }
      '?':
        begin
          flagp := flag_Worst;
          DetectNonGreedy;
          if NonGreedyOp then
          begin // x?? is emitted as x{0,1}?
            if IsSimpleAtom then
              EmitSimpleBraces(0, 1, NonGreedyOp)
            else
              EmitComplexBraces(0, 1, NonGreedyOp);
          end
          else
          begin // greedy '?'
            InsertOperator(OP_BRANCH, Result, REOpSz + RENextOffSz); // Either x
            Tail(Result, EmitNode(OP_BRANCH)); // or
            NextNode := EmitNode(OP_NOTHING); // nil.
            Tail(Result, NextNode);
            OpTail(Result, NextNode);
          end;
        end; { of case '?' }
      '{':
        begin
          // <min> is mandatory
          Inc(regparse);
          NumStart := regparse;
          while IsDigitChar(regparse^) do
            Inc(regparse);
          if (NumStart = regparse) or
            not ((regparse^ = '}') or (regparse^ = ',')) then
          begin
            Error(reeIncorrectBraces);
            Exit;
          end;
          BracesMin := ParseNumber(NumStart, regparse - 1);

          if regparse^ <> ',' then
            BracesMax := BracesMin // {n} == {n,n}
          else
          begin
            // <max> is optional; when absent the upper bound is MaxBracesArg
            Inc(regparse);
            NumStart := regparse;
            while IsDigitChar(regparse^) do
              Inc(regparse);
            if regparse^ <> '}' then
            begin
              Error(reeIncorrectBraces);
              Exit;
            end;
            if NumStart = regparse then
              BracesMax := MaxBracesArg
            else
              BracesMax := ParseNumber(NumStart, regparse - 1);
          end;

          if BracesMin > BracesMax then
          begin
            Error(reeBracesMinParamGreaterMax);
            Exit;
          end;

          // NOTE(review): when BracesMin = 0 flagp is not reset here and
          // keeps whatever the caller passed in - confirm this is intended.
          if BracesMin > 0 then
            flagp := flag_Worst;
          if BracesMax > 0 then
            flagp := flagp or flag_HasWidth or flag_SpecStart;

          DetectNonGreedy; // regparse is on '}' here
          if IsSimpleAtom then
            EmitSimpleBraces(BracesMin, BracesMax, NonGreedyOp)
          else
            EmitComplexBraces(BracesMin, BracesMax, NonGreedyOp);
        end; { of case '{' }
      // no other value of op can get here
    end; { of case op }

    if NonGreedyCh then // ###0.940
      Inc(regparse); // skip the extra '?'
    Inc(regparse); // skip the quantifier char itself ('}' for braces)

    // A quantifier directly following another quantifier is not allowed.
    op := regparse^;
    if (op = '*') or (op = '+') or (op = '?') or (op = '{') then
      Error(reeNestedSQP);
  end; { of function TRegExpr.ParsePiece }
  function TRegExpr.HexDig(Ch: REChar): integer;
  // Returns the numeric value (0..15) of the hexadecimal digit Ch; letters
  // are accepted in either case. For any other char the result is 0 and
  // Error(reeBadHexDigit) is called.
  begin
    if (Ch >= '0') and (Ch <= '9') then
      Result := Ord(Ch) - Ord('0')
    else if (Ch >= 'a') and (Ch <= 'f') then
      Result := Ord(Ch) - Ord('a') + 10
    else if (Ch >= 'A') and (Ch <= 'F') then
      Result := Ord(Ch) - Ord('A') + 10
    else
    begin
      Result := 0;
      Error(reeBadHexDigit);
    end;
  end;
  function TRegExpr.UnQuoteChar(var APtr: PRegExprChar): REChar;
  // Decodes the escape whose first char after the backslash is at APtr^ and
  // returns the character it stands for.
  //
  //   \t \n \r \f \a \e  - TAB, LF, CR, FF, BEL, ESC
  //   \cX                - control code of letter X (\ca and \cA give #1, ...)
  //   \xHH               - char with the two-digit hex code HH
  //   \x{H...}           - char with the hex code inside the braces
  //   anything else      - the char itself
  //
  // APtr is advanced over the extra chars consumed by \c and \x and is left
  // on the last char of the escape.
  // Problems are reported through Error (reeNoLetterAfterBSlashC,
  // reeNoHexCodeAfterBSlashX, reeHexCodeAfterBSlashXTooBig, or reeBadHexDigit
  // raised from HexDig); the result is #0 for the errors detected here.
  var
    Ch: REChar;
  begin
    case APtr^ of
      't':
        Result := #$9; // \t => tab (HT/TAB)
      'n':
        Result := #$a; // \n => newline (NL)
      'r':
        Result := #$d; // \r => carriage return (CR)
      'f':
        Result := #$c; // \f => form feed (FF)
      'a':
        Result := #$7; // \a => alarm (bell) (BEL)
      'e':
        Result := #$1b; // \e => escape (ESC)
      'c':
        begin // \cK => code for Ctrl+K
          // Defined result on the error paths, as in the \x branch.
          Result := #0;
          Inc(APtr);
          if APtr >= fRegexEnd then
          begin
            Error(reeNoLetterAfterBSlashC);
            // Do not read APtr^ at or beyond the end of the pattern.
            Exit;
          end;
          Ch := APtr^;
          case Ch of
            'a' .. 'z':
              Result := REChar(Ord(Ch) - Ord('a') + 1);
            'A' .. 'Z':
              Result := REChar(Ord(Ch) - Ord('A') + 1);
          else
            Error(reeNoLetterAfterBSlashC);
          end;
        end;
      'x':
        begin // \x: hex char
          Result := #0;
          Inc(APtr);
          if APtr >= fRegexEnd then
          begin
            Error(reeNoHexCodeAfterBSlashX);
            Exit;
          end;
          if APtr^ = '{' then
          begin // \x{nnnn} //###0.936
            repeat
              Inc(APtr);
              if APtr >= fRegexEnd then
              begin
                Error(reeNoHexCodeAfterBSlashX);
                Exit;
              end;
              if APtr^ <> '}' then
              begin
                // The top 4 bits are already in use: one more digit would
                // not fit into REChar.
                if (Ord(Result) ShR (SizeOf(REChar) * 8 - 4)) and $F <> 0 then
                begin
                  Error(reeHexCodeAfterBSlashXTooBig);
                  Exit;
                end;
                Result := REChar((Ord(Result) ShL 4) or HexDig(APtr^));
                // HexDig will cause Error if bad hex digit found
              end
              else
                Break;
            until False;
          end
          else
          begin
            Result := REChar(HexDig(APtr^));
            // HexDig will cause Error if bad hex digit found
            Inc(APtr);
            if APtr >= fRegexEnd then
            begin
              Error(reeNoHexCodeAfterBSlashX);
              Exit;
            end;
            Result := REChar((Ord(Result) ShL 4) or HexDig(APtr^));
            // HexDig will cause Error if bad hex digit found
          end;
        end;
    else
      Result := APtr^;
    end;
  end;
  2585. function TRegExpr.ParseAtom(var flagp: integer): PRegExprChar;
  2586. // the lowest level
  2587. // Optimization: gobbles an entire sequence of ordinary characters so that
  2588. // it can turn them into a single node, which is smaller to store and
  2589. // faster to run. Backslashed characters are exceptions, each becoming a
  2590. // separate node; the code is simpler that way and it's not worth fixing.
  2591. var
  2592. ret: PRegExprChar;
  2593. RangeBeg, RangeEnd: REChar;
  2594. CanBeRange: boolean;
  2595. AddrOfLen: PLongInt;
  2596. procedure EmitExactly(Ch: REChar); {$IFDEF InlineFuncs}inline;{$ENDIF}
  2597. begin
  2598. if fCompModifiers.I then
  2599. ret := EmitNode(OP_EXACTLYCI)
  2600. else
  2601. ret := EmitNode(OP_EXACTLY);
  2602. EmitInt(1);
  2603. EmitC(Ch);
  2604. flagp := flagp or flag_HasWidth or flag_Simple;
  2605. end;
  2606. procedure EmitRangeChar(Ch: REChar; AStartOfRange: boolean); {$IFDEF InlineFuncs}inline;{$ENDIF}
  2607. begin
  2608. CanBeRange := AStartOfRange;
  2609. if fCompModifiers.I then
  2610. Ch := _UpperCase(Ch);
  2611. if AStartOfRange then
  2612. begin
  2613. AddrOfLen := nil;
  2614. RangeBeg := Ch;
  2615. end
  2616. else
  2617. begin
  2618. if AddrOfLen = nil then
  2619. begin
  2620. EmitC(OpKind_Char);
  2621. Pointer(AddrOfLen) := regcode;
  2622. EmitInt(0);
  2623. end;
  2624. Inc(AddrOfLen^);
  2625. EmitC(Ch);
  2626. end;
  2627. end;
  2628. procedure EmitRangePacked(ch1, ch2: REChar); {$IFDEF InlineFuncs}inline;{$ENDIF}
  2629. var
  2630. ChkIndex: integer;
  2631. begin
  2632. AddrOfLen := nil;
  2633. CanBeRange := False;
  2634. if fCompModifiers.I then
  2635. begin
  2636. ch1 := _UpperCase(ch1);
  2637. ch2 := _UpperCase(ch2);
  2638. end;
  2639. for ChkIndex := Low(CharCheckerInfos) to High(CharCheckerInfos) do
  2640. if (CharCheckerInfos[ChkIndex].CharBegin = ch1) and
  2641. (CharCheckerInfos[ChkIndex].CharEnd = ch2) then
  2642. begin
  2643. EmitC(OpKind_MetaClass);
  2644. EmitC(REChar(CharCheckerInfos[ChkIndex].CheckerIndex));
  2645. Exit;
  2646. end;
  2647. EmitC(OpKind_Range);
  2648. EmitC(ch1);
  2649. EmitC(ch2);
  2650. end;
  2651. var
  2652. flags: integer;
  2653. Len: integer;
  2654. SavedPtr: PRegExprChar;
  2655. EnderChar, TempChar: REChar;
  2656. DashForRange: Boolean;
  2657. begin
  2658. Result := nil;
  2659. flags := 0;
  2660. flagp := flag_Worst;
  2661. AddrOfLen := nil;
  2662. Inc(regparse);
  2663. case (regparse - 1)^ of
  2664. '^':
  2665. if not fCompModifiers.M or
  2666. ((fLineSeparators = '') and not fLinePairedSeparatorAssigned) then
  2667. ret := EmitNode(OP_BOL)
  2668. else
  2669. ret := EmitNode(OP_BOLML);
  2670. '$':
  2671. if not fCompModifiers.M or
  2672. ((fLineSeparators = '') and not fLinePairedSeparatorAssigned) then
  2673. ret := EmitNode(OP_EOL)
  2674. else
  2675. ret := EmitNode(OP_EOLML);
  2676. '.':
  2677. if fCompModifiers.S then
  2678. begin
  2679. ret := EmitNode(OP_ANY);
  2680. flagp := flagp or flag_HasWidth or flag_Simple;
  2681. end
  2682. else
  2683. begin // not /s, so emit [^:LineSeparators:]
  2684. ret := EmitNode(OP_ANYML);
  2685. flagp := flagp or flag_HasWidth; // not so simple ;)
  2686. end;
  2687. '[':
  2688. begin
  2689. if regparse^ = '^' then
  2690. begin // Complement of range.
  2691. if fCompModifiers.I then
  2692. ret := EmitNode(OP_ANYBUTCI)
  2693. else
  2694. ret := EmitNode(OP_ANYBUT);
  2695. Inc(regparse);
  2696. end
  2697. else if fCompModifiers.I then
  2698. ret := EmitNode(OP_ANYOFCI)
  2699. else
  2700. ret := EmitNode(OP_ANYOF);
  2701. CanBeRange := False;
  2702. if regparse^ = ']' then
  2703. begin
  2704. // first ']' inside [] treated as simple char, no need to check '['
  2705. EmitRangeChar(regparse^, (regparse + 1)^ = '-');
  2706. Inc(regparse);
  2707. end;
  2708. while (regparse < fRegexEnd) and (regparse^ <> ']') do
  2709. begin
  2710. // last '-' inside [] treated as simple dash
  2711. if (regparse^ = '-') and
  2712. ((regparse + 1) < fRegexEnd) and
  2713. ((regparse + 1)^ = ']') then
  2714. begin
  2715. EmitRangeChar('-', False);
  2716. Inc(regparse);
  2717. Break;
  2718. end;
  2719. // char '-' which (maybe) makes a range
  2720. if (regparse^ = '-') and ((regparse + 1) < fRegexEnd) and CanBeRange then
  2721. begin
  2722. Inc(regparse);
  2723. RangeEnd := regparse^;
  2724. if RangeEnd = EscChar then
  2725. begin
  2726. if _IsMetaChar((regparse + 1)^) then
  2727. begin
  2728. Error(reeMetaCharAfterMinusInRange);
  2729. Exit;
  2730. end;
  2731. Inc(regparse);
  2732. RangeEnd := UnQuoteChar(regparse);
  2733. end;
  2734. // special handling for Russian range a-YA, add 2 ranges: a-ya and A-YA
  2735. if fCompModifiers.R and
  2736. (RangeBeg = RusRangeLoLow) and (RangeEnd = RusRangeHiHigh) then
  2737. begin
  2738. EmitRangePacked(RusRangeLoLow, RusRangeLoHigh);
  2739. EmitRangePacked(RusRangeHiLow, RusRangeHiHigh);
  2740. end
  2741. else
  2742. begin // standard r.e. handling
  2743. if RangeBeg > RangeEnd then
  2744. begin
  2745. Error(reeInvalidRange);
  2746. Exit;
  2747. end;
  2748. EmitRangePacked(RangeBeg, RangeEnd);
  2749. end;
  2750. Inc(regparse);
  2751. end
  2752. else
  2753. begin
  2754. if regparse^ = EscChar then
  2755. begin
  2756. Inc(regparse);
  2757. if regparse >= fRegexEnd then
  2758. begin
  2759. Error(reeParseAtomTrailingBackSlash);
  2760. Exit;
  2761. end;
  2762. if _IsMetaChar(regparse^) then
  2763. begin
  2764. AddrOfLen := nil;
  2765. CanBeRange := False;
  2766. EmitC(OpKind_MetaClass);
  2767. case regparse^ of
  2768. 'w':
  2769. EmitC(REChar(CheckerIndex_Word));
  2770. 'W':
  2771. EmitC(REChar(CheckerIndex_NotWord));
  2772. 's':
  2773. EmitC(REChar(CheckerIndex_Space));
  2774. 'S':
  2775. EmitC(REChar(CheckerIndex_NotSpace));
  2776. 'd':
  2777. EmitC(REChar(CheckerIndex_Digit));
  2778. 'D':
  2779. EmitC(REChar(CheckerIndex_NotDigit));
  2780. 'v':
  2781. EmitC(REChar(CheckerIndex_VertSep));
  2782. 'V':
  2783. EmitC(REChar(CheckerIndex_NotVertSep));
  2784. 'h':
  2785. EmitC(REChar(CheckerIndex_HorzSep));
  2786. 'H':
  2787. EmitC(REChar(CheckerIndex_NotHorzSep));
  2788. else
  2789. Error(reeBadOpcodeInCharClass);
  2790. end;
  2791. end
  2792. else
  2793. begin
  2794. TempChar := UnQuoteChar(regparse);
  2795. // False if '-' is last char in []
  2796. DashForRange :=
  2797. (regparse + 2 < fRegexEnd) and
  2798. ((regparse + 1)^ = '-') and
  2799. ((regparse + 2)^ <> ']');
  2800. EmitRangeChar(TempChar, DashForRange);
  2801. end;
  2802. end
  2803. else
  2804. begin
  2805. // False if '-' is last char in []
  2806. DashForRange :=
  2807. (regparse + 2 < fRegexEnd) and
  2808. ((regparse + 1)^ = '-') and
  2809. ((regparse + 2)^ <> ']');
  2810. EmitRangeChar(regparse^, DashForRange);
  2811. end;
  2812. Inc(regparse);
  2813. end;
  2814. end; { of while }
  2815. AddrOfLen := nil;
  2816. CanBeRange := False;
  2817. EmitC(OpKind_End);
  2818. if regparse^ <> ']' then
  2819. begin
  2820. Error(reeUnmatchedSqBrackets);
  2821. Exit;
  2822. end;
  2823. Inc(regparse);
  2824. flagp := flagp or flag_HasWidth or flag_Simple;
  2825. end;
  2826. '(':
  2827. begin
  2828. if regparse^ = '?' then
  2829. begin
  2830. // check for non-capturing group: (?:text)
  2831. if (regparse + 1)^ = ':' then
  2832. begin
  2833. Inc(regparse, 2);
  2834. ret := ParseReg(1, flags);
  2835. if ret = nil then
  2836. begin
  2837. Result := nil;
  2838. Exit;
  2839. end;
  2840. flagp := flagp or flags and (flag_HasWidth or flag_SpecStart);
  2841. end
  2842. else
  2843. // check for extended Perl syntax : (?..)
  2844. if (regparse + 1)^ = '#' then
  2845. begin // (?#comment)
  2846. Inc(regparse, 2); // find closing ')'
  2847. while (regparse < fRegexEnd) and (regparse^ <> ')') do
  2848. Inc(regparse);
  2849. if regparse^ <> ')' then
  2850. begin
  2851. Error(reeUnclosedComment);
  2852. Exit;
  2853. end;
  2854. Inc(regparse); // skip ')'
  2855. ret := EmitNode(OP_COMMENT); // comment
  2856. end
  2857. else
  2858. begin // modifiers ?
  2859. Inc(regparse); // skip '?'
  2860. SavedPtr := regparse;
  2861. while (regparse < fRegexEnd) and (regparse^ <> ')') do
  2862. Inc(regparse);
  2863. if (regparse^ <> ')') or
  2864. not ParseModifiers(SavedPtr, regparse - SavedPtr, fCompModifiers) then
  2865. begin
  2866. Error(reeUnrecognizedModifier);
  2867. Exit;
  2868. end;
  2869. Inc(regparse); // skip ')'
  2870. ret := EmitNode(OP_COMMENT); // comment
  2871. // Error (reeQPSBFollowsNothing);
  2872. // Exit;
  2873. end;
  2874. end
  2875. else
  2876. begin
  2877. // normal (capturing) group
  2878. if fSecondPass then
  2879. // must skip this block for one of passes, to not double groups count
  2880. if GrpCount < NSUBEXP - 1 then
  2881. begin
  2882. Inc(GrpCount);
  2883. GrpIndexes[GrpCount] := regnpar;
  2884. end;
  2885. ret := ParseReg(1, flags);
  2886. if ret = nil then
  2887. begin
  2888. Result := nil;
  2889. Exit;
  2890. end;
  2891. flagp := flagp or flags and (flag_HasWidth or flag_SpecStart);
  2892. end;
  2893. end;
  2894. '|', ')':
  2895. begin // Supposed to be caught earlier.
  2896. Error(reeInternalUrp);
  2897. Exit;
  2898. end;
  2899. '?', '+', '*':
  2900. begin
  2901. Error(reeQPSBFollowsNothing);
  2902. Exit;
  2903. end;
  2904. EscChar:
  2905. begin
  2906. if regparse >= fRegexEnd then
  2907. begin
  2908. Error(reeTrailingBackSlash);
  2909. Exit;
  2910. end;
  2911. case regparse^ of // r.e.extensions
  2912. 'b':
  2913. ret := EmitNode(OP_BOUND); // ###0.943
  2914. 'B':
  2915. ret := EmitNode(OP_NOTBOUND); // ###0.943
  2916. 'A':
  2917. ret := EmitNode(OP_BOL); // ###0.941
  2918. 'Z':
  2919. ret := EmitNode(OP_EOL); // ###0.941
  2920. 'd':
  2921. begin // r.e.extension - any digit ('0' .. '9')
  2922. ret := EmitNode(OP_ANYDIGIT);
  2923. flagp := flagp or flag_HasWidth or flag_Simple;
  2924. end;
  2925. 'D':
  2926. begin // r.e.extension - not digit ('0' .. '9')
  2927. ret := EmitNode(OP_NOTDIGIT);
  2928. flagp := flagp or flag_HasWidth or flag_Simple;
  2929. end;
  2930. 's':
  2931. begin // r.e.extension - any space char
  2932. ret := EmitNode(OP_ANYSPACE);
  2933. flagp := flagp or flag_HasWidth or flag_Simple;
  2934. end;
  2935. 'S':
  2936. begin // r.e.extension - not space char
  2937. ret := EmitNode(OP_NOTSPACE);
  2938. flagp := flagp or flag_HasWidth or flag_Simple;
  2939. end;
  2940. 'w':
  2941. begin // r.e.extension - any english char / digit / '_'
  2942. ret := EmitNode(OP_ANYLETTER);
  2943. flagp := flagp or flag_HasWidth or flag_Simple;
  2944. end;
  2945. 'W':
  2946. begin // r.e.extension - not english char / digit / '_'
  2947. ret := EmitNode(OP_NOTLETTER);
  2948. flagp := flagp or flag_HasWidth or flag_Simple;
  2949. end;
  2950. 'v':
  2951. begin
  2952. ret := EmitNode(OP_ANYVERTSEP);
  2953. flagp := flagp or flag_HasWidth or flag_Simple;
  2954. end;
  2955. 'V':
  2956. begin
  2957. ret := EmitNode(OP_NOTVERTSEP);
  2958. flagp := flagp or flag_HasWidth or flag_Simple;
  2959. end;
  2960. 'h':
  2961. begin
  2962. ret := EmitNode(OP_ANYHORZSEP);
  2963. flagp := flagp or flag_HasWidth or flag_Simple;
  2964. end;
  2965. 'H':
  2966. begin
  2967. ret := EmitNode(OP_NOTHORZSEP);
  2968. flagp := flagp or flag_HasWidth or flag_Simple;
  2969. end;
  2970. '1' .. '9':
  2971. begin // ###0.936
  2972. if fCompModifiers.I then
  2973. ret := EmitNode(OP_BSUBEXPCI)
  2974. else
  2975. ret := EmitNode(OP_BSUBEXP);
  2976. EmitC(REChar(Ord(regparse^) - Ord('0')));
  2977. flagp := flagp or flag_HasWidth or flag_Simple;
  2978. end;
  2979. else
  2980. EmitExactly(UnQuoteChar(regparse));
  2981. end; { of case }
  2982. Inc(regparse);
  2983. end;
  2984. else
  2985. begin
  2986. Dec(regparse);
  2987. if fCompModifiers.X and // check for eXtended syntax
  2988. ((regparse^ = '#') or IsIgnoredChar(regparse^)) then
  2989. begin // ###0.941 \x
  2990. if regparse^ = '#' then
  2991. begin // Skip eXtended comment
  2992. // find comment terminator (group of \n and/or \r)
  2993. while (regparse < fRegexEnd) and (regparse^ <> #$d) and
  2994. (regparse^ <> #$a) do
  2995. Inc(regparse);
  2996. while (regparse^ = #$d) or (regparse^ = #$a)
  2997. // skip comment terminator
  2998. do
  2999. Inc(regparse);
  3000. // attempt to support different type of line separators
  3001. end
  3002. else
  3003. begin // Skip the blanks!
  3004. while IsIgnoredChar(regparse^) do
  3005. Inc(regparse);
  3006. end;
  3007. ret := EmitNode(OP_COMMENT); // comment
  3008. end
  3009. else
  3010. begin
  3011. Len := FindSkippedMetaLen(regparse, fRegexEnd);
  3012. if Len <= 0 then
  3013. if regparse^ <> '{' then
  3014. begin
  3015. Error(reeRarseAtomInternalDisaster);
  3016. Exit;
  3017. end
  3018. else
  3019. Len := FindSkippedMetaLen(regparse + 1, fRegexEnd) + 1;
  3020. // bad {n,m} - compile as EXACTLY
  3021. EnderChar := (regparse + Len)^;
  3022. if (Len > 1) and ((EnderChar = '*') or (EnderChar = '+') or (EnderChar = '?') or (EnderChar = '{')) then
  3023. Dec(Len); // back off clear of ?+*{ operand.
  3024. flagp := flagp or flag_HasWidth;
  3025. if Len = 1 then
  3026. flagp := flagp or flag_Simple;
  3027. if fCompModifiers.I then
  3028. ret := EmitNode(OP_EXACTLYCI)
  3029. else
  3030. ret := EmitNode(OP_EXACTLY);
  3031. EmitInt(0);
  3032. while (Len > 0) and ((not fCompModifiers.X) or (regparse^ <> '#')) do
  3033. begin
  3034. if not fCompModifiers.X or not IsIgnoredChar(regparse^) then
  3035. begin
  3036. EmitC(regparse^);
  3037. if regcode <> @regdummy then
  3038. Inc(regExactlyLen^);
  3039. end;
  3040. Inc(regparse);
  3041. Dec(Len);
  3042. end;
  3043. end; { of if not comment }
  3044. end; { of case else }
  3045. end; { of case }
  3046. Result := ret;
  3047. end; { of function TRegExpr.ParseAtom
  3048. -------------------------------------------------------------- }
  { Reports how far the parser has advanced into the expression being
    compiled: the offset of regparse from regexpBegin.
    Returns 0 when either pointer is nil. }
  function TRegExpr.GetCompilerErrorPos: PtrInt;
  begin
    if (regexpBegin <> nil) and (regparse <> nil) then
      Result := regparse - regexpBegin
    else
      Result := 0; // not in compiling mode ?
  end; { of function TRegExpr.GetCompilerErrorPos
  -------------------------------------------------------------- }
  3057. { ============================================================= }
  3058. { ===================== Matching section ====================== }
  3059. { ============================================================= }
  function TRegExpr.regrepeat(p: PRegExprChar; AMax: integer): integer;
  // Repeatedly matches the simple node p starting at reginput and reports
  // how many repetitions succeeded (never more than AMax).
  // On return reginput points just past the consumed input.
  //
  // For single-character nodes the count equals the number of characters
  // consumed. For back-references (OP_BSUBEXP / OP_BSUBEXPCI) the count is
  // the number of complete copies of the captured text that were found.
  // NOTE(review): for a back-reference whose capture is empty, the copy loop
  // below consumes nothing and simply counts up to AMax - confirm callers
  // never pass a huge AMax in that situation.
  var
    cur: PRegExprChar;     // current read position in the input
    stop: PRegExprChar;    // single-char loops must stay below this address
    operand: PRegExprChar; // operand area of node p
    budget: PtrInt;        // PtrInt, gets diff of 2 pointers
    altCase: REChar;       // other-case form of the literal char
    capBeg, capEnd: PRegExprChar; // bounds of the referenced capture
    slot: integer;         // index into startp / endp
    foldCase: boolean;     // True for the case-insensitive back-reference
  begin
    Result := 0;
    cur := reginput;
    operand := p + REOpSz + RENextOffSz; // OPERAND

    // Never consume more than what is left in the input, nor more than AMax.
    budget := fInputEnd - cur;
    if AMax < budget then
      budget := AMax;
    stop := cur + budget;

    case PREOp(p)^ of
      OP_ANY:
        // note - OP_ANYML cannot be proceeded in regrepeat because can skip
        // more than one char at once
        cur := stop;

      OP_EXACTLY:
        begin // in opnd can be only ONE char !!!
          // The length prefix stored in the operand is intentionally not used
          // to limit the count, see
          // https://github.com/andgineer/TRegExpr/issues/145
          Inc(operand, RENumberSz);
          while (cur < stop) and (operand^ = cur^) do
            Inc(cur);
        end;

      OP_EXACTLYCI:
        begin // in opnd can be only ONE char !!!
          // The length prefix stored in the operand is intentionally not used
          // to limit the count, see
          // https://github.com/andgineer/TRegExpr/issues/145
          Inc(operand, RENumberSz);
          // First run: exact matches only, avoids an unneeded InvertCase
          while (cur < stop) and (operand^ = cur^) do
            Inc(cur);
          if cur < stop then
          begin
            altCase := InvertCase(operand^); // computed once
            while (cur < stop) and ((operand^ = cur^) or (altCase = cur^)) do
              Inc(cur);
          end;
        end;

      OP_BSUBEXP, OP_BSUBEXPCI:
        begin
          foldCase := PREOp(p)^ = OP_BSUBEXPCI;
          slot := GrpIndexes[Ord(operand^)];
          if slot < 0 then
            Exit;
          capBeg := startp[slot];
          if capBeg = nil then
            Exit;
          capEnd := endp[slot];
          if capEnd = nil then
            Exit;
          repeat
            // try to match one more full copy of the captured text
            operand := capBeg;
            while operand < capEnd do
            begin
              if cur >= fInputEnd then
                Exit;
              if (cur^ <> operand^) and
                not (foldCase and (cur^ = InvertCase(operand^))) then
                Exit;
              Inc(cur);
              Inc(operand);
            end;
            Inc(Result);
            reginput := cur; // commit the completed copy
          until Result >= AMax;
          Exit; // Result and reginput are already final here
        end;

      OP_ANYDIGIT:
        while (cur < stop) and IsDigitChar(cur^) do
          Inc(cur);
      OP_NOTDIGIT:
        while (cur < stop) and not IsDigitChar(cur^) do
          Inc(cur);

      OP_ANYLETTER:
        while (cur < stop) and IsWordChar(cur^) do
          Inc(cur);
      OP_NOTLETTER:
        while (cur < stop) and not IsWordChar(cur^) do
          Inc(cur);

      OP_ANYSPACE:
        while (cur < stop) and IsSpaceChar(cur^) do
          Inc(cur);
      OP_NOTSPACE:
        while (cur < stop) and not IsSpaceChar(cur^) do
          Inc(cur);

      OP_ANYVERTSEP:
        while (cur < stop) and IsLineSeparator(cur^) do
          Inc(cur);
      OP_NOTVERTSEP:
        while (cur < stop) and not IsLineSeparator(cur^) do
          Inc(cur);

      OP_ANYHORZSEP:
        while (cur < stop) and IsHorzSeparator(cur^) do
          Inc(cur);
      OP_NOTHORZSEP:
        while (cur < stop) and not IsHorzSeparator(cur^) do
          Inc(cur);

      OP_ANYOF:
        while (cur < stop) and FindInCharClass(operand, cur^, False) do
          Inc(cur);
      OP_ANYBUT:
        while (cur < stop) and not FindInCharClass(operand, cur^, False) do
          Inc(cur);
      OP_ANYOFCI:
        while (cur < stop) and FindInCharClass(operand, cur^, True) do
          Inc(cur);
      OP_ANYBUTCI:
        while (cur < stop) and not FindInCharClass(operand, cur^, True) do
          Inc(cur);

      else
        begin // Oh dear. Called inappropriately.
          Result := 0; // Best compromise.
          Error(reeRegRepeatCalledInappropriately);
          Exit;
        end;
    end; { of case }

    // Single-character nodes: one repetition per consumed character.
    Result := cur - reginput;
    reginput := cur;
  end; { of function TRegExpr.regrepeat
  -------------------------------------------------------------- }
  function TRegExpr.regnext(p: PRegExprChar): PRegExprChar;
  // Digs the "next" pointer out of node p.
  // Returns nil for the dummy node (@regdummy) and for a node whose stored
  // next-offset is 0; otherwise returns p advanced by that offset.
  var
    delta: TRENextOff;
  begin
    Result := nil;
    if p = @regdummy then
      Exit;
    delta := PRENextOff(AlignToPtr(p + REOpSz))^; // ###0.933 inlined NEXT
    if delta <> 0 then
      Result := p + delta;
  end; { of function TRegExpr.regnext
  -------------------------------------------------------------- }
  function TRegExpr.MatchPrim(prog: PRegExprChar): boolean;
  // recursively matching routine
  // Conceptually the strategy is simple: check to see whether the current
  // node matches, call self recursively to see whether the rest matches,
  // and then act accordingly. In practice we make some effort to avoid
  // recursion, in particular by going through "ordinary" nodes (that don't
  // need to know whether the rest of the match failed) by a loop instead of
  // by recursion.
  //
  // prog    - first node of the compiled program to execute.
  // Result  - True when OP_EEND is reached, i.e. the program matched starting
  //           at the value reginput had on entry; reginput then points just
  //           past the matched text. False when some node did not match.
  // Errors  - Error(...) is called for loop-stack problems, for an unknown
  //           opcode, and when the node chain ends without OP_EEND.
  var
    scan: PRegExprChar; // Current node.
    next: PRegExprChar; // Next node.
    Len: PtrInt;
    opnd: PRegExprChar;
    no: integer;
    save: PRegExprChar;
    nextch: REChar;
    BracesMin, Bracesmax: integer;
    // we use integer instead of TREBracesArg for better support */+
    {$IFDEF ComplexBraces}
    SavedLoopStack: TRegExprLoopStack; // :(( very bad for recursion
    SavedLoopStackIdx: integer; // ###0.925
    {$ENDIF}
    bound1, bound2: boolean;
  begin
    Result := False;
    {$IFDEF ComplexBraces}
    SavedLoopStack:=Default(TRegExprLoopStack);
    SavedLoopStackIdx:=0;
    {$ENDIF}
    scan := prog;
    while scan <> nil do
    begin
      Len := PRENextOff(AlignToPtr(scan + 1))^; // ###0.932 inlined regnext
      if Len = 0 then
        next := nil
      else
        next := scan + Len;
      case scan^ of
        OP_NOTBOUND,
        OP_BOUND:
          begin
            // bound1 / bound2: is there a non-word char (or the input edge)
            // immediately before / at the current position
            bound1 := (reginput = fInputStart) or not IsWordChar((reginput - 1)^);
            bound2 := (reginput = fInputEnd) or not IsWordChar(reginput^);
            if (scan^ = OP_BOUND) xor (bound1 <> bound2) then
              Exit;
          end;
        OP_BOL:
          begin
            if reginput <> fInputStart then
              Exit;
          end;
        OP_EOL:
          begin
            if reginput < fInputEnd then
              Exit;
          end;
        OP_BOLML:
          if reginput > fInputStart then
          begin
            nextch := (reginput - 1)^;
            if (nextch <> fLinePairedSeparatorTail) or
              ((reginput - 1) <= fInputStart) or
              ((reginput - 2)^ <> fLinePairedSeparatorHead) then
            begin
              if (nextch = fLinePairedSeparatorHead) and
                (reginput^ = fLinePairedSeparatorTail) then
                Exit; // don't stop between paired separator
              if not IsCustomLineSeparator(nextch) then
                Exit;
            end;
          end;
        OP_EOLML:
          if reginput < fInputEnd then
          begin
            nextch := reginput^;
            if (nextch <> fLinePairedSeparatorHead) or
              ((reginput + 1)^ <> fLinePairedSeparatorTail) then
            begin
              if (nextch = fLinePairedSeparatorTail) and (reginput > fInputStart)
                and ((reginput - 1)^ = fLinePairedSeparatorHead) then
                Exit; // don't stop between paired separator
              if not IsCustomLineSeparator(nextch) then
                Exit;
            end;
          end;
        OP_ANY:
          begin
            if reginput = fInputEnd then
              Exit;
            Inc(reginput);
          end;
        OP_ANYML:
          begin // ###0.941
            if (reginput = fInputEnd) or
              ((reginput^ = fLinePairedSeparatorHead) and
              ((reginput + 1)^ = fLinePairedSeparatorTail)) or
              IsCustomLineSeparator(reginput^)
            then
              Exit;
            Inc(reginput);
          end;
        OP_ANYDIGIT:
          begin
            if (reginput = fInputEnd) or not IsDigitChar(reginput^) then
              Exit;
            Inc(reginput);
          end;
        OP_NOTDIGIT:
          begin
            if (reginput = fInputEnd) or IsDigitChar(reginput^) then
              Exit;
            Inc(reginput);
          end;
        OP_ANYLETTER:
          begin
            if (reginput = fInputEnd) or not IsWordChar(reginput^) // ###0.943
            then
              Exit;
            Inc(reginput);
          end;
        OP_NOTLETTER:
          begin
            if (reginput = fInputEnd) or IsWordChar(reginput^) // ###0.943
            then
              Exit;
            Inc(reginput);
          end;
        OP_ANYSPACE:
          begin
            if (reginput = fInputEnd) or not IsSpaceChar(reginput^) // ###0.943
            then
              Exit;
            Inc(reginput);
          end;
        OP_NOTSPACE:
          begin
            if (reginput = fInputEnd) or IsSpaceChar(reginput^) // ###0.943
            then
              Exit;
            Inc(reginput);
          end;
        OP_ANYVERTSEP:
          begin
            if (reginput = fInputEnd) or not IsLineSeparator(reginput^) then
              Exit;
            Inc(reginput);
          end;
        OP_NOTVERTSEP:
          begin
            if (reginput = fInputEnd) or IsLineSeparator(reginput^) then
              Exit;
            Inc(reginput);
          end;
        OP_ANYHORZSEP:
          begin
            if (reginput = fInputEnd) or not IsHorzSeparator(reginput^) then
              Exit;
            Inc(reginput);
          end;
        OP_NOTHORZSEP:
          begin
            if (reginput = fInputEnd) or IsHorzSeparator(reginput^) then
              Exit;
            Inc(reginput);
          end;
        OP_EXACTLYCI:
          begin
            opnd := scan + REOpSz + RENextOffSz; // OPERAND
            Len := PLongInt(opnd)^;
            // Fewer than Len chars left: the literal cannot match. Checking
            // this first keeps every comparison below strictly inside the
            // input (all other single-char opcodes already test fInputEnd).
            if reginput + Len > fInputEnd then
              Exit;
            Inc(opnd, RENumberSz);
            // Inline the first character, for speed.
            if (opnd^ <> reginput^) and (InvertCase(opnd^) <> reginput^) then
              Exit;
            // ###0.929 begin
            no := Len;
            save := reginput;
            while no > 1 do
            begin
              Inc(save);
              Inc(opnd);
              if (opnd^ <> save^) and (InvertCase(opnd^) <> save^) then
                Exit;
              Dec(no);
            end;
            // ###0.929 end
            Inc(reginput, Len);
          end;
        OP_EXACTLY:
          begin
            opnd := scan + REOpSz + RENextOffSz; // OPERAND
            Len := PLongInt(opnd)^;
            // Fewer than Len chars left: the literal cannot match. Checking
            // this first keeps every comparison below strictly inside the
            // input (all other single-char opcodes already test fInputEnd).
            if reginput + Len > fInputEnd then
              Exit;
            Inc(opnd, RENumberSz);
            // Inline the first character, for speed.
            if opnd^ <> reginput^ then
              Exit;
            // ###0.929 begin
            no := Len;
            save := reginput;
            while no > 1 do
            begin
              Inc(save);
              Inc(opnd);
              if opnd^ <> save^ then
                Exit;
              Dec(no);
            end;
            // ###0.929 end
            Inc(reginput, Len);
          end;
        OP_BSUBEXP:
          begin // ###0.936
            // operand holds the group number; map it to the startp/endp slot
            no := Ord((scan + REOpSz + RENextOffSz)^);
            no := GrpIndexes[no];
            if no < 0 then
              Exit;
            if startp[no] = nil then
              Exit;
            if endp[no] = nil then
              Exit;
            save := reginput;
            opnd := startp[no];
            while opnd < endp[no] do
            begin
              if (save >= fInputEnd) or (save^ <> opnd^) then
                Exit;
              Inc(save);
              Inc(opnd);
            end;
            reginput := save;
          end;
        OP_BSUBEXPCI:
          begin // ###0.936
            // operand holds the group number; map it to the startp/endp slot
            no := Ord((scan + REOpSz + RENextOffSz)^);
            no := GrpIndexes[no];
            if no < 0 then
              Exit;
            if startp[no] = nil then
              Exit;
            if endp[no] = nil then
              Exit;
            save := reginput;
            opnd := startp[no];
            while opnd < endp[no] do
            begin
              if (save >= fInputEnd) or
                ((save^ <> opnd^) and (save^ <> InvertCase(opnd^))) then
                Exit;
              Inc(save);
              Inc(opnd);
            end;
            reginput := save;
          end;
        OP_ANYOF:
          begin
            if (reginput = fInputEnd) or
              not FindInCharClass(scan + REOpSz + RENextOffSz, reginput^, False) then
              Exit;
            Inc(reginput);
          end;
        OP_ANYBUT:
          begin
            if (reginput = fInputEnd) or
              FindInCharClass(scan + REOpSz + RENextOffSz, reginput^, False) then
              Exit;
            Inc(reginput);
          end;
        OP_ANYOFCI:
          begin
            if (reginput = fInputEnd) or
              not FindInCharClass(scan + REOpSz + RENextOffSz, reginput^, True) then
              Exit;
            Inc(reginput);
          end;
        OP_ANYBUTCI:
          begin
            if (reginput = fInputEnd) or
              FindInCharClass(scan + REOpSz + RENextOffSz, reginput^, True) then
              Exit;
            Inc(reginput);
          end;
        OP_NOTHING:
          ;
        OP_COMMENT:
          ;
        OP_BACK:
          ;
        Succ(OP_OPEN) .. TREOp(Ord(OP_OPEN) + NSUBEXP - 1):
          begin // ###0.929
            no := Ord(scan^) - Ord(OP_OPEN);
            // save := reginput;
            save := startp[no]; // ###0.936
            startp[no] := reginput; // ###0.936
            Result := MatchPrim(next);
            if not Result // ###0.936
            then
              startp[no] := save; // rest failed - restore previous group start
            // if Result and (startp [no] = nil)
            // then startp [no] := save;
            // Don't set startp if some later invocation of the same
            // parentheses already has.
            Exit;
          end;
        Succ(OP_CLOSE) .. TREOp(Ord(OP_CLOSE) + NSUBEXP - 1):
          begin // ###0.929
            no := Ord(scan^) - Ord(OP_CLOSE);
            // save := reginput;
            save := endp[no]; // ###0.936
            endp[no] := reginput; // ###0.936
            Result := MatchPrim(next);
            if not Result // ###0.936
            then
              endp[no] := save; // rest failed - restore previous group end
            // if Result and (endp [no] = nil)
            // then endp [no] := save;
            // Don't set endp if some later invocation of the same
            // parentheses already has.
            Exit;
          end;
        OP_BRANCH:
          begin
            if (next^ <> OP_BRANCH) // No choice.
            then
              next := scan + REOpSz + RENextOffSz // Avoid recursion
            else
            begin
              // try every alternative in turn, rewinding the input between them
              repeat
                save := reginput;
                Result := MatchPrim(scan + REOpSz + RENextOffSz);
                if Result then
                  Exit;
                reginput := save;
                scan := regnext(scan);
              until (scan = nil) or (scan^ <> OP_BRANCH);
              Exit;
            end;
          end;
        {$IFDEF ComplexBraces}
        OP_LOOPENTRY:
          begin // ###0.925
            no := LoopStackIdx;
            Inc(LoopStackIdx);
            if LoopStackIdx > LoopStackMax then
            begin
              Error(reeLoopStackExceeded);
              Exit;
            end;
            save := reginput;
            LoopStack[LoopStackIdx] := 0; // init loop counter
            Result := MatchPrim(next); // execute loop
            LoopStackIdx := no; // cleanup
            if Result then
              Exit;
            reginput := save;
            Exit;
          end;
        OP_LOOP, OP_LOOPNG:
          begin // ###0.940
            if LoopStackIdx <= 0 then
            begin
              Error(reeLoopWithoutEntry);
              Exit;
            end;
            // opnd: loop body start (stored as an offset after the two brace args)
            opnd := scan + PRENextOff(AlignToPtr(scan + REOpSz + RENextOffSz + 2 * REBracesArgSz))^;
            BracesMin := PREBracesArg(AlignToInt(scan + REOpSz + RENextOffSz))^;
            Bracesmax := PREBracesArg(AlignToPtr(scan + REOpSz + RENextOffSz + REBracesArgSz))^;
            save := reginput;
            if LoopStack[LoopStackIdx] >= BracesMin then
            begin // Min already matched - we can work
              if scan^ = OP_LOOP then
              begin
                // greedy way - first try to max deep of greed ;)
                if LoopStack[LoopStackIdx] < Bracesmax then
                begin
                  Inc(LoopStack[LoopStackIdx]);
                  no := LoopStackIdx;
                  Result := MatchPrim(opnd);
                  LoopStackIdx := no;
                  if Result then
                    Exit;
                  reginput := save;
                end;
                Dec(LoopStackIdx); // Fail. May be we are too greedy? ;)
                Result := MatchPrim(next);
                if not Result then
                  reginput := save;
                Exit;
              end
              else
              begin
                // non-greedy - try just now
                Result := MatchPrim(next);
                if Result then
                  Exit
                else
                  reginput := save; // failed - move next and try again
                if LoopStack[LoopStackIdx] < Bracesmax then
                begin
                  Inc(LoopStack[LoopStackIdx]);
                  no := LoopStackIdx;
                  Result := MatchPrim(opnd);
                  LoopStackIdx := no;
                  if Result then
                    Exit;
                  reginput := save;
                end;
                Dec(LoopStackIdx); // Failed - back up
                Exit;
              end
            end
            else
            begin // first match a min_cnt times
              Inc(LoopStack[LoopStackIdx]);
              no := LoopStackIdx;
              Result := MatchPrim(opnd);
              LoopStackIdx := no;
              if Result then
                Exit;
              Dec(LoopStack[LoopStackIdx]);
              reginput := save;
              Exit;
            end;
          end;
        {$ENDIF}
        OP_STAR, OP_PLUS, OP_BRACES, OP_STARNG, OP_PLUSNG, OP_BRACESNG:
          begin
            // Lookahead to avoid useless match attempts when we know
            // what character comes next.
            nextch := #0;
            if next^ = OP_EXACTLY then
              nextch := (next + REOpSz + RENextOffSz + RENumberSz)^;
            Bracesmax := MaxInt; // infinite loop for * and + //###0.92
            if (scan^ = OP_STAR) or (scan^ = OP_STARNG) then
              BracesMin := 0 // star
            else if (scan^ = OP_PLUS) or (scan^ = OP_PLUSNG) then
              BracesMin := 1 // plus
            else
            begin // braces
              BracesMin := PREBracesArg(AlignToPtr(scan + REOpSz + RENextOffSz))^;
              Bracesmax := PREBracesArg(AlignToPtr(scan + REOpSz + RENextOffSz + REBracesArgSz))^;
            end;
            save := reginput;
            opnd := scan + REOpSz + RENextOffSz;
            if (scan^ = OP_BRACES) or (scan^ = OP_BRACESNG) then
              Inc(opnd, 2 * REBracesArgSz);
            // NOTE(review): both loops below reposition with "save + no", i.e.
            // one input char per repetition, while regrepeat counts whole
            // copies for OP_BSUBEXP / OP_BSUBEXPCI operands - confirm that
            // multi-char back-references under a quantifier are handled.
            if (scan^ = OP_PLUSNG) or (scan^ = OP_STARNG) or (scan^ = OP_BRACESNG) then
            begin
              // non-greedy mode
              Bracesmax := regrepeat(opnd, Bracesmax);
              // don't repeat more than BracesMax
              // Now we know real Max limit to move forward (for recursion 'back up')
              // In some cases it can be faster to check only Min positions first,
              // but after that we have to check every position separately instead
              // of fast scanning in loop.
              no := BracesMin;
              while no <= Bracesmax do
              begin
                reginput := save + no;
                // If it could work, try it.
                if (nextch = #0) or (reginput^ = nextch) then
                begin
                  {$IFDEF ComplexBraces}
                  System.Move(LoopStack, SavedLoopStack, SizeOf(LoopStack));
                  // ###0.925
                  SavedLoopStackIdx := LoopStackIdx;
                  {$ENDIF}
                  if MatchPrim(next) then
                  begin
                    Result := True;
                    Exit;
                  end;
                  {$IFDEF ComplexBraces}
                  System.Move(SavedLoopStack, LoopStack, SizeOf(LoopStack));
                  LoopStackIdx := SavedLoopStackIdx;
                  {$ENDIF}
                end;
                Inc(no); // Couldn't or didn't - move forward.
              end; { of while }
              Exit;
            end
            else
            begin // greedy mode
              no := regrepeat(opnd, Bracesmax); // don't repeat more than max_cnt
              while no >= BracesMin do
              begin
                // If it could work, try it.
                if (nextch = #0) or (reginput^ = nextch) then
                begin
                  {$IFDEF ComplexBraces}
                  System.Move(LoopStack, SavedLoopStack, SizeOf(LoopStack));
                  // ###0.925
                  SavedLoopStackIdx := LoopStackIdx;
                  {$ENDIF}
                  if MatchPrim(next) then
                  begin
                    Result := True;
                    Exit;
                  end;
                  {$IFDEF ComplexBraces}
                  System.Move(SavedLoopStack, LoopStack, SizeOf(LoopStack));
                  LoopStackIdx := SavedLoopStackIdx;
                  {$ENDIF}
                end;
                Dec(no); // Couldn't or didn't - back up.
                reginput := save + no;
              end; { of while }
              Exit;
            end;
          end;
        OP_EEND:
          begin
            Result := True; // Success!
            Exit;
          end;
        else
          begin
            Error(reeMatchPrimMemoryCorruption);
            Exit;
          end;
      end; { of case scan^ }
      scan := next;
    end; { of while scan <> nil }
    // We get here only if there's trouble -- normally "case EEND" is the
    // terminating point.
    Error(reeMatchPrimCorruptedPointers);
  end; { of function TRegExpr.MatchPrim
  -------------------------------------------------------------- }
  // Stores AInputString in InputString, then searches it from position 1
  // (ATryOnce, ASlowChecks and ABackward are all False).
  // Returns the result of ExecPrim.
  function TRegExpr.Exec(const AInputString: RegExprString): boolean;
  const
    cFirstPos = 1;
  begin
    InputString := AInputString;
    Result := ExecPrim(cFirstPos, False, False, False);
  end; { of function TRegExpr.Exec
  -------------------------------------------------------------- }
  // Searches the current input string from position 1.
  // ASlowChecks is enabled only when the input is shorter than
  // fSlowChecksSizeMax; ATryOnce and ABackward are False.
  function TRegExpr.Exec: boolean;
  begin
    Result := ExecPrim(
      1,
      False,
      Length(fInputString) < fSlowChecksSizeMax,
      False);
  end; { of function TRegExpr.Exec
  -------------------------------------------------------------- }
  // Searches the current input string starting at AOffset, with
  // ATryOnce, ASlowChecks and ABackward all False.
  function TRegExpr.Exec(AOffset: integer): boolean;
  const
    cTryOnce = False;
    cSlowChecks = False;
    cBackward = False;
  begin
    Result := ExecPrim(AOffset, cTryOnce, cSlowChecks, cBackward);
  end; { of function TRegExpr.Exec
  -------------------------------------------------------------- }
  // Searches the current input string starting at AOffset (default 1), with
  // ATryOnce, ASlowChecks and ABackward all False.
  function TRegExpr.ExecPos(AOffset: integer = 1): boolean;
  const
    cTryOnce = False;
    cSlowChecks = False;
    cBackward = False;
  begin
    Result := ExecPrim(AOffset, cTryOnce, cSlowChecks, cBackward);
  end; { of function TRegExpr.ExecPos
  -------------------------------------------------------------- }
  // Searches the current input string starting at AOffset, forwarding
  // ATryOnce to ExecPrim; ASlowChecks and ABackward are False.
  function TRegExpr.ExecPos(AOffset: integer; ATryOnce: boolean): boolean;
  const
    cSlowChecks = False;
    cBackward = False;
  begin
    Result := ExecPrim(AOffset, ATryOnce, cSlowChecks, cBackward);
  end;
  // Searches the current input string starting at AOffset, forwarding
  // ATryOnce and aBackward to ExecPrim; ASlowChecks is False.
  function TRegExpr.ExecPos(AOffset: integer; ATryOnce, aBackward: boolean): boolean;
  const
    cSlowChecks = False;
  begin
    Result := ExecPrim(AOffset, ATryOnce, cSlowChecks, aBackward);
  end;
  // Tries to match the compiled program with the match starting exactly at
  // APos. On success records the whole-match bounds in startp[0] / endp[0]
  // (APos and the position reached by MatchPrim) and returns True.
  function TRegExpr.MatchAtOnePos(APos: PRegExprChar): boolean;
  begin
    reginput := APos;
    Result := MatchPrim(programm + REOpSz);
    if not Result then
      Exit;
    // reginput was advanced by MatchPrim to the end of the match
    startp[0] := APos;
    endp[0] := reginput;
  end;
  // Zero-fills the startp and endp arrays, discarding all recorded
  // match bounds.
  procedure TRegExpr.ClearMatches;
  begin
    FillChar(endp, SizeOf(endp), 0);
    FillChar(startp, SizeOf(startp), 0);
  end;
  // Zero-fills startp / endp and resets the group bookkeeping:
  // GrpIndexes[0] becomes 0, every other GrpIndexes entry becomes -1,
  // and GrpCount becomes 0.
  procedure TRegExpr.ClearInternalIndexes;
  var
    grp: integer;
  begin
    FillChar(startp, SizeOf(startp), 0);
    FillChar(endp, SizeOf(endp), 0);
    GrpCount := 0;
    GrpIndexes[0] := 0;
    for grp := NSUBEXP - 1 downto 1 do
      GrpIndexes[grp] := -1;
  end;
  3868. function TRegExpr.ExecPrim(AOffset: integer;
  3869. ATryOnce, ASlowChecks, ABackward: boolean): boolean;
  3870. var
  3871. Ptr: PRegExprChar;
  3872. begin
  3873. Result := False;
  3874. // Ensure that Match cleared either if optimization tricks or some error
  3875. // will lead to leaving ExecPrim without actual search. That is
  3876. // important for ExecNext logic and so on.
  3877. ClearMatches;
  3878. // Don't check IsProgrammOk here! it causes big slowdown in test_benchmark!
  3879. if programm = nil then
  3880. begin
  3881. Compile;
  3882. if programm = nil then
  3883. Exit;
  3884. end;
  3885. // Check InputString presence
  3886. if fInputString = '' then
  3887. begin
  3888. if EmptyInputRaisesError then
  3889. Error(reeNoInputStringSpecified);
  3890. Exit;
  3891. end;
  3892. // Check that the start position is not negative
  3893. if AOffset < 1 then
  3894. begin
  3895. Error(reeOffsetMustBePositive);
  3896. Exit;
  3897. end;
  3898. // Check that the start position is not longer than the line
  3899. // If so then exit with nothing found
  3900. if AOffset > (Length(fInputString) + 1) // for matching empty string after last char.
  3901. then
  3902. Exit;
  3903. Ptr := fInputStart + AOffset - 1;
  3904. // If there is a "must appear" string, look for it.
  3905. if ASlowChecks then
  3906. if regmustString <> '' then
  3907. if Pos(regmustString, fInputString) = 0 then Exit;
  3908. {$IFDEF ComplexBraces}
  3909. // no loops started
  3910. LoopStackIdx := 0; // ###0.925
  3911. {$ENDIF}
  3912. // ATryOnce or anchored match (it needs to be tried only once).
  3913. if ATryOnce or (reganchored <> #0) then
  3914. begin
  3915. {$IFDEF UseFirstCharSet}
  3916. {$IFDEF UniCode}
  3917. if Ord(Ptr^) <= $FF then
  3918. {$ENDIF}
  3919. if not FirstCharArray[byte(Ptr^)] then
  3920. Exit;
  3921. {$ENDIF}
  3922. Result := MatchAtOnePos(Ptr);
  3923. Exit;
  3924. end;
  3925. // Messy cases: unanchored match.
  3926. if ABackward then
  3927. Inc(Ptr, 2)
  3928. else
  3929. Dec(Ptr);
  3930. repeat
  3931. if ABackward then
  3932. begin
  3933. Dec(Ptr);
  3934. if Ptr < fInputStart then
  3935. Exit;
  3936. end
  3937. else
  3938. begin
  3939. Inc(Ptr);
  3940. if Ptr > fInputEnd then
  3941. Exit;
  3942. end;
  3943. {$IFDEF UseFirstCharSet}
  3944. {$IFDEF UniCode}
  3945. if Ord(Ptr^) <= $FF then
  3946. {$ENDIF}
  3947. if not FirstCharArray[byte(Ptr^)] then
  3948. Continue;
  3949. {$ENDIF}
  3950. Result := MatchAtOnePos(Ptr);
  3951. // Exit on a match or after testing the end-of-string
  3952. if Result then
  3953. Exit;
  3954. until False;
  3955. end; { of function TRegExpr.ExecPrim
  3956. -------------------------------------------------------------- }
  3957. function TRegExpr.ExecNext: boolean;
  3958. begin
  3959. Result:=ExecNext(False);
  3960. end;
  3961. function TRegExpr.ExecNext(ABackward: boolean): boolean;
  3962. var
  3963. PtrBegin, PtrEnd: PRegExprChar;
  3964. Offset: PtrInt;
  3965. begin
  3966. PtrBegin := startp[0];
  3967. PtrEnd := endp[0];
  3968. if (PtrBegin = nil) or (PtrEnd = nil) then
  3969. begin
  3970. Error(reeExecNextWithoutExec);
  3971. Result := False;
  3972. Exit;
  3973. end;
  3974. Offset := PtrEnd - fInputStart + 1;
  3975. // prevent infinite looping if empty string matches r.e.
  3976. if PtrBegin = PtrEnd then
  3977. Inc(Offset);
  3978. Result := ExecPrim(Offset, False, False, ABackward);
  3979. end; { of function TRegExpr.ExecNext
  3980. -------------------------------------------------------------- }
  3981. procedure TRegExpr.SetInputString(const AInputString: RegExprString);
  3982. begin
  3983. ClearMatches;
  3984. fInputString := AInputString;
  3985. UniqueString(fInputString);
  3986. fInputStart := PRegExprChar(fInputString);
  3987. fInputEnd := fInputStart + Length(fInputString);
  3988. end; { of procedure TRegExpr.SetInputString
  3989. -------------------------------------------------------------- }
  3990. procedure TRegExpr.SetLineSeparators(const AStr: RegExprString);
  3991. begin
  3992. if AStr <> fLineSeparators then
  3993. begin
  3994. fLineSeparators := AStr;
  3995. InitLineSepArray;
  3996. InvalidateProgramm;
  3997. end;
  3998. end; { of procedure TRegExpr.SetLineSeparators
  3999. -------------------------------------------------------------- }
  4000. procedure TRegExpr.SetLinePairedSeparator(const AStr: RegExprString);
  4001. begin
  4002. if Length(AStr) = 2 then
  4003. begin
  4004. if AStr[1] = AStr[2] then
  4005. begin
  4006. // it's impossible for our 'one-point' checking to support
  4007. // two chars separator for identical chars
  4008. Error(reeBadLinePairedSeparator);
  4009. Exit;
  4010. end;
  4011. if not fLinePairedSeparatorAssigned or (AStr[1] <> fLinePairedSeparatorHead)
  4012. or (AStr[2] <> fLinePairedSeparatorTail) then
  4013. begin
  4014. fLinePairedSeparatorAssigned := True;
  4015. fLinePairedSeparatorHead := AStr[1];
  4016. fLinePairedSeparatorTail := AStr[2];
  4017. InvalidateProgramm;
  4018. end;
  4019. end
  4020. else if Length(AStr) = 0 then
  4021. begin
  4022. if fLinePairedSeparatorAssigned then
  4023. begin
  4024. fLinePairedSeparatorAssigned := False;
  4025. InvalidateProgramm;
  4026. end;
  4027. end
  4028. else
  4029. Error(reeBadLinePairedSeparator);
  4030. end; { of procedure TRegExpr.SetLinePairedSeparator
  4031. -------------------------------------------------------------- }
  4032. function TRegExpr.GetLinePairedSeparator: RegExprString;
  4033. begin
  4034. if fLinePairedSeparatorAssigned then
  4035. begin
  4036. {$IFDEF UniCode}
  4037. // Here is some UniCode 'magic'
  4038. // If You do know better decision to concatenate
  4039. // two WideChars, please, let me know!
  4040. Result := fLinePairedSeparatorHead; // ###0.947
  4041. Result := Result + fLinePairedSeparatorTail;
  4042. {$ELSE}
  4043. Result := fLinePairedSeparatorHead + fLinePairedSeparatorTail;
  4044. {$ENDIF}
  4045. end
  4046. else
  4047. Result := '';
  4048. end; { of function TRegExpr.GetLinePairedSeparator
  4049. -------------------------------------------------------------- }
  4050. function TRegExpr.Substitute(const ATemplate: RegExprString): RegExprString;
  4051. // perform substitutions after a regexp match
  4052. var
  4053. TemplateBeg, TemplateEnd: PRegExprChar;
  4054. function ParseVarName(var APtr: PRegExprChar): integer;
  4055. // extract name of variable (digits, may be enclosed with
  4056. // curly braces) from APtr^, uses TemplateEnd !!!
  4057. var
  4058. p: PRegExprChar;
  4059. Delimited: boolean;
  4060. begin
  4061. Result := 0;
  4062. p := APtr;
  4063. Delimited := (p < TemplateEnd) and (p^ = '{');
  4064. if Delimited then
  4065. Inc(p); // skip left curly brace
  4066. if (p < TemplateEnd) and (p^ = '&') then
  4067. Inc(p) // this is '$&' or '${&}'
  4068. else
  4069. while (p < TemplateEnd) and IsDigitChar(p^) do
  4070. begin
  4071. Result := Result * 10 + (Ord(p^) - Ord('0')); // ###0.939
  4072. Inc(p);
  4073. end;
  4074. if Delimited then
  4075. if (p < TemplateEnd) and (p^ = '}') then
  4076. Inc(p) // skip right curly brace
  4077. else
  4078. p := APtr; // isn't properly terminated
  4079. if p = APtr then
  4080. Result := -1; // no valid digits found or no right curly brace
  4081. APtr := p;
  4082. end;
  4083. type
  4084. TSubstMode = (smodeNormal, smodeOneUpper, smodeOneLower, smodeAllUpper, smodeAllLower);
  4085. var
  4086. Mode: TSubstMode;
  4087. p, p0, p1, ResultPtr: PRegExprChar;
  4088. ResultLen, n: integer;
  4089. Ch, QuotedChar: REChar;
  4090. begin
  4091. // Check programm and input string
  4092. if not IsProgrammOk then
  4093. Exit;
  4094. if fInputString = '' then
  4095. begin
  4096. if EmptyInputRaisesError then
  4097. Error(reeNoInputStringSpecified);
  4098. Exit;
  4099. end;
  4100. // Prepare for working
  4101. if ATemplate = '' then
  4102. begin // prevent nil pointers
  4103. Result := '';
  4104. Exit;
  4105. end;
  4106. TemplateBeg := PRegExprChar(ATemplate);
  4107. TemplateEnd := TemplateBeg + Length(ATemplate);
  4108. // Count result length for speed optimization.
  4109. ResultLen := 0;
  4110. p := TemplateBeg;
  4111. while p < TemplateEnd do
  4112. begin
  4113. Ch := p^;
  4114. Inc(p);
  4115. n := -1;
  4116. if Ch = '$' then
  4117. begin
  4118. n := ParseVarName(p);
  4119. if (n >= 0) and (n <= High(GrpIndexes)) then
  4120. n := GrpIndexes[n];
  4121. end;
  4122. if n >= 0 then
  4123. begin
  4124. Inc(ResultLen, endp[n] - startp[n]);
  4125. end
  4126. else
  4127. begin
  4128. if (Ch = EscChar) and (p < TemplateEnd) then
  4129. begin // quoted or special char followed
  4130. Ch := p^;
  4131. Inc(p);
  4132. case Ch of
  4133. 'n':
  4134. Inc(ResultLen, Length(FReplaceLineEnd));
  4135. 'u', 'l', 'U', 'L': { nothing }
  4136. ;
  4137. 'x':
  4138. begin
  4139. Inc(ResultLen);
  4140. if (p^ = '{') then
  4141. begin // skip \x{....}
  4142. while ((p^ <> '}') and (p < TemplateEnd)) do
  4143. p := p + 1;
  4144. p := p + 1;
  4145. end
  4146. else
  4147. p := p + 2 // skip \x..
  4148. end;
  4149. else
  4150. Inc(ResultLen);
  4151. end;
  4152. end
  4153. else
  4154. Inc(ResultLen);
  4155. end;
  4156. end;
  4157. // Get memory. We do it once and it significant speed up work !
  4158. if ResultLen = 0 then
  4159. begin
  4160. Result := '';
  4161. Exit;
  4162. end;
  4163. SetLength(Result, ResultLen);
  4164. // Fill Result
  4165. ResultPtr := PRegExprChar(Result);
  4166. p := TemplateBeg;
  4167. Mode := smodeNormal;
  4168. while p < TemplateEnd do
  4169. begin
  4170. Ch := p^;
  4171. p0 := p;
  4172. Inc(p);
  4173. p1 := p;
  4174. n := -1;
  4175. if Ch = '$' then
  4176. begin
  4177. n := ParseVarName(p);
  4178. if (n >= 0) and (n <= High(GrpIndexes)) then
  4179. n := GrpIndexes[n];
  4180. end;
  4181. if (n >= 0) then
  4182. begin
  4183. p0 := startp[n];
  4184. p1 := endp[n];
  4185. end
  4186. else
  4187. begin
  4188. if (Ch = EscChar) and (p < TemplateEnd) then
  4189. begin // quoted or special char followed
  4190. Ch := p^;
  4191. Inc(p);
  4192. case Ch of
  4193. 'n':
  4194. begin
  4195. p0 := PRegExprChar(FReplaceLineEnd);
  4196. p1 := p0 + Length(FReplaceLineEnd);
  4197. end;
  4198. 'x', 't', 'r', 'f', 'a', 'e':
  4199. begin
  4200. p := p - 1;
  4201. // UnquoteChar expects the escaped char under the pointer
  4202. QuotedChar := UnQuoteChar(p);
  4203. p := p + 1;
  4204. // Skip after last part of the escaped sequence - UnquoteChar stops on the last symbol of it
  4205. p0 := @QuotedChar;
  4206. p1 := p0 + 1;
  4207. end;
  4208. 'l':
  4209. begin
  4210. Mode := smodeOneLower;
  4211. p1 := p0;
  4212. end;
  4213. 'L':
  4214. begin
  4215. Mode := smodeAllLower;
  4216. p1 := p0;
  4217. end;
  4218. 'u':
  4219. begin
  4220. Mode := smodeOneUpper;
  4221. p1 := p0;
  4222. end;
  4223. 'U':
  4224. begin
  4225. Mode := smodeAllUpper;
  4226. p1 := p0;
  4227. end;
  4228. else
  4229. begin
  4230. Inc(p0);
  4231. Inc(p1);
  4232. end;
  4233. end;
  4234. end
  4235. end;
  4236. if p0 < p1 then
  4237. begin
  4238. while p0 < p1 do
  4239. begin
  4240. case Mode of
  4241. smodeOneLower:
  4242. begin
  4243. ResultPtr^ := _LowerCase(p0^);
  4244. Mode := smodeNormal;
  4245. end;
  4246. smodeAllLower:
  4247. begin
  4248. ResultPtr^ := _LowerCase(p0^);
  4249. end;
  4250. smodeOneUpper:
  4251. begin
  4252. ResultPtr^ := _UpperCase(p0^);
  4253. Mode := smodeNormal;
  4254. end;
  4255. smodeAllUpper:
  4256. begin
  4257. ResultPtr^ := _UpperCase(p0^);
  4258. end;
  4259. else
  4260. ResultPtr^ := p0^;
  4261. end;
  4262. Inc(ResultPtr);
  4263. Inc(p0);
  4264. end;
  4265. Mode := smodeNormal;
  4266. end;
  4267. end;
  4268. end; { of function TRegExpr.Substitute
  4269. -------------------------------------------------------------- }
  4270. procedure TRegExpr.Split(const AInputStr: RegExprString; APieces: TStrings);
  4271. var
  4272. PrevPos: PtrInt;
  4273. begin
  4274. PrevPos := 1;
  4275. if Exec(AInputStr) then
  4276. repeat
  4277. APieces.Add(System.Copy(AInputStr, PrevPos, MatchPos[0] - PrevPos));
  4278. PrevPos := MatchPos[0] + MatchLen[0];
  4279. until not ExecNext;
  4280. APieces.Add(System.Copy(AInputStr, PrevPos, MaxInt)); // Tail
  4281. end; { of procedure TRegExpr.Split
  4282. -------------------------------------------------------------- }
  4283. function TRegExpr.Replace(const AInputStr: RegExprString;
  4284. const AReplaceStr: RegExprString;
  4285. AUseSubstitution: boolean = False): RegExprString;
  4286. var
  4287. PrevPos: PtrInt;
  4288. begin
  4289. Result := '';
  4290. PrevPos := 1;
  4291. if Exec(AInputStr) then
  4292. repeat
  4293. Result := Result + System.Copy(AInputStr, PrevPos, MatchPos[0] - PrevPos);
  4294. if AUseSubstitution // ###0.946
  4295. then
  4296. Result := Result + Substitute(AReplaceStr)
  4297. else
  4298. Result := Result + AReplaceStr;
  4299. PrevPos := MatchPos[0] + MatchLen[0];
  4300. until not ExecNext;
  4301. Result := Result + System.Copy(AInputStr, PrevPos, MaxInt); // Tail
  4302. end; { of function TRegExpr.Replace
  4303. -------------------------------------------------------------- }
  4304. function TRegExpr.ReplaceEx(const AInputStr: RegExprString;
  4305. AReplaceFunc: TRegExprReplaceFunction): RegExprString;
  4306. var
  4307. PrevPos: PtrInt;
  4308. begin
  4309. Result := '';
  4310. PrevPos := 1;
  4311. if Exec(AInputStr) then
  4312. repeat
  4313. Result := Result + System.Copy(AInputStr, PrevPos, MatchPos[0] - PrevPos)
  4314. + AReplaceFunc(Self);
  4315. PrevPos := MatchPos[0] + MatchLen[0];
  4316. until not ExecNext;
  4317. Result := Result + System.Copy(AInputStr, PrevPos, MaxInt); // Tail
  4318. end; { of function TRegExpr.ReplaceEx
  4319. -------------------------------------------------------------- }
  4320. function TRegExpr.Replace(const AInputStr: RegExprString;
  4321. AReplaceFunc: TRegExprReplaceFunction): RegExprString;
  4322. begin
  4323. Result := ReplaceEx(AInputStr, AReplaceFunc);
  4324. end; { of function TRegExpr.Replace
  4325. -------------------------------------------------------------- }
  4326. { ============================================================= }
  4327. { ====================== Debug section ======================== }
  4328. { ============================================================= }
  4329. {$IFDEF UseFirstCharSet}
  4330. procedure TRegExpr.FillFirstCharSet(prog: PRegExprChar);
  4331. var
  4332. scan: PRegExprChar; // Current node.
  4333. Next: PRegExprChar; // Next node.
  4334. opnd: PRegExprChar;
  4335. Oper: TREOp;
  4336. ch: REChar;
  4337. min_cnt, i: integer;
  4338. TempSet: TRegExprCharset;
  4339. begin
  4340. TempSet := [];
  4341. scan := prog;
  4342. while scan <> nil do
  4343. begin
  4344. Next := regnext(scan);
  4345. Oper := PREOp(scan)^;
  4346. case Oper of
  4347. OP_BSUBEXP,
  4348. OP_BSUBEXPCI:
  4349. begin
  4350. // we cannot optimize r.e. if it starts with back reference
  4351. FirstCharSet := RegExprAllSet; //###0.930
  4352. Exit;
  4353. end;
  4354. OP_BOL,
  4355. OP_BOLML:
  4356. ; // Exit; //###0.937
  4357. OP_EOL,
  4358. OP_EOLML:
  4359. begin //###0.948 was empty in 0.947, was EXIT in 0.937
  4360. Include(FirstCharSet, 0);
  4361. if ModifierM then
  4362. for i := 1 to Length(LineSeparators) do
  4363. Include(FirstCharSet, byte(LineSeparators[i]));
  4364. Exit;
  4365. end;
  4366. OP_BOUND,
  4367. OP_NOTBOUND:
  4368. ; //###0.943 ?!!
  4369. OP_ANY,
  4370. OP_ANYML:
  4371. begin // we can better define ANYML !!!
  4372. FirstCharSet := RegExprAllSet; //###0.930
  4373. Exit;
  4374. end;
  4375. OP_ANYDIGIT:
  4376. begin
  4377. FirstCharSet := FirstCharSet + RegExprDigitSet;
  4378. Exit;
  4379. end;
  4380. OP_NOTDIGIT:
  4381. begin
  4382. FirstCharSet := FirstCharSet + (RegExprAllSet - RegExprDigitSet);
  4383. Exit;
  4384. end;
  4385. OP_ANYLETTER:
  4386. begin
  4387. GetCharSetFromWordChars(TempSet);
  4388. FirstCharSet := FirstCharSet + TempSet;
  4389. Exit;
  4390. end;
  4391. OP_NOTLETTER:
  4392. begin
  4393. GetCharSetFromWordChars(TempSet);
  4394. FirstCharSet := FirstCharSet + (RegExprAllSet - TempSet);
  4395. Exit;
  4396. end;
  4397. OP_ANYSPACE:
  4398. begin
  4399. GetCharSetFromSpaceChars(TempSet);
  4400. FirstCharSet := FirstCharSet + TempSet;
  4401. Exit;
  4402. end;
  4403. OP_NOTSPACE:
  4404. begin
  4405. GetCharSetFromSpaceChars(TempSet);
  4406. FirstCharSet := FirstCharSet + (RegExprAllSet - TempSet);
  4407. Exit;
  4408. end;
  4409. OP_ANYVERTSEP:
  4410. begin
  4411. FirstCharSet := FirstCharSet + RegExprLineSeparatorsSet;
  4412. Exit;
  4413. end;
  4414. OP_NOTVERTSEP:
  4415. begin
  4416. FirstCharSet := FirstCharSet + (RegExprAllSet - RegExprLineSeparatorsSet);
  4417. Exit;
  4418. end;
  4419. OP_ANYHORZSEP:
  4420. begin
  4421. FirstCharSet := FirstCharSet + RegExprHorzSeparatorsSet;
  4422. Exit;
  4423. end;
  4424. OP_NOTHORZSEP:
  4425. begin
  4426. FirstCharSet := FirstCharSet + (RegExprAllSet - RegExprHorzSeparatorsSet);
  4427. Exit;
  4428. end;
  4429. OP_EXACTLYCI:
  4430. begin
  4431. ch := (scan + REOpSz + RENextOffSz + RENumberSz)^;
  4432. {$IFDEF UniCode}
  4433. if Ord(ch) <= $FF then
  4434. {$ENDIF}
  4435. begin
  4436. Include(FirstCharSet, byte(ch));
  4437. Include(FirstCharSet, byte(InvertCase(ch)));
  4438. end;
  4439. Exit;
  4440. end;
  4441. OP_EXACTLY:
  4442. begin
  4443. ch := (scan + REOpSz + RENextOffSz + RENumberSz)^;
  4444. {$IFDEF UniCode}
  4445. if Ord(ch) <= $FF then
  4446. {$ENDIF}
  4447. Include(FirstCharSet, byte(ch));
  4448. Exit;
  4449. end;
  4450. OP_ANYOF:
  4451. begin
  4452. GetCharSetFromCharClass(scan + REOpSz + RENextOffSz, False, TempSet);
  4453. FirstCharSet := FirstCharSet + TempSet;
  4454. Exit;
  4455. end;
  4456. OP_ANYBUT:
  4457. begin
  4458. GetCharSetFromCharClass(scan + REOpSz + RENextOffSz, False, TempSet);
  4459. FirstCharSet := FirstCharSet + (RegExprAllSet - TempSet);
  4460. Exit;
  4461. end;
  4462. OP_ANYOFCI:
  4463. begin
  4464. GetCharSetFromCharClass(scan + REOpSz + RENextOffSz, True, TempSet);
  4465. FirstCharSet := FirstCharSet + TempSet;
  4466. Exit;
  4467. end;
  4468. OP_ANYBUTCI:
  4469. begin
  4470. GetCharSetFromCharClass(scan + REOpSz + RENextOffSz, True, TempSet);
  4471. FirstCharSet := FirstCharSet + (RegExprAllSet - TempSet);
  4472. Exit;
  4473. end;
  4474. OP_NOTHING:
  4475. ;
  4476. OP_COMMENT:
  4477. ;
  4478. OP_BACK:
  4479. ;
  4480. Succ(OP_OPEN) .. TREOp(Ord(OP_OPEN) + NSUBEXP - 1):
  4481. begin //###0.929
  4482. FillFirstCharSet(Next);
  4483. Exit;
  4484. end;
  4485. Succ(OP_CLOSE) .. TREOp(Ord(OP_CLOSE) + NSUBEXP - 1):
  4486. begin //###0.929
  4487. FillFirstCharSet(Next);
  4488. Exit;
  4489. end;
  4490. OP_BRANCH:
  4491. begin
  4492. if (PREOp(Next)^ <> OP_BRANCH) // No choice.
  4493. then
  4494. Next := scan + REOpSz + RENextOffSz // Avoid recursion.
  4495. else
  4496. begin
  4497. repeat
  4498. FillFirstCharSet(scan + REOpSz + RENextOffSz);
  4499. scan := regnext(scan);
  4500. until (scan = nil) or (PREOp(scan)^ <> OP_BRANCH);
  4501. Exit;
  4502. end;
  4503. end;
  4504. {$IFDEF ComplexBraces}
  4505. OP_LOOPENTRY:
  4506. begin //###0.925
  4507. //LoopStack [LoopStackIdx] := 0; //###0.940 line removed
  4508. FillFirstCharSet(Next); // execute LOOP
  4509. Exit;
  4510. end;
  4511. OP_LOOP,
  4512. OP_LOOPNG:
  4513. begin //###0.940
  4514. opnd := scan + PRENextOff(AlignToPtr(scan + REOpSz + RENextOffSz + REBracesArgSz * 2))^;
  4515. min_cnt := PREBracesArg(AlignToPtr(scan + REOpSz + RENextOffSz))^;
  4516. FillFirstCharSet(opnd);
  4517. if min_cnt = 0 then
  4518. FillFirstCharSet(Next);
  4519. Exit;
  4520. end;
  4521. {$ENDIF}
  4522. OP_STAR,
  4523. OP_STARNG: //###0.940
  4524. FillFirstCharSet(scan + REOpSz + RENextOffSz);
  4525. OP_PLUS,
  4526. OP_PLUSNG:
  4527. begin //###0.940
  4528. FillFirstCharSet(scan + REOpSz + RENextOffSz);
  4529. Exit;
  4530. end;
  4531. OP_BRACES,
  4532. OP_BRACESNG:
  4533. begin //###0.940
  4534. opnd := scan + REOpSz + RENextOffSz + REBracesArgSz * 2;
  4535. min_cnt := PREBracesArg(AlignToPtr(scan + REOpSz + RENextOffSz))^; // BRACES
  4536. FillFirstCharSet(opnd);
  4537. if min_cnt > 0 then
  4538. Exit;
  4539. end;
  4540. OP_EEND:
  4541. begin
  4542. FirstCharSet := RegExprAllSet; //###0.948
  4543. Exit;
  4544. end;
  4545. else
  4546. begin
  4547. fLastErrorOpcode := Oper;
  4548. Error(reeUnknownOpcodeInFillFirst);
  4549. Exit;
  4550. end;
  4551. end; { of case scan^}
  4552. scan := Next;
  4553. end; { of while scan <> nil}
  4554. end; { of procedure FillFirstCharSet
  4555. --------------------------------------------------------------}
  4556. {$ENDIF}
  4557. procedure TRegExpr.InitCharCheckers;
  4558. var
  4559. Cnt: integer;
  4560. //
  4561. function Add(AChecker: TRegExprCharChecker): byte;
  4562. begin
  4563. Inc(Cnt);
  4564. if Cnt > High(CharCheckers) then
  4565. raise Exception.Create('Too small CharCheckers array');
  4566. CharCheckers[Cnt - 1] := AChecker;
  4567. Result := Cnt - 1;
  4568. end;
  4569. //
  4570. begin
  4571. Cnt := 0;
  4572. FillChar(CharCheckers, SizeOf(CharCheckers), 0);
  4573. CheckerIndex_Word := Add(CharChecker_Word);
  4574. CheckerIndex_NotWord := Add(CharChecker_NotWord);
  4575. CheckerIndex_Space := Add(CharChecker_Space);
  4576. CheckerIndex_NotSpace := Add(CharChecker_NotSpace);
  4577. CheckerIndex_Digit := Add(CharChecker_Digit);
  4578. CheckerIndex_NotDigit := Add(CharChecker_NotDigit);
  4579. CheckerIndex_VertSep := Add(CharChecker_VertSep);
  4580. CheckerIndex_NotVertSep := Add(CharChecker_NotVertSep);
  4581. CheckerIndex_HorzSep := Add(CharChecker_HorzSep);
  4582. CheckerIndex_NotHorzSep := Add(CharChecker_NotHorzSep);
  4583. //CheckerIndex_AllAZ := Add(CharChecker_AllAZ);
  4584. CheckerIndex_LowerAZ := Add(CharChecker_LowerAZ);
  4585. CheckerIndex_UpperAZ := Add(CharChecker_UpperAZ);
  4586. SetLength(CharCheckerInfos, 3);
  4587. with CharCheckerInfos[0] do
  4588. begin
  4589. CharBegin := 'a';
  4590. CharEnd:= 'z';
  4591. CheckerIndex := CheckerIndex_LowerAZ;
  4592. end;
  4593. with CharCheckerInfos[1] do
  4594. begin
  4595. CharBegin := 'A';
  4596. CharEnd := 'Z';
  4597. CheckerIndex := CheckerIndex_UpperAZ;
  4598. end;
  4599. with CharCheckerInfos[2] do
  4600. begin
  4601. CharBegin := '0';
  4602. CharEnd := '9';
  4603. CheckerIndex := CheckerIndex_Digit;
  4604. end;
  4605. end;
  4606. function TRegExpr.CharChecker_Word(ch: REChar): boolean;
  4607. begin
  4608. Result := IsWordChar(ch);
  4609. end;
  4610. function TRegExpr.CharChecker_NotWord(ch: REChar): boolean;
  4611. begin
  4612. Result := not IsWordChar(ch);
  4613. end;
  4614. function TRegExpr.CharChecker_Space(ch: REChar): boolean;
  4615. begin
  4616. Result := IsSpaceChar(ch);
  4617. end;
  4618. function TRegExpr.CharChecker_NotSpace(ch: REChar): boolean;
  4619. begin
  4620. Result := not IsSpaceChar(ch);
  4621. end;
  4622. function TRegExpr.CharChecker_Digit(ch: REChar): boolean;
  4623. begin
  4624. Result := IsDigitChar(ch);
  4625. end;
  4626. function TRegExpr.CharChecker_NotDigit(ch: REChar): boolean;
  4627. begin
  4628. Result := not IsDigitChar(ch);
  4629. end;
  4630. function TRegExpr.CharChecker_VertSep(ch: REChar): boolean;
  4631. begin
  4632. Result := IsLineSeparator(ch);
  4633. end;
  4634. function TRegExpr.CharChecker_NotVertSep(ch: REChar): boolean;
  4635. begin
  4636. Result := not IsLineSeparator(ch);
  4637. end;
  4638. function TRegExpr.CharChecker_HorzSep(ch: REChar): boolean;
  4639. begin
  4640. Result := IsHorzSeparator(ch);
  4641. end;
  4642. function TRegExpr.CharChecker_NotHorzSep(ch: REChar): boolean;
  4643. begin
  4644. Result := not IsHorzSeparator(ch);
  4645. end;
  4646. function TRegExpr.CharChecker_LowerAZ(ch: REChar): boolean;
  4647. begin
  4648. case ch of
  4649. 'a' .. 'z':
  4650. Result := True;
  4651. else
  4652. Result := False;
  4653. end;
  4654. end;
  4655. function TRegExpr.CharChecker_UpperAZ(ch: REChar): boolean;
  4656. begin
  4657. case ch of
  4658. 'A' .. 'Z':
  4659. Result := True;
  4660. else
  4661. Result := False;
  4662. end;
  4663. end;
  4664. {$IFDEF RegExpPCodeDump}
  4665. function TRegExpr.DumpOp(op: TREOp): RegExprString;
  4666. // printable representation of opcode
  4667. begin
  4668. case op of
  4669. OP_BOL:
  4670. Result := 'BOL';
  4671. OP_EOL:
  4672. Result := 'EOL';
  4673. OP_BOLML:
  4674. Result := 'BOLML';
  4675. OP_EOLML:
  4676. Result := 'EOLML';
  4677. OP_BOUND:
  4678. Result := 'BOUND'; // ###0.943
  4679. OP_NOTBOUND:
  4680. Result := 'NOTBOUND'; // ###0.943
  4681. OP_ANY:
  4682. Result := 'ANY';
  4683. OP_ANYML:
  4684. Result := 'ANYML'; // ###0.941
  4685. OP_ANYLETTER:
  4686. Result := 'ANYLETTER';
  4687. OP_NOTLETTER:
  4688. Result := 'NOTLETTER';
  4689. OP_ANYDIGIT:
  4690. Result := 'ANYDIGIT';
  4691. OP_NOTDIGIT:
  4692. Result := 'NOTDIGIT';
  4693. OP_ANYSPACE:
  4694. Result := 'ANYSPACE';
  4695. OP_NOTSPACE:
  4696. Result := 'NOTSPACE';
  4697. OP_ANYHORZSEP:
  4698. Result := 'ANYHORZSEP';
  4699. OP_NOTHORZSEP:
  4700. Result := 'NOTHORZSEP';
  4701. OP_ANYVERTSEP:
  4702. Result := 'ANYVERTSEP';
  4703. OP_NOTVERTSEP:
  4704. Result := 'NOTVERTSEP';
  4705. OP_ANYOF:
  4706. Result := 'ANYOF';
  4707. OP_ANYBUT:
  4708. Result := 'ANYBUT';
  4709. OP_ANYOFCI:
  4710. Result := 'ANYOF/CI';
  4711. OP_ANYBUTCI:
  4712. Result := 'ANYBUT/CI';
  4713. OP_BRANCH:
  4714. Result := 'BRANCH';
  4715. OP_EXACTLY:
  4716. Result := 'EXACTLY';
  4717. OP_EXACTLYCI:
  4718. Result := 'EXACTLY/CI';
  4719. OP_NOTHING:
  4720. Result := 'NOTHING';
  4721. OP_COMMENT:
  4722. Result := 'COMMENT';
  4723. OP_BACK:
  4724. Result := 'BACK';
  4725. OP_EEND:
  4726. Result := 'END';
  4727. OP_BSUBEXP:
  4728. Result := 'BSUBEXP';
  4729. OP_BSUBEXPCI:
  4730. Result := 'BSUBEXP/CI';
  4731. Succ(OP_OPEN) .. TREOp(Ord(OP_OPEN) + NSUBEXP - 1): // ###0.929
  4732. Result := Format('OPEN[%d]', [Ord(op) - Ord(OP_OPEN)]);
  4733. Succ(OP_CLOSE) .. TREOp(Ord(OP_CLOSE) + NSUBEXP - 1): // ###0.929
  4734. Result := Format('CLOSE[%d]', [Ord(op) - Ord(OP_CLOSE)]);
  4735. OP_STAR:
  4736. Result := 'STAR';
  4737. OP_PLUS:
  4738. Result := 'PLUS';
  4739. OP_BRACES:
  4740. Result := 'BRACES';
  4741. {$IFDEF ComplexBraces}
  4742. OP_LOOPENTRY:
  4743. Result := 'LOOPENTRY'; // ###0.925
  4744. OP_LOOP:
  4745. Result := 'LOOP'; // ###0.925
  4746. OP_LOOPNG:
  4747. Result := 'LOOPNG'; // ###0.940
  4748. {$ENDIF}
  4749. OP_STARNG:
  4750. Result := 'STARNG'; // ###0.940
  4751. OP_PLUSNG:
  4752. Result := 'PLUSNG'; // ###0.940
  4753. OP_BRACESNG:
  4754. Result := 'BRACESNG'; // ###0.940
  4755. else
  4756. Error(reeDumpCorruptedOpcode);
  4757. end; { of case op }
  4758. Result := ':' + Result;
  4759. end; { of function TRegExpr.DumpOp
  4760. -------------------------------------------------------------- }
  4761. function TRegExpr.Dump: RegExprString;
  4762. // dump a regexp in vaguely comprehensible form
  4763. var
  4764. s: PRegExprChar;
  4765. op: TREOp; // Arbitrary non-END op.
  4766. next: PRegExprChar;
  4767. i, NLen: integer;
  4768. Diff: PtrInt;
  4769. Ch: AnsiChar;
  4770. function PrintableChar(AChar: REChar): string; {$IFDEF InlineFuncs}inline;{$ENDIF}
  4771. begin
  4772. if AChar < ' ' then
  4773. Result := '#' + IntToStr(Ord(AChar))
  4774. else
  4775. Result := AChar;
  4776. end;
  4777. begin
  4778. if not IsProgrammOk then
  4779. Exit;
  4780. op := OP_EXACTLY;
  4781. Result := '';
  4782. s := programm + REOpSz;
  4783. while op <> OP_EEND do
  4784. begin // While that wasn't END last time...
  4785. op := s^;
  4786. Result := Result + Format('%2d%s', [s - programm, DumpOp(s^)]);
  4787. // Where, what.
  4788. next := regnext(s);
  4789. if next = nil // Next ptr.
  4790. then
  4791. Result := Result + ' (0)'
  4792. else
  4793. begin
  4794. if next > s
  4795. // ###0.948 PWideChar subtraction workaround (see comments in Tail method for details)
  4796. then
  4797. Diff := next - s
  4798. else
  4799. Diff := -(s - next);
  4800. Result := Result + Format(' (%d) ', [(s - programm) + Diff]);
  4801. end;
  4802. Inc(s, REOpSz + RENextOffSz);
  4803. if (op = OP_ANYOF) or (op = OP_ANYOFCI) or (op = OP_ANYBUT) or (op = OP_ANYBUTCI) then
  4804. begin
  4805. repeat
  4806. case s^ of
  4807. OpKind_End:
  4808. begin
  4809. Inc(s);
  4810. Break;
  4811. end;
  4812. OpKind_Range:
  4813. begin
  4814. Result := Result + 'Rng(';
  4815. Inc(s);
  4816. Result := Result + PrintableChar(s^) + '-';
  4817. Inc(s);
  4818. Result := Result + PrintableChar(s^);
  4819. Result := Result + ') ';
  4820. Inc(s);
  4821. end;
  4822. OpKind_MetaClass:
  4823. begin
  4824. Inc(s);
  4825. Result := Result + '\' + PrintableChar(s^) + ' ';
  4826. Inc(s);
  4827. end;
  4828. OpKind_Char:
  4829. begin
  4830. Inc(s);
  4831. NLen := PLongInt(s)^;
  4832. Inc(s, RENumberSz);
  4833. Result := Result + 'Ch(';
  4834. for i := 1 to NLen do
  4835. begin
  4836. Result := Result + PrintableChar(s^);
  4837. Inc(s);
  4838. end;
  4839. Result := Result + ') ';
  4840. end;
  4841. else
  4842. Error(reeDumpCorruptedOpcode);
  4843. end;
  4844. until false;
  4845. end;
  4846. if (op = OP_EXACTLY) or (op = OP_EXACTLYCI) then
  4847. begin
  4848. // Literal string, where present.
  4849. NLen := PLongInt(s)^;
  4850. Inc(s, RENumberSz);
  4851. for i := 1 to NLen do
  4852. begin
  4853. Result := Result + PrintableChar(s^);
  4854. Inc(s);
  4855. end;
  4856. end;
  4857. if (op = OP_BSUBEXP) or (op = OP_BSUBEXPCI) then
  4858. begin
  4859. Result := Result + ' \' + IntToStr(Ord(s^));
  4860. Inc(s);
  4861. end;
  4862. if (op = OP_BRACES) or (op = OP_BRACESNG) then
  4863. begin // ###0.941
  4864. // show min/max argument of braces operator
  4865. Result := Result + Format('{%d,%d}', [PREBracesArg(AlignToInt(s))^,
  4866. PREBracesArg(AlignToInt(s + REBracesArgSz))^]);
  4867. Inc(s, REBracesArgSz * 2);
  4868. end;
  4869. {$IFDEF ComplexBraces}
  4870. if (op = OP_LOOP) or (op = OP_LOOPNG) then
  4871. begin // ###0.940
  4872. Result := Result + Format(' -> (%d) {%d,%d}',
  4873. [(s - programm - (REOpSz + RENextOffSz)) +
  4874. PRENextOff(AlignToPtr(s + 2 * REBracesArgSz))^,
  4875. PREBracesArg(AlignToInt(s))^,
  4876. PREBracesArg(AlignToInt(s + REBracesArgSz))^]);
  4877. Inc(s, 2 * REBracesArgSz + RENextOffSz);
  4878. end;
  4879. {$ENDIF}
  4880. Result := Result + #$d#$a;
  4881. end; { of while }
  4882. // Header fields of interest.
  4883. if reganchored <> #0 then
  4884. Result := Result + 'Anchored; ';
  4885. if regmustString <> '' then
  4886. Result := Result + 'Must have: "' + regmustString + '"; ';
  4887. {$IFDEF UseFirstCharSet} // ###0.929
  4888. Result := Result + #$d#$a'First charset: ';
  4889. if FirstCharSet = [] then
  4890. Result := Result + '<empty set>'
  4891. else
  4892. if FirstCharSet = RegExprAllSet then
  4893. Result := Result + '<all chars>'
  4894. else
  4895. for Ch := #0 to #255 do
  4896. if byte(Ch) in FirstCharSet then
  4897. begin
  4898. if Ch < ' ' then
  4899. Result := Result + PrintableChar(Ch) // ###0.948
  4900. else
  4901. Result := Result + Ch;
  4902. end;
  4903. {$ENDIF}
  4904. Result := Result + #$d#$a;
  4905. end; { of function TRegExpr.Dump
  4906. -------------------------------------------------------------- }
  4907. {$ENDIF}
  4908. {$IFDEF reRealExceptionAddr}
  4909. {$OPTIMIZATION ON}
// ReturnAddr works correctly only if compiler optimization is ON.
// This method is placed at the very end of the unit because there is no
// way to restore the previous state of the compiler optimization flag.
  4913. {$ENDIF}
  4914. procedure TRegExpr.Error(AErrorID: integer);
  4915. {$IFDEF reRealExceptionAddr}
  4916. function ReturnAddr: Pointer; // ###0.938
  4917. asm
  4918. mov eax,[ebp+4]
  4919. end;
  4920. {$ENDIF}
  4921. var
  4922. e: ERegExpr;
  4923. begin
  4924. fLastError := AErrorID; // dummy stub - useless because will raise exception
  4925. if AErrorID < 1000 // compilation error ?
  4926. then
  4927. e := ERegExpr.Create(ErrorMsg(AErrorID) // yes - show error pos
  4928. + ' (pos ' + IntToStr(CompilerErrorPos) + ')')
  4929. else
  4930. e := ERegExpr.Create(ErrorMsg(AErrorID));
  4931. e.ErrorCode := AErrorID;
  4932. e.CompilerErrorPos := CompilerErrorPos;
  4933. raise e
  4934. {$IFDEF reRealExceptionAddr}
  4935. at ReturnAddr; // ###0.938
  4936. {$ENDIF}
  4937. end; { of procedure TRegExpr.Error
  4938. -------------------------------------------------------------- }
  4939. (*
  4940. PCode persistence:
  4941. FirstCharSet
  4942. programm, regsize
  4943. reganchored // -> programm
  4944. regmust, regmustlen // -> programm
  4945. fExprIsCompiled
  4946. *)
// Be careful: when reRealExceptionAddr is defined, any code placed below
// this point is compiled with compiler optimization switched ON.
  4949. initialization
  4950. RegExprInvertCaseFunction := TRegExpr.InvertCaseFunction;
  4951. end.