12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724172517261727172817291730173117321733173417351736173717381739174017411742174317441745174617471748174917501751175217531754175517561757175817591760176117621763176417651766176717681769177017711772177317741775177617771778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351235223532354235523562357235823592360236123622363236423652366236723682369237023712372237323742375237623772378237923802381238223832384238523862387238823892390239123922393239423952396239723982399240024012402240324042405240624072408240924102411241224132414241524162417241824192420242124222423242424252426242724282429243024312432243324342435243624372438243924402441244224432444244524462447244824492450245124522453245424552456245724582459246024612462246324642465246624672468246924702471247224732474247524762477247824792480248124822483248424852486248724882489249024912492249324942495249624972498249925002501250225032504250525062507250825092510251125122513251425152516251725182519252025212522252325242525252625272528252925302531253225332534253525362537253825392540254125422543254425452546254725482549255025512552255325542555255625572558255925602561256225632564256525662567256825692570257125722573257425752576257725782579258025812582258325842585258625872588258925902591259225932594259525962597259825992600260126022603260426052606260726082609261026112612261326142615261626172618261926202621262226232624262526262627262826292630263126322633263426352636263726382639264026412642264326442645264626472648264926502651265226532654265526562657265826592660266126622663266426652666266726682669267026712672267326742675267626772678267926802681268226832684268526862687268826892690269126922693269426952696269726982699270027012702270327042705270627072708270927102711271227132714271527162717271827192720272127222723272427252726272727282729273027312732273327342735273627372738273927402741274227432744274527462747274827492750275127522753275427552756275727582759276027612762276327642765276627672768276927702771277227732774277527762777277827792780278127822783278427852786278727882789279027912792279327942795279627972798279928002801280228032804280528062807280828092810281128122813281428152816281728182819282028212822282328242825282628272828282928302831283228332834283528362837283828392840284128422843284428452846284728482849285028512852285328542855285628572858285928602861286228632864286528662867286828692870287128722873287428752876287728782879288028812882288328842885288628872888288928902891289228932894289528962897289828992900290129022903290429052906290729082909291029112912291329142915291629172918291929202921292229232924292529262927292829292930293129322933293429352936293729382939294029412942294329442945294629472948294929502951295229532954295529562957295829592960296129622963296429652966296729682969297029712972297329742975297629772978297929802981298229832984298529862987298829892990299129922993299429952996299729982999300030013002300330043005300630073008300930103011301230133014301530163017301830193020302130223023302430253026302730283029303030313032303330343035303630373038303930403041304230433044304530463047304830493050305130523053305430553056305730583059306030613062306330643065306630673068306930703071307230733074307530763077307830793080308130823083308430853086308730883089309030913092309330943095309630973098309931003101310231033104310531063107310831093110311131123113311431153116311731183119312031213122312331243125312631273128312931303131313231333134313531363137313831393140314131423143314431453146314731483149315031513152315331543155315631573158315931603161316231633164316531663167316831693170317131723173317431753176317731783179318031813182318331843185318631873188318931903191319231933194319531963197319831993200320132023203320432053206320732083209321032113212321332143215321632173218321932203221322232233224322532263227322832293230323132323233323432353236323732383239324032413242324332443245324632473248324932503251325232533254325532563257325832593260326132623263326432653266326732683269327032713272327332743275327632773278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377737783779378037813782378337843785378637873788378937903791379237933794379537963797379837993800380138023803380438053806380738083809381038113812381338143815381638173818381938203821382238233824382538263827382838293830383138323833383438353836383738383839384038413842384338443845384638473848384938503851385238533854385538563857385838593860386138623863386438653866386738683869387038713872387338743875387638773878387938803881388238833884388538863887388838893890389138923893389438953896389738983899390039013902390339043905390639073908390939103911391239133914391539163917391839193920392139223923392439253926392739283929393039313932393339343935393639373938393939403941394239433944394539463947394839493950395139523953395439553956395739583959396039613962396339643965396639673968396939703971397239733974397539763977397839793980398139823983398439853986398739883989399039913992399339943995399639973998399940004001400240034004400540064007400840094010401140124013401440154016401740184019402040214022402340244025402640274028402940304031403240334034403540364037403840394040404140424043404440454046404740484049405040514052405340544055405640574058405940604061406240634064406540664067406840694070407140724073407440754076407740784079408040814082408340844085408640874088408940904091409240934094409540964097409840994100410141024103410441054106410741084109411041114112411341144115411641174118411941204121412241234124412541264127412841294130413141324133413441354136413741384139414041414142414341444145414641474148414941504151415241534154415541564157415841594160416141624163416441654166416741684169417041714172417341744175417641774178417941804181418241834184418541864187418841894190 |
- unit RegExpr;
- {
- TRegExpr class library
- Delphi Regular Expressions
- Copyright (c) 1999-2004 Andrey V. Sorokin, St.Petersburg, Russia
- You can choose to use this Pascal unit in one of the two following licenses:
- Option 1>
- You may use this software in any kind of development,
- including comercial, redistribute, and modify it freely,
- under the following restrictions :
- 1. This software is provided as it is, without any kind of
- warranty given. Use it at Your own risk.The author is not
- responsible for any consequences of use of this software.
- 2. The origin of this software may not be mispresented, You
- must not claim that You wrote the original software. If
- You use this software in any kind of product, it would be
- appreciated that there in a information box, or in the
- documentation would be an acknowledgement like
- Partial Copyright (c) 2004 Andrey V. Sorokin
- http://RegExpStudio.com
- mailto:[email protected]
- 3. You may not have any income from distributing this source
- (or altered version of it) to other developers. When You
- use this product in a comercial package, the source may
- not be charged seperatly.
- 4. Altered versions must be plainly marked as such, and must
- not be misrepresented as being the original software.
- 5. RegExp Studio application and all the visual components as
- well as documentation is not part of the TRegExpr library
- and is not free for usage.
- mailto:[email protected]
- http://RegExpStudio.com
- http://anso.da.ru/
- Option 2>
- The same modified LGPL with static linking exception as the Free Pascal RTL
- }
- interface
- {off $DEFINE DebugSynRegExpr}
- {$IFDEF FPC}
- {$MODE DELPHI} // Delphi-compatible mode in FreePascal
- {$ENDIF}
- // ======== Determine compiler
- {$IFDEF VER80} Sorry, TRegExpr is for 32-bits Delphi only. Delphi 1 is not supported (and whos really care today?!). {$ENDIF}
- {$IFDEF VER90} {$DEFINE D2} {$ENDIF} // D2
- {$IFDEF VER93} {$DEFINE D2} {$ENDIF} // CPPB 1
- {$IFDEF VER100} {$DEFINE D3} {$DEFINE D2} {$ENDIF} // D3
- {$IFDEF VER110} {$DEFINE D4} {$DEFINE D3} {$DEFINE D2} {$ENDIF} // CPPB 3
- {$IFDEF VER120} {$DEFINE D4} {$DEFINE D3} {$DEFINE D2} {$ENDIF} // D4
- {$IFDEF VER130} {$DEFINE D5} {$DEFINE D4} {$DEFINE D3} {$DEFINE D2} {$ENDIF} // D5
- {$IFDEF VER140} {$DEFINE D6} {$DEFINE D5} {$DEFINE D4} {$DEFINE D3} {$DEFINE D2} {$ENDIF} // D6
- {$IFDEF VER150} {$DEFINE D7} {$DEFINE D6} {$DEFINE D5} {$DEFINE D4} {$DEFINE D3} {$DEFINE D2} {$ENDIF} // D7
- // ======== Define base compiler options
- {$BOOLEVAL OFF}
- {$EXTENDEDSYNTAX ON}
- {$LONGSTRINGS ON}
- {$IFNDEF FPC}
- {$OPTIMIZATION ON}
- {$ENDIF}
- {$IFDEF D6}
- {$WARN SYMBOL_PLATFORM OFF} // Suppress .Net warnings
- {$ENDIF}
- {$IFDEF D7}
- {$WARN UNSAFE_CAST OFF} // Suppress .Net warnings
- {$WARN UNSAFE_TYPE OFF} // Suppress .Net warnings
- {$WARN UNSAFE_CODE OFF} // Suppress .Net warnings
- {$ENDIF}
- // ======== Define options for TRegExpr engine
- {.$DEFINE UniCode} // Unicode support
- {$ifdef FPC_OS_UNICODE}
- {$define UNICODE}
- {$endif}
- {$DEFINE RegExpPCodeDump} // p-code dumping (see Dump method)
- {$IFNDEF FPC} // the option is not supported in FreePascal
- {$DEFINE reRealExceptionAddr} // exceptions will point to appropriate source line, not to Error procedure
- {$ENDIF}
- {$DEFINE ComplexBraces} // support braces in complex cases
- {$IFNDEF UniCode} // the option applicable only for non-UniCode mode
- {$IFNDEF FPC_REQUIRES_PROPER_ALIGNMENT} //sets have to be aligned
- {$DEFINE UseSetOfChar} // Significant optimization by using set of char
- {$ENDIF}
- {$ENDIF}
- {$IFDEF UseSetOfChar}
- {$DEFINE UseFirstCharSet} // Fast skip between matches for r.e. that starts with determined set of chars
- {$ENDIF}
- {$DEFINE UseOsLineEndOnReplace} // On Replace if replace-with has "\n", use System.LineEnding (#10 #13 or #13#10); else use #10
- // ======== Define Pascal-language options
- // Define 'UseAsserts' option (do not edit this definitions).
- // Asserts used to catch 'strange bugs' in TRegExpr implementation (when something goes
- // completely wrong). You can swith asserts on/off with help of {$C+}/{$C-} compiler options.
- {$IFDEF D3} {$DEFINE UseAsserts} {$ENDIF}
- {$IFDEF FPC} {$DEFINE UseAsserts} {$ENDIF}
- // Define 'use subroutine parameters default values' option (do not edit this definition).
- {$IFDEF D4} {$DEFINE DefParam} {$ENDIF}
- // Define 'OverMeth' options, to use method overloading (do not edit this definitions).
- {$IFDEF D5} {$DEFINE OverMeth} {$ENDIF}
- {$IFDEF FPC} {$DEFINE OverMeth} {$ENDIF}
- uses
- Classes, // TStrings in Split method
- SysUtils; // Exception
- type
- {$IFDEF UniCode}
- PRegExprChar = PWideChar;
- RegExprString = UnicodeString;
- REChar = WideChar;
- {$ELSE}
- PRegExprChar = PChar;
- RegExprString = AnsiString; //###0.952 was string
- REChar = Char;
- {$ENDIF}
- TREOp = REChar; // internal p-code type //###0.933
- PREOp = ^TREOp;
- TRENextOff = PtrInt; // internal Next "pointer" (offset to current p-code) //###0.933
- PRENextOff = ^TRENextOff; // used for extracting Next "pointers" from compiled r.e. //###0.933
- TREBracesArg = integer; // type of {m,n} arguments
- PREBracesArg = ^TREBracesArg;
- const
- REOpSz = SizeOf (TREOp) div SizeOf (REChar); // size of p-code in RegExprString units
- {$IFDEF FPC_REQUIRES_PROPER_ALIGNMENT}
- // add space for aligning pointer
- // -1 is the correct max size but also needed for InsertOperator that needs a multiple of pointer size
- RENextOffSz = (2 * SizeOf (TRENextOff) div SizeOf (REChar))-1;
- REBracesArgSz = (2 * SizeOf (TREBracesArg) div SizeOf (REChar)); // add space for aligning pointer
- {$ELSE}
- RENextOffSz = (SizeOf (TRENextOff) div SizeOf (REChar)); // size of Next 'pointer' -"-
- REBracesArgSz = SizeOf (TREBracesArg) div SizeOf (REChar); // size of BRACES arguments -"-
- {$ENDIF}
- type
- TRegExprInvertCaseFunction = function (const Ch : REChar) : REChar
- of object;
- const
- EscChar = '\'; // 'Escape'-char ('\' in common r.e.) used for escaping metachars (\w, \d etc).
- RegExprModifierI : boolean = False; // default value for ModifierI
- RegExprModifierR : boolean = True; // default value for ModifierR
- RegExprModifierS : boolean = True; // default value for ModifierS
- RegExprModifierG : boolean = True; // default value for ModifierG
- RegExprModifierM : boolean = False; // default value for ModifierM
- RegExprModifierX : boolean = False; // default value for ModifierX
- RegExprSpaceChars : RegExprString = // default value for SpaceChars
- ' '#$9#$A#$D#$C;
- RegExprWordChars : RegExprString = // default value for WordChars
- '0123456789' //###0.940
- + 'abcdefghijklmnopqrstuvwxyz'
- + 'ABCDEFGHIJKLMNOPQRSTUVWXYZ_';
- RegExprLineSeparators : RegExprString =// default value for LineSeparators
- #$d#$a{$IFDEF UniCode}+#$b#$c#$2028#$2029#$85{$ENDIF}; //###0.947
- RegExprLinePairedSeparator : RegExprString =// default value for LinePairedSeparator
- #$d#$a;
- { if You need Unix-styled line separators (only \n), then use:
- RegExprLineSeparators = #$a;
- RegExprLinePairedSeparator = '';
- }
- const
- NSUBEXP = 90; // max number of subexpression //###0.929
- // Cannot be more than NSUBEXPMAX
- // Be carefull - don't use values which overflow CLOSE opcode
- // (in this case you'll get compiler error).
- // Big NSUBEXP will cause more slow work and more stack required
- NSUBEXPMAX = 255; // Max possible value for NSUBEXP. //###0.945
- // Don't change it! It's defined by internal TRegExpr design.
- MaxBracesArg = $7FFFFFFF - 1; // max value for {n,m} arguments //###0.933
- {$IFDEF ComplexBraces}
- LoopStackMax = 10; // max depth of loops stack //###0.925
- {$ENDIF}
- TinySetLen = 3;
- // if range includes more then TinySetLen chars, //###0.934
- // then use full (32 bytes) ANYOFFULL instead of ANYOF[BUT]TINYSET
- // !!! Attension ! If you change TinySetLen, you must
- // change code marked as "//!!!TinySet"
- type
- {$IFDEF UseSetOfChar}
- PSetOfREChar = ^TSetOfREChar;
- TSetOfREChar = set of REChar;
- {$ENDIF}
- TRegExpr = class;
- TRegExprReplaceFunction = function (ARegExpr : TRegExpr): string
- of object;
- TRegExpr = class
- private
- startp : array [0 .. NSUBEXP - 1] of PRegExprChar; // founded expr starting points
- endp : array [0 .. NSUBEXP - 1] of PRegExprChar; // founded expr end points
- {$IFDEF ComplexBraces}
- LoopStack : array [1 .. LoopStackMax] of integer; // state before entering loop
- LoopStackIdx : integer; // 0 - out of all loops
- {$ENDIF}
- // The "internal use only" fields to pass info from compile
- // to execute that permits the execute phase to run lots faster on
- // simple cases.
- regstart : REChar; // char that must begin a match; '\0' if none obvious
- reganch : REChar; // is the match anchored (at beginning-of-line only)?
- regmust : PRegExprChar; // string (pointer into program) that match must include, or nil
- regmlen : PtrInt; // length of regmust string
- // Regstart and reganch permit very fast decisions on suitable starting points
- // for a match, cutting down the work a lot. Regmust permits fast rejection
- // of lines that cannot possibly match. The regmust tests are costly enough
- // that regcomp() supplies a regmust only if the r.e. contains something
- // potentially expensive (at present, the only such thing detected is * or +
- // at the start of the r.e., which can involve a lot of backup). Regmlen is
- // supplied because the test in regexec() needs it and regcomp() is computing
- // it anyway.
- {$IFDEF UseFirstCharSet} //###0.929
- FirstCharSet : TSetOfREChar;
- {$ENDIF}
- // work variables for Exec's routins - save stack in recursion}
- reginput : PRegExprChar; // String-input pointer.
- fInputStart : PRegExprChar; // Pointer to first char of input string.
- fInputEnd : PRegExprChar; // Pointer to char AFTER last char of input string
- // work variables for compiler's routines
- regparse : PRegExprChar; // Input-scan pointer.
- regnpar : PtrInt; // count.
- regdummy : char;
- regcode : PRegExprChar; // Code-emit pointer; @regdummy = don't.
- regsize : PtrInt; // Code size.
- regexpbeg : PRegExprChar; // only for error handling. Contains
- // pointer to beginning of r.e. while compiling
- fExprIsCompiled : boolean; // true if r.e. successfully compiled
- // programm is essentially a linear encoding
- // of a nondeterministic finite-state machine (aka syntax charts or
- // "railroad normal form" in parsing technology). Each node is an opcode
- // plus a "next" pointer, possibly plus an operand. "Next" pointers of
- // all nodes except BRANCH implement concatenation; a "next" pointer with
- // a BRANCH on both ends of it connects two alternatives. (Here we
- // have one of the subtle syntax dependencies: an individual BRANCH (as
- // opposed to a collection of them) is never concatenated with anything
- // because of operator precedence.) The operand of some types of node is
- // a literal string; for others, it is a node leading into a sub-FSM. In
- // particular, the operand of a BRANCH node is the first node of the branch.
- // (NB this is *not* a tree structure: the tail of the branch connects
- // to the thing following the set of BRANCHes.) The opcodes are:
- programm : PRegExprChar; // Unwarranted chumminess with compiler.
- fExpression : PRegExprChar; // source of compiled r.e.
- fInputString : PRegExprChar; // input string
- fLastError : integer; // see Error, LastError
- fModifiers : integer; // modifiers
- fCompModifiers : integer; // compiler's copy of modifiers
- fProgModifiers : integer; // modifiers values from last programm compilation
- fSpaceChars : RegExprString; //###0.927
- fWordChars : RegExprString; //###0.929
- fInvertCase : TRegExprInvertCaseFunction; //###0.927
- fLineSeparators : RegExprString; //###0.941
- fLinePairedSeparatorAssigned : boolean;
- fLinePairedSeparatorHead,
- fLinePairedSeparatorTail : REChar;
- {$IFNDEF UniCode}
- fLineSeparatorsSet : set of REChar;
- {$ENDIF}
- // Mark programm as having to be [re]compiled
- procedure InvalidateProgramm;
- // Check if we can use precompiled r.e. or
- // [re]compile it if something changed
- function IsProgrammOk : boolean; //###0.941
- function GetExpression : RegExprString;
- procedure SetExpression (const s : RegExprString);
- function GetModifierStr : RegExprString;
- // Parse AModifiers string and return true and set AModifiersInt
- // if it's in format 'ismxrg-ismxrg'.
- class function ParseModifiersStr (const AModifiers : RegExprString;
- var AModifiersInt : integer) : boolean; //###0.941 class function now
- procedure SetModifierStr (const AModifiers : RegExprString);
- function GetModifier (AIndex : integer) : boolean;
- procedure SetModifier (AIndex : integer; ASet : boolean);
- // Default handler raises exception ERegExpr with
- // Message = ErrorMsg (AErrorID), ErrorCode = AErrorID
- // and CompilerErrorPos = value of property CompilerErrorPos.
- procedure Error (AErrorID : integer); virtual; // error handler.
- {==================== Compiler section ===================}
- // compile a regular expression into internal code
- function CompileRegExpr (exp : PRegExprChar) : boolean;
- // set the next-pointer at the end of a node chain
- procedure Tail (p : PRegExprChar; val : PRegExprChar);
- // regoptail - regtail on operand of first argument; nop if operandless
- procedure OpTail (p : PRegExprChar; val : PRegExprChar);
- // regnode - emit a node, return location
- function EmitNode (op : TREOp) : PRegExprChar;
- // emit (if appropriate) a byte of code
- procedure EmitC (b : REChar);
- // insert an operator in front of already-emitted operand
- // Means relocating the operand.
- procedure InsertOperator (op : TREOp; opnd : PRegExprChar; sz : integer); //###0.90
- // regular expression, i.e. main body or parenthesized thing
- function ParseReg (paren : integer; var flagp : integer) : PRegExprChar;
- // one alternative of an | operator
- function ParseBranch (var flagp : integer) : PRegExprChar;
- // something followed by possible [*+?]
- function ParsePiece (var flagp : integer) : PRegExprChar;
- // the lowest level
- function ParseAtom (var flagp : integer) : PRegExprChar;
- // current pos in r.e. - for error hanling
- function GetCompilerErrorPos : PtrInt;
- {$IFDEF UseFirstCharSet} //###0.929
- procedure FillFirstCharSet (prog : PRegExprChar);
- {$ENDIF}
- {===================== Matching section ===================}
- // repeatedly match something simple, report how many
- function regrepeat (p : PRegExprChar; AMax : PtrInt) : PtrInt;
- // dig the "next" pointer out of a node
- function regnext (p : PRegExprChar) : PRegExprChar;
- // recursively matching routine
- function MatchPrim (prog : PRegExprChar) : boolean;
- // Exec for stored InputString
- function ExecPrim (AOffset: PtrInt) : boolean;
- {$IFDEF RegExpPCodeDump}
- function DumpOp (op : REChar) : RegExprString;
- {$ENDIF}
- function GetSubExprMatchCount : integer;
- function GetMatchPos (Idx : integer) : PtrInt;
- function GetMatchLen (Idx : integer) : PtrInt;
- function GetMatch (Idx : integer) : RegExprString;
- function GetInputString : RegExprString;
- procedure SetInputString (const AInputString : RegExprString);
- {$IFNDEF UseSetOfChar}
- function StrScanCI (s : PRegExprChar; ch : REChar) : PRegExprChar; //###0.928
- {$ENDIF}
- procedure SetLineSeparators (const AStr : RegExprString);
- procedure SetLinePairedSeparator (const AStr : RegExprString);
- function GetLinePairedSeparator : RegExprString;
- public
- constructor Create; overload;
- constructor Create(AExpression:string); overload;
- destructor Destroy; override;
- class function VersionMajor : integer; //###0.944
- class function VersionMinor : integer; //###0.944
- // Regular expression.
- // For optimization, TRegExpr will automatically compiles it into 'P-code'
- // (You can see it with help of Dump method) and stores in internal
- // structures. Real [re]compilation occures only when it really needed -
- // while calling Exec[Next], Substitute, Dump, etc
- // and only if Expression or other P-code affected properties was changed
- // after last [re]compilation.
- // If any errors while [re]compilation occures, Error method is called
- // (by default Error raises exception - see below)
- property Expression : RegExprString read GetExpression write SetExpression;
- // Set/get default values of r.e.syntax modifiers. Modifiers in
- // r.e. (?ismx-ismx) will replace this default values.
- // If you try to set unsupported modifier, Error will be called
- // (by defaul Error raises exception ERegExpr).
- property ModifierStr : RegExprString read GetModifierStr write SetModifierStr;
- // Modifier /i - caseinsensitive, initialized from RegExprModifierI
- property ModifierI : boolean index 1 read GetModifier write SetModifier;
- // Modifier /r - use r.e.syntax extended for russian,
- // (was property ExtSyntaxEnabled in previous versions)
- // If true, then а-я additional include russian letter 'ё',
- // А-Я additional include 'Ё', and а-Я include all russian symbols.
- // You have to turn it off if it can interfere with you national alphabet.
- // , initialized from RegExprModifierR
- property ModifierR : boolean index 2 read GetModifier write SetModifier;
- // Modifier /s - '.' works as any char (else as [^\n]),
- // , initialized from RegExprModifierS
- property ModifierS : boolean index 3 read GetModifier write SetModifier;
- // Switching off modifier /g switchs all operators in
- // non-greedy style, so if ModifierG = False, then
- // all '*' works as '*?', all '+' as '+?' and so on.
- // , initialized from RegExprModifierG
- property ModifierG : boolean index 4 read GetModifier write SetModifier;
- // Treat string as multiple lines. That is, change `^' and `$' from
- // matching at only the very start or end of the string to the start
- // or end of any line anywhere within the string.
- // , initialized from RegExprModifierM
- property ModifierM : boolean index 5 read GetModifier write SetModifier;
- // Modifier /x - eXtended syntax, allow r.e. text formatting,
- // see description in the help. Initialized from RegExprModifierX
- property ModifierX : boolean index 6 read GetModifier write SetModifier;
- // match a programm against a string AInputString
- // !!! Exec store AInputString into InputString property
- // For Delphi 5 and higher available overloaded versions - first without
- // parameter (uses already assigned to InputString property value)
- // and second that has PtrInt parameter and is same as ExecPos
- function Exec (const AInputString : RegExprString) : boolean; {$IFDEF OverMeth} overload;
- {$IFNDEF FPC} // I do not know why FreePascal cannot overload methods with empty param list
- function Exec : boolean; overload; //###0.949
- {$ENDIF}
- function Exec (AOffset: PtrInt) : boolean; overload; //###0.949
- {$ENDIF}
- // find next match:
- // ExecNext;
- // works the same as
- // if MatchLen [0] = 0 then ExecPos (MatchPos [0] + 1)
- // else ExecPos (MatchPos [0] + MatchLen [0]);
- // but it's more simpler !
- // Raises exception if used without preceeding SUCCESSFUL call to
- // Exec* (Exec, ExecPos, ExecNext). So You always must use something like
- // if Exec (InputString) then repeat { proceed results} until not ExecNext;
- function ExecNext : boolean;
- // find match for InputString starting from AOffset position
- // (AOffset=1 - first char of InputString)
- function ExecPos (AOffset: PtrInt {$IFDEF DefParam}= 1{$ENDIF}) : boolean;
- // returns current input string (from last Exec call or last assign
- // to this property).
- // Any assignment to this property clear Match* properties !
- property InputString : RegExprString read GetInputString write SetInputString;
- // Returns ATemplate with '$&' or '$0' replaced by whole r.e.
- // occurence and '$n' replaced by occurence of subexpression #n.
- // Since v.0.929 '$' used instead of '\' (for future extensions
- // and for more Perl-compatibility) and accept more then one digit.
- // If you want place into template raw '$' or '\', use prefix '\'
- // Example: '1\$ is $2\\rub\\' -> '1$ is <Match[2]>\rub\'
- // If you want to place raw digit after '$n' you must delimit
- // n with curly braces '{}'.
- // Example: 'a$12bc' -> 'a<Match[12]>bc'
- // 'a${1}2bc' -> 'a<Match[1]>2bc'.
- function Substitute (const ATemplate : RegExprString) : RegExprString;
- // Split AInputStr into APieces by r.e. occurencies
- // Internally calls Exec[Next]
- procedure Split (AInputStr : RegExprString; APieces : TStrings);
- function Replace (AInputStr : RegExprString;
- const AReplaceStr : RegExprString;
- AUseSubstitution : boolean{$IFDEF DefParam}= False{$ENDIF}) //###0.946
- : RegExprString; {$IFDEF OverMeth} overload;
- function Replace (AInputStr : RegExprString;
- AReplaceFunc : TRegExprReplaceFunction)
- : RegExprString; overload;
- {$ENDIF}
- // Returns AInputStr with r.e. occurencies replaced by AReplaceStr
- // If AUseSubstitution is true, then AReplaceStr will be used
- // as template for Substitution methods.
- // For example:
- // Expression := '({-i}block|var)\s*\(\s*([^ ]*)\s*\)\s*';
- // Replace ('BLOCK( test1)', 'def "$1" value "$2"', True);
- // will return: def 'BLOCK' value 'test1'
- // Replace ('BLOCK( test1)', 'def "$1" value "$2"')
- // will return: def "$1" value "$2"
- // Internally calls Exec[Next]
- // Overloaded version and ReplaceEx operate with call-back function,
- // so you can implement really complex functionality.
- function ReplaceEx (AInputStr : RegExprString;
- AReplaceFunc : TRegExprReplaceFunction):
- RegExprString;
- // Number of subexpressions has been found in last Exec* call.
- // If there are no subexpr. but whole expr was found (Exec* returned True),
- // then SubExprMatchCount=0, if no subexpressions nor whole
- // r.e. found (Exec* returned false) then SubExprMatchCount=-1.
- // Note, that some subexpr. may be not found and for such
- // subexpr. MathPos=MatchLen=-1 and Match=''.
- // For example: Expression := '(1)?2(3)?';
- // Exec ('123'): SubExprMatchCount=2, Match[0]='123', [1]='1', [2]='3'
- // Exec ('12'): SubExprMatchCount=1, Match[0]='12', [1]='1'
- // Exec ('23'): SubExprMatchCount=2, Match[0]='23', [1]='', [2]='3'
- // Exec ('2'): SubExprMatchCount=0, Match[0]='2'
- // Exec ('7') - return False: SubExprMatchCount=-1
- property SubExprMatchCount : integer read GetSubExprMatchCount;
- // pos of entrance subexpr. #Idx into tested in last Exec*
- // string. First subexpr. has Idx=1, last - MatchCount,
- // whole r.e. has Idx=0.
- // Returns -1 if in r.e. no such subexpr. or this subexpr.
- // not found in input string.
- property MatchPos [Idx : integer] : PtrInt read GetMatchPos;
- // len of entrance subexpr. #Idx r.e. into tested in last Exec*
- // string. First subexpr. has Idx=1, last - MatchCount,
- // whole r.e. has Idx=0.
- // Returns -1 if in r.e. no such subexpr. or this subexpr.
- // not found in input string.
- // Remember - MatchLen may be 0 (if r.e. match empty string) !
- property MatchLen [Idx : integer] : PtrInt read GetMatchLen;
- // == copy (InputString, MatchPos [Idx], MatchLen [Idx])
- // Returns '' if in r.e. no such subexpr. or this subexpr.
- // not found in input string.
- property Match [Idx : integer] : RegExprString read GetMatch;
- // Returns ID of last error, 0 if no errors (unusable if
- // Error method raises exception) and clear internal status
- // into 0 (no errors).
- function LastError : integer;
- // Returns Error message for error with ID = AErrorID.
- function ErrorMsg (AErrorID : integer) : RegExprString; virtual;
- // Returns position in r.e. where compiler stopped.
- // Useful for error diagnostics
- property CompilerErrorPos : PtrInt read GetCompilerErrorPos;
- // Contains chars, treated as /s (initially filled with RegExprSpaceChars
- // global constant)
- property SpaceChars : RegExprString read fSpaceChars write fSpaceChars; //###0.927
- // Contains chars, treated as /w (initially filled with RegExprWordChars
- // global constant)
- property WordChars : RegExprString read fWordChars write fWordChars; //###0.929
- // line separators (like \n in Unix)
- property LineSeparators : RegExprString read fLineSeparators write SetLineSeparators; //###0.941
- // paired line separator (like \r\n in DOS and Windows).
- // must contain exactly two chars or no chars at all
- property LinePairedSeparator : RegExprString read GetLinePairedSeparator write SetLinePairedSeparator; //###0.941
- // Converts Ch into upper case if it in lower case or in lower
- // if it in upper (uses current system local setings)
- class function InvertCaseFunction (const Ch : REChar) : REChar;
- // Set this property if you want to override case-insensitive functionality.
- // Create set it to RegExprInvertCaseFunction (InvertCaseFunction by default)
- property InvertCase : TRegExprInvertCaseFunction read fInvertCase write fInvertCase; //##0.935
- // [Re]compile r.e. Useful for example for GUI r.e. editors (to check
- // all properties validity).
- procedure Compile; //###0.941
- {$IFDEF RegExpPCodeDump}
- // dump a compiled regexp in vaguely comprehensible form
- function Dump : RegExprString;
- {$ENDIF}
- end;
- ERegExpr = class (Exception)
- public
- ErrorCode : integer;
- CompilerErrorPos : PtrInt;
- end;
- const
- // default for InvertCase property:
- RegExprInvertCaseFunction : TRegExprInvertCaseFunction = {$IFDEF FPC} nil {$ELSE} TRegExpr.InvertCaseFunction{$ENDIF};
- // true if string AInputString match regular expression ARegExpr
- // ! will raise exeption if syntax errors in ARegExpr
- function ExecRegExpr (const ARegExpr, AInputStr : RegExprString) : boolean;
- // Split AInputStr into APieces by r.e. ARegExpr occurencies
- procedure SplitRegExpr (const ARegExpr, AInputStr : RegExprString; APieces : TStrings);
- // Returns AInputStr with r.e. occurencies replaced by AReplaceStr
- // If AUseSubstitution is true, then AReplaceStr will be used
- // as template for Substitution methods.
- // For example:
- // ReplaceRegExpr ('({-i}block|var)\s*\(\s*([^ ]*)\s*\)\s*',
- // 'BLOCK( test1)', 'def "$1" value "$2"', True)
- // will return: def 'BLOCK' value 'test1'
- // ReplaceRegExpr ('({-i}block|var)\s*\(\s*([^ ]*)\s*\)\s*',
- // 'BLOCK( test1)', 'def "$1" value "$2"')
- // will return: def "$1" value "$2"
- function ReplaceRegExpr (const ARegExpr, AInputStr, AReplaceStr : RegExprString;
- AUseSubstitution : boolean{$IFDEF DefParam}= False{$ENDIF}) : RegExprString; //###0.947
- // Replace all metachars with its safe representation,
- // for example 'abc$cd.(' converts into 'abc\$cd\.\('
- // This function useful for r.e. autogeneration from
- // user input
- function QuoteRegExprMetaChars (const AStr : RegExprString) : RegExprString;
- // Makes list of subexpressions found in ARegExpr r.e.
- // In ASubExps every item represent subexpression,
- // from first to last, in format:
- // String - subexpression text (without '()')
- // low word of Object - starting position in ARegExpr, including '('
- // if exists! (first position is 1)
- // high word of Object - length, including starting '(' and ending ')'
- // if exist!
- // AExtendedSyntax - must be True if modifier /m will be On while
- // using the r.e.
- // Useful for GUI editors of r.e. etc (You can find example of using
- // in TestRExp.dpr project)
- // Returns
- // 0 Success. No unbalanced brackets was found;
- // -1 There are not enough closing brackets ')';
- // -(n+1) At position n was found opening '[' without //###0.942
- // corresponding closing ']';
- // n At position n was found closing bracket ')' without
- // corresponding opening '('.
- // If Result <> 0, then ASubExpr can contain empty items or illegal ones
- function RegExprSubExpressions (const ARegExpr : string;
- ASubExprs : TStrings; AExtendedSyntax : boolean{$IFDEF DefParam}= False{$ENDIF}) : PtrInt;
- implementation
- {$IFDEF FPC}
- {$ELSE}
- uses
- {$IFDEF SYN_WIN32}
- Windows; // CharUpper/Lower
- {$ELSE}
- Libc; //Qt.pas from Borland does not expose char handling functions
- {$ENDIF}
- {$ENDIF}
- const
- // TRegExpr.VersionMajor/Minor return values of these constants:
- TRegExprVersionMajor : integer = 0;
- TRegExprVersionMinor : integer = 952;
- MaskModI = 1; // modifier /i bit in fModifiers
- MaskModR = 2; // -"- /r
- MaskModS = 4; // -"- /s
- MaskModG = 8; // -"- /g
- MaskModM = 16; // -"- /m
- MaskModX = 32; // -"- /x
- {$IFDEF UniCode}
- XIgnoredChars = ' '#9#$d#$a;
- {$ELSE}
- XIgnoredChars = [' ', #9, #$d, #$a];
- {$ENDIF}
- function AlignToPtr(const p: Pointer): Pointer;
- begin
- {$IFDEF FPC_REQUIRES_PROPER_ALIGNMENT}
- Result := Align(p, SizeOf(Pointer));
- {$ELSE}
- Result := p;
- {$ENDIF}
- end;
- function AlignToInt(const p: Pointer): Pointer;
- begin
- {$IFDEF FPC_REQUIRES_PROPER_ALIGNMENT}
- Result := Align(p, SizeOf(integer));
- {$ELSE}
- Result := p;
- {$ENDIF}
- end;
- {=============================================================}
- {=================== WideString functions ====================}
- {=============================================================}
- {$IFDEF UniCode}
- function StrPCopy (Dest: PRegExprChar; const Source: RegExprString): PRegExprChar;
- var
- i, Len : PtrInt;
- begin
- Len := length (Source); //###0.932
- if Len>0 then
- move(Source[1],Dest[0],Len*sizeof(ReChar));
- Dest [Len] := #0;
- Result := Dest;
- end; { of function StrPCopy
- --------------------------------------------------------------}
- function StrLCopy (Dest, Source: PRegExprChar; MaxLen: PtrUInt): PRegExprChar;
- var i: PtrInt;
- begin
- if MaxLen>0 then
- move(Source[0],Dest[0],MaxLen*sizeof(ReChar));
- Result := Dest;
- end; { of function StrLCopy
- --------------------------------------------------------------}
- function StrLen (Str: PRegExprChar): PtrUInt;
- begin
- Result:=0;
- while Str [result] <> #0
- do Inc (Result);
- end; { of function StrLen
- --------------------------------------------------------------}
- function StrPos (Str1, Str2: PRegExprChar): PRegExprChar;
- var n: PtrInt;
- begin
- Result := nil;
- n := Pos (RegExprString (Str2), RegExprString (Str1));
- if n = 0
- then EXIT;
- Result := Str1 + n - 1;
- end; { of function StrPos
- --------------------------------------------------------------}
- function StrLComp (Str1, Str2: PRegExprChar; MaxLen: PtrUInt): PtrInt;
- var S1, S2: RegExprString;
- begin
- S1 := Str1;
- S2 := Str2;
- if Copy (S1, 1, MaxLen) > Copy (S2, 1, MaxLen)
- then Result := 1
- else
- if Copy (S1, 1, MaxLen) < Copy (S2, 1, MaxLen)
- then Result := -1
- else Result := 0;
- end; { function StrLComp
- --------------------------------------------------------------}
- function StrScan (Str: PRegExprChar; Chr: WideChar): PRegExprChar;
- begin
- Result := nil;
- while (Str^ <> #0) and (Str^ <> Chr)
- do Inc (Str);
- if (Str^ <> #0)
- then Result := Str;
- end; { of function StrScan
- --------------------------------------------------------------}
- {$ENDIF}
- {=============================================================}
- {===================== Global functions ======================}
- {=============================================================}
- function ExecRegExpr (const ARegExpr, AInputStr : RegExprString) : boolean;
- var r : TRegExpr;
- begin
- r := TRegExpr.Create;
- try
- r.Expression := ARegExpr;
- Result := r.Exec (AInputStr);
- finally r.Free;
- end;
- end; { of function ExecRegExpr
- --------------------------------------------------------------}
- procedure SplitRegExpr (const ARegExpr, AInputStr : RegExprString; APieces : TStrings);
- var r : TRegExpr;
- begin
- APieces.Clear;
- r := TRegExpr.Create;
- try
- r.Expression := ARegExpr;
- r.Split (AInputStr, APieces);
- finally r.Free;
- end;
- end; { of procedure SplitRegExpr
- --------------------------------------------------------------}
- function ReplaceRegExpr (const ARegExpr, AInputStr, AReplaceStr : RegExprString;
- AUseSubstitution : boolean{$IFDEF DefParam}= False{$ENDIF}) : RegExprString;
- begin
- with TRegExpr.Create do try
- Expression := ARegExpr;
- Result := Replace (AInputStr, AReplaceStr, AUseSubstitution);
- finally Free;
- end;
- end; { of function ReplaceRegExpr
- --------------------------------------------------------------}
- function QuoteRegExprMetaChars (const AStr : RegExprString) : RegExprString;
- const
- RegExprMetaSet : RegExprString = '^$.[()|?+*'+EscChar+'{'
- + ']}'; // - this last are additional to META.
- // Very similar to META array, but slighly changed.
- // !Any changes in META array must be synchronized with this set.
- var
- i, i0, Len : PtrInt;
- begin
- Result := '';
- Len := length (AStr);
- i := 1;
- i0 := i;
- while i <= Len do begin
- if Pos (AStr [i], RegExprMetaSet) > 0 then begin
- Result := Result + System.Copy (AStr, i0, i - i0)
- + EscChar + AStr [i];
- i0 := i + 1;
- end;
- inc (i);
- end;
- Result := Result + System.Copy (AStr, i0, MaxInt); // Tail
- end; { of function QuoteRegExprMetaChars
- --------------------------------------------------------------}
- function RegExprSubExpressions (const ARegExpr : string;
- ASubExprs : TStrings; AExtendedSyntax : boolean{$IFDEF DefParam}= False{$ENDIF}) : PtrInt;
- type
- TStackItemRec = record //###0.945
- SubExprIdx : integer;
- StartPos : PtrInt;
- end;
- TStackArray = packed array [0 .. NSUBEXPMAX - 1] of TStackItemRec;
- var
- Len, SubExprLen : PtrInt;
- i, i0 : PtrInt;
- Modif : integer;
- Stack : ^TStackArray; //###0.945
- StackIdx, StackSz : PtrInt;
- begin
- Result := 0; // no unbalanced brackets found at this very moment
- ASubExprs.Clear; // I don't think that adding to non empty list
- // can be useful, so I simplified algorithm to work only with empty list
- Len := length (ARegExpr); // some optimization tricks
- // first we have to calculate number of subexpression to reserve
- // space in Stack array (may be we'll reserve more than needed, but
- // it's faster then memory reallocation during parsing)
- StackSz := 1; // add 1 for entire r.e.
- for i := 1 to Len do
- if ARegExpr [i] = '('
- then inc (StackSz);
- // SetLength (Stack, StackSz); //###0.945
- GetMem (Stack, SizeOf (TStackItemRec) * StackSz);
- try
- StackIdx := 0;
- i := 1;
- while (i <= Len) do begin
- case ARegExpr [i] of
- '(': begin
- if (i < Len) and (ARegExpr [i + 1] = '?') then begin
- // this is not subexpression, but comment or other
- // Perl extension. We must check is it (?ismxrg-ismxrg)
- // and change AExtendedSyntax if /x is changed.
- inc (i, 2); // skip '(?'
- i0 := i;
- while (i <= Len) and (ARegExpr [i] <> ')')
- do inc (i);
- if i > Len
- then Result := -1 // unbalansed '('
- else
- if TRegExpr.ParseModifiersStr (System.Copy (ARegExpr, i, i - i0), Modif)
- then AExtendedSyntax := (Modif and MaskModX) <> 0;
- end
- else begin // subexpression starts
- ASubExprs.Add (''); // just reserve space
- with Stack [StackIdx] do begin
- SubExprIdx := ASubExprs.Count - 1;
- StartPos := i;
- end;
- inc (StackIdx);
- end;
- end;
- ')': begin
- if StackIdx = 0
- then Result := i // unbalanced ')'
- else begin
- dec (StackIdx);
- with Stack [StackIdx] do begin
- SubExprLen := i - StartPos + 1;
- ASubExprs.Objects [SubExprIdx] :=
- TObject (StartPos or (SubExprLen ShL 16));
- ASubExprs [SubExprIdx] := System.Copy (
- ARegExpr, StartPos + 1, SubExprLen - 2); // add without brackets
- end;
- end;
- end;
- EscChar: inc (i); // skip quoted symbol
- '[': begin
- // we have to skip character ranges at once, because they can
- // contain '#', and '#' in it must NOT be recognized as eXtended
- // comment beginning!
- i0 := i;
- inc (i);
- if ARegExpr [i] = ']' // cannot be 'emty' ranges - this interpretes
- then inc (i); // as ']' by itself
- while (i <= Len) and (ARegExpr [i] <> ']') do
- if ARegExpr [i] = EscChar //###0.942
- then inc (i, 2) // skip 'escaped' char to prevent stopping at '\]'
- else inc (i);
- if (i > Len) or (ARegExpr [i] <> ']') //###0.942
- then Result := - (i0 + 1); // unbalansed '[' //###0.942
- end;
- '#': if AExtendedSyntax then begin
- // skip eXtended comments
- while (i <= Len) and (ARegExpr [i] <> #$d) and (ARegExpr [i] <> #$a)
- // do not use [#$d, #$a] due to UniCode compatibility
- do inc (i);
- while (i + 1 <= Len) and ((ARegExpr [i + 1] = #$d) or (ARegExpr [i + 1] = #$a))
- do inc (i); // attempt to work with different kinds of line separators
- // now we are at the line separator that must be skipped.
- end;
- // here is no 'else' clause - we simply skip ordinary chars
- end; // of case
- inc (i); // skip scanned char
- // ! can move after Len due to skipping quoted symbol
- end;
- // check brackets balance
- if StackIdx <> 0
- then Result := -1; // unbalansed '('
- // check if entire r.e. added
- if (ASubExprs.Count = 0)
- or ((PtrInt (ASubExprs.Objects [0]) and $FFFF) <> 1)
- or (((PtrInt (ASubExprs.Objects [0]) ShR 16) and $FFFF) <> Len)
- // whole r.e. wasn't added because it isn't bracketed
- // well, we add it now:
- then ASubExprs.InsertObject (0, ARegExpr, TObject ((Len ShL 16) or 1));
- finally FreeMem (Stack);
- end;
- end; { of function RegExprSubExpressions
- --------------------------------------------------------------}
- const
- MAGIC = TREOp (216);// programm signature
- // name opcode opnd? meaning
- EEND = TREOp (0); // - End of program
- BOL = TREOp (1); // - Match "" at beginning of line
- EOL = TREOp (2); // - Match "" at end of line
- ANY = TREOp (3); // - Match any one character
- ANYOF = TREOp (4); // Str Match any character in string Str
- ANYBUT = TREOp (5); // Str Match any char. not in string Str
- BRANCH = TREOp (6); // Node Match this alternative, or the next
- BACK = TREOp (7); // - Jump backward (Next < 0)
- EXACTLY = TREOp (8); // Str Match string Str
- NOTHING = TREOp (9); // - Match empty string
- STAR = TREOp (10); // Node Match this (simple) thing 0 or more times
- PLUS = TREOp (11); // Node Match this (simple) thing 1 or more times
- ANYDIGIT = TREOp (12); // - Match any digit (equiv [0-9])
- NOTDIGIT = TREOp (13); // - Match not digit (equiv [0-9])
- ANYLETTER = TREOp (14); // - Match any letter from property WordChars
- NOTLETTER = TREOp (15); // - Match not letter from property WordChars
- ANYSPACE = TREOp (16); // - Match any space char (see property SpaceChars)
- NOTSPACE = TREOp (17); // - Match not space char (see property SpaceChars)
- BRACES = TREOp (18); // Node,Min,Max Match this (simple) thing from Min to Max times.
- // Min and Max are TREBracesArg
- COMMENT = TREOp (19); // - Comment ;)
- EXACTLYCI = TREOp (20); // Str Match string Str case insensitive
- ANYOFCI = TREOp (21); // Str Match any character in string Str, case insensitive
- ANYBUTCI = TREOp (22); // Str Match any char. not in string Str, case insensitive
- LOOPENTRY = TREOp (23); // Node Start of loop (Node - LOOP for this loop)
- LOOP = TREOp (24); // Node,Min,Max,LoopEntryJmp - back jump for LOOPENTRY.
- // Min and Max are TREBracesArg
- // Node - next node in sequence,
- // LoopEntryJmp - associated LOOPENTRY node addr
- ANYOFTINYSET= TREOp (25); // Chrs Match any one char from Chrs (exactly TinySetLen chars)
- ANYBUTTINYSET=TREOp (26); // Chrs Match any one char not in Chrs (exactly TinySetLen chars)
- ANYOFFULLSET= TREOp (27); // Set Match any one char from set of char
- // - very fast (one CPU instruction !) but takes 32 bytes of p-code
- BSUBEXP = TREOp (28); // Idx Match previously matched subexpression #Idx (stored as REChar) //###0.936
- BSUBEXPCI = TREOp (29); // Idx -"- in case-insensitive mode
- // Non-Greedy Style Ops //###0.940
- STARNG = TREOp (30); // Same as START but in non-greedy mode
- PLUSNG = TREOp (31); // Same as PLUS but in non-greedy mode
- BRACESNG = TREOp (32); // Same as BRACES but in non-greedy mode
- LOOPNG = TREOp (33); // Same as LOOP but in non-greedy mode
- // Multiline mode \m
- BOLML = TREOp (34); // - Match "" at beginning of line
- EOLML = TREOp (35); // - Match "" at end of line
- ANYML = TREOp (36); // - Match any one character
- // Word boundary
- BOUND = TREOp (37); // Match "" between words //###0.943
- NOTBOUND = TREOp (38); // Match "" not between words //###0.943
- // !!! Change OPEN value if you add new opcodes !!!
- OPEN = TREOp (39); // - Mark this point in input as start of \n
- // OPEN + 1 is \1, etc.
- CLOSE = TREOp (ord (OPEN) + NSUBEXP);
- // - Analogous to OPEN.
- // !!! Don't add new OpCodes after CLOSE !!!
- // We work with p-code through pointers, compatible with PRegExprChar.
- // Note: all code components (TRENextOff, TREOp, TREBracesArg, etc)
- // must have lengths that can be divided by SizeOf (REChar) !
- // A node is TREOp of opcode followed Next "pointer" of TRENextOff type.
- // The Next is a offset from the opcode of the node containing it.
- // An operand, if any, simply follows the node. (Note that much of
- // the code generation knows about this implicit relationship!)
- // Using TRENextOff=PtrInt speed up p-code processing.
- // Opcodes description:
- //
- // BRANCH The set of branches constituting a single choice are hooked
- // together with their "next" pointers, since precedence prevents
- // anything being concatenated to any individual branch. The
- // "next" pointer of the last BRANCH in a choice points to the
- // thing following the whole choice. This is also where the
- // final "next" pointer of each individual branch points; each
- // branch starts with the operand node of a BRANCH node.
- // BACK Normal "next" pointers all implicitly point forward; BACK
- // exists to make loop structures possible.
- // STAR,PLUS,BRACES '?', and complex '*' and '+', are implemented as
- // circular BRANCH structures using BACK. Complex '{min,max}'
- // - as pair LOOPENTRY-LOOP (see below). Simple cases (one
- // character per match) are implemented with STAR, PLUS and
- // BRACES for speed and to minimize recursive plunges.
- // LOOPENTRY,LOOP {min,max} are implemented as special pair
- // LOOPENTRY-LOOP. Each LOOPENTRY initialize loopstack for
- // current level.
- // OPEN,CLOSE are numbered at compile time.
- {=============================================================}
- {================== Error handling section ===================}
- {=============================================================}
- const
- reeOk = 0;
- reeCompNullArgument = 100;
- reeCompRegexpTooBig = 101;
- reeCompParseRegTooManyBrackets = 102;
- reeCompParseRegUnmatchedBrackets = 103;
- reeCompParseRegUnmatchedBrackets2 = 104;
- reeCompParseRegJunkOnEnd = 105;
- reePlusStarOperandCouldBeEmpty = 106;
- reeNestedSQP = 107;
- reeBadHexDigit = 108;
- reeInvalidRange = 109;
- reeParseAtomTrailingBackSlash = 110;
- reeNoHexCodeAfterBSlashX = 111;
- reeHexCodeAfterBSlashXTooBig = 112;
- reeUnmatchedSqBrackets = 113;
- reeInternalUrp = 114;
- reeQPSBFollowsNothing = 115;
- reeTrailingBackSlash = 116;
- reeRarseAtomInternalDisaster = 119;
- reeBRACESArgTooBig = 122;
- reeBracesMinParamGreaterMax = 124;
- reeUnclosedComment = 125;
- reeComplexBracesNotImplemented = 126;
- reeUrecognizedModifier = 127;
- reeBadLinePairedSeparator = 128;
- reeRegRepeatCalledInappropriately = 1000;
- reeMatchPrimMemoryCorruption = 1001;
- reeMatchPrimCorruptedPointers = 1002;
- reeNoExpression = 1003;
- reeCorruptedProgram = 1004;
- reeNoInputStringSpecified = 1005;
- reeOffsetMustBeGreaterThen0 = 1006;
- reeExecNextWithoutExec = 1007;
- reeGetInputStringWithoutInputString = 1008;
- reeDumpCorruptedOpcode = 1011;
- reeModifierUnsupported = 1013;
- reeLoopStackExceeded = 1014;
- reeLoopWithoutEntry = 1015;
- reeBadPCodeImported = 2000;
- function TRegExpr.ErrorMsg (AErrorID : integer) : RegExprString;
- begin
- case AErrorID of
- reeOk: Result := 'No errors';
- reeCompNullArgument: Result := 'TRegExpr(comp): Null Argument';
- reeCompRegexpTooBig: Result := 'TRegExpr(comp): Regexp Too Big';
- reeCompParseRegTooManyBrackets: Result := 'TRegExpr(comp): ParseReg Too Many ()';
- reeCompParseRegUnmatchedBrackets: Result := 'TRegExpr(comp): ParseReg Unmatched ()';
- reeCompParseRegUnmatchedBrackets2: Result := 'TRegExpr(comp): ParseReg Unmatched ()';
- reeCompParseRegJunkOnEnd: Result := 'TRegExpr(comp): ParseReg Junk On End';
- reePlusStarOperandCouldBeEmpty: Result := 'TRegExpr(comp): *+ Operand Could Be Empty';
- reeNestedSQP: Result := 'TRegExpr(comp): Nested *?+';
- reeBadHexDigit: Result := 'TRegExpr(comp): Bad Hex Digit';
- reeInvalidRange: Result := 'TRegExpr(comp): Invalid [] Range';
- reeParseAtomTrailingBackSlash: Result := 'TRegExpr(comp): Parse Atom Trailing \';
- reeNoHexCodeAfterBSlashX: Result := 'TRegExpr(comp): No Hex Code After \x';
- reeHexCodeAfterBSlashXTooBig: Result := 'TRegExpr(comp): Hex Code After \x Is Too Big';
- reeUnmatchedSqBrackets: Result := 'TRegExpr(comp): Unmatched []';
- reeInternalUrp: Result := 'TRegExpr(comp): Internal Urp';
- reeQPSBFollowsNothing: Result := 'TRegExpr(comp): ?+*{ Follows Nothing';
- reeTrailingBackSlash: Result := 'TRegExpr(comp): Trailing \';
- reeRarseAtomInternalDisaster: Result := 'TRegExpr(comp): RarseAtom Internal Disaster';
- reeBRACESArgTooBig: Result := 'TRegExpr(comp): BRACES Argument Too Big';
- reeBracesMinParamGreaterMax: Result := 'TRegExpr(comp): BRACE Min Param Greater then Max';
- reeUnclosedComment: Result := 'TRegExpr(comp): Unclosed (?#Comment)';
- reeComplexBracesNotImplemented: Result := 'TRegExpr(comp): If you want take part in beta-testing BRACES ''{min,max}'' and non-greedy ops ''*?'', ''+?'', ''??'' for complex cases - remove ''.'' from {.$DEFINE ComplexBraces}';
- reeUrecognizedModifier: Result := 'TRegExpr(comp): Urecognized Modifier';
- reeBadLinePairedSeparator: Result := 'TRegExpr(comp): LinePairedSeparator must countain two different chars or no chars at all';
- reeRegRepeatCalledInappropriately: Result := 'TRegExpr(exec): RegRepeat Called Inappropriately';
- reeMatchPrimMemoryCorruption: Result := 'TRegExpr(exec): MatchPrim Memory Corruption';
- reeMatchPrimCorruptedPointers: Result := 'TRegExpr(exec): MatchPrim Corrupted Pointers';
- reeNoExpression: Result := 'TRegExpr(exec): Not Assigned Expression Property';
- reeCorruptedProgram: Result := 'TRegExpr(exec): Corrupted Program';
- reeNoInputStringSpecified: Result := 'TRegExpr(exec): No Input String Specified';
- reeOffsetMustBeGreaterThen0: Result := 'TRegExpr(exec): Offset Must Be Greater Then 0';
- reeExecNextWithoutExec: Result := 'TRegExpr(exec): ExecNext Without Exec[Pos]';
- reeGetInputStringWithoutInputString: Result := 'TRegExpr(exec): GetInputString Without InputString';
- reeDumpCorruptedOpcode: Result := 'TRegExpr(dump): Corrupted Opcode';
- reeLoopStackExceeded: Result := 'TRegExpr(exec): Loop Stack Exceeded';
- reeLoopWithoutEntry: Result := 'TRegExpr(exec): Loop Without LoopEntry !';
- reeBadPCodeImported: Result := 'TRegExpr(misc): Bad p-code imported';
- else Result := 'Unknown error';
- end;
- end; { of procedure TRegExpr.Error
- --------------------------------------------------------------}
- function TRegExpr.LastError : integer;
- begin
- Result := fLastError;
- fLastError := reeOk;
- end; { of function TRegExpr.LastError
- --------------------------------------------------------------}
- {=============================================================}
- {===================== Common section ========================}
- {=============================================================}
- class function TRegExpr.VersionMajor : integer; //###0.944
- begin
- Result := TRegExprVersionMajor;
- end; { of class function TRegExpr.VersionMajor
- --------------------------------------------------------------}
- class function TRegExpr.VersionMinor : integer; //###0.944
- begin
- Result := TRegExprVersionMinor;
- end; { of class function TRegExpr.VersionMinor
- --------------------------------------------------------------}
- constructor TRegExpr.Create;
- begin
- inherited;
- programm := nil;
- fExpression := nil;
- fInputString := nil;
- regexpbeg := nil;
- fExprIsCompiled := false;
- ModifierI := RegExprModifierI;
- ModifierR := RegExprModifierR;
- ModifierS := RegExprModifierS;
- ModifierG := RegExprModifierG;
- ModifierM := RegExprModifierM; //###0.940
- SpaceChars := RegExprSpaceChars; //###0.927
- WordChars := RegExprWordChars; //###0.929
- fInvertCase := RegExprInvertCaseFunction; //###0.927
- fLineSeparators := RegExprLineSeparators; //###0.941
- LinePairedSeparator := RegExprLinePairedSeparator; //###0.941
- end; { of constructor TRegExpr.Create
- --------------------------------------------------------------}
- constructor TRegExpr.Create(AExpression:string);
- begin
- create;
- Expression:=AExpression;
- end;
- destructor TRegExpr.Destroy;
- begin
- if programm <> nil then
- begin
- FreeMem (programm);
- programm:=nil;
- end;
- if fExpression <> nil then
- begin
- FreeMem (fExpression);
- fExpression:=nil;
- end;
- if fInputString <> nil then
- begin
- FreeMem (fInputString);
- fInputString:=nil;
- end;
- end; { of destructor TRegExpr.Destroy
- --------------------------------------------------------------}
- class function TRegExpr.InvertCaseFunction (const Ch : REChar) : REChar;
- begin
- {$IFDEF UniCode}
- if Ch >= #128
- then Result := Ch
- else
- {$ENDIF}
- begin
- Result := {$IFDEF FPC}AnsiUpperCase (Ch) [1]{$ELSE} {$IFDEF SYN_WIN32}REChar (CharUpper (PChar (Ch))){$ELSE}REChar (toupper (integer (Ch))){$ENDIF} {$ENDIF};
- if Result = Ch
- then Result := {$IFDEF FPC}AnsiLowerCase (Ch) [1]{$ELSE} {$IFDEF SYN_WIN32}REChar (CharLower (PChar (Ch))){$ELSE}REChar(tolower (integer (Ch))){$ENDIF} {$ENDIF};
- end;
- end; { of function TRegExpr.InvertCaseFunction
- --------------------------------------------------------------}
- function TRegExpr.GetExpression : RegExprString;
- begin
- if fExpression <> nil
- then Result := fExpression
- else Result := '';
- end; { of function TRegExpr.GetExpression
- --------------------------------------------------------------}
- procedure TRegExpr.SetExpression (const s : RegExprString);
- var
- Len : PtrInt; //###0.950
- begin
- if (s <> fExpression) or not fExprIsCompiled then begin
- fExprIsCompiled := false;
- if fExpression <> nil then begin
- FreeMem (fExpression);
- fExpression := nil;
- end;
- if s <> '' then begin
- Len := length (s); //###0.950
- GetMem (fExpression, (Len + 1) * SizeOf (REChar));
- System.Move(s[1],fExpression^,(Len + 1) * SizeOf (REChar));
- InvalidateProgramm; //###0.941
- end;
- end;
- end; { of procedure TRegExpr.SetExpression
- --------------------------------------------------------------}
- function TRegExpr.GetSubExprMatchCount : integer;
- begin
- if Assigned (fInputString) then begin
- Result := NSUBEXP - 1;
- while (Result > 0) and ((startp [Result] = nil)
- or (endp [Result] = nil))
- do dec (Result);
- end
- else Result := -1;
- end; { of function TRegExpr.GetSubExprMatchCount
- --------------------------------------------------------------}
- function TRegExpr.GetMatchPos (Idx : integer) : PtrInt;
- begin
- if (Idx >= 0) and (Idx < NSUBEXP) and Assigned (fInputString)
- and Assigned (startp [Idx]) and Assigned (endp [Idx]) then begin
- Result := (startp [Idx] - fInputString) + 1;
- end
- else Result := -1;
- end; { of function TRegExpr.GetMatchPos
- --------------------------------------------------------------}
- function TRegExpr.GetMatchLen (Idx : integer) : PtrInt;
- begin
- if (Idx >= 0) and (Idx < NSUBEXP) and Assigned (fInputString)
- and Assigned (startp [Idx]) and Assigned (endp [Idx]) then begin
- Result := endp [Idx] - startp [Idx];
- end
- else Result := -1;
- end; { of function TRegExpr.GetMatchLen
- --------------------------------------------------------------}
- function TRegExpr.GetMatch (Idx : integer) : RegExprString;
- begin
- if (Idx >= 0) and (Idx < NSUBEXP) and Assigned (fInputString)
- and Assigned (startp [Idx]) and Assigned (endp [Idx])
- and (endp [Idx] > startp[Idx])
- //then Result := copy (fInputString, MatchPos [Idx], MatchLen [Idx]) //###0.929
- then begin
- //SetString (Result, startp [idx], endp [idx] - startp [idx])
- SetLength(Result,endp [idx] - startp [idx]);
- System.Move(startp [idx]^,Result[1],length(Result)*sizeof(REChar));
- end
- else Result := '';
- end; { of function TRegExpr.GetMatch
- --------------------------------------------------------------}
- function TRegExpr.GetModifierStr : RegExprString;
- begin
- Result := '-';
- if ModifierI
- then Result := 'i' + Result
- else Result := Result + 'i';
- if ModifierR
- then Result := 'r' + Result
- else Result := Result + 'r';
- if ModifierS
- then Result := 's' + Result
- else Result := Result + 's';
- if ModifierG
- then Result := 'g' + Result
- else Result := Result + 'g';
- if ModifierM
- then Result := 'm' + Result
- else Result := Result + 'm';
- if ModifierX
- then Result := 'x' + Result
- else Result := Result + 'x';
- if Result [length (Result)] = '-' // remove '-' if all modifiers are 'On'
- then System.Delete (Result, length (Result), 1);
- end; { of function TRegExpr.GetModifierStr
- --------------------------------------------------------------}
- class function TRegExpr.ParseModifiersStr (const AModifiers : RegExprString;
- var AModifiersInt : integer) : boolean;
- // !!! Be carefull - this is class function and must not use object instance fields
- var
- i : integer;
- IsOn : boolean;
- Mask : integer;
- begin
- Result := true;
- IsOn := true;
- Mask := 0; // prevent compiler warning
- for i := 1 to length (AModifiers) do
- if AModifiers [i] = '-'
- then IsOn := false
- else begin
- if Pos (AModifiers [i], 'iI') > 0
- then Mask := MaskModI
- else if Pos (AModifiers [i], 'rR') > 0
- then Mask := MaskModR
- else if Pos (AModifiers [i], 'sS') > 0
- then Mask := MaskModS
- else if Pos (AModifiers [i], 'gG') > 0
- then Mask := MaskModG
- else if Pos (AModifiers [i], 'mM') > 0
- then Mask := MaskModM
- else if Pos (AModifiers [i], 'xX') > 0
- then Mask := MaskModX
- else begin
- Result := false;
- EXIT;
- end;
- if IsOn
- then AModifiersInt := AModifiersInt or Mask
- else AModifiersInt := AModifiersInt and not Mask;
- end;
- end; { of function TRegExpr.ParseModifiersStr
- --------------------------------------------------------------}
- procedure TRegExpr.SetModifierStr (const AModifiers : RegExprString);
- begin
- if not ParseModifiersStr (AModifiers, fModifiers)
- then Error (reeModifierUnsupported);
- end; { of procedure TRegExpr.SetModifierStr
- --------------------------------------------------------------}
- function TRegExpr.GetModifier (AIndex : integer) : boolean;
- var
- Mask : integer;
- begin
- Result := false;
- case AIndex of
- 1: Mask := MaskModI;
- 2: Mask := MaskModR;
- 3: Mask := MaskModS;
- 4: Mask := MaskModG;
- 5: Mask := MaskModM;
- 6: Mask := MaskModX;
- else begin
- Error (reeModifierUnsupported);
- EXIT;
- end;
- end;
- Result := (fModifiers and Mask) <> 0;
- end; { of function TRegExpr.GetModifier
- --------------------------------------------------------------}
- procedure TRegExpr.SetModifier (AIndex : integer; ASet : boolean);
- var
- Mask : integer;
- begin
- case AIndex of
- 1: Mask := MaskModI;
- 2: Mask := MaskModR;
- 3: Mask := MaskModS;
- 4: Mask := MaskModG;
- 5: Mask := MaskModM;
- 6: Mask := MaskModX;
- else begin
- Error (reeModifierUnsupported);
- EXIT;
- end;
- end;
- if ASet
- then fModifiers := fModifiers or Mask
- else fModifiers := fModifiers and not Mask;
- end; { of procedure TRegExpr.SetModifier
- --------------------------------------------------------------}
- {=============================================================}
- {==================== Compiler section =======================}
- {=============================================================}
- procedure TRegExpr.InvalidateProgramm;
- begin
- if programm <> nil then begin
- FreeMem (programm);
- programm := nil;
- end;
- end; { of procedure TRegExpr.InvalidateProgramm
- --------------------------------------------------------------}
- procedure TRegExpr.Compile; //###0.941
- begin
- if fExpression = nil then begin // No Expression assigned
- Error (reeNoExpression);
- EXIT;
- end;
- CompileRegExpr (fExpression);
- end; { of procedure TRegExpr.Compile
- --------------------------------------------------------------}
- function TRegExpr.IsProgrammOk : boolean;
- {$IFNDEF UniCode}
- var
- i : integer;
- {$ENDIF}
- begin
- Result := false;
- // check modifiers
- if fModifiers <> fProgModifiers //###0.941
- then InvalidateProgramm;
- // can we optimize line separators by using sets?
- {$IFNDEF UniCode}
- fLineSeparatorsSet := [];
- for i := 1 to length (fLineSeparators)
- do System.Include (fLineSeparatorsSet, fLineSeparators [i]);
- {$ENDIF}
- // [Re]compile if needed
- if programm = nil
- then Compile; //###0.941
- // check [re]compiled programm
- if programm = nil
- then EXIT // error was set/raised by Compile (was reeExecAfterCompErr)
- else if programm [0] <> MAGIC // Program corrupted.
- then Error (reeCorruptedProgram)
- else Result := true;
- end; { of function TRegExpr.IsProgrammOk
- --------------------------------------------------------------}
- procedure TRegExpr.Tail (p : PRegExprChar; val : PRegExprChar);
- // set the next-pointer at the end of a node chain
- var
- scan : PRegExprChar;
- temp : PRegExprChar;
- // i : int64;
- begin
- if p = @regdummy
- then EXIT;
- // Find last node.
- scan := p;
- REPEAT
- temp := regnext (scan);
- if temp = nil
- then BREAK;
- scan := temp;
- UNTIL false;
- // Set Next 'pointer'
- if val < scan
- then PRENextOff (AlignToPtr(scan + REOpSz))^ := - (scan - val) //###0.948
- // work around PWideChar subtraction bug (Delphi uses
- // shr after subtraction to calculate widechar distance %-( )
- // so, if difference is negative we have .. the "feature" :(
- // I could wrap it in $IFDEF UniCode, but I didn't because
- // "P – Q computes the difference between the address given
- // by P (the higher address) and the address given by Q (the
- // lower address)" - Delphi help quotation.
- else PRENextOff (AlignToPtr(scan + REOpSz))^ := val - scan; //###0.933
- end; { of procedure TRegExpr.Tail
- --------------------------------------------------------------}
- procedure TRegExpr.OpTail (p : PRegExprChar; val : PRegExprChar);
- // regtail on operand of first argument; nop if operandless
- begin
- // "Operandless" and "op != BRANCH" are synonymous in practice.
- if (p = nil) or (p = @regdummy) or (PREOp (p)^ <> BRANCH)
- then EXIT;
- Tail (p + REOpSz + RENextOffSz, val); //###0.933
- end; { of procedure TRegExpr.OpTail
- --------------------------------------------------------------}
- function TRegExpr.EmitNode (op : TREOp) : PRegExprChar; //###0.933
- // emit a node, return location
- begin
- Result := regcode;
- if Result <> @regdummy then begin
- PREOp (regcode)^ := op;
- inc (regcode, REOpSz);
- PRENextOff (AlignToPtr(regcode))^ := 0; // Next "pointer" := nil
- inc (regcode, RENextOffSz);
- {$IFDEF DebugSynRegExpr}
- if regcode-programm>regsize then
- raise Exception.Create('TRegExpr.EmitNode buffer overrun');
- {$ENDIF}
- end
- else inc (regsize, REOpSz + RENextOffSz); // compute code size without code generation
- end; { of function TRegExpr.EmitNode
- --------------------------------------------------------------}
- procedure TRegExpr.EmitC (b : REChar);
- // emit a byte to code
- begin
- if regcode <> @regdummy then begin
- regcode^ := b;
- inc (regcode);
- {$IFDEF DebugSynRegExpr}
- if regcode-programm>regsize then
- raise Exception.Create('TRegExpr.EmitC buffer overrun');
- {$ENDIF}
- end
- else inc (regsize, REOpSz); // Type of p-code pointer always is ^REChar
- end; { of procedure TRegExpr.EmitC
- --------------------------------------------------------------}
- procedure TRegExpr.InsertOperator (op : TREOp; opnd : PRegExprChar; sz : integer);
- // insert an operator in front of already-emitted operand
- // Means relocating the operand.
- var
- src, dst, place : PRegExprChar;
- i : integer;
- begin
- if regcode = @regdummy then begin
- inc (regsize, sz);
- EXIT;
- end;
- // move code behind insert position
- src := regcode;
- inc (regcode, sz);
- {$IFDEF DebugSynRegExpr}
- if regcode-programm>regsize then
- raise Exception.Create('TRegExpr.InsertOperator buffer overrun');
- // if (opnd<regcode) or (opnd-regcode>regsize) then
- // raise Exception.Create('TRegExpr.InsertOperator invalid opnd');
- {$ENDIF}
- dst := regcode;
- while src > opnd do begin
- dec (dst);
- dec (src);
- dst^ := src^;
- end;
- place := opnd; // Op node, where operand used to be.
- PREOp (place)^ := op;
- inc (place, REOpSz);
- for i := 1 + REOpSz to sz do begin
- place^ := #0;
- inc (place);
- end;
- end; { of procedure TRegExpr.InsertOperator
- --------------------------------------------------------------}
- function strcspn (s1 : PRegExprChar; s2 : PRegExprChar) : PtrInt;
- // find length of initial segment of s1 consisting
- // entirely of characters not from s2
- var scan1, scan2 : PRegExprChar;
- begin
- Result := 0;
- scan1 := s1;
- while scan1^ <> #0 do begin
- scan2 := s2;
- while scan2^ <> #0 do
- if scan1^ = scan2^
- then EXIT
- else inc (scan2);
- inc (Result);
- inc (scan1)
- end;
- end; { of function strcspn
- --------------------------------------------------------------}
- const
- // Flags to be passed up and down.
- HASWIDTH = 01; // Known never to match nil string.
- SIMPLE = 02; // Simple enough to be STAR/PLUS/BRACES operand.
- SPSTART = 04; // Starts with * or +.
- WORST = 0; // Worst case.
- META : array [0 .. 12] of REChar = (
- '^', '$', '.', '[', '(', ')', '|', '?', '+', '*', EscChar, '{', #0);
- // Any modification must be synchronized with QuoteRegExprMetaChars !!!
- {$IFDEF UniCode}
- RusRangeLo : array [0 .. 33] of REChar =
- (#$430,#$431,#$432,#$433,#$434,#$435,#$451,#$436,#$437,
- #$438,#$439,#$43A,#$43B,#$43C,#$43D,#$43E,#$43F,
- #$440,#$441,#$442,#$443,#$444,#$445,#$446,#$447,
- #$448,#$449,#$44A,#$44B,#$44C,#$44D,#$44E,#$44F,#0);
- RusRangeHi : array [0 .. 33] of REChar =
- (#$410,#$411,#$412,#$413,#$414,#$415,#$401,#$416,#$417,
- #$418,#$419,#$41A,#$41B,#$41C,#$41D,#$41E,#$41F,
- #$420,#$421,#$422,#$423,#$424,#$425,#$426,#$427,
- #$428,#$429,#$42A,#$42B,#$42C,#$42D,#$42E,#$42F,#0);
- RusRangeLoLow = #$430{'а'};
- RusRangeLoHigh = #$44F{'я'};
- RusRangeHiLow = #$410{'А'};
- RusRangeHiHigh = #$42F{'Я'};
- {$ELSE}
- RusRangeLo = 'абвгдеёжзийклмнопрстуфхцчшщъыьэюя';
- RusRangeHi = 'АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ';
- RusRangeLoLow = 'а';
- RusRangeLoHigh = 'я';
- RusRangeHiLow = 'А';
- RusRangeHiHigh = 'Я';
- {$ENDIF}
- function TRegExpr.CompileRegExpr (exp : PRegExprChar) : boolean;
- // Compile a regular expression into internal code
- // We can't allocate space until we know how big the compiled form will be,
- // but we can't compile it (and thus know how big it is) until we've got a
- // place to put the code. So we cheat: we compile it twice, once with code
- // generation turned off and size counting turned on, and once "for real".
- // This also means that we don't allocate space until we are sure that the
- // thing really will compile successfully, and we never have to move the
- // code and thus invalidate pointers into it. (Note that it has to be in
- // one piece because free() must be able to free it all.)
- // Beware that the optimization-preparation code in here knows about some
- // of the structure of the compiled regexp.
- var
- scan, longest : PRegExprChar;
- len : PtrUInt;
- flags : integer;
- begin
- Result := false; // life too dark
- regparse := nil; // for correct error handling
- regexpbeg := exp;
- try
- if programm <> nil then begin
- FreeMem (programm);
- programm := nil;
- end;
- if exp = nil then begin
- Error (reeCompNullArgument);
- EXIT;
- end;
- fProgModifiers := fModifiers;
- // well, may it's paranoia. I'll check it later... !!!!!!!!
- // First pass: determine size, legality.
- fCompModifiers := fModifiers;
- regparse := exp;
- regnpar := 1;
- regsize := 0;
- regcode := @regdummy;
- EmitC (MAGIC);
- if ParseReg (0, flags) = nil
- then EXIT;
- // Allocate space.
- GetMem (programm, regsize * SizeOf (REChar));
- // Second pass: emit code.
- fCompModifiers := fModifiers;
- regparse := exp;
- regnpar := 1;
- regcode := programm;
- EmitC (MAGIC);
- if ParseReg (0, flags) = nil
- then EXIT;
- // Dig out information for optimizations.
- {$IFDEF UseFirstCharSet} //###0.929
- FirstCharSet := [];
- FillFirstCharSet (programm + REOpSz);
- {$ENDIF}
- regstart := #0; // Worst-case defaults.
- reganch := #0;
- regmust := nil;
- regmlen := 0;
- scan := programm + REOpSz; // First BRANCH.
- if PREOp (regnext (scan))^ = EEND then begin // Only one top-level choice.
- scan := scan + REOpSz + RENextOffSz;
- // Starting-point info.
- if PREOp (scan)^ = EXACTLY
- then regstart := (scan + REOpSz + RENextOffSz)^
- else if PREOp (scan)^ = BOL
- then inc (reganch);
- // If there's something expensive in the r.e., find the longest
- // literal string that must appear and make it the regmust. Resolve
- // ties in favor of later strings, since the regstart check works
- // with the beginning of the r.e. and avoiding duplication
- // strengthens checking. Not a strong reason, but sufficient in the
- // absence of others.
- if (flags and SPSTART) <> 0 then begin
- longest := nil;
- len := 0;
- while scan <> nil do begin
- if (PREOp (scan)^ = EXACTLY)
- and (strlen (scan + REOpSz + RENextOffSz) >= PtrInt(len)) then begin
- longest := scan + REOpSz + RENextOffSz;
- len := strlen (longest);
- end;
- scan := regnext (scan);
- end;
- regmust := longest;
- regmlen := len;
- end;
- end;
- Result := true;
- finally begin
- if not Result
- then InvalidateProgramm;
- regexpbeg := nil;
- fExprIsCompiled := Result; //###0.944
- end;
- end;
- end; { of function TRegExpr.CompileRegExpr
- --------------------------------------------------------------}
- function TRegExpr.ParseReg (paren : integer; var flagp : integer) : PRegExprChar;
- // regular expression, i.e. main body or parenthesized thing
- // Caller must absorb opening parenthesis.
- // Combining parenthesis handling with the base level of regular expression
- // is a trifle forced, but the need to tie the tails of the branches to what
- // follows makes it hard to avoid.
- var
- ret, br, ender : PRegExprChar;
- parno : integer;
- flags : integer;
- SavedModifiers : integer;
- begin
- Result := nil;
- flagp := HASWIDTH; // Tentatively.
- parno := 0; // eliminate compiler stupid warning
- SavedModifiers := fCompModifiers;
- // Make an OPEN node, if parenthesized.
- if paren <> 0 then begin
- if regnpar >= NSUBEXP then begin
- Error (reeCompParseRegTooManyBrackets);
- EXIT;
- end;
- parno := regnpar;
- inc (regnpar);
- ret := EmitNode (TREOp (ord (OPEN) + parno));
- end
- else ret := nil;
- // Pick up the branches, linking them together.
- br := ParseBranch (flags);
- if br = nil then begin
- Result := nil;
- EXIT;
- end;
- if ret <> nil
- then Tail (ret, br) // OPEN -> first.
- else ret := br;
- if (flags and HASWIDTH) = 0
- then flagp := flagp and not HASWIDTH;
- flagp := flagp or flags and SPSTART;
- while (regparse^ = '|') do begin
- inc (regparse);
- br := ParseBranch (flags);
- if br = nil then begin
- Result := nil;
- EXIT;
- end;
- Tail (ret, br); // BRANCH -> BRANCH.
- if (flags and HASWIDTH) = 0
- then flagp := flagp and not HASWIDTH;
- flagp := flagp or flags and SPSTART;
- end;
- // Make a closing node, and hook it on the end.
- if paren <> 0
- then ender := EmitNode (TREOp (ord (CLOSE) + parno))
- else ender := EmitNode (EEND);
- Tail (ret, ender);
- // Hook the tails of the branches to the closing node.
- br := ret;
- while br <> nil do begin
- OpTail (br, ender);
- br := regnext (br);
- end;
- // Check for proper termination.
- if paren <> 0 then
- if regparse^ <> ')' then begin
- Error (reeCompParseRegUnmatchedBrackets);
- EXIT;
- end
- else inc (regparse); // skip trailing ')'
- if (paren = 0) and (regparse^ <> #0) then begin
- if regparse^ = ')'
- then Error (reeCompParseRegUnmatchedBrackets2)
- else Error (reeCompParseRegJunkOnEnd);
- EXIT;
- end;
- fCompModifiers := SavedModifiers; // restore modifiers of parent
- Result := ret;
- end; { of function TRegExpr.ParseReg
- --------------------------------------------------------------}
- function TRegExpr.ParseBranch (var flagp : integer) : PRegExprChar;
- // one alternative of an | operator
- // Implements the concatenation operator.
- var
- ret, chain, latest : PRegExprChar;
- flags : integer;
- begin
- flagp := WORST; // Tentatively.
- ret := EmitNode (BRANCH);
- chain := nil;
- while (regparse^ <> #0) and (regparse^ <> '|')
- and (regparse^ <> ')') do begin
- latest := ParsePiece (flags);
- if latest = nil then begin
- Result := nil;
- EXIT;
- end;
- flagp := flagp or flags and HASWIDTH;
- if chain = nil // First piece.
- then flagp := flagp or flags and SPSTART
- else Tail (chain, latest);
- chain := latest;
- end;
- if chain = nil // Loop ran zero times.
- then EmitNode (NOTHING);
- Result := ret;
- end; { of function TRegExpr.ParseBranch
- --------------------------------------------------------------}
- function TRegExpr.ParsePiece (var flagp : integer) : PRegExprChar;
- // something followed by possible [*+?{]
- // Note that the branching code sequences used for ? and the general cases
- // of * and + and { are somewhat optimized: they use the same NOTHING node as
- // both the endmarker for their branch list and the body of the last branch.
- // It might seem that this node could be dispensed with entirely, but the
- // endmarker role is not redundant.
- function parsenum (AStart, AEnd : PRegExprChar) : TREBracesArg;
- begin
- Result := 0;
- if AEnd - AStart + 1 > 8 then begin // prevent stupid scanning
- Error (reeBRACESArgTooBig);
- EXIT;
- end;
- while AStart <= AEnd do begin
- Result := Result * 10 + (ord (AStart^) - ord ('0'));
- inc (AStart);
- end;
- if (Result > MaxBracesArg) or (Result < 0) then begin
- Error (reeBRACESArgTooBig);
- EXIT;
- end;
- end;
- var
- op : REChar;
- NonGreedyOp, NonGreedyCh : boolean; //###0.940
- TheOp : TREOp; //###0.940
- NextNode : PRegExprChar;
- flags : integer;
- BracesMin, Bracesmax : TREBracesArg;
- p, savedparse : PRegExprChar;
- procedure EmitComplexBraces (ABracesMin, ABracesMax : TREBracesArg;
- ANonGreedyOp : boolean); //###0.940
- {$IFDEF ComplexBraces}
- var
- off : TRENextOff;
- {$ENDIF}
- begin
- {$IFNDEF ComplexBraces}
- Error (reeComplexBracesNotImplemented);
- {$ELSE}
- if ANonGreedyOp
- then TheOp := LOOPNG
- else TheOp := LOOP;
- InsertOperator (LOOPENTRY, Result, REOpSz + RENextOffSz);
- NextNode := EmitNode (TheOp);
- if regcode <> @regdummy then begin
- off := (Result + REOpSz + RENextOffSz)
- - (regcode - REOpSz - RENextOffSz); // back to Atom after LOOPENTRY
- PREBracesArg (AlignToInt(regcode))^ := ABracesMin;
- inc (regcode, REBracesArgSz);
- PREBracesArg (AlignToInt(regcode))^ := ABracesMax;
- inc (regcode, REBracesArgSz);
- PRENextOff (AlignToPtr(regcode))^ := off;
- inc (regcode, RENextOffSz);
- {$IFDEF DebugSynRegExpr}
- if regcode-programm>regsize then
- raise Exception.Create('TRegExpr.ParsePiece.EmitComplexBraces buffer overrun');
- {$ENDIF}
- end
- else inc (regsize, REBracesArgSz * 2 + RENextOffSz);
- Tail (Result, NextNode); // LOOPENTRY -> LOOP
- if regcode <> @regdummy then
- Tail (Result + REOpSz + RENextOffSz, NextNode); // Atom -> LOOP
- {$ENDIF}
- end;
- procedure EmitSimpleBraces (ABracesMin, ABracesMax : TREBracesArg;
- ANonGreedyOp : boolean); //###0.940
- begin
- if ANonGreedyOp //###0.940
- then TheOp := BRACESNG
- else TheOp := BRACES;
- InsertOperator (TheOp, Result, REOpSz + RENextOffSz + REBracesArgSz * 2);
- if regcode <> @regdummy then begin
- PREBracesArg (AlignToInt(Result + REOpSz + RENextOffSz))^ := ABracesMin;
- PREBracesArg (AlignToInt(Result + REOpSz + RENextOffSz + REBracesArgSz))^ := ABracesMax;
- end;
- end;
- begin
- Result := ParseAtom (flags);
- if Result = nil
- then EXIT;
- op := regparse^;
- if not ((op = '*') or (op = '+') or (op = '?') or (op = '{')) then begin
- flagp := flags;
- EXIT;
- end;
- if ((flags and HASWIDTH) = 0) and (op <> '?') then begin
- Error (reePlusStarOperandCouldBeEmpty);
- EXIT;
- end;
- case op of
- '*': begin
- flagp := WORST or SPSTART;
- NonGreedyCh := (regparse + 1)^ = '?'; //###0.940
- NonGreedyOp := NonGreedyCh or ((fCompModifiers and MaskModG) = 0); //###0.940
- if (flags and SIMPLE) = 0 then begin
- if NonGreedyOp //###0.940
- then EmitComplexBraces (0, MaxBracesArg, NonGreedyOp)
- else begin // Emit x* as (x&|), where & means "self".
- InsertOperator (BRANCH, Result, REOpSz + RENextOffSz); // Either x
- OpTail (Result, EmitNode (BACK)); // and loop
- OpTail (Result, Result); // back
- Tail (Result, EmitNode (BRANCH)); // or
- Tail (Result, EmitNode (NOTHING)); // nil.
- end
- end
- else begin // Simple
- if NonGreedyOp //###0.940
- then TheOp := STARNG
- else TheOp := STAR;
- InsertOperator (TheOp, Result, REOpSz + RENextOffSz);
- end;
- if NonGreedyCh //###0.940
- then inc (regparse); // Skip extra char ('?')
- end; { of case '*'}
- '+': begin
- flagp := WORST or SPSTART or HASWIDTH;
- NonGreedyCh := (regparse + 1)^ = '?'; //###0.940
- NonGreedyOp := NonGreedyCh or ((fCompModifiers and MaskModG) = 0); //###0.940
- if (flags and SIMPLE) = 0 then begin
- if NonGreedyOp //###0.940
- then EmitComplexBraces (1, MaxBracesArg, NonGreedyOp)
- else begin // Emit x+ as x(&|), where & means "self".
- NextNode := EmitNode (BRANCH); // Either
- Tail (Result, NextNode);
- Tail (EmitNode (BACK), Result); // loop back
- Tail (NextNode, EmitNode (BRANCH)); // or
- Tail (Result, EmitNode (NOTHING)); // nil.
- end
- end
- else begin // Simple
- if NonGreedyOp //###0.940
- then TheOp := PLUSNG
- else TheOp := PLUS;
- InsertOperator (TheOp, Result, REOpSz + RENextOffSz);
- end;
- if NonGreedyCh //###0.940
- then inc (regparse); // Skip extra char ('?')
- end; { of case '+'}
- '?': begin
- flagp := WORST;
- NonGreedyCh := (regparse + 1)^ = '?'; //###0.940
- NonGreedyOp := NonGreedyCh or ((fCompModifiers and MaskModG) = 0); //###0.940
- if NonGreedyOp then begin //###0.940 // We emit x?? as x{0,1}?
- if (flags and SIMPLE) = 0
- then EmitComplexBraces (0, 1, NonGreedyOp)
- else EmitSimpleBraces (0, 1, NonGreedyOp);
- end
- else begin // greedy '?'
- InsertOperator (BRANCH, Result, REOpSz + RENextOffSz); // Either x
- Tail (Result, EmitNode (BRANCH)); // or
- NextNode := EmitNode (NOTHING); // nil.
- Tail (Result, NextNode);
- OpTail (Result, NextNode);
- end;
- if NonGreedyCh //###0.940
- then inc (regparse); // Skip extra char ('?')
- end; { of case '?'}
- '{': begin
- savedparse := regparse;
- // !!!!!!!!!!!!
- // Filip Jirsak's note - what will happen, when we are at the end of regparse?
- inc (regparse);
- p := regparse;
- while Pos (regparse^, '0123456789') > 0 // <min> MUST appear
- do inc (regparse);
- if (regparse^ <> '}') and (regparse^ <> ',') or (p = regparse) then begin
- regparse := savedparse;
- flagp := flags;
- EXIT;
- end;
- BracesMin := parsenum (p, regparse - 1);
- if regparse^ = ',' then begin
- inc (regparse);
- p := regparse;
- while Pos (regparse^, '0123456789') > 0
- do inc (regparse);
- if regparse^ <> '}' then begin
- regparse := savedparse;
- EXIT;
- end;
- if p = regparse
- then BracesMax := MaxBracesArg
- else BracesMax := parsenum (p, regparse - 1);
- end
- else BracesMax := BracesMin; // {n} == {n,n}
- if BracesMin > BracesMax then begin
- Error (reeBracesMinParamGreaterMax);
- EXIT;
- end;
- if BracesMin > 0
- then flagp := WORST;
- if BracesMax > 0
- then flagp := flagp or HASWIDTH or SPSTART;
- NonGreedyCh := (regparse + 1)^ = '?'; //###0.940
- NonGreedyOp := NonGreedyCh or ((fCompModifiers and MaskModG) = 0); //###0.940
- if (flags and SIMPLE) <> 0
- then EmitSimpleBraces (BracesMin, BracesMax, NonGreedyOp)
- else EmitComplexBraces (BracesMin, BracesMax, NonGreedyOp);
- if NonGreedyCh //###0.940
- then inc (regparse); // Skip extra char '?'
- end; // of case '{'
- // else // here we can't be
- end; { of case op}
- inc (regparse);
- if (regparse^ = '*') or (regparse^ = '+') or (regparse^ = '?') or (regparse^ = '{') then begin
- Error (reeNestedSQP);
- EXIT;
- end;
- end; { of function TRegExpr.ParsePiece
- --------------------------------------------------------------}
- function TRegExpr.ParseAtom (var flagp : integer) : PRegExprChar;
- // the lowest level
- // Optimization: gobbles an entire sequence of ordinary characters so that
- // it can turn them into a single node, which is smaller to store and
- // faster to run. Backslashed characters are exceptions, each becoming a
- // separate node; the code is simpler that way and it's not worth fixing.
- var
- ret : PRegExprChar;
- flags : integer;
- RangeBeg, RangeEnd : REChar;
- CanBeRange : boolean;
- len : PtrInt;
- ender : REChar;
- begmodfs : PRegExprChar;
- {$IFDEF UseSetOfChar} //###0.930
- RangePCodeBeg : PRegExprChar;
- RangePCodeIdx : PtrInt;
- RangeIsCI : boolean;
- RangeSet : TSetOfREChar;
- RangeLen : PtrInt;
- RangeChMin, RangeChMax : REChar;
- {$ENDIF}
- procedure EmitExactly (ch : REChar);
- begin
- if (fCompModifiers and MaskModI) <> 0
- then ret := EmitNode (EXACTLYCI)
- else ret := EmitNode (EXACTLY);
- EmitC (ch);
- EmitC (#0);
- flagp := flagp or HASWIDTH or SIMPLE;
- end;
- procedure EmitStr (const s : RegExprString);
- var i : PtrInt;
- begin
- for i := 1 to length (s)
- do EmitC (s [i]);
- end;
- function HexDig (ch : REChar) : PtrInt;
- begin
- Result := 0;
- if (ch >= 'a') and (ch <= 'f')
- then ch := REChar (ord (ch) - (ord ('a') - ord ('A')));
- if (ch < '0') or (ch > 'F') or ((ch > '9') and (ch < 'A')) then begin
- Error (reeBadHexDigit);
- EXIT;
- end;
- Result := ord (ch) - ord ('0');
- if ch >= 'A'
- then Result := Result - (ord ('A') - ord ('9') - 1);
- end;
- function EmitRange (AOpCode : REChar) : PRegExprChar;
- begin
- {$IFDEF UseSetOfChar}
- case AOpCode of
- ANYBUTCI, ANYBUT:
- Result := EmitNode (ANYBUTTINYSET);
- else // ANYOFCI, ANYOF
- Result := EmitNode (ANYOFTINYSET);
- end;
- case AOpCode of
- ANYBUTCI, ANYOFCI:
- RangeIsCI := True;
- else // ANYBUT, ANYOF
- RangeIsCI := False;
- end;
- RangePCodeBeg := regcode;
- RangePCodeIdx := regsize;
- RangeLen := 0;
- RangeSet := [];
- RangeChMin := #255;
- RangeChMax := #0;
- {$ELSE}
- Result := EmitNode (AOpCode);
- // ToDo:
- // !!!!!!!!!!!!! Implement ANYOF[BUT]TINYSET generation for UniCode !!!!!!!!!!
- {$ENDIF}
- end;
- {$IFDEF UseSetOfChar}
- procedure EmitRangeCPrim (b : REChar); //###0.930
- begin
- if b in RangeSet
- then EXIT;
- inc (RangeLen);
- if b < RangeChMin
- then RangeChMin := b;
- if b > RangeChMax
- then RangeChMax := b;
- Include (RangeSet, b);
- end;
- {$ENDIF}
- procedure EmitRangeC (b : REChar);
- {$IFDEF UseSetOfChar}
- var
- Ch : REChar;
- {$ENDIF}
- begin
- CanBeRange := false;
- {$IFDEF UseSetOfChar}
- if b <> #0 then begin
- EmitRangeCPrim (b); //###0.930
- if RangeIsCI
- then EmitRangeCPrim (InvertCase (b)); //###0.930
- end
- else begin
- {$IFDEF UseAsserts}
- Assert (RangeLen > 0, 'TRegExpr.ParseAtom(subroutine EmitRangeC): empty range'); // impossible, but who knows..
- Assert (RangeChMin <= RangeChMax, 'TRegExpr.ParseAtom(subroutine EmitRangeC): RangeChMin > RangeChMax'); // impossible, but who knows..
- {$ENDIF}
- if RangeLen <= TinySetLen then begin // emit "tiny set"
- if regcode = @regdummy then begin
- regsize := RangePCodeIdx + TinySetLen; // RangeChMin/Max !!!
- EXIT;
- end;
- regcode := RangePCodeBeg;
- for Ch := RangeChMin to RangeChMax do //###0.930
- if Ch in RangeSet then begin
- regcode^ := Ch;
- inc (regcode);
- end;
- // fill rest:
- while regcode < RangePCodeBeg + TinySetLen do begin
- regcode^ := RangeChMax;
- inc (regcode);
- end;
- {$IFDEF DebugSynRegExpr}
- if regcode-programm>regsize then
- raise Exception.Create('TRegExpr.ParseAtom.EmitRangeC TinySetLen buffer overrun');
- {$ENDIF}
- end
- else begin
- if regcode = @regdummy then begin
- regsize := RangePCodeIdx + SizeOf (TSetOfREChar);
- EXIT;
- end;
- if (RangePCodeBeg - REOpSz - RENextOffSz)^ = ANYBUTTINYSET
- then RangeSet := [#0 .. #255] - RangeSet;
- PREOp (RangePCodeBeg - REOpSz - RENextOffSz)^ := ANYOFFULLSET;
- regcode := RangePCodeBeg;
- Move (RangeSet, regcode^, SizeOf (TSetOfREChar));
- inc (regcode, SizeOf (TSetOfREChar));
- {$IFDEF DebugSynRegExpr}
- if regcode-programm>regsize then
- raise Exception.Create('TRegExpr.ParseAtom.EmitRangeC non TinySetLen buffer overrun');
- {$ENDIF}
- end;
- end;
- {$ELSE}
- EmitC (b);
- {$ENDIF}
- end;
- procedure EmitSimpleRangeC (b : REChar);
- begin
- RangeBeg := b;
- EmitRangeC (b);
- CanBeRange := true;
- end;
- procedure EmitRangeStr (const s : RegExprString);
- var i : PtrInt;
- begin
- for i := 1 to length (s)
- do EmitRangeC (s [i]);
- end;
- function UnQuoteChar (var APtr : PRegExprChar) : REChar; //###0.934
- begin
- case APtr^ of
- 't': Result := #$9; // \t => tab (HT/TAB)
- 'n': Result := #$a; // \n => newline (NL)
- 'r': Result := #$d; // \r => carriage return (CR)
- 'f': Result := #$c; // \f => form feed (FF)
- 'a': Result := #$7; // \a => alarm (bell) (BEL)
- 'e': Result := #$1b; // \e => escape (ESC)
- 'x': begin // \x: hex char
- Result := #0;
- inc (APtr);
- if APtr^ = #0 then begin
- Error (reeNoHexCodeAfterBSlashX);
- EXIT;
- end;
- if APtr^ = '{' then begin // \x{nnnn} //###0.936
- REPEAT
- inc (APtr);
- if APtr^ = #0 then begin
- Error (reeNoHexCodeAfterBSlashX);
- EXIT;
- end;
- if APtr^ <> '}' then begin
- if (Ord (Result)
- ShR (SizeOf (REChar) * 8 - 4)) and $F <> 0 then begin
- Error (reeHexCodeAfterBSlashXTooBig);
- EXIT;
- end;
- Result := REChar ((Ord (Result) ShL 4) or HexDig (APtr^));
- // HexDig will cause Error if bad hex digit found
- end
- else BREAK;
- UNTIL False;
- end
- else begin
- Result := REChar (HexDig (APtr^));
- // HexDig will cause Error if bad hex digit found
- inc (APtr);
- if APtr^ = #0 then begin
- Error (reeNoHexCodeAfterBSlashX);
- EXIT;
- end;
- Result := REChar ((Ord (Result) ShL 4) or HexDig (APtr^));
- // HexDig will cause Error if bad hex digit found
- end;
- end;
- else Result := APtr^;
- end;
- end;
- begin
- Result := nil;
- flagp := WORST; // Tentatively.
- inc (regparse);
- case (regparse - 1)^ of
- '^': if ((fCompModifiers and MaskModM) = 0)
- or ((fLineSeparators = '') and not fLinePairedSeparatorAssigned)
- then ret := EmitNode (BOL)
- else ret := EmitNode (BOLML);
- '$': if ((fCompModifiers and MaskModM) = 0)
- or ((fLineSeparators = '') and not fLinePairedSeparatorAssigned)
- then ret := EmitNode (EOL)
- else ret := EmitNode (EOLML);
- '.':
- if (fCompModifiers and MaskModS) <> 0 then begin
- ret := EmitNode (ANY);
- flagp := flagp or HASWIDTH or SIMPLE;
- end
- else begin // not /s, so emit [^:LineSeparators:]
- ret := EmitNode (ANYML);
- flagp := flagp or HASWIDTH; // not so simple ;)
- // ret := EmitRange (ANYBUT);
- // EmitRangeStr (LineSeparators); //###0.941
- // EmitRangeStr (LinePairedSeparator); // !!! isn't correct if have to accept only paired
- // EmitRangeC (#0);
- // flagp := flagp or HASWIDTH or SIMPLE;
- end;
- '[': begin
- if regparse^ = '^' then begin // Complement of range.
- if (fCompModifiers and MaskModI) <> 0
- then ret := EmitRange (ANYBUTCI)
- else ret := EmitRange (ANYBUT);
- inc (regparse);
- end
- else
- if (fCompModifiers and MaskModI) <> 0
- then ret := EmitRange (ANYOFCI)
- else ret := EmitRange (ANYOF);
- CanBeRange := false;
- if (regparse^ = ']') then begin
- EmitSimpleRangeC (regparse^); // []-a] -> ']' .. 'a'
- inc (regparse);
- end;
- while (regparse^ <> #0) and (regparse^ <> ']') do begin
- if (regparse^ = '-')
- and ((regparse + 1)^ <> #0) and ((regparse + 1)^ <> ']')
- and CanBeRange then begin
- inc (regparse);
- RangeEnd := regparse^;
- if RangeEnd = EscChar then begin
- {$IFDEF UniCode} //###0.935
- if (ord ((regparse + 1)^) < 256)
- and (char ((regparse + 1)^)
- in ['d', 'D', 's', 'S', 'w', 'W']) then begin
- {$ELSE}
- if (regparse + 1)^ in ['d', 'D', 's', 'S', 'w', 'W'] then begin
- {$ENDIF}
- EmitRangeC ('-'); // or treat as error ?!!
- CONTINUE;
- end;
- inc (regparse);
- RangeEnd := UnQuoteChar (regparse);
- end;
- // r.e.ranges extension for russian
- if ((fCompModifiers and MaskModR) <> 0)
- and (RangeBeg = RusRangeLoLow) and (RangeEnd = RusRangeLoHigh) then begin
- EmitRangeStr (RusRangeLo);
- end
- else if ((fCompModifiers and MaskModR) <> 0)
- and (RangeBeg = RusRangeHiLow) and (RangeEnd = RusRangeHiHigh) then begin
- EmitRangeStr (RusRangeHi);
- end
- else if ((fCompModifiers and MaskModR) <> 0)
- and (RangeBeg = RusRangeLoLow) and (RangeEnd = RusRangeHiHigh) then begin
- EmitRangeStr (RusRangeLo);
- EmitRangeStr (RusRangeHi);
- end
- else begin // standard r.e. handling
- if RangeBeg > RangeEnd then begin
- Error (reeInvalidRange);
- EXIT;
- end;
- inc (RangeBeg);
- EmitRangeC (RangeEnd); // prevent infinite loop if RangeEnd=$ff
- while RangeBeg < RangeEnd do begin //###0.929
- EmitRangeC (RangeBeg);
- inc (RangeBeg);
- end;
- end;
- inc (regparse);
- end
- else begin
- if regparse^ = EscChar then begin
- inc (regparse);
- if regparse^ = #0 then begin
- Error (reeParseAtomTrailingBackSlash);
- EXIT;
- end;
- case regparse^ of // r.e.extensions
- 'd': EmitRangeStr ('0123456789');
- 'w': EmitRangeStr (WordChars);
- 's': EmitRangeStr (SpaceChars);
- else EmitSimpleRangeC (UnQuoteChar (regparse));
- end; { of case}
- end
- else EmitSimpleRangeC (regparse^);
- inc (regparse);
- end;
- end; { of while}
- EmitRangeC (#0);
- if regparse^ <> ']' then begin
- Error (reeUnmatchedSqBrackets);
- EXIT;
- end;
- inc (regparse);
- flagp := flagp or HASWIDTH or SIMPLE;
- end;
- '(': begin
- if regparse^ = '?' then begin
- // check for extended Perl syntax : (?..)
- if (regparse + 1)^ = '#' then begin // (?#comment)
- inc (regparse, 2); // find closing ')'
- while (regparse^ <> #0) and (regparse^ <> ')')
- do inc (regparse);
- if regparse^ <> ')' then begin
- Error (reeUnclosedComment);
- EXIT;
- end;
- inc (regparse); // skip ')'
- ret := EmitNode (COMMENT); // comment
- end
- else begin // modifiers ?
- inc (regparse); // skip '?'
- begmodfs := regparse;
- while (regparse^ <> #0) and (regparse^ <> ')')
- do inc (regparse);
- if (regparse^ <> ')')
- or not ParseModifiersStr (copy (begmodfs, 1, (regparse - begmodfs)), fCompModifiers) then begin
- Error (reeUrecognizedModifier);
- EXIT;
- end;
- inc (regparse); // skip ')'
- ret := EmitNode (COMMENT); // comment
- // Error (reeQPSBFollowsNothing);
- // EXIT;
- end;
- end
- else begin
- ret := ParseReg (1, flags);
- if ret = nil then begin
- Result := nil;
- EXIT;
- end;
- flagp := flagp or flags and (HASWIDTH or SPSTART);
- end;
- end;
- #0, '|', ')': begin // Supposed to be caught earlier.
- Error (reeInternalUrp);
- EXIT;
- end;
- '?', '+', '*': begin
- Error (reeQPSBFollowsNothing);
- EXIT;
- end;
- EscChar: begin
- if regparse^ = #0 then begin
- Error (reeTrailingBackSlash);
- EXIT;
- end;
- case regparse^ of // r.e.extensions
- 'b': ret := EmitNode (BOUND); //###0.943
- 'B': ret := EmitNode (NOTBOUND); //###0.943
- 'A': ret := EmitNode (BOL); //###0.941
- 'Z': ret := EmitNode (EOL); //###0.941
- 'd': begin // r.e.extension - any digit ('0' .. '9')
- ret := EmitNode (ANYDIGIT);
- flagp := flagp or HASWIDTH or SIMPLE;
- end;
- 'D': begin // r.e.extension - not digit ('0' .. '9')
- ret := EmitNode (NOTDIGIT);
- flagp := flagp or HASWIDTH or SIMPLE;
- end;
- 's': begin // r.e.extension - any space char
- {$IFDEF UseSetOfChar}
- ret := EmitRange (ANYOF);
- EmitRangeStr (SpaceChars);
- EmitRangeC (#0);
- {$ELSE}
- ret := EmitNode (ANYSPACE);
- {$ENDIF}
- flagp := flagp or HASWIDTH or SIMPLE;
- end;
- 'S': begin // r.e.extension - not space char
- {$IFDEF UseSetOfChar}
- ret := EmitRange (ANYBUT);
- EmitRangeStr (SpaceChars);
- EmitRangeC (#0);
- {$ELSE}
- ret := EmitNode (NOTSPACE);
- {$ENDIF}
- flagp := flagp or HASWIDTH or SIMPLE;
- end;
- 'w': begin // r.e.extension - any english char / digit / '_'
- {$IFDEF UseSetOfChar}
- ret := EmitRange (ANYOF);
- EmitRangeStr (WordChars);
- EmitRangeC (#0);
- {$ELSE}
- ret := EmitNode (ANYLETTER);
- {$ENDIF}
- flagp := flagp or HASWIDTH or SIMPLE;
- end;
- 'W': begin // r.e.extension - not english char / digit / '_'
- {$IFDEF UseSetOfChar}
- ret := EmitRange (ANYBUT);
- EmitRangeStr (WordChars);
- EmitRangeC (#0);
- {$ELSE}
- ret := EmitNode (NOTLETTER);
- {$ENDIF}
- flagp := flagp or HASWIDTH or SIMPLE;
- end;
- '1' .. '9': begin //###0.936
- if (fCompModifiers and MaskModI) <> 0
- then ret := EmitNode (BSUBEXPCI)
- else ret := EmitNode (BSUBEXP);
- EmitC (REChar (ord (regparse^) - ord ('0')));
- flagp := flagp or HASWIDTH or SIMPLE;
- end;
- else EmitExactly (UnQuoteChar (regparse));
- end; { of case}
- inc (regparse);
- end;
- else begin
- dec (regparse);
- if ((fCompModifiers and MaskModX) <> 0) and // check for eXtended syntax
- ((regparse^ = '#')
- or ({$IFDEF UniCode}StrScan (XIgnoredChars, regparse^) <> nil //###0.947
- {$ELSE}regparse^ in XIgnoredChars{$ENDIF})) then begin //###0.941 \x
- if regparse^ = '#' then begin // Skip eXtended comment
- // find comment terminator (group of \n and/or \r)
- while (regparse^ <> #0) and (regparse^ <> #$d) and (regparse^ <> #$a)
- do inc (regparse);
- while (regparse^ = #$d) or (regparse^ = #$a) // skip comment terminator
- do inc (regparse); // attempt to support different type of line separators
- end
- else begin // Skip the blanks!
- while {$IFDEF UniCode}StrScan (XIgnoredChars, regparse^) <> nil //###0.947
- {$ELSE}regparse^ in XIgnoredChars{$ENDIF}
- do inc (regparse);
- end;
- ret := EmitNode (COMMENT); // comment
- end
- else begin
- len := strcspn (regparse, META);
- if len <= 0 then
- if regparse^ <> '{' then begin
- Error (reeRarseAtomInternalDisaster);
- EXIT;
- end
- else len := strcspn (regparse + 1, META) + 1; // bad {n,m} - compile as EXATLY
- ender := (regparse + len)^;
- if (len > 1)
- and ((ender = '*') or (ender = '+') or (ender = '?') or (ender = '{'))
- then dec (len); // Back off clear of ?+*{ operand.
- flagp := flagp or HASWIDTH;
- if len = 1
- then flagp := flagp or SIMPLE;
- if (fCompModifiers and MaskModI) <> 0
- then ret := EmitNode (EXACTLYCI)
- else ret := EmitNode (EXACTLY);
- while (len > 0)
- and (((fCompModifiers and MaskModX) = 0) or (regparse^ <> '#')) do begin
- if ((fCompModifiers and MaskModX) = 0) or not ( //###0.941
- {$IFDEF UniCode}StrScan (XIgnoredChars, regparse^) <> nil //###0.947
- {$ELSE}regparse^ in XIgnoredChars{$ENDIF} )
- then EmitC (regparse^);
- inc (regparse);
- dec (len);
- end;
- EmitC (#0);
- end; { of if not comment}
- end; { of case else}
- end; { of case}
- Result := ret;
- end; { of function TRegExpr.ParseAtom
- --------------------------------------------------------------}
- function TRegExpr.GetCompilerErrorPos : PtrInt;
- begin
- Result := 0;
- if (regexpbeg = nil) or (regparse = nil)
- then EXIT; // not in compiling mode ?
- Result := regparse - regexpbeg;
- end; { of function TRegExpr.GetCompilerErrorPos
- --------------------------------------------------------------}
- {=============================================================}
- {===================== Matching section ======================}
- {=============================================================}
- {$IFNDEF UseSetOfChar}
- function TRegExpr.StrScanCI (s : PRegExprChar; ch : REChar) : PRegExprChar; //###0.928 - now method of TRegExpr
- begin
- while (s^ <> #0) and (s^ <> ch) and (s^ <> InvertCase (ch))
- do inc (s);
- if s^ <> #0
- then Result := s
- else Result := nil;
- end; { of function TRegExpr.StrScanCI
- --------------------------------------------------------------}
- {$ENDIF}
- function TRegExpr.regrepeat (p : PRegExprChar; AMax : PtrInt) : PtrInt;
- // repeatedly match something simple, report how many
- var
- scan : PRegExprChar;
- opnd : PRegExprChar;
- TheMax : integer;
- {Ch,} InvCh : REChar; //###0.931
- sestart, seend : PRegExprChar; //###0.936
- begin
- Result := 0;
- scan := reginput;
- opnd := p + REOpSz + RENextOffSz; //OPERAND
- TheMax := fInputEnd - scan;
- if TheMax > AMax
- then TheMax := AMax;
- case PREOp (p)^ of
- ANY: begin
- // note - ANYML cannot be proceeded in regrepeat because can skip
- // more than one char at once
- Result := TheMax;
- inc (scan, Result);
- end;
- EXACTLY: begin // in opnd can be only ONE char !!!
- // Ch := opnd^; // store in register //###0.931
- while (Result < TheMax) and (opnd^ = scan^) do begin
- inc (Result);
- inc (scan);
- end;
- end;
- EXACTLYCI: begin // in opnd can be only ONE char !!!
- // Ch := opnd^; // store in register //###0.931
- while (Result < TheMax) and (opnd^ = scan^) do begin // prevent unneeded InvertCase //###0.931
- inc (Result);
- inc (scan);
- end;
- if Result < TheMax then begin //###0.931
- InvCh := InvertCase (opnd^); // store in register
- while (Result < TheMax) and
- ((opnd^ = scan^) or (InvCh = scan^)) do begin
- inc (Result);
- inc (scan);
- end;
- end;
- end;
- BSUBEXP: begin //###0.936
- sestart := startp [ord (opnd^)];
- if sestart = nil
- then EXIT;
- seend := endp [ord (opnd^)];
- if seend = nil
- then EXIT;
- REPEAT
- opnd := sestart;
- while opnd < seend do begin
- if (scan >= fInputEnd) or (scan^ <> opnd^)
- then EXIT;
- inc (scan);
- inc (opnd);
- end;
- inc (Result);
- reginput := scan;
- UNTIL Result >= AMax;
- end;
- BSUBEXPCI: begin //###0.936
- sestart := startp [ord (opnd^)];
- if sestart = nil
- then EXIT;
- seend := endp [ord (opnd^)];
- if seend = nil
- then EXIT;
- REPEAT
- opnd := sestart;
- while opnd < seend do begin
- if (scan >= fInputEnd) or
- ((scan^ <> opnd^) and (scan^ <> InvertCase (opnd^)))
- then EXIT;
- inc (scan);
- inc (opnd);
- end;
- inc (Result);
- reginput := scan;
- UNTIL Result >= AMax;
- end;
- ANYDIGIT:
- while (Result < TheMax) and
- (scan^ >= '0') and (scan^ <= '9') do begin
- inc (Result);
- inc (scan);
- end;
- NOTDIGIT:
- while (Result < TheMax) and
- ((scan^ < '0') or (scan^ > '9')) do begin
- inc (Result);
- inc (scan);
- end;
- {$IFNDEF UseSetOfChar} //###0.929
- ANYLETTER:
- while (Result < TheMax) and
- (Pos (scan^, fWordChars) > 0) //###0.940
- { ((scan^ >= 'a') and (scan^ <= 'z') !! I've forgotten (>='0') and (<='9')
- or (scan^ >= 'A') and (scan^ <= 'Z') or (scan^ = '_'))} do begin
- inc (Result);
- inc (scan);
- end;
- NOTLETTER:
- while (Result < TheMax) and
- (Pos (scan^, fWordChars) <= 0) //###0.940
- { not ((scan^ >= 'a') and (scan^ <= 'z') !! I've forgotten (>='0') and (<='9')
- or (scan^ >= 'A') and (scan^ <= 'Z')
- or (scan^ = '_'))} do begin
- inc (Result);
- inc (scan);
- end;
- ANYSPACE:
- while (Result < TheMax) and
- (Pos (scan^, fSpaceChars) > 0) do begin
- inc (Result);
- inc (scan);
- end;
- NOTSPACE:
- while (Result < TheMax) and
- (Pos (scan^, fSpaceChars) <= 0) do begin
- inc (Result);
- inc (scan);
- end;
- {$ENDIF}
- ANYOFTINYSET: begin
- while (Result < TheMax) and //!!!TinySet
- ((scan^ = opnd^) or (scan^ = (opnd + 1)^)
- or (scan^ = (opnd + 2)^)) do begin
- inc (Result);
- inc (scan);
- end;
- end;
- ANYBUTTINYSET: begin
- while (Result < TheMax) and //!!!TinySet
- (scan^ <> opnd^) and (scan^ <> (opnd + 1)^)
- and (scan^ <> (opnd + 2)^) do begin
- inc (Result);
- inc (scan);
- end;
- end;
- {$IFDEF UseSetOfChar} //###0.929
- ANYOFFULLSET: begin
- while (Result < TheMax) and
- (scan^ in PSetOfREChar (opnd)^) do begin
- inc (Result);
- inc (scan);
- end;
- end;
- {$ELSE}
- ANYOF:
- while (Result < TheMax) and
- (StrScan (opnd, scan^) <> nil) do begin
- inc (Result);
- inc (scan);
- end;
- ANYBUT:
- while (Result < TheMax) and
- (StrScan (opnd, scan^) = nil) do begin
- inc (Result);
- inc (scan);
- end;
- ANYOFCI:
- while (Result < TheMax) and (StrScanCI (opnd, scan^) <> nil) do begin
- inc (Result);
- inc (scan);
- end;
- ANYBUTCI:
- while (Result < TheMax) and (StrScanCI (opnd, scan^) = nil) do begin
- inc (Result);
- inc (scan);
- end;
- {$ENDIF}
- else begin // Oh dear. Called inappropriately.
- Result := 0; // Best compromise.
- Error (reeRegRepeatCalledInappropriately);
- EXIT;
- end;
- end; { of case}
- reginput := scan;
- end; { of function TRegExpr.regrepeat
- --------------------------------------------------------------}
- function TRegExpr.regnext (p : PRegExprChar) : PRegExprChar;
- // dig the "next" pointer out of a node
- var offset : TRENextOff;
- begin
- if p = @regdummy then begin
- Result := nil;
- EXIT;
- end;
- offset := PRENextOff (AlignToPtr(p + REOpSz))^; //###0.933 inlined NEXT
- if offset = 0
- then Result := nil
- else Result := p + offset;
- end; { of function TRegExpr.regnext
- --------------------------------------------------------------}
- function TRegExpr.MatchPrim (prog : PRegExprChar) : boolean;
- // recursively matching routine
- // Conceptually the strategy is simple: check to see whether the current
- // node matches, call self recursively to see whether the rest matches,
- // and then act accordingly. In practice we make some effort to avoid
- // recursion, in particular by going through "ordinary" nodes (that don't
- // need to know whether the rest of the match failed) by a loop instead of
- // by recursion.
- var
- scan : PRegExprChar; // Current node.
- next : PRegExprChar; // Next node.
- len : PtrInt;
- opnd : PRegExprChar;
- no : PtrInt;
- save : PRegExprChar;
- nextch : REChar;
- BracesMin, BracesMax : PtrInt; // we use integer instead of TREBracesArg for better support */+
- {$IFDEF ComplexBraces}
- SavedLoopStack : array [1 .. LoopStackMax] of integer; // :(( very bad for recursion
- SavedLoopStackIdx : integer; //###0.925
- {$ENDIF}
- begin
- Result := false;
- scan := prog;
- while scan <> nil do begin
- len := PRENextOff (AlignToPtr(scan + 1))^; //###0.932 inlined regnext
- if len = 0
- then next := nil
- else next := scan + len;
- case scan^ of
- NOTBOUND, //###0.943 //!!! think about UseSetOfChar !!!
- BOUND:
- if (scan^ = BOUND)
- xor (
- ((reginput = fInputStart) or (Pos ((reginput - 1)^, fWordChars) <= 0))
- and (reginput^ <> #0) and (Pos (reginput^, fWordChars) > 0)
- or
- (reginput <> fInputStart) and (Pos ((reginput - 1)^, fWordChars) > 0)
- and ((reginput^ = #0) or (Pos (reginput^, fWordChars) <= 0)))
- then EXIT;
- BOL: if reginput <> fInputStart
- then EXIT;
- EOL: if reginput^ <> #0
- then EXIT;
- BOLML: if reginput > fInputStart then begin
- nextch := (reginput - 1)^;
- if (nextch <> fLinePairedSeparatorTail)
- or ((reginput - 1) <= fInputStart)
- or ((reginput - 2)^ <> fLinePairedSeparatorHead)
- then begin
- if (nextch = fLinePairedSeparatorHead)
- and (reginput^ = fLinePairedSeparatorTail)
- then EXIT; // don't stop between paired separator
- if
- {$IFNDEF UniCode}
- not (nextch in fLineSeparatorsSet)
- {$ELSE}
- (pos (nextch, fLineSeparators) <= 0)
- {$ENDIF}
- then EXIT;
- end;
- end;
- EOLML: if reginput^ <> #0 then begin
- nextch := reginput^;
- if (nextch <> fLinePairedSeparatorHead)
- or ((reginput + 1)^ <> fLinePairedSeparatorTail)
- then begin
- if (nextch = fLinePairedSeparatorTail)
- and (reginput > fInputStart)
- and ((reginput - 1)^ = fLinePairedSeparatorHead)
- then EXIT; // don't stop between paired separator
- if
- {$IFNDEF UniCode}
- not (nextch in fLineSeparatorsSet)
- {$ELSE}
- (pos (nextch, fLineSeparators) <= 0)
- {$ENDIF}
- then EXIT;
- end;
- end;
- ANY: begin
- if reginput^ = #0
- then EXIT;
- inc (reginput);
- end;
- ANYML: begin //###0.941
- if (reginput^ = #0)
- or ((reginput^ = fLinePairedSeparatorHead)
- and ((reginput + 1)^ = fLinePairedSeparatorTail))
- or {$IFNDEF UniCode} (reginput^ in fLineSeparatorsSet)
- {$ELSE} (pos (reginput^, fLineSeparators) > 0) {$ENDIF}
- then EXIT;
- inc (reginput);
- end;
- ANYDIGIT: begin
- if (reginput^ = #0) or (reginput^ < '0') or (reginput^ > '9')
- then EXIT;
- inc (reginput);
- end;
- NOTDIGIT: begin
- if (reginput^ = #0) or ((reginput^ >= '0') and (reginput^ <= '9'))
- then EXIT;
- inc (reginput);
- end;
- {$IFNDEF UseSetOfChar} //###0.929
- ANYLETTER: begin
- if (reginput^ = #0) or (Pos (reginput^, fWordChars) <= 0) //###0.943
- then EXIT;
- inc (reginput);
- end;
- NOTLETTER: begin
- if (reginput^ = #0) or (Pos (reginput^, fWordChars) > 0) //###0.943
- then EXIT;
- inc (reginput);
- end;
- ANYSPACE: begin
- if (reginput^ = #0) or not (Pos (reginput^, fSpaceChars) > 0) //###0.943
- then EXIT;
- inc (reginput);
- end;
- NOTSPACE: begin
- if (reginput^ = #0) or (Pos (reginput^, fSpaceChars) > 0) //###0.943
- then EXIT;
- inc (reginput);
- end;
- {$ENDIF}
- EXACTLYCI: begin
- opnd := scan + REOpSz + RENextOffSz; // OPERAND
- // Inline the first character, for speed.
- if (opnd^ <> reginput^)
- and (InvertCase (opnd^) <> reginput^)
- then EXIT;
- len := strlen (opnd);
- //###0.929 begin
- no := len;
- save := reginput;
- while no > 1 do begin
- inc (save);
- inc (opnd);
- if (opnd^ <> save^)
- and (InvertCase (opnd^) <> save^)
- then EXIT;
- dec (no);
- end;
- //###0.929 end
- inc (reginput, len);
- end;
- EXACTLY: begin
- opnd := scan + REOpSz + RENextOffSz; // OPERAND
- // Inline the first character, for speed.
- if opnd^ <> reginput^
- then EXIT;
- len := strlen (opnd);
- //###0.929 begin
- no := len;
- save := reginput;
- while no > 1 do begin
- inc (save);
- inc (opnd);
- if opnd^ <> save^
- then EXIT;
- dec (no);
- end;
- //###0.929 end
- inc (reginput, len);
- end;
- BSUBEXP: begin //###0.936
- no := ord ((scan + REOpSz + RENextOffSz)^);
- if startp [no] = nil
- then EXIT;
- if endp [no] = nil
- then EXIT;
- save := reginput;
- opnd := startp [no];
- while opnd < endp [no] do begin
- if (save >= fInputEnd) or (save^ <> opnd^)
- then EXIT;
- inc (save);
- inc (opnd);
- end;
- reginput := save;
- end;
- BSUBEXPCI: begin //###0.936
- no := ord ((scan + REOpSz + RENextOffSz)^);
- if startp [no] = nil
- then EXIT;
- if endp [no] = nil
- then EXIT;
- save := reginput;
- opnd := startp [no];
- while opnd < endp [no] do begin
- if (save >= fInputEnd) or
- ((save^ <> opnd^) and (save^ <> InvertCase (opnd^)))
- then EXIT;
- inc (save);
- inc (opnd);
- end;
- reginput := save;
- end;
- ANYOFTINYSET: begin
- if (reginput^ = #0) or //!!!TinySet
- ((reginput^ <> (scan + REOpSz + RENextOffSz)^)
- and (reginput^ <> (scan + REOpSz + RENextOffSz + 1)^)
- and (reginput^ <> (scan + REOpSz + RENextOffSz + 2)^))
- then EXIT;
- inc (reginput);
- end;
- ANYBUTTINYSET: begin
- if (reginput^ = #0) or //!!!TinySet
- (reginput^ = (scan + REOpSz + RENextOffSz)^)
- or (reginput^ = (scan + REOpSz + RENextOffSz + 1)^)
- or (reginput^ = (scan + REOpSz + RENextOffSz + 2)^)
- then EXIT;
- inc (reginput);
- end;
- {$IFDEF UseSetOfChar} //###0.929
- ANYOFFULLSET: begin
- if (reginput^ = #0)
- or not (reginput^ in PSetOfREChar (scan + REOpSz + RENextOffSz)^)
- then EXIT;
- inc (reginput);
- end;
- {$ELSE}
- ANYOF: begin
- if (reginput^ = #0) or (StrScan (scan + REOpSz + RENextOffSz, reginput^) = nil)
- then EXIT;
- inc (reginput);
- end;
- ANYBUT: begin
- if (reginput^ = #0) or (StrScan (scan + REOpSz + RENextOffSz, reginput^) <> nil)
- then EXIT;
- inc (reginput);
- end;
- ANYOFCI: begin
- if (reginput^ = #0) or (StrScanCI (scan + REOpSz + RENextOffSz, reginput^) = nil)
- then EXIT;
- inc (reginput);
- end;
- ANYBUTCI: begin
- if (reginput^ = #0) or (StrScanCI (scan + REOpSz + RENextOffSz, reginput^) <> nil)
- then EXIT;
- inc (reginput);
- end;
- {$ENDIF}
- NOTHING: ;
- COMMENT: ;
- BACK: ;
- Succ (OPEN) .. TREOp (Ord (OPEN) + NSUBEXP - 1) : begin //###0.929
- no := ord (scan^) - ord (OPEN);
- // save := reginput;
- save := startp [no]; //###0.936
- startp [no] := reginput; //###0.936
- Result := MatchPrim (next);
- if not Result //###0.936
- then startp [no] := save;
- // if Result and (startp [no] = nil)
- // then startp [no] := save;
- // Don't set startp if some later invocation of the same
- // parentheses already has.
- EXIT;
- end;
- Succ (CLOSE) .. TREOp (Ord (CLOSE) + NSUBEXP - 1): begin //###0.929
- no := ord (scan^) - ord (CLOSE);
- // save := reginput;
- save := endp [no]; //###0.936
- endp [no] := reginput; //###0.936
- Result := MatchPrim (next);
- if not Result //###0.936
- then endp [no] := save;
- // if Result and (endp [no] = nil)
- // then endp [no] := save;
- // Don't set endp if some later invocation of the same
- // parentheses already has.
- EXIT;
- end;
- BRANCH: begin
- if (next^ <> BRANCH) // No choice.
- then next := scan + REOpSz + RENextOffSz // Avoid recursion
- else begin
- REPEAT
- save := reginput;
- Result := MatchPrim (scan + REOpSz + RENextOffSz);
- if Result
- then EXIT;
- reginput := save;
- scan := regnext (scan);
- UNTIL (scan = nil) or (scan^ <> BRANCH);
- EXIT;
- end;
- end;
- {$IFDEF ComplexBraces}
- LOOPENTRY: begin //###0.925
- no := LoopStackIdx;
- inc (LoopStackIdx);
- if LoopStackIdx > LoopStackMax then begin
- Error (reeLoopStackExceeded);
- EXIT;
- end;
- save := reginput;
- LoopStack [LoopStackIdx] := 0; // init loop counter
- Result := MatchPrim (next); // execute LOOP
- LoopStackIdx := no; // cleanup
- if Result
- then EXIT;
- reginput := save;
- EXIT;
- end;
- LOOP, LOOPNG: begin //###0.940
- if LoopStackIdx <= 0 then begin
- Error (reeLoopWithoutEntry);
- EXIT;
- end;
- opnd := scan + PRENextOff (AlignToPtr(scan + REOpSz + RENextOffSz + 2 * REBracesArgSz))^;
- BracesMin := PREBracesArg (AlignToInt(scan + REOpSz + RENextOffSz))^;
- BracesMax := PREBracesArg (AlignToPtr(scan + REOpSz + RENextOffSz + REBracesArgSz))^;
- save := reginput;
- if LoopStack [LoopStackIdx] >= BracesMin then begin // Min alredy matched - we can work
- if scan^ = LOOP then begin
- // greedy way - first try to max deep of greed ;)
- if LoopStack [LoopStackIdx] < BracesMax then begin
- inc (LoopStack [LoopStackIdx]);
- no := LoopStackIdx;
- Result := MatchPrim (opnd);
- LoopStackIdx := no;
- if Result
- then EXIT;
- reginput := save;
- end;
- dec (LoopStackIdx); // Fail. May be we are too greedy? ;)
- Result := MatchPrim (next);
- if not Result
- then reginput := save;
- EXIT;
- end
- else begin
- // non-greedy - try just now
- Result := MatchPrim (next);
- if Result
- then EXIT
- else reginput := save; // failed - move next and try again
- if LoopStack [LoopStackIdx] < BracesMax then begin
- inc (LoopStack [LoopStackIdx]);
- no := LoopStackIdx;
- Result := MatchPrim (opnd);
- LoopStackIdx := no;
- if Result
- then EXIT;
- reginput := save;
- end;
- dec (LoopStackIdx); // Failed - back up
- EXIT;
- end
- end
- else begin // first match a min_cnt times
- inc (LoopStack [LoopStackIdx]);
- no := LoopStackIdx;
- Result := MatchPrim (opnd);
- LoopStackIdx := no;
- if Result
- then EXIT;
- dec (LoopStack [LoopStackIdx]);
- reginput := save;
- EXIT;
- end;
- end;
- {$ENDIF}
- STAR, PLUS, BRACES, STARNG, PLUSNG, BRACESNG: begin
- // Lookahead to avoid useless match attempts when we know
- // what character comes next.
- nextch := #0;
- if next^ = EXACTLY
- then nextch := (next + REOpSz + RENextOffSz)^;
- BracesMax := MaxInt; // infinite loop for * and + //###0.92
- if (scan^ = STAR) or (scan^ = STARNG)
- then BracesMin := 0 // STAR
- else if (scan^ = PLUS) or (scan^ = PLUSNG)
- then BracesMin := 1 // PLUS
- else begin // BRACES
- BracesMin := PREBracesArg (AlignToPtr(scan + REOpSz + RENextOffSz))^;
- BracesMax := PREBracesArg (AlignToPtr(scan + REOpSz + RENextOffSz + REBracesArgSz))^;
- end;
- save := reginput;
- opnd := scan + REOpSz + RENextOffSz;
- if (scan^ = BRACES) or (scan^ = BRACESNG)
- then inc (opnd, 2 * REBracesArgSz);
- if (scan^ = PLUSNG) or (scan^ = STARNG) or (scan^ = BRACESNG) then begin
- // non-greedy mode
- BracesMax := regrepeat (opnd, BracesMax); // don't repeat more than BracesMax
- // Now we know real Max limit to move forward (for recursion 'back up')
- // In some cases it can be faster to check only Min positions first,
- // but after that we have to check every position separtely instead
- // of fast scannig in loop.
- no := BracesMin;
- while no <= BracesMax do begin
- reginput := save + no;
- // If it could work, try it.
- if (nextch = #0) or (reginput^ = nextch) then begin
- {$IFDEF ComplexBraces}
- System.Move (LoopStack, SavedLoopStack, SizeOf (LoopStack)); //###0.925
- SavedLoopStackIdx := LoopStackIdx;
- {$ENDIF}
- if MatchPrim (next) then begin
- Result := true;
- EXIT;
- end;
- {$IFDEF ComplexBraces}
- System.Move (SavedLoopStack, LoopStack, SizeOf (LoopStack));
- LoopStackIdx := SavedLoopStackIdx;
- {$ENDIF}
- end;
- inc (no); // Couldn't or didn't - move forward.
- end; { of while}
- EXIT;
- end
- else begin // greedy mode
- no := regrepeat (opnd, BracesMax); // don't repeat more than max_cnt
- while no >= BracesMin do begin
- // If it could work, try it.
- if (nextch = #0) or (reginput^ = nextch) then begin
- {$IFDEF ComplexBraces}
- System.Move (LoopStack, SavedLoopStack, SizeOf (LoopStack)); //###0.925
- SavedLoopStackIdx := LoopStackIdx;
- {$ENDIF}
- if MatchPrim (next) then begin
- Result := true;
- EXIT;
- end;
- {$IFDEF ComplexBraces}
- System.Move (SavedLoopStack, LoopStack, SizeOf (LoopStack));
- LoopStackIdx := SavedLoopStackIdx;
- {$ENDIF}
- end;
- dec (no); // Couldn't or didn't - back up.
- reginput := save + no;
- end; { of while}
- EXIT;
- end;
- end;
- EEND: begin
- Result := true; // Success!
- EXIT;
- end;
- else begin
- Error (reeMatchPrimMemoryCorruption);
- EXIT;
- end;
- end; { of case scan^}
- scan := next;
- end; { of while scan <> nil}
- // We get here only if there's trouble -- normally "case EEND" is the
- // terminating point.
- Error (reeMatchPrimCorruptedPointers);
- end; { of function TRegExpr.MatchPrim
- --------------------------------------------------------------}
- {$IFDEF UseFirstCharSet} //###0.929
- procedure TRegExpr.FillFirstCharSet (prog : PRegExprChar);
- var
- scan : PRegExprChar; // Current node.
- next : PRegExprChar; // Next node.
- opnd : PRegExprChar;
- min_cnt : integer;
- begin
- scan := prog;
- while scan <> nil do begin
- next := regnext (scan);
- case PREOp (scan)^ of
- BSUBEXP, BSUBEXPCI: begin //###0.938
- FirstCharSet := [#0 .. #255]; // :((( we cannot
- // optimize r.e. if it starts with back reference
- EXIT;
- end;
- BOL, BOLML: ; // EXIT; //###0.937
- EOL, EOLML: begin //###0.948 was empty in 0.947, was EXIT in 0.937
- Include (FirstCharSet, #0);
- if ModifierM
- then begin
- opnd := PRegExprChar (LineSeparators);
- while opnd^ <> #0 do begin
- Include (FirstCharSet, opnd^);
- inc (opnd);
- end;
- end;
- EXIT;
- end;
- BOUND, NOTBOUND: ; //###0.943 ?!!
- ANY, ANYML: begin // we can better define ANYML !!!
- FirstCharSet := [#0 .. #255]; //###0.930
- EXIT;
- end;
- ANYDIGIT: begin
- FirstCharSet := FirstCharSet + ['0' .. '9'];
- EXIT;
- end;
- NOTDIGIT: begin
- FirstCharSet := FirstCharSet + ([#0 .. #255] - ['0' .. '9']); //###0.948 FirstCharSet was forgotten
- EXIT;
- end;
- EXACTLYCI: begin
- Include (FirstCharSet, (scan + REOpSz + RENextOffSz)^);
- Include (FirstCharSet, InvertCase ((scan + REOpSz + RENextOffSz)^));
- EXIT;
- end;
- EXACTLY: begin
- Include (FirstCharSet, (scan + REOpSz + RENextOffSz)^);
- EXIT;
- end;
- ANYOFFULLSET: begin
- FirstCharSet := FirstCharSet + PSetOfREChar (scan + REOpSz + RENextOffSz)^;
- EXIT;
- end;
- ANYOFTINYSET: begin
- //!!!TinySet
- Include (FirstCharSet, (scan + REOpSz + RENextOffSz)^);
- Include (FirstCharSet, (scan + REOpSz + RENextOffSz + 1)^);
- Include (FirstCharSet, (scan + REOpSz + RENextOffSz + 2)^);
- // ... // up to TinySetLen
- EXIT;
- end;
- ANYBUTTINYSET: begin
- //!!!TinySet
- FirstCharSet := FirstCharSet + ([#0 .. #255] - [ //###0.948 FirstCharSet was forgotten
- (scan + REOpSz + RENextOffSz)^,
- (scan + REOpSz + RENextOffSz + 1)^,
- (scan + REOpSz + RENextOffSz + 2)^]);
- // ... // up to TinySetLen
- EXIT;
- end;
- NOTHING: ;
- COMMENT: ;
- BACK: ;
- Succ (OPEN) .. TREOp (Ord (OPEN) + NSUBEXP - 1) : begin //###0.929
- FillFirstCharSet (next);
- EXIT;
- end;
- Succ (CLOSE) .. TREOp (Ord (CLOSE) + NSUBEXP - 1): begin //###0.929
- FillFirstCharSet (next);
- EXIT;
- end;
- BRANCH: begin
- if (PREOp (next)^ <> BRANCH) // No choice.
- then next := scan + REOpSz + RENextOffSz // Avoid recursion.
- else begin
- REPEAT
- FillFirstCharSet (scan + REOpSz + RENextOffSz);
- scan := regnext (scan);
- UNTIL (scan = nil) or (PREOp (scan)^ <> BRANCH);
- EXIT;
- end;
- end;
- {$IFDEF ComplexBraces}
- LOOPENTRY: begin //###0.925
- // LoopStack [LoopStackIdx] := 0; //###0.940 line removed
- FillFirstCharSet (next); // execute LOOP
- EXIT;
- end;
- LOOP, LOOPNG: begin //###0.940
- opnd := scan + PRENextOff (AlignToPtr(scan + REOpSz + RENextOffSz + REBracesArgSz * 2))^;
- min_cnt := PREBracesArg (AlignToPtr(scan + REOpSz + RENextOffSz))^;
- FillFirstCharSet (opnd);
- if min_cnt = 0
- then FillFirstCharSet (next);
- EXIT;
- end;
- {$ENDIF}
- STAR, STARNG: //###0.940
- FillFirstCharSet (scan + REOpSz + RENextOffSz);
- PLUS, PLUSNG: begin //###0.940
- FillFirstCharSet (scan + REOpSz + RENextOffSz);
- EXIT;
- end;
- BRACES, BRACESNG: begin //###0.940
- opnd := scan + REOpSz + RENextOffSz + REBracesArgSz * 2;
- min_cnt := PREBracesArg (AlignToPtr(scan + REOpSz + RENextOffSz))^; // BRACES
- FillFirstCharSet (opnd);
- if min_cnt > 0
- then EXIT;
- end;
- EEND: begin
- FirstCharSet := [#0 .. #255]; //###0.948
- EXIT;
- end;
- else begin
- Error (reeMatchPrimMemoryCorruption);
- EXIT;
- end;
- end; { of case scan^}
- scan := next;
- end; { of while scan <> nil}
- end; { of procedure FillFirstCharSet
- --------------------------------------------------------------}
- {$ENDIF}
- function TRegExpr.Exec (const AInputString : RegExprString) : boolean;
- begin
- InputString := AInputString;
- Result := ExecPrim (1);
- end; { of function TRegExpr.Exec
- --------------------------------------------------------------}
- {$IFDEF OverMeth}
- {$IFNDEF FPC}
- function TRegExpr.Exec : boolean;
- begin
- Result := ExecPrim (1);
- end; { of function TRegExpr.Exec
- --------------------------------------------------------------}
- {$ENDIF}
- function TRegExpr.Exec (AOffset: PtrInt) : boolean;
- begin
- Result := ExecPrim (AOffset);
- end; { of function TRegExpr.Exec
- --------------------------------------------------------------}
- {$ENDIF}
- function TRegExpr.ExecPos (AOffset: PtrInt {$IFDEF DefParam}= 1{$ENDIF}) : boolean;
- begin
- Result := ExecPrim (AOffset);
- end; { of function TRegExpr.ExecPos
- --------------------------------------------------------------}
- function TRegExpr.ExecPrim (AOffset: PtrInt) : boolean;
- procedure ClearMatchs;
- // Clears matchs array
- var i : integer;
- begin
- for i := 0 to NSUBEXP - 1 do begin
- startp [i] := nil;
- endp [i] := nil;
- end;
- end; { of procedure ClearMatchs;
- ..............................................................}
- function RegMatch (str : PRegExprChar) : boolean;
- // try match at specific point
- begin
- //###0.949 removed clearing of start\endp
- reginput := str;
- Result := MatchPrim (programm + REOpSz);
- if Result then begin
- startp [0] := str;
- endp [0] := reginput;
- end;
- end; { of function RegMatch
- ..............................................................}
- var
- s : PRegExprChar;
- StartPtr: PRegExprChar;
- InputLen : PtrInt;
- begin
- Result := false; // Be paranoid...
- ClearMatchs; //###0.949
- // ensure that Match cleared either if optimization tricks or some error
- // will lead to leaving ExecPrim without actual search. That is
- // importent for ExecNext logic and so on.
- if not IsProgrammOk //###0.929
- then EXIT;
- // Check InputString presence
- if not Assigned (fInputString) then begin
- Error (reeNoInputStringSpecified);
- EXIT;
- end;
- InputLen := length (fInputString);
- //Check that the start position is not negative
- if AOffset < 1 then begin
- Error (reeOffsetMustBeGreaterThen0);
- EXIT;
- end;
- // Check that the start position is not longer than the line
- // If so then exit with nothing found
- if AOffset > (InputLen + 1) // for matching empty string after last char.
- then EXIT;
- StartPtr := fInputString + AOffset - 1;
- // If there is a "must appear" string, look for it.
- if regmust <> nil then begin
- s := StartPtr;
- REPEAT
- s := StrScan (s, regmust [0]);
- if s <> nil then begin
- if StrLComp (s, regmust, regmlen) = 0
- then BREAK; // Found it.
- inc (s);
- end;
- UNTIL s = nil;
- if s = nil // Not present.
- then EXIT;
- end;
- // Mark beginning of line for ^ .
- fInputStart := fInputString;
- // Pointer to end of input stream - for
- // pascal-style string processing (may include #0)
- fInputEnd := fInputString + InputLen;
- {$IFDEF ComplexBraces}
- // no loops started
- LoopStackIdx := 0; //###0.925
- {$ENDIF}
- // Simplest case: anchored match need be tried only once.
- if reganch <> #0 then begin
- Result := RegMatch (StartPtr);
- EXIT;
- end;
- // Messy cases: unanchored match.
- s := StartPtr;
- if regstart <> #0 then // We know what char it must start with.
- REPEAT
- s := StrScan (s, regstart);
- if s <> nil then begin
- Result := RegMatch (s);
- if Result
- then EXIT
- else ClearMatchs; //###0.949
- inc (s);
- end;
- UNTIL s = nil
- else begin // We don't - general case.
- repeat //###0.948
- {$IFDEF UseFirstCharSet}
- if s^ in FirstCharSet
- then Result := RegMatch (s);
- {$ELSE}
- Result := RegMatch (s);
- {$ENDIF}
- if Result or (s^ = #0) // Exit on a match or after testing the end-of-string.
- then EXIT
- else ClearMatchs; //###0.949
- inc (s);
- until false;
- (* optimized and fixed by Martin Fuller - empty strings
- were not allowed to pass through in UseFirstCharSet mode
- {$IFDEF UseFirstCharSet} //###0.929
- while s^ <> #0 do begin
- if s^ in FirstCharSet
- then Result := RegMatch (s);
- if Result
- then EXIT;
- inc (s);
- end;
- {$ELSE}
- REPEAT
- Result := RegMatch (s);
- if Result
- then EXIT;
- inc (s);
- UNTIL s^ = #0;
- {$ENDIF}
- *)
- end;
- // Failure
- end; { of function TRegExpr.ExecPrim
- --------------------------------------------------------------}
- function TRegExpr.ExecNext : boolean;
- var offset : PtrInt;
- begin
- Result := false;
- if not Assigned (startp[0]) or not Assigned (endp[0]) then begin
- Error (reeExecNextWithoutExec);
- EXIT;
- end;
- // Offset := MatchPos [0] + MatchLen [0];
- // if MatchLen [0] = 0
- Offset := endp [0] - fInputString + 1; //###0.929
- if endp [0] = startp [0] //###0.929
- then inc (Offset); // prevent infinite looping if empty string match r.e.
- Result := ExecPrim (Offset);
- end; { of function TRegExpr.ExecNext
- --------------------------------------------------------------}
- function TRegExpr.GetInputString : RegExprString;
- begin
- if not Assigned (fInputString) then begin
- Error (reeGetInputStringWithoutInputString);
- EXIT;
- end;
- Result := fInputString;
- end; { of function TRegExpr.GetInputString
- --------------------------------------------------------------}
- procedure TRegExpr.SetInputString (const AInputString : RegExprString);
- var
- Len : PtrInt;
- i : PtrInt;
- begin
- // clear Match* - before next Exec* call it's undefined
- for i := 0 to NSUBEXP - 1 do begin
- startp [i] := nil;
- endp [i] := nil;
- end;
- // need reallocation of input string buffer ?
- Len := length (AInputString);
- ReAllocMem(fInputString,(Len + 1) * SizeOf (REChar));
- // copy input string into buffer
- if Len>0 then
- System.Move(AInputString[1],fInputString^,(Len+1)* SizeOf (REChar)) // with #0
- else
- fInputString[0]:=#0;
- {
- fInputString : string;
- fInputStart, fInputEnd : PRegExprChar;
- SetInputString:
- fInputString := AInputString;
- UniqueString (fInputString);
- fInputStart := PChar (fInputString);
- Len := length (fInputString);
- fInputEnd := PRegExprChar (integer (fInputStart) + Len); ??
- !! startp/endp все равно будет опасно использовать ?
- }
- end; { of procedure TRegExpr.SetInputString
- --------------------------------------------------------------}
- procedure TRegExpr.SetLineSeparators (const AStr : RegExprString);
- begin
- if AStr <> fLineSeparators then begin
- fLineSeparators := AStr;
- InvalidateProgramm;
- end;
- end; { of procedure TRegExpr.SetLineSeparators
- --------------------------------------------------------------}
- procedure TRegExpr.SetLinePairedSeparator (const AStr : RegExprString);
- begin
- if length (AStr) = 2 then begin
- if AStr [1] = AStr [2] then begin
- // it's impossible for our 'one-point' checking to support
- // two chars separator for identical chars
- Error (reeBadLinePairedSeparator);
- EXIT;
- end;
- if not fLinePairedSeparatorAssigned
- or (AStr [1] <> fLinePairedSeparatorHead)
- or (AStr [2] <> fLinePairedSeparatorTail) then begin
- fLinePairedSeparatorAssigned := true;
- fLinePairedSeparatorHead := AStr [1];
- fLinePairedSeparatorTail := AStr [2];
- InvalidateProgramm;
- end;
- end
- else if length (AStr) = 0 then begin
- if fLinePairedSeparatorAssigned then begin
- fLinePairedSeparatorAssigned := false;
- InvalidateProgramm;
- end;
- end
- else Error (reeBadLinePairedSeparator);
- end; { of procedure TRegExpr.SetLinePairedSeparator
- --------------------------------------------------------------}
- function TRegExpr.GetLinePairedSeparator : RegExprString;
- begin
- if fLinePairedSeparatorAssigned then begin
- {$IFDEF UniCode}
- // Here is some UniCode 'magic'
- // If You do know better decision to concatenate
- // two WideChars, please, let me know!
- Result := fLinePairedSeparatorHead; //###0.947
- Result := Result + fLinePairedSeparatorTail;
- {$ELSE}
- Result := fLinePairedSeparatorHead + fLinePairedSeparatorTail;
- {$ENDIF}
- end
- else Result := '';
- end; { of function TRegExpr.GetLinePairedSeparator
- --------------------------------------------------------------}
- function TRegExpr.Substitute (const ATemplate : RegExprString) : RegExprString;
- // perform substitutions after a regexp match
- // completely rewritten in 0.929
- type
- TSubstMode = (smodeNormal, smodeOneUpper, smodeOneLower, smodeAllUpper,
- smodeAllLower);
- var
- TemplateLen : PtrInt;
- TemplateBeg, TemplateEnd : PRegExprChar;
- p, p0, p1, ResultPtr : PRegExprChar;
- ResultLen : PtrInt;
- n : PtrInt;
- Ch : REChar;
- Mode: TSubstMode;
- LineEnd: String = {$ifdef UseOsLineEndOnReplace} System.LineEnding {$else} Chr(10) {$endif};
- function ParseVarName (var APtr : PRegExprChar) : PtrInt;
- // extract name of variable (digits, may be enclosed with
- // curly braces) from APtr^, uses TemplateEnd !!!
- const
- Digits = ['0' .. '9'];
- var
- p : PRegExprChar;
- Delimited : boolean;
- begin
- Result := 0;
- p := APtr;
- Delimited := (p < TemplateEnd) and (p^ = '{');
- if Delimited
- then inc (p); // skip left curly brace
- if (p < TemplateEnd) and (p^ = '&')
- then inc (p) // this is '$&' or '${&}'
- else
- while (p < TemplateEnd) and
- {$IFDEF UniCode} //###0.935
- (ord (p^) < 256) and (char (p^) in Digits)
- {$ELSE}
- (p^ in Digits)
- {$ENDIF}
- do begin
- Result := Result * 10 + (ord (p^) - ord ('0')); //###0.939
- inc (p);
- end;
- if Delimited then
- if (p < TemplateEnd) and (p^ = '}')
- then inc (p) // skip right curly brace
- else p := APtr; // isn't properly terminated
- if p = APtr
- then Result := -1; // no valid digits found or no right curly brace
- APtr := p;
- end;
- begin
- // Check programm and input string
- if not IsProgrammOk
- then EXIT;
- if not Assigned (fInputString) then begin
- Error (reeNoInputStringSpecified);
- EXIT;
- end;
- // Prepare for working
- TemplateLen := length (ATemplate);
- if TemplateLen = 0 then begin // prevent nil pointers
- Result := '';
- EXIT;
- end;
- TemplateBeg := pointer (ATemplate);
- TemplateEnd := TemplateBeg + TemplateLen;
- // Count result length for speed optimization.
- ResultLen := 0;
- p := TemplateBeg;
- while p < TemplateEnd do begin
- Ch := p^;
- inc (p);
- if Ch = '$'
- then n := ParseVarName (p)
- else n := -1;
- if n >= 0 then begin
- if (n < NSUBEXP) and Assigned (startp [n]) and Assigned (endp [n])
- then inc (ResultLen, endp [n] - startp [n]);
- end
- else begin
- if (Ch = EscChar) and (p < TemplateEnd) then begin // quoted or special char followed
- Ch := p^;
- inc (p);
- case Ch of
- 'n' : inc(ResultLen, Length(LineEnd));
- 'u', 'l', 'U', 'L': {nothing};
- else inc(ResultLen);
- end;
- end
- else
- inc(ResultLen);
- end;
- end;
- // Get memory. We do it once and it significant speed up work !
- if ResultLen = 0 then begin
- Result := '';
- EXIT;
- end;
- //SetString (Result, nil, ResultLen);
- SetLength(Result,ResultLen);
- // Fill Result
- ResultPtr := pointer (Result);
- p := TemplateBeg;
- Mode := smodeNormal;
- while p < TemplateEnd do begin
- Ch := p^;
- p0 := p;
- inc (p);
- p1 := p;
- if Ch = '$'
- then n := ParseVarName (p)
- else n := -1;
- if (n >= 0) then begin
- p0 := startp[n];
- p1 := endp[n];
- if (n >= NSUBEXP) or not Assigned (p0) or not Assigned (endp [n]) then
- p1 := p0; // empty
- end
- else begin
- if (Ch = EscChar) and (p < TemplateEnd) then begin // quoted or special char followed
- Ch := p^;
- inc (p);
- case Ch of
- 'n' : begin
- p0 := @LineEnd[1];
- p1 := p0 + Length(LineEnd);
- end;
- 'l' : begin
- Mode := smodeOneLower;
- p1 := p0;
- end;
- 'L' : begin
- Mode := smodeAllLower;
- p1 := p0;
- end;
- 'u' : begin
- Mode := smodeOneUpper;
- p1 := p0;
- end;
- 'U' : begin
- Mode := smodeAllUpper;
- p1 := p0;
- end;
- else
- begin
- inc(p0);
- inc(p1);
- end;
- end;
- end
- end;
- if p0 < p1 then begin
- while p0 < p1 do begin
- case Mode of
- smodeOneLower, smodeAllLower:
- begin
- Ch := p0^;
- if Ch < #128 then
- Ch := AnsiLowerCase(Ch)[1];
- ResultPtr^ := Ch;
- if Mode = smodeOneLower then
- Mode := smodeNormal;
- end;
- smodeOneUpper, smodeAllUpper:
- begin
- Ch := p0^;
- if Ch < #128 then
- Ch := AnsiUpperCase(Ch)[1];
- ResultPtr^ := Ch;
- if Mode = smodeOneUpper then
- Mode := smodeNormal;
- end;
- else
- ResultPtr^ := p0^;
- end;
- inc (ResultPtr);
- inc (p0);
- end;
- Mode := smodeNormal;
- end;
- end;
- end; { of function TRegExpr.Substitute
- --------------------------------------------------------------}
- procedure TRegExpr.Split (AInputStr : RegExprString; APieces : TStrings);
- var PrevPos : PtrInt;
- begin
- PrevPos := 1;
- if Exec (AInputStr) then
- REPEAT
- APieces.Add (System.Copy (AInputStr, PrevPos, MatchPos [0] - PrevPos));
- PrevPos := MatchPos [0] + MatchLen [0];
- UNTIL not ExecNext;
- APieces.Add (System.Copy (AInputStr, PrevPos, MaxInt)); // Tail
- end; { of procedure TRegExpr.Split
- --------------------------------------------------------------}
- function TRegExpr.Replace (AInputStr : RegExprString; const AReplaceStr : RegExprString;
- AUseSubstitution : boolean{$IFDEF DefParam}= False{$ENDIF}) : RegExprString;
- var
- PrevPos : PtrInt;
- begin
- Result := '';
- PrevPos := 1;
- if Exec (AInputStr) then
- REPEAT
- Result := Result + System.Copy (AInputStr, PrevPos,
- MatchPos [0] - PrevPos);
- if AUseSubstitution //###0.946
- then Result := Result + Substitute (AReplaceStr)
- else Result := Result + AReplaceStr;
- PrevPos := MatchPos [0] + MatchLen [0];
- UNTIL not ExecNext;
- Result := Result + System.Copy (AInputStr, PrevPos, MaxInt); // Tail
- end; { of function TRegExpr.Replace
- --------------------------------------------------------------}
- function TRegExpr.ReplaceEx (AInputStr : RegExprString;
- AReplaceFunc : TRegExprReplaceFunction)
- : RegExprString;
- var
- PrevPos : PtrInt;
- begin
- Result := '';
- PrevPos := 1;
- if Exec (AInputStr) then
- REPEAT
- Result := Result + System.Copy (AInputStr, PrevPos,
- MatchPos [0] - PrevPos)
- + AReplaceFunc (Self);
- PrevPos := MatchPos [0] + MatchLen [0];
- UNTIL not ExecNext;
- Result := Result + System.Copy (AInputStr, PrevPos, MaxInt); // Tail
- end; { of function TRegExpr.ReplaceEx
- --------------------------------------------------------------}
- {$IFDEF OverMeth}
- function TRegExpr.Replace (AInputStr : RegExprString;
- AReplaceFunc : TRegExprReplaceFunction)
- : RegExprString;
- begin
- {$IFDEF FPC}Result:={$ENDIF}ReplaceEx (AInputStr, AReplaceFunc);
- end; { of function TRegExpr.Replace
- --------------------------------------------------------------}
- {$ENDIF}
- {=============================================================}
- {====================== Debug section ========================}
- {=============================================================}
- {$IFDEF RegExpPCodeDump}
- function TRegExpr.DumpOp (op : TREOp) : RegExprString;
- // printable representation of opcode
- begin
- case op of
- BOL: Result := 'BOL';
- EOL: Result := 'EOL';
- BOLML: Result := 'BOLML';
- EOLML: Result := 'EOLML';
- BOUND: Result := 'BOUND'; //###0.943
- NOTBOUND: Result := 'NOTBOUND'; //###0.943
- ANY: Result := 'ANY';
- ANYML: Result := 'ANYML'; //###0.941
- ANYLETTER: Result := 'ANYLETTER';
- NOTLETTER: Result := 'NOTLETTER';
- ANYDIGIT: Result := 'ANYDIGIT';
- NOTDIGIT: Result := 'NOTDIGIT';
- ANYSPACE: Result := 'ANYSPACE';
- NOTSPACE: Result := 'NOTSPACE';
- ANYOF: Result := 'ANYOF';
- ANYBUT: Result := 'ANYBUT';
- ANYOFCI: Result := 'ANYOF/CI';
- ANYBUTCI: Result := 'ANYBUT/CI';
- BRANCH: Result := 'BRANCH';
- EXACTLY: Result := 'EXACTLY';
- EXACTLYCI: Result := 'EXACTLY/CI';
- NOTHING: Result := 'NOTHING';
- COMMENT: Result := 'COMMENT';
- BACK: Result := 'BACK';
- EEND: Result := 'END';
- BSUBEXP: Result := 'BSUBEXP';
- BSUBEXPCI: Result := 'BSUBEXP/CI';
- Succ (OPEN) .. TREOp (Ord (OPEN) + NSUBEXP - 1): //###0.929
- Result := Format ('OPEN[%d]', [ord (op) - ord (OPEN)]);
- Succ (CLOSE) .. TREOp (Ord (CLOSE) + NSUBEXP - 1): //###0.929
- Result := Format ('CLOSE[%d]', [ord (op) - ord (CLOSE)]);
- STAR: Result := 'STAR';
- PLUS: Result := 'PLUS';
- BRACES: Result := 'BRACES';
- {$IFDEF ComplexBraces}
- LOOPENTRY: Result := 'LOOPENTRY'; //###0.925
- LOOP: Result := 'LOOP'; //###0.925
- LOOPNG: Result := 'LOOPNG'; //###0.940
- {$ENDIF}
- ANYOFTINYSET: Result:= 'ANYOFTINYSET';
- ANYBUTTINYSET:Result:= 'ANYBUTTINYSET';
- {$IFDEF UseSetOfChar} //###0.929
- ANYOFFULLSET: Result:= 'ANYOFFULLSET';
- {$ENDIF}
- STARNG: Result := 'STARNG'; //###0.940
- PLUSNG: Result := 'PLUSNG'; //###0.940
- BRACESNG: Result := 'BRACESNG'; //###0.940
- else Error (reeDumpCorruptedOpcode);
- end; {of case op}
- Result := ':' + Result;
- end; { of function TRegExpr.DumpOp
- --------------------------------------------------------------}
- function TRegExpr.Dump : RegExprString;
- // dump a regexp in vaguely comprehensible form
- var
- s : PRegExprChar;
- op : TREOp; // Arbitrary non-END op.
- next : PRegExprChar;
- i : PtrInt;
- Diff : PtrInt;
- {$IFDEF UseSetOfChar} //###0.929
- Ch : REChar;
- {$ENDIF}
- begin
- if not IsProgrammOk //###0.929
- then EXIT;
- op := EXACTLY;
- Result := '';
- s := programm + REOpSz;
- while op <> EEND do begin // While that wasn't END last time...
- op := s^;
- Result := Result + Format ('%2d%s', [s - programm, DumpOp (s^)]); // Where, what.
- next := regnext (s);
- if next = nil // Next ptr.
- then Result := Result + ' (0)'
- else begin
- if next > s //###0.948 PWideChar subtraction workaround (see comments in Tail method for details)
- then Diff := next - s
- else Diff := - (s - next);
- Result := Result + Format (' (%d) ', [(s - programm) + Diff]);
- end;
- inc (s, REOpSz + RENextOffSz);
- if (op = ANYOF) or (op = ANYOFCI) or (op = ANYBUT) or (op = ANYBUTCI)
- or (op = EXACTLY) or (op = EXACTLYCI) then begin
- // Literal string, where present.
- while s^ <> #0 do begin
- Result := Result + s^;
- inc (s);
- end;
- inc (s);
- end;
- if (op = ANYOFTINYSET) or (op = ANYBUTTINYSET) then begin
- for i := 1 to TinySetLen do begin
- Result := Result + s^;
- inc (s);
- end;
- end;
- if (op = BSUBEXP) or (op = BSUBEXPCI) then begin
- Result := Result + ' \' + IntToStr (Ord (s^));
- inc (s);
- end;
- {$IFDEF UseSetOfChar} //###0.929
- if op = ANYOFFULLSET then begin
- for Ch := #0 to #255 do
- if Ch in PSetOfREChar (s)^ then
- if Ch < ' '
- then Result := Result + '#' + IntToStr (Ord (Ch)) //###0.936
- else Result := Result + Ch;
- inc (s, SizeOf (TSetOfREChar));
- end;
- {$ENDIF}
- if (op = BRACES) or (op = BRACESNG) then begin //###0.941
- // show min/max argument of BRACES operator
- Result := Result + Format ('{%d,%d}', [PREBracesArg (AlignToInt(s))^, PREBracesArg (AlignToInt(s + REBracesArgSz))^]);
- inc (s, REBracesArgSz * 2);
- end;
- {$IFDEF ComplexBraces}
- if (op = LOOP) or (op = LOOPNG) then begin //###0.940
- Result := Result + Format (' -> (%d) {%d,%d}', [
- (s - programm - (REOpSz + RENextOffSz)) + PRENextOff (AlignToPtr(s + 2 * REBracesArgSz))^,
- PREBracesArg (AlignToInt(s))^, PREBracesArg (AlignToInt(s + REBracesArgSz))^]);
- inc (s, 2 * REBracesArgSz + RENextOffSz);
- end;
- {$ENDIF}
- Result := Result + #$d#$a;
- end; { of while}
- // Header fields of interest.
- if regstart <> #0
- then Result := Result + 'start ' + regstart;
- if reganch <> #0
- then Result := Result + 'anchored ';
- if regmust <> nil
- then Result := Result + 'must have ' + regmust;
- {$IFDEF UseFirstCharSet} //###0.929
- Result := Result + #$d#$a'FirstCharSet:';
- for Ch := #0 to #255 do
- if Ch in FirstCharSet
- then begin
- if Ch < ' '
- then Result := Result + '#' + IntToStr(Ord(Ch)) //###0.948
- else Result := Result + Ch;
- end;
- {$ENDIF}
- Result := Result + #$d#$a;
- end; { of function TRegExpr.Dump
- --------------------------------------------------------------}
- {$ENDIF}
- {$IFDEF reRealExceptionAddr}
- {$OPTIMIZATION ON}
- // ReturnAddr works correctly only if compiler optimization is ON
- // I placed this method at very end of unit because there are no
- // way to restore compiler optimization flag ...
- {$ENDIF}
- procedure TRegExpr.Error (AErrorID : integer);
- {$IFDEF reRealExceptionAddr}
- function ReturnAddr : pointer; //###0.938
- asm
- mov eax,[ebp+4]
- end;
- {$ENDIF}
- var
- e : ERegExpr;
- begin
- fLastError := AErrorID; // dummy stub - useless because will raise exception
- if AErrorID < 1000 // compilation error ?
- then e := ERegExpr.Create (ErrorMsg (AErrorID) // yes - show error pos
- + ' (pos ' + IntToStr (CompilerErrorPos) + ')')
- else e := ERegExpr.Create (ErrorMsg (AErrorID));
- e.ErrorCode := AErrorID;
- e.CompilerErrorPos := CompilerErrorPos;
- raise e
- {$IFDEF reRealExceptionAddr}
- At ReturnAddr; //###0.938
- {$ENDIF}
- end; { of procedure TRegExpr.Error
- --------------------------------------------------------------}
- (*
- PCode persistence:
- FirstCharSet
- programm, regsize
- regstart // -> programm
- reganch // -> programm
- regmust, regmlen // -> programm
- fExprIsCompiled
- *)
- // be carefull - placed here code will be always compiled with
- // compiler optimization flag
- {$IFDEF FPC}
- initialization
- RegExprInvertCaseFunction := TRegExpr.InvertCaseFunction;
- {$ENDIF}
- end.
|