12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
74278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477
74778477947804781478247834784478547864787478847894790479147924793479447954796479747984799480048014802480348044805480648074808480948104811481248134814481548164817481848194820482148224823482448254826482748284829483048314832483348344835483648374838483948404841484248434844484548464847484848494850485148524853485448554856485748584859486048614862486348644865486648674868486948704871487248734874487548764877487848794880488148824883488448854886488748884889489048914892489348944895489648974898489949004901490249034904490549064907490849094910491149124913491449154916491749184919492049214922492349244925492649274928492949304931493249334934493549364937493849394940 |
- /*************************************************
- * Perl-Compatible Regular Expressions *
- *************************************************/
- /* PCRE is a library of functions to support regular expressions whose syntax
- and semantics are as close as possible to those of the Perl 5 language.
- Written by Philip Hazel
- Copyright (c) 1997-2008 University of Cambridge
- -----------------------------------------------------------------------------
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- * Redistributions of source code must retain the above copyright notice,
- this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- * Neither the name of the University of Cambridge nor the names of its
- contributors may be used to endorse or promote products derived from
- this software without specific prior written permission.
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- POSSIBILITY OF SUCH DAMAGE.
- -----------------------------------------------------------------------------
- */
- /* This module contains pcre_exec(), the externally visible function that does
- pattern matching using an NFA algorithm, trying to mimic Perl as closely as
- possible. There are also some static supporting functions. */
- #ifdef HAVE_CONFIG_H
- #include "config.h"
- #endif
- #define NLBLOCK md /* Block containing newline information */
- #define PSSTART start_subject /* Field containing processed string start */
- #define PSEND end_subject /* Field containing processed string end */
- #include "pcre_internal.h"
- /* Undefine some potentially clashing cpp symbols */
- #undef min
- #undef max
- /* Flag bits for the match() function. Each value is a distinct bit, and they
- are passed in the "flags" argument of match(). */
- #define match_condassert 0x01 /* Called to check a condition assertion */
- #define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */
- /* Non-error returns from the match() function. Error returns are externally
- defined PCRE_ERROR_xxx codes, which are all negative. Both of the values
- defined here are therefore >= 0. */
- #define MATCH_MATCH 1
- #define MATCH_NOMATCH 0
- /* Special internal returns from the match() function. Make them sufficiently
- negative to avoid the external error codes. NOTE(review): the names suggest
- that these signal the (*COMMIT), (*PRUNE), (*SKIP) and (*THEN) backtracking
- verbs - confirm against the code that returns and tests them. */
- #define MATCH_COMMIT (-999)
- #define MATCH_PRUNE (-998)
- #define MATCH_SKIP (-997)
- #define MATCH_THEN (-996)
- /* Maximum number of ints of offset to save on the stack for recursive calls.
- If the offset vector is bigger, malloc is used. This should be a multiple of 3,
- because the offset vector is always a multiple of 3 long. The same value gives
- the size of the Xstacksave[] array in the heap frame that is used when
- NO_RECURSE is defined. */
- #define REC_STACK_SAVE_MAX 30
- /* Min and max values for the common repeats; for the maxima, 0 => infinity.
- The two tables are parallel: entry i of rep_min and entry i of rep_max describe
- the same kind of repeat. NOTE(review): the six entries look like *, *?, +, +?,
- ? and ?? in that order - confirm against the expression that indexes them. */
- static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
- static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
- #ifdef DEBUG
- /*************************************************
- * Debugging function to print chars *
- *************************************************/
- /* Write a run of characters to stdout in readable form. A printable character
- is output as itself; anything else is output as a \xhh escape. When the data
- lies within the subject string, the count is reduced if necessary so that
- printing stops at md->end_subject.
- Arguments:
-   p           points to the first character to print
-   length      number of characters to print
-   is_subject  TRUE if printing from within md->start_subject
-   md          pointer to matching data block; consulted only if is_subject
-                 is TRUE
- Returns:      nothing
- */
- static void
- pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
- {
-   unsigned int ch;
-   /* Clip the count to what is left of the subject */
-   if (is_subject)
-     {
-     if (md->end_subject - p < length) length = md->end_subject - p;
-     }
-   for (; length > 0; length--)
-     {
-     ch = *p++;
-     if (isprint(ch))
-       printf("%c", ch);
-     else
-       printf("\\x%02x", ch);
-     }
- }
- #endif
- /*************************************************
- * Match a back-reference *
- *************************************************/
- /* Compare a stretch of the subject with the text that a back reference refers
- to. The referenced text starts at md->start_subject plus the value held in
- md->offset_vector[offset]. If a back reference hasn't been set, the length that
- is passed is greater than the number of characters left in the string, so the
- match fails.
- Arguments:
-   offset      index into the offset vector
-   eptr        points into the subject
-   length      length to be matched
-   md          points to match data block
-   ims         the ims flags; if PCRE_CASELESS is set, characters are compared
-                 after mapping through md->lcc
- Returns:      TRUE if matched
- */
- static BOOL
- match_ref(int offset, register USPTR eptr, int length, match_data *md,
-   unsigned long int ims)
- {
-   USPTR ref = md->start_subject + md->offset_vector[offset];
- #ifdef DEBUG
-   if (eptr < md->end_subject)
-     {
-     printf("matching subject ");
-     pchars(eptr, length, TRUE, md);
-     }
-   else
-     printf("matching subject <null>");
-   printf(" against backref ");
-   pchars(ref, length, FALSE, md);
-   printf("\n");
- #endif
-   /* There must be at least "length" characters left in the subject */
-   if (md->end_subject - eptr < length) return FALSE;
-   /* The caseful and caseless comparisons are kept as separate loops for speed */
-   if ((ims & PCRE_CASELESS) == 0)
-     {
-     for (; length > 0; length--)
-       if (*ref++ != *eptr++) return FALSE;
-     }
-   else
-     {
-     for (; length > 0; length--)
-       if (md->lcc[*ref++] != md->lcc[*eptr++]) return FALSE;
-     }
-   return TRUE;
- }
- /***************************************************************************
- ****************************************************************************
- RECURSION IN THE match() FUNCTION
- The match() function is highly recursive, though not every recursive call
- increases the recursive depth. Nevertheless, some regular expressions can cause
- it to recurse to a great depth. I was writing for Unix, so I just let it call
- itself recursively. This uses the stack for saving everything that has to be
- saved for a recursive call. On Unix, the stack can be large, and this works
- fine.
- It turns out that on some non-Unix-like systems there are problems with
- programs that use a lot of stack. (This despite the fact that every last chip
- has oodles of memory these days, and techniques for extending the stack have
- been known for decades.) So....
- There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
- calls by keeping local variables that need to be preserved in blocks of memory
- obtained from malloc() instead of on the stack. Macros are used to
- achieve this so that the actual code doesn't look very different to what it
- always used to.
- The original heap-recursive code used longjmp(). However, it seems that this
- can be very slow on some operating systems. Following a suggestion from Stan
- Switzer, the use of longjmp() has been abolished, at the cost of having to
- provide a unique number for each call to RMATCH. There is no way of generating
- a sequence of numbers at compile time in C. I have given them names, to make
- them stand out more clearly.
- Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
- FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
- tests. Furthermore, not using longjmp() means that local dynamic variables
- don't have indeterminate values; this has meant that the frame size can be
- reduced because the result can be "passed back" by straight setting of the
- variable instead of being passed in the frame.
- ****************************************************************************
- ***************************************************************************/
- /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
- below must be updated in sync. Each name is given as the "rw" argument of one
- RMATCH call. In the NO_RECURSE version of RMATCH the value is stored in the
- Xwhere field of the calling frame, and the name itself is pasted into a label
- (L_RM1, L_RM2, ...) that marks where that call resumes. Numbering starts at 1. */
- enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
- RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
- RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
- RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
- RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
- RM51, RM52, RM53, RM54 };
- /* These versions of the macros use the stack, as normal. There are debugging
- versions and production versions. Note that the "rw" argument of RMATCH isn't
- actually used in this definition. RMATCH makes a genuine recursive call to
- match(), supplying the mstart value that is in scope at the point of use and a
- depth of rdepth+1, and it leaves the result in the rrc variable of the calling
- code. RRETURN is an ordinary return. The DEBUG versions also report each call
- and each return using printf(); note that the DEBUG version of RRETURN expands
- its argument twice, so the argument should not have side effects. */
- #ifndef NO_RECURSE
- #define REGISTER register
- #ifdef DEBUG
- #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
- { \
- printf("match() called in line %d\n", __LINE__); \
- rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1); \
- printf("to line %d\n", __LINE__); \
- }
- #define RRETURN(ra) \
- { \
- printf("match() returned %d from line %d ", ra, __LINE__); \
- return ra; \
- }
- #else
- #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
- rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1)
- #define RRETURN(ra) return ra
- #endif
- #else
- /* These versions of the macros manage a private stack on the heap. Note that
- the "rd" argument of RMATCH isn't actually used in this definition. It's the md
- argument of match(), which never changes.
- RMATCH obtains a new frame from pcre_stack_malloc(), records in the current
- frame (Xwhere) which call is being made, copies the arguments into the new
- frame, chains it to the current one, and jumps to HEAP_RECURSE. The label
- L_<rw> that it defines is where that particular call resumes afterwards.
- If the new frame cannot be obtained, RMATCH must not touch it. It uses RRETURN
- to abandon the current frame with PCRE_ERROR_NOMEMORY, which is negative and so
- is passed on outwards by every enclosing level, like any other error return.
- RRETURN frees the current frame and makes its predecessor current. If there is
- a predecessor, the result is left in rrc and control goes to HEAP_RETURN;
- otherwise the top level has finished and the value is returned to the caller of
- match(). */
- #define REGISTER
- #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
-   {\
-   heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
-   if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
-   frame->Xwhere = rw; \
-   newframe->Xeptr = ra;\
-   newframe->Xecode = rb;\
-   newframe->Xmstart = mstart;\
-   newframe->Xoffset_top = rc;\
-   newframe->Xims = re;\
-   newframe->Xeptrb = rf;\
-   newframe->Xflags = rg;\
-   newframe->Xrdepth = frame->Xrdepth + 1;\
-   newframe->Xprevframe = frame;\
-   frame = newframe;\
-   DPRINTF(("restarting from line %d\n", __LINE__));\
-   goto HEAP_RECURSE;\
-   L_##rw:\
-   DPRINTF(("jumped back to line %d\n", __LINE__));\
-   }
- #define RRETURN(ra)\
-   {\
-   heapframe *newframe = frame;\
-   frame = newframe->Xprevframe;\
-   (pcre_stack_free)(newframe);\
-   if (frame != NULL)\
-     {\
-     rrc = ra;\
-     goto HEAP_RETURN;\
-     }\
-   return ra;\
-   }
- /* Structure for remembering the local variables in a private frame. One of
- these is obtained from the heap for each level of simulated recursion when
- NO_RECURSE is defined. The fields are reached through macros that give them the
- names of the arguments and local variables of match(). */
- typedef struct heapframe {
-   struct heapframe *Xprevframe;    /* Calling frame; NULL marks the top level */
-   /* Function arguments that may change. Xims has the same type as the ims
-   argument of match() and as Xoriginal_ims, so that the flag bits are handled
-   identically whether or not NO_RECURSE is defined. */
-   const uschar *Xeptr;
-   const uschar *Xecode;
-   const uschar *Xmstart;
-   int Xoffset_top;
-   unsigned long int Xims;
-   eptrblock *Xeptrb;
-   int Xflags;
-   unsigned int Xrdepth;
-   /* Function local variables */
-   const uschar *Xcallpat;
-   const uschar *Xcharptr;
-   const uschar *Xdata;
-   const uschar *Xnext;
-   const uschar *Xpp;
-   const uschar *Xprev;
-   const uschar *Xsaved_eptr;
-   recursion_info Xnew_recursive;
-   BOOL Xcur_is_word;
-   BOOL Xcondition;
-   BOOL Xprev_is_word;
-   unsigned long int Xoriginal_ims;
- #ifdef SUPPORT_UCP
-   int Xprop_type;
-   int Xprop_value;
-   int Xprop_fail_result;
-   int Xprop_category;
-   int Xprop_chartype;
-   int Xprop_script;
-   int Xoclength;
-   uschar Xocchars[8];
- #endif
-   int Xctype;
-   unsigned int Xfc;
-   int Xfi;
-   int Xlength;
-   int Xmax;
-   int Xmin;
-   int Xnumber;
-   int Xoffset;
-   int Xop;
-   int Xsave_capture_last;
-   int Xsave_offset1, Xsave_offset2, Xsave_offset3;
-   int Xstacksave[REC_STACK_SAVE_MAX];
-   eptrblock Xnewptrb;
-   /* Where to jump back to: the RMn number of the RMATCH call that this frame
-   has in progress */
-   int Xwhere;
- } heapframe;
- #endif
- /***************************************************************************
- ***************************************************************************/
- /*************************************************
- * Match from current position *
- *************************************************/
- /* This function is called recursively in many circumstances. Whenever it
- returns a negative (error) response, the outer incarnation must also return the
- same response.
- Performance note: It might be tempting to extract commonly used fields from the
- md structure (e.g. utf8, end_subject) into individual variables to improve
- performance. Tests using gcc on a SPARC disproved this; in the first case, it
- made performance worse.
- Arguments:
- eptr pointer to current character in subject
- ecode pointer to current position in compiled code
- mstart pointer to the current match start position (can be modified
- by encountering \K)
- offset_top current top pointer
- md pointer to "static" info for the match
- ims current /i, /m, and /s options
- eptrb pointer to chain of blocks containing eptr at start of
- brackets - for testing for empty matches
- flags can contain
- match_condassert - this is an assertion condition
- match_cbegroup - this is the start of an unlimited repeat
- group that can match an empty string
- rdepth the recursion depth
- Returns: MATCH_MATCH if matched ) these values are >= 0
- MATCH_NOMATCH if failed to match )
- a negative PCRE_ERROR_xxx value if aborted by an error condition
- (e.g. stopped by repeated call or recursion limit)
- */
- static int
- match(REGISTER USPTR eptr, REGISTER const uschar *ecode, const uschar *mstart,
- int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
- int flags, unsigned int rdepth)
- {
- /* These variables do not need to be preserved over recursion in this function,
- so they can be ordinary variables in all cases. Mark some of them with
- "register" because they are used a lot in loops. */
- register int rrc; /* Returns from recursive calls */
- register int i; /* Used for loops not involving calls to RMATCH() */
- register unsigned int c; /* Character values not kept over RMATCH() calls */
- register BOOL utf8; /* Local copy of UTF-8 flag for speed */
- BOOL minimize, possessive; /* Quantifier options */
- /* When recursion is not being used, all "local" variables that have to be
- preserved over calls to RMATCH() are part of a "frame" which is obtained from
- heap storage. Set up the top-level frame here; others are obtained from the
- heap whenever RMATCH() does a "recursion". See the macro definitions above. */
- #ifdef NO_RECURSE
- heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
- frame->Xprevframe = NULL; /* Marks the top level */
- /* Copy in the original argument variables */
- frame->Xeptr = eptr;
- frame->Xecode = ecode;
- frame->Xmstart = mstart;
- frame->Xoffset_top = offset_top;
- frame->Xims = ims;
- frame->Xeptrb = eptrb;
- frame->Xflags = flags;
- frame->Xrdepth = rdepth;
- /* This is where control jumps back to to effect "recursion" */
- HEAP_RECURSE:
- /* Macros make the argument variables come from the current frame */
- #define eptr frame->Xeptr
- #define ecode frame->Xecode
- #define mstart frame->Xmstart
- #define offset_top frame->Xoffset_top
- #define ims frame->Xims
- #define eptrb frame->Xeptrb
- #define flags frame->Xflags
- #define rdepth frame->Xrdepth
- /* Ditto for the local variables */
- #ifdef SUPPORT_UTF8
- #define charptr frame->Xcharptr
- #endif
- #define callpat frame->Xcallpat
- #define data frame->Xdata
- #define next frame->Xnext
- #define pp frame->Xpp
- #define prev frame->Xprev
- #define saved_eptr frame->Xsaved_eptr
- #define new_recursive frame->Xnew_recursive
- #define cur_is_word frame->Xcur_is_word
- #define condition frame->Xcondition
- #define prev_is_word frame->Xprev_is_word
- #define original_ims frame->Xoriginal_ims
- #ifdef SUPPORT_UCP
- #define prop_type frame->Xprop_type
- #define prop_value frame->Xprop_value
- #define prop_fail_result frame->Xprop_fail_result
- #define prop_category frame->Xprop_category
- #define prop_chartype frame->Xprop_chartype
- #define prop_script frame->Xprop_script
- #define oclength frame->Xoclength
- #define occhars frame->Xocchars
- #endif
- #define ctype frame->Xctype
- #define fc frame->Xfc
- #define fi frame->Xfi
- #define length frame->Xlength
- #define max frame->Xmax
- #define min frame->Xmin
- #define number frame->Xnumber
- #define offset frame->Xoffset
- #define op frame->Xop
- #define save_capture_last frame->Xsave_capture_last
- #define save_offset1 frame->Xsave_offset1
- #define save_offset2 frame->Xsave_offset2
- #define save_offset3 frame->Xsave_offset3
- #define stacksave frame->Xstacksave
- #define newptrb frame->Xnewptrb
- /* When recursion is being used, local variables are allocated on the stack and
- get preserved during recursion in the normal way. In this environment, fi and
- i, and fc and c, can be the same variables. */
- #else /* NO_RECURSE not defined */
- #define fi i
- #define fc c
- #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
- const uschar *charptr; /* in small blocks of the code. My normal */
- #endif /* style of coding would have declared */
- const uschar *callpat; /* them within each of those blocks. */
- const uschar *data; /* However, in order to accommodate the */
- const uschar *next; /* version of this code that uses an */
- USPTR pp; /* external "stack" implemented on the */
- const uschar *prev; /* heap, it is easier to declare them all */
- USPTR saved_eptr; /* here, so the declarations can be cut */
- /* out in a block. The only declarations */
- recursion_info new_recursive; /* within blocks below are for variables */
- /* that do not have to be preserved over */
- BOOL cur_is_word; /* a recursive call to RMATCH(). */
- BOOL condition;
- BOOL prev_is_word;
- unsigned long int original_ims;
- #ifdef SUPPORT_UCP
- int prop_type;
- int prop_value;
- int prop_fail_result;
- int prop_category;
- int prop_chartype;
- int prop_script;
- int oclength;
- uschar occhars[8];
- #endif
- int ctype;
- int length;
- int max;
- int min;
- int number;
- int offset;
- int op;
- int save_capture_last;
- int save_offset1, save_offset2, save_offset3;
- int stacksave[REC_STACK_SAVE_MAX];
- eptrblock newptrb;
- #endif /* NO_RECURSE */
- /* These statements are here to stop the compiler complaining about
- uninitialized variables. */
- #ifdef SUPPORT_UCP
- prop_value = 0;
- prop_fail_result = 0;
- #endif
- /* This label is used for tail recursion, which is used in a few cases even
- when NO_RECURSE is not defined, in order to reduce the amount of stack that is
- used. Thanks to Ian Taylor for noticing this possibility and sending the
- original patch. */
- TAIL_RECURSE:
- /* OK, now we can get on with the real code of the function. Recursive calls
- are specified by the macro RMATCH and RRETURN is used to return. When
- NO_RECURSE is *not* defined, these just turn into a recursive call to match()
- and a "return", respectively (possibly with some debugging if DEBUG is
- defined). However, RMATCH isn't like a function call because it's quite a
- complicated macro. It has to be used in one particular way. This shouldn't,
- however, impact performance when true recursion is being used. */
- #ifdef SUPPORT_UTF8
- utf8 = md->utf8; /* Local copy of the flag */
- #else
- utf8 = FALSE;
- #endif
- /* First check that we haven't called match() too many times, or that we
- haven't exceeded the recursive call limit. */
- if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
- if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
- original_ims = ims; /* Save for resetting on ')' */
- /* At the start of a group with an unlimited repeat that may match an empty
- string, the match_cbegroup flag is set. When this is the case, add the current
- subject pointer to the chain of such remembered pointers, to be checked when we
- hit the closing ket, in order to break infinite loops that match no characters.
- When match() is called in other circumstances, don't add to the chain. The
- match_cbegroup flag must NOT be used with tail recursion, because the memory
- block that is used is on the stack, so a new one may be required for each
- match(). */
- if ((flags & match_cbegroup) != 0)
- {
- newptrb.epb_saved_eptr = eptr;
- newptrb.epb_prev = eptrb;
- eptrb = &newptrb;
- }
- /* Now start processing the opcodes. */
- for (;;)
- {
- minimize = possessive = FALSE;
- op = *ecode;
- /* For partial matching, remember if we ever hit the end of the subject after
- matching at least one subject character. */
- if (md->partial &&
- eptr >= md->end_subject &&
- eptr > mstart)
- md->hitend = TRUE;
- switch(op)
- {
- case OP_FAIL:
- RRETURN(MATCH_NOMATCH);
- case OP_PRUNE:
- RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
- ims, eptrb, flags, RM51);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- RRETURN(MATCH_PRUNE);
- case OP_COMMIT:
- RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
- ims, eptrb, flags, RM52);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- RRETURN(MATCH_COMMIT);
- case OP_SKIP:
- RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
- ims, eptrb, flags, RM53);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- md->start_match_ptr = eptr; /* Pass back current position */
- RRETURN(MATCH_SKIP);
- case OP_THEN:
- RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
- ims, eptrb, flags, RM54);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- RRETURN(MATCH_THEN);
- /* Handle a capturing bracket. If there is space in the offset vector, save
- the current subject position in the working slot at the top of the vector.
- We mustn't change the current values of the data slot, because they may be
- set from a previous iteration of this group, and be referred to by a
- reference inside the group.
- If the bracket fails to match, we need to restore this value and also the
- values of the final offsets, in case they were set by a previous iteration
- of the same bracket.
- If there isn't enough space in the offset vector, treat this as if it were
- a non-capturing bracket. Don't worry about setting the flag for the error
- case here; that is handled in the code for KET. */
- case OP_CBRA:
- case OP_SCBRA:
- number = GET2(ecode, 1+LINK_SIZE);
- offset = number << 1;
- #ifdef DEBUG
- printf("start bracket %d\n", number);
- printf("subject=");
- pchars(eptr, 16, TRUE, md);
- printf("\n");
- #endif
- if (offset < md->offset_max)
- {
- save_offset1 = md->offset_vector[offset];
- save_offset2 = md->offset_vector[offset+1];
- save_offset3 = md->offset_vector[md->offset_end - number];
- save_capture_last = md->capture_last;
- DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
- md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
- flags = (op == OP_SCBRA)? match_cbegroup : 0;
- do
- {
- RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
- ims, eptrb, flags, RM1);
- if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
- md->capture_last = save_capture_last;
- ecode += GET(ecode, 1);
- }
- while (*ecode == OP_ALT);
- DPRINTF(("bracket %d failed\n", number));
- md->offset_vector[offset] = save_offset1;
- md->offset_vector[offset+1] = save_offset2;
- md->offset_vector[md->offset_end - number] = save_offset3;
- RRETURN(MATCH_NOMATCH);
- }
- /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
- as a non-capturing bracket. */
- /* VVVVVVVVVVVVVVVVVVVVVVVVV */
- /* VVVVVVVVVVVVVVVVVVVVVVVVV */
- DPRINTF(("insufficient capture room: treat as non-capturing\n"));
- /* VVVVVVVVVVVVVVVVVVVVVVVVV */
- /* VVVVVVVVVVVVVVVVVVVVVVVVV */
- /* Non-capturing bracket. Loop for all the alternatives. When we get to the
- final alternative within the brackets, we would return the result of a
- recursive call to match() whatever happened. We can reduce stack usage by
- turning this into a tail recursion, except in the case when match_cbegroup
- is set.*/
- case OP_BRA:
- case OP_SBRA:
- DPRINTF(("start non-capturing bracket\n"));
- flags = (op >= OP_SBRA)? match_cbegroup : 0;
- for (;;)
- {
- if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
- {
- if (flags == 0) /* Not a possibly empty group */
- {
- ecode += _pcre_OP_lengths[*ecode];
- DPRINTF(("bracket 0 tail recursion\n"));
- goto TAIL_RECURSE;
- }
- /* Possibly empty group; can't use tail recursion. */
- RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
- eptrb, flags, RM48);
- RRETURN(rrc);
- }
- /* For non-final alternatives, continue the loop for a NOMATCH result;
- otherwise return. */
- RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
- eptrb, flags, RM2);
- if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
- ecode += GET(ecode, 1);
- }
- /* Control never reaches here. */
- /* Conditional group: compilation checked that there are no more than
- two branches. If the condition is false, skipping the first branch takes us
- past the end if there is only one branch, but that's OK because that is
- exactly what going to the ket would do. As there is only one branch to be
- obeyed, we can use tail recursion to avoid using another stack frame. */
- case OP_COND:
- case OP_SCOND:
- if (ecode[LINK_SIZE+1] == OP_RREF) /* Recursion test */
- {
- offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
- condition = md->recursive != NULL &&
- (offset == RREF_ANY || offset == md->recursive->group_num);
- ecode += condition? 3 : GET(ecode, 1);
- }
- else if (ecode[LINK_SIZE+1] == OP_CREF) /* Group used test */
- {
- offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
- condition = offset < offset_top && md->offset_vector[offset] >= 0;
- ecode += condition? 3 : GET(ecode, 1);
- }
- else if (ecode[LINK_SIZE+1] == OP_DEF) /* DEFINE - always false */
- {
- condition = FALSE;
- ecode += GET(ecode, 1);
- }
- /* The condition is an assertion. Call match() to evaluate it - setting
- the final argument match_condassert causes it to stop at the end of an
- assertion. */
- else
- {
- RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
- match_condassert, RM3);
- if (rrc == MATCH_MATCH)
- {
- condition = TRUE;
- ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
- while (*ecode == OP_ALT) ecode += GET(ecode, 1);
- }
- else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
- {
- RRETURN(rrc); /* Need braces because of following else */
- }
- else
- {
- condition = FALSE;
- ecode += GET(ecode, 1);
- }
- }
- /* We are now at the branch that is to be obeyed. As there is only one,
- we can use tail recursion to avoid using another stack frame, except when
- match_cbegroup is required for an unlimited repeat of a possibly empty
- group. If the second alternative doesn't exist, we can just plough on. */
- if (condition || *ecode == OP_ALT)
- {
- ecode += 1 + LINK_SIZE;
- if (op == OP_SCOND) /* Possibly empty group */
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
- RRETURN(rrc);
- }
- else /* Group must match something */
- {
- flags = 0;
- goto TAIL_RECURSE;
- }
- }
- else /* Condition false & no 2nd alternative */
- {
- ecode += 1 + LINK_SIZE;
- }
- break;
- /* End of the pattern, either real or forced. If we are in a top-level
- recursion, we should restore the offsets appropriately and continue from
- after the call. */
- case OP_ACCEPT:
- case OP_END:
- if (md->recursive != NULL && md->recursive->group_num == 0)
- {
- recursion_info *rec = md->recursive;
- DPRINTF(("End of pattern in a (?0) recursion\n"));
- md->recursive = rec->prevrec;
- memmove(md->offset_vector, rec->offset_save,
- rec->saved_max * sizeof(int));
- mstart = rec->save_start;
- ims = original_ims;
- ecode = rec->after_call;
- break;
- }
- /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
- string - backtracking will then try other alternatives, if any. */
- if (md->notempty && eptr == mstart) RRETURN(MATCH_NOMATCH);
- md->end_match_ptr = eptr; /* Record where we ended */
- md->end_offset_top = offset_top; /* and how many extracts were taken */
- md->start_match_ptr = mstart; /* and the start (\K can modify) */
- RRETURN(MATCH_MATCH);
- /* Change option settings */
- case OP_OPT:
- ims = ecode[1];
- ecode += 2;
- DPRINTF(("ims set to %02lx\n", ims));
- break;
- /* Assertion brackets. Check the alternative branches in turn - the
- matching won't pass the KET for an assertion. If any one branch matches,
- the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
- start of each branch to move the current point backwards, so the code at
- this level is identical to the lookahead case. */
- case OP_ASSERT:
- case OP_ASSERTBACK:
- do
- {
- RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
- RM4);
- if (rrc == MATCH_MATCH) break;
- if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
- ecode += GET(ecode, 1);
- }
- while (*ecode == OP_ALT);
- if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
- /* If checking an assertion for a condition, return MATCH_MATCH. */
- if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
- /* Continue from after the assertion, updating the offsets high water
- mark, since extracts may have been taken during the assertion. */
- do ecode += GET(ecode,1); while (*ecode == OP_ALT);
- ecode += 1 + LINK_SIZE;
- offset_top = md->end_offset_top;
- continue;
- /* Negative assertion: all branches must fail to match */
- case OP_ASSERT_NOT:
- case OP_ASSERTBACK_NOT:
- do
- {
- RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
- RM5);
- if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
- if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
- ecode += GET(ecode,1);
- }
- while (*ecode == OP_ALT);
- if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
- ecode += 1 + LINK_SIZE;
- continue;
- /* Move the subject pointer back. This occurs only at the start of
- each branch of a lookbehind assertion. If we are too close to the start to
- move back, this match function fails. When working with UTF-8 we move
- back a number of characters, not bytes. */
- case OP_REVERSE:
- #ifdef SUPPORT_UTF8
- if (utf8)
- {
- i = GET(ecode, 1);
- while (i-- > 0)
- {
- eptr--;
- if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
- BACKCHAR(eptr);
- }
- }
- else
- #endif
- /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
- {
- eptr -= GET(ecode, 1);
- if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
- }
- /* Skip to next op code */
- ecode += 1 + LINK_SIZE;
- break;
- /* The callout item calls an external function, if one is provided, passing
- details of the match so far. This is mainly for debugging, though the
- function is able to force a failure. */
- case OP_CALLOUT:
- if (pcre_callout != NULL)
- {
- pcre_callout_block cb;
- cb.version = 1; /* Version 1 of the callout block */
- cb.callout_number = ecode[1];
- cb.offset_vector = md->offset_vector;
- cb.subject = (PCRE_SPTR)md->start_subject;
- cb.subject_length = md->end_subject - md->start_subject;
- cb.start_match = mstart - md->start_subject;
- cb.current_position = eptr - md->start_subject;
- cb.pattern_position = GET(ecode, 2);
- cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
- cb.capture_top = offset_top/2;
- cb.capture_last = md->capture_last;
- cb.callout_data = md->callout_data;
- if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
- if (rrc < 0) RRETURN(rrc);
- }
- ecode += 2 + 2*LINK_SIZE;
- break;
- /* Recursion either matches the current regex, or some subexpression. The
- offset data is the offset to the starting bracket from the start of the
- whole pattern. (This is so that it works from duplicated subpatterns.)
- If there are any capturing brackets started but not finished, we have to
- save their starting points and reinstate them after the recursion. However,
- we don't know how many such there are (offset_top records the completed
- total) so we just have to save all the potential data. There may be up to
- 65535 such values, which is too large to put on the stack, but using malloc
- for small numbers seems expensive. As a compromise, the stack is used when
- there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
- is used. If the malloc fails, the match is abandoned by returning
- PCRE_ERROR_NOMEMORY.
- There are also other values that have to be saved. We use a chained
- sequence of blocks that actually live on the stack. Thanks to Robin Houston
- for the original version of this logic. */
- case OP_RECURSE:
- {
- callpat = md->start_code + GET(ecode, 1);
- new_recursive.group_num = (callpat == md->start_code)? 0 :
- GET2(callpat, 1 + LINK_SIZE);
- /* Add to "recursing stack" */
- new_recursive.prevrec = md->recursive;
- md->recursive = &new_recursive;
- /* Find where to continue from afterwards */
- ecode += 1 + LINK_SIZE;
- new_recursive.after_call = ecode;
- /* Now save the offset data. */
- new_recursive.saved_max = md->offset_end;
- if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
- new_recursive.offset_save = stacksave;
- else
- {
- new_recursive.offset_save =
- (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
- if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
- }
- memcpy(new_recursive.offset_save, md->offset_vector,
- new_recursive.saved_max * sizeof(int));
- new_recursive.save_start = mstart;
- mstart = eptr;
- /* OK, now we can do the recursion. For each top-level alternative we
- restore the offset and recursion data. */
- DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
- flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
- do
- {
- RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
- md, ims, eptrb, flags, RM6);
- if (rrc == MATCH_MATCH)
- {
- DPRINTF(("Recursion matched\n"));
- md->recursive = new_recursive.prevrec;
- if (new_recursive.offset_save != stacksave)
- (pcre_free)(new_recursive.offset_save);
- RRETURN(MATCH_MATCH);
- }
- else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
- {
- DPRINTF(("Recursion gave error %d\n", rrc));
- RRETURN(rrc);
- }
- md->recursive = &new_recursive;
- memcpy(md->offset_vector, new_recursive.offset_save,
- new_recursive.saved_max * sizeof(int));
- callpat += GET(callpat, 1);
- }
- while (*callpat == OP_ALT);
- DPRINTF(("Recursion didn't match\n"));
- md->recursive = new_recursive.prevrec;
- if (new_recursive.offset_save != stacksave)
- (pcre_free)(new_recursive.offset_save);
- RRETURN(MATCH_NOMATCH);
- }
- /* Control never reaches here */
- /* "Once" brackets are like assertion brackets except that after a match,
- the point in the subject string is not moved back. Thus there can never be
- a move back into the brackets. Friedl calls these "atomic" subpatterns.
- Check the alternative branches in turn - the matching won't pass the KET
- for this kind of subpattern. If any one branch matches, we carry on as at
- the end of a normal bracket, leaving the subject pointer. */
- case OP_ONCE:
- prev = ecode;
- saved_eptr = eptr;
- do
- {
- RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
- if (rrc == MATCH_MATCH) break;
- if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
- ecode += GET(ecode,1);
- }
- while (*ecode == OP_ALT);
- /* If hit the end of the group (which could be repeated), fail */
- if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
- /* Continue as from after the assertion, updating the offsets high water
- mark, since extracts may have been taken. */
- do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
- offset_top = md->end_offset_top;
- eptr = md->end_match_ptr;
- /* For a non-repeating ket, just continue at this level. This also
- happens for a repeating ket if no characters were matched in the group.
- This is the forcible breaking of infinite loops as implemented in Perl
- 5.005. If there is an options reset, it will get obeyed in the normal
- course of events. */
- if (*ecode == OP_KET || eptr == saved_eptr)
- {
- ecode += 1+LINK_SIZE;
- break;
- }
- /* The repeating kets try the rest of the pattern or restart from the
- preceding bracket, in the appropriate order. The second "call" of match()
- uses tail recursion, to avoid using another stack frame. We need to reset
- any options that changed within the bracket before re-running it, so
- check the next opcode. */
- if (ecode[1+LINK_SIZE] == OP_OPT)
- {
- ims = (ims & ~PCRE_IMS) | ecode[4];
- DPRINTF(("ims set to %02lx at group repeat\n", ims));
- }
- if (*ecode == OP_KETRMIN)
- {
- RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- ecode = prev;
- flags = 0;
- goto TAIL_RECURSE;
- }
- else /* OP_KETRMAX */
- {
- RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- ecode += 1 + LINK_SIZE;
- flags = 0;
- goto TAIL_RECURSE;
- }
- /* Control never gets here */
- /* An alternation is the end of a branch; scan along to find the end of the
- bracketed group and go to there. */
- case OP_ALT:
- do ecode += GET(ecode,1); while (*ecode == OP_ALT);
- break;
- /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
- that it may occur zero times. It may repeat infinitely, or not at all -
- i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
- repeat limits are compiled as a number of copies, with the optional ones
- preceded by BRAZERO or BRAMINZERO. */
- case OP_BRAZERO:
- {
- next = ecode+1;
- RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- do next += GET(next,1); while (*next == OP_ALT);
- ecode = next + 1 + LINK_SIZE;
- }
- break;
- case OP_BRAMINZERO:
- {
- next = ecode+1;
- do next += GET(next, 1); while (*next == OP_ALT);
- RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- ecode++;
- }
- break;
- /* End of a group, repeated or non-repeating. */
- case OP_KET:
- case OP_KETRMIN:
- case OP_KETRMAX:
- prev = ecode - GET(ecode, 1);
- /* If this was a group that remembered the subject start, in order to break
- infinite repeats of empty string matches, retrieve the subject start from
- the chain. Otherwise, set it NULL. */
- if (*prev >= OP_SBRA)
- {
- saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
- eptrb = eptrb->epb_prev; /* Backup to previous group */
- }
- else saved_eptr = NULL;
- /* If we are at the end of an assertion group, stop matching and return
- MATCH_MATCH, but record the current high water mark for use by positive
- assertions. Do this also for the "once" (atomic) groups. */
- if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
- *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
- *prev == OP_ONCE)
- {
- md->end_match_ptr = eptr; /* For ONCE */
- md->end_offset_top = offset_top;
- RRETURN(MATCH_MATCH);
- }
- /* For capturing groups we have to check the group number back at the start
- and if necessary complete handling an extraction by setting the offsets and
- bumping the high water mark. Note that whole-pattern recursion is coded as
- a recurse into group 0, so it won't be picked up here. Instead, we catch it
- when the OP_END is reached. Other recursion is handled here. */
- if (*prev == OP_CBRA || *prev == OP_SCBRA)
- {
- number = GET2(prev, 1+LINK_SIZE);
- offset = number << 1;
- #ifdef DEBUG
- printf("end bracket %d", number);
- printf("\n");
- #endif
- md->capture_last = number;
- if (offset >= md->offset_max) md->offset_overflow = TRUE; else
- {
- md->offset_vector[offset] =
- md->offset_vector[md->offset_end - number];
- md->offset_vector[offset+1] = eptr - md->start_subject;
- if (offset_top <= offset) offset_top = offset + 2;
- }
- /* Handle a recursively called group. Restore the offsets
- appropriately and continue from after the call. */
- if (md->recursive != NULL && md->recursive->group_num == number)
- {
- recursion_info *rec = md->recursive;
- DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
- md->recursive = rec->prevrec;
- mstart = rec->save_start;
- memcpy(md->offset_vector, rec->offset_save,
- rec->saved_max * sizeof(int));
- ecode = rec->after_call;
- ims = original_ims;
- break;
- }
- }
- /* For both capturing and non-capturing groups, reset the value of the ims
- flags, in case they got changed during the group. */
- ims = original_ims;
- DPRINTF(("ims reset to %02lx\n", ims));
- /* For a non-repeating ket, just continue at this level. This also
- happens for a repeating ket if no characters were matched in the group.
- This is the forcible breaking of infinite loops as implemented in Perl
- 5.005. If there is an options reset, it will get obeyed in the normal
- course of events. */
- if (*ecode == OP_KET || eptr == saved_eptr)
- {
- ecode += 1 + LINK_SIZE;
- break;
- }
- /* The repeating kets try the rest of the pattern or restart from the
- preceding bracket, in the appropriate order. In the second case, we can use
- tail recursion to avoid using another stack frame, unless we have an
- unlimited repeat of a group that can match an empty string. */
- flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
- if (*ecode == OP_KETRMIN)
- {
- RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (flags != 0) /* Could match an empty string */
- {
- RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
- RRETURN(rrc);
- }
- ecode = prev;
- goto TAIL_RECURSE;
- }
- else /* OP_KETRMAX */
- {
- RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- ecode += 1 + LINK_SIZE;
- flags = 0;
- goto TAIL_RECURSE;
- }
- /* Control never gets here */
- /* Start of subject unless notbol, or after internal newline if multiline */
- case OP_CIRC:
- if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
- if ((ims & PCRE_MULTILINE) != 0)
- {
- if (eptr != md->start_subject &&
- (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
- RRETURN(MATCH_NOMATCH);
- ecode++;
- break;
- }
- /* ... else fall through */
- /* Start of subject assertion */
- case OP_SOD:
- if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
- ecode++;
- break;
- /* Start of match assertion */
- case OP_SOM:
- if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
- ecode++;
- break;
- /* Reset the start of match point */
- case OP_SET_SOM:
- mstart = eptr;
- ecode++;
- break;
- /* Assert before internal newline if multiline, or before a terminating
- newline unless endonly is set, else end of subject unless noteol is set. */
- case OP_DOLL:
- if ((ims & PCRE_MULTILINE) != 0)
- {
- if (eptr < md->end_subject)
- { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
- else
- { if (md->noteol) RRETURN(MATCH_NOMATCH); }
- ecode++;
- break;
- }
- else
- {
- if (md->noteol) RRETURN(MATCH_NOMATCH);
- if (!md->endonly)
- {
- if (eptr != md->end_subject &&
- (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
- RRETURN(MATCH_NOMATCH);
- ecode++;
- break;
- }
- }
- /* ... else fall through for endonly */
- /* End of subject assertion (\z) */
- case OP_EOD:
- if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
- ecode++;
- break;
- /* End of subject or ending \n assertion (\Z) */
- case OP_EODN:
- if (eptr != md->end_subject &&
- (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
- RRETURN(MATCH_NOMATCH);
- ecode++;
- break;
- /* Word boundary assertions */
- case OP_NOT_WORD_BOUNDARY:
- case OP_WORD_BOUNDARY:
- {
- /* Find out if the previous and current characters are "word" characters.
- It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
- be "non-word" characters. */
- #ifdef SUPPORT_UTF8
- if (utf8)
- {
- if (eptr == md->start_subject) prev_is_word = FALSE; else
- {
- const uschar *lastptr = eptr - 1;
- while((*lastptr & 0xc0) == 0x80) lastptr--;
- GETCHAR(c, lastptr);
- prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
- }
- if (eptr >= md->end_subject) cur_is_word = FALSE; else
- {
- GETCHAR(c, eptr);
- cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
- }
- }
- else
- #endif
- /* More streamlined when not in UTF-8 mode */
- {
- prev_is_word = (eptr != md->start_subject) &&
- ((md->ctypes[eptr[-1]] & ctype_word) != 0);
- cur_is_word = (eptr < md->end_subject) &&
- ((md->ctypes[*eptr] & ctype_word) != 0);
- }
- /* Now see if the situation is what we want */
- if ((*ecode++ == OP_WORD_BOUNDARY)?
- cur_is_word == prev_is_word : cur_is_word != prev_is_word)
- RRETURN(MATCH_NOMATCH);
- }
- break;
- /* Match a single character type; inline for speed */
- case OP_ANY:
- if ((ims & PCRE_DOTALL) == 0)
- {
- if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
- }
- if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
- if (utf8)
- while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
- ecode++;
- break;
- /* Match a single byte, even in UTF-8 mode. This opcode really does match
- any byte, even newline, independent of the setting of PCRE_DOTALL. */
- case OP_ANYBYTE:
- if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
- ecode++;
- break;
- case OP_NOT_DIGIT:
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINCTEST(c, eptr);
- if (
- #ifdef SUPPORT_UTF8
- c < 256 &&
- #endif
- (md->ctypes[c] & ctype_digit) != 0
- )
- RRETURN(MATCH_NOMATCH);
- ecode++;
- break;
- case OP_DIGIT:
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINCTEST(c, eptr);
- if (
- #ifdef SUPPORT_UTF8
- c >= 256 ||
- #endif
- (md->ctypes[c] & ctype_digit) == 0
- )
- RRETURN(MATCH_NOMATCH);
- ecode++;
- break;
- case OP_NOT_WHITESPACE:
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINCTEST(c, eptr);
- if (
- #ifdef SUPPORT_UTF8
- c < 256 &&
- #endif
- (md->ctypes[c] & ctype_space) != 0
- )
- RRETURN(MATCH_NOMATCH);
- ecode++;
- break;
- case OP_WHITESPACE:
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINCTEST(c, eptr);
- if (
- #ifdef SUPPORT_UTF8
- c >= 256 ||
- #endif
- (md->ctypes[c] & ctype_space) == 0
- )
- RRETURN(MATCH_NOMATCH);
- ecode++;
- break;
- case OP_NOT_WORDCHAR:
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINCTEST(c, eptr);
- if (
- #ifdef SUPPORT_UTF8
- c < 256 &&
- #endif
- (md->ctypes[c] & ctype_word) != 0
- )
- RRETURN(MATCH_NOMATCH);
- ecode++;
- break;
- case OP_WORDCHAR:
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINCTEST(c, eptr);
- if (
- #ifdef SUPPORT_UTF8
- c >= 256 ||
- #endif
- (md->ctypes[c] & ctype_word) == 0
- )
- RRETURN(MATCH_NOMATCH);
- ecode++;
- break;
- case OP_ANYNL:
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINCTEST(c, eptr);
- switch(c)
- {
- default: RRETURN(MATCH_NOMATCH);
- case 0x000d:
- if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
- break;
- case 0x000a:
- break;
- case 0x000b:
- case 0x000c:
- case 0x0085:
- case 0x2028:
- case 0x2029:
- if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
- break;
- }
- ecode++;
- break;
- case OP_NOT_HSPACE:
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINCTEST(c, eptr);
- switch(c)
- {
- default: break;
- case 0x09: /* HT */
- case 0x20: /* SPACE */
- case 0xa0: /* NBSP */
- case 0x1680: /* OGHAM SPACE MARK */
- case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
- case 0x2000: /* EN QUAD */
- case 0x2001: /* EM QUAD */
- case 0x2002: /* EN SPACE */
- case 0x2003: /* EM SPACE */
- case 0x2004: /* THREE-PER-EM SPACE */
- case 0x2005: /* FOUR-PER-EM SPACE */
- case 0x2006: /* SIX-PER-EM SPACE */
- case 0x2007: /* FIGURE SPACE */
- case 0x2008: /* PUNCTUATION SPACE */
- case 0x2009: /* THIN SPACE */
- case 0x200A: /* HAIR SPACE */
- case 0x202f: /* NARROW NO-BREAK SPACE */
- case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
- case 0x3000: /* IDEOGRAPHIC SPACE */
- RRETURN(MATCH_NOMATCH);
- }
- ecode++;
- break;
- case OP_HSPACE:
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINCTEST(c, eptr);
- switch(c)
- {
- default: RRETURN(MATCH_NOMATCH);
- case 0x09: /* HT */
- case 0x20: /* SPACE */
- case 0xa0: /* NBSP */
- case 0x1680: /* OGHAM SPACE MARK */
- case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
- case 0x2000: /* EN QUAD */
- case 0x2001: /* EM QUAD */
- case 0x2002: /* EN SPACE */
- case 0x2003: /* EM SPACE */
- case 0x2004: /* THREE-PER-EM SPACE */
- case 0x2005: /* FOUR-PER-EM SPACE */
- case 0x2006: /* SIX-PER-EM SPACE */
- case 0x2007: /* FIGURE SPACE */
- case 0x2008: /* PUNCTUATION SPACE */
- case 0x2009: /* THIN SPACE */
- case 0x200A: /* HAIR SPACE */
- case 0x202f: /* NARROW NO-BREAK SPACE */
- case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
- case 0x3000: /* IDEOGRAPHIC SPACE */
- break;
- }
- ecode++;
- break;
- case OP_NOT_VSPACE:
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINCTEST(c, eptr);
- switch(c)
- {
- default: break;
- case 0x0a: /* LF */
- case 0x0b: /* VT */
- case 0x0c: /* FF */
- case 0x0d: /* CR */
- case 0x85: /* NEL */
- case 0x2028: /* LINE SEPARATOR */
- case 0x2029: /* PARAGRAPH SEPARATOR */
- RRETURN(MATCH_NOMATCH);
- }
- ecode++;
- break;
- case OP_VSPACE:
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINCTEST(c, eptr);
- switch(c)
- {
- default: RRETURN(MATCH_NOMATCH);
- case 0x0a: /* LF */
- case 0x0b: /* VT */
- case 0x0c: /* FF */
- case 0x0d: /* CR */
- case 0x85: /* NEL */
- case 0x2028: /* LINE SEPARATOR */
- case 0x2029: /* PARAGRAPH SEPARATOR */
- break;
- }
- ecode++;
- break;
- #ifdef SUPPORT_UCP
- /* Check the next character by Unicode property. We will get here only
- if the support is in the binary; otherwise a compile-time error occurs. */
- case OP_PROP:
- case OP_NOTPROP:
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINCTEST(c, eptr);
- {
- int chartype, script;
- int category = _pcre_ucp_findprop(c, &chartype, &script);
- switch(ecode[1])
- {
- case PT_ANY:
- if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
- break;
- case PT_LAMP:
- if ((chartype == ucp_Lu ||
- chartype == ucp_Ll ||
- chartype == ucp_Lt) == (op == OP_NOTPROP))
- RRETURN(MATCH_NOMATCH);
- break;
- case PT_GC:
- if ((ecode[2] != category) == (op == OP_PROP))
- RRETURN(MATCH_NOMATCH);
- break;
- case PT_PC:
- if ((ecode[2] != chartype) == (op == OP_PROP))
- RRETURN(MATCH_NOMATCH);
- break;
- case PT_SC:
- if ((ecode[2] != script) == (op == OP_PROP))
- RRETURN(MATCH_NOMATCH);
- break;
- default:
- RRETURN(PCRE_ERROR_INTERNAL);
- }
- ecode += 3;
- }
- break;
- /* Match an extended Unicode sequence. We will get here only if the support
- is in the binary; otherwise a compile-time error occurs. */
- case OP_EXTUNI:
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINCTEST(c, eptr);
- {
- int chartype, script;
- int category = _pcre_ucp_findprop(c, &chartype, &script);
- if (category == ucp_M) RRETURN(MATCH_NOMATCH);
- while (eptr < md->end_subject)
- {
- int len = 1;
- if (!utf8) c = *eptr; else
- {
- GETCHARLEN(c, eptr, len);
- }
- category = _pcre_ucp_findprop(c, &chartype, &script);
- if (category != ucp_M) break;
- eptr += len;
- }
- }
- ecode++;
- break;
- #endif
- /* Match a back reference, possibly repeatedly. Look past the end of the
- item to see if there is repeat information following. The code is similar
- to that for character classes, but repeated for efficiency. Then obey
- similar code to character type repeats - written out again for speed.
- However, if the referenced string is the empty string, always treat
- it as matched, any number of times (otherwise there could be infinite
- loops). */
- case OP_REF:
- {
- offset = GET2(ecode, 1) << 1; /* Doubled ref number */
- ecode += 3; /* Advance past item */
- /* If the reference is unset, set the length to be longer than the amount
- of subject left; this ensures that every attempt at a match fails. We
- can't just fail here, because of the possibility of quantifiers with zero
- minima. */
- length = (offset >= offset_top || md->offset_vector[offset] < 0)?
- md->end_subject - eptr + 1 :
- md->offset_vector[offset+1] - md->offset_vector[offset];
- /* Set up for repetition, or handle the non-repeated case */
- switch (*ecode)
- {
- case OP_CRSTAR:
- case OP_CRMINSTAR:
- case OP_CRPLUS:
- case OP_CRMINPLUS:
- case OP_CRQUERY:
- case OP_CRMINQUERY:
- c = *ecode++ - OP_CRSTAR;
- minimize = (c & 1) != 0;
- min = rep_min[c]; /* Pick up values from tables; */
- max = rep_max[c]; /* zero for max => infinity */
- if (max == 0) max = INT_MAX;
- break;
- case OP_CRRANGE:
- case OP_CRMINRANGE:
- minimize = (*ecode == OP_CRMINRANGE);
- min = GET2(ecode, 1);
- max = GET2(ecode, 3);
- if (max == 0) max = INT_MAX;
- ecode += 5;
- break;
- default: /* No repeat follows */
- if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
- eptr += length;
- continue; /* With the main loop */
- }
- /* If the length of the reference is zero, just continue with the
- main loop. */
- if (length == 0) continue;
- /* First, ensure the minimum number of matches are present. We get back
- the length of the reference string explicitly rather than passing the
- address of eptr, so that eptr can be a register variable. */
- for (i = 1; i <= min; i++)
- {
- if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
- eptr += length;
- }
- /* If min = max, continue at the same level without recursion.
- They are not both allowed to be zero. */
- if (min == max) continue;
- /* If minimizing, keep trying and advancing the pointer */
- if (minimize)
- {
- for (fi = min;; fi++)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max || !match_ref(offset, eptr, length, md, ims))
- RRETURN(MATCH_NOMATCH);
- eptr += length;
- }
- /* Control never gets here */
- }
- /* If maximizing, find the longest string and work backwards */
- else
- {
- pp = eptr;
- for (i = min; i < max; i++)
- {
- if (!match_ref(offset, eptr, length, md, ims)) break;
- eptr += length;
- }
- while (eptr >= pp)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- eptr -= length;
- }
- RRETURN(MATCH_NOMATCH);
- }
- }
- /* Control never gets here */
- /* Match a bit-mapped character class, possibly repeatedly. This op code is
- used when all the characters in the class have values in the range 0-255,
- and either the matching is caseful, or the characters are in the range
- 0-127 when UTF-8 processing is enabled. The only difference between
- OP_CLASS and OP_NCLASS occurs when a data character outside the range is
- encountered.
- First, look past the end of the item to see if there is repeat information
- following. Then obey similar code to character type repeats - written out
- again for speed. */
- case OP_NCLASS:
- case OP_CLASS:
- {
- data = ecode + 1; /* Save for matching */
- ecode += 33; /* Advance past the item */
- switch (*ecode)
- {
- case OP_CRSTAR:
- case OP_CRMINSTAR:
- case OP_CRPLUS:
- case OP_CRMINPLUS:
- case OP_CRQUERY:
- case OP_CRMINQUERY:
- c = *ecode++ - OP_CRSTAR;
- minimize = (c & 1) != 0;
- min = rep_min[c]; /* Pick up values from tables; */
- max = rep_max[c]; /* zero for max => infinity */
- if (max == 0) max = INT_MAX;
- break;
- case OP_CRRANGE:
- case OP_CRMINRANGE:
- minimize = (*ecode == OP_CRMINRANGE);
- min = GET2(ecode, 1);
- max = GET2(ecode, 3);
- if (max == 0) max = INT_MAX;
- ecode += 5;
- break;
- default: /* No repeat follows */
- min = max = 1;
- break;
- }
- /* First, ensure the minimum number of matches are present. */
- #ifdef SUPPORT_UTF8
- /* UTF-8 mode */
- if (utf8)
- {
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINC(c, eptr);
- if (c > 255)
- {
- if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
- }
- else
- {
- if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
- }
- }
- }
- else
- #endif
- /* Not UTF-8 mode */
- {
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- c = *eptr++;
- if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
- }
- }
- /* If max == min we can continue with the main loop without the
- need to recurse. */
- if (min == max) continue;
- /* If minimizing, keep testing the rest of the expression and advancing
- the pointer while it matches the class. */
- if (minimize)
- {
- #ifdef SUPPORT_UTF8
- /* UTF-8 mode */
- if (utf8)
- {
- for (fi = min;; fi++)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINC(c, eptr);
- if (c > 255)
- {
- if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
- }
- else
- {
- if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
- }
- }
- }
- else
- #endif
- /* Not UTF-8 mode */
- {
- for (fi = min;; fi++)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- c = *eptr++;
- if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
- }
- }
- /* Control never gets here */
- }
- /* If maximizing, find the longest possible run, then work backwards. */
- else
- {
- pp = eptr;
- #ifdef SUPPORT_UTF8
- /* UTF-8 mode */
- if (utf8)
- {
- for (i = min; i < max; i++)
- {
- int len = 1;
- if (eptr >= md->end_subject) break;
- GETCHARLEN(c, eptr, len);
- if (c > 255)
- {
- if (op == OP_CLASS) break;
- }
- else
- {
- if ((data[c/8] & (1 << (c&7))) == 0) break;
- }
- eptr += len;
- }
- for (;;)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (eptr-- == pp) break; /* Stop if tried at original pos */
- BACKCHAR(eptr);
- }
- }
- else
- #endif
- /* Not UTF-8 mode */
- {
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject) break;
- c = *eptr;
- if ((data[c/8] & (1 << (c&7))) == 0) break;
- eptr++;
- }
- while (eptr >= pp)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- eptr--;
- }
- }
- RRETURN(MATCH_NOMATCH);
- }
- }
- /* Control never gets here */
- /* Match an extended character class. This opcode is encountered only
- in UTF-8 mode, because that's the only time it is compiled. */
- #ifdef SUPPORT_UTF8
- case OP_XCLASS:
- {
- data = ecode + 1 + LINK_SIZE; /* Save for matching */
- ecode += GET(ecode, 1); /* Advance past the item */
- switch (*ecode)
- {
- case OP_CRSTAR:
- case OP_CRMINSTAR:
- case OP_CRPLUS:
- case OP_CRMINPLUS:
- case OP_CRQUERY:
- case OP_CRMINQUERY:
- c = *ecode++ - OP_CRSTAR;
- minimize = (c & 1) != 0;
- min = rep_min[c]; /* Pick up values from tables; */
- max = rep_max[c]; /* zero for max => infinity */
- if (max == 0) max = INT_MAX;
- break;
- case OP_CRRANGE:
- case OP_CRMINRANGE:
- minimize = (*ecode == OP_CRMINRANGE);
- min = GET2(ecode, 1);
- max = GET2(ecode, 3);
- if (max == 0) max = INT_MAX;
- ecode += 5;
- break;
- default: /* No repeat follows */
- min = max = 1;
- break;
- }
- /* First, ensure the minimum number of matches are present. */
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINC(c, eptr);
- if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
- }
- /* If max == min we can continue with the main loop without the
- need to recurse. */
- if (min == max) continue;
- /* If minimizing, keep testing the rest of the expression and advancing
- the pointer while it matches the class. */
- if (minimize)
- {
- for (fi = min;; fi++)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINC(c, eptr);
- if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
- }
- /* Control never gets here */
- }
- /* If maximizing, find the longest possible run, then work backwards. */
- else
- {
- pp = eptr;
- for (i = min; i < max; i++)
- {
- int len = 1;
- if (eptr >= md->end_subject) break;
- GETCHARLEN(c, eptr, len);
- if (!_pcre_xclass(c, data)) break;
- eptr += len;
- }
- for(;;)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (eptr-- == pp) break; /* Stop if tried at original pos */
- if (utf8) BACKCHAR(eptr);
- }
- RRETURN(MATCH_NOMATCH);
- }
- /* Control never gets here */
- }
- #endif /* End of XCLASS */
- /* Match a single character, casefully */
- case OP_CHAR:
- #ifdef SUPPORT_UTF8
- if (utf8)
- {
- length = 1;
- ecode++;
- GETCHARLEN(fc, ecode, length);
- if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
- while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
- }
- else
- #endif
- /* Non-UTF-8 mode */
- {
- if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
- if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
- ecode += 2;
- }
- break;
- /* Match a single character, caselessly */
- case OP_CHARNC:
- #ifdef SUPPORT_UTF8
- if (utf8)
- {
- length = 1;
- ecode++;
- GETCHARLEN(fc, ecode, length);
- if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
- /* If the pattern character's value is < 128, we have only one byte, and
- can use the fast lookup table. */
- if (fc < 128)
- {
- if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
- }
- /* Otherwise we must pick up the subject character */
- else
- {
- unsigned int dc;
- GETCHARINC(dc, eptr);
- ecode += length;
- /* If we have Unicode property support, we can use it to test the other
- case of the character, if there is one. */
- if (fc != dc)
- {
- #ifdef SUPPORT_UCP
- if (dc != _pcre_ucp_othercase(fc))
- #endif
- RRETURN(MATCH_NOMATCH);
- }
- }
- }
- else
- #endif /* SUPPORT_UTF8 */
- /* Non-UTF-8 mode */
- {
- if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
- if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
- ecode += 2;
- }
- break;
- /* Match a single character repeatedly. */
- case OP_EXACT:
- min = max = GET2(ecode, 1);
- ecode += 3;
- goto REPEATCHAR;
- case OP_POSUPTO:
- possessive = TRUE;
- /* Fall through */
- case OP_UPTO:
- case OP_MINUPTO:
- min = 0;
- max = GET2(ecode, 1);
- minimize = *ecode == OP_MINUPTO;
- ecode += 3;
- goto REPEATCHAR;
- case OP_POSSTAR:
- possessive = TRUE;
- min = 0;
- max = INT_MAX;
- ecode++;
- goto REPEATCHAR;
- case OP_POSPLUS:
- possessive = TRUE;
- min = 1;
- max = INT_MAX;
- ecode++;
- goto REPEATCHAR;
- case OP_POSQUERY:
- possessive = TRUE;
- min = 0;
- max = 1;
- ecode++;
- goto REPEATCHAR;
- case OP_STAR:
- case OP_MINSTAR:
- case OP_PLUS:
- case OP_MINPLUS:
- case OP_QUERY:
- case OP_MINQUERY:
- c = *ecode++ - OP_STAR;
- minimize = (c & 1) != 0;
- min = rep_min[c]; /* Pick up values from tables; */
- max = rep_max[c]; /* zero for max => infinity */
- if (max == 0) max = INT_MAX;
- /* Common code for all repeated single-character matches. We can give
- up quickly if there are fewer than the minimum number of characters left in
- the subject. */
- REPEATCHAR:
- #ifdef SUPPORT_UTF8
- if (utf8)
- {
- length = 1;
- charptr = ecode;
- GETCHARLEN(fc, ecode, length);
- if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
- ecode += length;
- /* Handle multibyte character matching specially here. There is
- support for caseless matching if UCP support is present. */
- if (length > 1)
- {
- #ifdef SUPPORT_UCP
- unsigned int othercase;
- if ((ims & PCRE_CASELESS) != 0 &&
- (othercase = _pcre_ucp_othercase(fc)) != NOTACHAR)
- oclength = _pcre_ord2utf8(othercase, occhars);
- else oclength = 0;
- #endif /* SUPPORT_UCP */
- for (i = 1; i <= min; i++)
- {
- if (memcmp(eptr, charptr, length) == 0) eptr += length;
- #ifdef SUPPORT_UCP
- /* Need braces because of following else */
- else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
- else
- {
- if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
- eptr += oclength;
- }
- #else /* without SUPPORT_UCP */
- else { RRETURN(MATCH_NOMATCH); }
- #endif /* SUPPORT_UCP */
- }
- if (min == max) continue;
- if (minimize)
- {
- for (fi = min;; fi++)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- if (memcmp(eptr, charptr, length) == 0) eptr += length;
- #ifdef SUPPORT_UCP
- /* Need braces because of following else */
- else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
- else
- {
- if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
- eptr += oclength;
- }
- #else /* without SUPPORT_UCP */
- else { RRETURN (MATCH_NOMATCH); }
- #endif /* SUPPORT_UCP */
- }
- /* Control never gets here */
- }
- else /* Maximize */
- {
- pp = eptr;
- for (i = min; i < max; i++)
- {
- if (eptr > md->end_subject - length) break;
- if (memcmp(eptr, charptr, length) == 0) eptr += length;
- #ifdef SUPPORT_UCP
- else if (oclength == 0) break;
- else
- {
- if (memcmp(eptr, occhars, oclength) != 0) break;
- eptr += oclength;
- }
- #else /* without SUPPORT_UCP */
- else break;
- #endif /* SUPPORT_UCP */
- }
- if (possessive) continue;
- for(;;)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (eptr == pp) RRETURN(MATCH_NOMATCH);
- #ifdef SUPPORT_UCP
- eptr--;
- BACKCHAR(eptr);
- #else /* without SUPPORT_UCP */
- eptr -= length;
- #endif /* SUPPORT_UCP */
- }
- }
- /* Control never gets here */
- }
- /* If the length of a UTF-8 character is 1, we fall through here, and
- obey the code as for non-UTF-8 characters below, though in this case the
- value of fc will always be < 128. */
- }
- else
- #endif /* SUPPORT_UTF8 */
- /* When not in UTF-8 mode, load a single-byte character. */
- {
- if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
- fc = *ecode++;
- }
- /* The value of fc at this point is always less than 256, though we may or
- may not be in UTF-8 mode. The code is duplicated for the caseless and
- caseful cases, for speed, since matching characters is likely to be quite
- common. First, ensure the minimum number of matches are present. If min =
- max, continue at the same level without recursing. Otherwise, if
- minimizing, keep trying the rest of the expression and advancing one
- matching character if failing, up to the maximum. Alternatively, if
- maximizing, find the maximum number of characters and work backwards. */
- DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
- max, eptr));
- if ((ims & PCRE_CASELESS) != 0)
- {
- fc = md->lcc[fc];
- for (i = 1; i <= min; i++)
- if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
- if (min == max) continue;
- if (minimize)
- {
- for (fi = min;; fi++)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max || eptr >= md->end_subject ||
- fc != md->lcc[*eptr++])
- RRETURN(MATCH_NOMATCH);
- }
- /* Control never gets here */
- }
- else /* Maximize */
- {
- pp = eptr;
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
- eptr++;
- }
- if (possessive) continue;
- while (eptr >= pp)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
- eptr--;
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- }
- RRETURN(MATCH_NOMATCH);
- }
- /* Control never gets here */
- }
- /* Caseful comparisons (includes all multi-byte characters) */
- else
- {
- for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
- if (min == max) continue;
- if (minimize)
- {
- for (fi = min;; fi++)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
- RRETURN(MATCH_NOMATCH);
- }
- /* Control never gets here */
- }
- else /* Maximize */
- {
- pp = eptr;
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || fc != *eptr) break;
- eptr++;
- }
- if (possessive) continue;
- while (eptr >= pp)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
- eptr--;
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- }
- RRETURN(MATCH_NOMATCH);
- }
- }
- /* Control never gets here */
- /* Match a negated single one-byte character. The character we are
- checking can be multibyte. */
- case OP_NOT:
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- ecode++;
- GETCHARINCTEST(c, eptr);
- if ((ims & PCRE_CASELESS) != 0)
- {
- #ifdef SUPPORT_UTF8
- if (c < 256)
- #endif
- c = md->lcc[c];
- if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
- }
- else
- {
- if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
- }
- break;
- /* Match a negated single one-byte character repeatedly. This is almost a
- repeat of the code for a repeated single character, but I haven't found a
- nice way of commoning these up that doesn't require a test of the
- positive/negative option for each character match. Maybe that wouldn't add
- very much to the time taken, but character matching *is* what this is all
- about... */
- case OP_NOTEXACT:
- min = max = GET2(ecode, 1);
- ecode += 3;
- goto REPEATNOTCHAR;
- case OP_NOTUPTO:
- case OP_NOTMINUPTO:
- min = 0;
- max = GET2(ecode, 1);
- minimize = *ecode == OP_NOTMINUPTO;
- ecode += 3;
- goto REPEATNOTCHAR;
- case OP_NOTPOSSTAR:
- possessive = TRUE;
- min = 0;
- max = INT_MAX;
- ecode++;
- goto REPEATNOTCHAR;
- case OP_NOTPOSPLUS:
- possessive = TRUE;
- min = 1;
- max = INT_MAX;
- ecode++;
- goto REPEATNOTCHAR;
- case OP_NOTPOSQUERY:
- possessive = TRUE;
- min = 0;
- max = 1;
- ecode++;
- goto REPEATNOTCHAR;
- case OP_NOTPOSUPTO:
- possessive = TRUE;
- min = 0;
- max = GET2(ecode, 1);
- ecode += 3;
- goto REPEATNOTCHAR;
- case OP_NOTSTAR:
- case OP_NOTMINSTAR:
- case OP_NOTPLUS:
- case OP_NOTMINPLUS:
- case OP_NOTQUERY:
- case OP_NOTMINQUERY:
- c = *ecode++ - OP_NOTSTAR;
- minimize = (c & 1) != 0;
- min = rep_min[c]; /* Pick up values from tables; */
- max = rep_max[c]; /* zero for max => infinity */
- if (max == 0) max = INT_MAX;
- /* Common code for all repeated single-byte matches. We can give up quickly
- if there are fewer than the minimum number of bytes left in the
- subject. */
- REPEATNOTCHAR:
- if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
- fc = *ecode++;
- /* The code is duplicated for the caseless and caseful cases, for speed,
- since matching characters is likely to be quite common. First, ensure the
- minimum number of matches are present. If min = max, continue at the same
- level without recursing. Otherwise, if minimizing, keep trying the rest of
- the expression and advancing one matching character if failing, up to the
- maximum. Alternatively, if maximizing, find the maximum number of
- characters and work backwards. */
- DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
- max, eptr));
- if ((ims & PCRE_CASELESS) != 0)
- {
- fc = md->lcc[fc];
- #ifdef SUPPORT_UTF8
- /* UTF-8 mode */
- if (utf8)
- {
- register unsigned int d;
- for (i = 1; i <= min; i++)
- {
- GETCHARINC(d, eptr);
- if (d < 256) d = md->lcc[d];
- if (fc == d) RRETURN(MATCH_NOMATCH);
- }
- }
- else
- #endif
- /* Not UTF-8 mode */
- {
- for (i = 1; i <= min; i++)
- if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
- }
- if (min == max) continue;
- if (minimize)
- {
- #ifdef SUPPORT_UTF8
- /* UTF-8 mode */
- if (utf8)
- {
- register unsigned int d;
- for (fi = min;; fi++)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- GETCHARINC(d, eptr);
- if (d < 256) d = md->lcc[d];
- if (fi >= max || eptr >= md->end_subject || fc == d)
- RRETURN(MATCH_NOMATCH);
- }
- }
- else
- #endif
- /* Not UTF-8 mode */
- {
- for (fi = min;; fi++)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
- RRETURN(MATCH_NOMATCH);
- }
- }
- /* Control never gets here */
- }
- /* Maximize case */
- else
- {
- pp = eptr;
- #ifdef SUPPORT_UTF8
- /* UTF-8 mode */
- if (utf8)
- {
- register unsigned int d;
- for (i = min; i < max; i++)
- {
- int len = 1;
- if (eptr >= md->end_subject) break;
- GETCHARLEN(d, eptr, len);
- if (d < 256) d = md->lcc[d];
- if (fc == d) break;
- eptr += len;
- }
- if (possessive) continue;
- for(;;)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (eptr-- == pp) break; /* Stop if tried at original pos */
- BACKCHAR(eptr);
- }
- }
- else
- #endif
- /* Not UTF-8 mode */
- {
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
- eptr++;
- }
- if (possessive) continue;
- while (eptr >= pp)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- eptr--;
- }
- }
- RRETURN(MATCH_NOMATCH);
- }
- /* Control never gets here */
- }
- /* Caseful comparisons */
- else
- {
- #ifdef SUPPORT_UTF8
- /* UTF-8 mode */
- if (utf8)
- {
- register unsigned int d;
- for (i = 1; i <= min; i++)
- {
- GETCHARINC(d, eptr);
- if (fc == d) RRETURN(MATCH_NOMATCH);
- }
- }
- else
- #endif
- /* Not UTF-8 mode */
- {
- for (i = 1; i <= min; i++)
- if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
- }
- if (min == max) continue;
- if (minimize)
- {
- #ifdef SUPPORT_UTF8
- /* UTF-8 mode */
- if (utf8)
- {
- register unsigned int d;
- for (fi = min;; fi++)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- GETCHARINC(d, eptr);
- if (fi >= max || eptr >= md->end_subject || fc == d)
- RRETURN(MATCH_NOMATCH);
- }
- }
- else
- #endif
- /* Not UTF-8 mode */
- {
- for (fi = min;; fi++)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
- RRETURN(MATCH_NOMATCH);
- }
- }
- /* Control never gets here */
- }
- /* Maximize case */
- else
- {
- pp = eptr;
- #ifdef SUPPORT_UTF8
- /* UTF-8 mode */
- if (utf8)
- {
- register unsigned int d;
- for (i = min; i < max; i++)
- {
- int len = 1;
- if (eptr >= md->end_subject) break;
- GETCHARLEN(d, eptr, len);
- if (fc == d) break;
- eptr += len;
- }
- if (possessive) continue;
- for(;;)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (eptr-- == pp) break; /* Stop if tried at original pos */
- BACKCHAR(eptr);
- }
- }
- else
- #endif
- /* Not UTF-8 mode */
- {
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || fc == *eptr) break;
- eptr++;
- }
- if (possessive) continue;
- while (eptr >= pp)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- eptr--;
- }
- }
- RRETURN(MATCH_NOMATCH);
- }
- }
- /* Control never gets here */
- /* Match a single character type repeatedly; several different opcodes
- share code. This is very similar to the code for single characters, but we
- repeat it in the interests of efficiency. */
- case OP_TYPEEXACT:
- min = max = GET2(ecode, 1);
- minimize = TRUE;
- ecode += 3;
- goto REPEATTYPE;
- case OP_TYPEUPTO:
- case OP_TYPEMINUPTO:
- min = 0;
- max = GET2(ecode, 1);
- minimize = *ecode == OP_TYPEMINUPTO;
- ecode += 3;
- goto REPEATTYPE;
- case OP_TYPEPOSSTAR:
- possessive = TRUE;
- min = 0;
- max = INT_MAX;
- ecode++;
- goto REPEATTYPE;
- case OP_TYPEPOSPLUS:
- possessive = TRUE;
- min = 1;
- max = INT_MAX;
- ecode++;
- goto REPEATTYPE;
- case OP_TYPEPOSQUERY:
- possessive = TRUE;
- min = 0;
- max = 1;
- ecode++;
- goto REPEATTYPE;
- case OP_TYPEPOSUPTO:
- possessive = TRUE;
- min = 0;
- max = GET2(ecode, 1);
- ecode += 3;
- goto REPEATTYPE;
- case OP_TYPESTAR:
- case OP_TYPEMINSTAR:
- case OP_TYPEPLUS:
- case OP_TYPEMINPLUS:
- case OP_TYPEQUERY:
- case OP_TYPEMINQUERY:
- c = *ecode++ - OP_TYPESTAR;
- minimize = (c & 1) != 0;
- min = rep_min[c]; /* Pick up values from tables; */
- max = rep_max[c]; /* zero for max => infinity */
- if (max == 0) max = INT_MAX;
- /* Common code for all repeated single character type matches. Note that
- in UTF-8 mode, '.' matches a character of any length, but for the other
- character types, the valid characters are all one-byte long. */
- REPEATTYPE:
- ctype = *ecode++; /* Code for the character type */
- #ifdef SUPPORT_UCP
- if (ctype == OP_PROP || ctype == OP_NOTPROP)
- {
- prop_fail_result = ctype == OP_NOTPROP;
- prop_type = *ecode++;
- prop_value = *ecode++;
- }
- else prop_type = -1;
- #endif
- /* First, ensure the minimum number of matches are present. Use inline
- code for maximizing the speed, and do the type test once at the start
- (i.e. keep it out of the loop). Also we can test that there are at least
- the minimum number of bytes before we start. This isn't as effective in
- UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
- is tidier. Also separate the UCP code, which can be the same for both UTF-8
- and single-bytes. */
- if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
- if (min > 0)
- {
- #ifdef SUPPORT_UCP
- if (prop_type >= 0)
- {
- switch(prop_type)
- {
- case PT_ANY:
- if (prop_fail_result) RRETURN(MATCH_NOMATCH);
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINCTEST(c, eptr);
- }
- break;
- case PT_LAMP:
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINCTEST(c, eptr);
- prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
- if ((prop_chartype == ucp_Lu ||
- prop_chartype == ucp_Ll ||
- prop_chartype == ucp_Lt) == prop_fail_result)
- RRETURN(MATCH_NOMATCH);
- }
- break;
- case PT_GC:
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINCTEST(c, eptr);
- prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
- if ((prop_category == prop_value) == prop_fail_result)
- RRETURN(MATCH_NOMATCH);
- }
- break;
- case PT_PC:
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINCTEST(c, eptr);
- prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
- if ((prop_chartype == prop_value) == prop_fail_result)
- RRETURN(MATCH_NOMATCH);
- }
- break;
- case PT_SC:
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINCTEST(c, eptr);
- prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
- if ((prop_script == prop_value) == prop_fail_result)
- RRETURN(MATCH_NOMATCH);
- }
- break;
- default:
- RRETURN(PCRE_ERROR_INTERNAL);
- }
- }
- /* Match extended Unicode sequences. We will get here only if the
- support is in the binary; otherwise a compile-time error occurs. */
- else if (ctype == OP_EXTUNI)
- {
- for (i = 1; i <= min; i++)
- {
- GETCHARINCTEST(c, eptr);
- prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
- if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
- while (eptr < md->end_subject)
- {
- int len = 1;
- if (!utf8) c = *eptr; else
- {
- GETCHARLEN(c, eptr, len);
- }
- prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
- if (prop_category != ucp_M) break;
- eptr += len;
- }
- }
- }
- else
- #endif /* SUPPORT_UCP */
- /* Handle all other cases when the coding is UTF-8 */
- #ifdef SUPPORT_UTF8
- if (utf8) switch(ctype)
- {
- case OP_ANY:
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject ||
- ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
- RRETURN(MATCH_NOMATCH);
- eptr++;
- while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
- }
- break;
- case OP_ANYBYTE:
- eptr += min;
- break;
- case OP_ANYNL:
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINC(c, eptr);
- switch(c)
- {
- default: RRETURN(MATCH_NOMATCH);
- case 0x000d:
- if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
- break;
- case 0x000a:
- break;
- case 0x000b:
- case 0x000c:
- case 0x0085:
- case 0x2028:
- case 0x2029:
- if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
- break;
- }
- }
- break;
- case OP_NOT_HSPACE:
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINC(c, eptr);
- switch(c)
- {
- default: break;
- case 0x09: /* HT */
- case 0x20: /* SPACE */
- case 0xa0: /* NBSP */
- case 0x1680: /* OGHAM SPACE MARK */
- case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
- case 0x2000: /* EN QUAD */
- case 0x2001: /* EM QUAD */
- case 0x2002: /* EN SPACE */
- case 0x2003: /* EM SPACE */
- case 0x2004: /* THREE-PER-EM SPACE */
- case 0x2005: /* FOUR-PER-EM SPACE */
- case 0x2006: /* SIX-PER-EM SPACE */
- case 0x2007: /* FIGURE SPACE */
- case 0x2008: /* PUNCTUATION SPACE */
- case 0x2009: /* THIN SPACE */
- case 0x200A: /* HAIR SPACE */
- case 0x202f: /* NARROW NO-BREAK SPACE */
- case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
- case 0x3000: /* IDEOGRAPHIC SPACE */
- RRETURN(MATCH_NOMATCH);
- }
- }
- break;
- case OP_HSPACE:
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINC(c, eptr);
- switch(c)
- {
- default: RRETURN(MATCH_NOMATCH);
- case 0x09: /* HT */
- case 0x20: /* SPACE */
- case 0xa0: /* NBSP */
- case 0x1680: /* OGHAM SPACE MARK */
- case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
- case 0x2000: /* EN QUAD */
- case 0x2001: /* EM QUAD */
- case 0x2002: /* EN SPACE */
- case 0x2003: /* EM SPACE */
- case 0x2004: /* THREE-PER-EM SPACE */
- case 0x2005: /* FOUR-PER-EM SPACE */
- case 0x2006: /* SIX-PER-EM SPACE */
- case 0x2007: /* FIGURE SPACE */
- case 0x2008: /* PUNCTUATION SPACE */
- case 0x2009: /* THIN SPACE */
- case 0x200A: /* HAIR SPACE */
- case 0x202f: /* NARROW NO-BREAK SPACE */
- case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
- case 0x3000: /* IDEOGRAPHIC SPACE */
- break;
- }
- }
- break;
- case OP_NOT_VSPACE:
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINC(c, eptr);
- switch(c)
- {
- default: break;
- case 0x0a: /* LF */
- case 0x0b: /* VT */
- case 0x0c: /* FF */
- case 0x0d: /* CR */
- case 0x85: /* NEL */
- case 0x2028: /* LINE SEPARATOR */
- case 0x2029: /* PARAGRAPH SEPARATOR */
- RRETURN(MATCH_NOMATCH);
- }
- }
- break;
- case OP_VSPACE:
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINC(c, eptr);
- switch(c)
- {
- default: RRETURN(MATCH_NOMATCH);
- case 0x0a: /* LF */
- case 0x0b: /* VT */
- case 0x0c: /* FF */
- case 0x0d: /* CR */
- case 0x85: /* NEL */
- case 0x2028: /* LINE SEPARATOR */
- case 0x2029: /* PARAGRAPH SEPARATOR */
- break;
- }
- }
- break;
- case OP_NOT_DIGIT:
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINC(c, eptr);
- if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
- RRETURN(MATCH_NOMATCH);
- }
- break;
- case OP_DIGIT:
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject ||
- *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
- RRETURN(MATCH_NOMATCH);
- /* No need to skip more bytes - we know it's a 1-byte character */
- }
- break;
- case OP_NOT_WHITESPACE:
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject ||
- (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0))
- RRETURN(MATCH_NOMATCH);
- while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
- }
- break;
- case OP_WHITESPACE:
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject ||
- *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
- RRETURN(MATCH_NOMATCH);
- /* No need to skip more bytes - we know it's a 1-byte character */
- }
- break;
- case OP_NOT_WORDCHAR:
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject ||
- (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0))
- RRETURN(MATCH_NOMATCH);
- while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
- }
- break;
- case OP_WORDCHAR:
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject ||
- *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
- RRETURN(MATCH_NOMATCH);
- /* No need to skip more bytes - we know it's a 1-byte character */
- }
- break;
- default:
- RRETURN(PCRE_ERROR_INTERNAL);
- } /* End switch(ctype) */
- else
- #endif /* SUPPORT_UTF8 */
- /* Code for the non-UTF-8 case for minimum matching of operators other
- than OP_PROP and OP_NOTPROP. We can assume that there are the minimum
- number of bytes present, as this was tested above. */
- switch(ctype)
- {
- case OP_ANY:
- if ((ims & PCRE_DOTALL) == 0)
- {
- for (i = 1; i <= min; i++)
- {
- if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
- eptr++;
- }
- }
- else eptr += min;
- break;
- case OP_ANYBYTE:
- eptr += min;
- break;
- /* Because of the CRLF case, we can't assume the minimum number of
- bytes are present in this case. */
- case OP_ANYNL:
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- switch(*eptr++)
- {
- default: RRETURN(MATCH_NOMATCH);
- case 0x000d:
- if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
- break;
- case 0x000a:
- break;
- case 0x000b:
- case 0x000c:
- case 0x0085:
- if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
- break;
- }
- }
- break;
- case OP_NOT_HSPACE:
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- switch(*eptr++)
- {
- default: break;
- case 0x09: /* HT */
- case 0x20: /* SPACE */
- case 0xa0: /* NBSP */
- RRETURN(MATCH_NOMATCH);
- }
- }
- break;
- case OP_HSPACE:
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- switch(*eptr++)
- {
- default: RRETURN(MATCH_NOMATCH);
- case 0x09: /* HT */
- case 0x20: /* SPACE */
- case 0xa0: /* NBSP */
- break;
- }
- }
- break;
- case OP_NOT_VSPACE:
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- switch(*eptr++)
- {
- default: break;
- case 0x0a: /* LF */
- case 0x0b: /* VT */
- case 0x0c: /* FF */
- case 0x0d: /* CR */
- case 0x85: /* NEL */
- RRETURN(MATCH_NOMATCH);
- }
- }
- break;
- case OP_VSPACE:
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- switch(*eptr++)
- {
- default: RRETURN(MATCH_NOMATCH);
- case 0x0a: /* LF */
- case 0x0b: /* VT */
- case 0x0c: /* FF */
- case 0x0d: /* CR */
- case 0x85: /* NEL */
- break;
- }
- }
- break;
- case OP_NOT_DIGIT:
- for (i = 1; i <= min; i++)
- if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
- break;
- case OP_DIGIT:
- for (i = 1; i <= min; i++)
- if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
- break;
- case OP_NOT_WHITESPACE:
- for (i = 1; i <= min; i++)
- if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
- break;
- case OP_WHITESPACE:
- for (i = 1; i <= min; i++)
- if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
- break;
- case OP_NOT_WORDCHAR:
- for (i = 1; i <= min; i++)
- if ((md->ctypes[*eptr++] & ctype_word) != 0)
- RRETURN(MATCH_NOMATCH);
- break;
- case OP_WORDCHAR:
- for (i = 1; i <= min; i++)
- if ((md->ctypes[*eptr++] & ctype_word) == 0)
- RRETURN(MATCH_NOMATCH);
- break;
- default:
- RRETURN(PCRE_ERROR_INTERNAL);
- }
- }
- /* If min = max, continue at the same level without recursing */
- if (min == max) continue;
- /* If minimizing, we have to test the rest of the pattern before each
- subsequent match. Again, separate the UTF-8 case for speed, and also
- separate the UCP cases. */
- if (minimize)
- {
- #ifdef SUPPORT_UCP
- if (prop_type >= 0)
- {
- switch(prop_type)
- {
- case PT_ANY:
- for (fi = min;; fi++)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINC(c, eptr);
- if (prop_fail_result) RRETURN(MATCH_NOMATCH);
- }
- /* Control never gets here */
- case PT_LAMP:
- for (fi = min;; fi++)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINC(c, eptr);
- prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
- if ((prop_chartype == ucp_Lu ||
- prop_chartype == ucp_Ll ||
- prop_chartype == ucp_Lt) == prop_fail_result)
- RRETURN(MATCH_NOMATCH);
- }
- /* Control never gets here */
- case PT_GC:
- for (fi = min;; fi++)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINC(c, eptr);
- prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
- if ((prop_category == prop_value) == prop_fail_result)
- RRETURN(MATCH_NOMATCH);
- }
- /* Control never gets here */
- case PT_PC:
- for (fi = min;; fi++)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINC(c, eptr);
- prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
- if ((prop_chartype == prop_value) == prop_fail_result)
- RRETURN(MATCH_NOMATCH);
- }
- /* Control never gets here */
- case PT_SC:
- for (fi = min;; fi++)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINC(c, eptr);
- prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
- if ((prop_script == prop_value) == prop_fail_result)
- RRETURN(MATCH_NOMATCH);
- }
- /* Control never gets here */
- default:
- RRETURN(PCRE_ERROR_INTERNAL);
- }
- }
- /* Match extended Unicode sequences. We will get here only if the
- support is in the binary; otherwise a compile-time error occurs. */
- else if (ctype == OP_EXTUNI)
- {
- for (fi = min;; fi++)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
- GETCHARINCTEST(c, eptr);
- prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
- if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
- while (eptr < md->end_subject)
- {
- int len = 1;
- if (!utf8) c = *eptr; else
- {
- GETCHARLEN(c, eptr, len);
- }
- prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
- if (prop_category != ucp_M) break;
- eptr += len;
- }
- }
- }
- else
- #endif /* SUPPORT_UCP */
- #ifdef SUPPORT_UTF8
- /* UTF-8 mode */
- if (utf8)
- {
- for (fi = min;; fi++)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max || eptr >= md->end_subject ||
- (ctype == OP_ANY && (ims & PCRE_DOTALL) == 0 &&
- IS_NEWLINE(eptr)))
- RRETURN(MATCH_NOMATCH);
- GETCHARINC(c, eptr);
- switch(ctype)
- {
- case OP_ANY: /* This is the DOTALL case */
- break;
- case OP_ANYBYTE:
- break;
- case OP_ANYNL:
- switch(c)
- {
- default: RRETURN(MATCH_NOMATCH);
- case 0x000d:
- if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
- break;
- case 0x000a:
- break;
- case 0x000b:
- case 0x000c:
- case 0x0085:
- case 0x2028:
- case 0x2029:
- if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
- break;
- }
- break;
- case OP_NOT_HSPACE:
- switch(c)
- {
- default: break;
- case 0x09: /* HT */
- case 0x20: /* SPACE */
- case 0xa0: /* NBSP */
- case 0x1680: /* OGHAM SPACE MARK */
- case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
- case 0x2000: /* EN QUAD */
- case 0x2001: /* EM QUAD */
- case 0x2002: /* EN SPACE */
- case 0x2003: /* EM SPACE */
- case 0x2004: /* THREE-PER-EM SPACE */
- case 0x2005: /* FOUR-PER-EM SPACE */
- case 0x2006: /* SIX-PER-EM SPACE */
- case 0x2007: /* FIGURE SPACE */
- case 0x2008: /* PUNCTUATION SPACE */
- case 0x2009: /* THIN SPACE */
- case 0x200A: /* HAIR SPACE */
- case 0x202f: /* NARROW NO-BREAK SPACE */
- case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
- case 0x3000: /* IDEOGRAPHIC SPACE */
- RRETURN(MATCH_NOMATCH);
- }
- break;
- case OP_HSPACE:
- switch(c)
- {
- default: RRETURN(MATCH_NOMATCH);
- case 0x09: /* HT */
- case 0x20: /* SPACE */
- case 0xa0: /* NBSP */
- case 0x1680: /* OGHAM SPACE MARK */
- case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
- case 0x2000: /* EN QUAD */
- case 0x2001: /* EM QUAD */
- case 0x2002: /* EN SPACE */
- case 0x2003: /* EM SPACE */
- case 0x2004: /* THREE-PER-EM SPACE */
- case 0x2005: /* FOUR-PER-EM SPACE */
- case 0x2006: /* SIX-PER-EM SPACE */
- case 0x2007: /* FIGURE SPACE */
- case 0x2008: /* PUNCTUATION SPACE */
- case 0x2009: /* THIN SPACE */
- case 0x200A: /* HAIR SPACE */
- case 0x202f: /* NARROW NO-BREAK SPACE */
- case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
- case 0x3000: /* IDEOGRAPHIC SPACE */
- break;
- }
- break;
- case OP_NOT_VSPACE:
- switch(c)
- {
- default: break;
- case 0x0a: /* LF */
- case 0x0b: /* VT */
- case 0x0c: /* FF */
- case 0x0d: /* CR */
- case 0x85: /* NEL */
- case 0x2028: /* LINE SEPARATOR */
- case 0x2029: /* PARAGRAPH SEPARATOR */
- RRETURN(MATCH_NOMATCH);
- }
- break;
- case OP_VSPACE:
- switch(c)
- {
- default: RRETURN(MATCH_NOMATCH);
- case 0x0a: /* LF */
- case 0x0b: /* VT */
- case 0x0c: /* FF */
- case 0x0d: /* CR */
- case 0x85: /* NEL */
- case 0x2028: /* LINE SEPARATOR */
- case 0x2029: /* PARAGRAPH SEPARATOR */
- break;
- }
- break;
- case OP_NOT_DIGIT:
- if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
- RRETURN(MATCH_NOMATCH);
- break;
- case OP_DIGIT:
- if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
- RRETURN(MATCH_NOMATCH);
- break;
- case OP_NOT_WHITESPACE:
- if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
- RRETURN(MATCH_NOMATCH);
- break;
- case OP_WHITESPACE:
- if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
- RRETURN(MATCH_NOMATCH);
- break;
- case OP_NOT_WORDCHAR:
- if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
- RRETURN(MATCH_NOMATCH);
- break;
- case OP_WORDCHAR:
- if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
- RRETURN(MATCH_NOMATCH);
- break;
- default:
- RRETURN(PCRE_ERROR_INTERNAL);
- }
- }
- }
- else
- #endif
- /* Not UTF-8 mode */
- {
- for (fi = min;; fi++)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (fi >= max || eptr >= md->end_subject ||
- ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
- RRETURN(MATCH_NOMATCH);
- c = *eptr++;
- switch(ctype)
- {
- case OP_ANY: /* This is the DOTALL case */
- break;
- case OP_ANYBYTE:
- break;
- case OP_ANYNL:
- switch(c)
- {
- default: RRETURN(MATCH_NOMATCH);
- case 0x000d:
- if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
- break;
- case 0x000a:
- break;
- case 0x000b:
- case 0x000c:
- case 0x0085:
- if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
- break;
- }
- break;
- case OP_NOT_HSPACE:
- switch(c)
- {
- default: break;
- case 0x09: /* HT */
- case 0x20: /* SPACE */
- case 0xa0: /* NBSP */
- RRETURN(MATCH_NOMATCH);
- }
- break;
- case OP_HSPACE:
- switch(c)
- {
- default: RRETURN(MATCH_NOMATCH);
- case 0x09: /* HT */
- case 0x20: /* SPACE */
- case 0xa0: /* NBSP */
- break;
- }
- break;
- case OP_NOT_VSPACE:
- switch(c)
- {
- default: break;
- case 0x0a: /* LF */
- case 0x0b: /* VT */
- case 0x0c: /* FF */
- case 0x0d: /* CR */
- case 0x85: /* NEL */
- RRETURN(MATCH_NOMATCH);
- }
- break;
- case OP_VSPACE:
- switch(c)
- {
- default: RRETURN(MATCH_NOMATCH);
- case 0x0a: /* LF */
- case 0x0b: /* VT */
- case 0x0c: /* FF */
- case 0x0d: /* CR */
- case 0x85: /* NEL */
- break;
- }
- break;
- case OP_NOT_DIGIT:
- if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
- break;
- case OP_DIGIT:
- if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
- break;
- case OP_NOT_WHITESPACE:
- if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
- break;
- case OP_WHITESPACE:
- if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
- break;
- case OP_NOT_WORDCHAR:
- if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
- break;
- case OP_WORDCHAR:
- if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
- break;
- default:
- RRETURN(PCRE_ERROR_INTERNAL);
- }
- }
- }
- /* Control never gets here */
- }
- /* If maximizing, it is worth using inline code for speed, doing the type
- test once at the start (i.e. keep it out of the loop). Again, keep the
- UTF-8 and UCP stuff separate. */
- else
- {
- pp = eptr; /* Remember where we started */
- #ifdef SUPPORT_UCP
- if (prop_type >= 0)
- {
- switch(prop_type)
- {
- case PT_ANY:
- for (i = min; i < max; i++)
- {
- int len = 1;
- if (eptr >= md->end_subject) break;
- GETCHARLEN(c, eptr, len);
- if (prop_fail_result) break;
- eptr+= len;
- }
- break;
- case PT_LAMP:
- for (i = min; i < max; i++)
- {
- int len = 1;
- if (eptr >= md->end_subject) break;
- GETCHARLEN(c, eptr, len);
- prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
- if ((prop_chartype == ucp_Lu ||
- prop_chartype == ucp_Ll ||
- prop_chartype == ucp_Lt) == prop_fail_result)
- break;
- eptr+= len;
- }
- break;
- case PT_GC:
- for (i = min; i < max; i++)
- {
- int len = 1;
- if (eptr >= md->end_subject) break;
- GETCHARLEN(c, eptr, len);
- prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
- if ((prop_category == prop_value) == prop_fail_result)
- break;
- eptr+= len;
- }
- break;
- case PT_PC:
- for (i = min; i < max; i++)
- {
- int len = 1;
- if (eptr >= md->end_subject) break;
- GETCHARLEN(c, eptr, len);
- prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
- if ((prop_chartype == prop_value) == prop_fail_result)
- break;
- eptr+= len;
- }
- break;
- case PT_SC:
- for (i = min; i < max; i++)
- {
- int len = 1;
- if (eptr >= md->end_subject) break;
- GETCHARLEN(c, eptr, len);
- prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
- if ((prop_script == prop_value) == prop_fail_result)
- break;
- eptr+= len;
- }
- break;
- }
- /* eptr is now past the end of the maximum run */
- if (possessive) continue;
- for(;;)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (eptr-- == pp) break; /* Stop if tried at original pos */
- if (utf8) BACKCHAR(eptr);
- }
- }
- /* Match extended Unicode sequences. We will get here only if the
- support is in the binary; otherwise a compile-time error occurs. */
- else if (ctype == OP_EXTUNI)
- {
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject) break;
- GETCHARINCTEST(c, eptr);
- prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
- if (prop_category == ucp_M) break;
- while (eptr < md->end_subject)
- {
- int len = 1;
- if (!utf8) c = *eptr; else
- {
- GETCHARLEN(c, eptr, len);
- }
- prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
- if (prop_category != ucp_M) break;
- eptr += len;
- }
- }
- /* eptr is now past the end of the maximum run */
- if (possessive) continue;
- for(;;)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (eptr-- == pp) break; /* Stop if tried at original pos */
- for (;;) /* Move back over one extended */
- {
- int len = 1;
- if (!utf8) c = *eptr; else
- {
- BACKCHAR(eptr);
- GETCHARLEN(c, eptr, len);
- }
- prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
- if (prop_category != ucp_M) break;
- eptr--;
- }
- }
- }
- else
- #endif /* SUPPORT_UCP */
- #ifdef SUPPORT_UTF8
- /* UTF-8 mode */
- if (utf8)
- {
- switch(ctype)
- {
- case OP_ANY:
- if (max < INT_MAX)
- {
- if ((ims & PCRE_DOTALL) == 0)
- {
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
- eptr++;
- while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
- }
- }
- else
- {
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject) break;
- eptr++;
- while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
- }
- }
- }
- /* Handle unlimited UTF-8 repeat */
- else
- {
- if ((ims & PCRE_DOTALL) == 0)
- {
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
- eptr++;
- while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
- }
- }
- else
- {
- eptr = md->end_subject;
- }
- }
- break;
- /* The byte case is the same as non-UTF8 */
- case OP_ANYBYTE:
- c = max - min;
- if (c > (unsigned int)(md->end_subject - eptr))
- c = md->end_subject - eptr;
- eptr += c;
- break;
- case OP_ANYNL:
- for (i = min; i < max; i++)
- {
- int len = 1;
- if (eptr >= md->end_subject) break;
- GETCHARLEN(c, eptr, len);
- if (c == 0x000d)
- {
- if (++eptr >= md->end_subject) break;
- if (*eptr == 0x000a) eptr++;
- }
- else
- {
- if (c != 0x000a &&
- (md->bsr_anycrlf ||
- (c != 0x000b && c != 0x000c &&
- c != 0x0085 && c != 0x2028 && c != 0x2029)))
- break;
- eptr += len;
- }
- }
- break;
- case OP_NOT_HSPACE:
- case OP_HSPACE:
- for (i = min; i < max; i++)
- {
- BOOL gotspace;
- int len = 1;
- if (eptr >= md->end_subject) break;
- GETCHARLEN(c, eptr, len);
- switch(c)
- {
- default: gotspace = FALSE; break;
- case 0x09: /* HT */
- case 0x20: /* SPACE */
- case 0xa0: /* NBSP */
- case 0x1680: /* OGHAM SPACE MARK */
- case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
- case 0x2000: /* EN QUAD */
- case 0x2001: /* EM QUAD */
- case 0x2002: /* EN SPACE */
- case 0x2003: /* EM SPACE */
- case 0x2004: /* THREE-PER-EM SPACE */
- case 0x2005: /* FOUR-PER-EM SPACE */
- case 0x2006: /* SIX-PER-EM SPACE */
- case 0x2007: /* FIGURE SPACE */
- case 0x2008: /* PUNCTUATION SPACE */
- case 0x2009: /* THIN SPACE */
- case 0x200A: /* HAIR SPACE */
- case 0x202f: /* NARROW NO-BREAK SPACE */
- case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
- case 0x3000: /* IDEOGRAPHIC SPACE */
- gotspace = TRUE;
- break;
- }
- if (gotspace == (ctype == OP_NOT_HSPACE)) break;
- eptr += len;
- }
- break;
- case OP_NOT_VSPACE:
- case OP_VSPACE:
- for (i = min; i < max; i++)
- {
- BOOL gotspace;
- int len = 1;
- if (eptr >= md->end_subject) break;
- GETCHARLEN(c, eptr, len);
- switch(c)
- {
- default: gotspace = FALSE; break;
- case 0x0a: /* LF */
- case 0x0b: /* VT */
- case 0x0c: /* FF */
- case 0x0d: /* CR */
- case 0x85: /* NEL */
- case 0x2028: /* LINE SEPARATOR */
- case 0x2029: /* PARAGRAPH SEPARATOR */
- gotspace = TRUE;
- break;
- }
- if (gotspace == (ctype == OP_NOT_VSPACE)) break;
- eptr += len;
- }
- break;
- case OP_NOT_DIGIT:
- for (i = min; i < max; i++)
- {
- int len = 1;
- if (eptr >= md->end_subject) break;
- GETCHARLEN(c, eptr, len);
- if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
- eptr+= len;
- }
- break;
- case OP_DIGIT:
- for (i = min; i < max; i++)
- {
- int len = 1;
- if (eptr >= md->end_subject) break;
- GETCHARLEN(c, eptr, len);
- if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
- eptr+= len;
- }
- break;
- case OP_NOT_WHITESPACE:
- for (i = min; i < max; i++)
- {
- int len = 1;
- if (eptr >= md->end_subject) break;
- GETCHARLEN(c, eptr, len);
- if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
- eptr+= len;
- }
- break;
- case OP_WHITESPACE:
- for (i = min; i < max; i++)
- {
- int len = 1;
- if (eptr >= md->end_subject) break;
- GETCHARLEN(c, eptr, len);
- if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
- eptr+= len;
- }
- break;
- case OP_NOT_WORDCHAR:
- for (i = min; i < max; i++)
- {
- int len = 1;
- if (eptr >= md->end_subject) break;
- GETCHARLEN(c, eptr, len);
- if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
- eptr+= len;
- }
- break;
- case OP_WORDCHAR:
- for (i = min; i < max; i++)
- {
- int len = 1;
- if (eptr >= md->end_subject) break;
- GETCHARLEN(c, eptr, len);
- if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
- eptr+= len;
- }
- break;
- default:
- RRETURN(PCRE_ERROR_INTERNAL);
- }
- /* eptr is now past the end of the maximum run */
- if (possessive) continue;
- for(;;)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- if (eptr-- == pp) break; /* Stop if tried at original pos */
- BACKCHAR(eptr);
- }
- }
- else
- #endif /* SUPPORT_UTF8 */
- /* Not UTF-8 mode */
- {
- switch(ctype)
- {
- case OP_ANY:
- if ((ims & PCRE_DOTALL) == 0)
- {
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
- eptr++;
- }
- break;
- }
- /* For DOTALL case, fall through and treat as \C */
- case OP_ANYBYTE:
- c = max - min;
- if (c > (unsigned int)(md->end_subject - eptr))
- c = md->end_subject - eptr;
- eptr += c;
- break;
- case OP_ANYNL:
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject) break;
- c = *eptr;
- if (c == 0x000d)
- {
- if (++eptr >= md->end_subject) break;
- if (*eptr == 0x000a) eptr++;
- }
- else
- {
- if (c != 0x000a &&
- (md->bsr_anycrlf ||
- (c != 0x000b && c != 0x000c && c != 0x0085)))
- break;
- eptr++;
- }
- }
- break;
- case OP_NOT_HSPACE:
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject) break;
- c = *eptr;
- if (c == 0x09 || c == 0x20 || c == 0xa0) break;
- eptr++;
- }
- break;
- case OP_HSPACE:
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject) break;
- c = *eptr;
- if (c != 0x09 && c != 0x20 && c != 0xa0) break;
- eptr++;
- }
- break;
- case OP_NOT_VSPACE:
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject) break;
- c = *eptr;
- if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
- break;
- eptr++;
- }
- break;
- case OP_VSPACE:
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject) break;
- c = *eptr;
- if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
- break;
- eptr++;
- }
- break;
- case OP_NOT_DIGIT:
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
- break;
- eptr++;
- }
- break;
- case OP_DIGIT:
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
- break;
- eptr++;
- }
- break;
- case OP_NOT_WHITESPACE:
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
- break;
- eptr++;
- }
- break;
- case OP_WHITESPACE:
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
- break;
- eptr++;
- }
- break;
- case OP_NOT_WORDCHAR:
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
- break;
- eptr++;
- }
- break;
- case OP_WORDCHAR:
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
- break;
- eptr++;
- }
- break;
- default:
- RRETURN(PCRE_ERROR_INTERNAL);
- }
- /* eptr is now past the end of the maximum run */
- if (possessive) continue;
- while (eptr >= pp)
- {
- RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM47);
- eptr--;
- if (rrc != MATCH_NOMATCH) RRETURN(rrc);
- }
- }
- /* Get here if we can't make it match with any permitted repetitions */
- RRETURN(MATCH_NOMATCH);
- }
- /* Control never gets here */
- /* There's been some horrible disaster. Arrival here can only mean there is
- something seriously wrong in the code above or the OP_xxx definitions. */
- default:
- DPRINTF(("Unknown opcode %d\n", *ecode));
- RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
- }
- /* Do not stick any code in here without much thought; it is assumed
- that "continue" in the code above comes out to here to repeat the main
- loop. */
- } /* End of main loop */
- /* Control never reaches here */
- /* When compiling to use the heap rather than the stack for recursive calls to
- match(), the RRETURN() macro jumps here. The number that is saved in
- frame->Xwhere indicates which label we actually want to return to. */
- #ifdef NO_RECURSE
- #define LBL(val) case val: goto L_RM##val;
- HEAP_RETURN:
- switch (frame->Xwhere)
- {
- LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
- LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
- LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
- LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
- LBL(53) LBL(54)
- #ifdef SUPPORT_UTF8
- LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
- LBL(32) LBL(34) LBL(42) LBL(46)
- #ifdef SUPPORT_UCP
- LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
- #endif /* SUPPORT_UCP */
- #endif /* SUPPORT_UTF8 */
- default:
- DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
- return PCRE_ERROR_INTERNAL;
- }
- #undef LBL
- #endif /* NO_RECURSE */
- }
- /***************************************************************************
- ****************************************************************************
- RECURSION IN THE match() FUNCTION
- Undefine all the macros that were defined above to handle this. The names
- listed inside the NO_RECURSE test below are removed only when NO_RECURSE is
- defined, that is, in the build that uses the heap rather than the stack for
- recursive calls to match(). The final two names, fc and fi, are removed
- outside that test and therefore in either build. Undefining them here keeps
- these short, common identifiers from being macro-expanded in the code that
- follows in this file. */
- #ifdef NO_RECURSE
- #undef eptr
- #undef ecode
- #undef mstart
- #undef offset_top
- #undef ims
- #undef eptrb
- #undef flags
- #undef callpat
- #undef charptr
- #undef data
- #undef next
- #undef pp
- #undef prev
- #undef saved_eptr
- #undef new_recursive
- #undef cur_is_word
- #undef condition
- #undef prev_is_word
- #undef original_ims
- #undef ctype
- #undef length
- #undef max
- #undef min
- #undef number
- #undef offset
- #undef op
- #undef save_capture_last
- #undef save_offset1
- #undef save_offset2
- #undef save_offset3
- #undef stacksave
- #undef newptrb
- #endif /* NO_RECURSE */
- /* These two are defined as macros in both cases (with and without
- NO_RECURSE), so they are undefined unconditionally. */
- #undef fc
- #undef fi
- /***************************************************************************
- ***************************************************************************/
- /*************************************************
- * Execute a Regular Expression *
- *************************************************/
- /* This function applies a compiled re to a subject string and picks out
- portions of the string if it matches. Two elements in the vector are set for
- each substring: the offsets to the start and end of the substring.
- Unless the pattern is anchored, match() is called once for each candidate
- starting position (the "bumpalong" loop) until it succeeds, returns an error,
- or the end of the subject is passed.
- Arguments:
- argument_re points to the compiled expression
- extra_data points to extra data or is NULL
- subject points to the subject string
- length length of subject string (may contain binary zeros)
- start_offset where to start in the subject string
- options option bits
- offsets points to a vector of ints to be filled in with offsets
- offsetcount the number of elements in the vector
- Returns: > 0 => success; value is the number of elements filled in
- = 0 => success, but offsets is not big enough
- -1 => failed to match
- < -1 => some kind of unexpected problem
- The error values returned directly by this function are PCRE_ERROR_BADOPTION,
- PCRE_ERROR_NULL, PCRE_ERROR_BADCOUNT, PCRE_ERROR_BADMAGIC,
- PCRE_ERROR_BADNEWLINE, PCRE_ERROR_BADPARTIAL, PCRE_ERROR_BADUTF8,
- PCRE_ERROR_BADUTF8_OFFSET, PCRE_ERROR_NOMEMORY, PCRE_ERROR_PARTIAL and
- PCRE_ERROR_NOMATCH. Any other non-match value returned by match() is passed
- back to the caller unchanged.
- */
- PCRE_EXP_DEFN int
- pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
- PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
- int offsetcount)
- {
- int rc, resetcount, ocount;
- int first_byte = -1;
- int req_byte = -1;
- int req_byte2 = -1;
- int newline;
- unsigned long int ims;
- BOOL using_temporary_offsets = FALSE;
- BOOL anchored;
- BOOL startline;
- BOOL firstline;
- BOOL first_byte_caseless = FALSE;
- BOOL req_byte_caseless = FALSE;
- BOOL utf8;
- match_data match_block;
- match_data *md = &match_block;
- const uschar *tables;
- const uschar *start_bits = NULL;
- USPTR start_match = (USPTR)subject + start_offset;
- USPTR end_subject;
- USPTR req_byte_ptr = start_match - 1;
- pcre_study_data internal_study;
- const pcre_study_data *study;
- real_pcre internal_re;
- const real_pcre *external_re = (const real_pcre *)argument_re;
- const real_pcre *re = external_re;
- /* Plausibility checks. NOTE(review): neither length nor start_offset is
- range-checked here; the code below appears to rely on the caller passing
- 0 <= start_offset <= length. */
- if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
- if (re == NULL || subject == NULL ||
- (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
- if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
- /* Fish out the optional data from the extra_data structure, first setting
- the default values. */
- study = NULL;
- md->match_limit = MATCH_LIMIT;
- md->match_limit_recursion = MATCH_LIMIT_RECURSION;
- md->callout_data = NULL;
- /* The table pointer is always in native byte order. */
- tables = external_re->tables;
- if (extra_data != NULL)
- {
- register unsigned int flags = extra_data->flags;
- if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
- study = (const pcre_study_data *)extra_data->study_data;
- if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
- md->match_limit = extra_data->match_limit;
- if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
- md->match_limit_recursion = extra_data->match_limit_recursion;
- if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
- md->callout_data = extra_data->callout_data;
- if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
- }
- /* If the exec call supplied NULL for tables, use the inbuilt ones. This
- is a feature that makes it possible to save compiled regexes and re-use them
- in other programs later. */
- if (tables == NULL) tables = _pcre_default_tables;
- /* Check that the first field in the block is the magic number. If it is not,
- test for a regex that was compiled on a host of opposite endianness. If this is
- the case, flipped values are put in internal_re and internal_study if there was
- study data too. */
- if (re->magic_number != MAGIC_NUMBER)
- {
- re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
- if (re == NULL) return PCRE_ERROR_BADMAGIC;
- if (study != NULL) study = &internal_study;
- }
- /* Set up other data */
- anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
- startline = (re->flags & PCRE_STARTLINE) != 0;
- firstline = (re->options & PCRE_FIRSTLINE) != 0;
- /* The code starts after the real_pcre block and the capture name table. */
- md->start_code = (const uschar *)external_re + re->name_table_offset +
- re->name_count * re->name_entry_size;
- md->start_subject = (USPTR)subject;
- md->start_offset = start_offset;
- md->end_subject = md->start_subject + length;
- end_subject = md->end_subject;
- md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
- utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
- md->notbol = (options & PCRE_NOTBOL) != 0;
- md->noteol = (options & PCRE_NOTEOL) != 0;
- md->notempty = (options & PCRE_NOTEMPTY) != 0;
- md->partial = (options & PCRE_PARTIAL) != 0;
- md->hitend = FALSE;
- md->recursive = NULL; /* No recursion at top level */
- md->lcc = tables + lcc_offset;
- md->ctypes = tables + ctypes_offset;
- /* Handle different \R options. A run-time setting overrides whatever was
- set at compile time; setting both bits at run time is an error. */
- switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
- {
- case 0:
- if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
- md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
- else
- #ifdef BSR_ANYCRLF
- md->bsr_anycrlf = TRUE;
- #else
- md->bsr_anycrlf = FALSE;
- #endif
- break;
- case PCRE_BSR_ANYCRLF:
- md->bsr_anycrlf = TRUE;
- break;
- case PCRE_BSR_UNICODE:
- md->bsr_anycrlf = FALSE;
- break;
- default: return PCRE_ERROR_BADNEWLINE;
- }
- /* Handle different types of newline. The three bits give eight cases. If
- nothing is set at run time, whatever was used at compile time applies. */
- switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
- (pcre_uint32)options) & PCRE_NEWLINE_BITS)
- {
- case 0: newline = NEWLINE; break; /* Compile-time default */
- case PCRE_NEWLINE_CR: newline = '\r'; break;
- case PCRE_NEWLINE_LF: newline = '\n'; break;
- case PCRE_NEWLINE_CR+
- PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
- case PCRE_NEWLINE_ANY: newline = -1; break;
- case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
- default: return PCRE_ERROR_BADNEWLINE;
- }
- /* Translate the newline code into the match block fields: -2 and -1 select
- the ANYCRLF and ANY types; any other value is a fixed one- or two-byte
- sequence, with a value above 255 holding two bytes. */
- if (newline == -2)
- {
- md->nltype = NLTYPE_ANYCRLF;
- }
- else if (newline < 0)
- {
- md->nltype = NLTYPE_ANY;
- }
- else
- {
- md->nltype = NLTYPE_FIXED;
- if (newline > 255)
- {
- md->nllen = 2;
- md->nl[0] = (newline >> 8) & 255;
- md->nl[1] = newline & 255;
- }
- else
- {
- md->nllen = 1;
- md->nl[0] = newline;
- }
- }
- /* Partial matching is supported only for a restricted set of regexes at the
- moment. */
- if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
- return PCRE_ERROR_BADPARTIAL;
- /* Check a UTF-8 string if required. Unfortunately there's no way of passing
- back the character offset. */
- #ifdef SUPPORT_UTF8
- if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
- {
- if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
- return PCRE_ERROR_BADUTF8;
- if (start_offset > 0 && start_offset < length)
- {
- int tb = ((uschar *)subject)[start_offset];
- if (tb > 127)
- {
- tb &= 0xc0;
- if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
- }
- }
- }
- #endif
- /* The ims options can vary during the matching as a result of the presence
- of (?ims) items in the pattern. They are kept in a local variable so that
- restoring at the exit of a group is easy. */
- ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
- /* If the expression has got more back references than the offsets supplied can
- hold, we get a temporary chunk of working store to use during the matching.
- Otherwise, we can use the vector supplied, rounding down its size to a multiple
- of 3. */
- ocount = offsetcount - (offsetcount % 3);
- if (re->top_backref > 0 && re->top_backref >= ocount/3)
- {
- ocount = re->top_backref * 3 + 3;
- md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
- if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
- using_temporary_offsets = TRUE;
- DPRINTF(("Got memory to hold back references\n"));
- }
- else md->offset_vector = offsets;
- md->offset_end = ocount;
- md->offset_max = (2*ocount)/3;
- md->offset_overflow = FALSE;
- md->capture_last = -1;
- /* Compute the minimum number of offsets that we need to reset each time. Doing
- this makes a huge difference to execution time when there aren't many brackets
- in the pattern. */
- resetcount = 2 + re->top_bracket * 2;
- if (resetcount > offsetcount) resetcount = ocount;
- /* Reset the working variable associated with each extraction. These should
- never be used unless previously set, but they get saved and restored, and so we
- initialize them to avoid reading uninitialized locations. */
- if (md->offset_vector != NULL)
- {
- register int *iptr = md->offset_vector + ocount;
- register int *iend = iptr - resetcount/2 + 1;
- while (--iptr >= iend) *iptr = -1;
- }
- /* Set up the first character to match, if available. The first_byte value is
- never set for an anchored regular expression, but the anchoring may be forced
- at run time, so we have to test for anchoring. The first char may be unset for
- an unanchored pattern, of course. If there's no first char and the pattern was
- studied, there may be a bitmap of possible first characters. */
- if (!anchored)
- {
- if ((re->flags & PCRE_FIRSTSET) != 0)
- {
- first_byte = re->first_byte & 255;
- if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
- first_byte = md->lcc[first_byte];
- }
- else
- if (!startline && study != NULL &&
- (study->options & PCRE_STUDY_MAPPED) != 0)
- start_bits = study->start_bits;
- }
- /* For anchored or unanchored matches, there may be a "last known required
- character" set. */
- if ((re->flags & PCRE_REQCHSET) != 0)
- {
- req_byte = re->req_byte & 255;
- req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
- req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
- }
- /* ==========================================================================*/
- /* Loop for handling unanchored repeated matching attempts; for anchored regexes
- the loop runs just once. */
- for(;;)
- {
- USPTR save_end_subject = end_subject;
- USPTR new_start_match;
- /* Reset the maximum number of extractions we might see. */
- if (md->offset_vector != NULL)
- {
- register int *iptr = md->offset_vector;
- register int *iend = iptr + resetcount;
- while (iptr < iend) *iptr++ = -1;
- }
- /* Advance to a unique first char if possible. If firstline is TRUE, the
- start of the match is constrained to the first line of a multiline string.
- That is, the match must be before or at the first newline. Implement this by
- temporarily adjusting end_subject so that we stop scanning at a newline. If
- the match fails at the newline, later code breaks this loop. */
- if (firstline)
- {
- USPTR t = start_match;
- while (t < md->end_subject && !IS_NEWLINE(t)) t++;
- end_subject = t;
- }
- /* Now test for a unique first byte */
- if (first_byte >= 0)
- {
- if (first_byte_caseless)
- while (start_match < end_subject &&
- md->lcc[*start_match] != first_byte)
- { NEXTCHAR(start_match); }
- else
- while (start_match < end_subject && *start_match != first_byte)
- { NEXTCHAR(start_match); }
- }
- /* Or to just after a linebreak for a multiline match if possible */
- else if (startline)
- {
- if (start_match > md->start_subject + start_offset)
- {
- while (start_match <= end_subject && !WAS_NEWLINE(start_match))
- { NEXTCHAR(start_match); }
- /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
- and we are now at a LF, advance the match position by one more character.
- */
- if (start_match[-1] == '\r' &&
- (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
- start_match < end_subject &&
- *start_match == '\n')
- start_match++;
- }
- }
- /* Or to a non-unique first char after study */
- else if (start_bits != NULL)
- {
- while (start_match < end_subject)
- {
- register unsigned int c = *start_match;
- if ((start_bits[c/8] & (1 << (c&7))) == 0)
- { NEXTCHAR(start_match); }
- else break;
- }
- }
- /* Restore fudged end_subject */
- end_subject = save_end_subject;
- #ifdef DEBUG /* Sigh. Some compilers never learn. */
- printf(">>>> Match against: ");
- pchars(start_match, end_subject - start_match, TRUE, md);
- printf("\n");
- #endif
- /* If req_byte is set, we know that that character must appear in the subject
- for the match to succeed. If the first character is set, req_byte must be
- later in the subject; otherwise the test starts at the match point. This
- optimization can save a huge amount of backtracking in patterns with nested
- unlimited repeats that aren't going to match. Writing separate code for
- cased/caseless versions makes it go faster, as does using an autoincrement
- and backing off on a match.
- HOWEVER: when the subject string is very, very long, searching to its end can
- take a long time, and give bad performance on quite ordinary patterns. This
- showed up when somebody was matching something like /^\d+C/ on a 32-megabyte
- string... so we don't do this when the string is sufficiently long.
- ALSO: this processing is disabled when partial matching is requested.
- */
- if (req_byte >= 0 &&
- end_subject - start_match < REQ_BYTE_MAX &&
- !md->partial)
- {
- register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
- /* We don't need to repeat the search if we haven't yet reached the
- place we found it at last time. */
- if (p > req_byte_ptr)
- {
- if (req_byte_caseless)
- {
- while (p < end_subject)
- {
- register int pp = *p++;
- if (pp == req_byte || pp == req_byte2) { p--; break; }
- }
- }
- else
- {
- while (p < end_subject)
- {
- if (*p++ == req_byte) { p--; break; }
- }
- }
- /* If we can't find the required character, break the matching loop,
- forcing a match failure. */
- if (p >= end_subject)
- {
- rc = MATCH_NOMATCH;
- break;
- }
- /* If we have found the required character, save the point where we
- found it, so that we don't search again next time round the loop if
- the start hasn't passed this character yet. */
- req_byte_ptr = p;
- }
- }
- /* OK, we can now run the match. */
- md->start_match_ptr = start_match;
- md->match_call_count = 0;
- rc = match(start_match, md->start_code, start_match, 2, md, ims, NULL, 0, 0);
- switch(rc)
- {
- /* SKIP passes back the next starting point explicitly. It is honoured only
- when it lies beyond the position just tried: restarting at the same (or an
- earlier) position would repeat the identical attempt and this loop would
- never terminate. In that case fall through and treat it as NOMATCH. */
- case MATCH_SKIP:
- if (md->start_match_ptr > start_match)
- {
- new_start_match = md->start_match_ptr;
- break;
- }
- /* Fall through */
- /* NOMATCH and PRUNE advance by one character. THEN at this level acts
- exactly like PRUNE. */
- case MATCH_NOMATCH:
- case MATCH_PRUNE:
- case MATCH_THEN:
- new_start_match = start_match + 1;
- #ifdef SUPPORT_UTF8
- if (utf8)
- while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
- new_start_match++;
- #endif
- break;
- /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
- case MATCH_COMMIT:
- rc = MATCH_NOMATCH;
- goto ENDLOOP;
- /* Any other return is some kind of error. */
- default:
- goto ENDLOOP;
- }
- /* Control reaches here for the various types of "no match at this point"
- result. Reset the code to MATCH_NOMATCH for subsequent checking. */
- rc = MATCH_NOMATCH;
- /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
- newline in the subject (though it may continue over the newline). Therefore,
- if we have just failed to match, starting at a newline, do not continue. */
- if (firstline && IS_NEWLINE(start_match)) break;
- /* Advance to new matching position */
- start_match = new_start_match;
- /* Break the loop if the pattern is anchored or if we have passed the end of
- the subject. */
- if (anchored || start_match > end_subject) break;
- /* If we have just passed a CR and we are now at a LF, and the pattern does
- not contain any explicit matches for \r or \n, and the newline option is CRLF
- or ANY or ANYCRLF, advance the match position by one more character. */
- if (start_match[-1] == '\r' &&
- start_match < end_subject &&
- *start_match == '\n' &&
- (re->flags & PCRE_HASCRORLF) == 0 &&
- (md->nltype == NLTYPE_ANY ||
- md->nltype == NLTYPE_ANYCRLF ||
- md->nllen == 2))
- start_match++;
- } /* End of for(;;) "bumpalong" loop */
- /* ==========================================================================*/
- /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
- conditions is true:
- (1) The pattern is anchored or the match was failed by (*COMMIT);
- (2) We are past the end of the subject;
- (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
- this option requests that a match occur at or before the first newline in
- the subject.
- When we have a match and the offset vector is big enough to deal with any
- backreferences, captured substring offsets will already be set up. In the case
- where we had to get some local store to hold offsets for backreference
- processing, copy those that we can. In this case there need not be overflow if
- certain parts of the pattern were not used, even though there are more
- capturing parentheses than vector slots. */
- ENDLOOP:
- if (rc == MATCH_MATCH)
- {
- if (using_temporary_offsets)
- {
- if (offsetcount >= 4)
- {
- memcpy(offsets + 2, md->offset_vector + 2,
- (offsetcount - 2) * sizeof(int));
- DPRINTF(("Copied offsets from temporary memory\n"));
- }
- if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
- DPRINTF(("Freeing temporary memory\n"));
- (pcre_free)(md->offset_vector);
- }
- /* Set the return code to the number of captured strings, or 0 if there are
- too many to fit into the vector. */
- rc = md->offset_overflow? 0 : md->end_offset_top/2;
- /* If there is space, set up the whole thing as substring 0. The value of
- md->start_match_ptr might be modified if \K was encountered on the successful
- matching path. */
- if (offsetcount < 2) rc = 0; else
- {
- offsets[0] = md->start_match_ptr - md->start_subject;
- offsets[1] = md->end_match_ptr - md->start_subject;
- }
- DPRINTF((">>>> returning %d\n", rc));
- return rc;
- }
- /* Control gets here if there has been an error, or if the overall match
- attempt has failed at all permitted starting positions. */
- if (using_temporary_offsets)
- {
- DPRINTF(("Freeing temporary memory\n"));
- (pcre_free)(md->offset_vector);
- }
- if (rc != MATCH_NOMATCH)
- {
- DPRINTF((">>>> error: returning %d\n", rc));
- return rc;
- }
- else if (md->partial && md->hitend)
- {
- DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
- return PCRE_ERROR_PARTIAL;
- }
- else
- {
- DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
- return PCRE_ERROR_NOMATCH;
- }
- }
- /* End of pcre_exec.c */
|