2
0

regcomp.c 37 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577
  1. /*-
  2. * This code is derived from OpenBSD's libc/regex, original license follows:
  3. *
  4. * Copyright (c) 1992, 1993, 1994 Henry Spencer.
  5. * Copyright (c) 1992, 1993, 1994
  6. * The Regents of the University of California. All rights reserved.
  7. *
  8. * This code is derived from software contributed to Berkeley by
  9. * Henry Spencer.
  10. *
  11. * Redistribution and use in source and binary forms, with or without
  12. * modification, are permitted provided that the following conditions
  13. * are met:
  14. * 1. Redistributions of source code must retain the above copyright
  15. * notice, this list of conditions and the following disclaimer.
  16. * 2. Redistributions in binary form must reproduce the above copyright
  17. * notice, this list of conditions and the following disclaimer in the
  18. * documentation and/or other materials provided with the distribution.
  19. * 3. Neither the name of the University nor the names of its contributors
  20. * may be used to endorse or promote products derived from this software
  21. * without specific prior written permission.
  22. *
  23. * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  24. * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  25. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  26. * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  27. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  28. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  29. * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  30. * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  31. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  32. * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  33. * SUCH DAMAGE.
  34. *
  35. * @(#)regcomp.c 8.5 (Berkeley) 3/20/94
  36. */
  37. #include <sys/types.h>
  38. #include <stdio.h>
  39. #include <string.h>
  40. #include <ctype.h>
  41. #include <limits.h>
  42. #include <stdlib.h>
  43. #include "regex_impl.h"
  44. #include "regutils.h"
  45. #include "regex2.h"
  46. #include "regcclass.h"
  47. #include "regcname.h"
  48. #include "llvm/Config/config.h"
  49. #if HAVE_STDINT_H
  50. #include <stdint.h>
  51. #else
  52. /* Pessimistically bound memory use */
  53. #define SIZE_MAX UINT_MAX
  54. #endif
  55. /*
  56. * parse structure, passed up and down to avoid global variables and
  57. * other clumsinesses
  58. */
  59. struct parse {
  60. char *next; /* next character in RE */
  61. char *end; /* end of string (-> NUL normally) */
  62. int error; /* has an error been seen? */
  63. sop *strip; /* malloced strip */
  64. sopno ssize; /* malloced strip size (allocated) */
  65. sopno slen; /* malloced strip length (used) */
  66. int ncsalloc; /* number of csets allocated */
  67. struct re_guts *g;
  68. # define NPAREN 10 /* we need to remember () 1-9 for back refs */
  69. sopno pbegin[NPAREN]; /* -> ( ([0] unused) */
  70. sopno pend[NPAREN]; /* -> ) ([0] unused) */
  71. };
  72. static void p_ere(struct parse *, int);
  73. static void p_ere_exp(struct parse *);
  74. static void p_str(struct parse *);
  75. static void p_bre(struct parse *, int, int);
  76. static int p_simp_re(struct parse *, int);
  77. static int p_count(struct parse *);
  78. static void p_bracket(struct parse *);
  79. static void p_b_term(struct parse *, cset *);
  80. static void p_b_cclass(struct parse *, cset *);
  81. static void p_b_eclass(struct parse *, cset *);
  82. static char p_b_symbol(struct parse *);
  83. static char p_b_coll_elem(struct parse *, int);
  84. static char othercase(int);
  85. static void bothcases(struct parse *, int);
  86. static void ordinary(struct parse *, int);
  87. static void nonnewline(struct parse *);
  88. static void repeat(struct parse *, sopno, int, int);
  89. static int seterr(struct parse *, int);
  90. static cset *allocset(struct parse *);
  91. static void freeset(struct parse *, cset *);
  92. static int freezeset(struct parse *, cset *);
  93. static int firstch(struct parse *, cset *);
  94. static int nch(struct parse *, cset *);
  95. static void mcadd(struct parse *, cset *, const char *);
  96. static void mcinvert(struct parse *, cset *);
  97. static void mccase(struct parse *, cset *);
  98. static int isinsets(struct re_guts *, int);
  99. static int samesets(struct re_guts *, int, int);
  100. static void categorize(struct parse *, struct re_guts *);
  101. static sopno dupl(struct parse *, sopno, sopno);
  102. static void doemit(struct parse *, sop, size_t);
  103. static void doinsert(struct parse *, sop, size_t, sopno);
  104. static void dofwd(struct parse *, sopno, sop);
  105. static void enlarge(struct parse *, sopno);
  106. static void stripsnug(struct parse *, struct re_guts *);
  107. static void findmust(struct parse *, struct re_guts *);
  108. static sopno pluscount(struct parse *, struct re_guts *);
  109. static char nuls[10]; /* place to point scanner in event of error */
  110. /*
  111. * macros for use with parse structure
  112. * BEWARE: these know that the parse structure is named `p' !!!
  113. */
  114. #define PEEK() (*p->next)
  115. #define PEEK2() (*(p->next+1))
  116. #define MORE() (p->next < p->end)
  117. #define MORE2() (p->next+1 < p->end)
  118. #define SEE(c) (MORE() && PEEK() == (c))
  119. #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
  120. #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0)
  121. #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
  122. #define NEXT() (p->next++)
  123. #define NEXT2() (p->next += 2)
  124. #define NEXTn(n) (p->next += (n))
  125. #define GETNEXT() (*p->next++)
  126. #define SETERROR(e) seterr(p, (e))
  127. #define REQUIRE(co, e) (void)((co) || SETERROR(e))
  128. #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e))
  129. #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e))
  130. #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e))
  131. #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd))
  132. #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
  133. #define AHEAD(pos) dofwd(p, pos, HERE()-(pos))
  134. #define ASTERN(sop, pos) EMIT(sop, HERE()-pos)
  135. #define HERE() (p->slen)
  136. #define THERE() (p->slen - 1)
  137. #define THERETHERE() (p->slen - 2)
  138. #define DROP(n) (p->slen -= (n))
  139. #ifdef _POSIX2_RE_DUP_MAX
  140. #define DUPMAX _POSIX2_RE_DUP_MAX
  141. #else
  142. #define DUPMAX 255
  143. #endif
  144. #define INFINITY (DUPMAX + 1)
  145. #ifndef NDEBUG
  146. static int never = 0; /* for use in asserts; shuts lint up */
  147. #else
  148. #define never 0 /* some <assert.h>s have bugs too */
  149. #endif
  150. /*
  151. - llvm_regcomp - interface for parser and compilation
  152. */
  153. int /* 0 success, otherwise REG_something */
  154. llvm_regcomp(llvm_regex_t *preg, const char *pattern, int cflags)
  155. {
  156. struct parse pa;
  157. struct re_guts *g;
  158. struct parse *p = &pa;
  159. int i;
  160. size_t len;
  161. #ifdef REDEBUG
  162. # define GOODFLAGS(f) (f)
  163. #else
  164. # define GOODFLAGS(f) ((f)&~REG_DUMP)
  165. #endif
  166. cflags = GOODFLAGS(cflags);
  167. if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
  168. return(REG_INVARG);
  169. if (cflags&REG_PEND) {
  170. if (preg->re_endp < pattern)
  171. return(REG_INVARG);
  172. len = preg->re_endp - pattern;
  173. } else
  174. len = strlen((const char *)pattern);
  175. /* do the mallocs early so failure handling is easy */
  176. g = (struct re_guts *)regex_malloc(sizeof(struct re_guts) + // HLSL Change: Use custom allocator
  177. (NC-1)*sizeof(cat_t));
  178. if (g == NULL)
  179. return(REG_ESPACE);
  180. p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */
  181. p->strip = (sop *)regex_calloc(p->ssize, sizeof(sop)); // HLSL Change: Use custom allocator
  182. p->slen = 0;
  183. if (p->strip == NULL) {
  184. regex_free((char *)g); // HLSL Change: Use custom allocator
  185. return(REG_ESPACE);
  186. }
  187. /* set things up */
  188. p->g = g;
  189. p->next = (char *)pattern; /* convenience; we do not modify it */
  190. p->end = p->next + len;
  191. p->error = 0;
  192. p->ncsalloc = 0;
  193. for (i = 0; i < NPAREN; i++) {
  194. p->pbegin[i] = 0;
  195. p->pend[i] = 0;
  196. }
  197. g->csetsize = NC;
  198. g->sets = NULL;
  199. g->setbits = NULL;
  200. g->ncsets = 0;
  201. g->cflags = cflags;
  202. g->iflags = 0;
  203. g->nbol = 0;
  204. g->neol = 0;
  205. g->must = NULL;
  206. g->mlen = 0;
  207. g->nsub = 0;
  208. g->ncategories = 1; /* category 0 is "everything else" */
  209. g->categories = &g->catspace[-(CHAR_MIN)];
  210. (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t));
  211. g->backrefs = 0;
  212. /* do it */
  213. EMIT(OEND, 0);
  214. g->firststate = THERE();
  215. if (cflags&REG_EXTENDED)
  216. p_ere(p, OUT);
  217. else if (cflags&REG_NOSPEC)
  218. p_str(p);
  219. else
  220. p_bre(p, OUT, OUT);
  221. EMIT(OEND, 0);
  222. g->laststate = THERE();
  223. /* tidy up loose ends and fill things in */
  224. categorize(p, g);
  225. stripsnug(p, g);
  226. findmust(p, g);
  227. g->nplus = pluscount(p, g);
  228. g->magic = MAGIC2;
  229. preg->re_nsub = g->nsub;
  230. preg->re_g = g;
  231. preg->re_magic = MAGIC1;
  232. #ifndef REDEBUG
  233. /* not debugging, so can't rely on the assert() in llvm_regexec() */
  234. if (g->iflags&REGEX_BAD)
  235. SETERROR(REG_ASSERT);
  236. #endif
  237. /* win or lose, we're done */
  238. if (p->error != 0) /* lose */
  239. llvm_regfree(preg);
  240. return(p->error);
  241. }
  242. /*
  243. - p_ere - ERE parser top level, concatenation and alternation
  244. */
  245. static void
  246. p_ere(struct parse *p, int stop) /* character this ERE should end at */
  247. {
  248. char c;
  249. sopno prevback = 0;
  250. sopno prevfwd = 0;
  251. sopno conc;
  252. int first = 1; /* is this the first alternative? */
  253. for (;;) {
  254. /* do a bunch of concatenated expressions */
  255. conc = HERE();
  256. while (MORE() && (c = PEEK()) != '|' && c != stop)
  257. p_ere_exp(p);
  258. REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */
  259. if (!EAT('|'))
  260. break; /* NOTE BREAK OUT */
  261. if (first) {
  262. INSERT(OCH_, conc); /* offset is wrong */
  263. prevfwd = conc;
  264. prevback = conc;
  265. first = 0;
  266. }
  267. ASTERN(OOR1, prevback);
  268. prevback = THERE();
  269. AHEAD(prevfwd); /* fix previous offset */
  270. prevfwd = HERE();
  271. EMIT(OOR2, 0); /* offset is very wrong */
  272. }
  273. if (!first) { /* tail-end fixups */
  274. AHEAD(prevfwd);
  275. ASTERN(O_CH, prevback);
  276. }
  277. assert(!MORE() || SEE(stop));
  278. }
  279. /*
  280. - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
  281. */
  282. static void
  283. p_ere_exp(struct parse *p)
  284. {
  285. char c;
  286. sopno pos;
  287. int count;
  288. int count2;
  289. int backrefnum;
  290. sopno subno;
  291. int wascaret = 0;
  292. assert(MORE()); /* caller should have ensured this */
  293. c = GETNEXT();
  294. pos = HERE();
  295. switch (c) {
  296. case '(':
  297. REQUIRE(MORE(), REG_EPAREN);
  298. p->g->nsub++;
  299. subno = p->g->nsub;
  300. if (subno < NPAREN)
  301. p->pbegin[subno] = HERE();
  302. EMIT(OLPAREN, subno);
  303. if (!SEE(')'))
  304. p_ere(p, ')');
  305. if (subno < NPAREN) {
  306. p->pend[subno] = HERE();
  307. assert(p->pend[subno] != 0);
  308. }
  309. EMIT(ORPAREN, subno);
  310. MUSTEAT(')', REG_EPAREN);
  311. break;
  312. #ifndef POSIX_MISTAKE
  313. case ')': /* happens only if no current unmatched ( */
  314. /*
  315. * You may ask, why the ifndef? Because I didn't notice
  316. * this until slightly too late for 1003.2, and none of the
  317. * other 1003.2 regular-expression reviewers noticed it at
  318. * all. So an unmatched ) is legal POSIX, at least until
  319. * we can get it fixed.
  320. */
  321. SETERROR(REG_EPAREN);
  322. break;
  323. #endif
  324. case '^':
  325. EMIT(OBOL, 0);
  326. p->g->iflags |= USEBOL;
  327. p->g->nbol++;
  328. wascaret = 1;
  329. break;
  330. case '$':
  331. EMIT(OEOL, 0);
  332. p->g->iflags |= USEEOL;
  333. p->g->neol++;
  334. break;
  335. case '|':
  336. SETERROR(REG_EMPTY);
  337. break;
  338. case '*':
  339. case '+':
  340. case '?':
  341. SETERROR(REG_BADRPT);
  342. break;
  343. case '.':
  344. if (p->g->cflags&REG_NEWLINE)
  345. nonnewline(p);
  346. else
  347. EMIT(OANY, 0);
  348. break;
  349. case '[':
  350. p_bracket(p);
  351. break;
  352. case '\\':
  353. REQUIRE(MORE(), REG_EESCAPE);
  354. c = GETNEXT();
  355. if (c >= '1' && c <= '9') {
  356. /* \[0-9] is taken to be a back-reference to a previously specified
  357. * matching group. backrefnum will hold the number. The matching
  358. * group must exist (i.e. if \4 is found there must have been at
  359. * least 4 matching groups specified in the pattern previously).
  360. */
  361. backrefnum = c - '0';
  362. if (p->pend[backrefnum] == 0) {
  363. SETERROR(REG_ESUBREG);
  364. break;
  365. }
  366. /* Make sure everything checks out and emit the sequence
  367. * that marks a back-reference to the parse structure.
  368. */
  369. assert(backrefnum <= p->g->nsub);
  370. EMIT(OBACK_, backrefnum);
  371. assert(p->pbegin[backrefnum] != 0);
  372. assert(OP(p->strip[p->pbegin[backrefnum]]) != OLPAREN);
  373. assert(OP(p->strip[p->pend[backrefnum]]) != ORPAREN);
  374. (void) dupl(p, p->pbegin[backrefnum]+1, p->pend[backrefnum]);
  375. EMIT(O_BACK, backrefnum);
  376. p->g->backrefs = 1;
  377. } else {
  378. /* Other chars are simply themselves when escaped with a backslash.
  379. */
  380. ordinary(p, c);
  381. }
  382. break;
  383. case '{': /* okay as ordinary except if digit follows */
  384. REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT);
  385. /* FALLTHROUGH */
  386. default:
  387. ordinary(p, c);
  388. break;
  389. }
  390. if (!MORE())
  391. return;
  392. c = PEEK();
  393. /* we call { a repetition if followed by a digit */
  394. if (!( c == '*' || c == '+' || c == '?' ||
  395. (c == '{' && MORE2() && isdigit((uch)PEEK2())) ))
  396. return; /* no repetition, we're done */
  397. NEXT();
  398. REQUIRE(!wascaret, REG_BADRPT);
  399. switch (c) {
  400. case '*': /* implemented as +? */
  401. /* this case does not require the (y|) trick, noKLUDGE */
  402. INSERT(OPLUS_, pos);
  403. ASTERN(O_PLUS, pos);
  404. INSERT(OQUEST_, pos);
  405. ASTERN(O_QUEST, pos);
  406. break;
  407. case '+':
  408. INSERT(OPLUS_, pos);
  409. ASTERN(O_PLUS, pos);
  410. break;
  411. case '?':
  412. /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
  413. INSERT(OCH_, pos); /* offset slightly wrong */
  414. ASTERN(OOR1, pos); /* this one's right */
  415. AHEAD(pos); /* fix the OCH_ */
  416. EMIT(OOR2, 0); /* offset very wrong... */
  417. AHEAD(THERE()); /* ...so fix it */
  418. ASTERN(O_CH, THERETHERE());
  419. break;
  420. case '{':
  421. count = p_count(p);
  422. if (EAT(',')) {
  423. if (isdigit((uch)PEEK())) {
  424. count2 = p_count(p);
  425. REQUIRE(count <= count2, REG_BADBR);
  426. } else /* single number with comma */
  427. count2 = INFINITY;
  428. } else /* just a single number */
  429. count2 = count;
  430. repeat(p, pos, count, count2);
  431. if (!EAT('}')) { /* error heuristics */
  432. while (MORE() && PEEK() != '}')
  433. NEXT();
  434. REQUIRE(MORE(), REG_EBRACE);
  435. SETERROR(REG_BADBR);
  436. }
  437. break;
  438. }
  439. if (!MORE())
  440. return;
  441. c = PEEK();
  442. if (!( c == '*' || c == '+' || c == '?' ||
  443. (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) )
  444. return;
  445. SETERROR(REG_BADRPT);
  446. }
  447. /*
  448. - p_str - string (no metacharacters) "parser"
  449. */
  450. static void
  451. p_str(struct parse *p)
  452. {
  453. REQUIRE(MORE(), REG_EMPTY);
  454. while (MORE())
  455. ordinary(p, GETNEXT());
  456. }
  457. /*
  458. - p_bre - BRE parser top level, anchoring and concatenation
  459. * Giving end1 as OUT essentially eliminates the end1/end2 check.
  460. *
  461. * This implementation is a bit of a kludge, in that a trailing $ is first
  462. * taken as an ordinary character and then revised to be an anchor. The
  463. * only undesirable side effect is that '$' gets included as a character
  464. * category in such cases. This is fairly harmless; not worth fixing.
  465. * The amount of lookahead needed to avoid this kludge is excessive.
  466. */
  467. static void
  468. p_bre(struct parse *p,
  469. int end1, /* first terminating character */
  470. int end2) /* second terminating character */
  471. {
  472. sopno start = HERE();
  473. int first = 1; /* first subexpression? */
  474. int wasdollar = 0;
  475. if (EAT('^')) {
  476. EMIT(OBOL, 0);
  477. p->g->iflags |= USEBOL;
  478. p->g->nbol++;
  479. }
  480. while (MORE() && !SEETWO(end1, end2)) {
  481. wasdollar = p_simp_re(p, first);
  482. first = 0;
  483. }
  484. if (wasdollar) { /* oops, that was a trailing anchor */
  485. DROP(1);
  486. EMIT(OEOL, 0);
  487. p->g->iflags |= USEEOL;
  488. p->g->neol++;
  489. }
  490. REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */
  491. }
  492. /*
  493. - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
  494. */
  495. static int /* was the simple RE an unbackslashed $? */
  496. p_simp_re(struct parse *p,
  497. int starordinary) /* is a leading * an ordinary character? */
  498. {
  499. int c;
  500. int count;
  501. int count2;
  502. sopno pos;
  503. int i;
  504. sopno subno;
  505. # define BACKSL (1<<CHAR_BIT)
  506. pos = HERE(); /* repetition op, if any, covers from here */
  507. assert(MORE()); /* caller should have ensured this */
  508. c = GETNEXT();
  509. if (c == '\\') {
  510. REQUIRE(MORE(), REG_EESCAPE);
  511. c = BACKSL | GETNEXT();
  512. }
  513. switch (c) {
  514. case '.':
  515. if (p->g->cflags&REG_NEWLINE)
  516. nonnewline(p);
  517. else
  518. EMIT(OANY, 0);
  519. break;
  520. case '[':
  521. p_bracket(p);
  522. break;
  523. case BACKSL|'{':
  524. SETERROR(REG_BADRPT);
  525. break;
  526. case BACKSL|'(':
  527. p->g->nsub++;
  528. subno = p->g->nsub;
  529. if (subno < NPAREN)
  530. p->pbegin[subno] = HERE();
  531. EMIT(OLPAREN, subno);
  532. /* the MORE here is an error heuristic */
  533. if (MORE() && !SEETWO('\\', ')'))
  534. p_bre(p, '\\', ')');
  535. if (subno < NPAREN) {
  536. p->pend[subno] = HERE();
  537. assert(p->pend[subno] != 0);
  538. }
  539. EMIT(ORPAREN, subno);
  540. REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
  541. break;
  542. case BACKSL|')': /* should not get here -- must be user */
  543. case BACKSL|'}':
  544. SETERROR(REG_EPAREN);
  545. break;
  546. case BACKSL|'1':
  547. case BACKSL|'2':
  548. case BACKSL|'3':
  549. case BACKSL|'4':
  550. case BACKSL|'5':
  551. case BACKSL|'6':
  552. case BACKSL|'7':
  553. case BACKSL|'8':
  554. case BACKSL|'9':
  555. i = (c&~BACKSL) - '0';
  556. assert(i < NPAREN);
  557. if (p->pend[i] != 0) {
  558. assert(i <= p->g->nsub);
  559. EMIT(OBACK_, i);
  560. assert(p->pbegin[i] != 0);
  561. assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
  562. assert(OP(p->strip[p->pend[i]]) == ORPAREN);
  563. (void) dupl(p, p->pbegin[i]+1, p->pend[i]);
  564. EMIT(O_BACK, i);
  565. } else
  566. SETERROR(REG_ESUBREG);
  567. p->g->backrefs = 1;
  568. break;
  569. case '*':
  570. REQUIRE(starordinary, REG_BADRPT);
  571. /* FALLTHROUGH */
  572. default:
  573. ordinary(p, (char)c);
  574. break;
  575. }
  576. if (EAT('*')) { /* implemented as +? */
  577. /* this case does not require the (y|) trick, noKLUDGE */
  578. INSERT(OPLUS_, pos);
  579. ASTERN(O_PLUS, pos);
  580. INSERT(OQUEST_, pos);
  581. ASTERN(O_QUEST, pos);
  582. } else if (EATTWO('\\', '{')) {
  583. count = p_count(p);
  584. if (EAT(',')) {
  585. if (MORE() && isdigit((uch)PEEK())) {
  586. count2 = p_count(p);
  587. REQUIRE(count <= count2, REG_BADBR);
  588. } else /* single number with comma */
  589. count2 = INFINITY;
  590. } else /* just a single number */
  591. count2 = count;
  592. repeat(p, pos, count, count2);
  593. if (!EATTWO('\\', '}')) { /* error heuristics */
  594. while (MORE() && !SEETWO('\\', '}'))
  595. NEXT();
  596. REQUIRE(MORE(), REG_EBRACE);
  597. SETERROR(REG_BADBR);
  598. }
  599. } else if (c == '$') /* $ (but not \$) ends it */
  600. return(1);
  601. return(0);
  602. }
  603. /*
  604. - p_count - parse a repetition count
  605. */
  606. static int /* the value */
  607. p_count(struct parse *p)
  608. {
  609. int count = 0;
  610. int ndigits = 0;
  611. while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
  612. count = count*10 + (GETNEXT() - '0');
  613. ndigits++;
  614. }
  615. REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
  616. return(count);
  617. }
  618. /*
  619. - p_bracket - parse a bracketed character list
  620. *
  621. * Note a significant property of this code: if the allocset() did SETERROR,
  622. * no set operations are done.
  623. */
  624. static void
  625. p_bracket(struct parse *p)
  626. {
  627. cset *cs;
  628. int invert = 0;
  629. /* Dept of Truly Sickening Special-Case Kludges */
  630. if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
  631. EMIT(OBOW, 0);
  632. NEXTn(6);
  633. return;
  634. }
  635. if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
  636. EMIT(OEOW, 0);
  637. NEXTn(6);
  638. return;
  639. }
  640. if ((cs = allocset(p)) == NULL) {
  641. /* allocset did set error status in p */
  642. return;
  643. }
  644. if (EAT('^'))
  645. invert++; /* make note to invert set at end */
  646. if (EAT(']'))
  647. CHadd(cs, ']');
  648. else if (EAT('-'))
  649. CHadd(cs, '-');
  650. while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
  651. p_b_term(p, cs);
  652. if (EAT('-'))
  653. CHadd(cs, '-');
  654. MUSTEAT(']', REG_EBRACK);
  655. if (p->error != 0) { /* don't mess things up further */
  656. freeset(p, cs);
  657. return;
  658. }
  659. if (p->g->cflags&REG_ICASE) {
  660. int i;
  661. int ci;
  662. for (i = p->g->csetsize - 1; i >= 0; i--)
  663. if (CHIN(cs, i) && isalpha(i)) {
  664. ci = othercase(i);
  665. if (ci != i)
  666. CHadd(cs, ci);
  667. }
  668. if (cs->multis != NULL)
  669. mccase(p, cs);
  670. }
  671. if (invert) {
  672. int i;
  673. for (i = p->g->csetsize - 1; i >= 0; i--)
  674. if (CHIN(cs, i))
  675. CHsub(cs, i);
  676. else
  677. CHadd(cs, i);
  678. if (p->g->cflags&REG_NEWLINE)
  679. CHsub(cs, '\n');
  680. if (cs->multis != NULL)
  681. mcinvert(p, cs);
  682. }
  683. assert(cs->multis == NULL); /* xxx */
  684. if (nch(p, cs) == 1) { /* optimize singleton sets */
  685. ordinary(p, firstch(p, cs));
  686. freeset(p, cs);
  687. } else
  688. EMIT(OANYOF, freezeset(p, cs));
  689. }
  690. /*
  691. - p_b_term - parse one term of a bracketed character list
  692. */
  693. static void
  694. p_b_term(struct parse *p, cset *cs)
  695. {
  696. char c;
  697. char start, finish;
  698. int i;
  699. /* classify what we've got */
  700. switch ((MORE()) ? PEEK() : '\0') {
  701. case '[':
  702. c = (MORE2()) ? PEEK2() : '\0';
  703. break;
  704. case '-':
  705. SETERROR(REG_ERANGE);
  706. return; /* NOTE RETURN */
  707. break;
  708. default:
  709. c = '\0';
  710. break;
  711. }
  712. switch (c) {
  713. case ':': /* character class */
  714. NEXT2();
  715. REQUIRE(MORE(), REG_EBRACK);
  716. c = PEEK();
  717. REQUIRE(c != '-' && c != ']', REG_ECTYPE);
  718. p_b_cclass(p, cs);
  719. REQUIRE(MORE(), REG_EBRACK);
  720. REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
  721. break;
  722. case '=': /* equivalence class */
  723. NEXT2();
  724. REQUIRE(MORE(), REG_EBRACK);
  725. c = PEEK();
  726. REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
  727. p_b_eclass(p, cs);
  728. REQUIRE(MORE(), REG_EBRACK);
  729. REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
  730. break;
  731. default: /* symbol, ordinary character, or range */
  732. /* xxx revision needed for multichar stuff */
  733. start = p_b_symbol(p);
  734. if (SEE('-') && MORE2() && PEEK2() != ']') {
  735. /* range */
  736. NEXT();
  737. if (EAT('-'))
  738. finish = '-';
  739. else
  740. finish = p_b_symbol(p);
  741. } else
  742. finish = start;
  743. /* xxx what about signed chars here... */
  744. REQUIRE(start <= finish, REG_ERANGE);
  745. for (i = start; i <= finish; i++)
  746. CHadd(cs, i);
  747. break;
  748. }
  749. }
  750. /*
  751. - p_b_cclass - parse a character-class name and deal with it
  752. */
  753. static void
  754. p_b_cclass(struct parse *p, cset *cs)
  755. {
  756. char *sp = p->next;
  757. struct cclass *cp;
  758. size_t len;
  759. const char *u;
  760. char c;
  761. while (MORE() && isalpha((uch)PEEK()))
  762. NEXT();
  763. len = p->next - sp;
  764. for (cp = cclasses; cp->name != NULL; cp++)
  765. if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
  766. break;
  767. if (cp->name == NULL) {
  768. /* oops, didn't find it */
  769. SETERROR(REG_ECTYPE);
  770. return;
  771. }
  772. u = cp->chars;
  773. while ((c = *u++) != '\0')
  774. CHadd(cs, c);
  775. for (u = cp->multis; *u != '\0'; u += strlen(u) + 1)
  776. MCadd(p, cs, u);
  777. }
  778. /*
  779. - p_b_eclass - parse an equivalence-class name and deal with it
  780. *
  781. * This implementation is incomplete. xxx
  782. */
  783. static void
  784. p_b_eclass(struct parse *p, cset *cs)
  785. {
  786. char c;
  787. c = p_b_coll_elem(p, '=');
  788. CHadd(cs, c);
  789. }
  790. /*
  791. - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
  792. */
  793. static char /* value of symbol */
  794. p_b_symbol(struct parse *p)
  795. {
  796. char value;
  797. REQUIRE(MORE(), REG_EBRACK);
  798. if (!EATTWO('[', '.'))
  799. return(GETNEXT());
  800. /* collating symbol */
  801. value = p_b_coll_elem(p, '.');
  802. REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
  803. return(value);
  804. }
  805. /*
  806. - p_b_coll_elem - parse a collating-element name and look it up
  807. */
  808. static char /* value of collating element */
  809. p_b_coll_elem(struct parse *p,
  810. int endc) /* name ended by endc,']' */
  811. {
  812. char *sp = p->next;
  813. struct cname *cp;
  814. int len;
  815. while (MORE() && !SEETWO(endc, ']'))
  816. NEXT();
  817. if (!MORE()) {
  818. SETERROR(REG_EBRACK);
  819. return(0);
  820. }
  821. len = p->next - sp;
  822. for (cp = cnames; cp->name != NULL; cp++)
  823. if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
  824. return(cp->code); /* known name */
  825. if (len == 1)
  826. return(*sp); /* single character */
  827. SETERROR(REG_ECOLLATE); /* neither */
  828. return(0);
  829. }
  830. /*
  831. - othercase - return the case counterpart of an alphabetic
  832. */
  833. static char /* if no counterpart, return ch */
  834. othercase(int ch)
  835. {
  836. ch = (uch)ch;
  837. assert(isalpha(ch));
  838. if (isupper(ch))
  839. return ((uch)tolower(ch));
  840. else if (islower(ch))
  841. return ((uch)toupper(ch));
  842. else /* peculiar, but could happen */
  843. return(ch);
  844. }
  845. /*
  846. - bothcases - emit a dualcase version of a two-case character
  847. *
  848. * Boy, is this implementation ever a kludge...
  849. */
  850. static void
  851. bothcases(struct parse *p, int ch)
  852. {
  853. char *oldnext = p->next;
  854. char *oldend = p->end;
  855. char bracket[3];
  856. ch = (uch)ch;
  857. assert(othercase(ch) != ch); /* p_bracket() would recurse */
  858. p->next = bracket;
  859. p->end = bracket+2;
  860. bracket[0] = ch;
  861. bracket[1] = ']';
  862. bracket[2] = '\0';
  863. p_bracket(p);
  864. assert(p->next == bracket+2);
  865. p->next = oldnext;
  866. p->end = oldend;
  867. }
  868. /*
  869. - ordinary - emit an ordinary character
  870. */
  871. static void
  872. ordinary(struct parse *p, int ch)
  873. {
  874. cat_t *cap = p->g->categories;
  875. if ((p->g->cflags&REG_ICASE) && isalpha((uch)ch) && othercase(ch) != ch)
  876. bothcases(p, ch);
  877. else {
  878. EMIT(OCHAR, (uch)ch);
  879. if (cap[ch] == 0)
  880. cap[ch] = p->g->ncategories++;
  881. }
  882. }
  883. /*
  884. - nonnewline - emit REG_NEWLINE version of OANY
  885. *
  886. * Boy, is this implementation ever a kludge...
  887. */
  888. static void
  889. nonnewline(struct parse *p)
  890. {
  891. char *oldnext = p->next;
  892. char *oldend = p->end;
  893. char bracket[4];
  894. p->next = bracket;
  895. p->end = bracket+3;
  896. bracket[0] = '^';
  897. bracket[1] = '\n';
  898. bracket[2] = ']';
  899. bracket[3] = '\0';
  900. p_bracket(p);
  901. assert(p->next == bracket+3);
  902. p->next = oldnext;
  903. p->end = oldend;
  904. }
  905. /*
  906. - repeat - generate code for a bounded repetition, recursively if needed
  907. */
  908. static void
  909. repeat(struct parse *p,
  910. sopno start, /* operand from here to end of strip */
  911. int from, /* repeated from this number */
  912. int to) /* to this number of times (maybe INFINITY) */
  913. {
  914. sopno finish = HERE();
  915. # define N 2
  916. # define INF 3
  917. # define REP(f, t) ((f)*8 + (t))
  918. # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
  919. sopno copy;
  920. if (p->error != 0) /* head off possible runaway recursion */
  921. return;
  922. assert(from <= to);
  923. switch (REP(MAP(from), MAP(to))) {
  924. case REP(0, 0): /* must be user doing this */
  925. DROP(finish-start); /* drop the operand */
  926. break;
  927. case REP(0, 1): /* as x{1,1}? */
  928. case REP(0, N): /* as x{1,n}? */
  929. case REP(0, INF): /* as x{1,}? */
  930. /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
  931. INSERT(OCH_, start); /* offset is wrong... */
  932. repeat(p, start+1, 1, to);
  933. ASTERN(OOR1, start);
  934. AHEAD(start); /* ... fix it */
  935. EMIT(OOR2, 0);
  936. AHEAD(THERE());
  937. ASTERN(O_CH, THERETHERE());
  938. break;
  939. case REP(1, 1): /* trivial case */
  940. /* done */
  941. break;
  942. case REP(1, N): /* as x?x{1,n-1} */
  943. /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
  944. INSERT(OCH_, start);
  945. ASTERN(OOR1, start);
  946. AHEAD(start);
  947. EMIT(OOR2, 0); /* offset very wrong... */
  948. AHEAD(THERE()); /* ...so fix it */
  949. ASTERN(O_CH, THERETHERE());
  950. copy = dupl(p, start+1, finish+1);
  951. assert(copy == finish+4);
  952. repeat(p, copy, 1, to-1);
  953. break;
  954. case REP(1, INF): /* as x+ */
  955. INSERT(OPLUS_, start);
  956. ASTERN(O_PLUS, start);
  957. break;
  958. case REP(N, N): /* as xx{m-1,n-1} */
  959. copy = dupl(p, start, finish);
  960. repeat(p, copy, from-1, to-1);
  961. break;
  962. case REP(N, INF): /* as xx{n-1,INF} */
  963. copy = dupl(p, start, finish);
  964. repeat(p, copy, from-1, to);
  965. break;
  966. default: /* "can't happen" */
  967. SETERROR(REG_ASSERT); /* just in case */
  968. break;
  969. }
  970. }
  971. /*
  972. - seterr - set an error condition
  973. */
  974. static int /* useless but makes type checking happy */
  975. seterr(struct parse *p, int e)
  976. {
  977. if (p->error == 0) /* keep earliest error condition */
  978. p->error = e;
  979. p->next = nuls; /* try to bring things to a halt */
  980. p->end = nuls;
  981. return(0); /* make the return value well-defined */
  982. }
  983. /*
  984. - allocset - allocate a set of characters for []
  985. */
  986. static cset *
  987. allocset(struct parse *p)
  988. {
  989. int no = p->g->ncsets++;
  990. size_t nc;
  991. size_t nbytes;
  992. cset *cs;
  993. size_t css = (size_t)p->g->csetsize;
  994. int i;
  995. if (no >= p->ncsalloc) { /* need another column of space */
  996. void *ptr;
  997. p->ncsalloc += CHAR_BIT;
  998. nc = p->ncsalloc;
  999. if (nc > SIZE_MAX / sizeof(cset))
  1000. goto nomem;
  1001. assert(nc % CHAR_BIT == 0);
  1002. nbytes = nc / CHAR_BIT * css;
  1003. ptr = (cset *)regex_realloc((char *)p->g->sets, no * sizeof(cset), nc * sizeof(cset)); // HLSL Change: Use custom allocator
  1004. if (ptr == NULL)
  1005. goto nomem;
  1006. p->g->sets = ptr;
  1007. ptr = (uch *)regex_realloc((char *)p->g->setbits, no / CHAR_BIT * css, nbytes); // HLSL Change: Use custom allocator
  1008. if (ptr == NULL)
  1009. goto nomem;
  1010. p->g->setbits = ptr;
  1011. for (i = 0; i < no; i++) {
  1012. _Analysis_assume_(i < nc); // HLSL Change
  1013. p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT);
  1014. }
  1015. (void) memset((char *)p->g->setbits + (nbytes - css), 0, css);
  1016. }
  1017. /* XXX should not happen */
  1018. if (p->g->sets == NULL || p->g->setbits == NULL)
  1019. goto nomem;
  1020. cs = &p->g->sets[no];
  1021. cs->ptr = p->g->setbits + css*((no)/CHAR_BIT);
  1022. cs->mask = 1 << ((no) % CHAR_BIT);
  1023. cs->hash = 0;
  1024. cs->smultis = 0;
  1025. cs->multis = NULL;
  1026. return(cs);
  1027. nomem:
  1028. regex_free(p->g->sets); // HLSL Change: Use custom allocator
  1029. p->g->sets = NULL;
  1030. regex_free(p->g->setbits); // HLSL Change: Use custom allocator
  1031. p->g->setbits = NULL;
  1032. SETERROR(REG_ESPACE);
  1033. /* caller's responsibility not to do set ops */
  1034. return(NULL);
  1035. }
  1036. /*
  1037. - freeset - free a now-unused set
  1038. */
  1039. static void
  1040. freeset(struct parse *p, cset *cs)
  1041. {
  1042. size_t i;
  1043. cset *top = &p->g->sets[p->g->ncsets];
  1044. size_t css = (size_t)p->g->csetsize;
  1045. for (i = 0; i < css; i++)
  1046. CHsub(cs, i);
  1047. if (cs == top-1) /* recover only the easy case */
  1048. p->g->ncsets--;
  1049. }
  1050. /*
  1051. - freezeset - final processing on a set of characters
  1052. *
  1053. * The main task here is merging identical sets. This is usually a waste
  1054. * of time (although the hash code minimizes the overhead), but can win
  1055. * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash
  1056. * is done using addition rather than xor -- all ASCII [aA] sets xor to
  1057. * the same value!
  1058. */
  1059. static int /* set number */
  1060. freezeset(struct parse *p, cset *cs)
  1061. {
  1062. uch h = cs->hash;
  1063. size_t i;
  1064. cset *top = &p->g->sets[p->g->ncsets];
  1065. cset *cs2;
  1066. size_t css = (size_t)p->g->csetsize;
  1067. /* look for an earlier one which is the same */
  1068. for (cs2 = &p->g->sets[0]; cs2 < top; cs2++)
  1069. if (cs2->hash == h && cs2 != cs) {
  1070. /* maybe */
  1071. for (i = 0; i < css; i++)
  1072. if (!!CHIN(cs2, i) != !!CHIN(cs, i))
  1073. break; /* no */
  1074. if (i == css)
  1075. break; /* yes */
  1076. }
  1077. if (cs2 < top) { /* found one */
  1078. freeset(p, cs);
  1079. cs = cs2;
  1080. }
  1081. return((int)(cs - p->g->sets));
  1082. }
  1083. /*
  1084. - firstch - return first character in a set (which must have at least one)
  1085. */
  1086. static int /* character; there is no "none" value */
  1087. firstch(struct parse *p, cset *cs)
  1088. {
  1089. size_t i;
  1090. size_t css = (size_t)p->g->csetsize;
  1091. for (i = 0; i < css; i++)
  1092. if (CHIN(cs, i))
  1093. return((char)i);
  1094. assert(never);
  1095. return(0); /* arbitrary */
  1096. }
  1097. /*
  1098. - nch - number of characters in a set
  1099. */
  1100. static int
  1101. nch(struct parse *p, cset *cs)
  1102. {
  1103. size_t i;
  1104. size_t css = (size_t)p->g->csetsize;
  1105. int n = 0;
  1106. for (i = 0; i < css; i++)
  1107. if (CHIN(cs, i))
  1108. n++;
  1109. return(n);
  1110. }
  1111. /*
  1112. - mcadd - add a collating element to a cset
  1113. */
  1114. static void
  1115. mcadd( struct parse *p, cset *cs, const char *cp)
  1116. {
  1117. size_t oldend = cs->smultis;
  1118. void *np;
  1119. cs->smultis += strlen(cp) + 1;
  1120. np = regex_realloc(cs->multis, oldend, cs->smultis); // HLSL Change: Use custom allocator
  1121. if (np == NULL) {
  1122. if (cs->multis)
  1123. regex_free(cs->multis); // HLSL Change: Use custom allocator
  1124. cs->multis = NULL;
  1125. SETERROR(REG_ESPACE);
  1126. return;
  1127. }
  1128. cs->multis = np;
  1129. llvm_strlcpy(cs->multis + oldend - 1, cp, cs->smultis - oldend + 1);
  1130. }
  1131. /*
  1132. - mcinvert - invert the list of collating elements in a cset
  1133. *
  1134. * This would have to know the set of possibilities. Implementation
  1135. * is deferred.
  1136. */
  1137. /* ARGSUSED */
  1138. static void
  1139. mcinvert(struct parse *p, cset *cs)
  1140. {
  1141. assert(cs->multis == NULL); /* xxx */
  1142. }
  1143. /*
  1144. - mccase - add case counterparts of the list of collating elements in a cset
  1145. *
  1146. * This would have to know the set of possibilities. Implementation
  1147. * is deferred.
  1148. */
  1149. /* ARGSUSED */
  1150. static void
  1151. mccase(struct parse *p, cset *cs)
  1152. {
  1153. assert(cs->multis == NULL); /* xxx */
  1154. }
  1155. /*
  1156. - isinsets - is this character in any sets?
  1157. */
  1158. static int /* predicate */
  1159. isinsets(struct re_guts *g, int c)
  1160. {
  1161. uch *col;
  1162. int i;
  1163. int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
  1164. unsigned uc = (uch)c;
  1165. for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
  1166. if (col[uc] != 0)
  1167. return(1);
  1168. return(0);
  1169. }
  1170. /*
  1171. - samesets - are these two characters in exactly the same sets?
  1172. */
  1173. static int /* predicate */
  1174. samesets(struct re_guts *g, int c1, int c2)
  1175. {
  1176. uch *col;
  1177. int i;
  1178. int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
  1179. unsigned uc1 = (uch)c1;
  1180. unsigned uc2 = (uch)c2;
  1181. for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
  1182. if (col[uc1] != col[uc2])
  1183. return(0);
  1184. return(1);
  1185. }
  1186. /*
  1187. - categorize - sort out character categories
  1188. */
  1189. static void
  1190. categorize(struct parse *p, struct re_guts *g)
  1191. {
  1192. cat_t *cats = g->categories;
  1193. int c;
  1194. int c2;
  1195. cat_t cat;
  1196. /* avoid making error situations worse */
  1197. if (p->error != 0)
  1198. return;
  1199. for (c = CHAR_MIN; c <= CHAR_MAX; c++)
  1200. if (cats[c] == 0 && isinsets(g, c)) {
  1201. cat = g->ncategories++;
  1202. cats[c] = cat;
  1203. for (c2 = c+1; c2 <= CHAR_MAX; c2++)
  1204. if (cats[c2] == 0 && samesets(g, c, c2))
  1205. cats[c2] = cat;
  1206. }
  1207. }
  1208. /*
  1209. - dupl - emit a duplicate of a bunch of sops
  1210. */
  1211. static sopno /* start of duplicate */
  1212. dupl(struct parse *p,
  1213. sopno start, /* from here */
  1214. sopno finish) /* to this less one */
  1215. {
  1216. sopno ret = HERE();
  1217. sopno len = finish - start;
  1218. assert(finish >= start);
  1219. if (len == 0)
  1220. return(ret);
  1221. enlarge(p, p->ssize + len); /* this many unexpected additions */
  1222. assert(p->ssize >= p->slen + len);
  1223. (void) memmove((char *)(p->strip + p->slen),
  1224. (char *)(p->strip + start), (size_t)len*sizeof(sop));
  1225. p->slen += len;
  1226. return(ret);
  1227. }
  1228. /*
  1229. - doemit - emit a strip operator
  1230. *
  1231. * It might seem better to implement this as a macro with a function as
  1232. * hard-case backup, but it's just too big and messy unless there are
  1233. * some changes to the data structures. Maybe later.
  1234. */
  1235. static void
  1236. doemit(struct parse *p, sop op, size_t opnd)
  1237. {
  1238. /* avoid making error situations worse */
  1239. if (p->error != 0)
  1240. return;
  1241. /* deal with oversize operands ("can't happen", more or less) */
  1242. assert(opnd < 1<<OPSHIFT);
  1243. /* deal with undersized strip */
  1244. if (p->slen >= p->ssize)
  1245. enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */
  1246. assert(p->slen < p->ssize);
  1247. /* finally, it's all reduced to the easy case */
  1248. p->strip[p->slen++] = SOP(op, opnd);
  1249. }
  1250. /*
  1251. - doinsert - insert a sop into the strip
  1252. */
  1253. static void
  1254. doinsert(struct parse *p, sop op, size_t opnd, sopno pos)
  1255. {
  1256. sopno sn;
  1257. sop s;
  1258. int i;
  1259. /* avoid making error situations worse */
  1260. if (p->error != 0)
  1261. return;
  1262. sn = HERE();
  1263. EMIT(op, opnd); /* do checks, ensure space */
  1264. assert(HERE() == sn+1);
  1265. s = p->strip[sn];
  1266. /* adjust paren pointers */
  1267. assert(pos > 0);
  1268. for (i = 1; i < NPAREN; i++) {
  1269. if (p->pbegin[i] >= pos) {
  1270. p->pbegin[i]++;
  1271. }
  1272. if (p->pend[i] >= pos) {
  1273. p->pend[i]++;
  1274. }
  1275. }
  1276. memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
  1277. (HERE()-pos-1)*sizeof(sop));
  1278. p->strip[pos] = s;
  1279. }
  1280. /*
  1281. - dofwd - complete a forward reference
  1282. */
  1283. static void
  1284. dofwd(struct parse *p, sopno pos, sop value)
  1285. {
  1286. /* avoid making error situations worse */
  1287. if (p->error != 0)
  1288. return;
  1289. assert(value < 1<<OPSHIFT);
  1290. p->strip[pos] = OP(p->strip[pos]) | value;
  1291. }
  1292. /*
  1293. - enlarge - enlarge the strip
  1294. */
  1295. static void
  1296. enlarge(struct parse *p, sopno size)
  1297. {
  1298. sop *sp;
  1299. if (p->ssize >= size)
  1300. return;
  1301. if ((uintptr_t)size > SIZE_MAX / sizeof(sop)) {
  1302. SETERROR(REG_ESPACE);
  1303. return;
  1304. }
  1305. sp = (sop *)regex_realloc(p->strip, p->ssize*sizeof(sop), size*sizeof(sop)); // HLSL Change: Use custom allocator
  1306. if (sp == NULL) {
  1307. SETERROR(REG_ESPACE);
  1308. return;
  1309. }
  1310. p->strip = sp;
  1311. p->ssize = size;
  1312. }
  1313. /*
  1314. - stripsnug - compact the strip
  1315. */
  1316. static void
  1317. stripsnug(struct parse *p, struct re_guts *g)
  1318. {
  1319. g->nstates = p->slen;
  1320. if ((uintptr_t)p->slen > SIZE_MAX / sizeof(sop)) {
  1321. g->strip = p->strip;
  1322. SETERROR(REG_ESPACE);
  1323. return;
  1324. }
  1325. g->strip = (sop *)regex_realloc((char *)p->strip, p->slen * sizeof(sop), p->slen * sizeof(sop)); // HLSL Change: Use custom allocator
  1326. if (g->strip == NULL) {
  1327. SETERROR(REG_ESPACE);
  1328. g->strip = p->strip;
  1329. }
  1330. }
  1331. /*
  1332. - findmust - fill in must and mlen with longest mandatory literal string
  1333. *
  1334. * This algorithm could do fancy things like analyzing the operands of |
  1335. * for common subsequences. Someday. This code is simple and finds most
  1336. * of the interesting cases.
  1337. *
  1338. * Note that must and mlen got initialized during setup.
  1339. */
  1340. static void
  1341. findmust(struct parse *p, struct re_guts *g)
  1342. {
  1343. sop *scan;
  1344. sop *start = 0; /* start initialized in the default case, after that */
  1345. sop *newstart = 0; /* newstart was initialized in the OCHAR case */
  1346. sopno newlen;
  1347. sop s;
  1348. char *cp;
  1349. sopno i;
  1350. /* avoid making error situations worse */
  1351. if (p->error != 0)
  1352. return;
  1353. /* find the longest OCHAR sequence in strip */
  1354. newlen = 0;
  1355. scan = g->strip + 1;
  1356. do {
  1357. s = *scan++;
  1358. switch (OP(s)) {
  1359. case OCHAR: /* sequence member */
  1360. if (newlen == 0) /* new sequence */
  1361. newstart = scan - 1;
  1362. newlen++;
  1363. break;
  1364. case OPLUS_: /* things that don't break one */
  1365. case OLPAREN:
  1366. case ORPAREN:
  1367. break;
  1368. case OQUEST_: /* things that must be skipped */
  1369. case OCH_:
  1370. scan--;
  1371. do {
  1372. scan += OPND(s);
  1373. s = *scan;
  1374. /* assert() interferes w debug printouts */
  1375. if (OP(s) != O_QUEST && OP(s) != O_CH &&
  1376. OP(s) != OOR2) {
  1377. g->iflags |= REGEX_BAD;
  1378. return;
  1379. }
  1380. } while (OP(s) != O_QUEST && OP(s) != O_CH);
  1381. /* fallthrough */
  1382. default: /* things that break a sequence */
  1383. if (newlen > g->mlen) { /* ends one */
  1384. start = newstart;
  1385. g->mlen = newlen;
  1386. }
  1387. newlen = 0;
  1388. break;
  1389. }
  1390. } while (OP(s) != OEND);
  1391. if (g->mlen == 0) /* there isn't one */
  1392. return;
  1393. /* turn it into a character string */
  1394. g->must = regex_malloc((size_t)g->mlen + 1); // HLSL Change: Use custom allocator
  1395. if (g->must == NULL) { /* argh; just forget it */
  1396. g->mlen = 0;
  1397. return;
  1398. }
  1399. cp = g->must;
  1400. scan = start;
  1401. for (i = g->mlen; i > 0; i--) {
  1402. while (OP(s = *scan++) != OCHAR)
  1403. continue;
  1404. assert(cp < g->must + g->mlen);
  1405. *cp++ = (char)OPND(s);
  1406. }
  1407. assert(cp == g->must + g->mlen);
  1408. *cp++ = '\0'; /* just on general principles */
  1409. }
  1410. /*
  1411. - pluscount - count + nesting
  1412. */
  1413. static sopno /* nesting depth */
  1414. pluscount(struct parse *p, struct re_guts *g)
  1415. {
  1416. sop *scan;
  1417. sop s;
  1418. sopno plusnest = 0;
  1419. sopno maxnest = 0;
  1420. if (p->error != 0)
  1421. return(0); /* there may not be an OEND */
  1422. scan = g->strip + 1;
  1423. do {
  1424. s = *scan++;
  1425. switch (OP(s)) {
  1426. case OPLUS_:
  1427. plusnest++;
  1428. break;
  1429. case O_PLUS:
  1430. if (plusnest > maxnest)
  1431. maxnest = plusnest;
  1432. plusnest--;
  1433. break;
  1434. }
  1435. } while (OP(s) != OEND);
  1436. if (plusnest != 0)
  1437. g->iflags |= REGEX_BAD;
  1438. return(maxnest);
  1439. }