parser.cs 26 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120
  1. //
  2. // assembly: System
  3. // namespace: System.Text.RegularExpressions
  4. // file: parser.cs
  5. //
  6. // author: Dan Lewis ([email protected])
  7. // (c) 2002
  8. using System;
  9. using System.Collections;
  10. using System.Globalization;
  11. namespace System.Text.RegularExpressions.Syntax {
  12. class Parser {
  13. public static int ParseDecimal (string str, ref int ptr) {
  14. return ParseNumber (str, ref ptr, 10, 1, Int32.MaxValue);
  15. }
  16. public static int ParseOctal (string str, ref int ptr) {
  17. return ParseNumber (str, ref ptr, 8, 1, 3);
  18. }
  19. public static int ParseHex (string str, ref int ptr, int digits) {
  20. return ParseNumber (str, ref ptr, 16, digits, digits);
  21. }
  22. public static int ParseNumber (string str, ref int ptr, int b, int min, int max) {
  23. int p = ptr, n = 0, digits = 0, d;
  24. if (max < min)
  25. max = Int32.MaxValue;
  26. while (digits < max && p < str.Length) {
  27. d = ParseDigit (str[p ++], b, digits);
  28. if (d < 0) {
  29. -- p;
  30. break;
  31. }
  32. n = n * b + d;
  33. ++ digits;
  34. }
  35. if (digits < min)
  36. return -1;
  37. ptr = p;
  38. return n;
  39. }
  40. public static string ParseName (string str, ref int ptr) {
  41. if (Char.IsDigit (str[ptr])) {
  42. int gid = ParseNumber (str, ref ptr, 10, 1, 0);
  43. if (gid > 0)
  44. return gid.ToString ();
  45. return null;
  46. }
  47. int start = ptr;
  48. for (;;) {
  49. if (!IsNameChar (str[ptr]))
  50. break;
  51. ++ ptr;
  52. }
  53. if (ptr - start > 0)
  54. return str.Substring (start, ptr - start);
  55. return null;
  56. }
  57. public static string Escape (string str) {
  58. string result = "";
  59. for (int i = 0; i < str.Length; ++ i) {
  60. char c = str[i];
  61. switch (c) {
  62. case '\\': case '*': case '+': case '?': case '|':
  63. case '{': case '[': case '(': case ')': case '^':
  64. case '$': case '.': case '#': case ' ':
  65. result += "\\" + c;
  66. break;
  67. case '\t': result += "\\t"; break;
  68. case '\n': result += "\\n"; break;
  69. case '\r': result += "\\r"; break;
  70. case '\f': result += "\\f"; break;
  71. default: result += c; break;
  72. }
  73. }
  74. return result;
  75. }
  76. public static string Unescape (string str) {
  77. return new Parser ().ParseString (str);
  78. }
  79. // public instance
  80. public Parser () {
  81. this.caps = new ArrayList ();
  82. this.refs = new Hashtable ();
  83. }
  84. public RegularExpression ParseRegularExpression (string pattern, RegexOptions options) {
  85. this.pattern = pattern;
  86. this.ptr = 0;
  87. caps.Clear ();
  88. refs.Clear ();
  89. this.num_groups = 0;
  90. try {
  91. RegularExpression re = new RegularExpression ();
  92. ParseGroup (re, options, null);
  93. ResolveReferences ();
  94. re.GroupCount = num_groups;
  95. return re;
  96. }
  97. catch (IndexOutOfRangeException) {
  98. throw NewParseException ("Unexpected end of pattern.");
  99. }
  100. }
  101. public IDictionary GetMapping () {
  102. Hashtable mapping = new Hashtable ();
  103. int end = caps.Count;
  104. mapping.Add ("0", 0);
  105. for (int i = 0; i < end;) {
  106. CapturingGroup group = (CapturingGroup) caps [i];
  107. i++;
  108. if (group.Name != null && !mapping.Contains (group.Name))
  109. mapping.Add (group.Name, group.Number);
  110. else
  111. mapping.Add (i.ToString (), i);
  112. }
  113. return mapping;
  114. }
  115. // private methods
  116. private void ParseGroup (Group group, RegexOptions options, Assertion assertion) {
  117. bool is_top_level = group is RegularExpression;
  118. Alternation alternation = null;
  119. string literal = null;
  120. Group current = new Group ();
  121. Expression expr = null;
  122. bool closed = false;
  123. while (true) {
  124. ConsumeWhitespace (IsIgnorePatternWhitespace (options));
  125. if (ptr >= pattern.Length)
  126. break;
  127. // (1) Parse for Expressions
  128. char ch = pattern[ptr ++];
  129. switch (ch) {
  130. case '^': {
  131. Position pos =
  132. IsMultiline (options) ? Position.StartOfLine : Position.Start;
  133. expr = new PositionAssertion (pos);
  134. break;
  135. }
  136. case '$': {
  137. Position pos =
  138. IsMultiline (options) ? Position.EndOfLine : Position.End;
  139. expr = new PositionAssertion (pos);
  140. break;
  141. }
  142. case '.': {
  143. Category cat =
  144. IsSingleline (options) ? Category.AnySingleline : Category.Any;
  145. expr = new CharacterClass (cat, false);
  146. break;
  147. }
  148. case '\\': {
  149. int c = ParseEscape ();
  150. if (c >= 0)
  151. ch = (char)c;
  152. else {
  153. expr = ParseSpecial (options);
  154. if (expr == null)
  155. ch = pattern[ptr ++]; // default escape
  156. }
  157. break;
  158. }
  159. case '[': {
  160. expr = ParseCharacterClass (options);
  161. break;
  162. }
  163. case '(': {
  164. bool ignore = IsIgnoreCase (options);
  165. expr = ParseGroupingConstruct (ref options);
  166. if (expr == null) {
  167. if (literal != null && IsIgnoreCase (options) != ignore) {
  168. current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
  169. literal = null;
  170. }
  171. continue;
  172. }
  173. break;
  174. }
  175. case ')': {
  176. closed = true;
  177. goto EndOfGroup;
  178. }
  179. case '|': {
  180. if (literal != null) {
  181. current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
  182. literal = null;
  183. }
  184. if (assertion != null) {
  185. if (assertion.TrueExpression == null)
  186. assertion.TrueExpression = current;
  187. else if (assertion.FalseExpression == null)
  188. assertion.FalseExpression = current;
  189. else
  190. throw NewParseException ("Too many | in (?()|).");
  191. }
  192. else {
  193. if (alternation == null)
  194. alternation = new Alternation ();
  195. alternation.AddAlternative (current);
  196. }
  197. current = new Group ();
  198. continue;
  199. }
  200. case '*': case '+': case '?': case '{': {
  201. throw NewParseException ("Bad quantifier.");
  202. }
  203. default:
  204. break; // literal character
  205. }
  206. ConsumeWhitespace (IsIgnorePatternWhitespace (options));
  207. // (2) Check for Repetitions
  208. if (ptr < pattern.Length) {
  209. char k = pattern[ptr];
  210. if (k == '?' || k == '*' || k == '+' || k == '{') {
  211. ++ ptr;
  212. int min = 0, max = 0;
  213. bool lazy = false;
  214. switch (k) {
  215. case '?': min = 0; max = 1; break;
  216. case '*': min = 0; max = 0xffff; break;
  217. case '+': min = 1; max = 0xffff; break;
  218. case '{': ParseRepetitionBounds (out min, out max, options); break;
  219. }
  220. ConsumeWhitespace (IsIgnorePatternWhitespace (options));
  221. if (ptr < pattern.Length && pattern[ptr] == '?') {
  222. ++ ptr;
  223. lazy = true;
  224. }
  225. Repetition repetition = new Repetition (min, max, lazy);
  226. if (expr == null)
  227. repetition.Expression = new Literal (ch.ToString (), IsIgnoreCase (options));
  228. else
  229. repetition.Expression = expr;
  230. expr = repetition;
  231. }
  232. }
  233. // (3) Append Expression and/or Literal
  234. if (expr == null) {
  235. if (literal == null)
  236. literal = "";
  237. literal += ch;
  238. }
  239. else {
  240. if (literal != null) {
  241. current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
  242. literal = null;
  243. }
  244. current.AppendExpression (expr);
  245. expr = null;
  246. }
  247. if (is_top_level && ptr >= pattern.Length)
  248. goto EndOfGroup;
  249. }
  250. EndOfGroup:
  251. if (is_top_level && closed)
  252. throw NewParseException ("Too many )'s.");
  253. if (!is_top_level && !closed)
  254. throw NewParseException ("Not enough )'s.");
  255. // clean up literals and alternations
  256. if (literal != null)
  257. current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
  258. if (assertion != null) {
  259. if (assertion.TrueExpression == null)
  260. assertion.TrueExpression = current;
  261. else
  262. assertion.FalseExpression = current;
  263. group.AppendExpression (assertion);
  264. }
  265. else if (alternation != null) {
  266. alternation.AddAlternative (current);
  267. group.AppendExpression (alternation);
  268. }
  269. else
  270. group.AppendExpression (current);
  271. }
  272. private Expression ParseGroupingConstruct (ref RegexOptions options) {
  273. if (pattern[ptr] != '?') {
  274. Group group;
  275. if (IsExplicitCapture (options))
  276. group = new Group ();
  277. else {
  278. group = new CapturingGroup ();
  279. caps.Add (group);
  280. }
  281. ParseGroup (group, options, null);
  282. return group;
  283. }
  284. else
  285. ++ ptr;
  286. switch (pattern[ptr]) {
  287. case ':': { // non-capturing group
  288. ++ ptr;
  289. Group group = new Group ();
  290. ParseGroup (group, options, null);
  291. return group;
  292. }
  293. case '>': { // non-backtracking group
  294. ++ ptr;
  295. Group group = new NonBacktrackingGroup ();
  296. ParseGroup (group, options, null);
  297. return group;
  298. }
  299. case 'i': case 'm': case 'n':
  300. case 's': case 'x': case '-': { // options
  301. RegexOptions o = options;
  302. ParseOptions (ref o, false);
  303. if (pattern[ptr] == '-') {
  304. ++ ptr;
  305. ParseOptions (ref o, true);
  306. }
  307. if (pattern[ptr] == ':') { // pass options to child group
  308. ++ ptr;
  309. Group group = new Group ();
  310. ParseGroup (group, o, null);
  311. return group;
  312. }
  313. else if (pattern[ptr] == ')') { // change options of enclosing group
  314. ++ ptr;
  315. options = o;
  316. return null;
  317. }
  318. else
  319. throw NewParseException ("Bad options");
  320. }
  321. case '<': case '=': case '!': { // lookahead/lookbehind
  322. ExpressionAssertion asn = new ExpressionAssertion ();
  323. if (!ParseAssertionType (asn))
  324. goto case '\''; // it's a (?<name> ) construct
  325. Group test = new Group ();
  326. ParseGroup (test, options, null);
  327. asn.TestExpression = test;
  328. return asn;
  329. }
  330. case '\'': { // named/balancing group
  331. char delim;
  332. if (pattern[ptr] == '<')
  333. delim = '>';
  334. else
  335. delim = '\'';
  336. ++ ptr;
  337. string name = ParseName ();
  338. if (pattern[ptr] == delim) {
  339. // capturing group
  340. if (name == null)
  341. throw NewParseException ("Bad group name.");
  342. ++ ptr;
  343. CapturingGroup cap = new CapturingGroup ();
  344. cap.Name = name;
  345. caps.Add (cap);
  346. ParseGroup (cap, options, null);
  347. return cap;
  348. }
  349. else if (pattern[ptr] == '-') {
  350. // balancing group
  351. ++ ptr;
  352. string balance_name = ParseName ();
  353. if (balance_name == null || pattern[ptr] != delim)
  354. throw NewParseException ("Bad balancing group name.");
  355. ++ ptr;
  356. BalancingGroup bal = new BalancingGroup ();
  357. bal.Name = name;
  358. caps.Add (bal);
  359. refs.Add (bal, balance_name);
  360. return bal;
  361. }
  362. else
  363. throw NewParseException ("Bad group name.");
  364. }
  365. case '(': { // expression/capture test
  366. Assertion asn;
  367. ++ ptr;
  368. int p = ptr;
  369. string name = ParseName ();
  370. if (name == null || pattern[ptr] != ')') { // expression test
  371. // FIXME MS implementation doesn't seem to
  372. // implement this version of (?(x) ...)
  373. ptr = p;
  374. ExpressionAssertion expr_asn = new ExpressionAssertion ();
  375. if (pattern[ptr] == '?') {
  376. ++ ptr;
  377. if (!ParseAssertionType (expr_asn))
  378. throw NewParseException ("Bad conditional.");
  379. }
  380. else {
  381. expr_asn.Negate = false;
  382. expr_asn.Reverse = false;
  383. }
  384. Group test = new Group ();
  385. ParseGroup (test, options, null);
  386. expr_asn.TestExpression = test;
  387. asn = expr_asn;
  388. }
  389. else { // capture test
  390. ++ ptr;
  391. asn = new CaptureAssertion ();
  392. refs.Add (asn, name);
  393. }
  394. Group group = new Group ();
  395. ParseGroup (group, options, asn);
  396. return group;
  397. }
  398. case '#': { // comment
  399. ++ ptr;
  400. while (pattern[ptr ++] != ')') {
  401. if (ptr >= pattern.Length)
  402. throw NewParseException ("Unterminated (?#...) comment.");
  403. }
  404. return null;
  405. }
  406. default: // error
  407. throw NewParseException ("Bad grouping construct.");
  408. }
  409. }
  410. private bool ParseAssertionType (ExpressionAssertion assertion) {
  411. if (pattern[ptr] == '<') {
  412. switch (pattern[ptr + 1]) {
  413. case '=':
  414. assertion.Negate = false;
  415. break;
  416. case '!':
  417. assertion.Negate = true;
  418. break;
  419. default:
  420. return false;
  421. }
  422. assertion.Reverse = true;
  423. ptr += 2;
  424. }
  425. else {
  426. switch (pattern[ptr]) {
  427. case '=':
  428. assertion.Negate = false;
  429. break;
  430. case '!':
  431. assertion.Negate = true;
  432. break;
  433. default:
  434. return false;
  435. }
  436. assertion.Reverse = false;
  437. ptr += 1;
  438. }
  439. return true;
  440. }
  441. private void ParseOptions (ref RegexOptions options, bool negate) {
  442. for (;;) {
  443. switch (pattern[ptr]) {
  444. case 'i':
  445. if (negate)
  446. options &= ~RegexOptions.IgnoreCase;
  447. else
  448. options |= RegexOptions.IgnoreCase;
  449. break;
  450. case 'm':
  451. if (negate)
  452. options &= ~RegexOptions.Multiline;
  453. else
  454. options |= RegexOptions.Multiline;
  455. break;
  456. case 'n':
  457. if (negate)
  458. options &= ~RegexOptions.ExplicitCapture;
  459. else
  460. options |= RegexOptions.ExplicitCapture;
  461. break;
  462. case 's':
  463. if (negate)
  464. options &= ~RegexOptions.Singleline;
  465. else
  466. options |= RegexOptions.Singleline;
  467. break;
  468. case 'x':
  469. if (negate)
  470. options &= ~RegexOptions.IgnorePatternWhitespace;
  471. else
  472. options |= RegexOptions.IgnorePatternWhitespace;
  473. break;
  474. default:
  475. return;
  476. }
  477. ++ ptr;
  478. }
  479. }
  480. private Expression ParseCharacterClass (RegexOptions options) {
  481. bool negate, ecma;
  482. if (pattern[ptr] == '^') {
  483. negate = true;
  484. ++ ptr;
  485. }
  486. else
  487. negate = false;
  488. ecma = IsECMAScript (options);
  489. CharacterClass cls = new CharacterClass (negate, IsIgnoreCase (options));
  490. if (pattern[ptr] == ']') {
  491. cls.AddCharacter (']');
  492. ++ ptr;
  493. }
  494. int c = -1;
  495. int last = -1;
  496. bool range = false;
  497. bool closed = false;
  498. while (ptr < pattern.Length) {
  499. c = pattern[ptr ++];
  500. if (c == ']') {
  501. closed = true;
  502. break;
  503. }
  504. if (c == '-') {
  505. range = true;
  506. continue;
  507. }
  508. if (c == '\\') {
  509. c = ParseEscape ();
  510. if (c < 0) {
  511. // didn't recognize escape
  512. c = pattern[ptr ++];
  513. switch (c) {
  514. case 'b': c = '\b'; break;
  515. case 'd':
  516. cls.AddCategory (ecma ? Category.EcmaDigit : Category.Digit, false);
  517. last = -1;
  518. continue;
  519. case 'w':
  520. cls.AddCategory (ecma ? Category.EcmaWord : Category.Word, false);
  521. last = -1;
  522. continue;
  523. case 's':
  524. cls.AddCategory (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, false);
  525. last = -1;
  526. continue;
  527. case 'p':
  528. cls.AddCategory (ParseUnicodeCategory (), false); // ignore ecma
  529. last = -1;
  530. continue;
  531. case 'D':
  532. cls.AddCategory (ecma ? Category.EcmaDigit : Category.Digit, true);
  533. last = -1;
  534. continue;
  535. case 'W':
  536. cls.AddCategory (ecma ? Category.EcmaWord : Category.Word, true);
  537. last = -1;
  538. continue;
  539. case 'S':
  540. cls.AddCategory (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, true);
  541. last = -1;
  542. continue;
  543. case 'P':
  544. cls.AddCategory (ParseUnicodeCategory (), true);
  545. last = -1;
  546. continue;
  547. default: break; // add escaped character
  548. }
  549. }
  550. }
  551. if (range) {
  552. if (c < last)
  553. throw NewParseException ("[x-y] range in reverse order.");
  554. if (last >=0 )
  555. cls.AddRange ((char)last, (char)c);
  556. else {
  557. cls.AddCharacter ((char)c);
  558. cls.AddCharacter ('-');
  559. }
  560. range = false;
  561. last = -1;
  562. }
  563. else {
  564. cls.AddCharacter ((char)c);
  565. last = c;
  566. }
  567. }
  568. if (!closed)
  569. throw NewParseException ("Unterminated [] set.");
  570. if (range)
  571. cls.AddCharacter ('-');
  572. return cls;
  573. }
  574. private void ParseRepetitionBounds (out int min, out int max, RegexOptions options) {
  575. int n, m;
  576. /* check syntax */
  577. ConsumeWhitespace (IsIgnorePatternWhitespace (options));
  578. n = ParseNumber (10, 1, 0);
  579. if (n < 0)
  580. throw NewParseException ("Illegal {x,y} - bad value of x.");
  581. ConsumeWhitespace (IsIgnorePatternWhitespace (options));
  582. switch (pattern[ptr ++]) {
  583. case '}':
  584. m = n;
  585. break;
  586. case ',':
  587. ConsumeWhitespace (IsIgnorePatternWhitespace (options));
  588. m = ParseNumber (10, 1, 0);
  589. ConsumeWhitespace (IsIgnorePatternWhitespace (options));
  590. if (pattern[ptr ++] != '}')
  591. throw NewParseException ("Illegal {x,y} - bad value of y.");
  592. break;
  593. default:
  594. throw NewParseException ("Illegal {x,y}");
  595. }
  596. /* check bounds and ordering */
  597. if (n >= 0xffff || m >= 0xffff)
  598. throw NewParseException ("Illegal {x, y} - maximum of 65535.");
  599. if (m >= 0 && m < n)
  600. throw NewParseException ("Illegal {x, y} with x > y.");
  601. /* assign min and max */
  602. min = n;
  603. if (m > 0)
  604. max = m;
  605. else
  606. max = 0xffff;
  607. }
  608. private Category ParseUnicodeCategory () {
  609. if (pattern[ptr ++] != '{')
  610. throw NewParseException ("Incomplete \\p{X} character escape.");
  611. string name = ParseName (pattern, ref ptr);
  612. if (name == null)
  613. throw NewParseException ("Incomplete \\p{X} character escape.");
  614. Category cat = CategoryUtils.CategoryFromName (name);
  615. if (cat == Category.None)
  616. throw NewParseException ("Unknown property '" + name + "'.");
  617. if (pattern[ptr ++] != '}')
  618. throw NewParseException ("Incomplete \\p{X} character escape.");
  619. return cat;
  620. }
  621. private Expression ParseSpecial (RegexOptions options) {
  622. int p = ptr;
  623. bool ecma = IsECMAScript (options);
  624. Expression expr = null;
  625. switch (pattern[ptr ++]) {
  626. // categories
  627. case 'd':
  628. expr = new CharacterClass (ecma ? Category.EcmaDigit : Category.Digit, false);
  629. break;
  630. case 'w':
  631. expr = new CharacterClass (ecma ? Category.EcmaWord : Category.Word, false);
  632. break;
  633. case 's':
  634. expr = new CharacterClass (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, false);
  635. break;
  636. case 'p':
  637. // this is odd - ECMAScript isn't supposed to support Unicode,
  638. // yet \p{..} compiles and runs under the MS implementation
  639. // identically to canonical mode. That's why I'm ignoring the
  640. // value of ecma here.
  641. expr = new CharacterClass (ParseUnicodeCategory (), false);
  642. break;
  643. case 'D':
  644. expr = new CharacterClass (ecma ? Category.EcmaDigit : Category.Digit, true);
  645. break;
  646. case 'W':
  647. expr = new CharacterClass (ecma ? Category.EcmaWord : Category.Word, true);
  648. break;
  649. case 'S':
  650. expr = new CharacterClass (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, true);
  651. break;
  652. case 'P':
  653. expr = new CharacterClass (ParseUnicodeCategory (), true);
  654. break;
  655. // positions
  656. case 'A': expr = new PositionAssertion (Position.StartOfString); break;
  657. case 'Z': expr = new PositionAssertion (Position.End); break;
  658. case 'z': expr = new PositionAssertion (Position.EndOfString); break;
  659. case 'G': expr = new PositionAssertion (Position.StartOfScan); break;
  660. case 'b': expr = new PositionAssertion (Position.Boundary); break;
  661. case 'B': expr = new PositionAssertion (Position.NonBoundary); break;
  662. // references
  663. case '1': case '2': case '3': case '4': case '5':
  664. case '6': case '7': case '8': case '9': {
  665. ptr --;
  666. int n = ParseNumber (10, 1, 0);
  667. if (n < 0) {
  668. ptr = p;
  669. return null;
  670. }
  671. // FIXME test if number is within number of assigned groups
  672. // this may present a problem for right-to-left matching
  673. Reference reference = new Reference (IsIgnoreCase (options));
  674. refs.Add (reference, n.ToString ());
  675. expr = reference;
  676. break;
  677. }
  678. case 'k': {
  679. char delim = pattern[ptr ++];
  680. if (delim == '<')
  681. delim = '>';
  682. else if (delim != '\'')
  683. throw NewParseException ("Malformed \\k<...> named backreference.");
  684. string name = ParseName ();
  685. if (name == null || pattern[ptr] != delim)
  686. throw NewParseException ("Malformed \\k<...> named backreference.");
  687. ++ ptr;
  688. Reference reference = new Reference (IsIgnoreCase (options));
  689. refs.Add (reference, name);
  690. expr = reference;
  691. break;
  692. }
  693. default:
  694. expr = null;
  695. break;
  696. }
  697. if (expr == null)
  698. ptr = p;
  699. return expr;
  700. }
  701. private int ParseEscape () {
  702. int p = ptr;
  703. int c;
  704. if (p >= pattern.Length)
  705. throw new ArgumentException (
  706. String.Format ("Parsing \"{0}\" - Illegal \\ at end of " +
  707. "pattern.", pattern), pattern);
  708. switch (pattern[ptr ++]) {
  709. // standard escapes (except \b)
  710. case 'a': return '\u0007';
  711. case 't': return '\u0009';
  712. case 'r': return '\u000d';
  713. case 'v': return '\u000b';
  714. case 'f': return '\u000c';
  715. case 'n': return '\u000a';
  716. case 'e': return '\u001b';
  717. case '\\': return '\\';
  718. // character codes
  719. case '0': return ParseOctal (pattern, ref ptr);
  720. case 'x':
  721. c = ParseHex (pattern, ref ptr, 2);
  722. if (c < 0)
  723. throw NewParseException ("Insufficient hex digits");
  724. return c;
  725. case 'u':
  726. c = ParseHex (pattern, ref ptr, 4);
  727. if (c < 0)
  728. throw NewParseException ("Insufficient hex digits");
  729. return c;
  730. // control characters
  731. case 'c':
  732. c = pattern[p ++];
  733. if (c >= 'A' && c <= 'Z')
  734. return c - 'A';
  735. else if (c >= '@' && c <= '_')
  736. return c - '@';
  737. else
  738. throw NewParseException ("Unrecognized control character.");
  739. // unknown escape
  740. default:
  741. ptr = p;
  742. return -1;
  743. }
  744. }
  745. private string ParseName () {
  746. return Parser.ParseName (pattern, ref ptr);
  747. }
  748. private static bool IsNameChar (char c) {
  749. UnicodeCategory cat = Char.GetUnicodeCategory (c);
  750. if (cat == UnicodeCategory.ModifierLetter)
  751. return false;
  752. if (cat == UnicodeCategory.ConnectorPunctuation)
  753. return true;
  754. return Char.IsLetterOrDigit (c);
  755. }
  756. private int ParseNumber (int b, int min, int max) {
  757. return Parser.ParseNumber (pattern, ref ptr, b, min, max);
  758. }
  759. private int ParseDecimal () {
  760. return Parser.ParseDecimal (pattern, ref ptr);
  761. }
  762. private static int ParseDigit (char c, int b, int n) {
  763. switch (b) {
  764. case 8:
  765. if (c >= '0' && c <= '7')
  766. return c - '0';
  767. else
  768. return -1;
  769. case 10:
  770. if (c >= '0' && c <= '9')
  771. return c - '0';
  772. else
  773. return -1;
  774. case 16:
  775. if (c >= '0' && c <= '9')
  776. return c - '0';
  777. else if (c >= 'a' && c <= 'f')
  778. return 10 + c - 'a';
  779. else if (c >= 'A' && c <= 'F')
  780. return 10 + c - 'A';
  781. else
  782. return -1;
  783. default:
  784. return -1;
  785. }
  786. }
  787. private void ConsumeWhitespace (bool ignore) {
  788. while (true) {
  789. if (ptr >= pattern.Length)
  790. break;
  791. if (pattern[ptr] == '(') {
  792. if (ptr + 3 >= pattern.Length)
  793. return;
  794. if (pattern[ptr + 1] != '?' || pattern[ptr + 2] != '#')
  795. return;
  796. ptr += 3;
  797. while (pattern[ptr ++] != ')')
  798. /* ignore */ ;
  799. }
  800. else if (ignore && pattern[ptr] == '#') {
  801. while (ptr < pattern.Length && pattern[ptr ++] != '\n')
  802. /* ignore */ ;
  803. }
  804. else if (ignore && Char.IsWhiteSpace (pattern[ptr])) {
  805. while (ptr < pattern.Length && Char.IsWhiteSpace (pattern[ptr]))
  806. ++ ptr;
  807. }
  808. else
  809. return;
  810. }
  811. }
  812. private string ParseString (string pattern) {
  813. this.pattern = pattern;
  814. this.ptr = 0;
  815. string result = "";
  816. while (ptr < pattern.Length) {
  817. int c = pattern[ptr ++];
  818. if (c == '\\')
  819. c = ParseEscape ();
  820. result += (char)c;
  821. }
  822. return result;
  823. }
  824. private void ResolveReferences () {
  825. int gid = 1;
  826. Hashtable dict = new Hashtable ();
  827. // number unnamed groups
  828. foreach (CapturingGroup group in caps) {
  829. if (group.Name == null) {
  830. dict.Add (gid.ToString (), group);
  831. group.Number = gid ++;
  832. ++ num_groups;
  833. }
  834. }
  835. // number named groups
  836. foreach (CapturingGroup group in caps) {
  837. if (group.Name != null) {
  838. if (!dict.Contains (group.Name)) {
  839. dict.Add (group.Name, group);
  840. group.Number = gid ++;
  841. ++ num_groups;
  842. }
  843. else {
  844. CapturingGroup prev = (CapturingGroup)dict[group.Name];
  845. group.Number = prev.Number;
  846. }
  847. }
  848. }
  849. // resolve references
  850. foreach (Expression expr in refs.Keys) {
  851. string name = (string)refs[expr];
  852. if (!dict.Contains (name)) {
  853. throw NewParseException ("Reference to undefined group " +
  854. (Char.IsDigit (name[0]) ? "number " : "name ") +
  855. name);
  856. }
  857. CapturingGroup group = (CapturingGroup)dict[name];
  858. if (expr is Reference)
  859. ((Reference)expr).CapturingGroup = group;
  860. else if (expr is CaptureAssertion)
  861. ((CaptureAssertion)expr).CapturingGroup = group;
  862. else if (expr is BalancingGroup)
  863. ((BalancingGroup)expr).Balance = group;
  864. }
  865. }
  866. // flag helper functions
  867. private static bool IsIgnoreCase (RegexOptions options) {
  868. return (options & RegexOptions.IgnoreCase) != 0;
  869. }
  870. private static bool IsMultiline (RegexOptions options) {
  871. return (options & RegexOptions.Multiline) != 0;
  872. }
  873. private static bool IsExplicitCapture (RegexOptions options) {
  874. return (options & RegexOptions.ExplicitCapture) != 0;
  875. }
  876. private static bool IsSingleline (RegexOptions options) {
  877. return (options & RegexOptions.Singleline) != 0;
  878. }
  879. private static bool IsIgnorePatternWhitespace (RegexOptions options) {
  880. return (options & RegexOptions.IgnorePatternWhitespace) != 0;
  881. }
  882. private static bool IsRightToLeft (RegexOptions options) {
  883. return (options & RegexOptions.RightToLeft) != 0;
  884. }
  885. private static bool IsECMAScript (RegexOptions options) {
  886. return (options & RegexOptions.ECMAScript) != 0;
  887. }
  888. // exception creation
  889. private ArgumentException NewParseException (string msg) {
  890. msg = "parsing \"" + pattern + "\" - " + msg;
  891. return new ArgumentException (msg, pattern);
  892. }
  893. private string pattern;
  894. private int ptr;
  895. private ArrayList caps;
  896. private Hashtable refs;
  897. private int num_groups;
  898. }
  899. }