parser.cs 26 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109
  1. //
  2. // assembly: System
  3. // namespace: System.Text.RegularExpressions
  4. // file: parser.cs
  5. //
  6. // author: Dan Lewis ([email protected])
  7. // (c) 2002
  8. using System;
  9. using System.Collections;
  10. using System.Globalization;
  11. namespace System.Text.RegularExpressions.Syntax {
  12. class Parser {
  13. public static int ParseDecimal (string str, ref int ptr) {
  14. return ParseNumber (str, ref ptr, 10, 1, Int32.MaxValue);
  15. }
  16. public static int ParseOctal (string str, ref int ptr) {
  17. return ParseNumber (str, ref ptr, 8, 1, 3);
  18. }
  19. public static int ParseHex (string str, ref int ptr, int digits) {
  20. return ParseNumber (str, ref ptr, 16, digits, digits);
  21. }
  22. public static int ParseNumber (string str, ref int ptr, int b, int min, int max) {
  23. int p = ptr, n = 0, digits = 0, d;
  24. if (max < min)
  25. max = Int32.MaxValue;
  26. while (digits < max && p < str.Length) {
  27. d = ParseDigit (str[p ++], b, digits);
  28. if (d < 0) {
  29. -- p;
  30. break;
  31. }
  32. n = n * b + d;
  33. ++ digits;
  34. }
  35. if (digits < min)
  36. return -1;
  37. ptr = p;
  38. return n;
  39. }
  40. public static string ParseName (string str, ref int ptr) {
  41. if (Char.IsDigit (str[ptr])) {
  42. int gid = ParseNumber (str, ref ptr, 10, 1, 0);
  43. if (gid > 0)
  44. return gid.ToString ();
  45. return null;
  46. }
  47. int start = ptr;
  48. for (;;) {
  49. if (!IsNameChar (str[ptr]))
  50. break;
  51. ++ ptr;
  52. }
  53. if (ptr - start > 0)
  54. return str.Substring (start, ptr - start);
  55. return null;
  56. }
  57. public static string Escape (string str) {
  58. string result = "";
  59. for (int i = 0; i < str.Length; ++ i) {
  60. char c = str[i];
  61. switch (c) {
  62. case '\\': case '*': case '+': case '?': case '|':
  63. case '{': case '[': case '(': case ')': case '^':
  64. case '$': case '.': case '#': case ' ':
  65. result += "\\" + c;
  66. break;
  67. case '\t': result += "\\t"; break;
  68. case '\n': result += "\\n"; break;
  69. case '\r': result += "\\r"; break;
  70. case '\f': result += "\\f"; break;
  71. default: result += c; break;
  72. }
  73. }
  74. return result;
  75. }
  76. public static string Unescape (string str) {
  77. return new Parser ().ParseString (str);
  78. }
  79. // public instance
  80. public Parser () {
  81. this.caps = new ArrayList ();
  82. this.refs = new Hashtable ();
  83. }
  84. public RegularExpression ParseRegularExpression (string pattern, RegexOptions options) {
  85. this.pattern = pattern;
  86. this.ptr = 0;
  87. caps.Clear ();
  88. refs.Clear ();
  89. this.num_groups = 0;
  90. try {
  91. RegularExpression re = new RegularExpression ();
  92. ParseGroup (re, options, null);
  93. ResolveReferences ();
  94. re.GroupCount = num_groups;
  95. return re;
  96. }
  97. catch (IndexOutOfRangeException) {
  98. throw NewParseException ("Unexpected end of pattern.");
  99. }
  100. }
  101. public IDictionary GetMapping () {
  102. Hashtable mapping = new Hashtable ();
  103. foreach (CapturingGroup group in caps) {
  104. if (group.Name != null)
  105. mapping.Add (group.Name, group.Number);
  106. }
  107. return mapping;
  108. }
  109. // private methods
  110. private void ParseGroup (Group group, RegexOptions options, Assertion assertion) {
  111. bool is_top_level = group is RegularExpression;
  112. Alternation alternation = null;
  113. string literal = null;
  114. Group current = new Group ();
  115. Expression expr = null;
  116. bool closed = false;
  117. while (true) {
  118. ConsumeWhitespace (IsIgnorePatternWhitespace (options));
  119. if (ptr >= pattern.Length)
  120. break;
  121. // (1) Parse for Expressions
  122. char ch = pattern[ptr ++];
  123. switch (ch) {
  124. case '^': {
  125. Position pos =
  126. IsMultiline (options) ? Position.StartOfLine : Position.Start;
  127. expr = new PositionAssertion (pos);
  128. break;
  129. }
  130. case '$': {
  131. Position pos =
  132. IsMultiline (options) ? Position.EndOfLine : Position.End;
  133. expr = new PositionAssertion (pos);
  134. break;
  135. }
  136. case '.': {
  137. Category cat =
  138. IsSingleline (options) ? Category.AnySingleline : Category.Any;
  139. expr = new CharacterClass (cat, false);
  140. break;
  141. }
  142. case '\\': {
  143. int c = ParseEscape ();
  144. if (c >= 0)
  145. ch = (char)c;
  146. else {
  147. expr = ParseSpecial (options);
  148. if (expr == null)
  149. ch = pattern[ptr ++]; // default escape
  150. }
  151. break;
  152. }
  153. case '[': {
  154. expr = ParseCharacterClass (options);
  155. break;
  156. }
  157. case '(': {
  158. bool ignore = IsIgnoreCase (options);
  159. expr = ParseGroupingConstruct (ref options);
  160. if (expr == null) {
  161. if (literal != null && IsIgnoreCase (options) != ignore) {
  162. current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
  163. literal = null;
  164. }
  165. continue;
  166. }
  167. break;
  168. }
  169. case ')': {
  170. closed = true;
  171. goto EndOfGroup;
  172. }
  173. case '|': {
  174. if (literal != null) {
  175. current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
  176. literal = null;
  177. }
  178. if (assertion != null) {
  179. if (assertion.TrueExpression == null)
  180. assertion.TrueExpression = current;
  181. else if (assertion.FalseExpression == null)
  182. assertion.FalseExpression = current;
  183. else
  184. throw NewParseException ("Too many | in (?()|).");
  185. }
  186. else {
  187. if (alternation == null)
  188. alternation = new Alternation ();
  189. alternation.AddAlternative (current);
  190. }
  191. current = new Group ();
  192. continue;
  193. }
  194. case '*': case '+': case '?': case '{': {
  195. throw NewParseException ("Bad quantifier.");
  196. }
  197. default:
  198. break; // literal character
  199. }
  200. ConsumeWhitespace (IsIgnorePatternWhitespace (options));
  201. // (2) Check for Repetitions
  202. if (ptr < pattern.Length) {
  203. char k = pattern[ptr];
  204. if (k == '?' || k == '*' || k == '+' || k == '{') {
  205. ++ ptr;
  206. int min = 0, max = 0;
  207. bool lazy = false;
  208. switch (k) {
  209. case '?': min = 0; max = 1; break;
  210. case '*': min = 0; max = 0xffff; break;
  211. case '+': min = 1; max = 0xffff; break;
  212. case '{': ParseRepetitionBounds (out min, out max, options); break;
  213. }
  214. ConsumeWhitespace (IsIgnorePatternWhitespace (options));
  215. if (ptr < pattern.Length && pattern[ptr] == '?') {
  216. ++ ptr;
  217. lazy = true;
  218. }
  219. Repetition repetition = new Repetition (min, max, lazy);
  220. if (expr == null)
  221. repetition.Expression = new Literal (ch.ToString (), IsIgnoreCase (options));
  222. else
  223. repetition.Expression = expr;
  224. expr = repetition;
  225. }
  226. }
  227. // (3) Append Expression and/or Literal
  228. if (expr == null) {
  229. if (literal == null)
  230. literal = "";
  231. literal += ch;
  232. }
  233. else {
  234. if (literal != null) {
  235. current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
  236. literal = null;
  237. }
  238. current.AppendExpression (expr);
  239. expr = null;
  240. }
  241. if (is_top_level && ptr >= pattern.Length)
  242. goto EndOfGroup;
  243. }
  244. EndOfGroup:
  245. if (is_top_level && closed)
  246. throw NewParseException ("Too many )'s.");
  247. if (!is_top_level && !closed)
  248. throw NewParseException ("Not enough )'s.");
  249. // clean up literals and alternations
  250. if (literal != null)
  251. current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
  252. if (assertion != null) {
  253. if (assertion.TrueExpression == null)
  254. assertion.TrueExpression = current;
  255. else
  256. assertion.FalseExpression = current;
  257. group.AppendExpression (assertion);
  258. }
  259. else if (alternation != null) {
  260. alternation.AddAlternative (current);
  261. group.AppendExpression (alternation);
  262. }
  263. else
  264. group.AppendExpression (current);
  265. }
  266. private Expression ParseGroupingConstruct (ref RegexOptions options) {
  267. if (pattern[ptr] != '?') {
  268. Group group;
  269. if (IsExplicitCapture (options))
  270. group = new Group ();
  271. else {
  272. group = new CapturingGroup ();
  273. caps.Add (group);
  274. }
  275. ParseGroup (group, options, null);
  276. return group;
  277. }
  278. else
  279. ++ ptr;
  280. switch (pattern[ptr]) {
  281. case ':': { // non-capturing group
  282. ++ ptr;
  283. Group group = new Group ();
  284. ParseGroup (group, options, null);
  285. return group;
  286. }
  287. case '>': { // non-backtracking group
  288. ++ ptr;
  289. Group group = new NonBacktrackingGroup ();
  290. ParseGroup (group, options, null);
  291. return group;
  292. }
  293. case 'i': case 'm': case 'n':
  294. case 's': case 'x': case '-': { // options
  295. RegexOptions o = options;
  296. ParseOptions (ref o, false);
  297. if (pattern[ptr] == '-') {
  298. ++ ptr;
  299. ParseOptions (ref o, true);
  300. }
  301. if (pattern[ptr] == ':') { // pass options to child group
  302. ++ ptr;
  303. Group group = new Group ();
  304. ParseGroup (group, o, null);
  305. return group;
  306. }
  307. else if (pattern[ptr] == ')') { // change options of enclosing group
  308. ++ ptr;
  309. options = o;
  310. return null;
  311. }
  312. else
  313. throw NewParseException ("Bad options");
  314. }
  315. case '<': case '=': case '!': { // lookahead/lookbehind
  316. ExpressionAssertion asn = new ExpressionAssertion ();
  317. if (!ParseAssertionType (asn))
  318. goto case '\''; // it's a (?<name> ) construct
  319. Group test = new Group ();
  320. ParseGroup (test, options, null);
  321. asn.TestExpression = test;
  322. return asn;
  323. }
  324. case '\'': { // named/balancing group
  325. char delim;
  326. if (pattern[ptr] == '<')
  327. delim = '>';
  328. else
  329. delim = '\'';
  330. ++ ptr;
  331. string name = ParseName ();
  332. if (pattern[ptr] == delim) {
  333. // capturing group
  334. if (name == null)
  335. throw NewParseException ("Bad group name.");
  336. ++ ptr;
  337. CapturingGroup cap = new CapturingGroup ();
  338. cap.Name = name;
  339. caps.Add (cap);
  340. ParseGroup (cap, options, null);
  341. return cap;
  342. }
  343. else if (pattern[ptr] == '-') {
  344. // balancing group
  345. ++ ptr;
  346. string balance_name = ParseName ();
  347. if (balance_name == null || pattern[ptr] != delim)
  348. throw NewParseException ("Bad balancing group name.");
  349. ++ ptr;
  350. BalancingGroup bal = new BalancingGroup ();
  351. bal.Name = name;
  352. caps.Add (bal);
  353. refs.Add (bal, balance_name);
  354. return bal;
  355. }
  356. else
  357. throw NewParseException ("Bad group name.");
  358. }
  359. case '(': { // expression/capture test
  360. Assertion asn;
  361. ++ ptr;
  362. int p = ptr;
  363. string name = ParseName ();
  364. if (name == null || pattern[ptr] != ')') { // expression test
  365. // FIXME MS implementation doesn't seem to
  366. // implement this version of (?(x) ...)
  367. ptr = p;
  368. ExpressionAssertion expr_asn = new ExpressionAssertion ();
  369. if (pattern[ptr] == '?') {
  370. ++ ptr;
  371. if (!ParseAssertionType (expr_asn))
  372. throw NewParseException ("Bad conditional.");
  373. }
  374. else {
  375. expr_asn.Negate = false;
  376. expr_asn.Reverse = false;
  377. }
  378. Group test = new Group ();
  379. ParseGroup (test, options, null);
  380. expr_asn.TestExpression = test;
  381. asn = expr_asn;
  382. }
  383. else { // capture test
  384. ++ ptr;
  385. asn = new CaptureAssertion ();
  386. refs.Add (asn, name);
  387. }
  388. Group group = new Group ();
  389. ParseGroup (group, options, asn);
  390. return group;
  391. }
  392. case '#': { // comment
  393. ++ ptr;
  394. while (pattern[ptr ++] != ')') {
  395. if (ptr >= pattern.Length)
  396. throw NewParseException ("Unterminated (?#...) comment.");
  397. }
  398. return null;
  399. }
  400. default: // error
  401. throw NewParseException ("Bad grouping construct.");
  402. }
  403. }
  404. private bool ParseAssertionType (ExpressionAssertion assertion) {
  405. if (pattern[ptr] == '<') {
  406. switch (pattern[ptr + 1]) {
  407. case '=':
  408. assertion.Negate = false;
  409. break;
  410. case '!':
  411. assertion.Negate = true;
  412. break;
  413. default:
  414. return false;
  415. }
  416. assertion.Reverse = true;
  417. ptr += 2;
  418. }
  419. else {
  420. switch (pattern[ptr]) {
  421. case '=':
  422. assertion.Negate = false;
  423. break;
  424. case '!':
  425. assertion.Negate = true;
  426. break;
  427. default:
  428. return false;
  429. }
  430. assertion.Reverse = false;
  431. ptr += 1;
  432. }
  433. return true;
  434. }
  435. private void ParseOptions (ref RegexOptions options, bool negate) {
  436. for (;;) {
  437. switch (pattern[ptr]) {
  438. case 'i':
  439. if (negate)
  440. options &= ~RegexOptions.IgnoreCase;
  441. else
  442. options |= RegexOptions.IgnoreCase;
  443. break;
  444. case 'm':
  445. if (negate)
  446. options &= ~RegexOptions.Multiline;
  447. else
  448. options |= RegexOptions.Multiline;
  449. break;
  450. case 'n':
  451. if (negate)
  452. options &= ~RegexOptions.ExplicitCapture;
  453. else
  454. options |= RegexOptions.ExplicitCapture;
  455. break;
  456. case 's':
  457. if (negate)
  458. options &= ~RegexOptions.Singleline;
  459. else
  460. options |= RegexOptions.Singleline;
  461. break;
  462. case 'x':
  463. if (negate)
  464. options &= ~RegexOptions.IgnorePatternWhitespace;
  465. else
  466. options |= RegexOptions.IgnorePatternWhitespace;
  467. break;
  468. default:
  469. return;
  470. }
  471. ++ ptr;
  472. }
  473. }
  474. private Expression ParseCharacterClass (RegexOptions options) {
  475. bool negate, ecma;
  476. if (pattern[ptr] == '^') {
  477. negate = true;
  478. ++ ptr;
  479. }
  480. else
  481. negate = false;
  482. ecma = IsECMAScript (options);
  483. CharacterClass cls = new CharacterClass (negate, IsIgnoreCase (options));
  484. if (pattern[ptr] == ']') {
  485. cls.AddCharacter (']');
  486. ++ ptr;
  487. }
  488. int c = -1;
  489. int last = -1;
  490. bool range = false;
  491. bool closed = false;
  492. while (ptr < pattern.Length) {
  493. c = pattern[ptr ++];
  494. if (c == ']') {
  495. closed = true;
  496. break;
  497. }
  498. if (c == '-') {
  499. range = true;
  500. continue;
  501. }
  502. if (c == '\\') {
  503. c = ParseEscape ();
  504. if (c < 0) {
  505. // didn't recognize escape
  506. c = pattern[ptr ++];
  507. switch (c) {
  508. case 'b': c = '\b'; break;
  509. case 'd':
  510. cls.AddCategory (ecma ? Category.EcmaDigit : Category.Digit, false);
  511. last = -1;
  512. continue;
  513. case 'w':
  514. cls.AddCategory (ecma ? Category.EcmaWord : Category.Word, false);
  515. last = -1;
  516. continue;
  517. case 's':
  518. cls.AddCategory (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, false);
  519. last = -1;
  520. continue;
  521. case 'p':
  522. cls.AddCategory (ParseUnicodeCategory (), false); // ignore ecma
  523. last = -1;
  524. continue;
  525. case 'D':
  526. cls.AddCategory (ecma ? Category.EcmaDigit : Category.Digit, true);
  527. last = -1;
  528. continue;
  529. case 'W':
  530. cls.AddCategory (ecma ? Category.EcmaWord : Category.Word, true);
  531. last = -1;
  532. continue;
  533. case 'S':
  534. cls.AddCategory (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, true);
  535. last = -1;
  536. continue;
  537. case 'P':
  538. cls.AddCategory (ParseUnicodeCategory (), true);
  539. last = -1;
  540. continue;
  541. default: break; // add escaped character
  542. }
  543. }
  544. }
  545. if (range) {
  546. if (c < last)
  547. throw NewParseException ("[x-y] range in reverse order.");
  548. if (last >=0 )
  549. cls.AddRange ((char)last, (char)c);
  550. else {
  551. cls.AddCharacter ((char)c);
  552. cls.AddCharacter ('-');
  553. }
  554. range = false;
  555. last = -1;
  556. }
  557. else {
  558. cls.AddCharacter ((char)c);
  559. last = c;
  560. }
  561. }
  562. if (!closed)
  563. throw NewParseException ("Unterminated [] set.");
  564. if (range)
  565. cls.AddCharacter ('-');
  566. return cls;
  567. }
  568. private void ParseRepetitionBounds (out int min, out int max, RegexOptions options) {
  569. int n, m;
  570. /* check syntax */
  571. ConsumeWhitespace (IsIgnorePatternWhitespace (options));
  572. n = ParseNumber (10, 1, 0);
  573. if (n < 0)
  574. throw NewParseException ("Illegal {x,y} - bad value of x.");
  575. ConsumeWhitespace (IsIgnorePatternWhitespace (options));
  576. switch (pattern[ptr ++]) {
  577. case '}':
  578. m = n;
  579. break;
  580. case ',':
  581. ConsumeWhitespace (IsIgnorePatternWhitespace (options));
  582. m = ParseNumber (10, 1, 0);
  583. ConsumeWhitespace (IsIgnorePatternWhitespace (options));
  584. if (pattern[ptr ++] != '}')
  585. throw NewParseException ("Illegal {x,y} - bad value of y.");
  586. break;
  587. default:
  588. throw NewParseException ("Illegal {x,y}");
  589. }
  590. /* check bounds and ordering */
  591. if (n >= 0xffff || m >= 0xffff)
  592. throw NewParseException ("Illegal {x, y} - maximum of 65535.");
  593. if (m >= 0 && m < n)
  594. throw NewParseException ("Illegal {x, y} with x > y.");
  595. /* assign min and max */
  596. min = n;
  597. if (m > 0)
  598. max = m;
  599. else
  600. max = 0xffff;
  601. }
  602. private Category ParseUnicodeCategory () {
  603. if (pattern[ptr ++] != '{')
  604. throw NewParseException ("Incomplete \\p{X} character escape.");
  605. string name = ParseName (pattern, ref ptr);
  606. if (name == null)
  607. throw NewParseException ("Incomplete \\p{X} character escape.");
  608. Category cat = CategoryUtils.CategoryFromName (name);
  609. if (cat == Category.None)
  610. throw NewParseException ("Unknown property '" + name + "'.");
  611. if (pattern[ptr ++] != '}')
  612. throw NewParseException ("Incomplete \\p{X} character escape.");
  613. return cat;
  614. }
  615. private Expression ParseSpecial (RegexOptions options) {
  616. int p = ptr;
  617. bool ecma = IsECMAScript (options);
  618. Expression expr = null;
  619. switch (pattern[ptr ++]) {
  620. // categories
  621. case 'd':
  622. expr = new CharacterClass (ecma ? Category.EcmaDigit : Category.Digit, false);
  623. break;
  624. case 'w':
  625. expr = new CharacterClass (ecma ? Category.EcmaWord : Category.Word, false);
  626. break;
  627. case 's':
  628. expr = new CharacterClass (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, false);
  629. break;
  630. case 'p':
  631. // this is odd - ECMAScript isn't supposed to support Unicode,
  632. // yet \p{..} compiles and runs under the MS implementation
  633. // identically to canonical mode. That's why I'm ignoring the
  634. // value of ecma here.
  635. expr = new CharacterClass (ParseUnicodeCategory (), false);
  636. break;
  637. case 'D':
  638. expr = new CharacterClass (ecma ? Category.EcmaDigit : Category.Digit, true);
  639. break;
  640. case 'W':
  641. expr = new CharacterClass (ecma ? Category.EcmaWord : Category.Word, true);
  642. break;
  643. case 'S':
  644. expr = new CharacterClass (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, true);
  645. break;
  646. case 'P':
  647. expr = new CharacterClass (ParseUnicodeCategory (), true);
  648. break;
  649. // positions
  650. case 'A': expr = new PositionAssertion (Position.StartOfString); break;
  651. case 'Z': expr = new PositionAssertion (Position.End); break;
  652. case 'z': expr = new PositionAssertion (Position.EndOfString); break;
  653. case 'G': expr = new PositionAssertion (Position.StartOfScan); break;
  654. case 'b': expr = new PositionAssertion (Position.Boundary); break;
  655. case 'B': expr = new PositionAssertion (Position.NonBoundary); break;
  656. // references
  657. case '1': case '2': case '3': case '4': case '5':
  658. case '6': case '7': case '8': case '9': {
  659. ptr --;
  660. int n = ParseNumber (10, 1, 0);
  661. if (n < 0) {
  662. ptr = p;
  663. return null;
  664. }
  665. // FIXME test if number is within number of assigned groups
  666. // this may present a problem for right-to-left matching
  667. Reference reference = new Reference (IsIgnoreCase (options));
  668. refs.Add (reference, n.ToString ());
  669. expr = reference;
  670. break;
  671. }
  672. case 'k': {
  673. char delim = pattern[ptr ++];
  674. if (delim == '<')
  675. delim = '>';
  676. else if (delim != '\'')
  677. throw NewParseException ("Malformed \\k<...> named backreference.");
  678. string name = ParseName ();
  679. if (name == null || pattern[ptr] != delim)
  680. throw NewParseException ("Malformed \\k<...> named backreference.");
  681. ++ ptr;
  682. Reference reference = new Reference (IsIgnoreCase (options));
  683. refs.Add (reference, name);
  684. expr = reference;
  685. break;
  686. }
  687. default:
  688. expr = null;
  689. break;
  690. }
  691. if (expr == null)
  692. ptr = p;
  693. return expr;
  694. }
  695. private int ParseEscape () {
  696. int p = ptr;
  697. int c;
  698. switch (pattern[ptr ++]) {
  699. // standard escapes (except \b)
  700. case 'a': return '\u0007';
  701. case 't': return '\u0009';
  702. case 'r': return '\u000d';
  703. case 'v': return '\u000b';
  704. case 'f': return '\u000c';
  705. case 'n': return '\u000a';
  706. case 'e': return '\u001b';
  707. case '\\': return '\\';
  708. // character codes
  709. case '0': return ParseOctal (pattern, ref ptr);
  710. case 'x':
  711. c = ParseHex (pattern, ref ptr, 2);
  712. if (c < 0)
  713. throw NewParseException ("Insufficient hex digits");
  714. return c;
  715. case 'u':
  716. c = ParseHex (pattern, ref ptr, 4);
  717. if (c < 0)
  718. throw NewParseException ("Insufficient hex digits");
  719. return c;
  720. // control characters
  721. case 'c':
  722. c = pattern[p ++];
  723. if (c >= 'A' && c <= 'Z')
  724. return c - 'A';
  725. else if (c >= '@' && c <= '_')
  726. return c - '@';
  727. else
  728. throw NewParseException ("Unrecognized control character.");
  729. // unknown escape
  730. default:
  731. ptr = p;
  732. return -1;
  733. }
  734. }
  735. private string ParseName () {
  736. return Parser.ParseName (pattern, ref ptr);
  737. }
  738. private static bool IsNameChar (char c) {
  739. UnicodeCategory cat = Char.GetUnicodeCategory (c);
  740. if (cat == UnicodeCategory.ModifierLetter)
  741. return false;
  742. if (cat == UnicodeCategory.ConnectorPunctuation)
  743. return true;
  744. return Char.IsLetterOrDigit (c);
  745. }
  746. private int ParseNumber (int b, int min, int max) {
  747. return Parser.ParseNumber (pattern, ref ptr, b, min, max);
  748. }
  749. private int ParseDecimal () {
  750. return Parser.ParseDecimal (pattern, ref ptr);
  751. }
  752. private static int ParseDigit (char c, int b, int n) {
  753. switch (b) {
  754. case 8:
  755. if (c >= '0' && c <= '7')
  756. return c - '0';
  757. else
  758. return -1;
  759. case 10:
  760. if (c >= '0' && c <= '9')
  761. return c - '0';
  762. else
  763. return -1;
  764. case 16:
  765. if (c >= '0' && c <= '9')
  766. return c - '0';
  767. else if (c >= 'a' && c <= 'f')
  768. return 10 + c - 'a';
  769. else if (c >= 'A' && c <= 'F')
  770. return 10 + c - 'A';
  771. else
  772. return -1;
  773. default:
  774. return -1;
  775. }
  776. }
  777. private void ConsumeWhitespace (bool ignore) {
  778. while (true) {
  779. if (ptr >= pattern.Length)
  780. break;
  781. if (pattern[ptr] == '(') {
  782. if (ptr + 3 >= pattern.Length)
  783. return;
  784. if (pattern[ptr + 1] != '?' || pattern[ptr + 2] != '#')
  785. return;
  786. ptr += 3;
  787. while (pattern[ptr ++] != ')')
  788. /* ignore */ ;
  789. }
  790. else if (ignore && pattern[ptr] == '#') {
  791. while (ptr < pattern.Length && pattern[ptr ++] != '\n')
  792. /* ignore */ ;
  793. }
  794. else if (ignore && Char.IsWhiteSpace (pattern[ptr])) {
  795. while (ptr < pattern.Length && Char.IsWhiteSpace (pattern[ptr]))
  796. ++ ptr;
  797. }
  798. else
  799. return;
  800. }
  801. }
  802. private string ParseString (string pattern) {
  803. this.pattern = pattern;
  804. this.ptr = 0;
  805. string result = "";
  806. while (ptr < pattern.Length) {
  807. int c = pattern[ptr ++];
  808. if (c == '\\')
  809. c = ParseEscape ();
  810. result += (char)c;
  811. }
  812. return result;
  813. }
  814. private void ResolveReferences () {
  815. int gid = 1;
  816. Hashtable dict = new Hashtable ();
  817. // number unnamed groups
  818. foreach (CapturingGroup group in caps) {
  819. if (group.Name == null) {
  820. dict.Add (gid.ToString (), group);
  821. group.Number = gid ++;
  822. ++ num_groups;
  823. }
  824. }
  825. // number named groups
  826. foreach (CapturingGroup group in caps) {
  827. if (group.Name != null) {
  828. if (!dict.Contains (group.Name)) {
  829. dict.Add (group.Name, group);
  830. group.Number = gid ++;
  831. ++ num_groups;
  832. }
  833. else {
  834. CapturingGroup prev = (CapturingGroup)dict[group.Name];
  835. group.Number = prev.Number;
  836. }
  837. }
  838. }
  839. // resolve references
  840. foreach (Expression expr in refs.Keys) {
  841. string name = (string)refs[expr];
  842. if (!dict.Contains (name)) {
  843. throw NewParseException ("Reference to undefined group " +
  844. (Char.IsDigit (name[0]) ? "number " : "name ") +
  845. name);
  846. }
  847. CapturingGroup group = (CapturingGroup)dict[name];
  848. if (expr is Reference)
  849. ((Reference)expr).CapturingGroup = group;
  850. else if (expr is CaptureAssertion)
  851. ((CaptureAssertion)expr).CapturingGroup = group;
  852. else if (expr is BalancingGroup)
  853. ((BalancingGroup)expr).Balance = group;
  854. }
  855. }
  856. // flag helper functions
  857. private static bool IsIgnoreCase (RegexOptions options) {
  858. return (options & RegexOptions.IgnoreCase) != 0;
  859. }
  860. private static bool IsMultiline (RegexOptions options) {
  861. return (options & RegexOptions.Multiline) != 0;
  862. }
  863. private static bool IsExplicitCapture (RegexOptions options) {
  864. return (options & RegexOptions.ExplicitCapture) != 0;
  865. }
  866. private static bool IsSingleline (RegexOptions options) {
  867. return (options & RegexOptions.Singleline) != 0;
  868. }
  869. private static bool IsIgnorePatternWhitespace (RegexOptions options) {
  870. return (options & RegexOptions.IgnorePatternWhitespace) != 0;
  871. }
  872. private static bool IsRightToLeft (RegexOptions options) {
  873. return (options & RegexOptions.RightToLeft) != 0;
  874. }
  875. private static bool IsECMAScript (RegexOptions options) {
  876. return (options & RegexOptions.ECMAScript) != 0;
  877. }
  878. // exception creation
  879. private ArgumentException NewParseException (string msg) {
  880. msg = "parsing \"" + pattern + "\" - " + msg;
  881. return new ArgumentException (msg, pattern);
  882. }
  883. private string pattern;
  884. private int ptr;
  885. private ArrayList caps;
  886. private Hashtable refs;
  887. private int num_groups;
  888. }
  889. }