parser.cs 27 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165
  1. //
  2. // assembly: System
  3. // namespace: System.Text.RegularExpressions
  4. // file: parser.cs
  5. //
  6. // author: Dan Lewis ([email protected])
  7. // (c) 2002
  8. //
  9. // Permission is hereby granted, free of charge, to any person obtaining
  10. // a copy of this software and associated documentation files (the
  11. // "Software"), to deal in the Software without restriction, including
  12. // without limitation the rights to use, copy, modify, merge, publish,
  13. // distribute, sublicense, and/or sell copies of the Software, and to
  14. // permit persons to whom the Software is furnished to do so, subject to
  15. // the following conditions:
  16. //
  17. // The above copyright notice and this permission notice shall be
  18. // included in all copies or substantial portions of the Software.
  19. //
  20. // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  21. // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  22. // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  23. // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
  24. // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  25. // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  26. // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  27. //
  28. using System;
  29. using System.Collections;
  30. using System.Globalization;
  31. namespace System.Text.RegularExpressions.Syntax {
  32. class Parser {
  // Reads a run of one or more base-10 digits from str, starting at ptr.
  // Returns the parsed value, or -1 (leaving ptr untouched) when no digit
  // is present. On success ptr is moved past the digits consumed.
  public static int ParseDecimal (string str, ref int ptr) {
      const int radix = 10;
      const int fewest = 1;
      return ParseNumber (str, ref ptr, radix, fewest, Int32.MaxValue);
  }
  // Reads between one and three base-8 digits from str, starting at ptr.
  // Returns the parsed value, or -1 (leaving ptr untouched) when no octal
  // digit is present.
  public static int ParseOctal (string str, ref int ptr) {
      const int radix = 8;
      const int fewest = 1;
      const int most = 3;
      return ParseNumber (str, ref ptr, radix, fewest, most);
  }
  // Reads exactly `digits` base-16 digits from str, starting at ptr.
  // Returns the parsed value, or -1 (leaving ptr untouched) when fewer
  // than `digits` hexadecimal digits are available.
  public static int ParseHex (string str, ref int ptr, int digits) {
      const int radix = 16;
      int exact = digits;
      return ParseNumber (str, ref ptr, radix, exact, exact);
  }
  // Parses an unsigned number in base b (8, 10 or 16) from str, starting
  // at ptr.
  //
  //   min - fewest digits that must be present
  //   max - most digits that will be consumed; when max < min the number
  //         of digits is unlimited
  //
  // Returns the parsed value and advances ptr past the digits consumed.
  // Returns -1 and leaves ptr unchanged when fewer than min digits are
  // found. A value that does not fit in an int saturates at
  // Int32.MaxValue instead of wrapping around, so callers that range-check
  // the result still see it as "too large".
  public static int ParseNumber (string str, ref int ptr, int b, int min, int max) {
      int p = ptr, n = 0, digits = 0;

      if (max < min)
          max = Int32.MaxValue;

      while (digits < max && p < str.Length) {
          int d = ParseDigit (str[p], b, digits);
          if (d < 0)
              break;		// not a digit in this base; leave it unread
          ++ p;

          // n * b + d would exceed Int32.MaxValue: clamp rather than wrap.
          // b is 8, 10 or 16 here, since ParseDigit rejects other bases.
          if (n > (Int32.MaxValue - d) / b)
              n = Int32.MaxValue;
          else
              n = n * b + d;

          ++ digits;
      }

      if (digits < min)
          return -1;

      ptr = p;
      return n;
  }
  // Parses a group name from str, starting at ptr.
  //
  // If the first character is a digit, the name is a decimal group number
  // and is returned in canonical string form; a value that is not greater
  // than zero yields null. Otherwise the longest run of name characters
  // (see IsNameChar) is returned, or null when that run is empty.
  //
  // Indexing past the end of str is not guarded here, so reaching the end
  // of the string raises IndexOutOfRangeException.
  public static string ParseName (string str, ref int ptr) {
      if (Char.IsDigit (str[ptr])) {
          int number = ParseNumber (str, ref ptr, 10, 1, 0);
          return number > 0 ? number.ToString () : null;
      }

      int begin = ptr;
      while (IsNameChar (str[ptr]))
          ++ ptr;

      int length = ptr - begin;
      return length > 0 ? str.Substring (begin, length) : null;
  }
  // Returns a copy of str in which regex metacharacters are escaped.
  //
  // The characters \ * + ? | { [ ( ) ^ $ . # and space are prefixed with
  // a backslash; tab, newline, carriage return and form feed are written
  // as \t, \n, \r and \f. Every other character is copied unchanged.
  //
  // Throws ArgumentNullException when str is null.
  public static string Escape (string str) {
      if (str == null)
          throw new ArgumentNullException ("str");

      // A builder avoids the quadratic cost of repeated concatenation.
      StringBuilder result = new StringBuilder (str.Length);

      foreach (char c in str) {
          switch (c) {
          case '\\': case '*': case '+': case '?': case '|':
          case '{': case '[': case '(': case ')': case '^':
          case '$': case '.': case '#': case ' ':
              result.Append ('\\').Append (c);
              break;

          case '\t': result.Append ("\\t"); break;
          case '\n': result.Append ("\\n"); break;
          case '\r': result.Append ("\\r"); break;
          case '\f': result.Append ("\\f"); break;

          default: result.Append (c); break;
          }
      }

      return result.ToString ();
  }
  // Converts the backslash escapes in str back to the characters they
  // denote by running the text through a fresh parser's ParseString.
  public static string Unescape (string str) {
      Parser parser = new Parser ();
      string plain = parser.ParseString (str);
      return plain;
  }
  99. // public instance
  // Creates a parser with an empty capture list and an empty reference
  // table.
  public Parser () {
      refs = new Hashtable ();
      caps = new ArrayList ();
  }
  // Parses pattern into an expression tree under the given options.
  //
  // All per-parse state (position, capture list, pending references and
  // group count) is reset first, so one Parser instance can be reused.
  // After the tree is built, group references are resolved and the number
  // of groups is stored on the returned expression.
  //
  // The parsing routines index the pattern without bounds checks, so an
  // IndexOutOfRangeException raised while parsing is reported as an
  // ArgumentException saying the pattern ended unexpectedly.
  public RegularExpression ParseRegularExpression (string pattern, RegexOptions options) {
      this.pattern = pattern;
      ptr = 0;
      num_groups = 0;
      caps.Clear ();
      refs.Clear ();

      try {
          RegularExpression root = new RegularExpression ();

          ParseGroup (root, options, null);
          ResolveReferences ();

          root.GroupCount = num_groups;
          return root;
      }
      catch (IndexOutOfRangeException) {
          throw NewParseException ("Unexpected end of pattern.");
      }
  }
  // Builds the name -> group number table for the pattern most recently
  // parsed by ParseRegularExpression.
  //
  // The table always maps "0" to 0. Each named group contributes its name
  // (the first occurrence wins when a name is repeated). Every group
  // number from 1 to num_groups that is not taken by a named group is then
  // added under its decimal string.
  //
  // The numeric pass is bounded by num_groups, the count of distinct
  // numbers handed out by ResolveReferences, not by caps.Count. Bounding
  // it by caps.Count with an exclusive limit dropped the highest-numbered
  // unnamed group, and could add numbers that do not exist when several
  // groups share one name.
  public IDictionary GetMapping () {
      Hashtable mapping = new Hashtable ();
      Hashtable named_numbers = new Hashtable ();

      mapping.Add ("0", 0);

      foreach (CapturingGroup group in caps) {
          if (group.Name != null && !mapping.Contains (group.Name)) {
              mapping.Add (group.Name, group.Number);
              named_numbers [group.Number] = group.Number;
          }
      }

      for (int i = 1; i <= num_groups; i++) {
          string key = i.ToString ();

          // Skip numbers owned by a named group, and keys already present
          // because a group was given a numeric name such as (?<2>...).
          if (named_numbers [i] == null && !mapping.Contains (key))
              mapping.Add (key, i);
      }

      return mapping;
  }
  139. // private methods
  // Parses the body of a group, from the current position up to the
  // matching ')' (or to the end of the pattern for the top-level
  // RegularExpression), and appends the result to `group`.
  //
  //   group     - receives the parsed content
  //   options   - options in force at the start of the group; inline
  //               option constructs such as (?i) change them for the rest
  //               of this group only
  //   assertion - when non-null, this group is the body of a (?(test)a|b)
  //               conditional: the alternatives separated by '|' become
  //               the assertion's TrueExpression and FalseExpression, and
  //               the assertion itself is appended to `group`
  //
  // Consecutive literal characters are collected in `literal` and emitted
  // as one Literal node when something other than a literal follows.
  //
  // Throws ArgumentException (via NewParseException) for a stray
  // quantifier, unbalanced parentheses, or too many '|' in a conditional.
  private void ParseGroup (Group group, RegexOptions options, Assertion assertion) {
      bool is_top_level = group is RegularExpression;

      Alternation alternation = null;
      string literal = null;

      Group current = new Group ();
      Expression expr = null;
      bool closed = false;

      while (true) {
          ConsumeWhitespace (IsIgnorePatternWhitespace (options));
          if (ptr >= pattern.Length)
              break;

          // (1) Parse for Expressions

          char ch = pattern[ptr ++];

          switch (ch) {
          case '^': {
              Position pos =
                  IsMultiline (options) ? Position.StartOfLine : Position.Start;

              expr = new PositionAssertion (pos);
              break;
          }

          case '$': {
              Position pos =
                  IsMultiline (options) ? Position.EndOfLine : Position.End;

              expr = new PositionAssertion (pos);
              break;
          }

          case '.': {
              Category cat =
                  IsSingleline (options) ? Category.AnySingleline : Category.Any;

              expr = new CharacterClass (cat, false);
              break;
          }

          case '\\': {
              int c = ParseEscape ();
              if (c >= 0)
                  ch = (char)c;		// simple character escape
              else {
                  expr = ParseSpecial (options);

                  if (expr == null)
                      ch = pattern[ptr ++];	// default escape: take the character literally
              }
              break;
          }

          case '[': {
              expr = ParseCharacterClass (options);
              break;
          }

          case '(': {
              bool ignore = IsIgnoreCase (options);
              expr = ParseGroupingConstruct (ref options);
              if (expr == null) {
                  // Inline option change or comment. If case sensitivity
                  // changed, the pending literal was read under the OLD
                  // setting, so it is flushed with `ignore`, not with the
                  // updated options.
                  if (literal != null && IsIgnoreCase (options) != ignore) {
                      current.AppendExpression (new Literal (literal, ignore));
                      literal = null;
                  }

                  continue;
              }
              break;
          }

          case ')': {
              closed = true;
              goto EndOfGroup;
          }

          case '|': {
              if (literal != null) {
                  current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
                  literal = null;
              }

              if (assertion != null) {
                  if (assertion.TrueExpression == null)
                      assertion.TrueExpression = current;
                  else if (assertion.FalseExpression == null)
                      assertion.FalseExpression = current;
                  else
                      throw NewParseException ("Too many | in (?()|).");
              }
              else {
                  if (alternation == null)
                      alternation = new Alternation ();

                  alternation.AddAlternative (current);
              }

              current = new Group ();
              continue;
          }

          case '*': case '+': case '?': {
              // a quantifier with nothing in front of it
              throw NewParseException ("Bad quantifier.");
          }

          default:
              break;		// literal character
          }

          ConsumeWhitespace (IsIgnorePatternWhitespace (options));

          // (2) Check for Repetitions

          if (ptr < pattern.Length) {
              char k = pattern[ptr];

              if (k == '?' || k == '*' || k == '+' || k == '{') {
                  ++ ptr;

                  int min = 0, max = 0;
                  bool lazy = false;

                  switch (k) {
                  case '?': min = 0; max = 1; break;
                  case '*': min = 0; max = 0xffff; break;
                  case '+': min = 1; max = 0xffff; break;
                  case '{': ParseRepetitionBounds (out min, out max, options); break;
                  }

                  ConsumeWhitespace (IsIgnorePatternWhitespace (options));
                  if (ptr < pattern.Length && pattern[ptr] == '?') {
                      ++ ptr;
                      lazy = true;
                  }

                  Repetition repetition = new Repetition (min, max, lazy);

                  // A quantifier after a plain character applies to that
                  // character alone, not to the whole pending literal.
                  if (expr == null)
                      repetition.Expression = new Literal (ch.ToString (), IsIgnoreCase (options));
                  else
                      repetition.Expression = expr;

                  expr = repetition;
              }
          }

          // (3) Append Expression and/or Literal

          if (expr == null) {
              if (literal == null)
                  literal = "";
              literal += ch;
          }
          else {
              if (literal != null) {
                  current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
                  literal = null;
              }

              current.AppendExpression (expr);
              expr = null;
          }

          if (is_top_level && ptr >= pattern.Length)
              goto EndOfGroup;
      }

  EndOfGroup:
      if (is_top_level && closed)
          throw NewParseException ("Too many )'s.");
      if (!is_top_level && !closed)
          throw NewParseException ("Not enough )'s.");

      // clean up literals and alternations

      if (literal != null)
          current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));

      if (assertion != null) {
          if (assertion.TrueExpression == null)
              assertion.TrueExpression = current;
          else
              assertion.FalseExpression = current;

          group.AppendExpression (assertion);
      }
      else if (alternation != null) {
          alternation.AddAlternative (current);
          group.AppendExpression (alternation);
      }
      else
          group.AppendExpression (current);
  }
  // Parses what follows an opening '(' that has already been consumed.
  //
  // Returns the expression for the construct, or null when the construct
  // produces no expression of its own: an inline option change "(?imnsx-imnsx)",
  // which updates `options` for the caller, or a "(?#...)" comment.
  //
  // Throws ArgumentException (via NewParseException) for malformed
  // options, group names, conditionals, comments or unknown constructs.
  private Expression ParseGroupingConstruct (ref RegexOptions options) {
      // "(" without "?": an ordinary group, capturing unless
      // ExplicitCapture is on.
      if (pattern[ptr] != '?') {
          Group plain;

          if (IsExplicitCapture (options))
              plain = new Group ();
          else {
              plain = new CapturingGroup ();
              caps.Add (plain);
          }

          ParseGroup (plain, options, null);
          return plain;
      }

      ++ ptr;		// step over '?'

      switch (pattern[ptr]) {
      case ':': {						// (?:...) non-capturing group
          ++ ptr;
          Group inner = new Group ();
          ParseGroup (inner, options, null);
          return inner;
      }

      case '>': {						// (?>...) non-backtracking group
          ++ ptr;
          Group atomic = new NonBacktrackingGroup ();
          ParseGroup (atomic, options, null);
          return atomic;
      }

      case 'i': case 'm': case 'n':
      case 's': case 'x': case '-': {				// inline options
          RegexOptions updated = options;

          ParseOptions (ref updated, false);		// options to switch on
          if (pattern[ptr] == '-') {
              ++ ptr;
              ParseOptions (ref updated, true);	// options to switch off
          }

          char terminator = pattern[ptr];

          if (terminator == ':') {			// options apply to the child group only
              ++ ptr;
              Group scoped = new Group ();
              ParseGroup (scoped, updated, null);
              return scoped;
          }

          if (terminator == ')') {			// options apply to the rest of the enclosing group
              ++ ptr;
              options = updated;
              return null;
          }

          throw NewParseException ("Bad options");
      }

      case '<': case '=': case '!': {				// lookahead / lookbehind
          ExpressionAssertion look = new ExpressionAssertion ();
          if (!ParseAssertionType (look))
              goto case '\'';				// not an assertion: it's (?<name>...)

          Group body = new Group ();
          ParseGroup (body, options, null);
          look.TestExpression = body;
          return look;
      }

      case '\'': {						// named or balancing group
          char delim = pattern[ptr] == '<' ? '>' : '\'';
          ++ ptr;

          string name = ParseName ();

          if (pattern[ptr] == delim) {
              // capturing group

              if (name == null)
                  throw NewParseException ("Bad group name.");

              ++ ptr;
              CapturingGroup named = new CapturingGroup ();
              named.Name = name;
              caps.Add (named);
              ParseGroup (named, options, null);
              return named;
          }
          else if (pattern[ptr] == '-') {
              // balancing group

              ++ ptr;
              string balance_name = ParseName ();
              if (balance_name == null || pattern[ptr] != delim)
                  throw NewParseException ("Bad balancing group name.");

              ++ ptr;
              BalancingGroup balancing = new BalancingGroup ();
              balancing.Name = name;

              if (balancing.IsNamed)
                  caps.Add (balancing);

              // resolved to the actual group later, in ResolveReferences
              refs.Add (balancing, balance_name);

              ParseGroup (balancing, options, null);
              return balancing;
          }
          else
              throw NewParseException ("Bad group name.");
      }

      case '(': {						// (?(test)yes|no) conditional
          Assertion condition;

          ++ ptr;
          int mark = ptr;
          string name = ParseName ();

          if (name == null || pattern[ptr] != ')') {
              // expression test

              // FIXME MS implementation doesn't seem to
              // implement this version of (?(x) ...)

              ptr = mark;
              ExpressionAssertion test_assertion = new ExpressionAssertion ();

              if (pattern[ptr] == '?') {
                  ++ ptr;
                  if (!ParseAssertionType (test_assertion))
                      throw NewParseException ("Bad conditional.");
              }
              else {
                  test_assertion.Negate = false;
                  test_assertion.Reverse = false;
              }

              Group test = new Group ();
              ParseGroup (test, options, null);
              test_assertion.TestExpression = test;
              condition = test_assertion;
          }
          else {
              // capture test

              ++ ptr;
              condition = new CaptureAssertion ();
              refs.Add (condition, name);
          }

          Group branches = new Group ();
          ParseGroup (branches, options, condition);
          return branches;
      }

      case '#': {						// (?#...) comment
          ++ ptr;
          while (pattern[ptr ++] != ')') {
              if (ptr >= pattern.Length)
                  throw NewParseException ("Unterminated (?#...) comment.");
          }
          return null;
      }

      default:						// error
          throw NewParseException ("Bad grouping construct.");
      }
  }
  // Recognises the assertion marker at the current position: "=" or "!"
  // for lookahead, "<=" or "<!" for lookbehind.
  //
  // On a match, sets assertion.Negate (true for the "!" forms) and
  // assertion.Reverse (true for the "<" forms), moves ptr past the marker
  // and returns true. Otherwise returns false, leaving both ptr and the
  // assertion untouched.
  private bool ParseAssertionType (ExpressionAssertion assertion) {
      bool behind = pattern[ptr] == '<';
      char marker = behind ? pattern[ptr + 1] : pattern[ptr];

      if (marker != '=' && marker != '!')
          return false;

      assertion.Negate = marker == '!';
      assertion.Reverse = behind;

      ptr += behind ? 2 : 1;
      return true;
  }
  // Consumes a run of inline option letters (i, m, n, s, x) at the current
  // position and applies each one to `options`: the flag is set when
  // negate is false and cleared when negate is true. Stops, without
  // consuming it, at the first character that is not an option letter.
  private void ParseOptions (ref RegexOptions options, bool negate) {
      while (true) {
          RegexOptions flag;

          switch (pattern[ptr]) {
          case 'i': flag = RegexOptions.IgnoreCase; break;
          case 'm': flag = RegexOptions.Multiline; break;
          case 'n': flag = RegexOptions.ExplicitCapture; break;
          case 's': flag = RegexOptions.Singleline; break;
          case 'x': flag = RegexOptions.IgnorePatternWhitespace; break;
          default:
              return;
          }

          if (negate)
              options &= ~flag;
          else
              options |= flag;

          ++ ptr;
      }
  }
  // Parses a bracketed set; the opening '[' has already been consumed.
  //
  // A leading '^' negates the set, and a ']' directly after the opening
  // (or after the '^') is taken as a literal member. Inside the set,
  // escapes recognised by ParseEscape yield single characters, \b means
  // backspace, \d \w \s \p{..} and their upper-case forms add categories,
  // and any other escaped character is added literally. A '-' between two
  // characters forms a range; a '-' with no usable left or right operand
  // is added as a literal '-'.
  //
  // Throws ArgumentException (via NewParseException) for a range in
  // reverse order or a set that is never closed.
  private Expression ParseCharacterClass (RegexOptions options) {
      bool negate = pattern[ptr] == '^';
      if (negate)
          ++ ptr;

      bool ecma = IsECMAScript (options);
      CharacterClass cls = new CharacterClass (negate, IsIgnoreCase (options));

      if (pattern[ptr] == ']') {
          cls.AddCharacter (']');
          ++ ptr;
      }

      int last = -1;		// previous single character, or -1 if none can start a range
      bool range = false;	// a '-' has been seen and awaits its right operand
      bool closed = false;

      while (ptr < pattern.Length) {
          int c = pattern[ptr ++];

          if (c == ']') {
              closed = true;
              break;
          }

          if (c == '-') {
              range = true;
              continue;
          }

          if (c == '\\') {
              c = ParseEscape ();
              if (c < 0) {
                  // didn't recognize escape

                  c = pattern[ptr ++];
                  switch (c) {
                  case 'b':
                      c = '\b';		// backspace inside a set
                      break;

                  case 'd': case 'D':
                      cls.AddCategory (ecma ? Category.EcmaDigit : Category.Digit, c == 'D');
                      last = -1;
                      continue;

                  case 'w': case 'W':
                      cls.AddCategory (ecma ? Category.EcmaWord : Category.Word, c == 'W');
                      last = -1;
                      continue;

                  case 's': case 'S':
                      cls.AddCategory (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, c == 'S');
                      last = -1;
                      continue;

                  case 'p': case 'P':
                      cls.AddCategory (ParseUnicodeCategory (), c == 'P');	// ignore ecma
                      last = -1;
                      continue;

                  default:
                      break;		// add escaped character
                  }
              }
          }

          if (!range) {
              cls.AddCharacter ((char)c);
              last = c;
              continue;
          }

          if (c < last)
              throw NewParseException ("[x-y] range in reverse order.");

          if (last >= 0)
              cls.AddRange ((char)last, (char)c);
          else {
              // no left operand: both the character and '-' are literals
              cls.AddCharacter ((char)c);
              cls.AddCharacter ('-');
          }

          range = false;
          last = -1;
      }

      if (!closed)
          throw NewParseException ("Unterminated [] set.");

      if (range)
          cls.AddCharacter ('-');		// trailing '-' is a literal

      return cls;
  }
  // Parses the inside of a {n}, {n,}, {n,m} or {,m} quantifier; the
  // opening '{' has already been consumed.
  //
  //   min - receives the lower bound (0 when it is omitted)
  //   max - receives the upper bound, or 0xffff (used as "unbounded")
  //         when it is omitted
  //
  // An explicit upper bound of zero is honoured: {0} and {0,0} yield
  // max == 0 rather than being mistaken for "no upper bound".
  //
  // Throws ArgumentException (via NewParseException) when the syntax is
  // malformed, a bound is 0xffff or larger, or min exceeds max.
  private void ParseRepetitionBounds (out int min, out int max, RegexOptions options) {
      int n, m;
      bool skip_ws = IsIgnorePatternWhitespace (options);

      /* check syntax */

      ConsumeWhitespace (skip_ws);
      if (pattern[ptr] == ',') {
          n = -1;		// lower bound omitted
      } else {
          n = ParseNumber (10, 1, 0);
          ConsumeWhitespace (skip_ws);
      }

      switch (pattern[ptr ++]) {
      case '}':
          m = n;
          break;
      case ',':
          ConsumeWhitespace (skip_ws);
          m = ParseNumber (10, 1, 0);	// -1 when the upper bound is omitted
          ConsumeWhitespace (skip_ws);
          if (pattern[ptr ++] != '}')
              throw NewParseException ("Illegal {x,y} - bad value of y.");
          break;
      default:
          throw NewParseException ("Illegal {x,y}");
      }

      /* check bounds and ordering */

      if (n >= 0xffff || m >= 0xffff)
          throw NewParseException ("Illegal {x, y} - maximum of 65535.");
      if (m >= 0 && m < n)
          throw NewParseException ("Illegal {x, y} with x > y.");

      /* assign min and max */

      // A missing lower bound (-1) means zero; never hand out a negative count.
      min = n < 0 ? 0 : n;

      // Only a missing upper bound (-1) means "unbounded"; zero is a real bound.
      max = m >= 0 ? m : 0xffff;
  }
  // Parses the "{name}" part of a \p{name} or \P{name} escape and returns
  // the category that CategoryUtils.CategoryFromName maps the name to.
  //
  // Throws ArgumentException (via NewParseException) when a brace or the
  // name is missing, or when the name maps to Category.None.
  private Category ParseUnicodeCategory () {
      const string incomplete = "Incomplete \\p{X} character escape.";

      if (pattern[ptr ++] != '{')
          throw NewParseException (incomplete);

      string name = ParseName (pattern, ref ptr);
      if (name == null)
          throw NewParseException (incomplete);

      Category category = CategoryUtils.CategoryFromName (name);
      if (category == Category.None)
          throw NewParseException ("Unknown property '" + name + "'.");

      if (pattern[ptr ++] != '}')
          throw NewParseException (incomplete);

      return category;
  }
  // Parses a backslash escape that stands for something other than a
  // single character; the backslash has already been consumed.
  //
  // Handles category escapes (\d \w \s \p{..} and their upper-case
  // negations), position assertions (\A \Z \z \G \b \B), numbered
  // back-references (\1 and up) and named back-references (\k<name> or
  // \k'name'). Back-references are recorded in `refs` and bound to their
  // group later by ResolveReferences.
  //
  // Returns the expression, or null with ptr restored to where it was on
  // entry when the escape is not one of the above.
  private Expression ParseSpecial (RegexOptions options) {
      int start = ptr;
      bool ecma = IsECMAScript (options);
      Expression result = null;

      char code = pattern[ptr ++];

      switch (code) {

      // categories

      case 'd': case 'D':
          result = new CharacterClass (ecma ? Category.EcmaDigit : Category.Digit, code == 'D');
          break;

      case 'w': case 'W':
          result = new CharacterClass (ecma ? Category.EcmaWord : Category.Word, code == 'W');
          break;

      case 's': case 'S':
          result = new CharacterClass (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, code == 'S');
          break;

      case 'p': case 'P':
          // this is odd - ECMAScript isn't supposed to support Unicode,
          // yet \p{..} compiles and runs under the MS implementation
          // identically to canonical mode. That's why the value of ecma
          // is ignored here.
          result = new CharacterClass (ParseUnicodeCategory (), code == 'P');
          break;

      // positions

      case 'A': result = new PositionAssertion (Position.StartOfString); break;
      case 'Z': result = new PositionAssertion (Position.End); break;
      case 'z': result = new PositionAssertion (Position.EndOfString); break;
      case 'G': result = new PositionAssertion (Position.StartOfScan); break;
      case 'b': result = new PositionAssertion (Position.Boundary); break;
      case 'B': result = new PositionAssertion (Position.NonBoundary); break;

      // references

      case '1': case '2': case '3': case '4': case '5':
      case '6': case '7': case '8': case '9': {
          -- ptr;		// re-read the first digit as part of the number
          int number = ParseNumber (10, 1, 0);
          if (number < 0) {
              ptr = start;
              return null;
          }

          // FIXME test if number is within number of assigned groups
          // this may present a problem for right-to-left matching

          Reference numbered = new Reference (IsIgnoreCase (options));
          refs.Add (numbered, number.ToString ());
          result = numbered;
          break;
      }

      case 'k': {
          char delim = pattern[ptr ++];
          if (delim == '<')
              delim = '>';
          else if (delim != '\'')
              throw NewParseException ("Malformed \\k<...> named backreference.");

          string name = ParseName ();
          if (name == null || pattern[ptr] != delim)
              throw NewParseException ("Malformed \\k<...> named backreference.");

          ++ ptr;
          Reference by_name = new Reference (IsIgnoreCase (options));
          refs.Add (by_name, name);
          result = by_name;
          break;
      }

      default:
          result = null;
          break;
      }

      if (result == null)
          ptr = start;

      return result;
  }
  // Parses a backslash escape that denotes a single character; the
  // backslash has already been consumed.
  //
  // Recognises \a \t \r \v \f \n \e \\, octal \0 followed by up to three
  // octal digits, \xHH, \uHHHH and control characters \cX. For \cX the
  // letter may be given in either case (\ca is the same as \cA).
  //
  // Returns the character code, or -1 with ptr restored to where it was on
  // entry when the escape is not one of these, so the caller can try
  // other interpretations (\b is deliberately left to the caller).
  //
  // Throws ArgumentException when the pattern ends right after the
  // backslash, when \x or \u has too few hex digits, or when \c is
  // followed by a character outside '@'..'_' (after upper-casing a-z).
  private int ParseEscape () {
      int p = ptr;
      int c;

      if (p >= pattern.Length)
          throw new ArgumentException (
              String.Format ("Parsing \"{0}\" - Illegal \\ at end of " +
                  "pattern.", pattern), pattern);

      switch (pattern[ptr ++]) {

      // standard escapes (except \b)

      case 'a': return '\u0007';
      case 't': return '\u0009';
      case 'r': return '\u000d';
      case 'v': return '\u000b';
      case 'f': return '\u000c';
      case 'n': return '\u000a';
      case 'e': return '\u001b';
      case '\\': return '\\';

      // character codes

      case '0': {
          int prevptr = ptr;
          int result = ParseOctal (pattern, ref ptr);

          // no octal digits after the 0: the escape is the NUL character
          if (result == -1 && prevptr == ptr)
              return 0;

          return result;
      }

      case 'x':
          c = ParseHex (pattern, ref ptr, 2);
          if (c < 0)
              throw NewParseException ("Insufficient hex digits");

          return c;

      case 'u':
          c = ParseHex (pattern, ref ptr, 4);
          if (c < 0)
              throw NewParseException ("Insufficient hex digits");

          return c;

      // control characters

      case 'c':
          c = pattern[ptr ++];

          // fold lower-case letters so that \ca is read as \cA
          if (c >= 'a' && c <= 'z')
              c -= 'a' - 'A';

          if (c >= '@' && c <= '_')
              return c - '@';

          throw NewParseException ("Unrecognized control character.");

      // unknown escape

      default:
          ptr = p;
          return -1;
      }
  }
  // Parses a group name at the current position of the pattern being
  // parsed, advancing ptr; delegates to the static ParseName.
  private string ParseName () {
      string name = Parser.ParseName (pattern, ref ptr);
      return name;
  }
  // Tells whether c may appear in a group name: connector punctuation
  // (such as '_') and letters or digits qualify, except that modifier
  // letters are rejected.
  private static bool IsNameChar (char c) {
      switch (Char.GetUnicodeCategory (c)) {
      case UnicodeCategory.ModifierLetter:
          return false;
      case UnicodeCategory.ConnectorPunctuation:
          return true;
      default:
          return Char.IsLetterOrDigit (c);
      }
  }
  // Parses a number in base b with between min and max digits at the
  // current position of the pattern being parsed, advancing ptr on
  // success; delegates to the static ParseNumber.
  private int ParseNumber (int b, int min, int max) {
      int value = Parser.ParseNumber (pattern, ref ptr, b, min, max);
      return value;
  }
  // Parses a decimal number at the current position of the pattern being
  // parsed, advancing ptr on success; delegates to the static
  // ParseDecimal.
  private int ParseDecimal () {
      int value = Parser.ParseDecimal (pattern, ref ptr);
      return value;
  }
  // Returns the numeric value of c as a digit in base b, or -1 when c is
  // not a digit of that base. Only bases 8, 10 and 16 are supported; any
  // other base yields -1. The parameter n is not used.
  private static int ParseDigit (char c, int b, int n) {
      int value;

      if (c >= '0' && c <= '9')
          value = c - '0';
      else if (c >= 'a' && c <= 'f')
          value = 10 + c - 'a';
      else if (c >= 'A' && c <= 'F')
          value = 10 + c - 'A';
      else
          return -1;

      bool supported = b == 8 || b == 10 || b == 16;
      if (!supported)
          return -1;

      // a digit is valid only if it is smaller than the base
      return value < b ? value : -1;
  }
  // Advances ptr over text that carries no meaning in the pattern.
  //
  // Inline comments of the form (?#...) are always skipped. When `ignore`
  // is true, '#' comments running to the end of the line and runs of
  // white space are skipped as well. Stops at the first character that is
  // none of these, or at the end of the pattern.
  //
  // The scan for the ')' that closes an inline comment is not
  // bounds-checked, so an unterminated comment raises
  // IndexOutOfRangeException.
  private void ConsumeWhitespace (bool ignore) {
      while (ptr < pattern.Length) {
          char next = pattern[ptr];

          if (next == '(') {
              // only "(?#" followed by at least one more character starts a comment
              bool has_room = ptr + 3 < pattern.Length;
              if (!has_room || pattern[ptr + 1] != '?' || pattern[ptr + 2] != '#')
                  return;

              ptr += 3;
              while (pattern[ptr ++] != ')') {
                  /* ignore */
              }
          }
          else if (!ignore) {
              return;
          }
          else if (next == '#') {
              while (ptr < pattern.Length && pattern[ptr ++] != '\n') {
                  /* ignore */
              }
          }
          else if (Char.IsWhiteSpace (next)) {
              while (ptr < pattern.Length && Char.IsWhiteSpace (pattern[ptr]))
                  ++ ptr;
          }
          else {
              return;
          }
      }
  }
  // Returns `pattern` with its backslash escapes replaced by the
  // characters they denote.
  //
  // Escapes understood by ParseEscape are converted to their character;
  // \b becomes backspace; any other escaped character stands for itself.
  // This overwrites the parser's current pattern and position.
  //
  // ParseEscape throws ArgumentException when the text ends with a lone
  // backslash.
  private string ParseString (string pattern) {
      this.pattern = pattern;
      this.ptr = 0;

      StringBuilder result = new StringBuilder (pattern.Length);
      while (ptr < pattern.Length) {
          int c = pattern[ptr ++];

          if (c == '\\') {
              c = ParseEscape ();

              if (c < 0) {
                  // not a character escape: take the next character as is
                  c = pattern[ptr ++];
                  if (c == 'b')
                      c = '\b';
              }
          }

          // c is an int; without the cast Append(int) would be chosen and
          // the decimal code point would be written instead of the character.
          result.Append ((char) c);
      }

      return result.ToString ();
  }
  // Assigns numbers to the capturing groups collected in `caps` and binds
  // every pending entry in `refs` to the group it names.
  //
  // Unnamed groups are numbered first, from 1, in the order they were
  // collected; named groups follow. Groups sharing a name share the number
  // of the first group with that name. num_groups is incremented once for
  // every distinct number handed out.
  //
  // Throws ArgumentException (via NewParseException) when a reference
  // names a group that does not exist.
  private void ResolveReferences () {
      int next_id = 1;
      Hashtable by_name = new Hashtable ();

      // number unnamed groups

      foreach (CapturingGroup g in caps) {
          if (g.Name != null)
              continue;

          by_name.Add (next_id.ToString (), g);
          g.Number = next_id ++;

          ++ num_groups;
      }

      // number named groups

      foreach (CapturingGroup g in caps) {
          if (g.Name == null)
              continue;

          if (by_name.Contains (g.Name)) {
              // name already seen: reuse that group's number
              CapturingGroup first = (CapturingGroup) by_name[g.Name];
              g.Number = first.Number;
          }
          else {
              by_name.Add (g.Name, g);
              g.Number = next_id ++;

              ++ num_groups;
          }
      }

      // resolve references

      foreach (DictionaryEntry entry in refs) {
          Expression expr = (Expression) entry.Key;
          string name = (string) entry.Value;

          if (!by_name.Contains (name)) {
              throw NewParseException ("Reference to undefined group " +
                  (Char.IsDigit (name[0]) ? "number " : "name ") +
                  name);
          }

          CapturingGroup target = (CapturingGroup) by_name[name];

          Reference reference = expr as Reference;
          if (reference != null) {
              reference.CapturingGroup = target;
              continue;
          }

          CaptureAssertion capture_test = expr as CaptureAssertion;
          if (capture_test != null) {
              capture_test.CapturingGroup = target;
              continue;
          }

          BalancingGroup balancing = expr as BalancingGroup;
          if (balancing != null)
              balancing.Balance = target;
      }
  }
  904. // flag helper functions
  // True when the IgnoreCase flag is set in options.
  private static bool IsIgnoreCase (RegexOptions options) {
      RegexOptions flag = RegexOptions.IgnoreCase;
      return (options & flag) == flag;
  }

  // True when the Multiline flag is set in options.
  private static bool IsMultiline (RegexOptions options) {
      RegexOptions flag = RegexOptions.Multiline;
      return (options & flag) == flag;
  }

  // True when the ExplicitCapture flag is set in options.
  private static bool IsExplicitCapture (RegexOptions options) {
      RegexOptions flag = RegexOptions.ExplicitCapture;
      return (options & flag) == flag;
  }

  // True when the Singleline flag is set in options.
  private static bool IsSingleline (RegexOptions options) {
      RegexOptions flag = RegexOptions.Singleline;
      return (options & flag) == flag;
  }

  // True when the IgnorePatternWhitespace flag is set in options.
  private static bool IsIgnorePatternWhitespace (RegexOptions options) {
      RegexOptions flag = RegexOptions.IgnorePatternWhitespace;
      return (options & flag) == flag;
  }

  // True when the RightToLeft flag is set in options.
  private static bool IsRightToLeft (RegexOptions options) {
      RegexOptions flag = RegexOptions.RightToLeft;
      return (options & flag) == flag;
  }

  // True when the ECMAScript flag is set in options.
  private static bool IsECMAScript (RegexOptions options) {
      RegexOptions flag = RegexOptions.ECMAScript;
      return (options & flag) == flag;
  }
  926. // exception creation
  // Builds (but does not throw) the ArgumentException used for parse
  // errors. The message is prefixed with the pattern being parsed, and
  // the pattern is also passed as the exception's second argument.
  private ArgumentException NewParseException (string msg) {
      string text = String.Concat ("parsing \"", pattern, "\" - ", msg);
      return new ArgumentException (text, pattern);
  }
  // Text currently being parsed and the index of the next unread character.
  private string pattern;
  private int ptr;

  // Number of distinct group numbers assigned by ResolveReferences.
  private int num_groups;

  // Capturing groups, in the order they were encountered.
  private ArrayList caps;

  // Expressions that refer to a group (back-references, capture tests,
  // balancing groups), keyed by expression with the group name as value.
  private Hashtable refs;
  936. }
  937. }