parser.cs 27 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178
  1. //
  2. // assembly: System
  3. // namespace: System.Text.RegularExpressions
  4. // file: parser.cs
  5. //
  6. // author: Dan Lewis ([email protected])
  7. // (c) 2002
  8. //
  9. // Permission is hereby granted, free of charge, to any person obtaining
  10. // a copy of this software and associated documentation files (the
  11. // "Software"), to deal in the Software without restriction, including
  12. // without limitation the rights to use, copy, modify, merge, publish,
  13. // distribute, sublicense, and/or sell copies of the Software, and to
  14. // permit persons to whom the Software is furnished to do so, subject to
  15. // the following conditions:
  16. //
  17. // The above copyright notice and this permission notice shall be
  18. // included in all copies or substantial portions of the Software.
  19. //
  20. // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  21. // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  22. // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  23. // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
  24. // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  25. // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  26. // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  27. //
  28. using System;
  29. using System.Collections;
  30. using System.Globalization;
  31. namespace System.Text.RegularExpressions.Syntax {
  32. class Parser {
  // Reads one or more decimal digits from str starting at ptr.
  // Returns the value and advances ptr, or returns -1 and leaves ptr
  // untouched when no digit is present.
  public static int ParseDecimal (string str, ref int ptr) {
      const int radix = 10;
      const int min_digits = 1;
      return ParseNumber (str, ref ptr, radix, min_digits, Int32.MaxValue);
  }

  // Reads between one and three octal digits from str starting at ptr.
  // Returns the value and advances ptr, or returns -1 and leaves ptr
  // untouched when no octal digit is present.
  public static int ParseOctal (string str, ref int ptr) {
      const int radix = 8;
      const int min_digits = 1;
      const int max_digits = 3;
      return ParseNumber (str, ref ptr, radix, min_digits, max_digits);
  }

  // Reads exactly `digits` hexadecimal digits from str starting at ptr.
  // Returns the value and advances ptr, or returns -1 and leaves ptr
  // untouched when fewer than `digits` hex digits are available.
  public static int ParseHex (string str, ref int ptr, int digits) {
      const int radix = 16;
      return ParseNumber (str, ref ptr, radix, digits, digits);
  }
  // Parses an unsigned integer written in base b from str, starting at ptr.
  //
  //   str - text to read from
  //   ptr - index of the first candidate digit; moved past the consumed
  //         digits on success, left untouched on failure
  //   b   - radix; ParseDigit only recognises 8, 10 and 16 and rejects
  //         every character for any other value
  //   min - minimum number of digits that must be present
  //   max - maximum number of digits to consume; max < min means no limit
  //
  // Returns the parsed value, or -1 when fewer than min digits were found.
  // A value that does not fit in an Int32 saturates at Int32.MaxValue
  // instead of silently wrapping around to an unrelated number.
  public static int ParseNumber (string str, ref int ptr, int b, int min, int max) {
      int p = ptr, digits = 0, d;
      long n = 0;

      if (max < min)
          max = Int32.MaxValue;

      while (digits < max && p < str.Length) {
          d = ParseDigit (str[p ++], b, digits);
          if (d < 0) {
              // not a digit in this base: un-read it and stop
              -- p;
              break;
          }

          // n <= Int32.MaxValue and b <= 16 here, so this cannot overflow a long
          n = n * b + d;
          if (n > Int32.MaxValue)
              n = Int32.MaxValue;
          ++ digits;
      }

      if (digits < min)
          return -1;

      ptr = p;
      return (int) n;
  }
  // Parses a group name from str starting at ptr.
  //
  // When the first character is a digit the name is a decimal group
  // number: the digits are consumed and the number is returned as a
  // string, or null is returned when the number is not positive.
  // Otherwise the longest run of name characters (see IsNameChar) is
  // consumed and returned; null is returned when that run is empty.
  //
  // ptr is advanced past whatever was consumed. Reaching the end of str
  // ends the name instead of raising IndexOutOfRangeException.
  public static string ParseName (string str, ref int ptr) {
      if (ptr >= str.Length)
          return null;

      if (Char.IsDigit (str[ptr])) {
          int gid = ParseNumber (str, ref ptr, 10, 1, 0);
          if (gid > 0)
              return gid.ToString ();

          return null;
      }

      int start = ptr;
      while (ptr < str.Length && IsNameChar (str[ptr]))
          ++ ptr;

      if (ptr - start > 0)
          return str.Substring (start, ptr - start);

      return null;
  }
  // Returns a copy of str in which every character that is special to
  // this parser is escaped, so the result matches str literally.
  //
  // Metacharacters and the space character are prefixed with a backslash;
  // tab, newline, carriage return and form feed are written as \t, \n,
  // \r and \f. All other characters are copied unchanged.
  //
  // The result is built in a StringBuilder so the cost stays linear in
  // the length of str.
  public static string Escape (string str) {
      StringBuilder result = new StringBuilder (str.Length + 8);

      for (int i = 0; i < str.Length; ++ i) {
          char c = str[i];
          switch (c) {
          case '\\': case '*': case '+': case '?': case '|':
          case '{': case '[': case '(': case ')': case '^':
          case '$': case '.': case '#': case ' ':
              result.Append ('\\').Append (c);
              break;

          case '\t': result.Append ("\\t"); break;
          case '\n': result.Append ("\\n"); break;
          case '\r': result.Append ("\\r"); break;
          case '\f': result.Append ("\\f"); break;

          default: result.Append (c); break;
          }
      }

      return result.ToString ();
  }
  // Converts the escape sequences in str back to the characters they
  // denote. A string without any backslash is returned as-is, without
  // creating a parser.
  public static string Unescape (string str) {
      bool has_escape = str.IndexOf ('\\') != -1;
      if (has_escape) {
          Parser parser = new Parser ();
          return parser.ParseString (str);
      }

      return str;
  }
  101. // public instance
  // Creates a parser with an empty capture list and reference table.
  public Parser () {
      caps = new ArrayList ();
      refs = new Hashtable ();
  }
  // Parses pattern into a RegularExpression tree using the given options.
  //
  // All per-pattern state (read position, capture list, reference table,
  // group count) is reset first, so one Parser can be reused. Running off
  // the end of the pattern anywhere in the parser surfaces as an
  // IndexOutOfRangeException, which is translated here into the parser's
  // ArgumentException with the text "Unexpected end of pattern.".
  public RegularExpression ParseRegularExpression (string pattern, RegexOptions options) {
      // reset parser state
      this.pattern = pattern;
      this.ptr = 0;
      this.num_groups = 0;
      this.caps.Clear ();
      this.refs.Clear ();

      RegularExpression regex;
      try {
          regex = new RegularExpression ();
          ParseGroup (regex, options, null);
          ResolveReferences ();
          regex.GroupCount = num_groups;
      }
      catch (IndexOutOfRangeException) {
          throw NewParseException ("Unexpected end of pattern.");
      }

      return regex;
  }
  // Builds the group-name -> group-number table for the most recently
  // parsed pattern.
  //
  // The result always maps "0" to 0. Each named group contributes its
  // name (the first occurrence wins when a name is repeated). Every group
  // number from 1 to num_groups that is not owned by a named group
  // contributes its decimal string form.
  //
  // ResolveReferences hands out the numbers 1..num_groups, so the numeric
  // pass iterates over exactly that range. Bounding it by caps.Count with
  // a strict '<' left the highest group number without an entry.
  public IDictionary GetMapping () {
      Hashtable mapping = new Hashtable ();
      Hashtable numbers = new Hashtable ();   // numbers taken by named groups

      mapping.Add ("0", 0);

      for (int i = 0; i < caps.Count; i++) {
          CapturingGroup group = (CapturingGroup) caps [i];
          if (group.Name != null && !mapping.Contains (group.Name)) {
              mapping.Add (group.Name, group.Number);
              numbers.Add (group.Number, group.Number);
          }
      }

      for (int i = 1; i <= num_groups; i++) {
          if (numbers [i] == null)
              mapping.Add (i.ToString (), i);
      }

      return mapping;
  }
  141. // private methods
  // Parses pattern text from the current position up to the ')' that
  // closes this group (or up to the end of the pattern for the top-level
  // RegularExpression) and appends the result to group.
  //
  //   group     - receives the parsed content; a RegularExpression
  //               instance marks the top level
  //   options   - options in effect; an inline construct such as (?i)
  //               changes them for the remainder of this group
  //   assertion - non-null when parsing the branches of a (?(test)yes|no)
  //               conditional; '|' then separates the true and false
  //               branches instead of creating an Alternation
  //
  // Consecutive literal characters are collected in `literal` and flushed
  // as one Literal expression whenever a non-literal expression, a '|',
  // a relevant option change or the end of the group is reached.
  //
  // Throws the parser's ArgumentException for misplaced quantifiers,
  // unbalanced parentheses and a third branch in a conditional.
  private void ParseGroup (Group group, RegexOptions options, Assertion assertion) {
      bool is_top_level = group is RegularExpression;

      Alternation alternation = null;
      string literal = null;

      Group current = new Group ();
      Expression expr = null;
      bool closed = false;

      while (true) {
          ConsumeWhitespace (IsIgnorePatternWhitespace (options));
          if (ptr >= pattern.Length)
              break;

          // (1) Parse for Expressions

          char ch = pattern[ptr ++];

          switch (ch) {
          case '^': {
              Position pos =
                  IsMultiline (options) ? Position.StartOfLine : Position.Start;
              expr = new PositionAssertion (pos);
              break;
          }

          case '$': {
              Position pos =
                  IsMultiline (options) ? Position.EndOfLine : Position.End;
              expr = new PositionAssertion (pos);
              break;
          }

          case '.': {
              Category cat =
                  IsSingleline (options) ? Category.AnySingleline : Category.Any;
              expr = new CharacterClass (cat, false);
              break;
          }

          case '\\': {
              int c = ParseEscape ();
              if (c >= 0)
                  ch = (char)c;
              else {
                  expr = ParseSpecial (options);

                  if (expr == null)
                      ch = pattern[ptr ++];   // default escape
              }
              break;
          }

          case '[': {
              expr = ParseCharacterClass (options);
              break;
          }

          case '(': {
              // remember the case mode the pending literal was read under
              bool ignore = IsIgnoreCase (options);
              expr = ParseGroupingConstruct (ref options);
              if (expr == null) {
                  // Inline option change or comment. If the case mode
                  // changed, the pending literal must be emitted with the
                  // mode that was active while it was read, not the new one.
                  if (literal != null && IsIgnoreCase (options) != ignore) {
                      current.AppendExpression (new Literal (literal, ignore));
                      literal = null;
                  }

                  continue;
              }
              break;
          }

          case ')': {
              closed = true;
              goto EndOfGroup;
          }

          case '|': {
              if (literal != null) {
                  current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
                  literal = null;
              }

              if (assertion != null) {
                  if (assertion.TrueExpression == null)
                      assertion.TrueExpression = current;
                  else if (assertion.FalseExpression == null)
                      assertion.FalseExpression = current;
                  else
                      throw NewParseException ("Too many | in (?()|).");
              }
              else {
                  if (alternation == null)
                      alternation = new Alternation ();

                  alternation.AddAlternative (current);
              }

              current = new Group ();
              continue;
          }

          case '*': case '+': case '?': {
              // a quantifier with nothing in front of it
              throw NewParseException ("Bad quantifier.");
          }

          default:
              break;   // literal character
          }

          ConsumeWhitespace (IsIgnorePatternWhitespace (options));

          // (2) Check for Repetitions

          if (ptr < pattern.Length) {
              char k = pattern[ptr];
              int min = 0, max = 0;
              bool lazy = false;
              bool haveRep = false;

              if (k == '?' || k == '*' || k == '+') {
                  ++ ptr;
                  haveRep = true;

                  switch (k) {
                  case '?': min = 0; max = 1; break;
                  case '*': min = 0; max = 0xffff; break;
                  case '+': min = 1; max = 0xffff; break;
                  }
              } else if (k == '{' && ptr + 1 < pattern.Length) {
                  // not a valid {n,m}: rewind so '{' is read as a literal
                  int saved_ptr = ptr;
                  ++ptr;
                  haveRep = ParseRepetitionBounds (out min, out max, options);
                  if (!haveRep)
                      ptr = saved_ptr;
              }

              if (haveRep) {
                  ConsumeWhitespace (IsIgnorePatternWhitespace (options));
                  if (ptr < pattern.Length && pattern[ptr] == '?') {
                      ++ ptr;
                      lazy = true;
                  }

                  // a quantifier after a literal applies to the last
                  // character only, not to the whole pending literal
                  Repetition repetition = new Repetition (min, max, lazy);

                  if (expr == null)
                      repetition.Expression = new Literal (ch.ToString (), IsIgnoreCase (options));
                  else
                      repetition.Expression = expr;

                  expr = repetition;
              }
          }

          // (3) Append Expression and/or Literal

          if (expr == null) {
              if (literal == null)
                  literal = "";
              literal += ch;
          }
          else {
              if (literal != null) {
                  current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
                  literal = null;
              }

              current.AppendExpression (expr);
              expr = null;
          }

          if (is_top_level && ptr >= pattern.Length)
              goto EndOfGroup;
      }

  EndOfGroup:
      if (is_top_level && closed)
          throw NewParseException ("Too many )'s.");
      if (!is_top_level && !closed)
          throw NewParseException ("Not enough )'s.");

      // clean up literals and alternations

      if (literal != null)
          current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));

      if (assertion != null) {
          if (assertion.TrueExpression == null)
              assertion.TrueExpression = current;
          else
              assertion.FalseExpression = current;

          group.AppendExpression (assertion);
      }
      else if (alternation != null) {
          alternation.AddAlternative (current);
          group.AppendExpression (alternation);
      }
      else
          group.AppendExpression (current);
  }
  // Parses whatever follows an opening '(' that has already been consumed.
  //
  // Returns the expression for the construct, or null when the construct
  // produces no expression of its own: an inline option change "(?i)",
  // which updates options for the caller, or a "(?#...)" comment.
  private Expression ParseGroupingConstruct (ref RegexOptions options) {
      if (pattern[ptr] != '?') {
          // plain parentheses: capturing unless ExplicitCapture is set
          Group plain;
          if (IsExplicitCapture (options)) {
              plain = new Group ();
          }
          else {
              plain = new CapturingGroup ();
              caps.Add (plain);
          }

          ParseGroup (plain, options, null);
          return plain;
      }

      ++ ptr;   // skip the '?'

      switch (pattern[ptr]) {
      case ':': {   // non-capturing group
          ++ ptr;
          Group passive = new Group ();
          ParseGroup (passive, options, null);
          return passive;
      }

      case '>': {   // non-backtracking group
          ++ ptr;
          Group atomic = new NonBacktrackingGroup ();
          ParseGroup (atomic, options, null);
          return atomic;
      }

      case 'i': case 'm': case 'n':
      case 's': case 'x': case '-':   // options
          return ParseInlineOptions (ref options);

      case '<': case '=': case '!': {   // lookahead/lookbehind
          ExpressionAssertion look = new ExpressionAssertion ();
          if (!ParseAssertionType (look))
              return ParseNamedGroup (options);   // it's a (?<name> ) construct

          Group body = new Group ();
          ParseGroup (body, options, null);
          look.TestExpression = body;
          return look;
      }

      case '\'':   // named/balancing group
          return ParseNamedGroup (options);

      case '(':   // expression/capture test
          return ParseConditional (options);

      case '#':   // comment
          ++ ptr;
          while (pattern[ptr ++] != ')') {
              if (ptr >= pattern.Length)
                  throw NewParseException ("Unterminated (?#...) comment.");
          }
          return null;

      default:   // error
          throw NewParseException ("Bad grouping construct.");
      }
  }

  // Parses "(?flags-flags:...)" or "(?flags-flags)" with ptr on the first
  // flag character. The scoped form returns the child group, parsed under
  // the modified options; the bare form stores the modified options into
  // `options` and returns null.
  private Expression ParseInlineOptions (ref RegexOptions options) {
      RegexOptions updated = options;

      ParseOptions (ref updated, false);
      if (pattern[ptr] == '-') {
          ++ ptr;
          ParseOptions (ref updated, true);
      }

      char terminator = pattern[ptr];
      if (terminator == ':') {   // pass options to child group
          ++ ptr;
          Group scoped = new Group ();
          ParseGroup (scoped, updated, null);
          return scoped;
      }

      if (terminator == ')') {   // change options of enclosing group
          ++ ptr;
          options = updated;
          return null;
      }

      throw NewParseException ("Bad options");
  }

  // Parses a named group "(?<name>...)" / "(?'name'...)" or a balancing
  // group "(?<name-other>...)" with ptr on the opening delimiter.
  private Expression ParseNamedGroup (RegexOptions options) {
      char delim = (pattern[ptr] == '<') ? '>' : '\'';
      ++ ptr;

      string name = ParseName ();

      if (pattern[ptr] == delim) {
          // capturing group
          if (name == null)
              throw NewParseException ("Bad group name.");

          ++ ptr;
          CapturingGroup cap = new CapturingGroup ();
          cap.Name = name;
          caps.Add (cap);
          ParseGroup (cap, options, null);
          return cap;
      }

      if (pattern[ptr] == '-') {
          // balancing group
          ++ ptr;
          string balance_name = ParseName ();
          if (balance_name == null || pattern[ptr] != delim)
              throw NewParseException ("Bad balancing group name.");

          ++ ptr;
          BalancingGroup bal = new BalancingGroup ();
          bal.Name = name;
          if (bal.IsNamed)
              caps.Add (bal);

          // the balanced group is looked up later by ResolveReferences
          refs.Add (bal, balance_name);
          ParseGroup (bal, options, null);
          return bal;
      }

      throw NewParseException ("Bad group name.");
  }

  // Parses a conditional "(?(test)yes|no)" with ptr on the inner '('.
  // The test is a capture test when it is exactly a group name followed
  // by ')', and an expression test otherwise.
  private Expression ParseConditional (RegexOptions options) {
      Assertion condition;

      ++ ptr;
      int mark = ptr;
      string name = ParseName ();

      if (name != null && pattern[ptr] == ')') {   // capture test
          ++ ptr;
          condition = new CaptureAssertion ();
          refs.Add (condition, name);
      }
      else {   // expression test
          // FIXME MS implementation doesn't seem to
          // implement this version of (?(x) ...)
          ptr = mark;

          ExpressionAssertion test_assertion = new ExpressionAssertion ();
          if (pattern[ptr] == '?') {
              ++ ptr;
              if (!ParseAssertionType (test_assertion))
                  throw NewParseException ("Bad conditional.");
          }
          else {
              test_assertion.Negate = false;
              test_assertion.Reverse = false;
          }

          Group test = new Group ();
          ParseGroup (test, options, null);
          test_assertion.TestExpression = test;
          condition = test_assertion;
      }

      Group branches = new Group ();
      ParseGroup (branches, options, condition);
      return branches;
  }
  // Recognises the marker of a look-around assertion at ptr:
  // "=" / "!" (look-ahead) or "<=" / "<!" (look-behind).
  //
  // On success sets assertion.Negate and assertion.Reverse, advances ptr
  // past the marker and returns true. Returns false, leaving ptr and the
  // assertion untouched, when the text at ptr is not such a marker.
  private bool ParseAssertionType (ExpressionAssertion assertion) {
      bool behind = pattern[ptr] == '<';
      char marker = behind ? pattern[ptr + 1] : pattern[ptr];

      bool negated;
      if (marker == '=')
          negated = false;
      else if (marker == '!')
          negated = true;
      else
          return false;

      assertion.Negate = negated;
      assertion.Reverse = behind;
      ptr += behind ? 2 : 1;
      return true;
  }
  // Consumes a run of inline option letters (i, m, n, s, x) at ptr and
  // applies each to options: the flag is cleared when negate is true and
  // set otherwise. Stops, without consuming it, at the first character
  // that is not an option letter.
  private void ParseOptions (ref RegexOptions options, bool negate) {
      for (;;) {
          RegexOptions flag;

          switch (pattern[ptr]) {
          case 'i': flag = RegexOptions.IgnoreCase; break;
          case 'm': flag = RegexOptions.Multiline; break;
          case 'n': flag = RegexOptions.ExplicitCapture; break;
          case 's': flag = RegexOptions.Singleline; break;
          case 'x': flag = RegexOptions.IgnorePatternWhitespace; break;
          default:
              return;
          }

          if (negate)
              options &= ~flag;
          else
              options |= flag;

          ++ ptr;
      }
  }
  // Parses a bracketed set "[...]" whose opening '[' has already been
  // consumed, and returns the resulting CharacterClass.
  //
  // Handles a leading '^' (negation), a ']' in first position (taken
  // literally), ranges "x-y", escapes, and the category escapes
  // \d \w \s \p{..} together with their upper-case negations. A '-' that
  // does not sit between two characters is added as a literal dash.
  //
  // Throws the parser's ArgumentException for a reversed range and for a
  // set that is never closed.
  private Expression ParseCharacterClass (RegexOptions options) {
      bool negate = pattern[ptr] == '^';
      if (negate)
          ++ ptr;

      bool ecma = IsECMAScript (options);
      CharacterClass cls = new CharacterClass (negate, IsIgnoreCase (options));

      if (pattern[ptr] == ']') {
          cls.AddCharacter (']');
          ++ ptr;
      }

      int previous = -1;            // last plain character, or -1 when none
      bool pending_range = false;   // a '-' has been seen and awaits its end
      bool terminated = false;

      while (ptr < pattern.Length) {
          int c = pattern[ptr ++];

          if (c == ']') {
              terminated = true;
              break;
          }

          if (c == '-') {
              pending_range = true;
              continue;
          }

          if (c == '\\') {
              c = ParseEscape ();
              if (c < 0) {
                  // didn't recognize escape
                  c = pattern[ptr ++];

                  bool was_category = true;
                  switch (c) {
                  case 'b':
                      c = '\b';
                      was_category = false;
                      break;

                  case 'd': case 'D':
                      cls.AddCategory (ecma ? Category.EcmaDigit : Category.Digit, c == 'D');
                      break;

                  case 'w': case 'W':
                      cls.AddCategory (ecma ? Category.EcmaWord : Category.Word, c == 'W');
                      break;

                  case 's': case 'S':
                      cls.AddCategory (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, c == 'S');
                      break;

                  case 'p': case 'P':
                      cls.AddCategory (ParseUnicodeCategory (), c == 'P');   // ignore ecma
                      break;

                  default:
                      was_category = false;   // add escaped character
                      break;
                  }

                  if (was_category) {
                      // a category cannot be the start of a range
                      previous = -1;
                      continue;
                  }
              }
          }

          if (!pending_range) {
              cls.AddCharacter ((char)c);
              previous = c;
              continue;
          }

          if (c < previous)
              throw NewParseException ("[x-y] range in reverse order.");

          if (previous >= 0) {
              cls.AddRange ((char)previous, (char)c);
          }
          else {
              // the '-' had nothing before it: both are literals
              cls.AddCharacter ((char)c);
              cls.AddCharacter ('-');
          }

          pending_range = false;
          previous = -1;
      }

      if (!terminated)
          throw NewParseException ("Unterminated [] set.");

      if (pending_range)
          cls.AddCharacter ('-');

      return cls;
  }
  // Parses the inside of a "{n}", "{n,}", "{n,m}" or "{,m}" quantifier;
  // ptr is just past the '{' on entry.
  //
  //   min, max - receive the bounds on success; max is 0xffff when no
  //              upper bound was written
  //   options  - consulted for IgnorePatternWhitespace
  //
  // Returns false when the text is not a well-formed quantifier,
  // including when the pattern ends before the closing '}'. The caller
  // then rewinds ptr and reads the '{' as a literal.
  //
  // Throws the parser's ArgumentException when a bound is 0xffff or more,
  // or when the upper bound is smaller than the lower bound.
  //
  // An explicit upper bound of zero ("{0}", "{0,0}") is kept as zero; only
  // a missing upper bound means "unbounded".
  private bool ParseRepetitionBounds (out int min, out int max, RegexOptions options) {
      int n, m;
      min = max = 0;

      bool skip_ws = IsIgnorePatternWhitespace (options);

      /* check syntax */

      ConsumeWhitespace (skip_ws);
      if (ptr >= pattern.Length)
          return false;

      if (pattern[ptr] == ',') {
          n = -1;
      } else {
          n = ParseNumber (10, 1, 0);
          ConsumeWhitespace (skip_ws);
      }

      if (ptr >= pattern.Length)
          return false;

      switch (pattern[ptr ++]) {
      case '}':
          m = n;
          break;
      case ',':
          ConsumeWhitespace (skip_ws);
          m = ParseNumber (10, 1, 0);   // -1 when no upper bound is written
          ConsumeWhitespace (skip_ws);
          if (ptr >= pattern.Length || pattern[ptr ++] != '}')
              return false;
          break;
      default:
          return false;
      }

      /* check bounds and ordering */

      if (n >= 0xffff || m >= 0xffff)
          throw NewParseException ("Illegal {x, y} - maximum of 65535.");
      if (m >= 0 && m < n)
          throw NewParseException ("Illegal {x, y} with x > y.");

      /* assign min and max */

      min = n;
      max = (m >= 0) ? m : 0xffff;

      return true;
  }
  // Parses the "{Name}" part of a \p{Name} or \P{Name} escape at ptr and
  // returns the matching Category.
  //
  // Throws the parser's ArgumentException when the braces or the name are
  // missing, or when CategoryUtils does not know the name.
  private Category ParseUnicodeCategory () {
      if (pattern[ptr ++] != '{')
          throw NewParseException ("Incomplete \\p{X} character escape.");

      string property = ParseName ();
      if (property == null)
          throw NewParseException ("Incomplete \\p{X} character escape.");

      Category category = CategoryUtils.CategoryFromName (property);
      if (category == Category.None)
          throw NewParseException ("Unknown property '" + property + "'.");

      char closing = pattern[ptr ++];
      if (closing != '}')
          throw NewParseException ("Incomplete \\p{X} character escape.");

      return category;
  }
  // Parses the escapes that yield an expression rather than a single
  // character; ptr is just past the backslash on entry.
  //
  // Covers the category escapes (\d \w \s \p{..} and their upper-case
  // negations), the position escapes (\A \Z \z \G \b \B), numbered
  // back-references (\1 ...) and named back-references (\k<name>,
  // \k'name').
  //
  // Returns null and restores ptr when the escape is none of these.
  private Expression ParseSpecial (RegexOptions options) {
      int start = ptr;
      bool ecma = IsECMAScript (options);
      Expression result;

      char code = pattern[ptr ++];
      switch (code) {

      // categories

      case 'd': case 'D':
          result = new CharacterClass (ecma ? Category.EcmaDigit : Category.Digit, code == 'D');
          break;

      case 'w': case 'W':
          result = new CharacterClass (ecma ? Category.EcmaWord : Category.Word, code == 'W');
          break;

      case 's': case 'S':
          result = new CharacterClass (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, code == 'S');
          break;

      case 'p': case 'P':
          // this is odd - ECMAScript isn't supposed to support Unicode,
          // yet \p{..} compiles and runs under the MS implementation
          // identically to canonical mode. That's why the value of ecma
          // is ignored here.
          result = new CharacterClass (ParseUnicodeCategory (), code == 'P');
          break;

      // positions

      case 'A': result = new PositionAssertion (Position.StartOfString); break;
      case 'Z': result = new PositionAssertion (Position.End); break;
      case 'z': result = new PositionAssertion (Position.EndOfString); break;
      case 'G': result = new PositionAssertion (Position.StartOfScan); break;
      case 'b': result = new PositionAssertion (Position.Boundary); break;
      case 'B': result = new PositionAssertion (Position.NonBoundary); break;

      // references

      case '1': case '2': case '3': case '4': case '5':
      case '6': case '7': case '8': case '9': {
          ptr --;   // re-read the first digit as part of the number
          int number = ParseNumber (10, 1, 0);
          if (number < 0) {
              ptr = start;
              return null;
          }

          // FIXME test if number is within number of assigned groups
          // this may present a problem for right-to-left matching

          Reference numbered = new Reference (IsIgnoreCase (options));
          refs.Add (numbered, number.ToString ());
          result = numbered;
          break;
      }

      case 'k': {
          char delim = pattern[ptr ++];
          if (delim == '<')
              delim = '>';
          else if (delim != '\'')
              throw NewParseException ("Malformed \\k<...> named backreference.");

          string name = ParseName ();
          if (name == null || pattern[ptr] != delim)
              throw NewParseException ("Malformed \\k<...> named backreference.");

          ++ ptr;
          Reference named = new Reference (IsIgnoreCase (options));
          refs.Add (named, name);
          result = named;
          break;
      }

      default:
          result = null;
          break;
      }

      if (result == null)
          ptr = start;

      return result;
  }
  // Parses an escape that denotes a single character; ptr is just past
  // the backslash on entry.
  //
  // Returns the character code for the standard escapes (\a \t \r \v \f
  // \n \e \\), for an octal code starting with '0', for \xHH, \uHHHH and
  // for \cX control characters. Returns -1 and restores ptr when the
  // escape is none of these, so the caller can try ParseSpecial or take
  // the character literally.
  //
  // The control letter after \c may be upper or lower case: "\ca" and
  // "\cA" both yield U+0001.
  //
  // Throws ArgumentException when the backslash is the last character of
  // the pattern, when \x or \u has too few hex digits, or when \c is
  // followed by a character outside '@'..'_' (after case folding).
  private int ParseEscape () {
      int p = ptr;
      int c;

      if (p >= pattern.Length)
          throw new ArgumentException (
              String.Format ("Parsing \"{0}\" - Illegal \\ at end of " +
                             "pattern.", pattern), pattern);

      switch (pattern[ptr ++]) {

      // standard escapes (except \b)

      case 'a': return '\u0007';
      case 't': return '\u0009';
      case 'r': return '\u000d';
      case 'v': return '\u000b';
      case 'f': return '\u000c';
      case 'n': return '\u000a';
      case 'e': return '\u001b';
      case '\\': return '\\';

      // character codes

      case '0':
          // The leading zero counts towards the three-digit limit of
          // ParseOctal, so step back and let it be read as a digit.
          ptr--;
          int prevptr = ptr;
          int result = ParseOctal (pattern, ref ptr);
          if (result == -1 && prevptr == ptr)
              return 0;

          return result;

      case 'x':
          c = ParseHex (pattern, ref ptr, 2);
          if (c < 0)
              throw NewParseException ("Insufficient hex digits");

          return c;

      case 'u':
          c = ParseHex (pattern, ref ptr, 4);
          if (c < 0)
              throw NewParseException ("Insufficient hex digits");

          return c;

      // control characters

      case 'c':
          c = pattern[ptr ++];
          if (c >= 'a' && c <= 'z')
              c -= 'a' - 'A';   // fold to upper case before mapping

          if (c >= '@' && c <= '_')
              return c - '@';

          throw NewParseException ("Unrecognized control character.");

      // unknown escape

      default:
          ptr = p;
          return -1;
      }
  }
  // Instance shorthand for the static ParseName, applied to the current
  // pattern and read position.
  private string ParseName () {
      string name = Parser.ParseName (this.pattern, ref this.ptr);
      return name;
  }
  // Tells whether c may appear in a group name: letters and digits as
  // classified by Char.IsLetterOrDigit plus connector punctuation, with
  // modifier letters explicitly excluded.
  private static bool IsNameChar (char c) {
      switch (Char.GetUnicodeCategory (c)) {
      case UnicodeCategory.ModifierLetter:
          return false;
      case UnicodeCategory.ConnectorPunctuation:
          return true;
      default:
          return Char.IsLetterOrDigit (c);
      }
  }
  // Instance shorthand for the static ParseNumber, applied to the current
  // pattern and read position.
  private int ParseNumber (int b, int min, int max) {
      int value = Parser.ParseNumber (this.pattern, ref this.ptr, b, min, max);
      return value;
  }
  // Returns the numeric value of c as a digit in base b, or -1 when c is
  // not a digit of that base. Only bases 8, 10 and 16 are supported; any
  // other base yields -1. The parameter n is not used.
  private static int ParseDigit (char c, int b, int n) {
      if (b != 8 && b != 10 && b != 16)
          return -1;

      int value;
      if (c >= '0' && c <= '9')
          value = c - '0';
      else if (c >= 'a' && c <= 'f')
          value = 10 + c - 'a';
      else if (c >= 'A' && c <= 'F')
          value = 10 + c - 'A';
      else
          return -1;

      // a hex letter or a high decimal digit is not valid in a smaller base
      return value < b ? value : -1;
  }
  // Advances ptr past text that carries no meaning in the pattern.
  //
  // "(?#...)" comments are always skipped. When ignore is true, '#'
  // comments running to the end of the line and runs of white space are
  // skipped as well. Stops at the first character that is none of these.
  private void ConsumeWhitespace (bool ignore) {
      while (ptr < pattern.Length) {
          char c = pattern[ptr];

          if (c == '(') {
              bool is_comment = ptr + 3 < pattern.Length
                  && pattern[ptr + 1] == '?'
                  && pattern[ptr + 2] == '#';
              if (!is_comment)
                  return;

              ptr += 3;
              while (pattern[ptr ++] != ')') {
                  /* ignore */
              }
              continue;
          }

          if (!ignore)
              return;

          if (c == '#') {
              while (ptr < pattern.Length && pattern[ptr ++] != '\n') {
                  /* ignore */
              }
          }
          else if (Char.IsWhiteSpace (c)) {
              do {
                  ++ ptr;
              } while (ptr < pattern.Length && Char.IsWhiteSpace (pattern[ptr]));
          }
          else {
              return;
          }
      }
  }
  // Returns pattern with every escape sequence replaced by the character
  // it denotes. An escape that ParseEscape does not recognise stands for
  // the escaped character itself, except "\b", which becomes backspace.
  // Overwrites the parser's current pattern and read position.
  private string ParseString (string pattern) {
      this.pattern = pattern;
      this.ptr = 0;

      StringBuilder buffer = new StringBuilder (pattern.Length);
      while (ptr < pattern.Length) {
          char raw = pattern[ptr ++];
          if (raw != '\\') {
              buffer.Append (raw);
              continue;
          }

          int code = ParseEscape ();
          if (code < 0) {
              code = pattern[ptr ++];
              if (code == 'b')
                  code = '\b';
          }

          buffer.Append ((char) code);
      }

      return buffer.ToString ();
  }
  // Assigns group numbers and binds every recorded reference to its
  // target group.
  //
  // Unnamed groups are numbered first, in the order they were recorded,
  // starting at 1. Named groups follow; a name already present in the
  // lookup table reuses that entry's number. num_groups is incremented
  // for each newly assigned number. Each entry of refs is then resolved
  // by name against the lookup table.
  //
  // Throws the parser's ArgumentException when a reference names a group
  // that does not exist.
  private void ResolveReferences () {
      int next_number = 1;
      Hashtable by_name = new Hashtable ();

      // number unnamed groups

      foreach (CapturingGroup unnamed in caps) {
          if (unnamed.Name != null)
              continue;

          by_name.Add (next_number.ToString (), unnamed);
          unnamed.Number = next_number ++;
          ++ num_groups;
      }

      // number named groups

      foreach (CapturingGroup named in caps) {
          if (named.Name == null)
              continue;

          if (by_name.Contains (named.Name)) {
              CapturingGroup earlier = (CapturingGroup)by_name[named.Name];
              named.Number = earlier.Number;
              continue;
          }

          by_name.Add (named.Name, named);
          named.Number = next_number ++;
          ++ num_groups;
      }

      // resolve references

      foreach (Expression pending in refs.Keys) {
          string target_name = (string)refs[pending];
          if (!by_name.Contains (target_name)) {
              throw NewParseException ("Reference to undefined group " +
                  (Char.IsDigit (target_name[0]) ? "number " : "name ") +
                  target_name);
          }

          CapturingGroup target = (CapturingGroup)by_name[target_name];

          if (pending is Reference) {
              ((Reference)pending).CapturingGroup = target;
          }
          else if (pending is CaptureAssertion) {
              ((CaptureAssertion)pending).CapturingGroup = target;
          }
          else if (pending is BalancingGroup) {
              ((BalancingGroup)pending).Balance = target;
          }
      }
  }
  921. // flag helper functions
  // True when RegexOptions.IgnoreCase is set in options.
  private static bool IsIgnoreCase (RegexOptions options) {
      RegexOptions masked = options & RegexOptions.IgnoreCase;
      return masked != RegexOptions.None;
  }

  // True when RegexOptions.Multiline is set in options.
  private static bool IsMultiline (RegexOptions options) {
      RegexOptions masked = options & RegexOptions.Multiline;
      return masked != RegexOptions.None;
  }

  // True when RegexOptions.ExplicitCapture is set in options.
  private static bool IsExplicitCapture (RegexOptions options) {
      RegexOptions masked = options & RegexOptions.ExplicitCapture;
      return masked != RegexOptions.None;
  }

  // True when RegexOptions.Singleline is set in options.
  private static bool IsSingleline (RegexOptions options) {
      RegexOptions masked = options & RegexOptions.Singleline;
      return masked != RegexOptions.None;
  }

  // True when RegexOptions.IgnorePatternWhitespace is set in options.
  private static bool IsIgnorePatternWhitespace (RegexOptions options) {
      RegexOptions masked = options & RegexOptions.IgnorePatternWhitespace;
      return masked != RegexOptions.None;
  }

  // True when RegexOptions.ECMAScript is set in options.
  private static bool IsECMAScript (RegexOptions options) {
      RegexOptions masked = options & RegexOptions.ECMAScript;
      return masked != RegexOptions.None;
  }
  940. // exception creation
  // Builds (but does not throw) the exception used for every syntax
  // error: the message is prefixed with the pattern being parsed, and the
  // pattern is also passed as the exception's second constructor argument.
  private ArgumentException NewParseException (string msg) {
      string text = String.Concat ("parsing \"", pattern, "\" - ", msg);
      return new ArgumentException (text, pattern);
  }
  // text being parsed and the index of the next character to read
  private string pattern;
  private int ptr;

  // number of distinct group numbers handed out by ResolveReferences
  private int num_groups;

  // capturing (and named balancing) groups, in the order they were parsed
  private ArrayList caps;

  // expression -> name of the group it refers to; bound by ResolveReferences
  private Hashtable refs;
  950. }
  951. }