arch.cs 8.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358
  1. //
  2. // assembly: System
  3. // namespace: System.Text.RegularExpressions
  4. // file: arch.cs
  5. //
  6. // author: Dan Lewis ([email protected])
  7. // (c) 2002
  8. //
  9. // Permission is hereby granted, free of charge, to any person obtaining
  10. // a copy of this software and associated documentation files (the
  11. // "Software"), to deal in the Software without restriction, including
  12. // without limitation the rights to use, copy, modify, merge, publish,
  13. // distribute, sublicense, and/or sell copies of the Software, and to
  14. // permit persons to whom the Software is furnished to do so, subject to
  15. // the following conditions:
  16. //
  17. // The above copyright notice and this permission notice shall be
  18. // included in all copies or substantial portions of the Software.
  19. //
  20. // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  21. // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  22. // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  23. // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
  24. // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  25. // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  26. // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  27. //
  28. using System;
  29. using System.Collections;
  30. namespace System.Text.RegularExpressions {
  // Operation codes of the compiled regular-expression program.
  // Members are numbered consecutively from zero; the values are written
  // out explicitly so the numbering is visible at a glance.
  enum OpCode : ushort
  {
      // always fails
      False = 0,
      // always succeeds
      True = 1,

      // ---- matching ----

      // zero-width position assertion
      Position = 2,
      // match string literal
      String = 3,
      // back reference
      Reference = 4,

      // ---- character matching ----

      // match character exactly
      Character = 5,
      // match character from category
      Category = 6,
      // match character from range
      Range = 7,
      // match character from set
      Set = 8,
      // match character from group of tests
      In = 9,

      // ---- capturing ----

      // open group
      Open = 10,
      // close group
      Close = 11,
      // balance groups
      Balance = 12,
      // track balance group length
      BalanceStart = 13,

      // ---- control flow ----

      // conditional on capture
      IfDefined = 14,
      // non-backtracking subexpression
      Sub = 15,
      // non-backtracking lookahead/behind
      Test = 16,
      // alternative expression
      Branch = 17,
      // unconditional goto
      Jump = 18,
      // new repeat context
      Repeat = 19,
      // repeat subexpression within context
      Until = 20,
      // repeat simple subexpression
      FastRepeat = 21,
      // anchoring expression
      Anchor = 22,

      // ---- miscellaneous ----

      // pattern information
      Info = 23
  }
  // Modifier flags for instructions. Every flag is a single bit at or above
  // bit 8, so the low eight bits are always clear.
  // NOTE(review): presumably this lets a flag set be OR-ed with an OpCode in
  // one ushort word -- confirm against the compiler/interpreter.
  [Flags]
  enum OpFlags : ushort
  {
      None = 0,

      // succeed on mismatch
      Negate = 1 << 8,

      // case insensitive matching
      IgnoreCase = 1 << 9,

      // right-to-left matching
      RightToLeft = 1 << 10,

      // minimizing repeat
      Lazy = 1 << 11
  }
  // Kinds of zero-width position assertion, with the pattern syntax each
  // one corresponds to.
  // NOTE(review): Start and StartOfString carry the same description; how
  // they differ is not visible in this file.
  enum Position : ushort
  {
      // anywhere
      Any = 0,
      // start of string \A
      Start = 1,
      // start of string \A
      StartOfString = 2,
      // start of line ^
      StartOfLine = 3,
      // start of scan \G
      StartOfScan = 4,
      // end or before newline at end \Z
      End = 5,
      // end of string \z
      EndOfString = 6,
      // end of line $
      EndOfLine = 7,
      // word boundary \b
      Boundary = 8,
      // not word boundary \B
      NonBoundary = 9
  }
  82. // see category.cs for Category enum
  // A matching engine: something that can scan input text for a Regex and
  // report the outcome as a Match.
  interface IMachine
  {
      // Scans `text` on behalf of `regex` and returns the resulting Match.
      // NOTE(review): `start` and `end` presumably delimit the region of
      // `text` to scan; their exact meaning (and the result for "no match")
      // is defined by the implementations -- confirm there.
      Match Scan (Regex regex, string text, int start, int end);
  }
  // Produces IMachine instances and exposes pattern-level data that goes
  // with them.
  interface IMachineFactory
  {
      // Creates a new IMachine.
      IMachine NewInstance ();

      // Readable and writable dictionary associated with the pattern.
      // NOTE(review): presumably the group name/number mapping -- the key and
      // value types are not visible here; confirm against the callers.
      IDictionary Mapping { get; set; }

      // Read-only group count.
      // NOTE(review): presumably the number of capturing groups in the
      // pattern -- confirm against the implementations.
      int GroupCount { get; }
  }
  91. // Anchor SKIP OFFSET
  92. //
  93. // Flags: [RightToLeft] ??
  94. // SKIP: relative address of tail expression
  95. // OFFSET: offset of anchor from start of pattern
  96. //
  97. // Usage:
  98. //
  99. // Anchor :1 OFFSET
  100. // <expr>
  101. // True
  102. // 1: <tail>
  103. //
  104. // Notes:
  105. //
  106. // In practice, the anchoring expression is only going to be
  107. // Position (StartOfString, StartOfLine, StartOfScan) or String.
  108. // This is because the optimizer looks for position anchors at the
  109. // start of the expression, and if that fails it looks for the
  110. // longest substring. If an expression has neither a position
  111. // anchor nor a longest substring anchor, then the anchoring expression
  112. // is left empty. Since an empty expression will anchor at any
  113. // position in any string, the entire input string will be scanned.
  114. // String LEN STR...
  115. //
  116. // Flags: [RightToLeft, IgnoreCase]
  117. // LEN: length of string
  118. // STR: string characters
  119. // Branch SKIP
  120. //
  121. // SKIP: relative address of next branch
  122. //
  123. // Branch :1
  124. // <alt expr 1>
  125. // Jump :4
  126. // 1: Branch :2
  127. // <alt expr 2>
  128. // Jump :4
  129. // 2: Branch :3
  130. // <alt expr 3>
  131. // Jump :4
  132. // 3: False
  133. // 4: <tail>
  134. // Repeat SKIP MIN MAX
  135. //
  136. // Flags: [Lazy]
  137. // SKIP: relative address of Until instruction
  138. // MIN: minimum iterations
  139. // MAX: maximum iterations (0xffff is infinity)
  140. //
  141. // Repeat :1 MIN MAX
  142. // <expr>
  143. // Until
  144. // 1: <tail>
  145. // FastRepeat SKIP MIN MAX
  146. //
  147. // Flags: [Lazy]
  148. // SKIP: relative address of tail expression
  149. // MIN: minimum iterations
  150. // MAX: maximum iterations (0xffff is infinity)
  151. //
  152. // FastRepeat :1 MIN MAX
  153. // <expr>
  154. // True
  155. // 1: <tail>
  156. //
  157. // Notes:
  158. //
  159. // The subexpression of a FastRepeat construct must not contain any
  160. // complex operators. These include: Open, Close, Balance, Repeat,
  161. // FastRepeat, Sub, Test. In addition, the subexpression must have
  162. // been determined to have a fixed width.
  163. // Sub SKIP
  164. //
  165. // SKIP: relative address of tail expression
  166. //
  167. // Sub :1
  168. // <expr>
  169. // 1: <tail>
  170. //
  171. // Notes:
  172. //
  173. // The Sub operator invokes an independent subexpression. This means
  174. // that the subexpression will match only once and so will not
  175. // participate in any backtracking.
  176. // Test TSKIP FSKIP
  177. //
  178. // TSKIP: relative address of true expression
  179. // FSKIP: relative address of false expression
  180. //
  181. // Usage: (?(?=test)true|false)
  182. //
  183. // Test :1 :2
  184. // <test expr>
  185. // 1: <true expr>
  186. // Jump :3
  187. // 2: <false expr>
  188. // 3: <tail>
  189. //
  190. // Usage: (?(?=test)true)
  191. //
  192. // Test :1 :2
  193. // <test expr>
  194. // 1: <true expr>
  195. // 2: <tail>
  196. //
  197. // Usage: (?=test)
  198. //
  199. // Test :1 :2
  200. // <test expr>
  201. // 1: <true expr>
  202. // Jump :3
  203. // 2: False
  204. // 3: <tail>
  205. //
  206. // Notes:
  207. //
  208. // For negative lookaheads, just swap the values of TSKIP and
  209. // FSKIP. For lookbehinds, the test expression must be compiled
  210. // in reverse. The test expression is always executed as an
  211. // independent subexpression, so its behaviour is non-backtracking
  212. // (like a Sub clause.)
  213. // IfDefined SKIP GID
  214. //
  215. // SKIP: relative address of else expression
  216. // GID: number of group to check
  217. //
  218. // Usage: (?(gid)true)
  219. //
  220. // IfDefined :1 GID
  221. // <true expr>
  222. // 1: <tail>
  223. //
  224. // Usage: (?(gid)true|false)
  225. //
  226. // IfDefined :1 GID
  227. // <true expr>
  228. // Jump :2
  229. // 1: <false expr>
  230. // 2: <tail>
  231. // Jump SKIP
  232. //
  233. // SKIP: relative address of target expression
  234. //
  235. // Jump :1
  236. // ...
  237. // :1 <target expr>
  238. // Character CHAR
  239. //
  240. // Flags: [Negate, IgnoreCase, RightToLeft]
  241. // CHAR: exact character to match
  242. // Category CAT
  243. //
  244. // Flags: [Negate, RightToLeft]
  245. // CAT: category to match (see Category enum)
  246. // Range LO HI
  247. //
  248. // Flags: [Negate, IgnoreCase, RightToLeft]
  249. // LO: lowest character in range
  250. // HI: highest character in range
  251. // Set LO LEN SET...
  252. //
  253. // Flags: [Negate, IgnoreCase, RightToLeft]
  254. // LO: lowest character in set
  255. // LEN: number of words in set
  256. // SET: bit array representing characters in set
  257. //
  258. // Notes:
  259. //
  260. // Each word in the set represents 16 characters, so the first word
  261. // defines membership for characters LO to LO + 15, the second for
  262. // LO + 16 to LO + 31, and so on up to LO + (LEN * 16 - 1). It is
  263. // up to the compiler to provide a compact representation for sparse
  264. // unicode sets. The simple way is to use Set 0 4096. Other methods
  265. // involve partitioning the set and placing the components into an
  266. // In block.
  267. // In SKIP
  268. //
  269. // SKIP: relative address of tail expression
  270. //
  271. // Usage: [expr]
  272. //
  273. // In :1
  274. // <expr>
  275. // True
  276. // :1 <tail>
  277. //
  278. // Usage: [^expr]
  279. //
  280. // In :1
  281. // <expr>
  282. // False
  283. // :1 <tail>
  284. //
  285. // Notes:
  286. //
  287. // The In instruction consumes a single character, using the flags
  288. // of the first instruction in the subexpression to determine its
  289. // IgnoreCase and RightToLeft properties. The subexpression is then
  290. // applied to the single character as a disjunction. If any instruction
  291. // in the subexpression succeeds, the entire In construct succeeds
  292. // and matching continues with the tail.
  293. // Position POS
  294. //
  295. // POS: position to match (see Position enum)
  296. // Open GID
  297. //
  298. // GID: number of group to open
  299. // Close GID
  300. //
  301. // GID: number of group to close
  302. // Balance GID BAL
  303. //
  304. // GID: number of capturing group (0 if none)
  305. // BAL: number of group to undefine
  306. // Info GROUPS MIN MAX
  307. //
  308. // GROUPS: number of capturing groups
  309. // MIN: minimum width of pattern
  310. // MAX: maximum width of pattern (0xffff means undefined)
  311. // False
  312. // True
  313. // Reference GID
  314. //
  315. // Flags: [IgnoreCase, RightToLeft]
  316. // GID: number of group to reference
  317. }