arch.cs 7.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336
  1. //
  2. // assembly: System
  3. // namespace: System.Text.RegularExpressions
  4. // file: arch.cs
  5. //
  6. // author: Dan Lewis ([email protected])
  7. // (c) 2002
  8. using System;
  9. using System.Collections;
  10. namespace System.Text.RegularExpressions {
  // Instruction opcodes of the compiled pattern program. The underlying
  // type is ushort and members are numbered sequentially from False = 0.
  // The operand layout of each opcode is described in the per-instruction
  // notes further down in this file.
  enum OpCode : ushort {
      False = 0,      // always fails
      True,           // always succeeds

      // matching

      Position,       // zero-width position assertion
      String,         // match string literal
      Reference,      // back reference

      // character matching

      Character,      // match character exactly
      Category,       // match character from category
      Range,          // match character from range
      Set,            // match character from set
      In,             // match character from group of tests

      // capturing

      Open,           // open group
      Close,          // close group
      Balance,        // balance groups

      // control flow

      IfDefined,      // conditional on capture
      Sub,            // non-backtracking subexpression
      Test,           // non-backtracking lookahead/behind
      Branch,         // alternative expression
      Jump,           // unconditional goto
      Repeat,         // new repeat context
      Until,          // repeat subexpression within context
      FastRepeat,     // repeat simple subexpression
      Anchor,         // anchoring expression

      // miscellaneous

      Info            // pattern information
  }
  // Modifier bits that qualify an instruction. Every flag lies in bits
  // 8-11, so the low byte of the ushort is left clear.
  // NOTE(review): presumably this lets a flag set be OR-ed into the same
  // 16-bit word as an OpCode - confirm against the compiler/interpreter.
  [Flags]
  enum OpFlags : ushort {
      None        = 0x000,
      Negate      = 0x100,    // succeed on mismatch
      IgnoreCase  = 0x200,    // case insensitive matching
      RightToLeft = 0x400,    // right-to-left matching
      Lazy        = 0x800     // minimizing repeat
  }
  // Zero-width positions that a Position instruction can assert
  // (the POS operand of the Position opcode). Members are numbered
  // sequentially from Any = 0.
  enum Position : ushort {
      Any,            // anywhere
      // NOTE(review): Start and StartOfString are documented identically
      // here; the intended difference is not visible in this file - confirm.
      Start,          // start of string \A
      StartOfString,  // start of string \A
      StartOfLine,    // start of line ^
      StartOfScan,    // start of scan \G
      End,            // end or before newline at end \Z
      EndOfString,    // end of string \z
      EndOfLine,      // end of line $
      Boundary,       // word boundary \b
      NonBoundary     // not word boundary \B
  }
  61. // see category.cs for Category enum
  // A matching engine that scans input text on behalf of a Regex.
  interface IMachine {
      // Scans `text` for `regex` and returns the resulting Match.
      // NOTE(review): `start` and `end` presumably delimit the region of
      // `text` to scan; their exact meaning (and which bound is inclusive)
      // is not visible in this file - confirm against an implementation.
      Match Scan (Regex regex, string text, int start, int end);
  }
  // Factory for IMachine instances, also exposing pattern-level data.
  interface IMachineFactory {
      // Returns an IMachine produced by this factory.
      IMachine NewInstance ();

      // Readable and writable dictionary associated with the pattern.
      // NOTE(review): presumably the group name/number mapping - the key
      // and value types are not visible in this file; confirm with callers.
      IDictionary Mapping { get; set; }

      // Number of groups in the pattern (read-only).
      // NOTE(review): whether this counts group 0 is not visible here.
      int GroupCount { get; }
  }
  70. // Anchor SKIP OFFSET
  71. //
  72. // Flags: [RightToLeft] ??
  73. // SKIP: relative address of tail expression
  74. // OFFSET: offset of anchor from start of pattern
  75. //
  76. // Usage:
  77. //
  78. // Anchor :1 OFFSET
  79. // <expr>
  80. // True
  81. // 1: <tail>
  82. //
  83. // Notes:
  84. //
  85. // In practice, the anchoring expression is only going to be
  86. // Position (StartOfString, StartOfLine, StartOfScan) or String.
  87. // This is because the optimizer looks for position anchors at the
  88. // start of the expression, and if that fails it looks for the
  89. // longest substring. If an expression has neither a position
  90. // anchor nor a longest substring anchor, then the anchoring expression
  91. // is left empty. Since an empty expression will anchor at any
  92. // position in any string, the entire input string will be scanned.
  93. // String LEN STR...
  94. //
  95. // Flags: [RightToLeft, IgnoreCase]
  96. // LEN: length of string
  97. // STR: string characters
  98. // Branch SKIP
  99. //
  100. // SKIP: relative address of next branch
  101. //
  102. // Branch :1
  103. // <alt expr 1>
  104. // Jump :4
  105. // 1: Branch :2
  106. // <alt expr 2>
  107. // Jump :4
  108. // 2: Branch :3
  109. // <alt expr 3>
  110. // Jump :4
  111. // 3: False
  112. // 4: <tail>
  113. // Repeat SKIP MIN MAX
  114. //
  115. // Flags: [Lazy]
  116. // SKIP: relative address of Until instruction
  117. // MIN: minimum iterations
  118. // MAX: maximum iterations (0xffff is infinity)
  119. //
  120. // Repeat :1 MIN MAX
  121. // <expr>
  122. // Until
  123. // 1: <tail>
  124. // FastRepeat SKIP MIN MAX
  125. //
  126. // Flags: [Lazy]
  127. // SKIP: relative address of tail expression
  128. // MIN: minimum iterations
  129. // MAX: maximum iterations (0xffff is infinity)
  130. //
  131. // FastRepeat :1 MIN MAX
  132. // <expr>
  133. // True
  134. // 1: <tail>
  135. //
  136. // Notes:
  137. //
  138. // The subexpression of a FastRepeat construct must not contain any
  139. // complex operators. These include: Open, Close, Balance, Repeat,
  140. // FastRepeat, Sub, Test. In addition, the subexpression must have
  141. // been determined to have a fixed width.
  142. // Sub SKIP
  143. //
  144. // SKIP: relative address of tail expression
  145. //
  146. // Sub :1
  147. // <expr>
  148. // 1: <tail>
  149. //
  150. // Notes:
  151. //
  152. // The Sub operator invokes an independent subexpression. This means
  153. // that the subexpression will match only once and so will not
  154. // participate in any backtracking.
  155. // Test TSKIP FSKIP
  156. //
  157. // TSKIP: relative address of true expression
  158. // FSKIP: relative address of false expression
  159. //
  160. // Usage: (?(?=test)true|false)
  161. //
  162. // Test :1 :2
  163. // <test expr>
  164. // 1: <true expr>
  165. // Jump :3
  166. // 2: <false expr>
  167. // 3: <tail>
  168. //
  169. // Usage: (?(?=test)true)
  170. //
  171. // Test :1 :2
  172. // <test expr>
  173. // 1: <true expr>
  174. // 2: <tail>
  175. //
  176. // Usage: (?=test)
  177. //
  178. // Test :1 :2
  179. // <test expr>
  180. // 1: <true expr>
  181. // Jump :3
  182. // 2: False
  183. // 3: <tail>
  184. //
  185. // Notes:
  186. //
  187. // For negative lookaheads, just swap the values of TSKIP and
  188. // FSKIP. For lookbehinds, the test expression must be compiled
  189. // in reverse. The test expression is always executed as an
  190. // independent subexpression, so its behaviour is non-backtracking
  191. // (like a Sub clause.)
  192. // IfDefined SKIP GID
  193. //
  194. // SKIP: relative address of else expression
  195. // GID: number of group to check
  196. //
  197. // Usage: (?(gid)true)
  198. //
  199. // IfDefined :1 GID
  200. // <true expr>
  201. // 1: <tail>
  202. //
  203. // Usage: (?(gid)true|false)
  204. //
  205. // IfDefined :1 GID
  206. // <true expr>
  207. // Jump :2
  208. // 1: <false expr>
  209. // 2: <tail>
  210. // Jump SKIP
  211. //
  212. // SKIP: relative address of target expression
  213. //
  214. // Jump :1
  215. // ...
  216. // :1 <target expr>
  217. // Character CHAR
  218. //
  219. // Flags: [Negate, IgnoreCase, RightToLeft]
  220. // CHAR: exact character to match
  221. // Category CAT
  222. //
  223. // Flags: [Negate, RightToLeft]
  224. // CAT: category to match (see Category enum)
  225. // Range LO HI
  226. //
  227. // Flags: [Negate, IgnoreCase, RightToLeft]
  228. // LO: lowest character in range
  229. // HI: highest character in range
  230. // Set LO LEN SET...
  231. //
  232. // Flags: [Negate, IgnoreCase, RightToLeft]
  233. // LO: lowest character in set
  234. // LEN: number of words in set
  235. // SET: bit array representing characters in set
  236. //
  237. // Notes:
  238. //
  239. // Each word in the set represents 16 characters, so the first word
  240. // defines membership for characters LO to LO + 15, the second for
  241. // LO + 16 to LO + 31, and so on up to LO + (LEN * 16 - 1). It is
  242. // up to the compiler to provide a compact representation for sparse
  243. // unicode sets. The simple way is to use Set 0 4096. Other methods
  244. // involve partitioning the set and placing the components into an
  245. // In block.
  246. // In SKIP
  247. //
  248. // SKIP: relative address of tail expression
  249. //
  250. // Usage: [expr]
  251. //
  252. // In :1
  253. // <expr>
  254. // True
  255. // :1 <tail>
  256. //
  257. // Usage: [^expr]
  258. //
  259. // In :1
  260. // <expr>
  261. // False
  262. // :1 <tail>
  263. //
  264. // Notes:
  265. //
  266. // The In instruction consumes a single character, using the flags
  267. // of the first instruction in the subexpression to determine its
  268. // IgnoreCase and RightToLeft properties. The subexpression is then
  269. // applied to the single character as a disjunction. If any instruction
  270. // in the subexpression succeeds, the entire In construct succeeds
  271. // and matching continues with the tail.
  272. // Position POS
  273. //
  274. // POS: position to match (see Position enum)
  275. // Open GID
  276. //
  277. // GID: number of group to open
  278. // Close GID
  279. //
  280. // GID: number of group to close
  281. // Balance GID BAL
  282. //
  283. // GID: number of capturing group (0 if none)
  284. // BAL: number of group to undefine
  285. // Info GROUPS MIN MAX
  286. //
  287. // GROUPS: number of capturing groups
  288. // MIN: minimum width of pattern
  289. // MAX: maximum width of pattern (0xffff means undefined)
  290. // False
  291. // True
  292. // Reference GID
  293. //
  294. // Flags: [IgnoreCase, RightToLeft]
  295. // GID: number of group to reference
  296. }