arch.cs 7.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333
  1. //
  2. // assembly: System
  3. // namespace: System.Text.RegularExpressions
  4. // file: arch.cs
  5. //
  6. // author: Dan Lewis ([email protected])
  7. // (c) 2002
  8. using System;
  9. namespace System.Text.RegularExpressions {
  // Operation codes for regular-expression instructions. The operand
  // layout of each opcode is described in the comments further down in
  // this file. Values are consecutive from zero; they are written out
  // explicitly here so the numbering is visible at a glance.
  enum OpCode : ushort
  {
      False = 0,          // always fails
      True = 1,           // always succeeds

      // matching
      Position = 2,       // zero-width position assertion
      String = 3,         // match string literal
      Reference = 4,      // back reference

      // character matching
      Character = 5,      // match character exactly
      Category = 6,       // match character from category
      Range = 7,          // match character from range
      Set = 8,            // match character from set
      In = 9,             // match character from group of tests

      // capturing
      Open = 10,          // open group
      Close = 11,         // close group
      Balance = 12,       // balance groups

      // control flow
      IfDefined = 13,     // conditional on capture
      Sub = 14,           // non-backtracking subexpression
      Test = 15,          // non-backtracking lookahead/behind
      Branch = 16,        // alternative expression
      Jump = 17,          // unconditional goto
      Repeat = 18,        // new repeat context
      Until = 19,         // repeat subexpression within context
      FastRepeat = 20,    // repeat simple subexpression
      Anchor = 21,        // anchoring expression

      // miscellaneous
      Info = 22           // pattern information
  }
  // Modifier flags for instructions; the per-opcode comments in this
  // file list which flags each opcode accepts.
  // NOTE(review): every flag sits at bit 8 or above, presumably so it
  // can share a 16-bit word with an OpCode value -- confirm against the
  // code that encodes and decodes instructions.
  [Flags]
  enum OpFlags : ushort
  {
      None        = 0,
      Negate      = 1 << 8,     // 0x100: succeed on mismatch
      IgnoreCase  = 1 << 9,     // 0x200: case insensitive matching
      RightToLeft = 1 << 10,    // 0x400: right-to-left matching
      Lazy        = 1 << 11     // 0x800: minimizing repeat
  }
  // Kinds of zero-width position test; used as the POS operand of the
  // Position opcode. Values are consecutive from zero.
  // NOTE(review): Start and StartOfString carry the same description
  // in the original comments; the difference between them is not
  // visible in this file -- confirm against the parser/interpreter.
  enum Position : ushort
  {
      Any = 0,              // anywhere
      Start = 1,            // start of string \A
      StartOfString = 2,    // start of string \A
      StartOfLine = 3,      // start of line ^
      StartOfScan = 4,      // start of scan \G
      End = 5,              // end or before newline at end \Z
      EndOfString = 6,      // end of string \z
      EndOfLine = 7,        // end of line $
      Boundary = 8,         // word boundary \b
      NonBoundary = 9       // not word boundary \B
  }
  60. // see category.cs for Category enum
  // Contract for an object that can scan input text for a Regex.
  interface IMachine
  {
      // Takes the Regex, the input text and a start/end pair, and
      // returns a Match.
      // NOTE(review): start and end presumably delimit the part of
      // text to examine -- confirm against the implementing classes.
      Match Scan(Regex regex, string text, int start, int end);
  }
  // Factory contract that hands out IMachine objects.
  interface IMachineFactory
  {
      // Returns an IMachine.
      // NOTE(review): the name suggests a fresh instance per call --
      // confirm against the implementing classes.
      IMachine NewInstance();
  }
  67. // Anchor SKIP OFFSET
  68. //
  69. // Flags: [RightToLeft] ??
  70. // SKIP: relative address of tail expression
  71. // OFFSET: offset of anchor from start of pattern
  72. //
  73. // Usage:
  74. //
  75. // Anchor :1 OFFSET
  76. // <expr>
  77. // True
  78. // 1: <tail>
  79. //
  80. // Notes:
  81. //
  82. // In practice, the anchoring expression is only going to be
  83. // Position (StartOfString, StartOfLine, StartOfScan) or String.
  84. // This is because the optimizer looks for position anchors at the
  85. // start of the expression, and if that fails it looks for the
  86. // longest substring. If an expression has neither a position
  87. // anchor nor a longest substring anchor, then the anchoring expression
  88. // is left empty. Since an empty expression will anchor at any
  89. // position in any string, the entire input string will be scanned.
  90. // String LEN STR...
  91. //
  92. // Flags: [RightToLeft, IgnoreCase]
  93. // LEN: length of string
  94. // STR: string characters
  95. // Branch SKIP
  96. //
  97. // SKIP: relative address of next branch
  98. //
  99. // Branch :1
  100. // <alt expr 1>
  101. // Jump :4
  102. // 1: Branch :2
  103. // <alt expr 2>
  104. // Jump :4
  105. // 2: Branch :3
  106. // <alt expr 3>
  107. // Jump :4
  108. // 3: False
  109. // 4: <tail>
  110. // Repeat SKIP MIN MAX
  111. //
  112. // Flags: [Lazy]
  113. // SKIP: relative address of Until instruction
  114. // MIN: minimum iterations
  115. // MAX: maximum iterations (0xffff is infinity)
  116. //
  117. // Repeat :1 MIN MAX
  118. // <expr>
  119. // Until
  120. // 1: <tail>
  121. // FastRepeat SKIP MIN MAX
  122. //
  123. // Flags: [Lazy]
  124. // SKIP: relative address of tail expression
  125. // MIN: minimum iterations
  126. // MAX: maximum iterations (0xffff is infinity)
  127. //
  128. // FastRepeat :1 MIN MAX
  129. // <expr>
  130. // True
  131. // 1: <tail>
  132. //
  133. // Notes:
  134. //
  135. // The subexpression of a FastRepeat construct must not contain any
  136. // complex operators. These include: Open, Close, Balance, Repeat,
  137. // FastRepeat, Sub, Test. In addition, the subexpression must have
  138. // been determined to have a fixed width.
  139. // Sub SKIP
  140. //
  141. // SKIP: relative address of tail expression
  142. //
  143. // Sub :1
  144. // <expr>
  145. // 1: <tail>
  146. //
  147. // Notes:
  148. //
  149. // The Sub operator invokes an independent subexpression. This means
  150. // that the subexpression will match only once and so will not
  151. // participate in any backtracking.
  152. // Test TSKIP FSKIP
  153. //
  154. // TSKIP: relative address of true expression
  155. // FSKIP: relative address of false expression
  156. //
  157. // Usage: (?(?=test)true|false)
  158. //
  159. // Test :1 :2
  160. // <test expr>
  161. // 1: <true expr>
  162. // Jump :3
  163. // 2: <false expr>
  164. // 3: <tail>
  165. //
  166. // Usage: (?(?=test)true)
  167. //
  168. // Test :1 :2
  169. // <test expr>
  170. // 1: <true expr>
  171. // 2: <tail>
  172. //
  173. // Usage: (?=test)
  174. //
  175. // Test :1 :2
  176. // <test expr>
  177. // 1: <true expr>
  178. // Jump :3
  179. // 2: False
  180. // 3: <tail>
  181. //
  182. // Notes:
  183. //
  184. // For negative lookaheads, just swap the values of TSKIP and
  185. // FSKIP. For lookbehinds, the test expression must be compiled
  186. // in reverse. The test expression is always executed as an
  187. // independent subexpression, so its behaviour is non-backtracking
  188. // (like a Sub clause.)
  189. // IfDefined SKIP GID
  190. //
  191. // SKIP: relative address of else expression
  192. // GID: number of group to check
  193. //
  194. // Usage: (?(gid)true)
  195. //
  196. // IfDefined :1 GID
  197. // <true expr>
  198. // 1: <tail>
  199. //
  200. // Usage: (?(gid)true|false)
  201. //
  202. // IfDefined :1 GID
  203. // <true expr>
  204. // Jump :2
  205. // 1: <false expr>
  206. // 2: <tail>
  207. // Jump SKIP
  208. //
  209. // SKIP: relative address of target expression
  210. //
  211. // Jump :1
  212. // ...
  213. // 1: <target expr>
  214. // Character CHAR
  215. //
  216. // Flags: [Negate, IgnoreCase, RightToLeft]
  217. // CHAR: exact character to match
  218. // Category CAT
  219. //
  220. // Flags: [Negate, RightToLeft]
  221. // CAT: category to match (see Category enum)
  222. // Range LO HI
  223. //
  224. // Flags: [Negate, IgnoreCase, RightToLeft]
  225. // LO: lowest character in range
  226. // HI: highest character in range
  227. // Set LO LEN SET...
  228. //
  229. // Flags: [Negate, IgnoreCase, RightToLeft]
  230. // LO: lowest character in set
  231. // LEN: number of words in set
  232. // SET: bit array representing characters in set
  233. //
  234. // Notes:
  235. //
  236. // Each word in the set represents 16 characters, so the first word
  237. // defines membership for characters LO to LO + 15, the second for
  238. // LO + 16 to LO + 31, and so on up to LO + (LEN * 16 - 1). It is
  239. // up to the compiler to provide a compact representation for sparse
  240. // unicode sets. The simple way is to use Set 0 4096. Other methods
  241. // involve partitioning the set and placing the components into an
  242. // In block.
  243. // In SKIP
  244. //
  245. // SKIP: relative address of tail expression
  246. //
  247. // Usage: [expr]
  248. //
  249. // In :1
  250. // <expr>
  251. // True
  252. // 1: <tail>
  253. //
  254. // Usage: [^expr]
  255. //
  256. // In :1
  257. // <expr>
  258. // False
  259. // 1: <tail>
  260. //
  261. // Notes:
  262. //
  263. // The In instruction consumes a single character, using the flags
  264. // of the first instruction in the subexpression to determine its
  265. // IgnoreCase and RightToLeft properties. The subexpression is then
  266. // applied to the single character as a disjunction. If any instruction
  267. // in the subexpression succeeds, the entire In construct succeeds
  268. // and matching continues with the tail.
  269. // Position POS
  270. //
  271. // POS: position to match (see Position enum)
  272. // Open GID
  273. //
  274. // GID: number of group to open
  275. // Close GID
  276. //
  277. // GID: number of group to close
  278. // Balance GID BAL
  279. //
  280. // GID: number of capturing group (0 if none)
  281. // BAL: number of group to undefine
  282. // Info GROUPS MIN MAX
  283. //
  284. // GROUPS: number of capturing groups
  285. // MIN: minimum width of pattern
  286. // MAX: maximum width of pattern (0xffff means undefined)
  287. // False
  288. // True
  289. // Reference GID
  290. //
  291. // Flags: [IgnoreCase, RightToLeft]
  292. // GID: number of group to reference
  293. }