| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336 |
- //
- // assembly: System
- // namespace: System.Text.RegularExpressions
- // file: arch.cs
- //
- // author: Dan Lewis ([email protected])
- // (c) 2002
- using System;
- using System.Collections;
- namespace System.Text.RegularExpressions {
- enum OpCode : ushort {
- // Instruction codes of the compiled pattern. Numeric values are spelled
- // out explicitly; they are the same values sequential numbering yields.
- // NOTE(review): every value is below 0x100, presumably so an opcode can
- // share a word with the OpFlags bits - confirm against the compiler.
- // trivial outcomes
- False = 0, // unconditional failure
- True = 1, // unconditional success
- // matching
- Position = 2, // zero-width assertion on the current position
- String = 3, // string literal
- Reference = 4, // back reference
- // single-character matching
- Character = 5, // one exact character
- Category = 6, // character belonging to a category
- Range = 7, // character within a range
- Set = 8, // character contained in a set
- In = 9, // character accepted by a group of tests
- // capturing
- Open = 10, // opens a group
- Close = 11, // closes a group
- Balance = 12, // balances groups
- // control flow
- IfDefined = 13, // conditional on a capture
- Sub = 14, // subexpression without backtracking
- Test = 15, // lookahead/lookbehind without backtracking
- Branch = 16, // alternative expression
- Jump = 17, // unconditional goto
- Repeat = 18, // starts a new repeat context
- Until = 19, // repeats the subexpression inside the context
- FastRepeat = 20, // repeat of a simple subexpression
- Anchor = 21, // anchoring expression
- // miscellaneous
- Info = 22 // information about the pattern
- }
- [Flags]
- enum OpFlags : ushort {
- // Modifier bits; each non-zero member is a single distinct bit, 0x100 and up.
- None = 0,
- Negate = 1 << 8, // 0x100: success when the test does not match
- IgnoreCase = 1 << 9, // 0x200: match without regard to case
- RightToLeft = 1 << 10, // 0x400: match from right to left
- Lazy = 1 << 11 // 0x800: minimizing repeat
- }
- enum Position : ushort {
- // Kinds of zero-width position assertion; values are written out and
- // equal what sequential numbering from zero produces.
- // NOTE(review): Start and StartOfString carry the same description (\A) -
- // confirm whether the two are meant to differ.
- Any = 0, // anywhere
- Start = 1, // start of string \A
- StartOfString = 2, // start of string \A
- StartOfLine = 3, // start of line ^
- StartOfScan = 4, // start of scan \G
- End = 5, // end, or before a newline at the end \Z
- EndOfString = 6, // end of string \z
- EndOfLine = 7, // end of line $
- Boundary = 8, // word boundary \b
- NonBoundary = 9 // not a word boundary \B
- }
-
- // see category.cs for Category enum
- interface IMachine { // contract with a single operation, Scan, that yields a Match
- Match Scan (Regex regex, string text, int start, int end); // takes the Regex, the input text and two int bounds; NOTE(review): whether start/end are inclusive or exclusive is not visible here - confirm with the implementers
- }
- interface IMachineFactory { // creates IMachine objects; also carries a Mapping dictionary and a GroupCount
- IMachine NewInstance (); // returns an IMachine; NOTE(review): presumably a fresh instance on every call - confirm
- IDictionary Mapping { get; set; } // read/write, non-generic dictionary; NOTE(review): presumably group names to group numbers - confirm
- int GroupCount { get; } // read-only; NOTE(review): presumably the number of capturing groups (cf. Info GROUPS) - confirm
- }
- // Anchor SKIP OFFSET
- //
- // Flags: [RightToLeft] ??
- // SKIP: relative address of tail expression
- // OFFSET: offset of anchor from start of pattern
- //
- // Usage:
- //
- // Anchor :1 OFFSET
- // <expr>
- // True
- // 1: <tail>
- //
- // Notes:
- //
- // In practice, the anchoring expression is only going to be
- // Position (StartOfString, StartOfLine, StartOfScan) or String.
- // This is because the optimizer looks for position anchors at the
- // start of the expression, and if that fails it looks for the
- // longest substring. If an expression has neither a position
- // anchor nor a longest substring anchor, then the anchoring expression
- // is left empty. Since an empty expression will anchor at any
- // position in any string, the entire input string will be scanned.
- // String LEN STR...
- //
- // Flags: [RightToLeft, IgnoreCase]
- // LEN: length of string
- // STR: string characters
- // Branch SKIP
- //
- // SKIP: relative address of next branch
- //
- // Branch :1
- // <alt expr 1>
- // Jump :4
- // 1: Branch :2
- // <alt expr 2>
- // Jump :4
- // 2: Branch :3
- // <alt expr 3>
- // Jump :4
- // 3: False
- // 4: <tail>
- // Repeat SKIP MIN MAX
- //
- // Flags: [Lazy]
- // SKIP: relative address of Until instruction
- // MIN: minimum iterations
- // MAX: maximum iterations (0xffff is infinity)
- //
- // Repeat :1 MIN MAX
- // <expr>
- // Until
- // 1: <tail>
- // FastRepeat SKIP MIN MAX
- //
- // Flags: [Lazy]
- // SKIP: relative address of tail expression
- // MIN: minimum iterations
- // MAX: maximum iterations (0xffff is infinity)
- //
- // FastRepeat :1 MIN MAX
- // <expr>
- // True
- // 1: <tail>
- //
- // Notes:
- //
- // The subexpression of a FastRepeat construct must not contain any
- // complex operators. These include: Open, Close, Balance, Repeat,
- // FastRepeat, Sub, Test. In addition, the subexpression must have
- // been determined to have a fixed width.
-
- // Sub SKIP
- //
- // SKIP: relative address of tail expression
- //
- // Sub :1
- // <expr>
- // 1: <tail>
- //
- // Notes:
- //
- // The Sub operator invokes an independent subexpression. This means
- // that the subexpression will match only once and so will not
- // participate in any backtracking.
- // Test TSKIP FSKIP
- //
- // TSKIP: relative address of true expression
- // FSKIP: relative address of false expression
- //
- // Usage: (?(?=test)true|false)
- //
- // Test :1 :2
- // <test expr>
- // 1: <true expr>
- // Jump :3
- // 2: <false expr>
- // 3: <tail>
- //
- // Usage: (?(?=test)true)
- //
- // Test :1 :2
- // <test expr>
- // 1: <true expr>
- // 2: <tail>
- //
- // Usage: (?=test)
- //
- // Test :1 :2
- // <test expr>
- // 1: <true expr>
- // Jump :3
- // 2: False
- // 3: <tail>
- //
- // Notes:
- //
- // For negative lookaheads, just swap the values of TSKIP and
- // FSKIP. For lookbehinds, the test expression must be compiled
- // in reverse. The test expression is always executed as an
- // independent subexpression, so its behaviour is non-backtracking
- // (like a Sub clause.)
- // IfDefined SKIP GID
- //
- // SKIP: relative address of else expression
- // GID: number of group to check
- //
- // Usage: (?(gid)true)
- //
- // IfDefined :1 GID
- // <true expr>
- // 1: <tail>
- //
- // Usage: (?(gid)true|false)
- //
- // IfDefined :1 GID
- // <true expr>
- // Jump :2
- // 1: <false expr>
- // 2: <tail>
- // Jump SKIP
- //
- // SKIP: relative address of target expression
- //
- // Jump :1
- // ...
- // 1: <target expr>
- // Character CHAR
- //
- // Flags: [Negate, IgnoreCase, RightToLeft]
- // CHAR: exact character to match
- // Category CAT
- //
- // Flags: [Negate, RightToLeft]
- // CAT: category to match (see Category enum)
- // Range LO HI
- //
- // Flags: [Negate, IgnoreCase, RightToLeft]
- // LO: lowest character in range
- // HI: highest character in range
- // Set LO LEN SET...
- //
- // Flags: [Negate, IgnoreCase, RightToLeft]
- // LO: lowest character in set
- // LEN: number of words in set
- // SET: bit array representing characters in set
- //
- // Notes:
- //
- // Each word in the set represents 16 characters, so the first word
- // defines membership for characters LO to LO + 15, the second for
- // LO + 16 to LO + 31, and so on up to LO + (LEN * 16 - 1). It is
- // up to the compiler to provide a compact representation for sparse
- // unicode sets. The simple way is to use Set 0 4096. Other methods
- // involve partitioning the set and placing the components into an
- // In block.
- // In SKIP
- //
- // SKIP: relative address of tail expression
- //
- // Usage: [expr]
- //
- // In :1
- // <expr>
- // True
- // 1: <tail>
- //
- // Usage: [^expr]
- //
- // In :1
- // <expr>
- // False
- // 1: <tail>
- //
- // Notes:
- //
- // The In instruction consumes a single character, using the flags
- // of the first instruction in the subexpression to determine its
- // IgnoreCase and RightToLeft properties. The subexpression is then
- // applied to the single character as a disjunction. If any instruction
- // in the subexpression succeeds, the entire In construct succeeds
- // and matching continues with the tail.
- // Position POS
- //
- // POS: position to match (see Position enum)
- // Open GID
- //
- // GID: number of group to open
- // Close GID
- //
- // GID: number of group to close
-
- // Balance GID BAL
- //
- // GID: number of capturing group (0 if none)
- // BAL: number of group to undefine
- // Info GROUPS MIN MAX
- //
- // GROUPS: number of capturing groups
- // MIN: minimum width of pattern
- // MAX: maximum width of pattern (0xffff means undefined)
- // False
- // True
- // Reference GID
- //
- // Flags: [IgnoreCase, RightToLeft]
- // GID: number of group to reference
- }
|