| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976 |
- //
- // assembly: System
- // namespace: System.Text.RegularExpressions
- // file: syntax.cs
- //
- // author: Dan Lewis ([email protected])
- // (c) 2002
- using System;
- using System.Collections;
- namespace System.Text.RegularExpressions.Syntax {
- // collection classes
-
- class ExpressionCollection : CollectionBase {
- public void Add (Expression e) {
- List.Add (e);
- }
- public Expression this[int i] {
- get { return (Expression)List[i]; }
- set { List[i] = value; }
- }
- protected override void OnValidate (object o) {
- // allow null elements
- }
- }
- // abstract classes
-
- abstract class Expression {
- public abstract void Compile (ICompiler cmp, bool reverse);
- public abstract void GetWidth (out int min, out int max);
- public int GetFixedWidth () {
- int min, max;
- GetWidth (out min, out max);
- if (min == max)
- return min;
- return -1;
- }
- public virtual AnchorInfo GetAnchorInfo () {
- return new AnchorInfo (this, GetFixedWidth ());
- }
- public virtual bool IsComplex () {
- return true;
- }
- }
- // composite expressions
-
- abstract class CompositeExpression : Expression {
- public CompositeExpression () {
- expressions = new ExpressionCollection ();
- }
- protected ExpressionCollection Expressions {
- get { return expressions; }
- }
- protected void GetWidth (out int min, out int max, int count) {
- min = Int32.MaxValue;
- max = 0;
- bool empty = true;
- for (int i = 0; i < count; ++ i) {
- Expression e = Expressions[i];
- if (e == null)
- continue;
-
- empty = false;
- int a, b;
- e.GetWidth (out a, out b);
- if (a < min) min = a;
- if (b > max) max = b;
- }
- if (empty)
- min = max = 0;
- }
- private ExpressionCollection expressions;
- }
- // groups
-
- class Group : CompositeExpression {
- public Group () {
- }
- public Expression Expression {
- get { return Expressions[0]; }
- set { Expressions[0] = value; }
- }
- public void AppendExpression (Expression e) {
- Expressions.Add (e);
- }
- public override void Compile (ICompiler cmp, bool reverse) {
- int count = Expressions.Count;
- for (int i = 0; i < count; ++ i) {
- Expression e;
- if (reverse)
- e = Expressions[count - i - 1];
- else
- e = Expressions[i];
- e.Compile (cmp, reverse);
- }
- }
- public override void GetWidth (out int min, out int max) {
- min = 0;
- max = 0;
- foreach (Expression e in Expressions) {
- int a, b;
- e.GetWidth (out a, out b);
- min += a;
- if (max == Int32.MaxValue || b == Int32.MaxValue)
- max = Int32.MaxValue;
- else
- max += b;
- }
- }
- public override AnchorInfo GetAnchorInfo () {
- int ptr;
- int width = GetFixedWidth ();
- ArrayList infos = new ArrayList ();
- IntervalCollection segments = new IntervalCollection ();
- // accumulate segments
- ptr = 0;
- foreach (Expression e in Expressions) {
- AnchorInfo info = e.GetAnchorInfo ();
- infos.Add (info);
- if (info.IsPosition)
- return new AnchorInfo (this, ptr + info.Offset, width, info.Position);
- if (info.IsSubstring)
- segments.Add (info.GetInterval (ptr));
- if (info.IsUnknownWidth)
- break;
- ptr += info.Width;
- }
- // normalize and find the longest segment
- segments.Normalize ();
- Interval longest = Interval.Empty;
- foreach (Interval segment in segments) {
- if (segment.Size > longest.Size)
- longest = segment;
- }
- // now chain the substrings that made this segment together
- if (!longest.IsEmpty) {
- string str = "";
- bool ignore = false;
- ptr = 0;
- foreach (AnchorInfo info in infos) {
- if (info.IsSubstring && longest.Contains (info.GetInterval (ptr))) {
- str += info.Substring; // TODO mark subexpressions
- ignore |= info.IgnoreCase;
- }
- if (info.IsUnknownWidth)
- break;
- ptr += info.Width;
- }
- return new AnchorInfo (this, longest.low, width, str, ignore);
- }
- return new AnchorInfo (this, width);
- }
- public override bool IsComplex () {
- bool comp = false;
- foreach (Expression e in Expressions) {
- comp |= e.IsComplex ();
- }
- return comp | GetFixedWidth () <= 0;
- }
- }
- class RegularExpression : Group {
- public RegularExpression () {
- group_count = 0;
- }
- public int GroupCount {
- get { return group_count; }
- set { group_count = value; }
- }
- public override void Compile (ICompiler cmp, bool reverse) {
- // info block
- int min, max;
- GetWidth (out min, out max);
- cmp.EmitInfo (group_count, min, max);
- // anchoring expression
- AnchorInfo info = GetAnchorInfo ();
- if (reverse)
- info = new AnchorInfo (this, GetFixedWidth ()); // FIXME
- LinkRef pattern = cmp.NewLink ();
- cmp.EmitAnchor (info.Offset, pattern);
- if (info.IsPosition)
- cmp.EmitPosition (info.Position);
- else if (info.IsSubstring)
- cmp.EmitString (info.Substring, info.IgnoreCase, reverse);
-
- cmp.EmitTrue ();
-
- // pattern
- cmp.ResolveLink (pattern);
- base.Compile (cmp, reverse);
- cmp.EmitTrue ();
- }
- private int group_count;
- }
- class CapturingGroup : Group {
- public CapturingGroup () {
- this.gid = 0;
- this.name = null;
- }
- public int Number {
- get { return gid; }
- set { gid = value; }
- }
- public string Name {
- get { return name; }
- set { name = value; }
- }
- public bool IsNamed {
- get { return name != null; }
- }
- public override void Compile (ICompiler cmp, bool reverse) {
- cmp.EmitOpen (gid);
- base.Compile (cmp, reverse);
- cmp.EmitClose (gid);
- }
- public override bool IsComplex () {
- return true;
- }
- private int gid;
- private string name;
- }
- class BalancingGroup : CapturingGroup {
- public BalancingGroup () {
- this.balance = null;
- }
- public CapturingGroup Balance {
- get { return balance; }
- set { balance = value; }
- }
- public override void Compile (ICompiler cmp, bool reverse) {
- // can't invoke Group.Compile from here :(
- // so I'll just repeat the code
-
- int count = Expressions.Count;
- for (int i = 0; i < count; ++ i) {
- Expression e;
- if (reverse)
- e = Expressions[count - i - 1];
- else
- e = Expressions[i];
- e.Compile (cmp, reverse);
- }
- cmp.EmitBalance (this.Number, balance.Number);
- }
- private CapturingGroup balance;
- }
- class NonBacktrackingGroup : Group {
- public NonBacktrackingGroup () {
- }
- public override void Compile (ICompiler cmp, bool reverse) {
- LinkRef tail = cmp.NewLink ();
- cmp.EmitSub (tail);
- base.Compile (cmp, reverse);
- cmp.EmitTrue ();
- cmp.ResolveLink (tail);
- }
- public override bool IsComplex () {
- return true;
- }
- }
- // repetition
- class Repetition : CompositeExpression {
- public Repetition (int min, int max, bool lazy) {
- Expressions.Add (null);
-
- this.min = min;
- this.max = max;
- this.lazy = lazy;
- }
- public Expression Expression {
- get { return Expressions[0]; }
- set { Expressions[0] = value; }
- }
- public int Minimum {
- get { return min; }
- set { min = value; }
- }
- public int Maximum {
- get { return max; }
- set { max = value; }
- }
- public bool Lazy {
- get { return lazy; }
- set { lazy = value; }
- }
- public override void Compile (ICompiler cmp, bool reverse) {
- if (Expression.IsComplex ()) {
- LinkRef until = cmp.NewLink ();
-
- cmp.EmitRepeat (min, max, lazy, until);
- Expression.Compile (cmp, reverse);
- cmp.EmitUntil (until);
- }
- else {
- LinkRef tail = cmp.NewLink ();
- cmp.EmitFastRepeat (min, max, lazy, tail);
- Expression.Compile (cmp, reverse);
- cmp.EmitTrue ();
- cmp.ResolveLink (tail);
- }
- }
- public override void GetWidth (out int min, out int max) {
- Expression.GetWidth (out min, out max);
- min = min * this.min;
- if (max == Int32.MaxValue || this.max == 0xffff)
- max = Int32.MaxValue;
- else
- max = max * this.max;
- }
- public override AnchorInfo GetAnchorInfo () {
- int width = GetFixedWidth ();
- if (Minimum == 0)
- return new AnchorInfo (this, width);
-
- AnchorInfo info = Expression.GetAnchorInfo ();
- if (info.IsPosition)
- return new AnchorInfo (this, info.Offset, width, info.Position);
-
- if (info.IsSubstring) {
- if (info.IsComplete) {
- string str = "";
- for (int i = 0; i < Minimum; ++ i)
- str += info.Substring;
- return new AnchorInfo (this, 0, width, str, info.IgnoreCase);
- }
- return new AnchorInfo (this, info.Offset, width, info.Substring, info.IgnoreCase);
- }
- return new AnchorInfo (this, width);
- }
- private int min, max;
- private bool lazy;
- }
- // assertions
- abstract class Assertion : CompositeExpression {
- public Assertion () {
- Expressions.Add (null); // true expression
- Expressions.Add (null); // false expression
- }
- public Expression TrueExpression {
- get { return Expressions[0]; }
- set { Expressions[0] = value; }
- }
- public Expression FalseExpression {
- get { return Expressions[1]; }
- set { Expressions[1] = value; }
- }
- public override void GetWidth (out int min, out int max) {
- GetWidth (out min, out max, 2);
- if (TrueExpression == null || FalseExpression == null)
- min = 0;
- }
- }
- class CaptureAssertion : Assertion {
- public CaptureAssertion () {
- }
- public CapturingGroup CapturingGroup {
- get { return group; }
- set { group = value; }
- }
- public override void Compile (ICompiler cmp, bool reverse) {
- int gid = group.Number;
- LinkRef tail = cmp.NewLink ();
- if (FalseExpression == null) {
- // IfDefined :1
- // <yes_exp>
- // 1: <tail>
-
- cmp.EmitIfDefined (gid, tail);
- TrueExpression.Compile (cmp, reverse);
- }
- else {
- // IfDefined :1
- // <yes_expr>
- // Jump :2
- // 1: <no_expr>
- // 2: <tail>
-
- LinkRef false_expr = cmp.NewLink ();
- cmp.EmitIfDefined (gid, false_expr);
- TrueExpression.Compile (cmp, reverse);
- cmp.EmitJump (tail);
- cmp.ResolveLink (false_expr);
- FalseExpression.Compile (cmp, reverse);
- }
- cmp.ResolveLink (tail);
- }
- public override bool IsComplex () {
- bool comp = false;
- if (TrueExpression != null)
- comp |= TrueExpression.IsComplex ();
- if (FalseExpression != null)
- comp |= FalseExpression.IsComplex ();
- return comp | GetFixedWidth () <= 0;
- }
- private CapturingGroup group;
- }
- class ExpressionAssertion : Assertion {
- public ExpressionAssertion () {
- Expressions.Add (null); // test expression
- }
- public bool Reverse {
- get { return reverse; }
- set { reverse = value; }
- }
- public bool Negate {
- get { return negate; }
- set { negate = value; }
- }
- public Expression TestExpression {
- get { return Expressions[2]; }
- set { Expressions[2] = value; }
- }
- public override void Compile (ICompiler cmp, bool reverse) {
- LinkRef true_expr = cmp.NewLink ();
- LinkRef false_expr = cmp.NewLink ();
- // test op: positive / negative
- if (!negate)
- cmp.EmitTest (true_expr, false_expr);
- else
- cmp.EmitTest (false_expr, true_expr);
-
- // test expression: lookahead / lookbehind
- TestExpression.Compile (cmp, reverse ^ this.reverse);
- cmp.EmitTrue ();
- // target expressions
- if (TrueExpression == null) { // (?= ...)
- // Test :1, :2
- // <test_expr>
- // :2 False
- // :1 <tail>
-
- cmp.ResolveLink (false_expr);
- cmp.EmitFalse ();
- cmp.ResolveLink (true_expr);
- }
- else {
- cmp.ResolveLink (true_expr);
- TrueExpression.Compile (cmp, reverse);
-
- if (FalseExpression == null) { // (?(...) ...)
- // Test :1, :2
- // <test_expr>
- // :1 <yes_expr>
- // :2 <tail>
- cmp.ResolveLink (false_expr);
- }
- else { // (?(...) ... | ...)
- // Test :1, :2
- // <test_expr>
- // :1 <yes_expr>
- // Jump :3
- // :2 <no_expr>
- // :3 <tail>
-
- LinkRef tail = cmp.NewLink ();
-
- cmp.EmitJump (tail);
- cmp.ResolveLink (false_expr);
- FalseExpression.Compile (cmp, reverse);
- cmp.ResolveLink (tail);
- }
- }
- }
- private bool reverse, negate;
- }
- // alternation
- class Alternation : CompositeExpression {
- public Alternation () {
- }
- public ExpressionCollection Alternatives {
- get { return Expressions; }
- }
- public void AddAlternative (Expression e) {
- Alternatives.Add (e);
- }
- public override void Compile (ICompiler cmp, bool reverse) {
- LinkRef next = cmp.NewLink ();
- LinkRef tail = cmp.NewLink ();
-
- foreach (Expression e in Alternatives) {
- cmp.EmitBranch (next);
- e.Compile (cmp, reverse);
- cmp.EmitJump (tail);
- cmp.ResolveLink (next);
- }
- cmp.EmitFalse ();
- cmp.ResolveLink (tail);
- }
- public override void GetWidth (out int min, out int max) {
- GetWidth (out min, out max, Alternatives.Count);
- }
- public override bool IsComplex () {
- bool comp = false;
- foreach (Expression e in Alternatives) {
- comp |= e.IsComplex ();
- }
- return comp | GetFixedWidth () <= 0;
- }
- }
- // terminal expressions
- class Literal : Expression {
- public Literal (string str, bool ignore) {
- this.str = str;
- this.ignore = ignore;
- }
- public string String {
- get { return str; }
- set { str = value; }
- }
- public bool IgnoreCase {
- get { return ignore; }
- set { ignore = value; }
- }
- public override void Compile (ICompiler cmp, bool reverse) {
- if (str.Length == 0)
- return;
- if (str.Length == 1)
- cmp.EmitCharacter (str[0], false, ignore, reverse);
- else
- cmp.EmitString (str, ignore, reverse);
- }
- public override void GetWidth (out int min, out int max) {
- min = max = str.Length;
- }
- public override AnchorInfo GetAnchorInfo () {
- return new AnchorInfo (this, 0, str.Length, str, ignore);
- }
- public override bool IsComplex () {
- return false;
- }
- private string str;
- private bool ignore;
- }
- class PositionAssertion : Expression {
- public PositionAssertion (Position pos) {
- this.pos = pos;
- }
- public Position Position {
- get { return pos; }
- set { pos = value; }
- }
- public override void Compile (ICompiler cmp, bool reverse) {
- cmp.EmitPosition (pos);
- }
- public override void GetWidth (out int min, out int max) {
- min = max = 0;
- }
- public override bool IsComplex () {
- return false;
- }
- public override AnchorInfo GetAnchorInfo () {
- switch (pos) {
- case Position.StartOfString: case Position.StartOfLine: case Position.StartOfScan:
- return new AnchorInfo (this, 0, 0, pos);
- default:
- return new AnchorInfo (this, 0);
- }
- }
- private Position pos;
- }
- class Reference : Expression {
- public Reference (bool ignore) {
- this.ignore = ignore;
- }
- public CapturingGroup CapturingGroup {
- get { return group; }
- set { group = value; }
- }
- public bool IgnoreCase {
- get { return ignore; }
- set { ignore = value; }
- }
- public override void Compile (ICompiler cmp, bool reverse) {
- cmp.EmitReference (group.Number, ignore, reverse);
- }
- public override void GetWidth (out int min, out int max) {
- //group.GetWidth (out min, out max);
- // TODO set width to referenced group for non-cyclical references
- min = 0;
- max = Int32.MaxValue;
- }
- public override bool IsComplex () {
- return true; // FIXME incorporate cyclic check
- }
- private CapturingGroup group;
- private bool ignore;
- }
- class CharacterClass : Expression {
- public CharacterClass (bool negate, bool ignore) {
- this.negate = negate;
- this.ignore = ignore;
- intervals = new IntervalCollection ();
- // initialize pos/neg category arrays
- Array cat_values = Enum.GetValues (typeof (Category));
- int cat_size = (int)(Category)cat_values.GetValue (cat_values.Length - 1) + 1;
- pos_cats = new bool[cat_size];
- neg_cats = new bool[cat_size];
- for (int i = 0; i < cat_size; ++ i) {
- pos_cats[i] = false;
- neg_cats[i] = false;
- }
- }
- public CharacterClass (Category cat, bool negate) : this (false, false) {
- this.AddCategory (cat, negate);
- }
- public bool Negate {
- get { return negate; }
- set { negate = value; }
- }
- public bool IgnoreCase {
- get { return ignore; }
- set { ignore = value; }
- }
- public void AddCategory (Category cat, bool negate) {
- int n = (int)cat;
-
- if (negate) {
- if (pos_cats[n])
- pos_cats[n] = false;
- neg_cats[n] = true;
- }
- else {
- if (neg_cats[n])
- neg_cats[n] = false;
- pos_cats[n] = true;
- }
- }
- public void AddCharacter (char c) {
- intervals.Add (new Interval (c, c));
- }
- public void AddRange (char lo, char hi) {
- intervals.Add (new Interval (lo, hi));
- }
- public override void Compile (ICompiler cmp, bool reverse) {
- // create the meta-collection
- IntervalCollection meta =
- intervals.GetMetaCollection (new IntervalCollection.CostDelegate (GetIntervalCost));
- // count ops
-
- int count = meta.Count;
- for (int i = 0; i < pos_cats.Length; ++ i) {
- if (pos_cats[i]) ++ count;
- if (neg_cats[i]) ++ count;
- }
- if (count == 0)
- return;
- // emit in op for |meta| > 1
- LinkRef tail = cmp.NewLink ();
- if (count > 1)
- cmp.EmitIn (tail);
- // emit categories
- for (int i = 0; i < pos_cats.Length; ++ i) {
- if (pos_cats[i])
- cmp.EmitCategory ((Category)i, negate, reverse);
- else if (neg_cats[i])
- cmp.EmitCategory ((Category)i, !negate, reverse);
- }
- // emit character/range/sets from meta-collection
- foreach (Interval a in meta) {
- if (a.IsDiscontiguous) { // Set
- BitArray bits = new BitArray (a.Size);
- foreach (Interval b in intervals) {
- if (a.Contains (b)) {
- for (int i = b.low; i <= b.high; ++ i)
- bits[i - a.low] = true;
- }
- }
- cmp.EmitSet ((char)a.low, bits, negate, ignore, reverse);
- }
- else if (a.IsSingleton) // Character
- cmp.EmitCharacter ((char)a.low, negate, ignore, reverse);
- else // Range
- cmp.EmitRange ((char)a.low, (char)a.high, negate, ignore, reverse);
- }
-
- // finish up
- if (count > 1) {
- if (negate)
- cmp.EmitTrue ();
- else
- cmp.EmitFalse ();
- cmp.ResolveLink (tail);
- }
- }
- public override void GetWidth (out int min, out int max) {
- min = max = 1;
- }
- public override bool IsComplex () {
- return false;
- }
- // private
- private static double GetIntervalCost (Interval i) {
- // use op length as cost metric (=> optimize for space)
-
- if (i.IsDiscontiguous)
- return 3 + ((i.Size + 0xf) >> 4); // Set
- else if (i.IsSingleton)
- return 2; // Character
- else
- return 3; // Range
- }
- private bool negate, ignore;
- private bool[] pos_cats, neg_cats;
- private IntervalCollection intervals;
- }
- class AnchorInfo {
- private Expression expr;
- private Position pos;
- private int offset;
- private string str;
- private int width;
- private bool ignore;
- public AnchorInfo (Expression expr, int width) {
- this.expr = expr;
- this.offset = 0;
- this.width = width;
- this.str = null;
- this.ignore = false;
- this.pos = Position.Any;
- }
-
- public AnchorInfo (Expression expr, int offset, int width, string str, bool ignore) {
- this.expr = expr;
- this.offset = offset;
- this.width = width;
- this.str = ignore ? str.ToLower () : str;
- this.ignore = ignore;
- this.pos = Position.Any;
- }
- public AnchorInfo (Expression expr, int offset, int width, Position pos) {
- this.expr = expr;
- this.offset = offset;
- this.width = width;
- this.pos = pos;
- this.str = null;
- this.ignore = false;
- }
- public Expression Expression {
- get { return expr; }
- }
- public int Offset {
- get { return offset; }
- }
- public int Width {
- get { return width; }
- }
- public int Length {
- get { return (str != null) ? str.Length : 0; }
- }
- public bool IsUnknownWidth {
- get { return width < 0; }
- }
- public bool IsComplete {
- get { return Length == Width; }
- }
- public string Substring {
- get { return str; }
- }
- public bool IgnoreCase {
- get { return ignore; }
- }
- public Position Position {
- get { return pos; }
- }
- public bool IsSubstring {
- get { return str != null; }
- }
- public bool IsPosition {
- get { return pos != Position.Any; }
- }
- public Interval GetInterval () {
- return GetInterval (0);
- }
- public Interval GetInterval (int start) {
- if (!IsSubstring)
- return Interval.Empty;
- return new Interval (start + Offset, start + Offset + Length - 1);
- }
- }
- }
|