123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610 |
- {
- $Id$
- This file is part of the Free Pascal run time library.
- Copyright (c) 2000 by Florian Klaempfl
- This unit implements basic regular expression support
- See the file COPYING.FPC, included in this distribution,
- for details about the copyright.
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- **********************************************************************}
- { $define DEBUG}
- {
- TODO:
- - correct backtracking, for example in (...)*
- - | support
- - getting substrings and using substrings with \1 etc.
- - test ^ and $
- - newline handling in DOS?
- - locale-dependent upper/lowercase routines
- - extend the interface
- }
- {$mode objfpc}
- unit regexpr;
- interface
- { the following declarations are only in the interface because }
- { some procedures return pregexprentry, but programs which }
- { use this unit shouldn't access these data structures }
- type
- tcharset = set of char; { set of characters accepted by one matcher node }
- tregexprentrytype = (ret_charset,ret_or,ret_startpattern, { node kinds; ret_or, ret_startpattern and ret_endpattern are not referenced by the code in this unit }
- ret_endpattern,ret_illegalend,ret_backtrace,ret_startline,
- ret_endline);
- pregexprentry = ^tregexprentry;
- tregexprentry = record { one node of the compiled pattern graph }
- next,nextdestroy : pregexprentry; { next: node the matcher moves to after a successful test; nextdestroy: link of the list walked by DestroyRegExprEngine }
- case typ : tregexprentrytype of
- ret_charset : (chars : tcharset; { characters accepted by this node }
- elsepath : pregexprentry); { node the matcher moves to when the test fails }
- ret_or : (alternative : pregexprentry); { not used by the code in this unit }
- end;
- tregexprflag = (ref_singleline,ref_multiline,ref_caseinsensitive);
- tregexprflags = set of tregexprflag; { GenerateRegExprEngine returns an empty engine when both ref_singleline and ref_multiline are set }
- TRegExprEngine = record { compiled expression as returned by GenerateRegExprEngine }
- Data : pregexprentry; { entry node of the pattern graph; nil when nothing usable was compiled }
- DestroyList : pregexprentry; { head of the nextdestroy chain released by DestroyRegExprEngine }
- Flags : TRegExprFlags; { flags passed to GenerateRegExprEngine }
- end;
- const
- cs_allchars : tcharset = [#0..#255];
- cs_wordchars : tcharset = ['A'..'Z','a'..'z','_','0'..'9']; { used for \w }
- cs_newline : tcharset = [#10]; { only LF; excluded from the set generated for '.' }
- cs_digits : tcharset = ['0'..'9']; { used for \d }
- cs_whitespace : tcharset = [' ',#9]; { used for \s; only blank and tab }
- var
- { these are initialized in the init section of the unit }
- cs_nonwordchars : tcharset; { used for \W }
- cs_nondigits : tcharset; { used for \D }
- cs_nonwhitespace : tcharset; { used for \S }
- { the following procedures can be used by units based }
- { on the regexpr unit }
- function GenerateRegExprEngine(regexpr : pchar;flags : tregexprflags) : TRegExprEngine; { compiles the zero-terminated pattern; the result has Data = nil when the pattern is nil or is rejected }
- procedure DestroyRegExprEngine(var regexpr : TRegExprEngine); { disposes every node on DestroyList and sets Data and DestroyList to nil }
- function RegExprPos(regexprengine : TRegExprEngine;p : pchar;var index,len : longint) : boolean; { searches p; on success index is the zero-based offset of the match and len its length; index is -1 when nothing matched, 0 when the engine holds no pattern }
- implementation
- {$ifdef DEBUG}
- { Debug helper: writes every character contained in c, in ascending
-   ordinal order, followed by a line break. }
- procedure writecharset(c : tcharset);
-   var
-     ch : char;
-   begin
-     for ch:=#0 to #255 do
-       if ch in c then
-         write(ch);
-     writeln;
-   end;
- {$endif DEBUG}
- function GenerateRegExprEngine(regexpr : pchar;flags : tregexprflags) : TRegExprEngine;
- var
- first : pregexprentry;
- { Pushes the freshly allocated node p onto the front of the destroy list
-   headed by "first". The head of that list is later stored in
-   TRegExprEngine.DestroyList and walked by DestroyRegExprEngine. }
- procedure doregister(p : pregexprentry);
-   begin
-     p^.nextdestroy:=first;
-     { the head must move to p on every call; when it was only set for the
-       very first node, all later nodes were unreachable from the list and
-       were never disposed }
-     first:=p;
-   end;
- var
- currentpos : pchar;
- error : boolean;
- { Reads one character item at currentpos and returns the set of characters
-   it stands for, advancing currentpos past the item.
-
-   Recognised items:
-     .            every character except those in cs_newline
-     \t \n \r     tab, line feed, carriage return
-     \d \D        cs_digits / cs_nondigits
-     \s \S        cs_whitespace / cs_nonwhitespace
-     \w \W        cs_wordchars / cs_nonwordchars
-     \x           x itself, for any x that is not a letter or a digit; this
-                  is the way to match a metacharacter literally
-     c            the character c (upcased when ref_caseinsensitive is set)
-     c-d          the range c..d (both ends upcased when ref_caseinsensitive
-                  is set)
-
-   Returns the empty set without advancing when currentpos points at the
-   terminating #0. Sets error and returns early on a backslash followed by
-   #0, by a letter or digit that is not listed above, or on a range whose
-   upper bound is missing. }
- function readchars : tcharset;
-   var
-     c1 : char;
-   begin
-     readchars:=[];
-     case currentpos^ of
-       #0:
-         exit;
-       '.':
-         begin
-           inc(currentpos);
-           readchars:=cs_allchars-cs_newline;
-         end;
-       '\':
-         begin
-           inc(currentpos);
-           case currentpos^ of
-             't':
-               readchars:=[#9];
-             'n':
-               readchars:=[#10];
-             'r':
-               readchars:=[#13];
-             'd':
-               readchars:=cs_digits;
-             'D':
-               readchars:=cs_nondigits;
-             's':
-               readchars:=cs_whitespace;
-             'S':
-               readchars:=cs_nonwhitespace;
-             'w':
-               readchars:=cs_wordchars;
-             'W':
-               readchars:=cs_nonwordchars;
-             else
-               begin
-                 { end of pattern and unknown alphanumeric escapes stay
-                   errors; currentpos is left on the offending character }
-                 if currentpos^ in [#0,'0'..'9','A'..'Z','a'..'z'] then
-                   begin
-                     error:=true;
-                     exit;
-                   end;
-                 { any other escaped character is taken literally }
-                 readchars:=[currentpos^];
-               end;
-           end;
-           { skip the character that followed the backslash }
-           inc(currentpos);
-         end;
-       else
-         begin
-           if ref_caseinsensitive in flags then
-             c1:=upcase(currentpos^)
-           else
-             c1:=currentpos^;
-           inc(currentpos);
-           if currentpos^='-' then
-             begin
-               { range c1..upper bound }
-               inc(currentpos);
-               if currentpos^=#0 then
-                 begin
-                   error:=true;
-                   exit;
-                 end;
-               if ref_caseinsensitive in flags then
-                 readchars:=[c1..upcase(currentpos^)]
-               else
-                 readchars:=[c1..currentpos^];
-               inc(currentpos);
-             end
-           else
-             readchars:=[c1];
-         end;
-     end;
-   end;
- { Reads one character class at currentpos and returns it as a set.
-   A bracketed class is the union of the items between the brackets; a '^'
-   inside the brackets complements only the single item that directly
-   follows it. Outside brackets a leading '^' complements the next item,
-   anything else is handed to readchars unchanged.
-   Returns the empty set when currentpos points at #0. Sets error when a
-   bracketed class is not closed or when readchars reported an error. }
- function readcharset : tcharset;
-   var
-     item : tcharset;
-   begin
-     Result:=[];
-     if currentpos^=#0 then
-       exit;
-     if currentpos^='[' then
-       begin
-         inc(currentpos);
-         while currentpos^<>']' do
-           begin
-             if currentpos^='^' then
-               begin
-                 inc(currentpos);
-                 item:=cs_allchars-readchars;
-               end
-             else
-               item:=readchars;
-             Result:=Result+item;
-             { a failed item or the end of the pattern before the closing
-               bracket both abort the class }
-             if error or (currentpos^=#0) then
-               begin
-                 error:=true;
-                 exit;
-               end;
-           end;
-         { skip the closing bracket }
-         inc(currentpos);
-       end
-     else if currentpos^='^' then
-       begin
-         inc(currentpos);
-         Result:=cs_allchars-readchars;
-       end
-     else
-       Result:=readchars;
-   end;
- { Parses a sequence of pattern elements starting at currentpos and builds
-   the matcher nodes for it.
-     next     : node to continue with once the whole sequence has matched
-     elsepath : node to continue with when the sequence does not match
-   Returns the first node generated for the sequence, or nil when no node
-   was generated or error was already set. Parsing stops, without consuming
-   the character, at ')' and at the terminating #0. Sets error on an
-   unclosed group and on '|', which is not supported. }
- function parseregexpr(next,elsepath : pregexprentry) : pregexprentry;
-   var
-     hp,hp2,ep : pregexprentry;
-     cs : tcharset;
-     { address of the link that has to receive the next generated node;
-       nil as long as no node has been generated }
-     chaining : ^pregexprentry;
-   begin
-     chaining:=nil;
-     parseregexpr:=nil;
-     if error then
-       exit;
-     { this dummy allows us to redirect the elsepath later }
-     new(ep);
-     doregister(ep);
-     ep^.typ:=ret_charset;
-     ep^.chars:=[];
-     ep^.elsepath:=elsepath;
-     elsepath:=ep;
-     while true do
-       begin
-         if error then
-           exit;
-         case currentpos^ of
-           '(':
-             begin
-               inc(currentpos);
-               { hp2 is an empty-set node: it never matches and only
-                 forwards to its elsepath, which is where the element after
-                 the group gets linked in }
-               new(hp2);
-               doregister(hp2);
-               hp2^.typ:=ret_charset;
-               hp2^.chars:=[];
-               hp2^.elsepath:=next;
-               hp:=parseregexpr(hp2,ep);
-               if assigned(chaining) then
-                 chaining^:=hp
-               else
-                 parseregexpr:=hp;
-               chaining:=@hp2^.elsepath;
-               if currentpos^<>')' then
-                 begin
-                   error:=true;
-                   exit;
-                 end;
-               inc(currentpos);
-             end;
-           '|':
-             begin
-               { Alternation is not implemented. The former code created a
-                 ret_backtrace node whose next field held a code address
-                 and whose elsepath was never assigned, so an expression
-                 like (a|) compiled and then followed invalid pointers while
-                 matching. Reject the expression instead. }
-               error:=true;
-               exit;
-             end;
-           ')':
-             exit;
-           '^':
-             begin
-               inc(currentpos);
-               { NOTE: the node is allocated and registered, but it is not
-                 linked into the chain and its next field is not set, so
-                 the anchor currently has no effect on matching }
-               new(hp);
-               doregister(hp);
-               hp^.typ:=ret_startline;
-               hp^.elsepath:=ep;
-             end;
-           '$':
-             begin
-               inc(currentpos);
-               { NOTE: same as for the start-of-line anchor, the node is
-                 registered but never linked into the chain }
-               new(hp);
-               doregister(hp);
-               hp^.typ:=ret_endline;
-               hp^.elsepath:=ep;
-             end;
-           #0:
-             exit;
-           else
-             begin
-               cs:=readcharset;
-               if error then
-                 exit;
-               case currentpos^ of
-                 '*':
-                   begin
-                     inc(currentpos);
-                     { one node looping onto itself while it matches; on a
-                       mismatch continue with whatever follows }
-                     new(hp);
-                     doregister(hp);
-                     hp^.typ:=ret_charset;
-                     hp^.chars:=cs;
-                     hp^.elsepath:=next;
-                     hp^.next:=hp;
-                     if assigned(chaining) then
-                       chaining^:=hp
-                     else
-                       parseregexpr:=hp;
-                     chaining:=@hp^.elsepath;
-                   end;
-                 '+':
-                   begin
-                     inc(currentpos);
-                     { hp is the mandatory first occurrence, hp2 the
-                       self-looping node for all further occurrences }
-                     new(hp);
-                     new(hp2);
-                     doregister(hp);
-                     doregister(hp2);
-                     hp^.typ:=ret_charset;
-                     hp2^.typ:=ret_charset;
-                     hp^.chars:=cs;
-                     hp2^.chars:=cs;
-                     hp^.elsepath:=elsepath;
-                     hp^.next:=hp2;
-                     hp2^.elsepath:=next;
-                     hp2^.next:=hp2;
-                     if assigned(chaining) then
-                       chaining^:=hp
-                     else
-                       parseregexpr:=hp;
-                     chaining:=@hp2^.elsepath;
-                   end;
-                 '?':
-                   begin
-                     inc(currentpos);
-                     new(hp);
-                     { this is a dummy }
-                     new(hp2);
-                     doregister(hp);
-                     doregister(hp2);
-                     { match or mismatch, hp always continues with the
-                       dummy, which forwards to the following element }
-                     hp^.typ:=ret_charset;
-                     hp^.chars:=cs;
-                     hp^.next:=hp2;
-                     hp^.elsepath:=hp2;
-                     hp2^.typ:=ret_charset;
-                     hp2^.chars:=[];
-                     hp2^.elsepath:=next;
-                     if assigned(chaining) then
-                       chaining^:=hp
-                     else
-                       parseregexpr:=hp;
-                     chaining:=@hp2^.elsepath;
-                   end;
-                 else
-                   begin
-                     { plain item: exactly one occurrence is required }
-                     new(hp);
-                     doregister(hp);
-                     hp^.typ:=ret_charset;
-                     hp^.chars:=cs;
-                     hp^.elsepath:=elsepath;
-                     hp^.next:=next;
-                     if assigned(chaining) then
-                       chaining^:=hp
-                     else
-                       parseregexpr:=hp;
-                     chaining:=@hp^.next;
-                   end;
-               end;
-             end;
-         end;
-       end;
-   end;
- var
-   endp : pregexprentry;
- { Main body of GenerateRegExprEngine: compiles the zero-terminated pattern
-   regexpr. The result has Data = nil and DestroyList = nil when regexpr is
-   nil, when both ref_singleline and ref_multiline are requested, or when
-   the pattern could not be parsed completely. }
- begin
-   GenerateRegExprEngine.Data:=nil;
-   GenerateRegExprEngine.DestroyList:=nil;
-   { set the flags up front so that they are defined on every exit path }
-   GenerateRegExprEngine.Flags:=flags;
-   first:=nil;
-   { error is a local variable and therefore not initialised implicitly;
-     without this assignment the parser could abort at random }
-   error:=false;
-   if regexpr=nil then
-     exit;
-   if (ref_singleline in flags) and (ref_multiline in flags) then
-     exit;
-   currentpos:=regexpr;
-   { endp terminates every else path: reaching it means "no match" }
-   new(endp);
-   doregister(endp);
-   endp^.typ:=ret_illegalend;
-   GenerateRegExprEngine.Data:=parseregexpr(nil,endp);
-   GenerateRegExprEngine.DestroyList:=first;
-   { a parse error or unconsumed pattern text invalidates the engine }
-   if error or (currentpos^<>#0) then
-     DestroyRegExprEngine(Result);
- end;
- { Releases every node reachable through regexpr.DestroyList and leaves the
-   engine empty (Data and DestroyList both nil). }
- procedure DestroyRegExprEngine(var regexpr : TRegExprEngine);
-   var
-     node,following : pregexprentry;
-   begin
-     node:=regexpr.DestroyList;
-     while node<>nil do
-       begin
-         { fetch the successor before the node is given back }
-         following:=node^.nextdestroy;
-         dispose(node);
-         node:=following;
-       end;
-     regexpr.Data:=nil;
-     regexpr.DestroyList:=nil;
-   end;
- { Searches the zero-terminated string p for the first offset at which the
-   compiled expression matches.
-     regexprengine : engine created by GenerateRegExprEngine
-     p             : subject string
-     index         : receives the zero-based offset of the match
-     len           : receives the length of the match
-   Returns true when a match was found. When nothing matches the result is
-   false, index is -1 and len is 0. When the engine holds no pattern
-   (Data = nil) or p is nil the result is false with index = 0 and len = 0.
-   The search loop only tries offsets that hold a character, so an empty
-   subject never matches. }
- function RegExprPos(regexprengine : TRegExprEngine;p : pchar;var index,len : longint) : boolean;
-   var
-     lastpos : pchar;
-     { start of the whole subject; p itself is advanced by the search loop }
-     subject : pchar;
-   { Walks the node graph starting at regexpr against the text at pos.
-     Returns true when the end of the graph (a nil node) is reached; lastpos
-     then points behind the last consumed character. }
-   function dosearch(regexpr : pregexprentry;pos : pchar) : boolean;
-     begin
-       dosearch:=false;
-       while true do
-         begin
- {$IFDEF Debug}
-           writeln(byte(regexpr^.typ));
- {$ENDIF Debug}
-           case regexpr^.typ of
-             ret_endline:
-               begin
-                 { pos already points at the first unconsumed character, so
-                   that character (not the one after it) decides whether
-                   the line ends here }
-                 if (pos^=#0) or
-                    ((ref_multiline in regexprengine.flags) and (pos^=#10)) then
-                   regexpr:=regexpr^.next
-                 else
-                   regexpr:=regexpr^.elsepath;
-               end;
-             ret_startline:
-               begin
-                 { compare with the start of the subject, not with the
-                   offset currently tried by the caller }
-                 if (pos=subject) or
-                    ((ref_multiline in regexprengine.flags) and ((pos-1)^=#10)) then
-                   regexpr:=regexpr^.next
-                 else
-                   regexpr:=regexpr^.elsepath;
-               end;
-             ret_charset:
-               begin
-                 { the terminating #0 is never consumed, even by sets that
-                   contain it; at the end of the subject the node takes its
-                   else path, so a trailing optional or repeated item can
-                   still complete the match }
-                 if (pos^<>#0) and
-                    ((pos^ in regexpr^.chars) or
-                     ((ref_caseinsensitive in regexprengine.flags) and
-                      (upcase(pos^) in regexpr^.chars))) then
-                   begin
- {$ifdef DEBUG}
-                     writeln('Found matching: ',pos^);
- {$endif DEBUG}
-                     regexpr:=regexpr^.next;
-                     inc(pos);
-                   end
-                 else
-                   begin
- {$ifdef DEBUG}
-                     writeln('Found unmatching: ',pos^);
- {$endif DEBUG}
-                     regexpr:=regexpr^.elsepath;
-                   end;
-               end;
-             ret_backtrace:
-               begin
- {$ifdef DEBUG}
-                 writeln('Starting backtrace');
- {$endif DEBUG}
-                 if dosearch(regexpr^.next,pos) then
-                   begin
-                     dosearch:=true;
-                     exit;
-                   end
-                 else if dosearch(regexpr^.elsepath,pos) then
-                   begin
-                     dosearch:=true;
-                     exit;
-                   end
-                 else
-                   exit;
-               end;
-           end;
-           lastpos:=pos;
-           { running off the end of the graph means the pattern matched }
-           if regexpr=nil then
-             begin
-               dosearch:=true;
-               exit;
-             end;
-           { the end marker of the else paths means the pattern failed }
-           if regexpr^.typ=ret_illegalend then
-             exit;
-         end;
-     end;
-   begin
-     RegExprPos:=false;
-     index:=0;
-     len:=0;
-     if (regexprengine.Data=nil) or (p=nil) then
-       exit;
-     subject:=p;
-     { try every offset of the subject in turn }
-     while p^<>#0 do
-       begin
-         if dosearch(regexprengine.Data,p) then
-           begin
-             len:=lastpos-p;
-             RegExprPos:=true;
-             exit;
-           end
-         else
-           begin
-             inc(p);
-             inc(index);
-           end;
-       end;
-     index:=-1;
-   end;
- { unit initialization: derive the complement sets used for \S, \D and \W }
- begin
-   cs_nonwhitespace:=cs_allchars-cs_whitespace;
-   cs_nondigits:=cs_allchars-cs_digits;
-   cs_nonwordchars:=cs_allchars-cs_wordchars;
- end.
- {
- $Log$
- Revision 1.1 2002-01-29 17:54:56 peter
- * splitted to base and extra
- Revision 1.4 2002/01/22 13:13:14 pierre
- + add mode objfpc to avoid problems if compiling from IDE dir
- Revision 1.3 2000/07/30 14:58:04 sg
- * Added modifications by Markus Kaemmerer:
- - Unit now compiles with Delphi
- - Removed debug output when not compiled with -dDEBUG
- Revision 1.2 2000/07/13 11:33:31 michael
- + removed logs
-
- }
|