Browse Source

* Initial revision

florian 25 years ago
parent
commit
b87d671416
2 changed files with 538 additions and 0 deletions
  1. 469 0
      packages/regexpr/regexpr.pp
  2. 69 0
      packages/regexpr/test1.pp

+ 469 - 0
packages/regexpr/regexpr.pp

@@ -0,0 +1,469 @@
+{
+    $Id$
+    This file is part of the Free Pascal run time library.
+    Copyright (c) 1999-2000 by the Free Pascal development team
+
+    This unit implements basic regular expression support
+
+    See the file COPYING.FPC, included in this distribution,
+    for details about the copyright.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+
+ **********************************************************************}
+{ $define DEBUG}
+unit regexpr;
+
+  interface
+
+    { the following declarions are only in the interface because }
+    { some procedures return pregexprentry but programs which   }
+    { use this unit shouldn't access this data structures        }
+    type
+       tcharset = set of char;
+       tregexprentrytype = (ret_charset,ret_or,ret_startpattern,
+          ret_illegalend,ret_backtrace);
+
+       pregexprentry = ^tregexprentry;
+       tregexprentry = record
+          next,nextdestroy : pregexprentry;
+          case typ : tregexprentrytype of
+             ret_charset : (chars : tcharset;
+                            elsepath : pregexprentry);
+             ret_or : (alternative : pregexprentry);
+       end;
+
+       TRegExprEngine = record
+          Data : pregexprentry;
+          DestroyList : pregexprentry;
+       end;
+
+     const
+        cs_allchars : tcharset = [#0..#255];
+        cs_wordchars : tcharset = ['A'..'Z','a'..'z','_','0'..'9'];
+        cs_newline : tcharset = [#10];
+        cs_digits : tcharset = ['0'..'9'];
+        cs_whitespace : tcharset = [' ',#9];
+
+     var
+        { these are initilized in the init section of the unit }
+        cs_nonwordchars : tcharset;
+        cs_nondigits : tcharset;
+        cs_nonwhitespace : tcharset;
+
+     { the following procedures can be used by units basing }
+     { on the regexpr unit                                  }
+     function GenerateRegExprEngine(regexpr : pchar) : TRegExprEngine;
+
+     procedure DestroyRegExprEngine(var regexpr : TRegExprEngine);
+
+     function RegExprPos(regexprengine : TRegExprEngine;p : pchar;var index,len : longint) : boolean;
+
+  implementation
+
+     uses
+        strings;
+
+{$ifdef DEBUG}
+     procedure writecharset(c : tcharset);
+
+       var
+          b : byte;
+
+       begin
+          for b:=0 to 255 do
+            if chr(b) in c then
+              write(chr(b));
+          writeln;
+       end;
+{$endif DEBUG}
+
+     function GenerateRegExprEngine(regexpr : pchar) : TRegExprEngine;
+
+       var
+          first : pregexprentry;
+
+       procedure doregister(p : pregexprentry);
+
+         begin
+            p^.nextdestroy:=first;
+            if not(assigned(first)) then
+              first:=p;
+         end;
+
+       var
+          currentpos : pchar;
+          error : boolean;
+
+       function readchars : tcharset;
+
+         var
+            c1 : char;
+
+         begin
+            readchars:=[];
+            case currentpos^ of
+               #0:
+                  exit;
+               '.':
+                  begin
+                     inc(currentpos);
+                     readchars:=cs_allchars-cs_newline;
+                  end;
+               '\':
+                  begin
+                     inc(currentpos);
+                     case currentpos^ of
+                        #0:
+                          begin
+                             error:=true;
+                             exit;
+                          end;
+                        't':
+                           begin
+                              inc(currentpos);
+                              readchars:=[#9];
+                           end;
+                        'n':
+                           begin
+                              inc(currentpos);
+                              readchars:=[#10];
+                           end;
+                        'r':
+                           begin
+                              inc(currentpos);
+                              readchars:=[#13];
+                           end;
+                        'd':
+                          begin
+                             inc(currentpos);
+                             readchars:=cs_digits;
+                          end;
+                        'D':
+                          begin
+                             inc(currentpos);
+                             readchars:=cs_nondigits;
+                          end;
+                        's':
+                          begin
+                             inc(currentpos);
+                             readchars:=cs_whitespace;
+                          end;
+                        'S':
+                          begin
+                             inc(currentpos);
+                             readchars:=cs_nonwhitespace;
+                          end;
+                        'w':
+                           begin
+                              inc(currentpos);
+                              readchars:=cs_wordchars;
+                           end;
+                        'W':
+                           begin
+                              inc(currentpos);
+                              readchars:=cs_nonwordchars;
+                           end;
+                         else
+                           begin
+                              error:=true;
+                              exit;
+                           end;
+                     end;
+                  end;
+               else
+                 begin
+                    c1:=currentpos^;
+                    inc(currentpos);
+                    if currentpos^='-' then
+                      begin
+                         inc(currentpos);
+                         if currentpos^=#0 then
+                           begin
+                              error:=true;
+                              exit;
+                           end;
+                         readchars:=[c1..currentpos^];
+                         inc(currentpos);
+                      end
+                    else
+                      readchars:=[c1];
+                 end;
+            end;
+         end;
+
+
+       function readcharset : tcharset;
+
+         var
+            c1,c2 : char;
+
+         begin
+            readcharset:=[];
+            case currentpos^ of
+               #0:
+                  exit;
+               '[':
+                  begin
+                     inc(currentpos);
+                     while currentpos^<>']' do
+                       begin
+                          if currentpos^='^' then
+                            begin
+                               inc(currentpos);
+                               readcharset:=readcharset+(cs_allchars-readchars);
+                            end
+                          else
+                            readcharset:=readcharset+readchars;
+                          if error or (currentpos^=#0) then
+                            begin
+                               error:=true;
+                               exit;
+                            end;
+                       end;
+                     inc(currentpos);
+                  end;
+               '^':
+                  begin
+                     inc(currentpos);
+                     readcharset:=cs_allchars-readchars;
+                  end;
+               else
+                  readcharset:=readchars;
+            end;
+         end;
+
+       function parseregexpr(elsepath : pregexprentry) : pregexprentry;
+
+         var
+            hp,hp2,ep : pregexprentry;
+            cs : tcharset;
+
+         begin
+            parseregexpr:=nil;
+            if error then
+              exit;
+            { this dummy allows us to redirect the elsepath later }
+            new(ep);
+            doregister(ep);
+            ep^.typ:=ret_charset;
+            ep^.chars:=[];
+            ep^.elsepath:=elsepath;
+            while true do
+              begin
+                 if error then
+                   exit;
+                 case currentpos^ of
+                    '(':
+                       begin
+                          inc(currentpos);
+                          parseregexpr:=parseregexpr(ep);
+                          if currentpos^<>')' then
+                            begin
+                               error:=true;
+                               exit;
+                            end;
+                          inc(currentpos);
+                       end;
+                    '|':
+                       begin
+                          inc(currentpos);
+                          if currentpos^=#0 then
+                            begin
+                               error:=true;
+                               exit;
+                            end;
+                          ep^.typ:=ret_backtrace;
+                          ep^.elsepath:=parseregexpr(elsepath);
+                          ep^.next:=parseregexpr;
+                          parseregexpr:=ep;
+                       end;
+                    ')':
+                       exit;
+                    #0:
+
+                       exit;
+                    else
+                      begin
+                         cs:=readcharset;
+                         if error then
+                           exit;
+                         case currentpos^ of
+                            '*':
+                               begin
+                                  inc(currentpos);
+                                  new(hp);
+                                  doregister(hp);
+                                  hp^.typ:=ret_charset;
+                                  hp^.chars:=cs;
+                                  hp^.elsepath:=parseregexpr(ep);
+                                  hp^.next:=hp;
+                               end;
+                            '+':
+                               begin
+                                  inc(currentpos);
+                                  new(hp);
+                                  new(hp2);
+                                  doregister(hp);
+                                  doregister(hp2);
+                                  hp^.typ:=ret_charset;
+                                  hp2^.typ:=ret_charset;
+                                  hp^.chars:=cs;
+                                  hp^.elsepath:=ep;
+                                  hp^.next:=hp2;
+                                  hp2^.chars:=cs;
+                                  hp2^.elsepath:=parseregexpr(ep);
+                                  hp2^.next:=hp2;
+                               end;
+                            '?':
+                               begin
+                                  inc(currentpos);
+                                  new(hp);
+                                  doregister(hp);
+                                  hp^.typ:=ret_charset;
+                                  hp^.chars:=cs;
+                                  hp^.elsepath:=parseregexpr(ep);
+                                  hp^.next:=hp^.elsepath;
+                               end;
+                            else
+                               begin
+                                  new(hp);
+                                  doregister(hp);
+                                  hp^.typ:=ret_charset;
+                                  hp^.chars:=cs;
+                                  hp^.elsepath:=ep;
+                                  hp^.next:=parseregexpr(ep);
+                               end;
+                         end;
+                         parseregexpr:=hp;
+                      end;
+                 end;
+              end;
+         end;
+
+       var
+          endp : pregexprentry;
+
+       begin
+          GenerateRegExprEngine.Data:=nil;
+          GenerateRegExprEngine.DestroyList:=nil;
+          if regexpr=nil then
+            exit;
+          first:=nil;
+          currentpos:=regexpr;
+          new(endp);
+          doregister(endp);
+          endp^.typ:=ret_illegalend;
+          GenerateRegExprEngine.Data:=parseregexpr(endp);
+          GenerateRegExprEngine.DestroyList:=first;
+          if error then
+            DestroyRegExprEngine(GenerateRegExprEngine);
+       end;
+
+     procedure DestroyRegExprEngine(var regexpr : TRegExprEngine);
+
+       var
+          hp : pregexprentry;
+
+       begin
+          hp:=regexpr.DestroyList;
+          while assigned(hp) do
+            begin
+               regexpr.DestroyList:=hp^.nextdestroy;
+               dispose(hp);
+               hp:=regexpr.DestroyList;
+            end;
+          regexpr.Data:=nil;
+          regexpr.DestroyList:=nil;
+       end;
+
+     function RegExprPos(regexprengine : TRegExprEngine;p : pchar;var index,len : longint) : boolean;
+
+       var
+          lastpos,startpos : pchar;
+
+       function dosearch(regexpr : pregexprentry;pos : pchar) : boolean;
+
+         begin
+            dosearch:=false;
+            while true do
+              begin
+                 if regexpr^.typ=ret_backtrace then
+                   begin
+                      if dosearch(regexpr^.next,pos) then
+                        begin
+                           dosearch:=true;
+                           exit;
+                        end
+                      else if dosearch(regexpr^.elsepath,pos) then
+                        begin
+                           dosearch:=true;
+                           exit;
+                        end
+                      else
+                        exit;
+                   end;
+                 if pos^ in regexpr^.chars then
+                   begin
+{$ifdef DEBUG}
+                      writeln('Found matching: ',pos^);
+{$endif DEBUG}
+                      regexpr:=regexpr^.next;
+                      inc(pos);
+                   end
+                 else
+                   begin
+{$ifdef DEBUG}
+                      writeln('Found unmatching: ',pos^);
+{$endif DEBUG}
+                      regexpr:=regexpr^.elsepath;
+                   end;
+                 lastpos:=pos;
+                 if regexpr=nil then
+                   begin
+                      dosearch:=true;
+                      exit;
+                   end;
+                 if regexpr^.typ=ret_illegalend then
+                   exit;
+                 if pos^=#0 then
+                   exit;
+              end;
+         end;
+
+       begin
+          RegExprPos:=false;
+          index:=0;
+          len:=0;
+          if regexprengine.Data=nil then
+            exit;
+          while p^<>#0 do
+            begin
+               if dosearch(regexprengine.Data,p) then
+                 begin
+                    len:=lastpos-p;
+                    RegExprPos:=true;
+                    exit;
+                 end
+               else
+                 begin
+                    inc(p);
+                    inc(index);
+                 end;
+            end;
+          index:=-1;
+       end;
+
+begin
+   cs_nonwordchars:=cs_allchars-cs_wordchars;
+   cs_nondigits:=cs_allchars-cs_digits;
+   cs_nonwhitespace:=cs_allchars-cs_whitespace;
+end.
+{
+  $Log$
+  Revision 1.1  2000-03-14 22:09:03  florian
+    * Initial revision
+
+}

+ 69 - 0
packages/regexpr/test1.pp

@@ -0,0 +1,69 @@
+uses
+   dpmiexcp,regexpr;
+
+var
+   r : tregexprengine;
+   b : array[0..100] of char;
+   index,len : longint;
+
+procedure do_error(i : longint);
+
+  begin
+     writeln('error near ',i,' index: ',index,' len: ',len);
+     halt(1);
+  end;
+
+begin
+   writeln('*** Testing unit regexpr ***');
+
+   r:=GenerateRegExprEngine('[A-Z]');
+   if not(RegExprPos(r,'234578923457823659A38',index,len)) or
+     (index<>18) or (len<>1) then
+     do_error(1000);
+   DestroyregExprEngine(r);
+
+   r:=GenerateRegExprEngine('[A-Z]*');
+   if not(RegExprPos(r,'234578923457823659ARTZU38',index,len)) or
+   { is this result correct ??? }
+     (index<>0) or (len<>0) then
+     do_error(1002);
+   DestroyregExprEngine(r);
+
+   r:=GenerateRegExprEngine('[A-Z]+');
+   if not(RegExprPos(r,'234578923457823659ARTZU38',index,len)) or
+     (index<>18) or (len<>5) then
+     do_error(1003);
+   DestroyregExprEngine(r);
+
+   r:=GenerateRegExprEngine('[A-Z][A-Z]*');
+   if not(RegExprPos(r,'234578923457823659ARTZU38',index,len)) or
+     (index<>18) or (len<>5) then
+     do_error(1004);
+   DestroyregExprEngine(r);
+
+   r:=GenerateRegExprEngine('[A-Z][A-Z]?');
+   if not(RegExprPos(r,'234578923457823659ARTZU38',index,len)) or
+     (index<>18) or (len<>2) then
+     do_error(1005);
+   DestroyregExprEngine(r);
+
+   r:=GenerateRegExprEngine('^\d+');
+   if not(RegExprPos(r,'234578923457823659ARTZU38',index,len)) or
+     (index<>18) or (len<>5) then
+     do_error(1006);
+   DestroyregExprEngine(r);
+
+   r:=GenerateRegExprEngine('(nofoo|foo)1234');
+   if not(RegExprPos(r,'1234   foo1234XXXX',index,len)) or
+     (index<>8) or (len<>7) then
+     do_error(1007);
+   DestroyregExprEngine(r);
+
+   r:=GenerateRegExprEngine('(nofoo|foo)1234');
+   if not(RegExprPos(r,'1234   nofoo1234XXXX',index,len)) or
+     (index<>8) or (len<>9) then
+     do_error(1008);
+   DestroyregExprEngine(r);
+
+   writeln('*** Testing unit regexpr was successful ***');
+end.