Browse Source

* some improvements

florian 25 years ago
parent
commit
304e71ba5c
2 changed files with 237 additions and 63 deletions
  1. 170 50
      packages/regexpr/regexpr.pp
  2. 67 13
      packages/regexpr/test1.pp

+ 170 - 50
packages/regexpr/regexpr.pp

@@ -1,7 +1,7 @@
 {
     $Id$
     This file is part of the Free Pascal run time library.
-    Copyright (c) 1999-2000 by the Free Pascal development team
+    Copyright (c) 2000 by Florian Klaempfl
 
     This unit implements basic regular expression support
 
@@ -14,6 +14,16 @@
 
  **********************************************************************}
 { $define DEBUG}
+{
+  TODO:
+     - correct backtracking, for example in (...)*
+     - | support
+     - getting substrings and using substrings with \1 etc.
+     - test ^  and $
+     - newline handling in DOS?
+     - locals dependend upper/lowercase routines
+     - extend the interface
+}
 unit regexpr;
 
   interface
@@ -24,7 +34,8 @@ unit regexpr;
     type
        tcharset = set of char;
        tregexprentrytype = (ret_charset,ret_or,ret_startpattern,
-          ret_illegalend,ret_backtrace);
+          ret_endpattern,ret_illegalend,ret_backtrace,ret_startline,
+          ret_endline);
 
        pregexprentry = ^tregexprentry;
        tregexprentry = record
@@ -246,13 +257,15 @@ unit regexpr;
             end;
          end;
 
-       function parseregexpr(elsepath : pregexprentry) : pregexprentry;
+       function parseregexpr(next,elsepath : pregexprentry) : pregexprentry;
 
          var
             hp,hp2,ep : pregexprentry;
             cs : tcharset;
+            chaining : ^pregexprentry;
 
          begin
+            chaining:=nil;
             parseregexpr:=nil;
             if error then
               exit;
@@ -262,6 +275,7 @@ unit regexpr;
             ep^.typ:=ret_charset;
             ep^.chars:=[];
             ep^.elsepath:=elsepath;
+            elsepath:=ep;
             while true do
               begin
                  if error then
@@ -270,7 +284,17 @@ unit regexpr;
                     '(':
                        begin
                           inc(currentpos);
-                          parseregexpr:=parseregexpr(ep);
+                          new(hp2);
+                          doregister(hp2);
+                          hp2^.typ:=ret_charset;
+                          hp2^.chars:=[];
+                          hp2^.elsepath:=next;
+                          hp:=parseregexpr(hp2,ep);
+                          if assigned(chaining) then
+                            chaining^:=hp
+                          else
+                            parseregexpr:=hp;
+                          chaining:=@hp2^.elsepath;
                           if currentpos^<>')' then
                             begin
                                error:=true;
@@ -280,21 +304,44 @@ unit regexpr;
                        end;
                     '|':
                        begin
+{$ifdef DEBUG}
+                          writeln('Creating backtrace entry');
+{$endif DEBUG}
                           inc(currentpos);
                           if currentpos^=#0 then
                             begin
                                error:=true;
                                exit;
                             end;
-                          ep^.typ:=ret_backtrace;
-                          ep^.elsepath:=parseregexpr(elsepath);
-                          ep^.next:=parseregexpr;
-                          parseregexpr:=ep;
+                          new(hp);
+                          doregister(hp);
+                          hp^.typ:=ret_backtrace;
+                          // hp^.elsepath:=parseregexpr(elsepath);
+                          hp^.next:=parseregexpr;
+                          parseregexpr:=hp;
+                          exit;
                        end;
                     ')':
                        exit;
+                    '^':
+                       begin
+                          inc(currentpos);
+                          new(hp);
+                          doregister(hp);
+                          hp^.typ:=ret_startline;
+                          hp^.elsepath:=ep;
+                          // hp^.next:=parseregexpr(ep);
+                       end;
+                    '$':
+                       begin
+                          inc(currentpos);
+                          new(hp);
+                          doregister(hp);
+                          hp^.typ:=ret_endline;
+                          hp^.elsepath:=ep;
+                          // hp^.next:=parseregexpr(ep);
+                       end;
                     #0:
-
                        exit;
                     else
                       begin
@@ -309,8 +356,13 @@ unit regexpr;
                                   doregister(hp);
                                   hp^.typ:=ret_charset;
                                   hp^.chars:=cs;
-                                  hp^.elsepath:=parseregexpr(ep);
+                                  hp^.elsepath:=next;
                                   hp^.next:=hp;
+                                  if assigned(chaining) then
+                                    chaining^:=hp
+                                  else
+                                    parseregexpr:=hp;
+                                  chaining:=@hp^.elsepath;
                                end;
                             '+':
                                begin
@@ -322,21 +374,37 @@ unit regexpr;
                                   hp^.typ:=ret_charset;
                                   hp2^.typ:=ret_charset;
                                   hp^.chars:=cs;
-                                  hp^.elsepath:=ep;
-                                  hp^.next:=hp2;
                                   hp2^.chars:=cs;
-                                  hp2^.elsepath:=parseregexpr(ep);
+                                  hp^.elsepath:=elsepath;
+                                  hp^.next:=hp2;
+                                  hp2^.elsepath:=next;
                                   hp2^.next:=hp2;
+                                  if assigned(chaining) then
+                                    chaining^:=hp
+                                  else
+                                    parseregexpr:=hp;
+                                  chaining:=@hp2^.elsepath;
                                end;
                             '?':
                                begin
                                   inc(currentpos);
                                   new(hp);
+                                  { this is a dummy }
+                                  new(hp2);
                                   doregister(hp);
+                                  doregister(hp2);
                                   hp^.typ:=ret_charset;
                                   hp^.chars:=cs;
-                                  hp^.elsepath:=parseregexpr(ep);
-                                  hp^.next:=hp^.elsepath;
+                                  hp^.next:=hp2;
+                                  hp^.elsepath:=hp2;
+                                  hp2^.typ:=ret_charset;
+                                  hp2^.chars:=[];
+                                  hp2^.elsepath:=next;
+                                  if assigned(chaining) then
+                                    chaining^:=hp
+                                  else
+                                    parseregexpr:=hp;
+                                  chaining:=@hp2^.elsepath;
                                end;
                             else
                                begin
@@ -344,11 +412,15 @@ unit regexpr;
                                   doregister(hp);
                                   hp^.typ:=ret_charset;
                                   hp^.chars:=cs;
-                                  hp^.elsepath:=ep;
-                                  hp^.next:=parseregexpr(ep);
+                                  hp^.elsepath:=elsepath;
+                                  hp^.next:=next;
+                                  if assigned(chaining) then
+                                    chaining^:=hp
+                                  else
+                                    parseregexpr:=hp;
+                                  chaining:=@hp^.next;
                                end;
                          end;
-                         parseregexpr:=hp;
                       end;
                  end;
               end;
@@ -363,14 +435,16 @@ unit regexpr;
           if regexpr=nil then
             exit;
           first:=nil;
+          if (ref_singleline in flags) and (ref_multiline in flags) then
+            exit;
           currentpos:=regexpr;
           new(endp);
           doregister(endp);
           endp^.typ:=ret_illegalend;
           GenerateRegExprEngine.flags:=flags;
-          GenerateRegExprEngine.Data:=parseregexpr(endp);
+          GenerateRegExprEngine.Data:=parseregexpr(nil,endp);
           GenerateRegExprEngine.DestroyList:=first;
-          if error then
+          if error or (currentpos^<>#0) then
             DestroyRegExprEngine(GenerateRegExprEngine);
        end;
 
@@ -402,38 +476,81 @@ unit regexpr;
             dosearch:=false;
             while true do
               begin
-                 if regexpr^.typ=ret_backtrace then
-                   begin
-                      if dosearch(regexpr^.next,pos) then
-                        begin
-                           dosearch:=true;
-                           exit;
-                        end
-                      else if dosearch(regexpr^.elsepath,pos) then
-                        begin
-                           dosearch:=true;
-                           exit;
-                        end
-                      else
-                        exit;
-                   end;
-                 if (pos^ in regexpr^.chars) or
-                   ((ref_caseinsensitive in regexprengine.flags) and
-                    (upcase(pos^) in regexpr^.chars)) then
-                   begin
+                 writeln(byte(regexpr^.typ));
+                 case regexpr^.typ of
+                    ret_endline:
+                      begin
+                         if ref_multiline in regexprengine.flags then
+                           begin
+                              if ((pos+1)^ in [#10,#0]) then
+                                regexpr:=regexpr^.next
+                              else
+                                regexpr:=regexpr^.elsepath;
+                           end
+                         else
+                           begin
+                              if (pos+1)^=#0 then
+                                regexpr:=regexpr^.next
+                              else
+                                regexpr:=regexpr^.elsepath;
+                           end;
+                      end;
+                    ret_startline:
+                      begin
+                         if ref_multiline in regexprengine.flags then
+                           begin
+                              if (pos=p) or ((pos-1)^=#10) then
+                                regexpr:=regexpr^.next
+                              else
+                                regexpr:=regexpr^.elsepath;
+                           end
+                         else
+                           begin
+                              if pos=p then
+                                regexpr:=regexpr^.next
+                              else
+                                regexpr:=regexpr^.elsepath;
+                           end;
+                      end;
+                    ret_charset:
+                      begin
+                         if (pos^ in regexpr^.chars) or
+                           ((ref_caseinsensitive in regexprengine.flags) and
+                            (upcase(pos^) in regexpr^.chars)) then
+                           begin
 {$ifdef DEBUG}
-                      writeln('Found matching: ',pos^);
+                              writeln('Found matching: ',pos^);
 {$endif DEBUG}
-                      regexpr:=regexpr^.next;
-                      inc(pos);
-                   end
-                 else
-                   begin
+                              regexpr:=regexpr^.next;
+                              inc(pos);
+                           end
+                         else
+                           begin
 {$ifdef DEBUG}
-                      writeln('Found unmatching: ',pos^);
+                              writeln('Found unmatching: ',pos^);
 {$endif DEBUG}
-                      regexpr:=regexpr^.elsepath;
-                   end;
+                              regexpr:=regexpr^.elsepath;
+                           end;
+                      end;
+                    ret_backtrace:
+                      begin
+{$ifdef DEBUG}
+                         writeln('Starting backtrace');
+{$endif DEBUG}
+                         if dosearch(regexpr^.next,pos) then
+                           begin
+                              dosearch:=true;
+                              exit;
+                           end
+                         else if dosearch(regexpr^.elsepath,pos) then
+                           begin
+                              dosearch:=true;
+                              exit;
+                           end
+                         else
+                           exit;
+                      end;
+                 end;
                  lastpos:=pos;
                  if regexpr=nil then
                    begin
@@ -477,10 +594,13 @@ begin
 end.
 {
   $Log$
-  Revision 1.2  2000-03-14 22:57:51  florian
+  Revision 1.3  2000-03-19 16:20:44  florian
+    * some improvements
+
+  Revision 1.2  2000/03/14 22:57:51  florian
     + added flags
     + support of case insensitive search
 
   Revision 1.1  2000/03/14 22:09:03  florian
     * Initial revision
-}
+}

+ 67 - 13
packages/regexpr/test1.pp

@@ -16,8 +16,10 @@ procedure do_error(i : longint);
 begin
    writeln('*** Testing unit regexpr ***');
 
+   { basic tests }
+
    r:=GenerateRegExprEngine('[A-Z]',[]);
-   if not(RegExprPos(r,'234578923457823659A38',index,len)) or
+   if not(RegExprPos(r,'234578923457823659GHJK38',index,len)) or
      (index<>18) or (len<>1) then
      do_error(1000);
    DestroyregExprEngine(r);
@@ -47,40 +49,92 @@ begin
      do_error(1005);
    DestroyregExprEngine(r);
 
-   r:=GenerateRegExprEngine('^\d+',[]);
+   r:=GenerateRegExprEngine('[^\d]+',[]);
    if not(RegExprPos(r,'234578923457823659ARTZU38',index,len)) or
      (index<>18) or (len<>5) then
      do_error(1006);
    DestroyregExprEngine(r);
 
-   {
-   r:=GenerateRegExprEngine('(nofoo|foo)1234',[]);
-   if not(RegExprPos(r,'1234   foo1234XXXX',index,len)) or
-     (index<>8) or (len<>7) then
+   { test chaining }
+
+   r:=GenerateRegExprEngine('[A-Z][A-Z]?[A-Z]',[]);
+   if not(RegExprPos(r,'234578923457823659ARTZU38',index,len)) or
+     (index<>18) or (len<>3) then
      do_error(1007);
    DestroyregExprEngine(r);
 
-   r:=GenerateRegExprEngine('(nofoo|foo)1234',[]);
-   if not(RegExprPos(r,'1234   nofoo1234XXXX',index,len)) or
-     (index<>8) or (len<>9) then
+   r:=GenerateRegExprEngine('[A-Z][A-Z]*[0-9]',[]);
+   if not(RegExprPos(r,'234578923457823659ARTZU38',index,len)) or
+     (index<>18) or (len<>6) then
      do_error(1008);
    DestroyregExprEngine(r);
-   }
+
+   r:=GenerateRegExprEngine('[A-Z]+[0-9]',[]);
+   if not(RegExprPos(r,'234578923457823659ARTZU38',index,len)) or
+     (index<>18) or (len<>6) then
+     do_error(1009);
+   DestroyregExprEngine(r);
 
    { case insensitive: }
 
    r:=GenerateRegExprEngine('[A-Z]',[ref_caseinsensitive]);
    if not(RegExprPos(r,'234578923457823659a38',index,len)) or
      (index<>18) or (len<>1) then
-     do_error(1009);
+     do_error(1100);
    DestroyregExprEngine(r);
 
    { case insensitive: }
    r:=GenerateRegExprEngine('[a-z]',[ref_caseinsensitive]);
    if not(RegExprPos(r,'234578923457823659A38',index,len)) or
      (index<>18) or (len<>1) then
-     do_error(1010);
+     do_error(1101);
+   DestroyregExprEngine(r);
+
+   { with parenthsis }
+   r:=GenerateRegExprEngine('(foo)1234',[]);
+   if not(RegExprPos(r,'1234   foo1234XXXX',index,len)) or
+     (index<>7) or (len<>7) then
+     do_error(1200);
+   DestroyregExprEngine(r);
+
+   r:=GenerateRegExprEngine('(((foo)))1234',[]);
+   if not(RegExprPos(r,'1234   foo1234XXXX',index,len)) or
+     (index<>7) or (len<>7) then
+     do_error(1201);
+   DestroyregExprEngine(r);
+
+   r:=GenerateRegExprEngine('(foo)(1234)',[]);
+   if not(RegExprPos(r,'1234   foo1234XXXX',index,len)) or
+     (index<>7) or (len<>7) then
+     do_error(1202);
+   DestroyregExprEngine(r);
+
+   { test real backtracking }
+
+   r:=GenerateRegExprEngine('nofoo|foo',[]);
+   if not(RegExprPos(r,'1234   foo1234XXXX',index,len)) or
+     (index<>7) or (len<>3) then
+     do_error(1300);
    DestroyregExprEngine(r);
 
+   {
+   r:=GenerateRegExprEngine('(nofoo|foo)1234',[]);
+   if not(RegExprPos(r,'1234   nofoo1234XXXX',index,len)) or
+     (index<>8) or (len<>9) then
+     do_error(1008);
+   DestroyregExprEngine(r);
+
+   r:=GenerateRegExprEngine('(nofoo|foo|anotherfoo)1234',[]);
+   if not(RegExprPos(r,'1234   nofoo1234XXXX',index,len)) or
+     (index<>8) or (len<>9) then
+     do_error(1009);
+   DestroyregExprEngine(r);
+
+   r:=GenerateRegExprEngine('nofoo1234|foo1234',[]);
+   if (r.data=nil) or not(RegExprPos(r,'1234   foo1234XXXX',index,len)) or
+     (index<>7) or (len<>7) then
+     do_error(1010);
+   DestroyregExprEngine(r);
+   }
    writeln('*** Testing unit regexpr was successful ***');
-end.
+end.