fpc/packages/base/regexpr/regexpr.pp

{
    This unit implements basic regular expression support

    This file is part of the Free Pascal run time library.
    Copyright (c) 2000-2005 by Florian Klaempfl

    See the file COPYING.FPC, included in this distribution,
    for details about the copyright.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

 **********************************************************************}
{ $define DEBUG}
(*
  TODO:
     - correct backtracking, for example in (...)*
     - | support
     - getting substrings and using substrings with \1 etc.
     - test ^  and $
     - newline handling in DOS?
     - locals dependend upper/lowercase routines
     - extend the interface
     - support for number of matches:
       {n}    Match exactly n times
       {n,}   Match at least n times
       {n,m}  Match at least n but not more than m times

*)

{$mode objfpc}

unit regexpr;

  interface

    { the following declarions are only in the interface because }
    { some procedures return pregexprentry but programs which   }
    { use this unit shouldn't access this data structures        }
    type
       tcharset = set of char;
       tregexprentrytype = (ret_charset,ret_or,ret_startpattern,
          ret_endpattern,ret_illegalend,ret_backtrace,ret_startline,
          ret_endline);

       pregexprentry = ^tregexprentry;
       tregexprentry = record
          next,nextdestroy : pregexprentry;
          case typ : tregexprentrytype of
             ret_charset : (chars : tcharset;
                            elsepath : pregexprentry);
             ret_or : (alternative : pregexprentry);
       end;

       tregexprflag = (ref_singleline,ref_multiline,ref_caseinsensitive);
       tregexprflags = set of tregexprflag;

       TRegExprEngine = record
          Data : pregexprentry;
          DestroyList : pregexprentry;
          Flags : TRegExprFlags;
       end;

     const
        cs_allchars : tcharset = [#0..#255];
        cs_wordchars : tcharset = ['A'..'Z','a'..'z','_','0'..'9'];
        cs_newline : tcharset = [#10];
        cs_digits : tcharset = ['0'..'9'];
        cs_whitespace : tcharset = [' ',#9];

     var
        { these are initilized in the init section of the unit }
        cs_nonwordchars : tcharset;
        cs_nondigits : tcharset;
        cs_nonwhitespace : tcharset;

     { the following procedures can be used by units basing }
     { on the regexpr unit                                  }
     function GenerateRegExprEngine(regexpr : pchar;flags : tregexprflags) : TRegExprEngine;

     procedure DestroyRegExprEngine(var regexpr : TRegExprEngine);

     function RegExprPos(regexprengine : TRegExprEngine;p : pchar;var index,len : longint) : boolean;

     { This function Escape known regex chars and place the result on Return. If something went wrong the function will return false. }
     function RegExprEscapeStr (const S : AnsiString) : AnsiString;

  implementation

{$ifdef DEBUG}
     procedure writecharset(c : tcharset);

       var
          b : byte;

       begin
          for b:=0 to 255 do
            if chr(b) in c then
              write(chr(b));
          writeln;
       end;
{$endif DEBUG}

     function GenerateRegExprEngine(regexpr : pchar;flags : tregexprflags) : TRegExprEngine;

       var
          first : pregexprentry;

       procedure doregister(p : pregexprentry);

         begin
           p^.nextdestroy:=first;
           first:=p;
         end;

       var
          currentpos : pchar;
          error : boolean;

       function readchars : tcharset;

         var
            c1 : char;

         begin
            readchars:=[];
            case currentpos^ of
               #0:
                  exit;
               '.':
                  begin
                     inc(currentpos);
                     readchars:=cs_allchars-cs_newline;
                  end;
               '\':
                  begin
                     inc(currentpos);
                     case currentpos^ of
                        #0:
                          begin
                             error:=true;
                             exit;
                          end;
                        't':
                           begin
                              inc(currentpos);
                              readchars:=[#9];
                           end;
                        'n':
                           begin
                              inc(currentpos);
                              readchars:=[#10];
                           end;
                        'r':
                           begin
                              inc(currentpos);
                              readchars:=[#13];
                           end;
                        'd':
                          begin
                             inc(currentpos);
                             readchars:=cs_digits;
                          end;
                        'D':
                          begin
                             inc(currentpos);
                             readchars:=cs_nondigits;
                          end;
                        's':
                          begin
                             inc(currentpos);
                             readchars:=cs_whitespace;
                          end;
                        'S':
                          begin
                             inc(currentpos);
                             readchars:=cs_nonwhitespace;
                          end;
                        'w':
                           begin
                              inc(currentpos);
                              readchars:=cs_wordchars;
                           end;
                        'W':
                           begin
                              inc(currentpos);
                              readchars:=cs_nonwordchars;
                           end;
                        'f' :
                            begin
                              inc(currentpos);
                              readchars:= [#12];
                            end;
                        'a' :
                            begin
                              inc(currentpos);
                              readchars:= [#7];
                            end;
                         else
                           begin //Some basic escaping...
                              readchars := [currentpos^];
                              inc (currentpos);
                              {error:=true;
                              exit;}
                           end;
                     end;
                  end;
               else
                 begin
                    if ref_caseinsensitive in flags then
                       c1:=upcase(currentpos^)
                    else
                       c1:=currentpos^;

                    inc(currentpos);
                    if currentpos^='-' then
                      begin
                         inc(currentpos);
                         if currentpos^=#0 then
                           begin
                              error:=true;
                              exit;
                           end;
                         if ref_caseinsensitive in flags then
                           readchars:=[c1..upcase(currentpos^)]
                         else
                           readchars:=[c1..currentpos^];
                         inc(currentpos);
                      end
                    else
                      readchars:=[c1];
                 end;
            end;
         end;


       function readcharset : tcharset;

         begin
            readcharset:=[];
            case currentpos^ of
               #0:
                  exit;
               '[':
                  begin
                     inc(currentpos);
                     while currentpos^<>']' do
                       begin
                          if currentpos^='^' then
                            begin
                               inc(currentpos);
                               readcharset:=readcharset+(cs_allchars-readchars);
                            end
                          else
                            readcharset:=readcharset+readchars;
                          if error or (currentpos^=#0) then
                            begin
                               error:=true;
                               exit;
                            end;
                       end;
                     inc(currentpos);
                  end;
               '^':
                  begin
                     inc(currentpos);
                     readcharset:=cs_allchars-readchars;
                  end;
               else
                  readcharset:=readchars;
            end;
         end;

       function parseregexpr(next,elsepath : pregexprentry) : pregexprentry;

         var
            hp,hp2,ep : pregexprentry;
            cs : tcharset;
            chaining : ^pregexprentry;

         begin
            chaining:=nil;
            parseregexpr:=nil;
            if error then
              exit;
            { this dummy allows us to redirect the elsepath later }
            new(ep);
            doregister(ep);
            ep^.typ:=ret_charset;
            ep^.chars:=[];
            ep^.elsepath:=elsepath;
            elsepath:=ep;
            while true do
              begin
                 if error then
                   exit;
                 case currentpos^ of
                    '(':
                       begin
                          inc(currentpos);
                          new(hp2);
                          doregister(hp2);
                          hp2^.typ:=ret_charset;
                          hp2^.chars:=[];
                          hp2^.elsepath:=next;
                          hp:=parseregexpr(hp2,ep);
                          if assigned(chaining) then
                            chaining^:=hp
                          else
                            parseregexpr:=hp;
                          chaining:=@hp2^.elsepath;
                          if currentpos^<>')' then
                            begin
                               error:=true;
                               exit;
                            end;
                          inc(currentpos);
                       end;

(*                    '|':
                       begin
{$ifdef DEBUG}
                          writeln('Creating backtrace entry');
{$endif DEBUG}
                          if (not assigned (hp2)) then
                            new (hp2);

                          while currentpos^='|' do
                            begin
                              inc(currentpos);
                              if currentpos^=#0 then
                                begin
                                   error:=true;
                                   exit;
                                end;

                          doregister(hp2);
                          hp2^.typ:=ret_charset;
                          hp2^.chars:=[];
                          hp2^.elsepath:=next;

                              new(hp);
                              doregister(hp);
                              hp^.typ:=ret_backtrace;
                              hp^.elsepath:= parseregexpr (next, elsepath);
                              hp^.next:=next;
                              if assigned(chaining) then
                                chaining^:=hp
                              else
                                parseregexpr:=hp;
                              chaining:=@hp^.elsepath;
                            end;
                          exit;
                       end;
*)
                    ')':
                       exit;
                    '^':
                       begin
                          inc(currentpos);
                          new(hp);
                          doregister(hp);
                          hp^.typ:=ret_startline;
                          hp^.elsepath:=ep;
                          // hp^.next:=parseregexpr(ep);
                       end;
                    '$':
                       begin
                          inc(currentpos);
                          new(hp);
                          doregister(hp);
                          hp^.typ:=ret_endline;
                          hp^.elsepath:=ep;
                          // hp^.next:=parseregexpr(ep);
                       end;
                    #0:
                       exit;
                    else
                      begin
                         cs:=readcharset;
                         if error then
                           exit;
                         case currentpos^ of
                            '*':
                               begin
                                  inc(currentpos);
                                  new(hp);
                                  doregister(hp);
                                  hp^.typ:=ret_charset;
                                  hp^.chars:=cs;
                                  hp^.elsepath:=next;
                                  hp^.next:=hp;
                                  if assigned(chaining) then
                                    chaining^:=hp
                                  else
                                    parseregexpr:=hp;
                                  chaining:=@hp^.elsepath;
                               end;
                            '+':
                               begin
                                  inc(currentpos);
                                  new(hp);
                                  new(hp2);
                                  doregister(hp);
                                  doregister(hp2);
                                  hp^.typ:=ret_charset;
                                  hp2^.typ:=ret_charset;
                                  hp^.chars:=cs;
                                  hp2^.chars:=cs;
                                  hp^.elsepath:=elsepath;
                                  hp^.next:=hp2;
                                  hp2^.elsepath:=next;
                                  hp2^.next:=hp2;
                                  if assigned(chaining) then
                                    chaining^:=hp
                                  else
                                    parseregexpr:=hp;
                                  chaining:=@hp2^.elsepath;
                               end;
                            '?':
                               begin
                                  inc(currentpos);
                                  new(hp);
                                  { this is a dummy }
                                  new(hp2);
                                  doregister(hp);
                                  doregister(hp2);
                                  hp^.typ:=ret_charset;
                                  hp^.chars:=cs;
                                  hp^.next:=hp2;
                                  hp^.elsepath:=hp2;
                                  hp2^.typ:=ret_charset;
                                  hp2^.chars:=[];
                                  hp2^.elsepath:=next;
                                  if assigned(chaining) then
                                    chaining^:=hp
                                  else
                                    parseregexpr:=hp;
                                  chaining:=@hp2^.elsepath;
                               end;
                            else
                               begin
                                  new(hp);
                                  doregister(hp);
                                  hp^.typ:=ret_charset;
                                  hp^.chars:=cs;
                                  hp^.elsepath:=elsepath;
                                  hp^.next:=next;
                                  if assigned(chaining) then
                                    chaining^:=hp
                                  else
                                    parseregexpr:=hp;
                                  chaining:=@hp^.next;
                               end;
                         end;
                      end;
                 end;
              end;
         end;

       var
          endp : pregexprentry;

       begin
          GenerateRegExprEngine.Data:=nil;
          GenerateRegExprEngine.DestroyList:=nil;
          if regexpr=nil then
            exit;
          first:=nil;
          if (ref_singleline in flags) and (ref_multiline in flags) then
            exit;
          currentpos:=regexpr;
          error:=false;
          new(endp);
          doregister(endp);
          endp^.typ:=ret_illegalend;
          GenerateRegExprEngine.flags:=flags;
          GenerateRegExprEngine.Data:=parseregexpr(nil,endp);
          GenerateRegExprEngine.DestroyList:=first;
          if error or (currentpos^<>#0) then
            DestroyRegExprEngine(Result);
       end;

    procedure DestroyRegExprEngine(var regexpr : TRegExprEngine);

       var
          hp : pregexprentry;

       begin
          hp:=regexpr.DestroyList;
          while assigned(hp) do
            begin
               regexpr.DestroyList:=hp^.nextdestroy;
               dispose(hp);
               hp:=regexpr.DestroyList;
            end;
          regexpr.Data:=nil;
          regexpr.DestroyList:=nil;
       end;

     function RegExprPos(regexprengine : TRegExprEngine;p : pchar;var index,len : longint) : boolean;

       var
          lastpos : pchar;

       function dosearch(regexpr : pregexprentry;pos : pchar) : boolean;

         begin
            dosearch:=false;
            while true do
              begin
                 {$IFDEF Debug}
                 writeln(byte(regexpr^.typ));
                 {$ENDIF Debug}
                 case regexpr^.typ of
                    ret_endline:
                      begin
                         if ref_multiline in regexprengine.flags then
                           begin
                              if ((pos+1)^ in [#10,#0]) then
                                regexpr:=regexpr^.next
                              else
                                regexpr:=regexpr^.elsepath;
                           end
                         else
                           begin
                              if (pos+1)^=#0 then
                                regexpr:=regexpr^.next
                              else
                                regexpr:=regexpr^.elsepath;
                           end;
                      end;
                    ret_startline:
                      begin
                         if ref_multiline in regexprengine.flags then
                           begin
                              if (pos=p) or ((pos-1)^=#10) then
                                regexpr:=regexpr^.next
                              else
                                regexpr:=regexpr^.elsepath;
                           end
                         else
                           begin
                              if pos=p then
                                regexpr:=regexpr^.next
                              else
                                regexpr:=regexpr^.elsepath;
                           end;
                      end;
                    ret_charset:
                      begin
                         if (pos^ in regexpr^.chars) or
                           ((ref_caseinsensitive in regexprengine.flags) and
                            (upcase(pos^) in regexpr^.chars)) then
                           begin
{$ifdef DEBUG}
                              writeln('Found matching: ',pos^);
{$endif DEBUG}
                              regexpr:=regexpr^.next;
                              inc(pos);
                           end
                         else
                           begin
{$ifdef DEBUG}
                              writeln('Found unmatching: ',pos^);
{$endif DEBUG}
                              regexpr:=regexpr^.elsepath;
                           end;
                      end;
                    ret_backtrace:
                      begin
{$ifdef DEBUG}
                         writeln('Starting backtrace');
{$endif DEBUG}
                         if dosearch(regexpr^.next,pos) then
                           begin
                              dosearch:=true;
                              exit;
                           end
                         else if dosearch(regexpr^.elsepath,pos) then
                           begin
                              dosearch:=true;
                              exit;
                           end
                         else
                           exit;
                      end;
                 end;
                 lastpos:=pos;
                 if regexpr=nil then
                   begin
                      dosearch:=true;
                      exit;
                   end;
                 if regexpr^.typ=ret_illegalend then
                   exit;
                 if pos^=#0 then
                   exit;
              end;
         end;

       begin
          RegExprPos:=false;
          index:=0;
          len:=0;
          if regexprengine.Data=nil then
            exit;
          while p^<>#0 do
            begin
               if dosearch(regexprengine.Data,p) then
                 begin
                    len:=lastpos-p;
                    RegExprPos:=true;
                    exit;
                 end
               else
                 begin
                    inc(p);
                    inc(index);
                 end;
            end;
          index:=-1;
       end;


  function RegExprEscapeStr (const S : AnsiString) : AnsiString;
    var
     i, len   : SizeUInt;

    begin
      Result := '';
      if (S = '') then
       exit;

      SetLength(Result,Length(S)*2);

      len := Length (S);

      for i := 1 to len do
        begin
          if (S [i] in ['(','|', '.', '*', '?', '^', '$', '-', '[', '{', '}', ']', ')', '\']) then
            begin
              Result := Result + '\';
            end;

          Result := Result + S[i];
        end;
      SetLength(Result,Length(Result));
    end;

begin
   cs_nonwordchars:=cs_allchars-cs_wordchars;
   cs_nondigits:=cs_allchars-cs_digits;
   cs_nonwhitespace:=cs_allchars-cs_whitespace;
end.