fpc/packages/base/regexpr/regexpr.pp
florian 51d0c0ef2a * fixes from Ido Kanner
+ RegExprEscapeStr from Ido Kanner
* fixed memory leak

git-svn-id: trunk@953 -
2005-08-28 08:27:18 +00:00

657 lines
22 KiB
ObjectPascal

{
This unit implements basic regular expression support
This file is part of the Free Pascal run time library.
Copyright (c) 2000-2005 by Florian Klaempfl
See the file COPYING.FPC, included in this distribution,
for details about the copyright.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
**********************************************************************}
{ $define DEBUG}
(*
TODO:
- correct backtracking, for example in (...)*
- | support
- getting substrings and using substrings with \1 etc.
- test ^ and $
- newline handling in DOS?
- locals dependend upper/lowercase routines
- extend the interface
- support for number of matches:
{n} Match exactly n times
{n,} Match at least n times
{n,m} Match at least n but not more than m times
*)
{$mode objfpc}
unit regexpr;
interface
{ the following declarions are only in the interface because }
{ some procedures return pregexprentry but programs which }
{ use this unit shouldn't access this data structures }
type
tcharset = set of char;
tregexprentrytype = (ret_charset,ret_or,ret_startpattern,
ret_endpattern,ret_illegalend,ret_backtrace,ret_startline,
ret_endline);
pregexprentry = ^tregexprentry;
tregexprentry = record
next,nextdestroy : pregexprentry;
case typ : tregexprentrytype of
ret_charset : (chars : tcharset;
elsepath : pregexprentry);
ret_or : (alternative : pregexprentry);
end;
tregexprflag = (ref_singleline,ref_multiline,ref_caseinsensitive);
tregexprflags = set of tregexprflag;
TRegExprEngine = record
Data : pregexprentry;
DestroyList : pregexprentry;
Flags : TRegExprFlags;
end;
const
cs_allchars : tcharset = [#0..#255];
cs_wordchars : tcharset = ['A'..'Z','a'..'z','_','0'..'9'];
cs_newline : tcharset = [#10];
cs_digits : tcharset = ['0'..'9'];
cs_whitespace : tcharset = [' ',#9];
var
{ these are initilized in the init section of the unit }
cs_nonwordchars : tcharset;
cs_nondigits : tcharset;
cs_nonwhitespace : tcharset;
{ the following procedures can be used by units basing }
{ on the regexpr unit }
function GenerateRegExprEngine(regexpr : pchar;flags : tregexprflags) : TRegExprEngine;
procedure DestroyRegExprEngine(var regexpr : TRegExprEngine);
function RegExprPos(regexprengine : TRegExprEngine;p : pchar;var index,len : longint) : boolean;
{ This function Escape known regex chars and place the result on Return. If something went wrong the function will return false. }
function RegExprEscapeStr (const S : AnsiString) : AnsiString;
implementation
{$ifdef DEBUG}
procedure writecharset(c : tcharset);
var
b : byte;
begin
for b:=0 to 255 do
if chr(b) in c then
write(chr(b));
writeln;
end;
{$endif DEBUG}
function GenerateRegExprEngine(regexpr : pchar;flags : tregexprflags) : TRegExprEngine;
var
first : pregexprentry;
procedure doregister(p : pregexprentry);
begin
p^.nextdestroy:=first;
first:=p;
end;
var
currentpos : pchar;
error : boolean;
function readchars : tcharset;
var
c1 : char;
begin
readchars:=[];
case currentpos^ of
#0:
exit;
'.':
begin
inc(currentpos);
readchars:=cs_allchars-cs_newline;
end;
'\':
begin
inc(currentpos);
case currentpos^ of
#0:
begin
error:=true;
exit;
end;
't':
begin
inc(currentpos);
readchars:=[#9];
end;
'n':
begin
inc(currentpos);
readchars:=[#10];
end;
'r':
begin
inc(currentpos);
readchars:=[#13];
end;
'd':
begin
inc(currentpos);
readchars:=cs_digits;
end;
'D':
begin
inc(currentpos);
readchars:=cs_nondigits;
end;
's':
begin
inc(currentpos);
readchars:=cs_whitespace;
end;
'S':
begin
inc(currentpos);
readchars:=cs_nonwhitespace;
end;
'w':
begin
inc(currentpos);
readchars:=cs_wordchars;
end;
'W':
begin
inc(currentpos);
readchars:=cs_nonwordchars;
end;
'f' :
begin
inc(currentpos);
readchars:= [#12];
end;
'a' :
begin
inc(currentpos);
readchars:= [#7];
end;
else
begin //Some basic escaping...
readchars := [currentpos^];
inc (currentpos);
{error:=true;
exit;}
end;
end;
end;
else
begin
if ref_caseinsensitive in flags then
c1:=upcase(currentpos^)
else
c1:=currentpos^;
inc(currentpos);
if currentpos^='-' then
begin
inc(currentpos);
if currentpos^=#0 then
begin
error:=true;
exit;
end;
if ref_caseinsensitive in flags then
readchars:=[c1..upcase(currentpos^)]
else
readchars:=[c1..currentpos^];
inc(currentpos);
end
else
readchars:=[c1];
end;
end;
end;
function readcharset : tcharset;
begin
readcharset:=[];
case currentpos^ of
#0:
exit;
'[':
begin
inc(currentpos);
while currentpos^<>']' do
begin
if currentpos^='^' then
begin
inc(currentpos);
readcharset:=readcharset+(cs_allchars-readchars);
end
else
readcharset:=readcharset+readchars;
if error or (currentpos^=#0) then
begin
error:=true;
exit;
end;
end;
inc(currentpos);
end;
'^':
begin
inc(currentpos);
readcharset:=cs_allchars-readchars;
end;
else
readcharset:=readchars;
end;
end;
function parseregexpr(next,elsepath : pregexprentry) : pregexprentry;
var
hp,hp2,ep : pregexprentry;
cs : tcharset;
chaining : ^pregexprentry;
begin
chaining:=nil;
parseregexpr:=nil;
if error then
exit;
{ this dummy allows us to redirect the elsepath later }
new(ep);
doregister(ep);
ep^.typ:=ret_charset;
ep^.chars:=[];
ep^.elsepath:=elsepath;
elsepath:=ep;
while true do
begin
if error then
exit;
case currentpos^ of
'(':
begin
inc(currentpos);
new(hp2);
doregister(hp2);
hp2^.typ:=ret_charset;
hp2^.chars:=[];
hp2^.elsepath:=next;
hp:=parseregexpr(hp2,ep);
if assigned(chaining) then
chaining^:=hp
else
parseregexpr:=hp;
chaining:=@hp2^.elsepath;
if currentpos^<>')' then
begin
error:=true;
exit;
end;
inc(currentpos);
end;
(* '|':
begin
{$ifdef DEBUG}
writeln('Creating backtrace entry');
{$endif DEBUG}
if (not assigned (hp2)) then
new (hp2);
while currentpos^='|' do
begin
inc(currentpos);
if currentpos^=#0 then
begin
error:=true;
exit;
end;
doregister(hp2);
hp2^.typ:=ret_charset;
hp2^.chars:=[];
hp2^.elsepath:=next;
new(hp);
doregister(hp);
hp^.typ:=ret_backtrace;
hp^.elsepath:= parseregexpr (next, elsepath);
hp^.next:=next;
if assigned(chaining) then
chaining^:=hp
else
parseregexpr:=hp;
chaining:=@hp^.elsepath;
end;
exit;
end;
*)
')':
exit;
'^':
begin
inc(currentpos);
new(hp);
doregister(hp);
hp^.typ:=ret_startline;
hp^.elsepath:=ep;
// hp^.next:=parseregexpr(ep);
end;
'$':
begin
inc(currentpos);
new(hp);
doregister(hp);
hp^.typ:=ret_endline;
hp^.elsepath:=ep;
// hp^.next:=parseregexpr(ep);
end;
#0:
exit;
else
begin
cs:=readcharset;
if error then
exit;
case currentpos^ of
'*':
begin
inc(currentpos);
new(hp);
doregister(hp);
hp^.typ:=ret_charset;
hp^.chars:=cs;
hp^.elsepath:=next;
hp^.next:=hp;
if assigned(chaining) then
chaining^:=hp
else
parseregexpr:=hp;
chaining:=@hp^.elsepath;
end;
'+':
begin
inc(currentpos);
new(hp);
new(hp2);
doregister(hp);
doregister(hp2);
hp^.typ:=ret_charset;
hp2^.typ:=ret_charset;
hp^.chars:=cs;
hp2^.chars:=cs;
hp^.elsepath:=elsepath;
hp^.next:=hp2;
hp2^.elsepath:=next;
hp2^.next:=hp2;
if assigned(chaining) then
chaining^:=hp
else
parseregexpr:=hp;
chaining:=@hp2^.elsepath;
end;
'?':
begin
inc(currentpos);
new(hp);
{ this is a dummy }
new(hp2);
doregister(hp);
doregister(hp2);
hp^.typ:=ret_charset;
hp^.chars:=cs;
hp^.next:=hp2;
hp^.elsepath:=hp2;
hp2^.typ:=ret_charset;
hp2^.chars:=[];
hp2^.elsepath:=next;
if assigned(chaining) then
chaining^:=hp
else
parseregexpr:=hp;
chaining:=@hp2^.elsepath;
end;
else
begin
new(hp);
doregister(hp);
hp^.typ:=ret_charset;
hp^.chars:=cs;
hp^.elsepath:=elsepath;
hp^.next:=next;
if assigned(chaining) then
chaining^:=hp
else
parseregexpr:=hp;
chaining:=@hp^.next;
end;
end;
end;
end;
end;
end;
var
endp : pregexprentry;
begin
GenerateRegExprEngine.Data:=nil;
GenerateRegExprEngine.DestroyList:=nil;
if regexpr=nil then
exit;
first:=nil;
if (ref_singleline in flags) and (ref_multiline in flags) then
exit;
currentpos:=regexpr;
error:=false;
new(endp);
doregister(endp);
endp^.typ:=ret_illegalend;
GenerateRegExprEngine.flags:=flags;
GenerateRegExprEngine.Data:=parseregexpr(nil,endp);
GenerateRegExprEngine.DestroyList:=first;
if error or (currentpos^<>#0) then
DestroyRegExprEngine(Result);
end;
procedure DestroyRegExprEngine(var regexpr : TRegExprEngine);
var
hp : pregexprentry;
begin
hp:=regexpr.DestroyList;
while assigned(hp) do
begin
regexpr.DestroyList:=hp^.nextdestroy;
dispose(hp);
hp:=regexpr.DestroyList;
end;
regexpr.Data:=nil;
regexpr.DestroyList:=nil;
end;
function RegExprPos(regexprengine : TRegExprEngine;p : pchar;var index,len : longint) : boolean;
var
lastpos : pchar;
function dosearch(regexpr : pregexprentry;pos : pchar) : boolean;
begin
dosearch:=false;
while true do
begin
{$IFDEF Debug}
writeln(byte(regexpr^.typ));
{$ENDIF Debug}
case regexpr^.typ of
ret_endline:
begin
if ref_multiline in regexprengine.flags then
begin
if ((pos+1)^ in [#10,#0]) then
regexpr:=regexpr^.next
else
regexpr:=regexpr^.elsepath;
end
else
begin
if (pos+1)^=#0 then
regexpr:=regexpr^.next
else
regexpr:=regexpr^.elsepath;
end;
end;
ret_startline:
begin
if ref_multiline in regexprengine.flags then
begin
if (pos=p) or ((pos-1)^=#10) then
regexpr:=regexpr^.next
else
regexpr:=regexpr^.elsepath;
end
else
begin
if pos=p then
regexpr:=regexpr^.next
else
regexpr:=regexpr^.elsepath;
end;
end;
ret_charset:
begin
if (pos^ in regexpr^.chars) or
((ref_caseinsensitive in regexprengine.flags) and
(upcase(pos^) in regexpr^.chars)) then
begin
{$ifdef DEBUG}
writeln('Found matching: ',pos^);
{$endif DEBUG}
regexpr:=regexpr^.next;
inc(pos);
end
else
begin
{$ifdef DEBUG}
writeln('Found unmatching: ',pos^);
{$endif DEBUG}
regexpr:=regexpr^.elsepath;
end;
end;
ret_backtrace:
begin
{$ifdef DEBUG}
writeln('Starting backtrace');
{$endif DEBUG}
if dosearch(regexpr^.next,pos) then
begin
dosearch:=true;
exit;
end
else if dosearch(regexpr^.elsepath,pos) then
begin
dosearch:=true;
exit;
end
else
exit;
end;
end;
lastpos:=pos;
if regexpr=nil then
begin
dosearch:=true;
exit;
end;
if regexpr^.typ=ret_illegalend then
exit;
if pos^=#0 then
exit;
end;
end;
begin
RegExprPos:=false;
index:=0;
len:=0;
if regexprengine.Data=nil then
exit;
while p^<>#0 do
begin
if dosearch(regexprengine.Data,p) then
begin
len:=lastpos-p;
RegExprPos:=true;
exit;
end
else
begin
inc(p);
inc(index);
end;
end;
index:=-1;
end;
function RegExprEscapeStr (const S : AnsiString) : AnsiString;
var
i, len : SizeUInt;
begin
Result := '';
if (S = '') then
exit;
SetLength(Result,Length(S)*2);
len := Length (S);
for i := 1 to len do
begin
if (S [i] in ['(','|', '.', '*', '?', '^', '$', '-', '[', '{', '}', ']', ')', '\']) then
begin
Result := Result + '\';
end;
Result := Result + S[i];
end;
SetLength(Result,Length(Result));
end;
begin
cs_nonwordchars:=cs_allchars-cs_wordchars;
cs_nondigits:=cs_allchars-cs_digits;
cs_nonwhitespace:=cs_allchars-cs_whitespace;
end.