lazarus-ccr/components/orpheus/o32rxngn.pas
2007-01-16 02:17:08 +00:00

1064 lines
35 KiB
ObjectPascal

{*********************************************************}
{* O32RXNGN.PAS 4.06 *}
{*********************************************************}
{* ***** BEGIN LICENSE BLOCK ***** *}
{* Version: MPL 1.1 *}
{* *}
{* The contents of this file are subject to the Mozilla Public License *}
{* Version 1.1 (the "License"); you may not use this file except in *}
{* compliance with the License. You may obtain a copy of the License at *}
{* http://www.mozilla.org/MPL/ *}
{* *}
{* Software distributed under the License is distributed on an "AS IS" basis, *}
{* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License *}
{* for the specific language governing rights and limitations under the *}
{* License. *}
{* *}
{* The Original Code is TurboPower Orpheus *}
{* *}
{* The Initial Developer of the Original Code is TurboPower Software *}
{* *}
{* Portions created by TurboPower Software Inc. are Copyright (C)1995-2002 *}
{* TurboPower Software Inc. All Rights Reserved. *}
{* *}
{* Contributor(s): *}
{* *}
{* ***** END LICENSE BLOCK ***** *}
{The following is a modified version of a Regex engine, which was originally
written for Algorithms Alfresco. It is copyright(c)2001 by Julian M. Bucknall
and is used here with permission.}
{$I OVC.INC}
{$B-} {Complete Boolean Evaluation}
{$I+} {Input/Output-Checking}
{$P+} {Open Parameters}
{$T-} {Typed @ Operator}
{.W-} {Windows Stack Frame}
{$X+} {Extended Syntax}
{Notes: these classes parse regular expressions that follow this grammar:
<anchorexpr> ::= <expr> |
'^' <expr> |
<expr> '$' |
'^' <expr> '$'
<expr> ::= <term> |
<term> '|' <expr> - alternation
<term> ::= <factor> |
<factor><term> - concatenation
<factor> ::= <atom> |
<atom> '?' | - zero or one
<atom> '*' | - zero or more
<atom> '+' - one or more
<atom> ::= <char> |
'.' | - any char
'(' <expr> ') | - parentheses
'[' <charclass> ']' | - normal class
'[^' <charclass> ']' - negated class
<charclass> ::= <charrange> |
<charrange><charclass>
<charrange> ::= <ccchar> |
<ccchar> '-' <ccchar>
<char> ::= <any character except metacharacters> |
'\' <any character at all>
<ccchar> ::= <any character except '-' and ']'> |
'\' <any character at all>
This means that parentheses have maximum precedence, followed
by square brackets, followed by the closure operators,
followed by concatenation, finally followed by alternation.}
unit o32rxngn;
{Orpheus Regex Engine}
interface
uses
Classes, SysUtils, O32IntDeq, O32IntLst;
type
PO32CharSet = ^TO32CharSet;
TO32CharSet = set of char;
TO32NFAMatchType = ( {types of matching performed...}
mtNone, {..no match (an epsilon no-cost move)}
mtAnyChar, {..any character}
mtChar, {..a particular character}
mtClass, {..a character class}
mtNegClass, {..a negated character class}
mtTerminal, {..the final state--no matching}
mtUnused); {..an unused state--no matching}
TO32RegexError = ( {error codes for invalid regex strings}
recNone, {..no error}
recSuddenEnd, {..unexpected end of string}
recMetaChar, {..read metacharacter, but needed normal char}
recNoCloseParen, {..expected close paren, but not there}
recExtraChars); {..extra characters at the end of the Regex string}
TO32UpcaseChar = function (aCh : char) : char;
{- Regex Engine Class}
TO32RegexEngine = class
protected {private}
FAnchorEnd : boolean;
FAnchorStart : boolean;
FErrorCode : TO32RegexError;
FIgnoreCase : boolean;
FPosn : PAnsiChar;
FRegexStr : string;
FStartState : integer;
FTable : TList;
FUpcase : TO32UpcaseChar;
FLogging : Boolean;
Log : System.Text;
FLogFile : string;
procedure SetLogFile(const Value: String);
procedure rcSetIgnoreCase(aValue : boolean);
procedure rcSetRegexStr(const aRegexStr : string);
procedure rcSetUpcase(aValue : TO32UpcaseChar);
procedure rcSetLogging(const aValue : Boolean);
procedure rcClear;
procedure rcLevel1Optimize;
procedure rcLevel2Optimize;
function rcMatchSubString(const S : string;
StartPosn : integer) : boolean;
function rcAddState(aMatchType : TO32NFAMatchType;
aChar : char;
aCharClass : PO32CharSet;
aNextState1: integer;
aNextState2: integer) : integer;
function rcSetState(aState : integer;
aNextState1: integer;
aNextState2: integer) : integer;
function rcParseAnchorExpr : integer;
function rcParseAtom : integer;
function rcParseCCChar : char;
function rcParseChar : integer;
function rcParseCharClass(aClass : PO32CharSet) : boolean;
function rcParseCharRange(aClass : PO32CharSet) : boolean;
function rcParseExpr : integer;
function rcParseFactor : integer;
function rcParseTerm : integer;
procedure rcWalkNoCostTree(aList : TO32IntList;
aState : integer);
procedure rcDumpTable;
public
constructor Create(const aRegexStr : string);
destructor Destroy; override;
function Parse(var aErrorPos : integer;
var aErrorCode: TO32RegexError) : boolean;
function MatchString(const S : string) : integer;
property IgnoreCase : boolean
read FIgnoreCase write rcSetIgnoreCase;
property RegexString : string
read FRegexStr write rcSetRegexStr;
property Upcase : TO32UpcaseChar
read FUpcase write rcSetUpcase;
property Logging : Boolean
read FLogging write rcSetLogging;
property LogFile : String read FLogFile write SetLogFile;
end;
implementation
const
MetaCharacters: set of char =
['?', '*', '+', '-', '.', '|', '(', ')', '[', ']', '^', '$'];
{handy constants}
UnusedState = -1;
NewFinalState = -2;
CreateNewState = -3;
ErrorState = -4;
MustScan = -5;
type
PO32NFAState = ^TO32NFAState;
TO32NFAState = record
sdNextState1: integer;
sdNextState2: integer;
sdNextList : TO32IntList;
sdClass : PO32CharSet;
sdMatchType : TO32NFAMatchType;
sdChar : char;
end;
{===TO32RegexEngine=================================================}
constructor TO32RegexEngine.Create(const aRegexStr : string);
begin
inherited Create;
FRegexStr := aRegexStr;
FIgnoreCase := true;
FUpcase := System.Upcase;
FTable := TList.Create;
FTable.Capacity := 64;
FLogging := false;
end;
{--------}
destructor TO32RegexEngine.Destroy;
begin
if (FTable <> nil) then begin
rcClear;
FTable.Free;
end;
inherited Destroy;
end;
{--------}
procedure TO32RegexEngine.rcSetLogging(const aValue : Boolean);
begin
FLogging := aValue;
end;
{--------}
procedure TO32RegexEngine.SetLogFile(const Value: String);
begin
FLogFile := Value;
end;
{--------}
function TO32RegexEngine.MatchString(const S : string) : integer;
var
i : integer;
ErrorPos : integer;
ErrorCode : TO32RegexError;
begin
{if the regex string hasn't been parsed yet, do so}
if (FTable.Count = 0) then begin
if not Parse(ErrorPos, ErrorCode) then begin
raise Exception.Create(
Format('The regex was invalid at position %d', [ErrorPos]));
end;
end;
{now try and see if the string matches (empty strings don't)}
Result := 0;
if (S <> '') then
{if the regex specified a start anchor, then we only need to check
the string starting at the first position}
if FAnchorStart then begin
if rcMatchSubString(S, 1) then
Result := 1;
end
{otherwise we try and match the string at every position and
return at the first success}
else begin
for i := 1 to length(S) do
if rcMatchSubString(S, i) then begin
Result := i;
Break;
end;
end;
end;
{--------}
function TO32RegexEngine.Parse(var aErrorPos : integer;
var aErrorCode: TO32RegexError)
: boolean;
procedure WriteError(aErrorPos : integer;
aErrorCode: TO32RegexError);
begin
writeln(Log, '***parse error found at ', aErrorPos);
case aErrorCode of
recNone : writeln(Log, '-->no error');
recSuddenEnd : writeln(Log, '-->unexpected end of regex');
recMetaChar : writeln(Log, '-->found metacharacter in wrong place');
recNoCloseParen : writeln(Log, '-->missing close paren');
recExtraChars : writeln(Log, '-->extra chars after valid regex');
end;
writeln(Log, '"', FRegexStr, '"');
writeln(Log, '^':succ(aErrorPos));
end;
begin
result := false;
if FLogging then begin
if FLogFile = '' then FLogFile := 'Parse.log';
System.Assign(Log, FLogFile);
System.Rewrite(Log);
end;
try
if FLogging then writeln(Log, 'Parsing regex: "', FRegexStr, '"');
{clear the current transition table}
rcClear;
{empty regex strings are not allowed}
if (FRegexStr = '') then begin
aErrorPos := 1;
aErrorCode := recSuddenEnd;
if FLogging then WriteError(aErrorPos, aErrorCode);
Exit;
end;
{parse the regex string}
FPosn := PAnsiChar(FRegexStr);
FStartState := rcParseAnchorExpr;
{if an error occurred or we're not at the end of the regex string,
clear the transition table, return false and the error position}
if (FStartState = ErrorState) or (FPosn^ <> #0) then begin
if (FStartState <> ErrorState) and (FPosn^ <> #0) then
FErrorCode := recExtraChars;
rcClear;
aErrorPos := succ(FPosn - PAnsiChar(FRegexStr));
aErrorCode := FErrorCode;
if FLogging then WriteError(aErrorPos, aErrorCode);
end
{otherwise add a terminal state, optimize, return true}
else begin
rcAddState(mtTerminal, #0, nil, UnusedState, UnusedState);
rcLevel1Optimize;
rcLevel2Optimize;
Result := true;
aErrorPos := 0;
aErrorCode := recNone;
if FLogging then rcDumpTable;
end;
finally
if FLogging then System.Close(Log);
end;
end;
{--------}
function TO32RegexEngine.rcAddState(aMatchType : TO32NFAMatchType;
aChar : char;
aCharClass : PO32CharSet;
aNextState1: integer;
aNextState2: integer) : integer;
var
StateData : PO32NFAState;
begin
{create the new state record}
StateData := AllocMem(sizeof(TO32NFAState));
{set up the fields in the state record}
if (aNextState1 = NewFinalState) then
StateData^.sdNextState1 := succ(FTable.Count)
else
StateData^.sdNextState1 := aNextState1;
StateData^.sdNextState2 := aNextState2;
StateData^.sdMatchType := aMatchType;
if (aMatchType = mtChar) then
StateData^.sdChar := aChar
else if (aMatchType = mtClass) or (aMatchType = mtNegClass) then
StateData^.sdClass := aCharClass;
{add the new state}
Result := FTable.Count;
FTable.Add(StateData);
end;
{--------}
procedure TO32RegexEngine.rcClear;
var
i : integer;
StateData : PO32NFAState;
begin
{free all items in the state transition table}
for i := 0 to pred(FTable.Count) do begin
StateData := PO32NFAState(FTable.List^[i]);
if (StateData <> nil) then begin
with StateData^ do begin
if (sdMatchType = mtClass) or
(sdMatchType = mtNegClass) then
if (sdClass <> nil) then
FreeMem(StateData^.sdClass);
sdNextList.Free;
end;
Dispose(StateData);
end;
end;
{clear the state transition table}
FTable.Clear;
FTable.Capacity := 64;
FAnchorStart := false;
FAnchorEnd := false;
end;
{--------}
procedure TO32RegexEngine.rcDumpTable;
var
i, j : integer;
begin
writeln(Log);
if (FTable.Count = 0) then
writeln(Log, 'No transition table to dump!')
else begin
writeln(Log, 'Transition table dump for "', FRegexStr, '"');
if FAnchorStart then
writeln(Log, 'anchored at start of string');
if FAnchorEnd then
writeln(Log, 'anchored at end of string');
writeln(Log, 'start state: ', FStartState:3);
for i := 0 to pred(FTable.Count) do begin
write(Log, i:3);
with PO32NFAState(FTable[i])^ do begin
case sdMatchType of
mtNone : write(Log, ' no match');
mtAnyChar : write(Log, ' any char');
mtChar : write(Log, ' char:', sdChar);
mtClass : write(Log, ' class');
mtNegClass: write(Log, ' neg class');
mtTerminal: write(Log, '*******END');
mtUnused : write(Log, ' --');
else
write(Log, ' **error**');
end;
if (sdNextList <> nil) then begin
write(Log, ' next:');
for j := 0 to pred(sdNextList.Count) do
write(Log, ' ', sdNextList[j]);
end
else begin
if (sdMatchType <> mtTerminal) and
(sdMatchType <> mtUnused) then begin
write(Log, ' next1: ', sdNextState1:3);
if (sdNextState2 <> UnusedState) then
write(Log, ' next2: ', sdNextState2:3);
end;
end;
end;
writeln(Log);
end;
end;
end;
{--------}
procedure TO32RegexEngine.rcLevel1Optimize;
var
i : integer;
Walker : PO32NFAState;
begin
{level 1 optimization removes all states that have only a single
no-cost move to another state}
{cycle through all the state records, except for the last one}
for i := 0 to (FTable.Count - 2) do begin
{get this state}
with PO32NFAState(FTable.List^[i])^ do begin
{walk the chain pointed to by the first next state, unlinking
the states that are simple single no-cost moves}
Walker := PO32NFAState(FTable.List^[sdNextState1]);
while (Walker^.sdMatchType = mtNone) and
(Walker^.sdNextState2 = UnusedState) do begin
sdNextState1 := Walker^.sdNextState1;
Walker := PO32NFAState(FTable.List^[sdNextState1]);
end;
{walk the chain pointed to by the first next state, unlinking
the states that are simple single no-cost moves}
if (sdNextState2 <> UnusedState) then begin
Walker := PO32NFAState(FTable.List^[sdNextState2]);
while (Walker^.sdMatchType = mtNone) and
(Walker^.sdNextState2 = UnusedState) do begin
sdNextState2 := Walker^.sdNextState1;
Walker := PO32NFAState(FTable.List^[sdNextState2]);
end;
end;
end;
end;
end;
{--------}
procedure TO32RegexEngine.rcLevel2Optimize;
var
i : integer;
begin
{level 2 optimization removes all no-cost moves, except for those
from the start state, if that is a no-cost move state}
{cycle through all the state records, except for the last one}
for i := 0 to (FTable.Count - 2) do begin
{get this state}
with PO32NFAState(FTable.List^[i])^ do begin
{if it's not a no-cost move state or it's the start state...}
if (sdMatchType <> mtNone) or (i = FStartState) then begin
{create the state list}
sdNextList := TO32IntList.Create;
{walk the chain pointed to by the first next state, adding
the non-no-cost states to the list}
rcWalkNoCostTree(sdNextList, sdNextState1);
{if this is the start state, and it's a no-cost move state
walk the chain pointed to by the second next state, adding
the non-no-cost states to the list}
if (sdMatchType = mtNone) then
rcWalkNoCostTree(sdNextList, sdNextState2);
end;
end;
end;
{cycle through all the state records, except for the last one,
marking unused ones--not strictly necessary but good for debugging}
for i := 0 to (FTable.Count - 2) do begin
if (i <> FStartState) then
with PO32NFAState(FTable.List^[i])^ do begin
if (sdMatchType = mtNone) then
sdMatchType := mtUnused;
end;
end;
end;
{--------}
function TO32RegexEngine.rcMatchSubString(const S : string;
StartPosn : integer)
: boolean;
var
i : integer;
Ch : char;
State : integer;
Deque : TO32IntDeque;
StrInx : integer;
begin
{assume we fail to match}
Result := false;
{create the deque}
Deque := TO32IntDeque.Create(64);
try
{enqueue the special value to start scanning}
Deque.Enqueue(MustScan);
{enqueue the first state}
Deque.Enqueue(FStartState);
{prepare the string index}
StrInx := StartPosn - 1;
Ch := #0; //just to fool the engine
{loop until the deque is empty or we run out of string}
while (StrInx <= length(S)) and not Deque.IsEmpty do begin
{pop the top state from the deque}
State := Deque.Pop;
{process the "must scan" state first}
if (State = MustScan) then begin
{if the deque is empty at this point, we might as well give up
since there are no states left to process new characters}
if not Deque.IsEmpty then begin
{if we haven't run out of string, get the character, and
enqueue the "must scan" state again}
inc(StrInx);
if (StrInx <= length(S)) then begin
if IgnoreCase then
Ch := Upcase(S[StrInx])
else
Ch := S[StrInx];
Deque.Enqueue(MustScan);
end;
end;
end
{otherwise, process the state}
else with PO32NFAState(FTable.List^[State])^ do begin
case sdMatchType of
mtNone :
begin
if (State <> FStartState) then
Assert(false, 'no-cost states shouldn''t be seen');
for i := 0 to pred(sdNextList.Count) do
Deque.Push(sdNextList[i]);
end;
mtAnyChar :
begin
{for a match of any character, enqueue the next states}
for i := 0 to pred(sdNextList.Count) do
Deque.Enqueue(sdNextList[i]);
end;
mtChar :
begin
{for a match of a character, enqueue the next states}
if (Ch = sdChar) then
for i := 0 to pred(sdNextList.Count) do
Deque.Enqueue(sdNextList[i]);
end;
mtClass :
begin
{for a match within a class, enqueue the next states}
if (Ch in sdClass^) then
for i := 0 to pred(sdNextList.Count) do
Deque.Enqueue(sdNextList[i]);
end;
mtNegClass :
begin
{for a match not within a class, enqueue the next states}
if not (Ch in sdClass^) then
for i := 0 to pred(sdNextList.Count) do
Deque.Enqueue(sdNextList[i]);
end;
mtTerminal :
begin
{for a terminal state, the string successfully matched
if the regex had no end anchor, or we're at the end
of the string}
if (not FAnchorEnd) or (StrInx > length(S)) then begin
Result := true;
Exit;
end;
end;
mtUnused :
begin
Assert(false, 'unused state ' + IntToStr(State)
+ ' shouldn''t be seen');
end;
end;
end;
end;
{if we reach this point we've either exhausted the deque or we've
run out of string; if the former, the substring did not match
since there are no more states. If the latter, we need to check
the states left on the deque to see if one is the terminating
state; if so the string matched the regular expression defined by
the transition table}
while not Deque.IsEmpty do begin
State := Deque.Pop;
with PO32NFAState(FTable.List^[State])^ do begin
case sdMatchType of
mtTerminal :
begin
{for a terminal state, the string successfully matched
if the regex had no end anchor, or we're at the end
of the string}
if (not FAnchorEnd) or (StrInx > length(S)) then begin
Result := true;
Exit;
end;
end;
end;{case}
end;
end;
finally
Deque.Free;
end;
end;
{--------}
function TO32RegexEngine.rcParseAnchorExpr : integer;
begin
{check for an initial '^'}
if (FPosn^ = '^') then begin
FAnchorStart := true;
inc(FPosn);
if FLogging then writeln(Log, 'parsed start anchor');
end;
{parse an expression}
Result := rcParseExpr;
{if we were successful, check for the final '$'}
if (Result <> ErrorState) then begin
if (FPosn^ = '$') then begin
FAnchorEnd := true;
inc(FPosn);
if FLogging then writeln(Log, 'parsed end anchor');
end;
end;
end;
{--------}
function TO32RegexEngine.rcParseAtom : integer;
var
MatchType : TO32NFAMatchType;
CharClass : PO32CharSet;
begin
case FPosn^ of
'(' :
begin
{move past the open parenthesis}
inc(FPosn);
if FLogging then writeln(Log, 'parsed open paren');
{parse a complete regex between the parentheses}
Result := rcParseExpr;
if (Result = ErrorState) then
Exit;
{if the current character is not a close parenthesis,
there's an error}
if (FPosn^ <> ')') then begin
FErrorCode := recNoCloseParen;
Result := ErrorState;
Exit;
end;
{move past the close parenthesis}
inc(FPosn);
if FLogging then writeln(Log, 'parsed close paren');
end;
'[' :
begin
{move past the open square bracket}
inc(FPosn);
if Logging then
writeln(Log, 'parsed open square bracket (start of class)');
{if the first character in the class is a '^' then the
class if negated, otherwise it's a normal one}
if (FPosn^ = '^') then begin
inc(FPosn);
MatchType := mtNegClass;
if FLogging then writeln(Log, 'it is a negated class');
end
else begin
MatchType := mtClass;
if FLogging then writeln(Log, 'it is a normal class');
end;
{allocate the class character set and parse the character
class; this will return either with an error, or when the
closing square bracket is encountered}
New(CharClass);
CharClass^ := [];
if not rcParseCharClass(CharClass) then begin
Dispose(CharClass);
Result := ErrorState;
Exit;
end;
{move past the closing square bracket}
Assert(FPosn^ = ']',
'the rcParseCharClass terminated without finding a "]"');
inc(FPosn);
if Logging then
writeln(Log, 'parsed close square bracket (end of class)');
{add a new state for the character class}
Result := rcAddState(MatchType, #0, CharClass,
NewFinalState, UnusedState);
end;
'.' :
begin
{move past the period metacharacter}
inc(FPosn);
if FLogging then writeln(Log, 'parsed anychar operator "."');
{add a new state for the 'any character' token}
Result := rcAddState(mtAnyChar, #0, nil,
NewFinalState, UnusedState);
end;
else
{otherwise parse a single character}
Result := rcParseChar;
end;{case}
end;
{--------}
function TO32RegexEngine.rcParseCCChar : char;
begin
{if we hit the end of the string, it's an error}
if (FPosn^ = #0) then begin
FErrorCode := recSuddenEnd;
Result := #0;
Exit;
end;
{if the current char is a metacharacter (at least in terms of a
character class), it's an error}
if FPosn^ in [']', '-'] then begin
FErrorCode := recMetaChar;
Result := #0;
Exit;
end;
{otherwise return the character and advance past it}
if (FPosn^ = '\') then
{..it's an escaped character: get the next character instead}
inc(FPosn);
Result := FPosn^;
inc(FPosn);
if FLogging then writeln(Log, 'parsed charclass char: "', Result, '"');
end;
{--------}
function TO32RegexEngine.rcParseChar : integer;
var
Ch : char;
begin
{if we hit the end of the string, it's an error}
if (FPosn^ = #0) then begin
Result := ErrorState;
FErrorCode := recSuddenEnd;
Exit;
end;
{if the current char is one of the metacharacters, it's an error}
if FPosn^ in MetaCharacters then begin
Result := ErrorState;
FErrorCode := recMetaChar;
Exit;
end;
{otherwise add a state for the character}
{..if it's an escaped character: get the next character instead}
if (FPosn^ = '\') then
inc(FPosn);
if IgnoreCase then
Ch := Upcase(FPosn^)
else
Ch := FPosn^;
Result := rcAddState(mtChar, Ch, nil, NewFinalState, UnusedState);
inc(FPosn);
if FLogging then writeln(Log, 'parsed char: "', Ch, '"');
end;
{--------}
function TO32RegexEngine.rcParseCharClass(aClass : PO32CharSet) : boolean;
begin
{assume we can't parse a character class properly}
Result := false;
{parse a character range; if we can't there was an error and the
caller will take care of it}
if not rcParseCharRange(aClass) then
Exit;
{if the current character was not the right bracket, parse another
character class (note: we're removing the tail recursion here)}
while (FPosn^ <> ']') do begin
if not rcParseCharRange(aClass) then
Exit;
end;
{if we reach here we were successful}
Result := true;
end;
{--------}
function TO32RegexEngine.rcParseCharRange(aClass : PO32CharSet) : boolean;
var
StartChar : char;
EndChar : char;
Ch : char;
begin
{assume we can't parse a character range properly}
Result := false;
{parse a single character; if it's null there was an error}
StartChar := rcParseCCChar;
if (StartChar = #0) then
Exit;
{if the current character is not a dash, the range consisted of a
single character}
if (FPosn^ <> '-') then begin
if IgnoreCase then
Include(aClass^, Upcase(StartChar))
else
Include(aClass^, StartChar)
end
{otherwise it's a real range, so get the character at the end of the
range; if that's null, there was an error}
else begin
if FLogging then writeln(Log, '-range to-');
inc(FPosn); {move past the '-'}
EndChar := rcParseCCChar;
if (EndChar = #0) then
Exit;
{build the range as a character set}
if (StartChar > EndChar) then begin
Ch := StartChar;
StartChar := EndChar;
EndChar := Ch;
end;
for Ch := StartChar to EndChar do begin
Include(aClass^, Ch);
if IgnoreCase then
Include(aClass^, Upcase(Ch));
end;
end;
{if we reach here we were successful}
Result := true;
end;
{--------}
function TO32RegexEngine.rcParseExpr : integer;
var
StartState1 : integer;
StartState2 : integer;
EndState1 : integer;
OverallStartState : integer;
begin
{assume the worst}
Result := ErrorState;
{parse an initial term}
StartState1 := rcParseTerm;
if (StartState1 = ErrorState) then
Exit;
{if the current character is *not* a pipe character, no alternation
is present so return the start state of the initial term as our
start state}
if (FPosn^ <> '|') then
Result := StartState1
{otherwise, we need to parse another expr and join the two together
in the transition table}
else begin
if FLogging then writeln(Log, 'OR (alternation)');
{advance past the pipe}
inc(FPosn);
{the initial term's end state does not exist yet (although there
is a state in the term that points to it), so create it}
EndState1 := rcAddState(mtNone, #0, nil, UnusedState, UnusedState);
{for the OR construction we need a new initial state: it will
point to the initial term and the second just-about-to-be-parsed
expr}
OverallStartState := rcAddState(mtNone, #0, nil,
UnusedState, UnusedState);
{parse another expr}
StartState2 := rcParseExpr;
if (StartState2 = ErrorState) then
Exit;
{alter the state state for the overall expr so that the second
link points to the start of the second expr}
Result := rcSetState(OverallStartState, StartState1, StartState2);
{now set the end state for the initial term to point to the final
end state for the second expr and the overall expr}
rcSetState(EndState1, FTable.Count, UnusedState);
end;
end;
{--------}
function TO32RegexEngine.rcParseFactor : integer;
var
StartStateAtom : integer;
EndStateAtom : integer;
begin
{assume the worst}
Result := ErrorState;
{first parse an atom}
StartStateAtom := rcParseAtom;
if (StartStateAtom = ErrorState) then
Exit;
{check for a closure operator}
case FPosn^ of
'?' : begin
if FLogging then writeln(Log, 'zero or one closure');
{move past the ? operator}
inc(FPosn);
{the atom's end state doesn't exist yet, so create one}
EndStateAtom := rcAddState(mtNone, #0, nil,
UnusedState, UnusedState);
{create a new start state for the overall regex}
Result := rcAddState(mtNone, #0, nil,
StartStateAtom, EndStateAtom);
{make sure the new end state points to the next unused
state}
rcSetState(EndStateAtom, FTable.Count, UnusedState);
end;
'*' : begin
if FLogging then writeln(Log, 'zero or more closure');
{move past the * operator}
inc(FPosn);
{the atom's end state doesn't exist yet, so create one;
it'll be the start of the overall regex subexpression}
Result := rcAddState(mtNone, #0, nil,
NewFinalState, StartStateAtom);
end;
'+' : begin
if FLogging then writeln(Log, 'one or more closure');
{move past the + operator}
inc(FPosn);
{the atom's end state doesn't exist yet, so create one}
rcAddState(mtNone, #0, nil, NewFinalState, StartStateAtom);
{the start of the overall regex subexpression will be the
atom's start state}
Result := StartStateAtom;
end;
else
Result := StartStateAtom;
end;{case}
end;
{--------}
function TO32RegexEngine.rcParseTerm : integer;
var
StartState2 : integer;
EndState1 : integer;
begin
{parse an initial factor, the state number returned will also be our
return state number}
Result := rcParseFactor;
if (Result = ErrorState) then
Exit;
{Note: we have to "break the grammar" here. We've parsed a regular
subexpression and we're possibly following on with another
regular subexpression. There's no nice operator to key off
for concatenation: we just have to know that for
concatenating two subexpressions, the current character will
be
- an open parenthesis
- an open square bracket
- an any char operator
- a character that's not a metacharacter
i.e., the three possibilities for the start of an "atom" in
our grammar}
if (FPosn^ = '(') or
(FPosn^ = '[') or
(FPosn^ = '.') or
((FPosn^ <> #0) and not (FPosn^ in MetaCharacters)) then begin
if FLogging then writeln(Log, 'concatenation');
{the initial factor's end state does not exist yet (although there
is a state in the term that points to it), so create it}
EndState1 := rcAddState(mtNone, #0, nil, UnusedState, UnusedState);
{parse another term}
StartState2 := rcParseTerm;
if (StartState2 = ErrorState) then begin
Result := ErrorState;
Exit;
end;
{join the first factor to the second term}
rcSetState(EndState1, StartState2, UnusedState);
end;
end;
{--------}
procedure TO32RegexEngine.rcSetIgnoreCase(aValue : boolean);
begin
if (aValue <> FIgnoreCase) then begin
rcClear;
FIgnoreCase := aValue;
end;
end;
{--------}
procedure TO32RegexEngine.rcSetRegexStr(const aRegexStr : string);
begin
if (aRegexStr <> FRegexStr) then begin
rcClear;
FRegexStr := aRegexStr;
end;
end;
{--------}
function TO32RegexEngine.rcSetState(aState : integer;
aNextState1: integer;
aNextState2: integer) : integer;
var
StateData : PO32NFAState;
begin
Assert((0 <= aState) and (aState < FTable.Count),
'trying to change an invalid state');
{get the state record and change the transition information}
StateData := PO32NFAState(FTable.List^[aState]);
StateData^.sdNextState1 := aNextState1;
StateData^.sdNextState2 := aNextState2;
Result := aState;
end;
{--------}
procedure TO32RegexEngine.rcSetUpcase(aValue : TO32UpcaseChar);
begin
if not Assigned(aValue) then
FUpcase := System.Upcase
else
FUpcase := aValue;
end;
{--------}
procedure TO32RegexEngine.rcWalkNoCostTree(aList : TO32IntList;
aState : integer);
begin
{look at this state's record...}
with PO32NFAState(FTable.List^[aState])^ do begin
{if it's a no-cost state, recursively walk the
first, then the second chain}
if (sdMatchType = mtNone) then begin
rcWalkNoCostTree(aList, sdNextState1);
rcWalkNoCostTree(aList, sdNextState2);
end
{otherwise, add it to the list}
else
aList.Add(aState);
end;
end;
{----}
end.