mirror of
https://gitlab.com/freepascal.org/lazarus/lazarus.git
synced 2025-04-05 13:38:08 +02:00
8548 lines
254 KiB
ObjectPascal
8548 lines
254 KiB
ObjectPascal
unit xregexpr;
|
||
|
||
(*
|
||
* THIS IS A
|
||
* MODIFIED VERSION
|
||
* OF TREGEXPR
|
||
*)
|
||
|
||
{
|
||
TRegExpr class library
|
||
Delphi Regular Expressions
|
||
|
||
Copyright (c) 1999-2004 Andrey V. Sorokin, St.Petersburg, Russia
|
||
|
||
You can choose to use this Pascal unit in one of the two following licenses:
|
||
|
||
Option 1>
|
||
|
||
You may use this software in any kind of development,
|
||
including comercial, redistribute, and modify it freely,
|
||
under the following restrictions :
|
||
1. This software is provided as it is, without any kind of
|
||
warranty given. Use it at Your own risk.The author is not
|
||
responsible for any consequences of use of this software.
|
||
2. The origin of this software may not be mispresented, You
|
||
must not claim that You wrote the original software. If
|
||
You use this software in any kind of product, it would be
|
||
appreciated that there in a information box, or in the
|
||
documentation would be an acknowledgement like
|
||
|
||
Partial Copyright (c) 2004 Andrey V. Sorokin
|
||
https://sorokin.engineer/
|
||
andrey@sorokin.engineer
|
||
|
||
3. You may not have any income from distributing this source
|
||
(or altered version of it) to other developers. When You
|
||
use this product in a comercial package, the source may
|
||
not be charged seperatly.
|
||
4. Altered versions must be plainly marked as such, and must
|
||
not be misrepresented as being the original software.
|
||
5. RegExp Studio application and all the visual components as
|
||
well as documentation is not part of the TRegExpr library
|
||
and is not free for usage.
|
||
|
||
https://sorokin.engineer/
|
||
andrey@sorokin.engineer
|
||
|
||
Option 2>
|
||
|
||
The same modified LGPL with static linking exception as the Free Pascal RTL
|
||
}
|
||
|
||
{
|
||
program is essentially a linear encoding
|
||
of a nondeterministic finite-state machine (aka syntax charts or
|
||
"railroad normal form" in parsing technology). Each node is an opcode
|
||
plus a "next" pointer, possibly plus an operand. "Next" pointers of
|
||
all nodes except BRANCH implement concatenation; a "next" pointer with
|
||
a BRANCH on both ends of it connects two alternatives. (Here we
|
||
have one of the subtle syntax dependencies: an individual BRANCH (as
|
||
opposed to a collection of them) is never concatenated with anything
|
||
because of operator precedence.) The operand of some types of node is
|
||
a literal string; for others, it is a node leading into a sub-FSM. In
|
||
particular, the operand of a BRANCH node is the first node of the branch.
|
||
(NB this is *not* a tree structure: the tail of the branch connects
|
||
to the thing following the set of BRANCHes.)
|
||
}
|
||
|
||
interface
|
||
|
||
{ off $DEFINE DebugSynRegExpr }
|
||
// ======== Determine compiler
|
||
{$I regexpr_compilers.inc}
|
||
// ======== Define base compiler options
|
||
{$BOOLEVAL OFF}
|
||
{$EXTENDEDSYNTAX ON}
|
||
{$LONGSTRINGS ON}
|
||
{$IFDEF FPC}
|
||
{$MODE DELPHI} // Delphi-compatible mode in FreePascal
|
||
{$INLINE ON}
|
||
{$ENDIF}
|
||
// ======== Define options for TRegExpr engine
|
||
{disable $DEFINE UnicodeRE} // Use WideChar for characters and UnicodeString/WideString for strings
|
||
{ off $DEFINE UnicodeEx} // Support Unicode >0xFFFF, e.g. emoji, e.g. "." must find 2 WideChars of 1 emoji
|
||
{ off $DEFINE UseWordChars} // Use WordChars property, otherwise fixed list 'a'..'z','A'..'Z','0'..'9','_'
|
||
{ off $DEFINE UseSpaceChars} // Use SpaceChars property, otherwise fixed list
|
||
{ off $DEFINE UseLineSep} // Use LineSeparators property, otherwise fixed line-break chars
|
||
{$IFDEF UNICODE}
|
||
{$IFNDEF UnicodeRE}
|
||
{$MESSAGE ERROR 'You cannot undefine UnicodeRE for Unicode Delphi versions'}
|
||
{$ENDIF}
|
||
{$ENDIF}
|
||
{$IFDEF FPC}
|
||
{$DEFINE FastUnicodeData} // Use arrays for UpperCase/LowerCase/IsWordChar, they take 320K more memory
|
||
{$ENDIF}
|
||
{$DEFINE RegExpWithStackOverflowCheck} // Check the recursion depth and abort matching before stack overflows (available only for some OS/CPU)
|
||
{$DEFINE UseFirstCharSet} // Enable optimization, which finds possible first chars of input string
|
||
{$DEFINE RegExpPCodeDump} // Enable method Dump() to show opcode as string
|
||
{$IFNDEF FPC} // Not supported in FreePascal
|
||
{$DEFINE reRealExceptionAddr} // Exceptions will point to appropriate source line, not to Error procedure
|
||
{$ENDIF}
|
||
{$DEFINE ComplexBraces} // Support braces in complex cases
|
||
{$IFNDEF UnicodeRE}
|
||
{$UNDEF UnicodeEx}
|
||
{$UNDEF FastUnicodeData}
|
||
{$ENDIF}
|
||
{.$DEFINE Compat} // Enable compatability methods/properties for forked version in Free Pascal 3.0
|
||
// ======== Define Pascal-language options
|
||
// Asserts used to catch 'strange bugs' in TRegExpr implementation (when something goes
|
||
// completely wrong). You can swith asserts on/off with help of {$C+}/{$C-} compiler options.
|
||
{$IFDEF D3} { $DEFINE WITH_REGEX_ASSERT} {$ENDIF}
|
||
{$IFDEF FPC}{$IFOPT C+} {$DEFINE WITH_REGEX_ASSERT} {$ENDIF}{$ENDIF} // Only if compile with -Sa
|
||
// Define 'use subroutine parameters default values' option (do not edit this definition).
|
||
{$IFDEF D4} {$DEFINE DefParam} {$ENDIF}
|
||
{$IFDEF FPC} {$DEFINE DefParam} {$ENDIF}
|
||
// Define 'OverMeth' options, to use method overloading (do not edit this definitions).
|
||
{$IFDEF D5} {$DEFINE OverMeth} {$ENDIF}
|
||
{$IFDEF FPC} {$DEFINE OverMeth} {$ENDIF}
|
||
// Define 'InlineFuncs' options, to use inline keyword (do not edit this definitions).
|
||
{$IFDEF D8} {$DEFINE InlineFuncs} {$ENDIF}
|
||
{$IFDEF FPC} {$DEFINE InlineFuncs} {$ENDIF}
|
||
{$PointerMath on}
|
||
|
||
{$IFDEF RegExpWithStackOverflowCheck} // Define the stack checking algorithm for the current platform/CPU
|
||
{$IF defined(Linux) or defined(Windows)}{$IF defined(CPU386) or defined(CPUX86_64)}
|
||
{$DEFINE RegExpWithStackOverflowCheck_DecStack_Frame} // Stack-pointer decrements // use getframe over Sptr()
|
||
{$ENDIF}{$ENDIF}
|
||
{$ENDIF}
|
||
uses
|
||
SysUtils, // Exception
|
||
{$IFDEF D2009}
|
||
{$IFDEF D_XE2}
|
||
System.Character,
|
||
{$ELSE}
|
||
Character,
|
||
{$ENDIF}
|
||
{$ENDIF}
|
||
Classes; // TStrings in Split method
|
||
|
||
type
|
||
{$IFNDEF FPC}
|
||
// Delphi doesn't have PtrInt but has NativeInt
|
||
// but unfortunately NativeInt is declared wrongly in several versions
|
||
{$IF SizeOf(Pointer)=4}
|
||
PtrInt = Integer;
|
||
PtrUInt = Cardinal;
|
||
{$ELSE}
|
||
PtrInt = Int64;
|
||
PtrUInt = UInt64;
|
||
{$IFEND}
|
||
{$ENDIF}
|
||
|
||
{$IFDEF UnicodeRE}
|
||
PRegExprChar = PWideChar;
|
||
{$IFDEF FPC}
|
||
RegExprString = UnicodeString;
|
||
{$ELSE}
|
||
{$IFDEF D2009}
|
||
RegExprString = UnicodeString;
|
||
{$ELSE}
|
||
RegExprString = WideString;
|
||
{$ENDIF}
|
||
{$ENDIF}
|
||
REChar = WideChar;
|
||
{$ELSE}
|
||
PRegExprChar = PAnsiChar;
|
||
RegExprString = AnsiString;
|
||
REChar = AnsiChar;
|
||
{$ENDIF}
|
||
TREOp = REChar; // internal opcode type
|
||
PREOp = ^TREOp;
|
||
|
||
type
|
||
TRegExprCharset = set of Byte;
|
||
|
||
const
|
||
// Escape char ('\' in common r.e.) used for escaping metachars (\w, \d etc)
|
||
EscChar = '\';
|
||
|
||
// Substitute method: prefix of group reference: $1 .. $9 and $<name>
|
||
SubstituteGroupChar = '$';
|
||
|
||
RegExprModifierI: Boolean = False; // default value for ModifierI
|
||
RegExprModifierR: Boolean = True; // default value for ModifierR
|
||
RegExprModifierS: Boolean = True; // default value for ModifierS
|
||
RegExprModifierG: Boolean = True; // default value for ModifierG
|
||
RegExprModifierM: Boolean = False; // default value for ModifierM
|
||
RegExprModifierX: Boolean = False; // default value for ModifierX
|
||
|
||
{$IFDEF UseSpaceChars}
|
||
// default value for SpaceChars
|
||
RegExprSpaceChars: RegExprString = ' '#$9#$A#$D#$C;
|
||
{$ENDIF}
|
||
|
||
{$IFDEF UseWordChars}
|
||
// default value for WordChars
|
||
RegExprWordChars: RegExprString = '0123456789'
|
||
+ 'abcdefghijklmnopqrstuvwxyz'
|
||
+ 'ABCDEFGHIJKLMNOPQRSTUVWXYZ_';
|
||
{$ENDIF}
|
||
|
||
{$IFDEF UseLineSep}
|
||
// default value for LineSeparators
|
||
RegExprLineSeparators: RegExprString = #$d#$a#$b#$c
|
||
{$IFDEF UnicodeRE}
|
||
+ #$2028#$2029#$85
|
||
{$ENDIF};
|
||
{$ENDIF}
|
||
|
||
// Tab and Unicode category "Space Separator":
|
||
// https://www.compart.com/en/unicode/category/Zs
|
||
RegExprHorzSeparators: RegExprString = #9#$20#$A0
|
||
{$IFDEF UnicodeRE}
|
||
+ #$1680#$2000#$2001#$2002#$2003#$2004#$2005#$2006#$2007#$2008#$2009#$200A#$202F#$205F#$3000
|
||
{$ENDIF};
|
||
|
||
RegExprUsePairedBreak: Boolean = True;
|
||
RegExprReplaceLineBreak: RegExprString = sLineBreak;
|
||
|
||
const
|
||
// Increment/keep-capacity for the size of arrays holding 'Group' related data
|
||
// e.g., GrpBounds, GrpOpCodes and GrpNames
|
||
RegexGroupCountIncrement = 50;
|
||
|
||
// Max possible amount of groups.
|
||
// Don't change it! It's defined by internal TRegExpr design.
|
||
RegexMaxMaxGroups = MaxInt div 16;
|
||
|
||
// Max depth of recursion for (?R) and (?1)..(?9)
|
||
RegexMaxRecursion = 20;
|
||
|
||
type
|
||
TRegExprModifiers = record
|
||
I: Boolean;
|
||
// Case-insensitive.
|
||
R: Boolean;
|
||
// Extended syntax for Russian ranges in [].
|
||
// If True, then а-я additionally includes letter 'ё',
|
||
// А-Я additionally includes 'Ё', and а-Я includes all Russian letters.
|
||
// Turn it off if it interferes with your national alphabet.
|
||
S: Boolean;
|
||
// Dot '.' matches any char, otherwise only [^\n].
|
||
G: Boolean;
|
||
// Greedy. Switching it off switches all operators to non-greedy style,
|
||
// so if G=False, then '*' works like '*?', '+' works like '+?' and so on.
|
||
M: Boolean;
|
||
// Treat string as multiple lines. It changes `^' and `$' from
|
||
// matching at only the very start/end of the string to the start/end
|
||
// of any line anywhere within the string.
|
||
X: Boolean;
|
||
// Allow comments in regex using # char.
|
||
end;
|
||
|
||
function IsModifiersEqual(const A, B: TRegExprModifiers): Boolean;
|
||
|
||
type
|
||
TRegExpr = class;
|
||
TRegExprReplaceFunction = function(ARegExpr: TRegExpr): RegExprString of object;
|
||
TRegExprCharChecker = function(ch: REChar): Boolean of object;
|
||
TRegExprCharCheckerArray = array[0 .. 30] of TRegExprCharChecker;
|
||
TRegExprCharCheckerInfo = record
|
||
CharBegin, CharEnd: REChar;
|
||
CheckerIndex: Integer;
|
||
end;
|
||
TRegExprCharCheckerInfos = array of TRegExprCharCheckerInfo;
|
||
|
||
TRegExprAnchor = (
|
||
raNone, // Not anchored
|
||
raBOL, // Must start at BOL
|
||
raEOL, // Must start at EOL (maybe look behind)
|
||
raContinue, // Must start at continue pos \G
|
||
raOnlyOnce // Starts with .* must match from the start pos only. Must not be tried from a later pos
|
||
);
|
||
|
||
TRegExprFindFixedLengthFlag = (
|
||
flfReturnAtNextNil,
|
||
flfSkipLookAround
|
||
);
|
||
TRegExprFindFixedLengthFlags = set of TRegExprFindFixedLengthFlag;
|
||
|
||
{$IFDEF Compat}
|
||
TRegExprInvertCaseFunction = function(const Ch: REChar): REChar of object;
|
||
{$ENDIF}
|
||
|
||
{$IFDEF ComplexBraces}
|
||
POpLoopInfo = ^TOpLoopInfo;
|
||
TOpLoopInfo = record
|
||
Count: Integer;
|
||
CurrentRegInput: PRegExprChar;
|
||
BackTrackingAsAtom: Boolean;
|
||
OuterLoop: POpLoopInfo; // for nested loops
|
||
end;
|
||
{$ENDIF}
|
||
|
||
|
||
PPRegExprChar = ^PRegExprChar;
|
||
TRegExprBoundsPtr = record
|
||
TmpStart: PPRegExprChar; // pointer start of not yet finished group start in InputString
|
||
// OP_CLOSE not yet reached
|
||
// does not need to be cleared
|
||
GrpStart: PPRegExprChar; // pointer to group start in InputString
|
||
GrpEnd: PPRegExprChar; // pointer to group end in InputString
|
||
end;
|
||
TRegExprBounds = record
|
||
TmpStart: array of PRegExprChar; // pointer start of not yet finished group start in InputString
|
||
// OP_CLOSE not yet reached
|
||
// does not need to be cleared
|
||
GrpStart: array of PRegExprChar; // pointer to group start in InputString
|
||
GrpEnd: array of PRegExprChar; // pointer to group end in InputString
|
||
end;
|
||
TRegExprBoundsArray = array[0 .. RegexMaxRecursion] of TRegExprBounds;
|
||
|
||
PRegExprLookAroundInfo = ^TRegExprLookAroundInfo;
|
||
TRegExprLookAroundInfo = record
|
||
InputPos: PRegExprChar; // pointer to start of look-around in the input string
|
||
savedInputCurrentEnd: PRegExprChar; // pointer to start of look-around in the input string
|
||
IsNegative, HasMatchedToEnd: Boolean;
|
||
IsBackTracking: Boolean;
|
||
OuterInfo: PRegExprLookAroundInfo; // for nested lookaround
|
||
end;
|
||
|
||
TRegExprGroupName = record
|
||
Name: RegExprString;
|
||
Index: Integer;
|
||
end;
|
||
|
||
{ TRegExprGroupNameList }
|
||
|
||
TRegExprGroupNameList = object
|
||
Names: array of TRegExprGroupName;
|
||
NameCount: Integer;
|
||
// get index of group (subexpression) by name, to support named groups
|
||
// like in Python: (?P<name>regex)
|
||
function MatchIndexFromName(const AName: RegExprString): Integer;
|
||
procedure Clear;
|
||
procedure Add(const AName: RegExprString; AnIndex: Integer);
|
||
end;
|
||
|
||
{ TRegExpr }
|
||
|
||
TRegExpr = class
|
||
private
|
||
FAllowBraceWithoutMin: Boolean;
|
||
FAllowUnsafeLookBehind: Boolean;
|
||
FAllowLiteralBraceWithoutRange: Boolean;
|
||
FMatchesCleared: Boolean;
|
||
fRaiseForRuntimeError: Boolean;
|
||
GrpBounds: TRegExprBoundsArray;
|
||
CurrentGrpBounds: TRegExprBoundsPtr;
|
||
GrpNames: TRegExprGroupNameList; // names of groups, if non-empty
|
||
GrpBacktrackingAsAtom: array of Boolean; // close of group[i] has set IsBacktrackingGroupAsAtom
|
||
IsBacktrackingGroupAsAtom: Boolean; // Backtracking an entire atomic group that had matched.
|
||
// Once the group matched it should not try any alternative matches within the group
|
||
// If the pattern after the group fails, then the group fails (regardless of any alternative match in the group)
|
||
|
||
GrpOpCodes: array of PRegExprChar; // pointer to opcode of group[i] (used by OP_SUBCALL*)
|
||
GrpCount, ParsedGrpCount: Integer;
|
||
|
||
{$IFDEF ComplexBraces}
|
||
CurrentLoopInfoListPtr: POpLoopInfo;
|
||
{$ENDIF}
|
||
|
||
// The "internal use only" fields to pass info from compile
|
||
// to execute that permits the execute phase to run lots faster on
|
||
// simple cases.
|
||
|
||
regAnchored: TRegExprAnchor; // is the match anchored (at beginning-of-line only)?
|
||
// regAnchored permits very fast decisions on suitable starting points
|
||
// for a match, cutting down the work a lot. regMust permits fast rejection
|
||
// of lines that cannot possibly match. The regMust tests are costly enough
|
||
// that regcomp() supplies a regMust only if the r.e. contains something
|
||
// potentially expensive (at present, the only such thing detected is * or +
|
||
// at the start of the r.e., which can involve a lot of backup). regMustLen is
|
||
// supplied because the test in regexec() needs it and regcomp() is computing
|
||
// it anyway.
|
||
|
||
regMust: PRegExprChar; // string (pointer into program) that match must include, or nil
|
||
regMustLen: Integer; // length of regMust string
|
||
regMustString: RegExprString; // string which must occur in match (got from regMust/regMustLen)
|
||
LookAroundInfoList: PRegExprLookAroundInfo;
|
||
//regNestedCalls: integer; // some attempt to prevent 'catastrophic backtracking' but not used
|
||
CurrentSubCalled: Integer;
|
||
|
||
FMinMatchLen: integer;
|
||
{$IFDEF UseFirstCharSet}
|
||
FirstCharSet: TRegExprCharset;
|
||
FirstCharArray: array[Byte] of Boolean;
|
||
{$ENDIF}
|
||
|
||
// work variables for Exec routines - save stack in recursion
|
||
regInput: PRegExprChar; // pointer to currently handling char of input string
|
||
fInputStart: PRegExprChar; // pointer to first char of input string
|
||
fInputContinue: PRegExprChar; // pointer to char specified with Exec(AOffset), or start pos of ExecNext
|
||
fInputEnd: PRegExprChar; // pointer after last char of input string
|
||
fInputCurrentEnd: PRegExprChar; // pointer after last char of the current visible part of input string (can be limited by look-behind)
|
||
fRegexStart: PRegExprChar; // pointer to first char of regex
|
||
fRegexEnd: PRegExprChar; // pointer after last char of regex
|
||
regRecursion: Integer; // current level of recursion (?R) (?1); always 0 if no recursion is used
|
||
hasRecursion: Boolean;
|
||
|
||
// work variables for compiler's routines
|
||
regParse: PRegExprChar; // pointer to currently handling char of regex
|
||
regNumBrackets: Integer; // count of () brackets
|
||
regNumAtomicBrackets: Integer; // count of (?>) brackets
|
||
regDummy: array [0..8 div SizeOf(REChar)] of REChar; // dummy pointer, used to detect 1st/2nd pass of Compile
|
||
// if p=@regDummy, it is pass-1: opcode memory is not yet allocated
|
||
programm: PRegExprChar; // pointer to opcode, =nil in pass-1
|
||
regCode: PRegExprChar; // pointer to last emitted opcode; changing in pass-2, but =@regDummy in pass-1
|
||
regCodeSize: Integer; // total opcode size in REChars
|
||
regCodeWork: PRegExprChar; // pointer to opcode, to first code after MAGIC
|
||
regExactlyLen: PLongInt; // pointer to length of substring of OP_EXACTLY* inside opcode
|
||
fSecondPass: Boolean; // true inside pass-2 of Compile
|
||
|
||
fExpression: RegExprString; // regex string
|
||
fInputString: RegExprString; // input string
|
||
fLastError: Integer; // Error call sets code of LastError
|
||
fLastErrorOpcode: TREOp;
|
||
fLastErrorSymbol: REChar;
|
||
|
||
fModifiers: TRegExprModifiers; // regex modifiers
|
||
fCompModifiers: TRegExprModifiers; // compiler's copy of modifiers
|
||
fProgModifiers: TRegExprModifiers; // modifiers values from last programm compilation
|
||
|
||
{$IFDEF UseSpaceChars}
|
||
fSpaceChars: RegExprString;
|
||
{$ENDIF}
|
||
{$IFDEF UseWordChars}
|
||
fWordChars: RegExprString;
|
||
{$ENDIF}
|
||
|
||
{$IFDEF UseLineSep}
|
||
fLineSeparators: RegExprString;
|
||
{$ENDIF}
|
||
|
||
fUsePairedBreak: Boolean;
|
||
fReplaceLineEnd: RegExprString; // string to use for "\n" in Substitute method
|
||
|
||
fSlowChecksSizeMax: Integer;
|
||
// Exec() param ASlowChecks is set to True, when Length(InputString)<SlowChecksSizeMax
|
||
// This ASlowChecks enables to use regMustString optimization
|
||
|
||
{$IFDEF UseLineSep}
|
||
{$IFNDEF UnicodeRE}
|
||
fLineSepArray: array[Byte] of Boolean;
|
||
{$ENDIF}
|
||
{$ENDIF}
|
||
|
||
CharCheckers: TRegExprCharCheckerArray;
|
||
CharCheckerInfos: TRegExprCharCheckerInfos;
|
||
CheckerIndex_Word: Byte;
|
||
CheckerIndex_NotWord: Byte;
|
||
CheckerIndex_Digit: Byte;
|
||
CheckerIndex_NotDigit: Byte;
|
||
CheckerIndex_Space: Byte;
|
||
CheckerIndex_NotSpace: Byte;
|
||
CheckerIndex_HorzSep: Byte;
|
||
CheckerIndex_NotHorzSep: Byte;
|
||
CheckerIndex_VertSep: Byte;
|
||
CheckerIndex_NotVertSep: Byte;
|
||
CheckerIndex_LowerAZ: Byte;
|
||
CheckerIndex_UpperAZ: Byte;
|
||
CheckerIndex_AnyLineBreak: Byte;
|
||
{$IFDEF RegExpWithStackOverflowCheck_DecStack_Frame}
|
||
StackLimit: Pointer;
|
||
{$ENDIF}
|
||
|
||
{$IFDEF Compat}
|
||
fUseUnicodeWordDetection: Boolean;
|
||
fInvertCase: TRegExprInvertCaseFunction;
|
||
fEmptyInputRaisesError: Boolean;
|
||
fUseOsLineEndOnReplace: Boolean;
|
||
function OldInvertCase(const Ch: REChar): REChar;
|
||
function GetLinePairedSeparator: RegExprString;
|
||
procedure SetLinePairedSeparator(const AValue: RegExprString);
|
||
procedure SetUseOsLineEndOnReplace(AValue: Boolean);
|
||
{$ENDIF}
|
||
|
||
procedure InitCharCheckers;
|
||
function CharChecker_Word(ch: REChar): Boolean;
|
||
function CharChecker_NotWord(ch: REChar): Boolean;
|
||
function CharChecker_Space(ch: REChar): Boolean;
|
||
function CharChecker_NotSpace(ch: REChar): Boolean;
|
||
function CharChecker_Digit(ch: REChar): Boolean;
|
||
function CharChecker_NotDigit(ch: REChar): Boolean;
|
||
function CharChecker_HorzSep(ch: REChar): Boolean;
|
||
function CharChecker_NotHorzSep(ch: REChar): Boolean;
|
||
function CharChecker_VertSep(ch: REChar): Boolean;
|
||
function CharChecker_NotVertSep(ch: REChar): Boolean;
|
||
function CharChecker_AnyLineBreak(ch: REChar): Boolean;
|
||
function CharChecker_LowerAZ(ch: REChar): Boolean;
|
||
function CharChecker_UpperAZ(ch: REChar): Boolean;
|
||
function DumpCheckerIndex(N: Byte): RegExprString;
|
||
function DumpCategoryChars(ch, ch2: REChar; Positive: Boolean): RegExprString;
|
||
|
||
procedure ClearMatches; {$IFDEF InlineFuncs}inline;{$ENDIF}
|
||
procedure ClearInternalExecData; {$IFDEF InlineFuncs}inline;{$ENDIF}
|
||
procedure InitInternalGroupData;
|
||
function FindInCharClass(ABuffer: PRegExprChar; AChar: REChar): Boolean;
|
||
procedure GetCharSetFromCharClass(ABuffer: PRegExprChar; AIgnoreCase: Boolean; var ARes: TRegExprCharset);
|
||
procedure GetCharSetFromSpaceChars(var ARes: TRegExprCharset); {$IFDEF InlineFuncs}inline;{$ENDIF}
|
||
procedure GetCharSetFromWordChars(var ARes: TRegExprCharSet); {$IFDEF InlineFuncs}inline;{$ENDIF}
|
||
function IsWordChar(AChar: REChar): Boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
|
||
function IsSpaceChar(AChar: REChar): Boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
|
||
function IsCustomLineSeparator(AChar: REChar): Boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
|
||
{$IFDEF UseLineSep}
|
||
procedure InitLineSepArray;
|
||
{$ENDIF}
|
||
procedure FindGroupName(APtr, AEndPtr: PRegExprChar; AEndChar: REChar; var AName: RegExprString);
|
||
|
||
// Mark programm as having to be [re]compiled
|
||
procedure InvalidateProgramm;
|
||
|
||
// Check if we can use compiled regex, compile it if something changed
|
||
function IsProgrammOk: Boolean;
|
||
|
||
procedure SetExpression(const AStr: RegExprString);
|
||
|
||
function GetModifierStr: RegExprString;
|
||
procedure SetModifierStr(const AStr: RegExprString);
|
||
function GetModifierG: Boolean;
|
||
function GetModifierI: Boolean;
|
||
function GetModifierM: Boolean;
|
||
function GetModifierR: Boolean;
|
||
function GetModifierS: Boolean;
|
||
function GetModifierX: Boolean;
|
||
procedure SetModifierG(AValue: Boolean);
|
||
procedure SetModifierI(AValue: Boolean);
|
||
procedure SetModifierM(AValue: Boolean);
|
||
procedure SetModifierR(AValue: Boolean);
|
||
procedure SetModifierS(AValue: Boolean);
|
||
procedure SetModifierX(AValue: Boolean);
|
||
|
||
{ ==================== Compiler section =================== }
|
||
// compile a regular expression into internal code
|
||
function CompileRegExpr(ARegExp: PRegExprChar): Boolean;
|
||
|
||
// set the next-pointer at the end of a node chain
|
||
procedure Tail(p: PRegExprChar; val: PRegExprChar);
|
||
|
||
// regoptail - regtail on operand of first argument; nop if operandless
|
||
procedure OpTail(p: PRegExprChar; val: PRegExprChar);
|
||
|
||
// regnode - emit a node, return location
|
||
function EmitNode(op: TREOp): PRegExprChar;
|
||
|
||
// emit OP_BRANCH (and fillchars)
|
||
function EmitBranch: PRegExprChar; {$IFDEF FPC}inline;{$ENDIF}
|
||
|
||
// emit (if appropriate) a byte of code
|
||
procedure EmitC(ch: REChar); {$IFDEF InlineFuncs}inline;{$ENDIF}
|
||
|
||
// emit LongInt value
|
||
procedure EmitInt(AValue: LongInt); {$IFDEF InlineFuncs}inline;{$ENDIF}
|
||
|
||
// for groups
|
||
function EmitNodeWithGroupIndex(op: TREOp; AIndex: Integer): PRegExprChar;
|
||
|
||
// emit back-reference to group
|
||
function EmitGroupRef(AIndex: Integer; AIgnoreCase: Boolean): PRegExprChar;
|
||
|
||
{$IFDEF FastUnicodeData}
|
||
procedure FindCategoryName(var scan: PRegExprChar; var ch1, ch2: REChar);
|
||
function EmitCategoryMain(APositive: Boolean): PRegExprChar;
|
||
{$ENDIF}
|
||
|
||
// insert an operator in front of already-emitted operand
|
||
// Means relocating the operand.
|
||
procedure InsertOperator(op: TREOp; opnd: PRegExprChar; sz: Integer);
|
||
procedure RemoveOperator(opnd: PRegExprChar; sz: Integer);
|
||
|
||
// regular expression, i.e. main body or parenthesized thing
|
||
function ParseReg(var FlagParse: Integer): PRegExprChar;
|
||
function DoParseReg(InBrackets: Boolean; BracketCounter: PInteger; var FlagParse: Integer; BeginGroupOp, EndGroupOP: TReOp): PRegExprChar;
|
||
|
||
// one alternative of an | operator
|
||
function ParseBranch(var FlagParse: Integer): PRegExprChar;
|
||
|
||
procedure MaybeGuardBranchPiece(piece: PRegExprChar);
|
||
|
||
// something followed by possible [*+?]
|
||
function ParsePiece(var FlagParse: Integer): PRegExprChar;
|
||
|
||
function HexDig(Ch: REChar): Integer;
|
||
|
||
function UnQuoteChar(var APtr, AEnd: PRegExprChar): REChar;
|
||
|
||
// the lowest level
|
||
function ParseAtom(var FlagParse: Integer): PRegExprChar;
|
||
|
||
// current pos in r.e. - for error hanling
|
||
function GetCompilerErrorPos: PtrInt;
|
||
|
||
{$IFDEF UseFirstCharSet}
|
||
procedure FillFirstCharSet(prog: PRegExprChar);
|
||
{$ENDIF}
|
||
|
||
function IsPartFixedLength(var prog: PRegExprChar; var op: TREOp; var AMinLen, AMaxLen: integer; StopAt: TREOp; StopMaxProg: PRegExprChar; Flags: TRegExprFindFixedLengthFlags): boolean;
|
||
|
||
{ ===================== Matching section =================== }
|
||
// repeatedly match something simple, report how many
|
||
function FindRepeated(p: PRegExprChar; AMax: Integer): Integer;
|
||
|
||
// dig the "next" pointer out of a node
|
||
function regNext(p: PRegExprChar): PRegExprChar;
|
||
function regNextQuick(p: PRegExprChar): PRegExprChar; {$IFDEF FPC}inline;{$ENDIF}
|
||
|
||
// dig the "last" pointer out of a chain of node
|
||
function regLast(p: PRegExprChar): PRegExprChar;
|
||
|
||
// recursively matching routine
|
||
function MatchPrim(prog: PRegExprChar): Boolean;
|
||
|
||
// match at specific position only, called from ExecPrim
|
||
function MatchAtOnePos(APos: PRegExprChar): Boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
|
||
|
||
// Exec for stored InputString
|
||
function ExecPrim(AOffset: Integer; ASlowChecks, ABackward: Boolean; ATryMatchOnlyStartingBefore: Integer): Boolean;
|
||
function ExecPrimProtected(AOffset: Integer; ASlowChecks, ABackward: Boolean; ATryMatchOnlyStartingBefore: Integer): Boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
|
||
|
||
function GetSubExprCount: Integer;
|
||
function GetMatchPos(Idx: Integer): PtrInt;
|
||
function GetMatchLen(Idx: Integer): PtrInt;
|
||
function GetMatch(Idx: Integer): RegExprString;
|
||
|
||
procedure SetInputString(const AInputString: RegExprString);
|
||
procedure SetInputRange(AStart, AEnd, AContinueAnchor: PRegExprChar);
|
||
|
||
{$IFDEF UseLineSep}
|
||
procedure SetLineSeparators(const AStr: RegExprString);
|
||
{$ENDIF}
|
||
procedure SetUsePairedBreak(AValue: Boolean);
|
||
|
||
protected
|
||
// Default handler raises exception ERegExpr with
|
||
// Message = ErrorMsg (AErrorID), ErrorCode = AErrorID
|
||
// and CompilerErrorPos = value of property CompilerErrorPos.
|
||
procedure Error(AErrorID: Integer); virtual; // error handler.
|
||
|
||
public
|
||
constructor Create; {$IFDEF OverMeth} overload;
|
||
constructor Create(const AExpression: RegExprString); overload;
|
||
{$ENDIF}
|
||
destructor Destroy; override;
|
||
|
||
class function VersionMajor: Integer;
|
||
class function VersionMinor: Integer;
|
||
|
||
// match a programm against a string AInputString
|
||
// Exec stores AInputString into InputString property
|
||
// For Delphi 5 and higher overloaded versions are available: first without
|
||
// parameter (uses already assigned InputString property value)
|
||
// and second has int parameter, same as for ExecPos
|
||
function Exec(const AInputString: RegExprString): Boolean;
|
||
{$IFDEF OverMeth}overload;{$endif} {$IFDEF InlineFuncs}inline;{$ENDIF}
|
||
|
||
{$IFDEF OverMeth}
|
||
function Exec: Boolean; overload; {$IFDEF InlineFuncs}inline;{$ENDIF}
|
||
function Exec(AOffset: Integer): Boolean; overload; {$IFDEF InlineFuncs}inline;{$ENDIF}
|
||
{$ENDIF}
|
||
|
||
// find next match:
|
||
// ExecNext;
|
||
// works the same as
|
||
// if MatchLen [0] = 0 then ExecPos (MatchPos [0] + 1)
|
||
// else ExecPos (MatchPos [0] + MatchLen [0]);
|
||
// but it's more simpler !
|
||
// Raises exception if used without preceeding SUCCESSFUL call to
|
||
// Exec* (Exec, ExecPos, ExecNext). So You always must use something like
|
||
// if Exec (InputString) then repeat { proceed results} until not ExecNext;
|
||
function ExecNext(ABackward: Boolean {$IFDEF DefParam} = False{$ENDIF}): Boolean;
|
||
|
||
// find match for InputString starting from AOffset position
|
||
// (AOffset=1 - first char of InputString)
|
||
function ExecPos(AOffset: Integer {$IFDEF DefParam} = 1{$ENDIF}): Boolean;
|
||
{$IFDEF OverMeth}overload;{$endif} {$IFDEF InlineFuncs}inline;{$ENDIF}
|
||
|
||
{$IFDEF OverMeth}
|
||
// find match for InputString at AOffset.
|
||
// if ATryOnce=True then only match exactly at AOffset (like anchor \G)
|
||
// if ATryMatchOnlyStartingBefore then only when the match can start before
|
||
// that position: Result := MatchPos[0] < ATryMatchOnlyStartingBefore;
|
||
function ExecPos(AOffset: Integer; ATryOnce, ABackward: Boolean): Boolean; overload; {$IFDEF InlineFuncs}inline;{$ENDIF}
|
||
function ExecPos(AOffset, ATryMatchOnlyStartingBefore: Integer): Boolean; overload; {$IFDEF InlineFuncs}inline;{$ENDIF}
|
||
{$ENDIF}
|
||
|
||
// Returns ATemplate with '$&' or '$0' replaced by whole r.e.
|
||
// occurence and '$1'...'$nn' replaced by subexpression with given index.
|
||
// Symbol '$' is used instead of '\' (for future extensions
|
||
// and for more Perl-compatibility) and accepts more than one digit.
|
||
// If you want to place into template raw '$' or '\', use prefix '\'.
|
||
// Example: '1\$ is $2\\rub\\' -> '1$ is <Match[2]>\rub\'
|
||
// If you want to place any number after '$' you must enclose it
|
||
// with curly braces: '${12}'.
|
||
// Example: 'a$12bc' -> 'a<Match[12]>bc'
|
||
// 'a${1}2bc' -> 'a<Match[1]>2bc'.
|
||
function Substitute(const ATemplate: RegExprString): RegExprString;
|
||
|
||
// Splits AInputStr to list by positions of all r.e. occurencies.
|
||
// Internally calls Exec, ExecNext.
|
||
procedure Split(const AInputStr: RegExprString; APieces: TStrings);
|
||
|
||
function Replace(const AInputStr: RegExprString;
|
||
const AReplaceStr: RegExprString;
|
||
AUseSubstitution: Boolean{$IFDEF DefParam} = False{$ENDIF})
|
||
: RegExprString; {$IFDEF OverMeth} overload;
|
||
function Replace(const AInputStr: RegExprString;
|
||
AReplaceFunc: TRegExprReplaceFunction): RegExprString; overload;
|
||
{$ENDIF}
|
||
// Returns AInputStr with r.e. occurencies replaced by AReplaceStr.
|
||
// If AUseSubstitution is true, then AReplaceStr will be used
|
||
// as template for Substitution methods.
|
||
// For example:
|
||
// Expression := '({-i}block|var)\s*\(\s*([^ ]*)\s*\)\s*';
|
||
// Replace ('BLOCK( test1)', 'def "$1" value "$2"', True);
|
||
// will return: def 'BLOCK' value 'test1'
|
||
// Replace ('BLOCK( test1)', 'def "$1" value "$2"')
|
||
// will return: def "$1" value "$2"
|
||
// Internally calls Exec, ExecNext.
|
||
// Overloaded version and ReplaceEx operate with callback function,
|
||
// so you can implement really complex functionality.
|
||
function ReplaceEx(const AInputStr: RegExprString;
|
||
AReplaceFunc: TRegExprReplaceFunction): RegExprString;
|
||
|
||
{$IFDEF Compat}
|
||
function ExecPos(AOffset: Integer; ATryOnce: Boolean): Boolean; overload; deprecated 'Use modern form of ExecPos()';
|
||
class function InvertCaseFunction(const Ch: REChar): REChar; deprecated 'This has no effect now';
|
||
property InvertCase: TRegExprInvertCaseFunction read fInvertCase write fInvertCase; deprecated 'This has no effect now';
|
||
property UseUnicodeWordDetection: Boolean read fUseUnicodeWordDetection write fUseUnicodeWordDetection; deprecated 'This has no effect, use {$DEFINE UnicodeRE} instead';
|
||
property LinePairedSeparator: RegExprString read GetLinePairedSeparator write SetLinePairedSeparator; deprecated 'This has no effect now';
|
||
property EmptyInputRaisesError: Boolean read fEmptyInputRaisesError write fEmptyInputRaisesError; deprecated 'This has no effect now';
|
||
property UseOsLineEndOnReplace: Boolean read fUseOsLineEndOnReplace write SetUseOsLineEndOnReplace; deprecated 'Use property ReplaceLineEnd instead';
|
||
{$ENDIF}
|
||
|
||
// Returns ID of last error, 0 if no errors (unusable if
|
||
// Error method raises exception) and clear internal status
|
||
// into 0 (no errors).
|
||
function LastError: Integer;
|
||
|
||
// Returns Error message for error with ID = AErrorID.
|
||
function ErrorMsg(AErrorID: Integer): RegExprString; virtual;
|
||
|
||
// Re-compile regex
|
||
procedure Compile;
|
||
|
||
{$IFDEF RegExpPCodeDump}
|
||
// Show compiled regex in textual form
|
||
function Dump(Indent: Integer = 0): RegExprString;
|
||
// Show single opcode in textual form
|
||
function DumpOp(op: TREOp): RegExprString;
|
||
{$ENDIF}
|
||
|
||
function IsCompiled: Boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
|
||
|
||
// Opcode contains only operations for fixed match length: EXACTLY*, ANY*, etc
|
||
function IsFixedLength(var op: TREOp; var ALen: Integer): Boolean;
|
||
function IsFixedLengthEx(var op: TREOp; var AMinLen, AMaxLen: integer): boolean;
|
||
|
||
// Regular expression.
|
||
// For optimization, TRegExpr will automatically compiles it into 'P-code'
|
||
// (You can see it with help of Dump method) and stores in internal
|
||
// structures. Real [re]compilation occures only when it really needed -
|
||
// while calling Exec, ExecNext, Substitute, Dump, etc
|
||
// and only if Expression or other P-code affected properties was changed
|
||
// after last [re]compilation.
|
||
// If any errors while [re]compilation occures, Error method is called
|
||
// (by default Error raises exception - see below)
|
||
property Expression: RegExprString read fExpression write SetExpression;
|
||
|
||
// Set/get default values of r.e.syntax modifiers. Modifiers in
|
||
// r.e. (?ismx-ismx) will replace this default values.
|
||
// If you try to set unsupported modifier, Error will be called
|
||
// (by defaul Error raises exception ERegExpr).
|
||
property ModifierStr: RegExprString read GetModifierStr write SetModifierStr;
|
||
|
||
property ModifierI: Boolean read GetModifierI write SetModifierI;
|
||
property ModifierR: Boolean read GetModifierR write SetModifierR;
|
||
property ModifierS: Boolean read GetModifierS write SetModifierS;
|
||
property ModifierG: Boolean read GetModifierG write SetModifierG;
|
||
property ModifierM: Boolean read GetModifierM write SetModifierM;
|
||
property ModifierX: Boolean read GetModifierX write SetModifierX;
|
||
|
||
// returns current input string (from last Exec call or last assign
|
||
// to this property).
|
||
// Any assignment to this property clear Match* properties !
|
||
property InputString: RegExprString read fInputString write SetInputString;
|
||
// SetInputSubString
|
||
// Only looks at copy(AInputString, AInputStartPos, AInputLen)
|
||
procedure SetInputSubString(const AInputString: RegExprString; AInputStartPos, AInputLen: Integer);
|
||
|
||
// Number of subexpressions has been found in last Exec* call.
|
||
// If there are no subexpr. but whole expr was found (Exec* returned True),
|
||
// then SubExprMatchCount=0, if no subexpressions nor whole
|
||
// r.e. found (Exec* returned false) then SubExprMatchCount=-1.
|
||
// Note, that some subexpr. may be not found and for such
|
||
// subexpr. MathPos=MatchLen=-1 and Match=''.
|
||
// For example: Expression := '(1)?2(3)?';
|
||
// Exec ('123'): SubExprMatchCount=2, Match[0]='123', [1]='1', [2]='3'
|
||
// Exec ('12'): SubExprMatchCount=1, Match[0]='12', [1]='1'
|
||
// Exec ('23'): SubExprMatchCount=2, Match[0]='23', [1]='', [2]='3'
|
||
// Exec ('2'): SubExprMatchCount=0, Match[0]='2'
|
||
// Exec ('7') - return False: SubExprMatchCount=-1
|
||
property SubExprMatchCount: Integer read GetSubExprCount;
|
||
|
||
// pos of entrance subexpr. #Idx into tested in last Exec*
|
||
// string. First subexpr. has Idx=1, last - MatchCount,
|
||
// whole r.e. has Idx=0.
|
||
// Returns -1 if in r.e. no such subexpr. or this subexpr.
|
||
// not found in input string.
|
||
property MatchPos[Idx: Integer]: PtrInt read GetMatchPos;
|
||
|
||
// len of entrance subexpr. #Idx r.e. into tested in last Exec*
|
||
// string. First subexpr. has Idx=1, last - MatchCount,
|
||
// whole r.e. has Idx=0.
|
||
// Returns -1 if in r.e. no such subexpr. or this subexpr.
|
||
// not found in input string.
|
||
// Remember - MatchLen may be 0 (if r.e. match empty string) !
|
||
property MatchLen[Idx: Integer]: PtrInt read GetMatchLen;
|
||
|
||
// == copy (InputString, MatchPos [Idx], MatchLen [Idx])
|
||
// Returns '' if in r.e. no such subexpr. or this subexpr.
|
||
// not found in input string.
|
||
property Match[Idx: Integer]: RegExprString read GetMatch;
|
||
|
||
// get index of group (subexpression) by name, to support named groups
|
||
// like in Python: (?P<name>regex)
|
||
function MatchIndexFromName(const AName: RegExprString): Integer;
|
||
|
||
function MatchFromName(const AName: RegExprString): RegExprString;
|
||
|
||
// Returns position in r.e. where compiler stopped.
|
||
// Useful for error diagnostics
|
||
property CompilerErrorPos: PtrInt read GetCompilerErrorPos;
|
||
|
||
{$IFDEF UseSpaceChars}
|
||
// Contains chars, treated as /s (initially filled with RegExprSpaceChars
|
||
// global constant)
|
||
property SpaceChars: RegExprString read fSpaceChars write fSpaceChars;
|
||
{$ENDIF}
|
||
|
||
{$IFDEF UseWordChars}
|
||
// Contains chars, treated as /w (initially filled with RegExprWordChars
|
||
// global constant)
|
||
property WordChars: RegExprString read fWordChars write fWordChars;
|
||
{$ENDIF}
|
||
|
||
{$IFDEF UseLineSep}
|
||
// line separators (like \n in Unix)
|
||
property LineSeparators: RegExprString read fLineSeparators write SetLineSeparators;
|
||
{$ENDIF}
|
||
|
||
// support paired line-break CR LF
|
||
property UseLinePairedBreak: Boolean read fUsePairedBreak write SetUsePairedBreak;
|
||
|
||
property ReplaceLineEnd: RegExprString read fReplaceLineEnd write fReplaceLineEnd;
|
||
|
||
property SlowChecksSizeMax: Integer read fSlowChecksSizeMax write fSlowChecksSizeMax;
|
||
|
||
// Errors during Exec() return false and set LastError. This option allows
|
||
// them to raise an Exception
|
||
property RaiseForRuntimeError: Boolean read fRaiseForRuntimeError write fRaiseForRuntimeError;
|
||
|
||
property AllowUnsafeLookBehind: Boolean read FAllowUnsafeLookBehind write FAllowUnsafeLookBehind;
|
||
|
||
// Make sure a { always is a range / don't allow unescaped literal usage
|
||
property AllowLiteralBraceWithoutRange: Boolean read FAllowLiteralBraceWithoutRange write FAllowLiteralBraceWithoutRange;
|
||
// support {,123} defaulting the min-matches to 0
|
||
property AllowBraceWithoutMin: Boolean read FAllowBraceWithoutMin write FAllowBraceWithoutMin;
|
||
end;
|
||
|
||
type
|
||
ERegExpr = class(Exception)
|
||
public
|
||
ErrorCode: Integer;
|
||
CompilerErrorPos: PtrInt;
|
||
end;
|
||
|
||
// true if string AInputString match regular expression ARegExpr
|
||
// ! will raise exeption if syntax errors in ARegExpr
|
||
function ExecRegExpr(const ARegExpr, AInputStr: RegExprString): Boolean;
|
||
|
||
// Split AInputStr into APieces by r.e. ARegExpr occurencies
|
||
procedure SplitRegExpr(const ARegExpr, AInputStr: RegExprString;
|
||
APieces: TStrings);
|
||
|
||
// Returns AInputStr with r.e. occurencies replaced by AReplaceStr
|
||
// If AUseSubstitution is true, then AReplaceStr will be used
|
||
// as template for Substitution methods.
|
||
// For example:
|
||
// ReplaceRegExpr ('({-i}block|var)\s*\(\s*([^ ]*)\s*\)\s*',
|
||
// 'BLOCK( test1)', 'def "$1" value "$2"', True)
|
||
// will return: def 'BLOCK' value 'test1'
|
||
// ReplaceRegExpr ('({-i}block|var)\s*\(\s*([^ ]*)\s*\)\s*',
|
||
// 'BLOCK( test1)', 'def "$1" value "$2"')
|
||
// will return: def "$1" value "$2"
|
||
function ReplaceRegExpr(const ARegExpr, AInputStr, AReplaceStr: RegExprString;
|
||
AUseSubstitution: Boolean{$IFDEF DefParam} = False{$ENDIF}): RegExprString;
|
||
{$IFDEF OverMeth}overload;
|
||
|
||
// Alternate form allowing to set more parameters.
|
||
|
||
type
|
||
TRegexReplaceOption = (
|
||
rroModifierI,
|
||
rroModifierR,
|
||
rroModifierS,
|
||
rroModifierG,
|
||
rroModifierM,
|
||
rroModifierX,
|
||
rroUseSubstitution,
|
||
rroUseOsLineEnd
|
||
);
|
||
TRegexReplaceOptions = set of TRegexReplaceOption;
|
||
|
||
function ReplaceRegExpr(const ARegExpr, AInputStr, AReplaceStr: RegExprString;
|
||
Options: TRegexReplaceOptions): RegExprString; overload;
|
||
{$ENDIF}
|
||
// Replace all metachars with its safe representation,
|
||
// for example 'abc$cd.(' converts into 'abc\$cd\.\('
|
||
// This function useful for r.e. autogeneration from
|
||
// user input
|
||
function QuoteRegExprMetaChars(const AStr: RegExprString): RegExprString;
|
||
// Makes list of subexpressions found in ARegExpr r.e.
|
||
// In ASubExps every item represent subexpression,
|
||
// from first to last, in format:
|
||
// String - subexpression text (without '()')
|
||
// low word of Object - starting position in ARegExpr, including '('
|
||
// if exists! (first position is 1)
|
||
// high word of Object - length, including starting '(' and ending ')'
|
||
// if exist!
|
||
// AExtendedSyntax - must be True if modifier /m will be On while
|
||
// using the r.e.
|
||
// Useful for GUI editors of r.e. etc (You can find example of using
|
||
// in TestRExp.dpr project)
|
||
// Returns
|
||
// 0 Success. No unbalanced brackets was found;
|
||
// -1 There are not enough closing brackets ')';
|
||
// -(n+1) At position n was found opening '[' without
|
||
// corresponding closing ']';
|
||
// n At position n was found closing bracket ')' without
|
||
// corresponding opening '('.
|
||
// If Result <> 0, then ASubExpr can contain empty items or illegal ones
|
||
function RegExprSubExpressions(const ARegExpr: RegExprString; ASubExprs: TStrings;
|
||
AExtendedSyntax: Boolean{$IFDEF DefParam} = False{$ENDIF}): Integer;
|
||
|
||
implementation
|
||
|
||
{$IFDEF FastUnicodeData}
|
||
uses
|
||
xregexpr_unicodedata;
|
||
{$ENDIF}
|
||
|
||
const
|
||
// TRegExpr.VersionMajor/Minor return values of these constants:
|
||
REVersionMajor = 1;
|
||
REVersionMinor = 184;
|
||
|
||
OpKind_End = REChar(1);
|
||
OpKind_MetaClass = REChar(2);
|
||
OpKind_Range = REChar(3);
|
||
OpKind_Char = REChar(4);
|
||
OpKind_CategoryYes = REChar(5);
|
||
OpKind_CategoryNo = REChar(6);
|
||
|
||
RegExprAllSet = [0 .. 255];
|
||
RegExprWordSet = [Ord('a') .. Ord('z'), Ord('A') .. Ord('Z'), Ord('0') .. Ord('9'), Ord('_')];
|
||
RegExprDigitSet = [Ord('0') .. Ord('9')];
|
||
RegExprLowerAzSet = [Ord('a') .. Ord('z')];
|
||
RegExprUpperAzSet = [Ord('A') .. Ord('Z')];
|
||
RegExprAllAzSet = RegExprLowerAzSet + RegExprUpperAzSet;
|
||
RegExprSpaceSet = [Ord(' '), $9, $A, $D, $C];
|
||
RegExprLineSeparatorsSet = [$d, $a, $b, $c] {$IFDEF UnicodeRE} + [$85] {$ENDIF};
|
||
RegExprHorzSeparatorsSet = [9, $20, $A0];
|
||
|
||
MaxBracesArg = $7FFFFFFF - 1; // max value for {n,m} arguments
|
||
|
||
type
|
||
TRENextOff = PtrInt;
|
||
// internal Next "pointer" (offset to current p-code)
|
||
PRENextOff = ^TRENextOff;
|
||
// used for extracting Next "pointers" from compiled r.e.
|
||
TREBracesArg = Integer; // type of {m,n} arguments
|
||
PREBracesArg = ^TREBracesArg;
|
||
|
||
TREGroupKind = (
|
||
gkNormalGroup,
|
||
gkNonCapturingGroup,
|
||
gkAtomicGroup,
|
||
gkNamedGroupReference,
|
||
gkComment,
|
||
gkModifierString,
|
||
gkLookahead,
|
||
gkLookaheadNeg,
|
||
gkLookbehind,
|
||
gkLookbehindNeg,
|
||
gkRecursion,
|
||
gkSubCall
|
||
);
|
||
|
||
TReOpLookBehindOptions = packed record
|
||
MatchLenMin, MatchLenMax: TREBracesArg;
|
||
IsGreedy: REChar;
|
||
end;
|
||
PReOpLookBehindOptions = ^TReOpLookBehindOptions;
|
||
|
||
const
|
||
ReOpLookBehindOptionsSz = SizeOf(TReOpLookBehindOptions) div SizeOf(REChar);
|
||
OPT_LOOKBEHIND_NON_GREEDY = REChar(0);
|
||
OPT_LOOKBEHIND_GREEDY = REChar(1);
|
||
OPT_LOOKBEHIND_FIXED = REChar(2);
|
||
|
||
// Alexey T.: handling of that define FPC_REQUIRES_PROPER_ALIGNMENT was present even 15 years ago,
|
||
// but with it, we have failing of some RegEx tests, on ARM64 CPU.
|
||
// If I undefine FPC_REQUIRES_PROPER_ALIGNMENT, all tests run OK on ARM64 again.
|
||
{$undef FPC_REQUIRES_PROPER_ALIGNMENT}
|
||
|
||
const
|
||
REOpSz = SizeOf(TREOp) div SizeOf(REChar);
|
||
// size of OP_ command in REChars
|
||
{$IFDEF FPC_REQUIRES_PROPER_ALIGNMENT}
|
||
// add space for aligning pointer
|
||
// -1 is the correct max size but also needed for InsertOperator that needs a multiple of pointer size
|
||
RENextOffSz = (2 * SizeOf(TRENextOff) div SizeOf(REChar)) - 1;
|
||
REBracesArgSz = (2 * SizeOf(TREBracesArg) div SizeOf(REChar));
|
||
// add space for aligning pointer
|
||
{$ELSE}
|
||
RENextOffSz = (SizeOf(TRENextOff) div SizeOf(REChar));
|
||
// size of Next pointer in REChars
|
||
REBracesArgSz = SizeOf(TREBracesArg) div SizeOf(REChar);
|
||
// size of BRACES arguments in REChars
|
||
{$ENDIF}
|
||
RENumberSz = SizeOf(LongInt) div SizeOf(REChar);
|
||
|
||
REBranchArgSz = 2; // 2 * (REChar div REChar)
|
||
|
||
type
|
||
TReGroupIndex = LongInt;
|
||
PReGroupIndex = ^TReGroupIndex;
|
||
const
|
||
ReGroupIndexSz = SizeOf(TReGroupIndex) div SizeOf(REChar);
|
||
|
||
type
|
||
PtrPair = {$IFDEF UnicodeRE} ^LongInt; {$ELSE} ^Word; {$ENDIF}
|
||
|
||
function GroupDataArraySize(ARequired, ACurrent: Integer): Integer;
|
||
begin
|
||
Result := ARequired;
|
||
if Result > ACurrent then
|
||
Exit;
|
||
|
||
// Keep some extra
|
||
if Result > ACurrent - RegexGroupCountIncrement then
|
||
Result := ACurrent;
|
||
end;
|
||
|
||
function IsPairedBreak(p: PRegExprChar): Boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
|
||
const
|
||
cBreak = {$IFDEF UnicodeRE} $000D000A; {$ELSE} $0D0A; {$ENDIF}
|
||
begin
|
||
Result := PtrPair(p)^ = cBreak;
|
||
end;
|
||
|
||
function IsAnyLineBreak(C: REChar): Boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
|
||
begin
|
||
case C of
|
||
#10,
|
||
#13,
|
||
#$0B,
|
||
#$0C
|
||
{$ifdef UnicodeRE}
|
||
, #$85
|
||
, #$2028
|
||
, #$2029
|
||
{$endif}:
|
||
Result := True;
|
||
else
|
||
Result := False;
|
||
end;
|
||
end;
|
||
|
||
function _FindCharInBuffer(SBegin, SEnd: PRegExprChar; Ch: REChar): PRegExprChar; {$IFDEF InlineFuncs}inline;{$ENDIF}
|
||
begin
|
||
while SBegin < SEnd do
|
||
begin
|
||
if SBegin^ = Ch then
|
||
begin
|
||
Result := SBegin;
|
||
Exit;
|
||
end;
|
||
Inc(SBegin);
|
||
end;
|
||
Result := nil;
|
||
end;
|
||
|
||
function IsIgnoredChar(AChar: REChar): Boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
|
||
begin
|
||
case AChar of
|
||
' ', #9, #$d, #$a:
|
||
Result := True
|
||
else
|
||
Result := False;
|
||
end;
|
||
end;
|
||
|
||
function _IsMetaChar(AChar: REChar): Boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
|
||
begin
|
||
case AChar of
|
||
'd', 'D',
|
||
's', 'S',
|
||
'w', 'W',
|
||
'v', 'V',
|
||
'h', 'H',
|
||
'R':
|
||
Result := True
|
||
else
|
||
Result := False;
|
||
end;
|
||
end;
|
||
|
||
function AlignToPtr(const p: Pointer): Pointer; {$IFDEF InlineFuncs}inline;{$ENDIF}
|
||
begin
|
||
{$IFDEF FPC_REQUIRES_PROPER_ALIGNMENT}
|
||
Result := Align(p, SizeOf(Pointer));
|
||
{$ELSE}
|
||
Result := p;
|
||
{$ENDIF}
|
||
end;
|
||
|
||
function AlignToInt(const p: Pointer): Pointer; {$IFDEF InlineFuncs}inline;{$ENDIF}
|
||
begin
|
||
{$IFDEF FPC_REQUIRES_PROPER_ALIGNMENT}
|
||
Result := Align(p, SizeOf(Integer));
|
||
{$ELSE}
|
||
Result := p;
|
||
{$ENDIF}
|
||
end;
|
||
|
||
function StrLScan(P: PRegExprChar; C: REChar; len: PtrInt): PRegExprChar;
|
||
Var
|
||
count: PtrInt;
|
||
Begin
|
||
count := 0;
|
||
{ Find first matching character of Ch in Str }
|
||
while (count < len) do
|
||
begin
|
||
if C = P[count] then
|
||
begin
|
||
StrLScan := @(P[count]);
|
||
exit;
|
||
end;
|
||
Inc(count);
|
||
end;
|
||
{ nothing found. }
|
||
StrLScan := nil;
|
||
end;
|
||
|
||
function StrLComp(str1,str2 : PRegExprChar; len : PtrInt) : PtrInt;
|
||
var
|
||
counter: PtrInt;
|
||
c1, c2: REChar;
|
||
begin
|
||
if len = 0 then
|
||
begin
|
||
StrLComp := 0;
|
||
exit;
|
||
end;
|
||
counter:=0;
|
||
repeat
|
||
c1:=str1[counter];
|
||
c2:=str2[counter];
|
||
inc(counter);
|
||
until (c1<>c2) or (counter>=len) or (c1=#0) or (c2=#0);
|
||
StrLComp:=ord(c1)-ord(c2);
|
||
end;
|
||
|
||
function StrLPos(str1,str2 : PRegExprChar; len1, len2: PtrInt) : PRegExprChar;
|
||
var
|
||
p : PRegExprChar;
|
||
begin
|
||
StrLPos := nil;
|
||
if (str1 = nil) or (str2 = nil) then
|
||
exit;
|
||
len1 := len1 - len2 + 1;
|
||
p := StrLScan(str1,str2^, len1);
|
||
while p <> nil do
|
||
begin
|
||
if StrLComp(p, str2, len2)=0 then
|
||
begin
|
||
StrLPos := p;
|
||
exit;
|
||
end;
|
||
inc(p);
|
||
p := StrLScan(p, str2^, len1 - (p-str1));
|
||
end;
|
||
end;
|
||
|
||
{$IFDEF FastUnicodeData}
|
||
function _UpperCase(Ch: REChar): REChar; {$IFDEF InlineFuncs}inline;{$ENDIF}
|
||
begin
|
||
Result := CharUpperArray[Ord(Ch)];
|
||
end;
|
||
|
||
function _LowerCase(Ch: REChar): REChar; {$IFDEF InlineFuncs}inline;{$ENDIF}
|
||
begin
|
||
Result := CharLowerArray[Ord(Ch)];
|
||
end;
|
||
|
||
{$ELSE}
|
||
function _UpperCase(Ch: REChar): REChar;
|
||
begin
|
||
Result := Ch;
|
||
if (Ch >= 'a') and (Ch <= 'z') then
|
||
begin
|
||
Dec(Result, 32);
|
||
Exit;
|
||
end;
|
||
if Ord(Ch) < 128 then
|
||
Exit;
|
||
|
||
{$IFDEF FPC}
|
||
{$IFDEF UnicodeRE}
|
||
Result := UnicodeUpperCase(Ch)[1];
|
||
{$ELSE}
|
||
Result := AnsiUpperCase(Ch)[1];
|
||
{$ENDIF}
|
||
{$ELSE}
|
||
{$IFDEF UnicodeRE}
|
||
{$IFDEF D_XE4}
|
||
Result := Ch.ToUpper;
|
||
{$ELSE}
|
||
{$IFDEF D2009}
|
||
Result := TCharacter.ToUpper(Ch);
|
||
{$ENDIF}
|
||
{$ENDIF}
|
||
{$ELSE}
|
||
Result := AnsiUpperCase(Ch)[1];
|
||
{$ENDIF}
|
||
{$ENDIF}
|
||
end;
|
||
|
||
function _LowerCase(Ch: REChar): REChar;
|
||
begin
|
||
Result := Ch;
|
||
if (Ch >= 'A') and (Ch <= 'Z') then
|
||
begin
|
||
Inc(Result, 32);
|
||
Exit;
|
||
end;
|
||
if Ord(Ch) < 128 then
|
||
Exit;
|
||
|
||
{$IFDEF FPC}
|
||
{$IFDEF UnicodeRE}
|
||
Result := UnicodeLowerCase(Ch)[1];
|
||
{$ELSE}
|
||
Result := AnsiLowerCase(Ch)[1];
|
||
{$ENDIF}
|
||
{$ELSE}
|
||
{$IFDEF UnicodeRE}
|
||
{$IFDEF D_XE4}
|
||
Result := Ch.ToLower;
|
||
{$ELSE}
|
||
{$IFDEF D2009}
|
||
Result := TCharacter.ToLower(Ch);
|
||
{$ENDIF}
|
||
{$ENDIF}
|
||
{$ELSE}
|
||
Result := AnsiLowerCase(Ch)[1];
|
||
{$ENDIF}
|
||
{$ENDIF}
|
||
end;
|
||
{$ENDIF}
|
||
|
||
function InvertCase(const Ch: REChar): REChar; {$IFDEF InlineFuncs}inline;{$ENDIF}
|
||
begin
|
||
Result := _UpperCase(Ch);
|
||
if Result = Ch then
|
||
Result := _LowerCase(Ch);
|
||
end;
|
||
|
||
function _FindClosingBracket(P, PEnd: PRegExprChar): PRegExprChar;
|
||
var
|
||
Level: Integer;
|
||
begin
|
||
Result := nil;
|
||
Level := 1;
|
||
repeat
|
||
if P >= PEnd then Exit;
|
||
case P^ of
|
||
EscChar:
|
||
Inc(P);
|
||
'(':
|
||
begin
|
||
Inc(Level);
|
||
end;
|
||
')':
|
||
begin
|
||
Dec(Level);
|
||
if Level = 0 then
|
||
begin
|
||
Result := P;
|
||
Exit;
|
||
end;
|
||
end;
|
||
end;
|
||
Inc(P);
|
||
until False;
|
||
end;
|
||
|
||
{$IFDEF UNICODEEX}
|
||
procedure IncUnicode(var p: PRegExprChar); {$IFDEF InlineFuncs}inline;{$ENDIF}
|
||
// make additional increment if we are on low-surrogate char
|
||
// no need to check p<fInputEnd, at the end of string we have chr(0)
|
||
var
|
||
ch: REChar;
|
||
begin
|
||
Inc(p);
|
||
ch := p^;
|
||
if (Ord(ch) >= $DC00) and (Ord(ch) <= $DFFF) then
|
||
Inc(p);
|
||
end;
|
||
|
||
procedure IncUnicode2(var p: PRegExprChar; var N: Integer); {$IFDEF InlineFuncs}inline;{$ENDIF}
|
||
var
|
||
ch: REChar;
|
||
begin
|
||
Inc(p);
|
||
Inc(N);
|
||
ch := p^;
|
||
if (Ord(ch) >= $DC00) and (Ord(ch) <= $DFFF) then
|
||
begin
|
||
Inc(p);
|
||
Inc(N);
|
||
end;
|
||
end;
|
||
{$ENDIF}
|
||
|
||
{ ============================================================= }
|
||
{ ===================== Global functions ====================== }
|
||
{ ============================================================= }
|
||
|
||
function IsModifiersEqual(const A, B: TRegExprModifiers): Boolean;
|
||
begin
|
||
Result :=
|
||
(A.I = B.I) and
|
||
(A.G = B.G) and
|
||
(A.M = B.M) and
|
||
(A.S = B.S) and
|
||
(A.R = B.R) and
|
||
(A.X = B.X);
|
||
end;
|
||
|
||
function ParseModifiers(const APtr: PRegExprChar;
|
||
ALen: Integer;
|
||
var AValue: TRegExprModifiers): Boolean;
|
||
// Parse string and set AValue if it's in format 'ismxrg-ismxrg'
|
||
var
|
||
IsOn: Boolean;
|
||
i: Integer;
|
||
begin
|
||
Result := True;
|
||
IsOn := True;
|
||
for i := 0 to ALen-1 do
|
||
case APtr[i] of
|
||
'-':
|
||
if IsOn then
|
||
begin
|
||
IsOn := False;
|
||
end
|
||
else
|
||
begin
|
||
Result := False;
|
||
Exit;
|
||
end;
|
||
'I', 'i':
|
||
AValue.I := IsOn;
|
||
'R', 'r':
|
||
AValue.R := IsOn;
|
||
'S', 's':
|
||
AValue.S := IsOn;
|
||
'G', 'g':
|
||
AValue.G := IsOn;
|
||
'M', 'm':
|
||
AValue.M := IsOn;
|
||
'X', 'x':
|
||
AValue.X := IsOn;
|
||
else
|
||
Result := False;
|
||
Exit;
|
||
end;
|
||
end;
|
||
|
||
function ExecRegExpr(const ARegExpr, AInputStr: RegExprString): Boolean;
|
||
var
|
||
r: TRegExpr;
|
||
begin
|
||
r := TRegExpr.Create;
|
||
try
|
||
r.Expression := ARegExpr;
|
||
Result := r.Exec(AInputStr);
|
||
finally
|
||
r.Free;
|
||
end;
|
||
end; { of function ExecRegExpr
|
||
-------------------------------------------------------------- }
|
||
|
||
procedure SplitRegExpr(const ARegExpr, AInputStr: RegExprString;
|
||
APieces: TStrings);
|
||
var
|
||
r: TRegExpr;
|
||
begin
|
||
APieces.Clear;
|
||
r := TRegExpr.Create;
|
||
try
|
||
r.Expression := ARegExpr;
|
||
r.Split(AInputStr, APieces);
|
||
finally
|
||
r.Free;
|
||
end;
|
||
end; { of procedure SplitRegExpr
|
||
-------------------------------------------------------------- }
|
||
|
||
function ReplaceRegExpr(const ARegExpr, AInputStr, AReplaceStr: RegExprString;
|
||
AUseSubstitution: Boolean{$IFDEF DefParam} = False{$ENDIF}): RegExprString;
|
||
begin
|
||
with TRegExpr.Create do
|
||
try
|
||
Expression := ARegExpr;
|
||
Result := Replace(AInputStr, AReplaceStr, AUseSubstitution);
|
||
finally
|
||
Free;
|
||
end;
|
||
end; { of function ReplaceRegExpr
|
||
-------------------------------------------------------------- }
|
||
{$IFDEF OverMeth}
|
||
|
||
function ReplaceRegExpr(const ARegExpr, AInputStr, AReplaceStr: RegExprString;
|
||
Options: TRegexReplaceOptions): RegExprString; overload;
|
||
|
||
begin
|
||
with TRegExpr.Create do
|
||
try
|
||
ModifierI := (rroModifierI in Options);
|
||
ModifierR := (rroModifierR in Options);
|
||
ModifierS := (rroModifierS in Options);
|
||
ModifierG := (rroModifierG in Options);
|
||
ModifierM := (rroModifierM in Options);
|
||
ModifierX := (rroModifierX in Options);
|
||
// Set this after the above, if the regex contains modifiers, they will be applied.
|
||
Expression := ARegExpr;
|
||
if rroUseOsLineEnd in Options then
|
||
ReplaceLineEnd := sLineBreak
|
||
else
|
||
ReplaceLineEnd := #10;
|
||
Result := Replace(AInputStr, AReplaceStr, rroUseSubstitution in Options);
|
||
finally
|
||
Free;
|
||
end;
|
||
end;
|
||
{$ENDIF}
|
||
|
||
(*
|
||
const
|
||
MetaChars_Init = '^$.[()|?+*' + EscChar + '{';
|
||
MetaChars = MetaChars_Init; // not needed to be a variable, const is faster
|
||
MetaAll = MetaChars_Init + ']}'; // Very similar to MetaChars, but slighly changed.
|
||
*)
|
||
|
||
function _IsMetaSymbol1(ch: REChar): Boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
|
||
begin
|
||
case ch of
|
||
'^', '$', '.', '[', '(', ')', '|', '?', '+', '*', EscChar, '{':
|
||
Result := True
|
||
else
|
||
Result := False
|
||
end;
|
||
end;
|
||
|
||
function _IsMetaSymbol2(ch: REChar): Boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
|
||
begin
|
||
case ch of
|
||
'^', '$', '.', '[', '(', ')', '|', '?', '+', '*', EscChar, '{',
|
||
']', '}':
|
||
Result := True
|
||
else
|
||
Result := False
|
||
end;
|
||
end;
|
||
|
||
function QuoteRegExprMetaChars(const AStr: RegExprString): RegExprString;
|
||
var
|
||
i, i0, Len: Integer;
|
||
ch: REChar;
|
||
begin
|
||
Result := '';
|
||
Len := Length(AStr);
|
||
i := 1;
|
||
i0 := i;
|
||
while i <= Len do
|
||
begin
|
||
ch := AStr[i];
|
||
if _IsMetaSymbol2(ch) then
|
||
begin
|
||
Result := Result + System.Copy(AStr, i0, i - i0) + EscChar + ch;
|
||
i0 := i + 1;
|
||
end;
|
||
Inc(i);
|
||
end;
|
||
Result := Result + System.Copy(AStr, i0, MaxInt); // Tail
|
||
end; { of function QuoteRegExprMetaChars
|
||
-------------------------------------------------------------- }
|
||
|
||
function RegExprSubExpressions(const ARegExpr: RegExprString; ASubExprs: TStrings;
|
||
AExtendedSyntax: Boolean{$IFDEF DefParam} = False{$ENDIF}): Integer;
|
||
type
|
||
TStackItemRec = record
|
||
SubExprIdx: Integer;
|
||
StartPos: PtrInt;
|
||
end;
|
||
|
||
TStackArray = packed array [0 .. RegexMaxMaxGroups - 1] of TStackItemRec;
|
||
var
|
||
Len, SubExprLen: Integer;
|
||
i, i0: Integer;
|
||
Modif: TRegExprModifiers;
|
||
Stack: ^TStackArray;
|
||
StackIdx, StackSz: Integer;
|
||
begin
|
||
Result := 0; // no unbalanced brackets found at this very moment
|
||
FillChar(Modif, SizeOf(Modif), 0);
|
||
ASubExprs.Clear; // I don't think that adding to non empty list
|
||
// can be useful, so I simplified algorithm to work only with empty list
|
||
|
||
Len := Length(ARegExpr); // some optimization tricks
|
||
|
||
// first we have to calculate number of subexpression to reserve
|
||
// space in Stack array (may be we'll reserve more than needed, but
|
||
// it's faster then memory reallocation during parsing)
|
||
StackSz := 1; // add 1 for entire r.e.
|
||
for i := 1 to Len do
|
||
if ARegExpr[i] = '(' then
|
||
Inc(StackSz);
|
||
// SetLength (Stack, StackSz);
|
||
GetMem(Stack, SizeOf(TStackItemRec) * StackSz);
|
||
|
||
try
|
||
StackIdx := 0;
|
||
i := 1;
|
||
while (i <= Len) do
|
||
begin
|
||
case ARegExpr[i] of
|
||
'(':
|
||
begin
|
||
if (i < Len) and (ARegExpr[i + 1] = '?') then
|
||
begin
|
||
// this is not subexpression, but comment or other
|
||
// Perl extension. We must check is it (?ismxrg-ismxrg)
|
||
// and change AExtendedSyntax if /x is changed.
|
||
Inc(i, 2); // skip '(?'
|
||
i0 := i;
|
||
while (i <= Len) and (ARegExpr[i] <> ')') do
|
||
Inc(i);
|
||
if i > Len then
|
||
Result := -1 // unbalansed '('
|
||
else
|
||
if ParseModifiers(@ARegExpr[i0], i - i0, Modif) then
|
||
// Alexey-T: original code had copy from i, not from i0
|
||
AExtendedSyntax := Modif.X;
|
||
end
|
||
else
|
||
begin // subexpression starts
|
||
ASubExprs.Add(''); // just reserve space
|
||
with Stack[StackIdx] do
|
||
begin
|
||
SubExprIdx := ASubExprs.Count - 1;
|
||
StartPos := i;
|
||
end;
|
||
Inc(StackIdx);
|
||
end;
|
||
end;
|
||
')':
|
||
begin
|
||
if StackIdx = 0 then
|
||
Result := i // unbalanced ')'
|
||
else
|
||
begin
|
||
Dec(StackIdx);
|
||
with Stack[StackIdx] do
|
||
begin
|
||
SubExprLen := i - StartPos + 1;
|
||
ASubExprs.Objects[SubExprIdx] :=
|
||
TObject(StartPos or (SubExprLen ShL 16));
|
||
ASubExprs[SubExprIdx] := System.Copy(ARegExpr, StartPos + 1,
|
||
SubExprLen - 2); // add without brackets
|
||
end;
|
||
end;
|
||
end;
|
||
EscChar:
|
||
Inc(i); // skip quoted symbol
|
||
'[':
|
||
begin
|
||
// we have to skip character ranges at once, because they can
|
||
// contain '#', and '#' in it must NOT be recognized as eXtended
|
||
// comment beginning!
|
||
i0 := i;
|
||
Inc(i);
|
||
if ARegExpr[i] = ']' // first ']' inside [] treated as simple char, no need to check '['
|
||
then
|
||
Inc(i);
|
||
while (i <= Len) and (ARegExpr[i] <> ']') do
|
||
if ARegExpr[i] = EscChar
|
||
then
|
||
Inc(i, 2) // skip 'escaped' char to prevent stopping at '\]'
|
||
else
|
||
Inc(i);
|
||
if (i > Len) or (ARegExpr[i] <> ']')
|
||
then
|
||
Result := -(i0 + 1); // unbalanced '['
|
||
end;
|
||
'#':
|
||
if AExtendedSyntax then
|
||
begin
|
||
// skip eXtended comments
|
||
while (i <= Len) and (ARegExpr[i] <> #$d) and (ARegExpr[i] <> #$a)
|
||
// do not use [#$d, #$a] due to Unicode compatibility
|
||
do
|
||
Inc(i);
|
||
while (i + 1 <= Len) and
|
||
((ARegExpr[i + 1] = #$d) or (ARegExpr[i + 1] = #$a)) do
|
||
Inc(i); // attempt to work with different kinds of line separators
|
||
// now we are at the line separator that must be skipped.
|
||
end;
|
||
// here is no 'else' clause - we simply skip ordinary chars
|
||
end; // of case
|
||
Inc(i); // skip scanned char
|
||
// ! can move after Len due to skipping quoted symbol
|
||
end;
|
||
|
||
// check brackets balance
|
||
if StackIdx <> 0 then
|
||
Result := -1; // unbalansed '('
|
||
|
||
// check if entire r.e. added
|
||
if (ASubExprs.Count = 0) or ((PtrInt(ASubExprs.Objects[0]) and $FFFF) <> 1)
|
||
or (((PtrInt(ASubExprs.Objects[0]) ShR 16) and $FFFF) <> Len)
|
||
// whole r.e. wasn't added because it isn't bracketed
|
||
// well, we add it now:
|
||
then
|
||
ASubExprs.InsertObject(0, ARegExpr, TObject((Len ShL 16) or 1));
|
||
|
||
finally
|
||
FreeMem(Stack);
|
||
end;
|
||
end; { of function RegExprSubExpressions
|
||
-------------------------------------------------------------- }
|
||
|
||
const
|
||
OP_MAGIC = TREOp(216); // programm signature
|
||
|
||
OP_EEND = TREOp(0); // End of program
|
||
OP_BOL = TREOp(1); // Empty match at beginning of line
|
||
OP_EOL = TREOp(2); // Empty match at end of line
|
||
OP_ANY = TREOp(3); // Match any one character
|
||
OP_ANYOF = TREOp(4); // Match any character in string
|
||
OP_ANYBUT = TREOp(5); // Match any character not in string
|
||
OP_BRANCH = TREOp(6); // Match this alternative, or the next
|
||
OP_BACK = TREOp(7); // Jump backward (Next < 0)
|
||
OP_EXACTLY = TREOp(8); // Match string exactly
|
||
OP_NOTHING = TREOp(9); // Match empty string
|
||
OP_STAR = TREOp(10); // Match this (simple) thing 0 or more times
|
||
OP_PLUS = TREOp(11); // Match this (simple) thing 1 or more times
|
||
OP_ANYDIGIT = TREOp(12); // Match any digit (equiv [0-9])
|
||
OP_NOTDIGIT = TREOp(13); // Match not digit (equiv [0-9])
|
||
OP_ANYLETTER = TREOp(14); // Match any 'word' char
|
||
OP_NOTLETTER = TREOp(15); // Match any 'non-word' char
|
||
OP_ANYSPACE = TREOp(16); // Match any 'space' char
|
||
OP_NOTSPACE = TREOp(17); // Match 'not space' char
|
||
OP_BRACES = TREOp(18);
|
||
// Node,Min,Max Match this (simple) thing from Min to Max times.
|
||
// Min and Max are TREBracesArg
|
||
OP_COMMENT = TREOp(19); // Comment
|
||
OP_EXACTLY_CI = TREOp(20); // Match string, case insensitive
|
||
OP_ANYOF_CI = TREOp(21); // Match any character in string, case insensitive
|
||
OP_ANYBUT_CI = TREOp(22); // Match any char not in string, case insensitive
|
||
OP_LOOPENTRY = TREOp(23); // Start of loop (Node - LOOP for this loop)
|
||
OP_LOOP = TREOp(24); // Back jump for LOOPENTRY
|
||
// Min and Max are TREBracesArg
|
||
// Node - next node in sequence,
|
||
// LoopEntryJmp - associated LOOPENTRY node addr
|
||
OP_EOL2 = TReOp(25); // like OP_EOL, but also matches before final line-break
|
||
OP_CONTINUE_POS = TReOp(26); // \G, where offset is from last match end or from Exec(AOffset)
|
||
OP_ANYLINEBREAK = TReOp(27); // \R
|
||
OP_BSUBEXP = TREOp(28); // Match previously matched subexpression #Idx (stored as REChar)
|
||
OP_BSUBEXP_CI = TREOp(29); // -"- in case-insensitive mode
|
||
|
||
// Non-greedy ops
|
||
OP_STAR_NG = TREOp(30); // Same as OP_START but in non-greedy mode
|
||
OP_PLUS_NG = TREOp(31); // Same as OP_PLUS but in non-greedy mode
|
||
OP_BRACES_NG = TREOp(32); // Same as OP_BRACES but in non-greedy mode
|
||
OP_LOOP_NG = TREOp(33); // Same as OP_LOOP but in non-greedy mode
|
||
|
||
// Multiline mode \m
|
||
OP_BOL_ML = TREOp(34); // Match "" at beginning of line
|
||
OP_EOL_ML = TREOp(35); // Match "" at end of line
|
||
OP_ANY_ML = TREOp(36); // Match any one character
|
||
|
||
// Word boundary
|
||
OP_BOUND = TREOp(37); // Match "" between word char and non-word char
|
||
OP_NOTBOUND = TREOp(38); // Opposite to OP_BOUND
|
||
|
||
OP_ANYHORZSEP = TREOp(39); // Any horizontal whitespace \h
|
||
OP_NOTHORZSEP = TREOp(40); // Not horizontal whitespace \H
|
||
OP_ANYVERTSEP = TREOp(41); // Any vertical whitespace \v
|
||
OP_NOTVERTSEP = TREOp(42); // Not vertical whitespace \V
|
||
|
||
OP_ANYCATEGORY = TREOp(43); // \p{L}
|
||
OP_NOTCATEGORY = TREOp(44); // \P{L}
|
||
|
||
// Possessive quantifiers
|
||
OP_STAR_POSS = TReOp(45);
|
||
OP_PLUS_POSS = TReOp(46);
|
||
OP_BRACES_POSS = TReOp(47);
|
||
|
||
OP_RECUR = TReOp(48);
|
||
|
||
OP_OPEN = TREOp(50); // Opening of group
|
||
OP_CLOSE = TREOp(51); // Closing of group
|
||
OP_OPEN_ATOMIC = TREOp(52); // Opening of group
|
||
OP_CLOSE_ATOMIC = TREOp(53); // Closing of group
|
||
|
||
OP_LOOKAHEAD = TREOp(55);
|
||
OP_LOOKAHEAD_NEG = TREOp(56);
|
||
OP_LOOKAHEAD_END = TREOp(57);
|
||
OP_LOOKBEHIND = TREOp(58);
|
||
OP_LOOKBEHIND_NEG = TREOp(59);
|
||
OP_LOOKBEHIND_END = TREOp(60);
|
||
|
||
OP_SUBCALL = TREOp(65); // Call of subroutine; OP_SUBCALL+i is for group i
|
||
OP_LOOP_POSS = TREOp(66); // Same as OP_LOOP but in non-greedy mode
|
||
|
||
// Guarded branch
|
||
// If a branch is know to begin with a specific letter (starts with OP_EXACTLY[_CI])
|
||
// then that letter can be tested before recursively calling MatchPrim. (guarded from non-match entering)
|
||
OP_GBRANCH = TREOp(67);
|
||
OP_GBRANCH_EX = TREOp(68);
|
||
OP_GBRANCH_EX_CI = TREOp(69);
|
||
|
||
OP_RESET_MATCHPOS = TReOp(70);
|
||
|
||
OP_NONE = High(TREOp);
|
||
|
||
// We work with p-code through pointers, compatible with PRegExprChar.
|
||
// Note: all code components (TRENextOff, TREOp, TREBracesArg, etc)
|
||
// must have lengths that can be divided by SizeOf (REChar) !
|
||
// A node is TREOp of opcode followed Next "pointer" of TRENextOff type.
|
||
// The Next is a offset from the opcode of the node containing it.
|
||
// An operand, if any, simply follows the node. (Note that much of
|
||
// the code generation knows about this implicit relationship!)
|
||
// Using TRENextOff=PtrInt speed up p-code processing.
|
||
|
||
// Opcodes description:
|
||
//
|
||
// BRANCH The set of branches constituting a single choice are hooked
|
||
// together with their "next" pointers, since precedence prevents
|
||
// anything being concatenated to any individual branch. The
|
||
// "next" pointer of the last BRANCH in a choice points to the
|
||
// thing following the whole choice. This is also where the
|
||
// final "next" pointer of each individual branch points; each
|
||
// branch starts with the operand node of a BRANCH node.
|
||
// BACK Normal "next" pointers all implicitly point forward; BACK
|
||
// exists to make loop structures possible.
|
||
// STAR,PLUS,BRACES '?', and complex '*' and '+', are implemented as
|
||
// circular BRANCH structures using BACK. Complex '{min,max}'
|
||
// - as pair LOOPENTRY-LOOP (see below). Simple cases (one
|
||
// character per match) are implemented with STAR, PLUS and
|
||
// BRACES for speed and to minimize recursive plunges.
|
||
// LOOPENTRY,LOOP {min,max} are implemented as special pair
|
||
// LOOPENTRY-LOOP. Each LOOPENTRY initialize loopstack for
|
||
// current level.
|
||
// OPEN,CLOSE are numbered at compile time.
|
||
|
||
{ ============================================================= }
|
||
{ ================== Error handling section =================== }
|
||
{ ============================================================= }
|
||
|
||
const
|
||
reeOk = 0;
|
||
reeCompNullArgument = 100;
|
||
reeUnknownMetaSymbol = 101;
|
||
reeCompParseRegTooManyBrackets = 102;
|
||
reeCompParseRegUnmatchedBrackets = 103;
|
||
reeCompParseRegUnmatchedBrackets2 = 104;
|
||
reeCompParseRegJunkOnEnd = 105;
|
||
reeNotQuantifiable = 106;
|
||
reeNestedQuantif = 107;
|
||
reeBadHexDigit = 108;
|
||
reeInvalidRange = 109;
|
||
reeParseAtomTrailingBackSlash = 110;
|
||
reeNoHexCodeAfterBSlashX = 111;
|
||
reeHexCodeAfterBSlashXTooBig = 112;
|
||
reeUnmatchedSqBrackets = 113;
|
||
reeInternalUrp = 114;
|
||
reeQuantifFollowsNothing = 115;
|
||
reeTrailingBackSlash = 116;
|
||
reeNoLetterAfterBSlashC = 117;
|
||
reeMetaCharAfterMinusInRange = 118;
|
||
reeRarseAtomInternalDisaster = 119;
|
||
reeIncorrectSpecialBrackets = 120;
|
||
reeIncorrectBraces = 121;
|
||
reeBRACESArgTooBig = 122;
|
||
reeUnknownOpcodeInFillFirst = 123;
|
||
reeBracesMinParamGreaterMax = 124;
|
||
reeUnclosedComment = 125;
|
||
reeComplexBracesNotImplemented = 126;
|
||
reeUnrecognizedModifier = 127;
|
||
reeBadLinePairedSeparator = 128;
|
||
reeBadUnicodeCategory = 129;
|
||
reeTooSmallCheckersArray = 130;
|
||
reeBadRecursion = 132;
|
||
reeBadSubCall = 133;
|
||
reeNamedGroupBad = 140;
|
||
reeNamedGroupBadName = 141;
|
||
reeNamedGroupBadRef = 142;
|
||
reeNamedGroupDupName = 143;
|
||
reeLookaheadBad = 150;
|
||
reeLookbehindBad = 152;
|
||
reeLookaroundNotSafe = 153;
|
||
reeBadReference = 154;
|
||
// Runtime errors must be >= reeFirstRuntimeCode
|
||
reeFirstRuntimeCode = 1000;
|
||
reeRegRepeatCalledInappropriately = 1000;
|
||
reeMatchPrimMemoryCorruption = 1001;
|
||
reeNoExpression = 1003;
|
||
reeCorruptedProgram = 1004;
|
||
reeOffsetMustBePositive = 1006;
|
||
reeExecNextWithoutExec = 1007;
|
||
reeBadOpcodeInCharClass = 1008;
|
||
reeDumpCorruptedOpcode = 1011;
|
||
reeLoopStackExceeded = 1014;
|
||
reeLoopWithoutEntry = 1015;
|
||
reeUnknown = 1016;
|
||
|
||
function TRegExpr.ErrorMsg(AErrorID: Integer): RegExprString;
|
||
begin
|
||
case AErrorID of
|
||
reeOk:
|
||
Result := 'No errors';
|
||
reeCompNullArgument:
|
||
Result := 'TRegExpr compile: null argument';
|
||
reeUnknownMetaSymbol:
|
||
Result := 'TRegExpr compile: unknown meta-character: \' + fLastErrorSymbol;
|
||
reeCompParseRegTooManyBrackets:
|
||
Result := 'TRegExpr compile: ParseReg: too many ()';
|
||
reeCompParseRegUnmatchedBrackets:
|
||
Result := 'TRegExpr compile: ParseReg: unmatched ()';
|
||
reeCompParseRegUnmatchedBrackets2:
|
||
Result := 'TRegExpr compile: ParseReg: unmatched ()';
|
||
reeCompParseRegJunkOnEnd:
|
||
Result := 'TRegExpr compile: ParseReg: junk at end';
|
||
reeNotQuantifiable:
|
||
Result := 'TRegExpr compile: Token before *+ operand is not quantifiable';
|
||
reeNestedQuantif:
|
||
Result := 'TRegExpr compile: nested quantifier *?+';
|
||
reeBadHexDigit:
|
||
Result := 'TRegExpr compile: bad hex digit';
|
||
reeInvalidRange:
|
||
Result := 'TRegExpr compile: invalid [] range';
|
||
reeParseAtomTrailingBackSlash:
|
||
Result := 'TRegExpr compile: parse atom trailing \';
|
||
reeNoHexCodeAfterBSlashX:
|
||
Result := 'TRegExpr compile: no hex code after \x';
|
||
reeNoLetterAfterBSlashC:
|
||
Result := 'TRegExpr compile: no letter "A".."Z" after \c';
|
||
reeMetaCharAfterMinusInRange:
|
||
Result := 'TRegExpr compile: metachar after "-" in [] range';
|
||
reeHexCodeAfterBSlashXTooBig:
|
||
Result := 'TRegExpr compile: hex code after \x is too big';
|
||
reeUnmatchedSqBrackets:
|
||
Result := 'TRegExpr compile: unmatched []';
|
||
reeInternalUrp:
|
||
Result := 'TRegExpr compile: internal fail on char "|", ")"';
|
||
reeQuantifFollowsNothing:
|
||
Result := 'TRegExpr compile: quantifier ?+*{ follows nothing';
|
||
reeTrailingBackSlash:
|
||
Result := 'TRegExpr compile: trailing \';
|
||
reeRarseAtomInternalDisaster:
|
||
Result := 'TRegExpr compile: RarseAtom internal disaster';
|
||
reeIncorrectSpecialBrackets:
|
||
Result := 'TRegExpr compile: incorrect expression in (?...) brackets';
|
||
reeIncorrectBraces:
|
||
Result := 'TRegExpr compile: incorrect {} braces';
|
||
reeBRACESArgTooBig:
|
||
Result := 'TRegExpr compile: braces {} argument too big';
|
||
reeUnknownOpcodeInFillFirst:
|
||
Result := 'TRegExpr compile: unknown opcode in FillFirstCharSet ('+DumpOp(fLastErrorOpcode)+')';
|
||
reeBracesMinParamGreaterMax:
|
||
Result := 'TRegExpr compile: braces {} min param greater then max';
|
||
reeUnclosedComment:
|
||
Result := 'TRegExpr compile: unclosed (?#comment)';
|
||
reeComplexBracesNotImplemented:
|
||
Result := 'TRegExpr compile: if you use braces {} and non-greedy ops *?, +?, ?? for complex cases, enable {$DEFINE ComplexBraces}';
|
||
reeUnrecognizedModifier:
|
||
Result := 'TRegExpr compile: incorrect modifier';
|
||
reeBadLinePairedSeparator:
|
||
Result := 'TRegExpr compile: LinePairedSeparator must countain two different chars or be empty';
|
||
reeBadUnicodeCategory:
|
||
Result := 'TRegExpr compile: invalid category after \p or \P';
|
||
reeTooSmallCheckersArray:
|
||
Result := 'TRegExpr compile: too small CharCheckers array';
|
||
reeBadRecursion:
|
||
Result := 'TRegExpr compile: bad recursion (?R)';
|
||
reeBadSubCall:
|
||
Result := 'TRegExpr compile: bad subroutine call';
|
||
reeNamedGroupBad:
|
||
Result := 'TRegExpr compile: bad named group';
|
||
reeNamedGroupBadName:
|
||
Result := 'TRegExpr compile: bad identifier in named group';
|
||
reeNamedGroupBadRef:
|
||
Result := 'TRegExpr compile: bad back-reference to named group';
|
||
reeNamedGroupDupName:
|
||
Result := 'TRegExpr compile: named group defined more than once';
|
||
reeLookaheadBad:
|
||
Result := 'TRegExpr compile: bad lookahead';
|
||
reeLookbehindBad:
|
||
Result := 'TRegExpr compile: bad lookbehind';
|
||
reeLookaroundNotSafe:
|
||
Result := 'TRegExpr compile: lookbehind brackets with variable length do not support captures';
|
||
reeBadReference:
|
||
Result := 'TRegExpr compile: invalid syntax for reference to capture group';
|
||
|
||
reeRegRepeatCalledInappropriately:
|
||
Result := 'TRegExpr exec: RegRepeat called inappropriately';
|
||
reeMatchPrimMemoryCorruption:
|
||
Result := 'TRegExpr exec: MatchPrim memory corruption';
|
||
reeNoExpression:
|
||
Result := 'TRegExpr exec: empty expression';
|
||
reeCorruptedProgram:
|
||
Result := 'TRegExpr exec: corrupted opcode (no magic byte)';
|
||
reeOffsetMustBePositive:
|
||
Result := 'TRegExpr exec: offset must be >0';
|
||
reeExecNextWithoutExec:
|
||
Result := 'TRegExpr exec: ExecNext without Exec(Pos)';
|
||
reeBadOpcodeInCharClass:
|
||
Result := 'TRegExpr exec: invalid opcode in char class';
|
||
reeDumpCorruptedOpcode:
|
||
Result := 'TRegExpr dump: corrupted opcode';
|
||
reeLoopStackExceeded:
|
||
Result := 'TRegExpr exec: loop stack exceeded';
|
||
reeLoopWithoutEntry:
|
||
Result := 'TRegExpr exec: loop without loop entry';
|
||
reeUnknown:
|
||
Result := 'TRegExpr exec: unknow error';
|
||
else
|
||
Result := 'Unknown error';
|
||
end;
|
||
end; { of procedure TRegExpr.Error
|
||
-------------------------------------------------------------- }
|
||
|
||
function TRegExpr.LastError: Integer;
|
||
begin
|
||
Result := fLastError;
|
||
fLastError := reeOk;
|
||
end; { of function TRegExpr.LastError
|
||
-------------------------------------------------------------- }
|
||
|
||
{ ============================================================= }
|
||
{ ===================== Common section ======================== }
|
||
{ ============================================================= }
|
||
|
||
class function TRegExpr.VersionMajor: Integer;
|
||
begin
|
||
Result := REVersionMajor;
|
||
end;
|
||
|
||
class function TRegExpr.VersionMinor: Integer;
|
||
begin
|
||
Result := REVersionMinor;
|
||
end;
|
||
|
||
constructor TRegExpr.Create;
|
||
begin
|
||
inherited;
|
||
programm := nil;
|
||
fExpression := '';
|
||
fInputString := '';
|
||
|
||
FillChar(fModifiers, SizeOf(fModifiers), 0);
|
||
fModifiers.I := RegExprModifierI;
|
||
fModifiers.R := RegExprModifierR;
|
||
fModifiers.S := RegExprModifierS;
|
||
fModifiers.G := RegExprModifierG;
|
||
fModifiers.M := RegExprModifierM;
|
||
fModifiers.X := RegExprModifierX;
|
||
|
||
{$IFDEF UseSpaceChars}
|
||
SpaceChars := RegExprSpaceChars;
|
||
{$ENDIF}
|
||
{$IFDEF UseWordChars}
|
||
WordChars := RegExprWordChars;
|
||
{$ENDIF}
|
||
|
||
{$IFDEF UseLineSep}
|
||
fLineSeparators := RegExprLineSeparators;
|
||
{$ENDIF}
|
||
|
||
fUsePairedBreak := RegExprUsePairedBreak;
|
||
fReplaceLineEnd := RegExprReplaceLineBreak;
|
||
|
||
fSlowChecksSizeMax := 2000;
|
||
FAllowUnsafeLookBehind := False;
|
||
fRaiseForRuntimeError := True;
|
||
|
||
{$IFDEF UseLineSep}
|
||
InitLineSepArray;
|
||
{$ENDIF}
|
||
|
||
InitCharCheckers;
|
||
|
||
{$IFDEF Compat}
|
||
fInvertCase := OldInvertCase;
|
||
{$ENDIF}
|
||
end; { of constructor TRegExpr.Create
|
||
-------------------------------------------------------------- }
|
||
|
||
{ TRegExprGroupNameList }
|
||
|
||
function TRegExprGroupNameList.MatchIndexFromName(const AName: RegExprString
|
||
): Integer;
|
||
var
|
||
i: Integer;
|
||
begin
|
||
for i := 0 to NameCount - 1 do
|
||
if Names[i].Name = AName then
|
||
begin
|
||
Result := Names[i].Index;
|
||
Exit;
|
||
end;
|
||
Result := -1;
|
||
end;
|
||
|
||
procedure TRegExprGroupNameList.Clear;
|
||
begin
|
||
NameCount := 0;
|
||
if Length(Names) > RegexGroupCountIncrement then
|
||
SetLength(Names, RegexGroupCountIncrement);
|
||
end;
|
||
|
||
procedure TRegExprGroupNameList.Add(const AName: RegExprString; AnIndex: Integer
|
||
);
|
||
begin
|
||
if NameCount >= Length(Names) then
|
||
SetLength(Names, Length(Names) + 1 + RegexGroupCountIncrement);
|
||
Names[NameCount].Name := AName;
|
||
Names[NameCount].Index := AnIndex;
|
||
inc(NameCount);
|
||
end;
|
||
|
||
{$IFDEF OverMeth}
|
||
constructor TRegExpr.Create(const AExpression: RegExprString);
|
||
begin
|
||
Create;
|
||
Expression := AExpression;
|
||
end;
|
||
{$ENDIF}
|
||
|
||
destructor TRegExpr.Destroy;
|
||
begin
|
||
if programm <> nil then
|
||
begin
|
||
FreeMem(programm);
|
||
programm := nil;
|
||
end;
|
||
end;
|
||
|
||
procedure TRegExpr.SetExpression(const AStr: RegExprString);
|
||
begin
|
||
if (AStr <> fExpression) or not IsCompiled then
|
||
begin
|
||
fExpression := AStr;
|
||
//UniqueString(fExpression);
|
||
fRegexStart := PRegExprChar(fExpression);
|
||
fRegexEnd := fRegexStart + Length(fExpression);
|
||
InvalidateProgramm;
|
||
end;
|
||
end;
|
||
|
||
function TRegExpr.GetSubExprCount: Integer;
|
||
begin
|
||
Result := -1;
|
||
// if nothing found, we must return -1 per TRegExpr docs
|
||
if (GrpBounds[0].GrpStart[0] <> nil) then
|
||
Result := GrpCount;
|
||
end;
|
||
|
||
function TRegExpr.GetMatchPos(Idx: Integer): PtrInt;
|
||
begin
|
||
Result := -1;
|
||
if (Idx < 0) or (Idx >= Length(GrpBounds[0].GrpStart)) then
|
||
Exit;
|
||
if (Idx >= 0) and (GrpBounds[0].GrpStart[Idx] <> nil) then
|
||
Result := GrpBounds[0].GrpStart[Idx] - fInputStart + 1;
|
||
end;
|
||
|
||
function TRegExpr.GetMatchLen(Idx: Integer): PtrInt;
|
||
begin
|
||
Result := -1;
|
||
if (Idx < 0) or (Idx >= Length(GrpBounds[0].GrpStart)) then
|
||
Exit;
|
||
if (Idx >= 0) and (GrpBounds[0].GrpStart[Idx] <> nil) then
|
||
Result := GrpBounds[0].GrpEnd[Idx] - GrpBounds[0].GrpStart[Idx];
|
||
end;
|
||
|
||
function TRegExpr.GetMatch(Idx: Integer): RegExprString;
|
||
begin
|
||
Result := '';
|
||
if (Idx < 0) or (Idx >= Length(GrpBounds[0].GrpStart)) then
|
||
Exit;
|
||
if (Idx >= 0) and (GrpBounds[0].GrpStart[Idx] <> nil) and
|
||
(GrpBounds[0].GrpEnd[Idx] > GrpBounds[0].GrpStart[Idx])
|
||
then
|
||
SetString(Result, GrpBounds[0].GrpStart[Idx], GrpBounds[0].GrpEnd[Idx] - GrpBounds[0].GrpStart[Idx]);
|
||
end;
|
||
|
||
function TRegExpr.MatchIndexFromName(const AName: RegExprString): Integer;
|
||
begin
|
||
Result := GrpNames.MatchIndexFromName(AName);
|
||
end;
|
||
|
||
function TRegExpr.MatchFromName(const AName: RegExprString): RegExprString;
|
||
var
|
||
Idx: Integer;
|
||
begin
|
||
Result := '';
|
||
Idx := GrpNames.MatchIndexFromName(AName);
|
||
if Idx >= 0 then
|
||
Result := GetMatch(Idx)
|
||
else
|
||
Result := '';
|
||
end;
|
||
|
||
function TRegExpr.GetModifierStr: RegExprString;
|
||
begin
|
||
Result := '-';
|
||
|
||
if ModifierI then
|
||
Result := 'i' + Result
|
||
else
|
||
Result := Result + 'i';
|
||
if ModifierR then
|
||
Result := 'r' + Result
|
||
else
|
||
Result := Result + 'r';
|
||
if ModifierS then
|
||
Result := 's' + Result
|
||
else
|
||
Result := Result + 's';
|
||
if ModifierG then
|
||
Result := 'g' + Result
|
||
else
|
||
Result := Result + 'g';
|
||
if ModifierM then
|
||
Result := 'm' + Result
|
||
else
|
||
Result := Result + 'm';
|
||
if ModifierX then
|
||
Result := 'x' + Result
|
||
else
|
||
Result := Result + 'x';
|
||
|
||
if Result[Length(Result)] = '-' // remove '-' if all modifiers are 'On'
|
||
then
|
||
System.Delete(Result, Length(Result), 1);
|
||
end; { of function TRegExpr.GetModifierStr
|
||
-------------------------------------------------------------- }
|
||
|
||
procedure TRegExpr.SetModifierG(AValue: Boolean);
|
||
begin
|
||
if fModifiers.G <> AValue then
|
||
begin
|
||
fModifiers.G := AValue;
|
||
InvalidateProgramm;
|
||
end;
|
||
end;
|
||
|
||
procedure TRegExpr.SetModifierI(AValue: Boolean);
|
||
begin
|
||
if fModifiers.I <> AValue then
|
||
begin
|
||
fModifiers.I := AValue;
|
||
InvalidateProgramm;
|
||
end;
|
||
end;
|
||
|
||
procedure TRegExpr.SetModifierM(AValue: Boolean);
|
||
begin
|
||
if fModifiers.M <> AValue then
|
||
begin
|
||
fModifiers.M := AValue;
|
||
InvalidateProgramm;
|
||
end;
|
||
end;
|
||
|
||
procedure TRegExpr.SetModifierR(AValue: Boolean);
|
||
begin
|
||
if fModifiers.R <> AValue then
|
||
begin
|
||
fModifiers.R := AValue;
|
||
InvalidateProgramm;
|
||
end;
|
||
end;
|
||
|
||
procedure TRegExpr.SetModifierS(AValue: Boolean);
|
||
begin
|
||
if fModifiers.S <> AValue then
|
||
begin
|
||
fModifiers.S := AValue;
|
||
InvalidateProgramm;
|
||
end;
|
||
end;
|
||
|
||
procedure TRegExpr.SetModifierX(AValue: Boolean);
|
||
begin
|
||
if fModifiers.X <> AValue then
|
||
begin
|
||
fModifiers.X := AValue;
|
||
InvalidateProgramm;
|
||
end;
|
||
end;
|
||
|
||
procedure TRegExpr.SetModifierStr(const AStr: RegExprString);
|
||
begin
|
||
if ParseModifiers(PRegExprChar(AStr), Length(AStr), fModifiers) then
|
||
InvalidateProgramm
|
||
else
|
||
Error(reeUnrecognizedModifier);
|
||
end;
|
||
|
||
{ ============================================================= }
|
||
{ ==================== Compiler section ======================= }
|
||
{ ============================================================= }
|
||
|
||
{$IFDEF FastUnicodeData}
|
||
function TRegExpr.IsWordChar(AChar: REChar): Boolean;
|
||
begin
|
||
// bit 7 in value: is word char
|
||
Result := CharCategoryArray[Ord(AChar)] and 128 <> 0;
|
||
end;
|
||
|
||
(*
|
||
// Unicode General Category
|
||
UGC_UppercaseLetter = 0; Lu
|
||
UGC_LowercaseLetter = 1; Ll
|
||
UGC_TitlecaseLetter = 2; Lt
|
||
UGC_ModifierLetter = 3; Lm
|
||
UGC_OtherLetter = 4; Lo
|
||
|
||
UGC_NonSpacingMark = 5; Mn
|
||
UGC_CombiningMark = 6; Mc
|
||
UGC_EnclosingMark = 7; Me
|
||
|
||
UGC_DecimalNumber = 8; Nd
|
||
UGC_LetterNumber = 9; Nl
|
||
UGC_OtherNumber = 10; No
|
||
|
||
UGC_ConnectPunctuation = 11; Pc
|
||
UGC_DashPunctuation = 12; Pd
|
||
UGC_OpenPunctuation = 13; Ps
|
||
UGC_ClosePunctuation = 14; Pe
|
||
UGC_InitialPunctuation = 15; Pi
|
||
UGC_FinalPunctuation = 16; Pf
|
||
UGC_OtherPunctuation = 17; Po
|
||
|
||
UGC_MathSymbol = 18; Sm
|
||
UGC_CurrencySymbol = 19; Sc
|
||
UGC_ModifierSymbol = 20; Sk
|
||
UGC_OtherSymbol = 21; So
|
||
|
||
UGC_SpaceSeparator = 22; Zs
|
||
UGC_LineSeparator = 23; Zl
|
||
UGC_ParagraphSeparator = 24; Zp
|
||
|
||
UGC_Control = 25; Cc
|
||
UGC_Format = 26; Cf
|
||
UGC_Surrogate = 27; Cs
|
||
UGC_PrivateUse = 28; Co
|
||
UGC_Unassigned = 29; Cn
|
||
*)
|
||
|
||
const
|
||
CategoryNames: array[0..29] of array[0..1] of REChar = (
|
||
('L', 'u'),
|
||
('L', 'l'),
|
||
('L', 't'),
|
||
('L', 'm'),
|
||
('L', 'o'),
|
||
('M', 'n'),
|
||
('M', 'c'),
|
||
('M', 'e'),
|
||
('N', 'd'),
|
||
('N', 'l'),
|
||
('N', 'o'),
|
||
('P', 'c'),
|
||
('P', 'd'),
|
||
('P', 's'),
|
||
('P', 'e'),
|
||
('P', 'i'),
|
||
('P', 'f'),
|
||
('P', 'o'),
|
||
('S', 'm'),
|
||
('S', 'c'),
|
||
('S', 'k'),
|
||
('S', 'o'),
|
||
('Z', 's'),
|
||
('Z', 'l'),
|
||
('Z', 'p'),
|
||
('C', 'c'),
|
||
('C', 'f'),
|
||
('C', 's'),
|
||
('C', 'o'),
|
||
('C', 'n')
|
||
);
|
||
|
||
function IsCategoryFirstChar(AChar: REChar): Boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
|
||
begin
|
||
case AChar of
|
||
'L', 'M', 'N', 'P', 'S', 'C', 'Z':
|
||
Result := True;
|
||
else
|
||
Result := False;
|
||
end;
|
||
end;
|
||
|
||
function IsCategoryChars(AChar, AChar2: REChar): Boolean;
|
||
var
|
||
i: Integer;
|
||
begin
|
||
for i := Low(CategoryNames) to High(CategoryNames) do
|
||
if (AChar = CategoryNames[i][0]) then
|
||
if (AChar2 = CategoryNames[i][1]) then
|
||
begin
|
||
Result := True;
|
||
Exit
|
||
end;
|
||
Result := False;
|
||
end;
|
||
|
||
function CheckCharCategory(AChar: REChar; Ch0, Ch1: REChar): Boolean;
|
||
// AChar: check this char against opcode
|
||
// Ch0, Ch1: opcode operands after OP_*CATEGORY
|
||
var
|
||
N: Byte;
|
||
Name0, Name1: REChar;
|
||
begin
|
||
Result := False;
|
||
// bits 0..6 are category
|
||
N := CharCategoryArray[Ord(AChar)] and 127;
|
||
if N <= High(CategoryNames) then
|
||
begin
|
||
Name0 := CategoryNames[N][0];
|
||
Name1 := CategoryNames[N][1];
|
||
if Ch0 <> Name0 then Exit;
|
||
if Ch1 <> #0 then
|
||
if Ch1 <> Name1 then Exit;
|
||
Result := True;
|
||
end;
|
||
end;
|
||
|
||
function MatchOneCharCategory(opnd, scan: PRegExprChar): Boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
|
||
// opnd: points to opcode operands after OP_*CATEGORY
|
||
// scan: points into InputString
|
||
begin
|
||
Result := CheckCharCategory(scan^, opnd^, (opnd + 1)^);
|
||
end;
|
||
|
||
{$ELSE}
|
||
function TRegExpr.IsWordChar(AChar: REChar): Boolean;
|
||
begin
|
||
{$IFDEF UseWordChars}
|
||
Result := Pos(AChar, fWordChars) > 0;
|
||
{$ELSE}
|
||
case AChar of
|
||
'a' .. 'z',
|
||
'A' .. 'Z',
|
||
'0' .. '9', '_':
|
||
Result := True
|
||
else
|
||
Result := False;
|
||
end;
|
||
{$ENDIF}
|
||
end;
|
||
{$ENDIF}
|
||
|
||
function TRegExpr.IsSpaceChar(AChar: REChar): Boolean;
|
||
begin
|
||
{$IFDEF UseSpaceChars}
|
||
Result := Pos(AChar, fSpaceChars) > 0;
|
||
{$ELSE}
|
||
case AChar of
|
||
' ', #$9, #$A, #$D, #$C:
|
||
Result := True
|
||
else
|
||
Result := False;
|
||
end;
|
||
{$ENDIF}
|
||
end;
|
||
|
||
function TRegExpr.IsCustomLineSeparator(AChar: REChar): Boolean;
|
||
begin
|
||
{$IFDEF UseLineSep}
|
||
{$IFDEF UnicodeRE}
|
||
Result := Pos(AChar, fLineSeparators) > 0;
|
||
{$ELSE}
|
||
Result := fLineSepArray[Byte(AChar)];
|
||
{$ENDIF}
|
||
{$ELSE}
|
||
case AChar of
|
||
#$d, #$a,
|
||
{$IFDEF UnicodeRE}
|
||
#$85, #$2028, #$2029,
|
||
{$ENDIF}
|
||
#$b, #$c:
|
||
Result := True;
|
||
else
|
||
Result := False;
|
||
end;
|
||
{$ENDIF}
|
||
end;
|
||
|
||
function IsDigitChar(AChar: REChar): Boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
|
||
begin
|
||
case AChar of
|
||
'0' .. '9':
|
||
Result := True;
|
||
else
|
||
Result := False;
|
||
end;
|
||
end;
|
||
|
||
function IsHorzSeparator(AChar: REChar): Boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
|
||
begin
|
||
// Tab and Unicode categoty "Space Separator": https://www.compart.com/en/unicode/category/Zs
|
||
case AChar of
|
||
#9, #$20, #$A0:
|
||
Result := True;
|
||
{$IFDEF UnicodeRE}
|
||
#$1680, #$2000 .. #$200A, #$202F, #$205F, #$3000:
|
||
Result := True;
|
||
{$ENDIF}
|
||
else
|
||
Result := False;
|
||
end;
|
||
end;
|
||
|
||
function IsVertLineSeparator(AChar: REChar): Boolean; {$IFDEF InlineFuncs}inline;{$ENDIF}
|
||
begin
|
||
case AChar of
|
||
#$d, #$a, #$b, #$c:
|
||
Result := True;
|
||
{$IFDEF UnicodeRE}
|
||
#$2028, #$2029, #$85:
|
||
Result := True;
|
||
{$ENDIF}
|
||
else
|
||
Result := False;
|
||
end;
|
||
end;
|
||
|
||
procedure TRegExpr.InvalidateProgramm;
|
||
begin
|
||
if programm <> nil then
|
||
begin
|
||
FreeMem(programm);
|
||
programm := nil;
|
||
end;
|
||
end; { of procedure TRegExpr.InvalidateProgramm
|
||
-------------------------------------------------------------- }
|
||
|
||
procedure TRegExpr.Compile;
|
||
begin
|
||
if fExpression = '' then
|
||
begin
|
||
Error(reeNoExpression);
|
||
Exit;
|
||
end;
|
||
|
||
CompileRegExpr(fRegexStart);
|
||
end; { of procedure TRegExpr.Compile
|
||
-------------------------------------------------------------- }
|
||
|
||
{$IFDEF UseLineSep}
|
||
procedure TRegExpr.InitLineSepArray;
|
||
{$IFNDEF UnicodeRE}
|
||
var
|
||
i: Integer;
|
||
{$ENDIF}
|
||
begin
|
||
{$IFNDEF UnicodeRE}
|
||
FillChar(fLineSepArray, SizeOf(fLineSepArray), 0);
|
||
for i := 1 to Length(fLineSeparators) do
|
||
fLineSepArray[Byte(fLineSeparators[i])] := True;
|
||
{$ENDIF}
|
||
end;
|
||
{$ENDIF}
|
||
|
||
function TRegExpr.IsProgrammOk: Boolean;
|
||
begin
|
||
Result := False;
|
||
|
||
// check modifiers
|
||
if not IsModifiersEqual(fModifiers, fProgModifiers) then
|
||
InvalidateProgramm;
|
||
|
||
// compile if needed
|
||
if programm = nil then
|
||
begin
|
||
Compile;
|
||
// Check compiled programm
|
||
if programm = nil then
|
||
Exit;
|
||
end;
|
||
|
||
if programm[0] <> OP_MAGIC then
|
||
Error(reeCorruptedProgram)
|
||
else
|
||
Result := True;
|
||
end; { of function TRegExpr.IsProgrammOk
|
||
-------------------------------------------------------------- }
|
||
|
||
procedure TRegExpr.Tail(p: PRegExprChar; val: PRegExprChar);
|
||
// set the next-pointer at the end of a node chain
|
||
var
|
||
scan: PRegExprChar;
|
||
begin
|
||
if p = @regDummy[0] then
|
||
Exit;
|
||
// Find last node.
|
||
scan := regLast(p);
|
||
// Set Next 'pointer'
|
||
if val < scan then
|
||
PRENextOff(AlignToPtr(scan + REOpSz))^ := -(scan - val)
|
||
// work around PWideChar subtraction bug (Delphi uses
|
||
// shr after subtraction to calculate widechar distance %-( )
|
||
// so, if difference is negative we have .. the "feature" :(
|
||
// I could wrap it in $IFDEF UnicodeRE, but I didn't because
|
||
// "P – Q computes the difference between the address given
|
||
// by P (the higher address) and the address given by Q (the
|
||
// lower address)" - Delphi help quotation.
|
||
else
|
||
PRENextOff(AlignToPtr(scan + REOpSz))^ := val - scan;
|
||
end; { of procedure TRegExpr.Tail
|
||
-------------------------------------------------------------- }
|
||
|
||
procedure TRegExpr.OpTail(p: PRegExprChar; val: PRegExprChar);
|
||
// regtail on operand of first argument; nop if operandless
|
||
begin
|
||
// "Operandless" and "op != OP_BRANCH" are synonymous in practice.
|
||
if (p = nil) or (p = @regDummy[0]) or
|
||
(PREOp(p)^ <> OP_BRANCH) and (PREOp(p)^ <> OP_GBRANCH) and
|
||
(PREOp(p)^ <> OP_GBRANCH_EX) and (PREOp(p)^ <> OP_GBRANCH_EX_CI)
|
||
then
|
||
Exit;
|
||
Tail(p + REOpSz + RENextOffSz + REBranchArgSz, val);
|
||
end; { of procedure TRegExpr.OpTail
|
||
-------------------------------------------------------------- }
|
||
|
||
function TRegExpr.EmitNode(op: TREOp): PRegExprChar;
|
||
// emit a node, return location
|
||
begin
|
||
Result := regCode;
|
||
if Result <> @regDummy[0] then
|
||
begin
|
||
PREOp(regCode)^ := op;
|
||
Inc(regCode, REOpSz);
|
||
PRENextOff(AlignToPtr(regCode))^ := 0; // Next "pointer" := nil
|
||
Inc(regCode, RENextOffSz);
|
||
|
||
if (op = OP_EXACTLY) or (op = OP_EXACTLY_CI) then
|
||
regExactlyLen := PLongInt(regCode)
|
||
else
|
||
regExactlyLen := nil;
|
||
|
||
{$IFDEF DebugSynRegExpr}
|
||
if regcode - programm > regCodeSize then
|
||
raise Exception.Create('TRegExpr.EmitNode buffer overrun');
|
||
{$ENDIF}
|
||
end
|
||
else
|
||
Inc(regCodeSize, REOpSz + RENextOffSz);
|
||
// compute code size without code generation
|
||
end; { of function TRegExpr.EmitNode
|
||
-------------------------------------------------------------- }
|
||
|
||
function TRegExpr.EmitBranch: PRegExprChar;
|
||
begin
|
||
Result := EmitNode(OP_BRANCH);
|
||
EmitC(#0);
|
||
EmitC(#0);
|
||
end;
|
||
|
||
procedure TRegExpr.EmitC(ch: REChar);
|
||
begin
|
||
if regCode <> @regDummy[0] then
|
||
begin
|
||
regCode^ := ch;
|
||
Inc(regCode);
|
||
{$IFDEF DebugSynRegExpr}
|
||
if regcode - programm > regCodeSize then
|
||
raise Exception.Create('TRegExpr.EmitC buffer overrun');
|
||
{$ENDIF}
|
||
end
|
||
else
|
||
Inc(regCodeSize, REOpSz); // Type of p-code pointer always is ^REChar
|
||
end; { of procedure TRegExpr.EmitC
|
||
-------------------------------------------------------------- }
|
||
|
||
procedure TRegExpr.EmitInt(AValue: LongInt);
|
||
begin
|
||
if regCode <> @regDummy[0] then
|
||
begin
|
||
PLongInt(regCode)^ := AValue;
|
||
Inc(regCode, RENumberSz);
|
||
{$IFDEF DebugSynRegExpr}
|
||
if regcode - programm > regCodeSize then
|
||
raise Exception.Create('TRegExpr.EmitInt buffer overrun');
|
||
{$ENDIF}
|
||
end
|
||
else
|
||
Inc(regCodeSize, RENumberSz);
|
||
end;
|
||
|
||
function TRegExpr.EmitNodeWithGroupIndex(op: TREOp; AIndex: Integer): PRegExprChar;
|
||
begin
|
||
Result := EmitNode(op);
|
||
EmitInt(AIndex); // TReGroupIndex = LongInt;
|
||
end;
|
||
|
||
function TRegExpr.EmitGroupRef(AIndex: Integer; AIgnoreCase: Boolean): PRegExprChar;
|
||
begin
|
||
if AIgnoreCase then
|
||
Result := EmitNode(OP_BSUBEXP_CI)
|
||
else
|
||
Result := EmitNode(OP_BSUBEXP);
|
||
EmitInt(AIndex); // TReGroupIndex = LongInt;
|
||
end;
|
||
|
||
{$IFDEF FastUnicodeData}
|
||
procedure TRegExpr.FindCategoryName(var scan: PRegExprChar; var ch1, ch2: REChar);
|
||
// scan: points into regex string after '\p', to find category name
|
||
// ch1, ch2: 2-char name of category; ch2 can be #0
|
||
var
|
||
ch: REChar;
|
||
pos1, pos2, namePtr: PRegExprChar;
|
||
nameLen: Integer;
|
||
begin
|
||
ch1 := #0;
|
||
ch2 := #0;
|
||
ch := scan^;
|
||
if IsCategoryFirstChar(ch) then
|
||
begin
|
||
ch1 := ch;
|
||
Exit;
|
||
end;
|
||
if ch = '{' then
|
||
begin
|
||
pos1 := scan;
|
||
pos2 := pos1;
|
||
while (pos2 < fRegexEnd) and (pos2^ <> '}') do
|
||
Inc(pos2);
|
||
if pos2 >= fRegexEnd then
|
||
Error(reeIncorrectBraces);
|
||
|
||
namePtr := pos1+1;
|
||
nameLen := pos2-pos1-1;
|
||
Inc(scan, nameLen+1);
|
||
|
||
if nameLen<1 then
|
||
Error(reeBadUnicodeCategory);
|
||
if nameLen>2 then
|
||
Error(reeBadUnicodeCategory);
|
||
|
||
if nameLen = 1 then
|
||
begin
|
||
ch1 := namePtr^;
|
||
ch2 := #0;
|
||
if not IsCategoryFirstChar(ch1) then
|
||
Error(reeBadUnicodeCategory);
|
||
Exit;
|
||
end;
|
||
|
||
if nameLen = 2 then
|
||
begin
|
||
ch1 := namePtr^;
|
||
ch2 := (namePtr+1)^;
|
||
if not IsCategoryChars(ch1, ch2) then
|
||
Error(reeBadUnicodeCategory);
|
||
Exit;
|
||
end;
|
||
end
|
||
else
|
||
Error(reeBadUnicodeCategory);
|
||
end;
|
||
|
||
function TRegExpr.EmitCategoryMain(APositive: Boolean): PRegExprChar;
|
||
var
|
||
ch, ch2: REChar;
|
||
begin
|
||
Inc(regParse);
|
||
if regParse >= fRegexEnd then
|
||
Error(reeBadUnicodeCategory);
|
||
FindCategoryName(regParse, ch, ch2);
|
||
if APositive then
|
||
Result := EmitNode(OP_ANYCATEGORY)
|
||
else
|
||
Result := EmitNode(OP_NOTCATEGORY);
|
||
EmitC(ch);
|
||
EmitC(ch2);
|
||
end;
|
||
{$ENDIF}
|
||
|
||
procedure TRegExpr.InsertOperator(op: TREOp; opnd: PRegExprChar; sz: Integer);
|
||
// insert an operator in front of already-emitted operand
|
||
// Means relocating the operand.
|
||
var
|
||
src, dst, place: PRegExprChar;
|
||
i: Integer;
|
||
begin
|
||
if regCode = @regDummy[0] then
|
||
begin
|
||
Inc(regCodeSize, sz);
|
||
Exit;
|
||
end;
|
||
// move code behind insert position
|
||
src := regCode;
|
||
Inc(regCode, sz);
|
||
{$IFDEF DebugSynRegExpr}
|
||
if regCode - programm > regCodeSize then
|
||
raise Exception.Create('TRegExpr.InsertOperator buffer overrun');
|
||
if fSecondPass and ( (opnd<regCodeWork) or (opnd-regCodeWork>regCodeSize) ) then
|
||
raise Exception.Create('TRegExpr.InsertOperator invalid opnd');
|
||
{$ENDIF}
|
||
dst := regCode;
|
||
while src > opnd do
|
||
begin
|
||
Dec(dst);
|
||
Dec(src);
|
||
dst^ := src^;
|
||
end;
|
||
place := opnd; // Op node, where operand used to be.
|
||
PREOp(place)^ := op;
|
||
Inc(place, REOpSz);
|
||
for i := 1 + REOpSz to sz do
|
||
begin
|
||
place^ := #0;
|
||
Inc(place);
|
||
end;
|
||
for i := 0 to regNumBrackets - 1 do
|
||
if (GrpOpCodes[i] <> nil) and (GrpOpCodes[i] >= opnd) then
|
||
GrpOpCodes[i] := GrpOpCodes[i] + sz;
|
||
end; { of procedure TRegExpr.InsertOperator
|
||
-------------------------------------------------------------- }
|
||
|
||
procedure TRegExpr.RemoveOperator(opnd: PRegExprChar; sz: Integer);
|
||
// remove an operator in front of already-emitted operand
|
||
// Means relocating the operand.
|
||
var
|
||
src, dst: PRegExprChar;
|
||
i: Integer;
|
||
begin
|
||
if regCode = @regDummy[0] then
|
||
begin
|
||
// Do not decrement regCodeSize => the fSecondPass may temporary fill the extra memory;
|
||
Exit;
|
||
end;
|
||
// move code behind insert position
|
||
{$IFDEF DebugSynRegExpr}
|
||
if fSecondPass and ( (opnd<regCodeWork) or (opnd>=regCodeWork+regCodeSize) ) then
|
||
raise Exception.Create('TRegExpr.RemoveOperator() invalid opnd');
|
||
if (sz > regCodeSize-(opnd-regCodeWork)) then
|
||
raise Exception.Create('TRegExpr.RemoveOperator buffer underrun');
|
||
{$ENDIF}
|
||
src := opnd + sz;
|
||
dst := opnd;
|
||
while src < regCode do
|
||
begin
|
||
dst^ := src^;
|
||
Inc(dst);
|
||
Inc(src);
|
||
end;
|
||
Dec(regCode, sz);
|
||
for i := 0 to regNumBrackets - 1 do
|
||
if (GrpOpCodes[i] <> nil) and (GrpOpCodes[i] > opnd) then
|
||
GrpOpCodes[i] := GrpOpCodes[i] - sz;
|
||
end;
|
||
|
||
function FindSkippedMetaLen(PStart, PEnd: PRegExprChar): Integer; {$IFDEF InlineFuncs}inline;{$ENDIF}
|
||
// find length of initial segment of PStart string consisting
|
||
// entirely of characters not from IsMetaSymbol1.
|
||
begin
|
||
Result := 0;
|
||
while PStart < PEnd do
|
||
begin
|
||
if _IsMetaSymbol1(PStart^) then
|
||
Exit;
|
||
Inc(Result);
|
||
Inc(PStart)
|
||
end;
|
||
end;
|
||
|
||
const
|
||
// Flags to be passed up and down.
|
||
FLAG_WORST = 0; // Worst case
|
||
FLAG_HASWIDTH = 1; // Cannot match empty string
|
||
FLAG_SIMPLE = 2; // Simple enough to be OP_STAR/OP_PLUS/OP_BRACES operand
|
||
FLAG_SPECSTART = 4; // Starts with * or +
|
||
FLAG_LOOP = 8; // Has eithe *, + or {,n} with n>=2
|
||
FLAG_GREEDY = 16; // Has any greedy code
|
||
FLAG_NOT_QUANTIFIABLE = 64; // "Piece" (ParsePiece) is look-around
|
||
|
||
{$IFDEF UnicodeRE}
|
||
RusRangeLoLow = #$430; // 'а'
|
||
RusRangeLoHigh = #$44F; // 'я'
|
||
RusRangeHiLow = #$410; // 'А'
|
||
RusRangeHiHigh = #$42F; // 'Я'
|
||
{$ELSE}
|
||
RusRangeLoLow = #$E0; // 'а' in cp1251
|
||
RusRangeLoHigh = #$FF; // 'я' in cp1251
|
||
RusRangeHiLow = #$C0; // 'А' in cp1251
|
||
RusRangeHiHigh = #$DF; // 'Я' in cp1251
|
||
{$ENDIF}
|
||
|
||
function TRegExpr.FindInCharClass(ABuffer: PRegExprChar; AChar: REChar): Boolean;
|
||
// Buffer contains char pairs: (Kind, Data), where Kind is one of OpKind_ values,
|
||
// and Data depends on Kind
|
||
var
|
||
OpKind: REChar;
|
||
{$IFDEF FastUnicodeData}
|
||
ch, ch2: REChar;
|
||
{$ENDIF}
|
||
N: integer;
|
||
begin
|
||
repeat
|
||
OpKind := ABuffer^;
|
||
case OpKind of
|
||
OpKind_End:
|
||
begin
|
||
Result := False;
|
||
Exit;
|
||
end;
|
||
|
||
OpKind_Range:
|
||
begin
|
||
Inc(ABuffer);
|
||
if (AChar >= ABuffer^) then
|
||
begin
|
||
Inc(ABuffer);
|
||
if (AChar <= ABuffer^) then
|
||
begin
|
||
Result := True;
|
||
Exit;
|
||
end;
|
||
Inc(ABuffer);
|
||
end
|
||
else
|
||
Inc(ABuffer, 2);
|
||
end;
|
||
|
||
OpKind_MetaClass:
|
||
begin
|
||
Inc(ABuffer);
|
||
N := Ord(ABuffer^);
|
||
if CharCheckers[N](AChar) then
|
||
begin
|
||
Result := True;
|
||
Exit
|
||
end;
|
||
Inc(ABuffer);
|
||
end;
|
||
|
||
OpKind_Char:
|
||
begin
|
||
Inc(ABuffer);
|
||
N := PLongInt(ABuffer)^;
|
||
Inc(ABuffer, RENumberSz);
|
||
repeat
|
||
if ABuffer^ = AChar then
|
||
begin
|
||
Result := True;
|
||
Exit;
|
||
end;
|
||
Inc(ABuffer);
|
||
dec(n);
|
||
until n = 0;
|
||
end;
|
||
|
||
{$IFDEF FastUnicodeData}
|
||
OpKind_CategoryYes,
|
||
OpKind_CategoryNo:
|
||
begin
|
||
Inc(ABuffer);
|
||
ch := ABuffer^;
|
||
Inc(ABuffer);
|
||
ch2 := ABuffer^;
|
||
Inc(ABuffer);
|
||
Result := CheckCharCategory(AChar, ch, ch2);
|
||
if OpKind = OpKind_CategoryNo then
|
||
Result := not Result;
|
||
if Result then
|
||
Exit;
|
||
end;
|
||
{$ENDIF}
|
||
|
||
{$IFDEF WITH_REGEX_ASSERT}
|
||
else
|
||
Error(reeBadOpcodeInCharClass);
|
||
{$ENDIF}
|
||
end;
|
||
until False; // assume that Buffer is ended correctly
|
||
end;
|
||
|
||
|
||
procedure TRegExpr.GetCharSetFromWordChars(var ARes: TRegExprCharSet);
|
||
{$IFDEF UseWordChars}
|
||
var
|
||
i: Integer;
|
||
ch: REChar;
|
||
{$ENDIF}
|
||
begin
|
||
{$IFDEF UseWordChars}
|
||
ARes := [];
|
||
for i := 1 to Length(fWordChars) do
|
||
begin
|
||
ch := fWordChars[i];
|
||
{$IFDEF UnicodeRE}
|
||
if Ord(ch) <= $FF then
|
||
{$ENDIF}
|
||
Include(ARes, Byte(ch));
|
||
end;
|
||
{$ELSE}
|
||
ARes := RegExprWordSet;
|
||
{$ENDIF}
|
||
end;
|
||
|
||
procedure TRegExpr.GetCharSetFromSpaceChars(var ARes: TRegExprCharset);
|
||
{$IFDEF UseSpaceChars}
|
||
var
|
||
i: Integer;
|
||
ch: REChar;
|
||
{$ENDIF}
|
||
begin
|
||
{$IFDEF UseSpaceChars}
|
||
ARes := [];
|
||
for i := 1 to Length(fSpaceChars) do
|
||
begin
|
||
ch := fSpaceChars[i];
|
||
{$IFDEF UnicodeRE}
|
||
if Ord(ch) <= $FF then
|
||
{$ENDIF}
|
||
Include(ARes, Byte(ch));
|
||
end;
|
||
{$ELSE}
|
||
ARes := RegExprSpaceSet;
|
||
{$ENDIF}
|
||
end;
|
||
|
||
procedure TRegExpr.GetCharSetFromCharClass(ABuffer: PRegExprChar; AIgnoreCase: Boolean; var ARes: TRegExprCharset);
|
||
var
|
||
ch, ch2: REChar;
|
||
TempSet: TRegExprCharSet;
|
||
N, i: Integer;
|
||
begin
|
||
ARes := [];
|
||
TempSet := [];
|
||
repeat
|
||
case ABuffer^ of
|
||
OpKind_End:
|
||
Exit;
|
||
|
||
OpKind_Range:
|
||
begin
|
||
Inc(ABuffer);
|
||
ch := ABuffer^;
|
||
Inc(ABuffer);
|
||
ch2 := ABuffer^;
|
||
{$IFDEF UnicodeRE}
|
||
if Ord(ch2) > $FF then
|
||
ch2 := REChar($FF);
|
||
{$ENDIF}
|
||
Inc(ABuffer);
|
||
for i := Ord(ch) to Ord(ch2) do
|
||
begin
|
||
Include(ARes, Byte(i));
|
||
if AIgnoreCase then
|
||
Include(ARes, Byte(InvertCase(REChar(i))));
|
||
end;
|
||
end;
|
||
|
||
OpKind_MetaClass:
|
||
begin
|
||
Inc(ABuffer);
|
||
N := Ord(ABuffer^);
|
||
Inc(ABuffer);
|
||
|
||
if N = CheckerIndex_Word then
|
||
begin
|
||
GetCharSetFromWordChars(TempSet);
|
||
ARes := ARes + TempSet;
|
||
end
|
||
else
|
||
if N = CheckerIndex_NotWord then
|
||
begin
|
||
GetCharSetFromWordChars(TempSet);
|
||
ARes := ARes + (RegExprAllSet - TempSet);
|
||
end
|
||
else
|
||
if N = CheckerIndex_Space then
|
||
begin
|
||
GetCharSetFromSpaceChars(TempSet);
|
||
ARes := ARes + TempSet;
|
||
end
|
||
else
|
||
if N = CheckerIndex_NotSpace then
|
||
begin
|
||
GetCharSetFromSpaceChars(TempSet);
|
||
ARes := ARes + (RegExprAllSet - TempSet);
|
||
end
|
||
else
|
||
if N = CheckerIndex_Digit then
|
||
ARes := ARes + RegExprDigitSet
|
||
else
|
||
if N = CheckerIndex_NotDigit then
|
||
ARes := ARes + (RegExprAllSet - RegExprDigitSet)
|
||
else
|
||
if N = CheckerIndex_VertSep then
|
||
ARes := ARes + RegExprLineSeparatorsSet
|
||
else
|
||
if N = CheckerIndex_NotVertSep then
|
||
ARes := ARes + (RegExprAllSet - RegExprLineSeparatorsSet)
|
||
else
|
||
if N = CheckerIndex_HorzSep then
|
||
ARes := ARes + RegExprHorzSeparatorsSet
|
||
else
|
||
if N = CheckerIndex_NotHorzSep then
|
||
ARes := ARes + (RegExprAllSet - RegExprHorzSeparatorsSet)
|
||
else
|
||
if N = CheckerIndex_LowerAZ then
|
||
begin
|
||
if AIgnoreCase then
|
||
ARes := ARes + RegExprAllAzSet
|
||
else
|
||
ARes := ARes + RegExprLowerAzSet;
|
||
end
|
||
else
|
||
if N = CheckerIndex_UpperAZ then
|
||
begin
|
||
if AIgnoreCase then
|
||
ARes := ARes + RegExprAllAzSet
|
||
else
|
||
ARes := ARes + RegExprUpperAzSet;
|
||
end
|
||
else
|
||
if N = CheckerIndex_AnyLineBreak then
|
||
begin
|
||
ARes := ARes + RegExprLineSeparatorsSet;
|
||
//we miss U+2028 and U+2029 here
|
||
end
|
||
else
|
||
Error(reeBadOpcodeInCharClass);
|
||
end;
|
||
|
||
OpKind_Char:
|
||
begin
|
||
Inc(ABuffer);
|
||
N := PLongInt(ABuffer)^;
|
||
Inc(ABuffer, RENumberSz);
|
||
for i := 1 to N do
|
||
begin
|
||
ch := ABuffer^;
|
||
Inc(ABuffer);
|
||
{$IFDEF UnicodeRE}
|
||
if Ord(ch) <= $FF then
|
||
{$ENDIF}
|
||
begin
|
||
Include(ARes, Byte(ch));
|
||
if AIgnoreCase then
|
||
Include(ARes, Byte(InvertCase(ch)));
|
||
end;
|
||
end;
|
||
end;
|
||
|
||
{$IFDEF FastUnicodeData}
|
||
OpKind_CategoryYes,
|
||
OpKind_CategoryNo:
|
||
begin
|
||
// usage of FirstCharSet makes no sense for regex with \p \P
|
||
ARes := RegExprAllSet;
|
||
Exit;
|
||
end;
|
||
{$ENDIF}
|
||
|
||
{$IFDEF WITH_REGEX_ASSERT}
|
||
else
|
||
Error(reeBadOpcodeInCharClass);
|
||
{$ENDIF}
|
||
end;
|
||
until False; // assume that Buffer is ended correctly
|
||
end;
|
||
|
||
|
||
function TRegExpr.GetModifierG: Boolean;
|
||
begin
|
||
Result := fModifiers.G;
|
||
end;
|
||
|
||
function TRegExpr.GetModifierI: Boolean;
|
||
begin
|
||
Result := fModifiers.I;
|
||
end;
|
||
|
||
function TRegExpr.GetModifierM: Boolean;
|
||
begin
|
||
Result := fModifiers.M;
|
||
end;
|
||
|
||
function TRegExpr.GetModifierR: Boolean;
|
||
begin
|
||
Result := fModifiers.R;
|
||
end;
|
||
|
||
function TRegExpr.GetModifierS: Boolean;
|
||
begin
|
||
Result := fModifiers.S;
|
||
end;
|
||
|
||
function TRegExpr.GetModifierX: Boolean;
|
||
begin
|
||
Result := fModifiers.X;
|
||
end;
|
||
|
||
function TRegExpr.CompileRegExpr(ARegExp: PRegExprChar): Boolean;
|
||
// Compile a regular expression into internal code
|
||
// We can't allocate space until we know how big the compiled form will be,
|
||
// but we can't compile it (and thus know how big it is) until we've got a
|
||
// place to put the code. So we cheat: we compile it twice, once with code
|
||
// generation turned off and size counting turned on, and once "for real".
|
||
// This also means that we don't allocate space until we are sure that the
|
||
// thing really will compile successfully, and we never have to move the
|
||
// code and thus invalidate pointers into it. (Note that it has to be in
|
||
// one piece because free() must be able to free it all.)
|
||
// Beware that the optimization-preparation code in here knows about some
|
||
// of the structure of the compiled regexp.
|
||
var
|
||
scan, scanTemp, longest, longestTemp: PRegExprChar;
|
||
Len, LenTemp: Integer;
|
||
FlagTemp, MaxMatchLen: integer;
|
||
op: TREOp;
|
||
begin
|
||
Result := False;
|
||
FlagTemp := 0;
|
||
regParse := nil; // for correct error handling
|
||
regExactlyLen := nil;
|
||
|
||
GrpCount := 0;
|
||
ParsedGrpCount := 0;
|
||
GrpNames.Clear;
|
||
fLastError := reeOk;
|
||
fLastErrorOpcode := TREOp(0);
|
||
hasRecursion := False;
|
||
|
||
try
|
||
if programm <> nil then
|
||
begin
|
||
FreeMem(programm);
|
||
programm := nil;
|
||
end;
|
||
|
||
if ARegExp = nil then
|
||
begin
|
||
Error(reeCompNullArgument);
|
||
Exit;
|
||
end;
|
||
|
||
fProgModifiers := fModifiers;
|
||
// well, may it's paranoia. I'll check it later.
|
||
|
||
// First pass: calculate opcode size, validate regex
|
||
fSecondPass := False;
|
||
fCompModifiers := fModifiers;
|
||
regParse := ARegExp;
|
||
regNumBrackets := 1;
|
||
regNumAtomicBrackets := 0;
|
||
regCodeSize := 0;
|
||
regCode := @regDummy[0];
|
||
regCodeWork := nil;
|
||
|
||
EmitC(OP_MAGIC);
|
||
if ParseReg(FlagTemp) = nil then begin
|
||
regNumBrackets := 0; // Not calling InitInternalGroupData => array sizes not adjusted for FillChar
|
||
regNumAtomicBrackets := 0;
|
||
Exit;
|
||
end;
|
||
|
||
// Allocate memory
|
||
GetMem(programm, regCodeSize * SizeOf(REChar));
|
||
InitInternalGroupData;
|
||
|
||
// Second pass: emit opcode
|
||
fSecondPass := True;
|
||
fCompModifiers := fModifiers;
|
||
regParse := ARegExp;
|
||
regNumBrackets := 1;
|
||
regNumAtomicBrackets := 0;
|
||
GrpCount := ParsedGrpCount;
|
||
ParsedGrpCount := 0;
|
||
regCode := programm;
|
||
regCodeWork := programm + REOpSz;
|
||
|
||
EmitC(OP_MAGIC);
|
||
if ParseReg(FlagTemp) = nil then
|
||
Exit;
|
||
|
||
// Dig out information for optimizations.
|
||
IsFixedLengthEx(op, FMinMatchLen, MaxMatchLen);
|
||
{$IFDEF UseFirstCharSet}
|
||
FirstCharSet := [];
|
||
FillFirstCharSet(regCodeWork);
|
||
for Len := 0 to 255 do
|
||
FirstCharArray[Len] := Byte(Len) in FirstCharSet;
|
||
{$ENDIF}
|
||
|
||
regAnchored := raNone;
|
||
regMust := nil;
|
||
regMustLen := 0;
|
||
regMustString := '';
|
||
|
||
scan := regCodeWork; // First OP_BRANCH.
|
||
// Starting-point info.
|
||
if PREOp(scan)^ = OP_BOL then
|
||
regAnchored := raBOL
|
||
else
|
||
if PREOp(scan)^ = OP_EOL then
|
||
regAnchored := raEOL
|
||
else
|
||
if PREOp(scan)^ = OP_CONTINUE_POS then
|
||
regAnchored := raContinue
|
||
else
|
||
// ".*", ".*?", ".*+" at the very start of the pattern, only need to be
|
||
// tested from the start-pos of the InputString.
|
||
// If a pattern matches, then the ".*" will always go forward to where the
|
||
// rest of the pattern starts matching
|
||
// OP_ANY is "ModifierS=True"
|
||
if (PREOp(scan)^ = OP_STAR) or (PREOp(scan)^ = OP_STAR_NG) or (PREOp(scan)^ = OP_STAR_POSS) then begin
|
||
scanTemp := AlignToInt(scan + REOpSz + RENextOffSz);
|
||
if PREOp(scanTemp)^ = OP_ANY then
|
||
regAnchored := raOnlyOnce;
|
||
end
|
||
else
|
||
// "{0,} is the same as ".*". So the same optimization applies
|
||
if (PREOp(scan)^ = OP_BRACES) or (PREOp(scan)^ = OP_BRACES_NG) or (PREOp(scan)^ = OP_BRACES_POSS) then begin
|
||
scanTemp := AlignToInt(scan + REOpSz + RENextOffSz);
|
||
if (PREBracesArg(scanTemp)^ = 0) // BracesMinCount
|
||
and (PREBracesArg(scanTemp + REBracesArgSz)^ = MaxBracesArg) // BracesMaxCount
|
||
then begin
|
||
scanTemp := AlignToPtr(scanTemp + REBracesArgSz + REBracesArgSz);
|
||
if PREOp(scanTemp)^ = OP_ANY then
|
||
regAnchored := raOnlyOnce;
|
||
end;
|
||
end;
|
||
|
||
// If there's something expensive in the r.e., find the longest
|
||
// literal string that must appear and make it the regMust. Resolve
|
||
// ties in favor of later strings, since the regstart check works
|
||
// with the beginning of the r.e. and avoiding duplication
|
||
// strengthens checking. Not a strong reason, but sufficient in the
|
||
// absence of others.
|
||
if (FlagTemp and FLAG_SPECSTART) <> 0 then
|
||
begin
|
||
longest := nil;
|
||
Len := 0;
|
||
while scan <> nil do
|
||
begin
|
||
if PREOp(scan)^ = OP_EXACTLY then
|
||
begin
|
||
longestTemp := scan + REOpSz + RENextOffSz + RENumberSz;
|
||
LenTemp := PLongInt(scan + REOpSz + RENextOffSz)^;
|
||
if LenTemp >= Len then
|
||
begin
|
||
longest := longestTemp;
|
||
Len := LenTemp;
|
||
end;
|
||
end;
|
||
scan := regNext(scan);
|
||
end;
|
||
regMust := longest;
|
||
regMustLen := Len;
|
||
if regMustLen > 1 then // don't use regMust if too short
|
||
SetString(regMustString, regMust, regMustLen);
|
||
end;
|
||
|
||
Result := True;
|
||
|
||
finally
|
||
begin
|
||
if not Result then
|
||
InvalidateProgramm;
|
||
end;
|
||
end;
|
||
|
||
end; { of function TRegExpr.CompileRegExpr
|
||
-------------------------------------------------------------- }
|
||
|
||
function TRegExpr.ParseReg(var FlagParse: Integer): PRegExprChar;
|
||
begin
|
||
Result := DoParseReg(False, nil, FlagParse, OP_NONE, OP_COMMENT); // can't use OP_NONE // The "ender" op will not be omitted anyway
|
||
end;
|
||
|
||
function TRegExpr.DoParseReg(InBrackets: Boolean; BracketCounter: PInteger;
|
||
var FlagParse: Integer; BeginGroupOp, EndGroupOP: TReOp): PRegExprChar;
|
||
// regular expression, i.e. main body or parenthesized thing
|
||
// Caller must absorb opening parenthesis.
|
||
// Combining parenthesis handling with the base level of regular expression
|
||
// is a trifle forced, but the need to tie the tails of the branches to what
|
||
// follows makes it hard to avoid.
|
||
var
|
||
ret, br, ender, brStart: PRegExprChar;
|
||
NBrackets: Integer;
|
||
FlagTemp: Integer;
|
||
SavedModifiers: TRegExprModifiers;
|
||
HasGBranch, HasChoice: Boolean;
|
||
begin
|
||
Result := nil;
|
||
FlagTemp := 0;
|
||
FlagParse := FLAG_HASWIDTH; // Tentatively.
|
||
NBrackets := 0;
|
||
SavedModifiers := fCompModifiers;
|
||
|
||
// Make an OP_OPEN node, if parenthesized.
|
||
ret := nil;
|
||
if InBrackets then
|
||
begin
|
||
if BracketCounter <> nil then begin
|
||
if BracketCounter^ >= RegexMaxMaxGroups then
|
||
begin
|
||
Error(reeCompParseRegTooManyBrackets);
|
||
Exit;
|
||
end;
|
||
NBrackets := BracketCounter^;
|
||
Inc(BracketCounter^);
|
||
if BeginGroupOp <> OP_NONE then
|
||
ret := EmitNodeWithGroupIndex(BeginGroupOp, NBrackets);
|
||
if fSecondPass and (BracketCounter = @regNumBrackets) then
|
||
GrpOpCodes[NBrackets] := ret;
|
||
end
|
||
else
|
||
if BeginGroupOp <> OP_NONE then
|
||
ret := EmitNode(BeginGroupOp);
|
||
end;
|
||
|
||
// Pick up the branches, linking them together.
|
||
br := ParseBranch(FlagTemp);
|
||
brStart := br;
|
||
if br = nil then
|
||
begin
|
||
Result := nil;
|
||
Exit;
|
||
end;
|
||
if ret <> nil then
|
||
Tail(ret, br) // OP_OPEN -> first.
|
||
else
|
||
ret := br;
|
||
if (FlagTemp and FLAG_HASWIDTH) = 0 then
|
||
FlagParse := FlagParse and not FLAG_HASWIDTH;
|
||
FlagParse := FlagParse or FlagTemp and (FLAG_SPECSTART or FLAG_LOOP or FLAG_GREEDY);
|
||
HasGBranch := False;
|
||
HasChoice := regParse^ = '|';
|
||
while (regParse^ = '|') do
|
||
begin
|
||
Inc(regParse);
|
||
br := ParseBranch(FlagTemp);
|
||
if br = nil then
|
||
begin
|
||
Result := nil;
|
||
Exit;
|
||
end;
|
||
if br^ <> OP_BRANCH then
|
||
HasGBranch := True;
|
||
Tail(ret, br); // OP_BRANCH -> OP_BRANCH.
|
||
if (FlagTemp and FLAG_HASWIDTH) = 0 then
|
||
FlagParse := FlagParse and not FLAG_HASWIDTH;
|
||
FlagParse := FlagParse or FlagTemp and (FLAG_SPECSTART or FLAG_LOOP or FLAG_GREEDY);
|
||
end;
|
||
if fSecondPass then begin
|
||
if HasGBranch then begin
|
||
if brStart^ = OP_BRANCH then
|
||
brStart^ := OP_GBRANCH;
|
||
end
|
||
else
|
||
if not HasChoice then
|
||
RemoveOperator(brStart, REOpSz + RENextOffSz + REBranchArgSz);
|
||
end;
|
||
|
||
// Make a closing node, and hook it on the end.
|
||
if InBrackets and (EndGroupOP <> OP_NONE) then begin
|
||
if BracketCounter <> nil then
|
||
ender := EmitNodeWithGroupIndex(EndGroupOP, NBrackets)
|
||
else
|
||
ender := EmitNode(EndGroupOP);
|
||
end
|
||
else
|
||
if (EndGroupOP = OP_NONE) then begin
|
||
if HasChoice then
|
||
ender := EmitNode(OP_COMMENT) // need something to hook the branches' tails too
|
||
else
|
||
ender := nil;
|
||
end
|
||
else
|
||
ender := EmitNode(OP_EEND);
|
||
|
||
if ender <> nil then begin
|
||
Tail(ret, ender);
|
||
|
||
// Hook the tails of the branches to the closing node.
|
||
br := ret;
|
||
while br <> nil do
|
||
begin
|
||
OpTail(br, ender);
|
||
br := regNext(br);
|
||
end;
|
||
end;
|
||
|
||
// Check for proper termination.
|
||
if InBrackets then
|
||
if regParse^ <> ')' then
|
||
begin
|
||
Error(reeCompParseRegUnmatchedBrackets);
|
||
Exit;
|
||
end
|
||
else
|
||
Inc(regParse); // skip trailing ')'
|
||
if (not InBrackets) and (regParse < fRegexEnd) then
|
||
begin
|
||
if regParse^ = ')' then
|
||
Error(reeCompParseRegUnmatchedBrackets2)
|
||
else
|
||
Error(reeCompParseRegJunkOnEnd);
|
||
Exit;
|
||
end;
|
||
fCompModifiers := SavedModifiers; // restore modifiers of parent
|
||
Result := ret;
|
||
end; { of function TRegExpr.ParseReg
|
||
-------------------------------------------------------------- }
|
||
|
||
function TRegExpr.ParseBranch(var FlagParse: Integer): PRegExprChar;
|
||
// one alternative of an | operator
|
||
// Implements the concatenation operator.
|
||
var
|
||
ret, chain, latest: PRegExprChar;
|
||
FlagTemp: Integer;
|
||
begin
|
||
FlagTemp := 0;
|
||
FlagParse := FLAG_WORST; // Tentatively.
|
||
|
||
ret := EmitBranch;
|
||
chain := nil;
|
||
while (regParse < fRegexEnd) and (regParse^ <> '|') and (regParse^ <> ')') do
|
||
begin
|
||
latest := ParsePiece(FlagTemp);
|
||
if latest = nil then
|
||
begin
|
||
Result := nil;
|
||
Exit;
|
||
end;
|
||
if fSecondPass and
|
||
(latest <> nil) and (latest^ = OP_COMMENT) and
|
||
( ((regParse < fRegexEnd) and (regParse^ <> '|') and (regParse^ <> ')')) or
|
||
(chain <> nil)
|
||
)
|
||
then begin
|
||
regCode := latest;
|
||
continue;
|
||
end;
|
||
|
||
FlagParse := FlagParse or FlagTemp and (FLAG_HASWIDTH or FLAG_LOOP or FLAG_GREEDY);
|
||
if chain = nil // First piece.
|
||
then begin
|
||
FlagParse := FlagParse or FlagTemp and FLAG_SPECSTART;
|
||
MaybeGuardBranchPiece(ret);
|
||
end
|
||
else
|
||
Tail(chain, latest);
|
||
chain := latest;
|
||
end;
|
||
if chain = nil // Loop ran zero times.
|
||
then
|
||
EmitNode(OP_NOTHING);
|
||
Result := ret;
|
||
end; { of function TRegExpr.ParseBranch
|
||
-------------------------------------------------------------- }
|
||
|
||
procedure TRegExpr.MaybeGuardBranchPiece(piece: PRegExprChar);
|
||
var
|
||
opnd: PRegExprChar;
|
||
ch: REChar;
|
||
begin
|
||
if not fSecondPass then
|
||
exit;
|
||
|
||
opnd := piece + REOpSz + RENextOffSz + REBranchArgSz;
|
||
while opnd <> nil do begin
|
||
case opnd^ of
|
||
OP_OPEN, OP_OPEN_ATOMIC, OP_CLOSE, OP_CLOSE_ATOMIC,
|
||
OP_COMMENT,
|
||
OP_BOL, OP_CONTINUE_POS, OP_RESET_MATCHPOS,
|
||
OP_BOUND, OP_NOTBOUND,
|
||
OP_BACK:
|
||
opnd := regNext(opnd);
|
||
OP_PLUS, OP_PLUS_NG, OP_PLUS_POSS:
|
||
opnd := opnd + REOpSz + RENextOffSz;
|
||
OP_BRACES, OP_BRACES_NG, OP_BRACES_POSS:
|
||
begin
|
||
if PREBracesArg(AlignToPtr(opnd + REOpSz + RENextOffSz))^ >= 1 then
|
||
opnd := opnd + REOpSz + RENextOffSz + 2*REBracesArgSz;
|
||
break;
|
||
end;
|
||
OP_LOOPENTRY:
|
||
begin
|
||
if PREBracesArg(AlignToInt(regNext(opnd) + REOpSz + RENextOffSz))^ >= 1 then
|
||
opnd := opnd + REOpSz + RENextOffSz;
|
||
break;
|
||
end;
|
||
OP_LOOKAHEAD: // could contain OP_OPEN....
|
||
begin
|
||
if ( ((opnd + 1 + RENextOffSz)^ = OP_EXACTLY) or
|
||
((opnd + 1 + RENextOffSz)^ = OP_EXACTLY_CI)
|
||
)
|
||
then begin
|
||
opnd := (opnd + 1 + RENextOffSz);
|
||
break;
|
||
end
|
||
else
|
||
opnd := regNext(regNext(opnd));
|
||
end;
|
||
OP_LOOKAHEAD_NEG, OP_LOOKBEHIND, OP_LOOKBEHIND_NEG:
|
||
opnd := regNext(regNext(opnd));
|
||
else
|
||
break;
|
||
end;
|
||
end;
|
||
if opnd <> nil then
|
||
case opnd^ of
|
||
OP_EXACTLY: begin
|
||
piece^ := OP_GBRANCH_EX;
|
||
ch := (opnd + REOpSz + RENextOffSz + RENumberSz)^;
|
||
(piece + REOpSz + RENextOffSz)^ := ch;
|
||
end;
|
||
OP_EXACTLY_CI: begin
|
||
piece^ := OP_GBRANCH_EX_CI;
|
||
ch := (opnd + REOpSz + RENextOffSz + RENumberSz)^;
|
||
(piece + REOpSz + RENextOffSz)^ := _UpperCase(ch);
|
||
(piece + REOpSz + RENextOffSz + 1)^ := _LowerCase(ch);
|
||
end;
|
||
end;
|
||
end;
|
||
|
||
function TRegExpr.ParsePiece(var FlagParse: Integer): PRegExprChar;
|
||
// something followed by possible [*+?{]
|
||
// Note that the branching code sequences used for ? and the general cases
|
||
// of * and + and { are somewhat optimized: they use the same OP_NOTHING node as
|
||
// both the endmarker for their branch list and the body of the last branch.
|
||
// It might seem that this node could be dispensed with entirely, but the
|
||
// endmarker role is not redundant.
|
||
|
||
function ParseNumber(AStart, AEnd: PRegExprChar): TREBracesArg;
|
||
begin
|
||
Result := 0;
|
||
if AEnd - AStart + 1 > 8 then
|
||
begin // prevent stupid scanning
|
||
Error(reeBRACESArgTooBig);
|
||
Exit;
|
||
end;
|
||
while AStart <= AEnd do
|
||
begin
|
||
Result := Result * 10 + (Ord(AStart^) - Ord('0'));
|
||
Inc(AStart);
|
||
end;
|
||
if (Result > MaxBracesArg) or (Result < 0) then
|
||
begin
|
||
Error(reeBRACESArgTooBig);
|
||
Exit;
|
||
end;
|
||
end;
|
||
|
||
var
|
||
TheOp: TREOp;
|
||
NextNode: PRegExprChar;
|
||
|
||
procedure EmitComplexBraces(ABracesMin, ABracesMax: TREBracesArg; ANonGreedyOp, APossesive: boolean);
|
||
{$IFDEF ComplexBraces}
|
||
var
|
||
off: TRENextOff;
|
||
{$ENDIF}
|
||
begin
|
||
{$IFNDEF ComplexBraces}
|
||
Error(reeComplexBracesNotImplemented);
|
||
{$ELSE}
|
||
if APossesive then
|
||
TheOp := OP_LOOP_POSS
|
||
else
|
||
if ANonGreedyOp then
|
||
TheOp := OP_LOOP_NG
|
||
else
|
||
TheOp := OP_LOOP;
|
||
InsertOperator(OP_LOOPENTRY, Result, REOpSz + RENextOffSz);
|
||
NextNode := EmitNode(TheOp);
|
||
if regCode <> @regDummy[0] then
|
||
begin
|
||
off := (Result + REOpSz + RENextOffSz) - (regCode - REOpSz - RENextOffSz);
|
||
// back to Atom after OP_LOOPENTRY
|
||
PREBracesArg(AlignToInt(regCode))^ := ABracesMin;
|
||
Inc(regCode, REBracesArgSz);
|
||
PREBracesArg(AlignToInt(regCode))^ := ABracesMax;
|
||
Inc(regCode, REBracesArgSz);
|
||
PRENextOff(AlignToPtr(regCode))^ := off;
|
||
Inc(regCode, RENextOffSz);
|
||
{$IFDEF DebugSynRegExpr}
|
||
if regcode - programm > regCodeSize then
|
||
raise Exception.Create
|
||
('TRegExpr.ParsePiece.EmitComplexBraces buffer overrun');
|
||
{$ENDIF}
|
||
end
|
||
else
|
||
Inc(regCodeSize, REBracesArgSz * 2 + RENextOffSz);
|
||
Tail(Result, NextNode); // OP_LOOPENTRY -> OP_LOOP
|
||
if regCode <> @regDummy[0] then
|
||
Tail(Result + REOpSz + RENextOffSz, NextNode); // Atom -> OP_LOOP
|
||
{$ENDIF}
|
||
end;
|
||
|
||
procedure EmitSimpleBraces(ABracesMin, ABracesMax: TREBracesArg; ANonGreedyOp, APossessive: Boolean);
|
||
begin
|
||
if APossessive then
|
||
TheOp := OP_BRACES_POSS
|
||
else
|
||
if ANonGreedyOp then
|
||
TheOp := OP_BRACES_NG
|
||
else
|
||
TheOp := OP_BRACES;
|
||
InsertOperator(TheOp, Result, REOpSz + RENextOffSz + REBracesArgSz * 2);
|
||
if regCode <> @regDummy[0] then
|
||
begin
|
||
PREBracesArg(AlignToInt(Result + REOpSz + RENextOffSz))^ := ABracesMin;
|
||
PREBracesArg(AlignToInt(Result + REOpSz + RENextOffSz + REBracesArgSz))^ := ABracesMax;
|
||
end;
|
||
end;
|
||
|
||
function DoParseBraceMinMax(var BMin, BMax: TREBracesArg): Boolean;
|
||
var
|
||
p: PRegExprChar;
|
||
begin
|
||
Result := False;
|
||
p := regParse;
|
||
while IsDigitChar(regParse^) do // <min> MUST appear
|
||
Inc(regParse);
|
||
if FAllowBraceWithoutMin and (regParse^ = ',') and (p = regParse) then
|
||
begin
|
||
if not (((regParse+1)^ >= '0') and ((regParse+1)^ <= '9')) then
|
||
Exit;
|
||
BMin := 0
|
||
end
|
||
else
|
||
if (regParse^ <> '}') and (regParse^ <> ',') or (p = regParse) then
|
||
begin
|
||
if not FAllowLiteralBraceWithoutRange then
|
||
Error(reeIncorrectBraces);
|
||
Exit;
|
||
end
|
||
else
|
||
BMin := ParseNumber(p, regParse - 1);
|
||
if regParse^ = ',' then
|
||
begin
|
||
Inc(regParse);
|
||
p := regParse;
|
||
while IsDigitChar(regParse^) do
|
||
Inc(regParse);
|
||
if regParse^ <> '}' then
|
||
begin
|
||
if not FAllowLiteralBraceWithoutRange then
|
||
Error(reeIncorrectBraces);
|
||
Exit;
|
||
end;
|
||
if p = regParse then
|
||
BMax := MaxBracesArg
|
||
else
|
||
BMax := ParseNumber(p, regParse - 1);
|
||
end
|
||
else
|
||
BMax := BMin; // {n} == {n,n}
|
||
Result := True;
|
||
end;
|
||
|
||
function ParseBraceMinMax(var BMin, BMax: TREBracesArg): Boolean;
|
||
begin
|
||
Result := DoParseBraceMinMax(BMin, BMax);
|
||
if Result and (BMin > BMax) then
|
||
begin
|
||
Error(reeBracesMinParamGreaterMax);
|
||
Exit;
|
||
end;
|
||
end;
|
||
|
||
function CheckBraceIsLiteral: Boolean;
|
||
var
|
||
dummyBracesMin, dummyBracesMax: TREBracesArg;
|
||
savedRegParse: PRegExprChar;
|
||
begin
|
||
Result := False;
|
||
if not FAllowLiteralBraceWithoutRange then
|
||
exit;
|
||
savedRegParse := regParse;
|
||
Inc(regParse);
|
||
Result := not DoParseBraceMinMax(dummyBracesMin, dummyBracesMax);
|
||
regParse := savedRegParse;
|
||
end;
|
||
|
||
var
|
||
op, nextch: REChar;
|
||
NonGreedyOp, NonGreedyCh, PossessiveCh: Boolean;
|
||
FlagTemp: Integer;
|
||
BracesMin, BracesMax: TREBracesArg;
|
||
savedRegParse: PRegExprChar;
|
||
begin
|
||
FlagTemp := 0;
|
||
Result := ParseAtom(FlagTemp);
|
||
if Result = nil then
|
||
Exit;
|
||
|
||
op := regParse^;
|
||
if not ((op = '*') or (op = '+') or (op = '?') or (op = '{')) then
|
||
begin
|
||
FlagParse := FlagTemp;
|
||
Exit;
|
||
end;
|
||
|
||
case op of
|
||
'*':
|
||
begin
|
||
if (FlagTemp and FLAG_NOT_QUANTIFIABLE) <> 0 then begin
|
||
Error(reeNotQuantifiable);
|
||
exit;
|
||
end;
|
||
FlagParse := FLAG_WORST or FLAG_SPECSTART or FLAG_LOOP;
|
||
nextch := (regParse + 1)^;
|
||
PossessiveCh := nextch = '+';
|
||
if PossessiveCh then
|
||
begin
|
||
NonGreedyCh := False;
|
||
NonGreedyOp := False;
|
||
end
|
||
else
|
||
begin
|
||
NonGreedyCh := nextch = '?';
|
||
NonGreedyOp := NonGreedyCh or not fCompModifiers.G;
|
||
end;
|
||
if not NonGreedyCh then
|
||
FlagParse := FlagParse or FLAG_GREEDY;
|
||
if (FlagTemp and (FLAG_SIMPLE or FLAG_HASWIDTH)) <> (FLAG_SIMPLE or FLAG_HASWIDTH) then
|
||
begin
|
||
if NonGreedyOp or PossessiveCh or ((FlagTemp and FLAG_HASWIDTH) = 0) then
|
||
EmitComplexBraces(0, MaxBracesArg, NonGreedyOp, PossessiveCh)
|
||
else
|
||
begin
|
||
// Too complex for OP_STAR. Write loop using OP_BRANCH and OP_BACK.
|
||
// 1: OP_BRANCH with 2 branches - to allow backtracking
|
||
// 1st choice: loop-content
|
||
// OP_BACK back to the branch
|
||
// execute another iteration of the branch, so each can backtrack
|
||
// 2nd choice: OP_NOTHING to exit
|
||
InsertOperator(OP_BRANCH, Result, REOpSz + RENextOffSz + REBranchArgSz);
|
||
OpTail(Result, EmitNode(OP_BACK));
|
||
OpTail(Result, Result);
|
||
Tail(Result, EmitBranch);
|
||
Tail(Result, EmitNode(OP_NOTHING));
|
||
MaybeGuardBranchPiece(Result);
|
||
end
|
||
end
|
||
else
|
||
begin // Simple AND has Width
|
||
if PossessiveCh then
|
||
TheOp := OP_STAR_POSS
|
||
else
|
||
if NonGreedyOp then
|
||
TheOp := OP_STAR_NG
|
||
else
|
||
TheOp := OP_STAR;
|
||
InsertOperator(TheOp, Result, REOpSz + RENextOffSz);
|
||
end;
|
||
if NonGreedyCh or PossessiveCh then
|
||
Inc(regParse); // Skip extra char ('?')
|
||
end; { of case '*' }
|
||
'+':
|
||
begin
|
||
if (FlagTemp and FLAG_NOT_QUANTIFIABLE) <> 0 then begin
|
||
Error(reeNotQuantifiable);
|
||
exit;
|
||
end;
|
||
FlagParse := FLAG_WORST or FLAG_SPECSTART or (FlagTemp and FLAG_HASWIDTH) or FLAG_LOOP;
|
||
nextch := (regParse + 1)^;
|
||
PossessiveCh := nextch = '+';
|
||
if PossessiveCh then
|
||
begin
|
||
NonGreedyCh := False;
|
||
NonGreedyOp := False;
|
||
end
|
||
else
|
||
begin
|
||
NonGreedyCh := nextch = '?';
|
||
NonGreedyOp := NonGreedyCh or not fCompModifiers.G;
|
||
end;
|
||
if not NonGreedyCh then
|
||
FlagParse := FlagParse or FLAG_GREEDY;
|
||
if (FlagTemp and (FLAG_SIMPLE or FLAG_HASWIDTH)) <> (FLAG_SIMPLE or FLAG_HASWIDTH) then
|
||
begin
|
||
if NonGreedyOp or PossessiveCh or ((FlagTemp and FLAG_HASWIDTH) = 0) then
|
||
EmitComplexBraces(1, MaxBracesArg, NonGreedyOp, PossessiveCh)
|
||
else
|
||
begin
|
||
// Too complex for OP_PLUS. Write loop using OP_BRANCH and OP_BACK.
|
||
// 1: loop-content
|
||
// 2: OP_BRANCH with 2 choices - to allow backtracking
|
||
// 2a: OP_BACK(1) to match the loop again (goto back, include another iteration of the branch in this choice)
|
||
// 2b: OP_NOTHING to exit, if the loop can match no more (branch 2a did not match)
|
||
NextNode := EmitBranch;
|
||
Tail(Result, NextNode);
|
||
Tail(EmitNode(OP_BACK), Result);
|
||
Tail(NextNode, EmitBranch);
|
||
Tail(Result, EmitNode(OP_NOTHING));
|
||
MaybeGuardBranchPiece(NextNode);
|
||
end
|
||
end
|
||
else
|
||
begin // Simple
|
||
if PossessiveCh then
|
||
TheOp := OP_PLUS_POSS
|
||
else
|
||
if NonGreedyOp then
|
||
TheOp := OP_PLUS_NG
|
||
else
|
||
TheOp := OP_PLUS;
|
||
InsertOperator(TheOp, Result, REOpSz + RENextOffSz);
|
||
end;
|
||
if NonGreedyCh or PossessiveCh then
|
||
Inc(regParse); // Skip extra char ('?')
|
||
end; { of case '+' }
|
||
'?':
|
||
begin
|
||
FlagParse := FLAG_WORST;
|
||
nextch := (regParse + 1)^;
|
||
PossessiveCh := nextch = '+';
|
||
if PossessiveCh then
|
||
begin
|
||
NonGreedyCh := False;
|
||
NonGreedyOp := False;
|
||
end
|
||
else
|
||
begin
|
||
NonGreedyCh := nextch = '?';
|
||
NonGreedyOp := NonGreedyCh or not fCompModifiers.G;
|
||
end;
|
||
if not NonGreedyCh then
|
||
FlagParse := FlagParse or FLAG_GREEDY;
|
||
if NonGreedyOp or PossessiveCh then
|
||
begin // We emit x?? as x{0,1}?
|
||
if (FlagTemp and FLAG_SIMPLE) = 0 then
|
||
begin
|
||
EmitComplexBraces(0, 1, NonGreedyOp, PossessiveCh);
|
||
end
|
||
else
|
||
EmitSimpleBraces(0, 1, NonGreedyOp, PossessiveCh);
|
||
end
|
||
else
|
||
begin // greedy '?'
|
||
InsertOperator(OP_BRANCH, Result, REOpSz + RENextOffSz + REBranchArgSz); // Either x
|
||
Tail(Result, EmitBranch); // or
|
||
NextNode := EmitNode(OP_NOTHING); // nil.
|
||
Tail(Result, NextNode);
|
||
OpTail(Result, NextNode);
|
||
MaybeGuardBranchPiece(Result);
|
||
end;
|
||
if NonGreedyCh or PossessiveCh then
|
||
Inc(regParse); // Skip extra char ('?')
|
||
end; { of case '?' }
|
||
'{':
|
||
begin
|
||
savedRegParse := regParse;
|
||
Inc(regParse);
|
||
if not ParseBraceMinMax(BracesMin, BracesMax) then
|
||
begin
|
||
regParse := savedRegParse;
|
||
Exit;
|
||
end;
|
||
if (FlagTemp and FLAG_NOT_QUANTIFIABLE) <> 0 then begin
|
||
Error(reeNotQuantifiable);
|
||
exit;
|
||
end;
|
||
if BracesMin > 0 then
|
||
FlagParse := FLAG_WORST or (FlagTemp and FLAG_HASWIDTH);
|
||
if BracesMax > 0 then
|
||
FlagParse := FlagParse or FLAG_SPECSTART;
|
||
|
||
nextch := (regParse + 1)^;
|
||
PossessiveCh := nextch = '+';
|
||
if PossessiveCh then
|
||
begin
|
||
NonGreedyCh := False;
|
||
NonGreedyOp := False;
|
||
end
|
||
else
|
||
begin
|
||
NonGreedyCh := nextch = '?';
|
||
NonGreedyOp := NonGreedyCh or not fCompModifiers.G;
|
||
end;
|
||
if not NonGreedyCh then
|
||
FlagParse := FlagParse or FLAG_GREEDY;
|
||
if BracesMax >= 2 then
|
||
FlagParse := FlagParse or FLAG_LOOP;
|
||
if (FlagTemp and (FLAG_SIMPLE or FLAG_HASWIDTH)) = (FLAG_SIMPLE or FLAG_HASWIDTH) then
|
||
EmitSimpleBraces(BracesMin, BracesMax, NonGreedyOp, PossessiveCh)
|
||
else
|
||
begin
|
||
EmitComplexBraces(BracesMin, BracesMax, NonGreedyOp, PossessiveCh);
|
||
end;
|
||
if NonGreedyCh or PossessiveCh then
|
||
Inc(regParse); // Skip extra char '?'
|
||
end; // of case '{'
|
||
// else // here we can't be
|
||
end; { of case op }
|
||
|
||
FlagParse := FlagParse or FlagTemp and (FLAG_LOOP or FLAG_GREEDY);
|
||
Inc(regParse);
|
||
op := regParse^;
|
||
if (op = '*') or (op = '+') or (op = '?') or
|
||
( (op = '{') and not CheckBraceIsLiteral)
|
||
then
|
||
Error(reeNestedQuantif);
|
||
end; { of function TRegExpr.ParsePiece
|
||
-------------------------------------------------------------- }
|
||
|
||
function TRegExpr.HexDig(Ch: REChar): Integer;
|
||
begin
|
||
case Ch of
|
||
'0' .. '9':
|
||
Result := Ord(Ch) - Ord('0');
|
||
'a' .. 'f':
|
||
Result := Ord(Ch) - Ord('a') + 10;
|
||
'A' .. 'F':
|
||
Result := Ord(Ch) - Ord('A') + 10;
|
||
else
|
||
Result := 0;
|
||
Error(reeBadHexDigit);
|
||
end;
|
||
end;
|
||
|
||
function TRegExpr.UnQuoteChar(var APtr, AEnd: PRegExprChar): REChar;
|
||
var
|
||
Ch: REChar;
|
||
begin
|
||
case APtr^ of
|
||
't':
|
||
Result := #$9; // \t => tab (HT/TAB)
|
||
'n':
|
||
Result := #$a; // \n => newline (NL)
|
||
'r':
|
||
Result := #$d; // \r => carriage return (CR)
|
||
'f':
|
||
Result := #$c; // \f => form feed (FF)
|
||
'a':
|
||
Result := #$7; // \a => alarm (bell) (BEL)
|
||
'e':
|
||
Result := #$1b; // \e => escape (ESC)
|
||
'c':
|
||
begin // \cK => code for Ctrl+K
|
||
Result := #0;
|
||
Inc(APtr);
|
||
if APtr >= AEnd then
|
||
Error(reeNoLetterAfterBSlashC);
|
||
Ch := APtr^;
|
||
case Ch of
|
||
'a' .. 'z':
|
||
Result := REChar(Ord(Ch) - Ord('a') + 1);
|
||
'A' .. 'Z':
|
||
Result := REChar(Ord(Ch) - Ord('A') + 1);
|
||
else
|
||
Error(reeNoLetterAfterBSlashC);
|
||
end;
|
||
end;
|
||
'x':
|
||
begin // \x: hex char
|
||
Result := #0;
|
||
Inc(APtr);
|
||
if APtr >= AEnd then
|
||
begin
|
||
Error(reeNoHexCodeAfterBSlashX);
|
||
Exit;
|
||
end;
|
||
if APtr^ = '{' then
|
||
begin // \x{nnnn}
|
||
repeat
|
||
Inc(APtr);
|
||
if APtr >= AEnd then
|
||
begin
|
||
Error(reeNoHexCodeAfterBSlashX);
|
||
Exit;
|
||
end;
|
||
if APtr^ <> '}' then
|
||
begin
|
||
if (Ord(Result) ShR (SizeOf(REChar) * 8 - 4)) and $F <> 0 then
|
||
begin
|
||
Error(reeHexCodeAfterBSlashXTooBig);
|
||
Exit;
|
||
end;
|
||
Result := REChar((Ord(Result) ShL 4) or HexDig(APtr^));
|
||
// HexDig will cause Error if bad hex digit found
|
||
end
|
||
else
|
||
Break;
|
||
until False;
|
||
end
|
||
else
|
||
begin
|
||
Result := REChar(HexDig(APtr^));
|
||
// HexDig will cause Error if bad hex digit found
|
||
Inc(APtr);
|
||
if APtr >= AEnd then
|
||
begin
|
||
Error(reeNoHexCodeAfterBSlashX);
|
||
Exit;
|
||
end;
|
||
Result := REChar((Ord(Result) ShL 4) or HexDig(APtr^));
|
||
// HexDig will cause Error if bad hex digit found
|
||
end;
|
||
end;
|
||
else
|
||
Result := APtr^;
|
||
if (Result <> '_') and IsWordChar(Result) then
|
||
begin
|
||
fLastErrorSymbol := Result;
|
||
Error(reeUnknownMetaSymbol);
|
||
end;
|
||
end;
|
||
end;
|
||
|
||
function TRegExpr.ParseAtom(var FlagParse: Integer): PRegExprChar;
|
||
// the lowest level
|
||
// Optimization: gobbles an entire sequence of ordinary characters so that
|
||
// it can turn them into a single node, which is smaller to store and
|
||
// faster to run. Backslashed characters are exceptions, each becoming a
|
||
// separate node; the code is simpler that way and it's not worth fixing.
|
||
var
|
||
ret, ret2, regLookBehindOption: PRegExprChar;
|
||
RangeBeg, RangeEnd: REChar;
|
||
CanBeRange: Boolean;
|
||
AddrOfLen: PLongInt;
|
||
HasCaseSenseChars: boolean;
|
||
|
||
function ParseNumber(var AParsePos: PRegExprChar; out ANumber: Integer): Boolean;
|
||
begin
|
||
Result := False;
|
||
ANumber := 0;
|
||
while (AParsePos^ >= '0') and (AParsePos^ <= '9') do
|
||
begin
|
||
if ANumber > (High(ANumber)-10) div 10 then
|
||
exit;
|
||
ANumber := ANumber * 10 + (Ord(AParsePos^) - Ord('0'));
|
||
inc(AParsePos);
|
||
end;
|
||
Result := True;
|
||
end;
|
||
|
||
procedure EmitExactly(Ch: REChar);
|
||
var
|
||
cs: Boolean;
|
||
begin
|
||
if fCompModifiers.I then
|
||
ret := EmitNode(OP_EXACTLY_CI)
|
||
else
|
||
ret := EmitNode(OP_EXACTLY);
|
||
EmitInt(1);
|
||
cs := False;
|
||
if fCompModifiers.I then begin
|
||
Ch := _UpperCase(Ch);
|
||
EmitC(Ch);
|
||
if Ch <> _LowerCase(Ch) then
|
||
cs := True;
|
||
end
|
||
else
|
||
EmitC(Ch);
|
||
if not cs then
|
||
PREOp(ret)^ := OP_EXACTLY;
|
||
FlagParse := FlagParse or FLAG_HASWIDTH or FLAG_SIMPLE;
|
||
end;
|
||
|
||
procedure EmitRangeChar(Ch: REChar; AStartOfRange: Boolean);
|
||
begin
|
||
CanBeRange := AStartOfRange;
|
||
if fCompModifiers.I then begin
|
||
Ch := _UpperCase(Ch);
|
||
if Ch <> _LowerCase(Ch) then
|
||
HasCaseSenseChars := True;
|
||
end;
|
||
if AStartOfRange then
|
||
begin
|
||
AddrOfLen := nil;
|
||
RangeBeg := Ch;
|
||
end
|
||
else
|
||
begin
|
||
if AddrOfLen = nil then
|
||
begin
|
||
EmitC(OpKind_Char);
|
||
Pointer(AddrOfLen) := regCode;
|
||
EmitInt(0);
|
||
end;
|
||
Inc(AddrOfLen^);
|
||
EmitC(Ch);
|
||
end;
|
||
end;
|
||
|
||
procedure EmitRangePacked(ch1, ch2: REChar);
|
||
var
|
||
ChkIndex: Integer;
|
||
begin
|
||
AddrOfLen := nil;
|
||
CanBeRange := False;
|
||
|
||
if fCompModifiers.I then
|
||
begin
|
||
ch1 := _UpperCase(ch1);
|
||
ch2 := _UpperCase(ch2);
|
||
if (Ch1 <> _LowerCase(Ch1)) or (Ch2 <> _LowerCase(Ch2)) then
|
||
HasCaseSenseChars := True;
|
||
end;
|
||
|
||
for ChkIndex := Low(CharCheckerInfos) to High(CharCheckerInfos) do
|
||
if (CharCheckerInfos[ChkIndex].CharBegin = ch1) and
|
||
(CharCheckerInfos[ChkIndex].CharEnd = ch2) then
|
||
begin
|
||
EmitC(OpKind_MetaClass);
|
||
EmitC(REChar(CharCheckerInfos[ChkIndex].CheckerIndex));
|
||
Exit;
|
||
end;
|
||
|
||
EmitC(OpKind_Range);
|
||
EmitC(ch1);
|
||
EmitC(ch2);
|
||
end;
|
||
|
||
{$IFDEF FastUnicodeData}
|
||
procedure EmitCategoryInCharClass(APositive: Boolean);
|
||
var
|
||
ch, ch2: REChar;
|
||
begin
|
||
AddrOfLen := nil;
|
||
CanBeRange := False;
|
||
Inc(regParse);
|
||
FindCategoryName(regParse, ch, ch2);
|
||
if APositive then
|
||
EmitC(OpKind_CategoryYes)
|
||
else
|
||
EmitC(OpKind_CategoryNo);
|
||
EmitC(ch);
|
||
EmitC(ch2);
|
||
end;
|
||
{$ENDIF}
|
||
|
||
var
|
||
FlagTemp: Integer;
|
||
Len: Integer;
|
||
SavedPtr: PRegExprChar;
|
||
EnderChar, TempChar: REChar;
|
||
DashForRange: Boolean;
|
||
GrpKind: TREGroupKind;
|
||
GrpName: RegExprString;
|
||
GrpIndex, ALen, RegGrpCountBefore, AMaxLen: integer;
|
||
NextCh: REChar;
|
||
op: TREOp;
|
||
SavedModifiers: TRegExprModifiers;
|
||
begin
|
||
Result := nil;
|
||
FlagTemp := 0;
|
||
FlagParse := FLAG_WORST;
|
||
AddrOfLen := nil;
|
||
GrpIndex := -1;
|
||
|
||
Inc(regParse);
|
||
case (regParse - 1)^ of
|
||
'^':
|
||
begin
|
||
FlagParse := FlagParse or FLAG_NOT_QUANTIFIABLE;
|
||
if not fCompModifiers.M
|
||
{$IFDEF UseLineSep} or (fLineSeparators = '') {$ENDIF} then
|
||
ret := EmitNode(OP_BOL)
|
||
else
|
||
ret := EmitNode(OP_BOL_ML);
|
||
end;
|
||
|
||
'$':
|
||
begin
|
||
FlagParse := FlagParse or FLAG_NOT_QUANTIFIABLE;
|
||
if not fCompModifiers.M
|
||
{$IFDEF UseLineSep} or (fLineSeparators = '') {$ENDIF} then
|
||
ret := EmitNode(OP_EOL)
|
||
else
|
||
ret := EmitNode(OP_EOL_ML);
|
||
end;
|
||
|
||
'.':
|
||
begin
|
||
if fCompModifiers.S then
|
||
begin
|
||
ret := EmitNode(OP_ANY);
|
||
FlagParse := FlagParse or FLAG_HASWIDTH or FLAG_SIMPLE;
|
||
end
|
||
else
|
||
begin // not /s, so emit [^:LineSeparators:]
|
||
ret := EmitNode(OP_ANY_ML);
|
||
FlagParse := FlagParse or FLAG_HASWIDTH; // not so simple ;)
|
||
end;
|
||
end;
|
||
|
||
'[':
|
||
begin
|
||
HasCaseSenseChars := False;
|
||
if regParse^ = '^' then
|
||
begin // Complement of range.
|
||
if fCompModifiers.I then
|
||
ret := EmitNode(OP_ANYBUT_CI)
|
||
else
|
||
ret := EmitNode(OP_ANYBUT);
|
||
Inc(regParse);
|
||
end
|
||
else if fCompModifiers.I then
|
||
ret := EmitNode(OP_ANYOF_CI)
|
||
else
|
||
ret := EmitNode(OP_ANYOF);
|
||
|
||
CanBeRange := False;
|
||
|
||
if regParse^ = ']' then
|
||
begin
|
||
// first ']' inside [] treated as simple char, no need to check '['
|
||
EmitRangeChar(regParse^, (regParse + 1)^ = '-');
|
||
Inc(regParse);
|
||
end;
|
||
|
||
while (regParse < fRegexEnd) and (regParse^ <> ']') do
|
||
begin
|
||
// last '-' inside [] treated as simple dash
|
||
if (regParse^ = '-') and
|
||
((regParse + 1) < fRegexEnd) and
|
||
((regParse + 1)^ = ']') then
|
||
begin
|
||
EmitRangeChar('-', False);
|
||
Inc(regParse);
|
||
Break;
|
||
end;
|
||
|
||
// char '-' which (maybe) makes a range
|
||
if (regParse^ = '-') and ((regParse + 1) < fRegexEnd) and CanBeRange then
|
||
begin
|
||
Inc(regParse);
|
||
RangeEnd := regParse^;
|
||
if RangeEnd = EscChar then
|
||
begin
|
||
if _IsMetaChar((regParse + 1)^) then
|
||
begin
|
||
Error(reeMetaCharAfterMinusInRange);
|
||
Exit;
|
||
end;
|
||
Inc(regParse);
|
||
RangeEnd := UnQuoteChar(regParse, fRegexEnd);
|
||
end;
|
||
|
||
// special handling for Russian range a-YA, add 2 ranges: a-ya and A-YA
|
||
if fCompModifiers.R and
|
||
(RangeBeg = RusRangeLoLow) and (RangeEnd = RusRangeHiHigh) then
|
||
begin
|
||
EmitRangePacked(RusRangeLoLow, RusRangeLoHigh);
|
||
EmitRangePacked(RusRangeHiLow, RusRangeHiHigh);
|
||
end
|
||
else
|
||
begin // standard r.e. handling
|
||
if RangeBeg > RangeEnd then
|
||
begin
|
||
Error(reeInvalidRange);
|
||
Exit;
|
||
end;
|
||
EmitRangePacked(RangeBeg, RangeEnd);
|
||
end;
|
||
Inc(regParse);
|
||
end
|
||
else
|
||
begin
|
||
if regParse^ = EscChar then
|
||
begin
|
||
Inc(regParse);
|
||
if regParse >= fRegexEnd then
|
||
begin
|
||
Error(reeParseAtomTrailingBackSlash);
|
||
Exit;
|
||
end;
|
||
if _IsMetaChar(regParse^) then
|
||
begin
|
||
AddrOfLen := nil;
|
||
CanBeRange := False;
|
||
EmitC(OpKind_MetaClass);
|
||
case regParse^ of
|
||
'w':
|
||
EmitC(REChar(CheckerIndex_Word));
|
||
'W':
|
||
EmitC(REChar(CheckerIndex_NotWord));
|
||
's':
|
||
EmitC(REChar(CheckerIndex_Space));
|
||
'S':
|
||
EmitC(REChar(CheckerIndex_NotSpace));
|
||
'd':
|
||
EmitC(REChar(CheckerIndex_Digit));
|
||
'D':
|
||
EmitC(REChar(CheckerIndex_NotDigit));
|
||
'v':
|
||
EmitC(REChar(CheckerIndex_VertSep));
|
||
'V':
|
||
EmitC(REChar(CheckerIndex_NotVertSep));
|
||
'h':
|
||
EmitC(REChar(CheckerIndex_HorzSep));
|
||
'H':
|
||
EmitC(REChar(CheckerIndex_NotHorzSep));
|
||
'R':
|
||
EmitC(REChar(CheckerIndex_AnyLineBreak));
|
||
else
|
||
Error(reeBadOpcodeInCharClass);
|
||
end;
|
||
end
|
||
else
|
||
{$IFDEF FastUnicodeData}
|
||
if regParse^ = 'p' then
|
||
EmitCategoryInCharClass(True)
|
||
else
|
||
if regParse^ = 'P' then
|
||
EmitCategoryInCharClass(False)
|
||
else
|
||
{$ENDIF}
|
||
begin
|
||
TempChar := UnQuoteChar(regParse, fRegexEnd);
|
||
// False if '-' is last char in []
|
||
DashForRange :=
|
||
(regParse + 2 < fRegexEnd) and
|
||
((regParse + 1)^ = '-') and
|
||
((regParse + 2)^ <> ']');
|
||
EmitRangeChar(TempChar, DashForRange);
|
||
end;
|
||
end
|
||
else
|
||
begin
|
||
// False if '-' is last char in []
|
||
DashForRange :=
|
||
(regParse + 2 < fRegexEnd) and
|
||
((regParse + 1)^ = '-') and
|
||
((regParse + 2)^ <> ']');
|
||
EmitRangeChar(regParse^, DashForRange);
|
||
end;
|
||
Inc(regParse);
|
||
end;
|
||
end; { of while }
|
||
AddrOfLen := nil;
|
||
CanBeRange := False;
|
||
EmitC(OpKind_End);
|
||
if fCompModifiers.I and not HasCaseSenseChars then begin
|
||
if PREOp(ret)^ = OP_ANYBUT_CI then
|
||
PREOp(ret)^ := OP_ANYBUT;
|
||
if PREOp(ret)^ = OP_ANYOF_CI then
|
||
PREOp(ret)^ := OP_ANYOF;
|
||
end;
|
||
if regParse^ <> ']' then
|
||
begin
|
||
Error(reeUnmatchedSqBrackets);
|
||
Exit;
|
||
end;
|
||
Inc(regParse);
|
||
FlagParse := FlagParse or FLAG_HASWIDTH or FLAG_SIMPLE;
|
||
end;
|
||
|
||
'(':
|
||
begin
|
||
GrpKind := gkNormalGroup;
|
||
GrpName := '';
|
||
|
||
// A: detect kind of expression in brackets
|
||
if regParse^ = '?' then
|
||
begin
|
||
NextCh := (regParse + 1)^;
|
||
case NextCh of
|
||
':':
|
||
begin
|
||
// non-capturing group: (?:regex)
|
||
GrpKind := gkNonCapturingGroup;
|
||
Inc(regParse, 2);
|
||
end;
|
||
'>':
|
||
begin
|
||
// atomic group: (?>regex)
|
||
GrpKind := gkAtomicGroup;
|
||
Inc(regParse, 2);
|
||
end;
|
||
'P':
|
||
begin
|
||
if (regParse + 4 >= fRegexEnd) then
|
||
Error(reeNamedGroupBad);
|
||
case (regParse + 2)^ of
|
||
'<':
|
||
begin
|
||
// named group: (?P<name>regex)
|
||
GrpKind := gkNormalGroup;
|
||
FindGroupName(regParse + 3, fRegexEnd, '>', GrpName);
|
||
Inc(regParse, Length(GrpName) + 4);
|
||
end;
|
||
'=':
|
||
begin
|
||
// back-reference to named group: (?P=name)
|
||
GrpKind := gkNamedGroupReference;
|
||
FindGroupName(regParse + 3, fRegexEnd, ')', GrpName);
|
||
Inc(regParse, Length(GrpName) + 4);
|
||
end;
|
||
'>':
|
||
begin
|
||
// subroutine call to named group: (?P>name)
|
||
GrpKind := gkSubCall;
|
||
FindGroupName(regParse + 3, fRegexEnd, ')', GrpName);
|
||
Inc(regParse, Length(GrpName) + 4);
|
||
if fSecondPass then begin
|
||
GrpIndex := GrpNames.MatchIndexFromName(GrpName);
|
||
if GrpIndex < 1 then
|
||
Error(reeNamedGroupBadRef);
|
||
end;
|
||
end;
|
||
else
|
||
Error(reeNamedGroupBad);
|
||
end;
|
||
end;
|
||
'<':
|
||
begin
|
||
// lookbehind: (?<=foo)bar
|
||
case (regParse + 2)^ of
|
||
'=':
|
||
begin
|
||
if (regParse + 4 >= fRegexEnd) then
|
||
Error(reeLookbehindBad);
|
||
GrpKind := gkLookbehind;
|
||
Inc(regParse, 3);
|
||
end;
|
||
'!':
|
||
begin
|
||
if (regParse + 4 >= fRegexEnd) then
|
||
Error(reeLookbehindBad);
|
||
GrpKind := gkLookbehindNeg;
|
||
Inc(regParse, 3);
|
||
end;
|
||
'A'..'Z', 'a'..'z':
|
||
begin
|
||
// named group: (?<name>regex)
|
||
if (regParse + 4 >= fRegexEnd) then
|
||
Error(reeNamedGroupBad);
|
||
GrpKind := gkNormalGroup;
|
||
FindGroupName(regParse + 2, fRegexEnd, '>', GrpName);
|
||
Inc(regParse, Length(GrpName) + 3);
|
||
end;
|
||
else
|
||
Error(reeIncorrectSpecialBrackets);
|
||
end;
|
||
end;
|
||
'=', '!':
|
||
begin
|
||
// lookaheads: foo(?=bar) and foo(?!bar)
|
||
if (regParse + 3 >= fRegexEnd) then
|
||
Error(reeLookaheadBad);
|
||
if NextCh = '=' then
|
||
begin
|
||
GrpKind := gkLookahead;
|
||
end
|
||
else
|
||
begin
|
||
GrpKind := gkLookaheadNeg;
|
||
end;
|
||
Inc(regParse, 2);
|
||
end;
|
||
'#':
|
||
begin
|
||
// (?#comment)
|
||
FlagParse := FlagParse or FLAG_NOT_QUANTIFIABLE;
|
||
GrpKind := gkComment;
|
||
Inc(regParse, 2);
|
||
end;
|
||
'a'..'z', '-':
|
||
begin
|
||
// modifiers string like (?mxr)
|
||
GrpKind := gkModifierString;
|
||
Inc(regParse);
|
||
end;
|
||
'R', '0':
|
||
begin
|
||
// recursion (?R), (?0)
|
||
GrpKind := gkRecursion;
|
||
Inc(regParse, 2);
|
||
if regParse^ <> ')' then
|
||
Error(reeBadRecursion);
|
||
Inc(regParse);
|
||
end;
|
||
'1'..'9':
|
||
begin
|
||
// subroutine call (?1)..(?99)
|
||
GrpKind := gkSubCall;
|
||
Inc(regParse, 1);
|
||
if not ParseNumber(regParse, GrpIndex) or (regParse^ <> ')') then
|
||
begin
|
||
Error(reeBadRecursion);
|
||
Exit;
|
||
end;
|
||
Inc(regParse, 1);
|
||
if fSecondPass and (GrpIndex > GrpCount) then
|
||
Error(reeBadSubCall);
|
||
end;
|
||
'''':
|
||
begin
|
||
// named group: (?'name'regex)
|
||
if (regParse + 4 >= fRegexEnd) then
|
||
Error(reeNamedGroupBad);
|
||
GrpKind := gkNormalGroup;
|
||
FindGroupName(regParse + 2, fRegexEnd, '''', GrpName);
|
||
Inc(regParse, Length(GrpName) + 3);
|
||
end;
|
||
'&':
|
||
begin
|
||
// subroutine call to named group: (?&name)
|
||
if (regParse + 2 >= fRegexEnd) then
|
||
Error(reeBadSubCall);
|
||
GrpKind := gkSubCall;
|
||
FindGroupName(regParse + 2, fRegexEnd, ')', GrpName);
|
||
Inc(regParse, Length(GrpName) + 3);
|
||
if fSecondPass then begin
|
||
GrpIndex := GrpNames.MatchIndexFromName(GrpName);
|
||
if GrpIndex < 1 then
|
||
Error(reeNamedGroupBadRef);
|
||
end;
|
||
end;
|
||
else
|
||
Error(reeIncorrectSpecialBrackets);
|
||
end;
|
||
end;
|
||
|
||
// B: process found kind of brackets
|
||
case GrpKind of
|
||
gkNonCapturingGroup:
|
||
begin
|
||
ret := DoParseReg(True, nil, FlagTemp, OP_NONE, OP_NONE);
|
||
if ret = nil then
|
||
begin
|
||
Result := nil;
|
||
Exit;
|
||
end;
|
||
FlagParse := FlagParse or FlagTemp and (FLAG_HASWIDTH or FLAG_SPECSTART or FLAG_LOOP or FLAG_GREEDY);
|
||
end;
|
||
|
||
gkNormalGroup,
|
||
gkAtomicGroup:
|
||
begin
|
||
// skip this block for one of passes, to not double groups count;
|
||
// must take first pass (we need GrpNames filled)
|
||
if (GrpKind = gkNormalGroup) then begin
|
||
Inc(ParsedGrpCount);
|
||
if (not fSecondPass) and (GrpName <> '') then
|
||
begin
|
||
// first pass
|
||
if GrpNames.MatchIndexFromName(GrpName) >= 0 then
|
||
Error(reeNamedGroupDupName);
|
||
GrpNames.Add(GrpName, ParsedGrpCount);
|
||
end;
|
||
end;
|
||
|
||
if GrpKind = gkAtomicGroup then
|
||
ret := DoParseReg(True, @regNumAtomicBrackets, FlagTemp, OP_OPEN_ATOMIC, OP_CLOSE_ATOMIC)
|
||
else
|
||
ret := DoParseReg(True, @regNumBrackets, FlagTemp, OP_OPEN, OP_CLOSE);
|
||
if ret = nil then
|
||
begin
|
||
Result := nil;
|
||
Exit;
|
||
end;
|
||
FlagParse := FlagParse or FlagTemp and (FLAG_HASWIDTH or FLAG_SPECSTART or FLAG_LOOP or FLAG_GREEDY);
|
||
end;
|
||
|
||
gkLookahead,
|
||
gkLookaheadNeg:
|
||
begin
|
||
case GrpKind of
|
||
gkLookahead: ret := EmitNode(OP_LOOKAHEAD);
|
||
gkLookaheadNeg: ret := EmitNode(OP_LOOKAHEAD_NEG);
|
||
end;
|
||
|
||
Result := DoParseReg(True, nil, FlagTemp, OP_NONE, OP_LOOKAHEAD_END);
|
||
if Result = nil then
|
||
Exit;
|
||
|
||
Tail(ret, regLast(Result));
|
||
FlagParse := FlagParse and not FLAG_HASWIDTH;
|
||
end;
|
||
|
||
gkLookbehind,
|
||
gkLookbehindNeg:
|
||
begin
|
||
case GrpKind of
|
||
gkLookbehind: ret := EmitNode(OP_LOOKBEHIND);
|
||
gkLookbehindNeg: ret := EmitNode(OP_LOOKBEHIND_NEG);
|
||
end;
|
||
regLookBehindOption := regCode;
|
||
if (regCode <> @regDummy[0]) then
|
||
Inc(regCode, ReOpLookBehindOptionsSz)
|
||
else
|
||
Inc(regCodeSize, ReOpLookBehindOptionsSz);
|
||
|
||
RegGrpCountBefore := ParsedGrpCount;
|
||
Result := DoParseReg(True, nil, FlagTemp, OP_NONE, OP_LOOKBEHIND_END);
|
||
if Result = nil then
|
||
Exit;
|
||
|
||
Tail(ret, regLast(Result));
|
||
|
||
if (regCode <> @regDummy[0]) then begin
|
||
ALen := 0;
|
||
ret2 := Result;
|
||
if IsPartFixedLength(ret2, op, ALen, AMaxLen, OP_LOOKBEHIND_END, regCode, [flfSkipLookAround]) then
|
||
PReOpLookBehindOptions(regLookBehindOption)^.IsGreedy := OPT_LOOKBEHIND_FIXED
|
||
else
|
||
if (ParsedGrpCount > RegGrpCountBefore) and (not FAllowUnsafeLookBehind) then
|
||
Error(reeLookaroundNotSafe)
|
||
else
|
||
if (FlagTemp and (FLAG_GREEDY)) = (FLAG_GREEDY) then
|
||
PReOpLookBehindOptions(regLookBehindOption)^.IsGreedy := OPT_LOOKBEHIND_GREEDY
|
||
else
|
||
PReOpLookBehindOptions(regLookBehindOption)^.IsGreedy := OPT_LOOKBEHIND_NON_GREEDY;
|
||
PReOpLookBehindOptions(regLookBehindOption)^.MatchLenMin := ALen;
|
||
PReOpLookBehindOptions(regLookBehindOption)^.MatchLenMax := AMaxLen;
|
||
end;
|
||
|
||
FlagParse := FlagParse and not FLAG_HASWIDTH;
|
||
end;
|
||
|
||
gkNamedGroupReference:
|
||
begin
|
||
Len := GrpNames.MatchIndexFromName(GrpName);
|
||
if fSecondPass and (Len < 0) then
|
||
Error(reeNamedGroupBadRef);
|
||
ret := EmitGroupRef(Len, fCompModifiers.I);
|
||
end;
|
||
|
||
gkModifierString:
|
||
begin
|
||
SavedPtr := regParse;
|
||
while (regParse < fRegexEnd) and (regParse^ <> ')') and (regParse^ <> ':') do
|
||
Inc(regParse);
|
||
SavedModifiers := fCompModifiers;
|
||
if (regParse^ = ':') and ParseModifiers(SavedPtr, regParse - SavedPtr, fCompModifiers) then
|
||
begin
|
||
Inc(regParse); // skip ')'
|
||
ret := DoParseReg(True, nil, FlagTemp, OP_NONE, OP_COMMENT); // can't use OP_NONE // The "ender" op will not be omitted anyway
|
||
fCompModifiers := SavedModifiers;
|
||
if ret = nil then
|
||
begin
|
||
Result := nil;
|
||
Exit;
|
||
end;
|
||
FlagParse := FlagParse or FlagTemp and (FLAG_HASWIDTH or FLAG_SPECSTART or FLAG_LOOP or FLAG_GREEDY);
|
||
end
|
||
else
|
||
if (regParse^ = ')') and ParseModifiers(SavedPtr, regParse - SavedPtr, fCompModifiers) then
|
||
begin
|
||
Inc(regParse); // skip ')'
|
||
ret := EmitNode(OP_COMMENT); // comment
|
||
FlagParse := FlagParse or FLAG_NOT_QUANTIFIABLE;
|
||
end
|
||
else
|
||
begin
|
||
Error(reeUnrecognizedModifier);
|
||
Exit;
|
||
end;
|
||
end;
|
||
|
||
gkComment:
|
||
begin
|
||
while (regParse < fRegexEnd) and (regParse^ <> ')') do
|
||
Inc(regParse);
|
||
if regParse^ <> ')' then
|
||
begin
|
||
Error(reeUnclosedComment);
|
||
Exit;
|
||
end;
|
||
Inc(regParse); // skip ')'
|
||
ret := EmitNode(OP_COMMENT); // comment
|
||
end;
|
||
|
||
gkRecursion:
|
||
begin
|
||
// set FLAG_HASWIDTH to allow compiling of such regex: b(?:m|(?R))*e
|
||
FlagParse := FlagParse or FLAG_HASWIDTH;
|
||
ret := EmitNode(OP_RECUR);
|
||
hasRecursion := True;
|
||
end;
|
||
|
||
gkSubCall:
|
||
begin
|
||
// set FLAG_HASWIDTH like for (?R)
|
||
FlagParse := FlagParse or FLAG_HASWIDTH;
|
||
ret := EmitNodeWithGroupIndex(OP_SUBCALL, GrpIndex);
|
||
hasRecursion := True;
|
||
end;
|
||
end; // case GrpKind of
|
||
end;
|
||
|
||
'|', ')':
|
||
begin // Supposed to be caught earlier.
|
||
Error(reeInternalUrp);
|
||
Exit;
|
||
end;
|
||
|
||
'?', '+', '*':
|
||
begin
|
||
Error(reeQuantifFollowsNothing);
|
||
Exit;
|
||
end;
|
||
|
||
EscChar:
|
||
begin
|
||
if regParse >= fRegexEnd then
|
||
begin
|
||
Error(reeTrailingBackSlash);
|
||
Exit;
|
||
end;
|
||
case regParse^ of
|
||
'b':
|
||
begin
|
||
FlagParse := FlagParse or FLAG_NOT_QUANTIFIABLE;
|
||
ret := EmitNode(OP_BOUND);
|
||
end;
|
||
'B':
|
||
begin
|
||
FlagParse := FlagParse or FLAG_NOT_QUANTIFIABLE;
|
||
ret := EmitNode(OP_NOTBOUND);
|
||
end;
|
||
'A':
|
||
begin
|
||
FlagParse := FlagParse or FLAG_NOT_QUANTIFIABLE;
|
||
ret := EmitNode(OP_BOL);
|
||
end;
|
||
'z':
|
||
begin
|
||
FlagParse := FlagParse or FLAG_NOT_QUANTIFIABLE;
|
||
ret := EmitNode(OP_EOL);
|
||
end;
|
||
'Z':
|
||
begin
|
||
FlagParse := FlagParse or FLAG_NOT_QUANTIFIABLE;
|
||
ret := EmitNode(OP_EOL2);
|
||
end;
|
||
'G':
|
||
begin
|
||
FlagParse := FlagParse or FLAG_NOT_QUANTIFIABLE;
|
||
ret := EmitNode(OP_CONTINUE_POS);
|
||
end;
|
||
'd':
|
||
begin // r.e.extension - any digit ('0' .. '9')
|
||
ret := EmitNode(OP_ANYDIGIT);
|
||
FlagParse := FlagParse or FLAG_HASWIDTH or FLAG_SIMPLE;
|
||
end;
|
||
'D':
|
||
begin // r.e.extension - not digit ('0' .. '9')
|
||
ret := EmitNode(OP_NOTDIGIT);
|
||
FlagParse := FlagParse or FLAG_HASWIDTH or FLAG_SIMPLE;
|
||
end;
|
||
's':
|
||
begin // r.e.extension - any space char
|
||
ret := EmitNode(OP_ANYSPACE);
|
||
FlagParse := FlagParse or FLAG_HASWIDTH or FLAG_SIMPLE;
|
||
end;
|
||
'S':
|
||
begin // r.e.extension - not space char
|
||
ret := EmitNode(OP_NOTSPACE);
|
||
FlagParse := FlagParse or FLAG_HASWIDTH or FLAG_SIMPLE;
|
||
end;
|
||
'w':
|
||
begin // r.e.extension - any english char / digit / '_'
|
||
ret := EmitNode(OP_ANYLETTER);
|
||
FlagParse := FlagParse or FLAG_HASWIDTH or FLAG_SIMPLE;
|
||
end;
|
||
'W':
|
||
begin // r.e.extension - not english char / digit / '_'
|
||
ret := EmitNode(OP_NOTLETTER);
|
||
FlagParse := FlagParse or FLAG_HASWIDTH or FLAG_SIMPLE;
|
||
end;
|
||
'v':
|
||
begin
|
||
ret := EmitNode(OP_ANYVERTSEP);
|
||
FlagParse := FlagParse or FLAG_HASWIDTH or FLAG_SIMPLE;
|
||
end;
|
||
'V':
|
||
begin
|
||
ret := EmitNode(OP_NOTVERTSEP);
|
||
FlagParse := FlagParse or FLAG_HASWIDTH or FLAG_SIMPLE;
|
||
end;
|
||
'h':
|
||
begin
|
||
ret := EmitNode(OP_ANYHORZSEP);
|
||
FlagParse := FlagParse or FLAG_HASWIDTH or FLAG_SIMPLE;
|
||
end;
|
||
'H':
|
||
begin
|
||
ret := EmitNode(OP_NOTHORZSEP);
|
||
FlagParse := FlagParse or FLAG_HASWIDTH or FLAG_SIMPLE;
|
||
end;
|
||
'1' .. '9':
|
||
begin
|
||
if fSecondPass and (Ord(regParse^) - Ord('0') > GrpCount) then
|
||
Error(reeBadReference);
|
||
ret := EmitGroupRef(Ord(regParse^) - Ord('0'), fCompModifiers.I);
|
||
end;
|
||
'g':
|
||
begin
|
||
case (regParse + 1)^ of
|
||
'<', '''':
|
||
begin
|
||
// subroutine call to named group
|
||
case (regParse + 1)^ of
|
||
'<': FindGroupName(regParse + 2, fRegexEnd, '>', GrpName);
|
||
'''': FindGroupName(regParse + 2, fRegexEnd, '''', GrpName);
|
||
end;
|
||
Inc(regParse, Length(GrpName) + 2);
|
||
GrpIndex := GrpNames.MatchIndexFromName(GrpName);
|
||
if fSecondPass and (GrpIndex < 1) then
|
||
Error(reeNamedGroupBadRef);
|
||
ret := EmitNodeWithGroupIndex(OP_SUBCALL, GrpIndex);
|
||
FlagParse := FlagParse or FLAG_HASWIDTH;
|
||
hasRecursion := True;
|
||
end;
|
||
'{':
|
||
begin
|
||
// back-reference to named group
|
||
FindGroupName(regParse + 2, fRegexEnd, '}', GrpName);
|
||
Inc(regParse, Length(GrpName) + 2);
|
||
GrpIndex := GrpNames.MatchIndexFromName(GrpName);
|
||
if fSecondPass and (GrpIndex < 1) then
|
||
Error(reeNamedGroupBadRef);
|
||
ret := EmitGroupRef(GrpIndex, fCompModifiers.I);
|
||
end;
|
||
'0'..'9':
|
||
begin
|
||
inc(regParse);
|
||
if not ParseNumber(regParse, GrpIndex) then begin
|
||
Error(reeBadReference);
|
||
Exit;
|
||
end;
|
||
dec(regParse);
|
||
if GrpIndex = 0 then
|
||
Error(reeBadReference);
|
||
if fSecondPass and (GrpIndex > GrpCount) then
|
||
Error(reeBadReference);
|
||
ret := EmitGroupRef(GrpIndex, fCompModifiers.I);
|
||
end;
|
||
else
|
||
Error(reeBadReference);
|
||
end;
|
||
end;
|
||
'k':
|
||
begin
|
||
// back-reference to named group
|
||
case (regParse + 1)^ of
|
||
'<':
|
||
FindGroupName(regParse + 2, fRegexEnd, '>', GrpName);
|
||
'''':
|
||
FindGroupName(regParse + 2, fRegexEnd, '''', GrpName);
|
||
'{':
|
||
FindGroupName(regParse + 2, fRegexEnd, '}', GrpName);
|
||
else
|
||
Error(reeBadReference);
|
||
end;
|
||
Inc(regParse, Length(GrpName) + 2);
|
||
GrpIndex := GrpNames.MatchIndexFromName(GrpName);
|
||
if fSecondPass and (GrpIndex < 1) then
|
||
Error(reeNamedGroupBadRef);
|
||
ret := EmitGroupRef(GrpIndex, fCompModifiers.I);
|
||
end;
|
||
'K':
|
||
begin
|
||
ret := EmitNode(OP_RESET_MATCHPOS);
|
||
FlagParse := FlagParse or FLAG_NOT_QUANTIFIABLE;
|
||
end;
|
||
{$IFDEF FastUnicodeData}
|
||
'p':
|
||
begin
|
||
ret := EmitCategoryMain(True);
|
||
FlagParse := FlagParse or FLAG_HASWIDTH or FLAG_SIMPLE;
|
||
end;
|
||
'P':
|
||
begin
|
||
ret := EmitCategoryMain(False);
|
||
FlagParse := FlagParse or FLAG_HASWIDTH or FLAG_SIMPLE;
|
||
end;
|
||
{$ENDIF}
|
||
'R':
|
||
begin
|
||
ret := EmitNode(OP_ANYLINEBREAK);
|
||
FlagParse := FlagParse or FLAG_HASWIDTH or FLAG_SIMPLE;
|
||
end;
|
||
else
|
||
EmitExactly(UnQuoteChar(regParse, fRegexEnd));
|
||
end; { of case }
|
||
Inc(regParse);
|
||
end;
|
||
|
||
else
|
||
begin
|
||
Dec(regParse);
|
||
if fCompModifiers.X and // check for eXtended syntax
|
||
((regParse^ = '#') or IsIgnoredChar(regParse^)) then
|
||
begin // \x
|
||
if regParse^ = '#' then
|
||
begin // Skip eXtended comment
|
||
// find comment terminator (group of \n and/or \r)
|
||
while (regParse < fRegexEnd) and (regParse^ <> #$d) and
|
||
(regParse^ <> #$a) do
|
||
Inc(regParse);
|
||
while (regParse^ = #$d) or (regParse^ = #$a)
|
||
// skip comment terminator
|
||
do
|
||
Inc(regParse);
|
||
// attempt to support different type of line separators
|
||
end
|
||
else
|
||
begin // Skip the blanks!
|
||
while IsIgnoredChar(regParse^) do
|
||
Inc(regParse);
|
||
end;
|
||
ret := EmitNode(OP_COMMENT); // comment
|
||
end
|
||
else
|
||
begin
|
||
Len := FindSkippedMetaLen(regParse, fRegexEnd);
|
||
if Len <= 0 then
|
||
if regParse^ <> '{' then
|
||
begin
|
||
Error(reeRarseAtomInternalDisaster);
|
||
Exit;
|
||
end
|
||
else
|
||
Len := FindSkippedMetaLen(regParse + 1, fRegexEnd) + 1;
|
||
// bad {n,m} - compile as EXACTLY
|
||
EnderChar := (regParse + Len)^;
|
||
if (Len > 1) and ((EnderChar = '*') or (EnderChar = '+') or (EnderChar = '?') or (EnderChar = '{')) then
|
||
Dec(Len); // back off clear of ?+*{ operand.
|
||
FlagParse := FlagParse or FLAG_HASWIDTH;
|
||
if Len = 1 then
|
||
FlagParse := FlagParse or FLAG_SIMPLE;
|
||
if fCompModifiers.I then
|
||
ret := EmitNode(OP_EXACTLY_CI)
|
||
else
|
||
ret := EmitNode(OP_EXACTLY);
|
||
EmitInt(0);
|
||
while (Len > 0) and ((not fCompModifiers.X) or (regParse^ <> '#')) do
|
||
begin
|
||
if not fCompModifiers.X or not IsIgnoredChar(regParse^) then
|
||
begin
|
||
if fCompModifiers.I then
|
||
EmitC(_UpperCase(regParse^))
|
||
else
|
||
EmitC(regParse^);
|
||
if regCode <> @regDummy[0] then
|
||
Inc(regExactlyLen^);
|
||
end;
|
||
Inc(regParse);
|
||
Dec(Len);
|
||
end;
|
||
end; { of if not comment }
|
||
end; { of case else }
|
||
end; { of case }
|
||
|
||
Result := ret;
|
||
end; { of function TRegExpr.ParseAtom
|
||
-------------------------------------------------------------- }
|
||
|
||
function TRegExpr.GetCompilerErrorPos: PtrInt;
|
||
begin
|
||
Result := 0;
|
||
if (fRegexStart = nil) or (regParse = nil) then
|
||
Exit; // not in compiling mode ?
|
||
Result := regParse - fRegexStart;
|
||
end; { of function TRegExpr.GetCompilerErrorPos
|
||
-------------------------------------------------------------- }
|
||
|
||
{ ============================================================= }
|
||
{ ===================== Matching section ====================== }
|
||
{ ============================================================= }
|
||
|
||
procedure TRegExpr.FindGroupName(APtr, AEndPtr: PRegExprChar; AEndChar: REChar; var AName: RegExprString);
|
||
// check that group name is valid identifier, started from non-digit
|
||
// this is to be like in Python regex
|
||
var
|
||
P: PRegExprChar;
|
||
begin
|
||
P := APtr;
|
||
if IsDigitChar(P^) or not IsWordChar(P^) then
|
||
Error(reeNamedGroupBadName);
|
||
|
||
repeat
|
||
if P >= AEndPtr then
|
||
Error(reeNamedGroupBad);
|
||
if P^ = AEndChar then
|
||
Break;
|
||
if not (IsWordChar(P^) or (P^ = '_')) then
|
||
Error(reeNamedGroupBadName);
|
||
Inc(P);
|
||
until False;
|
||
|
||
SetString(AName, APtr, P-APtr);
|
||
end;
|
||
|
||
function TRegExpr.FindRepeated(p: PRegExprChar; AMax: Integer): Integer;
|
||
// repeatedly match something simple, report how many
|
||
// p: points to current opcode
|
||
var
|
||
scan: PRegExprChar;
|
||
opnd: PRegExprChar;
|
||
TheMax: PtrInt; // PtrInt, gets diff of 2 pointers
|
||
InvChar: REChar;
|
||
{$IFDEF UnicodeEx}
|
||
i: Integer;
|
||
{$ENDIF}
|
||
begin
|
||
Result := 0;
|
||
scan := regInput; // points into InputString
|
||
opnd := p + REOpSz + RENextOffSz; // points to operand of opcode (after OP_nnn code)
|
||
TheMax := fInputCurrentEnd - scan;
|
||
if TheMax > AMax then
|
||
TheMax := AMax;
|
||
case PREOp(p)^ of
|
||
OP_ANY:
|
||
begin
|
||
// note - OP_ANY_ML cannot be proceeded in FindRepeated because can skip
|
||
// more than one char at once
|
||
{$IFDEF UnicodeEx}
|
||
for i := 1 to TheMax do
|
||
IncUnicode2(scan, Result);
|
||
{$ELSE}
|
||
Result := TheMax;
|
||
Inc(scan, Result);
|
||
{$ENDIF}
|
||
end;
|
||
|
||
OP_EXACTLY:
|
||
begin // in opnd can be only ONE char !!!
|
||
{
|
||
// Alexey: commented because of https://github.com/andgineer/TRegExpr/issues/145
|
||
NLen := PLongInt(opnd)^;
|
||
if TheMax > NLen then
|
||
TheMax := NLen;
|
||
}
|
||
Inc(opnd, RENumberSz);
|
||
while (Result < TheMax) and (opnd^ = scan^) do
|
||
begin
|
||
Inc(Result);
|
||
Inc(scan);
|
||
end;
|
||
end;
|
||
|
||
OP_EXACTLY_CI:
|
||
begin // in opnd can be only ONE char !!!
|
||
{
|
||
// Alexey: commented because of https://github.com/andgineer/TRegExpr/issues/145
|
||
NLen := PLongInt(opnd)^;
|
||
if TheMax > NLen then
|
||
TheMax := NLen;
|
||
}
|
||
Inc(opnd, RENumberSz);
|
||
while (Result < TheMax) and (opnd^ = scan^) do
|
||
begin // prevent unneeded InvertCase
|
||
Inc(Result);
|
||
Inc(scan);
|
||
end;
|
||
if Result < TheMax then
|
||
begin
|
||
InvChar := _LowerCase(opnd^); // store in register
|
||
while (Result < TheMax) and ((opnd^ = scan^) or (InvChar = scan^)) do
|
||
begin
|
||
Inc(Result);
|
||
Inc(scan);
|
||
end;
|
||
end;
|
||
end;
|
||
|
||
OP_ANYDIGIT:
|
||
while (Result < TheMax) and IsDigitChar(scan^) do
|
||
begin
|
||
Inc(Result);
|
||
Inc(scan);
|
||
end;
|
||
|
||
OP_NOTDIGIT:
|
||
{$IFDEF UNICODEEX}
|
||
begin
|
||
i := 0;
|
||
while (i < TheMax) and not IsDigitChar(scan^) do
|
||
begin
|
||
Inc(i);
|
||
IncUnicode2(scan, Result);
|
||
end;
|
||
end;
|
||
{$ELSE}
|
||
while (Result < TheMax) and not IsDigitChar(scan^) do
|
||
begin
|
||
Inc(Result);
|
||
Inc(scan);
|
||
end;
|
||
{$ENDIF}
|
||
|
||
OP_ANYLETTER:
|
||
while (Result < TheMax) and IsWordChar(scan^) do
|
||
begin
|
||
Inc(Result);
|
||
Inc(scan);
|
||
end;
|
||
|
||
OP_NOTLETTER:
|
||
{$IFDEF UNICODEEX}
|
||
begin
|
||
i := 0;
|
||
while (i < TheMax) and not IsWordChar(scan^) do
|
||
begin
|
||
Inc(i);
|
||
IncUnicode2(scan, Result);
|
||
end;
|
||
end;
|
||
{$ELSE}
|
||
while (Result < TheMax) and not IsWordChar(scan^) do
|
||
begin
|
||
Inc(Result);
|
||
Inc(scan);
|
||
end;
|
||
{$ENDIF}
|
||
|
||
OP_ANYSPACE:
|
||
while (Result < TheMax) and IsSpaceChar(scan^) do
|
||
begin
|
||
Inc(Result);
|
||
Inc(scan);
|
||
end;
|
||
|
||
OP_NOTSPACE:
|
||
{$IFDEF UNICODEEX}
|
||
begin
|
||
i := 0;
|
||
while (i < TheMax) and not IsSpaceChar(scan^) do
|
||
begin
|
||
Inc(i);
|
||
IncUnicode2(scan, Result);
|
||
end;
|
||
end;
|
||
{$ELSE}
|
||
while (Result < TheMax) and not IsSpaceChar(scan^) do
|
||
begin
|
||
Inc(Result);
|
||
Inc(scan);
|
||
end;
|
||
{$ENDIF}
|
||
|
||
OP_ANYVERTSEP:
|
||
while (Result < TheMax) and IsVertLineSeparator(scan^) do
|
||
begin
|
||
Inc(Result);
|
||
Inc(scan);
|
||
end;
|
||
|
||
OP_NOTVERTSEP:
|
||
{$IFDEF UNICODEEX}
|
||
begin
|
||
i := 0;
|
||
while (i < TheMax) and not IsVertLineSeparator(scan^) do
|
||
begin
|
||
Inc(i);
|
||
IncUnicode2(scan, Result);
|
||
end;
|
||
end;
|
||
{$ELSE}
|
||
while (Result < TheMax) and not IsVertLineSeparator(scan^) do
|
||
begin
|
||
Inc(Result);
|
||
Inc(scan);
|
||
end;
|
||
{$ENDIF}
|
||
|
||
OP_ANYHORZSEP:
|
||
while (Result < TheMax) and IsHorzSeparator(scan^) do
|
||
begin
|
||
Inc(Result);
|
||
Inc(scan);
|
||
end;
|
||
|
||
OP_NOTHORZSEP:
|
||
{$IFDEF UNICODEEX}
|
||
begin
|
||
i := 0;
|
||
while (i < TheMax) and not IsHorzSeparator(scan^) do
|
||
begin
|
||
Inc(i);
|
||
IncUnicode2(scan, Result);
|
||
end;
|
||
end;
|
||
{$ELSE}
|
||
while (Result < TheMax) and not IsHorzSeparator(scan^) do
|
||
begin
|
||
Inc(Result);
|
||
Inc(scan);
|
||
end;
|
||
{$ENDIF}
|
||
|
||
OP_ANYOF:
|
||
{$IFDEF UNICODEEX}
|
||
begin
|
||
i := 0;
|
||
while (i < TheMax) and FindInCharClass(opnd, scan^) do
|
||
begin
|
||
Inc(i);
|
||
IncUnicode2(scan, Result);
|
||
end;
|
||
end;
|
||
{$ELSE}
|
||
while (Result < TheMax) and FindInCharClass(opnd, scan^) do
|
||
begin
|
||
Inc(Result);
|
||
Inc(scan);
|
||
end;
|
||
{$ENDIF}
|
||
|
||
OP_ANYBUT:
|
||
{$IFDEF UNICODEEX}
|
||
begin
|
||
i := 0;
|
||
while (i < TheMax) and not FindInCharClass(opnd, scan^) do
|
||
begin
|
||
Inc(i);
|
||
IncUnicode2(scan, Result);
|
||
end;
|
||
end;
|
||
{$ELSE}
|
||
while (Result < TheMax) and not FindInCharClass(opnd, scan^) do
|
||
begin
|
||
Inc(Result);
|
||
Inc(scan);
|
||
end;
|
||
{$ENDIF}
|
||
|
||
OP_ANYOF_CI:
|
||
{$IFDEF UNICODEEX}
|
||
begin
|
||
i := 0;
|
||
while (i < TheMax) and FindInCharClass(opnd, _UpperCase(scan^)) do
|
||
begin
|
||
Inc(i);
|
||
IncUnicode2(scan, Result);
|
||
end;
|
||
end;
|
||
{$ELSE}
|
||
while (Result < TheMax) and FindInCharClass(opnd, _UpperCase(scan^)) do
|
||
begin
|
||
Inc(Result);
|
||
Inc(scan);
|
||
end;
|
||
{$ENDIF}
|
||
|
||
OP_ANYBUT_CI:
|
||
{$IFDEF UNICODEEX}
|
||
begin
|
||
i := 0;
|
||
while (i < TheMax) and not FindInCharClass(opnd, _UpperCase(scan^)) do
|
||
begin
|
||
Inc(i);
|
||
IncUnicode2(scan, Result);
|
||
end;
|
||
end;
|
||
{$ELSE}
|
||
while (Result < TheMax) and not FindInCharClass(opnd, _UpperCase(scan^)) do
|
||
begin
|
||
Inc(Result);
|
||
Inc(scan);
|
||
end;
|
||
{$ENDIF}
|
||
|
||
{$IFDEF FastUnicodeData}
|
||
OP_ANYCATEGORY:
|
||
{$IFDEF UNICODEEX}
|
||
begin
|
||
i := 0;
|
||
while (i < TheMax) and MatchOneCharCategory(opnd, scan) do
|
||
begin
|
||
Inc(i);
|
||
IncUnicode2(scan, Result);
|
||
end;
|
||
end;
|
||
{$ELSE}
|
||
while (Result < TheMax) and MatchOneCharCategory(opnd, scan) do
|
||
begin
|
||
Inc(Result);
|
||
Inc(scan);
|
||
end;
|
||
{$ENDIF}
|
||
|
||
OP_NOTCATEGORY:
|
||
{$IFDEF UNICODEEX}
|
||
begin
|
||
i := 0;
|
||
while (i < TheMax) and not MatchOneCharCategory(opnd, scan) do
|
||
begin
|
||
Inc(i);
|
||
IncUnicode2(scan, Result);
|
||
end;
|
||
end;
|
||
{$ELSE}
|
||
while (Result < TheMax) and not MatchOneCharCategory(opnd, scan) do
|
||
begin
|
||
Inc(Result);
|
||
Inc(scan);
|
||
end;
|
||
{$ENDIF}
|
||
{$ENDIF}
|
||
|
||
OP_ANYLINEBREAK:
|
||
while (Result < TheMax) and IsAnyLineBreak(scan^) do
|
||
begin
|
||
Inc(Result);
|
||
Inc(scan);
|
||
end;
|
||
|
||
else
|
||
Result := 0;
|
||
Error(reeRegRepeatCalledInappropriately);
|
||
Exit;
|
||
end; { of case }
|
||
regInput := scan;
|
||
end; { of function TRegExpr.FindRepeated
|
||
-------------------------------------------------------------- }
|
||
|
||
function TRegExpr.regNext(p: PRegExprChar): PRegExprChar;
|
||
// dig the "next" pointer out of a node
|
||
var
|
||
offset: TRENextOff;
|
||
begin
|
||
if p = @regDummy[0] then
|
||
begin
|
||
Result := nil;
|
||
Exit;
|
||
end;
|
||
offset := PRENextOff(AlignToPtr(p + REOpSz))^;
|
||
if offset = 0 then
|
||
Result := nil
|
||
else
|
||
Result := p + offset;
|
||
end;
|
||
|
||
function TRegExpr.regNextQuick(p: PRegExprChar): PRegExprChar; {$IFDEF FPC}inline;{$ENDIF}
|
||
{$IFDEF WITH_REGEX_ASSERT}
|
||
var
|
||
offset: TRENextOff;
|
||
{$ENDIF}
|
||
begin
|
||
// The inlined version is never called in the first pass.
|
||
Assert(fSecondPass); // fSecondPass will also be true in MatchPrim.
|
||
{$IFDEF WITH_REGEX_ASSERT}
|
||
offset := PRENextOff(AlignToPtr(p + REOpSz))^;
|
||
if offset = 0 then
|
||
Result := nil
|
||
else
|
||
begin
|
||
Result := p + offset;
|
||
assert((Result >= programm) and (Result < programm + regCodeSize * SizeOf(REChar)));
|
||
end;
|
||
{$ELSE}
|
||
Result := p + PRENextOff(AlignToPtr(p + REOpSz))^;
|
||
{$ENDIF}
|
||
end;
|
||
|
||
function TRegExpr.regLast(p: PRegExprChar): PRegExprChar;
|
||
var
|
||
temp: PRegExprChar;
|
||
begin
|
||
Result := p;
|
||
if p = @regDummy[0] then
|
||
Exit;
|
||
// Find last node.
|
||
repeat
|
||
temp := regNext(Result);
|
||
if temp = nil then
|
||
Break;
|
||
Result := temp;
|
||
until False;
|
||
end;
|
||
|
||
type
|
||
TRegExprMatchPrimLocals = record
|
||
case TREOp of
|
||
{$IFDEF ComplexBraces}
|
||
OP_LOOPENTRY: (
|
||
LoopInfo: TOpLoopInfo;
|
||
);
|
||
OP_LOOP: ( // and OP_LOOP_NG
|
||
LoopInfoListPtr: POpLoopInfo;
|
||
);
|
||
{$ENDIF}
|
||
OP_LOOKAHEAD, OP_LOOKBEHIND: (
|
||
IsGreedy: REChar;
|
||
LookAroundInfo: TRegExprLookAroundInfo;
|
||
InpStart: PRegExprChar; // only OP_LOOKBEHIND
|
||
);
|
||
OP_LOOKAHEAD_END, OP_LOOKBEHIND_END: (
|
||
LookAroundInfoPtr: PRegExprLookAroundInfo;
|
||
);
|
||
OP_SUBCALL: (
|
||
savedCurrentSubCalled: Integer;
|
||
);
|
||
OP_STAR: (
|
||
nextch: REChar;
|
||
);
|
||
end;
|
||
|
||
function TRegExpr.MatchPrim(prog: PRegExprChar): Boolean;
|
||
// recursively matching routine
|
||
// Conceptually the strategy is simple: check to see whether the current
|
||
// node matches, call self recursively to see whether the rest matches,
|
||
// and then act accordingly. In practice we make some effort to avoid
|
||
// recursion, in particular by going through "ordinary" nodes (that don't
|
||
// need to know whether the rest of the match failed) by a loop instead of
|
||
// by recursion.
|
||
|
||
var
|
||
scan: PRegExprChar;
|
||
next: PRegExprChar; // next node
|
||
opnd, save: PRegExprChar;
|
||
no: Integer;
|
||
LoopCnt: Integer;
|
||
Local: TRegExprMatchPrimLocals;
|
||
begin
|
||
Result := False;
|
||
{$IFDEF RegExpWithStackOverflowCheck_DecStack_Frame}
|
||
if get_frame < StackLimit then begin
|
||
error(reeLoopStackExceeded);
|
||
exit;
|
||
end;
|
||
{$ENDIF}
|
||
|
||
|
||
{
|
||
// Alexey: not sure it's ok for long searches in big texts, so disabled
|
||
if regNestedCalls > MaxRegexBackTracking then
|
||
Exit;
|
||
Inc(regNestedCalls);
|
||
}
|
||
|
||
scan := prog;
|
||
while True do
|
||
begin
|
||
Assert(scan <> nil);
|
||
next := regNextQuick(scan);
|
||
|
||
case scan^ of
|
||
OP_BOUND:
|
||
begin
|
||
if ( (regInput = fInputStart) or not IsWordChar((regInput - 1)^) )
|
||
=
|
||
( (regInput >= fInputEnd) or not IsWordChar(regInput^) )
|
||
then
|
||
Exit;
|
||
end;
|
||
|
||
OP_NOTBOUND:
|
||
begin
|
||
if ( (regInput = fInputStart) or not IsWordChar((regInput - 1)^) )
|
||
<>
|
||
( (regInput >= fInputEnd) or not IsWordChar(regInput^) )
|
||
then
|
||
Exit;
|
||
end;
|
||
|
||
OP_BOL:
|
||
begin
|
||
if regInput <> fInputStart then
|
||
Exit;
|
||
end;
|
||
|
||
OP_CONTINUE_POS:
|
||
begin
|
||
if regInput <> fInputContinue then
|
||
Exit;
|
||
end;
|
||
|
||
OP_RESET_MATCHPOS:
|
||
begin
|
||
save := GrpBounds[0].GrpStart[0];
|
||
GrpBounds[0].GrpStart[0] := regInput;
|
||
Result := MatchPrim(next);
|
||
if not Result then
|
||
GrpBounds[0].GrpStart[0] := save;
|
||
exit;
|
||
end;
|
||
|
||
OP_EOL:
|
||
begin
|
||
// \z matches at the very end
|
||
if regInput < fInputEnd then
|
||
Exit;
|
||
end;
|
||
|
||
OP_EOL2:
|
||
begin
|
||
// \Z matches at the very and + before the final line-break (LF and CR LF)
|
||
if regInput < fInputEnd then
|
||
begin
|
||
if (regInput = fInputEnd - 1) and (regInput^ = #10) then
|
||
begin end
|
||
else
|
||
if (regInput = fInputEnd - 2) and (regInput^ = #13) and ((regInput + 1) ^ = #10) then
|
||
begin end
|
||
else
|
||
Exit;
|
||
end;
|
||
end;
|
||
|
||
OP_BOL_ML:
|
||
if regInput > fInputStart then
|
||
begin
|
||
if ((regInput - 1) <= fInputStart) or
|
||
not IsPairedBreak(regInput - 2) then
|
||
begin
|
||
// don't stop between paired separator
|
||
if IsPairedBreak(regInput - 1) then
|
||
Exit;
|
||
if not IsCustomLineSeparator((regInput - 1)^) then
|
||
Exit;
|
||
end;
|
||
end;
|
||
|
||
OP_EOL_ML:
|
||
if regInput < fInputEnd then
|
||
begin
|
||
if not IsPairedBreak(regInput) then
|
||
begin
|
||
// don't stop between paired separator
|
||
if (regInput > fInputStart) and IsPairedBreak(regInput - 1) then
|
||
Exit;
|
||
if not IsCustomLineSeparator(regInput^) then
|
||
Exit;
|
||
end;
|
||
end;
|
||
|
||
OP_ANY:
|
||
begin
|
||
if regInput >= fInputCurrentEnd then
|
||
Exit;
|
||
{$IFDEF UNICODEEX}
|
||
IncUnicode(regInput);
|
||
{$ELSE}
|
||
Inc(regInput);
|
||
{$ENDIF}
|
||
end;
|
||
|
||
OP_ANY_ML:
|
||
begin
|
||
if (regInput >= fInputCurrentEnd) or
|
||
IsPairedBreak(regInput) or
|
||
IsCustomLineSeparator(regInput^)
|
||
then
|
||
Exit;
|
||
{$IFDEF UNICODEEX}
|
||
IncUnicode(regInput);
|
||
{$ELSE}
|
||
Inc(regInput);
|
||
{$ENDIF}
|
||
end;
|
||
|
||
OP_ANYDIGIT:
|
||
begin
|
||
if (regInput >= fInputCurrentEnd) or not IsDigitChar(regInput^) then
|
||
Exit;
|
||
Inc(regInput);
|
||
end;
|
||
|
||
OP_NOTDIGIT:
|
||
begin
|
||
if (regInput >= fInputCurrentEnd) or IsDigitChar(regInput^) then
|
||
Exit;
|
||
{$IFDEF UNICODEEX}
|
||
IncUnicode(regInput);
|
||
{$ELSE}
|
||
Inc(regInput);
|
||
{$ENDIF}
|
||
end;
|
||
|
||
OP_ANYLETTER:
|
||
begin
|
||
if (regInput >= fInputCurrentEnd) or not IsWordChar(regInput^) then
|
||
Exit;
|
||
Inc(regInput);
|
||
end;
|
||
|
||
OP_NOTLETTER:
|
||
begin
|
||
if (regInput >= fInputCurrentEnd) or IsWordChar(regInput^) then
|
||
Exit;
|
||
{$IFDEF UNICODEEX}
|
||
IncUnicode(regInput);
|
||
{$ELSE}
|
||
Inc(regInput);
|
||
{$ENDIF}
|
||
end;
|
||
|
||
OP_ANYSPACE:
|
||
begin
|
||
if (regInput >= fInputCurrentEnd) or not IsSpaceChar(regInput^) then
|
||
Exit;
|
||
Inc(regInput);
|
||
end;
|
||
|
||
OP_NOTSPACE:
|
||
begin
|
||
if (regInput >= fInputCurrentEnd) or IsSpaceChar(regInput^) then
|
||
Exit;
|
||
{$IFDEF UNICODEEX}
|
||
IncUnicode(regInput);
|
||
{$ELSE}
|
||
Inc(regInput);
|
||
{$ENDIF}
|
||
end;
|
||
|
||
OP_ANYVERTSEP:
|
||
begin
|
||
if (regInput >= fInputCurrentEnd) or not IsVertLineSeparator(regInput^) then
|
||
Exit;
|
||
Inc(regInput);
|
||
end;
|
||
|
||
OP_NOTVERTSEP:
|
||
begin
|
||
if (regInput >= fInputCurrentEnd) or IsVertLineSeparator(regInput^) then
|
||
Exit;
|
||
{$IFDEF UNICODEEX}
|
||
IncUnicode(regInput);
|
||
{$ELSE}
|
||
Inc(regInput);
|
||
{$ENDIF}
|
||
end;
|
||
|
||
OP_ANYHORZSEP:
|
||
begin
|
||
if (regInput >= fInputCurrentEnd) or not IsHorzSeparator(regInput^) then
|
||
Exit;
|
||
Inc(regInput);
|
||
end;
|
||
|
||
OP_NOTHORZSEP:
|
||
begin
|
||
if (regInput >= fInputCurrentEnd) or IsHorzSeparator(regInput^) then
|
||
Exit;
|
||
{$IFDEF UNICODEEX}
|
||
IncUnicode(regInput);
|
||
{$ELSE}
|
||
Inc(regInput);
|
||
{$ENDIF}
|
||
end;
|
||
|
||
OP_EXACTLY_CI:
|
||
begin
|
||
opnd := scan + REOpSz + RENextOffSz; // OPERAND
|
||
no := PLongInt(opnd)^;
|
||
if (regInput + no > fInputCurrentEnd) then
|
||
Exit;
|
||
Inc(opnd, RENumberSz);
|
||
// Inline the first character, for speed.
|
||
if (opnd^ <> regInput^) and (_LowerCase(opnd^) <> regInput^) then
|
||
Exit;
|
||
save := regInput;
|
||
Inc(regInput, no);
|
||
while no > 1 do
|
||
begin
|
||
Inc(save);
|
||
Inc(opnd);
|
||
if (opnd^ <> save^) and (_LowerCase(opnd^) <> save^) then
|
||
Exit;
|
||
Dec(no);
|
||
end;
|
||
end;
|
||
|
||
OP_EXACTLY:
|
||
begin
|
||
opnd := scan + REOpSz + RENextOffSz; // OPERAND
|
||
no := PLongInt(opnd)^;
|
||
if (regInput + no > fInputCurrentEnd) then
|
||
Exit;
|
||
Inc(opnd, RENumberSz);
|
||
// Inline the first character, for speed.
|
||
if opnd^ <> regInput^ then
|
||
Exit;
|
||
save := regInput;
|
||
Inc(regInput, no);
|
||
while no > 1 do
|
||
begin
|
||
Inc(save);
|
||
Inc(opnd);
|
||
if opnd^ <> save^ then
|
||
Exit;
|
||
Dec(no);
|
||
end;
|
||
end;
|
||
|
||
OP_BSUBEXP:
|
||
begin
|
||
no := PReGroupIndex((scan + REOpSz + RENextOffSz))^;
|
||
if no < 0 then
|
||
Exit;
|
||
opnd := CurrentGrpBounds.GrpStart[no];
|
||
if opnd = nil then
|
||
Exit;
|
||
save := CurrentGrpBounds.GrpEnd[no];
|
||
if save = nil then
|
||
Exit;
|
||
no := save - opnd;
|
||
save := regInput;
|
||
if save + no - 1 >= fInputCurrentEnd then
|
||
Exit;
|
||
|
||
while no > 0 do
|
||
begin
|
||
if (save^ <> opnd^) then
|
||
Exit;
|
||
Inc(save);
|
||
Inc(opnd);
|
||
Dec(no);
|
||
end;
|
||
regInput := save;
|
||
end;
|
||
|
||
OP_BSUBEXP_CI:
|
||
begin
|
||
no := PReGroupIndex((scan + REOpSz + RENextOffSz))^;
|
||
if no < 0 then
|
||
Exit;
|
||
opnd := CurrentGrpBounds.GrpStart[no];
|
||
if opnd = nil then
|
||
Exit;
|
||
save := CurrentGrpBounds.GrpEnd[no];
|
||
if save = nil then
|
||
Exit;
|
||
no := save - opnd;
|
||
save := regInput;
|
||
if save + no - 1 >= fInputCurrentEnd then
|
||
Exit;
|
||
|
||
while no > 0 do
|
||
begin
|
||
if ((save^ <> opnd^) and (save^ <> InvertCase(opnd^))) then
|
||
Exit;
|
||
Inc(save);
|
||
Inc(opnd);
|
||
Dec(no);
|
||
end;
|
||
regInput := save;
|
||
end;
|
||
|
||
OP_ANYOF:
|
||
begin
|
||
if (regInput >= fInputCurrentEnd) or
|
||
not FindInCharClass(scan + REOpSz + RENextOffSz, regInput^) then
|
||
Exit;
|
||
{$IFDEF UNICODEEX}
|
||
IncUnicode(regInput);
|
||
{$ELSE}
|
||
Inc(regInput);
|
||
{$ENDIF}
|
||
end;
|
||
|
||
OP_ANYBUT:
|
||
begin
|
||
if (regInput >= fInputCurrentEnd) or
|
||
FindInCharClass(scan + REOpSz + RENextOffSz, regInput^) then
|
||
Exit;
|
||
{$IFDEF UNICODEEX}
|
||
IncUnicode(regInput);
|
||
{$ELSE}
|
||
Inc(regInput);
|
||
{$ENDIF}
|
||
end;
|
||
|
||
OP_ANYOF_CI:
|
||
begin
|
||
if (regInput >= fInputCurrentEnd) or
|
||
not FindInCharClass(scan + REOpSz + RENextOffSz, _UpperCase(regInput^)) then
|
||
Exit;
|
||
{$IFDEF UNICODEEX}
|
||
IncUnicode(regInput);
|
||
{$ELSE}
|
||
Inc(regInput);
|
||
{$ENDIF}
|
||
end;
|
||
|
||
OP_ANYBUT_CI:
|
||
begin
|
||
if (regInput >= fInputCurrentEnd) or
|
||
FindInCharClass(scan + REOpSz + RENextOffSz, _UpperCase(regInput^)) then
|
||
Exit;
|
||
{$IFDEF UNICODEEX}
|
||
IncUnicode(regInput);
|
||
{$ELSE}
|
||
Inc(regInput);
|
||
{$ENDIF}
|
||
end;
|
||
|
||
OP_NOTHING:
|
||
;
|
||
OP_COMMENT:
|
||
;
|
||
OP_BACK:
|
||
;
|
||
|
||
OP_OPEN:
|
||
begin
|
||
no := PReGroupIndex((scan + REOpSz + RENextOffSz))^;
|
||
save := CurrentGrpBounds.TmpStart[no];
|
||
CurrentGrpBounds.TmpStart[no] := regInput;
|
||
Result := MatchPrim(next);
|
||
CurrentGrpBounds.TmpStart[no] := save;
|
||
exit;
|
||
end;
|
||
|
||
OP_OPEN_ATOMIC:
|
||
begin
|
||
no := PReGroupIndex((scan + REOpSz + RENextOffSz))^;
|
||
Result := MatchPrim(next);
|
||
if GrpBacktrackingAsAtom[no] then
|
||
IsBacktrackingGroupAsAtom := False;
|
||
GrpBacktrackingAsAtom[no] := False;
|
||
Exit;
|
||
end;
|
||
|
||
OP_CLOSE:
|
||
begin
|
||
no := PReGroupIndex((scan + REOpSz + RENextOffSz))^;
|
||
save := CurrentGrpBounds.GrpStart[no];
|
||
opnd := CurrentGrpBounds.GrpEnd[no]; // save2
|
||
CurrentGrpBounds.GrpStart[no] := CurrentGrpBounds.TmpStart[no];
|
||
CurrentGrpBounds.GrpEnd[no] := regInput;
|
||
|
||
// if we are in OP_SUBCALL* call, it called OP_OPEN*, so we must return
|
||
// in OP_CLOSE, without going to next opcode
|
||
if CurrentSubCalled = no then
|
||
begin
|
||
Result := True;
|
||
Exit;
|
||
end;
|
||
|
||
Result := MatchPrim(next);
|
||
if not Result then begin
|
||
CurrentGrpBounds.GrpStart[no] := save;
|
||
CurrentGrpBounds.GrpEnd[no] := opnd;
|
||
end;
|
||
|
||
Exit;
|
||
end;
|
||
|
||
OP_CLOSE_ATOMIC:
|
||
begin
|
||
no := PReGroupIndex((scan + REOpSz + RENextOffSz))^;
|
||
// handle atomic group, mark it as "done"
|
||
// (we are here because some OP_BRANCH is matched)
|
||
Result := MatchPrim(next);
|
||
if not Result then begin
|
||
if not IsBacktrackingGroupAsAtom then begin
|
||
GrpBacktrackingAsAtom[no] := True;
|
||
IsBacktrackingGroupAsAtom := True;
|
||
end;
|
||
end;
|
||
Exit;
|
||
end;
|
||
|
||
OP_LOOKAHEAD, OP_LOOKAHEAD_NEG:
|
||
begin
|
||
Local.LookAroundInfo.InputPos := regInput;
|
||
Local.LookAroundInfo.IsNegative := (scan^ = OP_LOOKAHEAD_NEG);
|
||
Local.LookAroundInfo.HasMatchedToEnd := False;
|
||
Local.LookAroundInfo.IsBackTracking := False;
|
||
Local.LookAroundInfo.OuterInfo := LookAroundInfoList;
|
||
Local.LookAroundInfo.savedInputCurrentEnd := fInputCurrentEnd;
|
||
LookAroundInfoList := @Local.LookAroundInfo;
|
||
fInputCurrentEnd := fInputEnd;
|
||
|
||
scan := PRegExprChar(AlignToPtr(scan + 1)) + RENextOffSz;
|
||
Result := MatchPrim(scan);
|
||
|
||
if Local.LookAroundInfo.IsBackTracking then
|
||
IsBacktrackingGroupAsAtom := False;
|
||
LookAroundInfoList := Local.LookAroundInfo.OuterInfo;
|
||
fInputCurrentEnd := Local.LookAroundInfo.savedInputCurrentEnd;
|
||
|
||
if Local.LookAroundInfo.IsNegative then begin
|
||
Result := (not Local.LookAroundInfo.HasMatchedToEnd);
|
||
if Result then begin
|
||
next := regNextQuick(next); // Next-Pointer of OP_LOOKAHEAD_END
|
||
regInput := Local.LookAroundInfo.InputPos;
|
||
Result := False;
|
||
scan := next;
|
||
continue;
|
||
end;
|
||
end;
|
||
|
||
Exit;
|
||
end;
|
||
|
||
OP_LOOKBEHIND, OP_LOOKBEHIND_NEG:
|
||
begin
|
||
Local.LookAroundInfo.InputPos := regInput;
|
||
Local.LookAroundInfo.IsNegative := (scan^ = OP_LOOKBEHIND_NEG);
|
||
Local.LookAroundInfo.HasMatchedToEnd := False;
|
||
Local.LookAroundInfo.IsBackTracking := False;
|
||
Local.LookAroundInfo.OuterInfo := LookAroundInfoList;
|
||
Local.LookAroundInfo.savedInputCurrentEnd := fInputCurrentEnd;
|
||
LookAroundInfoList := @Local.LookAroundInfo;
|
||
|
||
scan := PRegExprChar(AlignToPtr(scan + 1)) + RENextOffSz;
|
||
Local.IsGreedy := PReOpLookBehindOptions(scan)^.IsGreedy;
|
||
|
||
fInputCurrentEnd := regInput;
|
||
|
||
Result := regInput - fInputStart >= PReOpLookBehindOptions(scan)^.MatchLenMin;
|
||
if Result then begin
|
||
if Local.IsGreedy = OPT_LOOKBEHIND_FIXED then begin
|
||
regInput := regInput - PReOpLookBehindOptions(scan)^.MatchLenMin;
|
||
inc(scan, ReOpLookBehindOptionsSz);
|
||
Result := MatchPrim(scan)
|
||
end
|
||
else
|
||
if Local.IsGreedy = OPT_LOOKBEHIND_NON_GREEDY then begin
|
||
Local.InpStart := regInput - PReOpLookBehindOptions(scan)^.MatchLenMin;
|
||
if regInput - fInputStart >= PReOpLookBehindOptions(scan)^.MatchLenMax then
|
||
save := regInput - PReOpLookBehindOptions(scan)^.MatchLenMax
|
||
else
|
||
save := fInputStart;
|
||
inc(scan, ReOpLookBehindOptionsSz);
|
||
repeat
|
||
regInput := Local.InpStart;
|
||
dec(Local.InpStart);
|
||
Result := MatchPrim(scan);
|
||
until Local.LookAroundInfo.HasMatchedToEnd or (Local.InpStart < save);
|
||
end
|
||
else begin
|
||
if regInput - fInputStart >= PReOpLookBehindOptions(scan)^.MatchLenMax then
|
||
Local.InpStart := regInput - PReOpLookBehindOptions(scan)^.MatchLenMax
|
||
else
|
||
Local.InpStart := fInputStart;
|
||
save := Local.LookAroundInfo.InputPos - PReOpLookBehindOptions(scan)^.MatchLenMin;
|
||
inc(scan, ReOpLookBehindOptionsSz);
|
||
repeat
|
||
regInput := Local.InpStart;
|
||
inc(Local.InpStart);
|
||
Result := MatchPrim(scan);
|
||
until Local.LookAroundInfo.HasMatchedToEnd or (Local.InpStart > save);
|
||
end;
|
||
end;
|
||
|
||
if Local.LookAroundInfo.IsBackTracking then
|
||
IsBacktrackingGroupAsAtom := False;
|
||
LookAroundInfoList := Local.LookAroundInfo.OuterInfo;
|
||
fInputCurrentEnd := Local.LookAroundInfo.savedInputCurrentEnd;
|
||
|
||
if Local.LookAroundInfo.IsNegative then begin
|
||
Result := not Local.LookAroundInfo.HasMatchedToEnd;
|
||
if Result then begin
|
||
next := regNextQuick(next); // Next-Pointer of OP_LOOKAHEAD_END
|
||
regInput := Local.LookAroundInfo.InputPos;
|
||
Result := False;
|
||
scan := next;
|
||
continue;
|
||
end;
|
||
end;
|
||
|
||
Exit;
|
||
end;
|
||
|
||
OP_LOOKAHEAD_END:
|
||
begin
|
||
if LookAroundInfoList = nil then
|
||
Exit;
|
||
Local.LookAroundInfoPtr := LookAroundInfoList;
|
||
Local.LookAroundInfoPtr.HasMatchedToEnd := True;
|
||
|
||
if not Local.LookAroundInfoPtr^.IsNegative then begin
|
||
fInputCurrentEnd := Local.LookAroundInfoPtr^.savedInputCurrentEnd;
|
||
regInput := Local.LookAroundInfoPtr^.InputPos;
|
||
LookAroundInfoList := Local.LookAroundInfoPtr^.OuterInfo;
|
||
|
||
Result := MatchPrim(next);
|
||
LookAroundInfoList := Local.LookAroundInfoPtr;
|
||
end;
|
||
|
||
if (not Result) and not IsBacktrackingGroupAsAtom then begin
|
||
IsBacktrackingGroupAsAtom := True;
|
||
Local.LookAroundInfoPtr.IsBackTracking := True;
|
||
end;
|
||
Exit;
|
||
end;
|
||
|
||
OP_LOOKBEHIND_END:
|
||
begin
|
||
if LookAroundInfoList = nil then
|
||
Exit;
|
||
|
||
Local.LookAroundInfoPtr := LookAroundInfoList;
|
||
if not (Local.LookAroundInfoPtr^.InputPos = regInput) then
|
||
Exit;
|
||
|
||
Local.LookAroundInfoPtr.HasMatchedToEnd := True;
|
||
|
||
if not Local.LookAroundInfoPtr^.IsNegative then begin
|
||
regInput := Local.LookAroundInfoPtr^.InputPos;
|
||
fInputCurrentEnd := Local.LookAroundInfoPtr^.savedInputCurrentEnd;
|
||
LookAroundInfoList := Local.LookAroundInfoPtr^.OuterInfo;
|
||
|
||
Result := MatchPrim(next);
|
||
LookAroundInfoList := Local.LookAroundInfoPtr;
|
||
end;
|
||
|
||
if (not Result) and not IsBacktrackingGroupAsAtom then begin
|
||
IsBacktrackingGroupAsAtom := True;
|
||
Local.LookAroundInfoPtr.IsBackTracking := True;
|
||
end;
|
||
Exit;
|
||
end;
|
||
|
||
OP_BRANCH:
|
||
begin
|
||
repeat
|
||
save := regInput;
|
||
Result := MatchPrim(scan + REOpSz + RENextOffSz + REBranchArgSz);
|
||
if Result then
|
||
Exit;
|
||
// if branch worked until OP_CLOSE, and marked atomic group as "done", then exit
|
||
regInput := save;
|
||
if IsBacktrackingGroupAsAtom then
|
||
Exit;
|
||
scan := next;
|
||
Assert(scan <> nil);
|
||
next := regNextQuick(scan);
|
||
if (next^ <> OP_BRANCH) then
|
||
break;
|
||
until False;
|
||
next := scan + REOpSz + RENextOffSz + REBranchArgSz; // Avoid recursion
|
||
end;
|
||
|
||
OP_GBRANCH, OP_GBRANCH_EX, OP_GBRANCH_EX_CI:
|
||
begin
|
||
Assert((next^ = OP_BRANCH) or (next^ = OP_GBRANCH) or (next^ = OP_GBRANCH_EX) or (next^ = OP_GBRANCH_EX_CI));
|
||
repeat
|
||
save := regInput;
|
||
case scan^ of
|
||
OP_GBRANCH, OP_BRANCH:
|
||
Result := MatchPrim(scan + REOpSz + RENextOffSz + REBranchArgSz);
|
||
OP_GBRANCH_EX:
|
||
if (regInput^ = (scan + REOpSz + RENextOffSz)^) then
|
||
Result := MatchPrim(scan + REOpSz + RENextOffSz + REBranchArgSz);
|
||
OP_GBRANCH_EX_CI:
|
||
if (regInput^ = (scan + REOpSz + RENextOffSz)^) or
|
||
(regInput^ = (scan + REOpSz + RENextOffSz + 1)^)
|
||
then
|
||
Result := MatchPrim(scan + REOpSz + RENextOffSz + REBranchArgSz);
|
||
end;
|
||
if Result then
|
||
Exit;
|
||
// if branch worked until OP_CLOSE, and marked atomic group as "done", then exit
|
||
regInput := save;
|
||
if IsBacktrackingGroupAsAtom then
|
||
Exit;
|
||
scan := next;
|
||
Assert(scan <> nil);
|
||
next := regNextQuick(scan);
|
||
if (next^ <> OP_BRANCH) and (next^ <> OP_GBRANCH) and (next^ <> OP_GBRANCH_EX) and (next^ <> OP_GBRANCH_EX_CI) then
|
||
break;
|
||
until False;
|
||
case scan^ of
|
||
OP_GBRANCH_EX:
|
||
if (regInput^ <> (scan + REOpSz + RENextOffSz)^) then
|
||
exit;
|
||
OP_GBRANCH_EX_CI:
|
||
if (regInput^ <> (scan + REOpSz + RENextOffSz)^) and
|
||
(regInput^ <> (scan + REOpSz + RENextOffSz + 1)^)
|
||
then
|
||
exit;
|
||
end;
|
||
next := scan + REOpSz + RENextOffSz + REBranchArgSz; // Avoid recursion
|
||
end;
|
||
|
||
{$IFDEF ComplexBraces}
|
||
OP_LOOPENTRY:
|
||
begin
|
||
Local.LoopInfo.Count := 0;
|
||
Local.LoopInfo.BackTrackingAsAtom := False;
|
||
Local.LoopInfo.CurrentRegInput := nil;
|
||
Local.LoopInfo.OuterLoop := CurrentLoopInfoListPtr;
|
||
CurrentLoopInfoListPtr := @Local.LoopInfo;
|
||
save := regInput;
|
||
Result := MatchPrim(next); // execute loop
|
||
CurrentLoopInfoListPtr := Local.LoopInfo.OuterLoop;
|
||
if Local.LoopInfo.BackTrackingAsAtom then
|
||
IsBacktrackingGroupAsAtom := False;
|
||
if not Result then
|
||
regInput := save;
|
||
Exit;
|
||
end;
|
||
|
||
OP_LOOP, OP_LOOP_POSS:
|
||
begin
|
||
if CurrentLoopInfoListPtr = nil then begin
|
||
Error(reeLoopWithoutEntry);
|
||
Exit;
|
||
end;
|
||
opnd := AlignToPtr(scan + REOpSz + RENextOffSz);
|
||
Local.LoopInfoListPtr := CurrentLoopInfoListPtr;
|
||
if Local.LoopInfoListPtr^.Count >= PREBracesArg(opnd)^ then // Min-Count
|
||
begin // Min alredy matched - we can work
|
||
LoopCnt := PREBracesArg(opnd + REBracesArgSz)^; // Max-Count
|
||
Result := (LoopCnt = MaxBracesArg) and // * or +
|
||
(Local.LoopInfoListPtr^.CurrentRegInput = regInput);
|
||
if Result then begin
|
||
CurrentLoopInfoListPtr := Local.LoopInfoListPtr^.OuterLoop;
|
||
Result := MatchPrim(next);
|
||
CurrentLoopInfoListPtr := Local.LoopInfoListPtr;
|
||
if (not Result) and (scan^ = OP_LOOP_POSS) then begin
|
||
Local.LoopInfoListPtr^.BackTrackingAsAtom := True;
|
||
IsBacktrackingGroupAsAtom := True;
|
||
end;
|
||
exit;
|
||
end;
|
||
|
||
// greedy way - first try to max deep of greed ;)
|
||
if Local.LoopInfoListPtr^.Count < LoopCnt then
|
||
begin
|
||
save := regInput;
|
||
Local.LoopInfoListPtr^.CurrentRegInput := save;
|
||
Inc(Local.LoopInfoListPtr^.Count);
|
||
Result := MatchPrim(scan + PRENextOff(opnd + 2 * REBracesArgSz)^);
|
||
if Result or IsBacktrackingGroupAsAtom then
|
||
Exit;
|
||
Dec(Local.LoopInfoListPtr^.Count);
|
||
regInput := save;
|
||
end;
|
||
CurrentLoopInfoListPtr := Local.LoopInfoListPtr^.OuterLoop;
|
||
Result := MatchPrim(next);
|
||
CurrentLoopInfoListPtr := Local.LoopInfoListPtr;
|
||
|
||
if Result or IsBacktrackingGroupAsAtom then
|
||
Exit;
|
||
if (scan^ = OP_LOOP_POSS) then begin
|
||
Local.LoopInfoListPtr^.BackTrackingAsAtom := True;
|
||
IsBacktrackingGroupAsAtom := True;
|
||
end;
|
||
Exit;
|
||
end
|
||
else
|
||
begin // first match a min_cnt times
|
||
Inc(Local.LoopInfoListPtr^.Count);
|
||
Local.LoopInfoListPtr^.CurrentRegInput := regInput;
|
||
Result := MatchPrim(scan + PRENextOff(opnd + 2 * REBracesArgSz)^);
|
||
if Result or IsBacktrackingGroupAsAtom then
|
||
Exit;
|
||
Dec(Local.LoopInfoListPtr^.Count);
|
||
Exit;
|
||
end;
|
||
end;
|
||
|
||
OP_LOOP_NG:
|
||
begin
|
||
if CurrentLoopInfoListPtr = nil then begin
|
||
Error(reeLoopWithoutEntry);
|
||
Exit;
|
||
end;
|
||
opnd := AlignToPtr(scan + REOpSz + RENextOffSz);
|
||
Local.LoopInfoListPtr := CurrentLoopInfoListPtr;
|
||
if Local.LoopInfoListPtr^.Count >= PREBracesArg(opnd)^ then // Min-Count
|
||
begin // Min alredy matched - we can work
|
||
LoopCnt := PREBracesArg(opnd + REBracesArgSz)^; // Max-Count
|
||
Result := (LoopCnt = MaxBracesArg) and // * or +
|
||
(Local.LoopInfoListPtr^.CurrentRegInput = regInput);
|
||
if Result then begin
|
||
CurrentLoopInfoListPtr := Local.LoopInfoListPtr^.OuterLoop;
|
||
Result := MatchPrim(next);
|
||
CurrentLoopInfoListPtr := Local.LoopInfoListPtr;
|
||
exit;
|
||
end;
|
||
|
||
save := regInput;
|
||
Local.LoopInfoListPtr^.CurrentRegInput := save;
|
||
// non-greedy - try just now
|
||
CurrentLoopInfoListPtr := Local.LoopInfoListPtr^.OuterLoop;
|
||
Result := MatchPrim(next);
|
||
CurrentLoopInfoListPtr := Local.LoopInfoListPtr;
|
||
if Result or IsBacktrackingGroupAsAtom then
|
||
Exit;
|
||
if Local.LoopInfoListPtr^.Count < LoopCnt then
|
||
begin
|
||
regInput := save; // failed - move next and try again
|
||
Inc(Local.LoopInfoListPtr^.Count);
|
||
Result := MatchPrim(scan + PRENextOff(opnd + 2 * REBracesArgSz)^);
|
||
if Result or IsBacktrackingGroupAsAtom then
|
||
Exit;
|
||
Dec(Local.LoopInfoListPtr^.Count);
|
||
end;
|
||
Exit;
|
||
end
|
||
else
|
||
begin // first match a min_cnt times
|
||
Inc(Local.LoopInfoListPtr^.Count);
|
||
Local.LoopInfoListPtr^.CurrentRegInput := regInput;
|
||
Result := MatchPrim(scan + PRENextOff(opnd + 2 * REBracesArgSz)^);
|
||
if Result or IsBacktrackingGroupAsAtom then
|
||
Exit;
|
||
Dec(Local.LoopInfoListPtr^.Count);
|
||
Exit;
|
||
end;
|
||
end;
|
||
{$ENDIF}
|
||
|
||
OP_STAR, OP_PLUS, OP_BRACES:
|
||
begin
|
||
opnd := scan + REOpSz + RENextOffSz;
|
||
save := regInput;
|
||
case scan^ of
|
||
OP_STAR:
|
||
begin
|
||
no := FindRepeated(opnd, MaxInt);
|
||
LoopCnt := 0 // star
|
||
end;
|
||
OP_PLUS:
|
||
begin
|
||
no := FindRepeated(opnd, MaxInt);
|
||
if no < 1 then
|
||
Exit;
|
||
LoopCnt := 1 // star
|
||
end;
|
||
else
|
||
begin // braces
|
||
opnd := AlignToPtr(opnd);
|
||
no := FindRepeated(opnd + 2 * REBracesArgSz, PREBracesArg(opnd + REBracesArgSz)^);
|
||
LoopCnt := PREBracesArg(opnd)^;
|
||
if no < LoopCnt then
|
||
Exit;
|
||
end;
|
||
end;
|
||
|
||
if next^ = OP_EXACTLY then begin
|
||
// Lookahead to avoid useless match attempts when we know
|
||
// what character comes next.
|
||
Local.nextch := (next + REOpSz + RENextOffSz + RENumberSz)^;
|
||
while no >= LoopCnt do
|
||
begin
|
||
// If it could work, try it.
|
||
if (Local.nextch = #0) or (regInput^ = Local.nextch) then
|
||
begin
|
||
if MatchPrim(next) then
|
||
begin
|
||
Result := True;
|
||
Exit;
|
||
end;
|
||
if IsBacktrackingGroupAsAtom then
|
||
Exit;
|
||
end;
|
||
Dec(no); // Couldn't or didn't - back up.
|
||
regInput := save + no;
|
||
end; { of while }
|
||
end
|
||
else begin
|
||
while no >= LoopCnt do
|
||
begin
|
||
if MatchPrim(next) then
|
||
begin
|
||
Result := True;
|
||
Exit;
|
||
end;
|
||
if IsBacktrackingGroupAsAtom then
|
||
Exit;
|
||
Dec(no); // Couldn't or didn't - back up.
|
||
regInput := save + no;
|
||
end; { of while }
|
||
end;
|
||
Exit;
|
||
end;
|
||
|
||
OP_STAR_NG, OP_PLUS_NG, OP_BRACES_NG:
|
||
begin
|
||
opnd := scan + REOpSz + RENextOffSz;
|
||
save := regInput;
|
||
case scan^ of
|
||
OP_STAR_NG:
|
||
begin
|
||
no := FindRepeated(opnd, MaxInt);
|
||
LoopCnt := 0 // star
|
||
end;
|
||
OP_PLUS_NG:
|
||
begin
|
||
no := FindRepeated(opnd, MaxInt);
|
||
if no < 1 then
|
||
Exit;
|
||
LoopCnt := 1 // star
|
||
end;
|
||
else
|
||
begin // braces
|
||
opnd := AlignToPtr(opnd);
|
||
no := FindRepeated(opnd + 2 * REBracesArgSz, PREBracesArg(opnd + REBracesArgSz)^);
|
||
LoopCnt := PREBracesArg(opnd)^;
|
||
if no < LoopCnt then
|
||
Exit;
|
||
end;
|
||
end;
|
||
|
||
|
||
// non-greedy mode
|
||
// don't repeat more than "no" times
|
||
// Now we know real Max limit to move forward (for recursion 'back up')
|
||
// In some cases it can be faster to check only Min positions first,
|
||
// but after that we have to check every position separtely instead
|
||
// of fast scannig in loop.
|
||
if next^ = OP_EXACTLY then begin
|
||
// Lookahead to avoid useless match attempts when we know
|
||
// what character comes next.
|
||
Local.nextch := (next + REOpSz + RENextOffSz + RENumberSz)^;
|
||
while LoopCnt <= no do
|
||
begin
|
||
regInput := save + LoopCnt;
|
||
// If it could work, try it.
|
||
if (Local.nextch = #0) or (regInput^ = Local.nextch) then
|
||
begin
|
||
if MatchPrim(next) then
|
||
begin
|
||
Result := True;
|
||
Exit;
|
||
end;
|
||
if IsBacktrackingGroupAsAtom then
|
||
Exit;
|
||
end;
|
||
Inc(LoopCnt); // Couldn't or didn't - move forward.
|
||
end; { of while }
|
||
end
|
||
else begin
|
||
while LoopCnt <= no do
|
||
begin
|
||
regInput := save + LoopCnt;
|
||
if MatchPrim(next) then
|
||
begin
|
||
Result := True;
|
||
Exit;
|
||
end;
|
||
if IsBacktrackingGroupAsAtom then
|
||
Exit;
|
||
Inc(LoopCnt); // Couldn't or didn't - move forward.
|
||
end; { of while }
|
||
end;
|
||
Exit;
|
||
end;
|
||
|
||
OP_STAR_POSS, OP_PLUS_POSS, OP_BRACES_POSS:
|
||
begin
|
||
opnd := scan + REOpSz + RENextOffSz;
|
||
case scan^ of
|
||
OP_STAR_POSS:
|
||
begin
|
||
FindRepeated(opnd, MaxInt);
|
||
end;
|
||
OP_PLUS_POSS:
|
||
begin
|
||
if FindRepeated(opnd, MaxInt) < 1 then
|
||
Exit;
|
||
end;
|
||
else
|
||
begin // braces
|
||
opnd := AlignToPtr(opnd);
|
||
if FindRepeated(opnd + 2 * REBracesArgSz, PREBracesArg(opnd + REBracesArgSz)^)
|
||
< PREBracesArg(opnd)^
|
||
then
|
||
Exit;
|
||
end;
|
||
end;
|
||
end;
|
||
|
||
OP_EEND:
|
||
begin
|
||
Result := True; // Success!
|
||
Exit;
|
||
end;
|
||
|
||
{$IFDEF FastUnicodeData}
|
||
OP_ANYCATEGORY:
|
||
begin
|
||
if (regInput >= fInputCurrentEnd) then Exit;
|
||
if not MatchOneCharCategory(scan + REOpSz + RENextOffSz, regInput) then Exit;
|
||
{$IFDEF UNICODEEX}
|
||
IncUnicode(regInput);
|
||
{$ELSE}
|
||
Inc(regInput);
|
||
{$ENDIF}
|
||
end;
|
||
|
||
OP_NOTCATEGORY:
|
||
begin
|
||
if (regInput >= fInputCurrentEnd) then Exit;
|
||
if MatchOneCharCategory(scan + REOpSz + RENextOffSz, regInput) then Exit;
|
||
{$IFDEF UNICODEEX}
|
||
IncUnicode(regInput);
|
||
{$ELSE}
|
||
Inc(regInput);
|
||
{$ENDIF}
|
||
end;
|
||
{$ENDIF}
|
||
|
||
OP_RECUR:
|
||
begin
|
||
// call opcode start
|
||
if regRecursion < RegexMaxRecursion then
|
||
begin
|
||
Inc(regRecursion);
|
||
if regNumBrackets > 0 then begin
|
||
CurrentGrpBounds.TmpStart := @GrpBounds[regRecursion].TmpStart[0];
|
||
CurrentGrpBounds.GrpStart := @GrpBounds[regRecursion].GrpStart[0];
|
||
CurrentGrpBounds.GrpEnd := @GrpBounds[regRecursion].GrpEnd[0];
|
||
FillChar(CurrentGrpBounds.GrpStart[0], SizeOf(CurrentGrpBounds.GrpStart[0])*regNumBrackets, 0);
|
||
end;
|
||
Result := MatchPrim(regCodeWork);
|
||
Dec(regRecursion);
|
||
if regNumBrackets > 0 then begin
|
||
CurrentGrpBounds.TmpStart := @GrpBounds[regRecursion].TmpStart[0];
|
||
CurrentGrpBounds.GrpStart := @GrpBounds[regRecursion].GrpStart[0];
|
||
CurrentGrpBounds.GrpEnd := @GrpBounds[regRecursion].GrpEnd[0];
|
||
end;
|
||
if not Result then Exit;
|
||
Result := False;
|
||
end
|
||
else
|
||
Exit;
|
||
end;
|
||
|
||
OP_SUBCALL:
|
||
begin
|
||
// call subroutine
|
||
no := PReGroupIndex((scan + REOpSz + RENextOffSz))^;
|
||
if no < 0 then Exit;
|
||
save := GrpOpCodes[no];
|
||
if save = nil then Exit;
|
||
if regRecursion < RegexMaxRecursion then
|
||
begin
|
||
Local.savedCurrentSubCalled := CurrentSubCalled;
|
||
CurrentSubCalled := no;
|
||
Inc(regRecursion);
|
||
if regNumBrackets > 0 then begin
|
||
CurrentGrpBounds.TmpStart := @GrpBounds[regRecursion].TmpStart[0];
|
||
CurrentGrpBounds.GrpStart := @GrpBounds[regRecursion].GrpStart[0];
|
||
CurrentGrpBounds.GrpEnd := @GrpBounds[regRecursion].GrpEnd[0];
|
||
FillChar(CurrentGrpBounds.GrpStart[0], SizeOf(CurrentGrpBounds.GrpStart[0])*regNumBrackets, 0);
|
||
end;
|
||
Result := MatchPrim(save);
|
||
Dec(regRecursion);
|
||
if regNumBrackets > 0 then begin
|
||
CurrentGrpBounds.TmpStart := @GrpBounds[regRecursion].TmpStart[0];
|
||
CurrentGrpBounds.GrpStart := @GrpBounds[regRecursion].GrpStart[0];
|
||
CurrentGrpBounds.GrpEnd := @GrpBounds[regRecursion].GrpEnd[0];
|
||
end;
|
||
CurrentSubCalled := Local.savedCurrentSubCalled;
|
||
if not Result then Exit;
|
||
Result := False;
|
||
end
|
||
else
|
||
Exit;
|
||
end;
|
||
|
||
OP_ANYLINEBREAK:
|
||
begin
|
||
if (regInput >= fInputCurrentEnd) or not IsAnyLineBreak(regInput^) then
|
||
Exit;
|
||
if regInput^ = #13 then begin
|
||
Inc(regInput);
|
||
if (regInput < fInputCurrentEnd) and (regInput^ = #10) then
|
||
Inc(regInput);
|
||
end
|
||
else
|
||
Inc(regInput);
|
||
end;
|
||
|
||
{$IFDEF WITH_REGEX_ASSERT}
|
||
else
|
||
Error(reeMatchPrimMemoryCorruption);
|
||
Exit;
|
||
{$ENDIF}
|
||
end; { of case scan^ }
|
||
scan := next;
|
||
end; { of while scan <> nil }
|
||
end; { of function TRegExpr.MatchPrim
|
||
-------------------------------------------------------------- }
|
||
|
||
function TRegExpr.Exec(const AInputString: RegExprString): Boolean;
|
||
begin
|
||
InputString := AInputString;
|
||
Result := ExecPrim(1, False, False, 0);
|
||
end; { of function TRegExpr.Exec
|
||
-------------------------------------------------------------- }
|
||
|
||
{$IFDEF OverMeth}
|
||
function TRegExpr.Exec: Boolean;
|
||
var
|
||
SlowChecks: Boolean;
|
||
begin
|
||
SlowChecks := (fInputEnd - fInputStart < fSlowChecksSizeMax) and (regMustString <> '');
|
||
Result := ExecPrim(1, SlowChecks, False, 0);
|
||
end; { of function TRegExpr.Exec
|
||
-------------------------------------------------------------- }
|
||
|
||
function TRegExpr.Exec(AOffset: Integer): Boolean;
|
||
begin
|
||
// Check that the start position is not negative
|
||
if AOffset < 1 then
|
||
begin
|
||
ClearMatches;
|
||
Error(reeOffsetMustBePositive);
|
||
Result := False;
|
||
Exit;
|
||
end;
|
||
Result := ExecPrim(AOffset, False, False, 0);
|
||
end; { of function TRegExpr.Exec
|
||
-------------------------------------------------------------- }
|
||
{$ENDIF}
|
||
|
||
function TRegExpr.ExecPos(AOffset: Integer {$IFDEF DefParam} = 1{$ENDIF}): Boolean;
|
||
begin
|
||
// Check that the start position is not negative
|
||
if AOffset < 1 then
|
||
begin
|
||
ClearMatches;
|
||
Error(reeOffsetMustBePositive);
|
||
Result := False;
|
||
Exit;
|
||
end;
|
||
Result := ExecPrim(AOffset, False, False, 0);
|
||
end; { of function TRegExpr.ExecPos
|
||
-------------------------------------------------------------- }
|
||
|
||
{$IFDEF OverMeth}
|
||
function TRegExpr.ExecPos(AOffset: Integer; ATryOnce, ABackward: Boolean): Boolean;
|
||
begin
|
||
// Check that the start position is not negative
|
||
if AOffset < 1 then
|
||
begin
|
||
ClearMatches;
|
||
Error(reeOffsetMustBePositive);
|
||
Result := False;
|
||
Exit;
|
||
end;
|
||
if ATryOnce then
|
||
Result := ExecPrim(AOffset, False, ABackward, AOffset + 1)
|
||
else
|
||
Result := ExecPrim(AOffset, False, ABackward, 0);
|
||
end;
|
||
|
||
function TRegExpr.ExecPos(AOffset, ATryMatchOnlyStartingBefore: Integer): Boolean;
|
||
begin
|
||
// Check that the start position is not negative
|
||
if AOffset < 1 then
|
||
begin
|
||
ClearMatches;
|
||
Error(reeOffsetMustBePositive);
|
||
Result := False;
|
||
Exit;
|
||
end;
|
||
if (ATryMatchOnlyStartingBefore > 0) and (AOffset >= ATryMatchOnlyStartingBefore) then begin
|
||
ClearMatches;
|
||
Result := False;
|
||
Exit;
|
||
end;
|
||
Result := ExecPrim(AOffset, False, False, ATryMatchOnlyStartingBefore);
|
||
end;
|
||
{$ENDIF}
|
||
|
||
function TRegExpr.MatchAtOnePos(APos: PRegExprChar): Boolean;
|
||
begin
|
||
regInput := APos;
|
||
//regNestedCalls := 0;
|
||
fInputCurrentEnd := fInputEnd;
|
||
GrpBounds[0].GrpStart[0] := APos;
|
||
Result := MatchPrim(regCodeWork);
|
||
if Result then
|
||
Result := regInput >= GrpBounds[0].GrpStart[0];
|
||
if Result then
|
||
GrpBounds[0].GrpEnd[0] := regInput
|
||
else
|
||
GrpBounds[0].GrpStart[0] := nil;
|
||
end;
|
||
|
||
procedure TRegExpr.ClearMatches;
|
||
begin
|
||
if FMatchesCleared then
|
||
exit;
|
||
FMatchesCleared := True;
|
||
if Length(GrpBounds[0].GrpStart) > 0 then
|
||
FillChar(GrpBounds[0].GrpStart[0], SizeOf(GrpBounds[0].GrpStart[0])*regNumBrackets, 0);
|
||
end;
|
||
|
||
procedure TRegExpr.ClearInternalExecData;
|
||
begin
|
||
fLastError := reeOk;
|
||
if Length(GrpBacktrackingAsAtom) > 0 then
|
||
FillChar(GrpBacktrackingAsAtom[0], SizeOf(GrpBacktrackingAsAtom[0])*regNumAtomicBrackets, 0);
|
||
IsBacktrackingGroupAsAtom := False;
|
||
{$IFDEF ComplexBraces}
|
||
// no loops started
|
||
CurrentLoopInfoListPtr := nil;
|
||
{$ENDIF}
|
||
LookAroundInfoList := nil;
|
||
CurrentSubCalled := -1;
|
||
regRecursion := 0;
|
||
if regNumBrackets > 0 then begin
|
||
CurrentGrpBounds.TmpStart := @GrpBounds[0].TmpStart[0];
|
||
CurrentGrpBounds.GrpStart := @GrpBounds[0].GrpStart[0];
|
||
CurrentGrpBounds.GrpEnd := @GrpBounds[0].GrpEnd[0];
|
||
end;
|
||
end;
|
||
|
||
procedure TRegExpr.InitInternalGroupData;
|
||
var
|
||
BndLen, i: Integer;
|
||
begin
|
||
BndLen := GroupDataArraySize(regNumBrackets, Length(GrpBounds[0].GrpStart));
|
||
if hasRecursion then begin
|
||
for i := low(GrpBounds) to high(GrpBounds) do begin
|
||
SetLength(GrpBounds[i].TmpStart, BndLen);
|
||
SetLength(GrpBounds[i].GrpStart, BndLen);
|
||
SetLength(GrpBounds[i].GrpEnd, BndLen);
|
||
end;
|
||
end
|
||
else begin
|
||
SetLength(GrpBounds[0].TmpStart, BndLen);
|
||
SetLength(GrpBounds[0].GrpStart, BndLen);
|
||
SetLength(GrpBounds[0].GrpEnd, BndLen);
|
||
for i := low(GrpBounds) + 1 to high(GrpBounds) do begin
|
||
GrpBounds[i].TmpStart := nil;
|
||
GrpBounds[i].GrpStart := nil;
|
||
GrpBounds[i].GrpEnd := nil;
|
||
end;
|
||
end;
|
||
|
||
SetLength(GrpOpCodes, GroupDataArraySize(regNumBrackets, Length(GrpOpCodes)));
|
||
SetLength(GrpBacktrackingAsAtom, GroupDataArraySize(regNumAtomicBrackets, Length(GrpBacktrackingAsAtom)));
|
||
|
||
GrpOpCodes[0] := nil;
|
||
end;
|
||
|
||
function TRegExpr.ExecPrim(AOffset: Integer; ASlowChecks, ABackward: Boolean;
|
||
ATryMatchOnlyStartingBefore: Integer): Boolean;
|
||
var
|
||
Len: Ptrint;
|
||
begin
|
||
Result := False;
|
||
|
||
// Ensure that Match cleared either if optimization tricks or some error
|
||
// will lead to leaving ExecPrim without actual search. That is
|
||
// important for ExecNext logic and so on.
|
||
ClearMatches;
|
||
|
||
// Don't check IsProgrammOk here! it causes big slowdown in test_benchmark!
|
||
if programm = nil then
|
||
begin
|
||
if fRaiseForRuntimeError then begin
|
||
Compile;
|
||
end
|
||
else begin
|
||
try
|
||
Compile;
|
||
except
|
||
on E: ERegExpr do begin
|
||
Result := False;
|
||
end;
|
||
else begin
|
||
Result := False;
|
||
fLastError := reeUnknown;
|
||
end;
|
||
end;
|
||
end;
|
||
if programm = nil then
|
||
Exit;
|
||
end;
|
||
|
||
Len := fInputEnd - fInputStart;
|
||
if FMinMatchLen > Len then
|
||
Exit;
|
||
|
||
// Check that the start position is not longer than the line
|
||
if (AOffset - 1) > Len - FMinMatchLen then
|
||
exit;
|
||
|
||
|
||
// If there is a "must appear" string, look for it.
|
||
if ASlowChecks then
|
||
if regMustString <> '' then
|
||
if StrLPos(fInputStart, PRegExprChar(regMustString), Len, length(regMustString)) = nil then
|
||
exit;
|
||
|
||
{$IFDEF RegExpWithStackOverflowCheck_DecStack_Frame}
|
||
StackLimit := StackBottom;
|
||
if StackLimit <> nil then
|
||
StackLimit := StackLimit + 36000; // Add for any calls within the current MatchPrim // FPC has "STACK_MARGIN = 16384;", but we need to call Error, ..., raise
|
||
{$ENDIF}
|
||
|
||
ClearInternalExecData;
|
||
|
||
if fRaiseForRuntimeError then begin
|
||
Result := ExecPrimProtected(AOffset, ASlowChecks, ABackward, ATryMatchOnlyStartingBefore);
|
||
end
|
||
else begin
|
||
try
|
||
Result := ExecPrimProtected(AOffset, ASlowChecks, ABackward, ATryMatchOnlyStartingBefore);
|
||
except
|
||
on E: EStackOverflow do begin
|
||
Result := False;
|
||
fLastError := reeLoopStackExceeded;
|
||
end;
|
||
on E: ERegExpr do begin
|
||
Result := False;
|
||
end;
|
||
else begin
|
||
Result := False;
|
||
fLastError := reeUnknown;
|
||
end;
|
||
end;
|
||
end;
|
||
end;
|
||
|
||
function TRegExpr.ExecPrimProtected(AOffset: Integer; ASlowChecks,
|
||
ABackward: Boolean; ATryMatchOnlyStartingBefore: Integer): Boolean;
|
||
var
|
||
Ptr, SearchEnd: PRegExprChar;
|
||
begin
|
||
Result := False;
|
||
|
||
if ASlowChecks then ;
|
||
|
||
Ptr := fInputStart + AOffset - 1;
|
||
fInputContinue := Ptr;
|
||
FMatchesCleared := False;
|
||
// ATryOnce or anchored match (it needs to be tried only once).
|
||
if (ATryMatchOnlyStartingBefore = AOffset + 1) or (regAnchored in [raBOL, raOnlyOnce, raContinue]) then
|
||
begin
|
||
case regAnchored of
|
||
raBOL: if AOffset > 1 then Exit; // can't match the BOL
|
||
raEOL: Ptr := fInputEnd;
|
||
end;
|
||
{$IFDEF UseFirstCharSet}
|
||
if (Ptr < fInputEnd)
|
||
{$IFDEF UnicodeRE} and (Ord(Ptr^) <= $FF) {$ENDIF}
|
||
then
|
||
if not FirstCharArray[Byte(Ptr^)] then
|
||
Exit;
|
||
{$ENDIF}
|
||
|
||
Result := MatchAtOnePos(Ptr);
|
||
Exit;
|
||
end;
|
||
|
||
// Messy cases: unanchored match.
|
||
if ABackward then begin
|
||
Inc(Ptr, 2);
|
||
repeat
|
||
Dec(Ptr);
|
||
if Ptr < fInputStart then
|
||
Exit;
|
||
|
||
{$IFDEF UseFirstCharSet}
|
||
{$IFDEF UnicodeRE}
|
||
if Ord(Ptr^) <= $FF then
|
||
{$ENDIF}
|
||
if not FirstCharArray[byte(Ptr^)] then
|
||
Continue;
|
||
{$ENDIF}
|
||
|
||
Result := MatchAtOnePos(Ptr);
|
||
// Exit on a match or after testing the end-of-string
|
||
if Result then
|
||
Exit;
|
||
until False;
|
||
end
|
||
else begin
|
||
Dec(Ptr);
|
||
SearchEnd := fInputEnd - FMinMatchLen;
|
||
if (ATryMatchOnlyStartingBefore > 0) and (fInputStart + ATryMatchOnlyStartingBefore < SearchEnd) then
|
||
SearchEnd := fInputStart + ATryMatchOnlyStartingBefore - 2;
|
||
repeat
|
||
Inc(Ptr);
|
||
if Ptr > SearchEnd then
|
||
Break;
|
||
|
||
{$IFDEF UseFirstCharSet}
|
||
{$IFDEF UnicodeRE}
|
||
if Ord(Ptr^) <= $FF then
|
||
{$ENDIF}
|
||
if not FirstCharArray[byte(Ptr^)] then
|
||
Continue;
|
||
{$ENDIF}
|
||
|
||
Result := MatchAtOnePos(Ptr);
|
||
// Exit on a match or after testing the end-of-string
|
||
if Result then
|
||
Exit;
|
||
until False;
|
||
{$IFDEF UseFirstCharSet}
|
||
if FirstCharArray[0] and (fInputEnd^ <> #0) then
|
||
Result := MatchAtOnePos(fInputEnd);
|
||
{$ENDIF}
|
||
end;
|
||
end; { of function TRegExpr.ExecPrim
|
||
-------------------------------------------------------------- }
|
||
|
||
function TRegExpr.ExecNext(ABackward: Boolean {$IFDEF DefParam} = False{$ENDIF}): Boolean;
|
||
var
|
||
PtrBegin, PtrEnd: PRegExprChar;
|
||
Offset: PtrInt;
|
||
begin
|
||
PtrBegin := GrpBounds[0].GrpStart[0];
|
||
PtrEnd := GrpBounds[0].GrpEnd[0];
|
||
if (PtrBegin = nil) or (PtrEnd = nil) then
|
||
begin
|
||
Error(reeExecNextWithoutExec);
|
||
Result := False;
|
||
Exit;
|
||
end;
|
||
|
||
Offset := PtrEnd - fInputStart + 1;
|
||
// prevent infinite looping if empty string matches r.e.
|
||
if PtrBegin = PtrEnd then
|
||
Inc(Offset);
|
||
|
||
Result := ExecPrim(Offset, False, ABackward, 0);
|
||
end; { of function TRegExpr.ExecNext
|
||
-------------------------------------------------------------- }
|
||
|
||
procedure TRegExpr.SetInputString(const AInputString: RegExprString);
|
||
begin
|
||
ClearMatches;
|
||
|
||
fInputString := AInputString;
|
||
//UniqueString(fInputString);
|
||
|
||
fInputStart := PRegExprChar(fInputString);
|
||
fInputEnd := fInputStart + Length(fInputString);
|
||
fInputContinue := fInputStart;
|
||
end;
|
||
|
||
procedure TRegExpr.SetInputRange(AStart, AEnd, AContinueAnchor: PRegExprChar);
|
||
begin
|
||
ClearMatches;
|
||
fInputString := '';
|
||
fInputStart := AStart;
|
||
fInputEnd := AEnd;
|
||
fInputContinue := AContinueAnchor;
|
||
end;
|
||
|
||
{$IFDEF UseLineSep}
|
||
procedure TRegExpr.SetLineSeparators(const AStr: RegExprString);
|
||
begin
|
||
if AStr <> fLineSeparators then
|
||
begin
|
||
fLineSeparators := AStr;
|
||
InitLineSepArray;
|
||
InvalidateProgramm;
|
||
end;
|
||
end; { of procedure TRegExpr.SetLineSeparators
|
||
-------------------------------------------------------------- }
|
||
{$ENDIF}
|
||
|
||
procedure TRegExpr.SetUsePairedBreak(AValue: Boolean);
|
||
begin
|
||
if AValue <> fUsePairedBreak then
|
||
begin
|
||
fUsePairedBreak := AValue;
|
||
InvalidateProgramm;
|
||
end;
|
||
end;
|
||
|
||
function TRegExpr.Substitute(const ATemplate: RegExprString): RegExprString;
|
||
// perform substitutions after a regexp match
|
||
var
|
||
TemplateBeg, TemplateEnd: PRegExprChar;
|
||
|
||
function ParseVarName(var APtr: PRegExprChar): Integer;
|
||
// extract name of variable: $1 or ${1} or ${name}
|
||
// from APtr^, uses TemplateEnd
|
||
var
|
||
p: PRegExprChar;
|
||
Delimited: Boolean;
|
||
GrpName: RegExprString;
|
||
begin
|
||
Result := 0;
|
||
GrpName := '';
|
||
p := APtr;
|
||
Delimited := (p < TemplateEnd) and (p^ = '{');
|
||
if Delimited then
|
||
Inc(p); // skip left curly brace
|
||
if (p < TemplateEnd) and (p^ = '&') then
|
||
Inc(p) // this is '$&' or '${&}'
|
||
else
|
||
begin
|
||
if IsDigitChar(p^) then
|
||
begin
|
||
while (p < TemplateEnd) and IsDigitChar(p^) do
|
||
begin
|
||
Result := Result * 10 + (Ord(p^) - Ord('0'));
|
||
Inc(p);
|
||
end
|
||
end
|
||
else
|
||
if Delimited then
|
||
begin
|
||
FindGroupName(p, TemplateEnd, '}', GrpName);
|
||
Result := GrpNames.MatchIndexFromName(GrpName);
|
||
Inc(p, Length(GrpName));
|
||
end;
|
||
end;
|
||
if Delimited then
|
||
if (p < TemplateEnd) and (p^ = '}') then
|
||
Inc(p) // skip right curly brace
|
||
else
|
||
p := APtr; // isn't properly terminated
|
||
if p = APtr then
|
||
Result := -1; // no valid digits found or no right curly brace
|
||
APtr := p;
|
||
end;
|
||
|
||
procedure FindSubstGroupIndex(var p: PRegExprChar; var Idx: Integer; var NumberFound: Boolean);
|
||
begin
|
||
Idx := ParseVarName(p);
|
||
NumberFound := Idx >= 0;
|
||
if NumberFound and (Idx > GrpCount) then
|
||
Idx := -1;
|
||
end;
|
||
|
||
type
|
||
TSubstMode = (smodeNormal, smodeOneUpper, smodeOneLower, smodeAllUpper, smodeAllLower);
|
||
var
|
||
Mode: TSubstMode;
|
||
p, p0, p1, ResultPtr: PRegExprChar;
|
||
ResultLen, n: Integer;
|
||
Ch, QuotedChar: REChar;
|
||
GroupFound: Boolean;
|
||
begin
|
||
// Check programm and input string
|
||
if not IsProgrammOk then
|
||
Exit;
|
||
// Note: don't check for empty fInputString, it's valid case,
|
||
// e.g. user needs to replace regex "\b" to "_", it's zero match length
|
||
if ATemplate = '' then
|
||
begin
|
||
Result := '';
|
||
Exit;
|
||
end;
|
||
TemplateBeg := PRegExprChar(ATemplate);
|
||
TemplateEnd := TemplateBeg + Length(ATemplate);
|
||
// Count result length for speed optimization.
|
||
ResultLen := 0;
|
||
p := TemplateBeg;
|
||
while p < TemplateEnd do
|
||
begin
|
||
Ch := p^;
|
||
Inc(p);
|
||
n := -1;
|
||
GroupFound := False;
|
||
if Ch = SubstituteGroupChar then
|
||
FindSubstGroupIndex(p, n, GroupFound);
|
||
if GroupFound then
|
||
begin
|
||
if (n >= 0) and (GrpBounds[0].GrpStart[n] <> nil) then
|
||
Inc(ResultLen, GrpBounds[0].GrpEnd[n] - GrpBounds[0].GrpStart[n]);
|
||
end
|
||
else
|
||
begin
|
||
if (Ch = EscChar) and (p < TemplateEnd) then
|
||
begin // quoted or special char followed
|
||
Ch := p^;
|
||
Inc(p);
|
||
case Ch of
|
||
'n':
|
||
Inc(ResultLen, Length(fReplaceLineEnd));
|
||
'u', 'l', 'U', 'L': { nothing }
|
||
;
|
||
'x':
|
||
begin
|
||
Inc(ResultLen);
|
||
if (p^ = '{') then
|
||
begin // skip \x{....}
|
||
while ((p^ <> '}') and (p < TemplateEnd)) do
|
||
p := p + 1;
|
||
p := p + 1;
|
||
end
|
||
else
|
||
p := p + 2 // skip \x..
|
||
end;
|
||
else
|
||
Inc(ResultLen);
|
||
end;
|
||
end
|
||
else
|
||
Inc(ResultLen);
|
||
end;
|
||
end;
|
||
// Get memory. We do it once and it significant speed up work !
|
||
if ResultLen = 0 then
|
||
begin
|
||
Result := '';
|
||
Exit;
|
||
end;
|
||
SetLength(Result, ResultLen);
|
||
// Fill Result
|
||
ResultPtr := PRegExprChar(Result);
|
||
p := TemplateBeg;
|
||
Mode := smodeNormal;
|
||
while p < TemplateEnd do
|
||
begin
|
||
Ch := p^;
|
||
p0 := p;
|
||
Inc(p);
|
||
p1 := p;
|
||
n := -1;
|
||
GroupFound := False;
|
||
if Ch = SubstituteGroupChar then
|
||
FindSubstGroupIndex(p, n, GroupFound);
|
||
if GroupFound then
|
||
begin
|
||
if n >= 0 then
|
||
begin
|
||
p0 := GrpBounds[0].GrpStart[n];
|
||
if p0 = nil then
|
||
p1 := nil
|
||
else
|
||
p1 := GrpBounds[0].GrpEnd[n];
|
||
end
|
||
else
|
||
p1 := p0;
|
||
end
|
||
else
|
||
begin
|
||
if (Ch = EscChar) and (p < TemplateEnd) then
|
||
begin // quoted or special char followed
|
||
Ch := p^;
|
||
Inc(p);
|
||
case Ch of
|
||
'n':
|
||
begin
|
||
p0 := PRegExprChar(fReplaceLineEnd);
|
||
p1 := p0 + Length(fReplaceLineEnd);
|
||
end;
|
||
'x', 't', 'r', 'f', 'a', 'e':
|
||
begin
|
||
p := p - 1;
|
||
// UnquoteChar expects the escaped char under the pointer
|
||
QuotedChar := UnQuoteChar(p, TemplateEnd);
|
||
p := p + 1;
|
||
// Skip after last part of the escaped sequence - UnquoteChar stops on the last symbol of it
|
||
p0 := @QuotedChar;
|
||
p1 := p0 + 1;
|
||
end;
|
||
'l':
|
||
begin
|
||
Mode := smodeOneLower;
|
||
p1 := p0;
|
||
end;
|
||
'L':
|
||
begin
|
||
Mode := smodeAllLower;
|
||
p1 := p0;
|
||
end;
|
||
'u':
|
||
begin
|
||
Mode := smodeOneUpper;
|
||
p1 := p0;
|
||
end;
|
||
'U':
|
||
begin
|
||
Mode := smodeAllUpper;
|
||
p1 := p0;
|
||
end;
|
||
else
|
||
Inc(p0);
|
||
Inc(p1);
|
||
end;
|
||
end
|
||
end;
|
||
if p0 < p1 then
|
||
begin
|
||
while p0 < p1 do
|
||
begin
|
||
case Mode of
|
||
smodeOneLower:
|
||
begin
|
||
ResultPtr^ := _LowerCase(p0^);
|
||
Mode := smodeNormal;
|
||
end;
|
||
smodeAllLower:
|
||
begin
|
||
ResultPtr^ := _LowerCase(p0^);
|
||
end;
|
||
smodeOneUpper:
|
||
begin
|
||
ResultPtr^ := _UpperCase(p0^);
|
||
Mode := smodeNormal;
|
||
end;
|
||
smodeAllUpper:
|
||
begin
|
||
ResultPtr^ := _UpperCase(p0^);
|
||
end;
|
||
else
|
||
ResultPtr^ := p0^;
|
||
end;
|
||
Inc(ResultPtr);
|
||
Inc(p0);
|
||
end;
|
||
Mode := smodeNormal;
|
||
end;
|
||
end;
|
||
end; { of function TRegExpr.Substitute
|
||
-------------------------------------------------------------- }
|
||
|
||
procedure TRegExpr.Split(const AInputStr: RegExprString; APieces: TStrings);
|
||
var
|
||
PrevPos: PtrInt;
|
||
begin
|
||
PrevPos := 1;
|
||
if Exec(AInputStr) then
|
||
repeat
|
||
APieces.Add(System.Copy(AInputStr, PrevPos, MatchPos[0] - PrevPos));
|
||
PrevPos := MatchPos[0] + MatchLen[0];
|
||
until not ExecNext;
|
||
APieces.Add(System.Copy(AInputStr, PrevPos, MaxInt)); // Tail
|
||
end; { of procedure TRegExpr.Split
|
||
-------------------------------------------------------------- }
|
||
|
||
function TRegExpr.Replace(const AInputStr: RegExprString;
|
||
const AReplaceStr: RegExprString;
|
||
AUseSubstitution: Boolean{$IFDEF DefParam} = False{$ENDIF}): RegExprString;
|
||
var
|
||
PrevPos: PtrInt;
|
||
begin
|
||
Result := '';
|
||
PrevPos := 1;
|
||
if Exec(AInputStr) then
|
||
repeat
|
||
Result := Result + System.Copy(AInputStr, PrevPos, MatchPos[0] - PrevPos);
|
||
if AUseSubstitution
|
||
then
|
||
Result := Result + Substitute(AReplaceStr)
|
||
else
|
||
Result := Result + AReplaceStr;
|
||
PrevPos := MatchPos[0] + MatchLen[0];
|
||
until not ExecNext;
|
||
Result := Result + System.Copy(AInputStr, PrevPos, MaxInt); // Tail
|
||
end; { of function TRegExpr.Replace
|
||
-------------------------------------------------------------- }
|
||
|
||
function TRegExpr.ReplaceEx(const AInputStr: RegExprString;
|
||
AReplaceFunc: TRegExprReplaceFunction): RegExprString;
|
||
var
|
||
PrevPos: PtrInt;
|
||
begin
|
||
Result := '';
|
||
PrevPos := 1;
|
||
if Exec(AInputStr) then
|
||
repeat
|
||
Result := Result + System.Copy(AInputStr, PrevPos, MatchPos[0] - PrevPos)
|
||
+ AReplaceFunc(Self);
|
||
PrevPos := MatchPos[0] + MatchLen[0];
|
||
until not ExecNext;
|
||
Result := Result + System.Copy(AInputStr, PrevPos, MaxInt); // Tail
|
||
end; { of function TRegExpr.ReplaceEx
|
||
-------------------------------------------------------------- }
|
||
|
||
{$IFDEF OverMeth}
|
||
function TRegExpr.Replace(const AInputStr: RegExprString;
|
||
AReplaceFunc: TRegExprReplaceFunction): RegExprString;
|
||
begin
|
||
Result := ReplaceEx(AInputStr, AReplaceFunc);
|
||
end; { of function TRegExpr.Replace
|
||
-------------------------------------------------------------- }
|
||
{$ENDIF}
|
||
{ ============================================================= }
|
||
{ ====================== Debug section ======================== }
|
||
{ ============================================================= }
|
||
|
||
{$IFDEF UseFirstCharSet}
|
||
procedure TRegExpr.FillFirstCharSet(prog: PRegExprChar);
|
||
var
|
||
scan: PRegExprChar; // Current node.
|
||
Next: PRegExprChar; // Next node.
|
||
opnd: PRegExprChar;
|
||
Oper: TREOp;
|
||
ch: REChar;
|
||
min_cnt: Integer;
|
||
{$IFDEF UseLineSep}
|
||
i: Integer;
|
||
{$ENDIF}
|
||
TempSet, TmpFirstCharSet: TRegExprCharset;
|
||
begin
|
||
TempSet := [];
|
||
scan := prog;
|
||
while scan <> nil do
|
||
begin
|
||
Next := regNextQuick(scan);
|
||
Oper := PREOp(scan)^;
|
||
case Oper of
|
||
OP_BSUBEXP,
|
||
OP_BSUBEXP_CI:
|
||
begin
|
||
// we cannot optimize r.e. if it starts with back reference
|
||
FirstCharSet := RegExprAllSet;
|
||
Exit;
|
||
end;
|
||
|
||
OP_BOL,
|
||
OP_BOL_ML,
|
||
OP_CONTINUE_POS,
|
||
OP_RESET_MATCHPOS:
|
||
; // Exit;
|
||
|
||
OP_EOL,
|
||
OP_EOL2,
|
||
OP_EOL_ML:
|
||
begin
|
||
Include(FirstCharSet, 0);
|
||
if ModifierM then
|
||
begin
|
||
{$IFDEF UseLineSep}
|
||
for i := 1 to Length(LineSeparators) do
|
||
Include(FirstCharSet, Byte(LineSeparators[i]));
|
||
{$ELSE}
|
||
FirstCharSet := FirstCharSet + RegExprLineSeparatorsSet;
|
||
{$ENDIF}
|
||
end;
|
||
Exit;
|
||
end;
|
||
|
||
OP_BOUND,
|
||
OP_NOTBOUND:
|
||
;
|
||
|
||
OP_ANY,
|
||
OP_ANY_ML:
|
||
begin // we can better define ANYML
|
||
FirstCharSet := RegExprAllSet;
|
||
Exit;
|
||
end;
|
||
|
||
OP_ANYDIGIT:
|
||
begin
|
||
FirstCharSet := FirstCharSet + RegExprDigitSet;
|
||
Exit;
|
||
end;
|
||
|
||
OP_NOTDIGIT:
|
||
begin
|
||
FirstCharSet := FirstCharSet + (RegExprAllSet - RegExprDigitSet);
|
||
Exit;
|
||
end;
|
||
|
||
OP_ANYLETTER:
|
||
begin
|
||
GetCharSetFromWordChars(TempSet);
|
||
FirstCharSet := FirstCharSet + TempSet;
|
||
Exit;
|
||
end;
|
||
|
||
OP_NOTLETTER:
|
||
begin
|
||
GetCharSetFromWordChars(TempSet);
|
||
FirstCharSet := FirstCharSet + (RegExprAllSet - TempSet);
|
||
Exit;
|
||
end;
|
||
|
||
OP_ANYSPACE:
|
||
begin
|
||
GetCharSetFromSpaceChars(TempSet);
|
||
FirstCharSet := FirstCharSet + TempSet;
|
||
Exit;
|
||
end;
|
||
|
||
OP_NOTSPACE:
|
||
begin
|
||
GetCharSetFromSpaceChars(TempSet);
|
||
FirstCharSet := FirstCharSet + (RegExprAllSet - TempSet);
|
||
Exit;
|
||
end;
|
||
|
||
OP_ANYVERTSEP:
|
||
begin
|
||
FirstCharSet := FirstCharSet + RegExprLineSeparatorsSet;
|
||
Exit;
|
||
end;
|
||
|
||
OP_NOTVERTSEP:
|
||
begin
|
||
FirstCharSet := FirstCharSet + (RegExprAllSet - RegExprLineSeparatorsSet);
|
||
Exit;
|
||
end;
|
||
|
||
OP_ANYHORZSEP:
|
||
begin
|
||
FirstCharSet := FirstCharSet + RegExprHorzSeparatorsSet;
|
||
Exit;
|
||
end;
|
||
|
||
OP_NOTHORZSEP:
|
||
begin
|
||
FirstCharSet := FirstCharSet + (RegExprAllSet - RegExprHorzSeparatorsSet);
|
||
Exit;
|
||
end;
|
||
|
||
OP_EXACTLY_CI:
|
||
begin
|
||
ch := (scan + REOpSz + RENextOffSz + RENumberSz)^;
|
||
{$IFDEF UnicodeRE}
|
||
if Ord(ch) <= $FF then
|
||
{$ENDIF}
|
||
begin
|
||
Include(FirstCharSet, Byte(ch));
|
||
Include(FirstCharSet, Byte(InvertCase(ch)));
|
||
end;
|
||
Exit;
|
||
end;
|
||
|
||
OP_EXACTLY:
|
||
begin
|
||
ch := (scan + REOpSz + RENextOffSz + RENumberSz)^;
|
||
{$IFDEF UnicodeRE}
|
||
if Ord(ch) <= $FF then
|
||
{$ENDIF}
|
||
Include(FirstCharSet, Byte(ch));
|
||
Exit;
|
||
end;
|
||
|
||
OP_ANYOF:
|
||
begin
|
||
GetCharSetFromCharClass(scan + REOpSz + RENextOffSz, False, TempSet);
|
||
FirstCharSet := FirstCharSet + TempSet;
|
||
Exit;
|
||
end;
|
||
|
||
OP_ANYBUT:
|
||
begin
|
||
GetCharSetFromCharClass(scan + REOpSz + RENextOffSz, False, TempSet);
|
||
FirstCharSet := FirstCharSet + (RegExprAllSet - TempSet);
|
||
Exit;
|
||
end;
|
||
|
||
OP_ANYOF_CI:
|
||
begin
|
||
GetCharSetFromCharClass(scan + REOpSz + RENextOffSz, True, TempSet);
|
||
FirstCharSet := FirstCharSet + TempSet;
|
||
Exit;
|
||
end;
|
||
|
||
OP_ANYBUT_CI:
|
||
begin
|
||
GetCharSetFromCharClass(scan + REOpSz + RENextOffSz, True, TempSet);
|
||
FirstCharSet := FirstCharSet + (RegExprAllSet - TempSet);
|
||
Exit;
|
||
end;
|
||
|
||
OP_NOTHING:
|
||
;
|
||
OP_COMMENT:
|
||
;
|
||
OP_BACK:
|
||
begin
|
||
// No point to rescan the code again
|
||
Next := PRegExprChar(AlignToPtr(scan + 1)) + RENextOffSz;;
|
||
end;
|
||
|
||
OP_OPEN, OP_OPEN_ATOMIC:
|
||
begin
|
||
FillFirstCharSet(Next);
|
||
Exit;
|
||
end;
|
||
|
||
OP_CLOSE, OP_CLOSE_ATOMIC:
|
||
begin
|
||
FillFirstCharSet(Next);
|
||
Exit;
|
||
end;
|
||
|
||
OP_LOOKAHEAD:
|
||
begin
|
||
opnd := PRegExprChar(AlignToPtr(Next + 1)) + RENextOffSz;
|
||
Next := regNextQuick(Next);
|
||
|
||
TempSet := FirstCharSet;
|
||
FirstCharSet := [];
|
||
FillFirstCharSet(Next); // after the lookahead
|
||
|
||
Next := PRegExprChar(AlignToPtr(scan + 1)) + RENextOffSz;
|
||
TmpFirstCharSet := FirstCharSet;
|
||
FirstCharSet := [];
|
||
FillFirstCharSet(Next); // inside the lookahead
|
||
|
||
if TmpFirstCharSet = [] then
|
||
FirstCharSet := TempSet + FirstCharSet
|
||
else
|
||
if FirstCharSet = [] then
|
||
FirstCharSet := TempSet + TmpFirstCharSet
|
||
else
|
||
FirstCharSet := TempSet + (FirstCharSet * TmpFirstCharSet);
|
||
exit;
|
||
end;
|
||
|
||
OP_LOOKAHEAD_NEG,
|
||
OP_LOOKBEHIND, OP_LOOKBEHIND_NEG:
|
||
begin
|
||
Next := PRegExprChar(AlignToPtr(Next + 1)) + RENextOffSz;
|
||
end;
|
||
|
||
OP_LOOKAHEAD_END, OP_LOOKBEHIND_END:
|
||
begin
|
||
Exit;
|
||
end;
|
||
|
||
OP_BRANCH, OP_GBRANCH, OP_GBRANCH_EX, OP_GBRANCH_EX_CI:
|
||
begin
|
||
repeat
|
||
TmpFirstCharSet := FirstCharSet;
|
||
FirstCharSet := [];
|
||
FillFirstCharSet(scan + REOpSz + RENextOffSz + REBranchArgSz);
|
||
FirstCharSet := FirstCharSet + TmpFirstCharSet;
|
||
scan := regNextQuick(scan);
|
||
until (scan = nil) or
|
||
( (PREOp(scan)^ <> OP_BRANCH) and (PREOp(Next)^ <> OP_GBRANCH) and
|
||
(PREOp(scan)^ <> OP_GBRANCH_EX) and (PREOp(scan)^ <> OP_GBRANCH_EX_CI) );
|
||
Exit;
|
||
end;
|
||
|
||
{$IFDEF ComplexBraces}
|
||
OP_LOOPENTRY:
|
||
begin
|
||
min_cnt := PREBracesArg(AlignToPtr(Next + REOpSz + RENextOffSz))^;
|
||
if min_cnt = 0 then begin
|
||
opnd := regNext(Next);
|
||
FillFirstCharSet(opnd); // FirstChar may be after loop
|
||
end;
|
||
Next := PRegExprChar(AlignToPtr(scan + 1)) + RENextOffSz;
|
||
end;
|
||
|
||
OP_LOOP,
|
||
OP_LOOP_NG,
|
||
OP_LOOP_POSS:
|
||
begin
|
||
min_cnt := PREBracesArg(AlignToPtr(scan + REOpSz + RENextOffSz))^;
|
||
if min_cnt = 0 then
|
||
Exit;
|
||
// zero width loop
|
||
end;
|
||
{$ENDIF}
|
||
|
||
OP_STAR,
|
||
OP_STAR_NG,
|
||
OP_STAR_POSS:
|
||
FillFirstCharSet(scan + REOpSz + RENextOffSz);
|
||
|
||
OP_PLUS,
|
||
OP_PLUS_NG,
|
||
OP_PLUS_POSS:
|
||
begin
|
||
FillFirstCharSet(scan + REOpSz + RENextOffSz);
|
||
Exit;
|
||
end;
|
||
|
||
OP_BRACES,
|
||
OP_BRACES_NG,
|
||
OP_BRACES_POSS:
|
||
begin
|
||
opnd := scan + REOpSz + RENextOffSz + REBracesArgSz * 2;
|
||
min_cnt := PREBracesArg(AlignToPtr(scan + REOpSz + RENextOffSz))^; // BRACES
|
||
FillFirstCharSet(opnd);
|
||
if min_cnt > 0 then
|
||
Exit;
|
||
end;
|
||
|
||
OP_EEND:
|
||
begin
|
||
FirstCharSet := RegExprAllSet;
|
||
Exit;
|
||
end;
|
||
|
||
OP_ANYCATEGORY,
|
||
OP_NOTCATEGORY:
|
||
begin
|
||
FirstCharSet := RegExprAllSet;
|
||
Exit;
|
||
end;
|
||
|
||
OP_RECUR,
|
||
OP_SUBCALL:
|
||
begin
|
||
// we cannot optimize // TODO: lookup the called group
|
||
FirstCharSet := RegExprAllSet;
|
||
Exit;
|
||
end;
|
||
|
||
OP_ANYLINEBREAK:
|
||
begin
|
||
Include(FirstCharSet, Byte(10));
|
||
Include(FirstCharSet, Byte(13));
|
||
Include(FirstCharSet, Byte($0B));
|
||
Include(FirstCharSet, Byte($0C));
|
||
Include(FirstCharSet, Byte($85));
|
||
Exit;
|
||
end;
|
||
|
||
else
|
||
fLastErrorOpcode := Oper;
|
||
Error(reeUnknownOpcodeInFillFirst);
|
||
Exit;
|
||
end; { of case scan^}
|
||
scan := Next;
|
||
end; { of while scan <> nil}
|
||
end; { of procedure FillFirstCharSet
|
||
--------------------------------------------------------------}
|
||
{$ENDIF}
|
||
|
||
procedure TRegExpr.InitCharCheckers;
|
||
var
|
||
Cnt: Integer;
|
||
//
|
||
function Add(AChecker: TRegExprCharChecker): Byte;
|
||
begin
|
||
Inc(Cnt);
|
||
if Cnt > High(CharCheckers) then
|
||
Error(reeTooSmallCheckersArray);
|
||
CharCheckers[Cnt - 1] := AChecker;
|
||
Result := Cnt - 1;
|
||
end;
|
||
//
|
||
begin
|
||
Cnt := 0;
|
||
FillChar(CharCheckers, SizeOf(CharCheckers), 0);
|
||
|
||
CheckerIndex_Word := Add(CharChecker_Word);
|
||
CheckerIndex_NotWord := Add(CharChecker_NotWord);
|
||
CheckerIndex_Space := Add(CharChecker_Space);
|
||
CheckerIndex_NotSpace := Add(CharChecker_NotSpace);
|
||
CheckerIndex_Digit := Add(CharChecker_Digit);
|
||
CheckerIndex_NotDigit := Add(CharChecker_NotDigit);
|
||
CheckerIndex_VertSep := Add(CharChecker_VertSep);
|
||
CheckerIndex_NotVertSep := Add(CharChecker_NotVertSep);
|
||
CheckerIndex_HorzSep := Add(CharChecker_HorzSep);
|
||
CheckerIndex_NotHorzSep := Add(CharChecker_NotHorzSep);
|
||
//CheckerIndex_AllAZ := Add(CharChecker_AllAZ);
|
||
CheckerIndex_LowerAZ := Add(CharChecker_LowerAZ);
|
||
CheckerIndex_UpperAZ := Add(CharChecker_UpperAZ);
|
||
CheckerIndex_AnyLineBreak := Add(CharChecker_AnyLineBreak);
|
||
|
||
SetLength(CharCheckerInfos, 3);
|
||
with CharCheckerInfos[0] do
|
||
begin
|
||
CharBegin := 'a';
|
||
CharEnd:= 'z';
|
||
CheckerIndex := CheckerIndex_LowerAZ;
|
||
end;
|
||
with CharCheckerInfos[1] do
|
||
begin
|
||
CharBegin := 'A';
|
||
CharEnd := 'Z';
|
||
CheckerIndex := CheckerIndex_UpperAZ;
|
||
end;
|
||
with CharCheckerInfos[2] do
|
||
begin
|
||
CharBegin := '0';
|
||
CharEnd := '9';
|
||
CheckerIndex := CheckerIndex_Digit;
|
||
end;
|
||
end;
|
||
|
||
function TRegExpr.CharChecker_Word(ch: REChar): Boolean;
|
||
begin
|
||
Result := IsWordChar(ch);
|
||
end;
|
||
|
||
function TRegExpr.CharChecker_NotWord(ch: REChar): Boolean;
|
||
begin
|
||
Result := not IsWordChar(ch);
|
||
end;
|
||
|
||
function TRegExpr.CharChecker_Space(ch: REChar): Boolean;
|
||
begin
|
||
Result := IsSpaceChar(ch);
|
||
end;
|
||
|
||
function TRegExpr.CharChecker_NotSpace(ch: REChar): Boolean;
|
||
begin
|
||
Result := not IsSpaceChar(ch);
|
||
end;
|
||
|
||
function TRegExpr.CharChecker_Digit(ch: REChar): Boolean;
|
||
begin
|
||
Result := IsDigitChar(ch);
|
||
end;
|
||
|
||
function TRegExpr.CharChecker_NotDigit(ch: REChar): Boolean;
|
||
begin
|
||
Result := not IsDigitChar(ch);
|
||
end;
|
||
|
||
function TRegExpr.CharChecker_VertSep(ch: REChar): Boolean;
|
||
begin
|
||
Result := IsVertLineSeparator(ch);
|
||
end;
|
||
|
||
function TRegExpr.CharChecker_NotVertSep(ch: REChar): Boolean;
|
||
begin
|
||
Result := not IsVertLineSeparator(ch);
|
||
end;
|
||
|
||
function TRegExpr.CharChecker_AnyLineBreak(ch: REChar): Boolean;
|
||
begin
|
||
Result := IsAnyLineBreak(ch);
|
||
end;
|
||
|
||
function TRegExpr.CharChecker_HorzSep(ch: REChar): Boolean;
|
||
begin
|
||
Result := IsHorzSeparator(ch);
|
||
end;
|
||
|
||
function TRegExpr.CharChecker_NotHorzSep(ch: REChar): Boolean;
|
||
begin
|
||
Result := not IsHorzSeparator(ch);
|
||
end;
|
||
|
||
function TRegExpr.CharChecker_LowerAZ(ch: REChar): Boolean;
|
||
begin
|
||
case ch of
|
||
'a' .. 'z':
|
||
Result := True;
|
||
else
|
||
Result := False;
|
||
end;
|
||
end;
|
||
|
||
function TRegExpr.CharChecker_UpperAZ(ch: REChar): Boolean;
|
||
begin
|
||
case ch of
|
||
'A' .. 'Z':
|
||
Result := True;
|
||
else
|
||
Result := False;
|
||
end;
|
||
end;
|
||
|
||
|
||
{$IFDEF RegExpPCodeDump}
|
||
|
||
function TRegExpr.DumpOp(op: TREOp): RegExprString;
|
||
// printable representation of opcode
|
||
begin
|
||
case op of
|
||
OP_BOL:
|
||
Result := 'BOL';
|
||
OP_EOL:
|
||
Result := 'EOL';
|
||
OP_EOL2:
|
||
Result := 'EOL2';
|
||
OP_BOL_ML:
|
||
Result := 'BOL_ML';
|
||
OP_CONTINUE_POS:
|
||
Result := 'CONTINUE_POS';
|
||
OP_EOL_ML:
|
||
Result := 'EOL_ML';
|
||
OP_BOUND:
|
||
Result := 'BOUND';
|
||
OP_NOTBOUND:
|
||
Result := 'NOTBOUND';
|
||
OP_ANY:
|
||
Result := 'ANY';
|
||
OP_ANY_ML:
|
||
Result := 'ANY_ML';
|
||
OP_ANYLETTER:
|
||
Result := 'ANYLETTER';
|
||
OP_NOTLETTER:
|
||
Result := 'NOTLETTER';
|
||
OP_ANYDIGIT:
|
||
Result := 'ANYDIGIT';
|
||
OP_NOTDIGIT:
|
||
Result := 'NOTDIGIT';
|
||
OP_ANYSPACE:
|
||
Result := 'ANYSPACE';
|
||
OP_NOTSPACE:
|
||
Result := 'NOTSPACE';
|
||
OP_ANYHORZSEP:
|
||
Result := 'ANYHORZSEP';
|
||
OP_NOTHORZSEP:
|
||
Result := 'NOTHORZSEP';
|
||
OP_ANYVERTSEP:
|
||
Result := 'ANYVERTSEP';
|
||
OP_NOTVERTSEP:
|
||
Result := 'NOTVERTSEP';
|
||
OP_ANYOF:
|
||
Result := 'ANYOF';
|
||
OP_ANYBUT:
|
||
Result := 'ANYBUT';
|
||
OP_ANYOF_CI:
|
||
Result := 'ANYOF_CI';
|
||
OP_ANYBUT_CI:
|
||
Result := 'ANYBUT_CI';
|
||
OP_BRANCH:
|
||
Result := 'BRANCH';
|
||
OP_GBRANCH:
|
||
Result := 'G_BRANCH';
|
||
OP_GBRANCH_EX:
|
||
Result := 'G_BRANCH_EX';
|
||
OP_GBRANCH_EX_CI:
|
||
Result := 'G_BRANCH_EX_CI';
|
||
OP_EXACTLY:
|
||
Result := 'EXACTLY';
|
||
OP_EXACTLY_CI:
|
||
Result := 'EXACTLY_CI';
|
||
OP_NOTHING:
|
||
Result := 'NOTHING';
|
||
OP_COMMENT:
|
||
Result := 'COMMENT';
|
||
OP_BACK:
|
||
Result := 'BACK';
|
||
OP_EEND:
|
||
Result := 'END';
|
||
OP_BSUBEXP:
|
||
Result := 'BSUBEXP';
|
||
OP_BSUBEXP_CI:
|
||
Result := 'BSUBEXP_CI';
|
||
OP_OPEN:
|
||
Result := 'OPEN';
|
||
OP_CLOSE:
|
||
Result := 'CLOSE';
|
||
OP_OPEN_ATOMIC:
|
||
Result := 'OPEN_ATOMIC';
|
||
OP_CLOSE_ATOMIC:
|
||
Result := 'CLOSE_ATOMIC';
|
||
OP_LOOKAHEAD:
|
||
Result := 'LOOKAHEAD';
|
||
OP_LOOKAHEAD_NEG:
|
||
Result := 'LOOKAHEAD_NEG';
|
||
OP_LOOKBEHIND:
|
||
Result := 'LOOKBEHIND';
|
||
OP_LOOKBEHIND_NEG:
|
||
Result := 'LOOKBEHIND_NEG';
|
||
OP_LOOKAHEAD_END:
|
||
Result := 'LOOKAHEAD_END';
|
||
OP_LOOKBEHIND_END:
|
||
Result := 'LOOKBEHIND_END';
|
||
OP_STAR:
|
||
Result := 'STAR';
|
||
OP_PLUS:
|
||
Result := 'PLUS';
|
||
OP_BRACES:
|
||
Result := 'BRACES';
|
||
{$IFDEF ComplexBraces}
|
||
OP_LOOPENTRY:
|
||
Result := 'LOOPENTRY';
|
||
OP_LOOP:
|
||
Result := 'LOOP';
|
||
OP_LOOP_NG:
|
||
Result := 'LOOP_NG';
|
||
OP_LOOP_POSS:
|
||
Result := 'LOOP_POSS';
|
||
{$ENDIF}
|
||
OP_STAR_NG:
|
||
Result := 'STAR_NG';
|
||
OP_PLUS_NG:
|
||
Result := 'PLUS_NG';
|
||
OP_BRACES_NG:
|
||
Result := 'BRACES_NG';
|
||
OP_STAR_POSS:
|
||
Result := 'STAR_POSS';
|
||
OP_PLUS_POSS:
|
||
Result := 'PLUS_POSS';
|
||
OP_BRACES_POSS:
|
||
Result := 'BRACES_POSS';
|
||
OP_ANYCATEGORY:
|
||
Result := 'ANYCATEGORY';
|
||
OP_NOTCATEGORY:
|
||
Result := 'NOTCATEGORY';
|
||
OP_RECUR:
|
||
Result := 'RECURSION';
|
||
OP_SUBCALL:
|
||
Result := 'SUBCALL';
|
||
OP_ANYLINEBREAK:
|
||
Result := 'ANYLINEBREAK';
|
||
OP_RESET_MATCHPOS:
|
||
Result := 'RESET_MATCHPOS';
|
||
else
|
||
Error(reeDumpCorruptedOpcode);
|
||
end;
|
||
end; { of function TRegExpr.DumpOp
|
||
-------------------------------------------------------------- }
|
||
|
||
function TRegExpr.IsCompiled: Boolean;
|
||
begin
|
||
Result := programm <> nil;
|
||
end;
|
||
|
||
function PrintableChar(AChar: REChar): RegExprString; {$IFDEF InlineFuncs}inline;{$ENDIF}
|
||
begin
|
||
if AChar < ' ' then
|
||
Result := '#' + IntToStr(Ord(AChar))
|
||
else
|
||
Result := AChar;
|
||
end;
|
||
|
||
function TRegExpr.DumpCheckerIndex(N: Byte): RegExprString;
|
||
begin
|
||
Result := '?';
|
||
if N = CheckerIndex_Word then Result := '\w' else
|
||
if N = CheckerIndex_NotWord then Result := '\W' else
|
||
if N = CheckerIndex_Digit then Result := '\d' else
|
||
if N = CheckerIndex_NotDigit then Result := '\D' else
|
||
if N = CheckerIndex_Space then Result := '\s' else
|
||
if N = CheckerIndex_NotSpace then Result := '\S' else
|
||
if N = CheckerIndex_HorzSep then Result := '\h' else
|
||
if N = CheckerIndex_NotHorzSep then Result := '\H' else
|
||
if N = CheckerIndex_VertSep then Result := '\v' else
|
||
if N = CheckerIndex_NotVertSep then Result := '\V' else
|
||
if N = CheckerIndex_LowerAZ then Result := 'az' else
|
||
if N = CheckerIndex_UpperAZ then Result := 'AZ' else
|
||
if N = CheckerIndex_AnyLineBreak then Result := '\R'
|
||
;
|
||
end;
|
||
|
||
function TRegExpr.DumpCategoryChars(ch, ch2: REChar; Positive: Boolean): RegExprString;
|
||
const
|
||
S: array[Boolean] of RegExprString = ('P', 'p');
|
||
begin
|
||
Result := '\' + S[Positive] + '{' + ch;
|
||
if ch2 <> #0 then
|
||
Result := Result + ch2;
|
||
Result := Result + '} ';
|
||
end;
|
||
|
||
function TRegExpr.Dump(Indent: Integer): RegExprString;
|
||
// dump a regexp in vaguely comprehensible form
|
||
var
|
||
s: PRegExprChar;
|
||
op: TREOp; // Arbitrary non-END op.
|
||
next, BranchEnd: PRegExprChar;
|
||
BranchEndStack: Array of PRegExprChar;
|
||
i, NLen, CurIndent: Integer;
|
||
Diff: PtrInt;
|
||
iByte: Byte;
|
||
ch, ch2: REChar;
|
||
begin
|
||
Result := '';
|
||
if not IsProgrammOk then
|
||
Exit;
|
||
|
||
CurIndent := 0;
|
||
op := OP_EXACTLY;
|
||
s := regCodeWork;
|
||
BranchEnd := nil;
|
||
SetLength(BranchEndStack{%H-}, 0);
|
||
while op <> OP_EEND do
|
||
begin // While that wasn't END last time...
|
||
op := s^;
|
||
next := regNext(s);
|
||
|
||
if ((op =OP_CLOSE) or (op = OP_CLOSE_ATOMIC) or (op = OP_LOOP) or (op = OP_LOOP_NG) or (op = OP_LOOP_POSS) or
|
||
(op = OP_LOOKAHEAD_END) or (op = OP_LOOKBEHIND_END)
|
||
) and
|
||
(CurIndent > 0)
|
||
then
|
||
dec(CurIndent, Indent);
|
||
if s = BranchEnd then begin
|
||
dec(CurIndent, Indent);
|
||
BranchEnd := nil;
|
||
if Length(BranchEndStack) > 0 then begin
|
||
BranchEnd := BranchEndStack[Length(BranchEndStack)-1];
|
||
SetLength(BranchEndStack, Length(BranchEndStack)-1);
|
||
end;
|
||
end;
|
||
|
||
Result := Result + Format('%3d:%s %s', [s - programm, StringOfChar(' ', CurIndent), DumpOp(s^)]);
|
||
|
||
if (op = OP_OPEN) or (op = OP_OPEN_ATOMIC) or (op = OP_LOOPENTRY) or
|
||
(op = OP_LOOKAHEAD) or (op = OP_LOOKAHEAD_NEG) or (op = OP_LOOKBEHIND) or (op = OP_LOOKBEHIND_NEG)
|
||
then
|
||
inc(CurIndent, Indent);
|
||
if (op = OP_BRANCH) or (op = OP_GBRANCH) or (op = OP_GBRANCH_EX) or (op = OP_GBRANCH_EX_CI) then begin
|
||
inc(CurIndent, Indent);
|
||
if BranchEnd <> nil then begin
|
||
SetLength(BranchEndStack, Length(BranchEndStack)+1);
|
||
BranchEndStack[Length(BranchEndStack)-1] := BranchEnd;
|
||
end;
|
||
BranchEnd := next;
|
||
end;
|
||
|
||
// Where, what.
|
||
if next = nil // Next ptr.
|
||
then
|
||
Result := Result + ' (0)'
|
||
else
|
||
begin
|
||
if next > s
|
||
// PWideChar subtraction workaround (see comments in Tail method for details)
|
||
then
|
||
Diff := next - s
|
||
else
|
||
Diff := -(s - next);
|
||
Result := Result + Format(' (%d) ', [(s - programm) + Diff]);
|
||
end;
|
||
Inc(s, REOpSz + RENextOffSz);
|
||
if (op = OP_ANYOF) or (op = OP_ANYOF_CI) or (op = OP_ANYBUT) or (op = OP_ANYBUT_CI) then
|
||
begin
|
||
repeat
|
||
case s^ of
|
||
OpKind_End:
|
||
begin
|
||
Inc(s);
|
||
Break;
|
||
end;
|
||
OpKind_Range:
|
||
begin
|
||
Result := Result + 'Rng(';
|
||
Inc(s);
|
||
Result := Result + PrintableChar(s^) + '-';
|
||
Inc(s);
|
||
Result := Result + PrintableChar(s^);
|
||
Result := Result + ') ';
|
||
Inc(s);
|
||
end;
|
||
OpKind_MetaClass:
|
||
begin
|
||
Inc(s);
|
||
Result := Result + DumpCheckerIndex(Byte(s^)) + ' ';
|
||
Inc(s);
|
||
end;
|
||
OpKind_Char:
|
||
begin
|
||
Inc(s);
|
||
NLen := PLongInt(s)^;
|
||
Inc(s, RENumberSz);
|
||
Result := Result + 'Ch(';
|
||
for i := 1 to NLen do
|
||
begin
|
||
Result := Result + PrintableChar(s^);
|
||
Inc(s);
|
||
end;
|
||
Result := Result + ') ';
|
||
end;
|
||
OpKind_CategoryYes:
|
||
begin
|
||
Inc(s);
|
||
ch := s^;
|
||
Inc(s);
|
||
ch2 := s^;
|
||
Result := Result + DumpCategoryChars(ch, ch2, True);
|
||
Inc(s);
|
||
end;
|
||
OpKind_CategoryNo:
|
||
begin
|
||
Inc(s);
|
||
ch := s^;
|
||
Inc(s);
|
||
ch2 := s^;
|
||
Result := Result + DumpCategoryChars(ch, ch2, False);
|
||
Inc(s);
|
||
end;
|
||
else
|
||
Error(reeDumpCorruptedOpcode);
|
||
end;
|
||
until false;
|
||
end;
|
||
if (op = OP_EXACTLY) or (op = OP_EXACTLY_CI) then
|
||
begin
|
||
// Literal string, where present.
|
||
NLen := PLongInt(s)^;
|
||
Inc(s, RENumberSz);
|
||
for i := 1 to NLen do
|
||
begin
|
||
Result := Result + PrintableChar(s^);
|
||
Inc(s);
|
||
end;
|
||
end;
|
||
if (op = OP_BSUBEXP) or (op = OP_BSUBEXP_CI) then
|
||
begin
|
||
Result := Result + ' \' + IntToStr(PReGroupIndex(s)^);
|
||
Inc(s, ReGroupIndexSz);
|
||
end;
|
||
if (op = OP_SUBCALL) then
|
||
begin
|
||
Result := Result + ' (?' + IntToStr(PReGroupIndex(s)^) + ') @' + IntToStr(GrpOpCodes[PReGroupIndex(s)^]-programm);
|
||
Inc(s, ReGroupIndexSz);
|
||
end;
|
||
if (op = OP_OPEN) or (op = OP_OPEN_ATOMIC) or (op = OP_CLOSE) or (op = OP_CLOSE_ATOMIC) then
|
||
begin
|
||
Result := Result + ' [' + IntToStr(PReGroupIndex(s)^) + ']';
|
||
Inc(s, ReGroupIndexSz);
|
||
end;
|
||
if (op = OP_BRACES) or (op = OP_BRACES_NG) or (op = OP_BRACES_POSS) then
|
||
begin
|
||
// show min/max argument of braces operator
|
||
Result := Result + Format('{%d,%d}', [PREBracesArg(AlignToInt(s))^,
|
||
PREBracesArg(AlignToInt(s + REBracesArgSz))^]);
|
||
Inc(s, REBracesArgSz * 2);
|
||
end;
|
||
{$IFDEF ComplexBraces}
|
||
if (op = OP_LOOP) or (op = OP_LOOP_NG) or (op = OP_LOOP_POSS) then
|
||
begin
|
||
Result := Result + Format(' -> (%d) {%d,%d}',
|
||
[(s - programm - (REOpSz + RENextOffSz)) +
|
||
PRENextOff(AlignToPtr(s + 2 * REBracesArgSz))^,
|
||
PREBracesArg(AlignToInt(s))^,
|
||
PREBracesArg(AlignToInt(s + REBracesArgSz))^]);
|
||
Inc(s, 2 * REBracesArgSz + RENextOffSz);
|
||
end;
|
||
{$ENDIF}
|
||
if (op = OP_ANYCATEGORY) or (op = OP_NOTCATEGORY) then
|
||
begin
|
||
ch := s^;
|
||
Inc(s);
|
||
ch2 := s^;
|
||
Inc(s);
|
||
if ch2<>#0 then
|
||
Result := Result + '{' + ch + ch2 + '}'
|
||
else
|
||
Result := Result + '{' + ch + '}';
|
||
end;
|
||
if (op = OP_LOOKBEHIND) or (op = OP_LOOKBEHIND_NEG) then
|
||
begin
|
||
if PReOpLookBehindOptions(s)^.IsGreedy = OPT_LOOKBEHIND_FIXED then
|
||
Result := Result + ' (fixed)'
|
||
else
|
||
if PReOpLookBehindOptions(s)^.IsGreedy = OPT_LOOKBEHIND_NON_GREEDY then
|
||
Result := Result + ' (not greedy)'
|
||
else
|
||
Result := Result + ' (greedy)';
|
||
Result := Result
|
||
+ ' Len: ' + IntToStr(PReOpLookBehindOptions(s)^.MatchLenMin)
|
||
+ '..' + IntToStr(PReOpLookBehindOptions(s)^.MatchLenMax);
|
||
Inc(s, ReOpLookBehindOptionsSz);
|
||
end
|
||
else
|
||
if (op = OP_BRANCH) or (op = OP_GBRANCH) then
|
||
begin
|
||
Inc(s, REBranchArgSz);
|
||
end
|
||
else
|
||
if (op = OP_GBRANCH_EX) or (op = OP_GBRANCH_EX_CI) then
|
||
begin
|
||
Result := Result + ' ' + s^;
|
||
if (op = OP_GBRANCH_EX_CI) then
|
||
Result := Result + (s+1)^;
|
||
Inc(s, REBranchArgSz);
|
||
end;
|
||
Result := Result + #$d#$a;
|
||
end; { of while }
|
||
|
||
// Header fields of interest.
|
||
case regAnchored of
|
||
raBOL: Result := Result + 'Anchored(BOL); ';
|
||
raEOL: Result := Result + 'Anchored(EOL); ';
|
||
raContinue: Result := Result + 'Anchored(\G); ';
|
||
raOnlyOnce: Result := Result + 'Anchored(start); ';
|
||
end;
|
||
|
||
if regMustString <> '' then
|
||
Result := Result + 'Must have: "' + regMustString + '"; ';
|
||
|
||
{$IFDEF UseFirstCharSet}
|
||
Result := Result + #$d#$a'First charset: ';
|
||
if FirstCharSet = [] then
|
||
Result := Result + '<empty set>'
|
||
else
|
||
if FirstCharSet = RegExprAllSet then
|
||
Result := Result + '<all chars>'
|
||
else
|
||
for iByte := 0 to 255 do
|
||
if iByte in FirstCharSet then
|
||
Result := Result + PrintableChar(REChar(iByte));
|
||
{$ENDIF}
|
||
Result := Result + #$d#$a;
|
||
end; { of function TRegExpr.Dump
|
||
-------------------------------------------------------------- }
|
||
{$ENDIF}
|
||
|
||
|
||
function TRegExpr.IsFixedLength(var op: TREOp; var ALen: Integer): Boolean;
|
||
var
|
||
s: PRegExprChar;
|
||
ADummyMaxLen: integer;
|
||
begin
|
||
Result := False;
|
||
if not IsCompiled then Exit;
|
||
s := regCodeWork;
|
||
Result := IsPartFixedLength(s, op, ALen, ADummyMaxLen, OP_EEND, nil, []);
|
||
end;
|
||
|
||
function TRegExpr.IsFixedLengthEx(var op: TREOp; var AMinLen, AMaxLen: integer
|
||
): boolean;
|
||
var
|
||
s: PRegExprChar;
|
||
begin
|
||
Result := False;
|
||
if not IsCompiled then Exit;
|
||
s := regCodeWork;
|
||
Result := IsPartFixedLength(s, op, AMinLen, AMaxLen, OP_EEND, nil, []);
|
||
end;
|
||
|
||
function TRegExpr.IsPartFixedLength(var prog: PRegExprChar; var op: TREOp;
|
||
var AMinLen, AMaxLen: integer; StopAt: TREOp; StopMaxProg: PRegExprChar;
|
||
Flags: TRegExprFindFixedLengthFlags): boolean;
|
||
|
||
function MultiplyLen(AVal, AFactor: Integer): Integer;
|
||
begin
|
||
if AFactor > High(AVal) div AVal then
|
||
Result := high(AVal)
|
||
else
|
||
Result := AVal * AFactor;
|
||
end;
|
||
|
||
procedure IncMaxLen(var AVal: Integer; AInc: Integer);
|
||
begin
|
||
if AInc > High(AVal) - AVal then
|
||
AVal := high(AVal)
|
||
else
|
||
AVal := AVal + AInc;
|
||
end;
|
||
|
||
function MaxStopOrNext(next: PRegExprChar): PRegExprChar;
|
||
begin
|
||
Result := next;
|
||
if (Result = nil) or ( (StopMaxProg <> nil) and (Result > StopMaxProg) ) then
|
||
Result := StopMaxProg;
|
||
end;
|
||
|
||
var
|
||
s, next: PRegExprChar;
|
||
N, N2, FndMaxLen, ASubLen, ABranchLen, ABranchMaxLen, ASubMaxLen: integer;
|
||
NotFixedLen, r, NextIsNil: Boolean;
|
||
FirstVarLenOp: TREOp;
|
||
begin
|
||
Result := False;
|
||
NotFixedLen := False;
|
||
AMinLen := 0;
|
||
AMaxLen := High(AMaxLen);
|
||
FndMaxLen := 0;
|
||
next := prog;
|
||
s := prog;
|
||
|
||
repeat
|
||
NextIsNil := next = nil;
|
||
next := regNext(s);
|
||
prog := s;
|
||
op := s^;
|
||
if not NotFixedLen then
|
||
FirstVarLenOp := op;
|
||
|
||
if (op = StopAt) or
|
||
((StopMaxProg <> nil) and (s >= StopMaxProg)) or
|
||
(NextIsNil and (flfReturnAtNextNil in Flags))
|
||
then begin
|
||
AMaxLen := FndMaxLen;
|
||
op := FirstVarLenOp;
|
||
if not NotFixedLen then
|
||
Result := True;
|
||
Exit;
|
||
end;
|
||
|
||
Inc(s, REOpSz + RENextOffSz);
|
||
|
||
case op of
|
||
OP_EEND, OP_BACK:
|
||
begin
|
||
AMaxLen := FndMaxLen;
|
||
op := FirstVarLenOp;
|
||
if not NotFixedLen then
|
||
Result := True;
|
||
Exit;
|
||
end;
|
||
|
||
OP_BRANCH, OP_GBRANCH, OP_GBRANCH_EX, OP_GBRANCH_EX_CI:
|
||
begin
|
||
s := s + REBranchArgSz;
|
||
if not IsPartFixedLength(s, op, ABranchLen, ABranchMaxLen, OP_EEND, MaxStopOrNext(next), Flags * [flfReturnAtNextNil, flfSkipLookAround]) then
|
||
begin
|
||
if not NotFixedLen then
|
||
FirstVarLenOp := op;
|
||
NotFixedLen := True;
|
||
end;
|
||
s := next;
|
||
repeat
|
||
next := regNext(s);
|
||
s := s + REBranchArgSz;
|
||
Inc(s, REOpSz + RENextOffSz);
|
||
if not IsPartFixedLength(s, op, ASubLen, ASubMaxLen, OP_EEND, MaxStopOrNext(next), Flags * [flfReturnAtNextNil, flfSkipLookAround]) then
|
||
begin
|
||
if not NotFixedLen then
|
||
FirstVarLenOp := op;
|
||
NotFixedLen := True;
|
||
end;
|
||
s := next;
|
||
if (ASubLen <> ABranchLen) then
|
||
NotFixedLen := True;
|
||
if ASubLen < ABranchLen then
|
||
ABranchLen := ASubLen;
|
||
if ASubMaxLen > ABranchMaxLen then
|
||
ABranchMaxLen := ASubMaxLen;
|
||
until (next^ <> OP_BRANCH) and (next^ <> OP_GBRANCH) and
|
||
(next^ <> OP_GBRANCH_EX) and (next^ <> OP_GBRANCH_EX_CI);
|
||
AMinLen := AMinLen + ABranchLen;
|
||
IncMaxLen(FndMaxLen, ABranchMaxLen);
|
||
end;
|
||
|
||
OP_OPEN:
|
||
begin
|
||
Inc(s, ReGroupIndexSz);
|
||
if not IsPartFixedLength(s, op, ASubLen, ASubMaxLen, OP_CLOSE, StopMaxProg, Flags * [flfReturnAtNextNil, flfSkipLookAround]) then
|
||
begin
|
||
if not NotFixedLen then
|
||
FirstVarLenOp := op;
|
||
NotFixedLen := True;
|
||
end;
|
||
assert(s^=OP_CLOSE);
|
||
AMinLen := AMinLen + ASubLen;
|
||
IncMaxLen(FndMaxLen, ASubMaxLen);
|
||
Inc(s, REOpSz + RENextOffSz + ReGroupIndexSz); // consume the OP_CLOSE
|
||
continue;
|
||
end;
|
||
|
||
OP_OPEN_ATOMIC:
|
||
begin
|
||
Inc(s, ReGroupIndexSz);
|
||
if not IsPartFixedLength(s, op, ASubLen, ASubMaxLen, OP_CLOSE_ATOMIC, StopMaxProg, Flags * [flfReturnAtNextNil, flfSkipLookAround]) then
|
||
begin
|
||
if not NotFixedLen then
|
||
FirstVarLenOp := op;
|
||
NotFixedLen := True;
|
||
end;
|
||
assert(s^=OP_CLOSE_ATOMIC);
|
||
AMinLen := AMinLen + ASubLen;
|
||
IncMaxLen(FndMaxLen, ASubMaxLen);
|
||
Inc(s, REOpSz + RENextOffSz + ReGroupIndexSz); // consume the OP_CLOSE_ATOMIC;
|
||
continue;
|
||
end;
|
||
|
||
OP_CLOSE, OP_CLOSE_ATOMIC:
|
||
begin
|
||
Inc(s, ReGroupIndexSz);
|
||
continue;
|
||
end;
|
||
|
||
OP_LOOKAHEAD, OP_LOOKAHEAD_NEG:
|
||
begin
|
||
r := IsPartFixedLength(s, op, ASubLen, ASubMaxLen, OP_LOOKAHEAD_END, MaxStopOrNext(next), [flfSkipLookAround] + Flags * [flfReturnAtNextNil]);
|
||
s := next;
|
||
Inc(s, REOpSz + RENextOffSz); // skip the OP_LOOKAHEAD_END
|
||
if not (flfSkipLookAround in Flags) then
|
||
begin
|
||
//if not r then
|
||
NotFixedLen := True;
|
||
end;
|
||
end;
|
||
|
||
OP_LOOKBEHIND, OP_LOOKBEHIND_NEG:
|
||
begin
|
||
Inc(s, ReOpLookBehindOptionsSz);
|
||
r := IsPartFixedLength(s, op, ASubLen, ASubMaxLen, OP_LOOKBEHIND_END, MaxStopOrNext(next), [flfSkipLookAround] + Flags * [flfReturnAtNextNil]);
|
||
s := next;
|
||
Inc(s, REOpSz + RENextOffSz); // skip the OP_LOOKBEHIND_END
|
||
if not (flfSkipLookAround in Flags) then
|
||
NotFixedLen := True
|
||
end;
|
||
|
||
OP_LOOKAHEAD_END, OP_LOOKBEHIND_END:
|
||
if flfSkipLookAround in Flags then
|
||
begin
|
||
continue;
|
||
end;
|
||
|
||
OP_NOTHING,
|
||
OP_COMMENT,
|
||
OP_BOUND,
|
||
OP_NOTBOUND,
|
||
OP_BOL,
|
||
OP_BOL_ML,
|
||
OP_EOL,
|
||
OP_EOL2,
|
||
OP_EOL_ML,
|
||
OP_CONTINUE_POS:
|
||
Continue;
|
||
|
||
OP_ANY,
|
||
OP_ANY_ML,
|
||
OP_ANYDIGIT,
|
||
OP_NOTDIGIT,
|
||
OP_ANYLETTER,
|
||
OP_NOTLETTER,
|
||
OP_ANYSPACE,
|
||
OP_NOTSPACE,
|
||
OP_ANYHORZSEP,
|
||
OP_NOTHORZSEP,
|
||
OP_ANYVERTSEP,
|
||
OP_NOTVERTSEP:
|
||
begin
|
||
Inc(AMinLen);
|
||
IncMaxLen(FndMaxLen, 1);
|
||
Continue;
|
||
end;
|
||
|
||
OP_ANYOF,
|
||
OP_ANYOF_CI,
|
||
OP_ANYBUT,
|
||
OP_ANYBUT_CI:
|
||
begin
|
||
Inc(AMinLen);
|
||
IncMaxLen(FndMaxLen, 1);
|
||
repeat
|
||
case s^ of
|
||
OpKind_End:
|
||
begin
|
||
Inc(s);
|
||
Break;
|
||
end;
|
||
OpKind_Range:
|
||
begin
|
||
Inc(s);
|
||
Inc(s);
|
||
Inc(s);
|
||
end;
|
||
OpKind_MetaClass:
|
||
begin
|
||
Inc(s);
|
||
Inc(s);
|
||
end;
|
||
OpKind_Char:
|
||
begin
|
||
Inc(s);
|
||
Inc(s, RENumberSz + PLongInt(s)^);
|
||
end;
|
||
OpKind_CategoryYes,
|
||
OpKind_CategoryNo:
|
||
begin
|
||
Inc(s);
|
||
Inc(s);
|
||
Inc(s);
|
||
end;
|
||
end;
|
||
until False;
|
||
end;
|
||
|
||
OP_EXACTLY,
|
||
OP_EXACTLY_CI:
|
||
begin
|
||
N := PLongInt(s)^;
|
||
Inc(AMinLen, N);
|
||
IncMaxLen(FndMaxLen, N);
|
||
Inc(s, RENumberSz + N);
|
||
Continue;
|
||
end;
|
||
|
||
OP_ANYCATEGORY,
|
||
OP_NOTCATEGORY:
|
||
begin
|
||
Inc(AMinLen);
|
||
IncMaxLen(FndMaxLen, 1);
|
||
Inc(s, 2);
|
||
Continue;
|
||
end;
|
||
|
||
OP_BRACES,
|
||
OP_BRACES_NG,
|
||
OP_BRACES_POSS:
|
||
begin
|
||
// allow only d{n,n}
|
||
N := PREBracesArg(AlignToInt(s))^;
|
||
N2 := PREBracesArg(AlignToInt(s + REBracesArgSz))^;
|
||
Inc(s, REBracesArgSz * 2);
|
||
r := IsPartFixedLength(s, op, ASubLen, ASubMaxLen, OP_EEND, MaxStopOrNext(next), [flfSkipLookAround, flfReturnAtNextNil]);
|
||
if not r then
|
||
begin
|
||
if not NotFixedLen then
|
||
FirstVarLenOp := op;
|
||
end;
|
||
|
||
Inc(AMinLen, MultiplyLen(ASubLen, N));
|
||
IncMaxLen(FndMaxLen, MultiplyLen(ASubMaxLen, N2));
|
||
if (not r) or (N <> N2) then
|
||
NotFixedLen := True;
|
||
s := next;
|
||
end;
|
||
|
||
OP_BSUBEXP, OP_BSUBEXP_CI, OP_SUBCALL:
|
||
begin
|
||
s := next;
|
||
NotFixedLen := True; // group may be in look-around. Could be anything
|
||
FndMaxLen := high(FndMaxLen);
|
||
end;
|
||
|
||
OP_PLUS, OP_PLUS_NG, OP_PLUS_POSS:
|
||
begin
|
||
s := next;
|
||
Inc(AMinLen, 1);
|
||
FndMaxLen := high(FndMaxLen);
|
||
NotFixedLen := True
|
||
end;
|
||
|
||
else // OP_STAR...
|
||
begin
|
||
s := next;
|
||
FndMaxLen := high(FndMaxLen);
|
||
NotFixedLen := True
|
||
end;
|
||
end;
|
||
until False;
|
||
end;
|
||
|
||
procedure TRegExpr.SetInputSubString(const AInputString: RegExprString;
|
||
AInputStartPos, AInputLen: Integer);
|
||
begin
|
||
ClearMatches;
|
||
|
||
if AInputStartPos < 1 then
|
||
AInputStartPos := 1
|
||
else
|
||
if AInputStartPos > Length(AInputString) then
|
||
AInputStartPos := Length(AInputString) + 1;
|
||
if AInputLen < 0 then
|
||
AInputLen := 0
|
||
else
|
||
if AInputLen > Length(AInputString) + 1 - AInputStartPos then
|
||
AInputLen := Length(AInputString) + 1 - AInputStartPos;
|
||
|
||
fInputString := AInputString;
|
||
//UniqueString(fInputString);
|
||
|
||
fInputStart := PRegExprChar(fInputString) + AInputStartPos - 1;
|
||
fInputEnd := fInputStart + AInputLen;
|
||
fInputContinue := fInputStart;
|
||
end;
|
||
|
||
{$IFDEF reRealExceptionAddr}
|
||
{$OPTIMIZATION ON}
|
||
// ReturnAddr works correctly only if compiler optimization is ON
|
||
// I placed this method at very end of unit because there are no
|
||
// way to restore compiler optimization flag ...
|
||
{$ENDIF}
|
||
|
||
procedure TRegExpr.Error(AErrorID: Integer);
|
||
{$IFDEF windows}
|
||
{$IFDEF reRealExceptionAddr}
|
||
function ReturnAddr: Pointer;
|
||
asm
|
||
mov eax,[ebp+4]
|
||
end;
|
||
{$ENDIF}
|
||
{$ENDIF}
|
||
var
|
||
e: ERegExpr;
|
||
Msg: string;
|
||
begin
|
||
fLastError := AErrorID; // dummy stub - useless because will raise exception
|
||
Msg := ErrorMsg(AErrorID);
|
||
// compilation error ?
|
||
if AErrorID < reeFirstRuntimeCode then
|
||
Msg := Msg + ' (pos ' + IntToStr(CompilerErrorPos) + ')';
|
||
e := ERegExpr.Create(Msg);
|
||
e.ErrorCode := AErrorID;
|
||
e.CompilerErrorPos := CompilerErrorPos;
|
||
raise e
|
||
{$IFDEF windows}
|
||
{$IFDEF reRealExceptionAddr}
|
||
at ReturnAddr
|
||
{$ENDIF}
|
||
{$ENDIF};
|
||
end; { of procedure TRegExpr.Error
|
||
-------------------------------------------------------------- }
|
||
|
||
{$IFDEF Compat} // APIs needed only for users of old FPC 3.0
|
||
function TRegExpr.ExecPos(AOffset: Integer; ATryOnce: Boolean): Boolean; overload;
|
||
begin
|
||
// Check that the start position is not negative
|
||
if AOffset < 1 then
|
||
begin
|
||
ClearMatches;
|
||
Error(reeOffsetMustBePositive);
|
||
Result := False;
|
||
Exit;
|
||
end;
|
||
if ATryOnce then
|
||
Result := ExecPrim(AOffset, False, False, AOffset + 1)
|
||
else
|
||
Result := ExecPrim(AOffset, False, False, 0);
|
||
end;
|
||
|
||
function TRegExpr.OldInvertCase(const Ch: REChar): REChar;
|
||
begin
|
||
Result := _UpperCase(Ch);
|
||
if Result = Ch then
|
||
Result := _LowerCase(Ch);
|
||
end;
|
||
|
||
class function TRegExpr.InvertCaseFunction(const Ch: REChar): REChar;
|
||
begin
|
||
Result := _UpperCase(Ch);
|
||
if Result = Ch then
|
||
Result := _LowerCase(Ch);
|
||
end;
|
||
|
||
function TRegExpr.GetLinePairedSeparator: RegExprString;
|
||
begin
|
||
// not supported anymore
|
||
Result := '';
|
||
end;
|
||
|
||
procedure TRegExpr.SetLinePairedSeparator(const AValue: RegExprString);
|
||
begin
|
||
// not supported anymore
|
||
end;
|
||
|
||
procedure TRegExpr.SetUseOsLineEndOnReplace(AValue: Boolean);
|
||
begin
|
||
if fUseOsLineEndOnReplace = AValue then
|
||
Exit;
|
||
fUseOsLineEndOnReplace := AValue;
|
||
if fUseOsLineEndOnReplace then
|
||
fReplaceLineEnd := sLineBreak
|
||
else
|
||
fReplaceLineEnd := #10;
|
||
end;
|
||
{$ENDIF}
|
||
|
||
end.
|