LazEdit: update TRegExpr

This commit is contained in:
Martin 2023-11-20 23:54:24 +01:00
parent 7fd8824fbc
commit 327c5770ab

View File

@ -367,9 +367,10 @@ type
regMustLen: integer; // length of regMust string regMustLen: integer; // length of regMust string
regMustString: RegExprString; // string which must occur in match (got from regMust/regMustLen) regMustString: RegExprString; // string which must occur in match (got from regMust/regMustLen)
LookAroundInfoList: PRegExprLookAroundInfo; LookAroundInfoList: PRegExprLookAroundInfo;
regNestedCalls: integer; // some attempt to prevent 'catastrophic backtracking' but not used //regNestedCalls: integer; // some attempt to prevent 'catastrophic backtracking' but not used
CurrentSubCalled: integer; CurrentSubCalled: integer;
FMinMatchLen: integer;
{$IFDEF UseFirstCharSet} {$IFDEF UseFirstCharSet}
FirstCharSet: TRegExprCharset; FirstCharSet: TRegExprCharset;
FirstCharArray: array[byte] of boolean; FirstCharArray: array[byte] of boolean;
@ -1722,6 +1723,8 @@ const
OP_GBRANCH_EX = TREOp(68); OP_GBRANCH_EX = TREOp(68);
OP_GBRANCH_EX_CI = TREOp(69); OP_GBRANCH_EX_CI = TREOp(69);
OP_RESET_MATCHPOS = TReOp(70);
OP_NONE = high(TREOp); OP_NONE = high(TREOp);
// We work with p-code through pointers, compatible with PRegExprChar. // We work with p-code through pointers, compatible with PRegExprChar.
@ -3156,7 +3159,8 @@ function TRegExpr.CompileRegExpr(ARegExp: PRegExprChar): boolean;
var var
scan, scanTemp, longest, longestTemp: PRegExprChar; scan, scanTemp, longest, longestTemp: PRegExprChar;
Len, LenTemp: integer; Len, LenTemp: integer;
FlagTemp: integer; FlagTemp, MaxMatchLen: integer;
op: TREOp;
begin begin
Result := False; Result := False;
FlagTemp := 0; FlagTemp := 0;
@ -3219,6 +3223,7 @@ begin
Exit; Exit;
// Dig out information for optimizations. // Dig out information for optimizations.
IsFixedLengthEx(op, FMinMatchLen, MaxMatchLen);
{$IFDEF UseFirstCharSet} {$IFDEF UseFirstCharSet}
FirstCharSet := []; FirstCharSet := [];
FillFirstCharSet(regCodeWork); FillFirstCharSet(regCodeWork);
@ -4866,6 +4871,11 @@ begin
ret := EmitGroupRef(GrpIndex, fCompModifiers.I); ret := EmitGroupRef(GrpIndex, fCompModifiers.I);
FlagParse := FlagParse or FLAG_HASWIDTH or FLAG_SIMPLE; FlagParse := FlagParse or FLAG_HASWIDTH or FLAG_SIMPLE;
end; end;
'K':
begin
ret := EmitNode(OP_RESET_MATCHPOS);
FlagParse := FlagParse or FLAG_NOT_QUANTIFIABLE;
end;
{$IFDEF FastUnicodeData} {$IFDEF FastUnicodeData}
'p': 'p':
begin begin
@ -5428,9 +5438,6 @@ end;
type type
TRegExprMatchPrimLocals = record TRegExprMatchPrimLocals = record
case TREOp of case TREOp of
OP_CLOSE_ATOMIC: (
IsAtomic: Boolean;
);
{$IFDEF ComplexBraces} {$IFDEF ComplexBraces}
OP_LOOPENTRY: ( OP_LOOPENTRY: (
LoopInfo: TOpLoopInfo; LoopInfo: TOpLoopInfo;
@ -5526,6 +5533,15 @@ begin
Exit; Exit;
end; end;
OP_RESET_MATCHPOS:
begin
save := GrpBounds[0].GrpStart[0];
GrpBounds[0].GrpStart[0] := regInput;
Result := MatchPrim(next);
if not Result then
GrpBounds[0].GrpStart[0] := save;
exit;
end;
OP_EOL: OP_EOL:
begin begin
// \z matches at the very end // \z matches at the very end
@ -5842,23 +5858,24 @@ begin
begin begin
no := PReGroupIndex((scan + REOpSz + RENextOffSz))^; no := PReGroupIndex((scan + REOpSz + RENextOffSz))^;
save := GrpBounds[regRecursion].GrpStart[no]; save := GrpBounds[regRecursion].GrpStart[no];
opnd := GrpBounds[regRecursion].GrpEnd[no]; // save2
GrpBounds[regRecursion].GrpStart[no] := regInput; GrpBounds[regRecursion].GrpStart[no] := regInput;
Result := MatchPrim(next); Result := MatchPrim(next);
if GrpBacktrackingAsAtom[no] then if GrpBacktrackingAsAtom[no] then
IsBacktrackingGroupAsAtom := False; IsBacktrackingGroupAsAtom := False;
GrpBacktrackingAsAtom[no] := False; GrpBacktrackingAsAtom[no] := False;
if not Result then if not Result then begin
GrpBounds[regRecursion].GrpStart[no] := save; GrpBounds[regRecursion].GrpStart[no] := save;
GrpBounds[regRecursion].GrpEnd[no] := opnd;
end;
Exit; Exit;
end; end;
OP_CLOSE, OP_CLOSE_ATOMIC: OP_CLOSE:
begin begin
Local.IsAtomic := scan^ = OP_CLOSE_ATOMIC;
no := PReGroupIndex((scan + REOpSz + RENextOffSz))^; no := PReGroupIndex((scan + REOpSz + RENextOffSz))^;
// handle atomic group, mark it as "done" // handle atomic group, mark it as "done"
// (we are here because some OP_BRANCH is matched) // (we are here because some OP_BRANCH is matched)
save := GrpBounds[regRecursion].GrpEnd[no];
GrpBounds[regRecursion].GrpEnd[no] := regInput; GrpBounds[regRecursion].GrpEnd[no] := regInput;
// if we are in OP_SUBCALL* call, it called OP_OPEN*, so we must return // if we are in OP_SUBCALL* call, it called OP_OPEN*, so we must return
@ -5868,11 +5885,18 @@ begin
Result := True; Result := True;
Exit; Exit;
end; end;
end;
OP_CLOSE_ATOMIC:
begin
no := PReGroupIndex((scan + REOpSz + RENextOffSz))^;
// handle atomic group, mark it as "done"
// (we are here because some OP_BRANCH is matched)
GrpBounds[regRecursion].GrpEnd[no] := regInput;
Result := MatchPrim(next); Result := MatchPrim(next);
if not Result then begin if not Result then begin
GrpBounds[regRecursion].GrpEnd[no] := save; if not IsBacktrackingGroupAsAtom then begin
if Local.IsAtomic and not IsBacktrackingGroupAsAtom then begin
GrpBacktrackingAsAtom[no] := True; GrpBacktrackingAsAtom[no] := True;
IsBacktrackingGroupAsAtom := True; IsBacktrackingGroupAsAtom := True;
end; end;
@ -5911,8 +5935,9 @@ begin
if (next^ = OP_LOOKAROUND_OPTIONAL) then if (next^ = OP_LOOKAROUND_OPTIONAL) then
next := PRegExprChar(AlignToPtr(next + 1)) + RENextOffSz; next := PRegExprChar(AlignToPtr(next + 1)) + RENextOffSz;
regInput := Local.LookAroundInfo.InputPos; regInput := Local.LookAroundInfo.InputPos;
Result := MatchPrim(next); Result := False;
Exit; scan := next;
continue;
end; end;
end end
else else
@ -5922,8 +5947,9 @@ begin
if (next^ = OP_LOOKAROUND_OPTIONAL) then if (next^ = OP_LOOKAROUND_OPTIONAL) then
next := PRegExprChar(AlignToPtr(next + 1)) + RENextOffSz; next := PRegExprChar(AlignToPtr(next + 1)) + RENextOffSz;
regInput := Local.LookAroundInfo.InputPos; regInput := Local.LookAroundInfo.InputPos;
Result := MatchPrim(next); Result := False;
Exit; scan := next;
continue;
end; end;
end; end;
@ -6000,8 +6026,9 @@ begin
if (next^ = OP_LOOKAROUND_OPTIONAL) then if (next^ = OP_LOOKAROUND_OPTIONAL) then
next := PRegExprChar(AlignToPtr(next + 1)) + RENextOffSz; next := PRegExprChar(AlignToPtr(next + 1)) + RENextOffSz;
regInput := Local.LookAroundInfo.InputPos; regInput := Local.LookAroundInfo.InputPos;
Result := MatchPrim(next); Result := False;
Exit; scan := next;
continue;
end; end;
end end
else else
@ -6011,8 +6038,9 @@ begin
if (next^ = OP_LOOKAROUND_OPTIONAL) then if (next^ = OP_LOOKAROUND_OPTIONAL) then
next := PRegExprChar(AlignToPtr(next + 1)) + RENextOffSz; next := PRegExprChar(AlignToPtr(next + 1)) + RENextOffSz;
regInput := Local.LookAroundInfo.InputPos; regInput := Local.LookAroundInfo.InputPos;
Result := MatchPrim(next); Result := False;
Exit; scan := next;
continue;
end; end;
end; end;
@ -6351,8 +6379,10 @@ begin
end; end;
no := FindRepeated(opnd, BracesMax); no := FindRepeated(opnd, BracesMax);
if no >= BracesMin then if no >= BracesMin then
if (nextch = #0) or (regInput^ = nextch) then if (nextch = #0) or (regInput^ = nextch) then begin
Result := MatchPrim(next); scan := next;
continue;
end;
Exit; Exit;
end; end;
@ -6493,21 +6523,16 @@ end;
// MatchAtOnePos: runs the compiled program (regCodeWork) once with the input
// cursor regInput placed at APos and returns whether MatchPrim reported a match.
// NOTE(review): this is a side-by-side diff rendering - each line fuses the old
// (left) and new (right) column; lines holding a single statement exist on one
// side only.
function TRegExpr.MatchAtOnePos(APos: PRegExprChar): boolean; function TRegExpr.MatchAtOnePos(APos: PRegExprChar): boolean;
begin begin
regInput := APos; regInput := APos;
regNestedCalls := 0; //regNestedCalls := 0;
regRecursion := 0;
fInputCurrentEnd := fInputEnd; fInputCurrentEnd := fInputEnd;
// New side: the start of group 0 is seeded with APos *before* matching, because
// the OP_RESET_MATCHPOS handler in MatchPrim overwrites GrpBounds[0].GrpStart[0]
// with the current regInput while the match is in progress.
Result := False; GrpBounds[0].GrpStart[0] := APos;
// Old side only: stack-limit setup. The same lines show up as an addition in
// the ExecPrimProtected hunk of this diff.
{$IFDEF RegExpWithStackOverflowCheck_DecStack_Frame}
StackLimit := StackBottom;
if StackLimit <> nil then
StackLimit := StackLimit + 36000; // Add for any calls within the current MatchPrim // FPC has "STACK_MARGIN = 16384;", but we need to call Error, ..., raise
{$ENDIF}
Result := MatchPrim(regCodeWork); Result := MatchPrim(regCodeWork);
if Result then if Result then
// New side: a match reported by MatchPrim is rejected when the final input
// position lies before the recorded start of group 0.
begin Result := regInput >= GrpBounds[0].GrpStart[0];
// Old side: the bounds of group 0 were written only after a successful match.
GrpBounds[0].GrpStart[0] := APos; if Result then
GrpBounds[0].GrpEnd[0] := regInput; GrpBounds[0].GrpEnd[0] := regInput
end; else
// New side: on failure the pre-seeded start of group 0 is reset to nil.
GrpBounds[0].GrpStart[0] := nil;
end; end;
procedure TRegExpr.ClearMatches; procedure TRegExpr.ClearMatches;
@ -6530,6 +6555,7 @@ begin
{$ENDIF} {$ENDIF}
LookAroundInfoList := nil; LookAroundInfoList := nil;
CurrentSubCalled := -1; CurrentSubCalled := -1;
regRecursion := 0;
end; end;
procedure TRegExpr.InitInternalGroupData; procedure TRegExpr.InitInternalGroupData;
@ -6583,7 +6609,7 @@ end;
function TRegExpr.ExecPrimProtected(AOffset: Integer; ASlowChecks, function TRegExpr.ExecPrimProtected(AOffset: Integer; ASlowChecks,
ABackward: Boolean; ATryMatchOnlyStartingBefore: Integer): Boolean; ABackward: Boolean; ATryMatchOnlyStartingBefore: Integer): Boolean;
var var
Ptr: PRegExprChar; Ptr, SearchEnd: PRegExprChar;
begin begin
Result := False; Result := False;
@ -6631,6 +6657,12 @@ begin
if StrLPos(fInputStart, PRegExprChar(regMustString), fInputEnd - fInputStart, length(regMustString)) = nil then if StrLPos(fInputStart, PRegExprChar(regMustString), fInputEnd - fInputStart, length(regMustString)) = nil then
exit; exit;
{$IFDEF RegExpWithStackOverflowCheck_DecStack_Frame}
StackLimit := StackBottom;
if StackLimit <> nil then
StackLimit := StackLimit + 36000; // Add for any calls within the current MatchPrim // FPC has "STACK_MARGIN = 16384;", but we need to call Error, ..., raise
{$ENDIF}
FMatchesCleared := False; FMatchesCleared := False;
// ATryOnce or anchored match (it needs to be tried only once). // ATryOnce or anchored match (it needs to be tried only once).
if (ATryMatchOnlyStartingBefore = AOffset + 1) or (regAnchored in [raBOL, raOnlyOnce, raContinue]) then if (ATryMatchOnlyStartingBefore = AOffset + 1) or (regAnchored in [raBOL, raOnlyOnce, raContinue]) then
@ -6651,40 +6683,53 @@ begin
Exit; Exit;
end; end;
// Messy cases: unanchored match. // Messy cases: unanchored match.
if ABackward then if ABackward then begin
Inc(Ptr, 2) Inc(Ptr, 2);
else repeat
Dec(Ptr);
repeat
if ABackward then
begin
Dec(Ptr); Dec(Ptr);
if Ptr < fInputStart then if Ptr < fInputStart then
Exit; Exit;
end
else {$IFDEF UseFirstCharSet}
begin {$IFDEF UnicodeRE}
if Ord(Ptr^) <= $FF then
{$ENDIF}
if not FirstCharArray[byte(Ptr^)] then
Continue;
{$ENDIF}
Result := MatchAtOnePos(Ptr);
// Exit on a match or after testing the end-of-string
if Result then
Exit;
until False;
end
else begin
Dec(Ptr);
SearchEnd := fInputEnd - FMinMatchLen;
if (ATryMatchOnlyStartingBefore > 0) and (fInputStart + ATryMatchOnlyStartingBefore < SearchEnd) then
SearchEnd := fInputStart + ATryMatchOnlyStartingBefore - 2;
repeat
Inc(Ptr); Inc(Ptr);
if Ptr > fInputEnd then if Ptr > SearchEnd then
Exit; Exit;
if (ATryMatchOnlyStartingBefore > 0) and (Ptr - fInputStart >= ATryMatchOnlyStartingBefore - 1) then
{$IFDEF UseFirstCharSet}
{$IFDEF UnicodeRE}
if Ord(Ptr^) <= $FF then
{$ENDIF}
if not FirstCharArray[byte(Ptr^)] then
Continue;
{$ENDIF}
Result := MatchAtOnePos(Ptr);
// Exit on a match or after testing the end-of-string
if Result then
Exit; Exit;
end; until False;
end;
{$IFDEF UseFirstCharSet}
{$IFDEF UnicodeRE}
if Ord(Ptr^) <= $FF then
{$ENDIF}
if not FirstCharArray[byte(Ptr^)] then
Continue;
{$ENDIF}
Result := MatchAtOnePos(Ptr);
// Exit on a match or after testing the end-of-string
if Result then
Exit;
until False;
end; { of function TRegExpr.ExecPrimProtected end; { of function TRegExpr.ExecPrimProtected
-------------------------------------------------------------- } -------------------------------------------------------------- }
@ -7094,7 +7139,8 @@ begin
OP_BOL, OP_BOL,
OP_BOL_ML, OP_BOL_ML,
OP_CONTINUE_POS: OP_CONTINUE_POS,
OP_RESET_MATCHPOS:
; // Exit; ; // Exit;
OP_EOL, OP_EOL,
@ -7669,6 +7715,8 @@ begin
Result := 'SUBCALL'; Result := 'SUBCALL';
OP_ANYLINEBREAK: OP_ANYLINEBREAK:
Result := 'ANYLINEBREAK'; Result := 'ANYLINEBREAK';
OP_RESET_MATCHPOS:
Result := 'RESET_MATCHPOS';
else else
Error(reeDumpCorruptedOpcode); Error(reeDumpCorruptedOpcode);
end; end;
@ -8082,7 +8130,7 @@ begin
if (ABranchMaxLen = high(ABranchMaxLen)) and not(flfForceToStopAt in Flags) then if (ABranchMaxLen = high(ABranchMaxLen)) and not(flfForceToStopAt in Flags) then
Exit; Exit;
end; end;
assert(s^=OP_CLOSE); assert(s^=OP_CLOSE_ATOMIC);
AMinLen := AMinLen + ASubLen; AMinLen := AMinLen + ASubLen;
IncMaxLen(FndMaxLen, ASubMaxLen); IncMaxLen(FndMaxLen, ASubMaxLen);
Inc(s, REOpSz + RENextOffSz + ReGroupIndexSz); // consume the OP_CLOSE_ATOMIC; Inc(s, REOpSz + RENextOffSz + ReGroupIndexSz); // consume the OP_CLOSE_ATOMIC;