LazUtils: Masks.pas: implement handling of UTF8 codepoints for mask matching. Issue #0024373.

git-svn-id: trunk@43637 -
This commit is contained in:
bart 2014-01-02 18:48:37 +00:00
parent cc8cd63dec
commit 3fb2dd4469

View File

@ -20,21 +20,23 @@ interface
uses uses
// For Smart Linking: Do not use the LCL! // For Smart Linking: Do not use the LCL!
Classes, SysUtils, Contnrs, StrUtils, LazUtilsStrConsts; Classes, SysUtils, Contnrs, LazUtilsStrConsts, LazUtf8;
type type
TMaskCharType = (mcChar, mcCharSet, mcAnyChar, mcAnyText); TMaskCharType = (mcChar, mcCharSet, mcAnyChar, mcAnyText);
TCharSet = set of Char; TCharSet = set of Char;
PCharSet = ^TCharSet; PCharSet = ^TCharSet;
TUtf8Char = String[7];
TMaskChar = record TMaskChar = record
case CharType: TMaskCharType of case CharType: TMaskCharType of
mcChar: (CharValue: Char); mcChar: (CharValue: TUtf8Char);
mcCharSet: (Negative: Boolean; SetValue: PCharSet); mcCharSet: (Negative: Boolean; SetValue: PCharSet);
mcAnyChar, mcAnyText: (); mcAnyChar, mcAnyText: ();
end; end;
TMaskString = record TMaskString = record
MinLength: Integer; MinLength: Integer;
MaxLength: Integer; MaxLength: Integer;
@ -53,18 +55,18 @@ type
public public
constructor Create(const AValue: String; const CaseSensitive: Boolean = False); constructor Create(const AValue: String; const CaseSensitive: Boolean = False);
destructor Destroy; override; destructor Destroy; override;
function Matches(const AFileName: String): Boolean; function Matches(const AFileName: String): Boolean;
function MatchesWindowsMask(const AFileName: String): Boolean; function MatchesWindowsMask(const AFileName: String): Boolean;
end; end;
{ TParseStringList } { TParseStringList }
TParseStringList = class(TStringList) TParseStringList = class(TStringList)
public public
constructor Create(const AText, ASeparators: String); constructor Create(const AText, ASeparators: String);
end; end;
{ TMaskList } { TMaskList }
TMaskList = class TMaskList = class
@ -75,10 +77,10 @@ type
public public
constructor Create(const AValue: String; ASeparator: Char = ';'; const CaseSensitive: Boolean = False); constructor Create(const AValue: String; ASeparator: Char = ';'; const CaseSensitive: Boolean = False);
destructor Destroy; override; destructor Destroy; override;
function Matches(const AFileName: String): Boolean; function Matches(const AFileName: String): Boolean;
function MatchesWindowsMask(const AFileName: String): Boolean; function MatchesWindowsMask(const AFileName: String): Boolean;
property Count: Integer read GetCount; property Count: Integer read GetCount;
property Items[Index: Integer]: TMask read GetItem; property Items[Index: Integer]: TMask read GetItem;
end; end;
@ -90,6 +92,26 @@ function MatchesWindowsMaskList(const FileName, Mask: String; Separator: Char =
implementation implementation
//Utf8 helper functions
function GetCodePoint(const S: String; const Index: PtrInt): TUTF8Char;
//equivalent for Result := S[Index], but for Utf8 encoded strings
var
p: PChar;
PLen: PtrInt;
Res: AnsiString; //intermediate needed for PChar -> String -> ShortString assignement
begin
Result := '';
p := UTF8CharStart(PChar(S), Length(S), Index - 1); //zero-based call
//determine the length in bytes of this UTF-8 character
PLen := UTF8CharacterLength(p);
Res := p;
//Set correct length for Result (otherwise it returns all chars up to the end of the original string)
SetLength(Res,PLen);
Result := Res;
end;
function MatchesMask(const FileName, Mask: String; const CaseSensitive: Boolean): Boolean; function MatchesMask(const FileName, Mask: String; const CaseSensitive: Boolean): Boolean;
var var
AMask: TMask; AMask: TMask;
@ -184,6 +206,7 @@ var
CharSet: TCharSet; CharSet: TCharSet;
Valid: Boolean; Valid: Boolean;
C, Last: Char; C, Last: Char;
CP: TUtf8Char;
begin begin
SkipAnyText := False; SkipAnyText := False;
@ -191,7 +214,7 @@ var
FMask.Chars[High(FMask.Chars)].CharType := mcCharSet; FMask.Chars[High(FMask.Chars)].CharType := mcCharSet;
Inc(I); Inc(I);
if (I <= Length(AValue)) and (AValue[I] = '!') then if (I <= Utf8Length(AValue)) and (GetCodePoint(AValue,I) = '!') then
begin begin
FMask.Chars[High(FMask.Chars)].Negative := True; FMask.Chars[High(FMask.Chars)].Negative := True;
Inc(I); Inc(I);
@ -201,24 +224,30 @@ var
Last := '-'; Last := '-';
CharSet := []; CharSet := [];
Valid := False; Valid := False;
while I <= Length(AValue) do while I <= Utf8Length(AValue) do
begin begin
case AValue[I] of CP := GetCodePoint(AValue,I);
//CharSets can only contain 1-byte codepoints
if (Length(CP) <> 1) then CharSetError;
case CP of
'-': '-':
begin begin
if Last = '-' then CharSetError; if Last = '-' then CharSetError;
Inc(I); Inc(I);
if (I > Utf8Length(AValue)) then CharSetError;
if (I > Length(AValue)) then CharSetError; CP := GetCodePoint(AValue, I);
if (Length(CP) <> 1) then CharSetError;
if fCaseSensitive then if fCaseSensitive then
begin begin
//DebugLn('Set: ' + Last + '-' + (AValue[I])); //writeln('MaskUtf8: Set: ' + Last + '-' + (CP[1]));
for C := Last to (AValue[I]) do Include(CharSet, C); for C := Last to CP[1] do
Include(CharSet, C)
end end
else else
begin begin
//DebugLn('Set: ' + Last + '-' + UpCase(AValue[I])); //writeln('MaskUtf8: Set: ' + Last + '-' + UpCase(CP[1]));
for C := Last to UpCase(AValue[I]) do Include(CharSet, C); for C := Last to UpCase(CP[1]) do
Include(CharSet, C)
end; end;
Inc(I); Inc(I);
end; end;
@ -230,15 +259,14 @@ var
else else
begin begin
if fCaseSensitive then if fCaseSensitive then
Last := AValue[I] Last := CP[1]
else else
Last := UpCase(AValue[I]); Last := UpCase(CP[1]);
Include(CharSet, Last); Include(CharSet, Last);
Inc(I); Inc(I);
end; end;
end; end;
end; end;
if (not Valid) or (CharSet = []) then CharSetError; if (not Valid) or (CharSet = []) then CharSetError;
New(FMask.Chars[High(FMask.Chars)].SetValue); New(FMask.Chars[High(FMask.Chars)].SetValue);
@ -259,9 +287,9 @@ var
begin begin
CharType := mcChar; CharType := mcChar;
if fCaseSensitive then if fCaseSensitive then
CharValue := AValue[I] CharValue := GetCodePoint(AValue,I)
else else
CharValue := UpCase(AValue[I]); CharValue := Utf8UpperCase(GetCodePoint(AValue,I));
end; end;
Inc(FMask.MinLength); Inc(FMask.MinLength);
@ -278,9 +306,9 @@ begin
SkipAnyText := False; SkipAnyText := False;
I := 1; I := 1;
while I <= Length(AValue) do while I <= Utf8Length(AValue) do
begin begin
case AValue[I] of case GetCodePoint(AValue,I) of
'*': AddAnyText; '*': AddAnyText;
'?': AddAnyChar; '?': AddAnyChar;
'[': AddCharSet; '[': AddCharSet;
@ -316,13 +344,14 @@ function TMask.Matches(const AFileName: String): Boolean;
var var
L: Integer; L: Integer;
S: String; S: String;
function MatchToEnd(MaskIndex, CharIndex: Integer): Boolean; function MatchToEnd(MaskIndex, CharIndex: Integer): Boolean;
var var
I, J: Integer; I, J: Integer;
CP: TUtf8Char;
begin begin
Result := False; Result := False;
for I := MaskIndex to High(FMask.Chars) do for I := MaskIndex to High(FMask.Chars) do
begin begin
case FMask.Chars[I].CharType of case FMask.Chars[I].CharType of
@ -330,14 +359,17 @@ var
begin begin
if CharIndex > L then Exit; if CharIndex > L then Exit;
//DebugLn('Match ' + S[CharIndex] + '<?>' + FMask.Chars[I].CharValue); //DebugLn('Match ' + S[CharIndex] + '<?>' + FMask.Chars[I].CharValue);
if S[CharIndex] <> FMask.Chars[I].CharValue then Exit; if GetCodePoint(S,CharIndex) <> FMask.Chars[I].CharValue then Exit;
Inc(CharIndex); Inc(CharIndex);
end; end;
mcCharSet: mcCharSet:
begin begin
if CharIndex > L then Exit; if CharIndex > L then Exit;
CP := GetCodePoint(S, CharIndex);
//There are only 1-byte codepoints in charsets
if (Length(CP) <> 1) then Exit;
if FMask.Chars[I].Negative xor if FMask.Chars[I].Negative xor
(S[CharIndex] in FMask.Chars[I].SetValue^) then Inc(CharIndex) (Cp[1] in FMask.Chars[I].SetValue^) then Inc(CharIndex)
else Exit; else Exit;
end; end;
mcAnyChar: mcAnyChar:
@ -352,7 +384,7 @@ var
Result := True; Result := True;
Exit; Exit;
end; end;
for J := CharIndex to L do for J := CharIndex to L do
if MatchToEnd(I + 1, J) then if MatchToEnd(I + 1, J) then
begin begin
@ -362,24 +394,24 @@ var
end; end;
end; end;
end; end;
Result := CharIndex > L; Result := CharIndex > L;
end; end;
begin begin
Result := False; Result := False;
L := Length(AFileName); L := Utf8Length(AFileName);
if L = 0 then if L = 0 then
begin begin
if FMask.MinLength = 0 then Result := True; if FMask.MinLength = 0 then Result := True;
Exit; Exit;
end; end;
if (L < FMask.MinLength) or (L > FMask.MaxLength) then Exit; if (L < FMask.MinLength) or (L > FMask.MaxLength) then Exit;
if fCaseSensitive then if fCaseSensitive then
S := AFileName S := AFileName
else else
S := UpperCase(AFileName); S := Utf8UpperCase(AFileName);
Result := MatchToEnd(0, 1); Result := MatchToEnd(0, 1);
end; end;
@ -394,10 +426,10 @@ begin
// foo. matches only foo but not foo.txt // foo. matches only foo but not foo.txt
// foo.* -> match either foo or foo.* // foo.* -> match either foo or foo.*
if (Length(fInitialMask) > 2) and (RightStr(fInitialMask,3) = '*.*') then if (Utf8Length(fInitialMask) > 2) and (Utf8RightStr(fInitialMask,3) = '*.*') then
// foo*.* // foo*.*
begin begin
NewMaskValue := Copy(fInitialMask,1,Length(fInitialMask)-2); NewMaskValue := Utf8Copy(fInitialMask,1,Utf8Length(fInitialMask)-2);
ClearMaskString; ClearMaskString;
InitMaskString(NewMaskValue, fCaseSensitive); InitMaskString(NewMaskValue, fCaseSensitive);
Result := Matches(AFileName); Result := Matches(AFileName);
@ -406,7 +438,7 @@ begin
InitMaskString(fInitialMask, fCaseSensitive); InitMaskString(fInitialMask, fCaseSensitive);
end end
//else if (Length(fInitialMask) > 1) and (RightStr(fInitialMask,2) = '*.') then //else if (Length(fInitialMask) > 1) and (RightStr(fInitialMask,2) = '*.') then
else if (Length(fInitialMask) > 1) and (fInitialMask[Length(fInitialMask)] = '.') then else if (Utf8Length(fInitialMask) > 1) and (GetCodePoint(fInitialMask,Utf8Length(fInitialMask)) = '.') then
//foo*. or *. or foo. //foo*. or *. or foo.
begin begin
@ -415,7 +447,7 @@ begin
Ext := ExtractFileExt(AFileName); Ext := ExtractFileExt(AFileName);
if (Ext = '') or (Ext = AFileName) then if (Ext = '') or (Ext = AFileName) then
begin begin
NewMaskValue := Copy(fInitialMask,1,Length(fInitialMask)-1); NewMaskValue := Utf8Copy(fInitialMask,1,Utf8Length(fInitialMask)-1);
ClearMaskString; ClearMaskString;
InitMaskString(NewMaskValue, fCaseSensitive); InitMaskString(NewMaskValue, fCaseSensitive);
Result := Matches(AFileName); Result := Matches(AFileName);
@ -428,14 +460,14 @@ begin
Result := False; Result := False;
end; end;
end end
else if (Length(fInitialMask) > 2) and (RightStr(fInitialMask,2) = '.*') then else if (Utf8Length(fInitialMask) > 2) and (Utf8RightStr(fInitialMask,2) = '.*') then
//foo.* (but not '.*') //foo.* (but not '.*')
begin begin
//First see if we have 'foo' //First see if we have 'foo'
if fCaseSensitive then if fCaseSensitive then
Result := (AFileName = Copy(fInitialMask,1,Length(fInitialMask)-2)) Result := (AFileName = Utf8Copy(fInitialMask,1,Utf8Length(fInitialMask)-2))
else else
Result := (CompareText(AFileName,Copy(fInitialMask,1,Length(fInitialMask)-2)) = 0); Result := (Utf8CompareText(AFileName,Utf8Copy(fInitialMask,1,Utf8Length(fInitialMask)-2)) = 0);
if not Result then Result := Matches(AFileName); if not Result then Result := Matches(AFileName);
end end
else else
@ -462,7 +494,7 @@ begin
S := I + 1; S := I + 1;
end; end;
end; end;
if Length(AText) >= S then Add(Copy(AText, S, Length(AText) - S + 1)); if Length(AText) >= S then Add(Copy(AText, S, Length(AText) - S + 1));
end; end;
@ -484,7 +516,7 @@ var
I: Integer; I: Integer;
begin begin
FMasks := TObjectList.Create(True); FMasks := TObjectList.Create(True);
S := TParseStringList.Create(AValue, ASeparator); S := TParseStringList.Create(AValue, ASeparator);
try try
for I := 0 to S.Count - 1 do for I := 0 to S.Count - 1 do
@ -497,7 +529,7 @@ end;
destructor TMaskList.Destroy; destructor TMaskList.Destroy;
begin begin
FMasks.Free; FMasks.Free;
inherited Destroy; inherited Destroy;
end; end;
@ -506,7 +538,7 @@ var
I: Integer; I: Integer;
begin begin
Result := False; Result := False;
for I := 0 to FMasks.Count - 1 do for I := 0 to FMasks.Count - 1 do
begin begin
if TMask(FMasks.Items[I]).Matches(AFileName) then if TMask(FMasks.Items[I]).Matches(AFileName) then