LazUtils: Deprecate UTF8CompareLatinTextFast. Implement TStringListUTF8Fast differently.

This commit is contained in:
Juha 2024-02-09 00:16:24 +02:00
parent 041341dd1c
commit 890f92e33e

View File

@ -151,6 +151,8 @@ function UTF8WrapText(S, BreakStr: string; BreakChars: TSysCharSet; MaxCol, Inde
function UTF8WrapText(S, BreakStr: string; BreakChars: TSysCharSet; MaxCol: integer): string; overload;
function UTF8WrapText(S: string; MaxCol: integer): string; overload;
function IsPureAscii(S: string): Boolean; // String has only ASCII characters.
type
TEscapeMode = (emPascal, emHexPascal, emHexC, emC, emAsciiControlNames);
@ -175,17 +177,23 @@ function UTF8CompareStrP(S1, S2: PChar): PtrInt;
function UTF8CompareStr(S1: PChar; Count1: SizeInt; S2: PChar; Count2: SizeInt): PtrInt;
function UTF8CompareText(const S1, S2: string): PtrInt;
function UTF8CompareTextP(S1, S2: PChar): PtrInt;
function UTF8CompareLatinTextFast(S1, S2: String): PtrInt;
function UTF8CompareStrCollated(const S1, S2: string): PtrInt; {$IFnDEF ACP_RTL}inline;{$endif}
// Deprecated in Lazarus 3.99, February 2024.
function UTF8CompareLatinTextFast(S1, S2: String): PtrInt; deprecated 'Use UTF8CompareText or AnsiCompareText instead.';
function UTF8CompareStrCollated(const S1, S2: string): PtrInt; deprecated 'Use UTF8CompareStr instead.';
function CompareStrListUTF8LowerCase(List: TStringList; Index1, Index2: Integer): Integer;
type
{ TStringListUTF8Fast }
// This list uses fast ASCII comparison functions when its data is pure ASCII.
// When data is Unicode, it switches to slower AnsiCompare.. functions.
// The switch is managed by setting "UseLocale" which should not be changed by user.
TStringListUTF8Fast = class(TStringList)
protected // Uses UTF8CompareLatinTextFast for comparison.
function DoCompareText(const s1,s2 : string): PtrInt; override;
protected
procedure InsertItem(Index: Integer; const S: string; O: TObject); override;
public
constructor Create;
end;
TConvertResult = (trNoError, trNullSrc, trNullDest, trDestExhausted,
@ -3192,7 +3200,7 @@ begin
TextLen := Utf8Length(AText);
SubTextLen := Utf8Length(ASubText);
if (TextLen >= SubTextLen) then
Result := (UTF8CompareLatinTextFast(Utf8Copy(AText,1,SubTextLen),ASubText) = 0);
Result := UTF8CompareText(UTF8Copy(AText,1,SubTextLen),ASubText) = 0;
end;
end;
@ -3206,7 +3214,7 @@ begin
TextLen := Utf8Length(AText);
SubTextLen := Utf8Length(ASubText);
if (TextLen >= SubTextLen) then
Result := (UTF8CompareLatinTextFast(Utf8Copy(AText,TextLen-SubTextLen+1,SubTextLen),ASubText) = 0);
Result := UTF8CompareText(UTF8Copy(AText,TextLen-SubTextLen+1,SubTextLen),ASubText) = 0;
end;
end;
@ -3303,6 +3311,16 @@ begin
Result := UTF8WrapText(S, LineEnding, [' ', '-', #9], MaxCol);
end;
function IsPureAscii(S: string): Boolean;
var
i: Integer;
begin
for i := 1 to Length(S) do
if Ord(S[i]) > $7F then // Not ASCII.
Exit(False);
Result := True;
end;
function UTF8Trim(const s: string; Flags: TUTF8TrimFlags): string;
var
p: PChar;
@ -3524,67 +3542,11 @@ begin
end;
function UTF8CompareLatinTextFast(S1, S2: String): PtrInt;
// Like UTF8CompareText but does not return strict alphabetical order.
// The order is deterministic and good for binary search and such uses.
// Optimizes comparison of single-byte encoding and also multi-byte portions
// when they are equal. Otherwise falls back to AnsiCompareText.
var
Count, Count1, Count2: SizeInt;
Chr1, Chr2: Char;
P1, P2, P1LastBytePointOffset: PChar;
begin
Count1 := Length(S1);
Count2 := Length(S2);
if Count1 > Count2 then
Count := Count2
else
Count := Count1;
if Count > 0 then
begin
P1 := @S1[1];
P2 := @S2[1];
P1LastBytePointOffset := P1;
while Count > 0 do
begin
Chr1 := P1^;
Chr2 := P2^;
if Chr1 <> Chr2 then
begin
if (Ord(Chr1) or Ord(Chr2)) < 128 then
begin
P1LastBytePointOffset := P1;
if (Chr1 in ['A'..'Z']) then
inc(Chr1, $20);
if (Chr2 in ['A'..'Z']) then
inc(Chr2, $20);
if Chr1 <> Chr2 then
break;
end
else
begin
P2 := P2 + (P1LastBytePointOffset - P1);
Delete(S1, 1, P1LastBytePointOffset-@S1[1]);
Delete(S2, 1, P2-@S2[1]);
Exit(AnsiCompareText(S1, S2));
end;
end
else
if (Ord(Chr1) or Ord(Chr2)) < 128 then
P1LastBytePointOffset := P1;
Inc(P1); Inc(P2);
Dec(Count);
end;
end;
if Count > 0 then
//Both Chr1 and Ch2 are lower ASCII if we arrive here, so this is safe
Result := AnsiCompareText(Chr1, Chr2)
else
Result := Count1-Count2;
Result := UTF8CompareText(S1, S2);
end;
function UTF8CompareStrCollated(const S1, S2: string): PtrInt; {$IFnDEF ACP_RTL}inline;{$endif}
function UTF8CompareStrCollated(const S1, S2: string): PtrInt;
begin
{$IFDEF ACP_RTL}
//Only with this define AnsiCompareStr does not point to Utf8CompareStr
@ -4047,26 +4009,36 @@ var
LangID: TLanguageID;
begin
LangID := GetLanguageID;
Lang:=LangID.LanguageCode;
Lang := LangID.LanguageCode;
end;
procedure InitFPUpchars;
var
c: Char;
begin
for c:=Low(char) to High(char) do begin
FPUpChars[c]:=upcase(c);
end;
for c:=Low(char) to High(char) do
FPUpChars[c]:=UpCase(c);
end;
{ TStringListUTF8Fast }
function TStringListUTF8Fast.DoCompareText(const s1, s2: string): PtrInt;
constructor TStringListUTF8Fast.Create;
begin
if CaseSensitive then
Result:=Utf8CompareStr(s1, s2)
else
Result:=UTF8CompareLatinTextFast(s1, s2);
inherited Create;
UseLocale := False; // Assume pure ASCII string. Comparison will be fast.
end;
procedure TStringListUTF8Fast.InsertItem(Index: Integer; const S: string; O: TObject);
begin
if not IsPureAscii(S) then
begin
// Unicode string found, switch to Unicode mode.
WriteLn('TStringListUTF8Fast.InsertItem: Found non-ASCII string "'+S+'"');
UseLocale := True;
if Sorted then
Sort; // Sort a sorted list again with AnsiCompare.. functions.
end;
inherited InsertItem(Index, S, O);
end;