Merged revision(s) 51977-51978 #77e5428b3f-#77e5428b3f, 52004 #5d6e5012ea from trunk:

LazUtf8: first attempt to rewrite Utf8CompareStr and Utf8CompareText so that their results will be more consistent with AnsiCompareStr/WideCompareStr and AnsiCompareText/WideCompareText. ( The old implementation was in effect a copy of CompareStr, and this made the claim about proper collation in Utf8CompareText (which uses Utf8CompareStr) rather ludicrous. The new implementation is slower, mainly because of the fact we cannot use CompareMemRange/CompareByte anymore, and we have to iterate the bytes ourselves. This fact alone contributes much more to the loss in speed than the fact we use WideCompareStr on the 2 differing codepoints: - iterating in a for loop: adds a factor of appr. 10 to the time needed - using the final WideCompareStr adds a factor of about 1.6 to the time needed. Because of the slowdown in speed in Utf8CompareStr, Utf8CompareText now calls WideCompareText directly, which is now appr. the same speed as converting to lowercase and then calling Utf8CompareStr ) ........ LazUtf8: In UTF8CompareStrCollated only call AnsiCompareStr if ACP_RTL is defined, since in all other cases AnsiCompareStr = widestringmanager.CompareStrAnsiStringProc = UTF8CompareStr. If ACP_RTL is not defined call Utf8CompareStr, since this now does proper collation and is faster than converting to WideString. ........ LazFileUtils: fix uninitialized result in ChompPathDelim. Issue #0029866. ........ git-svn-id: branches/fixes_1_6@52012 -
2025-04-19 22:29:25 +02:00 · 2016-03-21 21:27:29 +00:00 · 2016-03-21 21:27:29 +00:00 · 33f3750e87
commit 33f3750e87
parent cc3b879102
2 changed files with 82 additions and 32 deletions
--- a/components/lazutils/lazfileutils.pas
+++ b/components/lazutils/lazfileutils.pas
@ -728,10 +728,9 @@ function ChompPathDelim(const Path: string): string;
 var
  Len, MinLen: Integer;
 begin
+  Result:=Path;
  if Path = '' then
    exit;
-
-  Result:=Path;
  Len:=length(Result);
  if (Result[1] in AllowDirectorySeparators) then begin
    MinLen := 1;
--- a/components/lazutils/lazutf8.pas
+++ b/components/lazutils/lazutf8.pas
@ -148,7 +148,7 @@ function UTF8CompareStr(const S1, S2: string): PtrInt; inline;
 function UTF8CompareStrP(S1, S2: PChar): PtrInt;
 function UTF8CompareStr(S1: PChar; Count1: SizeInt; S2: PChar; Count2: SizeInt): PtrInt;
 function UTF8CompareText(const S1, S2: string): PtrInt;
-function UTF8CompareStrCollated(const S1, S2: string): PtrInt;
+function UTF8CompareStrCollated(const S1, S2: string): PtrInt; {$IFnDEF ACP_RTL}inline;{$endif}
 function CompareStrListUTF8LowerCase(List: TStringList; Index1, Index2: Integer): Integer;

 type
@ -3053,15 +3053,25 @@ end;

 {------------------------------------------------------------------------------
  Name:    UTF8CompareStr
-  Params: S1, S2 - UTF8 encoded strings
-  Returns: < 0 if S1 < S2, 0 if S1 = S2, > 0 if S1 > S2.
+  Params:  S1, S2 - UTF8 encoded strings
+  Compares UTF8 encoded strings
+  Returns
+     0: if S1 = S2
+    -1: if S1 < S2 ("alphabetically")
+    +1: if S1 > S2
+    -2: if S1 < S2, comparison ended at a different byte in an invalid UTF8 codepoint in either S1 or S2 (byte at S1 < byte at S2)
+    +2: if S1 > S2, comparison ended at a different byte in an invalid UTF8 codepoint in either S1 or S2
+
  Compare two UTF8 encoded strings, case sensitive.
-  Internally it uses CompareMemRange, which returns -1 if a byte of S1 is lower than S2.
- ------------------------------------------------------------------------------}
+
+  Internally it uses WideCompareStr on the first Utf8 codepoint that differs between S1 and S2
+  and therefore has proper collation on platforms where the WidestringManager supports this
+  (Windows, *nix with cwstring unit)
+------------------------------------------------------------------------------}
 function UTF8CompareStr(const S1, S2: string): PtrInt;
 begin
  Result := UTF8CompareStr(PChar(Pointer(S1)),length(S1),
-                            PChar(Pointer(S2)),length(S2));
+                           PChar(Pointer(S2)),length(S2));
 end;

 function UTF8CompareStrP(S1, S2: PChar): PtrInt;
@ -3069,24 +3079,68 @@ begin
  Result:=UTF8CompareStr(S1,StrLen(S1),S2,StrLen(S2));
 end;

-function UTF8CompareStr(S1: PChar; Count1: SizeInt; S2: PChar; Count2: SizeInt
-  ): PtrInt;
+
+function UTF8CompareStr(S1: PChar; Count1: SizeInt; S2: PChar; Count2: SizeInt): PtrInt;
 var
  Count: SizeInt;
+  i, CL1, CL2: Integer;
+  B1, B2: Byte;
+  W1, W2: WideString;
+  Org1, Org2: PChar;
 begin
  Result := 0;
-  if Count1>Count2 then
-    Count:=Count2
+  Org1 := S1;
+  Org2 := S2;
+  if (Count1 > Count2) then
+    Count := Count2
  else
-    Count:=Count1;
-  Result := CompareMemRange(Pointer(S1),Pointer(S2), Count); // Note: CompareMemRange can handle nil if Count=0
-  if Result<>0 then exit;
-  if Count1>Count2 then
-    Result:=1
-  else if Count1<Count2 then
-    Result:=-1
+    Count := Count1;
+
+  i := 0;
+  if (Count > 0) then
+  begin
+   //unfortunately we cannot use CompareByte here, so we have to iterate ourselves
+    while (i < Count) do
+    begin
+      B1 := byte(S1^);
+      B2 := byte(S2^);
+      if (B1 <> B2) then
+      begin
+        //writeln('UCS: B1=',IntToHex(B1,2),', B2=',IntToHex(B2,2));
+        Break;
+      end;
+      Inc(S1); Inc(S2); Inc(I);
+    end;
+  end;
+  if (i < Count) then
+  begin
+    //Fallback result
+    Result := B1 - B2;
+    if (Result < 0) then
+      Result := -2
+    else
+      Result := 2;
+    //writeln('UCS: FallBack Result = ',Result);
+    //Try t find start of valid UTF8 codepoints
+    if (not Utf8TryFindCodepointStart(Org1, S1, CL1)) or
+        not Utf8TryFindCodepointStart(Org2, S2, CL2) then
+      Exit;
+
+    //writeln('UCS: CL1=',CL1,', CL2=',CL2);
+    //writeln('S1 = "',S1,'"');
+    //writeln('S2 = "',S2,'"');
+    W1 := Utf8ToUtf16(S1, CL1);
+    W2 := Utf8ToUtf16(S2, CL2);
+    //writeln('UCS: W1 = ',Word(W1[1]),' W2 = ',Word(W2[1]));
+    Result := WideCompareStr(W1, W2);
+  end
  else
-    Result:=0;
+    //Strings are the same up and until size of smallest one
+    Result := Count1 - Count2;
+  if (Result > 1) then
+    Result := 1
+  else if (Result < -1) then
+    Result := -1;
 end;

 {------------------------------------------------------------------------------
@ -3096,23 +3150,20 @@ end;
  Compare two UTF8 encoded strings, case insensitive.
  Note: Use this function instead of AnsiCompareText.
  This function guarantees proper collation on all supported platforms.
-  Internally it uses UTF8CompareStr.
+  Internally it uses WideCompareText.
 ------------------------------------------------------------------------------}
-function UTF8CompareText(const S1, S2: string): PtrInt;
-var
-  S1Lower, S2Lower: string;
-begin
-  S1Lower := UTF8LowerCase(S1);
-  S2Lower := UTF8LowerCase(S2);
-  Result := UTF8CompareStr(S1Lower, S2Lower);
-end;
+ function UTF8CompareText(const S1, S2: String): PtrInt;
+ begin
+   Result := WideCompareText(Utf8ToUtf16(S1),Utf8ToUtf16(S2));
+ end;

-function UTF8CompareStrCollated(const S1, S2: string): PtrInt;
+function UTF8CompareStrCollated(const S1, S2: string): PtrInt; {$IFnDEF ACP_RTL}inline;{$endif}
 begin
-  {$IFDEF MSWINDOWS}
+  {$IFDEF ACP_RTL}
+    //Only with this define AnsiCompareStr does not point to Utf8CompareStr
    Result := AnsiCompareStr(UTF8ToSys(S1), UTF8ToSys(S2));
  {$ELSE}
-    Result := WideCompareStr(WideString(S1),WideString(S2));
+    Result := Utf8CompareStr(S1,S2);
  {$ENDIF}
 end;