lazutf8: improved UTF8CharacterLength and UTF8CharacterLengthFast

git-svn-id: trunk@52857 -
2025-08-18 07:39:22 +02:00 · 2016-08-21 21:14:01 +00:00 · 2016-08-21 21:14:01 +00:00 · bc57de6bb9
commit bc57de6bb9
parent 413f000fc0
1 changed files with 51 additions and 42 deletions
--- a/components/lazutils/lazutf8.pas
+++ b/components/lazutils/lazutf8.pas
@ -72,7 +72,7 @@ function GetEnvironmentVariableUTF8(const EnvVar: string): String;
 function SysErrorMessageUTF8(ErrorCode: Integer): String;
 // Returns the size of one codepoint in bytes.
-function UTF8CharacterLength(p: PChar): integer;
+function UTF8CharacterLength(p: PChar): integer; inline;
 // Fast version of UTF8CharacterLength. Assumes the UTF-8 codepoint is valid.
 function UTF8CharacterLengthFast(p: PChar): integer; inline;
@ -370,56 +370,65 @@ begin
  Result := SysToUTF8(SysUtils.SysErrorMessage(ErrorCode));
 end;
-function UTF8CharacterLength(p: PChar): integer;
+function UTF8CharacterLengthFull(p: PChar): integer;
 begin
-  if p<>nil then begin
+  case p^ of
-    if ord(p^)<%11000000 then begin
+  #0..#191: // %11000000
-      // regular single byte character (#0 is a character, this is pascal ;)
+    // regular single byte character (#0 is a character, this is Pascal ;)
-      Result:=1;
+    Result:=1;
-    end
+  #192..#223: // p^ and %11100000 = %11000000
-    else begin
+    begin
-      // multi byte
+      // could be 2 byte character
-      if ((ord(p^) and %11100000) = %11000000) then begin
+      if (ord(p[1]) and %11000000) = %10000000 then
-        // could be 2 byte character
+        Result:=2
        if (ord(p[1]) and %11000000) = %10000000 then
          Result:=2
        else
          Result:=1;
      end
      else if ((ord(p^) and %11110000) = %11100000) then begin
        // could be 3 byte character
        if ((ord(p[1]) and %11000000) = %10000000)
        and ((ord(p[2]) and %11000000) = %10000000) then
          Result:=3
        else
          Result:=1;
      end
      else if ((ord(p^) and %11111000) = %11110000) then begin
        // could be 4 byte character
        if ((ord(p[1]) and %11000000) = %10000000)
        and ((ord(p[2]) and %11000000) = %10000000)
        and ((ord(p[3]) and %11000000) = %10000000) then
          Result:=4
        else
          Result:=1;
      end
      else
        Result:=1;
    end;
-  end else
+  #224..#239: // p^ and %11110000 = %11100000
-    Result:=0;
+    begin
      // could be 3 byte character
      if ((ord(p[1]) and %11000000) = %10000000)
      and ((ord(p[2]) and %11000000) = %10000000) then
        Result:=3
      else
        Result:=1;
    end;
  #240..#247: // p^ and %11111000 = %11110000
    begin
      // could be 4 byte character
      if ((ord(p[1]) and %11000000) = %10000000)
      and ((ord(p[2]) and %11000000) = %10000000)
      and ((ord(p[3]) and %11000000) = %10000000) then
        Result:=4
      else
        Result:=1;
    end;
  else
    Result:=1;
  end;
 end;
 // Returns the size in bytes of the UTF-8 codepoint starting at p.
 // Result is 0 when p is nil and 1 when the first byte is below #192;
 // any other first byte is passed to UTF8CharacterLengthFull, which
 // inspects the following bytes before reporting a multi-byte length.
 function UTF8CharacterLength(p: PChar): integer; inline;
 begin
  if p=nil then exit(0); // nil pointer: there is no character to measure
  if p^<#192 then exit(1); // first byte < %11000000: single byte, handled without calling the full routine
  Result:=UTF8CharacterLengthFull(p); // possible multi-byte lead byte: let the full routine decide
 end;
 function UTF8CharacterLengthFast(p: PChar): integer;
 begin
  case p^ of
-    #0..#191,#255: Result := 1;
+    #0..#191   : Result := 1;
-    #192..#223   : Result := 2;
+    #192..#223 : Result := 2;
-    #224..#239   : Result := 3;
+    #224..#239 : Result := 3;
-    #240..#247   : Result := 4;
+    #240..#247 : Result := 4;
-    #248..#251   : Result := 5;
+    #248..#255 : Result := 1;
-    #252, #253   : Result := 6;
+    // Theoretically UTF-8 supports length 1-7, but since 2003, RFC 3629 limits
-    #254         : Result := 7;
+    // it to 1-4 bytes.
    // This is an inline function, so keep the function short.
    //#248..#251   : Result := 5;
    //#252, #253   : Result := 6;
    //#254         : Result := 7;
  end;
 end;