lazutf8: improved UTF8CharacterLength and UTF8CharacterLengthFast

git-svn-id: trunk@52857 -
This commit is contained in:
mattias 2016-08-21 21:14:01 +00:00
parent 413f000fc0
commit bc57de6bb9

View File

@ -72,7 +72,7 @@ function GetEnvironmentVariableUTF8(const EnvVar: string): String;
function SysErrorMessageUTF8(ErrorCode: Integer): String; function SysErrorMessageUTF8(ErrorCode: Integer): String;
// Returns the size of one codepoint in bytes. // Returns the size of one codepoint in bytes.
function UTF8CharacterLength(p: PChar): integer; function UTF8CharacterLength(p: PChar): integer; inline;
// Fast version of UTF8CharacterLength. Assumes the UTF-8 codepoint is valid. // Fast version of UTF8CharacterLength. Assumes the UTF-8 codepoint is valid.
function UTF8CharacterLengthFast(p: PChar): integer; inline; function UTF8CharacterLengthFast(p: PChar): integer; inline;
@ -370,56 +370,65 @@ begin
Result := SysToUTF8(SysUtils.SysErrorMessage(ErrorCode)); Result := SysToUTF8(SysUtils.SysErrorMessage(ErrorCode));
end; end;
function UTF8CharacterLength(p: PChar): integer; function UTF8CharacterLengthFull(p: PChar): integer;
begin begin
if p<>nil then begin case p^ of
if ord(p^)<%11000000 then begin #0..#191: // %11000000
// regular single byte character (#0 is a character, this is pascal ;) // regular single byte character (#0 is a character, this is Pascal ;)
Result:=1; Result:=1;
end #192..#223: // p^ and %11100000 = %11000000
else begin begin
// multi byte // could be 2 byte character
if ((ord(p^) and %11100000) = %11000000) then begin if (ord(p[1]) and %11000000) = %10000000 then
// could be 2 byte character Result:=2
if (ord(p[1]) and %11000000) = %10000000 then
Result:=2
else
Result:=1;
end
else if ((ord(p^) and %11110000) = %11100000) then begin
// could be 3 byte character
if ((ord(p[1]) and %11000000) = %10000000)
and ((ord(p[2]) and %11000000) = %10000000) then
Result:=3
else
Result:=1;
end
else if ((ord(p^) and %11111000) = %11110000) then begin
// could be 4 byte character
if ((ord(p[1]) and %11000000) = %10000000)
and ((ord(p[2]) and %11000000) = %10000000)
and ((ord(p[3]) and %11000000) = %10000000) then
Result:=4
else
Result:=1;
end
else else
Result:=1; Result:=1;
end; end;
end else #224..#239: // p^ and %11110000 = %11100000
Result:=0; begin
// could be 3 byte character
if ((ord(p[1]) and %11000000) = %10000000)
and ((ord(p[2]) and %11000000) = %10000000) then
Result:=3
else
Result:=1;
end;
#240..#247: // p^ and %11111000 = %11110000
begin
// could be 4 byte character
if ((ord(p[1]) and %11000000) = %10000000)
and ((ord(p[2]) and %11000000) = %10000000)
and ((ord(p[3]) and %11000000) = %10000000) then
Result:=4
else
Result:=1;
end;
else
Result:=1;
end;
end;
function UTF8CharacterLength(p: PChar): integer; inline;
begin
if p=nil then exit(0);
if p^<#192 then exit(1);
Result:=UTF8CharacterLengthFull(p);
end; end;
function UTF8CharacterLengthFast(p: PChar): integer; function UTF8CharacterLengthFast(p: PChar): integer;
begin begin
case p^ of case p^ of
#0..#191,#255: Result := 1; #0..#191 : Result := 1;
#192..#223 : Result := 2; #192..#223 : Result := 2;
#224..#239 : Result := 3; #224..#239 : Result := 3;
#240..#247 : Result := 4; #240..#247 : Result := 4;
#248..#251 : Result := 5; #248..#255 : Result := 1;
#252, #253 : Result := 6; // Theoretically UTF-8 supports length 1-7, but since 2003, RFC 3629 limits
#254 : Result := 7; // it to 1-4 bytes.
// This is an inline function, so keep the function short.
//#248..#251 : Result := 5;
//#252, #253 : Result := 6;
//#254 : Result := 7;
end; end;
end; end;