mirror of
https://gitlab.com/freepascal.org/lazarus/lazarus.git
synced 2025-04-08 11:58:12 +02:00
lazutf8: improved UTF8CharacterLength and UTF8CharacterLengthFast
git-svn-id: trunk@52857 -
This commit is contained in:
parent
413f000fc0
commit
bc57de6bb9
@ -72,7 +72,7 @@ function GetEnvironmentVariableUTF8(const EnvVar: string): String;
|
||||
function SysErrorMessageUTF8(ErrorCode: Integer): String;
|
||||
|
||||
// Returns the size of one codepoint in bytes.
|
||||
function UTF8CharacterLength(p: PChar): integer;
|
||||
function UTF8CharacterLength(p: PChar): integer; inline;
|
||||
// Fast version of UTF8CharacterLength. Assumes the UTF-8 codepoint is valid.
|
||||
function UTF8CharacterLengthFast(p: PChar): integer; inline;
|
||||
|
||||
@ -370,56 +370,65 @@ begin
|
||||
Result := SysToUTF8(SysUtils.SysErrorMessage(ErrorCode));
|
||||
end;
|
||||
|
||||
function UTF8CharacterLength(p: PChar): integer;
|
||||
function UTF8CharacterLengthFull(p: PChar): integer;
|
||||
begin
|
||||
if p<>nil then begin
|
||||
if ord(p^)<%11000000 then begin
|
||||
// regular single byte character (#0 is a character, this is pascal ;)
|
||||
Result:=1;
|
||||
end
|
||||
else begin
|
||||
// multi byte
|
||||
if ((ord(p^) and %11100000) = %11000000) then begin
|
||||
// could be 2 byte character
|
||||
if (ord(p[1]) and %11000000) = %10000000 then
|
||||
Result:=2
|
||||
else
|
||||
Result:=1;
|
||||
end
|
||||
else if ((ord(p^) and %11110000) = %11100000) then begin
|
||||
// could be 3 byte character
|
||||
if ((ord(p[1]) and %11000000) = %10000000)
|
||||
and ((ord(p[2]) and %11000000) = %10000000) then
|
||||
Result:=3
|
||||
else
|
||||
Result:=1;
|
||||
end
|
||||
else if ((ord(p^) and %11111000) = %11110000) then begin
|
||||
// could be 4 byte character
|
||||
if ((ord(p[1]) and %11000000) = %10000000)
|
||||
and ((ord(p[2]) and %11000000) = %10000000)
|
||||
and ((ord(p[3]) and %11000000) = %10000000) then
|
||||
Result:=4
|
||||
else
|
||||
Result:=1;
|
||||
end
|
||||
case p^ of
|
||||
#0..#191: // %11000000
|
||||
// regular single byte character (#0 is a character, this is Pascal ;)
|
||||
Result:=1;
|
||||
#192..#223: // p^ and %11100000 = %11000000
|
||||
begin
|
||||
// could be 2 byte character
|
||||
if (ord(p[1]) and %11000000) = %10000000 then
|
||||
Result:=2
|
||||
else
|
||||
Result:=1;
|
||||
end;
|
||||
end else
|
||||
Result:=0;
|
||||
#224..#239: // p^ and %11110000 = %11100000
|
||||
begin
|
||||
// could be 3 byte character
|
||||
if ((ord(p[1]) and %11000000) = %10000000)
|
||||
and ((ord(p[2]) and %11000000) = %10000000) then
|
||||
Result:=3
|
||||
else
|
||||
Result:=1;
|
||||
end;
|
||||
#240..#247: // p^ and %11111000 = %11110000
|
||||
begin
|
||||
// could be 4 byte character
|
||||
if ((ord(p[1]) and %11000000) = %10000000)
|
||||
and ((ord(p[2]) and %11000000) = %10000000)
|
||||
and ((ord(p[3]) and %11000000) = %10000000) then
|
||||
Result:=4
|
||||
else
|
||||
Result:=1;
|
||||
end;
|
||||
else
|
||||
Result:=1;
|
||||
end;
|
||||
end;
|
||||
|
||||
// Returns the size in bytes of the UTF-8 codepoint starting at p.
// A nil pointer yields 0. Any lead byte below #192 is answered directly with 1;
// every other lead byte is delegated to UTF8CharacterLengthFull.
// Kept deliberately small because the function is declared inline.
function UTF8CharacterLength(p: PChar): integer; inline;
begin
  if p=nil then
    Result:=0
  else if p^<#192 then
    Result:=1
  else
    Result:=UTF8CharacterLengthFull(p);
end;
|
||||
|
||||
function UTF8CharacterLengthFast(p: PChar): integer;
|
||||
begin
|
||||
case p^ of
|
||||
#0..#191,#255: Result := 1;
|
||||
#192..#223 : Result := 2;
|
||||
#224..#239 : Result := 3;
|
||||
#240..#247 : Result := 4;
|
||||
#248..#251 : Result := 5;
|
||||
#252, #253 : Result := 6;
|
||||
#254 : Result := 7;
|
||||
#0..#191 : Result := 1;
|
||||
#192..#223 : Result := 2;
|
||||
#224..#239 : Result := 3;
|
||||
#240..#247 : Result := 4;
|
||||
#248..#255 : Result := 1;
|
||||
// The original UTF-8 (RFC 2279) allowed 1-6 bytes (7 only by extending the lead-byte pattern), but since 2003, RFC 3629 limits
|
||||
// it to 1-4 bytes.
|
||||
// This is an inline function, so keep the function short.
|
||||
//#248..#251 : Result := 5;
|
||||
//#252, #253 : Result := 6;
|
||||
//#254 : Result := 7;
|
||||
end;
|
||||
end;
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user