LazUtf8: Refactor UTF8FindNearestCharStart. Resolves Issue #0029851.

git-svn-id: trunk@51973 -
This commit is contained in:
bart 2016-03-17 10:42:52 +00:00
parent 34be9ae2d0
commit b192fb9760

View File

@ -643,45 +643,22 @@ begin
end; end;
{ Find the start of the UTF8 character which contains BytePos, { Find the start of the UTF8 character which contains BytePos,
if BytePos is not part of a valid Utf8Codepoint the function returns BytePos
Len is length in byte, BytePos starts at 0 } Len is length in byte, BytePos starts at 0 }
function UTF8FindNearestCharStart(UTF8Str: PChar; Len: SizeInt; BytePos: SizeInt function UTF8FindNearestCharStart(UTF8Str: PChar; Len: SizeInt; BytePos: SizeInt): SizeInt;
): SizeInt; var
CurPos: PChar;
CharLen: Integer;
begin begin
Result:=0; if (BytePos > Len-1) then BytePos := Len - 1;
if (UTF8Str<>nil) and (Len>0) and (BytePos>=0) then begin CurPos := Utf8Str + BytePos;
Result:=BytePos; //No need to check the result value, since when it returns False CurPos will be reset
if Result>Len then Result:=Len-1; //to its original value, and that's what we want to return in that case
if (Result>0) and (ord(UTF8Str[Result]) and %11000000=%10000000) then begin Utf8TryFindCodepointStart(Utf8Str, CurPos, CharLen);
dec(Result); Result := CurPos - Utf8Str;
if (Result>0) and (ord(UTF8Str[Result]) and %11000000=%10000000) then begin
dec(Result);
if (Result>0) and (ord(UTF8Str[Result]) and %11000000=%10000000) then begin
dec(Result);
// should be four byte character
if (ord(UTF8Str[Result]) and %11111000<>%11110000) then begin
// broken UTF8 character
inc(Result,3);
end else begin
// is four byte character
end;
end else if (ord(UTF8Str[Result]) and %11110000<>%11100000) then begin
// broken UTF8 character, should be three byte
inc(Result,2);
end else
begin
// is three byte character
end;
end else if (ord(UTF8Str[Result]) and %11100000<>%11000000) then begin
// broken UTF8 character, should be two byte
inc(Result);
end else
begin
// is two byte character
end;
end;
end;
end; end;
{ Len is the length in bytes of UTF8Str { Len is the length in bytes of UTF8Str
CharIndex is the position of the desired char (starting at 0), in chars CharIndex is the position of the desired char (starting at 0), in chars
} }