* converted readstr/writestr to use system.Utf8CodePointLen()

git-svn-id: trunk@30049 -
This commit is contained in:
Jonas Maebe 2015-03-01 17:12:24 +00:00
parent 9da8a2304a
commit 60dbce940a

View File

@ -2309,76 +2309,28 @@ end;
{$ifdef FPC_HAS_FEATURE_WIDESTRINGS}
{ Returns the number of bytes that the UTF-8 code point starting with
  "firstbyte" occupies, judged from the lead byte alone.

  firstbyte: the first byte of the (supposed) code point.
  result: 1 for an ASCII byte (0xxxxxxx), for a continuation byte (10xxxxxx)
          and for the invalid bytes 0xFE / 0xFF; otherwise the number of
          leading 1 bits of the byte (2..6). }
function UTF8CodePointLength(firstbyte: byte): SizeInt;
  var
    firstzerobit: SizeInt;
  begin
    result:=1;
    { bsr searches for the leftmost 1 bit. We are interested in the
      leftmost 0 bit, so first invert the value
    }
    firstzerobit:=BsrByte(not(firstbyte));
    { 255: there is no zero bit at all (0xFF), 0: the first zero bit is the
      rightmost bit (0xFE) -> both are invalid UTF-8 bytes, since in the
      worst case bit 1 has to be zero.
      7: the topmost bit is already zero, i.e. a plain ASCII byte; without
      this check 7-firstzerobit would yield a length of 0 instead of 1.
    }
    if (firstzerobit=0) or
       (firstzerobit=7) or
       (firstzerobit=255) then
      exit;
    { the number of bytes belonging to this code point is
      7-(pos first 0-bit).
    }
    result:=7-firstzerobit;
  end;
{ Determines how many bytes at the start of the text buffer of "t" can be
  handled without cutting a UTF-8 code point in half.

  t: text record whose buffer (t.bufptr^[0..t.bufpos-1]) is inspected; it is
     only read here.
  result: the index just past the last complete UTF-8 code point found in
          the buffer, or t.bufpos if no complete code point is found.

  NOTE(review): this relies on system.Utf8CodePointLen returning a value > 0
  only for a complete, valid code point (and <= 0 for incomplete or invalid
  data) -- confirm against the RTL documentation. }
function EndOfLastCompleteUTF8CodePoint(var t: textrec): SizeInt;
  var
    i, codepointlen: SizeInt;
  begin
    { walk backwards from the last buffered byte, looking for the last
      position at which a complete code point starts }
    for i:=t.bufpos-1 downto 0 do
      begin
        { we don't care about combining diacritical marks here: we just want a
          valid UTF-8 codepoint that we can translate to UTF-16. The combining
          diacritical marks can be translated separately.
          t.bufpos-i is the number of bytes available from index i up to the
          end of the buffered data }
        codepointlen:=Utf8CodePointLen(pchar(@t.bufptr^[i]),t.bufpos-i,false);
        { complete codepoint -> flush till here }
        if codepointlen>0 then
          begin
            result:=i+codepointlen;
            exit;
          end;
      end;
    { all invalid data, or the buffer is too small to be able to deal with the
      complete utf8char -> nothing else to do but to handle the entire buffer
      (and end up with a partial/invalid character) }
    result:=t.bufpos;
  end;