mirror of
https://gitlab.com/freepascal.org/fpc/source.git
synced 2025-08-29 12:20:28 +02:00
* converted readstr/writestr to use system.Utf8CodePointLen()
git-svn-id: trunk@30049 -
This commit is contained in:
parent
9da8a2304a
commit
60dbce940a
@@ -2309,76 +2309,28 @@ end;
|
|||||||
|
|
||||||
|
|
||||||
{$ifdef FPC_HAS_FEATURE_WIDESTRINGS}
|
{$ifdef FPC_HAS_FEATURE_WIDESTRINGS}
|
||||||
function UTF8CodePointLength(firstbyte: byte): SizeInt;
|
|
||||||
var
|
|
||||||
firstzerobit: SizeInt;
|
|
||||||
begin
|
|
||||||
result:=1;
|
|
||||||
{ bsr searches for the leftmost 1 bit. We are interested in the
|
|
||||||
leftmost 0 bit, so first invert the value
|
|
||||||
}
|
|
||||||
firstzerobit:=BsrByte(not(firstbyte));
|
|
||||||
{ if there is no zero bit or the first zero bit is the rightmost bit
|
|
||||||
(bit 0), this is an invalid UTF-8 byte ($ff cannot appear in an
|
|
||||||
UTF-8-encoded string, and in the worst case bit 1 has to be zero)
|
|
||||||
}
|
|
||||||
if (firstzerobit=0) or (firstzerobit=255) then
|
|
||||||
exit;
|
|
||||||
{ the number of bytes belonging to this code point is
|
|
||||||
7-(pos first 0-bit).
|
|
||||||
}
|
|
||||||
result:=7-firstzerobit;
|
|
||||||
end;
|
|
||||||
|
|
||||||
|
|
||||||
function EndOfLastCompleteUTF8CodePoint(var t: textrec): SizeInt;
|
function EndOfLastCompleteUTF8CodePoint(var t: textrec): SizeInt;
|
||||||
var
|
var
|
||||||
i, lenfound, codepointlen: SizeInt;
|
i, lenfound, codepointlen: sizeint;
|
||||||
b: byte;
|
b: byte;
|
||||||
begin
|
begin
|
||||||
lenfound:=0;
|
lenfound:=0;
|
||||||
for i:=t.bufpos-1 downto 0 do
|
for i:=t.bufpos-1 downto 0 do
|
||||||
begin
|
begin
|
||||||
b:=byte(t.bufptr^[i]);
|
{ we don't care about combining diacritical marks here: we just want a
|
||||||
if b<=127 then
|
valid UTF-8 codepoint that we can translate to UTF-16. The combining
|
||||||
|
diacritical marks can be translated separately }
|
||||||
|
codepointlen:=Utf8CodePointLen(pchar(@t.bufptr^[i]),(t.bufpos-1-i)+1,false);
|
||||||
|
{ complete codepoint -> flush till here }
|
||||||
|
if codepointlen>0 then
|
||||||
begin
|
begin
|
||||||
if lenfound = 0 then
|
result:=i+codepointlen;
|
||||||
{ valid simple code point }
|
|
||||||
result:=i+1
|
|
||||||
else
|
|
||||||
{ valid simple code point followed by a bunch of invalid data ->
|
|
||||||
handle everything since it can't become valid by adding more
|
|
||||||
bytes }
|
|
||||||
result:=t.bufpos;
|
|
||||||
exit;
|
exit;
|
||||||
end;
|
end
|
||||||
{ start of a complex character }
|
|
||||||
if (b and %11000000)<>0 then
|
|
||||||
begin
|
|
||||||
codepointlen:=UTF8CodePointLength(b);
|
|
||||||
{ we did not yet get all bytes of the last code point -> handle
|
|
||||||
everything until the start of this character }
|
|
||||||
if codepointlen>lenfound+1 then
|
|
||||||
if i<>0 then
|
|
||||||
result:=i
|
|
||||||
{ the buffer is too small to contain the entire utf-8 code point
|
|
||||||
-> nothing else to do but handle the entire buffer (and end up
|
|
||||||
with an invalid character) -- since writestr uses the default
|
|
||||||
buffer size of 32 bytes, this can only happen for invalid utf-8
|
|
||||||
encodings }
|
|
||||||
else
|
|
||||||
result:=t.bufpos
|
|
||||||
{ the last code point is invalid -> handle everything since it can't
|
|
||||||
become valid by adding more bytes; in case it's complete, we also
|
|
||||||
handle everything, of course}
|
|
||||||
else
|
|
||||||
result:=t.bufpos;
|
|
||||||
exit;
|
|
||||||
end;
|
|
||||||
inc(lenfound);
|
|
||||||
end;
|
end;
|
||||||
{ all invalid data, or the buffer is too small to be able to deal with the
|
{ all invalid data, or the buffer is too small to be able to deal with the
|
||||||
complete utf8char -> nothing else to do but to handle the entire buffer }
|
complete utf8char -> nothing else to do but to handle the entire buffer
|
||||||
|
(and end up with a partial/invalid character) }
|
||||||
result:=t.bufpos;
|
result:=t.bufpos;
|
||||||
end;
|
end;
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user