mirror of
https://gitlab.com/freepascal.org/fpc/source.git
synced 2025-04-20 15:49:27 +02:00
* converted readstr/writestr to use system.Utf8CodePointLen()
git-svn-id: trunk@30049 -
This commit is contained in:
parent
9da8a2304a
commit
60dbce940a
@ -2309,76 +2309,28 @@ end;
|
||||
|
||||
|
||||
{$ifdef FPC_HAS_FEATURE_WIDESTRINGS}
|
||||
{ Returns the number of bytes that a UTF-8 sequence starting with "firstbyte"
  occupies, judged from the bit pattern of that byte alone.

    0xxxxxxx            -> 1 (ASCII)
    110xxxxx..1111110x  -> 2..6 (the count of leading 1 bits; the 5 and 6 byte
                           patterns are still reported with that length)
    10xxxxxx            -> 1 (continuation byte, cannot start a code point)
    $fe, $ff            -> 1 (can never appear in UTF-8)

  The result is always >= 1, so a caller advancing by the returned amount
  always makes progress. }
function UTF8CodePointLength(firstbyte: byte): SizeInt;
var
  firstzerobit: SizeInt;
begin
  { bsr searches for the leftmost 1 bit. We are interested in the leftmost
    0 bit, so first invert the value. BsrByte yields 255 when no bit is set
    at all, i.e. for firstbyte=$ff. }
  firstzerobit:=BsrByte(not(firstbyte));
  case firstzerobit of
    { leftmost zero at bit 6..1: the byte starts with (7-firstzerobit)
      one-bits, which is the number of bytes belonging to this code point
      (and 1 for a stray continuation byte) }
    1..6:
      result:=7-firstzerobit;
    { bit 7 clear: plain ASCII, a single byte (7-7 would wrongly give 0);
      bit 0 or no zero bit at all: invalid lead byte, treat as one byte }
    else
      result:=1;
  end;
end;
|
||||
|
||||
|
||||
{ Returns the offset in t's buffer (counted from t.bufptr^[0]) just past the
  last complete UTF-8 code point found among the first t.bufpos bytes, i.e.
  how many bytes can be handled now without cutting a code point in half.

  The buffer is scanned backwards from the last filled byte; at every start
  position Utf8CodePointLen is asked about the bytes from there up to
  t.bufpos-1. The first position for which it reports a positive length
  determines the result. If no position does, t.bufpos is returned so that
  the whole buffer gets handled anyway. }
function EndOfLastCompleteUTF8CodePoint(var t: textrec): SizeInt;
var
  i, codepointlen: SizeInt;
begin
  for i:=t.bufpos-1 downto 0 do
    begin
      { we don't care about combining diacritical marks here: we just want a
        valid UTF-8 codepoint that we can translate to UTF-16. The combining
        diacritical marks can be translated separately }
      codepointlen:=Utf8CodePointLen(pchar(@t.bufptr^[i]),(t.bufpos-1-i)+1,false);
      { complete codepoint -> flush till here }
      if codepointlen>0 then
        begin
          result:=i+codepointlen;
          exit;
        end;
    end;
  { all invalid data, or the buffer is too small to be able to deal with the
    complete utf8char -> nothing else to do but to handle the entire buffer
    (and end up with a partial/invalid character) }
  result:=t.bufpos;
end;
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user