diff --git a/rtl/inc/text.inc b/rtl/inc/text.inc
index fa89f8672c..29a2baa804 100644
--- a/rtl/inc/text.inc
+++ b/rtl/inc/text.inc
@@ -2309,75 +2309,30 @@ end;
 
 
 {$ifdef FPC_HAS_FEATURE_WIDESTRINGS}
-function UTF8CodePointLength(firstbyte: byte): SizeInt;
-var
-  firstzerobit: SizeInt;
-begin
-  result:=1;
-  { bsr searches for the leftmost 1 bit. We are interested in the
-    leftmost 0 bit, so first invert the value
-  }
-  firstzerobit:=BsrByte(not(firstbyte));
-  { if there is no zero bit or the first zero bit is the rightmost bit
-    (bit 0), this is an invalid UTF-8 byte ($ff cannot appear in an
-    UTF-8-encoded string, and in the worst case bit 1 has to be zero)
-  }
-  if (firstzerobit=0) or (firstzerobit=255) then
-    exit;
-  { the number of bytes belonging to this code point is
-    7-(pos first 0-bit).
-  }
-  result:=7-firstzerobit;
-end;
-
-
+{ Returns how many bytes at the start of t's buffer can be handled without
+  splitting a UTF-8 code point: scanning backwards from t.bufpos-1, the end
+  (index+length) of the first complete code point that Utf8CodePointLen
+  reports, or t.bufpos if it reports none. }
 function EndOfLastCompleteUTF8CodePoint(var t: textrec): SizeInt;
 var
-  i, lenfound, codepointlen: SizeInt;
-  b: byte;
+  i, codepointlen: SizeInt;
 begin
-  lenfound:=0;
   for i:=t.bufpos-1 downto 0 do
     begin
-      b:=byte(t.bufptr^[i]);
-      if b<=127 then
+      { we don't care about combining diacritical marks here: we just want a
+        valid UTF-8 codepoint that we can translate to UTF-16. The combining
+        diacritical marks can be translated separately }
+      { t.bufpos-i is the number of buffered bytes from index i onwards }
+      codepointlen:=Utf8CodePointLen(pchar(@t.bufptr^[i]),t.bufpos-i,false);
+      { complete codepoint -> flush till here }
+      if codepointlen>0 then
         begin
-          if lenfound = 0 then
-            { valid simple code point }
-            result:=i+1
-          else
-            { valid simple code point followed by a bunch of invalid data ->
-              handle everything since it can't become valid by adding more
-              bytes }
-            result:=t.bufpos;
+          result:=i+codepointlen;
           exit;
         end;
-      { start of a complex character }
-      if (b and %11000000)<>0 then
-        begin
-          codepointlen:=UTF8CodePointLength(b);
-          { we did not yet get all bytes of the last code point -> handle
-            everything until the start of this character }
-          if codepointlen>lenfound+1 then
-            if i<>0 then
-              result:=i
-            { the buffer is too small to contain the entire utf-8 code point
-              -> nothing else to do but handle the entire buffer (and end up
-              with an invalid character) -- since writestr uses the default
-              buffer size of 32 bytes, this can only happen for invalid utf-8
-              encodings }
-            else
-              result:=t.bufpos
-          { the last code point is invalid -> handle everything since it can't
-            become valid by adding more bytes; in case it's complete, we also
-            handle everything, of course}
-          else
-            result:=t.bufpos;
-          exit;
-        end;
-      inc(lenfound);
     end;
   { all invalid data, or the buffer is too small to be able to deal with the
-    complete utf8char -> nothing else to do but to handle the entire buffer }
+    complete utf8char -> nothing else to do but to handle the entire buffer
+    (and end up with a partial/invalid character) }
   result:=t.bufpos;
 end;