* converted readstr/writestr to use system.Utf8CodePointLen()

git-svn-id: trunk@30049 -
2025-08-29 12:20:28 +02:00 · 2015-03-01 17:12:24 +00:00 · 2015-03-01 17:12:24 +00:00 · 60dbce940a
commit 60dbce940a
parent 9da8a2304a
1 changed file with 11 additions and 59 deletions
--- a/rtl/inc/text.inc
+++ b/rtl/inc/text.inc
@@ -2309,76 +2309,28 @@ end;
 {$ifdef FPC_HAS_FEATURE_WIDESTRINGS}
 function UTF8CodePointLength(firstbyte: byte): SizeInt;
 { Returns the length in bytes of the UTF-8 sequence announced by its first
   byte.

   firstbyte: the first byte of the sequence to classify.

   Result:
     - 2..6 when firstbyte has the form 110xxxxx .. 1111110x, i.e. one less
       than the number of leading 1 bits is the number of trailing bytes;
     - 1 in every other case: a plain 7-bit byte (0xxxxxxx), a continuation
       byte (10xxxxxx), and the bytes $fe and $ff, which cannot start a
       sequence.
   The result is therefore never smaller than 1. }
 var
   firstzerobit: SizeInt;
 begin
   { BsrByte searches for the leftmost 1 bit. We are interested in the
     leftmost 0 bit, so complement the byte first. The xor with $ff keeps
     the complemented value within 0..255. }
   firstzerobit:=BsrByte(firstbyte xor $ff);
   case firstzerobit of
     { leftmost 0 bit at position 1..6: the number of bytes belonging to
       this code point is 7-(position of the first 0 bit); position 6
       (a continuation byte) yields 1 }
     1..6:
       result:=7-firstzerobit;
   else
     { position 7: 7-bit byte, always a single byte (7-7 would wrongly
       give 0 here);
       position 0: $fe, no zero bit found (BsrByte returns 255): $ff;
       neither of the latter two can appear in UTF-8, count them as one
       byte }
     result:=1;
   end;
 end;
 function EndOfLastCompleteUTF8CodePoint(var t: textrec): SizeInt;
 { Determines how many bytes at the start of the text buffer of t can be
   handled now without cutting a UTF-8 code point in half.

   t: text record whose buffer (t.bufptr^) currently holds t.bufpos bytes.
      Only read here, not modified.

   Result: the index just past the last code point that is complete within
   the first t.bufpos bytes of the buffer, or t.bufpos when no position in
   the buffer starts a complete code point (this includes an empty
   buffer). }
 var
   i, codepointlen: sizeint;
 begin
   { walk backwards from the last buffered byte, looking for the last
     position at which a complete code point starts }
   for i:=t.bufpos-1 downto 0 do
     begin
       { we don't care about combining diacritical marks here: we just want a
         valid UTF-8 codepoint that we can translate to UTF-16. The combining
         diacritical marks can be translated separately.
         The second argument is the number of bytes available from index i
         up to and including the last buffered byte. }
       codepointlen:=Utf8CodePointLen(pchar(@t.bufptr^[i]),(t.bufpos-1-i)+1,false);
       { complete codepoint -> flush till here }
       if codepointlen>0 then
         begin
           result:=i+codepointlen;
           exit;
         end;
       { NOTE(review): a non-positive value presumably means "invalid" or
         "incomplete" at this position - confirm against the documentation
         of Utf8CodePointLen; either way we keep looking further back }
     end;
   { all invalid data, or the buffer is too small to be able to deal with the
     complete utf8char -> nothing else to do but to handle the entire buffer
     (and end up with a partial/invalid character) }
   result:=t.bufpos;
 end;