* converted readstr/writestr to use system.Utf8CodePointLen()

git-svn-id: trunk@30049 -
2025-04-20 15:49:27 +02:00 · 2015-03-01 17:12:24 +00:00 · 2015-03-01 17:12:24 +00:00 · 60dbce940a
commit 60dbce940a
parent 9da8a2304a
1 changed file with 11 additions and 59 deletions
--- a/rtl/inc/text.inc
+++ b/rtl/inc/text.inc
@@ -2309,76 +2309,28 @@ end;


 {$ifdef FPC_HAS_FEATURE_WIDESTRINGS}
-function UTF8CodePointLength(firstbyte: byte): SizeInt;
-var
-  firstzerobit: SizeInt;
-begin
-  result:=1;
-  { bsr searches for the leftmost 1 bit. We are interested in the
-    leftmost 0 bit, so first invert the value
-  }
-  firstzerobit:=BsrByte(not(firstbyte));
-  { if there is no zero bit or the first zero bit is the rightmost bit
-    (bit 0), this is an invalid UTF-8 byte ($ff cannot appear in an
-    UTF-8-encoded string, and in the worst case bit 1 has to be zero)
-  }
-  if (firstzerobit=0) or (firstzerobit=255)  then
-    exit;
-  { the number of bytes belonging to this code point is
-    7-(pos first 0-bit).
-  }
-  result:=7-firstzerobit;
-end;
-
-
 function EndOfLastCompleteUTF8CodePoint(var t: textrec): SizeInt;
 var
-  i, lenfound, codepointlen: SizeInt;
+  i, lenfound, codepointlen: sizeint;
  b: byte;
 begin
  lenfound:=0;
  for i:=t.bufpos-1 downto 0 do
    begin
-      b:=byte(t.bufptr^[i]);
-      if b<=127 then
+      { we don't care about combining diacritical marks here: we just want a
+        valid UTF-8 codepoint that we can translate to UTF-16. The combining
+        diacritical marks can be translated separately }
+      codepointlen:=Utf8CodePointLen(pchar(@t.bufptr^[i]),(t.bufpos-1-i)+1,false);
+      { complete codepoint -> flush till here }
+      if codepointlen>0 then
        begin
-          if lenfound = 0 then
-            { valid simple code point }
-            result:=i+1
-          else
-            { valid simple code point followed by a bunch of invalid data ->
-              handle everything since it can't become valid by adding more
-              bytes }
-            result:=t.bufpos;
+          result:=i+codepointlen;
          exit;
-        end;
-      { start of a complex character }
-      if (b and %11000000)<>0 then
-        begin
-          codepointlen:=UTF8CodePointLength(b);
-          { we did not yet get all bytes of the last code point -> handle
-            everything until the start of this character }
-          if codepointlen>lenfound+1 then
-            if i<>0 then
-              result:=i
-            { the buffer is too small to contain the entire utf-8 code point
-              -> nothing else to do but handle the entire buffer (and end up
-              with an invalid character) -- since writestr uses the default
-              buffer size of 32 bytes, this can only happen for invalid utf-8
-              encodings }
-            else
-              result:=t.bufpos
-          { the last code point is invalid -> handle everything since it can't
-            become valid by adding more bytes; in case it's complete, we also
-            handle everything, of course}
-          else
-            result:=t.bufpos;
-          exit;
-        end;
-      inc(lenfound);
+        end
    end;
  { all invalid data, or the buffer is too small to be able to deal with the
-    complete utf8char -> nothing else to do but to handle the entire buffer }
+    complete utf8char -> nothing else to do but to handle the entire buffer
+    (and end up with a partial/invalid character) }
  result:=t.bufpos;
 end;