* implement support for 4 Byte UTF-8 codepoints that result in a surrogate pair for UTF-16

git-svn-id: trunk@36116 -
2025-12-16 02:30:41 +01:00 · 2017-05-05 14:03:57 +00:00 · 2017-05-05 14:03:57 +00:00 · c552b2957a
commit c552b2957a
parent af48d176ec
3 changed files with 53 additions and 2 deletions
--- a/.gitattributes
+++ b/.gitattributes
@ -12291,6 +12291,7 @@ tests/test/tcpstr26b.pp svneol=native#text/plain
 tests/test/tcpstr26c.pp svneol=native#text/plain
 tests/test/tcpstr26d.pp svneol=native#text/plain
 tests/test/tcpstr27.pp svneol=native#text/plain
+tests/test/tcpstr28.pp svneol=native#text/pascal
 tests/test/tcpstr2a.pp svneol=native#text/plain
 tests/test/tcpstr3.pp svneol=native#text/plain
 tests/test/tcpstr4.pp svneol=native#text/plain
--- a/compiler/scanner.pas
+++ b/compiler/scanner.pas
@ -4586,6 +4586,7 @@ type
    procedure tscannerfile.readtoken(allowrecordtoken:boolean);
      var
        code    : integer;
+        d : cardinal;
        len,
        low,high,mid : longint;
        w : word;
@ -5160,9 +5161,35 @@ type
                                   iswidestring:=true;
                                   len:=0;
                                 end;
-                               { four or more chars aren't handled }
+                               { four chars }
                               if (ord(c) and $f0)=$f0 then
-                                 message(scan_e_utf8_bigger_than_65535)
+                                 begin
+                                   { this always represents a surrogate pair, so
+                                     read as 32-bit value and then split into
+                                     the corresponding pair of two wchars }
+                                   d:=ord(c) and $f;
+                                   readchar;
+                                   if (ord(c) and $c0)<>$80 then
+                                     message(scan_e_utf8_malformed);
+                                   d:=(d shl 6) or (ord(c) and $3f);
+                                   readchar;
+                                   if (ord(c) and $c0)<>$80 then
+                                     message(scan_e_utf8_malformed);
+                                   d:=(d shl 6) or (ord(c) and $3f);
+                                   readchar;
+                                   if (ord(c) and $c0)<>$80 then
+                                     message(scan_e_utf8_malformed);
+                                   d:=(d shl 6) or (ord(c) and $3f);
+                                   if d<$10000 then
+                                     message(scan_e_utf8_malformed);
+                                   d:=d-$10000;
+                                   { high surrogate }
+                                   w:=$d800+(d shr 10);
+                                   concatwidestringchar(patternw,w);
+                                   { low surrogate }
+                                   w:=$dc00+(d and $3ff);
+                                   concatwidestringchar(patternw,w);
+                                 end
                               { three chars }
                               else if (ord(c) and $e0)=$e0 then
                                 begin
--- a/tests/test/tcpstr28.pp
+++ b/tests/test/tcpstr28.pp
@ -0,0 +1,23 @@
+program tcpstr28;
+
+{$codepage utf8}
+
+const
+  Test = '𝄞𤽜';
+  UTF8Test = UTF8String(Test);
+  UTF16Test = UnicodeString(Test);
+
+var
+  utf8str: UTF8String = Test;
+  utf16str: UnicodeString = Test;
+
+begin
+  if Length(UTF8Test) <> 8 then
+    Halt(1);
+  if Length(utf8str) <> 8 then
+    Halt(1);
+  if Length(UTF16Test) <> 4 then
+    Halt(1);
+  if Length(utf16str) <> 4 then
+    Halt(1);
+end.