* implement support for 4 Byte UTF-8 codepoints that result in a surrogate pair for UTF-16

git-svn-id: trunk@36116 -
This commit is contained in:
svenbarth 2017-05-05 14:03:57 +00:00
parent af48d176ec
commit c552b2957a
3 changed files with 53 additions and 2 deletions

1
.gitattributes vendored
View File

@ -12291,6 +12291,7 @@ tests/test/tcpstr26b.pp svneol=native#text/plain
tests/test/tcpstr26c.pp svneol=native#text/plain
tests/test/tcpstr26d.pp svneol=native#text/plain
tests/test/tcpstr27.pp svneol=native#text/plain
tests/test/tcpstr28.pp svneol=native#text/pascal
tests/test/tcpstr2a.pp svneol=native#text/plain
tests/test/tcpstr3.pp svneol=native#text/plain
tests/test/tcpstr4.pp svneol=native#text/plain

View File

@ -4586,6 +4586,7 @@ type
procedure tscannerfile.readtoken(allowrecordtoken:boolean);
var
code : integer;
d : cardinal;
len,
low,high,mid : longint;
w : word;
@ -5160,9 +5161,35 @@ type
iswidestring:=true;
len:=0;
end;
{ four or more chars aren't handled }
{ four chars }
if (ord(c) and $f0)=$f0 then
message(scan_e_utf8_bigger_than_65535)
begin
{ this always represents a surrogate pair, so
read as 32-bit value and then split into
the corresponding pair of two wchars }
d:=ord(c) and $f;
readchar;
if (ord(c) and $c0)<>$80 then
message(scan_e_utf8_malformed);
d:=(d shl 6) or (ord(c) and $3f);
readchar;
if (ord(c) and $c0)<>$80 then
message(scan_e_utf8_malformed);
d:=(d shl 6) or (ord(c) and $3f);
readchar;
if (ord(c) and $c0)<>$80 then
message(scan_e_utf8_malformed);
d:=(d shl 6) or (ord(c) and $3f);
if d<$10000 then
message(scan_e_utf8_malformed);
d:=d-$10000;
{ high surrogate }
w:=$d800+(d shr 10);
concatwidestringchar(patternw,w);
{ low surrogate }
w:=$dc00+(d and $3ff);
concatwidestringchar(patternw,w);
end
{ three chars }
else if (ord(c) and $e0)=$e0 then
begin

23
tests/test/tcpstr28.pp Normal file
View File

@ -0,0 +1,23 @@
program tcpstr28;
{$codepage utf8}
const
Test = '𝄞𤽜';
UTF8Test = UTF8String(Test);
UTF16Test = UnicodeString(Test);
var
utf8str: UTF8String = Test;
utf16str: UnicodeString = Test;
begin
if Length(UTF8Test) <> 8 then
Halt(1);
if Length(utf8str) <> 8 then
Halt(1);
if Length(UTF16Test) <> 4 then
Halt(1);
if Length(utf16str) <> 4 then
Halt(1);
end.