diff --git a/.gitattributes b/.gitattributes index 32ab9b5e3f..89992b9db2 100644 --- a/.gitattributes +++ b/.gitattributes @@ -12291,6 +12291,7 @@ tests/test/tcpstr26b.pp svneol=native#text/plain tests/test/tcpstr26c.pp svneol=native#text/plain tests/test/tcpstr26d.pp svneol=native#text/plain tests/test/tcpstr27.pp svneol=native#text/plain +tests/test/tcpstr28.pp svneol=native#text/pascal tests/test/tcpstr2a.pp svneol=native#text/plain tests/test/tcpstr3.pp svneol=native#text/plain tests/test/tcpstr4.pp svneol=native#text/plain diff --git a/compiler/scanner.pas b/compiler/scanner.pas index be38faec24..b86b546a6c 100644 --- a/compiler/scanner.pas +++ b/compiler/scanner.pas @@ -4586,6 +4586,7 @@ type procedure tscannerfile.readtoken(allowrecordtoken:boolean); var code : integer; + d : cardinal; len, low,high,mid : longint; w : word; @@ -5160,9 +5161,35 @@ type iswidestring:=true; len:=0; end; - { four or more chars aren't handled } + { four chars } if (ord(c) and $f0)=$f0 then - message(scan_e_utf8_bigger_than_65535) + begin + { this always represents a surrogate pair, so + read as 32-bit value and then split into + the corresponding pair of two wchars } + d:=ord(c) and $f; + readchar; + if (ord(c) and $c0)<>$80 then + message(scan_e_utf8_malformed); + d:=(d shl 6) or (ord(c) and $3f); + readchar; + if (ord(c) and $c0)<>$80 then + message(scan_e_utf8_malformed); + d:=(d shl 6) or (ord(c) and $3f); + readchar; + if (ord(c) and $c0)<>$80 then + message(scan_e_utf8_malformed); + d:=(d shl 6) or (ord(c) and $3f); + if d<$10000 then + message(scan_e_utf8_malformed); + d:=d-$10000; + { high surrogate } + w:=$d800+(d shr 10); + concatwidestringchar(patternw,w); + { low surrogate } + w:=$dc00+(d and $3ff); + concatwidestringchar(patternw,w); + end { three chars } else if (ord(c) and $e0)=$e0 then begin diff --git a/tests/test/tcpstr28.pp b/tests/test/tcpstr28.pp new file mode 100644 index 0000000000..843df14055 --- /dev/null +++ b/tests/test/tcpstr28.pp @@ -0,0 +1,23 @@ +program tcpstr28; + +{$codepage utf8} + +const + Test = '𝄞𤽜'; + UTF8Test = UTF8String(Test); + UTF16Test = UnicodeString(Test); + +var + utf8str: UTF8String = Test; + utf16str: UnicodeString = Test; + +begin + if Length(UTF8Test) <> 8 then + Halt(1); + if Length(utf8str) <> 8 then + Halt(1); + if Length(UTF16Test) <> 4 then + Halt(1); + if Length(utf16str) <> 4 then + Halt(1); +end.