mirror of
https://gitlab.com/freepascal.org/fpc/source.git
synced 2025-04-19 20:39:25 +02:00
* fixed WideStringToUCS4String and UCS4StringToWideString for code points
requiring surrogate pairs in utf-16 + test git-svn-id: trunk@9391 -
This commit is contained in:
parent
1ab5e28f75
commit
2319d8c3ce
1
.gitattributes
vendored
1
.gitattributes
vendored
@ -7299,6 +7299,7 @@ tests/test/twide1.pp svneol=native#text/plain
|
||||
tests/test/twide2.pp svneol=native#text/plain
|
||||
tests/test/twide3.pp svneol=native#text/plain
|
||||
tests/test/twide4.pp svneol=native#text/plain
|
||||
tests/test/twide5.pp svneol=native#text/plain
|
||||
tests/test/twrstr1.pp svneol=native#text/plain
|
||||
tests/test/twrstr2.pp svneol=native#text/plain
|
||||
tests/test/twrstr3.pp svneol=native#text/plain
|
||||
|
@ -1814,24 +1814,111 @@ function Utf8ToAnsi(const s : UTF8String) : ansistring;{$ifdef SYSTEMINLINE}inli
|
||||
end;
|
||||
|
||||
|
||||
{ converts an utf-16 code point or surrogate pair to utf-32 }
|
||||
function utf16toutf32(const S: WideString; const index: SizeInt; out len: longint): UCS4Char;
|
||||
var
|
||||
w: widechar;
|
||||
begin
|
||||
{ UTF-16 points in the range #$0-#$D7FF and #$E000-#$FFFF }
|
||||
{ are the same in UTF-32 }
|
||||
w:=s[index];
|
||||
if (w<=#$d7ff) or
|
||||
(w>=#$e000) then
|
||||
begin
|
||||
result:=UCS4Char(w);
|
||||
len:=1;
|
||||
end
|
||||
{ valid surrogate pair? }
|
||||
else if (w<=#$dbff) and
|
||||
{ w>=#$d7ff check not needed, checked above }
|
||||
(index<length(s)) and
|
||||
(s[index+1]>=#$dc00) and
|
||||
(s[index+1]<=#$dfff) then
|
||||
{ convert the surrogate pair to UTF-32 }
|
||||
begin
|
||||
result:=(UCS4Char(w)-$d800) shl 10 + (UCS4Char(s[index+1])-$dc00) + $10000;
|
||||
len:=2;
|
||||
end
|
||||
else
|
||||
{ invalid surrogate -> do nothing }
|
||||
begin
|
||||
result:=UCS4Char(w);
|
||||
len:=1;
|
||||
end;
|
||||
end;
|
||||
|
||||
|
||||
function WideStringToUCS4String(const s : WideString) : UCS4String;
|
||||
var
|
||||
i : SizeInt;
|
||||
i, slen,
|
||||
destindex : SizeInt;
|
||||
len : longint;
|
||||
uch : UCS4Char;
|
||||
begin
|
||||
setlength(result,length(s)+1);
|
||||
for i:=1 to length(s) do
|
||||
result[i-1]:=UCS4Char(s[i]);
|
||||
result[length(s)]:=UCS4Char(0);
|
||||
slen:=length(s);
|
||||
setlength(result,slen+1);
|
||||
i:=1;
|
||||
destindex:=0;
|
||||
while (i<=slen) do
|
||||
begin
|
||||
result[destindex]:=utf16toutf32(s,i,len);
|
||||
inc(destindex);
|
||||
inc(i,len);
|
||||
end;
|
||||
result[destindex]:=UCS4Char(0);
|
||||
{ destindex <= slen }
|
||||
setlength(result,destindex);
|
||||
end;
|
||||
|
||||
|
||||
{ concatenates an utf-32 char to a widestring. S *must* be unique when entering. }
|
||||
procedure ConcatUTF32ToWideStr(const nc: UCS4Char; var S: WideString; var index: SizeInt);
|
||||
var
|
||||
p : PWideChar;
|
||||
begin
|
||||
{ if nc > $ffff, we need two places }
|
||||
if (index+ord(nc > $ffff)>length(s)) then
|
||||
if (length(s) < 10*256) then
|
||||
setlength(s,length(s)+10)
|
||||
else
|
||||
setlength(s,length(s)+length(s) shr 8);
|
||||
{ we know that s is unique -> avoid uniquestring calls}
|
||||
p:=@s[index];
|
||||
if (nc<$ffff) then
|
||||
begin
|
||||
p^:=widechar(nc);
|
||||
inc(index);
|
||||
end
|
||||
else if (nc<=$10ffff) then
|
||||
begin
|
||||
p^:=widechar((nc - $10000) shr 10 + $d800);
|
||||
(p+1)^:=widechar((nc - $10000) and $3ff + $dc00);
|
||||
inc(index,2);
|
||||
end
|
||||
else
|
||||
{ invalid code point }
|
||||
begin
|
||||
p^:='?';
|
||||
inc(index);
|
||||
end;
|
||||
end;
|
||||
|
||||
|
||||
function UCS4StringToWideString(const s : UCS4String) : WideString;
|
||||
var
|
||||
i : SizeInt;
|
||||
i, slen : SizeInt;
|
||||
nc : wint_t;
|
||||
resindex : SizeInt;
|
||||
len : longint;
|
||||
valid : boolean;
|
||||
begin
|
||||
setlength(result,length(s)-1);
|
||||
for i:=1 to length(s)-1 do
|
||||
result[i]:=WideChar(s[i-1]);
|
||||
SetLength(result,length(s));
|
||||
resindex:=1;
|
||||
for i:=0 to high(s) do
|
||||
ConcatUTF32ToWideStr(s[i],result,resindex);
|
||||
{ adjust result length (may be too big due to growing }
|
||||
{ for surrogate pairs) }
|
||||
setlength(result,resindex-1);
|
||||
end;
|
||||
|
||||
|
||||
|
44
tests/test/twide5.pp
Normal file
44
tests/test/twide5.pp
Normal file
@ -0,0 +1,44 @@
|
||||
{$codepage utf-8}
|
||||
|
||||
var
|
||||
ws: widestring;
|
||||
us: UCS4String;
|
||||
begin
|
||||
// the compiler does not yet support characters which require
|
||||
// a surrogate pair in utf-16
|
||||
// ws:='鳣ćçŹ你';
|
||||
// so write the last character directly using a utf-16 surrogate pair
|
||||
ws:='鳣ćçŹ'#$d87e#$dc04;
|
||||
|
||||
if (length(ws)<>8) or
|
||||
(ws[1]<>'é') or
|
||||
(ws[2]<>'ł') or
|
||||
(ws[3]<>'Ł') or
|
||||
(ws[4]<>'ć') or
|
||||
(ws[5]<>'ç') or
|
||||
(ws[6]<>'Ź') or
|
||||
(ws[7]<>#$d87e) or
|
||||
(ws[8]<>#$dc04) then
|
||||
halt(1);
|
||||
us:=WideStringToUCS4String(ws);
|
||||
if (length(us)<>7) or
|
||||
(us[0]<>UCS4Char(widechar('é'))) or
|
||||
(us[1]<>UCS4Char(widechar('ł'))) or
|
||||
(us[2]<>UCS4Char(widechar('Ł'))) or
|
||||
(us[3]<>UCS4Char(widechar('ć'))) or
|
||||
(us[4]<>UCS4Char(widechar('ç'))) or
|
||||
(us[5]<>UCS4Char(widechar('Ź'))) or
|
||||
(us[6]<>UCS4Char($2F804)) then
|
||||
halt(2);
|
||||
ws:=UCS4StringToWideString(us);
|
||||
if (length(ws)<>8) or
|
||||
(ws[1]<>'é') or
|
||||
(ws[2]<>'ł') or
|
||||
(ws[3]<>'Ł') or
|
||||
(ws[4]<>'ć') or
|
||||
(ws[5]<>'ç') or
|
||||
(ws[6]<>'Ź') or
|
||||
(ws[7]<>#$d87e) or
|
||||
(ws[8]<>#$dc04) then
|
||||
halt(3);
|
||||
end.
|
Loading…
Reference in New Issue
Block a user