* fixed WideStringToUCS4String and UCS4StringToWideString for code points

requiring surrogate pairs in utf-16 + test

git-svn-id: trunk@9391 -
This commit is contained in:
Jonas Maebe 2007-12-05 13:05:09 +00:00
parent 1ab5e28f75
commit 2319d8c3ce
3 changed files with 141 additions and 9 deletions

1
.gitattributes vendored
View File

@ -7299,6 +7299,7 @@ tests/test/twide1.pp svneol=native#text/plain
tests/test/twide2.pp svneol=native#text/plain
tests/test/twide3.pp svneol=native#text/plain
tests/test/twide4.pp svneol=native#text/plain
tests/test/twide5.pp svneol=native#text/plain
tests/test/twrstr1.pp svneol=native#text/plain
tests/test/twrstr2.pp svneol=native#text/plain
tests/test/twrstr3.pp svneol=native#text/plain

View File

@ -1814,24 +1814,111 @@ function Utf8ToAnsi(const s : UTF8String) : ansistring;{$ifdef SYSTEMINLINE}inli
end;
{ converts an utf-16 code point or surrogate pair to utf-32 }
function utf16toutf32(const S: WideString; const index: SizeInt; out len: longint): UCS4Char;
var
w: widechar;
begin
{ UTF-16 points in the range #$0-#$D7FF and #$E000-#$FFFF }
{ are the same in UTF-32 }
w:=s[index];
if (w<=#$d7ff) or
(w>=#$e000) then
begin
result:=UCS4Char(w);
len:=1;
end
{ valid surrogate pair? }
else if (w<=#$dbff) and
{ w>=#$d7ff check not needed, checked above }
(index<length(s)) and
(s[index+1]>=#$dc00) and
(s[index+1]<=#$dfff) then
{ convert the surrogate pair to UTF-32 }
begin
result:=(UCS4Char(w)-$d800) shl 10 + (UCS4Char(s[index+1])-$dc00) + $10000;
len:=2;
end
else
{ invalid surrogate -> do nothing }
begin
result:=UCS4Char(w);
len:=1;
end;
end;
function WideStringToUCS4String(const s : WideString) : UCS4String;
var
i : SizeInt;
i, slen,
destindex : SizeInt;
len : longint;
uch : UCS4Char;
begin
setlength(result,length(s)+1);
for i:=1 to length(s) do
result[i-1]:=UCS4Char(s[i]);
result[length(s)]:=UCS4Char(0);
slen:=length(s);
setlength(result,slen+1);
i:=1;
destindex:=0;
while (i<=slen) do
begin
result[destindex]:=utf16toutf32(s,i,len);
inc(destindex);
inc(i,len);
end;
result[destindex]:=UCS4Char(0);
{ destindex <= slen }
setlength(result,destindex);
end;
{ concatenates an utf-32 char to a widestring. S *must* be unique when entering. }
procedure ConcatUTF32ToWideStr(const nc: UCS4Char; var S: WideString; var index: SizeInt);
var
p : PWideChar;
begin
{ if nc > $ffff, we need two places }
if (index+ord(nc > $ffff)>length(s)) then
if (length(s) < 10*256) then
setlength(s,length(s)+10)
else
setlength(s,length(s)+length(s) shr 8);
{ we know that s is unique -> avoid uniquestring calls}
p:=@s[index];
if (nc<$ffff) then
begin
p^:=widechar(nc);
inc(index);
end
else if (nc<=$10ffff) then
begin
p^:=widechar((nc - $10000) shr 10 + $d800);
(p+1)^:=widechar((nc - $10000) and $3ff + $dc00);
inc(index,2);
end
else
{ invalid code point }
begin
p^:='?';
inc(index);
end;
end;
function UCS4StringToWideString(const s : UCS4String) : WideString;
var
i : SizeInt;
i, slen : SizeInt;
nc : wint_t;
resindex : SizeInt;
len : longint;
valid : boolean;
begin
setlength(result,length(s)-1);
for i:=1 to length(s)-1 do
result[i]:=WideChar(s[i-1]);
SetLength(result,length(s));
resindex:=1;
for i:=0 to high(s) do
ConcatUTF32ToWideStr(s[i],result,resindex);
{ adjust result length (may be too big due to growing }
{ for surrogate pairs) }
setlength(result,resindex-1);
end;

44
tests/test/twide5.pp Normal file
View File

@ -0,0 +1,44 @@
{$codepage utf-8}
var
ws: widestring;
us: UCS4String;
begin
// the compiler does not yet support characters which require
// a surrogate pair in utf-16
// ws:='鳣ćçŹ你';
// so write the last character directly using a utf-16 surrogate pair
ws:='鳣ćçŹ'#$d87e#$dc04;
if (length(ws)<>8) or
(ws[1]<>'é') or
(ws[2]<>'ł') or
(ws[3]<>'Ł') or
(ws[4]<>'ć') or
(ws[5]<>'ç') or
(ws[6]<>'Ź') or
(ws[7]<>#$d87e) or
(ws[8]<>#$dc04) then
halt(1);
us:=WideStringToUCS4String(ws);
if (length(us)<>7) or
(us[0]<>UCS4Char(widechar('é'))) or
(us[1]<>UCS4Char(widechar('ł'))) or
(us[2]<>UCS4Char(widechar('Ł'))) or
(us[3]<>UCS4Char(widechar('ć'))) or
(us[4]<>UCS4Char(widechar('ç'))) or
(us[5]<>UCS4Char(widechar('Ź'))) or
(us[6]<>UCS4Char($2F804)) then
halt(2);
ws:=UCS4StringToWideString(us);
if (length(ws)<>8) or
(ws[1]<>'é') or
(ws[2]<>'ł') or
(ws[3]<>'Ł') or
(ws[4]<>'ć') or
(ws[5]<>'ç') or
(ws[6]<>'Ź') or
(ws[7]<>#$d87e) or
(ws[8]<>#$dc04) then
halt(3);
end.