* fixed WideStringToUCS4String and UCS4StringToWideString for code points

requiring surrogate pairs in utf-16 + test git-svn-id: trunk@9391 -
2025-04-19 20:39:25 +02:00 · 2007-12-05 13:05:09 +00:00 · 2007-12-05 13:05:09 +00:00 · 2319d8c3ce
commit 2319d8c3ce
parent 1ab5e28f75
3 changed files with 141 additions and 9 deletions
--- a/.gitattributes
+++ b/.gitattributes
@ -7299,6 +7299,7 @@ tests/test/twide1.pp svneol=native#text/plain
 tests/test/twide2.pp svneol=native#text/plain
 tests/test/twide3.pp svneol=native#text/plain
 tests/test/twide4.pp svneol=native#text/plain
+tests/test/twide5.pp svneol=native#text/plain
 tests/test/twrstr1.pp svneol=native#text/plain
 tests/test/twrstr2.pp svneol=native#text/plain
 tests/test/twrstr3.pp svneol=native#text/plain
--- a/rtl/inc/wustrings.inc
+++ b/rtl/inc/wustrings.inc
@ -1814,24 +1814,111 @@ function Utf8ToAnsi(const s : UTF8String) : ansistring;{$ifdef SYSTEMINLINE}inli
  end;


+{ converts an utf-16 code point or surrogate pair to utf-32 }
+function utf16toutf32(const S: WideString; const index: SizeInt; out len: longint): UCS4Char;
+var
+  w: widechar;
+begin
+  { UTF-16 points in the range #$0-#$D7FF and #$E000-#$FFFF }
+  { are the same in UTF-32                                  }
+  w:=s[index];
+  if (w<=#$d7ff) or
+     (w>=#$e000) then
+    begin
+      result:=UCS4Char(w);
+      len:=1;
+    end
+  { valid surrogate pair? }
+  else if (w<=#$dbff) and
+          { w>=#$d7ff check not needed, checked above }
+          (index<length(s)) and
+          (s[index+1]>=#$dc00) and
+          (s[index+1]<=#$dfff) then
+      { convert the surrogate pair to UTF-32 }
+    begin
+      result:=(UCS4Char(w)-$d800) shl 10 + (UCS4Char(s[index+1])-$dc00) + $10000;
+      len:=2;
+    end
+  else
+    { invalid surrogate -> do nothing }
+    begin
+      result:=UCS4Char(w);
+      len:=1;
+    end;
+end;
+
+
 function WideStringToUCS4String(const s : WideString) : UCS4String;
  var
-    i : SizeInt;
+    i, slen,
+    destindex : SizeInt;
+    len       : longint;
+    uch       : UCS4Char;
  begin
-    setlength(result,length(s)+1);
-    for i:=1 to length(s) do
-      result[i-1]:=UCS4Char(s[i]);
-    result[length(s)]:=UCS4Char(0);
+    slen:=length(s);
+    setlength(result,slen+1);
+    i:=1;
+    destindex:=0;
+    while (i<=slen) do
+      begin
+        result[destindex]:=utf16toutf32(s,i,len);
+        inc(destindex);
+        inc(i,len);
+      end;
+    result[destindex]:=UCS4Char(0);
+    { destindex <= slen }
+    setlength(result,destindex);
  end;


+{ concatenates an utf-32 char to a widestring. S *must* be unique when entering. }
+procedure ConcatUTF32ToWideStr(const nc: UCS4Char; var S: WideString; var index: SizeInt);
+var
+  p : PWideChar;
+begin
+  { if nc > $ffff, we need two places }
+  if (index+ord(nc > $ffff)>length(s)) then
+    if (length(s) < 10*256) then
+      setlength(s,length(s)+10)
+    else
+      setlength(s,length(s)+length(s) shr 8);
+  { we know that s is unique -> avoid uniquestring calls}
+  p:=@s[index];
+  if (nc<$ffff) then
+    begin
+      p^:=widechar(nc);
+      inc(index);
+    end
+  else if (nc<=$10ffff) then
+    begin
+      p^:=widechar((nc - $10000) shr 10 + $d800);
+      (p+1)^:=widechar((nc - $10000) and $3ff + $dc00);
+      inc(index,2);
+    end
+  else
+    { invalid code point }
+    begin
+      p^:='?';
+      inc(index);
+    end;
+end;
+
+
 function UCS4StringToWideString(const s : UCS4String) : WideString;
  var
-    i : SizeInt;
+    i, slen   : SizeInt;
+    nc        : wint_t;
+    resindex  : SizeInt;
+    len       : longint;
+    valid     : boolean;
  begin
-    setlength(result,length(s)-1);
-    for i:=1 to length(s)-1 do
-      result[i]:=WideChar(s[i-1]);
+    SetLength(result,length(s));
+    resindex:=1;
+    for i:=0 to high(s) do
+      ConcatUTF32ToWideStr(s[i],result,resindex);
+    { adjust result length (may be too big due to growing }
+    { for surrogate pairs)                                }
+    setlength(result,resindex-1);
  end;


--- a/tests/test/twide5.pp
+++ b/tests/test/twide5.pp
@ -0,0 +1,44 @@
+{$codepage utf-8}
+
+var
+  ws: widestring;
+  us: UCS4String;
+begin
+// the compiler does not yet support characters which require
+// a surrogate pair in utf-16
+//  ws:='éłŁćçŹ你';
+//  so write the last character directly using a utf-16 surrogate pair
+  ws:='éłŁćçŹ'#$d87e#$dc04;
+
+  if (length(ws)<>8) or
+     (ws[1]<>'é') or
+     (ws[2]<>'ł') or
+     (ws[3]<>'Ł') or
+     (ws[4]<>'ć') or
+     (ws[5]<>'ç') or
+     (ws[6]<>'Ź') or
+     (ws[7]<>#$d87e) or
+     (ws[8]<>#$dc04) then
+    halt(1);
+  us:=WideStringToUCS4String(ws);
+  if (length(us)<>7) or
+     (us[0]<>UCS4Char(widechar('é'))) or
+     (us[1]<>UCS4Char(widechar('ł'))) or
+     (us[2]<>UCS4Char(widechar('Ł'))) or
+     (us[3]<>UCS4Char(widechar('ć'))) or
+     (us[4]<>UCS4Char(widechar('ç'))) or
+     (us[5]<>UCS4Char(widechar('Ź'))) or
+     (us[6]<>UCS4Char($2F804)) then
+    halt(2);
+  ws:=UCS4StringToWideString(us);
+  if (length(ws)<>8) or
+     (ws[1]<>'é') or
+     (ws[2]<>'ł') or
+     (ws[3]<>'Ł') or
+     (ws[4]<>'ć') or
+     (ws[5]<>'ç') or
+     (ws[6]<>'Ź') or
+     (ws[7]<>#$d87e) or
+     (ws[8]<>#$dc04) then
+    halt(3);
+end.