* Improved ucs4 to utf16 conversions: removed function call on every character and multiple memory allocations, made as much code as possible shared between WideString and UnicodeString versions.

git-svn-id: trunk@20270 -
2025-08-18 22:49:17 +02:00 · 2012-02-06 19:55:56 +00:00 · 2012-02-06 19:55:56 +00:00 · a1cb87bcf8
commit a1cb87bcf8
parent e94508d5eb
1 changed files with 31 additions and 78 deletions
--- a/rtl/inc/ustrings.inc
+++ b/rtl/inc/ustrings.inc
@ -1895,35 +1895,27 @@ function WideStringToUCS4String(const s : WideString) : UCS4String;
  end;
-{ concatenates an utf-32 char to a unicodestring. S *must* be unique when entering. }
+{ dest should point to previously allocated wide/unicodestring }
-procedure ConcatUTF32ToUnicodeStr(const nc: UCS4Char; var S: UnicodeString; var index: SizeInt);
+procedure UCS4Decode(const s: UCS4String; dest: PWideChar);
 var
-  p : PUnicodeChar;
+  i: sizeint;
  nc: UCS4Char;
 begin
-  { if nc > $ffff, we need two places }
+  for i:=0 to length(s)-2 do  { -2 because s contains explicit terminating #0 }
  if (index+ord(nc > $ffff)>length(s)) then
    if (length(s) < 10*256) then
      setlength(s,length(s)+10)
    else
      setlength(s,length(s)+length(s) shr 8);
  { we know that s is unique -> avoid uniquestring calls}
  p:=@s[index];
  if (nc<$ffff) then
    begin
-      p^:=unicodechar(nc);
+      nc:=s[i];
-      inc(index);
+      if (nc<$ffff) then
-    end
+        dest^:=widechar(nc)
-  else if (dword(nc)<=$10ffff) then
+      else if (dword(nc)<=$10ffff) then
-    begin
+        begin
-      p^:=unicodechar((nc - $10000) shr 10 + $d800);
+          dest^:=widechar(nc shr 10 + $d7c0);
-      (p+1)^:=unicodechar((nc - $10000) and $3ff + $dc00);
+          { subtracting $10000 doesn't change low 10 bits }
-      inc(index,2);
+          dest[1]:=widechar(nc and $3ff + $dc00);
-    end
+          inc(dest);
-  else
+        end
-    { invalid code point }
+      else  { invalid code point }
-    begin
+        dest^:='?';
-      p^:='?';
+      inc(dest);
      inc(index);
    end;
 end;
@ -1931,65 +1923,26 @@ end;
 function UCS4StringToUnicodeString(const s : UCS4String) : UnicodeString;
  var
    i        : SizeInt;
-    resindex : SizeInt;
+    reslen   : SizeInt;
  begin
-    { skip terminating #0 }
+    reslen:=0;
-    SetLength(result,length(s)-1);
+    for i:=0 to length(s)-2 do     { skip terminating #0 }
-    resindex:=1;
+      Inc(reslen,1+ord((s[i]>$ffff) and (s[i]<=$10ffff)));
-    for i:=0 to high(s)-1 do
+    SetLength(result,reslen);
-      ConcatUTF32ToUnicodeStr(s[i],result,resindex);
+    UCS4Decode(s,pointer(result));
    { adjust result length (may be too big due to growing }
    { for surrogate pairs)                                }
    setlength(result,resindex-1);
  end;
 { Stores the UTF-32 code point nc into the widestring S at the 1-based
   position index, encoded as UTF-16, and advances index past the code
   unit(s) written: by 1 for nc <= $ffff or for an invalid code point
   (stored as '?'), by 2 for a surrogate pair ($10000..$10ffff).
   S *must* be unique when entering: the characters are written through a
   raw pointer, without any uniquestring call.
   S is grown in steps when there is no room left, so on return it can be
   longer than index-1 characters; the caller has to trim it afterwards. }
 procedure ConcatUTF32ToWideStr(const nc: UCS4Char; var S: WideString; var index: SizeInt);
 var
  p : PWideChar;
 begin
  { if nc > $ffff, we need two places }
  if (index+ord(nc > $ffff)>length(s)) then
    if (length(s) < 10*256) then
      setlength(s,length(s)+10)
    else
      setlength(s,length(s)+length(s) shr 8);
  { we know that s is unique -> avoid uniquestring calls}
  p:=@s[index];
  { "<=" and not "<": $ffff itself still fits in a single UTF-16 code unit.
    Treating it as a surrogate pair would make nc - $10000 go out of range
    and write two places although only one was reserved above. }
  if (nc<=$ffff) then
    begin
      p^:=widechar(nc);
      inc(index);
    end
  else if (dword(nc)<=$10ffff) then
    begin
      { high surrogate: upper 10 bits of the 20-bit offset from $10000 }
      p^:=widechar((nc - $10000) shr 10 + $d800);
      { low surrogate: lower 10 bits of that offset }
      (p+1)^:=widechar((nc - $10000) and $3ff + $dc00);
      inc(index,2);
    end
  else
    { invalid code point }
    begin
      p^:='?';
      inc(index);
    end;
 end;
 function UCS4StringToWideString(const s : UCS4String) : WideString;
  var
-    i        : SizeInt;
+    i      : SizeInt;
-    resindex : SizeInt;
+    reslen : SizeInt;
  begin
-    { skip terminating #0 }
+    reslen:=0;
-    SetLength(result,length(s)-1);
+    for i:=0 to length(s)-2 do     { skip terminating #0 }
-    resindex:=1;
+      Inc(reslen,1+ord((s[i]>$ffff) and (s[i]<=$10ffff)));
-    for i:=0 to high(s)-1 do
+    SetLength(result,reslen);
-      ConcatUTF32ToWideStr(s[i],result,resindex);
+    UCS4Decode(s,pointer(result));
    { adjust result length (may be too big due to growing }
    { for surrogate pairs)                                }
    setlength(result,resindex-1);
  end;