LCL UTF-16 conversion bug: do not ignore code point values > $DFFF and <= $FFFF, unrolled by Brad Campbell

git-svn-id: trunk@16868 -
2025-08-24 19:39:17 +02:00 · 2008-10-04 12:40:46 +00:00 · 2008-10-04 12:40:46 +00:00 · 8c3ef8f770
commit 8c3ef8f770
parent b57df0675a
3 changed files with 19 additions and 11 deletions
--- a/docs/Contributors.txt
+++ b/docs/Contributors.txt
@ -20,6 +20,7 @@ Bob Wingard
 Boguslaw Brandys
 Boris Arko
 Boris Glavin
 Brad Campbell
 Chris Rorden
 Christian Iversen
 Christian Ulrich
--- a/lcl/lclproc.pas
+++ b/lcl/lclproc.pas
@ -3636,7 +3636,7 @@ function UTF16CharacterLength(p: PWideChar): integer;
 // The endianess of the machine will be taken.
 begin
  if p<>nil then begin
-    if ord(p[0])<$D800 then
+    if (ord(p[0]) < $D800) or (ord(p[0]) > $DFFF) then
      Result:=1
    else
      Result:=2;
@ -3670,7 +3670,7 @@ var
 begin
  if p<>nil then begin
    w1:=ord(p[0]);
-    if w1<$D800 then begin
+    if (w1 < $D800) or (w1 > $DFFF) then begin
      // is 1 word character
      Result:=w1;
      CharLen:=1;
@ -3695,7 +3695,10 @@ end;
 function UnicodeToUTF16(u: cardinal): widestring;
 begin
-  if u<$D800 then
+  // u should be <= $10FFFF to fit into UTF-16
  if u < $10000 then
    // Note: codepoints $D800 - $DFFF are reserved
    Result:=widechar(u)
  else
    Result:=widechar($D800+((u - $10000) shr 10))+widechar($DC00+((u - $10000) and $3ff));
@ -3847,7 +3850,7 @@ begin
          if ((B2 and %11000000) = %10000000) and ((B3 and %11000000) = %10000000) then
          begin
            W := ((B1 and %00011111) shl 12) or ((B2 and %00111111) shl 6) or (B3 and %00111111);
-            if W < $D800 then // to single wide char UTF-16 char
+            if (W < $D800) or (W > $DFFF) then // to single wide char UTF-16 char
            begin
              Dest[DestI] := WideChar(W);
              Inc(DestI);
@ -3985,7 +3988,7 @@ begin
    W1 := Word(Src[SrcI]);
    Inc(SrcI);
-    if W1 < $D800 then // single wide char UTF-16 char
+    if (W1 < $D800) or (W1 > $DFFF) then // single wide char UTF-16 char
    begin
      if W1 < $0080 then // to single byte UTF-8 char
      begin
--- a/test/lcltests/testunicode.pas
+++ b/test/lcltests/testunicode.pas
@ -20,8 +20,8 @@ type
  end; 
 const
-  Limits: Array [0..8] of Cardinal =
+  Limits: Array [0..9] of Cardinal =
-    (0, $7F, $80, $7FF, $800, $10000, $10FFFF, $1FFFFF, $D7FF);
+    (0, $7F, $80, $7FF, $800, $D7FF, $E000, $FFFF, $10000, $10FFFF);
 implementation
@ -34,9 +34,9 @@ var
  SUTF8, S1UTF8: UTF8String;
  SUTF16, S1UTF16, R: WideString;
 begin
-  for U := 0 to $1FFFFF do // test each unicode char
+  for U := 0 to $10FFFF do // test each unicode char
  begin
-    if (U >= $D800) and (U <= $FFFF) then Continue;
+    if (U >= $D800) and (U <= $DFFF) then Continue;
    SUTF8 := UnicodeToUTF8(U);
    SUTF16 := UnicodeToUTF16(U);
@ -71,9 +71,9 @@ var
  SUTF8, S1UTF8, R: UTF8String;
  SUTF16, S1UTF16: WideString;
 begin
-  for U := 0 to $1FFFFF do
+  for U := 0 to $10FFFF do
  begin
-    if (U >= $D800) and (U <= $FFFF) then Continue;
+    if (U >= $D800) and (U <= $DFFF) then Continue;
    SUTF8 := UnicodeToUTF8(U);
    SUTF16 := UnicodeToUTF16(U);
@ -107,6 +107,8 @@ var
 begin
  AssertEquals(0, UTF16CharacterToUnicode(#0, L));
  AssertEquals($D7FF, UTF16CharacterToUnicode(#$D7FF, L));
  AssertEquals($E000, UTF16CharacterToUnicode(#$E000, L));
  AssertEquals($FFFF, UTF16CharacterToUnicode(#$FFFF, L));
  AssertEquals($10000, UTF16CharacterToUnicode(#$D800#$DC00, L));
  AssertEquals($10001, UTF16CharacterToUnicode(#$D800#$DC01, L));
  AssertEquals($10FFFD, UTF16CharacterToUnicode(#$DBFF#$DFFD, L));
@ -116,6 +118,8 @@ procedure TTestUnicode.TestUnicodeToUTF16;
 begin
  AssertEquals(#0, UnicodeToUTF16(0));
  AssertEquals(#$D7FF, UnicodeToUTF16($D7FF));
  AssertEquals(#$E000, UnicodeToUTF16($E000));
  AssertEquals(#$FFFF, UnicodeToUTF16($FFFF));
  AssertEquals(#$D800#$DC00, UnicodeToUTF16($10000));
  AssertEquals(#$D800#$DC01, UnicodeToUTF16($10001));
  AssertEquals(#$DBFF#$DFFD, UnicodeToUTF16($10FFFD));