* fixed UTF8ToUnicode() based on patch by JoshyFun, and also added

support for 4-character UTF-8 codepoints (mantis #11791) * fixed UnicodeToUtf8() based on patch by A. J. Miller (mantis #13075) git-svn-id: trunk@12902 -
2025-04-08 10:48:12 +02:00 · 2009-03-15 15:47:39 +00:00 · 2009-03-15 15:47:39 +00:00 · d67dbcf030
commit d67dbcf030
parent a43bda5052
5 changed files with 2373 additions and 309 deletions
--- a/.gitattributes
+++ b/.gitattributes
@ -8726,6 +8726,7 @@ tests/webtbs/tw11711.pp svneol=native#text/plain
 tests/webtbs/tw11762.pp svneol=native#text/plain
 tests/webtbs/tw11763.pp svneol=native#text/plain
 tests/webtbs/tw11786.pp svneol=native#text/plain
+tests/webtbs/tw11791.pp svneol=native#text/plain
 tests/webtbs/tw1181.pp svneol=native#text/plain
 tests/webtbs/tw11825.pp svneol=native#text/plain
 tests/webtbs/tw11846a.pp svneol=native#text/plain
@ -8788,6 +8789,7 @@ tests/webtbs/tw1299.pp svneol=native#text/plain
 tests/webtbs/tw12993.pp svneol=native#text/plain
 tests/webtbs/tw13015.pp svneol=native#text/plain
 tests/webtbs/tw13019.pp svneol=native#text/plain
+tests/webtbs/tw13075.pp svneol=native#text/plain
 tests/webtbs/tw1310.pp svneol=native#text/plain
 tests/webtbs/tw13133.pp svneol=native#text/plain
 tests/webtbs/tw1318.pp svneol=native#text/plain
--- a/rtl/inc/ustrings.inc
+++ b/rtl/inc/ustrings.inc
@ -1857,6 +1857,40 @@ end;

 {$endif CPU64}

+{ converts an utf-16 code point or surrogate pair to utf-32 }
+function utf16toutf32(const S: UnicodeString; const index: SizeInt; out len: longint): UCS4Char; [public, alias: 'FPC_UTF16TOUTF32'];
+var
+  w: unicodechar;
+begin
+  { UTF-16 points in the range #$0-#$D7FF and #$E000-#$FFFF }
+  { are the same in UTF-32                                  }
+  w:=s[index];
+  if (w<=#$d7ff) or
+     (w>=#$e000) then
+    begin
+      result:=UCS4Char(w);
+      len:=1;
+    end
+  { valid surrogate pair? }
+  else if (w<=#$dbff) and
+          { w>=#$d7ff check not needed, checked above }
+          (index<length(s)) and
+          (s[index+1]>=#$dc00) and
+          (s[index+1]<=#$dfff) then
+      { convert the surrogate pair to UTF-32 }
+    begin
+      result:=(UCS4Char(w)-$d800) shl 10 + (UCS4Char(s[index+1])-$dc00) + $10000;
+      len:=2;
+    end
+  else
+    { invalid surrogate -> do nothing }
+    begin
+      result:=UCS4Char(w);
+      len:=1;
+    end;
+end;
+
+
 function UnicodeToUtf8(Dest: PChar; Source: PUnicodeChar; MaxBytes: SizeInt): SizeInt;{$ifdef SYSTEMINLINE}inline;{$endif}
  begin
    if assigned(Source) then
@ -1870,6 +1904,8 @@ function UnicodeToUtf8(Dest: PChar; MaxDestBytes: SizeUInt; Source: PUnicodeChar
  var
    i,j : SizeUInt;
    w : word;
+    lw : longword;
+    len : longint;
  begin
    result:=0;
    if source=nil then
@ -1895,16 +1931,34 @@ function UnicodeToUtf8(Dest: PChar; MaxDestBytes: SizeUInt; Source: PUnicodeChar
                  Dest[j+1]:=char($80 or (w and $3f));
                  inc(j,2);
                end;
-              else
+              $800..$d7ff,$e000..$ffff:
                begin
-                    if j+2>=MaxDestBytes then
-                      break;
-                    Dest[j]:=char($e0 or (w shr 12));
-                    Dest[j+1]:=char($80 or ((w shr 6)and $3f));
-                    Dest[j+2]:=char($80 or (w and $3f));
-                    inc(j,3);
+                  if j+2>=MaxDestBytes then
+                    break;
+                  Dest[j]:=char($e0 or (w shr 12));
+                  Dest[j+1]:=char($80 or ((w shr 6) and $3f));
+                  Dest[j+2]:=char($80 or (w and $3f));
+                  inc(j,3);
                end;
-            end;
+              $d800..$dbff:
+                {High Surrogates}
+                begin
+                  if j+3>=MaxDestBytes then
+                    break;
+                  if (i<sourcechars-1) and
+                     (word(Source[i+1]) >= $dc00) and
+                     (word(Source[i+1]) <= $dfff) then
+                    begin
+                      lw:=longword(utf16toutf32(Source[i] + Source[i+1], 1, len));
+                      Dest[j]:=char($f0 or (lw shr 18));
+                      Dest[j+1]:=char($80 or ((lw shr 12) and $3f));
+                      Dest[j+2]:=char($80 or ((lw shr 6) and $3f));
+                      Dest[j+3]:=char($80 or (lw and $3f));
+                      inc(j,4);
+                      inc(i);
+                    end;
+                end;
+              end;
            inc(i);
          end;

@ -1922,8 +1976,18 @@ function UnicodeToUtf8(Dest: PChar; MaxDestBytes: SizeUInt; Source: PUnicodeChar
                inc(j);
              $80..$7ff:
                inc(j,2);
-              else
+              $800..$d7ff,$e000..$ffff:
                inc(j,3);
+              $d800..$dbff:
+                begin
+                  if (i<sourcechars-1) and
+                     (word(Source[i+1]) >= $dc00) and
+                     (word(Source[i+1]) <= $dfff) then
+                    begin
+                      inc(j,4);
+                      inc(i);
+                    end;
+                end;
            end;
            inc(i);
          end;
@ -1941,88 +2005,285 @@ function Utf8ToUnicode(Dest: PUnicodeChar; Source: PChar; MaxChars: SizeInt): Si
  end;


-function Utf8ToUnicode(Dest: PUnicodeChar; MaxDestChars: SizeUInt; Source: PChar; SourceBytes: SizeUInt): SizeUInt;
-
-var
-  i,j : SizeUInt;
-  w: SizeUInt;
-  b : byte;
-begin
-  if not assigned(Source) then
+function UTF8ToUnicode(Dest: PUnicodeChar; MaxDestChars: SizeUInt; Source: PChar; SourceBytes: SizeUInt): SizeUInt;
+  const
+    UNICODE_INVALID=63;
+  var
+    InputUTF8: SizeUInt;
+    IBYTE: BYTE;
+    OutputUnicode: SizeUInt;
+    PRECHAR: SizeUInt;
+    TempBYTE: BYTE;
+    CharLen: SizeUint;
+    LookAhead: SizeUInt;
+    UC: SizeUInt;
  begin
-    result:=0;
-    exit;
+    if not assigned(Source) then
+      begin
+        result:=0;
+        exit;
+      end;
+    result:=SizeUInt(-1);
+    InputUTF8:=0;
+    OutputUnicode:=0;
+    PreChar:=0;
+    if Assigned(Dest) Then
+      begin
+        while (OutputUnicode<MaxDestChars) and (InputUTF8<SourceBytes) do
+          begin
+            IBYTE:=byte(Source[InputUTF8]);
+            if (IBYTE and $80) = 0 then
+              begin
+                //One character US-ASCII, convert it to unicode
+                if IBYTE = 10 then
+                  begin
+                    If (PreChar<>13) and FALSE then
+                      begin
+                        //Expand to crlf, conform UTF-8.
+                        //This procedure will break the memory alocation by
+                        //FPC for the widestring, so never use it. Condition never true due the "and FALSE".
+                        if OutputUnicode+1<MaxDestChars then
+                          begin
+                            Dest[OutputUnicode]:=WideChar(13);
+                            inc(OutputUnicode);
+                            Dest[OutputUnicode]:=WideChar(10);
+                            inc(OutputUnicode);
+                            PreChar:=10;
+                          end
+                        else
+                          begin
+                            Dest[OutputUnicode]:=WideChar(13);
+                            inc(OutputUnicode);
+                          end;
+                      end
+                    else
+                      begin
+                        Dest[OutputUnicode]:=WideChar(IBYTE);
+                        inc(OutputUnicode);
+                        PreChar:=IBYTE;
+                      end;
+                  end
+                else
+                  begin
+                    Dest[OutputUnicode]:=WideChar(IBYTE);
+                    inc(OutputUnicode);
+                    PreChar:=IBYTE;
+                  end;
+                inc(InputUTF8);
+              end
+            else
+              begin
+                TempByte:=IBYTE;
+                CharLen:=0;
+                while (TempBYTE and $80)<>0 do
+                  begin
+                    TempBYTE:=(TempBYTE shl 1) and $FE;
+                    inc(CharLen);
+                  end;
+                //Test for the "CharLen" conforms UTF-8 string
+                //This means the 10xxxxxx pattern.
+                if SizeUInt(InputUTF8+CharLen-1)>SourceBytes then
+                  begin
+                    //Insuficient chars in string to decode
+                    //UTF-8 array. Fallback to single char.
+                    CharLen:= 1;
+                  end;
+                for LookAhead := 1 to CharLen-1 do
+                  begin
+                    if ((byte(Source[InputUTF8+LookAhead]) and $80)<>$80) or
+                       ((byte(Source[InputUTF8+LookAhead]) and $40)<>$00) then
+                      begin
+                        //Invalid UTF-8 sequence, fallback.
+                        CharLen:= LookAhead;
+                        break;
+                      end;
+                  end;
+                UC:=$FFFF;
+                case CharLen of
+                  1:  begin
+                        //Not valid UTF-8 sequence
+                        UC:=UNICODE_INVALID;
+                      end;
+                  2:  begin
+                        //Two bytes UTF, convert it
+                        UC:=(byte(Source[InputUTF8]) and $1F) shl 6;
+                        UC:=UC or (byte(Source[InputUTF8+1]) and $3F);
+                        if UC <= $7F then
+                          begin
+                            //Invalid UTF sequence.
+                            UC:=UNICODE_INVALID;
+                          end;
+                      end;
+                  3:  begin
+                        //Three bytes, convert it to unicode
+                        UC:= (byte(Source[InputUTF8]) and $0F) shl 12;
+                        UC:= UC or ((byte(Source[InputUTF8+1]) and $3F) shl 6);
+                        UC:= UC or ((byte(Source[InputUTF8+2]) and $3F));
+                        if (UC <= $7FF) or (UC >= $FFFE) or ((UC >= $D800) and (UC <= $DFFF)) then
+                          begin
+                            //Invalid UTF-8 sequence
+                            UC:= UNICODE_INVALID;
+                          End;
+                      end;
+                  4:  begin
+                        //Four bytes, convert it to two unicode characters
+                        UC:= (byte(Source[InputUTF8]) and $07) shl 18;
+                        UC:= UC or ((byte(Source[InputUTF8+1]) and $3F) shl 12);
+                        UC:= UC or ((byte(Source[InputUTF8+2]) and $3F) shl 6);
+                        UC:= UC or ((byte(Source[InputUTF8+3]) and $3F));
+                        if (UC < $10000) or (UC > $10FFFF) then
+                          begin
+                            UC:= UNICODE_INVALID;
+                          end
+                        else
+                          begin
+                            { only store pair if room }
+                            dec(UC,$10000);
+                            if (OutputUnicode<MaxDestChars-1) then
+                              begin
+                                Dest[OutputUnicode]:=WideChar(UC shr 10 + $D800);
+                                inc(OutputUnicode);
+                                UC:=(UC and $3ff) + $DC00;
+                              end
+                            else
+                              begin
+                                InputUTF8:= InputUTF8 + CharLen;
+                                { don't store anything }
+                                CharLen:=0;
+                              end;
+                          end;
+                      end;
+                  5,6,7:  begin
+                            //Invalid UTF8 to unicode conversion,
+                            //mask it as invalid UNICODE too.
+                            UC:=UNICODE_INVALID;
+                          end;
+                end;
+                if CharLen > 0 then
+                  begin
+                    PreChar:=UC;
+                    Dest[OutputUnicode]:=WideChar(UC);
+                    inc(OutputUnicode);
+                  end;
+                InputUTF8:= InputUTF8 + CharLen;
+              end;
+          end;
+        Result:=OutputUnicode+1;
+      end
+    else
+      begin
+        while (InputUTF8<SourceBytes) do
+          begin
+            IBYTE:=byte(Source[InputUTF8]);
+            if (IBYTE and $80) = 0 then
+              begin
+                //One character US-ASCII, convert it to unicode
+                if IBYTE = 10 then
+                  begin
+                    if (PreChar<>13) and FALSE then
+                      begin
+                        //Expand to crlf, conform UTF-8.
+                        //This procedure will break the memory alocation by
+                        //FPC for the widestring, so never use it. Condition never true due the "and FALSE".
+                        inc(OutputUnicode,2);
+                        PreChar:=10;
+                      end
+                    else
+                      begin
+                        inc(OutputUnicode);
+                        PreChar:=IBYTE;
+                      end;
+                  end
+                else
+                  begin
+                    inc(OutputUnicode);
+                    PreChar:=IBYTE;
+                  end;
+                inc(InputUTF8);
+              end
+            else
+              begin
+                TempByte:=IBYTE;
+                CharLen:=0;
+                while (TempBYTE and $80)<>0 do
+                  begin
+                    TempBYTE:=(TempBYTE shl 1) and $FE;
+                    inc(CharLen);
+                  end;
+                //Test for the "CharLen" conforms UTF-8 string
+                //This means the 10xxxxxx pattern.
+                if SizeUInt(InputUTF8+CharLen-1)>SourceBytes then
+                  begin
+                    //Insuficient chars in string to decode
+                    //UTF-8 array. Fallback to single char.
+                    CharLen:= 1;
+                  end;
+                for LookAhead := 1 to CharLen-1 do
+                  begin
+                    if ((byte(Source[InputUTF8+LookAhead]) and $80)<>$80) or
+                       ((byte(Source[InputUTF8+LookAhead]) and $40)<>$00) then
+                      begin
+                        //Invalid UTF-8 sequence, fallback.
+                        CharLen:= LookAhead;
+                        break;
+                      end;
+                  end;
+                UC:=$FFFF;
+                case CharLen of
+                  1:  begin
+                        //Not valid UTF-8 sequence
+                        UC:=UNICODE_INVALID;
+                      end;
+                  2:  begin
+                        //Two bytes UTF, convert it
+                        UC:=(byte(Source[InputUTF8]) and $1F) shl 6;
+                        UC:=UC or (byte(Source[InputUTF8+1]) and $3F);
+                        if UC <= $7F then
+                          begin
+                            //Invalid UTF sequence.
+                            UC:=UNICODE_INVALID;
+                          end;
+                      end;
+                  3:  begin
+                        //Three bytes, convert it to unicode
+                        UC:= (byte(Source[InputUTF8]) and $0F) shl 12;
+                        UC:= UC or ((byte(Source[InputUTF8+1]) and $3F) shl 6);
+                        UC:= UC or ((byte(Source[InputUTF8+2]) and $3F));
+                        If (UC <= $7FF) or (UC >= $FFFE) or ((UC >= $D800) and (UC <= $DFFF)) then
+                          begin
+                            //Invalid UTF-8 sequence
+                            UC:= UNICODE_INVALID;
+                          end;
+                      end;
+                  4:  begin
+                        //Four bytes, convert it to two unicode characters
+                        UC:= (byte(Source[InputUTF8]) and $07) shl 18;
+                        UC:= UC or ((byte(Source[InputUTF8+1]) and $3F) shl 12);
+                        UC:= UC or ((byte(Source[InputUTF8+2]) and $3F) shl 6);
+                        UC:= UC or ((byte(Source[InputUTF8+3]) and $3F));
+                        if (UC < $10000) or (UC > $10FFFF) then
+                          UC:= UNICODE_INVALID
+                        else
+                          { extra character character }
+                          inc(OutputUnicode);
+                      end;
+                  5,6,7:  begin
+                            //Invalid UTF8 to unicode conversion,
+                            //mask it as invalid UNICODE too.
+                            UC:=UNICODE_INVALID;
+                          end;
+                end;
+                if CharLen > 0 then
+                  begin
+                    PreChar:=UC;
+                    inc(OutputUnicode);
+                  end;
+                InputUTF8:= InputUTF8 + CharLen;
+              end;
+          end;
+        Result:=OutputUnicode+1;
+      end;
  end;
-  result:=SizeUInt(-1);
-  i:=0;
-  j:=0;
-  if assigned(Dest) then
-    begin
-      while (j<MaxDestChars) and (i<SourceBytes) do
-        begin
-          b:=byte(Source[i]);
-          w:=b;
-          inc(i);
-          // 2 or 3 bytes?
-          if b>=$80 then
-            begin
-              w:=b and $3f;
-              if i>=SourceBytes then
-                exit;
-              // 3 bytes?
-              if (b and $20)<>0 then
-                begin
-                  b:=byte(Source[i]);
-                  inc(i);
-                  if i>=SourceBytes then
-                    exit;
-                  if (b and $c0)<>$80 then
-                    exit;
-                  w:=(w shl 6) or (b and $3f);
-                end;
-              b:=byte(Source[i]);
-              w:=(w shl 6) or (b and $3f);
-              if (b and $c0)<>$80 then
-                exit;
-              inc(i);
-            end;
-          Dest[j]:=UnicodeChar(w);
-          inc(j);
-        end;
-      if j>=MaxDestChars then j:=MaxDestChars-1;
-      Dest[j]:=#0;
-    end
-  else
-    begin
-      while i<SourceBytes do
-        begin
-          b:=byte(Source[i]);
-          inc(i);
-          // 2 or 3 bytes?
-          if b>=$80 then
-            begin
-              if i>=SourceBytes then
-                exit;
-              // 3 bytes?
-              b := b and $3f;
-              if (b and $20)<>0 then
-                begin
-                  b:=byte(Source[i]);
-                  inc(i);
-                  if i>=SourceBytes then
-                    exit;
-                  if (b and $c0)<>$80 then
-                    exit;
-                end;
-              if (byte(Source[i]) and $c0)<>$80 then
-                exit;
-              inc(i);
-            end;
-          inc(j);
-        end;
-    end;
-  result:=j+1;
-end;


 function UTF8Encode(const s : Ansistring) : UTF8String; inline;
@ -2079,40 +2340,6 @@ function Utf8ToAnsi(const s : UTF8String) : ansistring;{$ifdef SYSTEMINLINE}inli
  end;


-{ converts an utf-16 code point or surrogate pair to utf-32 }
-function utf16toutf32(const S: UnicodeString; const index: SizeInt; out len: longint): UCS4Char; [public, alias: 'FPC_UTF16TOUTF32'];
-var
-  w: unicodechar;
-begin
-  { UTF-16 points in the range #$0-#$D7FF and #$E000-#$FFFF }
-  { are the same in UTF-32                                  }
-  w:=s[index];
-  if (w<=#$d7ff) or
-     (w>=#$e000) then
-    begin
-      result:=UCS4Char(w);
-      len:=1;
-    end
-  { valid surrogate pair? }
-  else if (w<=#$dbff) and
-          { w>=#$d7ff check not needed, checked above }
-          (index<length(s)) and
-          (s[index+1]>=#$dc00) and
-          (s[index+1]<=#$dfff) then
-      { convert the surrogate pair to UTF-32 }
-    begin
-      result:=(UCS4Char(w)-$d800) shl 10 + (UCS4Char(s[index+1])-$dc00) + $10000;
-      len:=2;
-    end
-  else
-    { invalid surrogate -> do nothing }
-    begin
-      result:=UCS4Char(w);
-      len:=1;
-    end;
-end;
-
-
 function UnicodeStringToUCS4String(const s : UnicodeString) : UCS4String;
  var
    i, slen,
--- a/rtl/inc/wstrings.inc
+++ b/rtl/inc/wstrings.inc
@ -1227,192 +1227,6 @@ end;

 {$endif CPU64}

-function UnicodeToUtf8(Dest: PChar; Source: PWideChar; MaxBytes: SizeInt): SizeInt;{$ifdef SYSTEMINLINE}inline;{$endif}
-  begin
-    if assigned(Source) then
-      Result:=UnicodeToUtf8(Dest,MaxBytes,Source,IndexWord(Source^,-1,0))
-    else
-      Result:=0;
-  end;
-
-
-function UnicodeToUtf8(Dest: PChar; MaxDestBytes: SizeUInt; Source: PWideChar; SourceChars: SizeUInt): SizeUInt;
-  var
-    i,j : SizeUInt;
-    w : word;
-  begin
-    result:=0;
-    if source=nil then
-      exit;
-    i:=0;
-    j:=0;
-    if assigned(Dest) then
-      begin
-        while (i<SourceChars) and (j<MaxDestBytes) do
-          begin
-            w:=word(Source[i]);
-            case w of
-              0..$7f:
-                begin
-                  Dest[j]:=char(w);
-                  inc(j);
-                end;
-              $80..$7ff:
-                begin
-                  if j+1>=MaxDestBytes then
-                    break;
-                  Dest[j]:=char($c0 or (w shr 6));
-                  Dest[j+1]:=char($80 or (w and $3f));
-                  inc(j,2);
-                end;
-              else
-                begin
-                    if j+2>=MaxDestBytes then
-                      break;
-                    Dest[j]:=char($e0 or (w shr 12));
-                    Dest[j+1]:=char($80 or ((w shr 6)and $3f));
-                    Dest[j+2]:=char($80 or (w and $3f));
-                    inc(j,3);
-                end;
-            end;
-            inc(i);
-          end;
-
-        if j>SizeUInt(MaxDestBytes-1) then
-          j:=MaxDestBytes-1;
-
-        Dest[j]:=#0;
-      end
-    else
-      begin
-        while i<SourceChars do
-          begin
-            case word(Source[i]) of
-              $0..$7f:
-                inc(j);
-              $80..$7ff:
-                inc(j,2);
-              else
-                inc(j,3);
-            end;
-            inc(i);
-          end;
-      end;
-    result:=j+1;
-  end;
-
-
-function Utf8ToUnicode(Dest: PWideChar; Source: PChar; MaxChars: SizeInt): SizeInt;{$ifdef SYSTEMINLINE}inline;{$endif}
-  begin
-    if assigned(Source) then
-      Result:=Utf8ToUnicode(Dest,MaxChars,Source,strlen(Source))
-    else
-      Result:=0;
-  end;
-
-
-function Utf8ToUnicode(Dest: PWideChar; MaxDestChars: SizeUInt; Source: PChar; SourceBytes: SizeUInt): SizeUInt;
-
-var
-  i,j : SizeUInt;
-  w: SizeUInt;
-  b : byte;
-begin
-  if not assigned(Source) then
-  begin
-    result:=0;
-    exit;
-  end;
-  result:=SizeUInt(-1);
-  i:=0;
-  j:=0;
-  if assigned(Dest) then
-    begin
-      while (j<MaxDestChars) and (i<SourceBytes) do
-        begin
-          b:=byte(Source[i]);
-          w:=b;
-          inc(i);
-          // 2 or 3 bytes?
-          if b>=$80 then
-            begin
-              w:=b and $3f;
-              if i>=SourceBytes then
-                exit;
-              // 3 bytes?
-              if (b and $20)<>0 then
-                begin
-                  b:=byte(Source[i]);
-                  inc(i);
-                  if i>=SourceBytes then
-                    exit;
-                  if (b and $c0)<>$80 then
-                    exit;
-                  w:=(w shl 6) or (b and $3f);
-                end;
-              b:=byte(Source[i]);
-              w:=(w shl 6) or (b and $3f);
-              if (b and $c0)<>$80 then
-                exit;
-              inc(i);
-            end;
-          Dest[j]:=WideChar(w);
-          inc(j);
-        end;
-      if j>=MaxDestChars then j:=MaxDestChars-1;
-      Dest[j]:=#0;
-    end
-  else
-    begin
-      while i<SourceBytes do
-        begin
-          b:=byte(Source[i]);
-          inc(i);
-          // 2 or 3 bytes?
-          if b>=$80 then
-            begin
-              if i>=SourceBytes then
-                exit;
-              // 3 bytes?
-              b := b and $3f;
-              if (b and $20)<>0 then
-                begin
-                  b:=byte(Source[i]);
-                  inc(i);
-                  if i>=SourceBytes then
-                    exit;
-                  if (b and $c0)<>$80 then
-                    exit;
-                end;
-              if (byte(Source[i]) and $c0)<>$80 then
-                exit;
-              inc(i);
-            end;
-          inc(j);
-        end;
-    end;
-  result:=j+1;
-end;
-
-
-function UTF8Encode(const s : WideString) : UTF8String;
-  var
-    i : SizeInt;
-    hs : UTF8String;
-  begin
-    result:='';
-    if s='' then
-      exit;
-    SetLength(hs,length(s)*3);
-    i:=UnicodeToUtf8(pchar(hs),length(hs)+1,PWideChar(s),length(s));
-    if i>0 then
-      begin
-        SetLength(hs,i-1);
-        result:=hs;
-      end;
-  end;
-
-
 { converts an utf-16 code point or surrogate pair to utf-32 }
 function utf16toutf32(const S: WideString; const index: SizeInt; out len: longint): UCS4Char; [public, alias: 'FPC_WIDETOUTF32'];
 var
@ -1447,6 +1261,420 @@ begin
 end;


+function UnicodeToUtf8(Dest: PChar; Source: PWideChar; MaxBytes: SizeInt): SizeInt;{$ifdef SYSTEMINLINE}inline;{$endif}
+  begin
+    if assigned(Source) then
+      Result:=UnicodeToUtf8(Dest,MaxBytes,Source,IndexWord(Source^,-1,0))
+    else
+      Result:=0;
+  end;
+
+
+function UnicodeToUtf8(Dest: PChar; MaxDestBytes: SizeUInt; Source: PWideChar; SourceChars: SizeUInt): SizeUInt;
+  var
+    i,j : SizeUInt;
+    w : word;
+    lw : longword;
+    len : longint;
+  begin
+    result:=0;
+    if source=nil then
+      exit;
+    i:=0;
+    j:=0;
+    if assigned(Dest) then
+      begin
+        while (i<SourceChars) and (j<MaxDestBytes) do
+          begin
+            w:=word(Source[i]);
+            case w of
+              0..$7f:
+                begin
+                  Dest[j]:=char(w);
+                  inc(j);
+                end;
+              $80..$7ff:
+                begin
+                  if j+1>=MaxDestBytes then
+                    break;
+                  Dest[j]:=char($c0 or (w shr 6));
+                  Dest[j+1]:=char($80 or (w and $3f));
+                  inc(j,2);
+                end;
+              $800..$d7ff,$e000..$ffff:
+                begin
+                  if j+2>=MaxDestBytes then
+                    break;
+                  Dest[j]:=char($e0 or (w shr 12));
+                  Dest[j+1]:=char($80 or ((w shr 6) and $3f));
+                  Dest[j+2]:=char($80 or (w and $3f));
+                  inc(j,3);
+                end;
+              $d800..$dbff:
+                {High Surrogates}
+                begin
+                  if j+3>=MaxDestBytes then
+                    break;
+                  if (i<sourcechars-1) and
+                     (word(Source[i+1]) >= $dc00) and
+                     (word(Source[i+1]) <= $dfff) then
+                    begin
+                      lw:=longword(utf16toutf32(Source[i] + Source[i+1], 1, len));
+                      Dest[j]:=char($f0 or (lw shr 18));
+                      Dest[j+1]:=char($80 or ((lw shr 12) and $3f));
+                      Dest[j+2]:=char($80 or ((lw shr 6) and $3f));
+                      Dest[j+3]:=char($80 or (lw and $3f));
+                      inc(j,4);
+                      inc(i);
+                    end;
+                end;
+              end;
+            inc(i);
+          end;
+
+        if j>SizeUInt(MaxDestBytes-1) then
+          j:=MaxDestBytes-1;
+
+        Dest[j]:=#0;
+      end
+    else
+      begin
+        while i<SourceChars do
+          begin
+            case word(Source[i]) of
+              $0..$7f:
+                inc(j);
+              $80..$7ff:
+                inc(j,2);
+              $800..$d7ff,$e000..$ffff:
+                inc(j,3);
+              $d800..$dbff:
+                begin
+                  if (i<sourcechars-1) and
+                     (word(Source[i+1]) >= $dc00) and
+                     (word(Source[i+1]) <= $dfff) then
+                    begin
+                      inc(j,4);
+                      inc(i);
+                    end;
+                end;
+            end;
+            inc(i);
+          end;
+      end;
+    result:=j+1;
+  end;
+
+
+function Utf8ToUnicode(Dest: PWideChar; Source: PChar; MaxChars: SizeInt): SizeInt;{$ifdef SYSTEMINLINE}inline;{$endif}
+  begin
+    if assigned(Source) then
+      Result:=Utf8ToUnicode(Dest,MaxChars,Source,strlen(Source))
+    else
+      Result:=0;
+  end;
+
+
+
+function UTF8ToUnicode(Dest: PWideChar; MaxDestChars: SizeUInt; Source: PChar; SourceBytes: SizeUInt): SizeUInt;
+  const
+    UNICODE_INVALID=63;
+  var
+    InputUTF8: SizeUInt;
+    IBYTE: BYTE;
+    OutputUnicode: SizeUInt;
+    PRECHAR: SizeUInt;
+    TempBYTE: BYTE;
+    CharLen: SizeUint;
+    LookAhead: SizeUInt;
+    UC: SizeUInt;
+  begin
+    if not assigned(Source) then
+      begin
+        result:=0;
+        exit;
+      end;
+    result:=SizeUInt(-1);
+    InputUTF8:=0;
+    OutputUnicode:=0;
+    PreChar:=0;
+    if Assigned(Dest) Then
+      begin
+        while (OutputUnicode<MaxDestChars) and (InputUTF8<SourceBytes) do
+          begin
+            IBYTE:=byte(Source[InputUTF8]);
+            if (IBYTE and $80) = 0 then
+              begin
+                //One character US-ASCII, convert it to unicode
+                if IBYTE = 10 then
+                  begin
+                    If (PreChar<>13) and FALSE then
+                      begin
+                        //Expand to crlf, conform UTF-8.
+                        //This procedure will break the memory alocation by
+                        //FPC for the widestring, so never use it. Condition never true due the "and FALSE".
+                        if OutputUnicode+1<MaxDestChars then
+                          begin
+                            Dest[OutputUnicode]:=WideChar(13);
+                            inc(OutputUnicode);
+                            Dest[OutputUnicode]:=WideChar(10);
+                            inc(OutputUnicode);
+                            PreChar:=10;
+                          end
+                        else
+                          begin
+                            Dest[OutputUnicode]:=WideChar(13);
+                            inc(OutputUnicode);
+                          end;
+                      end
+                    else
+                      begin
+                        Dest[OutputUnicode]:=WideChar(IBYTE);
+                        inc(OutputUnicode);
+                        PreChar:=IBYTE;
+                      end;
+                  end
+                else
+                  begin
+                    Dest[OutputUnicode]:=WideChar(IBYTE);
+                    inc(OutputUnicode);
+                    PreChar:=IBYTE;
+                  end;
+                inc(InputUTF8);
+              end
+            else
+              begin
+                TempByte:=IBYTE;
+                CharLen:=0;
+                while (TempBYTE and $80)<>0 do
+                  begin
+                    TempBYTE:=(TempBYTE shl 1) and $FE;
+                    inc(CharLen);
+                  end;
+                //Test for the "CharLen" conforms UTF-8 string
+                //This means the 10xxxxxx pattern.
+                if SizeUInt(InputUTF8+CharLen-1)>SourceBytes then
+                  begin
+                    //Insuficient chars in string to decode
+                    //UTF-8 array. Fallback to single char.
+                    CharLen:= 1;
+                  end;
+                for LookAhead := 1 to CharLen-1 do
+                  begin
+                    if ((byte(Source[InputUTF8+LookAhead]) and $80)<>$80) or
+                       ((byte(Source[InputUTF8+LookAhead]) and $40)<>$00) then
+                      begin
+                        //Invalid UTF-8 sequence, fallback.
+                        CharLen:= LookAhead;
+                        break;
+                      end;
+                  end;
+                UC:=$FFFF;
+                case CharLen of
+                  1:  begin
+                        //Not valid UTF-8 sequence
+                        UC:=UNICODE_INVALID;
+                      end;
+                  2:  begin
+                        //Two bytes UTF, convert it
+                        UC:=(byte(Source[InputUTF8]) and $1F) shl 6;
+                        UC:=UC or (byte(Source[InputUTF8+1]) and $3F);
+                        if UC <= $7F then
+                          begin
+                            //Invalid UTF sequence.
+                            UC:=UNICODE_INVALID;
+                          end;
+                      end;
+                  3:  begin
+                        //Three bytes, convert it to unicode
+                        UC:= (byte(Source[InputUTF8]) and $0F) shl 12;
+                        UC:= UC or ((byte(Source[InputUTF8+1]) and $3F) shl 6);
+                        UC:= UC or ((byte(Source[InputUTF8+2]) and $3F));
+                        if (UC <= $7FF) or (UC >= $FFFE) or ((UC >= $D800) and (UC <= $DFFF)) then
+                          begin
+                            //Invalid UTF-8 sequence
+                            UC:= UNICODE_INVALID;
+                          End;
+                      end;
+                  4:  begin
+                        //Four bytes, convert it to two unicode characters
+                        UC:= (byte(Source[InputUTF8]) and $07) shl 18;
+                        UC:= UC or ((byte(Source[InputUTF8+1]) and $3F) shl 12);
+                        UC:= UC or ((byte(Source[InputUTF8+2]) and $3F) shl 6);
+                        UC:= UC or ((byte(Source[InputUTF8+3]) and $3F));
+                        if (UC < $10000) or (UC > $10FFFF) then
+                          begin
+                            UC:= UNICODE_INVALID;
+                          end
+                        else
+                          begin
+                            { only store pair if room }
+                            dec(UC,$10000);
+                            if (OutputUnicode<MaxDestChars-1) then
+                              begin
+                                Dest[OutputUnicode]:=WideChar(UC shr 10 + $D800);
+                                inc(OutputUnicode);
+                                UC:=(UC and $3ff) + $DC00;
+                              end
+                            else
+                              begin
+                                InputUTF8:= InputUTF8 + CharLen;
+                                { don't store anything }
+                                CharLen:=0;
+                              end;
+                          end;
+                      end;
+                  5,6,7:  begin
+                            //Invalid UTF8 to unicode conversion,
+                            //mask it as invalid UNICODE too.
+                            UC:=UNICODE_INVALID;
+                          end;
+                end;
+                if CharLen > 0 then
+                  begin
+                    PreChar:=UC;
+                    Dest[OutputUnicode]:=WideChar(UC);
+                    inc(OutputUnicode);
+                  end;
+                InputUTF8:= InputUTF8 + CharLen;
+              end;
+          end;
+        Result:=OutputUnicode+1;
+      end
+    else
+      begin
+        while (InputUTF8<SourceBytes) do
+          begin
+            IBYTE:=byte(Source[InputUTF8]);
+            if (IBYTE and $80) = 0 then
+              begin
+                //One character US-ASCII, convert it to unicode
+                if IBYTE = 10 then
+                  begin
+                    if (PreChar<>13) and FALSE then
+                      begin
+                        //Expand to crlf, conform UTF-8.
+                        //This procedure will break the memory alocation by
+                        //FPC for the widestring, so never use it. Condition never true due the "and FALSE".
+                        inc(OutputUnicode,2);
+                        PreChar:=10;
+                      end
+                    else
+                      begin
+                        inc(OutputUnicode);
+                        PreChar:=IBYTE;
+                      end;
+                  end
+                else
+                  begin
+                    inc(OutputUnicode);
+                    PreChar:=IBYTE;
+                  end;
+                inc(InputUTF8);
+              end
+            else
+              begin
+                TempByte:=IBYTE;
+                CharLen:=0;
+                while (TempBYTE and $80)<>0 do
+                  begin
+                    TempBYTE:=(TempBYTE shl 1) and $FE;
+                    inc(CharLen);
+                  end;
+                //Test for the "CharLen" conforms UTF-8 string
+                //This means the 10xxxxxx pattern.
+                if SizeUInt(InputUTF8+CharLen-1)>SourceBytes then
+                  begin
+                    //Insuficient chars in string to decode
+                    //UTF-8 array. Fallback to single char.
+                    CharLen:= 1;
+                  end;
+                for LookAhead := 1 to CharLen-1 do
+                  begin
+                    if ((byte(Source[InputUTF8+LookAhead]) and $80)<>$80) or
+                       ((byte(Source[InputUTF8+LookAhead]) and $40)<>$00) then
+                      begin
+                        //Invalid UTF-8 sequence, fallback.
+                        CharLen:= LookAhead;
+                        break;
+                      end;
+                  end;
+                UC:=$FFFF;
+                case CharLen of
+                  1:  begin
+                        //Not valid UTF-8 sequence
+                        UC:=UNICODE_INVALID;
+                      end;
+                  2:  begin
+                        //Two bytes UTF, convert it
+                        UC:=(byte(Source[InputUTF8]) and $1F) shl 6;
+                        UC:=UC or (byte(Source[InputUTF8+1]) and $3F);
+                        if UC <= $7F then
+                          begin
+                            //Invalid UTF sequence.
+                            UC:=UNICODE_INVALID;
+                          end;
+                      end;
+                  3:  begin
+                        //Three bytes, convert it to unicode
+                        UC:= (byte(Source[InputUTF8]) and $0F) shl 12;
+                        UC:= UC or ((byte(Source[InputUTF8+1]) and $3F) shl 6);
+                        UC:= UC or ((byte(Source[InputUTF8+2]) and $3F));
+                        If (UC <= $7FF) or (UC >= $FFFE) or ((UC >= $D800) and (UC <= $DFFF)) then
+                          begin
+                            //Invalid UTF-8 sequence
+                            UC:= UNICODE_INVALID;
+                          end;
+                      end;
+                  4:  begin
+                        //Four bytes, convert it to two unicode characters
+                        UC:= (byte(Source[InputUTF8]) and $07) shl 18;
+                        UC:= UC or ((byte(Source[InputUTF8+1]) and $3F) shl 12);
+                        UC:= UC or ((byte(Source[InputUTF8+2]) and $3F) shl 6);
+                        UC:= UC or ((byte(Source[InputUTF8+3]) and $3F));
+                        if (UC < $10000) or (UC > $10FFFF) then
+                          UC:= UNICODE_INVALID
+                        else
+                          { extra character character }
+                          inc(OutputUnicode);
+                      end;
+                  5,6,7:  begin
+                            //Invalid UTF8 to unicode conversion,
+                            //mask it as invalid UNICODE too.
+                            UC:=UNICODE_INVALID;
+                          end;
+                end;
+                if CharLen > 0 then
+                  begin
+                    PreChar:=UC;
+                    inc(OutputUnicode);
+                  end;
+                InputUTF8:= InputUTF8 + CharLen;
+              end;
+          end;
+        Result:=OutputUnicode+1;
+      end;
+  end;
+
+
+function UTF8Encode(const s : WideString) : UTF8String;
+  var
+    i : SizeInt;
+    hs : UTF8String;
+  begin
+    result:='';
+    if s='' then
+      exit;
+    SetLength(hs,length(s)*3);
+    i:=UnicodeToUtf8(pchar(hs),length(hs)+1,PWideChar(s),length(s));
+    if i>0 then
+      begin
+        SetLength(hs,i-1);
+        result:=hs;
+      end;
+  end;
+
+
 const
  SNoWidestrings = 'This binary has no widestrings support compiled in.';
  SRecompileWithWidestrings = 'Recompile the application with a widestrings-manager in the program uses clause.';
--- a/tests/webtbs/tw11791.pp
+++ b/tests/webtbs/tw11791.pp
--- a/tests/webtbs/tw13075.pp
+++ b/tests/webtbs/tw13075.pp
@ -0,0 +1,172 @@
+{$mode objfpc}{$H+}
+
+
+uses
+{$ifdef unix}
+  cwstring,
+{$endif}
+  Classes, SysUtils;
+
+function localUnicodeToUTF8(u: cardinal; Buf: PChar): integer;
+
+  procedure RaiseInvalidUnicode;
+  begin
+    raise Exception.Create('UnicodeToUTF8: invalid unicode: '+IntToStr(u));
+  end;
+
+begin
+  case u of
+    0..$7f:
+      begin
+        Result:=1;
+        Buf[0]:=char(byte(u));
+      end;
+    $80..$7ff:
+      begin
+        Result:=2;
+        Buf[0]:=char(byte($c0 or (u shr 6)));
+        Buf[1]:=char(byte($80 or (u and $3f)));
+      end;
+    $800..$ffff:
+      begin
+        Result:=3;
+        Buf[0]:=char(byte($e0 or (u shr 12)));
+        Buf[1]:=char(byte((u shr 6) and $3f) or $80);
+        Buf[2]:=char(byte(u and $3f) or $80);
+      end;
+    $10000..$10ffff:
+      begin
+        Result:=4;
+        Buf[0]:=char(byte($f0 or (u shr 18)));
+        Buf[1]:=char(byte((u shr 12) and $3f) or $80);
+        Buf[2]:=char(byte((u shr 6) and $3f) or $80);
+        Buf[3]:=char(byte(u and $3f) or $80);
+      end;
+  else
+    RaiseInvalidUnicode;
+  end;
+end;
+
+function localUnicodeToUTF8(u: cardinal): shortstring;
+begin
+  Result[0]:=chr(localUnicodeToUTF8(u,@Result[1]));
+end;
+
+
+function localUnicodeToUTF16(u: cardinal): widestring;
+begin
+  // u should be <= $10FFFF to fit into UTF-16
+
+  if u < $10000 then
+    // Note: codepoints $D800 - $DFFF are reserved
+    Result:=widechar(u)
+  else
+    Result:=widechar($D800+((u - $10000) shr 10))+widechar($DC00+((u - $10000) and $3ff));
+end;
+
+
+function UnicodeToCESU8(u: cardinal; Buf: PChar): integer;
+
+  procedure RaiseInvalidUnicode;
+  begin
+    raise Exception.Create('UnicodeToCESU8: invalid unicode: '+IntToStr(u));
+  end;
+
+var
+  st: widestring;
+begin
+  case u of
+    0..$ffff:
+      begin
+        Result:=localUnicodeToUTF8(u,Buf);
+      end;
+    $10000..$10ffff:
+      begin
+        st := localUnicodeToUTF16(u);
+
+        Result:=6;
+        Buf[0]:=char(byte($e0 or (ord(st[1]) shr 12)));
+        Buf[1]:=char(byte((ord(st[1]) shr 6) and $3f) or $80);
+        Buf[2]:=char(byte(ord(st[1]) and $3f) or $80);
+        Buf[3]:=char(byte($e0 or (ord(st[2]) shr 12)));
+        Buf[4]:=char(byte((ord(st[2]) shr 6) and $3f) or $80);
+        Buf[5]:=char(byte(ord(st[2]) and $3f) or $80);
+     end;
+  else
+    RaiseInvalidUnicode;
+  end;
+end;
+
+function UnicodeToCESU8(u: cardinal): utf8string;
+begin
+  setlength(result,1000);
+  setlength(result,UnicodeToCESU8(u,@Result[1]));
+end;
+
+procedure dotest;
+var
+  s1,s2: utf8string;
+  w1,w2: unicodestring;
+  s3,s4: utf8string;
+  i: longint;
+begin
+  s1 := localUnicodeToUTF8 ($10300);
+  s2 := UnicodeToCESU8 ($10300);
+  setlength(w1,20);
+  setlength(w2,20);
+  // -1 because UTF8ToUnicode returns a null-terminated string
+  setlength(w1,UTF8ToUnicode(punicodechar(@w1[1]),length(w1),pchar(s1),Length(s1))-1);
+  setlength(w2,UTF8ToUnicode(punicodechar(@w2[1]),length(w2),pchar(s2),Length(s2))-1);
+(*
+  writeln('len: ',length(w1),' - "',w1,'"');
+  write('  ');
+  for i:= 1 to length(w1) do
+    write('#$',hexstr(ord(w1[i]),4));
+  writeln;
+  writeln('len: ',length(w2),' - "',w2,'"');
+  write('  ');
+  for i:= 1 to length(w2) do
+    write('#$',hexstr(ord(w2[i]),4));
+  writeln;
+  writeln;
+*)
+  
+  setlength(s3,20);
+  setlength(s4,20);
+  // -1 because UnicodeToUTF8 returns a null-terminated string
+  setlength(s3,UnicodeToUTF8(@s3[1],length(s3),punicodechar(@w1[1]),length(w1))-1);
+  setlength(s4,UnicodeToUTF8(@s4[1],length(s4),punicodechar(@w2[1]),length(w2))-1);
+  
+  if (s3<>s1) or
+     { invalid: CESU-8 }
+     (w2<>'??') or
+     (s4<>'??') then
+    begin
+      writeln('len: ',length(s1),' - "',s1,'"');
+      write('  ');
+      for i:= 1 to length(s1) do
+        write('#$',hexstr(ord(s1[i]),2));
+      writeln;
+      writeln('len: ',length(s2),' - "',s2,'"');
+      write('  ');
+      for i:= 1 to length(s2) do
+        write('#$',hexstr(ord(s2[i]),2));
+      writeln;
+      writeln('len: ',length(s3),' - "',s3,'"');
+      write('  ');
+      for i:= 1 to length(s3) do
+        write('#$',hexstr(ord(s3[i]),2));
+      writeln;
+      writeln('len: ',length(s4),' - "',s4,'"');
+      write('  ');
+      for i:= 1 to length(s4) do
+        write('#$',hexstr(ord(s4[i]),2));
+      writeln;
+      halt(1);
+    end;
+end;
+
+begin
+  dotest;
+end.
+