* Fix bug ID #38008: allow UTF-8 to Unicode conversion to either raise an error on invalid input or ignore it

git-svn-id: trunk@47391 -
2025-04-20 20:49:49 +02:00 · 2020-11-12 09:17:09 +00:00 · 2020-11-12 09:17:09 +00:00 · 257ef24a1e
commit 257ef24a1e
parent d9784412a4
2 changed files with 26 additions and 66 deletions
--- a/rtl/inc/ustringh.inc
+++ b/rtl/inc/ustringh.inc
@@ -134,7 +134,8 @@ var
 function UnicodeToUtf8(Dest: PChar; Source: PUnicodeChar; MaxBytes: SizeInt): SizeInt;{$ifdef SYSTEMINLINE}inline;{$endif}
 function UnicodeToUtf8(Dest: PChar; MaxDestBytes: SizeUInt; Source: PUnicodeChar; SourceChars: SizeUInt): SizeUInt;
 function Utf8ToUnicode(Dest: PUnicodeChar; Source: PChar; MaxChars: SizeInt): SizeInt;{$ifdef SYSTEMINLINE}inline;{$endif}
-function Utf8ToUnicode(Dest: PUnicodeChar; MaxDestChars: SizeUInt; Source: PChar; SourceBytes: SizeUInt): SizeUInt;
+function Utf8ToUnicode(Dest: PUnicodeChar; MaxDestChars: SizeUInt; Source: PChar; SourceBytes: SizeUInt): SizeUInt;{$ifdef SYSTEMINLINE}inline;{$endif}
+function Utf8ToUnicode(Dest: PUnicodeChar; MaxDestChars: SizeUInt; Source: PChar; SourceBytes: SizeUInt; IgnoreInvalid : Boolean): SizeUInt;
 function UTF8Encode(const s : RawByteString) : RawByteString; inline;
 function UTF8Encode(const s : UnicodeString) : RawByteString;
 function UTF8Decode(const s : RawByteString): UnicodeString;
--- a/rtl/inc/ustrings.inc
+++ b/rtl/inc/ustrings.inc
@@ -1792,13 +1792,20 @@ end;
 function Utf8ToUnicode(Dest: PUnicodeChar; Source: PChar; MaxChars: SizeInt): SizeInt;{$ifdef SYSTEMINLINE}inline;{$endif}
  begin
    if assigned(Source) then
-      Result:=Utf8ToUnicode(Dest,MaxChars,Source,length(Source))
+      Result:=Utf8ToUnicode(Dest,MaxChars,Source,length(Source),True)
    else
      Result:=0;
  end;


-function UTF8ToUnicode(Dest: PUnicodeChar; MaxDestChars: SizeUInt; Source: PChar; SourceBytes: SizeUInt): SizeUInt;
+function UTF8ToUnicode(Dest: PUnicodeChar; MaxDestChars: SizeUInt; Source: PChar; SourceBytes: SizeUInt): SizeUInt;{$ifdef SYSTEMINLINE}inline;{$endif}
+
+begin
+  Result:=Utf8ToUnicode(Dest,MaxDestChars,Source,SourceBytes,True);
+end;
+
+function Utf8ToUnicode(Dest: PUnicodeChar; MaxDestChars: SizeUInt; Source: PChar; SourceBytes: SizeUInt; IgnoreInvalid : Boolean): SizeUInt;
+
 {$ifdef EXCLUDE_COMPLEX_PROCS}
 begin
  runerror(217);
@@ -1832,44 +1839,12 @@ end;
            IBYTE:=byte(Source[InputUTF8]);
            if (IBYTE and $80) = 0 then
              begin
-                //One character US-ASCII, convert it to unicode
-(*
-                if IBYTE = 10 then
-                  begin
-                    If (PreChar<>13) and FALSE then
-                      begin
-                        //Expand to crlf, conform UTF-8.
-                        //This procedure will break the memory alocation by
-                        //FPC for the widestring, so never use it. Condition never true due the "and FALSE".
-                        if OutputUnicode+1<MaxDestChars then
-                          begin
-                            Dest[OutputUnicode]:=WideChar(13);
-                            inc(OutputUnicode);
-                            Dest[OutputUnicode]:=WideChar(10);
-                            inc(OutputUnicode);
-                            PreChar:=10;
-                          end
-                        else
-                          begin
-                            Dest[OutputUnicode]:=WideChar(13);
-                            inc(OutputUnicode);
-                          end;
-                      end
-                    else
-                      begin
-                        Dest[OutputUnicode]:=WideChar(IBYTE);
-                        inc(OutputUnicode);
-                        PreChar:=IBYTE;
-                      end;
-                  end
-                else
-*)
-                  begin
-                    Dest[OutputUnicode]:=WideChar(IBYTE);
-                    inc(OutputUnicode);
-                    PreChar:=IBYTE;
-                  end;
-                inc(InputUTF8);
+              // One character US-ASCII, convert it to unicode
+              // Commented code to convert LF to CRLF has been removed
+              Dest[OutputUnicode]:=WideChar(IBYTE);
+              inc(OutputUnicode);
+              PreChar:=IBYTE;
+              inc(InputUTF8);
              end
            else
              begin
@@ -1961,6 +1936,8 @@ end;
                end;
                if CharLen > 0 then
                  begin
+                    if (UC=UNICODE_INVALID) and Not IgnoreInvalid then
+                      HandleError(231); // Will be converted to EConversionError in sysutils
                    PreChar:=UC;
                    Dest[OutputUnicode]:=WideChar(UC);
                    inc(OutputUnicode);
@@ -1977,31 +1954,11 @@ end;
            IBYTE:=byte(Source[InputUTF8]);
            if (IBYTE and $80) = 0 then
              begin
-                //One character US-ASCII, convert it to unicode
-(*
-                if IBYTE = 10 then
-                  begin
-                    if (PreChar<>13) and FALSE then
-                      begin
-                        //Expand to crlf, conform UTF-8.
-                        //This procedure will break the memory alocation by
-                        //FPC for the widestring, so never use it. Condition never true due the "and FALSE".
-                        inc(OutputUnicode,2);
-                        PreChar:=10;
-                      end
-                    else
-                      begin
-                        inc(OutputUnicode);
-                        PreChar:=IBYTE;
-                      end;
-                  end
-                else
-*)
-                  begin
-                    inc(OutputUnicode);
-                    PreChar:=IBYTE;
-                  end;
-                inc(InputUTF8);
+              // One character US-ASCII, convert it to unicode
+              // Commented code to convert LF to CRLF has been removed
+              inc(OutputUnicode);
+              PreChar:=IBYTE;
+              inc(InputUTF8);
              end
            else
              begin
@@ -2077,6 +2034,8 @@ end;
                end;
                if CharLen > 0 then
                  begin
+                    if (UC=UNICODE_INVALID) and Not IgnoreInvalid then
+                      HandleError(231); // Will be converted to EConversionError in sysutils
                    PreChar:=UC;
                    inc(OutputUnicode);
                  end;