diff --git a/.gitattributes b/.gitattributes index c3e7d19f39..df3e9b42d2 100644 --- a/.gitattributes +++ b/.gitattributes @@ -11635,6 +11635,7 @@ tests/test/jvm/tvirtclmeth.pp svneol=native#text/plain tests/test/jvm/tw20212.pp svneol=native#text/plain tests/test/jvm/tw22807.pp svneol=native#text/plain tests/test/jvm/tw24089.pp svneol=native#text/plain +tests/test/jvm/tw29585.pp svneol=native#text/plain tests/test/jvm/twith.pp svneol=native#text/plain tests/test/jvm/uenum.pp svneol=native#text/plain tests/test/jvm/ujsetter.pp svneol=native#text/plain @@ -14939,6 +14940,7 @@ tests/webtbs/tw2953.pp svneol=native#text/plain tests/webtbs/tw29546.pp svneol=native#text/pascal tests/webtbs/tw2956.pp svneol=native#text/plain tests/webtbs/tw2958.pp svneol=native#text/plain +tests/webtbs/tw29585.pp svneol=native#text/plain tests/webtbs/tw29609.pp svneol=native#text/pascal tests/webtbs/tw2966.pp svneol=native#text/plain tests/webtbs/tw2975.pp svneol=native#text/plain diff --git a/rtl/inc/ustringh.inc b/rtl/inc/ustringh.inc index 980a2f1dda..ac1b0b8c08 100644 --- a/rtl/inc/ustringh.inc +++ b/rtl/inc/ustringh.inc @@ -136,12 +136,15 @@ function Utf8ToUnicode(Dest: PUnicodeChar; MaxDestChars: SizeUInt; Source: PChar function UTF8Encode(const s : RawByteString) : RawByteString; inline; function UTF8Encode(const s : UnicodeString) : RawByteString; function UTF8Decode(const s : RawByteString): UnicodeString; -function UTF8ToString(const s : UTF8String): UnicodeString;inline; function UTF8ToString(const s : RawByteString): UnicodeString;inline; function UTF8ToString(const S: ShortString): unicodestring; function UTF8ToString(const S: PAnsiChar): unicodestring; +{ byte and ansichar are the same on the JVM, and "array of" and "pointer to" + are as well } +{$ifndef CPUJVM} function UTF8ToString(const S: array of AnsiChar): unicodestring; -function UTF8ToString(const S: array of Byte): unicodestring; +function UTF8ToString(const S: array of Byte): unicodestring; +{$endif not CPUJVM} function AnsiToUtf8(const s : RawByteString): RawByteString;{$ifdef SYSTEMINLINE}inline;{$endif} function Utf8ToAnsi(const s : RawByteString) : RawByteString;{$ifdef SYSTEMINLINE}inline;{$endif} function UnicodeStringToUCS4String(const s : UnicodeString) : UCS4String; diff --git a/rtl/inc/ustrings.inc b/rtl/inc/ustrings.inc index b3788a556d..afe4437ec6 100644 --- a/rtl/inc/ustrings.inc +++ b/rtl/inc/ustrings.inc @@ -2362,63 +2362,57 @@ Begin SetCodePage(Result,DefaultFileSystemCodePage,True); End; -function UTF8ToString(const S: UTF8String): UnicodeString; inline; +{ Delphi compatibility: always interpret the data in the string as UTF-8, + ignore any codepage } +function UTF8ToString(const S: RawByteString): UnicodeString; inline; begin Result := UTF8Decode(S); end; -function UTF8ToString(const S: RawByteString): UnicodeString; inline; - -Var - UTF8 : UTF8String; - -begin - UTF8:=S; - Result := UTF8Decode(UTF8); -end; - function UTF8ToString(const S: ShortString): UnicodeString; - Var - UTF8 : UTF8String; - + rs: RawByteString; begin - UTF8:=S; - Result := UTF8Decode(UTF8); + rs:=S; + Result := UTF8Decode(rs); end; function UTF8ToString(const S: PAnsiChar): UnicodeString; var - UTF: UTF8String; + rs: RawByteString; Count: Integer; begin - Count := StrLen(S); - SetLength(UTF, Count); + Count := length(S); + SetLength(rs, Count); if Count > 0 then - Move(S^, UTF[1], Count); - Result := UTF8ToString(UTF); + fpc_pchar_ansistr_intern_charmove(S,0,rs,0,Count); + Result := UTF8ToString(rs); end; +{ byte and ansichar are the same on the JVM, and "array of" and "pointer to" + are as well } +{$ifndef CPUJVM} function UTF8ToString(const S: array of AnsiChar): UnicodeString; var - UTF: UTF8String; + rs: RawByteString; Count: Integer; begin Count := Length(S); - SetLength(UTF, Count); + SetLength(rs, Count); if Count > 0 then - Move(S[Low(S)], UTF[1], Count); - Result := UTF8ToString(UTF); + fpc_pchar_ansistr_intern_charmove(@S,Low(S),rs,0,Count); + Result := UTF8ToString(rs); end; function UTF8ToString(const S: array of Byte): UnicodeString; var - UTF: UTF8String; + rs: RawByteString; Count: Integer; begin Count := Length(S); - SetLength(UTF, Count); + SetLength(rs, Count); if Count > 0 then - Move(S[Low(S)], UTF[1], Count); - Result := UTF8ToString(UTF); + fpc_pchar_ansistr_intern_charmove(pchar(@S),Low(S),rs,0,Count); + Result := UTF8ToString(rs); end; +{$endif not CPUJVM} diff --git a/tests/test/jvm/testall.bat b/tests/test/jvm/testall.bat index 028ad4b19c..cdb45c3b10 100644 --- a/tests/test/jvm/testall.bat +++ b/tests/test/jvm/testall.bat @@ -324,3 +324,7 @@ ppcjvm -O2 -g -B -CTinitlocals tprocvaranon if %errorlevel% neq 0 exit /b %errorlevel% java -Dfile.encoding=UTF-8 -cp ..\..\..\rtl\units\jvm-java;. tprocvaranon if %errorlevel% neq 0 exit /b %errorlevel% +ppcjvm -O2 -g -B -CTinitlocals tw29585 +if %errorlevel% neq 0 exit /b %errorlevel% +java -Dfile.encoding=UTF-8 -cp ..\..\..\rtl\units\jvm-java;. tw29585 +if %errorlevel% neq 0 exit /b %errorlevel% diff --git a/tests/test/jvm/testall.sh b/tests/test/jvm/testall.sh index 8331635433..9f725e3cf4 100755 --- a/tests/test/jvm/testall.sh +++ b/tests/test/jvm/testall.sh @@ -189,3 +189,5 @@ $PPC -O2 -g -B -Sa tformalclass java -Dfile.encoding=UTF-8 -cp ../../../rtl/units/$RTLDIR:. tformalclass $PPC -O2 -g -B -Sa tprocvaranon java -Dfile.encoding=UTF-8 -cp ../../../rtl/units/$RTLDIR:. tprocvaranon +$PPC -O2 -g -B -Sa tw29585 +java -Dfile.encoding=UTF-8 -cp ../../../rtl/units/$RTLDIR:. tw29585 diff --git a/tests/test/jvm/tw29585.pp b/tests/test/jvm/tw29585.pp new file mode 100644 index 0000000000..e5ae6c4966 --- /dev/null +++ b/tests/test/jvm/tw29585.pp @@ -0,0 +1,218 @@ +program tw29585; +{$IFDEF FPC} +{$MODE OBJFPC}{$H+} +{$ENDIF} + +{$ifdef CPUJVM} +uses + {$ifdef java}jdk15{$else}androidr14{$endif}; + + {$macro on} + {$define writeln:=jlsystem.fout.println} + {$define write:=jlsystem.fout.print} +{$endif} + +{$IFNDEF FPC} +type + tsystemcodepage = word; +{$ENDIF} + +Type + tstr1251 = type ansistring(1251); + +const + utf8data: array[0..10] of ansichar = #$C3#$A9#$C2#$BA#$C3#$AE#$C5#$93#$E2#$88#$82; + utf8data_in_utf16: unicodestring = #$00E9#$00BA#$00EE#$0153#$2202; + + invalidutf8data: array[0..3] of ansichar = #$80#$81#$82#$83; + invalidutf8data_utf_16a: unicodestring = '????'; + invalidutf8data_utf_16b: unicodestring = #$fffd#$fffd#$fffd#$fffd; + + +function inttohex(l: longint; len: longint): unicodestring; +var + i: longint; +const + hexchars: array[0..15] of ansichar = ('0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f'); +begin + result:=''; + for i:=1 to len do + begin + result:=hexchars[l and $f]+result; + l:=l shr 4; + end; +end; + +procedure error(l: longint; const u: unicodestring); + var + i: longint; + begin + write('error for test '); + writeln(l); + write('result: '); + for i:=low(u) to high(u) do + begin + write('#$'); + write(inttohex(ord(u[i]),4)); + end; + writeln; + halt(l); + end; + + +procedure initarray(p: pbyte; const data: array of ansichar); + var + i: longint; + begin + for i:=low(data) to high(data) do + p[i]:=ord(data[i]); + end; + + +procedure initstr(var s: rawbytestring; cp: tsystemcodepage; const data: array of ansichar); overload; + var + i: longint; + begin + setlength(s,length(data)); + setcodepage(s,cp,false); + for i:=low(data) to high(data) do + s[i+1]:=data[i]; + end; + + +procedure initstr(var s: shortstring; const data: array of ansichar); overload; + var + i: longint; + begin + setlength(s,length(data)); + for i:=low(data) to high(data) do + s[i+1]:=data[i]; + end; + + +procedure testvalidutf8; + var + s1251: tstr1251; + rs: rawbytestring; + utf8: utf8string; + s: ansistring; + ss: shortstring; + ba: array[low(utf8data)..high(utf8data)] of byte; + bc: array[low(utf8data)..high(utf8data)] of ansichar; + bcc: array[low(utf8data)..high(utf8data)+1] of ansichar; + w: unicodestring; + begin + initstr(rawbytestring(s1251),1251,utf8data); + w:=UTF8ToString(s1251); + if w<>utf8data_in_utf16 then + error(1,w); + + initstr(rs,0,utf8data); + w:=UTF8ToString(rs); + if w<>utf8data_in_utf16 then + error(2,w); + + initstr(rawbytestring(utf8),CP_UTF8,utf8data); + w:=UTF8ToString(utf8); + if w<>utf8data_in_utf16 then + error(3,w); + + initstr(rawbytestring(s),defaultsystemcodepage,utf8data); + w:=UTF8ToString(s); + if w<>utf8data_in_utf16 then + error(4,w); + + initstr(ss,utf8data); + w:=UTF8ToString(ss); + if w<>utf8data_in_utf16 then + error(5,w); + + initarray(@bcc[0],utf8data); + bcc[high(bcc)]:=#0; + w:=UTF8ToString(@bcc[0]); + if w<>utf8data_in_utf16 then + error(6,w); + +{$ifndef cpujvm} + initarray(@ba[0],utf8data); + w:=UTF8ToString(ba); + if w<>utf8data_in_utf16 then + error(7,w); + + initarray(@bc[0],utf8data); + w:=UTF8ToString(bc); + if w<>utf8data_in_utf16 then + error(8,w); +{$endif not cpujvm} + end; + + +procedure testinvalidutf8; + var + s1251: tstr1251; + rs: rawbytestring; + utf8: utf8string; + s: ansistring; + ss: shortstring; + ba: array[low(invalidutf8data)..high(invalidutf8data)] of byte; + bc: array[low(invalidutf8data)..high(invalidutf8data)] of ansichar; + bcc: array[low(invalidutf8data)..high(invalidutf8data)+1] of ansichar; + w: unicodestring; + begin + initstr(rawbytestring(s1251),1251,invalidutf8data); + w:=UTF8ToString(s1251); + if (w<>invalidutf8data_utf_16a) and + (w<>invalidutf8data_utf_16b) then + error(11,w); + + initstr(rs,0,invalidutf8data); + w:=UTF8ToString(rs); + if (w<>invalidutf8data_utf_16a) and + (w<>invalidutf8data_utf_16b) then + error(12,w); + + initstr(rawbytestring(utf8),CP_UTF8,invalidutf8data); + w:=UTF8ToString(utf8); + if (w<>invalidutf8data_utf_16a) and + (w<>invalidutf8data_utf_16b) then + error(13,w); + + initstr(rawbytestring(s),defaultsystemcodepage,invalidutf8data); + w:=UTF8ToString(s); + if (w<>invalidutf8data_utf_16a) and + (w<>invalidutf8data_utf_16b) then + error(14,w); + + initstr(ss,invalidutf8data); + w:=UTF8ToString(ss); + if (w<>invalidutf8data_utf_16a) and + (w<>invalidutf8data_utf_16b) then + error(15,w); + + initarray(@bcc[0],invalidutf8data); + bcc[high(bcc)]:=#0; + w:=UTF8ToString(@bcc[0]); + if (w<>invalidutf8data_utf_16a) and + (w<>invalidutf8data_utf_16b) then + error(16,w); + +{$ifndef cpujvm} + initarray(@ba[0],invalidutf8data); + w:=UTF8ToString(ba); + if (w<>invalidutf8data_utf_16a) and + (w<>invalidutf8data_utf_16b) then + error(17,w); + + initarray(@bc[0],invalidutf8data); + w:=UTF8ToString(bc); + if (w<>invalidutf8data_utf_16a) and + (w<>invalidutf8data_utf_16b) then + error(18,w); +{$endif not cpujvm} + end; + + +begin + testvalidutf8; + testinvalidutf8; +end. diff --git a/tests/webtbs/tw29585.pp b/tests/webtbs/tw29585.pp new file mode 100644 index 0000000000..26b05cab84 --- /dev/null +++ b/tests/webtbs/tw29585.pp @@ -0,0 +1,196 @@ +program tw29585; +{$IFDEF FPC} +{$MODE OBJFPC}{$H+} +{$ELSE} +{$APPTYPE Console} +{$ENDIF} + +uses + {$ifndef FPC}Windows,{$endif}Sysutils; + +{$IFNDEF FPC} +type + tsystemcodepage = word; +{$ENDIF} + +Type + tstr1251 = type ansistring(1251); + +const + utf8data: array[0..10] of ansichar = #$C3#$A9#$C2#$BA#$C3#$AE#$C5#$93#$E2#$88#$82; + utf8data_in_utf16: unicodestring = #$00E9#$00BA#$00EE#$0153#$2202; + + invalidutf8data: array[0..3] of ansichar = #$80#$81#$82#$83; + invalidutf8data_utf_16a: unicodestring = '????'; + invalidutf8data_utf_16b: unicodestring = #$fffd#$fffd#$fffd#$fffd; + + +procedure error(l: longint; const u: unicodestring); + var + i: longint; + begin + writeln('error for test ',l); + write('result: '); + for i:=low(u) to high(u) do + write('#$',inttohex(ord(u[i]),2)); + writeln; + halt(l); + end; + + +procedure initarray(p: pbyte; const data: array of ansichar); + var + i: longint; + begin + for i:=low(data) to high(data) do + p[i]:=ord(data[i]); + end; + + +procedure initstr(var s: rawbytestring; cp: tsystemcodepage; const data: array of ansichar); overload; + var + i: longint; + begin + setlength(s,length(data)); + setcodepage(s,cp,false); + for i:=low(data) to high(data) do + s[i+1]:=data[i]; + end; + + +procedure initstr(var s: shortstring; const data: array of ansichar); overload; + var + i: longint; + begin + setlength(s,length(data)); + for i:=low(data) to high(data) do + s[i+1]:=data[i]; + end; + + +procedure testvalidutf8; + var + s1251: tstr1251; + rs: rawbytestring; + utf8: utf8string; + s: ansistring; + ss: shortstring; + ba: array[low(utf8data)..high(utf8data)] of byte; + bc: array[low(utf8data)..high(utf8data)] of ansichar; + bcc: array[low(utf8data)..high(utf8data)+1] of ansichar; + w: unicodestring; + begin + initstr(rawbytestring(s1251),1251,utf8data); + w:=UTF8ToString(s1251); + if w<>utf8data_in_utf16 then + error(1,w); + + initstr(rs,0,utf8data); + w:=UTF8ToString(rs); + if w<>utf8data_in_utf16 then + error(2,w); + + initstr(rawbytestring(utf8),CP_UTF8,utf8data); + w:=UTF8ToString(utf8); + if w<>utf8data_in_utf16 then + error(3,w); + + initstr(rawbytestring(s),defaultsystemcodepage,utf8data); + w:=UTF8ToString(s); + if w<>utf8data_in_utf16 then + error(4,w); + + initstr(ss,utf8data); + w:=UTF8ToString(ss); + if w<>utf8data_in_utf16 then + error(5,w); + + initarray(@bcc[0],utf8data); + bcc[high(bcc)]:=#0; + w:=UTF8ToString(@bcc[0]); + if w<>utf8data_in_utf16 then + error(6,w); + +{$ifndef cpujvm} + initarray(@ba[0],utf8data); + w:=UTF8ToString(ba); + if w<>utf8data_in_utf16 then + error(7,w); + + initarray(@bc[0],utf8data); + w:=UTF8ToString(bc); + if w<>utf8data_in_utf16 then + error(8,w); +{$endif not cpujvm} + end; + + +procedure testinvalidutf8; + var + s1251: tstr1251; + rs: rawbytestring; + utf8: utf8string; + s: ansistring; + ss: shortstring; + ba: array[low(invalidutf8data)..high(invalidutf8data)] of byte; + bc: array[low(invalidutf8data)..high(invalidutf8data)] of ansichar; + bcc: array[low(invalidutf8data)..high(invalidutf8data)+1] of ansichar; + w: unicodestring; + begin + initstr(rawbytestring(s1251),1251,invalidutf8data); + w:=UTF8ToString(s1251); + if (w<>invalidutf8data_utf_16a) and + (w<>invalidutf8data_utf_16b) then + error(11,w); + + initstr(rs,0,invalidutf8data); + w:=UTF8ToString(rs); + if (w<>invalidutf8data_utf_16a) and + (w<>invalidutf8data_utf_16b) then + error(12,w); + + initstr(rawbytestring(utf8),CP_UTF8,invalidutf8data); + w:=UTF8ToString(utf8); + if (w<>invalidutf8data_utf_16a) and + (w<>invalidutf8data_utf_16b) then + error(13,w); + + initstr(rawbytestring(s),defaultsystemcodepage,invalidutf8data); + w:=UTF8ToString(s); + if (w<>invalidutf8data_utf_16a) and + (w<>invalidutf8data_utf_16b) then + error(14,w); + + initstr(ss,invalidutf8data); + w:=UTF8ToString(ss); + if (w<>invalidutf8data_utf_16a) and + (w<>invalidutf8data_utf_16b) then + error(15,w); + + initarray(@bcc[0],invalidutf8data); + bcc[high(bcc)]:=#0; + w:=UTF8ToString(@bcc[0]); + if (w<>invalidutf8data_utf_16a) and + (w<>invalidutf8data_utf_16b) then + error(16,w); + +{$ifndef cpujvm} + initarray(@ba[0],invalidutf8data); + w:=UTF8ToString(ba); + if (w<>invalidutf8data_utf_16a) and + (w<>invalidutf8data_utf_16b) then + error(17,w); + + initarray(@bc[0],invalidutf8data); + w:=UTF8ToString(bc); + if (w<>invalidutf8data_utf_16a) and + (w<>invalidutf8data_utf_16b) then + error(18,w); +{$endif not cpujvm} + end; + + +begin + testvalidutf8; + testinvalidutf8; +end.