* made utf8tostring() Delphi-compatible (mantis #29585):

   o removed utf8string overload
   o always ignore any code page information from the input, and interpret the
     contents of the input directly as utf8-encoded bytes
* made utf8tostring() compatible with the JVM backend (mantis #29497)

git-svn-id: trunk@33159 -
This commit is contained in:
Jonas Maebe 2016-03-05 15:32:22 +00:00
parent 531ce3be61
commit a100309350
7 changed files with 450 additions and 31 deletions

2
.gitattributes vendored
View File

@ -11635,6 +11635,7 @@ tests/test/jvm/tvirtclmeth.pp svneol=native#text/plain
tests/test/jvm/tw20212.pp svneol=native#text/plain
tests/test/jvm/tw22807.pp svneol=native#text/plain
tests/test/jvm/tw24089.pp svneol=native#text/plain
tests/test/jvm/tw29585.pp svneol=native#text/plain
tests/test/jvm/twith.pp svneol=native#text/plain
tests/test/jvm/uenum.pp svneol=native#text/plain
tests/test/jvm/ujsetter.pp svneol=native#text/plain
@ -14939,6 +14940,7 @@ tests/webtbs/tw2953.pp svneol=native#text/plain
tests/webtbs/tw29546.pp svneol=native#text/pascal
tests/webtbs/tw2956.pp svneol=native#text/plain
tests/webtbs/tw2958.pp svneol=native#text/plain
tests/webtbs/tw29585.pp svneol=native#text/plain
tests/webtbs/tw29609.pp svneol=native#text/pascal
tests/webtbs/tw2966.pp svneol=native#text/plain
tests/webtbs/tw2975.pp svneol=native#text/plain

View File

@ -136,12 +136,15 @@ function Utf8ToUnicode(Dest: PUnicodeChar; MaxDestChars: SizeUInt; Source: PChar
function UTF8Encode(const s : RawByteString) : RawByteString; inline;
function UTF8Encode(const s : UnicodeString) : RawByteString;
function UTF8Decode(const s : RawByteString): UnicodeString;
function UTF8ToString(const s : UTF8String): UnicodeString;inline;
function UTF8ToString(const s : RawByteString): UnicodeString;inline;
function UTF8ToString(const S: ShortString): unicodestring;
function UTF8ToString(const S: PAnsiChar): unicodestring;
{ byte and ansichar are the same on the JVM, and "array of" and "pointer to"
are as well }
{$ifndef CPUJVM}
function UTF8ToString(const S: array of AnsiChar): unicodestring;
function UTF8ToString(const S: array of Byte): unicodestring;
function UTF8ToString(const S: array of Byte): unicodestring;
{$endif not CPUJVM}
function AnsiToUtf8(const s : RawByteString): RawByteString;{$ifdef SYSTEMINLINE}inline;{$endif}
function Utf8ToAnsi(const s : RawByteString) : RawByteString;{$ifdef SYSTEMINLINE}inline;{$endif}
function UnicodeStringToUCS4String(const s : UnicodeString) : UCS4String;

View File

@ -2362,63 +2362,57 @@ Begin
SetCodePage(Result,DefaultFileSystemCodePage,True);
End;
function UTF8ToString(const S: UTF8String): UnicodeString; inline;
{ Delphi compatibility: always interpret the data in the string as UTF-8,
ignore any codepage }
function UTF8ToString(const S: RawByteString): UnicodeString; inline;
begin
Result := UTF8Decode(S);
end;
function UTF8ToString(const S: RawByteString): UnicodeString; inline;
Var
UTF8 : UTF8String;
begin
UTF8:=S;
Result := UTF8Decode(UTF8);
end;
function UTF8ToString(const S: ShortString): UnicodeString;
Var
UTF8 : UTF8String;
rs: RawByteString;
begin
UTF8:=S;
Result := UTF8Decode(UTF8);
rs:=S;
Result := UTF8Decode(rs);
end;
function UTF8ToString(const S: PAnsiChar): UnicodeString;
var
UTF: UTF8String;
rs: RawByteString;
Count: Integer;
begin
Count := StrLen(S);
SetLength(UTF, Count);
Count := length(S);
SetLength(rs, Count);
if Count > 0 then
Move(S^, UTF[1], Count);
Result := UTF8ToString(UTF);
fpc_pchar_ansistr_intern_charmove(S,0,rs,0,Count);
Result := UTF8ToString(rs);
end;
{ byte and ansichar are the same on the JVM, and "array of" and "pointer to"
are as well }
{$ifndef CPUJVM}
function UTF8ToString(const S: array of AnsiChar): UnicodeString;
var
UTF: UTF8String;
rs: RawByteString;
Count: Integer;
begin
Count := Length(S);
SetLength(UTF, Count);
SetLength(rs, Count);
if Count > 0 then
Move(S[Low(S)], UTF[1], Count);
Result := UTF8ToString(UTF);
fpc_pchar_ansistr_intern_charmove(@S,Low(S),rs,0,Count);
Result := UTF8ToString(rs);
end;
function UTF8ToString(const S: array of Byte): UnicodeString;
var
UTF: UTF8String;
rs: RawByteString;
Count: Integer;
begin
Count := Length(S);
SetLength(UTF, Count);
SetLength(rs, Count);
if Count > 0 then
Move(S[Low(S)], UTF[1], Count);
Result := UTF8ToString(UTF);
fpc_pchar_ansistr_intern_charmove(pchar(@S),Low(S),rs,0,Count);
Result := UTF8ToString(rs);
end;
{$endif not CPUJVM}

View File

@ -324,3 +324,7 @@ ppcjvm -O2 -g -B -CTinitlocals tprocvaranon
if %errorlevel% neq 0 exit /b %errorlevel%
java -Dfile.encoding=UTF-8 -cp ..\..\..\rtl\units\jvm-java;. tprocvaranon
if %errorlevel% neq 0 exit /b %errorlevel%
ppcjvm -O2 -g -B -CTinitlocals tw29585
if %errorlevel% neq 0 exit /b %errorlevel%
java -Dfile.encoding=UTF-8 -cp ..\..\..\rtl\units\jvm-java;. tw29585
if %errorlevel% neq 0 exit /b %errorlevel%

View File

@ -189,3 +189,5 @@ $PPC -O2 -g -B -Sa tformalclass
java -Dfile.encoding=UTF-8 -cp ../../../rtl/units/$RTLDIR:. tformalclass
$PPC -O2 -g -B -Sa tprocvaranon
java -Dfile.encoding=UTF-8 -cp ../../../rtl/units/$RTLDIR:. tprocvaranon
$PPC -O2 -g -B -Sa tw29585
java -Dfile.encoding=UTF-8 -cp ../../../rtl/units/$RTLDIR:. tw29585

218
tests/test/jvm/tw29585.pp Normal file
View File

@ -0,0 +1,218 @@
program tw29585;
{$IFDEF FPC}
{$MODE OBJFPC}{$H+}
{$ENDIF}
{$ifdef CPUJVM}
uses
{$ifdef java}jdk15{$else}androidr14{$endif};
{$macro on}
{$define writeln:=jlsystem.fout.println}
{$define write:=jlsystem.fout.print}
{$endif}
{$IFNDEF FPC}
type
tsystemcodepage = word;
{$ENDIF}
Type
{ ansistring type with a fixed declared code page of 1251; the tests below
  store UTF-8 bytes in it regardless of that declaration }
tstr1251 = type ansistring(1251);
const
{ 11 bytes of well-formed UTF-8; they encode the five code units listed in
  utf8data_in_utf16 ($C3$A9, $C2$BA, $C3$AE, $C5$93, $E2$88$82) }
utf8data: array[0..10] of ansichar = #$C3#$A9#$C2#$BA#$C3#$AE#$C5#$93#$E2#$88#$82;
{ expected result of decoding utf8data }
utf8data_in_utf16: unicodestring = #$00E9#$00BA#$00EE#$0153#$2202;
{ four bytes in the range $80..$83 with no preceding lead byte, used as
  malformed UTF-8 input }
invalidutf8data: array[0..3] of ansichar = #$80#$81#$82#$83;
{ the two results the tests accept for invalidutf8data: one '?' per input
  byte, or one U+FFFD per input byte }
invalidutf8data_utf_16a: unicodestring = '????';
invalidutf8data_utf_16b: unicodestring = #$fffd#$fffd#$fffd#$fffd;
{ Returns the lowest len hexadecimal digits of l as a lowercase string,
  most significant digit first. Returns an empty string when len <= 0. }
function inttohex(l: longint; len: longint): unicodestring;
const
  digits: array[0..15] of ansichar = ('0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f');
var
  remaining: longint;
begin
  result:='';
  remaining:=len;
  while remaining>0 do
    begin
      { peel off the lowest nibble and put its digit in front of what we
        already have }
      result:=digits[l and $f]+result;
      l:=l shr 4;
      dec(remaining);
    end;
end;
{ Reports a failed check and stops the program.
  l: number of the failing test; it is printed and used as the exit code.
  u: the string that was actually produced; every element is printed as
     '#$' followed by four hex digits. }
procedure error(l: longint; const u: unicodestring);
var
  idx: longint;
begin
  write('error for test ');
  writeln(l);
  write('result: ');
  idx:=low(u);
  while idx<=high(u) do
    begin
      write('#$');
      write(inttohex(ord(u[idx]),4));
      inc(idx);
    end;
  writeln;
  halt(l);
end;
{ Stores the ordinal value of every character of data into consecutive
  bytes starting at p. The caller has to provide room for length(data)
  bytes. }
procedure initarray(p: pbyte; const data: array of ansichar);
var
  idx: longint;
begin
  { open array parameters are always indexed from 0 }
  idx:=0;
  while idx<length(data) do
    begin
      p[idx]:=ord(data[idx]);
      inc(idx);
    end;
end;
{ Makes s a string of length(data) bytes tagged with code page cp and
  copies the characters of data into it unchanged. }
procedure initstr(var s: rawbytestring; cp: tsystemcodepage; const data: array of ansichar); overload;
var
  count, k: longint;
begin
  count:=length(data);
  setlength(s,count);
  { third argument false: only the code page tag is changed }
  setcodepage(s,cp,false);
  { string positions start at 1, open array indices at 0 }
  for k:=1 to count do
    s[k]:=data[k-1];
end;
{ Makes s a shortstring of length(data) characters and copies the
  characters of data into it unchanged. }
procedure initstr(var s: shortstring; const data: array of ansichar); overload;
var
  count, k: longint;
begin
  count:=length(data);
  setlength(s,count);
  { string positions start at 1, open array indices at 0 }
  for k:=1 to count do
    s[k]:=data[k-1];
end;
{ Passes the well-formed UTF-8 bytes of utf8data to UTF8ToString through
  several argument types and checks that the result always equals
  utf8data_in_utf16. On a mismatch error() is called with the number of the
  check (1..8) and the string that was actually returned. }
procedure testvalidutf8;
var
s1251: tstr1251;
rs: rawbytestring;
utf8: utf8string;
s: ansistring;
ss: shortstring;
ba: array[low(utf8data)..high(utf8data)] of byte;
bc: array[low(utf8data)..high(utf8data)] of ansichar;
{ one element longer than the data, to hold the terminating #0 }
bcc: array[low(utf8data)..high(utf8data)+1] of ansichar;
w: unicodestring;
begin
{ 1: string whose declared code page is 1251 }
initstr(rawbytestring(s1251),1251,utf8data);
w:=UTF8ToString(s1251);
if w<>utf8data_in_utf16 then
error(1,w);
{ 2: rawbytestring initialised with code page value 0 }
initstr(rs,0,utf8data);
w:=UTF8ToString(rs);
if w<>utf8data_in_utf16 then
error(2,w);
{ 3: utf8string tagged with CP_UTF8 }
initstr(rawbytestring(utf8),CP_UTF8,utf8data);
w:=UTF8ToString(utf8);
if w<>utf8data_in_utf16 then
error(3,w);
{ 4: plain ansistring tagged with defaultsystemcodepage }
initstr(rawbytestring(s),defaultsystemcodepage,utf8data);
w:=UTF8ToString(s);
if w<>utf8data_in_utf16 then
error(4,w);
{ 5: shortstring }
initstr(ss,utf8data);
w:=UTF8ToString(ss);
if w<>utf8data_in_utf16 then
error(5,w);
{ 6: address of the first element of a #0-terminated ansichar buffer }
initarray(@bcc[0],utf8data);
bcc[high(bcc)]:=#0;
w:=UTF8ToString(@bcc[0]);
if w<>utf8data_in_utf16 then
error(6,w);
{ 7 and 8 pass whole arrays; they are compiled out when CPUJVM is defined }
{$ifndef cpujvm}
{ 7: array of byte }
initarray(@ba[0],utf8data);
w:=UTF8ToString(ba);
if w<>utf8data_in_utf16 then
error(7,w);
{ 8: array of ansichar }
initarray(@bc[0],utf8data);
w:=UTF8ToString(bc);
if w<>utf8data_in_utf16 then
error(8,w);
{$endif not cpujvm}
end;
{ Passes the malformed bytes of invalidutf8data to UTF8ToString through the
  same argument types as testvalidutf8. Every check accepts either of two
  results: invalidutf8data_utf_16a ('????') or invalidutf8data_utf_16b (four
  times U+FFFD). If the result is neither, error() is called with the number
  of the check (11..18) and the string that was actually returned. }
procedure testinvalidutf8;
var
s1251: tstr1251;
rs: rawbytestring;
utf8: utf8string;
s: ansistring;
ss: shortstring;
ba: array[low(invalidutf8data)..high(invalidutf8data)] of byte;
bc: array[low(invalidutf8data)..high(invalidutf8data)] of ansichar;
{ one element longer than the data, to hold the terminating #0 }
bcc: array[low(invalidutf8data)..high(invalidutf8data)+1] of ansichar;
w: unicodestring;
begin
{ 11: string whose declared code page is 1251 }
initstr(rawbytestring(s1251),1251,invalidutf8data);
w:=UTF8ToString(s1251);
if (w<>invalidutf8data_utf_16a) and
(w<>invalidutf8data_utf_16b) then
error(11,w);
{ 12: rawbytestring initialised with code page value 0 }
initstr(rs,0,invalidutf8data);
w:=UTF8ToString(rs);
if (w<>invalidutf8data_utf_16a) and
(w<>invalidutf8data_utf_16b) then
error(12,w);
{ 13: utf8string tagged with CP_UTF8 }
initstr(rawbytestring(utf8),CP_UTF8,invalidutf8data);
w:=UTF8ToString(utf8);
if (w<>invalidutf8data_utf_16a) and
(w<>invalidutf8data_utf_16b) then
error(13,w);
{ 14: plain ansistring tagged with defaultsystemcodepage }
initstr(rawbytestring(s),defaultsystemcodepage,invalidutf8data);
w:=UTF8ToString(s);
if (w<>invalidutf8data_utf_16a) and
(w<>invalidutf8data_utf_16b) then
error(14,w);
{ 15: shortstring }
initstr(ss,invalidutf8data);
w:=UTF8ToString(ss);
if (w<>invalidutf8data_utf_16a) and
(w<>invalidutf8data_utf_16b) then
error(15,w);
{ 16: address of the first element of a #0-terminated ansichar buffer }
initarray(@bcc[0],invalidutf8data);
bcc[high(bcc)]:=#0;
w:=UTF8ToString(@bcc[0]);
if (w<>invalidutf8data_utf_16a) and
(w<>invalidutf8data_utf_16b) then
error(16,w);
{ 17 and 18 pass whole arrays; they are compiled out when CPUJVM is defined }
{$ifndef cpujvm}
{ 17: array of byte }
initarray(@ba[0],invalidutf8data);
w:=UTF8ToString(ba);
if (w<>invalidutf8data_utf_16a) and
(w<>invalidutf8data_utf_16b) then
error(17,w);
{ 18: array of ansichar }
initarray(@bc[0],invalidutf8data);
w:=UTF8ToString(bc);
if (w<>invalidutf8data_utf_16a) and
(w<>invalidutf8data_utf_16b) then
error(18,w);
{$endif not cpujvm}
end;
{ Program entry: run the checks with well-formed input first, then the
  checks with malformed input. Both procedures report a failed check through
  error(), which halts with the number of that check. }
begin
testvalidutf8;
testinvalidutf8;
end.

196
tests/webtbs/tw29585.pp Normal file
View File

@ -0,0 +1,196 @@
program tw29585;
{$IFDEF FPC}
{$MODE OBJFPC}{$H+}
{$ELSE}
{$APPTYPE Console}
{$ENDIF}
uses
{$ifndef FPC}Windows,{$endif}Sysutils;
{$IFNDEF FPC}
type
tsystemcodepage = word;
{$ENDIF}
Type
{ ansistring type with a fixed declared code page of 1251; the tests below
  store UTF-8 bytes in it regardless of that declaration }
tstr1251 = type ansistring(1251);
const
{ 11 bytes of well-formed UTF-8; they encode the five code units listed in
  utf8data_in_utf16 ($C3$A9, $C2$BA, $C3$AE, $C5$93, $E2$88$82) }
utf8data: array[0..10] of ansichar = #$C3#$A9#$C2#$BA#$C3#$AE#$C5#$93#$E2#$88#$82;
{ expected result of decoding utf8data }
utf8data_in_utf16: unicodestring = #$00E9#$00BA#$00EE#$0153#$2202;
{ four bytes in the range $80..$83 with no preceding lead byte, used as
  malformed UTF-8 input }
invalidutf8data: array[0..3] of ansichar = #$80#$81#$82#$83;
{ the two results the tests accept for invalidutf8data: one '?' per input
  byte, or one U+FFFD per input byte }
invalidutf8data_utf_16a: unicodestring = '????';
invalidutf8data_utf_16b: unicodestring = #$fffd#$fffd#$fffd#$fffd;
{ Reports a failed check and stops the program.
  l: number of the failing test; it is printed and used as the exit code.
  u: the string that was actually produced; every element is printed as
     '#$' followed by its value in hexadecimal. }
procedure error(l: longint; const u: unicodestring);
var
  i: longint;
begin
  writeln('error for test ',l);
  write('result: ');
  for i:=low(u) to high(u) do
    { four digits: every element of a unicodestring is a 16-bit value, and
      the expected-value constants in this file are written as #$xxxx, so a
      fixed width of four keeps the dump directly comparable to them }
    write('#$',inttohex(ord(u[i]),4));
  writeln;
  halt(l);
end;
{ Stores the ordinal value of every character of data into consecutive
  bytes starting at p. The caller has to provide room for length(data)
  bytes. }
procedure initarray(p: pbyte; const data: array of ansichar);
var
  idx: longint;
begin
  { open array parameters are always indexed from 0 }
  for idx:=0 to length(data)-1 do
    p[idx]:=ord(data[idx]);
end;
{ Makes s a string of length(data) bytes tagged with code page cp and
  copies the characters of data into it unchanged. }
procedure initstr(var s: rawbytestring; cp: tsystemcodepage; const data: array of ansichar); overload;
var
  src: longint;
begin
  setlength(s,length(data));
  { third argument false: only the code page tag is changed }
  setcodepage(s,cp,false);
  { open array indices start at 0, string positions at 1 }
  src:=0;
  while src<length(data) do
    begin
      s[src+1]:=data[src];
      inc(src);
    end;
end;
{ Makes s a shortstring of length(data) characters and copies the
  characters of data into it unchanged. }
procedure initstr(var s: shortstring; const data: array of ansichar); overload;
var
  src: longint;
begin
  setlength(s,length(data));
  { open array indices start at 0, string positions at 1 }
  src:=0;
  while src<length(data) do
    begin
      s[src+1]:=data[src];
      inc(src);
    end;
end;
{ Passes the well-formed UTF-8 bytes of utf8data to UTF8ToString through
  several argument types and checks that the result always equals
  utf8data_in_utf16. On a mismatch error() is called with the number of the
  check (1..8) and the string that was actually returned. }
procedure testvalidutf8;
var
s1251: tstr1251;
rs: rawbytestring;
utf8: utf8string;
s: ansistring;
ss: shortstring;
ba: array[low(utf8data)..high(utf8data)] of byte;
bc: array[low(utf8data)..high(utf8data)] of ansichar;
{ one element longer than the data, to hold the terminating #0 }
bcc: array[low(utf8data)..high(utf8data)+1] of ansichar;
w: unicodestring;
begin
{ 1: string whose declared code page is 1251 }
initstr(rawbytestring(s1251),1251,utf8data);
w:=UTF8ToString(s1251);
if w<>utf8data_in_utf16 then
error(1,w);
{ 2: rawbytestring initialised with code page value 0 }
initstr(rs,0,utf8data);
w:=UTF8ToString(rs);
if w<>utf8data_in_utf16 then
error(2,w);
{ 3: utf8string tagged with CP_UTF8 }
initstr(rawbytestring(utf8),CP_UTF8,utf8data);
w:=UTF8ToString(utf8);
if w<>utf8data_in_utf16 then
error(3,w);
{ 4: plain ansistring tagged with defaultsystemcodepage }
initstr(rawbytestring(s),defaultsystemcodepage,utf8data);
w:=UTF8ToString(s);
if w<>utf8data_in_utf16 then
error(4,w);
{ 5: shortstring }
initstr(ss,utf8data);
w:=UTF8ToString(ss);
if w<>utf8data_in_utf16 then
error(5,w);
{ 6: address of the first element of a #0-terminated ansichar buffer }
initarray(@bcc[0],utf8data);
bcc[high(bcc)]:=#0;
w:=UTF8ToString(@bcc[0]);
if w<>utf8data_in_utf16 then
error(6,w);
{ 7 and 8 pass whole arrays; they are compiled out when CPUJVM is defined }
{$ifndef cpujvm}
{ 7: array of byte }
initarray(@ba[0],utf8data);
w:=UTF8ToString(ba);
if w<>utf8data_in_utf16 then
error(7,w);
{ 8: array of ansichar }
initarray(@bc[0],utf8data);
w:=UTF8ToString(bc);
if w<>utf8data_in_utf16 then
error(8,w);
{$endif not cpujvm}
end;
{ Passes the malformed bytes of invalidutf8data to UTF8ToString through the
  same argument types as testvalidutf8. Every check accepts either of two
  results: invalidutf8data_utf_16a ('????') or invalidutf8data_utf_16b (four
  times U+FFFD). If the result is neither, error() is called with the number
  of the check (11..18) and the string that was actually returned. }
procedure testinvalidutf8;
var
s1251: tstr1251;
rs: rawbytestring;
utf8: utf8string;
s: ansistring;
ss: shortstring;
ba: array[low(invalidutf8data)..high(invalidutf8data)] of byte;
bc: array[low(invalidutf8data)..high(invalidutf8data)] of ansichar;
{ one element longer than the data, to hold the terminating #0 }
bcc: array[low(invalidutf8data)..high(invalidutf8data)+1] of ansichar;
w: unicodestring;
begin
{ 11: string whose declared code page is 1251 }
initstr(rawbytestring(s1251),1251,invalidutf8data);
w:=UTF8ToString(s1251);
if (w<>invalidutf8data_utf_16a) and
(w<>invalidutf8data_utf_16b) then
error(11,w);
{ 12: rawbytestring initialised with code page value 0 }
initstr(rs,0,invalidutf8data);
w:=UTF8ToString(rs);
if (w<>invalidutf8data_utf_16a) and
(w<>invalidutf8data_utf_16b) then
error(12,w);
{ 13: utf8string tagged with CP_UTF8 }
initstr(rawbytestring(utf8),CP_UTF8,invalidutf8data);
w:=UTF8ToString(utf8);
if (w<>invalidutf8data_utf_16a) and
(w<>invalidutf8data_utf_16b) then
error(13,w);
{ 14: plain ansistring tagged with defaultsystemcodepage }
initstr(rawbytestring(s),defaultsystemcodepage,invalidutf8data);
w:=UTF8ToString(s);
if (w<>invalidutf8data_utf_16a) and
(w<>invalidutf8data_utf_16b) then
error(14,w);
{ 15: shortstring }
initstr(ss,invalidutf8data);
w:=UTF8ToString(ss);
if (w<>invalidutf8data_utf_16a) and
(w<>invalidutf8data_utf_16b) then
error(15,w);
{ 16: address of the first element of a #0-terminated ansichar buffer }
initarray(@bcc[0],invalidutf8data);
bcc[high(bcc)]:=#0;
w:=UTF8ToString(@bcc[0]);
if (w<>invalidutf8data_utf_16a) and
(w<>invalidutf8data_utf_16b) then
error(16,w);
{ 17 and 18 pass whole arrays; they are compiled out when CPUJVM is defined }
{$ifndef cpujvm}
{ 17: array of byte }
initarray(@ba[0],invalidutf8data);
w:=UTF8ToString(ba);
if (w<>invalidutf8data_utf_16a) and
(w<>invalidutf8data_utf_16b) then
error(17,w);
{ 18: array of ansichar }
initarray(@bc[0],invalidutf8data);
w:=UTF8ToString(bc);
if (w<>invalidutf8data_utf_16a) and
(w<>invalidutf8data_utf_16b) then
error(18,w);
{$endif not cpujvm}
end;
{ Program entry: run the checks with well-formed input first, then the
  checks with malformed input. Both procedures report a failed check through
  error(), which halts with the number of that check. }
begin
testvalidutf8;
testinvalidutf8;
end.