+ support for arbitrary encodings in readstr/writestr

o set the code page of the temporary "text" file to utf-8 for writestr with
     unicodestring/widestring as destination, so that no data loss can occur
     (+ properly deal with cases whereby part of an utf-8 character is
      written to the textbuf in this case)
   o explicitly pass the code page of the destination ansistring for writestr
     with ansistring as destination and set it for the temporary "text" file
   o set the code page of the text file for readstr

git-svn-id: trunk@26317 -
This commit is contained in:
Jonas Maebe 2013-12-29 19:18:49 +00:00
parent ddf08b4eb7
commit d2b8275b99
5 changed files with 162 additions and 30 deletions

1
.gitattributes vendored
View File

@ -12041,6 +12041,7 @@ tests/test/twrstr5.pp svneol=native#text/plain
tests/test/twrstr6.pp svneol=native#text/plain
tests/test/twrstr7.pp svneol=native#text/plain
tests/test/twrstr8.pp svneol=native#text/plain
tests/test/twrstr9.pp svneol=native#text/plain
tests/test/uabstrcl.pp svneol=native#text/plain
tests/test/uchlp12.pp svneol=native#text/pascal
tests/test/uchlp18.pp svneol=native#text/pascal

View File

@ -1261,6 +1261,11 @@ implementation
{ parameter chain }
left:=filepara.right;
filepara.right:=ccallparanode.create(ctemprefnode.create(filetemp),nil);
{ in case of a writestr() to an ansistring, also pass the string's
code page }
if not do_read and
is_ansistring(filepara.left.resultdef) then
filepara:=ccallparanode.create(genintconstnode(tstringdef(filepara.left.resultdef).encoding),filepara);
{ pass the temp text file and the source/destination string to the
setup routine, which will store the string's address in the
textrec }

View File

@ -426,7 +426,7 @@ Procedure fpc_Write_Text_WideChar(Len : Longint;var t : Text;c : WideChar); comp
can appear inside the other arguments of writerstr }
procedure fpc_SetupWriteStr_Shortstr(var ReadWriteStrText: text; var s: shortstring); compilerproc;
{$ifdef FPC_HAS_FEATURE_ANSISTRINGS}
procedure fpc_SetupWriteStr_Ansistr(var ReadWriteStrText: text; var s: ansistring); compilerproc;
procedure fpc_SetupWriteStr_Ansistr(var ReadWriteStrText: text; var s: ansistring; cp: TSystemCodePage); compilerproc;
{$endif FPC_HAS_FEATURE_ANSISTRINGS}
{$ifdef FPC_HAS_FEATURE_WIDESTRINGS}
procedure fpc_SetupWriteStr_Unicodestr(var ReadWriteStrText: text; var s: unicodestring); compilerproc;

View File

@ -2209,25 +2209,104 @@ end;
{$ifdef FPC_HAS_FEATURE_WIDESTRINGS}
procedure WriteStrUnicode(var t: textrec);
function UTF8CodePointLength(firstbyte: byte): SizeInt;
var
temp: ansistring;
firstzerobit: SizeInt;
begin
result:=1;
{ bsr searches for the leftmost 1 bit. We are interested in the
leftmost 0 bit, so first invert the value
}
firstzerobit:=BsrByte(not(firstbyte));
{ if there is no zero bit or the first zero bit is the rightmost bit
(bit 0), this is an invalid UTF-8 byte ($ff cannot appear in an
UTF-8-encoded string, and in the worst case bit 1 has to be zero)
}
if (firstzerobit=0) or (firstzerobit=255) then
exit;
{ the number of bytes belonging to this code point is
7-(pos first 0-bit).
}
result:=7-firstzerobit;
end;
function EndOfLastCompleteUTF8CodePoint(var t: textrec): SizeInt;
var
i, lenfound, codepointlen: SizeInt;
b: byte;
begin
lenfound:=0;
for i:=t.bufpos-1 downto 0 do
begin
b:=byte(t.bufptr^[i]);
if b<=127 then
begin
if lenfound = 0 then
{ valid simple code point }
result:=i+1
else
{ valid simple code point followed by a bunch of invalid data ->
handle everything since it can't become valid by adding more
bytes }
result:=t.bufpos;
exit;
end;
{ start of a complex character }
if (b and %11000000)<>0 then
begin
codepointlen:=UTF8CodePointLength(b);
{ we did not yet get all bytes of the last code point -> handle
everything until the start of this character }
if codepointlen>lenfound+1 then
result:=i
{ the last code point is invalid -> handle everything since it can't
become valid by adding more bytes; in case it's complete, we also
handle everything, of course}
else
result:=t.bufpos;
exit;
end;
inc(lenfound);
end;
{ all invalid data, or the buffer is too small to be able to deal with the
complete utf8char -> nothing else to do but to handle the entire buffer }
result:=t.bufpos;
end;
procedure WriteStrUnicodeIntern(var t: textrec; flush: boolean);
var
temp: unicodestring;
str: punicodestring;
validend: SizeInt;
begin
if (t.bufpos=0) then
exit;
str:=punicodestring(@t.userdata[TempWriteStrDestIndex]);
setlength(temp,t.bufpos);
move(t.bufptr^,temp[1],t.bufpos);
if not flush then
validend:=EndOfLastCompleteUTF8CodePoint(t)
else
validend:=t.bufpos;
widestringmanager.Ansi2UnicodeMoveProc(@t.bufptr^[0],CP_UTF8,temp,validend);
str^:=str^+temp;
t.bufpos:=0;
dec(t.bufpos,validend);
{ move remainder to the start }
if t.bufpos<>0 then
move(t.bufptr^[validend],t.bufptr^[0],t.bufpos);
end;
procedure WriteStrUnicode(var t: textrec);
begin
WriteStrUnicodeIntern(t,false);
end;
procedure WriteStrUnicodeFlush(var t: textrec);
begin
{ see comment in WriteStrShortFlush }
WriteStrUnicode(t);
WriteStrUnicodeIntern(t,true);
punicodestring(ppointer(@t.userdata[StrPtrIndex])^)^:=
punicodestring(@t.userdata[TempWriteStrDestIndex])^;
{ free memory/finalize temp }
@ -2236,25 +2315,38 @@ end;
{$endif FPC_HAS_FEATURE_WIDESTRINGS}
{$ifndef FPC_WIDESTRING_EQUAL_UNICODESTRING}
procedure WriteStrWide(var t: textrec);
procedure WriteStrWideIntern(var t: textrec; flush: boolean);
var
temp: ansistring;
temp: unicodestring;
str: pwidestring;
validend: SizeInt;
begin
if (t.bufpos=0) then
exit;
str:=pwidestring(@t.userdata[TempWriteStrDestIndex]);
setlength(temp,t.bufpos);
move(t.bufptr^,temp[1],t.bufpos);
if not flush then
validend:=EndOfLastCompleteUTF8CodePoint(t)
else
validend:=t.bufpos;
widestringmanager.Ansi2UnicodeMoveProc(@t.bufptr^[0],CP_UTF8,temp,validend);
str^:=str^+temp;
t.bufpos:=0;
dec(t.bufpos,validend);
{ move remainder to the start }
if t.bufpos<>0 then
move(t.bufptr^[validend],t.bufptr^[0],t.bufpos);
end;
procedure WriteStrWide(var t: textrec);
begin
WriteStrUnicodeIntern(t,false);
end;
procedure WriteStrWideFlush(var t: textrec);
begin
{ see comment in WriteStrShortFlush }
WriteStrWide(t);
WriteStrWideIntern(t,true);
pwidestring(ppointer(@t.userdata[StrPtrIndex])^)^:=
pwidestring(@t.userdata[TempWriteStrDestIndex])^;
{ free memory/finalize temp }
@ -2262,7 +2354,7 @@ begin
end;
{$endif FPC_WIDESTRING_EQUAL_UNICODESTRING}
procedure SetupWriteStrCommon(out t: textrec);
procedure SetupWriteStrCommon(out t: textrec; cp: TSystemCodePage);
begin
// initialise
Assign(text(t),'');
@ -2270,14 +2362,14 @@ begin
t.OpenFunc:=nil;
t.CloseFunc:=nil;
{$ifdef FPC_HAS_CPSTRING}
t.CodePage:=DefaultSystemCodePage;
t.CodePage:=TranslatePlaceholderCP(cp);
{$endif}
end;
procedure fpc_SetupWriteStr_Shortstr(var ReadWriteStrText: text; var s: shortstring); compilerproc;
begin
setupwritestrcommon(TextRec(ReadWriteStrText));
SetupWriteStrCommon(TextRec(ReadWriteStrText),DefaultSystemCodePage);
PPointer(@TextRec(ReadWriteStrText).userdata[StrPtrIndex])^:=@s;
{ temporary destination (see comments for TempWriteStrDestIndex) }
@ -2291,9 +2383,12 @@ end;
{$ifdef FPC_HAS_FEATURE_ANSISTRINGS}
procedure fpc_SetupWriteStr_Ansistr(var ReadWriteStrText: text; var s: ansistring); compilerproc;
procedure fpc_SetupWriteStr_Ansistr(var ReadWriteStrText: text; var s: ansistring; cp: TSystemCodePage); compilerproc;
begin
setupwritestrcommon(TextRec(ReadWriteStrText));
{ destination rawbytestring -> use CP_ACP }
if cp=CP_NONE then
cp:=CP_ACP;
SetupWriteStrCommon(TextRec(ReadWriteStrText),cp);
PPointer(@TextRec(ReadWriteStrText).userdata[StrPtrIndex])^:=@s;
{ temp destination ansistring, nil = empty string }
@ -2308,7 +2403,7 @@ end;
{$ifdef FPC_HAS_FEATURE_WIDESTRINGS}
procedure fpc_SetupWriteStr_Unicodestr(var ReadWriteStrText: text; var s: unicodestring); compilerproc;
begin
setupwritestrcommon(TextRec(ReadWriteStrText));
SetupWriteStrCommon(TextRec(ReadWriteStrText),CP_UTF8);
PPointer(@TextRec(ReadWriteStrText).userdata[StrPtrIndex])^:=@s;
{ temp destination unicodestring, nil = empty string }
@ -2323,7 +2418,7 @@ end;
{$ifndef FPC_WIDESTRING_EQUAL_UNICODESTRING}
procedure fpc_SetupWriteStr_Widestr(var ReadWriteStrText: text; var s: widestring); compilerproc;
begin
setupwritestrcommon(TextRec(ReadWriteStrText));
SetupWriteStrCommon(TextRec(ReadWriteStrText),CP_UTF8);
PPointer(@TextRec(ReadWriteStrText).userdata[StrPtrIndex])^:=@s;
{ temp destination widestring }
@ -2370,7 +2465,7 @@ begin
end;
procedure SetupReadStrCommon(out t: textrec);
procedure SetupReadStrCommon(out t: textrec; cp: TSystemCodePage);
begin
// initialise
Assign(text(t),'');
@ -2378,7 +2473,7 @@ begin
t.OpenFunc:=nil;
t.CloseFunc:=nil;
{$ifdef FPC_HAS_CPSTRING}
t.CodePage:=DefaultSystemCodePage;
t.CodePage:=TranslatePlaceholderCP(cp);
{$endif}
PSizeInt(@t.userdata[BytesReadIndex])^:=0;
end;
@ -2387,7 +2482,7 @@ end;
{$ifdef FPC_HAS_FEATURE_ANSISTRINGS}
procedure fpc_SetupReadStr_Ansistr(var ReadWriteStrText: text; const s: ansistring); [public, alias: 'FPC_SETUPREADSTR_ANSISTR']; compilerproc;
begin
setupreadstrcommon(TextRec(ReadWriteStrText));
SetupReadStrCommon(TextRec(ReadWriteStrText),StringCodePage(s));
{ we need a reference, because 's' may be a temporary expression }
PAnsiString(@TextRec(ReadWriteStrText).userdata[StrPtrIndex])^:=s;
TextRec(ReadWriteStrText).InOutFunc:=@ReadStrAnsi;
@ -2395,7 +2490,7 @@ begin
TextRec(ReadWriteStrText).FlushFunc:=@ReadAnsiStrFinal;
end;
procedure fpc_SetupReadStr_Ansistr_Intern(var ReadWriteStrText: text; const s: ansistring); [external name 'FPC_SETUPREADSTR_ANSISTR'];
procedure fpc_SetupReadStr_Ansistr_Intern(var ReadWriteStrText: text; const s: rawbytestring); [external name 'FPC_SETUPREADSTR_ANSISTR'];
{$endif FPC_HAS_FEATURE_ANSISTRINGS}
@ -2436,9 +2531,8 @@ end;
{$ifdef FPC_HAS_FEATURE_WIDESTRINGS}
procedure fpc_SetupReadStr_Unicodestr(var ReadWriteStrText: text; const s: unicodestring); compilerproc;
begin
{ we use an ansistring to avoid code duplication, and let the }
{ assignment convert the widestring to an equivalent ansistring }
fpc_SetupReadStr_Ansistr_Intern(ReadWriteStrText,s);
{ we use an utf8string to avoid code duplication }
fpc_SetupReadStr_Ansistr_Intern(ReadWriteStrText,utf8string(s));
end;
{$endif FPC_HAS_FEATURE_WIDESTRINGS}
@ -2446,9 +2540,8 @@ end;
{$ifndef FPC_WIDESTRING_EQUAL_UNICODESTRING}
procedure fpc_SetupReadStr_Widestr(var ReadWriteStrText: text; const s: widestring); compilerproc;
begin
{ we use an ansistring to avoid code duplication, and let the }
{ assignment convert the widestring to an equivalent ansistring }
fpc_SetupReadStr_Ansistr_Intern(ReadWriteStrText,s);
{ we use an utf8string to avoid code duplication }
fpc_SetupReadStr_Ansistr_Intern(ReadWriteStrText,utf8string(s));
end;
{$endif FPC_WIDESTRING_EQUAL_UNICODESTRING}

33
tests/test/twrstr9.pp Normal file
View File

@ -0,0 +1,33 @@
{$codepage utf8}
{$ifdef unix}
uses
cwstring;
{$endif}
type
ts866 = type ansistring(866);
var
u: unicodestring;
s: utf8string;
rs: ts866;
p: pointer;
i: longint;
begin
DefaultSystemCodePage:=CP_ASCII;
s:='§èà£ù';
rs:=ts866('Популярные фото');
writestr(u,s,1,s,rs);
if u <>'§èà£ù1§èà£ùПопулярные фото' then
halt(1);
getmem(p,length(s)-1);
s:='';
for i:=1 to (256 div 3) do
s:=s+utf8string('㒨');
s:=s+utf8string('㒨');
{ check that splitting the last into two parts during writestr doesn't cause a
conversion error }
writestr(u,s);
if utf8string(u)<>s then
halt(2);
end.