mirror of
https://gitlab.com/freepascal.org/fpc/source.git
synced 2025-09-14 21:09:08 +02:00
+ support for arbitrary encodings in readstr/writestr
o set the code page of the temporary "text" file to utf-8 for writestr with unicodestring/widestring as destination, so that no data loss can occur (+ properly deal with cases whereby part of an utf-8 character is written to the textbuf in this case) o explicitly pass the code page of the destination ansistring for writestr with ansistring as destination and set it for the temporary "text" file o set the code page of the text file for readstr git-svn-id: trunk@26317 -
This commit is contained in:
parent
ddf08b4eb7
commit
d2b8275b99
1
.gitattributes
vendored
1
.gitattributes
vendored
@ -12041,6 +12041,7 @@ tests/test/twrstr5.pp svneol=native#text/plain
|
||||
tests/test/twrstr6.pp svneol=native#text/plain
|
||||
tests/test/twrstr7.pp svneol=native#text/plain
|
||||
tests/test/twrstr8.pp svneol=native#text/plain
|
||||
tests/test/twrstr9.pp svneol=native#text/plain
|
||||
tests/test/uabstrcl.pp svneol=native#text/plain
|
||||
tests/test/uchlp12.pp svneol=native#text/pascal
|
||||
tests/test/uchlp18.pp svneol=native#text/pascal
|
||||
|
@ -1261,6 +1261,11 @@ implementation
|
||||
{ parameter chain }
|
||||
left:=filepara.right;
|
||||
filepara.right:=ccallparanode.create(ctemprefnode.create(filetemp),nil);
|
||||
{ in case of a writestr() to an ansistring, also pass the string's
|
||||
code page }
|
||||
if not do_read and
|
||||
is_ansistring(filepara.left.resultdef) then
|
||||
filepara:=ccallparanode.create(genintconstnode(tstringdef(filepara.left.resultdef).encoding),filepara);
|
||||
{ pass the temp text file and the source/destination string to the
|
||||
setup routine, which will store the string's address in the
|
||||
textrec }
|
||||
|
@ -426,7 +426,7 @@ Procedure fpc_Write_Text_WideChar(Len : Longint;var t : Text;c : WideChar); comp
|
||||
can appear inside the other arguments of writerstr }
|
||||
procedure fpc_SetupWriteStr_Shortstr(var ReadWriteStrText: text; var s: shortstring); compilerproc;
|
||||
{$ifdef FPC_HAS_FEATURE_ANSISTRINGS}
|
||||
procedure fpc_SetupWriteStr_Ansistr(var ReadWriteStrText: text; var s: ansistring); compilerproc;
|
||||
procedure fpc_SetupWriteStr_Ansistr(var ReadWriteStrText: text; var s: ansistring; cp: TSystemCodePage); compilerproc;
|
||||
{$endif FPC_HAS_FEATURE_ANSISTRINGS}
|
||||
{$ifdef FPC_HAS_FEATURE_WIDESTRINGS}
|
||||
procedure fpc_SetupWriteStr_Unicodestr(var ReadWriteStrText: text; var s: unicodestring); compilerproc;
|
||||
|
151
rtl/inc/text.inc
151
rtl/inc/text.inc
@ -2209,25 +2209,104 @@ end;
|
||||
|
||||
|
||||
{$ifdef FPC_HAS_FEATURE_WIDESTRINGS}
|
||||
procedure WriteStrUnicode(var t: textrec);
|
||||
function UTF8CodePointLength(firstbyte: byte): SizeInt;
|
||||
var
|
||||
temp: ansistring;
|
||||
firstzerobit: SizeInt;
|
||||
begin
|
||||
result:=1;
|
||||
{ bsr searches for the leftmost 1 bit. We are interested in the
|
||||
leftmost 0 bit, so first invert the value
|
||||
}
|
||||
firstzerobit:=BsrByte(not(firstbyte));
|
||||
{ if there is no zero bit or the first zero bit is the rightmost bit
|
||||
(bit 0), this is an invalid UTF-8 byte ($ff cannot appear in an
|
||||
UTF-8-encoded string, and in the worst case bit 1 has to be zero)
|
||||
}
|
||||
if (firstzerobit=0) or (firstzerobit=255) then
|
||||
exit;
|
||||
{ the number of bytes belonging to this code point is
|
||||
7-(pos first 0-bit).
|
||||
}
|
||||
result:=7-firstzerobit;
|
||||
end;
|
||||
|
||||
|
||||
function EndOfLastCompleteUTF8CodePoint(var t: textrec): SizeInt;
|
||||
var
|
||||
i, lenfound, codepointlen: SizeInt;
|
||||
b: byte;
|
||||
begin
|
||||
lenfound:=0;
|
||||
for i:=t.bufpos-1 downto 0 do
|
||||
begin
|
||||
b:=byte(t.bufptr^[i]);
|
||||
if b<=127 then
|
||||
begin
|
||||
if lenfound = 0 then
|
||||
{ valid simple code point }
|
||||
result:=i+1
|
||||
else
|
||||
{ valid simple code point followed by a bunch of invalid data ->
|
||||
handle everything since it can't become valid by adding more
|
||||
bytes }
|
||||
result:=t.bufpos;
|
||||
exit;
|
||||
end;
|
||||
{ start of a complex character }
|
||||
if (b and %11000000)<>0 then
|
||||
begin
|
||||
codepointlen:=UTF8CodePointLength(b);
|
||||
{ we did not yet get all bytes of the last code point -> handle
|
||||
everything until the start of this character }
|
||||
if codepointlen>lenfound+1 then
|
||||
result:=i
|
||||
{ the last code point is invalid -> handle everything since it can't
|
||||
become valid by adding more bytes; in case it's complete, we also
|
||||
handle everything, of course}
|
||||
else
|
||||
result:=t.bufpos;
|
||||
exit;
|
||||
end;
|
||||
inc(lenfound);
|
||||
end;
|
||||
{ all invalid data, or the buffer is too small to be able to deal with the
|
||||
complete utf8char -> nothing else to do but to handle the entire buffer }
|
||||
result:=t.bufpos;
|
||||
end;
|
||||
|
||||
|
||||
procedure WriteStrUnicodeIntern(var t: textrec; flush: boolean);
|
||||
var
|
||||
temp: unicodestring;
|
||||
str: punicodestring;
|
||||
validend: SizeInt;
|
||||
begin
|
||||
if (t.bufpos=0) then
|
||||
exit;
|
||||
str:=punicodestring(@t.userdata[TempWriteStrDestIndex]);
|
||||
setlength(temp,t.bufpos);
|
||||
move(t.bufptr^,temp[1],t.bufpos);
|
||||
if not flush then
|
||||
validend:=EndOfLastCompleteUTF8CodePoint(t)
|
||||
else
|
||||
validend:=t.bufpos;
|
||||
widestringmanager.Ansi2UnicodeMoveProc(@t.bufptr^[0],CP_UTF8,temp,validend);
|
||||
str^:=str^+temp;
|
||||
t.bufpos:=0;
|
||||
dec(t.bufpos,validend);
|
||||
{ move remainder to the start }
|
||||
if t.bufpos<>0 then
|
||||
move(t.bufptr^[validend],t.bufptr^[0],t.bufpos);
|
||||
end;
|
||||
|
||||
|
||||
procedure WriteStrUnicode(var t: textrec);
|
||||
begin
|
||||
WriteStrUnicodeIntern(t,false);
|
||||
end;
|
||||
|
||||
|
||||
procedure WriteStrUnicodeFlush(var t: textrec);
|
||||
begin
|
||||
{ see comment in WriteStrShortFlush }
|
||||
WriteStrUnicode(t);
|
||||
WriteStrUnicodeIntern(t,true);
|
||||
punicodestring(ppointer(@t.userdata[StrPtrIndex])^)^:=
|
||||
punicodestring(@t.userdata[TempWriteStrDestIndex])^;
|
||||
{ free memory/finalize temp }
|
||||
@ -2236,25 +2315,38 @@ end;
|
||||
{$endif FPC_HAS_FEATURE_WIDESTRINGS}
|
||||
|
||||
{$ifndef FPC_WIDESTRING_EQUAL_UNICODESTRING}
|
||||
procedure WriteStrWide(var t: textrec);
|
||||
procedure WriteStrWideIntern(var t: textrec; flush: boolean);
|
||||
var
|
||||
temp: ansistring;
|
||||
temp: unicodestring;
|
||||
str: pwidestring;
|
||||
validend: SizeInt;
|
||||
begin
|
||||
if (t.bufpos=0) then
|
||||
exit;
|
||||
str:=pwidestring(@t.userdata[TempWriteStrDestIndex]);
|
||||
setlength(temp,t.bufpos);
|
||||
move(t.bufptr^,temp[1],t.bufpos);
|
||||
if not flush then
|
||||
validend:=EndOfLastCompleteUTF8CodePoint(t)
|
||||
else
|
||||
validend:=t.bufpos;
|
||||
widestringmanager.Ansi2UnicodeMoveProc(@t.bufptr^[0],CP_UTF8,temp,validend);
|
||||
str^:=str^+temp;
|
||||
t.bufpos:=0;
|
||||
dec(t.bufpos,validend);
|
||||
{ move remainder to the start }
|
||||
if t.bufpos<>0 then
|
||||
move(t.bufptr^[validend],t.bufptr^[0],t.bufpos);
|
||||
end;
|
||||
|
||||
|
||||
procedure WriteStrWide(var t: textrec);
|
||||
begin
|
||||
WriteStrUnicodeIntern(t,false);
|
||||
end;
|
||||
|
||||
|
||||
procedure WriteStrWideFlush(var t: textrec);
|
||||
begin
|
||||
{ see comment in WriteStrShortFlush }
|
||||
WriteStrWide(t);
|
||||
WriteStrWideIntern(t,true);
|
||||
pwidestring(ppointer(@t.userdata[StrPtrIndex])^)^:=
|
||||
pwidestring(@t.userdata[TempWriteStrDestIndex])^;
|
||||
{ free memory/finalize temp }
|
||||
@ -2262,7 +2354,7 @@ begin
|
||||
end;
|
||||
{$endif FPC_WIDESTRING_EQUAL_UNICODESTRING}
|
||||
|
||||
procedure SetupWriteStrCommon(out t: textrec);
|
||||
procedure SetupWriteStrCommon(out t: textrec; cp: TSystemCodePage);
|
||||
begin
|
||||
// initialise
|
||||
Assign(text(t),'');
|
||||
@ -2270,14 +2362,14 @@ begin
|
||||
t.OpenFunc:=nil;
|
||||
t.CloseFunc:=nil;
|
||||
{$ifdef FPC_HAS_CPSTRING}
|
||||
t.CodePage:=DefaultSystemCodePage;
|
||||
t.CodePage:=TranslatePlaceholderCP(cp);
|
||||
{$endif}
|
||||
end;
|
||||
|
||||
|
||||
procedure fpc_SetupWriteStr_Shortstr(var ReadWriteStrText: text; var s: shortstring); compilerproc;
|
||||
begin
|
||||
setupwritestrcommon(TextRec(ReadWriteStrText));
|
||||
SetupWriteStrCommon(TextRec(ReadWriteStrText),DefaultSystemCodePage);
|
||||
PPointer(@TextRec(ReadWriteStrText).userdata[StrPtrIndex])^:=@s;
|
||||
|
||||
{ temporary destination (see comments for TempWriteStrDestIndex) }
|
||||
@ -2291,9 +2383,12 @@ end;
|
||||
|
||||
|
||||
{$ifdef FPC_HAS_FEATURE_ANSISTRINGS}
|
||||
procedure fpc_SetupWriteStr_Ansistr(var ReadWriteStrText: text; var s: ansistring); compilerproc;
|
||||
procedure fpc_SetupWriteStr_Ansistr(var ReadWriteStrText: text; var s: ansistring; cp: TSystemCodePage); compilerproc;
|
||||
begin
|
||||
setupwritestrcommon(TextRec(ReadWriteStrText));
|
||||
{ destination rawbytestring -> use CP_ACP }
|
||||
if cp=CP_NONE then
|
||||
cp:=CP_ACP;
|
||||
SetupWriteStrCommon(TextRec(ReadWriteStrText),cp);
|
||||
PPointer(@TextRec(ReadWriteStrText).userdata[StrPtrIndex])^:=@s;
|
||||
|
||||
{ temp destination ansistring, nil = empty string }
|
||||
@ -2308,7 +2403,7 @@ end;
|
||||
{$ifdef FPC_HAS_FEATURE_WIDESTRINGS}
|
||||
procedure fpc_SetupWriteStr_Unicodestr(var ReadWriteStrText: text; var s: unicodestring); compilerproc;
|
||||
begin
|
||||
setupwritestrcommon(TextRec(ReadWriteStrText));
|
||||
SetupWriteStrCommon(TextRec(ReadWriteStrText),CP_UTF8);
|
||||
PPointer(@TextRec(ReadWriteStrText).userdata[StrPtrIndex])^:=@s;
|
||||
|
||||
{ temp destination unicodestring, nil = empty string }
|
||||
@ -2323,7 +2418,7 @@ end;
|
||||
{$ifndef FPC_WIDESTRING_EQUAL_UNICODESTRING}
|
||||
procedure fpc_SetupWriteStr_Widestr(var ReadWriteStrText: text; var s: widestring); compilerproc;
|
||||
begin
|
||||
setupwritestrcommon(TextRec(ReadWriteStrText));
|
||||
SetupWriteStrCommon(TextRec(ReadWriteStrText),CP_UTF8);
|
||||
PPointer(@TextRec(ReadWriteStrText).userdata[StrPtrIndex])^:=@s;
|
||||
|
||||
{ temp destination widestring }
|
||||
@ -2370,7 +2465,7 @@ begin
|
||||
end;
|
||||
|
||||
|
||||
procedure SetupReadStrCommon(out t: textrec);
|
||||
procedure SetupReadStrCommon(out t: textrec; cp: TSystemCodePage);
|
||||
begin
|
||||
// initialise
|
||||
Assign(text(t),'');
|
||||
@ -2378,7 +2473,7 @@ begin
|
||||
t.OpenFunc:=nil;
|
||||
t.CloseFunc:=nil;
|
||||
{$ifdef FPC_HAS_CPSTRING}
|
||||
t.CodePage:=DefaultSystemCodePage;
|
||||
t.CodePage:=TranslatePlaceholderCP(cp);
|
||||
{$endif}
|
||||
PSizeInt(@t.userdata[BytesReadIndex])^:=0;
|
||||
end;
|
||||
@ -2387,7 +2482,7 @@ end;
|
||||
{$ifdef FPC_HAS_FEATURE_ANSISTRINGS}
|
||||
procedure fpc_SetupReadStr_Ansistr(var ReadWriteStrText: text; const s: ansistring); [public, alias: 'FPC_SETUPREADSTR_ANSISTR']; compilerproc;
|
||||
begin
|
||||
setupreadstrcommon(TextRec(ReadWriteStrText));
|
||||
SetupReadStrCommon(TextRec(ReadWriteStrText),StringCodePage(s));
|
||||
{ we need a reference, because 's' may be a temporary expression }
|
||||
PAnsiString(@TextRec(ReadWriteStrText).userdata[StrPtrIndex])^:=s;
|
||||
TextRec(ReadWriteStrText).InOutFunc:=@ReadStrAnsi;
|
||||
@ -2395,7 +2490,7 @@ begin
|
||||
TextRec(ReadWriteStrText).FlushFunc:=@ReadAnsiStrFinal;
|
||||
end;
|
||||
|
||||
procedure fpc_SetupReadStr_Ansistr_Intern(var ReadWriteStrText: text; const s: ansistring); [external name 'FPC_SETUPREADSTR_ANSISTR'];
|
||||
procedure fpc_SetupReadStr_Ansistr_Intern(var ReadWriteStrText: text; const s: rawbytestring); [external name 'FPC_SETUPREADSTR_ANSISTR'];
|
||||
{$endif FPC_HAS_FEATURE_ANSISTRINGS}
|
||||
|
||||
|
||||
@ -2436,9 +2531,8 @@ end;
|
||||
{$ifdef FPC_HAS_FEATURE_WIDESTRINGS}
|
||||
procedure fpc_SetupReadStr_Unicodestr(var ReadWriteStrText: text; const s: unicodestring); compilerproc;
|
||||
begin
|
||||
{ we use an ansistring to avoid code duplication, and let the }
|
||||
{ assignment convert the widestring to an equivalent ansistring }
|
||||
fpc_SetupReadStr_Ansistr_Intern(ReadWriteStrText,s);
|
||||
{ we use an utf8string to avoid code duplication }
|
||||
fpc_SetupReadStr_Ansistr_Intern(ReadWriteStrText,utf8string(s));
|
||||
end;
|
||||
{$endif FPC_HAS_FEATURE_WIDESTRINGS}
|
||||
|
||||
@ -2446,9 +2540,8 @@ end;
|
||||
{$ifndef FPC_WIDESTRING_EQUAL_UNICODESTRING}
|
||||
procedure fpc_SetupReadStr_Widestr(var ReadWriteStrText: text; const s: widestring); compilerproc;
|
||||
begin
|
||||
{ we use an ansistring to avoid code duplication, and let the }
|
||||
{ assignment convert the widestring to an equivalent ansistring }
|
||||
fpc_SetupReadStr_Ansistr_Intern(ReadWriteStrText,s);
|
||||
{ we use an utf8string to avoid code duplication }
|
||||
fpc_SetupReadStr_Ansistr_Intern(ReadWriteStrText,utf8string(s));
|
||||
end;
|
||||
{$endif FPC_WIDESTRING_EQUAL_UNICODESTRING}
|
||||
|
||||
|
33
tests/test/twrstr9.pp
Normal file
33
tests/test/twrstr9.pp
Normal file
@ -0,0 +1,33 @@
|
||||
{$codepage utf8}
|
||||
|
||||
{$ifdef unix}
|
||||
uses
|
||||
cwstring;
|
||||
{$endif}
|
||||
|
||||
type
|
||||
ts866 = type ansistring(866);
|
||||
var
|
||||
u: unicodestring;
|
||||
s: utf8string;
|
||||
rs: ts866;
|
||||
p: pointer;
|
||||
i: longint;
|
||||
begin
|
||||
DefaultSystemCodePage:=CP_ASCII;
|
||||
s:='§èà£ù';
|
||||
rs:=ts866('Популярные фото');
|
||||
writestr(u,s,1,s,rs);
|
||||
if u <>'§èà£ù1§èà£ùПопулярные фото' then
|
||||
halt(1);
|
||||
getmem(p,length(s)-1);
|
||||
s:='';
|
||||
for i:=1 to (256 div 3) do
|
||||
s:=s+utf8string('㒨');
|
||||
s:=s+utf8string('㒨');
|
||||
{ check that splitting the last 㒨 into two parts during writestr doesn't cause a
|
||||
conversion error }
|
||||
writestr(u,s);
|
||||
if utf8string(u)<>s then
|
||||
halt(2);
|
||||
end.
|
Loading…
Reference in New Issue
Block a user