From 815cd2b39dadc6b5e8d681739e31cd566c2529f6 Mon Sep 17 00:00:00 2001 From: Jonas Maebe Date: Sun, 12 Aug 2007 20:01:08 +0000 Subject: [PATCH] + support for widestring manager based widechar conversions (widechar<->char, widechar<>*string), based on patch from Rimgaudas Laucius (mantis #7758) * no longer perform compile-time widechar/string->char/ansi/ shortstring conversions if they would destroy information (they can't cope with widechars with ord>=128). This means that you can now properly constant widechars/widestrings in source code with a {$codepage } set without risking that the compiler will mangle everything afterwards * support ESysEINVAL return code from iconv (happens if last multibyte char is incomplete) * fixed writing of widechars (were converted to char -> lost information) git-svn-id: trunk@8274 - --- .gitattributes | 2 + compiler/ncgcnv.pas | 4 +- compiler/ncnv.pas | 62 +++++++++++++---- compiler/widestr.pas | 25 ++++++- rtl/inc/compproc.inc | 8 ++- rtl/inc/text.inc | 8 +-- rtl/inc/wstrings.inc | 60 ++++++++++++++-- rtl/unix/cwstring.pp | 17 ++++- rtl/win32/system.pp | 16 +++-- rtl/win64/system.pp | 16 +++-- tests/webtbs/tw7758.pp | 148 ++++++++++++++++++++++++++++++++++++++++ tests/webtbs/tw7758a.pp | 19 ++++++ 12 files changed, 341 insertions(+), 44 deletions(-) create mode 100644 tests/webtbs/tw7758.pp create mode 100644 tests/webtbs/tw7758a.pp diff --git a/.gitattributes b/.gitattributes index 21897b3ba1..00632afcd0 100644 --- a/.gitattributes +++ b/.gitattributes @@ -8253,6 +8253,8 @@ tests/webtbs/tw7643.pp svneol=native#text/plain tests/webtbs/tw7679.pp svneol=native#text/plain tests/webtbs/tw7719.pp svneol=native#text/plain tests/webtbs/tw7756.pp svneol=native#text/plain +tests/webtbs/tw7758.pp svneol=native#text/plain +tests/webtbs/tw7758a.pp svneol=native#text/plain tests/webtbs/tw7803.pp svneol=native#text/plain tests/webtbs/tw7806.pp svneol=native#text/plain tests/webtbs/tw7808.pp svneol=native#text/plain diff --git a/compiler/ncgcnv.pas b/compiler/ncgcnv.pas index 36a56d88d6..1c5757f482 100644 --- a/compiler/ncgcnv.pas +++ b/compiler/ncgcnv.pas @@ -514,9 +514,7 @@ interface procedure tcgtypeconvnode.second_char_to_char; begin - {$warning todo: add RTL routine for widechar-char conversion } - { Quick hack to at least generate 'working' code (PFV) } - second_int_to_int; + internalerror(2007081202); end; diff --git a/compiler/ncnv.pas b/compiler/ncnv.pas index 2bfa419881..4b90c3d5ca 100644 --- a/compiler/ncnv.pas +++ b/compiler/ncnv.pas @@ -875,12 +875,16 @@ implementation begin result:=nil; - if left.nodetype=stringconstn then - begin + if (left.nodetype=stringconstn) and + ((tstringdef(left.resultdef).stringtype<>st_widestring) or + (tstringdef(resultdef).stringtype=st_widestring) or + { non-ascii chars would be replaced with '?' -> loses info } + not hasnonasciichars(pcompilerwidestring(tstringconstnode(left).value_str))) then + begin tstringconstnode(left).changestringtype(resultdef); result:=left; left:=nil; - end + end else begin { get the correct procedure name } @@ -913,7 +917,13 @@ implementation begin result:=nil; - if left.nodetype=ordconstn then + { we can't do widechar to ansichar conversions at compile time, since } + { this maps all non-ascii chars to '?' -> loses information } + if (left.nodetype=ordconstn) and + ((tstringdef(resultdef).stringtype=st_widestring) or + (torddef(left.resultdef).ordtype=uchar) or + { >=128 is destroyed } + (tordconstnode(left).value.uvalue<128)) then begin if tstringdef(resultdef).stringtype=st_widestring then begin @@ -927,22 +937,30 @@ implementation end else begin - hp:=cstringconstnode.createstr(chr(tordconstnode(left).value.uvalue)); + if torddef(left.resultdef).ordtype=uwidechar then + hp:=cstringconstnode.createstr(unicode2asciichar(tcompilerwidechar(tordconstnode(left).value.uvalue))) + else + hp:=cstringconstnode.createstr(chr(tordconstnode(left).value.uvalue)); tstringconstnode(hp).changestringtype(resultdef); end; result:=hp; end else - { shortstrings are handled 'inline' } - if tstringdef(resultdef).stringtype <> st_shortstring then + { shortstrings are handled 'inline' (except for widechars) } + if (tstringdef(resultdef).stringtype <> st_shortstring) or + (torddef(left.resultdef).ordtype = uwidechar) then begin - { create the parameter } + { create the procname } + if torddef(left.resultdef).ordtype<>uwidechar then + procname := 'fpc_char_to_' + else + procname := 'fpc_wchar_to_'; + procname:=procname+tstringdef(resultdef).stringtypname; + + { and the parameter } para := ccallparanode.create(left,nil); left := nil; - { and the procname } - procname := 'fpc_char_to_' +tstringdef(resultdef).stringtypname; - { and finally the call } result := ccallnode.createinternres(procname,para,resultdef); end @@ -987,7 +1005,11 @@ implementation begin result:=nil; - if left.nodetype=ordconstn then + if (left.nodetype=ordconstn) and + ((torddef(resultdef).ordtype<>uchar) or + (torddef(left.resultdef).ordtype<>uwidechar) or + { >= 128 is replaced by '?' currently -> loses information } + (tordconstnode(left).value.uvalue<128)) then begin if (torddef(resultdef).ordtype=uchar) and (torddef(left.resultdef).ordtype=uwidechar) then @@ -2248,9 +2270,21 @@ implementation function ttypeconvnode.first_char_to_char : tnode; - + var + fname: string[18]; begin - first_char_to_char:=first_int_to_int; + if (torddef(resultdef).ordtype=uchar) and + (torddef(left.resultdef).ordtype=uwidechar) then + fname := 'fpc_wchar_to_char' + else if (torddef(resultdef).ordtype=uwidechar) and + (torddef(left.resultdef).ordtype=uchar) then + fname := 'fpc_char_to_wchar' + else + internalerror(2007081201); + + result := ccallnode.createintern(fname,ccallparanode.create(left,nil)); + left:=nil; + firstpass(result); end; diff --git a/compiler/widestr.pas b/compiler/widestr.pas index 8736f78d59..5bddf2a0cd 100644 --- a/compiler/widestr.pas +++ b/compiler/widestr.pas @@ -55,6 +55,7 @@ unit widestr; function unicode2asciichar(c : tcompilerwidechar) : char; procedure ascii2unicode(p : pchar;l : SizeInt;r : pcompilerwidestring); procedure unicode2ascii(r : pcompilerwidestring;p : pchar); + function hasnonasciichars(const p: pcompilerwidestring): boolean; function getcharwidestring(r : pcompilerwidestring;l : SizeInt) : tcompilerwidechar; function cpavailable(const s : string) : boolean; @@ -166,10 +167,11 @@ unit widestr; end; function unicode2asciichar(c : tcompilerwidechar) : char; - begin -{$warning TODO unicode2asciichar} - unicode2asciichar:=#0; + if word(c)<128 then + unicode2asciichar:=char(word(c)) + else + unicode2asciichar:='?'; end; procedure ascii2unicode(p : pchar;l : SizeInt;r : pcompilerwidestring); @@ -242,6 +244,23 @@ unit widestr; end; + function hasnonasciichars(const p: pcompilerwidestring): boolean; + var + source : tcompilerwidecharptr; + i : longint; + begin + source:=tcompilerwidecharptr(p^.data); + result:=true; + for i:=1 to p^.len do + begin + if word(source^)>=128 then + exit; + inc(source); + end; + result:=false; + end; + + function cpavailable(const s : string) : boolean; begin cpavailable:=mappingavailable(s); diff --git a/rtl/inc/compproc.inc b/rtl/inc/compproc.inc index 5caa740f9c..3f72575539 100644 --- a/rtl/inc/compproc.inc +++ b/rtl/inc/compproc.inc @@ -187,6 +187,7 @@ Procedure fpc_AnsiStr_ShortStr_Concat (Var S1: AnsiString; Var S2 : ShortString) function fpc_AnsiStr_To_ShortStr (high_of_res: SizeInt;const S2 : Ansistring): shortstring; compilerproc; Function fpc_ShortStr_To_AnsiStr (Const S2 : ShortString): ansistring; compilerproc; Function fpc_Char_To_AnsiStr(const c : Char): AnsiString; compilerproc; + Function fpc_PChar_To_AnsiStr(const p : pchar): ansistring; compilerproc; Function fpc_CharArray_To_AnsiStr(const arr: array of char; zerobased: boolean = true): ansistring; compilerproc; {$ifndef FPC_STRTOCHARARRAYPROC} @@ -224,7 +225,7 @@ function fpc_WideStr_Concat_multi (const sarr:array of Widestring): widestring; Procedure fpc_WideStr_Concat (Var DestS : Widestring;const S1,S2 : WideString); compilerproc; Procedure fpc_WideStr_Concat_multi (Var DestS : Widestring;const sarr:array of Widestring); compilerproc; {$endif STR_CONCAT_PROCS} -Function fpc_Char_To_WideStr(const c : WideChar): WideString; compilerproc; +Function fpc_Char_To_WideStr(const c : Char): WideString; compilerproc; Function fpc_PChar_To_WideStr(const p : pchar): WideString; compilerproc; Function fpc_CharArray_To_WideStr(const arr: array of char; zerobased: boolean = true): WideString; compilerproc; {$ifndef FPC_STRTOCHARARRAYPROC} @@ -250,6 +251,11 @@ Function fpc_widestr_Copy (Const S : WideString; Index,Size : SizeInt) : WideSt {$ifndef FPC_WINLIKEWIDESTRING} function fpc_widestr_Unique(Var S : Pointer): Pointer; compilerproc; {$endif FPC_WINLIKEWIDESTRING} +Function fpc_Char_To_WChar(const c : Char): WideChar; compilerproc; +Function fpc_WChar_To_Char(const c : WideChar): Char; compilerproc; +Function fpc_WChar_To_ShortStr(const c : WideChar): ShortString; compilerproc; +Function fpc_WChar_To_AnsiStr(const c : WideChar): AnsiString; compilerproc; +Function fpc_WChar_To_WideStr(const c : WideChar): WideString; compilerproc; {$endif FPC_HAS_FEATURE_WIDESTRINGS} {$ifdef FPC_HAS_FEATURE_WIDESTRINGS} diff --git a/rtl/inc/text.inc b/rtl/inc/text.inc index fcd1706153..7fe4418823 100644 --- a/rtl/inc/text.inc +++ b/rtl/inc/text.inc @@ -832,7 +832,7 @@ End; Procedure fpc_Write_Text_WideChar(Len : Longint;var t : Text;c : WideChar); iocheck; [Public,Alias:'FPC_WRITE_TEXT_WIDECHAR']; compilerproc; var - ch : char; + a : ansistring; Begin If (InOutRes<>0) then exit; @@ -848,9 +848,9 @@ Begin fpc_WriteBlanks(t,Len-1); If TextRec(t).BufPos>=TextRec(t).BufSize Then FileFunc(TextRec(t).InOutFunc)(TextRec(t)); - ch:=c; - TextRec(t).Bufptr^[TextRec(t).BufPos]:=ch; - Inc(TextRec(t).BufPos); + { a widechar can be translated into more than a single ansichar } + a:=c; + fpc_WriteBuffer(t,pchar(a)^,length(a)); End; diff --git a/rtl/inc/wstrings.inc b/rtl/inc/wstrings.inc index 6c1a09810a..09bf82ecc1 100644 --- a/rtl/inc/wstrings.inc +++ b/rtl/inc/wstrings.inc @@ -592,15 +592,21 @@ end; {$endif STR_CONCAT_PROCS} +Function fpc_Char_To_WChar(const c : Char): WideChar; compilerproc; +var + w: widestring; +begin + widestringmanager.Ansi2WideMoveProc(@c, w, 1); + fpc_Char_To_WChar:= w[1]; +end; + -Function fpc_Char_To_WideStr(const c : WideChar): WideString; compilerproc; + +Function fpc_Char_To_WideStr(const c : Char): WideString; compilerproc; { Converts a Char to a WideString; } begin - if c = #0 then - { result is automatically set to '' } - exit; Setlength(fpc_Char_To_WideStr,1); fpc_Char_To_WideStr[1]:=c; { Terminating Zero } @@ -608,6 +614,52 @@ begin end; +Function fpc_WChar_To_Char(const c : WideChar): Char; compilerproc; +{ + Converts a WideChar to a Char; +} +var + s: ansistring; +begin + widestringmanager.Wide2AnsiMoveProc(@c, s, 1); + if length(s)=1 then + fpc_WChar_To_Char:= s[1] + else + fpc_WChar_To_Char:='?'; +end; + + +Function fpc_WChar_To_WideStr(const c : WideChar): WideString; compilerproc; +{ + Converts a WideChar to a WideString; +} +begin + Setlength (fpc_WChar_To_WideStr,1); + fpc_WChar_To_WideStr[1]:= c; +end; + + +Function fpc_WChar_To_AnsiStr(const c : WideChar): AnsiString; compilerproc; +{ + Converts a WideChar to a AnsiString; +} +begin + widestringmanager.Wide2AnsiMoveProc(@c, fpc_WChar_To_AnsiStr, 1); +end; + + +Function fpc_WChar_To_ShortStr(const c : WideChar): ShortString; compilerproc; +{ + Converts a WideChar to a ShortString; +} +var + s: ansistring; +begin + widestringmanager.Wide2AnsiMoveProc(@c, s, 1); + fpc_WChar_To_ShortStr:= s; +end; + + Function fpc_PChar_To_WideStr(const p : pchar): WideString; compilerproc; Var L : SizeInt; diff --git a/rtl/unix/cwstring.pp b/rtl/unix/cwstring.pp index e9c47ea388..6a68be8bbf 100644 --- a/rtl/unix/cwstring.pp +++ b/rtl/unix/cwstring.pp @@ -122,6 +122,7 @@ procedure Wide2AnsiMove(source:pwidechar;var dest:ansistring;len:SizeInt); destpos: pchar; mynil : pchar; my0 : size_t; + err: cint; begin mynil:=nil; my0:=0; @@ -134,7 +135,11 @@ procedure Wide2AnsiMove(source:pwidechar;var dest:ansistring;len:SizeInt); outleft:=outlength; while iconv(iconv_wide2ansi,ppchar(@srcpos),@srclen,@destpos,@outleft)=size_t(-1) do begin - case fpgetCerrno of + err:=fpgetCerrno; + case err of + { last character is incomplete sequence } + ESysEINVAL, + { incomplete sequence in the middle } ESysEILSEQ: begin { skip and set to '?' } @@ -145,6 +150,8 @@ procedure Wide2AnsiMove(source:pwidechar;var dest:ansistring;len:SizeInt); dec(outleft); { reset } iconv(iconv_wide2ansi,@mynil,@my0,@mynil,@my0); + if err=ESysEINVAL then + break; end; ESysE2BIG: begin @@ -174,19 +181,21 @@ procedure Ansi2WideMove(source:pchar;var dest:widestring;len:SizeInt); destpos: pchar; mynil : pchar; my0 : size_t; + err: cint; begin mynil:=nil; my0:=0; // extra space outlength:=len+1; setlength(dest,outlength); - outlength:=len+1; srcpos:=source; destpos:=pchar(dest); outleft:=outlength*2; while iconv(iconv_ansi2wide,@srcpos,psize(@len),@destpos,@outleft)=size_t(-1) do begin - case fpgetCerrno of + err:=fpgetCerrno; + case err of + ESysEINVAL, ESysEILSEQ: begin { skip and set to '?' } @@ -197,6 +206,8 @@ procedure Ansi2WideMove(source:pchar;var dest:widestring;len:SizeInt); dec(outleft,2); { reset } iconv(iconv_ansi2wide,@mynil,@my0,@mynil,@my0); + if err=ESysEINVAL then + break; end; ESysE2BIG: begin diff --git a/rtl/win32/system.pp b/rtl/win32/system.pp index c74464e7e0..3352e813c0 100644 --- a/rtl/win32/system.pp +++ b/rtl/win32/system.pp @@ -926,9 +926,11 @@ procedure Win32Wide2AnsiMove(source:pwidechar;var dest:ansistring;len:SizeInt); destlen: SizeInt; begin // retrieve length including trailing #0 - destlen:=WideCharToMultiByte(CP_ACP, 0, source, len+1, nil, 0, nil, nil); - setlength(dest, destlen-1); - WideCharToMultiByte(CP_ACP, 0, source, len+1, @dest[1], destlen, nil, nil); + // not anymore, because this must also be usable for single characters + destlen:=WideCharToMultiByte(CP_ACP, 0, source, len, nil, 0, nil, nil); + // this will null-terminate + setlength(dest, destlen); + WideCharToMultiByte(CP_ACP, 0, source, len, @dest[1], destlen, nil, nil); end; procedure Win32Ansi2WideMove(source:pchar;var dest:widestring;len:SizeInt); @@ -936,9 +938,11 @@ procedure Win32Ansi2WideMove(source:pchar;var dest:widestring;len:SizeInt); destlen: SizeInt; begin // retrieve length including trailing #0 - destlen:=MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, source, len+1, nil, 0); - setlength(dest, destlen-1); - MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, source, len+1, @dest[1], destlen); + // not anymore, because this must also be usable for single characters + destlen:=MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, source, len, nil, 0); + // this will null-terminate + setlength(dest, destlen); + MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, source, len, @dest[1], destlen); end; diff --git a/rtl/win64/system.pp b/rtl/win64/system.pp index dd70b8ccfc..0c08e7cdfc 100644 --- a/rtl/win64/system.pp +++ b/rtl/win64/system.pp @@ -968,9 +968,11 @@ procedure Win32Wide2AnsiMove(source:pwidechar;var dest:ansistring;len:SizeInt); destlen: SizeInt; begin // retrieve length including trailing #0 - destlen:=WideCharToMultiByte(CP_ACP, 0, source, len+1, nil, 0, nil, nil); - setlength(dest, destlen-1); - WideCharToMultiByte(CP_ACP, 0, source, len+1, @dest[1], destlen, nil, nil); + // not anymore, because this must also be usable for single characters + destlen:=WideCharToMultiByte(CP_ACP, 0, source, len, nil, 0, nil, nil); + // this will null-terminate + setlength(dest, destlen); + WideCharToMultiByte(CP_ACP, 0, source, len, @dest[1], destlen, nil, nil); end; procedure Win32Ansi2WideMove(source:pchar;var dest:widestring;len:SizeInt); @@ -978,9 +980,11 @@ procedure Win32Ansi2WideMove(source:pchar;var dest:widestring;len:SizeInt); destlen: SizeInt; begin // retrieve length including trailing #0 - destlen:=MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, source, len+1, nil, 0); - setlength(dest, destlen-1); - MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, source, len+1, @dest[1], destlen); + // not anymore, because this must also be usable for single characters + destlen:=MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, source, len, nil, 0); + // this will null-terminate + setlength(dest, destlen); + MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, source, len, @dest[1], destlen); end; diff --git a/tests/webtbs/tw7758.pp b/tests/webtbs/tw7758.pp new file mode 100644 index 0000000000..56b1424987 --- /dev/null +++ b/tests/webtbs/tw7758.pp @@ -0,0 +1,148 @@ +{$codepage utf8} + +uses +{$ifdef unix} + cwstring, +{$endif} + sysutils; + +const + cwc=widechar('a'); + c2=widechar('é'); + c3=widestring('é'); +var + c: char; + wc,wc2: widechar; + s,s2,a: ansistring; + w: widestring; + ss: shortstring; +begin + c:=#0; + w:=c; + if (length(w)<>1) or + (w[1]<>#0) then + halt(1); + s:='é'; + w:=s; + wc:=w[1]; + s2:=wc; + if (w <> s2) or + (s <> s2) then + halt(2); + + c:=#0; + wc:=c; + c:=wc; + if (c<>#0) or + (wc<>#0) then + halt(5); + ss:=wc; + wc:=ss[1]; + if (length(ss)<>1) or + (ss[1]<>#0) or + (wc<>#0) then + halt(6); + a:=wc; + wc:=a[1]; + if (length(a)<>1) or + (a[1]<>#0) or + (wc<>#0) then + halt(7); + + c:='a'; + wc:=c; + c:=wc; + if (c<>'a') or + (wc<>'a') then + halt(8); + ss:=wc; + wc:=ss[1]; + if (length(ss)<>1) or + (ss[1]<>'a') or + (wc<>'a') then + halt(9); + a:=wc; + wc:=a[1]; + if (length(a)<>1) or + (a[1]<>'a') or + (wc<>'a') then + halt(10); + + wc2:=cwc; + if (wc2<>'a') or + (wc2<>cwc) then + halt(3); + ss:=cwc; + if (length(ss)<>1) or + (ss[1] <> 'a') then + halt(4); + c:=cwc; + if (c<>'a') or + (c<>cwc) then + halt(13); + w:=cwc; + if (length(w)<>1) or + (w[1] <> 'a') then + halt(11); + s:=cwc; + if (length(s)<>1) or + (s[1] <> 'a') then + halt(12); + + + wc:=c2; + c:=c2; + wc2:=c; + if ((c<>c2) and + (c<>'?')) or + (wc<>c2) or + ((wc2<>c2) and + (wc2<>'?')) then + halt(14); + ss:=c2; + w:=ss; + wc:=w[1]; + if (length(w)<>1) or + (w[1]<>c2) or + (wc<>c2) then + halt(15); + a:=c2; + w:=a; + wc:=w[1]; + if (length(w)<>1) or + (w[1]<>c2) or + (wc<>c2) then + halt(16); + + ss:=c3; + w:=ss; + wc:=w[1]; + if (length(w)<>1) or + (wc <> c2) then + halt(17); + c:=c3[1]; + if ((c<>c2) and + (c<>'?')) then + halt(18); + w:=c3; + if (length(w)<>1) or + (w[1] <> c2) then + halt(19); + s:=c3; + w:=s; + if (length(w)<>1) or + (w[1] <> c2) then + halt(20); + ss:=c3; + w:=ss; + if (length(w)<>1) or + (w[1] <> c2) then + halt(21); + + wc:=c2; + writestr(s,wc); + w:=s; + if (length(w)<>1) or + (w[1]<>c2) then + halt(22); +end. diff --git a/tests/webtbs/tw7758a.pp b/tests/webtbs/tw7758a.pp new file mode 100644 index 0000000000..2530de1808 --- /dev/null +++ b/tests/webtbs/tw7758a.pp @@ -0,0 +1,19 @@ +{ %norun } + +uses + {$ifdef unix} + cwstring, + {$endif} + sysutils; + +{ just to make sure that no all wide->shortstring compile time conversions } +{ fail, but only those resulting in data loss } +const + cw = widestring('abc'); + de = 'a'+shortstring(cw); + wc = widechar('a'); + df = shortstring(wc)+'abcd'; + dg = char(wc)+'abcd'; + +begin +end.