From a99ffb3097c48959cf7823d4ab7cd057b82755a0 Mon Sep 17 00:00:00 2001 From: paul <paul@idefix.freepascal.org> Date: Tue, 11 Oct 2011 01:21:07 +0000 Subject: [PATCH] compiler: apply patches from Inoussa and Jonas: defcmp: Address code paged' string type comparison taking care of the code page ncnv: Remove un-needed code page comparison to CP_UTF8, some fixes regarding shortstrings and wide char/string ncon: For the case of tstringconstnode.changestringtype (ncon.pas) where the code page are of CP_NONE or 0 no translation is done as : * CP_NONE is compatible to all * For 0 the raw bytes are just copied. My changes: - change ascii2unicode to allow pass source codepage, - convert in both cases when source or destination is UTF8 git-svn-id: trunk@19457 - --- compiler/defcmp.pas | 30 ++++++++++++++------- compiler/nadd.pas | 28 +++++++++++++++++--- compiler/ncnv.pas | 62 +++++++++++++++++++++++++------------------- compiler/ncon.pas | 53 ++++++++++++++++++++++++++++++++++--- compiler/scanner.pas | 8 +++--- compiler/widestr.pas | 6 ++--- 6 files changed, 138 insertions(+), 49 deletions(-) diff --git a/compiler/defcmp.pas b/compiler/defcmp.pas index 564b93ad68..19661d33d5 100644 --- a/compiler/defcmp.pas +++ b/compiler/defcmp.pas @@ -335,20 +335,32 @@ implementation { Constant string } if (fromtreetype=stringconstn) then begin - if (tstringdef(def_from).stringtype=tstringdef(def_to).stringtype) then + if (tstringdef(def_from).stringtype=tstringdef(def_to).stringtype) and + ((tstringdef(def_from).stringtype<>st_ansistring) or + (tstringdef(def_from).encoding=tstringdef(def_to).encoding) + ) then eq:=te_equal else begin doconv:=tc_string_2_string; - { Don't prefer conversions from widestring to a - normal string as we can lose information } - if (tstringdef(def_from).stringtype in [st_widestring,st_unicodestring]) and - not (tstringdef(def_to).stringtype in [st_widestring,st_unicodestring]) then - eq:=te_convert_l3 - else if tstringdef(def_to).stringtype in [st_widestring,st_unicodestring] then - eq:=te_convert_l2 + if (tstringdef(def_from).stringtype = st_ansistring) and + (tstringdef(def_to).stringtype = st_ansistring) then + if (tstringdef(def_to).encoding=globals.CP_UTF8) then + eq:=te_convert_l1 + else + eq:=te_convert_l2 else - eq:=te_convert_l1; + begin + { Don't prefer conversions from widestring to a + normal string as we can lose information } + if (tstringdef(def_from).stringtype in [st_widestring,st_unicodestring]) and + not (tstringdef(def_to).stringtype in [st_widestring,st_unicodestring]) then + eq:=te_convert_l3 + else if tstringdef(def_to).stringtype in [st_widestring,st_unicodestring] then + eq:=te_convert_l2 + else + eq:=te_convert_l1; + end; end; end else if (tstringdef(def_from).stringtype=tstringdef(def_to).stringtype) and diff --git a/compiler/nadd.pas b/compiler/nadd.pas index 97f2eaeef0..57831d30a7 100644 --- a/compiler/nadd.pas +++ b/compiler/nadd.pas @@ -1644,10 +1644,30 @@ implementation end; st_ansistring : begin - if not(is_ansistring(rd)) then - inserttypeconv(right,cansistringtype); - if not(is_ansistring(ld)) then - inserttypeconv(left,cansistringtype); + { use same code page if possible (don't force same code + page in case both are ansistrings with code page <> + CP_NONE, since then data loss can occur (the ansistring + helpers will convert them at run time to an encoding + that can represent both encodings) } + if is_ansistring(ld) and + (tstringdef(ld).encoding<>0) and + (tstringdef(ld).encoding<>globals.CP_NONE) and + (not is_ansistring(rd) or + (tstringdef(rd).encoding=0) or + (tstringdef(rd).encoding=globals.CP_NONE)) then + inserttypeconv(right,ld) + else if is_ansistring(rd) and + (tstringdef(rd).encoding<>0) and + (tstringdef(rd).encoding<>globals.CP_NONE) and + (not is_ansistring(ld) or + (tstringdef(ld).encoding=0) or + (tstringdef(ld).encoding=globals.CP_NONE)) then + inserttypeconv(left,rd) + else + begin + inserttypeconv(left,cansistringtype); + inserttypeconv(right,cansistringtype); + end; end; st_longstring : begin diff --git a/compiler/ncnv.pas b/compiler/ncnv.pas index 4989e34f93..ec6bff85bc 100644 --- a/compiler/ncnv.pas +++ b/compiler/ncnv.pas @@ -1025,23 +1025,18 @@ implementation newblock : tblocknode; newstat : tstatementnode; restemp : ttempcreatenode; - //sa : ansistring; - //cw : WideChar; - //l : SizeUInt; + sa : ansistring; + cw : WideChar; + l : SizeUInt; begin result:=nil; - { we can't do widechar to ansichar conversions at compile time, since } - { this maps all non-ascii chars to '?' -> loses information } - if (left.nodetype=ordconstn) and - ((tstringdef(resultdef).stringtype in [st_widestring,st_unicodestring]) or + ((tstringdef(resultdef).stringtype in [st_widestring,st_unicodestring,st_ansistring]) or (torddef(left.resultdef).ordtype=uchar) or ((torddef(left.resultdef).ordtype=uwidechar) and - (current_settings.sourcecodepage<>CP_UTF8) + (tstringdef(resultdef).stringtype<>st_shortstring) ) - ) - { widechar >=128 is destroyed } - {(tordconstnode(left).value.uvalue<128))} then + ) then begin if (tstringdef(resultdef).stringtype in [st_widestring,st_unicodestring]) then begin @@ -1062,12 +1057,11 @@ implementation hp:=cstringconstnode.createstr(unicode2asciichar(tcompilerwidechar(tordconstnode(left).value.uvalue))) else begin - exit; - {Word(cw):=tcompilerwidechar(tordconstnode(left).value.uvalue); + Word(cw):=tcompilerwidechar(tordconstnode(left).value.uvalue); SetLength(sa,5); l:=UnicodeToUtf8(@(sa[1]),Length(sa),@cw,1); SetLength(sa,l-1); - hp:=cstringconstnode.createstr(sa);} + hp:=cstringconstnode.createstr(sa); end end else @@ -1077,6 +1071,18 @@ implementation result:=hp; end else + if (tstringdef(resultdef).stringtype=st_shortstring) and + (torddef(left.resultdef).ordtype=uwidechar) and + (tcompilerwidechar(tordconstnode(left).value.uvalue) <= 127) + then + begin + SetLength(sa,1); + Byte(sa[1]):= tordconstnode(left).value.uvalue; + hp:=cstringconstnode.createstr(sa); + tstringconstnode(hp).changestringtype(resultdef); + result:=hp; + end + else { shortstrings are handled 'inline' (except for widechars) } if (tstringdef(resultdef).stringtype<>st_shortstring) or (torddef(left.resultdef).ordtype=uwidechar) then @@ -1133,14 +1139,11 @@ implementation begin result:=nil; if (left.nodetype=stringconstn) and - ((tstringdef(resultdef).stringtype=st_shortstring) or - ((tstringdef(resultdef).stringtype=st_ansistring) and + (((tstringdef(resultdef).stringtype=st_ansistring) and (tstringdef(resultdef).encoding<>CP_NONE) ) ) and - ((tstringdef(left.resultdef).stringtype in [st_unicodestring,st_widestring]) and - (current_settings.sourcecodepage<>CP_UTF8) - ) then + (tstringdef(left.resultdef).stringtype in [st_unicodestring,st_widestring]) then begin tstringconstnode(left).changestringtype(resultdef); Result:=left; @@ -1163,7 +1166,18 @@ implementation resultdef ); left:=nil; - end; + end + else if (left.nodetype=stringconstn) and + (tstringdef(left.resultdef).stringtype in [st_unicodestring,st_widestring]) and + (tstringdef(resultdef).stringtype=st_shortstring) then + begin + if not hasnonasciichars(pcompilerwidestring(tstringconstnode(left).value_str)) then + begin + tstringconstnode(left).changestringtype(resultdef); + Result:=left; + left:=nil; + end; + end end; function ttypeconvnode.typecheck_char_to_chararray : tnode; @@ -1190,8 +1204,7 @@ implementation ((torddef(resultdef).ordtype<>uchar) or (torddef(left.resultdef).ordtype<>uwidechar) or (current_settings.sourcecodepage<>CP_UTF8)) - { >= 128 is replaced by '?' currently -> loses information } - {(tordconstnode(left).value.uvalue<128))} then + then begin if (torddef(resultdef).ordtype=uchar) and (torddef(left.resultdef).ordtype=uwidechar) and @@ -2269,11 +2282,8 @@ implementation ( ((not is_widechararray(left.resultdef) and not is_wide_or_unicode_string(left.resultdef)) or - (tstringdef(resultdef).stringtype in [st_widestring,st_unicodestring]) or - (current_settings.sourcecodepage<>CP_UTF8) + (tstringdef(resultdef).stringtype in [st_widestring,st_unicodestring,st_ansistring]) ) - { non-ascii chars would be replaced with '?' -> loses info } - {not hasnonasciichars(pcompilerwidestring(tstringconstnode(left).value_str)))} ) then begin tstringconstnode(left).changestringtype(resultdef); diff --git a/compiler/ncon.pas b/compiler/ncon.pas index 97b3dd9bbe..cb8ed2df71 100644 --- a/compiler/ncon.pas +++ b/compiler/ncon.pas @@ -993,7 +993,7 @@ implementation not(cst_type in [cst_widestring,cst_unicodestring]) then begin initwidestring(pw); - ascii2unicode(value_str,len,pw); + ascii2unicode(value_str,len,current_settings.sourcecodepage,pw); ansistringdispose(value_str,len); pcompilerwidestring(value_str):=pw; end @@ -1035,8 +1035,55 @@ implementation cp2:=tstringdef(resultdef).encoding else if (cst_type in [cst_shortstring,cst_conststring,cst_longstring]) then cp2:=current_settings.sourcecodepage; - if cpavailable(cp1) and cpavailable(cp2) then - changecodepage(value_str,len,cp2,value_str,cp1); + { don't change string if codepages are equal or string length is 0 } + if (cp1<>cp2) and (len>0) then + begin + if cpavailable(cp1) and cpavailable(cp2) then + changecodepage(value_str,len,cp2,value_str,cp1) + else if (cp1 <> CP_NONE) and (cp2 <> CP_NONE) and (cp1 <> 0) and (cp2 <> 0) then + begin + { if source encoding is UTF8 convert using UTF8->UTF16->destination encoding } + if (cp2=CP_UTF8) then + begin + if not cpavailable(cp1) then + Message1(option_code_page_not_available,IntToStr(cp1)); + initwidestring(pw); + setlengthwidestring(pw,len); + l:=Utf8ToUnicode(PUnicodeChar(pw^.data),len,value_str,len); + if (l<>getlengthwidestring(pw)) then + begin + setlengthwidestring(pw,l); + ReAllocMem(value_str,l); + end; + unicode2ascii(pw,value_str,cp1); + donewidestring(pw); + end + else + { if destination encoding is UTF8 convert using source encoding->UTF16->UTF8 } + if (cp1=CP_UTF8) then + begin + if not cpavailable(cp2) then + Message1(option_code_page_not_available,IntToStr(cp2)); + initwidestring(pw); + setlengthwidestring(pw,len); + ascii2unicode(value_str,len,cp2,pw); + l:=UnicodeToUtf8(nil,PUnicodeChar(pw^.data),0); + if l<>len then + ReAllocMem(value_str,l); + len:=l-1; + UnicodeToUtf8(value_str,PUnicodeChar(pw^.data),l); + donewidestring(pw); + end + else + begin + { output error message that encoding is not available for the compiler } + if not cpavailable(cp1) then + Message1(option_code_page_not_available,IntToStr(cp1)); + if not cpavailable(cp2) then + Message1(option_code_page_not_available,IntToStr(cp2)); + end; + end; + end; end; cst_type:=st2cst[tstringdef(def).stringtype]; resultdef:=def; diff --git a/compiler/scanner.pas b/compiler/scanner.pas index ee32261319..749546110e 100644 --- a/compiler/scanner.pas +++ b/compiler/scanner.pas @@ -4208,9 +4208,9 @@ In case not, the value returned can be arbitrary. if not iswidestring then begin if len>0 then - ascii2unicode(@cstringpattern[1],len,patternw) + ascii2unicode(@cstringpattern[1],len,current_settings.sourcecodepage,patternw) else - ascii2unicode(nil,len,patternw); + ascii2unicode(nil,len,current_settings.sourcecodepage,patternw); iswidestring:=true; len:=0; end; @@ -4252,9 +4252,9 @@ In case not, the value returned can be arbitrary. if not iswidestring then begin if len>0 then - ascii2unicode(@cstringpattern[1],len,patternw) + ascii2unicode(@cstringpattern[1],len,current_settings.sourcecodepage,patternw) else - ascii2unicode(nil,len,patternw); + ascii2unicode(nil,len,current_settings.sourcecodepage,patternw); iswidestring:=true; len:=0; end; diff --git a/compiler/widestr.pas b/compiler/widestr.pas index 18646424d5..87701708c8 100644 --- a/compiler/widestr.pas +++ b/compiler/widestr.pas @@ -52,7 +52,7 @@ unit widestr; procedure copywidestring(s,d : pcompilerwidestring); function asciichar2unicode(c : char) : tcompilerwidechar; function unicode2asciichar(c : tcompilerwidechar) : char; - procedure ascii2unicode(p : pchar;l : SizeInt;r : pcompilerwidestring); + procedure ascii2unicode(p : pchar;l : SizeInt;cp : tstringencoding;r : pcompilerwidestring); procedure unicode2ascii(r : pcompilerwidestring;p : pchar;cp : tstringencoding); function hasnonasciichars(const p: pcompilerwidestring): boolean; function getcharwidestring(r : pcompilerwidestring;l : SizeInt) : tcompilerwidechar; @@ -189,14 +189,14 @@ unit widestr; Result := getascii(c,getmap(current_settings.sourcecodepage))[1]; end; - procedure ascii2unicode(p : pchar;l : SizeInt;r : pcompilerwidestring); + procedure ascii2unicode(p : pchar;l : SizeInt;cp : tstringencoding;r : pcompilerwidestring); var source : pchar; dest : tcompilerwidecharptr; i : SizeInt; m : punicodemap; begin - m:=getmap(current_settings.sourcecodepage); + m:=getmap(cp); setlengthwidestring(r,l); source:=p; dest:=tcompilerwidecharptr(r^.data);