compiler: apply patches from Inoussa and Jonas:

defcmp: Address code paged' string type comparison taking care of the code page
  ncnv: Remove un-needed code page comparison to CP_UTF8, some fixes regarding shortstrings and wide char/string 
  ncon: For the case of tstringconstnode.changestringtype (ncon.pas) where the code page are of CP_NONE or 0 no translation is done as :
    * CP_NONE is compatible to all
    * For 0 the raw bytes are just copied.
 My changes:
  - change ascii2unicode to allow pass source codepage, 
  - convert in both cases when source or destination is UTF8

git-svn-id: trunk@19457 -
This commit is contained in:
paul 2011-10-11 01:21:07 +00:00
parent cf81d39682
commit a99ffb3097
6 changed files with 138 additions and 49 deletions

View File

@ -335,20 +335,32 @@ implementation
{ Constant string }
if (fromtreetype=stringconstn) then
begin
if (tstringdef(def_from).stringtype=tstringdef(def_to).stringtype) then
if (tstringdef(def_from).stringtype=tstringdef(def_to).stringtype) and
((tstringdef(def_from).stringtype<>st_ansistring) or
(tstringdef(def_from).encoding=tstringdef(def_to).encoding)
) then
eq:=te_equal
else
begin
doconv:=tc_string_2_string;
{ Don't prefer conversions from widestring to a
normal string as we can lose information }
if (tstringdef(def_from).stringtype in [st_widestring,st_unicodestring]) and
not (tstringdef(def_to).stringtype in [st_widestring,st_unicodestring]) then
eq:=te_convert_l3
else if tstringdef(def_to).stringtype in [st_widestring,st_unicodestring] then
eq:=te_convert_l2
if (tstringdef(def_from).stringtype = st_ansistring) and
(tstringdef(def_to).stringtype = st_ansistring) then
if (tstringdef(def_to).encoding=globals.CP_UTF8) then
eq:=te_convert_l1
else
eq:=te_convert_l2
else
eq:=te_convert_l1;
begin
{ Don't prefer conversions from widestring to a
normal string as we can lose information }
if (tstringdef(def_from).stringtype in [st_widestring,st_unicodestring]) and
not (tstringdef(def_to).stringtype in [st_widestring,st_unicodestring]) then
eq:=te_convert_l3
else if tstringdef(def_to).stringtype in [st_widestring,st_unicodestring] then
eq:=te_convert_l2
else
eq:=te_convert_l1;
end;
end;
end
else if (tstringdef(def_from).stringtype=tstringdef(def_to).stringtype) and

View File

@ -1644,10 +1644,30 @@ implementation
end;
st_ansistring :
begin
if not(is_ansistring(rd)) then
inserttypeconv(right,cansistringtype);
if not(is_ansistring(ld)) then
inserttypeconv(left,cansistringtype);
{ use same code page if possible (don't force same code
page in case both are ansistrings with code page <>
CP_NONE, since then data loss can occur (the ansistring
helpers will convert them at run time to an encoding
that can represent both encodings) }
if is_ansistring(ld) and
(tstringdef(ld).encoding<>0) and
(tstringdef(ld).encoding<>globals.CP_NONE) and
(not is_ansistring(rd) or
(tstringdef(rd).encoding=0) or
(tstringdef(rd).encoding=globals.CP_NONE)) then
inserttypeconv(right,ld)
else if is_ansistring(rd) and
(tstringdef(rd).encoding<>0) and
(tstringdef(rd).encoding<>globals.CP_NONE) and
(not is_ansistring(ld) or
(tstringdef(ld).encoding=0) or
(tstringdef(ld).encoding=globals.CP_NONE)) then
inserttypeconv(left,rd)
else
begin
inserttypeconv(left,cansistringtype);
inserttypeconv(right,cansistringtype);
end;
end;
st_longstring :
begin

View File

@ -1025,23 +1025,18 @@ implementation
newblock : tblocknode;
newstat : tstatementnode;
restemp : ttempcreatenode;
//sa : ansistring;
//cw : WideChar;
//l : SizeUInt;
sa : ansistring;
cw : WideChar;
l : SizeUInt;
begin
result:=nil;
{ we can't do widechar to ansichar conversions at compile time, since }
{ this maps all non-ascii chars to '?' -> loses information }
if (left.nodetype=ordconstn) and
((tstringdef(resultdef).stringtype in [st_widestring,st_unicodestring]) or
((tstringdef(resultdef).stringtype in [st_widestring,st_unicodestring,st_ansistring]) or
(torddef(left.resultdef).ordtype=uchar) or
((torddef(left.resultdef).ordtype=uwidechar) and
(current_settings.sourcecodepage<>CP_UTF8)
(tstringdef(resultdef).stringtype<>st_shortstring)
)
)
{ widechar >=128 is destroyed }
{(tordconstnode(left).value.uvalue<128))} then
) then
begin
if (tstringdef(resultdef).stringtype in [st_widestring,st_unicodestring]) then
begin
@ -1062,12 +1057,11 @@ implementation
hp:=cstringconstnode.createstr(unicode2asciichar(tcompilerwidechar(tordconstnode(left).value.uvalue)))
else
begin
exit;
{Word(cw):=tcompilerwidechar(tordconstnode(left).value.uvalue);
Word(cw):=tcompilerwidechar(tordconstnode(left).value.uvalue);
SetLength(sa,5);
l:=UnicodeToUtf8(@(sa[1]),Length(sa),@cw,1);
SetLength(sa,l-1);
hp:=cstringconstnode.createstr(sa);}
hp:=cstringconstnode.createstr(sa);
end
end
else
@ -1077,6 +1071,18 @@ implementation
result:=hp;
end
else
if (tstringdef(resultdef).stringtype=st_shortstring) and
(torddef(left.resultdef).ordtype=uwidechar) and
(tcompilerwidechar(tordconstnode(left).value.uvalue) <= 127)
then
begin
SetLength(sa,1);
Byte(sa[1]):= tordconstnode(left).value.uvalue;
hp:=cstringconstnode.createstr(sa);
tstringconstnode(hp).changestringtype(resultdef);
result:=hp;
end
else
{ shortstrings are handled 'inline' (except for widechars) }
if (tstringdef(resultdef).stringtype<>st_shortstring) or
(torddef(left.resultdef).ordtype=uwidechar) then
@ -1133,14 +1139,11 @@ implementation
begin
result:=nil;
if (left.nodetype=stringconstn) and
((tstringdef(resultdef).stringtype=st_shortstring) or
((tstringdef(resultdef).stringtype=st_ansistring) and
(((tstringdef(resultdef).stringtype=st_ansistring) and
(tstringdef(resultdef).encoding<>CP_NONE)
)
) and
((tstringdef(left.resultdef).stringtype in [st_unicodestring,st_widestring]) and
(current_settings.sourcecodepage<>CP_UTF8)
) then
(tstringdef(left.resultdef).stringtype in [st_unicodestring,st_widestring]) then
begin
tstringconstnode(left).changestringtype(resultdef);
Result:=left;
@ -1163,7 +1166,18 @@ implementation
resultdef
);
left:=nil;
end;
end
else if (left.nodetype=stringconstn) and
(tstringdef(left.resultdef).stringtype in [st_unicodestring,st_widestring]) and
(tstringdef(resultdef).stringtype=st_shortstring) then
begin
if not hasnonasciichars(pcompilerwidestring(tstringconstnode(left).value_str)) then
begin
tstringconstnode(left).changestringtype(resultdef);
Result:=left;
left:=nil;
end;
end
end;
function ttypeconvnode.typecheck_char_to_chararray : tnode;
@ -1190,8 +1204,7 @@ implementation
((torddef(resultdef).ordtype<>uchar) or
(torddef(left.resultdef).ordtype<>uwidechar) or
(current_settings.sourcecodepage<>CP_UTF8))
{ >= 128 is replaced by '?' currently -> loses information }
{(tordconstnode(left).value.uvalue<128))} then
then
begin
if (torddef(resultdef).ordtype=uchar) and
(torddef(left.resultdef).ordtype=uwidechar) and
@ -2269,11 +2282,8 @@ implementation
(
((not is_widechararray(left.resultdef) and
not is_wide_or_unicode_string(left.resultdef)) or
(tstringdef(resultdef).stringtype in [st_widestring,st_unicodestring]) or
(current_settings.sourcecodepage<>CP_UTF8)
(tstringdef(resultdef).stringtype in [st_widestring,st_unicodestring,st_ansistring])
)
{ non-ascii chars would be replaced with '?' -> loses info }
{not hasnonasciichars(pcompilerwidestring(tstringconstnode(left).value_str)))}
) then
begin
tstringconstnode(left).changestringtype(resultdef);

View File

@ -993,7 +993,7 @@ implementation
not(cst_type in [cst_widestring,cst_unicodestring]) then
begin
initwidestring(pw);
ascii2unicode(value_str,len,pw);
ascii2unicode(value_str,len,current_settings.sourcecodepage,pw);
ansistringdispose(value_str,len);
pcompilerwidestring(value_str):=pw;
end
@ -1035,8 +1035,55 @@ implementation
cp2:=tstringdef(resultdef).encoding
else if (cst_type in [cst_shortstring,cst_conststring,cst_longstring]) then
cp2:=current_settings.sourcecodepage;
if cpavailable(cp1) and cpavailable(cp2) then
changecodepage(value_str,len,cp2,value_str,cp1);
{ don't change string if codepages are equal or string length is 0 }
if (cp1<>cp2) and (len>0) then
begin
if cpavailable(cp1) and cpavailable(cp2) then
changecodepage(value_str,len,cp2,value_str,cp1)
else if (cp1 <> CP_NONE) and (cp2 <> CP_NONE) and (cp1 <> 0) and (cp2 <> 0) then
begin
{ if source encoding is UTF8 convert using UTF8->UTF16->destination encoding }
if (cp2=CP_UTF8) then
begin
if not cpavailable(cp1) then
Message1(option_code_page_not_available,IntToStr(cp1));
initwidestring(pw);
setlengthwidestring(pw,len);
l:=Utf8ToUnicode(PUnicodeChar(pw^.data),len,value_str,len);
if (l<>getlengthwidestring(pw)) then
begin
setlengthwidestring(pw,l);
ReAllocMem(value_str,l);
end;
unicode2ascii(pw,value_str,cp1);
donewidestring(pw);
end
else
{ if destination encoding is UTF8 convert using source encoding->UTF16->UTF8 }
if (cp1=CP_UTF8) then
begin
if not cpavailable(cp2) then
Message1(option_code_page_not_available,IntToStr(cp2));
initwidestring(pw);
setlengthwidestring(pw,len);
ascii2unicode(value_str,len,cp2,pw);
l:=UnicodeToUtf8(nil,PUnicodeChar(pw^.data),0);
if l<>len then
ReAllocMem(value_str,l);
len:=l-1;
UnicodeToUtf8(value_str,PUnicodeChar(pw^.data),l);
donewidestring(pw);
end
else
begin
{ output error message that encoding is not available for the compiler }
if not cpavailable(cp1) then
Message1(option_code_page_not_available,IntToStr(cp1));
if not cpavailable(cp2) then
Message1(option_code_page_not_available,IntToStr(cp2));
end;
end;
end;
end;
cst_type:=st2cst[tstringdef(def).stringtype];
resultdef:=def;

View File

@ -4208,9 +4208,9 @@ In case not, the value returned can be arbitrary.
if not iswidestring then
begin
if len>0 then
ascii2unicode(@cstringpattern[1],len,patternw)
ascii2unicode(@cstringpattern[1],len,current_settings.sourcecodepage,patternw)
else
ascii2unicode(nil,len,patternw);
ascii2unicode(nil,len,current_settings.sourcecodepage,patternw);
iswidestring:=true;
len:=0;
end;
@ -4252,9 +4252,9 @@ In case not, the value returned can be arbitrary.
if not iswidestring then
begin
if len>0 then
ascii2unicode(@cstringpattern[1],len,patternw)
ascii2unicode(@cstringpattern[1],len,current_settings.sourcecodepage,patternw)
else
ascii2unicode(nil,len,patternw);
ascii2unicode(nil,len,current_settings.sourcecodepage,patternw);
iswidestring:=true;
len:=0;
end;

View File

@ -52,7 +52,7 @@ unit widestr;
procedure copywidestring(s,d : pcompilerwidestring);
function asciichar2unicode(c : char) : tcompilerwidechar;
function unicode2asciichar(c : tcompilerwidechar) : char;
procedure ascii2unicode(p : pchar;l : SizeInt;r : pcompilerwidestring);
procedure ascii2unicode(p : pchar;l : SizeInt;cp : tstringencoding;r : pcompilerwidestring);
procedure unicode2ascii(r : pcompilerwidestring;p : pchar;cp : tstringencoding);
function hasnonasciichars(const p: pcompilerwidestring): boolean;
function getcharwidestring(r : pcompilerwidestring;l : SizeInt) : tcompilerwidechar;
@ -189,14 +189,14 @@ unit widestr;
Result := getascii(c,getmap(current_settings.sourcecodepage))[1];
end;
procedure ascii2unicode(p : pchar;l : SizeInt;r : pcompilerwidestring);
procedure ascii2unicode(p : pchar;l : SizeInt;cp : tstringencoding;r : pcompilerwidestring);
var
source : pchar;
dest : tcompilerwidecharptr;
i : SizeInt;
m : punicodemap;
begin
m:=getmap(current_settings.sourcecodepage);
m:=getmap(cp);
setlengthwidestring(r,l);
source:=p;
dest:=tcompilerwidecharptr(r^.data);