compiler: apply patches from Inoussa and Jonas:

defcmp: Address code paged' string type comparison taking care of the code page ncnv: Remove un-needed code page comparison to CP_UTF8, some fixes regarding shortstrings and wide char/string ncon: For the case of tstringconstnode.changestringtype (ncon.pas) where the code page are of CP_NONE or 0 no translation is done as : * CP_NONE is compatible to all * For 0 the raw bytes are just copied. My changes: - change ascii2unicode to allow pass source codepage, - convert in both cases when source or destination is UTF8 git-svn-id: trunk@19457 -
2025-04-13 09:59:25 +02:00 · 2011-10-11 01:21:07 +00:00 · 2011-10-11 01:21:07 +00:00 · a99ffb3097
commit a99ffb3097
parent cf81d39682
6 changed files with 138 additions and 49 deletions
--- a/compiler/defcmp.pas
+++ b/compiler/defcmp.pas
@ -335,20 +335,32 @@ implementation
                     { Constant string }
                     if (fromtreetype=stringconstn) then
                      begin
-                        if (tstringdef(def_from).stringtype=tstringdef(def_to).stringtype) then
+                        if (tstringdef(def_from).stringtype=tstringdef(def_to).stringtype) and
+                           ((tstringdef(def_from).stringtype<>st_ansistring) or
+                            (tstringdef(def_from).encoding=tstringdef(def_to).encoding)
+                           ) then
                          eq:=te_equal
                        else
                         begin
                           doconv:=tc_string_2_string;
-                           { Don't prefer conversions from widestring to a
-                             normal string as we can lose information }
-                           if (tstringdef(def_from).stringtype in [st_widestring,st_unicodestring]) and
-                             not (tstringdef(def_to).stringtype in [st_widestring,st_unicodestring]) then
-                             eq:=te_convert_l3
-                           else if tstringdef(def_to).stringtype in [st_widestring,st_unicodestring] then
-                             eq:=te_convert_l2
+                           if (tstringdef(def_from).stringtype = st_ansistring) and
+                              (tstringdef(def_to).stringtype = st_ansistring) then
+                             if (tstringdef(def_to).encoding=globals.CP_UTF8) then
+                               eq:=te_convert_l1
+                             else
+                               eq:=te_convert_l2
                           else
-                             eq:=te_convert_l1;
+                            begin
+                              { Don't prefer conversions from widestring to a
+                                normal string as we can lose information }
+                              if (tstringdef(def_from).stringtype in [st_widestring,st_unicodestring]) and
+                                not (tstringdef(def_to).stringtype in [st_widestring,st_unicodestring]) then
+                                eq:=te_convert_l3
+                              else if tstringdef(def_to).stringtype in [st_widestring,st_unicodestring] then
+                                eq:=te_convert_l2
+                              else
+                                eq:=te_convert_l1;
+                            end;
                         end;
                      end
                     else if (tstringdef(def_from).stringtype=tstringdef(def_to).stringtype) and
--- a/compiler/nadd.pas
+++ b/compiler/nadd.pas
@ -1644,10 +1644,30 @@ implementation
                    end;
                  st_ansistring :
                    begin
-                      if not(is_ansistring(rd)) then
-                        inserttypeconv(right,cansistringtype);
-                      if not(is_ansistring(ld)) then
-                        inserttypeconv(left,cansistringtype);
+                      { use same code page if possible (don't force same code
+                        page in case both are ansistrings with code page <>
+                        CP_NONE, since then data loss can occur (the ansistring
+                        helpers will convert them at run time to an encoding
+                        that can represent both encodings) }
+                      if is_ansistring(ld) and
+                         (tstringdef(ld).encoding<>0) and
+                         (tstringdef(ld).encoding<>globals.CP_NONE) and
+                         (not is_ansistring(rd) or
+                          (tstringdef(rd).encoding=0) or
+                          (tstringdef(rd).encoding=globals.CP_NONE)) then
+                        inserttypeconv(right,ld)
+                      else if is_ansistring(rd) and
+                         (tstringdef(rd).encoding<>0) and
+                         (tstringdef(rd).encoding<>globals.CP_NONE) and
+                         (not is_ansistring(ld) or
+                          (tstringdef(ld).encoding=0) or
+                          (tstringdef(ld).encoding=globals.CP_NONE)) then
+                        inserttypeconv(left,rd)
+                      else
+                        begin
+                          inserttypeconv(left,cansistringtype);
+                          inserttypeconv(right,cansistringtype);
+                        end;
                    end;
                  st_longstring :
                    begin
--- a/compiler/ncnv.pas
+++ b/compiler/ncnv.pas
@ -1025,23 +1025,18 @@ implementation
        newblock : tblocknode;
        newstat  : tstatementnode;
        restemp  : ttempcreatenode;
-        //sa : ansistring;
-        //cw : WideChar;
-        //l : SizeUInt;
+        sa : ansistring;
+        cw : WideChar;
+        l : SizeUInt;
      begin
         result:=nil;
-         { we can't do widechar to ansichar conversions at compile time, since }
-         { this maps all non-ascii chars to '?' -> loses information           }
-
         if (left.nodetype=ordconstn) and
-            ((tstringdef(resultdef).stringtype in [st_widestring,st_unicodestring]) or
+            ((tstringdef(resultdef).stringtype in [st_widestring,st_unicodestring,st_ansistring]) or
             (torddef(left.resultdef).ordtype=uchar) or
             ((torddef(left.resultdef).ordtype=uwidechar) and
-              (current_settings.sourcecodepage<>CP_UTF8)
+              (tstringdef(resultdef).stringtype<>st_shortstring)
             )
-            )
-             { widechar >=128 is destroyed }
-             {(tordconstnode(left).value.uvalue<128))} then
+            ) then
           begin
              if (tstringdef(resultdef).stringtype in [st_widestring,st_unicodestring]) then
               begin
@ -1062,12 +1057,11 @@ implementation
                      hp:=cstringconstnode.createstr(unicode2asciichar(tcompilerwidechar(tordconstnode(left).value.uvalue)))
                    else
                     begin
-                       exit;
-                       {Word(cw):=tcompilerwidechar(tordconstnode(left).value.uvalue);
+                       Word(cw):=tcompilerwidechar(tordconstnode(left).value.uvalue);
                       SetLength(sa,5);
                       l:=UnicodeToUtf8(@(sa[1]),Length(sa),@cw,1);
                       SetLength(sa,l-1);
-                       hp:=cstringconstnode.createstr(sa);}
+                       hp:=cstringconstnode.createstr(sa);
                     end
                   end
                  else
@ -1077,6 +1071,18 @@ implementation
              result:=hp;
           end
         else
+           if (tstringdef(resultdef).stringtype=st_shortstring) and
+              (torddef(left.resultdef).ordtype=uwidechar) and
+              (tcompilerwidechar(tordconstnode(left).value.uvalue) <= 127)
+           then
+             begin
+               SetLength(sa,1);
+               Byte(sa[1]):= tordconstnode(left).value.uvalue;
+               hp:=cstringconstnode.createstr(sa);
+               tstringconstnode(hp).changestringtype(resultdef);
+               result:=hp;
+             end
+           else
           { shortstrings are handled 'inline' (except for widechars) }
           if (tstringdef(resultdef).stringtype<>st_shortstring) or
              (torddef(left.resultdef).ordtype=uwidechar) then
@ -1133,14 +1139,11 @@ implementation
      begin
        result:=nil;
        if (left.nodetype=stringconstn) and
-           ((tstringdef(resultdef).stringtype=st_shortstring) or
-            ((tstringdef(resultdef).stringtype=st_ansistring) and
+           (((tstringdef(resultdef).stringtype=st_ansistring) and
             (tstringdef(resultdef).encoding<>CP_NONE)
            )
           ) and
-           ((tstringdef(left.resultdef).stringtype in [st_unicodestring,st_widestring]) and
-            (current_settings.sourcecodepage<>CP_UTF8)
-           ) then
+           (tstringdef(left.resultdef).stringtype in [st_unicodestring,st_widestring]) then
          begin
            tstringconstnode(left).changestringtype(resultdef);
            Result:=left;
@ -1163,7 +1166,18 @@ implementation
                      resultdef
                    );
            left:=nil;
-          end;
+          end
+        else if (left.nodetype=stringconstn) and
+                (tstringdef(left.resultdef).stringtype in [st_unicodestring,st_widestring]) and
+                (tstringdef(resultdef).stringtype=st_shortstring) then
+          begin
+            if not hasnonasciichars(pcompilerwidestring(tstringconstnode(left).value_str)) then
+              begin
+                tstringconstnode(left).changestringtype(resultdef);
+                Result:=left;
+                left:=nil;
+              end;
+          end
      end;

    function ttypeconvnode.typecheck_char_to_chararray : tnode;
@ -1190,8 +1204,7 @@ implementation
            ((torddef(resultdef).ordtype<>uchar) or
             (torddef(left.resultdef).ordtype<>uwidechar) or
             (current_settings.sourcecodepage<>CP_UTF8))
-             { >= 128 is replaced by '?' currently -> loses information }
-             {(tordconstnode(left).value.uvalue<128))} then
+         then
           begin
             if (torddef(resultdef).ordtype=uchar) and
                (torddef(left.resultdef).ordtype=uwidechar) and
@ -2269,11 +2282,8 @@ implementation
              (
                ((not is_widechararray(left.resultdef) and
                  not is_wide_or_unicode_string(left.resultdef)) or
-                 (tstringdef(resultdef).stringtype in [st_widestring,st_unicodestring]) or
-                 (current_settings.sourcecodepage<>CP_UTF8)
+                 (tstringdef(resultdef).stringtype in [st_widestring,st_unicodestring,st_ansistring])
                )
-                 { non-ascii chars would be replaced with '?' -> loses info }
-                 {not hasnonasciichars(pcompilerwidestring(tstringconstnode(left).value_str)))}
              ) then
              begin
                tstringconstnode(left).changestringtype(resultdef);
--- a/compiler/ncon.pas
+++ b/compiler/ncon.pas
@ -993,7 +993,7 @@ implementation
           not(cst_type in [cst_widestring,cst_unicodestring]) then
          begin
            initwidestring(pw);
-            ascii2unicode(value_str,len,pw);
+            ascii2unicode(value_str,len,current_settings.sourcecodepage,pw);
            ansistringdispose(value_str,len);
            pcompilerwidestring(value_str):=pw;
          end
@ -1035,8 +1035,55 @@ implementation
                cp2:=tstringdef(resultdef).encoding
              else if (cst_type in [cst_shortstring,cst_conststring,cst_longstring]) then
                cp2:=current_settings.sourcecodepage;
-              if cpavailable(cp1) and cpavailable(cp2) then
-                changecodepage(value_str,len,cp2,value_str,cp1);
+              { don't change string if codepages are equal or string length is 0 }  
+              if (cp1<>cp2) and (len>0) then
+                begin
+                  if cpavailable(cp1) and cpavailable(cp2) then
+                    changecodepage(value_str,len,cp2,value_str,cp1)
+                  else if (cp1 <> CP_NONE) and (cp2 <> CP_NONE) and (cp1 <> 0) and (cp2 <> 0) then
+                    begin
+                      { if source encoding is UTF8 convert using UTF8->UTF16->destination encoding }
+                      if (cp2=CP_UTF8) then
+                        begin
+                          if not cpavailable(cp1) then
+                            Message1(option_code_page_not_available,IntToStr(cp1));
+                          initwidestring(pw);
+                          setlengthwidestring(pw,len);
+                          l:=Utf8ToUnicode(PUnicodeChar(pw^.data),len,value_str,len);
+                          if (l<>getlengthwidestring(pw)) then
+                            begin
+                              setlengthwidestring(pw,l);
+                              ReAllocMem(value_str,l);
+                            end;
+                          unicode2ascii(pw,value_str,cp1);
+                          donewidestring(pw);
+                        end
+                      else
+                      { if destination encoding is UTF8 convert using source encoding->UTF16->UTF8 }
+                      if (cp1=CP_UTF8) then
+                        begin
+                          if not cpavailable(cp2) then
+                            Message1(option_code_page_not_available,IntToStr(cp2));
+                          initwidestring(pw);
+                          setlengthwidestring(pw,len);
+                          ascii2unicode(value_str,len,cp2,pw);
+                          l:=UnicodeToUtf8(nil,PUnicodeChar(pw^.data),0);
+                          if l<>len then
+                            ReAllocMem(value_str,l);
+                          len:=l-1;
+                          UnicodeToUtf8(value_str,PUnicodeChar(pw^.data),l);
+                          donewidestring(pw);
+                        end
+                      else
+                        begin
+                          { output error message that encoding is not available for the compiler }
+                          if not cpavailable(cp1) then
+                            Message1(option_code_page_not_available,IntToStr(cp1));
+                          if not cpavailable(cp2) then
+                            Message1(option_code_page_not_available,IntToStr(cp2));
+                        end;
+                    end;
+                end;
            end;
        cst_type:=st2cst[tstringdef(def).stringtype];
        resultdef:=def;
--- a/compiler/scanner.pas
+++ b/compiler/scanner.pas
@ -4208,9 +4208,9 @@ In case not, the value returned can be arbitrary.
                                  if not iswidestring then
                                   begin
                                     if len>0 then
-                                       ascii2unicode(@cstringpattern[1],len,patternw)
+                                       ascii2unicode(@cstringpattern[1],len,current_settings.sourcecodepage,patternw)
                                     else
-                                       ascii2unicode(nil,len,patternw);
+                                       ascii2unicode(nil,len,current_settings.sourcecodepage,patternw);
                                     iswidestring:=true;
                                     len:=0;
                                   end;
@ -4252,9 +4252,9 @@ In case not, the value returned can be arbitrary.
                               if not iswidestring then
                                 begin
                                   if len>0 then
-                                     ascii2unicode(@cstringpattern[1],len,patternw)
+                                     ascii2unicode(@cstringpattern[1],len,current_settings.sourcecodepage,patternw)
                                   else
-                                     ascii2unicode(nil,len,patternw);
+                                     ascii2unicode(nil,len,current_settings.sourcecodepage,patternw);
                                   iswidestring:=true;
                                   len:=0;
                                 end;
--- a/compiler/widestr.pas
+++ b/compiler/widestr.pas
@ -52,7 +52,7 @@ unit widestr;
    procedure copywidestring(s,d : pcompilerwidestring);
    function asciichar2unicode(c : char) : tcompilerwidechar;
    function unicode2asciichar(c : tcompilerwidechar) : char;
-    procedure ascii2unicode(p : pchar;l : SizeInt;r : pcompilerwidestring);
+    procedure ascii2unicode(p : pchar;l : SizeInt;cp : tstringencoding;r : pcompilerwidestring);
    procedure unicode2ascii(r : pcompilerwidestring;p : pchar;cp : tstringencoding);
    function hasnonasciichars(const p: pcompilerwidestring): boolean;
    function getcharwidestring(r : pcompilerwidestring;l : SizeInt) : tcompilerwidechar;
@ -189,14 +189,14 @@ unit widestr;
         Result := getascii(c,getmap(current_settings.sourcecodepage))[1];
      end;

-    procedure ascii2unicode(p : pchar;l : SizeInt;r : pcompilerwidestring);
+    procedure ascii2unicode(p : pchar;l : SizeInt;cp : tstringencoding;r : pcompilerwidestring);
      var
         source : pchar;
         dest   : tcompilerwidecharptr;
         i      : SizeInt;
         m      : punicodemap;
      begin
-         m:=getmap(current_settings.sourcecodepage);
+         m:=getmap(cp);
         setlengthwidestring(r,l);
         source:=p;
         dest:=tcompilerwidecharptr(r^.data);