From a99ffb3097c48959cf7823d4ab7cd057b82755a0 Mon Sep 17 00:00:00 2001
From: paul <paul@idefix.freepascal.org>
Date: Tue, 11 Oct 2011 01:21:07 +0000
Subject: [PATCH] compiler: apply patches from Inoussa and Jonas:   defcmp:
 Address code paged' string type comparison taking care of the code page  
 ncnv: Remove un-needed code page comparison to CP_UTF8, some fixes regarding
 shortstrings and wide char/string   ncon: For the case of
 tstringconstnode.changestringtype (ncon.pas) where the code page are of
 CP_NONE or 0 no translation is done as :     * CP_NONE is compatible to all  
   * For 0 the raw bytes are just copied.  My changes:   - change
 ascii2unicode to allow pass source codepage,   - convert in both cases when
 source or destination is UTF8

git-svn-id: trunk@19457 -
---
 compiler/defcmp.pas  | 30 ++++++++++++++-------
 compiler/nadd.pas    | 28 +++++++++++++++++---
 compiler/ncnv.pas    | 62 +++++++++++++++++++++++++-------------------
 compiler/ncon.pas    | 53 ++++++++++++++++++++++++++++++++++---
 compiler/scanner.pas |  8 +++---
 compiler/widestr.pas |  6 ++---
 6 files changed, 138 insertions(+), 49 deletions(-)

diff --git a/compiler/defcmp.pas b/compiler/defcmp.pas
index 564b93ad68..19661d33d5 100644
--- a/compiler/defcmp.pas
+++ b/compiler/defcmp.pas
@@ -335,20 +335,32 @@ implementation
                      { Constant string }
                      if (fromtreetype=stringconstn) then
                       begin
-                        if (tstringdef(def_from).stringtype=tstringdef(def_to).stringtype) then
+                        if (tstringdef(def_from).stringtype=tstringdef(def_to).stringtype) and
+                           ((tstringdef(def_from).stringtype<>st_ansistring) or
+                            (tstringdef(def_from).encoding=tstringdef(def_to).encoding)
+                           ) then
                           eq:=te_equal
                         else
                          begin
                            doconv:=tc_string_2_string;
-                           { Don't prefer conversions from widestring to a
-                             normal string as we can lose information }
-                           if (tstringdef(def_from).stringtype in [st_widestring,st_unicodestring]) and
-                             not (tstringdef(def_to).stringtype in [st_widestring,st_unicodestring]) then
-                             eq:=te_convert_l3
-                           else if tstringdef(def_to).stringtype in [st_widestring,st_unicodestring] then
-                             eq:=te_convert_l2
+                           if (tstringdef(def_from).stringtype = st_ansistring) and
+                              (tstringdef(def_to).stringtype = st_ansistring) then
+                             if (tstringdef(def_to).encoding=globals.CP_UTF8) then
+                               eq:=te_convert_l1
+                             else
+                               eq:=te_convert_l2
                            else
-                             eq:=te_convert_l1;
+                            begin
+                              { Don't prefer conversions from widestring to a
+                                normal string as we can lose information }
+                              if (tstringdef(def_from).stringtype in [st_widestring,st_unicodestring]) and
+                                not (tstringdef(def_to).stringtype in [st_widestring,st_unicodestring]) then
+                                eq:=te_convert_l3
+                              else if tstringdef(def_to).stringtype in [st_widestring,st_unicodestring] then
+                                eq:=te_convert_l2
+                              else
+                                eq:=te_convert_l1;
+                            end;
                          end;
                       end
                      else if (tstringdef(def_from).stringtype=tstringdef(def_to).stringtype) and
diff --git a/compiler/nadd.pas b/compiler/nadd.pas
index 97f2eaeef0..57831d30a7 100644
--- a/compiler/nadd.pas
+++ b/compiler/nadd.pas
@@ -1644,10 +1644,30 @@ implementation
                     end;
                   st_ansistring :
                     begin
-                      if not(is_ansistring(rd)) then
-                        inserttypeconv(right,cansistringtype);
-                      if not(is_ansistring(ld)) then
-                        inserttypeconv(left,cansistringtype);
+                      { use same code page if possible (don't force same code
+                        page in case both are ansistrings with code page <>
+                        CP_NONE, since then data loss can occur (the ansistring
+                        helpers will convert them at run time to an encoding
+                        that can represent both encodings) }
+                      if is_ansistring(ld) and
+                         (tstringdef(ld).encoding<>0) and
+                         (tstringdef(ld).encoding<>globals.CP_NONE) and
+                         (not is_ansistring(rd) or
+                          (tstringdef(rd).encoding=0) or
+                          (tstringdef(rd).encoding=globals.CP_NONE)) then
+                        inserttypeconv(right,ld)
+                      else if is_ansistring(rd) and
+                         (tstringdef(rd).encoding<>0) and
+                         (tstringdef(rd).encoding<>globals.CP_NONE) and
+                         (not is_ansistring(ld) or
+                          (tstringdef(ld).encoding=0) or
+                          (tstringdef(ld).encoding=globals.CP_NONE)) then
+                        inserttypeconv(left,rd)
+                      else
+                        begin
+                          inserttypeconv(left,cansistringtype);
+                          inserttypeconv(right,cansistringtype);
+                        end;
                     end;
                   st_longstring :
                     begin
diff --git a/compiler/ncnv.pas b/compiler/ncnv.pas
index 4989e34f93..ec6bff85bc 100644
--- a/compiler/ncnv.pas
+++ b/compiler/ncnv.pas
@@ -1025,23 +1025,18 @@ implementation
         newblock : tblocknode;
         newstat  : tstatementnode;
         restemp  : ttempcreatenode;
-        //sa : ansistring;
-        //cw : WideChar;
-        //l : SizeUInt;
+        sa : ansistring;
+        cw : WideChar;
+        l : SizeUInt;
       begin
          result:=nil;
-         { we can't do widechar to ansichar conversions at compile time, since }
-         { this maps all non-ascii chars to '?' -> loses information           }
-
          if (left.nodetype=ordconstn) and
-            ((tstringdef(resultdef).stringtype in [st_widestring,st_unicodestring]) or
+            ((tstringdef(resultdef).stringtype in [st_widestring,st_unicodestring,st_ansistring]) or
              (torddef(left.resultdef).ordtype=uchar) or
              ((torddef(left.resultdef).ordtype=uwidechar) and
-              (current_settings.sourcecodepage<>CP_UTF8)
+              (tstringdef(resultdef).stringtype<>st_shortstring)
              )
-            )
-             { widechar >=128 is destroyed }
-             {(tordconstnode(left).value.uvalue<128))} then
+            ) then
            begin
               if (tstringdef(resultdef).stringtype in [st_widestring,st_unicodestring]) then
                begin
@@ -1062,12 +1057,11 @@ implementation
                       hp:=cstringconstnode.createstr(unicode2asciichar(tcompilerwidechar(tordconstnode(left).value.uvalue)))
                     else
                      begin
-                       exit;
-                       {Word(cw):=tcompilerwidechar(tordconstnode(left).value.uvalue);
+                       Word(cw):=tcompilerwidechar(tordconstnode(left).value.uvalue);
                        SetLength(sa,5);
                        l:=UnicodeToUtf8(@(sa[1]),Length(sa),@cw,1);
                        SetLength(sa,l-1);
-                       hp:=cstringconstnode.createstr(sa);}
+                       hp:=cstringconstnode.createstr(sa);
                      end
                    end
                   else
@@ -1077,6 +1071,18 @@ implementation
               result:=hp;
            end
          else
+           if (tstringdef(resultdef).stringtype=st_shortstring) and
+              (torddef(left.resultdef).ordtype=uwidechar) and
+              (tcompilerwidechar(tordconstnode(left).value.uvalue) <= 127)
+           then
+             begin
+               SetLength(sa,1);
+               Byte(sa[1]):= tordconstnode(left).value.uvalue;
+               hp:=cstringconstnode.createstr(sa);
+               tstringconstnode(hp).changestringtype(resultdef);
+               result:=hp;
+             end
+           else
            { shortstrings are handled 'inline' (except for widechars) }
            if (tstringdef(resultdef).stringtype<>st_shortstring) or
               (torddef(left.resultdef).ordtype=uwidechar) then
@@ -1133,14 +1139,11 @@ implementation
       begin
         result:=nil;
         if (left.nodetype=stringconstn) and
-           ((tstringdef(resultdef).stringtype=st_shortstring) or
-            ((tstringdef(resultdef).stringtype=st_ansistring) and
+           (((tstringdef(resultdef).stringtype=st_ansistring) and
              (tstringdef(resultdef).encoding<>CP_NONE)
             )
            ) and
-           ((tstringdef(left.resultdef).stringtype in [st_unicodestring,st_widestring]) and
-            (current_settings.sourcecodepage<>CP_UTF8)
-           ) then
+           (tstringdef(left.resultdef).stringtype in [st_unicodestring,st_widestring]) then
           begin
             tstringconstnode(left).changestringtype(resultdef);
             Result:=left;
@@ -1163,7 +1166,18 @@ implementation
                       resultdef
                     );
             left:=nil;
-          end;
+          end
+        else if (left.nodetype=stringconstn) and
+                (tstringdef(left.resultdef).stringtype in [st_unicodestring,st_widestring]) and
+                (tstringdef(resultdef).stringtype=st_shortstring) then
+          begin
+            if not hasnonasciichars(pcompilerwidestring(tstringconstnode(left).value_str)) then
+              begin
+                tstringconstnode(left).changestringtype(resultdef);
+                Result:=left;
+                left:=nil;
+              end;
+          end
       end;
 
     function ttypeconvnode.typecheck_char_to_chararray : tnode;
@@ -1190,8 +1204,7 @@ implementation
             ((torddef(resultdef).ordtype<>uchar) or
              (torddef(left.resultdef).ordtype<>uwidechar) or
              (current_settings.sourcecodepage<>CP_UTF8))
-             { >= 128 is replaced by '?' currently -> loses information }
-             {(tordconstnode(left).value.uvalue<128))} then
+         then
            begin
              if (torddef(resultdef).ordtype=uchar) and
                 (torddef(left.resultdef).ordtype=uwidechar) and
@@ -2269,11 +2282,8 @@ implementation
               (
                 ((not is_widechararray(left.resultdef) and
                   not is_wide_or_unicode_string(left.resultdef)) or
-                 (tstringdef(resultdef).stringtype in [st_widestring,st_unicodestring]) or
-                 (current_settings.sourcecodepage<>CP_UTF8)
+                 (tstringdef(resultdef).stringtype in [st_widestring,st_unicodestring,st_ansistring])
                 )
-                 { non-ascii chars would be replaced with '?' -> loses info }
-                 {not hasnonasciichars(pcompilerwidestring(tstringconstnode(left).value_str)))}
               ) then
               begin
                 tstringconstnode(left).changestringtype(resultdef);
diff --git a/compiler/ncon.pas b/compiler/ncon.pas
index 97b3dd9bbe..cb8ed2df71 100644
--- a/compiler/ncon.pas
+++ b/compiler/ncon.pas
@@ -993,7 +993,7 @@ implementation
            not(cst_type in [cst_widestring,cst_unicodestring]) then
           begin
             initwidestring(pw);
-            ascii2unicode(value_str,len,pw);
+            ascii2unicode(value_str,len,current_settings.sourcecodepage,pw);
             ansistringdispose(value_str,len);
             pcompilerwidestring(value_str):=pw;
           end
@@ -1035,8 +1035,55 @@ implementation
                 cp2:=tstringdef(resultdef).encoding
               else if (cst_type in [cst_shortstring,cst_conststring,cst_longstring]) then
                 cp2:=current_settings.sourcecodepage;
-              if cpavailable(cp1) and cpavailable(cp2) then
-                changecodepage(value_str,len,cp2,value_str,cp1);
+              { don't change string if codepages are equal or string length is 0 }  
+              if (cp1<>cp2) and (len>0) then
+                begin
+                  if cpavailable(cp1) and cpavailable(cp2) then
+                    changecodepage(value_str,len,cp2,value_str,cp1)
+                  else if (cp1 <> CP_NONE) and (cp2 <> CP_NONE) and (cp1 <> 0) and (cp2 <> 0) then
+                    begin
+                      { if source encoding is UTF8 convert using UTF8->UTF16->destination encoding }
+                      if (cp2=CP_UTF8) then
+                        begin
+                          if not cpavailable(cp1) then
+                            Message1(option_code_page_not_available,IntToStr(cp1));
+                          initwidestring(pw);
+                          setlengthwidestring(pw,len);
+                          l:=Utf8ToUnicode(PUnicodeChar(pw^.data),len,value_str,len);
+                          if (l<>getlengthwidestring(pw)) then
+                            begin
+                              setlengthwidestring(pw,l);
+                              ReAllocMem(value_str,l);
+                            end;
+                          unicode2ascii(pw,value_str,cp1);
+                          donewidestring(pw);
+                        end
+                      else
+                      { if destination encoding is UTF8 convert using source encoding->UTF16->UTF8 }
+                      if (cp1=CP_UTF8) then
+                        begin
+                          if not cpavailable(cp2) then
+                            Message1(option_code_page_not_available,IntToStr(cp2));
+                          initwidestring(pw);
+                          setlengthwidestring(pw,len);
+                          ascii2unicode(value_str,len,cp2,pw);
+                          l:=UnicodeToUtf8(nil,PUnicodeChar(pw^.data),0);
+                          if l<>len then
+                            ReAllocMem(value_str,l);
+                          len:=l-1;
+                          UnicodeToUtf8(value_str,PUnicodeChar(pw^.data),l);
+                          donewidestring(pw);
+                        end
+                      else
+                        begin
+                          { output error message that encoding is not available for the compiler }
+                          if not cpavailable(cp1) then
+                            Message1(option_code_page_not_available,IntToStr(cp1));
+                          if not cpavailable(cp2) then
+                            Message1(option_code_page_not_available,IntToStr(cp2));
+                        end;
+                    end;
+                end;
             end;
         cst_type:=st2cst[tstringdef(def).stringtype];
         resultdef:=def;
diff --git a/compiler/scanner.pas b/compiler/scanner.pas
index ee32261319..749546110e 100644
--- a/compiler/scanner.pas
+++ b/compiler/scanner.pas
@@ -4208,9 +4208,9 @@ In case not, the value returned can be arbitrary.
                                   if not iswidestring then
                                    begin
                                      if len>0 then
-                                       ascii2unicode(@cstringpattern[1],len,patternw)
+                                       ascii2unicode(@cstringpattern[1],len,current_settings.sourcecodepage,patternw)
                                      else
-                                       ascii2unicode(nil,len,patternw);
+                                       ascii2unicode(nil,len,current_settings.sourcecodepage,patternw);
                                      iswidestring:=true;
                                      len:=0;
                                    end;
@@ -4252,9 +4252,9 @@ In case not, the value returned can be arbitrary.
                                if not iswidestring then
                                  begin
                                    if len>0 then
-                                     ascii2unicode(@cstringpattern[1],len,patternw)
+                                     ascii2unicode(@cstringpattern[1],len,current_settings.sourcecodepage,patternw)
                                    else
-                                     ascii2unicode(nil,len,patternw);
+                                     ascii2unicode(nil,len,current_settings.sourcecodepage,patternw);
                                    iswidestring:=true;
                                    len:=0;
                                  end;
diff --git a/compiler/widestr.pas b/compiler/widestr.pas
index 18646424d5..87701708c8 100644
--- a/compiler/widestr.pas
+++ b/compiler/widestr.pas
@@ -52,7 +52,7 @@ unit widestr;
     procedure copywidestring(s,d : pcompilerwidestring);
     function asciichar2unicode(c : char) : tcompilerwidechar;
     function unicode2asciichar(c : tcompilerwidechar) : char;
-    procedure ascii2unicode(p : pchar;l : SizeInt;r : pcompilerwidestring);
+    procedure ascii2unicode(p : pchar;l : SizeInt;cp : tstringencoding;r : pcompilerwidestring);
     procedure unicode2ascii(r : pcompilerwidestring;p : pchar;cp : tstringencoding);
     function hasnonasciichars(const p: pcompilerwidestring): boolean;
     function getcharwidestring(r : pcompilerwidestring;l : SizeInt) : tcompilerwidechar;
@@ -189,14 +189,14 @@ unit widestr;
          Result := getascii(c,getmap(current_settings.sourcecodepage))[1];
       end;
 
-    procedure ascii2unicode(p : pchar;l : SizeInt;r : pcompilerwidestring);
+    procedure ascii2unicode(p : pchar;l : SizeInt;cp : tstringencoding;r : pcompilerwidestring);
       var
          source : pchar;
          dest   : tcompilerwidecharptr;
          i      : SizeInt;
          m      : punicodemap;
       begin
-         m:=getmap(current_settings.sourcecodepage);
+         m:=getmap(cp);
          setlengthwidestring(r,l);
          source:=p;
          dest:=tcompilerwidecharptr(r^.data);