* basic avx support for floating point operations (use -Cfavx to activate)

git-svn-id: trunk@24896 -
2025-04-20 11:29:27 +02:00 · 2013-06-14 20:03:01 +00:00 · 2013-06-14 20:03:01 +00:00 · e81d2d1f3b
commit e81d2d1f3b
parent 6a8e4f0381
13 changed files with 764 additions and 244 deletions
--- a/compiler/cgobj.pas
+++ b/compiler/cgobj.pas
@ -52,8 +52,10 @@ unit cgobj;
          by Free Pascal. For 32-bit processors, the base class
          should be @link(tcg64f32) and not @var(tcg).
       }
+
+       { tcg }
+
       tcg = class
-       public
          { how many times is this current code executed }
          executionweight : longint;
          alignment : talignment;
@ -271,6 +273,9 @@ unit cgobj;
          procedure a_opmm_ref_reg(list: TAsmList; Op: TOpCG; size : tcgsize;const ref: treference; reg: tregister;shuffle : pmmshuffle); virtual;
          procedure a_opmm_loc_reg(list: TAsmList; Op: TOpCG; size : tcgsize;const loc: tlocation; reg: tregister;shuffle : pmmshuffle); virtual;
          procedure a_opmm_reg_ref(list: TAsmList; Op: TOpCG; size : tcgsize;reg: tregister;const ref: treference; shuffle : pmmshuffle); virtual;
+          procedure a_opmm_loc_reg_reg(list: TAsmList;Op : TOpCG;size : tcgsize;const loc : tlocation;src,dst : tregister;shuffle : pmmshuffle); virtual;
+          procedure a_opmm_reg_reg_reg(list: TAsmList; Op: TOpCG; size : tcgsize;src1,src2,dst: tregister;shuffle : pmmshuffle); virtual;
+          procedure a_opmm_ref_reg_reg(list: TAsmList; Op: TOpCG; size : tcgsize;const ref: treference; src,dst: tregister;shuffle : pmmshuffle); virtual;

          procedure a_loadmm_intreg_reg(list: TAsmList; fromsize, tosize : tcgsize; intreg, mmreg: tregister; shuffle: pmmshuffle); virtual;
          procedure a_loadmm_reg_intreg(list: TAsmList; fromsize, tosize : tcgsize; mmreg, intreg: tregister; shuffle : pmmshuffle); virtual;
@ -2061,6 +2066,33 @@ implementation
      end;


+    procedure tcg.a_opmm_loc_reg_reg(list: TAsmList; Op: TOpCG; size : tcgsize;const loc: tlocation; src,dst: tregister;shuffle : pmmshuffle);
+      begin
+        case loc.loc of
+          LOC_CMMREGISTER,LOC_MMREGISTER:
+            a_opmm_reg_reg_reg(list,op,size,loc.register,src,dst,shuffle);
+          LOC_CREFERENCE,LOC_REFERENCE:
+            a_opmm_ref_reg_reg(list,op,size,loc.reference,src,dst,shuffle);
+          else
+            internalerror(200312232);
+        end;
+      end;
+
+
+    procedure tcg.a_opmm_reg_reg_reg(list : TAsmList;Op : TOpCG;size : tcgsize;
+      src1,src2,dst : tregister;shuffle : pmmshuffle);
+      begin
+        internalerror(2013061102);
+      end;
+
+
+    procedure tcg.a_opmm_ref_reg_reg(list : TAsmList;Op : TOpCG;size : tcgsize;
+      const ref : treference;src,dst : tregister;shuffle : pmmshuffle);
+      begin
+        internalerror(2013061101);
+      end;
+
+
    procedure tcg.g_concatcopy_unaligned(list : TAsmList;const source,dest : treference;len : tcgint);
      begin
        g_concatcopy(list,source,dest,len);
--- a/compiler/i386/cpuinfo.pas
+++ b/compiler/i386/cpuinfo.pas
@ -59,7 +59,8 @@ Type
      fpu_ssse3,
      fpu_sse41,
      fpu_sse42,
-      fpu_avx
+      fpu_avx,
+      fpu_avx2
     );


@ -96,11 +97,14 @@ Const
     'SSSE3',
     'SSE41',
     'SSE42',
-     'AVX'
+     'AVX',
+     'AVX2'
   );

-   sse_singlescalar : set of tfputype = [fpu_sse,fpu_sse2,fpu_sse3];
-   sse_doublescalar : set of tfputype = [fpu_sse2,fpu_sse3];
+   sse_singlescalar = [fpu_sse..fpu_avx2];
+   sse_doublescalar = [fpu_sse2..fpu_avx2];
+
+   fpu_avx_instructionsets = [fpu_avx,fpu_avx2];

   { Supported optimizations, only used for information }
   supported_optimizerswitches = genericlevel1optimizerswitches+
--- a/compiler/i386/i386prop.inc
+++ b/compiler/i386/i386prop.inc
@ -685,6 +685,10 @@
 (Ch: (Ch_RRAX, Ch_WMemEDI, Ch_RWRDI)),
 (Ch: (Ch_WRAX, Ch_RWRSI, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
+(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
+(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
+(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
+(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
@ -772,21 +776,17 @@
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
+(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
+(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
+(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
+(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
+(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
+(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
--- a/compiler/x86/aasmcpu.pas
+++ b/compiler/x86/aasmcpu.pas
@ -296,7 +296,7 @@ interface
         constructor op_reg_reg_reg(op : tasmop;_size : topsize;_op1,_op2,_op3 : tregister);
         constructor op_const_reg_reg(op : tasmop;_size : topsize;_op1 : aint;_op2 : tregister;_op3 : tregister);
         constructor op_const_ref_reg(op : tasmop;_size : topsize;_op1 : aint;const _op2 : treference;_op3 : tregister);
-         constructor op_reg_reg_ref(op : tasmop;_size : topsize;_op1,_op2 : tregister; const _op3 : treference);
+         constructor op_ref_reg_reg(op : tasmop;_size : topsize;const _op1 : treference;_op2,_op3 : tregister);
         constructor op_const_reg_ref(op : tasmop;_size : topsize;_op1 : aint;_op2 : tregister;const _op3 : treference);

         { this is for Jmp instructions }
@ -375,7 +375,8 @@ implementation
       systems,
       procinfo,
       itcpugas,
-       symsym;
+       symsym,
+       cpuinfo;

 {*****************************************************************************
                              Instruction table
@ -813,14 +814,14 @@ implementation
      end;


-    constructor taicpu.op_reg_reg_ref(op : tasmop;_size : topsize;_op1,_op2 : tregister;const _op3 : treference);
+    constructor taicpu.op_ref_reg_reg(op : tasmop;_size : topsize;const _op1 : treference;_op2,_op3 : tregister);
      begin
         inherited create(op);
         init(_size);
         ops:=3;
-         loadreg(0,_op1);
+         loadref(0,_op1);
         loadreg(1,_op2);
-         loadref(2,_op3);
+         loadreg(2,_op3);
      end;


@ -2874,7 +2875,9 @@ implementation
                 (oper[0]^.reg=oper[1]^.reg)
                ) or
                (((opcode=A_MOVSS) or (opcode=A_MOVSD) or (opcode=A_MOVQ) or
-                  (opcode=A_MOVAPS) or (OPCODE=A_MOVAPD)) and
+                  (opcode=A_MOVAPS) or (OPCODE=A_MOVAPD) or
+                  (opcode=A_VMOVSS) or (opcode=A_VMOVSD) or (opcode=A_VMOVQ) or
+                  (opcode=A_VMOVAPS) or (OPCODE=A_VMOVAPD)) and
                 (regtype = R_MMREGISTER) and
                 (ops=2) and
                 (oper[0]^.typ=top_reg) and
@ -2929,8 +2932,11 @@ implementation
      begin
        { the information in the instruction table is made for the string copy
          operation MOVSD so hack here (FK)
+
+          VMOVSS and VMOVSD has two and three operand flavours, this cannot modelled by x86ins.dat
+          so fix it here (FK)
        }
-        if (opcode=A_MOVSD) and (ops=2) then
+        if ((opcode=A_MOVSD) or (opcode=A_VMOVSS) or (opcode=A_VMOVSD)) and (ops=2) then
          begin
            case opnr of
              0:
@ -2961,17 +2967,30 @@ implementation
              result:=taicpu.op_ref_reg(A_MOV,reg2opsize(r),tmpref,r);
            end;
          R_MMREGISTER :
-            case getsubreg(r) of
-              R_SUBMMD:
-                result:=taicpu.op_ref_reg(A_MOVSD,reg2opsize(r),ref,r);
-              R_SUBMMS:
-                result:=taicpu.op_ref_reg(A_MOVSS,reg2opsize(r),ref,r);
-              R_SUBQ,
-              R_SUBMMWHOLE:
-                result:=taicpu.op_ref_reg(A_MOVQ,S_NO,ref,r);
-              else
-                internalerror(200506043);
-            end;
+            if current_settings.fputype in fpu_avx_instructionsets then
+              case getsubreg(r) of
+                R_SUBMMD:
+                  result:=taicpu.op_ref_reg(A_VMOVSD,reg2opsize(r),ref,r);
+                R_SUBMMS:
+                  result:=taicpu.op_ref_reg(A_VMOVSS,reg2opsize(r),ref,r);
+                R_SUBQ,
+                R_SUBMMWHOLE:
+                  result:=taicpu.op_ref_reg(A_VMOVQ,S_NO,ref,r);
+                else
+                  internalerror(200506043);
+              end
+            else
+              case getsubreg(r) of
+                R_SUBMMD:
+                  result:=taicpu.op_ref_reg(A_MOVSD,reg2opsize(r),ref,r);
+                R_SUBMMS:
+                  result:=taicpu.op_ref_reg(A_MOVSS,reg2opsize(r),ref,r);
+                R_SUBQ,
+                R_SUBMMWHOLE:
+                  result:=taicpu.op_ref_reg(A_MOVQ,S_NO,ref,r);
+                else
+                  internalerror(200506043);
+              end;
          else
            internalerror(200401041);
        end;
@ -3002,17 +3021,30 @@ implementation
              result:=taicpu.op_reg_ref(A_MOV,size,r,tmpref);
            end;
          R_MMREGISTER :
-            case getsubreg(r) of
-              R_SUBMMD:
-                result:=taicpu.op_reg_ref(A_MOVSD,reg2opsize(r),r,ref);
-              R_SUBMMS:
-                result:=taicpu.op_reg_ref(A_MOVSS,reg2opsize(r),r,ref);
-              R_SUBQ,
-              R_SUBMMWHOLE:
-                result:=taicpu.op_reg_ref(A_MOVQ,S_NO,r,ref);
-              else
-                internalerror(200506042);
-            end;
+            if current_settings.fputype in fpu_avx_instructionsets then
+              case getsubreg(r) of
+                R_SUBMMD:
+                  result:=taicpu.op_reg_ref(A_VMOVSD,reg2opsize(r),r,ref);
+                R_SUBMMS:
+                  result:=taicpu.op_reg_ref(A_VMOVSS,reg2opsize(r),r,ref);
+                R_SUBQ,
+                R_SUBMMWHOLE:
+                  result:=taicpu.op_reg_ref(A_VMOVQ,S_NO,r,ref);
+                else
+                  internalerror(200506042);
+              end
+            else
+              case getsubreg(r) of
+                R_SUBMMD:
+                  result:=taicpu.op_reg_ref(A_MOVSD,reg2opsize(r),r,ref);
+                R_SUBMMS:
+                  result:=taicpu.op_reg_ref(A_MOVSS,reg2opsize(r),r,ref);
+                R_SUBQ,
+                R_SUBMMWHOLE:
+                  result:=taicpu.op_reg_ref(A_MOVQ,S_NO,r,ref);
+                else
+                  internalerror(200506042);
+              end;
          else
            internalerror(200401041);
        end;
--- a/compiler/x86/cgx86.pas
+++ b/compiler/x86/cgx86.pas
@ -92,6 +92,8 @@ unit cgx86;
        procedure a_loadmm_reg_ref(list: TAsmList; fromsize, tosize : tcgsize;reg: tregister; const ref: treference;shuffle : pmmshuffle); override;
        procedure a_opmm_ref_reg(list: TAsmList; Op: TOpCG; size : tcgsize;const ref: treference; reg: tregister;shuffle : pmmshuffle); override;
        procedure a_opmm_reg_reg(list: TAsmList; Op: TOpCG; size : tcgsize;src,dst: tregister;shuffle : pmmshuffle);override;
+        procedure a_opmm_ref_reg_reg(list : TAsmList;Op : TOpCG;size : tcgsize;const ref : treference;src,dst : tregister;shuffle : pmmshuffle);override;
+        procedure a_opmm_reg_reg_reg(list : TAsmList;Op : TOpCG;size : tcgsize;src1,src2,dst : tregister;shuffle : pmmshuffle);override;

        {  comparison operations }
        procedure a_cmp_const_reg_label(list : TAsmList;size : tcgsize;cmp_op : topcmp;a : tcgint;reg : tregister;
@ -126,9 +128,9 @@ unit cgx86;
        procedure check_register_size(size:tcgsize;reg:tregister);

        procedure opmm_loc_reg(list: TAsmList; Op: TOpCG; size : tcgsize;loc : tlocation;dst: tregister; shuffle : pmmshuffle);
+        procedure opmm_loc_reg_reg(list : TAsmList;Op : TOpCG;size : tcgsize;loc : tlocation;src,dst : tregister;shuffle : pmmshuffle);

        function get_darwin_call_stub(const s: string; weak: boolean): tasmsymbol;
-      private
        procedure sizes2load(s1,s2 : tcgsize;var op: tasmop; var s3: topsize);

        procedure floatload(list: TAsmList; t : tcgsize;const ref : treference);
@ -175,7 +177,7 @@ unit cgx86;

    function UseAVX: boolean;
      begin
-        Result:=current_settings.fputype in [fpu_avx];
+        Result:=current_settings.fputype in fpu_avx_instructionsets;
      end;

    const
@ -1144,12 +1146,18 @@ unit cgx86;

    function get_scalar_mm_op(fromsize,tosize : tcgsize) : tasmop;
      const
-        convertop : array[OS_F32..OS_F128,OS_F32..OS_F128] of tasmop = (
+        convertopsse : array[OS_F32..OS_F128,OS_F32..OS_F128] of tasmop = (
          (A_MOVSS,A_CVTSS2SD,A_NONE,A_NONE,A_NONE),
          (A_CVTSD2SS,A_MOVSD,A_NONE,A_NONE,A_NONE),
          (A_NONE,A_NONE,A_NONE,A_NONE,A_NONE),
          (A_NONE,A_NONE,A_NONE,A_MOVQ,A_NONE),
          (A_NONE,A_NONE,A_NONE,A_NONE,A_NONE));
+        convertopavx : array[OS_F32..OS_F128,OS_F32..OS_F128] of tasmop = (
+          (A_VMOVSS,A_VCVTSS2SD,A_NONE,A_NONE,A_NONE),
+          (A_VCVTSD2SS,A_VMOVSD,A_NONE,A_NONE,A_NONE),
+          (A_NONE,A_NONE,A_NONE,A_NONE,A_NONE),
+          (A_NONE,A_NONE,A_NONE,A_MOVQ,A_NONE),
+          (A_NONE,A_NONE,A_NONE,A_NONE,A_NONE));
      begin
        { we can have OS_F32/OS_F64 (record in function result/LOC_MMREGISTER) to
          OS_32/OS_64 (record in memory/LOC_REFERENCE) }
@ -1161,14 +1169,24 @@ unit cgx86;
            OS_64:
              tosize:=OS_F64;
          end;
-        if (fromsize in [low(convertop)..high(convertop)]) and
-           (tosize in [low(convertop)..high(convertop)]) then
-          result:=convertop[fromsize,tosize]
+        if (fromsize in [low(convertopsse)..high(convertopsse)]) and
+           (tosize in [low(convertopsse)..high(convertopsse)]) then
+          begin
+            if UseAVX then
+              result:=convertopavx[fromsize,tosize]
+            else
+              result:=convertopsse[fromsize,tosize];
+          end
        { we can have OS_M64 (record in function result/LOC_MMREGISTER) to
          OS_64 (record in memory/LOC_REFERENCE) }
        else if (tcgsize2size[fromsize]=tcgsize2size[tosize]) and
                (fromsize=OS_M64) then
-          result:=A_MOVQ
+          begin
+            if UseAVX then
+              result:=A_VMOVQ
+            else
+              result:=A_MOVQ;
+          end
        else
          internalerror(2010060104);
        if result=A_NONE then
@ -1179,6 +1197,7 @@ unit cgx86;
    procedure tcgx86.a_loadmm_reg_reg(list: TAsmList; fromsize, tosize : tcgsize;reg1, reg2: tregister;shuffle : pmmshuffle);
      var
        instr : taicpu;
+        op : TAsmOp;
      begin
        if shuffle=nil then
          begin
@ -1200,8 +1219,26 @@ unit cgx86;
          end
        else if shufflescalar(shuffle) then
          begin
-            instr:=taicpu.op_reg_reg(get_scalar_mm_op(fromsize,tosize),S_NO,reg1,reg2);
+            op:=get_scalar_mm_op(fromsize,tosize);
+
+            { VMOVSD/SS is not available with two register operands }
+            if op=A_VMOVSD then
+              op:=A_VMOVAPD
+            else if op=A_VMOVSS then
+              op:=A_VMOVAPS;
+
+            { A_VCVTSD2SS and A_VCVTSS2SD require always three operands }
+            if (op=A_VCVTSD2SS) or (op=A_VCVTSS2SD) then
+              instr:=taicpu.op_reg_reg_reg(op,S_NO,reg1,reg2,reg2)
+            else
+              instr:=taicpu.op_reg_reg(op,S_NO,reg1,reg2);
+
            case get_scalar_mm_op(fromsize,tosize) of
+              A_VMOVAPD,
+              A_VMOVAPS,
+              A_VMOVSS,
+              A_VMOVSD,
+              A_VMOVQ,
              A_MOVSS,
              A_MOVSD,
              A_MOVQ:
@ -1217,6 +1254,7 @@ unit cgx86;
    procedure tcgx86.a_loadmm_ref_reg(list: TAsmList; fromsize, tosize : tcgsize;const ref: treference; reg: tregister;shuffle : pmmshuffle);
       var
         tmpref  : treference;
+         op : tasmop;
       begin
         tmpref:=ref;
         make_simple_ref(list,tmpref);
@ -1233,7 +1271,15 @@ unit cgx86;
 {$endif x86_64}
           end
         else if shufflescalar(shuffle) then
-           list.concat(taicpu.op_ref_reg(get_scalar_mm_op(fromsize,tosize),S_NO,tmpref,reg))
+           begin
+             op:=get_scalar_mm_op(fromsize,tosize);
+
+             { A_VCVTSD2SS and A_VCVTSS2SD require always three operands }
+             if (op=A_VCVTSD2SS) or (op=A_VCVTSS2SD) then
+               list.concat(taicpu.op_ref_reg_reg(op,S_NO,tmpref,reg,reg))
+             else
+               list.concat(taicpu.op_ref_reg(op,S_NO,tmpref,reg))
+           end
         else
           internalerror(200312252);
       end;
@ -1243,6 +1289,7 @@ unit cgx86;
       var
         hreg : tregister;
         tmpref  : treference;
+         op : tasmop;
       begin
         tmpref:=ref;
         make_simple_ref(list,tmpref);
@ -1263,8 +1310,15 @@ unit cgx86;
             if tcgsize2size[tosize]<>tcgsize2size[fromsize] then
               begin
                 hreg:=getmmregister(list,tosize);
-                 list.concat(taicpu.op_reg_reg(get_scalar_mm_op(fromsize,tosize),S_NO,reg,hreg));
-                 list.concat(taicpu.op_reg_ref(get_scalar_mm_op(tosize,tosize),S_NO,hreg,tmpref));
+                 op:=get_scalar_mm_op(fromsize,tosize);
+
+                 { A_VCVTSD2SS and A_VCVTSS2SD require always three operands }
+                 if (op=A_VCVTSD2SS) or (op=A_VCVTSS2SD) then
+                   list.concat(taicpu.op_reg_reg_reg(op,S_NO,reg,hreg,hreg))
+                 else
+                   list.concat(taicpu.op_reg_reg(op,S_NO,reg,hreg));
+
+                 list.concat(taicpu.op_reg_ref(get_scalar_mm_op(tosize,tosize),S_NO,hreg,tmpref))
               end
             else
               list.concat(taicpu.op_reg_ref(get_scalar_mm_op(fromsize,tosize),S_NO,reg,tmpref));
@ -1296,6 +1350,103 @@ unit cgx86;
     end;


+    procedure tcgx86.opmm_loc_reg_reg(list: TAsmList; Op: TOpCG; size : tcgsize;loc : tlocation;src,dst: tregister; shuffle : pmmshuffle);
+      const
+        opmm2asmop : array[0..1,OS_F32..OS_F64,topcg] of tasmop = (
+          ( { scalar }
+            ( { OS_F32 }
+              A_NOP,A_NOP,A_VADDSS,A_NOP,A_VDIVSS,A_NOP,A_NOP,A_VMULSS,A_NOP,A_NOP,A_NOP,A_NOP,A_NOP,A_NOP,A_VSUBSS,A_NOP,A_NOP,A_NOP
+            ),
+            ( { OS_F64 }
+              A_NOP,A_NOP,A_VADDSD,A_NOP,A_VDIVSD,A_NOP,A_NOP,A_VMULSD,A_NOP,A_NOP,A_NOP,A_NOP,A_NOP,A_NOP,A_VSUBSD,A_NOP,A_NOP,A_NOP
+            )
+          ),
+          ( { vectorized/packed }
+            { because the logical packed single instructions have shorter op codes, we use always
+              these
+            }
+            ( { OS_F32 }
+              A_NOP,A_NOP,A_VADDPS,A_NOP,A_VDIVPS,A_NOP,A_NOP,A_VMULPS,A_NOP,A_NOP,A_NOP,A_NOP,A_NOP,A_NOP,A_VSUBPS,A_VXORPS,A_NOP,A_NOP
+            ),
+            ( { OS_F64 }
+              A_NOP,A_NOP,A_VADDPD,A_NOP,A_VDIVPD,A_NOP,A_NOP,A_VMULPD,A_NOP,A_NOP,A_NOP,A_NOP,A_NOP,A_NOP,A_VSUBPD,A_VXORPD,A_NOP,A_NOP
+            )
+          )
+        );
+
+      var
+        resultreg : tregister;
+        asmop : tasmop;
+      begin
+        { this is an internally used procedure so the parameters have
+          some constrains
+        }
+        if loc.size<>size then
+          internalerror(2013061108);
+        resultreg:=dst;
+        { deshuffle }
+        //!!!
+        if (shuffle<>nil) and not(shufflescalar(shuffle)) then
+          begin
+            internalerror(2013061107);
+          end
+        else if (shuffle=nil) then
+          asmop:=opmm2asmop[1,size,op]
+        else if shufflescalar(shuffle) then
+          begin
+            asmop:=opmm2asmop[0,size,op];
+            { no scalar operation available? }
+            if asmop=A_NOP then
+              begin
+                { do vectorized and shuffle finally }
+                internalerror(2010060102);
+              end;
+          end
+        else
+          internalerror(2013061106);
+        if asmop=A_NOP then
+          internalerror(2013061105);
+        case loc.loc of
+          LOC_CREFERENCE,LOC_REFERENCE:
+            begin
+              make_simple_ref(current_asmdata.CurrAsmList,loc.reference);
+              list.concat(taicpu.op_ref_reg_reg(asmop,S_NO,loc.reference,src,resultreg));
+            end;
+          LOC_CMMREGISTER,LOC_MMREGISTER:
+            list.concat(taicpu.op_reg_reg_reg(asmop,S_NO,loc.register,src,resultreg));
+          else
+            internalerror(2013061104);
+        end;
+        { shuffle }
+        if resultreg<>dst then
+          begin
+            internalerror(2013061103);
+          end;
+      end;
+
+
+    procedure tcgx86.a_opmm_reg_reg_reg(list: TAsmList; Op: TOpCG; size : tcgsize;src1,src2,dst: tregister;shuffle : pmmshuffle);
+      var
+        l : tlocation;
+      begin
+        l.loc:=LOC_MMREGISTER;
+        l.register:=src1;
+        l.size:=size;
+        opmm_loc_reg_reg(list,op,size,l,src2,dst,shuffle);
+      end;
+
+
+    procedure tcgx86.a_opmm_ref_reg_reg(list: TAsmList; Op: TOpCG; size : tcgsize;const ref: treference; src,dst: tregister;shuffle : pmmshuffle);
+      var
+        l : tlocation;
+      begin
+        l.loc:=LOC_REFERENCE;
+        l.reference:=ref;
+        l.size:=size;
+        opmm_loc_reg_reg(list,op,size,l,src,dst,shuffle);
+      end;
+
+
    procedure tcgx86.opmm_loc_reg(list: TAsmList; Op: TOpCG; size : tcgsize;loc : tlocation;dst: tregister; shuffle : pmmshuffle);
      const
        opmm2asmop : array[0..1,OS_F32..OS_F64,topcg] of tasmop = (
@ -1319,7 +1470,6 @@ unit cgx86;
            )
          )
        );
-
      var
        resultreg : tregister;
        asmop : tasmop;
--- a/compiler/x86/nx86add.pas
+++ b/compiler/x86/nx86add.pas
@ -41,7 +41,10 @@ unit nx86add;
        procedure emit_generic_code(op:TAsmOp;opsize:TCgSize;unsigned,extra_not,mboverflow:boolean);

        procedure second_cmpfloatsse;
+        procedure second_cmpfloatavx;
+
        procedure second_addfloatsse;
+        procedure second_addfloatavx;
      public
        procedure second_addfloat;override;
 {$ifndef i8086}
@ -794,6 +797,141 @@ unit nx86add;
          end;
      end;

+    procedure tx86addnode.second_addfloatavx;
+      var
+        op : topcg;
+        sqr_sum : boolean;
+        tmp : tnode;
+      begin
+        sqr_sum:=false;
+{$ifdef dummy}
+        if (current_settings.fputype>=fpu_sse3) and
+           use_vectorfpu(resultdef) and
+           (nodetype in [addn,subn]) and
+          (left.nodetype=inlinen) and (tinlinenode(left).inlinenumber=in_sqr_real) and
+          (right.nodetype=inlinen) and (tinlinenode(right).inlinenumber=in_sqr_real) then
+          begin
+            sqr_sum:=true;
+            tmp:=tinlinenode(left).left;
+            tinlinenode(left).left:=nil;
+            left.free;
+            left:=tmp;
+
+            tmp:=tinlinenode(right).left;
+            tinlinenode(right).left:=nil;
+            right.free;
+            right:=tmp;
+          end;
+{$endif dummy}
+
+        pass_left_right;
+        check_left_and_right_fpureg(false);
+
+        if (nf_swapped in flags) then
+          { can't use swapleftright if both are on the fpu stack, since then }
+          { both are "R_ST" -> nothing would change -> manually switch       }
+          if (left.location.loc = LOC_FPUREGISTER) and
+             (right.location.loc = LOC_FPUREGISTER) then
+            emit_none(A_FXCH,S_NO)
+          else
+            swapleftright;
+
+        case nodetype of
+          addn :
+            op:=OP_ADD;
+          muln :
+            op:=OP_MUL;
+          subn :
+            op:=OP_SUB;
+          slashn :
+            op:=OP_DIV;
+          else
+            internalerror(200312231);
+        end;
+
+        location_reset(location,LOC_MMREGISTER,def_cgsize(resultdef));
+
+        if sqr_sum then
+          begin
+            if nf_swapped in flags then
+              swapleftright;
+
+            hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,false);
+            hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,right.location,right.resultdef,true);
+            location:=left.location;
+            if is_double(resultdef) then
+              begin
+                current_asmdata.CurrAsmList.concat(taicpu.op_const_reg_reg(A_SHUFPD,S_NO,%00,right.location.register,location.register));
+                current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_MULPD,S_NO,location.register,location.register));
+                case nodetype of
+                  addn:
+                    current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_HADDPD,S_NO,location.register,location.register));
+                  subn:
+                    current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_HSUBPD,S_NO,location.register,location.register));
+                  else
+                    internalerror(201108162);
+                end;
+              end
+            else
+              begin
+                current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_UNPCKLPS,S_NO,right.location.register,location.register));
+                { ensure that bits 64..127 contain valid values }
+                current_asmdata.CurrAsmList.concat(taicpu.op_const_reg_reg(A_SHUFPD,S_NO,%00,location.register,location.register));
+                { the data is now in bits 0..32 and 64..95 }
+                current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_MULPS,S_NO,location.register,location.register));
+                case nodetype of
+                  addn:
+                    begin
+                      current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_HADDPS,S_NO,location.register,location.register));
+                    end;
+                  subn:
+                    begin
+                      current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_HSUBPS,S_NO,location.register,location.register));
+                    end;
+                  else
+                    internalerror(201108163);
+                end;
+              end
+          end
+        { we can use only right as left operand if the operation is commutative }
+        else if (right.location.loc=LOC_MMREGISTER) and (op in [OP_ADD,OP_MUL]) then
+          begin
+            location.register:=cg.getmmregister(current_asmdata.CurrAsmList,left.location.size);
+            { force floating point reg. location to be written to memory,
+              we don't force it to mm register because writing to memory
+              allows probably shorter code because there is no direct fpu->mm register
+              copy instruction
+            }
+            if left.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER] then
+              hlcg.location_force_mem(current_asmdata.CurrAsmList,left.location,left.resultdef);
+            cg.a_opmm_loc_reg_reg(current_asmdata.CurrAsmList,op,location.size,
+              left.location,
+              right.location.register,
+              location.register,
+              mms_movescalar);
+          end
+        else
+          begin
+            if (nf_swapped in flags) then
+              swapleftright;
+
+            hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,true);
+            location.register:=cg.getmmregister(current_asmdata.CurrAsmList,left.location.size);
+            { force floating point reg. location to be written to memory,
+              we don't force it to mm register because writing to memory
+              allows probably shorter code because there is no direct fpu->mm register
+              copy instruction
+            }
+            if right.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER] then
+              hlcg.location_force_mem(current_asmdata.CurrAsmList,right.location,right.resultdef);
+            cg.a_opmm_loc_reg_reg(current_asmdata.CurrAsmList,op,location.size,
+              right.location,
+              left.location.register,
+              location.register,
+              mms_movescalar);
+          end;
+      end;
+

    procedure tx86addnode.second_cmpfloatsse;
      var
@ -860,6 +998,72 @@ unit nx86add;
      end;


+
+    procedure tx86addnode.second_cmpfloatavx;
+      var
+        op : tasmop;
+      begin
+        if is_single(left.resultdef) then
+          op:=A_VCOMISS
+        else if is_double(left.resultdef) then
+          op:=A_VCOMISD
+        else
+          internalerror(200402222);
+        pass_left_right;
+
+        location_reset(location,LOC_FLAGS,def_cgsize(resultdef));
+        { we can use only right as left operand if the operation is commutative }
+        if (right.location.loc=LOC_MMREGISTER) then
+          begin
+            { force floating point reg. location to be written to memory,
+              we don't force it to mm register because writing to memory
+              allows probably shorter code because there is no direct fpu->mm register
+              copy instruction
+            }
+            if left.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER] then
+              hlcg.location_force_mem(current_asmdata.CurrAsmList,left.location,left.resultdef);
+            case left.location.loc of
+              LOC_REFERENCE,LOC_CREFERENCE:
+                begin
+                  tcgx86(cg).make_simple_ref(current_asmdata.CurrAsmList,left.location.reference);
+                  current_asmdata.CurrAsmList.concat(taicpu.op_ref_reg(op,S_NO,left.location.reference,right.location.register));
+                end;
+              LOC_MMREGISTER,LOC_CMMREGISTER:
+                current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(op,S_NO,left.location.register,right.location.register));
+              else
+                internalerror(200402221);
+            end;
+            if nf_swapped in flags then
+              exclude(flags,nf_swapped)
+            else
+              include(flags,nf_swapped)
+          end
+        else
+          begin
+            hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,true);
+            { force floating point reg. location to be written to memory,
+              we don't force it to mm register because writing to memory
+              allows probably shorter code because there is no direct fpu->mm register
+              copy instruction
+            }
+            if right.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER] then
+              hlcg.location_force_mem(current_asmdata.CurrAsmList,right.location,right.resultdef);
+            case right.location.loc of
+              LOC_REFERENCE,LOC_CREFERENCE:
+                begin
+                  tcgx86(cg).make_simple_ref(current_asmdata.CurrAsmList,right.location.reference);
+                  current_asmdata.CurrAsmList.concat(taicpu.op_ref_reg(op,S_NO,right.location.reference,left.location.register));
+                end;
+              LOC_MMREGISTER,LOC_CMMREGISTER:
+                current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(op,S_NO,right.location.register,left.location.register));
+              else
+                internalerror(200402223);
+            end;
+          end;
+        location.resflags:=getresflags(true);
+      end;
+
+
    procedure tx86addnode.second_opvector;
      var
        op : topcg;
@ -912,7 +1116,10 @@ unit nx86add;
      begin
        if use_vectorfpu(resultdef) then
          begin
-            second_addfloatsse;
+            if UseAVX then
+              second_addfloatavx
+            else
+              second_addfloatsse;
            exit;
          end;

@ -959,7 +1166,10 @@ unit nx86add;
      begin
        if use_vectorfpu(left.resultdef) or use_vectorfpu(right.resultdef) then
          begin
-            second_cmpfloatsse;
+            if UseAVX then
+              second_cmpfloatavx
+            else
+              second_cmpfloatsse;
            exit;
          end;

--- a/compiler/x86/nx86cnv.pas
+++ b/compiler/x86/nx86cnv.pas
@ -276,14 +276,25 @@ implementation
          begin
            location_reset(location,LOC_MMREGISTER,def_cgsize(resultdef));
            location.register:=cg.getmmregister(current_asmdata.CurrAsmList,location.size);
-            case location.size of
-              OS_F32:
-                op:=A_CVTSI2SS;
-              OS_F64:
-                op:=A_CVTSI2SD;
-              else
-                internalerror(2007120902);
-            end;
+            if UseAVX then
+              case location.size of
+                OS_F32:
+                  op:=A_VCVTSI2SS;
+                OS_F64:
+                  op:=A_VCVTSI2SD;
+                else
+                  internalerror(2007120902);
+              end
+            else
+              case location.size of
+                OS_F32:
+                  op:=A_CVTSI2SS;
+                OS_F64:
+                  op:=A_CVTSI2SD;
+                else
+                  internalerror(2007120902);
+              end;
+
            { don't use left.location.size, because that one may be OS_32/OS_64
              if the lower bound of the orddef >= 0
            }
@ -301,11 +312,19 @@ implementation
                begin
                  href:=left.location.reference;
                  tcgx86(cg).make_simple_ref(current_asmdata.CurrAsmList,href);
-                  current_asmdata.CurrAsmList.concat(taicpu.op_ref_reg(op,opsize,href,location.register));
+                  if UseAVX then
+                    { VCVTSI2.. requires a second source operand to copy bits 64..127 }
+                    current_asmdata.CurrAsmList.concat(taicpu.op_ref_reg_reg(op,opsize,href,location.register,location.register))
+                  else
+                    current_asmdata.CurrAsmList.concat(taicpu.op_ref_reg(op,opsize,href,location.register));
                end;
              LOC_REGISTER,
              LOC_CREGISTER:
-                current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(op,opsize,left.location.register,location.register));
+                if UseAVX then
+                    { VCVTSI2.. requires a second source operand to copy bits 64..127 }
+                  current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(op,opsize,left.location.register,location.register,location.register))
+                else
+                  current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(op,opsize,left.location.register,location.register));
            end;
          end
        else
--- a/compiler/x86/nx86inl.pas
+++ b/compiler/x86/nx86inl.pas
@ -289,14 +289,24 @@ implementation
             hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,false);
             location_reset(location,LOC_REGISTER,OS_S64);
             location.register:=cg.getintregister(current_asmdata.CurrAsmList,OS_S64);
-             case left.location.size of
-               OS_F32:
-                 current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_CVTSS2SI,S_Q,left.location.register,location.register));
-               OS_F64:
-                 current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_CVTSD2SI,S_Q,left.location.register,location.register));
-               else
-                 internalerror(2007031402);
-             end;
+             if UseAVX then
+               case left.location.size of
+                 OS_F32:
+                   current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_VCVTSS2SI,S_Q,left.location.register,location.register));
+                 OS_F64:
+                   current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_VCVTSD2SI,S_Q,left.location.register,location.register));
+                 else
+                   internalerror(2007031402);
+               end
+             else
+               case left.location.size of
+                 OS_F32:
+                   current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_CVTSS2SI,S_Q,left.location.register,location.register));
+                 OS_F64:
+                   current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_CVTSD2SI,S_Q,left.location.register,location.register));
+                 else
+                   internalerror(2007031402);
+               end;
           end
         else
 {$endif x86_64}
@ -323,14 +333,24 @@ implementation
             hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,false);
             location_reset(location,LOC_REGISTER,OS_S64);
             location.register:=cg.getintregister(current_asmdata.CurrAsmList,OS_S64);
-             case left.location.size of
-               OS_F32:
-                 current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_CVTTSS2SI,S_Q,left.location.register,location.register));
-               OS_F64:
-                 current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_CVTTSD2SI,S_Q,left.location.register,location.register));
-               else
-                 internalerror(2007031401);
-             end;
+             if UseAVX then
+               case left.location.size of
+                 OS_F32:
+                   current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_VCVTTSS2SI,S_Q,left.location.register,location.register));
+                 OS_F64:
+                   current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_VCVTTSD2SI,S_Q,left.location.register,location.register));
+                 else
+                   internalerror(2007031401);
+               end
+             else
+               case left.location.size of
+                 OS_F32:
+                   current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_CVTTSS2SI,S_Q,left.location.register,location.register));
+                 OS_F64:
+                   current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_CVTTSD2SI,S_Q,left.location.register,location.register));
+                 else
+                   internalerror(2007031401);
+               end;
           end
         else
 {$endif x86_64}
@ -371,9 +391,18 @@ implementation
         if use_vectorfpu(resultdef) then
           begin
             secondpass(left);
-             hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,false);
-             location:=left.location;
-             cg.a_opmm_loc_reg(current_asmdata.CurrAsmList,OP_MUL,left.location.size,left.location,left.location.register,mms_movescalar);
+             location_reset(location,LOC_MMREGISTER,left.location.size);
+             location.register:=cg.getmmregister(current_asmdata.CurrAsmList,location.size);
+             if UseAVX then
+               begin
+                 hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,true);
+                 cg.a_opmm_reg_reg_reg(current_asmdata.CurrAsmList,OP_MUL,left.location.size,left.location.register,left.location.register,location.register,mms_movescalar);
+               end
+             else
+               begin
+                 cg.a_loadmm_loc_reg(current_asmdata.CurrAsmList,location.size,left.location,location.register,mms_movescalar);
+                 cg.a_opmm_reg_reg(current_asmdata.CurrAsmList,OP_MUL,left.location.size,location.register,location.register,mms_movescalar);
+               end;
           end
         else
           begin
@ -389,15 +418,26 @@ implementation
           begin
             secondpass(left);
             hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,false);
-             location:=left.location;
-             case tfloatdef(resultdef).floattype of
-               s32real:
-                 current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_SQRTSS,S_XMM,location.register,location.register));
-               s64real:
-                 current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_SQRTSD,S_XMM,location.register,location.register));
-               else
-                 internalerror(200510031);
-             end;
+             location_reset(location,LOC_MMREGISTER,left.location.size);
+             location.register:=cg.getmmregister(current_asmdata.CurrAsmList,location.size);
+             if UseAVX then
+               case tfloatdef(resultdef).floattype of
+                 s32real:
+                   current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(A_VSQRTSS,S_XMM,left.location.register,location.register,location.register));
+                 s64real:
+                   current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(A_VSQRTSD,S_XMM,left.location.register,location.register,location.register));
+                 else
+                   internalerror(200510031);
+               end
+             else
+               case tfloatdef(resultdef).floattype of
+                 s32real:
+                   current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_SQRTSS,S_XMM,left.location.register,location.register));
+                 s64real:
+                   current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_SQRTSD,S_XMM,left.location.register,location.register));
+                 else
+                   internalerror(200510031);
+               end;
           end
         else
           begin
--- a/compiler/x86/nx86mat.pas
+++ b/compiler/x86/nx86mat.pas
@ -154,14 +154,11 @@ interface

        if expectloc=LOC_MMREGISTER then
          begin
-            hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,false);
+            hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,true);
            location_reset(location,LOC_MMREGISTER,def_cgsize(resultdef));

            { make life of register allocator easier }
            location.register:=cg.getmmregister(current_asmdata.CurrAsmList,def_cgsize(resultdef));
-            cg.a_loadmm_reg_reg(current_asmdata.CurrAsmList,def_cgsize(resultdef),def_cgsize(resultdef),left.location.register,location.register,mms_movescalar);
-
-            reg:=cg.getmmregister(current_asmdata.CurrAsmList,def_cgsize(resultdef));

            current_asmdata.getdatalabel(l1);
            new_section(current_asmdata.asmlists[al_typedconsts],sec_rodata_norel,l1.name,const_align(sizeof(pint)));
@ -179,9 +176,16 @@ interface
            end;

            reference_reset_symbol(href,l1,0,resultdef.alignment);
+            reg:=cg.getmmregister(current_asmdata.CurrAsmList,def_cgsize(resultdef));
            cg.a_loadmm_ref_reg(current_asmdata.CurrAsmList,def_cgsize(resultdef),def_cgsize(resultdef),href,reg,mms_movescalar);

-            cg.a_opmm_reg_reg(current_asmdata.CurrAsmList,OP_XOR,left.location.size,reg,location.register,nil);
+            if UseAVX then
+              cg.a_opmm_reg_reg_reg(current_asmdata.CurrAsmList,OP_XOR,left.location.size,reg,left.location.register,location.register,nil)
+            else
+              begin
+                cg.a_loadmm_reg_reg(current_asmdata.CurrAsmList,def_cgsize(resultdef),def_cgsize(resultdef),left.location.register,location.register,mms_movescalar);
+                cg.a_opmm_reg_reg(current_asmdata.CurrAsmList,OP_XOR,left.location.size,reg,location.register,nil);
+              end;
          end
        else
          begin
--- a/compiler/x86/rgx86.pas
+++ b/compiler/x86/rgx86.pas
@ -134,143 +134,164 @@ implementation
                end;
              2,3 :
                begin
-                  { We can handle opcodes with 2 and 3 operands the same way. The opcodes
-                    with 3 registers are shrd/shld, where the 3rd operand is const or CL,
-                    that doesn't need spilling.
-                    However, due to AT&T order inside the compiler, the 3rd operand is
-                    numbered 0, so look at operand no. 1 and 2 if we have 3 operands by
-                    adding a "n". }
-                  n:=0;
-                  if ops=3 then
-                    n:=1;
-                  if (oper[n+0]^.typ=top_reg) and
-                     (oper[n+1]^.typ=top_reg) and
-                     ((getregtype(oper[n+0]^.reg)<>regtype) or
-                      (getregtype(oper[n+1]^.reg)<>regtype) or
-                      (get_alias(getsupreg(oper[n+0]^.reg))<>get_alias(getsupreg(oper[n+1]^.reg)))) then
+                  { avx instruction?
+                    currently this rule is sufficient but it might be extended }
+                  if (ops=3) and (opcode<>A_SHRD) and (opcode<>A_SHLD) then
                    begin
-                      if (getregtype(oper[n+0]^.reg)=regtype) and
-                         (get_alias(getsupreg(oper[n+0]^.reg))=orgreg) then
-                        replaceoper:=0+n
-                      else if (getregtype(oper[n+1]^.reg)=regtype) and
-                         (get_alias(getsupreg(oper[n+1]^.reg))=orgreg) then
-                        replaceoper:=1+n;
+                      { avx instructions allow only the first operand (at&t counting) to be a register operand }
+                      { all operands must be registers ... }
+                      if (oper[0]^.typ=top_reg) and
+                         (oper[1]^.typ=top_reg) and
+                         (oper[2]^.typ=top_reg) and
+                         { but they must be different }
+                         ((getregtype(oper[1]^.reg)<>regtype) or
+                          (get_alias(getsupreg(oper[0]^.reg))<>get_alias(getsupreg(oper[1]^.reg)))
+                         ) and
+                         ((getregtype(oper[2]^.reg)<>regtype) or
+                          (get_alias(getsupreg(oper[0]^.reg))<>get_alias(getsupreg(oper[2]^.reg)))
+                         ) and
+                         (get_alias(getsupreg(oper[0]^.reg))=orgreg) then
+                        replaceoper:=0;
                    end
-                  else if (oper[n+0]^.typ=top_reg) and
-                     (oper[n+1]^.typ=top_const) then
+                  else
                    begin
-                      if (getregtype(oper[0+n]^.reg)=regtype) and
-                         (get_alias(getsupreg(oper[0+n]^.reg))=orgreg) then
-                        replaceoper:=0+n
-                      else
-                        internalerror(200704282);
-                    end
-                  else if (oper[n+0]^.typ=top_const) and
-                     (oper[n+1]^.typ=top_reg) then
-                    begin
-                      if (getregtype(oper[1+n]^.reg)=regtype) and
-                         (get_alias(getsupreg(oper[1+n]^.reg))=orgreg) then
-                        replaceoper:=1+n
-                      else
-                        internalerror(200704283);
-                    end;
-                  case replaceoper of
-                    0 :
-                      begin
-                        { Some instructions don't allow memory references
-                          for source }
-                        case instr.opcode of
-                          A_BT,
-                          A_BTS,
-                          A_BTC,
-                          A_BTR,
-
-                          { shufp* would require 16 byte alignment for memory locations so we force the source
-                            operand into a register }
-                          A_SHUFPD,
-                          A_SHUFPS :
-                            replaceoper:=-1;
+                      { We can handle opcodes with 2 and shrd/shld the same way, where the 3rd operand is const or CL,
+                        that doesn't need spilling.
+                        However, due to AT&T order inside the compiler, the 3rd operand is
+                        numbered 0, so look at operand no. 1 and 2 if we have 3 operands by
+                        adding a "n". }
+                      n:=0;
+                      if ops=3 then
+                        n:=1;
+                      if (oper[n+0]^.typ=top_reg) and
+                         (oper[n+1]^.typ=top_reg) and
+                         ((getregtype(oper[n+0]^.reg)<>regtype) or
+                          (getregtype(oper[n+1]^.reg)<>regtype) or
+                          (get_alias(getsupreg(oper[n+0]^.reg))<>get_alias(getsupreg(oper[n+1]^.reg)))) then
+                        begin
+                          if (getregtype(oper[n+0]^.reg)=regtype) and
+                             (get_alias(getsupreg(oper[n+0]^.reg))=orgreg) then
+                            replaceoper:=0+n
+                          else if (getregtype(oper[n+1]^.reg)=regtype) and
+                             (get_alias(getsupreg(oper[n+1]^.reg))=orgreg) then
+                            replaceoper:=1+n;
+                        end
+                      else if (oper[n+0]^.typ=top_reg) and
+                         (oper[n+1]^.typ=top_const) then
+                        begin
+                          if (getregtype(oper[0+n]^.reg)=regtype) and
+                             (get_alias(getsupreg(oper[0+n]^.reg))=orgreg) then
+                            replaceoper:=0+n
+                          else
+                            internalerror(200704282);
+                        end
+                      else if (oper[n+0]^.typ=top_const) and
+                         (oper[n+1]^.typ=top_reg) then
+                        begin
+                          if (getregtype(oper[1+n]^.reg)=regtype) and
+                             (get_alias(getsupreg(oper[1+n]^.reg))=orgreg) then
+                            replaceoper:=1+n
+                          else
+                            internalerror(200704283);
                        end;
-                      end;
-                    1 :
-                      begin
-                        { Some instructions don't allow memory references
-                          for destination }
-                        case instr.opcode of
-                          A_CMOVcc,
-                          A_MOVZX,
-                          A_MOVSX,
-                          A_MOVSXD,
-                          A_MULSS,
-                          A_MULSD,
-                          A_SUBSS,
-                          A_SUBSD,
-                          A_ADDSD,
-                          A_ADDSS,
-                          A_DIVSD,
-                          A_DIVSS,
-                          A_SHLD,
-                          A_SHRD,
-                          A_COMISD,
-                          A_COMISS,
-                          A_CVTDQ2PD,
-                          A_CVTDQ2PS,
-                          A_CVTPD2DQ,
-                          A_CVTPD2PI,
-                          A_CVTPD2PS,
-                          A_CVTPI2PD,
-                          A_CVTPS2DQ,
-                          A_CVTPS2PD,
-                          A_CVTSD2SI,
-                          A_CVTSD2SS,
-                          A_CVTSI2SD,
-                          A_CVTSS2SD,
-                          A_CVTTPD2PI,
-                          A_CVTTPD2DQ,
-                          A_CVTTPS2DQ,
-                          A_CVTTSD2SI,
-                          A_CVTPI2PS,
-                          A_CVTPS2PI,
-                          A_CVTSI2SS,
-                          A_CVTSS2SI,
-                          A_CVTTPS2PI,
-                          A_CVTTSS2SI,
-                          A_IMUL,
-                          A_XORPD,
-                          A_XORPS,
-                          A_ORPD,
-                          A_ORPS,
-                          A_ANDPD,
-                          A_ANDPS,
-                          A_UNPCKLPS,
-                          A_UNPCKHPS,
-                          A_SHUFPD,
-                          A_SHUFPS:
+                      case replaceoper of
+                        0 :
+                          begin
+                            { Some instructions don't allow memory references
+                              for source }
+                            case instr.opcode of
+                              A_BT,
+                              A_BTS,
+                              A_BTC,
+                              A_BTR,

-                            replaceoper:=-1;
+                              { shufp* would require 16 byte alignment for memory locations so we force the source
+                                operand into a register }
+                              A_SHUFPD,
+                              A_SHUFPS :
+                                replaceoper:=-1;
+                            end;
+                          end;
+                        1 :
+                          begin
+                            { Some instructions don't allow memory references
+                              for destination }
+                            case instr.opcode of
+                              A_CMOVcc,
+                              A_MOVZX,
+                              A_MOVSX,
+                              A_MOVSXD,
+                              A_MULSS,
+                              A_MULSD,
+                              A_SUBSS,
+                              A_SUBSD,
+                              A_ADDSD,
+                              A_ADDSS,
+                              A_DIVSD,
+                              A_DIVSS,
+                              A_SHLD,
+                              A_SHRD,
+                              A_COMISD,
+                              A_COMISS,
+                              A_CVTDQ2PD,
+                              A_CVTDQ2PS,
+                              A_CVTPD2DQ,
+                              A_CVTPD2PI,
+                              A_CVTPD2PS,
+                              A_CVTPI2PD,
+                              A_CVTPS2DQ,
+                              A_CVTPS2PD,
+                              A_CVTSD2SI,
+                              A_CVTSD2SS,
+                              A_CVTSI2SD,
+                              A_CVTSS2SD,
+                              A_CVTTPD2PI,
+                              A_CVTTPD2DQ,
+                              A_CVTTPS2DQ,
+                              A_CVTTSD2SI,
+                              A_CVTPI2PS,
+                              A_CVTPS2PI,
+                              A_CVTSI2SS,
+                              A_CVTSS2SI,
+                              A_CVTTPS2PI,
+                              A_CVTTSS2SI,
+                              A_IMUL,
+                              A_XORPD,
+                              A_XORPS,
+                              A_ORPD,
+                              A_ORPS,
+                              A_ANDPD,
+                              A_ANDPS,
+                              A_UNPCKLPS,
+                              A_UNPCKHPS,
+                              A_SHUFPD,
+                              A_SHUFPS:
+
+                                replaceoper:=-1;
 {$ifdef x86_64}
-                          A_MOV:
-                             { 64 bit constants can only be moved into registers }
-                             if (oper[0]^.typ=top_const) and
-                                (oper[1]^.typ=top_reg) and
-                                ((oper[0]^.val<low(longint)) or
-                                 (oper[0]^.val>high(longint))) then
-                               replaceoper:=-1;
+                              A_MOV:
+                                 { 64 bit constants can only be moved into registers }
+                                 if (oper[0]^.typ=top_const) and
+                                    (oper[1]^.typ=top_reg) and
+                                    ((oper[0]^.val<low(longint)) or
+                                     (oper[0]^.val>high(longint))) then
+                                   replaceoper:=-1;
 {$endif x86_64}
+                            end;
+                          end;
                        end;
-                      end;
                    end;
                end;
             end;

-            {$ifdef x86_64}
+{$ifdef x86_64}
            { 32 bit operations on 32 bit registers on x86_64 can result in
              zeroing the upper 32 bits of the register. This does not happen
              with memory operations, so we have to perform these calculations
              in registers.  }
            if (instr.opsize=S_L) then
              replaceoper:=-1;
-            {$endif x86_64}
+{$endif x86_64}

            { Replace register with spill reference }
            if replaceoper<>-1 then
@ -287,6 +308,10 @@ implementation
                    opcode:=A_MOVSS;
                  A_MOVAPD:
                    opcode:=A_MOVSD;
+                  A_VMOVAPS:
+                    opcode:=A_VMOVSS;
+                  A_VMOVAPD:
+                    opcode:=A_VMOVSD;
                end;
                result:=true;
              end;
--- a/compiler/x86/x86ins.dat
+++ b/compiler/x86/x86ins.dat
@ -3453,22 +3453,22 @@ void                   \326\1\xA7                                    X86_64


 [VADDPD]
-(Ch_All, Ch_None, Ch_None)
+(Ch_Wop3, Ch_Rop2, Ch_Rop1)
 xmmreg,xmmreg,xmmrm                  \361\362\370\1\x58\75\120            AVX,SANDYBRIDGE
 ymmreg,ymmreg,ymmrm                  \361\362\364\370\1\x58\75\120        AVX,SANDYBRIDGE

 [VADDPS]
-(Ch_All, Ch_None, Ch_None)
+(Ch_Wop3, Ch_Rop2, Ch_Rop1)
 xmmreg,xmmreg,xmmrm                  \362\370\1\x58\75\120                AVX,SANDYBRIDGE
 ymmreg,ymmreg,ymmrm                  \362\364\370\1\x58\75\120            AVX,SANDYBRIDGE

 [VADDSD]
-(Ch_All, Ch_None, Ch_None)
+(Ch_Wop3, Ch_Rop2, Ch_Rop1)
 xmmreg,xmmreg,mem64                  \334\362\370\1\x58\75\120            AVX,SANDYBRIDGE
 xmmreg,xmmreg,xmmreg                 \334\362\370\1\x58\75\120            AVX,SANDYBRIDGE

 [VADDSS]
-(Ch_All, Ch_None, Ch_None)
+(Ch_Wop3, Ch_Rop2, Ch_Rop1)
 xmmreg,xmmreg,mem32                  \333\362\370\1\x58\75\120            AVX,SANDYBRIDGE
 xmmreg,xmmreg,xmmreg                 \333\362\370\1\x58\75\120            AVX,SANDYBRIDGE

@ -3919,7 +3919,7 @@ rm64,xmmreg                          \361\362\363\370\1\x7E\101           AVX,SA
 xmmreg,rm64                          \361\362\363\370\1\x6E\110           AVX,SANDYBRIDGE

 [VMOVSD]
-(Ch_All, Ch_None, Ch_None)
+(Ch_Wop3, Ch_Rop2, Ch_Rop1)
 xmmreg,xmmreg,xmmreg                 \334\362\370\1\x10\75\120            AVX,SANDYBRIDGE
 xmmreg,mem64                         \334\362\370\1\x10\110               AVX,SANDYBRIDGE
 xmmreg,xmmreg,xmmreg                 \334\362\370\1\x11\75\102            AVX,SANDYBRIDGE
@ -3936,7 +3936,7 @@ xmmreg,xmmrm                         \333\362\370\1\x12\110               AVX,SA
 ymmreg,ymmrm                         \333\362\364\370\1\x12\110           AVX,SANDYBRIDGE

 [VMOVSS]
-(Ch_All, Ch_None, Ch_None)
+(Ch_Wop3, Ch_Rop2, Ch_Rop1)
 xmmreg,xmmreg,xmmreg                 \333\362\370\1\x10\75\120            AVX,SANDYBRIDGE
 xmmreg,mem64                         \333\362\370\1\x10\110               AVX,SANDYBRIDGE
 xmmreg,xmmreg,xmmreg                 \333\362\370\1\x11\75\102            AVX,SANDYBRIDGE
@ -3961,22 +3961,22 @@ ymmrm,ymmreg                         \362\364\370\1\x11\101               AVX,SA
 xmmreg,xmmreg,xmmrm,imm8             \361\362\372\1\x42\75\120\27         AVX,SANDYBRIDGE

 [VMULPD]
-(Ch_All, Ch_None, Ch_None)
+(Ch_Wop3, Ch_Rop2, Ch_Rop1)
 xmmreg,xmmreg,xmmrm                  \361\362\370\1\x59\75\120            AVX,SANDYBRIDGE
 ymmreg,ymmreg,ymmrm                  \361\362\364\370\1\x59\75\120        AVX,SANDYBRIDGE

 [VMULPS]
-(Ch_All, Ch_None, Ch_None)
+(Ch_Wop3, Ch_Rop2, Ch_Rop1)
 xmmreg,xmmreg,xmmrm                  \362\370\1\x59\75\120                AVX,SANDYBRIDGE
 ymmreg,ymmreg,ymmrm                  \362\364\370\1\x59\75\120            AVX,SANDYBRIDGE

 [VMULSD]
-(Ch_All, Ch_None, Ch_None)
+(Ch_Wop3, Ch_Rop2, Ch_Rop1)
 xmmreg,xmmreg,mem64                  \334\362\370\1\x59\75\120            AVX,SANDYBRIDGE
 xmmreg,xmmreg,xmmreg                 \334\362\370\1\x59\75\120            AVX,SANDYBRIDGE

 [VMULSS]
-(Ch_All, Ch_None, Ch_None)
+(Ch_Wop3, Ch_Rop2, Ch_Rop1)
 xmmreg,xmmreg,mem32                  \333\362\370\1\x59\75\120            AVX,SANDYBRIDGE
 xmmreg,xmmreg,xmmreg                 \333\362\370\1\x59\75\120            AVX,SANDYBRIDGE

--- a/compiler/x86_64/cpuinfo.pas
+++ b/compiler/x86_64/cpuinfo.pas
@ -51,7 +51,8 @@ Type
      fpu_ssse3,
      fpu_sse41,
      fpu_sse42,
-      fpu_avx
+      fpu_avx,
+      fpu_avx2
     );

 Const
@ -86,11 +87,14 @@ Const
     'SSSE3',
     'SSE41',
     'SSE42',
-     'AVX'
+     'AVX',
+     'AVX2'
   );

-   sse_singlescalar : set of tfputype = [fpu_sse64,fpu_sse3];
-   sse_doublescalar : set of tfputype = [fpu_sse64,fpu_sse3];
+   sse_singlescalar = [fpu_sse64..fpu_avx2];
+   sse_doublescalar = [fpu_sse64..fpu_avx2];
+
+   fpu_avx_instructionsets = [fpu_avx,fpu_avx2];

   { Supported optimizations, only used for information }
   supported_optimizerswitches = genericlevel1optimizerswitches+
--- a/compiler/x86_64/x8664pro.inc
+++ b/compiler/x86_64/x8664pro.inc
@ -685,6 +685,10 @@
 (Ch: (Ch_RRAX, Ch_WMemEDI, Ch_RWRDI)),
 (Ch: (Ch_WRAX, Ch_RWRSI, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
+(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
+(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
+(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
+(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
@ -772,21 +776,17 @@
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
+(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
+(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
+(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
+(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
+(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
+(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),