* basic avx support for floating point operations (use -Cfavx to activate)

git-svn-id: trunk@24896 -
This commit is contained in:
florian 2013-06-14 20:03:01 +00:00
parent 6a8e4f0381
commit e81d2d1f3b
13 changed files with 764 additions and 244 deletions

View File

@ -52,8 +52,10 @@ unit cgobj;
by Free Pascal. For 32-bit processors, the base class
should be @link(tcg64f32) and not @var(tcg).
}
{ tcg }
tcg = class
public
{ how many times is this current code executed }
executionweight : longint;
alignment : talignment;
@ -271,6 +273,9 @@ unit cgobj;
procedure a_opmm_ref_reg(list: TAsmList; Op: TOpCG; size : tcgsize;const ref: treference; reg: tregister;shuffle : pmmshuffle); virtual;
procedure a_opmm_loc_reg(list: TAsmList; Op: TOpCG; size : tcgsize;const loc: tlocation; reg: tregister;shuffle : pmmshuffle); virtual;
procedure a_opmm_reg_ref(list: TAsmList; Op: TOpCG; size : tcgsize;reg: tregister;const ref: treference; shuffle : pmmshuffle); virtual;
procedure a_opmm_loc_reg_reg(list: TAsmList;Op : TOpCG;size : tcgsize;const loc : tlocation;src,dst : tregister;shuffle : pmmshuffle); virtual;
procedure a_opmm_reg_reg_reg(list: TAsmList; Op: TOpCG; size : tcgsize;src1,src2,dst: tregister;shuffle : pmmshuffle); virtual;
procedure a_opmm_ref_reg_reg(list: TAsmList; Op: TOpCG; size : tcgsize;const ref: treference; src,dst: tregister;shuffle : pmmshuffle); virtual;
procedure a_loadmm_intreg_reg(list: TAsmList; fromsize, tosize : tcgsize; intreg, mmreg: tregister; shuffle: pmmshuffle); virtual;
procedure a_loadmm_reg_intreg(list: TAsmList; fromsize, tosize : tcgsize; mmreg, intreg: tregister; shuffle : pmmshuffle); virtual;
@ -2061,6 +2066,33 @@ implementation
end;
procedure tcg.a_opmm_loc_reg_reg(list: TAsmList; Op: TOpCG; size : tcgsize;const loc: tlocation; src,dst: tregister;shuffle : pmmshuffle);
{ Three-operand multimedia operation whose first source operand is a
  generic location. The call is forwarded to the register or to the
  reference variant depending on loc.loc; every other location kind
  raises an internal error. }
begin
  if loc.loc in [LOC_CMMREGISTER,LOC_MMREGISTER] then
    { first operand already lives in a multimedia register }
    a_opmm_reg_reg_reg(list,op,size,loc.register,src,dst,shuffle)
  else if loc.loc in [LOC_CREFERENCE,LOC_REFERENCE] then
    { first operand is taken from memory }
    a_opmm_ref_reg_reg(list,op,size,loc.reference,src,dst,shuffle)
  else
    internalerror(200312232);
end;
{ Base class implementation of the three-register multimedia operation
  (src1 and src2 are the sources, dst the destination). The generic code
  generator provides no implementation and unconditionally raises an
  internal error; the method is declared virtual so that a target code
  generator can override it. }
procedure tcg.a_opmm_reg_reg_reg(list : TAsmList;Op : TOpCG;size : tcgsize;
src1,src2,dst : tregister;shuffle : pmmshuffle);
begin
{ not supported by the generic code generator }
internalerror(2013061102);
end;
{ Base class implementation of the three-operand multimedia operation
  whose first source is a memory reference (ref and src are the sources,
  dst the destination). The generic code generator provides no
  implementation and unconditionally raises an internal error; the method
  is declared virtual so that a target code generator can override it. }
procedure tcg.a_opmm_ref_reg_reg(list : TAsmList;Op : TOpCG;size : tcgsize;
const ref : treference;src,dst : tregister;shuffle : pmmshuffle);
begin
{ not supported by the generic code generator }
internalerror(2013061101);
end;
procedure tcg.g_concatcopy_unaligned(list : TAsmList;const source,dest : treference;len : tcgint);
begin
g_concatcopy(list,source,dest,len);

View File

@ -59,7 +59,8 @@ Type
fpu_ssse3,
fpu_sse41,
fpu_sse42,
fpu_avx
fpu_avx,
fpu_avx2
);
@ -96,11 +97,14 @@ Const
'SSSE3',
'SSE41',
'SSE42',
'AVX'
'AVX',
'AVX2'
);
sse_singlescalar : set of tfputype = [fpu_sse,fpu_sse2,fpu_sse3];
sse_doublescalar : set of tfputype = [fpu_sse2,fpu_sse3];
sse_singlescalar = [fpu_sse..fpu_avx2];
sse_doublescalar = [fpu_sse2..fpu_avx2];
fpu_avx_instructionsets = [fpu_avx,fpu_avx2];
{ Supported optimizations, only used for information }
supported_optimizerswitches = genericlevel1optimizerswitches+

View File

@ -685,6 +685,10 @@
(Ch: (Ch_RRAX, Ch_WMemEDI, Ch_RWRDI)),
(Ch: (Ch_WRAX, Ch_RWRSI, Ch_None)),
(Ch: (Ch_All, Ch_None, Ch_None)),
(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
(Ch: (Ch_All, Ch_None, Ch_None)),
(Ch: (Ch_All, Ch_None, Ch_None)),
(Ch: (Ch_All, Ch_None, Ch_None)),
@ -772,21 +776,17 @@
(Ch: (Ch_All, Ch_None, Ch_None)),
(Ch: (Ch_All, Ch_None, Ch_None)),
(Ch: (Ch_All, Ch_None, Ch_None)),
(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
(Ch: (Ch_All, Ch_None, Ch_None)),
(Ch: (Ch_All, Ch_None, Ch_None)),
(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
(Ch: (Ch_All, Ch_None, Ch_None)),
(Ch: (Ch_All, Ch_None, Ch_None)),
(Ch: (Ch_All, Ch_None, Ch_None)),
(Ch: (Ch_All, Ch_None, Ch_None)),
(Ch: (Ch_All, Ch_None, Ch_None)),
(Ch: (Ch_All, Ch_None, Ch_None)),
(Ch: (Ch_All, Ch_None, Ch_None)),
(Ch: (Ch_All, Ch_None, Ch_None)),
(Ch: (Ch_All, Ch_None, Ch_None)),
(Ch: (Ch_All, Ch_None, Ch_None)),
(Ch: (Ch_All, Ch_None, Ch_None)),
(Ch: (Ch_All, Ch_None, Ch_None)),
(Ch: (Ch_All, Ch_None, Ch_None)),
(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
(Ch: (Ch_All, Ch_None, Ch_None)),
(Ch: (Ch_All, Ch_None, Ch_None)),
(Ch: (Ch_All, Ch_None, Ch_None)),

View File

@ -296,7 +296,7 @@ interface
constructor op_reg_reg_reg(op : tasmop;_size : topsize;_op1,_op2,_op3 : tregister);
constructor op_const_reg_reg(op : tasmop;_size : topsize;_op1 : aint;_op2 : tregister;_op3 : tregister);
constructor op_const_ref_reg(op : tasmop;_size : topsize;_op1 : aint;const _op2 : treference;_op3 : tregister);
constructor op_reg_reg_ref(op : tasmop;_size : topsize;_op1,_op2 : tregister; const _op3 : treference);
constructor op_ref_reg_reg(op : tasmop;_size : topsize;const _op1 : treference;_op2,_op3 : tregister);
constructor op_const_reg_ref(op : tasmop;_size : topsize;_op1 : aint;_op2 : tregister;const _op3 : treference);
{ this is for Jmp instructions }
@ -375,7 +375,8 @@ implementation
systems,
procinfo,
itcpugas,
symsym;
symsym,
cpuinfo;
{*****************************************************************************
Instruction table
@ -813,14 +814,14 @@ implementation
end;
constructor taicpu.op_reg_reg_ref(op : tasmop;_size : topsize;_op1,_op2 : tregister;const _op3 : treference);
constructor taicpu.op_ref_reg_reg(op : tasmop;_size : topsize;const _op1 : treference;_op2,_op3 : tregister);
begin
inherited create(op);
init(_size);
ops:=3;
loadreg(0,_op1);
loadref(0,_op1);
loadreg(1,_op2);
loadref(2,_op3);
loadreg(2,_op3);
end;
@ -2874,7 +2875,9 @@ implementation
(oper[0]^.reg=oper[1]^.reg)
) or
(((opcode=A_MOVSS) or (opcode=A_MOVSD) or (opcode=A_MOVQ) or
(opcode=A_MOVAPS) or (OPCODE=A_MOVAPD)) and
(opcode=A_MOVAPS) or (OPCODE=A_MOVAPD) or
(opcode=A_VMOVSS) or (opcode=A_VMOVSD) or (opcode=A_VMOVQ) or
(opcode=A_VMOVAPS) or (OPCODE=A_VMOVAPD)) and
(regtype = R_MMREGISTER) and
(ops=2) and
(oper[0]^.typ=top_reg) and
@ -2929,8 +2932,11 @@ implementation
begin
{ the information in the instruction table is made for the string copy
operation MOVSD so hack here (FK)
VMOVSS and VMOVSD have two- and three-operand flavours; this cannot be modelled by x86ins.dat,
so fix it here (FK)
}
if (opcode=A_MOVSD) and (ops=2) then
if ((opcode=A_MOVSD) or (opcode=A_VMOVSS) or (opcode=A_VMOVSD)) and (ops=2) then
begin
case opnr of
0:
@ -2961,17 +2967,30 @@ implementation
result:=taicpu.op_ref_reg(A_MOV,reg2opsize(r),tmpref,r);
end;
R_MMREGISTER :
case getsubreg(r) of
R_SUBMMD:
result:=taicpu.op_ref_reg(A_MOVSD,reg2opsize(r),ref,r);
R_SUBMMS:
result:=taicpu.op_ref_reg(A_MOVSS,reg2opsize(r),ref,r);
R_SUBQ,
R_SUBMMWHOLE:
result:=taicpu.op_ref_reg(A_MOVQ,S_NO,ref,r);
else
internalerror(200506043);
end;
if current_settings.fputype in fpu_avx_instructionsets then
case getsubreg(r) of
R_SUBMMD:
result:=taicpu.op_ref_reg(A_VMOVSD,reg2opsize(r),ref,r);
R_SUBMMS:
result:=taicpu.op_ref_reg(A_VMOVSS,reg2opsize(r),ref,r);
R_SUBQ,
R_SUBMMWHOLE:
result:=taicpu.op_ref_reg(A_VMOVQ,S_NO,ref,r);
else
internalerror(200506043);
end
else
case getsubreg(r) of
R_SUBMMD:
result:=taicpu.op_ref_reg(A_MOVSD,reg2opsize(r),ref,r);
R_SUBMMS:
result:=taicpu.op_ref_reg(A_MOVSS,reg2opsize(r),ref,r);
R_SUBQ,
R_SUBMMWHOLE:
result:=taicpu.op_ref_reg(A_MOVQ,S_NO,ref,r);
else
internalerror(200506043);
end;
else
internalerror(200401041);
end;
@ -3002,17 +3021,30 @@ implementation
result:=taicpu.op_reg_ref(A_MOV,size,r,tmpref);
end;
R_MMREGISTER :
case getsubreg(r) of
R_SUBMMD:
result:=taicpu.op_reg_ref(A_MOVSD,reg2opsize(r),r,ref);
R_SUBMMS:
result:=taicpu.op_reg_ref(A_MOVSS,reg2opsize(r),r,ref);
R_SUBQ,
R_SUBMMWHOLE:
result:=taicpu.op_reg_ref(A_MOVQ,S_NO,r,ref);
else
internalerror(200506042);
end;
if current_settings.fputype in fpu_avx_instructionsets then
case getsubreg(r) of
R_SUBMMD:
result:=taicpu.op_reg_ref(A_VMOVSD,reg2opsize(r),r,ref);
R_SUBMMS:
result:=taicpu.op_reg_ref(A_VMOVSS,reg2opsize(r),r,ref);
R_SUBQ,
R_SUBMMWHOLE:
result:=taicpu.op_reg_ref(A_VMOVQ,S_NO,r,ref);
else
internalerror(200506042);
end
else
case getsubreg(r) of
R_SUBMMD:
result:=taicpu.op_reg_ref(A_MOVSD,reg2opsize(r),r,ref);
R_SUBMMS:
result:=taicpu.op_reg_ref(A_MOVSS,reg2opsize(r),r,ref);
R_SUBQ,
R_SUBMMWHOLE:
result:=taicpu.op_reg_ref(A_MOVQ,S_NO,r,ref);
else
internalerror(200506042);
end;
else
internalerror(200401041);
end;

View File

@ -92,6 +92,8 @@ unit cgx86;
procedure a_loadmm_reg_ref(list: TAsmList; fromsize, tosize : tcgsize;reg: tregister; const ref: treference;shuffle : pmmshuffle); override;
procedure a_opmm_ref_reg(list: TAsmList; Op: TOpCG; size : tcgsize;const ref: treference; reg: tregister;shuffle : pmmshuffle); override;
procedure a_opmm_reg_reg(list: TAsmList; Op: TOpCG; size : tcgsize;src,dst: tregister;shuffle : pmmshuffle);override;
procedure a_opmm_ref_reg_reg(list : TAsmList;Op : TOpCG;size : tcgsize;const ref : treference;src,dst : tregister;shuffle : pmmshuffle);override;
procedure a_opmm_reg_reg_reg(list : TAsmList;Op : TOpCG;size : tcgsize;src1,src2,dst : tregister;shuffle : pmmshuffle);override;
{ comparison operations }
procedure a_cmp_const_reg_label(list : TAsmList;size : tcgsize;cmp_op : topcmp;a : tcgint;reg : tregister;
@ -126,9 +128,9 @@ unit cgx86;
procedure check_register_size(size:tcgsize;reg:tregister);
procedure opmm_loc_reg(list: TAsmList; Op: TOpCG; size : tcgsize;loc : tlocation;dst: tregister; shuffle : pmmshuffle);
procedure opmm_loc_reg_reg(list : TAsmList;Op : TOpCG;size : tcgsize;loc : tlocation;src,dst : tregister;shuffle : pmmshuffle);
function get_darwin_call_stub(const s: string; weak: boolean): tasmsymbol;
private
procedure sizes2load(s1,s2 : tcgsize;var op: tasmop; var s3: topsize);
procedure floatload(list: TAsmList; t : tcgsize;const ref : treference);
@ -175,7 +177,7 @@ unit cgx86;
function UseAVX: boolean;
begin
Result:=current_settings.fputype in [fpu_avx];
Result:=current_settings.fputype in fpu_avx_instructionsets;
end;
const
@ -1144,12 +1146,18 @@ unit cgx86;
function get_scalar_mm_op(fromsize,tosize : tcgsize) : tasmop;
const
convertop : array[OS_F32..OS_F128,OS_F32..OS_F128] of tasmop = (
convertopsse : array[OS_F32..OS_F128,OS_F32..OS_F128] of tasmop = (
(A_MOVSS,A_CVTSS2SD,A_NONE,A_NONE,A_NONE),
(A_CVTSD2SS,A_MOVSD,A_NONE,A_NONE,A_NONE),
(A_NONE,A_NONE,A_NONE,A_NONE,A_NONE),
(A_NONE,A_NONE,A_NONE,A_MOVQ,A_NONE),
(A_NONE,A_NONE,A_NONE,A_NONE,A_NONE));
{ AVX counterpart of convertopsse, indexed as [fromsize,tosize].
  Every entry uses the V-prefixed opcode, including the 64 bit move, so
  that no legacy SSE instruction is selected when AVX code generation is
  active (the OS_M64 path below likewise selects A_VMOVQ under AVX). }
convertopavx : array[OS_F32..OS_F128,OS_F32..OS_F128] of tasmop = (
(A_VMOVSS,A_VCVTSS2SD,A_NONE,A_NONE,A_NONE),
(A_VCVTSD2SS,A_VMOVSD,A_NONE,A_NONE,A_NONE),
(A_NONE,A_NONE,A_NONE,A_NONE,A_NONE),
(A_NONE,A_NONE,A_NONE,A_VMOVQ,A_NONE),
(A_NONE,A_NONE,A_NONE,A_NONE,A_NONE));
begin
{ we can have OS_F32/OS_F64 (record in function result/LOC_MMREGISTER) to
OS_32/OS_64 (record in memory/LOC_REFERENCE) }
@ -1161,14 +1169,24 @@ unit cgx86;
OS_64:
tosize:=OS_F64;
end;
if (fromsize in [low(convertop)..high(convertop)]) and
(tosize in [low(convertop)..high(convertop)]) then
result:=convertop[fromsize,tosize]
if (fromsize in [low(convertopsse)..high(convertopsse)]) and
(tosize in [low(convertopsse)..high(convertopsse)]) then
begin
if UseAVX then
result:=convertopavx[fromsize,tosize]
else
result:=convertopsse[fromsize,tosize];
end
{ we can have OS_M64 (record in function result/LOC_MMREGISTER) to
OS_64 (record in memory/LOC_REFERENCE) }
else if (tcgsize2size[fromsize]=tcgsize2size[tosize]) and
(fromsize=OS_M64) then
result:=A_MOVQ
begin
if UseAVX then
result:=A_VMOVQ
else
result:=A_MOVQ;
end
else
internalerror(2010060104);
if result=A_NONE then
@ -1179,6 +1197,7 @@ unit cgx86;
procedure tcgx86.a_loadmm_reg_reg(list: TAsmList; fromsize, tosize : tcgsize;reg1, reg2: tregister;shuffle : pmmshuffle);
var
instr : taicpu;
op : TAsmOp;
begin
if shuffle=nil then
begin
@ -1200,8 +1219,26 @@ unit cgx86;
end
else if shufflescalar(shuffle) then
begin
instr:=taicpu.op_reg_reg(get_scalar_mm_op(fromsize,tosize),S_NO,reg1,reg2);
op:=get_scalar_mm_op(fromsize,tosize);
{ VMOVSD/SS is not available with two register operands }
if op=A_VMOVSD then
op:=A_VMOVAPD
else if op=A_VMOVSS then
op:=A_VMOVAPS;
{ A_VCVTSD2SS and A_VCVTSS2SD require always three operands }
if (op=A_VCVTSD2SS) or (op=A_VCVTSS2SD) then
instr:=taicpu.op_reg_reg_reg(op,S_NO,reg1,reg2,reg2)
else
instr:=taicpu.op_reg_reg(op,S_NO,reg1,reg2);
case get_scalar_mm_op(fromsize,tosize) of
A_VMOVAPD,
A_VMOVAPS,
A_VMOVSS,
A_VMOVSD,
A_VMOVQ,
A_MOVSS,
A_MOVSD,
A_MOVQ:
@ -1217,6 +1254,7 @@ unit cgx86;
procedure tcgx86.a_loadmm_ref_reg(list: TAsmList; fromsize, tosize : tcgsize;const ref: treference; reg: tregister;shuffle : pmmshuffle);
var
tmpref : treference;
op : tasmop;
begin
tmpref:=ref;
make_simple_ref(list,tmpref);
@ -1233,7 +1271,15 @@ unit cgx86;
{$endif x86_64}
end
else if shufflescalar(shuffle) then
list.concat(taicpu.op_ref_reg(get_scalar_mm_op(fromsize,tosize),S_NO,tmpref,reg))
begin
op:=get_scalar_mm_op(fromsize,tosize);
{ A_VCVTSD2SS and A_VCVTSS2SD require always three operands }
if (op=A_VCVTSD2SS) or (op=A_VCVTSS2SD) then
list.concat(taicpu.op_ref_reg_reg(op,S_NO,tmpref,reg,reg))
else
list.concat(taicpu.op_ref_reg(op,S_NO,tmpref,reg))
end
else
internalerror(200312252);
end;
@ -1243,6 +1289,7 @@ unit cgx86;
var
hreg : tregister;
tmpref : treference;
op : tasmop;
begin
tmpref:=ref;
make_simple_ref(list,tmpref);
@ -1263,8 +1310,15 @@ unit cgx86;
if tcgsize2size[tosize]<>tcgsize2size[fromsize] then
begin
hreg:=getmmregister(list,tosize);
list.concat(taicpu.op_reg_reg(get_scalar_mm_op(fromsize,tosize),S_NO,reg,hreg));
list.concat(taicpu.op_reg_ref(get_scalar_mm_op(tosize,tosize),S_NO,hreg,tmpref));
op:=get_scalar_mm_op(fromsize,tosize);
{ A_VCVTSD2SS and A_VCVTSS2SD require always three operands }
if (op=A_VCVTSD2SS) or (op=A_VCVTSS2SD) then
list.concat(taicpu.op_reg_reg_reg(op,S_NO,reg,hreg,hreg))
else
list.concat(taicpu.op_reg_reg(op,S_NO,reg,hreg));
list.concat(taicpu.op_reg_ref(get_scalar_mm_op(tosize,tosize),S_NO,hreg,tmpref))
end
else
list.concat(taicpu.op_reg_ref(get_scalar_mm_op(fromsize,tosize),S_NO,reg,tmpref));
@ -1296,6 +1350,103 @@ unit cgx86;
end;
{ Emits one three-operand AVX floating point instruction of the form
  "asmop loc,src,dst" (operand order as passed to the taicpu constructors).

  list    : assembler list that receives all generated instructions
  Op      : generic operation, translated through opmm2asmop; operations
            that map to A_NOP raise an internal error
  size    : index into opmm2asmop (OS_F32 or OS_F64); must equal loc.size
  loc     : first source operand, either a memory reference or an mm register
  src     : second source register
  dst     : destination register
  shuffle : nil selects the packed opcode, a scalar shuffle selects the
            scalar opcode; any other shuffle is not implemented and raises
            an internal error }
procedure tcgx86.opmm_loc_reg_reg(list: TAsmList; Op: TOpCG; size : tcgsize;loc : tlocation;src,dst: tregister; shuffle : pmmshuffle);
  const
    opmm2asmop : array[0..1,OS_F32..OS_F64,topcg] of tasmop = (
      ( { scalar }
        ( { OS_F32 }
          A_NOP,A_NOP,A_VADDSS,A_NOP,A_VDIVSS,A_NOP,A_NOP,A_VMULSS,A_NOP,A_NOP,A_NOP,A_NOP,A_NOP,A_NOP,A_VSUBSS,A_NOP,A_NOP,A_NOP
        ),
        ( { OS_F64 }
          A_NOP,A_NOP,A_VADDSD,A_NOP,A_VDIVSD,A_NOP,A_NOP,A_VMULSD,A_NOP,A_NOP,A_NOP,A_NOP,A_NOP,A_NOP,A_VSUBSD,A_NOP,A_NOP,A_NOP
        )
      ),
      ( { vectorized/packed }
        { because the logical packed single instructions have shorter op codes, we use always
          these
        }
        ( { OS_F32 }
          A_NOP,A_NOP,A_VADDPS,A_NOP,A_VDIVPS,A_NOP,A_NOP,A_VMULPS,A_NOP,A_NOP,A_NOP,A_NOP,A_NOP,A_NOP,A_VSUBPS,A_VXORPS,A_NOP,A_NOP
        ),
        ( { OS_F64 }
          A_NOP,A_NOP,A_VADDPD,A_NOP,A_VDIVPD,A_NOP,A_NOP,A_VMULPD,A_NOP,A_NOP,A_NOP,A_NOP,A_NOP,A_NOP,A_VSUBPD,A_VXORPD,A_NOP,A_NOP
        )
      )
    );
  var
    resultreg : tregister;
    asmop : tasmop;
  begin
    { this is an internally used procedure so the parameters have
      some constraints
    }
    if loc.size<>size then
      internalerror(2013061108);
    { placeholder for future shuffle support: currently the result is
      always written directly to dst }
    resultreg:=dst;
    { deshuffle }
    //!!!
    if (shuffle<>nil) and not(shufflescalar(shuffle)) then
      begin
        { arbitrary shuffles are not implemented yet }
        internalerror(2013061107);
      end
    else if (shuffle=nil) then
      asmop:=opmm2asmop[1,size,op]
    else if shufflescalar(shuffle) then
      begin
        asmop:=opmm2asmop[0,size,op];
        { no scalar operation available? }
        if asmop=A_NOP then
          begin
            { do vectorized and shuffle finally }
            internalerror(2010060102);
          end;
      end
    else
      internalerror(2013061106);
    if asmop=A_NOP then
      internalerror(2013061105);
    case loc.loc of
      LOC_CREFERENCE,LOC_REFERENCE:
        begin
          { any instructions needed to simplify the reference must go into
            the same list as the operation itself, so use the list
            parameter (as a_loadmm_ref_reg does), not the global current list }
          make_simple_ref(list,loc.reference);
          list.concat(taicpu.op_ref_reg_reg(asmop,S_NO,loc.reference,src,resultreg));
        end;
      LOC_CMMREGISTER,LOC_MMREGISTER:
        list.concat(taicpu.op_reg_reg_reg(asmop,S_NO,loc.register,src,resultreg));
      else
        internalerror(2013061104);
    end;
    { shuffle }
    if resultreg<>dst then
      begin
        internalerror(2013061103);
      end;
  end;
{ Three-register multimedia operation: wraps src1 in a temporary register
  location and hands the work to opmm_loc_reg_reg. }
procedure tcgx86.a_opmm_reg_reg_reg(list: TAsmList; Op: TOpCG; size : tcgsize;src1,src2,dst: tregister;shuffle : pmmshuffle);
  var
    srcloc : tlocation;
  begin
    { only the fields read by opmm_loc_reg_reg are filled in }
    srcloc.size:=size;
    srcloc.loc:=LOC_MMREGISTER;
    srcloc.register:=src1;
    opmm_loc_reg_reg(list,op,size,srcloc,src2,dst,shuffle);
  end;
{ Multimedia operation with a memory first operand: wraps ref in a
  temporary reference location and hands the work to opmm_loc_reg_reg. }
procedure tcgx86.a_opmm_ref_reg_reg(list: TAsmList; Op: TOpCG; size : tcgsize;const ref: treference; src,dst: tregister;shuffle : pmmshuffle);
  var
    srcloc : tlocation;
  begin
    { only the fields read by opmm_loc_reg_reg are filled in }
    srcloc.size:=size;
    srcloc.loc:=LOC_REFERENCE;
    srcloc.reference:=ref;
    opmm_loc_reg_reg(list,op,size,srcloc,src,dst,shuffle);
  end;
procedure tcgx86.opmm_loc_reg(list: TAsmList; Op: TOpCG; size : tcgsize;loc : tlocation;dst: tregister; shuffle : pmmshuffle);
const
opmm2asmop : array[0..1,OS_F32..OS_F64,topcg] of tasmop = (
@ -1319,7 +1470,6 @@ unit cgx86;
)
)
);
var
resultreg : tregister;
asmop : tasmop;

View File

@ -41,7 +41,10 @@ unit nx86add;
procedure emit_generic_code(op:TAsmOp;opsize:TCgSize;unsigned,extra_not,mboverflow:boolean);
procedure second_cmpfloatsse;
procedure second_cmpfloatavx;
procedure second_addfloatsse;
procedure second_addfloatavx;
public
procedure second_addfloat;override;
{$ifndef i8086}
@ -794,6 +797,141 @@ unit nx86add;
end;
end;
{ Generates code for a floating point add, subtract, multiply or divide
  node through the three-operand interface cg.a_opmm_loc_reg_reg.
  In the two regular paths the result is placed in a freshly allocated
  mm register and neither operand register is used as destination. }
procedure tx86addnode.second_addfloatavx;
var
op : topcg;
sqr_sum : boolean;
tmp : tnode;
begin
{ sqr_sum stays false unless the conditional section below is compiled in }
sqr_sum:=false;
{ the following section is only compiled when the symbol "dummy" is defined;
  it recognises sqr(a)+sqr(b) and sqr(a)-sqr(b) and strips the sqr nodes }
{$ifdef dummy}
if (current_settings.fputype>=fpu_sse3) and
use_vectorfpu(resultdef) and
(nodetype in [addn,subn]) and
(left.nodetype=inlinen) and (tinlinenode(left).inlinenumber=in_sqr_real) and
(right.nodetype=inlinen) and (tinlinenode(right).inlinenumber=in_sqr_real) then
begin
sqr_sum:=true;
tmp:=tinlinenode(left).left;
tinlinenode(left).left:=nil;
left.free;
left:=tmp;
tmp:=tinlinenode(right).left;
tinlinenode(right).left:=nil;
right.free;
right:=tmp;
end;
{$endif dummy}
pass_left_right;
check_left_and_right_fpureg(false);
if (nf_swapped in flags) then
{ can't use swapleftright if both are on the fpu stack, since then }
{ both are "R_ST" -> nothing would change -> manually switch }
if (left.location.loc = LOC_FPUREGISTER) and
(right.location.loc = LOC_FPUREGISTER) then
emit_none(A_FXCH,S_NO)
else
swapleftright;
{ translate the node type into the generic code generator operation }
case nodetype of
addn :
op:=OP_ADD;
muln :
op:=OP_MUL;
subn :
op:=OP_SUB;
slashn :
op:=OP_DIV;
else
internalerror(200312231);
end;
location_reset(location,LOC_MMREGISTER,def_cgsize(resultdef));
if sqr_sum then
begin
{ NOTE(review): this path can only be reached when the conditional section
  above is compiled in. It emits opcodes without the V prefix (A_SHUFPD,
  A_MULPD, A_HADDPD, ...) and looks like it was taken over from the SSE
  variant - confirm before enabling it for AVX }
if nf_swapped in flags then
swapleftright;
hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,false);
hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,right.location,right.resultdef,true);
location:=left.location;
if is_double(resultdef) then
begin
current_asmdata.CurrAsmList.concat(taicpu.op_const_reg_reg(A_SHUFPD,S_NO,%00,right.location.register,location.register));
current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_MULPD,S_NO,location.register,location.register));
case nodetype of
addn:
current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_HADDPD,S_NO,location.register,location.register));
subn:
current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_HSUBPD,S_NO,location.register,location.register));
else
internalerror(201108162);
end;
end
else
begin
current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_UNPCKLPS,S_NO,right.location.register,location.register));
{ ensure that bits 64..127 contain valid values }
current_asmdata.CurrAsmList.concat(taicpu.op_const_reg_reg(A_SHUFPD,S_NO,%00,location.register,location.register));
{ the data is now in bits 0..32 and 64..95 }
current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_MULPS,S_NO,location.register,location.register));
case nodetype of
addn:
begin
current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_HADDPS,S_NO,location.register,location.register));
end;
subn:
begin
current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_HSUBPS,S_NO,location.register,location.register));
end;
else
internalerror(201108163);
end;
end
end
{ we can use only right as left operand if the operation is commutative }
else if (right.location.loc=LOC_MMREGISTER) and (op in [OP_ADD,OP_MUL]) then
begin
{ the result gets its own register, right.location.register is only read }
location.register:=cg.getmmregister(current_asmdata.CurrAsmList,left.location.size);
{ force floating point reg. location to be written to memory,
we don't force it to mm register because writing to memory
allows probably shorter code because there is no direct fpu->mm register
copy instruction
}
if left.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER] then
hlcg.location_force_mem(current_asmdata.CurrAsmList,left.location,left.resultdef);
{ left is passed as the generic location operand, right as the register source }
cg.a_opmm_loc_reg_reg(current_asmdata.CurrAsmList,op,location.size,
left.location,
right.location.register,
location.register,
mms_movescalar);
end
else
begin
{ general case: left is forced into an mm register, right is passed as
  the generic location operand }
if (nf_swapped in flags) then
swapleftright;
hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,true);
location.register:=cg.getmmregister(current_asmdata.CurrAsmList,left.location.size);
{ force floating point reg. location to be written to memory,
we don't force it to mm register because writing to memory
allows probably shorter code because there is no direct fpu->mm register
copy instruction
}
if right.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER] then
hlcg.location_force_mem(current_asmdata.CurrAsmList,right.location,right.resultdef);
cg.a_opmm_loc_reg_reg(current_asmdata.CurrAsmList,op,location.size,
right.location,
left.location.register,
location.register,
mms_movescalar);
end;
end;
procedure tx86addnode.second_cmpfloatsse;
var
@ -860,6 +998,72 @@ unit nx86add;
end;
{ Generates a floating point comparison with the AVX compare opcodes
  (A_VCOMISS for single, A_VCOMISD for double operands). The result is
  returned as a LOC_FLAGS location. }
procedure tx86addnode.second_cmpfloatavx;
  var
    cmpop : tasmop;
  begin
    { select the opcode from the type of the left operand }
    if is_single(left.resultdef) then
      cmpop:=A_VCOMISS
    else if is_double(left.resultdef) then
      cmpop:=A_VCOMISD
    else
      internalerror(200402222);

    pass_left_right;
    location_reset(location,LOC_FLAGS,def_cgsize(resultdef));

    if (right.location.loc=LOC_MMREGISTER) then
      begin
        { right is already in an mm register: compare left (memory or
          register) against it and record the exchanged operand order
          by toggling nf_swapped afterwards }

        { an operand on the fpu stack is written to memory instead of being
          moved to an mm register, because there is no direct fpu->mm
          register copy instruction }
        if left.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER] then
          hlcg.location_force_mem(current_asmdata.CurrAsmList,left.location,left.resultdef);

        if left.location.loc in [LOC_REFERENCE,LOC_CREFERENCE] then
          begin
            tcgx86(cg).make_simple_ref(current_asmdata.CurrAsmList,left.location.reference);
            current_asmdata.CurrAsmList.concat(taicpu.op_ref_reg(cmpop,S_NO,left.location.reference,right.location.register));
          end
        else if left.location.loc in [LOC_MMREGISTER,LOC_CMMREGISTER] then
          current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(cmpop,S_NO,left.location.register,right.location.register))
        else
          internalerror(200402221);

        { flip the swapped state of the node }
        if not(nf_swapped in flags) then
          include(flags,nf_swapped)
        else
          exclude(flags,nf_swapped)
      end
    else
      begin
        { general case: left is forced into an mm register and right is
          used as memory or register source operand }
        hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,true);

        { an operand on the fpu stack is written to memory instead of being
          moved to an mm register, because there is no direct fpu->mm
          register copy instruction }
        if right.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER] then
          hlcg.location_force_mem(current_asmdata.CurrAsmList,right.location,right.resultdef);

        if right.location.loc in [LOC_REFERENCE,LOC_CREFERENCE] then
          begin
            tcgx86(cg).make_simple_ref(current_asmdata.CurrAsmList,right.location.reference);
            current_asmdata.CurrAsmList.concat(taicpu.op_ref_reg(cmpop,S_NO,right.location.reference,left.location.register));
          end
        else if right.location.loc in [LOC_MMREGISTER,LOC_CMMREGISTER] then
          current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(cmpop,S_NO,right.location.register,left.location.register))
        else
          internalerror(200402223);
      end;

    { result flags are obtained from getresflags(true) }
    location.resflags:=getresflags(true);
  end;
procedure tx86addnode.second_opvector;
var
op : topcg;
@ -912,7 +1116,10 @@ unit nx86add;
begin
if use_vectorfpu(resultdef) then
begin
second_addfloatsse;
if UseAVX then
second_addfloatavx
else
second_addfloatsse;
exit;
end;
@ -959,7 +1166,10 @@ unit nx86add;
begin
if use_vectorfpu(left.resultdef) or use_vectorfpu(right.resultdef) then
begin
second_cmpfloatsse;
if UseAVX then
second_cmpfloatavx
else
second_cmpfloatsse;
exit;
end;

View File

@ -276,14 +276,25 @@ implementation
begin
location_reset(location,LOC_MMREGISTER,def_cgsize(resultdef));
location.register:=cg.getmmregister(current_asmdata.CurrAsmList,location.size);
case location.size of
OS_F32:
op:=A_CVTSI2SS;
OS_F64:
op:=A_CVTSI2SD;
else
internalerror(2007120902);
end;
if UseAVX then
case location.size of
OS_F32:
op:=A_VCVTSI2SS;
OS_F64:
op:=A_VCVTSI2SD;
else
internalerror(2007120902);
end
else
case location.size of
OS_F32:
op:=A_CVTSI2SS;
OS_F64:
op:=A_CVTSI2SD;
else
internalerror(2007120902);
end;
{ don't use left.location.size, because that one may be OS_32/OS_64
if the lower bound of the orddef >= 0
}
@ -301,11 +312,19 @@ implementation
begin
href:=left.location.reference;
tcgx86(cg).make_simple_ref(current_asmdata.CurrAsmList,href);
current_asmdata.CurrAsmList.concat(taicpu.op_ref_reg(op,opsize,href,location.register));
if UseAVX then
{ VCVTSI2.. requires a second source operand to copy bits 64..127 }
current_asmdata.CurrAsmList.concat(taicpu.op_ref_reg_reg(op,opsize,href,location.register,location.register))
else
current_asmdata.CurrAsmList.concat(taicpu.op_ref_reg(op,opsize,href,location.register));
end;
LOC_REGISTER,
LOC_CREGISTER:
current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(op,opsize,left.location.register,location.register));
if UseAVX then
{ VCVTSI2.. requires a second source operand to copy bits 64..127 }
current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(op,opsize,left.location.register,location.register,location.register))
else
current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(op,opsize,left.location.register,location.register));
end;
end
else

View File

@ -289,14 +289,24 @@ implementation
hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,false);
location_reset(location,LOC_REGISTER,OS_S64);
location.register:=cg.getintregister(current_asmdata.CurrAsmList,OS_S64);
case left.location.size of
OS_F32:
current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_CVTSS2SI,S_Q,left.location.register,location.register));
OS_F64:
current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_CVTSD2SI,S_Q,left.location.register,location.register));
else
internalerror(2007031402);
end;
if UseAVX then
case left.location.size of
OS_F32:
current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_VCVTSS2SI,S_Q,left.location.register,location.register));
OS_F64:
current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_VCVTSD2SI,S_Q,left.location.register,location.register));
else
internalerror(2007031402);
end
else
case left.location.size of
OS_F32:
current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_CVTSS2SI,S_Q,left.location.register,location.register));
OS_F64:
current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_CVTSD2SI,S_Q,left.location.register,location.register));
else
internalerror(2007031402);
end;
end
else
{$endif x86_64}
@ -323,14 +333,24 @@ implementation
hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,false);
location_reset(location,LOC_REGISTER,OS_S64);
location.register:=cg.getintregister(current_asmdata.CurrAsmList,OS_S64);
case left.location.size of
OS_F32:
current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_CVTTSS2SI,S_Q,left.location.register,location.register));
OS_F64:
current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_CVTTSD2SI,S_Q,left.location.register,location.register));
else
internalerror(2007031401);
end;
if UseAVX then
case left.location.size of
OS_F32:
current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_VCVTTSS2SI,S_Q,left.location.register,location.register));
OS_F64:
current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_VCVTTSD2SI,S_Q,left.location.register,location.register));
else
internalerror(2007031401);
end
else
case left.location.size of
OS_F32:
current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_CVTTSS2SI,S_Q,left.location.register,location.register));
OS_F64:
current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_CVTTSD2SI,S_Q,left.location.register,location.register));
else
internalerror(2007031401);
end;
end
else
{$endif x86_64}
@ -371,9 +391,18 @@ implementation
if use_vectorfpu(resultdef) then
begin
secondpass(left);
hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,false);
location:=left.location;
cg.a_opmm_loc_reg(current_asmdata.CurrAsmList,OP_MUL,left.location.size,left.location,left.location.register,mms_movescalar);
location_reset(location,LOC_MMREGISTER,left.location.size);
location.register:=cg.getmmregister(current_asmdata.CurrAsmList,location.size);
if UseAVX then
begin
hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,true);
cg.a_opmm_reg_reg_reg(current_asmdata.CurrAsmList,OP_MUL,left.location.size,left.location.register,left.location.register,location.register,mms_movescalar);
end
else
begin
cg.a_loadmm_loc_reg(current_asmdata.CurrAsmList,location.size,left.location,location.register,mms_movescalar);
cg.a_opmm_reg_reg(current_asmdata.CurrAsmList,OP_MUL,left.location.size,location.register,location.register,mms_movescalar);
end;
end
else
begin
@ -389,15 +418,26 @@ implementation
begin
secondpass(left);
hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,false);
location:=left.location;
case tfloatdef(resultdef).floattype of
s32real:
current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_SQRTSS,S_XMM,location.register,location.register));
s64real:
current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_SQRTSD,S_XMM,location.register,location.register));
else
internalerror(200510031);
end;
location_reset(location,LOC_MMREGISTER,left.location.size);
location.register:=cg.getmmregister(current_asmdata.CurrAsmList,location.size);
if UseAVX then
case tfloatdef(resultdef).floattype of
s32real:
current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(A_VSQRTSS,S_XMM,left.location.register,location.register,location.register));
s64real:
current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(A_VSQRTSD,S_XMM,left.location.register,location.register,location.register));
else
internalerror(200510031);
end
else
case tfloatdef(resultdef).floattype of
s32real:
current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_SQRTSS,S_XMM,left.location.register,location.register));
s64real:
current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_SQRTSD,S_XMM,left.location.register,location.register));
else
internalerror(200510031);
end;
end
else
begin

View File

@ -154,14 +154,11 @@ interface
if expectloc=LOC_MMREGISTER then
begin
hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,false);
hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,true);
location_reset(location,LOC_MMREGISTER,def_cgsize(resultdef));
{ make life of register allocator easier }
location.register:=cg.getmmregister(current_asmdata.CurrAsmList,def_cgsize(resultdef));
cg.a_loadmm_reg_reg(current_asmdata.CurrAsmList,def_cgsize(resultdef),def_cgsize(resultdef),left.location.register,location.register,mms_movescalar);
reg:=cg.getmmregister(current_asmdata.CurrAsmList,def_cgsize(resultdef));
current_asmdata.getdatalabel(l1);
new_section(current_asmdata.asmlists[al_typedconsts],sec_rodata_norel,l1.name,const_align(sizeof(pint)));
@ -179,9 +176,16 @@ interface
end;
reference_reset_symbol(href,l1,0,resultdef.alignment);
reg:=cg.getmmregister(current_asmdata.CurrAsmList,def_cgsize(resultdef));
cg.a_loadmm_ref_reg(current_asmdata.CurrAsmList,def_cgsize(resultdef),def_cgsize(resultdef),href,reg,mms_movescalar);
cg.a_opmm_reg_reg(current_asmdata.CurrAsmList,OP_XOR,left.location.size,reg,location.register,nil);
if UseAVX then
cg.a_opmm_reg_reg_reg(current_asmdata.CurrAsmList,OP_XOR,left.location.size,reg,left.location.register,location.register,nil)
else
begin
cg.a_loadmm_reg_reg(current_asmdata.CurrAsmList,def_cgsize(resultdef),def_cgsize(resultdef),left.location.register,location.register,mms_movescalar);
cg.a_opmm_reg_reg(current_asmdata.CurrAsmList,OP_XOR,left.location.size,reg,location.register,nil);
end;
end
else
begin

View File

@ -134,143 +134,164 @@ implementation
end;
2,3 :
begin
{ We can handle opcodes with 2 and 3 operands the same way. The opcodes
with 3 registers are shrd/shld, where the 3rd operand is const or CL,
that doesn't need spilling.
However, due to AT&T order inside the compiler, the 3rd operand is
numbered 0, so look at operand no. 1 and 2 if we have 3 operands by
adding a "n". }
n:=0;
if ops=3 then
n:=1;
if (oper[n+0]^.typ=top_reg) and
(oper[n+1]^.typ=top_reg) and
((getregtype(oper[n+0]^.reg)<>regtype) or
(getregtype(oper[n+1]^.reg)<>regtype) or
(get_alias(getsupreg(oper[n+0]^.reg))<>get_alias(getsupreg(oper[n+1]^.reg)))) then
{ avx instruction?
currently this rule is sufficient but it might be extended }
if (ops=3) and (opcode<>A_SHRD) and (opcode<>A_SHLD) then
begin
if (getregtype(oper[n+0]^.reg)=regtype) and
(get_alias(getsupreg(oper[n+0]^.reg))=orgreg) then
replaceoper:=0+n
else if (getregtype(oper[n+1]^.reg)=regtype) and
(get_alias(getsupreg(oper[n+1]^.reg))=orgreg) then
replaceoper:=1+n;
{ avx instructions allow only the first operand (at&t counting) to be a memory operand }
{ all operands must be registers ... }
if (oper[0]^.typ=top_reg) and
(oper[1]^.typ=top_reg) and
(oper[2]^.typ=top_reg) and
{ but they must be different }
((getregtype(oper[1]^.reg)<>regtype) or
(get_alias(getsupreg(oper[0]^.reg))<>get_alias(getsupreg(oper[1]^.reg)))
) and
((getregtype(oper[2]^.reg)<>regtype) or
(get_alias(getsupreg(oper[0]^.reg))<>get_alias(getsupreg(oper[2]^.reg)))
) and
(get_alias(getsupreg(oper[0]^.reg))=orgreg) then
replaceoper:=0;
end
else if (oper[n+0]^.typ=top_reg) and
(oper[n+1]^.typ=top_const) then
else
begin
if (getregtype(oper[0+n]^.reg)=regtype) and
(get_alias(getsupreg(oper[0+n]^.reg))=orgreg) then
replaceoper:=0+n
else
internalerror(200704282);
end
else if (oper[n+0]^.typ=top_const) and
(oper[n+1]^.typ=top_reg) then
begin
if (getregtype(oper[1+n]^.reg)=regtype) and
(get_alias(getsupreg(oper[1+n]^.reg))=orgreg) then
replaceoper:=1+n
else
internalerror(200704283);
end;
case replaceoper of
0 :
begin
{ Some instructions don't allow memory references
for source }
case instr.opcode of
A_BT,
A_BTS,
A_BTC,
A_BTR,
{ shufp* would require 16 byte alignment for memory locations so we force the source
operand into a register }
A_SHUFPD,
A_SHUFPS :
replaceoper:=-1;
{ We can handle opcodes with 2 operands and shrd/shld the same way: for shrd/shld
the 3rd operand is a const or CL, which doesn't need spilling.
However, due to AT&T order inside the compiler, the 3rd operand is
numbered 0, so look at operands no. 1 and 2 if we have 3 operands by
adding "n" to the operand index. }
n:=0;
if ops=3 then
n:=1;
if (oper[n+0]^.typ=top_reg) and
(oper[n+1]^.typ=top_reg) and
((getregtype(oper[n+0]^.reg)<>regtype) or
(getregtype(oper[n+1]^.reg)<>regtype) or
(get_alias(getsupreg(oper[n+0]^.reg))<>get_alias(getsupreg(oper[n+1]^.reg)))) then
begin
if (getregtype(oper[n+0]^.reg)=regtype) and
(get_alias(getsupreg(oper[n+0]^.reg))=orgreg) then
replaceoper:=0+n
else if (getregtype(oper[n+1]^.reg)=regtype) and
(get_alias(getsupreg(oper[n+1]^.reg))=orgreg) then
replaceoper:=1+n;
end
else if (oper[n+0]^.typ=top_reg) and
(oper[n+1]^.typ=top_const) then
begin
if (getregtype(oper[0+n]^.reg)=regtype) and
(get_alias(getsupreg(oper[0+n]^.reg))=orgreg) then
replaceoper:=0+n
else
internalerror(200704282);
end
else if (oper[n+0]^.typ=top_const) and
(oper[n+1]^.typ=top_reg) then
begin
if (getregtype(oper[1+n]^.reg)=regtype) and
(get_alias(getsupreg(oper[1+n]^.reg))=orgreg) then
replaceoper:=1+n
else
internalerror(200704283);
end;
end;
1 :
begin
{ Some instructions don't allow memory references
for destination }
case instr.opcode of
A_CMOVcc,
A_MOVZX,
A_MOVSX,
A_MOVSXD,
A_MULSS,
A_MULSD,
A_SUBSS,
A_SUBSD,
A_ADDSD,
A_ADDSS,
A_DIVSD,
A_DIVSS,
A_SHLD,
A_SHRD,
A_COMISD,
A_COMISS,
A_CVTDQ2PD,
A_CVTDQ2PS,
A_CVTPD2DQ,
A_CVTPD2PI,
A_CVTPD2PS,
A_CVTPI2PD,
A_CVTPS2DQ,
A_CVTPS2PD,
A_CVTSD2SI,
A_CVTSD2SS,
A_CVTSI2SD,
A_CVTSS2SD,
A_CVTTPD2PI,
A_CVTTPD2DQ,
A_CVTTPS2DQ,
A_CVTTSD2SI,
A_CVTPI2PS,
A_CVTPS2PI,
A_CVTSI2SS,
A_CVTSS2SI,
A_CVTTPS2PI,
A_CVTTSS2SI,
A_IMUL,
A_XORPD,
A_XORPS,
A_ORPD,
A_ORPS,
A_ANDPD,
A_ANDPS,
A_UNPCKLPS,
A_UNPCKHPS,
A_SHUFPD,
A_SHUFPS:
case replaceoper of
0 :
begin
{ Some instructions don't allow memory references
for source }
case instr.opcode of
A_BT,
A_BTS,
A_BTC,
A_BTR,
replaceoper:=-1;
{ shufp* would require 16 byte alignment for memory locations so we force the source
operand into a register }
A_SHUFPD,
A_SHUFPS :
replaceoper:=-1;
end;
end;
1 :
begin
{ Some instructions don't allow memory references
for destination }
case instr.opcode of
A_CMOVcc,
A_MOVZX,
A_MOVSX,
A_MOVSXD,
A_MULSS,
A_MULSD,
A_SUBSS,
A_SUBSD,
A_ADDSD,
A_ADDSS,
A_DIVSD,
A_DIVSS,
A_SHLD,
A_SHRD,
A_COMISD,
A_COMISS,
A_CVTDQ2PD,
A_CVTDQ2PS,
A_CVTPD2DQ,
A_CVTPD2PI,
A_CVTPD2PS,
A_CVTPI2PD,
A_CVTPS2DQ,
A_CVTPS2PD,
A_CVTSD2SI,
A_CVTSD2SS,
A_CVTSI2SD,
A_CVTSS2SD,
A_CVTTPD2PI,
A_CVTTPD2DQ,
A_CVTTPS2DQ,
A_CVTTSD2SI,
A_CVTPI2PS,
A_CVTPS2PI,
A_CVTSI2SS,
A_CVTSS2SI,
A_CVTTPS2PI,
A_CVTTSS2SI,
A_IMUL,
A_XORPD,
A_XORPS,
A_ORPD,
A_ORPS,
A_ANDPD,
A_ANDPS,
A_UNPCKLPS,
A_UNPCKHPS,
A_SHUFPD,
A_SHUFPS:
replaceoper:=-1;
{$ifdef x86_64}
A_MOV:
{ 64 bit constants can only be moved into registers }
if (oper[0]^.typ=top_const) and
(oper[1]^.typ=top_reg) and
((oper[0]^.val<low(longint)) or
(oper[0]^.val>high(longint))) then
replaceoper:=-1;
A_MOV:
{ 64 bit constants can only be moved into registers }
if (oper[0]^.typ=top_const) and
(oper[1]^.typ=top_reg) and
((oper[0]^.val<low(longint)) or
(oper[0]^.val>high(longint))) then
replaceoper:=-1;
{$endif x86_64}
end;
end;
end;
end;
end;
end;
end;
{$ifdef x86_64}
{$ifdef x86_64}
{ 32 bit operations on 32 bit registers on x86_64 can result in
zeroing the upper 32 bits of the register. This does not happen
with memory operations, so we have to perform these calculations
in registers. }
if (instr.opsize=S_L) then
replaceoper:=-1;
{$endif x86_64}
{$endif x86_64}
{ Replace register with spill reference }
if replaceoper<>-1 then
@ -287,6 +308,10 @@ implementation
opcode:=A_MOVSS;
A_MOVAPD:
opcode:=A_MOVSD;
A_VMOVAPS:
opcode:=A_VMOVSS;
A_VMOVAPD:
opcode:=A_VMOVSD;
end;
result:=true;
end;

View File

@ -3453,22 +3453,22 @@ void \326\1\xA7 X86_64
[VADDPD]
(Ch_All, Ch_None, Ch_None)
(Ch_Wop3, Ch_Rop2, Ch_Rop1)
xmmreg,xmmreg,xmmrm \361\362\370\1\x58\75\120 AVX,SANDYBRIDGE
ymmreg,ymmreg,ymmrm \361\362\364\370\1\x58\75\120 AVX,SANDYBRIDGE
[VADDPS]
(Ch_All, Ch_None, Ch_None)
(Ch_Wop3, Ch_Rop2, Ch_Rop1)
xmmreg,xmmreg,xmmrm \362\370\1\x58\75\120 AVX,SANDYBRIDGE
ymmreg,ymmreg,ymmrm \362\364\370\1\x58\75\120 AVX,SANDYBRIDGE
[VADDSD]
(Ch_All, Ch_None, Ch_None)
(Ch_Wop3, Ch_Rop2, Ch_Rop1)
xmmreg,xmmreg,mem64 \334\362\370\1\x58\75\120 AVX,SANDYBRIDGE
xmmreg,xmmreg,xmmreg \334\362\370\1\x58\75\120 AVX,SANDYBRIDGE
[VADDSS]
(Ch_All, Ch_None, Ch_None)
(Ch_Wop3, Ch_Rop2, Ch_Rop1)
xmmreg,xmmreg,mem32 \333\362\370\1\x58\75\120 AVX,SANDYBRIDGE
xmmreg,xmmreg,xmmreg \333\362\370\1\x58\75\120 AVX,SANDYBRIDGE
@ -3919,7 +3919,7 @@ rm64,xmmreg \361\362\363\370\1\x7E\101 AVX,SA
xmmreg,rm64 \361\362\363\370\1\x6E\110 AVX,SANDYBRIDGE
[VMOVSD]
(Ch_All, Ch_None, Ch_None)
(Ch_Wop3, Ch_Rop2, Ch_Rop1)
xmmreg,xmmreg,xmmreg \334\362\370\1\x10\75\120 AVX,SANDYBRIDGE
xmmreg,mem64 \334\362\370\1\x10\110 AVX,SANDYBRIDGE
xmmreg,xmmreg,xmmreg \334\362\370\1\x11\75\102 AVX,SANDYBRIDGE
@ -3936,7 +3936,7 @@ xmmreg,xmmrm \333\362\370\1\x12\110 AVX,SA
ymmreg,ymmrm \333\362\364\370\1\x12\110 AVX,SANDYBRIDGE
[VMOVSS]
(Ch_All, Ch_None, Ch_None)
(Ch_Wop3, Ch_Rop2, Ch_Rop1)
xmmreg,xmmreg,xmmreg \333\362\370\1\x10\75\120 AVX,SANDYBRIDGE
xmmreg,mem64 \333\362\370\1\x10\110 AVX,SANDYBRIDGE
xmmreg,xmmreg,xmmreg \333\362\370\1\x11\75\102 AVX,SANDYBRIDGE
@ -3961,22 +3961,22 @@ ymmrm,ymmreg \362\364\370\1\x11\101 AVX,SA
xmmreg,xmmreg,xmmrm,imm8 \361\362\372\1\x42\75\120\27 AVX,SANDYBRIDGE
[VMULPD]
(Ch_All, Ch_None, Ch_None)
(Ch_Wop3, Ch_Rop2, Ch_Rop1)
xmmreg,xmmreg,xmmrm \361\362\370\1\x59\75\120 AVX,SANDYBRIDGE
ymmreg,ymmreg,ymmrm \361\362\364\370\1\x59\75\120 AVX,SANDYBRIDGE
[VMULPS]
(Ch_All, Ch_None, Ch_None)
(Ch_Wop3, Ch_Rop2, Ch_Rop1)
xmmreg,xmmreg,xmmrm \362\370\1\x59\75\120 AVX,SANDYBRIDGE
ymmreg,ymmreg,ymmrm \362\364\370\1\x59\75\120 AVX,SANDYBRIDGE
[VMULSD]
(Ch_All, Ch_None, Ch_None)
(Ch_Wop3, Ch_Rop2, Ch_Rop1)
xmmreg,xmmreg,mem64 \334\362\370\1\x59\75\120 AVX,SANDYBRIDGE
xmmreg,xmmreg,xmmreg \334\362\370\1\x59\75\120 AVX,SANDYBRIDGE
[VMULSS]
(Ch_All, Ch_None, Ch_None)
(Ch_Wop3, Ch_Rop2, Ch_Rop1)
xmmreg,xmmreg,mem32 \333\362\370\1\x59\75\120 AVX,SANDYBRIDGE
xmmreg,xmmreg,xmmreg \333\362\370\1\x59\75\120 AVX,SANDYBRIDGE

View File

@ -51,7 +51,8 @@ Type
fpu_ssse3,
fpu_sse41,
fpu_sse42,
fpu_avx
fpu_avx,
fpu_avx2
);
Const
@ -86,11 +87,14 @@ Const
'SSSE3',
'SSE41',
'SSE42',
'AVX'
'AVX',
'AVX2'
);
sse_singlescalar : set of tfputype = [fpu_sse64,fpu_sse3];
sse_doublescalar : set of tfputype = [fpu_sse64,fpu_sse3];
sse_singlescalar = [fpu_sse64..fpu_avx2];
sse_doublescalar = [fpu_sse64..fpu_avx2];
fpu_avx_instructionsets = [fpu_avx,fpu_avx2];
{ Supported optimizations, only used for information }
supported_optimizerswitches = genericlevel1optimizerswitches+

View File

@ -685,6 +685,10 @@
(Ch: (Ch_RRAX, Ch_WMemEDI, Ch_RWRDI)),
(Ch: (Ch_WRAX, Ch_RWRSI, Ch_None)),
(Ch: (Ch_All, Ch_None, Ch_None)),
(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
(Ch: (Ch_All, Ch_None, Ch_None)),
(Ch: (Ch_All, Ch_None, Ch_None)),
(Ch: (Ch_All, Ch_None, Ch_None)),
@ -772,21 +776,17 @@
(Ch: (Ch_All, Ch_None, Ch_None)),
(Ch: (Ch_All, Ch_None, Ch_None)),
(Ch: (Ch_All, Ch_None, Ch_None)),
(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
(Ch: (Ch_All, Ch_None, Ch_None)),
(Ch: (Ch_All, Ch_None, Ch_None)),
(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
(Ch: (Ch_All, Ch_None, Ch_None)),
(Ch: (Ch_All, Ch_None, Ch_None)),
(Ch: (Ch_All, Ch_None, Ch_None)),
(Ch: (Ch_All, Ch_None, Ch_None)),
(Ch: (Ch_All, Ch_None, Ch_None)),
(Ch: (Ch_All, Ch_None, Ch_None)),
(Ch: (Ch_All, Ch_None, Ch_None)),
(Ch: (Ch_All, Ch_None, Ch_None)),
(Ch: (Ch_All, Ch_None, Ch_None)),
(Ch: (Ch_All, Ch_None, Ch_None)),
(Ch: (Ch_All, Ch_None, Ch_None)),
(Ch: (Ch_All, Ch_None, Ch_None)),
(Ch: (Ch_All, Ch_None, Ch_None)),
(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
(Ch: (Ch_All, Ch_None, Ch_None)),
(Ch: (Ch_All, Ch_None, Ch_None)),
(Ch: (Ch_All, Ch_None, Ch_None)),