* make use of mulps/mulpd and haddps/haddpd/hsubpd/hsubps to optimize x*x+y*y and x*x-y*y where x and y might be single or double

git-svn-id: trunk@18790 -
This commit is contained in:
florian 2011-08-20 12:34:37 +00:00
parent 13ac5d185f
commit 46cc0209de
3 changed files with 156 additions and 2 deletions

1
.gitattributes vendored
View File

@ -10448,6 +10448,7 @@ tests/test/tset5a.pp svneol=native#text/plain
tests/test/tset6.pp svneol=native#text/plain
tests/test/tset7.pp svneol=native#text/plain
tests/test/tsetsize.pp svneol=native#text/plain
tests/test/tshuffle1.pp svneol=native#text/pascal
tests/test/tstack.pp svneol=native#text/plain
tests/test/tstatic1.pp svneol=native#text/pascal
tests/test/tstatic2.pp svneol=native#text/pascal

View File

@ -66,7 +66,7 @@ unit nx86add;
symconst,symdef,
cgobj,cgx86,cga,cgutils,
paramgr,tgobj,ncgutil,
ncon,nset,
ncon,nset,ninl,
defutil;
@ -660,7 +660,28 @@ unit nx86add;
procedure tx86addnode.second_addfloatsse;
var
op : topcg;
sqr_sum : boolean;
tmp : tnode;
begin
sqr_sum:=false;
if (current_settings.fputype>=fpu_sse3) and
use_vectorfpu(resultdef) and
(nodetype in [addn,subn]) and
(left.nodetype=inlinen) and (tinlinenode(left).inlinenumber=in_sqr_real) and
(right.nodetype=inlinen) and (tinlinenode(right).inlinenumber=in_sqr_real) then
begin
sqr_sum:=true;
tmp:=tinlinenode(left).left;
tinlinenode(left).left:=nil;
left.free;
left:=tmp;
tmp:=tinlinenode(right).left;
tinlinenode(right).left:=nil;
right.free;
right:=tmp;
end;
pass_left_right;
check_left_and_right_fpureg(false);
@ -687,8 +708,51 @@ unit nx86add;
end;
location_reset(location,LOC_MMREGISTER,def_cgsize(resultdef));
if sqr_sum then
begin
if nf_swapped in flags then
swapleftright;
location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,false);
location_force_mmregscalar(current_asmdata.CurrAsmList,right.location,true);
location:=left.location;
if is_double(resultdef) then
begin
current_asmdata.CurrAsmList.concat(taicpu.op_const_reg_reg(A_SHUFPD,S_NO,%00,right.location.register,location.register));
current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_MULPD,S_NO,location.register,location.register));
case nodetype of
addn:
current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_HADDPD,S_NO,location.register,location.register));
subn:
current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_HSUBPD,S_NO,location.register,location.register));
else
internalerror(201108162);
end;
end
else
begin
current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_UNPCKLPS,S_NO,right.location.register,location.register));
{ ensure that bits 64..127 contain valid values }
current_asmdata.CurrAsmList.concat(taicpu.op_const_reg_reg(A_SHUFPD,S_NO,%00,location.register,location.register));
{ the low 64 bits (left in bits 0..31, right in bits 32..63) are now duplicated in bits 64..127 }
current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_MULPS,S_NO,location.register,location.register));
case nodetype of
addn:
begin
current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_HADDPS,S_NO,location.register,location.register));
end;
subn:
begin
current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_HSUBPS,S_NO,location.register,location.register));
end;
else
internalerror(201108163);
end;
end
end
{ we can use only right as left operand if the operation is commutative }
if (right.location.loc=LOC_MMREGISTER) and (op in [OP_ADD,OP_MUL]) then
else if (right.location.loc=LOC_MMREGISTER) and (op in [OP_ADD,OP_MUL]) then
begin
location.register:=right.location.register;
{ force floating point reg. location to be written to memory,

89
tests/test/tshuffle1.pp Normal file
View File

@ -0,0 +1,89 @@
{ %cpu=i386,x86_64 }
{ %opt=-Cfsse3 -O3 }
{$mode objfpc}
uses
cpu;
{ Evaluates a*a+b*b and a*a-b*b on double operands, first in straight-line
  code and then again inside a short loop.
  Returns 0 when every result matches the expected value, 1 otherwise. }
function test_double : longint;
  var
    a,b,value : double;
    pass : longint;
  begin
    result:=0;
    a:=1;
    b:=2;

    { sum of squares: 1*1+2*2 }
    value:=a*a+b*b;
    if value<>5 then
      result:=1;

    { difference of squares: 1*1-2*2 }
    value:=a*a-b*b;
    if value<>-3 then
      result:=1;

    { fool ssa }
    for pass:=1 to 3 do
      begin
        value:=a*a+b*b;
        if value<>5 then
          result:=1;
        value:=a*a-b*b;
        if value<>-3 then
          result:=1;
      end;
  end;
{ Evaluates a*a+b*b and a*a-b*b on single operands, first in straight-line
  code and then again inside a short loop.
  Returns 0 when every result matches the expected value, 1 otherwise. }
function test_single : longint;
  var
    a,b,value : single;
    pass : longint;
  begin
    result:=0;
    a:=1;
    b:=2;

    { sum of squares: 1*1+2*2 }
    value:=a*a+b*b;
    if value<>5 then
      result:=1;

    { difference of squares: 1*1-2*2 }
    value:=a*a-b*b;
    if value<>-3 then
      result:=1;

    { fool ssa }
    for pass:=1 to 3 do
      begin
        value:=a*a+b*b;
        if value<>5 then
          result:=1;
        value:=a*a-b*b;
        if value<>-3 then
          result:=1;
      end;
  end;
var
  a,b,value : double;
  pass : longint;
begin
  { leave with exit code 0 when is_sse3_cpu reports false }
  if not(is_sse3_cpu) then
    halt(0);

  a:=1;
  b:=2;

  { same checks as in the helper functions, but on program-level variables }
  value:=a*a+b*b;
  if value<>5 then
    halt(1);
  value:=a*a-b*b;
  if value<>-3 then
    halt(1);

  { fool ssa }
  for pass:=1 to 3 do
    begin
      value:=a*a+b*b;
      if value<>5 then
        halt(1);
      value:=a*a-b*b;
      if value<>-3 then
        halt(1);
    end;

  { repeat the checks on local variables, for double and for single }
  if test_double<>0 then
    halt(1);
  if test_single<>0 then
    halt(1);

  writeln('ok');
end.