mirror of
https://gitlab.com/freepascal.org/fpc/source.git
synced 2025-09-27 03:29:22 +02:00
+ automatically insert fma inlines into floating point code if possible and fastmath is activated
git-svn-id: trunk@28382 -
This commit is contained in:
parent
482e61dafa
commit
499dd078e3
@ -73,6 +73,10 @@ interface
|
||||
{ full 64 bit multiplies. }
|
||||
function use_generic_mul64bit: boolean; virtual;
|
||||
|
||||
{ shall be overridden if the target cpu supports
|
||||
an fma instruction
|
||||
}
|
||||
function use_fma : boolean; virtual;
|
||||
{ This routine calls internal runtime library helpers
|
||||
for all floating point arithmetic in the case
|
||||
where the emulation switch is on. Otherwise
|
||||
@ -80,18 +84,22 @@ interface
|
||||
the code generation phase.
|
||||
}
|
||||
function first_addfloat : tnode; virtual;
|
||||
private
|
||||
{ checks whether a muln can be calculated as a 32bit }
|
||||
{ * 32bit -> 64 bit }
|
||||
function try_make_mul32to64: boolean;
|
||||
{ Match against the ranges, i.e.:
|
||||
var a:1..10;
|
||||
begin
|
||||
if a>0 then
|
||||
...
|
||||
always evaluates to true. (DM)
|
||||
}
|
||||
function cmp_of_disjunct_ranges(var res : boolean) : boolean;
|
||||
private
|
||||
{ checks whether a muln can be calculated as a 32bit }
|
||||
{ * 32bit -> 64 bit }
|
||||
function try_make_mul32to64: boolean;
|
||||
|
||||
{ Match against the ranges, i.e.:
|
||||
var a:1..10;
|
||||
begin
|
||||
if a>0 then
|
||||
...
|
||||
always evaluates to true. (DM)
|
||||
}
|
||||
function cmp_of_disjunct_ranges(var res : boolean) : boolean;
|
||||
|
||||
{ tries to replace the current node by a fma node }
|
||||
function try_fma(ld,rd : tdef) : tnode;
|
||||
end;
|
||||
taddnodeclass = class of taddnode;
|
||||
|
||||
@ -2612,6 +2620,127 @@ implementation
|
||||
end;
|
||||
|
||||
|
||||
{ Generic default: report that no fused multiply-add instruction is
  available. Descendants for target cpus that do support an fma
  instruction override this method. }
function taddnode.use_fma : boolean;
  begin
    exit(false);
  end;
|
||||
|
||||
|
||||
{ Tries to replace the current add/sub node by an fma inline node.

  ld, rd : result definitions of the left and the right operand.

  Returns the newly created inline node if the transformation was applied,
  nil otherwise. When a node is returned, the operands which were moved
  into it have been detached from this node (set to nil), so they are not
  released together with it.

  The transformation is only tried if fastmath is enabled, the target
  reports fma support through use_fma, the node is an addition or a
  subtraction and both operands are of the same single or double type. }
function taddnode.try_fma(ld,rd : tdef) : tnode;

  { Returns true if n is sqr(x) and x can be evaluated twice safely.
    sqr(x)+c is rewritten into fma(x,x,c) using two copies of x, so an
    operand with side effects (e.g. a function call) would otherwise be
    executed twice. }
  function is_duplicable_sqr(n : tnode) : boolean;
    begin
      result:=(n.nodetype=inlinen) and
        (tinlinenode(n).inlinenumber=in_sqr_real) and
        not(might_have_sideeffects(tinlinenode(n).left));
    end;

  { Creates the inline node computing factor1*factor2+addend. The nesting
    of the parameter nodes (addend outermost, factor1 innermost) is the
    same one the individual transformations used before. }
  function create_fma(inlinennr : Integer;factor1,factor2,addend : tnode) : tnode;
    begin
      result:=cinlinenode.create(inlinennr,false,
        ccallparanode.create(addend,
        ccallparanode.create(factor2,
        ccallparanode.create(factor1,nil))));
    end;

var
  inlinennr : Integer;
  negatable : tnode;
begin
  result:=nil;
  if (cs_opt_fastmath in current_settings.optimizerswitches) and
    use_fma and
    (nodetype in [addn,subn]) and
    (rd.typ=floatdef) and (ld.typ=floatdef) and
    (is_single(rd) or is_double(rd)) and
    equal_defs(rd,ld) and
    { transforming a*b+c into fma(a,b,c) makes only sense if c can be
      calculated easily. Consider a*b+c*d which results in

        fmul
        fmul
        fadd

      and in

        fmul
        fma

      when using the fma optimization. On a super scalar architecture, the first instruction
      sequence requires clock_cycles(fmul)+clock_cycles(fadd) clock cycles because the fmuls can be executed in parallel.
      The second sequence requires clock_cycles(fmul)+clock_cycles(fma) because the fma has to wait for the
      result of the fmul. Since typically clock_cycles(fma)>clock_cycles(fadd) applies, the first sequence is better.
    }
    (((left.nodetype=muln) and (node_complexity(right)<3)) or
     ((right.nodetype=muln) and (node_complexity(left)<3)) or
     ((left.nodetype=inlinen) and
      (tinlinenode(left).inlinenumber=in_sqr_real) and
      (node_complexity(right)<3)) or
     ((right.nodetype=inlinen) and
      (tinlinenode(right).inlinenumber=in_sqr_real) and
      (node_complexity(left)<3))
    ) then
    begin
      { select the fma inline matching the operand type }
      case tfloatdef(ld).floattype of
        s32real:
          inlinennr:=in_fma_single;
        s64real:
          inlinennr:=in_fma_double;
        s80real:
          inlinennr:=in_fma_extended;
        s128real:
          inlinennr:=in_fma_float128;
        else
          internalerror(2014042601);
      end;
      if left.nodetype=muln then
        begin
          { a*b+c -> fma(a,b,c), a*b-c -> fma(a,b,-c) }
          negatable:=right;
          if nodetype=subn then
            negatable:=cunaryminusnode.create(negatable);
          result:=create_fma(inlinennr,taddnode(left).left,taddnode(left).right,negatable);
          { the operands are owned by the fma node now }
          right:=nil;
          taddnode(left).right:=nil;
          taddnode(left).left:=nil;
        end
      else if right.nodetype=muln then
        begin
          { c+a*b -> fma(a,b,c), c-a*b -> fma(a,-b,c) }
          negatable:=taddnode(right).right;
          if nodetype=subn then
            negatable:=cunaryminusnode.create(negatable);
          result:=create_fma(inlinennr,taddnode(right).left,negatable,left);
          { the operands are owned by the fma node now }
          left:=nil;
          taddnode(right).right:=nil;
          taddnode(right).left:=nil;
        end
      else if is_duplicable_sqr(left) then
        begin
          { sqr(a)+c -> fma(a,a,c), sqr(a)-c -> fma(a,a,-c);
            a is copied, the sqr node itself stays attached to this node }
          negatable:=right;
          if nodetype=subn then
            negatable:=cunaryminusnode.create(negatable);
          result:=create_fma(inlinennr,tinlinenode(left).left.getcopy,
            tinlinenode(left).left.getcopy,negatable);
          right:=nil;
        end
      else if is_duplicable_sqr(right) then
        begin
          { c+sqr(a) -> fma(a,a,c), c-sqr(a) -> fma(a,-a,c);
            a is copied, the sqr node itself stays attached to this node }
          negatable:=tinlinenode(right).left.getcopy;
          if nodetype=subn then
            negatable:=cunaryminusnode.create(negatable);
          result:=create_fma(inlinennr,tinlinenode(right).left.getcopy,negatable,left);
          left:=nil;
        end;
      { if none of the branches applied (sqr of an operand with side effects),
        result is still nil and the node is left unchanged }
    end;
end;
|
||||
|
||||
|
||||
function taddnode.first_add64bitint: tnode;
|
||||
var
|
||||
procname: string[31];
|
||||
@ -3109,6 +3238,10 @@ implementation
|
||||
expectloc:=LOC_FPUREGISTER
|
||||
else
|
||||
expectloc:=LOC_FLAGS;
|
||||
|
||||
result:=try_fma(ld,rd);
|
||||
if assigned(result) then
|
||||
exit;
|
||||
end
|
||||
|
||||
{ pointer comparison and subtraction }
|
||||
|
@ -47,6 +47,7 @@ unit nx86add;
|
||||
procedure second_addfloatsse;
|
||||
procedure second_addfloatavx;
|
||||
public
|
||||
function use_fma : boolean;override;
|
||||
procedure second_addfloat;override;
|
||||
{$ifndef i8086}
|
||||
procedure second_addsmallset;override;
|
||||
@ -273,6 +274,15 @@ unit nx86add;
|
||||
procedure tx86addnode.prepare_x87_locations(out refnode: tnode);
|
||||
begin
|
||||
refnode:=nil;
|
||||
|
||||
{ later on, no mm registers are allowed, so transfer everything to memory here;
|
||||
below it is loaded into an fpu register if needed }
|
||||
if left.location.loc in [LOC_CMMREGISTER,LOC_MMREGISTER] then
|
||||
hlcg.location_force_mem(current_asmdata.CurrAsmList,left.location,left.resultdef);
|
||||
|
||||
if right.location.loc in [LOC_CMMREGISTER,LOC_MMREGISTER] then
|
||||
hlcg.location_force_mem(current_asmdata.CurrAsmList,right.location,right.resultdef);
|
||||
|
||||
case ord(left.location.loc=LOC_FPUREGISTER)+ord(right.location.loc=LOC_FPUREGISTER) of
|
||||
0:
|
||||
begin
|
||||
@ -1072,6 +1082,18 @@ unit nx86add;
|
||||
end;
|
||||
|
||||
|
||||
{ Decides whether add/sub nodes may be turned into fma inlines on x86.
  On i8086 the inherited answer is used unchanged. }
function tx86addnode.use_fma : boolean;
  begin
{$ifdef i8086}
    Result:=inherited use_fma;
{$else i8086}
    { fma only pays off if the result stays in an xmm register, mixing
      fpu registers and fma makes no sense }
    Result:=false;
    if use_vectorfpu(resultdef) then
      { either of the two fma capabilities of the selected cpu type is sufficient }
      Result:=(cpu_capabilities[current_settings.cputype]*[CPUX86_HAS_FMA,CPUX86_HAS_FMA4])<>[];
{$endif i8086}
  end;
|
||||
|
||||
|
||||
procedure tx86addnode.second_cmpfloatvector;
|
||||
var
|
||||
op : tasmop;
|
||||
|
Loading…
Reference in New Issue
Block a user