From 499dd078e3b9e0956c1d94af82a2d3798c8638ea Mon Sep 17 00:00:00 2001 From: florian Date: Mon, 11 Aug 2014 20:50:21 +0000 Subject: [PATCH] + automatically insert fma inlines into floating point code if possible and fastmath is activated git-svn-id: trunk@28382 - --- compiler/nadd.pas | 157 ++++++++++++++++++++++++++++++++++++--- compiler/x86/nx86add.pas | 22 ++++++ 2 files changed, 167 insertions(+), 12 deletions(-) diff --git a/compiler/nadd.pas b/compiler/nadd.pas index 696148df38..da5c15525d 100644 --- a/compiler/nadd.pas +++ b/compiler/nadd.pas @@ -73,6 +73,10 @@ interface { full 64 bit multiplies. } function use_generic_mul64bit: boolean; virtual; + { shall be overridden if the target cpu supports + an fma instruction; the generic version returns false + } + function use_fma : boolean; virtual; { This routine calls internal runtime library helpers for all floating point arithmetic in the case where the emulation switches is on. Otherwise @@ -80,18 +84,22 @@ interface the code generation phase. } function first_addfloat : tnode; virtual; - private - { checks whether a muln can be calculated as a 32bit } - { * 32bit -> 64 bit } - function try_make_mul32to64: boolean; - { Match against the ranges, i.e.: - var a:1..10; - begin - if a>0 then - ... - always evaluates to true. (DM) - } - function cmp_of_disjunct_ranges(var res : boolean) : boolean; + private + { checks whether a muln can be calculated as a 32bit } + { * 32bit -> 64 bit } + function try_make_mul32to64: boolean; + + { Match against the ranges, i.e.: + var a:1..10; + begin + if a>0 then + ... + always evaluates to true. 
(DM) + } + function cmp_of_disjunct_ranges(var res : boolean) : boolean; + + { tries to replace the current node by an fma inline node; returns nil if no replacement is done } + function try_fma(ld,rd : tdef) : tnode; end; taddnodeclass = class of taddnode; @@ -2612,6 +2620,127 @@ implementation end; + function taddnode.use_fma : boolean; + begin + result:=false; { default: fma is not used; descendants may override this } + end; + + + function taddnode.try_fma(ld,rd : tdef) : tnode; + var + inlinennr : Integer; + begin + result:=nil; + if (cs_opt_fastmath in current_settings.optimizerswitches) and + use_fma and + (nodetype in [addn,subn]) and + (rd.typ=floatdef) and (ld.typ=floatdef) and + (is_single(rd) or is_double(rd)) and + equal_defs(rd,ld) and + { transforming a*b+c into fma(a,b,c) only makes sense if c can be + calculated easily. Consider a*b+c*d which results in + + fmul + fmul + fadd + + and in + + fmul + fma + + when using the fma optimization. On a superscalar architecture, the first instruction + sequence requires clock_cycles(fmul)+clock_cycles(fadd) clock cycles because the fmuls can be executed in parallel. + The second sequence requires clock_cycles(fmul)+clock_cycles(fma) because the fma has to wait for the + result of the fmul. Since typically clock_cycles(fma)>clock_cycles(fadd) applies, the first sequence is better. 
+ } + (((left.nodetype=muln) and (node_complexity(right)<3)) or + ((right.nodetype=muln) and (node_complexity(left)<3)) or + ((left.nodetype=inlinen) and + (tinlinenode(left).inlinenumber=in_sqr_real) and + (node_complexity(right)<3)) or + ((right.nodetype=inlinen) and + (tinlinenode(right).inlinenumber=in_sqr_real) and + (node_complexity(left)<3)) + ) then + begin + case tfloatdef(ld).floattype of { NOTE(review): the condition above requires is_single or is_double, so the s80real/s128real arms look unreachable from here - confirm } + s32real: + inlinennr:=in_fma_single; + s64real: + inlinennr:=in_fma_double; + s80real: + inlinennr:=in_fma_extended; + s128real: + inlinennr:=in_fma_float128; + else + internalerror(2014042601); + end; + if left.nodetype=muln then + begin { left is the multiplication: its two operands and right (negated for subn) become the fma parameters } + if nodetype=subn then + result:=cinlinenode.create(inlinennr,false,ccallparanode.create(cunaryminusnode.create(right), + ccallparanode.create(taddnode(left).right, + ccallparanode.create(taddnode(left).left,nil + )))) + else + result:=cinlinenode.create(inlinennr,false,ccallparanode.create(right, + ccallparanode.create(taddnode(left).right, + ccallparanode.create(taddnode(left).left,nil + )))); + right:=nil; { the reused subtrees are now referenced by result, so the old references are cleared } + taddnode(left).right:=nil; + taddnode(left).left:=nil; + end + else if right.nodetype=muln then + begin { right is the multiplication: for subn one of its factors is negated, left is passed unchanged } + if nodetype=subn then + result:=cinlinenode.create(inlinennr,false,ccallparanode.create(left, + ccallparanode.create(cunaryminusnode.create(taddnode(right).right), + ccallparanode.create(taddnode(right).left,nil + )))) + else + result:=cinlinenode.create(inlinennr,false,ccallparanode.create(left, + ccallparanode.create(taddnode(right).right, + ccallparanode.create(taddnode(right).left,nil + )))); + left:=nil; + taddnode(right).right:=nil; + taddnode(right).left:=nil; + end + else if (left.nodetype=inlinen) and (tinlinenode(left).inlinenumber=in_sqr_real) then + begin { sqr(x) on the left: x is copied twice via getcopy. NOTE(review): x then occurs twice in the fma parameters - confirm this is safe if x has side effects } + if nodetype=subn then + result:=cinlinenode.create(inlinennr,false,ccallparanode.create(cunaryminusnode.create(right), + ccallparanode.create(tinlinenode(left).left.getcopy, + ccallparanode.create(tinlinenode(left).left.getcopy,nil + )))) + else + 
result:=cinlinenode.create(inlinennr,false,ccallparanode.create(right, + ccallparanode.create(tinlinenode(left).left.getcopy, + ccallparanode.create(tinlinenode(left).left.getcopy,nil + )))); + right:=nil; + end + { we get here only if right is a sqr node } + else if (right.nodetype=inlinen) and (tinlinenode(right).inlinenumber=in_sqr_real) then + begin + if nodetype=subn then + result:=cinlinenode.create(inlinennr,false,ccallparanode.create(left, + ccallparanode.create(cunaryminusnode.create(tinlinenode(right).left.getcopy), + ccallparanode.create(tinlinenode(right).left.getcopy,nil + )))) + else + result:=cinlinenode.create(inlinennr,false,ccallparanode.create(left, + ccallparanode.create(tinlinenode(right).left.getcopy, + ccallparanode.create(tinlinenode(right).left.getcopy,nil + )))); + left:=nil; + end; + end; + end; + + + function taddnode.first_add64bitint: tnode; var procname: string[31]; @@ -3109,6 +3238,10 @@ implementation expectloc:=LOC_FPUREGISTER else expectloc:=LOC_FLAGS; + + result:=try_fma(ld,rd); + if assigned(result) then + exit; end { pointer comperation and subtraction } diff --git a/compiler/x86/nx86add.pas b/compiler/x86/nx86add.pas index 1f147bec76..41be991e0d 100644 --- a/compiler/x86/nx86add.pas +++ b/compiler/x86/nx86add.pas @@ -47,6 +47,7 @@ unit nx86add; procedure second_addfloatsse; procedure second_addfloatavx; public + function use_fma : boolean;override; procedure second_addfloat;override; {$ifndef i8086} procedure second_addsmallset;override; @@ -273,6 +274,15 @@ unit nx86add; procedure tx86addnode.prepare_x87_locations(out refnode: tnode); begin refnode:=nil; + + { later on, no mm registers are allowed, so transfer everything to memory here; + below it is loaded into an fpu register if needed } + if left.location.loc in [LOC_CMMREGISTER,LOC_MMREGISTER] then + hlcg.location_force_mem(current_asmdata.CurrAsmList,left.location,left.resultdef); + + if right.location.loc in [LOC_CMMREGISTER,LOC_MMREGISTER] then + 
hlcg.location_force_mem(current_asmdata.CurrAsmList,right.location,right.resultdef); + case ord(left.location.loc=LOC_FPUREGISTER)+ord(right.location.loc=LOC_FPUREGISTER) of 0: begin @@ -1072,6 +1082,18 @@ unit nx86add; end; + function tx86addnode.use_fma : boolean; + begin +{$ifndef i8086} + { test if the result stays in an xmm register, fiddling with fpu registers and fma makes no sense; additionally the selected cpu type must offer FMA or FMA4 } + Result:=use_vectorfpu(resultdef) and + ((cpu_capabilities[current_settings.cputype]*[CPUX86_HAS_FMA,CPUX86_HAS_FMA4])<>[]); +{$else i8086} + Result:=inherited use_fma; +{$endif i8086} + end; + + procedure tx86addnode.second_cmpfloatvector; var op : tasmop;