From 499dd078e3b9e0956c1d94af82a2d3798c8638ea Mon Sep 17 00:00:00 2001
From: florian <florian@freepascal.org>
Date: Mon, 11 Aug 2014 20:50:21 +0000
Subject: [PATCH] + automatically insert fma inlines into floating point code
 if possible and fastmath is activated

git-svn-id: trunk@28382 -
---
 compiler/nadd.pas        | 157 ++++++++++++++++++++++++++++++++++++---
 compiler/x86/nx86add.pas |  22 ++++++
 2 files changed, 167 insertions(+), 12 deletions(-)

diff --git a/compiler/nadd.pas b/compiler/nadd.pas
index 696148df38..da5c15525d 100644
--- a/compiler/nadd.pas
+++ b/compiler/nadd.pas
@@ -73,6 +73,10 @@ interface
           { full 64 bit multiplies.                                }
           function use_generic_mul64bit: boolean; virtual;
 
+          { shall be overridden if the target cpu supports
+            an fma instruction
+          }
+          function use_fma : boolean; virtual;
           { This routine calls internal runtime library helpers
             for all floating point arithmetic in the case
             where the emulation switches is on. Otherwise
@@ -80,18 +84,22 @@ interface
             the code generation phase.
           }
           function first_addfloat : tnode; virtual;
-         private
-           { checks whether a muln can be calculated as a 32bit }
-           { * 32bit -> 64 bit                                  }
-           function try_make_mul32to64: boolean;
-           { Match against the ranges, i.e.:
-             var a:1..10;
-             begin
-               if a>0 then
-                 ...
-             always evaluates to true. (DM)
-           }
-           function cmp_of_disjunct_ranges(var res : boolean) : boolean;
+       private
+          { checks whether a muln can be calculated as a 32bit }
+          { * 32bit -> 64 bit                                  }
+          function try_make_mul32to64: boolean;
+
+          { Match against the ranges, i.e.:
+            var a:1..10;
+            begin
+              if a>0 then
+                ...
+            always evaluates to true. (DM)
+          }
+          function cmp_of_disjunct_ranges(var res : boolean) : boolean;
+
+          { tries to replace the current node by a fma node }
+          function try_fma(ld,rd : tdef) : tnode;
        end;
        taddnodeclass = class of taddnode;
 
@@ -2612,6 +2620,127 @@ implementation
       end;
 
 
+    function taddnode.use_fma : boolean;
+      begin
+        use_fma:=false; { generic default: no fma; cpu specific descendants may override this }
+      end;
+
+
+    { Tries to fuse "a*b+c", "c+a*b", "a*b-c", "c-a*b" (and the sqr(a) forms) into
+      one in_fma_* inline node. Returns nil when fastmath is off, use_fma is false,
+      the operands are not both single or both double, or the addend is too
+      complex. On success the operands are moved into the new node; sqr()
+      arguments are copied twice, so they must be free of side effects. }
+    function taddnode.try_fma(ld,rd : tdef) : tnode;
+      var
+        inlinennr : Integer;
+      begin
+        result:=nil;
+        if (cs_opt_fastmath in current_settings.optimizerswitches) and
+          use_fma and
+          (nodetype in [addn,subn]) and
+          (rd.typ=floatdef) and (ld.typ=floatdef) and
+          (is_single(rd) or is_double(rd)) and
+          equal_defs(rd,ld) and
+          { transforming a*b+c into fma(a,b,c) only makes sense if c can be
+            calculated easily. Consider a*b+c*d, which results in
+              fmul; fmul; fadd
+            without and in
+              fmul; fma
+            with the fma optimization. On a superscalar architecture, the first
+            sequence requires clock_cycles(fmul)+clock_cycles(fadd) clock cycles because the fmuls can be
+            executed in parallel. The second requires clock_cycles(fmul)+clock_cycles(fma) because the fma
+            has to wait for the result of the fmul. Since typically clock_cycles(fma)>clock_cycles(fadd)
+            applies, the first sequence is better.
+            A sqr(x) operand is only fused if x is free of side effects, because x is duplicated below.
+          }
+          (((left.nodetype=muln) and (node_complexity(right)<3)) or
+           ((right.nodetype=muln) and (node_complexity(left)<3)) or
+           ((left.nodetype=inlinen) and
+            (tinlinenode(left).inlinenumber=in_sqr_real) and
+            not(might_have_sideeffects(tinlinenode(left).left)) and (node_complexity(right)<3)) or
+           ((right.nodetype=inlinen) and
+            (tinlinenode(right).inlinenumber=in_sqr_real) and
+            not(might_have_sideeffects(tinlinenode(right).left)) and (node_complexity(left)<3))
+          ) then
+          begin
+            case tfloatdef(ld).floattype of
+              s32real:
+               inlinennr:=in_fma_single;
+              s64real:
+               inlinennr:=in_fma_double;
+              s80real:
+               inlinennr:=in_fma_extended;
+              s128real:
+               inlinennr:=in_fma_float128;
+              else
+                internalerror(2014042601);
+            end;
+            if left.nodetype=muln then
+              begin
+                if nodetype=subn then
+                  result:=cinlinenode.create(inlinennr,false,ccallparanode.create(cunaryminusnode.create(right),
+                    ccallparanode.create(taddnode(left).right,
+                    ccallparanode.create(taddnode(left).left,nil
+                    ))))
+                else
+                  result:=cinlinenode.create(inlinennr,false,ccallparanode.create(right,
+                    ccallparanode.create(taddnode(left).right,
+                    ccallparanode.create(taddnode(left).left,nil
+                    ))));
+                right:=nil;
+                taddnode(left).right:=nil;
+                taddnode(left).left:=nil;
+              end
+            else if right.nodetype=muln then
+              begin
+                if nodetype=subn then
+                  result:=cinlinenode.create(inlinennr,false,ccallparanode.create(left,
+                    ccallparanode.create(cunaryminusnode.create(taddnode(right).right),
+                    ccallparanode.create(taddnode(right).left,nil
+                    ))))
+                else
+                  result:=cinlinenode.create(inlinennr,false,ccallparanode.create(left,
+                    ccallparanode.create(taddnode(right).right,
+                    ccallparanode.create(taddnode(right).left,nil
+                    ))));
+                left:=nil;
+                taddnode(right).right:=nil;
+                taddnode(right).left:=nil;
+              end
+            else if (left.nodetype=inlinen) and (tinlinenode(left).inlinenumber=in_sqr_real) and not(might_have_sideeffects(tinlinenode(left).left)) then
+              begin
+                if nodetype=subn then
+                  result:=cinlinenode.create(inlinennr,false,ccallparanode.create(cunaryminusnode.create(right),
+                    ccallparanode.create(tinlinenode(left).left.getcopy,
+                    ccallparanode.create(tinlinenode(left).left.getcopy,nil
+                    ))))
+                else
+                  result:=cinlinenode.create(inlinennr,false,ccallparanode.create(right,
+                    ccallparanode.create(tinlinenode(left).left.getcopy,
+                    ccallparanode.create(tinlinenode(left).left.getcopy,nil
+                    ))));
+                right:=nil;
+              end
+            { we get here only if right is a sqr node whose argument has no side effects }
+            else if (right.nodetype=inlinen) and (tinlinenode(right).inlinenumber=in_sqr_real) and not(might_have_sideeffects(tinlinenode(right).left)) then
+              begin
+                if nodetype=subn then
+                  result:=cinlinenode.create(inlinennr,false,ccallparanode.create(left,
+                    ccallparanode.create(cunaryminusnode.create(tinlinenode(right).left.getcopy),
+                    ccallparanode.create(tinlinenode(right).left.getcopy,nil
+                    ))))
+                else
+                  result:=cinlinenode.create(inlinennr,false,ccallparanode.create(left,
+                    ccallparanode.create(tinlinenode(right).left.getcopy,
+                    ccallparanode.create(tinlinenode(right).left.getcopy,nil
+                    ))));
+                left:=nil;
+              end;
+          end;
+      end;
+
+
     function taddnode.first_add64bitint: tnode;
       var
         procname: string[31];
@@ -3109,6 +3238,10 @@ implementation
                 expectloc:=LOC_FPUREGISTER
               else
                 expectloc:=LOC_FLAGS;
+
+              result:=try_fma(ld,rd);
+              if assigned(result) then
+                exit;
             end
 
          { pointer comperation and subtraction }
diff --git a/compiler/x86/nx86add.pas b/compiler/x86/nx86add.pas
index 1f147bec76..41be991e0d 100644
--- a/compiler/x86/nx86add.pas
+++ b/compiler/x86/nx86add.pas
@@ -47,6 +47,7 @@ unit nx86add;
         procedure second_addfloatsse;
         procedure second_addfloatavx;
       public
+        function use_fma : boolean;override;
         procedure second_addfloat;override;
 {$ifndef i8086}
         procedure second_addsmallset;override;
@@ -273,6 +274,15 @@ unit nx86add;
     procedure tx86addnode.prepare_x87_locations(out refnode: tnode);
       begin
         refnode:=nil;
+
+        { later on, no mm registers are allowed, so transfer everything to memory here;
+          below it is loaded into an fpu register if needed }
+        if left.location.loc in [LOC_CMMREGISTER,LOC_MMREGISTER] then
+          hlcg.location_force_mem(current_asmdata.CurrAsmList,left.location,left.resultdef);
+
+        if right.location.loc in [LOC_CMMREGISTER,LOC_MMREGISTER] then
+          hlcg.location_force_mem(current_asmdata.CurrAsmList,right.location,right.resultdef);
+
         case ord(left.location.loc=LOC_FPUREGISTER)+ord(right.location.loc=LOC_FPUREGISTER) of
           0:
             begin
@@ -1072,6 +1082,18 @@ unit nx86add;
       end;
 
 
+    function tx86addnode.use_fma : boolean;
+      begin
+{$ifdef i8086}
+        result:=inherited use_fma;
+{$else i8086}
+        { fma is only used if the result stays in an xmm register; fiddling with fpu registers and fma makes no sense }
+        result:=(([CPUX86_HAS_FMA,CPUX86_HAS_FMA4]*cpu_capabilities[current_settings.cputype])<>[]) and
+          use_vectorfpu(resultdef);
+{$endif i8086}
+      end;
+
+
     procedure tx86addnode.second_cmpfloatvector;
       var
         op : tasmop;