+ support for FMA intrinsic: if there is no hardware support, the compiler throws an error.

Currently it is implemented only for x86-CPUs supporting the FMA extension. While it would be possible to implement it in software or simulate fma(<single>,<single>,<single>) using double operations, it makes no sense in my opinion to do so. git-svn-id: trunk@27564 -
2025-04-07 21:28:03 +02:00 · 2014-04-13 19:21:54 +00:00 · 2014-04-13 19:21:54 +00:00 · d88d644925
commit d88d644925
parent d404d15c1e
16 changed files with 642 additions and 347 deletions
--- a/.gitattributes
+++ b/.gitattributes
@ -11382,6 +11382,7 @@ tests/test/textthr.pp svneol=native#text/plain
 tests/test/tfillchr.pp svneol=native#text/plain
 tests/test/tfinal1.pp svneol=native#text/pascal
 tests/test/tfinal2.pp svneol=native#text/pascal
+tests/test/tfma1.pp svneol=native#text/pascal
 tests/test/tforin1.pp svneol=native#text/pascal
 tests/test/tforin10.pp svneol=native#text/plain
 tests/test/tforin11.pp svneol=native#text/plain
--- a/compiler/compinnr.inc
+++ b/compiler/compinnr.inc
@ -115,6 +115,10 @@ const
   in_arctan_real      = 130;
   in_ln_real          = 131;
   in_sin_real         = 132;
+   in_fma_single       = 133;
+   in_fma_double       = 134;
+   in_fma_extended     = 135;
+   in_fma_float128     = 136;

 { MMX functions }
  { these contants are used by the mmx unit }
--- a/compiler/i386/cpuinfo.pas
+++ b/compiler/i386/cpuinfo.pas
@ -137,7 +137,9 @@ type
       CPUX86_HAS_POPCNT,
       CPUX86_HAS_AVXUNIT,
       CPUX86_HAS_LZCNT,
-       CPUX86_HAS_MOVBE
+       CPUX86_HAS_MOVBE,
+       CPUX86_HAS_FMA,
+       CPUX86_HAS_FMA4
      );

 const
@ -151,7 +153,7 @@ type
     { cpu_PentiumM  } [CPUX86_HAS_SSEUNIT],
     { cpu_core_i    } [CPUX86_HAS_SSEUNIT,CPUX86_HAS_POPCNT],
     { cpu_core_avx  } [CPUX86_HAS_SSEUNIT,CPUX86_HAS_POPCNT,CPUX86_HAS_AVXUNIT],
-     { cpu_core_avx2 } [CPUX86_HAS_SSEUNIT,CPUX86_HAS_POPCNT,CPUX86_HAS_AVXUNIT,CPUX86_HAS_BMI1,CPUX86_HAS_BMI2,CPUX86_HAS_LZCNT,CPUX86_HAS_MOVBE]
+     { cpu_core_avx2 } [CPUX86_HAS_SSEUNIT,CPUX86_HAS_POPCNT,CPUX86_HAS_AVXUNIT,CPUX86_HAS_BMI1,CPUX86_HAS_BMI2,CPUX86_HAS_LZCNT,CPUX86_HAS_MOVBE,CPUX86_HAS_FMA]
   );


--- a/compiler/msg/errore.msg
+++ b/compiler/msg/errore.msg
@ -2358,6 +2358,11 @@ cg_e_mod_only_defined_for_pos_quotient=06054_E_In ISO mode, the mod operator is
 % In ISO pascal, only positive values are allowed for the quotient: \var{n mod m} is only valid if \var{m>0}.
 cg_d_autoinlining=06055_DL_Auto inlining: $1
 % Due to auto inlining turned on, the compiler auto inlines this subroutine.
+cg_e_function_not_support_by_selected_instruction_set=06056_E_The function used, is not supported by the selected instruction set: $1
+% Some functions cannot be implemented efficiently for certain instruction sets; one example is fused multiply/add.
+% To avoid very inefficient code, the compiler complains in this case, so either select another instruction set
+% or replace the function call by alternative code.
+%
 % \end{description}
 # EndOfTeX
 #
@ -2615,7 +2620,6 @@ asmr_e_mixing_regtypes=07108_E_All registers in a register set must be of the sa
 asmr_e_empty_regset=07109_E_A register set cannot be empty
 % Instructions on the ARM architecture that take a register set as argument require that such a set
 % contains at least one register.
-
 asmr_w_useless_got_for_local=07110_W_@GOTPCREL is useless and potentially dangereous for local symbols
 % The use of @GOTPCREL supposes an extra indirection that is
 % not present if the symbol is local, which might lead to wrong asembler code
--- a/compiler/msgidx.inc
+++ b/compiler/msgidx.inc
@ -654,6 +654,7 @@ const
  cg_e_goto_across_procedures_with_exceptions_not_allowed=06053;
  cg_e_mod_only_defined_for_pos_quotient=06054;
  cg_d_autoinlining=06055;
+  cg_e_function_not_support_by_selected_instruction_set=06056;
  asmr_d_start_reading=07000;
  asmr_d_finish_reading=07001;
  asmr_e_none_label_contain_at=07002;
@ -985,9 +986,9 @@ const
  option_info=11024;
  option_help_pages=11025;

-  MsgTxtSize = 71162;
+  MsgTxtSize = 71242;

  MsgIdxMax : array[1..20] of longint=(
-    26,96,337,121,89,56,126,27,202,64,
+    26,96,337,121,89,57,126,27,202,64,
    57,20,1,1,1,1,1,1,1,1
  );
--- a/compiler/msgtxt.inc
+++ b/compiler/msgtxt.inc
--- a/compiler/ncal.pas
+++ b/compiler/ncal.pas
@ -220,6 +220,7 @@ interface
          { a refcounted into a non-refcounted type                     }
          function can_be_inlined: boolean;

+          property paravalue : tnode read left write left;
          property nextpara : tnode read right write right;
          { third is reused to store the parameter name (only while parsing
            vardispatch calls, never in real node tree) and copy of 'high'
--- a/compiler/ncginl.pas
+++ b/compiler/ncginl.pas
@ -60,6 +60,7 @@ interface
          procedure second_box; virtual; abstract;
          procedure second_popcnt; virtual;
          procedure second_seg; virtual; abstract;
+          procedure second_fma; virtual;
       end;

 implementation
@ -190,6 +191,11 @@ implementation
               second_popcnt;
            in_seg_x:
               second_seg;
+            in_fma_single,
+            in_fma_double,
+            in_fma_extended,
+            in_fma_float128:
+               second_fma;
            else internalerror(9);
         end;
      end;
@ -768,6 +774,12 @@ implementation
      end;


+    { Default second-pass handler for the in_fma_* intrinsics: this generic
+      implementation emits no code and only raises an internal error.
+      NOTE(review): presumably only reachable if a target's first_fma accepts
+      the node without the target also overriding second_fma - confirm. }
+    procedure tcginlinenode.second_fma;
+      begin
+        internalerror(2014032701);
+      end;
+
+
 begin
   cinlinenode:=tcginlinenode;
 end.  s
--- a/compiler/ninl.pas
+++ b/compiler/ninl.pas
@ -45,6 +45,7 @@ interface
          { pack and unpack are changed into for-loops by the compiler }
          function first_pack_unpack: tnode; virtual;

+          property parameters : tnode read left write left;
         protected
          { All the following routines currently
            call compilerprocs, unless they are
@ -83,6 +84,7 @@ interface
          function typecheck_seg: tnode; virtual;
          function first_seg: tnode; virtual;
          function first_sar: tnode; virtual;
+          function first_fma : tnode; virtual;
        private
          function handle_str: tnode;
          function handle_reset_rewrite_typed: tnode;
@ -3245,6 +3247,16 @@ implementation
                begin
                  result:=handle_unbox;
                end;
+              in_fma_single,
+              in_fma_double,
+              in_fma_extended,
+              in_fma_float128:
+                begin
+                  set_varstate(tcallparanode(left).left,vs_read,[vsf_must_be_valid]);
+                  set_varstate(tcallparanode(tcallparanode(left).right).left,vs_read,[vsf_must_be_valid]);
+                  set_varstate(tcallparanode(tcallparanode(tcallparanode(left).right).right).left,vs_read,[vsf_must_be_valid]);
+                  resultdef:=tcallparanode(left).left.resultdef;
+                end;
              else
                internalerror(8);
            end;
@ -3659,6 +3671,11 @@ implementation
           result:=first_box;
         in_unbox_x_y:
           result:=first_unbox;
+         in_fma_single,
+         in_fma_double,
+         in_fma_extended,
+         in_fma_float128:
+           result:=first_fma;
         else
           internalerror(89);
          end;
@ -4218,4 +4235,12 @@ implementation
         result := loop;
       end;

+
+     { Generic first-pass handler for the in_fma_* intrinsics.
+       There is no software fallback: the handler reports the error
+       cg_e_function_not_support_by_selected_instruction_set with 'FMA' as its
+       argument and returns nil, i.e. the node is not replaced.
+       Targets that can handle the intrinsic are expected to override this. }
+     function tinlinenode.first_fma: tnode;
+       begin
+         CGMessage1(cg_e_function_not_support_by_selected_instruction_set,'FMA');
+         result:=nil;
+       end;
+
 end.
+
--- a/compiler/x86/cga.pas
+++ b/compiler/x86/cga.pas
@ -44,6 +44,7 @@ interface

    procedure emit_const_reg_reg(i : tasmop;s : topsize;c : longint;reg1,reg2 : tregister);
    procedure emit_reg_reg_reg(i : tasmop;s : topsize;reg1,reg2,reg3 : tregister);
+    procedure emit_ref_reg_reg(i : tasmop;s : topsize;ref : treference;reg1,reg2 : tregister);


    procedure emit_sym(i : tasmop;s : topsize;op : tasmsymbol);
@ -124,6 +125,12 @@ implementation
         current_asmdata.CurrAsmList.concat(Taicpu.Op_reg_reg_reg(i,s,reg1,reg2,reg3));
      end;

+    { Appends the instruction i with operand size s and the operands
+      ref, reg1, reg2 (in this order) to the current assembler list.
+      ref is a value parameter, so make_simple_ref works on a local copy.
+      NOTE(review): presumably make_simple_ref rewrites the reference into a
+      form the instruction can encode and may emit helper code - confirm. }
+    procedure emit_ref_reg_reg(i : tasmop;s : topsize;ref : treference;reg1,reg2 : tregister);
+      begin
+        { must happen before the instruction is created, as ref may be changed }
+        tcgx86(cg).make_simple_ref(current_asmdata.CurrAsmList,ref);
+        current_asmdata.CurrAsmList.concat(Taicpu.Op_ref_reg_reg(i,s,ref,reg1,reg2));
+      end;
+
    procedure emit_sym(i : tasmop;s : topsize;op : tasmsymbol);
      begin
        current_asmdata.CurrAsmList.concat(Taicpu.Op_sym(i,s,op));
--- a/compiler/x86/cgx86.pas
+++ b/compiler/x86/cgx86.pas
@ -185,7 +185,7 @@ unit cgx86;

    function UseAVX: boolean;
      begin
-        Result:=current_settings.fputype in fpu_avx_instructionsets;
+        Result:=(current_settings.fputype in fpu_avx_instructionsets) or (CPUX86_HAS_AVXUNIT in cpu_capabilities[current_settings.cputype]);
      end;


--- a/compiler/x86/nx86inl.pas
+++ b/compiler/x86/nx86inl.pas
@ -45,6 +45,7 @@ interface
          function first_round_real: tnode; override;
          function first_trunc_real: tnode; override;
          function first_popcnt: tnode; override;
+          function first_fma: tnode; override;
          { second pass override to generate these nodes }
          procedure second_IncludeExclude;override;
          procedure second_pi; override;
@ -64,6 +65,7 @@ interface
          procedure second_abs_long;override;
 {$endif not i8086}
          procedure second_popcnt;override;
+          procedure second_fma;override;
       private
          procedure load_fpu_location(lnode: tnode);
       end;
@ -247,7 +249,20 @@ implementation
       end;


-     procedure tx86inlinenode.second_Pi;
+     { First pass for the in_fma_* intrinsics on x86.
+
+       The node is handled inline (result expected in an MM register) only if
+       the selected CPU type provides the FMA (FMA3) extension and the result
+       type is single or double. In every other case the inherited handler is
+       called; tinlinenode.first_fma reports that the function is not supported
+       by the selected instruction set.
+
+       CPUX86_HAS_FMA4 is deliberately not accepted here: second_fma emits only
+       the three operand VFMADD231xx/VFMADD213xx opcodes, which a CPU offering
+       FMA4 only does not implement. }
+     function tx86inlinenode.first_fma : tnode;
+       begin
+         if (CPUX86_HAS_FMA in cpu_capabilities[current_settings.cputype]) and
+           (is_single(resultdef) or is_double(resultdef)) then
+           begin
+             expectloc:=LOC_MMREGISTER;
+             Result:=nil;
+           end
+         else
+           Result:=inherited first_fma;
+       end;
+
+
+     procedure tx86inlinenode.second_pi;
       begin
         location_reset(location,LOC_FPUREGISTER,def_cgsize(resultdef));
         emit_none(A_FLDPI,S_NO);
@ -741,4 +756,85 @@ implementation
        else
          emit_ref_reg(A_POPCNT,TCGSize2OpSize[opsize],left.location.reference,location.register);
      end;
+
+
+    { Second pass for the in_fma_* intrinsics: emits one scalar FMA3
+      instruction computing para1*para2+para3 into a newly allocated MM
+      register, which becomes the location of the node.
+
+      At most one operand is used directly from memory, all other operands are
+      forced into MM registers. The opcode table is indexed by the float type
+      of the result and by the number of the memory operand:
+        0     : all operands in registers, VFMADD231xx, destination preloaded
+                with the addend (para3)
+        1, 2  : a factor is in memory, VFMADD231xx, destination preloaded with
+                the addend (para3)
+        3     : the addend is in memory, VFMADD213xx, destination preloaded
+                with para1
+
+      Raises internalerror(2014032301) if the selected CPU type has no FMA
+      capability; first_fma is expected to have rejected the node before. }
+    procedure tx86inlinenode.second_fma;
+      const
+        op : array[s32real..s64real,0..3] of TAsmOp = ((A_VFMADD231SS,A_VFMADD231SS,A_VFMADD231SS,A_VFMADD213SS),
+                                                       (A_VFMADD231SD,A_VFMADD231SD,A_VFMADD231SD,A_VFMADD213SD));
+      var
+        paraarray : array[1..3] of tnode;
+        memop,
+        i : integer;
+        gotmem : boolean;
+      begin
+         { NOTE(review): CPUX86_HAS_FMA4 passes this check although only FMA3
+           opcodes are emitted below - confirm that this is intended }
+         if (cpu_capabilities[current_settings.cputype]*[CPUX86_HAS_FMA,CPUX86_HAS_FMA4])<>[] then
+           begin
+             { the parameter list is walked back to front here, so paraarray[1]
+               is taken from the end of the list and paraarray[3] from its head }
+             paraarray[1]:=tcallparanode(tcallparanode(tcallparanode(parameters).nextpara).nextpara).paravalue;
+             paraarray[2]:=tcallparanode(tcallparanode(parameters).nextpara).paravalue;
+             paraarray[3]:=tcallparanode(parameters).paravalue;
+
+             for i:=1 to 3 do
+               secondpass(paraarray[i]);
+
+             { only one memory operand is allowed }
+             gotmem:=false;
+             memop:=0;
+             { in case operands were left on the x87 stack, they have to be
+               popped in the reverse order of the secondpass calls above, so
+               the operands are walked from the last to the first one }
+             for i:=3 downto 1 do
+               begin
+                 if not(paraarray[i].location.loc in [LOC_MMREGISTER,LOC_CMMREGISTER]) then
+                   begin
+                     if (paraarray[i].location.loc in [LOC_REFERENCE,LOC_CREFERENCE]) and not(gotmem) then
+                       begin
+                         memop:=i;
+                         gotmem:=true;
+                       end
+                     else
+                       hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,paraarray[i].location,paraarray[i].resultdef,true);
+                   end;
+               end;
+
+             location_reset(location,LOC_MMREGISTER,paraarray[1].location.size);
+             location.register:=cg.getmmregister(current_asmdata.CurrAsmList,location.size);
+
+             if gotmem then
+               begin
+                 case memop of
+                   1:
+                     begin
+                       { result:=para3; result:=para2*mem(para1)+result }
+                       hlcg.a_loadmm_reg_reg(current_asmdata.CurrAsmList,paraarray[3].resultdef,resultdef,
+                         paraarray[3].location.register,location.register,mms_movescalar);
+                       emit_ref_reg_reg(op[tfloatdef(resultdef).floattype,memop],S_NO,
+                         paraarray[1].location.reference,paraarray[2].location.register,location.register);
+                     end;
+                   2:
+                     begin
+                       { result:=para3; result:=para1*mem(para2)+result }
+                       hlcg.a_loadmm_reg_reg(current_asmdata.CurrAsmList,paraarray[3].resultdef,resultdef,
+                         paraarray[3].location.register,location.register,mms_movescalar);
+                       emit_ref_reg_reg(op[tfloatdef(resultdef).floattype,memop],S_NO,
+                         paraarray[2].location.reference,paraarray[1].location.register,location.register);
+                     end;
+                   3:
+                     begin
+                       { result:=para1; result:=para2*result+mem(para3) }
+                       hlcg.a_loadmm_reg_reg(current_asmdata.CurrAsmList,paraarray[1].resultdef,resultdef,
+                         paraarray[1].location.register,location.register,mms_movescalar);
+                       emit_ref_reg_reg(op[tfloatdef(resultdef).floattype,memop],S_NO,
+                         paraarray[3].location.reference,paraarray[2].location.register,location.register);
+                     end
+                   else
+                     internalerror(2014041301);
+                 end;
+               end
+             else
+               begin
+                 { result:=para3; result:=para1*para2+result }
+                 hlcg.a_loadmm_reg_reg(current_asmdata.CurrAsmList,paraarray[3].resultdef,resultdef,
+                   paraarray[3].location.register,location.register,mms_movescalar);
+                 emit_reg_reg_reg(op[tfloatdef(resultdef).floattype,0],S_NO,
+                   paraarray[1].location.register,paraarray[2].location.register,location.register);
+               end;
+           end
+         else
+           internalerror(2014032301);
+      end;
+
 end.
--- a/compiler/x86_64/cpuinfo.pas
+++ b/compiler/x86_64/cpuinfo.pas
@ -125,7 +125,9 @@ type
       CPUX86_HAS_POPCNT,
       CPUX86_HAS_AVXUNIT,
       CPUX86_HAS_LZCNT,
-       CPUX86_HAS_MOVBE
+       CPUX86_HAS_MOVBE,
+       CPUX86_HAS_FMA,
+       CPUX86_HAS_FMA4
      );

 const
@ -134,7 +136,7 @@ type
     { Athlon64      } [CPUX86_HAS_SSEUNIT],
     { cpu_core_i    } [CPUX86_HAS_SSEUNIT,CPUX86_HAS_POPCNT],
     { cpu_core_avx  } [CPUX86_HAS_SSEUNIT,CPUX86_HAS_POPCNT,CPUX86_HAS_AVXUNIT],
-     { cpu_core_avx2 } [CPUX86_HAS_SSEUNIT,CPUX86_HAS_POPCNT,CPUX86_HAS_AVXUNIT,CPUX86_HAS_BMI1,CPUX86_HAS_BMI2,CPUX86_HAS_LZCNT,CPUX86_HAS_MOVBE]
+     { cpu_core_avx2 } [CPUX86_HAS_SSEUNIT,CPUX86_HAS_POPCNT,CPUX86_HAS_AVXUNIT,CPUX86_HAS_BMI1,CPUX86_HAS_BMI2,CPUX86_HAS_LZCNT,CPUX86_HAS_MOVBE,CPUX86_HAS_FMA]
   );

 Implementation
--- a/rtl/inc/innr.inc
+++ b/rtl/inc/innr.inc
@ -115,6 +115,10 @@ const
   fpc_in_arctan_real      = 130;
   fpc_in_ln_real          = 131;
   fpc_in_sin_real         = 132;
+   fpc_in_fma_single       = 133;
+   fpc_in_fma_double       = 134;
+   fpc_in_fma_extended     = 135;
+   fpc_in_fma_float128     = 136;

 { MMX functions }
 { these contants are used by the mmx unit }
--- a/rtl/inc/mathh.inc
+++ b/rtl/inc/mathh.inc
@ -109,3 +109,15 @@ procedure float_raise(i: TFPUExceptionMask);
    operator := (b:real48) e:extended;
 {$endif SUPPORT_EXTENDED}

+
+    { Fused multiply/add intrinsics. Each overload is mapped to the matching
+      compiler internal procedure fpc_in_fma_*; the double, extended and
+      float128 overloads are only declared if the corresponding SUPPORT_*
+      symbol is defined.
+      NOTE(review): the test added with this change expects fma(2,3,4)=10,
+      i.e. first*second+third - confirm for targets other than x86. }
+    function fma(s1,s2,s3 : single) : single;[internproc:fpc_in_fma_single];
+{$ifdef SUPPORT_DOUBLE}
+    function fma(d1,d2,d3 : double) : double;[internproc:fpc_in_fma_double];
+{$endif SUPPORT_DOUBLE}
+{$ifdef SUPPORT_EXTENDED}
+    function fma(e1,e2,e3 : extended) : extended;[internproc:fpc_in_fma_extended];
+{$endif SUPPORT_EXTENDED}
+{$ifdef SUPPORT_FLOAT128}
+    function fma(f1,f2,f3 : float128) : float128;[internproc:fpc_in_fma_float128];
+{$endif SUPPORT_FLOAT128}
+
--- a/tests/test/tfma1.pp
+++ b/tests/test/tfma1.pp
@ -0,0 +1,121 @@
+{ %CPU=i386,x86_64 }
+{ %OPT=-Cfavx2 -Cpcoreavx2 }
+uses
+  cpu;
+var
+  d0,d1,d2,d3 : double;
+  s0,s1,s2,s3 : single;
+
+{ Checks fma on single operands for every mix of local variables (l1..l3)
+  and unit-level variables (s1..s3, which the main program sets to 2, 3 and 4
+  before calling this procedure). Each call must yield 2*3+4=10, otherwise the
+  program halts with exit code 1.
+  NOTE(review): presumably the mixes are meant to exercise the register and
+  memory operand variants of the code generator - confirm. }
+procedure testsingle;
+  var
+    l0,l1,l2,l3 : single;
+  begin
+    l1:=2;
+    l2:=3;
+    l3:=4;
+    { all operands local }
+    l0:=fma(l1,l2,l3);
+    writeln(l0);
+    if l0<>10.0 then
+      halt(1);
+
+    { first operand unit-level }
+    l0:=fma(s1,l2,l3);
+    writeln(l0);
+    if l0<>10.0 then
+      halt(1);
+
+    { second operand unit-level }
+    l0:=fma(l1,s2,l3);
+    writeln(l0);
+    if l0<>10.0 then
+      halt(1);
+
+    { third operand unit-level }
+    l0:=fma(l1,l2,s3);
+    writeln(l0);
+    if l0<>10.0 then
+      halt(1);
+
+    { first and second operand unit-level }
+    l0:=fma(s1,s2,l3);
+    writeln(l0);
+    if l0<>10.0 then
+      halt(1);
+
+    { first and third operand unit-level }
+    l0:=fma(s1,l2,s3);
+    writeln(l0);
+    if l0<>10.0 then
+      halt(1);
+
+    { second and third operand unit-level }
+    l0:=fma(l1,s2,s3);
+    writeln(l0);
+    if l0<>10.0 then
+      halt(1);
+  end;
+
+{ Checks fma on double operands for every mix of local variables (l1..l3)
+  and unit-level variables (d1..d3, which the main program sets to 2, 3 and 4
+  before calling this procedure). Each call must yield 2*3+4=10, otherwise the
+  program halts with exit code 1.
+  NOTE(review): presumably the mixes are meant to exercise the register and
+  memory operand variants of the code generator - confirm. }
+procedure testdouble;
+  var
+    l0,l1,l2,l3 : double;
+  begin
+    l1:=2;
+    l2:=3;
+    l3:=4;
+    { all operands local }
+    l0:=fma(l1,l2,l3);
+    writeln(l0);
+    if l0<>10.0 then
+      halt(1);
+
+    { first operand unit-level }
+    l0:=fma(d1,l2,l3);
+    writeln(l0);
+    if l0<>10.0 then
+      halt(1);
+
+    { second operand unit-level }
+    l0:=fma(l1,d2,l3);
+    writeln(l0);
+    if l0<>10.0 then
+      halt(1);
+
+    { third operand unit-level }
+    l0:=fma(l1,l2,d3);
+    writeln(l0);
+    if l0<>10.0 then
+      halt(1);
+
+    { first and second operand unit-level }
+    l0:=fma(d1,d2,l3);
+    writeln(l0);
+    if l0<>10.0 then
+      halt(1);
+
+    { first and third operand unit-level }
+    l0:=fma(d1,l2,d3);
+    writeln(l0);
+    if l0<>10.0 then
+      halt(1);
+
+    { second and third operand unit-level }
+    l0:=fma(l1,d2,d3);
+    writeln(l0);
+    if l0<>10.0 then
+      halt(1);
+  end;
+
+{ Main program: the checks are only executed if the cpu unit reports AVX and
+  FMA support at run time, otherwise a skip message is printed and the program
+  ends normally. Any wrong result ends the program with exit code 1. }
+begin
+  if AVXSupport and FMASupport then
+    begin
+      { the unit-level variables set here are also read by testsingle and
+        testdouble below }
+      d1:=2;
+      d2:=3;
+      d3:=4;
+      d0:=fma(d1,d2,d3);
+      writeln(d0);
+      if d0<>10.0 then
+        halt(1);
+
+      s1:=2;
+      s2:=3;
+      s3:=4;
+      s0:=fma(s1,s2,s3);
+      writeln(s0);
+      if s0<>10.0 then
+        halt(1);
+
+      testsingle;
+      testdouble;
+
+      writeln('ok');
+    end
+  else
+    writeln('Skipped because not supported by the CPU');
+end.