+ support for FMA intrinsic: if there is no hardware support, the compiler throws an error.

Currently it is implemented only for x86-CPUs supporting the FMA extension. While it would be possible to implement it in software or simulate fma(<single>,<single>,<single>) using double operations, it makes no sense in my opinion to do so. git-svn-id: trunk@27564 -
2025-11-07 12:44:39 +01:00 · 2014-04-13 19:21:54 +00:00 · 2014-04-13 19:21:54 +00:00 · d88d644925
commit d88d644925
parent d404d15c1e
16 changed files with 642 additions and 347 deletions
--- a/.gitattributes
+++ b/.gitattributes
@ -11382,6 +11382,7 @@ tests/test/textthr.pp svneol=native#text/plain
 tests/test/tfillchr.pp svneol=native#text/plain
 tests/test/tfinal1.pp svneol=native#text/pascal
 tests/test/tfinal2.pp svneol=native#text/pascal
 tests/test/tfma1.pp svneol=native#text/pascal
 tests/test/tforin1.pp svneol=native#text/pascal
 tests/test/tforin10.pp svneol=native#text/plain
 tests/test/tforin11.pp svneol=native#text/plain
--- a/compiler/compinnr.inc
+++ b/compiler/compinnr.inc
@ -115,6 +115,10 @@ const
   in_arctan_real      = 130;
   in_ln_real          = 131;
   in_sin_real         = 132;
   in_fma_single       = 133;
   in_fma_double       = 134;
   in_fma_extended     = 135;
   in_fma_float128     = 136;
 { MMX functions }
  { these contants are used by the mmx unit }
--- a/compiler/i386/cpuinfo.pas
+++ b/compiler/i386/cpuinfo.pas
@ -137,7 +137,9 @@ type
       CPUX86_HAS_POPCNT,
       CPUX86_HAS_AVXUNIT,
       CPUX86_HAS_LZCNT,
-       CPUX86_HAS_MOVBE
+       CPUX86_HAS_MOVBE,
       CPUX86_HAS_FMA,
       CPUX86_HAS_FMA4
      );
 const
@ -151,7 +153,7 @@ type
     { cpu_PentiumM  } [CPUX86_HAS_SSEUNIT],
     { cpu_core_i    } [CPUX86_HAS_SSEUNIT,CPUX86_HAS_POPCNT],
     { cpu_core_avx  } [CPUX86_HAS_SSEUNIT,CPUX86_HAS_POPCNT,CPUX86_HAS_AVXUNIT],
-     { cpu_core_avx2 } [CPUX86_HAS_SSEUNIT,CPUX86_HAS_POPCNT,CPUX86_HAS_AVXUNIT,CPUX86_HAS_BMI1,CPUX86_HAS_BMI2,CPUX86_HAS_LZCNT,CPUX86_HAS_MOVBE]
+     { cpu_core_avx2 } [CPUX86_HAS_SSEUNIT,CPUX86_HAS_POPCNT,CPUX86_HAS_AVXUNIT,CPUX86_HAS_BMI1,CPUX86_HAS_BMI2,CPUX86_HAS_LZCNT,CPUX86_HAS_MOVBE,CPUX86_HAS_FMA]
   );
--- a/compiler/msg/errore.msg
+++ b/compiler/msg/errore.msg
@ -2358,6 +2358,11 @@ cg_e_mod_only_defined_for_pos_quotient=06054_E_In ISO mode, the mod operator is
 % In ISO pascal, only positive values are allowed for the quotient: \var{n mod m} is only valid if \var{m>0}.
 cg_d_autoinlining=06055_DL_Auto inlining: $1
 % Due to auto inlining turned on, the compiler auto inlines this subroutine.
 cg_e_function_not_support_by_selected_instruction_set=06056_E_The function used, is not supported by the selected instruction set: $1
 % Some functions cannot be implemented efficiently for certain instruction sets, one example is fused multiply/add.
 % To avoid very inefficient code, the compiler complains in this case, so either select another instruction set
 % or replace the function call by alternative code
 %
 % \end{description}
 # EndOfTeX
 #
@ -2615,7 +2620,6 @@ asmr_e_mixing_regtypes=07108_E_All registers in a register set must be of the sa
 asmr_e_empty_regset=07109_E_A register set cannot be empty
 % Instructions on the ARM architecture that take a register set as argument require that such a set
 % contains at least one register.
 asmr_w_useless_got_for_local=07110_W_@GOTPCREL is useless and potentially dangereous for local symbols
 % The use of @GOTPCREL supposes an extra indirection that is
 % not present if the symbol is local, which might lead to wrong asembler code
--- a/compiler/msgidx.inc
+++ b/compiler/msgidx.inc
@ -654,6 +654,7 @@ const
  cg_e_goto_across_procedures_with_exceptions_not_allowed=06053;
  cg_e_mod_only_defined_for_pos_quotient=06054;
  cg_d_autoinlining=06055;
  cg_e_function_not_support_by_selected_instruction_set=06056;
  asmr_d_start_reading=07000;
  asmr_d_finish_reading=07001;
  asmr_e_none_label_contain_at=07002;
@ -985,9 +986,9 @@ const
  option_info=11024;
  option_help_pages=11025;
-  MsgTxtSize = 71162;
+  MsgTxtSize = 71242;
  MsgIdxMax : array[1..20] of longint=(
-    26,96,337,121,89,56,126,27,202,64,
+    26,96,337,121,89,57,126,27,202,64,
    57,20,1,1,1,1,1,1,1,1
  );
--- a/compiler/msgtxt.inc
+++ b/compiler/msgtxt.inc
--- a/compiler/ncal.pas
+++ b/compiler/ncal.pas
@ -220,6 +220,7 @@ interface
          { a refcounted into a non-refcounted type                     }
          function can_be_inlined: boolean;
          property paravalue : tnode read left write left;
          property nextpara : tnode read right write right;
          { third is reused to store the parameter name (only while parsing
            vardispatch calls, never in real node tree) and copy of 'high'
--- a/compiler/ncginl.pas
+++ b/compiler/ncginl.pas
@ -60,6 +60,7 @@ interface
          procedure second_box; virtual; abstract;
          procedure second_popcnt; virtual;
          procedure second_seg; virtual; abstract;
          procedure second_fma; virtual;
       end;
 implementation
@ -190,6 +191,11 @@ implementation
               second_popcnt;
            in_seg_x:
               second_seg;
            in_fma_single,
            in_fma_double,
            in_fma_extended,
            in_fma_float128:
               second_fma;
            else internalerror(9);
         end;
      end;
@ -768,6 +774,12 @@ implementation
      end;
    procedure tcginlinenode.second_fma;
      begin
        internalerror(2014032701);
      end;
 begin
   cinlinenode:=tcginlinenode;
 end.  s
--- a/compiler/ninl.pas
+++ b/compiler/ninl.pas
@ -45,6 +45,7 @@ interface
          { pack and unpack are changed into for-loops by the compiler }
          function first_pack_unpack: tnode; virtual;
          property parameters : tnode read left write left;
         protected
          { All the following routines currently
            call compilerprocs, unless they are
@ -83,6 +84,7 @@ interface
          function typecheck_seg: tnode; virtual;
          function first_seg: tnode; virtual;
          function first_sar: tnode; virtual;
          function first_fma : tnode; virtual;
        private
          function handle_str: tnode;
          function handle_reset_rewrite_typed: tnode;
@ -3245,6 +3247,16 @@ implementation
                begin
                  result:=handle_unbox;
                end;
              in_fma_single,
              in_fma_double,
              in_fma_extended,
              in_fma_float128:
                begin
                  set_varstate(tcallparanode(left).left,vs_read,[vsf_must_be_valid]);
                  set_varstate(tcallparanode(tcallparanode(left).right).left,vs_read,[vsf_must_be_valid]);
                  set_varstate(tcallparanode(tcallparanode(tcallparanode(left).right).right).left,vs_read,[vsf_must_be_valid]);
                  resultdef:=tcallparanode(left).left.resultdef;
                end;
              else
                internalerror(8);
            end;
@ -3659,6 +3671,11 @@ implementation
           result:=first_box;
         in_unbox_x_y:
           result:=first_unbox;
         in_fma_single,
         in_fma_double,
         in_fma_extended,
         in_fma_float128:
           result:=first_fma;
         else
           internalerror(89);
          end;
@ -4218,4 +4235,12 @@ implementation
         result := loop;
       end;
     function tinlinenode.first_fma: tnode;
       begin
         CGMessage1(cg_e_function_not_support_by_selected_instruction_set,'FMA');
         result:=nil;
       end;
 end.
--- a/compiler/x86/cga.pas
+++ b/compiler/x86/cga.pas
@ -44,6 +44,7 @@ interface
    procedure emit_const_reg_reg(i : tasmop;s : topsize;c : longint;reg1,reg2 : tregister);
    procedure emit_reg_reg_reg(i : tasmop;s : topsize;reg1,reg2,reg3 : tregister);
    procedure emit_ref_reg_reg(i : tasmop;s : topsize;ref : treference;reg1,reg2 : tregister);
    procedure emit_sym(i : tasmop;s : topsize;op : tasmsymbol);
@ -124,6 +125,12 @@ implementation
         current_asmdata.CurrAsmList.concat(Taicpu.Op_reg_reg_reg(i,s,reg1,reg2,reg3));
      end;
    procedure emit_ref_reg_reg(i : tasmop;s : topsize;ref : treference;reg1,reg2 : tregister);
      begin
        tcgx86(cg).make_simple_ref(current_asmdata.CurrAsmList,ref);
        current_asmdata.CurrAsmList.concat(Taicpu.Op_ref_reg_reg(i,s,ref,reg1,reg2));
      end;
    procedure emit_sym(i : tasmop;s : topsize;op : tasmsymbol);
      begin
        current_asmdata.CurrAsmList.concat(Taicpu.Op_sym(i,s,op));
--- a/compiler/x86/cgx86.pas
+++ b/compiler/x86/cgx86.pas
@ -185,7 +185,7 @@ unit cgx86;
    function UseAVX: boolean;
      begin
-        Result:=current_settings.fputype in fpu_avx_instructionsets;
+        Result:=(current_settings.fputype in fpu_avx_instructionsets) or (CPUX86_HAS_AVXUNIT in cpu_capabilities[current_settings.cputype]);
      end;
--- a/compiler/x86/nx86inl.pas
+++ b/compiler/x86/nx86inl.pas
@ -45,6 +45,7 @@ interface
          function first_round_real: tnode; override;
          function first_trunc_real: tnode; override;
          function first_popcnt: tnode; override;
          function first_fma: tnode; override;
          { second pass override to generate these nodes }
          procedure second_IncludeExclude;override;
          procedure second_pi; override;
@ -64,6 +65,7 @@ interface
          procedure second_abs_long;override;
 {$endif not i8086}
          procedure second_popcnt;override;
          procedure second_fma;override;
       private
          procedure load_fpu_location(lnode: tnode);
       end;
@ -247,7 +249,20 @@ implementation
       end;
-     procedure tx86inlinenode.second_Pi;
+     function tx86inlinenode.first_fma : tnode;
       begin
         if ((cpu_capabilities[current_settings.cputype]*[CPUX86_HAS_FMA,CPUX86_HAS_FMA4])<>[]) and
           ((is_double(resultdef)) or (is_single(resultdef))) then
           begin
             expectloc:=LOC_MMREGISTER;
             Result:=nil;
           end
         else
           Result:=inherited first_fma;
       end;
     procedure tx86inlinenode.second_pi;
       begin
         location_reset(location,LOC_FPUREGISTER,def_cgsize(resultdef));
         emit_none(A_FLDPI,S_NO);
@ -741,4 +756,85 @@ implementation
        else
          emit_ref_reg(A_POPCNT,TCGSize2OpSize[opsize],left.location.reference,location.register);
      end;
    procedure tx86inlinenode.second_fma;
      const
        op : array[s32real..s64real,0..3] of TAsmOp = ((A_VFMADD231SS,A_VFMADD231SS,A_VFMADD231SS,A_VFMADD213SS),
                                                       (A_VFMADD231SD,A_VFMADD231SD,A_VFMADD231SD,A_VFMADD213SD));
      var
        paraarray : array[1..3] of tnode;
        memop,
        i : integer;
        gotmem : boolean;
      begin
         if (cpu_capabilities[current_settings.cputype]*[CPUX86_HAS_FMA,CPUX86_HAS_FMA4])<>[] then
           begin
             paraarray[1]:=tcallparanode(tcallparanode(tcallparanode(parameters).nextpara).nextpara).paravalue;
             paraarray[2]:=tcallparanode(tcallparanode(parameters).nextpara).paravalue;
             paraarray[3]:=tcallparanode(parameters).paravalue;
             for i:=1 to 3 do
               secondpass(paraarray[i]);
             { only one memory operand is allowed }
             gotmem:=false;
             memop:=0;
             for i:=1 to 3 do
               begin
                 if not(paraarray[i].location.loc in [LOC_MMREGISTER,LOC_CMMREGISTER]) then
                   begin
                     if (paraarray[i].location.loc in [LOC_REFERENCE,LOC_CREFERENCE]) and not(gotmem) then
                       begin
                         memop:=i;
                         gotmem:=true;
                       end
                     else
                       hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,paraarray[i].location,paraarray[i].resultdef,true);
                   end;
               end;
             location_reset(location,LOC_MMREGISTER,paraarray[1].location.size);
             location.register:=cg.getmmregister(current_asmdata.CurrAsmList,location.size);
             if gotmem then
               begin
                 case memop of
                   1:
                     begin
                       hlcg.a_loadmm_reg_reg(current_asmdata.CurrAsmList,paraarray[3].resultdef,resultdef,
                         paraarray[3].location.register,location.register,mms_movescalar);
                       emit_ref_reg_reg(op[tfloatdef(resultdef).floattype,memop],S_NO,
                         paraarray[1].location.reference,paraarray[2].location.register,location.register);
                     end;
                   2:
                     begin
                       hlcg.a_loadmm_reg_reg(current_asmdata.CurrAsmList,paraarray[3].resultdef,resultdef,
                         paraarray[3].location.register,location.register,mms_movescalar);
                       emit_ref_reg_reg(op[tfloatdef(resultdef).floattype,memop],S_NO,
                         paraarray[2].location.reference,paraarray[1].location.register,location.register);
                     end;
                   3:
                     begin
                       hlcg.a_loadmm_reg_reg(current_asmdata.CurrAsmList,paraarray[1].resultdef,resultdef,
                         paraarray[1].location.register,location.register,mms_movescalar);
                       emit_ref_reg_reg(op[tfloatdef(resultdef).floattype,memop],S_NO,
                         paraarray[3].location.reference,paraarray[2].location.register,location.register);
                     end
                   else
                     internalerror(2014041301);
                 end;
               end
             else
               begin
                 hlcg.a_loadmm_reg_reg(current_asmdata.CurrAsmList,paraarray[3].resultdef,resultdef,
                   paraarray[3].location.register,location.register,mms_movescalar);
                 emit_reg_reg_reg(op[tfloatdef(resultdef).floattype,0],S_NO,
                   paraarray[1].location.register,paraarray[2].location.register,location.register);
               end;
           end
         else
           internalerror(2014032301);
      end;
 end.
--- a/compiler/x86_64/cpuinfo.pas
+++ b/compiler/x86_64/cpuinfo.pas
@ -125,7 +125,9 @@ type
       CPUX86_HAS_POPCNT,
       CPUX86_HAS_AVXUNIT,
       CPUX86_HAS_LZCNT,
-       CPUX86_HAS_MOVBE
+       CPUX86_HAS_MOVBE,
       CPUX86_HAS_FMA,
       CPUX86_HAS_FMA4
      );
 const
@ -134,7 +136,7 @@ type
     { Athlon64      } [CPUX86_HAS_SSEUNIT],
     { cpu_core_i    } [CPUX86_HAS_SSEUNIT,CPUX86_HAS_POPCNT],
     { cpu_core_avx  } [CPUX86_HAS_SSEUNIT,CPUX86_HAS_POPCNT,CPUX86_HAS_AVXUNIT],
-     { cpu_core_avx2 } [CPUX86_HAS_SSEUNIT,CPUX86_HAS_POPCNT,CPUX86_HAS_AVXUNIT,CPUX86_HAS_BMI1,CPUX86_HAS_BMI2,CPUX86_HAS_LZCNT,CPUX86_HAS_MOVBE]
+     { cpu_core_avx2 } [CPUX86_HAS_SSEUNIT,CPUX86_HAS_POPCNT,CPUX86_HAS_AVXUNIT,CPUX86_HAS_BMI1,CPUX86_HAS_BMI2,CPUX86_HAS_LZCNT,CPUX86_HAS_MOVBE,CPUX86_HAS_FMA]
   );
 Implementation
--- a/rtl/inc/innr.inc
+++ b/rtl/inc/innr.inc
@ -115,6 +115,10 @@ const
   fpc_in_arctan_real      = 130;
   fpc_in_ln_real          = 131;
   fpc_in_sin_real         = 132;
   fpc_in_fma_single       = 133;
   fpc_in_fma_double       = 134;
   fpc_in_fma_extended     = 135;
   fpc_in_fma_float128     = 136;
 { MMX functions }
 { these contants are used by the mmx unit }
--- a/rtl/inc/mathh.inc
+++ b/rtl/inc/mathh.inc
@ -109,3 +109,15 @@ procedure float_raise(i: TFPUExceptionMask);
    operator := (b:real48) e:extended;
 {$endif SUPPORT_EXTENDED}
    function fma(s1,s2,s3 : single) : single;[internproc:fpc_in_fma_single];
 {$ifdef SUPPORT_DOUBLE}
    function fma(d1,d2,d3 : double) : double;[internproc:fpc_in_fma_double];
 {$endif SUPPORT_DOUBLE}
 {$ifdef SUPPORT_EXTENDED}
    function fma(e1,e2,e3 : extended) : extended;[internproc:fpc_in_fma_extended];
 {$endif SUPPORT_EXTENDED}
 {$ifdef SUPPORT_FLOAT128}
    function fma(f1,f2,f3 : float128) : float128;[internproc:fpc_in_fma_float128];
 {$endif SUPPORT_FLOAT128}
--- a/tests/test/tfma1.pp
+++ b/tests/test/tfma1.pp
@ -0,0 +1,121 @@
 { %CPU=i386,x86_64 }
 { %OPT=-Cfavx2 -Cpcoreavx2 }
 uses
  cpu;
 var
  d0,d1,d2,d3 : double;
  s0,s1,s2,s3 : single;
 procedure testsingle;
  var
    l0,l1,l2,l3 : single;
  begin
    l1:=2;
    l2:=3;
    l3:=4;
    l0:=fma(l1,l2,l3);
    writeln(l0);
    if l0<>10.0 then
      halt(1);
    l0:=fma(s1,l2,l3);
    writeln(l0);
    if l0<>10.0 then
      halt(1);
    l0:=fma(l1,s2,l3);
    writeln(l0);
    if l0<>10.0 then
      halt(1);
    l0:=fma(l1,l2,s3);
    writeln(l0);
    if l0<>10.0 then
      halt(1);
    l0:=fma(s1,s2,l3);
    writeln(l0);
    if l0<>10.0 then
      halt(1);
    l0:=fma(s1,l2,s3);
    writeln(l0);
    if l0<>10.0 then
      halt(1);
    l0:=fma(l1,s2,s3);
    writeln(l0);
    if l0<>10.0 then
      halt(1);
  end;
 procedure testdouble;
  var
    l0,l1,l2,l3 : double;
  begin
    l1:=2;
    l2:=3;
    l3:=4;
    l0:=fma(l1,l2,l3);
    writeln(l0);
    if l0<>10.0 then
      halt(1);
    l0:=fma(d1,l2,l3);
    writeln(l0);
    if l0<>10.0 then
      halt(1);
    l0:=fma(l1,d2,l3);
    writeln(l0);
    if l0<>10.0 then
      halt(1);
    l0:=fma(l1,l2,d3);
    writeln(l0);
    if l0<>10.0 then
      halt(1);
    l0:=fma(d1,d2,l3);
    writeln(l0);
    if l0<>10.0 then
      halt(1);
    l0:=fma(d1,l2,d3);
    writeln(l0);
    if l0<>10.0 then
      halt(1);
    l0:=fma(l1,d2,d3);
    writeln(l0);
    if l0<>10.0 then
      halt(1);
  end;
 begin
  if AVXSupport and FMASupport then
    begin
      d1:=2;
      d2:=3;
      d3:=4;
      d0:=fma(d1,d2,d3);
      writeln(d0);
      if d0<>10.0 then
        halt(1);
      s1:=2;
      s2:=3;
      s3:=4;
      s0:=fma(s1,s2,s3);
      writeln(s0);
      if s0<>10.0 then
        halt(1);
      testsingle;
      testdouble;
      writeln('ok');
    end
  else
    writeln('Skipped because not supported by the CPU');
 end.