+ support for FMA intrinsic: if there is no hardware support, the compiler throws an error.

Currently it is implemented only for x86-CPUs supporting the FMA extension. While it would
  be possible to implement it in software or simulate fma(<single>,<single>,<single>) using
  double operations, it makes no sense in my opinion to do so.

git-svn-id: trunk@27564 -
This commit is contained in:
florian 2014-04-13 19:21:54 +00:00
parent d404d15c1e
commit d88d644925
16 changed files with 642 additions and 347 deletions

1
.gitattributes vendored
View File

@ -11382,6 +11382,7 @@ tests/test/textthr.pp svneol=native#text/plain
tests/test/tfillchr.pp svneol=native#text/plain tests/test/tfillchr.pp svneol=native#text/plain
tests/test/tfinal1.pp svneol=native#text/pascal tests/test/tfinal1.pp svneol=native#text/pascal
tests/test/tfinal2.pp svneol=native#text/pascal tests/test/tfinal2.pp svneol=native#text/pascal
tests/test/tfma1.pp svneol=native#text/pascal
tests/test/tforin1.pp svneol=native#text/pascal tests/test/tforin1.pp svneol=native#text/pascal
tests/test/tforin10.pp svneol=native#text/plain tests/test/tforin10.pp svneol=native#text/plain
tests/test/tforin11.pp svneol=native#text/plain tests/test/tforin11.pp svneol=native#text/plain

View File

@ -115,6 +115,10 @@ const
in_arctan_real = 130; in_arctan_real = 130;
in_ln_real = 131; in_ln_real = 131;
in_sin_real = 132; in_sin_real = 132;
in_fma_single = 133;
in_fma_double = 134;
in_fma_extended = 135;
in_fma_float128 = 136;
{ MMX functions } { MMX functions }
{ these contants are used by the mmx unit } { these contants are used by the mmx unit }

View File

@ -137,7 +137,9 @@ type
CPUX86_HAS_POPCNT, CPUX86_HAS_POPCNT,
CPUX86_HAS_AVXUNIT, CPUX86_HAS_AVXUNIT,
CPUX86_HAS_LZCNT, CPUX86_HAS_LZCNT,
CPUX86_HAS_MOVBE CPUX86_HAS_MOVBE,
CPUX86_HAS_FMA,
CPUX86_HAS_FMA4
); );
const const
@ -151,7 +153,7 @@ type
{ cpu_PentiumM } [CPUX86_HAS_SSEUNIT], { cpu_PentiumM } [CPUX86_HAS_SSEUNIT],
{ cpu_core_i } [CPUX86_HAS_SSEUNIT,CPUX86_HAS_POPCNT], { cpu_core_i } [CPUX86_HAS_SSEUNIT,CPUX86_HAS_POPCNT],
{ cpu_core_avx } [CPUX86_HAS_SSEUNIT,CPUX86_HAS_POPCNT,CPUX86_HAS_AVXUNIT], { cpu_core_avx } [CPUX86_HAS_SSEUNIT,CPUX86_HAS_POPCNT,CPUX86_HAS_AVXUNIT],
{ cpu_core_avx2 } [CPUX86_HAS_SSEUNIT,CPUX86_HAS_POPCNT,CPUX86_HAS_AVXUNIT,CPUX86_HAS_BMI1,CPUX86_HAS_BMI2,CPUX86_HAS_LZCNT,CPUX86_HAS_MOVBE] { cpu_core_avx2 } [CPUX86_HAS_SSEUNIT,CPUX86_HAS_POPCNT,CPUX86_HAS_AVXUNIT,CPUX86_HAS_BMI1,CPUX86_HAS_BMI2,CPUX86_HAS_LZCNT,CPUX86_HAS_MOVBE,CPUX86_HAS_FMA]
); );

View File

@ -2358,6 +2358,11 @@ cg_e_mod_only_defined_for_pos_quotient=06054_E_In ISO mode, the mod operator is
% In ISO pascal, only positive values are allowed for the quotient: \var{n mod m} is only valid if \var{m>0}. % In ISO pascal, only positive values are allowed for the quotient: \var{n mod m} is only valid if \var{m>0}.
cg_d_autoinlining=06055_DL_Auto inlining: $1 cg_d_autoinlining=06055_DL_Auto inlining: $1
% Due to auto inlining turned on, the compiler auto inlines this subroutine. % Due to auto inlining turned on, the compiler auto inlines this subroutine.
cg_e_function_not_support_by_selected_instruction_set=06056_E_The function used, is not supported by the selected instruction set: $1
% Some functions cannot be implemented efficiently for certain instruction sets, one example is fused multiply/add.
% To avoid very inefficient code, the compiler complains in this case, so either select another instruction set
% or replace the function call by alternative code
%
% \end{description} % \end{description}
# EndOfTeX # EndOfTeX
# #
@ -2615,7 +2620,6 @@ asmr_e_mixing_regtypes=07108_E_All registers in a register set must be of the sa
asmr_e_empty_regset=07109_E_A register set cannot be empty asmr_e_empty_regset=07109_E_A register set cannot be empty
% Instructions on the ARM architecture that take a register set as argument require that such a set % Instructions on the ARM architecture that take a register set as argument require that such a set
% contains at least one register. % contains at least one register.
asmr_w_useless_got_for_local=07110_W_@GOTPCREL is useless and potentially dangereous for local symbols asmr_w_useless_got_for_local=07110_W_@GOTPCREL is useless and potentially dangereous for local symbols
% The use of @GOTPCREL supposes an extra indirection that is % The use of @GOTPCREL supposes an extra indirection that is
% not present if the symbol is local, which might lead to wrong asembler code % not present if the symbol is local, which might lead to wrong asembler code

View File

@ -654,6 +654,7 @@ const
cg_e_goto_across_procedures_with_exceptions_not_allowed=06053; cg_e_goto_across_procedures_with_exceptions_not_allowed=06053;
cg_e_mod_only_defined_for_pos_quotient=06054; cg_e_mod_only_defined_for_pos_quotient=06054;
cg_d_autoinlining=06055; cg_d_autoinlining=06055;
cg_e_function_not_support_by_selected_instruction_set=06056;
asmr_d_start_reading=07000; asmr_d_start_reading=07000;
asmr_d_finish_reading=07001; asmr_d_finish_reading=07001;
asmr_e_none_label_contain_at=07002; asmr_e_none_label_contain_at=07002;
@ -985,9 +986,9 @@ const
option_info=11024; option_info=11024;
option_help_pages=11025; option_help_pages=11025;
MsgTxtSize = 71162; MsgTxtSize = 71242;
MsgIdxMax : array[1..20] of longint=( MsgIdxMax : array[1..20] of longint=(
26,96,337,121,89,56,126,27,202,64, 26,96,337,121,89,57,126,27,202,64,
57,20,1,1,1,1,1,1,1,1 57,20,1,1,1,1,1,1,1,1
); );

File diff suppressed because it is too large Load Diff

View File

@ -220,6 +220,7 @@ interface
{ a refcounted into a non-refcounted type } { a refcounted into a non-refcounted type }
function can_be_inlined: boolean; function can_be_inlined: boolean;
property paravalue : tnode read left write left;
property nextpara : tnode read right write right; property nextpara : tnode read right write right;
{ third is reused to store the parameter name (only while parsing { third is reused to store the parameter name (only while parsing
vardispatch calls, never in real node tree) and copy of 'high' vardispatch calls, never in real node tree) and copy of 'high'

View File

@ -60,6 +60,7 @@ interface
procedure second_box; virtual; abstract; procedure second_box; virtual; abstract;
procedure second_popcnt; virtual; procedure second_popcnt; virtual;
procedure second_seg; virtual; abstract; procedure second_seg; virtual; abstract;
procedure second_fma; virtual;
end; end;
implementation implementation
@ -190,6 +191,11 @@ implementation
second_popcnt; second_popcnt;
in_seg_x: in_seg_x:
second_seg; second_seg;
in_fma_single,
in_fma_double,
in_fma_extended,
in_fma_float128:
second_fma;
else internalerror(9); else internalerror(9);
end; end;
end; end;
@ -768,6 +774,12 @@ implementation
end; end;
procedure tcginlinenode.second_fma;
begin
internalerror(2014032701);
end;
begin begin
cinlinenode:=tcginlinenode; cinlinenode:=tcginlinenode;
end. s end. s

View File

@ -45,6 +45,7 @@ interface
{ pack and unpack are changed into for-loops by the compiler } { pack and unpack are changed into for-loops by the compiler }
function first_pack_unpack: tnode; virtual; function first_pack_unpack: tnode; virtual;
property parameters : tnode read left write left;
protected protected
{ All the following routines currently { All the following routines currently
call compilerprocs, unless they are call compilerprocs, unless they are
@ -83,6 +84,7 @@ interface
function typecheck_seg: tnode; virtual; function typecheck_seg: tnode; virtual;
function first_seg: tnode; virtual; function first_seg: tnode; virtual;
function first_sar: tnode; virtual; function first_sar: tnode; virtual;
function first_fma : tnode; virtual;
private private
function handle_str: tnode; function handle_str: tnode;
function handle_reset_rewrite_typed: tnode; function handle_reset_rewrite_typed: tnode;
@ -3245,6 +3247,16 @@ implementation
begin begin
result:=handle_unbox; result:=handle_unbox;
end; end;
in_fma_single,
in_fma_double,
in_fma_extended,
in_fma_float128:
begin
set_varstate(tcallparanode(left).left,vs_read,[vsf_must_be_valid]);
set_varstate(tcallparanode(tcallparanode(left).right).left,vs_read,[vsf_must_be_valid]);
set_varstate(tcallparanode(tcallparanode(tcallparanode(left).right).right).left,vs_read,[vsf_must_be_valid]);
resultdef:=tcallparanode(left).left.resultdef;
end;
else else
internalerror(8); internalerror(8);
end; end;
@ -3659,6 +3671,11 @@ implementation
result:=first_box; result:=first_box;
in_unbox_x_y: in_unbox_x_y:
result:=first_unbox; result:=first_unbox;
in_fma_single,
in_fma_double,
in_fma_extended,
in_fma_float128:
result:=first_fma;
else else
internalerror(89); internalerror(89);
end; end;
@ -4218,4 +4235,12 @@ implementation
result := loop; result := loop;
end; end;
function tinlinenode.first_fma: tnode;
begin
CGMessage1(cg_e_function_not_support_by_selected_instruction_set,'FMA');
result:=nil;
end;
end. end.

View File

@ -44,6 +44,7 @@ interface
procedure emit_const_reg_reg(i : tasmop;s : topsize;c : longint;reg1,reg2 : tregister); procedure emit_const_reg_reg(i : tasmop;s : topsize;c : longint;reg1,reg2 : tregister);
procedure emit_reg_reg_reg(i : tasmop;s : topsize;reg1,reg2,reg3 : tregister); procedure emit_reg_reg_reg(i : tasmop;s : topsize;reg1,reg2,reg3 : tregister);
procedure emit_ref_reg_reg(i : tasmop;s : topsize;ref : treference;reg1,reg2 : tregister);
procedure emit_sym(i : tasmop;s : topsize;op : tasmsymbol); procedure emit_sym(i : tasmop;s : topsize;op : tasmsymbol);
@ -124,6 +125,12 @@ implementation
current_asmdata.CurrAsmList.concat(Taicpu.Op_reg_reg_reg(i,s,reg1,reg2,reg3)); current_asmdata.CurrAsmList.concat(Taicpu.Op_reg_reg_reg(i,s,reg1,reg2,reg3));
end; end;
procedure emit_ref_reg_reg(i : tasmop;s : topsize;ref : treference;reg1,reg2 : tregister);
begin
tcgx86(cg).make_simple_ref(current_asmdata.CurrAsmList,ref);
current_asmdata.CurrAsmList.concat(Taicpu.Op_ref_reg_reg(i,s,ref,reg1,reg2));
end;
procedure emit_sym(i : tasmop;s : topsize;op : tasmsymbol); procedure emit_sym(i : tasmop;s : topsize;op : tasmsymbol);
begin begin
current_asmdata.CurrAsmList.concat(Taicpu.Op_sym(i,s,op)); current_asmdata.CurrAsmList.concat(Taicpu.Op_sym(i,s,op));

View File

@ -185,7 +185,7 @@ unit cgx86;
function UseAVX: boolean; function UseAVX: boolean;
begin begin
Result:=current_settings.fputype in fpu_avx_instructionsets; Result:=(current_settings.fputype in fpu_avx_instructionsets) or (CPUX86_HAS_AVXUNIT in cpu_capabilities[current_settings.cputype]);
end; end;

View File

@ -45,6 +45,7 @@ interface
function first_round_real: tnode; override; function first_round_real: tnode; override;
function first_trunc_real: tnode; override; function first_trunc_real: tnode; override;
function first_popcnt: tnode; override; function first_popcnt: tnode; override;
function first_fma: tnode; override;
{ second pass override to generate these nodes } { second pass override to generate these nodes }
procedure second_IncludeExclude;override; procedure second_IncludeExclude;override;
procedure second_pi; override; procedure second_pi; override;
@ -64,6 +65,7 @@ interface
procedure second_abs_long;override; procedure second_abs_long;override;
{$endif not i8086} {$endif not i8086}
procedure second_popcnt;override; procedure second_popcnt;override;
procedure second_fma;override;
private private
procedure load_fpu_location(lnode: tnode); procedure load_fpu_location(lnode: tnode);
end; end;
@ -247,7 +249,20 @@ implementation
end; end;
procedure tx86inlinenode.second_Pi; function tx86inlinenode.first_fma : tnode;
begin
if ((cpu_capabilities[current_settings.cputype]*[CPUX86_HAS_FMA,CPUX86_HAS_FMA4])<>[]) and
((is_double(resultdef)) or (is_single(resultdef))) then
begin
expectloc:=LOC_MMREGISTER;
Result:=nil;
end
else
Result:=inherited first_fma;
end;
procedure tx86inlinenode.second_pi;
begin begin
location_reset(location,LOC_FPUREGISTER,def_cgsize(resultdef)); location_reset(location,LOC_FPUREGISTER,def_cgsize(resultdef));
emit_none(A_FLDPI,S_NO); emit_none(A_FLDPI,S_NO);
@ -741,4 +756,85 @@ implementation
else else
emit_ref_reg(A_POPCNT,TCGSize2OpSize[opsize],left.location.reference,location.register); emit_ref_reg(A_POPCNT,TCGSize2OpSize[opsize],left.location.reference,location.register);
end; end;
procedure tx86inlinenode.second_fma;
const
op : array[s32real..s64real,0..3] of TAsmOp = ((A_VFMADD231SS,A_VFMADD231SS,A_VFMADD231SS,A_VFMADD213SS),
(A_VFMADD231SD,A_VFMADD231SD,A_VFMADD231SD,A_VFMADD213SD));
var
paraarray : array[1..3] of tnode;
memop,
i : integer;
gotmem : boolean;
begin
if (cpu_capabilities[current_settings.cputype]*[CPUX86_HAS_FMA,CPUX86_HAS_FMA4])<>[] then
begin
paraarray[1]:=tcallparanode(tcallparanode(tcallparanode(parameters).nextpara).nextpara).paravalue;
paraarray[2]:=tcallparanode(tcallparanode(parameters).nextpara).paravalue;
paraarray[3]:=tcallparanode(parameters).paravalue;
for i:=1 to 3 do
secondpass(paraarray[i]);
{ only one memory operand is allowed }
gotmem:=false;
memop:=0;
for i:=1 to 3 do
begin
if not(paraarray[i].location.loc in [LOC_MMREGISTER,LOC_CMMREGISTER]) then
begin
if (paraarray[i].location.loc in [LOC_REFERENCE,LOC_CREFERENCE]) and not(gotmem) then
begin
memop:=i;
gotmem:=true;
end
else
hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,paraarray[i].location,paraarray[i].resultdef,true);
end;
end;
location_reset(location,LOC_MMREGISTER,paraarray[1].location.size);
location.register:=cg.getmmregister(current_asmdata.CurrAsmList,location.size);
if gotmem then
begin
case memop of
1:
begin
hlcg.a_loadmm_reg_reg(current_asmdata.CurrAsmList,paraarray[3].resultdef,resultdef,
paraarray[3].location.register,location.register,mms_movescalar);
emit_ref_reg_reg(op[tfloatdef(resultdef).floattype,memop],S_NO,
paraarray[1].location.reference,paraarray[2].location.register,location.register);
end;
2:
begin
hlcg.a_loadmm_reg_reg(current_asmdata.CurrAsmList,paraarray[3].resultdef,resultdef,
paraarray[3].location.register,location.register,mms_movescalar);
emit_ref_reg_reg(op[tfloatdef(resultdef).floattype,memop],S_NO,
paraarray[2].location.reference,paraarray[1].location.register,location.register);
end;
3:
begin
hlcg.a_loadmm_reg_reg(current_asmdata.CurrAsmList,paraarray[1].resultdef,resultdef,
paraarray[1].location.register,location.register,mms_movescalar);
emit_ref_reg_reg(op[tfloatdef(resultdef).floattype,memop],S_NO,
paraarray[3].location.reference,paraarray[2].location.register,location.register);
end
else
internalerror(2014041301);
end;
end
else
begin
hlcg.a_loadmm_reg_reg(current_asmdata.CurrAsmList,paraarray[3].resultdef,resultdef,
paraarray[3].location.register,location.register,mms_movescalar);
emit_reg_reg_reg(op[tfloatdef(resultdef).floattype,0],S_NO,
paraarray[1].location.register,paraarray[2].location.register,location.register);
end;
end
else
internalerror(2014032301);
end;
end. end.

View File

@ -125,7 +125,9 @@ type
CPUX86_HAS_POPCNT, CPUX86_HAS_POPCNT,
CPUX86_HAS_AVXUNIT, CPUX86_HAS_AVXUNIT,
CPUX86_HAS_LZCNT, CPUX86_HAS_LZCNT,
CPUX86_HAS_MOVBE CPUX86_HAS_MOVBE,
CPUX86_HAS_FMA,
CPUX86_HAS_FMA4
); );
const const
@ -134,7 +136,7 @@ type
{ Athlon64 } [CPUX86_HAS_SSEUNIT], { Athlon64 } [CPUX86_HAS_SSEUNIT],
{ cpu_core_i } [CPUX86_HAS_SSEUNIT,CPUX86_HAS_POPCNT], { cpu_core_i } [CPUX86_HAS_SSEUNIT,CPUX86_HAS_POPCNT],
{ cpu_core_avx } [CPUX86_HAS_SSEUNIT,CPUX86_HAS_POPCNT,CPUX86_HAS_AVXUNIT], { cpu_core_avx } [CPUX86_HAS_SSEUNIT,CPUX86_HAS_POPCNT,CPUX86_HAS_AVXUNIT],
{ cpu_core_avx2 } [CPUX86_HAS_SSEUNIT,CPUX86_HAS_POPCNT,CPUX86_HAS_AVXUNIT,CPUX86_HAS_BMI1,CPUX86_HAS_BMI2,CPUX86_HAS_LZCNT,CPUX86_HAS_MOVBE] { cpu_core_avx2 } [CPUX86_HAS_SSEUNIT,CPUX86_HAS_POPCNT,CPUX86_HAS_AVXUNIT,CPUX86_HAS_BMI1,CPUX86_HAS_BMI2,CPUX86_HAS_LZCNT,CPUX86_HAS_MOVBE,CPUX86_HAS_FMA]
); );
Implementation Implementation

View File

@ -115,6 +115,10 @@ const
fpc_in_arctan_real = 130; fpc_in_arctan_real = 130;
fpc_in_ln_real = 131; fpc_in_ln_real = 131;
fpc_in_sin_real = 132; fpc_in_sin_real = 132;
fpc_in_fma_single = 133;
fpc_in_fma_double = 134;
fpc_in_fma_extended = 135;
fpc_in_fma_float128 = 136;
{ MMX functions } { MMX functions }
{ these contants are used by the mmx unit } { these contants are used by the mmx unit }

View File

@ -109,3 +109,15 @@ procedure float_raise(i: TFPUExceptionMask);
operator := (b:real48) e:extended; operator := (b:real48) e:extended;
{$endif SUPPORT_EXTENDED} {$endif SUPPORT_EXTENDED}
function fma(s1,s2,s3 : single) : single;[internproc:fpc_in_fma_single];
{$ifdef SUPPORT_DOUBLE}
function fma(d1,d2,d3 : double) : double;[internproc:fpc_in_fma_double];
{$endif SUPPORT_DOUBLE}
{$ifdef SUPPORT_EXTENDED}
function fma(e1,e2,e3 : extended) : extended;[internproc:fpc_in_fma_extended];
{$endif SUPPORT_EXTENDED}
{$ifdef SUPPORT_FLOAT128}
function fma(f1,f2,f3 : float128) : float128;[internproc:fpc_in_fma_float128];
{$endif SUPPORT_FLOAT128}

121
tests/test/tfma1.pp Normal file
View File

@ -0,0 +1,121 @@
{ %CPU=i386,x86_64 }
{ %OPT=-Cfavx2 -Cpcoreavx2 }
uses
cpu;
var
d0,d1,d2,d3 : double;
s0,s1,s2,s3 : single;
procedure testsingle;
var
l0,l1,l2,l3 : single;
begin
l1:=2;
l2:=3;
l3:=4;
l0:=fma(l1,l2,l3);
writeln(l0);
if l0<>10.0 then
halt(1);
l0:=fma(s1,l2,l3);
writeln(l0);
if l0<>10.0 then
halt(1);
l0:=fma(l1,s2,l3);
writeln(l0);
if l0<>10.0 then
halt(1);
l0:=fma(l1,l2,s3);
writeln(l0);
if l0<>10.0 then
halt(1);
l0:=fma(s1,s2,l3);
writeln(l0);
if l0<>10.0 then
halt(1);
l0:=fma(s1,l2,s3);
writeln(l0);
if l0<>10.0 then
halt(1);
l0:=fma(l1,s2,s3);
writeln(l0);
if l0<>10.0 then
halt(1);
end;
procedure testdouble;
var
l0,l1,l2,l3 : double;
begin
l1:=2;
l2:=3;
l3:=4;
l0:=fma(l1,l2,l3);
writeln(l0);
if l0<>10.0 then
halt(1);
l0:=fma(d1,l2,l3);
writeln(l0);
if l0<>10.0 then
halt(1);
l0:=fma(l1,d2,l3);
writeln(l0);
if l0<>10.0 then
halt(1);
l0:=fma(l1,l2,d3);
writeln(l0);
if l0<>10.0 then
halt(1);
l0:=fma(d1,d2,l3);
writeln(l0);
if l0<>10.0 then
halt(1);
l0:=fma(d1,l2,d3);
writeln(l0);
if l0<>10.0 then
halt(1);
l0:=fma(l1,d2,d3);
writeln(l0);
if l0<>10.0 then
halt(1);
end;
begin
if AVXSupport and FMASupport then
begin
d1:=2;
d2:=3;
d3:=4;
d0:=fma(d1,d2,d3);
writeln(d0);
if d0<>10.0 then
halt(1);
s1:=2;
s2:=3;
s3:=4;
s0:=fma(s1,s2,s3);
writeln(s0);
if s0<>10.0 then
halt(1);
testsingle;
testdouble;
writeln('ok');
end
else
writeln('Skipped because not supported by the CPU');
end.