+ support for FMA intrinsic: if there is no hardware support, the compiler throws an error.

Currently it is implemented only for x86-CPUs supporting the FMA extension. While it would
  be possible to implement it in software or simulate fma(<single>,<single>,<single>) using
  double operations, it makes no sense in my opinion to do so.

git-svn-id: trunk@27564 -
This commit is contained in:
florian 2014-04-13 19:21:54 +00:00
parent d404d15c1e
commit d88d644925
16 changed files with 642 additions and 347 deletions

1
.gitattributes vendored
View File

@ -11382,6 +11382,7 @@ tests/test/textthr.pp svneol=native#text/plain
tests/test/tfillchr.pp svneol=native#text/plain
tests/test/tfinal1.pp svneol=native#text/pascal
tests/test/tfinal2.pp svneol=native#text/pascal
tests/test/tfma1.pp svneol=native#text/pascal
tests/test/tforin1.pp svneol=native#text/pascal
tests/test/tforin10.pp svneol=native#text/plain
tests/test/tforin11.pp svneol=native#text/plain

View File

@ -115,6 +115,10 @@ const
in_arctan_real = 130;
in_ln_real = 131;
in_sin_real = 132;
in_fma_single = 133;
in_fma_double = 134;
in_fma_extended = 135;
in_fma_float128 = 136;
{ MMX functions }
{ these contants are used by the mmx unit }

View File

@ -137,7 +137,9 @@ type
CPUX86_HAS_POPCNT,
CPUX86_HAS_AVXUNIT,
CPUX86_HAS_LZCNT,
CPUX86_HAS_MOVBE
CPUX86_HAS_MOVBE,
CPUX86_HAS_FMA,
CPUX86_HAS_FMA4
);
const
@ -151,7 +153,7 @@ type
{ cpu_PentiumM } [CPUX86_HAS_SSEUNIT],
{ cpu_core_i } [CPUX86_HAS_SSEUNIT,CPUX86_HAS_POPCNT],
{ cpu_core_avx } [CPUX86_HAS_SSEUNIT,CPUX86_HAS_POPCNT,CPUX86_HAS_AVXUNIT],
{ cpu_core_avx2 } [CPUX86_HAS_SSEUNIT,CPUX86_HAS_POPCNT,CPUX86_HAS_AVXUNIT,CPUX86_HAS_BMI1,CPUX86_HAS_BMI2,CPUX86_HAS_LZCNT,CPUX86_HAS_MOVBE]
{ cpu_core_avx2 } [CPUX86_HAS_SSEUNIT,CPUX86_HAS_POPCNT,CPUX86_HAS_AVXUNIT,CPUX86_HAS_BMI1,CPUX86_HAS_BMI2,CPUX86_HAS_LZCNT,CPUX86_HAS_MOVBE,CPUX86_HAS_FMA]
);

View File

@ -2358,6 +2358,11 @@ cg_e_mod_only_defined_for_pos_quotient=06054_E_In ISO mode, the mod operator is
% In ISO pascal, only positive values are allowed for the quotient: \var{n mod m} is only valid if \var{m>0}.
cg_d_autoinlining=06055_DL_Auto inlining: $1
% Due to auto inlining turned on, the compiler auto inlines this subroutine.
cg_e_function_not_support_by_selected_instruction_set=06056_E_The function used, is not supported by the selected instruction set: $1
% Some functions cannot be implemented efficiently for certain instruction sets, one example is fused multiply/add.
% To avoid very inefficient code, the compiler complains in this case, so either select another instruction set
% or replace the function call by alternative code
%
% \end{description}
# EndOfTeX
#
@ -2615,7 +2620,6 @@ asmr_e_mixing_regtypes=07108_E_All registers in a register set must be of the sa
asmr_e_empty_regset=07109_E_A register set cannot be empty
% Instructions on the ARM architecture that take a register set as argument require that such a set
% contains at least one register.
asmr_w_useless_got_for_local=07110_W_@GOTPCREL is useless and potentially dangereous for local symbols
% The use of @GOTPCREL supposes an extra indirection that is
% not present if the symbol is local, which might lead to wrong asembler code

View File

@ -654,6 +654,7 @@ const
cg_e_goto_across_procedures_with_exceptions_not_allowed=06053;
cg_e_mod_only_defined_for_pos_quotient=06054;
cg_d_autoinlining=06055;
cg_e_function_not_support_by_selected_instruction_set=06056;
asmr_d_start_reading=07000;
asmr_d_finish_reading=07001;
asmr_e_none_label_contain_at=07002;
@ -985,9 +986,9 @@ const
option_info=11024;
option_help_pages=11025;
MsgTxtSize = 71162;
MsgTxtSize = 71242;
MsgIdxMax : array[1..20] of longint=(
26,96,337,121,89,56,126,27,202,64,
26,96,337,121,89,57,126,27,202,64,
57,20,1,1,1,1,1,1,1,1
);

File diff suppressed because it is too large Load Diff

View File

@ -220,6 +220,7 @@ interface
{ a refcounted into a non-refcounted type }
function can_be_inlined: boolean;
property paravalue : tnode read left write left;
property nextpara : tnode read right write right;
{ third is reused to store the parameter name (only while parsing
vardispatch calls, never in real node tree) and copy of 'high'

View File

@ -60,6 +60,7 @@ interface
procedure second_box; virtual; abstract;
procedure second_popcnt; virtual;
procedure second_seg; virtual; abstract;
procedure second_fma; virtual;
end;
implementation
@ -190,6 +191,11 @@ implementation
second_popcnt;
in_seg_x:
second_seg;
in_fma_single,
in_fma_double,
in_fma_extended,
in_fma_float128:
second_fma;
else internalerror(9);
end;
end;
@ -768,6 +774,12 @@ implementation
end;
procedure tcginlinenode.second_fma;
begin
internalerror(2014032701);
end;
begin
cinlinenode:=tcginlinenode;
end. s

View File

@ -45,6 +45,7 @@ interface
{ pack and unpack are changed into for-loops by the compiler }
function first_pack_unpack: tnode; virtual;
property parameters : tnode read left write left;
protected
{ All the following routines currently
call compilerprocs, unless they are
@ -83,6 +84,7 @@ interface
function typecheck_seg: tnode; virtual;
function first_seg: tnode; virtual;
function first_sar: tnode; virtual;
function first_fma : tnode; virtual;
private
function handle_str: tnode;
function handle_reset_rewrite_typed: tnode;
@ -3245,6 +3247,16 @@ implementation
begin
result:=handle_unbox;
end;
in_fma_single,
in_fma_double,
in_fma_extended,
in_fma_float128:
begin
set_varstate(tcallparanode(left).left,vs_read,[vsf_must_be_valid]);
set_varstate(tcallparanode(tcallparanode(left).right).left,vs_read,[vsf_must_be_valid]);
set_varstate(tcallparanode(tcallparanode(tcallparanode(left).right).right).left,vs_read,[vsf_must_be_valid]);
resultdef:=tcallparanode(left).left.resultdef;
end;
else
internalerror(8);
end;
@ -3659,6 +3671,11 @@ implementation
result:=first_box;
in_unbox_x_y:
result:=first_unbox;
in_fma_single,
in_fma_double,
in_fma_extended,
in_fma_float128:
result:=first_fma;
else
internalerror(89);
end;
@ -4218,4 +4235,12 @@ implementation
result := loop;
end;
function tinlinenode.first_fma: tnode;
begin
CGMessage1(cg_e_function_not_support_by_selected_instruction_set,'FMA');
result:=nil;
end;
end.

View File

@ -44,6 +44,7 @@ interface
procedure emit_const_reg_reg(i : tasmop;s : topsize;c : longint;reg1,reg2 : tregister);
procedure emit_reg_reg_reg(i : tasmop;s : topsize;reg1,reg2,reg3 : tregister);
procedure emit_ref_reg_reg(i : tasmop;s : topsize;ref : treference;reg1,reg2 : tregister);
procedure emit_sym(i : tasmop;s : topsize;op : tasmsymbol);
@ -124,6 +125,12 @@ implementation
current_asmdata.CurrAsmList.concat(Taicpu.Op_reg_reg_reg(i,s,reg1,reg2,reg3));
end;
procedure emit_ref_reg_reg(i : tasmop;s : topsize;ref : treference;reg1,reg2 : tregister);
begin
tcgx86(cg).make_simple_ref(current_asmdata.CurrAsmList,ref);
current_asmdata.CurrAsmList.concat(Taicpu.Op_ref_reg_reg(i,s,ref,reg1,reg2));
end;
procedure emit_sym(i : tasmop;s : topsize;op : tasmsymbol);
begin
current_asmdata.CurrAsmList.concat(Taicpu.Op_sym(i,s,op));

View File

@ -185,7 +185,7 @@ unit cgx86;
function UseAVX: boolean;
begin
Result:=current_settings.fputype in fpu_avx_instructionsets;
Result:=(current_settings.fputype in fpu_avx_instructionsets) or (CPUX86_HAS_AVXUNIT in cpu_capabilities[current_settings.cputype]);
end;

View File

@ -45,6 +45,7 @@ interface
function first_round_real: tnode; override;
function first_trunc_real: tnode; override;
function first_popcnt: tnode; override;
function first_fma: tnode; override;
{ second pass override to generate these nodes }
procedure second_IncludeExclude;override;
procedure second_pi; override;
@ -64,6 +65,7 @@ interface
procedure second_abs_long;override;
{$endif not i8086}
procedure second_popcnt;override;
procedure second_fma;override;
private
procedure load_fpu_location(lnode: tnode);
end;
@ -247,7 +249,20 @@ implementation
end;
procedure tx86inlinenode.second_Pi;
function tx86inlinenode.first_fma : tnode;
begin
if ((cpu_capabilities[current_settings.cputype]*[CPUX86_HAS_FMA,CPUX86_HAS_FMA4])<>[]) and
((is_double(resultdef)) or (is_single(resultdef))) then
begin
expectloc:=LOC_MMREGISTER;
Result:=nil;
end
else
Result:=inherited first_fma;
end;
procedure tx86inlinenode.second_pi;
begin
location_reset(location,LOC_FPUREGISTER,def_cgsize(resultdef));
emit_none(A_FLDPI,S_NO);
@ -741,4 +756,85 @@ implementation
else
emit_ref_reg(A_POPCNT,TCGSize2OpSize[opsize],left.location.reference,location.register);
end;
procedure tx86inlinenode.second_fma;
const
op : array[s32real..s64real,0..3] of TAsmOp = ((A_VFMADD231SS,A_VFMADD231SS,A_VFMADD231SS,A_VFMADD213SS),
(A_VFMADD231SD,A_VFMADD231SD,A_VFMADD231SD,A_VFMADD213SD));
var
paraarray : array[1..3] of tnode;
memop,
i : integer;
gotmem : boolean;
begin
if (cpu_capabilities[current_settings.cputype]*[CPUX86_HAS_FMA,CPUX86_HAS_FMA4])<>[] then
begin
paraarray[1]:=tcallparanode(tcallparanode(tcallparanode(parameters).nextpara).nextpara).paravalue;
paraarray[2]:=tcallparanode(tcallparanode(parameters).nextpara).paravalue;
paraarray[3]:=tcallparanode(parameters).paravalue;
for i:=1 to 3 do
secondpass(paraarray[i]);
{ only one memory operand is allowed }
gotmem:=false;
memop:=0;
for i:=1 to 3 do
begin
if not(paraarray[i].location.loc in [LOC_MMREGISTER,LOC_CMMREGISTER]) then
begin
if (paraarray[i].location.loc in [LOC_REFERENCE,LOC_CREFERENCE]) and not(gotmem) then
begin
memop:=i;
gotmem:=true;
end
else
hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,paraarray[i].location,paraarray[i].resultdef,true);
end;
end;
location_reset(location,LOC_MMREGISTER,paraarray[1].location.size);
location.register:=cg.getmmregister(current_asmdata.CurrAsmList,location.size);
if gotmem then
begin
case memop of
1:
begin
hlcg.a_loadmm_reg_reg(current_asmdata.CurrAsmList,paraarray[3].resultdef,resultdef,
paraarray[3].location.register,location.register,mms_movescalar);
emit_ref_reg_reg(op[tfloatdef(resultdef).floattype,memop],S_NO,
paraarray[1].location.reference,paraarray[2].location.register,location.register);
end;
2:
begin
hlcg.a_loadmm_reg_reg(current_asmdata.CurrAsmList,paraarray[3].resultdef,resultdef,
paraarray[3].location.register,location.register,mms_movescalar);
emit_ref_reg_reg(op[tfloatdef(resultdef).floattype,memop],S_NO,
paraarray[2].location.reference,paraarray[1].location.register,location.register);
end;
3:
begin
hlcg.a_loadmm_reg_reg(current_asmdata.CurrAsmList,paraarray[1].resultdef,resultdef,
paraarray[1].location.register,location.register,mms_movescalar);
emit_ref_reg_reg(op[tfloatdef(resultdef).floattype,memop],S_NO,
paraarray[3].location.reference,paraarray[2].location.register,location.register);
end
else
internalerror(2014041301);
end;
end
else
begin
hlcg.a_loadmm_reg_reg(current_asmdata.CurrAsmList,paraarray[3].resultdef,resultdef,
paraarray[3].location.register,location.register,mms_movescalar);
emit_reg_reg_reg(op[tfloatdef(resultdef).floattype,0],S_NO,
paraarray[1].location.register,paraarray[2].location.register,location.register);
end;
end
else
internalerror(2014032301);
end;
end.

View File

@ -125,7 +125,9 @@ type
CPUX86_HAS_POPCNT,
CPUX86_HAS_AVXUNIT,
CPUX86_HAS_LZCNT,
CPUX86_HAS_MOVBE
CPUX86_HAS_MOVBE,
CPUX86_HAS_FMA,
CPUX86_HAS_FMA4
);
const
@ -134,7 +136,7 @@ type
{ Athlon64 } [CPUX86_HAS_SSEUNIT],
{ cpu_core_i } [CPUX86_HAS_SSEUNIT,CPUX86_HAS_POPCNT],
{ cpu_core_avx } [CPUX86_HAS_SSEUNIT,CPUX86_HAS_POPCNT,CPUX86_HAS_AVXUNIT],
{ cpu_core_avx2 } [CPUX86_HAS_SSEUNIT,CPUX86_HAS_POPCNT,CPUX86_HAS_AVXUNIT,CPUX86_HAS_BMI1,CPUX86_HAS_BMI2,CPUX86_HAS_LZCNT,CPUX86_HAS_MOVBE]
{ cpu_core_avx2 } [CPUX86_HAS_SSEUNIT,CPUX86_HAS_POPCNT,CPUX86_HAS_AVXUNIT,CPUX86_HAS_BMI1,CPUX86_HAS_BMI2,CPUX86_HAS_LZCNT,CPUX86_HAS_MOVBE,CPUX86_HAS_FMA]
);
Implementation

View File

@ -115,6 +115,10 @@ const
fpc_in_arctan_real = 130;
fpc_in_ln_real = 131;
fpc_in_sin_real = 132;
fpc_in_fma_single = 133;
fpc_in_fma_double = 134;
fpc_in_fma_extended = 135;
fpc_in_fma_float128 = 136;
{ MMX functions }
{ these contants are used by the mmx unit }

View File

@ -109,3 +109,15 @@ procedure float_raise(i: TFPUExceptionMask);
operator := (b:real48) e:extended;
{$endif SUPPORT_EXTENDED}
function fma(s1,s2,s3 : single) : single;[internproc:fpc_in_fma_single];
{$ifdef SUPPORT_DOUBLE}
function fma(d1,d2,d3 : double) : double;[internproc:fpc_in_fma_double];
{$endif SUPPORT_DOUBLE}
{$ifdef SUPPORT_EXTENDED}
function fma(e1,e2,e3 : extended) : extended;[internproc:fpc_in_fma_extended];
{$endif SUPPORT_EXTENDED}
{$ifdef SUPPORT_FLOAT128}
function fma(f1,f2,f3 : float128) : float128;[internproc:fpc_in_fma_float128];
{$endif SUPPORT_FLOAT128}

121
tests/test/tfma1.pp Normal file
View File

@ -0,0 +1,121 @@
{ %CPU=i386,x86_64 }
{ %OPT=-Cfavx2 -Cpcoreavx2 }
uses
cpu;
var
d0,d1,d2,d3 : double;
s0,s1,s2,s3 : single;
procedure testsingle;
var
l0,l1,l2,l3 : single;
begin
l1:=2;
l2:=3;
l3:=4;
l0:=fma(l1,l2,l3);
writeln(l0);
if l0<>10.0 then
halt(1);
l0:=fma(s1,l2,l3);
writeln(l0);
if l0<>10.0 then
halt(1);
l0:=fma(l1,s2,l3);
writeln(l0);
if l0<>10.0 then
halt(1);
l0:=fma(l1,l2,s3);
writeln(l0);
if l0<>10.0 then
halt(1);
l0:=fma(s1,s2,l3);
writeln(l0);
if l0<>10.0 then
halt(1);
l0:=fma(s1,l2,s3);
writeln(l0);
if l0<>10.0 then
halt(1);
l0:=fma(l1,s2,s3);
writeln(l0);
if l0<>10.0 then
halt(1);
end;
procedure testdouble;
var
l0,l1,l2,l3 : double;
begin
l1:=2;
l2:=3;
l3:=4;
l0:=fma(l1,l2,l3);
writeln(l0);
if l0<>10.0 then
halt(1);
l0:=fma(d1,l2,l3);
writeln(l0);
if l0<>10.0 then
halt(1);
l0:=fma(l1,d2,l3);
writeln(l0);
if l0<>10.0 then
halt(1);
l0:=fma(l1,l2,d3);
writeln(l0);
if l0<>10.0 then
halt(1);
l0:=fma(d1,d2,l3);
writeln(l0);
if l0<>10.0 then
halt(1);
l0:=fma(d1,l2,d3);
writeln(l0);
if l0<>10.0 then
halt(1);
l0:=fma(l1,d2,d3);
writeln(l0);
if l0<>10.0 then
halt(1);
end;
begin
if AVXSupport and FMASupport then
begin
d1:=2;
d2:=3;
d3:=4;
d0:=fma(d1,d2,d3);
writeln(d0);
if d0<>10.0 then
halt(1);
s1:=2;
s2:=3;
s3:=4;
s0:=fma(s1,s2,s3);
writeln(s0);
if s0<>10.0 then
halt(1);
testsingle;
testdouble;
writeln('ok');
end
else
writeln('Skipped because not supported by the CPU');
end.