mirror of
https://gitlab.com/freepascal.org/fpc/source.git
synced 2025-04-07 21:28:03 +02:00
+ support for FMA intrinsic: if there is no hardware support, the compiler throws an error.
Currently it is implemented only for x86-CPUs supporting the FMA extension. While it would be possible to implement it in software or simulate fma(<single>,<single>,<single>) using double operations, it makes no sense in my opinion to do so. git-svn-id: trunk@27564 -
This commit is contained in:
parent
d404d15c1e
commit
d88d644925
1
.gitattributes
vendored
1
.gitattributes
vendored
@ -11382,6 +11382,7 @@ tests/test/textthr.pp svneol=native#text/plain
|
||||
tests/test/tfillchr.pp svneol=native#text/plain
|
||||
tests/test/tfinal1.pp svneol=native#text/pascal
|
||||
tests/test/tfinal2.pp svneol=native#text/pascal
|
||||
tests/test/tfma1.pp svneol=native#text/pascal
|
||||
tests/test/tforin1.pp svneol=native#text/pascal
|
||||
tests/test/tforin10.pp svneol=native#text/plain
|
||||
tests/test/tforin11.pp svneol=native#text/plain
|
||||
|
@ -115,6 +115,10 @@ const
|
||||
in_arctan_real = 130;
|
||||
in_ln_real = 131;
|
||||
in_sin_real = 132;
|
||||
in_fma_single = 133;
|
||||
in_fma_double = 134;
|
||||
in_fma_extended = 135;
|
||||
in_fma_float128 = 136;
|
||||
|
||||
{ MMX functions }
|
||||
{ these contants are used by the mmx unit }
|
||||
|
@ -137,7 +137,9 @@ type
|
||||
CPUX86_HAS_POPCNT,
|
||||
CPUX86_HAS_AVXUNIT,
|
||||
CPUX86_HAS_LZCNT,
|
||||
CPUX86_HAS_MOVBE
|
||||
CPUX86_HAS_MOVBE,
|
||||
CPUX86_HAS_FMA,
|
||||
CPUX86_HAS_FMA4
|
||||
);
|
||||
|
||||
const
|
||||
@ -151,7 +153,7 @@ type
|
||||
{ cpu_PentiumM } [CPUX86_HAS_SSEUNIT],
|
||||
{ cpu_core_i } [CPUX86_HAS_SSEUNIT,CPUX86_HAS_POPCNT],
|
||||
{ cpu_core_avx } [CPUX86_HAS_SSEUNIT,CPUX86_HAS_POPCNT,CPUX86_HAS_AVXUNIT],
|
||||
{ cpu_core_avx2 } [CPUX86_HAS_SSEUNIT,CPUX86_HAS_POPCNT,CPUX86_HAS_AVXUNIT,CPUX86_HAS_BMI1,CPUX86_HAS_BMI2,CPUX86_HAS_LZCNT,CPUX86_HAS_MOVBE]
|
||||
{ cpu_core_avx2 } [CPUX86_HAS_SSEUNIT,CPUX86_HAS_POPCNT,CPUX86_HAS_AVXUNIT,CPUX86_HAS_BMI1,CPUX86_HAS_BMI2,CPUX86_HAS_LZCNT,CPUX86_HAS_MOVBE,CPUX86_HAS_FMA]
|
||||
);
|
||||
|
||||
|
||||
|
@ -2358,6 +2358,11 @@ cg_e_mod_only_defined_for_pos_quotient=06054_E_In ISO mode, the mod operator is
|
||||
% In ISO pascal, only positive values are allowed for the quotient: \var{n mod m} is only valid if \var{m>0}.
|
||||
cg_d_autoinlining=06055_DL_Auto inlining: $1
|
||||
% Due to auto inlining turned on, the compiler auto inlines this subroutine.
|
||||
cg_e_function_not_support_by_selected_instruction_set=06056_E_The function used, is not supported by the selected instruction set: $1
|
||||
% Some functions cannot be implemented efficiently for certain instruction sets, one example is fused multiply/add.
|
||||
% To avoid very inefficient code, the compiler complains in this case, so either select another instruction set
|
||||
% or replace the function call by alternative code
|
||||
%
|
||||
% \end{description}
|
||||
# EndOfTeX
|
||||
#
|
||||
@ -2615,7 +2620,6 @@ asmr_e_mixing_regtypes=07108_E_All registers in a register set must be of the sa
|
||||
asmr_e_empty_regset=07109_E_A register set cannot be empty
|
||||
% Instructions on the ARM architecture that take a register set as argument require that such a set
|
||||
% contains at least one register.
|
||||
|
||||
asmr_w_useless_got_for_local=07110_W_@GOTPCREL is useless and potentially dangereous for local symbols
|
||||
% The use of @GOTPCREL supposes an extra indirection that is
|
||||
% not present if the symbol is local, which might lead to wrong asembler code
|
||||
|
@ -654,6 +654,7 @@ const
|
||||
cg_e_goto_across_procedures_with_exceptions_not_allowed=06053;
|
||||
cg_e_mod_only_defined_for_pos_quotient=06054;
|
||||
cg_d_autoinlining=06055;
|
||||
cg_e_function_not_support_by_selected_instruction_set=06056;
|
||||
asmr_d_start_reading=07000;
|
||||
asmr_d_finish_reading=07001;
|
||||
asmr_e_none_label_contain_at=07002;
|
||||
@ -985,9 +986,9 @@ const
|
||||
option_info=11024;
|
||||
option_help_pages=11025;
|
||||
|
||||
MsgTxtSize = 71162;
|
||||
MsgTxtSize = 71242;
|
||||
|
||||
MsgIdxMax : array[1..20] of longint=(
|
||||
26,96,337,121,89,56,126,27,202,64,
|
||||
26,96,337,121,89,57,126,27,202,64,
|
||||
57,20,1,1,1,1,1,1,1,1
|
||||
);
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -220,6 +220,7 @@ interface
|
||||
{ a refcounted into a non-refcounted type }
|
||||
function can_be_inlined: boolean;
|
||||
|
||||
property paravalue : tnode read left write left;
|
||||
property nextpara : tnode read right write right;
|
||||
{ third is reused to store the parameter name (only while parsing
|
||||
vardispatch calls, never in real node tree) and copy of 'high'
|
||||
|
@ -60,6 +60,7 @@ interface
|
||||
procedure second_box; virtual; abstract;
|
||||
procedure second_popcnt; virtual;
|
||||
procedure second_seg; virtual; abstract;
|
||||
procedure second_fma; virtual;
|
||||
end;
|
||||
|
||||
implementation
|
||||
@ -190,6 +191,11 @@ implementation
|
||||
second_popcnt;
|
||||
in_seg_x:
|
||||
second_seg;
|
||||
in_fma_single,
|
||||
in_fma_double,
|
||||
in_fma_extended,
|
||||
in_fma_float128:
|
||||
second_fma;
|
||||
else internalerror(9);
|
||||
end;
|
||||
end;
|
||||
@ -768,6 +774,12 @@ implementation
|
||||
end;
|
||||
|
||||
|
||||
procedure tcginlinenode.second_fma;
|
||||
begin
|
||||
internalerror(2014032701);
|
||||
end;
|
||||
|
||||
|
||||
begin
|
||||
cinlinenode:=tcginlinenode;
|
||||
end. s
|
||||
|
@ -45,6 +45,7 @@ interface
|
||||
{ pack and unpack are changed into for-loops by the compiler }
|
||||
function first_pack_unpack: tnode; virtual;
|
||||
|
||||
property parameters : tnode read left write left;
|
||||
protected
|
||||
{ All the following routines currently
|
||||
call compilerprocs, unless they are
|
||||
@ -83,6 +84,7 @@ interface
|
||||
function typecheck_seg: tnode; virtual;
|
||||
function first_seg: tnode; virtual;
|
||||
function first_sar: tnode; virtual;
|
||||
function first_fma : tnode; virtual;
|
||||
private
|
||||
function handle_str: tnode;
|
||||
function handle_reset_rewrite_typed: tnode;
|
||||
@ -3245,6 +3247,16 @@ implementation
|
||||
begin
|
||||
result:=handle_unbox;
|
||||
end;
|
||||
in_fma_single,
|
||||
in_fma_double,
|
||||
in_fma_extended,
|
||||
in_fma_float128:
|
||||
begin
|
||||
set_varstate(tcallparanode(left).left,vs_read,[vsf_must_be_valid]);
|
||||
set_varstate(tcallparanode(tcallparanode(left).right).left,vs_read,[vsf_must_be_valid]);
|
||||
set_varstate(tcallparanode(tcallparanode(tcallparanode(left).right).right).left,vs_read,[vsf_must_be_valid]);
|
||||
resultdef:=tcallparanode(left).left.resultdef;
|
||||
end;
|
||||
else
|
||||
internalerror(8);
|
||||
end;
|
||||
@ -3659,6 +3671,11 @@ implementation
|
||||
result:=first_box;
|
||||
in_unbox_x_y:
|
||||
result:=first_unbox;
|
||||
in_fma_single,
|
||||
in_fma_double,
|
||||
in_fma_extended,
|
||||
in_fma_float128:
|
||||
result:=first_fma;
|
||||
else
|
||||
internalerror(89);
|
||||
end;
|
||||
@ -4218,4 +4235,12 @@ implementation
|
||||
result := loop;
|
||||
end;
|
||||
|
||||
|
||||
function tinlinenode.first_fma: tnode;
|
||||
begin
|
||||
CGMessage1(cg_e_function_not_support_by_selected_instruction_set,'FMA');
|
||||
result:=nil;
|
||||
end;
|
||||
|
||||
end.
|
||||
|
||||
|
@ -44,6 +44,7 @@ interface
|
||||
|
||||
procedure emit_const_reg_reg(i : tasmop;s : topsize;c : longint;reg1,reg2 : tregister);
|
||||
procedure emit_reg_reg_reg(i : tasmop;s : topsize;reg1,reg2,reg3 : tregister);
|
||||
procedure emit_ref_reg_reg(i : tasmop;s : topsize;ref : treference;reg1,reg2 : tregister);
|
||||
|
||||
|
||||
procedure emit_sym(i : tasmop;s : topsize;op : tasmsymbol);
|
||||
@ -124,6 +125,12 @@ implementation
|
||||
current_asmdata.CurrAsmList.concat(Taicpu.Op_reg_reg_reg(i,s,reg1,reg2,reg3));
|
||||
end;
|
||||
|
||||
procedure emit_ref_reg_reg(i : tasmop;s : topsize;ref : treference;reg1,reg2 : tregister);
|
||||
begin
|
||||
tcgx86(cg).make_simple_ref(current_asmdata.CurrAsmList,ref);
|
||||
current_asmdata.CurrAsmList.concat(Taicpu.Op_ref_reg_reg(i,s,ref,reg1,reg2));
|
||||
end;
|
||||
|
||||
procedure emit_sym(i : tasmop;s : topsize;op : tasmsymbol);
|
||||
begin
|
||||
current_asmdata.CurrAsmList.concat(Taicpu.Op_sym(i,s,op));
|
||||
|
@ -185,7 +185,7 @@ unit cgx86;
|
||||
|
||||
function UseAVX: boolean;
|
||||
begin
|
||||
Result:=current_settings.fputype in fpu_avx_instructionsets;
|
||||
Result:=(current_settings.fputype in fpu_avx_instructionsets) or (CPUX86_HAS_AVXUNIT in cpu_capabilities[current_settings.cputype]);
|
||||
end;
|
||||
|
||||
|
||||
|
@ -45,6 +45,7 @@ interface
|
||||
function first_round_real: tnode; override;
|
||||
function first_trunc_real: tnode; override;
|
||||
function first_popcnt: tnode; override;
|
||||
function first_fma: tnode; override;
|
||||
{ second pass override to generate these nodes }
|
||||
procedure second_IncludeExclude;override;
|
||||
procedure second_pi; override;
|
||||
@ -64,6 +65,7 @@ interface
|
||||
procedure second_abs_long;override;
|
||||
{$endif not i8086}
|
||||
procedure second_popcnt;override;
|
||||
procedure second_fma;override;
|
||||
private
|
||||
procedure load_fpu_location(lnode: tnode);
|
||||
end;
|
||||
@ -247,7 +249,20 @@ implementation
|
||||
end;
|
||||
|
||||
|
||||
procedure tx86inlinenode.second_Pi;
|
||||
function tx86inlinenode.first_fma : tnode;
|
||||
begin
|
||||
if ((cpu_capabilities[current_settings.cputype]*[CPUX86_HAS_FMA,CPUX86_HAS_FMA4])<>[]) and
|
||||
((is_double(resultdef)) or (is_single(resultdef))) then
|
||||
begin
|
||||
expectloc:=LOC_MMREGISTER;
|
||||
Result:=nil;
|
||||
end
|
||||
else
|
||||
Result:=inherited first_fma;
|
||||
end;
|
||||
|
||||
|
||||
procedure tx86inlinenode.second_pi;
|
||||
begin
|
||||
location_reset(location,LOC_FPUREGISTER,def_cgsize(resultdef));
|
||||
emit_none(A_FLDPI,S_NO);
|
||||
@ -741,4 +756,85 @@ implementation
|
||||
else
|
||||
emit_ref_reg(A_POPCNT,TCGSize2OpSize[opsize],left.location.reference,location.register);
|
||||
end;
|
||||
|
||||
|
||||
procedure tx86inlinenode.second_fma;
|
||||
const
|
||||
op : array[s32real..s64real,0..3] of TAsmOp = ((A_VFMADD231SS,A_VFMADD231SS,A_VFMADD231SS,A_VFMADD213SS),
|
||||
(A_VFMADD231SD,A_VFMADD231SD,A_VFMADD231SD,A_VFMADD213SD));
|
||||
var
|
||||
paraarray : array[1..3] of tnode;
|
||||
memop,
|
||||
i : integer;
|
||||
gotmem : boolean;
|
||||
begin
|
||||
if (cpu_capabilities[current_settings.cputype]*[CPUX86_HAS_FMA,CPUX86_HAS_FMA4])<>[] then
|
||||
begin
|
||||
paraarray[1]:=tcallparanode(tcallparanode(tcallparanode(parameters).nextpara).nextpara).paravalue;
|
||||
paraarray[2]:=tcallparanode(tcallparanode(parameters).nextpara).paravalue;
|
||||
paraarray[3]:=tcallparanode(parameters).paravalue;
|
||||
|
||||
for i:=1 to 3 do
|
||||
secondpass(paraarray[i]);
|
||||
|
||||
{ only one memory operand is allowed }
|
||||
gotmem:=false;
|
||||
memop:=0;
|
||||
for i:=1 to 3 do
|
||||
begin
|
||||
if not(paraarray[i].location.loc in [LOC_MMREGISTER,LOC_CMMREGISTER]) then
|
||||
begin
|
||||
if (paraarray[i].location.loc in [LOC_REFERENCE,LOC_CREFERENCE]) and not(gotmem) then
|
||||
begin
|
||||
memop:=i;
|
||||
gotmem:=true;
|
||||
end
|
||||
else
|
||||
hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,paraarray[i].location,paraarray[i].resultdef,true);
|
||||
end;
|
||||
end;
|
||||
|
||||
location_reset(location,LOC_MMREGISTER,paraarray[1].location.size);
|
||||
location.register:=cg.getmmregister(current_asmdata.CurrAsmList,location.size);
|
||||
|
||||
if gotmem then
|
||||
begin
|
||||
case memop of
|
||||
1:
|
||||
begin
|
||||
hlcg.a_loadmm_reg_reg(current_asmdata.CurrAsmList,paraarray[3].resultdef,resultdef,
|
||||
paraarray[3].location.register,location.register,mms_movescalar);
|
||||
emit_ref_reg_reg(op[tfloatdef(resultdef).floattype,memop],S_NO,
|
||||
paraarray[1].location.reference,paraarray[2].location.register,location.register);
|
||||
end;
|
||||
2:
|
||||
begin
|
||||
hlcg.a_loadmm_reg_reg(current_asmdata.CurrAsmList,paraarray[3].resultdef,resultdef,
|
||||
paraarray[3].location.register,location.register,mms_movescalar);
|
||||
emit_ref_reg_reg(op[tfloatdef(resultdef).floattype,memop],S_NO,
|
||||
paraarray[2].location.reference,paraarray[1].location.register,location.register);
|
||||
end;
|
||||
3:
|
||||
begin
|
||||
hlcg.a_loadmm_reg_reg(current_asmdata.CurrAsmList,paraarray[1].resultdef,resultdef,
|
||||
paraarray[1].location.register,location.register,mms_movescalar);
|
||||
emit_ref_reg_reg(op[tfloatdef(resultdef).floattype,memop],S_NO,
|
||||
paraarray[3].location.reference,paraarray[2].location.register,location.register);
|
||||
end
|
||||
else
|
||||
internalerror(2014041301);
|
||||
end;
|
||||
end
|
||||
else
|
||||
begin
|
||||
hlcg.a_loadmm_reg_reg(current_asmdata.CurrAsmList,paraarray[3].resultdef,resultdef,
|
||||
paraarray[3].location.register,location.register,mms_movescalar);
|
||||
emit_reg_reg_reg(op[tfloatdef(resultdef).floattype,0],S_NO,
|
||||
paraarray[1].location.register,paraarray[2].location.register,location.register);
|
||||
end;
|
||||
end
|
||||
else
|
||||
internalerror(2014032301);
|
||||
end;
|
||||
|
||||
end.
|
||||
|
@ -125,7 +125,9 @@ type
|
||||
CPUX86_HAS_POPCNT,
|
||||
CPUX86_HAS_AVXUNIT,
|
||||
CPUX86_HAS_LZCNT,
|
||||
CPUX86_HAS_MOVBE
|
||||
CPUX86_HAS_MOVBE,
|
||||
CPUX86_HAS_FMA,
|
||||
CPUX86_HAS_FMA4
|
||||
);
|
||||
|
||||
const
|
||||
@ -134,7 +136,7 @@ type
|
||||
{ Athlon64 } [CPUX86_HAS_SSEUNIT],
|
||||
{ cpu_core_i } [CPUX86_HAS_SSEUNIT,CPUX86_HAS_POPCNT],
|
||||
{ cpu_core_avx } [CPUX86_HAS_SSEUNIT,CPUX86_HAS_POPCNT,CPUX86_HAS_AVXUNIT],
|
||||
{ cpu_core_avx2 } [CPUX86_HAS_SSEUNIT,CPUX86_HAS_POPCNT,CPUX86_HAS_AVXUNIT,CPUX86_HAS_BMI1,CPUX86_HAS_BMI2,CPUX86_HAS_LZCNT,CPUX86_HAS_MOVBE]
|
||||
{ cpu_core_avx2 } [CPUX86_HAS_SSEUNIT,CPUX86_HAS_POPCNT,CPUX86_HAS_AVXUNIT,CPUX86_HAS_BMI1,CPUX86_HAS_BMI2,CPUX86_HAS_LZCNT,CPUX86_HAS_MOVBE,CPUX86_HAS_FMA]
|
||||
);
|
||||
|
||||
Implementation
|
||||
|
@ -115,6 +115,10 @@ const
|
||||
fpc_in_arctan_real = 130;
|
||||
fpc_in_ln_real = 131;
|
||||
fpc_in_sin_real = 132;
|
||||
fpc_in_fma_single = 133;
|
||||
fpc_in_fma_double = 134;
|
||||
fpc_in_fma_extended = 135;
|
||||
fpc_in_fma_float128 = 136;
|
||||
|
||||
{ MMX functions }
|
||||
{ these contants are used by the mmx unit }
|
||||
|
@ -109,3 +109,15 @@ procedure float_raise(i: TFPUExceptionMask);
|
||||
operator := (b:real48) e:extended;
|
||||
{$endif SUPPORT_EXTENDED}
|
||||
|
||||
|
||||
function fma(s1,s2,s3 : single) : single;[internproc:fpc_in_fma_single];
|
||||
{$ifdef SUPPORT_DOUBLE}
|
||||
function fma(d1,d2,d3 : double) : double;[internproc:fpc_in_fma_double];
|
||||
{$endif SUPPORT_DOUBLE}
|
||||
{$ifdef SUPPORT_EXTENDED}
|
||||
function fma(e1,e2,e3 : extended) : extended;[internproc:fpc_in_fma_extended];
|
||||
{$endif SUPPORT_EXTENDED}
|
||||
{$ifdef SUPPORT_FLOAT128}
|
||||
function fma(f1,f2,f3 : float128) : float128;[internproc:fpc_in_fma_float128];
|
||||
{$endif SUPPORT_FLOAT128}
|
||||
|
||||
|
121
tests/test/tfma1.pp
Normal file
121
tests/test/tfma1.pp
Normal file
@ -0,0 +1,121 @@
|
||||
{ %CPU=i386,x86_64 }
|
||||
{ %OPT=-Cfavx2 -Cpcoreavx2 }
|
||||
uses
|
||||
cpu;
|
||||
var
|
||||
d0,d1,d2,d3 : double;
|
||||
s0,s1,s2,s3 : single;
|
||||
|
||||
procedure testsingle;
|
||||
var
|
||||
l0,l1,l2,l3 : single;
|
||||
begin
|
||||
l1:=2;
|
||||
l2:=3;
|
||||
l3:=4;
|
||||
l0:=fma(l1,l2,l3);
|
||||
writeln(l0);
|
||||
if l0<>10.0 then
|
||||
halt(1);
|
||||
|
||||
l0:=fma(s1,l2,l3);
|
||||
writeln(l0);
|
||||
if l0<>10.0 then
|
||||
halt(1);
|
||||
|
||||
l0:=fma(l1,s2,l3);
|
||||
writeln(l0);
|
||||
if l0<>10.0 then
|
||||
halt(1);
|
||||
|
||||
l0:=fma(l1,l2,s3);
|
||||
writeln(l0);
|
||||
if l0<>10.0 then
|
||||
halt(1);
|
||||
|
||||
l0:=fma(s1,s2,l3);
|
||||
writeln(l0);
|
||||
if l0<>10.0 then
|
||||
halt(1);
|
||||
|
||||
l0:=fma(s1,l2,s3);
|
||||
writeln(l0);
|
||||
if l0<>10.0 then
|
||||
halt(1);
|
||||
|
||||
l0:=fma(l1,s2,s3);
|
||||
writeln(l0);
|
||||
if l0<>10.0 then
|
||||
halt(1);
|
||||
end;
|
||||
|
||||
procedure testdouble;
|
||||
var
|
||||
l0,l1,l2,l3 : double;
|
||||
begin
|
||||
l1:=2;
|
||||
l2:=3;
|
||||
l3:=4;
|
||||
l0:=fma(l1,l2,l3);
|
||||
writeln(l0);
|
||||
if l0<>10.0 then
|
||||
halt(1);
|
||||
|
||||
l0:=fma(d1,l2,l3);
|
||||
writeln(l0);
|
||||
if l0<>10.0 then
|
||||
halt(1);
|
||||
|
||||
l0:=fma(l1,d2,l3);
|
||||
writeln(l0);
|
||||
if l0<>10.0 then
|
||||
halt(1);
|
||||
|
||||
l0:=fma(l1,l2,d3);
|
||||
writeln(l0);
|
||||
if l0<>10.0 then
|
||||
halt(1);
|
||||
|
||||
l0:=fma(d1,d2,l3);
|
||||
writeln(l0);
|
||||
if l0<>10.0 then
|
||||
halt(1);
|
||||
|
||||
l0:=fma(d1,l2,d3);
|
||||
writeln(l0);
|
||||
if l0<>10.0 then
|
||||
halt(1);
|
||||
|
||||
l0:=fma(l1,d2,d3);
|
||||
writeln(l0);
|
||||
if l0<>10.0 then
|
||||
halt(1);
|
||||
end;
|
||||
|
||||
begin
|
||||
if AVXSupport and FMASupport then
|
||||
begin
|
||||
d1:=2;
|
||||
d2:=3;
|
||||
d3:=4;
|
||||
d0:=fma(d1,d2,d3);
|
||||
writeln(d0);
|
||||
if d0<>10.0 then
|
||||
halt(1);
|
||||
|
||||
s1:=2;
|
||||
s2:=3;
|
||||
s3:=4;
|
||||
s0:=fma(s1,s2,s3);
|
||||
writeln(s0);
|
||||
if s0<>10.0 then
|
||||
halt(1);
|
||||
|
||||
testsingle;
|
||||
testdouble;
|
||||
|
||||
writeln('ok');
|
||||
end
|
||||
else
|
||||
writeln('Skipped because not supported by the CPU');
|
||||
end.
|
Loading…
Reference in New Issue
Block a user