+ implementation of the vectorcall calling convention by J. Gareth Moreton

+ tests

git-svn-id: trunk@38206 -
This commit is contained in:
florian 2018-02-11 17:50:37 +00:00
parent c63c3e99c7
commit 31f78ea2b6
26 changed files with 2531 additions and 251 deletions

3
.gitattributes vendored
View File

@ -11999,6 +11999,9 @@ tests/test/cg/ttryfin4.pp svneol=native#text/plain
tests/test/cg/ttryfin5.pp svneol=native#text/plain
tests/test/cg/tumin.pp svneol=native#text/plain
tests/test/cg/tvec.pp svneol=native#text/plain
tests/test/cg/tvectorcall1.pp svneol=native#text/pascal
tests/test/cg/tvectorcall2.pp svneol=native#text/pascal
tests/test/cg/tvectorcall3.pp svneol=native#text/pascal
tests/test/cg/uandorxorassign.pp svneol=native#text/plain
tests/test/cg/unegnotassign.pp svneol=native#text/plain
tests/test/cg/uprintf3.pp svneol=native#text/plain

View File

@ -164,14 +164,18 @@ interface
{ OS_NO is also used memory references with large data that can
not be loaded in a register directly }
TCgSize = (OS_NO,
{ integer registers }
OS_8,OS_16,OS_32,OS_64,OS_128,OS_S8,OS_S16,OS_S32,OS_S64,OS_S128,
{ single,double,extended,comp,float128 }
OS_F32,OS_F64,OS_F80,OS_C64,OS_F128,
OS_8, OS_16, OS_32, OS_64, OS_128,
OS_S8, OS_S16, OS_S32, OS_S64, OS_S128,
{ single, double, extended, comp, float128 }
OS_F32, OS_F64, OS_F80, OS_C64, OS_F128,
{ multi-media sizes: split in byte, word, dword, ... }
{ entities, then the signed counterparts }
OS_M8,OS_M16,OS_M32,OS_M64,OS_M128,OS_M256,
OS_MS8,OS_MS16,OS_MS32,OS_MS64,OS_MS128,OS_MS256 );
OS_M8, OS_M16, OS_M32, OS_M64, OS_M128, OS_M256, OS_M512,
OS_MS8, OS_MS16, OS_MS32, OS_MS64, OS_MS128, OS_MS256, OS_MS512,
{ multi-media sizes: single-precision floating-point }
OS_MF32, OS_MF128, OS_MF256, OS_MF512,
{ multi-media sizes: double-precision floating-point }
OS_MD64, OS_MD128, OS_MD256, OS_MD512);
{ Register types }
TRegisterType = (
@ -205,15 +209,16 @@ interface
{ For Intel X86 AVX-Register }
R_SUBMMX, { = 12; 128 BITS }
R_SUBMMY, { = 13; 256 BITS }
R_SUBMMZ, { = 14; 512 BITS }
{ Subregisters for the flags register (x86) }
R_SUBFLAGCARRY, { = 14; Carry flag }
R_SUBFLAGPARITY, { = 15; Parity flag }
R_SUBFLAGAUXILIARY, { = 16; Auxiliary flag }
R_SUBFLAGZERO, { = 17; Zero flag }
R_SUBFLAGSIGN, { = 18; Sign flag }
R_SUBFLAGOVERFLOW, { = 19; Overflow flag }
R_SUBFLAGINTERRUPT, { = 20; Interrupt enable flag }
R_SUBFLAGDIRECTION { = 21; Direction flag }
R_SUBFLAGCARRY, { = 15; Carry flag }
R_SUBFLAGPARITY, { = 16; Parity flag }
R_SUBFLAGAUXILIARY, { = 17; Auxiliary flag }
R_SUBFLAGZERO, { = 18; Zero flag }
R_SUBFLAGSIGN, { = 19; Sign flag }
R_SUBFLAGOVERFLOW, { = 20; Overflow flag }
R_SUBFLAGINTERRUPT, { = 21; Interrupt enable flag }
R_SUBFLAGDIRECTION { = 22; Direction flag }
);
TSubRegisterSet = set of TSubRegister;
@ -307,12 +312,19 @@ interface
NR_INVALID = tregister($fffffffff);
tcgsize2size : Array[tcgsize] of integer =
(0,
{ integer values }
(0,1,2,4,8,16,1,2,4,8,16,
1, 2, 4, 8, 16,
1, 2, 4, 8, 16,
{ floating point values }
4,8,10,8,16,
4, 8, 10, 8, 16,
{ multimedia values }
1,2,4,8,16,32,1,2,4,8,16,32);
1, 2, 4, 8, 16, 32, 64,
1, 2, 4, 8, 16, 32, 64,
{ single-precision multimedia values }
4, 16, 32, 64,
{ double-precision multimedia values }
8, 16, 32, 64);
tfloat2tcgsize: array[tfloattype] of tcgsize =
(OS_F32,OS_F64,OS_F80,OS_F80,OS_C64,OS_C64,OS_F128);
@ -348,16 +360,25 @@ interface
{ Table to convert tcgsize variables to the correspondending
unsigned types }
tcgsize2unsigned : array[tcgsize] of tcgsize = (OS_NO,
OS_8,OS_16,OS_32,OS_64,OS_128,OS_8,OS_16,OS_32,OS_64,OS_128,
OS_F32,OS_F64,OS_F80,OS_C64,OS_F128,
OS_M8,OS_M16,OS_M32,OS_M64,OS_M128,OS_M256,OS_M8,OS_M16,OS_M32,
OS_M64,OS_M128,OS_M256);
OS_8, OS_16, OS_32, OS_64, OS_128,
OS_8, OS_16, OS_32, OS_64, OS_128,
OS_F32, OS_F64, OS_F80, OS_C64, OS_F128,
OS_M8, OS_M16, OS_M32, OS_M64, OS_M128, OS_M256, OS_M512,
OS_M8, OS_M16, OS_M32, OS_M64, OS_M128, OS_M256, OS_M512,
OS_MF32, OS_MF128,OS_MF256,OS_MF512,
OS_MD64, OS_MD128,OS_MD256,OS_MD512);
tcgsize2signed : array[tcgsize] of tcgsize = (OS_NO,
OS_S8,OS_S16,OS_S32,OS_S64,OS_S128,OS_S8,OS_S16,OS_S32,OS_S64,OS_S128,
OS_F32,OS_F64,OS_F80,OS_C64,OS_F128,
OS_M8,OS_M16,OS_M32,OS_M64,OS_M128,OS_M256,OS_M8,OS_M16,OS_M32,
OS_M64,OS_M128,OS_M256);
OS_S8, OS_S16, OS_S32, OS_S64, OS_S128,
OS_S8, OS_S16, OS_S32, OS_S64, OS_S128,
OS_F32, OS_F64, OS_F80, OS_C64, OS_F128,
OS_MS8, OS_MS16, OS_MS32, OS_MS64, OS_MS128,OS_MS256,OS_MS512,
OS_MS8, OS_MS16, OS_MS32, OS_MS64, OS_MS128,OS_MS256,OS_MS512,
OS_MF32, OS_MF128,OS_MF256,OS_MF512,
OS_MD64, OS_MD128,OS_MD256,OS_MD512);
tcgloc2str : array[TCGLoc] of string[12] = (
@ -404,6 +425,8 @@ interface
}
function int_cgsize(const a: tcgint): tcgsize;{$ifdef USEINLINE}inline;{$endif}
function int_float_cgsize(const a: tcgint): tcgsize;
function float_array_cgsize(const a: tcgint): tcgsize;{$ifdef USEINLINE}inline;{$endif}
function double_array_cgsize(const a: tcgint): tcgsize;{$ifdef USEINLINE}inline;{$endif}
function tcgsize2str(cgsize: tcgsize):string;
@ -660,6 +683,8 @@ implementation
result:=result+'mx';
R_SUBMMY:
result:=result+'my';
R_SUBMMZ:
result:=result+'mz';
else
internalerror(200308252);
end;
@ -701,6 +726,39 @@ implementation
end;
function float_array_cgsize(const a: tcgint): tcgsize;{$ifdef USEINLINE}inline;{$endif}
begin
case a of
4:
result := OS_MF32;
16:
result := OS_MF128;
32:
result := OS_MF256;
64:
result := OS_MF512;
else
result := int_cgsize(a);
end;
end;
function double_array_cgsize(const a: tcgint): tcgsize;{$ifdef USEINLINE}inline;{$endif}
begin
case a of
8:
result := OS_MD64;
16:
result := OS_MD128;
32:
result := OS_MD256;
64:
result := OS_MD512;
else
result := int_cgsize(a);
end;
end;
function tcgsize2str(cgsize: tcgsize):string;
begin
Str(cgsize, Result);

View File

@ -1338,7 +1338,24 @@ implementation
arraydef :
begin
if is_dynamic_array(def) or not is_special_array(def) then
result := int_cgsize(def.size)
begin
if (cs_support_vectors in current_settings.globalswitches) and is_vector(def) and ((TArrayDef(def).elementdef.typ = floatdef) and not (cs_fp_emulation in current_settings.moduleswitches)) then
begin
{ Determine if, based on the floating-point type and the size
of the array, if it can be made into a vector }
case TFloatDef(def).floattype of
s32real:
result := float_array_cgsize(def.size);
s64real:
result := double_array_cgsize(def.size);
else
{ If not, fall back }
result := int_cgsize(def.size);
end;
end
else
result := int_cgsize(def.size);
end
else
result := OS_NO;
end;
@ -1379,25 +1396,53 @@ implementation
case def.typ of
arraydef:
begin
if tarraydef(def).elementdef.typ in [orddef,floatdef] then
begin
{ this is not correct, OS_MX normally mean that the vector
contains elements of size X. However, vectors themselves
can also have different sizes (e.g. a vector of 2 singles on
SSE) and the total size is currently more important }
case def.size of
1: result:=OS_M8;
2: result:=OS_M16;
4: result:=OS_M32;
8: result:=OS_M64;
16: result:=OS_M128;
32: result:=OS_M256;
else
internalerror(2013060103);
case tarraydef(def).elementdef.typ of
orddef:
begin
{ this is not correct, OS_MX normally mean that the vector
contains elements of size X. However, vectors themselves
can also have different sizes (e.g. a vector of 2 singles on
SSE) and the total size is currently more important }
case def.size of
1: result:=OS_M8;
2: result:=OS_M16;
4: result:=OS_M32;
8: result:=OS_M64;
16: result:=OS_M128;
32: result:=OS_M256;
64: result:=OS_M512;
else
internalerror(2013060103);
end;
end;
end
else
result:=def_cgsize(def);
floatdef:
begin
case TFloatDef(tarraydef(def).elementdef).floattype of
s32real:
case def.size of
4: result:=OS_MF32;
16: result:=OS_MF128;
32: result:=OS_MF256;
64: result:=OS_MF512;
else
internalerror(2017121400);
end;
s64real:
case def.size of
8: result:=OS_MD64;
16: result:=OS_MD128;
32: result:=OS_MD256;
64: result:=OS_MD512;
else
internalerror(2017121401);
end;
else
internalerror(2017121402);
end;
end;
else
result:=def_cgsize(def);
end;
end
else
result:=def_cgsize(def);

View File

@ -1112,7 +1112,8 @@ implementation
'SYSV_ABI_DEFAULT',
'SYSV_ABI_CDECL',
'MS_ABI_DEFAULT',
'MS_ABI_CDECL'
'MS_ABI_CDECL',
'VECTORCALL'
);
var
t : tproccalloption;

View File

@ -539,7 +539,9 @@ interface
pocall_sysv_abi_cdecl,
{ for x86-64: forces Microsoft ABI (Pascal resp. C) }
pocall_ms_abi_default,
pocall_ms_abi_cdecl
pocall_ms_abi_cdecl,
{ for x86-64: Microsoft's "vectorcall" ABI }
pocall_vectorcall
);
tproccalloptions = set of tproccalloption;
@ -560,9 +562,10 @@ interface
'Interrupt',
'HardFloat',
'SysV_ABI_Default',
'MS_ABI_CDecl',
'MS_ABI_CDecl', { TODO: Is this correct? Shouldn't it be SysV_ABI_Default }
'MS_ABI_Default',
'MS_ABI_CDecl'
'MS_ABI_CDecl',
'VectorCall'
);
{ Default calling convention }

View File

@ -1538,6 +1538,8 @@ implementation
result:=OS_F32;
OS_64:
result:=OS_F64;
OS_128:
result:=OS_M128;
end;
end;
end;

View File

@ -35,7 +35,8 @@
S_NEAR,S_FAR,S_SHORT,
S_T,
S_XMM,
S_YMM
S_YMM,
S_ZMM
);
TOpSizes = set of topsize;

View File

@ -35,7 +35,8 @@
S_NEAR,S_FAR,S_SHORT,
S_T,
S_XMM,
S_YMM
S_YMM,
S_ZMM
);
TOpSizes = set of topsize;

View File

@ -682,6 +682,7 @@ implementation
procedure tcgassignmentnode.pass_generate_code;
var
shuffle : pmmshuffle;
hlabel : tasmlabel;
href : treference;
releaseright : boolean;
@ -968,22 +969,21 @@ implementation
LOC_MMREGISTER,
LOC_CMMREGISTER:
begin
if left.resultdef.typ=arraydef then
begin
end
if (is_vector(left.resultdef)) then
shuffle := nil
else
begin
case left.location.loc of
LOC_CMMREGISTER,
LOC_MMREGISTER:
hlcg.a_loadmm_reg_reg(current_asmdata.CurrAsmList,right.resultdef,left.resultdef,right.location.register,left.location.register,mms_movescalar);
LOC_REFERENCE,
LOC_CREFERENCE:
hlcg.a_loadmm_reg_ref(current_asmdata.CurrAsmList,right.resultdef,left.resultdef,right.location.register,left.location.reference,mms_movescalar);
else
internalerror(2009112601);
end;
end;
shuffle := mms_movescalar;
case left.location.loc of
LOC_CMMREGISTER,
LOC_MMREGISTER:
hlcg.a_loadmm_reg_reg(current_asmdata.CurrAsmList,right.resultdef,left.resultdef,right.location.register,left.location.register, shuffle);
LOC_REFERENCE,
LOC_CREFERENCE:
hlcg.a_loadmm_reg_ref(current_asmdata.CurrAsmList,right.resultdef,left.resultdef,right.location.register,left.location.reference, shuffle);
else
internalerror(2009112601);
end;
end;
LOC_REGISTER,
LOC_CREGISTER :

View File

@ -335,7 +335,8 @@ implementation
{ pocall_sysv_abi_default } 14,
{ pocall_sysv_abi_cdecl } 15,
{ pocall_ms_abi_default } 16,
{ pocall_ms_abi_cdecl } 17
{ pocall_ms_abi_cdecl } 17,
{ pocall_vectorcall } 18
);
begin
tcb.emit_ord_const(ProcCallOptionToCallConv[def.proccalloption],u8inttype);

View File

@ -2382,7 +2382,7 @@ type
end;
const
{Should contain the number of procedure directives we support.}
num_proc_directives=50;
num_proc_directives=51;
proc_direcdata:array[1..num_proc_directives] of proc_dir_rec=
(
(
@ -2849,6 +2849,15 @@ const
mutexclpocall : [];
mutexclpotype : [potype_constructor,potype_destructor,potype_class_constructor,potype_class_destructor];
mutexclpo : [po_interrupt]
),(
idtok:_VECTORCALL;
pd_flags : [pd_interface,pd_implemen,pd_body,pd_procvar];
handler : nil;
pocall : pocall_vectorcall;
pooption : [];
mutexclpocall : [];
mutexclpotype : [potype_constructor,potype_destructor,potype_class_constructor,potype_class_destructor];
mutexclpo : [po_interrupt]
)
);

View File

@ -82,6 +82,7 @@ interface
function alignment:shortint;virtual;abstract;
{ alignment when this type appears in a record/class/... }
function structalignment:shortint;virtual;
function aggregatealignment:shortint;virtual;
function getvardef:longint;virtual;abstract;
function getparentdef:tdef;virtual;
function getsymtable(t:tgetsymtable):TSymtable;virtual;
@ -379,6 +380,14 @@ implementation
result:=alignment;
end;
function tdef.aggregatealignment: shortint;
begin
if Assigned(Owner) and Assigned(Owner.defowner) and (Owner.defowner is TDef) and (Owner.defowner <> Self) then
Result := max(structalignment, TDef(Owner.defowner).aggregatealignment)
else
Result := structalignment;
end;
procedure tdef.ChangeOwner(st:TSymtable);
begin

View File

@ -289,6 +289,7 @@ type
_OPENSTRING,
_RIGHTSHIFT,
_SPECIALIZE,
_VECTORCALL,
_CONSTRUCTOR,
_GREATERTHAN,
_INTERNCONST,
@ -628,6 +629,7 @@ const
(str:'OPENSTRING' ;special:false;keyword:[m_none];op:NOTOKEN),
(str:'RIGHTSHIFT' ;special:false;keyword:[m_none];op:NOTOKEN), { delphi operator name }
(str:'SPECIALIZE' ;special:false;keyword:[m_none];op:NOTOKEN),
(str:'VECTORCALL' ;special:false;keyword:[m_none];op:NOTOKEN),
(str:'CONSTRUCTOR' ;special:false;keyword:alllanguagemodes-[m_iso,m_extpas];op:NOTOKEN),
(str:'GREATERTHAN' ;special:false;keyword:[m_none];op:NOTOKEN), { delphi operator name }
(str:'INTERNCONST' ;special:false;keyword:[m_none];op:NOTOKEN),

View File

@ -52,6 +52,7 @@ interface
OT_BITS64 = $00000008; { x86_64 and FPU }
OT_BITS128 = $10000000; { 16 byte SSE }
OT_BITS256 = $20000000; { 32 byte AVX }
OT_BITS512 = $40000000; { 64 byte AVX512 }
OT_BITS80 = $00000010; { FPU only }
OT_FAR = $00000020; { this means 16:16 or 16:32, like in CALL/JMP }
OT_NEAR = $00000040;
@ -612,7 +613,8 @@ implementation
OT_NEAR,OT_FAR,OT_SHORT,
OT_NONE,
OT_BITS128,
OT_BITS256
OT_BITS256,
OT_BITS512
),
(OT_NONE,
OT_BITS8,OT_BITS16,OT_BITS32,OT_BITS64,OT_BITS8,OT_BITS8,OT_BITS16,OT_BITS8,OT_BITS16,OT_BITS32,
@ -622,7 +624,8 @@ implementation
OT_NEAR,OT_FAR,OT_SHORT,
OT_NONE,
OT_BITS128,
OT_BITS256
OT_BITS256,
OT_BITS512
),
(OT_NONE,
OT_BITS8,OT_BITS16,OT_BITS32,OT_BITS64,OT_NONE,OT_NONE,OT_NONE,OT_NONE,OT_NONE,OT_NONE,
@ -632,7 +635,8 @@ implementation
OT_NEAR,OT_FAR,OT_SHORT,
OT_NONE,
OT_BITS128,
OT_BITS256
OT_BITS256,
OT_BITS512
)
);
@ -650,7 +654,8 @@ implementation
OT_NEAR,OT_FAR,OT_SHORT,
OT_NONE,
OT_BITS128,
OT_BITS256
OT_BITS256,
OT_BITS512
),
(OT_NONE,
OT_BITS8,OT_BITS16,OT_BITS32,OT_BITS64,OT_BITS8,OT_BITS8,OT_BITS16,
@ -660,7 +665,8 @@ implementation
OT_NEAR,OT_FAR,OT_SHORT,
OT_NONE,
OT_BITS128,
OT_BITS256
OT_BITS256,
OT_BITS512
),
(OT_NONE,
OT_BITS8,OT_BITS16,OT_BITS32,OT_BITS64,OT_NONE,OT_NONE,OT_NONE,
@ -670,7 +676,8 @@ implementation
OT_NEAR,OT_FAR,OT_SHORT,
OT_NONE,
OT_BITS128,
OT_BITS256
OT_BITS256,
OT_BITS512
)
);
@ -688,7 +695,8 @@ implementation
OT_NEAR,OT_FAR,OT_SHORT,
OT_NONE,
OT_BITS128,
OT_BITS256
OT_BITS256,
OT_BITS512
),
(OT_NONE,
OT_BITS8,OT_BITS16,OT_BITS32,OT_BITS64,OT_BITS8,OT_BITS8,OT_BITS16,
@ -698,7 +706,8 @@ implementation
OT_NEAR,OT_FAR,OT_SHORT,
OT_NONE,
OT_BITS128,
OT_BITS256
OT_BITS256,
OT_BITS512
),
(OT_NONE,
OT_BITS8,OT_BITS16,OT_BITS32,OT_BITS64,OT_NONE,OT_NONE,OT_NONE,
@ -708,7 +717,8 @@ implementation
OT_NEAR,OT_FAR,OT_SHORT,
OT_NONE,
OT_BITS128,
OT_BITS256
OT_BITS256,
OT_BITS512
)
);

View File

@ -158,20 +158,26 @@ unit cgx86;
TCGSize2OpSize: Array[tcgsize] of topsize =
(S_NO,S_B,S_W,S_L,S_Q,S_XMM,S_B,S_W,S_L,S_Q,S_XMM,
S_FS,S_FL,S_FX,S_IQ,S_FXX,
S_NO,S_NO,S_NO,S_MD,S_XMM,S_YMM,
S_NO,S_NO,S_NO,S_NO,S_XMM,S_YMM);
S_NO,S_NO,S_NO,S_MD,S_XMM,S_YMM,S_ZMM,
S_NO,S_NO,S_NO,S_NO,S_XMM,S_YMM,S_ZMM,
S_NO,S_XMM,S_YMM,S_ZMM,
S_NO,S_XMM,S_YMM,S_ZMM);
{$elseif defined(i386)}
TCGSize2OpSize: Array[tcgsize] of topsize =
(S_NO,S_B,S_W,S_L,S_L,S_T,S_B,S_W,S_L,S_L,S_L,
S_FS,S_FL,S_FX,S_IQ,S_FXX,
S_NO,S_NO,S_NO,S_MD,S_XMM,S_YMM,
S_NO,S_NO,S_NO,S_NO,S_XMM,S_YMM);
S_NO,S_NO,S_NO,S_MD,S_XMM,S_YMM,S_ZMM,
S_NO,S_NO,S_NO,S_NO,S_XMM,S_YMM,S_ZMM,
S_NO,S_XMM,S_YMM,S_ZMM,
S_NO,S_XMM,S_YMM,S_ZMM);
{$elseif defined(i8086)}
TCGSize2OpSize: Array[tcgsize] of topsize =
(S_NO,S_B,S_W,S_W,S_W,S_T,S_B,S_W,S_W,S_W,S_W,
S_FS,S_FL,S_FX,S_IQ,S_FXX,
S_NO,S_NO,S_NO,S_MD,S_XMM,S_YMM,
S_NO,S_NO,S_NO,S_NO,S_XMM,S_YMM);
S_NO,S_NO,S_NO,S_MD,S_XMM,S_YMM,S_ZMM,
S_NO,S_NO,S_NO,S_NO,S_XMM,S_YMM,S_ZMM,
S_NO,S_XMM,S_YMM,S_ZMM,
S_NO,S_XMM,S_YMM,S_ZMM);
{$endif}
{$ifndef NOTARGETWIN}
@ -185,6 +191,9 @@ unit cgx86;
{ returns true, if the compiler should use leave instead of mov/pop }
function UseLeave: boolean;
{ Gets the byte alignment of a reference }
function GetRefAlignment(ref: treference): Byte;
implementation
uses
@ -225,6 +234,22 @@ unit cgx86;
{$endif}
end;
function GetRefAlignment(ref: treference): Byte; {$IFDEF USEINLINE}inline;{$ENDIF}
begin
{$ifdef x86_64}
{ The stack pointer and base pointer will be aligned to 16-byte boundaries if the machine code is well-behaved }
if (ref.base = NR_RSP) or (ref.base = NR_RBP) then
begin
if (ref.index = NR_NO) and ((ref.offset mod 16) = 0) then
Result := 16
else
Result := ref.alignment;
end
else
{$endif x86_64}
Result := ref.alignment;
end;
const
TOpCG2AsmOp: Array[topcg] of TAsmOp = (A_NONE,A_MOV,A_ADD,A_AND,A_DIV,
A_IDIV,A_IMUL,A_MUL,A_NEG,A_NOT,A_OR,
@ -268,8 +293,19 @@ unit cgx86;
result:=rg[R_MMREGISTER].getregister(list,R_SUBMMS);
OS_M64:
result:=rg[R_MMREGISTER].getregister(list,R_SUBQ);
OS_M128:
result:=rg[R_MMREGISTER].getregister(list,R_SUBMMWHOLE);
OS_M128,
OS_F128,
OS_MF128,
OS_MD128:
result:=rg[R_MMREGISTER].getregister(list,R_SUBMMX); { R_SUBMMWHOLE seems a bit dangerous and ambiguous, so changed to R_SUBMMX. [Kit] }
OS_M256,
OS_MF256,
OS_MD256:
result:=rg[R_MMREGISTER].getregister(list,R_SUBMMY);
OS_M512,
OS_MF512,
OS_MD512:
result:=rg[R_MMREGISTER].getregister(list,R_SUBMMZ);
else
internalerror(200506041);
end;
@ -1260,13 +1296,13 @@ unit cgx86;
(A_CVTSD2SS,A_MOVSD,A_NONE,A_NONE,A_NONE),
(A_NONE,A_NONE,A_NONE,A_NONE,A_NONE),
(A_NONE,A_NONE,A_NONE,A_MOVQ,A_NONE),
(A_NONE,A_NONE,A_NONE,A_NONE,A_NONE));
(A_NONE,A_NONE,A_NONE,A_NONE,A_MOVAPS));
convertopavx : array[OS_F32..OS_F128,OS_F32..OS_F128] of tasmop = (
(A_VMOVSS,A_VCVTSS2SD,A_NONE,A_NONE,A_NONE),
(A_VCVTSD2SS,A_VMOVSD,A_NONE,A_NONE,A_NONE),
(A_NONE,A_NONE,A_NONE,A_NONE,A_NONE),
(A_NONE,A_NONE,A_NONE,A_MOVQ,A_NONE),
(A_NONE,A_NONE,A_NONE,A_NONE,A_NONE));
(A_NONE,A_NONE,A_NONE,A_NONE,A_VMOVAPS));
begin
{ we can have OS_F32/OS_F64 (record in function result/LOC_MMREGISTER) to
OS_32/OS_64 (record in memory/LOC_REFERENCE) }
@ -1288,13 +1324,33 @@ unit cgx86;
end
{ we can have OS_M64 (record in function result/LOC_MMREGISTER) to
OS_64 (record in memory/LOC_REFERENCE) }
else if (tcgsize2size[fromsize]=tcgsize2size[tosize]) and
(fromsize=OS_M64) then
else if (tcgsize2size[fromsize]=tcgsize2size[tosize]) then
begin
if UseAVX then
result:=A_VMOVQ
else
result:=A_MOVQ;
case fromsize of
OS_M64:
{ we can have OS_M64 (record in function result/LOC_MMREGISTER) to
OS_64 (record in memory/LOC_REFERENCE) }
if UseAVX then
result:=A_VMOVQ
else
result:=A_MOVQ;
OS_M128:
{ 128-bit aligned vector }
if UseAVX then
result:=A_VMOVAPS
else
result:=A_MOVAPS;
OS_M256,
OS_M512:
{ 256-bit aligned vector }
if UseAVX then
result:=A_VMOVAPS
else
{ SSE does not support 256-bit or 512-bit vectors }
InternalError(2018012930);
else
InternalError(2018012920);
end;
end
else
internalerror(2010060104);
@ -1313,12 +1369,14 @@ unit cgx86;
if fromsize=tosize then
{ needs correct size in case of spilling }
case fromsize of
OS_F32:
OS_F32,
OS_MF128:
if UseAVX then
instr:=taicpu.op_reg_reg(A_VMOVAPS,S_NO,reg1,reg2)
else
instr:=taicpu.op_reg_reg(A_MOVAPS,S_NO,reg1,reg2);
OS_F64:
OS_F64,
OS_MD128:
if UseAVX then
instr:=taicpu.op_reg_reg(A_VMOVAPD,S_NO,reg1,reg2)
else
@ -1328,6 +1386,32 @@ unit cgx86;
instr:=taicpu.op_reg_reg(A_VMOVQ,S_NO,reg1,reg2)
else
instr:=taicpu.op_reg_reg(A_MOVQ,S_NO,reg1,reg2);
OS_M128, OS_MS128:
if UseAVX then
instr:=taicpu.op_reg_reg(A_VMOVDQA,S_NO,reg1,reg2)
else
instr:=taicpu.op_reg_reg(A_MOVDQA,S_NO,reg1,reg2);
OS_MF256,
OS_MF512:
if UseAVX then
instr:=taicpu.op_reg_reg(A_VMOVAPS,S_NO,reg1,reg2)
else
{ SSE doesn't support 512-bit vectors }
InternalError(2018012931);
OS_MD256,
OS_MD512:
if UseAVX then
instr:=taicpu.op_reg_reg(A_VMOVAPD,S_NO,reg1,reg2)
else
{ SSE doesn't support 512-bit vectors }
InternalError(2018012932);
OS_M256, OS_MS256,
OS_M512, OS_MS512:
if UseAVX then
instr:=taicpu.op_reg_reg(A_VMOVDQA,S_NO,reg1,reg2)
else
{ SSE doesn't support 512-bit vectors }
InternalError(2018012933);
else
internalerror(2006091201);
end
@ -1385,15 +1469,152 @@ unit cgx86;
make_simple_ref(list,tmpref);
if shuffle=nil then
begin
if fromsize=OS_M64 then
list.concat(taicpu.op_ref_reg(A_MOVQ,S_NO,tmpref,reg))
else
{$ifdef x86_64}
{ x86-64 has always properly aligned data }
list.concat(taicpu.op_ref_reg(A_MOVDQA,S_NO,tmpref,reg));
{$else x86_64}
list.concat(taicpu.op_ref_reg(A_MOVDQU,S_NO,tmpref,reg));
{$endif x86_64}
case fromsize of
OS_F32:
if UseAVX then
op := A_VMOVSS
else
op := A_MOVSS;
OS_F64:
if UseAVX then
op := A_VMOVSD
else
op := A_MOVSD;
OS_M32, OS_32, OS_S32:
if UseAVX then
op := A_VMOVD
else
op := A_MOVD;
OS_M64, OS_64, OS_S64:
if UseAVX then
op := A_VMOVQ
else
op := A_MOVQ;
OS_MF128:
{ Use XMM transfer of packed singles }
if UseAVX then
begin
if GetRefAlignment(tmpref) = 16 then
op := A_VMOVAPS
else
op := A_VMOVUPS
end
else
begin
if GetRefAlignment(tmpref) = 16 then
op := A_MOVAPS
else
op := A_MOVUPS
end;
OS_MD128:
{ Use XMM transfer of packed doubles }
if UseAVX then
begin
if GetRefAlignment(tmpref) = 16 then
op := A_VMOVAPD
else
op := A_VMOVUPD
end
else
begin
if GetRefAlignment(tmpref) = 16 then
op := A_MOVAPD
else
op := A_MOVUPD
end;
OS_M128, OS_MS128:
{ Use XMM integer transfer }
if UseAVX then
begin
if GetRefAlignment(tmpref) = 16 then
op := A_VMOVDQA
else
op := A_VMOVDQU
end
else
begin
if GetRefAlignment(tmpref) = 16 then
op := A_MOVDQA
else
op := A_MOVDQU
end;
OS_MF256:
{ Use YMM transfer of packed singles }
if UseAVX then
begin
if GetRefAlignment(tmpref) = 32 then
op := A_VMOVAPS
else
op := A_VMOVUPS
end
else
{ SSE doesn't support 256-bit vectors }
InternalError(2018012934);
OS_MD256:
{ Use YMM transfer of packed doubles }
if UseAVX then
begin
if GetRefAlignment(tmpref) = 32 then
op := A_VMOVAPD
else
op := A_VMOVUPD
end
else
{ SSE doesn't support 256-bit vectors }
InternalError(2018012935);
OS_M256, OS_MS256:
{ Use YMM integer transfer }
if UseAVX then
begin
if GetRefAlignment(tmpref) = 32 then
op := A_VMOVDQA
else
op := A_VMOVDQU
end
else
{ SSE doesn't support 256-bit vectors }
InternalError(2018012936);
OS_MF512:
{ Use ZMM transfer of packed singles }
if UseAVX then
begin
if GetRefAlignment(tmpref) = 64 then
op := A_VMOVAPS
else
op := A_VMOVUPS
end
else
{ SSE doesn't support 512-bit vectors }
InternalError(2018012937);
OS_MD512:
{ Use ZMM transfer of packed doubles }
if UseAVX then
begin
if GetRefAlignment(tmpref) = 64 then
op := A_VMOVAPD
else
op := A_VMOVUPD
end
else
{ SSE doesn't support 512-bit vectors }
InternalError(2018012938);
OS_M512, OS_MS512:
{ Use ZMM integer transfer }
if UseAVX then
begin
if GetRefAlignment(tmpref) = 64 then
op := A_VMOVDQA
else
op := A_VMOVDQU
end
else
{ SSE doesn't support 512-bit vectors }
InternalError(2018012939);
else
{ No valid transfer command available }
internalerror(2017121410);
end;
list.concat(taicpu.op_ref_reg(op,S_NO,tmpref,reg));
end
else if shufflescalar(shuffle) then
begin
@ -1415,20 +1636,149 @@ unit cgx86;
hreg : tregister;
tmpref : treference;
op : tasmop;
begin
tmpref:=ref;
make_simple_ref(list,tmpref);
if shuffle=nil then
begin
if fromsize=OS_M64 then
list.concat(taicpu.op_reg_ref(A_MOVQ,S_NO,reg,tmpref))
else
{$ifdef x86_64}
{ x86-64 has always properly aligned data }
list.concat(taicpu.op_reg_ref(A_MOVDQA,S_NO,reg,tmpref))
{$else x86_64}
list.concat(taicpu.op_reg_ref(A_MOVDQU,S_NO,reg,tmpref))
{$endif x86_64}
case fromsize of
OS_F32:
if UseAVX then
op := A_VMOVSS
else
op := A_MOVSS;
OS_F64:
if UseAVX then
op := A_VMOVSD
else
op := A_MOVSD;
OS_M32, OS_32, OS_S32:
if UseAVX then
op := A_VMOVD
else
op := A_MOVD;
OS_M64, OS_64, OS_S64:
if UseAVX then
op := A_VMOVQ
else
op := A_MOVQ;
OS_MF128:
{ Use XMM transfer of packed singles }
if UseAVX then
begin
if GetRefAlignment(tmpref) = 16 then
op := A_VMOVAPS
else
op := A_VMOVUPS
end else
begin
if GetRefAlignment(tmpref) = 16 then
op := A_MOVAPS
else
op := A_MOVUPS
end;
OS_MD128:
{ Use XMM transfer of packed doubles }
if UseAVX then
begin
if GetRefAlignment(tmpref) = 16 then
op := A_VMOVAPD
else
op := A_VMOVUPD
end else
begin
if GetRefAlignment(tmpref) = 16 then
op := A_MOVAPD
else
op := A_MOVUPD
end;
OS_M128, OS_MS128:
{ Use XMM integer transfer }
if UseAVX then
begin
if GetRefAlignment(tmpref) = 16 then
op := A_VMOVDQA
else
op := A_VMOVDQU
end else
begin
if GetRefAlignment(tmpref) = 16 then
op := A_MOVDQA
else
op := A_MOVDQU
end;
OS_MF256:
{ Use XMM transfer of packed singles }
if UseAVX then
begin
if GetRefAlignment(tmpref) = 32 then
op := A_VMOVAPS
else
op := A_VMOVUPS
end else
{ SSE doesn't support 256-bit vectors }
InternalError(2018012940);
OS_MD256:
{ Use XMM transfer of packed doubles }
if UseAVX then
begin
if GetRefAlignment(tmpref) = 32 then
op := A_VMOVAPD
else
op := A_VMOVUPD
end else
{ SSE doesn't support 256-bit vectors }
InternalError(2018012941);
OS_M256, OS_MS256:
{ Use XMM integer transfer }
if UseAVX then
begin
if GetRefAlignment(tmpref) = 32 then
op := A_VMOVDQA
else
op := A_VMOVDQU
end else
{ SSE doesn't support 256-bit vectors }
InternalError(2018012942);
OS_MF512:
{ Use XMM transfer of packed singles }
if UseAVX then
begin
if GetRefAlignment(tmpref) = 64 then
op := A_VMOVAPS
else
op := A_VMOVUPS
end else
{ SSE doesn't support 512-bit vectors }
InternalError(2018012943);
OS_MD512:
{ Use XMM transfer of packed doubles }
if UseAVX then
begin
if GetRefAlignment(tmpref) = 64 then
op := A_VMOVAPD
else
op := A_VMOVUPD
end else
{ SSE doesn't support 512-bit vectors }
InternalError(2018012944);
OS_M512, OS_MS512:
{ Use XMM integer transfer }
if UseAVX then
begin
if GetRefAlignment(tmpref) = 64 then
op := A_VMOVDQA
else
op := A_VMOVDQU
end else
{ SSE doesn't support 512-bit vectors }
InternalError(2018012945);
else
{ No valid transfer command available }
internalerror(2017121411);
end;
list.concat(taicpu.op_reg_ref(op,S_NO,reg,tmpref));
end
else if shufflescalar(shuffle) then
begin

View File

@ -419,10 +419,12 @@ implementation
else
internalerror(2009071902);
end;
OS_M128,OS_MS128:
OS_M128,OS_MS128,OS_MF128,OS_MD128:
cgsize2subreg:=R_SUBMMX;
OS_M256,OS_MS256:
OS_M256,OS_MS256,OS_MF256,OS_MD256:
cgsize2subreg:=R_SUBMMY;
OS_M512,OS_MS512,OS_MF512,OS_MD512:
cgsize2subreg:=R_SUBMMZ;
OS_NO:
{ error message should have been thrown already before, so avoid only
an internal error }
@ -435,7 +437,7 @@ implementation
function reg_cgsize(const reg: tregister): tcgsize;
const subreg2cgsize:array[Tsubregister] of Tcgsize =
(OS_NO,OS_8,OS_8,OS_16,OS_32,OS_64,OS_NO,OS_NO,OS_NO,OS_F32,OS_F64,OS_NO,OS_M128,OS_M256,OS_NO,OS_NO,OS_NO,OS_NO,OS_NO,OS_NO,OS_NO,OS_NO);
(OS_NO,OS_8,OS_8,OS_16,OS_32,OS_64,OS_NO,OS_NO,OS_NO,OS_F32,OS_F64,OS_NO,OS_M128,OS_M256,OS_M512,OS_NO,OS_NO,OS_NO,OS_NO,OS_NO,OS_NO,OS_NO,OS_NO);
begin
case getregtype(reg) of
R_INTREGISTER :
@ -466,7 +468,7 @@ implementation
function reg2opsize(r:Tregister):topsize;
const
subreg2opsize : array[tsubregister] of topsize =
(S_NO,S_B,S_B,S_W,S_L,S_Q,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO);
(S_NO,S_B,S_B,S_W,S_L,S_Q,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO);
begin
reg2opsize:=S_L;
case getregtype(r) of

View File

@ -52,27 +52,28 @@ interface
'd',
'','','',
't',
'x',
'y'
'x',
'y',
'z'
);
{ suffix-to-opsize conversion tables, used in asmreadrer }
{ !! S_LQ excluded: movzlq does not exist, movslq is processed
as a separate instruction w/o suffix (aka movsxd), and there are
no more instructions needing it. }
att_sizesuffixstr : array[0..13] of string[2] = (
'','BW','BL','WL','BQ','WQ',{'LQ',}'B','W','L','S','Q','T','X','Y'
att_sizesuffixstr : array[0..14] of string[2] = (
'','BW','BL','WL','BQ','WQ',{'LQ',}'B','W','L','S','Q','T','X','Y','Z'
);
att_sizesuffix : array[0..13] of topsize = (
S_NO,S_BW,S_BL,S_WL,S_BQ,S_WQ,{S_LQ,}S_B,S_W,S_L,S_NO,S_Q,S_NO,S_NO,S_NO
att_sizesuffix : array[0..14] of topsize = (
S_NO,S_BW,S_BL,S_WL,S_BQ,S_WQ,{S_LQ,}S_B,S_W,S_L,S_NO,S_Q,S_NO,S_NO,S_NO,S_NO
);
att_sizefpusuffix : array[0..13] of topsize = (
S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,{S_NO,}S_NO,S_NO,S_FL,S_FS,S_NO,S_FX,S_NO,S_NO
att_sizefpusuffix : array[0..14] of topsize = (
S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,{S_NO,}S_NO,S_NO,S_FL,S_FS,S_NO,S_FX,S_NO,S_NO,S_NO
);
att_sizefpuintsuffix : array[0..13] of topsize = (
S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,{S_NO,}S_NO,S_NO,S_IL,S_IS,S_IQ,S_NO,S_NO,S_NO
att_sizefpuintsuffix : array[0..14] of topsize = (
S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,{S_NO,}S_NO,S_NO,S_IL,S_IS,S_IQ,S_NO,S_NO,S_NO,S_NO
);
att_sizemmsuffix : array[0..13] of topsize = (
S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,{S_NO,}S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_XMM,S_YMM
att_sizemmsuffix : array[0..14] of topsize = (
S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,{S_NO,}S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_XMM,S_YMM,S_ZMM
);
{$else x86_64}
gas_opsize2str : array[topsize] of string[2] = ('',
@ -82,24 +83,25 @@ interface
'd',
'','','',
't',
'x',
'y'
'x',
'y',
'z'
);
{ suffix-to-opsize conversion tables, used in asmreadrer }
att_sizesuffixstr : array[0..11] of string[2] = (
'','BW','BL','WL','B','W','L','S','Q','T','X','Y'
att_sizesuffixstr : array[0..12] of string[2] = (
'','BW','BL','WL','B','W','L','S','Q','T','X','Y','Z'
);
att_sizesuffix : array[0..11] of topsize = (
S_NO,S_BW,S_BL,S_WL,S_B,S_W,S_L,S_NO,S_NO,S_NO,S_NO,S_NO
att_sizesuffix : array[0..12] of topsize = (
S_NO,S_BW,S_BL,S_WL,S_B,S_W,S_L,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO
);
att_sizefpusuffix : array[0..11] of topsize = (
S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_FL,S_FS,S_NO,S_FX,S_NO,S_NO
att_sizefpusuffix : array[0..12] of topsize = (
S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_FL,S_FS,S_NO,S_FX,S_NO,S_NO,S_NO
);
att_sizefpuintsuffix : array[0..11] of topsize = (
S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_IL,S_IS,S_IQ,S_NO,S_NO,S_NO
att_sizefpuintsuffix : array[0..12] of topsize = (
S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_IL,S_IS,S_IQ,S_NO,S_NO,S_NO,S_NO
);
att_sizemmsuffix : array[0..11] of topsize = (
S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_XMM,S_YMM
att_sizemmsuffix : array[0..12] of topsize = (
S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_XMM,S_YMM,S_ZMM
);
{$endif x86_64}

View File

@ -343,7 +343,8 @@ const
0,0,0,
80,
128,
256
256,
512
);
{$else}
topsize2memsize: array[topsize] of integer =
@ -354,7 +355,8 @@ topsize2memsize: array[topsize] of integer =
0,0,0,
80,
128,
256
256,
512
);
{$endif}

View File

@ -74,10 +74,14 @@ uses
A_MOVZX:
Result:=OptPass1Movx(p);
A_VMOVAPS,
A_VMOVAPD:
A_VMOVAPD,
A_VMOVUPS,
A_VMOVUPD:
result:=OptPass1VMOVAP(p);
A_MOVAPD,
A_MOVAPS:
A_MOVAPS,
A_MOVUPD,
A_MOVUPS:
result:=OptPass1MOVAP(p);
A_VDIVSD,
A_VDIVSS,

View File

@ -35,7 +35,8 @@ type
S_NEAR,S_FAR,S_SHORT,
S_T,
S_XMM,
S_YMM
S_YMM,
S_ZMM
);
TOpSizes = set of topsize;

View File

@ -108,7 +108,8 @@ Const
pocall_sysv_abi_default,
pocall_sysv_abi_cdecl,
pocall_ms_abi_default,
pocall_ms_abi_cdecl
pocall_ms_abi_cdecl,
pocall_vectorcall
];
cputypestr : array[tcputype] of string[10] = ('',

File diff suppressed because it is too large Load Diff

View File

@ -173,7 +173,7 @@ implementation
result:=
((target_info.system=system_x86_64_win64) and
not(proccall in [pocall_sysv_abi_default,pocall_sysv_abi_cdecl])) or
(proccall in [pocall_ms_abi_default,pocall_ms_abi_cdecl]);
(proccall in [pocall_ms_abi_default,pocall_ms_abi_cdecl,pocall_vectorcall]);
end;

View File

@ -0,0 +1,869 @@
{ %CPU=x86_64 }
program vectorcall_hva_test1;
{$IFNDEF CPUX86_64}
{$FATAL This test program can only be compiled on Windows or Linux 64-bit with an Intel processor }
{$ENDIF}
{$ASMMODE Intel}
{$PUSH}
{$CODEALIGN RECORDMIN=16}
{$PACKRECORDS C}
type
TM128 = record
case Byte of
0: (M128_F32: array[0..3] of Single);
1: (M128_F64: array[0..1] of Double);
end;
{$POP}
{ HFA test: field style. }
{ NOTE: if the record falls on a 16-byte boundary, the 4-component entries will
turned into vectors rather than HFAs. }
THFA1_SF = packed record
F1: Single;
end;
{$IFDEF WIN64}
THFA2_SF = packed record
F1, F2: Single;
end;
THFA3_SF = packed record
F1, F2, F3: Single;
end;
THFA4_SF = packed record
F1, F2, F3, F4: Single;
end;
{$ENDIF}
THFA1_DF = packed record
F1: Double;
end;
{$IFDEF WIN64}
THFA2_DF = packed record
F1, F2: Double;
end;
THFA3_DF = packed record
F1, F2, F3: Double;
end;
THFA4_DF = packed record
F1, F2, F3, F4: Double;
end;
{$ENDIF}
{ HFA test - array style }
{ NOTE: if the record falls on a 16-byte boundary, the 4-component entries will
turned into vectors rather than HFAs. }
THFA1_SA = packed record
F: array[0..0] of Single;
end;
{$IFDEF WIN64}
THFA2_SA = packed record
F: array[0..1] of Single;
end;
THFA3_SA = packed record
F: array[0..2] of Single;
end;
THFA4_SA = packed record
F: array[0..3] of Single;
end;
{$ENDIF}
THFA1_DA = packed record
F: array[0..0] of Double;
end;
{$IFDEF WIN64}
THFA2_DA = packed record
F: array[0..1] of Double;
end;
THFA3_DA = packed record
F: array[0..2] of Double;
end;
THFA4_DA = packed record
F: array[0..3] of Double;
end;
{$ENDIF}
{ Single-type vector }
function HorizontalAddSingle(V: TM128): Single; vectorcall;
begin
HorizontalAddSingle := V.M128_F32[0] + V.M128_F32[1] + V.M128_F32[2] + V.M128_F32[3];
end;
function HorizontalAddSingle_ASM(V: TM128): Single; vectorcall; assembler; nostackframe;
asm
HADDPS XMM0, XMM0
HADDPS XMM0, XMM0
end;
{ Double-type vector }
function HorizontalAddDouble(V: TM128): Double; vectorcall;
begin
HorizontalAddDouble := V.M128_F64[0] + V.M128_F64[1];
end;
function HorizontalAddDouble_ASM(V: TM128): Double; vectorcall; assembler; nostackframe;
asm
HADDPD XMM0, XMM0
end;
{ 3-element aggregate }
function AddSingles1F(HFA: THFA1_SF): Single; vectorcall;
begin
AddSingles1F := HFA.F1;
end;
function AddSingles1F_ASM(HFA: THFA1_SF): Single; vectorcall; assembler; nostackframe;
asm
{ Do absolutely nothing! }
end;
function AddDoubles1F(HFA: THFA1_DF): Double; vectorcall;
begin
AddDoubles1F := HFA.F1;
end;
function AddDoubles1F_ASM(HFA: THFA1_DF): Double; vectorcall; assembler; nostackframe;
asm
{ Do absolutely nothing! }
end;
function AddSingles1A(HFA: THFA1_SA): Single; vectorcall;
begin
AddSingles1A := HFA.F[0];
end;
function AddSingles1A_ASM(HFA: THFA1_SA): Single; vectorcall; assembler; nostackframe;
asm
{ Do absolutely nothing! }
end;
function AddDoubles1A(HFA: THFA1_DA): Double; vectorcall;
begin
AddDoubles1A := HFA.F[0];
end;
function AddDoubles1A_ASM(HFA: THFA1_DA): Double; vectorcall; assembler; nostackframe;
asm
{ Do absolutely nothing! }
end;
{$IFDEF WIN64}
{ 2-element aggregate }
function AddSingles2F(HFA: THFA2_SF): Single; vectorcall;
begin
AddSingles2F := HFA.F1 + HFA.F2;
end;
function AddSingles2F_ASM(HFA: THFA2_SF): Single; vectorcall; assembler; nostackframe;
asm
ADDSS XMM0, XMM1
end;
function AddDoubles2F(HFA: THFA2_DF): Double; vectorcall;
begin
AddDoubles2F := HFA.F1 + HFA.F2;
end;
function AddDoubles2F_ASM(HFA: THFA2_DF): Double; vectorcall; assembler; nostackframe;
asm
ADDSD XMM0, XMM1
end;
function AddSingles2A(HFA: THFA2_SA): Single; vectorcall;
begin
AddSingles2A := HFA.F[0] + HFA.F[1];
end;
function AddSingles2A_ASM(HFA: THFA2_SA): Single; vectorcall; assembler; nostackframe;
asm
ADDSS XMM0, XMM1
end;
function AddDoubles2A(HFA: THFA2_DA): Double; vectorcall;
begin
AddDoubles2A := HFA.F[0] + HFA.F[1];
end;
function AddDoubles2A_ASM(HFA: THFA2_DA): Double; vectorcall; assembler; nostackframe;
asm
ADDSD XMM0, XMM1
end;
{ 3-element aggregate }
function AddSingles3F(HFA: THFA3_SF): Single; vectorcall;
begin
AddSingles3F := HFA.F1 + HFA.F2 + HFA.F3;
end;
function AddSingles3F_ASM(HFA: THFA3_SF): Single; vectorcall; assembler; nostackframe;
asm
ADDSS XMM0, XMM1
ADDSS XMM0, XMM2
end;
function AddDoubles3F(HFA: THFA3_DF): Double; vectorcall;
begin
AddDoubles3F := HFA.F1 + HFA.F2 + HFA.F3;
end;
function AddDoubles3F_ASM(HFA: THFA3_DF): Double; vectorcall; assembler; nostackframe;
asm
ADDSD XMM0, XMM1
ADDSD XMM0, XMM2
end;
function AddSingles3A(HFA: THFA3_SA): Single; vectorcall;
begin
AddSingles3A := HFA.F[0] + HFA.F[1] + HFA.F[2];
end;
function AddSingles3A_ASM(HFA: THFA3_SA): Single; vectorcall; assembler; nostackframe;
asm
ADDSS XMM0, XMM1
ADDSS XMM0, XMM2
end;
function AddDoubles3A(HFA: THFA3_DA): Double; vectorcall;
begin
AddDoubles3A := HFA.F[0] + HFA.F[1] + HFA.F[2];
end;
function AddDoubles3A_ASM(HFA: THFA3_DA): Double; vectorcall; assembler; nostackframe;
asm
ADDSD XMM0, XMM1
ADDSD XMM0, XMM2
end;
{ 4-element aggregate }
function AddSingles4F(HFA: THFA4_SF): Single; vectorcall;
begin
AddSingles4F := HFA.F1 + HFA.F2 + HFA.F3 + HFA.F4;
end;
function AddSingles4F_ASM(HFA: THFA4_SF): Single; vectorcall; assembler; nostackframe;
asm
ADDSS XMM0, XMM1
ADDSS XMM0, XMM2
ADDSS XMM0, XMM3
end;
function AddDoubles4F(HFA: THFA4_DF): Double; vectorcall;
begin
AddDoubles4F := HFA.F1 + HFA.F2 + HFA.F3 + HFA.F4;
end;
function AddDoubles4F_ASM(HFA: THFA4_DF): Double; vectorcall; assembler; nostackframe;
asm
ADDSD XMM0, XMM1
ADDSD XMM0, XMM2
ADDSD XMM0, XMM3
end;
function AddSingles4A(HFA: THFA4_SA): Single; vectorcall;
begin
AddSingles4A := HFA.F[0] + HFA.F[1] + HFA.F[2] + HFA.F[3];
end;
function AddSingles4A_ASM(HFA: THFA4_SA): Single; vectorcall; assembler; nostackframe;
asm
ADDSS XMM0, XMM1
ADDSS XMM0, XMM2
ADDSS XMM0, XMM3
end;
function AddDoubles4A(HFA: THFA4_DA): Double; vectorcall;
begin
AddDoubles4A := HFA.F[0] + HFA.F[1] + HFA.F[2] + HFA.F[3];
end;
function AddDoubles4A_ASM(HFA: THFA4_DA): Double; vectorcall; assembler; nostackframe;
asm
ADDSD XMM0, XMM1
ADDSD XMM0, XMM2
ADDSD XMM0, XMM3
end;
{$ENDIF}
var
HVA: TM128;
HFA1_SF: THFA1_SF;
HFA1_DF: THFA1_DF;
HFA1_SA: THFA1_SA;
HFA1_DA: THFA1_DA;
{$IFDEF WIN64}
HFA2_SF: THFA2_SF;
HFA2_DF: THFA2_DF;
HFA2_SA: THFA2_SA;
HFA2_DA: THFA2_DA;
HFA3_SF: THFA3_SF;
HFA3_DF: THFA3_DF;
HFA3_SA: THFA3_SA;
HFA3_DA: THFA3_DA;
HFA4_SF: THFA4_SF;
HFA4_DF: THFA4_DF;
HFA4_SA: THFA4_SA;
HFA4_DA: THFA4_DA;
{$ENDIF}
TestPointer: PtrUInt;
I, J: Integer;
ResS, ResSA: Single;
ResD, ResDA: Double;
Addresses: array[0..3] of Pointer;
FieldAddresses: array[0..3, 0..3] of Pointer;
const
AddressNames1: array[0..3] of ShortString = ('HFA1_SF', 'HFA1_DF', 'HFA1_SA', 'HFA1_DA');
{$IFDEF WIN64}
AddressNames2: array[0..3] of ShortString = ('HFA2_SF', 'HFA2_DF', 'HFA2_SA', 'HFA2_DA');
AddressNames3: array[0..3] of ShortString = ('HFA3_SF', 'HFA3_DF', 'HFA3_SA', 'HFA3_DA');
AddressNames4: array[0..3] of ShortString = ('HFA4_SF', 'HFA4_DF', 'HFA4_SA', 'HFA4_DA');
{$ENDIF}
FieldAddressNames: array[0..3] of ShortString = ('F1', 'F2', 'F3', 'F4');
ExpS1: Single = 5.0;
{$IFDEF WIN64}
ExpS2: Single = -5.0;
ExpS3: Single = 10.0;
{$ENDIF}
ExpS4: Single = -10.0;
ExpD1: Double = 5.0;
ExpD2: Double = -5.0;
{$IFDEF WIN64}
ExpD3: Double = 10.0;
ExpD4: Double = -10.0;
{$ENDIF}
begin
if (PtrUInt(@HVA) and $F) <> 0 then
begin
WriteLn('FAIL: HVA is not correctly aligned.');
Halt(1);
end;
{ array of singles }
WriteLn('- horizontal add (4 singles)');
HVA.M128_F32[0] := 5.0;
HVA.M128_F32[1] := -10.0;
HVA.M128_F32[2] := 15.0;
HVA.M128_F32[3] := -20.0;
ResS := HorizontalAddSingle(HVA);
ResSA := HorizontalAddSingle_ASM(HVA);
if (ResS <> ResSA) then
begin
WriteLn('FAIL: HorizontalAddSingle(HVA) has the vector in the wrong register.');
Halt(1);
end else
begin
if ResS <> ExpS4 then
begin
WriteLn('FAIL: HorizontalAddSingle(HVA) returned ', ResS, ' instead of ', ExpS4);
Halt(1);
end;
end;
{ array of doubles }
WriteLn('- horizontal add (2 doubles)');
HVA.M128_F64[0] := 5.0;
HVA.M128_F64[1] := -10.0;
ResD := HorizontalAddDouble(HVA);
ResDA := HorizontalAddDouble_ASM(HVA);
if (ResD <> ResDA) then
begin
WriteLn('FAIL: HorizontalAddDouble(HVA) has the vector in the wrong register.');
Halt(1);
end else
begin
if ResD <> ExpD2 then
begin
WriteLn('FAIL: HorizontalAddDouble(HVA) returned ', ResD, ' instead of ', ExpD2);
Halt(1);
end;
end;
{ 1-field aggregates }
WriteLn('- 1-field aggregates');
Addresses[0] := @HFA1_SF;
Addresses[1] := @HFA1_SA;
Addresses[2] := @HFA1_DF;
Addresses[3] := @HFA1_DA;
FieldAddresses[0][0] := @(HFA1_SF.F1);
FieldAddresses[1][0] := @(HFA1_SA.F[0]);
FieldAddresses[2][0] := @(HFA1_DF.F1);
FieldAddresses[3][0] := @(HFA1_DA.F[0]);
{ Check alignment }
for I := 0 to 1 do
begin
TestPointer := PtrUInt(Addresses[I]);
if Pointer(TestPointer) <> FieldAddresses[I][0] then
begin
WriteLn('FAIL: ', AddressNames1[I], ' is not correctly packed; field F1 is not in the expected place.');
Halt(1);
end;
end;
HFA1_SF.F1 := 5.0;
ResS := AddSingles1F(HFA1_SF);
ResSA := AddSingles1F_ASM(HFA1_SF);
if (ResS <> ResSA) then
begin
WriteLn('FAIL: AddSingles1F(', AddressNames1[I], ') is not passing the aggregate correctly.');
Halt(1);
end else
begin
if ResS <> ExpS1 then
begin
WriteLn('FAIL: AddSingles1F(', AddressNames1[I], ') returned ', ResS, ' instead of ', ExpS1);
Halt(1);
end;
end;
HFA1_DF.F1 := 5.0;
ResD := AddDoubles1F(HFA1_DF);
ResDA := AddDoubles1F_ASM(HFA1_DF);
if (ResD <> ResDA) then
begin
WriteLn('FAIL: AddDoubles1F(', AddressNames1[I], ') is not passing the aggregate correctly.');
Halt(1);
end else
begin
if ResD <> ExpD1 then
begin
WriteLn('FAIL: AddDoubles1F(', AddressNames1[I], ') returned ', ResD, ' instead of ', ExpD1);
Halt(1);
end;
end;
HFA1_SA.F[0] := 5.0;
ResS := AddSingles1A(HFA1_SA);
ResSA := AddSingles1A_ASM(HFA1_SA);
if (ResS <> ResSA) then
begin
WriteLn('FAIL: AddSingles1A(', AddressNames1[I], ') is not passing the aggregate correctly.');
Halt(1);
end else
begin
if ResS <> ExpS1 then
begin
WriteLn('FAIL: AddSingles1A(', AddressNames1[I], ') returned ', ResS, ' instead of ', ExpS1);
Halt(1);
end;
end;
HFA1_DA.F[0] := 5.0;
ResD := AddDoubles1A(HFA1_DA);
ResDA := AddDoubles1A_ASM(HFA1_DA);
if (ResD <> ResDA) then
begin
WriteLn('FAIL: AddDoubles1A(', AddressNames1[I], ') is not passing the aggregate correctly.');
Halt(1);
end else
begin
if ResD <> ExpD1 then
begin
WriteLn('FAIL: AddDoubles1A(', AddressNames1[I], ') returned ', ResD, ' instead of ', ExpD1);
Halt(1);
end;
end;
{$IFDEF WIN64}
{ 2-field aggregates }
WriteLn('- 2-field aggregates');
Addresses[0] := @HFA2_SF;
Addresses[1] := @HFA2_SA;
FieldAddresses[0][0] := @(HFA2_SF.F1);
FieldAddresses[0][1] := @(HFA2_SF.F2);
FieldAddresses[1][0] := @(HFA2_SA.F[0]);
FieldAddresses[1][1] := @(HFA2_SA.F[1]);
{ Check alignment of Singles }
for I := 0 to 1 do
begin
TestPointer := PtrUInt(Addresses[I]);
for J := 0 to 1 do
begin
if Pointer(TestPointer) <> FieldAddresses[I][J] then
begin
WriteLn('FAIL: ', AddressNames2[I], ' is not correctly packed; field ', FieldAddressNames[J], ' is not in the expected place.');
Halt(1);
end;
Inc(TestPointer, $4);
end;
end;
Addresses[2] := @HFA2_DF;
Addresses[3] := @HFA2_DA;
FieldAddresses[2][0] := @(HFA2_DF.F1);
FieldAddresses[2][1] := @(HFA2_DF.F2);
FieldAddresses[3][0] := @(HFA2_DA.F[0]);
FieldAddresses[3][1] := @(HFA2_DA.F[1]);
{ Check alignment of Doubles }
for I := 2 to 3 do
begin
TestPointer := PtrUInt(Addresses[I]);
for J := 0 to 1 do
begin
if Pointer(TestPointer) <> FieldAddresses[I][J] then
begin
WriteLn('FAIL: ', AddressNames2[I], ' is not correctly packed; field ', FieldAddressNames[J], ' is not in the expected place.');
Halt(1);
end;
Inc(TestPointer, $8);
end;
end;
HFA2_SF.F1 := 5.0;
HFA2_SF.F2 := -10.0;
ResS := AddSingles2F(HFA2_SF);
ResSA := AddSingles2F_ASM(HFA2_SF);
if (ResS <> ResSA) then
begin
WriteLn('FAIL: AddSingles2F(HFA2_SF) is not passing the aggregate correctly.');
Halt(1);
end else
begin
if ResS <> ExpS2 then
begin
WriteLn('FAIL: AddSingles2F(HFA2_SF) returned ', ResS, ' instead of ', ExpS2);
Halt(1);
end;
end;
HFA2_DF.F1 := 5.0;
HFA2_DF.F2 := -10.0;
ResD := AddDoubles2F(HFA2_DF);
ResDA := AddDoubles2F_ASM(HFA2_DF);
if (ResD <> ResDA) then
begin
WriteLn('FAIL: AddDoubles2F(HFA2_DF) is not passing the aggregate correctly.');
Halt(1);
end else
begin
if ResD <> ExpD2 then
begin
WriteLn('FAIL: AddDoubles2F(HFA2_DF) returned ', ResD, ' instead of ', ExpD2);
Halt(1);
end;
end;
HFA2_SA.F[0] := 5.0;
HFA2_SA.F[1] := -10.0;
ResS := AddSingles2A(HFA2_SA);
ResSA := AddSingles2A_ASM(HFA2_SA);
if (ResS <> ResSA) then
begin
WriteLn('FAIL: AddSingles2A(HFA2_SA) is not passing the aggregate correctly.');
Halt(1);
end else
begin
if ResS <> ExpS2 then
begin
WriteLn('FAIL: AddSingles2A(HFA2_SA) returned ', ResS, ' instead of ', ExpS2);
Halt(1);
end;
end;
HFA2_DA.F[0] := 5.0;
HFA2_DA.F[1] := -10.0;
ResD := AddDoubles2A(HFA2_DA);
ResDA := AddDoubles2A_ASM(HFA2_DA);
if (ResD <> ResDA) then
begin
WriteLn('FAIL: AddDoubles2A(HFA2_DA) is not passing the aggregate correctly.');
Halt(1);
end else
begin
if ResD <> ExpD2 then
begin
WriteLn('FAIL: AddDoubles2A(HFA2_DA) returned ', ResD, ' instead of ', ExpD2);
Halt(1);
end;
end;
{ 3-field aggregates }
WriteLn('- 3-field aggregates');
Addresses[0] := @HFA3_SF;
Addresses[1] := @HFA3_SA;
FieldAddresses[0][0] := @(HFA3_SF.F1);
FieldAddresses[0][1] := @(HFA3_SF.F2);
FieldAddresses[0][2] := @(HFA3_SF.F3);
FieldAddresses[1][0] := @(HFA3_SA.F[0]);
FieldAddresses[1][1] := @(HFA3_SA.F[1]);
FieldAddresses[1][2] := @(HFA3_SA.F[2]);
{ Check alignment of Singles }
for I := 0 to 1 do
begin
TestPointer := PtrUInt(Addresses[I]);
for J := 0 to 2 do
begin
if Pointer(TestPointer) <> FieldAddresses[I][J] then
begin
WriteLn('FAIL: ', AddressNames3[I], ' is not correctly packed; field ', FieldAddressNames[J], ' is not in the expected place.');
Halt(1);
end;
Inc(TestPointer, $4);
end;
end;
Addresses[2] := @HFA3_DF;
Addresses[3] := @HFA3_DA;
FieldAddresses[2][0] := @(HFA3_DF.F1);
FieldAddresses[2][1] := @(HFA3_DF.F2);
FieldAddresses[2][2] := @(HFA3_DF.F3);
FieldAddresses[3][0] := @(HFA3_DA.F[0]);
FieldAddresses[3][1] := @(HFA3_DA.F[1]);
FieldAddresses[3][2] := @(HFA3_DA.F[2]);
{ Check alignment of Doubles }
for I := 2 to 3 do
begin
TestPointer := PtrUInt(Addresses[I]);
for J := 0 to 2 do
begin
if Pointer(TestPointer) <> FieldAddresses[I][J] then
begin
WriteLn('FAIL: ', AddressNames3[I], ' is not correctly packed; field ', FieldAddressNames[J], ' is not in the expected place.');
Halt(1);
end;
Inc(TestPointer, $8);
end;
end;
HFA3_SF.F1 := 5.0;
HFA3_SF.F2 := -10.0;
HFA3_SF.F3 := 15.0;
ResS := AddSingles3F(HFA3_SF);
ResSA := AddSingles3F_ASM(HFA3_SF);
if (ResS <> ResSA) then
begin
WriteLn('FAIL: AddSingles3F(HFA3_SF) is not passing the aggregate correctly.');
Halt(1);
end else
begin
if ResS <> ExpS3 then
begin
WriteLn('FAIL: AddSingles3F(HFA3_SF) returned ', ResS, ' instead of ', ExpS3);
Halt(1);
end;
end;
HFA3_DF.F1 := 5.0;
HFA3_DF.F2 := -10.0;
HFA3_DF.F3 := 15.0;
ResD := AddDoubles3F(HFA3_DF);
ResDA := AddDoubles3F_ASM(HFA3_DF);
if (ResD <> ResDA) then
begin
WriteLn('FAIL: AddDoubles3F(HFA3_DF) is not passing the aggregate correctly.');
Halt(1);
end else
begin
if ResD <> ExpD3 then
begin
WriteLn('FAIL: AddDoubles3F(HFA3_DF) returned ', ResD, ' instead of ', ExpD3);
Halt(1);
end;
end;
HFA3_SA.F[0] := 5.0;
HFA3_SA.F[1] := -10.0;
HFA3_SA.F[2] := 15.0;
ResS := AddSingles3A(HFA3_SA);
ResSA := AddSingles3A_ASM(HFA3_SA);
if (ResS <> ResSA) then
begin
WriteLn('FAIL: AddSingles3A(HFA3_SA) is not passing the aggregate correctly.');
Halt(1);
end else
begin
if ResS <> ExpS3 then
begin
WriteLn('FAIL: AddSingles3A(HFA3_SA) returned ', ResS, ' instead of ', ExpS3);
Halt(1);
end;
end;
HFA3_DA.F[0] := 5.0;
HFA3_DA.F[1] := -10.0;
HFA3_DA.F[2] := 15.0;
ResD := AddDoubles3A(HFA3_DA);
ResDA := AddDoubles3A_ASM(HFA3_DA);
if (ResD <> ResDA) then
begin
WriteLn('FAIL: AddDoubles3A(HFA3_DA) is not passing the aggregate correctly.');
Halt(1);
end else
begin
if ResD <> ExpD3 then
begin
WriteLn('FAIL: AddDoubles3A(HFA3_DA) returned ', ResD, ' instead of ', ExpD3);
Halt(1);
end;
end;
{ 4-field aggregates }
WriteLn('- 4-field aggregates');
Addresses[0] := @HFA4_SF;
Addresses[1] := @HFA4_SA;
FieldAddresses[0][0] := @(HFA4_SF.F1);
FieldAddresses[0][1] := @(HFA4_SF.F2);
FieldAddresses[0][2] := @(HFA4_SF.F3);
FieldAddresses[0][3] := @(HFA4_SF.F4);
FieldAddresses[1][0] := @(HFA4_SA.F[0]);
FieldAddresses[1][1] := @(HFA4_SA.F[1]);
FieldAddresses[1][2] := @(HFA4_SA.F[2]);
FieldAddresses[1][3] := @(HFA4_SA.F[3]);
{ Check alignment of Singles }
for I := 0 to 1 do
begin
TestPointer := PtrUInt(Addresses[I]);
for J := 0 to 3 do
begin
if Pointer(TestPointer) <> FieldAddresses[I][J] then
begin
WriteLn('FAIL: ', AddressNames4[I], ' is not correctly packed; field ', FieldAddressNames[J], ' is not in the expected place.');
Halt(1);
end;
Inc(TestPointer, $4);
end;
end;
Addresses[2] := @HFA4_DF;
Addresses[3] := @HFA4_DA;
FieldAddresses[2][0] := @(HFA4_DF.F1);
FieldAddresses[2][1] := @(HFA4_DF.F2);
FieldAddresses[2][2] := @(HFA4_DF.F3);
FieldAddresses[2][3] := @(HFA4_DF.F4);
FieldAddresses[3][0] := @(HFA4_DA.F[0]);
FieldAddresses[3][1] := @(HFA4_DA.F[1]);
FieldAddresses[3][2] := @(HFA4_DA.F[2]);
FieldAddresses[3][3] := @(HFA4_DA.F[3]);
{ Check alignment of Doubles }
for I := 2 to 3 do
begin
TestPointer := PtrUInt(Addresses[I]);
for J := 0 to 3 do
begin
if Pointer(TestPointer) <> FieldAddresses[I][J] then
begin
WriteLn('FAIL: ', AddressNames4[I], ' is not correctly packed; field ', FieldAddressNames[J], ' is not in the expected place.');
Halt(1);
end;
Inc(TestPointer, $8);
end;
end;
HFA4_SF.F1 := 5.0;
HFA4_SF.F2 := -10.0;
HFA4_SF.F3 := 15.0;
HFA4_SF.F4 := -20.0;
ResS := AddSingles4F(HFA4_SF);
ResSA := AddSingles4F_ASM(HFA4_SF);
if (ResS <> ResSA) then
begin
WriteLn('FAIL: AddSingles4F(HFA4_SF) is not passing the aggregate correctly.');
Halt(1);
end else
begin
if ResS <> ExpS4 then
begin
WriteLn('FAIL: AddSingles4F(HFA4_SF) returned ', ResS, ' instead of ', ExpS4);
Halt(1);
end;
end;
HFA4_DF.F1 := 5.0;
HFA4_DF.F2 := -10.0;
HFA4_DF.F3 := 15.0;
HFA4_DF.F4 := -20.0;
ResD := AddDoubles4F(HFA4_DF);
ResDA := AddDoubles4F_ASM(HFA4_DF);
if (ResD <> ResDA) then
begin
WriteLn('FAIL: AddDoubles4F(HFA4_DF) is not passing the aggregate correctly.');
Halt(1);
end else
begin
if ResD <> ExpD4 then
begin
WriteLn('FAIL: AddDoubles4F(HFA4_DF) returned ', ResD, ' instead of ', ExpD4);
Halt(1);
end;
end;
HFA4_SA.F[0] := 5.0;
HFA4_SA.F[1] := -10.0;
HFA4_SA.F[2] := 15.0;
HFA4_SA.F[3] := -20.0;
ResS := AddSingles4A(HFA4_SA);
ResSA := AddSingles4A_ASM(HFA4_SA);
if (ResS <> ResSA) then
begin
WriteLn('FAIL: AddSingles4A(HFA4_SA) is not passing the aggregate correctly.');
Halt(1);
end else
begin
if ResS <> ExpS4 then
begin
WriteLn('FAIL: AddSingles4A(HFA4_SA) returned ', ResS, ' instead of ', ExpS4);
Halt(1);
end;
end;
HFA4_DA.F[0] := 5.0;
HFA4_DA.F[1] := -10.0;
HFA4_DA.F[2] := 15.0;
HFA4_DA.F[3] := -20.0;
ResD := AddDoubles4A(HFA4_DA);
ResDA := AddDoubles4A_ASM(HFA4_DA);
if (ResD <> ResDA) then
begin
WriteLn('FAIL: AddDoubles4A(HFA4_DF) is not passing the aggregate correctly.');
Halt(1);
end else
begin
if ResD <> ExpD4 then
begin
WriteLn('FAIL: AddDoubles4A(HFA4_DF) returned ', ResD, ' instead of ', ExpD4);
Halt(1);
end;
end;
{$ENDIF}
WriteLn('ok');
end.

View File

@ -0,0 +1,162 @@
{ %CPU=x86_64 }
program vectorcall_hva_test2;
{$IFNDEF CPUX86_64}
{$FATAL This test program can only be compiled on Windows or Linux 64-bit with an Intel processor }
{$ENDIF}
{$push}
{$CODEALIGN RECORDMIN=16}
{$PACKRECORDS C}
type
TM128 = record
case Byte of
0: (M128_F32: array[0..3] of Single);
1: (M128_F64: array[0..1] of Double);
end;
{$pop}
{ HVA test }
THVA = record
V1, V2, V3, V4: TM128;
end;
operator +(X, Y: TM128)Z: TM128; vectorcall;
var
I: Integer;
begin
for I := 0 to 3 do
Z.M128_F32[I] := X.M128_F32[I] + Y.M128_F32[I];
end;
operator -(X, Y: TM128)Z: TM128; vectorcall;
var
I: Integer;
begin
for I := 0 to 3 do
Z.M128_F32[I] := X.M128_F32[I] - Y.M128_F32[I];
end;
{ - InputHVA goes on the stack because there are not enough free XMM registers to contain the entire argument
- A4 does NOT go on the stack and goes into an XMM register.
}
function HVATest(A1, A2, A3: TM128; InputHVA: THVA; A4: TM128; Op: Integer): THVA; vectorcall;
begin
{ FIXME: There is an internal stack misalignment for A4, necessitating the
use of (V)MOVDQU instead of (V)MOVDQA in the compiled code. }
case Op of
1:
begin
HVATest.V1 := InputHVA.V1 + A1;
HVATest.V2 := InputHVA.V2 + A2;
HVATest.V3 := InputHVA.V3 + A3;
HVATest.V4 := InputHVA.V4 + A4;
end;
2:
begin
HVATest.V1 := InputHVA.V1 - A1;
HVATest.V2 := InputHVA.V2 - A2;
HVATest.V3 := InputHVA.V3 - A3;
HVATest.V4 := InputHVA.V4 - A4;
end;
else
begin
HVATest.V1 := InputHVA.V1 + A1;
HVATest.V2 := InputHVA.V2 - A2;
HVATest.V3 := InputHVA.V3 + A3;
HVATest.V4 := InputHVA.V4 - A4;
end;
end;
end;
var
B1, B2, B3, B4: TM128; HVA, AddRes, SubRes, MixRes, AddExp, SubExp, MixExp: THVA; I: Integer;
begin
B1.M128_F32[0] := 1.0; B1.M128_F32[1] := 2.0; B1.M128_F32[2] := 3.0; B1.M128_F32[3] := 4.0;
B2.M128_F32[0] := 5.0; B2.M128_F32[1] := 6.0; B2.M128_F32[2] := 7.0; B2.M128_F32[3] := 8.0;
B3.M128_F32[0] := 9.0; B3.M128_F32[1] := 10.0; B3.M128_F32[2] := 11.0; B3.M128_F32[3] := 12.0;
B4.M128_F32[0] := 13.0; B4.M128_F32[1] := 14.0; B4.M128_F32[2] := 15.0; B4.M128_F32[3] := 16.0;
HVA.V1.M128_F32[0] := 10.0; HVA.V1.M128_F32[1] := 20.0; HVA.V1.M128_F32[2] := 30.0; HVA.V1.M128_F32[3] := 40.0;
HVA.V2.M128_F32[0] := 50.0; HVA.V2.M128_F32[1] := 60.0; HVA.V2.M128_F32[2] := 70.0; HVA.V2.M128_F32[3] := 80.0;
HVA.V3.M128_F32[0] := 90.0; HVA.V3.M128_F32[1] := 100.0; HVA.V3.M128_F32[2] := 110.0; HVA.V3.M128_F32[3] := 120.0;
HVA.V4.M128_F32[0] := 130.0; HVA.V4.M128_F32[1] := 140.0; HVA.V4.M128_F32[2] := 150.0; HVA.V4.M128_F32[3] := 160.0;
AddExp.V1.M128_F32[0] := 11.0; AddExp.V1.M128_F32[1] := 22.0; AddExp.V1.M128_F32[2] := 33.0; AddExp.V1.M128_F32[3] := 44.0;
AddExp.V2.M128_F32[0] := 55.0; AddExp.V2.M128_F32[1] := 66.0; AddExp.V2.M128_F32[2] := 77.0; AddExp.V2.M128_F32[3] := 88.0;
AddExp.V3.M128_F32[0] := 99.0; AddExp.V3.M128_F32[1] := 110.0; AddExp.V3.M128_F32[2] := 121.0; AddExp.V3.M128_F32[3] := 132.0;
AddExp.V4.M128_F32[0] := 143.0; AddExp.V4.M128_F32[1] := 154.0; AddExp.V4.M128_F32[2] := 165.0; AddExp.V4.M128_F32[3] := 176.0;
SubExp.V1.M128_F32[0] := 9.0; SubExp.V1.M128_F32[1] := 18.0; SubExp.V1.M128_F32[2] := 27.0; SubExp.V1.M128_F32[3] := 36.0;
SubExp.V2.M128_F32[0] := 45.0; SubExp.V2.M128_F32[1] := 54.0; SubExp.V2.M128_F32[2] := 63.0; SubExp.V2.M128_F32[3] := 72.0;
SubExp.V3.M128_F32[0] := 81.0; SubExp.V3.M128_F32[1] := 90.0; SubExp.V3.M128_F32[2] := 99.0; SubExp.V3.M128_F32[3] := 108.0;
SubExp.V4.M128_F32[0] := 117.0; SubExp.V4.M128_F32[1] := 126.0; SubExp.V4.M128_F32[2] := 135.0; SubExp.V4.M128_F32[3] := 144.0;
MixExp.V1.M128_F32[0] := 11.0; MixExp.V1.M128_F32[1] := 22.0; MixExp.V1.M128_F32[2] := 33.0; MixExp.V1.M128_F32[3] := 44.0;
MixExp.V2.M128_F32[0] := 45.0; MixExp.V2.M128_F32[1] := 54.0; MixExp.V2.M128_F32[2] := 63.0; MixExp.V2.M128_F32[3] := 72.0;
MixExp.V3.M128_F32[0] := 99.0; MixExp.V3.M128_F32[1] := 110.0; MixExp.V3.M128_F32[2] := 121.0; MixExp.V3.M128_F32[3] := 132.0;
MixExp.V4.M128_F32[0] := 117.0; MixExp.V4.M128_F32[1] := 126.0; MixExp.V4.M128_F32[2] := 135.0; MixExp.V4.M128_F32[3] := 144.0;
WriteLn(' B1: ', B1.M128_F32[0], ',', B1.M128_F32[1], ',', B1.M128_F32[2], ',', B1.M128_F32[3]);
WriteLn(' B2: ', B2.M128_F32[0], ',', B2.M128_F32[1], ',', B2.M128_F32[2], ',', B2.M128_F32[3]);
WriteLn(' B3: ', B3.M128_F32[0], ',', B3.M128_F32[1], ',', B3.M128_F32[2], ',', B3.M128_F32[3]);
WriteLn(' B4: ', B4.M128_F32[0], ',', B4.M128_F32[1], ',', B4.M128_F32[2], ',', B4.M128_F32[3]);
WriteLn('HVA.V1: ', HVA.V1.M128_F32[0], ',', HVA.V1.M128_F32[1], ',', HVA.V1.M128_F32[2], ',', HVA.V1.M128_F32[3]);
WriteLn('HVA.V2: ', HVA.V2.M128_F32[0], ',', HVA.V2.M128_F32[1], ',', HVA.V2.M128_F32[2], ',', HVA.V2.M128_F32[3]);
WriteLn('HVA.V3: ', HVA.V3.M128_F32[0], ',', HVA.V3.M128_F32[1], ',', HVA.V3.M128_F32[2], ',', HVA.V3.M128_F32[3]);
WriteLn('HVA.V4: ', HVA.V4.M128_F32[0], ',', HVA.V4.M128_F32[1], ',', HVA.V4.M128_F32[2], ',', HVA.V4.M128_F32[3]);
AddRes := HVATest(B1, B2, B3, HVA, B4, 1);
SubRes := HVATest(B1, B2, B3, HVA, B4, 2);
MixRes := HVATest(B1, B2, B3, HVA, B4, 0);
WriteLn('----');
WriteLn('AddRes.V1: ', AddRes.V1.M128_F32[0], ',', AddRes.V1.M128_F32[1], ',', AddRes.V1.M128_F32[2], ',', AddRes.V1.M128_F32[3]);
WriteLn('AddRes.V2: ', AddRes.V2.M128_F32[0], ',', AddRes.V2.M128_F32[1], ',', AddRes.V2.M128_F32[2], ',', AddRes.V2.M128_F32[3]);
WriteLn('AddRes.V3: ', AddRes.V3.M128_F32[0], ',', AddRes.V3.M128_F32[1], ',', AddRes.V3.M128_F32[2], ',', AddRes.V3.M128_F32[3]);
WriteLn('AddRes.V4: ', AddRes.V4.M128_F32[0], ',', AddRes.V4.M128_F32[1], ',', AddRes.V4.M128_F32[2], ',', AddRes.V4.M128_F32[3]);
WriteLn();
WriteLn('AddExp.V1: ', AddExp.V1.M128_F32[0], ',', AddExp.V1.M128_F32[1], ',', AddExp.V1.M128_F32[2], ',', AddExp.V1.M128_F32[3]);
WriteLn('AddExp.V2: ', AddExp.V2.M128_F32[0], ',', AddExp.V2.M128_F32[1], ',', AddExp.V2.M128_F32[2], ',', AddExp.V2.M128_F32[3]);
WriteLn('AddExp.V3: ', AddExp.V3.M128_F32[0], ',', AddExp.V3.M128_F32[1], ',', AddExp.V3.M128_F32[2], ',', AddExp.V3.M128_F32[3]);
WriteLn('AddExp.V4: ', AddExp.V4.M128_F32[0], ',', AddExp.V4.M128_F32[1], ',', AddExp.V4.M128_F32[2], ',', AddExp.V4.M128_F32[3]);
WriteLn('----');
WriteLn('SubRes.V1: ', SubRes.V1.M128_F32[0], ',', SubRes.V1.M128_F32[1], ',', SubRes.V1.M128_F32[2], ',', SubRes.V1.M128_F32[3]);
WriteLn('SubRes.V2: ', SubRes.V2.M128_F32[0], ',', SubRes.V2.M128_F32[1], ',', SubRes.V2.M128_F32[2], ',', SubRes.V2.M128_F32[3]);
WriteLn('SubRes.V3: ', SubRes.V3.M128_F32[0], ',', SubRes.V3.M128_F32[1], ',', SubRes.V3.M128_F32[2], ',', SubRes.V3.M128_F32[3]);
WriteLn('SubRes.V4: ', SubRes.V4.M128_F32[0], ',', SubRes.V4.M128_F32[1], ',', SubRes.V4.M128_F32[2], ',', SubRes.V4.M128_F32[3]);
WriteLn();
WriteLn('SubExp.V1: ', SubExp.V1.M128_F32[0], ',', SubExp.V1.M128_F32[1], ',', SubExp.V1.M128_F32[2], ',', SubExp.V1.M128_F32[3]);
WriteLn('SubExp.V2: ', SubExp.V2.M128_F32[0], ',', SubExp.V2.M128_F32[1], ',', SubExp.V2.M128_F32[2], ',', SubExp.V2.M128_F32[3]);
WriteLn('SubExp.V3: ', SubExp.V3.M128_F32[0], ',', SubExp.V3.M128_F32[1], ',', SubExp.V3.M128_F32[2], ',', SubExp.V3.M128_F32[3]);
WriteLn('SubExp.V4: ', SubExp.V4.M128_F32[0], ',', SubExp.V4.M128_F32[1], ',', SubExp.V4.M128_F32[2], ',', SubExp.V4.M128_F32[3]);
WriteLn('----');
WriteLn('MixRes.V1: ', MixRes.V1.M128_F32[0], ',', MixRes.V1.M128_F32[1], ',', MixRes.V1.M128_F32[2], ',', MixRes.V1.M128_F32[3]);
WriteLn('MixRes.V2: ', MixRes.V2.M128_F32[0], ',', MixRes.V2.M128_F32[1], ',', MixRes.V2.M128_F32[2], ',', MixRes.V2.M128_F32[3]);
WriteLn('MixRes.V3: ', MixRes.V3.M128_F32[0], ',', MixRes.V3.M128_F32[1], ',', MixRes.V3.M128_F32[2], ',', MixRes.V3.M128_F32[3]);
WriteLn('MixRes.V4: ', MixRes.V4.M128_F32[0], ',', MixRes.V4.M128_F32[1], ',', MixRes.V4.M128_F32[2], ',', MixRes.V4.M128_F32[3]);
WriteLn();
WriteLn('MixExp.V1: ', MixExp.V1.M128_F32[0], ',', MixExp.V1.M128_F32[1], ',', MixExp.V1.M128_F32[2], ',', MixExp.V1.M128_F32[3]);
WriteLn('MixExp.V2: ', MixExp.V2.M128_F32[0], ',', MixExp.V2.M128_F32[1], ',', MixExp.V2.M128_F32[2], ',', MixExp.V2.M128_F32[3]);
WriteLn('MixExp.V3: ', MixExp.V3.M128_F32[0], ',', MixExp.V3.M128_F32[1], ',', MixExp.V3.M128_F32[2], ',', MixExp.V3.M128_F32[3]);
WriteLn('MixExp.V4: ', MixExp.V4.M128_F32[0], ',', MixExp.V4.M128_F32[1], ',', MixExp.V4.M128_F32[2], ',', MixExp.V4.M128_F32[3]);
for I := 0 to 3 do
begin
if AddRes.V1.M128_F32[I] <> AddExp.V1.M128_F32[I] then
begin
WriteLn('FAILURE on AddRes.V1.M128_F32[', I, ']');
Halt(1);
end;
if SubRes.V1.M128_F32[I] <> SubExp.V1.M128_F32[I] then
begin
WriteLn('FAILURE on SubRes.V1.M128_F32[', I, ']');
Halt(1);
end;
if MixRes.V1.M128_F32[I] <> MixExp.V1.M128_F32[I] then
begin
WriteLn('FAILURE on MixRes.V1.M128_F32[', I, ']');
Halt(1);
end;
end;
WriteLn('ok');
end.

View File

@ -0,0 +1,158 @@
{ %CPU=x86_64 }
program vectorcall_stack_test;
{$IFNDEF CPUX86_64}
{$FATAL This test program can only be compiled on Windows or Linux 64-bit with an Intel processor }
{$ENDIF}
{ This program can be compiled on Linux, and all the vectorcall
routines should work the same, including the assembler routine.
'vectorcall' should be ignored by the compiler on this platform. }
{$push}
{$CODEALIGN RECORDMIN=16}
{$PACKRECORDS C}
type
TM128 = record
case Byte of
0: (M128_F32: array[0..3] of Single);
1: (M128_F64: array[0..1] of Double);
end;
{$CODEALIGN RECORDMIN=32}
{$PACKRECORDS C}
type
TM256 = record
case Byte of
0: (M256_F32: array[0..7] of Single);
1: (M256_F64: array[0..3] of Double);
2: (M256_M128: array[0..1] of TM128);
end;
{$pop}
TVector4f = packed record
case Byte of
0: (M128: TM128);
1: (X, Y, Z, W: Single);
end;
TVectorPair4f = packed record
case Byte of
0: (M256: TM256);
1: (V: array[0..1] of TVector4f);
2: (X1, Y1, Z1, W1, X2, Y2, Z2, W2: Single);
end;
function TestFloat(TP: Single): Single; vectorcall; { vectorcall should have no effect on how this function behaves }
begin
TestFloat := TP * 1.5;
end;
function AddVectors(V1, V2: TVector4f): TVector4f; vectorcall;
begin
AddVectors.X := V1.X + V2.X;
AddVectors.Y := V1.Y + V2.Y;
AddVectors.Z := V1.Z + V2.Z;
AddVectors.W := V1.W + V2.W;
end;
{$ASMMODE Intel}
function AddVectorsAsm(V1, V2: TVector4f): TVector4f; vectorcall; assembler; nostackframe; inline; { The inline is for a future test }
asm
ADDPS XMM0, XMM1
end;
{ Note: V1, V2 and the result will go on the stack until FPC fully supports 256-bit vectors }
function AddVectors(V1, V2: TVectorPair4f): TVectorPair4f; vectorcall;
var
C: Integer;
begin
for C := 0 to 1 do
begin
AddVectors.V[C].X := V1.V[C].X + V2.V[C].X;
AddVectors.V[C].Y := V1.V[C].Y + V2.V[C].Y;
AddVectors.V[C].Z := V1.V[C].Z + V2.V[C].Z;
AddVectors.V[C].W := V1.V[C].W + V2.V[C].W;
end;
end;
var
Vecs: array[0..1] of TVector4f; Res, ResAsm, Exp: TVector4f;
Pairs: array[0..1] of TVectorPair4f; ResPair, ExpPair: TVectorPair4f;
I: Integer;
begin
FillDWord(Vecs[0], 0, 8);
Vecs[0].X := TestFloat(2.0);
Vecs[0].Y := 1.0;
Vecs[0].Z := -4.0;
Vecs[0].W := 1.0;
Vecs[1].X := 0.0;
Vecs[1].Y := -2.0;
Vecs[1].Z := TestFloat(4.0);
Vecs[1].W := 0.0;
Exp.X := 3.0;
Exp.Y := -1.0;
Exp.Z := 2.0;
Exp.W := 1.0;
Pairs[0].V[0].X := 1.0; Pairs[0].V[1].X := 5.0;
Pairs[0].V[0].Y := 2.0; Pairs[0].V[1].Y := 6.0;
Pairs[0].V[0].Z := 3.0; Pairs[0].V[1].Z := 7.0;
Pairs[0].V[0].W := 4.0; Pairs[0].V[1].W := 8.0;
Pairs[1].V[0].X := 9.0; Pairs[1].V[1].X := 13.0;
Pairs[1].V[0].Y := 10.0; Pairs[1].V[1].Y := 14.0;
Pairs[1].V[0].Z := 11.0; Pairs[1].V[1].Z := 15.0;
Pairs[1].V[0].W := 12.0; Pairs[1].V[1].W := 16.0;
ExpPair.V[0].X := 10.0; ExpPair.V[1].X := 18.0;
ExpPair.V[0].Y := 12.0; ExpPair.V[1].Y := 20.0;
ExpPair.V[0].Z := 14.0; ExpPair.V[1].Z := 22.0;
ExpPair.V[0].W := 16.0; ExpPair.V[1].W := 24.0;
WriteLn('Vecs[0] = (', Vecs[0].X, ', ', Vecs[0].Y, ', ', Vecs[0].Z, ', ', Vecs[0].W, ')');
WriteLn('Vecs[1] = (', Vecs[1].X, ', ', Vecs[1].Y, ', ', Vecs[1].Z, ', ', Vecs[1].W, ')');
Res := AddVectors(Vecs[0], Vecs[1]);
ResAsm := AddVectorsAsm(Vecs[0], Vecs[1]);
WriteLn('Result = (', Res.X, ', ', Res.Y, ', ', Res.Z, ', ', Res.W, ')');
WriteLn('ResAsm = (', ResAsm.X, ', ', ResAsm.Y, ', ', ResAsm.Z, ', ', ResAsm.W, ')');
WriteLn('Expected = (', Exp.X, ', ', Exp.Y, ', ', Exp.Z, ', ', Exp.W, ')');
WriteLn('Pairs[0] = (', Pairs[0].V[0].X, ', ', Pairs[0].V[0].Y, ', ', Pairs[0].V[0].Z, ', ', Pairs[0].V[0].W, ', ', Pairs[0].V[1].X, ', ', Pairs[0].V[1].Y, ', ', Pairs[0].V[1].Z, ', ', Pairs[0].V[1].W, ')');
WriteLn('Pairs[1] = (', Pairs[1].V[0].X, ', ', Pairs[1].V[0].Y, ', ', Pairs[1].V[0].Z, ', ', Pairs[1].V[0].W, ', ', Pairs[1].V[1].X, ', ', Pairs[1].V[1].Y, ', ', Pairs[1].V[1].Z, ', ', Pairs[1].V[1].W, ')');
ResPair := AddVectors(Pairs[0], Pairs[1]);
WriteLn('ResPair = (', ResPair.V[0].X, ', ', ResPair.V[0].Y, ', ', ResPair.V[0].Z, ', ', ResPair.V[0].W, ', ', ResPair.V[1].X, ', ', ResPair.V[1].Y, ', ', ResPair.V[1].Z, ', ', ResPair.V[1].W, ')');
WriteLn('Expected = (', ExpPair.V[0].X, ', ', ExpPair.V[0].Y, ', ', ExpPair.V[0].Z, ', ', ExpPair.V[0].W, ', ', ExpPair.V[1].X, ', ', ExpPair.V[1].Y, ', ', ExpPair.V[1].Z, ', ', ExpPair.V[1].W, ')');
for I := 0 to 3 do
begin
if Res.M128.M128_F32[I] <> Exp.M128.M128_F32[I] then
begin
WriteLn('FAILURE on Res.M128.M128_F32[', I, ']');
Halt(1);
end;
if ResAsm.M128.M128_F32[I] <> Exp.M128.M128_F32[I] then
begin
WriteLn('FAILURE on ResAsm.M128.M128_F32[', I, ']');
Halt(1);
end;
end;
for I := 0 to 7 do
begin
if ResPair.M256.M256_F32[I] <> ExpPair.M256.M256_F32[I] then
begin
WriteLn('FAILURE on ResPair.M256.M256_F32[', I, ']');
Halt(1);
end;
end;
WriteLn('ok');
end.