mirror of
https://gitlab.com/freepascal.org/fpc/source.git
synced 2025-04-20 07:39:25 +02:00
REP STOS branch for x64 Fill* (only for System V ABI for now).
This commit is contained in:
parent
a4c324ee23
commit
1ec0326995
@ -21,6 +21,15 @@
|
||||
Primitives
|
||||
****************************************************************************}
|
||||
|
||||
{$ifndef win64}
|
||||
{$define use_fast_repmovstos} { REP STOS uses nonvolatile RDI and would require a stack frame on Win64 to be SEH-compliant. }
|
||||
{$endif}
|
||||
|
||||
{$ifdef use_fast_repmovstos}
|
||||
var
|
||||
fast_large_repmovstosb : boolean; { Enhanced REP MOVSB and STOSB (ERMSB) feature @ CPUID(7).ebx[9]. }
|
||||
{$endif}
|
||||
|
||||
{$define FPC_SYSTEM_HAS_SPTR}
|
||||
Function Sptr : Pointer;assembler;nostackframe;{$ifdef SYSTEMINLINE}inline;{$endif}
|
||||
asm
|
||||
@ -297,6 +306,11 @@ procedure FillXxxx_MoreThanTwoXmms; assembler; nostackframe;
|
||||
rdx = byte count
|
||||
xmm0 = pattern for unaligned writes
|
||||
xmm1 = pattern for aligned writes }
|
||||
const
|
||||
{$ifdef use_fast_repmovstos}
|
||||
ErmsThreshold = 1536;
|
||||
{$endif}
|
||||
NtThreshold = 512 * 1024;
|
||||
asm
|
||||
{ x can start and end misaligned on the vector boundary:
|
||||
|
||||
@ -326,8 +340,13 @@ asm
|
||||
jle .LFourAlignedTailWrites
|
||||
|
||||
add $48, %rcx
|
||||
cmp $0x80000, %rdx
|
||||
{$ifdef use_fast_repmovstos}
|
||||
cmp $ErmsThreshold, %rdx
|
||||
jae .LRepStos
|
||||
{$else}
|
||||
cmp $NtThreshold, %rdx
|
||||
jae .L64xNT_Body
|
||||
{$endif}
|
||||
|
||||
.balign 16
|
||||
.L64x_Body:
|
||||
@ -346,9 +365,38 @@ asm
|
||||
movdqa %xmm1, 32(%rax) { T2 }
|
||||
.LOneAlignedTailWrite:
|
||||
movdqa %xmm1, 48(%rax) { T1 }
|
||||
movdqu %xmm0, 49(%r8) { UT }
|
||||
movdqu %xmm0, 65-16(%r8) { UT }
|
||||
ret
|
||||
|
||||
{$ifdef use_fast_repmovstos}
|
||||
.LRepStos:
|
||||
{$ifdef FPC_PIC}
|
||||
movq fast_large_repmovstosb@GOTPCREL(%rip), %r9
|
||||
cmpb $1, (%r9)
|
||||
{$else FPC_PIC}
|
||||
cmpb $1, fast_large_repmovstosb(%rip)
|
||||
{$endif FPC_PIC}
|
||||
jne .LRepStosIsNotBetter
|
||||
{$ifdef win64}
|
||||
push %rdi { For tests on Windows; however, this is not SEH-compliant, so the entire use_fast_repmovstos branch is disabled by default! }
|
||||
{$endif}
|
||||
mov %rcx, %rdi { rdi = REP STOS destination. }
|
||||
lea 65-16+8-1(%r8), %rcx
|
||||
sub %rdi, %rcx
|
||||
shr $3, %rcx { rcx = count of REP STOSQ blocks before UT. }
|
||||
movq %xmm1, %rax { recover pattern for aligned writes back to GPR :) }
|
||||
rep stosq
|
||||
movdqu %xmm0, 65-16(%r8) { UT }
|
||||
{$ifdef win64}
|
||||
pop %rdi
|
||||
{$endif}
|
||||
ret
|
||||
{$endif}
|
||||
|
||||
.LRepStosIsNotBetter:
|
||||
cmp $NtThreshold, %rdx
|
||||
jb .L64x_Body
|
||||
|
||||
.balign 16
|
||||
.L64xNT_Body:
|
||||
movntdq %xmm1, (%rcx)
|
||||
@ -1452,7 +1500,7 @@ const
|
||||
{$define FPC_SYSTEM_HAS_FPC_CPUINIT}
|
||||
procedure fpc_cpuinit;
|
||||
var
|
||||
_eax,_ebx,cpuid1_ecx : dword;
|
||||
_eax,cpuid7_ebx,cpuid1_ecx : dword;
|
||||
begin
|
||||
{ don't let libraries influence the FPU cw set by the host program }
|
||||
if IsLibrary then
|
||||
@ -1473,7 +1521,14 @@ procedure fpc_cpuinit;
|
||||
xorl %ecx,%ecx
|
||||
cpuid
|
||||
movl %ecx,cpuid1_ecx
|
||||
movl $7,%eax
|
||||
xorl %ecx,%ecx
|
||||
cpuid
|
||||
movl %ebx,cpuid7_ebx
|
||||
end;
|
||||
{$ifdef use_fast_repmovstos}
|
||||
fast_large_repmovstosb:=cpuid7_ebx and (1 shl 9)<>0;
|
||||
{$endif}
|
||||
{ XGETBV support? }
|
||||
if (cpuid1_ecx and $8000000)<>0 then
|
||||
begin
|
||||
@ -1485,13 +1540,7 @@ procedure fpc_cpuinit;
|
||||
if (_eax and 6)=6 then
|
||||
begin
|
||||
has_avx_support:=(cpuid1_ecx and $10000000)<>0;
|
||||
asm
|
||||
movl $7,%eax
|
||||
xorl %ecx,%ecx
|
||||
cpuid
|
||||
movl %ebx,_ebx
|
||||
end;
|
||||
has_avx2_support:=(_ebx and $20)<>0;
|
||||
has_avx2_support:=(cpuid7_ebx and $20)<>0;
|
||||
end;
|
||||
end;
|
||||
end;
|
||||
|
Loading…
Reference in New Issue
Block a user