REP STOS branch for x64 Fill* (only for System V ABI for now).

Rika Ichinose 2023-11-25 03:29:32 +03:00 committed by FPK
parent a4c324ee23
commit 1ec0326995


@@ -21,6 +21,15 @@
Primitives
****************************************************************************}
{$ifndef win64}
{$define use_fast_repmovstos} { REP STOS uses nonvolatile RDI and would require a stack frame on Win64 to be SEH-compliant. }
{$endif}
{$ifdef use_fast_repmovstos}
var
fast_large_repmovstosb : boolean; { Enhanced REP MOVSB and STOSB (ERMSB) feature @ CPUID(7).ebx[9]. }
{$endif}
{$define FPC_SYSTEM_HAS_SPTR}
Function Sptr : Pointer;assembler;nostackframe;{$ifdef SYSTEMINLINE}inline;{$endif}
asm
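
For context: ERMSB ("Enhanced REP MOVSB/STOSB") is the feature that the new fast_large_repmovstosb flag caches, reported in CPUID leaf 7, sub-leaf 0, EBX bit 9. A minimal standalone probe for the same bit could look like the sketch below; the program and function names are invented, and it assumes the CPU implements CPUID leaf 7 at all:

program ermsbprobe;
{$asmmode att}

function CpuId7Ebx: dword; assembler; nostackframe;
asm
  pushq %rbx { cpuid clobbers rbx, which is callee-saved }
  movl $7, %eax
  xorl %ecx, %ecx { sub-leaf 0 }
  cpuid
  movl %ebx, %eax { dword function results are returned in eax }
  popq %rbx
end;

begin
  writeln('ERMSB (fast REP MOVSB/STOSB): ', (CpuId7Ebx and (1 shl 9)) <> 0);
end.
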
@@ -297,6 +306,11 @@ procedure FillXxxx_MoreThanTwoXmms; assembler; nostackframe;
rdx = byte count
xmm0 = pattern for unaligned writes
xmm1 = pattern for aligned writes }
const
{$ifdef use_fast_repmovstos}
ErmsThreshold = 1536;
{$endif}
NtThreshold = 512 * 1024;
asm
{ x can start and end misaligned on the vector boundary:
@@ -326,8 +340,13 @@ asm
jle .LFourAlignedTailWrites
add $48, %rcx
{$ifdef use_fast_repmovstos}
cmp $ErmsThreshold, %rdx
jae .LRepStos
{$else}
cmp $NtThreshold, %rdx
jae .L64xNT_Body
{$endif}
.balign 16
.L64x_Body:
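
On System V targets, where use_fast_repmovstos is defined, the dispatch above works out to: counts below ErmsThreshold stay on the SSE store loop, counts of at least ErmsThreshold try REP STOS when the CPU advertises ERMSB, and the non-temporal path is only reached when it does not (via .LRepStosIsNotBetter further down). A plain-Pascal paraphrase of that decision, with invented names ("count" standing for the remaining byte count the assembly keeps in rdx):

type
  TFillStrategy = (fsSseLoop, fsRepStos, fsNonTemporal);

function ChooseFillStrategy(count: SizeUInt; ermsb: boolean): TFillStrategy;
const
  ErmsThreshold = 1536; { same values as in the diff }
  NtThreshold = 512 * 1024;
begin
  if (count >= ErmsThreshold) and ermsb then
    ChooseFillStrategy := fsRepStos { .LRepStos }
  else if count >= NtThreshold then
    ChooseFillStrategy := fsNonTemporal { .L64xNT_Body, via .LRepStosIsNotBetter }
  else
    ChooseFillStrategy := fsSseLoop; { .L64x_Body }
end;
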
@@ -346,9 +365,38 @@ asm
movdqa %xmm1, 32(%rax) { T2 }
.LOneAlignedTailWrite:
movdqa %xmm1, 48(%rax) { T1 }
movdqu %xmm0, 65-16(%r8) { UT }
ret
{$ifdef use_fast_repmovstos}
.LRepStos:
{$ifdef FPC_PIC}
movq fast_large_repmovstosb@GOTPCREL(%rip), %r9
cmpb $1, (%r9)
{$else FPC_PIC}
cmpb $1, fast_large_repmovstosb(%rip)
{$endif FPC_PIC}
jne .LRepStosIsNotBetter
{$ifdef win64}
push %rdi { For tests on Windows; however this is SEH incompliant so the entire use_fast_repmovstos branch is disabled by default! }
{$endif}
mov %rcx, %rdi { rdi = REP STOS destination. }
lea 65-16+8-1(%r8), %rcx
sub %rdi, %rcx
shr $3, %rcx { rcx = count of REP STOSQ blocks before UT. }
movq %xmm1, %rax { recover pattern for aligned writes back to GPR :) }
rep stosq
movdqu %xmm0, 65-16(%r8) { UT }
{$ifdef win64}
pop %rdi
{$endif}
ret
.LRepStosIsNotBetter:
cmp $NtThreshold, %rdx
jb .L64x_Body
{$endif use_fast_repmovstos}
.balign 16
.L64xNT_Body:
movntdq %xmm1, (%rcx)
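
A note on the REP STOSQ setup above: the unaligned tail UT occupies the 16 bytes at 65-16(%r8) and is rewritten by the trailing movdqu no matter what, so the qword loop only has to reach UT's start. The +8-1 in the lea rounds the qword count up from the 16-byte-aligned rdi, which may overrun into UT by up to 7 bytes, harmlessly. The same arithmetic replayed in Pascal (function name invented for illustration):

{ lea 65-16+8-1(%r8), %rcx; sub %rdi, %rcx; shr $3, %rcx }
function RepStosqCount(rdi, r8: PtrUInt): PtrUInt;
begin
  { = ceil((utStart - rdi) / 8), with utStart = r8 + 65 - 16 }
  RepStosqCount := (r8 + 65 - 16 + 8 - 1 - rdi) shr 3;
end;
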
@@ -1452,7 +1500,7 @@ const
{$define FPC_SYSTEM_HAS_FPC_CPUINIT}
procedure fpc_cpuinit;
var
_eax,cpuid7_ebx,cpuid1_ecx : dword;
begin
{ don't let libraries influence the FPU cw set by the host program }
if IsLibrary then
@@ -1473,7 +1521,14 @@ procedure fpc_cpuinit;
xorl %ecx,%ecx
cpuid
movl %ecx,cpuid1_ecx
movl $7,%eax
xorl %ecx,%ecx
cpuid
movl %ebx,cpuid7_ebx
end;
{$ifdef use_fast_repmovstos}
fast_large_repmovstosb:=cpuid7_ebx and (1 shl 9)<>0;
{$endif}
{ XGETBV support? }
if (cpuid1_ecx and $8000000)<>0 then
begin
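
Both feature bits taken from leaf 7 come out of the same cpuid7_ebx word captured above: ERMSB here, AVX2 in the hunk below. For reference, the two masks involved (constant names invented; the values are the ones used in this diff):

const
  CPUID7_EBX_AVX2 = 1 shl 5; { = $20, tested below for has_avx2_support }
  CPUID7_EBX_ERMSB = 1 shl 9; { tested above for fast_large_repmovstosb }
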
@@ -1485,13 +1540,7 @@
if (_eax and 6)=6 then
begin
has_avx_support:=(cpuid1_ecx and $10000000)<>0;
has_avx2_support:=(cpuid7_ebx and $20)<>0;
end;
end;
end;
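
A quick way to watch the new branch kick in on a System V target with an ERMSB-capable CPU: fills of ErmsThreshold (1536) bytes and up should take the REP STOSQ path, while 512 KiB and up fall back to non-temporal stores when ERMSB is absent. A throwaway timing harness, invented here purely as a usage sketch:

program fillbench;
uses
  sysutils;
const
  Sizes: array[0..3] of SizeInt = (1024, 1536, 64*1024, 1024*1024);
var
  buf: array[0..1024*1024 - 1] of byte;
  s: longint;
  i, reps: int64;
  t0: QWord;
begin
  for s := 0 to High(Sizes) do
  begin
    reps := (1024*1024*1024) div Sizes[s]; { ~1 GiB of writes per size }
    t0 := GetTickCount64;
    for i := 1 to reps do
      FillChar(buf[0], Sizes[s], $55);
    writeln(Sizes[s]:8, ' bytes: ', GetTickCount64 - t0, ' ms');
  end;
end.
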