Simplify x86_64.inc:Move non-temporal loops, and adjust thresholds for move distances considered too short for NT.

This commit is contained in:
Rika Ichinose 2024-02-08 11:45:38 +03:00 committed by FPK
parent 0b5998ee8b
commit 12f18177ae

View File

@ -86,6 +86,9 @@ end;
procedure Move(const source;var dest;count:SizeInt);[public, alias: 'FPC_MOVE'];assembler;nostackframe; procedure Move(const source;var dest;count:SizeInt);[public, alias: 'FPC_MOVE'];assembler;nostackframe;
{ Linux: rdi source, rsi dest, rdx count { Linux: rdi source, rsi dest, rdx count
win64: rcx source, rdx dest, r8 count } win64: rcx source, rdx dest, r8 count }
const
NtThreshold = 256 * 1024; { this limit must be processor-specific (1/2 L2 cache size) }
PrefetchDistance = 512;
asm asm
{$ifndef win64} {$ifndef win64}
mov %rdx, %r8 mov %rdx, %r8
@ -157,7 +160,7 @@ asm
.LRestAfterNTf: .LRestAfterNTf:
sub $32, %r8 { During the N× loop, r8 is kept N bytes below the number of bytes actually remaining, which allows sub N+jae .LLoop instead of sub N+cmp N+jae .LLoop. } sub $32, %r8 { During the N× loop, r8 is kept N bytes below the number of bytes actually remaining, which allows sub N+jae .LLoop instead of sub N+cmp N+jae .LLoop. }
jbe .LPost32f jbe .LPost32f
cmp $0x40000, %r8 { this limit must be processor-specific (1/2 L2 cache size) } cmp $NtThreshold-32, %r8
jae .Lntf { might jump back right away after more checks, but the branch is taken only on huge moves so it's better to take these checks out of here... } jae .Lntf { might jump back right away after more checks, but the branch is taken only on huge moves so it's better to take these checks out of here... }
.balign 16 { no-op } .balign 16 { no-op }
@ -176,46 +179,31 @@ asm
movdqu %xmm4, (%r9) { Important for <16-byte step between src and dest. } movdqu %xmm4, (%r9) { Important for <16-byte step between src and dest. }
ret ret
.balign 16
.Lntf: .Lntf:
cmp $0x1000, %rcx { Maybe change mind: don't bother bypassing cache if src and dest are close to each other } cmp $NtThreshold, %rcx { Maybe change mind: don't bother bypassing cache if src and dest are close to each other }
jb .Lloop32f { (this check is performed here to not stand in the way of smaller counts) } jb .Lloop32f { (this check is performed here to not stand in the way of smaller counts) }
sub $0xFE0, %r8 { r8 = remaining - 0x1000, but 32 was subtracted already, so must subtract only (0x1000 - 32) = 0xFE0. } sub $PrefetchDistance+32, %r8 { r8 = remaining - prefetch distance - bytes per loop (64), but 32 was subtracted already. }
.Lntloopf: .balign 16 { no-op }
mov $32, %eax
.balign 16
.Lpref:
prefetchnta (%rcx,%rdx,1)
prefetchnta 0x40(%rcx,%rdx,1)
add $0x80, %rdx
dec %eax
jnz .Lpref
sub $0x1000, %rdx
mov $64, %eax
.balign 16
.Lntloop64f: .Lntloop64f:
prefetchnta 0+PrefetchDistance(%rcx,%rdx,1)
movdqu (%rcx,%rdx,1), %xmm0
movntdq %xmm0, (%rdx)
movdqu 16(%rcx,%rdx,1), %xmm0
movntdq %xmm0, 16(%rdx)
movdqu 32(%rcx,%rdx,1), %xmm0
movntdq %xmm0, 32(%rdx)
movdqu 48(%rcx,%rdx,1), %xmm0
movntdq %xmm0, 48(%rdx)
add $64, %rdx add $64, %rdx
movdqu -64(%rcx,%rdx,1), %xmm0 sub $64, %r8
movntdq %xmm0, -64(%rdx) jae .Lntloop64f
movdqu -48(%rcx,%rdx,1), %xmm0
movntdq %xmm0, -48(%rdx)
movdqu -32(%rcx,%rdx,1), %xmm0
movntdq %xmm0, -32(%rdx)
movdqu -16(%rcx,%rdx,1), %xmm0
movntdq %xmm0, -16(%rdx)
dec %eax
jnz .Lntloop64f
sub $0x1000, %r8 sfence
jae .Lntloopf add $PrefetchDistance+64, %r8
mfence
add $0x1000, %r8
jmpq .LRestAfterNTf { go handle remaining bytes } jmpq .LRestAfterNTf { go handle remaining bytes }
.byte 102,102,102,102,102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32b into a no-op. } .byte 102,102,102,102,102,102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32b into a no-op. }
{ backwards move } { backwards move }
.Lback: .Lback:
@ -229,7 +217,7 @@ asm
.LRestAfterNTb: .LRestAfterNTb:
sub $32, %r8 sub $32, %r8
jbe .LPost32b jbe .LPost32b
cmp $0x40000, %r8 cmp $NtThreshold-32, %r8
jae .Lntb jae .Lntb
.balign 16 { no-op } .balign 16 { no-op }
@ -249,27 +237,15 @@ asm
movdqu %xmm5, -16(%r9) movdqu %xmm5, -16(%r9)
ret ret
.balign 16
.Lntb: .Lntb:
cmp $0xfffffffffffff000,%rcx cmp $-NtThreshold,%rcx
jnb .Lloop32b jnb .Lloop32b
sub $0xFE0, %r8 sub $PrefetchDistance+32, %r8
.Lntloopb: .balign 16 { no-op }
mov $32, %eax
.balign 16
.Lprefb:
sub $0x80, %rdx
prefetchnta (%rcx,%rdx,1)
prefetchnta 0x40(%rcx,%rdx,1)
dec %eax
jnz .Lprefb
add $0x1000, %rdx
mov $0x40, %eax
.balign 16
.Lntloop64b: .Lntloop64b:
prefetchnta -PrefetchDistance(%rcx,%rdx,1)
sub $64, %rdx sub $64, %rdx
movdqu 48(%rcx,%rdx,1), %xmm0 movdqu 48(%rcx,%rdx,1), %xmm0
movntdq %xmm0, 48(%rdx) movntdq %xmm0, 48(%rdx)
@ -279,14 +255,11 @@ asm
movntdq %xmm0, 16(%rdx) movntdq %xmm0, 16(%rdx)
movdqu (%rcx,%rdx,1), %xmm0 movdqu (%rcx,%rdx,1), %xmm0
movntdq %xmm0, (%rdx) movntdq %xmm0, (%rdx)
dec %eax sub $64, %r8
jnz .Lntloop64b jae .Lntloop64b
sub $0x1000, %r8 sfence
jae .Lntloopb add $PrefetchDistance+64, %r8
mfence
add $0x1000, %r8
jmpq .LRestAfterNTb jmpq .LRestAfterNTb
end; end;
{$endif FPC_SYSTEM_HAS_MOVE} {$endif FPC_SYSTEM_HAS_MOVE}