mirror of
https://gitlab.com/freepascal.org/fpc/source.git
synced 2025-08-30 07:40:27 +02:00
Simplify the non-temporal (NT) loops of Move in x86_64.inc, and adjust the thresholds for source-to-destination distances considered too short for NT stores.
This commit is contained in:
parent
0b5998ee8b
commit
12f18177ae
@ -86,6 +86,9 @@ end;
|
|||||||
procedure Move(const source;var dest;count:SizeInt);[public, alias: 'FPC_MOVE'];assembler;nostackframe;
|
procedure Move(const source;var dest;count:SizeInt);[public, alias: 'FPC_MOVE'];assembler;nostackframe;
|
||||||
{ Linux: rdi source, rsi dest, rdx count
|
{ Linux: rdi source, rsi dest, rdx count
|
||||||
win64: rcx source, rdx dest, r8 count }
|
win64: rcx source, rdx dest, r8 count }
|
||||||
|
const
|
||||||
|
NtThreshold = 256 * 1024; { this limit must be processor-specific (1/2 L2 cache size) }
|
||||||
|
PrefetchDistance = 512;
|
||||||
asm
|
asm
|
||||||
{$ifndef win64}
|
{$ifndef win64}
|
||||||
mov %rdx, %r8
|
mov %rdx, %r8
|
||||||
@ -157,7 +160,7 @@ asm
|
|||||||
.LRestAfterNTf:
|
.LRestAfterNTf:
|
||||||
sub $32, %r8 { During the N× loop, r8 is N bytes less than actually remained to allow sub N+jae .LLoop instead of sub N+cmp N+jae .LLoop. }
|
sub $32, %r8 { During the N× loop, r8 is N bytes less than actually remained to allow sub N+jae .LLoop instead of sub N+cmp N+jae .LLoop. }
|
||||||
jbe .LPost32f
|
jbe .LPost32f
|
||||||
cmp $0x40000, %r8 { this limit must be processor-specific (1/2 L2 cache size) }
|
cmp $NtThreshold-32, %r8
|
||||||
jae .Lntf { might jump back right away after more checks, but the branch is taken only on huge moves so it's better to take these checks out of here... }
|
jae .Lntf { might jump back right away after more checks, but the branch is taken only on huge moves so it's better to take these checks out of here... }
|
||||||
|
|
||||||
.balign 16 { no-op }
|
.balign 16 { no-op }
|
||||||
@ -176,46 +179,31 @@ asm
|
|||||||
movdqu %xmm4, (%r9) { Important for <16-byte step between src and dest. }
|
movdqu %xmm4, (%r9) { Important for <16-byte step between src and dest. }
|
||||||
ret
|
ret
|
||||||
|
|
||||||
|
.balign 16
|
||||||
.Lntf:
|
.Lntf:
|
||||||
cmp $0x1000, %rcx { Maybe change mind: don't bother bypassing cache if src and dest are close to each other }
|
cmp $NtThreshold, %rcx { Maybe change mind: don't bother bypassing cache if src and dest are close to each other }
|
||||||
jb .Lloop32f { (this check is performed here to not stand in the way of smaller counts) }
|
jb .Lloop32f { (this check is performed here to not stand in the way of smaller counts) }
|
||||||
sub $0xFE0, %r8 { r8 = remaining - 0x1000, but 32 was subtracted already, so must subtract only (0x1000 - 32) = 0xFE0. }
|
sub $PrefetchDistance+32, %r8 { r8 = remaining - prefetch distance - bytes per loop (64), but 32 was subtracted already. }
|
||||||
|
|
||||||
.Lntloopf:
|
.balign 16 { no-op }
|
||||||
mov $32, %eax
|
|
||||||
|
|
||||||
.balign 16
|
|
||||||
.Lpref:
|
|
||||||
prefetchnta (%rcx,%rdx,1)
|
|
||||||
prefetchnta 0x40(%rcx,%rdx,1)
|
|
||||||
add $0x80, %rdx
|
|
||||||
dec %eax
|
|
||||||
jnz .Lpref
|
|
||||||
|
|
||||||
sub $0x1000, %rdx
|
|
||||||
mov $64, %eax
|
|
||||||
|
|
||||||
.balign 16
|
|
||||||
.Lntloop64f:
|
.Lntloop64f:
|
||||||
|
prefetchnta 0+PrefetchDistance(%rcx,%rdx,1)
|
||||||
|
movdqu (%rcx,%rdx,1), %xmm0
|
||||||
|
movntdq %xmm0, (%rdx)
|
||||||
|
movdqu 16(%rcx,%rdx,1), %xmm0
|
||||||
|
movntdq %xmm0, 16(%rdx)
|
||||||
|
movdqu 32(%rcx,%rdx,1), %xmm0
|
||||||
|
movntdq %xmm0, 32(%rdx)
|
||||||
|
movdqu 48(%rcx,%rdx,1), %xmm0
|
||||||
|
movntdq %xmm0, 48(%rdx)
|
||||||
add $64, %rdx
|
add $64, %rdx
|
||||||
movdqu -64(%rcx,%rdx,1), %xmm0
|
sub $64, %r8
|
||||||
movntdq %xmm0, -64(%rdx)
|
jae .Lntloop64f
|
||||||
movdqu -48(%rcx,%rdx,1), %xmm0
|
|
||||||
movntdq %xmm0, -48(%rdx)
|
|
||||||
movdqu -32(%rcx,%rdx,1), %xmm0
|
|
||||||
movntdq %xmm0, -32(%rdx)
|
|
||||||
movdqu -16(%rcx,%rdx,1), %xmm0
|
|
||||||
movntdq %xmm0, -16(%rdx)
|
|
||||||
dec %eax
|
|
||||||
jnz .Lntloop64f
|
|
||||||
|
|
||||||
sub $0x1000, %r8
|
sfence
|
||||||
jae .Lntloopf
|
add $PrefetchDistance+64, %r8
|
||||||
|
|
||||||
mfence
|
|
||||||
add $0x1000, %r8
|
|
||||||
jmpq .LRestAfterNTf { go handle remaining bytes }
|
jmpq .LRestAfterNTf { go handle remaining bytes }
|
||||||
.byte 102,102,102,102,102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32b into a no-op. }
|
.byte 102,102,102,102,102,102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32b into a no-op. }
|
||||||
|
|
||||||
{ backwards move }
|
{ backwards move }
|
||||||
.Lback:
|
.Lback:
|
||||||
@ -229,7 +217,7 @@ asm
|
|||||||
.LRestAfterNTb:
|
.LRestAfterNTb:
|
||||||
sub $32, %r8
|
sub $32, %r8
|
||||||
jbe .LPost32b
|
jbe .LPost32b
|
||||||
cmp $0x40000, %r8
|
cmp $NtThreshold-32, %r8
|
||||||
jae .Lntb
|
jae .Lntb
|
||||||
|
|
||||||
.balign 16 { no-op }
|
.balign 16 { no-op }
|
||||||
@ -249,27 +237,15 @@ asm
|
|||||||
movdqu %xmm5, -16(%r9)
|
movdqu %xmm5, -16(%r9)
|
||||||
ret
|
ret
|
||||||
|
|
||||||
|
.balign 16
|
||||||
.Lntb:
|
.Lntb:
|
||||||
cmp $0xfffffffffffff000,%rcx
|
cmp $-NtThreshold,%rcx
|
||||||
jnb .Lloop32b
|
jnb .Lloop32b
|
||||||
sub $0xFE0, %r8
|
sub $PrefetchDistance+32, %r8
|
||||||
|
|
||||||
.Lntloopb:
|
.balign 16 { no-op }
|
||||||
mov $32, %eax
|
|
||||||
|
|
||||||
.balign 16
|
|
||||||
.Lprefb:
|
|
||||||
sub $0x80, %rdx
|
|
||||||
prefetchnta (%rcx,%rdx,1)
|
|
||||||
prefetchnta 0x40(%rcx,%rdx,1)
|
|
||||||
dec %eax
|
|
||||||
jnz .Lprefb
|
|
||||||
|
|
||||||
add $0x1000, %rdx
|
|
||||||
mov $0x40, %eax
|
|
||||||
|
|
||||||
.balign 16
|
|
||||||
.Lntloop64b:
|
.Lntloop64b:
|
||||||
|
prefetchnta -PrefetchDistance(%rcx,%rdx,1)
|
||||||
sub $64, %rdx
|
sub $64, %rdx
|
||||||
movdqu 48(%rcx,%rdx,1), %xmm0
|
movdqu 48(%rcx,%rdx,1), %xmm0
|
||||||
movntdq %xmm0, 48(%rdx)
|
movntdq %xmm0, 48(%rdx)
|
||||||
@ -279,14 +255,11 @@ asm
|
|||||||
movntdq %xmm0, 16(%rdx)
|
movntdq %xmm0, 16(%rdx)
|
||||||
movdqu (%rcx,%rdx,1), %xmm0
|
movdqu (%rcx,%rdx,1), %xmm0
|
||||||
movntdq %xmm0, (%rdx)
|
movntdq %xmm0, (%rdx)
|
||||||
dec %eax
|
sub $64, %r8
|
||||||
jnz .Lntloop64b
|
jae .Lntloop64b
|
||||||
|
|
||||||
sub $0x1000, %r8
|
sfence
|
||||||
jae .Lntloopb
|
add $PrefetchDistance+64, %r8
|
||||||
|
|
||||||
mfence
|
|
||||||
add $0x1000, %r8
|
|
||||||
jmpq .LRestAfterNTb
|
jmpq .LRestAfterNTb
|
||||||
end;
|
end;
|
||||||
{$endif FPC_SYSTEM_HAS_MOVE}
|
{$endif FPC_SYSTEM_HAS_MOVE}
|
||||||
|
Loading…
Reference in New Issue
Block a user