Check for Move overlaps in a more obvious way (that also performs no jumps in the forward case).

This commit is contained in:
Rika Ichinose 2024-02-08 11:30:32 +03:00 committed by FPK
parent 8a48d1bbbc
commit e395166cb7
2 changed files with 41 additions and 49 deletions

View File

@ -7,35 +7,34 @@ procedure Move_8OrMore_Valgrind; assembler; nostackframe;
{ eax = source, edx = dest, ecx = count (ecx >= 8).
If FPC_PIC: ebx pushed. }
asm
{$ifndef FPC_PIC}
push %ebx
{$endif}
sub %edx, %eax
jae .LForward
mov %ecx, %ebx
add %eax, %ebx { eax is negative => ecx+eax > 0 if regions overlap }
jb .LBack { if no overlap, still do forward move }
sub %eax, %edx { edx = dest - src }
cmp %edx, %ecx
ja .LBack { count (ecx) > unsigned(dest - src) (edx) if regions overlap }
.LForward:
{$ifdef FPC_ENABLED_CLD}
cld
{$endif FPC_ENABLED_CLD}
push %esi
push %edi
lea (%eax,%edx), %esi
mov %edx, %edi
mov %eax, %esi
lea (%edx,%eax), %edi
rep movsb
pop %edi
pop %esi
{$ifdef FPC_PIC}
pop %ebx
{$endif}
ret
.LBack:
add %ecx, %edx
{$ifndef FPC_PIC}
push %ebx
{$endif}
add %ecx, %eax
.LNextb:
dec %edx
mov (%eax,%edx), %bl
mov %bl, (%edx)
dec %eax
mov (%eax), %bl
mov %bl, (%edx,%eax)
dec %ecx
jnz .LNextb
pop %ebx
@ -77,13 +76,11 @@ asm
{$ifndef FPC_PIC}
push %ebx
{$endif}
jnb .LForward { src>dest => forward move }
mov %eax, %ebx
neg %ebx
cmp %ebx, %ecx
ja .Lback { count (ecx) > unsigned(dest - src) (ebx) if regions overlap }
mov %ecx, %ebx
add %eax, %ebx { eax is negative => ecx+eax > 0 if regions overlap }
jb .Lback { if no overlap, still do forward move }
.LForward:
mov %edx, %ebx { remember original dest to write first 16 bytes }
add %edx, %ecx { Move dest to the next 16-byte boundary. +16 if already aligned, as first 16 bytes will be written separately anyway. }
add $8, %edx
@ -161,13 +158,11 @@ asm
movq -8(%eax,%ecx), %mm5
sub %edx, %eax { eax = src - dest }
jz .Lquit { exit if src=dest }
jnb .LForward { src>dest => forward move }
mov %eax, %ebx
neg %ebx
cmp %ebx, %ecx
ja .Lback { count (ecx) > unsigned(dest - src) (ebx) if regions overlap }
mov %ecx, %ebx
add %eax, %ebx { eax is negative => ecx+eax > 0 if regions overlap }
jb .Lback { if no overlap, still do forward move }
.LForward:
mov %edx, %ebx { remember original dest to write first 16 bytes }
add %edx, %ecx { Move dest to the next 16-byte boundary. +16 if already aligned, as first 16 bytes will be written separately anyway. }
add $8, %edx
@ -237,7 +232,7 @@ end;
{$ifndef FASTMOVE_DISABLE_SSE}
label
Move_8OrMore_SSE_9to16, Move_8OrMore_SSE_33OrMore, Move_8OrMore_SSE_CancelERMSBackwards;
Move_8OrMore_SSE_9to15, Move_8OrMore_SSE_33OrMore, Move_8OrMore_SSE_CancelERMSBackwards;
const
Move_NtThreshold = 256 * 1024; { this limit must be processor-specific (1/2 L2 cache size) }
@ -248,20 +243,20 @@ procedure Move_8OrMore_SSE; assembler; nostackframe;
const
PrefetchDistance = 512;
asm
cmp $16, %ecx
jle Move_8OrMore_SSE_9to16
cmp $15, %ecx
jle Move_8OrMore_SSE_9to15
movups (%eax), %xmm4 { First and last 16 bytes, used both in Move_8OrMore_SSE_33OrMore and 17-32 branch. }
movups -16(%eax,%ecx), %xmm5
cmp $32, %ecx
jg Move_8OrMore_SSE_33OrMore
movups %xmm4, (%edx) { 17-32 bytes }
movups %xmm4, (%edx) { 16-32 bytes }
movups %xmm5, -16(%edx,%ecx)
{$ifdef FPC_PIC}
pop %ebx
{$endif}
ret
Move_8OrMore_SSE_9to16:
Move_8OrMore_SSE_9to15:
movlps (%eax), %xmm0
movlps -8(%eax,%ecx), %xmm1
movlps %xmm0, (%edx)
@ -271,7 +266,7 @@ Move_8OrMore_SSE_9to16:
pop %ebx
{$endif}
ret
.byte {$ifndef FPC_PIC}102,{$endif}102,102,102,102,102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32f into a no-op. }
.byte {$ifndef FPC_PIC}102,{$endif}102,102,102,102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32f into a no-op. }
Move_8OrMore_SSE_33OrMore:
sub %edx, %eax { eax = src - dest }
@ -279,13 +274,11 @@ Move_8OrMore_SSE_33OrMore:
{$ifndef FPC_PIC}
push %ebx
{$endif}
jnb .LForward { src>dest => forward move }
mov %eax, %ebx
neg %ebx
cmp %ebx, %ecx
ja .Lback { count (ecx) > unsigned(dest - src) (ebx) if regions overlap }
lea -1(%ecx), %ebx
add %eax, %ebx { eax is negative => ecx+eax > 0 if regions overlap }
jb .Lback { if no overlap, still do forward move }
.LForward:
mov %edx, %ebx { remember original dest to write first 16 bytes }
add %edx, %ecx { Move dest to the next 16-byte boundary. +16 if already aligned, as first 16 bytes will be written separately anyway. }
add $16, %edx
@ -466,15 +459,15 @@ procedure Move_8OrMore_SSE_ERMS; assembler; nostackframe;
const
ErmsThreshold = 1536;
asm
cmp $16, %ecx
jle Move_8OrMore_SSE_9to16
cmp $15, %ecx
jle Move_8OrMore_SSE_9to15
cmp $ErmsThreshold, %ecx
jae .LRepMovs
movups (%eax), %xmm4 { Same as in Move_8OrMore_SSE. }
movups -16(%eax,%ecx), %xmm5
cmp $32, %ecx
jg Move_8OrMore_SSE_33OrMore
movups %xmm4, (%edx) { 17-32 bytes }
movups %xmm4, (%edx) { 16-32 bytes }
movups %xmm5, -16(%edx,%ecx)
{$ifdef FPC_PIC}
pop %ebx

View File

@ -134,18 +134,17 @@ asm
mov %r9, -8(%rdx,%r8)
.Lquit:
ret
.byte 0x90,0x90,0x90 { Turns .balign 16 before .Lloop32f into a no-op. }
.byte 102,144 { Turns .balign 16 before .Lloop32f into a no-op. }
.L33OrMore:
sub %rdx, %rcx { rcx = src - dest }
jz .Lquit { exit if src=dest }
jnb .LForward { src>dest => forward move }
mov %r8, %rax
add %rcx, %rax { rcx is negative => r8+rcx > 0 if regions overlap }
jb .Lback { if no overlap, still do forward move }
mov %rcx, %rax
neg %rax
cmp %rax, %r8
ja .Lback { count (r8) > unsigned(dest - src) (rax) if regions overlap }
.LForward:
mov %rdx, %r9 { remember original dest to write first 16 bytes }
add %rdx, %r8 { Move dest to the next 16-byte boundary. +16 if already aligned, as first 16 bytes will be written separately anyway. }
add $16, %rdx
@ -217,7 +216,7 @@ asm
mfence
add $0x1000, %r8
jmpq .LRestAfterNTf { go handle remaining bytes }
.byte 0x90,0x90,0x90 { Turns .balign 16 before .Lloop32b into a no-op. }
.byte 102,102,144 { Turns .balign 16 before .Lloop32b into a no-op. }
{ backwards move }
.Lback: