Write the last two values after the 2× loops unconditionally instead of performing an extra check.

Rika Ichinose (2024-02-08 11:34:36 +03:00), committed by FPK
parent e395166cb7
commit 0b5998ee8b
2 changed files with 32 additions and 56 deletions
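Across all four implementations touched here (i386 FPU, MMX, SSE, and x86_64) the change has the same shape: the forward bulk loop used to be followed by a compare and a conditional one-more-block copy before the final stores; now a "second vector from the end" is preloaded next to the first and last ones, and everything left after the loop is stored unconditionally. A minimal C sketch of the new forward scheme, with hypothetical names (Block, move_forward_sketch) and the 16-byte SSE block size standing in for the per-variant granularity:

#include <stdint.h>
#include <string.h>

/* Block stands for the per-variant granularity: 8 bytes in the FPU and MMX
   paths, 16 in the SSE and x86_64 paths. */
typedef struct { unsigned char b[16]; } Block;

static void load(Block *r, const unsigned char *p)  { memcpy(r->b, p, 16); }
static void store(unsigned char *p, const Block *r) { memcpy(p, r->b, 16); }

/* Forward path, count >= 33, dest <= src or the regions don't overlap. */
static void move_forward_sketch(unsigned char *dest, const unsigned char *src,
                                size_t count)
{
    Block first, last, second_last;
    load(&first, src);                    /* like xmm4 */
    load(&last, src + count - 16);        /* like xmm5 */
    load(&second_last, src + count - 32); /* like xmm3: read up front, while
                                             src is still easy to address */

    /* Bulk loop; the store stream starts on the first 16-byte boundary past
       dest, so the head block is left to the final store below. */
    size_t i = 16 - ((uintptr_t)dest & 15);
    for (; i + 32 <= count; i += 16) {
        Block t;
        load(&t, src + i);
        store(dest + i, &t);
    }

    /* Tail before this commit:
         if (count - i > 16) { copy one more block at dest + i; }
       followed by the last and first stores. Now the preloaded
       second-to-last block is stored unconditionally instead; at worst it
       overlaps bytes the loop already wrote, which is harmless. */
    store(dest + count - 32, &second_last);
    store(dest + count - 16, &last);

    /* The head block must go last: with 0 < src - dest < 16, writing
       dest[0..15] any earlier would clobber source bytes the loop had yet
       to read. */
    store(dest, &first);
}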

Changed file 1 (i386 Move assembly):

@@ -63,19 +63,19 @@ asm
ret
.Lcancel:
fstp %st(0) { Pop the “second int64 from the end” that .L33OrMore loads. }
fucompp { Pop two elements loaded at the beginning. }
{$ifdef FPC_PIC}
pop %ebx
{$endif}
ret
.byte {$ifndef FPC_PIC}102,{$endif}102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop16f into a no-op. }
.byte 102,102,144 { Turns .balign 16 before .Lloop16f into a no-op. }
.L33OrMore:
sub %edx, %eax { eax = src - dest }
jz .Lcancel { exit if src=dest }
fildq -16(%eax,%ecx) { Second int64 from the end. }
{$ifndef FPC_PIC}
push %ebx
{$endif}
sub %edx, %eax { eax = src - dest }
jz .Lcancel { exit if src=dest }
mov %eax, %ebx
neg %ebx
cmp %ebx, %ecx
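A note on the `.byte ... 102, ..., 144` lines that churn throughout this diff: they are hand-sized pads, where 102 is 0x66 (the operand-size prefix) and 144 is 0x90 (nop), so the whole sequence decodes as a single long nop that lands the next loop label exactly on a 16-byte boundary, turning the following `.balign 16` into a no-op as the comments say. Whenever the code before a loop changes length (as in this commit), the pad has to be re-tuned; the `{$ifndef FPC_PIC}102,{$endif}` variants exist because PIC adds a push that changes the preceding size. A trivial C snippet just to show the byte values:

#include <stdio.h>

int main(void)
{
    /* One of the new pads, in the decimal form used by the source. */
    const unsigned char pad[] = { 102, 102, 144 };

    for (size_t n = 0; n < sizeof pad; n++)
        printf("%#04x ", pad[n]);   /* prints 0x66 0x66 0x90: two operand-size
                                       prefixes plus nop = one 3-byte nop */
    putchar('\n');
    return 0;
}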
@@ -101,19 +101,17 @@ asm
ja .Lloop16f
.LPost16f: { +16 fixup not applied after 16× loop, ecx = remaining - 16 here. }
cmp $-8, %ecx
jle .LFirstAndLast8f
fildq (%eax,%edx)
fistpq (%edx)
.LFirstAndLast8f:
fistpq (%edx,%ecx)
fistpq 8(%edx,%ecx) { Write first and last 8 bytes after everything else. }
fistpq (%ebx) { Important for <8-byte step between src and dest. }
pop %ebx
ret
.byte 102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop16b into a no-op. }
.byte 102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop16b into a no-op. }
{ backwards move }
.Lback:
fstp %st(0)
fildq 8(%eax,%edx) { Second int64 from the start. }
lea -1(%edx,%ecx), %ebx { points to the end of dest; remember to write last 8 bytes }
mov %ebx, %ecx { move dest to the previous 8-byte boundary... }
and $-8, %ecx
@@ -134,12 +132,8 @@ asm
ja .Lloop16b
.LPost16b:
cmp $-8, %ecx
jle .LFirstAndLast8b
fildq -8(%eax,%edx)
fistpq -8(%edx)
.LFirstAndLast8b:
sub %ecx, %edx
fistpq -8(%edx)
fistpq -7(%ebx)
fistpq -16(%edx)
pop %ebx
@@ -156,6 +150,7 @@ asm
{$endif}
movq (%eax), %mm4 { First and last 8 bytes. }
movq -8(%eax,%ecx), %mm5
movq -16(%eax,%ecx), %mm3 { Second vector from the end. }
sub %edx, %eax { eax = src - dest }
jz .Lquit { exit if src=dest }
mov %eax, %ebx
@@ -183,21 +178,18 @@ asm
ja .Lloop16f
.LPost16f: { +16 fixup not applied after 16× loop, ecx = remaining - 16 here. }
cmp $-8, %ecx
jle .LFirstAndLast8f
movq (%eax,%edx), %mm0
movq %mm0, (%edx)
.LFirstAndLast8f:
movq %mm3, (%edx,%ecx)
movq %mm5, 8(%edx,%ecx) { Write first and last 8 bytes after everything else. }
movq %mm4, (%ebx) { Important for <8-byte step between src and dest. }
.Lquit:
emms
pop %ebx
ret
.byte 102,102,102,102,102,102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop16b into a no-op. }
.byte 144 { Turns .balign 16 before .Lloop16b into a no-op. }
{ backwards move }
.Lback:
movq 8(%eax,%edx), %mm3 { Second vector from the start. }
lea -1(%edx,%ecx), %ebx { points to the end of dest; remember to write last 8 bytes }
mov %ebx, %ecx { move dest to the previous 8-byte boundary... }
and $-8, %ecx
@@ -218,12 +210,8 @@ asm
ja .Lloop16b
.LPost16b:
cmp $-8, %ecx
jle .LFirstAndLast8b
movq -8(%eax,%edx), %mm0
movq %mm0, -8(%edx)
.LFirstAndLast8b:
sub %ecx, %edx
movq %mm3, -8(%edx)
movq %mm4, -16(%edx)
movq %mm5, -7(%ebx)
emms
@@ -266,9 +254,12 @@ Move_8OrMore_SSE_9to15:
pop %ebx
{$endif}
ret
.byte {$ifndef FPC_PIC}102,{$endif}102,102,102,102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32f into a no-op. }
.byte {$ifndef FPC_PIC}102,{$endif}102,102,102,102,102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32f into a no-op. }
Move_8OrMore_SSE_33OrMore:
movups -32(%eax,%ecx), %xmm3 { Second vector from the end. Wasted read if the .Lback branch is taken (it uses the second vector from the start instead), }
{ but -32(%eax,%ecx) is about to stop being easily addressable, .Lback is rare, and a small .Lback is rarer still / matters even less. }
sub %edx, %eax { eax = src - dest }
jz .Lquit { exit if src=dest }
{$ifndef FPC_PIC}
@@ -305,11 +296,7 @@ Move_8OrMore_SSE_33OrMore:
ja .Lloop32f
.LPost32f: { +32 fixup not applied after 32× loop, ecx = remaining - 32 here. }
cmp $-16, %ecx
jle .LFirstAndLast16f
movups (%eax,%edx), %xmm0
movaps %xmm0, (%edx)
.LFirstAndLast16f:
movups %xmm3, (%edx, %ecx)
movups %xmm5, 16(%edx,%ecx) { Write first and last 16 bytes after everything else. }
movups %xmm4, (%ebx) { Important for <16-byte step between src and dest. }
pop %ebx
@@ -326,11 +313,7 @@ Move_8OrMore_SSE_33OrMore:
ja .Lalignedloop32f
.LalignedPost32f:
cmp $-16, %ecx
jle .LalignedFirstAndLast16f
movaps (%eax,%edx), %xmm0
movaps %xmm0, (%edx)
.LalignedFirstAndLast16f:
movups %xmm3, (%edx, %ecx)
movups %xmm5, 16(%edx,%ecx)
movups %xmm4, (%ebx)
pop %ebx
@@ -380,7 +363,7 @@ Move_8OrMore_SSE_33OrMore:
sfence
add $PrefetchDistance+64, %ecx
jmp .LRestAfterNTf
.byte {$ifdef FPC_PIC}102,{$endif}102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32b into a no-op. }
.byte {$ifdef FPC_PIC}102,{$endif}102,102,102,144 { Turns .balign 16 before .Lloop32b into a no-op. }
Move_8OrMore_SSE_CancelERMSBackwards:
{ Adapted from Move_8OrMore_SSE_ERMS.Lback where eax = src, edx = dest - src, xmm4 and xmm5 aren't read, and ebx isn't pushed if not FPC_PIC. }
@@ -394,6 +377,7 @@ Move_8OrMore_SSE_CancelERMSBackwards:
{ backwards move }
.Lback:
movups 16(%eax,%edx), %xmm3 { Second vector from the start. }
lea -1(%edx,%ecx), %ebx { points to the end of dest; remember to write last 16 bytes }
mov %ebx, %ecx { move dest to the previous 16-byte boundary... }
and $-16, %ecx
@@ -417,12 +401,8 @@ Move_8OrMore_SSE_CancelERMSBackwards:
ja .Lloop32b
.LPost32b:
cmp $-16, %ecx
jle .LFirstAndLast16b
movups -16(%eax,%edx), %xmm0
movaps %xmm0, -16(%edx)
.LFirstAndLast16b:
sub %ecx, %edx
movups %xmm3, -16(%edx)
movups %xmm4, -32(%edx)
movups %xmm5, -15(%ebx)
pop %ebx
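The backward paths (.Lback) get the mirrored treatment: the preload is the second vector from the start rather than from the end, the loop walks down, and the tail then stores that vector plus the first and last ones unconditionally. The same kind of hypothetical C sketch, repeating the helpers so it stands alone:

#include <stdint.h>
#include <string.h>

typedef struct { unsigned char b[16]; } Block;

static void load(Block *r, const unsigned char *p)  { memcpy(r->b, p, 16); }
static void store(unsigned char *p, const Block *r) { memcpy(p, r->b, 16); }

/* Backward path, count >= 33, dest > src with overlap. */
static void move_backward_sketch(unsigned char *dest, const unsigned char *src,
                                 size_t count)
{
    Block first, last, second;
    load(&first, src);                  /* like xmm4 */
    load(&last, src + count - 16);      /* like xmm5 */
    load(&second, src + 16);            /* like xmm3: second vector from the
                                           start, the mirror of the forward
                                           preload */

    /* The store stream walks down from the last 16-byte boundary inside
       dest + count, mirroring the "and $-16" in the asm. */
    size_t i = (((uintptr_t)dest + count) & ~(uintptr_t)15) - (uintptr_t)dest;
    while (i > 32) {                    /* leave the first two blocks alone */
        i -= 16;
        Block t;
        load(&t, src + i);
        store(dest + i, &t);
    }

    /* Old tail: if more than 16 bytes remained below i, one extra block was
       copied conditionally. New tail: the preloaded second block is stored
       unconditionally; overlapping already-written bytes is harmless. */
    store(dest + 16, &second);
    store(dest, &first);
    store(dest + count - 16, &last);    /* written last on purpose: with
                                           0 < dest - src < 16 an earlier
                                           store here would clobber source
                                           bytes the loop still had to read */
}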

Changed file 2 (x86_64 Move assembly):

@@ -134,9 +134,12 @@ asm
mov %r9, -8(%rdx,%r8)
.Lquit:
ret
.byte 102,144 { Turns .balign 16 before .Lloop32f into a no-op. }
.byte 102,102,102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32f into a no-op. }
.L33OrMore:
movdqu -32(%rcx,%r8), %xmm3 { Second vector from the end. Wasted read if the .Lback branch is taken (it uses the second vector from the start instead), }
{ but -32(%rcx,%r8) is about to stop being easily addressable, .Lback is rare, and a small .Lback is rarer still / matters even less. }
sub %rdx, %rcx { rcx = src - dest }
jz .Lquit { exit if src=dest }
@@ -168,11 +171,7 @@ asm
ja .Lloop32f
.LPost32f: { +32 fixup not applied after 32× loop, r8 = remaining - 32 here. }
cmp $-16, %r8
jle .LFirstAndLast16f
movdqu (%rcx,%rdx), %xmm0
movdqa %xmm0, (%rdx)
.LFirstAndLast16f:
movdqu %xmm3, (%rdx, %r8)
movdqu %xmm5, 16(%rdx,%r8) { Write first and last 16 bytes after everything else. }
movdqu %xmm4, (%r9) { Important for <16-byte step between src and dest. }
ret
@@ -216,10 +215,11 @@ asm
mfence
add $0x1000, %r8
jmpq .LRestAfterNTf { go handle remaining bytes }
.byte 102,102,144 { Turns .balign 16 before .Lloop32b into a no-op. }
.byte 102,102,102,102,102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32b into a no-op. }
{ backwards move }
.Lback:
movdqu 16(%rcx,%rdx), %xmm3 { Second vector from the start. }
lea (%rdx,%r8), %r9 { points to the end of dest; remember to write last 16 bytes }
lea -1(%r9), %r8 { move dest to the previous 16-byte boundary... }
and $-16, %r8
@@ -243,12 +243,8 @@ asm
ja .Lloop32b
.LPost32b:
cmp $-16, %r8
jle .LFirstAndLast16b
movdqu -16(%rcx,%rdx), %xmm0
movdqa %xmm0, -16(%rdx)
.LFirstAndLast16b:
sub %r8, %rdx
movdqu %xmm3, -16(%rdx)
movdqu %xmm4, -32(%rdx)
movdqu %xmm5, -16(%r9)
ret
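Correctness of the new tail rests on the bulk loop plus the unconditional stores together covering every byte for any count >= 33 and any destination misalignment. A small self-contained C check of the forward store pattern, sanity-checking the scheme sketched above (a hypothetical model, not the actual asm):

#include <stdio.h>
#include <string.h>

int main(void)
{
    for (size_t count = 33; count <= 256; count++)
        for (size_t mis = 0; mis < 16; mis++) {   /* mis = dest & 15 */
            unsigned char hit[512] = { 0 };
            size_t i = 16 - mis;                  /* aligned loop start */
            memset(hit, 1, 16);                   /* head block (stored last) */
            for (; i + 32 <= count; i += 16)
                memset(hit + i, 1, 16);           /* bulk loop */
            memset(hit + count - 32, 1, 16);      /* unconditional 2nd-last */
            memset(hit + count - 16, 1, 16);      /* unconditional last */
            for (size_t k = 0; k < count; k++)
                if (!hit[k]) {
                    printf("gap at %zu (count=%zu mis=%zu)\n", k, count, mis);
                    return 1;
                }
        }
    puts("forward store pattern covers every byte");
    return 0;
}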