From 0b5998ee8bf31b127aa942905f6b8aee4e6edd03 Mon Sep 17 00:00:00 2001
From: Rika Ichinose
Date: Thu, 8 Feb 2024 11:34:36 +0300
Subject: [PATCH] =?UTF-8?q?Write=20two=20last=20values=20after=202=C3=97?=
 =?UTF-8?q?=20loops=20unconditionally=20instead=20of=20an=20extra=20check.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 rtl/i386/fastmove.inc | 68 +++++++++++++++----------------------------
 rtl/x86_64/x86_64.inc | 20 +++++--------
 2 files changed, 32 insertions(+), 56 deletions(-)

diff --git a/rtl/i386/fastmove.inc b/rtl/i386/fastmove.inc
index 84b59e1844..40b2fa15ae 100644
--- a/rtl/i386/fastmove.inc
+++ b/rtl/i386/fastmove.inc
@@ -63,19 +63,19 @@ asm
     ret
 .Lcancel:
+    fstp %st(0) { Pop the “second int64 from the end” that .L33OrMore loads. }
     fucompp { Pop two elements loaded at the beginning. }
-{$ifdef FPC_PIC}
     pop %ebx
-{$endif}
     ret
-    .byte {$ifndef FPC_PIC}102,{$endif}102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop16f into a no-op. }
+    .byte 102,102,144 { Turns .balign 16 before .Lloop16f into a no-op. }
 .L33OrMore:
-    sub %edx, %eax { eax = src - dest }
-    jz .Lcancel { exit if src=dest }
+    fildq -16(%eax,%ecx) { Second int64 from the end. }
 {$ifndef FPC_PIC}
     push %ebx
 {$endif}
+    sub %edx, %eax { eax = src - dest }
+    jz .Lcancel { exit if src=dest }
     mov %eax, %ebx
     neg %ebx
     cmp %ebx, %ecx
@@ -101,19 +101,17 @@ asm
     ja .Lloop16f
 .LPost16f: { +16 fixup not applied after 16× loop, ecx = remaining - 16 here. }
-    cmp $-8, %ecx
-    jle .LFirstAndLast8f
-    fildq (%eax,%edx)
-    fistpq (%edx)
-.LFirstAndLast8f:
+    fistpq (%edx,%ecx)
     fistpq 8(%edx,%ecx) { Write first and last 8 bytes after everything else. }
     fistpq (%ebx) { Important for <8-byte step between src and dest. }
     pop %ebx
     ret
-    .byte 102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop16b into a no-op. }
+    .byte 102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop16b into a no-op. }

 { backwards move }
 .Lback:
+    fstp %st(0)
+    fildq 8(%eax,%edx) { Second int64 from the start. }
     lea -1(%edx,%ecx), %ebx { points to the end of dest; remember to write last 8 bytes }
     mov %ebx, %ecx { move dest to the previous 8-byte boundary... }
     and $-8, %ecx
@@ -134,12 +132,8 @@ asm
     ja .Lloop16b
 .LPost16b:
-    cmp $-8, %ecx
-    jle .LFirstAndLast8b
-    fildq -8(%eax,%edx)
-    fistpq -8(%edx)
-.LFirstAndLast8b:
     sub %ecx, %edx
+    fistpq -8(%edx)
     fistpq -7(%ebx)
     fistpq -16(%edx)
     pop %ebx
@@ -156,6 +150,7 @@ asm
 {$endif}
     movq (%eax), %mm4 { First and last 8 bytes. }
     movq -8(%eax,%ecx), %mm5
+    movq -16(%eax,%ecx), %mm3 { Second vector from the end. }
     sub %edx, %eax { eax = src - dest }
     jz .Lquit { exit if src=dest }
     mov %eax, %ebx
@@ -183,21 +178,18 @@ asm
     ja .Lloop16f
 .LPost16f: { +16 fixup not applied after 16× loop, ecx = remaining - 16 here. }
-    cmp $-8, %ecx
-    jle .LFirstAndLast8f
-    movq (%eax,%edx), %mm0
-    movq %mm0, (%edx)
-.LFirstAndLast8f:
+    movq %mm3, (%edx,%ecx)
     movq %mm5, 8(%edx,%ecx) { Write first and last 8 bytes after everything else. }
     movq %mm4, (%ebx) { Important for <8-byte step between src and dest. }
 .Lquit:
     emms
     pop %ebx
     ret
-    .byte 102,102,102,102,102,102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop16b into a no-op. }
+    .byte 144 { Turns .balign 16 before .Lloop16b into a no-op. }

 { backwards move }
 .Lback:
+    movq 8(%eax,%edx), %mm3 { Second vector from the start. }
     lea -1(%edx,%ecx), %ebx { points to the end of dest; remember to write last 8 bytes }
     mov %ebx, %ecx { move dest to the previous 8-byte boundary... }
     and $-8, %ecx
@@ -218,12 +210,8 @@ asm
     ja .Lloop16b
 .LPost16b:
-    cmp $-8, %ecx
-    jle .LFirstAndLast8b
-    movq -8(%eax,%edx), %mm0
-    movq %mm0, -8(%edx)
-.LFirstAndLast8b:
     sub %ecx, %edx
+    movq %mm3, -8(%edx)
     movq %mm4, -16(%edx)
     movq %mm5, -7(%ebx)
     emms
@@ -266,9 +254,12 @@ Move_8OrMore_SSE_9to15:
     pop %ebx
 {$endif}
     ret
-    .byte {$ifndef FPC_PIC}102,{$endif}102,102,102,102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32f into a no-op. }
+    .byte {$ifndef FPC_PIC}102,{$endif}102,102,102,102,102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32f into a no-op. }

 Move_8OrMore_SSE_33OrMore:
+    movups -32(%eax,%ecx), %xmm3 { Second vector from the end. Wasted read if the .Lback branch is taken (it uses the second vector from the start instead), }
+    { but -32(%eax,%ecx) is about to become harder to address, .Lback is rare, and a small .Lback is even rarer / matters even less. }
+
     sub %edx, %eax { eax = src - dest }
     jz .Lquit { exit if src=dest }
 {$ifndef FPC_PIC}
@@ -305,11 +296,7 @@ Move_8OrMore_SSE_33OrMore:
     ja .Lloop32f
 .LPost32f: { +32 fixup not applied after 32× loop, ecx = remaining - 32 here. }
-    cmp $-16, %ecx
-    jle .LFirstAndLast16f
-    movups (%eax,%edx), %xmm0
-    movaps %xmm0, (%edx)
-.LFirstAndLast16f:
+    movups %xmm3, (%edx,%ecx)
     movups %xmm5, 16(%edx,%ecx) { Write first and last 16 bytes after everything else. }
     movups %xmm4, (%ebx) { Important for <16-byte step between src and dest. }
     pop %ebx
@@ -326,11 +313,7 @@ Move_8OrMore_SSE_33OrMore:
     ja .Lalignedloop32f
 .LalignedPost32f:
-    cmp $-16, %ecx
-    jle .LalignedFirstAndLast16f
-    movaps (%eax,%edx), %xmm0
-    movaps %xmm0, (%edx)
-.LalignedFirstAndLast16f:
+    movups %xmm3, (%edx,%ecx)
     movups %xmm5, 16(%edx,%ecx)
     movups %xmm4, (%ebx)
     pop %ebx
@@ -380,7 +363,7 @@ Move_8OrMore_SSE_33OrMore:
     sfence
     add $PrefetchDistance+64, %ecx
     jmp .LRestAfterNTf
-    .byte {$ifdef FPC_PIC}102,{$endif}102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32b into a no-op. }
+    .byte {$ifdef FPC_PIC}102,{$endif}102,102,102,144 { Turns .balign 16 before .Lloop32b into a no-op. }

 Move_8OrMore_SSE_CancelERMSBackwards:
     { Adapt from Move_8OrMore_SSE_ERMS.Lback where eax = src, edx = dest - src, xmm4 and xmm5 aren’t read, ebx isn't pushed if not FPC_PIC. }
@@ -394,6 +377,7 @@ Move_8OrMore_SSE_CancelERMSBackwards:

 { backwards move }
 .Lback:
+    movups 16(%eax,%edx), %xmm3 { Second vector from the start. }
     lea -1(%edx,%ecx), %ebx { points to the end of dest; remember to write last 16 bytes }
     mov %ebx, %ecx { move dest to the previous 16-byte boundary... }
     and $-16, %ecx
@@ -417,12 +401,8 @@ Move_8OrMore_SSE_CancelERMSBackwards:
     ja .Lloop32b
 .LPost32b:
-    cmp $-16, %ecx
-    jle .LFirstAndLast16b
-    movups -16(%eax,%edx), %xmm0
-    movaps %xmm0, -16(%edx)
-.LFirstAndLast16b:
     sub %ecx, %edx
+    movups %xmm3, -16(%edx)
     movups %xmm4, -32(%edx)
     movups %xmm5, -15(%ebx)
     pop %ebx
diff --git a/rtl/x86_64/x86_64.inc b/rtl/x86_64/x86_64.inc
index 850aa127d7..666051717e 100644
--- a/rtl/x86_64/x86_64.inc
+++ b/rtl/x86_64/x86_64.inc
@@ -134,9 +134,12 @@ asm
     mov %r9, -8(%rdx,%r8)
 .Lquit:
     ret
-    .byte 102,144 { Turns .balign 16 before .Lloop32f into a no-op. }
+    .byte 102,102,102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32f into a no-op. }

 .L33OrMore:
+    movdqu -32(%rcx,%r8), %xmm3 { Second vector from the end. Wasted read if the .Lback branch is taken (it uses the second vector from the start instead), }
+    { but -32(%rcx,%r8) is about to become harder to address, .Lback is rare, and a small .Lback is even rarer / matters even less. }
+
     sub %rdx, %rcx { rcx = src - dest }
     jz .Lquit { exit if src=dest }
@@ -168,11 +171,7 @@ asm
     ja .Lloop32f
 .LPost32f: { +32 fixup not applied after 32× loop, r8 = remaining - 32 here. }
-    cmp $-16, %r8
-    jle .LFirstAndLast16f
-    movdqu (%rcx,%rdx), %xmm0
-    movdqa %xmm0, (%rdx)
-.LFirstAndLast16f:
+    movdqu %xmm3, (%rdx,%r8)
     movdqu %xmm5, 16(%rdx,%r8) { Write first and last 16 bytes after everything else. }
     movdqu %xmm4, (%r9) { Important for <16-byte step between src and dest. }
     ret
@@ -216,10 +215,11 @@ asm
     mfence
     add $0x1000, %r8
     jmpq .LRestAfterNTf { go handle remaining bytes }
-    .byte 102,102,144 { Turns .balign 16 before .Lloop32b into a no-op. }
+    .byte 102,102,102,102,102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32b into a no-op. }

 { backwards move }
 .Lback:
+    movdqu 16(%rcx,%rdx), %xmm3 { Second vector from the start. }
     lea (%rdx,%r8), %r9 { points to the end of dest; remember to write last 16 bytes }
     lea -1(%r9), %r8 { move dest to the previous 16-byte boundary... }
     and $-16, %r8
@@ -243,12 +243,8 @@ asm
     ja .Lloop32b
 .LPost32b:
-    cmp $-16, %r8
-    jle .LFirstAndLast16b
-    movdqu -16(%rcx,%rdx), %xmm0
-    movdqa %xmm0, -16(%rdx)
-.LFirstAndLast16b:
     sub %r8, %rdx
+    movdqu %xmm3, -16(%rdx)
     movdqu %xmm4, -32(%rdx)
     movdqu %xmm5, -16(%r9)
     ret
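
Note (not part of the patch): every variant gets the same treatment. The
"second vector from the end" (or "from the start", on the backwards paths) is
loaded before the main loop and stored unconditionally after the 2× loop,
replacing the old cmp/jle .LFirstAndLast* sequence that conditionally copied
one extra chunk. The first and last chunks (xmm4/xmm5, mm4/mm5, or the FPU
stack slots) were already written unconditionally; the patch extends that idea
to one more chunk. Below is a minimal scalar sketch of the idea in C, not RTL
code: the names are illustrative, it assumes n >= 16 and a forward move (dst
at or below src, or non-overlapping buffers), and 8-byte chunks stand in for
the vector registers.

    #include <stdint.h>
    #include <string.h>

    /* Hypothetical helper, not FPC RTL code: forward move of n >= 16 bytes in
       8-byte chunks. The first, last, and second-to-last chunks are loaded up
       front, before any store can clobber them when the buffers overlap, and
       stored unconditionally at the end, mirroring the patch. */
    static void move_forward_sketch(const uint8_t *src, uint8_t *dst, size_t n)
    {
        uint64_t first, second_last, last, chunk;
        memcpy(&first, src, 8);
        memcpy(&second_last, src + n - 16, 8);
        memcpy(&last, src + n - 8, 8);

        /* Main loop: the real code is a 2x-unrolled vector loop; this scalar
           analogue leaves the ragged tail to the unconditional stores below. */
        for (size_t i = 8; i + 16 <= n; i += 8)
        {
            memcpy(&chunk, src + i, 8); /* read before write: safe when dst <= src */
            memcpy(dst + i, &chunk, 8);
        }

        /* Unconditional tail stores: they may overlap bytes the loop already
           wrote, which is harmless, and they replace the old conditional
           "copy one more chunk if anything is left" branch. */
        memcpy(dst + n - 16, &second_last, 8);
        memcpy(dst + n - 8, &last, 8);
        memcpy(dst, &first, 8); /* first chunk written last, as in the patch */
    }

In the real code the saved chunks sit in registers, so the unconditional store
typically costs less than the compare-and-branch plus conditional copy it
replaces; the backwards paths do the same thing with the second chunk from the
start.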