From 0b5998ee8bf31b127aa942905f6b8aee4e6edd03 Mon Sep 17 00:00:00 2001
From: Rika Ichinose
Date: Thu, 8 Feb 2024 11:34:36 +0300
Subject: [PATCH] =?UTF-8?q?Write=20two=20last=20values=20after=202=C3=97?=
 =?UTF-8?q?=20loops=20unconditionally=20instead=20of=20an=20extra=20check.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 rtl/i386/fastmove.inc | 68 +++++++++++++++----------------------------
 rtl/x86_64/x86_64.inc | 20 +++++--------
 2 files changed, 32 insertions(+), 56 deletions(-)

diff --git a/rtl/i386/fastmove.inc b/rtl/i386/fastmove.inc
index 84b59e1844..40b2fa15ae 100644
--- a/rtl/i386/fastmove.inc
+++ b/rtl/i386/fastmove.inc
@@ -63,19 +63,19 @@ asm
     ret
 .Lcancel:
+    fstp %st(0) { Pop the “second int64 from the end” that .L33OrMore loads. }
     fucompp { Pop two elements loaded at the beginning. }
-{$ifdef FPC_PIC}
     pop %ebx
-{$endif}
     ret
-    .byte {$ifndef FPC_PIC}102,{$endif}102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop16f into a no-op. }
+    .byte 102,102,144 { Turns .balign 16 before .Lloop16f into a no-op. }
 .L33OrMore:
-    sub %edx, %eax { eax = src - dest }
-    jz .Lcancel { exit if src=dest }
+    fildq -16(%eax,%ecx) { Second int64 from the end. }
 {$ifndef FPC_PIC}
     push %ebx
 {$endif}
+    sub %edx, %eax { eax = src - dest }
+    jz .Lcancel { exit if src=dest }
     mov %eax, %ebx
     neg %ebx
     cmp %ebx, %ecx
@@ -101,19 +101,17 @@ asm
     ja .Lloop16f
 .LPost16f: { +16 fixup not applied after 16× loop, ecx = remaining - 16 here. }
-    cmp $-8, %ecx
-    jle .LFirstAndLast8f
-    fildq (%eax,%edx)
-    fistpq (%edx)
-.LFirstAndLast8f:
+    fistpq (%edx,%ecx)
     fistpq 8(%edx,%ecx) { Write first and last 8 bytes after everything else. }
     fistpq (%ebx) { Important for <8-byte step between src and dest. }
     pop %ebx
     ret
-    .byte 102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop16b into a no-op. }
+    .byte 102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop16b into a no-op. }

 { backwards move }
 .Lback:
+    fstp %st(0)
+    fildq 8(%eax,%edx) { Second int64 from the start. }
     lea -1(%edx,%ecx), %ebx { points to the end of dest; remember to write last 8 bytes }
     mov %ebx, %ecx { move dest to the previous 8-byte boundary... }
     and $-8, %ecx
@@ -134,12 +132,8 @@ asm
     ja .Lloop16b
 .LPost16b:
-    cmp $-8, %ecx
-    jle .LFirstAndLast8b
-    fildq -8(%eax,%edx)
-    fistpq -8(%edx)
-.LFirstAndLast8b:
     sub %ecx, %edx
+    fistpq -8(%edx)
     fistpq -7(%ebx)
     fistpq -16(%edx)
     pop %ebx
@@ -156,6 +150,7 @@ asm
 {$endif}
     movq (%eax), %mm4 { First and last 8 bytes. }
     movq -8(%eax,%ecx), %mm5
+    movq -16(%eax,%ecx), %mm3 { Second vector from the end. }
     sub %edx, %eax { eax = src - dest }
     jz .Lquit { exit if src=dest }
     mov %eax, %ebx
@@ -183,21 +178,18 @@ asm
     ja .Lloop16f
 .LPost16f: { +16 fixup not applied after 16× loop, ecx = remaining - 16 here. }
-    cmp $-8, %ecx
-    jle .LFirstAndLast8f
-    movq (%eax,%edx), %mm0
-    movq %mm0, (%edx)
-.LFirstAndLast8f:
+    movq %mm3, (%edx,%ecx)
     movq %mm5, 8(%edx,%ecx) { Write first and last 8 bytes after everything else. }
     movq %mm4, (%ebx) { Important for <8-byte step between src and dest. }
 .Lquit:
     emms
     pop %ebx
     ret
-    .byte 102,102,102,102,102,102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop16b into a no-op. }
+    .byte 144 { Turns .balign 16 before .Lloop16b into a no-op. }

 { backwards move }
 .Lback:
+    movq 8(%eax,%edx), %mm3 { Second vector from the start. }
     lea -1(%edx,%ecx), %ebx { points to the end of dest; remember to write last 8 bytes }
     mov %ebx, %ecx { move dest to the previous 8-byte boundary... }
     and $-8, %ecx
@@ -218,12 +210,8 @@ asm
     ja .Lloop16b
 .LPost16b:
-    cmp $-8, %ecx
-    jle .LFirstAndLast8b
-    movq -8(%eax,%edx), %mm0
-    movq %mm0, -8(%edx)
-.LFirstAndLast8b:
     sub %ecx, %edx
+    movq %mm3, -8(%edx)
     movq %mm4, -16(%edx)
     movq %mm5, -7(%ebx)
     emms
@@ -266,9 +254,12 @@ Move_8OrMore_SSE_9to15:
     pop %ebx
 {$endif}
     ret
-    .byte {$ifndef FPC_PIC}102,{$endif}102,102,102,102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32f into a no-op. }
+    .byte {$ifndef FPC_PIC}102,{$endif}102,102,102,102,102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32f into a no-op. }

 Move_8OrMore_SSE_33OrMore:
+    movups -32(%eax,%ecx), %xmm3 { Second vector from the end. Wasted read if the .Lback branch is taken (it uses the second vector from the start instead), }
+    { but -32(%eax,%ecx) is about to become harder to address, .Lback is rare, and a small .Lback is even rarer / matters even less. }
+
     sub %edx, %eax { eax = src - dest }
     jz .Lquit { exit if src=dest }
 {$ifndef FPC_PIC}
@@ -305,11 +296,7 @@ Move_8OrMore_SSE_33OrMore:
     ja .Lloop32f
 .LPost32f: { +32 fixup not applied after 32× loop, ecx = remaining - 32 here. }
-    cmp $-16, %ecx
-    jle .LFirstAndLast16f
-    movups (%eax,%edx), %xmm0
-    movaps %xmm0, (%edx)
-.LFirstAndLast16f:
+    movups %xmm3, (%edx,%ecx)
     movups %xmm5, 16(%edx,%ecx) { Write first and last 16 bytes after everything else. }
     movups %xmm4, (%ebx) { Important for <16-byte step between src and dest. }
     pop %ebx
@@ -326,11 +313,7 @@ Move_8OrMore_SSE_33OrMore:
     ja .Lalignedloop32f
 .LalignedPost32f:
-    cmp $-16, %ecx
-    jle .LalignedFirstAndLast16f
-    movaps (%eax,%edx), %xmm0
-    movaps %xmm0, (%edx)
-.LalignedFirstAndLast16f:
+    movups %xmm3, (%edx,%ecx)
     movups %xmm5, 16(%edx,%ecx)
     movups %xmm4, (%ebx)
     pop %ebx
@@ -380,7 +363,7 @@ Move_8OrMore_SSE_33OrMore:
     sfence
     add $PrefetchDistance+64, %ecx
     jmp .LRestAfterNTf
-    .byte {$ifdef FPC_PIC}102,{$endif}102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32b into a no-op. }
+    .byte {$ifdef FPC_PIC}102,{$endif}102,102,102,144 { Turns .balign 16 before .Lloop32b into a no-op. }

 Move_8OrMore_SSE_CancelERMSBackwards:
     { Adapt from Move_8OrMore_SSE_ERMS.Lback where eax = src, edx = dest - src, xmm4 and xmm5 aren’t read, ebx isn't pushed if not FPC_PIC. }
@@ -394,6 +377,7 @@ Move_8OrMore_SSE_CancelERMSBackwards:

 { backwards move }
 .Lback:
+    movups 16(%eax,%edx), %xmm3 { Second vector from the start. }
     lea -1(%edx,%ecx), %ebx { points to the end of dest; remember to write last 16 bytes }
     mov %ebx, %ecx { move dest to the previous 16-byte boundary... }
     and $-16, %ecx
@@ -417,12 +401,8 @@ Move_8OrMore_SSE_CancelERMSBackwards:
     ja .Lloop32b
 .LPost32b:
-    cmp $-16, %ecx
-    jle .LFirstAndLast16b
-    movups -16(%eax,%edx), %xmm0
-    movaps %xmm0, -16(%edx)
-.LFirstAndLast16b:
     sub %ecx, %edx
+    movups %xmm3, -16(%edx)
     movups %xmm4, -32(%edx)
     movups %xmm5, -15(%ebx)
     pop %ebx
diff --git a/rtl/x86_64/x86_64.inc b/rtl/x86_64/x86_64.inc
index 850aa127d7..666051717e 100644
--- a/rtl/x86_64/x86_64.inc
+++ b/rtl/x86_64/x86_64.inc
@@ -134,9 +134,12 @@ asm
     mov %r9, -8(%rdx,%r8)
 .Lquit:
     ret
-    .byte 102,144 { Turns .balign 16 before .Lloop32f into a no-op. }
+    .byte 102,102,102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32f into a no-op. }

 .L33OrMore:
+    movdqu -32(%rcx,%r8), %xmm3 { Second vector from the end. Wasted read if the .Lback branch is taken (it uses the second vector from the start instead), }
+    { but -32(%rcx,%r8) is about to become harder to address, .Lback is rare, and a small .Lback is even rarer / matters even less. }
+
     sub %rdx, %rcx { rcx = src - dest }
     jz .Lquit { exit if src=dest }
@@ -168,11 +171,7 @@ asm
     ja .Lloop32f
 .LPost32f: { +32 fixup not applied after 32× loop, r8 = remaining - 32 here. }
-    cmp $-16, %r8
-    jle .LFirstAndLast16f
-    movdqu (%rcx,%rdx), %xmm0
-    movdqa %xmm0, (%rdx)
-.LFirstAndLast16f:
+    movdqu %xmm3, (%rdx,%r8)
     movdqu %xmm5, 16(%rdx,%r8) { Write first and last 16 bytes after everything else. }
     movdqu %xmm4, (%r9) { Important for <16-byte step between src and dest. }
     ret
@@ -216,10 +215,11 @@ asm
     mfence
     add $0x1000, %r8
     jmpq .LRestAfterNTf { go handle remaining bytes }
-    .byte 102,102,144 { Turns .balign 16 before .Lloop32b into a no-op. }
+    .byte 102,102,102,102,102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32b into a no-op. }

 { backwards move }
 .Lback:
+    movdqu 16(%rcx,%rdx), %xmm3 { Second vector from the start. }
     lea (%rdx,%r8), %r9 { points to the end of dest; remember to write last 16 bytes }
     lea -1(%r9), %r8 { move dest to the previous 16-byte boundary... }
     and $-16, %r8
@@ -243,12 +243,8 @@ asm
     ja .Lloop32b
 .LPost32b:
-    cmp $-16, %r8
-    jle .LFirstAndLast16b
-    movdqu -16(%rcx,%rdx), %xmm0
-    movdqa %xmm0, -16(%rdx)
-.LFirstAndLast16b:
     sub %r8, %rdx
+    movdqu %xmm3, -16(%rdx)
     movdqu %xmm4, -32(%rdx)
     movdqu %xmm5, -16(%r9)
     ret
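
Note (not part of the patch): every variant gets the same treatment. The
"second vector from the end" (or "from the start", on the backwards paths) is
loaded before the main loop and stored unconditionally after the 2× loop,
replacing the old cmp/jle .LFirstAndLast* sequence that conditionally copied
one extra chunk. The first and last chunks (xmm4/xmm5, mm4/mm5, or the FPU
stack slots) were already written unconditionally; the patch extends that idea
to one more chunk. Below is a minimal scalar sketch of the idea in C, not RTL
code: the names are illustrative, it assumes n >= 16 and a forward move (dst
at or below src, or non-overlapping buffers), and 8-byte chunks stand in for
the vector registers.

    #include <stdint.h>
    #include <string.h>

    /* Hypothetical helper, not FPC RTL code: forward move of n >= 16 bytes in
       8-byte chunks. The first, last, and second-to-last chunks are loaded up
       front, before any store can clobber them when the buffers overlap, and
       stored unconditionally at the end, mirroring the patch. */
    static void move_forward_sketch(const uint8_t *src, uint8_t *dst, size_t n)
    {
        uint64_t first, second_last, last, chunk;
        memcpy(&first, src, 8);
        memcpy(&second_last, src + n - 16, 8);
        memcpy(&last, src + n - 8, 8);

        /* Main loop: the real code is a 2x-unrolled vector loop; this scalar
           analogue leaves the ragged tail to the unconditional stores below. */
        for (size_t i = 8; i + 16 <= n; i += 8)
        {
            memcpy(&chunk, src + i, 8); /* read before write: safe when dst <= src */
            memcpy(dst + i, &chunk, 8);
        }

        /* Unconditional tail stores: they may overlap bytes the loop already
           wrote, which is harmless, and they replace the old conditional
           "copy one more chunk if anything is left" branch. */
        memcpy(dst + n - 16, &second_last, 8);
        memcpy(dst + n - 8, &last, 8);
        memcpy(dst, &first, 8); /* first chunk written last, as in the patch */
    }

In the real code the saved chunks sit in registers, so the unconditional store
typically costs less than the compare-and-branch plus conditional copy it
replaces; the backwards paths do the same thing with the second chunk from the
start.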