From 7bf502ad40afd7b63b163f9d898442daac4cad08 Mon Sep 17 00:00:00 2001
From: Rika Ichinose
Date: Thu, 8 Feb 2024 11:46:17 +0300
Subject: [PATCH] Change Mov*DQ to Mov*PS; the two forms are always equivalent
 here because nothing but memory transfers is performed, and each *PS encoding
 is 1 byte shorter.

---
 rtl/x86_64/x86_64.inc | 76 +++++++++++++++++++++----------------------
 1 file changed, 38 insertions(+), 38 deletions(-)

diff --git a/rtl/x86_64/x86_64.inc b/rtl/x86_64/x86_64.inc
index 9dbecb0027..5ce13eeb7b 100644
--- a/rtl/x86_64/x86_64.inc
+++ b/rtl/x86_64/x86_64.inc
@@ -102,12 +102,12 @@ asm
 jle .L4to8
 cmp $16, %r8
 jle .L9to16
- movdqu (%rcx), %xmm4 { First and last 16 bytes, used both in .L33OrMore and 17–32 branch. }
- movdqu -16(%rcx,%r8), %xmm5
+ movups (%rcx), %xmm4 { First and last 16 bytes, used both in .L33OrMore and 17–32 branch. }
+ movups -16(%rcx,%r8), %xmm5
 cmp $32, %r8
 jg .L33OrMore
- movdqu %xmm4, (%rdx) { 17–32 bytes }
- movdqu %xmm5, -16(%rdx,%r8)
+ movups %xmm4, (%rdx) { 17–32 bytes }
+ movups %xmm5, -16(%rdx,%r8)
 ret

 .balign 16
@@ -137,10 +137,10 @@ asm
 mov %r9, -8(%rdx,%r8)
 .Lquit:
 ret
- .byte 102,102,102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32f into a no-op. }
+ .byte 102,102,102,102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32f into a no-op. }

 .L33OrMore:
- movdqu -32(%rcx,%r8), %xmm3 { Second vector from the end. Wasted read if .Lback branch is taken (it uses second vector from the start instead), }
+ movups -32(%rcx,%r8), %xmm3 { Second vector from the end. Wasted read if .Lback branch is taken (it uses second vector from the start instead), }
 { but -32(%rcx,%r8) is about to become not accessible so easily, .Lback is rare, and small .Lback is even rarer / matters even less. }

 sub %rdx, %rcx { rcx = src - dest }
@@ -165,18 +165,18 @@ asm

 .balign 16 { no-op }
 .Lloop32f:
- movdqu (%rcx,%rdx), %xmm0
- movdqa %xmm0, (%rdx)
- movdqu 16(%rcx,%rdx), %xmm0
- movdqa %xmm0, 16(%rdx)
+ movups (%rcx,%rdx), %xmm0
+ movaps %xmm0, (%rdx)
+ movups 16(%rcx,%rdx), %xmm0
+ movaps %xmm0, 16(%rdx)
 add $32, %rdx
 sub $32, %r8
 ja .Lloop32f

 .LPost32f: { +32 fixup not applied after 32× loop, r8 = remaining - 32 here. }
- movdqu %xmm3, (%rdx, %r8)
- movdqu %xmm5, 16(%rdx,%r8) { Write first and last 16 bytes after everything else. }
- movdqu %xmm4, (%r9) { Important for <16-byte step between src and dest. }
+ movups %xmm3, (%rdx, %r8)
+ movups %xmm5, 16(%rdx,%r8) { Write first and last 16 bytes after everything else. }
+ movups %xmm4, (%r9) { Important for <16-byte step between src and dest. }
 ret

 .balign 16
@@ -188,14 +188,14 @@ asm
 .balign 16 { no-op }
 .Lntloop64f:
 prefetchnta 0+PrefetchDistance(%rcx,%rdx,1)
- movdqu (%rcx,%rdx,1), %xmm0
- movntdq %xmm0, (%rdx)
- movdqu 16(%rcx,%rdx,1), %xmm0
- movntdq %xmm0, 16(%rdx)
- movdqu 32(%rcx,%rdx,1), %xmm0
- movntdq %xmm0, 32(%rdx)
- movdqu 48(%rcx,%rdx,1), %xmm0
- movntdq %xmm0, 48(%rdx)
+ movups (%rcx,%rdx,1), %xmm0
+ movntps %xmm0, (%rdx)
+ movups 16(%rcx,%rdx,1), %xmm0
+ movntps %xmm0, 16(%rdx)
+ movups 32(%rcx,%rdx,1), %xmm0
+ movntps %xmm0, 32(%rdx)
+ movups 48(%rcx,%rdx,1), %xmm0
+ movntps %xmm0, 48(%rdx)
 add $64, %rdx
 sub $64, %r8
 jae .Lntloop64f
@@ -203,11 +203,11 @@ asm
 sfence
 add $PrefetchDistance+64, %r8
 jmpq .LRestAfterNTf { go handle remaining bytes }
- .byte 102,102,102,102,102,102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32b into a no-op. }
+ .byte 102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32b into a no-op. }

 { backwards move }
 .Lback:
- movdqu 16(%rcx,%rdx), %xmm3 { Second vector from the start. }
+ movups 16(%rcx,%rdx), %xmm3 { Second vector from the start. }
 lea (%rdx,%r8), %r9 { points to the end of dest; remember to write last 16 bytes }
 lea -1(%r9), %r8 { move dest to the previous 16-byte boundary... }
 and $-16, %r8
@@ -223,18 +223,18 @@ asm
 .balign 16 { no-op }
 .Lloop32b:
 sub $32, %rdx
- movdqu 16(%rcx,%rdx), %xmm0
- movdqa %xmm0, 16(%rdx)
- movdqu (%rcx,%rdx), %xmm0
- movdqa %xmm0, (%rdx)
+ movups 16(%rcx,%rdx), %xmm0
+ movaps %xmm0, 16(%rdx)
+ movups (%rcx,%rdx), %xmm0
+ movaps %xmm0, (%rdx)
 sub $32, %r8
 ja .Lloop32b

 .LPost32b:
 sub %r8, %rdx
- movdqu %xmm3, -16(%rdx)
- movdqu %xmm4, -32(%rdx)
- movdqu %xmm5, -16(%r9)
+ movups %xmm3, -16(%rdx)
+ movups %xmm4, -32(%rdx)
+ movups %xmm5, -16(%r9)
 ret

 .balign 16
@@ -247,14 +247,14 @@ asm
 .Lntloop64b:
 prefetchnta -PrefetchDistance(%rcx,%rdx,1)
 sub $64, %rdx
- movdqu 48(%rcx,%rdx,1), %xmm0
- movntdq %xmm0, 48(%rdx)
- movdqu 32(%rcx,%rdx,1), %xmm0
- movntdq %xmm0, 32(%rdx)
- movdqu 16(%rcx,%rdx,1), %xmm0
- movntdq %xmm0, 16(%rdx)
- movdqu (%rcx,%rdx,1), %xmm0
- movntdq %xmm0, (%rdx)
+ movups 48(%rcx,%rdx,1), %xmm0
+ movntps %xmm0, 48(%rdx)
+ movups 32(%rcx,%rdx,1), %xmm0
+ movntps %xmm0, 32(%rdx)
+ movups 16(%rcx,%rdx,1), %xmm0
+ movntps %xmm0, 16(%rdx)
+ movups (%rcx,%rdx,1), %xmm0
+ movntps %xmm0, (%rdx)
 sub $64, %r8
 jae .Lntloop64b
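
Why each converted instruction is one byte shorter: the integer-typed forms carry a mandatory prefix (F3 for MOVDQU, 66 for MOVDQA and MOVNTDQ) that the float-typed MOVUPS/MOVAPS/MOVNTPS forms lack, and since the copied values never feed a typed ALU operation - only loads and (streaming) stores - both families move the same 128 bits. A minimal sketch for checking the encodings, not part of the patch (hypothetical file name, GAS/AT&T syntax as in the RTL):

    # sizes.s - assemble with: as --64 sizes.s -o sizes.o && objdump -d sizes.o
    .text
    movdqu  (%rcx), %xmm4        # f3 0f 6f 21   (4 bytes)
    movups  (%rcx), %xmm4        # 0f 10 21      (3 bytes)
    movdqa  %xmm0, (%rdx)        # 66 0f 7f 02   (4 bytes)
    movaps  %xmm0, (%rdx)        # 0f 29 02      (3 bytes)
    movntdq %xmm0, (%rdx)        # 66 0f e7 02   (4 bytes)
    movntps %xmm0, (%rdx)        # 0f 2b 02      (3 bytes)

The type tag on these moves only matters when the value is consumed by an integer or floating-point operation (and, on some cores, for bypass latency); for a pure memory copy the bits written are identical.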
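
The retuned .byte lines are a knock-on effect of the size change: 102 is 0x66 (an operand-size prefix) and 144 is 0x90 (NOP), so each sequence assembles into a single long 66..66 90 NOP sized by hand so that the following .balign 16 already falls on a 16-byte boundary and emits nothing (hence the { no-op } comments). With the surrounding instructions now shorter, the pads are re-sized to keep .Lloop32f and .Lloop32b 16-byte aligned. A small illustration of the idiom, not taken from the RTL (hypothetical labels, GAS syntax):

    .text
    .balign 16                 # force a known 16-byte boundary for the demo
    mov     %rax, %rbx         # 3 bytes of code past the boundary (48 89 c3)
    .byte   0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x90
                               # one 13-byte NOP: 3 + 13 = 16 bytes, boundary reached
    .balign 16                 # already aligned, emits nothing
    loop_top:                  # reached through a single long NOP
    ret

Writing the pad by hand guarantees that the fall-through path into the loop executes exactly one padding instruction, regardless of which fill pattern the assembler would otherwise pick for .balign.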