From 7bf502ad40afd7b63b163f9d898442daac4cad08 Mon Sep 17 00:00:00 2001
From: Rika Ichinose
Date: Thu, 8 Feb 2024 11:46:17 +0300
Subject: [PATCH] Change Mov*DQ to Mov*PS; the two forms are always equivalent
 here because nothing but memory transfers is performed, and each *PS encoding
 is 1 byte shorter.

---
 rtl/x86_64/x86_64.inc | 76 +++++++++++++++++++++----------------------
 1 file changed, 38 insertions(+), 38 deletions(-)

diff --git a/rtl/x86_64/x86_64.inc b/rtl/x86_64/x86_64.inc
index 9dbecb0027..5ce13eeb7b 100644
--- a/rtl/x86_64/x86_64.inc
+++ b/rtl/x86_64/x86_64.inc
@@ -102,12 +102,12 @@ asm
 jle .L4to8
 cmp $16, %r8
 jle .L9to16
- movdqu (%rcx), %xmm4 { First and last 16 bytes, used both in .L33OrMore and 17–32 branch. }
- movdqu -16(%rcx,%r8), %xmm5
+ movups (%rcx), %xmm4 { First and last 16 bytes, used both in .L33OrMore and 17–32 branch. }
+ movups -16(%rcx,%r8), %xmm5
 cmp $32, %r8
 jg .L33OrMore
- movdqu %xmm4, (%rdx) { 17–32 bytes }
- movdqu %xmm5, -16(%rdx,%r8)
+ movups %xmm4, (%rdx) { 17–32 bytes }
+ movups %xmm5, -16(%rdx,%r8)
 ret

 .balign 16
@@ -137,10 +137,10 @@ asm
 mov %r9, -8(%rdx,%r8)
 .Lquit:
 ret
- .byte 102,102,102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32f into a no-op. }
+ .byte 102,102,102,102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32f into a no-op. }

 .L33OrMore:
- movdqu -32(%rcx,%r8), %xmm3 { Second vector from the end. Wasted read if .Lback branch is taken (it uses second vector from the start instead), }
+ movups -32(%rcx,%r8), %xmm3 { Second vector from the end. Wasted read if .Lback branch is taken (it uses second vector from the start instead), }
 { but -32(%rcx,%r8) is about to become not accessible so easily, .Lback is rare, and small .Lback is even rarer / matters even less. }

 sub %rdx, %rcx { rcx = src - dest }
@@ -165,18 +165,18 @@ asm

 .balign 16 { no-op }
 .Lloop32f:
- movdqu (%rcx,%rdx), %xmm0
- movdqa %xmm0, (%rdx)
- movdqu 16(%rcx,%rdx), %xmm0
- movdqa %xmm0, 16(%rdx)
+ movups (%rcx,%rdx), %xmm0
+ movaps %xmm0, (%rdx)
+ movups 16(%rcx,%rdx), %xmm0
+ movaps %xmm0, 16(%rdx)
 add $32, %rdx
 sub $32, %r8
 ja .Lloop32f

 .LPost32f: { +32 fixup not applied after 32× loop, r8 = remaining - 32 here. }
- movdqu %xmm3, (%rdx, %r8)
- movdqu %xmm5, 16(%rdx,%r8) { Write first and last 16 bytes after everything else. }
- movdqu %xmm4, (%r9) { Important for <16-byte step between src and dest. }
+ movups %xmm3, (%rdx, %r8)
+ movups %xmm5, 16(%rdx,%r8) { Write first and last 16 bytes after everything else. }
+ movups %xmm4, (%r9) { Important for <16-byte step between src and dest. }
 ret

 .balign 16
@@ -188,14 +188,14 @@ asm
 .balign 16 { no-op }
 .Lntloop64f:
 prefetchnta 0+PrefetchDistance(%rcx,%rdx,1)
- movdqu (%rcx,%rdx,1), %xmm0
- movntdq %xmm0, (%rdx)
- movdqu 16(%rcx,%rdx,1), %xmm0
- movntdq %xmm0, 16(%rdx)
- movdqu 32(%rcx,%rdx,1), %xmm0
- movntdq %xmm0, 32(%rdx)
- movdqu 48(%rcx,%rdx,1), %xmm0
- movntdq %xmm0, 48(%rdx)
+ movups (%rcx,%rdx,1), %xmm0
+ movntps %xmm0, (%rdx)
+ movups 16(%rcx,%rdx,1), %xmm0
+ movntps %xmm0, 16(%rdx)
+ movups 32(%rcx,%rdx,1), %xmm0
+ movntps %xmm0, 32(%rdx)
+ movups 48(%rcx,%rdx,1), %xmm0
+ movntps %xmm0, 48(%rdx)
 add $64, %rdx
 sub $64, %r8
 jae .Lntloop64f
@@ -203,11 +203,11 @@ asm
 sfence
 add $PrefetchDistance+64, %r8
 jmpq .LRestAfterNTf { go handle remaining bytes }
- .byte 102,102,102,102,102,102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32b into a no-op. }
+ .byte 102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32b into a no-op. }

 { backwards move }
 .Lback:
- movdqu 16(%rcx,%rdx), %xmm3 { Second vector from the start. }
+ movups 16(%rcx,%rdx), %xmm3 { Second vector from the start. }
 lea (%rdx,%r8), %r9 { points to the end of dest; remember to write last 16 bytes }
 lea -1(%r9), %r8 { move dest to the previous 16-byte boundary... }
 and $-16, %r8
@@ -223,18 +223,18 @@ asm
 .balign 16 { no-op }
 .Lloop32b:
 sub $32, %rdx
- movdqu 16(%rcx,%rdx), %xmm0
- movdqa %xmm0, 16(%rdx)
- movdqu (%rcx,%rdx), %xmm0
- movdqa %xmm0, (%rdx)
+ movups 16(%rcx,%rdx), %xmm0
+ movaps %xmm0, 16(%rdx)
+ movups (%rcx,%rdx), %xmm0
+ movaps %xmm0, (%rdx)
 sub $32, %r8
 ja .Lloop32b

 .LPost32b:
 sub %r8, %rdx
- movdqu %xmm3, -16(%rdx)
- movdqu %xmm4, -32(%rdx)
- movdqu %xmm5, -16(%r9)
+ movups %xmm3, -16(%rdx)
+ movups %xmm4, -32(%rdx)
+ movups %xmm5, -16(%r9)
 ret

 .balign 16
@@ -247,14 +247,14 @@ asm
 .Lntloop64b:
 prefetchnta -PrefetchDistance(%rcx,%rdx,1)
 sub $64, %rdx
- movdqu 48(%rcx,%rdx,1), %xmm0
- movntdq %xmm0, 48(%rdx)
- movdqu 32(%rcx,%rdx,1), %xmm0
- movntdq %xmm0, 32(%rdx)
- movdqu 16(%rcx,%rdx,1), %xmm0
- movntdq %xmm0, 16(%rdx)
- movdqu (%rcx,%rdx,1), %xmm0
- movntdq %xmm0, (%rdx)
+ movups 48(%rcx,%rdx,1), %xmm0
+ movntps %xmm0, 48(%rdx)
+ movups 32(%rcx,%rdx,1), %xmm0
+ movntps %xmm0, 32(%rdx)
+ movups 16(%rcx,%rdx,1), %xmm0
+ movntps %xmm0, 16(%rdx)
+ movups (%rcx,%rdx,1), %xmm0
+ movntps %xmm0, (%rdx)
 sub $64, %r8
 jae .Lntloop64b
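
Why each converted instruction is one byte shorter: the integer-typed forms carry a mandatory prefix (F3 for MOVDQU, 66 for MOVDQA and MOVNTDQ) that the float-typed MOVUPS/MOVAPS/MOVNTPS forms lack, and since the copied values never feed a typed ALU operation - only loads and (streaming) stores - both families move the same 128 bits. A minimal sketch for checking the encodings, not part of the patch (hypothetical file name, GAS/AT&T syntax as in the RTL):

    # sizes.s - assemble with: as --64 sizes.s -o sizes.o && objdump -d sizes.o
    .text
    movdqu  (%rcx), %xmm4        # f3 0f 6f 21   (4 bytes)
    movups  (%rcx), %xmm4        # 0f 10 21      (3 bytes)
    movdqa  %xmm0, (%rdx)        # 66 0f 7f 02   (4 bytes)
    movaps  %xmm0, (%rdx)        # 0f 29 02      (3 bytes)
    movntdq %xmm0, (%rdx)        # 66 0f e7 02   (4 bytes)
    movntps %xmm0, (%rdx)        # 0f 2b 02      (3 bytes)

The type tag on these moves only matters when the value is consumed by an integer or floating-point operation (and, on some cores, for bypass latency); for a pure memory copy the bits written are identical.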
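
The retuned .byte lines are a knock-on effect of the size change: 102 is 0x66 (an operand-size prefix) and 144 is 0x90 (NOP), so each sequence assembles into a single long 66..66 90 NOP sized by hand so that the following .balign 16 already falls on a 16-byte boundary and emits nothing (hence the { no-op } comments). With the surrounding instructions now shorter, the pads are re-sized to keep .Lloop32f and .Lloop32b 16-byte aligned. A small illustration of the idiom, not taken from the RTL (hypothetical labels, GAS syntax):

    .text
    .balign 16                 # force a known 16-byte boundary for the demo
    mov     %rax, %rbx         # 3 bytes of code past the boundary (48 89 c3)
    .byte   0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x66,0x90
                               # one 13-byte NOP: 3 + 13 = 16 bytes, boundary reached
    .balign 16                 # already aligned, emits nothing
    loop_top:                  # reached through a single long NOP
    ret

Writing the pad by hand guarantees that the fall-through path into the loop executes exactly one padding instruction, regardless of which fill pattern the assembler would otherwise pick for .balign.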