Change Mov*DQ to Mov*PS; the two are always equivalent when, as here, nothing but memory transfers is performed on the data, and the *PS encodings are 1 byte shorter each.
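
For reference, the size claim comes from instruction encoding: the integer forms carry a mandatory 66 or F3 prefix that the single-precision forms do not, so each replacement saves exactly one byte regardless of addressing mode. A minimal illustration with the simplest operands used in this file (encoding bytes shown in the comments, not part of the patch):

    movdqu  (%rcx), %xmm4    { F3 0F 6F 21  = 4 bytes }
    movups  (%rcx), %xmm4    { 0F 10 21     = 3 bytes }
    movdqa  %xmm0, (%rdx)    { 66 0F 7F 02  = 4 bytes }
    movaps  %xmm0, (%rdx)    { 0F 29 02     = 3 bytes }
    movntdq %xmm0, (%rdx)    { 66 0F E7 02  = 4 bytes }
    movntps %xmm0, (%rdx)    { 0F 2B 02     = 3 bytes }

And since the data is only copied, never operated on, the integer/floating-point domain distinction between the two instruction families cannot change behaviour here.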

Rika Ichinose 2024-02-08 11:46:17 +03:00 committed by FPK
parent 12f18177ae
commit 7bf502ad40


@@ -102,12 +102,12 @@ asm
jle .L4to8
cmp $16, %r8
jle .L9to16
- movdqu (%rcx), %xmm4 { First and last 16 bytes, used both in .L33OrMore and 17–32 branch. }
- movdqu -16(%rcx,%r8), %xmm5
+ movups (%rcx), %xmm4 { First and last 16 bytes, used both in .L33OrMore and 17–32 branch. }
+ movups -16(%rcx,%r8), %xmm5
cmp $32, %r8
jg .L33OrMore
- movdqu %xmm4, (%rdx) { 17–32 bytes }
- movdqu %xmm5, -16(%rdx,%r8)
+ movups %xmm4, (%rdx) { 17–32 bytes }
+ movups %xmm5, -16(%rdx,%r8)
ret
.balign 16
@@ -137,10 +137,10 @@ asm
mov %r9, -8(%rdx,%r8)
.Lquit:
ret
- .byte 102,102,102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32f into a no-op. }
+ .byte 102,102,102,102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32f into a no-op. }
.L33OrMore:
- movdqu -32(%rcx,%r8), %xmm3 { Second vector from the end. Wasted read if .Lback branch is taken (it uses second vector from the start instead), }
+ movups -32(%rcx,%r8), %xmm3 { Second vector from the end. Wasted read if .Lback branch is taken (it uses second vector from the start instead), }
{ but -32(%rcx,%r8) is about to become not accessible so easily, .Lback is rare, and small .Lback is even rarer / matters even less. }
sub %rdx, %rcx { rcx = src - dest }
@@ -165,18 +165,18 @@ asm
.balign 16 { no-op }
.Lloop32f:
- movdqu (%rcx,%rdx), %xmm0
- movdqa %xmm0, (%rdx)
- movdqu 16(%rcx,%rdx), %xmm0
- movdqa %xmm0, 16(%rdx)
+ movups (%rcx,%rdx), %xmm0
+ movaps %xmm0, (%rdx)
+ movups 16(%rcx,%rdx), %xmm0
+ movaps %xmm0, 16(%rdx)
add $32, %rdx
sub $32, %r8
ja .Lloop32f
.LPost32f: { +32 fixup not applied after 32× loop, r8 = remaining - 32 here. }
- movdqu %xmm3, (%rdx, %r8)
- movdqu %xmm5, 16(%rdx,%r8) { Write first and last 16 bytes after everything else. }
- movdqu %xmm4, (%r9) { Important for <16-byte step between src and dest. }
+ movups %xmm3, (%rdx, %r8)
+ movups %xmm5, 16(%rdx,%r8) { Write first and last 16 bytes after everything else. }
+ movups %xmm4, (%r9) { Important for <16-byte step between src and dest. }
ret
.balign 16
@@ -188,14 +188,14 @@ asm
.balign 16 { no-op }
.Lntloop64f:
prefetchnta 0+PrefetchDistance(%rcx,%rdx,1)
- movdqu (%rcx,%rdx,1), %xmm0
- movntdq %xmm0, (%rdx)
- movdqu 16(%rcx,%rdx,1), %xmm0
- movntdq %xmm0, 16(%rdx)
- movdqu 32(%rcx,%rdx,1), %xmm0
- movntdq %xmm0, 32(%rdx)
- movdqu 48(%rcx,%rdx,1), %xmm0
- movntdq %xmm0, 48(%rdx)
+ movups (%rcx,%rdx,1), %xmm0
+ movntps %xmm0, (%rdx)
+ movups 16(%rcx,%rdx,1), %xmm0
+ movntps %xmm0, 16(%rdx)
+ movups 32(%rcx,%rdx,1), %xmm0
+ movntps %xmm0, 32(%rdx)
+ movups 48(%rcx,%rdx,1), %xmm0
+ movntps %xmm0, 48(%rdx)
add $64, %rdx
sub $64, %r8
jae .Lntloop64f
@@ -203,11 +203,11 @@ asm
sfence
add $PrefetchDistance+64, %r8
jmpq .LRestAfterNTf { go handle remaining bytes }
- .byte 102,102,102,102,102,102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32b into a no-op. }
+ .byte 102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32b into a no-op. }
{ backwards move }
.Lback:
- movdqu 16(%rcx,%rdx), %xmm3 { Second vector from the start. }
+ movups 16(%rcx,%rdx), %xmm3 { Second vector from the start. }
lea (%rdx,%r8), %r9 { points to the end of dest; remember to write last 16 bytes }
lea -1(%r9), %r8 { move dest to the previous 16-byte boundary... }
and $-16, %r8
@@ -223,18 +223,18 @@ asm
.balign 16 { no-op }
.Lloop32b:
sub $32, %rdx
- movdqu 16(%rcx,%rdx), %xmm0
- movdqa %xmm0, 16(%rdx)
- movdqu (%rcx,%rdx), %xmm0
- movdqa %xmm0, (%rdx)
+ movups 16(%rcx,%rdx), %xmm0
+ movaps %xmm0, 16(%rdx)
+ movups (%rcx,%rdx), %xmm0
+ movaps %xmm0, (%rdx)
sub $32, %r8
ja .Lloop32b
.LPost32b:
sub %r8, %rdx
- movdqu %xmm3, -16(%rdx)
- movdqu %xmm4, -32(%rdx)
- movdqu %xmm5, -16(%r9)
+ movups %xmm3, -16(%rdx)
+ movups %xmm4, -32(%rdx)
+ movups %xmm5, -16(%r9)
ret
.balign 16
@@ -247,14 +247,14 @@ asm
.Lntloop64b:
prefetchnta -PrefetchDistance(%rcx,%rdx,1)
sub $64, %rdx
- movdqu 48(%rcx,%rdx,1), %xmm0
- movntdq %xmm0, 48(%rdx)
- movdqu 32(%rcx,%rdx,1), %xmm0
- movntdq %xmm0, 32(%rdx)
- movdqu 16(%rcx,%rdx,1), %xmm0
- movntdq %xmm0, 16(%rdx)
- movdqu (%rcx,%rdx,1), %xmm0
- movntdq %xmm0, (%rdx)
+ movups 48(%rcx,%rdx,1), %xmm0
+ movntps %xmm0, 48(%rdx)
+ movups 32(%rcx,%rdx,1), %xmm0
+ movntps %xmm0, 32(%rdx)
+ movups 16(%rcx,%rdx,1), %xmm0
+ movntps %xmm0, 16(%rdx)
+ movups (%rcx,%rdx,1), %xmm0
+ movntps %xmm0, (%rdx)
sub $64, %r8
jae .Lntloop64b
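
The .byte filler lines change length for the same reason: every replaced instruction is one byte shorter, which changes how far the code in front of each aligned loop head sits from the next 16-byte boundary, so the hand-counted padding that lets the following .balign 16 emit nothing has to be re-sized (11 to 12 bytes before .Lloop32f, 14 to 7 bytes before .Lloop32b). The filler itself is a long NOP spelled out in bytes: 102 is 0x66, the operand-size prefix, and 144 is 0x90, nop; x86 tolerates redundant prefixes, so the whole run still decodes as a single instruction. A sketch of the new 12-byte filler and its effect (comments added for illustration, not from the patch):

    .byte 102,102,102,102,102,102,102,102,102,102,102,144 { eleven 0x66 prefixes + 0x90: one 12-byte nop }
    .balign 16    { emits nothing, because the next label is already 16-byte aligned }
    .Lloop32f: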