Shortcut Compare*(a, a) before entering the aligned loop.

This commit is contained in:
Rika Ichinose 2024-04-04 16:44:52 +03:00 committed by FPK
parent 8093b1ba0c
commit ce6db34224
2 changed files with 51 additions and 39 deletions

View File

@ -1363,26 +1363,7 @@ asm
pop %ebx
ret
.LNothing:
pop %ebx
xor %eax, %eax
ret
.LAligned32xLoop_TwoVectorsDiffer:
add %eax, %edx { restore edx = buf2 }
pmovmskb %xmm0, %ecx { Is there a difference in the first vector? }
inc %cx
jz .LVec1Differs { No difference in the first vector, xmm0 is all ones, ebx = pmovmskb(pcmpeqb(buf1 + 16, buf2 + 16)) from the loop body. }
mov %ecx, %ebx
.LVec0Differs:
bsf %ebx, %ebx
movzbl (%eax,%ebx), %eax
movzbl (%edx,%ebx), %edx
sub %edx, %eax
pop %ebx
ret
.byte 144 { Turn .balign 16 before .LAligned32xLoop_Body into a no-op. }
.byte 102,102,102,102,102,144 { Turn .balign 16 before .LAligned32xLoop_Body into a no-op. }
CompareByte_CantOverReadBoth_AVX2:
cmp $16, %ecx
jb .LCantOverReadBoth
@ -1410,8 +1391,9 @@ CompareByte_CantOverReadBoth_AVX2:
jbe .LLastTwoVectors
{ More than four vectors: aligned loop. }
lea -32(%eax,%ecx), %ecx { buffer end - last two vectors handled separately - first two vectors already analyzed (by the fact ecx was still len - 32) }
sub %eax, %edx { edx = buf2 - buf1 }
jz .LNothing { Exit if buf1 = buf2. }
lea -32(%eax,%ecx), %ecx { buffer end - last two vectors handled separately - first two vectors already analyzed (by the fact ecx was still len - 32) }
and $-16, %eax { Align buf1. First two vectors already analyzed are skipped by +32 on the first loop iteration. }
sub %eax, %ecx { ecx = count to be handled with loop }
.balign 16 { No-op. }
@ -1444,10 +1426,25 @@ CompareByte_CantOverReadBoth_AVX2:
pmovmskb %xmm0, %ebx
inc %bx
jnz .LVecEm1Differs
.LNothing:
pop %ebx
xor %eax, %eax
ret
.LAligned32xLoop_TwoVectorsDiffer:
add %eax, %edx { restore edx = buf2 }
pmovmskb %xmm0, %ecx { Is there a difference in the first vector? }
inc %cx
jz .LVec1Differs { No difference in the first vector, xmm0 is all ones, ebx = pmovmskb(pcmpeqb(buf1 + 16, buf2 + 16)) from the loop body. }
mov %ecx, %ebx
.LVec0Differs:
bsf %ebx, %ebx
movzbl (%eax,%ebx), %eax
movzbl (%edx,%ebx), %edx
sub %edx, %eax
pop %ebx
ret
.LVec1Differs:
xor %ecx, %ecx
.LVecEm1Differs:
@ -1563,6 +1560,7 @@ asm
{ bzhi %ecx, %ebx, %ecx }
.byte 0xc4,0xe2,0x70,0xf5,0xcb { bootstrap compiler doesn't know bzhi }
jnz .LVec0Differs
.LNothing:
vzeroupper
pop %ebx
xor %eax, %eax
@ -1584,6 +1582,20 @@ asm
pop %ebx
ret
.LVec1Differs:
xor %ecx, %ecx
.LVecEm1Differs:
add $32, %ecx
.LVecEm2Differs:
vzeroupper
tzcnt %ebx, %ebx
add %ecx, %ebx
movzbl (%eax,%ebx), %eax
movzbl (%edx,%ebx), %edx
sub %edx, %eax
pop %ebx
ret
.LVecOrMore:
{ Compare first vectors. }
vmovdqu (%eax), %ymm0
@ -1606,8 +1618,9 @@ asm
jbe .LLastTwoVectors
{ More than four vectors: aligned loop. }
lea -64(%eax,%ecx), %ecx { buffer end - last two vectors handled separately - first two vectors already analyzed (by the fact ecx was still len - 64) }
sub %eax, %edx { edx = buf2 - buf1 }
jz .LNothing { Exit if buf1 = buf2. }
lea -64(%eax,%ecx), %ecx { buffer end - last two vectors handled separately - first two vectors already analyzed (by the fact ecx was still len - 64) }
and $-32, %eax { Align buf1. First two vectors already analyzed are skipped by +64 on the first loop iteration. }
sub %eax, %ecx { ecx = count to be handled with loop }
.balign 16 { No-op. }
@ -1641,20 +1654,6 @@ asm
vzeroupper
pop %ebx
xor %eax, %eax
ret
.LVec1Differs:
xor %ecx, %ecx
.LVecEm1Differs:
add $32, %ecx
.LVecEm2Differs:
vzeroupper
tzcnt %ebx, %ebx
add %ecx, %ebx
movzbl (%eax,%ebx), %eax
movzbl (%edx,%ebx), %edx
sub %edx, %eax
pop %ebx
end;
{$ifndef CPUX86_HAS_BMI2}
@ -1795,6 +1794,7 @@ asm
pop %ebx
ret
.byte 102,102,102,102,102,102,102,102,102,102,102,144
.LVecOrMore:
movdqu (%edx,%eax), %xmm0 { Compare first vectors. }
movdqu (%eax), %xmm1
@ -1807,6 +1807,8 @@ asm
sub $32, %ecx { first 16 bytes already analyzed + last 16 bytes analyzed separately }
jle .LLastVec
test %edx, %edx
jz .LNothing { Exit if buf1 = buf2. }
push %eax { save original buf1 to recover word position if byte mismatch found (aligned loop works in bytes to support misaligned buf1). }
add %eax, %ecx
and $-16, %eax { align buf1; +16 is performed by the loop. }
@ -1931,6 +1933,8 @@ asm
sub $32, %ecx { first 16 bytes already analyzed + last 16 bytes analyzed separately }
jle .LLastVec
test %edx, %edx
jz .LNothing { Exit if buf1 = buf2. }
push %eax { save original buf1 to recover uint32 position if byte mismatch found (aligned loop works in bytes to support misaligned buf1). }
add %eax, %ecx
and $-16, %eax { align buf1; +16 is performed by the loop. }
@ -1955,6 +1959,7 @@ asm
pmovmskb %xmm0, %ebx
inc %bx
jnz .LVec0Differs
.LNothing:
pop %ebx
xor %eax, %eax
ret

View File

@ -987,7 +987,7 @@ asm
movzbl (%rcx,%rax), %eax
sub %rdx, %rax
ret
.byte 0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 { Turn .balign 16 before .LAligned32xLoop_Body into a no-op. }
.byte 102,102,102,102,102,102,102,144 { Turn .balign 16 before .LAligned32xLoop_Body into a no-op. }
.LVecOrMore:
{ Compare first vectors. }
@ -1013,8 +1013,9 @@ asm
jbe .LLastTwoVectors
{ More than four vectors: aligned loop. }
lea -32(%rcx,%r8), %r8 { buffer end - last two vectors handled separately - first two vectors already analyzed (by the fact r8 was still len - 32). }
sub %rcx, %rdx { rdx = buf2 - buf1 }
jz .LNothing { Exit if buf1 = buf2. }
lea -32(%rcx,%r8), %r8 { buffer end - last two vectors handled separately - first two vectors already analyzed (by the fact r8 was still len - 32). }
and $-16, %rcx { Align buf1. First two vectors already analyzed are skipped by +32 on the first loop iteration. }
sub %rcx, %r8 { r8 = count to be handled with loop }
.balign 16 { no-op }
@ -1200,6 +1201,7 @@ asm
sub %rdx, %rax
ret
.byte 102,102,102,102,102,102,102,102,102,102,102,102,102,102,144 { Turn .balign 16 before .LAligned8xLoop_Body into a no-op. }
.LVecOrMore:
movdqu (%rdx,%rcx), %xmm0 { Compare first vectors. }
movdqu (%rcx), %xmm1
@ -1212,12 +1214,14 @@ asm
sub $32, %r8 { first 16 bytes already analyzed + last 16 bytes analyzed separately }
jle .LLastVec
test %rdx, %rdx
jz .LNothing { Exit if buf1 = buf2. }
mov %rcx, %r9 { save original buf1 to recover word position if byte mismatch found (aligned loop works in bytes to support misaligned buf1). }
add %rcx, %r8
and $-16, %rcx { align buf1; +16 is performed by the loop. }
sub %rcx, %r8
.balign 16
.balign 16 { no-op }
.LAligned8xLoop_Body:
add $16, %rcx
movdqu (%rdx,%rcx), %xmm0
@ -1278,6 +1282,8 @@ asm
sub $32, %r8 { first 16 bytes already analyzed + last 16 bytes analyzed separately }
jle .LLastVec
test %rdx, %rdx
jz .LNothing { Exit if buf1 = buf2. }
mov %rcx, %r9 { save original buf1 to recover word position if byte mismatch found (aligned loop works in bytes to support misaligned buf1). }
add %rcx, %r8
and $-16, %rcx { align buf1; +16 is performed by the loop. }
@ -1301,6 +1307,7 @@ asm
pmovmskb %xmm0, %eax
inc %ax
jnz .LVec0Differs
.LNothing:
xor %eax, %eax
ret