Mirror of https://gitlab.com/freepascal.org/fpc/source.git (synced 2025-04-05 04:18:31 +02:00)
Shortcut Compare*(a, a) before entering the aligned loop.
This commit is contained in:
parent 8093b1ba0c
commit ce6db34224
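
The optimization is the same in every Compare* variant below: the pointer difference buf2 - buf1 is computed before the aligned vector loop is entered, and a zero difference now branches straight to the all-equal epilogue (.LNothing), since a buffer always equals itself. A minimal Pascal sketch of the semantics (illustrative only, hypothetical names, not the committed SSE2/AVX2 code):

{$mode objfpc}
{ Sketch of the shortcut with CompareByte-like semantics, byte-wise for
  clarity; the real routines compare 16/32 bytes per step with SSE2/AVX2. }
function CompareByteSketch(const buf1, buf2; len: SizeInt): SizeInt;
var
  p1, p2: PByte;
  i: SizeInt;
begin
  p1 := @buf1;
  p2 := @buf2;
  if p1 = p2 then          { the new shortcut: Compare*(a, a) is always 0 }
    exit(0);               { skip the aligned loop entirely }
  for i := 0 to len - 1 do
    if p1[i] <> p2[i] then
      exit(SizeInt(p1[i]) - SizeInt(p2[i]));
  result := 0;
end;

In the assembly the check is essentially free: the sub that rewrites one register as buf2 - buf1 (used for single-index addressing inside the loop, e.g. movdqu (%edx,%eax)) already sets ZF, so each routine gains only a single jz .LNothing.
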
@@ -1363,26 +1363,7 @@ asm
pop %ebx
ret

.LNothing:
pop %ebx
xor %eax, %eax
ret

.LAligned32xLoop_TwoVectorsDiffer:
add %eax, %edx { restore edx = buf2 }
pmovmskb %xmm0, %ecx { Is there a difference in the first vector? }
inc %cx
jz .LVec1Differs { No difference in the first vector, xmm0 is all ones, ebx = pmovmskb(pcmpeqb(buf1 + 16, buf2 + 16)) from the loop body. }
mov %ecx, %ebx
.LVec0Differs:
bsf %ebx, %ebx
movzbl (%eax,%ebx), %eax
movzbl (%edx,%ebx), %edx
sub %edx, %eax
pop %ebx
ret

.byte 144 { Turn .balign 16 before .LAligned32xLoop_Body into a no-op. }
.byte 102,102,102,102,102,144 { Turn .balign 16 before .LAligned32xLoop_Body into a no-op. }
CompareByte_CantOverReadBoth_AVX2:
cmp $16, %ecx
jb .LCantOverReadBoth
@@ -1410,8 +1391,9 @@ CompareByte_CantOverReadBoth_AVX2:
jbe .LLastTwoVectors

{ More than four vectors: aligned loop. }
lea -32(%eax,%ecx), %ecx { buffer end - last two vectors handled separately - first two vectors already analyzed (by the fact ecx was still len - 32) }
sub %eax, %edx { edx = buf2 - buf1 }
jz .LNothing { Exit if buf1 = buf2. }
lea -32(%eax,%ecx), %ecx { buffer end - last two vectors handled separately - first two vectors already analyzed (by the fact ecx was still len - 32) }
and $-16, %eax { Align buf1. First two vectors already analyzed are skipped by +32 on the first loop iteration. }
sub %eax, %ecx { ecx = count to be handled with loop }
.balign 16 { No-op. }
@@ -1444,10 +1426,25 @@ CompareByte_CantOverReadBoth_AVX2:
pmovmskb %xmm0, %ebx
inc %bx
jnz .LVecEm1Differs
.LNothing:
pop %ebx
xor %eax, %eax
ret

.LAligned32xLoop_TwoVectorsDiffer:
add %eax, %edx { restore edx = buf2 }
pmovmskb %xmm0, %ecx { Is there a difference in the first vector? }
inc %cx
jz .LVec1Differs { No difference in the first vector, xmm0 is all ones, ebx = pmovmskb(pcmpeqb(buf1 + 16, buf2 + 16)) from the loop body. }
mov %ecx, %ebx
.LVec0Differs:
bsf %ebx, %ebx
movzbl (%eax,%ebx), %eax
movzbl (%edx,%ebx), %edx
sub %edx, %eax
pop %ebx
ret

.LVec1Differs:
xor %ecx, %ecx
.LVecEm1Differs:
@@ -1563,6 +1560,7 @@ asm
{ bzhi %ecx, %ebx, %ecx }
.byte 0xc4,0xe2,0x70,0xf5,0xcb { bootstrap compiler doesn't know bzhi }
jnz .LVec0Differs
.LNothing:
vzeroupper
pop %ebx
xor %eax, %eax
@@ -1584,6 +1582,20 @@ asm
pop %ebx
ret

.LVec1Differs:
xor %ecx, %ecx
.LVecEm1Differs:
add $32, %ecx
.LVecEm2Differs:
vzeroupper
tzcnt %ebx, %ebx
add %ecx, %ebx
movzbl (%eax,%ebx), %eax
movzbl (%edx,%ebx), %edx
sub %edx, %eax
pop %ebx
ret

.LVecOrMore:
{ Compare first vectors. }
vmovdqu (%eax), %ymm0
@@ -1606,8 +1618,9 @@ asm
jbe .LLastTwoVectors

{ More than four vectors: aligned loop. }
lea -64(%eax,%ecx), %ecx { buffer end - last two vectors handled separately - first two vectors already analyzed (by the fact ecx was still len - 64) }
sub %eax, %edx { edx = buf2 - buf1 }
jz .LNothing { Exit if buf1 = buf2. }
lea -64(%eax,%ecx), %ecx { buffer end - last two vectors handled separately - first two vectors already analyzed (by the fact ecx was still len - 64) }
and $-32, %eax { Align buf1. First two vectors already analyzed are skipped by +64 on the first loop iteration. }
sub %eax, %ecx { ecx = count to be handled with loop }
.balign 16 { No-op. }
@@ -1641,20 +1654,6 @@ asm
vzeroupper
pop %ebx
xor %eax, %eax
ret

.LVec1Differs:
xor %ecx, %ecx
.LVecEm1Differs:
add $32, %ecx
.LVecEm2Differs:
vzeroupper
tzcnt %ebx, %ebx
add %ecx, %ebx
movzbl (%eax,%ebx), %eax
movzbl (%edx,%ebx), %edx
sub %edx, %eax
pop %ebx
end;

{$ifndef CPUX86_HAS_BMI2}
@@ -1795,6 +1794,7 @@ asm
pop %ebx
ret

.byte 102,102,102,102,102,102,102,102,102,102,102,144
.LVecOrMore:
movdqu (%edx,%eax), %xmm0 { Compare first vectors. }
movdqu (%eax), %xmm1
@@ -1807,6 +1807,8 @@ asm
sub $32, %ecx { first 16 bytes already analyzed + last 16 bytes analyzed separately }
jle .LLastVec

test %edx, %edx
jz .LNothing { Exit if buf1 = buf2. }
push %eax { save original buf1 to recover word position if byte mismatch found (aligned loop works in bytes to support misaligned buf1). }
add %eax, %ecx
and $-16, %eax { align buf1; +16 is performed by the loop. }
@@ -1931,6 +1933,8 @@ asm
sub $32, %ecx { first 16 bytes already analyzed + last 16 bytes analyzed separately }
jle .LLastVec

test %edx, %edx
jz .LNothing { Exit if buf1 = buf2. }
push %eax { save original buf1 to recover uint32 position if byte mismatch found (aligned loop works in bytes to support misaligned buf1). }
add %eax, %ecx
and $-16, %eax { align buf1; +16 is performed by the loop. }
@@ -1955,6 +1959,7 @@ asm
pmovmskb %xmm0, %ebx
inc %bx
jnz .LVec0Differs
.LNothing:
pop %ebx
xor %eax, %eax
ret

@@ -987,7 +987,7 @@ asm
movzbl (%rcx,%rax), %eax
sub %rdx, %rax
ret
.byte 0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 { Turn .balign 16 before .LAligned32xLoop_Body into a no-op. }
.byte 102,102,102,102,102,102,102,144 { Turn .balign 16 before .LAligned32xLoop_Body into a no-op. }

.LVecOrMore:
{ Compare first vectors. }
@@ -1013,8 +1013,9 @@ asm
jbe .LLastTwoVectors

{ More than four vectors: aligned loop. }
lea -32(%rcx,%r8), %r8 { buffer end - last two vectors handled separately - first two vectors already analyzed (by the fact ecx was still len - 32). }
sub %rcx, %rdx { rdx = buf2 - buf1 }
jz .LNothing { Exit if buf1 = buf2. }
lea -32(%rcx,%r8), %r8 { buffer end - last two vectors handled separately - first two vectors already analyzed (by the fact ecx was still len - 32). }
and $-16, %rcx { Align buf1. First two vectors already analyzed are skipped by +32 on the first loop iteration. }
sub %rcx, %r8 { r8 = count to be handled with loop }
.balign 16 { no-op }
@@ -1200,6 +1201,7 @@ asm
sub %rdx, %rax
ret

.byte 102,102,102,102,102,102,102,102,102,102,102,102,102,102,144 { Turn .balign 16 before .LAligned8xLoop_Body into a no-op. }
.LVecOrMore:
movdqu (%rdx,%rcx), %xmm0 { Compare first vectors. }
movdqu (%rcx), %xmm1
@@ -1212,12 +1214,14 @@ asm
sub $32, %r8 { first 16 bytes already analyzed + last 16 bytes analyzed separately }
jle .LLastVec

test %rdx, %rdx
jz .LNothing { Exit if buf1 = buf2. }
mov %rcx, %r9 { save original buf1 to recover word position if byte mismatch found (aligned loop works in bytes to support misaligned buf1). }
add %rcx, %r8
and $-16, %rcx { align buf1; +16 is performed by the loop. }
sub %rcx, %r8

.balign 16
.balign 16 { no-op }
.LAligned8xLoop_Body:
add $16, %rcx
movdqu (%rdx,%rcx), %xmm0
@@ -1278,6 +1282,8 @@ asm
sub $32, %r8 { first 16 bytes already analyzed + last 16 bytes analyzed separately }
jle .LLastVec

test %rdx, %rdx
jz .LNothing { Exit if buf1 = buf2. }
mov %rcx, %r9 { save original buf1 to recover word position if byte mismatch found (aligned loop works in bytes to support misaligned buf1). }
add %rcx, %r8
and $-16, %rcx { align buf1; +16 is performed by the loop. }
@@ -1301,6 +1307,7 @@ asm
pmovmskb %xmm0, %eax
inc %ax
jnz .LVec0Differs
.LNothing:
xor %eax, %eax
ret