diff --git a/rtl/i386/i386.inc b/rtl/i386/i386.inc
index 286bd4d9fd..bd7e3bd37f 100644
--- a/rtl/i386/i386.inc
+++ b/rtl/i386/i386.inc
@@ -1363,26 +1363,7 @@ asm
     pop %ebx
     ret
 
-.LNothing:
-    pop %ebx
-    xor %eax, %eax
-    ret
-
-.LAligned32xLoop_TwoVectorsDiffer:
-    add %eax, %edx { restore edx = buf2 }
-    pmovmskb %xmm0, %ecx { Is there a difference in the first vector? }
-    inc %cx
-    jz .LVec1Differs { No difference in the first vector, xmm0 is all ones, ebx = pmovmskb(pcmpeqb(buf1 + 16, buf2 + 16)) from the loop body. }
-    mov %ecx, %ebx
-.LVec0Differs:
-    bsf %ebx, %ebx
-    movzbl (%eax,%ebx), %eax
-    movzbl (%edx,%ebx), %edx
-    sub %edx, %eax
-    pop %ebx
-    ret
-
-    .byte 144 { Turn .balign 16 before .LAligned32xLoop_Body into a no-op. }
+    .byte 102,102,102,102,102,144 { Turn .balign 16 before .LAligned32xLoop_Body into a no-op. }
 CompareByte_CantOverReadBoth_AVX2:
     cmp $16, %ecx
     jb .LCantOverReadBoth
@@ -1410,8 +1391,9 @@ CompareByte_CantOverReadBoth_AVX2:
     jbe .LLastTwoVectors
 
 { More than four vectors: aligned loop. }
-    lea -32(%eax,%ecx), %ecx { buffer end - last two vectors handled separately - first two vectors already analyzed (by the fact ecx was still len - 32) }
     sub %eax, %edx { edx = buf2 - buf1 }
+    jz .LNothing { Exit if buf1 = buf2. }
+    lea -32(%eax,%ecx), %ecx { buffer end - last two vectors handled separately - first two vectors already analyzed (by the fact ecx was still len - 32) }
     and $-16, %eax { Align buf1. First two vectors already analyzed are skipped by +32 on the first loop iteration. }
     sub %eax, %ecx { ecx = count to be handled with loop }
 .balign 16 { No-op. }
@@ -1444,10 +1426,25 @@ CompareByte_CantOverReadBoth_AVX2:
     pmovmskb %xmm0, %ebx
     inc %bx
     jnz .LVecEm1Differs
+.LNothing:
     pop %ebx
     xor %eax, %eax
     ret
 
+.LAligned32xLoop_TwoVectorsDiffer:
+    add %eax, %edx { restore edx = buf2 }
+    pmovmskb %xmm0, %ecx { Is there a difference in the first vector? }
+    inc %cx
+    jz .LVec1Differs { No difference in the first vector, xmm0 is all ones, ebx = pmovmskb(pcmpeqb(buf1 + 16, buf2 + 16)) from the loop body. }
+    mov %ecx, %ebx
+.LVec0Differs:
+    bsf %ebx, %ebx
+    movzbl (%eax,%ebx), %eax
+    movzbl (%edx,%ebx), %edx
+    sub %edx, %eax
+    pop %ebx
+    ret
+
 .LVec1Differs:
     xor %ecx, %ecx
 .LVecEm1Differs:
@@ -1563,6 +1560,7 @@ asm
     { bzhi %ecx, %ebx, %ecx } .byte 0xc4,0xe2,0x70,0xf5,0xcb { bootstrap compiler doesn't know bzhi }
     jnz .LVec0Differs
+.LNothing:
     vzeroupper
     pop %ebx
     xor %eax, %eax
     ret
@@ -1584,6 +1582,20 @@ asm
     pop %ebx
     ret
 
+.LVec1Differs:
+    xor %ecx, %ecx
+.LVecEm1Differs:
+    add $32, %ecx
+.LVecEm2Differs:
+    vzeroupper
+    tzcnt %ebx, %ebx
+    add %ecx, %ebx
+    movzbl (%eax,%ebx), %eax
+    movzbl (%edx,%ebx), %edx
+    sub %edx, %eax
+    pop %ebx
+    ret
+
 .LVecOrMore:
     { Compare first vectors. }
     vmovdqu (%eax), %ymm0
@@ -1606,8 +1618,9 @@ asm
     jbe .LLastTwoVectors
 
 { More than four vectors: aligned loop. }
-    lea -64(%eax,%ecx), %ecx { buffer end - last two vectors handled separately - first two vectors already analyzed (by the fact ecx was still len - 64) }
     sub %eax, %edx { edx = buf2 - buf1 }
+    jz .LNothing { Exit if buf1 = buf2. }
+    lea -64(%eax,%ecx), %ecx { buffer end - last two vectors handled separately - first two vectors already analyzed (by the fact ecx was still len - 64) }
     and $-32, %eax { Align buf1. First two vectors already analyzed are skipped by +64 on the first loop iteration. }
     sub %eax, %ecx { ecx = count to be handled with loop }
 .balign 16 { No-op. }
@@ -1641,20 +1654,6 @@ asm
     vzeroupper
     pop %ebx
     xor %eax, %eax
-    ret
-
-.LVec1Differs:
-    xor %ecx, %ecx
-.LVecEm1Differs:
-    add $32, %ecx
-.LVecEm2Differs:
-    vzeroupper
-    tzcnt %ebx, %ebx
-    add %ecx, %ebx
-    movzbl (%eax,%ebx), %eax
-    movzbl (%edx,%ebx), %edx
-    sub %edx, %eax
-    pop %ebx
 end;
 
 {$ifndef CPUX86_HAS_BMI2}
@@ -1795,6 +1794,7 @@ asm
     pop %ebx
     ret
 
+    .byte 102,102,102,102,102,102,102,102,102,102,102,144
 .LVecOrMore:
     movdqu (%edx,%eax), %xmm0 { Compare first vectors. }
     movdqu (%eax), %xmm1
@@ -1807,6 +1807,8 @@ asm
     sub $32, %ecx { first 16 bytes already analyzed + last 16 bytes analyzed separately }
     jle .LLastVec
 
+    test %edx, %edx
+    jz .LNothing { Exit if buf1 = buf2. }
     push %eax { save original buf1 to recover word position if byte mismatch found (aligned loop works in bytes to support misaligned buf1). }
     add %eax, %ecx
     and $-16, %eax { align buf1; +16 is performed by the loop. }
@@ -1931,6 +1933,8 @@ asm
     sub $32, %ecx { first 16 bytes already analyzed + last 16 bytes analyzed separately }
     jle .LLastVec
 
+    test %edx, %edx
+    jz .LNothing { Exit if buf1 = buf2. }
     push %eax { save original buf1 to recover uint32 position if byte mismatch found (aligned loop works in bytes to support misaligned buf1). }
     add %eax, %ecx
     and $-16, %eax { align buf1; +16 is performed by the loop. }
@@ -1955,6 +1959,7 @@ asm
     pmovmskb %xmm0, %ebx
     inc %bx
     jnz .LVec0Differs
+.LNothing:
     pop %ebx
     xor %eax, %eax
     ret
diff --git a/rtl/x86_64/x86_64.inc b/rtl/x86_64/x86_64.inc
index 110cd382a7..32a2df1146 100644
--- a/rtl/x86_64/x86_64.inc
+++ b/rtl/x86_64/x86_64.inc
@@ -987,7 +987,7 @@ asm
     movzbl (%rcx,%rax), %eax
     sub %rdx, %rax
     ret
-    .byte 0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 { Turn .balign 16 before .LAligned32xLoop_Body into a no-op. }
+    .byte 102,102,102,102,102,102,102,144 { Turn .balign 16 before .LAligned32xLoop_Body into a no-op. }
 .LVecOrMore:
     { Compare first vectors. }
     vmovdqu (%rcx), %ymm0
@@ -1013,8 +1013,9 @@ asm
     jbe .LLastTwoVectors
 
 { More than four vectors: aligned loop. }
-    lea -32(%rcx,%r8), %r8 { buffer end - last two vectors handled separately - first two vectors already analyzed (by the fact ecx was still len - 32). }
     sub %rcx, %rdx { rdx = buf2 - buf1 }
+    jz .LNothing { Exit if buf1 = buf2. }
+    lea -32(%rcx,%r8), %r8 { buffer end - last two vectors handled separately - first two vectors already analyzed (by the fact ecx was still len - 32). }
     and $-16, %rcx { Align buf1. First two vectors already analyzed are skipped by +32 on the first loop iteration. }
     sub %rcx, %r8 { r8 = count to be handled with loop }
 .balign 16 { no-op }
@@ -1200,6 +1201,7 @@ asm
     sub %rdx, %rax
     ret
 
+    .byte 102,102,102,102,102,102,102,102,102,102,102,102,102,102,144 { Turn .balign 16 before .LAligned8xLoop_Body into a no-op. }
 .LVecOrMore:
     movdqu (%rdx,%rcx), %xmm0 { Compare first vectors. }
     movdqu (%rcx), %xmm1
@@ -1212,12 +1214,14 @@ asm
     sub $32, %r8 { first 16 bytes already analyzed + last 16 bytes analyzed separately }
     jle .LLastVec
 
+    test %rdx, %rdx
+    jz .LNothing { Exit if buf1 = buf2. }
     mov %rcx, %r9 { save original buf1 to recover word position if byte mismatch found (aligned loop works in bytes to support misaligned buf1). }
     add %rcx, %r8
     and $-16, %rcx { align buf1; +16 is performed by the loop. }
     sub %rcx, %r8
 
-.balign 16
+.balign 16 { no-op }
 .LAligned8xLoop_Body:
     add $16, %rcx
     movdqu (%rdx,%rcx), %xmm0
@@ -1278,6 +1282,8 @@ asm
     sub $32, %r8 { first 16 bytes already analyzed + last 16 bytes analyzed separately }
     jle .LLastVec
 
+    test %rdx, %rdx
+    jz .LNothing { Exit if buf1 = buf2. }
     mov %rcx, %r9 { save original buf1 to recover word position if byte mismatch found (aligned loop works in bytes to support misaligned buf1). }
     add %rcx, %r8
     and $-16, %rcx { align buf1; +16 is performed by the loop. }
@@ -1301,6 +1307,7 @@ asm
     pmovmskb %xmm0, %eax
     inc %ax
     jnz .LVec0Differs
+.LNothing:
     xor %eax, %eax
     ret
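Note on the recurring "jz .LNothing { Exit if buf1 = buf2. }" additions: each patched routine already holds buf2 - buf1 in a register at that point (the aligned loops address buf2 through that difference), so a zero difference means both arguments refer to the same memory and the whole scan can be skipped, returning 0. A minimal scalar sketch of the same shortcut in plain Pascal; this is illustrative only, not the RTL's SIMD implementation, and the name CompareByteSketch is invented for this note:

{ Scalar illustration of the "identical pointers => 0 without scanning" early
  exit that the patch adds to the vectorised CompareByte/CompareWord/CompareDWord loops. }
function CompareByteSketch(const buf1, buf2; len: SizeInt): SizeInt;
var
  p1, p2: PByte;
begin
  p1 := PByte(@buf1);
  p2 := PByte(@buf2);
  if p1 = p2 then          { corresponds to "sub %eax, %edx; jz .LNothing" }
    exit(0);
  while len > 0 do
  begin
    if p1^ <> p2^ then
      exit(SizeInt(p1^) - SizeInt(p2^)); { first differing byte decides the sign, like movzbl/movzbl/sub }
    inc(p1);
    inc(p2);
    dec(len);
  end;
  result := 0;
end;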