Faster path for IndexBytes with a match at the beginning.

This commit is contained in:
Rika Ichinose 2024-08-05 12:12:15 +03:00 committed by FPK
parent edf7b26f52
commit ca0e04a346
2 changed files with 99 additions and 37 deletions

View File

@@ -786,13 +786,58 @@ function {$ifdef CPUX86_HAS_SSE2} IndexByte {$else} IndexByte_SSE2 {$endif} (con
asm
test %edx, %edx
jz .Lnotfound { exit if len=0 }
push %ebx
movd %ecx, %xmm1
lea 16(%eax), %ecx { eax = original ptr, ecx = buf + 16 for aligning & shifts. }
mov %eax, %ecx
punpcklbw %xmm1, %xmm1
and $-0x10, %ecx { first aligned address after buf }
punpcklbw %xmm1, %xmm1
and $4095, %ecx
pshufd $0, %xmm1, %xmm1
cmp $4080, %ecx
ja .LCrossPage
movdqu (%eax), %xmm0 { Analyze first 16 bytes, unaligned. }
pcmpeqb %xmm1, %xmm0
pmovmskb %xmm0, %ecx
test %ecx, %ecx
jz .LContinueAligned
bsf %ecx, %eax
cmp %edx, %eax
jae .Lnotfound
ret
.byte 144 { Make .balign 16 before .Lloop a no-op. }
.LContinueAligned:
cmp $16, %edx { Length might be explicitly set to 16 or less; if so, skip a bit of work. }
jbe .Lnotfound { (Or rather, this check is *required* unless jumping to .Lcontinue instead of going directly to .Lloop) }
push %ebx
lea 16(%eax), %ecx { eax = original ptr, ecx = buf + 16 for aligning & shifts. }
and $-0x10, %ecx { first aligned address after buf }
sub %eax, %ecx { ecx=number of valid bytes, eax=original ptr }
.balign 16
.Lloop:
movdqa (%eax,%ecx), %xmm0 { eax and ecx may have any values, }
add $16, %ecx { but their sum is evenly divisible by 16. }
pcmpeqb %xmm1, %xmm0
pmovmskb %xmm0, %ebx
test %ebx, %ebx
jnz .Lmatch
.Lcontinue:
cmp %ecx, %edx
ja .Lloop
pop %ebx
.Lnotfound:
or $-1, %eax
ret
.LCrossPage:
push %ebx
lea 16(%eax), %ecx { eax = original ptr, ecx = buf + 16 for aligning & shifts. }
and $-0x10, %ecx { first aligned address after buf }
movdqa -16(%ecx), %xmm0 { Fetch first 16 bytes (up to 15 bytes before target) }
sub %eax, %ecx { ecx=number of valid bytes, eax=original ptr }
@@ -809,22 +854,6 @@ asm
pop %ebx
cmp %eax, %edx { check against the buffer length }
jbe .Lnotfound
ret
.balign 16
.Lloop:
movdqa (%eax,%ecx), %xmm0 { eax and ecx may have any values, }
add $16, %ecx { but their sum is evenly divisible by 16. }
pcmpeqb %xmm1, %xmm0
pmovmskb %xmm0, %ebx
test %ebx, %ebx
jnz .Lmatch
.Lcontinue:
cmp %ecx, %edx
ja .Lloop
pop %ebx
.Lnotfound:
or $-1, %eax
end;
{$ifndef CPUX86_HAS_SSE2}

View File

@@ -595,17 +595,65 @@ function IndexByte(Const buf;len:SizeInt;b:byte):SizeInt; assembler; nostackfram
asm
test len, len
jz .Lnotfound { exit if len=0 }
movd {$ifdef win64} %r8d {$else} %edx {$endif}, %xmm1
mov {$ifdef win64} %ecx {$else} %edi {$endif}, %eax
punpcklbw %xmm1, %xmm1
punpcklbw %xmm1, %xmm1
and $4095, %eax
pshufd $0, %xmm1, %xmm1
cmp $4080, %eax
ja .LCrossPage
movdqu ({$ifdef win64} %rcx {$else} %rdi {$endif}), %xmm0 { Analyze first 16 bytes, unaligned. }
pcmpeqb %xmm1, %xmm0
pmovmskb %xmm0, %eax
test %eax, %eax
jz .LContinueAligned
bsf %eax, %eax
cmp len, %rax
jae .Lnotfound
ret
.byte {$ifndef win64}102,102,102,102,{$endif}102,102,102,102,102,102,102,102,102,144 { Make .balign 16 before .Lloop a no-op. }
.LContinueAligned:
cmp $16, len { Length might be explicitly set to 16 or less; if so, skip a bit of work. }
jbe .Lnotfound { (Or rather, this check is *required* unless jumping to .Lcontinue instead of going directly to .Lloop) }
{$ifdef win64}
mov %rcx, %r8 { r8 = original ptr, rcx = buf + 16 for aligning & shifts. }
add $16, %rcx
{$else}
lea 16(%rdi), %rcx { rdi = original ptr, rcx = buf + 16 for aligning & shifts. }
{$endif}
and $-0x10, %rcx { first aligned address after buf }
sub {$ifdef win64} %r8 {$else} %rdi {$endif}, %rcx { rcx=number of valid bytes, r8/rdi=original ptr }
.balign 16
.Lloop:
movdqa ({$ifdef win64} %r8 {$else} %rdi {$endif},%rcx), %xmm0 { r8/rdi and rcx may have any values, }
add $16, %rcx { but their sum is evenly divisible by 16. }
pcmpeqb %xmm1, %xmm0
pmovmskb %xmm0, %eax
test %eax, %eax
jnz .Lmatch
.Lcontinue:
cmp %rcx, len
ja .Lloop
.Lnotfound:
or $-1, %rax
ret
.LCrossPage:
{$ifdef win64}
mov %rcx, %r8 { r8 = original ptr, rcx = buf + 16 for aligning & shifts. }
add $16, %rcx
{$else}
lea 16(%rdi), %rcx { rdi = original ptr, rcx = buf + 16 for aligning & shifts. }
{$endif}
punpcklbw %xmm1, %xmm1
and $-0x10, %rcx { first aligned address after buf }
punpcklbw %xmm1, %xmm1
pshufd $0, %xmm1, %xmm1
movdqa -16(%rcx), %xmm0 { Fetch first 16 bytes (up to 15 bytes before target) }
sub {$ifdef win64} %r8 {$else} %rdi {$endif}, %rcx { rcx=number of valid bytes, r8/rdi=original ptr }
@@ -621,21 +669,6 @@ asm
lea -16(%rcx,%rax), %rax
cmp %rax, len { check against the buffer length }
jbe .Lnotfound
ret
.balign 16
.Lloop:
movdqa ({$ifdef win64} %r8 {$else} %rdi {$endif},%rcx), %xmm0 { r8/rdi and rcx may have any values, }
add $16, %rcx { but their sum is evenly divisible by 16. }
pcmpeqb %xmm1, %xmm0
pmovmskb %xmm0, %eax
test %eax, %eax
jnz .Lmatch
.Lcontinue:
cmp %rcx, len
ja .Lloop
.Lnotfound:
or $-1, %rax
end;
{$endif FPC_SYSTEM_HAS_INDEXBYTE}