mirror of
https://gitlab.com/freepascal.org/fpc/source.git
synced 2025-04-06 23:28:28 +02:00
Faster path for IndexByte with a match at the beginning (within the first 16 bytes).
This commit is contained in:
parent
edf7b26f52
commit
ca0e04a346
@ -786,13 +786,58 @@ function {$ifdef CPUX86_HAS_SSE2} IndexByte {$else} IndexByte_SSE2 {$endif} (con
|
||||
asm
|
||||
test %edx, %edx
|
||||
jz .Lnotfound { exit if len=0 }
|
||||
push %ebx
|
||||
|
||||
movd %ecx, %xmm1
|
||||
lea 16(%eax), %ecx { eax = original ptr, ecx = buf + 16 for aligning & shifts. }
|
||||
mov %eax, %ecx
|
||||
punpcklbw %xmm1, %xmm1
|
||||
and $-0x10, %ecx { first aligned address after buf }
|
||||
punpcklbw %xmm1, %xmm1
|
||||
and $4095, %ecx
|
||||
pshufd $0, %xmm1, %xmm1
|
||||
|
||||
cmp $4080, %ecx
|
||||
ja .LCrossPage
|
||||
|
||||
movdqu (%eax), %xmm0 { Analyze first 16 bytes, unaligned. }
|
||||
pcmpeqb %xmm1, %xmm0
|
||||
pmovmskb %xmm0, %ecx
|
||||
test %ecx, %ecx
|
||||
jz .LContinueAligned
|
||||
|
||||
bsf %ecx, %eax
|
||||
cmp %edx, %eax
|
||||
jae .Lnotfound
|
||||
ret
|
||||
|
||||
.byte 144 { Make .balign 16 before .Lloop a no-op. }
|
||||
.LContinueAligned:
|
||||
cmp $16, %edx { Length might be explicitly set to 16 or less; if so, skip a bit of work. }
|
||||
jbe .Lnotfound { (Or rather, this check is *required* unless jumping to .Lcontinue instead of going directly to .Lloop) }
|
||||
|
||||
push %ebx
|
||||
lea 16(%eax), %ecx { eax = original ptr, ecx = buf + 16 for aligning & shifts. }
|
||||
and $-0x10, %ecx { first aligned address after buf }
|
||||
sub %eax, %ecx { ecx=number of valid bytes, eax=original ptr }
|
||||
|
||||
.balign 16
|
||||
.Lloop:
|
||||
movdqa (%eax,%ecx), %xmm0 { eax and ecx may have any values, }
|
||||
add $16, %ecx { but their sum is evenly divisible by 16. }
|
||||
pcmpeqb %xmm1, %xmm0
|
||||
pmovmskb %xmm0, %ebx
|
||||
test %ebx, %ebx
|
||||
jnz .Lmatch
|
||||
.Lcontinue:
|
||||
cmp %ecx, %edx
|
||||
ja .Lloop
|
||||
pop %ebx
|
||||
.Lnotfound:
|
||||
or $-1, %eax
|
||||
ret
|
||||
|
||||
.LCrossPage:
|
||||
push %ebx
|
||||
lea 16(%eax), %ecx { eax = original ptr, ecx = buf + 16 for aligning & shifts. }
|
||||
and $-0x10, %ecx { first aligned address after buf }
|
||||
movdqa -16(%ecx), %xmm0 { Fetch first 16 bytes (up to 15 bytes before target) }
|
||||
sub %eax, %ecx { ecx=number of valid bytes, eax=original ptr }
|
||||
|
||||
@ -809,22 +854,6 @@ asm
|
||||
pop %ebx
|
||||
cmp %eax, %edx { check against the buffer length }
|
||||
jbe .Lnotfound
|
||||
ret
|
||||
|
||||
.balign 16
|
||||
.Lloop:
|
||||
movdqa (%eax,%ecx), %xmm0 { eax and ecx may have any values, }
|
||||
add $16, %ecx { but their sum is evenly divisible by 16. }
|
||||
pcmpeqb %xmm1, %xmm0
|
||||
pmovmskb %xmm0, %ebx
|
||||
test %ebx, %ebx
|
||||
jnz .Lmatch
|
||||
.Lcontinue:
|
||||
cmp %ecx, %edx
|
||||
ja .Lloop
|
||||
pop %ebx
|
||||
.Lnotfound:
|
||||
or $-1, %eax
|
||||
end;
|
||||
|
||||
{$ifndef CPUX86_HAS_SSE2}
|
||||
|
@ -595,17 +595,65 @@ function IndexByte(Const buf;len:SizeInt;b:byte):SizeInt; assembler; nostackfram
|
||||
asm
|
||||
test len, len
|
||||
jz .Lnotfound { exit if len=0 }
|
||||
|
||||
movd {$ifdef win64} %r8d {$else} %edx {$endif}, %xmm1
|
||||
mov {$ifdef win64} %ecx {$else} %edi {$endif}, %eax
|
||||
punpcklbw %xmm1, %xmm1
|
||||
punpcklbw %xmm1, %xmm1
|
||||
and $4095, %eax
|
||||
pshufd $0, %xmm1, %xmm1
|
||||
|
||||
cmp $4080, %eax
|
||||
ja .LCrossPage
|
||||
|
||||
movdqu ({$ifdef win64} %rcx {$else} %rdi {$endif}), %xmm0 { Analyze first 16 bytes, unaligned. }
|
||||
pcmpeqb %xmm1, %xmm0
|
||||
pmovmskb %xmm0, %eax
|
||||
test %eax, %eax
|
||||
jz .LContinueAligned
|
||||
|
||||
bsf %eax, %eax
|
||||
cmp len, %rax
|
||||
jae .Lnotfound
|
||||
ret
|
||||
|
||||
.byte {$ifndef win64}102,102,102,102,{$endif}102,102,102,102,102,102,102,102,102,144 { Make .balign 16 before .Lloop a no-op. }
|
||||
.LContinueAligned:
|
||||
cmp $16, len { Length might be explicitly set to 16 or less; if so, skip a bit of work. }
|
||||
jbe .Lnotfound { (Or rather, this check is *required* unless jumping to .Lcontinue instead of going directly to .Lloop) }
|
||||
|
||||
{$ifdef win64}
|
||||
mov %rcx, %r8 { r8 = original ptr, rcx = buf + 16 for aligning & shifts. }
|
||||
add $16, %rcx
|
||||
{$else}
|
||||
lea 16(%rdi), %rcx { rdi = original ptr, rcx = buf + 16 for aligning & shifts. }
|
||||
{$endif}
|
||||
and $-0x10, %rcx { first aligned address after buf }
|
||||
sub {$ifdef win64} %r8 {$else} %rdi {$endif}, %rcx { rcx=number of valid bytes, r8/rdi=original ptr }
|
||||
|
||||
.balign 16
|
||||
.Lloop:
|
||||
movdqa ({$ifdef win64} %r8 {$else} %rdi {$endif},%rcx), %xmm0 { r8/rdi and rcx may have any values, }
|
||||
add $16, %rcx { but their sum is evenly divisible by 16. }
|
||||
pcmpeqb %xmm1, %xmm0
|
||||
pmovmskb %xmm0, %eax
|
||||
test %eax, %eax
|
||||
jnz .Lmatch
|
||||
.Lcontinue:
|
||||
cmp %rcx, len
|
||||
ja .Lloop
|
||||
.Lnotfound:
|
||||
or $-1, %rax
|
||||
ret
|
||||
|
||||
.LCrossPage:
|
||||
{$ifdef win64}
|
||||
mov %rcx, %r8 { r8 = original ptr, rcx = buf + 16 for aligning & shifts. }
|
||||
add $16, %rcx
|
||||
{$else}
|
||||
lea 16(%rdi), %rcx { rdi = original ptr, rcx = buf + 16 for aligning & shifts. }
|
||||
{$endif}
|
||||
punpcklbw %xmm1, %xmm1
|
||||
and $-0x10, %rcx { first aligned address after buf }
|
||||
punpcklbw %xmm1, %xmm1
|
||||
pshufd $0, %xmm1, %xmm1
|
||||
movdqa -16(%rcx), %xmm0 { Fetch first 16 bytes (up to 15 bytes before target) }
|
||||
sub {$ifdef win64} %r8 {$else} %rdi {$endif}, %rcx { rcx=number of valid bytes, r8/rdi=original ptr }
|
||||
|
||||
@ -621,21 +669,6 @@ asm
|
||||
lea -16(%rcx,%rax), %rax
|
||||
cmp %rax, len { check against the buffer length }
|
||||
jbe .Lnotfound
|
||||
ret
|
||||
|
||||
.balign 16
|
||||
.Lloop:
|
||||
movdqa ({$ifdef win64} %r8 {$else} %rdi {$endif},%rcx), %xmm0 { r8/rdi and rcx may have any values, }
|
||||
add $16, %rcx { but their sum is evenly divisible by 16. }
|
||||
pcmpeqb %xmm1, %xmm0
|
||||
pmovmskb %xmm0, %eax
|
||||
test %eax, %eax
|
||||
jnz .Lmatch
|
||||
.Lcontinue:
|
||||
cmp %rcx, len
|
||||
ja .Lloop
|
||||
.Lnotfound:
|
||||
or $-1, %rax
|
||||
end;
|
||||
{$endif FPC_SYSTEM_HAS_INDEXBYTE}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user