Remove runtime ABI adapter in x86_64.inc:IndexByte/Word, and save two jumps in the common case.

This commit is contained in:
Rika Ichinose 2023-11-17 19:53:01 +03:00 committed by FPK
parent 2575cbc439
commit c29dd86bb2

View File

@ -595,23 +595,21 @@ function IndexByte(Const buf;len:SizeInt;b:byte):SizeInt; assembler; nostackfram
{ win64: rcx buf, rdx len, r8b byte { win64: rcx buf, rdx len, r8b byte
linux: rdi buf, rsi len, rdx byte } linux: rdi buf, rsi len, rdx byte }
asm asm
test {$ifdef win64} %rdx, %rdx {$else} %rsi, %rsi {$endif} test len, len
jz .Lnotfound { exit if len=0 } jz .Lnotfound { exit if len=0 }
movd {$ifdef win64} %r8d {$else} %edx {$endif}, %xmm1
{$ifdef win64} {$ifdef win64}
movd %r8d, %xmm1 mov %rcx, %r8 { r8 = original ptr, rcx = buf + 16 for aligning & shifts. }
add $16, %rcx
{$else} {$else}
movd %edx, %xmm1 lea 16(%rdi), %rcx { rdi = original ptr, rcx = buf + 16 for aligning & shifts. }
movq %rdi, %rcx
movq %rsi, %rdx
{$endif} {$endif}
mov %rcx, %r8
punpcklbw %xmm1, %xmm1 punpcklbw %xmm1, %xmm1
and $-0x10, %rcx { highest aligned address before buf } and $-0x10, %rcx { first aligned address after buf }
punpcklbw %xmm1, %xmm1 punpcklbw %xmm1, %xmm1
add $16, %rcx { first aligned address after buf }
pshufd $0, %xmm1, %xmm1 pshufd $0, %xmm1, %xmm1
movdqa -16(%rcx), %xmm0 { Fetch first 16 bytes (up to 15 bytes before target) } movdqa -16(%rcx), %xmm0 { Fetch first 16 bytes (up to 15 bytes before target) }
sub %r8, %rcx { rcx=number of valid bytes, r8=original ptr } sub {$ifdef win64} %r8 {$else} %rdi {$endif}, %rcx { rcx=number of valid bytes, r8/rdi=original ptr }
pcmpeqb %xmm1, %xmm0 { compare with pattern and get bitmask } pcmpeqb %xmm1, %xmm0 { compare with pattern and get bitmask }
pmovmskb %xmm0, %eax pmovmskb %xmm0, %eax
@ -619,28 +617,27 @@ asm
shl %cl, %eax { shift valid bits into high word } shl %cl, %eax { shift valid bits into high word }
and $0xffff0000, %eax { clear low word containing invalid bits } and $0xffff0000, %eax { clear low word containing invalid bits }
shr %cl, %eax { shift back } shr %cl, %eax { shift back }
jmp .Lcontinue jz .Lcontinue
.balign 16
.Lloop:
movdqa (%r8,%rcx), %xmm0 { r8 and rcx may have any values, }
add $16, %rcx { but their sum is evenly divisible by 16. }
pcmpeqb %xmm1, %xmm0
pmovmskb %xmm0, %eax
.Lcontinue:
test %eax, %eax
jnz .Lmatch
cmp %rcx, %rdx
ja .Lloop
.Lnotfound:
or $-1, %rax
retq
.Lmatch: .Lmatch:
bsf %eax, %eax bsf %eax, %eax
lea -16(%rcx,%rax), %rax lea -16(%rcx,%rax), %rax
cmp %rax, %rdx { check against the buffer length } cmp %rax, len { check against the buffer length }
jbe .Lnotfound jbe .Lnotfound
ret
.balign 16
.Lloop:
movdqa ({$ifdef win64} %r8 {$else} %rdi {$endif},%rcx), %xmm0 { r8/rdi and rcx may have any values, }
add $16, %rcx { but their sum is evenly divisible by 16. }
pcmpeqb %xmm1, %xmm0
pmovmskb %xmm0, %eax
test %eax, %eax
jnz .Lmatch
.Lcontinue:
cmp %rcx, len
ja .Lloop
.Lnotfound:
or $-1, %rax
end; end;
{$endif FPC_SYSTEM_HAS_INDEXBYTE} {$endif FPC_SYSTEM_HAS_INDEXBYTE}
@ -650,24 +647,22 @@ function IndexWord(Const buf;len:SizeInt;b:word):SizeInt; assembler; nostackfram
{ win64: rcx buf, rdx len, r8w word { win64: rcx buf, rdx len, r8w word
linux: rdi buf, rsi len, rdx word } linux: rdi buf, rsi len, rdx word }
asm asm
test {$ifdef win64} %rdx, %rdx {$else} %rsi, %rsi {$endif} test len, len
jz .Lnotfound { exit if len=0 } jz .Lnotfound { exit if len=0 }
movd {$ifdef win64} %r8d {$else} %edx {$endif}, %xmm1
{$ifdef win64} {$ifdef win64}
movd %r8d, %xmm1 mov %rcx, %r8 { r8 = original ptr, rcx = buf + 16 for aligning & shifts. }
add $16, %rcx
{$else} {$else}
movd %edx, %xmm1 lea 16(%rdi), %rcx { rdi = original ptr, rcx = buf + 16 for aligning & shifts. }
movq %rdi, %rcx
movq %rsi, %rdx
{$endif} {$endif}
mov %rcx, %r8
punpcklwd %xmm1, %xmm1 punpcklwd %xmm1, %xmm1
and $-0x10, %rcx and $-0x10, %rcx
pshufd $0, %xmm1, %xmm1 pshufd $0, %xmm1, %xmm1
add $16, %rcx
movdqa -16(%rcx), %xmm0 { Fetch first 16 bytes (up to 14 bytes before target) } movdqa -16(%rcx), %xmm0 { Fetch first 16 bytes (up to 14 bytes before target) }
sub %r8, %rcx { rcx=number of valid bytes } sub {$ifdef win64} %r8 {$else} %rdi {$endif}, %rcx { rcx=number of valid bytes }
test $1, %r8b { if buffer isn't aligned to word boundary, } test $1, {$ifdef win64} %r8b {$else} %dil {$endif} { if buffer isn't aligned to word boundary, }
jnz .Lunaligned { use a different algorithm } jnz .Lunaligned { use a different algorithm }
pcmpeqw %xmm1, %xmm0 pcmpeqw %xmm1, %xmm0
@ -677,32 +672,32 @@ asm
and $0xffff0000, %eax and $0xffff0000, %eax
shr %cl, %eax shr %cl, %eax
shr $1, %ecx { bytes->words } shr $1, %ecx { bytes->words }
jmp .Lcontinue
.balign 16
.Lloop:
movdqa (%r8,%rcx,2), %xmm0
add $8, %rcx
pcmpeqw %xmm1, %xmm0
pmovmskb %xmm0, %eax
.Lcontinue:
test %eax, %eax test %eax, %eax
jnz .Lmatch jz .Lcontinue
cmp %rcx, %rdx
ja .Lloop
.Lnotfound:
or $-1, %rax
retq
.Lmatch: .Lmatch:
bsf %eax, %eax bsf %eax, %eax
shr $1, %eax { in words } shr $1, %eax { in words }
lea -8(%rcx,%rax), %rax lea -8(%rcx,%rax), %rax
cmp %rax, %rdx cmp %rax, len
jbe .Lnotfound { if match is after the specified length, ignore it } jbe .Lnotfound { if match is after the specified length, ignore it }
retq retq
.balign 16
.Lloop:
movdqa ({$ifdef win64} %r8 {$else} %rdi {$endif},%rcx,2), %xmm0
add $8, %rcx
pcmpeqw %xmm1, %xmm0
pmovmskb %xmm0, %eax
test %eax, %eax
jnz .Lmatch
.Lcontinue:
cmp %rcx, len
ja .Lloop
.Lnotfound:
or $-1, %rax
retq
.Lunaligned: .Lunaligned:
movdqa %xmm1, %xmm2 { (mis)align the pattern (in this particular case: } movdqa %xmm1, %xmm2 { (mis)align the pattern (in this particular case: }
psllw $8, %xmm1 { swap bytes of each word of pattern) } psllw $8, %xmm1 { swap bytes of each word of pattern) }
@ -716,13 +711,13 @@ asm
and $0xffff0000, %eax and $0xffff0000, %eax
shr %cl, %eax shr %cl, %eax
add %rdx, %rdx { length words -> bytes } add len, len { length words -> bytes }
xor %r10d, %r10d { nothing to merge yet } xor %r10d, %r10d { nothing to merge yet }
jmp .Lcontinue_u jmp .Lcontinue_u
.balign 16 .balign 16
.Lloop_u: .Lloop_u:
movdqa (%r8,%rcx), %xmm0 movdqa ({$ifdef win64} %r8 {$else} %rdi {$endif},%rcx), %xmm0
add $16, %rcx add $16, %rcx
pcmpeqb %xmm1, %xmm0 { compare by bytes } pcmpeqb %xmm1, %xmm0 { compare by bytes }
shr $16, %r10d { bit 16 shifts into 0 } shr $16, %r10d { bit 16 shifts into 0 }
@ -735,7 +730,7 @@ asm
and %r10d, %eax and %r10d, %eax
and $0x5555, %eax { also reset odd bits } and $0x5555, %eax { also reset odd bits }
jnz .Lmatch_u jnz .Lmatch_u
cmpq %rcx, %rdx cmpq %rcx, len
ja .Lloop_u ja .Lloop_u
.Lnotfound_u: .Lnotfound_u:
@ -744,7 +739,7 @@ asm
.Lmatch_u: .Lmatch_u:
bsf %eax, %eax bsf %eax, %eax
lea -16(%rcx,%rax), %rax lea -16(%rcx,%rax), %rax
cmp %rax, %rdx cmp %rax, len
jbe .Lnotfound_u { if match is after the specified length, ignore it } jbe .Lnotfound_u { if match is after the specified length, ignore it }
sar $1, %rax { in words } sar $1, %rax { in words }
end; end;