mirror of
https://gitlab.com/freepascal.org/fpc/source.git
synced 2025-08-11 21:46:00 +02:00
Remove runtime ABI adapter in x86_64.inc:IndexByte/Word, and save two jumps in the common case.
This commit is contained in:
parent
2575cbc439
commit
c29dd86bb2
@ -595,23 +595,21 @@ function IndexByte(Const buf;len:SizeInt;b:byte):SizeInt; assembler; nostackfram
|
|||||||
{ win64: rcx buf, rdx len, r8b byte
|
{ win64: rcx buf, rdx len, r8b byte
|
||||||
linux: rdi buf, rsi len, rdx byte }
|
linux: rdi buf, rsi len, rdx byte }
|
||||||
asm
|
asm
|
||||||
test {$ifdef win64} %rdx, %rdx {$else} %rsi, %rsi {$endif}
|
test len, len
|
||||||
jz .Lnotfound { exit if len=0 }
|
jz .Lnotfound { exit if len=0 }
|
||||||
|
movd {$ifdef win64} %r8d {$else} %edx {$endif}, %xmm1
|
||||||
{$ifdef win64}
|
{$ifdef win64}
|
||||||
movd %r8d, %xmm1
|
mov %rcx, %r8 { r8 = original ptr, rcx = buf + 16 for aligning & shifts. }
|
||||||
|
add $16, %rcx
|
||||||
{$else}
|
{$else}
|
||||||
movd %edx, %xmm1
|
lea 16(%rdi), %rcx { rdi = original ptr, rcx = buf + 16 for aligning & shifts. }
|
||||||
movq %rdi, %rcx
|
|
||||||
movq %rsi, %rdx
|
|
||||||
{$endif}
|
{$endif}
|
||||||
mov %rcx, %r8
|
|
||||||
punpcklbw %xmm1, %xmm1
|
punpcklbw %xmm1, %xmm1
|
||||||
and $-0x10, %rcx { highest aligned address before buf }
|
and $-0x10, %rcx { first aligned address after buf }
|
||||||
punpcklbw %xmm1, %xmm1
|
punpcklbw %xmm1, %xmm1
|
||||||
add $16, %rcx { first aligned address after buf }
|
|
||||||
pshufd $0, %xmm1, %xmm1
|
pshufd $0, %xmm1, %xmm1
|
||||||
movdqa -16(%rcx), %xmm0 { Fetch first 16 bytes (up to 15 bytes before target) }
|
movdqa -16(%rcx), %xmm0 { Fetch first 16 bytes (up to 15 bytes before target) }
|
||||||
sub %r8, %rcx { rcx=number of valid bytes, r8=original ptr }
|
sub {$ifdef win64} %r8 {$else} %rdi {$endif}, %rcx { rcx=number of valid bytes, r8/rdi=original ptr }
|
||||||
|
|
||||||
pcmpeqb %xmm1, %xmm0 { compare with pattern and get bitmask }
|
pcmpeqb %xmm1, %xmm0 { compare with pattern and get bitmask }
|
||||||
pmovmskb %xmm0, %eax
|
pmovmskb %xmm0, %eax
|
||||||
@ -619,28 +617,27 @@ asm
|
|||||||
shl %cl, %eax { shift valid bits into high word }
|
shl %cl, %eax { shift valid bits into high word }
|
||||||
and $0xffff0000, %eax { clear low word containing invalid bits }
|
and $0xffff0000, %eax { clear low word containing invalid bits }
|
||||||
shr %cl, %eax { shift back }
|
shr %cl, %eax { shift back }
|
||||||
jmp .Lcontinue
|
jz .Lcontinue
|
||||||
|
|
||||||
.balign 16
|
|
||||||
.Lloop:
|
|
||||||
movdqa (%r8,%rcx), %xmm0 { r8 and rcx may have any values, }
|
|
||||||
add $16, %rcx { but their sum is evenly divisible by 16. }
|
|
||||||
pcmpeqb %xmm1, %xmm0
|
|
||||||
pmovmskb %xmm0, %eax
|
|
||||||
.Lcontinue:
|
|
||||||
test %eax, %eax
|
|
||||||
jnz .Lmatch
|
|
||||||
cmp %rcx, %rdx
|
|
||||||
ja .Lloop
|
|
||||||
.Lnotfound:
|
|
||||||
or $-1, %rax
|
|
||||||
retq
|
|
||||||
|
|
||||||
.Lmatch:
|
.Lmatch:
|
||||||
bsf %eax, %eax
|
bsf %eax, %eax
|
||||||
lea -16(%rcx,%rax), %rax
|
lea -16(%rcx,%rax), %rax
|
||||||
cmp %rax, %rdx { check against the buffer length }
|
cmp %rax, len { check against the buffer length }
|
||||||
jbe .Lnotfound
|
jbe .Lnotfound
|
||||||
|
ret
|
||||||
|
|
||||||
|
.balign 16
|
||||||
|
.Lloop:
|
||||||
|
movdqa ({$ifdef win64} %r8 {$else} %rdi {$endif},%rcx), %xmm0 { r8/rdi and rcx may have any values, }
|
||||||
|
add $16, %rcx { but their sum is evenly divisible by 16. }
|
||||||
|
pcmpeqb %xmm1, %xmm0
|
||||||
|
pmovmskb %xmm0, %eax
|
||||||
|
test %eax, %eax
|
||||||
|
jnz .Lmatch
|
||||||
|
.Lcontinue:
|
||||||
|
cmp %rcx, len
|
||||||
|
ja .Lloop
|
||||||
|
.Lnotfound:
|
||||||
|
or $-1, %rax
|
||||||
end;
|
end;
|
||||||
{$endif FPC_SYSTEM_HAS_INDEXBYTE}
|
{$endif FPC_SYSTEM_HAS_INDEXBYTE}
|
||||||
|
|
||||||
@ -650,24 +647,22 @@ function IndexWord(Const buf;len:SizeInt;b:word):SizeInt; assembler; nostackfram
|
|||||||
{ win64: rcx buf, rdx len, r8w word
|
{ win64: rcx buf, rdx len, r8w word
|
||||||
linux: rdi buf, rsi len, rdx word }
|
linux: rdi buf, rsi len, rdx word }
|
||||||
asm
|
asm
|
||||||
test {$ifdef win64} %rdx, %rdx {$else} %rsi, %rsi {$endif}
|
test len, len
|
||||||
jz .Lnotfound { exit if len=0 }
|
jz .Lnotfound { exit if len=0 }
|
||||||
|
movd {$ifdef win64} %r8d {$else} %edx {$endif}, %xmm1
|
||||||
{$ifdef win64}
|
{$ifdef win64}
|
||||||
movd %r8d, %xmm1
|
mov %rcx, %r8 { r8 = original ptr, rcx = buf + 16 for aligning & shifts. }
|
||||||
|
add $16, %rcx
|
||||||
{$else}
|
{$else}
|
||||||
movd %edx, %xmm1
|
lea 16(%rdi), %rcx { rdi = original ptr, rcx = buf + 16 for aligning & shifts. }
|
||||||
movq %rdi, %rcx
|
|
||||||
movq %rsi, %rdx
|
|
||||||
{$endif}
|
{$endif}
|
||||||
mov %rcx, %r8
|
|
||||||
punpcklwd %xmm1, %xmm1
|
punpcklwd %xmm1, %xmm1
|
||||||
and $-0x10, %rcx
|
and $-0x10, %rcx
|
||||||
pshufd $0, %xmm1, %xmm1
|
pshufd $0, %xmm1, %xmm1
|
||||||
add $16, %rcx
|
|
||||||
movdqa -16(%rcx), %xmm0 { Fetch first 16 bytes (up to 14 bytes before target) }
|
movdqa -16(%rcx), %xmm0 { Fetch first 16 bytes (up to 14 bytes before target) }
|
||||||
sub %r8, %rcx { rcx=number of valid bytes }
|
sub {$ifdef win64} %r8 {$else} %rdi {$endif}, %rcx { rcx=number of valid bytes }
|
||||||
|
|
||||||
test $1, %r8b { if buffer isn't aligned to word boundary, }
|
test $1, {$ifdef win64} %r8b {$else} %dil {$endif} { if buffer isn't aligned to word boundary, }
|
||||||
jnz .Lunaligned { use a different algorithm }
|
jnz .Lunaligned { use a different algorithm }
|
||||||
|
|
||||||
pcmpeqw %xmm1, %xmm0
|
pcmpeqw %xmm1, %xmm0
|
||||||
@ -677,32 +672,32 @@ asm
|
|||||||
and $0xffff0000, %eax
|
and $0xffff0000, %eax
|
||||||
shr %cl, %eax
|
shr %cl, %eax
|
||||||
shr $1, %ecx { bytes->words }
|
shr $1, %ecx { bytes->words }
|
||||||
jmp .Lcontinue
|
|
||||||
|
|
||||||
.balign 16
|
|
||||||
.Lloop:
|
|
||||||
movdqa (%r8,%rcx,2), %xmm0
|
|
||||||
add $8, %rcx
|
|
||||||
pcmpeqw %xmm1, %xmm0
|
|
||||||
pmovmskb %xmm0, %eax
|
|
||||||
.Lcontinue:
|
|
||||||
test %eax, %eax
|
test %eax, %eax
|
||||||
jnz .Lmatch
|
jz .Lcontinue
|
||||||
cmp %rcx, %rdx
|
|
||||||
ja .Lloop
|
|
||||||
|
|
||||||
.Lnotfound:
|
|
||||||
or $-1, %rax
|
|
||||||
retq
|
|
||||||
|
|
||||||
.Lmatch:
|
.Lmatch:
|
||||||
bsf %eax, %eax
|
bsf %eax, %eax
|
||||||
shr $1, %eax { in words }
|
shr $1, %eax { in words }
|
||||||
lea -8(%rcx,%rax), %rax
|
lea -8(%rcx,%rax), %rax
|
||||||
cmp %rax, %rdx
|
cmp %rax, len
|
||||||
jbe .Lnotfound { if match is after the specified length, ignore it }
|
jbe .Lnotfound { if match is after the specified length, ignore it }
|
||||||
retq
|
retq
|
||||||
|
|
||||||
|
.balign 16
|
||||||
|
.Lloop:
|
||||||
|
movdqa ({$ifdef win64} %r8 {$else} %rdi {$endif},%rcx,2), %xmm0
|
||||||
|
add $8, %rcx
|
||||||
|
pcmpeqw %xmm1, %xmm0
|
||||||
|
pmovmskb %xmm0, %eax
|
||||||
|
test %eax, %eax
|
||||||
|
jnz .Lmatch
|
||||||
|
.Lcontinue:
|
||||||
|
cmp %rcx, len
|
||||||
|
ja .Lloop
|
||||||
|
|
||||||
|
.Lnotfound:
|
||||||
|
or $-1, %rax
|
||||||
|
retq
|
||||||
|
|
||||||
.Lunaligned:
|
.Lunaligned:
|
||||||
movdqa %xmm1, %xmm2 { (mis)align the pattern (in this particular case: }
|
movdqa %xmm1, %xmm2 { (mis)align the pattern (in this particular case: }
|
||||||
psllw $8, %xmm1 { swap bytes of each word of pattern) }
|
psllw $8, %xmm1 { swap bytes of each word of pattern) }
|
||||||
@ -716,13 +711,13 @@ asm
|
|||||||
and $0xffff0000, %eax
|
and $0xffff0000, %eax
|
||||||
shr %cl, %eax
|
shr %cl, %eax
|
||||||
|
|
||||||
add %rdx, %rdx { length words -> bytes }
|
add len, len { length words -> bytes }
|
||||||
xor %r10d, %r10d { nothing to merge yet }
|
xor %r10d, %r10d { nothing to merge yet }
|
||||||
jmp .Lcontinue_u
|
jmp .Lcontinue_u
|
||||||
|
|
||||||
.balign 16
|
.balign 16
|
||||||
.Lloop_u:
|
.Lloop_u:
|
||||||
movdqa (%r8,%rcx), %xmm0
|
movdqa ({$ifdef win64} %r8 {$else} %rdi {$endif},%rcx), %xmm0
|
||||||
add $16, %rcx
|
add $16, %rcx
|
||||||
pcmpeqb %xmm1, %xmm0 { compare by bytes }
|
pcmpeqb %xmm1, %xmm0 { compare by bytes }
|
||||||
shr $16, %r10d { bit 16 shifts into 0 }
|
shr $16, %r10d { bit 16 shifts into 0 }
|
||||||
@ -735,7 +730,7 @@ asm
|
|||||||
and %r10d, %eax
|
and %r10d, %eax
|
||||||
and $0x5555, %eax { also reset odd bits }
|
and $0x5555, %eax { also reset odd bits }
|
||||||
jnz .Lmatch_u
|
jnz .Lmatch_u
|
||||||
cmpq %rcx, %rdx
|
cmpq %rcx, len
|
||||||
ja .Lloop_u
|
ja .Lloop_u
|
||||||
|
|
||||||
.Lnotfound_u:
|
.Lnotfound_u:
|
||||||
@ -744,7 +739,7 @@ asm
|
|||||||
.Lmatch_u:
|
.Lmatch_u:
|
||||||
bsf %eax, %eax
|
bsf %eax, %eax
|
||||||
lea -16(%rcx,%rax), %rax
|
lea -16(%rcx,%rax), %rax
|
||||||
cmp %rax, %rdx
|
cmp %rax, len
|
||||||
jbe .Lnotfound_u { if match is after the specified length, ignore it }
|
jbe .Lnotfound_u { if match is after the specified length, ignore it }
|
||||||
sar $1, %rax { in words }
|
sar $1, %rax { in words }
|
||||||
end;
|
end;
|
||||||
|
Loading…
Reference in New Issue
Block a user