mirror of
https://gitlab.com/freepascal.org/fpc/source.git
synced 2025-04-23 03:49:47 +02:00
+ x86_64 optimized assembler functions IndexByte and IndexWord
+ Extended tests of IndexByte with checks to verify correct operation when passed length is -1. git-svn-id: trunk@17281 -
This commit is contained in:
parent
7f995c093e
commit
c5e7902e4b
@ -458,7 +458,156 @@ Procedure FillChar(var x;count:SizeInt;value:byte);assembler;nostackframe;
|
||||
end;
|
||||
{$endif FPC_SYSTEM_HAS_FILLCHAR}
|
||||
|
||||
{$ifndef FPC_SYSTEM_HAS_INDEXBYTE}
|
||||
{ based on libc/sysdeps/x86_64/memchr.S }
|
||||
{$define FPC_SYSTEM_HAS_INDEXBYTE}
|
||||
function IndexByte(Const buf;len:SizeInt;b:byte):SizeInt; assembler; nostackframe;
{ Returns the zero-based index of the first byte equal to b among the first
  len bytes of buf, or -1 if there is no such byte or len=0.
  len is compared as an unsigned value, so len=-1 means "no limit".

  The buffer is scanned in whole 16-byte aligned blocks: bytes in front of buf
  inside the first block are masked out, and a match at or beyond len is
  rejected before returning.

  win64: rcx buf, rdx len, r8b b
  linux: rdi buf, rsi len, dl  b

  Register roles after the setup code:
    rax   buf rounded down to a 16-byte boundary (block base)
    rdx   len + misalignment, i.e. the limit relative to rax
    r9    misalignment of buf (0..15), subtracted from the result at the end
    r8    offset (relative to rax) of the block following the current one
    xmm1  b replicated into all 16 bytes }
asm
{$ifdef win64}
    movd    %r8d, %xmm1
{$else}
    movd    %edx, %xmm1
    movq    %rdi, %rcx
    movq    %rsi, %rdx
{$endif}
    mov     %rcx, %rax             { duplicate buf }
    punpcklbw  %xmm1, %xmm1
    and     $0xfffffffffffffff0, %rax
    test    %rdx, %rdx
    punpcklbw  %xmm1, %xmm1
    jz      .L3                    { exit if len=0 }
    or      $-1, %r8               { r8 = all ones in the full 64 bits: it is used both as the }
                                   { saturation value for rdx and (low 32 bits) as the start mask }
    movdqa  (%rax), %xmm0          { Fetch first 16 bytes (up to 15 bytes before target) }
    pshufd  $0, %xmm1, %xmm1
    sub     %rax, %rcx             { rcx=misalignment }
    pcmpeqb %xmm1, %xmm0
    add     %rcx, %rdx             { add misalignment to length }
    cmovb   %r8, %rdx              { if it overflows (happens when length=-1), set back to -1, }
                                   { otherwise loop will terminate too early }
    mov     %rcx, %r9              { and save it, will subtract back in the end }
    shl     %cl, %r8d              { clear the mask bits of the bytes preceding buf }
    pmovmskb %xmm0, %ecx
    andl    %r8d, %ecx             { mask away matches before buffer start }
    movl    $16, %r8d              { mov leaves the flags of the preceding and intact }
    jnz     .L1                    { got a match within buffer -> we're done (almost) }
    cmpq    %r8, %rdx
    jbe     .L3                    { the limit lies within the first block -> not found }

    .balign 16
.L2:
    movdqa  (%rax,%r8), %xmm0
    lea     16(%r8), %r8
    pcmpeqb %xmm1, %xmm0
    pmovmskb %xmm0, %ecx
    test    %ecx, %ecx
    jnz     .L1
    cmp     %r8, %rdx
    ja      .L2                    { continue while the next block starts below the limit }

.L3:
    or      $-1, %rax
    jmp     .Ldone

.L1:
    bsfl    %ecx, %ecx             { compute position of the first match }
    lea     -16(%rcx,%r8), %rax    { r8 already points past the matching block }
    cmp     %rax, %rdx
    jbe     .L3                    { if it is after the specified length, ignore it }
    sub     %r9, %rax              { make the position relative to buf }
.Ldone:
end;
|
||||
{$endif FPC_SYSTEM_HAS_INDEXBYTE}
|
||||
|
||||
{$ifndef FPC_SYSTEM_HAS_INDEXWORD}
|
||||
{$define FPC_SYSTEM_HAS_INDEXWORD}
|
||||
function IndexWord(Const buf;len:SizeInt;b:word):SizeInt; assembler; nostackframe;
{ Returns the zero-based index (counted in words) of the first 16-bit word
  equal to b among the first len words of buf, or -1 if there is no such word
  or len=0. len is compared as an unsigned value, so len=-1 means "no limit".

  When buf is aligned to a word boundary the buffer is scanned in whole
  16-byte aligned blocks: words in front of buf inside the first block are
  masked out, and a match at or beyond len is rejected before returning.
  When buf is at an odd address the words are compared one at a time, so
  nothing beyond the word being examined is ever read.

  win64: rcx buf, rdx len, r8w b
  linux: rdi buf, rsi len, dx  b

  Register roles in the aligned path after the setup code:
    rax   buf rounded down to a 16-byte boundary (block base)
    rdx   len + misalignment in words, i.e. the limit relative to rax
    r9    misalignment of buf in words (0..7), subtracted from the result
    r8    word offset (relative to rax) of the block following the current one
    xmm1  b replicated into all 8 words }
asm
{$ifdef win64}
    movd    %r8d, %xmm1
{$else}
    movd    %edx, %xmm1
    movq    %rdi, %rcx
    movq    %rsi, %rdx
{$endif}
    mov     %rcx, %rax             { duplicate buf }
    punpcklwd  %xmm1, %xmm1
    and     $0xfffffffffffffff0, %rax
    test    %rdx, %rdx
    pshufd  $0, %xmm1, %xmm1
    jz      .L3                    { exit if len=0 }
    or      $-1, %r8               { r8 = all ones in the full 64 bits: it is used both as the }
                                   { saturation value for rdx and (low 32 bits) as the start mask }
    test    $1, %cl                { if buffer isn't aligned to word boundary, }
    jnz     .Lunaligned            { fallback to slower unaligned loop }

    movdqa  (%rax), %xmm0          { Fetch first 16 bytes (up to 14 bytes before target) }
    sub     %rax, %rcx             { rcx=misalignment }
    pcmpeqw %xmm1, %xmm0

    mov     %rcx, %r9
    shr     $1, %r9                { save misalignment in words }

    add     %r9, %rdx              { add misalignment to length }
    cmovb   %r8, %rdx              { if it overflows (happens when length=-1), set back to -1, }
                                   { otherwise loop will terminate too early }
    shl     %cl, %r8d              { clear the mask bits of the bytes preceding buf }
    pmovmskb %xmm0, %ecx           { two mask bits per word }
    andl    %r8d, %ecx             { mask away matches before buffer start }
    movl    $8, %r8d               { mov leaves the flags of the preceding and intact }
    jnz     .L1                    { got a match within buffer -> we're done (almost) }
    cmpq    %r8, %rdx
    jbe     .L3                    { the limit lies within the first block -> not found }

    .balign 16
.L2:
    movdqa  (%rax,%r8,2), %xmm0
    lea     8(%r8), %r8
    pcmpeqw %xmm1, %xmm0
    pmovmskb %xmm0, %ecx
    test    %ecx, %ecx
    jnz     .L1
    cmp     %r8, %rdx
    ja      .L2                    { continue while the next block starts below the limit }

.L3:
    or      $-1, %rax
    jmp     .Ldone

.L1:
    bsfl    %ecx, %ecx             { compute position of the first match }
    shr     $1, %ecx               { in words }
    lea     -8(%rcx,%r8), %rax     { r8 already points past the matching block }
    cmp     %rax, %rdx
    jbe     .L3                    { if it is after the specified length, ignore it }
    sub     %r9, %rax              { make the position relative to buf }
.Ldone:
    retq

{ buf is at an odd address. A 16-byte unaligned load could reach past the last
  word of the buffer (or past the match when len=-1) and fault on a page
  boundary, therefore compare a single word per iteration.
  TODO: aligned processing is still possible, but for now
        use the simplest form }
.Lunaligned:
    movd    %xmm1, %r8d            { low dword of xmm1 holds b twice }
    xor     %eax, %eax             { rax = index of the word being examined }
    and     $0xffff, %r8d          { r8d = b, zero-extended }

    .balign 16
.L2u:
    movzwl  (%rcx,%rax,2), %r9d
    cmp     %r8d, %r9d
    je      .Ldone                 { rax already holds the result }
    add     $1, %rax
    cmp     %rax, %rdx
    ja      .L2u                   { unsigned compare: len=-1 never stops here }
    or      $-1, %rax
end;
|
||||
{$endif FPC_SYSTEM_HAS_INDEXWORD}
|
||||
|
||||
{$asmmode att}
|
||||
{$define FPC_SYSTEM_HAS_DECLOCKED_LONGINT}
|
||||
{ does a thread safe inc/dec }
|
||||
function declocked(var l : longint) : boolean;assembler;
|
||||
|
@ -46,12 +46,29 @@ begin
|
||||
writeln('indexbyte error 2 for (',i,',',j,',',k,')');
|
||||
halt(2);
|
||||
end;
|
||||
{same for length=-1}
|
||||
if indexbyte(b[k+4],-1,0)<>index then
|
||||
begin
|
||||
writeln(indexbyte(b[k+4],-1,0),' <> ',index);
|
||||
writeln('indexbyte error 2a for (',i,',',j,',',k,')');
|
||||
halt(22);
|
||||
end;
|
||||
|
||||
|
||||
if indexbyte(b[k+4],i,b[k+4+i-1])<>i-1 then
|
||||
begin
|
||||
writeln('indexbyte error 3 for (',i,',',j,',',k,')');
|
||||
halt(3);
|
||||
end;
|
||||
{same for length=-1}
|
||||
if i<>0 then // previous test will be no-op when i=0
|
||||
if indexbyte(b[k+4],-1,b[k+4+i-1])<>i-1 then
|
||||
begin
|
||||
writeln('indexbyte error 3a for (',i,',',j,',',k,')');
|
||||
halt(23);
|
||||
end;
|
||||
|
||||
|
||||
if (i<1) then
|
||||
index:=-1
|
||||
else
|
||||
@ -62,6 +79,16 @@ begin
|
||||
writeln('indexbyte error 4 for (',i,',',j,',',k,')');
|
||||
halt(4);
|
||||
end;
|
||||
{same for length=-1}
|
||||
if i<>0 then // previous test will be no-op when i=0
|
||||
if indexbyte(b[k+4],-1,b[k+4+i shr 1])<>index then
|
||||
begin
|
||||
writeln(indexbyte(b[k+4],-1,b[k+4+i shr 1]),' <> ',index);
|
||||
writeln('indexbyte error 4a for (',i,',',j,',',k,')');
|
||||
halt(24);
|
||||
end;
|
||||
|
||||
|
||||
if (i=0) then
|
||||
index:=-1
|
||||
else
|
||||
@ -69,8 +96,15 @@ begin
|
||||
if indexbyte(b[k+4],i,b[k+4])<>index then
|
||||
begin
|
||||
writeln('indexbyte error 5 for (',i,',',j,',',k,')');
|
||||
halt(3);
|
||||
halt(5);
|
||||
end;
|
||||
{same for length=-1}
|
||||
if i<>0 then
|
||||
if indexbyte(b[k+4],-1,b[k+4])<>index then
|
||||
begin
|
||||
writeln('indexbyte error 5a for (',i,',',j,',',k,')');
|
||||
halt(25);
|
||||
end;
|
||||
|
||||
|
||||
if indexword(b[k+4],i shr 1,0)<>-1 then
|
||||
|
Loading…
Reference in New Issue
Block a user