+ x86_64 optimized assembler functions IndexByte and IndexWord

+ Extended the IndexByte tests with checks that verify correct operation when the passed length is -1.

git-svn-id: trunk@17281 -
This commit is contained in:
sergei 2011-04-10 17:05:18 +00:00
parent 7f995c093e
commit c5e7902e4b
2 changed files with 184 additions and 1 deletions
rtl/x86_64
tests/test

View File

@ -458,7 +458,156 @@ Procedure FillChar(var x;count:SizeInt;value:byte);assembler;nostackframe;
end;
{$endif FPC_SYSTEM_HAS_FILLCHAR}
{$ifndef FPC_SYSTEM_HAS_INDEXBYTE}
{ based on libc/sysdeps/x86_64/memchr.S }
{$define FPC_SYSTEM_HAS_INDEXBYTE}
{ Returns the zero-based offset of the first byte equal to b within the first
  len bytes of buf, or -1 if there is no such byte (including len=0).
  len is compared as an unsigned quantity, so len=-1 means "search until found".

  The buffer is scanned 16 bytes at a time with aligned SSE2 loads, starting at
  the 16-byte boundary at or below buf. All offsets are therefore computed
  relative to that boundary and converted back to buf-relative at the very end. }
function IndexByte(Const buf;len:SizeInt;b:byte):SizeInt; assembler; nostackframe;
{ win64: rcx buf, rdx len, r8b b
  linux: rdi buf, rsi len, dl b }
asm
{$ifdef win64}
    movd       %r8d, %xmm1
{$else}
    movd       %edx, %xmm1
    movq       %rdi, %rcx
    movq       %rsi, %rdx
{$endif}
    mov        %rcx, %rax              { duplicate buf }
    punpcklbw  %xmm1, %xmm1            { b -> low word }
    and        $0xfffffffffffffff0, %rax { rax = buf rounded down to 16 bytes }
    test       %rdx, %rdx
    punpcklbw  %xmm1, %xmm1            { b -> low dword (does not touch flags) }
    jz         .L3                     { exit if len=0 }
    orq        $-1, %r8                { r8 = -1 in all 64 bits; a 32-bit 'orl' would }
                                       { zero-extend and leave only $ffffffff here }
    movdqa     (%rax), %xmm0           { Fetch first 16 bytes (up to 15 bytes before target) }
    pshufd     $0, %xmm1, %xmm1        { broadcast b to all 16 bytes of xmm1 }
    sub        %rax, %rcx              { rcx=misalignment }
    pcmpeqb    %xmm1, %xmm0
    add        %rcx, %rdx              { add misalignment to length }
    cmovb      %r8, %rdx               { if it overflows (happens when length=-1), set back to -1, }
                                       { otherwise loop will terminate too early }
    mov        %rcx, %r9               { and save it, will subtract back in the end }
    shl        %cl, %r8d               { r8d = mask with the low 'misalignment' bits cleared }
    pmovmskb   %xmm0, %ecx
    andl       %r8d, %ecx              { mask away matches before buffer start }
    movl       $16, %r8d               { r8 = bytes examined so far; mov keeps the flags of 'and' }
    jnz        .L1                     { got a match within buffer -> we're done (almost) }
    cmpq       %r8, %rdx
    jbe        .L3                     { whole (adjusted) length was inside the first chunk }
    .balign 16
.L2:
    movdqa     (%rax,%r8), %xmm0
    lea        16(%r8), %r8            { lea: advance without touching flags }
    pcmpeqb    %xmm1, %xmm0
    pmovmskb   %xmm0, %ecx
    test       %ecx, %ecx
    jnz        .L1
    cmp        %r8, %rdx
    ja         .L2
.L3:
    or         $-1, %rax               { not found }
    jmp        .Ldone
.L1:
    bsfl       %ecx, %ecx              { compute position of the first match }
    lea        -16(%rcx,%r8), %rax     { offset of the match from the aligned base }
    cmp        %rax, %rdx
    jbe        .L3                     { if it is after the specified length, ignore it }
    sub        %r9, %rax               { make the offset relative to buf again }
.Ldone:
end;
{$endif FPC_SYSTEM_HAS_INDEXBYTE}
{$ifndef FPC_SYSTEM_HAS_INDEXWORD}
{$define FPC_SYSTEM_HAS_INDEXWORD}
{ Returns the zero-based index (in words) of the first 16-bit word equal to b
  within the first len words of buf, or -1 if there is no such word (including
  len=0). len is compared as an unsigned quantity, so len=-1 means "search
  until found".

  Word-aligned buffers are scanned with aligned 16-byte SSE2 loads starting at
  the 16-byte boundary at or below buf; odd addresses take a simpler loop that
  uses unaligned loads starting exactly at buf. }
function IndexWord(Const buf;len:SizeInt;b:word):SizeInt; assembler; nostackframe;
{ win64: rcx buf, rdx len, r8w b
  linux: rdi buf, rsi len, dx b }
asm
{$ifdef win64}
    movd       %r8d, %xmm1
{$else}
    movd       %edx, %xmm1
    movq       %rdi, %rcx
    movq       %rsi, %rdx
{$endif}
    mov        %rcx, %rax              { duplicate buf }
    punpcklwd  %xmm1, %xmm1            { b -> low dword }
    and        $0xfffffffffffffff0, %rax { rax = buf rounded down to 16 bytes }
    test       %rdx, %rdx
    pshufd     $0, %xmm1, %xmm1        { broadcast b to all 8 words (does not touch flags) }
    jz         .L3                     { exit if len=0 }
    orq        $-1, %r8                { r8 = -1 in all 64 bits; a 32-bit 'orl' would }
                                       { zero-extend and leave only $ffffffff here }
    test       $1, %cl                 { if buffer isn't aligned to word boundary, }
    jnz        .Lunaligned             { fallback to slower unaligned loop }
    movdqa     (%rax), %xmm0           { Fetch first 16 bytes (up to 14 bytes before target) }
    sub        %rax, %rcx              { rcx=misalignment }
    pcmpeqw    %xmm1, %xmm0
    mov        %rcx, %r9
    shr        $1, %r9                 { save misalignment in words }
    add        %r9, %rdx               { add misalignment to length }
    cmovb      %r8, %rdx               { if it overflows (happens when length=-1), set back to -1, }
                                       { otherwise loop will terminate too early }
    shl        %cl, %r8d               { r8d = byte mask with the low 'misalignment' bits cleared }
    pmovmskb   %xmm0, %ecx             { two mask bits per matching word }
    andl       %r8d, %ecx              { mask away matches before buffer start }
    movl       $8, %r8d                { r8 = words examined so far; mov keeps the flags of 'and' }
    jnz        .L1                     { got a match within buffer -> we're done (almost) }
    cmpq       %r8, %rdx
    jbe        .L3                     { whole (adjusted) length was inside the first chunk }
    .balign 16
.L2:
    movdqa     (%rax,%r8,2), %xmm0
    lea        8(%r8), %r8             { lea: advance without touching flags }
    pcmpeqw    %xmm1, %xmm0
    pmovmskb   %xmm0, %ecx
    test       %ecx, %ecx
    jnz        .L1
    cmp        %r8, %rdx
    ja         .L2
.L3:
    or         $-1, %rax               { not found }
    jmp        .Ldone
.L1:
    bsfl       %ecx, %ecx              { compute position of the first match }
    shr        $1, %ecx                { in words }
    lea        -8(%rcx,%r8), %rax      { word index of the match from the scan base }
    cmp        %rax, %rdx
    jbe        .L3                     { if it is after the specified length, ignore it }
    sub        %r9, %rax               { make the index relative to buf again }
.Ldone:
    retq
{ TODO: aligned processing is still possible, but for now
  use the simplest form }
{ NOTE(review): each movdqu below reads a full 16 bytes starting at buf, so it
  can read up to 14 bytes past the last requested word. Unlike the aligned
  loads above, such a read may straddle a page boundary; confirm that callers
  never pass an odd-addressed buffer ending right before an unmapped page. }
.Lunaligned:
    xor        %r9, %r9                { no misalignment correction on this path }
    xor        %r8, %r8                { r8 = words examined so far }
    mov        %rcx, %rax              { scan base is buf itself }
    .balign 16
.L2u:
    movdqu     (%rax,%r8,2), %xmm0
    lea        8(%r8), %r8
    pcmpeqw    %xmm1, %xmm0
    pmovmskb   %xmm0, %ecx
    test       %ecx, %ecx
    jnz        .L1
    cmp        %r8, %rdx
    ja         .L2u
    or         $-1, %rax               { not found }
end;
{$endif FPC_SYSTEM_HAS_INDEXWORD}
{$asmmode att}
{$define FPC_SYSTEM_HAS_DECLOCKED_LONGINT}
{ does a thread safe inc/dec }
function declocked(var l : longint) : boolean;assembler;

View File

@ -46,12 +46,29 @@ begin
writeln('indexbyte error 2 for (',i,',',j,',',k,')');
halt(2);
end;
{same for length=-1}
if indexbyte(b[k+4],-1,0)<>index then
begin
writeln(indexbyte(b[k+4],-1,0),' <> ',index);
writeln('indexbyte error 2a for (',i,',',j,',',k,')');
halt(22);
end;
if indexbyte(b[k+4],i,b[k+4+i-1])<>i-1 then
begin
writeln('indexbyte error 3 for (',i,',',j,',',k,')');
halt(3);
end;
{same for length=-1}
if i<>0 then // previous test will be no-op when i=0
if indexbyte(b[k+4],-1,b[k+4+i-1])<>i-1 then
begin
writeln('indexbyte error 3a for (',i,',',j,',',k,')');
halt(23);
end;
if (i<1) then
index:=-1
else
@ -62,6 +79,16 @@ begin
writeln('indexbyte error 4 for (',i,',',j,',',k,')');
halt(4);
end;
{same for length=-1}
if i<>0 then // previous test will be no-op when i=0
if indexbyte(b[k+4],-1,b[k+4+i shr 1])<>index then
begin
writeln(indexbyte(b[k+4],-1,b[k+4+i shr 1]),' <> ',index);
writeln('indexbyte error 4a for (',i,',',j,',',k,')');
halt(24);
end;
if (i=0) then
index:=-1
else
@ -69,8 +96,15 @@ begin
if indexbyte(b[k+4],i,b[k+4])<>index then
begin
writeln('indexbyte error 5 for (',i,',',j,',',k,')');
halt(3);
halt(5);
end;
{same for length=-1}
if i<>0 then
if indexbyte(b[k+4],-1,b[k+4])<>index then
begin
writeln('indexbyte error 5a for (',i,',',j,',',k,')');
halt(25);
end;
if indexword(b[k+4],i shr 1,0)<>-1 then