+ x86_64 assembler implementations of Move and FillChar. They do not use SIMD, so they are probably not the fastest option for large move sizes, but for small to medium sizes they should be competitive.
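For orientation, here is a minimal reference sketch in plain Pascal of the semantics the assembler Move below has to preserve: overlapping regions are handled by choosing the copy direction, and a count of zero or less does nothing. This is not the RTL code; RefMove and its byte-wise loops are illustrative only.

{$POINTERMATH ON}
procedure RefMove(const source; var dest; count: SizeInt);
var
  s, d: PByte;
  i: SizeInt;
begin
  if count <= 0 then
    exit;                       { zero and negative counts are no-ops }
  s := PByte(@source);
  d := PByte(@dest);
  if (PtrUInt(d) > PtrUInt(s)) and (PtrUInt(d) < PtrUInt(s) + PtrUInt(count)) then
    { destination overlaps the tail of the source: copy backwards }
    for i := count - 1 downto 0 do
      d[i] := s[i]
  else
    { no overlap, or destination below source: copy forwards }
    for i := 0 to count - 1 do
      d[i] := s[i];
end;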

* Extended the related test with checks for medium and large move sizes, to improve coverage of the different code paths that are used depending on size.
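A hypothetical sketch of the idea (not part of the committed test; SweepMoveSizes and the buffer size are made up): doubling the move count from one byte up through tens of kilobytes touches the small, medium and large code paths in turn.

procedure SweepMoveSizes;
var
  buf: array[0..2*65536-1] of byte;
  n, i: SizeInt;
begin
  n := 1;
  while n <= 65536 do
  begin
    for i := 0 to n - 1 do
      buf[i] := byte(i);
    { non-overlapping forward move of n bytes }
    Move(buf[0], buf[n], n);
    for i := 0 to n - 1 do
      if buf[n + i] <> byte(i) then
        halt(1);
    n := n * 2;
  end;
end;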

git-svn-id: trunk@17249 -
sergei 2011-04-05 09:53:54 +00:00
parent da7ef036ba
commit 6e09d76b07
2 changed files with 428 additions and 156 deletions


@@ -72,181 +72,387 @@ asm
.Lg_a_null:
end ['RAX'];
(*
{$define FPC_SYSTEM_HAS_MOVE}
procedure Move(const source;var dest;count:longint);[public, alias: 'FPC_MOVE'];assembler;
procedure Move(const source;var dest;count:SizeInt);[public, alias: 'FPC_MOVE'];assembler;nostackframe;
{ Linux: rdi source, rsi dest, rdx count
win64: rcx source, rdx dest, r8 count }
asm
{ rdi destination
rsi source
rdx count
}
pushq %rbx
prefetcht0 (%rsi) // for more than this, hopefully the hw prefetcher will kick in
movq %rdi,%rax
{$ifndef win64}
mov %rdx, %r8
mov %rsi, %rdx
mov %rdi, %rcx
{$endif win64}
movl %edi,%ecx
andl $7,%ecx
jnz .Lbad_alignment
.Lafter_bad_alignment:
movq %rdx,%rcx
movl $64,%ebx
shrq $6,%rcx
jz .Lhandle_tail
mov %r8, %rax
sub %rdx, %rcx { rcx = src - dest }
jz .Lquit { exit if src=dest }
jnb .L1 { src>dest => forward move }
.Lloop_64:
{ no prefetch because we assume the hw prefetcher does it already
and we have no specific temporal hint to give. XXX or give a nta
hint for the source? }
movq (%rsi),%r11
movq 8(%rsi),%r8
movq 2*8(%rsi),%r9
movq 3*8(%rsi),%r10
movnti %r11,(%rdi)
movnti %r8,1*8(%rdi)
movnti %r9,2*8(%rdi)
movnti %r10,3*8(%rdi)
add %rcx, %rax { rcx is negative => r8+rcx > 0 if regions overlap }
jb .Lback { if no overlap, still do forward move }
movq 4*8(%rsi),%r11
movq 5*8(%rsi),%r8
movq 6*8(%rsi),%r9
movq 7*8(%rsi),%r10
movnti %r11,4*8(%rdi)
movnti %r8,5*8(%rdi)
movnti %r9,6*8(%rdi)
movnti %r10,7*8(%rdi)
.L1:
cmp $8, %r8
jl .Lless8f { signed compare, negative count not allowed }
test $7, %dl
je .Ldestaligned
addq %rbx,%rsi
addq %rbx,%rdi
loop .Lloop_64
test $1, %dl { align dest by moving first 1+2+4 bytes }
je .L2f
mov (%rcx,%rdx,1),%al
dec %r8
mov %al, (%rdx)
add $1, %rdx
.L2f:
test $2, %dl
je .L4f
mov (%rcx,%rdx,1),%ax
sub $2, %r8
mov %ax, (%rdx)
add $2, %rdx
.L4f:
test $4, %dl
je .Ldestaligned
mov (%rcx,%rdx,1),%eax
sub $4, %r8
mov %eax, (%rdx)
add $4, %rdx
.Lhandle_tail:
movl %edx,%ecx
andl $63,%ecx
shrl $3,%ecx
jz .Lhandle_7
movl $8,%ebx
.Lloop_8:
movq (%rsi),%r8
movnti %r8,(%rdi)
addq %rbx,%rdi
addq %rbx,%rsi
loop .Lloop_8
.Ldestaligned:
mov %r8, %r9
shr $5, %r9
jne .Lmore32
.Lhandle_7:
movl %edx,%ecx
andl $7,%ecx
jz .Lende
.Lloop_1:
movb (%rsi),%r8b
movb %r8b,(%rdi)
incq %rdi
incq %rsi
loop .Lloop_1
.Ltail:
mov %r8, %r9
shr $3, %r9
je .Lless8f
jmp .Lende
.balign 16
.Lloop8f: { max. 8 iterations }
mov (%rcx,%rdx,1),%rax
mov %rax, (%rdx)
add $8, %rdx
dec %r9
jne .Lloop8f
and $7, %r8
{ align destination }
{ This is simpleminded. For bigger blocks it may make sense to align
src and dst to their aligned subset and handle the rest separately }
.Lbad_alignment:
movl $8,%r9d
subl %ecx,%r9d
movl %r9d,%ecx
subq %r9,%rdx
js .Lsmall_alignment
jz .Lsmall_alignment
.Lalign_1:
movb (%rsi),%r8b
movb %r8b,(%rdi)
incq %rdi
incq %rsi
loop .Lalign_1
jmp .Lafter_bad_alignment
.Lsmall_alignment:
addq %r9,%rdx
jmp .Lhandle_7
.Lless8f:
test %r8, %r8
jle .Lquit
.Lende:
sfence
popq %rbx
.balign 16
.Lloop1f:
mov (%rcx,%rdx,1),%al
mov %al,(%rdx)
inc %rdx
dec %r8
jne .Lloop1f
.Lquit:
retq
.Lmore32:
cmp $0x2000, %r9 { this limit must be processor-specific (1/2 L2 cache size) }
jnae .Lloop32
cmp $0x1000, %rcx { but don't bother bypassing cache if src and dest }
jnb .Lntloopf { are close to each other}
.balign 16
.Lloop32:
add $32,%rdx
mov -32(%rcx,%rdx,1),%rax
mov -24(%rcx,%rdx,1),%r10
mov %rax,-32(%rdx)
mov %r10,-24(%rdx)
dec %r9
mov -16(%rcx,%rdx,1),%rax
mov -8(%rcx,%rdx,1),%r10
mov %rax,-16(%rdx)
mov %r10,-8(%rdx)
jne .Lloop32
and $0x1f, %r8
jmpq .Ltail
.Lntloopf:
mov $32, %eax
.balign 16
.Lpref:
prefetchnta (%rcx,%rdx,1)
prefetchnta 0x40(%rcx,%rdx,1)
add $0x80, %rdx
dec %eax
jne .Lpref
sub $0x1000, %rdx
mov $64, %eax
.balign 16
.Loop64:
add $64, %rdx
mov -64(%rcx,%rdx,1), %r9
mov -56(%rcx,%rdx,1), %r10
movnti %r9, -64(%rdx)
movnti %r10, -56(%rdx)
mov -48(%rcx,%rdx,1), %r9
mov -40(%rcx,%rdx,1), %r10
movnti %r9, -48(%rdx)
movnti %r10, -40(%rdx)
dec %eax
mov -32(%rcx,%rdx,1), %r9
mov -24(%rcx,%rdx,1), %r10
movnti %r9, -32(%rdx)
movnti %r10, -24(%rdx)
mov -16(%rcx,%rdx,1), %r9
mov -8(%rcx,%rdx,1), %r10
movnti %r9, -16(%rdx)
movnti %r10, -8(%rdx)
jne .Loop64
sub $0x1000, %r8
cmp $0x1000, %r8
jae .Lntloopf
mfence
jmpq .Ldestaligned { go handle remaining bytes }
{ backwards move }
.Lback:
add %r8, %rdx { points to the end of dest }
cmp $8, %r8
jl .Lless8b { signed compare, negative count not allowed }
test $7, %dl
je .Ldestalignedb
test $1, %dl
je .L2b
dec %rdx
mov (%rcx,%rdx,1), %al
dec %r8
mov %al, (%rdx)
.L2b:
test $2, %dl
je .L4b
sub $2, %rdx
mov (%rcx,%rdx,1), %ax
sub $2, %r8
mov %ax, (%rdx)
.L4b:
test $4, %dl
je .Ldestalignedb
sub $4, %rdx
mov (%rcx,%rdx,1), %eax
sub $4, %r8
mov %eax, (%rdx)
.Ldestalignedb:
mov %r8, %r9
shr $5, %r9
jne .Lmore32b
.Ltailb:
mov %r8, %r9
shr $3, %r9
je .Lless8b
.Lloop8b:
sub $8, %rdx
mov (%rcx,%rdx,1), %rax
dec %r9
mov %rax, (%rdx)
jne .Lloop8b
and $7, %r8
.Lless8b:
test %r8, %r8
jle .Lquit2
.balign 16
.Lsmallb:
dec %rdx
mov (%rcx,%rdx,1), %al
dec %r8
mov %al,(%rdx)
jnz .Lsmallb
.Lquit2:
retq
.Lmore32b:
cmp $0x2000, %r9
jnae .Lloop32b
cmp $0xfffffffffffff000,%rcx
jb .Lntloopb
.balign 16
.Lloop32b:
sub $32, %rdx
mov 24(%rcx,%rdx,1), %rax
mov 16(%rcx,%rdx,1), %r10
mov %rax, 24(%rdx)
mov %r10, 16(%rdx)
dec %r9
mov 8(%rcx,%rdx,1),%rax
mov (%rcx,%rdx,1), %r10
mov %rax, 8(%rdx)
mov %r10, (%rdx)
jne .Lloop32b
and $0x1f, %r8
jmpq .Ltailb
.Lntloopb:
mov $32, %eax
.balign 16
.Lprefb:
sub $0x80, %rdx
prefetchnta (%rcx,%rdx,1)
prefetchnta 0x40(%rcx,%rdx,1)
dec %eax
jnz .Lprefb
add $0x1000, %rdx
mov $0x40, %eax
.balign 16
.Lloop64b:
sub $64, %rdx
mov 56(%rcx,%rdx,1), %r9
mov 48(%rcx,%rdx,1), %r10
movnti %r9, 56(%rdx)
movnti %r10, 48(%rdx)
mov 40(%rcx,%rdx,1), %r9
mov 32(%rcx,%rdx,1), %r10
movnti %r9, 40(%rdx)
movnti %r10, 32(%rdx)
dec %eax
mov 24(%rcx,%rdx,1), %r9
mov 16(%rcx,%rdx,1), %r10
movnti %r9, 24(%rdx)
movnti %r10, 16(%rdx)
mov 8(%rcx,%rdx,1), %r9
mov (%rcx,%rdx,1), %r10
movnti %r9, 8(%rdx)
movnti %r10, (%rdx)
jne .Lloop64b
sub $0x1000, %r8
cmp $0x1000, %r8
jae .Lntloopb
mfence
jmpq .Ldestalignedb
end;
*)
(*
{$define FPC_SYSTEM_HAS_FILLCHAR}
Procedure FillChar(var x;count:longint;value:byte);assembler;
Procedure FillChar(var x;count:SizeInt;value:byte);assembler;nostackframe;
asm
{ rdi destination
rsi value (char)
rdx count (bytes)
}
movq %rdi,%r10
movq %rdx,%r11
{ win64: rcx dest, rdx count, r8b value
linux: rdi dest, rsi count, rdx value }
{$ifndef win64}
mov %rdx, %r8
mov %rsi, %rdx
mov %rdi, %rcx
{$endif win64}
cmp $8, %rdx
jl .Ltiny
{ expand byte value }
movzbl %sil,%ecx
movabs $0x0101010101010101,%rax
mul %rcx { with rax, clobbers rdx }
movzbl %r8b, %r8
mov $0x0101010101010101,%r9
imul %r9, %r8
{ align dst }
movl %edi,%r9d
andl $7,%r9d
jnz .Lbad_alignment
.Lafter_bad_alignment:
test $7, %cl
je .Laligned
movq %r11,%rcx
movl $64,%r8d
shrq $6,%rcx
jz .Lhandle_tail
{ align dest to 8 bytes }
test $1, %cl
je .L2
movb %r8b, (%rcx)
add $1, %rcx
sub $1, %rdx
.L2:
test $2, %cl
je .L4
movw %r8w, (%rcx)
add $2, %rcx
sub $2, %rdx
.L4:
test $4, %cl
je .Laligned
movl %r8d, (%rcx)
add $4, %rcx
sub $4, %rdx
.Lloop_64:
movnti %rax,(%rdi)
movnti %rax,8(%rdi)
movnti %rax,16(%rdi)
movnti %rax,24(%rdi)
movnti %rax,32(%rdi)
movnti %rax,40(%rdi)
movnti %rax,48(%rdi)
movnti %rax,56(%rdi)
addq %r8,%rdi
loop .Lloop_64
.Laligned:
mov %rdx, %rax
and $0x3f, %rdx
shr $6, %rax
jne .Lmore64
{ Handle tail in loops. The loops should be faster than hard
to predict jump tables. }
.Lhandle_tail:
movl %r11d,%ecx
andl $56,%ecx
jz .Lhandle_7
shrl $3,%ecx
.Lloop_8:
movnti %rax,(%rdi)
addq $8,%rdi
loop .Lloop_8
.Lhandle_7:
movl %r11d,%ecx
andl $7,%ecx
jz .Lende
.Lloop_1:
movb %al,(%rdi)
addq $1,%rdi
loop .Lloop_1
.Lless64:
mov %rdx, %rax
and $7, %rdx
shr $3, %rax
je .Ltiny
jmp .Lende
.balign 16
.Lloop8: { max. 8 iterations }
mov %r8, (%rcx)
add $8, %rcx
dec %rax
jne .Lloop8
.Ltiny:
test %rdx, %rdx
jle .Lquit
.Lloop1:
movb %r8b, (%rcx)
inc %rcx
dec %rdx
jnz .Lloop1
.Lquit:
retq
.Lbad_alignment:
cmpq $7,%r11
jbe .Lhandle_7
movnti %rax,(%rdi) (* unaligned store *)
movq $8,%r8
subq %r9,%r8
addq %r8,%rdi
subq %r8,%r11
jmp .Lafter_bad_alignment
.Lmore64:
cmp $0x2000,%rax
jae .Lloop64nti
.Lende:
movq %r10,%rax
.balign 16
.Lloop64:
add $64, %rcx
mov %r8, -64(%rcx)
mov %r8, -56(%rcx)
mov %r8, -48(%rcx)
mov %r8, -40(%rcx)
dec %rax
mov %r8, -32(%rcx)
mov %r8, -24(%rcx)
mov %r8, -16(%rcx)
mov %r8, -8(%rcx)
jne .Lloop64
jmp .Lless64
.balign 16
.Lloop64nti:
add $64, %rcx
movnti %r8, -64(%rcx)
movnti %r8, -56(%rcx)
movnti %r8, -48(%rcx)
movnti %r8, -40(%rcx)
dec %rax
movnti %r8, -32(%rcx)
movnti %r8, -24(%rcx)
movnti %r8, -16(%rcx)
movnti %r8, -8(%rcx)
jnz .Lloop64nti
mfence
jmp .Lless64
end;
*)
{$define FPC_SYSTEM_HAS_DECLOCKED_LONGINT}
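A side note on the FillChar listing above: the fill byte is expanded into a full 64-bit pattern (the movzbl plus imul with $0101010101010101 sequence) so that the loops can store eight bytes at a time. The same expansion written in Pascal, as an illustrative sketch (ExpandFillByte is a made-up name):

function ExpandFillByte(value: byte): qword;
begin
  { multiplying by $0101010101010101 replicates the byte into all eight
    byte lanes; no carries occur because value <= 255 }
  ExpandFillByte := qword(value) * qword($0101010101010101);
end;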


@@ -16,9 +16,7 @@ var
dst_arraybyte : array[1..MAX_TABLE] of byte;
src_arraybyte : array[1..MAX_TABLE] of byte;
dst_arrayword : array[1..MAX_TABLE] of word;
src_arrayword : array[1..MAX_TABLE] of word;
dst_arraylongword : array[1..MAX_TABLE] of longword;
src_arratlongword : array[1..MAX_TABLE] of longword;
i: integer;
@@ -70,6 +68,8 @@ procedure test_fillchar;
for i := 1 to MAX_TABLE do
dst_arraybyte[i] := DEFAULT_VALUE;
fillchar(dst_arraybyte, -1, FILL_VALUE);
for i := 1 to MAX_TABLE do
test(dst_arraybyte[i], DEFAULT_VALUE);
writeln('Passed!');
end;
@@ -103,7 +103,7 @@ begin
test(dst_arraybyte[i], FILL_VALUE);
writeln('Passed!');
{ zero move count }
write('test move (zero count)...');
write('testing move (zero count)...');
for i := 1 to MAX_TABLE do
begin
dst_arraybyte[i] := DEFAULT_VALUE;
@@ -114,11 +114,75 @@ begin
test(dst_arraybyte[i], DEFAULT_VALUE);
writeln('Passed!');
{ negative move count }
write('test move (negative count)...');
write('testing move (negative count)...');
move(src_arraybyte,dst_arraybyte,-12);
writeln('Passed!');
end;
procedure test_move_large(size: longint);
var
src, dst: PLongInt;
i: LongInt;
begin
GetMem(src, size*sizeof(LongInt));
GetMem(dst, size*sizeof(LongInt));
write('testing move of ',size,' dwords ...');
for i := 0 to size-1 do
begin
src[i] := i;
dst[i] := -1;
end;
move(src[0], dst[2], (size-4)*sizeof(LongInt));
test(dst[0], -1);
test(dst[1], -1);
test(dst[size-1], -1);
test(dst[size-2], -1);
for i := 2 to size-3 do
test(dst[i], i-2);
writeln('Passed!');
// repeat with source and dest swapped (maybe move in opposite direction)
// current implementations detect that regions don't overlap and move forward,
// so this test is mostly redundant, but it does no harm anyway.
write('testing move of ',size,' dwords, opposite direction...');
for i := 0 to size-1 do
begin
dst[i] := i;
src[i] := -1;
end;
move(dst[0], src[2], (size-4)*sizeof(LongInt));
test(src[0], -1);
test(src[1], -1);
test(src[size-1], -1);
test(src[size-2], -1);
for i := 2 to size-3 do
test(src[i], i-2);
writeln('Passed!');
write('testing move of ',size,' dwords, overlapping forward...');
for i := 0 to size-1 do
src[i] := i;
move(src[0], src[100], (size-100)*sizeof(LongInt));
for i := 0 to 99 do
test(src[i], i);
for i := 100 to size-101 do
test(src[i], i-100);
writeln('Passed!');
write('testing move of ',size,' dwords, overlapping backward...');
for i := 0 to size-1 do
src[i] := i;
move(src[100], src[0], (size-100)*sizeof(LongInt));
for i := 0 to size-101 do
test(src[i], i+100);
for i := size-100 to size-1 do
test(src[i], i);
writeln('Passed!');
FreeMem(dst);
FreeMem(src);
end;
{$ifdef fpc}
procedure test_fillword;
var
@@ -271,6 +335,8 @@ end;
begin
test_fillchar;
test_move;
test_move_large(500); // 500 longints = 2000 bytes
test_move_large(500000);
{$ifdef fpc}
test_fillword;
test_filldword;