+ x86_64 assembler implementations of Move and FillChar. Does not use SIMD, so probably not the fastest for large move sizes, but for small to medium sizes it should be competitive.

* Extended the related test with checks for medium and large move sizes, to improve coverage for different code paths that are used depending on size.

git-svn-id: trunk@17249 -
This commit is contained in:
sergei 2011-04-05 09:53:54 +00:00
parent da7ef036ba
commit 6e09d76b07
2 changed files with 428 additions and 156 deletions

View File

@ -72,181 +72,387 @@ asm
.Lg_a_null: .Lg_a_null:
end ['RAX']; end ['RAX'];
(*
{$define FPC_SYSTEM_HAS_MOVE} {$define FPC_SYSTEM_HAS_MOVE}
procedure Move(const source;var dest;count:longint);[public, alias: 'FPC_MOVE'];assembler; procedure Move(const source;var dest;count:SizeInt);[public, alias: 'FPC_MOVE'];assembler;nostackframe;
asm { Linux: rdi source, rsi dest, rdx count
{ rdi destination win64: rcx source, rdx dest, r8 count }
rsi source asm
rdx count {$ifndef win64}
} mov %rdx, %r8
pushq %rbx mov %rsi, %rdx
prefetcht0 (%rsi) // for more hopefully the hw prefetch will kick in mov %rdi, %rcx
movq %rdi,%rax {$endif win64}
movl %edi,%ecx mov %r8, %rax
andl $7,%ecx sub %rdx, %rcx { rcx = src - dest }
jnz .Lbad_alignment jz .Lquit { exit if src=dest }
.Lafter_bad_alignment: jnb .L1 { src>dest => forward move }
movq %rdx,%rcx
movl $64,%ebx
shrq $6,%rcx
jz .Lhandle_tail
.Lloop_64: add %rcx, %rax { rcx is negative => r8+rcx > 0 if regions overlap }
{ no prefetch because we assume the hw prefetcher does it already jb .Lback { if no overlap, still do forward move }
and we have no specific temporal hint to give. XXX or give a nta
hint for the source? }
movq (%rsi),%r11
movq 8(%rsi),%r8
movq 2*8(%rsi),%r9
movq 3*8(%rsi),%r10
movnti %r11,(%rdi)
movnti %r8,1*8(%rdi)
movnti %r9,2*8(%rdi)
movnti %r10,3*8(%rdi)
movq 4*8(%rsi),%r11 .L1:
movq 5*8(%rsi),%r8 cmp $8, %r8
movq 6*8(%rsi),%r9 jl .Lless8f { signed compare, negative count not allowed }
movq 7*8(%rsi),%r10 test $7, %dl
movnti %r11,4*8(%rdi) je .Ldestaligned
movnti %r8,5*8(%rdi)
movnti %r9,6*8(%rdi)
movnti %r10,7*8(%rdi)
addq %rbx,%rsi test $1, %dl { align dest by moving first 1+2+4 bytes }
addq %rbx,%rdi je .L2f
loop .Lloop_64 mov (%rcx,%rdx,1),%al
dec %r8
mov %al, (%rdx)
add $1, %rdx
.L2f:
test $2, %dl
je .L4f
mov (%rcx,%rdx,1),%ax
sub $2, %r8
mov %ax, (%rdx)
add $2, %rdx
.L4f:
test $4, %dl
je .Ldestaligned
mov (%rcx,%rdx,1),%eax
sub $4, %r8
mov %eax, (%rdx)
add $4, %rdx
.Lhandle_tail: .Ldestaligned:
movl %edx,%ecx mov %r8, %r9
andl $63,%ecx shr $5, %r9
shrl $3,%ecx jne .Lmore32
jz .Lhandle_7
movl $8,%ebx
.Lloop_8:
movq (%rsi),%r8
movnti %r8,(%rdi)
addq %rbx,%rdi
addq %rbx,%rsi
loop .Lloop_8
.Lhandle_7: .Ltail:
movl %edx,%ecx mov %r8, %r9
andl $7,%ecx shr $3, %r9
jz .Lende je .Lless8f
.Lloop_1:
movb (%rsi),%r8b
movb %r8b,(%rdi)
incq %rdi
incq %rsi
loop .Lloop_1
jmp .Lende .balign 16
.Lloop8f: { max. 8 iterations }
mov (%rcx,%rdx,1),%rax
mov %rax, (%rdx)
add $8, %rdx
dec %r9
jne .Lloop8f
and $7, %r8
{ align destination } .Lless8f:
{ This is simpleminded. For bigger blocks it may make sense to align test %r8, %r8
src and dst to their aligned subset and handle the rest separately } jle .Lquit
.Lbad_alignment:
movl $8,%r9d
subl %ecx,%r9d
movl %r9d,%ecx
subq %r9,%rdx
js .Lsmall_alignment
jz .Lsmall_alignment
.Lalign_1:
movb (%rsi),%r8b
movb %r8b,(%rdi)
incq %rdi
incq %rsi
loop .Lalign_1
jmp .Lafter_bad_alignment
.Lsmall_alignment:
addq %r9,%rdx
jmp .Lhandle_7
.Lende: .balign 16
sfence .Lloop1f:
popq %rbx mov (%rcx,%rdx,1),%al
end; mov %al,(%rdx)
*) inc %rdx
dec %r8
jne .Lloop1f
.Lquit:
retq
.Lmore32:
cmp $0x2000, %r9 { this limit must be processor-specific (1/2 L2 cache size) }
jnae .Lloop32
cmp $0x1000, %rcx { but don't bother bypassing cache if src and dest }
jnb .Lntloopf { are close to each other}
.balign 16
.Lloop32:
add $32,%rdx
mov -32(%rcx,%rdx,1),%rax
mov -24(%rcx,%rdx,1),%r10
mov %rax,-32(%rdx)
mov %r10,-24(%rdx)
dec %r9
mov -16(%rcx,%rdx,1),%rax
mov -8(%rcx,%rdx,1),%r10
mov %rax,-16(%rdx)
mov %r10,-8(%rdx)
jne .Lloop32
and $0x1f, %r8
jmpq .Ltail
.Lntloopf:
mov $32, %eax
.balign 16
.Lpref:
prefetchnta (%rcx,%rdx,1)
prefetchnta 0x40(%rcx,%rdx,1)
add $0x80, %rdx
dec %eax
jne .Lpref
sub $0x1000, %rdx
mov $64, %eax
.balign 16
.Loop64:
add $64, %rdx
mov -64(%rcx,%rdx,1), %r9
mov -56(%rcx,%rdx,1), %r10
movnti %r9, -64(%rdx)
movnti %r10, -56(%rdx)
mov -48(%rcx,%rdx,1), %r9
mov -40(%rcx,%rdx,1), %r10
movnti %r9, -48(%rdx)
movnti %r10, -40(%rdx)
dec %eax
mov -32(%rcx,%rdx,1), %r9
mov -24(%rcx,%rdx,1), %r10
movnti %r9, -32(%rdx)
movnti %r10, -24(%rdx)
mov -16(%rcx,%rdx,1), %r9
mov -8(%rcx,%rdx,1), %r10
movnti %r9, -16(%rdx)
movnti %r10, -8(%rdx)
jne .Loop64
sub $0x1000, %r8
cmp $0x1000, %r8
jae .Lntloopf
mfence
jmpq .Ldestaligned { go handle remaining bytes }
{ backwards move }
.Lback:
add %r8, %rdx { points to the end of dest }
cmp $8, %r8
jl .Lless8b { signed compare, negative count not allowed }
test $7, %dl
je .Ldestalignedb
test $1, %dl
je .L2b
dec %rdx
mov (%rcx,%rdx,1), %al
dec %r8
mov %al, (%rdx)
.L2b:
test $2, %dl
je .L4b
sub $2, %rdx
mov (%rcx,%rdx,1), %ax
sub $2, %r8
mov %ax, (%rdx)
.L4b:
test $4, %dl
je .Ldestalignedb
sub $4, %rdx
mov (%rcx,%rdx,1), %eax
sub $4, %r8
mov %eax, (%rdx)
.Ldestalignedb:
mov %r8, %r9
shr $5, %r9
jne .Lmore32b
.Ltailb:
mov %r8, %r9
shr $3, %r9
je .Lless8b
.Lloop8b:
sub $8, %rdx
mov (%rcx,%rdx,1), %rax
dec %r9
mov %rax, (%rdx)
jne .Lloop8b
and $7, %r8
.Lless8b:
test %r8, %r8
jle .Lquit2
.balign 16
.Lsmallb:
dec %rdx
mov (%rcx,%rdx,1), %al
dec %r8
mov %al,(%rdx)
jnz .Lsmallb
.Lquit2:
retq
.Lmore32b:
cmp $0x2000, %r9
jnae .Lloop32b
cmp $0xfffffffffffff000,%rcx
jb .Lntloopb
.balign 16
.Lloop32b:
sub $32, %rdx
mov 24(%rcx,%rdx,1), %rax
mov 16(%rcx,%rdx,1), %r10
mov %rax, 24(%rdx)
mov %r10, 16(%rdx)
dec %r9
mov 8(%rcx,%rdx,1),%rax
mov (%rcx,%rdx,1), %r10
mov %rax, 8(%rdx)
mov %r10, (%rdx)
jne .Lloop32b
and $0x1f, %r8
jmpq .Ltailb
.Lntloopb:
mov $32, %eax
.balign 16
.Lprefb:
sub $0x80, %rdx
prefetchnta (%rcx,%rdx,1)
prefetchnta 0x40(%rcx,%rdx,1)
dec %eax
jnz .Lprefb
add $0x1000, %rdx
mov $0x40, %eax
.balign 16
.Lloop64b:
sub $64, %rdx
mov 56(%rcx,%rdx,1), %r9
mov 48(%rcx,%rdx,1), %r10
movnti %r9, 56(%rdx)
movnti %r10, 48(%rdx)
mov 40(%rcx,%rdx,1), %r9
mov 32(%rcx,%rdx,1), %r10
movnti %r9, 40(%rdx)
movnti %r10, 32(%rdx)
dec %eax
mov 24(%rcx,%rdx,1), %r9
mov 16(%rcx,%rdx,1), %r10
movnti %r9, 24(%rdx)
movnti %r10, 16(%rdx)
mov 8(%rcx,%rdx,1), %r9
mov (%rcx,%rdx,1), %r10
movnti %r9, 8(%rdx)
movnti %r10, (%rdx)
jne .Lloop64b
sub $0x1000, %r8
cmp $0x1000, %r8
jae .Lntloopb
mfence
jmpq .Ldestalignedb
end;
(*
{$define FPC_SYSTEM_HAS_FILLCHAR} {$define FPC_SYSTEM_HAS_FILLCHAR}
Procedure FillChar(var x;count:longint;value:byte);assembler; Procedure FillChar(var x;count:SizeInt;value:byte);assembler;nostackframe;
asm asm
{ rdi destination { win64: rcx dest, rdx count, r8b value
rsi value (char) linux: rdi dest, rsi count, rdx value }
rdx count (bytes) {$ifndef win64}
} mov %rdx, %r8
movq %rdi,%r10 mov %rsi, %rdx
movq %rdx,%r11 mov %rdi, %rcx
{$endif win64}
cmp $8, %rdx
jl .Ltiny
{ expand byte value } { expand byte value }
movzbl %sil,%ecx movzbl %r8b, %r8
movabs $0x0101010101010101,%rax mov $0x0101010101010101,%r9
mul %rcx { with rax, clobbers rdx } imul %r9, %r8
{ align dst } test $7, %cl
movl %edi,%r9d je .Laligned
andl $7,%r9d
jnz .Lbad_alignment
.Lafter_bad_alignment:
movq %r11,%rcx { align dest to 8 bytes }
movl $64,%r8d test $1, %cl
shrq $6,%rcx je .L2
jz .Lhandle_tail movb %r8b, (%rcx)
add $1, %rcx
sub $1, %rdx
.L2:
test $2, %cl
je .L4
movw %r8w, (%rcx)
add $2, %rcx
sub $2, %rdx
.L4:
test $4, %cl
je .Laligned
movl %r8d, (%rcx)
add $4, %rcx
sub $4, %rdx
.Lloop_64: .Laligned:
movnti %rax,(%rdi) mov %rdx, %rax
movnti %rax,8(%rdi) and $0x3f, %rdx
movnti %rax,16(%rdi) shr $6, %rax
movnti %rax,24(%rdi) jne .Lmore64
movnti %rax,32(%rdi)
movnti %rax,40(%rdi)
movnti %rax,48(%rdi)
movnti %rax,56(%rdi)
addq %r8,%rdi
loop .Lloop_64
{ Handle tail in loops. The loops should be faster than hard .Lless64:
to predict jump tables. } mov %rdx, %rax
.Lhandle_tail: and $7, %rdx
movl %r11d,%ecx shr $3, %rax
andl $56,%ecx je .Ltiny
jz .Lhandle_7
shrl $3,%ecx
.Lloop_8:
movnti %rax,(%rdi)
addq $8,%rdi
loop .Lloop_8
.Lhandle_7:
movl %r11d,%ecx
andl $7,%ecx
jz .Lende
.Lloop_1:
movb %al,(%rdi)
addq $1,%rdi
loop .Lloop_1
jmp .Lende .balign 16
.Lloop8: { max. 8 iterations }
mov %r8, (%rcx)
add $8, %rcx
dec %rax
jne .Lloop8
.Ltiny:
test %rdx, %rdx
jle .Lquit
.Lloop1:
movb %r8b, (%rcx)
inc %rcx
dec %rdx
jnz .Lloop1
.Lquit:
retq
.Lbad_alignment: .Lmore64:
cmpq $7,%r11 cmp $0x2000,%rax
jbe .Lhandle_7 jae .Lloop64nti
movnti %rax,(%rdi) (* unaligned store *)
movq $8,%r8
subq %r9,%r8
addq %r8,%rdi
subq %r8,%r11
jmp .Lafter_bad_alignment
.Lende: .balign 16
movq %r10,%rax .Lloop64:
add $64, %rcx
mov %r8, -64(%rcx)
mov %r8, -56(%rcx)
mov %r8, -48(%rcx)
mov %r8, -40(%rcx)
dec %rax
mov %r8, -32(%rcx)
mov %r8, -24(%rcx)
mov %r8, -16(%rcx)
mov %r8, -8(%rcx)
jne .Lloop64
jmp .Lless64
.balign 16
.Lloop64nti:
add $64, %rcx
movnti %r8, -64(%rcx)
movnti %r8, -56(%rcx)
movnti %r8, -48(%rcx)
movnti %r8, -40(%rcx)
dec %rax
movnti %r8, -32(%rcx)
movnti %r8, -24(%rcx)
movnti %r8, -16(%rcx)
movnti %r8, -8(%rcx)
jnz .Lloop64nti
mfence
jmp .Lless64
end; end;
*)
{$define FPC_SYSTEM_HAS_DECLOCKED_LONGINT} {$define FPC_SYSTEM_HAS_DECLOCKED_LONGINT}

View File

@ -16,9 +16,7 @@ var
dst_arraybyte : array[1..MAX_TABLE] of byte; dst_arraybyte : array[1..MAX_TABLE] of byte;
src_arraybyte : array[1..MAX_TABLE] of byte; src_arraybyte : array[1..MAX_TABLE] of byte;
dst_arrayword : array[1..MAX_TABLE] of word; dst_arrayword : array[1..MAX_TABLE] of word;
src_arrayword : array[1..MAX_TABLE] of word;
dst_arraylongword : array[1..MAX_TABLE] of longword; dst_arraylongword : array[1..MAX_TABLE] of longword;
src_arratlongword : array[1..MAX_TABLE] of longword;
i: integer; i: integer;
@ -70,6 +68,8 @@ procedure test_fillchar;
for i := 1 to MAX_TABLE do for i := 1 to MAX_TABLE do
dst_arraybyte[i] := DEFAULT_VALUE; dst_arraybyte[i] := DEFAULT_VALUE;
fillchar(dst_arraybyte, -1, FILL_VALUE); fillchar(dst_arraybyte, -1, FILL_VALUE);
for i := 1 to MAX_TABLE do
test(dst_arraybyte[i], DEFAULT_VALUE);
writeln('Passed!'); writeln('Passed!');
end; end;
@ -103,7 +103,7 @@ begin
test(dst_arraybyte[i], FILL_VALUE); test(dst_arraybyte[i], FILL_VALUE);
writeln('Passed!'); writeln('Passed!');
{ zero move count } { zero move count }
write('test move (zero count)...'); write('testing move (zero count)...');
for i := 1 to MAX_TABLE do for i := 1 to MAX_TABLE do
begin begin
dst_arraybyte[i] := DEFAULT_VALUE; dst_arraybyte[i] := DEFAULT_VALUE;
@ -114,11 +114,75 @@ begin
test(dst_arraybyte[i], DEFAULT_VALUE); test(dst_arraybyte[i], DEFAULT_VALUE);
writeln('Passed!'); writeln('Passed!');
{ negative move count } { negative move count }
write('test move (negative count)...'); write('testing move (negative count)...');
move(src_arraybyte,dst_arraybyte,-12); move(src_arraybyte,dst_arraybyte,-12);
writeln('Passed!'); writeln('Passed!');
end; end;
{ Exercises Move on two heap buffers of `size` LongInts each.

  Four cases are run in sequence:
    1. disjoint copy src -> dst,
    2. disjoint copy dst -> src,
    3. overlapping copy towards higher addresses (shift up by 100 elements),
    4. overlapping copy towards lower addresses (shift down by 100 elements).

  `size` must be at least 100, because the overlapping cases move
  (size-100) elements.  Every element that Move may have touched, plus the
  untouched guard elements, is checked with test(). }
procedure test_move_large(size: longint);
var
  src, dst: PLongInt;
  i: LongInt;
begin
  GetMem(src, size*sizeof(LongInt));
  GetMem(dst, size*sizeof(LongInt));

  { Case 1: disjoint buffers. Two elements on each side of the destination
    are left out of the move and must keep their -1 fill value. }
  write('testing move of ',size,' dwords ...');
  for i := 0 to size-1 do
  begin
    src[i] := i;
    dst[i] := -1;
  end;
  move(src[0], dst[2], (size-4)*sizeof(LongInt));
  test(dst[0], -1);
  test(dst[1], -1);
  test(dst[size-1], -1);
  test(dst[size-2], -1);
  for i := 2 to size-3 do
    test(dst[i], i-2);
  writeln('Passed!');

  // repeat with source and dest swapped (maybe move in opposite direction)
  // current implementations detect that regions don't overlap and move forward,
  // so this test is mostly useless. But it won't harm anyway.
  write('testing move of ',size,' dwords, opposite direction...');
  for i := 0 to size-1 do
  begin
    dst[i] := i;
    src[i] := -1;
  end;
  move(dst[0], src[2], (size-4)*sizeof(LongInt));
  test(src[0], -1);
  test(src[1], -1);
  test(src[size-1], -1);
  test(src[size-2], -1);
  for i := 2 to size-3 do
    test(src[i], i-2);
  writeln('Passed!');

  { Case 3: destination overlaps the source and lies 100 elements above it.
    src[0..size-101] is moved to src[100..size-1]. }
  write('testing move of ',size,' dwords, overlapping forward...');
  for i := 0 to size-1 do
    src[i] := i;
  move(src[0], src[100], (size-100)*sizeof(LongInt));
  { the first 100 elements are outside the destination and stay unchanged }
  for i := 0 to 99 do
    test(src[i], i);
  { verify the WHOLE destination range, including its last 100 elements;
    stopping at size-101 would leave the tail of the copy unchecked }
  for i := 100 to size-1 do
    test(src[i], i-100);
  writeln('Passed!');

  { Case 4: destination overlaps the source and lies 100 elements below it.
    src[100..size-1] is moved to src[0..size-101]. }
  write('testing move of ',size,' dwords, overlapping backward...');
  for i := 0 to size-1 do
    src[i] := i;
  move(src[100], src[0], (size-100)*sizeof(LongInt));
  for i := 0 to size-101 do
    test(src[i], i+100);
  { the last 100 elements are outside the destination and stay unchanged }
  for i := size-100 to size-1 do
    test(src[i], i);
  writeln('Passed!');

  FreeMem(dst);
  FreeMem(src);
end;
{$ifdef fpc} {$ifdef fpc}
procedure test_fillword; procedure test_fillword;
var var
@ -271,6 +335,8 @@ end;
begin begin
test_fillchar; test_fillchar;
test_move; test_move;
test_move_large(500); // 500 longints = 2000 bytes
test_move_large(500000);
{$ifdef fpc} {$ifdef fpc}
test_fillword; test_fillword;
test_filldword; test_filldword;