From 6e09d76b07bafb0f13eeec9e3eedb1040711369e Mon Sep 17 00:00:00 2001 From: sergei Date: Tue, 5 Apr 2011 09:53:54 +0000 Subject: [PATCH] + x86_64 assembler implementations of Move and FillChar. Does not use SIMD, so probably not the fastest for large move sizes, but for small to medium sizes it should be competitive. * Extended the related test with checks for medium and large move sizes, to improve coverage for different code paths that are used depending on size. git-svn-id: trunk@17249 - --- rtl/x86_64/x86_64.inc | 510 ++++++++++++++++++++++---------- tests/test/units/system/tmem.pp | 74 ++++- 2 files changed, 428 insertions(+), 156 deletions(-) diff --git a/rtl/x86_64/x86_64.inc b/rtl/x86_64/x86_64.inc index 58c86ee6a3..64717041bb 100644 --- a/rtl/x86_64/x86_64.inc +++ b/rtl/x86_64/x86_64.inc @@ -72,181 +72,387 @@ asm .Lg_a_null: end ['RAX']; -(* {$define FPC_SYSTEM_HAS_MOVE} -procedure Move(const source;var dest;count:longint);[public, alias: 'FPC_MOVE'];assembler; - asm - { rdi destination - rsi source - rdx count - } - pushq %rbx - prefetcht0 (%rsi) // for more hopefully the hw prefetch will kick in - movq %rdi,%rax +procedure Move(const source;var dest;count:SizeInt);[public, alias: 'FPC_MOVE'];assembler;nostackframe; +{ Linux: rdi source, rsi dest, rdx count + win64: rcx source, rdx dest, r8 count } +asm +{$ifndef win64} + mov %rdx, %r8 + mov %rsi, %rdx + mov %rdi, %rcx +{$endif win64} - movl %edi,%ecx - andl $7,%ecx - jnz .Lbad_alignment -.Lafter_bad_alignment: - movq %rdx,%rcx - movl $64,%ebx - shrq $6,%rcx - jz .Lhandle_tail + mov %r8, %rax + sub %rdx, %rcx { rcx = src - dest } + jz .Lquit { exit if src=dest } + jnb .L1 { src>dest => forward move } -.Lloop_64: - { no prefetch because we assume the hw prefetcher does it already - and we have no specific temporal hint to give. XXX or give a nta - hint for the source? } - movq (%rsi),%r11 - movq 8(%rsi),%r8 - movq 2*8(%rsi),%r9 - movq 3*8(%rsi),%r10 - movnti %r11,(%rdi) - movnti %r8,1*8(%rdi) - movnti %r9,2*8(%rdi) - movnti %r10,3*8(%rdi) + add %rcx, %rax { rcx is negative => r8+rcx > 0 if regions overlap } + jb .Lback { if no overlap, still do forward move } - movq 4*8(%rsi),%r11 - movq 5*8(%rsi),%r8 - movq 6*8(%rsi),%r9 - movq 7*8(%rsi),%r10 - movnti %r11,4*8(%rdi) - movnti %r8,5*8(%rdi) - movnti %r9,6*8(%rdi) - movnti %r10,7*8(%rdi) +.L1: + cmp $8, %r8 + jl .Lless8f { signed compare, negative count not allowed } + test $7, %dl + je .Ldestaligned - addq %rbx,%rsi - addq %rbx,%rdi - loop .Lloop_64 + test $1, %dl { align dest by moving first 1+2+4 bytes } + je .L2f + mov (%rcx,%rdx,1),%al + dec %r8 + mov %al, (%rdx) + add $1, %rdx +.L2f: + test $2, %dl + je .L4f + mov (%rcx,%rdx,1),%ax + sub $2, %r8 + mov %ax, (%rdx) + add $2, %rdx +.L4f: + test $4, %dl + je .Ldestaligned + mov (%rcx,%rdx,1),%eax + sub $4, %r8 + mov %eax, (%rdx) + add $4, %rdx -.Lhandle_tail: - movl %edx,%ecx - andl $63,%ecx - shrl $3,%ecx - jz .Lhandle_7 - movl $8,%ebx -.Lloop_8: - movq (%rsi),%r8 - movnti %r8,(%rdi) - addq %rbx,%rdi - addq %rbx,%rsi - loop .Lloop_8 +.Ldestaligned: + mov %r8, %r9 + shr $5, %r9 + jne .Lmore32 -.Lhandle_7: - movl %edx,%ecx - andl $7,%ecx - jz .Lende -.Lloop_1: - movb (%rsi),%r8b - movb %r8b,(%rdi) - incq %rdi - incq %rsi - loop .Lloop_1 +.Ltail: + mov %r8, %r9 + shr $3, %r9 + je .Lless8f - jmp .Lende + .balign 16 +.Lloop8f: { max. 8 iterations } + mov (%rcx,%rdx,1),%rax + mov %rax, (%rdx) + add $8, %rdx + dec %r9 + jne .Lloop8f + and $7, %r8 - { align destination } - { This is simpleminded. 
For bigger blocks it may make sense to align - src and dst to their aligned subset and handle the rest separately } -.Lbad_alignment: - movl $8,%r9d - subl %ecx,%r9d - movl %r9d,%ecx - subq %r9,%rdx - js .Lsmall_alignment - jz .Lsmall_alignment -.Lalign_1: - movb (%rsi),%r8b - movb %r8b,(%rdi) - incq %rdi - incq %rsi - loop .Lalign_1 - jmp .Lafter_bad_alignment -.Lsmall_alignment: - addq %r9,%rdx - jmp .Lhandle_7 +.Lless8f: + test %r8, %r8 + jle .Lquit -.Lende: - sfence - popq %rbx - end; -*) + .balign 16 +.Lloop1f: + mov (%rcx,%rdx,1),%al + mov %al,(%rdx) + inc %rdx + dec %r8 + jne .Lloop1f +.Lquit: + retq + + +.Lmore32: + cmp $0x2000, %r9 { this limit must be processor-specific (1/2 L2 cache size) } + jnae .Lloop32 + cmp $0x1000, %rcx { but don't bother bypassing cache if src and dest } + jnb .Lntloopf { are close to each other} + + .balign 16 +.Lloop32: + add $32,%rdx + mov -32(%rcx,%rdx,1),%rax + mov -24(%rcx,%rdx,1),%r10 + mov %rax,-32(%rdx) + mov %r10,-24(%rdx) + dec %r9 + mov -16(%rcx,%rdx,1),%rax + mov -8(%rcx,%rdx,1),%r10 + mov %rax,-16(%rdx) + mov %r10,-8(%rdx) + jne .Lloop32 + + and $0x1f, %r8 + jmpq .Ltail + +.Lntloopf: + mov $32, %eax + + .balign 16 +.Lpref: + prefetchnta (%rcx,%rdx,1) + prefetchnta 0x40(%rcx,%rdx,1) + add $0x80, %rdx + dec %eax + jne .Lpref + + sub $0x1000, %rdx + mov $64, %eax + + .balign 16 +.Loop64: + add $64, %rdx + mov -64(%rcx,%rdx,1), %r9 + mov -56(%rcx,%rdx,1), %r10 + movnti %r9, -64(%rdx) + movnti %r10, -56(%rdx) + + mov -48(%rcx,%rdx,1), %r9 + mov -40(%rcx,%rdx,1), %r10 + movnti %r9, -48(%rdx) + movnti %r10, -40(%rdx) + dec %eax + mov -32(%rcx,%rdx,1), %r9 + mov -24(%rcx,%rdx,1), %r10 + movnti %r9, -32(%rdx) + movnti %r10, -24(%rdx) + + mov -16(%rcx,%rdx,1), %r9 + mov -8(%rcx,%rdx,1), %r10 + movnti %r9, -16(%rdx) + movnti %r10, -8(%rdx) + jne .Loop64 + + sub $0x1000, %r8 + cmp $0x1000, %r8 + jae .Lntloopf + + mfence + jmpq .Ldestaligned { go handle remaining bytes } + +{ backwards move } +.Lback: + add %r8, %rdx { points to the end of dest } + cmp $8, %r8 + jl .Lless8b { signed compare, negative count not allowed } + test $7, %dl + je .Ldestalignedb + test $1, %dl + je .L2b + dec %rdx + mov (%rcx,%rdx,1), %al + dec %r8 + mov %al, (%rdx) +.L2b: + test $2, %dl + je .L4b + sub $2, %rdx + mov (%rcx,%rdx,1), %ax + sub $2, %r8 + mov %ax, (%rdx) +.L4b: + test $4, %dl + je .Ldestalignedb + sub $4, %rdx + mov (%rcx,%rdx,1), %eax + sub $4, %r8 + mov %eax, (%rdx) + +.Ldestalignedb: + mov %r8, %r9 + shr $5, %r9 + jne .Lmore32b + +.Ltailb: + mov %r8, %r9 + shr $3, %r9 + je .Lless8b + +.Lloop8b: + sub $8, %rdx + mov (%rcx,%rdx,1), %rax + dec %r9 + mov %rax, (%rdx) + jne .Lloop8b + and $7, %r8 + +.Lless8b: + test %r8, %r8 + jle .Lquit2 + + .balign 16 +.Lsmallb: + dec %rdx + mov (%rcx,%rdx,1), %al + dec %r8 + mov %al,(%rdx) + jnz .Lsmallb +.Lquit2: + retq + +.Lmore32b: + cmp $0x2000, %r9 + jnae .Lloop32b + cmp $0xfffffffffffff000,%rcx + jb .Lntloopb + + .balign 16 +.Lloop32b: + sub $32, %rdx + mov 24(%rcx,%rdx,1), %rax + mov 16(%rcx,%rdx,1), %r10 + mov %rax, 24(%rdx) + mov %r10, 16(%rdx) + dec %r9 + mov 8(%rcx,%rdx,1),%rax + mov (%rcx,%rdx,1), %r10 + mov %rax, 8(%rdx) + mov %r10, (%rdx) + jne .Lloop32b + and $0x1f, %r8 + jmpq .Ltailb + + +.Lntloopb: + mov $32, %eax + + .balign 16 +.Lprefb: + sub $0x80, %rdx + prefetchnta (%rcx,%rdx,1) + prefetchnta 0x40(%rcx,%rdx,1) + dec %eax + jnz .Lprefb + + add $0x1000, %rdx + mov $0x40, %eax + + .balign 16 +.Lloop64b: + sub $64, %rdx + mov 56(%rcx,%rdx,1), %r9 + mov 48(%rcx,%rdx,1), %r10 + movnti %r9, 56(%rdx) + movnti %r10, 
48(%rdx) + + mov 40(%rcx,%rdx,1), %r9 + mov 32(%rcx,%rdx,1), %r10 + movnti %r9, 40(%rdx) + movnti %r10, 32(%rdx) + dec %eax + mov 24(%rcx,%rdx,1), %r9 + mov 16(%rcx,%rdx,1), %r10 + movnti %r9, 24(%rdx) + movnti %r10, 16(%rdx) + + mov 8(%rcx,%rdx,1), %r9 + mov (%rcx,%rdx,1), %r10 + movnti %r9, 8(%rdx) + movnti %r10, (%rdx) + jne .Lloop64b + + sub $0x1000, %r8 + cmp $0x1000, %r8 + jae .Lntloopb + mfence + jmpq .Ldestalignedb +end; -(* {$define FPC_SYSTEM_HAS_FILLCHAR} -Procedure FillChar(var x;count:longint;value:byte);assembler; +Procedure FillChar(var x;count:SizeInt;value:byte);assembler;nostackframe; asm - { rdi destination - rsi value (char) - rdx count (bytes) - } - movq %rdi,%r10 - movq %rdx,%r11 +{ win64: rcx dest, rdx count, r8b value + linux: rdi dest, rsi count, rdx value } +{$ifndef win64} + mov %rdx, %r8 + mov %rsi, %rdx + mov %rdi, %rcx +{$endif win64} + + cmp $8, %rdx + jl .Ltiny { expand byte value } - movzbl %sil,%ecx - movabs $0x0101010101010101,%rax - mul %rcx { with rax, clobbers rdx } + movzbl %r8b, %r8 + mov $0x0101010101010101,%r9 + imul %r9, %r8 - { align dst } - movl %edi,%r9d - andl $7,%r9d - jnz .Lbad_alignment -.Lafter_bad_alignment: + test $7, %cl + je .Laligned - movq %r11,%rcx - movl $64,%r8d - shrq $6,%rcx - jz .Lhandle_tail + { align dest to 8 bytes } + test $1, %cl + je .L2 + movb %r8b, (%rcx) + add $1, %rcx + sub $1, %rdx +.L2: + test $2, %cl + je .L4 + movw %r8w, (%rcx) + add $2, %rcx + sub $2, %rdx +.L4: + test $4, %cl + je .Laligned + movl %r8d, (%rcx) + add $4, %rcx + sub $4, %rdx -.Lloop_64: - movnti %rax,(%rdi) - movnti %rax,8(%rdi) - movnti %rax,16(%rdi) - movnti %rax,24(%rdi) - movnti %rax,32(%rdi) - movnti %rax,40(%rdi) - movnti %rax,48(%rdi) - movnti %rax,56(%rdi) - addq %r8,%rdi - loop .Lloop_64 +.Laligned: + mov %rdx, %rax + and $0x3f, %rdx + shr $6, %rax + jne .Lmore64 - { Handle tail in loops. The loops should be faster than hard - to predict jump tables. } -.Lhandle_tail: - movl %r11d,%ecx - andl $56,%ecx - jz .Lhandle_7 - shrl $3,%ecx -.Lloop_8: - movnti %rax,(%rdi) - addq $8,%rdi - loop .Lloop_8 -.Lhandle_7: - movl %r11d,%ecx - andl $7,%ecx - jz .Lende -.Lloop_1: - movb %al,(%rdi) - addq $1,%rdi - loop .Lloop_1 +.Lless64: + mov %rdx, %rax + and $7, %rdx + shr $3, %rax + je .Ltiny - jmp .Lende + .balign 16 +.Lloop8: { max. 
8 iterations } + mov %r8, (%rcx) + add $8, %rcx + dec %rax + jne .Lloop8 +.Ltiny: + test %rdx, %rdx + jle .Lquit +.Lloop1: + movb %r8b, (%rcx) + inc %rcx + dec %rdx + jnz .Lloop1 +.Lquit: + retq -.Lbad_alignment: - cmpq $7,%r11 - jbe .Lhandle_7 - movnti %rax,(%rdi) (* unaligned store *) - movq $8,%r8 - subq %r9,%r8 - addq %r8,%rdi - subq %r8,%r11 - jmp .Lafter_bad_alignment +.Lmore64: + cmp $0x2000,%rax + jae .Lloop64nti -.Lende: - movq %r10,%rax + .balign 16 +.Lloop64: + add $64, %rcx + mov %r8, -64(%rcx) + mov %r8, -56(%rcx) + mov %r8, -48(%rcx) + mov %r8, -40(%rcx) + dec %rax + mov %r8, -32(%rcx) + mov %r8, -24(%rcx) + mov %r8, -16(%rcx) + mov %r8, -8(%rcx) + jne .Lloop64 + jmp .Lless64 + + .balign 16 +.Lloop64nti: + add $64, %rcx + movnti %r8, -64(%rcx) + movnti %r8, -56(%rcx) + movnti %r8, -48(%rcx) + movnti %r8, -40(%rcx) + dec %rax + movnti %r8, -32(%rcx) + movnti %r8, -24(%rcx) + movnti %r8, -16(%rcx) + movnti %r8, -8(%rcx) + jnz .Lloop64nti + mfence + jmp .Lless64 end; -*) {$define FPC_SYSTEM_HAS_DECLOCKED_LONGINT} diff --git a/tests/test/units/system/tmem.pp b/tests/test/units/system/tmem.pp index 3ef647419c..7abb55226d 100644 --- a/tests/test/units/system/tmem.pp +++ b/tests/test/units/system/tmem.pp @@ -16,9 +16,7 @@ var dst_arraybyte : array[1..MAX_TABLE] of byte; src_arraybyte : array[1..MAX_TABLE] of byte; dst_arrayword : array[1..MAX_TABLE] of word; - src_arrayword : array[1..MAX_TABLE] of word; dst_arraylongword : array[1..MAX_TABLE] of longword; - src_arratlongword : array[1..MAX_TABLE] of longword; i: integer; @@ -70,6 +68,8 @@ procedure test_fillchar; for i := 1 to MAX_TABLE do dst_arraybyte[i] := DEFAULT_VALUE; fillchar(dst_arraybyte, -1, FILL_VALUE); + for i := 1 to MAX_TABLE do + test(dst_arraybyte[i], DEFAULT_VALUE); writeln('Passed!'); end; @@ -103,7 +103,7 @@ begin test(dst_arraybyte[i], FILL_VALUE); writeln('Passed!'); { zero move count } - write('test move (zero count)...'); + write('testing move (zero count)...'); for i := 1 to MAX_TABLE do begin dst_arraybyte[i] := DEFAULT_VALUE; @@ -114,11 +114,75 @@ begin test(dst_arraybyte[i], DEFAULT_VALUE); writeln('Passed!'); { negative move count } - write('test move (negative count)...'); + write('testing move (negative count)...'); move(src_arraybyte,dst_arraybyte,-12); writeln('Passed!'); end; + +procedure test_move_large(size: longint); +var + src, dst: PLongInt; + i: LongInt; +begin + GetMem(src, size*sizeof(LongInt)); + GetMem(dst, size*sizeof(LongInt)); + write('testing move of ',size,' dwords ...'); + for i := 0 to size-1 do + begin + src[i] := i; + dst[i] := -1; + end; + move(src[0], dst[2], (size-4)*sizeof(LongInt)); + test(dst[0], -1); + test(dst[1], -1); + test(dst[size-1], -1); + test(dst[size-2], -1); + for i := 2 to size-3 do + test(dst[i], i-2); + writeln('Passed!'); + + // repeat with source and dest swapped (maybe move in opposite direction) + // current implementations detect that regions don't overlap and move forward, + // so this test is mostly useless. But it won't harm anyway. 
+ write('testing move of ',size,' dwords, opposite direction...'); + for i := 0 to size-1 do + begin + dst[i] := i; + src[i] := -1; + end; + move(dst[0], src[2], (size-4)*sizeof(LongInt)); + test(src[0], -1); + test(src[1], -1); + test(src[size-1], -1); + test(src[size-2], -1); + for i := 2 to size-3 do + test(src[i], i-2); + writeln('Passed!'); + + write('testing move of ',size,' dwords, overlapping forward...'); + for i := 0 to size-1 do + src[i] := i; + move(src[0], src[100], (size-100)*sizeof(LongInt)); + for i := 0 to 99 do + test(src[i], i); + for i := 100 to size-101 do + test(src[i], i-100); + writeln('Passed!'); + + write('testing move of ',size,' dwords, overlapping backward...'); + for i := 0 to size-1 do + src[i] := i; + move(src[100], src[0], (size-100)*sizeof(LongInt)); + for i := 0 to size-101 do + test(src[i], i+100); + for i := size-100 to size-1 do + test(src[i], i); + writeln('Passed!'); + FreeMem(dst); + FreeMem(src); +end; + {$ifdef fpc} procedure test_fillword; var @@ -271,6 +335,8 @@ end; begin test_fillchar; test_move; + test_move_large(500); // 512 longints=2048 bytes + test_move_large(500000); {$ifdef fpc} test_fillword; test_filldword;
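
Illustrative sketch (not part of the patch): the two ideas the new assembler routines build on can be modelled in a few lines of plain Pascal. Move picks its copy direction from (src - dest): it only copies backwards when the source lies below the destination and the regions overlap; otherwise a forward copy is safe. FillChar widens the fill byte to 64 bits by multiplying it with $0101010101010101, so each store writes eight filled bytes. The program name "moverules" and the helpers "DirectionFor" and "ExpandByte" are hypothetical names introduced here for illustration only; the constants and the overlap rule are taken from the assembly above. The sketch deliberately ignores the non-temporal (movnti) path the real routines switch to above the 0x2000-block threshold that the patch notes as processor-specific.

program moverules;
{$mode objfpc}

type
  TMoveDirection = (mdForward, mdBackward);

function DirectionFor(src, dest: PtrUInt; count: SizeUInt): TMoveDirection;
begin
  { Forward copy is safe unless dest starts inside [src, src+count);
    this mirrors the "sub %rdx,%rcx / add %rcx,%rax / jb .Lback" test. }
  if (dest > src) and (dest < src + count) then
    Result := mdBackward
  else
    Result := mdForward;
end;

function ExpandByte(value: byte): qword;
begin
  { Replicate the byte into all 8 lanes, as the imul in FillChar does. }
  Result := qword(value) * qword($0101010101010101);
end;

begin
  writeln(ExpandByte($AB) = qword($ABABABABABABABAB)); { TRUE }
  writeln(DirectionFor(100, 104, 16) = mdBackward);    { overlapping, dest above src }
  writeln(DirectionFor(104, 100, 16) = mdForward);     { dest below src: forward is safe }
  writeln(DirectionFor(100, 200, 16) = mdForward);     { disjoint regions }
end.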