From 2e0203b7a2c793f765858903e513325dabbbde98 Mon Sep 17 00:00:00 2001
From: masta
Date: Wed, 1 Aug 2012 11:15:20 +0000
Subject: [PATCH] Improved Move implementation on ARM

This adds some small improvements to Move_pld and Move_blended:

1.) Overlapping memory is handled as the "unusual" case and its code is
    placed at the end of the function for better icache/bpu performance.
2.) The overlap check is fused into 3 instructions with a single jump
    instead of 5 instructions with 2 jumps.
3.) ldmia/stmia with 2 registers is used instead of ldr/str for faster
    copying.
4.) Some code cleanup.

git-svn-id: trunk@21992 -
---
 rtl/arm/arm.inc | 145 ++++++++++++++----------------------------------
 1 file changed, 42 insertions(+), 103 deletions(-)

diff --git a/rtl/arm/arm.inc b/rtl/arm/arm.inc
index 65bcb64d1f..fa68c21947 100644
--- a/rtl/arm/arm.inc
+++ b/rtl/arm/arm.inc
@@ -224,62 +224,27 @@ asm
   bxle lr
 {$endif}
   // overlap?
-  cmp r1,r0
-  bls .Lnooverlap
-  add r3,r0,r2
-  cmp r3,r1
-  bls .Lnooverlap
-  // overlap, copy backward
-.Loverlapped:
-  subs r2,r2,#1
-  ldrb r3,[r0,r2]
-  strb r3,[r1,r2]
-  bne .Loverlapped
-{$if defined(cpuarmv3) or defined(cpuarmv4)}
-  mov pc,lr
-{$else}
-  bx lr
-{$endif}
-.Lnooverlap:
-  // less then 16 bytes to copy?
-  cmp r2,#8
-  // yes, the forget about the whole optimizations
-  // and do a bytewise copy
-  blt .Lbyteloop
+  subs r3, r1, r0   // if (dest > source) and
+  cmphi r2, r3      // (count > dest - src) then
+  bhi .Loverlapped  // DoReverseByteCopy;
 
-  // both aligned?
-  orr r3,r0,r1
-  tst r3,#3
+  cmp r2,#8         // if (count < 8) then
+  blt .Lbyteloop    // DoForwardByteCopy;
+  // Any way to avoid the above jump and fuse the next two instructions?
+  tst r0, #3        // if (source and 3) <> 0 or
+  tsteq r1, #3      // (dest and 3) <> 0 then
+  bne .Lbyteloop    // DoForwardByteCopy;
 
-  bne .Lbyteloop
-(*
-  // yes, then align
-  // alignment to 4 byte boundries is enough
-  ldrb ip,[r0],#1
-  sub r2,r2,#1
-  stb ip,[r1],#1
-  tst r3,#2
-  bne .Ldifferentaligned
-  ldrh ip,[r0],#2
-  sub r2,r2,#2
-  sth ip,[r1],#2
-
-.Ldifferentaligned
-  // qword aligned?
-  orrs r3,r0,r1
-  tst r3,#7
-  bne .Ldwordloop
-*)
-  pld [r0,#32]
+  pld [r0,#32]
 .Ldwordloop:
-  sub r2,r2,#4
-  ldr r3,[r0],#4
+  ldmia r0!, {r3, ip}
   // preload
-  pld [r0,#64]
-  cmp r2,#4
-  str r3,[r1],#4
-  bcs .Ldwordloop
-  cmp r2,#0
+  pld [r0,#64]
+  sub r2,r2,#8
+  cmp r2, #8
+  stmia r1!, {r3, ip}
+  bge .Ldwordloop
+  cmp r2,#0
 {$if defined(cpuarmv3) or defined(cpuarmv4)}
   moveq pc,lr
 {$else}
@@ -295,6 +260,11 @@ asm
 {$else}
   bx lr
 {$endif}
+.Loverlapped:
+  subs r2,r2,#1
+  ldrb r3,[r0,r2]
+  strb r3,[r1,r2]
+  bne .Loverlapped
 end;
 
 procedure Move_blended(const source;var dest;count:longint);assembler;nostackframe;
@@ -307,59 +277,24 @@ asm
   bxle lr
 {$endif}
   // overlap?
-  cmp r1,r0
-  bls .Lnooverlap
-  add r3,r0,r2
-  cmp r3,r1
-  bls .Lnooverlap
-  // overlap, copy backward
-.Loverlapped:
-  subs r2,r2,#1
-  ldrb r3,[r0,r2]
-  strb r3,[r1,r2]
-  bne .Loverlapped
-{$if defined(cpuarmv3) or defined(cpuarmv4)}
-  mov pc,lr
-{$else}
-  bx lr
-{$endif}
-.Lnooverlap:
-  // less then 16 bytes to copy?
-  cmp r2,#8
-  // yes, the forget about the whole optimizations
-  // and do a bytewise copy
-  blt .Lbyteloop
+  subs r3, r1, r0   // if (dest > source) and
+  cmphi r2, r3      // (count > dest - src) then
+  bhi .Loverlapped  // DoReverseByteCopy;
 
-  // both aligned?
-  orr r3,r0,r1
-  tst r3,#3
+  cmp r2,#8         // if (count < 8) then
+  blt .Lbyteloop    // DoForwardByteCopy;
+  // Any way to avoid the above jump and fuse the next two instructions?
+  tst r0, #3        // if (source and 3) <> 0 or
+  tsteq r1, #3      // (dest and 3) <> 0 then
+  bne .Lbyteloop    // DoForwardByteCopy;
 
-  bne .Lbyteloop
-(*
-  // yes, then align
-  // alignment to 4 byte boundries is enough
-  ldrb ip,[r0],#1
-  sub r2,r2,#1
-  stb ip,[r1],#1
-  tst r3,#2
-  bne .Ldifferentaligned
-  ldrh ip,[r0],#2
-  sub r2,r2,#2
-  sth ip,[r1],#2
-
-.Ldifferentaligned
-  // qword aligned?
-  orrs r3,r0,r1
-  tst r3,#7
-  bne .Ldwordloop
-*)
 .Ldwordloop:
-  sub r2,r2,#4
-  ldr r3,[r0],#4
-  cmp r2,#4
-  str r3,[r1],#4
-  bcs .Ldwordloop
-  cmp r2,#0
+  ldmia r0!, {r3, ip}
+  sub r2,r2,#8
+  cmp r2, #8
+  stmia r1!, {r3, ip}
+  bge .Ldwordloop
+  cmp r2,#0
 {$if defined(cpuarmv3) or defined(cpuarmv4)}
   moveq pc,lr
 {$else}
@@ -375,9 +310,13 @@ asm
 {$else}
   bx lr
 {$endif}
+.Loverlapped:
+  subs r2,r2,#1
+  ldrb r3,[r0,r2]
+  strb r3,[r1,r2]
+  bne .Loverlapped
 end;
-
 const
   moveproc : pointer = @move_blended;
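
A quick way to convince yourself that the fused test is equivalent to the old
five-instruction sequence: a forward copy only destroys data when the
destination starts inside the source range, i.e. when dest > source and
dest - source < count, which is exactly what subs/cmphi/bhi evaluate with a
single branch. Below is a minimal C model of that decision, illustrative only
and not part of the patch or the RTL; move_sketch and the test buffer are
made-up names:

/* C model of the fused overlap check:
 *   subs  r3, r1, r0     @ r3 := dest - src, flags set
 *   cmphi r2, r3         @ executed only if dest > src (unsigned)
 *   bhi   .Loverlapped   @ taken iff dest > src and count > dest - src
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

static void move_sketch(const void *src, void *dst, size_t count)
{
    const unsigned char *s = src;
    unsigned char *d = dst;

    if ((uintptr_t)d > (uintptr_t)s && (uintptr_t)d - (uintptr_t)s < count) {
        /* destructive overlap: copy backward, like the .Loverlapped loop */
        while (count--)
            d[count] = s[count];
    } else {
        /* safe to copy forward; the real code moves register pairs with
           ldmia/stmia when count >= 8 and both pointers are word aligned,
           and falls back to a byte loop otherwise */
        for (size_t i = 0; i < count; i++)
            d[i] = s[i];
    }
}

int main(void)
{
    unsigned char buf[16];
    for (int i = 0; i < 16; i++)
        buf[i] = (unsigned char)i;

    move_sketch(buf, buf + 4, 8);   /* overlapping move: backward path */
    for (int i = 0; i < 16; i++)
        printf("%u ", buf[i]);      /* 0 1 2 3 0 1 2 3 4 5 6 7 12 13 14 15 */
    putchar('\n');
    return 0;
}

Placing the backward loop after the function's normal return (the .Loverlapped
label at the end) keeps the common non-overlapping path as the fall-through
case, which is what the commit message means by treating overlap as "unusual".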