Improved Move implementation on ARM

This adds some small improvements to Move_pld and Move_blended.

1.) Overlapping memory is handled as "unusual" and the code is placed at
the end of the function for better icache/bpu performance.
2.) Fused the overlap check into 3 instructions with a single jump
instead of 5 instructions with 2 jumps (sketched below).
3.) Use ldmia/stmia with 2 registers instead of ldr/str for faster
copying (see the loop sketch after the Move_pld hunk).
4.) Some code cleanup.
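
The fused check works because a backward copy is only needed when the
destination starts inside the source range [source, source+count). A minimal
Free Pascal sketch of the condition the new subs/cmphi/bhi sequence evaluates
(the helper name and parameter names are illustrative, not part of the RTL):

{ Copy backwards only if dest lies inside the source block.
  The asm fuses both comparisons by feeding the flags of the
  subtraction (subs) into a conditional compare (cmphi). }
function NeedsBackwardCopy(src, dst, count: PtrUInt): Boolean;
begin
  Result := (dst > src) and (count > dst - src);
end;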

git-svn-id: trunk@21992 -
masta 2012-08-01 11:15:20 +00:00
parent f6deb01295
commit 2e0203b7a2


@@ -224,62 +224,27 @@ asm
bxle lr
{$endif}
// overlap?
cmp r1,r0
bls .Lnooverlap
add r3,r0,r2
cmp r3,r1
bls .Lnooverlap
// overlap, copy backward
.Loverlapped:
subs r2,r2,#1
ldrb r3,[r0,r2]
strb r3,[r1,r2]
bne .Loverlapped
{$if defined(cpuarmv3) or defined(cpuarmv4)}
mov pc,lr
{$else}
bx lr
{$endif}
.Lnooverlap:
// less then 16 bytes to copy?
cmp r2,#8
// yes, the forget about the whole optimizations
// and do a bytewise copy
blt .Lbyteloop
subs r3, r1, r0 // if (dest > source) and
cmphi r2, r3 // (count > dest - src) then
bhi .Loverlapped // DoReverseByteCopy;
// both aligned?
orr r3,r0,r1
tst r3,#3
cmp r2,#8 // if (count < 8) then
blt .Lbyteloop // DoForwardByteCopy;
// Any way to avoid the above jump and fuse the next two instructions?
tst r0, #3 // if (source and 3) <> 0 or
tsteq r1, #3 // (dest and 3) <> 0 then
bne .Lbyteloop // DoForwardByteCopy;
bne .Lbyteloop
(*
// yes, then align
// alignment to 4 byte boundries is enough
ldrb ip,[r0],#1
sub r2,r2,#1
stb ip,[r1],#1
tst r3,#2
bne .Ldifferentaligned
ldrh ip,[r0],#2
sub r2,r2,#2
sth ip,[r1],#2
.Ldifferentaligned
// qword aligned?
orrs r3,r0,r1
tst r3,#7
bne .Ldwordloop
*)
pld [r0,#32]
pld [r0,#32]
.Ldwordloop:
sub r2,r2,#4
ldr r3,[r0],#4
ldmia r0!, {r3, ip}
// preload
pld [r0,#64]
cmp r2,#4
str r3,[r1],#4
bcs .Ldwordloop
cmp r2,#0
pld [r0,#64]
sub r2,r2,#8
cmp r2, #8
stmia r1!, {r3, ip}
bge .Ldwordloop
cmp r2,#0
{$if defined(cpuarmv3) or defined(cpuarmv4)}
moveq pc,lr
{$else}
@@ -295,6 +260,11 @@ asm
{$else}
bx lr
{$endif}
.Loverlapped:
subs r2,r2,#1
ldrb r3,[r0,r2]
strb r3,[r1,r2]
bne .Loverlapped
end;
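
For reference, the new fast path above copies two 32-bit registers per
ldmia/stmia pair and only drops to the bytewise tail once fewer than 8 bytes
remain. A rough Free Pascal sketch of that loop structure, assuming the
overlap and alignment checks have already passed (procedure and variable
names are illustrative only):

{ Word-pair copy: moves 8 bytes per iteration, like ldmia/stmia
  with r3 and ip, then finishes the 0..7 byte remainder bytewise. }
procedure CopyWordPairs(src, dst: Pointer; count: LongInt);
var
  s, d: PLongWord;
  sb, db: PByte;
begin
  s := PLongWord(src);
  d := PLongWord(dst);
  while count >= 8 do
  begin
    d^ := s^; Inc(s); Inc(d);   { first register of the pair }
    d^ := s^; Inc(s); Inc(d);   { second register of the pair }
    Dec(count, 8);              { sub r2,r2,#8 / cmp r2,#8 / bge .Ldwordloop }
  end;
  sb := PByte(s);
  db := PByte(d);
  while count > 0 do            { corresponds to the .Lbyteloop tail }
  begin
    db^ := sb^;
    Inc(sb); Inc(db);
    Dec(count);
  end;
end;
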
procedure Move_blended(const source;var dest;count:longint);assembler;nostackframe;
@@ -307,59 +277,24 @@ asm
bxle lr
{$endif}
// overlap?
cmp r1,r0
bls .Lnooverlap
add r3,r0,r2
cmp r3,r1
bls .Lnooverlap
// overlap, copy backward
.Loverlapped:
subs r2,r2,#1
ldrb r3,[r0,r2]
strb r3,[r1,r2]
bne .Loverlapped
{$if defined(cpuarmv3) or defined(cpuarmv4)}
mov pc,lr
{$else}
bx lr
{$endif}
.Lnooverlap:
// less then 16 bytes to copy?
cmp r2,#8
// yes, the forget about the whole optimizations
// and do a bytewise copy
blt .Lbyteloop
subs r3, r1, r0 // if (dest > source) and
cmphi r2, r3 // (count > dest - src) then
bhi .Loverlapped // DoReverseByteCopy;
// both aligned?
orr r3,r0,r1
tst r3,#3
cmp r2,#8 // if (count < 8) then
blt .Lbyteloop // DoForwardByteCopy;
// Any way to avoid the above jump and fuse the next two instructions?
tst r0, #3 // if (source and 3) <> 0 or
tsteq r1, #3 // (dest and 3) <> 0 then
bne .Lbyteloop // DoForwardByteCopy;
bne .Lbyteloop
(*
// yes, then align
// alignment to 4 byte boundries is enough
ldrb ip,[r0],#1
sub r2,r2,#1
stb ip,[r1],#1
tst r3,#2
bne .Ldifferentaligned
ldrh ip,[r0],#2
sub r2,r2,#2
sth ip,[r1],#2
.Ldifferentaligned
// qword aligned?
orrs r3,r0,r1
tst r3,#7
bne .Ldwordloop
*)
.Ldwordloop:
sub r2,r2,#4
ldr r3,[r0],#4
cmp r2,#4
str r3,[r1],#4
bcs .Ldwordloop
cmp r2,#0
ldmia r0!, {r3, ip}
sub r2,r2,#8
cmp r2, #8
stmia r1!, {r3, ip}
bge .Ldwordloop
cmp r2,#0
{$if defined(cpuarmv3) or defined(cpuarmv4)}
moveq pc,lr
{$else}
@@ -375,9 +310,13 @@ asm
{$else}
bx lr
{$endif}
.Loverlapped:
subs r2,r2,#1
ldrb r3,[r0,r2]
strb r3,[r1,r2]
bne .Loverlapped
end;
const
moveproc : pointer = @move_blended;