Improved Move implementation on ARM

This adds some small improvements to Move_pld and Move_blended. 1.) Overlapping memory is handled as "unusual" and the code is placed at the end of the function for better icache/bpu performance. 2.) Fused the overlap check into 3 instructions with a single jump instead of 5 instructions with 2 jumps. 3.) Use ldmia/stmia with 2 registers instead of ldr/str for faster copying. 4.) Some code cleanup. git-svn-id: trunk@21992 -
2025-08-27 20:11:02 +02:00 · 2012-08-01 11:15:20 +00:00 · 2012-08-01 11:15:20 +00:00 · 2e0203b7a2
commit 2e0203b7a2
parent f6deb01295
1 changed files with 42 additions and 103 deletions
--- a/rtl/arm/arm.inc
+++ b/rtl/arm/arm.inc
@ -224,62 +224,27 @@ asm
  bxle  lr
 {$endif}
  // overlap?
-  cmp r1,r0
+  subs   r3, r1, r0    // if (dest > source) and
-  bls .Lnooverlap
+  cmphi  r2, r3        //    (count > dest - src) then
-  add r3,r0,r2
+  bhi    .Loverlapped  //   DoReverseByteCopy;
  cmp r3,r1
  bls .Lnooverlap
  // overlap, copy backward
 .Loverlapped:
  subs r2,r2,#1
  ldrb r3,[r0,r2]
  strb r3,[r1,r2]
  bne .Loverlapped
 {$if defined(cpuarmv3) or defined(cpuarmv4)}
  mov pc,lr
 {$else}
  bx  lr
 {$endif}
 .Lnooverlap:
  // less than 8 bytes to copy?
  cmp r2,#8
  // yes, then forget about all the optimizations
  // and do a bytewise copy
  blt .Lbyteloop
-  // both aligned?
+  cmp r2,#8            // if (count < 8) then
-  orr r3,r0,r1
+  blt .Lbyteloop       //    DoForwardByteCopy;
-  tst r3,#3
+  // Any way to avoid the above jump and fuse the next two instructions?
  tst   r0, #3         // if (source and 3) <> 0 or
  tsteq r1, #3         //    (dest and 3) <> 0 then
  bne   .Lbyteloop     //   DoForwardByteCopy;
-  bne .Lbyteloop
+  pld   [r0,#32]
 (*
  // yes, then align
  // alignment to 4 byte boundaries is enough
  ldrb ip,[r0],#1
  sub r2,r2,#1
  stb ip,[r1],#1
  tst r3,#2
  bne .Ldifferentaligned
  ldrh ip,[r0],#2
  sub r2,r2,#2
  sth ip,[r1],#2
 .Ldifferentaligned
  // qword aligned?
  orrs r3,r0,r1
  tst r3,#7
  bne .Ldwordloop
 *)
  pld [r0,#32]
 .Ldwordloop:
-  sub r2,r2,#4
+  ldmia r0!, {r3, ip}
  ldr r3,[r0],#4
  // preload
-  pld [r0,#64]
+  pld   [r0,#64]
-  cmp r2,#4
+  sub   r2,r2,#8
-  str r3,[r1],#4
+  cmp   r2, #8
-  bcs .Ldwordloop
+  stmia r1!, {r3, ip}
-  cmp r2,#0
+  bge   .Ldwordloop
  cmp   r2,#0
 {$if defined(cpuarmv3) or defined(cpuarmv4)}
  moveq pc,lr
 {$else}
@ -295,6 +260,11 @@ asm
 {$else}
  bx  lr
 {$endif}
 .Loverlapped:
  subs r2,r2,#1
  ldrb r3,[r0,r2]
  strb r3,[r1,r2]
  bne .Loverlapped
 end;
 procedure Move_blended(const source;var dest;count:longint);assembler;nostackframe;
@ -307,59 +277,24 @@ asm
  bxle  lr
 {$endif}
  // overlap?
-  cmp r1,r0
+  subs   r3, r1, r0    // if (dest > source) and
-  bls .Lnooverlap
+  cmphi  r2, r3        //    (count > dest - src) then
-  add r3,r0,r2
+  bhi    .Loverlapped  //   DoReverseByteCopy;
  cmp r3,r1
  bls .Lnooverlap
  // overlap, copy backward
 .Loverlapped:
  subs r2,r2,#1
  ldrb r3,[r0,r2]
  strb r3,[r1,r2]
  bne .Loverlapped
 {$if defined(cpuarmv3) or defined(cpuarmv4)}
  mov pc,lr
 {$else}
  bx  lr
 {$endif}
 .Lnooverlap:
  // less than 8 bytes to copy?
  cmp r2,#8
  // yes, then forget about all the optimizations
  // and do a bytewise copy
  blt .Lbyteloop
-  // both aligned?
+  cmp r2,#8            // if (count < 8) then
-  orr r3,r0,r1
+  blt .Lbyteloop       //    DoForwardByteCopy;
-  tst r3,#3
+  // Any way to avoid the above jump and fuse the next two instructions?
  tst   r0, #3         // if (source and 3) <> 0 or
  tsteq r1, #3         //    (dest and 3) <> 0 then
  bne   .Lbyteloop     //   DoForwardByteCopy;
  bne .Lbyteloop
 (*
  // yes, then align
  // alignment to 4 byte boundaries is enough
  ldrb ip,[r0],#1
  sub r2,r2,#1
  stb ip,[r1],#1
  tst r3,#2
  bne .Ldifferentaligned
  ldrh ip,[r0],#2
  sub r2,r2,#2
  sth ip,[r1],#2
 .Ldifferentaligned
  // qword aligned?
  orrs r3,r0,r1
  tst r3,#7
  bne .Ldwordloop
 *)
 .Ldwordloop:
-  sub r2,r2,#4
+  ldmia r0!, {r3, ip}
-  ldr r3,[r0],#4
+  sub   r2,r2,#8
-  cmp r2,#4
+  cmp   r2, #8
-  str r3,[r1],#4
+  stmia r1!, {r3, ip}
-  bcs .Ldwordloop
+  bge   .Ldwordloop
-  cmp r2,#0
+  cmp   r2,#0
 {$if defined(cpuarmv3) or defined(cpuarmv4)}
  moveq pc,lr
 {$else}
@ -375,9 +310,13 @@ asm
 {$else}
  bx  lr
 {$endif}
 .Loverlapped:
  subs r2,r2,#1
  ldrb r3,[r0,r2]
  strb r3,[r1,r2]
  bne .Loverlapped
 end;
 const
  moveproc : pointer = @move_blended;