From 2e0203b7a2c793f765858903e513325dabbbde98 Mon Sep 17 00:00:00 2001
From: masta
Date: Wed, 1 Aug 2012 11:15:20 +0000
Subject: [PATCH] Improved Move implementation on ARM

This adds some small improvements to Move_pld and Move_blended:

1.) Overlapping memory is handled as the "unusual" case and its code is
    placed at the end of the function for better icache/bpu performance.
2.) The overlap check is fused into 3 instructions with a single jump
    instead of 5 instructions with 2 jumps.
3.) ldmia/stmia with 2 registers is used instead of ldr/str for faster
    copying.
4.) Some code cleanup.

git-svn-id: trunk@21992 -
---
 rtl/arm/arm.inc | 145 ++++++++++++++----------------------------------
 1 file changed, 42 insertions(+), 103 deletions(-)

diff --git a/rtl/arm/arm.inc b/rtl/arm/arm.inc
index 65bcb64d1f..fa68c21947 100644
--- a/rtl/arm/arm.inc
+++ b/rtl/arm/arm.inc
@@ -224,62 +224,27 @@ asm
   bxle lr
 {$endif}
   // overlap?
-  cmp r1,r0
-  bls .Lnooverlap
-  add r3,r0,r2
-  cmp r3,r1
-  bls .Lnooverlap
-  // overlap, copy backward
-.Loverlapped:
-  subs r2,r2,#1
-  ldrb r3,[r0,r2]
-  strb r3,[r1,r2]
-  bne .Loverlapped
-{$if defined(cpuarmv3) or defined(cpuarmv4)}
-  mov pc,lr
-{$else}
-  bx lr
-{$endif}
-.Lnooverlap:
-  // less then 16 bytes to copy?
-  cmp r2,#8
-  // yes, the forget about the whole optimizations
-  // and do a bytewise copy
-  blt .Lbyteloop
+  subs r3, r1, r0   // if (dest > source) and
+  cmphi r2, r3      // (count > dest - src) then
+  bhi .Loverlapped  // DoReverseByteCopy;
 
-  // both aligned?
-  orr r3,r0,r1
-  tst r3,#3
+  cmp r2,#8         // if (count < 8) then
+  blt .Lbyteloop    // DoForwardByteCopy;
+  // Any way to avoid the above jump and fuse the next two instructions?
+  tst r0, #3        // if (source and 3) <> 0 or
+  tsteq r1, #3      // (dest and 3) <> 0 then
+  bne .Lbyteloop    // DoForwardByteCopy;
 
-  bne .Lbyteloop
-(*
-  // yes, then align
-  // alignment to 4 byte boundries is enough
-  ldrb ip,[r0],#1
-  sub r2,r2,#1
-  stb ip,[r1],#1
-  tst r3,#2
-  bne .Ldifferentaligned
-  ldrh ip,[r0],#2
-  sub r2,r2,#2
-  sth ip,[r1],#2
-
-.Ldifferentaligned
-  // qword aligned?
-  orrs r3,r0,r1
-  tst r3,#7
-  bne .Ldwordloop
-*)
-  pld [r0,#32]
+  pld [r0,#32]
 .Ldwordloop:
-  sub r2,r2,#4
-  ldr r3,[r0],#4
+  ldmia r0!, {r3, ip}
   // preload
-  pld [r0,#64]
-  cmp r2,#4
-  str r3,[r1],#4
-  bcs .Ldwordloop
-  cmp r2,#0
+  pld [r0,#64]
+  sub r2,r2,#8
+  cmp r2, #8
+  stmia r1!, {r3, ip}
+  bge .Ldwordloop
+  cmp r2,#0
 {$if defined(cpuarmv3) or defined(cpuarmv4)}
   moveq pc,lr
 {$else}
@@ -295,6 +260,11 @@ asm
 {$else}
   bx lr
 {$endif}
+.Loverlapped:
+  subs r2,r2,#1
+  ldrb r3,[r0,r2]
+  strb r3,[r1,r2]
+  bne .Loverlapped
 end;
 
 procedure Move_blended(const source;var dest;count:longint);assembler;nostackframe;
@@ -307,59 +277,24 @@ asm
   bxle lr
 {$endif}
   // overlap?
-  cmp r1,r0
-  bls .Lnooverlap
-  add r3,r0,r2
-  cmp r3,r1
-  bls .Lnooverlap
-  // overlap, copy backward
-.Loverlapped:
-  subs r2,r2,#1
-  ldrb r3,[r0,r2]
-  strb r3,[r1,r2]
-  bne .Loverlapped
-{$if defined(cpuarmv3) or defined(cpuarmv4)}
-  mov pc,lr
-{$else}
-  bx lr
-{$endif}
-.Lnooverlap:
-  // less then 16 bytes to copy?
-  cmp r2,#8
-  // yes, the forget about the whole optimizations
-  // and do a bytewise copy
-  blt .Lbyteloop
+  subs r3, r1, r0   // if (dest > source) and
+  cmphi r2, r3      // (count > dest - src) then
+  bhi .Loverlapped  // DoReverseByteCopy;
 
-  // both aligned?
-  orr r3,r0,r1
-  tst r3,#3
+  cmp r2,#8         // if (count < 8) then
+  blt .Lbyteloop    // DoForwardByteCopy;
+  // Any way to avoid the above jump and fuse the next two instructions?
+  tst r0, #3        // if (source and 3) <> 0 or
+  tsteq r1, #3      // (dest and 3) <> 0 then
+  bne .Lbyteloop    // DoForwardByteCopy;
 
-  bne .Lbyteloop
-(*
-  // yes, then align
-  // alignment to 4 byte boundries is enough
-  ldrb ip,[r0],#1
-  sub r2,r2,#1
-  stb ip,[r1],#1
-  tst r3,#2
-  bne .Ldifferentaligned
-  ldrh ip,[r0],#2
-  sub r2,r2,#2
-  sth ip,[r1],#2
-
-.Ldifferentaligned
-  // qword aligned?
-  orrs r3,r0,r1
-  tst r3,#7
-  bne .Ldwordloop
-*)
 .Ldwordloop:
-  sub r2,r2,#4
-  ldr r3,[r0],#4
-  cmp r2,#4
-  str r3,[r1],#4
-  bcs .Ldwordloop
-  cmp r2,#0
+  ldmia r0!, {r3, ip}
+  sub r2,r2,#8
+  cmp r2, #8
+  stmia r1!, {r3, ip}
+  bge .Ldwordloop
+  cmp r2,#0
 {$if defined(cpuarmv3) or defined(cpuarmv4)}
   moveq pc,lr
 {$else}
@@ -375,9 +310,13 @@ asm
 {$else}
   bx lr
 {$endif}
+.Loverlapped:
+  subs r2,r2,#1
+  ldrb r3,[r0,r2]
+  strb r3,[r1,r2]
+  bne .Loverlapped
 end;
-
 const
   moveproc : pointer = @move_blended;
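
A quick way to convince yourself that the fused test is equivalent to the old
five-instruction sequence: a forward copy only destroys data when the
destination starts inside the source range, i.e. when dest > source and
dest - source < count, which is exactly what subs/cmphi/bhi evaluate with a
single branch. Below is a minimal C model of that decision, illustrative only
and not part of the patch or the RTL; move_sketch and the test buffer are
made-up names:

/* C model of the fused overlap check:
 *   subs  r3, r1, r0     @ r3 := dest - src, flags set
 *   cmphi r2, r3         @ executed only if dest > src (unsigned)
 *   bhi   .Loverlapped   @ taken iff dest > src and count > dest - src
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

static void move_sketch(const void *src, void *dst, size_t count)
{
    const unsigned char *s = src;
    unsigned char *d = dst;

    if ((uintptr_t)d > (uintptr_t)s && (uintptr_t)d - (uintptr_t)s < count) {
        /* destructive overlap: copy backward, like the .Loverlapped loop */
        while (count--)
            d[count] = s[count];
    } else {
        /* safe to copy forward; the real code moves register pairs with
           ldmia/stmia when count >= 8 and both pointers are word aligned,
           and falls back to a byte loop otherwise */
        for (size_t i = 0; i < count; i++)
            d[i] = s[i];
    }
}

int main(void)
{
    unsigned char buf[16];
    for (int i = 0; i < 16; i++)
        buf[i] = (unsigned char)i;

    move_sketch(buf, buf + 4, 8);   /* overlapping move: backward path */
    for (int i = 0; i < 16; i++)
        printf("%u ", buf[i]);      /* 0 1 2 3 0 1 2 3 4 5 6 7 12 13 14 15 */
    putchar('\n');
    return 0;
}

Placing the backward loop after the function's normal return (the .Loverlapped
label at the end) keeps the common non-overlapping path as the fall-through
case, which is what the commit message means by treating overlap as "unusual".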