Improved Move implementation on ARM

This adds some small improvements to Move_pld and Move_blended.

1.) Overlapping memory is handled as "unusual" and the code is placed at
the end of the function for better icache/bpu performance.
2.) Fused the overlap check into 3 instructions with a single jump
instead of 5 instructions with 2 jumps (sketched below).
3.) Use ldmia/stmia with 2 registers instead of ldr/str for faster
copying (see the loop sketch after the Move_pld hunk).
4.) Some code cleanup.
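
The fused check works because a backward copy is only needed when the
destination starts inside the source range [source, source+count). A minimal
Free Pascal sketch of the condition the new subs/cmphi/bhi sequence evaluates
(the helper name and parameter names are illustrative, not part of the RTL):

{ Copy backwards only if dest lies inside the source block.
  The asm fuses both comparisons by feeding the flags of the
  subtraction (subs) into a conditional compare (cmphi). }
function NeedsBackwardCopy(src, dst, count: PtrUInt): Boolean;
begin
  Result := (dst > src) and (count > dst - src);
end;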

git-svn-id: trunk@21992 -
masta 2012-08-01 11:15:20 +00:00
parent f6deb01295
commit 2e0203b7a2


@@ -224,62 +224,27 @@ asm
bxle lr
{$endif}
// overlap?
cmp r1,r0
bls .Lnooverlap
add r3,r0,r2
cmp r3,r1
bls .Lnooverlap
// overlap, copy backward
.Loverlapped:
subs r2,r2,#1
ldrb r3,[r0,r2]
strb r3,[r1,r2]
bne .Loverlapped
{$if defined(cpuarmv3) or defined(cpuarmv4)}
mov pc,lr
{$else}
bx lr
{$endif}
.Lnooverlap:
// less then 16 bytes to copy?
cmp r2,#8
// yes, the forget about the whole optimizations
// and do a bytewise copy
blt .Lbyteloop
subs r3, r1, r0 // if (dest > source) and
cmphi r2, r3 // (count > dest - src) then
bhi .Loverlapped // DoReverseByteCopy;
// both aligned?
orr r3,r0,r1
tst r3,#3
cmp r2,#8 // if (count < 8) then
blt .Lbyteloop // DoForwardByteCopy;
// Any way to avoid the above jump and fuse the next two instructions?
tst r0, #3 // if (source and 3) <> 0 or
tsteq r1, #3 // (dest and 3) <> 0 then
bne .Lbyteloop // DoForwardByteCopy;
bne .Lbyteloop
(*
// yes, then align
// alignment to 4 byte boundries is enough
ldrb ip,[r0],#1
sub r2,r2,#1
stb ip,[r1],#1
tst r3,#2
bne .Ldifferentaligned
ldrh ip,[r0],#2
sub r2,r2,#2
sth ip,[r1],#2
.Ldifferentaligned
// qword aligned?
orrs r3,r0,r1
tst r3,#7
bne .Ldwordloop
*)
pld [r0,#32]
pld [r0,#32]
.Ldwordloop:
sub r2,r2,#4
ldr r3,[r0],#4
ldmia r0!, {r3, ip}
// preload
pld [r0,#64]
cmp r2,#4
str r3,[r1],#4
bcs .Ldwordloop
cmp r2,#0
pld [r0,#64]
sub r2,r2,#8
cmp r2, #8
stmia r1!, {r3, ip}
bge .Ldwordloop
cmp r2,#0
{$if defined(cpuarmv3) or defined(cpuarmv4)}
moveq pc,lr
{$else}
@@ -295,6 +260,11 @@ asm
{$else}
bx lr
{$endif}
.Loverlapped:
subs r2,r2,#1
ldrb r3,[r0,r2]
strb r3,[r1,r2]
bne .Loverlapped
end;
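
For reference, the new fast path above copies two 32-bit registers per
ldmia/stmia pair and only drops to the bytewise tail once fewer than 8 bytes
remain. A rough Free Pascal sketch of that loop structure, assuming the
overlap and alignment checks have already passed (procedure and variable
names are illustrative only):

{ Word-pair copy: moves 8 bytes per iteration, like ldmia/stmia
  with r3 and ip, then finishes the 0..7 byte remainder bytewise. }
procedure CopyWordPairs(src, dst: Pointer; count: LongInt);
var
  s, d: PLongWord;
  sb, db: PByte;
begin
  s := PLongWord(src);
  d := PLongWord(dst);
  while count >= 8 do
  begin
    d^ := s^; Inc(s); Inc(d);   { first register of the pair }
    d^ := s^; Inc(s); Inc(d);   { second register of the pair }
    Dec(count, 8);              { sub r2,r2,#8 / cmp r2,#8 / bge .Ldwordloop }
  end;
  sb := PByte(s);
  db := PByte(d);
  while count > 0 do            { corresponds to the .Lbyteloop tail }
  begin
    db^ := sb^;
    Inc(sb); Inc(db);
    Dec(count);
  end;
end;
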
procedure Move_blended(const source;var dest;count:longint);assembler;nostackframe;
@@ -307,59 +277,24 @@ asm
bxle lr
{$endif}
// overlap?
cmp r1,r0
bls .Lnooverlap
add r3,r0,r2
cmp r3,r1
bls .Lnooverlap
// overlap, copy backward
.Loverlapped:
subs r2,r2,#1
ldrb r3,[r0,r2]
strb r3,[r1,r2]
bne .Loverlapped
{$if defined(cpuarmv3) or defined(cpuarmv4)}
mov pc,lr
{$else}
bx lr
{$endif}
.Lnooverlap:
// less then 16 bytes to copy?
cmp r2,#8
// yes, the forget about the whole optimizations
// and do a bytewise copy
blt .Lbyteloop
subs r3, r1, r0 // if (dest > source) and
cmphi r2, r3 // (count > dest - src) then
bhi .Loverlapped // DoReverseByteCopy;
// both aligned?
orr r3,r0,r1
tst r3,#3
cmp r2,#8 // if (count < 8) then
blt .Lbyteloop // DoForwardByteCopy;
// Any way to avoid the above jump and fuse the next two instructions?
tst r0, #3 // if (source and 3) <> 0 or
tsteq r1, #3 // (dest and 3) <> 0 then
bne .Lbyteloop // DoForwardByteCopy;
bne .Lbyteloop
(*
// yes, then align
// alignment to 4 byte boundries is enough
ldrb ip,[r0],#1
sub r2,r2,#1
stb ip,[r1],#1
tst r3,#2
bne .Ldifferentaligned
ldrh ip,[r0],#2
sub r2,r2,#2
sth ip,[r1],#2
.Ldifferentaligned
// qword aligned?
orrs r3,r0,r1
tst r3,#7
bne .Ldwordloop
*)
.Ldwordloop:
sub r2,r2,#4
ldr r3,[r0],#4
cmp r2,#4
str r3,[r1],#4
bcs .Ldwordloop
cmp r2,#0
ldmia r0!, {r3, ip}
sub r2,r2,#8
cmp r2, #8
stmia r1!, {r3, ip}
bge .Ldwordloop
cmp r2,#0
{$if defined(cpuarmv3) or defined(cpuarmv4)}
moveq pc,lr
{$else}
@@ -375,9 +310,13 @@ asm
{$else}
bx lr
{$endif}
.Loverlapped:
subs r2,r2,#1
ldrb r3,[r0,r2]
strb r3,[r1,r2]
bne .Loverlapped
end;
const
moveproc : pointer = @move_blended;