mirror of
https://gitlab.com/freepascal.org/fpc/source.git
synced 2025-08-27 20:11:02 +02:00
Improved Move implementation on ARM
This adds some small improvements to Move_pld and Move_blended. 1.) Overlapping memory is handled as "unusual" and the code is placed at the end of the function for better icache/bpu performance. 2.) Fused the overlap check into 3 instructions with a single jump instead of 5 instructions with 2 jumps. 3.) Use ldmia/stmia with 2 registers instead of ldr/str for faster copying. 4.) Some code cleanup. git-svn-id: trunk@21992 -
This commit is contained in:
parent
f6deb01295
commit
2e0203b7a2
145
rtl/arm/arm.inc
145
rtl/arm/arm.inc
@ -224,62 +224,27 @@ asm
|
|||||||
bxle lr
|
bxle lr
|
||||||
{$endif}
|
{$endif}
|
||||||
// overlap?
|
// overlap?
|
||||||
cmp r1,r0
|
subs r3, r1, r0 // if (dest > source) and
|
||||||
bls .Lnooverlap
|
cmphi r2, r3 // (count > dest - src) then
|
||||||
add r3,r0,r2
|
bhi .Loverlapped // DoReverseByteCopy;
|
||||||
cmp r3,r1
|
|
||||||
bls .Lnooverlap
|
|
||||||
// overlap, copy backward
|
|
||||||
.Loverlapped:
|
|
||||||
subs r2,r2,#1
|
|
||||||
ldrb r3,[r0,r2]
|
|
||||||
strb r3,[r1,r2]
|
|
||||||
bne .Loverlapped
|
|
||||||
{$if defined(cpuarmv3) or defined(cpuarmv4)}
|
|
||||||
mov pc,lr
|
|
||||||
{$else}
|
|
||||||
bx lr
|
|
||||||
{$endif}
|
|
||||||
.Lnooverlap:
|
|
||||||
// less then 16 bytes to copy?
|
|
||||||
cmp r2,#8
|
|
||||||
// yes, the forget about the whole optimizations
|
|
||||||
// and do a bytewise copy
|
|
||||||
blt .Lbyteloop
|
|
||||||
|
|
||||||
// both aligned?
|
cmp r2,#8 // if (count < 8) then
|
||||||
orr r3,r0,r1
|
blt .Lbyteloop // DoForwardByteCopy;
|
||||||
tst r3,#3
|
// Any way to avoid the above jump and fuse the next two instructions?
|
||||||
|
tst r0, #3 // if (source and 3) <> 0 or
|
||||||
|
tsteq r1, #3 // (dest and 3) <> 0 then
|
||||||
|
bne .Lbyteloop // DoForwardByteCopy;
|
||||||
|
|
||||||
bne .Lbyteloop
|
pld [r0,#32]
|
||||||
(*
|
|
||||||
// yes, then align
|
|
||||||
// alignment to 4 byte boundries is enough
|
|
||||||
ldrb ip,[r0],#1
|
|
||||||
sub r2,r2,#1
|
|
||||||
stb ip,[r1],#1
|
|
||||||
tst r3,#2
|
|
||||||
bne .Ldifferentaligned
|
|
||||||
ldrh ip,[r0],#2
|
|
||||||
sub r2,r2,#2
|
|
||||||
sth ip,[r1],#2
|
|
||||||
|
|
||||||
.Ldifferentaligned
|
|
||||||
// qword aligned?
|
|
||||||
orrs r3,r0,r1
|
|
||||||
tst r3,#7
|
|
||||||
bne .Ldwordloop
|
|
||||||
*)
|
|
||||||
pld [r0,#32]
|
|
||||||
.Ldwordloop:
|
.Ldwordloop:
|
||||||
sub r2,r2,#4
|
ldmia r0!, {r3, ip}
|
||||||
ldr r3,[r0],#4
|
|
||||||
// preload
|
// preload
|
||||||
pld [r0,#64]
|
pld [r0,#64]
|
||||||
cmp r2,#4
|
sub r2,r2,#8
|
||||||
str r3,[r1],#4
|
cmp r2, #8
|
||||||
bcs .Ldwordloop
|
stmia r1!, {r3, ip}
|
||||||
cmp r2,#0
|
bge .Ldwordloop
|
||||||
|
cmp r2,#0
|
||||||
{$if defined(cpuarmv3) or defined(cpuarmv4)}
|
{$if defined(cpuarmv3) or defined(cpuarmv4)}
|
||||||
moveq pc,lr
|
moveq pc,lr
|
||||||
{$else}
|
{$else}
|
||||||
@ -295,6 +260,11 @@ asm
|
|||||||
{$else}
|
{$else}
|
||||||
bx lr
|
bx lr
|
||||||
{$endif}
|
{$endif}
|
||||||
|
.Loverlapped:
|
||||||
|
subs r2,r2,#1
|
||||||
|
ldrb r3,[r0,r2]
|
||||||
|
strb r3,[r1,r2]
|
||||||
|
bne .Loverlapped
|
||||||
end;
|
end;
|
||||||
|
|
||||||
procedure Move_blended(const source;var dest;count:longint);assembler;nostackframe;
|
procedure Move_blended(const source;var dest;count:longint);assembler;nostackframe;
|
||||||
@ -307,59 +277,24 @@ asm
|
|||||||
bxle lr
|
bxle lr
|
||||||
{$endif}
|
{$endif}
|
||||||
// overlap?
|
// overlap?
|
||||||
cmp r1,r0
|
subs r3, r1, r0 // if (dest > source) and
|
||||||
bls .Lnooverlap
|
cmphi r2, r3 // (count > dest - src) then
|
||||||
add r3,r0,r2
|
bhi .Loverlapped // DoReverseByteCopy;
|
||||||
cmp r3,r1
|
|
||||||
bls .Lnooverlap
|
|
||||||
// overlap, copy backward
|
|
||||||
.Loverlapped:
|
|
||||||
subs r2,r2,#1
|
|
||||||
ldrb r3,[r0,r2]
|
|
||||||
strb r3,[r1,r2]
|
|
||||||
bne .Loverlapped
|
|
||||||
{$if defined(cpuarmv3) or defined(cpuarmv4)}
|
|
||||||
mov pc,lr
|
|
||||||
{$else}
|
|
||||||
bx lr
|
|
||||||
{$endif}
|
|
||||||
.Lnooverlap:
|
|
||||||
// less then 16 bytes to copy?
|
|
||||||
cmp r2,#8
|
|
||||||
// yes, the forget about the whole optimizations
|
|
||||||
// and do a bytewise copy
|
|
||||||
blt .Lbyteloop
|
|
||||||
|
|
||||||
// both aligned?
|
cmp r2,#8 // if (count < 8) then
|
||||||
orr r3,r0,r1
|
blt .Lbyteloop // DoForwardByteCopy;
|
||||||
tst r3,#3
|
// Any way to avoid the above jump and fuse the next two instructions?
|
||||||
|
tst r0, #3 // if (source and 3) <> 0 or
|
||||||
|
tsteq r1, #3 // (dest and 3) <> 0 then
|
||||||
|
bne .Lbyteloop // DoForwardByteCopy;
|
||||||
|
|
||||||
bne .Lbyteloop
|
|
||||||
(*
|
|
||||||
// yes, then align
|
|
||||||
// alignment to 4 byte boundries is enough
|
|
||||||
ldrb ip,[r0],#1
|
|
||||||
sub r2,r2,#1
|
|
||||||
stb ip,[r1],#1
|
|
||||||
tst r3,#2
|
|
||||||
bne .Ldifferentaligned
|
|
||||||
ldrh ip,[r0],#2
|
|
||||||
sub r2,r2,#2
|
|
||||||
sth ip,[r1],#2
|
|
||||||
|
|
||||||
.Ldifferentaligned
|
|
||||||
// qword aligned?
|
|
||||||
orrs r3,r0,r1
|
|
||||||
tst r3,#7
|
|
||||||
bne .Ldwordloop
|
|
||||||
*)
|
|
||||||
.Ldwordloop:
|
.Ldwordloop:
|
||||||
sub r2,r2,#4
|
ldmia r0!, {r3, ip}
|
||||||
ldr r3,[r0],#4
|
sub r2,r2,#8
|
||||||
cmp r2,#4
|
cmp r2, #8
|
||||||
str r3,[r1],#4
|
stmia r1!, {r3, ip}
|
||||||
bcs .Ldwordloop
|
bge .Ldwordloop
|
||||||
cmp r2,#0
|
cmp r2,#0
|
||||||
{$if defined(cpuarmv3) or defined(cpuarmv4)}
|
{$if defined(cpuarmv3) or defined(cpuarmv4)}
|
||||||
moveq pc,lr
|
moveq pc,lr
|
||||||
{$else}
|
{$else}
|
||||||
@ -375,9 +310,13 @@ asm
|
|||||||
{$else}
|
{$else}
|
||||||
bx lr
|
bx lr
|
||||||
{$endif}
|
{$endif}
|
||||||
|
.Loverlapped:
|
||||||
|
subs r2,r2,#1
|
||||||
|
ldrb r3,[r0,r2]
|
||||||
|
strb r3,[r1,r2]
|
||||||
|
bne .Loverlapped
|
||||||
end;
|
end;
|
||||||
|
|
||||||
|
|
||||||
const
|
const
|
||||||
moveproc : pointer = @move_blended;
|
moveproc : pointer = @move_blended;
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user