mirror of
https://gitlab.com/freepascal.org/fpc/source.git
synced 2025-04-14 05:40:28 +02:00
Improved Move implementation on ARM
This adds some small improvements to Move_pld and Move_blended. 1.) Overlapping memory is handled as "unusual" and the code is placed at the end of the function for better icache/bpu performance. 2.) Fused the overlap check into 3 instructions with a single jump instead of 5 instructions with 2 jumps. 3.) Use ldmia/stmia with 2 registers instead of ldr/str for faster copying. 4.) Some code cleanup. git-svn-id: trunk@21992 -
This commit is contained in:
parent
f6deb01295
commit
2e0203b7a2
145
rtl/arm/arm.inc
145
rtl/arm/arm.inc
@ -224,62 +224,27 @@ asm
|
||||
bxle lr
|
||||
{$endif}
|
||||
// overlap?
|
||||
cmp r1,r0
|
||||
bls .Lnooverlap
|
||||
add r3,r0,r2
|
||||
cmp r3,r1
|
||||
bls .Lnooverlap
|
||||
// overlap, copy backward
|
||||
.Loverlapped:
|
||||
subs r2,r2,#1
|
||||
ldrb r3,[r0,r2]
|
||||
strb r3,[r1,r2]
|
||||
bne .Loverlapped
|
||||
{$if defined(cpuarmv3) or defined(cpuarmv4)}
|
||||
mov pc,lr
|
||||
{$else}
|
||||
bx lr
|
||||
{$endif}
|
||||
.Lnooverlap:
|
||||
// less then 16 bytes to copy?
|
||||
cmp r2,#8
|
||||
// yes, the forget about the whole optimizations
|
||||
// and do a bytewise copy
|
||||
blt .Lbyteloop
|
||||
subs r3, r1, r0 // if (dest > source) and
|
||||
cmphi r2, r3 // (count > dest - src) then
|
||||
bhi .Loverlapped // DoReverseByteCopy;
|
||||
|
||||
// both aligned?
|
||||
orr r3,r0,r1
|
||||
tst r3,#3
|
||||
cmp r2,#8 // if (count < 8) then
|
||||
blt .Lbyteloop // DoForwardByteCopy;
|
||||
// Any way to avoid the above jump and fuse the next two instructions?
|
||||
tst r0, #3 // if (source and 3) <> 0 or
|
||||
tsteq r1, #3 // (dest and 3) <> 0 then
|
||||
bne .Lbyteloop // DoForwardByteCopy;
|
||||
|
||||
bne .Lbyteloop
|
||||
(*
|
||||
// yes, then align
|
||||
// alignment to 4 byte boundries is enough
|
||||
ldrb ip,[r0],#1
|
||||
sub r2,r2,#1
|
||||
stb ip,[r1],#1
|
||||
tst r3,#2
|
||||
bne .Ldifferentaligned
|
||||
ldrh ip,[r0],#2
|
||||
sub r2,r2,#2
|
||||
sth ip,[r1],#2
|
||||
|
||||
.Ldifferentaligned
|
||||
// qword aligned?
|
||||
orrs r3,r0,r1
|
||||
tst r3,#7
|
||||
bne .Ldwordloop
|
||||
*)
|
||||
pld [r0,#32]
|
||||
pld [r0,#32]
|
||||
.Ldwordloop:
|
||||
sub r2,r2,#4
|
||||
ldr r3,[r0],#4
|
||||
ldmia r0!, {r3, ip}
|
||||
// preload
|
||||
pld [r0,#64]
|
||||
cmp r2,#4
|
||||
str r3,[r1],#4
|
||||
bcs .Ldwordloop
|
||||
cmp r2,#0
|
||||
pld [r0,#64]
|
||||
sub r2,r2,#8
|
||||
cmp r2, #8
|
||||
stmia r1!, {r3, ip}
|
||||
bge .Ldwordloop
|
||||
cmp r2,#0
|
||||
{$if defined(cpuarmv3) or defined(cpuarmv4)}
|
||||
moveq pc,lr
|
||||
{$else}
|
||||
@ -295,6 +260,11 @@ asm
|
||||
{$else}
|
||||
bx lr
|
||||
{$endif}
|
||||
.Loverlapped:
|
||||
subs r2,r2,#1
|
||||
ldrb r3,[r0,r2]
|
||||
strb r3,[r1,r2]
|
||||
bne .Loverlapped
|
||||
end;
|
||||
|
||||
procedure Move_blended(const source;var dest;count:longint);assembler;nostackframe;
|
||||
@ -307,59 +277,24 @@ asm
|
||||
bxle lr
|
||||
{$endif}
|
||||
// overlap?
|
||||
cmp r1,r0
|
||||
bls .Lnooverlap
|
||||
add r3,r0,r2
|
||||
cmp r3,r1
|
||||
bls .Lnooverlap
|
||||
// overlap, copy backward
|
||||
.Loverlapped:
|
||||
subs r2,r2,#1
|
||||
ldrb r3,[r0,r2]
|
||||
strb r3,[r1,r2]
|
||||
bne .Loverlapped
|
||||
{$if defined(cpuarmv3) or defined(cpuarmv4)}
|
||||
mov pc,lr
|
||||
{$else}
|
||||
bx lr
|
||||
{$endif}
|
||||
.Lnooverlap:
|
||||
// less then 16 bytes to copy?
|
||||
cmp r2,#8
|
||||
// yes, the forget about the whole optimizations
|
||||
// and do a bytewise copy
|
||||
blt .Lbyteloop
|
||||
subs r3, r1, r0 // if (dest > source) and
|
||||
cmphi r2, r3 // (count > dest - src) then
|
||||
bhi .Loverlapped // DoReverseByteCopy;
|
||||
|
||||
// both aligned?
|
||||
orr r3,r0,r1
|
||||
tst r3,#3
|
||||
cmp r2,#8 // if (count < 8) then
|
||||
blt .Lbyteloop // DoForwardByteCopy;
|
||||
// Any way to avoid the above jump and fuse the next two instructions?
|
||||
tst r0, #3 // if (source and 3) <> 0 or
|
||||
tsteq r1, #3 // (dest and 3) <> 0 then
|
||||
bne .Lbyteloop // DoForwardByteCopy;
|
||||
|
||||
bne .Lbyteloop
|
||||
(*
|
||||
// yes, then align
|
||||
// alignment to 4 byte boundries is enough
|
||||
ldrb ip,[r0],#1
|
||||
sub r2,r2,#1
|
||||
stb ip,[r1],#1
|
||||
tst r3,#2
|
||||
bne .Ldifferentaligned
|
||||
ldrh ip,[r0],#2
|
||||
sub r2,r2,#2
|
||||
sth ip,[r1],#2
|
||||
|
||||
.Ldifferentaligned
|
||||
// qword aligned?
|
||||
orrs r3,r0,r1
|
||||
tst r3,#7
|
||||
bne .Ldwordloop
|
||||
*)
|
||||
.Ldwordloop:
|
||||
sub r2,r2,#4
|
||||
ldr r3,[r0],#4
|
||||
cmp r2,#4
|
||||
str r3,[r1],#4
|
||||
bcs .Ldwordloop
|
||||
cmp r2,#0
|
||||
ldmia r0!, {r3, ip}
|
||||
sub r2,r2,#8
|
||||
cmp r2, #8
|
||||
stmia r1!, {r3, ip}
|
||||
bge .Ldwordloop
|
||||
cmp r2,#0
|
||||
{$if defined(cpuarmv3) or defined(cpuarmv4)}
|
||||
moveq pc,lr
|
||||
{$else}
|
||||
@ -375,9 +310,13 @@ asm
|
||||
{$else}
|
||||
bx lr
|
||||
{$endif}
|
||||
.Loverlapped:
|
||||
subs r2,r2,#1
|
||||
ldrb r3,[r0,r2]
|
||||
strb r3,[r1,r2]
|
||||
bne .Loverlapped
|
||||
end;
|
||||
|
||||
|
||||
const
|
||||
moveproc : pointer = @move_blended;
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user