mirror of
https://gitlab.com/freepascal.org/fpc/source.git
synced 2025-04-10 12:38:36 +02:00
Small optimizations to FillChar for ARM
The new version is better optimized for the "common case". We assume most of the data will be aligned; that's why the unaligned case has been moved to the end of the function, so the aligned case is more cache- and pipeline-friendly. I've also reduced the loop unrolling for the block transfer loop, because for large blocks we'll most likely hit the write buffer limit anyway. I did some measurements: the new routine is a bit slower for fewer than 8 bytes, but beats the old one by 10-15% for 8 bytes or more. git-svn-id: trunk@21760 -
This commit is contained in:
parent
37b8cd1b7a
commit
64c122100f
@ -145,62 +145,69 @@ end;
|
||||
{ Fills `count` bytes starting at `x` with `value`; does nothing when
  count <= 0.

  Register usage, as the code below relies on it:
    r0  address of x on entry (left untouched)
    r1  count on entry, then the number of bytes still to write
    r2  value on entry, then the fill byte replicated into all four bytes
        (assumes bits 8..31 of r2 are clear on entry - TODO confirm against
        the calling convention)
    r3  running write pointer
    ip  second copy of the fill word, so one stmia stores 8 bytes

  Layout: the word-aligned path is the fall-through path; the byte-wise
  alignment fix-up for an unaligned start lives at the end of the routine. }
Procedure FillChar(var x;count:longint;value:byte);assembler;nostackframe;
asm
        // nothing to do for count <= 0
        cmp     r1,#0
{$if defined(cpuarmv3) or defined(cpuarmv4)}
        movle   pc,lr
{$else}
        bxle    lr
{$endif}
        mov     r3,r0

        // replicate the fill byte into all four bytes of r2
        orr     r2,r2,r2,lsl #8
        orr     r2,r2,r2,lsl #16

        tst     r3,#3                   // word aligned?
        bne     .LFillchar_do_align

.LFillchar_is_aligned:
        // From here on r1 is biased by -8: r1 >= 0 means "at least 8 bytes
        // still to write".
        subs    r1,r1,#8
        bmi     .LFillchar_less_than_8bytes

        mov     ip,r2
.LFillchar_at_least_8bytes:
        // Do 16 bytes per loop
        // More unrolling is unnecessary, as we'll just stall on the write buffers
        stmia   r3!,{r2,ip}
        subs    r1,r1,#8
        stmplia r3!,{r2,ip}             // second 8 bytes only if still >= 8 left
        subpls  r1,r1,#8
        bpl     .LFillchar_at_least_8bytes

.LFillchar_less_than_8bytes:
        // Undo the bias: r1 = 0..7 bytes left, r3 is still word aligned
        adds    r1,r1,#8

{$if defined(cpuarmv3) or defined(cpuarmv4)}
        moveq   pc,lr
{$else}
        bxeq    lr
{$endif}

        // Write the remainder according to the bits of r1: 4, then 2, then 1
        tst     r1,#4
        strne   r2,[r3],#4
        tst     r1,#2
{$if defined(cpuarmv3)}
        // ARMv3 has no halfword store, so write the two bytes separately
        strneb  r2,[r3],#1
        strneb  r2,[r3],#1
{$else}
        strneh  r2,[r3],#2
{$endif}
        tst     r1,#1
        strneb  r2,[r3],#1
{$if defined(cpuarmv3) or defined(cpuarmv4)}
        mov     pc,lr
{$else}
        bx      lr
{$endif}

        // Special case for unaligned start
        // We make a maximum of 3 loops here
.LFillchar_do_align:
        strb    r2,[r3],#1
        subs    r1,r1,#1
        // count exhausted before reaching alignment: done
{$if defined(cpuarmv3) or defined(cpuarmv4)}
        moveq   pc,lr
{$else}
        bxeq    lr
{$endif}
        tst     r3,#3
        bne     .LFillchar_do_align
        b       .LFillchar_is_aligned
end;
|
||||
{$endif FPC_SYSTEM_HAS_FILLCHAR}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user