Small optimizations to FillChar for ARM

The new version is better optimized for the "common case".

We assume most of the data will be aligned; that's why the unaligned
case has been moved to the end of the function, so the aligned path
stays more cache- and pipeline-friendly.
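
To illustrate the restructuring, here is a rough C sketch of the new control
flow. This is not the FPC source: the name fill_sketch and the exact types are
made up for illustration. The unaligned head is peeled off up front and jumps
back into the aligned path, so an already-aligned destination runs straight
through the hot path.

#include <stddef.h>
#include <stdint.h>

static void fill_sketch(unsigned char *dst, ptrdiff_t count, unsigned char value)
{
    if (count <= 0)
        return;                                  /* the movle/bxle early exit */

    uint32_t pattern = value;
    pattern |= pattern << 8;
    pattern |= pattern << 16;                    /* replicate the byte into a word */

    /* Unaligned start: at most 3 single-byte stores, as in .LFillchar_do_align */
    while (((uintptr_t)dst & 3) != 0) {
        *dst++ = value;
        if (--count == 0)
            return;
    }

    /* Aligned fast path: 8 bytes at a time (the unrolled variant is
       sketched after the next paragraph) */
    while (count >= 8) {
        ((uint32_t *)dst)[0] = pattern;
        ((uint32_t *)dst)[1] = pattern;
        dst += 8;
        count -= 8;
    }

    /* Tail: the remaining 0..7 bytes, mirroring the tst #4 / #2 / #1 stores */
    if (count & 4) { *(uint32_t *)dst = pattern; dst += 4; }
    if (count & 2) { *(uint16_t *)dst = (uint16_t)pattern; dst += 2; }
    if (count & 1) { *dst = value; }
}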

I've also reduced the unrolling of the block transfer loop, because for
large blocks we'll most likely hit the write-buffer limit anyway; see the
sketch below.
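
For comparison, a minimal C sketch of the new block loop, assuming a
4-byte-aligned destination and the replicated pattern word from the sketch
above (fill_blocks is a made-up name): the old code stored four 8-byte pairs
(32 bytes) per iteration, the new code at most two (16 bytes), with the second
store predicated like the stmplia/subpls pair.

#include <stddef.h>
#include <stdint.h>

/* Returns the number of bytes (0..7) left over for the tail code. */
static size_t fill_blocks(uint32_t *dst, size_t count, uint32_t pattern)
{
    while (count >= 8) {
        dst[0] = pattern;                        /* stmia r3!,{r2,ip}        */
        dst[1] = pattern;
        dst += 2;
        count -= 8;
        if (count >= 8) {                        /* predicated second store: */
            dst[0] = pattern;                    /* stmplia r3!,{r2,ip}      */
            dst[1] = pattern;                    /* subpls r1,r1,#8          */
            dst += 2;
            count -= 8;
        }
    }
    return count;
}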

I did some measurements. The new routine is a bit slower for fewer than
8 bytes, but beats the old one by 10-15% for 8 bytes and up.
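
The commit does not include benchmark code; the following hypothetical C
harness (names, sizes, and iteration counts made up, reusing fill_sketch from
the earlier sketch) only shows the shape of such a measurement: fill a small
buffer repeatedly at sizes around the 8-byte threshold and compare throughput.

#include <stddef.h>
#include <stdio.h>
#include <time.h>

#define ITERATIONS 1000000L

int main(void)
{
    static unsigned char buf[256];
    const ptrdiff_t sizes[] = { 4, 7, 8, 16, 64, 256 };

    for (size_t i = 0; i < sizeof sizes / sizeof sizes[0]; i++) {
        clock_t start = clock();
        for (long n = 0; n < ITERATIONS; n++)
            fill_sketch(buf, sizes[i], 0xAA);    /* routine under test */
        double secs = (double)(clock() - start) / CLOCKS_PER_SEC;
        printf("%4td bytes: %.1f MB/s\n",
               sizes[i], (double)sizes[i] * ITERATIONS / secs / 1e6);
    }
    /* A real harness would keep the compiler from optimizing the fills away,
       e.g. by consuming buf afterwards. */
    return buf[0] & 1;
}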

git-svn-id: trunk@21760 -
masta 2012-07-02 23:54:19 +00:00
parent 37b8cd1b7a
commit 64c122100f


@@ -147,60 +147,67 @@ asm
   // less than 0?
   cmp r1,#0
 {$if defined(cpuarmv3) or defined(cpuarmv4)}
-  movlt pc,lr
+  movle pc,lr
 {$else}
-  bxlt lr
+  bxle lr
 {$endif}
   mov r3,r0
-  cmp r1,#8 // at least 8 bytes to do?
-  blt .LFillchar2
   orr r2,r2,r2,lsl #8
   orr r2,r2,r2,lsl #16
-.LFillchar0:
-  tst r3,#3 // aligned yet?
-  strneb r2,[r3],#1
-  subne r1,r1,#1
-  bne .LFillchar0
+  tst r3, #3 // Aligned?
+  bne .LFillchar_do_align
+.LFillchar_is_aligned:
+  subs r1,r1,#8
+  bmi .LFillchar_less_than_8bytes
   mov ip,r2
-.LFillchar1:
-  cmp r1,#8 // 8 bytes still to do?
-  blt .LFillchar2
-  stmia r3!,{r2,ip}
-  sub r1,r1,#8
-  cmp r1,#8 // 8 bytes still to do?
-  blt .LFillchar2
-  stmia r3!,{r2,ip}
-  sub r1,r1,#8
-  cmp r1,#8 // 8 bytes still to do?
-  blt .LFillchar2
-  stmia r3!,{r2,ip}
-  sub r1,r1,#8
-  cmp r1,#8 // 8 bytes still to do?
-  stmgeia r3!,{r2,ip}
-  subge r1,r1,#8
-  bge .LFillchar1
-.LFillchar2:
-  movs r1,r1 // anything left?
+.LFillchar_at_least_8bytes:
+  // Do 16 bytes per loop
+  // More unrolling is uncessary, as we'll just stall on the write buffers
+  stmia r3!,{r2,ip}
+  subs r1,r1,#8
+  stmplia r3!,{r2,ip}
+  subpls r1,r1,#8
+  bpl .LFillchar_at_least_8bytes
+.LFillchar_less_than_8bytes:
+  // Do the rest
+  adds r1, r1, #8
 {$if defined(cpuarmv3) or defined(cpuarmv4)}
   moveq pc,lr
 {$else}
   bxeq lr
 {$endif}
-  rsb r1,r1,#7
-  add pc,pc,r1,lsl #2
-  mov r0,r0
-  strb r2,[r3],#1
-  strb r2,[r3],#1
-  strb r2,[r3],#1
-  strb r2,[r3],#1
-  strb r2,[r3],#1
-  strb r2,[r3],#1
-  strb r2,[r3],#1
+  tst r1, #4
+  strne r2,[r3],#4
+  tst r1, #2
+  strneh r2,[r3],#2
+  tst r1, #1
+  strneb r2,[r3],#1
 {$if defined(cpuarmv3) or defined(cpuarmv4)}
   mov pc,lr
 {$else}
   bx lr
 {$endif}
+  // Special case for unaligned start
+  // We make a maximum of 3 loops here
+.LFillchar_do_align:
+  strb r2,[r3],#1
+  subs r1, r1, #1
+{$if defined(cpuarmv3) or defined(cpuarmv4)}
+  moveq pc,lr
+{$else}
+  bxeq lr
+{$endif}
+  tst r3,#3
+  bne .LFillchar_do_align
+  b .LFillchar_is_aligned
 end;
 {$endif FPC_SYSTEM_HAS_FILLCHAR}