Small optimizations to FillChar for ARM

The new version is more optimized for the "common case". We assume most of the data will be aligned; that's why the unaligned case has been moved to the end of the function, so the aligned case is more cache- and pipeline-friendly. I've also reduced the loop unrolling for the block transfer loop, because for large blocks we'll most likely hit the write buffer limit anyway. I did some measurements: the new routine is a bit slower for fewer than 8 bytes, but beats the old one by 10-15% for 8 bytes or more. git-svn-id: trunk@21760 -
2025-04-10 12:38:36 +02:00 · 2012-07-02 23:54:19 +00:00 · 2012-07-02 23:54:19 +00:00 · 64c122100f
commit 64c122100f
parent 37b8cd1b7a
1 changed files with 47 additions and 40 deletions
--- a/rtl/arm/arm.inc
+++ b/rtl/arm/arm.inc
@ -145,62 +145,69 @@ end;
 Procedure FillChar(var x;count:longint;value:byte);assembler;nostackframe;
 asm
        // less than 0?
-        cmp r1,#0
+        cmp     r1,#0
 {$if defined(cpuarmv3) or defined(cpuarmv4)}
-        movlt pc,lr
+        movle   pc,lr
 {$else}
-        bxlt  lr
+        bxle    lr
 {$endif}
        mov     r3,r0
-        cmp     r1,#8           // at least 8 bytes to do?
-        blt     .LFillchar2
-        orr r2,r2,r2,lsl #8
-        orr r2,r2,r2,lsl #16
-.LFillchar0:
-        tst     r3,#3           // aligned yet?
-        strneb r2,[r3],#1
-        subne   r1,r1,#1
-        bne     .LFillchar0
+
+        orr     r2,r2,r2,lsl #8
+        orr     r2,r2,r2,lsl #16
+
+        tst     r3, #3  // Aligned?
+        bne     .LFillchar_do_align
+
+.LFillchar_is_aligned:
+        subs    r1,r1,#8
+        bmi     .LFillchar_less_than_8bytes
+
        mov     ip,r2
-.LFillchar1:
-        cmp     r1,#8           // 8 bytes still to do?
-        blt     .LFillchar2
+.LFillchar_at_least_8bytes:
+        // Do 16 bytes per loop
+        // More unrolling is uncessary, as we'll just stall on the write buffers
        stmia   r3!,{r2,ip}
-        sub     r1,r1,#8
-        cmp     r1,#8           // 8 bytes still to do?
-        blt     .LFillchar2
-        stmia   r3!,{r2,ip}
-        sub     r1,r1,#8
-        cmp     r1,#8           // 8 bytes still to do?
-        blt     .LFillchar2
-        stmia   r3!,{r2,ip}
-        sub     r1,r1,#8
-        cmp     r1,#8           // 8 bytes still to do?
-        stmgeia r3!,{r2,ip}
-        subge   r1,r1,#8
-        bge     .LFillchar1
-.LFillchar2:
-        movs r1,r1              // anything left?
+        subs    r1,r1,#8
+        stmplia r3!,{r2,ip}
+        subpls  r1,r1,#8
+        bpl     .LFillchar_at_least_8bytes
+
+.LFillchar_less_than_8bytes:
+        // Do the rest
+        adds    r1, r1, #8
+
 {$if defined(cpuarmv3) or defined(cpuarmv4)}
        moveq   pc,lr
 {$else}
        bxeq    lr
 {$endif}
-        rsb     r1,r1,#7
-        add     pc,pc,r1,lsl #2
-        mov     r0,r0
-        strb r2,[r3],#1
-        strb r2,[r3],#1
-        strb r2,[r3],#1
-        strb r2,[r3],#1
-        strb r2,[r3],#1
-        strb r2,[r3],#1
-        strb r2,[r3],#1
+
+        tst     r1, #4
+        strne   r2,[r3],#4
+        tst     r1, #2
+        strneh  r2,[r3],#2
+        tst     r1, #1
+        strneb  r2,[r3],#1
 {$if defined(cpuarmv3) or defined(cpuarmv4)}
        mov pc,lr
 {$else}
        bx  lr
 {$endif}
+
+// Special case for unaligned start
+// We make a maximum of 3 loops here
+.LFillchar_do_align:
+        strb r2,[r3],#1
+        subs r1, r1, #1
+{$if defined(cpuarmv3) or defined(cpuarmv4)}
+        moveq pc,lr
+{$else}
+        bxeq  lr
+{$endif}
+        tst r3,#3
+        bne .LFillchar_do_align
+        b .LFillchar_is_aligned
 end;
 {$endif FPC_SYSTEM_HAS_FILLCHAR}