Small optimizations to FillChar for ARM

The new version is better optimized for the "common case".

We assume most of the data will be aligned; that's why the unaligned
case has been moved to the end of the function, so the aligned path
stays more cache- and pipeline-friendly.
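
To illustrate the restructuring, here is a rough C sketch of the new control
flow. This is not the FPC source: the name fill_sketch and the exact types are
made up for illustration. The unaligned head is peeled off up front and jumps
back into the aligned path, so an already-aligned destination runs straight
through the hot path.

#include <stddef.h>
#include <stdint.h>

static void fill_sketch(unsigned char *dst, ptrdiff_t count, unsigned char value)
{
    if (count <= 0)
        return;                                  /* the movle/bxle early exit */

    uint32_t pattern = value;
    pattern |= pattern << 8;
    pattern |= pattern << 16;                    /* replicate the byte into a word */

    /* Unaligned start: at most 3 single-byte stores, as in .LFillchar_do_align */
    while (((uintptr_t)dst & 3) != 0) {
        *dst++ = value;
        if (--count == 0)
            return;
    }

    /* Aligned fast path: 8 bytes at a time (the unrolled variant is
       sketched after the next paragraph) */
    while (count >= 8) {
        ((uint32_t *)dst)[0] = pattern;
        ((uint32_t *)dst)[1] = pattern;
        dst += 8;
        count -= 8;
    }

    /* Tail: the remaining 0..7 bytes, mirroring the tst #4 / #2 / #1 stores */
    if (count & 4) { *(uint32_t *)dst = pattern; dst += 4; }
    if (count & 2) { *(uint16_t *)dst = (uint16_t)pattern; dst += 2; }
    if (count & 1) { *dst = value; }
}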

I've also reduced the unrolling of the block transfer loop, because for
large blocks we'll most likely hit the write-buffer limit anyway; see the
sketch below.
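
For comparison, a minimal C sketch of the new block loop, assuming a
4-byte-aligned destination and the replicated pattern word from the sketch
above (fill_blocks is a made-up name): the old code stored four 8-byte pairs
(32 bytes) per iteration, the new code at most two (16 bytes), with the second
store predicated like the stmplia/subpls pair.

#include <stddef.h>
#include <stdint.h>

/* Returns the number of bytes (0..7) left over for the tail code. */
static size_t fill_blocks(uint32_t *dst, size_t count, uint32_t pattern)
{
    while (count >= 8) {
        dst[0] = pattern;                        /* stmia r3!,{r2,ip}        */
        dst[1] = pattern;
        dst += 2;
        count -= 8;
        if (count >= 8) {                        /* predicated second store: */
            dst[0] = pattern;                    /* stmplia r3!,{r2,ip}      */
            dst[1] = pattern;                    /* subpls r1,r1,#8          */
            dst += 2;
            count -= 8;
        }
    }
    return count;
}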

I did some measurements. The new routine is a bit slower for fewer than
8 bytes, but beats the old one by 10-15% for 8 bytes and up.
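
The commit does not include benchmark code; the following hypothetical C
harness (names, sizes, and iteration counts made up, reusing fill_sketch from
the earlier sketch) only shows the shape of such a measurement: fill a small
buffer repeatedly at sizes around the 8-byte threshold and compare throughput.

#include <stddef.h>
#include <stdio.h>
#include <time.h>

#define ITERATIONS 1000000L

int main(void)
{
    static unsigned char buf[256];
    const ptrdiff_t sizes[] = { 4, 7, 8, 16, 64, 256 };

    for (size_t i = 0; i < sizeof sizes / sizeof sizes[0]; i++) {
        clock_t start = clock();
        for (long n = 0; n < ITERATIONS; n++)
            fill_sketch(buf, sizes[i], 0xAA);    /* routine under test */
        double secs = (double)(clock() - start) / CLOCKS_PER_SEC;
        printf("%4td bytes: %.1f MB/s\n",
               sizes[i], (double)sizes[i] * ITERATIONS / secs / 1e6);
    }
    /* A real harness would keep the compiler from optimizing the fills away,
       e.g. by consuming buf afterwards. */
    return buf[0] & 1;
}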

git-svn-id: trunk@21760 -
masta 2012-07-02 23:54:19 +00:00
parent 37b8cd1b7a
commit 64c122100f


@@ -147,60 +147,67 @@ asm
   // less than 0?
   cmp r1,#0
 {$if defined(cpuarmv3) or defined(cpuarmv4)}
-  movlt pc,lr
+  movle pc,lr
 {$else}
-  bxlt lr
+  bxle lr
 {$endif}
   mov r3,r0
-  cmp r1,#8 // at least 8 bytes to do?
-  blt .LFillchar2
   orr r2,r2,r2,lsl #8
   orr r2,r2,r2,lsl #16
-.LFillchar0:
-  tst r3,#3 // aligned yet?
-  strneb r2,[r3],#1
-  subne r1,r1,#1
-  bne .LFillchar0
+  tst r3, #3 // Aligned?
+  bne .LFillchar_do_align
+.LFillchar_is_aligned:
+  subs r1,r1,#8
+  bmi .LFillchar_less_than_8bytes
   mov ip,r2
-.LFillchar1:
-  cmp r1,#8 // 8 bytes still to do?
-  blt .LFillchar2
-  stmia r3!,{r2,ip}
-  sub r1,r1,#8
-  cmp r1,#8 // 8 bytes still to do?
-  blt .LFillchar2
-  stmia r3!,{r2,ip}
-  sub r1,r1,#8
-  cmp r1,#8 // 8 bytes still to do?
-  blt .LFillchar2
-  stmia r3!,{r2,ip}
-  sub r1,r1,#8
-  cmp r1,#8 // 8 bytes still to do?
-  stmgeia r3!,{r2,ip}
-  subge r1,r1,#8
-  bge .LFillchar1
-.LFillchar2:
-  movs r1,r1 // anything left?
+.LFillchar_at_least_8bytes:
+  // Do 16 bytes per loop
+  // More unrolling is uncessary, as we'll just stall on the write buffers
+  stmia r3!,{r2,ip}
+  subs r1,r1,#8
+  stmplia r3!,{r2,ip}
+  subpls r1,r1,#8
+  bpl .LFillchar_at_least_8bytes
+.LFillchar_less_than_8bytes:
+  // Do the rest
+  adds r1, r1, #8
 {$if defined(cpuarmv3) or defined(cpuarmv4)}
   moveq pc,lr
 {$else}
   bxeq lr
 {$endif}
-  rsb r1,r1,#7
-  add pc,pc,r1,lsl #2
-  mov r0,r0
-  strb r2,[r3],#1
-  strb r2,[r3],#1
-  strb r2,[r3],#1
-  strb r2,[r3],#1
-  strb r2,[r3],#1
-  strb r2,[r3],#1
-  strb r2,[r3],#1
+  tst r1, #4
+  strne r2,[r3],#4
+  tst r1, #2
+  strneh r2,[r3],#2
+  tst r1, #1
+  strneb r2,[r3],#1
 {$if defined(cpuarmv3) or defined(cpuarmv4)}
   mov pc,lr
 {$else}
   bx lr
 {$endif}
+  // Special case for unaligned start
+  // We make a maximum of 3 loops here
+.LFillchar_do_align:
+  strb r2,[r3],#1
+  subs r1, r1, #1
+{$if defined(cpuarmv3) or defined(cpuarmv4)}
+  moveq pc,lr
+{$else}
+  bxeq lr
+{$endif}
+  tst r3,#3
+  bne .LFillchar_do_align
+  b .LFillchar_is_aligned
 end;
 {$endif FPC_SYSTEM_HAS_FILLCHAR}