浏览代码

* thumb2: Optimize fillchar a bit more with a wider inner loop chunk size

git-svn-id: trunk@49100 -
Jeppe Johansen 4 年之前
父节点
当前提交
d712c64236
共有 1 个文件被更改,包括 60 次插入和 42 次删除
  1. 60 42
      rtl/arm/thumb2.inc

+ 60 - 42
rtl/arm/thumb2.inc

@@ -124,42 +124,70 @@ end;
 Procedure FillChar(var x;count:longint;value:byte);assembler;nostackframe; // Fills count bytes starting at x with value (Thumb-2). As used below: r0 = start address, r1 = count, r2 = value - presumably the register calling convention; confirm.
 asm
         // less than 0?
-        cmp r1,#0
-        it lt
-        movlt pc,lr
-        mov     r3,r0
-        cmp     r1,#8           // at least 8 bytes to do?
-        blt     .LFillchar2
-        orr r2,r2,r2,lsl #8
-        orr r2,r2,r2,lsl #16
+        cmp   r1,#0
+        it    le
+        movle pc,lr           // return immediately when r1 <= 0 (zero is now included)
+        mov   r3,r0           // r3 = running write pointer
+        cmp   r1,#8           // at least 8 bytes to do?
+        add   r1, r0          // r1 = end pointer (start + count); the blt below still tests the cmp above, so this add must not update the flags
+        blt   .LFillchar3     // fewer than 8 bytes: go straight to the byte tail
+        orr   r2,r2,r2,lsl #8  // copy the fill byte into bits 8..15 (assumes r2 arrives as a zero-extended byte - confirm)
+        orr   r2,r2,r2,lsl #16 // ...and into the upper halfword: r2 = fill byte repeated four times
 .LFillchar0:
-        tst     r3,#3           // aligned yet?
-        itt ne
-        strneb r2,[r3],#1
-        subne   r1,r1,#1
-        bne     .LFillchar0
+        ands    ip, r3, #3      // ip = offset of r3 within its word (0..3)
+        beq     .LAligned       // already word aligned
+
+        subs    r0, ip, #1      // number of strb below to skip: ip=1 -> 0, ip=2 -> 1, ip=3 -> 2
+        lsls    r0, r0, #1      // scale by 2; assumes each strb below assembles to a 2-byte encoding
+        add     pc, r0          // computed jump; relies on pc reading as this instruction + 4 in Thumb state
+        nop                     // padding so that pc + 0 lands on the first strb (assumes 2-byte add/nop encodings)
+
+        strb    r2,[r3,#2]      // executed only when ip = 1
+        strb    r2,[r3,#1]      // executed when ip = 1 or 2
+        strb    r2,[r3,#0]      // executed for ip = 1, 2 and 3
+        rsb     r0, ip, #4      // r0 = 4 - ip = number of bytes just written
+        add     r3, r0          // advance r3 to the next word boundary
+
+.LAligned:
         mov     ip,r2           // ip = second copy of the fill word
+        push    {r4,r5,lr}      // save r4/r5 before reusing them (lr is pushed and popped alongside)
+        mov     r4,r2           // r4 = third copy of the fill word
+        mov     r5,r2           // r5 = fourth copy: r2,r4,r5,ip together cover 16 bytes per stmia
 .LFillchar1:
-        cmp     r1,#8           // 8 bytes still to do?
-        blt     .LFillchar2
-        stmia   r3!,{r2,ip}
-        sub     r1,r1,#8
-        cmp     r1,#8           // 8 bytes still to do?
-        blt     .LFillchar2
-        stmia   r3!,{r2,ip}
-        sub     r1,r1,#8
-        cmp     r1,#8           // 8 bytes still to do?
-        blt     .LFillchar2
-        stmia   r3!,{r2,ip}
-        sub     r1,r1,#8
-        cmp     r1,#8           // 8 bytes still to do?
-        itt ge
-        stmgeia r3!,{r2,ip}
-        subge   r1,r1,#8
-        bge     .LFillchar1
+        // Use calculated jump to do fills of x*16 bytes
+        subs  r0, r1, r3        // r0 = bytes remaining (end - current)
+        cmp   r0, #128
+        bge   .LFillchar1_128   // 128 or more: run all eight stmia, then re-check
+        lsrs  r0, #4            // r0 = number of whole 16-byte chunks left (0..7)
+        beq   .LFillchar2       // no whole chunk left
+        rsb   r0, #8            // r0 = 8 - chunks = number of stmia to skip
+        lsls  r0, #2            // scale by 4; assumes each stmia below assembles to a 4-byte encoding
+        add   pc, r0            // computed jump into the stmia run (pc reads as this instruction + 4)
+        nop                     // padding so that pc + 0 lands on .LFillchar1_128 (assumes 2-byte add/nop encodings)
+.LFillchar1_128:
+        stmia r3!,{r2,r4,r5,ip} // each stmia writes 16 bytes and advances r3
+        stmia r3!,{r2,r4,r5,ip}
+        stmia r3!,{r2,r4,r5,ip}
+        stmia r3!,{r2,r4,r5,ip}
+        stmia r3!,{r2,r4,r5,ip}
+        stmia r3!,{r2,r4,r5,ip}
+        stmia r3!,{r2,r4,r5,ip}
+        stmia r3!,{r2,r4,r5,ip}
+        b .LFillchar1           // recompute the remaining length from r1 - r3
 .LFillchar2:
-        adr r0, .Ljumptable
-        tbb [r0, r1]
+        // Mop up any leftover 8 byte chunks. We are still aligned at this point
+        pop     {r4,r5,lr}      // restore the registers saved at .LAligned
+        sub     r0, r1, r3      // r0 = bytes remaining (fewer than 16 here)
+        cmp     r0, #8
+        it      ge
+        stmgeia r3!,{r2,ip}     // write one 8-byte chunk if at least 8 bytes remain
+.LFillchar3:
+        // Write any remaining bytes
+        subs r0, r3, r1         // r0 = current - end = -(bytes remaining)
+        adds r0, #7 // 7-(e-s) = 7+(s-e): number of strb below to skip
+        lsls r0, #1             // scale by 2; assumes each strb below assembles to a 2-byte encoding
+        add  pc, r0             // computed jump; with 0 bytes remaining this skips all seven strb
+        nop                     // padding so that pc + 0 lands on the first strb (assumes 2-byte add/nop encodings)
 
         strb r2,[r3,#6]
         strb r2,[r3,#5]
@@ -168,16 +196,6 @@ asm
         strb r2,[r3,#2]
         strb r2,[r3,#1]
         strb r2,[r3,#0]
-        mov pc,lr
-.Ljumptable:
-        .byte 7
-        .byte 6
-        .byte 5
-        .byte 4
-        .byte 3
-        .byte 2
-        .byte 1
-        .byte 0
 end; // NOTE(review): the new side has no explicit return after the last strb - presumably the compiler emits the return for assembler routines; confirm
 {$endif FPC_SYSTEM_HAS_FILLCHAR}