Browse Source

* patch by Nico Erfurth:
Reorder unaligned Load sequence on ARM

The old version produced code like this:

ldrb rDEST, [rBASE]
ldrb rTemp, [rBASE, #1]
orr rDEST, rDEST, rTemp, lsl #8 (2 stall cycles)
ldrb rTemp, [rBASE, #2]
orr rDEST, rDEST, rTemp, lsl #16 (2 stall cycles)
ldrb rTemp, [rBASE, #3]
orr rDEST, rDEST, rTemp, lsl #24 (2 stall cycles)

This creates a lot of stall cycles on ARM implementations with load
delay slots, such as Marvell Kirkwood or Intel XScale. With the usual up to 2
stall cycles per load, this code requires a total of 13 cycles (7 instructions
+ 6 stall cycles) in the best case.

The new code uses a second temp register to avoid the stall cycles.

ldrb rDEST, [rBASE]
ldrb rTemp1, [rBASE, #1]
ldrb rTemp2, [rBASE, #2]
orr rDEST, rDEST, rTemp1, lsl #8
ldrb rTemp1, [rBASE, #3]
orr rDEST, rDEST, rTemp2, lsl #16
orr rDEST, rDEST, rTemp1, lsl #24 (1 stall cycle)

The rescheduling and the second register bring the total down to 8 cycles
(7 instructions + 1 stall cycle). If a later scheduling pass places an
independent instruction before the last orr, it can even go down to 7.

git-svn-id: trunk@21363 -

florian 13 years ago
parent
commit
c75486db89
1 changed files with 13 additions and 6 deletions
  1. 13 6
      compiler/arm/cgcpu.pas

+ 13 - 6
compiler/arm/cgcpu.pas

@@ -177,7 +177,7 @@ unit cgcpu;
 
 
 
 
     uses
     uses
-       globals,verbose,systems,cutils,
+       globals,verbose,systems,cutils,sysutils,
        aopt,aoptcpu,
        aopt,aoptcpu,
        fmodule,
        fmodule,
        symconst,symsym,
        symconst,symsym,
@@ -388,19 +388,26 @@ unit cgcpu;
                      end
                      end
                    else
                    else
                      begin
                      begin
+                       tmpreg2:=getintregister(list,OS_INT);
                        if target_info.endian=endian_big then
                        if target_info.endian=endian_big then
                          inc(usedtmpref.offset,3);
                          inc(usedtmpref.offset,3);
                        a_internal_load_ref_reg(list,OS_8,OS_8,usedtmpref,reg);
                        a_internal_load_ref_reg(list,OS_8,OS_8,usedtmpref,reg);
+
                        inc(usedtmpref.offset,dir);
                        inc(usedtmpref.offset,dir);
                        a_internal_load_ref_reg(list,OS_8,OS_8,usedtmpref,tmpreg);
                        a_internal_load_ref_reg(list,OS_8,OS_8,usedtmpref,tmpreg);
+
+                       inc(usedtmpref.offset,dir);
+                       a_internal_load_ref_reg(list,OS_8,OS_8,usedtmpref,tmpreg2);
+
                        so.shiftimm:=8;
                        so.shiftimm:=8;
                        list.concat(taicpu.op_reg_reg_reg_shifterop(A_ORR,reg,reg,tmpreg,so));
                        list.concat(taicpu.op_reg_reg_reg_shifterop(A_ORR,reg,reg,tmpreg,so));
+
                        inc(usedtmpref.offset,dir);
                        inc(usedtmpref.offset,dir);
                        a_internal_load_ref_reg(list,OS_8,OS_8,usedtmpref,tmpreg);
                        a_internal_load_ref_reg(list,OS_8,OS_8,usedtmpref,tmpreg);
+
                        so.shiftimm:=16;
                        so.shiftimm:=16;
-                       list.concat(taicpu.op_reg_reg_reg_shifterop(A_ORR,reg,reg,tmpreg,so));
-                       inc(usedtmpref.offset,dir);
-                       a_internal_load_ref_reg(list,OS_8,OS_8,usedtmpref,tmpreg);
+                       list.concat(taicpu.op_reg_reg_reg_shifterop(A_ORR,reg,reg,tmpreg2,so));
+
                        so.shiftimm:=24;
                        so.shiftimm:=24;
                        list.concat(taicpu.op_reg_reg_reg_shifterop(A_ORR,reg,reg,tmpreg,so));
                        list.concat(taicpu.op_reg_reg_reg_shifterop(A_ORR,reg,reg,tmpreg,so));
                      end;
                      end;
@@ -706,7 +713,7 @@ unit cgcpu;
             OP_SAR:
             OP_SAR:
               begin
               begin
                 if a>32 then
                 if a>32 then
-                  internalerror(200308295);
+                  internalerror(200308298);
                 if a<>0 then
                 if a<>0 then
                   begin
                   begin
                     shifterop_reset(so);
                     shifterop_reset(so);
@@ -1081,7 +1088,7 @@ unit cgcpu;
            OS_F32:
            OS_F32:
              oppostfix:=PF_None;
              oppostfix:=PF_None;
            else
            else
-             InternalError(200308295);
+             InternalError(200308299);
          end;
          end;
          if (ref.alignment in [1,2]) and (ref.alignment<tcgsize2size[tosize]) then
          if (ref.alignment in [1,2]) and (ref.alignment<tcgsize2size[tosize]) then
            begin
            begin