Browse Source

--- Merging r21285 into '.':
U compiler/arm/raarmgas.pas
U compiler/arm/armatts.inc
U compiler/arm/armins.dat
U compiler/arm/armop.inc
U compiler/arm/armatt.inc
--- Merging r21363 into '.':
U compiler/arm/cgcpu.pas
--- Merging r21505 into '.':
G compiler/arm/cgcpu.pas

# revisions: 21285,21363,21505
r21285 | jonas | 2012-05-13 14:14:26 +0200 (Sun, 13 May 2012) | 1 line
Changed paths:
M /trunk/compiler/arm/armatt.inc
M /trunk/compiler/arm/armatts.inc
M /trunk/compiler/arm/armins.dat
M /trunk/compiler/arm/armop.inc
M /trunk/compiler/arm/raarmgas.pas

+ support for REV and several other ARMv6/ARMv6T2+ opcodes (mantis #21888)
r21363 | florian | 2012-05-22 21:09:20 +0200 (Tue, 22 May 2012) | 33 lines
Changed paths:
M /trunk/compiler/arm/cgcpu.pas

* patch by Nico Erfurth:
Reorder unaligned Load sequence on ARM

The old version produced code like this:

ldrb rDEST, [rBASE]
ldrb rTemp, [rBASE, #1]
orr rDEST, rDEST, rTEMP lsl #8 (2 stall cycles)
ldrb rTemp, [rBASE, #2]
orr rDEST, rDEST, rTEMP lsl #16 (2 stall cycles)
ldrb rTemp, [rBASE, #3]
orr rDEST, rDEST, rTEMP lsl #24 (2 stall cycles)

This creates a lot of stall cycles on ARM implementations with load
delay slots, such as Marvell Kirkwood or Intel XScale. With the usual up to 2
stall cycles, this code requires a total of 13 cycles (7 instructions + 6 stall
cycles) in the best case.

The new code uses a second temp register to avoid the stall cycles.

ldrb rDEST, [rBASE]
ldrb rTemp1, [rBASE, #1]
ldrb rTemp2, [rBASE, #2]
orr rDEST, rDEST, rTEMP1 lsl #8
ldrb rTemp1, [rBASE, #3]
orr rDEST, rDEST, rTEMP2 lsl #16
orr rDEST, rDEST, rTEMP1 lsl #24 (1 stall cycle)

The rescheduling and the second register bring the total down to 8 cycles.
If a later pass reschedules the last orr, it can even go
down to 7.
r21505 | florian | 2012-06-06 21:42:26 +0200 (Wed, 06 Jun 2012) | 8 lines
Changed paths:
M /trunk/compiler/arm/cgcpu.pas

* patch by Nico Erfurth: Support BX for function returns on ARMv5+
BX is supported from ARMv4T onwards, but I don't have an ARMv4T device to
test it on.

Using BX instead of mov pc,lr allows for better pipeline utilization
by enabling the CPU's branch predictor to work properly.

git-svn-id: branches/fixes_2_6@22518 -

marco 13 years ago
parent
commit
e372455499
6 changed files with 433 additions and 35 deletions
  1. 91 2
      compiler/arm/armatt.inc
  2. 89 0
      compiler/arm/armatts.inc
  3. 126 4
      compiler/arm/armins.dat
  4. 91 2
      compiler/arm/armop.inc
  5. 15 8
      compiler/arm/cgcpu.pas
  6. 21 19
      compiler/arm/raarmgas.pas

+ 91 - 2
compiler/arm/armatt.inc

@@ -179,6 +179,97 @@
 'fuitos',
 'fmdrr',
 'fmrrd',
+'bfc',
+'bfi',
+'clrex',
+'ldrex',
+'ldrexb',
+'ldrexd',
+'ldrexh',
+'mls',
+'pkh',
+'pli',
+'qadd16',
+'qadd8',
+'qasx',
+'qsax',
+'qsub16',
+'qsub8',
+'rbit',
+'rev',
+'rev16',
+'revsh',
+'sadd16',
+'sadd8',
+'sasx',
+'sbfx',
+'sel',
+'setend',
+'sev',
+'shadd16',
+'shadd8',
+'shasx',
+'shsax',
+'shsub16',
+'shsub8',
+'smlad',
+'smlald',
+'smlsd',
+'smlsld',
+'smmla',
+'smmls',
+'smmul',
+'smuad',
+'smusd',
+'srs',
+'ssat',
+'ssat16',
+'ssax',
+'ssub16',
+'ssub8',
+'strex',
+'strexb',
+'strexd',
+'strexh',
+'sxtab',
+'sxtab16',
+'sxtah',
+'sxtb',
+'sxtb16',
+'sxth',
+'uadd16',
+'uadd8',
+'uasx',
+'ubfx',
+'uhadd16',
+'uhadd8',
+'uhasx',
+'uhsax',
+'uhsub16',
+'uhsub8',
+'umaal',
+'uqadd16',
+'uqadd8',
+'uqasx',
+'uqsax',
+'uqsub16',
+'uqsub8',
+'uqsad8',
+'uqsada8',
+'usat',
+'usat16',
+'usax',
+'usub16',
+'usub8',
+'uxtab',
+'uxtab16',
+'uxtah',
+'uxtb',
+'uxtb16',
+'uxth',
+'wfe',
+'wfi',
+'yield',
 'asr',
 'lsr',
 'lsl',
@@ -186,8 +277,6 @@
 'sdiv',
 'udiv',
 'movt',
-'ldrex',
-'strex',
 'it',
 'ite',
 'itt',

+ 89 - 0
compiler/arm/armatts.inc

@@ -202,5 +202,94 @@ attsufNONE,
 attsufNONE,
 attsufNONE,
 attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
+attsufNONE,
 attsufNONE
 );

+ 126 - 4
compiler/arm/armins.dat

@@ -574,6 +574,132 @@ reg32,reg32,reg32,reg32  \x16\x00\x80\x90		 ARM7
 
 [FMRRDcc]
 
+; ARMv6
+
+[BFCcc]
+
+[BFIcc]
+
+[CLREX]
+
+[LDREXcc]
+[LDREXBcc]
+[LDREXDcc]
+[LDREXHcc]
+
+[MLScc]
+
+[PKHcc]
+
+[PLI]
+
+[QADD16cc]
+[QADD8cc]
+[QASXcc]
+[QSAXcc]
+[QSUB16cc]
+[QSUB8cc]
+
+[RBITcc]
+
+[REVcc]
+[REV16cc]
+[REVSHcc]
+
+[SADD16cc]
+[SADD8cc]
+[SASXcc]
+
+[SBFXcc]
+
+[SELcc]
+
+[SETEND]
+
+[SEVcc]
+
+[SHADD16cc]
+[SHADD8cc]
+[SHASXcc]
+[SHSAXcc]
+[SHSUB16cc]
+[SHSUB8cc]
+
+[SMLADcc]
+[SMLALDcc]
+[SMLSDcc]
+[SMLSLDcc]
+[SMMLAcc]
+[SMMLScc]
+[SMMULcc]
+[SMUADcc]
+[SMUSDcc]
+
+[SRScc]
+
+[SSATcc]
+[SSAT16cc]
+[SSAXcc]
+
+[SSUB16cc]
+[SSUB8cc]
+
+[STREXcc]
+[STREXBcc]
+[STREXDcc]
+[STREXHcc]
+
+[SXTABcc]
+[SXTAB16cc]
+[SXTAHcc]
+[SXTBcc]
+[SXTB16cc]
+[SXTHcc]
+
+[UADD16cc]
+[UADD8cc]
+[UASXcc]
+
+[UBFXcc]
+
+[UHADD16cc]
+[UHADD8cc]
+[UHASXcc]
+[UHSAXcc]
+[UHSUB16cc]
+[UHSUB8cc]
+
+[UMAALcc]
+
+[UQADD16cc]
+[UQADD8]
+[UQASXcc]
+[UQSAXcc]
+
+[UQSUB16cc]
+[UQSUB8cc]
+[UQSAD8cc]
+[UQSADA8cc]
+
+[USATcc]
+[USAT16cc]
+[USAXcc]
+
+[USUB16cc]
+[USUB8cc]
+
+[UXTABcc]
+[UXTAB16cc]
+[UXTAHcc]
+
+[UXTBcc]
+[UXTB16cc]
+[UXTHcc]
+
+[WFEcc]
+[WFIcc]
+[YIELDcc]
+
 ; Thumb-2
 
 [ASRcc]
@@ -590,10 +716,6 @@ reg32,reg32,reg32,reg32  \x16\x00\x80\x90		 ARM7
 
 [MOVTcc]
 
-[LDREXcc]
-
-[STREXcc]
-
 [IT]
 
 [ITE]

+ 91 - 2
compiler/arm/armop.inc

@@ -179,6 +179,97 @@ A_FUITOD,
 A_FUITOS,
 A_FMDRR,
 A_FMRRD,
+A_BFC,
+A_BFI,
+A_CLREX,
+A_LDREX,
+A_LDREXB,
+A_LDREXD,
+A_LDREXH,
+A_MLS,
+A_PKH,
+A_PLI,
+A_QADD16,
+A_QADD8,
+A_QASX,
+A_QSAX,
+A_QSUB16,
+A_QSUB8,
+A_RBIT,
+A_REV,
+A_REV16,
+A_REVSH,
+A_SADD16,
+A_SADD8,
+A_SASX,
+A_SBFX,
+A_SEL,
+A_SETEND,
+A_SEV,
+A_SHADD16,
+A_SHADD8,
+A_SHASX,
+A_SHSAX,
+A_SHSUB16,
+A_SHSUB8,
+A_SMLAD,
+A_SMLALD,
+A_SMLSD,
+A_SMLSLD,
+A_SMMLA,
+A_SMMLS,
+A_SMMUL,
+A_SMUAD,
+A_SMUSD,
+A_SRS,
+A_SSAT,
+A_SSAT16,
+A_SSAX,
+A_SSUB16,
+A_SSUB8,
+A_STREX,
+A_STREXB,
+A_STREXD,
+A_STREXH,
+A_SXTAB,
+A_SXTAB16,
+A_SXTAH,
+A_SXTB,
+A_SXTB16,
+A_SXTH,
+A_UADD16,
+A_UADD8,
+A_UASX,
+A_UBFX,
+A_UHADD16,
+A_UHADD8,
+A_UHASX,
+A_UHSAX,
+A_UHSUB16,
+A_UHSUB8,
+A_UMAAL,
+A_UQADD16,
+A_UQADD8,
+A_UQASX,
+A_UQSAX,
+A_UQSUB16,
+A_UQSUB8,
+A_UQSAD8,
+A_UQSADA8,
+A_USAT,
+A_USAT16,
+A_USAX,
+A_USUB16,
+A_USUB8,
+A_UXTAB,
+A_UXTAB16,
+A_UXTAH,
+A_UXTB,
+A_UXTB16,
+A_UXTH,
+A_WFE,
+A_WFI,
+A_YIELD,
 A_ASR,
 A_LSR,
 A_LSL,
@@ -186,8 +277,6 @@ A_ROR,
 A_SDIV,
 A_UDIV,
 A_MOVT,
-A_LDREX,
-A_STREX,
 A_IT,
 A_ITE,
 A_ITT,

+ 15 - 8
compiler/arm/cgcpu.pas

@@ -175,7 +175,7 @@ unit cgcpu;
 
 
     uses
-       globals,verbose,systems,cutils,
+       globals,verbose,systems,cutils,sysutils,
        aopt,aoptcpu,
        fmodule,
        symconst,symsym,
@@ -384,19 +384,26 @@ unit cgcpu;
                      end
                    else
                      begin
+                       tmpreg2:=getintregister(list,OS_INT);
                        if target_info.endian=endian_big then
                          inc(usedtmpref.offset,3);
                        a_internal_load_ref_reg(list,OS_8,OS_8,usedtmpref,reg);
+
                        inc(usedtmpref.offset,dir);
                        a_internal_load_ref_reg(list,OS_8,OS_8,usedtmpref,tmpreg);
+
+                       inc(usedtmpref.offset,dir);
+                       a_internal_load_ref_reg(list,OS_8,OS_8,usedtmpref,tmpreg2);
+
                        so.shiftimm:=8;
                        list.concat(taicpu.op_reg_reg_reg_shifterop(A_ORR,reg,reg,tmpreg,so));
+
                        inc(usedtmpref.offset,dir);
                        a_internal_load_ref_reg(list,OS_8,OS_8,usedtmpref,tmpreg);
+
                        so.shiftimm:=16;
-                       list.concat(taicpu.op_reg_reg_reg_shifterop(A_ORR,reg,reg,tmpreg,so));
-                       inc(usedtmpref.offset,dir);
-                       a_internal_load_ref_reg(list,OS_8,OS_8,usedtmpref,tmpreg);
+                       list.concat(taicpu.op_reg_reg_reg_shifterop(A_ORR,reg,reg,tmpreg2,so));
+
                        so.shiftimm:=24;
                        list.concat(taicpu.op_reg_reg_reg_shifterop(A_ORR,reg,reg,tmpreg,so));
                      end;
@@ -703,7 +710,7 @@ unit cgcpu;
             OP_SAR:
               begin
                 if a>32 then
-                  internalerror(200308295);
+                  internalerror(200308298);
                 if a<>0 then
                   begin
                     shifterop_reset(so);
@@ -1066,7 +1073,7 @@ unit cgcpu;
            OS_F32:
              oppostfix:=PF_None;
            else
-             InternalError(200308295);
+             InternalError(200308299);
          end;
          if (ref.alignment in [1,2]) and (ref.alignment<tcgsize2size[tosize]) then
            begin
@@ -1676,7 +1683,7 @@ unit cgcpu;
 
                 if regs=[] then
                   begin
-                    if (current_settings.cputype<cpu_armv6) then
+                    if (current_settings.cputype<cpu_armv5) then
                       list.concat(taicpu.op_reg_reg(A_MOV,NR_PC,NR_R14))
                     else
                       list.concat(taicpu.op_reg(A_BX,NR_R14))
@@ -1697,7 +1704,7 @@ unit cgcpu;
                 list.concat(setoppostfix(taicpu.op_ref_regset(A_LDM,ref,R_INTREGISTER,R_SUBWHOLE,regs),PF_EA));
               end;
           end
-        else if (current_settings.cputype<cpu_armv6) then
+        else if (current_settings.cputype<cpu_armv5) then
           list.concat(taicpu.op_reg_reg(A_MOV,NR_PC,NR_R14))
         else
           list.concat(taicpu.op_reg(A_BX,NR_R14))

+ 21 - 19
compiler/arm/raarmgas.pas

@@ -651,25 +651,27 @@ Unit raarmgas;
           begin
             is_ConditionCode := false;
             
-            if actopcode in [A_IT,A_ITE,A_ITT,
-                             A_ITEE,A_ITTE,A_ITET,A_ITTT,
-                             A_ITEEE,A_ITTEE,A_ITETE,A_ITTTE,A_ITEET,A_ITTET,A_ITETT,A_ITTTT] then
-              begin
-                { search for condition, conditions are always 2 chars }
-                if length(hs)>1 then
-                  begin
-                    for icond:=low(tasmcond) to high(tasmcond) do
-                      begin
-                        if copy(hs,1,2)=uppercond2str[icond] then
-                          begin
-                            //actcondition:=icond;
-                            oper.opr.typ := OPR_COND;
-                            oper.opr.cc := icond;
-                            exit(true);
-                          end;
-                      end;
-                  end;
-              end;
+            case actopcode of
+              A_IT,A_ITE,A_ITT,
+              A_ITEE,A_ITTE,A_ITET,A_ITTT,
+              A_ITEEE,A_ITTEE,A_ITETE,A_ITTTE,A_ITEET,A_ITTET,A_ITETT,A_ITTTT:
+                begin
+                  { search for condition, conditions are always 2 chars }
+                  if length(hs)>1 then
+                    begin
+                      for icond:=low(tasmcond) to high(tasmcond) do
+                        begin
+                          if copy(hs,1,2)=uppercond2str[icond] then
+                            begin
+                              //actcondition:=icond;
+                              oper.opr.typ := OPR_COND;
+                              oper.opr.cc := icond;
+                              exit(true);
+                            end;
+                        end;
+                    end;
+                end;
+            end;
           end;