
+ AArch64: FoldShiftProcess optimization

git-svn-id: trunk@42924 -
florian 5 years ago
parent commit
0c6f7321bf
4 changed files with 211 additions and 4 deletions
  1. compiler/aarch64/agcpugas.pas (+1 -1)
  2. compiler/aarch64/aoptcpu.pas (+208 -1)
  3. compiler/aarch64/cpubase.pas (+1 -1)
  4. compiler/aarch64/racpugas.pas (+1 -1)

+ 1 - 1
compiler/aarch64/agcpugas.pas

@@ -50,7 +50,7 @@ unit agcpugas;
 
     const
       gas_shiftmode2str : array[tshiftmode] of string[4] = (
-        '','lsl','lsr','asr',
+        '','lsl','lsr','asr','ror',
         'uxtb','uxth','uxtw','uxtx',
         'sxtb','sxth','sxtw','sxtx');
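
For illustration (an assumed example, not part of the commit): with 'ror' added to gas_shiftmode2str, the assembler writer can render SM_ROR shifter operands. Architecturally, only the logical instructions (AND, ORR, EOR, BIC, ORN, EON, TST) accept a rotate as shifter operand, which is why the optimizer below still treats ROR with care:

    // a rotate-right shifter operand, printable once 'ror' is in the table
    orr x0, x1, x2, ror #8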
 

+ 208 - 1
compiler/aarch64/aoptcpu.pas

@@ -39,15 +39,21 @@ Interface
         { uses the same constructor as TAopObj }
         function PeepHoleOptPass1Cpu(var p: tai): boolean; override;
         procedure PeepHoleOptPass2;override;
+        function RegLoadedWithNewValue(reg: tregister; hp: tai): boolean;override;
         function GetNextInstructionUsingReg(Current : tai; out Next : tai; reg : TRegister) : Boolean;
         function LookForPostindexedPattern(p : taicpu) : boolean;
         procedure DebugMsg(const s : string; p : tai);
+      private
+        function OptPass1Shift(var p: tai): boolean;
       End;
 
 Implementation
 
   uses
-    aasmbase;
+    aasmbase,
+    aoptutils,
+    cgutils,
+    verbose;
 
 {$ifdef DEBUG_AOPTCPU}
   procedure TCpuAsmOptimizer.DebugMsg(const s: string;p : tai);
@@ -66,6 +72,22 @@ Implementation
     end;
 
 
+  function RefsEqual(const r1, r2: treference): boolean;
+    begin
+      result :=
+        (r1.offset = r2.offset) and
+        (r1.base = r2.base) and
+        (r1.index = r2.index) and (r1.scalefactor = r2.scalefactor) and
+        (r1.symbol=r2.symbol) and (r1.refaddr = r2.refaddr) and
+        (r1.relsymbol = r2.relsymbol) and
+        (r1.shiftimm = r2.shiftimm) and
+        (r1.addressmode = r2.addressmode) and
+        (r1.shiftmode = r2.shiftmode) and
+        (r1.volatility=[]) and
+        (r2.volatility=[]);
+    end;
+
+
   function MatchInstruction(const instr: tai; const op: TAsmOps; const postfix: TOpPostfixes): boolean;
     begin
       result :=
@@ -84,6 +106,33 @@ Implementation
     end;
 
 
+  function MatchOperand(const oper: TOper; const reg: TRegister): boolean; inline;
+    begin
+      result := (oper.typ = top_reg) and (oper.reg = reg);
+    end;
+
+
+  function MatchOperand(const oper1: TOper; const oper2: TOper): boolean; inline;
+    begin
+      result := oper1.typ = oper2.typ;
+
+      if result then
+        case oper1.typ of
+          top_const:
+            Result:=oper1.val = oper2.val;
+          top_reg:
+            Result:=oper1.reg = oper2.reg;
+          top_conditioncode:
+            Result:=oper1.cc = oper2.cc;
+          top_realconst:
+            Result:=oper1.val_real = oper2.val_real;
+          top_ref:
+            Result:=RefsEqual(oper1.ref^, oper2.ref^);
+          else Result:=false;
+        end
+    end;
+
+
   function TCpuAsmOptimizer.GetNextInstructionUsingReg(Current: tai;
     Out Next: tai; reg: TRegister): Boolean;
     begin
@@ -97,6 +146,54 @@ Implementation
             is_calljmp(taicpu(Next).opcode);
     end;
 
+
+  function TCpuAsmOptimizer.RegLoadedWithNewValue(reg: tregister; hp: tai): boolean;
+    var
+      p: taicpu;
+    begin
+      p := taicpu(hp);
+      Result := false;
+      if not ((assigned(hp)) and (hp.typ = ait_instruction)) then
+        exit;
+
+      case p.opcode of
+        { These instructions do not write to a register at all }
+        A_CMP, A_CMN, A_TST, A_B, A_BL, A_MSR, A_FCMP:
+          exit;
+        { Take care of post-/preindexed stores and loads; they will change their base register }
+        A_STR, A_LDR:
+          begin
+            Result := false;
+            { actually, this does not apply here because post-/preindexing does not mean that a register
+              is loaded with a new value; it is only modified:
+              (taicpu(p).oper[1]^.typ=top_ref) and
+              (taicpu(p).oper[1]^.ref^.addressmode in [AM_PREINDEXED,AM_POSTINDEXED]) and
+              (taicpu(p).oper[1]^.ref^.base = reg);
+            }
+            { STR does not load into its first register }
+            if p.opcode = A_STR then
+              exit;
+          end;
+        else
+          ;
+      end;
+
+      if Result then
+        exit;
+
+      case p.oper[0]^.typ of
+        top_reg:
+          Result := (p.oper[0]^.reg = reg);
+        top_ref:
+          Result :=
+            (taicpu(p).oper[0]^.ref^.addressmode in [AM_PREINDEXED,AM_POSTINDEXED]) and
+            (taicpu(p).oper[0]^.ref^.base = reg);
+        else
+          ;
+      end;
+    end;
+
+
   {
     optimize
       ldr/str regX,[reg1]
@@ -145,6 +242,111 @@ Implementation
     end;
 
 
+  function TCpuAsmOptimizer.OptPass1Shift(var p : tai): boolean;
+    var
+      hp1,hp2: tai;
+      I2, I: Integer;
+      shifterop: tshifterop;
+    begin
+      Result:=false;
+      { This folds shifterops into following instructions
+        <shiftop> r0, r1, #imm
+        <op> r2, r3, r0
+
+        to
+
+        <op> r2, r3, r1, <shiftop> #imm
+      }
+      { do not handle ROR yet; only some of the instructions below support ROR as a shifter operand }
+      if MatchInstruction(p,[A_LSL, A_LSR, A_ASR{, A_ROR}],[PF_None]) and
+         MatchOpType(taicpu(p),top_reg,top_reg,top_const) and
+         GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
+         MatchInstruction(hp1, [A_ADD, A_AND, A_BIC, A_CMP, A_CMN,
+                                A_EON, A_EOR, A_MOV, A_NEG, A_ORN, A_ORR,
+                                A_SUB, A_TST], [PF_None]) and
+         RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hp1)) and
+         (taicpu(hp1).ops >= 2) and
+         { Currently we can't fold into another shifterop }
+         (taicpu(hp1).oper[taicpu(hp1).ops-1]^.typ = top_reg) and
+         { SP does not fully work with shifted registers; as I didn't find the exact rules,
+           we do not operate on SP }
+         (taicpu(hp1).oper[0]^.reg<>NR_SP) and
+         (taicpu(hp1).oper[1]^.reg<>NR_SP) and
+         (taicpu(hp1).oper[taicpu(hp1).ops-1]^.reg<>NR_SP) and
+         { reg1 must not be modified in between }
+         not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hp1)) and
+         (
+           { Only ONE of the two src operands is allowed to match }
+           MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[taicpu(hp1).ops-2]^) xor
+           MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[taicpu(hp1).ops-1]^)
+         ) and
+         { for SUB, the last operand must match; there is no RSB on AArch64 }
+         ((taicpu(hp1).opcode<>A_SUB) or
+          MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[taicpu(hp1).ops-1]^)) then
+        begin
+          if taicpu(hp1).opcode in [A_TST, A_CMP, A_CMN, A_MOV] then
+            I2:=0
+          else
+            I2:=1;
+          for I:=I2 to taicpu(hp1).ops-1 do
+            if MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[I]^.reg) then
+              begin
+                { If the operand matched on the second op from the RIGHT,
+                  we have to switch the operands; this will not happen for CMP,
+                  where we're only evaluating the rightmost operand
+                }
+                shifterop_reset(shifterop);
+                case taicpu(p).opcode of
+                  A_LSL:
+                    shifterop.shiftmode:=SM_LSL;
+                  A_ROR:
+                    shifterop.shiftmode:=SM_ROR;
+                  A_LSR:
+                    shifterop.shiftmode:=SM_LSR;
+                  A_ASR:
+                    shifterop.shiftmode:=SM_ASR;
+                  else
+                    InternalError(2019090401);
+                end;
+                shifterop.shiftimm:=taicpu(p).oper[2]^.val;
+
+                if I <> taicpu(hp1).ops-1 then
+                  begin
+                    if taicpu(hp1).ops = 3 then
+                      hp2:=taicpu.op_reg_reg_reg_shifterop(taicpu(hp1).opcode,
+                           taicpu(hp1).oper[0]^.reg, taicpu(hp1).oper[2]^.reg,
+                           taicpu(p).oper[1]^.reg, shifterop)
+                    else
+                      hp2:=taicpu.op_reg_reg_shifterop(taicpu(hp1).opcode,
+                           taicpu(hp1).oper[0]^.reg, taicpu(p).oper[1]^.reg,
+                           shifterop);
+                  end
+                else
+                  if taicpu(hp1).ops = 3 then
+                    hp2:=taicpu.op_reg_reg_reg_shifterop(taicpu(hp1).opcode,
+                         taicpu(hp1).oper[0]^.reg, taicpu(hp1).oper[1]^.reg,
+                         taicpu(p).oper[1]^.reg,shifterop)
+                  else
+                    hp2:=taicpu.op_reg_reg_shifterop(taicpu(hp1).opcode,
+                         taicpu(hp1).oper[0]^.reg, taicpu(p).oper[1]^.reg,
+                         shifterop);
+
+                taicpu(hp2).fileinfo:=taicpu(hp1).fileinfo;
+                asml.insertbefore(hp2, hp1);
+                GetNextInstruction(p, hp2);
+                asml.remove(p);
+                asml.remove(hp1);
+                p.free;
+                hp1.free;
+                p:=hp2;
+                DebugMsg('Peephole FoldShiftProcess done', p);
+                Result:=true;
+                break;
+              end;
+        end;
+    end;
+
+
   function TCpuAsmOptimizer.PeepHoleOptPass1Cpu(var p: tai): boolean;
     begin
       result := false;
@@ -159,6 +361,11 @@ Implementation
               begin
                 Result:=LookForPostindexedPattern(taicpu(p));
               end;
+            A_LSR,
+            A_ROR,
+            A_ASR,
+            A_LSL:
+              Result:=OptPass1Shift(p);
             else
               ;
           end;
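
To make the fold concrete, here is a hand-written before/after pair (an illustrative sketch, not taken from the commit or its test suite), assuming x2 is not used after the ADD:

    // before: the shift result x2 is consumed only by the ADD
    lsr x2, x3, #4
    add x0, x1, x2

    // after FoldShiftProcess: the shift becomes the ADD's shifter operand
    add x0, x1, x3, lsr #4

The RegEndOfLife guard establishes that x2 is dead afterwards; the RegLoadedWithNewValue override added above appears to exist to support exactly this liveness check. The SUB restriction follows from operand order: a shifted register can only appear as the last source operand, and since AArch64 has no RSB, a shift feeding the first source of a SUB cannot be folded.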

+ 1 - 1
compiler/aarch64/cpubase.pas

@@ -199,7 +199,7 @@ unit cpubase;
       tshiftmode = (SM_None,
                     { shifted register instructions. LSL can also be used for
                       the index register of certain loads/stores }
-                    SM_LSL,SM_LSR,SM_ASR,
+                    SM_LSL,SM_LSR,SM_ASR,SM_ROR,
                     { extended register instructions: zero/sign extension +
                         optional shift (interpreted as LSL after extension)
                        -- the index register of certain loads/stores can be
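
For context (an illustrative note, not part of the diff): SM_ROR extends the shifted-register group, which is distinct from the extended-register group declared after it; the two attach to different instruction forms:

    and x0, x1, x2, ror #12   // shifted-register form (SM_ROR)
    add x0, x1, w2, sxtw #2   // extended-register form (SM_SXTW), unchanged by this commit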

+ 1 - 1
compiler/aarch64/racpugas.pas

@@ -461,7 +461,7 @@ Unit racpugas;
 
       const
         shiftmode2str: array[SM_LSL..SM_SXTX] of string[4] =
-          ('LSL','LSR','ASR',
+          ('LSL','LSR','ASR','ROR',
            'UXTB','UXTH','UXTW','UXTX',
            'SXTB','SXTH','SXTW','SXTX');
       var
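
A closing note (assumption, not shown in the diff): shiftmode2str is a typed constant indexed by the SM_LSL..SM_SXTX subrange, so after SM_ROR was inserted into tshiftmode in cpubase.pas, the 'ROR' entry here is needed for the unit to compile at all; FPC rejects an array constant whose element count no longer matches its index range. With it in place, the inline-assembler reader can recognize a rotate shifter operand such as:

    orr x0, x1, x2, ror #3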