Browse Source

* (modified) patch by J. Gareth Moreton to unify ldr/str optimizations on Aarch64/ARM, part of #38841

git-svn-id: trunk@49338 -
florian 4 years ago
parent
commit
d936280c6b
3 changed files with 254 additions and 12 deletions
  1. 25 2
      compiler/aarch64/aoptcpu.pas
  2. 11 5
      compiler/arm/aoptcpu.pas
  3. 218 5
      compiler/armgen/aoptarm.pas

+ 25 - 2
compiler/aarch64/aoptcpu.pas

@@ -44,6 +44,10 @@ Interface
         function RegLoadedWithNewValue(reg: tregister; hp: tai): boolean;override;
         function RegLoadedWithNewValue(reg: tregister; hp: tai): boolean;override;
         function InstructionLoadsFromReg(const reg: TRegister; const hp: tai): boolean;override;
         function InstructionLoadsFromReg(const reg: TRegister; const hp: tai): boolean;override;
         function LookForPostindexedPattern(var p : tai) : boolean;
         function LookForPostindexedPattern(var p : tai) : boolean;
+      public
+        { With these routines, there's optimisation code that's general for all ARM platforms }
+        function OptPass1LDR(var p: tai): Boolean; override;
+        function OptPass1STR(var p: tai): Boolean; override;
       private
       private
         function RemoveSuperfluousFMov(const p: tai; movp: tai; const optimizer: string): boolean;
         function RemoveSuperfluousFMov(const p: tai; movp: tai; const optimizer: string): boolean;
         function OptPass1Shift(var p: tai): boolean;
         function OptPass1Shift(var p: tai): boolean;
@@ -291,6 +295,24 @@ Implementation
     end;
     end;
 
 
 
 
+  { AArch64 peephole pass for LDR: tries the optimisations shared by all ARM
+    platforms first and, if none of them changed anything, looks for a
+    post-indexed addressing pattern.  Returns True if either made a change. }
+  function TCpuAsmOptimizer.OptPass1LDR(var p: tai): Boolean;
+    begin
+      { Same evaluation order as before: the inherited pass runs first, then
+        LookForPostindexedPattern. }
+      Result := inherited OptPass1LDR(p) or
+        LookForPostindexedPattern(p);
+    end;
+
+
+  { AArch64 peephole pass for STR: tries the optimisations shared by all ARM
+    platforms first and, if none of them changed anything, looks for a
+    post-indexed addressing pattern.  Returns True if either made a change. }
+  function TCpuAsmOptimizer.OptPass1STR(var p: tai): Boolean;
+    begin
+      { Same evaluation order as before: the inherited pass runs first, then
+        LookForPostindexedPattern. }
+      Result := inherited OptPass1STR(p) or
+        LookForPostindexedPattern(p);
+    end;
+
+
   function TCpuAsmOptimizer.OptPass1Shift(var p : tai): boolean;
   function TCpuAsmOptimizer.OptPass1Shift(var p : tai): boolean;
     var
     var
       hp1,hp2: tai;
       hp1,hp2: tai;
@@ -764,9 +786,10 @@ Implementation
       if p.typ=ait_instruction then
       if p.typ=ait_instruction then
         begin
         begin
           case taicpu(p).opcode of
           case taicpu(p).opcode of
-            A_LDR,
+            A_LDR:
+              Result:=OptPass1LDR(p);
             A_STR:
             A_STR:
-              Result:=LookForPostindexedPattern(p);
+              Result:=OptPass1STR(p);
             A_MOV:
             A_MOV:
               Result:=OptPass1Mov(p);
               Result:=OptPass1Mov(p);
             A_STP:
             A_STP:

+ 11 - 5
compiler/arm/aoptcpu.pas

@@ -59,7 +59,11 @@ Type
     function InstructionLoadsFromReg(const reg : TRegister; const hp : tai) : boolean; override;
     function InstructionLoadsFromReg(const reg : TRegister; const hp : tai) : boolean; override;
 
 
     function RegLoadedWithNewValue(reg : tregister; hp : tai) : boolean; override;
     function RegLoadedWithNewValue(reg : tregister; hp : tai) : boolean; override;
-    function OptPass1And(var p: tai): Boolean; override; { There's optimisation code that's general for all ARM platforms }
+
+     { With these routines, there's optimisation code that's general for all ARM platforms }
+    function OptPass1And(var p: tai): Boolean; override;
+    function OptPass1LDR(var p: tai): Boolean; override;
+    function OptPass1STR(var p: tai): Boolean; override;
   protected
   protected
     function LookForPreindexedPattern(p: taicpu): boolean;
     function LookForPreindexedPattern(p: taicpu): boolean;
     function LookForPostindexedPattern(p: taicpu): boolean;
     function LookForPostindexedPattern(p: taicpu): boolean;
@@ -69,9 +73,7 @@ Type
     function OptPass1DataCheckMov(var p: tai): Boolean;
     function OptPass1DataCheckMov(var p: tai): Boolean;
     function OptPass1ADDSUB(var p: tai): Boolean;
     function OptPass1ADDSUB(var p: tai): Boolean;
     function OptPass1CMP(var p: tai): Boolean;
     function OptPass1CMP(var p: tai): Boolean;
-    function OptPass1LDR(var p: tai): Boolean;
     function OptPass1STM(var p: tai): Boolean;
     function OptPass1STM(var p: tai): Boolean;
-    function OptPass1STR(var p: tai): Boolean;
     function OptPass1MOV(var p: tai): Boolean;
     function OptPass1MOV(var p: tai): Boolean;
     function OptPass1MUL(var p: tai): Boolean;
     function OptPass1MUL(var p: tai): Boolean;
     function OptPass1MVN(var p: tai): Boolean;
     function OptPass1MVN(var p: tai): Boolean;
@@ -834,7 +836,9 @@ Implementation
     var
     var
       hp1: tai;
       hp1: tai;
     begin
     begin
-      Result := False;
+      Result := inherited OptPass1LDR(p);
+      if Result then
+        Exit;
 
 
       { change
       { change
         ldr reg1,ref
         ldr reg1,ref
@@ -1022,7 +1026,9 @@ Implementation
     var
     var
       hp1: tai;
       hp1: tai;
     begin
     begin
-      Result := False;
+      Result := inherited OptPass1STR(p);
+      if Result then
+        Exit;
 
 
       { Common conditions }
       { Common conditions }
       if (taicpu(p).oper[1]^.typ = top_ref) and
       if (taicpu(p).oper[1]^.typ = top_ref) and

+ 218 - 5
compiler/armgen/aoptarm.pas

@@ -26,7 +26,7 @@ Unit aoptarm;
 {$i fpcdefs.inc}
 {$i fpcdefs.inc}
 
 
 { $define DEBUG_PREREGSCHEDULER}
 { $define DEBUG_PREREGSCHEDULER}
-{ $define DEBUG_AOPTCPU}
+{$define DEBUG_AOPTCPU}
 
 
 Interface
 Interface
 
 
@@ -41,12 +41,15 @@ Type
 
 
     function RemoveSuperfluousMove(const p: tai; movp: tai; const optimizer: string): boolean;
     function RemoveSuperfluousMove(const p: tai; movp: tai; const optimizer: string): boolean;
     function RedundantMovProcess(var p: tai; var hp1: tai): boolean;
     function RedundantMovProcess(var p: tai; var hp1: tai): boolean;
-    function GetNextInstructionUsingReg(Current: tai; out Next: tai; reg: TRegister): Boolean;
+    function GetNextInstructionUsingReg(Current: tai; out Next: tai; const reg: TRegister): Boolean;
 
 
     function OptPass1UXTB(var p: tai): Boolean;
     function OptPass1UXTB(var p: tai): Boolean;
     function OptPass1UXTH(var p: tai): Boolean;
     function OptPass1UXTH(var p: tai): Boolean;
     function OptPass1SXTB(var p: tai): Boolean;
     function OptPass1SXTB(var p: tai): Boolean;
     function OptPass1SXTH(var p: tai): Boolean;
     function OptPass1SXTH(var p: tai): Boolean;
+
+    function OptPass1LDR(var p: tai): Boolean; virtual;
+    function OptPass1STR(var p: tai): Boolean; virtual;
     function OptPass1And(var p: tai): Boolean; virtual;
     function OptPass1And(var p: tai): Boolean; virtual;
   End;
   End;
 
 
@@ -69,15 +72,23 @@ Implementation
     systems,
     systems,
     cpuinfo,
     cpuinfo,
     cgobj,procinfo,
     cgobj,procinfo,
-    aasmbase,aasmdata;
+    aasmbase,aasmdata,itcpugas;
 
 
 
 
 {$ifdef DEBUG_AOPTCPU}
 {$ifdef DEBUG_AOPTCPU}
+  const
+    SPeepholeOptimization: shortstring = 'Peephole Optimization: ';
+
   procedure TARMAsmOptimizer.DebugMsg(const s: string;p : tai);
   procedure TARMAsmOptimizer.DebugMsg(const s: string;p : tai);
     begin
     begin
       asml.insertbefore(tai_comment.Create(strpnew(s)), p);
       asml.insertbefore(tai_comment.Create(strpnew(s)), p);
     end;
     end;
 {$else DEBUG_AOPTCPU}
 {$else DEBUG_AOPTCPU}
+  { Empty strings help the optimizer to remove string concatenations that won't
+    ever appear to the user on release builds. [Kit] }
+  const
+    SPeepholeOptimization = '';
+
   procedure TARMAsmOptimizer.DebugMsg(const s: string;p : tai);inline;
   procedure TARMAsmOptimizer.DebugMsg(const s: string;p : tai);inline;
     begin
     begin
     end;
     end;
@@ -179,7 +190,7 @@ Implementation
 
 
 
 
   function TARMAsmOptimizer.GetNextInstructionUsingReg(Current: tai;
   function TARMAsmOptimizer.GetNextInstructionUsingReg(Current: tai;
-    Out Next: tai; reg: TRegister): Boolean;
+    Out Next: tai; const reg: TRegister): Boolean;
     var
     var
       gniResult: Boolean;
       gniResult: Boolean;
     begin
     begin
@@ -395,7 +406,14 @@ Implementation
                   UpdateUsedRegs(TmpUsedRegs, tai(current_hp.Next));
                   UpdateUsedRegs(TmpUsedRegs, tai(current_hp.Next));
                   LDRChange := False;
                   LDRChange := False;
 
 
-                  if (taicpu(next_hp).opcode in [A_LDR,A_STR]) and (taicpu(next_hp).ops = 2) then
+                  if (taicpu(next_hp).opcode in [A_LDR,A_STR]) and (taicpu(next_hp).ops = 2)
+{$ifdef AARCH64}
+                    { If r0 is the zero register, then this sequence of instructions will cause
+                      an access violation, but that's better than an assembler error caused by
+                      changing r0 to xzr inside the reference (Where it's illegal). [Kit] }
+                    and (getsupreg(taicpu(p).oper[1]^.reg) <> RS_XZR)
+{$endif AARCH64}
+                    then
                     begin
                     begin
 
 
                       { Change the registers from r1 to r0 }
                       { Change the registers from r1 to r0 }
@@ -1018,6 +1036,201 @@ Implementation
     end;
     end;
 
 
 
 
+  { Peephole pass shared by ARM and AArch64 for an unconditional, two-operand
+    "ldr reg1,[ref]" that uses plain offset addressing and whose destination
+    register is not part of the reference.
+
+    Only the instruction directly returned by GetNextInstruction is examined,
+    and only if it is unconditional and carries the same postfix as p:
+
+      ldr reg1,[ref]; str reg1,[ref]  ->  the str is removed
+      ldr reg1,[ref]; ldr reg1,[ref]  ->  the second ldr is removed
+      ldr reg1,[ref]; ldr reg2,[ref]  ->  the second ldr becomes "mov reg2,reg1"
+                                          (integer registers of equal subregister
+                                          size only)
+
+    p itself is never replaced.  Returns True if an instruction was removed or
+    rewritten, False otherwise. }
+  function TARMAsmOptimizer.OptPass1LDR(var p : tai) : Boolean;
+    var
+      hp1: tai;
+      Reference: TReference;
+    begin
+      Result := False;
+      if (taicpu(p).ops <> 2) or (taicpu(p).condition <> C_None) or
+        { Defensive: the code below reads oper[1] as a reference }
+        (taicpu(p).oper[1]^.typ <> top_ref) then
+        Exit;
+
+      Reference := taicpu(p).oper[1]^.ref^;
+      if (Reference.addressmode = AM_OFFSET) and
+        not RegInRef(taicpu(p).oper[0]^.reg, Reference) and
+        { Delay calling GetNextInstruction for as long as possible }
+        GetNextInstruction(p, hp1) and
+        (hp1.typ = ait_instruction) and
+        (taicpu(hp1).condition = C_None) and
+        (taicpu(hp1).oppostfix = taicpu(p).oppostfix) and
+        { Defensive: both branches below read hp1's oper[0] as a register and
+          oper[1] as a reference, so make sure that is what they are }
+        (taicpu(hp1).ops = 2) and
+        (taicpu(hp1).oper[0]^.typ = top_reg) and
+        (taicpu(hp1).oper[1]^.typ = top_ref) then
+        begin
+          if (taicpu(hp1).opcode = A_STR) and
+            RefsEqual(taicpu(hp1).oper[1]^.ref^, Reference) and
+            (getregtype(taicpu(p).oper[0]^.reg) = getregtype(taicpu(hp1).oper[0]^.reg)) then
+            begin
+              { With:
+                  ldr reg1,[ref]
+                  str reg2,[ref]
+
+                If reg1 = reg2, remove str (it writes back the value just loaded)
+              }
+              if taicpu(p).oper[0]^.reg = taicpu(hp1).oper[0]^.reg then
+                begin
+                  DebugMsg(SPeepholeOptimization + 'Removed redundant store instruction (load/store -> load/nop)', hp1);
+                  RemoveInstruction(hp1);
+                  Result := True;
+                  Exit;
+                end;
+            end
+          else if (taicpu(hp1).opcode = A_LDR) and
+            RefsEqual(taicpu(hp1).oper[1]^.ref^, Reference) then
+            begin
+              { With:
+                  ldr reg1,[ref]
+                  ldr reg2,[ref]
+
+                If reg1 = reg2, delete the second ldr
+                If reg1 <> reg2, changing the 2nd ldr to a mov might introduce
+                  a dependency, but it will likely open up new optimisations, so
+                  do it for now and handle any new dependencies later.
+              }
+              if taicpu(p).oper[0]^.reg = taicpu(hp1).oper[0]^.reg then
+                begin
+                  DebugMsg(SPeepholeOptimization + 'Removed duplicate load instruction (load/load -> load/nop)', hp1);
+                  RemoveInstruction(hp1);
+                  Result := True;
+                  Exit;
+                end
+              else if
+                (getregtype(taicpu(p).oper[0]^.reg) = R_INTREGISTER) and
+                (getregtype(taicpu(hp1).oper[0]^.reg) = R_INTREGISTER) and
+                (getsubreg(taicpu(p).oper[0]^.reg) = getsubreg(taicpu(hp1).oper[0]^.reg)) then
+                begin
+                  DebugMsg(SPeepholeOptimization + 'Changed second ldr' + oppostfix2str[taicpu(hp1).oppostfix] + ' to mov (load/load -> load/move)', hp1);
+                  taicpu(hp1).opcode := A_MOV;
+                  taicpu(hp1).oppostfix := PF_None;
+                  taicpu(hp1).loadreg(1, taicpu(p).oper[0]^.reg);
+                  { reg1 is now read by the new mov, so mark it as allocated
+                    from p up to hp1 }
+                  AllocRegBetween(taicpu(p).oper[0]^.reg, p, hp1, UsedRegs);
+                  Result := True;
+                  Exit;
+                end;
+            end;
+        end;
+    end;
+
+
+    { Peephole pass shared by ARM and AArch64 for an unconditional, two-operand
+      "str reg1,[ref]" that uses plain offset addressing and whose source
+      register is not part of the reference.
+
+      Only the instruction directly returned by GetNextInstruction is examined,
+      and only if it is unconditional and carries the same postfix as p:
+
+        str reg1,[ref]; ldr reg1,[ref]  ->  the ldr is removed (no postfix only)
+        str reg1,[ref]; ldr reg2,[ref]  ->  the ldr becomes a mov/uxtb/uxth/...
+                                            from reg1 (integer registers of equal
+                                            subregister size only)
+        str reg1,[ref]; str reg1,[ref]  ->  the second str is removed
+        str reg1,[ref]; str reg2,[ref]  ->  the first str is removed (registers
+                                            of equal byte size only); p is
+                                            advanced by RemoveCurrentP
+
+      Returns True if an instruction was removed or rewritten, False otherwise. }
+    function TARMAsmOptimizer.OptPass1STR(var p : tai) : Boolean;
+      var
+        hp1: tai;
+        Reference: TReference;
+        SrcReg: TRegister;
+        NewOp: TAsmOp;
+      begin
+        Result := False;
+        if (taicpu(p).ops <> 2) or (taicpu(p).condition <> C_None) or
+          { Defensive: the code below reads oper[1] as a reference }
+          (taicpu(p).oper[1]^.typ <> top_ref) then
+          Exit;
+
+        Reference := taicpu(p).oper[1]^.ref^;
+        { A single guard; the following instruction is fetched exactly once }
+        if (Reference.addressmode = AM_OFFSET) and
+          not RegInRef(taicpu(p).oper[0]^.reg, Reference) and
+          { Delay calling GetNextInstruction for as long as possible }
+          GetNextInstruction(p, hp1) and
+          (hp1.typ = ait_instruction) and
+          (taicpu(hp1).condition = C_None) and
+          (taicpu(hp1).oppostfix = taicpu(p).oppostfix) and
+          { Defensive: both branches below read hp1's oper[0] as a register and
+            oper[1] as a reference, so make sure that is what they are }
+          (taicpu(hp1).ops = 2) and
+          (taicpu(hp1).oper[0]^.typ = top_reg) and
+          (taicpu(hp1).oper[1]^.typ = top_ref) then
+          begin
+            { Saves constant dereferencing and makes it easier to change the size if necessary }
+            SrcReg := taicpu(p).oper[0]^.reg;
+
+            { NOTE(review): the guard above already requires identical
+              postfixes, so the strb/ldrsb, strh/ldrsh (and AArch64 str w/ldrsw)
+              alternatives below cannot match as written.  They are kept
+              unchanged so that behaviour is preserved. }
+            if (taicpu(hp1).opcode = A_LDR) and
+              RefsEqual(taicpu(hp1).oper[1]^.ref^, Reference) and
+              (
+                (taicpu(hp1).oppostfix = taicpu(p).oppostfix) or
+                ((taicpu(p).oppostfix = PF_B) and (taicpu(hp1).oppostfix = PF_SB)) or
+                ((taicpu(p).oppostfix = PF_H) and (taicpu(hp1).oppostfix = PF_SH))
+{$ifdef AARCH64}
+                or ((taicpu(p).oppostfix = PF_W) and (taicpu(hp1).oppostfix = PF_SW))
+{$endif AARCH64}
+              ) then
+              begin
+                { With:
+                    str reg1,[ref]
+                    ldr reg2,[ref]
+
+                  If reg1 = reg2, remove ldr.
+                  If reg1 <> reg2, replace ldr with "mov reg2,reg1"
+                }
+
+                if (SrcReg = taicpu(hp1).oper[0]^.reg) and
+                  { e.g. the ldrb in strb/ldrb is not a null operation as it clears the upper 24 bits }
+                  (taicpu(p).oppostfix=PF_None) then
+                  begin
+                    DebugMsg(SPeepholeOptimization + 'Removed redundant load instruction (store/load -> store/nop)', hp1);
+                    RemoveInstruction(hp1);
+                    Result := True;
+                    Exit;
+                  end
+                else if (getregtype(SrcReg) = R_INTREGISTER) and
+                  (getregtype(taicpu(hp1).oper[0]^.reg) = R_INTREGISTER) and
+                  (getsubreg(SrcReg) = getsubreg(taicpu(hp1).oper[0]^.reg)) then
+                  begin
+                    { Pick the register-to-register instruction that matches
+                      the load's postfix }
+                    case taicpu(hp1).oppostfix of
+                      PF_B:
+                        NewOp := A_UXTB;
+                      PF_SB:
+                        NewOp := A_SXTB;
+                      PF_H:
+                        NewOp := A_UXTH;
+                      PF_SH:
+                        NewOp := A_SXTH;
+{$ifdef AARCH64}
+                      PF_SW:
+                        NewOp := A_SXTW;
+                      PF_W,
+{$endif AARCH64}
+                      PF_None:
+                        NewOp := A_MOV;
+                      else
+                        InternalError(2021043001);
+                    end;
+
+                    DebugMsg(SPeepholeOptimization + 'Changed ldr' + oppostfix2str[taicpu(hp1).oppostfix] + ' to ' + gas_op2str[NewOp] + ' (store/load -> store/move)', hp1);
+
+                    taicpu(hp1).oppostfix := PF_None;
+                    taicpu(hp1).opcode := NewOp;
+                    taicpu(hp1).loadreg(1, SrcReg);
+                    { reg1 is now read by the new instruction, so mark it as
+                      allocated from p up to hp1 }
+                    AllocRegBetween(SrcReg, p, hp1, UsedRegs);
+                    Result := True;
+                    Exit;
+                  end;
+              end
+            else if (taicpu(hp1).opcode = A_STR) and
+              RefsEqual(taicpu(hp1).oper[1]^.ref^, Reference) then
+              begin
+                { With:
+                    str reg1,[ref]
+                    str reg2,[ref]
+
+                  If reg1 <> reg2, delete the first str
+                  If reg1 = reg2, delete the second str
+                }
+                if SrcReg = taicpu(hp1).oper[0]^.reg then
+                  begin
+                    DebugMsg(SPeepholeOptimization + 'Removed duplicate store instruction (store/store -> store/nop)', hp1);
+                    RemoveInstruction(hp1);
+                    Result := True;
+                    Exit;
+                  end
+                else if
+                  { Registers same byte size? }
+                  (tcgsize2size[reg_cgsize(SrcReg)] = tcgsize2size[reg_cgsize(taicpu(hp1).oper[0]^.reg)]) then
+                  begin
+                    DebugMsg(SPeepholeOptimization + 'Removed dominated store instruction (store/store -> nop/store)', p);
+                    RemoveCurrentP(p, hp1);
+                    Result := True;
+                    Exit;
+                  end;
+              end;
+          end;
+      end;
+
+
   function TARMAsmOptimizer.OptPass1And(var p : tai) : Boolean;
   function TARMAsmOptimizer.OptPass1And(var p : tai) : Boolean;
     var
     var
       hp1, hp2: tai;
       hp1, hp2: tai;