Browse Source

* arm / a64: New optimisation for removing the number of necessary S/UXTB/H instructions

J. Gareth "Curious Kit" Moreton 1 year ago
parent
commit
096d5f50b2
1 changed files with 128 additions and 0 deletions
  1. 128 0
      compiler/armgen/aoptarm.pas

+ 128 - 0
compiler/armgen/aoptarm.pas

@@ -60,6 +60,9 @@ Type
 
     function OptPass2Bitwise(var p: tai): Boolean;
     function OptPass2TST(var p: tai): Boolean;
+
+  protected
+    function DoXTArithOp(var p: tai; hp1: tai): Boolean;
   End;
 
   function MatchInstruction(const instr: tai; const op: TCommonAsmOps; const cond: TAsmConds; const postfix: TOpPostfixes): boolean;
@@ -675,6 +678,123 @@ Implementation
     end;
 
 
+  function TARMAsmOptimizer.DoXTArithOp(var p: tai; hp1: tai): Boolean;
+    var
+      hp2: tai;
+      ConstLimit: TCGInt;
+      ValidPostFixes: TOpPostFixes;
+      FirstCode, SecondCode, ThirdCode, FourthCode: TAsmOp;
+    begin
+      Result := False;
+      { Change:
+          uxtb/h reg1,reg1
+          (operation on reg1 with immediate operand where the upper 24/56
+          bits don't affect the state of the first 8 bits )
+          uxtb/h reg1,reg1
+
+        Remove first uxtb/h
+      }
+      case taicpu(p).opcode of
+        A_UXTB,
+        A_SXTB:
+          begin
+            ConstLimit := $FF;
+            ValidPostFixes := [PF_B];
+            FirstCode := A_UXTB;
+            SecondCode := A_SXTB;
+            ThirdCode := A_UXTB; { Used to indicate no other valid codes }
+            FourthCode := A_SXTB;
+          end;
+        A_UXTH,
+        A_SXTH:
+          begin
+            ConstLimit := $FFFF;
+            ValidPostFixes := [PF_B, PF_H];
+            FirstCode := A_UXTH;
+            SecondCode := A_SXTH;
+            ThirdCode := A_UXTB;
+            FourthCode := A_SXTB;
+          end;
+        else
+          InternalError(2024051401);
+      end;
+
+{$ifndef AARCH64}
+      { Regular ARM doesn't have the multi-instruction MatchInstruction available }
+      if (hp1.typ = ait_instruction) and (taicpu(hp1).oppostfix = PF_None) then
+        case taicpu(hp1).opcode of
+          A_ADD, A_SUB, A_MUL, A_LSL, A_AND, A_ORR, A_EOR, A_BIC, A_ORN:
+{$endif AARCH64}
+
+      if
+        (taicpu(p).oper[1]^.reg = taicpu(p).oper[0]^.reg) and
+{$ifdef AARCH64}
+        MatchInstruction(hp1, [A_ADD, A_SUB, A_MUL, A_LSL, A_AND, A_ORR, A_EOR, A_BIC, A_ORN, A_EON], [PF_None]) and
+{$endif AARCH64}
+        (taicpu(hp1).condition = C_None) and
+        (taicpu(hp1).ops = 3) and
+        (taicpu(hp1).oper[0]^.reg = taicpu(p).oper[0]^.reg) and
+        (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[0]^.reg) and
+        (taicpu(hp1).oper[2]^.typ = top_const) and
+        (
+          (
+            { If the AND immediate is 8-bit, then this essentially performs
+              the functionality of the second UXTB and so its presence is
+              not required }
+            (taicpu(hp1).opcode = A_AND) and
+            (taicpu(hp1).oper[2]^.val >= 0) and
+            (taicpu(hp1).oper[2]^.val <= ConstLimit)
+          ) or
+          (
+            GetNextInstructionUsingReg(hp1,hp2,taicpu(p).oper[0]^.reg) and
+            (hp2.typ = ait_instruction) and
+            (taicpu(hp2).ops = 2) and
+            (taicpu(hp2).condition = C_None) and
+            (
+              (
+                (taicpu(hp2).opcode in [FirstCode, SecondCode, ThirdCode, FourthCode]) and
+                (taicpu(hp2).oppostfix = PF_None) and
+                (taicpu(hp2).oper[1]^.reg = taicpu(p).oper[0]^.reg)
+                { Destination is allowed to be different in this case, but
+                  only if the source is no longer in use (it being the same as
+                  the source is covered by RegEndOfLife as well) }
+              ) or
+              (
+                { STRB essentially fills the same role as the second UXTB
+                  as long as the register is deallocated afterwards }
+                MatchInstruction(hp2, A_STR, [C_None], ValidPostFixes) and
+                (taicpu(hp2).oper[0]^.reg = taicpu(p).oper[0]^.reg) and
+                not RegInOp(taicpu(p).oper[0]^.reg, taicpu(hp2).oper[1]^)
+              )
+            ) and
+            RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hp2))
+          )
+        ) then
+        begin
+          DebugMsg(SPeepholeOptimization + 'S/Uxtb/hArithUxtb/h2ArithS/Uxtb/h done', p);
+          Result := RemoveCurrentP(p);
+
+          { Simplify bitwise constants if able }
+{$ifdef AARCH64}
+          if (taicpu(hp1).opcode in [A_AND, A_ORR, A_EOR, A_BIC, A_ORN, A_EON]) and
+            is_shifter_const(taicpu(hp1).oper[2]^.val and ConstLimit, OS_32) then
+{$else AARCH64}
+          if (
+              (ConstLimit = $FF) or
+              (taicpu(hp1).oper[2]^.val <= $100)
+            ) and
+            (taicpu(hp1).opcode in [A_AND, A_ORR, A_EOR, A_BIC, A_ORN]) then
+{$endif AARCH64}
+            taicpu(hp1).oper[2]^.val := taicpu(hp1).oper[2]^.val and ConstLimit;
+        end;
+{$ifndef AARCH64}
+          else
+            ;
+        end;
+{$endif not AARCH64}
+    end;
+
+
   function TARMAsmOptimizer.OptPass1UXTB(var p : tai) : Boolean;
     var
       hp1, hp2: tai;
@@ -773,6 +893,8 @@ Implementation
                   taicpu(hp1).loadReg(1,taicpu(p).oper[1]^.reg);
                   result:=RemoveCurrentP(p);
                 end
+              else if DoXTArithOp(p, hp1) then
+                Result:=true
 {$ifdef AARCH64}
               else if USxtOp2Op(p,hp1,SM_UXTB) then
                 Result:=true
@@ -862,6 +984,8 @@ Implementation
                   taicpu(hp1).loadReg(1,taicpu(p).oper[1]^.reg);
                   result:=RemoveCurrentP(p);
                 end
+              else if DoXTArithOp(p, hp1) then
+                Result:=true
 {$ifdef AARCH64}
               else if USxtOp2Op(p,hp1,SM_UXTH) then
                 Result:=true
@@ -974,6 +1098,8 @@ Implementation
                   taicpu(hp1).loadReg(1,taicpu(p).oper[1]^.reg);
                   result:=RemoveCurrentP(p);
                 end
+              else if DoXTArithOp(p, hp1) then
+                Result:=true
 {$ifdef AARCH64}
               else if USxtOp2Op(p,hp1,SM_SXTB) then
                 Result:=true
@@ -1094,6 +1220,8 @@ Implementation
                   taicpu(hp1).loadReg(1,taicpu(p).oper[1]^.reg);
                   result:=RemoveCurrentP(p);
                 end
+              else if DoXTArithOp(p, hp1) then
+                Result:=true
 {$ifdef AARCH64}
               else if USxtOp2Op(p,hp1,SM_SXTH) then
                 Result:=true