Browse Source

* x86: New SHR-based optimisations

J. Gareth "Curious Kit" Moreton 3 years ago
parent
commit
6757bf3832
3 changed files with 327 additions and 24 deletions
  1. 2 0
      compiler/i386/aoptcpu.pas
  2. 323 24
      compiler/x86/aoptx86.pas
  3. 2 0
      compiler/x86_64/aoptcpu.pas

+ 2 - 0
compiler/i386/aoptcpu.pas

@@ -191,6 +191,8 @@ unit aoptcpu;
                   end;
                 A_SHL, A_SAL:
                   Result:=OptPass1SHLSAL(p);
+                A_SHR:
+                  Result:=OptPass1SHR(p);
                 A_SUB:
                   Result:=OptPass1Sub(p);
                 A_Jcc:

+ 323 - 24
compiler/x86/aoptx86.pas

@@ -135,6 +135,7 @@ unit aoptx86;
 
         class function IsExitCode(p : tai) : boolean; static;
         class function isFoldableArithOp(hp1 : taicpu; reg : tregister) : boolean; static;
+        class function IsShrMovZFoldable(shr_size, movz_size: topsize; Shift: TCGInt): Boolean; static;
         procedure RemoveLastDeallocForFuncRes(p : tai);
 
         function DoSubAddOpt(var p : tai) : Boolean;
@@ -157,6 +158,7 @@ unit aoptx86;
         function OptPass1LEA(var p : tai) : boolean;
         function OptPass1Sub(var p : tai) : boolean;
         function OptPass1SHLSAL(var p : tai) : boolean;
+        function OptPass1SHR(var p : tai) : boolean;
         function OptPass1FSTP(var p : tai) : boolean;
         function OptPass1FLD(var p : tai) : boolean;
         function OptPass1Cmp(var p : tai) : boolean;
@@ -6448,6 +6450,146 @@ unit aoptx86;
       end;
 
 
+    class function TX86AsmOptimizer.IsShrMovZFoldable(shr_size, movz_size: topsize; Shift: TCGInt): Boolean;
+      begin
+        case shr_size of
+          S_B:
+            { No valid combinations }
+            Result := False;
+
+          S_W:
+            Result := (Shift >= 8) and (movz_size = S_BW);
+
+          S_L:
+            Result :=
+              (Shift >= 24) { Any opsize is valid for this shift } or
+              ((Shift >= 16) and (movz_size = S_WL));
+{$ifdef x86_64}
+          S_Q:
+            Result :=
+              (Shift >= 56) { Any opsize is valid for this shift } or
+              ((Shift >= 48) and (movz_size = S_WL));
+{$endif x86_64}
+          else
+            InternalError(2022081510);
+        end;
+      end;
+
+    function TX86AsmOptimizer.OptPass1SHR(var p : tai) : boolean;
+      var
+        hp1, hp2: tai;
+        Shift: TCGInt;
+        LimitSize: Topsize;
+        DoNotMerge: Boolean;
+      begin
+        Result := False;
+
+        { All these optimisations work on "shr const,%reg" }
+        if not MatchOpType(taicpu(p), top_const, top_reg) then
+          Exit;
+
+        DoNotMerge := False;
+        Shift := taicpu(p).oper[0]^.val;
+        LimitSize := taicpu(p).opsize;
+
+        hp1 := p;
+        repeat
+          if not GetNextInstructionUsingReg(hp1, hp1, taicpu(p).oper[1]^.reg) or (hp1.typ <> ait_instruction) then
+            Exit;
+
+          case taicpu(hp1).opcode of
+            A_TEST, A_CMP, A_Jcc:
+              { Skip over conditional jumps and relevant comparisons }
+              Continue;
+
+            A_MOVZX:
+              if MatchOpType(taicpu(hp1), top_reg, top_reg) and
+                SuperRegistersEqual(taicpu(hp1).oper[0]^.reg, taicpu(p).oper[1]^.reg) then
+                begin
+                  { Since the original register is being read as is, subsequent
+                    SHRs must not be merged at this point }
+                  DoNotMerge := True;
+
+                  if IsShrMovZFoldable(taicpu(p).opsize, taicpu(hp1).opsize, Shift) then
+                    begin
+                      if not SuperRegistersEqual(taicpu(hp1).oper[0]^.reg, taicpu(hp1).oper[1]^.reg) then { Different register target }
+                        begin
+                          DebugMsg(SPeepholeOptimization + 'Converted MOVZX instruction to MOV since previous SHR makes zero-extension unnecessary (ShrMovz2ShrMov 1)', hp1);
+                          taicpu(hp1).opcode := A_MOV;
+                          setsubreg(taicpu(hp1).oper[0]^.reg, getsubreg(taicpu(hp1).oper[1]^.reg));
+                          case taicpu(hp1).opsize of
+                            S_BW:
+                              taicpu(hp1).opsize := S_W;
+                            S_BL, S_WL:
+                              taicpu(hp1).opsize := S_L;
+                            else
+                              InternalError(2022081503);
+                          end;
+
+                          { p itself hasn't changed, so no need to set Result to True }
+                          Include(OptsToCheck, aoc_ForceNewIteration);
+
+                          { See if there's anything afterwards that can be
+                            optimised, since the input register hasn't changed }
+                          Continue;
+                        end;
+
+                      { NOTE: If the MOVZX instruction reads and writes the same
+                        register, defer this to the post-peephole optimisation stage }
+                      Exit;
+                    end;
+                end;
+            A_SHL, A_SAL, A_SHR:
+              if (taicpu(hp1).opsize <= LimitSize) and
+                MatchOpType(taicpu(hp1), top_const, top_reg) and
+                SuperRegistersEqual(taicpu(hp1).oper[1]^.reg, taicpu(p).oper[1]^.reg) then
+                begin
+                  { Make sure the sizes don't exceed the register size limit
+                    (measured by the shift value falling below the limit) }
+
+                  if taicpu(hp1).opsize < LimitSize then
+                    LimitSize := taicpu(hp1).opsize;
+
+                  if taicpu(hp1).opcode = A_SHR then
+                    Inc(Shift, taicpu(hp1).oper[0]^.val)
+                  else
+                    begin
+                      Dec(Shift, taicpu(hp1).oper[0]^.val);
+                      DoNotMerge := True;
+                    end;
+
+                  if Shift < topsize2memsize[taicpu(p).opsize] - topsize2memsize[LimitSize] then
+                    Exit;
+
+                  { Since we've established that the combined shift is within
+                    limits, we can actually combine the adjacent SHR
+                    instructions even if they're different sizes }
+                  if not DoNotMerge and (taicpu(hp1).opcode = A_SHR) then
+                    begin
+                      hp2 := tai(hp1.Previous);
+                      DebugMsg(SPeepholeOptimization + 'ShrShr2Shr 1', p);
+                      Inc(taicpu(p).oper[0]^.val, taicpu(hp1).oper[0]^.val);
+                      RemoveInstruction(hp1);
+                      hp1 := hp2;
+
+                      { Though p has changed, only the constant has, and its
+                        effects can still be detected on the next iteration of
+                        the repeat..until loop }
+                      Include(OptsToCheck, aoc_ForceNewIteration);
+                    end;
+
+                  { Move onto the next instruction }
+                  Continue;
+                end;
+            else
+              ;
+          end;
+
+          Break;
+        until False;
+      end;
+
+
     function TX86AsmOptimizer.CheckMemoryWrite(var first_mov, second_mov: taicpu): Boolean;
       var
         CurrentRef: TReference;
@@ -12930,36 +13072,193 @@ unit aoptx86;
 
     function TX86AsmOptimizer.PostPeepholeOptShr(var p : tai) : boolean;
       var
-        hp1: tai;
+        hp1, hp2: tai;
+        IdentityMask, Shift: TCGInt;
+        LimitSize: Topsize;
+        DoNotMerge: Boolean;
       begin
-        { Detect:
-            shr    x,  %ax (x > 0)
-            ...
-            movzwl %ax,%eax
+        Result := False;
 
-          Change movzwl %ax,%eax to cwtl (shorter encoding for movswl %ax,%eax)
-        }
+        { All these optimisations work on "shr const,%reg" }
+        if not MatchOpType(taicpu(p), top_const, top_reg) then
+          Exit;
 
-        Result := False;
-        if MatchOpType(taicpu(p), top_const, top_reg) and
-          (taicpu(p).oper[1]^.reg = NR_AX) and { This is also enough to determine that opsize = S_W }
-          (taicpu(p).oper[0]^.val > 0) and
-          GetNextInstructionUsingReg(p, hp1, NR_EAX) and
-          MatchInstruction(hp1, A_MOVZX, [S_WL]) and
-          MatchOperand(taicpu(hp1).oper[0]^, NR_AX) and
-          MatchOperand(taicpu(hp1).oper[1]^, NR_EAX) then
-          begin
-            DebugMsg(SPeepholeOptimization + 'Converted movzwl %ax,%eax to cwtl (via ShrMovz2ShrCwtl)', hp1);
-            taicpu(hp1).opcode := A_CWDE;
-            taicpu(hp1).clearop(0);
-            taicpu(hp1).clearop(1);
-            taicpu(hp1).ops := 0;
+        DoNotMerge := False;
+        Shift := taicpu(p).oper[0]^.val;
+        LimitSize := taicpu(p).opsize;
 
-            { A change was made, but not with p, so move forward 1 }
-            p := tai(p.Next);
-            Result := True;
+        hp1 := p;
+        repeat
+          if not GetNextInstructionUsingReg(hp1, hp1, taicpu(p).oper[1]^.reg) or (hp1.typ <> ait_instruction) then
+            Exit;
+
+          { Detect:
+              shr x, %reg
+              and y, %reg
+
+            If and y, %reg doesn't actually change the value of %reg (e.g. with
+            "shrl $24,%reg; andl $255,%reg", remove the AND instruction.
+          }
+
+          case taicpu(hp1).opcode of
+            A_AND:
+              if (taicpu(hp1).opsize = taicpu(p).opsize) and
+                MatchOpType(taicpu(hp1), top_const, top_reg) and
+                (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) then
+                begin
+                  { Make sure the FLAGS register isn't in use }
+                  TransferUsedRegs(TmpUsedRegs);
+                  hp2 := p;
+                  repeat
+                    UpdateUsedRegs(TmpUsedRegs, tai(hp2.Next));
+                  until not GetNextInstruction(hp2, hp2) or (hp2 = hp1);
+
+                  if not RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp1, TmpUsedRegs) then
+                    begin
+                      { Generate the identity mask }
+                      case taicpu(p).opsize of
+                        S_B:
+                          IdentityMask := $FF shr Shift;
+                        S_W:
+                          IdentityMask := $FFFF shr Shift;
+                        S_L:
+                          IdentityMask := $FFFFFFFF shr Shift;
+{$ifdef x86_64}
+                        S_Q:
+                          { We need to force the operands to be unsigned 64-bit
+                            integers otherwise the wrong value is generated }
+                          IdentityMask := TCGInt(QWord($FFFFFFFFFFFFFFFF) shr QWord(Shift));
+{$endif x86_64}
+                        else
+                          InternalError(2022081501);
+                      end;
+
+                      if (taicpu(hp1).oper[0]^.val and IdentityMask) = IdentityMask then
+                        begin
+                          DebugMsg(SPeepholeOptimization + 'Removed AND instruction since previous SHR makes this an identity operation (ShrAnd2Shr)', hp1);
+                          { All the possible 1 bits are covered, so we can remove the AND }
+                          hp2 := tai(hp1.Previous);
+                          RemoveInstruction(hp1);
+
+                          { p wasn't actually changed, so don't set Result to True,
+                            but a change was nonetheless made elsewhere }
+                          Include(OptsToCheck, aoc_ForceNewIteration);
+
+                          { Do another pass in case other AND or MOVZX instructions
+                            follow }
+                          hp1 := hp2;
+                          Continue;
+                        end;
+
+                    end;
+                end;
+
+            A_TEST, A_CMP, A_Jcc:
+              { Skip over conditional jumps and relevant comparisons }
+              Continue;
+
+            A_MOVZX:
+              if MatchOpType(taicpu(hp1), top_reg, top_reg) and
+                SuperRegistersEqual(taicpu(hp1).oper[0]^.reg, taicpu(p).oper[1]^.reg) then
+                begin
+                  { Since the original register is being read as is, subsequent
+                    SHRs must not be merged at this point }
+                  DoNotMerge := True;
+
+                  if IsShrMovZFoldable(taicpu(p).opsize, taicpu(hp1).opsize, Shift) then
+                    begin
+                      if SuperRegistersEqual(taicpu(hp1).oper[0]^.reg, taicpu(hp1).oper[1]^.reg) then
+                        begin
+                          DebugMsg(SPeepholeOptimization + 'Removed MOVZX instruction since previous SHR makes it unnecessary (ShrMovz2Shr)', hp1);
+                          { All the possible 1 bits are covered, so we can remove the AND }
+                          hp2 := tai(hp1.Previous);
+                          RemoveInstruction(hp1);
+
+                          hp1 := hp2;
+                        end
+                      else { Different register target }
+                        begin
+                          DebugMsg(SPeepholeOptimization + 'Converted MOVZX instruction to MOV since previous SHR makes zero-extension unnecessary (ShrMovz2ShrMov 2)', hp1);
+                          taicpu(hp1).opcode := A_MOV;
+                          setsubreg(taicpu(hp1).oper[0]^.reg, getsubreg(taicpu(hp1).oper[1]^.reg));
+                          case taicpu(hp1).opsize of
+                            S_BW:
+                              taicpu(hp1).opsize := S_W;
+                            S_BL, S_WL:
+                              taicpu(hp1).opsize := S_L;
+                            else
+                              InternalError(2022081503);
+                          end;
+                        end;
+                    end
+                  else if (Shift > 0) and
+                    (taicpu(p).opsize = S_W) and
+                    (taicpu(hp1).opsize = S_WL) and
+                    (taicpu(hp1).oper[0]^.reg = NR_AX) and
+                    (taicpu(hp1).oper[1]^.reg = NR_EAX) then
+                    begin
+                      { Detect:
+                          shr    x,  %ax (x > 0)
+                          ...
+                          movzwl %ax,%eax
+
+                        Change movzwl %ax,%eax to cwtl (shorter encoding for movswl %ax,%eax)
+                      }
+                      DebugMsg(SPeepholeOptimization + 'Converted movzwl %ax,%eax to cwtl (via ShrMovz2ShrCwtl)', hp1);
+                      taicpu(hp1).opcode := A_CWDE;
+                      taicpu(hp1).clearop(0);
+                      taicpu(hp1).clearop(1);
+                      taicpu(hp1).ops := 0;
+                    end;
+
+                  { Move onto the next instruction }
+                  Continue;
+                end;
+
+            A_SHL, A_SAL, A_SHR:
+              if (taicpu(hp1).opsize <= LimitSize) and
+                MatchOpType(taicpu(hp1), top_const, top_reg) and
+                SuperRegistersEqual(taicpu(hp1).oper[1]^.reg, taicpu(p).oper[1]^.reg) then
+                begin
+                  { Make sure the sizes don't exceed the register size limit
+                    (measured by the shift value falling below the limit) }
+
+                  if taicpu(hp1).opsize < LimitSize then
+                    LimitSize := taicpu(hp1).opsize;
+
+                  if taicpu(hp1).opcode = A_SHR then
+                    Inc(Shift, taicpu(hp1).oper[0]^.val)
+                  else
+                    begin
+                      Dec(Shift, taicpu(hp1).oper[0]^.val);
+                      DoNotMerge := True;
+                    end;
+
+                  if Shift < topsize2memsize[taicpu(p).opsize] - topsize2memsize[LimitSize] then
+                    Exit;
+
+                  { Since we've established that the combined shift is within
+                    limits, we can actually combine the adjacent SHR
+                    instructions even if they're different sizes }
+                  if not DoNotMerge and (taicpu(hp1).opcode = A_SHR) then
+                    begin
+                      hp2 := tai(hp1.Previous);
+                      DebugMsg(SPeepholeOptimization + 'ShrShr2Shr 2', p);
+                      Inc(taicpu(p).oper[0]^.val, taicpu(hp1).oper[0]^.val);
+                      RemoveInstruction(hp1);
+                      hp1 := hp2;
+                    end;
+
+                  { Move onto the next instruction }
+                  Continue;
+                end;
+            else
+              ;
           end;
 
+          Break;
+        until False;
+
       end;
 
 

+ 2 - 0
compiler/x86_64/aoptcpu.pas

@@ -130,6 +130,8 @@ uses
                   result:=OptPass1Sub(p);
                 A_SHL,A_SAL:
                   result:=OptPass1SHLSAL(p);
+                A_SHR:
+                  result:=OptPass1SHR(p);
                 A_FSTP,A_FISTP:
                   result:=OptPass1FSTP(p);
                 A_FLD: