Browse Source

* x86: Fixed bug and refactored optimisations where SHR instructions were
merged incorrectly when a condition appears between them

J. Gareth "Curious Kit" Moreton 4 months ago
parent
commit
b611c0011d
1 changed files with 164 additions and 200 deletions
  1. 164 200
      compiler/x86/aoptx86.pas

+ 164 - 200
compiler/x86/aoptx86.pas

@@ -164,6 +164,7 @@ unit aoptx86;
         function DoArithCombineOpt(var p : tai) : Boolean;
         function DoMovCmpMemOpt(var p : tai; const hp1: tai) : Boolean;
         function DoSETccLblRETOpt(var p: tai; const hp_label: tai_label) : Boolean;
+        function HandleSHRMerge(var p: tai; const PostPeephole: Boolean): Boolean;
 
         function PrePeepholeOptSxx(var p : tai) : boolean;
         function PrePeepholeOptIMUL(var p : tai) : boolean;
@@ -7732,19 +7733,18 @@ unit aoptx86;
         end;
       end;
 
-    function TX86AsmOptimizer.OptPass1SHR(var p : tai) : boolean;
+
+    function TX86AsmOptimizer.HandleSHRMerge(var p: tai; const PostPeephole: Boolean): Boolean;
       var
         hp1, hp2: tai;
-        Shift: TCGInt;
+        IdentityMask, Shift: TCGInt;
         LimitSize: Topsize;
         DoNotMerge: Boolean;
       begin
-        Result := False;
-
-        { All these optimisations work on "shr const,%reg" }
-        if not MatchOpType(taicpu(p), top_const, top_reg) then
-          Exit;
+        if not MatchInstruction(p, A_SHR, []) then
+          InternalError(2025040301);
 
+        Result := False;
         DoNotMerge := False;
         Shift := taicpu(p).oper[0]^.val;
         LimitSize := taicpu(p).opsize;
@@ -7755,7 +7755,77 @@ unit aoptx86;
             Exit;
 
           case taicpu(hp1).opcode of
-            A_TEST, A_CMP, A_Jcc:
+            A_AND:
+              { Detect:
+                  shr x, %reg
+                  and y, %reg
+
+                If and y, %reg doesn't actually change the value of %reg (e.g. with
+                "shrl $24,%reg; andl $255,%reg", remove the AND instruction.
+
+                (Post-peephole only)
+              }
+              if PostPeephole and
+                (taicpu(hp1).opsize = taicpu(p).opsize) and
+                MatchOpType(taicpu(hp1), top_const, top_reg) and
+                (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) then
+                begin
+                  { Make sure the FLAGS register isn't in use }
+                  TransferUsedRegs(TmpUsedRegs);
+                  hp2 := p;
+                  repeat
+                    UpdateUsedRegs(TmpUsedRegs, tai(hp2.Next));
+                  until not GetNextInstruction(hp2, hp2) or (hp2 = hp1);
+
+                  if not RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp1, TmpUsedRegs) then
+                    begin
+                      { Generate the identity mask }
+                      case taicpu(p).opsize of
+                        S_B:
+                          IdentityMask := $FF shr Shift;
+                        S_W:
+                          IdentityMask := $FFFF shr Shift;
+                        S_L:
+                          IdentityMask := $FFFFFFFF shr Shift;
+{$ifdef x86_64}
+                        S_Q:
+                          { We need to force the operands to be unsigned 64-bit
+                            integers otherwise the wrong value is generated }
+                          IdentityMask := TCGInt(QWord($FFFFFFFFFFFFFFFF) shr QWord(Shift));
+{$endif x86_64}
+                        else
+                          InternalError(2022081501);
+                      end;
+
+                      if (taicpu(hp1).oper[0]^.val and IdentityMask) = IdentityMask then
+                        begin
+                          DebugMsg(SPeepholeOptimization + 'Removed AND instruction since previous SHR makes this an identity operation (ShrAnd2Shr)', hp1);
+                          { All the possible 1 bits are covered, so we can remove the AND }
+                          hp2 := tai(hp1.Previous);
+                          RemoveInstruction(hp1);
+
+                          { p wasn't actually changed, so don't set Result to True,
+                            but a change was nonetheless made elsewhere }
+                          Include(OptsToCheck, aoc_ForceNewIteration);
+
+                          { Do another pass in case other AND or MOVZX instructions
+                            follow }
+                          hp1 := hp2;
+                          Continue;
+                        end;
+
+                    end;
+                end;
+
+            A_TEST, A_CMP:
+              { Skip over relevant comparisons, but shift instructions must
+                now not be merged since the original value is being read }
+              begin
+                DoNotMerge := True;
+                Continue;
+              end;
+
+            A_Jcc:
               { Skip over conditional jumps and relevant comparisons }
               Continue;
 
@@ -7769,7 +7839,21 @@ unit aoptx86;
 
                   if IsShrMovZFoldable(taicpu(p).opsize, taicpu(hp1).opsize, Shift) then
                     begin
-                      if not SuperRegistersEqual(taicpu(hp1).oper[0]^.reg, taicpu(hp1).oper[1]^.reg) then { Different register target }
+                      if SuperRegistersEqual(taicpu(hp1).oper[0]^.reg, taicpu(hp1).oper[1]^.reg) then
+                        begin
+                          { If the MOVZX instruction reads and writes the same register,
+                            defer this to the post-peephole optimisation stage }
+                          if PostPeephole then
+                            begin
+                              DebugMsg(SPeepholeOptimization + 'Removed MOVZX instruction since previous SHR makes it unnecessary (ShrMovz2Shr)', hp1);
+                              { All the possible 1 bits are covered, so we can remove the MOVZX }
+                              hp2 := tai(hp1.Previous);
+                              RemoveInstruction(hp1);
+
+                              hp1 := hp2;
+                            end;
+                        end
+                      else { Different register target }
                         begin
                           DebugMsg(SPeepholeOptimization + 'Converted MOVZX instruction to MOV since previous SHR makes zero-extension unnecessary (ShrMovz2ShrMov 1)', hp1);
                           taicpu(hp1).opcode := A_MOV;
@@ -7791,10 +7875,49 @@ unit aoptx86;
                           Continue;
                         end;
 
-                      { NOTE: If the MOVZX instruction reads and writes the same
-                        register, defer this to the post-peephole optimisation stage }
                       Exit;
+                    end
+                  else if PostPeephole and
+                    (Shift > 0) and
+                    (taicpu(p).opsize = S_W) and
+                    (taicpu(hp1).opsize = S_WL) and
+                    (taicpu(hp1).oper[0]^.reg = NR_AX) and
+                    (taicpu(hp1).oper[1]^.reg = NR_EAX) then
+                    begin
+                      { Detect:
+                          shr    x,  %ax (x > 0)
+                          ...
+                          movzwl %ax,%eax
+-
+                        Change movzwl %ax,%eax to cwtl (shorter encoding for movswl %ax,%eax)
+
+                        But first, check to see if movzwl %ax,%eax can be removed...
+                      }
+                      hp2 := tai(hp1.Previous);
+
+                      TransferUsedRegs(TmpUsedRegs);
+                      UpdateUsedRegsBetween(UsedRegs, p, hp1);
+
+                      if PostPeepholeOptMovZX(hp1) then
+                        hp1 := hp2
+                      else
+                        begin
+                          DebugMsg(SPeepholeOptimization + 'Converted movzwl %ax,%eax to cwtl (via ShrMovz2ShrCwtl)', hp1);
+                          taicpu(hp1).opcode := A_CWDE;
+                          taicpu(hp1).clearop(0);
+                          taicpu(hp1).clearop(1);
+                          taicpu(hp1).ops := 0;
+                        end;
+
+                      RestoreUsedRegs(TmpUsedRegs);
+                      { Don't need to set aoc_ForceNewIteration if
+                        PostPeepholeOptMovZX returned True because it's the
+                        post-peephole stage }
+
                     end;
+
+                  { Move onto the next instruction }
+                  Continue;
                 end;
             A_SHL, A_SAL, A_SHR:
               if (taicpu(hp1).opsize <= LimitSize) and
@@ -7842,11 +7965,36 @@ unit aoptx86;
               ;
           end;
 
+          { If the register isn't actually modified, move onto the next instruction,
+            but set DoNotMerge to True since the register is being read }
+          if (
+              { Under -O2 and below, GetNextInstructionUsingReg only returns
+                the next instruction, whether or not it contains the register }
+              (cs_opt_level3 in current_settings.optimizerswitches) or
+              RegReadByInstruction(taicpu(p).oper[1]^.reg, hp1)
+            ) and not RegModifiedByInstruction(taicpu(p).oper[1]^.reg, hp1) then
+            begin
+              DoNotMerge := True;
+              Continue;
+            end;
+
           Break;
         until False;
       end;
 
 
+    function TX86AsmOptimizer.OptPass1SHR(var p : tai) : boolean;
+      begin
+        Result := False;
+
+        { All these optimisations work on "shr const,%reg" }
+        if not MatchOpType(taicpu(p), top_const, top_reg) then
+          Exit;
+
+        Result := HandleSHRMerge(p, False);
+      end;
+
+
     function TX86AsmOptimizer.CheckMemoryWrite(var first_mov, second_mov: taicpu): Boolean;
       var
         CurrentRef: TReference;
@@ -16576,10 +16724,7 @@ unit aoptx86;
 
     function TX86AsmOptimizer.PostPeepholeOptShr(var p : tai) : boolean;
       var
-        hp1, hp2: tai;
-        IdentityMask, Shift: TCGInt;
-        LimitSize: Topsize;
-        DoNotMerge: Boolean;
+        hp1: tai;
       begin
         Result := False;
 
@@ -16587,193 +16732,12 @@ unit aoptx86;
         if not MatchOpType(taicpu(p), top_const, top_reg) then
           Exit;
 
-        DoNotMerge := False;
-        Shift := taicpu(p).oper[0]^.val;
-        LimitSize := taicpu(p).opsize;
-
-        hp1 := p;
-        repeat
-          if not GetNextInstructionUsingReg(hp1, hp1, taicpu(p).oper[1]^.reg) or (hp1.typ <> ait_instruction) then
-            Break;
-
-          { Detect:
-              shr x, %reg
-              and y, %reg
-
-            If and y, %reg doesn't actually change the value of %reg (e.g. with
-            "shrl $24,%reg; andl $255,%reg", remove the AND instruction.
-          }
-
-          case taicpu(hp1).opcode of
-            A_AND:
-              if (taicpu(hp1).opsize = taicpu(p).opsize) and
-                MatchOpType(taicpu(hp1), top_const, top_reg) and
-                (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) then
-                begin
-                  { Make sure the FLAGS register isn't in use }
-                  TransferUsedRegs(TmpUsedRegs);
-                  hp2 := p;
-                  repeat
-                    UpdateUsedRegs(TmpUsedRegs, tai(hp2.Next));
-                  until not GetNextInstruction(hp2, hp2) or (hp2 = hp1);
-
-                  if not RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp1, TmpUsedRegs) then
-                    begin
-                      { Generate the identity mask }
-                      case taicpu(p).opsize of
-                        S_B:
-                          IdentityMask := $FF shr Shift;
-                        S_W:
-                          IdentityMask := $FFFF shr Shift;
-                        S_L:
-                          IdentityMask := $FFFFFFFF shr Shift;
-{$ifdef x86_64}
-                        S_Q:
-                          { We need to force the operands to be unsigned 64-bit
-                            integers otherwise the wrong value is generated }
-                          IdentityMask := TCGInt(QWord($FFFFFFFFFFFFFFFF) shr QWord(Shift));
-{$endif x86_64}
-                        else
-                          InternalError(2022081501);
-                      end;
-
-                      if (taicpu(hp1).oper[0]^.val and IdentityMask) = IdentityMask then
-                        begin
-                          DebugMsg(SPeepholeOptimization + 'Removed AND instruction since previous SHR makes this an identity operation (ShrAnd2Shr)', hp1);
-                          { All the possible 1 bits are covered, so we can remove the AND }
-                          hp2 := tai(hp1.Previous);
-                          RemoveInstruction(hp1);
-
-                          { p wasn't actually changed, so don't set Result to True,
-                            but a change was nonetheless made elsewhere }
-                          Include(OptsToCheck, aoc_ForceNewIteration);
-
-                          { Do another pass in case other AND or MOVZX instructions
-                            follow }
-                          hp1 := hp2;
-                          Continue;
-                        end;
-
-                    end;
-                end;
-
-            A_TEST, A_CMP, A_Jcc:
-              { Skip over conditional jumps and relevant comparisons }
-              Continue;
-
-            A_MOVZX:
-              if MatchOpType(taicpu(hp1), top_reg, top_reg) and
-                SuperRegistersEqual(taicpu(hp1).oper[0]^.reg, taicpu(p).oper[1]^.reg) then
-                begin
-                  { Since the original register is being read as is, subsequent
-                    SHRs must not be merged at this point }
-                  DoNotMerge := True;
-
-                  if IsShrMovZFoldable(taicpu(p).opsize, taicpu(hp1).opsize, Shift) then
-                    begin
-                      if SuperRegistersEqual(taicpu(hp1).oper[0]^.reg, taicpu(hp1).oper[1]^.reg) then
-                        begin
-                          DebugMsg(SPeepholeOptimization + 'Removed MOVZX instruction since previous SHR makes it unnecessary (ShrMovz2Shr)', hp1);
-                          { All the possible 1 bits are covered, so we can remove the AND }
-                          hp2 := tai(hp1.Previous);
-                          RemoveInstruction(hp1);
-
-                          hp1 := hp2;
-                        end
-                      else { Different register target }
-                        begin
-                          DebugMsg(SPeepholeOptimization + 'Converted MOVZX instruction to MOV since previous SHR makes zero-extension unnecessary (ShrMovz2ShrMov 2)', hp1);
-                          taicpu(hp1).opcode := A_MOV;
-                          setsubreg(taicpu(hp1).oper[0]^.reg, getsubreg(taicpu(hp1).oper[1]^.reg));
-                          case taicpu(hp1).opsize of
-                            S_BW:
-                              taicpu(hp1).opsize := S_W;
-                            S_BL, S_WL:
-                              taicpu(hp1).opsize := S_L;
-                            else
-                              InternalError(2022081503);
-                          end;
-                        end;
-                    end
-                  else if (Shift > 0) and
-                    (taicpu(p).opsize = S_W) and
-                    (taicpu(hp1).opsize = S_WL) and
-                    (taicpu(hp1).oper[0]^.reg = NR_AX) and
-                    (taicpu(hp1).oper[1]^.reg = NR_EAX) then
-                    begin
-                      { Detect:
-                          shr    x,  %ax (x > 0)
-                          ...
-                          movzwl %ax,%eax
-
-                        Change movzwl %ax,%eax to cwtl (shorter encoding for movswl %ax,%eax)
-                      }
-                      DebugMsg(SPeepholeOptimization + 'Converted movzwl %ax,%eax to cwtl (via ShrMovz2ShrCwtl)', hp1);
-                      taicpu(hp1).opcode := A_CWDE;
-                      taicpu(hp1).clearop(0);
-                      taicpu(hp1).clearop(1);
-                      taicpu(hp1).ops := 0;
-                    end;
-
-                  { Move onto the next instruction }
-                  Continue;
-                end;
-
-            A_SHL, A_SAL, A_SHR:
-              if (taicpu(hp1).opsize <= LimitSize) and
-                MatchOpType(taicpu(hp1), top_const, top_reg) and
-                SuperRegistersEqual(taicpu(hp1).oper[1]^.reg, taicpu(p).oper[1]^.reg) then
-                begin
-                  { Make sure the sizes don't exceed the register size limit
-                    (measured by the shift value falling below the limit) }
-
-                  if taicpu(hp1).opsize < LimitSize then
-                    LimitSize := taicpu(hp1).opsize;
-
-                  if taicpu(hp1).opcode = A_SHR then
-                    Inc(Shift, taicpu(hp1).oper[0]^.val)
-                  else
-                    begin
-                      Dec(Shift, taicpu(hp1).oper[0]^.val);
-                      DoNotMerge := True;
-                    end;
-
-                  if Shift < topsize2memsize[taicpu(p).opsize] - topsize2memsize[LimitSize] then
-                    Break;
-
-                  { Since we've established that the combined shift is within
-                    limits, we can actually combine the adjacent SHR
-                    instructions even if they're different sizes }
-                  if not DoNotMerge and (taicpu(hp1).opcode = A_SHR) then
-                    begin
-                      hp2 := tai(hp1.Previous);
-                      DebugMsg(SPeepholeOptimization + 'ShrShr2Shr 2', p);
-                      Inc(taicpu(p).oper[0]^.val, taicpu(hp1).oper[0]^.val);
-                      RemoveInstruction(hp1);
-                      hp1 := hp2;
-                    end;
-
-                  { Move onto the next instruction }
-                  Continue;
-                end;
-            else
-              { If the register isn't actually modified, move onto the next instruction,
-                but set DoNotMerge to True since the register is being read }
-              if (
-                  { Under -O2 and below, GetNextInstructionUsingReg only returns
-                    the next instruction, whether or not it contains the register }
-                  (cs_opt_level3 in current_settings.optimizerswitches) or
-                  RegReadByInstruction(taicpu(p).oper[1]^.reg, hp1)
-                ) and not RegModifiedByInstruction(taicpu(p).oper[1]^.reg, hp1) then
-                begin
-                  DoNotMerge := True;
-                  Continue;
-                end;
+        if HandleSHRMerge(p, True) then
+          begin
+            Result := True;
+            Exit;
           end;
 
-          Break;
-        until False;
-
         { Detect the following (looking backwards):
             shr %cl,%reg
             shr x,  %reg