Browse Source

New MovxMovxOp2OpMovx optimisation

J. Gareth "Curious Kit" Moreton 3 years ago
parent
commit
da899df6b2
1 changed files with 515 additions and 47 deletions
  1. 515 47
      compiler/x86/aoptx86.pas

+ 515 - 47
compiler/x86/aoptx86.pas

@@ -9602,7 +9602,11 @@ unit aoptx86;
     function TX86AsmOptimizer.OptPass1Movx(var p : tai) : boolean;
       var
         hp1,hp2: tai;
-        reg_and_hp1_is_instr: Boolean;
+        reg_and_hp1_is_instr, RegUsed, AndTest: Boolean;
+        NewSize: TOpSize;
+        NewRegSize: TSubRegister;
+        Limit: TCgInt;
+        SwapOper: POper;
       begin
         result:=false;
         reg_and_hp1_is_instr:=(taicpu(p).oper[1]^.typ = top_reg) and
@@ -9678,8 +9682,11 @@ unit aoptx86;
             DebugMsg(SPeepholeOptimization + 'var3',p);
             RemoveCurrentP(p, hp1);
             RemoveInstruction(hp2);
-          end
-        else if reg_and_hp1_is_instr and
+            Result := True;
+            Exit;
+          end;
+
+        if reg_and_hp1_is_instr and
           (taicpu(hp1).opcode = A_MOV) and
           MatchOpType(taicpu(hp1),top_reg,top_reg) and
           (MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[0]^)
@@ -9716,9 +9723,12 @@ unit aoptx86;
 {$endif x86_64}
                   taicpu(p).loadreg(1,taicpu(hp1).oper[1]^.reg);
                 RemoveInstruction(hp1);
+                Result := True;
+                Exit;
               end;
-          end
-        else if reg_and_hp1_is_instr and
+          end;
+
+        if reg_and_hp1_is_instr and
           ((taicpu(hp1).opcode=A_MOV) or
            (taicpu(hp1).opcode=A_ADD) or
            (taicpu(hp1).opcode=A_SUB) or
@@ -9727,56 +9737,225 @@ unit aoptx86;
            (taicpu(hp1).opcode=A_XOR) or
            (taicpu(hp1).opcode=A_AND)
           ) and
-          MatchOpType(taicpu(hp1),top_reg,top_reg) and
-          (((taicpu(p).opsize in [S_BW,S_BL,S_WL{$ifdef x86_64},S_BQ,S_WQ,S_LQ{$endif x86_64}]) and
-           (taicpu(hp1).opsize=S_B)) or
-           ((taicpu(p).opsize in [S_WL{$ifdef x86_64},S_WQ,S_LQ{$endif x86_64}]) and
-           (taicpu(hp1).opsize=S_W))
+          (taicpu(hp1).oper[1]^.typ = top_reg) then
+          begin
+            AndTest := (taicpu(hp1).opcode=A_AND) and
+              GetNextInstruction(hp1, hp2) and
+              (hp2.typ = ait_instruction) and
+              (
+                (
+                  (taicpu(hp2).opcode=A_TEST) and
+                  MatchOperand(taicpu(hp2).oper[0]^, taicpu(hp1).oper[1]^.reg) and
+                  MatchOperand(taicpu(hp2).oper[1]^, taicpu(hp1).oper[1]^.reg)
+                ) or
+                (
+                  (taicpu(hp2).opcode=A_CMP) and
+                  MatchOperand(taicpu(hp2).oper[0]^, 0) and
+                  MatchOperand(taicpu(hp2).oper[1]^, taicpu(hp1).oper[1]^.reg)
+                )
+              );
+
+            if (taicpu(hp1).oper[0]^.typ = top_reg) and
+              (((taicpu(p).opsize in [S_BW,S_BL,S_WL{$ifdef x86_64},S_BQ,S_WQ,S_LQ{$endif x86_64}]) and
+               (taicpu(hp1).opsize=S_B)) or
+               ((taicpu(p).opsize in [S_WL{$ifdef x86_64},S_WQ,S_LQ{$endif x86_64}]) and
+               (taicpu(hp1).opsize=S_W))
 {$ifdef x86_64}
-           or ((taicpu(p).opsize=S_LQ) and
-            (taicpu(hp1).opsize=S_L))
+               or ((taicpu(p).opsize=S_LQ) and
+                (taicpu(hp1).opsize=S_L))
 {$endif x86_64}
-          ) and
-          SuperRegistersEqual(taicpu(p).oper[1]^.reg,taicpu(hp1).oper[0]^.reg) then
-          begin
-            { change
-              movx   %reg1,%reg2
-              mov    %reg2,%reg3
-              dealloc %reg2
+              ) and
+              SuperRegistersEqual(taicpu(p).oper[1]^.reg,taicpu(hp1).oper[0]^.reg) then
+              begin
+                { change
+                  movx   %reg1,%reg2
+                  op     %reg2,%reg3
+                  dealloc %reg2
 
-              into
+                  into
 
-              mov   %reg1,%reg3
+                  op     %reg1,%reg3
 
-              if the second mov accesses only the bits stored in reg1
-            }
-            TransferUsedRegs(TmpUsedRegs);
-            UpdateUsedRegs(TmpUsedRegs, tai(p.next));
-            if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,TmpUsedRegs)) then
-              begin
-                DebugMsg(SPeepholeOptimization + 'MovxOp2Op',p);
-                if taicpu(p).oper[0]^.typ=top_reg then
+                  if the second op accesses only the bits stored in reg1
+                }
+                TransferUsedRegs(TmpUsedRegs);
+                UpdateUsedRegs(TmpUsedRegs, tai(p.next));
+                if AndTest then
                   begin
-                    case taicpu(hp1).opsize of
-                      S_B:
-                        taicpu(hp1).loadreg(0,newreg(R_INTREGISTER,getsupreg(taicpu(p).oper[0]^.reg),R_SUBL));
-                      S_W:
-                        taicpu(hp1).loadreg(0,newreg(R_INTREGISTER,getsupreg(taicpu(p).oper[0]^.reg),R_SUBW));
-                      S_L:
-                        taicpu(hp1).loadreg(0,newreg(R_INTREGISTER,getsupreg(taicpu(p).oper[0]^.reg),R_SUBD));
-                      else
-                        Internalerror(2020102301);
+                    UpdateUsedRegs(TmpUsedRegs, tai(hp1.next));
+                    RegUsed := RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp2,TmpUsedRegs);
+                  end
+                else
+                  RegUsed := RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,TmpUsedRegs);
+
+                if not RegUsed then
+                  begin
+                    DebugMsg(SPeepholeOptimization + 'MovxOp2Op',p);
+                    if taicpu(p).oper[0]^.typ=top_reg then
+                      begin
+                        case taicpu(hp1).opsize of
+                          S_B:
+                            taicpu(hp1).loadreg(0,newreg(R_INTREGISTER,getsupreg(taicpu(p).oper[0]^.reg),R_SUBL));
+                          S_W:
+                            taicpu(hp1).loadreg(0,newreg(R_INTREGISTER,getsupreg(taicpu(p).oper[0]^.reg),R_SUBW));
+                          S_L:
+                            taicpu(hp1).loadreg(0,newreg(R_INTREGISTER,getsupreg(taicpu(p).oper[0]^.reg),R_SUBD));
+                          else
+                            Internalerror(2020102301);
+                        end;
+                        AllocRegBetween(taicpu(hp1).oper[0]^.reg,p,hp1,UsedRegs);
+                      end
+                    else
+                      taicpu(hp1).loadref(0,taicpu(p).oper[0]^.ref^);
+                    RemoveCurrentP(p);
+                    if AndTest then
+                      RemoveInstruction(hp2);
+                    result:=true;
+                    exit;
+                  end;
+              end
+            else if (taicpu(p).oper[1]^.reg = taicpu(hp1).oper[1]^.reg) and
+              (
+                { Bitwise operations only }
+                (taicpu(hp1).opcode=A_AND) or
+                (taicpu(hp1).opcode=A_TEST) or
+                (
+                  (taicpu(hp1).oper[0]^.typ = top_const) and
+                  (
+                    (taicpu(hp1).opcode=A_OR) or
+                    (taicpu(hp1).opcode=A_XOR)
+                  )
+                )
+              ) and
+              (
+                (taicpu(hp1).oper[0]^.typ = top_const) or
+                MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[1]^.reg) or
+                not RegInOp(taicpu(p).oper[1]^.reg, taicpu(hp1).oper[0]^)
+              ) then
+              begin
+                { change
+                  movx   %reg2,%reg2
+                  op     const,%reg2
+
+                  into
+                  op     const,%reg2  (smaller version)
+                  movx   %reg2,%reg2
+
+                  also change
+                  movx     %reg1,%reg2
+                  and/test (oper),%reg2
+                  dealloc %reg2
+
+                  into
+
+                  and/test (oper),%reg1
+                }
+                case taicpu(p).opsize of
+                  S_BW, S_BL{$ifdef x86_64}, S_BQ{$endif x86_64}:
+                    begin
+                      NewSize := S_B;
+                      NewRegSize := R_SUBL;
+                      Limit := $FF;
                     end;
-                    AllocRegBetween(taicpu(hp1).oper[0]^.reg,p,hp1,UsedRegs);
+                  S_WL{$ifdef x86_64}, S_WQ{$endif x86_64}:
+                    begin
+                      NewSize := S_W;
+                      NewRegSize := R_SUBW;
+                      Limit := $FFFF;
+                    end;
+{$ifdef x86_64}
+                  S_LQ:
+                    begin
+                      NewSize := S_L;
+                      NewRegSize := R_SUBD;
+                      Limit := $FFFFFFFF;
+                    end;
+{$endif x86_64}
+                  else
+                    Internalerror(2021120302);
+                end;
+
+                TransferUsedRegs(TmpUsedRegs);
+                UpdateUsedRegs(TmpUsedRegs, tai(p.next));
+                if AndTest then
+                  begin
+                    UpdateUsedRegs(TmpUsedRegs, tai(hp1.next));
+                    RegUsed := RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp2,TmpUsedRegs);
                   end
                 else
-                  taicpu(hp1).loadref(0,taicpu(p).oper[0]^.ref^);
-                RemoveCurrentP(p);
-                result:=true;
-                exit;
+                  RegUsed := RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,TmpUsedRegs);
+
+                if
+                  (
+                    (taicpu(p).opcode = A_MOVZX) and
+                    (
+                      (taicpu(hp1).opcode=A_AND) or
+                      (taicpu(hp1).opcode=A_TEST)
+                    ) and
+                    not (
+                      { If both are references, then the final instruction will have
+                        both operands as references, which is not allowed }
+                      (taicpu(p).oper[0]^.typ = top_ref) and
+                      (taicpu(hp1).oper[0]^.typ = top_ref)
+                    ) and
+                    not RegUsed
+                  ) or
+                  (
+                    (
+                      SuperRegistersEqual(taicpu(p).oper[0]^.reg, taicpu(p).oper[1]^.reg) or
+                      not RegUsed
+                    ) and
+                    (taicpu(p).oper[0]^.typ = top_reg) and
+                    SuperRegistersEqual(taicpu(p).oper[0]^.reg, taicpu(p).oper[1]^.reg) and
+                    (taicpu(hp1).oper[0]^.typ = top_const) and
+                    ((taicpu(hp1).oper[0]^.val and Limit) = taicpu(hp1).oper[0]^.val)
+                  ) then
+                  begin
+                    DebugMsg(SPeepholeOptimization + 'MovxOp2Op 2',p);
+
+                    if AndTest and not RegUsed then
+                      taicpu(hp1).opcode := A_TEST;
+
+                    taicpu(hp1).opsize := NewSize;
+
+                    case taicpu(hp1).oper[0]^.typ of
+                      top_reg:
+                        setsubreg(taicpu(hp1).oper[0]^.reg, NewRegSize);
+                      top_const:
+                        { For the AND/TEST case }
+                        taicpu(hp1).oper[0]^.val := taicpu(hp1).oper[0]^.val and Limit;
+                      else
+                        ;
+                    end;
+
+                    taicpu(hp1).loadoper(1, taicpu(p).oper[0]^);
+                    if (taicpu(hp1).opcode = A_TEST) and (taicpu(hp1).oper[0]^.typ = top_ref) then
+                      begin
+                        { For TEST, make sure the reference is the second operand }
+                        SwapOper := taicpu(hp1).oper[0];
+                        taicpu(hp1).oper[0] := taicpu(hp1).oper[1];
+                        taicpu(hp1).oper[1] := SwapOper;
+                      end;
+
+                    if AndTest then
+                      RemoveInstruction(hp2);
+
+                    if RegUsed then
+                      begin
+                        AsmL.Remove(p);
+                        AsmL.InsertAfter(p, hp1);
+                        p := hp1;
+                      end
+                    else
+                      RemoveCurrentP(p, hp1);
+
+                    result:=true;
+                    exit;
+                  end;
               end;
-          end
-        else if reg_and_hp1_is_instr and
+          end;
+
+        if reg_and_hp1_is_instr and
           (taicpu(p).oper[0]^.typ = top_reg) and
           (
             (taicpu(hp1).opcode = A_SHL) or (taicpu(hp1).opcode = A_SAL)
@@ -9853,8 +10032,183 @@ unit aoptx86;
               DebugMsg(SPeepholeOptimization + 'MovsSar2SarMovs', hp1);
 
             Result := True;
-          end
-        else if taicpu(p).opcode=A_MOVZX then
+          end;
+
+        if reg_and_hp1_is_instr and
+          (taicpu(p).oper[0]^.typ = top_reg) and
+          SuperRegistersEqual(taicpu(p).oper[0]^.reg, taicpu(p).oper[1]^.reg) and
+          (
+            (taicpu(hp1).opcode = taicpu(p).opcode)
+            or ((taicpu(p).opcode = A_MOVZX) and ((taicpu(hp1).opcode = A_MOVSX){$ifdef x86_64} or (taicpu(hp1).opcode = A_MOVSXD){$endif x86_64}))
+{$ifdef x86_64}
+            or ((taicpu(p).opcode = A_MOVSX) and (taicpu(hp1).opcode = A_MOVSXD))
+{$endif x86_64}
+          ) then
+          begin
+            if MatchOpType(taicpu(hp1), top_reg, top_reg) and
+              (taicpu(p).oper[1]^.reg = taicpu(hp1).oper[0]^.reg) and
+              SuperRegistersEqual(taicpu(hp1).oper[0]^.reg, taicpu(hp1).oper[1]^.reg) then
+              begin
+                {
+                  For example:
+   	            movzbw  %al,%ax
+    	            movzwl  %ax,%eax
+
+                  Compress into:
+ 	            movzbl  %al,%eax
+                }
+                RegUsed := False;
+                case taicpu(p).opsize of
+                  S_BW:
+                    case taicpu(hp1).opsize of
+                      S_WL:
+                        begin
+                          taicpu(p).opsize := S_BL;
+                          RegUsed := True;
+                        end;
+{$ifdef x86_64}
+                      S_WQ:
+                        begin
+                          if taicpu(p).opcode = A_MOVZX then
+                            taicpu(p).opsize := S_BL
+                          else
+                            taicpu(p).opsize := S_BQ;
+                          RegUsed := True;
+                        end;
+{$endif x86_64}
+                      else
+                        ;
+                    end;
+{$ifdef x86_64}
+                  S_BL:
+                    case taicpu(hp1).opsize of
+                      S_LQ:
+                        begin
+                          if taicpu(p).opcode = A_MOVZX then
+                            taicpu(p).opsize := S_BL
+                          else
+                            taicpu(p).opsize := S_BQ;
+                          RegUsed := True;
+                        end;
+                      else
+                        ;
+                    end;
+                  S_WL:
+                    case taicpu(hp1).opsize of
+                      S_LQ:
+                        begin
+                          if taicpu(p).opcode = A_MOVZX then
+                            taicpu(p).opsize := S_WL
+                          else
+                            taicpu(p).opsize := S_WQ;
+                          RegUsed := True;
+                        end;
+                      else
+                        ;
+                    end;
+{$endif x86_64}
+                  else
+                    ;
+                end;
+
+                if RegUsed then
+                  begin
+                    DebugMsg(SPeepholeOptimization + 'MovxMovx2Movx', p);
+                    taicpu(p).oper[1]^.reg := taicpu(hp1).oper[1]^.reg;
+                    RemoveInstruction(hp1);
+                    Result := True;
+                    Exit;
+                  end;
+              end;
+
+            if (taicpu(hp1).opsize = taicpu(p).opsize) and
+              not RegInInstruction(taicpu(p).oper[1]^.reg, hp1) and
+              GetNextInstruction(hp1, hp2) and
+              MatchInstruction(hp2, [A_AND, A_OR, A_XOR, A_TEST], []) and
+              (
+                ((taicpu(hp2).opsize = S_W) and (taicpu(p).opsize = S_BW)) or
+                ((taicpu(hp2).opsize = S_L) and (taicpu(p).opsize in [S_BL, S_WL]))
+{$ifdef x86_64}
+                or ((taicpu(hp2).opsize = S_Q) and (taicpu(p).opsize in [S_BL, S_BQ, S_WL, S_WQ, S_LQ]))
+{$endif x86_64}
+              ) and
+              MatchOpType(taicpu(hp2), top_reg, top_reg) and
+              (
+                (
+                  (taicpu(hp2).oper[0]^.reg = taicpu(hp1).oper[1]^.reg) and
+                  (taicpu(hp2).oper[1]^.reg = taicpu(p).oper[1]^.reg)
+                ) or
+                (
+                  { Only allow the operands in reverse order for TEST instructions }
+                  (taicpu(hp2).opcode = A_TEST) and
+                  (taicpu(hp2).oper[0]^.reg = taicpu(p).oper[1]^.reg) and
+                  (taicpu(hp2).oper[1]^.reg = taicpu(hp1).oper[1]^.reg)
+                )
+              ) then
+              begin
+                {
+                  For example:
+      	            movzbl  %al,%eax
+      	            movzbl  (ref),%edx
+      	            andl    %edx,%eax
+                    (%edx deallocated)
+
+                  Change to:
+        	    andb	(ref),%al
+        	    movzbl	%al,%eax
+
+                  Rules are:
+                  - First two instructions have the same opcode and opsize
+                  - First instruction's operands are the same super-register
+                  - Second instruction operates on a different register
+                  - Third instruction is AND, OR, XOR or TEST
+                  - Third instruction's operands are the destination registers of the first two instructions
+                  - Third instruction writes to the destination register of the first instruction (except with TEST)
+                  - Second instruction's destination register is deallocated afterwards
+                }
+                TransferUsedRegs(TmpUsedRegs);
+                UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
+                UpdateUsedRegs(TmpUsedRegs, tai(hp1.Next));
+                if not RegUsedAfterInstruction(taicpu(hp1).oper[1]^.reg, hp2, TmpUsedRegs) then
+                  begin
+                    case taicpu(p).opsize of
+                      S_BW, S_BL{$ifdef x86_64}, S_BQ{$endif x86_64}:
+                        NewSize := S_B;
+                      S_WL{$ifdef x86_64}, S_WQ{$endif x86_64}:
+                        NewSize := S_W;
+{$ifdef x86_64}
+                      S_LQ:
+                        NewSize := S_L;
+{$endif x86_64}
+                      else
+                        InternalError(2021120301);
+                    end;
+
+                    taicpu(hp2).loadoper(0, taicpu(hp1).oper[0]^);
+                    taicpu(hp2).loadreg(1, taicpu(p).oper[0]^.reg);
+                    taicpu(hp2).opsize := NewSize;
+
+                    RemoveInstruction(hp1);
+
+                    { With TEST, it's best to keep the MOVX instruction at the top }
+                    if (taicpu(hp2).opcode <> A_TEST) then
+                      begin
+                        DebugMsg(SPeepholeOptimization + 'MovxMovxTest2MovxTest', p);
+                        asml.Remove(p);
+                         { If the third instruction uses the flags, the MOVX instruction won't modify them }
+                        asml.InsertAfter(p, hp2);
+                        p := hp2;
+                      end
+                    else
+                      DebugMsg(SPeepholeOptimization + 'MovxMovxOp2OpMovx', p);
+
+                    Result := True;
+                    Exit;
+                  end;
+              end;
+          end;
+
+        if taicpu(p).opcode=A_MOVZX then
           begin
             { removes superfluous And's after movzx's }
             if reg_and_hp1_is_instr and
@@ -10103,6 +10457,7 @@ unit aoptx86;
         hp1, hp2 : tai;
         MaskLength : Cardinal;
         MaskedBits : TCgInt;
+        ActiveReg : TRegister;
       begin
         Result:=false;
 
@@ -10316,6 +10671,119 @@ unit aoptx86;
                   else
                     ;
                 end;
+              end
+            else if MatchOperand(taicpu(p).oper[0]^, taicpu(p).oper[1]^.reg) and
+              not RegInUsedRegs(NR_DEFAULTFLAGS, UsedRegs) then
+              begin
+{$ifdef x86_64}
+                if (taicpu(p).opsize = S_Q) then
+                  begin
+                    { Never necessary }
+                    DebugMsg(SPeepholeOptimization + 'Andq2Nop', p);
+                    RemoveCurrentP(p, hp1);
+                    Result := True;
+                    Exit;
+                  end;
+{$endif x86_64}
+                { Forward check to determine necessity of and %reg,%reg }
+                TransferUsedRegs(TmpUsedRegs);
+                UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
+
+                { Saves on a bunch of dereferences }
+                ActiveReg := taicpu(p).oper[1]^.reg;
+
+                case taicpu(hp1).opcode of
+                  A_MOV, A_MOVZX, A_MOVSX{$ifdef x86_64}, A_MOVSXD{$endif x86_64}:
+
+                    if (
+                        (taicpu(hp1).oper[0]^.typ <> top_ref) or
+                        not RegInRef(ActiveReg, taicpu(hp1).oper[0]^.ref^)
+                      ) and
+                      (
+                        (taicpu(hp1).opcode <> A_MOV) or
+                        (taicpu(hp1).oper[1]^.typ <> top_ref) or
+                        not RegInRef(ActiveReg, taicpu(hp1).oper[1]^.ref^)
+                      ) and
+                      not (
+                        { If mov %reg,%reg is present, remove that instruction instead in OptPass1MOV }
+                        (taicpu(hp1).opcode = A_MOV) and
+                        MatchOperand(taicpu(hp1).oper[0]^, ActiveReg) and
+                        MatchOperand(taicpu(hp1).oper[1]^, ActiveReg)
+                      ) and
+                      (
+                        (
+                          (taicpu(hp1).oper[0]^.typ = top_reg) and
+                          (taicpu(hp1).oper[0]^.reg = ActiveReg) and
+                          SuperRegistersEqual(taicpu(hp1).oper[0]^.reg, taicpu(hp1).oper[1]^.reg)
+                        ) or
+                        (
+{$ifdef x86_64}
+                          (
+                            { If we read from the register, make sure it's not dependent on the upper 32 bits }
+                            (taicpu(hp1).oper[0]^.typ <> top_reg) or
+                            not SuperRegistersEqual(taicpu(hp1).oper[0]^.reg, ActiveReg) or
+                            (GetSubReg(taicpu(hp1).oper[0]^.reg) <> R_SUBQ)
+                          ) and
+{$endif x86_64}
+                          not RegUsedAfterInstruction(ActiveReg, hp1, TmpUsedRegs)
+                        )
+                      ) then
+                      begin
+                        DebugMsg(SPeepholeOptimization + 'AndMovx2Movx', p);
+                        RemoveCurrentP(p, hp1);
+                        Result := True;
+                        Exit;
+                      end;
+                  A_ADD,
+                  A_AND,
+                  A_BSF,
+                  A_BSR,
+                  A_BTC,
+                  A_BTR,
+                  A_BTS,
+                  A_OR,
+                  A_SUB,
+                  A_XOR:
+                    { Register is written to, so this will clear the upper 32 bits (2-operand instructions) }
+                    if (
+                        (taicpu(hp1).oper[0]^.typ <> top_ref) or
+                        not RegInRef(ActiveReg, taicpu(hp1).oper[0]^.ref^)
+                      ) and
+                      MatchOperand(taicpu(hp1).oper[1]^, ActiveReg) then
+                      begin
+                        DebugMsg(SPeepholeOptimization + 'AndOp2Op 2', p);
+                        RemoveCurrentP(p, hp1);
+                        Result := True;
+                        Exit;
+                      end;
+                  A_CMP,
+                  A_TEST:
+                    if (
+                        (taicpu(hp1).oper[0]^.typ <> top_ref) or
+                        not RegInRef(ActiveReg, taicpu(hp1).oper[0]^.ref^)
+                      ) and
+                      MatchOperand(taicpu(hp1).oper[1]^, ActiveReg) and
+                      not RegUsedAfterInstruction(ActiveReg, hp1, TmpUsedRegs) then
+                      begin
+                        DebugMsg(SPeepholeOptimization + 'AND; CMP/TEST -> CMP/TEST', p);
+                        RemoveCurrentP(p, hp1);
+                        Result := True;
+                        Exit;
+                      end;
+                  A_BSWAP,
+                  A_NEG,
+                  A_NOT:
+                  { Register is written to, so this will clear the upper 32 bits (1-operand instructions) }
+                  if MatchOperand(taicpu(hp1).oper[0]^, ActiveReg) then
+                    begin
+                      DebugMsg(SPeepholeOptimization + 'AndOp2Op 1', p);
+                      RemoveCurrentP(p, hp1);
+                      Result := True;
+                      Exit;
+                    end;
+                  else
+                    ;
+                end;
               end;
 
             if (taicpu(hp1).is_jmp) and