Pārlūkot izejas kodu

+ patch by J. Gareth Moreton: Advanced MOVZX optimisations, resolves #38294

git-svn-id: trunk@48086 -
florian 4 gadi atpakaļ
vecāks
revīzija
28efcfba65
3 mainītis faili ar 713 papildinājumiem un 3 dzēšanām
  1. 7 1
      compiler/i386/aoptcpu.pas
  2. 702 2
      compiler/x86/aoptx86.pas
  3. 4 0
      compiler/x86_64/aoptcpu.pas

+ 7 - 1
compiler/i386/aoptcpu.pas

@@ -252,6 +252,8 @@ unit aoptcpu;
                   Result:=OptPass2Jmp(p);
                 A_MOV:
                   Result:=OptPass2MOV(p);
+                A_MOVZX:
+                  Result:=OptPass2Movx(p);
                 A_SUB:
                   Result:=OptPass2SUB(p);
                 else
@@ -288,7 +290,9 @@ unit aoptcpu;
                   {   "cmpl $3,%eax; movzbl 8(%ebp),%ebx; je .Lxxx"           }
                   { so we can't safely replace the movzx then with xor/mov,   }
                   { since that would change the flags (JM)                    }
-                  if not(cs_opt_regvar in current_settings.optimizerswitches) then
+                  if PostPeepholeOptMovzx(p) then
+                    Result := True
+                  else if not(cs_opt_regvar in current_settings.optimizerswitches) then
                     begin
                       if (taicpu(p).oper[1]^.typ = top_reg) then
                         if (taicpu(p).oper[0]^.typ = top_reg)
@@ -340,6 +344,8 @@ unit aoptcpu;
                   Result:=PostPeepholeOptAnd(p);
                 A_MOVSX:
                   Result:=PostPeepholeOptMOVSX(p);
+                A_SHR:
+                  Result:=PostPeepholeOptShr(p);
                 else
                   ;
               end;

+ 702 - 2
compiler/x86/aoptx86.pas

@@ -140,6 +140,7 @@ unit aoptx86;
         function OptPass1VPXor(var p: tai): boolean;
         function OptPass1Imul(var p : tai) : boolean;
 
+        function OptPass2Movx(var p : tai): Boolean;
         function OptPass2MOV(var p : tai) : boolean;
         function OptPass2Imul(var p : tai) : boolean;
         function OptPass2Jmp(var p : tai) : boolean;
@@ -149,8 +150,8 @@ unit aoptx86;
         function OptPass2ADD(var p : tai): Boolean;
 
         function PostPeepholeOptMov(var p : tai) : Boolean;
-{$ifdef x86_64} { These post-peephole optimisations only affect 64-bit registers. [Kit] }
         function PostPeepholeOptMovzx(var p : tai) : Boolean;
+{$ifdef x86_64} { These post-peephole optimisations only affect 64-bit registers. [Kit] }
         function PostPeepholeOptXor(var p : tai) : Boolean;
 {$endif}
         function PostPeepholeOptAnd(var p : tai) : boolean;
@@ -160,6 +161,7 @@ unit aoptx86;
         function PostPeepholeOptCall(var p : tai) : Boolean;
         function PostPeepholeOptLea(var p : tai) : Boolean;
         function PostPeepholeOptPush(var p: tai): Boolean;
+        function PostPeepholeOptShr(var p : tai) : boolean;
 
         procedure ConvertJumpToRET(const p: tai; const ret_p: tai);
 
@@ -4935,6 +4937,529 @@ unit aoptx86;
       end;
 
 
+    function TX86AsmOptimizer.OptPass2Movx(var p : tai) : boolean;
+      const
+        LIST_STEP_SIZE = 4;
+      var
+        ThisReg: TRegister;
+        MinSize, MaxSize, TrySmaller, TargetSize: TOpSize;
+        TargetSubReg: TSubRegister;
+        hp1, hp2: tai;
+        RegInUse, p_removed: Boolean;
+
+        { Store list of found instructions so we don't have to call
+          GetNextInstructionUsingReg multiple times }
+        InstrList: array of taicpu;
+        InstrMax, Index: Integer;
+        UpperLimit, TrySmallerLimit: TCgInt;
+
+        { Data flow analysis }
+        TestValMin, TestValMax: TCgInt;
+        SmallerOverflow: Boolean;
+
+      begin
+        Result := False;
+        p_removed := False;
+
+        { This is anything but quick! }
+        if not(cs_opt_level2 in current_settings.optimizerswitches) then
+          Exit;
+
+        SetLength(InstrList, 0);
+        InstrMax := -1;
+        ThisReg := taicpu(p).oper[1]^.reg;
+        hp1 := p;
+
+        case taicpu(p).opsize of
+          S_BW, S_BL:
+            begin
+              UpperLimit := $FF;
+              MinSize := S_B;
+              if taicpu(p).opsize = S_BW then
+                MaxSize := S_W
+              else
+                MaxSize := S_L;
+            end;
+          S_WL:
+            begin
+              UpperLimit := $FFFF;
+              MinSize := S_W;
+              MaxSize := S_L;
+            end
+          else
+            InternalError(2020112301);
+        end;
+
+        TestValMin := 0;
+        TestValMax := UpperLimit;
+        TrySmallerLimit := UpperLimit;
+        TrySmaller := S_NO;
+        SmallerOverflow := False;
+
+        while GetNextInstructionUsingReg(hp1, hp1, ThisReg) and
+          (hp1.typ = ait_instruction) and
+          (
+            { Under -O1 and -O2, GetNextInstructionUsingReg may return an
+              instruction that doesn't actually contain ThisReg }
+            (cs_opt_level3 in current_settings.optimizerswitches) or
+            RegInInstruction(ThisReg, hp1)
+          ) do
+          begin
+            case taicpu(hp1).opcode of
+              A_INC,A_DEC:
+                begin
+                  { Has to be an exact match on the register }
+                  if not MatchOperand(taicpu(hp1).oper[0]^, ThisReg) then
+                    Break;
+
+                  if taicpu(hp1).opcode = A_INC then
+                    begin
+                      Inc(TestValMin);
+                      Inc(TestValMax);
+                    end
+                  else
+                    begin
+                      Dec(TestValMin);
+                      Dec(TestValMax);
+                    end;
+                end;
+
+              { OR and XOR are not included because they can too easily fool
+                the data flow analysis (they can cause non-linear behaviour) }
+              A_ADD,A_SUB,A_AND,A_SHL,A_SHR:
+                begin
+                  if
+                    (taicpu(hp1).oper[1]^.typ <> top_reg) or
+                    { Has to be an exact match on the register }
+                    (taicpu(hp1).oper[1]^.reg <> ThisReg) or not
+                    (
+                      (
+                        (taicpu(hp1).oper[0]^.typ = top_const) and
+                        (
+                          (
+                            (taicpu(hp1).opcode = A_SHL) and
+                            (
+                              ((MinSize = S_B) and (taicpu(hp1).oper[0]^.val < 8)) or
+                              ((MinSize = S_W) and (taicpu(hp1).oper[0]^.val < 16)) or
+                              ((MinSize = S_L) and (taicpu(hp1).oper[0]^.val < 32))
+                            )
+                          ) or (
+                            (taicpu(hp1).opcode <> A_SHL) and
+                            (
+                              ((taicpu(hp1).oper[0]^.val and UpperLimit) = taicpu(hp1).oper[0]^.val) or
+                              { Is it in the negative range? }
+                              (((not taicpu(hp1).oper[0]^.val) and (UpperLimit shr 1)) = (not taicpu(hp1).oper[0]^.val))
+                            )
+                          )
+                        )
+                      ) or (
+                        MatchOperand(taicpu(hp1).oper[0]^, taicpu(hp1).oper[1]^.reg) and
+                        ((taicpu(hp1).opcode = A_ADD) or (taicpu(hp1).opcode = A_AND) or (taicpu(hp1).opcode = A_SUB))
+                      )
+                    ) then
+                    Break;
+
+                  case taicpu(hp1).opcode of
+                    A_ADD:
+                      if (taicpu(hp1).oper[0]^.typ = top_reg) then
+                        begin
+                          TestValMin := TestValMin * 2;
+                          TestValMax := TestValMax * 2;
+                        end
+                      else
+                        begin
+                          TestValMin := TestValMin + taicpu(hp1).oper[0]^.val;
+                          TestValMax := TestValMax + taicpu(hp1).oper[0]^.val;
+                        end;
+                    A_SUB:
+                      if (taicpu(hp1).oper[0]^.typ = top_reg) then
+                        begin
+                          TestValMin := 0;
+                          TestValMax := 0;
+                        end
+                      else
+                        begin
+                          TestValMin := TestValMin - taicpu(hp1).oper[0]^.val;
+                          TestValMax := TestValMax - taicpu(hp1).oper[0]^.val;
+                        end;
+                    A_AND:
+                      if (taicpu(hp1).oper[0]^.typ = top_const) then
+                        begin
+                          { we might be able to go smaller if AND appears first }
+                          if InstrMax = -1 then
+                            case MinSize of
+                              S_B:
+                                ;
+                              S_W:
+                                if ((taicpu(hp1).oper[0]^.val and $FF) = taicpu(hp1).oper[0]^.val) or
+                                  ((not(taicpu(hp1).oper[0]^.val) and $7F) = (not taicpu(hp1).oper[0]^.val)) then
+                                  begin
+                                    TrySmaller := S_B;
+                                    TrySmallerLimit := $FF;
+                                  end;
+                              S_L:
+                                if ((taicpu(hp1).oper[0]^.val and $FF) = taicpu(hp1).oper[0]^.val) or
+                                  ((not(taicpu(hp1).oper[0]^.val) and $7F) = (not taicpu(hp1).oper[0]^.val)) then
+                                  begin
+                                    TrySmaller := S_B;
+                                    TrySmallerLimit := $FF;
+                                  end
+                                else if ((taicpu(hp1).oper[0]^.val and $FFFF) = taicpu(hp1).oper[0]^.val) or
+                                  ((not(taicpu(hp1).oper[0]^.val) and $7FFF) = (not taicpu(hp1).oper[0]^.val)) then
+                                  begin
+                                    TrySmaller := S_W;
+                                    TrySmallerLimit := $FFFF;
+                                  end;
+                              else
+                                InternalError(2020112320);
+                            end;
+
+                          TestValMin := TestValMin and taicpu(hp1).oper[0]^.val;
+                          TestValMax := TestValMax and taicpu(hp1).oper[0]^.val;
+                        end;
+                    A_SHL:
+                      begin
+                        TestValMin := TestValMin shl taicpu(hp1).oper[0]^.val;
+                        TestValMax := TestValMax shl taicpu(hp1).oper[0]^.val;
+                      end;
+                    A_SHR:
+                      begin
+                        { we might be able to go smaller if SHR appears first }
+                        if InstrMax = -1 then
+                          case MinSize of
+                            S_B:
+                              ;
+                            S_W:
+                              if (taicpu(hp1).oper[0]^.val >= 8) then
+                                begin
+                                  TrySmaller := S_B;
+                                  TrySmallerLimit := $FF;
+                                end;
+                            S_L:
+                              if (taicpu(hp1).oper[0]^.val >= 24) then
+                                begin
+                                  TrySmaller := S_B;
+                                  TrySmallerLimit := $FF;
+                                end
+                              else if (taicpu(hp1).oper[0]^.val >= 16) then
+                                begin
+                                  TrySmaller := S_W;
+                                  TrySmallerLimit := $FFFF;
+                                end;
+                            else
+                              InternalError(2020112321);
+                          end;
+
+                        TestValMin := TestValMin shr taicpu(hp1).oper[0]^.val;
+                        TestValMax := TestValMax shr taicpu(hp1).oper[0]^.val;
+                      end;
+                    else
+                      InternalError(2020112303);
+                  end;
+                end;
+(*
+              A_IMUL:
+                case taicpu(hp1).ops of
+                  2:
+                    begin
+                      if not MatchOpType(hp1, top_reg, top_reg) or
+                        { Has to be an exact match on the register }
+                        (taicpu(hp1).oper[0]^.reg <> ThisReg) or
+                        (taicpu(hp1).oper[1]^.reg <> ThisReg) then
+                        Break;
+
+                      TestValMin := TestValMin * TestValMin;
+                      TestValMax := TestValMax * TestValMax;
+                    end;
+                  3:
+                    begin
+                      if not MatchOpType(hp1, top_const, top_reg, top_reg) or
+                        { Has to be an exact match on the register }
+                        (taicpu(hp1).oper[1]^.reg <> ThisReg) or
+                        (taicpu(hp1).oper[2]^.reg <> ThisReg) or
+                        ((taicpu(hp1).oper[0]^.val and UpperLimit) = taicpu(hp1).oper[0]^.val) or
+                        { Is it in the negative range? }
+                        (((not taicpu(hp1).oper[0]^.val) and (UpperLimit shr 1)) = (not taicpu(hp1).oper[0]^.val)) then
+                        Break;
+
+                      TestValMin := TestValMin * taicpu(hp1).oper[0]^.val;
+                      TestValMax := TestValMax * taicpu(hp1).oper[0]^.val;
+                    end;
+                  else
+                    Break;
+                end;
+
+              A_IDIV:
+                case taicpu(hp1).ops of
+                  3:
+                    begin
+                      if not MatchOpType(hp1, top_const, top_reg, top_reg) or
+                        { Has to be an exact match on the register }
+                        (taicpu(hp1).oper[1]^.reg <> ThisReg) or
+                        (taicpu(hp1).oper[2]^.reg <> ThisReg) or
+                        ((taicpu(hp1).oper[0]^.val and UpperLimit) = taicpu(hp1).oper[0]^.val) or
+                        { Is it in the negative range? }
+                        (((not taicpu(hp1).oper[0]^.val) and (UpperLimit shr 1)) = (not taicpu(hp1).oper[0]^.val)) then
+                        Break;
+
+                      TestValMin := TestValMin div taicpu(hp1).oper[0]^.val;
+                      TestValMax := TestValMax div taicpu(hp1).oper[0]^.val;
+                    end;
+                  else
+                    Break;
+                end;
+*)
+              A_MOVZX:
+                begin
+                  if not MatchOpType(taicpu(hp1), top_reg, top_reg) then
+                    Break;
+
+                  { The objective here is to try to find a combination that
+                    removes one of the MOV/Z instructions. }
+                  case taicpu(hp1).opsize of
+                    S_WL:
+                      if (MinSize in [S_B, S_W]) then
+                        begin
+                          TargetSize := S_L;
+                          TargetSubReg := R_SUBD;
+                        end
+                      else if ((TrySmaller in [S_B, S_W]) and not SmallerOverflow) then
+                        begin
+                          TargetSize := TrySmaller;
+                          if TrySmaller = S_B then
+                            TargetSubReg := R_SUBL
+                          else
+                            TargetSubReg := R_SUBW;
+                        end
+                      else
+                        Break;
+
+                    S_BW:
+                      if (MinSize in [S_B, S_W]) then
+                        begin
+                          TargetSize := S_W;
+                          TargetSubReg := R_SUBW;
+                        end
+                      else if ((TrySmaller = S_B) and not SmallerOverflow) then
+                        begin
+                          TargetSize := S_B;
+                          TargetSubReg := R_SUBL;
+                        end
+                      else
+                        Break;
+
+                    S_BL:
+                      if (MinSize in [S_B, S_W]) then
+                        begin
+                          TargetSize := S_L;
+                          TargetSubReg := R_SUBD;
+                        end
+                      else if ((TrySmaller = S_B) and not SmallerOverflow) then
+                        begin
+                          TargetSize := S_B;
+                          TargetSubReg := R_SUBL;
+                        end
+                      else
+                        Break;
+
+                    else
+                      InternalError(2020112302);
+                  end;
+
+                  { Update the register to its new size }
+                  ThisReg := newreg(R_INTREGISTER, getsupreg(ThisReg), TargetSubReg);
+
+                  if TargetSize = MinSize then
+                    begin
+                      { Convert the input MOVZX to a MOV }
+                      if (taicpu(p).oper[0]^.typ = top_reg) and
+                        SuperRegistersEqual(taicpu(p).oper[0]^.reg, ThisReg) then
+                        begin
+                          { Or remove it completely! }
+                          DebugMsg(SPeepholeOptimization + 'Movzx2Nop 1', p);
+                          RemoveCurrentP(p);
+                          p_removed := True;
+                        end
+                      else
+                        begin
+                          DebugMsg(SPeepholeOptimization + 'Movzx2Mov 1', p);
+                          taicpu(p).opcode := A_MOV;
+                          taicpu(p).oper[1]^.reg := ThisReg;
+                          taicpu(p).opsize := TargetSize;
+                        end;
+
+                      Result := True;
+                    end
+                  else if TargetSize <> MaxSize then
+                    begin
+
+                      case MaxSize of
+                        S_L:
+                          if TargetSize = S_W then
+                            begin
+                              DebugMsg(SPeepholeOptimization + 'movzbl2movzbw', p);
+                              taicpu(p).opsize := S_BW;
+                              taicpu(p).oper[1]^.reg := ThisReg;
+                              Result := True;
+                            end
+                          else
+                            InternalError(2020112341);
+
+                        S_W:
+                          if TargetSize = S_L then
+                            begin
+                              DebugMsg(SPeepholeOptimization + 'movzbw2movzbl', p);
+                              taicpu(p).opsize := S_BL;
+                              taicpu(p).oper[1]^.reg := ThisReg;
+                              Result := True;
+                            end
+                          else
+                            InternalError(2020112342);
+                        else
+                          ;
+                      end;
+                    end;
+
+
+                  if (MaxSize = TargetSize) or
+                    ((TargetSize = S_L) and (taicpu(hp1).opsize in [S_L, S_BL, S_WL])) or
+                    ((TargetSize = S_W) and (taicpu(hp1).opsize in [S_W, S_BW])) then
+                    begin
+                      { Convert the output MOVZX to a MOV }
+                      if (taicpu(hp1).oper[0]^.typ = top_reg) and
+                        SuperRegistersEqual(taicpu(hp1).oper[1]^.reg, ThisReg) then
+                        begin
+                          { Or remove it completely! }
+                          DebugMsg(SPeepholeOptimization + 'Movzx2Nop 2', hp1);
+
+                          { Be careful; if p = hp1 and p was also removed, p
+                            will become a dangling pointer }
+                          if p = hp1 then
+                            RemoveCurrentp(p) { p = hp1 and will then become the next instruction }
+                          else
+                            RemoveInstruction(hp1);
+                        end
+                      else
+                        begin
+                          taicpu(hp1).opcode := A_MOV;
+                          taicpu(hp1).oper[0]^.reg := ThisReg;
+                          taicpu(hp1).opsize := TargetSize;
+
+                          { Check to see if the active register is used afterwards;
+                            if not, we can change it and make a saving. }
+                          RegInUse := False;
+                          TransferUsedRegs(TmpUsedRegs);
+
+                          { The target register may be marked as in use to cross
+                            a jump to a distant label, so exclude it }
+                          ExcludeRegFromUsedRegs(taicpu(hp1).oper[1]^.reg, TmpUsedRegs);
+
+                          hp2 := p;
+                          repeat
+
+                            UpdateUsedRegs(TmpUsedRegs, tai(hp2.next));
+
+                            { Explicitly check for the excluded register (don't include the first
+                              instruction as it may be reading from here }
+                            if ((p <> hp2) and (RegInInstruction(taicpu(hp1).oper[1]^.reg, hp2))) or
+                              RegInUsedRegs(taicpu(hp1).oper[1]^.reg, TmpUsedRegs) then
+                              begin
+                                RegInUse := True;
+                                Break;
+                              end;
+
+                            if not GetNextInstruction(hp2, hp2) then
+                              InternalError(2020112340);
+
+                          until (hp2 = hp1);
+
+                          if not RegInUse and not RegUsedAfterInstruction(ThisReg, hp1, TmpUsedRegs) then
+                            begin
+                              DebugMsg(SPeepholeOptimization + 'Simplified register usage so ' + debug_regname(taicpu(hp1).oper[1]^.reg) + ' = ' + debug_regname(taicpu(p).oper[1]^.reg), p);
+                              ThisReg := taicpu(hp1).oper[1]^.reg;
+
+                              TransferUsedRegs(TmpUsedRegs);
+                              AllocRegBetween(ThisReg, p, hp1, TmpUsedRegs);
+
+                              DebugMsg(SPeepholeOptimization + 'Movzx2Nop 3', hp1);
+                              if p = hp1 then
+                                RemoveCurrentp(p) { p = hp1 and will then become the next instruction }
+                              else
+                                RemoveInstruction(hp1);
+
+                              { Instruction will become "mov %reg,%reg" }
+                              if not p_removed and (taicpu(p).opcode = A_MOV) and
+                                MatchOperand(taicpu(p).oper[0]^, ThisReg) then
+                                begin
+                                  DebugMsg(SPeepholeOptimization + 'Movzx2Nop 6', p);
+                                  RemoveCurrentP(p);
+                                  p_removed := True;
+                                end
+                              else
+                                taicpu(p).oper[1]^.reg := ThisReg;
+
+                              Result := True;
+                            end
+                          else
+                            DebugMsg(SPeepholeOptimization + 'Movzx2Mov 2', hp1);
+
+                        end;
+                    end
+                  else
+                    InternalError(2020112330);
+
+                  { Now go through every instruction we found and change the
+                    size. If TargetSize = MaxSize, then almost no changes are
+                    needed and Result can remain False if it hasn't been set
+                    yet. }
+
+                  if (TargetSize <> MaxSize) and (InstrMax >= 0) then
+                    begin
+                      for Index := 0 to InstrMax do
+                        begin
+
+                          { If p_removed is true, then the original MOV/Z was removed
+                            and removing the AND instruction may not be safe if it
+                            appears first }
+                          if (InstrList[Index].oper[InstrList[Index].ops - 1]^.typ <> top_reg) then
+                            InternalError(2020112310);
+
+                          if InstrList[Index].oper[0]^.typ = top_reg then
+                            InstrList[Index].oper[0]^.reg := ThisReg;
+
+                          InstrList[Index].oper[InstrList[Index].ops - 1]^.reg := ThisReg;
+                          InstrList[Index].opsize := TargetSize;
+                        end;
+
+                      Result := True;
+                    end;
+
+                  Exit;
+                end;
+
+              else
+                { This includes ADC, SBB, IDIV and SAR }
+                Break;
+            end;
+
+            if (TestValMin < 0) or (TestValMax < 0) or
+              (TestValMin > UpperLimit) or (TestValMax > UpperLimit) then
+              { Overflow }
+              Break
+            else if not SmallerOverflow and (TrySmaller <> S_NO) and
+              ((TestValMin > TrySmallerLimit) or (TestValMax > TrySmallerLimit)) then
+              SmallerOverflow := True;
+
+            { Contains highest index (so instruction count - 1) }
+            Inc(InstrMax);
+            if InstrMax > High(InstrList) then
+              SetLength(InstrList, InstrMax + LIST_STEP_SIZE);
+
+            InstrList[InstrMax] := taicpu(hp1);
+          end;
+      end;
+
+
     function TX86AsmOptimizer.OptPass2Imul(var p : tai) : boolean;
       var
         hp1 : tai;
@@ -6691,6 +7216,41 @@ unit aoptx86;
       end;
 
 
+    function TX86AsmOptimizer.PostPeepholeOptShr(var p : tai) : boolean;
+      var
+        hp1: tai;
+      begin
+        { Detect:
+            shr    x,  %ax (x > 0)
+            ...
+            movzwl %ax,%eax
+
+          Change movzwl %ax,%eax to cwtl (shorter encoding for movswl %ax,%eax)
+        }
+
+        Result := False;
+        if MatchOpType(taicpu(p), top_const, top_reg) and
+          (taicpu(p).oper[1]^.reg = NR_AX) and { This is also enough to determine that opsize = S_W }
+          (taicpu(p).oper[0]^.val > 0) and
+          GetNextInstructionUsingReg(p, hp1, NR_EAX) and
+          MatchInstruction(hp1, A_MOVZX, [S_WL]) and
+          MatchOperand(taicpu(hp1).oper[0]^, NR_AX) and
+          MatchOperand(taicpu(hp1).oper[1]^, NR_EAX) then
+          begin
+            DebugMsg(SPeepholeOptimization + 'Converted movzwl %ax,%eax to cwtl (via ShrMovz2ShrCwtl)', hp1);
+            taicpu(hp1).opcode := A_CWDE;
+            taicpu(hp1).clearop(0);
+            taicpu(hp1).clearop(1);
+            taicpu(hp1).ops := 0;
+
+            { A change was made, but not with p, so move forward 1 }
+            p := tai(p.Next);
+            Result := True;
+          end;
+
+      end;
+
+
     function TX86AsmOptimizer.PostPeepholeOptCmp(var p : tai) : Boolean;
       begin
         Result:=false;
@@ -6864,12 +7424,150 @@ unit aoptx86;
       end;
 
 
-{$ifdef x86_64}
     function TX86AsmOptimizer.PostPeepholeOptMovzx(var p : tai) : Boolean;
+
+      function ConstInRange(const Val: TCGInt; const OpSize: TOpSize): Boolean;
+        begin
+          case OpSize of
+            S_B, S_BW, S_BL{$ifdef x86_64}, S_BQ{$endif x86_64}:
+              Result := (Val <= $FF) and (Val >= -128);
+            S_W, S_WL{$ifdef x86_64}, S_WQ{$endif x86_64}:
+              Result := (Val <= $FFFF) and (Val >= -32768);
+            S_L{$ifdef x86_64}, S_LQ{$endif x86_64}:
+              Result := (Val <= $FFFFFFFF) and (Val >= -2147483648);
+            else
+              Result := True;
+          end;
+        end;
+
       var
+        hp1, hp2 : tai;
+        SizeChange: Boolean;
         PreMessage: string;
       begin
         Result := False;
+
+        if (taicpu(p).oper[0]^.typ = top_reg) and
+          SuperRegistersEqual(taicpu(p).oper[0]^.reg, taicpu(p).oper[1]^.reg) and
+          GetNextInstruction(p, hp1) and (hp1.typ = ait_instruction) then
+          begin
+            { Change (using movzbl %al,%eax as an example):
+
+                movzbl %al, %eax    movzbl %al, %eax
+                cmpl   x,   %eax    testl  %eax,%eax
+
+              To:
+                cmpb   x,   %al     testb  %al, %al  (Move one back to avoid a false dependency)
+                movzbl %al, %eax    movzbl %al, %eax
+
+              Smaller instruction and minimises pipeline stall as the CPU
+              doesn't have to wait for the register to get zero-extended. [Kit]
+
+              Also allow if the smaller of the two registers is being checked,
+              as this still removes the false dependency.
+            }
+            if
+              (
+                (
+                  (taicpu(hp1).opcode = A_CMP) and MatchOpType(taicpu(hp1), top_const, top_reg) and
+                  ConstInRange(taicpu(hp1).oper[0]^.val, taicpu(p).opsize)
+                ) or (
+                  { If MatchOperand returns True, they must both be registers }
+                  (taicpu(hp1).opcode = A_TEST) and MatchOperand(taicpu(hp1).oper[0]^, taicpu(hp1).oper[1]^)
+                )
+              ) and
+              (reg2opsize(taicpu(hp1).oper[1]^.reg) <= reg2opsize(taicpu(p).oper[1]^.reg)) then
+              begin
+                PreMessage := debug_op2str(taicpu(hp1).opcode) + debug_opsize2str(taicpu(hp1).opsize) + ' ' + debug_operstr(taicpu(hp1).oper[0]^) + ',' + debug_regname(taicpu(hp1).oper[1]^.reg) + ' -> ' + debug_op2str(taicpu(hp1).opcode);
+
+                asml.Remove(hp1);
+                asml.InsertBefore(hp1, p);
+
+                { Swap instructions in the case of cmp 0,%reg or test %reg,%reg }
+                if (taicpu(hp1).opcode = A_TEST) or (taicpu(hp1).oper[0]^.val = 0) then
+                  begin
+                    taicpu(hp1).opcode := A_TEST;
+                    taicpu(hp1).loadreg(0, taicpu(p).oper[0]^.reg);
+                  end;
+
+                taicpu(hp1).oper[1]^.reg := taicpu(p).oper[0]^.reg;
+
+                case taicpu(p).opsize of
+                  S_BW, S_BL:
+                    begin
+                      SizeChange := taicpu(hp1).opsize <> S_B;
+                      taicpu(hp1).changeopsize(S_B);
+                    end;
+                  S_WL:
+                    begin
+                      SizeChange := taicpu(hp1).opsize <> S_W;
+                      taicpu(hp1).changeopsize(S_W);
+                    end
+                  else
+                    InternalError(2020112701);
+                end;
+
+                UpdateUsedRegs(tai(p.Next));
+
+                { Check if the register is used aferwards - if not, we can
+                  remove the movzx instruction completely }
+                if not RegUsedAfterInstruction(taicpu(hp1).oper[1]^.reg, p, UsedRegs) then
+                  begin
+                    { Hp1 is a better position than p for debugging purposes }
+                    DebugMsg(SPeepholeOptimization + 'Movzx2Nop 4a', hp1);
+                    RemoveCurrentp(p, hp1);
+                    Result := True;
+                  end;
+
+                if SizeChange then
+                  DebugMsg(SPeepholeOptimization + PreMessage +
+                    debug_opsize2str(taicpu(hp1).opsize) + ' ' + debug_operstr(taicpu(hp1).oper[0]^) + ',' + debug_regname(taicpu(hp1).oper[1]^.reg) + ' (smaller and minimises pipeline stall - MovzxCmp2CmpMovzx)', hp1)
+                else
+                  DebugMsg(SPeepholeOptimization + 'MovzxCmp2CmpMovzx', hp1);
+
+                Exit;
+              end;
+
+            { Change (using movzwl %ax,%eax as an example):
+
+                movzwl %ax, %eax
+                movb   %al, (dest)  (Register is smaller than read register in movz)
+
+              To:
+                movb   %al, (dest)  (Move one back to avoid a false dependency)
+                movzwl %ax, %eax
+            }
+            if (taicpu(hp1).opcode = A_MOV) and
+              (taicpu(hp1).oper[0]^.typ = top_reg) and
+              not RegInOp(taicpu(hp1).oper[0]^.reg, taicpu(hp1).oper[1]^) and
+              SuperRegistersEqual(taicpu(hp1).oper[0]^.reg, taicpu(p).oper[0]^.reg) and
+              (reg2opsize(taicpu(hp1).oper[0]^.reg) <= reg2opsize(taicpu(p).oper[0]^.reg)) then
+              begin
+                DebugMsg(SPeepholeOptimization + 'MovzxMov2MovMovzx', hp1);
+
+                hp2 := tai(hp1.Previous); { Effectively the old position of hp1 }
+                asml.Remove(hp1);
+                asml.InsertBefore(hp1, p);
+                if taicpu(hp1).oper[1]^.typ = top_reg then
+                  AllocRegBetween(taicpu(hp1).oper[1]^.reg, hp1, hp2, UsedRegs);
+
+                { Check if the register is used aferwards - if not, we can
+                  remove the movzx instruction completely }
+
+                if not RegUsedAfterInstruction(taicpu(hp1).oper[0]^.reg, p, UsedRegs) then
+                  begin
+                    { Hp1 is a better position than p for debugging purposes }
+                    DebugMsg(SPeepholeOptimization + 'Movzx2Nop 4b', hp1);
+                    RemoveCurrentp(p, hp1);
+                    Result := True;
+                  end;
+
+                Exit;
+              end;
+
+          end;
+
+{$ifdef x86_64}
         { Code size reduction by J. Gareth "Kit" Moreton }
         { Convert MOVZBQ and MOVZWQ to MOVZBL and MOVZWL respectively if it removes the REX prefix }
         if (taicpu(p).opsize in [S_BQ, S_WQ]) and
@@ -6889,9 +7587,11 @@ unit aoptx86;
             DebugMsg(SPeepholeOptimization + PreMessage +
               debug_opsize2str(taicpu(p).opsize) + ' ' + debug_operstr(taicpu(p).oper[0]^) + ',' + debug_regname(taicpu(p).oper[1]^.reg) + ' (removes REX prefix)', p);
           end;
+{$endif}
       end;
 
 
+{$ifdef x86_64}
     function TX86AsmOptimizer.PostPeepholeOptXor(var p : tai) : Boolean;
       var
         PreMessage, RegName: string;

+ 4 - 0
compiler/x86_64/aoptcpu.pas

@@ -163,6 +163,8 @@ uses
               case taicpu(p).opcode of
                 A_MOV:
                   Result:=OptPass2MOV(p);
+                A_MOVZX:
+                  Result:=OptPass2Movx(p);
                 A_IMUL:
                   Result:=OptPass2Imul(p);
                 A_JMP:
@@ -213,6 +215,8 @@ uses
                   Result:=PostPeepholeOptLea(p);
                 A_PUSH:
                   Result:=PostPeepholeOptPush(p);
+                A_SHR:
+                  Result:=PostPeepholeOptShr(p);
                 else
                   ;
               end;