Browse Source

* x86: Refactored CMOV optimisations and permitted the use of constants in some cases.

J. Gareth "Curious Kit" Moreton 3 years ago
parent
commit
bab60d819f
2 changed files with 701 additions and 159 deletions
  1. 3 2
      compiler/aoptobj.pas
  2. 698 157
      compiler/x86/aoptx86.pas

+ 3 - 2
compiler/aoptobj.pas

@@ -87,7 +87,7 @@ Unit AoptObj;
         { is Reg currently in use }
         Function IsUsed(Reg: TRegister): Boolean;
         { get all the currently used registers }
-        Function GetUsedRegs: TRegSet;
+        Function GetUsedRegs: TRegSet; {$ifdef USEINLINE}inline;{$endif USEINLINE}
 
         { outputs  the current set }
         Procedure Dump(var t : text);
@@ -549,7 +549,7 @@ Unit AoptObj;
       End;
 
 
-    Function TUsedRegs.GetUsedRegs: TRegSet; inline;
+    Function TUsedRegs.GetUsedRegs: TRegSet; {$ifdef USEINLINE}inline;{$endif USEINLINE}
       Begin
         GetUsedRegs := UsedRegs;
       End;
@@ -1395,6 +1395,7 @@ Unit AoptObj;
     procedure TAOptObj.AllocRegBetween(reg: tregister; p1, p2: tai; var initialusedregs: TAllUsedRegs);
       var
         hp, start: tai;
+        Po: PInteger;
         removedsomething,
         firstRemovedWasAlloc,
         lastRemovedWasDealloc: boolean;

+ 698 - 157
compiler/x86/aoptx86.pas

@@ -98,11 +98,11 @@ unit aoptx86;
         { Attempts to allocate a volatile integer register for use between p and hp,
           using AUsedRegs for the current register usage information.  Returns NR_NO
           if no free register could be found }
-        function GetIntRegisterBetween(RegSize: TSubRegister; var AUsedRegs: TAllUsedRegs; p, hp: tai): TRegister;
+        function GetIntRegisterBetween(RegSize: TSubRegister; var AUsedRegs: TAllUsedRegs; p, hp: tai; DontAlloc: Boolean = False): TRegister;
         { Attempts to allocate a volatile MM register for use between p and hp,
           using AUsedRegs for the current register usage information.  Returns NR_NO
           if no free register could be found }
-        function GetMMRegisterBetween(RegSize: TSubRegister; var AUsedRegs: TAllUsedRegs; p, hp: tai): TRegister;
+        function GetMMRegisterBetween(RegSize: TSubRegister; var AUsedRegs: TAllUsedRegs; p, hp: tai; DontAlloc: Boolean = False): TRegister;
 
         { checks whether loading a new value in reg1 overwrites the entirety of reg2 }
         class function Reg1WriteOverwritesReg2Entirely(reg1, reg2: tregister): boolean; static;
@@ -130,6 +130,8 @@ unit aoptx86;
         { Returns true if the given MOV instruction can be safely converted to CMOV }
         class function CanBeCMOV(p : tai) : boolean; static;
 
+        { Like UpdateUsedRegs, but ignores deallocations }
+        class procedure UpdateIntRegsNoDealloc(var AUsedRegs: TAllUsedRegs; p: Tai); static;
 
         { Returns true if the given logic instruction can be converted into a BTx instruction (BT not included) }
         class function IsBTXAcceptable(p : tai) : boolean; static;
@@ -268,6 +270,10 @@ unit aoptx86;
       SPeepholeOptimization = '';
 {$endif DEBUG_AOPTCPU}
       LIST_STEP_SIZE = 4;
+{$ifndef 8086}
+      MAX_CMOV_INSTRUCTIONS = 4;
+      MAX_CMOV_REGISTERS = 8;
+{$endif 8086}
 
     type
       TJumpTrackingItem = class(TLinkedListItem)
@@ -1322,7 +1328,7 @@ unit aoptx86;
     { Attempts to allocate a volatile integer register for use between p and hp,
       using AUsedRegs for the current register usage information.  Returns NR_NO
       if no free register could be found }
-    function TX86AsmOptimizer.GetIntRegisterBetween(RegSize: TSubRegister; var AUsedRegs: TAllUsedRegs; p, hp: tai): TRegister;
+    function TX86AsmOptimizer.GetIntRegisterBetween(RegSize: TSubRegister; var AUsedRegs: TAllUsedRegs; p, hp: tai; DontAlloc: Boolean = False): TRegister;
       var
         RegSet: TCPURegisterSet;
         CurrentSuperReg: Integer;
@@ -1385,7 +1391,8 @@ unit aoptx86;
 
                 { We have a free register available }
                 Result := CurrentReg;
-                AllocRegBetween(CurrentReg, p, hp, AUsedRegs);
+                if not DontAlloc then
+                  AllocRegBetween(CurrentReg, p, hp, AUsedRegs);
                 Exit;
               end;
           end;
@@ -1395,7 +1402,7 @@ unit aoptx86;
     { Attempts to allocate a volatile MM register for use between p and hp,
       using AUsedRegs for the current register usage information.  Returns NR_NO
       if no free register could be found }
-    function TX86AsmOptimizer.GetMMRegisterBetween(RegSize: TSubRegister; var AUsedRegs: TAllUsedRegs; p, hp: tai): TRegister;
+    function TX86AsmOptimizer.GetMMRegisterBetween(RegSize: TSubRegister; var AUsedRegs: TAllUsedRegs; p, hp: tai; DontAlloc: Boolean = False): TRegister;
       var
         RegSet: TCPURegisterSet;
         CurrentSuperReg: Integer;
@@ -1450,7 +1457,8 @@ unit aoptx86;
 
                 { We have a free register available }
                 Result := CurrentReg;
-                AllocRegBetween(CurrentReg, p, hp, AUsedRegs);
+                if not DontAlloc then
+                  AllocRegBetween(CurrentReg, p, hp, AUsedRegs);
                 Exit;
               end;
           end;
@@ -2541,19 +2549,19 @@ unit aoptx86;
 
     class function TX86AsmOptimizer.IsRefSafe(const ref: PReference): Boolean;
       begin
-        with ref^ do
-          Result :=
-            (index = NR_NO) and
-            (
+        Result :=
+          (ref^.index = NR_NO) and
+          (
 {$ifdef x86_64}
-              (
-                (base = NR_RIP) and
-                (refaddr in [addr_pic, addr_pic_no_got])
-              ) or
+            (
+              (ref^.base = NR_RIP) and
+              (ref^.refaddr in [addr_pic, addr_pic_no_got])
+            ) or
 {$endif x86_64}
-              (base = NR_STACK_POINTER_REG) or
-              (base = current_procinfo.framepointer)
-            );
+            (ref^.refaddr = addr_full) or
+            (ref^.base = NR_STACK_POINTER_REG) or
+            (ref^.base = current_procinfo.framepointer)
+          );
       end;
 
 
@@ -11239,12 +11247,6 @@ unit aoptx86;
       begin
          CanBeCMOV:=assigned(p) and
            MatchInstruction(p,A_MOV,[S_W,S_L,S_Q]) and
-           { we can't use cmov ref,reg because
-             ref could be nil and cmov still throws an exception
-             if ref=nil but the mov isn't done (FK)
-            or ((taicpu(p).oper[0]^.typ = top_ref) and
-             (taicpu(p).oper[0]^.ref^.refaddr = addr_no))
-           }
            (taicpu(p).oper[1]^.typ = top_reg) and
            (
              (taicpu(p).oper[0]^.typ = top_reg) or
@@ -11252,23 +11254,131 @@ unit aoptx86;
                it is not expected that this can cause a seg. violation }
              (
                (taicpu(p).oper[0]^.typ = top_ref) and
+               { TODO: Can we detect which references become constants at this
+                 stage so we don't have to do a blanket ban? }
+               (taicpu(p).oper[0]^.ref^.refaddr <> addr_full) and
                IsRefSafe(taicpu(p).oper[0]^.ref)
              )
            );
       end;
 
 
+    class procedure TX86AsmOptimizer.UpdateIntRegsNoDealloc(var AUsedRegs: TAllUsedRegs; p: Tai);
+      begin
+        { Update integer registers, ignoring deallocations }
+        repeat
+          while assigned(p) and
+                ((p.typ in (SkipInstr - [ait_RegAlloc])) or
+                 (p.typ = ait_label) or
+                 ((p.typ = ait_marker) and
+                  (tai_Marker(p).Kind in [mark_AsmBlockEnd,mark_NoLineInfoStart,mark_NoLineInfoEnd]))) do
+               p := tai(p.next);
+          while assigned(p) and
+                (p.typ=ait_RegAlloc) Do
+            begin
+              if (getregtype(tai_regalloc(p).reg) = R_INTREGISTER) then
+                begin
+                  case tai_regalloc(p).ratype of
+                    ra_alloc :
+                      IncludeRegInUsedRegs(tai_regalloc(p).reg, AUsedRegs);
+                    else
+                      ;
+                  end;
+                end;
+              p := tai(p.next);
+            end;
+        until not(assigned(p)) or
+              (not(p.typ in SkipInstr) and
+               not((p.typ = ait_label) and
+                   labelCanBeSkipped(tai_label(p))));
+      end;
+
+
     function TX86AsmOptimizer.OptPass2Jcc(var p : tai) : boolean;
       var
         hp1,hp2: tai;
-{$ifndef i8086}
-        hp3,hp4,hpmov2, hp5: tai;
-        l : Longint;
-        condition : TAsmCond;
-{$endif i8086}
         carryadd_opcode : TAsmOp;
         symbol: TAsmSymbol;
         increg, tmpreg: TRegister;
+{$ifndef i8086}
+        { Code and variables specific to CMOV optimisations }
+        hp3,hp4,hp5,
+        hp_stop, hp_lblxxx, hp_lblyyy, hpmov1,hpmov2, hp_prev, hp_flagalloc, hp_prev2, hp_new, hp_jump: tai;
+        l, c, w, x : Longint;
+        condition, second_condition : TAsmCond;
+        FoundMatchingJump, RegMatch: Boolean;
+
+        RegWrites: array[0..MAX_CMOV_INSTRUCTIONS*2 - 1] of TRegister;
+
+        ConstRegs: array[0..MAX_CMOV_REGISTERS - 1] of TRegister;
+        ConstVals: array[0..MAX_CMOV_REGISTERS - 1] of TCGInt;
+
+        { Tries to convert a mov const,%reg instruction into a CMOV by reserving a
+          new register to store the constant }
+        function TryCMOVConst(p, search_start_p, stop_search_p: tai; var StoredCount: LongInt; var CMOVCount: LongInt): Boolean;
+          var
+            RegSize: TSubRegister;
+            CurrentVal: TCGInt;
+            NewReg: TRegister;
+            X: ShortInt;
+          begin
+            Result := False;
+
+            if not MatchOpType(taicpu(p), top_const, top_reg) then
+              Exit;
+
+            if StoredCount >= MAX_CMOV_REGISTERS then
+              { Arrays are full }
+              Exit;
+
+            { Remember that CMOV can't encode 8-bit registers }
+            case taicpu(p).opsize of
+              S_W:
+                RegSize := R_SUBW;
+              S_L:
+                RegSize := R_SUBD;
+              S_Q:
+                RegSize := R_SUBQ;
+              else
+                InternalError(2021100401);
+            end;
+
+            { See if the value has already been reserved for another CMOV instruction }
+            CurrentVal := taicpu(p).oper[0]^.val;
+            for X := 0 to StoredCount - 1 do
+              if ConstVals[X] = CurrentVal then
+                begin
+                  ConstRegs[StoredCount] := ConstRegs[X];
+                  ConstVals[StoredCount] := CurrentVal;
+                  Result := True;
+
+                  Inc(StoredCount);
+                  { Don't increase CMOVCount this time, since we're re-using a register }
+                  Exit;
+                end;
+
+            NewReg := GetIntRegisterBetween(RegSize, TmpUsedRegs, search_start_p, stop_search_p, True);
+            if NewReg = NR_NO then
+              { No free registers }
+              Exit;
+
+            { Reserve the register so subsequent TryCMOVConst calls don't all end
+              up vying for the same register }
+            IncludeRegInUsedRegs(NewReg, TmpUsedRegs);
+
+            ConstRegs[StoredCount] := NewReg;
+            ConstVals[StoredCount] := CurrentVal;
+
+            Inc(StoredCount);
+            { Increment the CMOV count variable from OptPass2JCC, since the extra
+              MOV required adds complexity and will cause diminishing returns
+              sooner than normal.  This is more of an approximate weighting than
+              anything else. }
+            Inc(CMOVCount);
+            Result := True;
+          end;
+{$endif i8086}
+
       begin
         result:=false;
         if GetNextInstruction(p,hp1) then
@@ -11542,7 +11652,8 @@ unit aoptx86;
                   Result:=true;
                   exit;
                 end
-              else if CPUX86_HAS_CMOV in cpu_capabilities[current_settings.cputype] then
+              else if (CPUX86_HAS_CMOV in cpu_capabilities[current_settings.cputype]) and
+                MatchInstruction(hp1,A_MOV,[S_W,S_L{$ifdef x86_64},S_Q{$endif x86_64}]) then
                 begin
                  { check for
                         jCC   xxx
@@ -11556,47 +11667,272 @@ unit aoptx86;
 
                    Change to:
                         <several cmovs with inverted condition>
-                        jmp   xxx
+                        jmp   xxx  (only for the 2nd case)
                  }
-                 l:=0;
-                 while assigned(hp1) and
-                   CanBeCMOV(hp1) and
+                 hp2 := p;
+                 hp_lblxxx := hp1;
+                 hp_flagalloc := nil;
+                 hp_stop := nil;
+                 FoundMatchingJump := False;
+
+                 { Remember the first instruction in the first block of MOVs }
+                 hpmov1 := hp1;
+
+                 TransferUsedRegs(TmpUsedRegs);
+                 while assigned(hp_lblxxx) and
                    { stop on labels }
-                   not(hp1.typ=ait_label) do
+                   (hp_lblxxx.typ <> ait_label) do
                    begin
-                      inc(l);
-                      hp5 := hp1;
-                      GetNextInstruction(hp1,hp1);
+                     { Keep track of all integer registers that are used }
+                     UpdateIntRegsNoDealloc(TmpUsedRegs, tai(hp2.Next));
+                     if hp_lblxxx.typ = ait_instruction then
+                       begin
+                         if (taicpu(hp_lblxxx).opcode = A_JMP) and
+                           IsJumpToLabel(taicpu(hp_lblxxx)) then
+                           begin
+                             hp_stop := hp_lblxxx;
+                             if (TAsmLabel(taicpu(hp_lblxxx).oper[0]^.ref^.symbol) = symbol) then
+                               begin
+                                 { We found Jcc xxx; <several movs>; Jmp xxx }
+                                 FoundMatchingJump := True;
+                                 Break;
+                               end;
+                             { If it's not the jump we're looking for, it's
+                               possibly the "if..else" variant }
+                           end
+                           { Check to see if we have a valid MOV instruction instead }
+                         else if (taicpu(hp_lblxxx).opcode <> A_MOV) or
+                           not (taicpu(hp_lblxxx).opsize in [S_W, S_L{$ifdef x86_64}, S_Q{$endif x86_64}]) then
+                           Break
+                         else
+                           { This will be a valid MOV }
+                           hp_stop := hp_lblxxx;
+                       end;
+
+                     hp2 := hp_lblxxx;
+                     GetNextInstruction(hp_lblxxx, hp_lblxxx);
                    end;
-                 if assigned(hp1) then
+
+                 { Just make sure the last MOV is included if there's no jump }
+                 if (hp_lblxxx.typ = ait_label) and MatchInstruction(hp_stop, A_MOV, []) then
+                   hp_stop := hp_lblxxx;
+
+                 { Note, the logic behind using hp_stop over hp_lblxxx in the
+                   range for TryCMOVConst is so GetIntRegisterBetween doesn't
+                   fail when it reaches a JMP instruction in the "jcc xxx; movs;
+                   jmp yyy; xxx:; movs; yyy:" variation }
+
+                 if assigned(hp_lblxxx) and
+                   (
+                     { If we found JMP xxx, we don't actually need a label
+                       (hp_lblxxx is the JMP instruction instead) }
+                     FoundMatchingJump or
+                     { Make sure we actually have the right label }
+                     FindLabel(TAsmLabel(symbol), hp_lblxxx)
+                   ) then
                    begin
-                      TransferUsedRegs(TmpUsedRegs);
-                      if (
-                          MatchInstruction(hp1, A_JMP, []) and
-                          (JumpTargetOp(taicpu(hp1))^.typ=top_ref) and
-                          (JumpTargetOp(taicpu(hp1))^.ref^.symbol=symbol)
-                        ) or
-                        FindLabel(tasmlabel(symbol),hp1) then
+                     { Use TmpUsedRegs to track registers that we reserve }
+
+                     { When allocating temporary registers, try to look one
+                       instruction back, as defining them before a CMP or TEST
+                       instruction will be faster, and also avoid picking a
+                       register that was only just deallocated }
+                     if GetLastInstruction(p, hp_prev) and
+                       MatchInstruction(hp_prev, [A_CMP, A_TEST, A_BSR, A_BSF, A_COMISS, A_COMISD, A_UCOMISS, A_UCOMISD, A_VCOMISS, A_VCOMISD, A_VUCOMISS, A_VUCOMISD], []) then
+                       begin
+                         { Mark all the registers in the comparison as 'in use', even if they've just been deallocated }
+                         for l := 0 to 1 do
+                           with taicpu(hp_prev).oper[l]^ do
+                             case typ of
+                               top_reg:
+                                 if getregtype(reg) = R_INTREGISTER then
+                                   IncludeRegInUsedRegs(reg, TmpUsedRegs);
+                               top_ref:
+                                 begin
+                                   if
+{$ifdef x86_64}
+                                     (ref^.base <> NR_RIP) and
+{$endif x86_64}
+                                     (ref^.base <> NR_NO) then
+                                     IncludeRegInUsedRegs(ref^.base, TmpUsedRegs);
+
+                                   if (ref^.index <> NR_NO) then
+                                     IncludeRegInUsedRegs(ref^.index, TmpUsedRegs);
+                                 end
+                               else
+                                 ;
+                             end;
+
+                         { When inserting instructions before hp_prev, try to insert
+                           them before the allocation of the FLAGS register }
+                         if not SetAndTest(FindRegAllocBackward(NR_DEFAULTFLAGS, tai(hp_prev.Previous)), hp_flagalloc) then
+                           { If not found, set it equal to hp_prev so it's something sensible }
+                           hp_flagalloc := hp_prev;
+
+                         hp_prev2 := nil;
+                         { When dealing with a comparison against zero, take
+                           note of the instruction before it to see if we can
+                           move instructions further back in order to benefit
+                           PostPeepholeOptTestOr.
+                         }
+                         if (
+                             (
+                               (taicpu(hp_prev).opcode = A_CMP) and
+                               MatchOperand(taicpu(hp_prev).oper[0]^, 0)
+                             ) or
+                             (
+                               (taicpu(hp_prev).opcode = A_TEST) and
+                               (
+                                 OpsEqual(taicpu(hp_prev).oper[0]^, taicpu(hp_prev).oper[1]^) or
+                                 MatchOperand(taicpu(hp_prev).oper[0]^, -1)
+                               )
+                             )
+                           ) and
+                           GetLastInstruction(hp_prev, hp_prev2) then
+                             begin
+                               if (hp_prev2.typ = ait_instruction) and
+                                 { These instructions set the zero flag if the result is zero }
+                                 MatchInstruction(hp_prev2, [A_ADD, A_SUB, A_OR, A_XOR, A_AND, A_POPCNT, A_LZCNT], []) then
+                                 begin
+                                  { Also mark all the registers in this previous instruction
+                                    as 'in use', even if they've just been deallocated }
+                                  for l := 0 to 1 do
+                                    with taicpu(hp_prev2).oper[l]^ do
+                                      case typ of
+                                        top_reg:
+                                          if getregtype(reg) = R_INTREGISTER then
+                                            IncludeRegInUsedRegs(reg, TmpUsedRegs);
+                                        top_ref:
+                                          begin
+                                            if
+{$ifdef x86_64}
+                                              (ref^.base <> NR_RIP) and
+{$endif x86_64}
+                                              (ref^.base <> NR_NO) then
+                                              IncludeRegInUsedRegs(ref^.base, TmpUsedRegs);
+
+                                            if (ref^.index <> NR_NO) then
+                                              IncludeRegInUsedRegs(ref^.index, TmpUsedRegs);
+                                          end
+                                        else
+                                          ;
+                                      end;
+                                 end
+                               else
+                                 { Unsuitable instruction }
+                                 hp_prev2 := nil;
+                           end;
+                       end
+                     else
+                       begin
+                         hp_prev := p;
+                         { When inserting instructions before hp_prev, try to insert
+                           them before the allocation of the FLAGS register }
+                         if not SetAndTest(FindRegAllocBackward(NR_DEFAULTFLAGS, tai(p.Previous)), hp_flagalloc) then
+                           { If not found, set it equal to p so it's something sensible }
+                           hp_flagalloc := p;
+                         hp_prev2 := nil;
+                       end;
+
+                     l := 0;
+                     c := 0;
+
+                     { Initialise RegWrites, ConstRegs and ConstVals }
+                     FillChar(RegWrites[0], MAX_CMOV_INSTRUCTIONS * 2 * SizeOf(TRegister), 0);
+                     FillChar(ConstRegs[0], MAX_CMOV_REGISTERS * SizeOf(TRegister), 0);
+                     FillChar(ConstVals[0], MAX_CMOV_REGISTERS * SizeOf(TCGInt), 0);
+
+                     while assigned(hp1) and
+                       { Stop on the label we found }
+                       (hp1 <> hp_lblxxx) do
+                       begin
+                         case hp1.typ of
+                           ait_instruction:
+                             if MatchInstruction(hp1, A_MOV, [S_W, S_L{$ifdef x86_64}, S_Q{$endif x86_64}]) then
+                               begin
+                                 if CanBeCMOV(hp1) then
+                                   Inc(l)
+                                 else if not (cs_opt_size in current_settings.optimizerswitches) and
+                                   { CMOV with constants grows the code size }
+                                   TryCMOVConst(hp1, hp_prev, hp_stop, c, l) then
+                                   begin
+                                     { Register was reserved by TryCMOVConst and
+                                       stored on ConstRegs[c] }
+                                   end
+                                 else
+                                   Break;
+                               end
+                             else
+                               Break;
+                           else
+                             ;
+                         end;
+                         GetNextInstruction(hp1,hp1);
+                       end;
+
+                      if (hp1 = hp_lblxxx) then
                         begin
-                          if (l<=4) and (l>0) then
+                          if (l <= MAX_CMOV_INSTRUCTIONS) and (l > 0) then
                             begin
-                              AllocRegBetween(NR_DEFAULTFLAGS, p, hp5, TmpUsedRegs);
+                              { Repurpose TmpUsedRegs to mark registers that we've defined }
+                              TmpUsedRegs[R_INTREGISTER].Clear;
 
-                              condition:=inverse_cond(taicpu(p).condition);
+                              x := 0;
+                              AllocRegBetween(NR_DEFAULTFLAGS, p, hp_lblxxx, UsedRegs);
+                              condition := inverse_cond(taicpu(p).condition);
                               UpdateUsedRegs(tai(p.next));
-                              GetNextInstruction(p,hp1);
+
+                              hp1 := hpmov1;
                               repeat
                                 if not Assigned(hp1) then
                                   InternalError(2018062900);
 
-                                taicpu(hp1).opcode:=A_CMOVcc;
-                                taicpu(hp1).condition:=condition;
+                                if (hp1.typ = ait_instruction) then
+                                  begin
+                                    { Extra safeguard }
+                                    if (taicpu(hp1).opcode <> A_MOV) then
+                                      InternalError(2018062901);
+
+                                    if taicpu(hp1).oper[0]^.typ = top_const then
+                                      begin
+                                        if x >= MAX_CMOV_REGISTERS then
+                                          InternalError(2021100410);
+
+                                        { If it's in TmpUsedRegs, then this register
+                                          is being used more than once and hence has
+                                          already had its value defined (it gets
+                                          added to UsedRegs through AllocRegBetween
+                                          below) }
+                                        if not TmpUsedRegs[R_INTREGISTER].IsUsed(ConstRegs[x]) then
+                                          begin
+                                            hp_new := taicpu.op_const_reg(A_MOV, taicpu(hp1).opsize, taicpu(hp1).oper[0]^.val, ConstRegs[x]);
+                                            taicpu(hp_new).fileinfo := taicpu(hp_prev).fileinfo;
+
+                                            asml.InsertBefore(hp_new, hp_flagalloc);
+                                            if Assigned(hp_prev2) then
+                                              TrySwapMovOp(hp_prev2, hp_new);
+
+                                            IncludeRegInUsedRegs(ConstRegs[x], TmpUsedRegs);
+                                          end
+                                        else
+                                        { We just need an instruction between hp_prev and hp1
+                                          where we know the register is marked as in use }
+                                          hp_new := hpmov1;
+
+                                        AllocRegBetween(ConstRegs[x], hp_new, hp1, UsedRegs);
+                                        taicpu(hp1).loadreg(0, ConstRegs[x]);
+                                        Inc(x);
+                                      end;
+
+                                    taicpu(hp1).opcode := A_CMOVcc;
+                                    taicpu(hp1).condition := condition;
+                                  end;
+
                                 UpdateUsedRegs(tai(hp1.next));
-                                GetNextInstruction(hp1,hp1);
-                              until not(CanBeCMOV(hp1));
+                                GetNextInstruction(hp1, hp1);
+                              until (hp1 = hp_lblxxx);
 
-                              { Remember what hp1 is in case there's multiple aligns to get rid of }
-                              hp2 := hp1;
+                              hp2 := hp_lblxxx;
                               repeat
                                 if not Assigned(hp2) then
                                   InternalError(2018062910);
@@ -11643,138 +11979,343 @@ unit aoptx86;
 
                               if hp2.typ=ait_instruction then
                                 begin
-                                  p:=hp2;
-                                  Result:=True;
+                                  p := hp2;
+                                  Result := True;
                                 end
                               else
                                 begin
                                   UpdateUsedRegs(tai(hp2.next));
-                                  Result:=GetNextInstruction(hp2, p); { Instruction after the label }
+                                  Result := GetNextInstruction(hp2, p); { Instruction after the label }
 
                                   { Remove the label if this is its final reference }
                                   if (tasmlabel(symbol).getrefs=0) then
-                                    StripLabelFast(hp1);
+                                    begin
+                                      { Make sure the aligns get stripped too }
+                                      hp1 := tai(hp_lblxxx.Previous);
+                                      while Assigned(hp1) and (hp1.typ = ait_align) do
+                                        begin
+                                          hp_lblxxx := hp1;
+                                          hp1 := tai(hp_lblxxx.Previous);
+                                        end;
+                                      StripLabelFast(hp_lblxxx);
+                                    end;
                                 end;
 
-                              exit;
+                              Exit;
                             end;
                         end
-                      else
+                      else if assigned(hp_lblxxx) and
+                         { check further for
+                                jCC   xxx
+                                <several movs 1>
+                                jmp   yyy
+                        xxx:
+                                <several movs 2>
+                        yyy:
+                         }
+                        (l <= MAX_CMOV_INSTRUCTIONS - 1) and
+                        { hp1 should be pointing to jmp yyy }
+                        MatchInstruction(hp1, A_JMP, []) and
+                        { real label and jump, no further references to the
+                          label are allowed }
+                        (TAsmLabel(symbol).getrefs=1) and
+                        FindLabel(TAsmLabel(symbol), hp_lblxxx) then
                         begin
-                           { check further for
-                                  jCC   xxx
-                                  <several movs 1>
-                                  jmp   yyy
-                          xxx:
-                                  <several movs 2>
-                          yyy:
-                           }
-                          { hp2 points to jmp yyy }
-                          hp2:=hp1;
-                          { skip hp1 to xxx (or an align right before it) }
-                          GetNextInstruction(hp1, hp1);
-
-                          if assigned(hp2) and
-                            assigned(hp1) and
-                            (l<=3) and
-                            (hp2.typ=ait_instruction) and
-                            (taicpu(hp2).is_jmp) and
-                            (taicpu(hp2).condition=C_None) and
-                            { real label and jump, no further references to the
-                              label are allowed }
-                            (tasmlabel(symbol).getrefs=1) and
-                            FindLabel(tasmlabel(symbol),hp1) then
-                             begin
-                               l:=0;
-                               { skip hp1 to <several moves 2> }
-                               if (hp1.typ = ait_align) then
-                                 GetNextInstruction(hp1, hp1);
+                          hp_jump := hp1;
 
-                               GetNextInstruction(hp1, hpmov2);
+                          { Don't set c to zero }
+                          l := 0;
+                          w := 0;
 
-                               hp1 := hpmov2;
-                               while assigned(hp1) and
-                                 CanBeCMOV(hp1) do
-                                 begin
-                                   inc(l);
-                                   hp5 := hp1;
-                                   GetNextInstruction(hp1, hp1);
-                                 end;
-                               { hp1 points to yyy (or an align right before it) }
-                               hp3 := hp1;
-                               if assigned(hp1) and
-                                 FindLabel(tasmlabel(taicpu(hp2).oper[0]^.ref^.symbol),hp1) then
-                                 begin
-                                    AllocRegBetween(NR_DEFAULTFLAGS, p, hp5, TmpUsedRegs);
-
-                                    condition:=inverse_cond(taicpu(p).condition);
-                                    UpdateUsedRegs(tai(p.next));
-                                    GetNextInstruction(p,hp1);
-                                    repeat
-                                      taicpu(hp1).opcode:=A_CMOVcc;
-                                      taicpu(hp1).condition:=condition;
-                                      UpdateUsedRegs(tai(hp1.next));
-                                      GetNextInstruction(hp1,hp1);
-                                    until not(assigned(hp1)) or
-                                      not(CanBeCMOV(hp1));
-
-                                    condition:=inverse_cond(condition);
-                                    if GetLastInstruction(hpmov2,hp1) then
-                                      UpdateUsedRegs(tai(hp1.next));
-                                    hp1 := hpmov2;
-                                    { hp1 is now at <several movs 2> }
-                                    while Assigned(hp1) and CanBeCMOV(hp1) do
+                          GetNextInstruction(hp_lblxxx, hpmov2);
+
+                          hp2 := hp_lblxxx;
+                          hp_lblyyy := hpmov2;
+
+                          while assigned(hp_lblyyy) and
+                            { stop on labels }
+                            (hp_lblyyy.typ <> ait_label) do
+                            begin
+                              { Keep track of all integer registers that are used }
+                              UpdateIntRegsNoDealloc(TmpUsedRegs, tai(hp2.Next));
+                              if not MatchInstruction(hp_lblyyy, A_MOV, [S_W, S_L{$ifdef x86_64}, S_Q{$endif x86_64}]) then
+                                Break;
+
+                              hp2 := hp_lblyyy;
+                              GetNextInstruction(hp_lblyyy, hp_lblyyy);
+                            end;
+
+                          { Analyse the second batch of MOVs to see if the setup is valid }
+                          hp1 := hpmov2;
+                          while assigned(hp1) and
+                            (hp1 <> hp_lblyyy) do
+                            begin
+                              case hp1.typ of
+                                ait_instruction:
+                                  if MatchInstruction(hp1, A_MOV, [S_W, S_L{$ifdef x86_64}, S_Q{$endif x86_64}]) then
+                                    begin
+                                      if CanBeCMOV(hp1) then
+                                        Inc(l)
+                                      else if not (cs_opt_size in current_settings.optimizerswitches)
+                                        { CMOV with constants grows the code size }
+                                        and TryCMOVConst(hp1, hpmov2, hp_lblyyy, c, l) then
+                                        begin
+                                          { Register was reserved by TryCMOVConst and
+                                            stored on ConstRegs[c] }
+                                        end
+                                      else
+                                        Break;
+                                    end
+                                  else
+                                    Break;
+                                else
+                                  ;
+                              end;
+                              GetNextInstruction(hp1,hp1);
+                            end;
+
+                          { Repurpose TmpUsedRegs to mark registers that we've defined }
+                          TmpUsedRegs[R_INTREGISTER].Clear;
+
+                          if (l <= MAX_CMOV_INSTRUCTIONS - 1) and
+                            (hp1 = hp_lblyyy) and
+                            FindLabel(TAsmLabel(taicpu(hp_jump).oper[0]^.ref^.symbol), hp_lblyyy) then
+                            begin
+                              AllocRegBetween(NR_DEFAULTFLAGS, p, hp_lblyyy, UsedRegs);
+
+                              second_condition := taicpu(p).condition;
+                              condition := inverse_cond(taicpu(p).condition);
+                              UpdateUsedRegs(tai(p.next));
+
+                              { Scan through the first set of MOVs to update UsedRegs,
+                                but don't process them yet }
+                              hp1 := hpmov1;
+                              repeat
+                                if not Assigned(hp1) then
+                                  InternalError(2018062901);
+
+                                UpdateUsedRegs(tai(hp1.next));
+                                GetNextInstruction(hp1, hp1);
+                              until (hp1 = hp_lblxxx);
+
+                              UpdateUsedRegs(tai(hp_lblxxx.next));
+
+                              { Process the second set of MOVs first,
+                                because if a destination register is
+                                shared between the first and second MOV
+                                sets, it is more efficient to turn the
+                                first one into a MOV instruction and place
+                                it before the CMP if possible, but we
+                                won't know which registers are shared
+                                until we've processed at least one list,
+                                so we might as well make it the second
+                                one since that won't be modified again. }
+
+                              hp1 := hpmov2;
+                              repeat
+                                if not Assigned(hp1) then
+                                  InternalError(2018062902);
+
+                                if (hp1.typ = ait_instruction) then
+                                  begin
+                                    { Extra safeguard }
+                                    if (taicpu(hp1).opcode <> A_MOV) then
+                                      InternalError(2018062903);
+
+                                    if taicpu(hp1).oper[0]^.typ = top_const then
+                                      begin
+                                        RegMatch := False;
+
+                                        for x := 0 to c - 1 do
+                                          if (ConstVals[x] = taicpu(hp1).oper[0]^.val) then
+                                            begin
+                                              RegMatch := True;
+
+                                              { If it's in TmpUsedRegs, then this register
+                                                is being used more than once and hence has
+                                                already had its value defined (it gets
+                                                added to UsedRegs through AllocRegBetween
+                                                below) }
+                                              if not TmpUsedRegs[R_INTREGISTER].IsUsed(ConstRegs[x]) then
+                                                begin
+                                                  hp_new := taicpu.op_const_reg(A_MOV, taicpu(hp1).opsize, taicpu(hp1).oper[0]^.val, ConstRegs[x]);
+                                                  asml.InsertBefore(hp_new, hp_flagalloc);
+                                                  if Assigned(hp_prev2) then
+                                                    TrySwapMovOp(hp_prev2, hp_new);
+
+                                                  IncludeRegInUsedRegs(ConstRegs[x], TmpUsedRegs);
+                                                end
+                                              else
+                                                { We just need an instruction between hp_prev and hp1
+                                                  where we know the register is marked as in use }
+                                                hp_new := hpmov2;
+
+                                              AllocRegBetween(ConstRegs[x], hp_new, hp1, UsedRegs);
+                                              taicpu(hp1).loadreg(0, ConstRegs[x]);
+                                              Break;
+                                            end;
+
+                                        if not RegMatch then
+                                          InternalError(2021100411);
+                                      end;
+
+                                    taicpu(hp1).opcode := A_CMOVcc;
+                                    taicpu(hp1).condition := second_condition;
+
+                                    { Store these writes to search for
+                                      duplicates later on }
+                                    RegWrites[w] := taicpu(hp1).oper[1]^.reg;
+                                    Inc(w);
+                                  end;
+
+                                UpdateUsedRegs(tai(hp1.next));
+                                GetNextInstruction(hp1, hp1);
+                              until (hp1 = hp_lblyyy);
+
+                              { Now do the first set of MOVs }
+                              hp1 := hpmov1;
+                              repeat
+                                if not Assigned(hp1) then
+                                  InternalError(2018062904);
+
+                                if (hp1.typ = ait_instruction) then
+                                  begin
+                                    RegMatch := False;
+
+                                    { Extra safeguard }
+                                    if (taicpu(hp1).opcode <> A_MOV) then
+                                      InternalError(2018062905);
+
+                                    { Search through the RegWrites list to see
+                                      if there are any opposing CMOV pairs that
+                                      write to the same register }
+                                    for x := 0 to w - 1 do
+                                      if RegWrites[x] = taicpu(hp1).oper[1]^.reg then
+                                        begin
+                                          { We have a match.  Move this instruction
+                                            right to the top }
+
+                                          hp2 := hp1;
+                                          { Move ahead in preparation }
+                                          GetNextInstruction(hp1, hp1);
+
+                                          asml.Remove(hp2);
+                                          asml.InsertAfter(hp2, hp_prev);
+
+                                          { Note we can't use the trick of inserting before hp_prev
+                                            and then calling TrySwapMovOp with hp_prev2, like with
+                                            the MOV imm,reg optimisations, because hp2 may share a
+                                            register with the comparison }
+                                          if (hp_prev <> p) then
+                                            TrySwapMovCmp(hp_prev, hp2);
+
+                                          RegMatch := True;
+                                          Break;
+                                        end;
+
+                                    if RegMatch then
+                                      Continue;
+
+                                    if taicpu(hp1).oper[0]^.typ = top_const then
                                       begin
-                                        taicpu(hp1).opcode:=A_CMOVcc;
-                                        taicpu(hp1).condition:=condition;
-                                        UpdateUsedRegs(tai(hp1.next));
-                                        GetNextInstruction(hp1,hp1);
+                                        RegMatch := False;
+
+                                        for x := 0 to c - 1 do
+                                          if (ConstVals[x] = taicpu(hp1).oper[0]^.val) then
+                                            begin
+                                              RegMatch := True;
+
+                                              { If it's in TmpUsedRegs, then this register
+                                                is being used more than once and hence has
+                                                already had its value defined (it gets
+                                                added to UsedRegs through AllocRegBetween
+                                                below) }
+                                              if not TmpUsedRegs[R_INTREGISTER].IsUsed(ConstRegs[x]) then
+                                                begin
+                                                  hp_new := taicpu.op_const_reg(A_MOV, taicpu(hp1).opsize, taicpu(hp1).oper[0]^.val, ConstRegs[x]);
+                                                  asml.InsertBefore(hp_new, hp_flagalloc);
+                                                  if Assigned(hp_prev2) then
+                                                    TrySwapMovOp(hp_prev2, hp_new);
+
+                                                  IncludeRegInUsedRegs(ConstRegs[x], TmpUsedRegs);
+                                                end
+                                              else
+                                                { We just need an instruction between hp_prev and hp1
+                                                  where we know the register is marked as in use }
+                                                hp_new := hpmov1;
+
+                                              AllocRegBetween(ConstRegs[x], hp_new, hp1, UsedRegs);
+                                              taicpu(hp1).loadreg(0, ConstRegs[x]);
+                                              Break;
+                                            end;
+
+                                        if not RegMatch then
+                                          InternalError(2021100412);
                                       end;
 
-                                    hp1 := p;
+                                    taicpu(hp1).opcode := A_CMOVcc;
+                                    taicpu(hp1).condition := condition;
+                                  end;
+
+                                GetNextInstruction(hp1, hp1);
+                              until (hp1 = hp_jump); { Stop at the jump, not lbl xxx }
 
-                                    { Get first instruction after label }
-                                    UpdateUsedRegs(tai(hp3.next));
-                                    GetNextInstruction(hp3, p);
+                              UpdateUsedRegs(tai(hp_jump.next));
+                              UpdateUsedRegs(tai(hp_lblyyy.next));
 
-                                    if assigned(p) and (hp3.typ = ait_align) then
-                                      GetNextInstruction(p, p);
+                              { Get first instruction after label }
+                              hp1 := p;
+                              GetNextInstruction(hp_lblyyy, p);
 
-                                    { Don't dereference yet, as doing so will cause
-                                      GetNextInstruction to skip the label and
-                                      optional align marker. [Kit] }
-                                    GetNextInstruction(hp2, hp4);
+                              { Don't dereference yet, as doing so will cause
+                                GetNextInstruction to skip the label and
+                                optional align marker. [Kit] }
 
-                                    DebugMsg(SPeepholeOptimization+'JccMovJmpMov2CMovCMov',hp1);
+                              DebugMsg(SPeepholeOptimization+'JccMovJmpMov2CMovCMov',hp1);
 
-                                    { remove jCC }
-                                    RemoveInstruction(hp1);
+                              { remove Jcc }
+                              RemoveInstruction(hp1);
 
-                                    { Now we can safely decrement it }
-                                    tasmlabel(symbol).decrefs;
+                              { Now we can safely decrement it }
+                              tasmlabel(symbol).decrefs;
 
-                                    { Remove label xxx (it will have a ref of zero due to the initial check }
-                                    StripLabelFast(hp4);
+                              { Remove label xxx (it will have a ref of zero due to the initial check) }
+                              { Make sure the aligns get stripped too }
+                              hp1 := tai(hp_lblxxx.Previous);
+                              while Assigned(hp1) and (hp1.typ = ait_align) do
+                                begin
+                                  hp_lblxxx := hp1;
+                                  hp1 := tai(hp_lblxxx.Previous);
+                                end;
+                              StripLabelFast(hp_lblxxx);
 
-                                    { remove jmp }
-                                    symbol := taicpu(hp2).oper[0]^.ref^.symbol;
+                              { remove jmp }
+                              symbol := taicpu(hp_jump).oper[0]^.ref^.symbol;
 
-                                    RemoveInstruction(hp2);
+                              RemoveInstruction(hp_jump);
 
-                                    { As before, now we can safely decrement it }
-                                    tasmlabel(symbol).decrefs;
+                              { As before, now we can safely decrement it }
+                              TAsmLabel(symbol).decrefs;
 
-                                    { Remove label yyy (and the optional alignment) if its reference falls to zero }
-                                    if tasmlabel(symbol).getrefs = 0 then
-                                      StripLabelFast(hp3);
+                              { Remove label yyy (and the optional alignment) if its reference falls to zero }
+                              if TAsmLabel(symbol).getrefs = 0 then
+                                begin
+                                  { Make sure the aligns get stripped too }
+                                  hp1 := tai(hp_lblyyy.Previous);
+                                  while Assigned(hp1) and (hp1.typ = ait_align) do
+                                    begin
+                                      hp_lblyyy := hp1;
+                                      hp1 := tai(hp_lblyyy.Previous);
+                                    end;
+                                  StripLabelFast(hp_lblyyy);
+                                end;
 
-                                    if Assigned(p) then
-                                      result:=true;
-                                    exit;
-                                 end;
-                             end;
+                              if Assigned(p) then
+                                result := True;
+                              exit;
+                            end;
                         end;
-                   end;
+                    end;
 {$endif i8086}
               end;
           end;