Browse Source

* x86: Additional optimisation in OptPass2MOV to help with sub-optimal MOV/CMP/MOV triplets (and a minor refactor).

J. Gareth "Curious Kit" Moreton 1 year ago
parent
commit
755d221230
1 changed files with 120 additions and 29 deletions
  1. 120 29
      compiler/x86/aoptx86.pas

+ 120 - 29
compiler/x86/aoptx86.pas

@@ -10003,13 +10003,64 @@ unit aoptx86;
         if not GetNextInstruction(p, hp1) then
         if not GetNextInstruction(p, hp1) then
           Exit;
           Exit;
 
 
-        if MatchInstruction(hp1, A_CMP, A_TEST, [taicpu(p).opsize])
-          and DoMovCmpMemOpt(p, hp1) then
+        if MatchInstruction(hp1, A_CMP, A_TEST, []) then
           begin
           begin
-            Result := True;
-            Exit;
-          end
-        else if MatchInstruction(hp1, A_JMP, [S_NO]) then
+            if (taicpu(hp1).opsize = taicpu(p).opsize) and DoMovCmpMemOpt(p, hp1) then
+              begin
+                Result := True;
+                Exit;
+              end;
+
+            { This optimisation is only effective on a second run of Pass 2,
+              hence -O3 or above.
+
+              Change:
+                mov      %reg1,%reg2
+                cmp/test (contains %reg1)
+                mov      x,    %reg1
+                (another mov or a j(c))
+
+              To:
+                mov      %reg1,%reg2
+                mov      x,    %reg1
+                cmp/test (%reg1 replaced with %reg2)
+                (another mov or a j(c))
+
+              The requirement of an additional MOV or a jump ensures there
+              isn't performance loss, since a j(c) will permit macro-fusion
+              with the cmp instruction, while another MOV likely means it's
+              not all being executed in a single cycle due to parallelisation.
+            }
+            if (cs_opt_level3 in current_settings.optimizerswitches) and
+              MatchOpType(taicpu(p), top_reg, top_reg) and
+              RegInInstruction(taicpu(p).oper[0]^.reg, taicpu(hp1)) and
+              GetNextInstruction(hp1, hp2) and
+              MatchInstruction(hp2, A_MOV, []) and
+              (taicpu(hp2).oper[1]^.typ = top_reg) and
+              { Registers don't have to be the same size in this case }
+              SuperRegistersEqual(taicpu(hp2).oper[1]^.reg, taicpu(p).oper[0]^.reg) and
+              GetNextInstruction(hp2, hp3) and
+              MatchInstruction(hp3, A_MOV, A_Jcc, []) and
+              { Make sure the operands in the comparison can be safely replaced }
+              (
+                not RegInOp(taicpu(p).oper[0]^.reg, taicpu(hp1).oper[0]^) or
+                ReplaceRegisterInOper(taicpu(hp1), 0, taicpu(p).oper[0]^.reg, taicpu(p).oper[1]^.reg)
+              ) and
+              (
+                not RegInOp(taicpu(p).oper[0]^.reg, taicpu(hp1).oper[1]^) or
+                ReplaceRegisterInOper(taicpu(hp1), 1, taicpu(p).oper[0]^.reg, taicpu(p).oper[1]^.reg)
+              ) then
+              begin
+                DebugMsg(SPeepholeOptimization + 'MOV/CMP/MOV -> MOV/MOV/CMP', p);
+                AsmL.Remove(hp2);
+                AsmL.InsertAfter(hp2, p);
+
+                Result := True;
+                Exit;
+              end;
+          end;
+
+        if MatchInstruction(hp1, A_JMP, [S_NO]) then
           begin
           begin
             { Sometimes the MOVs that OptPass2JMP produces can be improved
             { Sometimes the MOVs that OptPass2JMP produces can be improved
               further, but we can't just put this jump optimisation in pass 1
               further, but we can't just put this jump optimisation in pass 1
@@ -10019,21 +10070,30 @@ unit aoptx86;
             UpdateUsedRegs(tai(p.Next));
             UpdateUsedRegs(tai(p.Next));
 
 
             if OptPass2JMP(hp1) then
             if OptPass2JMP(hp1) then
-              { call OptPass1MOV once to potentially merge any MOVs that were created }
-              Result := OptPass1MOV(p);
-              { OptPass2MOV will now exit but will be called again if OptPass1MOV
-                returned True and the instruction is still a MOV, thus checking
-                the optimisations below }
+              begin
+                { Restore register state }
+                RestoreUsedRegs(TempTracking);
+                ReleaseUsedRegs(TempTracking);
+
+                { call OptPass1MOV once to potentially merge any MOVs that were created }
+                OptPass1MOV(p);
+                Result := True;
+                Exit;
+              end;
 
 
             { If OptPass2JMP returned False, no optimisations were done to
             { If OptPass2JMP returned False, no optimisations were done to
               the jump and there are no further optimisations that can be done
               the jump and there are no further optimisations that can be done
-              to the MOV instruction on this pass }
+              to the MOV instruction on this pass other than FuncMov2Func }
 
 
             { Restore register state }
             { Restore register state }
             RestoreUsedRegs(TempTracking);
             RestoreUsedRegs(TempTracking);
             ReleaseUsedRegs(TempTracking);
             ReleaseUsedRegs(TempTracking);
-          end
-        else if MatchOpType(taicpu(p),top_reg,top_reg) and
+
+            Result := FuncMov2Func(p, hp1);
+            Exit;
+          end;
+
+        if MatchOpType(taicpu(p),top_reg,top_reg) and
           (taicpu(p).opsize in [S_L{$ifdef x86_64}, S_Q{$endif x86_64}]) and
           (taicpu(p).opsize in [S_L{$ifdef x86_64}, S_Q{$endif x86_64}]) and
           MatchInstruction(hp1,A_ADD,A_SUB,[taicpu(p).opsize]) and
           MatchInstruction(hp1,A_ADD,A_SUB,[taicpu(p).opsize]) and
           (taicpu(hp1).oper[1]^.typ = top_reg) and
           (taicpu(hp1).oper[1]^.typ = top_reg) and
@@ -10076,8 +10136,9 @@ unit aoptx86;
                     Exit;
                     Exit;
                   end;
                   end;
               end;
               end;
-          end
-        else if MatchOpType(taicpu(p),top_reg,top_reg) and
+          end;
+
+        if MatchOpType(taicpu(p),top_reg,top_reg) and
 {$ifdef x86_64}
 {$ifdef x86_64}
           MatchInstruction(hp1,A_MOVZX,A_MOVSX,A_MOVSXD,[]) and
           MatchInstruction(hp1,A_MOVZX,A_MOVSX,A_MOVSXD,[]) and
 {$else x86_64}
 {$else x86_64}
@@ -10105,11 +10166,12 @@ unit aoptx86;
                 Result:=true;
                 Result:=true;
               end;
               end;
 
 
-            exit;
-          end
-        else if MatchOpType(taicpu(p),top_reg,top_reg) and
+            Exit;
+          end;
+
+        if MatchOpType(taicpu(p),top_reg,top_reg) and
           IsXCHGAcceptable and
           IsXCHGAcceptable and
-          { XCHG doesn't support 8-byte registers }
+          { XCHG doesn't support 8-bit registers }
           (taicpu(p).opsize <> S_B) and
           (taicpu(p).opsize <> S_B) and
           MatchInstruction(hp1, A_MOV, []) and
           MatchInstruction(hp1, A_MOV, []) and
           MatchOpType(taicpu(hp1),top_reg,top_reg) and
           MatchOpType(taicpu(hp1),top_reg,top_reg) and
@@ -10146,8 +10208,9 @@ unit aoptx86;
                 Result := True;
                 Result := True;
                 Exit;
                 Exit;
               end;
               end;
-          end
-        else if MatchOpType(taicpu(p),top_reg,top_reg) and
+          end;
+
+        if MatchOpType(taicpu(p),top_reg,top_reg) and
           MatchInstruction(hp1, A_SAR, []) then
           MatchInstruction(hp1, A_SAR, []) then
           begin
           begin
             if MatchOperand(taicpu(hp1).oper[0]^, 31) then
             if MatchOperand(taicpu(hp1).oper[0]^, 31) then
@@ -10172,7 +10235,9 @@ unit aoptx86;
                         taicpu(p).clearop(1);
                         taicpu(p).clearop(1);
                         taicpu(p).clearop(0);
                         taicpu(p).clearop(0);
                         taicpu(p).ops:=0;
                         taicpu(p).ops:=0;
+
                         Result := True;
                         Result := True;
+                        Exit;
                       end
                       end
                     else if (cs_opt_size in current_settings.optimizerswitches) and
                     else if (cs_opt_size in current_settings.optimizerswitches) and
                       (taicpu(p).oper[0]^.reg = NR_EDX) and
                       (taicpu(p).oper[0]^.reg = NR_EDX) and
@@ -10194,6 +10259,9 @@ unit aoptx86;
                         taicpu(hp1).clearop(1);
                         taicpu(hp1).clearop(1);
                         taicpu(hp1).clearop(0);
                         taicpu(hp1).clearop(0);
                         taicpu(hp1).ops:=0;
                         taicpu(hp1).ops:=0;
+
+                        Include(OptsToCheck, aoc_ForceNewIteration);
+                        Exit;
                       end;
                       end;
 {$ifndef x86_64}
 {$ifndef x86_64}
                   end
                   end
@@ -10273,6 +10341,9 @@ unit aoptx86;
                                   else
                                   else
                                     ;
                                     ;
                                 end;
                                 end;
+
+                            Result := True;
+                            Exit;
                           end;
                           end;
                       end;
                       end;
 {$else x86_64}
 {$else x86_64}
@@ -10299,7 +10370,9 @@ unit aoptx86;
                     taicpu(p).clearop(1);
                     taicpu(p).clearop(1);
                     taicpu(p).clearop(0);
                     taicpu(p).clearop(0);
                     taicpu(p).ops:=0;
                     taicpu(p).ops:=0;
+
                     Result := True;
                     Result := True;
+                    Exit;
                   end
                   end
                 else if (cs_opt_size in current_settings.optimizerswitches) and
                 else if (cs_opt_size in current_settings.optimizerswitches) and
                   (taicpu(p).oper[0]^.reg = NR_RDX) and
                   (taicpu(p).oper[0]^.reg = NR_RDX) and
@@ -10321,11 +10394,15 @@ unit aoptx86;
                     taicpu(hp1).clearop(1);
                     taicpu(hp1).clearop(1);
                     taicpu(hp1).clearop(0);
                     taicpu(hp1).clearop(0);
                     taicpu(hp1).ops:=0;
                     taicpu(hp1).ops:=0;
+
+                    Include(OptsToCheck, aoc_ForceNewIteration);
+                    Exit;
 {$endif x86_64}
 {$endif x86_64}
                   end;
                   end;
               end;
               end;
-          end
-        else if MatchInstruction(hp1, A_MOV, []) and
+          end;
+
+        if MatchInstruction(hp1, A_MOV, []) and
           (taicpu(hp1).oper[1]^.typ = top_reg) then
           (taicpu(hp1).oper[1]^.typ = top_reg) then
           { Though "GetNextInstruction" could be factored out, along with
           { Though "GetNextInstruction" could be factored out, along with
             the instructions that depend on hp2, it is an expensive call that
             the instructions that depend on hp2, it is an expensive call that
@@ -10376,6 +10453,8 @@ unit aoptx86;
                     taicpu(hp1).ops:=0;
                     taicpu(hp1).ops:=0;
 
 
                     RemoveInstruction(hp2);
                     RemoveInstruction(hp2);
+
+                    Include(OptsToCheck, aoc_ForceNewIteration);
 (*
 (*
 {$ifdef x86_64}
 {$ifdef x86_64}
                   end
                   end
@@ -10423,13 +10502,16 @@ unit aoptx86;
                     taicpu(hp1).ops:=0;
                     taicpu(hp1).ops:=0;
 
 
                     RemoveInstruction(hp2);
                     RemoveInstruction(hp2);
+
+                    Include(OptsToCheck, aoc_ForceNewIteration);
 {$endif x86_64}
 {$endif x86_64}
 *)
 *)
                   end;
                   end;
               end;
               end;
 {$ifdef x86_64}
 {$ifdef x86_64}
-          end
-        else if (taicpu(p).opsize = S_L) and
+          end;
+
+        if (taicpu(p).opsize = S_L) and
           (taicpu(p).oper[1]^.typ = top_reg) and
           (taicpu(p).oper[1]^.typ = top_reg) and
           (
           (
             MatchInstruction(hp1, A_MOV,[]) and
             MatchInstruction(hp1, A_MOV,[]) and
@@ -10502,10 +10584,17 @@ unit aoptx86;
             DebugMsg(SPeepholeOptimization + 'MovMov*Shr2MovMov*Rcr', p);
             DebugMsg(SPeepholeOptimization + 'MovMov*Shr2MovMov*Rcr', p);
 
 
             if (getsupreg(taicpu(hp2).oper[1]^.reg) = getsupreg(taicpu(hp1).oper[1]^.reg)) then
             if (getsupreg(taicpu(hp2).oper[1]^.reg) = getsupreg(taicpu(hp1).oper[1]^.reg)) then
-              { Change first MOV command to have the same register as the final output }
-              taicpu(p).oper[1]^.reg := taicpu(hp1).oper[1]^.reg
+              begin
+                { Change first MOV command to have the same register as the final output }
+                taicpu(p).oper[1]^.reg := taicpu(hp1).oper[1]^.reg;
+                AllocRegBetween(taicpu(hp1).oper[1]^.reg, p, hp1, UsedRegs);
+                Result := True;
+              end
             else
             else
-              taicpu(hp1).oper[1]^.reg := taicpu(p).oper[1]^.reg;
+              begin
+                taicpu(hp1).oper[1]^.reg := taicpu(p).oper[1]^.reg;
+                Include(OptsToCheck, aoc_ForceNewIteration);
+              end;
 
 
             { Change second MOV command to an ADD command. This is easier than
             { Change second MOV command to an ADD command. This is easier than
               converting the existing command because it means we don't have to
               converting the existing command because it means we don't have to
@@ -10520,6 +10609,8 @@ unit aoptx86;
             taicpu(hp3).opcode := A_RCR;
             taicpu(hp3).opcode := A_RCR;
             taicpu(hp3).changeopsize(S_L);
             taicpu(hp3).changeopsize(S_L);
             setsubreg(taicpu(hp3).oper[1]^.reg, R_SUBD);
             setsubreg(taicpu(hp3).oper[1]^.reg, R_SUBD);
+            { Don't need to Exit yet as p is still a MOV and hp1 hasn't been
+              called, so FuncMov2Func below is safe to call }
 {$endif x86_64}
 {$endif x86_64}
           end;
           end;