
x86: Deeper insight in OptPass2ADD and OptPass2SUB to produce more efficient code

J. Gareth "Kit" Moreton 2 years ago
commit 5a6d9ff532
1 changed file with 301 additions and 91 deletions

+ 301 - 91
compiler/x86/aoptx86.pas

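In outline, the commit adds a counting variant of GetNextInstructionUsingReg and uses it so that OptPass2ADD and OptPass2SUB can look a bounded number of instructions ahead under -O3, and so that OptPass2MOV gets a chance to rewrite a following MOV before the ADD/SUB-to-LEA transformations run (the comments in the hunks below give the addq/cqto example). The sketch below condenses the new look-ahead gate from the OptPass2ADD hunk; it is an illustration only, not part of the patch, and assumes the surrounding pass context:

    { Sketch: Distance = 0 means no follow-up instruction was found, and
      anything more than 3 instructions away is treated as too far to be
      worth transforming.  At -O2 and below the helper simply returns the
      next instruction, so the register must also be confirmed explicitly. }
    Distance := GetNextInstructionUsingRegCount(p, hp1, taicpu(p).oper[1]^.reg);
    if (Distance = 0) or (Distance > 3) or (hp1.typ <> ait_instruction) or
      not ((cs_opt_level3 in current_settings.optimizerswitches) or
        RegInInstruction(taicpu(p).oper[1]^.reg, hp1)) then
      Exit;
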
@@ -55,6 +55,12 @@ unit aoptx86;
         function RegInInstruction(Reg: TRegister; p1: tai): Boolean;override;
         function GetNextInstructionUsingReg(Current: tai; out Next: tai; reg: TRegister): Boolean;
 
+        { Identical to GetNextInstructionUsingReg, but returns a value indicating
+          how many instructions away Next is from Current.
+
+          0 = failure, equivalent to False in GetNextInstructionUsingReg }
+        function GetNextInstructionUsingRegCount(Current: tai; out Next: tai; reg: TRegister): Cardinal;
+
         { This version of GetNextInstructionUsingReg will look across conditional jumps,
           potentially allowing further optimisation (although it might need to know if
           it crossed a conditional jump. }
@@ -478,6 +484,27 @@ unit aoptx86;
     end;
 
 
+  function TX86AsmOptimizer.GetNextInstructionUsingRegCount(Current: tai; out Next: tai; reg: TRegister): Cardinal;
+    var
+      GetNextResult: Boolean;
+    begin
+      Result:=0;
+      Next:=Current;
+      repeat
+        GetNextResult := GetNextInstruction(Next,Next);
+        if GetNextResult then
+          Inc(Result)
+        else
+          { Must return zero upon hitting the end of the linked list without a match }
+          Result := 0;
+      until not (GetNextResult) or
+            not(cs_opt_level3 in current_settings.optimizerswitches) or
+            (Next.typ<>ait_instruction) or
+            RegInInstruction(reg,Next) or
+            is_calljmp(taicpu(Next).opcode);
+    end;
+
+
   function TX86AsmOptimizer.GetNextInstructionUsingRegCond(Current: tai; out Next: tai; reg: TRegister; var JumpTracking: TLinkedList; var CrossJump: Boolean): Boolean;
 
     procedure TrackJump(Symbol: TAsmSymbol);
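
For reference, the possible outcomes of the new helper with a minimal caller-side check (a sketch using names from this unit, not part of the patch): Result = 0 means the search ran off the end of the instruction list; a non-zero Result is the distance to the tai where the search stopped, which may be a call/jump or a non-instruction rather than a user of the register, so callers still verify with RegInInstruction.

    Distance := GetNextInstructionUsingRegCount(p, hp1, taicpu(p).oper[1]^.reg);
    if (Distance <> 0) and (hp1.typ = ait_instruction) and
      RegInInstruction(taicpu(p).oper[1]^.reg, hp1) then
      { hp1 is a genuine user of the register, Distance instructions after p;
        the debug message is only an example }
      DebugMsg(SPeepholeOptimization + 'Register user found within range', hp1);
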
@@ -12631,7 +12658,10 @@ unit aoptx86;
 
     function TX86AsmOptimizer.OptPass2ADD(var p : tai) : boolean;
       var
-        hp1: tai; NewRef: TReference;
+        hp1, hp2: tai;
+        NewRef: TReference;
+        Distance: Cardinal;
+        TempTracking: TAllUsedRegs;
 
         { This entire nested function is used in an if-statement below, but we
           want to avoid all the used reg transfers and GetNextInstruction calls
@@ -12644,18 +12674,82 @@ unit aoptx86;
             hp2 := p;
             repeat
               UpdateUsedRegs(TmpUsedRegs, tai(hp2.Next));
-            until not GetNextInstruction(hp2, hp2) or (hp2 = hp1);
+            until not (cs_opt_level3 in current_settings.optimizerswitches) or not GetNextInstruction(hp2, hp2) or (hp2 = hp1);
 
             Result := not RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp1, TmpUsedRegs);
           end;
 
       begin
         Result := False;
-        if not GetNextInstruction(p, hp1) or (hp1.typ <> ait_instruction) then
-          Exit;
 
-        if (taicpu(p).opsize in [S_L{$ifdef x86_64}, S_Q{$endif}]) then
+        if (taicpu(p).opsize in [S_L{$ifdef x86_64}, S_Q{$endif}]) and
+          (taicpu(p).oper[1]^.typ = top_reg) then
           begin
+            Distance := GetNextInstructionUsingRegCount(p, hp1, taicpu(p).oper[1]^.reg);
+            if (Distance = 0) or (Distance > 3) { Likely too far to make a meaningful difference } or
+              (hp1.typ <> ait_instruction) or
+              not
+              (
+                (cs_opt_level3 in current_settings.optimizerswitches) or
+                { GetNextInstructionUsingRegCount just returns the next valid instruction under -O2 and under }
+                RegInInstruction(taicpu(p).oper[1]^.reg, hp1)
+              ) then
+              Exit;
+
+            { Some of the MOV optimisations are much more in-depth.  For example, if we have:
+                addq $x,   %rax
+                movq %rax, %rdx
+                sarq $63,  %rdx
+                (%rax still in use)
+
+              ...letting OptPass2ADD run its course (and without -Os) will produce:
+                leaq $x(%rax),%rdx
+                addq $x,   %rax
+                sarq $63,  %rdx
+
+              ...which is okay since it breaks the dependency chain between
+                addq and movq, but if OptPass2MOV is called first:
+
+                addq $x,   %rax
+                cqto
+
+              ...which is better in all ways, taking only 2 cycles to execute
+                and much smaller in code size.
+            }
+
+            { The extra register tracking is quite strenuous }
+            if (cs_opt_level2 in current_settings.optimizerswitches) and
+              MatchInstruction(hp1, A_MOV, []) then
+              begin
+                { Update the register tracking to the MOV instruction }
+                CopyUsedRegs(TempTracking);
+                hp2 := p;
+                repeat
+                  UpdateUsedRegs(tai(hp2.Next));
+                until not (cs_opt_level3 in current_settings.optimizerswitches) or not GetNextInstruction(hp2, hp2) or (hp2 = hp1);
+
+                { if hp1 <> hp2 after the call, then hp1 got removed, so let
+                  OptPass2ADD get called again }
+                if OptPass2MOV(hp1) and (hp1 <> hp2) then
+                  begin
+                    { Reset the tracking to the current instruction }
+                    RestoreUsedRegs(TempTracking);
+                    ReleaseUsedRegs(TempTracking);
+
+                    Result := True;
+                    Exit;
+                  end;
+
+                { Reset the tracking to the current instruction }
+                RestoreUsedRegs(TempTracking);
+                ReleaseUsedRegs(TempTracking);
+
+                { If OptPass2MOV returned True, we don't need to set Result to
+                  True if hp1 didn't change because the ADD instruction didn't
+                  get modified and we'll be evaluating hp1 again when the
+                  peephole optimizer reaches it }
+              end;
+
             { Change:
                 add     %reg2,%reg1
                 mov/s/z #(%reg1),%reg1  (%reg1 superregisters must be the same)
@@ -12663,7 +12757,7 @@ unit aoptx86;
               To:
                 mov/s/z #(%reg1,%reg2),%reg1
             }
-            if MatchOpType(taicpu(p), top_reg, top_reg) and
+            if (taicpu(p).oper[0]^.typ = top_reg) and
               MatchInstruction(hp1, [A_MOV, A_MOVZX, A_MOVSX{$ifdef x86_64}, A_MOVSXD{$endif}], []) and
               MatchOpType(taicpu(hp1), top_ref, top_reg) and
               (taicpu(hp1).oper[0]^.ref^.scalefactor <= 1) and
@@ -12686,11 +12780,18 @@ unit aoptx86;
                 )
               ) then
               begin
+                AllocRegBetween(taicpu(p).oper[0]^.reg, p, hp1, UsedRegs);
                 taicpu(hp1).oper[0]^.ref^.base := taicpu(p).oper[1]^.reg;
                 taicpu(hp1).oper[0]^.ref^.index := taicpu(p).oper[0]^.reg;
 
                 DebugMsg(SPeepholeOptimization + 'AddMov2Mov done', p);
-                RemoveCurrentp(p, hp1);
+
+                if (cs_opt_level3 in current_settings.optimizerswitches) then
+                  { hp1 may not be the immediate next instruction under -O3 }
+                  RemoveCurrentp(p)
+                else
+                  RemoveCurrentp(p, hp1);
+
                 Result := True;
                 Exit;
               end;
@@ -12704,51 +12805,69 @@ unit aoptx86;
 
               Breaks the dependency chain.
             }
-            if MatchOpType(taicpu(p),top_const,top_reg) and
+            if (taicpu(p).oper[0]^.typ = top_const) and
               MatchInstruction(hp1, A_MOV, [taicpu(p).opsize]) and
               (taicpu(hp1).oper[1]^.typ = top_reg) and
               MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[1]^.reg) and
               (
-                { Don't do AddMov2LeaAdd under -Os, but do allow AddMov2Lea }
-                not (cs_opt_size in current_settings.optimizerswitches) or
-                (
-                  not RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp1, TmpUsedRegs) and
-                  RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp1, TmpUsedRegs)
-                )
+                { Instructions are guaranteed to be adjacent on -O2 and under }
+                not (cs_opt_level3 in current_settings.optimizerswitches) or
+                not RegUsedBetween(taicpu(hp1).oper[1]^.reg, p, hp1)
               ) then
               begin
-                { Change the MOV instruction to a LEA instruction, and update the
-                  first operand }
+                TransferUsedRegs(TmpUsedRegs);
+                hp2 := p;
+                repeat
+                  UpdateUsedRegs(TmpUsedRegs, tai(hp2.Next));
+                until not (cs_opt_level3 in current_settings.optimizerswitches) or not GetNextInstruction(hp2, hp2) or (hp2 = hp1);
 
-                reference_reset(NewRef, 1, []);
-                NewRef.base := taicpu(p).oper[1]^.reg;
-                NewRef.scalefactor := 1;
-                NewRef.offset := asizeint(taicpu(p).oper[0]^.val);
+                if (
+                    { Don't do AddMov2LeaAdd under -Os, but do allow AddMov2Lea }
+                    not (cs_opt_size in current_settings.optimizerswitches) or
+                    (
+                      not RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp1, TmpUsedRegs) and
+                      not RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp1, TmpUsedRegs)
+                    )
+                  ) then
+                  begin
+                    { Change the MOV instruction to a LEA instruction, and update the
+                      first operand }
 
-                taicpu(hp1).opcode := A_LEA;
-                taicpu(hp1).loadref(0, NewRef);
+                    reference_reset(NewRef, 1, []);
+                    NewRef.base := taicpu(p).oper[1]^.reg;
+                    NewRef.scalefactor := 1;
+                    NewRef.offset := asizeint(taicpu(p).oper[0]^.val);
 
-                TransferUsedRegs(TmpUsedRegs);
-                UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
-                if RegUsedAfterInstruction(NewRef.base, hp1, TmpUsedRegs) or
-                  RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp1, TmpUsedRegs) then
-                  begin
-                    { Move what is now the LEA instruction to before the SUB instruction }
-                    Asml.Remove(hp1);
-                    Asml.InsertBefore(hp1, p);
-                    AllocRegBetween(taicpu(hp1).oper[1]^.reg, hp1, p, UsedRegs);
+                    taicpu(hp1).opcode := A_LEA;
+                    taicpu(hp1).loadref(0, NewRef);
 
-                    DebugMsg(SPeepholeOptimization + 'AddMov2LeaAdd', p);
-                    p := hp1;
-                  end
-                else
-                  begin
-                    { Since %reg1 or the flags aren't used afterwards, we can delete p completely }
-                    RemoveCurrentP(p, hp1);
-                    DebugMsg(SPeepholeOptimization + 'AddMov2Lea', p);
-                  end;
+                    if RegUsedAfterInstruction(NewRef.base, hp1, TmpUsedRegs) or
+                      RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp1, TmpUsedRegs) then
+                      begin
+                        hp2 := tai(hp1.Next); { for the benefit of AllocRegBetween }
 
-                Result := True;
+                        { Move what is now the LEA instruction to before the ADD instruction }
+                        Asml.Remove(hp1);
+                        Asml.InsertBefore(hp1, p);
+                        AllocRegBetween(taicpu(hp1).oper[1]^.reg, hp1, hp2, UsedRegs);
+
+                        DebugMsg(SPeepholeOptimization + 'AddMov2LeaAdd', p);
+                        p := hp1;
+                      end
+                    else
+                      begin
+                        { Since %reg1 or the flags aren't used afterwards, we can delete p completely }
+                        DebugMsg(SPeepholeOptimization + 'AddMov2Lea', hp1);
+
+                        if (cs_opt_level3 in current_settings.optimizerswitches) then
+                          { hp1 may not be the immediate next instruction under -O3 }
+                          RemoveCurrentp(p)
+                        else
+                          RemoveCurrentp(p, hp1);
+                      end;
+
+                    Result := True;
+                  end;
               end;
           end;
       end;
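
Both OptPass2ADD (above) and OptPass2SUB (below) now wrap the nested OptPass2MOV call in a snapshot of the register tracking, so the state seen by the rest of the pass is unchanged. A condensed sketch of that pattern, using only names that appear in the hunks (the early Exit path and surrounding checks are omitted, so treat it as an illustration rather than the exact code):

    CopyUsedRegs(TempTracking);      { snapshot the tracking at the ADD/SUB }
    hp2 := p;
    repeat                           { walk the live tracking up to the MOV }
      UpdateUsedRegs(tai(hp2.Next));
    until not (cs_opt_level3 in current_settings.optimizerswitches) or
      not GetNextInstruction(hp2, hp2) or (hp2 = hp1);

    { if hp1 <> hp2 after the call, the MOV was removed, so the pass reports
      a change and OptPass2ADD/OptPass2SUB is evaluated again }
    if OptPass2MOV(hp1) and (hp1 <> hp2) then
      Result := True;

    RestoreUsedRegs(TempTracking);   { rewind the tracking to the ADD/SUB }
    ReleaseUsedRegs(TempTracking);
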
@@ -12800,65 +12919,156 @@ unit aoptx86;
 
     function TX86AsmOptimizer.OptPass2SUB(var p: tai): Boolean;
       var
-        hp1: tai; NewRef: TReference;
-      begin
-        { Change:
-            subl/q $x,%reg1
-            movl/q %reg1,%reg2
-          To:
-            leal/q $-x(%reg1),%reg2
-            subl/q $x,%reg1 (can be removed if %reg1 or the flags are not used afterwards)
+        hp1, hp2: tai;
+        NewRef: TReference;
+        Distance: Cardinal;
+        TempTracking: TAllUsedRegs;
 
-          Breaks the dependency chain and potentially permits the removal of
-          a CMP instruction if one follows.
-        }
+      begin
         Result := False;
-        if (taicpu(p).opsize in [S_L{$ifdef x86_64}, S_Q{$endif x86_64}]) and
-          MatchOpType(taicpu(p),top_const,top_reg) and
-          GetNextInstruction(p, hp1) and
-          MatchInstruction(hp1, A_MOV, [taicpu(p).opsize]) and
-          (taicpu(hp1).oper[1]^.typ = top_reg) and
-          MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[1]^.reg) and
-          (
-            { Don't do SubMov2LeaSub under -Os, but do allow SubMov2Lea }
-            not (cs_opt_size in current_settings.optimizerswitches) or
-            (
-              not RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp1, TmpUsedRegs) and
-              RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp1, TmpUsedRegs)
-            )
-          ) then
+
+        if (taicpu(p).opsize in [S_L{$ifdef x86_64}, S_Q{$endif}]) and
+          MatchOpType(taicpu(p),top_const,top_reg) then
           begin
-            { Change the MOV instruction to a LEA instruction, and update the
-              first operand }
-            reference_reset(NewRef, 1, []);
-            NewRef.base := taicpu(p).oper[1]^.reg;
-            NewRef.scalefactor := 1;
-            NewRef.offset := -taicpu(p).oper[0]^.val;
+            Distance := GetNextInstructionUsingRegCount(p, hp1, taicpu(p).oper[1]^.reg);
+            if (Distance = 0) or (Distance > 3) { Likely too far to make a meaningful difference } or
+              (hp1.typ <> ait_instruction) or
+              not
+              (
+                (cs_opt_level3 in current_settings.optimizerswitches) or
+                { GetNextInstructionUsingRegCount just returns the next valid instruction under -O2 and under }
+                RegInInstruction(taicpu(p).oper[1]^.reg, hp1)
+              ) then
+              Exit;
 
-            taicpu(hp1).opcode := A_LEA;
-            taicpu(hp1).loadref(0, NewRef);
+            { Some of the MOV optimisations are much more in-depth.  For example, if we have:
+                subq $x,   %rax
+                movq %rax, %rdx
+                sarq $63,  %rdx
+                (%rax still in use)
 
-            TransferUsedRegs(TmpUsedRegs);
-            UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
-            if RegUsedAfterInstruction(NewRef.base, hp1, TmpUsedRegs) or
-              RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp1, TmpUsedRegs) then
-              begin
-                { Move what is now the LEA instruction to before the SUB instruction }
-                Asml.Remove(hp1);
-                Asml.InsertBefore(hp1, p);
-                AllocRegBetween(taicpu(hp1).oper[1]^.reg, hp1, p, UsedRegs);
+              ...letting OptPass2SUB run its course (and without -Os) will produce:
+                leaq $-x(%rax),%rdx
+                subq $x,   %rax
+                sarq $63,  %rdx
 
-                DebugMsg(SPeepholeOptimization + 'SubMov2LeaSub', p);
-                p := hp1;
-              end
-            else
+              ...which is okay since it breaks the dependency chain between
+                subq and movq, but if OptPass2MOV is called first:
+
+                subq $x,   %rax
+                cqto
+
+              ...which is better in all ways, taking only 2 cycles to execute
+                and much smaller in code size.
+            }
+
+            { The extra register tracking is quite strenuous }
+            if (cs_opt_level2 in current_settings.optimizerswitches) and
+              MatchInstruction(hp1, A_MOV, []) then
               begin
-                { Since %reg1 or the flags aren't used afterwards, we can delete p completely }
-                RemoveCurrentP(p, hp1);
-                DebugMsg(SPeepholeOptimization + 'SubMov2Lea', p);
+                { Update the register tracking to the MOV instruction }
+                CopyUsedRegs(TempTracking);
+                hp2 := p;
+                repeat
+                  UpdateUsedRegs(tai(hp2.Next));
+                until not (cs_opt_level3 in current_settings.optimizerswitches) or not GetNextInstruction(hp2, hp2) or (hp2 = hp1);
+
+                { if hp1 <> hp2 after the call, then hp1 got removed, so let
+                  OptPass2SUB get called again }
+                if OptPass2MOV(hp1) and (hp1 <> hp2) then
+                  begin
+                    { Reset the tracking to the current instruction }
+                    RestoreUsedRegs(TempTracking);
+                    ReleaseUsedRegs(TempTracking);
+
+                    Result := True;
+                    Exit;
+                  end;
+
+                { Reset the tracking to the current instruction }
+                RestoreUsedRegs(TempTracking);
+                ReleaseUsedRegs(TempTracking);
+
+                { If OptPass2MOV returned True, we don't need to set Result to
+                  True if hp1 didn't change because the SUB instruction didn't
+                  get modified and we'll be evaluating hp1 again when the
+                  peephole optimizer reaches it }
               end;
 
-            Result := True;
+            { Change:
+                subl/q $x,%reg1
+                movl/q %reg1,%reg2
+              To:
+                leal/q $-x(%reg1),%reg2
+                subl/q $x,%reg1 (can be removed if %reg1 or the flags are not used afterwards)
+
+              Breaks the dependency chain and potentially permits the removal of
+              a CMP instruction if one follows.
+            }
+            if MatchInstruction(hp1, A_MOV, [taicpu(p).opsize]) and
+              (taicpu(hp1).oper[1]^.typ = top_reg) and
+              MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[1]^.reg) and
+              (
+                { Instructions are guaranteed to be adjacent on -O2 and under }
+                not (cs_opt_level3 in current_settings.optimizerswitches) or
+                not RegUsedBetween(taicpu(hp1).oper[1]^.reg, p, hp1)
+              ) then
+              begin
+                TransferUsedRegs(TmpUsedRegs);
+                hp2 := p;
+                repeat
+                  UpdateUsedRegs(TmpUsedRegs, tai(hp2.Next));
+                until not (cs_opt_level3 in current_settings.optimizerswitches) or not GetNextInstruction(hp2, hp2) or (hp2 = hp1);
+
+                if (
+                    { Don't do SubMov2LeaSub under -Os, but do allow SubMov2Lea }
+                    not (cs_opt_size in current_settings.optimizerswitches) or
+                    (
+                      not RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp1, TmpUsedRegs) and
+                      not RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp1, TmpUsedRegs)
+                    )
+                  ) then
+                  begin
+                    { Change the MOV instruction to a LEA instruction, and update the
+                      first operand }
+                    reference_reset(NewRef, 1, []);
+                    NewRef.base := taicpu(p).oper[1]^.reg;
+                    NewRef.scalefactor := 1;
+                    NewRef.offset := -taicpu(p).oper[0]^.val;
+
+                    taicpu(hp1).opcode := A_LEA;
+                    taicpu(hp1).loadref(0, NewRef);
+
+                    TransferUsedRegs(TmpUsedRegs);
+                    UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
+                    if RegUsedAfterInstruction(NewRef.base, hp1, TmpUsedRegs) or
+                      RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp1, TmpUsedRegs) then
+                      begin
+                        hp2 := tai(hp1.Next); { for the benefit of AllocRegBetween }
+
+                        { Move what is now the LEA instruction to before the SUB instruction }
+                        Asml.Remove(hp1);
+                        Asml.InsertBefore(hp1, p);
+                        AllocRegBetween(taicpu(hp1).oper[1]^.reg, hp1, hp2, UsedRegs);
+
+                        DebugMsg(SPeepholeOptimization + 'SubMov2LeaSub', p);
+                        p := hp1;
+                      end
+                    else
+                      begin
+                        { Since %reg1 or the flags aren't used afterwards, we can delete p completely }
+                        DebugMsg(SPeepholeOptimization + 'SubMov2Lea', hp1);
+
+                        if (cs_opt_level3 in current_settings.optimizerswitches) then
+                          { hp1 may not be the immediate next instruction under -O3 }
+                          RemoveCurrentp(p)
+                        else
+                          RemoveCurrentp(p, hp1);
+                      end;
+
+                    Result := True;
+                  end;
+              end;
           end;
       end;
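
A detail shared by the AddMov2Mov, AddMov2Lea and SubMov2Lea branches above: under -O3 the matched MOV may sit several instructions after the ADD/SUB, so the single-argument form of RemoveCurrentp is used and the pass lets it find the real successor of p. A condensed sketch (identifiers as in the hunks; not part of the patch):

    if (cs_opt_level3 in current_settings.optimizerswitches) then
      { hp1 may not be the immediate next instruction under -O3 }
      RemoveCurrentp(p)
    else
      { at -O2 and below hp1 is known to directly follow p }
      RemoveCurrentp(p, hp1);
    Result := True;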