Browse Source

x86: Deeper insight in OptPass2ADD and OptPass2SUB to produce more efficient code

J. Gareth "Kit" Moreton 2 years ago
parent
commit
5a6d9ff532
1 changed files with 301 additions and 91 deletions
  1. 301 91
      compiler/x86/aoptx86.pas

+ 301 - 91
compiler/x86/aoptx86.pas

@@ -55,6 +55,12 @@ unit aoptx86;
         function RegInInstruction(Reg: TRegister; p1: tai): Boolean;override;
         function GetNextInstructionUsingReg(Current: tai; out Next: tai; reg: TRegister): Boolean;
 
+        { Identical to GetNextInstructionUsingReg, but returns a value indicating
+          how many instructions away Next is from Current.
+
+          0 = failure, equivalent to False in GetNextInstructionUsingReg }
+        function GetNextInstructionUsingRegCount(Current: tai; out Next: tai; reg: TRegister): Cardinal;
+
         { This version of GetNextInstructionUsingReg will look across conditional jumps,
           potentially allowing further optimisation (although it might need to know if
           it crossed a conditional jump). }
@@ -478,6 +484,27 @@ unit aoptx86;
     end;
 
 
+  function TX86AsmOptimizer.GetNextInstructionUsingRegCount(Current: tai; out Next: tai; reg: TRegister): Cardinal;
+    var
+      ScanAhead: Boolean;
+    begin
+      { Steps forward from Current via GetNextInstruction, counting each
+        successful step.  Returns the number of steps taken to reach Next,
+        or 0 if the list ran out before any stop condition was met. }
+      Result:=0;
+      Next:=Current;
+      { Without -O3 the search always stops after the first step }
+      ScanAhead:=cs_opt_level3 in current_settings.optimizerswitches;
+      while GetNextInstruction(Next,Next) do
+        begin
+          Inc(Result);
+          { Stop on anything that is not an instruction, on an instruction
+            for which RegInInstruction reports reg, or on a call/jump.  The
+            order matters: Next is only cast to taicpu once it is known to
+            be an instruction. }
+          if not ScanAhead or
+            (Next.typ<>ait_instruction) or
+            RegInInstruction(reg,Next) or
+            is_calljmp(taicpu(Next).opcode) then
+            Exit;
+        end;
+      { Must return zero upon hitting the end of the linked list without a match }
+      Result:=0;
+    end;
+
+
   function TX86AsmOptimizer.GetNextInstructionUsingRegCond(Current: tai; out Next: tai; reg: TRegister; var JumpTracking: TLinkedList; var CrossJump: Boolean): Boolean;
 
     procedure TrackJump(Symbol: TAsmSymbol);
@@ -12631,7 +12658,10 @@ unit aoptx86;
 
     function TX86AsmOptimizer.OptPass2ADD(var p : tai) : boolean;
       var
-        hp1: tai; NewRef: TReference;
+        hp1, hp2: tai;
+        NewRef: TReference;
+        Distance: Cardinal;
+        TempTracking: TAllUsedRegs;
 
         { This entire nested function is used in an if-statement below, but we
           want to avoid all the used reg transfers and GetNextInstruction calls
@@ -12644,18 +12674,82 @@ unit aoptx86;
             hp2 := p;
             repeat
               UpdateUsedRegs(TmpUsedRegs, tai(hp2.Next));
-            until not GetNextInstruction(hp2, hp2) or (hp2 = hp1);
+            until not (cs_opt_level3 in current_settings.optimizerswitches) or not GetNextInstruction(hp2, hp2) or (hp2 = hp1);
 
             Result := not RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp1, TmpUsedRegs);
           end;
 
       begin
         Result := False;
-        if not GetNextInstruction(p, hp1) or (hp1.typ <> ait_instruction) then
-          Exit;
 
-        if (taicpu(p).opsize in [S_L{$ifdef x86_64}, S_Q{$endif}]) then
+        if (taicpu(p).opsize in [S_L{$ifdef x86_64}, S_Q{$endif}]) and
+          (taicpu(p).oper[1]^.typ = top_reg) then
           begin
+            Distance := GetNextInstructionUsingRegCount(p, hp1, taicpu(p).oper[1]^.reg);
+            if (Distance = 0) or (Distance > 3) { Likely too far to make a meaningful difference } or
+              (hp1.typ <> ait_instruction) or
+              not
+              (
+                (cs_opt_level3 in current_settings.optimizerswitches) or
+                { GetNextInstructionUsingRegCount just returns the next valid instruction under -O2 and under }
+                RegInInstruction(taicpu(p).oper[1]^.reg, hp1)
+              ) then
+              Exit;
+
+            { Some of the MOV optimisations are much more in-depth.  For example, if we have:
+                addq $x,   %rax
+                movq %rax, %rdx
+                sarq $63,  %rdx
+                (%rax still in use)
+
+              ...letting OptPass2ADD run its course (and without -Os) will produce:
+                leaq $x(%rax),%rdx
+                addq $x,   %rax
+                sarq $63,  %rdx
+
+              ...which is okay since it breaks the dependency chain between
+                addq and movq, but if OptPass2MOV is called first:
+
+                addq $x,   %rax
+                cqto
+
+              ...which is better in all ways, taking only 2 cycles to execute
+                and much smaller in code size.
+            }
+
+            { The extra register tracking is quite strenuous }
+            if (cs_opt_level2 in current_settings.optimizerswitches) and
+              MatchInstruction(hp1, A_MOV, []) then
+              begin
+                { Update the register tracking to the MOV instruction }
+                CopyUsedRegs(TempTracking);
+                hp2 := p;
+                repeat
+                  UpdateUsedRegs(tai(hp2.Next));
+                until not (cs_opt_level3 in current_settings.optimizerswitches) or not GetNextInstruction(hp2, hp2) or (hp2 = hp1);
+
+                { if hp1 <> hp2 after the call, then hp1 got removed, so let
+                  OptPass2ADD get called again }
+                if OptPass2MOV(hp1) and (hp1 <> hp2) then
+                  begin
+                    { Reset the tracking to the current instruction }
+                    RestoreUsedRegs(TempTracking);
+                    ReleaseUsedRegs(TempTracking);
+
+                    Result := True;
+                    Exit;
+                  end;
+
+                { Reset the tracking to the current instruction }
+                RestoreUsedRegs(TempTracking);
+                ReleaseUsedRegs(TempTracking);
+
+                { If OptPass2MOV returned True, we don't need to set Result to
+                  True if hp1 didn't change because the ADD instruction didn't
+                  get modified and we'll be evaluating hp1 again when the
+                  peephole optimizer reaches it }
+              end;
+
             { Change:
                 add     %reg2,%reg1
                 mov/s/z #(%reg1),%reg1  (%reg1 superregisters must be the same)
@@ -12663,7 +12757,7 @@ unit aoptx86;
               To:
                 mov/s/z #(%reg1,%reg2),%reg1
             }
-            if MatchOpType(taicpu(p), top_reg, top_reg) and
+            if (taicpu(p).oper[0]^.typ = top_reg) and
               MatchInstruction(hp1, [A_MOV, A_MOVZX, A_MOVSX{$ifdef x86_64}, A_MOVSXD{$endif}], []) and
               MatchOpType(taicpu(hp1), top_ref, top_reg) and
               (taicpu(hp1).oper[0]^.ref^.scalefactor <= 1) and
@@ -12686,11 +12780,18 @@ unit aoptx86;
                 )
               ) then
               begin
+                AllocRegBetween(taicpu(p).oper[0]^.reg, p, hp1, UsedRegs);
                 taicpu(hp1).oper[0]^.ref^.base := taicpu(p).oper[1]^.reg;
                 taicpu(hp1).oper[0]^.ref^.index := taicpu(p).oper[0]^.reg;
 
                 DebugMsg(SPeepholeOptimization + 'AddMov2Mov done', p);
-                RemoveCurrentp(p, hp1);
+
+                if (cs_opt_level3 in current_settings.optimizerswitches) then
+                  { hp1 may not be the immediate next instruction under -O3 }
+                  RemoveCurrentp(p)
+                else
+                  RemoveCurrentp(p, hp1);
+
                 Result := True;
                 Exit;
               end;
@@ -12704,51 +12805,69 @@ unit aoptx86;
 
               Breaks the dependency chain.
             }
-            if MatchOpType(taicpu(p),top_const,top_reg) and
+            if (taicpu(p).oper[0]^.typ = top_const) and
               MatchInstruction(hp1, A_MOV, [taicpu(p).opsize]) and
               (taicpu(hp1).oper[1]^.typ = top_reg) and
               MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[1]^.reg) and
               (
-                { Don't do AddMov2LeaAdd under -Os, but do allow AddMov2Lea }
-                not (cs_opt_size in current_settings.optimizerswitches) or
-                (
-                  not RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp1, TmpUsedRegs) and
-                  RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp1, TmpUsedRegs)
-                )
+                { Instructions are guaranteed to be adjacent on -O2 and under }
+                not (cs_opt_level3 in current_settings.optimizerswitches) or
+                not RegUsedBetween(taicpu(hp1).oper[1]^.reg, p, hp1)
               ) then
               begin
-                { Change the MOV instruction to a LEA instruction, and update the
-                  first operand }
+                TransferUsedRegs(TmpUsedRegs);
+                hp2 := p;
+                repeat
+                  UpdateUsedRegs(TmpUsedRegs, tai(hp2.Next));
+                until not (cs_opt_level3 in current_settings.optimizerswitches) or not GetNextInstruction(hp2, hp2) or (hp2 = hp1);
 
-                reference_reset(NewRef, 1, []);
-                NewRef.base := taicpu(p).oper[1]^.reg;
-                NewRef.scalefactor := 1;
-                NewRef.offset := asizeint(taicpu(p).oper[0]^.val);
+                if (
+                    { Don't do AddMov2LeaAdd under -Os, but do allow AddMov2Lea }
+                    not (cs_opt_size in current_settings.optimizerswitches) or
+                    (
+                      not RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp1, TmpUsedRegs) and
+                      not RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp1, TmpUsedRegs)
+                    )
+                  ) then
+                  begin
+                    { Change the MOV instruction to a LEA instruction, and update the
+                      first operand }
 
-                taicpu(hp1).opcode := A_LEA;
-                taicpu(hp1).loadref(0, NewRef);
+                    reference_reset(NewRef, 1, []);
+                    NewRef.base := taicpu(p).oper[1]^.reg;
+                    NewRef.scalefactor := 1;
+                    NewRef.offset := asizeint(taicpu(p).oper[0]^.val);
 
-                TransferUsedRegs(TmpUsedRegs);
-                UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
-                if RegUsedAfterInstruction(NewRef.base, hp1, TmpUsedRegs) or
-                  RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp1, TmpUsedRegs) then
-                  begin
-                    { Move what is now the LEA instruction to before the SUB instruction }
-                    Asml.Remove(hp1);
-                    Asml.InsertBefore(hp1, p);
-                    AllocRegBetween(taicpu(hp1).oper[1]^.reg, hp1, p, UsedRegs);
+                    taicpu(hp1).opcode := A_LEA;
+                    taicpu(hp1).loadref(0, NewRef);
 
-                    DebugMsg(SPeepholeOptimization + 'AddMov2LeaAdd', p);
-                    p := hp1;
-                  end
-                else
-                  begin
-                    { Since %reg1 or the flags aren't used afterwards, we can delete p completely }
-                    RemoveCurrentP(p, hp1);
-                    DebugMsg(SPeepholeOptimization + 'AddMov2Lea', p);
-                  end;
+                    if RegUsedAfterInstruction(NewRef.base, hp1, TmpUsedRegs) or
+                      RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp1, TmpUsedRegs) then
+                      begin
+                        hp2 := tai(hp1.Next); { for the benefit of AllocRegBetween }
 
-                Result := True;
+                        { Move what is now the LEA instruction to before the ADD instruction }
+                        Asml.Remove(hp1);
+                        Asml.InsertBefore(hp1, p);
+                        AllocRegBetween(taicpu(hp1).oper[1]^.reg, hp1, hp2, UsedRegs);
+
+                        DebugMsg(SPeepholeOptimization + 'AddMov2LeaAdd', p);
+                        p := hp1;
+                      end
+                    else
+                      begin
+                        { Since %reg1 or the flags aren't used afterwards, we can delete p completely }
+                        DebugMsg(SPeepholeOptimization + 'AddMov2Lea', hp1);
+
+                        if (cs_opt_level3 in current_settings.optimizerswitches) then
+                          { hp1 may not be the immediate next instruction under -O3 }
+                          RemoveCurrentp(p)
+                        else
+                          RemoveCurrentp(p, hp1);
+                      end;
+
+                    Result := True;
+                  end;
               end;
           end;
       end;
@@ -12800,65 +12919,156 @@ unit aoptx86;
 
     function TX86AsmOptimizer.OptPass2SUB(var p: tai): Boolean;
       var
-        hp1: tai; NewRef: TReference;
-      begin
-        { Change:
-            subl/q $x,%reg1
-            movl/q %reg1,%reg2
-          To:
-            leal/q $-x(%reg1),%reg2
-            subl/q $x,%reg1 (can be removed if %reg1 or the flags are not used afterwards)
+        hp1, hp2: tai;
+        NewRef: TReference;
+        Distance: Cardinal;
+        TempTracking: TAllUsedRegs;
 
-          Breaks the dependency chain and potentially permits the removal of
-          a CMP instruction if one follows.
-        }
+      begin
+        { Pass 2 peephole optimisation for a 32-bit (or, on x86_64, 64-bit)
+          "sub $const,%reg" at p.  Looks up to three instructions ahead for
+          the next instruction found by GetNextInstructionUsingRegCount,
+          gives OptPass2MOV first refusal if that instruction is a MOV, and
+          otherwise tries to turn "sub; mov" into "lea [; sub]".
+
+          Returns True if the instruction list was changed; p may be updated
+          to point at a different instruction. }
         Result := False;
-        if (taicpu(p).opsize in [S_L{$ifdef x86_64}, S_Q{$endif x86_64}]) and
-          MatchOpType(taicpu(p),top_const,top_reg) and
-          GetNextInstruction(p, hp1) and
-          MatchInstruction(hp1, A_MOV, [taicpu(p).opsize]) and
-          (taicpu(hp1).oper[1]^.typ = top_reg) and
-          MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[1]^.reg) and
-          (
-            { Don't do SubMov2LeaSub under -Os, but do allow SubMov2Lea }
-            not (cs_opt_size in current_settings.optimizerswitches) or
-            (
-              not RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp1, TmpUsedRegs) and
-              RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp1, TmpUsedRegs)
-            )
-          ) then
+
+        if (taicpu(p).opsize in [S_L{$ifdef x86_64}, S_Q{$endif}]) and
+          MatchOpType(taicpu(p),top_const,top_reg) then
           begin
-            { Change the MOV instruction to a LEA instruction, and update the
-              first operand }
-            reference_reset(NewRef, 1, []);
-            NewRef.base := taicpu(p).oper[1]^.reg;
-            NewRef.scalefactor := 1;
-            NewRef.offset := -taicpu(p).oper[0]^.val;
+            Distance := GetNextInstructionUsingRegCount(p, hp1, taicpu(p).oper[1]^.reg);
+            if (Distance = 0) or (Distance > 3) { Likely too far to make a meaningful difference } or
+              (hp1.typ <> ait_instruction) or
+              not
+              (
+                (cs_opt_level3 in current_settings.optimizerswitches) or
+                { GetNextInstructionUsingRegCount just returns the next valid instruction under -O2 and under }
+                RegInInstruction(taicpu(p).oper[1]^.reg, hp1)
+              ) then
+              Exit;
 
-            taicpu(hp1).opcode := A_LEA;
-            taicpu(hp1).loadref(0, NewRef);
+            { Some of the MOV optimisations are much more in-depth.  For example, if we have:
+                subq $x,   %rax
+                movq %rax, %rdx
+                sarq $63,  %rdx
+                (%rax still in use)
 
-            TransferUsedRegs(TmpUsedRegs);
-            UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
-            if RegUsedAfterInstruction(NewRef.base, hp1, TmpUsedRegs) or
-              RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp1, TmpUsedRegs) then
-              begin
-                { Move what is now the LEA instruction to before the SUB instruction }
-                Asml.Remove(hp1);
-                Asml.InsertBefore(hp1, p);
-                AllocRegBetween(taicpu(hp1).oper[1]^.reg, hp1, p, UsedRegs);
+              ...letting OptPass2SUB run its course (and without -Os) will produce:
+                leaq $-x(%rax),%rdx
+                subq $x,   %rax
+                sarq $63,  %rdx
 
-                DebugMsg(SPeepholeOptimization + 'SubMov2LeaSub', p);
-                p := hp1;
-              end
-            else
+              ...which is okay since it breaks the dependency chain between
+                subq and movq, but if OptPass2MOV is called first:
+
+                subq $x,   %rax
+                cqto
+
+              ...which is better in all ways, taking only 2 cycles to execute
+                and much smaller in code size.
+            }
+
+            { The extra register tracking is quite strenuous }
+            if (cs_opt_level2 in current_settings.optimizerswitches) and
+              MatchInstruction(hp1, A_MOV, []) then
               begin
-                { Since %reg1 or the flags aren't used afterwards, we can delete p completely }
-                RemoveCurrentP(p, hp1);
-                DebugMsg(SPeepholeOptimization + 'SubMov2Lea', p);
+                { Update the register tracking to the MOV instruction }
+                CopyUsedRegs(TempTracking);
+                hp2 := p;
+                repeat
+                  UpdateUsedRegs(tai(hp2.Next));
+                until not (cs_opt_level3 in current_settings.optimizerswitches) or not GetNextInstruction(hp2, hp2) or (hp2 = hp1);
+
+                { if hp1 <> hp2 after the call, then hp1 got removed, so let
+                  OptPass2SUB get called again }
+                if OptPass2MOV(hp1) and (hp1 <> hp2) then
+                  begin
+                    { Reset the tracking to the current instruction }
+                    RestoreUsedRegs(TempTracking);
+                    ReleaseUsedRegs(TempTracking);
+
+                    Result := True;
+                    Exit;
+                  end;
+
+                { Reset the tracking to the current instruction }
+                RestoreUsedRegs(TempTracking);
+                ReleaseUsedRegs(TempTracking);
+
+                { If OptPass2MOV returned True, we don't need to set Result to
+                  True if hp1 didn't change because the SUB instruction didn't
+                  get modified and we'll be evaluating hp1 again when the
+                  peephole optimizer reaches it }
               end;
 
-            Result := True;
+            { Change:
+                subl/q $x,%reg1
+                movl/q %reg1,%reg2
+              To:
+                leal/q $-x(%reg1),%reg2
+                subl/q $x,%reg1 (can be removed if %reg1 or the flags are not used afterwards)
+
+              Breaks the dependency chain and potentially permits the removal of
+              a CMP instruction if one follows.
+            }
+            if MatchInstruction(hp1, A_MOV, [taicpu(p).opsize]) and
+              (taicpu(hp1).oper[1]^.typ = top_reg) and
+              MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[1]^.reg) and
+              (
+                { Instructions are guaranteed to be adjacent on -O2 and under }
+                not (cs_opt_level3 in current_settings.optimizerswitches) or
+                not RegUsedBetween(taicpu(hp1).oper[1]^.reg, p, hp1)
+              ) then
+              begin
+                { Advance a scratch copy of the register tracking from p up to
+                  hp1; every RegUsedAfterInstruction query below relies on it }
+                TransferUsedRegs(TmpUsedRegs);
+                hp2 := p;
+                repeat
+                  UpdateUsedRegs(TmpUsedRegs, tai(hp2.Next));
+                until not (cs_opt_level3 in current_settings.optimizerswitches) or not GetNextInstruction(hp2, hp2) or (hp2 = hp1);
+
+                if (
+                    { Don't do SubMov2LeaSub under -Os, but do allow SubMov2Lea }
+                    not (cs_opt_size in current_settings.optimizerswitches) or
+                    (
+                      not RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp1, TmpUsedRegs) and
+                      not RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp1, TmpUsedRegs)
+                    )
+                  ) then
+                  begin
+                    { Change the MOV instruction to a LEA instruction, and update the
+                      first operand }
+                    reference_reset(NewRef, 1, []);
+                    NewRef.base := taicpu(p).oper[1]^.reg;
+                    NewRef.scalefactor := 1;
+                    NewRef.offset := -taicpu(p).oper[0]^.val;
+
+                    taicpu(hp1).opcode := A_LEA;
+                    taicpu(hp1).loadref(0, NewRef);
+
+                    { TmpUsedRegs has already been tracked up to hp1 by the loop
+                      above.  Do not reset it to just after p here: under -O3
+                      hp1 may be several instructions further on, and rewinding
+                      the tracking would make the checks below unreliable
+                      (OptPass2ADD handles this the same way). }
+                    if RegUsedAfterInstruction(NewRef.base, hp1, TmpUsedRegs) or
+                      RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp1, TmpUsedRegs) then
+                      begin
+                        hp2 := tai(hp1.Next); { for the benefit of AllocRegBetween }
+
+                        { Move what is now the LEA instruction to before the SUB instruction }
+                        Asml.Remove(hp1);
+                        Asml.InsertBefore(hp1, p);
+                        AllocRegBetween(taicpu(hp1).oper[1]^.reg, hp1, hp2, UsedRegs);
+
+                        DebugMsg(SPeepholeOptimization + 'SubMov2LeaSub', p);
+                        p := hp1;
+                      end
+                    else
+                      begin
+                        { Since neither %reg1 nor the flags are used afterwards, we can delete p completely }
+                        DebugMsg(SPeepholeOptimization + 'SubMov2Lea', hp1);
+
+                        if (cs_opt_level3 in current_settings.optimizerswitches) then
+                          { hp1 may not be the immediate next instruction under -O3 }
+                          RemoveCurrentp(p)
+                        else
+                          RemoveCurrentp(p, hp1);
+                      end;
+
+                    Result := True;
+                  end;
+              end;
           end;
       end;