Browse Source

* x86: The LEA/LEA optimisations now also apply when the second LEA
writes to a different destination register, even if the intermediate
register is still in use afterwards (the first LEA is then kept).

J. Gareth "Curious Kit" Moreton 2 years ago
parent
commit
77f53ebde3
1 changed file with 85 additions and 43 deletions
  1. 85 43
      compiler/x86/aoptx86.pas

+ 85 - 43
compiler/x86/aoptx86.pas

@@ -5643,13 +5643,13 @@ unit aoptx86;
 
     function TX86AsmOptimizer.OptPass1LEA(var p : tai) : boolean;
       var
-        hp1: tai;
+        hp1, hp2: tai;
         ref: Integer;
         saveref: treference;
         offsetcalc: Int64;
         TempReg: TRegister;
         Multiple: TCGInt;
-        Adjacent: Boolean;
+        Adjacent, IntermediateRegDiscarded: Boolean;
       begin
         Result:=false;
 
@@ -5804,7 +5804,6 @@ unit aoptx86;
               begin
                 { Check common LEA/LEA conditions }
                 if MatchInstruction(hp1,A_LEA,[taicpu(p).opsize]) and
-                  (taicpu(p).oper[1]^.reg = taicpu(hp1).oper[1]^.reg) and
                   (taicpu(p).oper[0]^.ref^.relsymbol = nil) and
                   (taicpu(p).oper[0]^.ref^.segment = NR_NO) and
                   (taicpu(p).oper[0]^.ref^.symbol = nil) and
@@ -5827,6 +5826,16 @@ unit aoptx86;
                     )
                   ) then
                   begin
+                    TransferUsedRegs(TmpUsedRegs);
+                    hp2 := p;
+                    repeat
+                      UpdateUsedRegs(TmpUsedRegs, tai(hp2.Next));
+                    until not GetNextInstruction(hp2, hp2) or (hp2 = hp1);
+
+                    IntermediateRegDiscarded :=
+                      (taicpu(p).oper[1]^.reg = taicpu(hp1).oper[1]^.reg) or
+                      not RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp1, TmpUsedRegs);
+
                     { changes
                         lea offset1(regX,scale), reg1
                         lea offset2(reg1,reg1), reg2
@@ -5850,6 +5859,11 @@ unit aoptx86;
                       (Similarly, allow the first instruction to be "lea (regX,regX),reg1")
                       }
                     if (taicpu(p).oper[0]^.ref^.base<>NR_STACK_POINTER_REG) and { lea (%rsp,scale),reg is not a valid encoding }
+                      (
+                        { Don't optimise if size is a concern and the intermediate register remains in use }
+                        IntermediateRegDiscarded or
+                        not (cs_opt_size in current_settings.optimizerswitches)
+                      ) and
                       (taicpu(hp1).oper[0]^.ref^.index = taicpu(p).oper[1]^.reg) and
                       (
                         (taicpu(p).oper[0]^.ref^.base <> taicpu(p).oper[0]^.ref^.index) or
@@ -5897,8 +5911,6 @@ unit aoptx86;
 
                         if (offsetcalc <= $7FFFFFFF) and (offsetcalc >= -2147483648) then
                           begin
-                            DebugMsg(SPeepholeOptimization + 'LeaLea2Lea 2 done',p);
-
                             if (taicpu(hp1).oper[0]^.ref^.base = taicpu(p).oper[1]^.reg) and
                               (taicpu(hp1).oper[0]^.ref^.index <> taicpu(p).oper[1]^.reg) then
                               begin
@@ -5925,7 +5937,16 @@ unit aoptx86;
                             if (taicpu(p).oper[0]^.ref^.offset <> 0) then
                               Inc(taicpu(hp1).oper[0]^.ref^.offset, taicpu(p).oper[0]^.ref^.offset * max(taicpu(p).oper[0]^.ref^.scalefactor, 1));
                             taicpu(hp1).oper[0]^.ref^.index := taicpu(p).oper[0]^.ref^.index;
-                            RemoveCurrentP(p);
+
+                            { Only remove the first LEA if we don't need the intermediate register's value as is }
+                            if IntermediateRegDiscarded then
+                              begin
+                                DebugMsg(SPeepholeOptimization + 'LeaLea2Lea 2 done',p);
+                                RemoveCurrentP(p);
+                              end
+                            else
+                              DebugMsg(SPeepholeOptimization + 'LeaLea2LeaLea 2 done (intermediate register still in use)',p);
+
                             result:=true;
                             exit;
                           end;
@@ -5933,29 +5954,35 @@ unit aoptx86;
 
                     { changes
                         lea offset1(regX), reg1
-                        lea offset2(reg1), reg1
+                        lea offset2(reg1), reg2
                         to
-                        lea offset1+offset2(regX), reg1 }
-                    if
+                        lea offset1+offset2(regX), reg2 }
+                    if (
+                        { Don't optimise if size is a concern and the intermediate register remains in use }
+                        IntermediateRegDiscarded or
+                        not (cs_opt_size in current_settings.optimizerswitches)
+                      ) and
                       (
-                        (taicpu(hp1).oper[0]^.ref^.index = taicpu(p).oper[1]^.reg) and
-                        (taicpu(p).oper[0]^.ref^.index = NR_NO)
-                      ) or (
-                        (taicpu(hp1).oper[0]^.ref^.base = taicpu(p).oper[1]^.reg) and
-                        (taicpu(hp1).oper[0]^.ref^.scalefactor <= 1) and
                         (
+                          (taicpu(hp1).oper[0]^.ref^.index = taicpu(p).oper[1]^.reg) and
+                          (taicpu(p).oper[0]^.ref^.index = NR_NO)
+                        ) or (
+                          (taicpu(hp1).oper[0]^.ref^.base = taicpu(p).oper[1]^.reg) and
+                          (taicpu(hp1).oper[0]^.ref^.scalefactor <= 1) and
                           (
-                            (taicpu(p).oper[0]^.ref^.index = NR_NO) or
-                            (taicpu(p).oper[0]^.ref^.base = NR_NO)
-                          ) or (
-                            (taicpu(p).oper[0]^.ref^.scalefactor <= 1) and
                             (
                               (taicpu(p).oper[0]^.ref^.index = NR_NO) or
+                              (taicpu(p).oper[0]^.ref^.base = NR_NO)
+                            ) or (
+                              (taicpu(p).oper[0]^.ref^.scalefactor <= 1) and
                               (
-                                (taicpu(p).oper[0]^.ref^.index = taicpu(p).oper[0]^.ref^.base) and
+                                (taicpu(p).oper[0]^.ref^.index = NR_NO) or
                                 (
-                                  (taicpu(hp1).oper[0]^.ref^.index = NR_NO) or
-                                  (taicpu(hp1).oper[0]^.ref^.base = NR_NO)
+                                  (taicpu(p).oper[0]^.ref^.index = taicpu(p).oper[0]^.ref^.base) and
+                                  (
+                                    (taicpu(hp1).oper[0]^.ref^.index = NR_NO) or
+                                    (taicpu(hp1).oper[0]^.ref^.base = NR_NO)
+                                  )
                                 )
                               )
                             )
@@ -5963,34 +5990,49 @@ unit aoptx86;
                         )
                       ) then
                       begin
-                        DebugMsg(SPeepholeOptimization + 'LeaLea2Lea 1 done',p);
+                        { Make sure the offset doesn't go out of range (use 64-bit arithmetic) }
+                        offsetcalc := taicpu(hp1).oper[0]^.ref^.offset;
+                        Inc(offsetcalc, Int64(taicpu(p).oper[0]^.ref^.offset) * max(taicpu(hp1).oper[0]^.ref^.scalefactor, 1));
 
-                        if taicpu(hp1).oper[0]^.ref^.index=taicpu(p).oper[1]^.reg then
+                        if (offsetcalc <= $7FFFFFFF) and (offsetcalc >= -2147483648) then
                           begin
-                            taicpu(hp1).oper[0]^.ref^.index:=taicpu(p).oper[0]^.ref^.base;
-                            inc(taicpu(hp1).oper[0]^.ref^.offset,taicpu(p).oper[0]^.ref^.offset*max(taicpu(hp1).oper[0]^.ref^.scalefactor,1));
-                            { if the register is used as index and base, we have to increase for base as well
-                              and adapt base }
-                            if taicpu(hp1).oper[0]^.ref^.base=taicpu(p).oper[1]^.reg then
+                            if taicpu(hp1).oper[0]^.ref^.index=taicpu(p).oper[1]^.reg then
+                              begin
+                                taicpu(hp1).oper[0]^.ref^.index:=taicpu(p).oper[0]^.ref^.base;
+                                inc(taicpu(hp1).oper[0]^.ref^.offset,taicpu(p).oper[0]^.ref^.offset*max(taicpu(hp1).oper[0]^.ref^.scalefactor,1));
+                                { if the register is used as index and base, we have to increase for base as well
+                                  and adapt base }
+                                if taicpu(hp1).oper[0]^.ref^.base=taicpu(p).oper[1]^.reg then
+                                  begin
+                                    taicpu(hp1).oper[0]^.ref^.base:=taicpu(p).oper[0]^.ref^.base;
+                                    inc(taicpu(hp1).oper[0]^.ref^.offset,taicpu(p).oper[0]^.ref^.offset);
+                                  end;
+                              end
+                            else
                               begin
-                                taicpu(hp1).oper[0]^.ref^.base:=taicpu(p).oper[0]^.ref^.base;
                                 inc(taicpu(hp1).oper[0]^.ref^.offset,taicpu(p).oper[0]^.ref^.offset);
+                                taicpu(hp1).oper[0]^.ref^.base:=taicpu(p).oper[0]^.ref^.base;
                               end;
-                          end
-                        else
-                          begin
-                            inc(taicpu(hp1).oper[0]^.ref^.offset,taicpu(p).oper[0]^.ref^.offset);
-                            taicpu(hp1).oper[0]^.ref^.base:=taicpu(p).oper[0]^.ref^.base;
-                          end;
-                        if taicpu(p).oper[0]^.ref^.index<>NR_NO then
-                          begin
-                            taicpu(hp1).oper[0]^.ref^.base:=taicpu(hp1).oper[0]^.ref^.index;
-                            taicpu(hp1).oper[0]^.ref^.index:=taicpu(p).oper[0]^.ref^.index;
-                            taicpu(hp1).oper[0]^.ref^.scalefactor:=taicpu(p).oper[0]^.ref^.scalefactor;
+                            if taicpu(p).oper[0]^.ref^.index<>NR_NO then
+                              begin
+                                taicpu(hp1).oper[0]^.ref^.base:=taicpu(hp1).oper[0]^.ref^.index;
+                                taicpu(hp1).oper[0]^.ref^.index:=taicpu(p).oper[0]^.ref^.index;
+                                taicpu(hp1).oper[0]^.ref^.scalefactor:=taicpu(p).oper[0]^.ref^.scalefactor;
+                              end;
+
+                            { Only remove the first LEA if we don't need the intermediate register's value as is }
+                            if IntermediateRegDiscarded then
+                              begin
+                                DebugMsg(SPeepholeOptimization + 'LeaLea2Lea 1 done',p);
+                                RemoveCurrentP(p);
+                              end
+                            else
+                              DebugMsg(SPeepholeOptimization + 'LeaLea2LeaLea 1 done (intermediate register still in use)',p);
+
+
+                            result:=true;
+                            exit;
                           end;
-                        RemoveCurrentP(p);
-                        result:=true;
-                        exit;
                       end;
                   end;