Browse Source

* Some streamlining in x86's OptPass1LEA routine now that the stack
pointer is properly tracked, which also permits some deeper
optimisations on -O1 and -O2 virtually for free.

J. Gareth "Curious Kit" Moreton 3 years ago
parent
commit
305e49c0b8
1 changed files with 284 additions and 286 deletions
  1. 284 286
      compiler/x86/aoptx86.pas

+ 284 - 286
compiler/x86/aoptx86.pas

@@ -4911,7 +4911,6 @@ unit aoptx86;
         hp1: tai;
         hp1: tai;
         ref: Integer;
         ref: Integer;
         saveref: treference;
         saveref: treference;
-        TempReg: TRegister;
         Multiple: TCGInt;
         Multiple: TCGInt;
       begin
       begin
         Result:=false;
         Result:=false;
@@ -4970,13 +4969,15 @@ unit aoptx86;
               end;
               end;
           end;
           end;
 
 
+        { Don't optimise if the stack or frame pointer is the destination register }
+        if (taicpu(p).oper[1]^.reg=NR_STACK_POINTER_REG) or (taicpu(p).oper[1]^.reg=current_procinfo.framepointer) then
+          Exit;
         if GetNextInstruction(p,hp1) and
         if GetNextInstruction(p,hp1) and
           (hp1.typ=ait_instruction) then
           (hp1.typ=ait_instruction) then
           begin
           begin
             if MatchInstruction(hp1,A_MOV,[taicpu(p).opsize]) and
             if MatchInstruction(hp1,A_MOV,[taicpu(p).opsize]) and
               MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[0]^) and
               MatchOperand(taicpu(p).oper[1]^,taicpu(hp1).oper[0]^) and
-              MatchOpType(Taicpu(hp1),top_reg,top_reg) and
-              (taicpu(p).oper[1]^.reg<>NR_STACK_POINTER_REG) then
+              MatchOpType(Taicpu(hp1),top_reg,top_reg) then
               begin
               begin
                 TransferUsedRegs(TmpUsedRegs);
                 TransferUsedRegs(TmpUsedRegs);
                 UpdateUsedRegs(TmpUsedRegs, tai(p.next));
                 UpdateUsedRegs(TmpUsedRegs, tai(p.next));
@@ -4995,335 +4996,332 @@ unit aoptx86;
                 <op> ...,<ref. with reg1>,...
                 <op> ...,<ref. with reg1>,...
                 to
                 to
                 <op> ...,<ref1>,... }
                 <op> ...,<ref1>,... }
-            if (taicpu(p).oper[1]^.reg<>current_procinfo.framepointer) and
-              (taicpu(p).oper[1]^.reg<>NR_STACK_POINTER_REG) and
-              not(MatchInstruction(hp1,A_LEA,[])) then
-              begin
-                { find a reference which uses reg1 }
-                if (taicpu(hp1).ops>=1) and (taicpu(hp1).oper[0]^.typ=top_ref) and RegInOp(taicpu(p).oper[1]^.reg,taicpu(hp1).oper[0]^) then
-                  ref:=0
-                else if (taicpu(hp1).ops>=2) and (taicpu(hp1).oper[1]^.typ=top_ref) and RegInOp(taicpu(p).oper[1]^.reg,taicpu(hp1).oper[1]^) then
-                  ref:=1
+
+            { find a reference which uses reg1 }
+            if (taicpu(hp1).ops>=1) and (taicpu(hp1).oper[0]^.typ=top_ref) and RegInOp(taicpu(p).oper[1]^.reg,taicpu(hp1).oper[0]^) then
+              ref:=0
+            else if (taicpu(hp1).ops>=2) and (taicpu(hp1).oper[1]^.typ=top_ref) and RegInOp(taicpu(p).oper[1]^.reg,taicpu(hp1).oper[1]^) then
+              ref:=1
+            else
+              ref:=-1;
+            if (ref<>-1) and
+              { reg1 must be either the base or the index }
+              ((taicpu(hp1).oper[ref]^.ref^.base=taicpu(p).oper[1]^.reg) xor (taicpu(hp1).oper[ref]^.ref^.index=taicpu(p).oper[1]^.reg)) then
+              begin
+                { reg1 can be removed from the reference }
+                saveref:=taicpu(hp1).oper[ref]^.ref^;
+                if taicpu(hp1).oper[ref]^.ref^.base=taicpu(p).oper[1]^.reg then
+                  taicpu(hp1).oper[ref]^.ref^.base:=NR_NO
+                else if taicpu(hp1).oper[ref]^.ref^.index=taicpu(p).oper[1]^.reg then
+                  taicpu(hp1).oper[ref]^.ref^.index:=NR_NO
                 else
                 else
-                  ref:=-1;
-                if (ref<>-1) and
-                  { reg1 must be either the base or the index }
-                  ((taicpu(hp1).oper[ref]^.ref^.base=taicpu(p).oper[1]^.reg) xor (taicpu(hp1).oper[ref]^.ref^.index=taicpu(p).oper[1]^.reg)) then
-                  begin
-                    { reg1 can be removed from the reference }
-                    saveref:=taicpu(hp1).oper[ref]^.ref^;
-                    if taicpu(hp1).oper[ref]^.ref^.base=taicpu(p).oper[1]^.reg then
-                      taicpu(hp1).oper[ref]^.ref^.base:=NR_NO
-                    else if taicpu(hp1).oper[ref]^.ref^.index=taicpu(p).oper[1]^.reg then
-                      taicpu(hp1).oper[ref]^.ref^.index:=NR_NO
-                    else
-                      Internalerror(2019111201);
-                    { check if the can insert all data of the lea into the second instruction }
-                    if ((taicpu(hp1).oper[ref]^.ref^.base=taicpu(p).oper[1]^.reg) or (taicpu(hp1).oper[ref]^.ref^.scalefactor <= 1)) and
-                      ((taicpu(p).oper[0]^.ref^.base=NR_NO) or (taicpu(hp1).oper[ref]^.ref^.base=NR_NO)) and
-                      ((taicpu(p).oper[0]^.ref^.index=NR_NO) or (taicpu(hp1).oper[ref]^.ref^.index=NR_NO)) and
-                      ((taicpu(p).oper[0]^.ref^.symbol=nil) or (taicpu(hp1).oper[ref]^.ref^.symbol=nil)) and
-                      ((taicpu(p).oper[0]^.ref^.relsymbol=nil) or (taicpu(hp1).oper[ref]^.ref^.relsymbol=nil)) and
-                      ((taicpu(p).oper[0]^.ref^.scalefactor <= 1) or (taicpu(hp1).oper[ref]^.ref^.scalefactor <= 1)) and
-                      { Segment register of p.oper[0]^.ref will be NR_NO already }
-                      (taicpu(hp1).oper[ref]^.ref^.segment=NR_NO)
+                  Internalerror(2019111201);
+                { check if we can insert all data of the lea into the second instruction }
+                if ((taicpu(hp1).oper[ref]^.ref^.base=taicpu(p).oper[1]^.reg) or (taicpu(hp1).oper[ref]^.ref^.scalefactor <= 1)) and
+                  ((taicpu(p).oper[0]^.ref^.base=NR_NO) or (taicpu(hp1).oper[ref]^.ref^.base=NR_NO)) and
+                  ((taicpu(p).oper[0]^.ref^.index=NR_NO) or (taicpu(hp1).oper[ref]^.ref^.index=NR_NO)) and
+                  ((taicpu(p).oper[0]^.ref^.symbol=nil) or (taicpu(hp1).oper[ref]^.ref^.symbol=nil)) and
+                  ((taicpu(p).oper[0]^.ref^.relsymbol=nil) or (taicpu(hp1).oper[ref]^.ref^.relsymbol=nil)) and
+                  ((taicpu(p).oper[0]^.ref^.scalefactor <= 1) or (taicpu(hp1).oper[ref]^.ref^.scalefactor <= 1)) and
+                  (taicpu(p).oper[0]^.ref^.segment=NR_NO) and (taicpu(hp1).oper[ref]^.ref^.segment=NR_NO)
 {$ifdef x86_64}
 {$ifdef x86_64}
-                      and (abs(taicpu(hp1).oper[ref]^.ref^.offset+taicpu(p).oper[0]^.ref^.offset)<=$7fffffff)
-                      and (((taicpu(p).oper[0]^.ref^.base<>NR_RIP) and (taicpu(p).oper[0]^.ref^.index<>NR_RIP)) or
-                           ((taicpu(hp1).oper[ref]^.ref^.base=NR_NO) and (taicpu(hp1).oper[ref]^.ref^.index=NR_NO))
-                          )
+                  and (abs(taicpu(hp1).oper[ref]^.ref^.offset+taicpu(p).oper[0]^.ref^.offset)<=$7fffffff)
+                  and (((taicpu(p).oper[0]^.ref^.base<>NR_RIP) and (taicpu(p).oper[0]^.ref^.index<>NR_RIP)) or
+                       ((taicpu(hp1).oper[ref]^.ref^.base=NR_NO) and (taicpu(hp1).oper[ref]^.ref^.index=NR_NO))
+                      )
 {$endif x86_64}
 {$endif x86_64}
-                      then
+                  then
+                  begin
+                    { reg1 might not be used by the second instruction after it is removed from the reference }
+                    if not(RegInInstruction(taicpu(p).oper[1]^.reg,taicpu(hp1))) then
                       begin
                       begin
-                        { reg1 might not used by the second instruction after it is remove from the reference }
-                        if not(RegInInstruction(taicpu(p).oper[1]^.reg,taicpu(hp1))) then
+                        TransferUsedRegs(TmpUsedRegs);
+                        UpdateUsedRegs(TmpUsedRegs, tai(p.next));
+                        { reg1 is not updated so it might not be used afterwards }
+                        if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,TmpUsedRegs)) then
                           begin
                           begin
-                            TransferUsedRegs(TmpUsedRegs);
-                            UpdateUsedRegs(TmpUsedRegs, tai(p.next));
-                            { reg1 is not updated so it might not be used afterwards }
-                            if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,TmpUsedRegs)) then
-                              begin
-                                DebugMsg(SPeepholeOptimization + 'LeaOp2Op done',p);
-                                if taicpu(p).oper[0]^.ref^.base<>NR_NO then
-                                  taicpu(hp1).oper[ref]^.ref^.base:=taicpu(p).oper[0]^.ref^.base;
-                                if taicpu(p).oper[0]^.ref^.index<>NR_NO then
-                                  taicpu(hp1).oper[ref]^.ref^.index:=taicpu(p).oper[0]^.ref^.index;
-                                if taicpu(p).oper[0]^.ref^.symbol<>nil then
-                                  taicpu(hp1).oper[ref]^.ref^.symbol:=taicpu(p).oper[0]^.ref^.symbol;
-                                if taicpu(p).oper[0]^.ref^.relsymbol<>nil then
-                                  taicpu(hp1).oper[ref]^.ref^.relsymbol:=taicpu(p).oper[0]^.ref^.relsymbol;
-                                if taicpu(p).oper[0]^.ref^.scalefactor > 1 then
-                                  taicpu(hp1).oper[ref]^.ref^.scalefactor:=taicpu(p).oper[0]^.ref^.scalefactor;
-                                inc(taicpu(hp1).oper[ref]^.ref^.offset,taicpu(p).oper[0]^.ref^.offset);
-                                RemoveCurrentP(p, hp1);
-                                result:=true;
-                                exit;
-                              end
-                          end;
+                            DebugMsg(SPeepholeOptimization + 'LeaOp2Op done',p);
+                            if taicpu(p).oper[0]^.ref^.base<>NR_NO then
+                              taicpu(hp1).oper[ref]^.ref^.base:=taicpu(p).oper[0]^.ref^.base;
+                            if taicpu(p).oper[0]^.ref^.index<>NR_NO then
+                              taicpu(hp1).oper[ref]^.ref^.index:=taicpu(p).oper[0]^.ref^.index;
+                            if taicpu(p).oper[0]^.ref^.symbol<>nil then
+                              taicpu(hp1).oper[ref]^.ref^.symbol:=taicpu(p).oper[0]^.ref^.symbol;
+                            if taicpu(p).oper[0]^.ref^.relsymbol<>nil then
+                              taicpu(hp1).oper[ref]^.ref^.relsymbol:=taicpu(p).oper[0]^.ref^.relsymbol;
+                            if taicpu(p).oper[0]^.ref^.scalefactor > 1 then
+                              taicpu(hp1).oper[ref]^.ref^.scalefactor:=taicpu(p).oper[0]^.ref^.scalefactor;
+                            inc(taicpu(hp1).oper[ref]^.ref^.offset,taicpu(p).oper[0]^.ref^.offset);
+                            RemoveCurrentP(p, hp1);
+                            result:=true;
+                            exit;
+                          end
                       end;
                       end;
-                    { recover }
-                    taicpu(hp1).oper[ref]^.ref^:=saveref;
                   end;
                   end;
+                { recover }
+                taicpu(hp1).oper[ref]^.ref^:=saveref;
               end;
               end;
 
 
-          end;
-
-        { for now, we do not mess with the stack pointer, thought it might be usefull to remove
-          unneeded lea sequences on the stack pointer, it needs to be tested in detail }
-        if (taicpu(p).oper[1]^.reg <> NR_STACK_POINTER_REG) and
-          GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[1]^.reg) then
-          begin
-            { Check common LEA/LEA conditions }
-            if MatchInstruction(hp1,A_LEA,[taicpu(p).opsize]) and
-              (taicpu(p).oper[1]^.reg = taicpu(hp1).oper[1]^.reg) and
-              (taicpu(p).oper[0]^.ref^.relsymbol = nil) and
-              (taicpu(p).oper[0]^.ref^.symbol = nil) and
-              (taicpu(hp1).oper[0]^.ref^.relsymbol = nil) and
-              { Since we're merging two LEA instructions, the segment registers don't matter }
-              (taicpu(hp1).oper[0]^.ref^.symbol = nil) and
-              (
-                (taicpu(p).oper[0]^.ref^.base = NR_NO) or { Don't call RegModifiedBetween unnecessarily }
-                not(RegModifiedBetween(taicpu(p).oper[0]^.ref^.base,p,hp1))
-              ) and (
-                (taicpu(p).oper[0]^.ref^.index = taicpu(p).oper[0]^.ref^.base) or { Don't call RegModifiedBetween unnecessarily }
-                (taicpu(p).oper[0]^.ref^.index = NR_NO) or
-                not(RegModifiedBetween(taicpu(p).oper[0]^.ref^.index,p,hp1))
-              ) then
-              begin
-                { changes
-                    lea (regX,scale), reg1
-                    lea offset(reg1,reg1), reg1
-                    to
-                    lea offset(regX,scale*2), reg1
-
-                  and
-                    lea (regX,scale1), reg1
-                    lea offset(reg1,scale2), reg1
-                    to
-                    lea offset(regX,scale1*scale2), reg1
-
-                  ... so long as the final scale does not exceed 8
-
-                  (Similarly, allow the first instruction to be "lea (regX,regX),reg1")
-                  }
-                if (taicpu(p).oper[0]^.ref^.offset = 0) and
-                  (taicpu(hp1).oper[0]^.ref^.index = taicpu(p).oper[1]^.reg) and
+            if RegInInstruction(DestinationReg, hp1) or
+              { Check further ahead (up to 2 instructions ahead for -O2) }
+              GetNextInstructionUsingReg(hp1,hp1,taicpu(p).oper[1]^.reg) then
+              begin
+                { Check common LEA/LEA conditions }
+                if MatchInstruction(hp1,A_LEA,[taicpu(p).opsize]) and
+                  (taicpu(p).oper[1]^.reg = taicpu(hp1).oper[1]^.reg) and
+                  (taicpu(p).oper[0]^.ref^.relsymbol = nil) and
+                  (taicpu(p).oper[0]^.ref^.segment = NR_NO) and
+                  (taicpu(p).oper[0]^.ref^.symbol = nil) and
+                  (taicpu(hp1).oper[0]^.ref^.relsymbol = nil) and
+                  (taicpu(hp1).oper[0]^.ref^.segment = NR_NO) and
+                  (taicpu(hp1).oper[0]^.ref^.symbol = nil) and
                   (
                   (
                     (
                     (
-                      (taicpu(p).oper[0]^.ref^.base = NR_NO)
-                    ) or (
-                      (taicpu(p).oper[0]^.ref^.scalefactor <= 1) and
-                      (
-                        (taicpu(p).oper[0]^.ref^.base = taicpu(p).oper[0]^.ref^.index) and
-                        not(RegUsedBetween(taicpu(p).oper[0]^.ref^.index, p, hp1))
-                      )
+                      (taicpu(p).oper[0]^.ref^.base = NR_NO) or { Don't call RegModifiedBetween unnecessarily }
+                      not(RegModifiedBetween(taicpu(p).oper[0]^.ref^.base,p,hp1))
+                    ) and (
+                      (taicpu(p).oper[0]^.ref^.index = taicpu(p).oper[0]^.ref^.base) or { Don't call RegModifiedBetween unnecessarily }
+                      (taicpu(p).oper[0]^.ref^.index = NR_NO) or
+                      not(RegModifiedBetween(taicpu(p).oper[0]^.ref^.index,p,hp1))
                     )
                     )
-                  ) and (
-                    (
-                      { lea (reg1,scale2), reg1 variant }
-                      (taicpu(hp1).oper[0]^.ref^.base = NR_NO) and
+                  ) then
+                  begin
+                    { changes
+                        lea (regX,scale), reg1
+                        lea offset(reg1,reg1), reg1
+                        to
+                        lea offset(regX,scale*2), reg1
+
+                      and
+                        lea (regX,scale1), reg1
+                        lea offset(reg1,scale2), reg1
+                        to
+                        lea offset(regX,scale1*scale2), reg1
+
+                      ... so long as the final scale does not exceed 8
+
+                      (Similarly, allow the first instruction to be "lea (regX,regX),reg1")
+                      }
+                    if (taicpu(p).oper[0]^.ref^.base<>NR_STACK_POINTER_REG) and { lea (%rsp,scale),reg is not a valid encoding }
+                      (taicpu(p).oper[0]^.ref^.offset = 0) and
+                      (taicpu(hp1).oper[0]^.ref^.index = taicpu(p).oper[1]^.reg) and
                       (
                       (
                         (
                         (
-                          (taicpu(p).oper[0]^.ref^.base = NR_NO) and
-                          (taicpu(hp1).oper[0]^.ref^.scalefactor * taicpu(p).oper[0]^.ref^.scalefactor <= 8)
+                          (taicpu(p).oper[0]^.ref^.base = NR_NO)
+                        ) or (
+                          (taicpu(p).oper[0]^.ref^.scalefactor <= 1) and
+                          (
+                            (taicpu(p).oper[0]^.ref^.base = taicpu(p).oper[0]^.ref^.index) and
+                            not(RegUsedBetween(taicpu(p).oper[0]^.ref^.index, p, hp1))
+                          )
+                        )
+                      ) and (
+                        (
+                          { lea (reg1,scale2), reg1 variant }
+                          (taicpu(hp1).oper[0]^.ref^.base = NR_NO) and
+                          (
+                            (
+                              (taicpu(p).oper[0]^.ref^.base = NR_NO) and
+                              (taicpu(hp1).oper[0]^.ref^.scalefactor * taicpu(p).oper[0]^.ref^.scalefactor <= 8)
+                            ) or (
+                              { lea (regX,regX), reg1 variant }
+                              (taicpu(p).oper[0]^.ref^.base <> NR_NO) and
+                              (taicpu(hp1).oper[0]^.ref^.scalefactor <= 4)
+                            )
+                          )
                         ) or (
                         ) or (
-                          { lea (regX,regX), reg1 variant }
-                          (taicpu(p).oper[0]^.ref^.base <> NR_NO) and
-                          (taicpu(hp1).oper[0]^.ref^.scalefactor <= 4)
+                          { lea (reg1,reg1), reg1 variant }
+                          (taicpu(hp1).oper[0]^.ref^.base = taicpu(p).oper[1]^.reg) and
+                          (taicpu(hp1).oper[0]^.ref^.scalefactor <= 1)
                         )
                         )
-                      )
-                    ) or (
-                      { lea (reg1,reg1), reg1 variant }
-                      (taicpu(hp1).oper[0]^.ref^.base = taicpu(p).oper[1]^.reg) and
-                      (taicpu(hp1).oper[0]^.ref^.scalefactor <= 1)
-                    )
-                  ) then
-                  begin
-                    DebugMsg(SPeepholeOptimization + 'LeaLea2Lea 2 done',p);
-
-                    { Make everything homogeneous to make calculations easier }
-                    if (taicpu(p).oper[0]^.ref^.base <> NR_NO) then
+                      ) then
                       begin
                       begin
-                        if taicpu(p).oper[0]^.ref^.index <> NR_NO then
-                          { Convert lea (regX,regX),reg1 to lea (regX,2),reg1 }
-                          taicpu(p).oper[0]^.ref^.scalefactor := 2
-                        else
-                          taicpu(p).oper[0]^.ref^.index := taicpu(p).oper[0]^.ref^.base;
+                        DebugMsg(SPeepholeOptimization + 'LeaLea2Lea 2 done',p);
 
 
-                        taicpu(p).oper[0]^.ref^.base := NR_NO;
-                      end;
+                        { Make everything homogeneous to make calculations easier }
+                        if (taicpu(p).oper[0]^.ref^.base <> NR_NO) then
+                          begin
+                            if taicpu(p).oper[0]^.ref^.index <> NR_NO then
+                              { Convert lea (regX,regX),reg1 to lea (regX,2),reg1 }
+                              taicpu(p).oper[0]^.ref^.scalefactor := 2
+                            else
+                              taicpu(p).oper[0]^.ref^.index := taicpu(p).oper[0]^.ref^.base;
 
 
-                    if (taicpu(hp1).oper[0]^.ref^.base = NR_NO) then
-                      begin
-                        { Just to prevent miscalculations }
-                        if (taicpu(hp1).oper[0]^.ref^.scalefactor = 0) then
-                          taicpu(hp1).oper[0]^.ref^.scalefactor := taicpu(p).oper[0]^.ref^.scalefactor
+                            taicpu(p).oper[0]^.ref^.base := NR_NO;
+                          end;
+
+                        if (taicpu(hp1).oper[0]^.ref^.base = NR_NO) then
+                          begin
+                            { Just to prevent miscalculations }
+                            if (taicpu(hp1).oper[0]^.ref^.scalefactor = 0) then
+                              taicpu(hp1).oper[0]^.ref^.scalefactor := taicpu(p).oper[0]^.ref^.scalefactor
+                            else
+                              taicpu(hp1).oper[0]^.ref^.scalefactor := taicpu(hp1).oper[0]^.ref^.scalefactor * taicpu(p).oper[0]^.ref^.scalefactor;
+                          end
                         else
                         else
-                          taicpu(hp1).oper[0]^.ref^.scalefactor := taicpu(hp1).oper[0]^.ref^.scalefactor * taicpu(p).oper[0]^.ref^.scalefactor;
-                      end
-                    else
-                      begin
-                        taicpu(hp1).oper[0]^.ref^.base := NR_NO;
-                        taicpu(hp1).oper[0]^.ref^.scalefactor := taicpu(p).oper[0]^.ref^.scalefactor * 2;
-                      end;
+                          begin
+                            taicpu(hp1).oper[0]^.ref^.base := NR_NO;
+                            taicpu(hp1).oper[0]^.ref^.scalefactor := taicpu(p).oper[0]^.ref^.scalefactor * 2;
+                          end;
 
 
-                    taicpu(hp1).oper[0]^.ref^.index := taicpu(p).oper[0]^.ref^.index;
-                    RemoveCurrentP(p);
-                    result:=true;
-                    exit;
-                  end
+                        taicpu(hp1).oper[0]^.ref^.index := taicpu(p).oper[0]^.ref^.index;
+                        RemoveCurrentP(p);
+                        result:=true;
+                        exit;
+                      end
 
 
-                { changes
-                    lea offset1(regX), reg1
-                    lea offset2(reg1), reg1
-                    to
-                    lea offset1+offset2(regX), reg1 }
-                else if
-                  (
-                    (taicpu(hp1).oper[0]^.ref^.index = taicpu(p).oper[1]^.reg) and
-                    (taicpu(p).oper[0]^.ref^.index = NR_NO)
-                  ) or (
-                    (taicpu(hp1).oper[0]^.ref^.base = taicpu(p).oper[1]^.reg) and
-                    (taicpu(hp1).oper[0]^.ref^.scalefactor <= 1) and
-                    (
+                    { changes
+                        lea offset1(regX), reg1
+                        lea offset2(reg1), reg1
+                        to
+                        lea offset1+offset2(regX), reg1 }
+                    else if
                       (
                       (
-                        (taicpu(p).oper[0]^.ref^.index = NR_NO) or
-                        (taicpu(p).oper[0]^.ref^.base = NR_NO)
+                        (taicpu(hp1).oper[0]^.ref^.index = taicpu(p).oper[1]^.reg) and
+                        (taicpu(p).oper[0]^.ref^.index = NR_NO)
                       ) or (
                       ) or (
-                        (taicpu(p).oper[0]^.ref^.scalefactor <= 1) and
+                        (taicpu(hp1).oper[0]^.ref^.base = taicpu(p).oper[1]^.reg) and
+                        (taicpu(hp1).oper[0]^.ref^.scalefactor <= 1) and
                         (
                         (
-                          (taicpu(p).oper[0]^.ref^.index = NR_NO) or
                           (
                           (
-                            (taicpu(p).oper[0]^.ref^.index = taicpu(p).oper[0]^.ref^.base) and
+                            (taicpu(p).oper[0]^.ref^.index = NR_NO) or
+                            (taicpu(p).oper[0]^.ref^.base = NR_NO)
+                          ) or (
+                            (taicpu(p).oper[0]^.ref^.scalefactor <= 1) and
                             (
                             (
-                              (taicpu(hp1).oper[0]^.ref^.index = NR_NO) or
-                              (taicpu(hp1).oper[0]^.ref^.base = NR_NO)
+                              (taicpu(p).oper[0]^.ref^.index = NR_NO) or
+                              (
+                                (taicpu(p).oper[0]^.ref^.index = taicpu(p).oper[0]^.ref^.base) and
+                                (
+                                  (taicpu(hp1).oper[0]^.ref^.index = NR_NO) or
+                                  (taicpu(hp1).oper[0]^.ref^.base = NR_NO)
+                                )
+                              )
                             )
                             )
                           )
                           )
                         )
                         )
-                      )
-                    )
-                  ) then
-                  begin
-                    DebugMsg(SPeepholeOptimization + 'LeaLea2Lea 1 done',p);
-
-                    if taicpu(hp1).oper[0]^.ref^.index=taicpu(p).oper[1]^.reg then
+                      ) then
                       begin
                       begin
-                        taicpu(hp1).oper[0]^.ref^.index:=taicpu(p).oper[0]^.ref^.base;
-                        inc(taicpu(hp1).oper[0]^.ref^.offset,taicpu(p).oper[0]^.ref^.offset*max(taicpu(hp1).oper[0]^.ref^.scalefactor,1));
-                        { if the register is used as index and base, we have to increase for base as well
-                          and adapt base }
-                        if taicpu(hp1).oper[0]^.ref^.base=taicpu(p).oper[1]^.reg then
+                        DebugMsg(SPeepholeOptimization + 'LeaLea2Lea 1 done',p);
+
+                        if taicpu(hp1).oper[0]^.ref^.index=taicpu(p).oper[1]^.reg then
+                          begin
+                            taicpu(hp1).oper[0]^.ref^.index:=taicpu(p).oper[0]^.ref^.base;
+                            inc(taicpu(hp1).oper[0]^.ref^.offset,taicpu(p).oper[0]^.ref^.offset*max(taicpu(hp1).oper[0]^.ref^.scalefactor,1));
+                            { if the register is used as index and base, we have to increase for base as well
+                              and adapt base }
+                            if taicpu(hp1).oper[0]^.ref^.base=taicpu(p).oper[1]^.reg then
+                              begin
+                                taicpu(hp1).oper[0]^.ref^.base:=taicpu(p).oper[0]^.ref^.base;
+                                inc(taicpu(hp1).oper[0]^.ref^.offset,taicpu(p).oper[0]^.ref^.offset);
+                              end;
+                          end
+                        else
                           begin
                           begin
-                            taicpu(hp1).oper[0]^.ref^.base:=taicpu(p).oper[0]^.ref^.base;
                             inc(taicpu(hp1).oper[0]^.ref^.offset,taicpu(p).oper[0]^.ref^.offset);
                             inc(taicpu(hp1).oper[0]^.ref^.offset,taicpu(p).oper[0]^.ref^.offset);
+                            taicpu(hp1).oper[0]^.ref^.base:=taicpu(p).oper[0]^.ref^.base;
                           end;
                           end;
-                      end
-                    else
-                      begin
-                        inc(taicpu(hp1).oper[0]^.ref^.offset,taicpu(p).oper[0]^.ref^.offset);
-                        taicpu(hp1).oper[0]^.ref^.base:=taicpu(p).oper[0]^.ref^.base;
-                      end;
-                    if taicpu(p).oper[0]^.ref^.index<>NR_NO then
-                      begin
-                        taicpu(hp1).oper[0]^.ref^.base:=taicpu(hp1).oper[0]^.ref^.index;
-                        taicpu(hp1).oper[0]^.ref^.index:=taicpu(p).oper[0]^.ref^.index;
-                        taicpu(hp1).oper[0]^.ref^.scalefactor:=taicpu(p).oper[0]^.ref^.scalefactor;
+                        if taicpu(p).oper[0]^.ref^.index<>NR_NO then
+                          begin
+                            taicpu(hp1).oper[0]^.ref^.base:=taicpu(hp1).oper[0]^.ref^.index;
+                            taicpu(hp1).oper[0]^.ref^.index:=taicpu(p).oper[0]^.ref^.index;
+                            taicpu(hp1).oper[0]^.ref^.scalefactor:=taicpu(p).oper[0]^.ref^.scalefactor;
+                          end;
+                        RemoveCurrentP(p);
+                        result:=true;
+                        exit;
                       end;
                       end;
-                    RemoveCurrentP(p);
-                    result:=true;
-                    exit;
                   end;
                   end;
-              end;
 
 
-            { Change:
-                leal/q $x(%reg1),%reg2
-                ...
-                shll/q $y,%reg2
-              To:
-                leal/q $(x+2^y)(%reg1,2^y),%reg2 (if y <= 3)
-            }
-            if MatchInstruction(hp1, A_SHL, [taicpu(p).opsize]) and
-              MatchOpType(taicpu(hp1), top_const, top_reg) and
-              (taicpu(hp1).oper[0]^.val <= 3) then
-              begin
-                Multiple := 1 shl taicpu(hp1).oper[0]^.val;
-                TransferUsedRegs(TmpUsedRegs);
-                UpdateUsedRegs(TmpUsedRegs, tai(hp1.Next));
+                { Change:
+                    leal/q $x(%reg1),%reg2
+                    ...
+                    shll/q $y,%reg2
+                  To:
+                    leal/q $(x*2^y)(%reg1,2^y),%reg2 (if y <= 3)
+                }
+                if (taicpu(p).oper[0]^.ref^.base<>NR_STACK_POINTER_REG) and { lea (%rsp,scale),reg is not a valid encoding }
+                  MatchInstruction(hp1, A_SHL, [taicpu(p).opsize]) and
+                  MatchOpType(taicpu(hp1), top_const, top_reg) and
+                  (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) and
+                  (taicpu(hp1).oper[0]^.val <= 3) then
+                  begin
+                    Multiple := 1 shl taicpu(hp1).oper[0]^.val;
+                    TransferUsedRegs(TmpUsedRegs);
+                    UpdateUsedRegs(TmpUsedRegs, tai(hp1.Next));
 
 
-                TempReg := taicpu(hp1).oper[1]^.reg; { Store locally to reduce the number of dereferences }
-                if
-                  { This allows the optimisation in some circumstances even if the lea instruction already has a scale factor
-                    (this works even if scalefactor is zero) }
-                  ((Multiple * taicpu(p).oper[0]^.ref^.scalefactor) <= 8) and
+                    if
+                      { This allows the optimisation in some circumstances even if the lea instruction already has a scale factor
+                        (this works even if scalefactor is zero) }
+                      ((Multiple * taicpu(p).oper[0]^.ref^.scalefactor) <= 8) and
 
 
-                  { Ensure offset doesn't go out of bounds }
-                  (abs(taicpu(p).oper[0]^.ref^.offset * Multiple) <= $7FFFFFFF) and
+                      { Ensure offset doesn't go out of bounds }
+                      (abs(taicpu(p).oper[0]^.ref^.offset * Multiple) <= $7FFFFFFF) and
 
 
-                  not (RegInUsedRegs(NR_DEFAULTFLAGS,TmpUsedRegs)) and
-                  MatchOperand(taicpu(p).oper[1]^, TempReg) and
-                  (
-                    (
-                      not SuperRegistersEqual(taicpu(p).oper[0]^.ref^.base, TempReg) and
+                      not (RegInUsedRegs(NR_DEFAULTFLAGS,TmpUsedRegs)) and
                       (
                       (
-                        (taicpu(p).oper[0]^.ref^.index = NR_NO) or
-                        (taicpu(p).oper[0]^.ref^.index = NR_INVALID) or
                         (
                         (
-                          { Check for lea $x(%reg1,%reg1),%reg2 and treat as it it were lea $x(%reg1,2),%reg2 }
-                          (taicpu(p).oper[0]^.ref^.index = taicpu(p).oper[0]^.ref^.base) and
-                          (taicpu(p).oper[0]^.ref^.scalefactor <= 1)
+                          not SuperRegistersEqual(taicpu(p).oper[0]^.ref^.base, taicpu(p).oper[1]^.reg) and
+                          (
+                            (taicpu(p).oper[0]^.ref^.index = NR_NO) or
+                            (taicpu(p).oper[0]^.ref^.index = NR_INVALID) or
+                            (
+                              { Check for lea $x(%reg1,%reg1),%reg2 and treat as if it were lea $x(%reg1,2),%reg2 }
+                              (taicpu(p).oper[0]^.ref^.index = taicpu(p).oper[0]^.ref^.base) and
+                              (taicpu(p).oper[0]^.ref^.scalefactor <= 1)
+                            )
+                          )
+                        ) or (
+                          (
+                            (taicpu(p).oper[0]^.ref^.base = NR_NO) or
+                            (taicpu(p).oper[0]^.ref^.base = NR_INVALID)
+                          ) and
+                          not SuperRegistersEqual(taicpu(p).oper[0]^.ref^.index, taicpu(p).oper[1]^.reg)
                         )
                         )
-                      )
-                    ) or (
-                      (
-                        (taicpu(p).oper[0]^.ref^.base = NR_NO) or
-                        (taicpu(p).oper[0]^.ref^.base = NR_INVALID)
-                      ) and
-                      not SuperRegistersEqual(taicpu(p).oper[0]^.ref^.index, TempReg)
-                    )
-                  ) then
-                  begin
-                    repeat
-                      with taicpu(p).oper[0]^.ref^ do
-                        begin
-                          { Convert lea $x(%reg1,%reg1),%reg2 to lea $x(%reg1,2),%reg2 }
-                          if index = base then
-                            begin
-                              if Multiple > 4 then
-                                { Optimisation will no longer work because resultant
-                                  scale factor will exceed 8 }
-                                Break;
-
-                              base := NR_NO;
-                              scalefactor := 2;
-                              DebugMsg(SPeepholeOptimization + 'lea $x(%reg1,%reg1),%reg2 -> lea $x(%reg1,2),%reg2 for following optimisation', p);
-                            end
-                          else if (base <> NR_NO) and (base <> NR_INVALID) then
+                      ) then
+                      begin
+                        repeat
+                          with taicpu(p).oper[0]^.ref^ do
                             begin
                             begin
-                              { Scale factor only works on the index register }
-                              index := base;
-                              base := NR_NO;
-                            end;
+                              { Convert lea $x(%reg1,%reg1),%reg2 to lea $x(%reg1,2),%reg2 }
+                              if index = base then
+                                begin
+                                  if Multiple > 4 then
+                                    { Optimisation will no longer work because resultant
+                                      scale factor will exceed 8 }
+                                    Break;
 
 
-                          { For safety }
-                          if scalefactor <= 1 then
-                            begin
-                              DebugMsg(SPeepholeOptimization + 'LeaShl2Lea 1', p);
-                              scalefactor := Multiple;
-                            end
-                          else
-                            begin
-                              DebugMsg(SPeepholeOptimization + 'LeaShl2Lea 2', p);
-                              scalefactor := scalefactor * Multiple;
-                            end;
+                                  base := NR_NO;
+                                  scalefactor := 2;
+                                  DebugMsg(SPeepholeOptimization + 'lea $x(%reg1,%reg1),%reg2 -> lea $x(%reg1,2),%reg2 for following optimisation', p);
+                                end
+                              else if (base <> NR_NO) and (base <> NR_INVALID) then
+                                begin
+                                  { Scale factor only works on the index register }
+                                  index := base;
+                                  base := NR_NO;
+                                end;
 
 
-                          offset := offset * Multiple;
-                        end;
-                      RemoveInstruction(hp1);
-                      Result := True;
-                      Exit;
-                    { This repeat..until loop exists for the benefit of Break }
-                    until True;
+                              { For safety }
+                              if scalefactor <= 1 then
+                                begin
+                                  DebugMsg(SPeepholeOptimization + 'LeaShl2Lea 1', p);
+                                  scalefactor := Multiple;
+                                end
+                              else
+                                begin
+                                  DebugMsg(SPeepholeOptimization + 'LeaShl2Lea 2', p);
+                                  scalefactor := scalefactor * Multiple;
+                                end;
+
+                              offset := offset * Multiple;
+                            end;
+                          RemoveInstruction(hp1);
+                          Result := True;
+                          Exit;
+                        { This repeat..until loop exists for the benefit of Break }
+                        until True;
+                      end;
                   end;
                   end;
               end;
               end;
           end;
           end;