Browse Source

* patch by J. Gareth Moreton: fixes crash on ARM with -CriotR, resolves #38116

git-svn-id: trunk@47531 -
florian 4 years ago
parent
commit
1014e53081
2 changed files with 520 additions and 502 deletions
  1. 460 452
      compiler/arm/aoptcpu.pas
  2. 60 50
      compiler/armgen/aoptarm.pas

+ 460 - 452
compiler/arm/aoptcpu.pas

@@ -1284,504 +1284,512 @@ Implementation
 
 
       { All the optimisations from this point on require GetNextInstructionUsingReg
       { All the optimisations from this point on require GetNextInstructionUsingReg
         to return True }
         to return True }
-      if not (
+      while (
         GetNextInstructionUsingReg(p, hpfar1, taicpu(p).oper[0]^.reg) and
         GetNextInstructionUsingReg(p, hpfar1, taicpu(p).oper[0]^.reg) and
         (hpfar1.typ = ait_instruction)
         (hpfar1.typ = ait_instruction)
-      ) then
-        Exit;
+      ) do
+        begin
 
 
-      { Change the common
-        mov r0, r0, lsr #xxx
-        and r0, r0, #yyy/bic r0, r0, #xxx
+          { Change the common
+            mov r0, r0, lsr #xxx
+            and r0, r0, #yyy/bic r0, r0, #xxx
 
 
-        and remove the superfluous and/bic if possible
+            and remove the superfluous and/bic if possible
 
 
-        This could be extended to handle more cases.
-      }
+            This could be extended to handle more cases.
+          }
 
 
-      { Change
-        mov rx, ry, lsr/ror #xxx
-        uxtb/uxth rz,rx/and rz,rx,0xFF
-        dealloc rx
+          { Change
+            mov rx, ry, lsr/ror #xxx
+            uxtb/uxth rz,rx/and rz,rx,0xFF
+            dealloc rx
 
 
-        to
+            to
 
 
-        uxtb/uxth rz,ry,ror #xxx
-      }
-      if (GenerateThumb2Code) and
-         (taicpu(p).ops=3) and
-         (taicpu(p).oper[2]^.typ = top_shifterop) and
-         (taicpu(p).oper[2]^.shifterop^.rs = NR_NO) and
-         (taicpu(p).oper[2]^.shifterop^.shiftmode in [SM_LSR,SM_ROR]) and
-         RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hpfar1)) then
-         begin
-           if MatchInstruction(hpfar1, A_UXTB, [C_None], [PF_None]) and
-             (taicpu(hpfar1).ops = 2) and
-             (taicpu(p).oper[2]^.shifterop^.shiftimm in [8,16,24]) and
-             MatchOperand(taicpu(hpfar1).oper[1]^, taicpu(p).oper[0]^.reg) then
-             begin
-               taicpu(hpfar1).oper[1]^.reg := taicpu(p).oper[1]^.reg;
-               taicpu(hpfar1).loadshifterop(2,taicpu(p).oper[2]^.shifterop^);
-               taicpu(hpfar1).oper[2]^.shifterop^.shiftmode:=SM_ROR;
-               taicpu(hpfar1).ops := 3;
-
-               if not Assigned(hp1) then
-                 GetNextInstruction(p,hp1);
-
-               RemoveCurrentP(p, hp1);
-
-               result:=true;
-               exit;
-             end
-           else if MatchInstruction(hpfar1, A_UXTH, [C_None], [PF_None]) and
-             (taicpu(hpfar1).ops=2) and
-             (taicpu(p).oper[2]^.shifterop^.shiftimm in [16]) and
-             MatchOperand(taicpu(hpfar1).oper[1]^, taicpu(p).oper[0]^.reg) then
-             begin
-               taicpu(hpfar1).oper[1]^.reg := taicpu(p).oper[1]^.reg;
-               taicpu(hpfar1).loadshifterop(2,taicpu(p).oper[2]^.shifterop^);
-               taicpu(hpfar1).oper[2]^.shifterop^.shiftmode:=SM_ROR;
-               taicpu(hpfar1).ops := 3;
-
-               if not Assigned(hp1) then
-                 GetNextInstruction(p,hp1);
-
-               RemoveCurrentP(p, hp1);
-
-               result:=true;
-               exit;
-             end
-           else if MatchInstruction(hpfar1, A_AND, [C_None], [PF_None]) and
-             (taicpu(hpfar1).ops = 3) and
-             (taicpu(hpfar1).oper[2]^.typ = top_const) and
-             (taicpu(hpfar1).oper[2]^.val = $FF) and
-             (taicpu(p).oper[2]^.shifterop^.shiftimm in [8,16,24]) and
-             MatchOperand(taicpu(hpfar1).oper[1]^, taicpu(p).oper[0]^.reg) then
+            uxtb/uxth rz,ry,ror #xxx
+          }
+          if (GenerateThumb2Code) and
+             (taicpu(p).ops=3) and
+             (taicpu(p).oper[2]^.typ = top_shifterop) and
+             (taicpu(p).oper[2]^.shifterop^.rs = NR_NO) and
+             (taicpu(p).oper[2]^.shifterop^.shiftmode in [SM_LSR,SM_ROR]) and
+             RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hpfar1)) then
              begin
              begin
-               taicpu(hpfar1).ops := 3;
-               taicpu(hpfar1).opcode := A_UXTB;
-               taicpu(hpfar1).oper[1]^.reg := taicpu(p).oper[1]^.reg;
-               taicpu(hpfar1).loadshifterop(2,taicpu(p).oper[2]^.shifterop^);
-               taicpu(hpfar1).oper[2]^.shifterop^.shiftmode:=SM_ROR;
-
-               if not Assigned(hp1) then
-                 GetNextInstruction(p,hp1);
-
-               RemoveCurrentP(p, hp1);
-
-               result:=true;
-               exit;
+               if MatchInstruction(hpfar1, A_UXTB, [C_None], [PF_None]) and
+                 (taicpu(hpfar1).ops = 2) and
+                 (taicpu(p).oper[2]^.shifterop^.shiftimm in [8,16,24]) and
+                 MatchOperand(taicpu(hpfar1).oper[1]^, taicpu(p).oper[0]^.reg) then
+                 begin
+                   taicpu(hpfar1).oper[1]^.reg := taicpu(p).oper[1]^.reg;
+                   taicpu(hpfar1).loadshifterop(2,taicpu(p).oper[2]^.shifterop^);
+                   taicpu(hpfar1).oper[2]^.shifterop^.shiftmode:=SM_ROR;
+                   taicpu(hpfar1).ops := 3;
+
+                   if not Assigned(hp1) then
+                     GetNextInstruction(p,hp1);
+
+                   RemoveCurrentP(p, hp1);
+
+                   result:=true;
+                   exit;
+                 end
+               else if MatchInstruction(hpfar1, A_UXTH, [C_None], [PF_None]) and
+                 (taicpu(hpfar1).ops=2) and
+                 (taicpu(p).oper[2]^.shifterop^.shiftimm in [16]) and
+                 MatchOperand(taicpu(hpfar1).oper[1]^, taicpu(p).oper[0]^.reg) then
+                 begin
+                   taicpu(hpfar1).oper[1]^.reg := taicpu(p).oper[1]^.reg;
+                   taicpu(hpfar1).loadshifterop(2,taicpu(p).oper[2]^.shifterop^);
+                   taicpu(hpfar1).oper[2]^.shifterop^.shiftmode:=SM_ROR;
+                   taicpu(hpfar1).ops := 3;
+
+                   if not Assigned(hp1) then
+                     GetNextInstruction(p,hp1);
+
+                   RemoveCurrentP(p, hp1);
+
+                   result:=true;
+                   exit;
+                 end
+               else if MatchInstruction(hpfar1, A_AND, [C_None], [PF_None]) and
+                 (taicpu(hpfar1).ops = 3) and
+                 (taicpu(hpfar1).oper[2]^.typ = top_const) and
+                 (taicpu(hpfar1).oper[2]^.val = $FF) and
+                 (taicpu(p).oper[2]^.shifterop^.shiftimm in [8,16,24]) and
+                 MatchOperand(taicpu(hpfar1).oper[1]^, taicpu(p).oper[0]^.reg) then
+                 begin
+                   taicpu(hpfar1).ops := 3;
+                   taicpu(hpfar1).opcode := A_UXTB;
+                   taicpu(hpfar1).oper[1]^.reg := taicpu(p).oper[1]^.reg;
+                   taicpu(hpfar1).loadshifterop(2,taicpu(p).oper[2]^.shifterop^);
+                   taicpu(hpfar1).oper[2]^.shifterop^.shiftmode:=SM_ROR;
+
+                   if not Assigned(hp1) then
+                     GetNextInstruction(p,hp1);
+
+                   RemoveCurrentP(p, hp1);
+
+                   result:=true;
+                   exit;
+                 end;
              end;
              end;
-         end;
 
 
-      { 2-operald mov optimisations }
-      if (taicpu(p).ops = 2) then
-        begin
-          {
-            This removes the mul from
-            mov rX,0
-            ...
-            mul ...,rX,...
-          }
-          if (taicpu(p).oper[1]^.typ = top_const) then
+          { 2-operald mov optimisations }
+          if (taicpu(p).ops = 2) then
             begin
             begin
-(*          if false and
-            (taicpu(p).oper[1]^.val=0) and
-            MatchInstruction(hpfar1, [A_MUL,A_MLA], [taicpu(p).condition], [taicpu(p).oppostfix]) and
-            (((taicpu(hpfar1).oper[1]^.typ=top_reg) and MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[1]^)) or
-             ((taicpu(hpfar1).oper[2]^.typ=top_reg) and MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[2]^))) then
-              begin
-                TransferUsedRegs(TmpUsedRegs);
-                UpdateUsedRegs(TmpUsedRegs, tai(p.next));
-                UpdateUsedRegs(TmpUsedRegs, tai(hpfar1.next));
-                DebugMsg('Peephole Optimization: MovMUL/MLA2Mov0 done', p);
-                if taicpu(hpfar1).opcode=A_MUL then
-                  taicpu(hpfar1).loadconst(1,0)
-                else
-                  taicpu(hpfar1).loadreg(1,taicpu(hpfar1).oper[3]^.reg);
-                taicpu(hpfar1).ops:=2;
-                taicpu(hpfar1).opcode:=A_MOV;
-                if not(RegUsedAfterInstruction(taicpu(p).oper[0]^.reg,hpfar1,TmpUsedRegs)) then
-                  RemoveCurrentP(p);
-                Result:=true;
-                exit;
-              end
-          else*) if (taicpu(p).oper[1]^.val=0) and
-              MatchInstruction(hpfar1, A_MLA, [taicpu(p).condition], [taicpu(p).oppostfix]) and
-              MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[3]^) then
+              {
+                This removes the mul from
+                mov rX,0
+                ...
+                mul ...,rX,...
+              }
+              if (taicpu(p).oper[1]^.typ = top_const) then
                 begin
                 begin
-                  TransferUsedRegs(TmpUsedRegs);
-                  UpdateUsedRegs(TmpUsedRegs, tai(p.next));
-                  UpdateUsedRegs(TmpUsedRegs, tai(hpfar1.next));
-                  DebugMsg('Peephole Optimization: MovMLA2MUL 1 done', p);
-                  taicpu(hpfar1).ops:=3;
-                  taicpu(hpfar1).opcode:=A_MUL;
-                  if not(RegUsedAfterInstruction(taicpu(p).oper[0]^.reg,hpfar1,TmpUsedRegs)) then
-                    begin
+    (*          if false and
+                (taicpu(p).oper[1]^.val=0) and
+                MatchInstruction(hpfar1, [A_MUL,A_MLA], [taicpu(p).condition], [taicpu(p).oppostfix]) and
+                (((taicpu(hpfar1).oper[1]^.typ=top_reg) and MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[1]^)) or
+                 ((taicpu(hpfar1).oper[2]^.typ=top_reg) and MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[2]^))) then
+                  begin
+                    TransferUsedRegs(TmpUsedRegs);
+                    UpdateUsedRegs(TmpUsedRegs, tai(p.next));
+                    UpdateUsedRegs(TmpUsedRegs, tai(hpfar1.next));
+                    DebugMsg('Peephole Optimization: MovMUL/MLA2Mov0 done', p);
+                    if taicpu(hpfar1).opcode=A_MUL then
+                      taicpu(hpfar1).loadconst(1,0)
+                    else
+                      taicpu(hpfar1).loadreg(1,taicpu(hpfar1).oper[3]^.reg);
+                    taicpu(hpfar1).ops:=2;
+                    taicpu(hpfar1).opcode:=A_MOV;
+                    if not(RegUsedAfterInstruction(taicpu(p).oper[0]^.reg,hpfar1,TmpUsedRegs)) then
                       RemoveCurrentP(p);
                       RemoveCurrentP(p);
-                      Result:=true;
-                    end;
-                  exit;
-                end
-            {
-              This changes the very common
-              mov r0, #0
-              str r0, [...]
-              mov r0, #0
-              str r0, [...]
-
-              and removes all superfluous mov instructions
-            }
-            else if (taicpu(hpfar1).opcode=A_STR) then
-              begin
-                hp1 := hpfar1;
-                while MatchInstruction(hp1, A_STR, [taicpu(p).condition], []) and
-                      MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[0]^) and
-                      GetNextInstruction(hp1, hp2) and
-                      MatchInstruction(hp2, A_MOV, [taicpu(p).condition], [PF_None]) and
-                      (taicpu(hp2).ops = 2) and
-                      MatchOperand(taicpu(hp2).oper[0]^, taicpu(p).oper[0]^) and
-                      MatchOperand(taicpu(hp2).oper[1]^, taicpu(p).oper[1]^) do
+                    Result:=true;
+                    exit;
+                  end
+              else*) if (taicpu(p).oper[1]^.val=0) and
+                  MatchInstruction(hpfar1, A_MLA, [taicpu(p).condition], [taicpu(p).oppostfix]) and
+                  MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[3]^) then
+                    begin
+                      TransferUsedRegs(TmpUsedRegs);
+                      UpdateUsedRegs(TmpUsedRegs, tai(p.next));
+                      UpdateUsedRegs(TmpUsedRegs, tai(hpfar1.next));
+                      DebugMsg('Peephole Optimization: MovMLA2MUL 1 done', p);
+                      taicpu(hpfar1).ops:=3;
+                      taicpu(hpfar1).opcode:=A_MUL;
+                      if not(RegUsedAfterInstruction(taicpu(p).oper[0]^.reg,hpfar1,TmpUsedRegs)) then
+                        begin
+                          RemoveCurrentP(p);
+                          Result:=true;
+                        end;
+                      exit;
+                    end
+                {
+                  This changes the very common
+                  mov r0, #0
+                  str r0, [...]
+                  mov r0, #0
+                  str r0, [...]
+
+                  and removes all superfluous mov instructions
+                }
+                else if (taicpu(hpfar1).opcode=A_STR) then
                   begin
                   begin
-                    DebugMsg('Peephole Optimization: MovStrMov done', hp2);
-                    GetNextInstruction(hp2,hp1);
-                    asml.remove(hp2);
-                    hp2.free;
-                    result:=true;
-                    if not assigned(hp1) then break;
-                  end;
+                    hp1 := hpfar1;
+                    while MatchInstruction(hp1, A_STR, [taicpu(p).condition], []) and
+                          MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[0]^) and
+                          GetNextInstruction(hp1, hp2) and
+                          MatchInstruction(hp2, A_MOV, [taicpu(p).condition], [PF_None]) and
+                          (taicpu(hp2).ops = 2) and
+                          MatchOperand(taicpu(hp2).oper[0]^, taicpu(p).oper[0]^) and
+                          MatchOperand(taicpu(hp2).oper[1]^, taicpu(p).oper[1]^) do
+                      begin
+                        DebugMsg('Peephole Optimization: MovStrMov done', hp2);
+                        GetNextInstruction(hp2,hp1);
+                        asml.remove(hp2);
+                        hp2.free;
+                        result:=true;
+                        if not assigned(hp1) then break;
+                      end;
 
 
-                if Result then
-                  Exit;
-              end;
-            end;
-          {
-            This removes the first mov from
-            mov rX,...
-            mov rX,...
-          }
-          if taicpu(hpfar1).opcode=A_MOV then
-            begin
-              hp1 := p;
-              while MatchInstruction(hpfar1, A_MOV, [taicpu(hp1).condition], [taicpu(hp1).oppostfix]) and
-                    (taicpu(hpfar1).ops = 2) and
-                    MatchOperand(taicpu(hp1).oper[0]^, taicpu(hpfar1).oper[0]^) and
-                    { don't remove the first mov if the second is a mov rX,rX }
-                    not(MatchOperand(taicpu(hpfar1).oper[0]^, taicpu(hpfar1).oper[1]^)) do
+                    if Result then
+                      Exit;
+                  end;
+                end;
+              {
+                This removes the first mov from
+                mov rX,...
+                mov rX,...
+              }
+              if taicpu(hpfar1).opcode=A_MOV then
                 begin
                 begin
-                  { Defer removing the first p until after the while loop }
-                  if p <> hp1 then
+                  hp1 := p;
+                  while MatchInstruction(hpfar1, A_MOV, [taicpu(hp1).condition], [taicpu(hp1).oppostfix]) and
+                        (taicpu(hpfar1).ops = 2) and
+                        MatchOperand(taicpu(hp1).oper[0]^, taicpu(hpfar1).oper[0]^) and
+                        { don't remove the first mov if the second is a mov rX,rX }
+                        not(MatchOperand(taicpu(hpfar1).oper[0]^, taicpu(hpfar1).oper[1]^)) do
                     begin
                     begin
-                      DebugMsg('Peephole Optimization: MovMov done', hp1);
-                      asml.remove(hp1);
-                      hp1.free;
+                      { Defer removing the first p until after the while loop }
+                      if p <> hp1 then
+                        begin
+                          DebugMsg('Peephole Optimization: MovMov done', hp1);
+                          asml.remove(hp1);
+                          hp1.free;
+                        end;
+                      hp1:=hpfar1;
+                      GetNextInstruction(hpfar1,hpfar1);
+                      result:=true;
+                      if not assigned(hpfar1) then
+                        Break;
+                    end;
+
+                  if Result then
+                    begin
+                      DebugMsg('Peephole Optimization: MovMov done', p);
+                      RemoveCurrentp(p);
+                      Exit;
                     end;
                     end;
-                  hp1:=hpfar1;
-                  GetNextInstruction(hpfar1,hpfar1);
-                  result:=true;
-                  if not assigned(hpfar1) then
-                    Break;
                 end;
                 end;
 
 
-              if Result then
+              if RedundantMovProcess(p,hpfar1) then
                 begin
                 begin
-                  DebugMsg('Peephole Optimization: MovMov done', p);
-                  RemoveCurrentp(p);
-                  Exit;
+                  Result:=true;
+                  { p might not point at a mov anymore }
+                  exit;
                 end;
                 end;
-            end;
 
 
-          if RedundantMovProcess(p,hpfar1) then
-            begin
-              Result:=true;
-              { p might not point at a mov anymore }
-              exit;
-            end;
+              { If hpfar1 is nil after the call to RedundantMovProcess, it is
+                because it would have become a dangling pointer, so reinitialise it. }
+              if not Assigned(hpfar1) then
+                Continue;
 
 
-          { Fold the very common sequence
-              mov  regA, regB
-              ldr* regA, [regA]
-            to
-              ldr* regA, [regB]
-            CAUTION! If this one is successful p might not be a mov instruction anymore!
-          }
-          if
-             // Make sure that Thumb code doesn't propagate a high register into a reference
-             (
-               (
-                 GenerateThumbCode and
-                 (getsupreg(taicpu(p).oper[1]^.reg) < RS_R8)
-               ) or (not GenerateThumbCode)
-             ) and
-             (taicpu(p).oper[1]^.typ = top_reg) and
-             (taicpu(p).oppostfix = PF_NONE) and
-             MatchInstruction(hpfar1, [A_LDR, A_STR], [taicpu(p).condition], []) and
-             (taicpu(hpfar1).oper[1]^.typ = top_ref) and
-             { We can change the base register only when the instruction uses AM_OFFSET }
-             ((taicpu(hpfar1).oper[1]^.ref^.index = taicpu(p).oper[0]^.reg) or
-               ((taicpu(hpfar1).oper[1]^.ref^.addressmode = AM_OFFSET) and
-                (taicpu(hpfar1).oper[1]^.ref^.base = taicpu(p).oper[0]^.reg))
-             ) and
-             not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hpfar1)) and
-             RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hpfar1)) then
-            begin
-              DebugMsg('Peephole Optimization: MovLdr2Ldr done', hpfar1);
-              if (taicpu(hpfar1).oper[1]^.ref^.addressmode = AM_OFFSET) and
-                 (taicpu(hpfar1).oper[1]^.ref^.base = taicpu(p).oper[0]^.reg) then
-                taicpu(hpfar1).oper[1]^.ref^.base := taicpu(p).oper[1]^.reg;
-
-              if taicpu(hpfar1).oper[1]^.ref^.index = taicpu(p).oper[0]^.reg then
-                taicpu(hpfar1).oper[1]^.ref^.index := taicpu(p).oper[1]^.reg;
-
-              dealloc:=FindRegDeAlloc(taicpu(p).oper[1]^.reg, tai(p.Next));
-              if Assigned(dealloc) then
+              { Fold the very common sequence
+                  mov  regA, regB
+                  ldr* regA, [regA]
+                to
+                  ldr* regA, [regB]
+                CAUTION! If this one is successful p might not be a mov instruction anymore!
+              }
+              if
+                 // Make sure that Thumb code doesn't propagate a high register into a reference
+                 (
+                   (
+                     GenerateThumbCode and
+                     (getsupreg(taicpu(p).oper[1]^.reg) < RS_R8)
+                   ) or (not GenerateThumbCode)
+                 ) and
+                 (taicpu(p).oper[1]^.typ = top_reg) and
+                 (taicpu(p).oppostfix = PF_NONE) and
+                 MatchInstruction(hpfar1, [A_LDR, A_STR], [taicpu(p).condition], []) and
+                 (taicpu(hpfar1).oper[1]^.typ = top_ref) and
+                 { We can change the base register only when the instruction uses AM_OFFSET }
+                 ((taicpu(hpfar1).oper[1]^.ref^.index = taicpu(p).oper[0]^.reg) or
+                   ((taicpu(hpfar1).oper[1]^.ref^.addressmode = AM_OFFSET) and
+                    (taicpu(hpfar1).oper[1]^.ref^.base = taicpu(p).oper[0]^.reg))
+                 ) and
+                 not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hpfar1)) and
+                 RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hpfar1)) then
                 begin
                 begin
-                  asml.remove(dealloc);
-                  asml.InsertAfter(dealloc,hpfar1);
-                end;
+                  DebugMsg('Peephole Optimization: MovLdr2Ldr done', hpfar1);
+                  if (taicpu(hpfar1).oper[1]^.ref^.addressmode = AM_OFFSET) and
+                     (taicpu(hpfar1).oper[1]^.ref^.base = taicpu(p).oper[0]^.reg) then
+                    taicpu(hpfar1).oper[1]^.ref^.base := taicpu(p).oper[1]^.reg;
 
 
-              if not Assigned(hp1) then
-                GetNextInstruction(p, hp1);
+                  if taicpu(hpfar1).oper[1]^.ref^.index = taicpu(p).oper[0]^.reg then
+                    taicpu(hpfar1).oper[1]^.ref^.index := taicpu(p).oper[1]^.reg;
 
 
-              RemoveCurrentP(p, hp1);
+                  dealloc:=FindRegDeAlloc(taicpu(p).oper[1]^.reg, tai(p.Next));
+                  if Assigned(dealloc) then
+                    begin
+                      asml.remove(dealloc);
+                      asml.InsertAfter(dealloc,hpfar1);
+                    end;
 
 
-              result:=true;
-              Exit;
-            end
-        end
+                  if not Assigned(hp1) then
+                    GetNextInstruction(p, hp1);
 
 
-      { 3-operald mov optimisations }
-      else if (taicpu(p).ops = 3) then
-        begin
+                  RemoveCurrentP(p, hp1);
 
 
-          if (taicpu(p).oper[2]^.typ = top_shifterop) and
-            (taicpu(p).oper[2]^.shifterop^.rs = NR_NO) and
-            (taicpu(p).oper[2]^.shifterop^.shiftmode = SM_LSR) and
-            (taicpu(hpfar1).ops>=1) and
-            (taicpu(hpfar1).oper[0]^.typ=top_reg) and
-            (not RegModifiedBetween(taicpu(hpfar1).oper[0]^.reg, p, hpfar1)) and
-            RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hpfar1)) then
-            begin
-              if (taicpu(p).oper[2]^.shifterop^.shiftimm >= 24 ) and
-                MatchInstruction(hpfar1, A_AND, [taicpu(p).condition], [taicpu(p).oppostfix]) and
-                (taicpu(hpfar1).ops=3) and
-                MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[1]^) and
-                (taicpu(hpfar1).oper[2]^.typ = top_const) and
-                { Check if the AND actually would only mask out bits being already zero because of the shift
-                }
-                ((($ffffffff shr taicpu(p).oper[2]^.shifterop^.shiftimm) and taicpu(hpfar1).oper[2]^.val) =
-                  ($ffffffff shr taicpu(p).oper[2]^.shifterop^.shiftimm)) then
-                begin
-                  DebugMsg('Peephole Optimization: LsrAnd2Lsr done', hpfar1);
-                  taicpu(p).oper[0]^.reg:=taicpu(hpfar1).oper[0]^.reg;
-                  asml.remove(hpfar1);
-                  hpfar1.free;
                   result:=true;
                   result:=true;
                   Exit;
                   Exit;
                 end
                 end
-              else if MatchInstruction(hpfar1, A_BIC, [taicpu(p).condition], [taicpu(p).oppostfix]) and
-                (taicpu(hpfar1).ops=3) and
-                MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[1]^) and
-                (taicpu(hpfar1).oper[2]^.typ = top_const) and
-                { Check if the BIC actually would only mask out bits beeing already zero because of the shift }
-                (taicpu(hpfar1).oper[2]^.val<>0) and
-                (BsfDWord(taicpu(hpfar1).oper[2]^.val)>=32-taicpu(p).oper[2]^.shifterop^.shiftimm) then
-                begin
-                  DebugMsg('Peephole Optimization: LsrBic2Lsr done', hpfar1);
-                  taicpu(p).oper[0]^.reg:=taicpu(hpfar1).oper[0]^.reg;
-                  asml.remove(hpfar1);
-                  hpfar1.free;
-                  result:=true;
-                  Exit;
-                end;
-            end;
-          { This folds shifterops into following instructions
-            mov r0, r1, lsl #8
-            add r2, r3, r0
+            end
 
 
-            to
+          { 3-operald mov optimisations }
+          else if (taicpu(p).ops = 3) then
+            begin
 
 
-            add r2, r3, r1, lsl #8
-            CAUTION! If this one is successful p might not be a mov instruction anymore!
-          }
-          if (taicpu(p).oper[1]^.typ = top_reg) and
-           (taicpu(p).oper[2]^.typ = top_shifterop) and
-           (taicpu(p).oppostfix = PF_NONE) and
-           MatchInstruction(hpfar1, [A_ADD, A_ADC, A_RSB, A_RSC, A_SUB, A_SBC,
-                                  A_AND, A_BIC, A_EOR, A_ORR, A_TEQ, A_TST,
-                                  A_CMP, A_CMN],
-                            [taicpu(p).condition], [PF_None]) and
-           (not ((GenerateThumb2Code) and
-                 (taicpu(hpfar1).opcode in [A_SBC]) and
-                 (((taicpu(hpfar1).ops=3) and
-                   MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[1]^.reg)) or
-                  ((taicpu(hpfar1).ops=2) and
-                   MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[0]^.reg))))) and
-           RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hpfar1)) and
-           (taicpu(hpfar1).ops >= 2) and
-           {Currently we can't fold into another shifterop}
-           (taicpu(hpfar1).oper[taicpu(hpfar1).ops-1]^.typ = top_reg) and
-           {Folding rrx is problematic because of the C-Flag, as we currently can't check
-            NR_DEFAULTFLAGS for modification}
-           (
-             {Everything is fine if we don't use RRX}
-             (taicpu(p).oper[2]^.shifterop^.shiftmode <> SM_RRX) or
-             (
-               {If it is RRX, then check if we're just accessing the next instruction}
-               Assigned(hp1) and
-               (hpfar1 = hp1)
-             )
-           ) and
-           { reg1 might not be modified inbetween }
-           not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hpfar1)) and
-           { The shifterop can contain a register, might not be modified}
-           (
-             (taicpu(p).oper[2]^.shifterop^.rs = NR_NO) or
-             not(RegModifiedBetween(taicpu(p).oper[2]^.shifterop^.rs, p, hpfar1))
-           ) and
-           (
-             {Only ONE of the two src operands is allowed to match}
-             MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[taicpu(hpfar1).ops-2]^) xor
-             MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[taicpu(hpfar1).ops-1]^)
-           ) then
-          begin
-            if taicpu(hpfar1).opcode in [A_TST, A_TEQ, A_CMN] then
-              I2:=0
-            else
-              I2:=1;
-            for I:=I2 to taicpu(hpfar1).ops-1 do
-              if MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[I]^.reg) then
+              if (taicpu(p).oper[2]^.typ = top_shifterop) and
+                (taicpu(p).oper[2]^.shifterop^.rs = NR_NO) and
+                (taicpu(p).oper[2]^.shifterop^.shiftmode = SM_LSR) and
+                (taicpu(hpfar1).ops>=1) and
+                (taicpu(hpfar1).oper[0]^.typ=top_reg) and
+                (not RegModifiedBetween(taicpu(hpfar1).oper[0]^.reg, p, hpfar1)) and
+                RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hpfar1)) then
                 begin
                 begin
-                  { If the parameter matched on the second op from the RIGHT
-                    we have to switch the parameters, this will not happen for CMP
-                    were we're only evaluating the most right parameter
-                  }
-                  if I <> taicpu(hpfar1).ops-1 then
+                  if (taicpu(p).oper[2]^.shifterop^.shiftimm >= 24 ) and
+                    MatchInstruction(hpfar1, A_AND, [taicpu(p).condition], [taicpu(p).oppostfix]) and
+                    (taicpu(hpfar1).ops=3) and
+                    MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[1]^) and
+                    (taicpu(hpfar1).oper[2]^.typ = top_const) and
+                    { Check if the AND actually would only mask out bits being already zero because of the shift
+                    }
+                    ((($ffffffff shr taicpu(p).oper[2]^.shifterop^.shiftimm) and taicpu(hpfar1).oper[2]^.val) =
+                      ($ffffffff shr taicpu(p).oper[2]^.shifterop^.shiftimm)) then
                     begin
                     begin
-                      {The SUB operators need to be changed when we swap parameters}
-                      case taicpu(hpfar1).opcode of
-                        A_SUB: tempop:=A_RSB;
-                        A_SBC: tempop:=A_RSC;
-                        A_RSB: tempop:=A_SUB;
-                        A_RSC: tempop:=A_SBC;
-                        else tempop:=taicpu(hpfar1).opcode;
-                      end;
-                      if taicpu(hpfar1).ops = 3 then
-                        hp2:=taicpu.op_reg_reg_reg_shifterop(tempop,
-                             taicpu(hpfar1).oper[0]^.reg, taicpu(hpfar1).oper[2]^.reg,
-                             taicpu(p).oper[1]^.reg, taicpu(p).oper[2]^.shifterop^)
-                      else
-                        hp2:=taicpu.op_reg_reg_shifterop(tempop,
-                             taicpu(hpfar1).oper[0]^.reg, taicpu(p).oper[1]^.reg,
-                             taicpu(p).oper[2]^.shifterop^);
+                      DebugMsg('Peephole Optimization: LsrAnd2Lsr done', hpfar1);
+                      taicpu(p).oper[0]^.reg:=taicpu(hpfar1).oper[0]^.reg;
+                      asml.remove(hpfar1);
+                      hpfar1.free;
+                      result:=true;
+                      Exit;
                     end
                     end
-                  else
-                    if taicpu(hpfar1).ops = 3 then
-                      hp2:=taicpu.op_reg_reg_reg_shifterop(taicpu(hpfar1).opcode,
-                           taicpu(hpfar1).oper[0]^.reg, taicpu(hpfar1).oper[1]^.reg,
-                           taicpu(p).oper[1]^.reg, taicpu(p).oper[2]^.shifterop^)
-                    else
-                      hp2:=taicpu.op_reg_reg_shifterop(taicpu(hpfar1).opcode,
-                           taicpu(hpfar1).oper[0]^.reg, taicpu(p).oper[1]^.reg,
-                           taicpu(p).oper[2]^.shifterop^);
-                  if taicpu(p).oper[2]^.shifterop^.rs<>NR_NO then
-                    AllocRegBetween(taicpu(p).oper[2]^.shifterop^.rs,p,hpfar1,UsedRegs);
-                  AllocRegBetween(taicpu(p).oper[1]^.reg,p,hpfar1,UsedRegs);
-                  asml.insertbefore(hp2, hpfar1);
-                  asml.remove(hpfar1);
-                  hpfar1.free;
-                  DebugMsg('Peephole Optimization: FoldShiftProcess done', hp2);
+                  else if MatchInstruction(hpfar1, A_BIC, [taicpu(p).condition], [taicpu(p).oppostfix]) and
+                    (taicpu(hpfar1).ops=3) and
+                    MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[1]^) and
+                    (taicpu(hpfar1).oper[2]^.typ = top_const) and
+                    { Check if the BIC actually would only mask out bits beeing already zero because of the shift }
+                    (taicpu(hpfar1).oper[2]^.val<>0) and
+                    (BsfDWord(taicpu(hpfar1).oper[2]^.val)>=32-taicpu(p).oper[2]^.shifterop^.shiftimm) then
+                    begin
+                      DebugMsg('Peephole Optimization: LsrBic2Lsr done', hpfar1);
+                      taicpu(p).oper[0]^.reg:=taicpu(hpfar1).oper[0]^.reg;
+                      asml.remove(hpfar1);
+                      hpfar1.free;
+                      result:=true;
+                      Exit;
+                    end;
+                end;
+              { This folds shifterops into following instructions
+                mov r0, r1, lsl #8
+                add r2, r3, r0
 
 
-                  if not Assigned(hp1) then
-                    GetNextInstruction(p, hp1)
-                  else if hp1 = hpfar1 then
-                    { If hp1 = hpfar1, then it's a dangling pointer }
-                    hp1 := hp2;
+                to
 
 
-                  RemoveCurrentP(p, hp1);
-                  Result:=true;
-                  Exit;
-                end;
-          end;
-        {
-          Fold
-            mov r1, r1, lsl #2
-            ldr/ldrb r0, [r0, r1]
-          to
-            ldr/ldrb r0, [r0, r1, lsl #2]
-
-          XXX: This still needs some work, as we quite often encounter something like
-                 mov r1, r2, lsl #2
-                 add r2, r3, #imm
-                 ldr r0, [r2, r1]
-               which can't be folded because r2 is overwritten between the shift and the ldr.
-               We could try to shuffle the registers around and fold it into.
-                 add r1, r3, #imm
-                 ldr r0, [r1, r2, lsl #2]
-        }
-        if (not(GenerateThumbCode)) and
-          { thumb2 allows only lsl #0..#3 }
-          (not(GenerateThumb2Code) or
-           ((taicpu(p).oper[2]^.shifterop^.shiftimm in [0..3]) and
-            (taicpu(p).oper[2]^.shifterop^.shiftmode=SM_LSL)
-           )
-          ) and
-           (taicpu(p).oper[1]^.typ = top_reg) and
-           (taicpu(p).oper[2]^.typ = top_shifterop) and
-           { RRX is tough to handle, because it requires tracking the C-Flag,
-             it is also extremly unlikely to be emitted this way}
-           (taicpu(p).oper[2]^.shifterop^.shiftmode <> SM_RRX) and
-           (taicpu(p).oper[2]^.shifterop^.shiftimm <> 0) and
-           (taicpu(p).oppostfix = PF_NONE) and
-           {Only LDR, LDRB, STR, STRB can handle scaled register indexing}
-           (MatchInstruction(hpfar1, [A_LDR, A_STR], [taicpu(p).condition], [PF_None, PF_B]) or
-            (GenerateThumb2Code and
-             MatchInstruction(hpfar1, [A_LDR, A_STR], [taicpu(p).condition], [PF_None, PF_B, PF_SB, PF_H, PF_SH]))
-           ) and
-           (
-             {If this is address by offset, one of the two registers can be used}
-             ((taicpu(hpfar1).oper[1]^.ref^.addressmode=AM_OFFSET) and
+                add r2, r3, r1, lsl #8
+                CAUTION! If this one is successful p might not be a mov instruction anymore!
+              }
+              if (taicpu(p).oper[1]^.typ = top_reg) and
+               (taicpu(p).oper[2]^.typ = top_shifterop) and
+               (taicpu(p).oppostfix = PF_NONE) and
+               MatchInstruction(hpfar1, [A_ADD, A_ADC, A_RSB, A_RSC, A_SUB, A_SBC,
+                                      A_AND, A_BIC, A_EOR, A_ORR, A_TEQ, A_TST,
+                                      A_CMP, A_CMN],
+                                [taicpu(p).condition], [PF_None]) and
+               (not ((GenerateThumb2Code) and
+                     (taicpu(hpfar1).opcode in [A_SBC]) and
+                     (((taicpu(hpfar1).ops=3) and
+                       MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[1]^.reg)) or
+                      ((taicpu(hpfar1).ops=2) and
+                       MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[0]^.reg))))) and
+               RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hpfar1)) and
+               (taicpu(hpfar1).ops >= 2) and
+               {Currently we can't fold into another shifterop}
+               (taicpu(hpfar1).oper[taicpu(hpfar1).ops-1]^.typ = top_reg) and
+               {Folding rrx is problematic because of the C-Flag, as we currently can't check
+                NR_DEFAULTFLAGS for modification}
+               (
+                 {Everything is fine if we don't use RRX}
+                 (taicpu(p).oper[2]^.shifterop^.shiftmode <> SM_RRX) or
+                 (
+                   {If it is RRX, then check if we're just accessing the next instruction}
+                   Assigned(hp1) and
+                   (hpfar1 = hp1)
+                 )
+               ) and
+               { reg1 might not be modified inbetween }
+               not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hpfar1)) and
+               { The shifterop can contain a register, might not be modified}
+               (
+                 (taicpu(p).oper[2]^.shifterop^.rs = NR_NO) or
+                 not(RegModifiedBetween(taicpu(p).oper[2]^.shifterop^.rs, p, hpfar1))
+               ) and
                (
                (
-                 (taicpu(hpfar1).oper[1]^.ref^.index = taicpu(p).oper[0]^.reg) xor
-                 (taicpu(hpfar1).oper[1]^.ref^.base = taicpu(p).oper[0]^.reg)
+                 {Only ONE of the two src operands is allowed to match}
+                 MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[taicpu(hpfar1).ops-2]^) xor
+                 MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[taicpu(hpfar1).ops-1]^)
+               ) then
+              begin
+                if taicpu(hpfar1).opcode in [A_TST, A_TEQ, A_CMN] then
+                  I2:=0
+                else
+                  I2:=1;
+                for I:=I2 to taicpu(hpfar1).ops-1 do
+                  if MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[I]^.reg) then
+                    begin
+                      { If the parameter matched on the second op from the RIGHT
+                        we have to switch the parameters, this will not happen for CMP
+                        were we're only evaluating the most right parameter
+                      }
+                      if I <> taicpu(hpfar1).ops-1 then
+                        begin
+                          {The SUB operators need to be changed when we swap parameters}
+                          case taicpu(hpfar1).opcode of
+                            A_SUB: tempop:=A_RSB;
+                            A_SBC: tempop:=A_RSC;
+                            A_RSB: tempop:=A_SUB;
+                            A_RSC: tempop:=A_SBC;
+                            else tempop:=taicpu(hpfar1).opcode;
+                          end;
+                          if taicpu(hpfar1).ops = 3 then
+                            hp2:=taicpu.op_reg_reg_reg_shifterop(tempop,
+                                 taicpu(hpfar1).oper[0]^.reg, taicpu(hpfar1).oper[2]^.reg,
+                                 taicpu(p).oper[1]^.reg, taicpu(p).oper[2]^.shifterop^)
+                          else
+                            hp2:=taicpu.op_reg_reg_shifterop(tempop,
+                                 taicpu(hpfar1).oper[0]^.reg, taicpu(p).oper[1]^.reg,
+                                 taicpu(p).oper[2]^.shifterop^);
+                        end
+                      else
+                        if taicpu(hpfar1).ops = 3 then
+                          hp2:=taicpu.op_reg_reg_reg_shifterop(taicpu(hpfar1).opcode,
+                               taicpu(hpfar1).oper[0]^.reg, taicpu(hpfar1).oper[1]^.reg,
+                               taicpu(p).oper[1]^.reg, taicpu(p).oper[2]^.shifterop^)
+                        else
+                          hp2:=taicpu.op_reg_reg_shifterop(taicpu(hpfar1).opcode,
+                               taicpu(hpfar1).oper[0]^.reg, taicpu(p).oper[1]^.reg,
+                               taicpu(p).oper[2]^.shifterop^);
+                      if taicpu(p).oper[2]^.shifterop^.rs<>NR_NO then
+                        AllocRegBetween(taicpu(p).oper[2]^.shifterop^.rs,p,hpfar1,UsedRegs);
+                      AllocRegBetween(taicpu(p).oper[1]^.reg,p,hpfar1,UsedRegs);
+                      asml.insertbefore(hp2, hpfar1);
+                      asml.remove(hpfar1);
+                      hpfar1.free;
+                      DebugMsg('Peephole Optimization: FoldShiftProcess done', hp2);
+
+                      if not Assigned(hp1) then
+                        GetNextInstruction(p, hp1)
+                      else if hp1 = hpfar1 then
+                        { If hp1 = hpfar1, then it's a dangling pointer }
+                        hp1 := hp2;
+
+                      RemoveCurrentP(p, hp1);
+                      Result:=true;
+                      Exit;
+                    end;
+              end;
+            {
+              Fold
+                mov r1, r1, lsl #2
+                ldr/ldrb r0, [r0, r1]
+              to
+                ldr/ldrb r0, [r0, r1, lsl #2]
+
+              XXX: This still needs some work, as we quite often encounter something like
+                     mov r1, r2, lsl #2
+                     add r2, r3, #imm
+                     ldr r0, [r2, r1]
+                   which can't be folded because r2 is overwritten between the shift and the ldr.
+                   We could try to shuffle the registers around and fold it into.
+                     add r1, r3, #imm
+                     ldr r0, [r1, r2, lsl #2]
+            }
+            if (not(GenerateThumbCode)) and
+              { thumb2 allows only lsl #0..#3 }
+              (not(GenerateThumb2Code) or
+               ((taicpu(p).oper[2]^.shifterop^.shiftimm in [0..3]) and
+                (taicpu(p).oper[2]^.shifterop^.shiftmode=SM_LSL)
                )
                )
-             ) or
-             {For post and preindexed only the index register can be used}
-             ((taicpu(hpfar1).oper[1]^.ref^.addressmode in [AM_POSTINDEXED, AM_PREINDEXED]) and
+              ) and
+               (taicpu(p).oper[1]^.typ = top_reg) and
+               (taicpu(p).oper[2]^.typ = top_shifterop) and
+               { RRX is tough to handle, because it requires tracking the C-Flag,
+                 it is also extremly unlikely to be emitted this way}
+               (taicpu(p).oper[2]^.shifterop^.shiftmode <> SM_RRX) and
+               (taicpu(p).oper[2]^.shifterop^.shiftimm <> 0) and
+               (taicpu(p).oppostfix = PF_NONE) and
+               {Only LDR, LDRB, STR, STRB can handle scaled register indexing}
+               (MatchInstruction(hpfar1, [A_LDR, A_STR], [taicpu(p).condition], [PF_None, PF_B]) or
+                (GenerateThumb2Code and
+                 MatchInstruction(hpfar1, [A_LDR, A_STR], [taicpu(p).condition], [PF_None, PF_B, PF_SB, PF_H, PF_SH]))
+               ) and
                (
                (
-                 (taicpu(hpfar1).oper[1]^.ref^.index = taicpu(p).oper[0]^.reg) and
-                 (taicpu(hpfar1).oper[1]^.ref^.base <> taicpu(p).oper[0]^.reg)
+                 {If this is address by offset, one of the two registers can be used}
+                 ((taicpu(hpfar1).oper[1]^.ref^.addressmode=AM_OFFSET) and
+                   (
+                     (taicpu(hpfar1).oper[1]^.ref^.index = taicpu(p).oper[0]^.reg) xor
+                     (taicpu(hpfar1).oper[1]^.ref^.base = taicpu(p).oper[0]^.reg)
+                   )
+                 ) or
+                 {For post and preindexed only the index register can be used}
+                 ((taicpu(hpfar1).oper[1]^.ref^.addressmode in [AM_POSTINDEXED, AM_PREINDEXED]) and
+                   (
+                     (taicpu(hpfar1).oper[1]^.ref^.index = taicpu(p).oper[0]^.reg) and
+                     (taicpu(hpfar1).oper[1]^.ref^.base <> taicpu(p).oper[0]^.reg)
+                   ) and
+                   (not GenerateThumb2Code)
+                 )
                ) and
                ) and
-               (not GenerateThumb2Code)
-             )
-           ) and
-           { Only fold if both registers are used. Otherwise we are folding p with itself }
-           (taicpu(hpfar1).oper[1]^.ref^.index<>NR_NO) and
-           (taicpu(hpfar1).oper[1]^.ref^.base<>NR_NO) and
-           { Only fold if there isn't another shifterop already, and offset is zero. }
-           (taicpu(hpfar1).oper[1]^.ref^.offset = 0) and
-           (taicpu(hpfar1).oper[1]^.ref^.shiftmode = SM_None) and
-           not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hpfar1)) and
-           RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hpfar1)) then
-           begin
-             { If the register we want to do the shift for resides in base, we need to swap that}
-             if (taicpu(hpfar1).oper[1]^.ref^.base = taicpu(p).oper[0]^.reg) then
-               taicpu(hpfar1).oper[1]^.ref^.base := taicpu(hpfar1).oper[1]^.ref^.index;
-             taicpu(hpfar1).oper[1]^.ref^.index := taicpu(p).oper[1]^.reg;
-             taicpu(hpfar1).oper[1]^.ref^.shiftmode := taicpu(p).oper[2]^.shifterop^.shiftmode;
-             taicpu(hpfar1).oper[1]^.ref^.shiftimm := taicpu(p).oper[2]^.shifterop^.shiftimm;
-             DebugMsg('Peephole Optimization: FoldShiftLdrStr done', hpfar1);
-             RemoveCurrentP(p);
-             Result:=true;
-             Exit;
-           end;
+               { Only fold if both registers are used. Otherwise we are folding p with itself }
+               (taicpu(hpfar1).oper[1]^.ref^.index<>NR_NO) and
+               (taicpu(hpfar1).oper[1]^.ref^.base<>NR_NO) and
+               { Only fold if there isn't another shifterop already, and offset is zero. }
+               (taicpu(hpfar1).oper[1]^.ref^.offset = 0) and
+               (taicpu(hpfar1).oper[1]^.ref^.shiftmode = SM_None) and
+               not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hpfar1)) and
+               RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hpfar1)) then
+               begin
+                 { If the register we want to do the shift for resides in base, we need to swap that}
+                 if (taicpu(hpfar1).oper[1]^.ref^.base = taicpu(p).oper[0]^.reg) then
+                   taicpu(hpfar1).oper[1]^.ref^.base := taicpu(hpfar1).oper[1]^.ref^.index;
+                 taicpu(hpfar1).oper[1]^.ref^.index := taicpu(p).oper[1]^.reg;
+                 taicpu(hpfar1).oper[1]^.ref^.shiftmode := taicpu(p).oper[2]^.shifterop^.shiftmode;
+                 taicpu(hpfar1).oper[1]^.ref^.shiftimm := taicpu(p).oper[2]^.shifterop^.shiftimm;
+                 DebugMsg('Peephole Optimization: FoldShiftLdrStr done', hpfar1);
+                 RemoveCurrentP(p);
+                 Result:=true;
+                 Exit;
+               end;
+            end;
+          {
+            Often we see shifts and then a superfluous mov to another register
+            In the future this might be handled in RedundantMovProcess when it uses RegisterTracking
+          }
+          if RemoveSuperfluousMove(p, hpfar1, 'MovMov2Mov') then
+            Result:=true;
+
+          Exit;
         end;
         end;
-      {
-        Often we see shifts and then a superfluous mov to another register
-        In the future this might be handled in RedundantMovProcess when it uses RegisterTracking
-      }
-      if RemoveSuperfluousMove(p, hpfar1, 'MovMov2Mov') then
-        Result:=true;
     end;
     end;
 
 
 
 

+ 60 - 50
compiler/armgen/aoptarm.pas

@@ -40,7 +40,7 @@ Type
     procedure DebugMsg(const s : string; p : tai);
     procedure DebugMsg(const s : string; p : tai);
 
 
     function RemoveSuperfluousMove(const p: tai; movp: tai; const optimizer: string): boolean;
     function RemoveSuperfluousMove(const p: tai; movp: tai; const optimizer: string): boolean;
-    function RedundantMovProcess(var p: tai; hp1: tai): boolean;
+    function RedundantMovProcess(var p: tai; var hp1: tai): boolean;
     function GetNextInstructionUsingReg(Current: tai; out Next: tai; reg: TRegister): Boolean;
     function GetNextInstructionUsingReg(Current: tai; out Next: tai; reg: TRegister): Boolean;
 
 
     function OptPass1UXTB(var p: tai): Boolean;
     function OptPass1UXTB(var p: tai): Boolean;
@@ -292,10 +292,10 @@ Implementation
     end;
     end;
 
 
 
 
-  function TARMAsmOptimizer.RedundantMovProcess(var p: tai;hp1: tai):boolean;
+  function TARMAsmOptimizer.RedundantMovProcess(var p: tai; var hp1: tai):boolean;
     var
     var
       I: Integer;
       I: Integer;
-      current_hp: tai;
+      current_hp, next_hp: tai;
       LDRChange: Boolean;
       LDRChange: Boolean;
     begin
     begin
       Result:=false;
       Result:=false;
@@ -390,80 +390,80 @@ Implementation
               TransferUsedRegs(TmpUsedRegs);
               TransferUsedRegs(TmpUsedRegs);
 
 
               { Search local instruction block }
               { Search local instruction block }
-              while GetNextInstruction(current_hp, hp1) and (hp1 <> BlockEnd) and (hp1.typ = ait_instruction) do
+              while GetNextInstruction(current_hp, next_hp) and (next_hp <> BlockEnd) and (next_hp.typ = ait_instruction) do
                 begin
                 begin
                   UpdateUsedRegs(TmpUsedRegs, tai(current_hp.Next));
                   UpdateUsedRegs(TmpUsedRegs, tai(current_hp.Next));
                   LDRChange := False;
                   LDRChange := False;
 
 
-                  if (taicpu(hp1).opcode in [A_LDR,A_STR]) and (taicpu(hp1).ops = 2) then
+                  if (taicpu(next_hp).opcode in [A_LDR,A_STR]) and (taicpu(next_hp).ops = 2) then
                     begin
                     begin
 
 
                       { Change the registers from r1 to r0 }
                       { Change the registers from r1 to r0 }
-                      if (taicpu(hp1).oper[1]^.ref^.base = taicpu(p).oper[0]^.reg) and
+                      if (taicpu(next_hp).oper[1]^.ref^.base = taicpu(p).oper[0]^.reg) and
 {$ifdef ARM}
 {$ifdef ARM}
                         { This optimisation conflicts with something and raises
                         { This optimisation conflicts with something and raises
                           an access violation - needs further investigation. [Kit] }
                           an access violation - needs further investigation. [Kit] }
-                        (taicpu(hp1).opcode <> A_LDR) and
+                        (taicpu(next_hp).opcode <> A_LDR) and
 {$endif ARM}
 {$endif ARM}
                         { Don't mess around with the base register if the
                         { Don't mess around with the base register if the
                           reference is pre- or post-indexed }
                           reference is pre- or post-indexed }
-                        (taicpu(hp1).oper[1]^.ref^.addressmode = AM_OFFSET) then
+                        (taicpu(next_hp).oper[1]^.ref^.addressmode = AM_OFFSET) then
                         begin
                         begin
-                          taicpu(hp1).oper[1]^.ref^.base := taicpu(p).oper[1]^.reg;
+                          taicpu(next_hp).oper[1]^.ref^.base := taicpu(p).oper[1]^.reg;
                           LDRChange := True;
                           LDRChange := True;
                         end;
                         end;
 
 
-                      if taicpu(hp1).oper[1]^.ref^.index = taicpu(p).oper[0]^.reg then
+                      if taicpu(next_hp).oper[1]^.ref^.index = taicpu(p).oper[0]^.reg then
                         begin
                         begin
-                          taicpu(hp1).oper[1]^.ref^.index := taicpu(p).oper[1]^.reg;
+                          taicpu(next_hp).oper[1]^.ref^.index := taicpu(p).oper[1]^.reg;
                           LDRChange := True;
                           LDRChange := True;
                         end;
                         end;
 
 
                       if LDRChange then
                       if LDRChange then
-                        DebugMsg('Peephole Optimization: ' + std_regname(taicpu(p).oper[0]^.reg) + ' = ' + std_regname(taicpu(p).oper[1]^.reg) + ' (MovLdr2Ldr 1)', hp1);
+                        DebugMsg('Peephole Optimization: ' + std_regname(taicpu(p).oper[0]^.reg) + ' = ' + std_regname(taicpu(p).oper[1]^.reg) + ' (MovLdr2Ldr 1)', next_hp);
 
 
                       { Drop out if we're dealing with pre-indexed references }
                       { Drop out if we're dealing with pre-indexed references }
-                      if (taicpu(hp1).oper[1]^.ref^.addressmode = AM_PREINDEXED) and
+                      if (taicpu(next_hp).oper[1]^.ref^.addressmode = AM_PREINDEXED) and
                         (
                         (
-                          RegInRef(taicpu(p).oper[0]^.reg, taicpu(hp1).oper[1]^.ref^) or
-                          RegInRef(taicpu(p).oper[1]^.reg, taicpu(hp1).oper[1]^.ref^)
+                          RegInRef(taicpu(p).oper[0]^.reg, taicpu(next_hp).oper[1]^.ref^) or
+                          RegInRef(taicpu(p).oper[1]^.reg, taicpu(next_hp).oper[1]^.ref^)
                         ) then
                         ) then
                         begin
                         begin
                           { Remember to update register allocations }
                           { Remember to update register allocations }
                           if LDRChange then
                           if LDRChange then
-                            AllocRegBetween(taicpu(p).oper[1]^.reg, p, hp1, UsedRegs);
+                            AllocRegBetween(taicpu(p).oper[1]^.reg, p, next_hp, UsedRegs);
 
 
                           Break;
                           Break;
                         end;
                         end;
 
 
                       { The register being stored can be potentially changed (as long as it's not the stack pointer) }
                       { The register being stored can be potentially changed (as long as it's not the stack pointer) }
-                      if (taicpu(hp1).opcode = A_STR) and (getsupreg(taicpu(p).oper[1]^.reg) <> RS_STACK_POINTER_REG) and
-                        MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[0]^.reg) then
+                      if (taicpu(next_hp).opcode = A_STR) and (getsupreg(taicpu(p).oper[1]^.reg) <> RS_STACK_POINTER_REG) and
+                        MatchOperand(taicpu(next_hp).oper[0]^, taicpu(p).oper[0]^.reg) then
                         begin
                         begin
-                          DebugMsg('Peephole Optimization: ' + std_regname(taicpu(p).oper[0]^.reg) + ' = ' + std_regname(taicpu(p).oper[1]^.reg) + ' (MovLdr2Ldr 2)', hp1);
-                          taicpu(hp1).oper[0]^.reg := taicpu(p).oper[1]^.reg;
+                          DebugMsg('Peephole Optimization: ' + std_regname(taicpu(p).oper[0]^.reg) + ' = ' + std_regname(taicpu(p).oper[1]^.reg) + ' (MovLdr2Ldr 2)', next_hp);
+                          taicpu(next_hp).oper[0]^.reg := taicpu(p).oper[1]^.reg;
                           LDRChange := True;
                           LDRChange := True;
                         end;
                         end;
 
 
                       if LDRChange and (getsupreg(taicpu(p).oper[1]^.reg) <> RS_STACK_POINTER_REG) then
                       if LDRChange and (getsupreg(taicpu(p).oper[1]^.reg) <> RS_STACK_POINTER_REG) then
                         begin
                         begin
-                          AllocRegBetween(taicpu(p).oper[1]^.reg, p, hp1, UsedRegs);
+                          AllocRegBetween(taicpu(p).oper[1]^.reg, p, next_hp, UsedRegs);
                           if (taicpu(p).oppostfix = PF_None) and
                           if (taicpu(p).oppostfix = PF_None) and
                             (
                             (
                               (
                               (
-                                (taicpu(hp1).opcode = A_LDR) and
-                                MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[0]^.reg)
+                                (taicpu(next_hp).opcode = A_LDR) and
+                                MatchOperand(taicpu(next_hp).oper[0]^, taicpu(p).oper[0]^.reg)
                               ) or
                               ) or
-                              not RegUsedAfterInstruction(taicpu(p).oper[0]^.reg, hp1, TmpUsedRegs)
+                              not RegUsedAfterInstruction(taicpu(p).oper[0]^.reg, next_hp, TmpUsedRegs)
                             ) and
                             ) and
                             { Double-check to see if the old registers were actually
                             { Double-check to see if the old registers were actually
                               changed (e.g. if the super registers matched, but not
                               changed (e.g. if the super registers matched, but not
                               the sizes, they won't be changed). }
                               the sizes, they won't be changed). }
                             (
                             (
-                              (taicpu(hp1).opcode = A_LDR) or
-                              not RegInOp(taicpu(p).oper[0]^.reg, taicpu(hp1).oper[0]^)
+                              (taicpu(next_hp).opcode = A_LDR) or
+                              not RegInOp(taicpu(p).oper[0]^.reg, taicpu(next_hp).oper[0]^)
                             ) and
                             ) and
-                            not RegInRef(taicpu(p).oper[0]^.reg, taicpu(hp1).oper[1]^.ref^) then
+                            not RegInRef(taicpu(p).oper[0]^.reg, taicpu(next_hp).oper[1]^.ref^) then
                             begin
                             begin
                               DebugMsg('Peephole Optimization: RedundantMovProcess 2a done', p);
                               DebugMsg('Peephole Optimization: RedundantMovProcess 2a done', p);
                               RemoveCurrentP(p);
                               RemoveCurrentP(p);
@@ -472,23 +472,28 @@ Implementation
                             end;
                             end;
                         end;
                         end;
                     end
                     end
-                  else if (taicpu(hp1).opcode = A_MOV) and (taicpu(hp1).oppostfix = PF_None) and
-                    (taicpu(hp1).ops = 2) then
+                  else if (taicpu(next_hp).opcode = A_MOV) and (taicpu(next_hp).oppostfix = PF_None) and
+                    (taicpu(next_hp).ops = 2) then
                     begin
                     begin
-                      if MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[0]^.reg) then
+                      if MatchOperand(taicpu(next_hp).oper[0]^, taicpu(p).oper[0]^.reg) then
                         begin
                         begin
                           { Found another mov that writes entirely to the register }
                           { Found another mov that writes entirely to the register }
-                          if RegUsedBetween(taicpu(p).oper[0]^.reg, p, hp1) then
+                          if RegUsedBetween(taicpu(p).oper[0]^.reg, p, next_hp) then
                             begin
                             begin
                               { Register was used beforehand }
                               { Register was used beforehand }
-                              if MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[1]^.reg) then
+                              if MatchOperand(taicpu(next_hp).oper[1]^, taicpu(p).oper[1]^.reg) then
                                 begin
                                 begin
                                   { This MOV is exactly the same as the first one.
                                   { This MOV is exactly the same as the first one.
                                     Since none of the registers have changed value
                                     Since none of the registers have changed value
                                     at this point, we can remove it. }
                                     at this point, we can remove it. }
-                                  DebugMsg('Peephole Optimization: RedundantMovProcess 3a done', hp1);
-                                  asml.Remove(hp1);
-                                  hp1.Free;
+                                  DebugMsg('Peephole Optimization: RedundantMovProcess 3a done', next_hp);
+
+                                  if (next_hp = hp1) then
+                                    { Don't let hp1 become a dangling pointer }
+                                    hp1 := nil;
+
+                                  asml.Remove(next_hp);
+                                  next_hp.Free;
 
 
                                   { We still have the original p, so we can continue optimising;
                                   { We still have the original p, so we can continue optimising;
                                    if it was -O2 or below, this instruction appeared immediately
                                    if it was -O2 or below, this instruction appeared immediately
@@ -504,7 +509,7 @@ Implementation
                           { We can delete the first MOV (only if the second MOV is unconditional) }
                           { We can delete the first MOV (only if the second MOV is unconditional) }
 {$ifdef ARM}
 {$ifdef ARM}
                           if (taicpu(p).oppostfix = PF_None) and
                           if (taicpu(p).oppostfix = PF_None) and
-                            (taicpu(hp1).condition = C_None) then
+                            (taicpu(next_hp).condition = C_None) then
 {$endif ARM}
 {$endif ARM}
                             begin
                             begin
                               DebugMsg('Peephole Optimization: RedundantMovProcess 2b done', p);
                               DebugMsg('Peephole Optimization: RedundantMovProcess 2b done', p);
@@ -513,9 +518,9 @@ Implementation
                             end;
                             end;
                           Exit;
                           Exit;
                         end
                         end
-                      else if MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^.reg) then
+                      else if MatchOperand(taicpu(next_hp).oper[1]^, taicpu(p).oper[0]^.reg) then
                         begin
                         begin
-                          if MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[1]^.reg)
+                          if MatchOperand(taicpu(next_hp).oper[0]^, taicpu(p).oper[1]^.reg)
                             { Be careful - if the entire register is not used, removing this
                             { Be careful - if the entire register is not used, removing this
                               instruction will leave the unused part uninitialised }
                               instruction will leave the unused part uninitialised }
 {$ifdef AARCH64}
 {$ifdef AARCH64}
@@ -524,9 +529,14 @@ Implementation
                             then
                             then
                             begin
                             begin
                               { Instruction will become mov r1,r1 }
                               { Instruction will become mov r1,r1 }
-                              DebugMsg('Peephole Optimization: Mov2None 2 done', hp1);
-                              asml.Remove(hp1);
-                              hp1.Free;
+                              DebugMsg('Peephole Optimization: Mov2None 2 done', next_hp);
+
+                              if (next_hp = hp1) then
+                                { Don't let hp1 become a dangling pointer }
+                                hp1 := nil;
+
+                              asml.Remove(next_hp);
+                              next_hp.Free;
                               Continue;
                               Continue;
                             end;
                             end;
 
 
@@ -534,12 +544,12 @@ Implementation
                             forces it to be left alone if the full register is not
                             forces it to be left alone if the full register is not
                             used, lest mov w1,w1 gets optimised out by mistake. [Kit] }
                             used, lest mov w1,w1 gets optimised out by mistake. [Kit] }
 {$ifdef AARCH64}
 {$ifdef AARCH64}
-                          if not MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[1]^.reg) then
+                          if not MatchOperand(taicpu(next_hp).oper[0]^, taicpu(p).oper[1]^.reg) then
 {$endif AARCH64}
 {$endif AARCH64}
                             begin
                             begin
-                              DebugMsg('Peephole Optimization: ' + std_regname(taicpu(p).oper[0]^.reg) + ' = ' + std_regname(taicpu(p).oper[1]^.reg) + ' (MovMov2Mov 2)', hp1);
-                              taicpu(hp1).oper[1]^.reg := taicpu(p).oper[1]^.reg;
-                              AllocRegBetween(taicpu(p).oper[1]^.reg, p, hp1, UsedRegs);
+                              DebugMsg('Peephole Optimization: ' + std_regname(taicpu(p).oper[0]^.reg) + ' = ' + std_regname(taicpu(p).oper[1]^.reg) + ' (MovMov2Mov 2)', next_hp);
+                              taicpu(next_hp).oper[1]^.reg := taicpu(p).oper[1]^.reg;
+                              AllocRegBetween(taicpu(p).oper[1]^.reg, p, next_hp, UsedRegs);
 
 
                               { If this was the only reference to the old register,
                               { If this was the only reference to the old register,
                                 then we can remove the original MOV now }
                                 then we can remove the original MOV now }
@@ -551,7 +561,7 @@ Implementation
                                   register). [Kit] }
                                   register). [Kit] }
                                 (getsupreg(taicpu(p).oper[1]^.reg) <> RS_STACK_POINTER_REG) and
                                 (getsupreg(taicpu(p).oper[1]^.reg) <> RS_STACK_POINTER_REG) and
                                 RegInUsedRegs(taicpu(p).oper[0]^.reg, UsedRegs) and
                                 RegInUsedRegs(taicpu(p).oper[0]^.reg, UsedRegs) and
-                                not RegUsedAfterInstruction(taicpu(p).oper[0]^.reg, hp1, TmpUsedRegs) then
+                                not RegUsedAfterInstruction(taicpu(p).oper[0]^.reg, next_hp, TmpUsedRegs) then
                                 begin
                                 begin
                                   DebugMsg('Peephole Optimization: RedundantMovProcess 2c done', p);
                                   DebugMsg('Peephole Optimization: RedundantMovProcess 2c done', p);
                                   RemoveCurrentP(p);
                                   RemoveCurrentP(p);
@@ -565,14 +575,14 @@ Implementation
                   { On low optimisation settions, don't search more than one instruction ahead }
                   { On low optimisation settions, don't search more than one instruction ahead }
                   if not(cs_opt_level3 in current_settings.optimizerswitches) or
                   if not(cs_opt_level3 in current_settings.optimizerswitches) or
                     { Stop at procedure calls and jumps }
                     { Stop at procedure calls and jumps }
-                    is_calljmp(taicpu(hp1).opcode) or
+                    is_calljmp(taicpu(next_hp).opcode) or
                     { If the read register has changed value, or the MOV
                     { If the read register has changed value, or the MOV
                       destination register has been used, drop out }
                       destination register has been used, drop out }
-                    RegInInstruction(taicpu(p).oper[0]^.reg, hp1) or
-                    RegModifiedByInstruction(taicpu(p).oper[1]^.reg, hp1) then
+                    RegInInstruction(taicpu(p).oper[0]^.reg, next_hp) or
+                    RegModifiedByInstruction(taicpu(p).oper[1]^.reg, next_hp) then
                     Break;
                     Break;
 
 
-                  current_hp := hp1;
+                  current_hp := next_hp;
                 end;
                 end;
             end;
             end;
         end;
         end;