Browse Source

* patch by J. Gareth Moreton: fixes crash on ARM with -CriotR, resolves #38116

git-svn-id: trunk@47531 -
florian 4 years ago
parent
commit
1014e53081
2 changed files with 520 additions and 502 deletions
  1. 460 452
      compiler/arm/aoptcpu.pas
  2. 60 50
      compiler/armgen/aoptarm.pas

+ 460 - 452
compiler/arm/aoptcpu.pas

@@ -1284,504 +1284,512 @@ Implementation
 
       { All the optimisations from this point on require GetNextInstructionUsingReg
         to return True }
-      if not (
+      while (
         GetNextInstructionUsingReg(p, hpfar1, taicpu(p).oper[0]^.reg) and
         (hpfar1.typ = ait_instruction)
-      ) then
-        Exit;
+      ) do
+        begin
 
-      { Change the common
-        mov r0, r0, lsr #xxx
-        and r0, r0, #yyy/bic r0, r0, #xxx
+          { Change the common
+            mov r0, r0, lsr #xxx
+            and r0, r0, #yyy/bic r0, r0, #xxx
 
-        and remove the superfluous and/bic if possible
+            and remove the superfluous and/bic if possible
 
-        This could be extended to handle more cases.
-      }
+            This could be extended to handle more cases.
+          }
 
-      { Change
-        mov rx, ry, lsr/ror #xxx
-        uxtb/uxth rz,rx/and rz,rx,0xFF
-        dealloc rx
+          { Change
+            mov rx, ry, lsr/ror #xxx
+            uxtb/uxth rz,rx/and rz,rx,0xFF
+            dealloc rx
 
-        to
+            to
 
-        uxtb/uxth rz,ry,ror #xxx
-      }
-      if (GenerateThumb2Code) and
-         (taicpu(p).ops=3) and
-         (taicpu(p).oper[2]^.typ = top_shifterop) and
-         (taicpu(p).oper[2]^.shifterop^.rs = NR_NO) and
-         (taicpu(p).oper[2]^.shifterop^.shiftmode in [SM_LSR,SM_ROR]) and
-         RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hpfar1)) then
-         begin
-           if MatchInstruction(hpfar1, A_UXTB, [C_None], [PF_None]) and
-             (taicpu(hpfar1).ops = 2) and
-             (taicpu(p).oper[2]^.shifterop^.shiftimm in [8,16,24]) and
-             MatchOperand(taicpu(hpfar1).oper[1]^, taicpu(p).oper[0]^.reg) then
-             begin
-               taicpu(hpfar1).oper[1]^.reg := taicpu(p).oper[1]^.reg;
-               taicpu(hpfar1).loadshifterop(2,taicpu(p).oper[2]^.shifterop^);
-               taicpu(hpfar1).oper[2]^.shifterop^.shiftmode:=SM_ROR;
-               taicpu(hpfar1).ops := 3;
-
-               if not Assigned(hp1) then
-                 GetNextInstruction(p,hp1);
-
-               RemoveCurrentP(p, hp1);
-
-               result:=true;
-               exit;
-             end
-           else if MatchInstruction(hpfar1, A_UXTH, [C_None], [PF_None]) and
-             (taicpu(hpfar1).ops=2) and
-             (taicpu(p).oper[2]^.shifterop^.shiftimm in [16]) and
-             MatchOperand(taicpu(hpfar1).oper[1]^, taicpu(p).oper[0]^.reg) then
-             begin
-               taicpu(hpfar1).oper[1]^.reg := taicpu(p).oper[1]^.reg;
-               taicpu(hpfar1).loadshifterop(2,taicpu(p).oper[2]^.shifterop^);
-               taicpu(hpfar1).oper[2]^.shifterop^.shiftmode:=SM_ROR;
-               taicpu(hpfar1).ops := 3;
-
-               if not Assigned(hp1) then
-                 GetNextInstruction(p,hp1);
-
-               RemoveCurrentP(p, hp1);
-
-               result:=true;
-               exit;
-             end
-           else if MatchInstruction(hpfar1, A_AND, [C_None], [PF_None]) and
-             (taicpu(hpfar1).ops = 3) and
-             (taicpu(hpfar1).oper[2]^.typ = top_const) and
-             (taicpu(hpfar1).oper[2]^.val = $FF) and
-             (taicpu(p).oper[2]^.shifterop^.shiftimm in [8,16,24]) and
-             MatchOperand(taicpu(hpfar1).oper[1]^, taicpu(p).oper[0]^.reg) then
+            uxtb/uxth rz,ry,ror #xxx
+          }
+          if (GenerateThumb2Code) and
+             (taicpu(p).ops=3) and
+             (taicpu(p).oper[2]^.typ = top_shifterop) and
+             (taicpu(p).oper[2]^.shifterop^.rs = NR_NO) and
+             (taicpu(p).oper[2]^.shifterop^.shiftmode in [SM_LSR,SM_ROR]) and
+             RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hpfar1)) then
              begin
-               taicpu(hpfar1).ops := 3;
-               taicpu(hpfar1).opcode := A_UXTB;
-               taicpu(hpfar1).oper[1]^.reg := taicpu(p).oper[1]^.reg;
-               taicpu(hpfar1).loadshifterop(2,taicpu(p).oper[2]^.shifterop^);
-               taicpu(hpfar1).oper[2]^.shifterop^.shiftmode:=SM_ROR;
-
-               if not Assigned(hp1) then
-                 GetNextInstruction(p,hp1);
-
-               RemoveCurrentP(p, hp1);
-
-               result:=true;
-               exit;
+               if MatchInstruction(hpfar1, A_UXTB, [C_None], [PF_None]) and
+                 (taicpu(hpfar1).ops = 2) and
+                 (taicpu(p).oper[2]^.shifterop^.shiftimm in [8,16,24]) and
+                 MatchOperand(taicpu(hpfar1).oper[1]^, taicpu(p).oper[0]^.reg) then
+                 begin
+                   taicpu(hpfar1).oper[1]^.reg := taicpu(p).oper[1]^.reg;
+                   taicpu(hpfar1).loadshifterop(2,taicpu(p).oper[2]^.shifterop^);
+                   taicpu(hpfar1).oper[2]^.shifterop^.shiftmode:=SM_ROR;
+                   taicpu(hpfar1).ops := 3;
+
+                   if not Assigned(hp1) then
+                     GetNextInstruction(p,hp1);
+
+                   RemoveCurrentP(p, hp1);
+
+                   result:=true;
+                   exit;
+                 end
+               else if MatchInstruction(hpfar1, A_UXTH, [C_None], [PF_None]) and
+                 (taicpu(hpfar1).ops=2) and
+                 (taicpu(p).oper[2]^.shifterop^.shiftimm in [16]) and
+                 MatchOperand(taicpu(hpfar1).oper[1]^, taicpu(p).oper[0]^.reg) then
+                 begin
+                   taicpu(hpfar1).oper[1]^.reg := taicpu(p).oper[1]^.reg;
+                   taicpu(hpfar1).loadshifterop(2,taicpu(p).oper[2]^.shifterop^);
+                   taicpu(hpfar1).oper[2]^.shifterop^.shiftmode:=SM_ROR;
+                   taicpu(hpfar1).ops := 3;
+
+                   if not Assigned(hp1) then
+                     GetNextInstruction(p,hp1);
+
+                   RemoveCurrentP(p, hp1);
+
+                   result:=true;
+                   exit;
+                 end
+               else if MatchInstruction(hpfar1, A_AND, [C_None], [PF_None]) and
+                 (taicpu(hpfar1).ops = 3) and
+                 (taicpu(hpfar1).oper[2]^.typ = top_const) and
+                 (taicpu(hpfar1).oper[2]^.val = $FF) and
+                 (taicpu(p).oper[2]^.shifterop^.shiftimm in [8,16,24]) and
+                 MatchOperand(taicpu(hpfar1).oper[1]^, taicpu(p).oper[0]^.reg) then
+                 begin
+                   taicpu(hpfar1).ops := 3;
+                   taicpu(hpfar1).opcode := A_UXTB;
+                   taicpu(hpfar1).oper[1]^.reg := taicpu(p).oper[1]^.reg;
+                   taicpu(hpfar1).loadshifterop(2,taicpu(p).oper[2]^.shifterop^);
+                   taicpu(hpfar1).oper[2]^.shifterop^.shiftmode:=SM_ROR;
+
+                   if not Assigned(hp1) then
+                     GetNextInstruction(p,hp1);
+
+                   RemoveCurrentP(p, hp1);
+
+                   result:=true;
+                   exit;
+                 end;
              end;
-         end;
 
-      { 2-operald mov optimisations }
-      if (taicpu(p).ops = 2) then
-        begin
-          {
-            This removes the mul from
-            mov rX,0
-            ...
-            mul ...,rX,...
-          }
-          if (taicpu(p).oper[1]^.typ = top_const) then
+          { 2-operand mov optimisations }
+          if (taicpu(p).ops = 2) then
             begin
-(*          if false and
-            (taicpu(p).oper[1]^.val=0) and
-            MatchInstruction(hpfar1, [A_MUL,A_MLA], [taicpu(p).condition], [taicpu(p).oppostfix]) and
-            (((taicpu(hpfar1).oper[1]^.typ=top_reg) and MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[1]^)) or
-             ((taicpu(hpfar1).oper[2]^.typ=top_reg) and MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[2]^))) then
-              begin
-                TransferUsedRegs(TmpUsedRegs);
-                UpdateUsedRegs(TmpUsedRegs, tai(p.next));
-                UpdateUsedRegs(TmpUsedRegs, tai(hpfar1.next));
-                DebugMsg('Peephole Optimization: MovMUL/MLA2Mov0 done', p);
-                if taicpu(hpfar1).opcode=A_MUL then
-                  taicpu(hpfar1).loadconst(1,0)
-                else
-                  taicpu(hpfar1).loadreg(1,taicpu(hpfar1).oper[3]^.reg);
-                taicpu(hpfar1).ops:=2;
-                taicpu(hpfar1).opcode:=A_MOV;
-                if not(RegUsedAfterInstruction(taicpu(p).oper[0]^.reg,hpfar1,TmpUsedRegs)) then
-                  RemoveCurrentP(p);
-                Result:=true;
-                exit;
-              end
-          else*) if (taicpu(p).oper[1]^.val=0) and
-              MatchInstruction(hpfar1, A_MLA, [taicpu(p).condition], [taicpu(p).oppostfix]) and
-              MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[3]^) then
+              {
+                This removes the mul from
+                mov rX,0
+                ...
+                mul ...,rX,...
+              }
+              if (taicpu(p).oper[1]^.typ = top_const) then
                 begin
-                  TransferUsedRegs(TmpUsedRegs);
-                  UpdateUsedRegs(TmpUsedRegs, tai(p.next));
-                  UpdateUsedRegs(TmpUsedRegs, tai(hpfar1.next));
-                  DebugMsg('Peephole Optimization: MovMLA2MUL 1 done', p);
-                  taicpu(hpfar1).ops:=3;
-                  taicpu(hpfar1).opcode:=A_MUL;
-                  if not(RegUsedAfterInstruction(taicpu(p).oper[0]^.reg,hpfar1,TmpUsedRegs)) then
-                    begin
+    (*          if false and
+                (taicpu(p).oper[1]^.val=0) and
+                MatchInstruction(hpfar1, [A_MUL,A_MLA], [taicpu(p).condition], [taicpu(p).oppostfix]) and
+                (((taicpu(hpfar1).oper[1]^.typ=top_reg) and MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[1]^)) or
+                 ((taicpu(hpfar1).oper[2]^.typ=top_reg) and MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[2]^))) then
+                  begin
+                    TransferUsedRegs(TmpUsedRegs);
+                    UpdateUsedRegs(TmpUsedRegs, tai(p.next));
+                    UpdateUsedRegs(TmpUsedRegs, tai(hpfar1.next));
+                    DebugMsg('Peephole Optimization: MovMUL/MLA2Mov0 done', p);
+                    if taicpu(hpfar1).opcode=A_MUL then
+                      taicpu(hpfar1).loadconst(1,0)
+                    else
+                      taicpu(hpfar1).loadreg(1,taicpu(hpfar1).oper[3]^.reg);
+                    taicpu(hpfar1).ops:=2;
+                    taicpu(hpfar1).opcode:=A_MOV;
+                    if not(RegUsedAfterInstruction(taicpu(p).oper[0]^.reg,hpfar1,TmpUsedRegs)) then
                       RemoveCurrentP(p);
-                      Result:=true;
-                    end;
-                  exit;
-                end
-            {
-              This changes the very common
-              mov r0, #0
-              str r0, [...]
-              mov r0, #0
-              str r0, [...]
-
-              and removes all superfluous mov instructions
-            }
-            else if (taicpu(hpfar1).opcode=A_STR) then
-              begin
-                hp1 := hpfar1;
-                while MatchInstruction(hp1, A_STR, [taicpu(p).condition], []) and
-                      MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[0]^) and
-                      GetNextInstruction(hp1, hp2) and
-                      MatchInstruction(hp2, A_MOV, [taicpu(p).condition], [PF_None]) and
-                      (taicpu(hp2).ops = 2) and
-                      MatchOperand(taicpu(hp2).oper[0]^, taicpu(p).oper[0]^) and
-                      MatchOperand(taicpu(hp2).oper[1]^, taicpu(p).oper[1]^) do
+                    Result:=true;
+                    exit;
+                  end
+              else*) if (taicpu(p).oper[1]^.val=0) and
+                  MatchInstruction(hpfar1, A_MLA, [taicpu(p).condition], [taicpu(p).oppostfix]) and
+                  MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[3]^) then
+                    begin
+                      TransferUsedRegs(TmpUsedRegs);
+                      UpdateUsedRegs(TmpUsedRegs, tai(p.next));
+                      UpdateUsedRegs(TmpUsedRegs, tai(hpfar1.next));
+                      DebugMsg('Peephole Optimization: MovMLA2MUL 1 done', p);
+                      taicpu(hpfar1).ops:=3;
+                      taicpu(hpfar1).opcode:=A_MUL;
+                      if not(RegUsedAfterInstruction(taicpu(p).oper[0]^.reg,hpfar1,TmpUsedRegs)) then
+                        begin
+                          RemoveCurrentP(p);
+                          Result:=true;
+                        end;
+                      exit;
+                    end
+                {
+                  This changes the very common
+                  mov r0, #0
+                  str r0, [...]
+                  mov r0, #0
+                  str r0, [...]
+
+                  and removes all superfluous mov instructions
+                }
+                else if (taicpu(hpfar1).opcode=A_STR) then
                   begin
-                    DebugMsg('Peephole Optimization: MovStrMov done', hp2);
-                    GetNextInstruction(hp2,hp1);
-                    asml.remove(hp2);
-                    hp2.free;
-                    result:=true;
-                    if not assigned(hp1) then break;
-                  end;
+                    hp1 := hpfar1;
+                    while MatchInstruction(hp1, A_STR, [taicpu(p).condition], []) and
+                          MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[0]^) and
+                          GetNextInstruction(hp1, hp2) and
+                          MatchInstruction(hp2, A_MOV, [taicpu(p).condition], [PF_None]) and
+                          (taicpu(hp2).ops = 2) and
+                          MatchOperand(taicpu(hp2).oper[0]^, taicpu(p).oper[0]^) and
+                          MatchOperand(taicpu(hp2).oper[1]^, taicpu(p).oper[1]^) do
+                      begin
+                        DebugMsg('Peephole Optimization: MovStrMov done', hp2);
+                        GetNextInstruction(hp2,hp1);
+                        asml.remove(hp2);
+                        hp2.free;
+                        result:=true;
+                        if not assigned(hp1) then break;
+                      end;
 
-                if Result then
-                  Exit;
-              end;
-            end;
-          {
-            This removes the first mov from
-            mov rX,...
-            mov rX,...
-          }
-          if taicpu(hpfar1).opcode=A_MOV then
-            begin
-              hp1 := p;
-              while MatchInstruction(hpfar1, A_MOV, [taicpu(hp1).condition], [taicpu(hp1).oppostfix]) and
-                    (taicpu(hpfar1).ops = 2) and
-                    MatchOperand(taicpu(hp1).oper[0]^, taicpu(hpfar1).oper[0]^) and
-                    { don't remove the first mov if the second is a mov rX,rX }
-                    not(MatchOperand(taicpu(hpfar1).oper[0]^, taicpu(hpfar1).oper[1]^)) do
+                    if Result then
+                      Exit;
+                  end;
+                end;
+              {
+                This removes the first mov from
+                mov rX,...
+                mov rX,...
+              }
+              if taicpu(hpfar1).opcode=A_MOV then
                 begin
-                  { Defer removing the first p until after the while loop }
-                  if p <> hp1 then
+                  hp1 := p;
+                  while MatchInstruction(hpfar1, A_MOV, [taicpu(hp1).condition], [taicpu(hp1).oppostfix]) and
+                        (taicpu(hpfar1).ops = 2) and
+                        MatchOperand(taicpu(hp1).oper[0]^, taicpu(hpfar1).oper[0]^) and
+                        { don't remove the first mov if the second is a mov rX,rX }
+                        not(MatchOperand(taicpu(hpfar1).oper[0]^, taicpu(hpfar1).oper[1]^)) do
                     begin
-                      DebugMsg('Peephole Optimization: MovMov done', hp1);
-                      asml.remove(hp1);
-                      hp1.free;
+                      { Defer removing the first p until after the while loop }
+                      if p <> hp1 then
+                        begin
+                          DebugMsg('Peephole Optimization: MovMov done', hp1);
+                          asml.remove(hp1);
+                          hp1.free;
+                        end;
+                      hp1:=hpfar1;
+                      GetNextInstruction(hpfar1,hpfar1);
+                      result:=true;
+                      if not assigned(hpfar1) then
+                        Break;
+                    end;
+
+                  if Result then
+                    begin
+                      DebugMsg('Peephole Optimization: MovMov done', p);
+                      RemoveCurrentp(p);
+                      Exit;
                     end;
-                  hp1:=hpfar1;
-                  GetNextInstruction(hpfar1,hpfar1);
-                  result:=true;
-                  if not assigned(hpfar1) then
-                    Break;
                 end;
 
-              if Result then
+              if RedundantMovProcess(p,hpfar1) then
                 begin
-                  DebugMsg('Peephole Optimization: MovMov done', p);
-                  RemoveCurrentp(p);
-                  Exit;
+                  Result:=true;
+                  { p might not point at a mov anymore }
+                  exit;
                 end;
-            end;
 
-          if RedundantMovProcess(p,hpfar1) then
-            begin
-              Result:=true;
-              { p might not point at a mov anymore }
-              exit;
-            end;
+              { If hpfar1 is nil after the call to RedundantMovProcess, it is
+                because it would have become a dangling pointer, so reinitialise it. }
+              if not Assigned(hpfar1) then
+                Continue;
 
-          { Fold the very common sequence
-              mov  regA, regB
-              ldr* regA, [regA]
-            to
-              ldr* regA, [regB]
-            CAUTION! If this one is successful p might not be a mov instruction anymore!
-          }
-          if
-             // Make sure that Thumb code doesn't propagate a high register into a reference
-             (
-               (
-                 GenerateThumbCode and
-                 (getsupreg(taicpu(p).oper[1]^.reg) < RS_R8)
-               ) or (not GenerateThumbCode)
-             ) and
-             (taicpu(p).oper[1]^.typ = top_reg) and
-             (taicpu(p).oppostfix = PF_NONE) and
-             MatchInstruction(hpfar1, [A_LDR, A_STR], [taicpu(p).condition], []) and
-             (taicpu(hpfar1).oper[1]^.typ = top_ref) and
-             { We can change the base register only when the instruction uses AM_OFFSET }
-             ((taicpu(hpfar1).oper[1]^.ref^.index = taicpu(p).oper[0]^.reg) or
-               ((taicpu(hpfar1).oper[1]^.ref^.addressmode = AM_OFFSET) and
-                (taicpu(hpfar1).oper[1]^.ref^.base = taicpu(p).oper[0]^.reg))
-             ) and
-             not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hpfar1)) and
-             RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hpfar1)) then
-            begin
-              DebugMsg('Peephole Optimization: MovLdr2Ldr done', hpfar1);
-              if (taicpu(hpfar1).oper[1]^.ref^.addressmode = AM_OFFSET) and
-                 (taicpu(hpfar1).oper[1]^.ref^.base = taicpu(p).oper[0]^.reg) then
-                taicpu(hpfar1).oper[1]^.ref^.base := taicpu(p).oper[1]^.reg;
-
-              if taicpu(hpfar1).oper[1]^.ref^.index = taicpu(p).oper[0]^.reg then
-                taicpu(hpfar1).oper[1]^.ref^.index := taicpu(p).oper[1]^.reg;
-
-              dealloc:=FindRegDeAlloc(taicpu(p).oper[1]^.reg, tai(p.Next));
-              if Assigned(dealloc) then
+              { Fold the very common sequence
+                  mov  regA, regB
+                  ldr* regA, [regA]
+                to
+                  ldr* regA, [regB]
+                CAUTION! If this one is successful p might not be a mov instruction anymore!
+              }
+              if
+                 // Make sure that Thumb code doesn't propagate a high register into a reference
+                 (
+                   (
+                     GenerateThumbCode and
+                     (getsupreg(taicpu(p).oper[1]^.reg) < RS_R8)
+                   ) or (not GenerateThumbCode)
+                 ) and
+                 (taicpu(p).oper[1]^.typ = top_reg) and
+                 (taicpu(p).oppostfix = PF_NONE) and
+                 MatchInstruction(hpfar1, [A_LDR, A_STR], [taicpu(p).condition], []) and
+                 (taicpu(hpfar1).oper[1]^.typ = top_ref) and
+                 { We can change the base register only when the instruction uses AM_OFFSET }
+                 ((taicpu(hpfar1).oper[1]^.ref^.index = taicpu(p).oper[0]^.reg) or
+                   ((taicpu(hpfar1).oper[1]^.ref^.addressmode = AM_OFFSET) and
+                    (taicpu(hpfar1).oper[1]^.ref^.base = taicpu(p).oper[0]^.reg))
+                 ) and
+                 not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hpfar1)) and
+                 RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hpfar1)) then
                 begin
-                  asml.remove(dealloc);
-                  asml.InsertAfter(dealloc,hpfar1);
-                end;
+                  DebugMsg('Peephole Optimization: MovLdr2Ldr done', hpfar1);
+                  if (taicpu(hpfar1).oper[1]^.ref^.addressmode = AM_OFFSET) and
+                     (taicpu(hpfar1).oper[1]^.ref^.base = taicpu(p).oper[0]^.reg) then
+                    taicpu(hpfar1).oper[1]^.ref^.base := taicpu(p).oper[1]^.reg;
 
-              if not Assigned(hp1) then
-                GetNextInstruction(p, hp1);
+                  if taicpu(hpfar1).oper[1]^.ref^.index = taicpu(p).oper[0]^.reg then
+                    taicpu(hpfar1).oper[1]^.ref^.index := taicpu(p).oper[1]^.reg;
 
-              RemoveCurrentP(p, hp1);
+                  dealloc:=FindRegDeAlloc(taicpu(p).oper[1]^.reg, tai(p.Next));
+                  if Assigned(dealloc) then
+                    begin
+                      asml.remove(dealloc);
+                      asml.InsertAfter(dealloc,hpfar1);
+                    end;
 
-              result:=true;
-              Exit;
-            end
-        end
+                  if not Assigned(hp1) then
+                    GetNextInstruction(p, hp1);
 
-      { 3-operald mov optimisations }
-      else if (taicpu(p).ops = 3) then
-        begin
+                  RemoveCurrentP(p, hp1);
 
-          if (taicpu(p).oper[2]^.typ = top_shifterop) and
-            (taicpu(p).oper[2]^.shifterop^.rs = NR_NO) and
-            (taicpu(p).oper[2]^.shifterop^.shiftmode = SM_LSR) and
-            (taicpu(hpfar1).ops>=1) and
-            (taicpu(hpfar1).oper[0]^.typ=top_reg) and
-            (not RegModifiedBetween(taicpu(hpfar1).oper[0]^.reg, p, hpfar1)) and
-            RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hpfar1)) then
-            begin
-              if (taicpu(p).oper[2]^.shifterop^.shiftimm >= 24 ) and
-                MatchInstruction(hpfar1, A_AND, [taicpu(p).condition], [taicpu(p).oppostfix]) and
-                (taicpu(hpfar1).ops=3) and
-                MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[1]^) and
-                (taicpu(hpfar1).oper[2]^.typ = top_const) and
-                { Check if the AND actually would only mask out bits being already zero because of the shift
-                }
-                ((($ffffffff shr taicpu(p).oper[2]^.shifterop^.shiftimm) and taicpu(hpfar1).oper[2]^.val) =
-                  ($ffffffff shr taicpu(p).oper[2]^.shifterop^.shiftimm)) then
-                begin
-                  DebugMsg('Peephole Optimization: LsrAnd2Lsr done', hpfar1);
-                  taicpu(p).oper[0]^.reg:=taicpu(hpfar1).oper[0]^.reg;
-                  asml.remove(hpfar1);
-                  hpfar1.free;
                   result:=true;
                   Exit;
                 end
-              else if MatchInstruction(hpfar1, A_BIC, [taicpu(p).condition], [taicpu(p).oppostfix]) and
-                (taicpu(hpfar1).ops=3) and
-                MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[1]^) and
-                (taicpu(hpfar1).oper[2]^.typ = top_const) and
-                { Check if the BIC actually would only mask out bits beeing already zero because of the shift }
-                (taicpu(hpfar1).oper[2]^.val<>0) and
-                (BsfDWord(taicpu(hpfar1).oper[2]^.val)>=32-taicpu(p).oper[2]^.shifterop^.shiftimm) then
-                begin
-                  DebugMsg('Peephole Optimization: LsrBic2Lsr done', hpfar1);
-                  taicpu(p).oper[0]^.reg:=taicpu(hpfar1).oper[0]^.reg;
-                  asml.remove(hpfar1);
-                  hpfar1.free;
-                  result:=true;
-                  Exit;
-                end;
-            end;
-          { This folds shifterops into following instructions
-            mov r0, r1, lsl #8
-            add r2, r3, r0
+            end
 
-            to
+          { 3-operand mov optimisations }
+          else if (taicpu(p).ops = 3) then
+            begin
 
-            add r2, r3, r1, lsl #8
-            CAUTION! If this one is successful p might not be a mov instruction anymore!
-          }
-          if (taicpu(p).oper[1]^.typ = top_reg) and
-           (taicpu(p).oper[2]^.typ = top_shifterop) and
-           (taicpu(p).oppostfix = PF_NONE) and
-           MatchInstruction(hpfar1, [A_ADD, A_ADC, A_RSB, A_RSC, A_SUB, A_SBC,
-                                  A_AND, A_BIC, A_EOR, A_ORR, A_TEQ, A_TST,
-                                  A_CMP, A_CMN],
-                            [taicpu(p).condition], [PF_None]) and
-           (not ((GenerateThumb2Code) and
-                 (taicpu(hpfar1).opcode in [A_SBC]) and
-                 (((taicpu(hpfar1).ops=3) and
-                   MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[1]^.reg)) or
-                  ((taicpu(hpfar1).ops=2) and
-                   MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[0]^.reg))))) and
-           RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hpfar1)) and
-           (taicpu(hpfar1).ops >= 2) and
-           {Currently we can't fold into another shifterop}
-           (taicpu(hpfar1).oper[taicpu(hpfar1).ops-1]^.typ = top_reg) and
-           {Folding rrx is problematic because of the C-Flag, as we currently can't check
-            NR_DEFAULTFLAGS for modification}
-           (
-             {Everything is fine if we don't use RRX}
-             (taicpu(p).oper[2]^.shifterop^.shiftmode <> SM_RRX) or
-             (
-               {If it is RRX, then check if we're just accessing the next instruction}
-               Assigned(hp1) and
-               (hpfar1 = hp1)
-             )
-           ) and
-           { reg1 might not be modified inbetween }
-           not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hpfar1)) and
-           { The shifterop can contain a register, might not be modified}
-           (
-             (taicpu(p).oper[2]^.shifterop^.rs = NR_NO) or
-             not(RegModifiedBetween(taicpu(p).oper[2]^.shifterop^.rs, p, hpfar1))
-           ) and
-           (
-             {Only ONE of the two src operands is allowed to match}
-             MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[taicpu(hpfar1).ops-2]^) xor
-             MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[taicpu(hpfar1).ops-1]^)
-           ) then
-          begin
-            if taicpu(hpfar1).opcode in [A_TST, A_TEQ, A_CMN] then
-              I2:=0
-            else
-              I2:=1;
-            for I:=I2 to taicpu(hpfar1).ops-1 do
-              if MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[I]^.reg) then
+              if (taicpu(p).oper[2]^.typ = top_shifterop) and
+                (taicpu(p).oper[2]^.shifterop^.rs = NR_NO) and
+                (taicpu(p).oper[2]^.shifterop^.shiftmode = SM_LSR) and
+                (taicpu(hpfar1).ops>=1) and
+                (taicpu(hpfar1).oper[0]^.typ=top_reg) and
+                (not RegModifiedBetween(taicpu(hpfar1).oper[0]^.reg, p, hpfar1)) and
+                RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hpfar1)) then
                 begin
-                  { If the parameter matched on the second op from the RIGHT
-                    we have to switch the parameters, this will not happen for CMP
-                    were we're only evaluating the most right parameter
-                  }
-                  if I <> taicpu(hpfar1).ops-1 then
+                  if (taicpu(p).oper[2]^.shifterop^.shiftimm >= 24 ) and
+                    MatchInstruction(hpfar1, A_AND, [taicpu(p).condition], [taicpu(p).oppostfix]) and
+                    (taicpu(hpfar1).ops=3) and
+                    MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[1]^) and
+                    (taicpu(hpfar1).oper[2]^.typ = top_const) and
+                    { Check if the AND actually would only mask out bits being already zero because of the shift
+                    }
+                    ((($ffffffff shr taicpu(p).oper[2]^.shifterop^.shiftimm) and taicpu(hpfar1).oper[2]^.val) =
+                      ($ffffffff shr taicpu(p).oper[2]^.shifterop^.shiftimm)) then
                     begin
-                      {The SUB operators need to be changed when we swap parameters}
-                      case taicpu(hpfar1).opcode of
-                        A_SUB: tempop:=A_RSB;
-                        A_SBC: tempop:=A_RSC;
-                        A_RSB: tempop:=A_SUB;
-                        A_RSC: tempop:=A_SBC;
-                        else tempop:=taicpu(hpfar1).opcode;
-                      end;
-                      if taicpu(hpfar1).ops = 3 then
-                        hp2:=taicpu.op_reg_reg_reg_shifterop(tempop,
-                             taicpu(hpfar1).oper[0]^.reg, taicpu(hpfar1).oper[2]^.reg,
-                             taicpu(p).oper[1]^.reg, taicpu(p).oper[2]^.shifterop^)
-                      else
-                        hp2:=taicpu.op_reg_reg_shifterop(tempop,
-                             taicpu(hpfar1).oper[0]^.reg, taicpu(p).oper[1]^.reg,
-                             taicpu(p).oper[2]^.shifterop^);
+                      DebugMsg('Peephole Optimization: LsrAnd2Lsr done', hpfar1);
+                      taicpu(p).oper[0]^.reg:=taicpu(hpfar1).oper[0]^.reg;
+                      asml.remove(hpfar1);
+                      hpfar1.free;
+                      result:=true;
+                      Exit;
                     end
-                  else
-                    if taicpu(hpfar1).ops = 3 then
-                      hp2:=taicpu.op_reg_reg_reg_shifterop(taicpu(hpfar1).opcode,
-                           taicpu(hpfar1).oper[0]^.reg, taicpu(hpfar1).oper[1]^.reg,
-                           taicpu(p).oper[1]^.reg, taicpu(p).oper[2]^.shifterop^)
-                    else
-                      hp2:=taicpu.op_reg_reg_shifterop(taicpu(hpfar1).opcode,
-                           taicpu(hpfar1).oper[0]^.reg, taicpu(p).oper[1]^.reg,
-                           taicpu(p).oper[2]^.shifterop^);
-                  if taicpu(p).oper[2]^.shifterop^.rs<>NR_NO then
-                    AllocRegBetween(taicpu(p).oper[2]^.shifterop^.rs,p,hpfar1,UsedRegs);
-                  AllocRegBetween(taicpu(p).oper[1]^.reg,p,hpfar1,UsedRegs);
-                  asml.insertbefore(hp2, hpfar1);
-                  asml.remove(hpfar1);
-                  hpfar1.free;
-                  DebugMsg('Peephole Optimization: FoldShiftProcess done', hp2);
+                  else if MatchInstruction(hpfar1, A_BIC, [taicpu(p).condition], [taicpu(p).oppostfix]) and
+                    (taicpu(hpfar1).ops=3) and
+                    MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[1]^) and
+                    (taicpu(hpfar1).oper[2]^.typ = top_const) and
+                    { Check if the BIC actually would only mask out bits being already zero because of the shift }
+                    (taicpu(hpfar1).oper[2]^.val<>0) and
+                    (BsfDWord(taicpu(hpfar1).oper[2]^.val)>=32-taicpu(p).oper[2]^.shifterop^.shiftimm) then
+                    begin
+                      DebugMsg('Peephole Optimization: LsrBic2Lsr done', hpfar1);
+                      taicpu(p).oper[0]^.reg:=taicpu(hpfar1).oper[0]^.reg;
+                      asml.remove(hpfar1);
+                      hpfar1.free;
+                      result:=true;
+                      Exit;
+                    end;
+                end;
+              { This folds shifterops into following instructions
+                mov r0, r1, lsl #8
+                add r2, r3, r0
 
-                  if not Assigned(hp1) then
-                    GetNextInstruction(p, hp1)
-                  else if hp1 = hpfar1 then
-                    { If hp1 = hpfar1, then it's a dangling pointer }
-                    hp1 := hp2;
+                to
 
-                  RemoveCurrentP(p, hp1);
-                  Result:=true;
-                  Exit;
-                end;
-          end;
-        {
-          Fold
-            mov r1, r1, lsl #2
-            ldr/ldrb r0, [r0, r1]
-          to
-            ldr/ldrb r0, [r0, r1, lsl #2]
-
-          XXX: This still needs some work, as we quite often encounter something like
-                 mov r1, r2, lsl #2
-                 add r2, r3, #imm
-                 ldr r0, [r2, r1]
-               which can't be folded because r2 is overwritten between the shift and the ldr.
-               We could try to shuffle the registers around and fold it into.
-                 add r1, r3, #imm
-                 ldr r0, [r1, r2, lsl #2]
-        }
-        if (not(GenerateThumbCode)) and
-          { thumb2 allows only lsl #0..#3 }
-          (not(GenerateThumb2Code) or
-           ((taicpu(p).oper[2]^.shifterop^.shiftimm in [0..3]) and
-            (taicpu(p).oper[2]^.shifterop^.shiftmode=SM_LSL)
-           )
-          ) and
-           (taicpu(p).oper[1]^.typ = top_reg) and
-           (taicpu(p).oper[2]^.typ = top_shifterop) and
-           { RRX is tough to handle, because it requires tracking the C-Flag,
-             it is also extremly unlikely to be emitted this way}
-           (taicpu(p).oper[2]^.shifterop^.shiftmode <> SM_RRX) and
-           (taicpu(p).oper[2]^.shifterop^.shiftimm <> 0) and
-           (taicpu(p).oppostfix = PF_NONE) and
-           {Only LDR, LDRB, STR, STRB can handle scaled register indexing}
-           (MatchInstruction(hpfar1, [A_LDR, A_STR], [taicpu(p).condition], [PF_None, PF_B]) or
-            (GenerateThumb2Code and
-             MatchInstruction(hpfar1, [A_LDR, A_STR], [taicpu(p).condition], [PF_None, PF_B, PF_SB, PF_H, PF_SH]))
-           ) and
-           (
-             {If this is address by offset, one of the two registers can be used}
-             ((taicpu(hpfar1).oper[1]^.ref^.addressmode=AM_OFFSET) and
+                add r2, r3, r1, lsl #8
+                CAUTION! If this one is successful p might not be a mov instruction anymore!
+              }
+              if (taicpu(p).oper[1]^.typ = top_reg) and
+               (taicpu(p).oper[2]^.typ = top_shifterop) and
+               (taicpu(p).oppostfix = PF_NONE) and
+               MatchInstruction(hpfar1, [A_ADD, A_ADC, A_RSB, A_RSC, A_SUB, A_SBC,
+                                      A_AND, A_BIC, A_EOR, A_ORR, A_TEQ, A_TST,
+                                      A_CMP, A_CMN],
+                                [taicpu(p).condition], [PF_None]) and
+               (not ((GenerateThumb2Code) and
+                     (taicpu(hpfar1).opcode in [A_SBC]) and
+                     (((taicpu(hpfar1).ops=3) and
+                       MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[1]^.reg)) or
+                      ((taicpu(hpfar1).ops=2) and
+                       MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[0]^.reg))))) and
+               RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hpfar1)) and
+               (taicpu(hpfar1).ops >= 2) and
+               {Currently we can't fold into another shifterop}
+               (taicpu(hpfar1).oper[taicpu(hpfar1).ops-1]^.typ = top_reg) and
+               {Folding rrx is problematic because of the C-Flag, as we currently can't check
+                NR_DEFAULTFLAGS for modification}
+               (
+                 {Everything is fine if we don't use RRX}
+                 (taicpu(p).oper[2]^.shifterop^.shiftmode <> SM_RRX) or
+                 (
+                   {If it is RRX, then check if we're just accessing the next instruction}
+                   Assigned(hp1) and
+                   (hpfar1 = hp1)
+                 )
+               ) and
+               { reg1 must not be modified in between }
+               not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hpfar1)) and
+               { The shifterop can contain a register, which must not be modified either }
+               (
+                 (taicpu(p).oper[2]^.shifterop^.rs = NR_NO) or
+                 not(RegModifiedBetween(taicpu(p).oper[2]^.shifterop^.rs, p, hpfar1))
+               ) and
                (
-                 (taicpu(hpfar1).oper[1]^.ref^.index = taicpu(p).oper[0]^.reg) xor
-                 (taicpu(hpfar1).oper[1]^.ref^.base = taicpu(p).oper[0]^.reg)
+                 {Only ONE of the two src operands is allowed to match}
+                 MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[taicpu(hpfar1).ops-2]^) xor
+                 MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[taicpu(hpfar1).ops-1]^)
+               ) then
+              begin
+                if taicpu(hpfar1).opcode in [A_TST, A_TEQ, A_CMN] then
+                  I2:=0
+                else
+                  I2:=1;
+                for I:=I2 to taicpu(hpfar1).ops-1 do
+                  if MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[I]^.reg) then
+                    begin
+                      { If the parameter matched on the second op from the RIGHT
+                        we have to switch the parameters, this will not happen for CMP
+                        where we're only evaluating the rightmost parameter
+                      }
+                      if I <> taicpu(hpfar1).ops-1 then
+                        begin
+                          {The SUB operators need to be changed when we swap parameters}
+                          case taicpu(hpfar1).opcode of
+                            A_SUB: tempop:=A_RSB;
+                            A_SBC: tempop:=A_RSC;
+                            A_RSB: tempop:=A_SUB;
+                            A_RSC: tempop:=A_SBC;
+                            else tempop:=taicpu(hpfar1).opcode;
+                          end;
+                          if taicpu(hpfar1).ops = 3 then
+                            hp2:=taicpu.op_reg_reg_reg_shifterop(tempop,
+                                 taicpu(hpfar1).oper[0]^.reg, taicpu(hpfar1).oper[2]^.reg,
+                                 taicpu(p).oper[1]^.reg, taicpu(p).oper[2]^.shifterop^)
+                          else
+                            hp2:=taicpu.op_reg_reg_shifterop(tempop,
+                                 taicpu(hpfar1).oper[0]^.reg, taicpu(p).oper[1]^.reg,
+                                 taicpu(p).oper[2]^.shifterop^);
+                        end
+                      else
+                        if taicpu(hpfar1).ops = 3 then
+                          hp2:=taicpu.op_reg_reg_reg_shifterop(taicpu(hpfar1).opcode,
+                               taicpu(hpfar1).oper[0]^.reg, taicpu(hpfar1).oper[1]^.reg,
+                               taicpu(p).oper[1]^.reg, taicpu(p).oper[2]^.shifterop^)
+                        else
+                          hp2:=taicpu.op_reg_reg_shifterop(taicpu(hpfar1).opcode,
+                               taicpu(hpfar1).oper[0]^.reg, taicpu(p).oper[1]^.reg,
+                               taicpu(p).oper[2]^.shifterop^);
+                      if taicpu(p).oper[2]^.shifterop^.rs<>NR_NO then
+                        AllocRegBetween(taicpu(p).oper[2]^.shifterop^.rs,p,hpfar1,UsedRegs);
+                      AllocRegBetween(taicpu(p).oper[1]^.reg,p,hpfar1,UsedRegs);
+                      asml.insertbefore(hp2, hpfar1);
+                      asml.remove(hpfar1);
+                      hpfar1.free;
+                      DebugMsg('Peephole Optimization: FoldShiftProcess done', hp2);
+
+                      if not Assigned(hp1) then
+                        GetNextInstruction(p, hp1)
+                      else if hp1 = hpfar1 then
+                        { If hp1 = hpfar1, then it's a dangling pointer }
+                        hp1 := hp2;
+
+                      RemoveCurrentP(p, hp1);
+                      Result:=true;
+                      Exit;
+                    end;
+              end;
+            {
+              Fold
+                mov r1, r1, lsl #2
+                ldr/ldrb r0, [r0, r1]
+              to
+                ldr/ldrb r0, [r0, r1, lsl #2]
+
+              XXX: This still needs some work, as we quite often encounter something like
+                     mov r1, r2, lsl #2
+                     add r2, r3, #imm
+                     ldr r0, [r2, r1]
+                   which can't be folded because r2 is overwritten between the shift and the ldr.
+                   We could try to shuffle the registers around and fold it into.
+                     add r1, r3, #imm
+                     ldr r0, [r1, r2, lsl #2]
+            }
+            if (not(GenerateThumbCode)) and
+              { thumb2 allows only lsl #0..#3 }
+              (not(GenerateThumb2Code) or
+               ((taicpu(p).oper[2]^.shifterop^.shiftimm in [0..3]) and
+                (taicpu(p).oper[2]^.shifterop^.shiftmode=SM_LSL)
                )
-             ) or
-             {For post and preindexed only the index register can be used}
-             ((taicpu(hpfar1).oper[1]^.ref^.addressmode in [AM_POSTINDEXED, AM_PREINDEXED]) and
+              ) and
+               (taicpu(p).oper[1]^.typ = top_reg) and
+               (taicpu(p).oper[2]^.typ = top_shifterop) and
+               { RRX is tough to handle, because it requires tracking the C-Flag,
+                 it is also extremely unlikely to be emitted this way}
+               (taicpu(p).oper[2]^.shifterop^.shiftmode <> SM_RRX) and
+               (taicpu(p).oper[2]^.shifterop^.shiftimm <> 0) and
+               (taicpu(p).oppostfix = PF_NONE) and
+               {Only LDR, LDRB, STR, STRB can handle scaled register indexing}
+               (MatchInstruction(hpfar1, [A_LDR, A_STR], [taicpu(p).condition], [PF_None, PF_B]) or
+                (GenerateThumb2Code and
+                 MatchInstruction(hpfar1, [A_LDR, A_STR], [taicpu(p).condition], [PF_None, PF_B, PF_SB, PF_H, PF_SH]))
+               ) and
                (
-                 (taicpu(hpfar1).oper[1]^.ref^.index = taicpu(p).oper[0]^.reg) and
-                 (taicpu(hpfar1).oper[1]^.ref^.base <> taicpu(p).oper[0]^.reg)
+                 {If this is address by offset, one of the two registers can be used}
+                 ((taicpu(hpfar1).oper[1]^.ref^.addressmode=AM_OFFSET) and
+                   (
+                     (taicpu(hpfar1).oper[1]^.ref^.index = taicpu(p).oper[0]^.reg) xor
+                     (taicpu(hpfar1).oper[1]^.ref^.base = taicpu(p).oper[0]^.reg)
+                   )
+                 ) or
+                 {For post and preindexed only the index register can be used}
+                 ((taicpu(hpfar1).oper[1]^.ref^.addressmode in [AM_POSTINDEXED, AM_PREINDEXED]) and
+                   (
+                     (taicpu(hpfar1).oper[1]^.ref^.index = taicpu(p).oper[0]^.reg) and
+                     (taicpu(hpfar1).oper[1]^.ref^.base <> taicpu(p).oper[0]^.reg)
+                   ) and
+                   (not GenerateThumb2Code)
+                 )
                ) and
-               (not GenerateThumb2Code)
-             )
-           ) and
-           { Only fold if both registers are used. Otherwise we are folding p with itself }
-           (taicpu(hpfar1).oper[1]^.ref^.index<>NR_NO) and
-           (taicpu(hpfar1).oper[1]^.ref^.base<>NR_NO) and
-           { Only fold if there isn't another shifterop already, and offset is zero. }
-           (taicpu(hpfar1).oper[1]^.ref^.offset = 0) and
-           (taicpu(hpfar1).oper[1]^.ref^.shiftmode = SM_None) and
-           not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hpfar1)) and
-           RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hpfar1)) then
-           begin
-             { If the register we want to do the shift for resides in base, we need to swap that}
-             if (taicpu(hpfar1).oper[1]^.ref^.base = taicpu(p).oper[0]^.reg) then
-               taicpu(hpfar1).oper[1]^.ref^.base := taicpu(hpfar1).oper[1]^.ref^.index;
-             taicpu(hpfar1).oper[1]^.ref^.index := taicpu(p).oper[1]^.reg;
-             taicpu(hpfar1).oper[1]^.ref^.shiftmode := taicpu(p).oper[2]^.shifterop^.shiftmode;
-             taicpu(hpfar1).oper[1]^.ref^.shiftimm := taicpu(p).oper[2]^.shifterop^.shiftimm;
-             DebugMsg('Peephole Optimization: FoldShiftLdrStr done', hpfar1);
-             RemoveCurrentP(p);
-             Result:=true;
-             Exit;
-           end;
+               { Only fold if both registers are used. Otherwise we are folding p with itself }
+               (taicpu(hpfar1).oper[1]^.ref^.index<>NR_NO) and
+               (taicpu(hpfar1).oper[1]^.ref^.base<>NR_NO) and
+               { Only fold if there isn't another shifterop already, and offset is zero. }
+               (taicpu(hpfar1).oper[1]^.ref^.offset = 0) and
+               (taicpu(hpfar1).oper[1]^.ref^.shiftmode = SM_None) and
+               not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hpfar1)) and
+               RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hpfar1)) then
+               begin
+                 { If the register we want to do the shift for resides in base, we need to swap that}
+                 if (taicpu(hpfar1).oper[1]^.ref^.base = taicpu(p).oper[0]^.reg) then
+                   taicpu(hpfar1).oper[1]^.ref^.base := taicpu(hpfar1).oper[1]^.ref^.index;
+                 taicpu(hpfar1).oper[1]^.ref^.index := taicpu(p).oper[1]^.reg;
+                 taicpu(hpfar1).oper[1]^.ref^.shiftmode := taicpu(p).oper[2]^.shifterop^.shiftmode;
+                 taicpu(hpfar1).oper[1]^.ref^.shiftimm := taicpu(p).oper[2]^.shifterop^.shiftimm;
+                 DebugMsg('Peephole Optimization: FoldShiftLdrStr done', hpfar1);
+                 RemoveCurrentP(p);
+                 Result:=true;
+                 Exit;
+               end;
+            end;
+          {
+            Often we see shifts and then a superfluous mov to another register
+            In the future this might be handled in RedundantMovProcess when it uses RegisterTracking
+          }
+          if RemoveSuperfluousMove(p, hpfar1, 'MovMov2Mov') then
+            Result:=true;
+
+          Exit;
         end;
-      {
-        Often we see shifts and then a superfluous mov to another register
-        In the future this might be handled in RedundantMovProcess when it uses RegisterTracking
-      }
-      if RemoveSuperfluousMove(p, hpfar1, 'MovMov2Mov') then
-        Result:=true;
     end;
 
 

+ 60 - 50
compiler/armgen/aoptarm.pas

@@ -40,7 +40,7 @@ Type
     procedure DebugMsg(const s : string; p : tai);
 
     function RemoveSuperfluousMove(const p: tai; movp: tai; const optimizer: string): boolean;
-    function RedundantMovProcess(var p: tai; hp1: tai): boolean;
+    function RedundantMovProcess(var p: tai; var hp1: tai): boolean;
     function GetNextInstructionUsingReg(Current: tai; out Next: tai; reg: TRegister): Boolean;
 
     function OptPass1UXTB(var p: tai): Boolean;
@@ -292,10 +292,10 @@ Implementation
     end;
 
 
-  function TARMAsmOptimizer.RedundantMovProcess(var p: tai;hp1: tai):boolean;
+  function TARMAsmOptimizer.RedundantMovProcess(var p: tai; var hp1: tai):boolean;
     var
       I: Integer;
-      current_hp: tai;
+      current_hp, next_hp: tai;
       LDRChange: Boolean;
     begin
       Result:=false;
@@ -390,80 +390,80 @@ Implementation
               TransferUsedRegs(TmpUsedRegs);
 
               { Search local instruction block }
-              while GetNextInstruction(current_hp, hp1) and (hp1 <> BlockEnd) and (hp1.typ = ait_instruction) do
+              while GetNextInstruction(current_hp, next_hp) and (next_hp <> BlockEnd) and (next_hp.typ = ait_instruction) do
                 begin
                   UpdateUsedRegs(TmpUsedRegs, tai(current_hp.Next));
                   LDRChange := False;
 
-                  if (taicpu(hp1).opcode in [A_LDR,A_STR]) and (taicpu(hp1).ops = 2) then
+                  if (taicpu(next_hp).opcode in [A_LDR,A_STR]) and (taicpu(next_hp).ops = 2) then
                     begin
 
                       { Change the registers from r1 to r0 }
-                      if (taicpu(hp1).oper[1]^.ref^.base = taicpu(p).oper[0]^.reg) and
+                      if (taicpu(next_hp).oper[1]^.ref^.base = taicpu(p).oper[0]^.reg) and
 {$ifdef ARM}
                         { This optimisation conflicts with something and raises
                           an access violation - needs further investigation. [Kit] }
-                        (taicpu(hp1).opcode <> A_LDR) and
+                        (taicpu(next_hp).opcode <> A_LDR) and
 {$endif ARM}
                         { Don't mess around with the base register if the
                           reference is pre- or post-indexed }
-                        (taicpu(hp1).oper[1]^.ref^.addressmode = AM_OFFSET) then
+                        (taicpu(next_hp).oper[1]^.ref^.addressmode = AM_OFFSET) then
                         begin
-                          taicpu(hp1).oper[1]^.ref^.base := taicpu(p).oper[1]^.reg;
+                          taicpu(next_hp).oper[1]^.ref^.base := taicpu(p).oper[1]^.reg;
                           LDRChange := True;
                         end;
 
-                      if taicpu(hp1).oper[1]^.ref^.index = taicpu(p).oper[0]^.reg then
+                      if taicpu(next_hp).oper[1]^.ref^.index = taicpu(p).oper[0]^.reg then
                         begin
-                          taicpu(hp1).oper[1]^.ref^.index := taicpu(p).oper[1]^.reg;
+                          taicpu(next_hp).oper[1]^.ref^.index := taicpu(p).oper[1]^.reg;
                           LDRChange := True;
                         end;
 
                       if LDRChange then
-                        DebugMsg('Peephole Optimization: ' + std_regname(taicpu(p).oper[0]^.reg) + ' = ' + std_regname(taicpu(p).oper[1]^.reg) + ' (MovLdr2Ldr 1)', hp1);
+                        DebugMsg('Peephole Optimization: ' + std_regname(taicpu(p).oper[0]^.reg) + ' = ' + std_regname(taicpu(p).oper[1]^.reg) + ' (MovLdr2Ldr 1)', next_hp);
 
                       { Drop out if we're dealing with pre-indexed references }
-                      if (taicpu(hp1).oper[1]^.ref^.addressmode = AM_PREINDEXED) and
+                      if (taicpu(next_hp).oper[1]^.ref^.addressmode = AM_PREINDEXED) and
                         (
-                          RegInRef(taicpu(p).oper[0]^.reg, taicpu(hp1).oper[1]^.ref^) or
-                          RegInRef(taicpu(p).oper[1]^.reg, taicpu(hp1).oper[1]^.ref^)
+                          RegInRef(taicpu(p).oper[0]^.reg, taicpu(next_hp).oper[1]^.ref^) or
+                          RegInRef(taicpu(p).oper[1]^.reg, taicpu(next_hp).oper[1]^.ref^)
                         ) then
                         begin
                           { Remember to update register allocations }
                           if LDRChange then
-                            AllocRegBetween(taicpu(p).oper[1]^.reg, p, hp1, UsedRegs);
+                            AllocRegBetween(taicpu(p).oper[1]^.reg, p, next_hp, UsedRegs);
 
                           Break;
                         end;
 
                       { The register being stored can be potentially changed (as long as it's not the stack pointer) }
-                      if (taicpu(hp1).opcode = A_STR) and (getsupreg(taicpu(p).oper[1]^.reg) <> RS_STACK_POINTER_REG) and
-                        MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[0]^.reg) then
+                      if (taicpu(next_hp).opcode = A_STR) and (getsupreg(taicpu(p).oper[1]^.reg) <> RS_STACK_POINTER_REG) and
+                        MatchOperand(taicpu(next_hp).oper[0]^, taicpu(p).oper[0]^.reg) then
                         begin
-                          DebugMsg('Peephole Optimization: ' + std_regname(taicpu(p).oper[0]^.reg) + ' = ' + std_regname(taicpu(p).oper[1]^.reg) + ' (MovLdr2Ldr 2)', hp1);
-                          taicpu(hp1).oper[0]^.reg := taicpu(p).oper[1]^.reg;
+                          DebugMsg('Peephole Optimization: ' + std_regname(taicpu(p).oper[0]^.reg) + ' = ' + std_regname(taicpu(p).oper[1]^.reg) + ' (MovLdr2Ldr 2)', next_hp);
+                          taicpu(next_hp).oper[0]^.reg := taicpu(p).oper[1]^.reg;
                           LDRChange := True;
                         end;
 
                       if LDRChange and (getsupreg(taicpu(p).oper[1]^.reg) <> RS_STACK_POINTER_REG) then
                         begin
-                          AllocRegBetween(taicpu(p).oper[1]^.reg, p, hp1, UsedRegs);
+                          AllocRegBetween(taicpu(p).oper[1]^.reg, p, next_hp, UsedRegs);
                           if (taicpu(p).oppostfix = PF_None) and
                             (
                               (
-                                (taicpu(hp1).opcode = A_LDR) and
-                                MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[0]^.reg)
+                                (taicpu(next_hp).opcode = A_LDR) and
+                                MatchOperand(taicpu(next_hp).oper[0]^, taicpu(p).oper[0]^.reg)
                               ) or
-                              not RegUsedAfterInstruction(taicpu(p).oper[0]^.reg, hp1, TmpUsedRegs)
+                              not RegUsedAfterInstruction(taicpu(p).oper[0]^.reg, next_hp, TmpUsedRegs)
                             ) and
                             { Double-check to see if the old registers were actually
                               changed (e.g. if the super registers matched, but not
                               the sizes, they won't be changed). }
                             (
-                              (taicpu(hp1).opcode = A_LDR) or
-                              not RegInOp(taicpu(p).oper[0]^.reg, taicpu(hp1).oper[0]^)
+                              (taicpu(next_hp).opcode = A_LDR) or
+                              not RegInOp(taicpu(p).oper[0]^.reg, taicpu(next_hp).oper[0]^)
                             ) and
-                            not RegInRef(taicpu(p).oper[0]^.reg, taicpu(hp1).oper[1]^.ref^) then
+                            not RegInRef(taicpu(p).oper[0]^.reg, taicpu(next_hp).oper[1]^.ref^) then
                             begin
                               DebugMsg('Peephole Optimization: RedundantMovProcess 2a done', p);
                               RemoveCurrentP(p);
@@ -472,23 +472,28 @@ Implementation
                             end;
                         end;
                     end
-                  else if (taicpu(hp1).opcode = A_MOV) and (taicpu(hp1).oppostfix = PF_None) and
-                    (taicpu(hp1).ops = 2) then
+                  else if (taicpu(next_hp).opcode = A_MOV) and (taicpu(next_hp).oppostfix = PF_None) and
+                    (taicpu(next_hp).ops = 2) then
                     begin
-                      if MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[0]^.reg) then
+                      if MatchOperand(taicpu(next_hp).oper[0]^, taicpu(p).oper[0]^.reg) then
                         begin
                           { Found another mov that writes entirely to the register }
-                          if RegUsedBetween(taicpu(p).oper[0]^.reg, p, hp1) then
+                          if RegUsedBetween(taicpu(p).oper[0]^.reg, p, next_hp) then
                             begin
                               { Register was used beforehand }
-                              if MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[1]^.reg) then
+                              if MatchOperand(taicpu(next_hp).oper[1]^, taicpu(p).oper[1]^.reg) then
                                 begin
                                   { This MOV is exactly the same as the first one.
                                     Since none of the registers have changed value
                                     at this point, we can remove it. }
-                                  DebugMsg('Peephole Optimization: RedundantMovProcess 3a done', hp1);
-                                  asml.Remove(hp1);
-                                  hp1.Free;
+                                  DebugMsg('Peephole Optimization: RedundantMovProcess 3a done', next_hp);
+
+                                  if (next_hp = hp1) then
+                                    { Don't let hp1 become a dangling pointer }
+                                    hp1 := nil;
+
+                                  asml.Remove(next_hp);
+                                  next_hp.Free;
 
                                   { We still have the original p, so we can continue optimising;
                                    if it was -O2 or below, this instruction appeared immediately
@@ -504,7 +509,7 @@ Implementation
                           { We can delete the first MOV (only if the second MOV is unconditional) }
 {$ifdef ARM}
                           if (taicpu(p).oppostfix = PF_None) and
-                            (taicpu(hp1).condition = C_None) then
+                            (taicpu(next_hp).condition = C_None) then
 {$endif ARM}
                             begin
                               DebugMsg('Peephole Optimization: RedundantMovProcess 2b done', p);
@@ -513,9 +518,9 @@ Implementation
                             end;
                           Exit;
                         end
-                      else if MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^.reg) then
+                      else if MatchOperand(taicpu(next_hp).oper[1]^, taicpu(p).oper[0]^.reg) then
                         begin
-                          if MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[1]^.reg)
+                          if MatchOperand(taicpu(next_hp).oper[0]^, taicpu(p).oper[1]^.reg)
                             { Be careful - if the entire register is not used, removing this
                               instruction will leave the unused part uninitialised }
 {$ifdef AARCH64}
@@ -524,9 +529,14 @@ Implementation
                             then
                             begin
                               { Instruction will become mov r1,r1 }
-                              DebugMsg('Peephole Optimization: Mov2None 2 done', hp1);
-                              asml.Remove(hp1);
-                              hp1.Free;
+                              DebugMsg('Peephole Optimization: Mov2None 2 done', next_hp);
+
+                              if (next_hp = hp1) then
+                                { Don't let hp1 become a dangling pointer }
+                                hp1 := nil;
+
+                              asml.Remove(next_hp);
+                              next_hp.Free;
                               Continue;
                             end;
 
@@ -534,12 +544,12 @@ Implementation
                             forces it to be left alone if the full register is not
                             used, lest mov w1,w1 gets optimised out by mistake. [Kit] }
 {$ifdef AARCH64}
-                          if not MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[1]^.reg) then
+                          if not MatchOperand(taicpu(next_hp).oper[0]^, taicpu(p).oper[1]^.reg) then
 {$endif AARCH64}
                             begin
-                              DebugMsg('Peephole Optimization: ' + std_regname(taicpu(p).oper[0]^.reg) + ' = ' + std_regname(taicpu(p).oper[1]^.reg) + ' (MovMov2Mov 2)', hp1);
-                              taicpu(hp1).oper[1]^.reg := taicpu(p).oper[1]^.reg;
-                              AllocRegBetween(taicpu(p).oper[1]^.reg, p, hp1, UsedRegs);
+                              DebugMsg('Peephole Optimization: ' + std_regname(taicpu(p).oper[0]^.reg) + ' = ' + std_regname(taicpu(p).oper[1]^.reg) + ' (MovMov2Mov 2)', next_hp);
+                              taicpu(next_hp).oper[1]^.reg := taicpu(p).oper[1]^.reg;
+                              AllocRegBetween(taicpu(p).oper[1]^.reg, p, next_hp, UsedRegs);
 
                               { If this was the only reference to the old register,
                                 then we can remove the original MOV now }
@@ -551,7 +561,7 @@ Implementation
                                   register). [Kit] }
                                 (getsupreg(taicpu(p).oper[1]^.reg) <> RS_STACK_POINTER_REG) and
                                 RegInUsedRegs(taicpu(p).oper[0]^.reg, UsedRegs) and
-                                not RegUsedAfterInstruction(taicpu(p).oper[0]^.reg, hp1, TmpUsedRegs) then
+                                not RegUsedAfterInstruction(taicpu(p).oper[0]^.reg, next_hp, TmpUsedRegs) then
                                 begin
                                   DebugMsg('Peephole Optimization: RedundantMovProcess 2c done', p);
                                   RemoveCurrentP(p);
@@ -565,14 +575,14 @@ Implementation
                   { On low optimisation settings, don't search more than one instruction ahead }
                   if not(cs_opt_level3 in current_settings.optimizerswitches) or
                     { Stop at procedure calls and jumps }
-                    is_calljmp(taicpu(hp1).opcode) or
+                    is_calljmp(taicpu(next_hp).opcode) or
                     { If the read register has changed value, or the MOV
                       destination register has been used, drop out }
-                    RegInInstruction(taicpu(p).oper[0]^.reg, hp1) or
-                    RegModifiedByInstruction(taicpu(p).oper[1]^.reg, hp1) then
+                    RegInInstruction(taicpu(p).oper[0]^.reg, next_hp) or
+                    RegModifiedByInstruction(taicpu(p).oper[1]^.reg, next_hp) then
                     Break;
 
-                  current_hp := hp1;
+                  current_hp := next_hp;
                 end;
             end;
         end;