Преглед изворни кода

* synchronized with trunk

git-svn-id: branches/wasm@46977 -
nickysn пре 4 година
родитељ
комит
38c4c93cee

+ 2 - 0
.gitattributes

@@ -18515,9 +18515,11 @@ tests/webtbs/tw3777.pp svneol=native#text/plain
 tests/webtbs/tw37779.pp svneol=native#text/pascal
 tests/webtbs/tw3778.pp svneol=native#text/plain
 tests/webtbs/tw37780.pp svneol=native#text/plain
+tests/webtbs/tw37796.pp svneol=native#text/pascal
 tests/webtbs/tw3780.pp svneol=native#text/plain
 tests/webtbs/tw37806.pp svneol=native#text/pascal
 tests/webtbs/tw3782.pp svneol=native#text/plain
+tests/webtbs/tw37823.pp svneol=native#text/pascal
 tests/webtbs/tw3796.pp svneol=native#text/plain
 tests/webtbs/tw3805.pp svneol=native#text/plain
 tests/webtbs/tw3814.pp svneol=native#text/plain

+ 40 - 37
compiler/aarch64/aoptcpu.pas

@@ -43,12 +43,13 @@ Interface
         function PostPeepHoleOptsCpu(var p: tai): boolean; override;
         function RegLoadedWithNewValue(reg: tregister; hp: tai): boolean;override;
         function InstructionLoadsFromReg(const reg: TRegister; const hp: tai): boolean;override;
-        function LookForPostindexedPattern(p : taicpu) : boolean;
+        function LookForPostindexedPattern(var p : tai) : boolean;
       private
+        function RemoveSuperfluousFMov(const p: tai; movp: tai; const optimizer: string): boolean;
         function OptPass1Shift(var p: tai): boolean;
         function OptPostCMP(var p: tai): boolean;
         function OptPass1Data(var p: tai): boolean;
-        function RemoveSuperfluousFMov(const p: tai; movp: tai; const optimizer: string): boolean;
+        function OptPass1FData(var p: tai): Boolean;
         function OptPass1STP(var p: tai): boolean;
         function OptPass1Mov(var p: tai): boolean;
         function OptPass1FMov(var p: tai): Boolean;
@@ -172,20 +173,20 @@ Implementation
 
       ldr/str regX,[reg1], regY/const
   }
-  function TCpuAsmOptimizer.LookForPostindexedPattern(p: taicpu) : boolean;
+  function TCpuAsmOptimizer.LookForPostindexedPattern(var p: tai) : boolean;
     var
       hp1 : tai;
     begin
       Result:=false;
-      if (p.oper[1]^.typ = top_ref) and
-        (p.oper[1]^.ref^.addressmode=AM_OFFSET) and
-        (p.oper[1]^.ref^.index=NR_NO) and
-        (p.oper[1]^.ref^.offset=0) and
-        GetNextInstructionUsingReg(p, hp1, p.oper[1]^.ref^.base) and
+      if (taicpu(p).oper[1]^.typ = top_ref) and
+        (taicpu(p).oper[1]^.ref^.addressmode=AM_OFFSET) and
+        (taicpu(p).oper[1]^.ref^.index=NR_NO) and
+        (taicpu(p).oper[1]^.ref^.offset=0) and
+        GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[1]^.ref^.base) and
         { we cannot check NR_DEFAULTFLAGS for modification yet so don't allow a condition }
         MatchInstruction(hp1, [A_ADD, A_SUB], [PF_None]) and
-        (taicpu(hp1).oper[0]^.reg=p.oper[1]^.ref^.base) and
-        (taicpu(hp1).oper[1]^.reg=p.oper[1]^.ref^.base) and
+        (taicpu(hp1).oper[0]^.reg=taicpu(p).oper[1]^.ref^.base) and
+        (taicpu(hp1).oper[1]^.reg=taicpu(p).oper[1]^.ref^.base) and
         (
          { valid offset? }
          (taicpu(hp1).oper[2]^.typ=top_const) and
@@ -193,16 +194,20 @@ Implementation
          (abs(taicpu(hp1).oper[2]^.val)<256)
         ) and
         { don't apply the optimization if the base register is loaded }
-        (getsupreg(p.oper[0]^.reg)<>getsupreg(p.oper[1]^.ref^.base)) and
+        (getsupreg(taicpu(p).oper[0]^.reg)<>getsupreg(taicpu(p).oper[1]^.ref^.base)) and
         not(RegModifiedBetween(taicpu(hp1).oper[0]^.reg,p,hp1)) and
         not(RegModifiedBetween(taicpu(hp1).oper[2]^.reg,p,hp1)) then
         begin
-          DebugMsg('Peephole Str/LdrAdd/Sub2Str/Ldr Postindex done', p);
-          p.oper[1]^.ref^.addressmode:=AM_POSTINDEXED;
+          if taicpu(p).opcode = A_LDR then
+            DebugMsg('Peephole LdrAdd/Sub2Ldr Postindex done', p)
+          else
+            DebugMsg('Peephole StrAdd/Sub2Str Postindex done', p);
+
+          taicpu(p).oper[1]^.ref^.addressmode:=AM_POSTINDEXED;
           if taicpu(hp1).opcode=A_ADD then
-            p.oper[1]^.ref^.offset:=taicpu(hp1).oper[2]^.val
+            taicpu(p).oper[1]^.ref^.offset:=taicpu(hp1).oper[2]^.val
           else
-            p.oper[1]^.ref^.offset:=-taicpu(hp1).oper[2]^.val;
+            taicpu(p).oper[1]^.ref^.offset:=-taicpu(hp1).oper[2]^.val;
           asml.Remove(hp1);
           hp1.Free;
           Result:=true;
@@ -398,10 +403,17 @@ Implementation
     var
       hp1: tai;
     begin
-      result:=false;
-      if GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
-        RemoveSuperfluousMove(p, hp1, 'DataMov2Data') then
-        Result:=true;
+      Result := GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
+        RemoveSuperfluousMove(p, hp1, 'DataMov2Data');
+    end;
+
+
+  function TCpuAsmOptimizer.OptPass1FData(var p: tai): Boolean;
+    var
+      hp1: tai;
+    begin
+      Result := GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
+        RemoveSuperfluousFMov(p, hp1, 'FOpFMov2FOp');
     end;
 
 
@@ -431,21 +443,20 @@ Implementation
         (taicpu(p).oper[2]^.ref^.index=NR_NO) and
         (taicpu(p).oper[2]^.ref^.offset=-16) and
         (taicpu(p).oper[2]^.ref^.addressmode=AM_PREINDEXED) and
-        GetNextInstruction(p, hp1) and
-        GetNextInstruction(hp1, hp2) and
-        SkipEntryExitMarker(hp2, hp2) and
-        GetNextInstruction(hp2, hp3) and
-        SkipEntryExitMarker(hp3, hp3) and
-        GetNextInstruction(hp3, hp4) and
 
+        GetNextInstruction(p, hp1) and
         MatchInstruction(hp1, A_MOV, [C_None], [PF_NONE]) and
         MatchOperand(taicpu(hp1).oper[0]^,taicpu(p).oper[0]^) and
         (taicpu(hp1).oper[1]^.typ = top_reg) and
         (taicpu(hp1).oper[1]^.reg = NR_STACK_POINTER_REG) and
 
+        GetNextInstruction(hp1, hp2) and
+        SkipEntryExitMarker(hp2, hp2) and
         MatchInstruction(hp2, A_BL, [C_None], [PF_NONE]) and
         (taicpu(hp2).oper[0]^.typ = top_ref) and
 
+        GetNextInstruction(hp2, hp3) and
+        SkipEntryExitMarker(hp3, hp3) and
         MatchInstruction(hp3, A_LDP, [C_None], [PF_NONE]) and
         MatchOpType(taicpu(hp3),top_reg,top_reg,top_ref) and
         (taicpu(hp3).oper[0]^.reg = NR_X29) and
@@ -455,6 +466,7 @@ Implementation
         (taicpu(hp3).oper[2]^.ref^.offset=16) and
         (taicpu(hp3).oper[2]^.ref^.addressmode=AM_POSTINDEXED) and
 
+        GetNextInstruction(hp3, hp4) and
         MatchInstruction(hp4, A_RET, [C_None], [PF_None]) and
         (taicpu(hp4).ops = 0) then
         begin
@@ -728,14 +740,9 @@ Implementation
       if p.typ=ait_instruction then
         begin
           case taicpu(p).opcode of
-            A_LDR:
-              begin
-                Result:=LookForPostindexedPattern(taicpu(p));
-              end;
+            A_LDR,
             A_STR:
-              begin
-                Result:=LookForPostindexedPattern(taicpu(p));
-              end;
+              Result:=LookForPostindexedPattern(p);
             A_MOV:
               Result:=OptPass1Mov(p);
             A_STP:
@@ -773,11 +780,7 @@ Implementation
             A_FNEG,
             A_FCVT,
             A_FABS:
-              begin
-                if GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
-                  RemoveSuperfluousFMov(p, hp1, 'FOpFMov2FOp') then
-                  Result:=true;
-              end;
+              Result:=OptPass1FData(p);
             A_FMOV:
               Result:=OptPass1FMov(p);
             else

+ 1383 - 1257
compiler/arm/aoptcpu.pas

@@ -62,6 +62,21 @@ Type
   protected
     function LookForPreindexedPattern(p: taicpu): boolean;
     function LookForPostindexedPattern(p: taicpu): boolean;
+
+
+    { Individual optimisation routines }
+    function OptPass1DataCheckMov(var p: tai): Boolean;
+    function OptPass1ADDSUB(var p: tai): Boolean;
+    function OptPass1And(var p: tai): Boolean; override; { There's optimisation code that's general for all ARM platforms }
+    function OptPass1CMP(var p: tai): Boolean;
+    function OptPass1LDR(var p: tai): Boolean;
+    function OptPass1STM(var p: tai): Boolean;
+    function OptPass1STR(var p: tai): Boolean;
+    function OptPass1MOV(var p: tai): Boolean;
+    function OptPass1MUL(var p: tai): Boolean;
+    function OptPass1MVN(var p: tai): Boolean;
+    function OptPass1VMov(var p: tai): Boolean;
+    function OptPass1VOp(var p: tai): Boolean;
   End;
 
   TCpuPreRegallocScheduler = class(TAsmScheduler)
@@ -117,7 +132,7 @@ Implementation
          (taicpu(cmpp).oper[0]^.reg = taicpu(movp).oper[0]^.reg) and
          (taicpu(cmpp).oper[1]^.val = taicpu(movp).oper[1]^.val) then
       begin
-        asml.insertafter(tai_comment.Create(strpnew('Peephole CmpMovMov - Removed redundant moveq')), movp);
+        asml.insertafter(tai_comment.Create(strpnew('Peephole Optimization: CmpMovMov - Removed redundant moveq')), movp);
         asml.remove(movp);
         movp.free;
         Result:=true;
@@ -355,7 +370,7 @@ Implementation
           dealloc:=FindRegDeAlloc(taicpu(p).oper[0]^.reg,tai(movp.Next));
           if assigned(dealloc) then
             begin
-              DebugMsg('Peephole '+optimizer+' removed superfluous vmov', movp);
+              DebugMsg('Peephole Optimization: '+optimizer+' removed superfluous vmov', movp);
               result:=true;
 
               { taicpu(p).oper[0]^.reg is not used anymore, try to find its allocation
@@ -498,7 +513,7 @@ Implementation
         not(RegModifiedBetween(taicpu(hp1).oper[2]^.reg,p,hp1)) and
         GenerateARMCode then
         begin
-          DebugMsg('Peephole Str/LdrAdd/Sub2Str/Ldr Postindex done', p);
+          DebugMsg('Peephole Optimization: Str/LdrAdd/Sub2Str/Ldr Postindex done', p);
           p.oper[1]^.ref^.addressmode:=AM_POSTINDEXED;
           if taicpu(hp1).oper[2]^.typ=top_const then
             begin
@@ -522,1297 +537,1408 @@ Implementation
     end;
 
 
-  function TCpuAsmOptimizer.PeepHoleOptPass1Cpu(var p: tai): boolean;
+  function TCpuAsmOptimizer.OptPass1ADDSUB(var p: tai): Boolean;
     var
-      hp1,hp2,hp3,hp4: tai;
-      i, i2: longint;
-      tempop: tasmop;
+      hp1,hp2: tai;
       oldreg: tregister;
-      dealloc: tai_regalloc;
+    begin
+      Result := OptPass1DataCheckMov(p);
+
+      {
+        change
+        add/sub reg2,reg1,const1
+        str/ldr reg3,[reg2,const2]
+        dealloc reg2
+        to
+        str/ldr reg3,[reg1,const2+/-const1]
+      }
+      if (not GenerateThumbCode) and
+         (taicpu(p).ops>2) and
+         (taicpu(p).oper[1]^.typ = top_reg) and
+         (taicpu(p).oper[2]^.typ = top_const) then
+        begin
+          hp1:=p;
+          while GetNextInstructionUsingReg(hp1, hp1, taicpu(p).oper[0]^.reg) and
+            { we cannot check NR_DEFAULTFLAGS for modification yet so don't allow a condition }
+            MatchInstruction(hp1, [A_LDR, A_STR], [C_None], []) and
+            (taicpu(hp1).oper[1]^.typ = top_ref) and
+            (taicpu(hp1).oper[1]^.ref^.base=taicpu(p).oper[0]^.reg) and
+            { don't optimize if the register is stored/overwritten }
+            (taicpu(hp1).oper[0]^.reg<>taicpu(p).oper[1]^.reg) and
+            (taicpu(hp1).oper[1]^.ref^.index=NR_NO) and
+            (taicpu(hp1).oper[1]^.ref^.addressmode=AM_OFFSET) and
+            { new offset must be valid: either in the range of 8 or 12 bit, depend on the
+              ldr postfix }
+            (((taicpu(p).opcode=A_ADD) and
+             isValidConstLoadStoreOffset(taicpu(hp1).oper[1]^.ref^.offset+taicpu(p).oper[2]^.val, taicpu(hp1).oppostfix)
+             ) or
+             ((taicpu(p).opcode=A_SUB) and
+              isValidConstLoadStoreOffset(taicpu(hp1).oper[1]^.ref^.offset-taicpu(p).oper[2]^.val, taicpu(hp1).oppostfix)
+             )
+            ) do
+            begin
+              { neither reg1 nor reg2 might be changed inbetween }
+              if RegModifiedBetween(taicpu(p).oper[0]^.reg,p,hp1) or
+                RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hp1) then
+                break;
+              { reg2 must be either overwritten by the ldr or it is deallocated afterwards }
+              if ((taicpu(hp1).opcode=A_LDR) and (taicpu(p).oper[0]^.reg=taicpu(hp1).oper[0]^.reg)) or
+                assigned(FindRegDeAlloc(taicpu(p).oper[0]^.reg,tai(hp1.Next))) then
+                begin
+                  { remember last instruction }
+                  hp2:=hp1;
+                  DebugMsg('Peephole Optimization: Add/SubLdr2Ldr done', p);
+                  hp1:=p;
+                  { fix all ldr/str }
+                  while GetNextInstructionUsingReg(hp1, hp1, taicpu(p).oper[0]^.reg) do
+                    begin
+                      taicpu(hp1).oper[1]^.ref^.base:=taicpu(p).oper[1]^.reg;
+                      if taicpu(p).opcode=A_ADD then
+                        inc(taicpu(hp1).oper[1]^.ref^.offset,taicpu(p).oper[2]^.val)
+                      else
+                        dec(taicpu(hp1).oper[1]^.ref^.offset,taicpu(p).oper[2]^.val);
+                      if hp1=hp2 then
+                        break;
+                    end;
+                  RemoveCurrentP(p);
+                  result:=true;
+                  Exit;
+                end;
+            end;
+        end;
 
-    function IsPowerOf2(const value: DWord): boolean; inline;
-      begin
-        Result:=(value and (value - 1)) = 0;
-      end;
+      if (taicpu(p).condition = C_None) and
+        (taicpu(p).oppostfix = PF_None) and
+        LookForPreindexedPattern(taicpu(p)) then
+        begin
+          DebugMsg('Peephole Optimization: Add/Sub to Preindexed done', p);
+          RemoveCurrentP(p);
+          Result:=true;
+          Exit;
+        end;
+    end;
 
+
+  function TCpuAsmOptimizer.OptPass1MUL(var p: tai): Boolean;
+    var
+      hp1,hp2: tai;
+      oldreg: tregister;
     begin
-      result := false;
-      case p.typ of
-        ait_instruction:
-          begin
-            {
-              change
-              <op> reg,x,y
-              cmp reg,#0
-              into
-              <op>s reg,x,y
-            }
-            { this optimization can applied only to the currently enabled operations because
-              the other operations do not update all flags and FPC does not track flag usage }
-            if MatchInstruction(p, [A_ADC,A_ADD,A_BIC,A_SUB,A_MUL,A_MVN,A_MOV,A_ORR,A_EOR,A_AND,
-                                 A_RSB,A_RSC,A_SBC,A_MLA], [C_None], [PF_None]) and
-              GetNextInstruction(p, hp1) and
-              { mlas is only allowed in arm mode }
-              ((taicpu(p).opcode<>A_MLA) or
-               (current_settings.instructionset<>is_thumb)) and
-              MatchInstruction(hp1, A_CMP, [C_None], [PF_None]) and
-              (taicpu(hp1).oper[1]^.typ = top_const) and
-              (taicpu(p).oper[0]^.reg = taicpu(hp1).oper[0]^.reg) and
-              (taicpu(hp1).oper[1]^.val = 0) and
-              GetNextInstruction(hp1, hp2) and
-              { be careful here, following instructions could use other flags
-                however after a jump fpc never depends on the value of flags }
-              { All above instructions set Z and N according to the following
-                Z := result = 0;
-                N := result[31];
-                EQ = Z=1; NE = Z=0;
-                MI = N=1; PL = N=0; }
-              (MatchInstruction(hp2, A_B, [C_EQ,C_NE,C_MI,C_PL], []) or
-               { mov is also possible, but only if there is no shifter operand, it could be an rxx,
-                 we are too lazy to check if it is rxx or something else }
-               (MatchInstruction(hp2, A_MOV, [C_EQ,C_NE,C_MI,C_PL], []) and (taicpu(hp2).ops=2))) and
-              assigned(FindRegDealloc(NR_DEFAULTFLAGS,tai(hp2.Next))) then
-             begin
-               DebugMsg('Peephole OpCmp2OpS done', p);
-
-               taicpu(p).oppostfix:=PF_S;
-
-               { move flag allocation if possible }
-               GetLastInstruction(hp1, hp2);
-               hp2:=FindRegAlloc(NR_DEFAULTFLAGS,tai(hp2.Next));
-               if assigned(hp2) then
-                 begin
-                   asml.Remove(hp2);
-                   asml.insertbefore(hp2, p);
-                 end;
-
-               asml.remove(hp1);
-               hp1.free;
-               Result:=true;
-             end
-           else
-              case taicpu(p).opcode of
-                A_STR:
-                  begin
-                    { change
-                      str reg1,ref
-                      ldr reg2,ref
-                      into
-                      str reg1,ref
-                      mov reg2,reg1
-                    }
-                    if (taicpu(p).oper[1]^.typ = top_ref) and
-                       (taicpu(p).oper[1]^.ref^.addressmode=AM_OFFSET) and
-                       (taicpu(p).oppostfix=PF_None) and
-                       (taicpu(p).condition=C_None) and
-                       GetNextInstructionUsingRef(p,hp1,taicpu(p).oper[1]^.ref^) and
-                       MatchInstruction(hp1, A_LDR, [taicpu(p).condition], [PF_None]) and
-                       (taicpu(hp1).oper[1]^.typ=top_ref) and
-                       (taicpu(hp1).oper[1]^.ref^.addressmode=AM_OFFSET) and
-                       not(RegModifiedBetween(taicpu(p).oper[0]^.reg, p, hp1)) and
-                       ((taicpu(hp1).oper[1]^.ref^.index=NR_NO) or not (RegModifiedBetween(taicpu(hp1).oper[1]^.ref^.index, p, hp1))) and
-                       ((taicpu(hp1).oper[1]^.ref^.base=NR_NO) or not (RegModifiedBetween(taicpu(hp1).oper[1]^.ref^.base, p, hp1))) then
-                      begin
-                        if taicpu(hp1).oper[0]^.reg=taicpu(p).oper[0]^.reg then
-                          begin
-                            DebugMsg('Peephole StrLdr2StrMov 1 done', hp1);
-                            asml.remove(hp1);
-                            hp1.free;
-                          end
-                        else
-                          begin
-                            taicpu(hp1).opcode:=A_MOV;
-                            taicpu(hp1).oppostfix:=PF_None;
-                            taicpu(hp1).loadreg(1,taicpu(p).oper[0]^.reg);
-                            DebugMsg('Peephole StrLdr2StrMov 2 done', hp1);
-                          end;
-                        result := true;
-                      end
-                    { change
-                      str reg1,ref
-                      str reg2,ref
-                      into
-                      strd reg1,reg2,ref
-                    }
-                    else if (GenerateARMCode or GenerateThumb2Code) and
-                       (CPUARM_HAS_EDSP in cpu_capabilities[current_settings.cputype]) and
-                       (taicpu(p).oppostfix=PF_None) and
-                       (taicpu(p).oper[1]^.ref^.addressmode=AM_OFFSET) and
-                       GetNextInstruction(p,hp1) and
-                       MatchInstruction(hp1, A_STR, [taicpu(p).condition, C_None], [PF_None]) and
-                       not(odd(getsupreg(taicpu(p).oper[0]^.reg))) and
-                      (getsupreg(taicpu(p).oper[0]^.reg)+1=getsupreg(taicpu(hp1).oper[0]^.reg)) and
-                      { str ensures that either base or index contain no register, else ldr wouldn't
-                        use an offset either
-                      }
-                      (taicpu(p).oper[1]^.ref^.base=taicpu(hp1).oper[1]^.ref^.base) and
-                      (taicpu(p).oper[1]^.ref^.index=taicpu(hp1).oper[1]^.ref^.index) and
-                      (taicpu(p).oper[1]^.ref^.offset+4=taicpu(hp1).oper[1]^.ref^.offset) and
-                      (abs(taicpu(p).oper[1]^.ref^.offset)<256) and
-                      AlignedToQWord(taicpu(p).oper[1]^.ref^) then
-                      begin
-                        DebugMsg('Peephole StrStr2Strd done', p);
-                        taicpu(p).oppostfix:=PF_D;
-                        taicpu(p).loadref(2,taicpu(p).oper[1]^.ref^);
-                        taicpu(p).loadreg(1, taicpu(hp1).oper[0]^.reg);
-                        taicpu(p).ops:=3;
-                        asml.remove(hp1);
-                        hp1.free;
-                        result:=true;
-                      end;
-                    Result:=LookForPostindexedPattern(taicpu(p)) or Result;
-                  end;
-                A_LDR:
-                  begin
-                    { change
-                      ldr reg1,ref
-                      ldr reg2,ref
-                      into ...
-                    }
-                    if (taicpu(p).oper[1]^.typ = top_ref) and
-                       (taicpu(p).oper[1]^.ref^.addressmode=AM_OFFSET) and
-                       GetNextInstruction(p,hp1) and
-                       { ldrd is not allowed here }
-                       MatchInstruction(hp1, A_LDR, [taicpu(p).condition, C_None], [taicpu(p).oppostfix,PF_None]-[PF_D]) then
-                      begin
-                        {
-                          ...
-                          ldr reg1,ref
-                          mov reg2,reg1
-                        }
-                        if (taicpu(p).oppostfix=taicpu(hp1).oppostfix) and
-                           RefsEqual(taicpu(p).oper[1]^.ref^,taicpu(hp1).oper[1]^.ref^) and
-                           (taicpu(p).oper[0]^.reg<>taicpu(hp1).oper[1]^.ref^.index) and
-                           (taicpu(p).oper[0]^.reg<>taicpu(hp1).oper[1]^.ref^.base) and
-                           (taicpu(hp1).oper[1]^.ref^.addressmode=AM_OFFSET) then
-                          begin
-                            if taicpu(hp1).oper[0]^.reg=taicpu(p).oper[0]^.reg then
-                              begin
-                                DebugMsg('Peephole LdrLdr2Ldr done', hp1);
-                                asml.remove(hp1);
-                                hp1.free;
-                              end
-                            else
-                              begin
-                                DebugMsg('Peephole LdrLdr2LdrMov done', hp1);
-                                taicpu(hp1).opcode:=A_MOV;
-                                taicpu(hp1).oppostfix:=PF_None;
-                                taicpu(hp1).loadreg(1,taicpu(p).oper[0]^.reg);
-                              end;
-                            result := true;
-                          end
-                        {
-                           ...
-                           ldrd reg1,reg1+1,ref
-                        }
-                        else if (GenerateARMCode or GenerateThumb2Code) and
-                          (CPUARM_HAS_EDSP in cpu_capabilities[current_settings.cputype]) and
-                          { ldrd does not allow any postfixes ... }
-                          (taicpu(p).oppostfix=PF_None) and
-                          not(odd(getsupreg(taicpu(p).oper[0]^.reg))) and
-                          (getsupreg(taicpu(p).oper[0]^.reg)+1=getsupreg(taicpu(hp1).oper[0]^.reg)) and
-                          { ldr ensures that either base or index contain no register, else ldr wouldn't
-                            use an offset either
-                          }
-                          (taicpu(p).oper[1]^.ref^.base=taicpu(hp1).oper[1]^.ref^.base) and
-                          (taicpu(p).oper[1]^.ref^.index=taicpu(hp1).oper[1]^.ref^.index) and
-                          (taicpu(p).oper[1]^.ref^.offset+4=taicpu(hp1).oper[1]^.ref^.offset) and
-                          (abs(taicpu(p).oper[1]^.ref^.offset)<256) and
-                          AlignedToQWord(taicpu(p).oper[1]^.ref^) then
-                          begin
-                            DebugMsg('Peephole LdrLdr2Ldrd done', p);
-                            taicpu(p).loadref(2,taicpu(p).oper[1]^.ref^);
-                            taicpu(p).loadreg(1, taicpu(hp1).oper[0]^.reg);
-                            taicpu(p).ops:=3;
-                            taicpu(p).oppostfix:=PF_D;
-                            asml.remove(hp1);
-                            hp1.free;
-                            result:=true;
-                          end;
-                      end;
+      Result := OptPass1DataCheckMov(p);
+      {
+       Turn
+       mul reg0, z,w
+       sub/add x, y, reg0
+       dealloc reg0
+
+       into
+
+       mls/mla x,z,w,y
+       }
+      if (taicpu(p).condition = C_None) and
+        (taicpu(p).oppostfix = PF_None) and
+        (taicpu(p).ops=3) and
+        (taicpu(p).oper[0]^.typ = top_reg) and
+        (taicpu(p).oper[1]^.typ = top_reg) and
+        (taicpu(p).oper[2]^.typ = top_reg) and
+        GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[0]^.reg) and
+        MatchInstruction(hp1,[A_ADD,A_SUB],[C_None],[PF_None]) and
+        (not RegModifiedBetween(taicpu(p).oper[1]^.reg, p, hp1)) and
+        (not RegModifiedBetween(taicpu(p).oper[2]^.reg, p, hp1)) and
+
+        (((taicpu(hp1).opcode=A_ADD) and (current_settings.cputype>=cpu_armv4)) or
+         ((taicpu(hp1).opcode=A_SUB) and (current_settings.cputype in [cpu_armv6t2,cpu_armv7,cpu_armv7a,cpu_armv7r,cpu_armv7m,cpu_armv7em]))) and
+
+        // CPUs before ARMv6 don't recommend having the same Rd and Rm for MLA.
+        // TODO: A workaround would be to swap Rm and Rs
+        (not ((taicpu(hp1).opcode=A_ADD) and (current_settings.cputype<=cpu_armv6) and MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[1]^))) and
+
+        (((taicpu(hp1).ops=3) and
+          (taicpu(hp1).oper[2]^.typ=top_reg) and
+          ((MatchOperand(taicpu(hp1).oper[2]^, taicpu(p).oper[0]^.reg) and
+            (not RegModifiedBetween(taicpu(hp1).oper[1]^.reg, p, hp1))) or
+           ((MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^.reg) and
+             (taicpu(hp1).opcode=A_ADD) and
+             (not RegModifiedBetween(taicpu(hp1).oper[2]^.reg, p, hp1)))))) or
+         ((taicpu(hp1).ops=2) and
+          (taicpu(hp1).oper[1]^.typ=top_reg) and
+          MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^.reg))) and
+        (RegEndOfLife(taicpu(p).oper[0]^.reg,taicpu(hp1))) then
+        begin
+          if taicpu(hp1).opcode=A_ADD then
+            begin
+              taicpu(hp1).opcode:=A_MLA;
 
-                    {
-                      Change
-
-                        ldrb dst1, [REF]
-                        and  dst2, dst1, #255
-
-                      into
-
-                        ldrb dst2, [ref]
-                    }
-                    if not(GenerateThumbCode) and
-                       (taicpu(p).oppostfix=PF_B) and
-                       GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
-                       MatchInstruction(hp1, A_AND, [taicpu(p).condition], [PF_NONE]) and
-                       (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[0]^.reg) and
-                       (taicpu(hp1).oper[2]^.typ = top_const) and
-                       (taicpu(hp1).oper[2]^.val = $FF) and
-                       not(RegUsedBetween(taicpu(hp1).oper[0]^.reg, p, hp1)) and
-                       RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hp1)) then
-                       begin
-                         DebugMsg('Peephole LdrbAnd2Ldrb done', p);
-                         taicpu(p).oper[0]^.reg := taicpu(hp1).oper[0]^.reg;
-                         asml.remove(hp1);
-                         hp1.free;
-                         result:=true;
-                       end;
-                    Result:=LookForPostindexedPattern(taicpu(p)) or Result;
-                    { Remove superfluous mov after ldr
-                      changes
-                      ldr reg1, ref
-                      mov reg2, reg1
-                      to
-                      ldr reg2, ref
-
-                      conditions are:
-                        * no ldrd usage
-                        * reg1 must be released after mov
-                        * mov can not contain shifterops
-                        * ldr+mov have the same conditions
-                        * mov does not set flags
-                    }
-                    if (taicpu(p).oppostfix<>PF_D) and
-                       GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
-                       RemoveSuperfluousMove(p, hp1, 'LdrMov2Ldr') then
-                      Result:=true;
-                  end;
-                A_MOV:
+              if taicpu(hp1).ops=3 then
+                begin
+                  if MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^) then
+                    oldreg:=taicpu(hp1).oper[2]^.reg
+                  else
+                    oldreg:=taicpu(hp1).oper[1]^.reg;
+                end
+              else
+                oldreg:=taicpu(hp1).oper[0]^.reg;
+
+              taicpu(hp1).loadreg(1,taicpu(p).oper[1]^.reg);
+              taicpu(hp1).loadreg(2,taicpu(p).oper[2]^.reg);
+              taicpu(hp1).loadreg(3,oldreg);
+
+              DebugMsg('Peephole Optimization: MulAdd2MLA done', p);
+            end
+          else
+            begin
+              taicpu(hp1).opcode:=A_MLS;
+
+              taicpu(hp1).loadreg(3,taicpu(hp1).oper[1]^.reg);
+
+              if taicpu(hp1).ops=2 then
+                taicpu(hp1).loadreg(1,taicpu(hp1).oper[0]^.reg)
+              else
+                taicpu(hp1).loadreg(1,taicpu(p).oper[2]^.reg);
+
+              taicpu(hp1).loadreg(2,taicpu(p).oper[1]^.reg);
+
+              DebugMsg('Peephole Optimization: MulSub2MLS done', p);
+              AllocRegBetween(taicpu(hp1).oper[1]^.reg,p,hp1,UsedRegs);
+              AllocRegBetween(taicpu(hp1).oper[2]^.reg,p,hp1,UsedRegs);
+              AllocRegBetween(taicpu(hp1).oper[3]^.reg,p,hp1,UsedRegs);
+
+            end;
+
+          taicpu(hp1).ops:=4;
+          RemoveCurrentP(p);
+          Result := True;
+          Exit;
+        end
+    end;
+
+
+  function TCpuAsmOptimizer.OptPass1And(var p: tai): Boolean;
+    begin
+      Result := OptPass1DataCheckMov(p);
+      Result := inherited OptPass1And(p) or Result;
+    end;
+
+
+  function TCpuAsmOptimizer.OptPass1DataCheckMov(var p: tai): Boolean;
+    var
+      hp1: tai;
+    begin
+      {
+        change
+        op  reg1, ...
+        mov reg2, reg1
+        to
+        op  reg2, ...
+      }
+      Result := (taicpu(p).ops >= 3) and
+        GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
+        RemoveSuperfluousMove(p, hp1, 'DataMov2Data');
+    end;
+
+
+  function TCpuAsmOptimizer.OptPass1CMP(var p: tai): Boolean;
+    var
+      hp1, hp2, hp_last: tai; { hp1/hp2: instructions following p; hp_last: instruction fetched with GetLastInstruction(p, ...) }
+      MovRem1, MovRem2: Boolean; { results of the two RemoveRedundantMove calls below }
+    begin
+      Result := False; { set to True by any rewrite below }
+
+      { Everything below is only attempted when p is unconditional, its second
+        operand is a constant and a following instruction (hp1) exists.
+
+        These optimizations can be applied only to the currently enabled operations because
+        the other operations do not update all flags and FPC does not track flag usage }
+      if (taicpu(p).condition = C_None) and
+        (taicpu(p).oper[1]^.typ = top_const) and
+        GetNextInstruction(p, hp1) then
+        begin
+          {
+            change
+            cmp   reg,const1
+            moveq reg,const1
+            movne reg,const2
+            to
+            cmp   reg,const1
+            movne reg,const2
+          }
+          if MatchInstruction(hp1, A_MOV, [C_EQ, C_NE], [PF_NONE]) and
+            (taicpu(hp1).oper[1]^.typ = top_const) and
+            GetNextInstruction(hp1, hp2) and
+            MatchInstruction(hp2, A_MOV, [C_EQ, C_NE], [PF_NONE]) and
+            (taicpu(hp2).oper[1]^.typ = top_const) then
+            begin
+              MovRem1 := RemoveRedundantMove(p, hp1, asml); { NOTE(review): helper defined elsewhere; presumably removes hp1 when the cmp makes it redundant - confirm }
+              MovRem2 := RemoveRedundantMove(p, hp2, asml); { same check for the second conditional mov }
+
+              Result:= MovRem1 or MovRem2;
+
+              { Make sure that hp1 is still the next instruction after p:
+                if both movs were removed, fetch the successor of p again and leave
+                when there is none (Result is already set); if only the first one
+                was removed, hp2 takes its place. The else below belongs to
+                "if MovRem2". }
+              if MovRem1 then
+                if MovRem2 then
                   begin
-                    { fold
-                      mov reg1,reg0, shift imm1
-                      mov reg1,reg1, shift imm2
-                    }
-                    if (taicpu(p).ops=3) and
-                       (taicpu(p).oper[2]^.typ = top_shifterop) and
-                       (taicpu(p).oper[2]^.shifterop^.rs = NR_NO) and
-                       getnextinstruction(p,hp1) and
-                       MatchInstruction(hp1, A_MOV, [taicpu(p).condition], [PF_None]) and
-                       (taicpu(hp1).ops=3) and
-                       MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[0]^.reg) and
-                       MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^.reg) and
-                       (taicpu(hp1).oper[2]^.typ = top_shifterop) and
-                       (taicpu(hp1).oper[2]^.shifterop^.rs = NR_NO) then
-                      begin
-                        { fold
-                          mov reg1,reg0, lsl 16
-                          mov reg1,reg1, lsr 16
-                          strh reg1, ...
-                          dealloc reg1
-                          to
-                          strh reg1, ...
-                          dealloc reg1
-                        }
-                        if (taicpu(p).oper[2]^.shifterop^.shiftmode=SM_LSL) and
-                          (taicpu(p).oper[2]^.shifterop^.shiftimm=16) and
-                          (taicpu(hp1).oper[2]^.shifterop^.shiftmode in [SM_LSR,SM_ASR]) and
-                          (taicpu(hp1).oper[2]^.shifterop^.shiftimm=16) and
-                          getnextinstruction(hp1,hp2) and
-                          MatchInstruction(hp2, A_STR, [taicpu(p).condition], [PF_H]) and
-                          MatchOperand(taicpu(hp2).oper[0]^, taicpu(p).oper[0]^.reg) then
-                          begin
-                            TransferUsedRegs(TmpUsedRegs);
-                            UpdateUsedRegs(TmpUsedRegs, tai(p.next));
-                            UpdateUsedRegs(TmpUsedRegs, tai(hp1.next));
-                            if not(RegUsedAfterInstruction(taicpu(p).oper[0]^.reg,hp2,TmpUsedRegs)) then
-                              begin
-                                DebugMsg('Peephole optimizer removed superfluous 16 Bit zero extension', hp1);
-                                taicpu(hp2).loadreg(0,taicpu(p).oper[1]^.reg);
-                                asml.remove(p);
-                                asml.remove(hp1);
-                                p.free;
-                                hp1.free;
-                                p:=hp2;
-                                Result:=true;
-                              end;
-                          end
-                        { fold
-                          mov reg1,reg0, shift imm1
-                          mov reg1,reg1, shift imm2
-                          to
-                          mov reg1,reg0, shift imm1+imm2
-                        }
-                        else if (taicpu(p).oper[2]^.shifterop^.shiftmode=taicpu(hp1).oper[2]^.shifterop^.shiftmode) or
-                          { asr makes no use after a lsr, the asr can be foled into the lsr }
-                           ((taicpu(p).oper[2]^.shifterop^.shiftmode=SM_LSR) and (taicpu(hp1).oper[2]^.shifterop^.shiftmode=SM_ASR) ) then
-                          begin
-                            inc(taicpu(p).oper[2]^.shifterop^.shiftimm,taicpu(hp1).oper[2]^.shifterop^.shiftimm);
-                            { avoid overflows }
-                            if taicpu(p).oper[2]^.shifterop^.shiftimm>31 then
-                              case taicpu(p).oper[2]^.shifterop^.shiftmode of
-                                SM_ROR:
-                                  taicpu(p).oper[2]^.shifterop^.shiftimm:=taicpu(p).oper[2]^.shifterop^.shiftimm and 31;
-                                SM_ASR:
-                                  taicpu(p).oper[2]^.shifterop^.shiftimm:=31;
-                                SM_LSR,
-                                SM_LSL:
-                                  begin
-                                    hp2:=taicpu.op_reg_const(A_MOV,taicpu(p).oper[0]^.reg,0);
-                                    InsertLLItem(p.previous, p.next, hp2);
-                                    p.free;
-                                    p:=hp2;
-                                  end;
-                                else
-                                  internalerror(2008072803);
-                              end;
-                            DebugMsg('Peephole ShiftShift2Shift 1 done', p);
-                            asml.remove(hp1);
-                            hp1.free;
-                            result := true;
-                          end
-                        { fold
-                          mov reg1,reg0, shift imm1
-                          mov reg1,reg1, shift imm2
-                          mov reg1,reg1, shift imm3 ...
-                          mov reg2,reg1, shift imm3 ...
-                        }
-                        else if GetNextInstructionUsingReg(hp1,hp2, taicpu(hp1).oper[0]^.reg) and
-                          MatchInstruction(hp2, A_MOV, [taicpu(p).condition], [PF_None]) and
-                          (taicpu(hp2).ops=3) and
-                          MatchOperand(taicpu(hp2).oper[1]^, taicpu(hp1).oper[0]^.reg) and
-                          RegEndofLife(taicpu(p).oper[0]^.reg,taicpu(hp2)) and
-                          (taicpu(hp2).oper[2]^.typ = top_shifterop) and
-                          (taicpu(hp2).oper[2]^.shifterop^.rs = NR_NO) then
-                          begin
-                            { mov reg1,reg0, lsl imm1
-                              mov reg1,reg1, lsr/asr imm2
-                              mov reg2,reg1, lsl imm3 ...
-                              to
-                              mov reg1,reg0, lsl imm1
-                              mov reg2,reg1, lsr/asr imm2-imm3
-                              if
-                              imm1>=imm2
-                            }
-                            if (taicpu(p).oper[2]^.shifterop^.shiftmode=SM_LSL) and (taicpu(hp2).oper[2]^.shifterop^.shiftmode=SM_LSL) and
-                              (taicpu(hp1).oper[2]^.shifterop^.shiftmode in [SM_ASR,SM_LSR]) and
-                              (taicpu(p).oper[2]^.shifterop^.shiftimm>=taicpu(hp1).oper[2]^.shifterop^.shiftimm) then
-                              begin
-                                if (taicpu(hp2).oper[2]^.shifterop^.shiftimm>=taicpu(hp1).oper[2]^.shifterop^.shiftimm) then
-                                  begin
-                                    if not(RegUsedBetween(taicpu(hp2).oper[0]^.reg,p,hp1)) and
-                                      not(RegUsedBetween(taicpu(hp2).oper[0]^.reg,hp1,hp2)) then
-                                      begin
-                                        DebugMsg('Peephole ShiftShiftShift2ShiftShift 1a done', p);
-                                        inc(taicpu(p).oper[2]^.shifterop^.shiftimm,taicpu(hp2).oper[2]^.shifterop^.shiftimm-taicpu(hp1).oper[2]^.shifterop^.shiftimm);
-                                        taicpu(p).oper[0]^.reg:=taicpu(hp2).oper[0]^.reg;
-                                        asml.remove(hp1);
-                                        asml.remove(hp2);
-                                        hp1.free;
-                                        hp2.free;
-
-                                        if taicpu(p).oper[2]^.shifterop^.shiftimm>=32 then
-                                          begin
-                                            taicpu(p).freeop(1);
-                                            taicpu(p).freeop(2);
-                                            taicpu(p).loadconst(1,0);
-                                          end;
-                                        result := true;
-                                      end;
-                                  end
-                                else if not(RegUsedBetween(taicpu(hp2).oper[0]^.reg,hp1,hp2)) then
-                                  begin
-                                    DebugMsg('Peephole ShiftShiftShift2ShiftShift 1b done', p);
-
-                                    dec(taicpu(hp1).oper[2]^.shifterop^.shiftimm,taicpu(hp2).oper[2]^.shifterop^.shiftimm);
-                                    taicpu(hp1).oper[0]^.reg:=taicpu(hp2).oper[0]^.reg;
-                                    asml.remove(hp2);
-                                    hp2.free;
-                                    result := true;
-                                  end;
-                              end
-                            { mov reg1,reg0, lsr/asr imm1
-                              mov reg1,reg1, lsl imm2
-                              mov reg1,reg1, lsr/asr imm3 ...
-
-                              if imm3>=imm1 and imm2>=imm1
-                              to
-                              mov reg1,reg0, lsl imm2-imm1
-                              mov reg1,reg1, lsr/asr imm3 ...
-                            }
-                            else if (taicpu(p).oper[2]^.shifterop^.shiftmode in [SM_ASR,SM_LSR]) and (taicpu(hp2).oper[2]^.shifterop^.shiftmode in [SM_ASR,SM_LSR]) and
-                              (taicpu(hp1).oper[2]^.shifterop^.shiftmode=SM_LSL) and
-                              (taicpu(hp2).oper[2]^.shifterop^.shiftimm>=taicpu(p).oper[2]^.shifterop^.shiftimm) and
-                              (taicpu(hp1).oper[2]^.shifterop^.shiftimm>=taicpu(p).oper[2]^.shifterop^.shiftimm) then
-                              begin
-                                dec(taicpu(hp1).oper[2]^.shifterop^.shiftimm,taicpu(p).oper[2]^.shifterop^.shiftimm);
-                                taicpu(hp1).oper[1]^.reg:=taicpu(p).oper[1]^.reg;
-                                DebugMsg('Peephole ShiftShiftShift2ShiftShift 2 done', p);
-                                asml.remove(p);
-                                p.free;
-                                p:=hp2;
-                                if taicpu(hp1).oper[2]^.shifterop^.shiftimm=0 then
-                                  begin
-                                    taicpu(hp2).oper[1]^.reg:=taicpu(hp1).oper[1]^.reg;
-                                    asml.remove(hp1);
-                                    hp1.free;
-                                    p:=hp2;
-                                  end;
-                                result := true;
-                              end;
-                          end;
-                      end;
-                    { Change the common
-                      mov r0, r0, lsr #xxx
-                      and r0, r0, #yyy/bic r0, r0, #xxx
-
-                      and remove the superfluous and/bic if possible
-
-                      This could be extended to handle more cases.
-                    }
-                    if (taicpu(p).ops=3) and
-                       (taicpu(p).oper[2]^.typ = top_shifterop) and
-                       (taicpu(p).oper[2]^.shifterop^.rs = NR_NO) and
-                       (taicpu(p).oper[2]^.shifterop^.shiftmode = SM_LSR) and
-                       GetNextInstructionUsingReg(p,hp1, taicpu(p).oper[0]^.reg) and
-                       (hp1.typ=ait_instruction) and
-                       (taicpu(hp1).ops>=1) and
-                       (taicpu(hp1).oper[0]^.typ=top_reg) and
-                       (not RegModifiedBetween(taicpu(hp1).oper[0]^.reg, p, hp1)) and
-                       RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hp1)) then
-                       begin
-                         if (taicpu(p).oper[2]^.shifterop^.shiftimm >= 24 ) and
-                           MatchInstruction(hp1, A_AND, [taicpu(p).condition], [taicpu(p).oppostfix]) and
-                           (taicpu(hp1).ops=3) and
-                           MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[1]^) and
-                           (taicpu(hp1).oper[2]^.typ = top_const) and
-                           { Check if the AND actually would only mask out bits being already zero because of the shift
-                           }
-                           ((($ffffffff shr taicpu(p).oper[2]^.shifterop^.shiftimm) and taicpu(hp1).oper[2]^.val) =
-                             ($ffffffff shr taicpu(p).oper[2]^.shifterop^.shiftimm)) then
-                           begin
-                             DebugMsg('Peephole LsrAnd2Lsr done', hp1);
-                             taicpu(p).oper[0]^.reg:=taicpu(hp1).oper[0]^.reg;
-                             asml.remove(hp1);
-                             hp1.free;
-                             result:=true;
-                           end
-                         else if MatchInstruction(hp1, A_BIC, [taicpu(p).condition], [taicpu(p).oppostfix]) and
-                           (taicpu(hp1).ops=3) and
-                           MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[1]^) and
-                           (taicpu(hp1).oper[2]^.typ = top_const) and
-                           { Check if the BIC actually would only mask out bits beeing already zero because of the shift }
-                           (taicpu(hp1).oper[2]^.val<>0) and
-                           (BsfDWord(taicpu(hp1).oper[2]^.val)>=32-taicpu(p).oper[2]^.shifterop^.shiftimm) then
-                           begin
-                             DebugMsg('Peephole LsrBic2Lsr done', hp1);
-                             taicpu(p).oper[0]^.reg:=taicpu(hp1).oper[0]^.reg;
-                             asml.remove(hp1);
-                             hp1.free;
-                             result:=true;
-                           end;
-                       end;
-                    { Change
-                      mov rx, ry, lsr/ror #xxx
-                      uxtb/uxth rz,rx/and rz,rx,0xFF
-                      dealloc rx
-
-                      to
-
-                      uxtb/uxth rz,ry,ror #xxx
-                    }
-                    if (taicpu(p).ops=3) and
-                       (taicpu(p).oper[2]^.typ = top_shifterop) and
-                       (taicpu(p).oper[2]^.shifterop^.rs = NR_NO) and
-                       (taicpu(p).oper[2]^.shifterop^.shiftmode in [SM_LSR,SM_ROR]) and
-                       (GenerateThumb2Code) and
-                       GetNextInstructionUsingReg(p,hp1, taicpu(p).oper[0]^.reg) and
-                       RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hp1)) then
-                       begin
-                         if MatchInstruction(hp1, A_UXTB, [C_None], [PF_None]) and
-                           (taicpu(hp1).ops = 2) and
-                           (taicpu(p).oper[2]^.shifterop^.shiftimm in [8,16,24]) and
-                           MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^.reg) then
-                           begin
-                             taicpu(hp1).oper[1]^.reg := taicpu(p).oper[1]^.reg;
-                             taicpu(hp1).loadshifterop(2,taicpu(p).oper[2]^.shifterop^);
-                             taicpu(hp1).oper[2]^.shifterop^.shiftmode:=SM_ROR;
-                             taicpu(hp1).ops := 3;
+                    if not GetNextInstruction(p, hp1) then
+                      Exit;
+                  end
+                else
+                  hp1 := hp2;
+            end;
 
-                             GetNextInstruction(p,hp1);
+          {
+            change
+            <op> reg,x,y
+            cmp reg,#0
+            into
+            <op>s reg,x,y
+          }
+          if (taicpu(p).oppostfix = PF_None) and
+            (taicpu(p).oper[1]^.val = 0) and { operand 1 is known to be a constant from the outer check }
+            { be careful here, following instructions could use other flags
+              however after a jump fpc never depends on the value of flags }
+            { All above instructions set Z and N according to the following
+              Z := result = 0;
+              N := result[31];
+              EQ = Z=1; NE = Z=0;
+              MI = N=1; PL = N=0; }
+            (MatchInstruction(hp1, A_B, [C_EQ,C_NE,C_MI,C_PL], []) or
+            { mov is also possible, but only if there is no shifter operand, it could be an rxx,
+              we are too lazy to check if it is rxx or something else }
+            (MatchInstruction(hp1, A_MOV, [C_EQ,C_NE,C_MI,C_PL], []) and (taicpu(hp1).ops=2))) and
+            GetLastInstruction(p, hp_last) and { hp_last is the <op> candidate in the pattern above }
+            MatchInstruction(hp_last, [A_ADC,A_ADD,A_BIC,A_SUB,A_MUL,A_MVN,A_MOV,A_ORR,
+              A_EOR,A_AND,A_RSB,A_RSC,A_SBC,A_MLA], [C_None], [PF_None]) and
+            (
+              { mlas is only allowed in arm mode }
+              (taicpu(hp_last).opcode<>A_MLA) or
+              (current_settings.instructionset<>is_thumb)
+            ) and
+            (taicpu(hp_last).oper[0]^.reg = taicpu(p).oper[0]^.reg) and { <op> must write the register that the cmp tests }
+            assigned(FindRegDealloc(NR_DEFAULTFLAGS,tai(hp1.Next))) then { a dealloc of the flags register must be found starting after hp1 }
+            begin
+              DebugMsg('Peephole Optimization: OpCmp2OpS done', hp_last);
 
-                             asml.Remove(p);
-                             p.Free;
+              taicpu(hp_last).oppostfix:=PF_S; { <op> becomes <op>s }
 
-                             p:=hp1;
+              { move flag allocation if possible: hp1 is reused here for the flags
+                alloc entry searched from hp_last.Next; when found it is relinked
+                in front of hp_last }
+              hp1:=FindRegAlloc(NR_DEFAULTFLAGS,tai(hp_last.Next));
+              if assigned(hp1) then
+                begin
+                  asml.Remove(hp1);
+                  asml.insertbefore(hp1, hp_last);
+                end;
 
-                             result:=true;
-                             exit;
-                           end
-                         else if MatchInstruction(hp1, A_UXTH, [C_None], [PF_None]) and
-                           (taicpu(hp1).ops=2) and
-                           (taicpu(p).oper[2]^.shifterop^.shiftimm in [16]) and
-                           MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^.reg) then
-                           begin
-                             taicpu(hp1).oper[1]^.reg := taicpu(p).oper[1]^.reg;
-                             taicpu(hp1).loadshifterop(2,taicpu(p).oper[2]^.shifterop^);
-                             taicpu(hp1).oper[2]^.shifterop^.shiftmode:=SM_ROR;
-                             taicpu(hp1).ops := 3;
-
-                             GetNextInstruction(p,hp1);
-
-                             asml.Remove(p);
-                             p.Free;
-
-                             p:=hp1;
-
-                             result:=true;
-                             exit;
-                           end
-                         else if MatchInstruction(hp1, A_AND, [C_None], [PF_None]) and
-                           (taicpu(hp1).ops = 3) and
-                           (taicpu(hp1).oper[2]^.typ = top_const) and
-                           (taicpu(hp1).oper[2]^.val = $FF) and
-                           (taicpu(p).oper[2]^.shifterop^.shiftimm in [8,16,24]) and
-                           MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^.reg) then
-                           begin
-                             taicpu(hp1).ops := 3;
-                             taicpu(hp1).opcode := A_UXTB;
-                             taicpu(hp1).oper[1]^.reg := taicpu(p).oper[1]^.reg;
-                             taicpu(hp1).loadshifterop(2,taicpu(p).oper[2]^.shifterop^);
-                             taicpu(hp1).oper[2]^.shifterop^.shiftmode:=SM_ROR;
+              RemoveCurrentP(p); { NOTE(review): helper defined elsewhere; presumably deletes the cmp and advances p - confirm }
+              Result:=true;
+            end;
+        end;
+    end;
 
-                             GetNextInstruction(p,hp1);
 
-                             asml.Remove(p);
-                             p.Free;
+  function TCpuAsmOptimizer.OptPass1LDR(var p: tai): Boolean;
+    var
+      hp1: tai; { the following instruction examined by each of the patterns below }
+    begin
+      Result := False; { set to True by any rewrite below; several patterns may apply in one call }
+
+      { change
+        ldr reg1,ref
+        ldr reg2,ref
+        into ...
+      }
+      if (taicpu(p).oper[1]^.typ = top_ref) and
+         (taicpu(p).oper[1]^.ref^.addressmode=AM_OFFSET) and
+         GetNextInstruction(p,hp1) and
+         { ldrd is not allowed here }
+         MatchInstruction(hp1, A_LDR, [taicpu(p).condition, C_None], [taicpu(p).oppostfix,PF_None]-[PF_D]) then
+        begin
+          {
+            ...
+            ldr reg1,ref
+            mov reg2,reg1
+
+            Applies when both loads have the same postfix and equal references,
+            the second one uses AM_OFFSET, and reg1 is neither base nor index of
+            that reference. If reg2 equals reg1 the second load is simply removed.
+          }
+          if (taicpu(p).oppostfix=taicpu(hp1).oppostfix) and
+             RefsEqual(taicpu(p).oper[1]^.ref^,taicpu(hp1).oper[1]^.ref^) and
+             (taicpu(p).oper[0]^.reg<>taicpu(hp1).oper[1]^.ref^.index) and
+             (taicpu(p).oper[0]^.reg<>taicpu(hp1).oper[1]^.ref^.base) and
+             (taicpu(hp1).oper[1]^.ref^.addressmode=AM_OFFSET) then
+            begin
+              if taicpu(hp1).oper[0]^.reg=taicpu(p).oper[0]^.reg then
+                begin
+                  DebugMsg('Peephole Optimization: LdrLdr2Ldr done', hp1);
+                  asml.remove(hp1);
+                  hp1.free;
+                end
+              else
+                begin
+                  DebugMsg('Peephole Optimization: LdrLdr2LdrMov done', hp1);
+                  taicpu(hp1).opcode:=A_MOV; { the second load is rewritten in place into mov reg2,reg1 }
+                  taicpu(hp1).oppostfix:=PF_None;
+                  taicpu(hp1).loadreg(1,taicpu(p).oper[0]^.reg);
+                end;
+              result := true;
+            end
+          {
+             ...
+             ldrd reg1,reg1+1,ref
+
+             Applies when the first destination register has an even number, the
+             second destination is the next register, base and index match, the
+             second offset is the first plus 4, the absolute first offset is
+             below 256 and AlignedToQWord accepts the first reference.
+          }
+          else if (GenerateARMCode or GenerateThumb2Code) and
+            (CPUARM_HAS_EDSP in cpu_capabilities[current_settings.cputype]) and
+            { ldrd does not allow any postfixes ... }
+            (taicpu(p).oppostfix=PF_None) and
+            not(odd(getsupreg(taicpu(p).oper[0]^.reg))) and
+            (getsupreg(taicpu(p).oper[0]^.reg)+1=getsupreg(taicpu(hp1).oper[0]^.reg)) and
+            { ldr ensures that either base or index contain no register, else ldr wouldn't
+              use an offset either
+            }
+            (taicpu(p).oper[1]^.ref^.base=taicpu(hp1).oper[1]^.ref^.base) and
+            (taicpu(p).oper[1]^.ref^.index=taicpu(hp1).oper[1]^.ref^.index) and
+            (taicpu(p).oper[1]^.ref^.offset+4=taicpu(hp1).oper[1]^.ref^.offset) and
+            (abs(taicpu(p).oper[1]^.ref^.offset)<256) and
+            AlignedToQWord(taicpu(p).oper[1]^.ref^) then
+            begin
+              DebugMsg('Peephole Optimization: LdrLdr2Ldrd done', p);
+              taicpu(p).loadref(2,taicpu(p).oper[1]^.ref^); { the reference is copied to operand 2 ... }
+              taicpu(p).loadreg(1, taicpu(hp1).oper[0]^.reg); { ... and operand 1 becomes the second destination register }
+              taicpu(p).ops:=3;
+              taicpu(p).oppostfix:=PF_D;
+              asml.remove(hp1);
+              hp1.free;
+              result:=true;
+            end;
+        end;
 
-                             p:=hp1;
+      {
+        Change
+
+          ldrb dst1, [REF]
+          and  dst2, dst1, #255
+
+        into
+
+          ldrb dst2, [ref]
+
+        Not done when generating Thumb code. Requires that dst2 is not used
+        between the two instructions and that RegEndOfLife holds for dst1 at
+        the and.
+      }
+      if not(GenerateThumbCode) and
+         (taicpu(p).oppostfix=PF_B) and
+         GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
+         MatchInstruction(hp1, A_AND, [taicpu(p).condition], [PF_NONE]) and
+         (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[0]^.reg) and
+         (taicpu(hp1).oper[2]^.typ = top_const) and
+         (taicpu(hp1).oper[2]^.val = $FF) and
+         not(RegUsedBetween(taicpu(hp1).oper[0]^.reg, p, hp1)) and
+         RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hp1)) then
+         begin
+           DebugMsg('Peephole Optimization: LdrbAnd2Ldrb done', p);
+           taicpu(p).oper[0]^.reg := taicpu(hp1).oper[0]^.reg;
+           asml.remove(hp1);
+           hp1.free;
+           result:=true;
+         end;
+      Result:=LookForPostindexedPattern(taicpu(p)) or Result; { the call is the left operand, so it is made regardless of the results above }
+      { Remove superfluous mov after ldr
+        changes
+        ldr reg1, ref
+        mov reg2, reg1
+        to
+        ldr reg2, ref
+
+        conditions are:
+          * no ldrd usage
+          * reg1 must be released after mov
+          * mov can not contain shifterops
+          * ldr+mov have the same conditions
+          * mov does not set flags
+
+        Only the ldrd exclusion is tested here; the remaining conditions are
+        left to RemoveSuperfluousMove, which is not part of this block.
+      }
+      if (taicpu(p).oppostfix<>PF_D) and
+         GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
+         RemoveSuperfluousMove(p, hp1, 'LdrMov2Ldr') then
+        Result:=true;
+    end;
 
-                             result:=true;
-                             exit;
-                           end;
-                       end;
-                    {
-                      optimize
-                      mov rX, yyyy
-                      ....
-                    }
-                    if (taicpu(p).ops = 2) and
-                       GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[0]^.reg) and
-                       (tai(hp1).typ = ait_instruction) then
-                      begin
-                        {
-                          This removes the mul from
-                          mov rX,0
-                          ...
-                          mul ...,rX,...
-                        }
-                        if false and (taicpu(p).oper[1]^.typ = top_const) and
-                          (taicpu(p).oper[1]^.val=0) and
-                          MatchInstruction(hp1, [A_MUL,A_MLA], [taicpu(p).condition], [taicpu(p).oppostfix]) and
-                          (((taicpu(hp1).oper[1]^.typ=top_reg) and MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[1]^)) or
-                           ((taicpu(hp1).oper[2]^.typ=top_reg) and MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[2]^))) then
-                            begin
-                              TransferUsedRegs(TmpUsedRegs);
-                              UpdateUsedRegs(TmpUsedRegs, tai(p.next));
-                              UpdateUsedRegs(TmpUsedRegs, tai(hp1.next));
-                              DebugMsg('Peephole MovMUL/MLA2Mov0 done', p);
-                              if taicpu(hp1).opcode=A_MUL then
-                                taicpu(hp1).loadconst(1,0)
-                              else
-                                taicpu(hp1).loadreg(1,taicpu(hp1).oper[3]^.reg);
-                              taicpu(hp1).ops:=2;
-                              taicpu(hp1).opcode:=A_MOV;
-                              if not(RegUsedAfterInstruction(taicpu(p).oper[0]^.reg,hp1,TmpUsedRegs)) then
-                                RemoveCurrentP(p);
-                              Result:=true;
-                              exit;
-                            end
-                        else if (taicpu(p).oper[1]^.typ = top_const) and
-                          (taicpu(p).oper[1]^.val=0) and
-                          MatchInstruction(hp1, A_MLA, [taicpu(p).condition], [taicpu(p).oppostfix]) and
-                          MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[3]^) then
-                            begin
-                              TransferUsedRegs(TmpUsedRegs);
-                              UpdateUsedRegs(TmpUsedRegs, tai(p.next));
-                              UpdateUsedRegs(TmpUsedRegs, tai(hp1.next));
-                              DebugMsg('Peephole MovMLA2MUL 1 done', p);
-                              taicpu(hp1).ops:=3;
-                              taicpu(hp1).opcode:=A_MUL;
-                              if not(RegUsedAfterInstruction(taicpu(p).oper[0]^.reg,hp1,TmpUsedRegs)) then
-                                RemoveCurrentP(p);
-                              Result:=true;
-                              exit;
-                            end
-                        {
-                          This changes the very common
-                          mov r0, #0
-                          str r0, [...]
-                          mov r0, #0
-                          str r0, [...]
-
-                          and removes all superfluous mov instructions
-                        }
-                        else if (taicpu(p).oper[1]^.typ = top_const) and
-                           (taicpu(hp1).opcode=A_STR) then
-                          while MatchInstruction(hp1, A_STR, [taicpu(p).condition], []) and
-                                MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[0]^) and
-                                GetNextInstruction(hp1, hp2) and
-                                MatchInstruction(hp2, A_MOV, [taicpu(p).condition], [PF_None]) and
-                                (taicpu(hp2).ops = 2) and
-                                MatchOperand(taicpu(hp2).oper[0]^, taicpu(p).oper[0]^) and
-                                MatchOperand(taicpu(hp2).oper[1]^, taicpu(p).oper[1]^) do
-                            begin
-                              DebugMsg('Peephole MovStrMov done', hp2);
-                              GetNextInstruction(hp2,hp1);
-                              asml.remove(hp2);
-                              hp2.free;
-                              result:=true;
-                              if not assigned(hp1) then break;
-                            end
-                        {
-                          This removes the first mov from
-                          mov rX,...
-                          mov rX,...
-                        }
-                        else if taicpu(hp1).opcode=A_MOV then
-                          while MatchInstruction(hp1, A_MOV, [taicpu(p).condition], [taicpu(p).oppostfix]) and
-                                (taicpu(hp1).ops = 2) and
-                                MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[0]^) and
-                                { don't remove the first mov if the second is a mov rX,rX }
-                                not(MatchOperand(taicpu(hp1).oper[0]^, taicpu(hp1).oper[1]^)) do
-                            begin
-                              DebugMsg('Peephole MovMov done', p);
-                              asml.remove(p);
-                              p.free;
-                              p:=hp1;
-                              GetNextInstruction(hp1,hp1);
-                              result:=true;
-                              if not assigned(hp1) then
-                                break;
-                            end;
-                         if RedundantMovProcess(p,hp1) then
-                           begin
-                             Result:=true;
-                             { p might not point at a mov anymore }
-                             exit;
-                           end;
-                      end;
 
-                    { Fold the very common sequence
-                        mov  regA, regB
-                        ldr* regA, [regA]
-                      to
-                        ldr* regA, [regB]
-                      CAUTION! If this one is successful p might not be a mov instruction anymore!
-                    }
-                    if (taicpu(p).opcode = A_MOV) and
-                       (taicpu(p).ops = 2) and
-                       (taicpu(p).oper[1]^.typ = top_reg) and
-                       (taicpu(p).oppostfix = PF_NONE) and
-                       GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
-                       MatchInstruction(hp1, [A_LDR, A_STR], [taicpu(p).condition], []) and
-                       (taicpu(hp1).oper[1]^.typ = top_ref) and
-                       { We can change the base register only when the instruction uses AM_OFFSET }
-                       ((taicpu(hp1).oper[1]^.ref^.index = taicpu(p).oper[0]^.reg) or
-                         ((taicpu(hp1).oper[1]^.ref^.addressmode = AM_OFFSET) and
-                          (taicpu(hp1).oper[1]^.ref^.base = taicpu(p).oper[0]^.reg))
-                       ) and
-                       not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hp1)) and
-
-                       // Make sure that Thumb code doesn't propagate a high register into a reference
-                       ((GenerateThumbCode and
-                         (getsupreg(taicpu(p).oper[1]^.reg) < RS_R8)) or
-                        (not GenerateThumbCode)) and
-
-                       RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hp1)) then
-                      begin
-                        DebugMsg('Peephole MovLdr2Ldr done', hp1);
-                        if (taicpu(hp1).oper[1]^.ref^.addressmode = AM_OFFSET) and
-                           (taicpu(hp1).oper[1]^.ref^.base = taicpu(p).oper[0]^.reg) then
-                          taicpu(hp1).oper[1]^.ref^.base := taicpu(p).oper[1]^.reg;
-
-                        if taicpu(hp1).oper[1]^.ref^.index = taicpu(p).oper[0]^.reg then
-                          taicpu(hp1).oper[1]^.ref^.index := taicpu(p).oper[1]^.reg;
-
-                        dealloc:=FindRegDeAlloc(taicpu(p).oper[1]^.reg, tai(p.Next));
-                        if Assigned(dealloc) then
-                          begin
-                            asml.remove(dealloc);
-                            asml.InsertAfter(dealloc,hp1);
-                          end;
-
-                        GetNextInstruction(p, hp1);
-                        asml.remove(p);
-                        p.free;
-                        p:=hp1;
-                        result:=true;
-                      end;
+  function TCpuAsmOptimizer.OptPass1STM(var p: tai): Boolean;
+    var
+      hp1, hp2, hp3, hp4: tai;
+    begin
+      Result := False;
+
+      {
+        Peephole handler for the instruction at p; Result reports whether the
+        instruction list was changed.  The opcode of p is not tested in this
+        routine - presumably the caller only dispatches STM instructions here
+        (TODO confirm against the caller).
+
+        change
+        stmfd	r13!,[r14]
+        sub	r13,r13,#4
+        bl	abc
+        add	r13,r13,#4
+        ldmfd	r13!,[r15]
+        into
+        b         abc
+
+        The rewrite is only attempted when ts_thumb_interworking is not set.
+        hp1..hp4 are the sub, bl/blx, add and ldm that follow p; the add must
+        use the same registers and constant as the sub, and the ldm must use
+        the same reference operand as p.
+      }
+      if not(ts_thumb_interworking in current_settings.targetswitches) and
+        (taicpu(p).condition = C_None) and
+        (taicpu(p).oppostfix = PF_FD) and
+        (taicpu(p).oper[0]^.typ = top_ref) and
+        (taicpu(p).oper[0]^.ref^.index=NR_STACK_POINTER_REG) and
+        (taicpu(p).oper[0]^.ref^.base=NR_NO) and
+        (taicpu(p).oper[0]^.ref^.offset=0) and
+        (taicpu(p).oper[0]^.ref^.addressmode=AM_PREINDEXED) and
+        (taicpu(p).oper[1]^.typ = top_regset) and
+        (taicpu(p).oper[1]^.regset^ = [RS_R14]) and { only r14 is stored }
+        GetNextInstruction(p, hp1) and
+        MatchInstruction(hp1, A_SUB, [C_None], [PF_NONE]) and
+        (taicpu(hp1).oper[0]^.typ = top_reg) and
+        (taicpu(hp1).oper[0]^.reg = NR_STACK_POINTER_REG) and
+        MatchOperand(taicpu(hp1).oper[0]^,taicpu(hp1).oper[1]^) and
+        (taicpu(hp1).oper[2]^.typ = top_const) and
+
+        GetNextInstruction(hp1, hp2) and
+        SkipEntryExitMarker(hp2, hp2) and { presumably steps over entry/exit marker items - confirm }
+
+        MatchInstruction(hp2, [A_BL,A_BLX], [C_None], [PF_NONE]) and
+        (taicpu(hp2).oper[0]^.typ = top_ref) and
+
+        GetNextInstruction(hp2, hp3) and
+        SkipEntryExitMarker(hp3, hp3) and
+        MatchInstruction(hp3, A_ADD, [C_None], [PF_NONE]) and
+        MatchOperand(taicpu(hp1).oper[0]^,taicpu(hp3).oper[0]^) and
+        MatchOperand(taicpu(hp1).oper[0]^,taicpu(hp3).oper[1]^) and
+        MatchOperand(taicpu(hp1).oper[2]^,taicpu(hp3).oper[2]^) and
+
+        GetNextInstruction(hp3, hp4) and
+        MatchInstruction(hp4, A_LDM, [C_None], [PF_FD]) and
+        MatchOperand(taicpu(p).oper[0]^,taicpu(hp4).oper[0]^) and
+        (taicpu(hp4).oper[1]^.typ = top_regset) and
+        (taicpu(hp4).oper[1]^.regset^ = [RS_R15]) then { only r15 is loaded }
+        begin
+          asml.Remove(hp1);
+          asml.Remove(hp3);
+          asml.Remove(hp4);
+          taicpu(hp2).opcode:=A_B; { the bl/blx becomes a plain branch }
+          hp1.free;
+          hp3.free;
+          hp4.free;
+          RemoveCurrentp(p, hp2); { presumably drops the stm and continues at hp2 - confirm against RemoveCurrentp }
+          DebugMsg('Peephole Optimization: Bl2B done', p);
+          Result := True;
+        end;
+    end;
 
-                    { This folds shifterops into following instructions
-                      mov r0, r1, lsl #8
-                      add r2, r3, r0
-
-                      to
-
-                      add r2, r3, r1, lsl #8
-                      CAUTION! If this one is successful p might not be a mov instruction anymore!
-                    }
-                    if (taicpu(p).opcode = A_MOV) and
-                       (taicpu(p).ops = 3) and
-                       (taicpu(p).oper[1]^.typ = top_reg) and
-                       (taicpu(p).oper[2]^.typ = top_shifterop) and
-                       (taicpu(p).oppostfix = PF_NONE) and
-                       GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
-                       MatchInstruction(hp1, [A_ADD, A_ADC, A_RSB, A_RSC, A_SUB, A_SBC,
-                                              A_AND, A_BIC, A_EOR, A_ORR, A_TEQ, A_TST,
-                                              A_CMP, A_CMN],
-                                        [taicpu(p).condition], [PF_None]) and
-                       (not ((GenerateThumb2Code) and
-                             (taicpu(hp1).opcode in [A_SBC]) and
-                             (((taicpu(hp1).ops=3) and
-                               MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[1]^.reg)) or
-                              ((taicpu(hp1).ops=2) and
-                               MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[0]^.reg))))) and
-                       RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hp1)) and
-                       (taicpu(hp1).ops >= 2) and
-                       {Currently we can't fold into another shifterop}
-                       (taicpu(hp1).oper[taicpu(hp1).ops-1]^.typ = top_reg) and
-                       {Folding rrx is problematic because of the C-Flag, as we currently can't check
-                        NR_DEFAULTFLAGS for modification}
-                       (
-                         {Everything is fine if we don't use RRX}
-                         (taicpu(p).oper[2]^.shifterop^.shiftmode <> SM_RRX) or
-                         (
-                           {If it is RRX, then check if we're just accessing the next instruction}
-                           GetNextInstruction(p, hp2) and
-                           (hp1 = hp2)
-                         )
-                       ) and
-                       { reg1 might not be modified inbetween }
-                       not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hp1)) and
-                       { The shifterop can contain a register, might not be modified}
-                       (
-                         (taicpu(p).oper[2]^.shifterop^.rs = NR_NO) or
-                         not(RegModifiedBetween(taicpu(p).oper[2]^.shifterop^.rs, p, hp1))
-                       ) and
-                       (
-                         {Only ONE of the two src operands is allowed to match}
-                         MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[taicpu(hp1).ops-2]^) xor
-                         MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[taicpu(hp1).ops-1]^)
-                       ) then
-                      begin
-                        if taicpu(hp1).opcode in [A_TST, A_TEQ, A_CMN] then
-                          I2:=0
-                        else
-                          I2:=1;
-                        for I:=I2 to taicpu(hp1).ops-1 do
-                          if MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[I]^.reg) then
+
+
+  function TCpuAsmOptimizer.OptPass1STR(var p: tai): Boolean;
+    var
+      hp1: tai;
+    begin
+      Result := False;
+
+      { Peephole handler for the store at p; Result reports whether anything
+        was changed.  Two alternative rewrites are tried first, then the
+        post-indexed pattern search is run in every case.
+
+        Common conditions for both rewrites: operand 1 of p is a reference
+        using AM_OFFSET addressing and p has no opcode postfix }
+      if (taicpu(p).oper[1]^.typ = top_ref) and
+        (taicpu(p).oper[1]^.ref^.addressmode=AM_OFFSET) and
+        (taicpu(p).oppostfix=PF_None) then
+        begin
+          { change
+            str reg1,ref
+            ldr reg2,ref
+            into
+            str reg1,ref
+            mov reg2,reg1
+
+            If reg2 is the same register as reg1 the ldr is removed instead.
+            Only done for an unconditional str, and only when
+            RegModifiedBetween reports no modification of reg1 or of the
+            base/index registers of the ldr reference between p and hp1.
+          }
+          if (taicpu(p).condition=C_None) and
+             GetNextInstructionUsingRef(p,hp1,taicpu(p).oper[1]^.ref^) and
+             MatchInstruction(hp1, A_LDR, [taicpu(p).condition], [PF_None]) and
+             (taicpu(hp1).oper[1]^.typ=top_ref) and
+             (taicpu(hp1).oper[1]^.ref^.addressmode=AM_OFFSET) and
+             not(RegModifiedBetween(taicpu(p).oper[0]^.reg, p, hp1)) and
+             ((taicpu(hp1).oper[1]^.ref^.index=NR_NO) or not (RegModifiedBetween(taicpu(hp1).oper[1]^.ref^.index, p, hp1))) and
+             ((taicpu(hp1).oper[1]^.ref^.base=NR_NO) or not (RegModifiedBetween(taicpu(hp1).oper[1]^.ref^.base, p, hp1))) then
+            begin
+              if taicpu(hp1).oper[0]^.reg=taicpu(p).oper[0]^.reg then
+                begin
+                  DebugMsg('Peephole Optimization: StrLdr2StrMov 1 done', hp1);
+                  asml.remove(hp1);
+                  hp1.free;
+                end
+              else
+                begin
+                  taicpu(hp1).opcode:=A_MOV;
+                  taicpu(hp1).oppostfix:=PF_None;
+                  taicpu(hp1).loadreg(1,taicpu(p).oper[0]^.reg);
+                  DebugMsg('Peephole Optimization: StrLdr2StrMov 2 done', hp1);
+                end;
+              result := True;
+            end
+          { change
+            str reg1,ref
+            str reg2,ref
+            into
+            strd reg1,reg2,ref
+
+            Conditions checked below: ARM or Thumb-2 code generation, a CPU
+            with CPUARM_HAS_EDSP, reg1 with an even super register number and
+            reg2 the next one, abs(offset) < 256, AlignedToQWord on the
+            reference, and the second str (hp1, taken with GetNextInstruction)
+            using the same base and index with offset + 4.
+            NOTE(review): hp1 may be unconditional (C_None) while p is
+            conditional, and neither the operand type nor the address mode of
+            operand 1 of hp1 is tested here - confirm this is intended.
+          }
+          else if (GenerateARMCode or GenerateThumb2Code) and
+             (CPUARM_HAS_EDSP in cpu_capabilities[current_settings.cputype]) and
+             not(odd(getsupreg(taicpu(p).oper[0]^.reg))) and
+             (abs(taicpu(p).oper[1]^.ref^.offset)<256) and
+             AlignedToQWord(taicpu(p).oper[1]^.ref^) and
+             GetNextInstruction(p,hp1) and
+             MatchInstruction(hp1, A_STR, [taicpu(p).condition, C_None], [PF_None]) and
+            (getsupreg(taicpu(p).oper[0]^.reg)+1=getsupreg(taicpu(hp1).oper[0]^.reg)) and
+            { str ensures that either base or index contain no register, else ldr wouldn't
+              use an offset either
+            }
+            (taicpu(p).oper[1]^.ref^.base=taicpu(hp1).oper[1]^.ref^.base) and
+            (taicpu(p).oper[1]^.ref^.index=taicpu(hp1).oper[1]^.ref^.index) and
+            (taicpu(p).oper[1]^.ref^.offset+4=taicpu(hp1).oper[1]^.ref^.offset) then
+            begin
+              DebugMsg('Peephole Optimization: StrStr2Strd done', p);
+              taicpu(p).oppostfix:=PF_D;
+              taicpu(p).loadref(2,taicpu(p).oper[1]^.ref^); { the reference is copied to operand 2 }
+              taicpu(p).loadreg(1, taicpu(hp1).oper[0]^.reg); { operand 1 now holds the register of the second str }
+              taicpu(p).ops:=3;
+              asml.remove(hp1);
+              hp1.free;
+              result:=true;
+            end;
+        end;
+
+      Result:=LookForPostindexedPattern(taicpu(p)) or Result; { the call is the left operand, so it is made even when Result is already True }
+    end;
+
+
+  function TCpuAsmOptimizer.OptPass1MOV(var p: tai): Boolean;
+    var
+      hp1, hpfar1, hp2, hp3: tai;
+      i, i2: longint;
+      tempop: tasmop;
+      dealloc: tai_regalloc;
+    begin
+      Result := False;
+      hp1 := nil;
+
+      { fold
+        mov reg1,reg0, shift imm1
+        mov reg1,reg1, shift imm2
+      }
+      if (taicpu(p).ops=3) and
+         (taicpu(p).oper[2]^.typ = top_shifterop) and
+         (taicpu(p).oper[2]^.shifterop^.rs = NR_NO) and
+         getnextinstruction(p,hp1) and
+         MatchInstruction(hp1, A_MOV, [taicpu(p).condition], [PF_None]) and
+         (taicpu(hp1).ops=3) and
+         MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[0]^.reg) and
+         MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^.reg) and
+         (taicpu(hp1).oper[2]^.typ = top_shifterop) and
+         (taicpu(hp1).oper[2]^.shifterop^.rs = NR_NO) then
+        begin
+          { fold
+            mov reg1,reg0, lsl 16
+            mov reg1,reg1, lsr 16
+            strh reg1, ...
+            dealloc reg1
+            to
+            strh reg1, ...
+            dealloc reg1
+          }
+          if (taicpu(p).oper[2]^.shifterop^.shiftmode=SM_LSL) and
+            (taicpu(p).oper[2]^.shifterop^.shiftimm=16) and
+            (taicpu(hp1).oper[2]^.shifterop^.shiftmode in [SM_LSR,SM_ASR]) and
+            (taicpu(hp1).oper[2]^.shifterop^.shiftimm=16) and
+            getnextinstruction(hp1,hp2) and
+            MatchInstruction(hp2, A_STR, [taicpu(p).condition], [PF_H]) and
+            MatchOperand(taicpu(hp2).oper[0]^, taicpu(p).oper[0]^.reg) then
+            begin
+              TransferUsedRegs(TmpUsedRegs);
+              UpdateUsedRegs(TmpUsedRegs, tai(p.next));
+              UpdateUsedRegs(TmpUsedRegs, tai(hp1.next));
+              if not(RegUsedAfterInstruction(taicpu(p).oper[0]^.reg,hp2,TmpUsedRegs)) then
+                begin
+                  DebugMsg('Peephole Optimization: removed superfluous 16 Bit zero extension', hp1);
+                  taicpu(hp2).loadreg(0,taicpu(p).oper[1]^.reg);
+                  asml.remove(hp1);
+                  hp1.free;
+
+                  RemoveCurrentP(p, hp2);
+                  Result:=true;
+                  Exit;
+                end;
+            end
+          { fold
+            mov reg1,reg0, shift imm1
+            mov reg1,reg1, shift imm2
+            to
+            mov reg1,reg0, shift imm1+imm2
+          }
+          else if (taicpu(p).oper[2]^.shifterop^.shiftmode=taicpu(hp1).oper[2]^.shifterop^.shiftmode) or
+            { an asr has no extra effect after an lsr, so the asr can be folded into the lsr }
+             ((taicpu(p).oper[2]^.shifterop^.shiftmode=SM_LSR) and (taicpu(hp1).oper[2]^.shifterop^.shiftmode=SM_ASR) ) then
+            begin
+              inc(taicpu(p).oper[2]^.shifterop^.shiftimm,taicpu(hp1).oper[2]^.shifterop^.shiftimm);
+              { avoid overflows }
+              if taicpu(p).oper[2]^.shifterop^.shiftimm>31 then
+                case taicpu(p).oper[2]^.shifterop^.shiftmode of
+                  SM_ROR:
+                    taicpu(p).oper[2]^.shifterop^.shiftimm:=taicpu(p).oper[2]^.shifterop^.shiftimm and 31;
+                  SM_ASR:
+                    taicpu(p).oper[2]^.shifterop^.shiftimm:=31;
+                  SM_LSR,
+                  SM_LSL:
+                    begin
+                      hp2:=taicpu.op_reg_const(A_MOV,taicpu(p).oper[0]^.reg,0);
+                      InsertLLItem(p.previous, p.next, hp2);
+                      p.free;
+                      p:=hp2;
+                    end;
+                  else
+                    internalerror(2008072803);
+                end;
+              DebugMsg('Peephole Optimization: ShiftShift2Shift 1 done', p);
+              asml.remove(hp1);
+              hp1.free;
+              hp1 := nil;
+              result := true;
+            end
+          { fold
+            mov reg1,reg0, shift imm1
+            mov reg1,reg1, shift imm2
+            mov reg1,reg1, shift imm3 ...
+            mov reg2,reg1, shift imm3 ...
+          }
+          else if GetNextInstructionUsingReg(hp1,hp2, taicpu(hp1).oper[0]^.reg) and
+            MatchInstruction(hp2, A_MOV, [taicpu(p).condition], [PF_None]) and
+            (taicpu(hp2).ops=3) and
+            MatchOperand(taicpu(hp2).oper[1]^, taicpu(hp1).oper[0]^.reg) and
+            RegEndofLife(taicpu(p).oper[0]^.reg,taicpu(hp2)) and
+            (taicpu(hp2).oper[2]^.typ = top_shifterop) and
+            (taicpu(hp2).oper[2]^.shifterop^.rs = NR_NO) then
+            begin
+              { mov reg1,reg0, lsl imm1
+                mov reg1,reg1, lsr/asr imm2
+                mov reg2,reg1, lsl imm3 ...
+                to
+                mov reg1,reg0, lsl imm1
+                mov reg2,reg1, lsr/asr imm2-imm3
+                if
+                imm1>=imm2
+              }
+              if (taicpu(p).oper[2]^.shifterop^.shiftmode=SM_LSL) and (taicpu(hp2).oper[2]^.shifterop^.shiftmode=SM_LSL) and
+                (taicpu(hp1).oper[2]^.shifterop^.shiftmode in [SM_ASR,SM_LSR]) and
+                (taicpu(p).oper[2]^.shifterop^.shiftimm>=taicpu(hp1).oper[2]^.shifterop^.shiftimm) then
+                begin
+                  if (taicpu(hp2).oper[2]^.shifterop^.shiftimm>=taicpu(hp1).oper[2]^.shifterop^.shiftimm) then
+                    begin
+                      if not(RegUsedBetween(taicpu(hp2).oper[0]^.reg,p,hp1)) and
+                        not(RegUsedBetween(taicpu(hp2).oper[0]^.reg,hp1,hp2)) then
+                        begin
+                          DebugMsg('Peephole Optimization: ShiftShiftShift2ShiftShift 1a done', p);
+                          inc(taicpu(p).oper[2]^.shifterop^.shiftimm,taicpu(hp2).oper[2]^.shifterop^.shiftimm-taicpu(hp1).oper[2]^.shifterop^.shiftimm);
+                          taicpu(p).oper[0]^.reg:=taicpu(hp2).oper[0]^.reg;
+                          asml.remove(hp1);
+                          asml.remove(hp2);
+                          hp1.free;
+                          hp2.free;
+
+                          if taicpu(p).oper[2]^.shifterop^.shiftimm>=32 then
                             begin
-                              { If the parameter matched on the second op from the RIGHT
-                                we have to switch the parameters, this will not happen for CMP
-                                were we're only evaluating the most right parameter
-                              }
-                              if I <> taicpu(hp1).ops-1 then
-                                begin
-                                  {The SUB operators need to be changed when we swap parameters}
-                                  case taicpu(hp1).opcode of
-                                    A_SUB: tempop:=A_RSB;
-                                    A_SBC: tempop:=A_RSC;
-                                    A_RSB: tempop:=A_SUB;
-                                    A_RSC: tempop:=A_SBC;
-                                    else tempop:=taicpu(hp1).opcode;
-                                  end;
-                                  if taicpu(hp1).ops = 3 then
-                                    hp2:=taicpu.op_reg_reg_reg_shifterop(tempop,
-                                         taicpu(hp1).oper[0]^.reg, taicpu(hp1).oper[2]^.reg,
-                                         taicpu(p).oper[1]^.reg, taicpu(p).oper[2]^.shifterop^)
-                                  else
-                                    hp2:=taicpu.op_reg_reg_shifterop(tempop,
-                                         taicpu(hp1).oper[0]^.reg, taicpu(p).oper[1]^.reg,
-                                         taicpu(p).oper[2]^.shifterop^);
-                                end
-                              else
-                                if taicpu(hp1).ops = 3 then
-                                  hp2:=taicpu.op_reg_reg_reg_shifterop(taicpu(hp1).opcode,
-                                       taicpu(hp1).oper[0]^.reg, taicpu(hp1).oper[1]^.reg,
-                                       taicpu(p).oper[1]^.reg, taicpu(p).oper[2]^.shifterop^)
-                                else
-                                  hp2:=taicpu.op_reg_reg_shifterop(taicpu(hp1).opcode,
-                                       taicpu(hp1).oper[0]^.reg, taicpu(p).oper[1]^.reg,
-                                       taicpu(p).oper[2]^.shifterop^);
-                              if taicpu(p).oper[2]^.shifterop^.rs<>NR_NO then
-                                AllocRegBetween(taicpu(p).oper[2]^.shifterop^.rs,p,hp1,UsedRegs);
-                              AllocRegBetween(taicpu(p).oper[1]^.reg,p,hp1,UsedRegs);
-                              asml.insertbefore(hp2, hp1);
-                              GetNextInstruction(p, hp2);
-                              asml.remove(p);
-                              asml.remove(hp1);
-                              p.free;
-                              hp1.free;
-                              p:=hp2;
-                              DebugMsg('Peephole FoldShiftProcess done', p);
-                              Result:=true;
-                              break;
+                              taicpu(p).freeop(1);
+                              taicpu(p).freeop(2);
+                              taicpu(p).loadconst(1,0);
                             end;
-                      end;
-                    {
-                      Fold
-                        mov r1, r1, lsl #2
-                        ldr/ldrb r0, [r0, r1]
-                      to
-                        ldr/ldrb r0, [r0, r1, lsl #2]
-
-                      XXX: This still needs some work, as we quite often encounter something like
-                             mov r1, r2, lsl #2
-                             add r2, r3, #imm
-                             ldr r0, [r2, r1]
-                           which can't be folded because r2 is overwritten between the shift and the ldr.
-                           We could try to shuffle the registers around and fold it into.
-                             add r1, r3, #imm
-                             ldr r0, [r1, r2, lsl #2]
-                    }
-                    if (not(GenerateThumbCode)) and
-                       (taicpu(p).opcode = A_MOV) and
-                       (taicpu(p).ops = 3) and
-                       (taicpu(p).oper[1]^.typ = top_reg) and
-                       (taicpu(p).oper[2]^.typ = top_shifterop) and
-                       { RRX is tough to handle, because it requires tracking the C-Flag,
-                         it is also extremly unlikely to be emitted this way}
-                       (taicpu(p).oper[2]^.shifterop^.shiftmode <> SM_RRX) and
-                       (taicpu(p).oper[2]^.shifterop^.shiftimm <> 0) and
-                       { thumb2 allows only lsl #0..#3 }
-                       (not(GenerateThumb2Code) or
-                        ((taicpu(p).oper[2]^.shifterop^.shiftimm in [0..3]) and
-                         (taicpu(p).oper[2]^.shifterop^.shiftmode=SM_LSL)
-                        )
-                       ) and
-                       (taicpu(p).oppostfix = PF_NONE) and
-                       GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
-                       {Only LDR, LDRB, STR, STRB can handle scaled register indexing}
-                       (MatchInstruction(hp1, [A_LDR, A_STR], [taicpu(p).condition], [PF_None, PF_B]) or
-                        (GenerateThumb2Code and
-                         MatchInstruction(hp1, [A_LDR, A_STR], [taicpu(p).condition], [PF_None, PF_B, PF_SB, PF_H, PF_SH]))
-                       ) and
-                       (
-                         {If this is address by offset, one of the two registers can be used}
-                         ((taicpu(hp1).oper[1]^.ref^.addressmode=AM_OFFSET) and
-                           (
-                             (taicpu(hp1).oper[1]^.ref^.index = taicpu(p).oper[0]^.reg) xor
-                             (taicpu(hp1).oper[1]^.ref^.base = taicpu(p).oper[0]^.reg)
-                           )
-                         ) or
-                         {For post and preindexed only the index register can be used}
-                         ((taicpu(hp1).oper[1]^.ref^.addressmode in [AM_POSTINDEXED, AM_PREINDEXED]) and
-                           (
-                             (taicpu(hp1).oper[1]^.ref^.index = taicpu(p).oper[0]^.reg) and
-                             (taicpu(hp1).oper[1]^.ref^.base <> taicpu(p).oper[0]^.reg)
-                           ) and
-                           (not GenerateThumb2Code)
-                         )
-                       ) and
-                       { Only fold if both registers are used. Otherwise we are folding p with itself }
-                       (taicpu(hp1).oper[1]^.ref^.index<>NR_NO) and
-                       (taicpu(hp1).oper[1]^.ref^.base<>NR_NO) and
-                       { Only fold if there isn't another shifterop already, and offset is zero. }
-                       (taicpu(hp1).oper[1]^.ref^.offset = 0) and
-                       (taicpu(hp1).oper[1]^.ref^.shiftmode = SM_None) and
-                       not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hp1)) and
-                       RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hp1)) then
-                       begin
-                         { If the register we want to do the shift for resides in base, we need to swap that}
-                         if (taicpu(hp1).oper[1]^.ref^.base = taicpu(p).oper[0]^.reg) then
-                           taicpu(hp1).oper[1]^.ref^.base := taicpu(hp1).oper[1]^.ref^.index;
-                         taicpu(hp1).oper[1]^.ref^.index := taicpu(p).oper[1]^.reg;
-                         taicpu(hp1).oper[1]^.ref^.shiftmode := taicpu(p).oper[2]^.shifterop^.shiftmode;
-                         taicpu(hp1).oper[1]^.ref^.shiftimm := taicpu(p).oper[2]^.shifterop^.shiftimm;
-                         DebugMsg('Peephole FoldShiftLdrStr done', hp1);
-                         GetNextInstruction(p, hp1);
-                         asml.remove(p);
-                         p.free;
-                         p:=hp1;
-                         Result:=true;
-                       end;
-                    {
-                      Often we see shifts and then a superfluous mov to another register
-                      In the future this might be handled in RedundantMovProcess when it uses RegisterTracking
-                    }
-                    if (taicpu(p).opcode = A_MOV) and
-                       GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
-                       RemoveSuperfluousMove(p, hp1, 'MovMov2Mov') then
-                      Result:=true;
-                  end;
-                A_ADD,
-                A_ADC,
-                A_RSB,
-                A_RSC,
-                A_SUB,
-                A_SBC,
-                A_BIC,
-                A_EOR,
-                A_ORR,
-                A_MLA,
-                A_MLS,
-                A_MUL,
-                A_QADD,A_QADD16,A_QADD8,
-                A_QSUB,A_QSUB16,A_QSUB8,
-                A_QDADD,A_QDSUB,A_QASX,A_QSAX,
-                A_SHADD16,A_SHADD8,A_UHADD16,A_UHADD8,
-                A_SHSUB16,A_SHSUB8,A_UHSUB16,A_UHSUB8,
-                A_PKHTB,A_PKHBT,
-                A_SMUAD,A_SMUSD:
-                  begin
-                    {
-                      change
-                      add/sub reg2,reg1,const1
-                      str/ldr reg3,[reg2,const2]
-                      dealloc reg2
-                      to
-                      str/ldr reg3,[reg1,const2+/-const1]
-                    }
-                    if (not GenerateThumbCode) and
-                       (taicpu(p).opcode in [A_ADD,A_SUB]) and
-                       (taicpu(p).ops>2) and
-                       (taicpu(p).oper[1]^.typ = top_reg) and
-                       (taicpu(p).oper[2]^.typ = top_const) then
-                      begin
-                        hp1:=p;
-                        while GetNextInstructionUsingReg(hp1, hp1, taicpu(p).oper[0]^.reg) and
-                          { we cannot check NR_DEFAULTFLAGS for modification yet so don't allow a condition }
-                          MatchInstruction(hp1, [A_LDR, A_STR], [C_None], []) and
-                          (taicpu(hp1).oper[1]^.typ = top_ref) and
-                          (taicpu(hp1).oper[1]^.ref^.base=taicpu(p).oper[0]^.reg) and
-                          { don't optimize if the register is stored/overwritten }
-                          (taicpu(hp1).oper[0]^.reg<>taicpu(p).oper[1]^.reg) and
-                          (taicpu(hp1).oper[1]^.ref^.index=NR_NO) and
-                          (taicpu(hp1).oper[1]^.ref^.addressmode=AM_OFFSET) and
-                          { new offset must be valid: either in the range of 8 or 12 bit, depend on the
-                            ldr postfix }
-                          (((taicpu(p).opcode=A_ADD) and
-                           isValidConstLoadStoreOffset(taicpu(hp1).oper[1]^.ref^.offset+taicpu(p).oper[2]^.val, taicpu(hp1).oppostfix)
-                           ) or
-                           ((taicpu(p).opcode=A_SUB) and
-                            isValidConstLoadStoreOffset(taicpu(hp1).oper[1]^.ref^.offset-taicpu(p).oper[2]^.val, taicpu(hp1).oppostfix)
-                           )
-                          ) do
-                          begin
-                            { neither reg1 nor reg2 might be changed inbetween }
-                            if RegModifiedBetween(taicpu(p).oper[0]^.reg,p,hp1) or
-                              RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hp1) then
-                              break;
-                            { reg2 must be either overwritten by the ldr or it is deallocated afterwards }
-                            if ((taicpu(hp1).opcode=A_LDR) and (taicpu(p).oper[0]^.reg=taicpu(hp1).oper[0]^.reg)) or
-                              assigned(FindRegDeAlloc(taicpu(p).oper[0]^.reg,tai(hp1.Next))) then
-                              begin
-                                { remember last instruction }
-                                hp2:=hp1;
-                                DebugMsg('Peephole Add/SubLdr2Ldr done', p);
-                                hp1:=p;
-                                { fix all ldr/str }
-                                while GetNextInstructionUsingReg(hp1, hp1, taicpu(p).oper[0]^.reg) do
-                                  begin
-                                    taicpu(hp1).oper[1]^.ref^.base:=taicpu(p).oper[1]^.reg;
-                                    if taicpu(p).opcode=A_ADD then
-                                      inc(taicpu(hp1).oper[1]^.ref^.offset,taicpu(p).oper[2]^.val)
-                                    else
-                                      dec(taicpu(hp1).oper[1]^.ref^.offset,taicpu(p).oper[2]^.val);
-                                    if hp1=hp2 then
-                                      break;
-                                  end;
-                                GetNextInstruction(p,hp1);
-                                asml.remove(p);
-                                p.free;
-                                p:=hp1;
-                                result:=true;
-                                break;
-                              end;
-                          end;
-                      end;
-                    {
-                      change
-                      add reg1, ...
-                      mov reg2, reg1
-                      to
-                      add reg2, ...
-                    }
-                    if GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
-                       (taicpu(p).ops>=3) and
-                       RemoveSuperfluousMove(p, hp1, 'DataMov2Data') then
-                      Result:=true;
+                          result := true;
+                          Exit;
+                        end;
+                    end
+                  else if not(RegUsedBetween(taicpu(hp2).oper[0]^.reg,hp1,hp2)) then
+                    begin
+                      DebugMsg('Peephole Optimization: ShiftShiftShift2ShiftShift 1b done', p);
+
+                      dec(taicpu(hp1).oper[2]^.shifterop^.shiftimm,taicpu(hp2).oper[2]^.shifterop^.shiftimm);
+                      taicpu(hp1).oper[0]^.reg:=taicpu(hp2).oper[0]^.reg;
+                      asml.remove(hp2);
+                      hp2.free;
+                      result := true;
+                      Exit;
+                    end;
+                end
+              { mov reg1,reg0, lsr/asr imm1
+                mov reg1,reg1, lsl imm2
+                mov reg1,reg1, lsr/asr imm3 ...
+
+                if imm3>=imm1 and imm2>=imm1
+                to
+                mov reg1,reg0, lsl imm2-imm1
+                mov reg1,reg1, lsr/asr imm3 ...
+              }
+              else if (taicpu(p).oper[2]^.shifterop^.shiftmode in [SM_ASR,SM_LSR]) and (taicpu(hp2).oper[2]^.shifterop^.shiftmode in [SM_ASR,SM_LSR]) and
+                (taicpu(hp1).oper[2]^.shifterop^.shiftmode=SM_LSL) and
+                (taicpu(hp2).oper[2]^.shifterop^.shiftimm>=taicpu(p).oper[2]^.shifterop^.shiftimm) and
+                (taicpu(hp1).oper[2]^.shifterop^.shiftimm>=taicpu(p).oper[2]^.shifterop^.shiftimm) then
+                begin
+                  dec(taicpu(hp1).oper[2]^.shifterop^.shiftimm,taicpu(p).oper[2]^.shifterop^.shiftimm);
+                  taicpu(hp1).oper[1]^.reg:=taicpu(p).oper[1]^.reg;
+                  DebugMsg('Peephole Optimization: ShiftShiftShift2ShiftShift 2 done', p);
+                  if taicpu(hp1).oper[2]^.shifterop^.shiftimm=0 then
+                    begin
+                      taicpu(hp2).oper[1]^.reg:=taicpu(hp1).oper[1]^.reg;
+                      asml.remove(hp1);
+                      hp1.free;
+                    end;
 
-                    if MatchInstruction(p, [A_ADD,A_SUB], [C_None], [PF_None]) and
-                      LookForPreindexedPattern(taicpu(p)) then
-                      begin
-                        GetNextInstruction(p,hp1);
-                        DebugMsg('Peephole Add/Sub to Preindexed done', p);
-                        asml.remove(p);
-                        p.free;
-                        p:=hp1;
-                        Result:=true;
-                      end;
-                    {
-                     Turn
-                     mul reg0, z,w
-                     sub/add x, y, reg0
-                     dealloc reg0
-
-                     into
-
-                     mls/mla x,z,w,y
-                     }
-                    if MatchInstruction(p, [A_MUL], [C_None], [PF_None]) and
-                      (taicpu(p).ops=3) and
-                      (taicpu(p).oper[0]^.typ = top_reg) and
-                      (taicpu(p).oper[1]^.typ = top_reg) and
-                      (taicpu(p).oper[2]^.typ = top_reg) and
-                      GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[0]^.reg) and
-                      MatchInstruction(hp1,[A_ADD,A_SUB],[C_None],[PF_None]) and
-                      (not RegModifiedBetween(taicpu(p).oper[1]^.reg, p, hp1)) and
-                      (not RegModifiedBetween(taicpu(p).oper[2]^.reg, p, hp1)) and
-
-                      (((taicpu(hp1).opcode=A_ADD) and (current_settings.cputype>=cpu_armv4)) or
-                       ((taicpu(hp1).opcode=A_SUB) and (current_settings.cputype in [cpu_armv6t2,cpu_armv7,cpu_armv7a,cpu_armv7r,cpu_armv7m,cpu_armv7em]))) and
-
-                      // CPUs before ARMv6 don't recommend having the same Rd and Rm for MLA.
-                      // TODO: A workaround would be to swap Rm and Rs
-                      (not ((taicpu(hp1).opcode=A_ADD) and (current_settings.cputype<=cpu_armv6) and MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[1]^))) and
-
-                      (((taicpu(hp1).ops=3) and
-                        (taicpu(hp1).oper[2]^.typ=top_reg) and
-                        ((MatchOperand(taicpu(hp1).oper[2]^, taicpu(p).oper[0]^.reg) and
-                          (not RegModifiedBetween(taicpu(hp1).oper[1]^.reg, p, hp1))) or
-                         ((MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^.reg) and
-                           (taicpu(hp1).opcode=A_ADD) and
-                           (not RegModifiedBetween(taicpu(hp1).oper[2]^.reg, p, hp1)))))) or
-                       ((taicpu(hp1).ops=2) and
-                        (taicpu(hp1).oper[1]^.typ=top_reg) and
-                        MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^.reg))) and
-                      (RegEndOfLife(taicpu(p).oper[0]^.reg,taicpu(hp1))) then
-                      begin
-                        if taicpu(hp1).opcode=A_ADD then
-                          begin
-                            taicpu(hp1).opcode:=A_MLA;
+                  RemoveCurrentp(p);
+                  result := true;
+                  Exit;
+                end;
+            end;
+        end;
 
-                            if taicpu(hp1).ops=3 then
-                              begin
-                                if MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^) then
-                                  oldreg:=taicpu(hp1).oper[2]^.reg
-                                else
-                                  oldreg:=taicpu(hp1).oper[1]^.reg;
-                              end
-                            else
-                              oldreg:=taicpu(hp1).oper[0]^.reg;
+      { All the optimisations from this point on require GetNextInstructionUsingReg
+        to return True }
+      if not (
+        GetNextInstructionUsingReg(p, hpfar1, taicpu(p).oper[0]^.reg) and
+        (hpfar1.typ = ait_instruction)
+      ) then
+        Exit;
+
+      { Change the common
+        mov r0, r0, lsr #xxx
+        and r0, r0, #yyy/bic r0, r0, #xxx
+
+        and remove the superfluous and/bic if possible
+
+        This could be extended to handle more cases.
+      }
+
+      { Change
+        mov rx, ry, lsr/ror #xxx
+        uxtb/uxth rz,rx/and rz,rx,0xFF
+        dealloc rx
+
+        to
+
+        uxtb/uxth rz,ry,ror #xxx
+      }
+      if (GenerateThumb2Code) and
+         (taicpu(p).ops=3) and
+         (taicpu(p).oper[2]^.typ = top_shifterop) and
+         (taicpu(p).oper[2]^.shifterop^.rs = NR_NO) and
+         (taicpu(p).oper[2]^.shifterop^.shiftmode in [SM_LSR,SM_ROR]) and
+         RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hpfar1)) then
+         begin
+           if MatchInstruction(hpfar1, A_UXTB, [C_None], [PF_None]) and
+             (taicpu(hpfar1).ops = 2) and
+             (taicpu(p).oper[2]^.shifterop^.shiftimm in [8,16,24]) and
+             MatchOperand(taicpu(hpfar1).oper[1]^, taicpu(p).oper[0]^.reg) then
+             begin
+               taicpu(hpfar1).oper[1]^.reg := taicpu(p).oper[1]^.reg;
+               taicpu(hpfar1).loadshifterop(2,taicpu(p).oper[2]^.shifterop^);
+               taicpu(hpfar1).oper[2]^.shifterop^.shiftmode:=SM_ROR;
+               taicpu(hpfar1).ops := 3;
 
-                            taicpu(hp1).loadreg(1,taicpu(p).oper[1]^.reg);
-                            taicpu(hp1).loadreg(2,taicpu(p).oper[2]^.reg);
-                            taicpu(hp1).loadreg(3,oldreg);
+               if not Assigned(hp1) then
+                 GetNextInstruction(p,hp1);
 
-                            DebugMsg('MulAdd2MLA done', p);
+               RemoveCurrentP(p, hp1);
 
-                            taicpu(hp1).ops:=4;
+               result:=true;
+               exit;
+             end
+           else if MatchInstruction(hpfar1, A_UXTH, [C_None], [PF_None]) and
+             (taicpu(hpfar1).ops=2) and
+             (taicpu(p).oper[2]^.shifterop^.shiftimm in [16]) and
+             MatchOperand(taicpu(hpfar1).oper[1]^, taicpu(p).oper[0]^.reg) then
+             begin
+               taicpu(hpfar1).oper[1]^.reg := taicpu(p).oper[1]^.reg;
+               taicpu(hpfar1).loadshifterop(2,taicpu(p).oper[2]^.shifterop^);
+               taicpu(hpfar1).oper[2]^.shifterop^.shiftmode:=SM_ROR;
+               taicpu(hpfar1).ops := 3;
 
-                            asml.remove(p);
-                            p.free;
-                            p:=hp1;
-                          end
-                        else
-                          begin
-                            taicpu(hp1).opcode:=A_MLS;
+               if not Assigned(hp1) then
+                 GetNextInstruction(p,hp1);
 
+               RemoveCurrentP(p, hp1);
 
-                            taicpu(hp1).loadreg(3,taicpu(hp1).oper[1]^.reg);
+               result:=true;
+               exit;
+             end
+           else if MatchInstruction(hpfar1, A_AND, [C_None], [PF_None]) and
+             (taicpu(hpfar1).ops = 3) and
+             (taicpu(hpfar1).oper[2]^.typ = top_const) and
+             (taicpu(hpfar1).oper[2]^.val = $FF) and
+             (taicpu(p).oper[2]^.shifterop^.shiftimm in [8,16,24]) and
+             MatchOperand(taicpu(hpfar1).oper[1]^, taicpu(p).oper[0]^.reg) then
+             begin
+               taicpu(hpfar1).ops := 3;
+               taicpu(hpfar1).opcode := A_UXTB;
+               taicpu(hpfar1).oper[1]^.reg := taicpu(p).oper[1]^.reg;
+               taicpu(hpfar1).loadshifterop(2,taicpu(p).oper[2]^.shifterop^);
+               taicpu(hpfar1).oper[2]^.shifterop^.shiftmode:=SM_ROR;
 
-                            if taicpu(hp1).ops=2 then
-                              taicpu(hp1).loadreg(1,taicpu(hp1).oper[0]^.reg)
-                            else
-                              taicpu(hp1).loadreg(1,taicpu(p).oper[2]^.reg);
+               if not Assigned(hp1) then
+                 GetNextInstruction(p,hp1);
 
-                            taicpu(hp1).loadreg(2,taicpu(p).oper[1]^.reg);
+               RemoveCurrentP(p, hp1);
 
-                            DebugMsg('MulSub2MLS done', p);
-                            AllocRegBetween(taicpu(hp1).oper[1]^.reg,p,hp1,UsedRegs);
-                            AllocRegBetween(taicpu(hp1).oper[2]^.reg,p,hp1,UsedRegs);
-                            AllocRegBetween(taicpu(hp1).oper[3]^.reg,p,hp1,UsedRegs);
+               result:=true;
+               exit;
+             end;
+         end;
 
-                            taicpu(hp1).ops:=4;
-                            RemoveCurrentP(p, hp1); // <-- Is this actually safe? hp1 is not necessarily the next instruction. [Kit]
-                          end;
+      { 2-operand mov optimisations }
+      if (taicpu(p).ops = 2) then
+        begin
+          {
+            This removes the mul from
+            mov rX,0
+            ...
+            mul ...,rX,...
+          }
+          if (taicpu(p).oper[1]^.typ = top_const) then
+            begin
+(*          if false and
+            (taicpu(p).oper[1]^.val=0) and
+            MatchInstruction(hpfar1, [A_MUL,A_MLA], [taicpu(p).condition], [taicpu(p).oppostfix]) and
+            (((taicpu(hpfar1).oper[1]^.typ=top_reg) and MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[1]^)) or
+             ((taicpu(hpfar1).oper[2]^.typ=top_reg) and MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[2]^))) then
+              begin
+                TransferUsedRegs(TmpUsedRegs);
+                UpdateUsedRegs(TmpUsedRegs, tai(p.next));
+                UpdateUsedRegs(TmpUsedRegs, tai(hpfar1.next));
+                DebugMsg('Peephole Optimization: MovMUL/MLA2Mov0 done', p);
+                if taicpu(hpfar1).opcode=A_MUL then
+                  taicpu(hpfar1).loadconst(1,0)
+                else
+                  taicpu(hpfar1).loadreg(1,taicpu(hpfar1).oper[3]^.reg);
+                taicpu(hpfar1).ops:=2;
+                taicpu(hpfar1).opcode:=A_MOV;
+                if not(RegUsedAfterInstruction(taicpu(p).oper[0]^.reg,hpfar1,TmpUsedRegs)) then
+                  RemoveCurrentP(p);
+                Result:=true;
+                exit;
+              end
+          else*) if (taicpu(p).oper[1]^.val=0) and
+              MatchInstruction(hpfar1, A_MLA, [taicpu(p).condition], [taicpu(p).oppostfix]) and
+              MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[3]^) then
+                begin
+                  TransferUsedRegs(TmpUsedRegs);
+                  UpdateUsedRegs(TmpUsedRegs, tai(p.next));
+                  UpdateUsedRegs(TmpUsedRegs, tai(hpfar1.next));
+                  DebugMsg('Peephole Optimization: MovMLA2MUL 1 done', p);
+                  taicpu(hpfar1).ops:=3;
+                  taicpu(hpfar1).opcode:=A_MUL;
+                  if not(RegUsedAfterInstruction(taicpu(p).oper[0]^.reg,hpfar1,TmpUsedRegs)) then
+                    begin
+                      RemoveCurrentP(p);
+                      Result:=true;
+                    end;
+                  exit;
+                end
+            {
+              This changes the very common
+              mov r0, #0
+              str r0, [...]
+              mov r0, #0
+              str r0, [...]
 
-                        result:=true;
-                      end
-                  end;
-{$ifdef dummy}
-                A_MVN:
-                  begin
-                    {
-                      change
-                      mvn reg2,reg1
-                      and reg3,reg4,reg2
-                      dealloc reg2
-                      to
-                      bic reg3,reg4,reg1
-                    }
-                    if (taicpu(p).oper[1]^.typ = top_reg) and
-                      GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[0]^.reg) and
-                      MatchInstruction(hp1,A_AND,[],[]) and
-                      (((taicpu(hp1).ops=3) and
-                        (taicpu(hp1).oper[2]^.typ=top_reg) and
-                        (MatchOperand(taicpu(hp1).oper[2]^, taicpu(p).oper[0]^.reg) or
-                         MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^.reg))) or
-                       ((taicpu(hp1).ops=2) and
-                        (taicpu(hp1).oper[1]^.typ=top_reg) and
-                        MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^.reg))) and
-                      assigned(FindRegDealloc(taicpu(p).oper[0]^.reg,tai(hp1.Next))) and
-                      { reg1 might not be modified inbetween }
-                      not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hp1)) then
-                      begin
-                        DebugMsg('Peephole MvnAnd2Bic done', p);
-                        taicpu(hp1).opcode:=A_BIC;
-
-                        if taicpu(hp1).ops=3 then
-                          begin
-                            if MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^.reg) then
-                              taicpu(hp1).loadReg(1,taicpu(hp1).oper[2]^.reg); // Swap operands
-
-                            taicpu(hp1).loadReg(2,taicpu(p).oper[1]^.reg);
-                          end
-                        else
-                          taicpu(hp1).loadReg(1,taicpu(p).oper[1]^.reg);
-                        GetNextInstruction(p, hp1);
-                        asml.remove(p);
-                        p.free;
-                        p:=hp1;
-                      end;
-                  end;
-{$endif dummy}
-                A_UXTB:
-                  Result:=OptPass1UXTB(p);
-                A_UXTH:
-                  Result:=OptPass1UXTH(p);
-                A_SXTB:
-                  Result:=OptPass1SXTB(p);
-                A_SXTH:
-                  Result:=OptPass1SXTH(p);
-                A_CMP:
-                  begin
-                    {
-                      change
-                      cmp   reg,const1
-                      moveq reg,const1
-                      movne reg,const2
-                      to
-                      cmp   reg,const1
-                      movne reg,const2
-                    }
-                    if (taicpu(p).oper[1]^.typ = top_const) and
-                       GetNextInstruction(p, hp1) and
-                       MatchInstruction(hp1, A_MOV, [C_EQ, C_NE], [PF_NONE]) and
-                       (taicpu(hp1).oper[1]^.typ = top_const) and
-                       GetNextInstruction(hp1, hp2) and
-                       MatchInstruction(hp2, A_MOV, [C_EQ, C_NE], [PF_NONE]) and
-                       (taicpu(hp1).oper[1]^.typ = top_const) then
-                      begin
-                        Result:=RemoveRedundantMove(p, hp1, asml) or Result;
-                        Result:=RemoveRedundantMove(p, hp2, asml) or Result;
-                      end;
-                  end;
-                A_STM:
-                  begin
-                    {
-                      change
-	              stmfd	r13!,[r14]
-	              sub	r13,r13,#4
-	              bl	abc
-	              add	r13,r13,#4
-	              ldmfd	r13!,[r15]
-                      into
-                      b         abc
-                    }
-                    if not(ts_thumb_interworking in current_settings.targetswitches) and
-                       MatchInstruction(p, A_STM, [C_None], [PF_FD]) and
-                      GetNextInstruction(p, hp1) and
+              and removes all superfluous mov instructions
+            }
+            else if (taicpu(hpfar1).opcode=A_STR) then
+              begin
+                hp1 := hpfar1;
+                while MatchInstruction(hp1, A_STR, [taicpu(p).condition], []) and
+                      MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[0]^) and
                       GetNextInstruction(hp1, hp2) and
-                      SkipEntryExitMarker(hp2, hp2) and
-                      GetNextInstruction(hp2, hp3) and
-                      SkipEntryExitMarker(hp3, hp3) and
-                      GetNextInstruction(hp3, hp4) and
-                      (taicpu(p).oper[0]^.typ = top_ref) and
-                      (taicpu(p).oper[0]^.ref^.index=NR_STACK_POINTER_REG) and
-                      (taicpu(p).oper[0]^.ref^.base=NR_NO) and
-                      (taicpu(p).oper[0]^.ref^.offset=0) and
-                      (taicpu(p).oper[0]^.ref^.addressmode=AM_PREINDEXED) and
-                      (taicpu(p).oper[1]^.typ = top_regset) and
-                      (taicpu(p).oper[1]^.regset^ = [RS_R14]) and
-
-                      MatchInstruction(hp1, A_SUB, [C_None], [PF_NONE]) and
-                      (taicpu(hp1).oper[0]^.typ = top_reg) and
-                      (taicpu(hp1).oper[0]^.reg = NR_STACK_POINTER_REG) and
-                      MatchOperand(taicpu(hp1).oper[0]^,taicpu(hp1).oper[1]^) and
-                      (taicpu(hp1).oper[2]^.typ = top_const) and
-
-                      MatchInstruction(hp3, A_ADD, [C_None], [PF_NONE]) and
-                      MatchOperand(taicpu(hp1).oper[0]^,taicpu(hp3).oper[0]^) and
-                      MatchOperand(taicpu(hp1).oper[0]^,taicpu(hp3).oper[1]^) and
-                      MatchOperand(taicpu(hp1).oper[2]^,taicpu(hp3).oper[2]^) and
-
-                      MatchInstruction(hp2, [A_BL,A_BLX], [C_None], [PF_NONE]) and
-                      (taicpu(hp2).oper[0]^.typ = top_ref) and
-
-                      MatchInstruction(hp4, A_LDM, [C_None], [PF_FD]) and
-                      MatchOperand(taicpu(p).oper[0]^,taicpu(hp4).oper[0]^) and
-                      (taicpu(hp4).oper[1]^.typ = top_regset) and
-                      (taicpu(hp4).oper[1]^.regset^ = [RS_R15]) then
-                      begin
-                        asml.Remove(p);
-                        asml.Remove(hp1);
-                        asml.Remove(hp3);
-                        asml.Remove(hp4);
-                        taicpu(hp2).opcode:=A_B;
-                        p.free;
-                        hp1.free;
-                        hp3.free;
-                        hp4.free;
-                        p:=hp2;
-                        DebugMsg('Peephole Bl2B done', p);
-                      end;
-                  end;
-                A_VMOV:
+                      MatchInstruction(hp2, A_MOV, [taicpu(p).condition], [PF_None]) and
+                      (taicpu(hp2).ops = 2) and
+                      MatchOperand(taicpu(hp2).oper[0]^, taicpu(p).oper[0]^) and
+                      MatchOperand(taicpu(hp2).oper[1]^, taicpu(p).oper[1]^) do
                   begin
-                    {
-                      change
-                      vmov reg0,reg1,reg2
-                      vmov reg1,reg2,reg0
-                      into
-                      vmov reg0,reg1,reg2
-
-                      can be applied regardless if reg0 or reg2 is the vfp register
-                    }
-                    if (taicpu(p).ops = 3) and
-                      GetNextInstruction(p, hp1) and
-                      MatchInstruction(hp1, A_VMOV, [taicpu(p).condition], [taicpu(p).oppostfix]) and
-                      (taicpu(hp1).ops = 3) and
-                      MatchOperand(taicpu(p).oper[0]^, taicpu(hp1).oper[2]^) and
-                      MatchOperand(taicpu(p).oper[1]^, taicpu(hp1).oper[0]^) and
-                      MatchOperand(taicpu(p).oper[2]^, taicpu(hp1).oper[1]^) then
-                      begin
-                        asml.Remove(hp1);
-                        hp1.free;
-                        DebugMsg('Peephole VMovVMov2VMov done', p);
-                      end;
+                    DebugMsg('Peephole Optimization: MovStrMov done', hp2);
+                    GetNextInstruction(hp2,hp1);
+                    asml.remove(hp2);
+                    hp2.free;
+                    result:=true;
+                    if not assigned(hp1) then break;
                   end;
-                A_AND:
-                  Result:=OptPass1And(p);
-                A_VLDR,
-                A_VADD,
-                A_VMUL,
-                A_VDIV,
-                A_VSUB,
-                A_VSQRT,
-                A_VNEG,
-                A_VCVT,
-                A_VABS:
-                  begin
-                    if GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
-                      RemoveSuperfluousVMov(p, hp1, 'VOpVMov2VOp') then
-                      Result:=true;
-                  end
-                else
-                  ;
+
+                if Result then
+                  Exit;
               end;
+            end;
+          {
+            This removes the first mov from
+            mov rX,...
+            mov rX,...
+          }
+          if taicpu(hpfar1).opcode=A_MOV then
+            begin
+              hp1 := p;
+              while MatchInstruction(hpfar1, A_MOV, [taicpu(hp1).condition], [taicpu(hp1).oppostfix]) and
+                    (taicpu(hpfar1).ops = 2) and
+                    MatchOperand(taicpu(hp1).oper[0]^, taicpu(hpfar1).oper[0]^) and
+                    { don't remove the first mov if the second is a mov rX,rX }
+                    not(MatchOperand(taicpu(hpfar1).oper[0]^, taicpu(hpfar1).oper[1]^)) do
+                begin
+                  { Defer removing the first p until after the while loop }
+                  if p <> hp1 then
+                    begin
+                      DebugMsg('Peephole Optimization: MovMov done', hp1);
+                      asml.remove(hp1);
+                      hp1.free;
+                    end;
+                  hp1:=hpfar1;
+                  GetNextInstruction(hpfar1,hpfar1);
+                  result:=true;
+                  if not assigned(hpfar1) then
+                    Break;
+                end;
+
+              if Result then
+                begin
+                  DebugMsg('Peephole Optimization: MovMov done', p);
+                  RemoveCurrentp(p);
+                  Exit;
+                end;
+            end;
+
+          if RedundantMovProcess(p,hpfar1) then
+            begin
+              Result:=true;
+              { p might not point at a mov anymore }
+              exit;
+            end;
+
+          { Fold the very common sequence
+              mov  regA, regB
+              ldr* regA, [regA]
+            to
+              ldr* regA, [regB]
+            CAUTION! If this one is successful p might not be a mov instruction anymore!
+          }
+          if
+             // Make sure that Thumb code doesn't propagate a high register into a reference
+             (
+               (
+                 GenerateThumbCode and
+                 (getsupreg(taicpu(p).oper[1]^.reg) < RS_R8)
+               ) or (not GenerateThumbCode)
+             ) and
+             (taicpu(p).oper[1]^.typ = top_reg) and
+             (taicpu(p).oppostfix = PF_NONE) and
+             MatchInstruction(hpfar1, [A_LDR, A_STR], [taicpu(p).condition], []) and
+             (taicpu(hpfar1).oper[1]^.typ = top_ref) and
+             { We can change the base register only when the instruction uses AM_OFFSET }
+             ((taicpu(hpfar1).oper[1]^.ref^.index = taicpu(p).oper[0]^.reg) or
+               ((taicpu(hpfar1).oper[1]^.ref^.addressmode = AM_OFFSET) and
+                (taicpu(hpfar1).oper[1]^.ref^.base = taicpu(p).oper[0]^.reg))
+             ) and
+             not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hpfar1)) and
+             RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hpfar1)) then
+            begin
+              DebugMsg('Peephole Optimization: MovLdr2Ldr done', hpfar1);
+              if (taicpu(hpfar1).oper[1]^.ref^.addressmode = AM_OFFSET) and
+                 (taicpu(hpfar1).oper[1]^.ref^.base = taicpu(p).oper[0]^.reg) then
+                taicpu(hpfar1).oper[1]^.ref^.base := taicpu(p).oper[1]^.reg;
+
+              if taicpu(hpfar1).oper[1]^.ref^.index = taicpu(p).oper[0]^.reg then
+                taicpu(hpfar1).oper[1]^.ref^.index := taicpu(p).oper[1]^.reg;
+
+              dealloc:=FindRegDeAlloc(taicpu(p).oper[1]^.reg, tai(p.Next));
+              if Assigned(dealloc) then
+                begin
+                  asml.remove(dealloc);
+                  asml.InsertAfter(dealloc,hpfar1);
+                end;
+
+              if not Assigned(hp1) then
+                GetNextInstruction(p, hp1);
+
+              RemoveCurrentP(p, hp1);
+
+              result:=true;
+              Exit;
+            end
+        end
+
+      { 3-operand mov optimisations }
+      else if (taicpu(p).ops = 3) then
+        begin
+
+          if (taicpu(p).oper[2]^.typ = top_shifterop) and
+            (taicpu(p).oper[2]^.shifterop^.rs = NR_NO) and
+            (taicpu(p).oper[2]^.shifterop^.shiftmode = SM_LSR) and
+            (taicpu(hpfar1).ops>=1) and
+            (taicpu(hpfar1).oper[0]^.typ=top_reg) and
+            (not RegModifiedBetween(taicpu(hpfar1).oper[0]^.reg, p, hpfar1)) and
+            RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hpfar1)) then
+            begin
+              if (taicpu(p).oper[2]^.shifterop^.shiftimm >= 24 ) and
+                MatchInstruction(hpfar1, A_AND, [taicpu(p).condition], [taicpu(p).oppostfix]) and
+                (taicpu(hpfar1).ops=3) and
+                MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[1]^) and
+                (taicpu(hpfar1).oper[2]^.typ = top_const) and
+                { Check if the AND actually would only mask out bits being already zero because of the shift
+                }
+                ((($ffffffff shr taicpu(p).oper[2]^.shifterop^.shiftimm) and taicpu(hpfar1).oper[2]^.val) =
+                  ($ffffffff shr taicpu(p).oper[2]^.shifterop^.shiftimm)) then
+                begin
+                  DebugMsg('Peephole Optimization: LsrAnd2Lsr done', hpfar1);
+                  taicpu(p).oper[0]^.reg:=taicpu(hpfar1).oper[0]^.reg;
+                  asml.remove(hpfar1);
+                  hpfar1.free;
+                  result:=true;
+                  Exit;
+                end
+              else if MatchInstruction(hpfar1, A_BIC, [taicpu(p).condition], [taicpu(p).oppostfix]) and
+                (taicpu(hpfar1).ops=3) and
+                MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[1]^) and
+                (taicpu(hpfar1).oper[2]^.typ = top_const) and
+                { Check if the BIC actually would only mask out bits being already zero because of the shift }
+                (taicpu(hpfar1).oper[2]^.val<>0) and
+                (BsfDWord(taicpu(hpfar1).oper[2]^.val)>=32-taicpu(p).oper[2]^.shifterop^.shiftimm) then
+                begin
+                  DebugMsg('Peephole Optimization: LsrBic2Lsr done', hpfar1);
+                  taicpu(p).oper[0]^.reg:=taicpu(hpfar1).oper[0]^.reg;
+                  asml.remove(hpfar1);
+                  hpfar1.free;
+                  result:=true;
+                  Exit;
+                end;
+            end;
+          { This folds shifterops into following instructions
+            mov r0, r1, lsl #8
+            add r2, r3, r0
+
+            to
+
+            add r2, r3, r1, lsl #8
+            CAUTION! If this one is successful p might not be a mov instruction anymore!
+          }
+          if (taicpu(p).oper[1]^.typ = top_reg) and
+           (taicpu(p).oper[2]^.typ = top_shifterop) and
+           (taicpu(p).oppostfix = PF_NONE) and
+           MatchInstruction(hpfar1, [A_ADD, A_ADC, A_RSB, A_RSC, A_SUB, A_SBC,
+                                  A_AND, A_BIC, A_EOR, A_ORR, A_TEQ, A_TST,
+                                  A_CMP, A_CMN],
+                            [taicpu(p).condition], [PF_None]) and
+           (not ((GenerateThumb2Code) and
+                 (taicpu(hpfar1).opcode in [A_SBC]) and
+                 (((taicpu(hpfar1).ops=3) and
+                   MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[1]^.reg)) or
+                  ((taicpu(hpfar1).ops=2) and
+                   MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[0]^.reg))))) and
+           RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hpfar1)) and
+           (taicpu(hpfar1).ops >= 2) and
+           {Currently we can't fold into another shifterop}
+           (taicpu(hpfar1).oper[taicpu(hpfar1).ops-1]^.typ = top_reg) and
+           {Folding rrx is problematic because of the C-Flag, as we currently can't check
+            NR_DEFAULTFLAGS for modification}
+           (
+             {Everything is fine if we don't use RRX}
+             (taicpu(p).oper[2]^.shifterop^.shiftmode <> SM_RRX) or
+             (
+               {If it is RRX, then check if we're just accessing the next instruction}
+               Assigned(hp1) and
+               (hpfar1 = hp1)
+             )
+           ) and
+           { reg1 must not be modified in between }
+           not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hpfar1)) and
+           { The shifterop can contain a register, which must not be modified }
+           (
+             (taicpu(p).oper[2]^.shifterop^.rs = NR_NO) or
+             not(RegModifiedBetween(taicpu(p).oper[2]^.shifterop^.rs, p, hpfar1))
+           ) and
+           (
+             {Only ONE of the two src operands is allowed to match}
+             MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[taicpu(hpfar1).ops-2]^) xor
+             MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[taicpu(hpfar1).ops-1]^)
+           ) then
+          begin
+            if taicpu(hpfar1).opcode in [A_TST, A_TEQ, A_CMN] then
+              I2:=0
+            else
+              I2:=1;
+            for I:=I2 to taicpu(hpfar1).ops-1 do
+              if MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[I]^.reg) then
+                begin
+                  { If the parameter matched on the second op from the RIGHT
+                    we have to switch the parameters, this will not happen for CMP
+                    where we're only evaluating the rightmost parameter
+                  }
+                  if I <> taicpu(hpfar1).ops-1 then
+                    begin
+                      {The SUB operators need to be changed when we swap parameters}
+                      case taicpu(hpfar1).opcode of
+                        A_SUB: tempop:=A_RSB;
+                        A_SBC: tempop:=A_RSC;
+                        A_RSB: tempop:=A_SUB;
+                        A_RSC: tempop:=A_SBC;
+                        else tempop:=taicpu(hpfar1).opcode;
+                      end;
+                      if taicpu(hpfar1).ops = 3 then
+                        hp2:=taicpu.op_reg_reg_reg_shifterop(tempop,
+                             taicpu(hpfar1).oper[0]^.reg, taicpu(hpfar1).oper[2]^.reg,
+                             taicpu(p).oper[1]^.reg, taicpu(p).oper[2]^.shifterop^)
+                      else
+                        hp2:=taicpu.op_reg_reg_shifterop(tempop,
+                             taicpu(hpfar1).oper[0]^.reg, taicpu(p).oper[1]^.reg,
+                             taicpu(p).oper[2]^.shifterop^);
+                    end
+                  else
+                    if taicpu(hpfar1).ops = 3 then
+                      hp2:=taicpu.op_reg_reg_reg_shifterop(taicpu(hpfar1).opcode,
+                           taicpu(hpfar1).oper[0]^.reg, taicpu(hpfar1).oper[1]^.reg,
+                           taicpu(p).oper[1]^.reg, taicpu(p).oper[2]^.shifterop^)
+                    else
+                      hp2:=taicpu.op_reg_reg_shifterop(taicpu(hpfar1).opcode,
+                           taicpu(hpfar1).oper[0]^.reg, taicpu(p).oper[1]^.reg,
+                           taicpu(p).oper[2]^.shifterop^);
+                  if taicpu(p).oper[2]^.shifterop^.rs<>NR_NO then
+                    AllocRegBetween(taicpu(p).oper[2]^.shifterop^.rs,p,hpfar1,UsedRegs);
+                  AllocRegBetween(taicpu(p).oper[1]^.reg,p,hpfar1,UsedRegs);
+                  asml.insertbefore(hp2, hpfar1);
+                  asml.remove(hpfar1);
+                  hpfar1.free;
+                  DebugMsg('Peephole Optimization: FoldShiftProcess done', hp2);
+
+                  if not Assigned(hp1) then
+                    GetNextInstruction(p, hp1)
+                  else if hp1 = hpfar1 then
+                    { If hp1 = hpfar1, then it's a dangling pointer }
+                    hp1 := hp2;
+
+                  RemoveCurrentP(p, hp1);
+                  Result:=true;
+                  Exit;
+                end;
           end;
-        else
-          ;
-      end;
+        {
+          Fold
+            mov r1, r1, lsl #2
+            ldr/ldrb r0, [r0, r1]
+          to
+            ldr/ldrb r0, [r0, r1, lsl #2]
+
+          XXX: This still needs some work, as we quite often encounter something like
+                 mov r1, r2, lsl #2
+                 add r2, r3, #imm
+                 ldr r0, [r2, r1]
+               which can't be folded because r2 is overwritten between the shift and the ldr.
+               We could try to shuffle the registers around and fold it into.
+                 add r1, r3, #imm
+                 ldr r0, [r1, r2, lsl #2]
+        }
+        if (not(GenerateThumbCode)) and
+          { thumb2 allows only lsl #0..#3 }
+          (not(GenerateThumb2Code) or
+           ((taicpu(p).oper[2]^.shifterop^.shiftimm in [0..3]) and
+            (taicpu(p).oper[2]^.shifterop^.shiftmode=SM_LSL)
+           )
+          ) and
+           (taicpu(p).oper[1]^.typ = top_reg) and
+           (taicpu(p).oper[2]^.typ = top_shifterop) and
+           { RRX is tough to handle, because it requires tracking the C-Flag,
+             it is also extremely unlikely to be emitted this way}
+           (taicpu(p).oper[2]^.shifterop^.shiftmode <> SM_RRX) and
+           (taicpu(p).oper[2]^.shifterop^.shiftimm <> 0) and
+           (taicpu(p).oppostfix = PF_NONE) and
+           {Only LDR, LDRB, STR, STRB can handle scaled register indexing}
+           (MatchInstruction(hpfar1, [A_LDR, A_STR], [taicpu(p).condition], [PF_None, PF_B]) or
+            (GenerateThumb2Code and
+             MatchInstruction(hpfar1, [A_LDR, A_STR], [taicpu(p).condition], [PF_None, PF_B, PF_SB, PF_H, PF_SH]))
+           ) and
+           (
+             {If this is address by offset, one of the two registers can be used}
+             ((taicpu(hpfar1).oper[1]^.ref^.addressmode=AM_OFFSET) and
+               (
+                 (taicpu(hpfar1).oper[1]^.ref^.index = taicpu(p).oper[0]^.reg) xor
+                 (taicpu(hpfar1).oper[1]^.ref^.base = taicpu(p).oper[0]^.reg)
+               )
+             ) or
+             {For post and preindexed only the index register can be used}
+             ((taicpu(hpfar1).oper[1]^.ref^.addressmode in [AM_POSTINDEXED, AM_PREINDEXED]) and
+               (
+                 (taicpu(hpfar1).oper[1]^.ref^.index = taicpu(p).oper[0]^.reg) and
+                 (taicpu(hpfar1).oper[1]^.ref^.base <> taicpu(p).oper[0]^.reg)
+               ) and
+               (not GenerateThumb2Code)
+             )
+           ) and
+           { Only fold if both registers are used. Otherwise we are folding p with itself }
+           (taicpu(hpfar1).oper[1]^.ref^.index<>NR_NO) and
+           (taicpu(hpfar1).oper[1]^.ref^.base<>NR_NO) and
+           { Only fold if there isn't another shifterop already, and offset is zero. }
+           (taicpu(hpfar1).oper[1]^.ref^.offset = 0) and
+           (taicpu(hpfar1).oper[1]^.ref^.shiftmode = SM_None) and
+           not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hpfar1)) and
+           RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hpfar1)) then
+           begin
+             { If the register we want to do the shift for resides in base, we need to swap that}
+             if (taicpu(hpfar1).oper[1]^.ref^.base = taicpu(p).oper[0]^.reg) then
+               taicpu(hpfar1).oper[1]^.ref^.base := taicpu(hpfar1).oper[1]^.ref^.index;
+             taicpu(hpfar1).oper[1]^.ref^.index := taicpu(p).oper[1]^.reg;
+             taicpu(hpfar1).oper[1]^.ref^.shiftmode := taicpu(p).oper[2]^.shifterop^.shiftmode;
+             taicpu(hpfar1).oper[1]^.ref^.shiftimm := taicpu(p).oper[2]^.shifterop^.shiftimm;
+             DebugMsg('Peephole Optimization: FoldShiftLdrStr done', hpfar1);
+             RemoveCurrentP(p);
+             Result:=true;
+             Exit;
+           end;
+        end;
+      {
+        Often we see shifts and then a superfluous mov to another register
+        In the future this might be handled in RedundantMovProcess when it uses RegisterTracking
+      }
+      if RemoveSuperfluousMove(p, hpfar1, 'MovMov2Mov') then
+        Result:=true;
+    end;
+
+
+  function TCpuAsmOptimizer.OptPass1MVN(var p: tai): Boolean;
+    var
+      hp1: tai;
+    begin
+      {
+        change
+        mvn reg2,reg1
+        and reg3,reg4,reg2
+        dealloc reg2
+        to
+        bic reg3,reg4,reg1
+
+        reg2 may be either source of a 3 operand and, or the source of the
+        2 operand form "and reg3,reg2". Result is True when p was removed.
+      }
+      Result := False;
+      if (taicpu(p).oper[1]^.typ = top_reg) and
+        { p gets removed by the fold, so it must be executed unconditionally
+          and nobody may rely on flags set by it (mvns) }
+        (taicpu(p).condition = C_None) and
+        (taicpu(p).oppostfix = PF_None) and
+        GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[0]^.reg) and
+        MatchInstruction(hp1,A_AND,[],[]) and
+        (((taicpu(hp1).ops=3) and
+          (taicpu(hp1).oper[2]^.typ=top_reg) and
+          { exactly one source may be reg2: for "and reg3,reg2,reg2" the
+            resulting bic would still read reg2 after the mvn is gone }
+          (MatchOperand(taicpu(hp1).oper[2]^, taicpu(p).oper[0]^.reg) xor
+           MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^.reg))) or
+         ((taicpu(hp1).ops=2) and
+          (taicpu(hp1).oper[1]^.typ=top_reg) and
+          MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^.reg) and
+          { in the 2 operand form the destination is also read, so it must
+            not be reg2 either }
+          not(MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[0]^.reg)))) and
+        assigned(FindRegDealloc(taicpu(p).oper[0]^.reg,tai(hp1.Next))) and
+        { reg1 must not be modified in between }
+        not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hp1)) then
+        begin
+          DebugMsg('Peephole Optimization: MvnAnd2Bic done', p);
+          taicpu(hp1).opcode:=A_BIC;
+
+          if taicpu(hp1).ops=3 then
+            begin
+              { bic is not commutative: the inverted value has to end up in
+                operand 2, so move the other source to operand 1 first }
+              if MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^.reg) then
+                taicpu(hp1).loadReg(1,taicpu(hp1).oper[2]^.reg); // Swap operands
+
+              taicpu(hp1).loadReg(2,taicpu(p).oper[1]^.reg);
+            end
+          else
+            taicpu(hp1).loadReg(1,taicpu(p).oper[1]^.reg);
+
+          RemoveCurrentp(p);
+          Result := True;
+          Exit;
+        end;
+    end;
+
+
+  function TCpuAsmOptimizer.OptPass1VMov(var p: tai): Boolean;
+    var
+      nextp: tai;
+    begin
+      {
+        A three operand vmov that is directly followed by the vmov moving
+        the same data back again, i.e.
+          vmov reg0,reg1,reg2
+          vmov reg1,reg2,reg0
+        is reduced to
+          vmov reg0,reg1,reg2
+
+        It does not matter whether reg0 or reg2 is the vfp register.
+        p itself is never touched, the function always returns False.
+      }
+      Result := False;
+
+      { only the three operand form is handled }
+      if taicpu(p).ops <> 3 then
+        Exit;
+
+      { keep dropping followers for as long as they are the exact reverse of p }
+      while GetNextInstruction(p, nextp) and
+        MatchInstruction(nextp, A_VMOV, [taicpu(p).condition], [taicpu(p).oppostfix]) and
+        (taicpu(nextp).ops = 3) and
+        MatchOperand(taicpu(p).oper[0]^, taicpu(nextp).oper[2]^) and
+        MatchOperand(taicpu(p).oper[1]^, taicpu(nextp).oper[0]^) and
+        MatchOperand(taicpu(p).oper[2]^, taicpu(nextp).oper[1]^) do
+        begin
+          asml.Remove(nextp);
+          nextp.free;
+          DebugMsg('Peephole Optimization: VMovVMov2VMov done', p);
+        end;
+    end;
+
+
+  function TCpuAsmOptimizer.OptPass1VOp(var p: tai): Boolean;
+    var
+      nextuse: tai;
+    begin
+      { Find the next instruction referencing the destination of p and hand
+        both to RemoveSuperfluousVMov; its verdict is the result. Without
+        such an instruction nothing is attempted. }
+      Result := False;
+      if GetNextInstructionUsingReg(p, nextuse, taicpu(p).oper[0]^.reg) then
+        Result := RemoveSuperfluousVMov(p, nextuse, 'VOpVMov2VOp');
+    end;
+
+
+  function TCpuAsmOptimizer.PeepHoleOptPass1Cpu(var p: tai): boolean;
+    begin
+      { Dispatcher of pass 1: p is routed to the handler registered for its
+        opcode and the handler's result is passed on. Everything that is not
+        an instruction, or has no handler, yields False. }
+      Result := False;
+      if p.typ <> ait_instruction then
+        Exit;
+
+      case taicpu(p).opcode of
+        A_CMP: Result := OptPass1CMP(p);
+        A_STR: Result := OptPass1STR(p);
+        A_LDR: Result := OptPass1LDR(p);
+        A_MOV: Result := OptPass1MOV(p);
+        A_AND: Result := OptPass1And(p);
+        A_ADD, A_SUB:
+          Result := OptPass1ADDSUB(p);
+        A_MUL: Result := OptPass1MUL(p);
+        { remaining data processing instructions share one handler }
+        A_ADC, A_RSB, A_RSC, A_SBC,
+        A_BIC, A_EOR, A_ORR,
+        A_MLA, A_MLS,
+        A_QADD, A_QADD16, A_QADD8,
+        A_QSUB, A_QSUB16, A_QSUB8,
+        A_QDADD, A_QDSUB, A_QASX, A_QSAX,
+        A_SHADD16, A_SHADD8, A_UHADD16, A_UHADD8,
+        A_SHSUB16, A_SHSUB8, A_UHSUB16, A_UHSUB8,
+        A_PKHTB, A_PKHBT,
+        A_SMUAD, A_SMUSD:
+          Result := OptPass1DataCheckMov(p);
+{$ifdef dummy}
+        { only compiled in when "dummy" is defined }
+        A_MVN: Result := OptPass1MVN(p);
+{$endif dummy}
+        A_UXTB: Result := OptPass1UXTB(p);
+        A_UXTH: Result := OptPass1UXTH(p);
+        A_SXTB: Result := OptPass1SXTB(p);
+        A_SXTH: Result := OptPass1SXTH(p);
+        A_STM: Result := OptPass1STM(p);
+        A_VMOV: Result := OptPass1VMov(p);
+        { vfp operations all go through the same handler }
+        A_VLDR, A_VADD, A_VMUL, A_VDIV, A_VSUB,
+        A_VSQRT, A_VNEG, A_VCVT, A_VABS:
+          Result := OptPass1VOp(p);
+        else
+          ;
+      end;
     end;
 
 

+ 16 - 8
compiler/armgen/aoptarm.pas

@@ -47,7 +47,7 @@ Type
     function OptPass1UXTH(var p: tai): Boolean;
     function OptPass1SXTB(var p: tai): Boolean;
     function OptPass1SXTH(var p: tai): Boolean;
-    function OptPass1And(var p: tai): Boolean;
+    function OptPass1And(var p: tai): Boolean; virtual;
   End;
 
   function MatchInstruction(const instr: tai; const op: TCommonAsmOps; const cond: TAsmConds; const postfix: TOpPostfixes): boolean;
@@ -170,18 +170,26 @@ Implementation
 
   function TARMAsmOptimizer.GetNextInstructionUsingReg(Current: tai;
     Out Next: tai; reg: TRegister): Boolean;
+    var
+      gniResult: Boolean;
     begin
       Next:=Current;
+      { True is only returned through the Exit below, i.e. when an
+        instruction referencing reg has been found; Next then points at it }
+      Result := False;
       repeat
-        Result:=GetNextInstruction(Next,Next);
-      until not (Result) or
-            not(cs_opt_level3 in current_settings.optimizerswitches) or
-            (Next.typ<>ait_instruction) or
-            RegInInstruction(reg,Next) or
-            is_calljmp(taicpu(Next).opcode)
+
+        gniResult:=GetNextInstruction(Next,Next);
+        if gniResult and RegInInstruction(reg,Next) then
+          { Found something }
+          Exit(True);
+
+      { give up (leaving Result False) when there is no further instruction,
+        when cs_opt_level3 is not enabled (so only the first candidate is
+        looked at), when Next is not an instruction or when it is a call/jump;
+        for ARM also when it modifies the PC }
+      until not gniResult or
+        not(cs_opt_level3 in current_settings.optimizerswitches) or
+        (Next.typ<>ait_instruction) or
+        is_calljmp(taicpu(Next).opcode)
 {$ifdef ARM}
-            or RegModifiedByInstruction(NR_PC,Next);
+        or RegModifiedByInstruction(NR_PC,Next)
 {$endif ARM}
+        ;
     end;
 
 

+ 2 - 0
compiler/htypechk.pas

@@ -1803,6 +1803,7 @@ implementation
                  mayberesettypeconvs;
                  exit;
                end;
+             arrayconstructorn,
              setconstn,
              stringconstn,
              guidconstn :
@@ -2106,6 +2107,7 @@ implementation
                  (tstringdef(def_to).encoding=tstringdef(p.resultdef).encoding) then
                 eq:=te_equal
             end;
+          formaldef,
           setdef :
             begin
               { set can also be a not yet converted array constructor }

+ 7 - 0
compiler/ncal.pas

@@ -1192,6 +1192,13 @@ implementation
                     (parasym.vardef.typ=setdef) then
                    inserttypeconv(left,parasym.vardef);
 
+                 { if an array constructor can be a set and it is passed to
+                   a formaldef, a set must be passed, see also issue #37796 }
+                 if (left.nodetype=arrayconstructorn) and
+                    (parasym.vardef.typ=formaldef) and
+                    (arrayconstructor_can_be_set(left)) then
+                   left:=arrayconstructor_to_set(left,false);
+
                  { set some settings needed for arrayconstructor }
                  if is_array_constructor(left.resultdef) then
                   begin

+ 2 - 1
compiler/nflw.pas

@@ -2193,7 +2193,8 @@ implementation
                     p2:=current_procinfo;
                     while true do
                       begin
-                        if (p2.flags*[pi_needs_implicit_finally,pi_uses_exceptions,pi_has_implicit_finally])<>[] then
+                        if ((cs_implicit_exceptions in current_settings.moduleswitches) and ((p2.flags*[pi_needs_implicit_finally,pi_has_implicit_finally])<>[])) or
+                        ((p2.flags*[pi_uses_exceptions])<>[]) then
                           Message(cg_e_goto_across_procedures_with_exceptions_not_allowed);
                         if labelsym.owner=p2.procdef.localst then
                           break;

+ 2 - 2
packages/fcl-passrc/src/pasresolver.pp

@@ -16315,7 +16315,7 @@ begin
                               ParamType,ConstraintClass,ErrorPos);
       exit(cIncompatible);
       end;
-    if TPasClassType(ParamType).ObjKind<>okClass then
+    if not (TPasClassType(ParamType).ObjKind in [okClass,okInterface]) then
       begin
       if ErrorPos<>nil then
         RaiseMsg(20190904175144,nXExpectedButYFound,sXExpectedButYFound,
@@ -29830,7 +29830,7 @@ begin
   Result:=nil;
   while ClassEl<>nil do
     begin
-    if IndexOfImplementedInterface(ClassEl,Intf)>=0 then
+    if (ClassEl=Intf) or (IndexOfImplementedInterface(ClassEl,Intf)>=0) then
       exit(ClassEl);
     ClassEl:=GetPasClassAncestor(ClassEl,true) as TPasClassType;
     end;

+ 11 - 5
packages/pastojs/src/fppas2js.pp

@@ -5658,12 +5658,18 @@ begin
         else
           if not (ConEl is TPasType) then
             RaiseNotYetImplemented(20191018180031,ConEl,GetObjPath(Param));
-          if ConEl is TPasClassType then
-            begin
-            if TPasClassType(ConEl).IsExternal then
-              TIName:=Pas2JSBuiltInNames[pbitnTIExtClass]
+          TypeEl:=ResolveAliasType(TPasType(ConEl));
+          if TypeEl is TPasClassType then
+            case TPasClassType(TypeEl).ObjKind of
+            okClass:
+              if TPasClassType(TypeEl).IsExternal then
+                TIName:=Pas2JSBuiltInNames[pbitnTIExtClass]
+              else
+                TIName:=Pas2JSBuiltInNames[pbitnTIClass];
+            okInterface:
+              TIName:=Pas2JSBuiltInNames[pbitnTIInterface];
             else
-              TIName:=Pas2JSBuiltInNames[pbitnTIClass];
+              RaiseNotYetImplemented(20200927100825,ConEl,GetObjPath(Param));
             end
           else
             RaiseNotYetImplemented(20191018180131,ConEl,GetObjPath(Param));

+ 41 - 0
packages/pastojs/tests/tcgenerics.pas

@@ -52,6 +52,7 @@ type
     // class interfaces
     procedure TestGen_ClassInterface_Corba;
     procedure TestGen_ClassInterface_InterfacedObject;
+    procedure TestGen_ClassInterface_COM_RTTI;
 
     // statements
     Procedure TestGen_InlineSpec_Constructor;
@@ -1478,6 +1479,46 @@ begin
     '']));
 end;
 
+// Generic method with an IInterface constraint, specialized with the
+// interface IAnt: TypeInfo(T) inside the specialization is expected to be
+// translated to the $rtti entry of IAnt.
+procedure TTestGenerics.TestGen_ClassInterface_COM_RTTI;
+begin
+  StartProgram(true,[supTInterfacedObject]);
+  // Pascal input
+  Add([
+  '{$mode delphi}',
+  'type',
+  '  TBird = class',
+  '    function Fly<T: IInterface>: T;',
+  '  end;',
+  '  IAnt = interface',
+  '    procedure InterfaceProc;',
+  '  end;',
+  'function TBird.Fly<T>: T;',
+  'begin',
+  '  if TypeInfo(T)=nil then ;',
+  'end;',
+  'var Bird: TBird;',
+  '  Ant: IAnt;',
+  'begin',
+  '  Ant := Bird.Fly<IAnt>;',
+  '']);
+  ConvertProgram;
+  // expected JavaScript: module statements first, then the $mod.$main body
+  CheckSource('TestGen_ClassInterface_COM_RTTI',
+    LinesToStr([ // statements
+    'rtl.createClass(this, "TBird", pas.system.TObject, function () {',
+    '  this.Fly$G1 = function () {',
+    '    var Result = null;',
+    '    if ($mod.$rtti["IAnt"] === null) ;',
+    '    return Result;',
+    '  };',
+    '});',
+    'rtl.createInterface(this, "IAnt", "{B9D0FF27-A446-3A1B-AA85-F167837AA297}", ["InterfaceProc"], pas.system.IUnknown);',
+    'this.Bird = null;',
+    'this.Ant = null;',
+    '']),
+    LinesToStr([ // $mod.$main
+    'rtl.setIntfP($mod, "Ant", $mod.Bird.Fly$G1(), true);',
+    '']));
+end;
+
 procedure TTestGenerics.TestGen_InlineSpec_Constructor;
 begin
   StartProgram(false);

+ 1 - 0
rtl/objpas/math.pp

@@ -509,6 +509,7 @@ function MaxValue(const data : PInteger; Const N : Integer) : Integer;
 
 { returns random values with gaussian distribution }
 function RandG(mean,stddev : float) : float;
+
 function RandomRange(const aFrom, aTo: Integer): Integer;
 function RandomRange(const aFrom, aTo: Int64): Int64;
 

+ 38 - 0
tests/webtbs/tw37796.pp

@@ -0,0 +1,38 @@
+{ Test tw37796: an array constructor that can be a set is passed to an
+  untyped (formal) const parameter and must arrive there as a set. }
+program tformal;
+{$mode objfpc}
+
+uses
+  sysutils;
+
+type
+  TFontStyle = (
+    fsItalic,
+    fsBold,
+    fsUnderlined,
+    fsStrikeOut
+  );
+  TFontStyles = set of TFontStyle;
+
+{ accumulates everything that was handed to Any }
+var aFS: TFontStyles;
+
+{ Interprets the untyped argument as TFontStyles, adds it to aFS and dumps
+  the first 32 bits of the argument in hex. }
+procedure Any(const Anything);
+begin
+  aFS:=aFS+TFontStyles(Anything);
+  Writeln(IntToHex(PLongInt(@Anything)^, 8));
+end;
+
+procedure DoIt;
+begin
+  { presumably the error message quoted below is the one reported in the
+    original issue for this call }
+  Any([fsItalic, fsBold]); //unit1.pas(31,25) Error: Variable identifier expected
+  { the set constructor must have been received as the set itself }
+  if aFS<>[fsItalic, fsBold] then
+    halt(1);
+  Any(Cardinal([fsItalic, fsBold])); //ok
+end;
+
+begin
+  aFS:=[];
+  writeln(Cardinal(aFS));
+  DoIt;
+  writeln(Cardinal(aFS));
+  writeln('ok');
+end.

+ 21 - 0
tests/webtbs/tw37823.pp

@@ -0,0 +1,21 @@
+{ Test tw37823: with implicit exception frames switched off, a goto from
+  inside a procedure that has a managed local variable to a label of the
+  main program is expected to compile. }
+{$MODE ISO}
+{$implicitExceptions off}
+{$Q+}
+{$R+}
+program gt;
+  label 1;
+  procedure jump;
+  var
+    a: integer;
+    { managed type, so the procedure has data that needs finalization }
+    b: rawbytestring;
+  begin
+    b := 'nanu';
+    writeln('nanu');
+    { leaves jump and continues at label 1 of the main program }
+    goto 1;
+  end;
+begin
+  jump;
+  { skipped, because jump does not return normally }
+  writeln('not jumped!');
+1:
+writeln('jumped!');
+end.