Browse Source

* ARMv7A / A64: Constant writes to memory merged to larger forms where possible

J. Gareth "Curious Kit" Moreton 3 years ago
parent
commit
2a50d5abf8
4 changed files with 1051 additions and 11 deletions
  1. 52 6
      compiler/aarch64/aoptcpu.pas
  2. 62 0
      compiler/arm/aoptcpu.pas
  3. 5 4
      compiler/arm/cpuinfo.pas
  4. 932 1
      compiler/armgen/aoptarm.pas

+ 52 - 6
compiler/aarch64/aoptcpu.pas

@@ -383,6 +383,9 @@ Implementation
       if inherited OptPass1STR(p) or
         LookForPostindexedPattern(p) then
         Exit(True);
+
+      if getsupreg(taicpu(p).oper[0]^.reg) = RS_WZR then
+        Result := TryConstMerge(p, nil);
     end;
 
 
@@ -645,10 +648,12 @@ Implementation
   function TCpuAsmOptimizer.OptPass1MOVZ(var p: tai): boolean;
     var
       hp1: tai;
-      ZeroReg: TRegister;
+      TargetReg: TRegister;
     begin
       Result := False;
       hp1 := nil;
+
+      TargetReg := taicpu(p).oper[0]^.reg;
       if (taicpu(p).oppostfix = PF_None) and (taicpu(p).condition = C_None) then
         begin
           if
@@ -658,7 +663,7 @@ Implementation
             not GetNextInstruction(p, hp1) or
             { MOVZ and MOVK/MOVN instructions undergo macro-fusion. }
             not MatchInstruction(hp1, [A_MOVK, A_MOVN], [C_None], [PF_None]) or
-            (taicpu(hp1).oper[0]^.reg <> taicpu(p).oper[0]^.reg) then
+            (taicpu(hp1).oper[0]^.reg <> TargetReg) then
             begin
               if (taicpu(p).oper[1]^.val = 0) then
                 begin
@@ -672,12 +677,11 @@ Implementation
                   }
                   DebugMsg(SPeepholeOptimization + 'Movz0ToMovZeroReg', p);
 
-                  { Make sure the zero register is the correct size }
-                  ZeroReg := taicpu(p).oper[0]^.reg;
-                  setsupreg(ZeroReg, RS_XZR);
+                  { Convert TargetReg to the correctly-sized zero register }
+                  setsupreg(TargetReg, RS_XZR);
 
                   taicpu(p).opcode := A_MOV;
-                  taicpu(p).loadreg(1, ZeroReg);
+                  taicpu(p).loadreg(1, TargetReg);
                   Result := True;
                   Exit;
                 end;
@@ -698,6 +702,48 @@ Implementation
               exit;
             end;
         end;
+
+      if (getsupreg(TargetReg) <= RS_X30) and { Mostly to play safe }
+        GetNextInstructionUsingReg(p, hp1, TargetReg) and
+        (hp1.typ = ait_instruction) then
+        begin
+          case taicpu(hp1).opcode of
+{$ifdef AARCH64}
+            A_MOVK:
+              { Try to avoid too much unnecessary processing by checking to see
+                if the register is 32-bit }
+              if (getsubreg(TargetReg) = R_SUBD) and
+                (taicpu(hp1).oper[0]^.reg = TargetReg) and
+                TryConstMerge(p, hp1) then
+                begin
+                  Result := True;
+                  Exit;
+                end;
+{$endif AARCH64}
+            A_STR:
+              {
+                With sequences such as:
+                  movz  w0,x
+                  strb  w0,[sp, #ofs]
+                  movz  w0,y
+                  strb  w0,[sp, #ofs+1]
+
+                Merge the constants to:
+                  movz  w0,x + (y shl 8)
+                  strh  w0,[sp, #ofs]
+
+                Only use the stack pointer or frame pointer and an even offset though
+                to guarantee alignment
+              }
+              if TryConstMerge(p, hp1) then
+                begin
+                  Result := True;
+                  Exit;
+                end;
+            else
+              ;
+          end;
+        end;
     end;
 
 

+ 62 - 0
compiler/arm/aoptcpu.pas

@@ -79,6 +79,7 @@ Type
     function OptPass1CMP(var p: tai): Boolean;
     function OptPass1STM(var p: tai): Boolean;
     function OptPass1MOV(var p: tai): Boolean;
+    function OptPass1MOVW(var p: tai): Boolean;
     function OptPass1MUL(var p: tai): Boolean;
     function OptPass1MVN(var p: tai): Boolean;
     function OptPass1VMov(var p: tai): Boolean;
@@ -1484,6 +1485,13 @@ Implementation
 
                     if Result then
                       Exit;
+
+                    { If no changes were made, now try constant merging }
+                    if TryConstMerge(p, hpfar1) then
+                      begin
+                        Result := True;
+                        Exit;
+                      end;
                   end;
                 end;
               {
@@ -1824,6 +1832,58 @@ Implementation
     end;
 
 
+  function TCpuAsmOptimizer.OptPass1MOVW(var p: tai): Boolean;
+    { Pass 1 peephole handler for a MOVW instruction at p.
+
+      Looks at the instruction that follows p and tries, in this order:
+        1. MOVW + MOVT writing the same register with the same condition:
+           if the combined 32-bit constant (or its bitwise complement) is
+           accepted by is_shifter_const, p is rewritten as a single MOV
+           (or MVN) and the MOVT is removed.
+        2. MOVW + STRH of the same register with the same condition:
+           the pair is handed to TryConstMerge.
+
+      Returns True if one of the above changed the instruction stream,
+      False otherwise. }
+    var
+      ThisReg: TRegister;
+      a: aint;
+      { Output parameter of is_shifter_const; its value is not used here }
+      imm_shift: byte;
+      hp1: tai;
+    begin
+      Result := False;
+      ThisReg := taicpu(p).oper[0]^.reg;
+      if GetNextInstruction(p, hp1) then
+        begin
+          { Can the MOVW/MOVT pair be represented by a single MOV instruction? }
+          if MatchInstruction(hp1, A_MOVT, [taicpu(p).condition], []) and
+            (taicpu(hp1).oper[0]^.reg = ThisReg) then
+            begin
+              { Low half comes from MOVW, high half from MOVT }
+              a := (aint(taicpu(p).oper[1]^.val) and $FFFF) or aint(taicpu(hp1).oper[1]^.val shl 16);
+
+              if is_shifter_const(a,imm_shift) then
+                begin
+                  DebugMsg(SPeepholeOptimization + 'MOVW/MOVT pair can encode value as a single MOV instruction (MovwMovT2Mov)', p);
+                  taicpu(p).opcode := A_MOV;
+                  taicpu(p).oper[1]^.val := a;
+                  RemoveInstruction(hp1);
+                  Result := True;
+                  Exit;
+                end
+              else if is_shifter_const(not(a),imm_shift) then
+                begin
+                  DebugMsg(SPeepholeOptimization + 'MOVW/MOVT pair can encode value as a single MVN instruction (MovwMovT2Mvn)', p);
+                  taicpu(p).opcode := A_MVN;
+                  taicpu(p).oper[1]^.val := not(a);
+                  RemoveInstruction(hp1);
+                  Result := True;
+                  Exit;
+                end;
+            end;
+
+          { A half-word store of the constant may be mergeable with a
+            following constant store }
+          if MatchInstruction(hp1, A_STR, [taicpu(p).condition], [PF_H]) and
+            (taicpu(hp1).oper[0]^.reg = ThisReg) and
+            TryConstMerge(p, hp1) then
+            begin
+              Result := True;
+              Exit;
+            end;
+        end;
+    end;
+
+
   function TCpuAsmOptimizer.OptPass1MVN(var p: tai): Boolean;
     var
       hp1: tai;
@@ -2351,6 +2411,8 @@ Implementation
               Result := OptPass1LDR(p);
             A_MOV:
               Result := OptPass1MOV(p);
+            A_MOVW:
+              Result := OptPass1MOVW(p);
             A_AND:
               Result := OptPass1And(p);
             A_ADD,

+ 5 - 4
compiler/arm/cpuinfo.pas

@@ -1101,7 +1101,8 @@ Const
        CPUARM_HAS_IDIV,
        CPUARM_HAS_THUMB_IDIV,
        CPUARM_HAS_THUMB2,
-       CPUARM_HAS_UMULL
+       CPUARM_HAS_UMULL,
+       CPUARM_HAS_EXTENDED_CONSTANTS  { has MOVW and MOVT instructions                    }
       );
 
    tfpuflags =
@@ -1132,9 +1133,9 @@ Const
        { cpu_armv6t2  } [CPUARM_HAS_THUMB,CPUARM_HAS_ALL_MEM,CPUARM_HAS_BX,CPUARM_HAS_BLX,CPUARM_HAS_BLX_LABEL,CPUARM_HAS_CLZ,CPUARM_HAS_EDSP,CPUARM_HAS_REV,CPUARM_HAS_RBIT,CPUARM_HAS_LDREX,CPUARM_HAS_THUMB2,CPUARM_HAS_UMULL],
        { cpu_armv6z   } [CPUARM_HAS_THUMB,CPUARM_HAS_ALL_MEM,CPUARM_HAS_BX,CPUARM_HAS_BLX,CPUARM_HAS_BLX_LABEL,CPUARM_HAS_CLZ,CPUARM_HAS_EDSP,CPUARM_HAS_REV,CPUARM_HAS_LDREX,CPUARM_HAS_UMULL],
        { cpu_armv6m   } [CPUARM_HAS_THUMB,CPUARM_HAS_ALL_MEM,CPUARM_HAS_BX,CPUARM_HAS_BLX,CPUARM_HAS_REV],
-       { the identifier armv7 is should not be used, it is considered being equal to armv7a }
-       { cpu_armv7    } [CPUARM_HAS_THUMB,CPUARM_HAS_ALL_MEM,CPUARM_HAS_BX,CPUARM_HAS_BLX,CPUARM_HAS_BLX_LABEL,CPUARM_HAS_CLZ,CPUARM_HAS_EDSP,CPUARM_HAS_REV,CPUARM_HAS_RBIT,CPUARM_HAS_LDREX,CPUARM_HAS_DMB,CPUARM_HAS_THUMB2,CPUARM_HAS_UMULL],
-       { cpu_armv7a   } [CPUARM_HAS_THUMB,CPUARM_HAS_ALL_MEM,CPUARM_HAS_BX,CPUARM_HAS_BLX,CPUARM_HAS_BLX_LABEL,CPUARM_HAS_CLZ,CPUARM_HAS_EDSP,CPUARM_HAS_REV,CPUARM_HAS_RBIT,CPUARM_HAS_LDREX,CPUARM_HAS_DMB,CPUARM_HAS_THUMB2,CPUARM_HAS_UMULL],
+       { the identifier armv7 should not be used; it is considered equal to armv7a }
+       { cpu_armv7    } [CPUARM_HAS_THUMB,CPUARM_HAS_ALL_MEM,CPUARM_HAS_BX,CPUARM_HAS_BLX,CPUARM_HAS_BLX_LABEL,CPUARM_HAS_CLZ,CPUARM_HAS_EDSP,CPUARM_HAS_REV,CPUARM_HAS_RBIT,CPUARM_HAS_LDREX,CPUARM_HAS_DMB,CPUARM_HAS_THUMB2,CPUARM_HAS_UMULL,CPUARM_HAS_EXTENDED_CONSTANTS],
+       { cpu_armv7a   } [CPUARM_HAS_THUMB,CPUARM_HAS_ALL_MEM,CPUARM_HAS_BX,CPUARM_HAS_BLX,CPUARM_HAS_BLX_LABEL,CPUARM_HAS_CLZ,CPUARM_HAS_EDSP,CPUARM_HAS_REV,CPUARM_HAS_RBIT,CPUARM_HAS_LDREX,CPUARM_HAS_DMB,CPUARM_HAS_THUMB2,CPUARM_HAS_UMULL,CPUARM_HAS_EXTENDED_CONSTANTS],
        { cpu_armv7r   } [CPUARM_HAS_THUMB,CPUARM_HAS_ALL_MEM,CPUARM_HAS_BX,CPUARM_HAS_BLX,CPUARM_HAS_BLX_LABEL,CPUARM_HAS_CLZ,CPUARM_HAS_EDSP,CPUARM_HAS_REV,CPUARM_HAS_RBIT,CPUARM_HAS_LDREX,CPUARM_HAS_THUMB_IDIV,CPUARM_HAS_DMB,CPUARM_HAS_THUMB2,CPUARM_HAS_UMULL],
        { cpu_armv7m   } [CPUARM_HAS_THUMB,CPUARM_HAS_ALL_MEM,CPUARM_HAS_BX,CPUARM_HAS_BLX,CPUARM_HAS_CLZ,CPUARM_HAS_EDSP,CPUARM_HAS_REV,CPUARM_HAS_RBIT,CPUARM_HAS_LDREX,CPUARM_HAS_THUMB_IDIV,CPUARM_HAS_DMB,CPUARM_HAS_THUMB2,CPUARM_HAS_UMULL],
        { cpu_armv7em  } [CPUARM_HAS_THUMB,CPUARM_HAS_ALL_MEM,CPUARM_HAS_BX,CPUARM_HAS_BLX,CPUARM_HAS_CLZ,CPUARM_HAS_EDSP,CPUARM_HAS_REV,CPUARM_HAS_RBIT,CPUARM_HAS_LDREX,CPUARM_HAS_THUMB_IDIV,CPUARM_HAS_DMB,CPUARM_HAS_THUMB2,CPUARM_HAS_UMULL]

+ 932 - 1
compiler/armgen/aoptarm.pas

@@ -61,6 +61,9 @@ Type
     function OptPass2Bitwise(var p: tai): Boolean;
     function OptPass2TST(var p: tai): Boolean;
 
+    { Common code that tries to merge constant writes to sequential memory }
+    function TryConstMerge(var p: tai; hp1: tai): Boolean;
+
   protected
     function DoXTArithOp(var p: tai; hp1: tai): Boolean;
   End;
@@ -81,7 +84,7 @@ Type
 Implementation
 
   uses
-    cutils,verbose,globals,
+    cutils,verbose,globals,aoptutils,
     systems,
     cpuinfo,
     cgobj,procinfo,
@@ -2003,5 +2006,933 @@ Implementation
         end;
     end;
 
+
+  function TARMAsmOptimizer.TryConstMerge(var p: tai; hp1: tai): Boolean;
+    const
+{$ifdef ARM}
+      LO_16_WRITE: TAsmOp = A_MOVW;
+      HI_16_WRITE: TAsmOp = A_MOVT;
+{$endif ARM}
+{$ifdef AARCH64}
+      LO_16_WRITE: TAsmOp = A_MOVZ;
+      HI_16_WRITE: TAsmOp = A_MOVK;
+{$endif AARCH64}
+    var
+      hp2, hp2_second, hp3, hp3_second, p_second, hp1_second: tai;
+      ThisReg: TRegister;
+      ThisRef: TReference;
+      so: TShifterOp;
+
+      procedure SearchAhead;
+        { Looks past the instruction group starting at p for a following
+          constant-write group and, if one is found, calls TryConstMerge on
+          it so that further merges can happen in the same pass.
+
+          Reads p, hp1 and ThisReg from the enclosing TryConstMerge.
+          Overwrites p_second and hp1_second (and, under AARCH64, possibly
+          hp1, hp2 and hp2_second).  The result of the nested TryConstMerge
+          call is discarded. }
+        begin
+          { If p.opcode = A_STR, then ThisReg will be NR_NO }
+          if (
+{$ifdef ARM}
+              { Test p itself here, as the AArch64 variant does.  p_second is
+                only assigned further down in this condition; on entry it may
+                still refer to an instruction that the caller has already
+                passed to RemoveInstruction, so it must not be read yet }
+              (p.typ = ait_instruction) and
+              (
+                (taicpu(p).opcode = A_MOV) or
+                (taicpu(p).opcode = A_MOVW)
+              )
+{$endif ARM}
+{$ifdef AARCH64}
+              MatchInstruction(p, A_MOVZ, []) or
+              (
+                MatchInstruction(p, A_STR, []) and
+                SetAndTest(p, hp1)
+              )
+{$endif AARCH64}
+            ) and
+            (
+              (
+                (ThisReg <> NR_NO) and
+                (
+{$ifdef AARCH64}
+                  (
+                    (getsubreg(ThisReg) = R_SUBD) and
+                    MatchInstruction(hp1, A_MOVK, []) and
+                    (taicpu(hp1).oper[0]^.reg = ThisReg) and
+                    GetNextInstruction(hp1, hp2) and
+                    MatchInstruction(hp2, A_STR, []) and
+                    (taicpu(hp2).oper[0]^.reg = ThisReg) and
+                    GetNextInstruction(hp2, p_second)
+                  ) or
+{$endif AARCH64}
+                  (
+                    MatchInstruction(hp1, A_STR{$ifdef ARM}, [taicpu(p).condition]{$endif ARM}, []) and
+                    (taicpu(hp1).oper[0]^.reg = ThisReg) and
+                    GetNextInstruction(hp1, p_second)
+                  )
+                )
+              ) or (
+                { Just search one ahead if ThisReg is NR_NO }
+                (ThisReg = NR_NO) and
+                GetNextInstruction(hp1, p_second)
+              )
+            ) and
+            (
+              (
+{$ifdef ARM}
+                (p_second.typ = ait_instruction) and
+                (taicpu(p_second).condition = taicpu(p).condition) and
+                (
+                  (taicpu(p_second).opcode = A_MOV) or
+                  (taicpu(p_second).opcode = A_MOVW)
+                ) and
+{$endif ARM}
+{$ifdef AARCH64}
+                MatchInstruction(p_second, A_MOVZ, []) and
+{$endif AARCH64}
+                { Don't use ThisReg because it may be NR_NO }
+                GetNextInstruction(p_second, hp1_second) and
+                (
+{$ifdef AARCH64}
+                  (
+                    MatchInstruction(hp1_second, A_MOVK, []) and
+                    GetNextInstruction(hp1_second, hp2_second) and
+                    MatchInstruction(hp2_second, A_STR, [PF_None])
+                  ) or
+{$endif AARCH64}
+                  MatchInstruction(hp1_second, A_STR{$ifdef ARM}, [taicpu(p).condition]{$endif ARM}, [])
+                )
+              )
+{$ifdef AARCH64}
+              or (
+                MatchInstruction(p_second, A_STR, []) and
+                (getsupreg(taicpu(p_second).oper[0]^.reg) = RS_WZR) and
+                { Negate the result because we're setting hp1_second to nil }
+                not SetAndTest(nil, hp1_second)
+              )
+{$endif AARCH64}
+            ) then
+            { The next group looks like a candidate; let TryConstMerge do the
+              detailed checks and the actual merge }
+            TryConstMerge(p_second, hp1_second);
+        end;
+
+    begin
+      Result := False;
+{$ifdef ARM}
+      { We need a Cortex-A ARM processor that supports MOVW and MOVT }
+      if not (CPUARM_HAS_EXTENDED_CONSTANTS in cpu_capabilities[current_settings.cputype]) then
+        Exit;
+{$endif ARM}
+
+      ThisReg := NR_NO; { Safe initialisation }
+
+      case taicpu(p).opcode of
+{$ifdef ARM}
+        A_MOV,
+        A_MOVW:
+          if (taicpu(p).opcode <> A_MOV) or (taicpu(p).oper[1]^.typ = top_const) then
+{$endif ARM}
+{$ifdef AARCH64}
+        A_MOVZ:
+{$endif AARCH64}
+          begin
+            ThisReg := taicpu(p).oper[0]^.reg;
+            if Assigned(hp1){$ifdef ARM} and (taicpu(hp1).condition = taicpu(p).condition){$endif ARM} then
+              case taicpu(hp1).opcode of
+                A_STR:
+                  if {$ifdef ARM}(taicpu(hp1).ops = 2) and {$endif ARM}SuperRegistersEqual(taicpu(hp1).oper[0]^.reg, ThisReg) then
+                    begin
+                      ThisRef := taicpu(hp1).oper[1]^.ref^;
+
+                      if (ThisRef.addressmode = AM_OFFSET) and
+                        (ThisRef.index = NR_NO) and
+                        { Only permit writes to the stack, since we can guarantee alignment with that }
+                        (
+                          (ThisRef.base = NR_STACK_POINTER_REG) or
+                          (ThisRef.base = current_procinfo.framepointer)
+                        ) then
+                        begin
+                          case taicpu(hp1).oppostfix of
+                            PF_B:
+                              {
+                                With sequences such as:
+                                  movz  w0,x
+                                  strb  w0,[sp, #ofs]
+                                  movz  w0,y
+                                  strb  w0,[sp, #ofs+1]
+
+                                Merge the constants to:
+                                  movz  w0,x + (y shl 8)
+                                  strh  w0,[sp, #ofs]
+
+                                Only use the stack pointer or frame pointer and an even offset though
+                                to guarantee alignment
+                              }
+                              if ((ThisRef.offset mod 2) = 0) and
+                                GetNextInstruction(hp1, p_second) and
+                                (p_second.typ = ait_instruction)
+{$ifdef ARM}
+                                and (taicpu(p_second).condition = taicpu(p).condition)
+{$endif ARM}
+                                then
+                                begin
+                                  case taicpu(p_second).opcode of
+{$ifdef ARM}
+                                    A_MOV,
+                                    A_MOVW:
+                                      if (taicpu(p_second).oppostfix = PF_None) and
+                                        ((taicpu(p_second).opcode <> A_MOV) or (taicpu(p_second).oper[1]^.typ = top_const)) then
+{$endif ARM}
+{$ifdef AARCH64}
+                                    A_MOVZ:
+{$endif AARCH64}
+                                      begin
+                                        if SuperRegistersEqual(taicpu(p_second).oper[0]^.reg, ThisReg) and
+                                          GetNextInstruction(p_second, hp1_second) and
+                                          MatchInstruction(hp1_second, A_STR{$ifdef ARM}, [taicpu(p).condition]{$endif ARM}, [PF_B]) and
+                                          SuperRegistersEqual(taicpu(hp1_second).oper[0]^.reg, ThisReg) then
+                                          begin
+                                            { Is the second storage location exactly one byte ahead? }
+                                            Inc(ThisRef.offset);
+                                            if RefsEqual(taicpu(hp1_second).oper[1]^.ref^, ThisRef) and
+                                              { The final safety check... make sure the register used
+                                                to store the constant isn't used afterwards }
+                                              RegEndOfLife(ThisReg, taicpu(hp1_second)) then
+                                              begin
+
+                                                { See if we can merge 4 bytes at once (this benefits ARM mostly, but provides a speed boost for AArch64 too) }
+                                                if GetNextInstruction(hp1_second, hp2) and
+                                                  (
+{$ifdef ARM}
+                                                    MatchInstruction(hp2, A_MOVW, [taicpu(p).condition], []) or
+{$endif ARM}
+                                                    (
+                                                      MatchInstruction(hp2, LO_16_WRITE{$ifdef ARM}, [taicpu(p).condition]{$endif ARM}, [])
+{$ifdef ARM}
+                                                      and (taicpu(hp2).oper[1]^.typ = top_const)
+{$endif ARM}
+                                                    )
+                                                  ) and
+                                                  SuperRegistersEqual(taicpu(hp2).oper[0]^.reg, ThisReg) and
+                                                  GetNextInstruction(hp2, hp2_second) and
+                                                  MatchInstruction(hp2_second, A_STR{$ifdef ARM}, [taicpu(p).condition]{$endif ARM}, [PF_B]) and
+                                                  SuperRegistersEqual(taicpu(hp2_second).oper[0]^.reg, ThisReg) and
+                                                  GetNextInstruction(hp2_second, hp3) and
+                                                  (
+{$ifdef ARM}
+                                                    MatchInstruction(hp3, A_MOVW, [taicpu(p).condition], []) or
+{$endif ARM}
+                                                    (
+                                                      MatchInstruction(hp3, LO_16_WRITE{$ifdef ARM}, [taicpu(p).condition]{$endif ARM}, [])
+{$ifdef ARM}
+                                                      and (taicpu(hp3).oper[1]^.typ = top_const)
+{$endif ARM}
+                                                    )
+                                                  ) and
+                                                  SuperRegistersEqual(taicpu(hp3).oper[0]^.reg, ThisReg) and
+                                                  GetNextInstruction(hp3, hp3_second) and
+                                                  MatchInstruction(hp3_second, A_STR{$ifdef ARM}, [taicpu(p).condition]{$endif ARM}, [PF_B]) and
+                                                  SuperRegistersEqual(taicpu(hp3_second).oper[0]^.reg, ThisReg) then
+                                                  begin
+                                                    Inc(ThisRef.offset);
+                                                    if RefsEqual(taicpu(hp2_second).oper[1]^.ref^, ThisRef) then
+                                                      begin
+                                                        Inc(ThisRef.offset);
+                                                        if RefsEqual(taicpu(hp3_second).oper[1]^.ref^, ThisRef) then
+                                                          begin
+                                                            { Merge the constants }
+                                                            DebugMsg(SPeepholeOptimization + 'Merged four byte-writes to memory into a single word-write (MovzStrbMovzStrbMovzStrbMovzStrb2MovzMovkStr)', p);
+{$ifdef ARM}
+                                                            taicpu(p).opcode := A_MOVW;
+{$endif ARM}
+                                                            taicpu(p).oper[1]^.val := (taicpu(p).oper[1]^.val and $FF) or ((taicpu(p_second).oper[1]^.val and $FF) shl 8);
+
+                                                            taicpu(hp2).opcode := HI_16_WRITE;
+                                                            taicpu(hp2).oper[1]^.val := (taicpu(hp2).oper[1]^.val and $FF) or ((taicpu(hp3).oper[1]^.val and $FF) shl 8);
+
+                                                            so.shiftimm := 16;
+                                                            so.shiftmode := SM_LSL;
+                                                            taicpu(hp2).loadshifterop(2, so);
+                                                            taicpu(hp2).ops := 3;
+
+                                                            taicpu(hp1).oppostfix := PF_None;
+
+                                                            AsmL.Remove(hp2);
+                                                            AsmL.InsertAfter(hp2, p);
+
+                                                            RemoveInstruction(p_second);
+                                                            RemoveInstruction(hp1_second);
+                                                            RemoveInstruction(hp2_second);
+                                                            RemoveInstruction(hp3);
+                                                            RemoveInstruction(hp3_second);
+                                                            Result := True;
+{$ifdef AARCH64}
+                                                            { Searching ahead only benefits AArch64 here }
+                                                            SearchAhead;
+{$endif AARCH64}
+                                                            Exit;
+                                                          end;
+                                                        { Reset the offset so the range check below is correct }
+                                                        Dec(ThisRef.offset);
+                                                      end;
+                                                    Dec(ThisRef.offset);
+                                                  end;
+{$ifdef ARM}
+                                                { Be careful.  strb and str support offsets between -4095 and +4095, but
+                                                  strh only supports offsets between -255 and +255.  However, we might be
+                                                  able to bypass this if there are four bytes in a row (for AArch64, just
+                                                  use SearchAhead below) }
+                                                if { Remember we added 1 to the offset }
+                                                  (ThisRef.offset >= -254) and (ThisRef.offset <= 256) then
+{$endif ARM}
+                                                  begin
+
+                                                    { Merge the constants and remove the second pair of instructions }
+                                                    DebugMsg(SPeepholeOptimization + 'Merged two byte-writes to memory into a single half-write (MovzStrbMovzStrb2MovzStrh)', p);
+{$ifdef ARM}
+                                                    taicpu(p).opcode := A_MOVW;
+{$endif ARM}
+                                                    taicpu(p).oper[1]^.val := (taicpu(p).oper[1]^.val and $FF) or ((taicpu(p_second).oper[1]^.val and $FF) shl 8);
+                                                    taicpu(hp1).oppostfix := PF_H;
+                                                    RemoveInstruction(p_second);
+                                                    RemoveInstruction(hp1_second);
+                                                    Result := True;
+                                                  end;
+                                              end;
+                                          end;
+                                      end;
+{$ifdef AARCH64}
+                                    A_STR:
+                                      { Sometimes, the second mov might not be present as we're writing the
+                                        zero register to the next address - that is:
+                                          movz  w0,x
+                                          strb  w0,[sp, #ofs]
+                                          strb  wzr,[sp, #ofs+1]
+
+                                        Which becomes:
+                                          movz  w0,x
+                                          strh  w0,[sp, #ofs]
+                                      }
+                                      if RegEndOfLife(ThisReg, taicpu(hp1)) and
+                                        (taicpu(p_second).oppostfix = PF_B) and
+                                        (getsupreg(taicpu(p_second).oper[0]^.reg) = RS_WZR) then
+                                        begin
+                                          { Is the second storage location exactly one byte ahead? }
+                                          Inc(ThisRef.offset);
+                                          if RefsEqual(taicpu(p_second).oper[1]^.ref^, ThisRef) then
+                                            begin
+                                              { Merge the constants and remove the second pair of instructions }
+                                              DebugMsg(SPeepholeOptimization + 'Merged a byte-write and a zero-register byte-write to memory into a single half-write (MovzStrbStrb2MovzStrh 1)', p);
+                                              taicpu(p).oper[1]^.val := taicpu(p).oper[1]^.val and $FF; { In case there's some extraneous bits }
+                                              taicpu(hp1).oppostfix := PF_H;
+                                              RemoveInstruction(p_second);
+                                              Result := True;
+                                            end;
+                                        end;
+{$endif AARCH64}
+                                    else
+                                      ;
+                                  end;
+
+                                  { Search ahead to see if more bytes are written individually,
+                                    because then we may be able to merge 4 bytes into a full
+                                    word write in a single pass }
+                                  if Result then
+                                    begin
+                                      SearchAhead;
+                                      Exit;
+                                    end;
+                                end;
+                            PF_H:
+                              {
+                                With sequences such as:
+                                  movz  w0,x
+                                  strh  w0,[sp, #ofs]
+                                  movz  w0,y
+                                  strh  w0,[sp, #ofs+2]
+
+                                Merge the constants to:
+                                  movz  w0,x
+                                  movk  w0,y,lsl #16
+                                  str   w0,[sp, #ofs]
+
+                                Only use the stack pointer or frame pointer and an offset
+                                that's a multiple of 4 though to guarantee alignment
+                              }
+                              if ((ThisRef.offset mod 4) = 0) and
+                                GetNextInstruction(hp1, p_second) and
+                                (p_second.typ = ait_instruction)
+{$ifdef ARM}
+                                and (taicpu(p_second).condition = taicpu(p).condition)
+{$endif ARM}
+                                then
+                                begin
+                                  case taicpu(p_second).opcode of
+{$ifdef ARM}
+                                    A_MOV,
+                                    A_MOVW:
+                                      if (taicpu(p).oppostfix = PF_None) and
+                                        ((taicpu(p).opcode <> A_MOV) or (taicpu(p).oper[1]^.typ = top_const)) then
+{$endif ARM}
+{$ifdef AARCH64}
+                                    A_MOVZ:
+{$endif AARCH64}
+                                      begin
+                                        if SuperRegistersEqual(taicpu(p_second).oper[0]^.reg, ThisReg) and
+                                          GetNextInstruction(p_second, hp1_second) and
+                                          MatchInstruction(hp1_second, A_STR{$ifdef ARM}, [taicpu(p).condition]{$endif ARM}, [PF_H]) and
+                                          SuperRegistersEqual(taicpu(hp1_second).oper[0]^.reg, ThisReg) then
+                                          begin
+                                            { Is the second storage location exactly two bytes (one half-word) ahead? }
+                                            Inc(ThisRef.offset, 2);
+                                            if RefsEqual(taicpu(hp1_second).oper[1]^.ref^, ThisRef) and
+                                              { The final safety check... make sure the register used
+                                                to store the constant isn't used afterwards }
+                                              RegEndOfLife(ThisReg, taicpu(hp1_second)) then
+                                              begin
+                                                { Merge the constants }
+                                                DebugMsg(SPeepholeOptimization + 'Merged two half-writes to memory into a single word-write (MovzStrhMovzStrh2MovzMovkStr)', p);
+
+                                                { Repurpose the second MOVZ instruction into a MOVK instruction }
+                                                if taicpu(p_second).oper[1]^.val = 0 then
+                                                  begin
+                                                    { Or just remove it if it's not needed }
+                                                    RemoveInstruction(p_second);
+{$ifdef ARM}
+                                                    { If within the range 0..255, MOV suffices (256 can also be encoded this way) }
+                                                    if (taicpu(p).oper[1]^.val < 0) or (taicpu(p).oper[1]^.val > 256) then
+                                                      taicpu(p).opcode := A_MOVW;
+{$endif ARM}
+                                                  end
+                                                else
+                                                  begin
+                                                    asml.Remove(p_second);
+                                                    asml.InsertAfter(p_second, p);
+{$ifdef ARM}
+                                                    taicpu(p).opcode := A_MOVW;
+{$endif ARM}
+                                                    taicpu(p_second).opcode := HI_16_WRITE;
+{$ifdef AARCH64}
+                                                    so.shiftmode := SM_LSL;
+                                                    so.shiftimm := 16;
+
+                                                    taicpu(p_second).ops := 3;
+                                                    taicpu(p_second).loadshifterop(2, so);
+
+                                                    { Make doubly sure we're only using the 32-bit register, otherwise STR could write 64 bits }
+                                                    setsubreg(ThisReg, R_SUBD);
+                                                    taicpu(p).oper[0]^.reg := ThisReg;
+                                                    taicpu(p_second).oper[0]^.reg := ThisReg;
+                                                    taicpu(hp1).oper[0]^.reg := ThisReg;
+{$endif AARCH64}
+                                                    { TODO: Confirm that the A_MOVZ / A_MOVK combination is the most efficient }
+                                                  end;
+
+                                                taicpu(hp1).oppostfix := PF_None;
+                                                RemoveInstruction(hp1_second);
+                                                Result := True;
+                                              end;
+                                          end;
+                                      end;
+{$ifdef AARCH64}
+                                    A_STR:
+                                      { Sometimes, the second mov might not be present as we're writing the
+                                        zero register to the next address - that is:
+                                          movz  w0,x
+                                          strh  w0,[sp, #ofs]
+                                          strh  wzr,[sp, #ofs+2]
+
+                                        Which becomes:
+                                          movz  w0,x
+                                          str   w0,[sp, #ofs]
+                                      }
+                                      if RegEndOfLife(ThisReg, taicpu(hp1)) and
+                                        (taicpu(p_second).oppostfix = PF_H) and
+                                        (getsupreg(taicpu(p_second).oper[0]^.reg) = RS_WZR) then
+                                        begin
+                                          { Is the second storage location exactly two bytes ahead? }
+                                          Inc(ThisRef.offset, 2);
+                                          if RefsEqual(taicpu(p_second).oper[1]^.ref^, ThisRef) then
+                                            begin
+                                              { Merge the constants and remove the second pair of instructions }
+                                              DebugMsg(SPeepholeOptimization + 'Merged a half-write and a zero-register half-write to memory into a single word-write (MovzStrhStrh2MovzStr)', p);
+
+                                              { Make doubly sure we're only using the 32-bit register, otherwise STR could write 64 bits }
+                                              setsubreg(ThisReg, R_SUBD);
+                                              taicpu(p).oper[0]^.reg := ThisReg;
+                                              taicpu(hp1).oper[0]^.reg := ThisReg;
+
+                                              taicpu(hp1).oppostfix := PF_None;
+                                              RemoveInstruction(p_second);
+                                              Result := True;
+                                            end;
+                                        end;
+{$endif AARCH64}
+                                    else
+                                      ;
+                                  end;
+{$ifdef AARCH64}
+                                  { Search ahead to see if more half-words are written
+                                    individually, because then we may be able to merge
+                                    4 half-words into a full extended write in a single pass }
+                                  if Result then
+                                    begin
+                                      SearchAhead;
+                                      Exit;
+                                    end;
+{$endif AARCH64}
+                                end;
+                            else
+                              ;
+                          end;
+                        end;
+                    end;
+{$ifdef AARCH64}
+                A_MOVK:
+                  if (getsubreg(ThisReg) = R_SUBD) and
+                    (taicpu(hp1).oper[0]^.reg = ThisReg) and
+                    (taicpu(hp1).ops = 3) and
+                    (taicpu(hp1).oper[2]^.shifterop^.shiftmode = SM_LSL) and
+                    (taicpu(hp1).oper[2]^.shifterop^.shiftimm = 16) and
+                    GetNextInstruction(hp1, hp2) and
+                    MatchInstruction(hp2, A_STR, [PF_None]) and
+                    (taicpu(hp2).oper[0]^.reg = ThisReg) then
+                    begin
+                      {
+                        With sequences such as:
+                          movz  w0,x
+                          movk  w0,y,lsl #16
+                          str   w0,[sp, #ofs]
+                          movz  w0,z
+                          movk  w0,q,lsl #16
+                          str   w0,[sp, #ofs+4]
+
+                        Merge the constants to:
+                          movz  x0,x
+                          movk  x0,y,lsl #16
+                          movk  x0,z,lsl #32
+                          movk  x0,q,lsl #48
+                          str   x0,[sp, #ofs]
+
+                        Only use the stack pointer or frame pointer and an offset
+                        that's a multiple of 8 though to guarantee alignment
+                      }
+                      ThisRef := taicpu(hp2).oper[1]^.ref^;
+                      if ((ThisRef.offset mod 8) = 0) and
+                        GetNextInstruction(hp2, p_second) and
+                        (p_second.typ = ait_instruction) then
+                        case taicpu(p_second).opcode of
+                          A_MOVZ:
+                            if (
+                                (taicpu(p_second).oper[0]^.reg = ThisReg) or
+                                (
+                                  RegEndOfLife(ThisReg, taicpu(hp2)) and
+                                  (getsubreg(taicpu(p_second).oper[0]^.reg) = R_SUBD)
+                                )
+                              ) and GetNextInstruction(p_second, hp1_second) then
+                              begin
+                                case taicpu(hp1_second).opcode of
+                                  A_MOVK:
+                                    if (taicpu(p_second).oper[1]^.val <= $FFFF) and
+                                      (taicpu(hp1_second).oper[0]^.reg = taicpu(p_second).oper[0]^.reg) and
+                                      (taicpu(hp1_second).ops = 3) and
+                                      (taicpu(hp1_second).oper[2]^.shifterop^.shiftmode = SM_LSL) and
+                                      (taicpu(hp1_second).oper[2]^.shifterop^.shiftimm = 16) and
+                                      GetNextInstruction(hp1_second, hp2_second) and
+                                      MatchInstruction(hp2_second, A_STR, [PF_None]) and
+                                      (taicpu(hp1_second).oper[0]^.reg = taicpu(p_second).oper[0]^.reg) then
+                                      begin
+                                        Inc(ThisRef.offset, 4);
+                                        if RefsEqual(taicpu(hp2_second).oper[1]^.ref^, ThisRef) and
+                                          { The final safety check... make sure the register used
+                                            to store the constant isn't used afterwards }
+                                          RegEndOfLife(taicpu(p_second).oper[0]^.reg, taicpu(hp2_second)) then
+                                          begin
+                                            DebugMsg(SPeepholeOptimization + 'Merged two word-writes to memory into a single extended-write (MovzMovkStrMovzMovkStr2MovzMovkMovkMovkStr)', p);
+
+                                            { Extend register to 64-bit and repurpose second MOVZ to a MOVK with lsl 32 }
+                                            setsubreg(ThisReg, R_SUBQ);
+
+                                            taicpu(p).oper[0]^.reg := ThisReg;
+                                            taicpu(hp1).oper[0]^.reg := ThisReg;
+
+                                            { If the 3rd half-word is zero, we can remove the instruction entirely }
+                                            if taicpu(p_second).oper[1]^.val = 0 then
+                                              RemoveInstruction(p_second)
+                                            else
+                                              begin
+                                                taicpu(p_second).oper[0]^.reg := ThisReg;
+                                                so.shiftimm := 32;
+                                                so.shiftmode := SM_LSL;
+                                                taicpu(p_second).opcode := A_MOVK;
+                                                taicpu(p_second).ops := 3;
+                                                taicpu(p_second).loadshifterop(2, so);
+                                                AsmL.Remove(p_second);
+                                                AsmL.InsertBefore(p_second, hp2);
+                                              end;
+
+                                            taicpu(hp1_second).oper[0]^.reg := ThisReg;
+                                            taicpu(hp1_second).oper[2]^.shifterop^.shiftimm := 48;
+                                            taicpu(hp2).oper[0]^.reg := ThisReg;
+
+                                            AsmL.Remove(hp1_second);
+                                            AsmL.InsertBefore(hp1_second, hp2);
+
+                                            RemoveInstruction(hp2_second);
+                                            Result := True;
+                                          end;
+                                      end;
+                                  else
+                                    ;
+                                end;
+                              end;
+                          A_STR:
+                            { Sometimes, the second mov might not be present as we're writing the
+                              zero register to the next address - that is:
+                                movz  w0,x
+                                movk  w0,y,lsl #16
+                                str   w0,[sp, #ofs]
+                                str   wzr,[sp, #ofs+4]
+
+                              Which becomes:
+                                movz  x0,x
+                                movk  x0,y,lsl #16
+                                str   x0,[sp, #ofs]
+                            }
+                            begin
+                              { p_second is the second store here; for the merge shown
+                                above to apply, it must be a plain 32-bit STR of the zero
+                                register to the word directly after the first store }
+                              { Don't need to check end-of-life because the upper 32 bits are zero
+                                and the overall value isn't being modified }
+                              if (taicpu(p_second).oppostfix = PF_None) and
+                                (taicpu(p_second).oper[0]^.reg = NR_WZR) then
+                                begin
+                                  { Is the second storage location exactly four bytes ahead? }
+                                  Inc(ThisRef.offset, 4);
+                                  if RefsEqual(taicpu(p_second).oper[1]^.ref^, ThisRef) then
+                                    begin
+                                      { Merge the constants and remove the second pair of instructions }
+                                      DebugMsg(SPeepholeOptimization + 'Merged a word-write and a zero-register word-write to memory into a single extended-write (MovzStrStr2MovzStr)', p);
+
+                                      setsubreg(taicpu(p).oper[0]^.reg, R_SUBQ);
+                                      setsubreg(taicpu(hp1).oper[0]^.reg, R_SUBQ);
+                                      setsubreg(taicpu(hp2).oper[0]^.reg, R_SUBQ);
+                                      RemoveInstruction(p_second);
+                                      Result := True;
+                                    end;
+                                end;
+                            end
+                          else
+                            ;
+                        end;
+                    end;
+{$endif AARCH64}
+                else
+                  ;
+              end;
+          end;
+{$ifdef AARCH64}
+        A_STR:
+          { hp1 is probably nil }
+          if getsupreg(taicpu(p).oper[0]^.reg) = RS_WZR then
+            begin
+              ThisRef := taicpu(p).oper[1]^.ref^;
+              if (ThisRef.addressmode = AM_OFFSET) and
+                (ThisRef.index = NR_NO) and
+                { Only permit writes to the stack, since we can guarantee alignment with that }
+                (
+                  (ThisRef.base = NR_STACK_POINTER_REG) or
+                  (ThisRef.base = current_procinfo.framepointer)
+                ) then
+                begin
+
+                  case taicpu(p).oppostfix of
+                    PF_B:
+                      {
+                        With sequences such as:
+                          strb  wzr,[sp, #ofs]
+                          movz  w0,x
+                          strb  w0,[sp, #ofs+1]
+
+                        Merge the constants to:
+                          movz  w0,x shl 8
+                          strh  w0,[sp, #ofs]
+
+                        Only use the stack pointer or frame pointer and an even offset though
+                        to guarantee alignment
+                      }
+                      if ((ThisRef.offset mod 2) = 0) and
+                        GetNextInstruction(p, p_second) and
+                        (p_second.typ = ait_instruction) then
+                        begin
+
+                          case taicpu(p_second).opcode of
+                            A_MOVZ:
+                              begin
+                                ThisReg := taicpu(p_second).oper[0]^.reg;
+                                if GetNextInstruction(p_second, hp1_second) and
+                                  MatchInstruction(hp1_second, A_STR, [PF_B]) and
+                                  SuperRegistersEqual(taicpu(hp1_second).oper[0]^.reg, ThisReg) then
+                                  begin
+                                    { Is the second storage location exactly one byte ahead? }
+                                    Inc(ThisRef.offset);
+                                    if RefsEqual(taicpu(hp1_second).oper[1]^.ref^, ThisRef) and
+                                      { The final safety check... make sure the register used
+                                        to store the constant isn't used afterwards }
+                                      RegEndOfLife(ThisReg, taicpu(hp1_second)) then
+                                      begin
+                                        { Merge the constants by repurposing the 2nd move, changing the register in the first STR and removing the second STR }
+                                        DebugMsg(SPeepholeOptimization + 'Merged a zero-register byte-write and a byte-write to memory into a single half-write (MovzStrbStrb2MovzStrh 2)', p);
+                                        taicpu(p_second).oper[1]^.val := (taicpu(p_second).oper[1]^.val and $FF) shl 8;
+
+                                        taicpu(hp1_second).oppostfix := PF_H;
+                                        Dec(taicpu(hp1_second).oper[1]^.ref^.offset, 1);
+
+                                        RemoveCurrentP(p, p_second);
+                                        Result := True;
+
+                                        hp1 := hp1_second; { So SearchAhead works properly below }
+                                      end;
+                                  end;
+                              end;
+                            A_STR:
+                              { Change:
+                                  strb  wzr,[sp, #ofs]
+                                  strb  wzr,[sp, #ofs+1]
+
+                                To:
+                                  strh  wzr,[sp, #ofs]
+                              }
+                              if (taicpu(p_second).oppostfix = PF_B) and
+                                (getsupreg(taicpu(p_second).oper[0]^.reg) = RS_WZR) then
+                                begin
+                                  { Is the second storage location exactly one byte ahead? }
+                                  Inc(ThisRef.offset);
+                                  if RefsEqual(taicpu(p_second).oper[1]^.ref^, ThisRef) then
+                                    begin
+                                      DebugMsg(SPeepholeOptimization + 'Merged two zero-register byte-writes to memory into a single zero-register half-write (StrbStrb2Strh)', p);
+                                      taicpu(p).oppostfix := PF_H;
+                                      RemoveInstruction(p_second);
+                                      Result := True;
+                                    end;
+                                end;
+                            else
+                              ;
+                          end;
+
+                          { Search ahead to see if more bytes are written individually,
+                            because then we may be able to merge 4 bytes into a full
+                            word write in a single pass }
+                          if Result then
+                            begin
+                              SearchAhead;
+                              Exit;
+                            end;
+                        end;
+                    PF_H:
+                      {
+                        With sequences such as:
+                          strh  wzr,[sp, #ofs]
+                          movz  w0,x
+                          strh  w0,[sp, #ofs+2]
+
+                        Merge the constants to:
+                          movz  w0,#0
+                          movk  w0,x,lsl #16
+                          str   w0,[sp, #ofs]
+
+                        Only use the stack pointer or frame pointer and an offset
+                        that's a multiple of 4 though to guarantee alignment
+                      }
+                      if ((ThisRef.offset mod 4) = 0) and
+                        GetNextInstruction(p, p_second) and
+                        (p_second.typ = ait_instruction) then
+                        begin
+                          case taicpu(p_second).opcode of
+                            A_MOVZ:
+                              begin
+                                ThisReg := taicpu(p_second).oper[0]^.reg;
+                                if GetNextInstruction(p_second, hp1_second) and
+                                  MatchInstruction(hp1_second, A_STR, [PF_H]) and
+                                  SuperRegistersEqual(taicpu(hp1_second).oper[0]^.reg, ThisReg) then
+                                  begin
+                                    { Is the second storage location exactly two bytes ahead? }
+                                    Inc(ThisRef.offset, 2);
+                                    if RefsEqual(taicpu(hp1_second).oper[1]^.ref^, ThisRef) and
+                                      { The final safety check... make sure the register used
+                                        to store the constant isn't used afterwards }
+                                      RegEndOfLife(ThisReg, taicpu(hp1_second)) then
+                                      begin
+
+                                        { Merge the constants }
+                                        DebugMsg(SPeepholeOptimization + 'Merged a zero-register half-write and a half-write to memory into a single word-write (StrhMovzStrh2MovzMovkStr)', p);
+
+                                        { Repurpose the first STR to a MOVZ instruction }
+                                        taicpu(p).opcode := A_MOVZ;
+                                        taicpu(p).oppostfix := PF_None;
+                                        taicpu(p).oper[0]^.reg := ThisReg;
+                                        taicpu(p).loadconst(1, 0);
+
+                                        so.shiftmode := SM_LSL;
+                                        so.shiftimm := 16;
+
+                                        taicpu(p_second).opcode := A_MOVK;
+                                        taicpu(p_second).ops := 3;
+                                        taicpu(p_second).loadshifterop(2, so);
+
+                                        { Make doubly sure we're only using the 32-bit register, otherwise STR could write 64 bits }
+                                        setsubreg(ThisReg, R_SUBD);
+                                        taicpu(p).oper[0]^.reg := ThisReg;
+                                        taicpu(p_second).oper[0]^.reg := ThisReg;
+                                        taicpu(hp1_second).oper[0]^.reg := ThisReg;
+
+                                        { TODO: Confirm that the A_MOVZ / A_MOVK combination is the most efficient }
+
+                                        taicpu(hp1_second).oppostfix := PF_None;
+                                        Dec(taicpu(hp1_second).oper[1]^.ref^.offset, 2);
+                                        Result := True;
+                                      end;
+                                  end;
+                              end;
+                            A_STR:
+                              { Change:
+                                  strh  wzr,[sp, #ofs]
+                                  strh  wzr,[sp, #ofs+2]
+
+                                To:
+                                  str   wzr,[sp, #ofs]
+                              }
+                              if (taicpu(p_second).oppostfix = PF_H) and
+                                (getsupreg(taicpu(p_second).oper[0]^.reg) = RS_WZR) then
+                                begin
+                                  { Is the second storage location exactly two bytes ahead? }
+                                  Inc(ThisRef.offset, 2);
+                                  if RefsEqual(taicpu(p_second).oper[1]^.ref^, ThisRef) then
+                                    begin
+                                      DebugMsg(SPeepholeOptimization + 'Merged two zero-register half-writes to memory into a single zero-register word-write (StrhStrh2Str)', p);
+
+                                      { Make doubly sure we're only using the 32-bit register, otherwise STR could write 64 bits }
+                                      taicpu(p).oper[0]^.reg := NR_WZR;
+
+                                      taicpu(p).oppostfix := PF_None;
+                                      RemoveInstruction(p_second);
+                                      Result := True;
+                                    end;
+                                end;
+                            else
+                              ;
+                          end;
+                        end;
+                    PF_None:
+                      {
+                        With sequences such as:
+                          str   wzr,[sp, #ofs]
+                          movz  w0,x
+                          movk  w0,y,lsl #16
+                          str   w0,[sp, #ofs+4]
+
+                        Merge the constants to:
+                          movz  x0,#0
+                          movk  x0,x,lsl #32
+                          movk  x0,y,lsl #48
+                          str   x0,[sp, #ofs]
+
+                        Only use the stack pointer or frame pointer and an offset
+                        that's a multiple of 8 though to guarantee alignment
+                      }
+                      if ((ThisRef.offset mod 8) = 0) and
+                        GetNextInstruction(p, p_second) and
+                        (p_second.typ = ait_instruction) then
+                        begin
+                          case taicpu(p_second).opcode of
+                            A_MOVZ:
+                              begin
+                                ThisReg := taicpu(p_second).oper[0]^.reg;
+                                if GetNextInstruction(p_second, hp1_second) and
+                                  MatchInstruction(hp1_second, A_MOVK, []) and
+                                  GetNextInstruction(hp1_second, hp2_second) and
+                                  MatchInstruction(hp2_second, A_STR, [PF_None]) and
+                                  (taicpu(hp2_second).oper[0]^.reg = ThisReg) then
+                                  begin
+                                    { Is the second storage location exactly four bytes ahead? }
+                                    Inc(ThisRef.offset, 4);
+                                    if RefsEqual(taicpu(hp2_second).oper[1]^.ref^, ThisRef) and
+                                      { The final safety check... make sure the register used
+                                        to store the constant isn't used afterwards }
+                                      RegEndOfLife(ThisReg, taicpu(hp1_second)) then
+                                      begin
+                                        { Merge the constants }
+                                        DebugMsg(SPeepholeOptimization + 'Merged a zero-register word-write and a word-write to memory into a single extended-write (StrMovzMovkStr2MovzMovkMovkStr)', p);
+
+                                        setsubreg(ThisReg, R_SUBQ);
+
+                                        { Repurpose the first STR to a MOVZ instruction }
+                                        taicpu(p).opcode := A_MOVZ;
+                                        taicpu(p).oppostfix := PF_None;
+                                        taicpu(p).oper[0]^.reg := ThisReg;
+                                        taicpu(p).loadconst(1, 0);
+
+                                        { If the 3rd half-word is zero, we can remove the instruction entirely }
+                                        if taicpu(p_second).oper[1]^.val = 0 then
+                                          RemoveInstruction(p_second)
+                                        else
+                                          begin
+                                            so.shiftmode := SM_LSL;
+                                            so.shiftimm := 32;
+                                            taicpu(p_second).opcode := A_MOVK;
+                                            taicpu(p_second).ops := 3;
+                                            taicpu(p_second).loadshifterop(2, so);
+                                            taicpu(p_second).oper[0]^.reg := ThisReg;
+                                          end;
+
+                                        taicpu(p).oper[0]^.reg := ThisReg;
+                                        taicpu(hp1_second).oper[0]^.reg := ThisReg;
+                                        taicpu(hp1_second).oper[2]^.shifterop^.shiftimm := 48;
+
+                                        { TODO: Confirm that the A_MOVZ / A_MOVK / A_MOVK combination is the most efficient }
+
+                                        taicpu(hp2_second).oppostfix := PF_None;
+                                        Dec(taicpu(hp2_second).oper[1]^.ref^.offset, 4);
+                                        taicpu(hp2_second).oper[0]^.reg := ThisReg; { Remember to change the register to its 64-bit counterpart }
+                                        Result := True;
+                                      end;
+                                  end;
+                              end;
+                            A_STR:
+                              { Change:
+                                  str   wzr,[sp, #ofs]
+                                  str   wzr,[sp, #ofs+4]
+
+                                To:
+                                  str   xzr,[sp, #ofs]
+                              }
+                              if (taicpu(p_second).oppostfix = PF_None) and
+                                (getsupreg(taicpu(p_second).oper[0]^.reg) = RS_WZR) then
+                                begin
+                                  { Is the second storage location exactly four bytes ahead? }
+                                  Inc(ThisRef.offset, 4);
+                                  if RefsEqual(taicpu(p_second).oper[1]^.ref^, ThisRef) then
+                                    begin
+                                      DebugMsg(SPeepholeOptimization + 'Merged two zero-register word-writes to memory into a single zero-register extended-write (StrStr2Str)', p);
+                                      taicpu(p).oper[0]^.reg := NR_XZR;
+                                      RemoveInstruction(p_second);
+                                      Result := True;
+                                    end;
+                                end;
+                            else
+                              ;
+                          end;
+                        end;
+                    else
+                      ;
+                  end;
+                end;
+            end;
+{$endif AARCH64}
+        else
+          ;
+      end;
+    end;
+
 end.