3 years ago · 6757bf3832
--- a/compiler/i386/aoptcpu.pas
+++ b/compiler/i386/aoptcpu.pas
@@ -191,6 +191,8 @@ unit aoptcpu;
 
				                   end;
			
 
				                 A_SHL, A_SAL:
			
 
				                   Result:=OptPass1SHLSAL(p);
			
 
				+                A_SHR:
			
 
				+                  Result:=OptPass1SHR(p);
			
 
				                 A_SUB:
			
 
				                   Result:=OptPass1Sub(p);
			
 
				                 A_Jcc:
			
--- a/compiler/x86/aoptx86.pas
+++ b/compiler/x86/aoptx86.pas
@@ -135,6 +135,7 @@ unit aoptx86;
 
				 
			
 
				         class function IsExitCode(p : tai) : boolean; static;
			
 
				         class function isFoldableArithOp(hp1 : taicpu; reg : tregister) : boolean; static;
			
 
				+        class function IsShrMovZFoldable(shr_size, movz_size: topsize; Shift: TCGInt): Boolean; static;
			
 
				         procedure RemoveLastDeallocForFuncRes(p : tai);
			
 
				 
			
 
				         function DoSubAddOpt(var p : tai) : Boolean;
			
@@ -157,6 +158,7 @@ unit aoptx86;
 
				         function OptPass1LEA(var p : tai) : boolean;
			
 
				         function OptPass1Sub(var p : tai) : boolean;
			
 
				         function OptPass1SHLSAL(var p : tai) : boolean;
			
 
				+        function OptPass1SHR(var p : tai) : boolean;
			
 
				         function OptPass1FSTP(var p : tai) : boolean;
			
 
				         function OptPass1FLD(var p : tai) : boolean;
			
 
				         function OptPass1Cmp(var p : tai) : boolean;
			
@@ -6448,6 +6450,146 @@ unit aoptx86;
 
				       end;
			
 
				 
			
 
				 
			
 
				+    class function TX86AsmOptimizer.IsShrMovZFoldable(shr_size, movz_size: topsize; Shift: TCGInt): Boolean;
			
 
				+      begin
			
 
				+        case shr_size of
			
 
				+          S_B:
			
 
				+            { No valid combinations }
			
 
				+            Result := False;
			
 
				+
			
 
				+          S_W:
			
 
				+            Result := (Shift >= 8) and (movz_size = S_BW);
			
 
				+
			
 
				+          S_L:
			
 
				+            Result :=
			
 
				+              (Shift >= 24) { Any opsize is valid for this shift } or
			
 
				+              ((Shift >= 16) and (movz_size = S_WL));
			
 
				+{$ifdef x86_64}
			
 
				+          S_Q:
			
 
				+            Result :=
			
 
				+              (Shift >= 56) { Any opsize is valid for this shift } or
			
 
				+              ((Shift >= 48) and (movz_size = S_WL));
			
 
				+{$endif x86_64}
			
 
				+          else
			
 
				+            InternalError(2022081510);
			
 
				+        end;
			
 
				+      end;
			
 
				+
			
 
				+    function TX86AsmOptimizer.OptPass1SHR(var p : tai) : boolean;
			
 
				+      var
			
 
				+        hp1, hp2: tai;
			
 
				+        Shift: TCGInt;
			
 
				+        LimitSize: Topsize;
			
 
				+        DoNotMerge: Boolean;
			
 
				+      begin
			
 
				+        Result := False;
			
 
				+
			
 
				+        { All these optimisations work on "shr const,%reg" }
			
 
				+        if not MatchOpType(taicpu(p), top_const, top_reg) then
			
 
				+          Exit;
			
 
				+
			
 
				+        DoNotMerge := False;
			
 
				+        Shift := taicpu(p).oper[0]^.val;
			
 
				+        LimitSize := taicpu(p).opsize;
			
 
				+
			
 
				+        hp1 := p;
			
 
				+        repeat
			
 
				+          if not GetNextInstructionUsingReg(hp1, hp1, taicpu(p).oper[1]^.reg) or (hp1.typ <> ait_instruction) then
			
 
				+            Exit;
			
 
				+
			
 
				+          case taicpu(hp1).opcode of
			
 
				+            A_TEST, A_CMP, A_Jcc:
			
 
				+              { Skip over conditional jumps and relevant comparisons }
			
 
				+              Continue;
			
 
				+
			
 
				+            A_MOVZX:
			
 
				+              if MatchOpType(taicpu(hp1), top_reg, top_reg) and
			
 
				+                SuperRegistersEqual(taicpu(hp1).oper[0]^.reg, taicpu(p).oper[1]^.reg) then
			
 
				+                begin
			
 
				+                  { Since the original register is being read as is, subsequent
			
 
				+                    SHRs must not be merged at this point }
			
 
				+                  DoNotMerge := True;
			
 
				+
			
 
				+                  if IsShrMovZFoldable(taicpu(p).opsize, taicpu(hp1).opsize, Shift) then
			
 
				+                    begin
			
 
				+                      if not SuperRegistersEqual(taicpu(hp1).oper[0]^.reg, taicpu(hp1).oper[1]^.reg) then { Different register target }
			
 
				+                        begin
			
 
				+                          DebugMsg(SPeepholeOptimization + 'Converted MOVZX instruction to MOV since previous SHR makes zero-extension unnecessary (ShrMovz2ShrMov 1)', hp1);
			
 
				+                          taicpu(hp1).opcode := A_MOV;
			
 
				+                          setsubreg(taicpu(hp1).oper[0]^.reg, getsubreg(taicpu(hp1).oper[1]^.reg));
			
 
				+                          case taicpu(hp1).opsize of
			
 
				+                            S_BW:
			
 
				+                              taicpu(hp1).opsize := S_W;
			
 
				+                            S_BL, S_WL:
			
 
				+                              taicpu(hp1).opsize := S_L;
			
 
				+                            else
			
 
				+                              InternalError(2022081503);
			
 
				+                          end;
			
 
				+
			
 
				+                          { p itself hasn't changed, so no need to set Result to True }
			
 
				+                          Include(OptsToCheck, aoc_ForceNewIteration);
			
 
				+
			
 
				+                          { See if there's anything afterwards that can be
			
 
				+                            optimised, since the input register hasn't changed }
			
 
				+                          Continue;
			
 
				+                        end;
			
 
				+
			
 
				+                      { NOTE: If the MOVZX instruction reads and writes the same
			
 
				+                        register, defer this to the post-peephole optimisation stage }
			
 
				+                      Exit;
			
 
				+                    end;
			
 
				+                end;
			
 
				+            A_SHL, A_SAL, A_SHR:
			
 
				+              if (taicpu(hp1).opsize <= LimitSize) and
			
 
				+                MatchOpType(taicpu(hp1), top_const, top_reg) and
			
 
				+                SuperRegistersEqual(taicpu(hp1).oper[1]^.reg, taicpu(p).oper[1]^.reg) then
			
 
				+                begin
			
 
				+                  { Make sure the sizes don't exceed the register size limit
			
 
				+                    (measured by the shift value falling below the limit) }
			
 
				+
			
 
				+                  if taicpu(hp1).opsize < LimitSize then
			
 
				+                    LimitSize := taicpu(hp1).opsize;
			
 
				+
			
 
				+                  if taicpu(hp1).opcode = A_SHR then
			
 
				+                    Inc(Shift, taicpu(hp1).oper[0]^.val)
			
 
				+                  else
			
 
				+                    begin
			
 
				+                      Dec(Shift, taicpu(hp1).oper[0]^.val);
			
 
				+                      DoNotMerge := True;
			
 
				+                    end;
			
 
				+
			
 
				+                  if Shift < topsize2memsize[taicpu(p).opsize] - topsize2memsize[LimitSize] then
			
 
				+                    Exit;
			
 
				+
			
 
				+                  { Since we've established that the combined shift is within
			
 
				+                    limits, we can actually combine the adjacent SHR
			
 
				+                    instructions even if they're different sizes }
			
 
				+                  if not DoNotMerge and (taicpu(hp1).opcode = A_SHR) then
			
 
				+                    begin
			
 
				+                      hp2 := tai(hp1.Previous);
			
 
				+                      DebugMsg(SPeepholeOptimization + 'ShrShr2Shr 1', p);
			
 
				+                      Inc(taicpu(p).oper[0]^.val, taicpu(hp1).oper[0]^.val);
			
 
				+                      RemoveInstruction(hp1);
			
 
				+                      hp1 := hp2;
			
 
				+
			
 
				+                      { Though p has changed, only the constant has, and its
			
 
				+                        effects can still be detected on the next iteration of
			
 
				+                        the repeat..until loop }
			
 
				+                      Include(OptsToCheck, aoc_ForceNewIteration);
			
 
				+                    end;
			
 
				+
			
 
				+                  { Move onto the next instruction }
			
 
				+                  Continue;
			
 
				+                end;
			
 
				+            else
			
 
				+              ;
			
 
				+          end;
			
 
				+
			
 
				+          Break;
			
 
				+        until False;
			
 
				+      end;
			
 
				+
			
 
				+
			
 
				     function TX86AsmOptimizer.CheckMemoryWrite(var first_mov, second_mov: taicpu): Boolean;
			
 
				       var
			
 
				         CurrentRef: TReference;
			
@@ -12930,36 +13072,193 @@ unit aoptx86;
 
				 
			
 
				     function TX86AsmOptimizer.PostPeepholeOptShr(var p : tai) : boolean;
			
 
				       var
			
 
				-        hp1: tai;
			
 
				+        hp1, hp2: tai;
			
 
				+        IdentityMask, Shift: TCGInt;
			
 
				+        LimitSize: Topsize;
			
 
				+        DoNotMerge: Boolean;
			
 
				       begin
			
 
				-        { Detect:
			
 
				-            shr    x,  %ax (x > 0)
			
 
				-            ...
			
 
				-            movzwl %ax,%eax
			
 
				+        Result := False;
			
 
				 
			
 
				-          Change movzwl %ax,%eax to cwtl (shorter encoding for movswl %ax,%eax)
			
 
				-        }
			
 
				+        { All these optimisations work on "shr const,%reg" }
			
 
				+        if not MatchOpType(taicpu(p), top_const, top_reg) then
			
 
				+          Exit;
			
 
				 
			
 
				-        Result := False;
			
 
				-        if MatchOpType(taicpu(p), top_const, top_reg) and
			
 
				-          (taicpu(p).oper[1]^.reg = NR_AX) and { This is also enough to determine that opsize = S_W }
			
 
				-          (taicpu(p).oper[0]^.val > 0) and
			
 
				-          GetNextInstructionUsingReg(p, hp1, NR_EAX) and
			
 
				-          MatchInstruction(hp1, A_MOVZX, [S_WL]) and
			
 
				-          MatchOperand(taicpu(hp1).oper[0]^, NR_AX) and
			
 
				-          MatchOperand(taicpu(hp1).oper[1]^, NR_EAX) then
			
 
				-          begin
			
 
				-            DebugMsg(SPeepholeOptimization + 'Converted movzwl %ax,%eax to cwtl (via ShrMovz2ShrCwtl)', hp1);
			
 
				-            taicpu(hp1).opcode := A_CWDE;
			
 
				-            taicpu(hp1).clearop(0);
			
 
				-            taicpu(hp1).clearop(1);
			
 
				-            taicpu(hp1).ops := 0;
			
 
				+        DoNotMerge := False;
			
 
				+        Shift := taicpu(p).oper[0]^.val;
			
 
				+        LimitSize := taicpu(p).opsize;
			
 
				 
			
 
				-            { A change was made, but not with p, so move forward 1 }
			
 
				-            p := tai(p.Next);
			
 
				-            Result := True;
			
 
				+        hp1 := p;
			
 
				+        repeat
			
 
				+          if not GetNextInstructionUsingReg(hp1, hp1, taicpu(p).oper[1]^.reg) or (hp1.typ <> ait_instruction) then
			
 
				+            Exit;
			
 
				+
			
 
				+          { Detect:
			
 
				+              shr x, %reg
			
 
				+              and y, %reg
			
 
				+
			
 
				+            If and y, %reg doesn't actually change the value of %reg (e.g. with
			
 
				+            "shrl $24,%reg; andl $255,%reg", remove the AND instruction.
			
 
				+          }
			
 
				+
			
 
				+          case taicpu(hp1).opcode of
			
 
				+            A_AND:
			
 
				+              if (taicpu(hp1).opsize = taicpu(p).opsize) and
			
 
				+                MatchOpType(taicpu(hp1), top_const, top_reg) and
			
 
				+                (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) then
			
 
				+                begin
			
 
				+                  { Make sure the FLAGS register isn't in use }
			
 
				+                  TransferUsedRegs(TmpUsedRegs);
			
 
				+                  hp2 := p;
			
 
				+                  repeat
			
 
				+                    UpdateUsedRegs(TmpUsedRegs, tai(hp2.Next));
			
 
				+                  until not GetNextInstruction(hp2, hp2) or (hp2 = hp1);
			
 
				+
			
 
				+                  if not RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp1, TmpUsedRegs) then
			
 
				+                    begin
			
 
				+                      { Generate the identity mask }
			
 
				+                      case taicpu(p).opsize of
			
 
				+                        S_B:
			
 
				+                          IdentityMask := $FF shr Shift;
			
 
				+                        S_W:
			
 
				+                          IdentityMask := $FFFF shr Shift;
			
 
				+                        S_L:
			
 
				+                          IdentityMask := $FFFFFFFF shr Shift;
			
 
				+{$ifdef x86_64}
			
 
				+                        S_Q:
			
 
				+                          { We need to force the operands to be unsigned 64-bit
			
 
				+                            integers otherwise the wrong value is generated }
			
 
				+                          IdentityMask := TCGInt(QWord($FFFFFFFFFFFFFFFF) shr QWord(Shift));
			
 
				+{$endif x86_64}
			
 
				+                        else
			
 
				+                          InternalError(2022081501);
			
 
				+                      end;
			
 
				+
			
 
				+                      if (taicpu(hp1).oper[0]^.val and IdentityMask) = IdentityMask then
			
 
				+                        begin
			
 
				+                          DebugMsg(SPeepholeOptimization + 'Removed AND instruction since previous SHR makes this an identity operation (ShrAnd2Shr)', hp1);
			
 
				+                          { All the possible 1 bits are covered, so we can remove the AND }
			
 
				+                          hp2 := tai(hp1.Previous);
			
 
				+                          RemoveInstruction(hp1);
			
 
				+
			
 
				+                          { p wasn't actually changed, so don't set Result to True,
			
 
				+                            but a change was nonetheless made elsewhere }
			
 
				+                          Include(OptsToCheck, aoc_ForceNewIteration);
			
 
				+
			
 
				+                          { Do another pass in case other AND or MOVZX instructions
			
 
				+                            follow }
			
 
				+                          hp1 := hp2;
			
 
				+                          Continue;
			
 
				+                        end;
			
 
				+
			
 
				+                    end;
			
 
				+                end;
			
 
				+
			
 
				+            A_TEST, A_CMP, A_Jcc:
			
 
				+              { Skip over conditional jumps and relevant comparisons }
			
 
				+              Continue;
			
 
				+
			
 
				+            A_MOVZX:
			
 
				+              if MatchOpType(taicpu(hp1), top_reg, top_reg) and
			
 
				+                SuperRegistersEqual(taicpu(hp1).oper[0]^.reg, taicpu(p).oper[1]^.reg) then
			
 
				+                begin
			
 
				+                  { Since the original register is being read as is, subsequent
			
 
				+                    SHRs must not be merged at this point }
			
 
				+                  DoNotMerge := True;
			
 
				+
			
 
				+                  if IsShrMovZFoldable(taicpu(p).opsize, taicpu(hp1).opsize, Shift) then
			
 
				+                    begin
			
 
				+                      if SuperRegistersEqual(taicpu(hp1).oper[0]^.reg, taicpu(hp1).oper[1]^.reg) then
			
 
				+                        begin
			
 
				+                          DebugMsg(SPeepholeOptimization + 'Removed MOVZX instruction since previous SHR makes it unnecessary (ShrMovz2Shr)', hp1);
			
 
				+                          { All the possible 1 bits are covered, so we can remove the AND }
			
 
				+                          hp2 := tai(hp1.Previous);
			
 
				+                          RemoveInstruction(hp1);
			
 
				+
			
 
				+                          hp1 := hp2;
			
 
				+                        end
			
 
				+                      else { Different register target }
			
 
				+                        begin
			
 
				+                          DebugMsg(SPeepholeOptimization + 'Converted MOVZX instruction to MOV since previous SHR makes zero-extension unnecessary (ShrMovz2ShrMov 2)', hp1);
			
 
				+                          taicpu(hp1).opcode := A_MOV;
			
 
				+                          setsubreg(taicpu(hp1).oper[0]^.reg, getsubreg(taicpu(hp1).oper[1]^.reg));
			
 
				+                          case taicpu(hp1).opsize of
			
 
				+                            S_BW:
			
 
				+                              taicpu(hp1).opsize := S_W;
			
 
				+                            S_BL, S_WL:
			
 
				+                              taicpu(hp1).opsize := S_L;
			
 
				+                            else
			
 
				+                              InternalError(2022081503);
			
 
				+                          end;
			
 
				+                        end;
			
 
				+                    end
			
 
				+                  else if (Shift > 0) and
			
 
				+                    (taicpu(p).opsize = S_W) and
			
 
				+                    (taicpu(hp1).opsize = S_WL) and
			
 
				+                    (taicpu(hp1).oper[0]^.reg = NR_AX) and
			
 
				+                    (taicpu(hp1).oper[1]^.reg = NR_EAX) then
			
 
				+                    begin
			
 
				+                      { Detect:
			
 
				+                          shr    x,  %ax (x > 0)
			
 
				+                          ...
			
 
				+                          movzwl %ax,%eax
			
 
				+
			
 
				+                        Change movzwl %ax,%eax to cwtl (shorter encoding for movswl %ax,%eax)
			
 
				+                      }
			
 
				+                      DebugMsg(SPeepholeOptimization + 'Converted movzwl %ax,%eax to cwtl (via ShrMovz2ShrCwtl)', hp1);
			
 
				+                      taicpu(hp1).opcode := A_CWDE;
			
 
				+                      taicpu(hp1).clearop(0);
			
 
				+                      taicpu(hp1).clearop(1);
			
 
				+                      taicpu(hp1).ops := 0;
			
 
				+                    end;
			
 
				+
			
 
				+                  { Move onto the next instruction }
			
 
				+                  Continue;
			
 
				+                end;
			
 
				+
			
 
				+            A_SHL, A_SAL, A_SHR:
			
 
				+              if (taicpu(hp1).opsize <= LimitSize) and
			
 
				+                MatchOpType(taicpu(hp1), top_const, top_reg) and
			
 
				+                SuperRegistersEqual(taicpu(hp1).oper[1]^.reg, taicpu(p).oper[1]^.reg) then
			
 
				+                begin
			
 
				+                  { Make sure the sizes don't exceed the register size limit
			
 
				+                    (measured by the shift value falling below the limit) }
			
 
				+
			
 
				+                  if taicpu(hp1).opsize < LimitSize then
			
 
				+                    LimitSize := taicpu(hp1).opsize;
			
 
				+
			
 
				+                  if taicpu(hp1).opcode = A_SHR then
			
 
				+                    Inc(Shift, taicpu(hp1).oper[0]^.val)
			
 
				+                  else
			
 
				+                    begin
			
 
				+                      Dec(Shift, taicpu(hp1).oper[0]^.val);
			
 
				+                      DoNotMerge := True;
			
 
				+                    end;
			
 
				+
			
 
				+                  if Shift < topsize2memsize[taicpu(p).opsize] - topsize2memsize[LimitSize] then
			
 
				+                    Exit;
			
 
				+
			
 
				+                  { Since we've established that the combined shift is within
			
 
				+                    limits, we can actually combine the adjacent SHR
			
 
				+                    instructions even if they're different sizes }
			
 
				+                  if not DoNotMerge and (taicpu(hp1).opcode = A_SHR) then
			
 
				+                    begin
			
 
				+                      hp2 := tai(hp1.Previous);
			
 
				+                      DebugMsg(SPeepholeOptimization + 'ShrShr2Shr 2', p);
			
 
				+                      Inc(taicpu(p).oper[0]^.val, taicpu(hp1).oper[0]^.val);
			
 
				+                      RemoveInstruction(hp1);
			
 
				+                      hp1 := hp2;
			
 
				+                    end;
			
 
				+
			
 
				+                  { Move onto the next instruction }
			
 
				+                  Continue;
			
 
				+                end;
			
 
				+            else
			
 
				+              ;
			
 
				           end;
			
 
				 
			
 
				+          Break;
			
 
				+        until False;
			
 
				+
			
 
				       end;
			
 
				 
			
 
				 
			
--- a/compiler/x86_64/aoptcpu.pas
+++ b/compiler/x86_64/aoptcpu.pas
@@ -130,6 +130,8 @@ uses
 
				                   result:=OptPass1Sub(p);
			
 
				                 A_SHL,A_SAL:
			
 
				                   result:=OptPass1SHLSAL(p);
			
 
				+                A_SHR:
			
 
				+                  result:=OptPass1SHR(p);
			
 
				                 A_FSTP,A_FISTP:
			
 
				                   result:=OptPass1FSTP(p);
			
 
				                 A_FLD: