@@ -2761,7 +2761,7 @@ unit aoptx86;
     var
       GetNextInstruction_p, TempRegUsed, CrossJump: Boolean;
       PreMessage, RegName1, RegName2, InputVal, MaskNum: string;
-      NewSize: topsize;
+      NewSize: topsize; NewOffset: asizeint;
       p_SourceReg, p_TargetReg, NewMMReg: TRegister;
       SourceRef, TargetRef: TReference;
       MovAligned, MovUnaligned: TAsmOp;
@@ -4609,54 +4609,155 @@ unit aoptx86;
                 exit;
               end;

-{$ifdef x86_64}
-            { Convert:
-                movq x(ref),%reg64
-                shrq y,%reg64
-              To:
-                movl x+4(ref),%reg32
-                shrl y-32,%reg32 (Remove if y = 32)
-            }
-            if (taicpu(p).opsize = S_Q) and
-              (taicpu(p).oper[0]^.typ = top_ref) and { Second operand will be a register }
-              (taicpu(p).oper[0]^.ref^.offset <= $7FFFFFFB) and
-              MatchInstruction(hp1, A_SHR, [taicpu(p).opsize]) and
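+            { Look for a MOV from a memory reference followed by a constant
+              SHR or SAR of the same operand size acting on the register
+              that was just loaded }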
+            if (taicpu(p).oper[0]^.typ = top_ref) and { Second operand will be a register }
+              MatchInstruction(hp1, A_SHR, A_SAR, [taicpu(p).opsize]) and
               MatchOpType(taicpu(hp1), top_const, top_reg) and
-              (taicpu(hp1).oper[0]^.val >= 32) and
               (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) then
               begin
                 RegName1 := debug_regname(taicpu(hp1).oper[1]^.reg);
-                PreMessage := 'movq ' + debug_operstr(taicpu(p).oper[0]^) + ',' + RegName1 + '; ' +
-                  'shrq $' + debug_tostr(taicpu(hp1).oper[0]^.val) + ',' + RegName1 + ' -> movl ';
+{$ifdef x86_64}
+                { Convert:
+                    movq x(ref),%reg64
+                    shrq y,%reg64
+                  To:
+                    movl x+4(ref),%reg32
+                    shrl y-32,%reg32 (Remove if y = 32)
+                }
+                if (taicpu(p).opsize = S_Q) and
+                  (taicpu(hp1).opcode = A_SHR) and
+                  (taicpu(hp1).oper[0]^.val >= 32) and
+                  { Make sure Inc(offset, 4) below cannot overflow the signed 32-bit displacement }
+                  (taicpu(p).oper[0]^.ref^.offset <= $7FFFFFFB) then
+                  begin
+                    PreMessage := 'movq ' + debug_operstr(taicpu(p).oper[0]^) + ',' + RegName1 + '; ' +
+                      'shrq $' + debug_tostr(taicpu(hp1).oper[0]^.val) + ',' + RegName1 + ' -> movl ';

-                { Convert to 32-bit }
-                setsubreg(taicpu(p).oper[1]^.reg, R_SUBD);
-                taicpu(p).opsize := S_L;
+                    { Convert to 32-bit }
+                    setsubreg(taicpu(p).oper[1]^.reg, R_SUBD);
+                    taicpu(p).opsize := S_L;

-                Inc(taicpu(p).oper[0]^.ref^.offset, 4);
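+                    { The upper 32 bits of the value sit 4 bytes further into
+                      the reference (x86 is little-endian) }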
+                    Inc(taicpu(p).oper[0]^.ref^.offset, 4);

-                PreMessage := PreMessage + debug_operstr(taicpu(p).oper[0]^) + ',' + debug_regname(taicpu(p).oper[1]^.reg);
-                if (taicpu(hp1).oper[0]^.val = 32) then
-                  begin
-                    DebugMsg(SPeepholeOptimization + PreMessage + ' (MovShr2Mov)', p);
-                    RemoveInstruction(hp1);
-                  end
+                    PreMessage := PreMessage + debug_operstr(taicpu(p).oper[0]^) + ',' + debug_regname(taicpu(p).oper[1]^.reg);
+                    if (taicpu(hp1).oper[0]^.val = 32) then
+                      begin
+                        DebugMsg(SPeepholeOptimization + PreMessage + ' (MovShr2Mov)', p);
+                        RemoveInstruction(hp1);
+                      end
+                    else
+                      begin
+                        { Reducing the shift to 32 bits potentially opens up
+                          further optimisations, since the peephole optimizer
+                          now knows that only the lower 32 bits are in use
+                          (and the 32-bit opcodes are smaller) }
+                        setsubreg(taicpu(hp1).oper[1]^.reg, R_SUBD);
+                        taicpu(hp1).opsize := S_L;
+
+                        Dec(taicpu(hp1).oper[0]^.val, 32);
+                        DebugMsg(SPeepholeOptimization + PreMessage +
+                          '; shrl $' + debug_tostr(taicpu(hp1).oper[0]^.val) + ',' + debug_regname(taicpu(hp1).oper[1]^.reg) + ' (MovShr2MovShr)', p);
+                      end;
+                    Result := True;
+                    Exit;
+                  end;
+{$endif x86_64}
+                { Convert:
+                    movl x(ref),%reg
+                    shrl $24,%reg
+                  To:
+                    movzbl x+3(ref),%reg
+
+                  Do the same for movl; shrl $16 -> movzwl, and for movw; shrw $8 -> movzbw
+
+                  Also accept sar instead of shr, but convert to movsx instead of movzx
+                }
+                if taicpu(hp1).opcode = A_SHR then
+                  MovUnaligned := A_MOVZX
                 else
+                  MovUnaligned := A_MOVSX;
+
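+                { Determine which movzx/movsx form matches this operand size
+                  and shift count; S_NO means there is no suitable form }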
+                NewSize := S_NO;
+                NewOffset := 0;
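+                { Each accepted shift count equals 8*NewOffset bits: only the
+                  bytes at ref+NewOffset and above survive the shift, so the
+                  load can simply start there instead (x86 is little-endian) }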
+                case taicpu(p).opsize of
+                  S_B:
+                    { No valid combinations };
+                  S_W:
+                    if (taicpu(hp1).oper[0]^.val = 8) then
+                      begin
+                        NewSize := S_BW;
+                        NewOffset := 1;
+                      end;
+                  S_L:
+                    case taicpu(hp1).oper[0]^.val of
+                      16:
+                        begin
+                          NewSize := S_WL;
+                          NewOffset := 2;
+                        end;
+                      24:
+                        begin
+                          NewSize := S_BL;
+                          NewOffset := 3;
+                        end;
+                      else
+                        ;
+                    end;
+{$ifdef x86_64}
+                  S_Q:
+                    case taicpu(hp1).oper[0]^.val of
+                      32:
+                        begin
+                          if taicpu(hp1).opcode = A_SAR then
+                            begin
+                              { 32-bit to 64-bit is a distinct instruction }
+                              MovUnaligned := A_MOVSXD;
+                              NewSize := S_LQ;
+                              NewOffset := 4;
+                            end
+                          else
+                            { Should have been handled by MovShr2Mov above }
+                            InternalError(2022081811);
+                        end;
+                      48:
+                        begin
+                          NewSize := S_WQ;
+                          NewOffset := 6;
+                        end;
+                      56:
+                        begin
+                          NewSize := S_BQ;
+                          NewOffset := 7;
+                        end;
+                      else
+                        ;
+                    end;
+{$endif x86_64}
+                  else
+                    InternalError(2022081810);
+                end;
+
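+                { Proceed only if a suitable form was found and the adjusted
+                  displacement still fits in a signed 32-bit offset }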
+                if (NewSize <> S_NO) and
+                  (taicpu(p).oper[0]^.ref^.offset <= $7FFFFFFF - NewOffset) then
                   begin
-                    { This will potentially open up more arithmetic operations since
-                      the peephole optimizer now has a big hint that only the lower
-                      32 bits are currently in use (and opcodes are smaller in size) }
-                    setsubreg(taicpu(hp1).oper[1]^.reg, R_SUBD);
-                    taicpu(hp1).opsize := S_L;
+                    PreMessage := 'mov' + debug_opsize2str(taicpu(p).opsize) + ' ' + debug_operstr(taicpu(p).oper[0]^) + ',' + RegName1 + '; ' +
+                      'shr' + debug_opsize2str(taicpu(p).opsize) + ' $' + debug_tostr(taicpu(hp1).oper[0]^.val) + ',' + RegName1 + ' -> ' +
+                      debug_op2str(MovUnaligned);
+
+{$ifdef x86_64}
+                    if MovUnaligned <> A_MOVSXD then
+                      { Don't add size suffix for MOVSXD }
+{$endif x86_64}
+                      PreMessage := PreMessage + debug_opsize2str(NewSize);

-                    Dec(taicpu(hp1).oper[0]^.val, 32);
-                    DebugMsg(SPeepholeOptimization + PreMessage +
-                      '; shrl $' + debug_tostr(taicpu(hp1).oper[0]^.val) + ',' + debug_regname(taicpu(hp1).oper[1]^.reg) + ' (MovShr2MovShr)', p);
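+                    { Re-use the MOV instruction in place: advance the offset,
+                      turn it into the extending load, then drop the shift }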
+                    Inc(taicpu(p).oper[0]^.ref^.offset, NewOffset);
+                    taicpu(p).opcode := MovUnaligned;
+                    taicpu(p).opsize := NewSize;
+
+                    DebugMsg(SPeepholeOptimization + PreMessage + ' ' +
+                      debug_operstr(taicpu(p).oper[0]^) + ',' + debug_regname(taicpu(hp1).oper[1]^.reg) + ' (MovShr/Sar2Movx)', p);
+
+                    RemoveInstruction(hp1);
+                    Result := True;
+                    Exit;
                   end;
-                Result := True;
-                Exit;
               end;
-{$endif x86_64}

           { Backward optimisation. If we have:
               func. %reg1,%reg2
|