3 年之前 · 3f24bd3886
--- a/compiler/x86/nx86mat.pas
+++ b/compiler/x86/nx86mat.pas
@@ -412,7 +412,7 @@ interface
 
				 
			
 
				     procedure tx86moddivnode.pass_generate_code;
			
 
				       var
			
 
				-        hreg1,hreg2,hreg3,rega,regd,tempreg:Tregister;
			
 
				+        hreg1,hreg2,hreg3,hreg4,rega,regd,tempreg:Tregister;
			
 
				         power:longint;
			
 
				         instr:TAiCpu;
			
 
				         op:Tasmop;
			
@@ -425,12 +425,161 @@ interface
 
				       label
			
 
				         DefaultDiv;
			
 
				 
			
 
				+{$ifndef i8086}
			
 
				+        procedure DoBMI2ReciprocalDivision;
			
 
				+          var
			
 
				+            exp_regd: Tregister;
			
 
				+            exp_opsize: topsize;
			
 
				+            DoMod: Boolean;
			
 
				+            SubSize: TSubRegister;
			
 
				+            divsize: Byte;
			
 
				+          begin
			
 
				+            DoMod := (nodetype = modn);
			
 
				+
			
 
				+            { Extend 32-bit divides to 64-bit registers and 16-bit
			
 
				+              divides to 32-bit registers.  Because the domain of
			
 
				+              the left input is only up to 2^(X/2 - 1) - 1, (i.e.
			
 
				+              2^31 - 1 for 64-bit and 2^15 - 1 for 32-bit), a much
			
 
				+              larger error in the reciprocal is permitted. }
			
 
				+            if (resultdef.size <= {$ifdef x86_64}4{$else x86_64}2{$endif x86_64}) then
			
 
				+              begin
			
 
				+{$ifdef x86_64}
			
 
				+                if resultdef.size = 4 then
			
 
				+                  divsize := 64
			
 
				+                else
			
 
				+{$endif x86_64}
			
 
				+                  divsize := 32;
			
 
				+
			
 
				+                calc_divconst_magic_unsigned(divsize, d, m, m_add, s);
			
 
				+
			
 
				+                { Should never have a zero shift and a magic add together }
			
 
				+                if (s = 0) and m_add then
			
 
				+                  InternalError(2021090203);
			
 
				+
			
 
				+                { Extend the input and out registers (the peephole optimizer should
			
 
				+                  help clean up unnecessary MOVZX instructions }
			
 
				+                hreg3 := hreg1;
			
 
				+                case resultdef.size of
			
 
				+{$ifdef x86_64}
			
 
				+                  4:
			
 
				+                    begin
			
 
				+                      SubSize := R_SUBQ;
			
 
				+                      setsubreg(hreg3, R_SUBQ);
			
 
				+                      { Make sure the upper 32 bits are zero; the peephole
			
 
				+                        optimizer will remove this instruction via MovAnd2Mov
			
 
				+                        if it's not needed }
			
 
				+                      emit_const_reg(A_AND, S_L, $FFFFFFFF, hreg1);
			
 
				+                      exp_regd := NR_RDX;
			
 
				+                      exp_opsize := S_Q;
			
 
				+
			
 
				+                      if m_add then
			
 
				+                        { Append 1 to the tail end of the result }
			
 
				+                        m := (m shr s) or ($8000000000000000 shr (s - 1))
			
 
				+                      else
			
 
				+                        m := m shr s;
			
 
				+                    end;
			
 
				+{$endif x86_64}
			
 
				+                  1, 2:
			
 
				+                    begin
			
 
				+                      { MULX doesn't have a 16-bit version }
			
 
				+                      SubSize := R_SUBD;
			
 
				+                      setsubreg(hreg3, R_SUBD);
			
 
				+                      if resultdef.size = 1 then
			
 
				+                        exp_opsize := S_BL
			
 
				+                      else
			
 
				+                        exp_opsize := S_WL;
			
 
				+                      emit_reg_reg(A_MOVZX, exp_opsize, hreg1, hreg3);
			
 
				+                      exp_regd := NR_EDX;
			
 
				+                      exp_opsize := S_L;
			
 
				+
			
 
				+                      if m_add then
			
 
				+                        { Append 1 to the tail end of the result }
			
 
				+                        m := (m shr s) or ($80000000 shr (s - 1))
			
 
				+                      else
			
 
				+                        m := m shr s;
			
 
				+                    end;
			
 
				+                  else
			
 
				+                    InternalError(2021090211);
			
 
				+                end;
			
 
				+
			
 
				+                Inc(m);
			
 
				+
			
 
				+                cg.getcpuregister(current_asmdata.CurrAsmList, exp_regd);
			
 
				+                emit_const_reg(A_MOV, exp_opsize, aint(m), exp_regd);
			
 
				+                hreg2 := cg.getintregister(current_asmdata.CurrAsmList, cgsize);
			
 
				+                hreg4 := hreg2;
			
 
				+                setsubreg(hreg4, SubSize);
			
 
				+
			
 
				+                emit_reg_reg_reg(A_MULX, exp_opsize, hreg3, hreg4, hreg4);
			
 
				+                cg.ungetcpuregister(current_asmdata.CurrAsmList, exp_regd);
			
 
				+              end
			
 
				+            else
			
 
				+              begin
			
 
				+                calc_divconst_magic_unsigned(resultdef.size * 8, d, m, m_add, s);
			
 
				+
			
 
				+                { Should never have a zero shift and a magic add together }
			
 
				+                if (s = 0) and m_add then
			
 
				+                  InternalError(2021090204);
			
 
				+
			
 
				+                cg.getcpuregister(current_asmdata.CurrAsmList, regd);
			
 
				+                emit_const_reg(A_MOV, opsize, aint(m), regd);
			
 
				+                hreg2 := cg.getintregister(current_asmdata.CurrAsmList, cgsize);
			
 
				+                emit_reg_reg_reg(A_MULX, opsize, hreg1, hreg2, hreg2);
			
 
				+                cg.ungetcpuregister(current_asmdata.CurrAsmList, regd);
			
 
				+
			
 
				+                if m_add then
			
 
				+                  begin
			
 
				+                    { addition can overflow, shift first bit considering carry,
			
 
				+                      then shift remaining bits in regular way. }
			
 
				+                    cg.a_reg_alloc(current_asmdata.CurrAsmList, NR_DEFAULTFLAGS);
			
 
				+                    emit_reg_reg(A_ADD, opsize, hreg1, hreg2);
			
 
				+                    emit_const_reg(A_RCR, opsize, 1, hreg2);
			
 
				+                    cg.a_reg_dealloc(current_asmdata.CurrAsmList, NR_DEFAULTFLAGS);
			
 
				+                    dec(s);
			
 
				+                  end;
			
 
				+                if s<>0 then
			
 
				+                  emit_const_reg(A_SHR, opsize, aint(s), hreg2);
			
 
				+              end;
			
 
				+
			
 
				+            if DoMod then
			
 
				+              begin
			
 
				+                { Now multiply the quotient by the original denominator and
			
 
				+                  subtract the product from the original numerator to get
			
 
				+                  the remainder. }
			
 
				+{$ifdef x86_64}
			
 
				+                if (cgsize in [OS_64,OS_S64]) and (d > $7FFFFFFF) then { Cannot use 64-bit constants in IMUL }
			
 
				+                  begin
			
 
				+                    hreg4 := cg.getintregister(current_asmdata.CurrAsmList,cgsize);
			
 
				+                    emit_const_reg(A_MOV, opsize, aint(d), hreg4);
			
 
				+                    emit_reg_reg(A_IMUL, opsize, hreg4, hreg2);
			
 
				+                  end
			
 
				+                else
			
 
				+{$endif x86_64}
			
 
				+                  emit_const_reg(A_IMUL, opsize, aint(d), hreg2);
			
 
				+
			
 
				+                emit_reg_reg(A_SUB, opsize, hreg2, hreg1);
			
 
				+                location.register := hreg1;
			
 
				+              end
			
 
				+            else
			
 
				+              location.register := hreg2;
			
 
				+          end;
			
 
				+{$endif not i8086}
			
 
				+
			
 
				         procedure DoUnsignedReciprocalDivision;
			
 
				           var
			
 
				             exp_rega,exp_regd:Tregister;
			
 
				             exp_opsize:topsize;
			
 
				             DoMod: Boolean;
			
 
				           begin
			
 
				+{$ifndef i8086}
			
 
				+            if (current_settings.cputype = cpu_core_avx2) then
			
 
				+              begin
			
 
				+                { If BMI2 is available, use more efficient instructions }
			
 
				+                DoBMI2ReciprocalDivision;
			
 
				+                Exit;
			
 
				+              end;
			
 
				+{$endif not i8086}
			
 
				+
			
 
				             DoMod := (nodetype = modn);
			
 
				             { Extend 32-bit divides to 64-bit registers and 16-bit
			
 
				               divides to 32-bit registers.  Because the domain of
			
@@ -560,7 +709,6 @@ interface
 
				                     emit_reg_reg(A_IMUL,opsize,hreg3,regd);
			
 
				                   end
			
 
				                 else
			
 
				-{$endif x86_64}
			
 
				 {$endif x86_64}
			
 
				                   emit_const_reg(A_IMUL,opsize,aint(d),regd);