瀏覽代碼

* ppc64: moved division by constant optimization from nppcmat to cgcpu unit
* ppc64: cg now also replaces multiplications/divisions by negative powers of two by shifts (and a negate)
* ppc64: replacement of division by constants now properly checks the -O2 compiler switch

git-svn-id: trunk@1662 -

tom_at_work 20 年之前
父節點
當前提交
d088695868

+ 221 - 16
compiler/powerpc64/cgcpu.pas

@@ -172,6 +172,136 @@ uses
   symconst, symsym, fmodule,
   symconst, symsym, fmodule,
   rgobj, tgobj, cpupi, procinfo, paramgr;
   rgobj, tgobj, cpupi, procinfo, paramgr;
 
 
+{ helper function which calculates the "magic" values needed to replace an
+ unsigned division by the constant d by a multiplication. See the PowerPC
+ compiler developer manual for more information.
+
+ N           : width in bits of the operation the values are computed for
+               (the caller in this unit passes sizeof(aWord)*8)
+ d           : the constant divisor, asserted to be greater than zero
+ magic_m     : receives the multiplier (q2 + 1 after the loop terminated)
+ magic_add   : receives true if q2 reached 2^(N-1) (or 2^(N-1)-1 when one is
+               added) right before being doubled; the caller in this unit
+               then emits an extra subtract/shift/add correction sequence
+ magic_shift : receives p - N, the amount the caller shifts the high part
+               of the product to the right by }
+procedure getmagic_unsignedN(const N : byte; const d : aWord; 
+  out magic_m : aWord; out magic_add : boolean; out magic_shift : byte);
+var
+    p : aInt;
+    nc, delta, q1, r1, q2, r2, two_N_minus_1 : aWord;
+begin
+  assert(d > 0);
+
+  two_N_minus_1 := aWord(1) shl (N-1); { note: this is 2^(N-1), not (2^N)-1 }
+    
+  magic_add := false;
+  nc := - 1 - (-d) mod d;
+  p := N-1; { initialize p }
+  q1 := two_N_minus_1 div nc; { initialize q1 = 2^p/nc }
+  r1 := two_N_minus_1 - q1*nc; { initialize r1 = rem(2^p,nc) }
+  q2 := (two_N_minus_1-1) div d; { initialize q2 = (2^p-1)/d }
+  r2 := (two_N_minus_1-1) - q2*d; { initialize r2 = rem((2^p-1),d) }
+  repeat
+    inc(p);
+    if (r1 >= (nc - r1)) then begin
+      q1 := 2 * q1 + 1; { update q1 }
+      r1 := 2*r1 - nc; { update r1 }
+    end else begin
+      q1 := 2*q1; { update q1 }
+      r1 := 2*r1; { update r1 }
+    end;
+    if ((r2 + 1) >= (d - r2)) then begin
+      if (q2 >= (two_N_minus_1-1)) then
+        magic_add := true;
+      q2 := 2*q2 + 1; { update q2 }
+      r2 := 2*r2 + 1 - d; { update r2 }
+    end else begin
+      if (q2 >= two_N_minus_1) then 
+        magic_add := true;
+      q2 := 2*q2; { update q2 }
+      r2 := 2*r2 + 1; { update r2 }
+    end;
+    delta := d - 1 - r2;
+  until not ((p < (2*N)) and ((q1 < delta) or ((q1 = delta) and (r1 = 0))));
+  magic_m := q2 + 1; { resulting magic number }
+  magic_shift := p - N; { resulting shift }
+end;
+
+{ helper function which calculates the "magic" values needed to replace a
+ signed division by the constant d by a multiplication. See the PowerPC
+ compiler developer manual for more information.
+
+ N       : width in bits of the operation the values are computed for (the
+           caller in this unit passes sizeof(aInt)*8)
+ d       : the constant divisor; asserted to be neither -1, 0 nor 1
+ magic_m : receives the multiplier, negated if d is negative
+ magic_s : receives p - N, the amount the caller shifts the high part of
+           the product arithmetically to the right by }
+procedure getmagic_signedN(const N : byte; const d : aInt; 
+  out magic_m : aInt; out magic_s : aInt);
+var
+  p : aInt;
+  ad, anc, delta, q1, r1, q2, r2, t : aWord;
+  two_N_minus_1 : aWord;
+    
+begin
+  assert((d < -1) or (d > 1));
+
+  two_N_minus_1 := aWord(1) shl (N-1); { note: this is 2^(N-1), not (2^N)-1 }
+
+  ad := abs(d);
+  t := two_N_minus_1 + (aWord(d) shr (N-1)); { 2^(N-1) plus bit N-1 of d }
+  anc := t - 1 - t mod ad; { absolute value of nc }
+  p := (N-1); { initialize p }
+  q1 := two_N_minus_1 div anc; { initialize q1 = 2^p/abs(nc) }
+  r1 := two_N_minus_1 - q1*anc; { initialize r1 = rem(2^p,abs(nc)) }
+  q2 := two_N_minus_1 div ad; { initialize q2 = 2^p/abs(d) }
+  r2 := two_N_minus_1 - q2*ad; { initialize r2 = rem(2^p,abs(d)) }
+  repeat 
+    inc(p);
+    q1 := 2*q1; { update q1 = 2^p/abs(nc) }
+    r1 := 2*r1; { update r1 = rem(2^p,abs(nc)) }
+    if (r1 >= anc) then begin { must be unsigned comparison }
+      inc(q1);
+      dec(r1, anc);
+    end;
+    q2 := 2*q2; { update q2 = 2^p/abs(d) }
+    r2 := 2*r2; { update r2 = rem(2^p,abs(d)) }
+    if (r2 >= ad) then begin { must be unsigned comparison }
+      inc(q2);
+      dec(r2, ad);
+    end;
+    delta := ad - r2;
+  until not ((q1 < delta) or ((q1 = delta) and (r1 = 0)));
+  magic_m := q2 + 1;
+  if (d < 0) then begin
+    magic_m := -magic_m; { resulting magic number }
+  end;
+  magic_s := p - N; { resulting shift }
+end;
+
+{ checks whether value is a positive or a negative power of two.
+
+ value  : the number to examine
+ power  : receives the exponent e with abs(value) = 2^e if the function
+          returns true, and zero otherwise
+ neg    : receives true if value is negative (independent of the result)
+
+ Returns true if abs(value) is a power of two, false otherwise. Zero is
+ not a power of two and therefore yields false. }
+function ispowerof2(value : aInt; out power : byte; out neg : boolean) : boolean;
+var
+  i : longint;
+  hl : aInt;
+begin
+  { give every output a defined value, so that no exit path can leave the
+   function result or the out parameters uninitialized }
+  result := false;
+  power := 0;
+  neg := false;
+
+  { zero would pass the "at most one bit set" test below, but never matches
+   any 2^i in the search loop, so reject it explicitly }
+  if (value = 0) then
+    exit;
+
+  { also try to find negative powers of two by negating if the
+   value is negative. low(aInt) is special because it can not be
+   negated. Simply return the appropriate values for it }
+  if (value < 0) then begin
+    neg := true;
+    if (value = low(aInt)) then begin
+      power := sizeof(aInt)*8-1;
+      result := true;
+      exit;
+    end;
+    value := -value;
+  end;
+
+  { more than one bit set: not a power of two }
+  if ((value and (value-1)) <> 0) then
+    exit;
+
+  { exactly one bit is set, find its position }
+  hl := 1;
+  for i := 0 to (sizeof(aInt)*8-1) do begin
+    if (hl = value) then begin
+      result := true;
+      power := i;
+      exit;
+    end;
+    hl := hl shl 1;
+  end;
+end;
+
+
 procedure tcgppc.init_register_allocators;
 procedure tcgppc.init_register_allocators;
 begin
 begin
   inherited init_register_allocators;
   inherited init_register_allocators;
@@ -438,7 +568,9 @@ procedure tcgppc.a_load_const_reg(list: taasmoutput; size: TCGSize; a: aint;
   end;
   end;
 
 
   { R0-safe version of the above (ADDIS doesn't work the same way with R0 as base), without
   { R0-safe version of the above (ADDIS doesn't work the same way with R0 as base), without
-    the return value }
+   the return value. Unused until further testing shows that it is not really necessary;
+   loading the upper 32 bits of a value is now done using R12, which does not require
+   special treatment }
   procedure load32bitconstantR0(list : taasmoutput; size : TCGSize; a : longint;
   procedure load32bitconstantR0(list : taasmoutput; size : TCGSize; a : longint;
     reg : TRegister);
     reg : TRegister);
   begin
   begin
@@ -707,9 +839,86 @@ var
     else
     else
       list.concat(taicpu.op_reg_reg_const(A_ANDI_, dst, src, word(a)));
       list.concat(taicpu.op_reg_reg_const(A_ANDI_, dst, src, word(a)));
   end;
   end;
+
+  { emits code which divides src by the constant a and stores the quotient
+   in dst, without using a divide instruction.
+
+   list   : assembler list the generated instructions are appended to
+   size   : size of the operation (currently unused, all operations are
+            done using OS_INT)
+   a      : the constant divisor; zero raises an internal error
+   src    : register containing the numerator
+   dst    : register receiving the quotient
+   signed : true for a signed division, false for an unsigned one. In the
+            unsigned case a is interpreted as unsigned value, i.e. a
+            "negative" a is a divisor with the most significant bit set }
+  procedure do_constant_div(list : taasmoutput; size : TCgSize; a : aint; src, dst : TRegister;
+    signed : boolean);
+  const
+    negops : array[boolean] of tasmop = (A_NEG, A_NEGO);
+  var
+    magic, shift : int64;
+    u_magic : qword;
+    u_shift : byte;
+    u_add : boolean;
+    power : byte;
+    isNegPower : boolean;
+
+    divreg : tregister;
+  begin
+    if (a = 0) then begin
+      internalerror(2005061701);
+    end else if (a = 1) then begin
+      cg.a_load_reg_reg(list, OS_INT, OS_INT, src, dst);
+    end else if (signed and (a = -1)) then begin
+      { only valid for the signed case (an unsigned "-1" is the largest
+       possible divisor and must not be turned into a negate); may overflow }
+      list.concat(taicpu.op_reg_reg(negops[cs_check_overflow in aktlocalswitches], dst, src));
+    end else if (ispowerof2(a, power, isNegPower) and
+      (signed or (not isNegPower) or (a = low(aint)))) then begin
+      { note: ispowerof2 has to be evaluated first so that power and
+       isNegPower are set. For unsigned divisions only real powers of two
+       may be replaced by a shift; low(aint) is 2^(bits-1) when seen as
+       unsigned value, all other "negative" powers are not }
+      if (signed) then begin
+        { From "The PowerPC Compiler Writer's Guide", pg. 52ff          }
+        cg.a_op_const_reg_reg(list, OP_SAR, OS_INT, power,
+          src, dst);
+        list.concat(taicpu.op_reg_reg(A_ADDZE, dst, dst));
+        { division by -(2^power): negate the quotient }
+        if (isNegPower) then
+          list.concat(taicpu.op_reg_reg(A_NEG, dst, dst));
+      end else begin
+        cg.a_op_const_reg_reg(list, OP_SHR, OS_INT, power, src, dst)
+      end;
+    end else begin
+      { replace division by multiplication, both implementations }
+      { from "The PowerPC Compiler Writer's Guide" pg. 53ff      }
+      divreg := cg.getintregister(list, OS_INT);
+      if (signed) then begin
+        getmagic_signedN(sizeof(aInt)*8, a, magic, shift);
+        { load magic value }
+        cg.a_load_const_reg(list, OS_INT, magic, divreg);
+        { multiply }
+        list.concat(taicpu.op_reg_reg_reg(A_MULHD, dst, src, divreg));
+        { add/subtract numerator }
+        if (a > 0) and (magic < 0) then begin
+          cg.a_op_reg_reg_reg(list, OP_ADD, OS_INT, src, dst, dst);
+        end else if (a < 0) and (magic > 0) then begin
+          cg.a_op_reg_reg_reg(list, OP_SUB, OS_INT, src, dst, dst);
+        end;
+        { shift shift places to the right (arithmetic) }
+        cg.a_op_const_reg_reg(list, OP_SAR, OS_INT, shift, dst, dst);
+        { extract and add sign bit }
+        if (a >= 0) then begin
+          cg.a_op_const_reg_reg(list, OP_SHR, OS_INT, 63, src, divreg);
+        end else begin
+          cg.a_op_const_reg_reg(list, OP_SHR, OS_INT, 63, dst, divreg);
+        end;
+        cg.a_op_reg_reg_reg(list, OP_ADD, OS_INT, dst, divreg, dst);
+      end else begin
+        { the divisor is an unsigned value here, reinterpret the bits }
+        getmagic_unsignedN(sizeof(aWord)*8, aWord(a), u_magic, u_add, u_shift);
+        { load magic in divreg }
+        cg.a_load_const_reg(list, OS_INT, aint(u_magic), divreg);
+        list.concat(taicpu.op_reg_reg_reg(A_MULHDU, dst, src, divreg));
+        if (u_add) then begin
+          { the magic value did not fit: q := (((src - q) shr 1) + q) shr (shift-1) }
+          cg.a_op_reg_reg_reg(list, OP_SUB, OS_INT, dst, src, divreg);
+          cg.a_op_const_reg_reg(list, OP_SHR, OS_INT,  1, divreg, divreg);
+          cg.a_op_reg_reg_reg(list, OP_ADD, OS_INT, divreg, dst, divreg);
+          cg.a_op_const_reg_reg(list, OP_SHR, OS_INT, u_shift-1, divreg, dst);
+        end else begin
+          cg.a_op_const_reg_reg(list, OP_SHR, OS_INT, u_shift, dst, dst);
+        end;
+      end;
+    end;
+  end;
+
 var
 var
   scratchreg: tregister;
   scratchreg: tregister;
-  shift, shiftmask : longint;
+  shift : byte;
+  shiftmask : longint;
+  isneg : boolean;
 
 
 begin
 begin
   { subtraction is the same as addition with negative constant }
   { subtraction is the same as addition with negative constant }
@@ -725,13 +934,8 @@ begin
   useReg := false;
   useReg := false;
   case (op) of
   case (op) of
     OP_DIV, OP_IDIV:
     OP_DIV, OP_IDIV:
-      { actually, this method should be never called directly with OP_DIV or
-       OP_IDIV, so just provide basic support.
-       TODO: move division by constant stuff from nppcmat.pas here }    
-      if (a = 0) then
-        internalerror(200208103)
-      else if (a = 1) then
-        a_load_reg_reg(list, size, size, src, dst)
+      if (cs_slowoptimize in aktglobalswitches) then
+        do_constant_div(list, size, a, src, dst, op = OP_IDIV)
       else
       else
         usereg := true; 
         usereg := true; 
     OP_IMUL, OP_MUL:
     OP_IMUL, OP_MUL:
@@ -743,9 +947,11 @@ begin
         list.concat(taicpu.op_reg_reg(A_NEG, dst, dst))
         list.concat(taicpu.op_reg_reg(A_NEG, dst, dst))
       else if (a = 1) then
       else if (a = 1) then
         a_load_reg_reg(list, OS_INT, OS_INT, src, dst)
         a_load_reg_reg(list, OS_INT, OS_INT, src, dst)
-      else if ispowerof2(a, shift) then
-        list.concat(taicpu.op_reg_reg_const(A_SLDI, dst, src, shift))
-      else if (a >= low(smallint)) and (a <= high(smallint)) then
+      else if ispowerof2(a, shift, isneg) then begin
+        list.concat(taicpu.op_reg_reg_const(A_SLDI, dst, src, shift));
+        if (isneg) then
+          exprasmlist.concat(taicpu.op_reg_reg(A_NEG, dst, dst));
+      end else if (a >= low(smallint)) and (a <= high(smallint)) then
         list.concat(taicpu.op_reg_reg_const(A_MULLI, dst, src,
         list.concat(taicpu.op_reg_reg_const(A_MULLI, dst, src,
           smallint(a)))
           smallint(a)))
       else
       else
@@ -808,7 +1014,6 @@ end;
 
 
 procedure tcgppc.a_op_reg_reg_reg(list: taasmoutput; op: TOpCg;
 procedure tcgppc.a_op_reg_reg_reg(list: taasmoutput; op: TOpCg;
   size: tcgsize; src1, src2, dst: tregister);
   size: tcgsize; src1, src2, dst: tregister);
-
 const
 const
   op_reg_reg_opcg2asmop32: array[TOpCG] of tasmop =
   op_reg_reg_opcg2asmop32: array[TOpCG] of tasmop =
   (A_NONE, A_ADD, A_AND, A_DIVWU, A_DIVW, A_MULLW, A_MULLW, A_NEG, A_NOT, A_OR,
   (A_NONE, A_ADD, A_AND, A_DIVWU, A_DIVW, A_MULLW, A_MULLW, A_NEG, A_NOT, A_OR,
@@ -816,7 +1021,6 @@ const
   op_reg_reg_opcg2asmop64: array[TOpCG] of tasmop =
   op_reg_reg_opcg2asmop64: array[TOpCG] of tasmop =
   (A_NONE, A_ADD, A_AND, A_DIVDU, A_DIVD, A_MULLD, A_MULLD, A_NEG, A_NOT, A_OR,
   (A_NONE, A_ADD, A_AND, A_DIVDU, A_DIVD, A_MULLD, A_MULLD, A_NEG, A_NOT, A_OR,
    A_SRAD, A_SLD, A_SRD, A_SUB, A_XOR);
    A_SRAD, A_SLD, A_SRD, A_SUB, A_XOR);
-
 begin
 begin
   case op of
   case op of
     OP_NEG, OP_NOT:
     OP_NEG, OP_NOT:
@@ -1559,7 +1763,7 @@ begin
    least four. If not, add the bytes which are "off" to the base register and
    least four. If not, add the bytes which are "off" to the base register and
    adjust the offset accordingly }
    adjust the offset accordingly }
   case op of
   case op of
-    A_LD, A_LDU, A_STD, A_STDU, A_LWA, A_LWAU :
+    A_LD, A_LDU, A_STD, A_STDU, A_LWA :
      if ((ref.offset mod 4) <> 0) then begin
      if ((ref.offset mod 4) <> 0) then begin
        tmpreg := rg[R_INTREGISTER].getregister(list, R_SUBWHOLE);
        tmpreg := rg[R_INTREGISTER].getregister(list, R_SUBWHOLE);
 
 
@@ -1621,7 +1825,8 @@ begin
       tmpref.base := ref.base;
       tmpref.base := ref.base;
       tmpref.index := tmpreg2;
       tmpref.index := tmpreg2;
       case op of
       case op of
-        { the code generator doesn't generate update instructions anyway }
+        { the code generator doesn't generate update instructions anyway, so 
+        error out on those instructions }
         A_LBZ : op := A_LBZX;
         A_LBZ : op := A_LBZX;
         A_LHZ : op := A_LHZX;
         A_LHZ : op := A_LHZX;
         A_LWZ : op := A_LWZX;
         A_LWZ : op := A_LWZX;

+ 1 - 1
compiler/powerpc64/cpubase.pas

@@ -96,7 +96,7 @@ type
     A_RLDICL,
     A_RLDICL,
     A_DIVDU, A_DIVDU_, A_DIVD, A_DIVD_, A_MULLD, A_MULLD_, A_MULHD, A_MULHD_, A_SRAD, A_SLD, A_SRD,
     A_DIVDU, A_DIVDU_, A_DIVD, A_DIVD_, A_MULLD, A_MULLD_, A_MULHD, A_MULHD_, A_SRAD, A_SLD, A_SRD,
     A_DIVDUO_, A_DIVDO_,
     A_DIVDUO_, A_DIVDO_,
-    A_LWA, A_LWAU, A_LWAX, A_LWAUX,
+    A_LWA, A_LWAX, A_LWAUX,
     A_FCFID,
     A_FCFID,
     A_LDARX, A_STDCX_, A_CNTLZD,
     A_LDARX, A_STDCX_, A_CNTLZD,
     A_LVX, A_STVX,
     A_LVX, A_STVX,

+ 1 - 1
compiler/powerpc64/itcpugas.pas

@@ -86,7 +86,7 @@ const
     'rldicl',
     'rldicl',
     'divdu', 'divdu.', 'divd', 'divd.', 'mulld', 'mulld.', 'mulhd', 'mulhd.', 'srad', 'sld', 'srd',
     'divdu', 'divdu.', 'divd', 'divd.', 'mulld', 'mulld.', 'mulhd', 'mulhd.', 'srad', 'sld', 'srd',
     'divduo.', 'divdo.',
     'divduo.', 'divdo.',
-    'lwa', '<illegal lwau>', 'lwax', 'lwaux',
+    'lwa', 'lwax', 'lwaux',
     'fcfid',
     'fcfid',
     'ldarx', 'stdcx.', 'cntlzd',
     'ldarx', 'stdcx.', 'cntlzd',
     'lvx', 'stvx',
     'lvx', 'stvx',

+ 2 - 1
compiler/powerpc64/nppcinl.pas

@@ -36,7 +36,8 @@ type
     }
     }
     function first_abs_real: tnode; override;
     function first_abs_real: tnode; override;
     function first_sqr_real: tnode; override;
     function first_sqr_real: tnode; override;
-    { todo: inline trunc/round/frac?/int }
+
+    { trunc/round/frac?/int can't be inlined? }
 
 
     procedure second_abs_real; override;
     procedure second_abs_real; override;
     procedure second_sqr_real; override;
     procedure second_sqr_real; override;

+ 23 - 354
compiler/powerpc64/nppcmat.pas

@@ -59,176 +59,6 @@ uses
   cpubase, cpuinfo,
   cpubase, cpuinfo,
   ncgutil, cgcpu, rgobj;
   ncgutil, cgcpu, rgobj;
 
 
-{ helper functions }
-procedure getmagic_unsigned32(d : dword; out magic_m : dword; out magic_add : boolean; out magic_shift : dword);
-var
-    p : longint;
-    nc, delta, q1, r1, q2, r2 : dword;
-    
-begin
-    assert(d > 0);
-    
-    magic_add := false;
-    nc := - 1 - (-d) mod d;
-    p := 31; { initialize p }
-    q1 := $80000000 div nc; { initialize q1 = 2p/nc }
-    r1 := $80000000 - q1*nc; { initialize r1 = rem(2p,nc) }
-    q2 := $7FFFFFFF div d; { initialize q2 = (2p-1)/d }
-    r2 := $7FFFFFFF - q2*d; { initialize r2 = rem((2p-1),d) }
-    repeat
-        inc(p);
-        if (r1 >= (nc - r1)) then begin
-            q1 := 2 * q1 + 1; { update q1 }
-            r1 := 2*r1 - nc; { update r1 }
-        end else begin
-            q1 := 2*q1; { update q1 }
-            r1 := 2*r1; { update r1 }
-        end;
-        if ((r2 + 1) >= (d - r2)) then begin
-            if (q2 >= $7FFFFFFF) then
-                magic_add := true;
-            q2 := 2*q2 + 1; { update q2 }
-            r2 := 2*r2 + 1 - d; { update r2 }
-        end else begin
-            if (q2 >= $80000000) then 
-                magic_add := true;
-            q2 := 2*q2; { update q2 }
-            r2 := 2*r2 + 1; { update r2 }
-        end;
-        delta := d - 1 - r2;
-    until not ((p < 64) and ((q1 < delta) or ((q1 = delta) and (r1 = 0))));
-    magic_m := q2 + 1; { resulting magic number }
-    magic_shift := p - 32; { resulting shift }
-end;
-
-procedure getmagic_signed32(d : longint; out magic_m : longint; out magic_s : longint);
-const
-    two_31 : DWord = high(longint)+1;
-var
-    p : Longint;
-    ad, anc, delta, q1, r1, q2, r2, t : DWord;
-    
-begin
-    assert((d < -1) or (d > 1));
-
-    ad := abs(d);
-    t := two_31 + (DWord(d) shr 31);
-    anc := t - 1 - t mod ad; { absolute value of nc }
-    p := 31; { initialize p }
-    q1 := two_31 div anc; { initialize q1 = 2p/abs(nc) }
-    r1 := two_31 - q1*anc; { initialize r1 = rem(2p,abs(nc)) }
-    q2 := two_31 div ad; { initialize q2 = 2p/abs(d) }
-    r2 := two_31 - q2*ad; { initialize r2 = rem(2p,abs(d)) }
-    repeat 
-        inc(p);
-        q1 := 2*q1; { update q1 = 2p/abs(nc) }
-        r1 := 2*r1; { update r1 = rem(2p/abs(nc)) }
-        if (r1 >= anc) then begin { must be unsigned comparison }
-            inc(q1);
-            dec(r1, anc);
-        end;
-        q2 := 2*q2; { update q2 = 2p/abs(d) }
-        r2 := 2*r2; { update r2 = rem(2p/abs(d)) }
-        if (r2 >= ad) then begin { must be unsigned comparison }
-            inc(q2);
-            dec(r2, ad);
-        end;
-        delta := ad - r2;
-    until not ((q1 < delta) or ((q1 = delta) and (r1 = 0)));
-    magic_m := q2 + 1;
-    if (d < 0) then begin
-        magic_m := -magic_m; { resulting magic number }
-    end;
-    magic_s := p - 32; { resulting shift }
-end;
-
-{ helper functions }
-procedure getmagic_unsigned64(d : qword; out magic_m : qword; out magic_add : boolean; out magic_shift : qword);
-const
-  two_63 : QWord = $8000000000000000;  
-var
-    p : int64;
-    nc, delta, q1, r1, q2, r2 : qword;
-    
-begin
-  assert(d > 0);
-    
-  magic_add := false;
-  nc := - 1 - (-d) mod d;
-  p := 63; { initialize p }
-  q1 := two_63 div nc; { initialize q1 = 2p/nc }
-  r1 := two_63 - q1*nc; { initialize r1 = rem(2p,nc) }
-  q2 := (two_63-1) div d; { initialize q2 = (2p-1)/d }
-  r2 := (two_63-1) - q2*d; { initialize r2 = rem((2p-1),d) }
-  repeat
-    inc(p);
-    if (r1 >= (nc - r1)) then begin
-      q1 := 2 * q1 + 1; { update q1 }
-      r1 := 2*r1 - nc; { update r1 }
-    end else begin
-      q1 := 2*q1; { update q1 }
-      r1 := 2*r1; { update r1 }
-    end;
-    if ((r2 + 1) >= (d - r2)) then begin
-      if (q2 >= (two_63-1)) then
-        magic_add := true;
-      q2 := 2*q2 + 1; { update q2 }
-      r2 := 2*r2 + 1 - d; { update r2 }
-    end else begin
-      if (q2 >= two_63) then 
-        magic_add := true;
-      q2 := 2*q2; { update q2 }
-      r2 := 2*r2 + 1; { update r2 }
-    end;
-    delta := d - 1 - r2;
-  until not ((p < 128) and ((q1 < delta) or ((q1 = delta) and (r1 = 0))));
-  magic_m := q2 + 1; { resulting magic number }
-  magic_shift := p - 64; { resulting shift }
-end;
-
-procedure getmagic_signed64(d : int64; out magic_m : int64; out magic_s : int64);
-const
-  two_63 : QWord = $8000000000000000;  
-var
-  p : int64;
-  ad, anc, delta, q1, r1, q2, r2, t : QWord;
-    
-begin
-  assert((d < -1) or (d > 1));
-
-  ad := abs(d);
-  t := two_63 + (QWord(d) shr 63);
-  anc := t - 1 - t mod ad; { absolute value of nc }
-  p := 63; { initialize p }
-  q1 := two_63 div anc; { initialize q1 = 2p/abs(nc) }
-  r1 := two_63 - q1*anc; { initialize r1 = rem(2p,abs(nc)) }
-  q2 := two_63 div ad; { initialize q2 = 2p/abs(d) }
-  r2 := two_63 - q2*ad; { initialize r2 = rem(2p,abs(d)) }
-  repeat 
-    inc(p);
-    q1 := 2*q1; { update q1 = 2p/abs(nc) }
-    r1 := 2*r1; { update r1 = rem(2p/abs(nc)) }
-    if (r1 >= anc) then begin { must be unsigned comparison }
-      inc(q1);
-      dec(r1, anc);
-    end;
-    q2 := 2*q2; { update q2 = 2p/abs(d) }
-    r2 := 2*r2; { update r2 = rem(2p/abs(d)) }
-    if (r2 >= ad) then begin { must be unsigned comparison }
-      inc(q2);
-      dec(r2, ad);
-    end;
-    delta := ad - r2;
-  until not ((q1 < delta) or ((q1 = delta) and (r1 = 0)));
-  magic_m := q2 + 1;
-  if (d < 0) then begin
-    magic_m := -magic_m; { resulting magic number }
-  end;
-  magic_s := p - 64; { resulting shift }
-end;
-
-
-
 {*****************************************************************************
 {*****************************************************************************
                              TPPCMODDIVNODE
                              TPPCMODDIVNODE
 *****************************************************************************}
 *****************************************************************************}
@@ -243,8 +73,13 @@ end;
 procedure tppcmoddivnode.pass_2;
 procedure tppcmoddivnode.pass_2;
 const         { signed   overflow }
 const         { signed   overflow }
   divops: array[boolean, boolean] of tasmop =
   divops: array[boolean, boolean] of tasmop =
-    ((A_DIVDU,A_DIVDU_),(A_DIVD,A_DIVDO_));
+    ((A_DIVDU, A_DIVDU_),(A_DIVD, A_DIVDO_));
+  divcgops : array[boolean] of TOpCG = (OP_DIV, OP_IDIV);
   zerocond: tasmcond = (dirhint: DH_Plus; simple: true; cond:C_NE; cr: RS_CR7);
   zerocond: tasmcond = (dirhint: DH_Plus; simple: true; cond:C_NE; cr: RS_CR7);
+  tcgsize2native : array[OS_8..OS_S128] of tcgsize = (
+    OS_64, OS_64, OS_64, OS_64, OS_NO, 
+    OS_S64, OS_S64, OS_S64, OS_S64, OS_NO
+    );
 var
 var
   power  : longint;
   power  : longint;
   op  : tasmop;
   op  : tasmop;
@@ -254,78 +89,10 @@ var
   hl : tasmlabel;
   hl : tasmlabel;
   done: boolean;
   done: boolean;
          
          
-  procedure genOrdConstNodeDiv;
-  const
-    negops : array[boolean] of tasmop = (A_NEG, A_NEGO);
-  var
-    magic, shift : int64;
-    u_magic, u_shift : qword;
-    u_add : boolean;
-             
-    divreg : tregister;
-  begin
-    if (tordconstnode(right).value = 0) then begin
-      internalerror(2005061701);
-    end else if (tordconstnode(right).value = 1) then begin
-      cg.a_load_reg_reg(exprasmlist, OS_INT, OS_INT, numerator, resultreg);
-    end else if (tordconstnode(right).value = -1) then begin
-      { note: only in the signed case possible..., may overflow }
-      exprasmlist.concat(taicpu.op_reg_reg(negops[cs_check_overflow in aktlocalswitches], resultreg, numerator));
-    end else if (ispowerof2(tordconstnode(right).value, power)) then begin
-      if (is_signed(right.resulttype.def)) then begin
-        { From "The PowerPC Compiler Writer's Guide", pg. 52ff          }
-        cg.a_op_const_reg_reg(exprasmlist, OP_SAR, OS_INT, power,
-        numerator, resultreg);
-        exprasmlist.concat(taicpu.op_reg_reg(A_ADDZE, resultreg, resultreg));
-      end else begin
-        cg.a_op_const_reg_reg(exprasmlist, OP_SHR, OS_INT, power, numerator, resultreg)
-      end;
-    end else begin
-      { replace division by multiplication, both implementations }
-      { from "The PowerPC Compiler Writer's Guide" pg. 53ff      }
-      divreg := cg.getintregister(exprasmlist, OS_INT);
-      if (is_signed(right.resulttype.def)) then begin
-        getmagic_signed64(tordconstnode(right).value, magic, shift);
-        { load magic value }
-        cg.a_load_const_reg(exprasmlist, OS_INT, magic, divreg);
-        { multiply }
-        exprasmlist.concat(taicpu.op_reg_reg_reg(A_MULHD, resultreg, numerator, divreg));
-        { add/subtract numerator }
-        if (tordconstnode(right).value > 0) and (magic < 0) then begin
-          cg.a_op_reg_reg_reg(exprasmlist, OP_ADD, OS_INT, numerator, resultreg, resultreg);
-        end else if (tordconstnode(right).value < 0) and (magic > 0) then begin
-          cg.a_op_reg_reg_reg(exprasmlist, OP_SUB, OS_INT, numerator, resultreg, resultreg);
-        end;
-        { shift shift places to the right (arithmetic) }
-        cg.a_op_const_reg_reg(exprasmlist, OP_SAR, OS_INT, shift, resultreg, resultreg);                     
-        { extract and add sign bit }
-        if (tordconstnode(right).value >= 0) then begin
-          cg.a_op_const_reg_reg(exprasmlist, OP_SHR, OS_INT, 63, numerator, divreg);
-        end else begin
-          cg.a_op_const_reg_reg(exprasmlist, OP_SHR, OS_INT, 63, resultreg, divreg);
-        end;                     
-        cg.a_op_reg_reg_reg(exprasmlist, OP_ADD, OS_INT, resultreg, divreg, resultreg);
-      end else begin
-        getmagic_unsigned64(tordconstnode(right).value, u_magic, u_add, u_shift);
-        { load magic in divreg }
-        cg.a_load_const_reg(exprasmlist, OS_INT, u_magic, divreg);
-        exprasmlist.concat(taicpu.op_reg_reg_reg(A_MULHDU, resultreg, numerator, divreg));
-        if (u_add) then begin
-          cg.a_op_reg_reg_reg(exprasmlist, OP_SUB, OS_INT, resultreg, numerator, divreg);
-          cg.a_op_const_reg_reg(exprasmlist, OP_SHR, OS_INT,  1, divreg, divreg);
-          cg.a_op_reg_reg_reg(exprasmlist, OP_ADD, OS_INT, divreg, resultreg, divreg);
-          cg.a_op_const_reg_reg(exprasmlist, OP_SHR, OS_INT, u_shift-1, divreg, resultreg);
-        end else begin
-          cg.a_op_const_reg_reg(exprasmlist, OP_SHR, OS_INT, u_shift, resultreg, resultreg);
-        end;
-      end;
-    end;
-    done := true;
-  end;
-
   procedure genOrdConstNodeMod;
   procedure genOrdConstNodeMod;
   var
   var
     modreg, maskreg, tempreg : tregister;
     modreg, maskreg, tempreg : tregister;
+    isNegPower : boolean;
   begin
   begin
     if (tordconstnode(right).value = 0) then begin
     if (tordconstnode(right).value = 0) then begin
       internalerror(2005061702);
       internalerror(2005061702);
@@ -348,11 +115,14 @@ var
         cg.a_op_reg_reg_reg(exprasmlist, OP_AND, OS_INT, modreg, maskreg, maskreg);
         cg.a_op_reg_reg_reg(exprasmlist, OP_AND, OS_INT, modreg, maskreg, maskreg);
         cg.a_op_reg_reg_reg(exprasmlist, OP_OR, OS_INT, maskreg, tempreg, resultreg);
         cg.a_op_reg_reg_reg(exprasmlist, OP_OR, OS_INT, maskreg, tempreg, resultreg);
       end else begin
       end else begin
-        cg.a_op_const_reg_reg(exprasmlist, OP_AND, OS_INT, tordconstnode(right).value-1, numerator, resultreg);
+        cg.a_op_const_reg_reg(exprasmlist, OP_AND, OS_INT, tordconstnode(right).value-1, numerator, 
+          resultreg);
       end;
       end;
     end else begin
     end else begin
-      genOrdConstNodeDiv();
-      cg.a_op_const_reg_reg(exprasmlist, OP_MUL, OS_INT, tordconstnode(right).value, resultreg, resultreg);
+      cg.a_op_const_reg_reg(exprasmlist, divCgOps[is_signed(right.resulttype.def)], OS_INT, 
+        tordconstnode(right).value, numerator, resultreg);
+      cg.a_op_const_reg_reg(exprasmlist, OP_MUL, OS_INT, tordconstnode(right).value, resultreg, 
+        resultreg);
       cg.a_op_reg_reg_reg(exprasmlist, OP_SUB, OS_INT, resultreg, numerator, resultreg);
       cg.a_op_reg_reg_reg(exprasmlist, OP_SUB, OS_INT, resultreg, numerator, resultreg);
     end;
     end;
   end;
   end;
@@ -380,20 +150,19 @@ begin
     resultreg := cg.getintregister(exprasmlist,size);
     resultreg := cg.getintregister(exprasmlist,size);
   end;
   end;
   done := false;
   done := false;
-(*
-  if (right.nodetype = ordconstn) then begin
+
+  if (cs_slowoptimize in aktglobalswitches) and (right.nodetype = ordconstn) then begin
     if (nodetype = divn) then
     if (nodetype = divn) then
-      genOrdConstNodeDiv
-    else
+      cg.a_op_const_reg_reg(exprasmlist, divCgOps[is_signed(right.resulttype.def)], 
+        size, tordconstnode(right).value, numerator, resultreg)
+    else 
       genOrdConstNodeMod;
       genOrdConstNodeMod;
     done := true;
     done := true;
   end;
   end;
-*)
 
 
   if (not done) then begin
   if (not done) then begin
     { load divider in a register if necessary }
     { load divider in a register if necessary }
-    location_force_reg(exprasmlist,right.location,
-      def_cgsize(right.resulttype.def),true);
+    location_force_reg(exprasmlist,right.location,def_cgsize(right.resulttype.def),true);
     if (right.nodetype <> ordconstn) then
     if (right.nodetype <> ordconstn) then
       exprasmlist.concat(taicpu.op_reg_reg_const(A_CMPDI, NR_CR7,
       exprasmlist.concat(taicpu.op_reg_reg_const(A_CMPDI, NR_CR7,
         right.location.register, 0))
         right.location.register, 0))
@@ -403,13 +172,14 @@ begin
     end;
     end;
     divider := right.location.register;
     divider := right.location.register;
 
 
-    { needs overflow checking, (-maxlongint-1) div (-1) overflows! }
-    op := divops[is_signed(right.resulttype.def),
-      cs_check_overflow in aktlocalswitches];
+    { select the correct opcode according to the sign of the result, whether we need
+     overflow checking }
+    op := divops[is_signed(right.resulttype.def), cs_check_overflow in aktlocalswitches];
     exprasmlist.concat(taicpu.op_reg_reg_reg(op, resultreg, numerator,
     exprasmlist.concat(taicpu.op_reg_reg_reg(op, resultreg, numerator,
       divider));
       divider));
 
 
     if (nodetype = modn) then begin
     if (nodetype = modn) then begin
+      { multiply with the divisor again, taking care of the correct size }
       exprasmlist.concat(taicpu.op_reg_reg_reg(A_MULLD,resultreg,
       exprasmlist.concat(taicpu.op_reg_reg_reg(A_MULLD,resultreg,
           divider,resultreg));
           divider,resultreg));
       exprasmlist.concat(taicpu.op_reg_reg_reg(A_SUB,location.register,
       exprasmlist.concat(taicpu.op_reg_reg_reg(A_SUB,location.register,
@@ -433,111 +203,10 @@ begin
     cg.g_overflowcheck(exprasmlist,location,resulttype.def);
     cg.g_overflowcheck(exprasmlist,location,resulttype.def);
 end;
 end;
 
 
-(*
-procedure tppcmoddivnode.pass_2;
-const
-  // ts: todo, use 32 bit operations if possible (much faster!)
-  { signed   overflow }
-  divops: array[boolean, boolean] of tasmop =
-  ((A_DIVDU, A_DIVDUO_), (A_DIVD, A_DIVDO_));
-  zerocond: tasmcond = (dirhint: DH_Plus; simple: true; cond: C_NE; cr: RS_CR1);
-var
-  power: longint;
-  op: tasmop;
-  numerator,
-    divider,
-    resultreg: tregister;
-  size: Tcgsize;
-  hl: tasmlabel;
-
-begin
-  secondpass(left);
-  secondpass(right);
-  location_copy(location, left.location);
-
-  { put numerator in register }
-  size := def_cgsize(left.resulttype.def);
-  location_force_reg(exprasmlist, left.location,
-    size, true);
-  location_copy(location, left.location);
-  numerator := location.register;
-  resultreg := location.register;
-  if (location.loc = LOC_CREGISTER) then
-  begin
-    location.loc := LOC_REGISTER;
-    location.register := cg.getintregister(exprasmlist, size);
-    resultreg := location.register;
-  end;
-  if (nodetype = modn) then
-  begin
-    resultreg := cg.getintregister(exprasmlist, size);
-  end;
-
-  if (nodetype = divn) and
-    (right.nodetype = ordconstn) and
-    ispowerof2(tordconstnode(right).value, power) then
-  begin
-  	if (is_signed(right.resulttype.def)) then begin
-      { From "The PowerPC Compiler Writer's Guide":                   }
-      { This code uses the fact that, in the PowerPC architecture,    }
-      { the shift right algebraic instructions set the Carry bit if   }
-      { the source register contains a negative number and one or     }
-      { more 1-bits are shifted out. Otherwise, the carry bit is      }
-      { cleared. The addze instruction corrects the quotient, if      }
-      { necessary, when the dividend is negative. For example, if     }
-      { n = -13, (0xFFFF_FFF3), and k = 2, after executing the srawi  }
-      { instruction, q = -4 (0xFFFF_FFFC) and CA = 1. After executing }
-      { the addze instruction, q = -3, the correct quotient.          }
-      cg.a_op_const_reg_reg(exprasmlist, OP_SAR, OS_64, power,
-        numerator, resultreg);
-      exprasmlist.concat(taicpu.op_reg_reg(A_ADDZE, resultreg, resultreg));
-    end else begin
-      cg.a_op_const_reg_reg(exprasmlist, OP_SHR, OS_INT, power, numerator, resultreg);
-    end;
-  end else begin
-    { load divider in a register if necessary }
-    location_force_reg(exprasmlist, right.location,
-      def_cgsize(right.resulttype.def), true);
-    if (right.nodetype <> ordconstn) then
-{$NOTE ts: testme}
-      exprasmlist.concat(taicpu.op_reg_reg_const(A_CMPDI, NR_CR1,
-        right.location.register, 0));
-    divider := right.location.register;
-
-    { needs overflow checking, (-maxlongint-1) div (-1) overflows! }
-    { And on PPC, the only way to catch a div-by-0 is by checking  }
-    { the overflow flag (JM)                                       }
-    op := divops[is_signed(right.resulttype.def),
-      cs_check_overflow in aktlocalswitches];
-    exprasmlist.concat(taicpu.op_reg_reg_reg(op, resultreg, numerator,
-      divider));
-
-    if (nodetype = modn) then begin
-{$NOTE ts:testme}
-      exprasmlist.concat(taicpu.op_reg_reg_reg(A_MULLD, resultreg,
-        divider, resultreg));
-      exprasmlist.concat(taicpu.op_reg_reg_reg(A_SUB, location.register,
-        numerator, resultreg));
-      resultreg := location.register;
-    end;
-  end;
-  { set result location }
-  location.loc := LOC_REGISTER;
-  location.register := resultreg;
-  if (right.nodetype <> ordconstn) then begin
-    objectlibrary.getjumplabel(hl);
-    exprasmlist.concat(taicpu.op_cond_sym(A_BC, zerocond, hl));
-    cg.a_call_name(exprasmlist, 'FPC_DIVBYZERO');
-    cg.a_label(exprasmlist, hl);
-  end;
-  cg.g_overflowcheck(exprasmlist, location, resulttype.def);
-end;
-*)
 {*****************************************************************************
 {*****************************************************************************
                              TPPCSHLRSHRNODE
                              TPPCSHLRSHRNODE
 *****************************************************************************}
 *****************************************************************************}
 
 
-
 procedure tppcshlshrnode.pass_2;
 procedure tppcshlshrnode.pass_2;
 
 
 var
 var