@@ -157,16 +157,9 @@ type
   end;

 const
-  TOpCG2AsmOpConstLo: array[topcg] of TAsmOp = (A_NONE, A_ADDI, A_ANDI_,
-    A_DIVWU,
-    A_DIVW, A_MULLW, A_MULLW, A_NONE, A_NONE, A_ORI,
-    A_SRAWI, A_SLWI, A_SRWI, A_SUBI, A_XORI);
-  TOpCG2AsmOpConstHi: array[topcg] of TAsmOp = (A_NONE, A_ADDIS, A_ANDIS_,
-    A_DIVWU, A_DIVW, A_MULLW, A_MULLW, A_NONE, A_NONE,
-    A_ORIS, A_NONE, A_NONE, A_NONE, A_SUBIS, A_XORIS);
-
-  TShiftOpCG2AsmOpConst32 : array[OP_SAR..OP_SHR] of TAsmOp = (A_SRAWI, A_SLWI, A_SRWI);
-  TShiftOpCG2AsmOpConst64 : array[OP_SAR..OP_SHR] of TAsmOp = (A_SRADI, A_SLDI, A_SRDI);
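+  { indexed by "64 bit op?" and shift kind: the false row holds the 32 bit
+    immediate shifts, the true row the 64 bit ones }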
+  TShiftOpCG2AsmOpConst : array[boolean, OP_SAR..OP_SHR] of TAsmOp = (
+    (A_SRAWI, A_SLWI, A_SRWI), (A_SRADI, A_SLDI, A_SRDI)
+  );

   TOpCmp2AsmCond: array[topcmp] of TAsmCondFlag = (C_NONE, C_EQ, C_GT,
     C_LT, C_GE, C_LE, C_NE, C_LE, C_LT, C_GE, C_GT);
@@ -248,10 +241,13 @@ begin
             location^.register)
           else
             { load non-integral sized memory location into register. This
-              memory location be 1-sizeleft byte sized.
-              Always assume that this memory area is properly aligned, eg. start
-              loading the larger quantities for "odd" quantities first }
+              memory location may be 1 to sizeleft bytes in size.
+              Always assume that this memory area is properly aligned, e.g.
+              start by loading the larger chunks for "odd" sizes first }
             case sizeleft of
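+              { power-of-two sizes fit in a single load instruction }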
+              1,2,4,8 :
+                a_load_ref_reg(list, int_cgsize(sizeleft), location^.size, tmpref,
+                  location^.register);
               3 : begin
                 a_reg_alloc(list, NR_R12);
                 a_load_ref_reg(list, OS_16, location^.size, tmpref,
@@ -259,7 +255,7 @@ begin
                 inc(tmpref.offset, tcgsize2size[OS_16]);
                 a_load_ref_reg(list, OS_8, location^.size, tmpref,
                   location^.register);
-                  list.concat(taicpu.op_reg_reg_const_const(A_RLDIMI, location^.register, NR_R12, 8, 40));
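+                { rldimi dst,r12,8,40: rotate the halfword left by 8 and insert
+                  it into bits 40..55, just above the low byte already in dst }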
+                list.concat(taicpu.op_reg_reg_const_const(A_RLDIMI, location^.register, NR_R12, 8, 40));
                 a_reg_dealloc(list, NR_R12);
               end;
               5 : begin
@@ -267,8 +263,8 @@ begin
                 a_load_ref_reg(list, OS_32, location^.size, tmpref, NR_R12);
                 inc(tmpref.offset, tcgsize2size[OS_32]);
                 a_load_ref_reg(list, OS_8, location^.size, tmpref, location^.register);
-                  list.concat(taicpu.op_reg_reg_const_const(A_RLDIMI, location^.register, NR_R12, 8, 24));
-                  a_reg_dealloc(list, NR_R12);
+                list.concat(taicpu.op_reg_reg_const_const(A_RLDIMI, location^.register, NR_R12, 8, 24));
+                a_reg_dealloc(list, NR_R12);
               end;
               6 : begin
                 a_reg_alloc(list, NR_R12);
@@ -286,20 +282,16 @@ begin
                 a_load_ref_reg(list, OS_16, location^.size, tmpref, NR_R0);
                 inc(tmpref.offset, tcgsize2size[OS_16]);
                 a_load_ref_reg(list, OS_8, location^.size, tmpref, location^.register);
-                  list.concat(taicpu.op_reg_reg_const_const(A_RLDIMI, NR_R0, NR_R12, 16, 16));
-                  list.concat(taicpu.op_reg_reg_const_const(A_RLDIMI, location^.register, NR_R0, 8, 8));
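+                { first merge the 32 bit part above the halfword in r0, then
+                  insert those 48 bits above the low byte in dst }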
+                list.concat(taicpu.op_reg_reg_const_const(A_RLDIMI, NR_R0, NR_R12, 16, 16));
+                list.concat(taicpu.op_reg_reg_const_const(A_RLDIMI, location^.register, NR_R0, 8, 8));
                 a_reg_dealloc(list, NR_R0);
                 a_reg_dealloc(list, NR_R12);
               end;
-              1,2,4,8 :
-                a_load_ref_reg(list, int_cgsize(sizeleft), location^.size, tmpref,
-                  location^.register);
-              else
+              else
+                { still more than 8 bytes to load, so fill a whole register now }
                 a_load_ref_reg(list, location^.size, location^.size, tmpref,
                   location^.register);
             end;
-//            a_load_ref_reg(list, location^.size, location^.size, tmpref,
-//              location^.register);
         end;
       LOC_REFERENCE:
         begin
@@ -368,12 +360,8 @@ begin
           AT_FUNCTION)));
       if (addNOP) then
         list.concat(taicpu.op_none(A_NOP));
-      {
-        the compiler does not properly set this flag anymore in pass 1, and
-        for now we only need it after pass 2 (I hope) (JM)
-        if not(pi_do_call in current_procinfo.flags) then
-          internalerror(2003060703);
-      }
+      { the compiler does not properly set this flag anymore in pass 1, and
+        for now we only need it after pass 2 (I hope) (JM) }
       include(current_procinfo.flags, pi_do_call);
     end;
@@ -503,9 +491,9 @@ begin
           32 bits should contain -1
-          loading the lower 32 bits resulted in 0 in the upper 32 bits, and the upper
           32 bits should contain 0 }
-        load32bitconstantR0(list, size, hi(a), NR_R0);
+        load32bitconstant(list, size, hi(a), NR_R12);
         { combine both registers }
-        list.concat(taicpu.op_reg_reg_const_const(A_RLDIMI, reg, NR_R0, 32, 0));
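+        { rldimi reg,r12,32,0: rotate the high half left by 32 and insert it
+          into bits 0..31, above the low 32 bits already in reg }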
+        list.concat(taicpu.op_reg_reg_const_const(A_RLDIMI, reg, NR_R12, 32, 0));
       end;
     end;
   end;
@@ -550,7 +538,7 @@ const
       ((A_LBZ, A_LBZU), (A_LBZX, A_LBZUX)),
       ((A_LHA, A_LHAU), (A_LHAX, A_LHAUX)),
       { there's no non-indexed load-word-arithmetic with update, simulate it in code :( }
-      ((A_LWA, A_LWAU), (A_LWAX, A_LWAUX)),
+      ((A_LWA, A_NOP), (A_LWAX, A_LWAUX)),
       ((A_LD, A_LDU), (A_LDX, A_LDUX))
     );
 var
@@ -563,12 +551,12 @@ begin
   ref2 := ref;
   fixref(list, ref2, tosize);
   { the caller is expected to have adjusted the reference already
-      in this case }
+    in this case }
   if (TCGSize2Size[fromsize] >= TCGSize2Size[tosize]) then
     fromsize := tosize;
   op := loadinstr[fromsize, ref2.index <> NR_NO, false];
   { there is no LWAU instruction, simulate using ADDI and LWA }
-  if (op = A_LWAU) then begin
+  if (op = A_NOP) then begin
     list.concat(taicpu.op_reg_reg_const(A_ADDI, reg, reg, ref2.offset));
     ref2.offset := 0;
     op := A_LWA;
@@ -605,8 +593,8 @@ var
 begin
   op := movemap[fromsize, tosize];
   case op of
-      A_MR, A_EXTSB, A_EXTSH, A_EXTSW : instr := taicpu.op_reg_reg(op, reg2, reg1);
-      A_RLDICL : instr := taicpu.op_reg_reg_const_const(A_RLDICL, reg2, reg1, 0, (8-tcgsize2size[fromsize])*8);
+    A_MR, A_EXTSB, A_EXTSH, A_EXTSW : instr := taicpu.op_reg_reg(op, reg2, reg1);
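+    { rldicl with shift count 0 keeps only the low tcgsize2size[fromsize]
+      bytes, i.e. zero extends the value to 64 bit }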
+    A_RLDICL : instr := taicpu.op_reg_reg_const_const(A_RLDICL, reg2, reg1, 0, (8-tcgsize2size[fromsize])*8);
     else
       internalerror(2002090901);
   end;
@@ -614,8 +602,8 @@ begin
   rg[R_INTREGISTER].add_move_instruction(instr);
 end;

-procedure tcgppc.a_loadfpu_reg_reg(list: taasmoutput; size: tcgsize; reg1, reg2:
-  tregister);
+procedure tcgppc.a_loadfpu_reg_reg(list: taasmoutput; size: tcgsize;
+  reg1, reg2: tregister);
 var
   instr: taicpu;
 begin
@@ -624,8 +612,8 @@ begin
   rg[R_FPUREGISTER].add_move_instruction(instr);
 end;

-procedure tcgppc.a_loadfpu_ref_reg(list: taasmoutput; size: tcgsize; const ref:
-  treference; reg: tregister);
+procedure tcgppc.a_loadfpu_ref_reg(list: taasmoutput; size: tcgsize;
+  const ref: treference; reg: tregister);
 const
   FpuLoadInstr: array[OS_F32..OS_F64, boolean, boolean] of TAsmOp =
   { indexed? updating? }
@@ -654,7 +642,6 @@ end;

 procedure tcgppc.a_loadfpu_reg_ref(list: taasmoutput; size: tcgsize; reg:
   tregister; const ref: treference);
-
 const
   FpuStoreInstr: array[OS_F32..OS_F64, boolean, boolean] of TAsmOp =
   { indexed? updating? }
@@ -688,139 +675,131 @@ end;
 procedure tcgppc.a_op_const_reg_reg(list: taasmoutput; op: TOpCg;
   size: tcgsize; a: aint; src, dst: tregister);
 var
-  l1, l2: longint;
-  oplo, ophi: tasmop;
-  scratchreg: tregister;
   useReg : boolean;
-  shiftmask : longint;

-  procedure do_lo_hi;
+  procedure do_lo_hi(loOp, hiOp : TAsmOp);
   begin
+    { Optimization for logical ops (excluding AND), trying to do this as
+      efficiently as possible by only generating code for the affected
+      halfwords. Note that all the instructions handled here must have
+      "X op 0 = X" for every halfword. }
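+    { e.g. OR with $00120034 becomes "ori dst,src,$0034; oris dst,dst,$0012",
+      while OR with $00120000 emits only the oris }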
     usereg := false;
-    if (size in [OS_64, OS_S64]) then begin
-      { ts: use register method for 64 bit consts. Sloooooow }
+    if (aword(a) > high(dword)) then begin
       usereg := true;
-    end else if (size in [OS_32, OS_S32]) then begin
-      list.concat(taicpu.op_reg_reg_const(oplo, dst, src, word(a)));
-      list.concat(taicpu.op_reg_reg_const(ophi, dst, dst, word(a shr 16)));
     end else begin
-      list.concat(taicpu.op_reg_reg_const(oplo, dst, src, word(a)));
+      if (word(a) <> 0) then begin
+        list.concat(taicpu.op_reg_reg_const(loOp, dst, src, word(a)));
+        if (word(a shr 16) <> 0) then
+          list.concat(taicpu.op_reg_reg_const(hiOp, dst, dst, word(a shr 16)));
+      end else if (word(a shr 16) <> 0) then
+        list.concat(taicpu.op_reg_reg_const(hiOp, dst, src, word(a shr 16)));
     end;
   end;

+  procedure do_lo_hi_and;
+  begin
+    { optimization for logical and with immediate: only use "andi." for 16 bit
+      constants, otherwise fall back to the register method. Splitting larger
+      constants would give no advantage over the register method (via
+      useReg := true), which needs a scratch register and three instructions. }
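+    { e.g. AND with $00FF becomes "andi. dst,src,$00FF", while AND with
+      $00FF00FF falls back to the register method }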
+    usereg := false;
+    if (aword(a) > high(word)) then
+      usereg := true
+    else
+      list.concat(taicpu.op_reg_reg_const(A_ANDI_, dst, src, word(a)));
+  end;
+var
+  scratchreg: tregister;
+  shift, shiftmask : longint;
+
 begin
+  { subtraction is the same as addition with a negative constant }
   if op = OP_SUB then begin
     a_op_const_reg_reg(list, OP_ADD, size, -a, src, dst);
     exit;
   end;
-  ophi := TOpCG2AsmOpConstHi[op];
-  oplo := TOpCG2AsmOpConstLo[op];
-  { peephole optimizations for AND, OR, XOR - can't this be done at
-    some higher level, independent of architecture? }
-  if (op in [OP_AND, OP_OR, OP_XOR]) then begin
-    if (a = 0) then begin
-      if op = OP_AND then
-        list.concat(taicpu.op_reg_const(A_LI, dst, 0))
-      else
-        a_load_reg_reg(list, size, size, src, dst);
-      exit;
-    end else if (a = -1) then begin
-      case op of
-        OP_OR:
-          list.concat(taicpu.op_reg_const(A_LI, dst, -1));
-        OP_XOR:
-          list.concat(taicpu.op_reg_reg(A_NOT, dst, src));
-        OP_AND:
-          a_load_reg_reg(list, size, size, src, dst);
-      end;
-      exit;
-    end;
-  { optimization for add }
-  end else if (op = OP_ADD) then
-    if a = 0 then begin
-      a_load_reg_reg(list, size, size, src, dst);
-      exit;
-    end else if (a >= low(smallint)) and (a <= high(smallint)) then begin
-      list.concat(taicpu.op_reg_reg_const(A_ADDI, dst, src, smallint(a)));
-      exit;
-    end;
+  { This case statement includes some peephole optimizations for the various
+    operations (e.g. AND, OR, XOR, ..) - can't this be done at some higher
+    level, independent of architecture? }

-  { otherwise, the instructions we can generate depend on the operation }
+  { assume that we do not need a scratch register for the operation }
   useReg := false;
-  case op of
+  case (op) of
     OP_DIV, OP_IDIV:
+      { actually, this method should never be called directly with OP_DIV or
+        OP_IDIV, so just provide basic support.
+        TODO: move the division-by-constant code from nppcmat.pas here }
       if (a = 0) then
         internalerror(200208103)
-      else if (a = 1) then begin
-        a_load_reg_reg(list, OS_INT, OS_INT, src, dst);
-        exit
-      end else if false {and ispowerof2(a, l1)} then begin
-        internalerror(200208103);
-        case op of
-          OP_DIV: begin
-            list.concat(taicpu.op_reg_reg_const(A_SRDI, dst, src, l1));
-          end;
-          OP_IDIV:
-            begin
-              list.concat(taicpu.op_reg_reg_const(A_SRADI, dst, src, l1));
-              list.concat(taicpu.op_reg_reg(A_ADDZE, dst, dst));
-            end;
-        end;
-        exit;
-      end else
-        usereg := true;
+      else if (a = 1) then
+        a_load_reg_reg(list, size, size, src, dst)
+      else
+        usereg := true;
     OP_IMUL, OP_MUL:
-      if (a = 0) then begin
-        list.concat(taicpu.op_reg_const(A_LI, dst, 0));
-        exit
-      end else if (a = -1) then begin
-        list.concat(taicpu.op_reg_reg(A_NEG, dst, dst));
-      end else if (a = 1) then begin
-        a_load_reg_reg(list, OS_INT, OS_INT, src, dst);
-        exit
-      end else if ispowerof2(a, l1) then
-        list.concat(taicpu.op_reg_reg_const(A_SLDI, dst, src, l1))
+      { idea: factorize constant multiplicands and use adds/shifts for constants
+        with few factors; however, even a 64 bit multiply is already quite fast on PPC64 }
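+      { e.g. a = 16 is handled below via ispowerof2 as "sldi dst,src,4" }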
+      if (a = 0) then
+        a_load_const_reg(list, size, 0, dst)
+      else if (a = -1) then
+        list.concat(taicpu.op_reg_reg(A_NEG, dst, dst))
+      else if (a = 1) then
+        a_load_reg_reg(list, OS_INT, OS_INT, src, dst)
+      else if ispowerof2(a, shift) then
+        list.concat(taicpu.op_reg_reg_const(A_SLDI, dst, src, shift))
       else if (a >= low(smallint)) and (a <= high(smallint)) then
         list.concat(taicpu.op_reg_reg_const(A_MULLI, dst, src,
           smallint(a)))
       else
         usereg := true;
     OP_ADD:
-      {$todo ts:optimize}
-      useReg := true;
+      if (a = 0) then
+        a_load_reg_reg(list, size, size, src, dst)
+      else if (a >= low(smallint)) and (a <= high(smallint)) then
+        list.concat(taicpu.op_reg_reg_const(A_ADDI, dst, src, smallint(a)))
+      else
+        useReg := true;
     OP_OR:
-      do_lo_hi;
+      if (a = 0) then
+        a_load_reg_reg(list, size, size, src, dst)
+      else if (a = -1) then
+        a_load_const_reg(list, size, -1, dst)
+      else
+        do_lo_hi(A_ORI, A_ORIS);
     OP_AND:
-      useReg := true;
+      if (a = 0) then
+        a_load_const_reg(list, size, 0, dst)
+      else if (a = -1) then
+        a_load_reg_reg(list, size, size, src, dst)
+      else
+        do_lo_hi_and;
     OP_XOR:
-      do_lo_hi;
+      if (a = 0) then
+        a_load_reg_reg(list, size, size, src, dst)
+      else if (a = -1) then
+        list.concat(taicpu.op_reg_reg(A_NOT, dst, src))
+      else
+        do_lo_hi(A_XORI, A_XORIS);
     OP_SHL, OP_SHR, OP_SAR:
       begin
-        {$note ts: cleanup todo, fix remaining bugs}
-        if (size in [OS_64, OS_S64]) then begin
-          if (a and 63) <> 0 then
-            list.concat(taicpu.op_reg_reg_const(
-              TShiftOpCG2AsmOpConst64[Op], dst, src, a and 63))
-          else
-            a_load_reg_reg(list, size, size, src, dst);
-          if (a shr 6) <> 0 then
-            internalError(68991);
-        end else begin
-          if (a and 31) <> 0 then
-            list.concat(taicpu.op_reg_reg_const(
-              TShiftOpCG2AsmOpConst32[Op], dst, src, a and 31))
-          else
-            a_load_reg_reg(list, size, size, src, dst);
-          if (a shr 5) <> 0 then
-            internalError(68991);
-        end;
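+        { unified handling for 32 and 64 bit shifts: mask the constant to the
+          operand width and reject shift counts exceeding it below }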
+        if (size in [OS_64, OS_S64]) then
+          shift := 6
+        else
+          shift := 5;
+
+        shiftmask := (1 shl shift)-1;
+        if (a and shiftmask) <> 0 then
+          list.concat(taicpu.op_reg_reg_const(
+            TShiftOpCG2AsmOpConst[size in [OS_64, OS_S64], op], dst, src, a and shiftmask))
+        else
+          a_load_reg_reg(list, size, size, src, dst);
+        if ((a shr shift) <> 0) then
+          internalError(68991);
       end
-      else
-        internalerror(200109091);
+    else
+      internalerror(200109091);
   end;
-  { if all else failed, load the constant in a register and then }
-  { perform the operation }
-  if useReg then begin
+  { if all else failed, load the constant into a register and then
+    perform the operation }
+  if (useReg) then begin
     scratchreg := rg[R_INTREGISTER].getregister(list, R_SUBWHOLE);
     a_load_const_reg(list, size, a, scratchreg);
     a_op_reg_reg_reg(list, op, size, scratchreg, src, dst);
@@ -843,35 +822,29 @@ begin
     OP_NEG, OP_NOT:
       begin
         list.concat(taicpu.op_reg_reg(op_reg_reg_opcg2asmop64[op], dst, src1));
-        if (op = OP_NOT) and
-          not (size in [OS_64, OS_S64]) then
+        if (op = OP_NOT) and not (size in [OS_64, OS_S64]) then
           { zero/sign extend result again, fromsize is not important here }
          a_load_reg_reg(list, OS_S64, size, dst, dst)
      end;
-    else
-      {$NOTE ts:testme}
-      if (size in [OS_64, OS_S64]) then begin
-        list.concat(taicpu.op_reg_reg_reg(op_reg_reg_opcg2asmop64[op], dst, src2,
-          src1));
-      end else begin
-        list.concat(taicpu.op_reg_reg_reg(op_reg_reg_opcg2asmop32[op], dst, src2,
-          src1));
-      end;
+    else
+      if (size in [OS_64, OS_S64]) then begin
+        list.concat(taicpu.op_reg_reg_reg(op_reg_reg_opcg2asmop64[op], dst, src2,
+          src1));
+      end else begin
+        list.concat(taicpu.op_reg_reg_reg(op_reg_reg_opcg2asmop32[op], dst, src2,
+          src1));
+      end;
   end;
 end;

 {*************** compare instructions ****************}

-procedure tcgppc.a_cmp_const_reg_label(list: taasmoutput; size: tcgsize; cmp_op:
-  topcmp; a: aint; reg: tregister;
-  l: tasmlabel);
-
+procedure tcgppc.a_cmp_const_reg_label(list: taasmoutput; size: tcgsize;
+  cmp_op: topcmp; a: aint; reg: tregister; l: tasmlabel);
 var
   scratch_register: TRegister;
   signed: boolean;
-
 begin
-  { todo: use 32 bit compares? }
   signed := cmp_op in [OC_GT, OC_LT, OC_GTE, OC_LTE];
   { in the following case, we generate more efficient code when }
   { signed is true }
@@ -897,13 +870,10 @@ begin
   a_jmp(list, A_BC, TOpCmp2AsmCond[cmp_op], 0, l);
 end;

-procedure tcgppc.a_cmp_reg_reg_label(list: taasmoutput; size: tcgsize; cmp_op:
-  topcmp;
-  reg1, reg2: tregister; l: tasmlabel);
-
+procedure tcgppc.a_cmp_reg_reg_label(list: taasmoutput; size: tcgsize;
+  cmp_op: topcmp; reg1, reg2: tregister; l: tasmlabel);
 var
   op: tasmop;
-
 begin
   if cmp_op in [OC_GT, OC_LT, OC_GTE, OC_LTE] then
     if (size in [OS_64, OS_S64]) then
@@ -953,11 +923,9 @@ end;

 procedure tcgppc.g_flags2reg(list: taasmoutput; size: TCgSize; const f:
   TResFlags; reg: TRegister);
-
 var
   testbit: byte;
   bitvalue: boolean;
-
 begin
   { get the bit to extract from the condition register + its requested value (0 or 1) }
   testbit := ((f.cr - RS_CR0) * 4);
@@ -1375,7 +1343,7 @@ begin
   list.concat(taicpu.op_reg_reg_const(A_SUBI, dst.base, dst.base, 8));
   countreg := rg[R_INTREGISTER].getregister(list, R_SUBWHOLE);
   a_load_const_reg(list, OS_32, count, countreg);
-  { explicitely allocate R_0 since it can be used safely here
+  { explicitly allocate F0 since it can be used safely here
     (for holding data that's being copied) }
   a_reg_alloc(list, NR_F0);
   objectlibrary.getjumplabel(lab);
|