فهرست منبع

Merged revisions 9300,9302-9303,9307-9308,9310,9322,9337,9340,9343-9347 via svnmerge from
http://svn.freepascal.org/svn/fpc/trunk

........
r9300 | peter | 2007-11-19 23:52:58 +0100 (Mon, 19 Nov 2007) | 2 lines

* don't use readdata for getbyte/getword/getlongint
........
r9302 | jonas | 2007-11-20 12:59:15 +0100 (Tue, 20 Nov 2007) | 2 lines

* fixed handling of unaligned in assignments
........
r9303 | jonas | 2007-11-20 13:15:47 +0100 (Tue, 20 Nov 2007) | 2 lines

* fixed wrong unaligned() changes
........
r9345 | jonas | 2007-11-29 13:56:31 +0100 (Thu, 29 Nov 2007) | 4 lines

+ unaligned support in a_load_reg_ref and a_load_ref_reg for ppc64
(only needs it in case a 64 bit value is loaded from an address
with alignment < 4 bytes)
........
r9346 | jonas | 2007-11-29 13:59:05 +0100 (Thu, 29 Nov 2007) | 4 lines

- removed internalerror in case a subsetref value is loaded on a cpu
which requires proper alignment (they have to support unaligned accesses
in their a_load_*_* routines)
........
r9347 | jonas | 2007-11-29 18:18:52 +0100 (Thu, 29 Nov 2007) | 5 lines

* perform 4 instead of 8 byte copies at a time if source or dest is
unaligned
* use integer instead of floating point for 8 byte copies because the
integer unit has lower latency
........

git-svn-id: branches/fixes_2_2@9843 -

peter 17 سال پیش
والد
کامیت
6d468276ba
6 فایلهای تغییر یافته به همراه 163 افزوده شده و 102 حذف شده
  1. 0 8
      compiler/cgobj.pas
  2. 5 1
      compiler/ncgld.pas
  3. 1 0
      compiler/options.pas
  4. 86 53
      compiler/powerpc64/cgcpu.pas
  5. 31 3
      compiler/ppcgen/cgppc.pas
  6. 40 37
      compiler/ppu.pas

+ 0 - 8
compiler/cgobj.pas

@@ -1182,14 +1182,6 @@ implementation
       begin
         intloadsize := packedbitsloadsize(sref.bitlen);
 
-{$if not(defined(arm)) and not(defined(sparc))}
-        { may need to be split into several smaller loads/stores }
-        if (tf_requires_proper_alignment in target_info.flags) and
-           (intloadsize <> 1) and
-           (intloadsize <> sref.ref.alignment) then
-          internalerror(2006082011);
-{$endif not(defined(arm)) and not(defined(sparc))}
-
         if (intloadsize = 0) then
           internalerror(2006081310);
 

+ 5 - 1
compiler/ncgld.pas

@@ -577,7 +577,11 @@ implementation
                         len:=left.resultdef.size;
                         if (right.location.reference.offset mod sizeof(aint)<>0) or
                           (left.location.reference.offset mod sizeof(aint)<>0) or
-                          (right.resultdef.alignment<sizeof(aint)) then
+                          (right.resultdef.alignment<sizeof(aint)) or
+                          ((right.location.reference.alignment<>0) and
+                           (right.location.reference.alignment<sizeof(aint))) or
+                          ((left.location.reference.alignment<>0) and
+                           (left.location.reference.alignment<sizeof(aint))) then
                           cg.g_concatcopy_unaligned(current_asmdata.CurrAsmList,right.location.reference,left.location.reference,len)
                         else
                           cg.g_concatcopy(current_asmdata.CurrAsmList,right.location.reference,left.location.reference,len);

+ 1 - 0
compiler/options.pas

@@ -2152,6 +2152,7 @@ begin
 
 {$ifdef SUPPORT_UNALIGNED}
   def_system_macro('FPC_SUPPORTS_UNALIGNED');
+  def_system_macro('FPC_UNALIGNED_FIXED');
 {$endif SUPPORT_UNALIGNED}
 {$ifdef powerpc64}
   def_system_macro('FPC_HAS_LWSYNC');

+ 86 - 53
compiler/powerpc64/cgcpu.pas

@@ -401,7 +401,7 @@ begin
         RS_R9, RS_R10, RS_R11, RS_R12, RS_R31, RS_R30, RS_R29,
         RS_R28, RS_R27, RS_R26, RS_R25, RS_R24, RS_R23, RS_R22,
        RS_R21, RS_R20, RS_R19, RS_R18, RS_R17, RS_R16, RS_R15,
-       RS_R14], first_int_imreg, []);	
+       RS_R14], first_int_imreg, []);
   rg[R_FPUREGISTER] := trgcpu.create(R_FPUREGISTER, R_SUBNONE,
     [RS_F0, RS_F1, RS_F2, RS_F3, RS_F4, RS_F5, RS_F6, RS_F7, RS_F8, RS_F9,
     RS_F10, RS_F11, RS_F12, RS_F13, RS_F31, RS_F30, RS_F29, RS_F28, RS_F27,
@@ -706,7 +706,7 @@ begin
   if not (size in [OS_8, OS_S8, OS_16, OS_S16, OS_32, OS_S32, OS_64, OS_S64]) then
     internalerror(2002090902);
   { if PIC or basic optimizations are enabled, and the number of instructions which would be
-   required to load the value is greater than 2, store (and later load) the value from there } 
+   required to load the value is greater than 2, store (and later load) the value from there }
 //  if (((cs_opt_peephole in current_settings.optimizerswitches) or (cs_create_pic in current_settings.moduleswitches)) and
 //    (getInstructionLength(a) > 2)) then
 //    loadConstantPIC(list, size, a, reg)
@@ -737,7 +737,7 @@ const
 var
   op: tasmop;
   ref2: treference;
-
+  tmpreg: tregister;
 begin
   {$IFDEF EXTDEBUG}
   list.concat(tai_comment.create(strpnew('a_load_ref_reg ' + ref2string(ref))));
@@ -754,6 +754,30 @@ begin
   ref2 := ref;
   fixref(list, ref2);
 
+  { unaligned 64 bit accesses are much slower than unaligned }
+  { 32 bit accesses because they cause a hardware exception  }
+  { (which isn't handled by linux, so there you even get a   }
+  {  crash)                                                  }
+  if (ref.alignment<>0) and
+     (fromsize in [OS_64,OS_S64]) and
+     (ref.alignment<4) then
+    begin
+      if (ref2.base<>NR_NO) and
+         (ref2.index<>NR_NO) then
+        begin
+          tmpreg:=getintregister(list,OS_64);
+          a_op_reg_reg_reg(list,OP_SHR,OS_64,ref2.base,ref2.index,tmpreg);
+          ref2.base:=tmpreg;
+          ref2.index:=NR_NO;
+        end;
+      tmpreg:=getintregister(list,OS_32);
+      a_load_ref_reg(list,OS_32,OS_32,ref2,tmpreg);
+      inc(ref2.offset,4);
+      a_load_ref_reg(list,OS_32,OS_32,ref2,reg);
+      list.concat(taicpu.op_reg_reg_const_const(A_RLDIMI, reg, tmpreg, 32, 0));
+      exit;
+    end;
+
   op := loadinstr[fromsize, ref2.index <> NR_NO, false];
   { there is no LWAU instruction, simulate using ADDI and LWA }
   if (op = A_NOP) then begin
@@ -807,10 +831,10 @@ begin
   {$ifdef extdebug}
   list.concat(tai_comment.create(strpnew('a_load_subsetreg_reg subsetregsize = ' + cgsize2string(sreg.subsetregsize) + ' subsetsize = ' + cgsize2string(subsetsize) + ' startbit = ' + intToStr(sreg.startbit) + ' tosize = ' + cgsize2string(tosize))));
   {$endif}
-  { do the extraction if required and then extend the sign correctly. (The latter is actually required only for signed subsets 
+  { do the extraction if required and then extend the sign correctly. (The latter is actually required only for signed subsets
   and if that subset is not >= the tosize). }
   if (sreg.startbit <> 0) or
-     (sreg.bitlen <> tcgsize2size[subsetsize]*8) then begin 
+     (sreg.bitlen <> tcgsize2size[subsetsize]*8) then begin
     list.concat(taicpu.op_reg_reg_const_const(A_RLDICL, destreg, sreg.subsetreg, (64 - sreg.startbit) and 63, 64 - sreg.bitlen));
     if (subsetsize in [OS_S8..OS_S128]) then
       if ((sreg.bitlen mod 8) = 0) then begin
@@ -1714,18 +1738,14 @@ end;
 
 { ************* concatcopy ************ }
 
-const
-  maxmoveunit = 8;
-
-
 procedure tcgppc.g_concatcopy(list: TAsmList; const source, dest: treference;
   len: aint);
 
 var
-  countreg, tempreg: TRegister;
+  countreg, tempreg:TRegister;
   src, dst: TReference;
   lab: tasmlabel;
-  count, count2: longint;
+  count, count2, step: longint;
   size: tcgsize;
 
 begin
@@ -1735,7 +1755,8 @@ begin
   list.concat(tai_comment.create(strpnew('g_concatcopy1 ' + inttostr(len) + ' bytes left ')));
 {$ENDIF extdebug}
   { if the references are equal, exit, there is no need to copy anything }
-  if (references_equal(source, dest)) then
+  if references_equal(source, dest) or
+     (len=0) then
     exit;
 
   { make sure short loads are handled as optimally as possible;
@@ -1744,7 +1765,7 @@ begin
    NOTE: maybe use some scratch registers to pair load/store instructions
   }
 
-  if (len <= maxmoveunit) then begin
+  if (len <= 8) then begin
     src := source; dst := dest;
     {$IFDEF extdebug}
     list.concat(tai_comment.create(strpnew('g_concatcopy3 ' + inttostr(src.offset) + ' ' + inttostr(dst.offset))));
@@ -1774,16 +1795,29 @@ begin
 {$ENDIF extdebug}
 
 
-  count := len div maxmoveunit;
+  if not(source.alignment in [1,2]) and
+     not(dest.alignment in [1,2]) then
+    begin
+      count:=len div 8;
+      step:=8;
+      size:=OS_64;
+    end
+  else
+    begin
+      count:=len div 4;
+      step:=4;
+      size:=OS_32;
+    end;
 
+  tempreg:=getintregister(list,size);
   reference_reset(src);
   reference_reset(dst);
   { load the address of source into src.base }
   if (count > 4) or
     not issimpleref(source) or
     ((source.index <> NR_NO) and
-    ((source.offset + len) > high(smallint))) then begin
-    src.base := rg[R_INTREGISTER].getregister(list, R_SUBWHOLE);
+     ((source.offset + len) > high(smallint))) then begin
+    src.base := getaddressregister(list);
     a_loadaddr_ref_reg(list, source, src.base);
   end else begin
     src := source;
@@ -1793,7 +1827,7 @@ begin
     not issimpleref(dest) or
     ((dest.index <> NR_NO) and
     ((dest.offset + len) > high(smallint))) then begin
-    dst.base := rg[R_INTREGISTER].getregister(list, R_SUBWHOLE);
+    dst.base := getaddressregister(list);
     a_loadaddr_ref_reg(list, dest, dst.base);
   end else begin
     dst := dest;
@@ -1802,64 +1836,63 @@ begin
   { generate a loop }
   if count > 4 then begin
     { the offsets are zero after the a_loadaddress_ref_reg and just
-     have to be set to 8. I put an Inc there so debugging may be
+     have to be set to step. I put an Inc there so debugging may be
      easier (should offset be different from zero here, it will be
      easy to notice in the generated assembler }
-    inc(dst.offset, 8);
-    inc(src.offset, 8);
-    list.concat(taicpu.op_reg_reg_const(A_SUBI, src.base, src.base, 8));
-    list.concat(taicpu.op_reg_reg_const(A_SUBI, dst.base, dst.base, 8));
-    countreg := rg[R_INTREGISTER].getregister(list, R_SUBWHOLE);
-    a_load_const_reg(list, OS_64, count, countreg);
-    { explicitely allocate F0 since it can be used safely here
-     (for holding date that's being copied) }
-    a_reg_alloc(list, NR_F0);
+    inc(dst.offset, step);
+    inc(src.offset, step);
+    list.concat(taicpu.op_reg_reg_const(A_SUBI, src.base, src.base, step));
+    list.concat(taicpu.op_reg_reg_const(A_SUBI, dst.base, dst.base, step));
+    countreg := getintregister(list, OS_INT);
+    a_load_const_reg(list, OS_INT, count, countreg);
     current_asmdata.getjumplabel(lab);
     a_label(list, lab);
     list.concat(taicpu.op_reg_reg_const(A_SUBIC_, countreg, countreg, 1));
-    list.concat(taicpu.op_reg_ref(A_LFDU, NR_F0, src));
-    list.concat(taicpu.op_reg_ref(A_STFDU, NR_F0, dst));
+    if (size=OS_64) then
+      begin
+        list.concat(taicpu.op_reg_ref(A_LDU, tempreg, src));
+        list.concat(taicpu.op_reg_ref(A_STDU, tempreg, dst));
+      end
+    else
+      begin
+        list.concat(taicpu.op_reg_ref(A_LWZU, tempreg, src));
+        list.concat(taicpu.op_reg_ref(A_STWU, tempreg, dst));
+      end;
     a_jmp(list, A_BC, C_NE, 0, lab);
-    a_reg_dealloc(list, NR_F0);
-    len := len mod 8;
+    a_reg_sync(list,src.base);
+    a_reg_sync(list,dst.base);
+    a_reg_sync(list,countreg);
+    len := len mod step;
+    count := 0;
   end;
 
-  count := len div 8;
   { unrolled loop }
   if count > 0 then begin
-    a_reg_alloc(list, NR_F0);
     for count2 := 1 to count do begin
-      a_loadfpu_ref_reg(list, OS_F64, OS_F64, src, NR_F0);
-      a_loadfpu_reg_ref(list, OS_F64, OS_F64, NR_F0, dst);
-      inc(src.offset, 8);
-      inc(dst.offset, 8);
+      a_load_ref_reg(list, size, size, src, tempreg);
+      a_load_reg_ref(list, size, size, tempreg, dst);
+      inc(src.offset, step);
+      inc(dst.offset, step);
     end;
-    a_reg_dealloc(list, NR_F0);
-    len := len mod 8;
+    len := len mod step;
   end;
 
   if (len and 4) <> 0 then begin
-    a_reg_alloc(list, NR_R0);
-    a_load_ref_reg(list, OS_32, OS_32, src, NR_R0);
-    a_load_reg_ref(list, OS_32, OS_32, NR_R0, dst);
+    a_load_ref_reg(list, OS_32, OS_32, src, tempreg);
+    a_load_reg_ref(list, OS_32, OS_32, tempreg, dst);
     inc(src.offset, 4);
     inc(dst.offset, 4);
-    a_reg_dealloc(list, NR_R0);
   end;
   { copy the leftovers }
   if (len and 2) <> 0 then begin
-    a_reg_alloc(list, NR_R0);
-    a_load_ref_reg(list, OS_16, OS_16, src, NR_R0);
-    a_load_reg_ref(list, OS_16, OS_16, NR_R0, dst);
+    a_load_ref_reg(list, OS_16, OS_16, src, tempreg);
+    a_load_reg_ref(list, OS_16, OS_16, tempreg, dst);
     inc(src.offset, 2);
     inc(dst.offset, 2);
-    a_reg_dealloc(list, NR_R0);
   end;
   if (len and 1) <> 0 then begin
-    a_reg_alloc(list, NR_R0);
-    a_load_ref_reg(list, OS_8, OS_8, src, NR_R0);
-    a_load_reg_ref(list, OS_8, OS_8, NR_R0, dst);
-    a_reg_dealloc(list, NR_R0);
+    a_load_ref_reg(list, OS_8, OS_8, src, tempreg);
+    a_load_reg_ref(list, OS_8, OS_8, tempreg, dst);
   end;
 
 end;
@@ -1874,7 +1907,7 @@ begin
   end;
 
   { for ppc64/linux emit correct code which sets up a stack frame and then calls the
-  external method normally to ensure that the GOT/TOC will be loaded correctly if 
+  external method normally to ensure that the GOT/TOC will be loaded correctly if
   required.
 
   It's not really advantageous to use cg methods here because they are too specialized.
@@ -1952,7 +1985,7 @@ procedure tcgppc.a_load_store(list: TAsmList; op: tasmop; reg: tregister;
         A_LD, A_LDU, A_STD, A_STDU, A_LWA :
            if ((ref.offset mod 4) <> 0) then begin
             tmpreg := rg[R_INTREGISTER].getregister(list, R_SUBWHOLE);
-    
+
             if (ref.base <> NR_NO) then begin
               a_op_const_reg_reg(list, OP_ADD, OS_ADDR, ref.offset mod 4, ref.base, tmpreg);
               ref.base := tmpreg;

+ 31 - 3
compiler/ppcgen/cgppc.pas

@@ -394,19 +394,47 @@ unit cgppc;
 {$endif cpu64bit}
         );
     var
-      op: TAsmOp;
       ref2: TReference;
+      tmpreg: tregister;
+      op: TAsmOp;
     begin
       if not (fromsize in [OS_8..OS_INT,OS_S8..OS_SINT]) then
         internalerror(2002090903);
       if not (tosize in [OS_8..OS_INT,OS_S8..OS_SINT]) then
         internalerror(2002090905);
 
-      ref2 := ref;
-      fixref(list, ref2);
       if tosize in [OS_S8..OS_SINT] then
         { storing is the same for signed and unsigned values }
         tosize := tcgsize(ord(tosize) - (ord(OS_S8) - ord(OS_8)));
+
+      ref2 := ref;
+      fixref(list, ref2);
+
+      { unaligned 64 bit accesses are much slower than unaligned }
+      { 32 bit accesses because they cause a hardware exception  }
+      { (which isn't handled by linux, so there you even get a   }
+      {  crash)                                                  }
+       if (ref2.alignment<>0) and
+         (tosize in [OS_64,OS_S64]) and
+         (ref.alignment<4) then
+        begin
+          if (ref2.base<>NR_NO) and
+             (ref2.index<>NR_NO) then
+            begin
+              tmpreg:=getintregister(list,OS_64);
+              a_op_reg_reg_reg(list,OP_SHR,OS_64,ref2.base,ref2.index,tmpreg);
+              ref2.base:=tmpreg;
+              ref2.index:=NR_NO;
+            end;
+          tmpreg:=getintregister(list,OS_64);
+          a_op_const_reg_reg(list,OP_SHR,OS_64,32,reg,tmpreg);
+          inc(ref2.offset,4);
+          a_load_reg_ref(list,OS_32,OS_32,reg,ref2);
+          dec(ref2.offset,4);
+          a_load_reg_ref(list,OS_32,OS_32,tmpreg,ref2);
+          exit;
+        end;
+
       op := storeinstr[tcgsize2unsigned[tosize], ref2.index <> NR_NO, false];
       a_load_store(list, op, reg, ref2);
     end;

+ 40 - 37
compiler/ppu.pas

@@ -479,19 +479,7 @@ begin
     if bufsize=0 then
       exit;
   until false;
-  { For small values copy directly }
-  if len<=sizeof(ptruint) then
-    begin
-      pmax:=p+len;
-      while (p<pmax) do
-        begin
-          p^:=pbuf^;
-          inc(pbuf);
-          inc(p);
-        end;
-    end
-  else
-    move(pbuf^,p^,len);
+  move(pbuf^,p^,len);
   inc(bufidx,len);
 end;
 
@@ -574,43 +562,48 @@ end;
 
 
 function tppufile.getbyte:byte;
-var
-  b : byte;
 begin
   if entryidx+1>entry.size then
    begin
      error:=true;
-     getbyte:=0;
+     result:=0;
      exit;
    end;
-  readdata(b,1);
-  getbyte:=b;
+  if bufsize-bufidx>=1 then
+    begin
+      result:=pbyte(@buf[bufidx])^;
+      inc(bufidx);
+    end
+  else
+    readdata(result,1);
   inc(entryidx);
 end;
 
 
 function tppufile.getword:word;
-var
-  w : word;
 begin
   if entryidx+2>entry.size then
    begin
      error:=true;
-     getword:=0;
+     result:=0;
      exit;
    end;
-  readdata(w,2);
-  if change_endian then
-   getword:=swapendian(w)
+{$ifdef FPC_UNALIGNED_FIXED}
+  if bufsize-bufidx>=sizeof(word) then
+    begin
+      result:=Unaligned(pword(@buf[bufidx])^);
+      inc(bufidx,sizeof(word));
+    end
   else
-   getword:=w;
+{$endif FPC_UNALIGNED_FIXED}
+    readdata(result,sizeof(word));
+  if change_endian then
+   result:=swapendian(result);
   inc(entryidx,2);
 end;
 
 
 function tppufile.getlongint:longint;
-var
-  l : longint;
 begin
   if entryidx+4>entry.size then
    begin
@@ -618,18 +611,22 @@ begin
      getlongint:=0;
      exit;
    end;
-  readdata(l,4);
-  if change_endian then
-   getlongint:=swapendian(l)
+{$ifdef FPC_UNALIGNED_FIXED}
+  if bufsize-bufidx>=sizeof(longint) then
+    begin
+      result:=Unaligned(plongint(@buf[bufidx])^);
+      inc(bufidx,sizeof(longint));
+    end
   else
-   getlongint:=l;
+{$endif FPC_UNALIGNED_FIXED}
+    readdata(result,sizeof(longint));
+  if change_endian then
+   result:=swapendian(result);
   inc(entryidx,4);
 end;
 
 
 function tppufile.getint64:int64;
-var
-  i : int64;
 begin
   if entryidx+8>entry.size then
    begin
@@ -637,11 +634,17 @@ begin
      result:=0;
      exit;
    end;
-  readdata(i,8);
-  if change_endian then
-    result:=swapendian(i)
+{$ifdef FPC_UNALIGNED_FIXED}
+  if bufsize-bufidx>=sizeof(int64) then
+    begin
+      result:=Unaligned(pint64(@buf[bufidx])^);
+      inc(bufidx,sizeof(int64));
+    end
   else
-    result:=i;
+{$endif FPC_UNALIGNED_FIXED}
+    readdata(result,sizeof(int64));
+  if change_endian then
+   result:=swapendian(result);
   inc(entryidx,8);
 end;