浏览代码

Merged revisions 9300,9302-9303,9307-9308,9310,9322,9337,9340,9343-9347 via svnmerge from
http://svn.freepascal.org/svn/fpc/trunk

........
r9300 | peter | 2007-11-19 23:52:58 +0100 (Mon, 19 Nov 2007) | 2 lines

* don't use readdata for getbyte/getword/getlongint
........
r9302 | jonas | 2007-11-20 12:59:15 +0100 (Tue, 20 Nov 2007) | 2 lines

* fixed handling of unaligned in assignments
........
r9303 | jonas | 2007-11-20 13:15:47 +0100 (Tue, 20 Nov 2007) | 2 lines

* fixed wrong unaligned() changes
........
r9345 | jonas | 2007-11-29 13:56:31 +0100 (Thu, 29 Nov 2007) | 4 lines

+ unaligned support in a_load_reg_ref and a_load_ref_reg for ppc64
(only needs it in case a 64 bit value is loaded from an address
with alignment < 4 bytes)
........
r9346 | jonas | 2007-11-29 13:59:05 +0100 (Thu, 29 Nov 2007) | 4 lines

- removed internalerror in case a subsetref value is loaded on a cpu
which requires proper alignment (they have to support unaligned accesses
in their a_load_*_* routines)
........
r9347 | jonas | 2007-11-29 18:18:52 +0100 (Thu, 29 Nov 2007) | 5 lines

* perform 4 instead of 8 byte copies at a time if source or dest is
unaligned
* use integer instead of floating point for 8 byte copies because the
integer unit has lower latency
........

git-svn-id: branches/fixes_2_2@9843 -

peter 17 年之前
父节点
当前提交
6d468276ba
共有 6 个文件被更改,包括 163 次插入和 102 次删除
  1. 0 8
      compiler/cgobj.pas
  2. 5 1
      compiler/ncgld.pas
  3. 1 0
      compiler/options.pas
  4. 86 53
      compiler/powerpc64/cgcpu.pas
  5. 31 3
      compiler/ppcgen/cgppc.pas
  6. 40 37
      compiler/ppu.pas

+ 0 - 8
compiler/cgobj.pas

@@ -1182,14 +1182,6 @@ implementation
       begin
       begin
         intloadsize := packedbitsloadsize(sref.bitlen);
         intloadsize := packedbitsloadsize(sref.bitlen);
 
 
-{$if not(defined(arm)) and not(defined(sparc))}
-        { may need to be split into several smaller loads/stores }
-        if (tf_requires_proper_alignment in target_info.flags) and
-           (intloadsize <> 1) and
-           (intloadsize <> sref.ref.alignment) then
-          internalerror(2006082011);
-{$endif not(defined(arm)) and not(defined(sparc))}
-
         if (intloadsize = 0) then
         if (intloadsize = 0) then
           internalerror(2006081310);
           internalerror(2006081310);
 
 

+ 5 - 1
compiler/ncgld.pas

@@ -577,7 +577,11 @@ implementation
                         len:=left.resultdef.size;
                         len:=left.resultdef.size;
                         if (right.location.reference.offset mod sizeof(aint)<>0) or
                         if (right.location.reference.offset mod sizeof(aint)<>0) or
                           (left.location.reference.offset mod sizeof(aint)<>0) or
                           (left.location.reference.offset mod sizeof(aint)<>0) or
-                          (right.resultdef.alignment<sizeof(aint)) then
+                          (right.resultdef.alignment<sizeof(aint)) or
+                          ((right.location.reference.alignment<>0) and
+                           (right.location.reference.alignment<sizeof(aint))) or
+                          ((left.location.reference.alignment<>0) and
+                           (left.location.reference.alignment<sizeof(aint))) then
                           cg.g_concatcopy_unaligned(current_asmdata.CurrAsmList,right.location.reference,left.location.reference,len)
                           cg.g_concatcopy_unaligned(current_asmdata.CurrAsmList,right.location.reference,left.location.reference,len)
                         else
                         else
                           cg.g_concatcopy(current_asmdata.CurrAsmList,right.location.reference,left.location.reference,len);
                           cg.g_concatcopy(current_asmdata.CurrAsmList,right.location.reference,left.location.reference,len);

+ 1 - 0
compiler/options.pas

@@ -2152,6 +2152,7 @@ begin
 
 
 {$ifdef SUPPORT_UNALIGNED}
 {$ifdef SUPPORT_UNALIGNED}
   def_system_macro('FPC_SUPPORTS_UNALIGNED');
   def_system_macro('FPC_SUPPORTS_UNALIGNED');
+  def_system_macro('FPC_UNALIGNED_FIXED');
 {$endif SUPPORT_UNALIGNED}
 {$endif SUPPORT_UNALIGNED}
 {$ifdef powerpc64}
 {$ifdef powerpc64}
   def_system_macro('FPC_HAS_LWSYNC');
   def_system_macro('FPC_HAS_LWSYNC');

+ 86 - 53
compiler/powerpc64/cgcpu.pas

@@ -401,7 +401,7 @@ begin
         RS_R9, RS_R10, RS_R11, RS_R12, RS_R31, RS_R30, RS_R29,
         RS_R9, RS_R10, RS_R11, RS_R12, RS_R31, RS_R30, RS_R29,
         RS_R28, RS_R27, RS_R26, RS_R25, RS_R24, RS_R23, RS_R22,
         RS_R28, RS_R27, RS_R26, RS_R25, RS_R24, RS_R23, RS_R22,
        RS_R21, RS_R20, RS_R19, RS_R18, RS_R17, RS_R16, RS_R15,
        RS_R21, RS_R20, RS_R19, RS_R18, RS_R17, RS_R16, RS_R15,
-       RS_R14], first_int_imreg, []);	
+       RS_R14], first_int_imreg, []);
   rg[R_FPUREGISTER] := trgcpu.create(R_FPUREGISTER, R_SUBNONE,
   rg[R_FPUREGISTER] := trgcpu.create(R_FPUREGISTER, R_SUBNONE,
     [RS_F0, RS_F1, RS_F2, RS_F3, RS_F4, RS_F5, RS_F6, RS_F7, RS_F8, RS_F9,
     [RS_F0, RS_F1, RS_F2, RS_F3, RS_F4, RS_F5, RS_F6, RS_F7, RS_F8, RS_F9,
     RS_F10, RS_F11, RS_F12, RS_F13, RS_F31, RS_F30, RS_F29, RS_F28, RS_F27,
     RS_F10, RS_F11, RS_F12, RS_F13, RS_F31, RS_F30, RS_F29, RS_F28, RS_F27,
@@ -706,7 +706,7 @@ begin
   if not (size in [OS_8, OS_S8, OS_16, OS_S16, OS_32, OS_S32, OS_64, OS_S64]) then
   if not (size in [OS_8, OS_S8, OS_16, OS_S16, OS_32, OS_S32, OS_64, OS_S64]) then
     internalerror(2002090902);
     internalerror(2002090902);
   { if PIC or basic optimizations are enabled, and the number of instructions which would be
   { if PIC or basic optimizations are enabled, and the number of instructions which would be
-   required to load the value is greater than 2, store (and later load) the value from there } 
+   required to load the value is greater than 2, store (and later load) the value from there }
 //  if (((cs_opt_peephole in current_settings.optimizerswitches) or (cs_create_pic in current_settings.moduleswitches)) and
 //  if (((cs_opt_peephole in current_settings.optimizerswitches) or (cs_create_pic in current_settings.moduleswitches)) and
 //    (getInstructionLength(a) > 2)) then
 //    (getInstructionLength(a) > 2)) then
 //    loadConstantPIC(list, size, a, reg)
 //    loadConstantPIC(list, size, a, reg)
@@ -737,7 +737,7 @@ const
 var
 var
   op: tasmop;
   op: tasmop;
   ref2: treference;
   ref2: treference;
-
+  tmpreg: tregister;
 begin
 begin
   {$IFDEF EXTDEBUG}
   {$IFDEF EXTDEBUG}
   list.concat(tai_comment.create(strpnew('a_load_ref_reg ' + ref2string(ref))));
   list.concat(tai_comment.create(strpnew('a_load_ref_reg ' + ref2string(ref))));
@@ -754,6 +754,30 @@ begin
   ref2 := ref;
   ref2 := ref;
   fixref(list, ref2);
   fixref(list, ref2);
 
 
+  { unaligned 64 bit accesses are much slower than unaligned }
+  { 32 bit accesses because they cause a hardware exception  }
+  { (which isn't handled by linux, so there you even get a   }
+  {  crash)                                                  }
+  if (ref.alignment<>0) and
+     (fromsize in [OS_64,OS_S64]) and
+     (ref.alignment<4) then
+    begin
+      if (ref2.base<>NR_NO) and
+         (ref2.index<>NR_NO) then
+        begin
+          tmpreg:=getintregister(list,OS_64);
+          a_op_reg_reg_reg(list,OP_SHR,OS_64,ref2.base,ref2.index,tmpreg);
+          ref2.base:=tmpreg;
+          ref2.index:=NR_NO;
+        end;
+      tmpreg:=getintregister(list,OS_32);
+      a_load_ref_reg(list,OS_32,OS_32,ref2,tmpreg);
+      inc(ref2.offset,4);
+      a_load_ref_reg(list,OS_32,OS_32,ref2,reg);
+      list.concat(taicpu.op_reg_reg_const_const(A_RLDIMI, reg, tmpreg, 32, 0));
+      exit;
+    end;
+
   op := loadinstr[fromsize, ref2.index <> NR_NO, false];
   op := loadinstr[fromsize, ref2.index <> NR_NO, false];
   { there is no LWAU instruction, simulate using ADDI and LWA }
   { there is no LWAU instruction, simulate using ADDI and LWA }
   if (op = A_NOP) then begin
   if (op = A_NOP) then begin
@@ -807,10 +831,10 @@ begin
   {$ifdef extdebug}
   {$ifdef extdebug}
   list.concat(tai_comment.create(strpnew('a_load_subsetreg_reg subsetregsize = ' + cgsize2string(sreg.subsetregsize) + ' subsetsize = ' + cgsize2string(subsetsize) + ' startbit = ' + intToStr(sreg.startbit) + ' tosize = ' + cgsize2string(tosize))));
   list.concat(tai_comment.create(strpnew('a_load_subsetreg_reg subsetregsize = ' + cgsize2string(sreg.subsetregsize) + ' subsetsize = ' + cgsize2string(subsetsize) + ' startbit = ' + intToStr(sreg.startbit) + ' tosize = ' + cgsize2string(tosize))));
   {$endif}
   {$endif}
-  { do the extraction if required and then extend the sign correctly. (The latter is actually required only for signed subsets 
+  { do the extraction if required and then extend the sign correctly. (The latter is actually required only for signed subsets
   and if that subset is not >= the tosize). }
   and if that subset is not >= the tosize). }
   if (sreg.startbit <> 0) or
   if (sreg.startbit <> 0) or
-     (sreg.bitlen <> tcgsize2size[subsetsize]*8) then begin 
+     (sreg.bitlen <> tcgsize2size[subsetsize]*8) then begin
     list.concat(taicpu.op_reg_reg_const_const(A_RLDICL, destreg, sreg.subsetreg, (64 - sreg.startbit) and 63, 64 - sreg.bitlen));
     list.concat(taicpu.op_reg_reg_const_const(A_RLDICL, destreg, sreg.subsetreg, (64 - sreg.startbit) and 63, 64 - sreg.bitlen));
     if (subsetsize in [OS_S8..OS_S128]) then
     if (subsetsize in [OS_S8..OS_S128]) then
       if ((sreg.bitlen mod 8) = 0) then begin
       if ((sreg.bitlen mod 8) = 0) then begin
@@ -1714,18 +1738,14 @@ end;
 
 
 { ************* concatcopy ************ }
 { ************* concatcopy ************ }
 
 
-const
-  maxmoveunit = 8;
-
-
 procedure tcgppc.g_concatcopy(list: TAsmList; const source, dest: treference;
 procedure tcgppc.g_concatcopy(list: TAsmList; const source, dest: treference;
   len: aint);
   len: aint);
 
 
 var
 var
-  countreg, tempreg: TRegister;
+  countreg, tempreg:TRegister;
   src, dst: TReference;
   src, dst: TReference;
   lab: tasmlabel;
   lab: tasmlabel;
-  count, count2: longint;
+  count, count2, step: longint;
   size: tcgsize;
   size: tcgsize;
 
 
 begin
 begin
@@ -1735,7 +1755,8 @@ begin
   list.concat(tai_comment.create(strpnew('g_concatcopy1 ' + inttostr(len) + ' bytes left ')));
   list.concat(tai_comment.create(strpnew('g_concatcopy1 ' + inttostr(len) + ' bytes left ')));
 {$ENDIF extdebug}
 {$ENDIF extdebug}
   { if the references are equal, exit, there is no need to copy anything }
   { if the references are equal, exit, there is no need to copy anything }
-  if (references_equal(source, dest)) then
+  if references_equal(source, dest) or
+     (len=0) then
     exit;
     exit;
 
 
   { make sure short loads are handled as optimally as possible;
   { make sure short loads are handled as optimally as possible;
@@ -1744,7 +1765,7 @@ begin
    NOTE: maybe use some scratch registers to pair load/store instructions
    NOTE: maybe use some scratch registers to pair load/store instructions
   }
   }
 
 
-  if (len <= maxmoveunit) then begin
+  if (len <= 8) then begin
     src := source; dst := dest;
     src := source; dst := dest;
     {$IFDEF extdebug}
     {$IFDEF extdebug}
     list.concat(tai_comment.create(strpnew('g_concatcopy3 ' + inttostr(src.offset) + ' ' + inttostr(dst.offset))));
     list.concat(tai_comment.create(strpnew('g_concatcopy3 ' + inttostr(src.offset) + ' ' + inttostr(dst.offset))));
@@ -1774,16 +1795,29 @@ begin
 {$ENDIF extdebug}
 {$ENDIF extdebug}
 
 
 
 
-  count := len div maxmoveunit;
+  if not(source.alignment in [1,2]) and
+     not(dest.alignment in [1,2]) then
+    begin
+      count:=len div 8;
+      step:=8;
+      size:=OS_64;
+    end
+  else
+    begin
+      count:=len div 4;
+      step:=4;
+      size:=OS_32;
+    end;
 
 
+  tempreg:=getintregister(list,size);
   reference_reset(src);
   reference_reset(src);
   reference_reset(dst);
   reference_reset(dst);
   { load the address of source into src.base }
   { load the address of source into src.base }
   if (count > 4) or
   if (count > 4) or
     not issimpleref(source) or
     not issimpleref(source) or
     ((source.index <> NR_NO) and
     ((source.index <> NR_NO) and
-    ((source.offset + len) > high(smallint))) then begin
-    src.base := rg[R_INTREGISTER].getregister(list, R_SUBWHOLE);
+     ((source.offset + len) > high(smallint))) then begin
+    src.base := getaddressregister(list);
     a_loadaddr_ref_reg(list, source, src.base);
     a_loadaddr_ref_reg(list, source, src.base);
   end else begin
   end else begin
     src := source;
     src := source;
@@ -1793,7 +1827,7 @@ begin
     not issimpleref(dest) or
     not issimpleref(dest) or
     ((dest.index <> NR_NO) and
     ((dest.index <> NR_NO) and
     ((dest.offset + len) > high(smallint))) then begin
     ((dest.offset + len) > high(smallint))) then begin
-    dst.base := rg[R_INTREGISTER].getregister(list, R_SUBWHOLE);
+    dst.base := getaddressregister(list);
     a_loadaddr_ref_reg(list, dest, dst.base);
     a_loadaddr_ref_reg(list, dest, dst.base);
   end else begin
   end else begin
     dst := dest;
     dst := dest;
@@ -1802,64 +1836,63 @@ begin
   { generate a loop }
   { generate a loop }
   if count > 4 then begin
   if count > 4 then begin
     { the offsets are zero after the a_loadaddress_ref_reg and just
     { the offsets are zero after the a_loadaddress_ref_reg and just
-     have to be set to 8. I put an Inc there so debugging may be
+     have to be set to step. I put an Inc there so debugging may be
      easier (should offset be different from zero here, it will be
      easier (should offset be different from zero here, it will be
      easy to notice in the generated assembler }
      easy to notice in the generated assembler }
-    inc(dst.offset, 8);
-    inc(src.offset, 8);
-    list.concat(taicpu.op_reg_reg_const(A_SUBI, src.base, src.base, 8));
-    list.concat(taicpu.op_reg_reg_const(A_SUBI, dst.base, dst.base, 8));
-    countreg := rg[R_INTREGISTER].getregister(list, R_SUBWHOLE);
-    a_load_const_reg(list, OS_64, count, countreg);
-    { explicitely allocate F0 since it can be used safely here
-     (for holding date that's being copied) }
-    a_reg_alloc(list, NR_F0);
+    inc(dst.offset, step);
+    inc(src.offset, step);
+    list.concat(taicpu.op_reg_reg_const(A_SUBI, src.base, src.base, step));
+    list.concat(taicpu.op_reg_reg_const(A_SUBI, dst.base, dst.base, step));
+    countreg := getintregister(list, OS_INT);
+    a_load_const_reg(list, OS_INT, count, countreg);
     current_asmdata.getjumplabel(lab);
     current_asmdata.getjumplabel(lab);
     a_label(list, lab);
     a_label(list, lab);
     list.concat(taicpu.op_reg_reg_const(A_SUBIC_, countreg, countreg, 1));
     list.concat(taicpu.op_reg_reg_const(A_SUBIC_, countreg, countreg, 1));
-    list.concat(taicpu.op_reg_ref(A_LFDU, NR_F0, src));
-    list.concat(taicpu.op_reg_ref(A_STFDU, NR_F0, dst));
+    if (size=OS_64) then
+      begin
+        list.concat(taicpu.op_reg_ref(A_LDU, tempreg, src));
+        list.concat(taicpu.op_reg_ref(A_STDU, tempreg, dst));
+      end
+    else
+      begin
+        list.concat(taicpu.op_reg_ref(A_LWZU, tempreg, src));
+        list.concat(taicpu.op_reg_ref(A_STWU, tempreg, dst));
+      end;
     a_jmp(list, A_BC, C_NE, 0, lab);
     a_jmp(list, A_BC, C_NE, 0, lab);
-    a_reg_dealloc(list, NR_F0);
-    len := len mod 8;
+    a_reg_sync(list,src.base);
+    a_reg_sync(list,dst.base);
+    a_reg_sync(list,countreg);
+    len := len mod step;
+    count := 0;
   end;
   end;
 
 
-  count := len div 8;
   { unrolled loop }
   { unrolled loop }
   if count > 0 then begin
   if count > 0 then begin
-    a_reg_alloc(list, NR_F0);
     for count2 := 1 to count do begin
     for count2 := 1 to count do begin
-      a_loadfpu_ref_reg(list, OS_F64, OS_F64, src, NR_F0);
-      a_loadfpu_reg_ref(list, OS_F64, OS_F64, NR_F0, dst);
-      inc(src.offset, 8);
-      inc(dst.offset, 8);
+      a_load_ref_reg(list, size, size, src, tempreg);
+      a_load_reg_ref(list, size, size, tempreg, dst);
+      inc(src.offset, step);
+      inc(dst.offset, step);
     end;
     end;
-    a_reg_dealloc(list, NR_F0);
-    len := len mod 8;
+    len := len mod step;
   end;
   end;
 
 
   if (len and 4) <> 0 then begin
   if (len and 4) <> 0 then begin
-    a_reg_alloc(list, NR_R0);
-    a_load_ref_reg(list, OS_32, OS_32, src, NR_R0);
-    a_load_reg_ref(list, OS_32, OS_32, NR_R0, dst);
+    a_load_ref_reg(list, OS_32, OS_32, src, tempreg);
+    a_load_reg_ref(list, OS_32, OS_32, tempreg, dst);
     inc(src.offset, 4);
     inc(src.offset, 4);
     inc(dst.offset, 4);
     inc(dst.offset, 4);
-    a_reg_dealloc(list, NR_R0);
   end;
   end;
   { copy the leftovers }
   { copy the leftovers }
   if (len and 2) <> 0 then begin
   if (len and 2) <> 0 then begin
-    a_reg_alloc(list, NR_R0);
-    a_load_ref_reg(list, OS_16, OS_16, src, NR_R0);
-    a_load_reg_ref(list, OS_16, OS_16, NR_R0, dst);
+    a_load_ref_reg(list, OS_16, OS_16, src, tempreg);
+    a_load_reg_ref(list, OS_16, OS_16, tempreg, dst);
     inc(src.offset, 2);
     inc(src.offset, 2);
     inc(dst.offset, 2);
     inc(dst.offset, 2);
-    a_reg_dealloc(list, NR_R0);
   end;
   end;
   if (len and 1) <> 0 then begin
   if (len and 1) <> 0 then begin
-    a_reg_alloc(list, NR_R0);
-    a_load_ref_reg(list, OS_8, OS_8, src, NR_R0);
-    a_load_reg_ref(list, OS_8, OS_8, NR_R0, dst);
-    a_reg_dealloc(list, NR_R0);
+    a_load_ref_reg(list, OS_8, OS_8, src, tempreg);
+    a_load_reg_ref(list, OS_8, OS_8, tempreg, dst);
   end;
   end;
 
 
 end;
 end;
@@ -1874,7 +1907,7 @@ begin
   end;
   end;
 
 
   { for ppc64/linux emit correct code which sets up a stack frame and then calls the
   { for ppc64/linux emit correct code which sets up a stack frame and then calls the
-  external method normally to ensure that the GOT/TOC will be loaded correctly if 
+  external method normally to ensure that the GOT/TOC will be loaded correctly if
   required.
   required.
 
 
   It's not really advantageous to use cg methods here because they are too specialized.
   It's not really advantageous to use cg methods here because they are too specialized.
@@ -1952,7 +1985,7 @@ procedure tcgppc.a_load_store(list: TAsmList; op: tasmop; reg: tregister;
         A_LD, A_LDU, A_STD, A_STDU, A_LWA :
         A_LD, A_LDU, A_STD, A_STDU, A_LWA :
            if ((ref.offset mod 4) <> 0) then begin
            if ((ref.offset mod 4) <> 0) then begin
             tmpreg := rg[R_INTREGISTER].getregister(list, R_SUBWHOLE);
             tmpreg := rg[R_INTREGISTER].getregister(list, R_SUBWHOLE);
-    
+
             if (ref.base <> NR_NO) then begin
             if (ref.base <> NR_NO) then begin
               a_op_const_reg_reg(list, OP_ADD, OS_ADDR, ref.offset mod 4, ref.base, tmpreg);
               a_op_const_reg_reg(list, OP_ADD, OS_ADDR, ref.offset mod 4, ref.base, tmpreg);
               ref.base := tmpreg;
               ref.base := tmpreg;

+ 31 - 3
compiler/ppcgen/cgppc.pas

@@ -394,19 +394,47 @@ unit cgppc;
 {$endif cpu64bit}
 {$endif cpu64bit}
         );
         );
     var
     var
-      op: TAsmOp;
       ref2: TReference;
       ref2: TReference;
+      tmpreg: tregister;
+      op: TAsmOp;
     begin
     begin
       if not (fromsize in [OS_8..OS_INT,OS_S8..OS_SINT]) then
       if not (fromsize in [OS_8..OS_INT,OS_S8..OS_SINT]) then
         internalerror(2002090903);
         internalerror(2002090903);
       if not (tosize in [OS_8..OS_INT,OS_S8..OS_SINT]) then
       if not (tosize in [OS_8..OS_INT,OS_S8..OS_SINT]) then
         internalerror(2002090905);
         internalerror(2002090905);
 
 
-      ref2 := ref;
-      fixref(list, ref2);
       if tosize in [OS_S8..OS_SINT] then
       if tosize in [OS_S8..OS_SINT] then
         { storing is the same for signed and unsigned values }
         { storing is the same for signed and unsigned values }
         tosize := tcgsize(ord(tosize) - (ord(OS_S8) - ord(OS_8)));
         tosize := tcgsize(ord(tosize) - (ord(OS_S8) - ord(OS_8)));
+
+      ref2 := ref;
+      fixref(list, ref2);
+
+      { unaligned 64 bit accesses are much slower than unaligned }
+      { 32 bit accesses because they cause a hardware exception  }
+      { (which isn't handled by linux, so there you even get a   }
+      {  crash)                                                  }
+       if (ref2.alignment<>0) and
+         (tosize in [OS_64,OS_S64]) and
+         (ref.alignment<4) then
+        begin
+          if (ref2.base<>NR_NO) and
+             (ref2.index<>NR_NO) then
+            begin
+              tmpreg:=getintregister(list,OS_64);
+              a_op_reg_reg_reg(list,OP_SHR,OS_64,ref2.base,ref2.index,tmpreg);
+              ref2.base:=tmpreg;
+              ref2.index:=NR_NO;
+            end;
+          tmpreg:=getintregister(list,OS_64);
+          a_op_const_reg_reg(list,OP_SHR,OS_64,32,reg,tmpreg);
+          inc(ref2.offset,4);
+          a_load_reg_ref(list,OS_32,OS_32,reg,ref2);
+          dec(ref2.offset,4);
+          a_load_reg_ref(list,OS_32,OS_32,tmpreg,ref2);
+          exit;
+        end;
+
       op := storeinstr[tcgsize2unsigned[tosize], ref2.index <> NR_NO, false];
       op := storeinstr[tcgsize2unsigned[tosize], ref2.index <> NR_NO, false];
       a_load_store(list, op, reg, ref2);
       a_load_store(list, op, reg, ref2);
     end;
     end;

+ 40 - 37
compiler/ppu.pas

@@ -479,19 +479,7 @@ begin
     if bufsize=0 then
     if bufsize=0 then
       exit;
       exit;
   until false;
   until false;
-  { For small values copy directly }
-  if len<=sizeof(ptruint) then
-    begin
-      pmax:=p+len;
-      while (p<pmax) do
-        begin
-          p^:=pbuf^;
-          inc(pbuf);
-          inc(p);
-        end;
-    end
-  else
-    move(pbuf^,p^,len);
+  move(pbuf^,p^,len);
   inc(bufidx,len);
   inc(bufidx,len);
 end;
 end;
 
 
@@ -574,43 +562,48 @@ end;
 
 
 
 
 function tppufile.getbyte:byte;
 function tppufile.getbyte:byte;
-var
-  b : byte;
 begin
 begin
   if entryidx+1>entry.size then
   if entryidx+1>entry.size then
    begin
    begin
      error:=true;
      error:=true;
-     getbyte:=0;
+     result:=0;
      exit;
      exit;
    end;
    end;
-  readdata(b,1);
-  getbyte:=b;
+  if bufsize-bufidx>=1 then
+    begin
+      result:=pbyte(@buf[bufidx])^;
+      inc(bufidx);
+    end
+  else
+    readdata(result,1);
   inc(entryidx);
   inc(entryidx);
 end;
 end;
 
 
 
 
 function tppufile.getword:word;
 function tppufile.getword:word;
-var
-  w : word;
 begin
 begin
   if entryidx+2>entry.size then
   if entryidx+2>entry.size then
    begin
    begin
      error:=true;
      error:=true;
-     getword:=0;
+     result:=0;
      exit;
      exit;
    end;
    end;
-  readdata(w,2);
-  if change_endian then
-   getword:=swapendian(w)
+{$ifdef FPC_UNALIGNED_FIXED}
+  if bufsize-bufidx>=sizeof(word) then
+    begin
+      result:=Unaligned(pword(@buf[bufidx])^);
+      inc(bufidx,sizeof(word));
+    end
   else
   else
-   getword:=w;
+{$endif FPC_UNALIGNED_FIXED}
+    readdata(result,sizeof(word));
+  if change_endian then
+   result:=swapendian(result);
   inc(entryidx,2);
   inc(entryidx,2);
 end;
 end;
 
 
 
 
 function tppufile.getlongint:longint;
 function tppufile.getlongint:longint;
-var
-  l : longint;
 begin
 begin
   if entryidx+4>entry.size then
   if entryidx+4>entry.size then
    begin
    begin
@@ -618,18 +611,22 @@ begin
      getlongint:=0;
      getlongint:=0;
      exit;
      exit;
    end;
    end;
-  readdata(l,4);
-  if change_endian then
-   getlongint:=swapendian(l)
+{$ifdef FPC_UNALIGNED_FIXED}
+  if bufsize-bufidx>=sizeof(longint) then
+    begin
+      result:=Unaligned(plongint(@buf[bufidx])^);
+      inc(bufidx,sizeof(longint));
+    end
   else
   else
-   getlongint:=l;
+{$endif FPC_UNALIGNED_FIXED}
+    readdata(result,sizeof(longint));
+  if change_endian then
+   result:=swapendian(result);
   inc(entryidx,4);
   inc(entryidx,4);
 end;
 end;
 
 
 
 
 function tppufile.getint64:int64;
 function tppufile.getint64:int64;
-var
-  i : int64;
 begin
 begin
   if entryidx+8>entry.size then
   if entryidx+8>entry.size then
    begin
    begin
@@ -637,11 +634,17 @@ begin
      result:=0;
      result:=0;
      exit;
      exit;
    end;
    end;
-  readdata(i,8);
-  if change_endian then
-    result:=swapendian(i)
+{$ifdef FPC_UNALIGNED_FIXED}
+  if bufsize-bufidx>=sizeof(int64) then
+    begin
+      result:=Unaligned(pint64(@buf[bufidx])^);
+      inc(bufidx,sizeof(int64));
+    end
   else
   else
-    result:=i;
+{$endif FPC_UNALIGNED_FIXED}
+    readdata(result,sizeof(int64));
+  if change_endian then
+   result:=swapendian(result);
   inc(entryidx,8);
   inc(entryidx,8);
 end;
 end;