@@ -401,7 +401,7 @@ begin
       RS_R9, RS_R10, RS_R11, RS_R12, RS_R31, RS_R30, RS_R29,
       RS_R28, RS_R27, RS_R26, RS_R25, RS_R24, RS_R23, RS_R22,
       RS_R21, RS_R20, RS_R19, RS_R18, RS_R17, RS_R16, RS_R15,
-      RS_R14], first_int_imreg, []);
+      RS_R14], first_int_imreg, []);
     rg[R_FPUREGISTER] := trgcpu.create(R_FPUREGISTER, R_SUBNONE,
       [RS_F0, RS_F1, RS_F2, RS_F3, RS_F4, RS_F5, RS_F6, RS_F7, RS_F8, RS_F9,
       RS_F10, RS_F11, RS_F12, RS_F13, RS_F31, RS_F30, RS_F29, RS_F28, RS_F27,
@@ -706,7 +706,7 @@ begin
   if not (size in [OS_8, OS_S8, OS_16, OS_S16, OS_32, OS_S32, OS_64, OS_S64]) then
     internalerror(2002090902);
   { if PIC or basic optimizations are enabled, and the number of instructions which would be
-    required to load the value is greater than 2, store (and later load) the value from there }
+    required to load the value is greater than 2, store (and later load) the value from there }
   // if (((cs_opt_peephole in current_settings.optimizerswitches) or (cs_create_pic in current_settings.moduleswitches)) and
   //   (getInstructionLength(a) > 2)) then
   //   loadConstantPIC(list, size, a, reg)
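Note on the disabled heuristic above: materializing an arbitrary 64-bit immediate on PPC64 can take up to five instructions (load the high half with LIS/ORI, shift it up, then OR in the low half with ORIS/ORI), so once more than two instructions are needed, loading from a constant pool tends to win, especially under PIC. A rough sketch of that cost estimate (not FPC's actual getInstructionLength), with illustrative names:

    program loadlen;
    {$mode objfpc}
    { Hedged estimate of PPC64 immediate-load length; the cases are
      the usual LI / LIS+ORI / five-instruction sequences. }
    function estimatedloadlength(a: int64): integer;
    begin
      if (a >= -32768) and (a <= 32767) then
        result := 1                              { single LI }
      else if (a >= low(longint)) and (a <= high(longint)) then
        result := 2                              { LIS + ORI }
      else
        result := 5;                             { LIS+ORI, SLDI, ORIS+ORI }
    end;

    begin
      writeln(estimatedloadlength($123456789));  { prints 5 }
    end.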
@@ -737,7 +737,7 @@ const
 var
   op: tasmop;
   ref2: treference;
-
+  tmpreg: tregister;
 begin
 {$IFDEF EXTDEBUG}
   list.concat(tai_comment.create(strpnew('a_load_ref_reg ' + ref2string(ref))));
@@ -754,6 +754,30 @@ begin
   ref2 := ref;
   fixref(list, ref2);

+  { unaligned 64 bit accesses are much slower than unaligned }
+  { 32 bit accesses because they cause a hardware exception  }
+  { (which isn't handled by linux, so there you even get a   }
+  { crash)                                                   }
+  if (ref.alignment<>0) and
+     (fromsize in [OS_64,OS_S64]) and
+     (ref.alignment<4) then
+    begin
+      if (ref2.base<>NR_NO) and
+         (ref2.index<>NR_NO) then
+        begin
+          tmpreg:=getintregister(list,OS_64);
+          a_op_reg_reg_reg(list,OP_ADD,OS_64,ref2.base,ref2.index,tmpreg);
+          ref2.base:=tmpreg;
+          ref2.index:=NR_NO;
+        end;
+      tmpreg:=getintregister(list,OS_32);
+      a_load_ref_reg(list,OS_32,OS_32,ref2,tmpreg);
+      inc(ref2.offset,4);
+      a_load_ref_reg(list,OS_32,OS_32,ref2,reg);
+      list.concat(taicpu.op_reg_reg_const_const(A_RLDIMI, reg, tmpreg, 32, 0));
+      exit;
+    end;
+
   op := loadinstr[fromsize, ref2.index <> NR_NO, false];
   { there is no LWAU instruction, simulate using ADDI and LWA }
   if (op = A_NOP) then begin
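The block added above folds base+index into one register (OP_ADD), splits an under-aligned 64-bit load into two aligned 32-bit loads, and merges them with RLDIMI (rotate left doubleword immediate then mask insert). On this big-endian target the word at the lower address is the most significant, and rldimi reg,tmpreg,32,0 drops it into the top half of reg while keeping reg's own low word. A minimal sketch of that merge, with illustrative names:

    program mergewords;
    {$mode objfpc}
    uses sysutils;

    { What RLDIMI reg,tmpreg,32,0 computes in the sequence above:
      tmpreg holds the word loaded at ref2.offset, reg the word at
      ref2.offset+4. }
    function rldimi_32_0(reg, tmpreg: qword): qword;
    begin
      result := (tmpreg shl 32) or (reg and $FFFFFFFF);
    end;

    begin
      { word at offset 0 = $11223344, word at offset 4 = $55667788 }
      writeln(IntToHex(int64(rldimi_32_0($55667788, $11223344)), 16));
      { prints 1122334455667788 }
    end.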
@@ -807,10 +831,10 @@ begin
 {$ifdef extdebug}
   list.concat(tai_comment.create(strpnew('a_load_subsetreg_reg subsetregsize = ' + cgsize2string(sreg.subsetregsize) + ' subsetsize = ' + cgsize2string(subsetsize) + ' startbit = ' + intToStr(sreg.startbit) + ' tosize = ' + cgsize2string(tosize))));
 {$endif}
-  { do the extraction if required and then extend the sign correctly. (The latter is actually required only for signed subsets
+  { do the extraction if required and then extend the sign correctly. (The latter is actually required only for signed subsets
     and if that subset is not >= the tosize). }
   if (sreg.startbit <> 0) or
-    (sreg.bitlen <> tcgsize2size[subsetsize]*8) then begin
+    (sreg.bitlen <> tcgsize2size[subsetsize]*8) then begin
     list.concat(taicpu.op_reg_reg_const_const(A_RLDICL, destreg, sreg.subsetreg, (64 - sreg.startbit) and 63, 64 - sreg.bitlen));
     if (subsetsize in [OS_S8..OS_S128]) then
       if ((sreg.bitlen mod 8) = 0) then begin
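The RLDICL above (rotate left doubleword immediate, clear left) does the actual field extraction: rotating left by (64 - startbit) and 63 is a right rotation by startbit, and clearing the leftmost 64 - bitlen bits zero-extends the field. The equivalent bit arithmetic, sketched for the non-wrapping case (startbit + bitlen <= 64 and bitlen < 64):

    program extractbits;
    {$mode objfpc}

    function extractfield(value: qword; startbit, bitlen: integer): qword;
    begin
      { rotate right by startbit, keep the low bitlen bits }
      result := (value shr startbit) and ((qword(1) shl bitlen) - 1);
    end;

    begin
      { a 5-bit field starting at bit 3 of $7F8 }
      writeln(extractfield($7F8, 3, 5));  { prints 31 }
    end.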
@@ -1714,18 +1738,14 @@ end;

 { ************* concatcopy ************ }

-const
-  maxmoveunit = 8;
-
-
 procedure tcgppc.g_concatcopy(list: TAsmList; const source, dest: treference;
   len: aint);

 var
-  countreg, tempreg: TRegister;
+  countreg, tempreg:TRegister;
   src, dst: TReference;
   lab: tasmlabel;
-  count, count2: longint;
+  count, count2, step: longint;
   size: tcgsize;

 begin
@@ -1735,7 +1755,8 @@ begin
   list.concat(tai_comment.create(strpnew('g_concatcopy1 ' + inttostr(len) + ' bytes left ')));
 {$ENDIF extdebug}
   { if the references are equal, exit, there is no need to copy anything }
-  if (references_equal(source, dest)) then
+  if references_equal(source, dest) or
+     (len=0) then
     exit;

   { make sure short loads are handled as optimally as possible;
@@ -1744,7 +1765,7 @@ begin
     NOTE: maybe use some scratch registers to pair load/store instructions
   }

-  if (len <= maxmoveunit) then begin
+  if (len <= 8) then begin
     src := source; dst := dest;
 {$IFDEF extdebug}
     list.concat(tai_comment.create(strpnew('g_concatcopy3 ' + inttostr(src.offset) + ' ' + inttostr(dst.offset))));
@@ -1774,16 +1795,29 @@ begin
 {$ENDIF extdebug}


-  count := len div maxmoveunit;
+  if not(source.alignment in [1,2]) and
+     not(dest.alignment in [1,2]) then
+    begin
+      count:=len div 8;
+      step:=8;
+      size:=OS_64;
+    end
+  else
+    begin
+      count:=len div 4;
+      step:=4;
+      size:=OS_32;
+    end;

+  tempreg:=getintregister(list,size);
   reference_reset(src);
   reference_reset(dst);
   { load the address of source into src.base }
   if (count > 4) or
     not issimpleref(source) or
     ((source.index <> NR_NO) and
-      ((source.offset + len) > high(smallint))) then begin
-    src.base := rg[R_INTREGISTER].getregister(list, R_SUBWHOLE);
+      ((source.offset + len) > high(smallint))) then begin
+    src.base := getaddressregister(list);
     a_loadaddr_ref_reg(list, source, src.base);
   end else begin
     src := source;
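With maxmoveunit gone, the copy unit now follows from the operands' alignment: 8-byte integer moves unless either side is only 1- or 2-byte aligned, in which case 4-byte moves sidestep the unaligned 64-bit penalty handled in a_load_ref_reg above. How len then decomposes into whole moves plus 4/2/1-byte leftovers, sketched outside the compiler with illustrative values:

    program copyunits;
    var
      len, step, count: longint;
      srcalign, dstalign: byte;
    begin
      len := 23; srcalign := 8; dstalign := 4;
      if not (srcalign in [1,2]) and not (dstalign in [1,2]) then
        step := 8
      else
        step := 4;
      count := len div step;  { 2 full 8-byte moves }
      len := len mod step;    { 7 bytes left = 4 + 2 + 1 }
      writeln(count, ' x ', step, '-byte moves');
      if (len and 4) <> 0 then writeln('1 x 4-byte move');
      if (len and 2) <> 0 then writeln('1 x 2-byte move');
      if (len and 1) <> 0 then writeln('1 x 1-byte move');
    end.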
@@ -1793,7 +1827,7 @@ begin
     not issimpleref(dest) or
     ((dest.index <> NR_NO) and
       ((dest.offset + len) > high(smallint))) then begin
-    dst.base := rg[R_INTREGISTER].getregister(list, R_SUBWHOLE);
+    dst.base := getaddressregister(list);
     a_loadaddr_ref_reg(list, dest, dst.base);
   end else begin
     dst := dest;
@@ -1802,64 +1836,63 @@ begin
   { generate a loop }
   if count > 4 then begin
     { the offsets are zero after the a_loadaddress_ref_reg and just
-      have to be set to 8. I put an Inc there so debugging may be
+      have to be set to step. I put an Inc there so debugging may be
       easier (should offset be different from zero here, it will be
       easy to notice in the generated assembler }
-    inc(dst.offset, 8);
-    inc(src.offset, 8);
-    list.concat(taicpu.op_reg_reg_const(A_SUBI, src.base, src.base, 8));
-    list.concat(taicpu.op_reg_reg_const(A_SUBI, dst.base, dst.base, 8));
-    countreg := rg[R_INTREGISTER].getregister(list, R_SUBWHOLE);
-    a_load_const_reg(list, OS_64, count, countreg);
-    { explicitely allocate F0 since it can be used safely here
-      (for holding date that's being copied) }
-    a_reg_alloc(list, NR_F0);
+    inc(dst.offset, step);
+    inc(src.offset, step);
+    list.concat(taicpu.op_reg_reg_const(A_SUBI, src.base, src.base, step));
+    list.concat(taicpu.op_reg_reg_const(A_SUBI, dst.base, dst.base, step));
+    countreg := getintregister(list, OS_INT);
+    a_load_const_reg(list, OS_INT, count, countreg);
     current_asmdata.getjumplabel(lab);
     a_label(list, lab);
     list.concat(taicpu.op_reg_reg_const(A_SUBIC_, countreg, countreg, 1));
-    list.concat(taicpu.op_reg_ref(A_LFDU, NR_F0, src));
-    list.concat(taicpu.op_reg_ref(A_STFDU, NR_F0, dst));
+    if (size=OS_64) then
+      begin
+        list.concat(taicpu.op_reg_ref(A_LDU, tempreg, src));
+        list.concat(taicpu.op_reg_ref(A_STDU, tempreg, dst));
+      end
+    else
+      begin
+        list.concat(taicpu.op_reg_ref(A_LWZU, tempreg, src));
+        list.concat(taicpu.op_reg_ref(A_STWU, tempreg, dst));
+      end;
     a_jmp(list, A_BC, C_NE, 0, lab);
-    a_reg_dealloc(list, NR_F0);
-    len := len mod 8;
+    a_reg_sync(list,src.base);
+    a_reg_sync(list,dst.base);
+    a_reg_sync(list,countreg);
+    len := len mod step;
+    count := 0;
   end;

-  count := len div 8;
   { unrolled loop }
   if count > 0 then begin
-    a_reg_alloc(list, NR_F0);
     for count2 := 1 to count do begin
-      a_loadfpu_ref_reg(list, OS_F64, OS_F64, src, NR_F0);
-      a_loadfpu_reg_ref(list, OS_F64, OS_F64, NR_F0, dst);
-      inc(src.offset, 8);
-      inc(dst.offset, 8);
+      a_load_ref_reg(list, size, size, src, tempreg);
+      a_load_reg_ref(list, size, size, tempreg, dst);
+      inc(src.offset, step);
+      inc(dst.offset, step);
     end;
-    a_reg_dealloc(list, NR_F0);
-    len := len mod 8;
+    len := len mod step;
   end;

   if (len and 4) <> 0 then begin
-    a_reg_alloc(list, NR_R0);
-    a_load_ref_reg(list, OS_32, OS_32, src, NR_R0);
-    a_load_reg_ref(list, OS_32, OS_32, NR_R0, dst);
+    a_load_ref_reg(list, OS_32, OS_32, src, tempreg);
+    a_load_reg_ref(list, OS_32, OS_32, tempreg, dst);
     inc(src.offset, 4);
     inc(dst.offset, 4);
-    a_reg_dealloc(list, NR_R0);
   end;
   { copy the leftovers }
   if (len and 2) <> 0 then begin
-    a_reg_alloc(list, NR_R0);
-    a_load_ref_reg(list, OS_16, OS_16, src, NR_R0);
-    a_load_reg_ref(list, OS_16, OS_16, NR_R0, dst);
+    a_load_ref_reg(list, OS_16, OS_16, src, tempreg);
+    a_load_reg_ref(list, OS_16, OS_16, tempreg, dst);
     inc(src.offset, 2);
     inc(dst.offset, 2);
-    a_reg_dealloc(list, NR_R0);
   end;
   if (len and 1) <> 0 then begin
-    a_reg_alloc(list, NR_R0);
-    a_load_ref_reg(list, OS_8, OS_8, src, NR_R0);
-    a_load_reg_ref(list, OS_8, OS_8, NR_R0, dst);
-    a_reg_dealloc(list, NR_R0);
+    a_load_ref_reg(list, OS_8, OS_8, src, tempreg);
+    a_load_reg_ref(list, OS_8, OS_8, tempreg, dst);
   end;

 end;
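The loop now keeps the data in an integer register and uses update-form loads and stores (LDU/STDU, or LWZU/STWU in the 4-byte case) instead of parking it in F0. Both bases are biased by -step up front, so the first update-form access (displacement step) lands exactly on the original address. A small emulation of the emitted pattern, illustrative only:

    program copyloop;
    {$mode objfpc}

    procedure emitcopy(const src: array of byte; var dst: array of byte;
                       count, step: longint);
    var
      sp, dp, i: longint;
    begin
      sp := -step;                   { SUBI src.base,src.base,step }
      dp := -step;                   { SUBI dst.base,dst.base,step }
      repeat                         { lab: }
        dec(count);                  { SUBIC. countreg,countreg,1 }
        inc(sp, step);               { the "U" in LDU: base is updated }
        inc(dp, step);               { likewise for STDU }
        for i := 0 to step - 1 do    { one step-wide load + store }
          dst[dp + i] := src[sp + i];
      until count = 0;               { BC NE,lab }
    end;

    var
      a, b: array[0..15] of byte;
      i: longint;
    begin
      for i := 0 to 15 do a[i] := i;
      emitcopy(a, b, 2, 8);          { two 8-byte moves }
      writeln(b[15]);                { prints 15 }
    end.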
@@ -1874,7 +1907,7 @@ begin
   end;

 { for ppc64/linux emit correct code which sets up a stack frame and then calls the
-  external method normally to ensure that the GOT/TOC will be loaded correctly if
+  external method normally to ensure that the GOT/TOC will be loaded correctly if
   required.

   It's not really advantageous to use cg methods here because they are too specialized.
@@ -1952,7 +1985,7 @@ procedure tcgppc.a_load_store(list: TAsmList; op: tasmop; reg: tregister;
     A_LD, A_LDU, A_STD, A_STDU, A_LWA :
       if ((ref.offset mod 4) <> 0) then begin
         tmpreg := rg[R_INTREGISTER].getregister(list, R_SUBWHOLE);
-
+
         if (ref.base <> NR_NO) then begin
           a_op_const_reg_reg(list, OP_ADD, OS_ADDR, ref.offset mod 4, ref.base, tmpreg);
           ref.base := tmpreg
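LD/LDU/STD/STDU/LWA are DS-form instructions: the low two bits of their displacement field are hard-wired to zero, so only offsets divisible by 4 can be encoded. The fixup above therefore moves offset mod 4 into the base register with an ADDI through tmpreg; the remaining, 4-aligned part can then presumably stay in the displacement (the hunk is truncated here). The split itself, as a sketch with illustrative names:

    program dsform;
    var
      offset, regpart, dispart: longint;
    begin
      offset := 4099;
      regpart := offset mod 4;      { 3: folded into ref.base via tmpreg }
      dispart := offset - regpart;  { 4096: encodable DS displacement }
      writeln(regpart, ' ', dispart);
    end.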
|