{
    Copyright (c) 1998-2002, 2014 by Florian Klaempfl and Jonas Maebe

    Generate AArch64 assembler for math nodes

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

 ****************************************************************************
}
unit ncpumat;

{$i fpcdefs.inc}

interface

    uses
      node,nmat,ncgmat;

    type
      taarch64moddivnode = class(tmoddivnode)
        function pass_1: tnode; override;
        procedure pass_generate_code;override;
      end;

      taarch64notnode = class(tcgnotnode)
        procedure second_boolean;override;
      end;

      taarch64unaryminusnode = class(tcgunaryminusnode)
        function pass_1: tnode; override;
        procedure second_float; override;
      end;

implementation

    uses
      globtype,systems,constexp,
      cutils,verbose,globals,
      symconst,symdef,
      aasmbase,aasmcpu,aasmtai,aasmdata,
      defutil,
      cgbase,cgobj,hlcgobj,pass_2,procinfo,
      ncon,
      cpubase,
      ncgutil,cgcpu,cgutils;

{*****************************************************************************
                             taarch64moddivnode
*****************************************************************************}

    function taarch64moddivnode.pass_1: tnode;
      begin
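        { the generated code may call run-time helpers such as FPC_DIVBYZERO
          (used further down in this unit), so flag the current procedure as
          containing a call }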
        result:=inherited pass_1;
        if not assigned(result) then
          include(current_procinfo.flags,pi_do_call);
      end;

    procedure taarch64moddivnode.pass_generate_code;
      var
        op : tasmop;
        tmpreg,
        zeroreg,
        numerator,
        divider,
        largernumreg,
        largerresreg,
        resultreg : tregister;
        hl : tasmlabel;
        overflowloc: tlocation;
        power : longint;
        opsize : tcgsize;
        dividend : Int64;
        high_bit,
        reciprocal : QWord;
        { Just to save on stack space and the like }
        reciprocal_signed : Int64 absolute reciprocal;
        expandword,
        magic_add : Boolean;
        shift : byte;
        shifterop : tshifterop;
        hp : taicpu;

      procedure genOrdConstNodeDiv;
        var
          helper1, helper2: TRegister;
          so: tshifterop;
        begin
          if tordconstnode(right).value=0 then
            internalerror(2020021601)
          else if tordconstnode(right).value=1 then
            cg.a_load_reg_reg(current_asmdata.CurrAsmList, opsize, opsize, numerator, resultreg)
          else if (tordconstnode(right).value = int64(-1)) then
            begin
              // note: only in the signed case possible..., may overflow
              if cs_check_overflow in current_settings.localswitches then
                cg.a_reg_alloc(current_asmdata.CurrAsmList,NR_DEFAULTFLAGS);
              current_asmdata.CurrAsmList.concat(setoppostfix(taicpu.op_reg_reg(A_NEG,
                resultreg,numerator),toppostfix(ord(cs_check_overflow in current_settings.localswitches)*ord(PF_S))));
            end
          else if isabspowerof2(tordconstnode(right).value,power) then
            begin
              if (is_signed(right.resultdef)) then
                begin
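                  { Signed division by 2^power: derive a bias of (2^power - 1) from
                    the numerator's sign bits and add it first, so that the arithmetic
                    right shift below rounds the quotient towards zero rather than
                    towards negative infinity }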
                  helper2:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
                  if power = 1 then
                    helper1:=numerator
                  else
                    begin
                      helper1:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
                      cg.a_op_const_reg_reg(current_asmdata.CurrAsmList,OP_SAR,opsize,resultdef.size*8-1,numerator,helper1);
                    end;
                  shifterop_reset(so);
                  so.shiftmode:=SM_LSR;
                  so.shiftimm:=resultdef.size*8-power;
                  current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg_shifterop(A_ADD,helper2,numerator,helper1,so));
                  cg.a_op_const_reg_reg(current_asmdata.CurrAsmList,OP_SAR,def_cgsize(resultdef),power,helper2,resultreg);
                  if (tordconstnode(right).value < 0) then
                    { Invert the result }
                    current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_NEG,resultreg,resultreg));
                end
              else
                cg.a_op_const_reg_reg(current_asmdata.CurrAsmList,OP_SHR,opsize,power,numerator,resultreg)
            end
          else
            { Generic division }
            begin
              if is_signed(left.resultdef) then
                op:=A_SDIV
              else
                op:=A_UDIV;
              { If we didn't acquire the original divisor earlier, grab it now }
              if divider = NR_NO then
                begin
                  divider:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
                  cg.a_load_const_reg(current_asmdata.CurrAsmList,opsize,tordconstnode(right).value.svalue,divider);
                end;
              current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(op,resultreg,numerator,divider));
            end;
        end;

      procedure genOverflowCheck;
        begin
          { in case of overflow checking, also check for low(int64) div (-1)
            (no hardware support for this either) }
          if (cs_check_overflow in current_settings.localswitches) and
            is_signed(left.resultdef) and
            ((right.nodetype<>ordconstn) or
             (tordconstnode(right).value=-1)) then
            begin
              { num=ffff... and div=8000... <=>
                num xor not(div xor 8000...) = 0
                (and we have the "eon" operation, which performs "xor not(...)" }
              tmpreg:=hlcg.getintregister(current_asmdata.CurrAsmList,left.resultdef);
              hlcg.a_op_const_reg_reg(current_asmdata.CurrAsmList,OP_XOR,left.resultdef,low(int64),numerator,tmpreg);
              current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(A_EON,
                tmpreg,numerator,tmpreg));
              current_asmdata.CurrAsmList.concat(taicpu.op_reg_const(A_CMP,tmpreg,0));
              { now the zero/equal flag is set in case we divided low(int64) by
                (-1) }
              location_reset(overflowloc,LOC_FLAGS,OS_NO);
              overflowloc.resflags:=F_EQ;
              cg.g_overflowcheck_loc(current_asmdata.CurrAsmList,location,resultdef,overflowloc);
            end;
        end;

      begin
        secondpass(left);
        secondpass(right);

        { avoid warning }
        divider := NR_NO;
        largernumreg := NR_NO;
        expandword := False;

        opsize := def_cgsize(resultdef);

        { set result location }
        location_reset(location,LOC_REGISTER,opsize);
        location.register:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
        resultreg:=location.register;

        { put numerator in register }
        hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,left.resultdef,true);
        numerator:=left.location.register;

        if (right.nodetype=ordconstn) then
          begin
            { If optimising for size, just use regular division operations }
            if (cs_opt_size in current_settings.optimizerswitches) or
              ((tordconstnode(right).value=1) or
              (tordconstnode(right).value=int64(-1)) or
              isabspowerof2(tordconstnode(right).value,power)) then
              begin
                { Store divisor for later (and executed at the same time as the multiplication) }
                if (nodetype=modn) then
                  begin
                    if (tordconstnode(right).value = 1) or (tordconstnode(right).value = int64(-1)) then
                      begin
                        { Just evaluates to zero }
                        current_asmdata.CurrAsmList.concat(taicpu.op_reg_const(A_MOVZ,resultreg, 0));
                        Exit;
                      end
                    { "not cs_opt_size" saves from checking the value of the divisor again
                      (if cs_opt_size is not set, then the divisor is a power of 2) }
                    else if not (cs_opt_size in current_settings.optimizerswitches) then
                      begin
                        divider:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
                        cg.a_load_const_reg(current_asmdata.CurrAsmList,opsize,tordconstnode(right).value.svalue,divider);
                      end
                  end;

                genOrdConstNodeDiv;
                genOverflowCheck;

                { in case of modulo, multiply result again by the divider and subtract
                  from the numerator }
                if (nodetype=modn) then
                  begin
                    if ispowerof2(tordconstnode(right).value,power) then
                      begin
                        shifterop.shiftmode := SM_LSL;
                        shifterop.shiftimm := power;
                        current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg_shifterop(A_SUB,resultreg,numerator,resultreg,shifterop));
                      end
                    else
                      current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg_reg(A_MSUB,resultreg,
                        resultreg,divider,numerator));
                  end;

                Exit;
              end
            else
              begin
                if is_signed(left.resultdef) then
                  begin
                    if (nodetype=modn) then { Signed mod doesn't work properly }
                      begin
                        divider:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
                        cg.a_load_const_reg(current_asmdata.CurrAsmList,opsize,tordconstnode(right).value.svalue,divider);
                        genOrdConstNodeDiv;
                      end
                    else
                      begin
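                        { Constant signed division: multiply by a precomputed
                          fixed-point reciprocal ("magic number") and shift, instead
                          of issuing SDIV; for example, a signed 32-bit division by 3
                          becomes a multiplication by $55555556 followed by the sign
                          correction below }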
                        { Read signed value to avoid Internal Error 200706094 }
                        dividend := tordconstnode(right).value.svalue;

                        calc_divconst_magic_signed(resultdef.size * 8, dividend, reciprocal_signed, shift);
                        cg.a_load_const_reg(current_asmdata.CurrAsmList, opsize, reciprocal_signed, resultreg);

                        { SMULH is only available for the full 64-bit registers }
                        if opsize in [OS_64, OS_S64] then
                          begin
                            current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(A_SMULH,resultreg,resultreg,numerator));
                            largerresreg := resultreg;
                          end
                        else
                          begin
                            largerresreg := newreg(getregtype(resultreg), getsupreg(resultreg), R_SUBWHOLE);
                            largernumreg := newreg(getregtype(numerator), getsupreg(numerator), R_SUBWHOLE);
                            current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(A_MUL,largerresreg,largerresreg,largernumreg));
                            expandword := True; { Merge the shift operation with something below }
                          end;

                        { Store divisor for later (and executed at the same time as the multiplication) }
                        if nodetype=modn then
                          begin
                            divider:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
                            cg.a_load_const_reg(current_asmdata.CurrAsmList,opsize,dividend,divider);
                          end;

                        { add or subtract dividend }
                        if (dividend > 0) and (reciprocal_signed < 0) then
                          begin
                            if expandword then
                              begin
                                shifterop.shiftmode := SM_ASR;
                                shifterop.shiftimm := 32;
                                expandword := False;
                                current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg_shifterop(A_ADD,largerresreg,largernumreg,largerresreg,shifterop));
                              end
                            else
                              current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(A_ADD,resultreg,resultreg,numerator));
                          end
                        else if (dividend < 0) and (reciprocal_signed > 0) then
                          begin
                            if expandword then
                              begin
                                { We can't append LSR to the SUB below because it's on the wrong operand }
                                expandword := False;
                                current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_const(A_ASR,largerresreg,largerresreg,32));
                              end;
                            current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(A_SUB,resultreg,resultreg,numerator));
                          end
                        else if expandword then
                          Inc(shift,32);

                        { shift if necessary }
                        if (shift <> 0) then
                          begin
                            if expandword then
                              current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_const(A_ASR,largerresreg,largerresreg,shift))
                            else
                              current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_const(A_ASR,resultreg,resultreg,shift));
                          end;

                        { extract and add the sign bit }
                        shifterop.shiftmode := SM_LSR;
                        shifterop.shiftimm := left.resultdef.size*8 - 1;
                        if (dividend < 0) then
                          current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg_shifterop(A_ADD,resultreg,resultreg,resultreg,shifterop))
                        else
                          current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg_shifterop(A_ADD,resultreg,resultreg,numerator,shifterop));
                      end;
                  end
                else
                  begin
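                    { Constant unsigned division: multiply by a precomputed reciprocal
                      and shift right; when the reciprocal needs one bit more than the
                      word size, magic_add is set and an extra add with explicit carry
                      handling is generated below }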
                    calc_divconst_magic_unsigned(resultdef.size * 8, tordconstnode(right).value, reciprocal, magic_add, shift);

                    { Add explicit typecast to tcgint type, to avoid range or overflow check }
                    cg.a_load_const_reg(current_asmdata.CurrAsmList, opsize, tcgint(reciprocal), resultreg);

                    { UMULH is only available for the full 64-bit registers }
                    if opsize in [OS_64, OS_S64] then
                      begin
                        current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(A_UMULH,resultreg,resultreg,numerator));
                        largerresreg := resultreg;
                      end
                    else
                      begin
                        largerresreg := newreg(getregtype(resultreg), getsupreg(resultreg), R_SUBWHOLE);
                        largernumreg := newreg(getregtype(numerator), getsupreg(numerator), R_SUBWHOLE);
                        current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(A_MUL,largerresreg,largerresreg,largernumreg));
                        expandword := True; { Try to merge the shift operation with something below }
                      end;

                    { Store divisor for later (and executed at the same time as the multiplication) }
                    if (nodetype=modn) then
                      begin
                        divider:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
                        cg.a_load_const_reg(current_asmdata.CurrAsmList,opsize,tordconstnode(right).value.svalue,divider);
                      end;

                    if magic_add then
                      begin
                        { We can't append LSR to the ADD below because it would require extending the registers
                          and interfere with the carry bit }
                        if expandword then
                          current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_const(A_LSR,largerresreg,largerresreg,32));

                        { Add the reciprocal to the high-order word, tracking the carry bit, shift, then
                          insert the carry bit via CSEL and ORR }
                        if opsize in [OS_64,OS_S64] then
                          zeroreg := NR_XZR
                        else
                          zeroreg := NR_WZR;

                        high_bit := QWord(1) shl ((resultdef.size * 8) - shift);

                        tmpreg := cg.getintregister(current_asmdata.CurrAsmList, opsize);
                        cg.a_load_const_reg(current_asmdata.CurrAsmList, opsize, high_bit, tmpreg);

                        { Generate ADDS instruction }
                        hp := taicpu.op_reg_reg_reg(A_ADD,resultreg,resultreg,numerator);
                        hp.oppostfix := PF_S;
                        current_asmdata.CurrAsmList.concat(hp);

                        current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg_cond(A_CSEL,tmpreg,tmpreg,zeroreg, C_CS));

                        shifterop.shiftmode := SM_LSR;
                        shifterop.shiftimm := shift;
                        current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg_shifterop(A_ORR,resultreg,tmpreg,resultreg,shifterop));
                      end
                    else if expandword then
                      { Include the right-shift by 32 to get the high-order DWord }
                      current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_const(A_LSR,largerresreg,largerresreg,shift + 32))
                    else
                      current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_const(A_LSR,resultreg,resultreg,shift));
                  end;
              end;
          end
        { no divide-by-zero detection available in hardware, emulate (if it's a
          constant, this will have been detected earlier already) }
        else
          begin
            { load divider in a register }
            hlcg.location_force_reg(current_asmdata.CurrAsmList,right.location,right.resultdef,right.resultdef,true);
            divider:=right.location.register;

            { ARM-64 developer guides recommend checking for division by zero conditions
              AFTER the division, since the check and the division can be done in tandem }
            if is_signed(left.resultdef) then
              op:=A_SDIV
            else
              op:=A_UDIV;

            current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(op,resultreg,numerator,divider));

            current_asmdata.CurrAsmList.concat(taicpu.op_reg_const(A_CMP,divider,0));
            current_asmdata.getjumplabel(hl);
            current_asmdata.CurrAsmList.concat(taicpu.op_cond_sym(A_B,C_NE,hl));
            cg.a_call_name(current_asmdata.CurrAsmList,'FPC_DIVBYZERO',false);
            cg.a_label(current_asmdata.CurrAsmList,hl);
          end;

        genOverflowCheck;

        { in case of modulo, multiply result again by the divider and subtract
          from the numerator }
        if (nodetype=modn) then
          begin
            { If we didn't acquire the original divisor earlier, grab it now }
            if divider = NR_NO then
              begin
                divider:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
                cg.a_load_const_reg(current_asmdata.CurrAsmList,opsize,tordconstnode(right).value.svalue,divider);
              end;
            current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg_reg(A_MSUB,resultreg,
              resultreg,divider,numerator));
          end;
      end;

{*****************************************************************************
                               taarch64notnode
*****************************************************************************}

    procedure taarch64notnode.second_boolean;
      begin
        secondpass(left);
        if not handle_locjump then
          begin
            case left.location.loc of
              LOC_FLAGS :
                begin
                  location_copy(location,left.location);
                  inverse_flags(location.resflags);
                end;
              LOC_REGISTER, LOC_CREGISTER,
              LOC_REFERENCE, LOC_CREFERENCE,
              LOC_SUBSETREG, LOC_CSUBSETREG,
              LOC_SUBSETREF, LOC_CSUBSETREF:
                begin
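                  { materialise the value in a register and compare it against zero;
                    the negated boolean is then simply the EQ flag }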
                  hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,left.resultdef,true);
                  current_asmdata.CurrAsmList.concat(taicpu.op_reg_const(A_CMP,
                    left.location.register,0));
                  location_reset(location,LOC_FLAGS,OS_NO);
                  location.resflags:=F_EQ;
                end;
              else
                internalerror(2003042401);
            end;
          end;
      end;

{*****************************************************************************
                           taarch64unaryminusnode
*****************************************************************************}

    function taarch64unaryminusnode.pass_1: tnode;
      begin
        Result:=inherited pass_1;
        if Result=nil then
          if needs_check_for_fpu_exceptions then
            Include(current_procinfo.flags,pi_do_call);
      end;


    procedure taarch64unaryminusnode.second_float;
      begin
        secondpass(left);
        hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,true);

        location_reset(location,LOC_MMREGISTER,def_cgsize(resultdef));
        location.register:=cg.getmmregister(current_asmdata.CurrAsmList,location.size);

        current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_FNEG,location.register,left.location.register));
        cg.maybe_check_for_fpu_exception(current_asmdata.CurrAsmList);
      end;

begin
  cmoddivnode:=taarch64moddivnode;
  cnotnode:=taarch64notnode;
  cunaryminusnode:=taarch64unaryminusnode;
end.