{ ncpumat.pas }
  1. {
  2. Copyright (c) 1998-2002, 2014 by Florian Klaempfl and Jonas Maebe
  3. Generate AArch64 assembler for math nodes
  4. This program is free software; you can redistribute it and/or modify
  5. it under the terms of the GNU General Public License as published by
  6. the Free Software Foundation; either version 2 of the License, or
  7. (at your option) any later version.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. GNU General Public License for more details.
  12. You should have received a copy of the GNU General Public License
  13. along with this program; if not, write to the Free Software
  14. Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  15. ****************************************************************************
  16. }
  17. unit ncpumat;
  18. {$i fpcdefs.inc}
  19. interface
  20. uses
  21. node,nmat,ncgmat;
type
  { div/mod code generation for AArch64: hardware SDIV/UDIV, plus
    strength-reduced sequences for constant divisors }
  taarch64moddivnode = class(tmoddivnode)
    function pass_1: tnode; override;
    procedure pass_generate_code;override;
  end;

  { boolean "not" for AArch64: works on flag locations or via CMP #0 }
  taarch64notnode = class(tcgnotnode)
    procedure second_boolean;override;
  end;

  { floating-point negation for AArch64 via the FNEG instruction }
  taarch64unaryminusnode = class(tcgunaryminusnode)
    function pass_1: tnode; override;
    procedure second_float; override;
  end;
  34. implementation
  35. uses
  36. globtype,systems,constexp,
  37. cutils,verbose,globals,
  38. symconst,symdef,
  39. aasmbase,aasmcpu,aasmtai,aasmdata,
  40. defutil,
  41. cgbase,cgobj,hlcgobj,pass_2,procinfo,
  42. ncon,
  43. cpubase,
  44. ncgutil,cgcpu,cgutils;
  45. {*****************************************************************************
  46. taarch64moddivnode
  47. *****************************************************************************}
  48. function taarch64moddivnode.pass_1: tnode;
  49. begin
  50. result:=inherited pass_1;
  51. if not assigned(result) then
  52. include(current_procinfo.flags,pi_do_call);
  53. end;
{ Generates AArch64 code for "div"/"mod" nodes.

  Strategy, as implemented below:
  - constant right operand, small/special values (+/-1, powers of two) or
    -Os: shift/negate sequences via genOrdConstNodeDiv;
  - other constant right operands: reciprocal ("magic number")
    multiplication computed by calc_divconst_magic_signed/unsigned,
    using SMULH/UMULH for 64-bit operands and a widened 64-bit MUL for
    32-bit operands (the "expandword" path, which folds the >>32 into a
    later shift where possible);
  - non-constant right operand: hardware SDIV/UDIV followed by an
    explicit divide-by-zero check that branches to FPC_DIVBYZERO
    (AArch64 division does not trap on zero);
  - "mod" is finished in all cases with MSUB (or a shifted SUB for
    powers of two): numerator - quotient*divisor. }
procedure taarch64moddivnode.pass_generate_code;
  var
    op : tasmop;
    tmpreg,
    zeroreg,
    numerator,
    divider,
    largernumreg,      { 64-bit (R_SUBWHOLE) alias of numerator, expandword path }
    largerresreg,      { 64-bit (R_SUBWHOLE) alias of resultreg, expandword path }
    resultreg : tregister;
    hl : tasmlabel;
    overflowloc: tlocation;
    power : longint;
    opsize : tcgsize;
    dividend : Int64;
    high_bit,
    reciprocal : QWord;
    { Just to save on stack space and the like }
    reciprocal_signed : Int64 absolute reciprocal;
    expandword,        { true when a 32-bit multiply was widened to 64 bit and
                         the >>32 still has to be merged into a later shift }
    magic_add : Boolean;
    shift : byte;
    shifterop : tshifterop;
    hp : taicpu;

  { Emits "resultreg := numerator div constant-right":
    0 -> internal error (callers must have rejected it),
    1 -> plain move, -1 -> NEG (with flags if overflow checking),
    +/-2^n -> arithmetic/logical shift sequence,
    anything else -> SDIV/UDIV against "divider" (loaded here when the
    caller did not preload it). }
  procedure genOrdConstNodeDiv;
    var
      helper1, helper2: TRegister;
      so: tshifterop;
    begin
      if tordconstnode(right).value=0 then
        internalerror(2020021601)
      else if tordconstnode(right).value=1 then
        cg.a_load_reg_reg(current_asmdata.CurrAsmList, opsize, opsize, numerator, resultreg)
      else if (tordconstnode(right).value = int64(-1)) then
        begin
          // note: only in the signed case possible..., may overflow
          if cs_check_overflow in current_settings.localswitches then
            cg.a_reg_alloc(current_asmdata.CurrAsmList,NR_DEFAULTFLAGS);
          { NEGS when overflow checking is on (ord(true)*ord(PF_S)=PF_S),
            plain NEG otherwise }
          current_asmdata.CurrAsmList.concat(setoppostfix(taicpu.op_reg_reg(A_NEG,
            resultreg,numerator),toppostfix(ord(cs_check_overflow in current_settings.localswitches)*ord(PF_S))));
        end
      else if isabspowerof2(tordconstnode(right).value,power) then
        begin
          if (is_signed(right.resultdef)) then
            begin
              { signed 2^n division: bias negative numerators by (2^n)-1
                before shifting so the result rounds towards zero }
              helper2:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
              if power = 1 then
                helper1:=numerator
              else
                begin
                  helper1:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
                  { helper1 := numerator >>> (bits-1), i.e. all-ones when
                    negative, zero when positive }
                  cg.a_op_const_reg_reg(current_asmdata.CurrAsmList,OP_SAR,opsize,resultdef.size*8-1,numerator,helper1);
                end;
              shifterop_reset(so);
              so.shiftmode:=SM_LSR;
              so.shiftimm:=resultdef.size*8-power;
              { helper2 := numerator + (helper1 lsr (bits-power)) = biased numerator }
              current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg_shifterop(A_ADD,helper2,numerator,helper1,so));
              cg.a_op_const_reg_reg(current_asmdata.CurrAsmList,OP_SAR,def_cgsize(resultdef),power,helper2,resultreg);
              if (tordconstnode(right).value < 0) then
                { Invert the result }
                current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_NEG,resultreg,resultreg));
            end
          else
            { unsigned 2^n division is just a logical right shift }
            cg.a_op_const_reg_reg(current_asmdata.CurrAsmList,OP_SHR,opsize,power,numerator,resultreg)
        end
      else
        { Generic division }
        begin
          if is_signed(left.resultdef) then
            op:=A_SDIV
          else
            op:=A_UDIV;
          { If we didn't acquire the original divisor earlier, grab it now }
          if divider = NR_NO then
            begin
              divider:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
              cg.a_load_const_reg(current_asmdata.CurrAsmList,opsize,tordconstnode(right).value.svalue,divider);
            end;
          current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(op,resultreg,numerator,divider));
        end;
    end;

  { Emits the overflow check for low(int64) div (-1), the one signed case
    the hardware does not flag; only active with overflow checking on and
    a signed left operand whose divisor may be -1. }
  procedure genOverflowCheck;
    begin
      { in case of overflow checking, also check for low(int64) div (-1)
        (no hardware support for this either) }
      if (cs_check_overflow in current_settings.localswitches) and
         is_signed(left.resultdef) and
         ((right.nodetype<>ordconstn) or
          (tordconstnode(right).value=-1)) then
        begin
          { num=ffff... and div=8000... <=>
            num xor not(div xor 8000...) = 0
            (and we have the "eon" operation, which performs "xor not(...)" }
          tmpreg:=hlcg.getintregister(current_asmdata.CurrAsmList,left.resultdef);
          hlcg.a_op_const_reg_reg(current_asmdata.CurrAsmList,OP_XOR,left.resultdef,low(int64),numerator,tmpreg);
          current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(A_EON,
            tmpreg,numerator,tmpreg));
          current_asmdata.CurrAsmList.concat(taicpu.op_reg_const(A_CMP,tmpreg,0));
          { now the zero/equal flag is set in case we divided low(int64) by
            (-1) }
          location_reset(overflowloc,LOC_FLAGS,OS_NO);
          overflowloc.resflags:=F_EQ;
          cg.g_overflowcheck_loc(current_asmdata.CurrAsmList,location,resultdef,overflowloc);
        end;
    end;

  begin
    secondpass(left);
    secondpass(right);
    { avoid warning }
    divider := NR_NO;
    largernumreg := NR_NO;
    expandword := False;
    opsize := def_cgsize(resultdef);
    { set result location }
    location_reset(location,LOC_REGISTER,opsize);
    location.register:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
    resultreg:=location.register;
    { put numerator in register }
    hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,left.resultdef,true);
    numerator:=left.location.register;
    if (right.nodetype=ordconstn) then
      begin
        { If optimising for size, just use regular division operations }
        if (cs_opt_size in current_settings.optimizerswitches) or
           ((tordconstnode(right).value=1) or
            (tordconstnode(right).value=int64(-1)) or
            isabspowerof2(tordconstnode(right).value,power)) then
          begin
            { Store divisor for later (and executed at the same time as the multiplication) }
            if (nodetype=modn) then
              begin
                if (tordconstnode(right).value = 1) or (tordconstnode(right).value = int64(-1)) then
                  begin
                    { Just evaluates to zero }
                    current_asmdata.CurrAsmList.concat(taicpu.op_reg_const(A_MOVZ,resultreg, 0));
                    Exit;
                  end
                { "not cs_opt_size" saves from checking the value of the divisor again
                  (if cs_opt_size is not set, then the divisor is a power of 2) }
                else if not (cs_opt_size in current_settings.optimizerswitches) then
                  begin
                    divider:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
                    cg.a_load_const_reg(current_asmdata.CurrAsmList,opsize,tordconstnode(right).value.svalue,divider);
                  end
              end;
            genOrdConstNodeDiv;
            genOverflowCheck;
            { in case of modulo, multiply result again by the divider and subtract
              from the numerator }
            if (nodetype=modn) then
              begin
                if ispowerof2(tordconstnode(right).value,power) then
                  begin
                    { numerator - (quotient shl power), one SUB with shifted operand }
                    shifterop.shiftmode := SM_LSL;
                    shifterop.shiftimm := power;
                    current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg_shifterop(A_SUB,resultreg,numerator,resultreg,shifterop));
                  end
                else
                  current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg_reg(A_MSUB,resultreg,
                    resultreg,divider,numerator));
              end;
            Exit;
          end
        else
          begin
            if is_signed(left.resultdef) then
              begin
                if (nodetype=modn) then { Signed mod doesn't work properly }
                  begin
                    divider:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
                    cg.a_load_const_reg(current_asmdata.CurrAsmList,opsize,tordconstnode(right).value.svalue,divider);
                    genOrdConstNodeDiv;
                  end
                else
                  begin
                    { signed magic-number division (see calc_divconst_magic_signed):
                      multiply by the reciprocal, take the high word, correct by
                      adding/subtracting the numerator when the signs of divisor
                      and reciprocal differ, shift, then add back the sign bit }
                    { Read signed value to avoid Internal Error 200706094 }
                    dividend := tordconstnode(right).value.svalue;
                    calc_divconst_magic_signed(resultdef.size * 8, dividend, reciprocal_signed, shift);
                    cg.a_load_const_reg(current_asmdata.CurrAsmList, opsize, reciprocal_signed, resultreg);
                    { SMULH is only available for the full 64-bit registers }
                    if opsize in [OS_64, OS_S64] then
                      begin
                        current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(A_SMULH,resultreg,resultreg,numerator));
                        largerresreg := resultreg;
                      end
                    else
                      begin
                        { 32-bit case: do a full 64-bit MUL on the X aliases of the
                          same super-registers; the high dword is extracted later }
                        largerresreg := newreg(getregtype(resultreg), getsupreg(resultreg), R_SUBWHOLE);
                        largernumreg := newreg(getregtype(numerator), getsupreg(numerator), R_SUBWHOLE);
                        current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(A_MUL,largerresreg,largerresreg,largernumreg));
                        expandword := True; { Merge the shift operation with something below }
                      end;
                    { Store divisor for later (and executed at the same time as the multiplication) }
                    if nodetype=modn then
                      begin
                        divider:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
                        cg.a_load_const_reg(current_asmdata.CurrAsmList,opsize,dividend,divider);
                      end;
                    { add or subtract dividend }
                    if (dividend > 0) and (reciprocal_signed < 0) then
                      begin
                        if expandword then
                          begin
                            { fold the pending >>32 into this ADD's shifted operand }
                            shifterop.shiftmode := SM_ASR;
                            shifterop.shiftimm := 32;
                            expandword := False;
                            current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg_shifterop(A_ADD,largerresreg,largernumreg,largerresreg,shifterop));
                          end
                        else
                          current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(A_ADD,resultreg,resultreg,numerator));
                      end
                    else if (dividend < 0) and (reciprocal_signed > 0) then
                      begin
                        if expandword then
                          begin
                            { We can't append LSR to the SUB below because it's on the wrong operand }
                            expandword := False;
                            current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_const(A_ASR,largerresreg,largerresreg,32));
                          end;
                        current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(A_SUB,resultreg,resultreg,numerator));
                      end
                    else if expandword then
                      { no correction needed: merge the >>32 into the final shift }
                      Inc(shift,32);
                    { shift if necessary }
                    if (shift <> 0) then
                      begin
                        if expandword then
                          current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_const(A_ASR,largerresreg,largerresreg,shift))
                        else
                          current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_const(A_ASR,resultreg,resultreg,shift));
                      end;
                    { extract and add the sign bit }
                    shifterop.shiftmode := SM_LSR;
                    shifterop.shiftimm := left.resultdef.size*8 - 1;
                    { the sign bit is taken from the intermediate result for
                      negative divisors and from the numerator otherwise }
                    if (dividend < 0) then
                      current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg_shifterop(A_ADD,resultreg,resultreg,resultreg,shifterop))
                    else
                      current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg_shifterop(A_ADD,resultreg,resultreg,numerator,shifterop));
                  end;
              end
            else
              begin
                { unsigned magic-number division (see calc_divconst_magic_unsigned):
                  multiply by the reciprocal, take the high word, and if magic_add
                  is set perform the overflow-avoiding "add then shift with carry"
                  sequence using CSEL/ORR }
                calc_divconst_magic_unsigned(resultdef.size * 8, tordconstnode(right).value, reciprocal, magic_add, shift);
                { Add explicit typecast to tcgint type, to avoid range or overflow check }
                cg.a_load_const_reg(current_asmdata.CurrAsmList, opsize, tcgint(reciprocal), resultreg);
                { UMULH is only available for the full 64-bit registers }
                if opsize in [OS_64, OS_S64] then
                  begin
                    current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(A_UMULH,resultreg,resultreg,numerator));
                    largerresreg := resultreg;
                  end
                else
                  begin
                    largerresreg := newreg(getregtype(resultreg), getsupreg(resultreg), R_SUBWHOLE);
                    largernumreg := newreg(getregtype(numerator), getsupreg(numerator), R_SUBWHOLE);
                    current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(A_MUL,largerresreg,largerresreg,largernumreg));
                    expandword := True; { Try to merge the shift operation with something below }
                  end;
                { Store divisor for later (and executed at the same time as the multiplication) }
                if (nodetype=modn) then
                  begin
                    divider:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
                    cg.a_load_const_reg(current_asmdata.CurrAsmList,opsize,tordconstnode(right).value.svalue,divider);
                  end;
                if magic_add then
                  begin
                    { We can't append LSR to the ADD below because it would require extending the registers
                      and interfere with the carry bit }
                    if expandword then
                      current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_const(A_LSR,largerresreg,largerresreg,32));
                    { Add the reciprocal to the high-order word, tracking the carry bit, shift, then
                      insert the carry bit via CSEL and ORR }
                    if opsize in [OS_64,OS_S64] then
                      zeroreg := NR_XZR
                    else
                      zeroreg := NR_WZR;
                    high_bit := QWord(1) shl ((resultdef.size * 8) - shift);
                    tmpreg := cg.getintregister(current_asmdata.CurrAsmList, opsize);
                    cg.a_load_const_reg(current_asmdata.CurrAsmList, opsize, high_bit, tmpreg);
                    { Generate ADDS instruction }
                    hp := taicpu.op_reg_reg_reg(A_ADD,resultreg,resultreg,numerator);
                    hp.oppostfix := PF_S;
                    current_asmdata.CurrAsmList.concat(hp);
                    { tmpreg := carry set ? high_bit : 0 }
                    current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg_cond(A_CSEL,tmpreg,tmpreg,zeroreg, C_CS));
                    shifterop.shiftmode := SM_LSR;
                    shifterop.shiftimm := shift;
                    current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg_shifterop(A_ORR,resultreg,tmpreg,resultreg,shifterop));
                  end
                else if expandword then
                  { Include the right-shift by 32 to get the high-order DWord }
                  current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_const(A_LSR,largerresreg,largerresreg,shift + 32))
                else
                  current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_const(A_LSR,resultreg,resultreg,shift));
              end;
          end;
      end
    { no divide-by-zero detection available in hardware, emulate (if it's a
      constant, this will have been detected earlier already) }
    else
      begin
        { load divider in a register }
        hlcg.location_force_reg(current_asmdata.CurrAsmList,right.location,right.resultdef,right.resultdef,true);
        divider:=right.location.register;
        { ARM-64 developer guides recommend checking for division by zero conditions
          AFTER the division, since the check and the division can be done in tandem }
        if is_signed(left.resultdef) then
          op:=A_SDIV
        else
          op:=A_UDIV;
        current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(op,resultreg,numerator,divider));
        current_asmdata.CurrAsmList.concat(taicpu.op_reg_const(A_CMP,divider,0));
        current_asmdata.getjumplabel(hl);
        current_asmdata.CurrAsmList.concat(taicpu.op_cond_sym(A_B,C_NE,hl));
        cg.a_call_name(current_asmdata.CurrAsmList,'FPC_DIVBYZERO',false);
        cg.a_label(current_asmdata.CurrAsmList,hl);
      end;
    genOverflowCheck;
    { in case of modulo, multiply result again by the divider and subtract
      from the numerator }
    if (nodetype=modn) then
      begin
        { If we didn't acquire the original divisor earlier, grab it now }
        if divider = NR_NO then
          begin
            divider:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
            cg.a_load_const_reg(current_asmdata.CurrAsmList,opsize,tordconstnode(right).value.svalue,divider);
          end;
        current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg_reg(A_MSUB,resultreg,
          resultreg,divider,numerator));
      end;
  end;
  385. {*****************************************************************************
  386. taarch64notnode
  387. *****************************************************************************}
  388. procedure taarch64notnode.second_boolean;
  389. begin
  390. secondpass(left);
  391. if not handle_locjump then
  392. begin
  393. case left.location.loc of
  394. LOC_FLAGS :
  395. begin
  396. location_copy(location,left.location);
  397. inverse_flags(location.resflags);
  398. end;
  399. LOC_REGISTER, LOC_CREGISTER,
  400. LOC_REFERENCE, LOC_CREFERENCE,
  401. LOC_SUBSETREG, LOC_CSUBSETREG,
  402. LOC_SUBSETREF, LOC_CSUBSETREF:
  403. begin
  404. hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,left.resultdef,true);
  405. current_asmdata.CurrAsmList.concat(taicpu.op_reg_const(A_CMP,
  406. left.location.register,0));
  407. location_reset(location,LOC_FLAGS,OS_NO);
  408. location.resflags:=F_EQ;
  409. end;
  410. else
  411. internalerror(2003042401);
  412. end;
  413. end;
  414. end;
  415. {*****************************************************************************
  416. taarch64unaryminusnode
  417. *****************************************************************************}
  418. function taarch64unaryminusnode.pass_1: tnode;
  419. begin
  420. Result:=inherited pass_1;
  421. if Result=nil then
  422. if needs_check_for_fpu_exceptions then
  423. Include(current_procinfo.flags,pi_do_call);
  424. end;
  425. procedure taarch64unaryminusnode.second_float;
  426. begin
  427. secondpass(left);
  428. hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,true);
  429. location_reset(location,LOC_MMREGISTER,def_cgsize(resultdef));
  430. location.register:=cg.getmmregister(current_asmdata.CurrAsmList,location.size);
  431. current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_FNEG,location.register,left.location.register));
  432. cg.maybe_check_for_fpu_exception(current_asmdata.CurrAsmList);
  433. end;
begin
  { register the AArch64-specific node classes so the compiler creates
    these instead of the generic/portable implementations }
  cmoddivnode:=taarch64moddivnode;
  cnotnode:=taarch64notnode;
  cunaryminusnode:=taarch64unaryminusnode;
end.