{
    Copyright (c) 1998-2002, 2014 by Florian Klaempfl and Jonas Maebe

    Generate AArch64 assembler for math nodes

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

 ****************************************************************************
}
unit ncpumat;

{$i fpcdefs.inc}

interface

    uses
      node,nmat,ncgmat;

    type
      { div/mod node: emits hardware SDIV/UDIV, shift- and
        "magic reciprocal"-based sequences for constant divisors, and an
        explicit run-time divide-by-zero check (no hardware trap on AArch64) }
      taarch64moddivnode = class(tmoddivnode)
         function pass_1: tnode; override;
         procedure pass_generate_code;override;
      end;

      { boolean "not": result is delivered in the condition flags }
      taarch64notnode = class(tcgnotnode)
        procedure second_boolean;override;
      end;

      { floating-point negation via the FNEG instruction }
      taarch64unaryminusnode = class(tcgunaryminusnode)
        procedure second_float; override;
      end;

implementation

    uses
      globtype,systems,constexp,
      cutils,verbose,globals,
      symconst,symdef,
      aasmbase,aasmcpu,aasmtai,aasmdata,
      defutil,
      cgbase,cgobj,hlcgobj,pass_2,procinfo,
      ncon,
      cpubase,
      ncgutil,cgcpu,cgutils;
  44. {*****************************************************************************
  45. taarch64moddivnode
  46. *****************************************************************************}
  47. function taarch64moddivnode.pass_1: tnode;
  48. begin
  49. result:=inherited pass_1;
  50. if not assigned(result) then
  51. include(current_procinfo.flags,pi_do_call);
  52. end;
    { Generates AArch64 code for div/mod. Strategy overview:
      - constant divisor of 0 is an internal error; 1/-1 and powers of two are
        handled with moves, negation and shifts;
      - other constant divisors (when not optimising for size) are turned into
        a multiplication by a precomputed "magic" reciprocal plus shifts
        (calc_divconst_magic_signed/_unsigned) to avoid the slow divide unit;
      - variable divisors use SDIV/UDIV followed by an explicit compare/branch
        to FPC_DIVBYZERO, since AArch64 division does not trap on zero;
      - for modn, the remainder is reconstructed with MSUB (or a shifted SUB
        for powers of two): num - (num div d) * d. }
    procedure taarch64moddivnode.pass_generate_code;
      var
        op : tasmop;
        tmpreg,
        zeroreg,
        numerator,
        divider,
        largernumreg,
        largerresreg,
        resultreg : tregister;
        hl : tasmlabel;
        overflowloc: tlocation;
        power : longint;
        opsize : tcgsize;
        dividend : Int64;
        high_bit,
        reciprocal : QWord;
        { Just to save on stack space and the like }
        reciprocal_signed : Int64 absolute reciprocal;
        expandword,
        magic_add : Boolean;
        shift : byte;
        shifterop : tshifterop;
        hp : taicpu;

      { Emits the quotient computation for a constant divisor (value of the
        right operand). Uses the outer numerator/resultreg/divider/opsize
        state; may allocate and load "divider" if it is still NR_NO. }
      procedure genOrdConstNodeDiv;
        var
          helper1, helper2: TRegister;
          so: tshifterop;
        begin
          if tordconstnode(right).value=0 then
            internalerror(2020021601)
          else if tordconstnode(right).value=1 then
            { x div 1 = x }
            cg.a_load_reg_reg(current_asmdata.CurrAsmList, opsize, opsize, numerator, resultreg)
          else if (tordconstnode(right).value = int64(-1)) then
            begin
              // note: only in the signed case possible..., may overflow
              if cs_check_overflow in current_settings.localswitches then
                cg.a_reg_alloc(current_asmdata.CurrAsmList,NR_DEFAULTFLAGS);
              { NEG, with the S suffix appended when overflow checking is on }
              current_asmdata.CurrAsmList.concat(setoppostfix(taicpu.op_reg_reg(A_NEG,
                resultreg,numerator),toppostfix(ord(cs_check_overflow in current_settings.localswitches)*ord(PF_S))));
            end
          else if isabspowerof2(tordconstnode(right).value,power) then
            begin
              if (is_signed(right.resultdef)) then
                begin
                  { Signed division by 2^power: bias the numerator by
                    (2^power - 1) when it is negative (taken from the
                    replicated sign bits shifted right), then shift
                    arithmetically. }
                  helper2:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
                  if power = 1 then
                    helper1:=numerator
                  else
                    begin
                      helper1:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
                      cg.a_op_const_reg_reg(current_asmdata.CurrAsmList,OP_SAR,opsize,resultdef.size*8-1,numerator,helper1);
                    end;
                  shifterop_reset(so);
                  so.shiftmode:=SM_LSR;
                  so.shiftimm:=resultdef.size*8-power;
                  current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg_shifterop(A_ADD,helper2,numerator,helper1,so));
                  cg.a_op_const_reg_reg(current_asmdata.CurrAsmList,OP_SAR,def_cgsize(resultdef),power,helper2,resultreg);
                  if (tordconstnode(right).value < 0) then
                    { Invert the result }
                    current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_NEG,resultreg,resultreg));
                end
              else
                { Unsigned division by 2^power is a plain logical shift }
                cg.a_op_const_reg_reg(current_asmdata.CurrAsmList,OP_SHR,opsize,power,numerator,resultreg)
            end
          else
            { Generic division }
            begin
              if is_signed(left.resultdef) then
                op:=A_SDIV
              else
                op:=A_UDIV;
              { If we didn't acquire the original divisor earlier, grab it now }
              if divider = NR_NO then
                begin
                  divider:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
                  cg.a_load_const_reg(current_asmdata.CurrAsmList,opsize,tordconstnode(right).value.svalue,divider);
                end;
              current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(op,resultreg,numerator,divider));
            end;
        end;

      { Emits the check for the single signed-division overflow case,
        low(int64) div (-1), when overflow checking is enabled. }
      procedure genOverflowCheck;
        begin
          { in case of overflow checking, also check for low(int64) div (-1)
            (no hardware support for this either) }
          if (cs_check_overflow in current_settings.localswitches) and
            is_signed(left.resultdef) and
            ((right.nodetype<>ordconstn) or
             (tordconstnode(right).value=-1)) then
            begin
              { num=ffff... and div=8000... <=>
                num xor not(div xor 8000...) = 0
                (and we have the "eon" operation, which performs "xor not(...)" }
              tmpreg:=hlcg.getintregister(current_asmdata.CurrAsmList,left.resultdef);
              hlcg.a_op_const_reg_reg(current_asmdata.CurrAsmList,OP_XOR,left.resultdef,low(int64),numerator,tmpreg);
              current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(A_EON,
                tmpreg,numerator,tmpreg));
              current_asmdata.CurrAsmList.concat(taicpu.op_reg_const(A_CMP,tmpreg,0));
              { now the zero/equal flag is set in case we divided low(int64) by
                (-1) }
              location_reset(overflowloc,LOC_FLAGS,OS_NO);
              overflowloc.resflags:=F_EQ;
              cg.g_overflowcheck_loc(current_asmdata.CurrAsmList,location,resultdef,overflowloc);
            end;
        end;

      begin
        secondpass(left);
        secondpass(right);
        { avoid warning }
        divider := NR_NO;
        largernumreg := NR_NO;
        expandword := False;
        opsize := def_cgsize(resultdef);
        { set result location }
        location_reset(location,LOC_REGISTER,opsize);
        location.register:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
        resultreg:=location.register;
        { put numerator in register }
        hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,left.resultdef,true);
        numerator:=left.location.register;

        if (right.nodetype=ordconstn) then
          begin
            { If optimising for size, just use regular division operations }
            if (cs_opt_size in current_settings.optimizerswitches) or
              ((tordconstnode(right).value=1) or
              (tordconstnode(right).value=int64(-1)) or
              isabspowerof2(tordconstnode(right).value,power)) then
              begin
                { Store divisor for later (and executed at the same time as the multiplication) }
                if (nodetype=modn) then
                  begin
                    if (tordconstnode(right).value = 1) or (tordconstnode(right).value = int64(-1)) then
                      begin
                        { Just evaluates to zero }
                        current_asmdata.CurrAsmList.concat(taicpu.op_reg_const(A_MOVZ,resultreg, 0));
                        Exit;
                      end
                    { "not cs_opt_size" saves from checking the value of the divisor again
                      (if cs_opt_size is not set, then the divisor is a power of 2) }
                    else if not (cs_opt_size in current_settings.optimizerswitches) then
                      begin
                        divider:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
                        cg.a_load_const_reg(current_asmdata.CurrAsmList,opsize,tordconstnode(right).value.svalue,divider);
                      end
                  end;
                genOrdConstNodeDiv;
                genOverflowCheck;
                { in case of modulo, multiply result again by the divider and subtract
                  from the numerator }
                if (nodetype=modn) then
                  begin
                    if ispowerof2(tordconstnode(right).value,power) then
                      begin
                        { remainder = num - (quotient shl power) }
                        shifterop.shiftmode := SM_LSL;
                        shifterop.shiftimm := power;
                        current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg_shifterop(A_SUB,resultreg,numerator,resultreg,shifterop));
                      end
                    else
                      { remainder = num - quotient*divider (MSUB) }
                      current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg_reg(A_MSUB,resultreg,
                        resultreg,divider,numerator));
                  end;
                Exit;
              end
            else
              begin
                { Magic-reciprocal path for other constant divisors.
                  For sub-64-bit sizes ("expandword"), the multiply is done on
                  the full 64-bit register aliases (largerresreg/largernumreg)
                  and the >>32 extraction is merged into a later shift where
                  possible. }
                if is_signed(left.resultdef) then
                  begin
                    if (nodetype=modn) then { Signed mod doesn't work properly }
                      begin
                        divider:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
                        cg.a_load_const_reg(current_asmdata.CurrAsmList,opsize,tordconstnode(right).value.svalue,divider);
                        genOrdConstNodeDiv;
                      end
                    else
                      begin
                        { Read signed value to avoid Internal Error 200706094 }
                        dividend := tordconstnode(right).value.svalue;
                        calc_divconst_magic_signed(resultdef.size * 8, dividend, reciprocal_signed, shift);
                        cg.a_load_const_reg(current_asmdata.CurrAsmList, opsize, reciprocal_signed, resultreg);
                        { SMULH is only available for the full 64-bit registers }
                        if opsize in [OS_64, OS_S64] then
                          begin
                            current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(A_SMULH,resultreg,resultreg,numerator));
                            largerresreg := resultreg;
                          end
                        else
                          begin
                            largerresreg := newreg(getregtype(resultreg), getsupreg(resultreg), R_SUBWHOLE);
                            largernumreg := newreg(getregtype(numerator), getsupreg(numerator), R_SUBWHOLE);
                            current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(A_MUL,largerresreg,largerresreg,largernumreg));
                            expandword := True; { Merge the shift operation with something below }
                          end;
                        { Store divisor for later (and executed at the same time as the multiplication) }
                        if nodetype=modn then
                          begin
                            divider:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
                            cg.a_load_const_reg(current_asmdata.CurrAsmList,opsize,dividend,divider);
                          end;
                        { add or subtract dividend }
                        if (dividend > 0) and (reciprocal_signed < 0) then
                          begin
                            if expandword then
                              begin
                                shifterop.shiftmode := SM_ASR;
                                shifterop.shiftimm := 32;
                                expandword := False;
                                current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg_shifterop(A_ADD,largerresreg,largernumreg,largerresreg,shifterop));
                              end
                            else
                              current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(A_ADD,resultreg,resultreg,numerator));
                          end
                        else if (dividend < 0) and (reciprocal_signed > 0) then
                          begin
                            if expandword then
                              begin
                                { We can't append LSR to the SUB below because it's on the wrong operand }
                                expandword := False;
                                current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_const(A_ASR,largerresreg,largerresreg,32));
                              end;
                            current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(A_SUB,resultreg,resultreg,numerator));
                          end
                        else if expandword then
                          { fold the >>32 high-word extraction into the shift below }
                          Inc(shift,32);
                        { shift if necessary }
                        if (shift <> 0) then
                          begin
                            if expandword then
                              current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_const(A_ASR,largerresreg,largerresreg,shift))
                            else
                              current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_const(A_ASR,resultreg,resultreg,shift));
                          end;
                        { extract and add the sign bit }
                        shifterop.shiftmode := SM_LSR;
                        shifterop.shiftimm := left.resultdef.size*8 - 1;
                        if (dividend < 0) then
                          current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg_shifterop(A_ADD,resultreg,resultreg,resultreg,shifterop))
                        else
                          current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg_shifterop(A_ADD,resultreg,resultreg,numerator,shifterop));
                      end;
                  end
                else
                  begin
                    calc_divconst_magic_unsigned(resultdef.size * 8, tordconstnode(right).value, reciprocal, magic_add, shift);
                    { Add explicit typecast to tcgint type, to avoid range or overflow check }
                    cg.a_load_const_reg(current_asmdata.CurrAsmList, opsize, tcgint(reciprocal), resultreg);
                    { UMULH is only available for the full 64-bit registers }
                    if opsize in [OS_64, OS_S64] then
                      begin
                        current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(A_UMULH,resultreg,resultreg,numerator));
                        largerresreg := resultreg;
                      end
                    else
                      begin
                        largerresreg := newreg(getregtype(resultreg), getsupreg(resultreg), R_SUBWHOLE);
                        largernumreg := newreg(getregtype(numerator), getsupreg(numerator), R_SUBWHOLE);
                        current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(A_MUL,largerresreg,largerresreg,largernumreg));
                        expandword := True; { Try to merge the shift operation with something below }
                      end;
                    { Store divisor for later (and executed at the same time as the multiplication) }
                    if (nodetype=modn) then
                      begin
                        divider:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
                        cg.a_load_const_reg(current_asmdata.CurrAsmList,opsize,tordconstnode(right).value.svalue,divider);
                      end;
                    if magic_add then
                      begin
                        { We can't append LSR to the ADD below because it would require extending the registers
                          and interfere with the carry bit }
                        if expandword then
                          current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_const(A_LSR,largerresreg,largerresreg,32));
                        { Add the reciprocal to the high-order word, tracking the carry bit, shift, then
                          insert the carry bit via CSEL and ORR }
                        if opsize in [OS_64,OS_S64] then
                          zeroreg := NR_XZR
                        else
                          zeroreg := NR_WZR;
                        high_bit := QWord(1) shl ((resultdef.size * 8) - shift);
                        tmpreg := cg.getintregister(current_asmdata.CurrAsmList, opsize);
                        cg.a_load_const_reg(current_asmdata.CurrAsmList, opsize, high_bit, tmpreg);
                        { Generate ADDS instruction }
                        hp := taicpu.op_reg_reg_reg(A_ADD,resultreg,resultreg,numerator);
                        hp.oppostfix := PF_S;
                        current_asmdata.CurrAsmList.concat(hp);
                        current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg_cond(A_CSEL,tmpreg,tmpreg,zeroreg, C_CS));
                        shifterop.shiftmode := SM_LSR;
                        shifterop.shiftimm := shift;
                        current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg_shifterop(A_ORR,resultreg,tmpreg,resultreg,shifterop));
                      end
                    else if expandword then
                      { Include the right-shift by 32 to get the high-order DWord }
                      current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_const(A_LSR,largerresreg,largerresreg,shift + 32))
                    else
                      current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_const(A_LSR,resultreg,resultreg,shift));
                  end;
              end;
          end
        { no divide-by-zero detection available in hardware, emulate (if it's a
          constant, this will have been detected earlier already) }
        else
          begin
            { load divider in a register }
            hlcg.location_force_reg(current_asmdata.CurrAsmList,right.location,right.resultdef,right.resultdef,true);
            divider:=right.location.register;
            { ARM-64 developer guides recommend checking for division by zero conditions
              AFTER the division, since the check and the division can be done in tandem }
            if is_signed(left.resultdef) then
              op:=A_SDIV
            else
              op:=A_UDIV;
            current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(op,resultreg,numerator,divider));
            current_asmdata.CurrAsmList.concat(taicpu.op_reg_const(A_CMP,divider,0));
            current_asmdata.getjumplabel(hl);
            current_asmdata.CurrAsmList.concat(taicpu.op_cond_sym(A_B,C_NE,hl));
            cg.a_call_name(current_asmdata.CurrAsmList,'FPC_DIVBYZERO',false);
            cg.a_label(current_asmdata.CurrAsmList,hl);
          end;
        { no-op here for the constant magic-reciprocal path, since its divisor
          is never -1 and the check only triggers for -1 or non-constants }
        genOverflowCheck;
        { in case of modulo, multiply result again by the divider and subtract
          from the numerator }
        if (nodetype=modn) then
          begin
            { If we didn't acquire the original divisor earlier, grab it now }
            if divider = NR_NO then
              begin
                divider:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
                cg.a_load_const_reg(current_asmdata.CurrAsmList,opsize,tordconstnode(right).value.svalue,divider);
              end;
            current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg_reg(A_MSUB,resultreg,
              resultreg,divider,numerator));
          end;
      end;
  384. {*****************************************************************************
  385. taarch64notnode
  386. *****************************************************************************}
  387. procedure taarch64notnode.second_boolean;
  388. begin
  389. secondpass(left);
  390. if not handle_locjump then
  391. begin
  392. case left.location.loc of
  393. LOC_FLAGS :
  394. begin
  395. location_copy(location,left.location);
  396. inverse_flags(location.resflags);
  397. end;
  398. LOC_REGISTER, LOC_CREGISTER,
  399. LOC_REFERENCE, LOC_CREFERENCE,
  400. LOC_SUBSETREG, LOC_CSUBSETREG,
  401. LOC_SUBSETREF, LOC_CSUBSETREF:
  402. begin
  403. hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,left.resultdef,true);
  404. current_asmdata.CurrAsmList.concat(taicpu.op_reg_const(A_CMP,
  405. left.location.register,0));
  406. location_reset(location,LOC_FLAGS,OS_NO);
  407. location.resflags:=F_EQ;
  408. end;
  409. else
  410. internalerror(2003042401);
  411. end;
  412. end;
  413. end;
{*****************************************************************************
                           taarch64unaryminusnode
*****************************************************************************}

    { Floating-point negation: force the operand into a scalar mm register
      and emit a single FNEG into a freshly allocated result register. }
    procedure taarch64unaryminusnode.second_float;
      begin
        secondpass(left);
        hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,true);
        location_reset(location,LOC_MMREGISTER,def_cgsize(resultdef));
        location.register:=cg.getmmregister(current_asmdata.CurrAsmList,location.size);
        current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_FNEG,location.register,left.location.register));
        { FNEG itself cannot raise an FPU exception, but the helper decides
          (based on the current settings) whether a check must be emitted }
        cg.maybe_check_for_fpu_exception(current_asmdata.CurrAsmList);
      end;
  426. begin
  427. cmoddivnode:=taarch64moddivnode;
  428. cnotnode:=taarch64notnode;
  429. cunaryminusnode:=taarch64unaryminusnode;
  430. end.