nx86inl.pas 41 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058
  1. {
  2. Copyright (c) 1998-2002 by Florian Klaempfl
  3. Generate x86 inline nodes
  4. This program is free software; you can redistribute it and/or modify
  5. it under the terms of the GNU General Public License as published by
  6. the Free Software Foundation; either version 2 of the License, or
  7. (at your option) any later version.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. GNU General Public License for more details.
  12. You should have received a copy of the GNU General Public License
  13. along with this program; if not, write to the Free Software
  14. Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  15. ****************************************************************************
  16. }
  17. unit nx86inl;
  18. {$i fpcdefs.inc}
  19. interface
  20. uses
  21. node,ninl,ncginl;
  type
    { Inline node for the x86 family.  It overrides first-pass hooks so
      that selected intrinsics stay inline nodes, and the matching
      second-pass hooks that emit the x86 code for them. }
    tx86inlinenode = class(tcginlinenode)
    protected
      { hook called by first_round_real and first_trunc_real; virtual so
        that a descendant can replace it }
      procedure maybe_remove_round_trunc_typeconv; virtual;
    public
      { first pass override
        so that the code generator will actually generate
        these nodes.
      }
      function first_pi: tnode; override;
      function first_arctan_real: tnode; override;
      function first_abs_real: tnode; override;
      function first_sqr_real: tnode; override;
      function first_sqrt_real: tnode; override;
      function first_ln_real: tnode; override;
      function first_cos_real: tnode; override;
      function first_sin_real: tnode; override;
      function first_round_real: tnode; override;
      function first_trunc_real: tnode; override;
      function first_popcnt: tnode; override;
      function first_fma: tnode; override;
      function first_frac_real: tnode; override;
      function first_int_real: tnode; override;

      { second pass override to generate these nodes }
      procedure second_IncludeExclude; override;
      procedure second_pi; override;
      procedure second_arctan_real; override;
      procedure second_abs_real; override;
      procedure second_round_real; override;
      procedure second_sqr_real; override;
      procedure second_sqrt_real; override;
      procedure second_ln_real; override;
      procedure second_cos_real; override;
      procedure second_sin_real; override;
      procedure second_trunc_real; override;
      procedure second_prefetch; override;
      procedure second_abs_long; override;
      procedure second_popcnt; override;
      procedure second_fma; override;
      procedure second_frac_real; override;
      procedure second_int_real; override;
    private
      { runs the second pass on lnode and brings its value into the
        FPU result register }
      procedure load_fpu_location(lnode: tnode);
    end;
  66. implementation
  67. uses
  68. systems,
  69. globtype,globals,
  70. verbose,compinnr,
  71. defutil,
  72. aasmbase,aasmdata,aasmcpu,
  73. symtype,symdef,symcpu,
  74. cgbase,pass_2,
  75. cpuinfo,cpubase,nutils,
  76. ncal,ncgutil,
  77. tgobj,
  78. cga,cgutils,cgx86,cgobj,hlcgobj;
  79. {*****************************************************************************
  80. TX86INLINENODE
  81. *****************************************************************************}
  { Hook invoked at the start of first_round_real and first_trunc_real.
    Intentionally empty in this class. }
  procedure tx86inlinenode.maybe_remove_round_trunc_typeconv;
    begin
      { nothing to do here: only makes a difference for x86_64 }
    end;
  { First pass for pi.  When the best real type is s80real the node is
    kept (nil is returned) and its result is expected in an FPU
    register; otherwise the inherited handling decides. }
  function tx86inlinenode.first_pi : tnode;
    begin
      if tfloatdef(pbestrealtype^).floattype<>s80real then
        begin
          result:=inherited first_pi;
          exit;
        end;
      expectloc:=LOC_FPUREGISTER;
      result:=nil;
    end;
  { First pass for arctan.  The node is kept (nil result, value expected
    in an FPU register) when the best real type is s80real; in every
    other case the inherited handling is used. }
  function tx86inlinenode.first_arctan_real : tnode;
    begin
{$ifdef i8086}
      { FPATAN's range is limited to (0 <= value < 1) on the 8087 and 80287,
        so we need to use the RTL helper on these FPUs }
      if current_settings.cputype < cpu_386 then
        begin
          result:=inherited first_arctan_real;
          exit;
        end;
{$endif i8086}
      if tfloatdef(pbestrealtype^).floattype<>s80real then
        begin
          result:=inherited first_arctan_real;
          exit;
        end;
      expectloc:=LOC_FPUREGISTER;
      result:=nil;
    end;
  { First pass for abs on a real: the node is always kept.  The result
    is expected in an MM register when the result type uses the vector
    FPU, otherwise in an FPU register. }
  function tx86inlinenode.first_abs_real : tnode;
    begin
      result:=nil;
      if use_vectorfpu(resultdef) then
        expectloc:=LOC_MMREGISTER
      else
        expectloc:=LOC_FPUREGISTER;
    end;
  { First pass for sqr on a real: the node is always kept.  The result
    is expected in an MM register when the result type uses the vector
    FPU, otherwise in an FPU register. }
  function tx86inlinenode.first_sqr_real : tnode;
    begin
      result:=nil;
      if use_vectorfpu(resultdef) then
        expectloc:=LOC_MMREGISTER
      else
        expectloc:=LOC_FPUREGISTER;
    end;
  { First pass for sqrt on a real: the node is always kept.  The result
    is expected in an MM register when the result type uses the vector
    FPU, otherwise in an FPU register. }
  function tx86inlinenode.first_sqrt_real : tnode;
    begin
      result:=nil;
      if use_vectorfpu(resultdef) then
        expectloc:=LOC_MMREGISTER
      else
        expectloc:=LOC_FPUREGISTER;
    end;
  { First pass for ln.  When the best real type is s80real the node is
    kept (nil is returned) and its result is expected in an FPU
    register; otherwise the inherited handling decides. }
  function tx86inlinenode.first_ln_real : tnode;
    begin
      if tfloatdef(pbestrealtype^).floattype<>s80real then
        begin
          result:=inherited first_ln_real;
          exit;
        end;
      expectloc:=LOC_FPUREGISTER;
      result:=nil;
    end;
  { First pass for cos.  The node is kept (nil result, value expected in
    an FPU register) when the best real type is s80real; in every other
    case the inherited handling is used. }
  function tx86inlinenode.first_cos_real : tnode;
    begin
{$ifdef i8086}
      { FCOS is 387+ }
      if current_settings.cputype < cpu_386 then
        begin
          result:=inherited first_cos_real;
          exit;
        end;
{$endif i8086}
      if tfloatdef(pbestrealtype^).floattype<>s80real then
        begin
          result:=inherited first_cos_real;
          exit;
        end;
      expectloc:=LOC_FPUREGISTER;
      result:=nil;
    end;
  { First pass for sin.  The node is kept (nil result, value expected in
    an FPU register) when the best real type is s80real; in every other
    case the inherited handling is used. }
  function tx86inlinenode.first_sin_real : tnode;
    begin
{$ifdef i8086}
      { FSIN is 387+ }
      if current_settings.cputype < cpu_386 then
        begin
          result:=inherited first_sin_real;
          exit;
        end;
{$endif i8086}
      if tfloatdef(pbestrealtype^).floattype<>s80real then
        begin
          result:=inherited first_sin_real;
          exit;
        end;
      expectloc:=LOC_FPUREGISTER;
      result:=nil;
    end;
  { First pass for round.  Calls maybe_remove_round_trunc_typeconv and
    keeps the node.  The result is expected in memory, except on x86_64
    with a vector-FPU operand, where it is expected in a register. }
  function tx86inlinenode.first_round_real : tnode;
    begin
      maybe_remove_round_trunc_typeconv;
      result:=nil;
      expectloc:=LOC_REFERENCE;
{$ifdef x86_64}
      if use_vectorfpu(left.resultdef) then
        expectloc:=LOC_REGISTER;
{$endif x86_64}
    end;
  { First pass for trunc.  Calls maybe_remove_round_trunc_typeconv first.
    On x86_64 with a vector-FPU operand the node is always kept and its
    result is expected in a register.  Otherwise the inherited handling
    is used when optimizing for size, else the node is kept with its
    result expected in memory. }
  function tx86inlinenode.first_trunc_real: tnode;
    begin
      maybe_remove_round_trunc_typeconv;
{$ifdef x86_64}
      if use_vectorfpu(left.resultdef) then
        begin
          expectloc:=LOC_REGISTER;
          result:=nil;
          exit;
        end;
{$endif x86_64}
      if cs_opt_size in current_settings.optimizerswitches then
        result:=inherited first_trunc_real
      else
        begin
          expectloc:=LOC_REFERENCE;
          result:=nil;
        end;
    end;
  { First pass for popcnt.  The node is kept (result expected in a
    register) when the selected CPU type has CPUX86_HAS_POPCNT; on i386
    this additionally requires a non-64-bit operand.  In all other cases,
    and always on i8086, the inherited handling is used. }
  function tx86inlinenode.first_popcnt: tnode;
    begin
{$ifndef i8086}
      if (CPUX86_HAS_POPCNT in cpu_capabilities[current_settings.cputype])
{$ifdef i386}
         and not is_64bit(left.resultdef)
{$endif i386}
         then
        begin
          expectloc:=LOC_REGISTER;
          Result:=nil;
          exit;
        end;
{$endif not i8086}
      Result:=inherited first_popcnt;
    end;
  { First pass for fma.  The node is kept (result expected in an MM
    register) when the selected CPU type has FMA or FMA4 and the result
    type is single or double.  In all other cases, and always on i8086,
    the inherited handling is used. }
  function tx86inlinenode.first_fma : tnode;
    begin
{$ifndef i8086}
      if ((cpu_capabilities[current_settings.cputype]*[CPUX86_HAS_FMA,CPUX86_HAS_FMA4])<>[]) and
         (is_double(resultdef) or is_single(resultdef)) then
        begin
          expectloc:=LOC_MMREGISTER;
          Result:=nil;
          exit;
        end;
{$endif not i8086}
      Result:=inherited first_fma;
    end;
  { First pass for frac.  The node is kept (result expected in an MM
    register) when the FPU type is at least fpu_sse41 and the result type
    is single or double; otherwise the inherited handling is used. }
  function tx86inlinenode.first_frac_real : tnode;
    begin
      if (current_settings.fputype<fpu_sse41) or
         not(is_double(resultdef) or is_single(resultdef)) then
        begin
          Result:=inherited first_frac_real;
          exit;
        end;
      expectloc:=LOC_MMREGISTER;
      Result:=nil;
    end;
  { First pass for int.  The node is kept (result expected in an MM
    register) when the FPU type is at least fpu_sse41 and the result type
    is single or double; otherwise the inherited handling is used. }
  function tx86inlinenode.first_int_real : tnode;
    begin
      if (current_settings.fputype<fpu_sse41) or
         not(is_double(resultdef) or is_single(resultdef)) then
        begin
          Result:=inherited first_int_real;
          exit;
        end;
      expectloc:=LOC_MMREGISTER;
      Result:=nil;
    end;
  { Second pass for pi: emits FLDPI and reports the result in the FPU
    result register. }
  procedure tx86inlinenode.second_pi;
    begin
      location_reset(location,LOC_FPUREGISTER,def_cgsize(resultdef));
      location.register:=NR_FPU_RESULT_REG;
      emit_none(A_FLDPI,S_NO);
      { FLDPI pushed a value, keep the tracked FPU stack depth in sync }
      tcgx86(cg).inc_fpu_stack;
    end;
  { Runs the second pass on lnode and loads its value into the FPU
    result register; this node's location is set to that register.
    Any location kind not handled below raises an internal error. }
  procedure tx86inlinenode.load_fpu_location(lnode: tnode);
    begin
      location_reset(location,LOC_FPUREGISTER,def_cgsize(resultdef));
      location.register:=NR_FPU_RESULT_REG;
      secondpass(lnode);
      case lnode.location.loc of
        LOC_FPUREGISTER:
          { no load is emitted for this case }
          ;
        LOC_CFPUREGISTER:
          cg.a_loadfpu_reg_reg(current_asmdata.CurrAsmList,
            lnode.location.size,lnode.location.size,
            lnode.location.register,location.register);
        LOC_REFERENCE,
        LOC_CREFERENCE:
          cg.a_loadfpu_ref_reg(current_asmdata.CurrAsmList,
            lnode.location.size,lnode.location.size,
            lnode.location.reference,location.register);
        LOC_MMREGISTER,
        LOC_CMMREGISTER:
          begin
            { take over the operand's location and force that into an
              FPU register }
            location:=lnode.location;
            hlcg.location_force_fpureg(current_asmdata.CurrAsmList,location,resultdef,false);
          end;
        else
          internalerror(309991);
      end;
    end;
  { Second pass for arctan: loads the operand onto the FPU stack and
    emits FLD1 followed by FPATAN. }
  procedure tx86inlinenode.second_arctan_real;
    begin
      load_fpu_location(left);
      { FPATAN works on the two topmost stack entries, so 1.0 is pushed
        as the second operand }
      emit_none(A_FLD1,S_NO);
      emit_none(A_FPATAN,S_NO);
    end;
  { Second pass for abs on a real.
    Vector FPU: ANDs the operand with the FPC_ABSMASK_SINGLE or
    FPC_ABSMASK_DOUBLE constant, using the three-operand AVX form into a
    fresh register when UseAVX is set, and the in-place SSE form on the
    operand's register otherwise.
    x87: loads the operand and emits FABS.
    Raises an internal error for float types other than s32real and
    s64real on the vector path. }
  procedure tx86inlinenode.second_abs_real;
    var
      maskref : treference;
      masksym : string;
      avxop,
      sseop   : tasmop;
    begin
      if not use_vectorfpu(resultdef) then
        begin
          load_fpu_location(left);
          emit_none(A_FABS,S_NO);
          exit;
        end;

      secondpass(left);
      if left.location.loc<>LOC_MMREGISTER then
        hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,UseAVX);
      if UseAVX then
        begin
          location_reset(location,LOC_MMREGISTER,def_cgsize(resultdef));
          location.register:=cg.getmmregister(current_asmdata.CurrAsmList,def_cgsize(resultdef));
        end
      else
        location:=left.location;

      { pick the mask symbol and the opcodes for the operand precision }
      masksym:='';
      avxop:=A_NONE;
      sseop:=A_NONE;
      case tfloatdef(resultdef).floattype of
        s32real:
          begin
            masksym:='FPC_ABSMASK_SINGLE';
            avxop:=A_VANDPS;
            sseop:=A_ANDPS;
          end;
        s64real:
          begin
            masksym:='FPC_ABSMASK_DOUBLE';
            avxop:=A_VANDPD;
            sseop:=A_ANDPD;
          end;
        else
          internalerror(200506081);
      end;

      reference_reset_symbol(maskref,current_asmdata.RefAsmSymbol(target_info.cprefix+masksym,AT_DATA),0,4,[]);
      tcgx86(cg).make_simple_ref(current_asmdata.CurrAsmList,maskref);
      if UseAVX then
        current_asmdata.CurrAsmList.concat(taicpu.op_ref_reg_reg(
          avxop,S_XMM,maskref,left.location.register,location.register))
      else
        current_asmdata.CurrAsmList.concat(taicpu.op_ref_reg(
          sseop,S_XMM,maskref,location.register));
    end;
  { Second pass for round.
    x86_64 with a vector-FPU operand: converts the scalar in an MM
    register to a 64 bit integer register with (V)CVTSS2SI or
    (V)CVTSD2SI; other operand sizes raise an internal error.
    Otherwise: loads the operand onto the FPU stack and stores it with
    FISTP into a temporary, which becomes the result location. }
  procedure tx86inlinenode.second_round_real;
{$ifdef x86_64}
    var
      cvtop : tasmop;
{$endif x86_64}
    begin
{$ifdef x86_64}
      if use_vectorfpu(left.resultdef) then
        begin
          secondpass(left);
          hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,true);
          location_reset(location,LOC_REGISTER,OS_S64);
          location.register:=cg.getintregister(current_asmdata.CurrAsmList,OS_S64);
          cvtop:=A_NONE;
          case left.location.size of
            OS_F32:
              begin
                if UseAVX then
                  cvtop:=A_VCVTSS2SI
                else
                  cvtop:=A_CVTSS2SI;
              end;
            OS_F64:
              begin
                if UseAVX then
                  cvtop:=A_VCVTSD2SI
                else
                  cvtop:=A_CVTSD2SI;
              end;
            else
              internalerror(2007031402);
          end;
          current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(cvtop,S_NO,left.location.register,location.register));
          exit;
        end;
{$endif x86_64}
      load_fpu_location(left);
      location_reset_ref(location,LOC_REFERENCE,OS_S64,0,[]);
      tg.GetTemp(current_asmdata.CurrAsmList,resultdef.size,resultdef.alignment,tt_normal,location.reference);
      emit_ref(A_FISTP,S_IQ,location.reference);
      { FISTP popped the operand }
      tcgx86(cg).dec_fpu_stack;
      emit_none(A_FWAIT,S_NO);
    end;
  { Second pass for trunc.  Three code paths:
    - x86_64, vector-FPU operand: (V)CVTTSS2SI or (V)CVTTSD2SI into a
      64 bit integer register; other operand sizes raise an internal
      error.
    - FPU type at least fpu_sse3: load onto the FPU stack and store with
      FISTTP into a temporary.
    - otherwise: save the FPU control word, load a copy with the bits
      $0f00 set, store with FISTP into a temporary and reload the saved
      control word. }
  procedure tx86inlinenode.second_trunc_real;
    var
      savedcw,
      trunccw : treference;
{$ifdef x86_64}
      cvtop   : tasmop;
{$endif x86_64}
    begin
{$ifdef x86_64}
      { NOTE(review): left.location is inspected here before
        secondpass(left) is called in this routine - confirm intended }
      if use_vectorfpu(left.resultdef) and
         not((left.location.loc=LOC_FPUREGISTER) and (current_settings.fputype>=fpu_sse3)) then
        begin
          secondpass(left);
          hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,true);
          location_reset(location,LOC_REGISTER,OS_S64);
          location.register:=cg.getintregister(current_asmdata.CurrAsmList,OS_S64);
          cvtop:=A_NONE;
          case left.location.size of
            OS_F32:
              begin
                if UseAVX then
                  cvtop:=A_VCVTTSS2SI
                else
                  cvtop:=A_CVTTSS2SI;
              end;
            OS_F64:
              begin
                if UseAVX then
                  cvtop:=A_VCVTTSD2SI
                else
                  cvtop:=A_CVTTSD2SI;
              end;
            else
              internalerror(2007031401);
          end;
          current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(cvtop,S_NO,left.location.register,location.register));
          exit;
        end;
{$endif x86_64}

      if (current_settings.fputype>=fpu_sse3) then
        begin
          { FISTTP path: no control word handling is emitted }
          load_fpu_location(left);
          location_reset_ref(location,LOC_REFERENCE,OS_S64,0,[]);
          tg.GetTemp(current_asmdata.CurrAsmList,resultdef.size,resultdef.alignment,tt_normal,location.reference);
          emit_ref(A_FISTTP,S_IQ,location.reference);
          tcgx86(cg).dec_fpu_stack;
          exit;
        end;

      { two 2-byte temporaries: the saved control word and the modified
        copy used for the store }
      tg.GetTemp(current_asmdata.CurrAsmList,2,2,tt_normal,savedcw);
      tg.GetTemp(current_asmdata.CurrAsmList,2,2,tt_normal,trunccw);
{$ifdef i8086}
      if current_settings.cputype<=cpu_286 then
        begin
          emit_ref(A_FSTCW,S_NO,trunccw);
          emit_ref(A_FSTCW,S_NO,savedcw);
          emit_none(A_FWAIT,S_NO);
        end
      else
{$endif i8086}
        begin
          emit_ref(A_FNSTCW,S_NO,trunccw);
          emit_ref(A_FNSTCW,S_NO,savedcw);
        end;
      emit_const_ref(A_OR,S_W,$0f00,trunccw);
      load_fpu_location(left);
      emit_ref(A_FLDCW,S_NO,trunccw);
      location_reset_ref(location,LOC_REFERENCE,OS_S64,0,[]);
      tg.GetTemp(current_asmdata.CurrAsmList,resultdef.size,resultdef.alignment,tt_normal,location.reference);
      emit_ref(A_FISTP,S_IQ,location.reference);
      tcgx86(cg).dec_fpu_stack;
      { switch back to the control word saved above }
      emit_ref(A_FLDCW,S_NO,savedcw);
      emit_none(A_FWAIT,S_NO);
      tg.UnGetTemp(current_asmdata.CurrAsmList,savedcw);
      tg.UnGetTemp(current_asmdata.CurrAsmList,trunccw);
    end;
  { Second pass for sqr on a real.
    x87: loads the operand and multiplies ST0 by itself.
    Vector FPU: multiplies the operand by itself into a fresh MM
    register; with UseAVX via the three-operand form, otherwise by
    copying the operand into the result register and squaring in
    place. }
  procedure tx86inlinenode.second_sqr_real;
    begin
      if not use_vectorfpu(resultdef) then
        begin
          load_fpu_location(left);
          emit_reg_reg(A_FMUL,S_NO,NR_ST0,NR_ST0);
          exit;
        end;

      secondpass(left);
      location_reset(location,LOC_MMREGISTER,left.location.size);
      location.register:=cg.getmmregister(current_asmdata.CurrAsmList,location.size);
      if not UseAVX then
        begin
          { an operand sitting in an FPU register is moved to an MM
            register first, before it is loaded into the result register }
          if left.location.loc in [LOC_CFPUREGISTER,LOC_FPUREGISTER] then
            hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,true);
          cg.a_loadmm_loc_reg(current_asmdata.CurrAsmList,location.size,left.location,location.register,mms_movescalar);
          cg.a_opmm_reg_reg(current_asmdata.CurrAsmList,OP_MUL,left.location.size,location.register,location.register,mms_movescalar);
        end
      else
        begin
          hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,true);
          cg.a_opmm_reg_reg_reg(current_asmdata.CurrAsmList,OP_MUL,left.location.size,left.location.register,left.location.register,location.register,mms_movescalar);
        end;
    end;
  { Second pass for sqrt on a real.
    x87: loads the operand and emits FSQRT.
    Vector FPU: forces the operand into an MM register and emits
    (V)SQRTSS or (V)SQRTSD into a fresh MM register.  Float types other
    than s32real and s64real raise an internal error. }
  procedure tx86inlinenode.second_sqrt_real;
    begin
      if not use_vectorfpu(resultdef) then
        begin
          load_fpu_location(left);
          emit_none(A_FSQRT,S_NO);
          exit;
        end;

      secondpass(left);
      hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,true);
      location_reset(location,LOC_MMREGISTER,left.location.size);
      location.register:=cg.getmmregister(current_asmdata.CurrAsmList,location.size);
      { for the AVX forms:
        - we use S_NO instead of S_XMM here, regardless of the register size,
          as the size of the memory location is 32/64 bit
        - using left.location.register here as 2nd parameter is crucial to
          break dependency chains }
      case tfloatdef(resultdef).floattype of
        s32real:
          begin
            if UseAVX then
              current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(A_VSQRTSS,S_NO,left.location.register,left.location.register,location.register))
            else
              current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_SQRTSS,S_NO,left.location.register,location.register));
          end;
        s64real:
          begin
            if UseAVX then
              current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(A_VSQRTSD,S_NO,left.location.register,left.location.register,location.register))
            else
              current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_SQRTSD,S_NO,left.location.register,location.register));
          end;
        else
          internalerror(200510031);
      end;
    end;
  { Second pass for ln: loads the operand onto the FPU stack and emits
    FLDLN2, FXCH and FYL2X. }
  procedure tx86inlinenode.second_ln_real;
    begin
      load_fpu_location(left);
      { push the ln(2) constant, then swap so the operand is on top
        again for FYL2X }
      emit_none(A_FLDLN2,S_NO);
      emit_none(A_FXCH,S_NO);
      emit_none(A_FYL2X,S_NO);
    end;
  { Second pass for cos: loads the operand onto the FPU stack and emits
    FCOS.  On i8086 with a CPU type below cpu_386 the inherited
    implementation is used instead. }
  procedure tx86inlinenode.second_cos_real;
    begin
{$ifdef i8086}
      { FCOS is 387+ }
      if current_settings.cputype < cpu_386 then
        begin
          inherited second_cos_real;
          exit;
        end;
{$endif i8086}
      load_fpu_location(left);
      emit_none(A_FCOS,S_NO);
    end;
  { Second pass for sin: loads the operand onto the FPU stack and emits
    FSIN.  On i8086 with a CPU type below cpu_386 the inherited
    implementation is used instead. }
  procedure tx86inlinenode.second_sin_real;
    begin
{$ifdef i8086}
      { FSIN is 387+ }
      if current_settings.cputype < cpu_386 then
        begin
          inherited second_sin_real;
          exit;
        end;
{$endif i8086}
      load_fpu_location(left);
      emit_none(A_FSIN,S_NO);
    end;
  { Second pass for prefetch: when the operand ends up as a memory
    reference, its address is loaded into a register and a PREFETCHNTA
    on that address is emitted; for any other location nothing is
    generated.  On i386 and i8086 code is only generated for CPU types
    from cpu_Pentium3 upwards.

    The cs_checkpointer switch is cleared on the operand tree for the
    duration of its second pass and re-enabled afterwards.  The original
    code passed false in both calls, which made the second call a no-op. }
  procedure tx86inlinenode.second_prefetch;
    var
      ref : treference;
      r : tregister;
      checkpointer_used : boolean;
    begin
{$if defined(i386) or defined(i8086)}
      if current_settings.cputype>=cpu_Pentium3 then
{$endif i386 or i8086}
        begin
          { do not call Checkpointer for left node }
          checkpointer_used:=(cs_checkpointer in current_settings.localswitches);
          if checkpointer_used then
            node_change_local_switch(left,cs_checkpointer,false);
          secondpass(left);
          { re-enable the switch that was cleared above }
          if checkpointer_used then
            node_change_local_switch(left,cs_checkpointer,true);
          case left.location.loc of
            LOC_CREFERENCE,
            LOC_REFERENCE:
              begin
                { materialise the address in a register so that the
                  prefetch uses a plain base-register reference }
                r:=cg.getintregister(current_asmdata.CurrAsmList,OS_ADDR);
                cg.a_loadaddr_ref_reg(current_asmdata.CurrAsmList,left.location.reference,r);
                reference_reset_base(ref,r,0,left.location.reference.alignment,left.location.reference.volatility);
                current_asmdata.CurrAsmList.concat(taicpu.op_ref(A_PREFETCHNTA,S_NO,ref));
              end;
            else
              { nothing to prefetch };
          end;
        end;
    end;
  { Second pass for abs on an integer.
    With CMOV: a negated copy of the operand is computed with NEG and
    moved over a plain copy when the NS condition holds.
    Without CMOV (tested on i8086 and i386 only): branch-free sequence
    that shifts the operand arithmetically right by width-1 bits and
    then XORs and subtracts that value from a copy of the operand.
    This sequence overwrites the operand's register. }
  procedure tx86inlinenode.second_abs_long;
    var
      negreg : tregister;
      opsize : tcgsize;
      cmov   : taicpu;
    begin
      opsize:=def_cgsize(left.resultdef);
      secondpass(left);
{$if defined(i8086) or defined(i386)}
      if not(CPUX86_HAS_CMOV in cpu_capabilities[current_settings.cputype]) then
        begin
          hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,left.resultdef,false);
          location:=left.location;
          location.register:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
          cg.a_load_reg_reg(current_asmdata.CurrAsmList,opsize,opsize,left.location.register,location.register);
          { operand register := operand sar (width-1) }
          cg.a_op_const_reg(current_asmdata.CurrAsmList,OP_SAR,opsize,tcgsize2size[opsize]*8-1,left.location.register);
          { result := (copy xor shifted) - shifted }
          cg.a_op_reg_reg(current_asmdata.CurrAsmList,OP_XOR,opsize,left.location.register,location.register);
          cg.a_op_reg_reg(current_asmdata.CurrAsmList,OP_SUB,opsize,left.location.register,location.register);
          exit;
        end;
{$endif i8086 or i386}
      hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,left.resultdef,true);
      negreg:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
      location:=left.location;
      location.register:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
      cg.a_load_reg_reg(current_asmdata.CurrAsmList,opsize,opsize,left.location.register,negreg);
      cg.a_load_reg_reg(current_asmdata.CurrAsmList,opsize,opsize,left.location.register,location.register);
      emit_reg(A_NEG,tcgsize2opsize[opsize],negreg);
      { take the negated copy when the NS condition holds after NEG }
      cmov:=taicpu.op_reg_reg(A_CMOVcc,tcgsize2opsize[opsize],negreg,location.register);
      cmov.condition:=C_NS;
      current_asmdata.CurrAsmList.concat(cmov);
    end;
  629. {*****************************************************************************
  630. INCLUDE/EXCLUDE GENERIC HANDLING
  631. *****************************************************************************}
  { Second pass for include/exclude.  The first call parameter is the
    set, the second one the element.
    Constant element: the bit mask is computed at compile time and
    applied with OR (include) or AND with the inverted mask (exclude),
    to memory or to a register.
    Variable element: BTS (include) or BTR (exclude) is emitted; 8 bit
    operands are widened to 32 bit for these instructions.
    On i8086 with a CPU type below cpu_386 the inherited implementation
    is used instead. }
  procedure tx86inlinenode.second_IncludeExclude;
    var
      setnode,
      elemnode  : tnode;
      elemreg,
      setreg    : tregister;
      setbase   : aint;
      bitsperop,
      bitmask   : longint;
      cgop      : topcg;
      asmop     : tasmop;
      opdef     : tdef;
      opsize,
      orgsize   : tcgsize;
    begin
{$ifdef i8086}
      { BTS and BTR are 386+ }
      if current_settings.cputype < cpu_386 then
        begin
          inherited second_IncludeExclude;
          exit;
        end;
{$endif i8086}
      { shorthands for the two parameter expressions }
      setnode:=tcallparanode(left).left;
      elemnode:=tcallparanode(tcallparanode(left).right).left;

      if is_smallset(tcallparanode(left).resultdef) then
        begin
          opdef:=tcallparanode(left).resultdef;
          opsize:=int_cgsize(opdef.size);
        end
      else
        begin
          opdef:=u32inttype;
          opsize:=OS_32;
        end;
      bitsperop:=(8*tcgsize2size[opsize]);

      secondpass(setnode);
      secondpass(elemnode);
      setbase:=tsetdef(setnode.resultdef).setbase;

      if elemnode.location.loc=LOC_CONSTANT then
        begin
          { calculate bit position }
          bitmask:=1 shl ((elemnode.location.value-setbase) mod bitsperop);
          { determine operator }
          if inlinenumber=in_include_x_y then
            cgop:=OP_OR
          else
            begin
              cgop:=OP_AND;
              bitmask:=not(bitmask);
            end;
          case setnode.location.loc of
            LOC_REFERENCE :
              begin
                { advance the reference to the operand-sized chunk that
                  holds the bit }
                inc(setnode.location.reference.offset,
                  ((elemnode.location.value-setbase) div bitsperop)*tcgsize2size[opsize]);
                cg.a_op_const_ref(current_asmdata.CurrAsmList,cgop,opsize,bitmask,setnode.location.reference);
              end;
            LOC_CREGISTER :
              cg.a_op_const_reg(current_asmdata.CurrAsmList,cgop,setnode.location.size,bitmask,setnode.location.register);
            else
              internalerror(200405022);
          end;
        end
      else
        begin
          orgsize:=opsize;
          if opsize in [OS_8,OS_S8] then
            begin
              opdef:=u32inttype;
              opsize:=OS_32;
            end;
          { determine asm operator }
          if inlinenumber=in_include_x_y then
            asmop:=A_BTS
          else
            asmop:=A_BTR;
          hlcg.location_force_reg(current_asmdata.CurrAsmList,elemnode.location,elemnode.resultdef,opdef,true);
          register_maybe_adjust_setbase(current_asmdata.CurrAsmList,elemnode.resultdef,elemnode.location,setbase);
          elemreg:=elemnode.location.register;
          if (setnode.location.loc=LOC_REFERENCE) then
            emit_reg_ref(asmop,tcgsize2opsize[opsize],elemreg,setnode.location.reference)
          else
            begin
              { second argument can't be an 8 bit register either }
              setreg:=setnode.location.register;
              if (orgsize in [OS_8,OS_S8]) then
                setreg:=cg.makeregsize(current_asmdata.CurrAsmList,setreg,opsize);
              emit_reg_reg(asmop,tcgsize2opsize[opsize],elemreg,setreg);
            end;
        end;
    end;
  { Second pass for popcnt: emits POPCNT from the operand (register or
    memory) into a fresh integer register.  The operation uses the
    unsigned variant of the operand size, with 8 bit widened to
    16 bit. }
  procedure tx86inlinenode.second_popcnt;
    var
      opsize : tcgsize;
    begin
      secondpass(left);
      opsize:=tcgsize2unsigned[left.location.size];
      { no 8 bit popcnt }
      if opsize=OS_8 then
        opsize:=OS_16;
      { anything that is neither a register nor a memory location, or
        that has a different size, is loaded into a register of the
        operation size }
      if not(left.location.loc in [LOC_REGISTER,LOC_CREGISTER,LOC_REFERENCE,LOC_CREFERENCE]) or
         (left.location.size<>opsize) then
        hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,cgsize_orddef(opsize),true);

      location_reset(location,LOC_REGISTER,opsize);
      location.register:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
      if not(left.location.loc in [LOC_REGISTER,LOC_CREGISTER]) then
        emit_ref_reg(A_POPCNT,TCGSize2OpSize[opsize],left.location.reference,location.register)
      else
        emit_reg_reg(A_POPCNT,TCGSize2OpSize[opsize],left.location.register,location.register);
    end;
  { Second-pass code generation for the fused multiply-add inline node.

    The three call parameters are gathered into args[1..3] (args[3] is the
    head of the parameter list, args[1] is reached after two nextpara hops).
    A unary minus sitting directly on a parameter is not evaluated; it is
    folded into the opcode choice instead: a minus on args[1] or args[2]
    toggles "negated product", a minus on args[3] selects "negated third
    operand".

    At most one operand stays in memory, every other operand that is not
    already in an MM register is forced into one.  The result lives in a
    freshly allocated MM register which is first loaded with one operand
    (the accumulator) and then combined with the two remaining operands by
    a single FMA instruction.

    Raises internalerror(2014032301) when the selected CPU type offers
    neither FMA nor FMA4 (and unconditionally when built for i8086). }
  procedure tx86inlinenode.second_fma;
    const
      { opcode table, indexed by
          [product negated, third operand negated, float type, form];
        forms 0..2 hold the 231 variant, form 3 holds the 213 variant }
      fmaops : array[false..true,false..true,s32real..s64real,0..3] of TAsmOp =
        (
          { positive product }
          (
            { positive third operand }
            ((A_VFMADD231SS,A_VFMADD231SS,A_VFMADD231SS,A_VFMADD213SS),
             (A_VFMADD231SD,A_VFMADD231SD,A_VFMADD231SD,A_VFMADD213SD)
            ),
            { negative third operand }
            ((A_VFMSUB231SS,A_VFMSUB231SS,A_VFMSUB231SS,A_VFMSUB213SS),
             (A_VFMSUB231SD,A_VFMSUB231SD,A_VFMSUB231SD,A_VFMSUB213SD)
            )
          ),
          { negative product }
          (
            { positive third operand }
            ((A_VFNMADD231SS,A_VFNMADD231SS,A_VFNMADD231SS,A_VFNMADD213SS),
             (A_VFNMADD231SD,A_VFNMADD231SD,A_VFNMADD231SD,A_VFNMADD213SD)
            ),
            { negative third operand }
            ((A_VFNMSUB231SS,A_VFNMSUB231SS,A_VFNMSUB231SS,A_VFNMSUB213SS),
             (A_VFNMSUB231SD,A_VFNMSUB231SD,A_VFNMSUB231SD,A_VFNMSUB213SD)
            )
          )
        );
    var
      args : array[1..3] of tnode;
      argno,
      { index of the operand kept in memory, 0 if all are in registers }
      memarg,
      { index of the operand copied into the result register up front }
      accarg,
      { indices of the operands passed first and second to the emit helper }
      firstsrc,
      secondsrc,
      { last index into fmaops }
      form : integer;
      negaddend,
      negproduct : boolean;
    begin
{$ifndef i8086}
      if (cpu_capabilities[current_settings.cputype]*[CPUX86_HAS_FMA,CPUX86_HAS_FMA4])<>[] then
        begin
          args[3]:=tcallparanode(parameters).paravalue;
          args[2]:=tcallparanode(tcallparanode(parameters).nextpara).paravalue;
          args[1]:=tcallparanode(tcallparanode(tcallparanode(parameters).nextpara).nextpara).paravalue;

          { A leading unary minus can be dropped because negating a floating
            point value only flips its sign; the sign change is expressed
            through the opcode instead.  The skipped unary minus node is not
            released here, it is freed together with the other nodes, there
            is merely no code generated for it. }
          negproduct:=false;
          negaddend:=false;
          for argno:=1 to 3 do
            if args[argno].nodetype=unaryminusn then
              begin
                args[argno]:=tunarynode(args[argno]).left;
                if argno=3 then
                  negaddend:=true
                else
                  negproduct:=not(negproduct);
              end;

          for argno:=1 to 3 do
            secondpass(args[argno]);

          { only one memory operand is allowed: the first operand found in a
            reference keeps its place, everything else that is not yet in an
            MM register gets loaded into one }
          memarg:=0;
          for argno:=1 to 3 do
            begin
              if not(args[argno].location.loc in [LOC_MMREGISTER,LOC_CMMREGISTER]) then
                begin
                  if (memarg=0) and (args[argno].location.loc in [LOC_REFERENCE,LOC_CREFERENCE]) then
                    memarg:=argno
                  else
                    hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,args[argno].location,args[argno].resultdef,true);
                end;
            end;

          location_reset(location,LOC_MMREGISTER,args[1].location.size);
          location.register:=cg.getmmregister(current_asmdata.CurrAsmList,location.size);

          { decide which operand is preloaded into the result register,
            which two operands are handed to the instruction and which
            encoding form is needed for that arrangement }
          if memarg<>0 then
            begin
              form:=memarg;
              firstsrc:=memarg;
              case memarg of
                1:
                  begin
                    accarg:=3;
                    secondsrc:=2;
                  end;
                2:
                  begin
                    accarg:=3;
                    secondsrc:=1;
                  end;
                3:
                  begin
                    accarg:=1;
                    secondsrc:=2;
                  end
                else
                  internalerror(2014041301);
              end;
            end
          { try to use the location which is already in a temp. mm register
            as destination, so the compiler might be able to re-use the
            register }
          else if args[1].location.loc=LOC_MMREGISTER then
            begin
              accarg:=1;
              firstsrc:=3;
              secondsrc:=2;
              form:=3;
            end
          else if args[2].location.loc=LOC_MMREGISTER then
            begin
              accarg:=2;
              firstsrc:=3;
              secondsrc:=1;
              form:=3;
            end
          else
            begin
              accarg:=3;
              firstsrc:=1;
              secondsrc:=2;
              form:=0;
            end;

          hlcg.a_loadmm_reg_reg(current_asmdata.CurrAsmList,args[accarg].resultdef,resultdef,
            args[accarg].location.register,location.register,mms_movescalar);

          if memarg<>0 then
            emit_ref_reg_reg(fmaops[negproduct,negaddend,tfloatdef(resultdef).floattype,form],S_NO,
              args[firstsrc].location.reference,args[secondsrc].location.register,location.register)
          else
            emit_reg_reg_reg(fmaops[negproduct,negaddend,tfloatdef(resultdef).floattype,form],S_NO,
              args[firstsrc].location.register,args[secondsrc].location.register,location.register);
        end
      else
{$endif i8086}
        internalerror(2014032301);
    end;
  { Second-pass code generation for the fractional part of a real value
    on the vector FPU.

    The operand is forced into an MM register, rounded with rounding
    immediate 3 (round toward zero) and the rounded value is subtracted
    from the operand; the difference ends up in a newly allocated MM
    register.

    Raises internalerror(2017052101) if the result type is not handled by
    the vector FPU, and internalerror(2017052102)/(2017052103) if the
    float type is neither s32real nor s64real. }
  procedure tx86inlinenode.second_frac_real;
    var
      truncreg : TRegister;
      roundop,
      subop : TAsmOp;
    begin
      if not(use_vectorfpu(resultdef)) then
        internalerror(2017052101)
      else
        begin
          secondpass(left);
          hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,true);
          location_reset(location,LOC_MMREGISTER,left.location.size);
          location.register:=cg.getmmregister(current_asmdata.CurrAsmList,location.size);
          if UseAVX then
            begin
              case tfloatdef(resultdef).floattype of
                s32real:
                  begin
                    roundop:=A_VROUNDSS;
                    subop:=A_VSUBSS;
                  end;
                s64real:
                  begin
                    roundop:=A_VROUNDSD;
                    subop:=A_VSUBSD;
                  end;
                else
                  internalerror(2017052102);
              end;
              { using left.location.register here as 3rd parameter is crucial to break dependency chains }
              current_asmdata.CurrAsmList.concat(taicpu.op_const_reg_reg_reg(roundop,S_NO,3,
                left.location.register,left.location.register,location.register));
              { result:=operand-rounded operand }
              current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(subop,S_NO,
                location.register,left.location.register,location.register));
            end
          else
            begin
              { the two-operand SSE forms overwrite their destination, so the
                rounded value goes to a scratch register while the result
                register starts out as a copy of the operand }
              truncreg:=cg.getmmregister(current_asmdata.CurrAsmList,location.size);
              cg.a_loadmm_loc_reg(current_asmdata.CurrAsmList,location.size,left.location,location.register,mms_movescalar);
              case tfloatdef(resultdef).floattype of
                s32real:
                  begin
                    roundop:=A_ROUNDSS;
                    subop:=A_SUBSS;
                  end;
                s64real:
                  begin
                    roundop:=A_ROUNDSD;
                    subop:=A_SUBSD;
                  end;
                else
                  internalerror(2017052103);
              end;
              current_asmdata.CurrAsmList.concat(taicpu.op_const_reg_reg(roundop,S_NO,3,left.location.register,truncreg));
              current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(subop,S_NO,truncreg,location.register));
            end;
        end;
    end;
  { Second-pass code generation for the integral part of a real value on
    the vector FPU.

    The operand is forced into an MM register and rounded with rounding
    immediate 3 (round toward zero) straight into a newly allocated MM
    register, which becomes the node's location.

    Raises internalerror(2017052107) if the result type is not handled by
    the vector FPU, and internalerror(2017052105)/(2017052106) if the
    float type is neither s32real nor s64real.

    Note: the former local "extrareg" was never referenced and has been
    removed; the generated code is unchanged. }
  procedure tx86inlinenode.second_int_real;
    begin
      if use_vectorfpu(resultdef) then
        begin
          secondpass(left);
          hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,true);
          location_reset(location,LOC_MMREGISTER,left.location.size);
          location.register:=cg.getmmregister(current_asmdata.CurrAsmList,location.size);
          if UseAVX then
            case tfloatdef(resultdef).floattype of
              s32real:
                { using left.location.register here as 3rd parameter is crucial to break dependency chains }
                current_asmdata.CurrAsmList.concat(taicpu.op_const_reg_reg_reg(A_VROUNDSS,S_NO,3,
                  left.location.register,left.location.register,location.register));
              s64real:
                { using left.location.register here as 3rd parameter is crucial to break dependency chains }
                current_asmdata.CurrAsmList.concat(taicpu.op_const_reg_reg_reg(A_VROUNDSD,S_NO,3,
                  left.location.register,left.location.register,location.register));
              else
                internalerror(2017052105);
            end
          else
            begin
              case tfloatdef(resultdef).floattype of
                s32real:
                  current_asmdata.CurrAsmList.concat(taicpu.op_const_reg_reg(A_ROUNDSS,S_NO,3,
                    left.location.register,location.register));
                s64real:
                  current_asmdata.CurrAsmList.concat(taicpu.op_const_reg_reg(A_ROUNDSD,S_NO,3,
                    left.location.register,location.register));
                else
                  internalerror(2017052106);
              end;
            end;
        end
      else
        internalerror(2017052107);
    end;
  972. end.