nx86inl.pas 65 KB


  1. {
  2. Copyright (c) 1998-2002 by Florian Klaempfl
  3. Generate x86 inline nodes
  4. This program is free software; you can redistribute it and/or modify
  5. it under the terms of the GNU General Public License as published by
  6. the Free Software Foundation; either version 2 of the License, or
  7. (at your option) any later version.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. GNU General Public License for more details.
  12. You should have received a copy of the GNU General Public License
  13. along with this program; if not, write to the Free Software
  14. Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  15. ****************************************************************************
  16. }
  17. unit nx86inl;
  18. {$i fpcdefs.inc}
  19. interface
  20. uses
  21. node,ninl,ncginl;
  22. type
  tx86inlinenode = class(tcginlinenode)
  protected
    { Hook called by first_round_real, first_trunc_real and the SSE4.1
      path of first_frac_real. The implementation in this unit is empty
      (see its body: "only makes a difference for x86_64"). }
    procedure maybe_remove_round_trunc_typeconv; virtual;
  public
    { type checking for the x86 specific inline numbers }
    function pass_typecheck_cpu: tnode; override;

    { First pass overrides, so that the code generator will actually
      generate these nodes. }
    function first_cpu: tnode; override;
    function first_pi: tnode; override;
    function first_arctan_real: tnode; override;
    function first_abs_real: tnode; override;
    function first_sqr_real: tnode; override;
    function first_sqrt_real: tnode; override;
    function first_ln_real: tnode; override;
    function first_cos_real: tnode; override;
    function first_sin_real: tnode; override;
    function first_round_real: tnode; override;
    function first_trunc_real: tnode; override;
    function first_popcnt: tnode; override;
    function first_fma: tnode; override;
    function first_frac_real: tnode; override;
    function first_int_real: tnode; override;
    function first_minmax: tnode; override;

    function simplify(forinline: boolean): tnode; override;

    { Second pass overrides which generate the code for these nodes. }
    procedure pass_generate_code_cpu; override;
    procedure second_IncludeExclude; override;
    procedure second_AndOrXorShiftRot_assign; override;
    procedure second_pi; override;
    procedure second_arctan_real; override;
    procedure second_abs_real; override;
    procedure second_round_real; override;
    procedure second_sqr_real; override;
    procedure second_sqrt_real; override;
    procedure second_ln_real; override;
    procedure second_cos_real; override;
    procedure second_sin_real; override;
    procedure second_trunc_real; override;
    procedure second_prefetch; override;
    procedure second_abs_long; override;
    procedure second_popcnt; override;
    procedure second_fma; override;
    procedure second_frac_real; override;
    procedure second_int_real; override;
    procedure second_high; override;
    procedure second_minmax; override;
  private
    { Runs the second pass on lnode and loads its value into the FPU
      result register, which becomes this node's location. }
    procedure load_fpu_location(lnode: tnode);
  end;
  74. implementation
  75. uses
  76. systems,
  77. globtype,globals,
  78. verbose,compinnr,fmodule,
  79. defutil,
  80. aasmbase,aasmdata,aasmcpu,
  81. symconst,symtype,symdef,symcpu,
  82. ncnv,
  83. htypechk,
  84. cgbase,pass_1,pass_2,
  85. cpuinfo,cpubase,nutils,
  86. ncal,ncgutil,nld,ncon,nadd,nmat,constexp,
  87. tgobj,
  88. cga,cgutils,cgx86,cgobj,hlcgobj;
  89. {*****************************************************************************
  90. TX86INLINENODE
  91. *****************************************************************************}
  { Default hook: does nothing here. It is virtual so that a descendant
    can override it. }
  procedure tx86inlinenode.maybe_remove_round_trunc_typeconv;
    begin
      { intentionally empty: only makes a difference for x86_64 }
    end;
  { Type checks the x86 specific intrinsics: validates the parameter count
    where one is required and sets resultdef. Inline numbers not handled
    here (nor by the included x86mmtype.inc) are passed to the inherited
    implementation. Returns nil for the cases handled locally. }
  function tx86inlinenode.pass_typecheck_cpu: tnode;
    begin
      Result:=nil;
      case inlinenumber of
        { port reads: one parameter, result type depends on the variant }
        in_x86_inportb,
        in_x86_inportw,
        in_x86_inportl:
          begin
            CheckParameters(1);
            case inlinenumber of
              in_x86_inportb:
                resultdef:=u8inttype;
              in_x86_inportw:
                resultdef:=u16inttype;
              else
                { in_x86_inportl }
                resultdef:=s32inttype;
            end;
          end;

        { port writes: two parameters, no result }
        in_x86_outportb,
        in_x86_outportw,
        in_x86_outportl:
          begin
            CheckParameters(2);
            resultdef:=voidtype;
          end;

        in_x86_cli,
        in_x86_sti:
          resultdef:=voidtype;

        { segment register getters }
        in_x86_get_cs,
        in_x86_get_ss,
        in_x86_get_ds,
        in_x86_get_es,
        in_x86_get_fs,
        in_x86_get_gs:
{$ifdef i8086}
          resultdef:=u16inttype;
{$else i8086}
          resultdef:=s32inttype;
{$endif i8086}

        { include automatically generated code }
{$i x86mmtype.inc}

        else
          Result:=inherited pass_typecheck_cpu;
      end;
    end;
  { First pass for the x86 specific intrinsics: only records the expected
    location. Value-returning intrinsics expect a register, the others
    LOC_VOID. Anything not listed here (or in x86mmfirst.inc) is handled by
    the inherited implementation. }
  function tx86inlinenode.first_cpu: tnode;
    begin
      Result:=nil;
      case inlinenumber of
        { intrinsics which do not produce a value }
        in_x86_outportb,
        in_x86_outportw,
        in_x86_outportl,
        in_x86_cli,
        in_x86_sti:
          expectloc:=LOC_VOID;

        { intrinsics which deliver their result in a register }
        in_x86_inportb,
        in_x86_inportw,
        in_x86_inportl,
        in_x86_get_cs,
        in_x86_get_ss,
        in_x86_get_ds,
        in_x86_get_es,
        in_x86_get_fs,
        in_x86_get_gs:
          expectloc:=LOC_REGISTER;

        { include automatically generated code }
{$i x86mmfirst.inc}

        else
          Result:=inherited first_cpu;
      end;
    end;
  { pi is generated inline (result in an FPU register) only when the best
    real type is the 80 bit extended type; otherwise the inherited
    handling is used. }
  function tx86inlinenode.first_pi : tnode;
    begin
      if tfloatdef(pbestrealtype^).floattype<>s80real then
        begin
          result:=inherited;
          exit;
        end;
      expectloc:=LOC_FPUREGISTER;
      result:=nil;
    end;
  { arctan is generated inline (result in an FPU register) when the best
    real type is s80real; in every other case the inherited handling is
    used. }
  function tx86inlinenode.first_arctan_real : tnode;
    begin
{$ifdef i8086}
      { FPATAN's range is limited to (0 <= value < 1) on the 8087 and 80287,
        so we need to use the RTL helper on these FPUs }
      if current_settings.cputype < cpu_386 then
        begin
          result:=inherited;
          exit;
        end;
{$endif i8086}
      if tfloatdef(pbestrealtype^).floattype<>s80real then
        begin
          result:=inherited;
          exit;
        end;
      expectloc:=LOC_FPUREGISTER;
      result:=nil;
    end;
  { abs() on reals is always generated inline: the result is expected in
    an MM register when the vector FPU is used for resultdef, in an FPU
    register otherwise. }
  function tx86inlinenode.first_abs_real : tnode;
    begin
      result:=nil;
      if not use_vectorfpu(resultdef) then
        expectloc:=LOC_FPUREGISTER
      else
        expectloc:=LOC_MMREGISTER;
    end;
  { sqr() on reals is always generated inline: the result is expected in
    an MM register when the vector FPU is used for resultdef, in an FPU
    register otherwise. }
  function tx86inlinenode.first_sqr_real : tnode;
    begin
      result:=nil;
      if not use_vectorfpu(resultdef) then
        expectloc:=LOC_FPUREGISTER
      else
        expectloc:=LOC_MMREGISTER;
    end;
  { sqrt() on reals is always generated inline: the result is expected in
    an MM register when the vector FPU is used for resultdef, in an FPU
    register otherwise. }
  function tx86inlinenode.first_sqrt_real : tnode;
    begin
      result:=nil;
      if not use_vectorfpu(resultdef) then
        expectloc:=LOC_FPUREGISTER
      else
        expectloc:=LOC_MMREGISTER;
    end;
  { ln is generated inline (result in an FPU register) only when the best
    real type is s80real; otherwise the inherited handling is used. }
  function tx86inlinenode.first_ln_real : tnode;
    begin
      if tfloatdef(pbestrealtype^).floattype<>s80real then
        begin
          result:=inherited;
          exit;
        end;
      expectloc:=LOC_FPUREGISTER;
      result:=nil;
    end;
  { cos is generated inline (result in an FPU register) when the best real
    type is s80real; in every other case the inherited handling is used. }
  function tx86inlinenode.first_cos_real : tnode;
    begin
{$ifdef i8086}
      { FCOS is 387+ }
      if current_settings.cputype < cpu_386 then
        begin
          result:=inherited;
          exit;
        end;
{$endif i8086}
      if tfloatdef(pbestrealtype^).floattype<>s80real then
        begin
          result:=inherited;
          exit;
        end;
      expectloc:=LOC_FPUREGISTER;
      result:=nil;
    end;
  { sin is generated inline (result in an FPU register) when the best real
    type is s80real; in every other case the inherited handling is used. }
  function tx86inlinenode.first_sin_real : tnode;
    begin
{$ifdef i8086}
      { FSIN is 387+ }
      if current_settings.cputype < cpu_386 then
        begin
          result:=inherited;
          exit;
        end;
{$endif i8086}
      if tfloatdef(pbestrealtype^).floattype<>s80real then
        begin
          result:=inherited;
          exit;
        end;
      expectloc:=LOC_FPUREGISTER;
      result:=nil;
    end;
  { round() is always generated inline. On x86_64 with a vector FPU
    operand the result is expected in an integer register, otherwise in
    memory (LOC_REFERENCE). }
  function tx86inlinenode.first_round_real : tnode;
    begin
      result:=nil;
      { may change left, so it has to run before left.resultdef is examined }
      maybe_remove_round_trunc_typeconv;
{$ifdef x86_64}
      if use_vectorfpu(left.resultdef) then
        expectloc:=LOC_REGISTER
      else
{$endif x86_64}
        expectloc:=LOC_REFERENCE;
    end;
  { trunc() is generated inline unless optimizing for size (on x86_64 the
    size exception applies only when the operand does not use the vector
    FPU). The inline variant expects an integer register for x86_64 vector
    FPU operands and a memory location otherwise. }
  function tx86inlinenode.first_trunc_real: tnode;
    begin
      maybe_remove_round_trunc_typeconv;

      if (cs_opt_size in current_settings.optimizerswitches)
{$ifdef x86_64}
        and not(use_vectorfpu(left.resultdef))
{$endif x86_64}
        then
        begin
          result:=inherited;
          exit;
        end;

      result:=nil;
{$ifdef x86_64}
      if use_vectorfpu(left.resultdef) then
        expectloc:=LOC_REGISTER
      else
{$endif x86_64}
        expectloc:=LOC_REFERENCE;
    end;
  { popcnt is generated inline (result in a register) when the selected
    CPU type has CPUX86_HAS_POPCNT; on i386 additionally only for operands
    which are not 64 bit. i8086 and all remaining cases use the inherited
    handling. }
  function tx86inlinenode.first_popcnt: tnode;
    begin
      Result:=nil;
{$ifndef i8086}
      if (CPUX86_HAS_POPCNT in cpu_capabilities[current_settings.cputype])
{$ifdef i386}
         and not is_64bit(left.resultdef)
{$endif i386}
         then
        expectloc:=LOC_REGISTER
      else
{$endif not i8086}
        Result:=inherited first_popcnt;
    end;
  { fma is generated inline (result in an MM register) when the selected
    FPU type offers FMA or FMA4 and the result is a double or a single.
    i8086 and all remaining cases use the inherited handling. }
  function tx86inlinenode.first_fma : tnode;
    begin
{$ifndef i8086}
      if ((fpu_capabilities[current_settings.fputype]*[FPUX86_HAS_FMA,FPUX86_HAS_FMA4])<>[]) and
         (is_double(resultdef) or is_single(resultdef)) then
        begin
          Result:=nil;
          expectloc:=LOC_MMREGISTER;
        end
      else
{$endif i8086}
        Result:=inherited first_fma;
    end;
  { frac() is generated inline (result in an MM register) for doubles and
    singles when the FPU type is at least SSE4.1; otherwise the inherited
    handling is used. }
  function tx86inlinenode.first_frac_real : tnode;
    begin
      if (current_settings.fputype<fpu_sse41) or
         not(is_double(resultdef) or is_single(resultdef)) then
        begin
          Result:=inherited first_frac_real;
          exit;
        end;
      maybe_remove_round_trunc_typeconv;
      expectloc:=LOC_MMREGISTER;
      Result:=nil;
    end;
  { int() is generated inline (result in an MM register) for doubles and
    singles when the FPU type is at least SSE4.1; otherwise the inherited
    handling is used. }
  function tx86inlinenode.first_int_real : tnode;
    begin
      if (current_settings.fputype<fpu_sse41) or
         not(is_double(resultdef) or is_single(resultdef)) then
        begin
          Result:=inherited first_int_real;
          exit;
        end;
      Result:=nil;
      expectloc:=LOC_MMREGISTER;
    end;
  { min/max is generated inline (result in an MM register) for singles and
    doubles. On i386 this additionally requires an FPU type of at least SSE
    (single) or SSE2 (double). i8086 and all remaining cases use the
    inherited handling. }
  function tx86inlinenode.first_minmax: tnode;
    begin
{$ifndef i8086}
      if
{$ifdef i386}
         ((current_settings.fputype>=fpu_sse) and is_single(resultdef)) or
         ((current_settings.fputype>=fpu_sse2) and is_double(resultdef))
{$else i386}
         (is_double(resultdef) or is_single(resultdef))
{$endif i386}
         then
        begin
          Result:=nil;
          expectloc:=LOC_MMREGISTER;
        end
      else
{$endif i8086}
        Result:=inherited first_minmax;
    end;
  { With an FPU type of at least SSE4.1, an int() node whose operand is a
    non-explicit type conversion of a single or double is rewritten to work
    directly on the unconverted operand: the conversion node is freed and a
    type checked copy of this node, with the operand's type as result type,
    is returned. All other cases are handled by the inherited simplify. }
  function tx86inlinenode.simplify(forinline : boolean) : tnode;
    var
      inner : tnode;
    begin
      if (current_settings.fputype>=fpu_sse41) and
         (inlinenumber=in_int_real) and
         (left.nodetype=typeconvn) and
         not(nf_explicit in left.flags) then
        begin
          inner:=ttypeconvnode(left).left;
          if (inner.resultdef.typ=floatdef) and
             (is_double(inner.resultdef) or is_single(inner.resultdef)) then
            begin
              { get rid of the type conversion; detach its operand first so
                that freeing the conversion node does not free it as well }
              ttypeconvnode(left).left:=nil;
              left.free;
              left:=inner;
              result:=self.getcopy;
              tinlinenode(result).resultdef:=inner.resultdef;
              typecheckpass(result);
              exit;
            end;
        end;
      Result:=inherited simplify(forinline);
    end;
  { Second pass for the x86 specific intrinsics (port I/O, CLI/STI, segment
    register getters) plus the automatically generated cases from
    x86mmsecond.inc. Everything else goes to the inherited implementation.

    NOTE(review): the locals paraarray, i and op as well as the helpers
    GetConstInt, GetParameters, location_force_mmxreg and location_make_ref
    are not referenced by the code visible here; presumably they are used by
    x86mmsecond.inc, so their names and signatures must stay as they are. }
  procedure tx86inlinenode.pass_generate_code_cpu;
    var
      paraarray : array[1..4] of tnode;
      i : integer;
      op: TAsmOp;

      { Emits an IN into the fixed register dreg and copies the value into
        a freshly allocated integer register, which becomes the location
        of this node. A constant port number in 0..255 uses the immediate
        form, any other port number is loaded into DX first. }
      procedure inport(dreg:TRegister;dsize:topsize;dtype:tdef);
        var
          portnode: tnode;
        begin
          portnode:=left;
          secondpass(portnode);
          if (portnode.location.loc=LOC_CONSTANT) and
             (portnode.location.value>=0) and
             (portnode.location.value<=255) then
            begin
              hlcg.getcpuregister(current_asmdata.CurrAsmList,dreg);
              current_asmdata.CurrAsmList.concat(taicpu.op_const_reg(A_IN,dsize,portnode.location.value,dreg));
            end
          else
            begin
              hlcg.getcpuregister(current_asmdata.CurrAsmList,NR_DX);
              hlcg.a_load_loc_reg(current_asmdata.CurrAsmList,portnode.resultdef,u16inttype,portnode.location,NR_DX);
              hlcg.getcpuregister(current_asmdata.CurrAsmList,dreg);
              current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_IN,dsize,NR_DX,dreg));
              hlcg.ungetcpuregister(current_asmdata.CurrAsmList,NR_DX);
            end;
          { common tail: move the value out of the fixed register }
          location_reset(location,LOC_REGISTER,def_cgsize(resultdef));
          location.register:=hlcg.getintregister(current_asmdata.CurrAsmList,resultdef);
          hlcg.ungetcpuregister(current_asmdata.CurrAsmList,dreg);
          hlcg.a_load_reg_reg(current_asmdata.CurrAsmList,dtype,resultdef,dreg,location.register);
        end;

      { Loads the data parameter into the fixed register dreg and emits an
        OUT, using the immediate form for a constant port number in 0..255
        and DX for any other port number. The data node is taken from the
        first entry of the parameter list, the port node from the entry
        following it. }
      procedure outport(dreg:TRegister;dsize:topsize;dtype:tdef);
        var
          portnode, datanode: tnode;
        begin
          portnode:=tcallparanode(tcallparanode(left).right).left;
          datanode:=tcallparanode(left).left;
          secondpass(datanode);
          secondpass(portnode);
          hlcg.getcpuregister(current_asmdata.CurrAsmList,dreg);
          hlcg.a_load_loc_reg(current_asmdata.CurrAsmList,datanode.resultdef,dtype,datanode.location,dreg);
          if (portnode.location.loc=LOC_CONSTANT) and
             (portnode.location.value>=0) and
             (portnode.location.value<=255) then
            current_asmdata.CurrAsmList.concat(taicpu.op_reg_const(A_OUT,dsize,dreg,portnode.location.value))
          else
            begin
              hlcg.getcpuregister(current_asmdata.CurrAsmList,NR_DX);
              hlcg.a_load_loc_reg(current_asmdata.CurrAsmList,portnode.resultdef,u16inttype,portnode.location,NR_DX);
              current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_OUT,dsize,dreg,NR_DX));
              hlcg.ungetcpuregister(current_asmdata.CurrAsmList,NR_DX);
            end;
          hlcg.ungetcpuregister(current_asmdata.CurrAsmList,dreg);
        end;

      { Moves the segment register segreg into a new integer register of
        the result type's size. }
      procedure get_segreg(segreg:tregister);
        var
          ressize: tcgsize;
        begin
          ressize:=def_cgsize(resultdef);
          location_reset(location,LOC_REGISTER,ressize);
          location.register:=hlcg.getintregister(current_asmdata.CurrAsmList,resultdef);
          current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_MOV,TCGSize2OpSize[ressize],segreg,location.register));
        end;

      { Returns the value of the integer constant node n. If n is not an
        integer constant, an error message is issued and 0 is returned. }
      function GetConstInt(n: tnode): longint;
        begin
          Result:=0;
          if not is_constintnode(n) then
            Message(type_e_constant_expr_expected)
          else
            result:=tordconstnode(n).value.svalue;
        end;

      { Stores the values of the first count parameters in paraarray.
        The parameter list is walked starting at left and filled in from
        index count down to index 1. A single parameter which is not
        wrapped in a tcallparanode is stored directly. }
      procedure GetParameters(count: longint);
        var
          idx: longint;
          para: tnode;
        begin
          if (count=1) and
             (not (left is tcallparanode)) then
            begin
              paraarray[1]:=left;
              exit;
            end;
          para:=left;
          for idx:=count downto 1 do
            begin
              paraarray[idx]:=tcallparanode(para).paravalue;
              para:=tcallparanode(para).nextpara;
            end;
        end;

      { Makes sure l lives in an MMX register. Nothing is done when l is
        already a LOC_MMXREGISTER, or a LOC_CMMXREGISTER with maybeconst
        set. }
      procedure location_force_mmxreg(list:TAsmList;var l: tlocation;maybeconst:boolean);
        var
          mmxreg : tregister;
        begin
          if (l.loc=LOC_MMXREGISTER) or
             ((l.loc=LOC_CMMXREGISTER) and maybeconst) then
            exit;
          mmxreg:=tcgx86(cg).getmmxregister(list);
          cg.a_loadmm_loc_reg(list,OS_M64,l,mmxreg,nil);
          location_freetemp(list,l);
          location_reset(l,LOC_MMXREGISTER,OS_M64);
          l.register:=mmxreg;
        end;

      { Turns loc into a reference. References are left untouched. Any
        other location is first forced into a register (unless it already
        is one) and then replaced by a reference which uses that register
        as base. }
      procedure location_make_ref(var loc: tlocation);
        var
          refloc: tlocation;
        begin
          if loc.loc in [LOC_CREFERENCE,LOC_REFERENCE] then
            exit;
          if not(loc.loc in [LOC_CREGISTER,LOC_REGISTER]) then
            hlcg.location_force_reg(current_asmdata.CurrAsmList,loc,u32inttype,u32inttype,false);
          location_reset_ref(refloc, LOC_REFERENCE, OS_32, 1, []);
          refloc.reference.base:=loc.register;
          loc:=refloc;
        end;

    begin
      FillChar(paraarray,sizeof(paraarray),0);
      case inlinenumber of
        { port input }
        in_x86_inportb:
          inport(NR_AL,S_B,u8inttype);
        in_x86_inportw:
          inport(NR_AX,S_W,u16inttype);
        in_x86_inportl:
          inport(NR_EAX,S_L,s32inttype);

        { port output }
        in_x86_outportb:
          outport(NR_AL,S_B,u8inttype);
        in_x86_outportw:
          outport(NR_AX,S_W,u16inttype);
        in_x86_outportl:
          outport(NR_EAX,S_L,s32inttype);

        in_x86_cli:
          current_asmdata.CurrAsmList.concat(taicpu.op_none(A_CLI));
        in_x86_sti:
          current_asmdata.CurrAsmList.concat(taicpu.op_none(A_STI));

        { segment register getters }
        in_x86_get_cs:
          get_segreg(NR_CS);
        in_x86_get_ss:
          get_segreg(NR_SS);
        in_x86_get_ds:
          get_segreg(NR_DS);
        in_x86_get_es:
          get_segreg(NR_ES);
        in_x86_get_fs:
          get_segreg(NR_FS);
        in_x86_get_gs:
          get_segreg(NR_GS);

{$i x86mmsecond.inc}

        else
          inherited pass_generate_code_cpu;
      end;
    end;
  { Code generation for the "x op= y" intrinsics. The only case handled
    here is a peephole for "x and= ((1 shl y) - 1)": at optimization level 2
    on a CPU type with BMI2 it is emitted as a single BZHI (plus a store
    when x lives in memory). Everything else, and every i8086 build, falls
    through to the inherited implementation. }
  procedure tx86inlinenode.second_AndOrXorShiftRot_assign;
    var
      opsize : tcgsize;
      valuenode, indexnode, loadnode: TNode;
      resultreg: TRegister;
    begin
{$ifndef i8086}
      if (cs_opt_level2 in current_settings.optimizerswitches) then
        begin
          { Saves on a lot of typecasting and potential coding mistakes }
          valuenode := tcallparanode(left).left;
          loadnode := tcallparanode(tcallparanode(left).right).left;
          opsize := def_cgsize(loadnode.resultdef);

          { BMI2 optimisation: if the second operand is "((1 shl y) - 1)",
            we can turn the operation into a BZHI instead }
          if (CPUX86_HAS_BMI2 in cpu_capabilities[current_settings.cputype]) and
             (inlinenumber=in_and_assign_x_y) and
             (opsize in [OS_32, OS_S32{$ifdef x86_64}, OS_64, OS_S64{$endif x86_64}]) and
             (valuenode.nodetype = subn) and
             (taddnode(valuenode).right.nodetype = ordconstn) and
             (tordconstnode(taddnode(valuenode).right).value = 1) and
             (taddnode(valuenode).left.nodetype = shln) and
             (tshlshrnode(taddnode(valuenode).left).left.nodetype = ordconstn) and
             (tordconstnode(tshlshrnode(taddnode(valuenode).left).left).value = 1) then
            begin
              { Skip the subtract and shift nodes completely; only the
                shift count is evaluated }
              indexnode := tshlshrnode(taddnode(valuenode).left).right;
{$ifdef x86_64}
              { The code generator sometimes extends the shift result to 64-bit unnecessarily }
              if (indexnode.nodetype = typeconvn) and
                 (opsize in [OS_32, OS_S32]) and
                 (def_cgsize(TTypeConvNode(indexnode).resultdef) in [OS_64, OS_S64]) then
                begin
                  { Convert to the 32-bit type }
                  indexnode.resultdef := loadnode.resultdef;
                  node_reset_flags(indexnode,[nf_pass1_done]);
                  { We shouldn't be getting any new errors }
                  if do_firstpass(indexnode) then
                    InternalError(2022110202);
                  { Keep things internally consistent in case indexnode changed }
                  tshlshrnode(taddnode(valuenode).left).right := indexnode;
                end;
{$endif x86_64}
              secondpass(indexnode);
              secondpass(loadnode);

              { the index has to be in a register of the destination's type }
              hlcg.location_force_reg(current_asmdata.CurrAsmList,
                indexnode.location,indexnode.resultdef,loadnode.resultdef,false);

              case loadnode.location.loc of
                LOC_REGISTER,
                LOC_CREGISTER:
                  emit_reg_reg_reg(A_BZHI, TCGSize2OpSize[opsize],
                    indexnode.location.register, loadnode.location.register, loadnode.location.register);
                LOC_REFERENCE,
                LOC_CREFERENCE:
                  begin
                    { BZHI can only write to a register, so go through a
                      temporary register and store it back afterwards }
                    resultreg := cg.getintregister(current_asmdata.CurrAsmList,opsize);
                    emit_reg_ref_reg(A_BZHI, TCGSize2OpSize[opsize],
                      indexnode.location.register, loadnode.location.reference, resultreg);
                    emit_reg_ref(A_MOV, TCGSize2OpSize[opsize], resultreg, loadnode.location.reference);
                  end;
                else
                  InternalError(2022102120);
              end;
              Exit;
            end;
        end;
{$endif not i8086}
      inherited second_AndOrXorShiftRot_assign;
    end;
  { Emits FLDPI and reports the result in the FPU result register; the
    code generator's FPU stack bookkeeping is adjusted for the value the
    instruction pushes. }
  procedure tx86inlinenode.second_pi;
    begin
      location_reset(location,LOC_FPUREGISTER,def_cgsize(resultdef));
      location.register:=NR_FPU_RESULT_REG;
      emit_none(A_FLDPI,S_NO);
      tcgx86(cg).inc_fpu_stack;
    end;
  { Runs the second pass on lnode and loads its value into the FPU result
    register, which is set up as the location of this node.
      - LOC_FPUREGISTER: nothing needs to be loaded
      - LOC_CFPUREGISTER / references: an FPU load is emitted
      - MM registers: the location is taken over from lnode and forced
        into an FPU register
    Any other location is an internal error. }
  procedure tx86inlinenode.load_fpu_location(lnode: tnode);
    begin
      location_reset(location,LOC_FPUREGISTER,def_cgsize(resultdef));
      location.register:=NR_FPU_RESULT_REG;
      secondpass(lnode);
      case lnode.location.loc of
        LOC_FPUREGISTER:
          { nothing to load };
        LOC_CFPUREGISTER:
          cg.a_loadfpu_reg_reg(current_asmdata.CurrAsmList,
            lnode.location.size,lnode.location.size,
            lnode.location.register,location.register);
        LOC_REFERENCE,
        LOC_CREFERENCE:
          cg.a_loadfpu_ref_reg(current_asmdata.CurrAsmList,
            lnode.location.size,lnode.location.size,
            lnode.location.reference,location.register);
        LOC_MMREGISTER,
        LOC_CMMREGISTER:
          begin
            location:=lnode.location;
            hlcg.location_force_fpureg(current_asmdata.CurrAsmList,location,lnode.resultdef,false);
          end;
        else
          internalerror(309991);
      end;
    end;
  { arctan(x): loads x onto the FPU stack, pushes 1.0 with FLD1 and lets
    FPATAN combine the two stack entries. }
  procedure tx86inlinenode.second_arctan_real;
    begin
      load_fpu_location(left);
      { second FPATAN operand }
      emit_none(A_FLD1,S_NO);
      emit_none(A_FPATAN,S_NO);
    end;
  { abs() for reals. With the vector FPU the operand is ANDed with the
    external mask FPC_ABSMASK_SINGLE / FPC_ABSMASK_DOUBLE; the AVX forms
    write to a newly allocated register, the SSE forms work in place on the
    operand's register. Without the vector FPU a plain FABS is emitted. }
  procedure tx86inlinenode.second_abs_real;

    { whether the mask symbol has to be referenced indirectly }
    function needs_indirect:boolean; inline;
      begin
        result:=(tf_supports_packages in target_info.flags) and
          (target_info.system in systems_indirect_var_imports);
      end;

    var
      maskref : treference;
      masksym : tasmsymbol;
    begin
      if not use_vectorfpu(resultdef) then
        begin
          load_fpu_location(left);
          emit_none(A_FABS,S_NO);
          exit;
        end;

      secondpass(left);
      if left.location.loc<>LOC_MMREGISTER then
        hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,UseAVX);

      if not UseAVX then
        { the SSE instructions modify the operand in place }
        location:=left.location
      else
        begin
          location_reset(location,LOC_MMREGISTER,def_cgsize(resultdef));
          location.register:=cg.getmmregister(current_asmdata.CurrAsmList,def_cgsize(resultdef));
        end;

      case tfloatdef(resultdef).floattype of
        s32real:
          begin
            masksym:=current_asmdata.RefAsmSymbol(target_info.cprefix+'FPC_ABSMASK_SINGLE',AT_DATA,needs_indirect);
            reference_reset_symbol(maskref,masksym,0,4,[]);
            current_module.add_extern_asmsym(masksym);
            tcgx86(cg).make_simple_ref(current_asmdata.CurrAsmList, maskref);
            if not UseAVX then
              current_asmdata.CurrAsmList.concat(taicpu.op_ref_reg(A_ANDPS,S_XMM,maskref,location.register))
            else
              current_asmdata.CurrAsmList.concat(taicpu.op_ref_reg_reg(
                A_VANDPS,S_XMM,maskref,left.location.register,location.register));
          end;
        s64real:
          begin
            masksym:=current_asmdata.RefAsmSymbol(target_info.cprefix+'FPC_ABSMASK_DOUBLE',AT_DATA,needs_indirect);
            reference_reset_symbol(maskref,masksym,0,4,[]);
            current_module.add_extern_asmsym(masksym);
            tcgx86(cg).make_simple_ref(current_asmdata.CurrAsmList, maskref);
            if not UseAVX then
              current_asmdata.CurrAsmList.concat(taicpu.op_ref_reg(A_ANDPD,S_XMM,maskref,location.register))
            else
              current_asmdata.CurrAsmList.concat(taicpu.op_ref_reg_reg(
                A_VANDPD,S_XMM,maskref,left.location.register,location.register));
          end;
        else
          internalerror(200506081);
      end;
    end;
  { round(): on x86_64 with a vector FPU operand the value is converted
    with (V)CVTSS2SI / (V)CVTSD2SI into a 64 bit integer register. In all
    other cases the value is loaded onto the FPU stack and stored with
    FISTP into a temp, which becomes the result location. }
  procedure tx86inlinenode.second_round_real;
    begin
{$ifdef x86_64}
      if use_vectorfpu(left.resultdef) then
        begin
          secondpass(left);
          hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,true);
          location_reset(location,LOC_REGISTER,OS_S64);
          location.register:=cg.getintregister(current_asmdata.CurrAsmList,OS_S64);
          if not UseAVX then
            case left.location.size of
              OS_F32:
                current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_CVTSS2SI,S_NO,left.location.register,location.register));
              OS_F64:
                current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_CVTSD2SI,S_NO,left.location.register,location.register));
              else
                internalerror(2007031404);
            end
          else
            case left.location.size of
              OS_F32:
                current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_VCVTSS2SI,S_NO,left.location.register,location.register));
              OS_F64:
                current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_VCVTSD2SI,S_NO,left.location.register,location.register));
              else
                internalerror(2007031402);
            end;
        end
      else
{$endif x86_64}
        begin
          load_fpu_location(left);
          location_reset_ref(location,LOC_REFERENCE,OS_S64,0,[]);
          tg.GetTemp(current_asmdata.CurrAsmList,resultdef.size,resultdef.alignment,tt_normal,location.reference);
          { FISTP pops the value from the FPU stack }
          emit_ref(A_FISTP,S_IQ,location.reference);
          tcgx86(cg).dec_fpu_stack;
          emit_none(A_FWAIT,S_NO);
        end;
    end;
  { trunc(): three strategies are used
      1. x86_64 with a vector FPU operand: (V)CVTTSS2SI / (V)CVTTSD2SI
         into a 64 bit integer register
      2. FPU type of at least SSE3: FISTTP into a temp
      3. otherwise: save the FPU control word, load a modified one
         (bits $0f00 set), store with FISTP into a temp and restore the
         saved control word }
  procedure tx86inlinenode.second_trunc_real;
    var
      savedcw,trunccw : treference;
    begin
{$ifdef x86_64}
      { NOTE(review): left.location.loc is examined here before
        secondpass(left) has been run for this node - confirm that this is
        intended }
      if use_vectorfpu(left.resultdef) and
         not((left.location.loc=LOC_FPUREGISTER) and (current_settings.fputype>=fpu_sse3)) then
        begin
          secondpass(left);
          hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,true);
          location_reset(location,LOC_REGISTER,OS_S64);
          location.register:=cg.getintregister(current_asmdata.CurrAsmList,OS_S64);
          if not UseAVX then
            case left.location.size of
              OS_F32:
                current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_CVTTSS2SI,S_NO,left.location.register,location.register));
              OS_F64:
                current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_CVTTSD2SI,S_NO,left.location.register,location.register));
              else
                internalerror(2007031403);
            end
          else
            case left.location.size of
              OS_F32:
                current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_VCVTTSS2SI,S_NO,left.location.register,location.register));
              OS_F64:
                current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_VCVTTSD2SI,S_NO,left.location.register,location.register));
              else
                internalerror(2007031401);
            end;
        end
      else
{$endif x86_64}
        begin
          if (current_settings.fputype>=fpu_sse3) then
            begin
              { FISTTP truncates regardless of the control word }
              load_fpu_location(left);
              location_reset_ref(location,LOC_REFERENCE,OS_S64,0,[]);
              tg.GetTemp(current_asmdata.CurrAsmList,resultdef.size,resultdef.alignment,tt_normal,location.reference);
              emit_ref(A_FISTTP,S_IQ,location.reference);
              tcgx86(cg).dec_fpu_stack;
            end
          else
            begin
              { two 16 bit temps: the control word to restore and the
                modified copy used for the store }
              tg.GetTemp(current_asmdata.CurrAsmList,2,2,tt_normal,savedcw);
              tg.GetTemp(current_asmdata.CurrAsmList,2,2,tt_normal,trunccw);
{$ifdef i8086}
              if current_settings.cputype<=cpu_286 then
                begin
                  emit_ref(A_FSTCW,S_NO,trunccw);
                  emit_ref(A_FSTCW,S_NO,savedcw);
                  emit_none(A_FWAIT,S_NO);
                end
              else
{$endif i8086}
                begin
                  emit_ref(A_FNSTCW,S_NO,trunccw);
                  emit_ref(A_FNSTCW,S_NO,savedcw);
                end;
              { set bits 8..11 of the copy (x87 precision and rounding
                control fields) }
              emit_const_ref(A_OR,S_W,$0f00,trunccw);
              load_fpu_location(left);
              emit_ref(A_FLDCW,S_NO,trunccw);
              location_reset_ref(location,LOC_REFERENCE,OS_S64,0,[]);
              tg.GetTemp(current_asmdata.CurrAsmList,resultdef.size,resultdef.alignment,tt_normal,location.reference);
              emit_ref(A_FISTP,S_IQ,location.reference);
              tcgx86(cg).dec_fpu_stack;
              { back to the original control word }
              emit_ref(A_FLDCW,S_NO,savedcw);
              emit_none(A_FWAIT,S_NO);
              tg.UnGetTemp(current_asmdata.CurrAsmList,savedcw);
              tg.UnGetTemp(current_asmdata.CurrAsmList,trunccw);
            end;
        end;
    end;
  { sqr(): multiplies the operand with itself. The vector FPU variant
    multiplies into a new MM register (three operand form with AVX,
    load + two operand multiply otherwise); the x87 variant emits
    FMUL ST0,ST0. }
  procedure tx86inlinenode.second_sqr_real;
    begin
      if not use_vectorfpu(resultdef) then
        begin
          load_fpu_location(left);
          emit_reg_reg(A_FMUL,S_NO,NR_ST0,NR_ST0);
          exit;
        end;

      secondpass(left);
      location_reset(location,LOC_MMREGISTER,left.location.size);
      location.register:=cg.getmmregister(current_asmdata.CurrAsmList,location.size);
      if not UseAVX then
        begin
          { a value living on the FPU stack is moved to an MM register
            before it is loaded into the result register }
          if left.location.loc in [LOC_CFPUREGISTER,LOC_FPUREGISTER] then
            hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,true);
          cg.a_loadmm_loc_reg(current_asmdata.CurrAsmList,location.size,left.location,location.register,mms_movescalar);
          cg.a_opmm_reg_reg(current_asmdata.CurrAsmList,OP_MUL,left.location.size,location.register,location.register,mms_movescalar);
        end
      else
        begin
          hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,true);
          cg.a_opmm_reg_reg_reg(current_asmdata.CurrAsmList,OP_MUL,left.location.size,
            left.location.register,left.location.register,location.register,mms_movescalar);
        end;
    end;
  { sqrt(): with the vector FPU the operand is forced into an MM register
    and (V)SQRTSS / (V)SQRTSD writes the root into a new MM register;
    otherwise FSQRT is emitted on the FPU stack. }
  procedure tx86inlinenode.second_sqrt_real;
    begin
      if not use_vectorfpu(resultdef) then
        begin
          load_fpu_location(left);
          if left.location.loc=LOC_REFERENCE then
            tg.ungetiftemp(current_asmdata.CurrAsmList,left.location.reference);
          emit_none(A_FSQRT,S_NO);
          exit;
        end;

      secondpass(left);
      hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,true);
      location_reset(location,LOC_MMREGISTER,left.location.size);
      location.register:=cg.getmmregister(current_asmdata.CurrAsmList,location.size);
      if not UseAVX then
        case tfloatdef(resultdef).floattype of
          s32real:
            current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_SQRTSS,S_NO,left.location.register,location.register));
          s64real:
            current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_SQRTSD,S_NO,left.location.register,location.register));
          else
            internalerror(2005100303);
        end
      else
        { For both AVX forms:
          - we use S_NO instead of S_XMM here, regardless of the register
            size, as the size of the memory location is 32/64 bit
          - using left.location.register here as 2nd parameter is crucial
            to break dependency chains }
        case tfloatdef(resultdef).floattype of
          s32real:
            current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(A_VSQRTSS,S_NO,
              left.location.register,left.location.register,location.register));
          s64real:
            current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(A_VSQRTSD,S_NO,
              left.location.register,left.location.register,location.register));
          else
            internalerror(200510031);
        end;
    end;
  { Generates x87 code for ln(x), evaluated as ln(2)*log2(x). }
  procedure tx86inlinenode.second_ln_real;
    begin
      load_fpu_location(left);
      { push ln(2) on top of the operand ... }
      emit_none(A_FLDLN2,S_NO);
      { ... swap both so that the operand is st(0) and ln(2) is st(1) ... }
      emit_none(A_FXCH,S_NO);
      { ... and let FYL2X compute st(1)*log2(st(0)) }
      emit_none(A_FYL2X,S_NO);
    end;
  { Generates x87 code for cos(): loads the operand through
    load_fpu_location and emits FCOS. }
  procedure tx86inlinenode.second_cos_real;
    begin
{$ifdef i8086}
      { FCOS is 387+, so when targeting a CPU older than the 386 the
        inherited implementation is used instead }
      if current_settings.cputype < cpu_386 then
        begin
          inherited;
          exit;
        end;
{$endif i8086}
      load_fpu_location(left);
      emit_none(A_FCOS,S_NO);
    end;
  { Generates x87 code for sin(): loads the operand through
    load_fpu_location and emits FSIN. }
  procedure tx86inlinenode.second_sin_real;
    begin
{$ifdef i8086}
      { FSIN is 387+, so when targeting a CPU older than the 386 the
        inherited implementation is used instead }
      if current_settings.cputype < cpu_386 then
        begin
          inherited;
          exit;
        end;
{$endif i8086}
      load_fpu_location(left);
      emit_none(A_FSIN,S_NO);
    end;
  { Generates the code for the prefetch intrinsic.

    If the operand ends up in a memory location, its address is loaded into
    an integer register and a PREFETCHNTA on that address is emitted. For
    any other operand location no code is generated. On i386/i8086 nothing
    is emitted at all unless the selected cpu type is at least Pentium3. }
  procedure tx86inlinenode.second_prefetch;
    var
      ref : treference;
      r : tregister;
      checkpointer_used : boolean;
    begin
{$if defined(i386) or defined(i8086)}
      if current_settings.cputype>=cpu_Pentium3 then
{$endif i386 or i8086}
        begin
          { do not call Checkpointer for left node: switch the check off
            while the operand is processed ... }
          checkpointer_used:=(cs_checkpointer in current_settings.localswitches);
          if checkpointer_used then
            node_change_local_switch(left,cs_checkpointer,false);
          secondpass(left);
          { ... and switch it back on afterwards; this previously passed
            false again, so the switch was never restored on the left node }
          if checkpointer_used then
            node_change_local_switch(left,cs_checkpointer,true);
          case left.location.loc of
            LOC_CREFERENCE,
            LOC_REFERENCE:
              begin
                { take the address of the operand and prefetch through a
                  plain base-register reference }
                r:=cg.getintregister(current_asmdata.CurrAsmList,OS_ADDR);
                cg.a_loadaddr_ref_reg(current_asmdata.CurrAsmList,left.location.reference,r);
                reference_reset_base(ref,r,0,left.location.reference.temppos,left.location.reference.alignment,left.location.reference.volatility);
                current_asmdata.CurrAsmList.concat(taicpu.op_ref(A_PREFETCHNTA,S_NO,ref));
              end;
            else
              { nothing to prefetch };
          end;
        end;
    end;
  { Generates the code for abs() of an integer operand.

    Two strategies are used:
      - without CMOV (only considered on i8086/i386):
          mask:=x sar (bits-1); result:=(x xor mask)-mask
      - with CMOV:
          result:=x; tmp:=-x; if the sign flag of the NEG is clear, result:=tmp }
  procedure tx86inlinenode.second_abs_long;
    var
      negreg : tregister;
      opsize : tcgsize;
      cmovinstr : taicpu;
    begin
      opsize:=def_cgsize(left.resultdef);
      secondpass(left);
{$if defined(i8086) or defined(i386)}
      if not(CPUX86_HAS_CMOV in cpu_capabilities[current_settings.cputype]) then
        begin
          { the operand register is overwritten with the sign mask below }
          hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,left.resultdef,false);
          location:=left.location;
          location.register:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
          cg.a_load_reg_reg(current_asmdata.CurrAsmList,opsize,opsize,left.location.register,location.register);
          { left.location.register becomes all ones for negative values and zero otherwise }
          cg.a_op_const_reg(current_asmdata.CurrAsmList,OP_SAR,opsize,tcgsize2size[opsize]*8-1,left.location.register);
          cg.a_op_reg_reg(current_asmdata.CurrAsmList,OP_XOR,opsize,left.location.register,location.register);
          cg.a_op_reg_reg(current_asmdata.CurrAsmList,OP_SUB,opsize,left.location.register,location.register);
        end
      else
{$endif i8086 or i386}
        begin
          hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,left.resultdef,true);
          negreg:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
          location:=left.location;
          location.register:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
          { two copies of the operand: one is negated, the other one is the result }
          cg.a_load_reg_reg(current_asmdata.CurrAsmList,opsize,opsize,left.location.register,negreg);
          cg.a_load_reg_reg(current_asmdata.CurrAsmList,opsize,opsize,left.location.register,location.register);
          emit_reg(A_NEG,tcgsize2opsize[opsize],negreg);
          { take over the negated value if it is not negative }
          cmovinstr:=taicpu.op_reg_reg(A_CMOVcc,tcgsize2opsize[opsize],negreg,location.register);
          cmovinstr.condition:=C_NS;
          current_asmdata.CurrAsmList.concat(cmovinstr);
        end;
    end;
  1006. {*****************************************************************************
  1007. INCLUDE/EXCLUDE GENERIC HANDLING
  1008. *****************************************************************************}
  { Generates the code for include(set,element) and exclude(set,element).

    A constant element is turned into an OR (include) or AND with the
    inverted mask (exclude) on the set; a variable element is handled with
    BTS (include) or BTR (exclude). }
  procedure tx86inlinenode.second_IncludeExclude;
    var
      setnode,
      elemnode : tnode;
      bitreg,
      setreg : tregister;
      setbase : aint;
      bitsperop,
      bitmask : longint;
      cgop : topcg;
      asmop : tasmop;
      opdef : tdef;
      opsize,
      orgsize : tcgsize;
    begin
{$ifdef i8086}
      { BTS and BTR are 386+ }
      if current_settings.cputype < cpu_386 then
        begin
          inherited;
          exit;
        end;
{$endif i8086}
      { small sets are operated on in their own size, everything else in 32 bit chunks }
      if is_smallset(tcallparanode(left).resultdef) then
        begin
          opdef:=tcallparanode(left).resultdef;
          opsize:=int_cgsize(opdef.size);
        end
      else
        begin
          opdef:=u32inttype;
          opsize:=OS_32;
        end;
      bitsperop:=8*tcgsize2size[opsize];

      { first parameter: the set, second parameter: the element }
      setnode:=tcallparanode(left).left;
      elemnode:=tcallparanode(tcallparanode(left).right).left;
      secondpass(setnode);
      secondpass(elemnode);

      setbase:=tsetdef(setnode.resultdef).setbase;
      if elemnode.location.loc=LOC_CONSTANT then
        begin
          { calculate bit position }
          bitmask:=1 shl ((elemnode.location.value-setbase) mod bitsperop);
          { determine operator }
          if inlinenumber=in_include_x_y then
            cgop:=OP_OR
          else
            begin
              cgop:=OP_AND;
              bitmask:=not(bitmask);
            end;
          case setnode.location.loc of
            LOC_REFERENCE :
              begin
                { advance the reference to the chunk which contains the bit }
                inc(setnode.location.reference.offset,
                  ((elemnode.location.value-setbase) div bitsperop)*tcgsize2size[opsize]);
                cg.a_op_const_ref(current_asmdata.CurrAsmList,cgop,opsize,bitmask,setnode.location.reference);
              end;
            LOC_CSUBSETREG,
            LOC_CREGISTER :
              hlcg.a_op_const_loc(current_asmdata.CurrAsmList,cgop,setnode.resultdef,bitmask,setnode.location);
            else
              internalerror(200405022);
          end;
        end
      else
        begin
          { 8 bit operands are widened to 32 bit for the bit test instructions }
          orgsize:=opsize;
          if opsize in [OS_8,OS_S8] then
            begin
              opdef:=u32inttype;
              opsize:=OS_32;
            end;
          { determine asm operator }
          if inlinenumber=in_include_x_y then
            asmop:=A_BTS
          else
            asmop:=A_BTR;
          hlcg.location_force_reg(current_asmdata.CurrAsmList,elemnode.location,elemnode.resultdef,opdef,true);
          register_maybe_adjust_setbase(current_asmdata.CurrAsmList,elemnode.resultdef,elemnode.location,setbase);
          bitreg:=elemnode.location.register;
          if (setnode.location.loc=LOC_REFERENCE) then
            emit_reg_ref(asmop,tcgsize2opsize[opsize],bitreg,setnode.location.reference)
          else
            begin
              { second argument can't be an 8 bit register either }
              setreg:=setnode.location.register;
              if (orgsize in [OS_8,OS_S8]) then
                setreg:=cg.makeregsize(current_asmdata.CurrAsmList,setreg,opsize);
              emit_reg_reg(asmop,tcgsize2opsize[opsize],bitreg,setreg);
            end;
        end;
    end;
  { Generates a POPCNT instruction for the operand. 8 bit operands are
    counted as 16 bit values; if the result type is one byte in size, the
    result register is narrowed to 8 bit afterwards. }
  procedure tx86inlinenode.second_popcnt;
    var
      opsize : tcgsize;
    begin
      secondpass(left);

      opsize:=tcgsize2unsigned[left.location.size];
      { no 8 bit popcnt }
      if opsize=OS_8 then
        opsize:=OS_16;

      { the operand is used directly only if it is a register or a memory
        location which already has the operation size }
      if not(left.location.loc in [LOC_REGISTER,LOC_CREGISTER,LOC_REFERENCE,LOC_CREFERENCE]) or
         (left.location.size<>opsize) then
        hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,cgsize_orddef(opsize),true);

      location_reset(location,LOC_REGISTER,opsize);
      location.register:=cg.getintregister(current_asmdata.CurrAsmList,opsize);

      case left.location.loc of
        LOC_REGISTER,
        LOC_CREGISTER:
          emit_reg_reg(A_POPCNT,TCGSize2OpSize[opsize],left.location.register,location.register);
        else
          emit_ref_reg(A_POPCNT,TCGSize2OpSize[opsize],left.location.reference,location.register);
      end;

      if resultdef.size=1 then
        begin
          location.size:=OS_8;
          location.register:=cg.makeregsize(current_asmdata.CurrAsmList,location.register,location.size);
        end;
    end;
  { Generates a scalar fused multiply-add for fma(a,b,c)=a*b+c.

    Unary minus nodes directly on the parameters are not evaluated but
    folded into the choice of the instruction (VFMADD/VFMSUB/VFNMADD/VFNMSUB).
    At most one operand is used as memory operand. Raises an internal error
    if the selected fpu type has neither FMA nor FMA4 (and always on i8086). }
  procedure tx86inlinenode.second_fma;
{$ifndef i8086}
    const
      { indexed by: product negated, third operand negated, float type,
        number of the memory operand (0 = none); entry 3 is the 213 form,
        all other entries are the 231 form }
      op : array[false..true,false..true,s32real..s64real,0..3] of TAsmOp =
        (
          { positive product }
          (
            { positive third operand }
            ((A_VFMADD231SS,A_VFMADD231SS,A_VFMADD231SS,A_VFMADD213SS),
             (A_VFMADD231SD,A_VFMADD231SD,A_VFMADD231SD,A_VFMADD213SD)
            ),
            { negative third operand }
            ((A_VFMSUB231SS,A_VFMSUB231SS,A_VFMSUB231SS,A_VFMSUB213SS),
             (A_VFMSUB231SD,A_VFMSUB231SD,A_VFMSUB231SD,A_VFMSUB213SD)
            )
          ),
          { negative product }
          (
            { positive third operand }
            ((A_VFNMADD231SS,A_VFNMADD231SS,A_VFNMADD231SS,A_VFNMADD213SS),
             (A_VFNMADD231SD,A_VFNMADD231SD,A_VFNMADD231SD,A_VFNMADD213SD)
            ),
            { negative third operand }
            ((A_VFNMSUB231SS,A_VFNMSUB231SS,A_VFNMSUB231SS,A_VFNMSUB213SS),
             (A_VFNMSUB231SD,A_VFNMSUB231SD,A_VFNMSUB231SD,A_VFNMSUB213SD)
            )
          )
        );
    var
      paraarray : array[1..3] of tnode;
      memop,
      regop,
      i : integer;
      negop3,
      negproduct,
      gotmem : boolean;
{$endif i8086}
    begin
{$ifndef i8086}
      if (fpu_capabilities[current_settings.fputype]*[FPUX86_HAS_FMA,FPUX86_HAS_FMA4])<>[] then
        begin
          negop3:=false;
          negproduct:=false;
          { the parameter list is walked from its head, which holds the third parameter }
          paraarray[1]:=tcallparanode(tcallparanode(tcallparanode(parameters).nextpara).nextpara).paravalue;
          paraarray[2]:=tcallparanode(tcallparanode(parameters).nextpara).paravalue;
          paraarray[3]:=tcallparanode(parameters).paravalue;

          { check if a neg. node can be removed
            this is possible because changing the sign of
            a floating point number does not affect its absolute
            value in any way

            the unused unary minus nodes are not released here, they are kept and
            released together with the other nodes, only no code is generated for them }
          for i:=1 to 3 do
            if paraarray[i].nodetype=unaryminusn then
              begin
                paraarray[i]:=tunarynode(paraarray[i]).left;
                if i=3 then
                  negop3:=true
                else
                  { a negated factor flips the sign of the product }
                  negproduct:=not(negproduct);
              end;

          for i:=1 to 3 do
            secondpass(paraarray[i]);

          { only one memory operand is allowed }
          gotmem:=false;
          memop:=0;

          { in case parameters come on the FPU stack, we have to pop them in reverse order as we
            called secondpass }
          for i:=3 downto 1 do
            if not(paraarray[i].location.loc in [LOC_MMREGISTER,LOC_CMMREGISTER]) then
              begin
                if (paraarray[i].location.loc in [LOC_REFERENCE,LOC_CREFERENCE]) and not(gotmem) then
                  begin
                    memop:=i;
                    gotmem:=true;
                  end
                else
                  hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,paraarray[i].location,paraarray[i].resultdef,true);
              end;

          location_reset(location,LOC_MMREGISTER,paraarray[1].location.size);
          location.register:=cg.getmmregister(current_asmdata.CurrAsmList,location.size);

          if gotmem then
            begin
              case memop of
                1,2:
                  begin
                    { one factor is in memory: the result register starts as the
                      addend, the other factor is the register operand }
                    regop:=3-memop;
                    hlcg.a_loadmm_reg_reg(current_asmdata.CurrAsmList,paraarray[3].resultdef,resultdef,
                      paraarray[3].location.register,location.register,mms_movescalar);
                    emit_ref_reg_reg(op[negproduct,negop3,tfloatdef(resultdef).floattype,memop],S_NO,
                      paraarray[memop].location.reference,paraarray[regop].location.register,location.register);
                  end;
                3:
                  begin
                    { the addend is in memory: the result register starts as the first factor }
                    hlcg.a_loadmm_reg_reg(current_asmdata.CurrAsmList,paraarray[1].resultdef,resultdef,
                      paraarray[1].location.register,location.register,mms_movescalar);
                    emit_ref_reg_reg(op[negproduct,negop3,tfloatdef(resultdef).floattype,memop],S_NO,
                      paraarray[3].location.reference,paraarray[2].location.register,location.register);
                  end;
                else
                  internalerror(2014041301);
              end;
            end
          else
            begin
              { try to use the location which is already in a temp. mm register as destination,
                so the compiler might be able to re-use the register }
              if paraarray[1].location.loc=LOC_MMREGISTER then
                begin
                  hlcg.a_loadmm_reg_reg(current_asmdata.CurrAsmList,paraarray[1].resultdef,resultdef,
                    paraarray[1].location.register,location.register,mms_movescalar);
                  emit_reg_reg_reg(op[negproduct,negop3,tfloatdef(resultdef).floattype,3],S_NO,
                    paraarray[3].location.register,paraarray[2].location.register,location.register);
                end
              else if paraarray[2].location.loc=LOC_MMREGISTER then
                begin
                  hlcg.a_loadmm_reg_reg(current_asmdata.CurrAsmList,paraarray[2].resultdef,resultdef,
                    paraarray[2].location.register,location.register,mms_movescalar);
                  emit_reg_reg_reg(op[negproduct,negop3,tfloatdef(resultdef).floattype,3],S_NO,
                    paraarray[3].location.register,paraarray[1].location.register,location.register);
                end
              else
                begin
                  hlcg.a_loadmm_reg_reg(current_asmdata.CurrAsmList,paraarray[3].resultdef,resultdef,
                    paraarray[3].location.register,location.register,mms_movescalar);
                  emit_reg_reg_reg(op[negproduct,negop3,tfloatdef(resultdef).floattype,0],S_NO,
                    paraarray[1].location.register,paraarray[2].location.register,location.register);
                end;
            end;
        end
      else
{$endif i8086}
        internalerror(2014032301);
    end;
  { Generates the code for frac() in a vector register.

    With AVX512DQ a single VREDUCESS/VREDUCESD is emitted; otherwise the
    operand is rounded with immediate 3 (round toward zero) and the rounded
    value is subtracted from the operand. Raises an internal error if the
    result is not handled by the vector unit. }
  procedure tx86inlinenode.second_frac_real;
    var
      truncreg : TRegister;
    begin
      if not use_vectorfpu(resultdef) then
        internalerror(2017052101);

      secondpass(left);
      hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,true);
      location_reset(location,LOC_MMREGISTER,def_cgsize(resultdef));
      location.register:=cg.getmmregister(current_asmdata.CurrAsmList,location.size);

      if UseAVX then
        case tfloatdef(left.resultdef).floattype of
          s32real:
            begin
{$ifndef i8086}
              if UseAVX512 and (FPUX86_HAS_AVX512DQ in fpu_capabilities[current_settings.fputype]) then
                current_asmdata.CurrAsmList.concat(taicpu.op_const_reg_reg_reg(A_VREDUCESS,S_NO,3,left.location.register,left.location.register,location.register))
              else
{$endif not i8086}
                begin
                  { using left.location.register here as 3rd parameter is crucial to break dependency chains }
                  current_asmdata.CurrAsmList.concat(taicpu.op_const_reg_reg_reg(A_VROUNDSS,S_NO,3,left.location.register,left.location.register,location.register));
                  { result:=operand-rounded operand }
                  current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(A_VSUBSS,S_NO,location.register,left.location.register,location.register));
                end;
            end;
          s64real:
            begin
{$ifndef i8086}
              if UseAVX512 and (FPUX86_HAS_AVX512DQ in fpu_capabilities[current_settings.fputype]) then
                current_asmdata.CurrAsmList.concat(taicpu.op_const_reg_reg_reg(A_VREDUCESD,S_NO,3,left.location.register,left.location.register,location.register))
              else
{$endif not i8086}
                begin
                  { using left.location.register here as 3rd parameter is crucial to break dependency chains }
                  current_asmdata.CurrAsmList.concat(taicpu.op_const_reg_reg_reg(A_VROUNDSD,S_NO,3,left.location.register,left.location.register,location.register));
                  { result:=operand-rounded operand }
                  current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(A_VSUBSD,S_NO,location.register,left.location.register,location.register));
                end;
            end;
          else
            internalerror(2017052102);
        end
      else
        begin
          { two operand SSE form: the result register receives a copy of the
            operand, the rounded value goes into a temporary register }
          truncreg:=cg.getmmregister(current_asmdata.CurrAsmList,location.size);
          cg.a_loadmm_loc_reg(current_asmdata.CurrAsmList,location.size,left.location,location.register,mms_movescalar);
          case tfloatdef(left.resultdef).floattype of
            s32real:
              begin
                current_asmdata.CurrAsmList.concat(taicpu.op_const_reg_reg(A_ROUNDSS,S_NO,3,left.location.register,truncreg));
                current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_SUBSS,S_NO,truncreg,location.register));
              end;
            s64real:
              begin
                current_asmdata.CurrAsmList.concat(taicpu.op_const_reg_reg(A_ROUNDSD,S_NO,3,left.location.register,truncreg));
                current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_SUBSD,S_NO,truncreg,location.register));
              end;
            else
              internalerror(2017052103);
          end;
        end;

      { convert in place if operand and result differ in their float type }
      if tfloatdef(left.resultdef).floattype<>tfloatdef(resultdef).floattype then
        hlcg.a_loadmm_reg_reg(current_asmdata.CurrAsmList,left.resultdef,resultdef,location.register,location.register,mms_movescalar);
    end;
  { Generates the code for int() in a vector register by means of a
    (V)ROUNDSS/(V)ROUNDSD with immediate 3 (round toward zero). Raises an
    internal error if the result is not handled by the vector unit. }
  procedure tx86inlinenode.second_int_real;
    var
      roundop : tasmop;
    begin
      if not use_vectorfpu(resultdef) then
        internalerror(2017052107);

      secondpass(left);
      hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,true);
      location_reset(location,LOC_MMREGISTER,left.location.size);
      location.register:=cg.getmmregister(current_asmdata.CurrAsmList,location.size);

      if UseAVX then
        begin
          case tfloatdef(resultdef).floattype of
            s32real:
              roundop:=A_VROUNDSS;
            s64real:
              roundop:=A_VROUNDSD;
            else
              internalerror(2017052105);
          end;
          { using left.location.register here as 3rd parameter is crucial to break dependency chains }
          current_asmdata.CurrAsmList.concat(taicpu.op_const_reg_reg_reg(roundop,S_NO,3,left.location.register,left.location.register,location.register));
        end
      else
        begin
          case tfloatdef(resultdef).floattype of
            s32real:
              roundop:=A_ROUNDSS;
            s64real:
              roundop:=A_ROUNDSD;
            else
              internalerror(2017052106);
          end;
          current_asmdata.CurrAsmList.concat(taicpu.op_const_reg_reg(roundop,S_NO,3,left.location.register,location.register));
        end;
    end;
  { Generates the code for high() of a dynamic array without a separate nil
    compare: the array pointer is decremented by one with SUB, which sets
    the carry flag exactly if the pointer was nil and leaves -1 in the
    register in that case. Otherwise the value is loaded from the array
    header. Raises an internal error for anything but a dynamic array. }
  procedure tx86inlinenode.second_high;
    var
      nildone : tasmlabel;
      highreg : tregister;
      headerref : treference;
    begin
      secondpass(left);
      if not(is_dynamic_array(left.resultdef)) then
        Internalerror(2019122809);

      { the pointer register is modified below, so it must not be a constant register }
      hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,left.resultdef,false);
      current_asmdata.getjumplabel(nildone);

      { by subtracting 1 here, we get the -1 into the register we need if the dyn. array is nil and the carry
        flag is set in this case, so we can jump depending on it

        when loading the actual high value, we have to take care later of the decreased value

        do not use the cgs, as they might emit dec instead of a sub instruction, however with dec the trick
        we are using is not working as dec does not touch the carry flag }
      current_asmdata.CurrAsmList.concat(taicpu.op_const_reg(A_SUB,TCGSize2OpSize[def_cgsize(left.resultdef)],1,left.location.register));
      cg.a_jmp_flags(current_asmdata.CurrAsmList,F_C,nildone);

      { the value is read from offset -ossinttype.size relative to the array pointer;
        the +1 compensates for the decrement above.
        The reference is created with an empty volatility set. }
      hlcg.reference_reset_base(headerref,left.resultdef,left.location.register,-ossinttype.size+1,ctempposinvalid,ossinttype.alignment,[]);

      { reuse the register that held the array pointer for the result, so that
        the -1 produced for a nil array is kept when the load is skipped }
      highreg:=cg.makeregsize(current_asmdata.CurrAsmList,left.location.register,def_cgsize(resultdef));
      hlcg.a_load_ref_reg(current_asmdata.CurrAsmList,ossinttype,resultdef,headerref,highreg);
      cg.a_label(current_asmdata.CurrAsmList,nildone);

      location_reset(location,LOC_REGISTER,def_cgsize(resultdef));
      location.register:=highreg;
    end;
  { Generates a scalar (V)MINSS/(V)MINSD/(V)MAXSS/(V)MAXSD for the min/max
    intrinsics on single and double operands. At most one operand is used
    as memory operand. Raises an internal error for unsupported result
    types (and always on i8086). }
  procedure tx86inlinenode.second_minmax;
{$ifndef i8086}
    const
      { indexed by: is max, use AVX, float type }
      oparray : array[false..true,false..true,s32real..s64real] of TAsmOp =
        (
          (
            (A_MINSS,A_MINSD),
            (A_VMINSS,A_VMINSD)
          ),
          (
            (A_MAXSS,A_MAXSD),
            (A_VMAXSS,A_VMAXSD)
          )
        );
    var
      paraarray : array[1..2] of tnode;
      memop,
      regop,
      i : integer;
      gotmem : boolean;
      op : TAsmOp;
{$endif i8086}
    begin
{$ifndef i8086}
      if
{$ifdef i386}
        ((current_settings.fputype>=fpu_sse) and is_single(resultdef)) or
        ((current_settings.fputype>=fpu_sse2) and is_double(resultdef))
{$else i386}
        is_single(resultdef) or is_double(resultdef)
{$endif i386}
      then
        begin
          { the parameter list is walked from its head, which holds the second parameter }
          paraarray[1]:=tcallparanode(tcallparanode(parameters).nextpara).paravalue;
          paraarray[2]:=tcallparanode(parameters).paravalue;

          for i:=low(paraarray) to high(paraarray) do
            secondpass(paraarray[i]);

          { only one memory operand is allowed }
          gotmem:=false;
          memop:=0;
          for i:=low(paraarray) to high(paraarray) do
            if not(paraarray[i].location.loc in [LOC_MMREGISTER,LOC_CMMREGISTER]) then
              begin
                if (paraarray[i].location.loc in [LOC_REFERENCE,LOC_CREFERENCE]) and not(gotmem) then
                  begin
                    memop:=i;
                    gotmem:=true;
                  end
                else
                  hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,paraarray[i].location,paraarray[i].resultdef,true);
              end;

          { due to min/max behaviour that it loads always the second operand (must be the else assignment) into destination if
            one of the operands is a NaN, we cannot swap operands to omit a mova operation in case fastmath is off }
          if not(cs_opt_fastmath in current_settings.optimizerswitches) and gotmem and (memop=1) then
            begin
              hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,paraarray[1].location,paraarray[1].resultdef,true);
              gotmem:=false;
            end;

          op:=oparray[inlinenumber in [in_max_single,in_max_double],UseAVX,tfloatdef(resultdef).floattype];

          location_reset(location,LOC_MMREGISTER,paraarray[1].location.size);
          location.register:=cg.getmmregister(current_asmdata.CurrAsmList,location.size);

          if gotmem then
            begin
              { the operand which is not in memory is the register operand }
              regop:=3-memop;
              if UseAVX then
                begin
                  if not(memop in [1,2]) then
                    internalerror(2020120504);
                  emit_ref_reg_reg(op,S_NO,
                    paraarray[memop].location.reference,paraarray[regop].location.register,location.register);
                end
              else
                begin
                  if not(memop in [1,2]) then
                    internalerror(2020120601);
                  { two operand form: the result register starts as a copy of the register operand }
                  hlcg.a_loadmm_reg_reg(current_asmdata.CurrAsmList,paraarray[regop].resultdef,resultdef,
                    paraarray[regop].location.register,location.register,mms_movescalar);
                  emit_ref_reg(op,S_NO,
                    paraarray[memop].location.reference,location.register);
                end;
            end
          else
            begin
              if UseAVX then
                emit_reg_reg_reg(op,S_NO,
                  paraarray[2].location.register,paraarray[1].location.register,location.register)
              else
                begin
                  { two operand form: the result register starts as a copy of the first operand }
                  hlcg.a_loadmm_reg_reg(current_asmdata.CurrAsmList,paraarray[1].resultdef,resultdef,
                    paraarray[1].location.register,location.register,mms_movescalar);
                  emit_reg_reg(op,S_NO,
                    paraarray[2].location.register,location.register);
                end;
            end;
        end
      else
{$endif i8086}
        internalerror(2020120503);
    end;
  1517. end.