nx86inl.pas 66 KB


  1. {
  2. Copyright (c) 1998-2002 by Florian Klaempfl
  3. Generate x86 inline nodes
  4. This program is free software; you can redistribute it and/or modify
  5. it under the terms of the GNU General Public License as published by
  6. the Free Software Foundation; either version 2 of the License, or
  7. (at your option) any later version.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. GNU General Public License for more details.
  12. You should have received a copy of the GNU General Public License
  13. along with this program; if not, write to the Free Software
  14. Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  15. ****************************************************************************
  16. }
  17. unit nx86inl;
  18. {$i fpcdefs.inc}
  19. interface
  20. uses
  21. node,ninl,ncginl;
  22. type
  { x86 flavour of the inline node.  Descends from tcginlinenode and overrides
    the first_* (first pass) and second_* (code generation) hooks declared
    below. }
  tx86inlinenode = class(tcginlinenode)
  protected
    { hook invoked by first_round_real, first_trunc_real and first_frac_real;
      the implementation in this unit is empty }
    procedure maybe_remove_round_trunc_typeconv; virtual;
  public
    { sets resultdef for the x86 specific intrinsics (port I/O, cli/sti,
      segment register reads and the cases pulled in from x86mmtype.inc) }
    function pass_typecheck_cpu:tnode;override;
    { first pass override
      so that the code generator will actually generate
      these nodes.
    }
    function first_cpu: tnode;override;
    function first_pi: tnode ; override;
    function first_arctan_real: tnode; override;
    function first_abs_real: tnode; override;
    function first_sqr_real: tnode; override;
    function first_sqrt_real: tnode; override;
    function first_ln_real: tnode; override;
    function first_cos_real: tnode; override;
    function first_sin_real: tnode; override;
    function first_round_real: tnode; override;
    function first_trunc_real: tnode; override;
    function first_popcnt: tnode; override;
    function first_fma: tnode; override;
    function first_frac_real : tnode; override;
    function first_int_real : tnode; override;
    function first_minmax: tnode; override;
    { drops an implicit float type conversion below int() when SSE4.1 or
      better is selected; everything else is left to the inherited method }
    function simplify(forinline : boolean) : tnode; override;
    { second pass override to generate these nodes }
    procedure pass_generate_code_cpu;override;
    procedure second_IncludeExclude;override;
    procedure second_AndOrXorShiftRot_assign;override;
    procedure second_pi; override;
    procedure second_arctan_real; override;
    procedure second_abs_real; override;
    procedure second_round_real; override;
    procedure second_sqr_real; override;
    procedure second_sqrt_real; override;
    procedure second_ln_real; override;
    procedure second_cos_real; override;
    procedure second_sin_real; override;
    procedure second_trunc_real; override;
    procedure second_prefetch;override;
    procedure second_abs_long;override;
    procedure second_popcnt;override;
    procedure second_fma;override;
    procedure second_frac_real;override;
    procedure second_int_real;override;
    procedure second_high;override;
    procedure second_minmax;override;
  private
    { evaluates lnode and sets this node's location to an FPU register
      location for its value }
    procedure load_fpu_location(lnode: tnode);
  end;
  74. implementation
  75. uses
  76. systems,
  77. globtype,globals,
  78. verbose,compinnr,fmodule,
  79. defutil,
  80. aasmbase,aasmdata,aasmcpu,
  81. symconst,symtype,symdef,symcpu,
  82. ncnv,
  83. htypechk,
  84. cgbase,pass_1,pass_2,
  85. cpuinfo,cpubase,nutils,
  86. ncal,ncgutil,nld,ncon,nadd,nmat,constexp,
  87. tgobj,
  88. cga,cgutils,cgx86,cgobj,hlcgobj;
  89. {*****************************************************************************
  90. TX86INLINENODE
  91. *****************************************************************************}
  { Hook called by first_round_real, first_trunc_real and first_frac_real
    before they pick the expected location.  Intentionally empty in this
    unit; it is declared virtual so that a descendant may override it. }
  procedure tx86inlinenode.maybe_remove_round_trunc_typeconv;
    begin
      { only makes a difference for x86_64 }
    end;
  { Type checks the x86 specific intrinsics: verifies the parameter count where
    one is required and sets resultdef.  Returns nil for every case handled
    here; any inline number that is neither listed below nor supplied by the
    included x86mmtype.inc is forwarded to the inherited implementation. }
  function tx86inlinenode.pass_typecheck_cpu: tnode;
    begin
      Result:=nil;
      case inlinenumber of
        { port reads: one parameter (the port), result sized per variant }
        in_x86_inportb:
          begin
            CheckParameters(1);
            resultdef:=u8inttype;
          end;
        in_x86_inportw:
          begin
            CheckParameters(1);
            resultdef:=u16inttype;
          end;
        in_x86_inportl:
          begin
            CheckParameters(1);
            resultdef:=s32inttype;
          end;
        { port writes: two parameters, no result }
        in_x86_outportb,
        in_x86_outportw,
        in_x86_outportl:
          begin
            CheckParameters(2);
            resultdef:=voidtype;
          end;
        in_x86_cli,
        in_x86_sti:
          resultdef:=voidtype;
        { segment register reads: 16 bit result on i8086, 32 bit elsewhere }
        in_x86_get_cs,
        in_x86_get_ss,
        in_x86_get_ds,
        in_x86_get_es,
        in_x86_get_fs,
        in_x86_get_gs:
{$ifdef i8086}
          resultdef:=u16inttype;
{$else i8086}
          resultdef:=s32inttype;
{$endif i8086}
        { include automatically generated code }
{$i x86mmtype.inc}
        else
          Result:=inherited pass_typecheck_cpu;
      end;
    end;
  { First pass for the x86 specific intrinsics: only chooses the expected
    location.  Returns nil for the cases handled here; inline numbers that are
    neither listed nor supplied by x86mmfirst.inc go to the inherited method. }
  function tx86inlinenode.first_cpu: tnode;
    begin
      Result:=nil;
      case inlinenumber of
        { intrinsics that produce a value }
        in_x86_inportb,
        in_x86_inportw,
        in_x86_inportl,
        in_x86_get_cs,
        in_x86_get_ss,
        in_x86_get_ds,
        in_x86_get_es,
        in_x86_get_fs,
        in_x86_get_gs:
          expectloc:=LOC_REGISTER;
        { intrinsics without a result }
        in_x86_outportb,
        in_x86_outportw,
        in_x86_outportl,
        in_x86_cli,
        in_x86_sti:
          expectloc:=LOC_VOID;
        { include automatically generated code }
{$i x86mmfirst.inc}
        else
          Result:=inherited first_cpu;
      end;
    end;
  { First pass of pi: handled inline (result expected in an FPU register) only
    when the best real type is the 80 bit extended type; otherwise the
    inherited implementation decides. }
  function tx86inlinenode.first_pi : tnode;
    begin
      if tfloatdef(pbestrealtype^).floattype<>s80real then
        result:=inherited
      else
        begin
          result:=nil;
          expectloc:=LOC_FPUREGISTER;
        end;
    end;
  { First pass of arctan: handled inline (FPU register result) when the best
    real type is s80real; on i8086 additionally only for cpu_386 and above.
    Every other case is left to the inherited implementation. }
  function tx86inlinenode.first_arctan_real : tnode;
    begin
      if
{$ifdef i8086}
         { FPATAN's range is limited to (0 <= value < 1) on the 8087 and 80287,
           so we need to use the RTL helper on these FPUs }
         (current_settings.cputype < cpu_386) or
{$endif i8086}
         (tfloatdef(pbestrealtype^).floattype<>s80real) then
        result:=inherited
      else
        begin
          result:=nil;
          expectloc:=LOC_FPUREGISTER;
        end;
    end;
  { First pass of abs on a real: always handled inline; the result is expected
    in an MM register when the vector FPU is used for resultdef and in an FPU
    register otherwise. }
  function tx86inlinenode.first_abs_real : tnode;
    begin
      result:=nil;
      if not use_vectorfpu(resultdef) then
        expectloc:=LOC_FPUREGISTER
      else
        expectloc:=LOC_MMREGISTER;
    end;
  { First pass of sqr on a real: always handled inline; the result is expected
    in an MM register when the vector FPU is used for resultdef and in an FPU
    register otherwise. }
  function tx86inlinenode.first_sqr_real : tnode;
    begin
      result:=nil;
      if not use_vectorfpu(resultdef) then
        expectloc:=LOC_FPUREGISTER
      else
        expectloc:=LOC_MMREGISTER;
    end;
  { First pass of sqrt on a real: always handled inline; the result is expected
    in an MM register when the vector FPU is used for resultdef and in an FPU
    register otherwise. }
  function tx86inlinenode.first_sqrt_real : tnode;
    begin
      result:=nil;
      if not use_vectorfpu(resultdef) then
        expectloc:=LOC_FPUREGISTER
      else
        expectloc:=LOC_MMREGISTER;
    end;
  { First pass of ln: handled inline (FPU register result) only when the best
    real type is s80real; otherwise the inherited implementation decides. }
  function tx86inlinenode.first_ln_real : tnode;
    begin
      if tfloatdef(pbestrealtype^).floattype<>s80real then
        result:=inherited
      else
        begin
          result:=nil;
          expectloc:=LOC_FPUREGISTER;
        end;
    end;
  { First pass of cos: handled inline (FPU register result) when the best real
    type is s80real; on i8086 additionally only for cpu_386 and above.  Every
    other case is left to the inherited implementation. }
  function tx86inlinenode.first_cos_real : tnode;
    begin
      if
{$ifdef i8086}
         { FCOS is 387+ }
         (current_settings.cputype < cpu_386) or
{$endif i8086}
         (tfloatdef(pbestrealtype^).floattype<>s80real) then
        result:=inherited
      else
        begin
          result:=nil;
          expectloc:=LOC_FPUREGISTER;
        end;
    end;
  { First pass of sin: handled inline (FPU register result) when the best real
    type is s80real; on i8086 additionally only for cpu_386 and above.  Every
    other case is left to the inherited implementation. }
  function tx86inlinenode.first_sin_real : tnode;
    begin
      if
{$ifdef i8086}
         { FSIN is 387+ }
         (current_settings.cputype < cpu_386) or
{$endif i8086}
         (tfloatdef(pbestrealtype^).floattype<>s80real) then
        result:=inherited
      else
        begin
          result:=nil;
          expectloc:=LOC_FPUREGISTER;
        end;
    end;
  { First pass of round: always handled inline.  The result is expected in
    memory, except on x86_64 with a vector FPU operand, where it is expected
    in an integer register. }
  function tx86inlinenode.first_round_real : tnode;
    begin
      maybe_remove_round_trunc_typeconv;
      result:=nil;
{$ifdef x86_64}
      if use_vectorfpu(left.resultdef) then
        begin
          expectloc:=LOC_REGISTER;
          exit;
        end;
{$endif x86_64}
      expectloc:=LOC_REFERENCE;
    end;
  { First pass of trunc.  On x86_64 a vector FPU operand is always handled
    inline with an integer register result.  Otherwise the inherited
    implementation is used when optimizing for size, and the inline version
    (result in memory) in all remaining cases. }
  function tx86inlinenode.first_trunc_real: tnode;
    begin
      maybe_remove_round_trunc_typeconv;
{$ifdef x86_64}
      if use_vectorfpu(left.resultdef) then
        begin
          expectloc:=LOC_REGISTER;
          result:=nil;
          exit;
        end;
{$endif x86_64}
      if cs_opt_size in current_settings.optimizerswitches then
        result:=inherited
      else
        begin
          expectloc:=LOC_REFERENCE;
          result:=nil;
        end;
    end;
  { First pass of popcnt: handled inline (register result) when the selected
    CPU type has the POPCNT capability; on i386 only for operands that are not
    64 bit wide.  On i8086, and in every other case, the inherited
    implementation is used. }
  function tx86inlinenode.first_popcnt: tnode;
    begin
{$ifndef i8086}
      if (CPUX86_HAS_POPCNT in cpu_capabilities[current_settings.cputype])
{$ifdef i386}
         and not is_64bit(left.resultdef)
{$endif i386}
         then
        begin
          Result:=nil;
          expectloc:=LOC_REGISTER;
          exit;
        end;
{$endif not i8086}
      Result:=inherited first_popcnt;
    end;
  { First pass of fma: handled inline (MM register result) when the selected
    FPU type offers FMA or FMA4 and the result is single or double.  On i8086,
    and in every other case, the inherited implementation is used. }
  function tx86inlinenode.first_fma : tnode;
    begin
{$ifndef i8086}
      if ((fpu_capabilities[current_settings.fputype]*[FPUX86_HAS_FMA,FPUX86_HAS_FMA4])<>[]) and
         (is_double(resultdef) or is_single(resultdef)) then
        begin
          Result:=nil;
          expectloc:=LOC_MMREGISTER;
          exit;
        end;
{$endif i8086}
      Result:=inherited first_fma;
    end;
  { First pass of frac: handled inline (MM register result) when the FPU type
    is SSE4.1 or better and the result is single or double; otherwise the
    inherited implementation is used. }
  function tx86inlinenode.first_frac_real : tnode;
    begin
      if (current_settings.fputype<fpu_sse41) or
         not(is_double(resultdef) or is_single(resultdef)) then
        Result:=inherited first_frac_real
      else
        begin
          maybe_remove_round_trunc_typeconv;
          Result:=nil;
          expectloc:=LOC_MMREGISTER;
        end;
    end;
  { First pass of int: handled inline (MM register result) when the FPU type
    is SSE4.1 or better and the result is single or double; otherwise the
    inherited implementation is used. }
  function tx86inlinenode.first_int_real : tnode;
    begin
      if (current_settings.fputype<fpu_sse41) or
         not(is_double(resultdef) or is_single(resultdef)) then
        Result:=inherited first_int_real
      else
        begin
          expectloc:=LOC_MMREGISTER;
          Result:=nil;
        end;
    end;
  { First pass of min/max: handled inline (MM register result) for single and
    double results.  On i386 this additionally requires at least SSE for
    single and SSE2 for double.  On i8086, and in every other case, the
    inherited implementation is used. }
  function tx86inlinenode.first_minmax: tnode;
    begin
{$ifndef i8086}
      if
{$ifdef i386}
         ((current_settings.fputype>=fpu_sse) and is_single(resultdef)) or
         ((current_settings.fputype>=fpu_sse2) and is_double(resultdef))
{$else i386}
         (is_double(resultdef) or is_single(resultdef))
{$endif i386}
         then
        begin
          Result:=nil;
          expectloc:=LOC_MMREGISTER;
          exit;
        end;
{$endif i8086}
      Result:=inherited first_minmax;
    end;
  { Simplification hook.  With SSE4.1 or better, an int() whose operand is an
    implicit type conversion of a single or double value is rewritten so that
    it works directly on the unconverted operand: the conversion node is
    freed and a type-checked copy of this node, carrying the operand's type
    as result type, is returned.  Everything else goes to the inherited
    simplify. }
  function tx86inlinenode.simplify(forinline : boolean) : tnode;
    var
      operand : tnode;
    begin
      if not(
           (current_settings.fputype>=fpu_sse41) and
           (inlinenumber=in_int_real) and (left.nodetype=typeconvn) and
           not(nf_explicit in left.flags) and
           (ttypeconvnode(left).left.resultdef.typ=floatdef) and
           (is_double(ttypeconvnode(left).left.resultdef) or
            is_single(ttypeconvnode(left).left.resultdef))
         ) then
        begin
          Result:=inherited simplify(forinline);
          exit;
        end;

      { get rid of the type conversion: detach its operand first so that
        freeing the conversion node does not free the operand as well }
      operand:=ttypeconvnode(left).left;
      ttypeconvnode(left).left:=nil;
      left.free;
      left:=operand;

      result:=self.getcopy;
      tinlinenode(result).resultdef:=operand.resultdef;
      typecheckpass(result);
    end;
  { Second pass for the x86 specific intrinsics: port I/O, cli/sti, segment
    register reads and the cases pulled in from x86mmsecond.inc.  Inline
    numbers not handled here are forwarded to the inherited implementation. }
  procedure tx86inlinenode.pass_generate_code_cpu;
    var
      { NOTE(review): paraarray, i and op are not referenced by the code visible
        in this procedure apart from the FillChar below and GetParameters;
        presumably they are used by the included x86mmsecond.inc - confirm
        before removing anything }
      paraarray : array[1..4] of tnode;
      i : integer;
      op: TAsmOp;

    { Emits an IN from the port given by "left" into dreg and copies the value
      into a freshly allocated integer register, which becomes the node's
      location.  A constant port number in 0..255 uses the immediate form of
      IN; any other port is loaded into DX first. }
    procedure inport(dreg:TRegister;dsize:topsize;dtype:tdef);
      var
        portnumber: tnode;
      begin
        portnumber:=left;
        secondpass(portnumber);
        if (portnumber.location.loc=LOC_CONSTANT) and
           (portnumber.location.value>=0) and
           (portnumber.location.value<=255) then
          begin
            hlcg.getcpuregister(current_asmdata.CurrAsmList,dreg);
            current_asmdata.CurrAsmList.concat(taicpu.op_const_reg(A_IN,dsize,portnumber.location.value,dreg));
            location_reset(location,LOC_REGISTER,def_cgsize(resultdef));
            location.register:=hlcg.getintregister(current_asmdata.CurrAsmList,resultdef);
            hlcg.ungetcpuregister(current_asmdata.CurrAsmList,dreg);
            hlcg.a_load_reg_reg(current_asmdata.CurrAsmList,dtype,resultdef,dreg,location.register);
          end
        else
          begin
            hlcg.getcpuregister(current_asmdata.CurrAsmList,NR_DX);
            hlcg.a_load_loc_reg(current_asmdata.CurrAsmList,portnumber.resultdef,u16inttype,portnumber.location,NR_DX);
            hlcg.getcpuregister(current_asmdata.CurrAsmList,dreg);
            current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_IN,dsize,NR_DX,dreg));
            hlcg.ungetcpuregister(current_asmdata.CurrAsmList,NR_DX);
            location_reset(location,LOC_REGISTER,def_cgsize(resultdef));
            location.register:=hlcg.getintregister(current_asmdata.CurrAsmList,resultdef);
            hlcg.ungetcpuregister(current_asmdata.CurrAsmList,dreg);
            hlcg.a_load_reg_reg(current_asmdata.CurrAsmList,dtype,resultdef,dreg,location.register);
          end;
      end;

    { Emits an OUT: the data parameter is loaded into dreg, then written to
      the port.  A constant port number in 0..255 uses the immediate form of
      OUT; any other port is loaded into DX first. }
    procedure outport(dreg:TRegister;dsize:topsize;dtype:tdef);
      var
        portnumber, portdata: tnode;
      begin
        { the port number is taken from the parameter reached through
          left.right, the data from left itself }
        portnumber:=tcallparanode(tcallparanode(left).right).left;
        portdata:=tcallparanode(left).left;
        secondpass(portdata);
        secondpass(portnumber);
        hlcg.getcpuregister(current_asmdata.CurrAsmList,dreg);
        hlcg.a_load_loc_reg(current_asmdata.CurrAsmList,portdata.resultdef,dtype,portdata.location,dreg);
        if (portnumber.location.loc=LOC_CONSTANT) and
           (portnumber.location.value>=0) and
           (portnumber.location.value<=255) then
          current_asmdata.CurrAsmList.concat(taicpu.op_reg_const(A_OUT,dsize,dreg,portnumber.location.value))
        else
          begin
            hlcg.getcpuregister(current_asmdata.CurrAsmList,NR_DX);
            hlcg.a_load_loc_reg(current_asmdata.CurrAsmList,portnumber.resultdef,u16inttype,portnumber.location,NR_DX);
            current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_OUT,dsize,dreg,NR_DX));
            hlcg.ungetcpuregister(current_asmdata.CurrAsmList,NR_DX);
          end;
        hlcg.ungetcpuregister(current_asmdata.CurrAsmList,dreg);
      end;

    { Moves the segment register segreg into a freshly allocated integer
      register of the result type; that register becomes the node's location. }
    procedure get_segreg(segreg:tregister);
      begin
        location_reset(location,LOC_REGISTER,def_cgsize(resultdef));
        location.register:=hlcg.getintregister(current_asmdata.CurrAsmList,resultdef);
        current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_MOV,TCGSize2OpSize[def_cgsize(resultdef)],segreg,location.register));
      end;

    { Returns the value of n when it is an integer constant node.  Otherwise
      reports type_e_constant_expr_expected and returns 0. }
    function GetConstInt(n: tnode): longint;
      begin
        Result:=0;
        if is_constintnode(n) then
          result:=tordconstnode(n).value.svalue
        else
          Message(type_e_constant_expr_expected);
      end;

    { Fills paraarray[1..count] from the parameter list in "left".  A single
      parameter that is not wrapped in a tcallparanode is stored directly;
      otherwise the chain is walked from "left", filling index count first
      and index 1 last. }
    procedure GetParameters(count: longint);
      var
        i: longint;
        p: tnode;
      begin
        if (count=1) and
           (not (left is tcallparanode)) then
          paraarray[1]:=left
        else
          begin
            p:=left;
            for i := count downto 1 do
              begin
                paraarray[i]:=tcallparanode(p).paravalue;
                p:=tcallparanode(p).nextpara;
              end;
          end;
      end;

    { Makes sure l is held in an MMX register.  Nothing is done when l already
      is LOC_MMXREGISTER, or LOC_CMMXREGISTER with maybeconst set; otherwise
      the value is loaded into a new MMX register and l is updated. }
    procedure location_force_mmxreg(list:TAsmList;var l: tlocation;maybeconst:boolean);
      var
        reg : tregister;
      begin
        if (l.loc<>LOC_MMXREGISTER)  and
           ((l.loc<>LOC_CMMXREGISTER) or (not maybeconst)) then
          begin
            reg:=tcgx86(cg).getmmxregister(list);
            cg.a_loadmm_loc_reg(list,OS_M64,l,reg,nil);
            location_freetemp(list,l);
            location_reset(l,LOC_MMXREGISTER,OS_M64);
            l.register:=reg;
          end;
      end;

    { Turns loc into a memory reference.  Existing references are left alone;
      a register location becomes a reference using that register as base;
      anything else is first forced into a register (as u32inttype). }
    procedure location_make_ref(var loc: tlocation);
      var
        hloc: tlocation;
      begin
        case loc.loc of
          LOC_CREGISTER,
          LOC_REGISTER:
            begin
              location_reset_ref(hloc, LOC_REFERENCE, OS_32, 1, []);
              hloc.reference.base:=loc.register;
              loc:=hloc;
            end;
          LOC_CREFERENCE,
          LOC_REFERENCE:
            begin
              { already a reference, nothing to do }
            end;
          else
            begin
              hlcg.location_force_reg(current_asmdata.CurrAsmList,loc,u32inttype,u32inttype,false);
              location_reset_ref(hloc, LOC_REFERENCE, OS_32, 1, []);
              hloc.reference.base:=loc.register;
              loc:=hloc;
            end;
        end;
      end;

    begin
      FillChar(paraarray,sizeof(paraarray),0);
      case inlinenumber of
        in_x86_inportb:
          inport(NR_AL,S_B,u8inttype);
        in_x86_inportw:
          inport(NR_AX,S_W,u16inttype);
        in_x86_inportl:
          inport(NR_EAX,S_L,s32inttype);
        in_x86_outportb:
          outport(NR_AL,S_B,u8inttype);
        in_x86_outportw:
          outport(NR_AX,S_W,u16inttype);
        in_x86_outportl:
          outport(NR_EAX,S_L,s32inttype);
        in_x86_cli:
          current_asmdata.CurrAsmList.concat(taicpu.op_none(A_CLI));
        in_x86_sti:
          current_asmdata.CurrAsmList.concat(taicpu.op_none(A_STI));
        in_x86_get_cs:
          get_segreg(NR_CS);
        in_x86_get_ss:
          get_segreg(NR_SS);
        in_x86_get_ds:
          get_segreg(NR_DS);
        in_x86_get_es:
          get_segreg(NR_ES);
        in_x86_get_fs:
          get_segreg(NR_FS);
        in_x86_get_gs:
          get_segreg(NR_GS);
        { automatically generated cases }
{$i x86mmsecond.inc}
        else
          inherited pass_generate_code_cpu;
      end;
    end;
  { Code generation for the and/or/xor/shift/rotate-assign intrinsics.
    The only case handled here is "x and-assign ((1 shl y) - 1)" on a CPU type
    with BMI2 at optimization level 2 (never on i8086), which is emitted as a
    single BZHI.  Everything else is generated by the inherited method. }
  procedure tx86inlinenode.second_AndOrXorShiftRot_assign;
    var
      opsize : tcgsize;
      valuenode, indexnode, loadnode: TNode;
      DestReg: TRegister;
    begin
{$ifndef i8086}
      if (cs_opt_level2 in current_settings.optimizerswitches) then
        begin
          { Saves on a lot of typecasting and potential coding mistakes }
          valuenode := tcallparanode(left).left;
          loadnode := tcallparanode(tcallparanode(left).right).left;
          opsize := def_cgsize(loadnode.resultdef);

          { BMI2 optimisations }
          if (CPUX86_HAS_BMI2 in cpu_capabilities[current_settings.cputype]) and (inlinenumber=in_and_assign_x_y) then
            begin
              { If the second operand is "((1 shl y) - 1)", we can turn it
                into a BZHI operator instead }
              if (opsize in [OS_32, OS_S32{$ifdef x86_64}, OS_64, OS_S64{$endif x86_64}]) and
                (valuenode.nodetype = subn) and
                (taddnode(valuenode).right.nodetype = ordconstn) and
                (tordconstnode(taddnode(valuenode).right).value = 1) and
                (taddnode(valuenode).left.nodetype = shln) and
                (tshlshrnode(taddnode(valuenode).left).left.nodetype = ordconstn) and
                (tordconstnode(tshlshrnode(taddnode(valuenode).left).left).value = 1) then
                begin
                  { Skip the subtract and shift nodes completely }

                  { Helps avoid all the awkward typecasts }
                  indexnode := tshlshrnode(taddnode(valuenode).left).right;
{$ifdef x86_64}
                  { The code generator sometimes extends the shift result to 64-bit unnecessarily }
                  if (indexnode.nodetype = typeconvn) and (opsize in [OS_32, OS_S32]) and
                    (def_cgsize(TTypeConvNode(indexnode).resultdef) in [OS_64, OS_S64]) then
                    begin
                      { Convert to the 32-bit type }
                      indexnode.resultdef := loadnode.resultdef;
                      node_reset_flags(indexnode,[nf_pass1_done]);

                      { We shouldn't be getting any new errors }
                      if do_firstpass(indexnode) then
                        InternalError(2022110202);

                      { Keep things internally consistent in case indexnode changed }
                      tshlshrnode(taddnode(valuenode).left).right := indexnode;
                    end;
{$endif x86_64}
                  secondpass(indexnode);
                  secondpass(loadnode);

                  { allocate registers }
                  hlcg.location_force_reg(
                    current_asmdata.CurrAsmList,
                    indexnode.location,
                    indexnode.resultdef,
                    loadnode.resultdef,
                    false
                  );

                  case loadnode.location.loc of
                    LOC_REFERENCE,
                    LOC_CREFERENCE:
                      begin
                        { BZHI can only write to a register }
                        DestReg := cg.getintregister(current_asmdata.CurrAsmList,opsize);
                        emit_reg_ref_reg(A_BZHI, TCGSize2OpSize[opsize], indexnode.location.register, loadnode.location.reference, DestReg);
                        { store the masked value back into the variable }
                        emit_reg_ref(A_MOV, TCGSize2OpSize[opsize], DestReg, loadnode.location.reference);
                      end;
                    LOC_REGISTER,
                    LOC_CREGISTER:
                      { the variable's register is both source and destination }
                      emit_reg_reg_reg(A_BZHI, TCGSize2OpSize[opsize], indexnode.location.register, loadnode.location.register, loadnode.location.register);
                    else
                      InternalError(2022102120);
                  end;

                  { fully handled, do not fall through to the generic code }
                  Exit;
                end;
            end;
        end;
{$endif not i8086}
      inherited second_AndOrXorShiftRot_assign;
    end;
  { Code generation for pi: emits FLDPI, accounts for the extra FPU stack
    entry and reports the result in the FPU result register. }
  procedure tx86inlinenode.second_pi;
    begin
      emit_none(A_FLDPI,S_NO);
      tcgx86(cg).inc_fpu_stack;

      location_reset(location,LOC_FPUREGISTER,def_cgsize(resultdef));
      location.register:=NR_FPU_RESULT_REG;
    end;
  637. { load the value of lnode into an FPU register }
  { Runs the second pass on lnode and sets this node's location to an FPU
    register location for its value.  The location is preset to
    NR_FPU_RESULT_REG.  Constant FPU registers and memory operands are loaded
    into it; MM register operands are converted via
    hlcg.location_force_fpureg.  Any other operand location is an internal
    error. }
  procedure tx86inlinenode.load_fpu_location(lnode: tnode);
    begin
      location_reset(location,LOC_FPUREGISTER,def_cgsize(resultdef));
      location.register:=NR_FPU_RESULT_REG;
      secondpass(lnode);
      case lnode.location.loc of
        LOC_FPUREGISTER:
          { no load is emitted for this case }
          ;
        LOC_CFPUREGISTER:
          cg.a_loadfpu_reg_reg(
            current_asmdata.CurrAsmList,
            lnode.location.size,
            lnode.location.size,
            lnode.location.register,
            location.register);
        LOC_REFERENCE,
        LOC_CREFERENCE:
          cg.a_loadfpu_ref_reg(
            current_asmdata.CurrAsmList,
            lnode.location.size,
            lnode.location.size,
            lnode.location.reference,
            location.register);
        LOC_MMREGISTER,
        LOC_CMMREGISTER:
          begin
            { take over the operand's location and let the code generator
              move it into an FPU register }
            location:=lnode.location;
            hlcg.location_force_fpureg(current_asmdata.CurrAsmList,location,lnode.resultdef,false);
          end;
        else
          internalerror(309991);
      end;
    end;
  { Code generation for arctan: loads the operand onto the FPU stack, pushes
    the constant 1.0 with FLD1 and emits FPATAN. }
  procedure tx86inlinenode.second_arctan_real;
    begin
      load_fpu_location(left);
      { FPATAN takes two stack operands, so supply 1.0 as the second one }
      emit_none(A_FLD1,S_NO);
      emit_none(A_FPATAN,S_NO);
    end;
  { Code generation for abs on a real.
    Vector FPU: the operand is forced into an MM register and ANDed with the
    FPC_ABSMASK_SINGLE / FPC_ABSMASK_DOUBLE constant (the three operand AVX
    form writes to a new register, the SSE form works in place).
    x87: the operand is loaded onto the FPU stack and FABS is emitted. }
  procedure tx86inlinenode.second_abs_real;

    { true when the mask symbol has to be referenced indirectly }
    function needs_indirect:boolean; inline;
      begin
        result:=(tf_supports_packages in target_info.flags) and
          (target_info.system in systems_indirect_var_imports);
      end;

    { ANDs the operand with the external mask symbol cprefix+maskname, using
      avx_op (three operands) when AVX is in use and sse_op otherwise }
    procedure and_with_mask(const maskname: string; sse_op, avx_op: TAsmOp);
      var
        maskref : treference;
        masksym : tasmsymbol;
      begin
        masksym:=current_asmdata.RefAsmSymbol(target_info.cprefix+maskname,AT_DATA,needs_indirect);
        reference_reset_symbol(maskref,masksym,0,4,[]);
        current_module.add_extern_asmsym(masksym);
        tcgx86(cg).make_simple_ref(current_asmdata.CurrAsmList, maskref);
        if UseAVX then
          current_asmdata.CurrAsmList.concat(taicpu.op_ref_reg_reg(
            avx_op,S_XMM,maskref,left.location.register,location.register))
        else
          current_asmdata.CurrAsmList.concat(taicpu.op_ref_reg(sse_op,S_XMM,maskref,location.register));
      end;

    begin
      if not use_vectorfpu(resultdef) then
        begin
          load_fpu_location(left);
          emit_none(A_FABS,S_NO);
          exit;
        end;

      secondpass(left);
      if left.location.loc<>LOC_MMREGISTER then
        hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,UseAVX);

      if not UseAVX then
        { the two operand SSE form overwrites its operand }
        location:=left.location
      else
        begin
          location_reset(location,LOC_MMREGISTER,def_cgsize(resultdef));
          location.register:=cg.getmmregister(current_asmdata.CurrAsmList,def_cgsize(resultdef));
        end;

      case tfloatdef(resultdef).floattype of
        s32real:
          and_with_mask('FPC_ABSMASK_SINGLE',A_ANDPS,A_VANDPS);
        s64real:
          and_with_mask('FPC_ABSMASK_DOUBLE',A_ANDPD,A_VANDPD);
        else
          internalerror(200506081);
      end;
    end;
  { Code generation for round.
    x86_64 with a vector FPU operand: converts the scalar in an MM register to
    a 64 bit integer register with (V)CVTSS2SI / (V)CVTSD2SI.
    Otherwise: loads the operand onto the FPU stack and stores it with FISTP
    into a temporary, which becomes the result location. }
  procedure tx86inlinenode.second_round_real;
    begin
{$ifdef x86_64}
      if use_vectorfpu(left.resultdef) then
        begin
          secondpass(left);
          hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,true);
          location_reset(location,LOC_REGISTER,OS_S64);
          location.register:=cg.getintregister(current_asmdata.CurrAsmList,OS_S64);
          case left.location.size of
            OS_F32:
              begin
                if UseAVX then
                  current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_VCVTSS2SI,S_NO,left.location.register,location.register))
                else
                  current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_CVTSS2SI,S_NO,left.location.register,location.register));
              end;
            OS_F64:
              begin
                if UseAVX then
                  current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_VCVTSD2SI,S_NO,left.location.register,location.register))
                else
                  current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_CVTSD2SI,S_NO,left.location.register,location.register));
              end;
            else
              begin
                if UseAVX then
                  internalerror(2007031402)
                else
                  internalerror(2007031404);
              end;
          end;
          exit;
        end;
{$endif x86_64}
      load_fpu_location(left);
      location_reset_ref(location,LOC_REFERENCE,OS_S64,0,[]);
      tg.GetTemp(current_asmdata.CurrAsmList,resultdef.size,resultdef.alignment,tt_normal,location.reference);
      emit_ref(A_FISTP,S_IQ,location.reference);
      { FISTP pops the operand }
      tcgx86(cg).dec_fpu_stack;
      emit_none(A_FWAIT,S_NO);
    end;
  { Code generation for trunc.  Three strategies, tried in this order:
    1. x86_64, vector FPU operand (unless it sits in an FPU register and SSE3
       is available): (V)CVTTSS2SI / (V)CVTTSD2SI into a 64 bit register.
    2. SSE3 or better: FISTTP into a temporary.
    3. Otherwise: save the FPU control word, store with FISTP while a
       modified copy (bits $0f00 set) is active, then restore the saved
       control word. }
  procedure tx86inlinenode.second_trunc_real;
    var
      saved_cw,trunc_cw : treference;
    begin
{$ifdef x86_64}
      if use_vectorfpu(left.resultdef) and
         not((left.location.loc=LOC_FPUREGISTER) and (current_settings.fputype>=fpu_sse3)) then
        begin
          secondpass(left);
          hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,true);
          location_reset(location,LOC_REGISTER,OS_S64);
          location.register:=cg.getintregister(current_asmdata.CurrAsmList,OS_S64);
          case left.location.size of
            OS_F32:
              begin
                if UseAVX then
                  current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_VCVTTSS2SI,S_NO,left.location.register,location.register))
                else
                  current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_CVTTSS2SI,S_NO,left.location.register,location.register));
              end;
            OS_F64:
              begin
                if UseAVX then
                  current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_VCVTTSD2SI,S_NO,left.location.register,location.register))
                else
                  current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_CVTTSD2SI,S_NO,left.location.register,location.register));
              end;
            else
              begin
                if UseAVX then
                  internalerror(2007031401)
                else
                  internalerror(2007031403);
              end;
          end;
          exit;
        end;
{$endif x86_64}

      if (current_settings.fputype>=fpu_sse3) then
        begin
          load_fpu_location(left);
          location_reset_ref(location,LOC_REFERENCE,OS_S64,0,[]);
          tg.GetTemp(current_asmdata.CurrAsmList,resultdef.size,resultdef.alignment,tt_normal,location.reference);
          emit_ref(A_FISTTP,S_IQ,location.reference);
          tcgx86(cg).dec_fpu_stack;
          exit;
        end;

      { two 2 byte temporaries: the control word to restore afterwards and the
        modified copy used for the store }
      tg.GetTemp(current_asmdata.CurrAsmList,2,2,tt_normal,saved_cw);
      tg.GetTemp(current_asmdata.CurrAsmList,2,2,tt_normal,trunc_cw);
{$ifdef i8086}
      if current_settings.cputype<=cpu_286 then
        begin
          emit_ref(A_FSTCW,S_NO,trunc_cw);
          emit_ref(A_FSTCW,S_NO,saved_cw);
          emit_none(A_FWAIT,S_NO);
        end
      else
{$endif i8086}
        begin
          emit_ref(A_FNSTCW,S_NO,trunc_cw);
          emit_ref(A_FNSTCW,S_NO,saved_cw);
        end;
      emit_const_ref(A_OR,S_W,$0f00,trunc_cw);
      load_fpu_location(left);
      emit_ref(A_FLDCW,S_NO,trunc_cw);
      location_reset_ref(location,LOC_REFERENCE,OS_S64,0,[]);
      tg.GetTemp(current_asmdata.CurrAsmList,resultdef.size,resultdef.alignment,tt_normal,location.reference);
      emit_ref(A_FISTP,S_IQ,location.reference);
      tcgx86(cg).dec_fpu_stack;
      { switch back to the saved control word }
      emit_ref(A_FLDCW,S_NO,saved_cw);
      emit_none(A_FWAIT,S_NO);
      tg.UnGetTemp(current_asmdata.CurrAsmList,saved_cw);
      tg.UnGetTemp(current_asmdata.CurrAsmList,trunc_cw);
    end;
  { Code generation for sqr on a real: multiplies the operand by itself.
    Vector FPU: scalar multiply into a newly allocated MM register (the AVX
    path uses the three operand form; the SSE path copies the operand into
    the result register first).
    x87: loads the operand onto the FPU stack and multiplies ST0 by ST0. }
  procedure tx86inlinenode.second_sqr_real;
    begin
      if not use_vectorfpu(resultdef) then
        begin
          load_fpu_location(left);
          emit_reg_reg(A_FMUL,S_NO,NR_ST0,NR_ST0);
          exit;
        end;

      secondpass(left);
      location_reset(location,LOC_MMREGISTER,left.location.size);
      location.register:=cg.getmmregister(current_asmdata.CurrAsmList,location.size);
      if not UseAVX then
        begin
          { an operand in an FPU register is moved to an MM register first }
          if left.location.loc in [LOC_CFPUREGISTER,LOC_FPUREGISTER] then
            hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,true);
          cg.a_loadmm_loc_reg(current_asmdata.CurrAsmList,location.size,left.location,location.register,mms_movescalar);
          cg.a_opmm_reg_reg(current_asmdata.CurrAsmList,OP_MUL,left.location.size,location.register,location.register,mms_movescalar);
        end
      else
        begin
          hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,true);
          cg.a_opmm_reg_reg_reg(current_asmdata.CurrAsmList,OP_MUL,left.location.size,left.location.register,left.location.register,location.register,mms_movescalar);
        end;
    end;
  { Code generation for sqrt on a real.
    Vector FPU: the operand is forced into an MM register and (V)SQRTSS /
    (V)SQRTSD writes the root into a newly allocated MM register.
    x87: the operand is loaded onto the FPU stack and FSQRT is emitted. }
  procedure tx86inlinenode.second_sqrt_real;
    begin
      if not use_vectorfpu(resultdef) then
        begin
          load_fpu_location(left);
          if left.location.loc=LOC_REFERENCE then
            tg.ungetiftemp(current_asmdata.CurrAsmList,left.location.reference);
          emit_none(A_FSQRT,S_NO);
          exit;
        end;

      secondpass(left);
      hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,true);
      location_reset(location,LOC_MMREGISTER,left.location.size);
      location.register:=cg.getmmregister(current_asmdata.CurrAsmList,location.size);

      { notes on the AVX forms below:
        - we use S_NO instead of S_XMM here, regardless of the register size,
          as the size of the memory location is 32/64 bit
        - using left.location.register here as 2nd parameter is crucial to
          break dependency chains }
      case tfloatdef(resultdef).floattype of
        s32real:
          begin
            if UseAVX then
              current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(A_VSQRTSS,S_NO,left.location.register,left.location.register,location.register))
            else
              current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_SQRTSS,S_NO,left.location.register,location.register));
          end;
        s64real:
          begin
            if UseAVX then
              current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(A_VSQRTSD,S_NO,left.location.register,left.location.register,location.register))
            else
              current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_SQRTSD,S_NO,left.location.register,location.register));
          end;
        else
          begin
            if UseAVX then
              internalerror(200510031)
            else
              internalerror(2005100303);
          end;
      end;
    end;
  { Generates x87 code for ln(x).

    After load_fpu_location(left) the sequence FLDLN2 / FXCH / FYL2X is
    emitted: FLDLN2 pushes ln(2), FXCH swaps it below the operand and FYL2X
    computes ST(1)*log2(ST(0)), i.e. ln(2)*log2(x) = ln(x). }
  procedure tx86inlinenode.second_ln_real;
    const
      ln_sequence : array[0..2] of TAsmOp = (A_FLDLN2,A_FXCH,A_FYL2X);
    var
      i : integer;
    begin
      load_fpu_location(left);
      for i:=low(ln_sequence) to high(ln_sequence) do
        emit_none(ln_sequence[i],S_NO);
    end;
  { Generates x87 code for cos(x) using FCOS.

    On i8086 targets FCOS is only emitted when the selected CPU is at least a
    386 (FCOS is a 387+ instruction); for older CPUs the inherited
    implementation is used instead. }
  procedure tx86inlinenode.second_cos_real;
    begin
{$ifdef i8086}
      if current_settings.cputype>=cpu_386 then
{$endif i8086}
        begin
          load_fpu_location(left);
          emit_none(A_FCOS,S_NO);
          exit;
        end;
{$ifdef i8086}
      { FCOS is 387+ }
      inherited;
{$endif i8086}
    end;
  { Generates x87 code for sin(x) using FSIN.

    On i8086 targets FSIN is only emitted when the selected CPU is at least a
    386 (FSIN is a 387+ instruction); for older CPUs the inherited
    implementation is used instead. }
  procedure tx86inlinenode.second_sin_real;
    begin
{$ifdef i8086}
      if current_settings.cputype>=cpu_386 then
{$endif i8086}
        begin
          load_fpu_location(left);
          emit_none(A_FSIN,S_NO);
          exit;
        end;
{$ifdef i8086}
      { FSIN is 387+ }
      inherited;
{$endif i8086}
    end;
  { Generates code for the prefetch intrinsic.

    If the operand ends up in memory (LOC_REFERENCE/LOC_CREFERENCE) its
    address is loaded into an integer register and a PREFETCHNTA on that
    address is emitted; for any other operand location nothing is generated.
    On i386/i8086 targets code is only emitted when the selected CPU is at
    least a Pentium 3.

    Pointer checking (cs_checkpointer) is switched off on the operand subtree
    while it is processed, because the operand is only prefetched and never
    actually dereferenced; afterwards the switch is turned back on. }
  procedure tx86inlinenode.second_prefetch;
    var
      ref : treference;
      r : tregister;
      checkpointer_used : boolean;
    begin
{$if defined(i386) or defined(i8086)}
      if current_settings.cputype>=cpu_Pentium3 then
{$endif i386 or i8086}
        begin
          { do not call Checkpointer for left node }
          checkpointer_used:=(cs_checkpointer in current_settings.localswitches);
          if checkpointer_used then
            node_change_local_switch(left,cs_checkpointer,false);
          secondpass(left);
          { re-enable the switch that was disabled above; the original code
            passed false here as well and therefore never restored it }
          if checkpointer_used then
            node_change_local_switch(left,cs_checkpointer,true);
          case left.location.loc of
            LOC_CREFERENCE,
            LOC_REFERENCE:
              begin
                r:=cg.getintregister(current_asmdata.CurrAsmList,OS_ADDR);
                cg.a_loadaddr_ref_reg(current_asmdata.CurrAsmList,left.location.reference,r);
                reference_reset_base(ref,r,0,left.location.reference.temppos,left.location.reference.alignment,left.location.reference.volatility);
                current_asmdata.CurrAsmList.concat(taicpu.op_ref(A_PREFETCHNTA,S_NO,ref));
              end;
            else
              { nothing to prefetch };
          end;
        end;
    end;
  { Generates branch-free code for abs() on an integer operand.

    Without CMOV (only possible on i8086/i386 targets) the classic
    sign-mask sequence is used: mask := x sar (bits-1); result := (x xor mask) - mask.
    With CMOV a negated copy is computed with NEG and conditionally moved
    into the result when the negation is not negative (condition NS).

    If overflow checking is enabled, a call to FPC_OVERFLOW is emitted which
    is skipped when the overflow flag is clear. }
  procedure tx86inlinenode.second_abs_long;
    var
      negreg : tregister;
      opsize : tcgsize;
      cmov : taicpu;

    { Emits "jump if no overflow over a call to FPC_OVERFLOW" when overflow
      checking is enabled; emits nothing otherwise. }
    procedure maybe_emit_overflow_check;
      var
        no_overflow : TAsmLabel;
      begin
        if not(cs_check_overflow in current_settings.localswitches) then
          exit;
        current_asmdata.getjumplabel(no_overflow);
        cg.a_jmp_flags(current_asmdata.CurrAsmList,F_NO,no_overflow);
        cg.a_call_name(current_asmdata.CurrAsmList,'FPC_OVERFLOW',false);
        cg.a_label(current_asmdata.CurrAsmList,no_overflow);
      end;

    begin
      { common to both variants }
      opsize:=def_cgsize(left.resultdef);
      secondpass(left);
{$if defined(i8086) or defined(i386)}
      if not(CPUX86_HAS_CMOV in cpu_capabilities[current_settings.cputype]) then
        begin
          hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,left.resultdef,false);
          location:=left.location;
          location.register:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
          { result := x }
          cg.a_load_reg_reg(current_asmdata.CurrAsmList,opsize,opsize,left.location.register,location.register);
          { the operand register becomes the sign mask (all ones if x<0, else zero) }
          cg.a_op_const_reg(current_asmdata.CurrAsmList,OP_SAR,opsize,tcgsize2size[opsize]*8-1,left.location.register);
          { result := (x xor mask) - mask }
          cg.a_op_reg_reg(current_asmdata.CurrAsmList,OP_XOR,opsize,left.location.register,location.register);
          cg.a_op_reg_reg(current_asmdata.CurrAsmList,OP_SUB,opsize,left.location.register,location.register);
          maybe_emit_overflow_check;
        end
      else
{$endif i8086 or i386}
        begin
          hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,left.resultdef,true);
          negreg:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
          location:=left.location;
          location.register:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
          { both the scratch register and the result start out as x }
          cg.a_load_reg_reg(current_asmdata.CurrAsmList,opsize,opsize,left.location.register,negreg);
          cg.a_load_reg_reg(current_asmdata.CurrAsmList,opsize,opsize,left.location.register,location.register);
          { the flags are live from the NEG up to the CMOV }
          cg.a_reg_alloc(current_asmdata.CurrAsmList, NR_DEFAULTFLAGS);
          emit_reg(A_NEG,tcgsize2opsize[opsize],negreg);
          maybe_emit_overflow_check;
          { take -x if it is not negative }
          cmov:=taicpu.op_reg_reg(A_CMOVcc,tcgsize2opsize[opsize],negreg,location.register);
          cmov.condition:=C_NS;
          cg.a_reg_dealloc(current_asmdata.CurrAsmList, NR_DEFAULTFLAGS);
          current_asmdata.CurrAsmList.concat(cmov);
        end;
    end;
  1023. {*****************************************************************************
  1024. INCLUDE/EXCLUDE GENERIC HANDLING
  1025. *****************************************************************************}
  { Generates code for include(set,element) and exclude(set,element).

    A constant element is turned into an OR (include) or AND-NOT (exclude)
    with a precomputed bit mask.  A variable element is forced into a
    register, adjusted for the set base and applied with BTS (include) or
    BTR (exclude).  8 bit sets are handled with 32 bit operations in the
    variable case.

    On i8086 targets the inherited implementation is used for CPUs older
    than the 386, as BTS/BTR are 386+ instructions. }
  procedure tx86inlinenode.second_IncludeExclude;
    var
      setnode,
      elemnode : tnode;
      bitreg,
      setreg : tregister;
      setbase : aint;
      bitsperop,
      mask : longint;
      cgop : topcg;
      asmop : tasmop;
      opdef : tdef;
      opsize,
      orgsize : tcgsize;
      widened : boolean;
    begin
{$ifdef i8086}
      { BTS and BTR are 386+ }
      if current_settings.cputype < cpu_386 then
        begin
          inherited;
          exit;
        end;
{$endif i8086}
      { first parameter: the set, second parameter: the element }
      setnode:=tcallparanode(left).left;
      elemnode:=tcallparanode(tcallparanode(left).right).left;

      if is_smallset(tcallparanode(left).resultdef) then
        begin
          opdef:=tcallparanode(left).resultdef;
          opsize:=int_cgsize(opdef.size)
        end
      else
        begin
          opdef:=u32inttype;
          opsize:=OS_32;
        end;
      bitsperop:=(8*tcgsize2size[opsize]);

      secondpass(setnode);
      secondpass(elemnode);
      setbase:=tsetdef(setnode.resultdef).setbase;

      if elemnode.location.loc=LOC_CONSTANT then
        begin
          { bit inside the operand that represents the element }
          mask:=1 shl ((elemnode.location.value-setbase) mod bitsperop);
          { include sets the bit, exclude clears it }
          if inlinenumber=in_include_x_y then
            cgop:=OP_OR
          else
            begin
              cgop:=OP_AND;
              mask:=not(mask);
            end;
          case setnode.location.loc of
            LOC_REFERENCE :
              begin
                { move the reference to the operand that contains the bit }
                inc(setnode.location.reference.offset,
                  ((elemnode.location.value-setbase) div bitsperop)*tcgsize2size[opsize]);
                cg.a_op_const_ref(current_asmdata.CurrAsmList,cgop,opsize,mask,setnode.location.reference);
              end;
            LOC_CSUBSETREG,
            LOC_CREGISTER :
              hlcg.a_op_const_loc(current_asmdata.CurrAsmList,cgop,setnode.resultdef,mask,setnode.location);
            else
              internalerror(200405022);
          end;
        end
      else
        begin
          { the bit test instructions are not used with 8 bit operands }
          orgsize:=opsize;
          widened:=orgsize in [OS_8,OS_S8];
          if widened then
            begin
              opdef:=u32inttype;
              opsize:=OS_32;
            end;
          { determine asm operator }
          if inlinenumber<>in_include_x_y then
            asmop:=A_BTR
          else
            asmop:=A_BTS;

          hlcg.location_force_reg(current_asmdata.CurrAsmList,elemnode.location,elemnode.resultdef,opdef,true);
          register_maybe_adjust_setbase(current_asmdata.CurrAsmList,elemnode.resultdef,elemnode.location,setbase);
          bitreg:=elemnode.location.register;

          if (setnode.location.loc=LOC_REFERENCE) then
            emit_reg_ref(asmop,tcgsize2opsize[opsize],bitreg,setnode.location.reference)
          else
            begin
              { second argument can't be an 8 bit register either }
              setreg:=setnode.location.register;
              if widened then
                setreg:=cg.makeregsize(current_asmdata.CurrAsmList,setreg,opsize);
              emit_reg_reg(asmop,tcgsize2opsize[opsize],bitreg,setreg);
            end;
        end;
    end;
  { Generates a POPCNT instruction for the popcnt intrinsic.

    The operand is used in place when it already is a register or memory
    location of the operation size; otherwise it is forced into a register of
    that size.  8 bit operands are counted with the 16 bit form, and a one
    byte result is obtained by resizing the result register afterwards. }
  procedure tx86inlinenode.second_popcnt;
    var
      opsize : tcgsize;
      operand_usable : boolean;
    begin
      secondpass(left);

      { there is no 8 bit popcnt }
      opsize:=tcgsize2unsigned[left.location.size];
      if opsize=OS_8 then
        opsize:=OS_16;

      operand_usable:=(left.location.loc in [LOC_REGISTER,LOC_CREGISTER,LOC_REFERENCE,LOC_CREFERENCE]) and
        (left.location.size=opsize);
      if not operand_usable then
        hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,cgsize_orddef(opsize),true);

      location_reset(location,LOC_REGISTER,opsize);
      location.register:=cg.getintregister(current_asmdata.CurrAsmList,opsize);

      case left.location.loc of
        LOC_REGISTER,
        LOC_CREGISTER:
          emit_reg_reg(A_POPCNT,TCGSize2OpSize[opsize],left.location.register,location.register);
        else
          emit_ref_reg(A_POPCNT,TCGSize2OpSize[opsize],left.location.reference,location.register);
      end;

      { narrow the result register for byte sized results }
      if resultdef.size=1 then
        begin
          location.size:=OS_8;
          location.register:=cg.makeregsize(current_asmdata.CurrAsmList,location.register,location.size);
        end;
    end;
  { Generates a scalar fused multiply-add for fma(a,b,c) = a*b+c.

    Unary minus nodes directly on the parameters are not evaluated; they are
    folded into the choice of instruction instead (VFMADD, VFMSUB, VFNMADD or
    VFNMSUB).  At most one operand is used as memory operand, all others are
    forced into scalar MM registers.  The instruction table is indexed by
    [product negated, third operand negated, float type, variant], where
    variants 0..2 select the 231 form and variant 3 the 213 form.

    Raises internalerror 2014032301 when the selected FPU has neither FMA nor
    FMA4 (and always on i8086 targets). }
  procedure tx86inlinenode.second_fma;
{$ifndef i8086}
    const
      fmaops : array[false..true,false..true,s32real..s64real,0..3] of TAsmOp =
        (
          { positive product }
          (
            { positive third operand }
            ((A_VFMADD231SS,A_VFMADD231SS,A_VFMADD231SS,A_VFMADD213SS),
             (A_VFMADD231SD,A_VFMADD231SD,A_VFMADD231SD,A_VFMADD213SD)
            ),
            { negative third operand }
            ((A_VFMSUB231SS,A_VFMSUB231SS,A_VFMSUB231SS,A_VFMSUB213SS),
             (A_VFMSUB231SD,A_VFMSUB231SD,A_VFMSUB231SD,A_VFMSUB213SD)
            )
          ),
          { negative product }
          (
            { positive third operand }
            ((A_VFNMADD231SS,A_VFNMADD231SS,A_VFNMADD231SS,A_VFNMADD213SS),
             (A_VFNMADD231SD,A_VFNMADD231SD,A_VFNMADD231SD,A_VFNMADD213SD)
            ),
            { negative third operand }
            ((A_VFNMSUB231SS,A_VFNMSUB231SS,A_VFNMSUB231SS,A_VFNMSUB213SS),
             (A_VFNMSUB231SD,A_VFNMSUB231SD,A_VFNMSUB231SD,A_VFNMSUB213SD)
            )
          )
        );
    var
      operands : array[1..3] of tnode;
      memidx,
      regsrc,
      i : integer;
      neg_addend,
      neg_product,
      havemem : boolean;
{$endif i8086}
    begin
{$ifndef i8086}
      if (fpu_capabilities[current_settings.fputype]*[FPUX86_HAS_FMA,FPUX86_HAS_FMA4])<>[] then
        begin
          { the parameter list is walked from its head to fill slots 3, 2, 1 }
          operands[1]:=tcallparanode(tcallparanode(tcallparanode(parameters).nextpara).nextpara).paravalue;
          operands[2]:=tcallparanode(tcallparanode(parameters).nextpara).paravalue;
          operands[3]:=tcallparanode(parameters).paravalue;

          { Strip a unary minus from each operand and remember its effect:
            a negated factor flips the sign of the product, a negated third
            operand turns the addition into a subtraction.  The skipped unary
            minus nodes are not released here; they stay in the tree and are
            freed together with the other nodes, only no code is generated
            for them. }
          neg_addend:=false;
          neg_product:=false;
          for i:=1 to 3 do
            if operands[i].nodetype=unaryminusn then
              begin
                operands[i]:=tunarynode(operands[i]).left;
                if i=3 then
                  neg_addend:=true
                else
                  neg_product:=not(neg_product);
              end;

          for i:=1 to 3 do
            secondpass(operands[i]);

          { only one memory operand is allowed }
          havemem:=false;
          memidx:=0;
          { in case parameters come on the FPU stack, we have to pop them in reverse order as we
            called secondpass }
          for i:=3 downto 1 do
            begin
              if (operands[i].location.loc in [LOC_REFERENCE,LOC_CREFERENCE]) and not(havemem) then
                begin
                  memidx:=i;
                  havemem:=true;
                end
              else if not(operands[i].location.loc in [LOC_MMREGISTER,LOC_CMMREGISTER]) then
                hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,operands[i].location,operands[i].resultdef,true);
            end;

          location_reset(location,LOC_MMREGISTER,operands[1].location.size);
          location.register:=cg.getmmregister(current_asmdata.CurrAsmList,location.size);

          if havemem then
            begin
              case memidx of
                1,2:
                  begin
                    { one factor in memory: destination starts as the third
                      operand, the other factor is the register source }
                    hlcg.a_loadmm_reg_reg(current_asmdata.CurrAsmList,operands[3].resultdef,resultdef,
                      operands[3].location.register,location.register,mms_movescalar);
                    emit_ref_reg_reg(fmaops[neg_product,neg_addend,tfloatdef(resultdef).floattype,memidx],S_NO,
                      operands[memidx].location.reference,operands[3-memidx].location.register,location.register);
                  end;
                3:
                  begin
                    { third operand in memory: destination starts as the first factor }
                    hlcg.a_loadmm_reg_reg(current_asmdata.CurrAsmList,operands[1].resultdef,resultdef,
                      operands[1].location.register,location.register,mms_movescalar);
                    emit_ref_reg_reg(fmaops[neg_product,neg_addend,tfloatdef(resultdef).floattype,memidx],S_NO,
                      operands[3].location.reference,operands[2].location.register,location.register);
                  end
                else
                  internalerror(2014041301);
              end;
            end
          else
            begin
              { try to use the location which is already in a temp. mm register as destination,
                so the compiler might be able to re-use the register }
              if operands[1].location.loc=LOC_MMREGISTER then
                regsrc:=1
              else if operands[2].location.loc=LOC_MMREGISTER then
                regsrc:=2
              else
                regsrc:=0;

              if regsrc<>0 then
                begin
                  { destination starts as the temporary factor, the other
                    factor and the third operand are the sources }
                  hlcg.a_loadmm_reg_reg(current_asmdata.CurrAsmList,operands[regsrc].resultdef,resultdef,
                    operands[regsrc].location.register,location.register,mms_movescalar);
                  emit_reg_reg_reg(fmaops[neg_product,neg_addend,tfloatdef(resultdef).floattype,3],S_NO,
                    operands[3].location.register,operands[3-regsrc].location.register,location.register);
                end
              else
                begin
                  { destination starts as the third operand }
                  hlcg.a_loadmm_reg_reg(current_asmdata.CurrAsmList,operands[3].resultdef,resultdef,
                    operands[3].location.register,location.register,mms_movescalar);
                  emit_reg_reg_reg(fmaops[neg_product,neg_addend,tfloatdef(resultdef).floattype,0],S_NO,
                    operands[1].location.register,operands[2].location.register,location.register);
                end;
            end;
        end
      else
{$endif i8086}
        internalerror(2014032301);
    end;
  { Generates vector FPU code for frac(x).

    The fractional part is computed as x minus x rounded with rounding
    immediate 3 (truncation), using (V)ROUNDSS/SD followed by (V)SUBSS/SD.
    With AVX-512DQ a single VREDUCESS/VREDUCESD with immediate 3 is emitted
    instead.  If operand and result float types differ, the result is
    converted at the end.

    Raises internalerror 2017052101 when the result type is not handled by
    the vector FPU. }
  procedure tx86inlinenode.second_frac_real;
    var
      truncreg : TRegister;
      roundop,
      subop : TAsmOp;
{$ifndef i8086}
      reduceop : TAsmOp;
{$endif not i8086}
    begin
      if not use_vectorfpu(resultdef) then
        internalerror(2017052101);

      secondpass(left);
      hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,true);
      location_reset(location,LOC_MMREGISTER,def_cgsize(resultdef));
      location.register:=cg.getmmregister(current_asmdata.CurrAsmList,location.size);

      roundop:=A_NONE;
      subop:=A_NONE;
      if UseAVX then
        begin
{$ifndef i8086}
          reduceop:=A_NONE;
{$endif not i8086}
          case tfloatdef(left.resultdef).floattype of
            s32real:
              begin
                roundop:=A_VROUNDSS;
                subop:=A_VSUBSS;
{$ifndef i8086}
                reduceop:=A_VREDUCESS;
{$endif not i8086}
              end;
            s64real:
              begin
                roundop:=A_VROUNDSD;
                subop:=A_VSUBSD;
{$ifndef i8086}
                reduceop:=A_VREDUCESD;
{$endif not i8086}
              end;
            else
              internalerror(2017052102);
          end;
{$ifndef i8086}
          if UseAVX512 and (FPUX86_HAS_AVX512DQ in fpu_capabilities[current_settings.fputype]) then
            current_asmdata.CurrAsmList.concat(taicpu.op_const_reg_reg_reg(reduceop,S_NO,3,left.location.register,left.location.register,location.register))
          else
{$endif not i8086}
            begin
              { using left.location.register here as 3rd parameter is crucial to break dependency chains }
              current_asmdata.CurrAsmList.concat(taicpu.op_const_reg_reg_reg(roundop,S_NO,3,left.location.register,left.location.register,location.register));
              { result := x - trunc(x) }
              current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(subop,S_NO,location.register,left.location.register,location.register));
            end;
        end
      else
        begin
          { two-operand SSE form: the result register starts as x and the
            truncated value is computed into a scratch register }
          truncreg:=cg.getmmregister(current_asmdata.CurrAsmList,location.size);
          cg.a_loadmm_loc_reg(current_asmdata.CurrAsmList,location.size,left.location,location.register,mms_movescalar);
          case tfloatdef(left.resultdef).floattype of
            s32real:
              begin
                roundop:=A_ROUNDSS;
                subop:=A_SUBSS;
              end;
            s64real:
              begin
                roundop:=A_ROUNDSD;
                subop:=A_SUBSD;
              end;
            else
              internalerror(2017052103);
          end;
          current_asmdata.CurrAsmList.concat(taicpu.op_const_reg_reg(roundop,S_NO,3,left.location.register,truncreg));
          current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(subop,S_NO,truncreg,location.register));
        end;

      { convert when operand and result differ in float type }
      if tfloatdef(left.resultdef).floattype<>tfloatdef(resultdef).floattype then
        hlcg.a_loadmm_reg_reg(current_asmdata.CurrAsmList,left.resultdef,resultdef,location.register,location.register,mms_movescalar);
    end;
  { Generates vector FPU code for int(x).

    The operand is forced into a scalar MM register and rounded into a newly
    allocated result register with (V)ROUNDSS/(V)ROUNDSD and rounding
    immediate 3 (truncation).

    Raises internalerror 2017052107 when the result type is not handled by
    the vector FPU. }
  procedure tx86inlinenode.second_int_real;
    var
      roundop : TAsmOp;
    begin
      if not use_vectorfpu(resultdef) then
        internalerror(2017052107);

      secondpass(left);
      hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,true);
      location_reset(location,LOC_MMREGISTER,left.location.size);
      location.register:=cg.getmmregister(current_asmdata.CurrAsmList,location.size);

      roundop:=A_NONE;
      if UseAVX then
        begin
          case tfloatdef(resultdef).floattype of
            s32real:
              roundop:=A_VROUNDSS;
            s64real:
              roundop:=A_VROUNDSD;
            else
              internalerror(2017052105);
          end;
          { using left.location.register here as 3rd parameter is crucial to break dependency chains }
          current_asmdata.CurrAsmList.concat(taicpu.op_const_reg_reg_reg(roundop,S_NO,3,left.location.register,left.location.register,location.register));
        end
      else
        begin
          case tfloatdef(resultdef).floattype of
            s32real:
              roundop:=A_ROUNDSS;
            s64real:
              roundop:=A_ROUNDSD;
            else
              internalerror(2017052106);
          end;
          current_asmdata.CurrAsmList.concat(taicpu.op_const_reg_reg(roundop,S_NO,3,left.location.register,location.register));
        end;
    end;
  { Generates code for high() on a dynamic array.

    The array pointer is decremented by one with an explicit SUB.  For a nil
    array this yields -1 and sets the carry flag, so the load of the stored
    value is skipped and -1 is the result.  Otherwise the value is loaded
    from the array header through the decremented pointer, with the offset
    corrected by +1.

    Raises internalerror 2019122809 when the operand is not a dynamic array. }
  procedure tx86inlinenode.second_high;
    var
      nil_done : tasmlabel;
      highreg : tregister;
      highref : treference;
      ressize : tcgsize;
    begin
      secondpass(left);
      if not(is_dynamic_array(left.resultdef)) then
        Internalerror(2019122809);
      ressize:=def_cgsize(resultdef);

      { the value to load is stored in front of the array data at offset -sizeof(pint) }
      hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,left.resultdef,false);
      current_asmdata.getjumplabel(nil_done);

      { by subtracting 1 here, we get the -1 into the register we need if the dyn. array is nil and the carry
        flag is set in this case, so we can jump depending on it

        when loading the actual high value, we have to take care later of the decreased value

        do not use the cgs, as they might emit dec instead of a sub instruction, however with dec the trick
        we are using is not working as dec does not touch the carry flag }
      current_asmdata.CurrAsmList.concat(taicpu.op_const_reg(A_SUB,TCGSize2OpSize[def_cgsize(left.resultdef)],1,left.location.register));
      cg.a_jmp_flags(current_asmdata.CurrAsmList,F_C,nil_done);

      { the +1 compensates for the decrement above; the reference is created
        without volatility flags, as volatility of the dyn. array refers to
        the array pointer and not to the array data }
      hlcg.reference_reset_base(highref,left.resultdef,left.location.register,-ossinttype.size+1,ctempposinvalid,ossinttype.alignment,[]);

      { reuse the register that held the array pointer for the result, so
        that the -1 computed for a nil array is kept when the load is skipped }
      highreg:=cg.makeregsize(current_asmdata.CurrAsmList,left.location.register,ressize);
      location_reset(location,LOC_REGISTER,ressize);
      location.register:=highreg;

      hlcg.a_load_ref_reg(current_asmdata.CurrAsmList,ossinttype,resultdef,highref,highreg);
      cg.a_label(current_asmdata.CurrAsmList,nil_done);
    end;
  { Generates scalar SSE/AVX code for the min/max intrinsics on single and
    double operands.

    The instruction table is indexed by [is max, use AVX, float type].  At
    most one operand is used as memory operand.  Unless fast math is enabled,
    the first operand is never used as the memory operand, because the
    instructions are not commutative when a NaN is involved.

    Raises internalerror 2020120503 for unsupported result types or FPU
    settings (and always on i8086 targets). }
  procedure tx86inlinenode.second_minmax;
{$ifndef i8086}
    const
      minmaxops : array[false..true,false..true,s32real..s64real] of TAsmOp =
        (
          (
            (A_MINSS,A_MINSD),
            (A_VMINSS,A_VMINSD)
          ),
          (
            (A_MAXSS,A_MAXSD),
            (A_VMAXSS,A_VMAXSD)
          )
        );
    var
      operands : array[1..2] of tnode;
      memidx,
      i : integer;
      havemem : boolean;
      minmaxop : TAsmOp;
{$endif i8086}
    begin
{$ifndef i8086}
      if
{$ifdef i386}
        ((current_settings.fputype>=fpu_sse) and is_single(resultdef)) or
        ((current_settings.fputype>=fpu_sse2) and is_double(resultdef))
{$else i386}
        is_single(resultdef) or is_double(resultdef)
{$endif i386}
      then
        begin
          operands[1]:=tcallparanode(tcallparanode(parameters).nextpara).paravalue;
          operands[2]:=tcallparanode(parameters).paravalue;

          for i:=low(operands) to high(operands) do
            secondpass(operands[i]);

          { only one memory operand is allowed }
          havemem:=false;
          memidx:=0;
          for i:=low(operands) to high(operands) do
            begin
              if (operands[i].location.loc in [LOC_REFERENCE,LOC_CREFERENCE]) and not(havemem) then
                begin
                  memidx:=i;
                  havemem:=true;
                end
              else if not(operands[i].location.loc in [LOC_MMREGISTER,LOC_CMMREGISTER]) then
                hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,operands[i].location,operands[i].resultdef,true);
            end;

          { due to min/max behaviour that it loads always the second operand (must be the else assignment) into destination if
            one of the operands is a NaN, we cannot swap operands to omit a mova operation in case fastmath is off }
          if not(cs_opt_fastmath in current_settings.optimizerswitches) and havemem and (memidx=1) then
            begin
              hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,operands[1].location,operands[1].resultdef,true);
              havemem:=false;
            end;

          minmaxop:=minmaxops[inlinenumber in [in_max_single,in_max_double],UseAVX,tfloatdef(resultdef).floattype];

          location_reset(location,LOC_MMREGISTER,operands[1].location.size);
          location.register:=cg.getmmregister(current_asmdata.CurrAsmList,location.size);

          if havemem then
            begin
              if UseAVX then
                begin
                  { three operand form: memory operand, the other operand, destination }
                  case memidx of
                    1,2:
                      emit_ref_reg_reg(minmaxop,S_NO,
                        operands[memidx].location.reference,operands[3-memidx].location.register,location.register);
                    else
                      internalerror(2020120504);
                  end;
                end
              else
                begin
                  { two operand form: the destination starts as the register operand }
                  case memidx of
                    1,2:
                      begin
                        hlcg.a_loadmm_reg_reg(current_asmdata.CurrAsmList,operands[3-memidx].resultdef,resultdef,
                          operands[3-memidx].location.register,location.register,mms_movescalar);
                        emit_ref_reg(minmaxop,S_NO,
                          operands[memidx].location.reference,location.register);
                      end;
                    else
                      internalerror(2020120601);
                  end;
                end;
            end
          else
            begin
              if not UseAVX then
                begin
                  { two operand form: the destination starts as the first operand }
                  hlcg.a_loadmm_reg_reg(current_asmdata.CurrAsmList,operands[1].resultdef,resultdef,
                    operands[1].location.register,location.register,mms_movescalar);
                  emit_reg_reg(minmaxop,S_NO,
                    operands[2].location.register,location.register)
                end
              else
                emit_reg_reg_reg(minmaxop,S_NO,
                  operands[2].location.register,operands[1].location.register,location.register);
            end;
        end
      else
{$endif i8086}
        internalerror(2020120503);
    end;
  1534. end.