nx86add.pas 84 KB


  1. {
  2. Copyright (c) 2000-2002 by Florian Klaempfl
  3. Common code generation for add nodes on the i386 and x86
  4. This program is free software; you can redistribute it and/or modify
  5. it under the terms of the GNU General Public License as published by
  6. the Free Software Foundation; either version 2 of the License, or
  7. (at your option) any later version.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. GNU General Public License for more details.
  12. You should have received a copy of the GNU General Public License
  13. along with this program; if not, write to the Free Software
  14. Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  15. ****************************************************************************
  16. }
  17. unit nx86add;
  18. {$i fpcdefs.inc}
  19. interface
  20. uses
  21. symtype,
  22. cgbase,
  23. cpubase,
  24. node,nadd,ncgadd;
  25. type
  { Add node (binary operator node) shared by the i386 and x86 code generators.
    Derives from tcgaddnode and overrides its second_* code generation hooks. }
  26. tx86addnode = class(tcgaddnode)
  27. protected
  { Map the comparison stored in nodetype (taking nf_swapped into account) to
    the CPU flags to test: getresflags for integer compares (signed or
    unsigned conditions), getfpuresflags for floating point compares. }
  28. function getresflags(unsigned : boolean) : tresflags;
  29. function getfpuresflags : tresflags;
  { Operand preparation helpers: bring left/right into the location kinds the
    following instruction needs; they may toggle nf_swapped. }
  30. procedure left_must_be_reg(opdef: tdef; opsize:TCGSize;noswap:boolean);
  31. procedure force_left_and_right_fpureg;
  32. procedure prepare_x87_locations(out refnode: tnode);
  { Instruction emission helpers: emit_op_right_left emits "op right,left" for
    whatever location right has; emit_generic_code additionally applies
    constant shortcuts and emits the optional overflow check. }
  33. procedure emit_op_right_left(op:TAsmOp;opsize:TCgSize;AllocFlags:boolean);
  34. procedure emit_generic_code(op:TAsmOp;opsize:TCgSize;unsigned,extra_not,mboverflow:boolean);
  { NOTE(review): judging by the names these are the vector compare and the
    SSE/AVX float add paths; confirm against the implementation section. }
  35. procedure second_cmpfloatvector;
  36. procedure second_addfloatsse;
  37. procedure second_addfloatavx;
  38. public
  39. function pass_1 : tnode;override;
  40. function simplify(forinline : boolean) : tnode; override;
  41. function use_fma : boolean;override;
  42. procedure second_addfloat;override;
  { the small set add path is not compiled for the i8086 target }
  43. {$ifndef i8086}
  44. procedure second_addsmallset;override;
  45. {$endif not i8086}
  46. procedure second_add64bit;override;
  47. procedure second_cmpfloat;override;
  48. procedure second_cmpsmallset;override;
  49. procedure second_cmp64bit;override;
  50. procedure second_cmpordinal;override;
  51. procedure second_addordinal;override;
  52. procedure second_addboolean;override;
  { MMX operations are only available when the compiler is built with SUPPORT_MMX }
  53. {$ifdef SUPPORT_MMX}
  54. procedure second_opmmx;override;
  55. {$endif SUPPORT_MMX}
  56. procedure second_opvector;override;
  57. end;
  58. implementation
  59. uses
  60. globtype,globals,
  61. verbose,cutils,compinnr,
  62. cpuinfo,
  63. aasmbase,aasmdata,aasmcpu,
  64. symconst,symdef,
  65. cgobj,hlcgobj,cgx86,cga,cgutils,
  66. tgobj,ncgutil,
  67. ncon,nset,ninl,ncnv,ncal,nmat,
  68. defutil,defcmp,constexp,
  69. htypechk;
  70. { Range check must be disabled explicitly as the code serves
  71. on three different architecture sizes }
  72. {$R-}
  73. {*****************************************************************************
  74. Helpers
  75. *****************************************************************************}
  { Emits the instruction(s) for the binary integer operation "left := left op right".

    op         - x86 instruction to emit (A_ADD, A_SUB, A_AND, A_CMP, A_IMUL, ...)
    opsize     - operand size of the operation
    unsigned   - selects the flag tested by the overflow check
                 (F_AE for unsigned, F_NO for signed operations)
    extra_not  - one operand is inverted with NOT before the operation
                 (used by the small set difference, i.e. "a and not b")
    mboverflow - the operation may need an overflow check; the check is only
                 emitted when needoverflowcheck is true as well

    On entry left.location is expected to be a LOC_REGISTER. The routine may swap
    left.location and right.location (toggling nf_swapped) and may replace
    left.location.register; the result ends up in left.location.register. }
  procedure tx86addnode.emit_generic_code(op:TAsmOp;opsize:TCGSize;unsigned,extra_not,mboverflow:boolean);
    var
      power : longint;
      hl4 : tasmlabel;
      r : Tregister;
      href : treference;
      overflowcheck: boolean;
      comparison: boolean;
    begin
      overflowcheck:=needoverflowcheck;
      { for these operations the flags are the (or part of the) result, so the
        flags register is marked as allocated before the instruction }
      comparison:=
        (op=A_CMP) or (op=A_TEST) or (op=A_BT) or is_boolean(resultdef);
      { at this point, left.location.loc should be LOC_REGISTER }
      if right.location.loc=LOC_REGISTER then
        begin
          { right.location is a LOC_REGISTER }
          { when swapped another result register }
          if (nodetype=subn) and (nf_swapped in flags) then
            begin
              if extra_not then
                emit_reg(A_NOT,TCGSize2Opsize[opsize],left.location.register);
              emit_reg_reg(op,TCGSize2Opsize[opsize],left.location.register,right.location.register);
              { newly swapped also set swapped flag }
              location_swap(left.location,right.location);
              toggleflag(nf_swapped);
            end
          else
            begin
              if extra_not then
                emit_reg(A_NOT,TCGSize2Opsize[opsize],right.location.register);
              { for these operations the locations are exchanged first, so the
                result is computed into the register that was right's }
              if (op=A_ADD) or (op=A_OR) or (op=A_AND) or (op=A_XOR) or (op=A_IMUL) then
                location_swap(left.location,right.location);
              if comparison then
                cg.a_reg_alloc(current_asmdata.CurrAsmList, NR_DEFAULTFLAGS);
              emit_reg_reg(op,TCGSize2Opsize[opsize],right.location.register,left.location.register);
            end;
        end
      else
        begin
          { right.location is not a LOC_REGISTER }
          if (nodetype=subn) and (nf_swapped in flags) then
            begin
              if extra_not then
                cg.a_op_reg_reg(current_asmdata.CurrAsmList,OP_NOT,opsize,left.location.register,left.location.register);
              { compute "right op left" in a scratch register and move the
                result back into left's register }
              r:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
              hlcg.a_load_loc_reg(current_asmdata.CurrAsmList,right.resultdef,cgsize_orddef(opsize),right.location,r);
              if comparison then
                cg.a_reg_alloc(current_asmdata.CurrAsmList, NR_DEFAULTFLAGS);
              emit_reg_reg(op,TCGSize2Opsize[opsize],left.location.register,r);
              cg.a_load_reg_reg(current_asmdata.CurrAsmList,opsize,opsize,r,left.location.register);
            end
          else
            begin
              { Optimizations when right.location is a constant value }
              if (op=A_CMP) and
                 (nodetype in [equaln,unequaln]) and
                 (right.location.loc=LOC_CONSTANT) and
                 (right.location.value=0) then
                begin
                  { 'test $-1,%reg' is transformable into 'test $-1,spilltemp' if %reg needs
                    spilling, while 'test %reg,%reg' still requires loading into register.
                    If spilling is not necessary, it is changed back into 'test %reg,%reg' by
                    peephole optimizer (this optimization is currently available only for i386). }
                  cg.a_reg_alloc(current_asmdata.CurrAsmList,NR_DEFAULTFLAGS);
{$ifdef i386}
                  emit_const_reg(A_TEST,TCGSize2Opsize[opsize],aint(-1),left.location.register)
{$else i386}
                  emit_reg_reg(A_TEST,TCGSize2Opsize[opsize],left.location.register,left.location.register);
{$endif i386}
                end
              else
                if (op=A_ADD) and
                   (right.location.loc=LOC_CONSTANT) and
                   (right.location.value=1) and
                   not overflowcheck and
                   UseIncDec then
                  begin
                    emit_reg(A_INC,TCGSize2Opsize[opsize],left.location.register);
                  end
              else
                if (op=A_SUB) and
                   (right.location.loc=LOC_CONSTANT) and
                   (right.location.value=1) and
                   not overflowcheck and
                   UseIncDec then
                  begin
                    emit_reg(A_DEC,TCGSize2Opsize[opsize],left.location.register);
                  end
              else
                { multiplication by a power of two becomes a shift. This is only
                  valid without overflow checking: the overflow check emitted
                  below tests the flags as IMUL sets them, and SHL does not
                  report a multiplication overflow the same way }
                if (op=A_IMUL) and
                   (right.location.loc=LOC_CONSTANT) and
                   (ispowerof2(int64(right.location.value),power)) and
                   not overflowcheck then
                  begin
                    emit_const_reg(A_SHL,TCGSize2Opsize[opsize],power,left.location.register);
                  end
              { multiplication by 3, 5 or 9 becomes lea (reg,reg*2/4/8) }
              else if (op=A_IMUL) and
                   (right.location.loc=LOC_CONSTANT) and
                   (right.location.value>1) and (ispowerof2(int64(right.location.value)-1,power)) and
                   (power in [1..3]) and
                   not overflowcheck then
                  begin
                    reference_reset_base(href,left.location.register,0,ctempposinvalid,0,[]);
                    href.index:=left.location.register;
                    href.scalefactor:=int64(right.location.value)-1;
                    left.location.register:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
                    current_asmdata.CurrAsmList.concat(taicpu.op_ref_reg(A_LEA,TCgSize2OpSize[opsize],href,left.location.register));
                  end
              else
                begin
                  if extra_not then
                    begin
                      { right is not in a register: load it into a scratch
                        register, invert it there and AND it into left }
                      r:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
                      hlcg.a_load_loc_reg(current_asmdata.CurrAsmList,right.resultdef,cgsize_orddef(opsize),right.location,r);
                      emit_reg(A_NOT,TCGSize2Opsize[opsize],r);
                      if comparison or (mboverflow and overflowcheck) then
                        cg.a_reg_alloc(current_asmdata.CurrAsmList, NR_DEFAULTFLAGS);
                      emit_reg_reg(A_AND,TCGSize2Opsize[opsize],r,left.location.register);
                    end
                  else
                    emit_op_right_left(op,opsize,comparison or (mboverflow and overflowcheck));
                end;
            end;
        end;
      { only in case of overflow operations }
      { produce overflow code }
      { we must put it here directly, because sign of operation }
      { is in unsigned VAR!! }
      if mboverflow then
        begin
          if overflowcheck then
            begin
              { jump over the call to the overflow handler when no overflow occurred }
              current_asmdata.getjumplabel(hl4);
              if unsigned then
                cg.a_jmp_flags(current_asmdata.CurrAsmList,F_AE,hl4)
              else
                cg.a_jmp_flags(current_asmdata.CurrAsmList,F_NO,hl4);
              { the flags are only released here when they are not the result }
              if not comparison then
                cg.a_reg_dealloc(current_asmdata.CurrAsmList, NR_DEFAULTFLAGS);
              cg.a_call_name(current_asmdata.CurrAsmList,'FPC_OVERFLOW',false);
              cg.a_label(current_asmdata.CurrAsmList,hl4);
            end;
        end;
    end;
  { Makes sure the left operand is in a register and that both operands have
    the size of the operation.

    opdef  - type the operands are converted to when they are forced into a register
    opsize - size of the operation; operands whose size differs from it
             (signedness ignored) are forced into a register of type opdef
    noswap - when true, left and right must not be exchanged

    When left is not a LOC_REGISTER and swapping is allowed, a right operand
    living in a (constant) register is exchanged with left and nf_swapped is
    toggled. }
  procedure tx86addnode.left_must_be_reg(opdef: tdef; opsize:TCGSize;noswap:boolean);
    var
      is_comparison,
      right_in_reg,
      right_in_creg : boolean;
    begin
      { maybe we can reuse a constant register when the
        operation is a comparison that doesn't change the
        value of the register }
      is_comparison:=nodetype in [ltn,lten,gtn,gten,equaln,unequaln];
      if left.location.loc<>LOC_REGISTER then
        begin
          right_in_reg:=right.location.loc=LOC_REGISTER;
          right_in_creg:=right.location.loc=LOC_CREGISTER;
          if (not noswap) and (right_in_reg or right_in_creg) then
            begin
              { right already lives in a register: use it as the left operand }
              location_swap(left.location,right.location);
              toggleflag(nf_swapped);
              if right_in_creg then
                begin
                  hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,opdef,is_comparison);
                  location:=left.location;
                end;
            end
          else
            hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,opdef,is_comparison);
        end;
      { adjust operands whose size does not match the operation size }
      if right.location.loc<>LOC_CONSTANT then
        if tcgsize2unsigned[right.location.size]<>tcgsize2unsigned[opsize] then
          hlcg.location_force_reg(current_asmdata.CurrAsmList,right.location,right.resultdef,opdef,true);
      if left.location.loc<>LOC_CONSTANT then
        if tcgsize2unsigned[left.location.size]<>tcgsize2unsigned[opsize] then
          hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,opdef,false);
    end;
  { Loads both operands into FPU registers. Right is loaded first when it is
    not in an FPU register yet, then left. If left was already in an FPU
    register, nothing is loaded for it and nf_swapped is toggled instead,
    because the operands then sit on the FPU stack in the opposite order. }
  procedure tx86addnode.force_left_and_right_fpureg;
    begin
      if right.location.loc<>LOC_FPUREGISTER then
        hlcg.location_force_fpureg(current_asmdata.CurrAsmList,right.location,right.resultdef,false);
      if left.location.loc=LOC_FPUREGISTER then
        { left was on the stack already => the operands are in the wrong order }
        toggleflag(nf_swapped)
      else
        hlcg.location_force_fpureg(current_asmdata.CurrAsmList,left.location,left.resultdef,false);
    end;
  { Prepares both operands for an x87 instruction.

    If one side is an OS_F32/OS_F64-sized LOC_REFERENCE/LOC_CREFERENCE it can be
    used as a memory operand and is returned in 'refnode'; everything else is
    loaded onto the FPU stack and 'refnode' is nil. nf_swapped is toggled
    whenever the resulting operand order is reversed. }
  procedure tx86addnode.prepare_x87_locations(out refnode: tnode);
    var
      fpuregs_in_use : longint;
    begin
      refnode:=nil;
      { later on, no mm registers are allowed, so transfer everything to memory here;
        below it is loaded into an fpu register if needed }
      if left.location.loc in [LOC_MMREGISTER,LOC_CMMREGISTER] then
        hlcg.location_force_mem(current_asmdata.CurrAsmList,left.location,left.resultdef);
      if right.location.loc in [LOC_MMREGISTER,LOC_CMMREGISTER] then
        hlcg.location_force_mem(current_asmdata.CurrAsmList,right.location,right.resultdef);
      { number of operands that already are on the fpu stack }
      fpuregs_in_use:=ord(left.location.loc=LOC_FPUREGISTER)+ord(right.location.loc=LOC_FPUREGISTER);
      case fpuregs_in_use of
        0:
          begin
            { right always goes onto the stack, left may stay in memory }
            hlcg.location_force_fpureg(current_asmdata.CurrAsmList,right.location,right.resultdef,false);
            if not(left.location.loc in [LOC_REFERENCE,LOC_CREFERENCE]) then
              InternalError(2013090803);
            if not(left.location.size in [OS_F32,OS_F64]) then
              hlcg.location_force_fpureg(current_asmdata.CurrAsmList,left.location,left.resultdef,false)
            else
              begin
                refnode:=left;
                toggleflag(nf_swapped);
              end;
          end;
        1:
          begin
            { the side that is not on the stack is the memory operand candidate }
            if left.location.loc=LOC_FPUREGISTER then
              refnode:=right
            else
              refnode:=left;
            if not(refnode.location.loc in [LOC_REFERENCE,LOC_CREFERENCE]) then
              InternalError(2013090801);
            if refnode.location.size in [OS_F32,OS_F64] then
              begin
                if refnode=left then
                  toggleflag(nf_swapped);
              end
            else
              begin
                { not usable as memory operand: load it as well }
                hlcg.location_force_fpureg(current_asmdata.CurrAsmList,refnode.location,refnode.resultdef,false);
                if refnode=right then
                  toggleflag(nf_swapped);
                refnode:=nil;
              end;
          end;
        2:
          { fpu operands are always in the wrong order on the stack }
          toggleflag(nf_swapped);
        else
          InternalError(2013090802);
      end;
    end;
  { Emits "op right,left" where left.location is a register and right.location
    may be a register, a memory reference or a constant.

    op         - instruction to emit
    opsize     - operand size of the instruction
    AllocFlags - when true, the flags register is marked as allocated right
                 before the instruction is emitted

    Subset locations of right are loaded into a register first. Any other
    location kind of right raises internal error 200203232. }
  procedure tx86addnode.emit_op_right_left(op:TAsmOp;opsize:TCgsize;AllocFlags:boolean);
{$ifdef x86_64}
    var
      tmpreg : tregister;
{$endif x86_64}

    { marks the flags as allocated if the caller requested it }
    procedure alloc_flags_if_requested;
      begin
        if AllocFlags then
          cg.a_reg_alloc(current_asmdata.CurrAsmList, NR_DEFAULTFLAGS);
      end;

    begin
      if right.location.loc in [LOC_SUBSETREG,LOC_CSUBSETREG,LOC_SUBSETREF,LOC_CSUBSETREF] then
        hlcg.location_force_reg(current_asmdata.CurrAsmList,right.location,right.resultdef,right.resultdef,true);
      { left must be a register }
      case right.location.loc of
        LOC_REGISTER,
        LOC_CREGISTER :
          begin
            alloc_flags_if_requested;
            current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(op,TCGSize2Opsize[opsize],right.location.register,left.location.register));
          end;
        LOC_REFERENCE,
        LOC_CREFERENCE :
          begin
            { the reference is simplified before the flags get allocated }
            tcgx86(cg).make_simple_ref(current_asmdata.CurrAsmList,right.location.reference);
            alloc_flags_if_requested;
            current_asmdata.CurrAsmList.concat(taicpu.op_ref_reg(op,TCGSize2Opsize[opsize],right.location.reference,left.location.register));
          end;
        LOC_CONSTANT :
          begin
{$ifdef x86_64}
            { x86_64 only supports signed 32 bits constants directly }
            if (opsize in [OS_S64,OS_64]) and
               not((right.location.value>=low(longint)) and (right.location.value<=high(longint))) then
              begin
                { larger constants go through a temporary register }
                tmpreg:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
                cg.a_load_const_reg(current_asmdata.CurrAsmList,opsize,right.location.value,tmpreg);
                alloc_flags_if_requested;
                current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(op,TCGSize2Opsize[opsize],tmpreg,left.location.register));
              end
            else
{$endif x86_64}
              begin
                alloc_flags_if_requested;
                current_asmdata.CurrAsmList.concat(taicpu.op_const_reg(op,TCGSize2Opsize[opsize],right.location.value,left.location.register));
              end;
          end;
        else
          internalerror(200203232);
      end;
    end;
  { Returns the CPU flags condition that corresponds to the comparison stored
    in nodetype.

    unsigned - selects the unsigned conditions (F_A/F_AE/F_B/F_BE) instead of
               the signed ones (F_G/F_GE/F_L/F_LE)

    When nf_swapped is set the operands were exchanged, so the mirrored
    condition is returned. Equality and inequality are independent of both.
    Node types other than the six comparisons raise an internal error. }
  function tx86addnode.getresflags(unsigned : boolean) : tresflags;
    var
      swapped : boolean;
    begin
      swapped:=nf_swapped in flags;
      if nodetype=equaln then
        result:=F_E
      else if nodetype=unequaln then
        result:=F_NE
      else if unsigned and swapped then
        case nodetype of
          ltn : result:=F_A;
          lten : result:=F_AE;
          gtn : result:=F_B;
          gten : result:=F_BE;
          else
            internalerror(2013120107);
        end
      else if unsigned then
        case nodetype of
          ltn : result:=F_B;
          lten : result:=F_BE;
          gtn : result:=F_A;
          gten : result:=F_AE;
          else
            internalerror(2013120108);
        end
      else if swapped then
        case nodetype of
          ltn : result:=F_G;
          lten : result:=F_GE;
          gtn : result:=F_L;
          gten : result:=F_LE;
          else
            internalerror(2013120105);
        end
      else
        case nodetype of
          ltn : result:=F_L;
          lten : result:=F_LE;
          gtn : result:=F_G;
          gten : result:=F_GE;
          else
            internalerror(2013120106);
        end;
    end;
  { Returns the floating point flags condition (F_Fxx) for the comparison
    stored in nodetype. With nf_swapped set the mirrored condition is
    returned for the ordering comparisons; equality and inequality do not
    depend on the operand order. Any other node type raises an internal error. }
  function tx86addnode.getfpuresflags : tresflags;
    var
      swapped : boolean;
    begin
      swapped:=nf_swapped in flags;
      case nodetype of
        equaln :
          result:=F_FE;
        unequaln :
          result:=F_FNE;
        ltn :
          if swapped then
            result:=F_FA
          else
            result:=F_FB;
        lten :
          if swapped then
            result:=F_FAE
          else
            result:=F_FBE;
        gtn :
          if swapped then
            result:=F_FB
          else
            result:=F_FA;
        gten :
          if swapped then
            result:=F_FBE
          else
            result:=F_FAE;
        else
          begin
            if swapped then
              internalerror(2014031402)
            else
              internalerror(2014031403);
          end;
      end;
    end;
  458. {*****************************************************************************
  459. AddSmallSet
  460. *****************************************************************************}
  461. {$ifndef i8086}
  { Generates code for a binary operation on small sets: addn (union, or
    adding a single element with BTS), symdifn/xorn (XOR), muln/andn (AND),
    subn (difference, i.e. AND with one inverted operand) and orn (OR).
    The operation size is taken from the size of the result type. The result
    is left in 'location'. Unsupported node types raise internal error
    2003042215. }
  462. procedure tx86addnode.second_addsmallset;
  463. var
  464. setbase : aint;
  465. opdef : tdef;
  466. opsize : TCGSize;
  467. op : TAsmOp;
  468. extra_not,
  469. noswap : boolean;
  470. all_member_optimization:boolean;
  471. begin
  { generate code for both operands first }
  472. pass_left_right;
  473. noswap:=false;
  474. extra_not:=false;
  475. all_member_optimization:=false;
  476. opdef:=resultdef;
  477. opsize:=int_cgsize(opdef.size);
  { the set base is taken from whichever operand is the set; the other
    operand may be a set element }
  478. if (left.resultdef.typ=setdef) then
  479. setbase:=tsetdef(left.resultdef).setbase
  480. else
  481. setbase:=tsetdef(right.resultdef).setbase;
  { select the instruction and note which special handling is required }
  482. case nodetype of
  483. addn :
  484. begin
  485. { adding elements is not commutative }
  486. if (nf_swapped in flags) and (left.nodetype=setelementn) then
  487. swapleftright;
  488. { are we adding set elements ? }
  489. if right.nodetype=setelementn then
  490. begin
  491. { no range support for smallsets! }
  492. if assigned(tsetelementnode(right).right) then
  493. internalerror(43244);
  494. { btsb isn't supported }
  495. if opsize=OS_8 then
  496. begin
  497. opsize:=OS_32;
  498. opdef:=u32inttype;
  499. end;
  500. { bts requires both elements to be registers }
  501. hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,opdef,false);
  502. hlcg.location_force_reg(current_asmdata.CurrAsmList,right.location,right.resultdef,opdef,true);
  503. register_maybe_adjust_setbase(current_asmdata.CurrAsmList,opdef,right.location,setbase);
  504. op:=A_BTS;
  { the operands of bts (set and bit index) must not be exchanged }
  505. noswap:=true;
  506. end
  507. else
  508. op:=A_OR;
  509. end;
  510. symdifn :
  511. op:=A_XOR;
  512. muln :
  513. op:=A_AND;
  514. subn :
  515. begin
  { set difference is implemented as "minuend and not subtrahend" }
  516. op:=A_AND;
  { minuend is the constant -1 (all bits set): the result is just the
    inverted subtrahend }
  517. if (not(nf_swapped in flags) and (left.location.loc=LOC_CONSTANT) and (left.location.value=-1)) or
  518. ((nf_swapped in flags) and (right.location.loc=LOC_CONSTANT) and (right.location.value=-1)) then
  519. all_member_optimization:=true;
  { a constant subtrahend is inverted at compile time, otherwise a NOT has
    to be emitted (extra_not) }
  520. if (not(nf_swapped in flags)) and
  521. (right.location.loc=LOC_CONSTANT) then
  522. right.location.value := not(right.location.value)
  523. else if (nf_swapped in flags) and
  524. (left.location.loc=LOC_CONSTANT) then
  525. left.location.value := not(left.location.value)
  526. else
  527. extra_not:=true;
  528. end;
  529. xorn :
  530. op:=A_XOR;
  531. orn :
  532. op:=A_OR;
  533. andn :
  534. op:=A_AND;
  535. else
  536. internalerror(2003042215);
  537. end;
  538. if all_member_optimization then
  539. begin
  540. {A set expression [0..31]-x can be implemented with a simple NOT.}
  541. if nf_swapped in flags then
  542. begin
  543. { newly swapped also set swapped flag }
  544. location_swap(left.location,right.location);
  545. toggleflag(nf_swapped);
  546. end;
  { the subtrahend is in right now: invert it in a register, which then holds the result }
  547. hlcg.location_force_reg(current_asmdata.currAsmList,right.location,right.resultdef,opdef,false);
  548. emit_reg(A_NOT,TCGSize2Opsize[opsize],right.location.register);
  549. location:=right.location;
  550. end
  551. else
  552. begin
  553. { can we use the BMI1 instruction andn? }
  554. if (op=A_AND) and extra_not and (CPUX86_HAS_BMI1 in cpu_capabilities[current_settings.cputype]) and
  555. (resultdef.size in [4{$ifdef x86_64},8{$endif x86_64}]) then
  556. begin
  { andn writes to a separate destination, so a fresh result register is allocated }
  557. location_reset(location,LOC_REGISTER,left.location.size);
  558. location.register:=cg.getintregister(current_asmdata.currAsmList,left.location.size);
  559. if nf_swapped in flags then
  560. begin
  561. location_swap(left.location,right.location);
  562. toggleflag(nf_swapped);
  563. end;
  { right must be in a register, left may be a register or a memory reference }
  564. hlcg.location_force_reg(current_asmdata.currAsmList,right.location,right.resultdef,opdef,true);
  565. if not(left.location.loc in [LOC_CREGISTER,LOC_REGISTER,LOC_CREFERENCE,LOC_REFERENCE]) then
  566. hlcg.location_force_reg(current_asmdata.currAsmList,left.location,left.resultdef,opdef,true);
  567. case left.location.loc of
  568. LOC_CREGISTER,LOC_REGISTER:
  569. emit_reg_reg_reg(A_ANDN,TCGSize2Opsize[opsize],left.location.register,right.location.register,location.register);
  570. LOC_CREFERENCE,LOC_REFERENCE:
  571. emit_ref_reg_reg(A_ANDN,TCGSize2Opsize[opsize],left.location.reference,right.location.register,location.register);
  572. else
  573. Internalerror(2018040201);
  574. end;
  575. end
  576. else
  577. begin
  578. { left must be a register }
  579. left_must_be_reg(opdef,opsize,noswap);
  { generic path: unsigned operation, no overflow checking }
  580. emit_generic_code(op,opsize,true,extra_not,false);
  581. location_freetemp(current_asmdata.CurrAsmList,right.location);
  582. { left is always a register and contains the result }
  583. location:=left.location;
  584. end;
  585. end;
  586. { fix the changed opsize we did above because of the missing btsb }
  587. if opsize<>int_cgsize(resultdef.size) then
  588. hlcg.location_force_reg(current_asmdata.CurrAsmList,location,opdef,cgsize_orddef(int_cgsize(resultdef.size)),false);
  589. end;
  590. {$endif not i8086}
  { Generates code for comparing two small sets; the result is returned in
    the CPU flags (LOC_FLAGS).

    equaln/unequaln are a plain CMP of both sets. lten/gten (subset and
    superset tests) AND the two sets together and compare the result with the
    remaining operand; for that the node type is changed to equaln. Other
    node types raise internal error 2003042204. }
  procedure tx86addnode.second_cmpsmallset;
    var
      cmpdef : tdef;
      cmpsize : TCGSize;
      cmpinstr : TAsmOp;
      swapped : boolean;
    begin
      pass_left_right;
      cmpdef:=left.resultdef;
      cmpsize:=int_cgsize(cmpdef.size);
      if nodetype in [equaln,unequaln] then
        cmpinstr:=A_CMP
      else if nodetype in [lten,gten] then
        begin
          swapped:=nf_swapped in flags;
          { exchange the operands for an unswapped lten or a swapped gten }
          if swapped=(nodetype=gten) then
            swapleftright;
          hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,cmpdef,false);
          emit_op_right_left(A_AND,cmpsize,False);
          cmpinstr:=A_CMP;
          { warning: ugly hack, we need a JE so change the node to equaln }
          nodetype:=equaln;
        end
      else
        internalerror(2003042204);
      { left must be a register }
      left_must_be_reg(cmpdef,cmpsize,false);
      emit_generic_code(cmpinstr,cmpsize,true,false,false);
      { the operands are not needed anymore, only the flags are }
      location_freetemp(current_asmdata.CurrAsmList,right.location);
      location_freetemp(current_asmdata.CurrAsmList,left.location);
      location_reset(location,LOC_FLAGS,OS_NO);
      location.resflags:=getresflags(true);
    end;
  626. {*****************************************************************************
  627. AddMMX
  628. *****************************************************************************}
  629. {$ifdef SUPPORT_MMX}
  { Generates code for a binary operation on MMX vectors.

    The instruction is selected from the node type (addn, subn, muln, xorn,
    orn, andn), the MMX element type of the left operand and, for addition
    and subtraction, whether saturation is enabled (cs_mmx_saturation).
    Combinations for which no instruction exists raise internal error
    201408201, unsupported node types internal error 2003042214.

    The left operand is brought into an MMX register; the right operand may
    be an MMX register, a constant MMX register or a memory reference. The
    result register is stored in location.register. }
  procedure tx86addnode.second_opmmx;
    var
      op : TAsmOp;
      cmpop : boolean;
      mmxbase : tmmxtype;
      hreg,
      hregister : tregister;
    begin
      pass_left_right;
      cmpop:=false;
      { A_NOP marks "no instruction found yet" }
      op:=A_NOP;
      mmxbase:=mmx_type(left.resultdef);
      location_reset(location,LOC_MMXREGISTER,def_cgsize(resultdef));
      case nodetype of
        addn :
          begin
            if (cs_mmx_saturation in current_settings.localswitches) then
              begin
                case mmxbase of
                  mmxs8bit:
                    op:=A_PADDSB;
                  mmxu8bit:
                    op:=A_PADDUSB;
                  mmxs16bit,mmxfixed16:
                    op:=A_PADDSW;
                  mmxu16bit:
                    op:=A_PADDUSW;
                  else
                    ;
                end;
              end
            else
              begin
                case mmxbase of
                  mmxs8bit,mmxu8bit:
                    op:=A_PADDB;
                  mmxs16bit,mmxu16bit,mmxfixed16:
                    op:=A_PADDW;
                  mmxs32bit,mmxu32bit:
                    op:=A_PADDD;
                  else
                    ;
                end;
              end;
          end;
        muln :
          begin
            case mmxbase of
              mmxs16bit,mmxu16bit:
                op:=A_PMULLW;
              mmxfixed16:
                op:=A_PMULHW;
              else
                ;
            end;
          end;
        subn :
          begin
            if (cs_mmx_saturation in current_settings.localswitches) then
              begin
                case mmxbase of
                  mmxs8bit:
                    op:=A_PSUBSB;
                  mmxu8bit:
                    op:=A_PSUBUSB;
                  { 16 bit elements need the word variant, like A_PADDSW
                    for the addition above }
                  mmxs16bit,mmxfixed16:
                    op:=A_PSUBSW;
                  mmxu16bit:
                    op:=A_PSUBUSW;
                  else
                    ;
                end;
              end
            else
              begin
                case mmxbase of
                  mmxs8bit,mmxu8bit:
                    op:=A_PSUBB;
                  mmxs16bit,mmxu16bit,mmxfixed16:
                    op:=A_PSUBW;
                  mmxs32bit,mmxu32bit:
                    op:=A_PSUBD;
                  else
                    ;
                end;
              end;
          end;
        xorn:
          op:=A_PXOR;
        orn:
          op:=A_POR;
        andn:
          op:=A_PAND;
        else
          internalerror(2003042214);
      end;
      if op = A_NOP then
        internalerror(201408201);
      { left and right no register? }
      { then one must be demanded }
      if (left.location.loc<>LOC_MMXREGISTER) then
        begin
          if (right.location.loc=LOC_MMXREGISTER) then
            begin
              location_swap(left.location,right.location);
              toggleflag(nf_swapped);
            end
          else
            begin
              { register variable ? }
              if (left.location.loc=LOC_CMMXREGISTER) then
                begin
                  hregister:=tcgx86(cg).getmmxregister(current_asmdata.CurrAsmList);
                  emit_reg_reg(A_MOVQ,S_NO,left.location.register,hregister);
                end
              else
                begin
                  if not(left.location.loc in [LOC_REFERENCE,LOC_CREFERENCE]) then
                    internalerror(200203245);
                  hregister:=tcgx86(cg).getmmxregister(current_asmdata.CurrAsmList);
                  tcgx86(cg).make_simple_ref(current_asmdata.CurrAsmList,left.location.reference);
                  emit_ref_reg(A_MOVQ,S_NO,left.location.reference,hregister);
                end;
              location_reset(left.location,LOC_MMXREGISTER,OS_NO);
              left.location.register:=hregister;
            end;
        end;
      { at this point, left.location.loc should be LOC_MMXREGISTER }
      if right.location.loc<>LOC_MMXREGISTER then
        begin
          if (nodetype=subn) and (nf_swapped in flags) then
            begin
              { operands are exchanged and subtraction is not commutative:
                load right into a new register and subtract left from it }
              hreg:=tcgx86(cg).getmmxregister(current_asmdata.CurrAsmList);
              if right.location.loc=LOC_CMMXREGISTER then
                begin
                  emit_reg_reg(A_MOVQ,S_NO,right.location.register,hreg);
                  emit_reg_reg(op,S_NO,left.location.register,hreg);
                end
              else
                begin
                  { right is loaded from memory below, so it is right (not
                    left, which is a LOC_MMXREGISTER here) that has to be a
                    reference }
                  if not(right.location.loc in [LOC_REFERENCE,LOC_CREFERENCE]) then
                    internalerror(2002032412);
                  tcgx86(cg).make_simple_ref(current_asmdata.CurrAsmList,right.location.reference);
                  emit_ref_reg(A_MOVQ,S_NO,right.location.reference,hreg);
                  emit_reg_reg(op,S_NO,left.location.register,hreg);
                end;
              location.register:=hreg;
            end
          else
            begin
              if (right.location.loc=LOC_CMMXREGISTER) then
                emit_reg_reg(op,S_NO,right.location.register,left.location.register)
              else
                begin
                  if not(right.location.loc in [LOC_REFERENCE,LOC_CREFERENCE]) then
                    internalerror(200203246);
                  tcgx86(cg).make_simple_ref(current_asmdata.CurrAsmList,right.location.reference);
                  emit_ref_reg(op,S_NO,right.location.reference,left.location.register);
                end;
              location.register:=left.location.register;
            end;
        end
      else
        begin
          { right.location=LOC_MMXREGISTER }
          if (nodetype=subn) and (nf_swapped in flags) then
            begin
              { compute into right's register and make it the new left }
              emit_reg_reg(op,S_NO,left.location.register,right.location.register);
              location_swap(left.location,right.location);
              toggleflag(nf_swapped);
            end
          else
            begin
              emit_reg_reg(op,S_NO,right.location.register,left.location.register);
            end;
          location.register:=left.location.register;
        end;
      location_freetemp(current_asmdata.CurrAsmList,right.location);
      { cmpop is never set to true in this routine, so left is not freed here }
      if cmpop then
        location_freetemp(current_asmdata.CurrAsmList,left.location);
    end;
  811. {$endif SUPPORT_MMX}
  812. {*****************************************************************************
  813. AddFloat
  814. *****************************************************************************}
  815. procedure tx86addnode.second_addfloatsse;
  816. var
  817. op : topcg;
  818. sqr_sum : boolean;
  819. tmp : tnode;
  820. begin
  821. sqr_sum:=false;
  822. if (current_settings.fputype>=fpu_sse3) and
  823. use_vectorfpu(resultdef) and
  824. (nodetype in [addn,subn]) and
  825. (left.nodetype=inlinen) and (tinlinenode(left).inlinenumber=in_sqr_real) and
  826. (right.nodetype=inlinen) and (tinlinenode(right).inlinenumber=in_sqr_real) then
  827. begin
  828. sqr_sum:=true;
  829. tmp:=tinlinenode(left).left;
  830. tinlinenode(left).left:=nil;
  831. left.free;
  832. left:=tmp;
  833. tmp:=tinlinenode(right).left;
  834. tinlinenode(right).left:=nil;
  835. right.free;
  836. right:=tmp;
  837. end;
  838. pass_left_right;
  839. { fpu operands are always in reversed order on the stack }
  840. if (left.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER]) and (right.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER]) then
  841. toggleflag(nf_swapped);
  842. if (nf_swapped in flags) then
  843. { can't use swapleftright if both are on the fpu stack, since then }
  844. { both are "R_ST" -> nothing would change -> manually switch }
  845. if (left.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER]) and
  846. (right.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER]) then
  847. emit_none(A_FXCH,S_NO)
  848. else
  849. swapleftright;
  850. case nodetype of
  851. addn :
  852. op:=OP_ADD;
  853. muln :
  854. op:=OP_MUL;
  855. subn :
  856. op:=OP_SUB;
  857. slashn :
  858. op:=OP_DIV;
  859. else
  860. internalerror(200312231);
  861. end;
  862. location_reset(location,LOC_MMREGISTER,def_cgsize(resultdef));
  863. if sqr_sum then
  864. begin
  865. if nf_swapped in flags then
  866. swapleftright;
  867. hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,false);
  868. hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,right.location,right.resultdef,true);
  869. location:=left.location;
  870. if is_double(resultdef) then
  871. begin
  872. current_asmdata.CurrAsmList.concat(taicpu.op_const_reg_reg(A_SHUFPD,S_NO,%00,right.location.register,location.register));
  873. current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_MULPD,S_NO,location.register,location.register));
  874. case nodetype of
  875. addn:
  876. current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_HADDPD,S_NO,location.register,location.register));
  877. subn:
  878. current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_HSUBPD,S_NO,location.register,location.register));
  879. else
  880. internalerror(201108162);
  881. end;
  882. end
  883. else
  884. begin
  885. current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_UNPCKLPS,S_NO,right.location.register,location.register));
  886. { ensure that bits 64..127 contain valid values }
  887. current_asmdata.CurrAsmList.concat(taicpu.op_const_reg_reg(A_SHUFPD,S_NO,%00,location.register,location.register));
  888. { the data is now in bits 0..32 and 64..95 }
  889. current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_MULPS,S_NO,location.register,location.register));
  890. case nodetype of
  891. addn:
  892. begin
  893. current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_HADDPS,S_NO,location.register,location.register));
  894. end;
  895. subn:
  896. begin
  897. current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_HSUBPS,S_NO,location.register,location.register));
  898. end;
  899. else
  900. internalerror(201108163);
  901. end;
  902. end
  903. end
  904. { we can use only right as left operand if the operation is commutative }
  905. else if (right.location.loc=LOC_MMREGISTER) and (op in [OP_ADD,OP_MUL]) then
  906. begin
  907. location.register:=cg.getmmregister(current_asmdata.CurrAsmList,location.size);
  908. cg.a_loadmm_reg_reg(current_asmdata.CurrAsmList,right.location.size,location.size,right.location.register,location.register,mms_movescalar);
  909. { force floating point reg. location to be written to memory,
  910. we don't force it to mm register because writing to memory
  911. allows probably shorter code because there is no direct fpu->mm register
  912. copy instruction
  913. }
  914. if left.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER] then
  915. hlcg.location_force_mem(current_asmdata.CurrAsmList,left.location,left.resultdef);
  916. cg.a_opmm_loc_reg(current_asmdata.CurrAsmList,op,location.size,left.location,location.register,mms_movescalar);
  917. if left.location.loc=LOC_REFERENCE then
  918. tg.ungetiftemp(current_asmdata.CurrAsmList,left.location.reference);
  919. end
  920. else
  921. begin
  922. if nf_swapped in flags then
  923. swapleftright;
  924. { force floating point reg. location to be written to memory,
  925. we don't force it to mm register because writing to memory
  926. allows probably shorter code because there is no direct fpu->mm register
  927. copy instruction
  928. }
  929. if left.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER] then
  930. hlcg.location_force_mem(current_asmdata.CurrAsmList,left.location,left.resultdef);
  931. location.register:=cg.getmmregister(current_asmdata.CurrAsmList,location.size);
  932. cg.a_loadmm_loc_reg(current_asmdata.CurrAsmList,location.size,left.location,location.register,mms_movescalar);
  933. if left.location.loc=LOC_REFERENCE then
  934. tg.ungetiftemp(current_asmdata.CurrAsmList,left.location.reference);
  935. { force floating point reg. location to be written to memory,
  936. we don't force it to mm register because writing to memory
  937. allows probably shorter code because there is no direct fpu->mm register
  938. copy instruction
  939. }
  940. if right.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER] then
  941. hlcg.location_force_mem(current_asmdata.CurrAsmList,right.location,right.resultdef);
  942. cg.a_opmm_loc_reg(current_asmdata.CurrAsmList,op,location.size,right.location,location.register,mms_movescalar);
  943. if right.location.loc=LOC_REFERENCE then
  944. tg.ungetiftemp(current_asmdata.CurrAsmList,right.location.reference);
  945. end;
  946. end;
  { Generates code for a scalar floating point add/sub/mul/div whose result is
    left in an mm register (location is reset to LOC_MMREGISTER), using the
    three operand cg.a_opmm_*_reg_reg helpers.
    Note: sqr_sum is only ever set to true inside the "ifdef dummy" section, so
    unless the symbol "dummy" is defined the "if sqr_sum" branch below is never
    taken. }
  947. procedure tx86addnode.second_addfloatavx;
  948. var
  949. op : topcg;
  950. sqr_sum : boolean;
  951. {$ifdef dummy}
  952. tmp : tnode;
  953. {$endif dummy}
  954. begin
  955. sqr_sum:=false;
  956. {$ifdef dummy}
  957. if (current_settings.fputype>=fpu_sse3) and
  958. use_vectorfpu(resultdef) and
  959. (nodetype in [addn,subn]) and
  960. (left.nodetype=inlinen) and (tinlinenode(left).inlinenumber=in_sqr_real) and
  961. (right.nodetype=inlinen) and (tinlinenode(right).inlinenumber=in_sqr_real) then
  962. begin
  { sqr(a)+sqr(b) or sqr(a)-sqr(b): replace both sqr() nodes by their arguments }
  963. sqr_sum:=true;
  964. tmp:=tinlinenode(left).left;
  965. tinlinenode(left).left:=nil;
  966. left.free;
  967. left:=tmp;
  968. tmp:=tinlinenode(right).left;
  969. tinlinenode(right).left:=nil;
  970. right.free;
  971. right:=tmp;
  972. end;
  973. {$endif dummy}
  974. pass_left_right;
  975. { fpu operands are always in reversed order on the stack }
  976. if (left.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER]) and (right.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER]) then
  977. toggleflag(nf_swapped);
  978. if (nf_swapped in flags) then
  979. { can't use swapleftright if both are on the fpu stack, since then }
  980. { both are "R_ST" -> nothing would change -> manually switch }
  981. if (left.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER]) and
  982. (right.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER]) then
  983. emit_none(A_FXCH,S_NO)
  984. else
  985. swapleftright;
  { map the node type onto the generic code generator operation }
  986. case nodetype of
  987. addn :
  988. op:=OP_ADD;
  989. muln :
  990. op:=OP_MUL;
  991. subn :
  992. op:=OP_SUB;
  993. slashn :
  994. op:=OP_DIV;
  995. else
  996. internalerror(2003122303);
  997. end;
  998. location_reset(location,LOC_MMREGISTER,def_cgsize(resultdef));
  999. if sqr_sum then
  1000. begin
  1001. if nf_swapped in flags then
  1002. swapleftright;
  1003. hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,false);
  1004. hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,right.location,right.resultdef,true);
  { the result is built in the register that holds left }
  1005. location:=left.location;
  1006. if is_double(resultdef) then
  1007. begin
  1008. current_asmdata.CurrAsmList.concat(taicpu.op_const_reg_reg(A_SHUFPD,S_NO,%00,right.location.register,location.register));
  1009. current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_MULPD,S_NO,location.register,location.register));
  1010. case nodetype of
  1011. addn:
  1012. current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_HADDPD,S_NO,location.register,location.register));
  1013. subn:
  1014. current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_HSUBPD,S_NO,location.register,location.register));
  1015. else
  1016. internalerror(2011081601);
  1017. end;
  1018. end
  1019. else
  1020. begin
  1021. current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_UNPCKLPS,S_NO,right.location.register,location.register));
  1022. { ensure that bits 64..127 contain valid values }
  1023. current_asmdata.CurrAsmList.concat(taicpu.op_const_reg_reg(A_SHUFPD,S_NO,%00,location.register,location.register));
  1024. { the data is now in bits 0..32 and 64..95 }
  1025. current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_MULPS,S_NO,location.register,location.register));
  1026. case nodetype of
  1027. addn:
  1028. begin
  1029. current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_HADDPS,S_NO,location.register,location.register));
  1030. end;
  1031. subn:
  1032. begin
  1033. current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_HSUBPS,S_NO,location.register,location.register));
  1034. end;
  1035. else
  1036. internalerror(2011081604);
  1037. end;
  1038. end
  1039. end
  1040. { left*2 ? }
  { multiplication by the constant 2 is emitted as an addition of the operand to itself }
  1041. else if (nodetype=muln) and is_constrealnode(right) and is_number_float(trealconstnode(right).value_real) and (trealconstnode(right).value_real=2) then
  1042. begin
  1043. location.register:=cg.getmmregister(current_asmdata.CurrAsmList,left.location.size);
  1044. hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,true);
  1045. cg.a_opmm_reg_reg_reg(current_asmdata.CurrAsmList,OP_ADD,location.size,
  1046. left.location.register,
  1047. left.location.register,
  1048. location.register,
  1049. mms_movescalar);
  1050. end
  1051. { right*2 ? }
  1052. else if (nodetype=muln) and is_constrealnode(left) and is_number_float(trealconstnode(left).value_real) and (trealconstnode(left).value_real=2) then
  1053. begin
  1054. location.register:=cg.getmmregister(current_asmdata.CurrAsmList,right.location.size);
  1055. hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,right.location,right.resultdef,true);
  1056. cg.a_opmm_reg_reg_reg(current_asmdata.CurrAsmList,OP_ADD,location.size,
  1057. right.location.register,
  1058. right.location.register,
  1059. location.register,
  1060. mms_movescalar);
  1061. end
  1062. { we can use only right as left operand if the operation is commutative }
  1063. else if (right.location.loc in [LOC_MMREGISTER,LOC_CMMREGISTER]) and (op in [OP_ADD,OP_MUL]) then
  1064. begin
  1065. location.register:=cg.getmmregister(current_asmdata.CurrAsmList,left.location.size);
  1066. { force floating point reg. location to be written to memory,
  1067. we don't force it to mm register because writing to memory
  1068. allows probably shorter code because there is no direct fpu->mm register
  1069. copy instruction
  1070. }
  1071. if left.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER] then
  1072. hlcg.location_force_mem(current_asmdata.CurrAsmList,left.location,left.resultdef);
  1073. cg.a_opmm_loc_reg_reg(current_asmdata.CurrAsmList,op,location.size,
  1074. left.location,
  1075. right.location.register,
  1076. location.register,
  1077. mms_movescalar);
  1078. end
  1079. else
  1080. begin
  { general case: restore the original operand order, left goes into an mm
    register and right is used from wherever it is (fpu registers via memory) }
  1081. if (nf_swapped in flags) then
  1082. swapleftright;
  1083. hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,true);
  1084. location.register:=cg.getmmregister(current_asmdata.CurrAsmList,left.location.size);
  1085. { force floating point reg. location to be written to memory,
  1086. we don't force it to mm register because writing to memory
  1087. allows probably shorter code because there is no direct fpu->mm register
  1088. copy instruction
  1089. }
  1090. if right.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER] then
  1091. hlcg.location_force_mem(current_asmdata.CurrAsmList,right.location,right.resultdef);
  1092. cg.a_opmm_loc_reg_reg(current_asmdata.CurrAsmList,op,location.size,
  1093. right.location,
  1094. left.location.register,
  1095. location.register,
  1096. mms_movescalar);
  1097. end;
  1098. end;
  { First pass for x86 add nodes: keeps the operands of operations that do not
    use the vector fpu out of registers, runs the inherited first pass and
    afterwards corrects the expected location for vector fpu results. }
  function tx86addnode.pass_1: tnode;
    begin
      { x86 has no fpu register variables; an operand of an operation which
        does not use the vector fpu would end up in an mm register if it were
        regable, so mark both operands as not regable in that case }
      if (left.resultdef.typ=floatdef) or (right.resultdef.typ=floatdef) then
        if not(use_vectorfpu(left.resultdef) or
               use_vectorfpu(right.resultdef) or
               use_vectorfpu(resultdef)) then
          begin
            make_not_regable(left,[ra_addr_regable]);
            make_not_regable(right,[ra_addr_regable]);
          end;

      Result:=inherited pass_1;

      { fix up expectloc; this is harmless when Result is assigned, because
        that node gets its own pass_1 run which sets its expectloc }
      if use_vectorfpu(resultdef) then
        expectloc:=LOC_MMREGISTER;
    end;
  { x86 specific tree simplification.
    With optimisation level 1 enabled and overflow checking disabled, a
    comparison "(x mod d) = 0" or "(x mod d) <> 0" (operands in either order)
    with an ordinal constant d >= 3 that is not a power of two is replaced by a
    multiplication with a constant computed by calc_mul_inverse, followed by a
    lten/gtn comparison against a second constant (plus an addition for signed
    numerators and a rotate for even divisors).
    In every other case the result of the inherited simplify is returned. }
  1116. function tx86addnode.simplify(forinline : boolean) : tnode;
  1117. var
  1118. t, m, ThisNode, ConstNode: TNode;
  1119. lt,rt, ThisType: TNodeType;
  1120. ThisDef: TDef;
  1121. DoOptimisation: Boolean;
  1122. reciprocal, comparison, divisor: AWord;
  1123. shift, N: Byte;
  1124. begin
  1125. { Load into local variables to reduce the number of pointer dereferences }
  1126. rt:=right.nodetype;
  1127. lt:=left.nodetype;
  1128. DoOptimisation:=False;
  1129. {$if defined(cpu64bitalu) or defined(cpu32bitalu) or defined(cpu16bitalu)}
  1130. if (cs_opt_level1 in current_settings.optimizerswitches) and
  1131. { The presence of overflow checks tends to cause internal errors with the multiplication nodes }
  1132. not (cs_check_overflow in current_settings.localswitches) and
  1133. (nodetype in [equaln,unequaln]) then
  1134. begin
  { t becomes the mod node and m the constant zero it is compared with;
    both stay nil if the comparison does not have this shape }
  1135. if (lt=modn) and (rt=ordconstn) and (TOrdConstNode(right).value.uvalue=0) then
  1136. begin
  1137. t:=left;
  1138. m:=right;
  1139. end
  1140. else if (rt=modn) and (lt=ordconstn) and (TOrdConstNode(left).value.uvalue=0) then
  1141. begin
  1142. t:=right;
  1143. m:=left;
  1144. end
  1145. else
  1146. begin
  1147. t:=nil;
  1148. m:=nil;
  1149. end;
  1150. if Assigned(t) and (TModDivNode(t).right.nodetype=ordconstn) and
  1151. {$ifndef cpu64bitalu}
  1152. { Converting Int64 and QWord division doesn't work under i386 }
  1153. {$ifndef cpu32bitalu}
  1154. (TModDivNode(t).resultdef.size < 4) and
  1155. {$else cpu32bitalu}
  1156. (TModDivNode(t).resultdef.size < 8) and
  1157. {$endif cpu32bitalu}
  1158. {$endif cpu64bitalu}
  1159. (TOrdConstNode(TModDivNode(t).right).value>=3) then
  1160. begin
  1161. divisor:=TOrdConstNode(TModDivNode(t).right).value.uvalue;
  1162. { Exclude powers of 2, as there are more efficient ways to handle those }
  1163. if PopCnt(divisor)>1 then
  1164. begin
  1165. if is_signed(TModDivNode(t).left.resultdef) then
  1166. begin
  1167. { See pages 250-251 of Hacker's Delight, Second Edition
  1168. for an explanation and proof of the algorithm, but
  1169. essentially, we're doing the following:
  1170. - Convert the divisor d to the form k.2^b if it isn't
  1171. already odd (in which case, k = d and b = 0)
  1172. - Calculate r, the multiplicative inverse of k modulo 2^N
  1173. - Calculate c = floor(2^(N-1) / k) & -(2^b)
  1174. - Let q = ((n * r) + c) ror b (mod 2^N)
  1175. - Repurpose c to equal floor(2c / 2^b) = c shr (b - 1)
  1176. (some RISC platforms will benefit from doing this over
  1177. precalculating the modified constant. For x86,
  1178. it's better with the constant precalculated for
  1179. 32-bit and under, but for 64-bit, use SHR. )
  1180. - If q is below or equal to c, then (n mod d) = 0
  1181. }
  { Strip type conversions from the numerator for as long as the conversion
    source is a signed integer of at least 2 bytes whose range contains the
    divisor; the loop is left via Break (optimise) or Exit (constant numerator) }
  1182. while True do
  1183. begin
  1184. ThisNode:=TModDivNode(t).left;
  1185. case ThisNode.nodetype of
  1186. typeconvn:
  1187. begin
  1188. ThisDef:=TTypeConvNode(ThisNode).left.resultdef;
  1189. { See if we can simplify things to a smaller ordinal to
  1190. reduce code size and increase speed }
  1191. if is_signed(ThisDef) and
  1192. is_integer(ThisDef) and
  1193. { Byte-sized multiplications can cause problems }
  1194. (ThisDef.size>=2) and
  1195. { Make sure the divisor is in range }
  1196. (divisor>=TOrdDef(ThisDef).low) and
  1197. (divisor<=TOrdDef(ThisDef).high) then
  1198. begin
  1199. TOrdConstNode(TModDivNode(t).right).resultdef:=ThisDef;
  1200. TOrdConstNode(m).resultdef:=ThisDef;
  1201. TModDivNode(t).resultdef:=ThisDef;
  1202. { Destroy the typeconv node }
  1203. TModDivNode(t).left:=TTypeConvNode(ThisNode).left;
  1204. TTypeConvNode(ThisNode).left:=nil;
  1205. ThisNode.Free;
  1206. Continue;
  1207. end;
  1208. end;
  1209. ordconstn:
  1210. begin
  1211. { Just simplify into a constant }
  1212. Result:=inherited simplify(forinline);
  1213. Exit;
  1214. end;
  1215. else
  1216. ;
  1217. end;
  1218. DoOptimisation:=True;
  1219. Break;
  1220. end;
  1221. if DoOptimisation then
  1222. begin
  1223. ThisDef:=TModDivNode(t).left.resultdef;
  1224. if nodetype = equaln then
  1225. ThisType:=lten
  1226. else
  1227. ThisType:=gtn;
  { N is the width of the numerator type in bits }
  1228. N:=ThisDef.size*8;
  1229. calc_mul_inverse(N, TOrdConstNode(TModDivNode(t).right).value.uvalue, reciprocal, shift);
  1230. { Construct the following node tree for odd divisors:
  1231. <lten> (for equaln) or <gtn> (for unequaln)
  1232. <addn>
  1233. <muln>
  1234. <typeconv signed-to-unsigned>
  1235. <numerator node (TModDivNode(t).left)>
  1236. <reciprocal constant>
  1237. <comparison constant (effectively a signed shift)>
  1238. <comparison constant * 2>
  1239. For even divisors, convert them to the form k.2^b, with
  1240. odd k, then construct the following:
  1241. <lten> (for equaln) or <gtn> (for unequaln)
  1242. <ror>
  1243. (b)
  1244. <addn>
  1245. <muln>
  1246. <typeconv signed-to-unsigned>
  1247. <numerator node (TModDivNode(t).left)>
  1248. <reciprocal constant>
  1249. <comparison constant (effectively a signed shift)>
  1250. <comparison constant shr (b - 1)>
  1251. }
  1252. ThisNode:=ctypeconvnode.create_internal(TModDivNode(t).left, ThisDef);
  1253. TTypeConvNode(ThisNode).convtype:=tc_int_2_int;
  1254. ThisDef:=get_unsigned_inttype(ThisDef);
  1255. ThisNode.resultdef:=ThisDef;
  { the numerator now belongs to the new type conversion node, detach it from the mod node }
  1256. TModDivNode(t).left:=nil;
  1257. ConstNode:=cordconstnode.create(reciprocal, ThisDef, False);
  1258. ConstNode.resultdef:=ThisDef;
  1259. ThisNode:=caddnode.create_internal(muln, ThisNode, ConstNode);
  1260. ThisNode.resultdef:=ThisDef;
  1261. {$push}
  1262. {$warnings off}
  1263. if shift>0 then
  1264. comparison:=((aWord(1) shl ((N-1) and (SizeOf(aWord)*8-1))) div (divisor shr shift)) and -(1 shl shift)
  1265. else
  1266. comparison:=(aWord(1) shl ((N-1) and (SizeOf(aWord)*8-1))) div divisor;
  1267. {$pop}
  1268. ConstNode:=cordconstnode.create(comparison, ThisDef, False);
  1269. ConstNode.resultdef:=ThisDef;
  1270. ThisNode:=caddnode.create_internal(addn, ThisNode, ConstNode);
  1271. ThisNode.resultdef:=ThisDef;
  1272. if shift>0 then
  1273. begin
  1274. ConstNode:=cordconstnode.create(shift, u8inttype, False);
  1275. ConstNode.resultdef:=u8inttype;
  1276. ThisNode:=cinlinenode.createintern(in_ror_x_y,false,
  1277. ccallparanode.create(ConstNode,
  1278. ccallparanode.create(ThisNode, nil)));
  1279. ThisNode.resultdef:=ThisDef;
  1280. ConstNode:=cordconstnode.create(comparison shr (shift - 1), ThisDef, False);
  1281. end
  1282. else
  1283. ConstNode:=cordconstnode.create(comparison*2, ThisDef, False);
  1284. ConstNode.resultdef:=ThisDef;
  1285. Result:=CAddNode.create_internal(ThisType, ThisNode, ConstNode);
  1286. Result.resultdef:=resultdef;
  1287. Exit;
  1288. end;
  1289. end
  1290. else
  1291. begin
  1292. { For bit length N, convert "(x mod d) = 0" or "(x mod d) <> 0", where
  1293. d is an odd-numbered integer constant, to "(x * r) <= m", where
  1294. dr = 1 (mod 2^N) and m = floor(2^N / d).
  1295. If d is even, convert to the form k.2^b, where k is odd, then
  1296. convert to "(x * r) ror b <= m", where kr = 1 (mod 2^N) and
  1297. m = floor(2^N / d) = floor(2^(N-b) / k) }
  { Same type conversion stripping as in the signed case, but here the
    conversion source has to be an unsigned integer }
  1298. while True do
  1299. begin
  1300. ThisNode:=TModDivNode(t).left;
  1301. case ThisNode.nodetype of
  1302. typeconvn:
  1303. begin
  1304. ThisDef:=TTypeConvNode(ThisNode).left.resultdef;
  1305. { See if we can simplify things to a smaller ordinal to
  1306. reduce code size and increase speed }
  1307. if not is_signed(ThisDef) and
  1308. is_integer(ThisDef) and
  1309. { Byte-sized multiplications can cause problems }
  1310. (ThisDef.size>=2) and
  1311. { Make sure the divisor is in range }
  1312. (divisor>=TOrdDef(ThisDef).low) and
  1313. (divisor<=TOrdDef(ThisDef).high) then
  1314. begin
  1315. TOrdConstNode(TModDivNode(t).right).resultdef:=ThisDef;
  1316. TOrdConstNode(m).resultdef:=ThisDef;
  1317. TModDivNode(t).resultdef:=ThisDef;
  1318. { Destroy the typeconv node }
  1319. TModDivNode(t).left:=TTypeConvNode(ThisNode).left;
  1320. TTypeConvNode(ThisNode).left:=nil;
  1321. ThisNode.Free;
  1322. Continue;
  1323. end;
  1324. end;
  1325. ordconstn:
  1326. begin
  1327. { Just simplify into a constant }
  1328. Result:=inherited simplify(forinline);
  1329. Exit;
  1330. end;
  1331. else
  1332. ;
  1333. end;
  1334. DoOptimisation:=True;
  1335. Break;
  1336. end;
  1337. if DoOptimisation then
  1338. begin
  1339. ThisDef:=TModDivNode(t).left.resultdef;
  1340. { Construct the following node tree for odd divisors:
  1341. <lten> (for equaln) or <gtn> (for unequaln)
  1342. <muln>
  1343. <numerator node (TModDivNode(t).left)>
  1344. <reciprocal constant>
  1345. (2^N / divisor)
  1346. For even divisors, convert them to the form k.2^b, with
  1347. odd k, then construct the following:
  1348. <lten> (for equaln) or <gtn> (for unequaln)
  1349. <ror>
  1350. (b)
  1351. <muln>
  1352. <numerator node (TModDivNode(t).left)>
  1353. <reciprocal constant>
  1354. (2^N / divisor)
  1355. }
  1356. if nodetype=equaln then
  1357. ThisType:=lten
  1358. else
  1359. ThisType:=gtn;
  1360. N:=ThisDef.size*8;
  1361. calc_mul_inverse(N, TOrdConstNode(TModDivNode(t).right).value.uvalue, reciprocal, shift);
  1362. ConstNode:=cordconstnode.create(reciprocal, ThisDef, False);
  1363. ConstNode.resultdef:=ThisDef;
  1364. ThisNode:=caddnode.create_internal(muln, TModDivNode(t).left, ConstNode);
  1365. ThisNode.resultdef:=ThisDef;
  { the numerator now belongs to the new multiplication node, detach it from the mod node }
  1366. TModDivNode(t).left:=nil;
  1367. if shift>0 then
  1368. begin
  1369. ConstNode:=cordconstnode.create(shift, u8inttype, False);
  1370. ConstNode.resultdef:=u8inttype;
  1371. ThisNode:=cinlinenode.createintern(in_ror_x_y,false,
  1372. ccallparanode.create(ConstNode,
  1373. ccallparanode.create(ThisNode, nil)));
  1374. ThisNode.resultdef:=ThisDef;
  1375. comparison:=(aWord(1) shl ((N-shift) and (SizeOf(aWord)*8-1))) div (divisor shr shift);
  1376. end
  1377. else
  1378. begin
  1379. {$push}
  1380. {$warnings off}
  1381. { Because 2^N and divisor are relatively prime,
  1382. floor(2^N / divisor) = floor((2^N - 1) / divisor) }
  1383. comparison:=(aWord(not 0) shr (((SizeOf(aWord)*8)-N) and (SizeOf(aWord)*8-1))) div divisor;
  1384. {$pop}
  1385. end;
  1386. ConstNode:=cordconstnode.create(comparison, ThisDef, False);
  1387. ConstNode.resultdef:=ThisDef;
  1388. Result:=CAddNode.create_internal(ThisType, ThisNode, ConstNode);
  1389. Result.resultdef:=resultdef;
  1390. Exit;
  1391. end;
  1392. end;
  1393. end;
  1394. end;
  1395. end;
  1396. {$ifend defined(cpu64bitalu) or defined(cpu32bitalu) or defined(cpu16bitalu)}
  { nothing x86 specific applied: fall back to the generic simplification }
  1397. Result:=inherited simplify(forinline);
  1398. end;
  { Tells whether fused multiply-add may be used for this node.
    On i8086 the decision is left to the inherited implementation; on the
    other x86 targets the result must use the vector fpu and the selected fpu
    type must provide FMA or FMA4. }
  function tx86addnode.use_fma : boolean;
    begin
{$ifdef i8086}
      Result:=inherited use_fma;
{$else i8086}
      Result:=false;
      { only consider fma if the result stays in an xmm register, mixing fpu
        registers and fma makes no sense }
      if use_vectorfpu(resultdef) then
        Result:=(fpu_capabilities[current_settings.fputype]*[FPUX86_HAS_FMA,FPUX86_HAS_FMA4])<>[];
{$endif i8086}
    end;
  { Compares two single or double precision operands with COMISS/COMISD (or
    their AVX forms when UseAVX is true) and delivers the result as
    LOC_FLAGS. }
  procedure tx86addnode.second_cmpfloatvector;
    var
      cmpop : tasmop;

    { Emits "cmpop src,dst". src has to be a memory reference or an mm
      register; any other location raises the internal error errcode. }
    procedure emit_compare(var src : tlocation;dst : tregister;errcode : longint);
      begin
        case src.loc of
          LOC_REFERENCE,LOC_CREFERENCE:
            begin
              tcgx86(cg).make_simple_ref(current_asmdata.CurrAsmList,src.reference);
              current_asmdata.CurrAsmList.concat(taicpu.op_ref_reg(cmpop,S_NO,src.reference,dst));
            end;
          LOC_MMREGISTER,LOC_CMMREGISTER:
            current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(cmpop,S_NO,src.register,dst));
          else
            internalerror(errcode);
        end;
      end;

    begin
      { select the instruction from the precision of left and UseAVX }
      if is_single(left.resultdef) then
        begin
          if UseAVX then
            cmpop:=A_VCOMISS
          else
            cmpop:=A_COMISS;
        end
      else if is_double(left.resultdef) then
        begin
          if UseAVX then
            cmpop:=A_VCOMISD
          else
            cmpop:=A_COMISD;
        end
      else
        internalerror(200402222);

      pass_left_right;

      { fpu operands are always in reversed order on the stack }
      if (left.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER]) and
         (right.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER]) then
        toggleflag(nf_swapped);

      location_reset(location,LOC_FLAGS,OS_NO);

      { There is no direct fpu->mm register move, so fpu operands go to memory.
        They are not forced into mm registers because one memory operand can be
        used directly by the compare instruction, which gives shorter code. }
      if left.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER] then
        hlcg.location_force_mem(current_asmdata.CurrAsmList,left.location,left.resultdef);
      if right.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER] then
        hlcg.location_force_mem(current_asmdata.CurrAsmList,right.location,right.resultdef);

      if right.location.loc in [LOC_MMREGISTER,LOC_CMMREGISTER] then
        begin
          { right is already in an mm register: compare left against it and
            record that the operands have been exchanged }
          emit_compare(left.location,right.location.register,200402221);
          toggleflag(nf_swapped);
        end
      else
        begin
          { otherwise left is loaded into an mm register and right is compared
            against it }
          hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,true);
          emit_compare(right.location,left.location.register,200402223);
        end;

      location.resflags:=getfpuresflags;
      location_freetemp(current_asmdata.CurrAsmList,left.location);
      location_freetemp(current_asmdata.CurrAsmList,right.location);
    end;
  { Generates code for add/sub/mul/div on two vector operands.
    Only operands for which fits_in_mm_register(left.resultdef) holds are
    supported; everything else raises internal error 200610072.
    The result is placed in an mm register: a newly allocated one when UseAVX
    is true, otherwise the register of one of the operands is reused as
    destination.

    Fix: the element size is now always derived from the element type of the
    array definition. The commutative branch used to cast left.resultdef
    itself to tfloatdef, which disagrees with the other branch of this routine
    and reads the floattype field from a definition that is not a float
    definition. }
  procedure tx86addnode.second_opvector;
    var
      op : topcg;
      elemsize : tcgsize;
    begin
      pass_left_right;
      if (nf_swapped in flags) then
        swapleftright;

      { map the node type onto the generic code generator operation }
      case nodetype of
        addn :
          op:=OP_ADD;
        muln :
          op:=OP_MUL;
        subn :
          op:=OP_SUB;
        slashn :
          op:=OP_DIV;
        else
          internalerror(200610071);
      end;

      if fits_in_mm_register(left.resultdef) then
        begin
          location_reset(location,LOC_MMREGISTER,def_cgsize(resultdef));
          { size of a single vector element, shared by all code paths below }
          elemsize:=tfloat2tcgsize[tfloatdef(tarraydef(left.resultdef).elementdef).floattype];
          { we can use only right as left operand if the operation is commutative }
          if (right.location.loc=LOC_MMREGISTER) and (op in [OP_ADD,OP_MUL]) then
            begin
              if UseAVX then
                begin
                  location.register:=cg.getmmregister(current_asmdata.CurrAsmList,OS_VECTOR);
                  cg.a_opmm_loc_reg_reg(current_asmdata.CurrAsmList,op,elemsize,
                    left.location,right.location.register,location.register,nil);
                end
              else
                begin
                  { two operand form: the register of right is overwritten with the result }
                  location.register:=right.location.register;
                  cg.a_opmm_loc_reg(current_asmdata.CurrAsmList,op,elemsize,
                    left.location,location.register,nil);
                end;
            end
          else
            begin
              location_force_mmreg(current_asmdata.CurrAsmList,left.location,false);
              if UseAVX then
                begin
                  location.register:=cg.getmmregister(current_asmdata.CurrAsmList,OS_VECTOR);
                  cg.a_opmm_loc_reg_reg(current_asmdata.CurrAsmList,op,elemsize,
                    right.location,left.location.register,location.register,nil);
                end
              else
                begin
                  { two operand form: the register of left is overwritten with the result }
                  location.register:=left.location.register;
                  cg.a_opmm_loc_reg(current_asmdata.CurrAsmList,op,elemsize,
                    right.location,location.register,nil);
                end;
            end;
        end
      else
        begin
          { not yet supported }
          internalerror(200610072);
        end
    end;
  { Generates code for a floating point add/sub/mul/div.
    Results that use the vector fpu are delegated to second_addfloatavx or
    second_addfloatsse; everything else is done on the x87 and the result is
    left in ST (LOC_FPUREGISTER). }
  procedure tx86addnode.second_addfloat;
    const
      { first index (sub/div only): operands swapped?
        last index: is one operand used directly from memory?
        false selects the popping register form, true the plain form }
      ops_add: array[boolean] of TAsmOp = (A_FADDP,A_FADD);
      ops_mul: array[boolean] of TAsmOp = (A_FMULP,A_FMUL);
      ops_sub: array[boolean,boolean] of TAsmOp = ((A_FSUBP,A_FSUB),(A_FSUBRP,A_FSUBR));
      ops_div: array[boolean,boolean] of TAsmOp = ((A_FDIVP,A_FDIV),(A_FDIVRP,A_FDIVR));
    var
      op : TAsmOp;
      refnode : tnode;
      hasref : boolean;

    { Removes a type conversion whose source is of type single or double and
      puts the converted node in its place. }
    procedure strip_float_conv(var operand : tnode);
      var
        conv : tnode;
      begin
        if operand.nodetype<>typeconvn then
          exit;
        if not(is_double(ttypeconvnode(operand).left.resultdef) or
               is_single(ttypeconvnode(operand).left.resultdef)) then
          exit;
        conv:=operand;
        operand:=ttypeconvnode(conv).left;
        ttypeconvnode(conv).left:=nil;
        conv.Free;
      end;

    begin
      if use_vectorfpu(resultdef) then
        begin
          if UseAVX then
            second_addfloatavx
          else
            second_addfloatsse;
          exit;
        end;

      { can the operation do the conversion? }
      strip_float_conv(left);
      strip_float_conv(right);

      pass_left_right;
      prepare_x87_locations(refnode);
      hasref:=assigned(refnode);

      case nodetype of
        addn :
          op:=ops_add[hasref];
        muln :
          op:=ops_mul[hasref];
        subn :
          op:=ops_sub[nf_swapped in flags,hasref];
        slashn :
          op:=ops_div[nf_swapped in flags,hasref];
        else
          internalerror(2003042203);
      end;

      if not hasref then
        begin
          { both operands are on the fpu stack, one of them gets popped }
          emit_reg_reg(op,S_NO,NR_ST,NR_ST1);
          tcgx86(cg).dec_fpu_stack;
        end
      else
        emit_ref(op,tcgsize2opsize[refnode.location.size],refnode.location.reference);

      location_reset(location,LOC_FPUREGISTER,def_cgsize(resultdef));
      location.register:=NR_ST;
    end;
  { Generates code for a floating point comparison and delivers the result as
    LOC_FLAGS. Operands that use the vector fpu are handled by
    second_cmpfloatvector, everything else is compared on the x87. }
  procedure tx86addnode.second_cmpfloat;

{$ifndef x86_64}
    { Comparison for cpu types below cpu_Pentium2: FCOMPP, after which the fpu
      status word is transferred to the cpu flags through AH and SAHF. }
    procedure compare_via_status_word;
{$ifdef i8086}
      var
        tmpref: treference;
{$endif i8086}
      begin
        emit_none(A_FCOMPP,S_NO);
        tcgx86(cg).dec_fpu_stack;
        tcgx86(cg).dec_fpu_stack;

        { load fpu flags }
{$ifdef i8086}
        if current_settings.cputype < cpu_286 then
          begin
            { below cpu_286 the status word is stored to a temp and its high
              byte is loaded into AH from there }
            tg.gettemp(current_asmdata.CurrAsmList,2,2,tt_normal,tmpref);
            emit_ref(A_FSTSW,S_NO,tmpref);
            cg.getcpuregister(current_asmdata.CurrAsmList,NR_AX);
            inc(tmpref.offset);
            emit_ref_reg(A_MOV,S_B,tmpref,NR_AH);
            dec(tmpref.offset);
            emit_none(A_SAHF,S_NO);
            cg.ungetcpuregister(current_asmdata.CurrAsmList,NR_AX);
            tg.ungettemp(current_asmdata.CurrAsmList,tmpref);
          end
        else
{$endif i8086}
          begin
            cg.getcpuregister(current_asmdata.CurrAsmList,NR_AX);
            emit_reg(A_FNSTSW,S_NO,NR_AX);
            emit_none(A_SAHF,S_NO);
            cg.ungetcpuregister(current_asmdata.CurrAsmList,NR_AX);
          end;

        if cs_fpu_fwait in current_settings.localswitches then
          current_asmdata.CurrAsmList.concat(Taicpu.Op_none(A_FWAIT,S_NO));
      end;
{$endif x86_64}

    { Comparison with FCOMIP, which sets the cpu flags directly. }
    procedure compare_via_fcomip;
      begin
        current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_FCOMIP,S_NO,NR_ST1,NR_ST0));
        { fcomip pops only one fpu register }
        current_asmdata.CurrAsmList.concat(taicpu.op_reg(A_FSTP,S_NO,NR_ST0));
        tcgx86(cg).dec_fpu_stack;
        tcgx86(cg).dec_fpu_stack;
      end;

    begin
      if use_vectorfpu(left.resultdef) or use_vectorfpu(right.resultdef) then
        begin
          second_cmpfloatvector;
          exit;
        end;

      pass_left_right;
      force_left_and_right_fpureg;

{$ifndef x86_64}
      if current_settings.cputype<cpu_Pentium2 then
        compare_via_status_word
      else
{$endif x86_64}
        compare_via_fcomip;

      location_reset(location,LOC_FLAGS,OS_NO);
      location.resflags:=getfpuresflags;
    end;
  1650. {*****************************************************************************
  1651. Add64bit
  1652. *****************************************************************************}
  { 64 bit arithmetic: with a 64 bit alu this is ordinary ordinal arithmetic,
    all other targets have to provide their own implementation. }
  procedure tx86addnode.second_add64bit;
    begin
{$ifndef cpu64bitalu}
      { must be implemented separate }
      internalerror(200402042);
{$else cpu64bitalu}
      second_addordinal;
{$endif cpu64bitalu}
    end;
  { 64 bit comparison: with a 64 bit alu this is an ordinary ordinal
    comparison, all other targets have to provide their own implementation. }
  procedure tx86addnode.second_cmp64bit;
    begin
{$ifndef cpu64bitalu}
      { must be implemented separate }
      internalerror(200402043);
{$else cpu64bitalu}
      second_cmpordinal;
{$endif cpu64bitalu}
    end;
  1671. {*****************************************************************************
  1672. AddOrdinal
  1673. *****************************************************************************}
  { Generates code for an ordinal add, sub, mul, and, or or xor node.

    After both operands have been evaluated (pass_left_right) one of two
    strategies is chosen:
      - a three operand form that writes into a newly set up result register
      - a two operand form that reuses the register holding the left operand
    For add, sub and mul an overflow check is emitted at the end when
    needoverflowcheck asks for one. Any other node type raises an internal
    error. }
  procedure tx86addnode.second_addordinal;
    var
      opsize : tcgsize;
      unsigned : boolean;
      cgop : topcg;
      checkoverflow : Boolean;
      threeoperand : boolean;
      ovloc : tlocation;
      scratchreg : TRegister;
    begin
      { the operation is treated as unsigned as soon as one operand is unsigned }
      unsigned:=not(is_signed(left.resultdef) and is_signed(right.resultdef));

      ovloc.loc:=LOC_VOID;

      { only add, sub and mul are candidates for overflow checking }
      checkoverflow:=nodetype in [addn,subn,muln];

      { translate the node type into a code generator operation }
      case nodetype of
        addn:
          cgop:=OP_ADD;
        subn:
          cgop:=OP_SUB;
        muln:
          begin
            if unsigned then
              cgop:=OP_MUL
            else
              cgop:=OP_IMUL;
          end;
        andn:
          cgop:=OP_AND;
        orn:
          cgop:=OP_OR;
        xorn:
          cgop:=OP_XOR;
        else
          internalerror(2015022501);
      end;

      { a candidate operation is only checked if overflow checking is needed }
      checkoverflow:=checkoverflow and needoverflowcheck;

      opsize:=def_cgsize(left.resultdef);

      pass_left_right;

      { Do we have to allocate a register anyway? If so, a three operand
        instruction is the better choice. Exceptions:
          - sub with right being a reference gains nothing from three operands
          - a three operand mul only makes sense if a constant is involved
            (or, outside i8086, with BMI2 available and no overflow check)
          - or/and/xor never take this route
        An add whose operands are both registers or constants always uses the
        three operand form. }
      threeoperand:=
        ((left.location.loc<>LOC_REGISTER) and (right.location.loc<>LOC_REGISTER) and
         ((nodetype<>subn) or not(right.location.loc in [LOC_REFERENCE,LOC_CREFERENCE])) and
         ((nodetype<>muln) or (left.location.loc=LOC_CONSTANT) or (right.location.loc=LOC_CONSTANT)
{$ifndef i8086}
          or ((CPUX86_HAS_BMI2 in cpu_capabilities[current_settings.cputype]) and (not(needoverflowcheck)))
{$endif i8086}
         ) and
         (not(nodetype in [orn,andn,xorn]))) or
        ((nodetype=addn) and
         (left.location.loc in [LOC_REGISTER,LOC_CREGISTER,LOC_CONSTANT]) and
         (right.location.loc in [LOC_REGISTER,LOC_CREGISTER,LOC_CONSTANT]));

      if threeoperand then
        begin
          { allocate registers for the operands and set up the result register }
          force_reg_left_right(false,true);
          set_result_location_reg;
          if nodetype=subn then
            begin
              { subtraction is a special case since it is not commutative:
                swap the operands back if they have been swapped }
              if (nf_swapped in flags) then
                swapleftright;
              if left.location.loc=LOC_CONSTANT then
                begin
                  { the left operand is a constant: load it into a scratch
                    register before subtracting right from it }
                  scratchreg:=hlcg.getintregister(current_asmdata.CurrAsmList,resultdef);
                  hlcg.a_load_const_reg(current_asmdata.CurrAsmList,resultdef,
                    left.location.value,scratchreg);
                  if checkoverflow then
                    cg.a_reg_alloc(current_asmdata.CurrAsmList, NR_DEFAULTFLAGS);
                  hlcg.a_op_reg_reg_reg_checkoverflow(current_asmdata.CurrAsmList,OP_SUB,resultdef,
                    right.location.register,scratchreg,location.register,checkoverflow,ovloc);
                end
              else
                begin
                  if checkoverflow then
                    cg.a_reg_alloc(current_asmdata.CurrAsmList, NR_DEFAULTFLAGS);
                  if right.location.loc=LOC_CONSTANT then
                    hlcg.a_op_const_reg_reg_checkoverflow(current_asmdata.CurrAsmList,OP_SUB,resultdef,
                      right.location.value,left.location.register,
                      location.register,checkoverflow,ovloc)
                  else
                    hlcg.a_op_reg_reg_reg_checkoverflow(current_asmdata.CurrAsmList,OP_SUB,resultdef,
                      right.location.register,left.location.register,
                      location.register,checkoverflow,ovloc);
                end;
            end
          else
            begin
              if checkoverflow then
                cg.a_reg_alloc(current_asmdata.CurrAsmList, NR_DEFAULTFLAGS);
              if (right.location.loc=LOC_CONSTANT) then
                hlcg.a_op_const_reg_reg_checkoverflow(current_asmdata.CurrAsmList,cgop,resultdef,
                  right.location.value,left.location.register,
                  location.register,checkoverflow,ovloc)
              else
                hlcg.a_op_reg_reg_reg_checkoverflow(current_asmdata.CurrAsmList,cgop,resultdef,
                  left.location.register,right.location.register,
                  location.register,checkoverflow,ovloc);
            end;
        end
      else
        begin
          { at least one location should be a register; if so, try to re-use
            it so that two operand opcodes can be used }
          if left.location.loc<>LOC_REGISTER then
            begin
              if right.location.loc=LOC_REGISTER then
                begin
                  { right already lives in a register: let it act as left }
                  location_swap(left.location,right.location);
                  toggleflag(nf_swapped);
                end
              else
                hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,left.resultdef,false);
            end;

          { at this point, left.location.loc should be LOC_REGISTER }
          if right.location.loc<>LOC_REGISTER then
            begin
              { subset locations are moved into a register first }
              if right.location.loc in [LOC_CSUBSETREF,LOC_CSUBSETREG,LOC_SUBSETREF,LOC_SUBSETREG] then
                hlcg.location_force_reg(current_asmdata.CurrAsmList,right.location,right.resultdef,left.resultdef,true);
              if (nodetype=subn) and (nf_swapped in flags) then
                begin
                  { swapped subtraction: load right into a new register and
                    apply the operation with the previous left register }
                  scratchreg:=left.location.register;
                  left.location.register:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
                  cg.a_load_loc_reg(current_asmdata.CurrAsmList,opsize,right.location,left.location.register);
                  if checkoverflow then
                    cg.a_reg_alloc(current_asmdata.CurrAsmList, NR_DEFAULTFLAGS);
                  cg.a_op_reg_reg(current_asmdata.CurrAsmList,cgop,opsize,scratchreg,left.location.register);
                end
              else
                begin
                  if checkoverflow then
                    cg.a_reg_alloc(current_asmdata.CurrAsmList, NR_DEFAULTFLAGS);
                  cg.a_op_loc_reg(current_asmdata.CurrAsmList,cgop,opsize,right.location,left.location.register);
                end;
              location_freetemp(current_asmdata.CurrAsmList,right.location);
            end
          else
            begin
              if checkoverflow then
                cg.a_reg_alloc(current_asmdata.CurrAsmList, NR_DEFAULTFLAGS);
              if (nodetype=subn) and (nf_swapped in flags) then
                begin
                  { when swapped, the other register receives the result }
                  cg.a_op_reg_reg(current_asmdata.CurrAsmList,cgop,opsize,
                    left.location.register,right.location.register);
                  location_swap(left.location,right.location);
                  toggleflag(nf_swapped);
                end
              else
                cg.a_op_reg_reg(current_asmdata.CurrAsmList,cgop,opsize,
                  right.location.register,left.location.register);
            end;

          { the left location now describes the result }
          location_copy(location,left.location);
        end;

      { emit overflow check if required }
      if checkoverflow then
        cg.g_overflowcheck_loc(current_asmdata.CurrAsmList,Location,resultdef,ovloc);
    end;
  { Code generation for boolean add nodes.
    The inherited implementation handles
      - or/and nodes unless full boolean evaluation is switched on and the
        node is not flagged nf_short_bool
      - every node whose left operand is a 64 bit type
    Everything else is generated by second_addordinal. }
  procedure tx86addnode.second_addboolean;
    var
      shortcircuit : boolean;
    begin
      shortcircuit:=(nodetype in [orn,andn]) and
        ((nf_short_bool in flags) or
         not(cs_full_boolean_eval in current_settings.localswitches));
      if shortcircuit or is_64bit(left.resultdef) then
        inherited second_addboolean
      else
        second_addordinal;
    end;
  { Generates code for an ordinal comparison; the result is left in the
    flags (LOC_FLAGS) with the condition supplied by getresflags.

    A constant right operand is compared directly against a left operand in
    memory where possible; in all other cases left is brought into a register
    and the generic compare code is emitted. }
  procedure tx86addnode.second_cmpordinal;
    var
      opdef : tdef;
      opsize : tcgsize;
      unsigned : boolean;
      constvsmem : boolean;
    begin
      { the comparison is unsigned as soon as one operand is unsigned }
      unsigned:=not(is_signed(left.resultdef) and is_signed(right.resultdef));
      opdef:=left.resultdef;
      opsize:=def_cgsize(opdef);

      pass_left_right;

      { candidate for "cmp mem,const": constant on the right, reference on the left }
      constvsmem:=(right.location.loc=LOC_CONSTANT) and
        (left.location.loc in [LOC_REFERENCE, LOC_CREFERENCE]);
{$ifdef x86_64}
      { for 64 bit operands the direct form is only used when the constant
        lies within the longint range (presumably because of the immediate
        size limit of the instruction) }
      if constvsmem and (opsize in [OS_64,OS_S64]) then
        constvsmem:=(right.location.value>=low(longint)) and
          (right.location.value<=high(longint));
{$endif x86_64}

      if constvsmem then
        begin
          cg.a_reg_alloc(current_asmdata.CurrAsmList, NR_DEFAULTFLAGS);
          emit_const_ref(A_CMP, TCGSize2Opsize[opsize], right.location.value, left.location.reference);
          location_freetemp(current_asmdata.CurrAsmList,left.location);
        end
      else
        begin
          left_must_be_reg(opdef,opsize,false);
          emit_generic_code(A_CMP,opsize,unsigned,false,false);
          location_freetemp(current_asmdata.CurrAsmList,right.location);
          location_freetemp(current_asmdata.CurrAsmList,left.location);
        end;

      { the outcome of the comparison is available in the flags }
      location_reset(location,LOC_FLAGS,OS_NO);
      location.resflags:=getresflags(unsigned);
    end;
  begin
    { unit initialization: make tx86addnode the class referenced by caddnode }
    caddnode:=tx86addnode;
  end.