nx86add.pas 93 KB


  1. {
  2. Copyright (c) 2000-2002 by Florian Klaempfl
  3. Common code generation for add nodes on the i386 and x86
  4. This program is free software; you can redistribute it and/or modify
  5. it under the terms of the GNU General Public License as published by
  6. the Free Software Foundation; either version 2 of the License, or
  7. (at your option) any later version.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. GNU General Public License for more details.
  12. You should have received a copy of the GNU General Public License
  13. along with this program; if not, write to the Free Software
  14. Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  15. ****************************************************************************
  16. }
  17. unit nx86add;
  18. {$i fpcdefs.inc}
  19. interface
  20. uses
  21. symtype,
  22. cgbase,
  23. cpubase,
  24. node,nadd,ncgadd;
  25. type
  { Add-node code generator shared by the x86 targets. It derives from the
    generic tcgaddnode and overrides its pass_1/simplify and second_* hooks. }
  26. tx86addnode = class(tcgaddnode)
  27. protected
  { condition flags for the node's comparison type; both take nf_swapped into account }
  28. function getresflags(unsigned : boolean) : tresflags;
  29. function getfpuresflags : tresflags;
  { operand preparation helpers }
  30. procedure left_must_be_reg(opdef: tdef; opsize:TCGSize;noswap:boolean);
  31. procedure force_left_and_right_fpureg;
  32. procedure prepare_x87_locations(out refnode: tnode);
  { instruction emission helpers; the destination is always left.location.register }
  33. procedure emit_op_right_left(op:TAsmOp;opsize:TCgSize;AllocFlags:boolean);
  34. procedure emit_generic_code(op:TAsmOp;opsize:TCgSize;unsigned,extra_not,mboverflow:boolean);
  35. procedure second_cmpfloatvector;
  36. procedure second_addfloatsse;
  37. procedure second_addfloatavx;
  38. public
  39. function pass_1 : tnode;override;
  40. function simplify(forinline : boolean) : tnode; override;
  41. function use_fma : boolean;override;
  42. procedure second_addfloat;override;
  { the small set add overrides are not compiled for the i8086 target }
  43. {$ifndef i8086}
  44. procedure second_addsmallset;override;
  45. procedure second_addsmallsetelement;override;
  46. {$endif not i8086}
  47. procedure second_add64bit;override;
  48. procedure second_cmpfloat;override;
  49. procedure second_cmpsmallset;override;
  50. procedure second_cmp64bit;override;
  51. procedure second_cmpordinal;override;
  52. procedure second_addordinal;override;
  53. procedure second_addboolean;override;
  { MMX code generation is only present when SUPPORT_MMX is defined }
  54. {$ifdef SUPPORT_MMX}
  55. procedure second_opmmx;override;
  56. {$endif SUPPORT_MMX}
  57. procedure second_opvector;override;
  58. end;
  59. implementation
  60. uses
  61. globtype,globals,
  62. verbose,cutils,compinnr,
  63. cpuinfo,
  64. aasmbase,aasmdata,aasmcpu,
  65. symconst,symdef,
  66. cgobj,hlcgobj,cgx86,cga,cgutils,
  67. tgobj,ncgutil,nutils,
  68. ncon,nset,ninl,ncnv,ncal,nmat,
  69. defutil,defcmp,constexp,
  70. pass_1,pass_2,htypechk;
  71. { Range check must be disabled explicitly as the code serves
  72. on three different architecture sizes }
  73. {$R-}
  74. {*****************************************************************************
  75. Helpers
  76. *****************************************************************************}
  { Emits the integer instruction 'op' for this node, using
    left.location.register as destination, followed by the optional
    overflow check.

    op          x86 opcode to emit (A_ADD, A_SUB, A_IMUL, A_AND, A_CMP, ...)
    opsize      size of the operation
    unsigned    selects the overflow test: F_AE (no carry) when true,
                F_NO (no overflow) when false
    extra_not   complement one operand with NOT before the operation; in the
                "right is not a register" fallback the complemented value is
                combined with A_AND regardless of 'op'
    mboverflow  the operation may overflow; when needoverflowcheck is also
                true a conditional call to FPC_OVERFLOW is emitted

    The caller must already have forced left.location into a register. }
  procedure tx86addnode.emit_generic_code(op:TAsmOp;opsize:TCGSize;unsigned,extra_not,mboverflow:boolean);
    var
      power : longint;
      hl4 : tasmlabel;
      r : Tregister;
      href : treference;
      overflowcheck: boolean;
      comparison: boolean;
    begin
      overflowcheck:=needoverflowcheck;
      { comparisons leave their result in the flags, so the flags register
        has to be allocated before the instruction is emitted }
      comparison:=
        (op=A_CMP) or (op=A_TEST) or (op=A_BT) or is_boolean(resultdef);

      { at this point, left.location.loc should be LOC_REGISTER }
      if right.location.loc=LOC_REGISTER then
        begin
          { right.location is a LOC_REGISTER }
          { when swapped another result register }
          if (nodetype=subn) and (nf_swapped in flags) then
            begin
              if extra_not then
                emit_reg(A_NOT,TCGSize2Opsize[opsize],left.location.register);
              emit_reg_reg(op,TCGSize2Opsize[opsize],left.location.register,right.location.register);
              { newly swapped also set swapped flag }
              location_swap(left.location,right.location);
              toggleflag(nf_swapped);
            end
          else
            begin
              if extra_not then
                emit_reg(A_NOT,TCGSize2Opsize[opsize],right.location.register);
              { for commutative operations the locations are exchanged first }
              if (op=A_ADD) or (op=A_OR) or (op=A_AND) or (op=A_XOR) or (op=A_IMUL) then
                location_swap(left.location,right.location);
              if comparison then
                cg.a_reg_alloc(current_asmdata.CurrAsmList, NR_DEFAULTFLAGS);
              emit_reg_reg(op,TCGSize2Opsize[opsize],right.location.register,left.location.register);
            end;
        end
      else
        begin
          { right.location is not a LOC_REGISTER }
          if (nodetype=subn) and (nf_swapped in flags) then
            begin
              if extra_not then
                cg.a_op_reg_reg(current_asmdata.CurrAsmList,OP_NOT,opsize,left.location.register,left.location.register);
              { compute right-left in a scratch register, then move the
                result back into the left register }
              r:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
              hlcg.a_load_loc_reg(current_asmdata.CurrAsmList,right.resultdef,cgsize_orddef(opsize),right.location,r);
              if comparison then
                cg.a_reg_alloc(current_asmdata.CurrAsmList, NR_DEFAULTFLAGS);
              emit_reg_reg(op,TCGSize2Opsize[opsize],left.location.register,r);
              cg.a_load_reg_reg(current_asmdata.CurrAsmList,opsize,opsize,r,left.location.register);
            end
          else
            begin
              { Optimizations when right.location is a constant value }
              if (op=A_CMP) and
                 (nodetype in [equaln,unequaln]) and
                 (right.location.loc=LOC_CONSTANT) and
                 (right.location.value=0) then
                begin
                  { 'test $-1,%reg' is transformable into 'test $-1,spilltemp' if %reg needs
                    spilling, while 'test %reg,%reg' still requires loading into register.
                    If spilling is not necessary, it is changed back into 'test %reg,%reg' by
                    peephole optimizer (this optimization is currently available only for i386). }
                  cg.a_reg_alloc(current_asmdata.CurrAsmList,NR_DEFAULTFLAGS);
{$ifdef i386}
                  emit_const_reg(A_TEST,TCGSize2Opsize[opsize],aint(-1),left.location.register)
{$else i386}
                  emit_reg_reg(A_TEST,TCGSize2Opsize[opsize],left.location.register,left.location.register);
{$endif i386}
                end
              else if (op=A_ADD) and
                 (right.location.loc=LOC_CONSTANT) and
                 (right.location.value=1) and
                 not overflowcheck and
                 UseIncDec then
                begin
                  emit_reg(A_INC,TCGSize2Opsize[opsize],left.location.register);
                end
              else if (op=A_SUB) and
                 (right.location.loc=LOC_CONSTANT) and
                 (right.location.value=1) and
                 not overflowcheck and
                 UseIncDec then
                begin
                  emit_reg(A_DEC,TCGSize2Opsize[opsize],left.location.register);
                end
              else if (op=A_IMUL) and
                 (right.location.loc=LOC_CONSTANT) and
                 (ispowerof2(int64(right.location.value),power)) and
                 { like the INC/DEC/LEA shortcuts, the shift may only replace
                   the multiplication when no overflow check is requested:
                   the flags are not allocated on this path and SHL does not
                   report overflow the way IMUL does }
                 not overflowcheck then
                begin
                  emit_const_reg(A_SHL,TCGSize2Opsize[opsize],power,left.location.register);
                end
              else if (op=A_IMUL) and
                 (right.location.loc=LOC_CONSTANT) and
                 (right.location.value>1) and (ispowerof2(int64(right.location.value)-1,power)) and
                 (power in [1..3]) and
                 not overflowcheck then
                begin
                  { multiplication by 3, 5 or 9: lea (reg,reg,2/4/8),newreg }
                  reference_reset_base(href,left.location.register,0,ctempposinvalid,0,[]);
                  href.index:=left.location.register;
                  href.scalefactor:=int64(right.location.value)-1;
                  left.location.register:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
                  current_asmdata.CurrAsmList.concat(taicpu.op_ref_reg(A_LEA,TCgSize2OpSize[opsize],href,left.location.register));
                end
              else
                begin
                  if extra_not then
                    begin
                      r:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
                      hlcg.a_load_loc_reg(current_asmdata.CurrAsmList,right.resultdef,cgsize_orddef(opsize),right.location,r);
                      emit_reg(A_NOT,TCGSize2Opsize[opsize],r);
                      if comparison or (mboverflow and overflowcheck) then
                        cg.a_reg_alloc(current_asmdata.CurrAsmList, NR_DEFAULTFLAGS);
                      emit_reg_reg(A_AND,TCGSize2Opsize[opsize],r,left.location.register);
                    end
                  else
                    emit_op_right_left(op,opsize,comparison or (mboverflow and overflowcheck));
                end;
            end;
        end;

      { only in case of overflow operations }
      { produce overflow code }
      { we must put it here directly, because sign of operation }
      { is in unsigned VAR!! }
      if mboverflow then
        begin
          if overflowcheck then
            begin
              current_asmdata.getjumplabel(hl4);
              { skip the FPC_OVERFLOW call when the operation did not overflow }
              if unsigned then
                cg.a_jmp_flags(current_asmdata.CurrAsmList,F_AE,hl4)
              else
                cg.a_jmp_flags(current_asmdata.CurrAsmList,F_NO,hl4);
              if not comparison then
                cg.a_reg_dealloc(current_asmdata.CurrAsmList, NR_DEFAULTFLAGS);
              cg.a_call_name(current_asmdata.CurrAsmList,'FPC_OVERFLOW',false);
              cg.a_label(current_asmdata.CurrAsmList,hl4);
            end;
        end;
    end;
  { Makes sure the left operand ends up in a register of type opdef.

    Unless noswap is set, a right operand that already sits in a register
    (LOC_REGISTER or LOC_CREGISTER) is exchanged with left instead, toggling
    nf_swapped. Afterwards both operands are widened/narrowed to opsize when
    their size differs from it (ignoring signedness). }
  procedure tx86addnode.left_must_be_reg(opdef: tdef; opsize:TCGSize;noswap:boolean);
    var
      is_compare : boolean;
      wanted_size : TCGSize;
    begin
      { maybe we can reuse a constant register when the operation is a
        comparison that doesn't change the value of the register }
      is_compare:=nodetype in [ltn,lten,gtn,gten,equaln,unequaln];
      wanted_size:=tcgsize2unsigned[opsize];

      if left.location.loc<>LOC_REGISTER then
        begin
          if noswap or
             not(right.location.loc in [LOC_REGISTER,LOC_CREGISTER]) then
            { no swap possible: load left into a register }
            hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,opdef,is_compare)
          else
            begin
              { right is in a register, so exchange the two locations }
              location_swap(left.location,right.location);
              toggleflag(nf_swapped);
              { what used to be right was a constant register: it still has
                to be turned into a proper register location }
              if left.location.loc=LOC_CREGISTER then
                begin
                  hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,opdef,is_compare);
                  location:=left.location;
                end;
            end;
        end;

      if (right.location.loc<>LOC_CONSTANT) and
         (tcgsize2unsigned[right.location.size]<>wanted_size) then
        hlcg.location_force_reg(current_asmdata.CurrAsmList,right.location,right.resultdef,opdef,true);

      if (left.location.loc<>LOC_CONSTANT) and
         (tcgsize2unsigned[left.location.size]<>wanted_size) then
        hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,opdef,false);
    end;
  { Loads both operands into FPU registers, right first. Whenever left was
    already in an FPU register, nf_swapped is toggled instead of loading it
    (fpu operands are then in the wrong order on the stack). }
  procedure tx86addnode.force_left_and_right_fpureg;
    begin
      if right.location.loc<>LOC_FPUREGISTER then
        hlcg.location_force_fpureg(current_asmdata.CurrAsmList,right.location,right.resultdef,false);

      if left.location.loc=LOC_FPUREGISTER then
        { left was on the stack => swap }
        toggleflag(nf_swapped)
      else
        hlcg.location_force_fpureg(current_asmdata.CurrAsmList,left.location,left.resultdef,false);
    end;
  281. { Makes sides suitable for executing an x87 instruction:
  282. if either side is OS_F32/OS_F64-sized LOC_REFERENCE, it is returned in 'refnode'
  283. everything else is loaded to FPU stack. }
  { refnode is nil on return when both operands ended up in FPU registers.
    nf_swapped is toggled as a side effect in several branches below. }
  284. procedure tx86addnode.prepare_x87_locations(out refnode: tnode);
  285. begin
  286. refnode:=nil;
  287. { later on, no mm registers are allowed, so transfer everything to memory here
  288. below it is loaded into an fpu register if needed }
  289. if left.location.loc in [LOC_CMMREGISTER,LOC_MMREGISTER] then
  290. hlcg.location_force_mem(current_asmdata.CurrAsmList,left.location,left.resultdef);
  291. if right.location.loc in [LOC_CMMREGISTER,LOC_MMREGISTER] then
  292. hlcg.location_force_mem(current_asmdata.CurrAsmList,right.location,right.resultdef);
  { the number of operands already in an FPU register (0, 1 or 2) selects the strategy }
  293. case ord(left.location.loc=LOC_FPUREGISTER)+ord(right.location.loc=LOC_FPUREGISTER) of
  294. 0:
  295. begin
  { neither is in an FPU register: right is always loaded; left must be a
    memory reference and is only loaded when it is not OS_F32/OS_F64 sized }
  296. hlcg.location_force_fpureg(current_asmdata.CurrAsmList,right.location,right.resultdef,false);
  297. if not(left.location.loc in [LOC_REFERENCE,LOC_CREFERENCE]) then
  298. InternalError(2013090803);
  299. if (left.location.size in [OS_F32,OS_F64]) then
  300. begin
  301. refnode:=left;
  302. toggleflag(nf_swapped);
  303. end
  304. else
  305. hlcg.location_force_fpureg(current_asmdata.CurrAsmList,left.location,left.resultdef,false);
  306. end;
  307. 1:
  308. begin { if left is on the stack then swap. }
  { refnode is the operand that is NOT yet in an FPU register; it has to be a memory reference }
  309. if (left.location.loc=LOC_FPUREGISTER) then
  310. refnode:=right
  311. else
  312. refnode:=left;
  313. if not(refnode.location.loc in [LOC_REFERENCE,LOC_CREFERENCE]) then
  314. InternalError(2013090801);
  315. if not (refnode.location.size in [OS_F32,OS_F64]) then
  316. begin
  { not OS_F32/OS_F64 sized: load it as well and report no memory operand }
  317. hlcg.location_force_fpureg(current_asmdata.CurrAsmList,refnode.location,refnode.resultdef,false);
  318. if (refnode=right) then
  319. toggleflag(nf_swapped);
  320. refnode:=nil;
  321. end
  322. else
  323. begin
  { the memory operand is left: record the swap }
  324. if (refnode=left) then
  325. toggleflag(nf_swapped);
  326. end;
  327. end;
  328. 2: { fpu operands are always in the wrong order on the stack }
  329. toggleflag(nf_swapped);
  330. else
  331. InternalError(2013090802);
  332. end;
  333. end;
  { Emits "op right,left" with left.location.register as destination.

    right may be a register, a memory reference or a constant; subset
    locations are first loaded into a register. When AllocFlags is set the
    flags register is allocated immediately before the instruction itself,
    i.e. after any code needed to make the right operand usable. }
  procedure tx86addnode.emit_op_right_left(op:TAsmOp;opsize:TCgsize;AllocFlags:boolean);
{$ifdef x86_64}
    var
      constreg : tregister;
{$endif x86_64}

    { allocates the flags register if the caller asked for it }
    procedure alloc_flags_if_requested;
      begin
        if AllocFlags then
          cg.a_reg_alloc(current_asmdata.CurrAsmList, NR_DEFAULTFLAGS);
      end;

    begin
      if right.location.loc in [LOC_CSUBSETREG,LOC_SUBSETREG,LOC_SUBSETREF,LOC_CSUBSETREF] then
        hlcg.location_force_reg(current_asmdata.CurrAsmList,right.location,right.resultdef,right.resultdef,true);

      { left must be a register }
      case right.location.loc of
        LOC_REGISTER,
        LOC_CREGISTER :
          begin
            alloc_flags_if_requested;
            current_asmdata.CurrAsmList.concat(
              taicpu.op_reg_reg(op,TCGSize2Opsize[opsize],right.location.register,left.location.register));
          end;

        LOC_REFERENCE,
        LOC_CREFERENCE :
          begin
            { simplify the reference before the flags get allocated }
            tcgx86(cg).make_simple_ref(current_asmdata.CurrAsmList,right.location.reference);
            alloc_flags_if_requested;
            current_asmdata.CurrAsmList.concat(
              taicpu.op_ref_reg(op,TCGSize2Opsize[opsize],right.location.reference,left.location.register));
          end;

        LOC_CONSTANT :
          begin
{$ifdef x86_64}
            { x86_64 only supports signed 32 bits constants directly }
            if (opsize in [OS_S64,OS_64]) and
               ((right.location.value<low(longint)) or (right.location.value>high(longint))) then
              begin
                constreg:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
                cg.a_load_const_reg(current_asmdata.CurrAsmList,opsize,right.location.value,constreg);
                alloc_flags_if_requested;
                current_asmdata.CurrAsmList.concat(
                  taicpu.op_reg_reg(op,TCGSize2Opsize[opsize],constreg,left.location.register));
              end
            else
{$endif x86_64}
              begin
                alloc_flags_if_requested;
                current_asmdata.CurrAsmList.concat(
                  taicpu.op_const_reg(op,TCGSize2Opsize[opsize],right.location.value,left.location.register));
              end;
          end;
        else
          internalerror(200203232);
      end;
    end;
  { Returns the integer condition flags that represent this node's
    comparison. 'unsigned' selects the below/above family instead of the
    less/greater one; with nf_swapped set the mirrored relation is used. }
  function tx86addnode.getresflags(unsigned : boolean) : tresflags;
    var
      relation : tnodetype;
    begin
      { (in)equality depends neither on signedness nor on operand order }
      if nodetype=equaln then
        result:=F_E
      else if nodetype=unequaln then
        result:=F_NE
      else
        begin
          { with exchanged operands "a < b" has to be tested as "b > a" etc. }
          relation:=nodetype;
          if nf_swapped in flags then
            case relation of
              ltn : relation:=gtn;
              lten : relation:=gten;
              gtn : relation:=ltn;
              gten : relation:=lten;
              else
                ;
            end;

          if unsigned then
            case relation of
              ltn : result:=F_B;
              lten : result:=F_BE;
              gtn : result:=F_A;
              gten : result:=F_AE;
              else
                if nf_swapped in flags then
                  internalerror(2013120107)
                else
                  internalerror(2013120108);
            end
          else
            case relation of
              ltn : result:=F_L;
              lten : result:=F_LE;
              gtn : result:=F_G;
              gten : result:=F_GE;
              else
                if nf_swapped in flags then
                  internalerror(2013120105)
                else
                  internalerror(2013120106);
            end;
        end;
    end;
  { Returns the F_F* condition flags that represent this node's comparison;
    with nf_swapped set the mirrored relation is returned. }
  function tx86addnode.getfpuresflags : tresflags;
    var
      swapped : boolean;
    begin
      swapped:=nf_swapped in flags;
      case nodetype of
        equaln :
          result:=F_FE;
        unequaln :
          result:=F_FNE;
        ltn :
          begin
            if swapped then
              result:=F_FA
            else
              result:=F_FB;
          end;
        lten :
          begin
            if swapped then
              result:=F_FAE
            else
              result:=F_FBE;
          end;
        gtn :
          begin
            if swapped then
              result:=F_FB
            else
              result:=F_FA;
          end;
        gten :
          begin
            if swapped then
              result:=F_FBE
            else
              result:=F_FAE;
          end;
        else
          begin
            if swapped then
              internalerror(2014031402)
            else
              internalerror(2014031403);
          end;
      end;
    end;
  459. {*****************************************************************************
  460. AddSmallSet
  461. *****************************************************************************}
  462. {$ifndef i8086}
  { Generates code for the set operations addn, symdifn, muln, subn, xorn, orn
    and andn on sets that are handled as plain integers. The result is left
    in 'location'. Any other node type raises an internal error. }
  463. procedure tx86addnode.second_addsmallset;
  464. var
  465. setbase : aint;
  466. opdef : tdef;
  467. opsize : TCGSize;
  468. op : TAsmOp;
  469. extra_not,
  470. noswap : boolean;
  471. all_member_optimization:boolean;
  472. begin
  473. pass_left_right;
  474. noswap:=false;
  475. extra_not:=false;
  476. all_member_optimization:=false;
  477. opdef:=resultdef;
  478. opsize:=int_cgsize(opdef.size);
  { take the set base from whichever operand is the set (the other one may be a set element) }
  479. if (left.resultdef.typ=setdef) then
  480. setbase:=tsetdef(left.resultdef).setbase
  481. else
  482. setbase:=tsetdef(right.resultdef).setbase;
  483. case nodetype of
  484. addn :
  485. begin
  486. { adding elements is not commutative }
  487. if (nf_swapped in flags) and (left.nodetype=setelementn) then
  488. swapleftright;
  489. { are we adding set elements ? }
  490. if right.nodetype=setelementn then
  491. begin
  492. { no range support for smallsets! }
  493. if assigned(tsetelementnode(right).right) then
  494. internalerror(43244);
  495. { btsb isn't supported }
  496. if opsize=OS_8 then
  497. begin
  498. opsize:=OS_32;
  499. opdef:=u32inttype;
  500. end;
  501. { bts requires both elements to be registers }
  502. hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,opdef,false);
  503. hlcg.location_force_reg(current_asmdata.CurrAsmList,right.location,right.resultdef,opdef,true);
  504. register_maybe_adjust_setbase(current_asmdata.CurrAsmList,opdef,right.location,setbase);
  505. op:=A_BTS;
  506. noswap:=true;
  507. end
  508. else
  509. op:=A_OR;
  510. end;
  511. symdifn :
  512. op:=A_XOR;
  513. muln :
  514. op:=A_AND;
  515. subn :
  516. begin
  { set difference is implemented as AND with the complement of the subtrahend }
  517. op:=A_AND;
  { minuend is the constant -1 (all members): the result is just the complement of the other operand }
  518. if (not(nf_swapped in flags) and (left.location.loc=LOC_CONSTANT) and (left.location.value=-1)) or
  519. ((nf_swapped in flags) and (right.location.loc=LOC_CONSTANT) and (right.location.value=-1)) then
  520. all_member_optimization:=true;
  { a constant subtrahend is complemented right here; otherwise a NOT has to be emitted (extra_not) }
  521. if (not(nf_swapped in flags)) and
  522. (right.location.loc=LOC_CONSTANT) then
  523. right.location.value := not(right.location.value)
  524. else if (nf_swapped in flags) and
  525. (left.location.loc=LOC_CONSTANT) then
  526. left.location.value := not(left.location.value)
  527. else
  528. extra_not:=true;
  529. end;
  530. xorn :
  531. op:=A_XOR;
  532. orn :
  533. op:=A_OR;
  534. andn :
  535. op:=A_AND;
  536. else
  537. internalerror(2003042215);
  538. end;
  539. if all_member_optimization then
  540. begin
  541. {A set expression [0..31]-x can be implemented with a simple NOT.}
  542. if nf_swapped in flags then
  543. begin
  544. { newly swapped also set swapped flag }
  545. location_swap(left.location,right.location);
  546. toggleflag(nf_swapped);
  547. end;
  548. hlcg.location_force_reg(current_asmdata.currAsmList,right.location,right.resultdef,opdef,false);
  549. emit_reg(A_NOT,TCGSize2Opsize[opsize],right.location.register);
  550. location:=right.location;
  551. end
  552. else
  553. begin
  554. { can we use the BMI1 instruction andn? }
  555. if (op=A_AND) and extra_not and (CPUX86_HAS_BMI1 in cpu_capabilities[current_settings.cputype]) and
  556. (resultdef.size in [4{$ifdef x86_64},8{$endif x86_64}]) then
  557. begin
  { ANDN writes to a separate destination, so a fresh result register is allocated }
  558. location_reset(location,LOC_REGISTER,left.location.size);
  559. location.register:=cg.getintregister(current_asmdata.currAsmList,left.location.size);
  560. if nf_swapped in flags then
  561. begin
  562. location_swap(left.location,right.location);
  563. toggleflag(nf_swapped);
  564. end;
  { right is forced into a register; left may stay a memory reference }
  565. hlcg.location_force_reg(current_asmdata.currAsmList,right.location,right.resultdef,opdef,true);
  566. if not(left.location.loc in [LOC_CREGISTER,LOC_REGISTER,LOC_CREFERENCE,LOC_REFERENCE]) then
  567. hlcg.location_force_reg(current_asmdata.currAsmList,left.location,left.resultdef,opdef,true);
  568. case left.location.loc of
  569. LOC_CREGISTER,LOC_REGISTER:
  570. emit_reg_reg_reg(A_ANDN,TCGSize2Opsize[opsize],left.location.register,right.location.register,location.register);
  571. LOC_CREFERENCE,LOC_REFERENCE:
  572. emit_ref_reg_reg(A_ANDN,TCGSize2Opsize[opsize],left.location.reference,right.location.register,location.register);
  573. else
  574. Internalerror(2018040201);
  575. end;
  576. end
  577. else
  578. begin
  579. { left must be a register }
  580. left_must_be_reg(opdef,opsize,noswap);
  581. emit_generic_code(op,opsize,true,extra_not,false);
  582. location_freetemp(current_asmdata.CurrAsmList,right.location);
  583. { left is always a register and contains the result }
  584. location:=left.location;
  585. end;
  586. end;
  587. { fix the changed opsize we did above because of the missing btsb }
  588. if opsize<>int_cgsize(resultdef.size) then
  589. hlcg.location_force_reg(current_asmdata.CurrAsmList,location,opdef,cgsize_orddef(int_cgsize(resultdef.size)),false);
  590. end;
  { Adds a single element to a small set. One-byte sets are delegated to the
    inherited implementation. Otherwise a constant element is OR-ed in as a
    precomputed mask and a variable element is set with BTS. }
  procedure tx86addnode.second_addsmallsetelement;
    var
      base, bitmask : aint;
    begin
      if resultdef.size=1 then
        begin
          inherited second_addsmallsetelement;
          exit;
        end;

      if nodetype<>addn then
        internalerror(2022090502);
      { no range support for smallsets }
      if assigned(tsetelementnode(right).right) then
        internalerror(2022090501);

      pass_left_right;

      { setelementn is a special case, it must be on right }
      if (left.nodetype=setelementn) and
         (nf_swapped in flags) then
        swapleftright;

      force_reg_left_right(false,false);
      set_result_location_reg;
      base:=tsetdef(left.resultdef).setbase;

      if right.location.loc<>LOC_CONSTANT then
        begin
          { variable element: copy the set into the result register and set
            the selected bit there }
          hlcg.location_force_reg(current_asmdata.CurrAsmList,right.location,right.resultdef,resultdef,true);
          register_maybe_adjust_setbase(current_asmdata.CurrAsmList,resultdef,right.location,base);
          if left.location.loc=LOC_CONSTANT then
            hlcg.a_load_const_reg(current_asmdata.CurrAsmList,resultdef,
              left.location.value,location.register)
          else
            hlcg.a_load_reg_reg(current_asmdata.CurrAsmList,left.resultdef,resultdef,
              left.location.register,location.register);
          emit_reg_reg(A_BTS,TCGSize2Opsize[def_cgsize(resultdef)],right.location.register,location.register);
        end
      else
        begin
          { constant element: the bit mask is known at compile time }
          bitmask:=aint(1 shl (right.location.value-base));
          hlcg.a_op_const_reg_reg(current_asmdata.CurrAsmList,OP_OR,resultdef,
            bitmask,left.location.register,location.register);
        end;
    end;
  632. {$endif not i8086}
  { Compares two small sets and leaves the result in the flags.

    equaln/unequaln are a plain CMP. The subset tests lten/gten AND the two
    operands first and then compare the result with the remaining operand
    for equality. Other node types raise an internal error. }
  procedure tx86addnode.second_cmpsmallset;
    var
      opdef : tdef;
      opsize : TCGSize;
    begin
      pass_left_right;
      opdef:=left.resultdef;
      opsize:=int_cgsize(opdef.size);

      if nodetype in [lten,gten] then
        begin
          { operands are exchanged for an unswapped lten and for a swapped gten }
          if (nf_swapped in flags)=(nodetype=gten) then
            swapleftright;
          hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,opdef,false);
          emit_op_right_left(A_AND,opsize,False);
          { warning: ugly hack, we need a JE so change the node to equaln }
          nodetype:=equaln;
        end
      else if not(nodetype in [equaln,unequaln]) then
        internalerror(2003042204);

      { left must be a register }
      left_must_be_reg(opdef,opsize,false);
      emit_generic_code(A_CMP,opsize,true,false,false);

      location_freetemp(current_asmdata.CurrAsmList,right.location);
      location_freetemp(current_asmdata.CurrAsmList,left.location);

      location_reset(location,LOC_FLAGS,OS_NO);
      location.resflags:=getresflags(true);
    end;
  668. {*****************************************************************************
  669. AddMMX
  670. *****************************************************************************}
  671. {$ifdef SUPPORT_MMX}
  { Generates code for an MMX operation (addn, muln, subn, xorn, orn, andn).

    The opcode is selected from the MMX element type of the left operand
    and, for add/sub, from the cs_mmx_saturation local switch. A node type
    outside this list raises internal error 2003042214; an element type for
    which no opcode exists raises 201408201. The result is left in
    location.register as LOC_MMXREGISTER. }
  procedure tx86addnode.second_opmmx;
    var
      op : TAsmOp;
      cmpop : boolean;
      mmxbase : tmmxtype;
      hreg,
      hregister : tregister;
    begin
      pass_left_right;

      cmpop:=false;
      { A_NOP marks "no opcode selected yet"; checked after the case below }
      op:=A_NOP;
      mmxbase:=mmx_type(left.resultdef);
      location_reset(location,LOC_MMXREGISTER,def_cgsize(resultdef));
      case nodetype of
        addn :
          begin
            if (cs_mmx_saturation in current_settings.localswitches) then
              begin
                case mmxbase of
                  mmxs8bit:
                    op:=A_PADDSB;
                  mmxu8bit:
                    op:=A_PADDUSB;
                  mmxs16bit,mmxfixed16:
                    op:=A_PADDSW;
                  mmxu16bit:
                    op:=A_PADDUSW;
                  else
                    ;
                end;
              end
            else
              begin
                case mmxbase of
                  mmxs8bit,mmxu8bit:
                    op:=A_PADDB;
                  mmxs16bit,mmxu16bit,mmxfixed16:
                    op:=A_PADDW;
                  mmxs32bit,mmxu32bit:
                    op:=A_PADDD;
                  else
                    ;
                end;
              end;
          end;
        muln :
          begin
            case mmxbase of
              mmxs16bit,mmxu16bit:
                op:=A_PMULLW;
              mmxfixed16:
                op:=A_PMULHW;
              else
                ;
            end;
          end;
        subn :
          begin
            if (cs_mmx_saturation in current_settings.localswitches) then
              begin
                case mmxbase of
                  mmxs8bit:
                    op:=A_PSUBSB;
                  mmxu8bit:
                    op:=A_PSUBUSB;
                  mmxs16bit,mmxfixed16:
                    { word-sized signed saturation, the counterpart of
                      A_PADDSW above (the byte variant PSUBSB would be wrong
                      for 16 bit elements) }
                    op:=A_PSUBSW;
                  mmxu16bit:
                    op:=A_PSUBUSW;
                  else
                    ;
                end;
              end
            else
              begin
                case mmxbase of
                  mmxs8bit,mmxu8bit:
                    op:=A_PSUBB;
                  mmxs16bit,mmxu16bit,mmxfixed16:
                    op:=A_PSUBW;
                  mmxs32bit,mmxu32bit:
                    op:=A_PSUBD;
                  else
                    ;
                end;
              end;
          end;
        xorn:
          op:=A_PXOR;
        orn:
          op:=A_POR;
        andn:
          op:=A_PAND;
        else
          internalerror(2003042214);
      end;

      if op = A_NOP then
        internalerror(201408201);

      { left and right no register? }
      { then one must be demanded }
      if (left.location.loc<>LOC_MMXREGISTER) then
        begin
          if (right.location.loc=LOC_MMXREGISTER) then
            begin
              location_swap(left.location,right.location);
              toggleflag(nf_swapped);
            end
          else
            begin
              { register variable ? }
              if (left.location.loc=LOC_CMMXREGISTER) then
                begin
                  hregister:=tcgx86(cg).getmmxregister(current_asmdata.CurrAsmList);
                  emit_reg_reg(A_MOVQ,S_NO,left.location.register,hregister);
                end
              else
                begin
                  if not(left.location.loc in [LOC_REFERENCE,LOC_CREFERENCE]) then
                    internalerror(200203245);

                  hregister:=tcgx86(cg).getmmxregister(current_asmdata.CurrAsmList);
                  tcgx86(cg).make_simple_ref(current_asmdata.CurrAsmList,left.location.reference);
                  emit_ref_reg(A_MOVQ,S_NO,left.location.reference,hregister);
                end;

              location_reset(left.location,LOC_MMXREGISTER,OS_NO);
              left.location.register:=hregister;
            end;
        end;

      { at this point, left.location.loc should be LOC_MMXREGISTER }
      if right.location.loc<>LOC_MMXREGISTER then
        begin
          if (nodetype=subn) and (nf_swapped in flags) then
            begin
              { swapped subtraction: the real minuend is 'right', so it is
                copied into a fresh register and left is subtracted from it }
              hreg:=tcgx86(cg).getmmxregister(current_asmdata.CurrAsmList);
              if right.location.loc=LOC_CMMXREGISTER then
                begin
                  emit_reg_reg(A_MOVQ,S_NO,right.location.register,hreg);
                  emit_reg_reg(op,S_NO,left.location.register,hreg);
                end
              else
                begin
                  { it is the right operand that gets loaded from memory
                    here; left is already an MMX register at this point, so
                    checking left would always raise the internal error }
                  if not(right.location.loc in [LOC_REFERENCE,LOC_CREFERENCE]) then
                    internalerror(2002032412);
                  tcgx86(cg).make_simple_ref(current_asmdata.CurrAsmList,right.location.reference);
                  emit_ref_reg(A_MOVQ,S_NO,right.location.reference,hreg);
                  emit_reg_reg(op,S_NO,left.location.register,hreg);
                end;
              location.register:=hreg;
            end
          else
            begin
              if (right.location.loc=LOC_CMMXREGISTER) then
                emit_reg_reg(op,S_NO,right.location.register,left.location.register)
              else
                begin
                  if not(right.location.loc in [LOC_REFERENCE,LOC_CREFERENCE]) then
                    internalerror(200203246);
                  tcgx86(cg).make_simple_ref(current_asmdata.CurrAsmList,right.location.reference);
                  emit_ref_reg(op,S_NO,right.location.reference,left.location.register);
                end;
              location.register:=left.location.register;
            end;
        end
      else
        begin
          { right.location=LOC_MMXREGISTER }
          if (nodetype=subn) and (nf_swapped in flags) then
            begin
              emit_reg_reg(op,S_NO,left.location.register,right.location.register);
              location_swap(left.location,right.location);
              toggleflag(nf_swapped);
            end
          else
            begin
              emit_reg_reg(op,S_NO,right.location.register,left.location.register);
            end;
          location.register:=left.location.register;
        end;

      location_freetemp(current_asmdata.CurrAsmList,right.location);
      { cmpop is never set in this routine, so left is currently never freed here }
      if cmpop then
        location_freetemp(current_asmdata.CurrAsmList,left.location);
    end;
  853. {$endif SUPPORT_MMX}
  854. {*****************************************************************************
  855. AddFloat
  856. *****************************************************************************}
  procedure tx86addnode.second_addfloatsse;
    const
      { locations denoting an operand that lives on the x87 register stack }
      x87_locs = [LOC_FPUREGISTER,LOC_CFPUREGISTER];
    var
      cgop : topcg;
      horiz_op : tasmop;
      fuse_squares,
      both_on_x87 : boolean;
      inner : tnode;
    begin
      { sqr(a)+sqr(b) resp. sqr(a)-sqr(b) on scalar SSE operands is evaluated
        with one packed multiply followed by a horizontal add/sub; this is
        only attempted when the selected fpu type is at least fpu_sse3 }
      fuse_squares:=
        (current_settings.fputype>=fpu_sse3) and
        use_vectorfpu(resultdef) and
        (nodetype in [addn,subn]) and
        (left.nodetype=inlinen) and (tinlinenode(left).inlinenumber=in_sqr_real) and
        (right.nodetype=inlinen) and (tinlinenode(right).inlinenumber=in_sqr_real);
      if fuse_squares then
        begin
          { drop both sqr() nodes and keep only their arguments }
          inner:=tinlinenode(left).left;
          tinlinenode(left).left:=nil;
          left.free;
          left:=inner;

          inner:=tinlinenode(right).left;
          tinlinenode(right).left:=nil;
          right.free;
          right:=inner;
        end;

      pass_left_right;

      both_on_x87:=(left.location.loc in x87_locs) and
        (right.location.loc in x87_locs);

      { fpu operands are always in reversed order on the stack }
      if both_on_x87 then
        toggleflag(nf_swapped);

      if nf_swapped in flags then
        begin
          { swapleftright is useless if both operands sit on the fpu stack:
            both are "R_ST", so nothing would change -> exchange them manually }
          if both_on_x87 then
            emit_none(A_FXCH,S_NO)
          else
            swapleftright;
        end;

      case nodetype of
        addn :
          cgop:=OP_ADD;
        muln :
          cgop:=OP_MUL;
        subn :
          cgop:=OP_SUB;
        slashn :
          cgop:=OP_DIV;
        else
          internalerror(200312231);
      end;

      location_reset(location,LOC_MMREGISTER,def_cgsize(resultdef));

      if fuse_squares then
        begin
          if nf_swapped in flags then
            swapleftright;

          hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,false);
          hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,right.location,right.resultdef,true);
          { the result is built in place in the register of the left operand }
          location:=left.location;

          if is_double(resultdef) then
            begin
              { pack left (low quadword) and right (high quadword) into one register }
              current_asmdata.CurrAsmList.concat(taicpu.op_const_reg_reg(A_SHUFPD,S_NO,%00,right.location.register,location.register));
              { square both elements at once }
              current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_MULPD,S_NO,location.register,location.register));
              case nodetype of
                addn:
                  horiz_op:=A_HADDPD;
                subn:
                  horiz_op:=A_HSUBPD;
                else
                  internalerror(201108162);
              end;
              current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(horiz_op,S_NO,location.register,location.register));
            end
          else
            begin
              { interleave the low singles of left and right }
              current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_UNPCKLPS,S_NO,right.location.register,location.register));
              { copy the low quadword to the high one so that bits 64..127
                contain valid values as well }
              current_asmdata.CurrAsmList.concat(taicpu.op_const_reg_reg(A_SHUFPD,S_NO,%00,location.register,location.register));
              { square all elements at once }
              current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_MULPS,S_NO,location.register,location.register));
              case nodetype of
                addn:
                  horiz_op:=A_HADDPS;
                subn:
                  horiz_op:=A_HSUBPS;
                else
                  internalerror(201108163);
              end;
              current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(horiz_op,S_NO,location.register,location.register));
            end;
        end
      { right may only take the role of the left operand if the operation is commutative }
      else if (right.location.loc=LOC_MMREGISTER) and (cgop in [OP_ADD,OP_MUL]) then
        begin
          location.register:=cg.getmmregister(current_asmdata.CurrAsmList,location.size);
          cg.a_loadmm_reg_reg(current_asmdata.CurrAsmList,right.location.size,location.size,right.location.register,location.register,mms_movescalar);

          { an x87 operand is spilled to memory instead of being forced into
            an mm register: there is no direct fpu->mm register copy
            instruction, so going through memory probably gives shorter code }
          if left.location.loc in x87_locs then
            hlcg.location_force_mem(current_asmdata.CurrAsmList,left.location,left.resultdef);

          cg.a_opmm_loc_reg(current_asmdata.CurrAsmList,cgop,location.size,left.location,location.register,mms_movescalar);

          if left.location.loc=LOC_REFERENCE then
            tg.ungetiftemp(current_asmdata.CurrAsmList,left.location.reference);
        end
      else
        begin
          if nf_swapped in flags then
            swapleftright;

          { x87 operands go through memory, see the remark in the branch above:
            there is no direct fpu->mm register copy instruction }
          if left.location.loc in x87_locs then
            hlcg.location_force_mem(current_asmdata.CurrAsmList,left.location,left.resultdef);

          { result := left }
          location.register:=cg.getmmregister(current_asmdata.CurrAsmList,location.size);
          cg.a_loadmm_loc_reg(current_asmdata.CurrAsmList,location.size,left.location,location.register,mms_movescalar);
          if left.location.loc=LOC_REFERENCE then
            tg.ungetiftemp(current_asmdata.CurrAsmList,left.location.reference);

          { same treatment for an x87 right operand }
          if right.location.loc in x87_locs then
            hlcg.location_force_mem(current_asmdata.CurrAsmList,right.location,right.resultdef);

          { result := result <op> right }
          cg.a_opmm_loc_reg(current_asmdata.CurrAsmList,cgop,location.size,right.location,location.register,mms_movescalar);
          if right.location.loc=LOC_REFERENCE then
            tg.ungetiftemp(current_asmdata.CurrAsmList,right.location.reference);
        end;
    end;
  procedure tx86addnode.second_addfloatavx;
    const
      { locations denoting an operand that lives on the x87 register stack }
      x87_locs = [LOC_FPUREGISTER,LOC_CFPUREGISTER];
    var
      cgop : topcg;
      horiz_op : tasmop;
      fuse_squares,
      both_on_x87 : boolean;
{$ifdef dummy}
      inner : tnode;
{$endif dummy}
    begin
      { the sqr(a)+/-sqr(b) fusion is only compiled in when "dummy" is
        defined; otherwise fuse_squares stays false }
      fuse_squares:=false;
{$ifdef dummy}
      if (current_settings.fputype>=fpu_sse3) and
         use_vectorfpu(resultdef) and
         (nodetype in [addn,subn]) and
         (left.nodetype=inlinen) and (tinlinenode(left).inlinenumber=in_sqr_real) and
         (right.nodetype=inlinen) and (tinlinenode(right).inlinenumber=in_sqr_real) then
        begin
          fuse_squares:=true;

          { drop both sqr() nodes and keep only their arguments }
          inner:=tinlinenode(left).left;
          tinlinenode(left).left:=nil;
          left.free;
          left:=inner;

          inner:=tinlinenode(right).left;
          tinlinenode(right).left:=nil;
          right.free;
          right:=inner;
        end;
{$endif dummy}

      pass_left_right;

      both_on_x87:=(left.location.loc in x87_locs) and
        (right.location.loc in x87_locs);

      { fpu operands are always in reversed order on the stack }
      if both_on_x87 then
        toggleflag(nf_swapped);

      if nf_swapped in flags then
        begin
          { swapleftright is useless if both operands sit on the fpu stack:
            both are "R_ST", so nothing would change -> exchange them manually }
          if both_on_x87 then
            emit_none(A_FXCH,S_NO)
          else
            swapleftright;
        end;

      case nodetype of
        addn :
          cgop:=OP_ADD;
        muln :
          cgop:=OP_MUL;
        subn :
          cgop:=OP_SUB;
        slashn :
          cgop:=OP_DIV;
        else
          internalerror(2003122303);
      end;

      location_reset(location,LOC_MMREGISTER,def_cgsize(resultdef));

      if fuse_squares then
        begin
          if nf_swapped in flags then
            swapleftright;

          hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,false);
          hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,right.location,right.resultdef,true);
          { the result is built in place in the register of the left operand }
          location:=left.location;

          if is_double(resultdef) then
            begin
              { pack left (low quadword) and right (high quadword) into one register }
              current_asmdata.CurrAsmList.concat(taicpu.op_const_reg_reg(A_SHUFPD,S_NO,%00,right.location.register,location.register));
              { square both elements at once }
              current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_MULPD,S_NO,location.register,location.register));
              case nodetype of
                addn:
                  horiz_op:=A_HADDPD;
                subn:
                  horiz_op:=A_HSUBPD;
                else
                  internalerror(2011081601);
              end;
              current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(horiz_op,S_NO,location.register,location.register));
            end
          else
            begin
              { interleave the low singles of left and right }
              current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_UNPCKLPS,S_NO,right.location.register,location.register));
              { copy the low quadword to the high one so that bits 64..127
                contain valid values as well }
              current_asmdata.CurrAsmList.concat(taicpu.op_const_reg_reg(A_SHUFPD,S_NO,%00,location.register,location.register));
              { square all elements at once }
              current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_MULPS,S_NO,location.register,location.register));
              case nodetype of
                addn:
                  horiz_op:=A_HADDPS;
                subn:
                  horiz_op:=A_HSUBPS;
                else
                  internalerror(2011081604);
              end;
              current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(horiz_op,S_NO,location.register,location.register));
            end;
        end
      { left*2 is emitted as left+left }
      else if (nodetype=muln) and is_constrealnode(right) and
              is_number_float(trealconstnode(right).value_real) and
              (trealconstnode(right).value_real=2) then
        begin
          location.register:=cg.getmmregister(current_asmdata.CurrAsmList,left.location.size);
          hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,true);
          cg.a_opmm_reg_reg_reg(current_asmdata.CurrAsmList,OP_ADD,location.size,
            left.location.register,
            left.location.register,
            location.register,
            mms_movescalar);
        end
      { 2*right is emitted as right+right }
      else if (nodetype=muln) and is_constrealnode(left) and
              is_number_float(trealconstnode(left).value_real) and
              (trealconstnode(left).value_real=2) then
        begin
          location.register:=cg.getmmregister(current_asmdata.CurrAsmList,right.location.size);
          hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,right.location,right.resultdef,true);
          cg.a_opmm_reg_reg_reg(current_asmdata.CurrAsmList,OP_ADD,location.size,
            right.location.register,
            right.location.register,
            location.register,
            mms_movescalar);
        end
      { right may only take the role of the left operand if the operation is commutative }
      else if (right.location.loc in [LOC_MMREGISTER,LOC_CMMREGISTER]) and
              (cgop in [OP_ADD,OP_MUL]) then
        begin
          location.register:=cg.getmmregister(current_asmdata.CurrAsmList,left.location.size);

          { an x87 operand is spilled to memory instead of being forced into
            an mm register: there is no direct fpu->mm register copy
            instruction, so going through memory probably gives shorter code }
          if left.location.loc in x87_locs then
            hlcg.location_force_mem(current_asmdata.CurrAsmList,left.location,left.resultdef);

          cg.a_opmm_loc_reg_reg(current_asmdata.CurrAsmList,cgop,location.size,
            left.location,
            right.location.register,
            location.register,
            mms_movescalar);
        end
      else
        begin
          if nf_swapped in flags then
            swapleftright;

          hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,true);
          location.register:=cg.getmmregister(current_asmdata.CurrAsmList,left.location.size);

          { an x87 right operand goes through memory as there is no direct
            fpu->mm register copy instruction }
          if right.location.loc in x87_locs then
            hlcg.location_force_mem(current_asmdata.CurrAsmList,right.location,right.resultdef);

          cg.a_opmm_loc_reg_reg(current_asmdata.CurrAsmList,cgop,location.size,
            right.location,
            left.location.register,
            location.register,
            mms_movescalar);
        end;
    end;
  function tx86addnode.pass_1: tnode;
    begin
      { x86 has no allocatable fpu registers. If a floating point operation is
        going to be carried out on the x87 (neither operand nor the result
        uses the vector fpu), keep the operands out of registers, which would
        be mm registers }
      if ((left.resultdef.typ=floatdef) or (right.resultdef.typ=floatdef)) and
         not(use_vectorfpu(left.resultdef) or
             use_vectorfpu(right.resultdef) or
             use_vectorfpu(resultdef)) then
        begin
          make_not_regable(left,[ra_addr_regable]);
          make_not_regable(right,[ra_addr_regable]);
        end;

      Result:=inherited pass_1;

      { fix up expectloc; this is harmless if Result was set, because another
        pass_1 is run on Result and corrects that one }
      if use_vectorfpu(resultdef) then
        expectloc:=LOC_MMREGISTER;
    end;
  1158. function tx86addnode.simplify(forinline : boolean) : tnode;
  { Rewrites "(x mod d) = 0" and "(x mod d) <> 0" (with the constant 0 on
    either side), where d is an ordinal constant >= 3 that is not a power of
    two, into a multiplication by a precalculated inverse followed by a
    single unsigned comparison. The rewrite is only attempted when
    cs_opt_level1 is enabled and overflow checking is off; in every other
    case the result of the inherited simplify is returned. }
  1159. var
  { t: the mod node, m: the constant zero node it is compared against }
  1160. t, m, ThisNode, ConstNode: TNode;
  1161. lt,rt, ThisType: TNodeType;
  1162. ThisDef: TDef;
  1163. DoOptimisation: Boolean;
  1164. reciprocal, comparison, divisor: AWord;
  1165. shift, N: Byte;
  1166. begin
  1167. { Load into local variables to reduce the number of pointer dereferences }
  1168. rt:=right.nodetype;
  1169. lt:=left.nodetype;
  1170. DoOptimisation:=False;
  1171. {$if defined(cpu64bitalu) or defined(cpu32bitalu) or defined(cpu16bitalu)}
  1172. if (cs_opt_level1 in current_settings.optimizerswitches) and
  1173. { The presence of overflow checks tends to cause internal errors with the multiplication nodes }
  1174. not (cs_check_overflow in current_settings.localswitches) and
  1175. (nodetype in [equaln,unequaln]) then
  1176. begin
  { Accept the mod node on either side of the comparison }
  1177. if (lt=modn) and (rt=ordconstn) and (TOrdConstNode(right).value.uvalue=0) then
  1178. begin
  1179. t:=left;
  1180. m:=right;
  1181. end
  1182. else if (rt=modn) and (lt=ordconstn) and (TOrdConstNode(left).value.uvalue=0) then
  1183. begin
  1184. t:=right;
  1185. m:=left;
  1186. end
  1187. else
  1188. begin
  1189. t:=nil;
  1190. m:=nil;
  1191. end;
  1192. if Assigned(t) and (TModDivNode(t).right.nodetype=ordconstn) and
  1193. {$ifndef cpu64bitalu}
  1194. { Converting Int64 and QWord division doesn't work under i386 }
  1195. {$ifndef cpu32bitalu}
  1196. (TModDivNode(t).resultdef.size < 4) and
  1197. {$else cpu32bitalu}
  1198. (TModDivNode(t).resultdef.size < 8) and
  1199. {$endif cpu32bitalu}
  1200. {$endif cpu64bitalu}
  1201. (TOrdConstNode(TModDivNode(t).right).value>=3) then
  1202. begin
  1203. divisor:=TOrdConstNode(TModDivNode(t).right).value.uvalue;
  1204. { Exclude powers of 2, as there are more efficient ways to handle those }
  1205. if PopCnt(divisor)>1 then
  1206. begin
  1207. if is_signed(TModDivNode(t).left.resultdef) then
  1208. begin
  1209. { See pages 250-251 of Hacker's Delight, Second Edition
  1210. for an explanation and proof of the algorithm, but
  1211. essentially, we're doing the following:
  1212. - Convert the divisor d to the form k.2^b if it isn't
  1213. already odd (in which case, k = d and b = 0)
  1214. - Calculate r, the multiplicative inverse of k modulo 2^N
  1215. - Calculate c = floor(2^(N-1) / k) & -(2^b)
  1216. - Let q = ((n * r) + c) ror b (mod 2^N)
  1217. - Repurpose c to equal floor(2c / 2^b) = c shr (b - 1)
  1218. (some RISC platforms will benefit from doing this over
  1219. precalculating the modified constant. For x86,
  1220. it's better with the constant precalculated for
  1221. 32-bit and under, but for 64-bit, use SHR. )
  1222. - If q is below or equal to c, then (n mod d) = 0
  1223. }
  { Strip type conversions from the numerator as long as the narrower
    signed type can still hold the divisor; the loop is only left through
    Break (with DoOptimisation set) or Exit }
  1224. while True do
  1225. begin
  1226. ThisNode:=TModDivNode(t).left;
  1227. case ThisNode.nodetype of
  1228. typeconvn:
  1229. begin
  1230. ThisDef:=TTypeConvNode(ThisNode).left.resultdef;
  1231. { See if we can simplify things to a smaller ordinal to
  1232. reduce code size and increase speed }
  1233. if is_signed(ThisDef) and
  1234. is_integer(ThisDef) and
  1235. { Byte-sized multiplications can cause problems }
  1236. (ThisDef.size>=2) and
  1237. { Make sure the divisor is in range }
  1238. (divisor>=TOrdDef(ThisDef).low) and
  1239. (divisor<=TOrdDef(ThisDef).high) then
  1240. begin
  1241. TOrdConstNode(TModDivNode(t).right).resultdef:=ThisDef;
  1242. TOrdConstNode(m).resultdef:=ThisDef;
  1243. TModDivNode(t).resultdef:=ThisDef;
  1244. { Destroy the typeconv node }
  1245. TModDivNode(t).left:=TTypeConvNode(ThisNode).left;
  1246. TTypeConvNode(ThisNode).left:=nil;
  1247. ThisNode.Free;
  1248. Continue;
  1249. end;
  1250. end;
  1251. ordconstn:
  1252. begin
  1253. { Just simplify into a constant }
  1254. Result:=inherited simplify(forinline);
  1255. Exit;
  1256. end;
  1257. else
  1258. ;
  1259. end;
  1260. DoOptimisation:=True;
  1261. Break;
  1262. end;
  { DoOptimisation is always True at this point, see the loop above }
  1263. if DoOptimisation then
  1264. begin
  1265. ThisDef:=TModDivNode(t).left.resultdef;
  1266. if nodetype = equaln then
  1267. ThisType:=lten
  1268. else
  1269. ThisType:=gtn;
  1270. N:=ThisDef.size*8;
  1271. calc_mul_inverse(N, TOrdConstNode(TModDivNode(t).right).value.uvalue, reciprocal, shift);
  1272. { Construct the following node tree for odd divisors:
  1273. <lten> (for equaln) or <gtn> (for unequaln)
  1274. <addn>
  1275. <muln>
  1276. <typeconv signed-to-unsigned>
  1277. <numerator node (TModDivNode(t).left)>
  1278. <reciprocal constant>
  1279. <comparison constant (effectively a signed shift)>
  1280. <comparison constant * 2>
  1281. For even divisors, convert them to the form k.2^b, with
  1282. odd k, then construct the following:
  1283. <lten> (for equaln) or <gtn> (for unequaln)
  1284. <ror>
  1285. (b)
  1286. <addn>
  1287. <muln>
  1288. <typeconv signed-to-unsigned>
  1289. <numerator node (TModDivNode(t).left)>
  1290. <reciprocal constant>
  1291. <comparison constant (effectively a signed shift)>
  1292. <comparison constant shr (b - 1)>
  1293. }
  1294. ThisNode:=ctypeconvnode.create_internal(TModDivNode(t).left, ThisDef);
  1295. TTypeConvNode(ThisNode).convtype:=tc_int_2_int;
  1296. ThisDef:=get_unsigned_inttype(ThisDef);
  1297. ThisNode.resultdef:=ThisDef;
  { The numerator now belongs to the new tree; detach it from the mod node }
  1298. TModDivNode(t).left:=nil;
  1299. ConstNode:=cordconstnode.create(reciprocal, ThisDef, False);
  1300. ConstNode.resultdef:=ThisDef;
  1301. ThisNode:=caddnode.create_internal(muln, ThisNode, ConstNode);
  1302. ThisNode.resultdef:=ThisDef;
  1303. {$push}
  1304. {$warnings off}
  { "and (SizeOf(aWord)*8-1)" keeps the shift count below the bit width of aWord }
  1305. if shift>0 then
  1306. comparison:=((aWord(1) shl ((N-1) and (SizeOf(aWord)*8-1))) div (divisor shr shift)) and -(1 shl shift)
  1307. else
  1308. comparison:=(aWord(1) shl ((N-1) and (SizeOf(aWord)*8-1))) div divisor;
  1309. {$pop}
  1310. ConstNode:=cordconstnode.create(comparison, ThisDef, False);
  1311. ConstNode.resultdef:=ThisDef;
  1312. ThisNode:=caddnode.create_internal(addn, ThisNode, ConstNode);
  1313. ThisNode.resultdef:=ThisDef;
  1314. if shift>0 then
  1315. begin
  1316. ConstNode:=cordconstnode.create(shift, u8inttype, False);
  1317. ConstNode.resultdef:=u8inttype;
  1318. ThisNode:=cinlinenode.createintern(in_ror_x_y,false,
  1319. ccallparanode.create(ConstNode,
  1320. ccallparanode.create(ThisNode, nil)));
  1321. ThisNode.resultdef:=ThisDef;
  1322. ConstNode:=cordconstnode.create(comparison shr (shift - 1), ThisDef, False);
  1323. end
  1324. else
  1325. ConstNode:=cordconstnode.create(comparison*2, ThisDef, False);
  1326. ConstNode.resultdef:=ThisDef;
  1327. Result:=CAddNode.create_internal(ThisType, ThisNode, ConstNode);
  1328. Result.resultdef:=resultdef;
  1329. Exit;
  1330. end;
  1331. end
  1332. else
  1333. begin
  1334. { For bit length N, convert "(x mod d) = 0" or "(x mod d) <> 0", where
  1335. d is an odd-numbered integer constant, to "(x * r) <= m", where
  1336. dr = 1 (mod 2^N) and m = floor(2^N / d).
  1337. If d is even, convert to the form k.2^b, where k is odd, then
  1338. convert to "(x * r) ror b <= m", where kr = 1 (mod 2^N) and
  1339. m = floor(2^N / d) = floor(2^(N-b) / k) }
  { Same type conversion stripping as in the signed case, but here the
    inner type has to be unsigned }
  1340. while True do
  1341. begin
  1342. ThisNode:=TModDivNode(t).left;
  1343. case ThisNode.nodetype of
  1344. typeconvn:
  1345. begin
  1346. ThisDef:=TTypeConvNode(ThisNode).left.resultdef;
  1347. { See if we can simplify things to a smaller ordinal to
  1348. reduce code size and increase speed }
  1349. if not is_signed(ThisDef) and
  1350. is_integer(ThisDef) and
  1351. { Byte-sized multiplications can cause problems }
  1352. (ThisDef.size>=2) and
  1353. { Make sure the divisor is in range }
  1354. (divisor>=TOrdDef(ThisDef).low) and
  1355. (divisor<=TOrdDef(ThisDef).high) then
  1356. begin
  1357. TOrdConstNode(TModDivNode(t).right).resultdef:=ThisDef;
  1358. TOrdConstNode(m).resultdef:=ThisDef;
  1359. TModDivNode(t).resultdef:=ThisDef;
  1360. { Destroy the typeconv node }
  1361. TModDivNode(t).left:=TTypeConvNode(ThisNode).left;
  1362. TTypeConvNode(ThisNode).left:=nil;
  1363. ThisNode.Free;
  1364. Continue;
  1365. end;
  1366. end;
  1367. ordconstn:
  1368. begin
  1369. { Just simplify into a constant }
  1370. Result:=inherited simplify(forinline);
  1371. Exit;
  1372. end;
  1373. else
  1374. ;
  1375. end;
  1376. DoOptimisation:=True;
  1377. Break;
  1378. end;
  { DoOptimisation is always True at this point, see the loop above }
  1379. if DoOptimisation then
  1380. begin
  1381. ThisDef:=TModDivNode(t).left.resultdef;
  1382. { Construct the following node tree for odd divisors:
  1383. <lten> (for equaln) or <gtn> (for unequaln)
  1384. <muln>
  1385. <numerator node (TModDivNode(t).left)>
  1386. <reciprocal constant>
  1387. (2^N / divisor)
  1388. For even divisors, convert them to the form k.2^b, with
  1389. odd k, then construct the following:
  1390. <lten> (for equaln) or <gtn> (for unequaln)
  1391. <ror>
  1392. (b)
  1393. <muln>
  1394. <numerator node (TModDivNode(t).left)>
  1395. <reciprocal constant>
  1396. (2^N / divisor)
  1397. }
  1398. if nodetype=equaln then
  1399. ThisType:=lten
  1400. else
  1401. ThisType:=gtn;
  1402. N:=ThisDef.size*8;
  1403. calc_mul_inverse(N, TOrdConstNode(TModDivNode(t).right).value.uvalue, reciprocal, shift);
  1404. ConstNode:=cordconstnode.create(reciprocal, ThisDef, False);
  1405. ConstNode.resultdef:=ThisDef;
  1406. ThisNode:=caddnode.create_internal(muln, TModDivNode(t).left, ConstNode);
  1407. ThisNode.resultdef:=ThisDef;
  { The numerator now belongs to the new tree; detach it from the mod node }
  1408. TModDivNode(t).left:=nil;
  1409. if shift>0 then
  1410. begin
  1411. ConstNode:=cordconstnode.create(shift, u8inttype, False);
  1412. ConstNode.resultdef:=u8inttype;
  1413. ThisNode:=cinlinenode.createintern(in_ror_x_y,false,
  1414. ccallparanode.create(ConstNode,
  1415. ccallparanode.create(ThisNode, nil)));
  1416. ThisNode.resultdef:=ThisDef;
  1417. comparison:=(aWord(1) shl ((N-shift) and (SizeOf(aWord)*8-1))) div (divisor shr shift);
  1418. end
  1419. else
  1420. begin
  1421. {$push}
  1422. {$warnings off}
  1423. { Because 2^N and divisor are relatively prime,
  1424. floor(2^N / divisor) = floor((2^N - 1) / divisor) }
  1425. comparison:=(aWord(not 0) shr (((SizeOf(aWord)*8)-N) and (SizeOf(aWord)*8-1))) div divisor;
  1426. {$pop}
  1427. end;
  1428. ConstNode:=cordconstnode.create(comparison, ThisDef, False);
  1429. ConstNode.resultdef:=ThisDef;
  1430. Result:=CAddNode.create_internal(ThisType, ThisNode, ConstNode);
  1431. Result.resultdef:=resultdef;
  1432. Exit;
  1433. end;
  1434. end;
  1435. end;
  1436. end;
  1437. end;
  1438. {$ifend defined(cpu64bitalu) or defined(cpu32bitalu) or defined(cpu16bitalu)}
  { No rewrite applied: fall back to the generic simplification }
  1439. Result:=inherited simplify(forinline);
  1440. end;
  function tx86addnode.use_fma : boolean;
    begin
{$ifdef i8086}
      Result:=inherited use_fma;
{$else i8086}
      { fma is only worthwhile if the result stays in an xmm register;
        fiddling with fpu registers and fma makes no sense }
      Result:=false;
      if use_vectorfpu(resultdef) then
        Result:=(fpu_capabilities[current_settings.fputype]*[FPUX86_HAS_FMA,FPUX86_HAS_FMA4])<>[];
{$endif i8086}
    end;
  procedure tx86addnode.second_cmpfloatvector;
    const
      x87_locs = [LOC_FPUREGISTER,LOC_CFPUREGISTER];
      mm_locs  = [LOC_MMREGISTER,LOC_CMMREGISTER];
      mem_locs = [LOC_REFERENCE,LOC_CREFERENCE];
    var
      cmpinstr : tasmop;
    begin
      { select the scalar compare instruction from operand type and AVX usage }
      if is_single(left.resultdef) then
        begin
          if UseAVX then
            cmpinstr:=A_VCOMISS
          else
            cmpinstr:=A_COMISS;
        end
      else if is_double(left.resultdef) then
        begin
          if UseAVX then
            cmpinstr:=A_VCOMISD
          else
            cmpinstr:=A_COMISD;
        end
      else
        internalerror(200402222);

      pass_left_right;

      { fpu operands are always in reversed order on the stack }
      if (left.location.loc in x87_locs) and (right.location.loc in x87_locs) then
        toggleflag(nf_swapped);

      location_reset(location,LOC_FLAGS,OS_NO);

      { A direct fpu->mm register move is not possible, so fpu operands are
        forced to memory (not to mm registers: one memory location can be
        used directly by the compare instruction, yielding shorter code) }
      if left.location.loc in x87_locs then
        hlcg.location_force_mem(current_asmdata.CurrAsmList,left.location,left.resultdef);
      if right.location.loc in x87_locs then
        hlcg.location_force_mem(current_asmdata.CurrAsmList,right.location,right.resultdef);

      if right.location.loc in mm_locs then
        begin
          { right already sits in an mm register: compare against it and
            record that the operand order has been reversed }
          if left.location.loc in mem_locs then
            begin
              tcgx86(cg).make_simple_ref(current_asmdata.CurrAsmList,left.location.reference);
              current_asmdata.CurrAsmList.concat(taicpu.op_ref_reg(cmpinstr,S_NO,left.location.reference,right.location.register));
            end
          else if left.location.loc in mm_locs then
            current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(cmpinstr,S_NO,left.location.register,right.location.register))
          else
            internalerror(200402221);
          toggleflag(nf_swapped);
        end
      else
        begin
          { otherwise left is loaded into an mm register and right is used as is }
          hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,true);
          if right.location.loc in mem_locs then
            begin
              tcgx86(cg).make_simple_ref(current_asmdata.CurrAsmList,right.location.reference);
              current_asmdata.CurrAsmList.concat(taicpu.op_ref_reg(cmpinstr,S_NO,right.location.reference,left.location.register));
            end
          else if right.location.loc in mm_locs then
            current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(cmpinstr,S_NO,right.location.register,left.location.register))
          else
            internalerror(200402223);
        end;

      location.resflags:=getfpuresflags;
      location_freetemp(current_asmdata.CurrAsmList,left.location);
      location_freetemp(current_asmdata.CurrAsmList,right.location);
    end;
  { Generates code for +, -, * and / on vector operands that fit into an mm
    register; any other node type or operand raises an internal error. }
  procedure tx86addnode.second_opvector;
    var
      op : topcg;
      elemsize : tcgsize;
    begin
      pass_left_right;
      if (nf_swapped in flags) then
        swapleftright;

      case nodetype of
        addn :
          op:=OP_ADD;
        muln :
          op:=OP_MUL;
        subn :
          op:=OP_SUB;
        slashn :
          op:=OP_DIV;
        else
          internalerror(200610071);
      end;

      if fits_in_mm_register(left.resultdef) then
        begin
          location_reset(location,LOC_MMREGISTER,def_cgsize(resultdef));

          { The operation size is the size of one vector element. It used to
            be derived as tfloatdef(left.resultdef).floattype in the
            commutative branch below, i.e. by an unchecked cast of the
            operand's own definition, while the other branch went through
            the array element definition. Both branches now use the element
            definition.
            NOTE(review): assumes left.resultdef is an array of floats
            whenever fits_in_mm_register succeeds - the non-commutative
            branch already relied on that; confirm against the caller. }
          elemsize:=tfloat2tcgsize[tfloatdef(tarraydef(left.resultdef).elementdef).floattype];

          { we can use only right as left operand if the operation is commutative }
          if (right.location.loc=LOC_MMREGISTER) and (op in [OP_ADD,OP_MUL]) then
            begin
              if UseAVX then
                begin
                  { three operand form: a fresh register receives the result }
                  location.register:=cg.getmmregister(current_asmdata.CurrAsmList,OS_VECTOR);
                  cg.a_opmm_loc_reg_reg(current_asmdata.CurrAsmList,op,elemsize,left.location,right.location.register,location.register,nil);
                end
              else
                begin
                  { two operand form: right's register is reused as destination }
                  location.register:=right.location.register;
                  cg.a_opmm_loc_reg(current_asmdata.CurrAsmList,op,elemsize,left.location,location.register,nil);
                end;
            end
          else
            begin
              location_force_mmreg(current_asmdata.CurrAsmList,left.location,false);
              if UseAVX then
                begin
                  { three operand form: a fresh register receives the result }
                  location.register:=cg.getmmregister(current_asmdata.CurrAsmList,OS_VECTOR);
                  cg.a_opmm_loc_reg_reg(current_asmdata.CurrAsmList,op,elemsize,right.location,left.location.register,location.register,nil);
                end
              else
                begin
                  { two operand form: left's register is reused as destination }
                  location.register:=left.location.register;
                  cg.a_opmm_loc_reg(current_asmdata.CurrAsmList,op,elemsize,right.location,location.register,nil);
                end;
            end;
        end
      else
        begin
          { not yet supported }
          internalerror(200610072);
        end
    end;
  procedure tx86addnode.second_addfloat;
    const
      { the last index of every table is "one operand is a memory reference";
        the tables for the non-commutative operations are additionally
        indexed (first) by "operands are swapped" }
      add_ops : array[boolean] of TAsmOp = (A_FADDP,A_FADD);
      mul_ops : array[boolean] of TAsmOp = (A_FMULP,A_FMUL);
      sub_ops : array[boolean,boolean] of TAsmOp =
        ((A_FSUBP,A_FSUB),(A_FSUBRP,A_FSUBR));
      div_ops : array[boolean,boolean] of TAsmOp =
        ((A_FDIVP,A_FDIV),(A_FDIVRP,A_FDIVR));
    var
      x87op : TAsmOp;
      memnode, conv : tnode;
      from_mem : boolean;
    begin
      { operations carried out by the vector fpu are handled elsewhere }
      if use_vectorfpu(resultdef) then
        begin
          if UseAVX then
            second_addfloatavx
          else
            second_addfloatsse;
          exit;
        end;

      { can the operation do the conversion? If so, drop a type conversion
        whose source is a single or double }
      if (left.nodetype=typeconvn) and
         (is_double(ttypeconvnode(left).left.resultdef) or
          is_single(ttypeconvnode(left).left.resultdef)) then
        begin
          conv:=left;
          left:=ttypeconvnode(left).left;
          ttypeconvnode(conv).left:=nil;
          conv.Free;
        end;
      if (right.nodetype=typeconvn) and
         (is_double(ttypeconvnode(right).left.resultdef) or
          is_single(ttypeconvnode(right).left.resultdef)) then
        begin
          conv:=right;
          right:=ttypeconvnode(right).left;
          ttypeconvnode(conv).left:=nil;
          conv.Free;
        end;

      pass_left_right;
      prepare_x87_locations(memnode);
      from_mem:=assigned(memnode);

      case nodetype of
        addn :
          x87op:=add_ops[from_mem];
        muln :
          x87op:=mul_ops[from_mem];
        subn :
          x87op:=sub_ops[nf_swapped in flags,from_mem];
        slashn :
          x87op:=div_ops[nf_swapped in flags,from_mem];
        else
          internalerror(2003042203);
      end;

      if from_mem then
        emit_ref(x87op,tcgsize2opsize[memnode.location.size],memnode.location.reference)
      else
        begin
          emit_reg_reg(x87op,S_NO,NR_ST,NR_ST1);
          { the register-register forms selected above pop the fpu stack }
          tcgx86(cg).dec_fpu_stack;
        end;

      location_reset(location,LOC_FPUREGISTER,def_cgsize(resultdef));
      location.register:=NR_ST;
    end;
  procedure tx86addnode.second_cmpfloat;

{$ifndef x86_64}
    { compare for cpus before the Pentium II: FCOMPP pops both operands,
      afterwards the fpu status word is moved to the cpu flags through
      AH and SAHF }
    procedure compare_via_status_word;
{$ifdef i8086}
      var
        tmpref: treference;
{$endif i8086}
      begin
        emit_none(A_FCOMPP,S_NO);
        tcgx86(cg).dec_fpu_stack;
        tcgx86(cg).dec_fpu_stack;

        { load fpu flags }
{$ifdef i8086}
        if current_settings.cputype < cpu_286 then
          begin
            { store the status word to a temp and load its high byte into AH }
            tg.gettemp(current_asmdata.CurrAsmList,2,2,tt_normal,tmpref);
            emit_ref(A_FSTSW,S_NO,tmpref);
            cg.getcpuregister(current_asmdata.CurrAsmList,NR_AX);
            inc(tmpref.offset);
            emit_ref_reg(A_MOV,S_B,tmpref,NR_AH);
            dec(tmpref.offset);
            emit_none(A_SAHF,S_NO);
            cg.ungetcpuregister(current_asmdata.CurrAsmList,NR_AX);
            tg.ungettemp(current_asmdata.CurrAsmList,tmpref);
          end
        else
{$endif i8086}
          begin
            cg.getcpuregister(current_asmdata.CurrAsmList,NR_AX);
            emit_reg(A_FNSTSW,S_NO,NR_AX);
            emit_none(A_SAHF,S_NO);
            cg.ungetcpuregister(current_asmdata.CurrAsmList,NR_AX);
          end;

        if cs_fpu_fwait in current_settings.localswitches then
          current_asmdata.CurrAsmList.concat(Taicpu.Op_none(A_FWAIT,S_NO));
      end;
{$endif x86_64}

    begin
      { comparisons involving the vector fpu are handled elsewhere }
      if use_vectorfpu(left.resultdef) or use_vectorfpu(right.resultdef) then
        begin
          second_cmpfloatvector;
          exit;
        end;

      pass_left_right;
      force_left_and_right_fpureg;

{$ifndef x86_64}
      if current_settings.cputype<cpu_Pentium2 then
        compare_via_status_word
      else
{$endif x86_64}
        begin
          current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_FCOMIP,S_NO,NR_ST1,NR_ST0));
          { fcomip pops only one fpu register, so remove the other one as well }
          current_asmdata.CurrAsmList.concat(taicpu.op_reg(A_FSTP,S_NO,NR_ST0));
          tcgx86(cg).dec_fpu_stack;
          tcgx86(cg).dec_fpu_stack;
        end;

      location_reset(location,LOC_FLAGS,OS_NO);
      location.resflags:=getfpuresflags;
    end;
  1692. {*****************************************************************************
  1693. Add64bit
  1694. *****************************************************************************}
  { 64 bit add/sub/mul/logical operation.  On targets with a 64 bit ALU this
    is handled by the regular ordinal code path; every other target has to
    override this method, otherwise an internal error is raised. }
  procedure tx86addnode.second_add64bit;
    begin
{$ifndef cpu64bitalu}
      { must be implemented separately }
      internalerror(200402042);
{$else cpu64bitalu}
      second_addordinal;
{$endif cpu64bitalu}
    end;
  { 64 bit comparison.  On targets with a 64 bit ALU this is handled by the
    regular ordinal comparison; every other target has to override this
    method, otherwise an internal error is raised. }
  procedure tx86addnode.second_cmp64bit;
    begin
{$ifndef cpu64bitalu}
      { must be implemented separately }
      internalerror(200402043);
{$else cpu64bitalu}
      second_cmpordinal;
{$endif cpu64bitalu}
    end;
  1713. {*****************************************************************************
  1714. AddOrdinal
  1715. *****************************************************************************}
  { Emits the code for an ordinal addn, subn, muln, andn, orn or xorn node;
    any other node type raises internalerror 2015022501.

    The routine has three stages:
      1. select the generic operation (cgop) and decide whether an overflow
         check has to follow it (only add, sub and mul can request one, and
         only if needoverflowcheck agrees);
      2. on non-i8086 targets at optimisation level 2, try to fold
         "x and (not y)" into ANDN (BMI1) or "x and ((1 shl y) - 1)" into
         BZHI (BMI2); both paths second-pass the operands themselves and
         leave the routine early;
      3. otherwise second-pass both operands and emit either a three operand
         form into a fresh result register or a two operand form that re-uses
         the register of the left operand.

    The result is described by location when the routine returns. }
  procedure tx86addnode.second_addordinal;
    var
      opsize : tcgsize;
      unsigned : boolean;
      cgop : topcg;
      checkoverflow : Boolean;
      ovloc : tlocation;
      tmpreg : TRegister;
      indexnode : TNode;
    begin
      { determine if the operation will be unsigned }
      unsigned:=not(is_signed(left.resultdef)) or
                not(is_signed(right.resultdef));

      { assume no overflow checking is required }
      checkoverflow := false;

      ovloc.loc:=LOC_VOID;

      case nodetype of
        addn:
          begin
            cgop:=OP_ADD;
            checkoverflow:=true;
          end;
        xorn :
          begin
            cgop:=OP_XOR;
          end;
        orn :
          begin
            cgop:=OP_OR;
          end;
        andn:
          begin
            cgop:=OP_AND;
          end;
        muln:
          begin
            checkoverflow:=true;
            if unsigned then
              cgop:=OP_MUL
            else
              cgop:=OP_IMUL;
          end;
        subn :
          begin
            checkoverflow:=true;
            cgop:=OP_SUB;
          end;
        else
          internalerror(2015022501);
      end;

      checkoverflow:=
        checkoverflow and
        needoverflowcheck;

      opsize:=def_cgsize(left.resultdef);

{$ifndef i8086}
      if (cs_opt_level2 in current_settings.optimizerswitches) then
        begin
          { BMI1 optimisations }
          if (CPUX86_HAS_BMI1 in cpu_capabilities[current_settings.cputype]) then
            begin
              { Can we turn "x and (not y)" into an ANDN instruction instead? }
              if (nodetype = andn) and
                (opsize in [OS_32, OS_S32{$ifdef x86_64}, OS_64, OS_S64{$endif x86_64}]) and
                ((left.nodetype = notn) or (right.nodetype = notn)) and
                (
                  { With "const and (not variable)", ANDN will produce larger
                    code once everything is moved into registers (as a side-note,
                    "const and (not const)" and "variable and (not const)" will
                    have been simplified earlier to remove the NOT operation).

                    Neither operand has been second-passed at this point, so
                    their locations must not be consulted yet; the node type is
                    used to recognise a constant operand instead. }
                  not (cs_opt_size in current_settings.optimizerswitches) or
                  (
                    (left.nodetype <> ordconstn) and
                    (right.nodetype <> ordconstn)
                  )
                ) then
                begin
                  { ANDN only supports the second operand being inverted; however,
                    since we're dealing with ordinals, there won't be any Boolean
                    shortcutting, so we can safely swap the parameters }
                  if (right.nodetype <> notn) then
                    swapleftright;

                  secondpass(left);

                  { Skip the not node completely }
                  Include(right.flags, nf_do_not_execute);
                  secondpass(tnotnode(right).left);

                  { allocate registers }
                  hlcg.location_force_reg(
                    current_asmdata.CurrAsmList,
                    tnotnode(right).left.location,
                    tnotnode(right).left.resultdef,
                    tnotnode(right).left.resultdef,
                    false
                  );

                  if left.location.loc = LOC_CONSTANT then
                    { With "const and (not variable)", we can probably still make a
                      saving when it comes to pipeline stalls (left.location.loc
                      will become LOC_CREGISTER). }
                    hlcg.location_force_reg(
                      current_asmdata.CurrAsmList,
                      left.location,
                      left.resultdef,
                      left.resultdef,
                      true
                    );

                  set_result_location_reg;

                  { the operand that gets inverted is always passed in a
                    register; the other one may stay in memory }
                  case left.location.loc of
                    LOC_REFERENCE,
                    LOC_CREFERENCE:
                      emit_ref_reg_reg(A_ANDN, TCGSize2OpSize[opsize], left.location.reference, tnotnode(right).left.location.register, location.register);
                    LOC_REGISTER,
                    LOC_CREGISTER:
                      emit_reg_reg_reg(A_ANDN, TCGSize2OpSize[opsize], left.location.register, tnotnode(right).left.location.register, location.register);
                    else
                      InternalError(2022102110);
                  end;

                  { Overflow can't happen with and/andn }
                  Exit;
                end;
            end;

          { BMI2 optimisations }
          if (CPUX86_HAS_BMI2 in cpu_capabilities[current_settings.cputype]) then
            begin
              { Can we turn "x and ((1 shl y) - 1)" into a BZHI instruction instead? }
              if (nodetype = andn) and
                (opsize in [OS_32, OS_S32{$ifdef x86_64}, OS_64, OS_S64{$endif x86_64}]) and
                (
                  (
                    (right.nodetype = subn) and
                    (taddnode(right).right.nodetype = ordconstn) and
                    (tordconstnode(taddnode(right).right).value = 1) and
                    (taddnode(right).left.nodetype = shln) and
                    (tshlshrnode(taddnode(right).left).left.nodetype = ordconstn) and
                    (tordconstnode(tshlshrnode(taddnode(right).left).left).value = 1)
                  ) or
                  (
                    (left.nodetype = subn) and
                    (taddnode(left).right.nodetype = ordconstn) and
                    (tordconstnode(taddnode(left).right).value = 1) and
                    (taddnode(left).left.nodetype = shln) and
                    (tshlshrnode(taddnode(left).left).left.nodetype = ordconstn) and
                    (tordconstnode(tshlshrnode(taddnode(left).left).left).value = 1)
                  )
                ) then
                begin
                  { Put the subtract node on the right }
                  if (right.nodetype <> subn) then
                    swapleftright;

                  secondpass(left);

                  { Skip the subtract and shift nodes completely }
                  Include(right.flags, nf_do_not_execute);
                  Include(taddnode(right).left.flags, nf_do_not_execute);

                  { Helps avoid all the awkward typecasts }
                  indexnode := tshlshrnode(taddnode(right).left).right;
{$ifdef x86_64}
                  { The code generator sometimes extends the shift result to 64-bit unnecessarily }
                  if (indexnode.nodetype = typeconvn) and (opsize in [OS_32, OS_S32]) and
                    (def_cgsize(TTypeConvNode(indexnode).resultdef) in [OS_64, OS_S64]) then
                    begin
                      { Convert to the 32-bit type }
                      indexnode.resultdef := resultdef;
                      node_reset_flags(indexnode,[nf_pass1_done]);

                      { We shouldn't be getting any new errors }
                      if do_firstpass(indexnode) then
                        InternalError(2022110201);

                      { Keep things internally consistent in case indexnode changed }
                      tshlshrnode(taddnode(right).left).right := indexnode;
                    end;
{$endif x86_64}
                  secondpass(indexnode);

                  { allocate registers }
                  hlcg.location_force_reg(
                    current_asmdata.CurrAsmList,
                    indexnode.location,
                    indexnode.resultdef,
                    resultdef,
                    false
                  );

                  set_result_location_reg;

                  { the bit index is always passed in a register; the value
                    being masked may stay in memory }
                  case left.location.loc of
                    LOC_REFERENCE,
                    LOC_CREFERENCE:
                      emit_reg_ref_reg(A_BZHI, TCGSize2OpSize[opsize], indexnode.location.register, left.location.reference, location.register);
                    LOC_REGISTER,
                    LOC_CREGISTER:
                      emit_reg_reg_reg(A_BZHI, TCGSize2OpSize[opsize], indexnode.location.register, left.location.register, location.register);
                    else
                      InternalError(2022102111);
                  end;

                  { no overflow check: this path is only taken for andn }
                  Exit;
                end;
            end;
        end;
{$endif not i8086}

      pass_left_right;

      { do we have to allocate a register? If yes, then three opcode instructions are better, however for sub three op code instructions
        make no sense if right is a reference }
      if ((left.location.loc<>LOC_REGISTER) and (right.location.loc<>LOC_REGISTER) and
          ((nodetype<>subn) or not(right.location.loc in [LOC_REFERENCE,LOC_CREFERENCE])) and
          { 3 op mul makes only sense if a constant is involved }
          ((nodetype<>muln) or (left.location.loc=LOC_CONSTANT) or (right.location.loc=LOC_CONSTANT)
{$ifndef i8086}
           or ((CPUX86_HAS_BMI2 in cpu_capabilities[current_settings.cputype]) and (not(needoverflowcheck))
              )
{$endif i8086}
          ) and
          (not(nodetype in [orn,andn,xorn]))) or
         ((nodetype=addn) and (left.location.loc in [LOC_REGISTER,LOC_CREGISTER,LOC_CONSTANT]) and (right.location.loc in [LOC_REGISTER,LOC_CREGISTER,LOC_CONSTANT])) then
        begin
          { allocate registers }
          force_reg_left_right(false,true);
          set_result_location_reg;
          if nodetype<>subn then
            begin
              if checkoverflow then
                cg.a_reg_alloc(current_asmdata.CurrAsmList, NR_DEFAULTFLAGS);
              if (right.location.loc<>LOC_CONSTANT) then
                hlcg.a_op_reg_reg_reg_checkoverflow(current_asmdata.CurrAsmList,cgop,resultdef,
                  left.location.register,right.location.register,
                  location.register,checkoverflow,ovloc)
              else
                hlcg.a_op_const_reg_reg_checkoverflow(current_asmdata.CurrAsmList,cgop,resultdef,
                  right.location.value,left.location.register,
                  location.register,checkoverflow,ovloc);
            end
          else { subtract is a special case since it's not commutative }
            begin
              { undo an earlier operand swap so that left really is the minuend }
              if (nf_swapped in flags) then
                swapleftright;
              if left.location.loc<>LOC_CONSTANT then
                begin
                  if checkoverflow then
                    cg.a_reg_alloc(current_asmdata.CurrAsmList, NR_DEFAULTFLAGS);
                  if right.location.loc<>LOC_CONSTANT then
                    hlcg.a_op_reg_reg_reg_checkoverflow(current_asmdata.CurrAsmList,OP_SUB,resultdef,
                      right.location.register,left.location.register,
                      location.register,checkoverflow,ovloc)
                  else
                    hlcg.a_op_const_reg_reg_checkoverflow(current_asmdata.CurrAsmList,OP_SUB,resultdef,
                      right.location.value,left.location.register,
                      location.register,checkoverflow,ovloc);
                end
              else
                begin
                  { constant minuend: load it into a scratch register first }
                  tmpreg:=hlcg.getintregister(current_asmdata.CurrAsmList,resultdef);
                  hlcg.a_load_const_reg(current_asmdata.CurrAsmList,resultdef,
                    left.location.value,tmpreg);
                  if checkoverflow then
                    cg.a_reg_alloc(current_asmdata.CurrAsmList, NR_DEFAULTFLAGS);
                  hlcg.a_op_reg_reg_reg_checkoverflow(current_asmdata.CurrAsmList,OP_SUB,resultdef,
                    right.location.register,tmpreg,location.register,checkoverflow,ovloc);
                end;
            end
        end
      else
        begin
          { at least one location should be a register, if yes, try to re-use it, so we can try two operand opcodes }
          if left.location.loc<>LOC_REGISTER then
            begin
              if right.location.loc<>LOC_REGISTER then
                hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,left.resultdef,false)
              else
                begin
                  location_swap(left.location,right.location);
                  toggleflag(nf_swapped);
                end;
            end;

          { at this point, left.location.loc should be LOC_REGISTER }
          if right.location.loc=LOC_REGISTER then
            begin
              if checkoverflow then
                cg.a_reg_alloc(current_asmdata.CurrAsmList, NR_DEFAULTFLAGS);
              { when swapped another result register }
              if (nodetype=subn) and (nf_swapped in flags) then
                begin
                  cg.a_op_reg_reg(current_asmdata.CurrAsmList,cgop,opsize,
                    left.location.register,right.location.register);
                  location_swap(left.location,right.location);
                  toggleflag(nf_swapped);
                end
              else
                cg.a_op_reg_reg(current_asmdata.CurrAsmList,cgop,opsize,
                  right.location.register,left.location.register);
            end
          else
            begin
              { right.location<>LOC_REGISTER }
              if right.location.loc in [LOC_CSUBSETREF,LOC_CSUBSETREG,LOC_SUBSETREF,LOC_SUBSETREG] then
                hlcg.location_force_reg(current_asmdata.CurrAsmList,right.location,right.resultdef,left.resultdef,true);
              if (nodetype=subn) and (nf_swapped in flags) then
                begin
                  { swapped subtraction: load right into a new register and
                    subtract the old left register from it }
                  tmpreg:=left.location.register;
                  left.location.register:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
                  cg.a_load_loc_reg(current_asmdata.CurrAsmList,opsize,right.location,left.location.register);
                  if checkoverflow then
                    cg.a_reg_alloc(current_asmdata.CurrAsmList, NR_DEFAULTFLAGS);
                  cg.a_op_reg_reg(current_asmdata.CurrAsmList,cgop,opsize,tmpreg,left.location.register);
                end
              else
                begin
                  if checkoverflow then
                    cg.a_reg_alloc(current_asmdata.CurrAsmList, NR_DEFAULTFLAGS);
                  cg.a_op_loc_reg(current_asmdata.CurrAsmList,cgop,opsize,right.location,left.location.register);
                end;
              location_freetemp(current_asmdata.CurrAsmList,right.location);
            end;

          { the two operand forms leave the result in left's register }
          location_copy(location,left.location);
        end;

      { emit overflow check if required }
      if checkoverflow then
        cg.g_overflowcheck_loc(current_asmdata.CurrAsmList,Location,resultdef,ovloc);
    end;
  { Boolean and/or/xor.  Short-circuit evaluated and/or nodes as well as
    64 bit booleans are left to the inherited implementation; everything
    else is treated as a plain ordinal operation. }
  procedure tx86addnode.second_addboolean;
    var
      shortcircuit : boolean;
    begin
      { and/or are evaluated short-circuit unless full boolean evaluation is
        switched on and the node is not explicitly marked as short boolean }
      shortcircuit:=(nodetype in [orn,andn]) and
        (not(cs_full_boolean_eval in current_settings.localswitches) or
         (anf_short_bool in addnodeflags));

      if shortcircuit or is_64bit(left.resultdef) then
        inherited second_addboolean
      else
        second_addordinal;
    end;
  { Emits the code for an ordinal comparison and leaves the outcome in the
    CPU flags (location becomes LOC_FLAGS, with the condition supplied by
    getresflags).

    A constant compared against a memory operand is emitted as a single CMP
    with an immediate; all other combinations force the left operand into a
    register and go through emit_generic_code. }
  procedure tx86addnode.second_cmpordinal;
    var
      cmpdef : tdef;
      cmpsize : tcgsize;
      is_unsigned : boolean;
      const_vs_mem : boolean;
    begin
      { the comparison is unsigned as soon as one operand is unsigned }
      is_unsigned:=not(is_signed(left.resultdef) and
                       is_signed(right.resultdef));
      cmpdef:=left.resultdef;
      cmpsize:=def_cgsize(cmpdef);

      pass_left_right;

      const_vs_mem:=(right.location.loc=LOC_CONSTANT) and
                    (left.location.loc in [LOC_REFERENCE, LOC_CREFERENCE]);
{$ifdef x86_64}
      { for a 64 bit operand the constant additionally has to fit into a
        longint to be usable as an immediate }
      if const_vs_mem and (cmpsize in [OS_64,OS_S64]) then
        const_vs_mem:=(right.location.value>=low(longint)) and
                      (right.location.value<=high(longint));
{$endif x86_64}

      if const_vs_mem then
        begin
          cg.a_reg_alloc(current_asmdata.CurrAsmList, NR_DEFAULTFLAGS);
          emit_const_ref(A_CMP, TCGSize2Opsize[cmpsize], right.location.value, left.location.reference);
          location_freetemp(current_asmdata.CurrAsmList,left.location);
        end
      else
        begin
          left_must_be_reg(cmpdef,cmpsize,false);
          emit_generic_code(A_CMP,cmpsize,is_unsigned,false,false);
          location_freetemp(current_asmdata.CurrAsmList,right.location);
          location_freetemp(current_asmdata.CurrAsmList,left.location);
        end;

      location_reset(location,LOC_FLAGS,OS_NO);
      location.resflags:=getresflags(is_unsigned);
    end;
  { Unit initialisation: stores tx86addnode in caddnode.  NOTE(review):
    caddnode is declared outside this unit; presumably it is the class
    reference used when add nodes are created - confirm. }
begin
  caddnode:=tx86addnode;
end.