2
0

nx86add.pas 66 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693
  1. {
  2. Copyright (c) 2000-2002 by Florian Klaempfl
  3. Common code generation for add nodes on the i386 and x86
  4. This program is free software; you can redistribute it and/or modify
  5. it under the terms of the GNU General Public License as published by
  6. the Free Software Foundation; either version 2 of the License, or
  7. (at your option) any later version.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. GNU General Public License for more details.
  12. You should have received a copy of the GNU General Public License
  13. along with this program; if not, write to the Free Software
  14. Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  15. ****************************************************************************
  16. }
  17. unit nx86add;
  18. {$i fpcdefs.inc}
  19. interface
  20. uses
  21. symtype,
  22. cgbase,
  23. cpubase,
  24. node,nadd,ncgadd;
  type
    { Add node with the code generation that is common to the x86 targets;
      it specialises the generic code generator add node tcgaddnode. }
    tx86addnode = class(tcgaddnode)
    protected
      { selection of the result flags of integer/set and fpu comparisons }
      function getresflags(unsigned : boolean) : tresflags;
      function getfpuresflags : tresflags;

      { helpers bringing the operands into a usable location }
      procedure left_must_be_reg(opdef: tdef; opsize:TCGSize;noswap:boolean);
      procedure force_left_and_right_fpureg;
      procedure prepare_x87_locations(out refnode: tnode);

      { helpers emitting the actual instructions }
      procedure emit_op_right_left(op:TAsmOp;opsize:TCgSize);
      procedure emit_generic_code(op:TAsmOp;opsize:TCgSize;unsigned,extra_not,mboverflow:boolean);

      procedure second_cmpfloatvector;
      procedure second_addfloatsse;
      procedure second_addfloatavx;
    public
      function pass_1 : tnode;override;
      function use_fma : boolean;override;

      procedure second_addfloat;override;
{$ifndef i8086}
      { not overridden here when compiling for i8086 }
      procedure second_addsmallset;override;
{$endif not i8086}
      procedure second_add64bit;override;
      procedure second_cmpfloat;override;
      procedure second_cmpsmallset;override;
      procedure second_cmp64bit;override;
      procedure second_cmpordinal;override;
      procedure second_addordinal;override;
      procedure second_addboolean;override;
{$ifdef SUPPORT_MMX}
      { only available when the compiler is built with MMX support }
      procedure second_opmmx;override;
{$endif SUPPORT_MMX}
      procedure second_opvector;override;
    end;
  57. implementation
  58. uses
  59. globtype,globals,
  60. verbose,cutils,compinnr,
  61. cpuinfo,
  62. aasmbase,aasmdata,aasmcpu,
  63. symconst,symdef,
  64. cgobj,hlcgobj,cgx86,cga,cgutils,
  65. tgobj,ncgutil,
  66. ncon,nset,ninl,ncnv,
  67. defutil,
  68. htypechk;
  69. { Range check must be disabled explicitly as the code serves
  70. on three different architecture sizes }
  71. {$R-}
  72. {*****************************************************************************
  73. Helpers
  74. *****************************************************************************}
  { Emits the integer instruction for a binary node whose left operand is
    already in a register (the callers in this unit run left_must_be_reg
    first).

      op         - x86 opcode to emit
      opsize     - operand size of the operation
      unsigned   - selects the overflow test that is generated: carry based
                   (F_AE) when true, overflow flag based (F_NO) when false
      extra_not  - one operand is complemented with NOT before the operation
      mboverflow - the operation may overflow; together with an active
                   overflow check this appends a call to FPC_OVERFLOW

    If the instruction produces a value, it is left in
    left.location.register. }
  procedure tx86addnode.emit_generic_code(op:TAsmOp;opsize:TCGSize;unsigned,extra_not,mboverflow:boolean);
    var
      power : longint;
      hl4 : tasmlabel;
      r : Tregister;
      href : treference;
      overflowcheck: boolean;
    begin
      overflowcheck:=needoverflowcheck;
      { at this point, left.location.loc should be LOC_REGISTER }
      if right.location.loc=LOC_REGISTER then
        begin
          { right.location is a LOC_REGISTER }
          if (nodetype=subn) and (nf_swapped in flags) then
            begin
              { swapped subtraction: the result is built in the right register }
              if extra_not then
                emit_reg(A_NOT,TCGSize2Opsize[opsize],left.location.register);
              emit_reg_reg(op,TCGSize2Opsize[opsize],left.location.register,right.location.register);
              { newly swapped also set swapped flag }
              location_swap(left.location,right.location);
              toggleflag(nf_swapped);
            end
          else
            begin
              if extra_not then
                emit_reg(A_NOT,TCGSize2Opsize[opsize],right.location.register);
              { for these opcodes the operands are exchanged before emitting }
              if (op=A_ADD) or (op=A_OR) or (op=A_AND) or (op=A_XOR) or (op=A_IMUL) then
                location_swap(left.location,right.location);
              emit_reg_reg(op,TCGSize2Opsize[opsize],right.location.register,left.location.register);
            end;
        end
      else
        begin
          { right.location is not a LOC_REGISTER }
          if (nodetype=subn) and (nf_swapped in flags) then
            begin
              if extra_not then
                cg.a_op_reg_reg(current_asmdata.CurrAsmList,OP_NOT,opsize,left.location.register,left.location.register);
              { load right into a scratch register, operate there and move
                the result back into the left register }
              r:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
              hlcg.a_load_loc_reg(current_asmdata.CurrAsmList,right.resultdef,cgsize_orddef(opsize),right.location,r);
              emit_reg_reg(op,TCGSize2Opsize[opsize],left.location.register,r);
              cg.a_load_reg_reg(current_asmdata.CurrAsmList,opsize,opsize,r,left.location.register);
            end
          else
            begin
              { Optimizations when right.location is a constant value }
              if (op=A_CMP) and
                 (nodetype in [equaln,unequaln]) and
                 (right.location.loc=LOC_CONSTANT) and
                 (right.location.value=0) then
                begin
                  { 'test $-1,%reg' is transformable into 'test $-1,spilltemp' if %reg needs
                    spilling, while 'test %reg,%reg' still requires loading into register.
                    If spilling is not necessary, it is changed back into 'test %reg,%reg' by
                    peephole optimizer (this optimization is currently available only for i386). }
                  cg.a_reg_alloc(current_asmdata.CurrAsmList,NR_DEFAULTFLAGS);
{$ifdef i386}
                  emit_const_reg(A_TEST,TCGSize2Opsize[opsize],aint(-1),left.location.register);
{$else i386}
                  emit_reg_reg(A_TEST,TCGSize2Opsize[opsize],left.location.register,left.location.register);
{$endif i386}
                end
              else if (op=A_ADD) and
                 (right.location.loc=LOC_CONSTANT) and
                 (right.location.value=1) and
                 not overflowcheck and
                 UseIncDec then
                begin
                  emit_reg(A_INC,TCGSize2Opsize[opsize],left.location.register);
                end
              else if (op=A_SUB) and
                 (right.location.loc=LOC_CONSTANT) and
                 (right.location.value=1) and
                 { DEC leaves the carry flag untouched, so it must not be
                   used when an overflow test follows (same rule as INC) }
                 not overflowcheck and
                 UseIncDec then
                begin
                  emit_reg(A_DEC,TCGSize2Opsize[opsize],left.location.register);
                end
              else if (op=A_IMUL) and
                 (right.location.loc=LOC_CONSTANT) and
                 (ispowerof2(int64(right.location.value),power)) and
                 { SHL does not report a multiplication overflow in CF/OF
                   the way IMUL does, so only use it without overflow check }
                 not overflowcheck then
                begin
                  emit_const_reg(A_SHL,TCGSize2Opsize[opsize],power,left.location.register);
                end
              else if (op=A_IMUL) and
                 (right.location.loc=LOC_CONSTANT) and
                 (right.location.value>1) and (ispowerof2(int64(right.location.value)-1,power)) and
                 (power in [1..3]) and
                 not overflowcheck then
                begin
                  { multiplication by 3, 5 or 9: lea (reg,reg,2/4/8),newreg }
                  reference_reset_base(href,left.location.register,0,ctempposinvalid,0,[]);
                  href.index:=left.location.register;
                  href.scalefactor:=int64(right.location.value)-1;
                  left.location.register:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
                  current_asmdata.CurrAsmList.concat(taicpu.op_ref_reg(A_LEA,TCgSize2OpSize[opsize],href,left.location.register));
                end
              else
                begin
                  if extra_not then
                    begin
                      { complement a copy of right and AND it into left }
                      r:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
                      hlcg.a_load_loc_reg(current_asmdata.CurrAsmList,right.resultdef,cgsize_orddef(opsize),right.location,r);
                      emit_reg(A_NOT,TCGSize2Opsize[opsize],r);
                      emit_reg_reg(A_AND,TCGSize2Opsize[opsize],r,left.location.register);
                    end
                  else
                    begin
                      emit_op_right_left(op,opsize);
                    end;
                end;
            end;
        end;

      { only in case of overflow operations }
      { produce overflow code }
      { we must put it here directly, because sign of operation }
      { is in unsigned VAR!! }
      if mboverflow then
        begin
          if overflowcheck then
            begin
              current_asmdata.getjumplabel(hl4);
              { skip the FPC_OVERFLOW call when no carry/overflow occurred }
              if unsigned then
                cg.a_jmp_flags(current_asmdata.CurrAsmList,F_AE,hl4)
              else
                cg.a_jmp_flags(current_asmdata.CurrAsmList,F_NO,hl4);
              cg.a_call_name(current_asmdata.CurrAsmList,'FPC_OVERFLOW',false);
              cg.a_label(current_asmdata.CurrAsmList,hl4);
            end;
        end;
    end;
  { Makes sure the left operand lives in a register.

    If left is not a LOC_REGISTER and swapping is allowed (noswap=false), a
    right operand that already is in a (constant) register is exchanged
    with left and nf_swapped is toggled; otherwise left is loaded into a
    register. Afterwards any non-constant operand whose size differs from
    opsize (ignoring signedness) is forced into a register of type opdef. }
  procedure tx86addnode.left_must_be_reg(opdef: tdef; opsize:TCGSize;noswap:boolean);
    var
      is_comparison,
      right_in_reg,
      right_in_creg : boolean;
    begin
      if left.location.loc<>LOC_REGISTER then
        begin
          { maybe we can reuse a constant register when the operation is a
            comparison that doesn't change the value of the register }
          is_comparison:=nodetype in [ltn,lten,gtn,gten,equaln,unequaln];
          right_in_reg:=right.location.loc=LOC_REGISTER;
          right_in_creg:=right.location.loc=LOC_CREGISTER;

          if noswap or not(right_in_reg or right_in_creg) then
            hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,opdef,is_comparison)
          else
            begin
              { right is in some kind of register: make it the left operand }
              location_swap(left.location,right.location);
              toggleflag(nf_swapped);
              if right_in_creg then
                begin
                  hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,opdef,is_comparison);
                  location:=left.location;
                end;
            end;
        end;

      { adapt operands whose size does not match the operation size }
      if (tcgsize2unsigned[right.location.size]<>tcgsize2unsigned[opsize]) and
         (right.location.loc<>LOC_CONSTANT) then
        hlcg.location_force_reg(current_asmdata.CurrAsmList,right.location,right.resultdef,opdef,true);
      if (tcgsize2unsigned[left.location.size]<>tcgsize2unsigned[opsize]) and
         (left.location.loc<>LOC_CONSTANT) then
        hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,opdef,false);
    end;
  { Loads both operands into fpu registers. Right is loaded first when it
    is not there yet; a left operand that already is an fpu register is not
    reloaded, nf_swapped is toggled instead. }
  procedure tx86addnode.force_left_and_right_fpureg;
    begin
      if right.location.loc<>LOC_FPUREGISTER then
        hlcg.location_force_fpureg(current_asmdata.CurrAsmList,right.location,right.resultdef,false);

      if left.location.loc=LOC_FPUREGISTER then
        { left was already on the fpu stack, so the operands are in the
          wrong order on the stack => record the swap }
        toggleflag(nf_swapped)
      else
        { the nominator in st0 }
        hlcg.location_force_fpureg(current_asmdata.CurrAsmList,left.location,left.resultdef,false);
    end;
  { Prepares both operands for an x87 instruction.

    On return 'refnode' is either nil (both operands are on the fpu stack)
    or the operand that stays in memory: an OS_F32/OS_F64 sized
    LOC_(C)REFERENCE that can be used directly as memory operand.
    Everything else is loaded onto the fpu stack; nf_swapped is toggled
    where the resulting operand order requires it. }
  procedure tx86addnode.prepare_x87_locations(out refnode: tnode);
    var
      fpu_operands : longint;
    begin
      refnode:=nil;

      { later on, no mm registers are allowed, so transfer everything to
        memory here; below it is loaded into an fpu register if needed }
      if left.location.loc in [LOC_MMREGISTER,LOC_CMMREGISTER] then
        hlcg.location_force_mem(current_asmdata.CurrAsmList,left.location,left.resultdef);
      if right.location.loc in [LOC_MMREGISTER,LOC_CMMREGISTER] then
        hlcg.location_force_mem(current_asmdata.CurrAsmList,right.location,right.resultdef);

      { count how many operands already are on the fpu stack }
      fpu_operands:=0;
      if left.location.loc=LOC_FPUREGISTER then
        inc(fpu_operands);
      if right.location.loc=LOC_FPUREGISTER then
        inc(fpu_operands);

      case fpu_operands of
        2:
          { fpu operands are always in the wrong order on the stack }
          toggleflag(nf_swapped);
        1:
          begin
            { the candidate for the memory operand is the side that is not
              on the stack }
            if left.location.loc=LOC_FPUREGISTER then
              refnode:=right
            else
              refnode:=left;
            if not(refnode.location.loc in [LOC_REFERENCE,LOC_CREFERENCE]) then
              InternalError(2013090801);
            if refnode.location.size in [OS_F32,OS_F64] then
              begin
                { usable as memory operand }
                if refnode=left then
                  toggleflag(nf_swapped);
              end
            else
              begin
                { wrong size for a memory operand: load it as well }
                hlcg.location_force_fpureg(current_asmdata.CurrAsmList,refnode.location,refnode.resultdef,false);
                if refnode=right then
                  toggleflag(nf_swapped);
                refnode:=nil;
              end;
          end;
        0:
          begin
            hlcg.location_force_fpureg(current_asmdata.CurrAsmList,right.location,right.resultdef,false);
            if not(left.location.loc in [LOC_REFERENCE,LOC_CREFERENCE]) then
              InternalError(2013090803);
            if left.location.size in [OS_F32,OS_F64] then
              begin
                refnode:=left;
                toggleflag(nf_swapped);
              end
            else
              hlcg.location_force_fpureg(current_asmdata.CurrAsmList,left.location,left.resultdef,false);
          end;
        else
          InternalError(2013090802);
      end;
    end;
  { Emits 'op right,left' with left.location.register as destination.
    The right operand may be a register, a memory reference or a constant;
    subset locations are loaded into a register first. Any other right
    location raises internal error 200203232. }
  procedure tx86addnode.emit_op_right_left(op:TAsmOp;opsize:TCgsize);
{$ifdef x86_64}
    var
      constreg : tregister;
{$endif x86_64}
    begin
      if right.location.loc in [LOC_SUBSETREG,LOC_CSUBSETREG,LOC_SUBSETREF,LOC_CSUBSETREF] then
        hlcg.location_force_reg(current_asmdata.CurrAsmList,right.location,right.resultdef,right.resultdef,true);

      { left must be a register }
      case right.location.loc of
        LOC_CONSTANT :
          begin
{$ifdef x86_64}
            { x86_64 only supports signed 32 bits constants directly,
              anything else goes through a temporary register }
            if (opsize in [OS_64,OS_S64]) and
               not((right.location.value>=low(longint)) and (right.location.value<=high(longint))) then
              begin
                constreg:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
                cg.a_load_const_reg(current_asmdata.CurrAsmList,opsize,right.location.value,constreg);
                current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(op,TCGSize2Opsize[opsize],constreg,left.location.register));
              end
            else
{$endif x86_64}
              current_asmdata.CurrAsmList.concat(taicpu.op_const_reg(op,TCGSize2Opsize[opsize],right.location.value,left.location.register));
          end;
        LOC_CREFERENCE,
        LOC_REFERENCE :
          begin
            tcgx86(cg).make_simple_ref(current_asmdata.CurrAsmList,right.location.reference);
            current_asmdata.CurrAsmList.concat(taicpu.op_ref_reg(op,TCGSize2Opsize[opsize],right.location.reference,left.location.register));
          end;
        LOC_CREGISTER,
        LOC_REGISTER :
          current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(op,TCGSize2Opsize[opsize],right.location.register,left.location.register));
        else
          internalerror(200203232);
      end;
    end;
  { Returns the flags condition that represents the result of the integer
    comparison described by nodetype. 'unsigned' selects the above/below
    conditions instead of the greater/less ones; when nf_swapped is set the
    direction of the ordering conditions is mirrored. Node types other than
    the six comparisons raise an internal error. }
  function tx86addnode.getresflags(unsigned : boolean) : tresflags;
    var
      swapped : boolean;
    begin
      { (in)equality does not depend on signedness or operand order }
      if nodetype=equaln then
        exit(F_E);
      if nodetype=unequaln then
        exit(F_NE);

      swapped:=nf_swapped in flags;
      if unsigned then
        begin
          if swapped then
            case nodetype of
              ltn : result:=F_A;
              lten : result:=F_AE;
              gtn : result:=F_B;
              gten : result:=F_BE;
              else
                internalerror(2013120107);
            end
          else
            case nodetype of
              ltn : result:=F_B;
              lten : result:=F_BE;
              gtn : result:=F_A;
              gten : result:=F_AE;
              else
                internalerror(2013120108);
            end;
        end
      else
        begin
          if swapped then
            case nodetype of
              ltn : result:=F_G;
              lten : result:=F_GE;
              gtn : result:=F_L;
              gten : result:=F_LE;
              else
                internalerror(2013120105);
            end
          else
            case nodetype of
              ltn : result:=F_L;
              lten : result:=F_LE;
              gtn : result:=F_G;
              gten : result:=F_GE;
              else
                internalerror(2013120106);
            end;
        end;
    end;
  { Returns the fpu flags condition (F_Fxx) for the comparison described by
    nodetype; the ordering conditions are mirrored when nf_swapped is set.
    Any other node type raises an internal error. }
  function tx86addnode.getfpuresflags : tresflags;
    var
      swapped : boolean;
    begin
      swapped:=nf_swapped in flags;
      case nodetype of
        equaln :
          result:=F_FE;
        unequaln :
          result:=F_FNE;
        ltn :
          if swapped then
            result:=F_FA
          else
            result:=F_FB;
        lten :
          if swapped then
            result:=F_FAE
          else
            result:=F_FBE;
        gtn :
          if swapped then
            result:=F_FB
          else
            result:=F_FA;
        gten :
          if swapped then
            result:=F_FBE
          else
            result:=F_FAE;
        else
          begin
            { not a comparison node }
            if swapped then
              internalerror(2014031402)
            else
              internalerror(2014031403);
          end;
      end;
    end;
  436. {*****************************************************************************
  437. AddSmallSet
  438. *****************************************************************************}
  {$ifndef i8086}
  { Generates code for the operations on small sets (sets handled as plain
    integers): union/element inclusion (addn), symmetric difference,
    intersection, difference (subn) and the bitwise xor/or/and nodes.

    The result location is a register. Set difference is implemented as
    AND with the complemented subtrahend; it uses a single NOT when the
    minuend is the constant -1 and the BMI1 ANDN instruction when the
    target cpu supports it. }
  procedure tx86addnode.second_addsmallset;
    var
      setbase : aint;
      opdef : tdef;
      opsize : TCGSize;
      op : TAsmOp;
      extra_not,
      noswap,
      all_member_optimization : boolean;
    begin
      pass_left_right;

      noswap:=false;
      extra_not:=false;
      all_member_optimization:=false;
      opdef:=resultdef;
      opsize:=int_cgsize(opdef.size);
      { take the set base from whichever operand is the set }
      if left.resultdef.typ<>setdef then
        setbase:=tsetdef(right.resultdef).setbase
      else
        setbase:=tsetdef(left.resultdef).setbase;

      case nodetype of
        symdifn,
        xorn :
          op:=A_XOR;
        muln,
        andn :
          op:=A_AND;
        orn :
          op:=A_OR;
        addn :
          begin
            { adding elements is not commutative }
            if (left.nodetype=setelementn) and (nf_swapped in flags) then
              swapleftright;
            if right.nodetype<>setelementn then
              op:=A_OR
            else
              begin
                { we are adding a set element }
                { no range support for smallsets! }
                if assigned(tsetelementnode(right).right) then
                  internalerror(43244);
                { btsb isn't supported }
                if opsize=OS_8 then
                  begin
                    opsize:=OS_32;
                    opdef:=u32inttype;
                  end;
                { bts requires both elements to be registers }
                hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,opdef,false);
                hlcg.location_force_reg(current_asmdata.CurrAsmList,right.location,right.resultdef,opdef,true);
                register_maybe_adjust_setbase(current_asmdata.CurrAsmList,opdef,right.location,setbase);
                op:=A_BTS;
                noswap:=true;
              end;
          end;
        subn :
          begin
            op:=A_AND;
            if nf_swapped in flags then
              begin
                { operands are exchanged: right is the minuend }
                all_member_optimization:=(right.location.loc=LOC_CONSTANT) and (right.location.value=-1);
                if left.location.loc=LOC_CONSTANT then
                  left.location.value:=not(left.location.value)
                else
                  extra_not:=true;
              end
            else
              begin
                all_member_optimization:=(left.location.loc=LOC_CONSTANT) and (left.location.value=-1);
                if right.location.loc=LOC_CONSTANT then
                  right.location.value:=not(right.location.value)
                else
                  extra_not:=true;
              end;
          end;
        else
          internalerror(2003042215);
      end;

      if all_member_optimization then
        begin
          {A set expression [0..31]-x can be implemented with a simple NOT.}
          if nf_swapped in flags then
            begin
              { newly swapped also set swapped flag }
              location_swap(left.location,right.location);
              toggleflag(nf_swapped);
            end;
          hlcg.location_force_reg(current_asmdata.currAsmList,right.location,right.resultdef,opdef,false);
          emit_reg(A_NOT,TCGSize2Opsize[opsize],right.location.register);
          location:=right.location;
        end
      else if (op=A_AND) and extra_not and
         (CPUX86_HAS_BMI1 in cpu_capabilities[current_settings.cputype]) and
         (resultdef.size in [4{$ifdef x86_64},8{$endif x86_64}]) then
        begin
          { the BMI1 instruction andn can be used }
          location_reset(location,LOC_REGISTER,left.location.size);
          location.register:=cg.getintregister(current_asmdata.currAsmList,left.location.size);
          if nf_swapped in flags then
            begin
              location_swap(left.location,right.location);
              toggleflag(nf_swapped);
            end;
          hlcg.location_force_reg(current_asmdata.currAsmList,right.location,right.resultdef,opdef,true);
          { left may stay in memory, everything else goes into a register }
          if not(left.location.loc in [LOC_REGISTER,LOC_CREGISTER,LOC_REFERENCE,LOC_CREFERENCE]) then
            hlcg.location_force_reg(current_asmdata.currAsmList,left.location,left.resultdef,opdef,true);
          case left.location.loc of
            LOC_REFERENCE,LOC_CREFERENCE:
              emit_ref_reg_reg(A_ANDN,TCGSize2Opsize[opsize],left.location.reference,right.location.register,location.register);
            LOC_REGISTER,LOC_CREGISTER:
              emit_reg_reg_reg(A_ANDN,TCGSize2Opsize[opsize],left.location.register,right.location.register,location.register);
            else
              Internalerror(2018040201);
          end;
        end
      else
        begin
          { left must be a register }
          left_must_be_reg(opdef,opsize,noswap);
          emit_generic_code(op,opsize,true,extra_not,false);
          location_freetemp(current_asmdata.CurrAsmList,right.location);
          { left is always a register and contains the result }
          location:=left.location;
        end;

      { fix the changed opsize we did above because of the missing btsb }
      if int_cgsize(resultdef.size)<>opsize then
        hlcg.location_force_reg(current_asmdata.CurrAsmList,location,opdef,cgsize_orddef(int_cgsize(resultdef.size)),false);
    end;
  {$endif not i8086}
  { Generates code for comparisons of small sets; the result is returned in
    the cpu flags (LOC_FLAGS).

    equaln/unequaln are a plain CMP. For lten/gten the operands are first
    arranged (swapping them if necessary), left is ANDed with right and the
    result is then compared with right for equality. Other node types raise
    internal error 2003042204. }
  procedure tx86addnode.second_cmpsmallset;
    var
      opdef : tdef;
      opsize : TCGSize;
    begin
      pass_left_right;
      opdef:=left.resultdef;
      opsize:=int_cgsize(opdef.size);

      if nodetype in [lten,gten] then
        begin
          { swap for an unswapped lten or for a swapped gten }
          if (nodetype=lten)<>(nf_swapped in flags) then
            swapleftright;
          hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,opdef,false);
          emit_op_right_left(A_AND,opsize);
          { warning: ugly hack, we need a JE so change the node to equaln }
          nodetype:=equaln;
        end
      else if not(nodetype in [equaln,unequaln]) then
        internalerror(2003042204);

      { left must be a register }
      left_must_be_reg(opdef,opsize,false);
      { every supported node type ends in a compare }
      emit_generic_code(A_CMP,opsize,true,false,false);
      location_freetemp(current_asmdata.CurrAsmList,right.location);
      location_freetemp(current_asmdata.CurrAsmList,left.location);

      location_reset(location,LOC_FLAGS,OS_NO);
      location.resflags:=getresflags(true);
    end;
  604. {*****************************************************************************
  605. AddMMX
  606. *****************************************************************************}
  {$ifdef SUPPORT_MMX}
  { Generates code for add, subtract, multiply and the bitwise xor/or/and
    operations on MMX operands. The instruction is selected from the MMX
    element type of the left operand and, for add/subtract, from the
    saturation switch (cs_mmx_saturation). The result is returned in an MMX
    register.

    Raises internal error 2003042214 for unsupported node types and
    201408201 when no instruction exists for the element type. }
  procedure tx86addnode.second_opmmx;
    var
      op : TAsmOp;
      cmpop : boolean;
      mmxbase : tmmxtype;
      hreg,
      hregister : tregister;
    begin
      pass_left_right;

      { NOTE(review): cmpop is never set to true in this routine, so the
        final location_freetemp on left is currently dead code }
      cmpop:=false;
      op:=A_NOP;
      mmxbase:=mmx_type(left.resultdef);
      location_reset(location,LOC_MMXREGISTER,def_cgsize(resultdef));

      { select the instruction; op stays A_NOP if the element type is not
        supported by the operation }
      case nodetype of
        addn :
          begin
            if (cs_mmx_saturation in current_settings.localswitches) then
              begin
                case mmxbase of
                  mmxs8bit:
                    op:=A_PADDSB;
                  mmxu8bit:
                    op:=A_PADDUSB;
                  mmxs16bit,mmxfixed16:
                    op:=A_PADDSW;
                  mmxu16bit:
                    op:=A_PADDUSW;
                  else
                    ;
                end;
              end
            else
              begin
                case mmxbase of
                  mmxs8bit,mmxu8bit:
                    op:=A_PADDB;
                  mmxs16bit,mmxu16bit,mmxfixed16:
                    op:=A_PADDW;
                  mmxs32bit,mmxu32bit:
                    op:=A_PADDD;
                  else
                    ;
                end;
              end;
          end;
        muln :
          begin
            case mmxbase of
              mmxs16bit,mmxu16bit:
                op:=A_PMULLW;
              mmxfixed16:
                op:=A_PMULHW;
              else
                ;
            end;
          end;
        subn :
          begin
            if (cs_mmx_saturation in current_settings.localswitches) then
              begin
                case mmxbase of
                  mmxs8bit:
                    op:=A_PSUBSB;
                  mmxu8bit:
                    op:=A_PSUBUSB;
                  mmxs16bit,mmxfixed16:
                    { 16 bit elements need the word variant, like
                      A_PADDSW in the addn case }
                    op:=A_PSUBSW;
                  mmxu16bit:
                    op:=A_PSUBUSW;
                  else
                    ;
                end;
              end
            else
              begin
                case mmxbase of
                  mmxs8bit,mmxu8bit:
                    op:=A_PSUBB;
                  mmxs16bit,mmxu16bit,mmxfixed16:
                    op:=A_PSUBW;
                  mmxs32bit,mmxu32bit:
                    op:=A_PSUBD;
                  else
                    ;
                end;
              end;
          end;
        xorn:
          op:=A_PXOR;
        orn:
          op:=A_POR;
        andn:
          op:=A_PAND;
        else
          internalerror(2003042214);
      end;

      if op = A_NOP then
        internalerror(201408201);

      { left and right no register? }
      { then one must be demanded }
      if (left.location.loc<>LOC_MMXREGISTER) then
        begin
          if (right.location.loc=LOC_MMXREGISTER) then
            begin
              location_swap(left.location,right.location);
              toggleflag(nf_swapped);
            end
          else
            begin
              { register variable ? }
              if (left.location.loc=LOC_CMMXREGISTER) then
                begin
                  hregister:=tcgx86(cg).getmmxregister(current_asmdata.CurrAsmList);
                  emit_reg_reg(A_MOVQ,S_NO,left.location.register,hregister);
                end
              else
                begin
                  if not(left.location.loc in [LOC_REFERENCE,LOC_CREFERENCE]) then
                    internalerror(200203245);

                  hregister:=tcgx86(cg).getmmxregister(current_asmdata.CurrAsmList);
                  tcgx86(cg).make_simple_ref(current_asmdata.CurrAsmList,left.location.reference);
                  emit_ref_reg(A_MOVQ,S_NO,left.location.reference,hregister);
                end;

              location_reset(left.location,LOC_MMXREGISTER,OS_NO);
              left.location.register:=hregister;
            end;
        end;

      { at this point, left.location.loc should be LOC_MMXREGISTER }
      if right.location.loc<>LOC_MMXREGISTER then
        begin
          if (nodetype=subn) and (nf_swapped in flags) then
            begin
              { swapped subtraction: load right into a new register and
                subtract left from it }
              hreg:=tcgx86(cg).getmmxregister(current_asmdata.CurrAsmList);
              if right.location.loc=LOC_CMMXREGISTER then
                begin
                  emit_reg_reg(A_MOVQ,S_NO,right.location.register,hreg);
                  emit_reg_reg(op,S_NO,left.location.register,hreg);
                end
              else
                begin
                  { it is the right operand that is loaded from memory
                    here; left already is an MMX register at this point }
                  if not(right.location.loc in [LOC_REFERENCE,LOC_CREFERENCE]) then
                    internalerror(2002032412);
                  tcgx86(cg).make_simple_ref(current_asmdata.CurrAsmList,right.location.reference);
                  emit_ref_reg(A_MOVQ,S_NO,right.location.reference,hreg);
                  emit_reg_reg(op,S_NO,left.location.register,hreg);
                end;
              location.register:=hreg;
            end
          else
            begin
              if (right.location.loc=LOC_CMMXREGISTER) then
                emit_reg_reg(op,S_NO,right.location.register,left.location.register)
              else
                begin
                  if not(right.location.loc in [LOC_REFERENCE,LOC_CREFERENCE]) then
                    internalerror(200203246);
                  tcgx86(cg).make_simple_ref(current_asmdata.CurrAsmList,right.location.reference);
                  emit_ref_reg(op,S_NO,right.location.reference,left.location.register);
                end;
              location.register:=left.location.register;
            end;
        end
      else
        begin
          { right.location=LOC_MMXREGISTER }
          if (nodetype=subn) and (nf_swapped in flags) then
            begin
              emit_reg_reg(op,S_NO,left.location.register,right.location.register);
              location_swap(left.location,right.location);
              toggleflag(nf_swapped);
            end
          else
            begin
              emit_reg_reg(op,S_NO,right.location.register,left.location.register);
            end;
          location.register:=left.location.register;
        end;

      location_freetemp(current_asmdata.CurrAsmList,right.location);
      if cmpop then
        location_freetemp(current_asmdata.CurrAsmList,left.location);
    end;
  {$endif SUPPORT_MMX}
  790. {*****************************************************************************
  791. AddFloat
  792. *****************************************************************************}
  { Emits SSE code for a scalar floating point addn/subn/muln/slashn node and
    leaves the result in a LOC_MMREGISTER location.
    When the fpu type is SSE3 or later, sqr(a)+sqr(b) and sqr(a)-sqr(b) are
    turned into one packed multiply followed by a horizontal add/subtract. }
  procedure tx86addnode.second_addfloatsse;
    var
      cgop        : topcg;
      fused_sqr   : boolean;
      both_on_fpu : boolean;
      operand     : tnode;
    begin
      { is this sqr(x) +/- sqr(y) on a target offering the horizontal ops? }
      fused_sqr:=
        (current_settings.fputype>=fpu_sse3) and
        use_vectorfpu(resultdef) and
        (nodetype in [addn,subn]) and
        (left.nodetype=inlinen) and (tinlinenode(left).inlinenumber=in_sqr_real) and
        (right.nodetype=inlinen) and (tinlinenode(right).inlinenumber=in_sqr_real);
      if fused_sqr then
        begin
          { detach the argument of each sqr() node, free the sqr() node
            itself and continue with the bare argument as operand }
          operand:=tinlinenode(left).left;
          tinlinenode(left).left:=nil;
          left.free;
          left:=operand;

          operand:=tinlinenode(right).left;
          tinlinenode(right).left:=nil;
          right.free;
          right:=operand;
        end;

      pass_left_right;

      both_on_fpu:=
        (left.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER]) and
        (right.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER]);

      { fpu operands are always in reversed order on the stack }
      if both_on_fpu then
        toggleflag(nf_swapped);

      if nf_swapped in flags then
        begin
          { swapleftright does not help if both operands are on the fpu
            stack: both are "R_ST", nothing would change -> exchange manually }
          if both_on_fpu then
            emit_none(A_FXCH,S_NO)
          else
            swapleftright;
        end;

      { map the node type to the generic code generator operation }
      case nodetype of
        addn :
          cgop:=OP_ADD;
        muln :
          cgop:=OP_MUL;
        subn :
          cgop:=OP_SUB;
        slashn :
          cgop:=OP_DIV;
        else
          internalerror(200312231);
      end;

      location_reset(location,LOC_MMREGISTER,def_cgsize(resultdef));

      if fused_sqr then
        begin
          if nf_swapped in flags then
            swapleftright;

          hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,false);
          hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,right.location,right.resultdef,true);
          { the register holding left becomes the result register }
          location:=left.location;

          if is_double(resultdef) then
            begin
              { pack left (low quadword) and right (high quadword) together }
              current_asmdata.CurrAsmList.concat(taicpu.op_const_reg_reg(A_SHUFPD,S_NO,%00,right.location.register,location.register));
              { square both elements at once }
              current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_MULPD,S_NO,location.register,location.register));
              { combine the two squares horizontally }
              case nodetype of
                addn:
                  current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_HADDPD,S_NO,location.register,location.register));
                subn:
                  current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_HSUBPD,S_NO,location.register,location.register));
                else
                  internalerror(201108162);
              end;
            end
          else
            begin
              { interleave left and right into the low quadword }
              current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_UNPCKLPS,S_NO,right.location.register,location.register));
              { copy the low quadword to the high one so that bits 64..127
                contain valid values as well }
              current_asmdata.CurrAsmList.concat(taicpu.op_const_reg_reg(A_SHUFPD,S_NO,%00,location.register,location.register));
              { square all elements at once }
              current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_MULPS,S_NO,location.register,location.register));
              { combine the squares horizontally }
              case nodetype of
                addn:
                  current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_HADDPS,S_NO,location.register,location.register));
                subn:
                  current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_HSUBPS,S_NO,location.register,location.register));
                else
                  internalerror(201108163);
              end;
            end
        end
      { right may only take the role of the left operand if the operation
        is commutative }
      else if (right.location.loc=LOC_MMREGISTER) and (cgop in [OP_ADD,OP_MUL]) then
        begin
          location.register:=cg.getmmregister(current_asmdata.CurrAsmList,location.size);
          cg.a_loadmm_reg_reg(current_asmdata.CurrAsmList,right.location.size,location.size,right.location.register,location.register,mms_movescalar);
          { an operand in an fpu register is written to memory instead of
            being forced into an mm register: there is no direct fpu->mm
            register copy instruction, so going through memory probably
            gives shorter code }
          if left.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER] then
            hlcg.location_force_mem(current_asmdata.CurrAsmList,left.location,left.resultdef);
          cg.a_opmm_loc_reg(current_asmdata.CurrAsmList,cgop,location.size,left.location,location.register,mms_movescalar);
        end
      else
        begin
          if nf_swapped in flags then
            swapleftright;

          { fpu register operands go through memory, see above: there is no
            direct fpu->mm register copy instruction }
          if left.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER] then
            hlcg.location_force_mem(current_asmdata.CurrAsmList,left.location,left.resultdef);

          { load left into a fresh result register ... }
          location.register:=cg.getmmregister(current_asmdata.CurrAsmList,location.size);
          cg.a_loadmm_loc_reg(current_asmdata.CurrAsmList,location.size,left.location,location.register,mms_movescalar);

          { ... and apply the operation with right as second operand }
          if right.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER] then
            hlcg.location_force_mem(current_asmdata.CurrAsmList,right.location,right.resultdef);
          cg.a_opmm_loc_reg(current_asmdata.CurrAsmList,cgop,location.size,right.location,location.register,mms_movescalar);
        end;
    end;
  { Emits AVX code for a scalar floating point addn/subn/muln/slashn node and
    leaves the result in a LOC_MMREGISTER location. The three operand forms
    are used, so the result always gets a register of its own.
    The sqr(a)+/-sqr(b) detection is only compiled in when "dummy" is
    defined; otherwise the fused path below is never taken. }
  procedure tx86addnode.second_addfloatavx;
    var
      cgop        : topcg;
      fused_sqr   : boolean;
      both_on_fpu : boolean;
{$ifdef dummy}
      operand     : tnode;
{$endif dummy}
    begin
      fused_sqr:=false;
{$ifdef dummy}
      { is this sqr(x) +/- sqr(y) on a target offering the horizontal ops? }
      fused_sqr:=
        (current_settings.fputype>=fpu_sse3) and
        use_vectorfpu(resultdef) and
        (nodetype in [addn,subn]) and
        (left.nodetype=inlinen) and (tinlinenode(left).inlinenumber=in_sqr_real) and
        (right.nodetype=inlinen) and (tinlinenode(right).inlinenumber=in_sqr_real);
      if fused_sqr then
        begin
          { detach the argument of each sqr() node, free the sqr() node
            itself and continue with the bare argument as operand }
          operand:=tinlinenode(left).left;
          tinlinenode(left).left:=nil;
          left.free;
          left:=operand;

          operand:=tinlinenode(right).left;
          tinlinenode(right).left:=nil;
          right.free;
          right:=operand;
        end;
{$endif dummy}

      pass_left_right;

      both_on_fpu:=
        (left.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER]) and
        (right.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER]);

      { fpu operands are always in reversed order on the stack }
      if both_on_fpu then
        toggleflag(nf_swapped);

      if nf_swapped in flags then
        begin
          { swapleftright does not help if both operands are on the fpu
            stack: both are "R_ST", nothing would change -> exchange manually }
          if both_on_fpu then
            emit_none(A_FXCH,S_NO)
          else
            swapleftright;
        end;

      { map the node type to the generic code generator operation }
      case nodetype of
        addn :
          cgop:=OP_ADD;
        muln :
          cgop:=OP_MUL;
        subn :
          cgop:=OP_SUB;
        slashn :
          cgop:=OP_DIV;
        else
          internalerror(2003122303);
      end;

      location_reset(location,LOC_MMREGISTER,def_cgsize(resultdef));

      if fused_sqr then
        begin
          if nf_swapped in flags then
            swapleftright;

          hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,false);
          hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,right.location,right.resultdef,true);
          { the register holding left becomes the result register }
          location:=left.location;

          if is_double(resultdef) then
            begin
              { pack left (low quadword) and right (high quadword) together }
              current_asmdata.CurrAsmList.concat(taicpu.op_const_reg_reg(A_SHUFPD,S_NO,%00,right.location.register,location.register));
              { square both elements at once }
              current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_MULPD,S_NO,location.register,location.register));
              { combine the two squares horizontally }
              case nodetype of
                addn:
                  current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_HADDPD,S_NO,location.register,location.register));
                subn:
                  current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_HSUBPD,S_NO,location.register,location.register));
                else
                  internalerror(2011081601);
              end;
            end
          else
            begin
              { interleave left and right into the low quadword }
              current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_UNPCKLPS,S_NO,right.location.register,location.register));
              { copy the low quadword to the high one so that bits 64..127
                contain valid values as well }
              current_asmdata.CurrAsmList.concat(taicpu.op_const_reg_reg(A_SHUFPD,S_NO,%00,location.register,location.register));
              { square all elements at once }
              current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_MULPS,S_NO,location.register,location.register));
              { combine the squares horizontally }
              case nodetype of
                addn:
                  current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_HADDPS,S_NO,location.register,location.register));
                subn:
                  current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_HSUBPS,S_NO,location.register,location.register));
                else
                  internalerror(2011081604);
              end;
            end
        end
      { left*2: emit left+left instead of a multiplication }
      else if (nodetype=muln) and
              is_constrealnode(right) and
              is_number_float(trealconstnode(right).value_real) and
              (trealconstnode(right).value_real=2) then
        begin
          location.register:=cg.getmmregister(current_asmdata.CurrAsmList,left.location.size);
          hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,true);
          cg.a_opmm_reg_reg_reg(current_asmdata.CurrAsmList,OP_ADD,location.size,
            left.location.register,left.location.register,location.register,
            mms_movescalar);
        end
      { 2*right: emit right+right instead of a multiplication }
      else if (nodetype=muln) and
              is_constrealnode(left) and
              is_number_float(trealconstnode(left).value_real) and
              (trealconstnode(left).value_real=2) then
        begin
          location.register:=cg.getmmregister(current_asmdata.CurrAsmList,right.location.size);
          hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,right.location,right.resultdef,true);
          cg.a_opmm_reg_reg_reg(current_asmdata.CurrAsmList,OP_ADD,location.size,
            right.location.register,right.location.register,location.register,
            mms_movescalar);
        end
      { right may only take the role of the left operand if the operation
        is commutative }
      else if (right.location.loc in [LOC_MMREGISTER,LOC_CMMREGISTER]) and
              (cgop in [OP_ADD,OP_MUL]) then
        begin
          location.register:=cg.getmmregister(current_asmdata.CurrAsmList,left.location.size);
          { an operand in an fpu register is written to memory instead of
            being forced into an mm register: there is no direct fpu->mm
            register copy instruction, so going through memory probably
            gives shorter code }
          if left.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER] then
            hlcg.location_force_mem(current_asmdata.CurrAsmList,left.location,left.resultdef);
          cg.a_opmm_loc_reg_reg(current_asmdata.CurrAsmList,cgop,location.size,
            left.location,right.location.register,location.register,
            mms_movescalar);
        end
      else
        begin
          if nf_swapped in flags then
            swapleftright;

          hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,true);
          location.register:=cg.getmmregister(current_asmdata.CurrAsmList,left.location.size);
          { fpu register operands go through memory, see above: there is no
            direct fpu->mm register copy instruction }
          if right.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER] then
            hlcg.location_force_mem(current_asmdata.CurrAsmList,right.location,right.resultdef);
          cg.a_opmm_loc_reg_reg(current_asmdata.CurrAsmList,cgop,location.size,
            right.location,left.location.register,location.register,
            mms_movescalar);
        end;
    end;
  { First pass: for floating point operations that do not use the vector
    fpu at all, the operands are marked as not regable before the inherited
    first pass runs. }
  function tx86addnode.pass_1: tnode;
    begin
      { on x86, we do not support fpu registers, so in case of operations
        using the x87 it is normally useful not to put the operands into
        registers, as these would be mm registers }
      if (left.resultdef.typ=floatdef) or (right.resultdef.typ=floatdef) then
        begin
          if not(use_vectorfpu(left.resultdef) or
                 use_vectorfpu(right.resultdef) or
                 use_vectorfpu(resultdef)) then
            begin
              make_not_regable(left,[ra_addr_regable]);
              make_not_regable(right,[ra_addr_regable]);
            end;
        end;
      Result:=inherited pass_1;
    end;
  { Returns whether fused multiply-add may be used for this node. On i8086
    the decision is left to the inherited implementation. }
  function tx86addnode.use_fma : boolean;
    begin
{$ifdef i8086}
      Result:=inherited use_fma;
{$else i8086}
      { only worthwhile if the result stays in an xmm register; mixing fpu
        registers and fma makes no sense }
      Result:=use_vectorfpu(resultdef) and
        ((CPUX86_HAS_FMA in cpu_capabilities[current_settings.cputype]) or
         (CPUX86_HAS_FMA4 in cpu_capabilities[current_settings.cputype]));
{$endif i8086}
    end;
  { Emits a scalar single/double comparison using (V)COMISS/(V)COMISD and
    sets the node location to LOC_FLAGS. }
  procedure tx86addnode.second_cmpfloatvector;
    var
      cmpinstr : tasmop;
    begin
      { select the compare instruction by operand type and AVX availability }
      if is_single(left.resultdef) then
        begin
          if UseAVX then
            cmpinstr:=A_VCOMISS
          else
            cmpinstr:=A_COMISS;
        end
      else if is_double(left.resultdef) then
        begin
          if UseAVX then
            cmpinstr:=A_VCOMISD
          else
            cmpinstr:=A_COMISD;
        end
      else
        internalerror(200402222);

      pass_left_right;

      { fpu operands are always in reversed order on the stack }
      if (left.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER]) and
         (right.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER]) then
        toggleflag(nf_swapped);

      location_reset(location,LOC_FLAGS,OS_NO);

      { A direct move fpu->mm register is not possible, so fpu operands are
        forced to memory. They are not forced to mm registers because one
        memory location can be used directly by the compare instruction,
        which yields shorter code. }
      if left.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER] then
        hlcg.location_force_mem(current_asmdata.CurrAsmList,left.location,left.resultdef);
      if right.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER] then
        hlcg.location_force_mem(current_asmdata.CurrAsmList,right.location,right.resultdef);

      if right.location.loc in [LOC_MMREGISTER,LOC_CMMREGISTER] then
        begin
          { right is already in an mm register: compare with the operands
            exchanged and record that by toggling nf_swapped }
          if left.location.loc in [LOC_REFERENCE,LOC_CREFERENCE] then
            begin
              tcgx86(cg).make_simple_ref(current_asmdata.CurrAsmList,left.location.reference);
              current_asmdata.CurrAsmList.concat(taicpu.op_ref_reg(cmpinstr,S_NO,left.location.reference,right.location.register));
            end
          else if left.location.loc in [LOC_MMREGISTER,LOC_CMMREGISTER] then
            current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(cmpinstr,S_NO,left.location.register,right.location.register))
          else
            internalerror(200402221);
          toggleflag(nf_swapped);
        end
      else
        begin
          { bring left into an mm register and compare right against it }
          hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,true);
          if right.location.loc in [LOC_REFERENCE,LOC_CREFERENCE] then
            begin
              tcgx86(cg).make_simple_ref(current_asmdata.CurrAsmList,right.location.reference);
              current_asmdata.CurrAsmList.concat(taicpu.op_ref_reg(cmpinstr,S_NO,right.location.reference,left.location.register));
            end
          else if right.location.loc in [LOC_MMREGISTER,LOC_CMMREGISTER] then
            current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(cmpinstr,S_NO,right.location.register,left.location.register))
          else
            internalerror(200402223);
        end;

      location.resflags:=getfpuresflags;
      location_freetemp(current_asmdata.CurrAsmList,left.location);
      location_freetemp(current_asmdata.CurrAsmList,right.location);
    end;
  { Emits code for an addn/subn/muln/slashn node whose operands are vectors
    fitting into one mm register; the result is placed in a LOC_MMREGISTER
    location. Vectors which do not fit into an mm register are not
    supported yet and raise an internal error. }
  procedure tx86addnode.second_opvector;
    var
      op : topcg;
      elemsize : tcgsize;
    begin
      pass_left_right;
      if (nf_swapped in flags) then
        swapleftright;

      { map the node type to the generic code generator operation }
      case nodetype of
        addn :
          op:=OP_ADD;
        muln :
          op:=OP_MUL;
        subn :
          op:=OP_SUB;
        slashn :
          op:=OP_DIV;
        else
          internalerror(200610071);
      end;

      if fits_in_mm_register(left.resultdef) then
        begin
          { The operands are vectors, so the float type has to be taken from
            the element definition of the array. Casting left.resultdef
            itself to tfloatdef (as the commutative branch used to do) reads
            the floattype field from an object which is not a tfloatdef. }
          elemsize:=tfloat2tcgsize[tfloatdef(tarraydef(left.resultdef).elementdef).floattype];

          location_reset(location,LOC_MMREGISTER,def_cgsize(resultdef));
          { we can use only right as left operand if the operation is commutative }
          if (right.location.loc=LOC_MMREGISTER) and (op in [OP_ADD,OP_MUL]) then
            begin
              if UseAVX then
                begin
                  { three operand form: result goes to a fresh register }
                  location.register:=cg.getmmregister(current_asmdata.CurrAsmList,OS_VECTOR);
                  cg.a_opmm_loc_reg_reg(current_asmdata.CurrAsmList,op,elemsize,left.location,right.location.register,location.register,nil);
                end
              else
                begin
                  { two operand form: the register of right is re-used as result }
                  location.register:=right.location.register;
                  cg.a_opmm_loc_reg(current_asmdata.CurrAsmList,op,elemsize,left.location,location.register,nil);
                end;
            end
          else
            begin
              location_force_mmreg(current_asmdata.CurrAsmList,left.location,false);
              if UseAVX then
                begin
                  { three operand form: result goes to a fresh register }
                  location.register:=cg.getmmregister(current_asmdata.CurrAsmList,OS_VECTOR);
                  cg.a_opmm_loc_reg_reg(current_asmdata.CurrAsmList,op,elemsize,right.location,left.location.register,location.register,nil);
                end
              else
                begin
                  { two operand form: the register of left is re-used as result }
                  location.register:=left.location.register;
                  cg.a_opmm_loc_reg(current_asmdata.CurrAsmList,op,elemsize,right.location,location.register,nil);
                end;
            end;
        end
      else
        begin
          { not yet supported }
          internalerror(200610072);
        end
    end;
  { Emits code for a floating point addn/subn/muln/slashn node. Operations
    handled by the vector fpu are delegated to the AVX or SSE variant;
    everything else is done on the x87 stack with the result in ST. }
  procedure tx86addnode.second_addfloat;
    const
      { last index: false = register form popping the stack,
                    true  = form taking a memory operand }
      add_ops : array[boolean] of TAsmOp = (A_FADDP,A_FADD);
      mul_ops : array[boolean] of TAsmOp = (A_FMULP,A_FMUL);
      { first index: false = normal operand order, true = operands swapped
        (reverse form of the instruction) }
      sub_ops : array[boolean,boolean] of TAsmOp =
        ((A_FSUBP,A_FSUB),(A_FSUBRP,A_FSUBR));
      div_ops : array[boolean,boolean] of TAsmOp =
        ((A_FDIVP,A_FDIV),(A_FDIVRP,A_FDIVR));
    var
      x87op : TAsmOp;
      memnode, conv : tnode;
      from_mem : boolean;
    begin
      if use_vectorfpu(resultdef) then
        begin
          if UseAVX then
            second_addfloatavx
          else
            second_addfloatsse;
          exit;
        end;

      { can the operation do the conversion? If an operand is a type
        conversion of a single or double value, the conversion node is
        dropped and the operation works on its argument directly }
      if (left.nodetype=typeconvn) and
         (is_double(ttypeconvnode(left).left.resultdef) or
          is_single(ttypeconvnode(left).left.resultdef)) then
        begin
          conv:=left;
          left:=ttypeconvnode(left).left;
          ttypeconvnode(conv).left:=nil;
          conv.Free;
        end;
      if (right.nodetype=typeconvn) and
         (is_double(ttypeconvnode(right).left.resultdef) or
          is_single(ttypeconvnode(right).left.resultdef)) then
        begin
          conv:=right;
          right:=ttypeconvnode(right).left;
          ttypeconvnode(conv).left:=nil;
          conv.Free;
        end;

      pass_left_right;
      prepare_x87_locations(memnode);
      { a node returned in memnode supplies the memory operand }
      from_mem:=assigned(memnode);

      case nodetype of
        addn :
          x87op:=add_ops[from_mem];
        muln :
          x87op:=mul_ops[from_mem];
        subn :
          x87op:=sub_ops[nf_swapped in flags,from_mem];
        slashn :
          x87op:=div_ops[nf_swapped in flags,from_mem];
        else
          internalerror(2003042203);
      end;

      if from_mem then
        emit_ref(x87op,tcgsize2opsize[memnode.location.size],memnode.location.reference)
      else
        begin
          emit_reg_reg(x87op,S_NO,NR_ST,NR_ST1);
          { the register form was chosen from the popping opcodes }
          tcgx86(cg).dec_fpu_stack;
        end;

      location_reset(location,LOC_FPUREGISTER,def_cgsize(resultdef));
      location.register:=NR_ST;
    end;
  { Emits code for a floating point comparison and sets the node location to
    LOC_FLAGS. Comparisons handled by the vector fpu are delegated to
    second_cmpfloatvector; otherwise both operands are forced onto the x87
    stack and compared there. }
  procedure tx86addnode.second_cmpfloat;
{$ifdef i8086}
    var
      tmpref: treference;
{$endif i8086}
    begin
      if use_vectorfpu(left.resultdef) or use_vectorfpu(right.resultdef) then
        begin
          second_cmpfloatvector;
          exit;
        end;

      pass_left_right;
      force_left_and_right_fpureg;

{$ifndef x86_64}
      { cpu types below Pentium2: compare with FCOMPP and transfer the fpu
        status word into the cpu flags by hand }
      if current_settings.cputype<cpu_Pentium2 then
        begin
          emit_none(A_FCOMPP,S_NO);
          { both operands are gone from the fpu stack }
          tcgx86(cg).dec_fpu_stack;
          tcgx86(cg).dec_fpu_stack;

          { load fpu flags }
{$ifdef i8086}
          if current_settings.cputype < cpu_286 then
            begin
              { cpu types below 286: store the status word into a two byte
                temp and load its second byte into AH for SAHF }
              tg.gettemp(current_asmdata.CurrAsmList,2,2,tt_normal,tmpref);
              emit_ref(A_FSTSW,S_NO,tmpref);
              cg.getcpuregister(current_asmdata.CurrAsmList,NR_AX);
              inc(tmpref.offset);
              emit_ref_reg(A_MOV,S_B,tmpref,NR_AH);
              { restore the offset before the temp is released }
              dec(tmpref.offset);
              emit_none(A_SAHF,S_NO);
              cg.ungetcpuregister(current_asmdata.CurrAsmList,NR_AX);
              tg.ungettemp(current_asmdata.CurrAsmList,tmpref);
            end
          else
{$endif i8086}
            begin
              { store the status word directly into AX, then SAHF }
              cg.getcpuregister(current_asmdata.CurrAsmList,NR_AX);
              emit_reg(A_FNSTSW,S_NO,NR_AX);
              emit_none(A_SAHF,S_NO);
              cg.ungetcpuregister(current_asmdata.CurrAsmList,NR_AX);
            end;
          { emit an FWAIT if the cs_fpu_fwait local switch is set }
          if cs_fpu_fwait in current_settings.localswitches then
            current_asmdata.CurrAsmList.concat(Taicpu.Op_none(A_FWAIT,S_NO));
        end
      else
{$endif x86_64}
        begin
          { Pentium2 and later, and always on x86_64: FCOMIP is used here
            and no status word transfer is emitted in this branch }
          current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_FCOMIP,S_NO,NR_ST1,NR_ST0));
          { fcomip pops only one fpu register }
          current_asmdata.CurrAsmList.concat(taicpu.op_reg(A_FSTP,S_NO,NR_ST0));
          tcgx86(cg).dec_fpu_stack;
          tcgx86(cg).dec_fpu_stack;
        end;

      location_reset(location,LOC_FLAGS,OS_NO);
      location.resflags:=getfpuresflags;
    end;
  1335. {*****************************************************************************
  1336. Add64bit
  1337. *****************************************************************************}
  { 64 bit arithmetic: with a 64 bit alu this is ordinary ordinal
    arithmetic; other targets have to provide their own implementation. }
  procedure tx86addnode.second_add64bit;
    begin
{$ifndef cpu64bitalu}
      { must be implemented separately }
      internalerror(200402042);
{$else cpu64bitalu}
      second_addordinal;
{$endif cpu64bitalu}
    end;
  { 64 bit comparison: with a 64 bit alu this is an ordinary ordinal
    comparison; other targets have to provide their own implementation. }
  procedure tx86addnode.second_cmp64bit;
    begin
{$ifndef cpu64bitalu}
      { must be implemented separately }
      internalerror(200402043);
{$else cpu64bitalu}
      second_cmpordinal;
{$endif cpu64bitalu}
    end;
  1356. {*****************************************************************************
  1357. AddOrdinal
  1358. *****************************************************************************}
  { Emits code for an ordinal addn/subn/muln/xorn/orn/andn node.
    Depending on the operand locations either the three operand code
    generator methods (separate result register) or the two operand ones
    (result re-uses the register of an operand) are used. An overflow check
    is emitted for addn/subn/muln when needoverflowcheck is set. }
  procedure tx86addnode.second_addordinal;
    var
      opsize : tcgsize;
      unsigned : boolean;
      cgop : topcg;
      checkoverflow : Boolean;
      ovloc : tlocation;
      tmpreg : TRegister;
    begin
      { determine if the operation will be unsigned: this is the case as
        soon as one of the operands is unsigned }
      unsigned:=not(is_signed(left.resultdef)) or
                not(is_signed(right.resultdef));

      { assume no overflow checking is required }
      checkoverflow := false;

      ovloc.loc:=LOC_VOID;

      { select the code generator operation; only addn, muln and subn are
        candidates for overflow checking }
      case nodetype of
        addn:
          begin
            cgop:=OP_ADD;
            checkoverflow:=true;
          end;
        xorn :
          begin
            cgop:=OP_XOR;
          end;
        orn :
          begin
            cgop:=OP_OR;
          end;
        andn:
          begin
            cgop:=OP_AND;
          end;
        muln:
          begin
            checkoverflow:=true;
            { signedness decides between unsigned and signed multiply }
            if unsigned then
              cgop:=OP_MUL
            else
              cgop:=OP_IMUL;
          end;
        subn :
          begin
            checkoverflow:=true;
            cgop:=OP_SUB;
          end;
        else
          internalerror(2015022501);
      end;

      { the check is only emitted if it is actually requested }
      checkoverflow:=
        checkoverflow and
        needoverflowcheck;

      opsize:=def_cgsize(left.resultdef);

      pass_left_right;

      { do we have to allocate a register? If yes, then three opcode instructions are better, however for sub three op code instructions
        make no sense if right is a reference }
      if ((left.location.loc<>LOC_REGISTER) and (right.location.loc<>LOC_REGISTER) and
          ((nodetype<>subn) or not(right.location.loc in [LOC_REFERENCE,LOC_CREFERENCE])) and
          { 3 op mul makes only sense if a constant is involved }
          ((nodetype<>muln) or (left.location.loc=LOC_CONSTANT) or (right.location.loc=LOC_CONSTANT)
{$ifndef i8086}
            { ... or if BMI2 is available and no overflow check is needed }
            or ((CPUX86_HAS_BMI2 in cpu_capabilities[current_settings.cputype]) and (not(needoverflowcheck))
               )
{$endif i8086}
          ) and
          (not(nodetype in [orn,andn,xorn]))) or
         ((nodetype=addn) and (left.location.loc in [LOC_REGISTER,LOC_CREGISTER,LOC_CONSTANT]) and (right.location.loc in [LOC_REGISTER,LOC_CREGISTER,LOC_CONSTANT])) then
        begin
          { allocate registers }
          force_reg_left_right(false,true);
          set_result_location_reg;
          if nodetype<>subn then
            begin
              { right is either a register or a constant }
              if (right.location.loc<>LOC_CONSTANT) then
                hlcg.a_op_reg_reg_reg_checkoverflow(current_asmdata.CurrAsmList,cgop,resultdef,
                   left.location.register,right.location.register,
                   location.register,checkoverflow,ovloc)
              else
                hlcg.a_op_const_reg_reg_checkoverflow(current_asmdata.CurrAsmList,cgop,resultdef,
                   right.location.value,left.location.register,
                   location.register,checkoverflow,ovloc);
            end
          else  { subtract is a special case since its not commutative }
            begin
              if (nf_swapped in flags) then
                swapleftright;
              if left.location.loc<>LOC_CONSTANT then
                begin
                  if right.location.loc<>LOC_CONSTANT then
                    hlcg.a_op_reg_reg_reg_checkoverflow(current_asmdata.CurrAsmList,OP_SUB,resultdef,
                        right.location.register,left.location.register,
                        location.register,checkoverflow,ovloc)
                  else
                    hlcg.a_op_const_reg_reg_checkoverflow(current_asmdata.CurrAsmList,OP_SUB,resultdef,
                        right.location.value,left.location.register,
                        location.register,checkoverflow,ovloc);
                end
              else
                begin
                  { left is a constant: load it into a temporary register
                    first, then subtract right from it }
                  tmpreg:=hlcg.getintregister(current_asmdata.CurrAsmList,resultdef);
                  hlcg.a_load_const_reg(current_asmdata.CurrAsmList,resultdef,
                    left.location.value,tmpreg);
                  hlcg.a_op_reg_reg_reg_checkoverflow(current_asmdata.CurrAsmList,OP_SUB,resultdef,
                    right.location.register,tmpreg,location.register,checkoverflow,ovloc);
                end;
            end
        end
      else
        begin
          { at least one location should be a register, if yes, try to re-use it, so we can try two operand opcodes }
          if left.location.loc<>LOC_REGISTER then
            begin
              if right.location.loc<>LOC_REGISTER then
                hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,left.resultdef,false)
              else
                begin
                  { right is the one in a register: exchange the locations
                    and remember the exchange in nf_swapped }
                  location_swap(left.location,right.location);
                  toggleflag(nf_swapped);
                end;
            end;

          { at this point, left.location.loc should be LOC_REGISTER }
          if right.location.loc=LOC_REGISTER then
            begin
              { when swapped another result register }
              if (nodetype=subn) and (nf_swapped in flags) then
                begin
                  cg.a_op_reg_reg(current_asmdata.CurrAsmList,cgop,opsize,
                    left.location.register,right.location.register);
                  { the result is in the register of right now; swap the
                    locations back so that left holds the result again }
                  location_swap(left.location,right.location);
                  toggleflag(nf_swapped);
                end
              else
                cg.a_op_reg_reg(current_asmdata.CurrAsmList,cgop,opsize,
                  right.location.register,left.location.register);
            end
          else
            begin
              { right.location<>LOC_REGISTER }
              { subset locations are loaded into a register first }
              if right.location.loc in [LOC_CSUBSETREF,LOC_CSUBSETREG,LOC_SUBSETREF,LOC_SUBSETREG] then
                hlcg.location_force_reg(current_asmdata.CurrAsmList,right.location,right.resultdef,left.resultdef,true);
              if (nodetype=subn) and (nf_swapped in flags) then
                begin
                  { swapped subtraction: load right into a new register,
                    apply the operation with the old left register as
                    source and let left refer to the new register }
                  tmpreg:=left.location.register;
                  left.location.register:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
                  cg.a_load_loc_reg(current_asmdata.CurrAsmList,opsize,right.location,left.location.register);
                  cg.a_op_reg_reg(current_asmdata.CurrAsmList,cgop,opsize,tmpreg,left.location.register);
                end
              else
                cg.a_op_loc_reg(current_asmdata.CurrAsmList,cgop,opsize,right.location,left.location.register);
              location_freetemp(current_asmdata.CurrAsmList,right.location);
            end;

          { the result is where left is }
          location_copy(location,left.location);
        end;

      { emit overflow check if required }
      if checkoverflow then
        cg.g_overflowcheck_loc(current_asmdata.CurrAsmList,Location,resultdef,ovloc);
    end;
  { Boolean and/or/xor: "and"/"or" nodes evaluated with short circuiting
    and 64 bit booleans are handled by the inherited implementation, all
    other cases are treated as ordinal arithmetic. }
  procedure tx86addnode.second_addboolean;
    var
      short_circuit : boolean;
    begin
      { "and"/"or" which is either flagged as short boolean or compiled
        without full boolean evaluation }
      short_circuit:=
        (nodetype in [orn,andn]) and
        ((nf_short_bool in flags) or
         not(cs_full_boolean_eval in current_settings.localswitches));

      if short_circuit or is_64bit(left.resultdef) then
        inherited second_addboolean
      else
        second_addordinal;
    end;
  { Emits a CMP for an ordinal comparison and sets the node location to
    LOC_FLAGS with the result flags matching the signedness. }
  procedure tx86addnode.second_cmpordinal;
    var
      cmpdef       : tdef;
      cmpsize      : tcgsize;
      unsigned_cmp : boolean;
      cmp_imm_mem  : boolean;
    begin
      { the comparison is unsigned as soon as one operand is unsigned }
      unsigned_cmp:=not(is_signed(left.resultdef) and is_signed(right.resultdef));
      cmpdef:=left.resultdef;
      cmpsize:=def_cgsize(cmpdef);

      pass_left_right;

      { a constant can be compared directly against a memory operand ... }
      cmp_imm_mem:=
        (right.location.loc=LOC_CONSTANT) and
        (left.location.loc in [LOC_REFERENCE,LOC_CREFERENCE]);
{$ifdef x86_64}
      { ... but for 64 bit operands only if the constant is within the
        longint range }
      if cmp_imm_mem and (cmpsize in [OS_64,OS_S64]) then
        cmp_imm_mem:=
          (right.location.value>=low(longint)) and
          (right.location.value<=high(longint));
{$endif x86_64}

      if cmp_imm_mem then
        begin
          emit_const_ref(A_CMP,TCGSize2Opsize[cmpsize],right.location.value,left.location.reference);
          location_freetemp(current_asmdata.CurrAsmList,left.location);
        end
      else
        begin
          left_must_be_reg(cmpdef,cmpsize,false);
          emit_generic_code(A_CMP,cmpsize,unsigned_cmp,false,false);
          location_freetemp(current_asmdata.CurrAsmList,right.location);
          location_freetemp(current_asmdata.CurrAsmList,left.location);
        end;

      location_reset(location,LOC_FLAGS,OS_NO);
      location.resflags:=getresflags(unsigned_cmp);
    end;
  { unit initialization: make tx86addnode the class stored in caddnode
    (presumably the class reference used when add nodes are created -
    confirm against the declaration of caddnode) }
  begin
    caddnode:=tx86addnode;
  end.