nx86add.pas 84 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063
  1. {
  2. Copyright (c) 2000-2002 by Florian Klaempfl
  3. Common code generation for add nodes on the i386 and x86
  4. This program is free software; you can redistribute it and/or modify
  5. it under the terms of the GNU General Public License as published by
  6. the Free Software Foundation; either version 2 of the License, or
  7. (at your option) any later version.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. GNU General Public License for more details.
  12. You should have received a copy of the GNU General Public License
  13. along with this program; if not, write to the Free Software
  14. Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  15. ****************************************************************************
  16. }
  17. unit nx86add;
  18. {$i fpcdefs.inc}
  19. interface
  20. uses
  21. symtype,
  22. cgbase,
  23. cpubase,
  24. node,nadd,ncgadd;
  25. type
  { Common code generation for add nodes on the x86 family. }
  tx86addnode = class(tcgaddnode)
  protected
    { --- mapping of the comparison node type to CPU/FPU result flags --- }
    function getresflags(unsigned : boolean) : tresflags;
    function getfpuresflags : tresflags;

    { --- operand preparation --- }
    procedure left_must_be_reg(opdef: tdef; opsize:TCGSize;noswap:boolean);
    procedure force_left_and_right_fpureg;
    procedure prepare_x87_locations(out refnode: tnode);

    { --- instruction emission --- }
    procedure emit_op_right_left(op:TAsmOp;opsize:TCgSize);
    procedure emit_generic_code(op:TAsmOp;opsize:TCgSize;unsigned,extra_not,mboverflow:boolean);

    { --- floating point / vector helpers --- }
    procedure second_cmpfloatvector;
    procedure second_addfloatsse;
    procedure second_addfloatavx;
  public
    { first pass and tree simplification }
    function pass_1 : tnode;override;
    function simplify(forinline : boolean) : tnode; override;
    function use_fma : boolean;override;

    { second pass (code generation) overrides }
    procedure second_addfloat;override;
{$ifndef i8086}
    procedure second_addsmallset;override;
{$endif not i8086}
    procedure second_add64bit;override;
    procedure second_cmpfloat;override;
    procedure second_cmpsmallset;override;
    procedure second_cmp64bit;override;
    procedure second_cmpordinal;override;
    procedure second_addordinal;override;
    procedure second_addboolean;override;
{$ifdef SUPPORT_MMX}
    procedure second_opmmx;override;
{$endif SUPPORT_MMX}
    procedure second_opvector;override;
  end;
  58. implementation
  59. uses
  60. globtype,globals,
  61. verbose,cutils,compinnr,
  62. cpuinfo,
  63. aasmbase,aasmdata,aasmcpu,
  64. symconst,symdef,
  65. cgobj,hlcgobj,cgx86,cga,cgutils,
  66. tgobj,ncgutil,
  67. ncon,nset,ninl,ncnv,ncal,nmat,
  68. defutil,defcmp,constexp,
  69. htypechk;
  70. { Range check must be disabled explicitly as the code serves
  71. on three different architecture sizes }
  72. {$R-}
  73. {*****************************************************************************
  74. Helpers
  75. *****************************************************************************}
  { Emits the integer instruction 'op' that combines right.location into the
    register holding left.location, followed (optionally) by overflow
    checking code.

    On entry left.location.loc is expected to be LOC_REGISTER.

    Parameters:
      op         - x86 opcode to emit
      opsize     - size of the operation
      unsigned   - selects the overflow test: F_AE (carry clear) when true,
                   F_NO (overflow clear) when false
      extra_not  - one operand has to be inverted with NOT before the
                   operation
      mboverflow - the operation may overflow; the FPC_OVERFLOW check is only
                   emitted when needoverflowcheck is true as well

    Several cheaper encodings are used for constant right operands (TEST,
    INC, DEC, SHL, LEA).  Except for TEST, which replaces a comparison, all
    of them are restricted to the case where no overflow check is needed,
    because the overflow check emitted at the end tests the flags left
    behind by 'op' itself. }
  procedure tx86addnode.emit_generic_code(op:TAsmOp;opsize:TCGSize;unsigned,extra_not,mboverflow:boolean);
    var
      power : longint;
      hl4 : tasmlabel;
      r : Tregister;
      href : treference;
      overflowcheck: boolean;
    begin
      overflowcheck:=needoverflowcheck;
      { at this point, left.location.loc should be LOC_REGISTER }
      if right.location.loc=LOC_REGISTER then
        begin
          { right.location is a LOC_REGISTER }
          { when swapped another result register }
          if (nodetype=subn) and (nf_swapped in flags) then
            begin
              if extra_not then
                emit_reg(A_NOT,TCGSize2Opsize[opsize],left.location.register);
              emit_reg_reg(op,TCGSize2Opsize[opsize],left.location.register,right.location.register);
              { newly swapped also set swapped flag }
              location_swap(left.location,right.location);
              toggleflag(nf_swapped);
            end
          else
            begin
              if extra_not then
                emit_reg(A_NOT,TCGSize2Opsize[opsize],right.location.register);
              { for these opcodes the locations are exchanged before emitting }
              if (op=A_ADD) or (op=A_OR) or (op=A_AND) or (op=A_XOR) or (op=A_IMUL) then
                location_swap(left.location,right.location);
              emit_reg_reg(op,TCGSize2Opsize[opsize],right.location.register,left.location.register);
            end;
        end
      else
        begin
          { right.location is not a LOC_REGISTER }
          if (nodetype=subn) and (nf_swapped in flags) then
            begin
              if extra_not then
                cg.a_op_reg_reg(current_asmdata.CurrAsmList,OP_NOT,opsize,left.location.register,left.location.register);
              { load right into a scratch register, operate on it and copy the
                result back into the left register }
              r:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
              hlcg.a_load_loc_reg(current_asmdata.CurrAsmList,right.resultdef,cgsize_orddef(opsize),right.location,r);
              emit_reg_reg(op,TCGSize2Opsize[opsize],left.location.register,r);
              cg.a_load_reg_reg(current_asmdata.CurrAsmList,opsize,opsize,r,left.location.register);
            end
          else
            begin
              { Optimizations when right.location is a constant value }
              if (op=A_CMP) and
                 (nodetype in [equaln,unequaln]) and
                 (right.location.loc=LOC_CONSTANT) and
                 (right.location.value=0) then
                begin
                  { 'test $-1,%reg' is transformable into 'test $-1,spilltemp' if %reg needs
                    spilling, while 'test %reg,%reg' still requires loading into register.
                    If spilling is not necessary, it is changed back into 'test %reg,%reg' by
                    peephole optimizer (this optimization is currently available only for i386). }
                  cg.a_reg_alloc(current_asmdata.CurrAsmList,NR_DEFAULTFLAGS);
{$ifdef i386}
                  emit_const_reg(A_TEST,TCGSize2Opsize[opsize],aint(-1),left.location.register)
{$else i386}
                  emit_reg_reg(A_TEST,TCGSize2Opsize[opsize],left.location.register,left.location.register);
{$endif i386}
                end
              else
                if (op=A_ADD) and
                   (right.location.loc=LOC_CONSTANT) and
                   (right.location.value=1) and
                   not overflowcheck and
                   UseIncDec then
                  begin
                    emit_reg(A_INC,TCGSize2Opsize[opsize],left.location.register);
                  end
              else
                if (op=A_SUB) and
                   (right.location.loc=LOC_CONSTANT) and
                   (right.location.value=1) and
                   not overflowcheck and
                   UseIncDec then
                  begin
                    emit_reg(A_DEC,TCGSize2Opsize[opsize],left.location.register);
                  end
              else
                { multiplication by a power of two becomes a shift; this is
                  only valid without overflow checking, as SHL does not
                  report overflow through the flags the way IMUL does }
                if (op=A_IMUL) and
                   (right.location.loc=LOC_CONSTANT) and
                   (ispowerof2(int64(right.location.value),power)) and
                   not overflowcheck then
                  begin
                    emit_const_reg(A_SHL,TCGSize2Opsize[opsize],power,left.location.register);
                  end
              else if (op=A_IMUL) and
                 (right.location.loc=LOC_CONSTANT) and
                 (right.location.value>1) and (ispowerof2(int64(right.location.value)-1,power)) and
                 (power in [1..3]) and
                 not overflowcheck then
                begin
                  { multiplication by 3, 5 or 9: lea (reg,reg,value-1),newreg }
                  reference_reset_base(href,left.location.register,0,ctempposinvalid,0,[]);
                  href.index:=left.location.register;
                  href.scalefactor:=int64(right.location.value)-1;
                  left.location.register:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
                  current_asmdata.CurrAsmList.concat(taicpu.op_ref_reg(A_LEA,TCgSize2OpSize[opsize],href,left.location.register));
                end
              else
                begin
                  if extra_not then
                    begin
                      { invert a copy of right and AND it into left }
                      r:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
                      hlcg.a_load_loc_reg(current_asmdata.CurrAsmList,right.resultdef,cgsize_orddef(opsize),right.location,r);
                      emit_reg(A_NOT,TCGSize2Opsize[opsize],r);
                      if mboverflow and overflowcheck then
                        cg.a_reg_alloc(current_asmdata.CurrAsmList, NR_DEFAULTFLAGS);
                      emit_reg_reg(A_AND,TCGSize2Opsize[opsize],r,left.location.register);
                    end
                  else
                    begin
                      if mboverflow and overflowcheck then
                        cg.a_reg_alloc(current_asmdata.CurrAsmList, NR_DEFAULTFLAGS);
                      emit_op_right_left(op,opsize);
                    end;
                end;
            end;
        end;

      { only in case of overflow operations }
      { produce overflow code }
      { we must put it here directly, because sign of operation }
      { is in unsigned VAR!! }
      if mboverflow then
        begin
          if overflowcheck then
            begin
              { jump over the FPC_OVERFLOW call when no overflow occurred }
              current_asmdata.getjumplabel(hl4);
              if unsigned then
                cg.a_jmp_flags(current_asmdata.CurrAsmList,F_AE,hl4)
              else
                cg.a_jmp_flags(current_asmdata.CurrAsmList,F_NO,hl4);
              cg.a_reg_dealloc(current_asmdata.CurrAsmList, NR_DEFAULTFLAGS);
              cg.a_call_name(current_asmdata.CurrAsmList,'FPC_OVERFLOW',false);
              cg.a_label(current_asmdata.CurrAsmList,hl4);
            end;
        end;
    end;
  { Brings the left operand into a register, exchanging the two operand
    locations instead when that is allowed (noswap=false) and the right
    operand already sits in a (constant) register.  Afterwards every
    non-constant operand whose size does not match opsize (ignoring
    signedness) is forced into a register of type opdef. }
  procedure tx86addnode.left_must_be_reg(opdef: tdef; opsize:TCGSize;noswap:boolean);
    var
      is_comparison,
      right_was_creg : boolean;
    begin
      { maybe we can reuse a constant register when the
        operation is a comparison that doesn't change the
        value of the register }
      is_comparison:=nodetype in [ltn,lten,gtn,gten,equaln,unequaln];

      if left.location.loc<>LOC_REGISTER then
        begin
          if noswap or
             not(right.location.loc in [LOC_REGISTER,LOC_CREGISTER]) then
            { no swap possible: load left into a register }
            hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,opdef,is_comparison)
          else
            begin
              { right is in a register, so exchange the locations }
              right_was_creg:=(right.location.loc=LOC_CREGISTER);
              location_swap(left.location,right.location);
              toggleflag(nf_swapped);
              if right_was_creg then
                begin
                  { the new left is a constant register: force it as well }
                  hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,opdef,is_comparison);
                  location:=left.location;
                end;
            end;
        end;

      { make the operand sizes agree with the size of the operation }
      if (right.location.loc<>LOC_CONSTANT) and
         (tcgsize2unsigned[opsize]<>tcgsize2unsigned[right.location.size]) then
        hlcg.location_force_reg(current_asmdata.CurrAsmList,right.location,right.resultdef,opdef,true);
      if (left.location.loc<>LOC_CONSTANT) and
         (tcgsize2unsigned[opsize]<>tcgsize2unsigned[left.location.size]) then
        hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,opdef,false);
    end;
  { Loads both operands into FPU registers.  The right operand is loaded
    first when it is not there yet; if the left operand already was in an
    FPU register, nf_swapped is toggled instead of loading it. }
  procedure tx86addnode.force_left_and_right_fpureg;
    begin
      if right.location.loc<>LOC_FPUREGISTER then
        hlcg.location_force_fpureg(current_asmdata.CurrAsmList,right.location,right.resultdef,false);

      if left.location.loc=LOC_FPUREGISTER then
        { left was on the stack already, i.e. the fpu operands are in the
          wrong order on the stack => swap }
        toggleflag(nf_swapped)
      else
        { the nominator in st0 }
        hlcg.location_force_fpureg(current_asmdata.CurrAsmList,left.location,left.resultdef,false);
    end;
  { Makes both sides suitable for executing an x87 instruction:
    at most one side that is an OS_F32/OS_F64-sized LOC_(C)REFERENCE is
    returned in 'refnode' (nil if there is none), everything else is loaded
    onto the FPU stack.  nf_swapped is toggled as needed. }
  procedure tx86addnode.prepare_x87_locations(out refnode: tnode);
    var
      on_fpu_stack : longint;
    begin
      refnode:=nil;

      { later on, no mm registers are allowed, so transfer everything to memory
        here; below it is loaded into an fpu register if needed }
      if left.location.loc in [LOC_CMMREGISTER,LOC_MMREGISTER] then
        hlcg.location_force_mem(current_asmdata.CurrAsmList,left.location,left.resultdef);
      if right.location.loc in [LOC_CMMREGISTER,LOC_MMREGISTER] then
        hlcg.location_force_mem(current_asmdata.CurrAsmList,right.location,right.resultdef);

      { number of operands which already are in an FPU register }
      on_fpu_stack:=ord(left.location.loc=LOC_FPUREGISTER)+
                    ord(right.location.loc=LOC_FPUREGISTER);
      case on_fpu_stack of
        0:
          begin
            { right always goes onto the stack, left may stay in memory }
            hlcg.location_force_fpureg(current_asmdata.CurrAsmList,right.location,right.resultdef,false);
            if not(left.location.loc in [LOC_REFERENCE,LOC_CREFERENCE]) then
              InternalError(2013090803);
            if not(left.location.size in [OS_F32,OS_F64]) then
              hlcg.location_force_fpureg(current_asmdata.CurrAsmList,left.location,left.resultdef,false)
            else
              begin
                refnode:=left;
                toggleflag(nf_swapped);
              end;
          end;
        1:
          begin
            { pick the side which is not on the stack }
            if left.location.loc<>LOC_FPUREGISTER then
              refnode:=left
            else
              refnode:=right;
            if not(refnode.location.loc in [LOC_REFERENCE,LOC_CREFERENCE]) then
              InternalError(2013090801);
            if refnode.location.size in [OS_F32,OS_F64] then
              begin
                { usable as memory operand }
                if refnode=left then
                  toggleflag(nf_swapped);
              end
            else
              begin
                { other sizes have to be loaded onto the stack as well }
                hlcg.location_force_fpureg(current_asmdata.CurrAsmList,refnode.location,refnode.resultdef,false);
                if refnode=right then
                  toggleflag(nf_swapped);
                refnode:=nil;
              end;
          end;
        2:
          { fpu operands are always in the wrong order on the stack }
          toggleflag(nf_swapped);
        else
          InternalError(2013090802);
      end;
    end;
  { Emits 'op' with right.location as source operand and
    left.location.register as destination.  A right operand living in a
    subset register/reference is loaded into a register first; any location
    kind other than register, reference or constant is an internal error. }
  procedure tx86addnode.emit_op_right_left(op:TAsmOp;opsize:TCgsize);
{$ifdef x86_64}
    var
      tmpreg : tregister;
{$endif x86_64}
    begin
      if right.location.loc in [LOC_CSUBSETREG,LOC_SUBSETREG,LOC_SUBSETREF,LOC_CSUBSETREF] then
        hlcg.location_force_reg(current_asmdata.CurrAsmList,right.location,right.resultdef,right.resultdef,true);

      { left must be a register }
      case right.location.loc of
        LOC_CONSTANT :
          begin
{$ifdef x86_64}
            { x86_64 only supports signed 32 bits constants directly }
            if (opsize in [OS_S64,OS_64]) and
               not((right.location.value>=low(longint)) and (right.location.value<=high(longint))) then
              begin
                { larger constants go through a temporary register }
                tmpreg:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
                cg.a_load_const_reg(current_asmdata.CurrAsmList,opsize,right.location.value,tmpreg);
                current_asmdata.CurrAsmList.concat(
                  taicpu.op_reg_reg(op,TCGSize2Opsize[opsize],tmpreg,left.location.register));
              end
            else
{$endif x86_64}
              current_asmdata.CurrAsmList.concat(
                taicpu.op_const_reg(op,TCGSize2Opsize[opsize],right.location.value,left.location.register));
          end;
        LOC_REFERENCE,
        LOC_CREFERENCE :
          begin
            tcgx86(cg).make_simple_ref(current_asmdata.CurrAsmList,right.location.reference);
            current_asmdata.CurrAsmList.concat(
              taicpu.op_ref_reg(op,TCGSize2Opsize[opsize],right.location.reference,left.location.register));
          end;
        LOC_REGISTER,
        LOC_CREGISTER :
          current_asmdata.CurrAsmList.concat(
            taicpu.op_reg_reg(op,TCGSize2Opsize[opsize],right.location.register,left.location.register));
        else
          internalerror(200203232);
      end;
    end;
  { Returns the CPU flag condition matching the comparison in nodetype.
    'unsigned' selects the above/below conditions instead of the
    greater/less ones; when nf_swapped is set the direction of the ordering
    comparisons is mirrored.  Node types other than the six comparison
    operators raise an internal error. }
  function tx86addnode.getresflags(unsigned : boolean) : tresflags;
    begin
      if nodetype=equaln then
        getresflags:=F_E
      else if nodetype=unequaln then
        getresflags:=F_NE
      else if unsigned then
        begin
          { unsigned ordering: above/below }
          if nf_swapped in flags then
            begin
              case nodetype of
                ltn  : getresflags:=F_A;
                lten : getresflags:=F_AE;
                gtn  : getresflags:=F_B;
                gten : getresflags:=F_BE;
                else
                  internalerror(2013120107);
              end;
            end
          else
            begin
              case nodetype of
                ltn  : getresflags:=F_B;
                lten : getresflags:=F_BE;
                gtn  : getresflags:=F_A;
                gten : getresflags:=F_AE;
                else
                  internalerror(2013120108);
              end;
            end;
        end
      else
        begin
          { signed ordering: greater/less }
          if nf_swapped in flags then
            begin
              case nodetype of
                ltn  : getresflags:=F_G;
                lten : getresflags:=F_GE;
                gtn  : getresflags:=F_L;
                gten : getresflags:=F_LE;
                else
                  internalerror(2013120105);
              end;
            end
          else
            begin
              case nodetype of
                ltn  : getresflags:=F_L;
                lten : getresflags:=F_LE;
                gtn  : getresflags:=F_G;
                gten : getresflags:=F_GE;
                else
                  internalerror(2013120106);
              end;
            end;
        end;
    end;
  { Returns the floating point flag condition (F_F*) matching the comparison
    in nodetype; the ordering comparisons are mirrored when nf_swapped is
    set.  Other node types raise an internal error. }
  function tx86addnode.getfpuresflags : tresflags;
    begin
      case nodetype of
        equaln :
          result:=F_FE;
        unequaln :
          result:=F_FNE;
        else
          begin
            if nf_swapped in flags then
              begin
                { operands exchanged: mirror the condition }
                case nodetype of
                  ltn  : result:=F_FA;
                  lten : result:=F_FAE;
                  gtn  : result:=F_FB;
                  gten : result:=F_FBE;
                  else
                    internalerror(2014031402);
                end;
              end
            else
              begin
                case nodetype of
                  ltn  : result:=F_FB;
                  lten : result:=F_FBE;
                  gtn  : result:=F_FA;
                  gten : result:=F_FAE;
                  else
                    internalerror(2014031403);
                end;
              end;
          end;
      end;
    end;
  442. {*****************************************************************************
  443. AddSmallSet
  444. *****************************************************************************}
  445. {$ifndef i8086}
  { Generates code for the arithmetic/logical operators on small sets.
    The node type selects the opcode (OR/BTS for addn, XOR, AND; subn is
    AND with one inverted operand).  Three strategies are used:
      - a single NOT when the minuend of a set difference is the constant -1,
      - the BMI1 ANDN instruction for "and not" when the CPU has it and the
        result is 4 (or, on x86_64, 8) bytes in size,
      - otherwise the generic code via left_must_be_reg/emit_generic_code. }
  procedure tx86addnode.second_addsmallset;
    var
      setbase : aint;
      opdef : tdef;
      opsize : TCGSize;
      op : TAsmOp;
      swapped,
      extra_not,
      noswap,
      all_member_optimization : boolean;
    begin
      pass_left_right;

      noswap:=false;
      extra_not:=false;
      all_member_optimization:=false;
      opdef:=resultdef;
      opsize:=int_cgsize(opdef.size);

      { take the set base from whichever side is the set }
      if left.resultdef.typ<>setdef then
        setbase:=tsetdef(right.resultdef).setbase
      else
        setbase:=tsetdef(left.resultdef).setbase;

      case nodetype of
        addn :
          begin
            { adding elements is not commutative }
            if (nf_swapped in flags) and (left.nodetype=setelementn) then
              swapleftright;
            { are we adding set elements ? }
            if right.nodetype<>setelementn then
              op:=A_OR
            else
              begin
                { no range support for smallsets! }
                if assigned(tsetelementnode(right).right) then
                  internalerror(43244);
                { btsb isn't supported }
                if opsize=OS_8 then
                  begin
                    opsize:=OS_32;
                    opdef:=u32inttype;
                  end;
                { bts requires both elements to be registers }
                hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,opdef,false);
                hlcg.location_force_reg(current_asmdata.CurrAsmList,right.location,right.resultdef,opdef,true);
                register_maybe_adjust_setbase(current_asmdata.CurrAsmList,opdef,right.location,setbase);
                op:=A_BTS;
                noswap:=true;
              end;
          end;
        symdifn,
        xorn :
          op:=A_XOR;
        orn :
          op:=A_OR;
        muln,
        andn :
          op:=A_AND;
        subn :
          begin
            { a-b is a and not(b) }
            op:=A_AND;
            swapped:=nf_swapped in flags;
            if swapped then
              begin
                { the minuend is stored in right, the subtrahend in left }
                if (right.location.loc=LOC_CONSTANT) and (right.location.value=-1) then
                  all_member_optimization:=true;
                if left.location.loc=LOC_CONSTANT then
                  left.location.value:=not(left.location.value)
                else
                  extra_not:=true;
              end
            else
              begin
                if (left.location.loc=LOC_CONSTANT) and (left.location.value=-1) then
                  all_member_optimization:=true;
                if right.location.loc=LOC_CONSTANT then
                  right.location.value:=not(right.location.value)
                else
                  extra_not:=true;
              end;
          end;
        else
          internalerror(2003042215);
      end;

      if all_member_optimization then
        begin
          {A set expression [0..31]-x can be implemented with a simple NOT.}
          if nf_swapped in flags then
            begin
              { newly swapped also set swapped flag }
              location_swap(left.location,right.location);
              toggleflag(nf_swapped);
            end;
          hlcg.location_force_reg(current_asmdata.currAsmList,right.location,right.resultdef,opdef,false);
          emit_reg(A_NOT,TCGSize2Opsize[opsize],right.location.register);
          location:=right.location;
        end
      { can we use the BMI1 instruction andn? }
      else if (op=A_AND) and extra_not and (CPUX86_HAS_BMI1 in cpu_capabilities[current_settings.cputype]) and
         (resultdef.size in [4{$ifdef x86_64},8{$endif x86_64}]) then
        begin
          { andn writes to a separate destination register }
          location_reset(location,LOC_REGISTER,left.location.size);
          location.register:=cg.getintregister(current_asmdata.currAsmList,left.location.size);

          if nf_swapped in flags then
            begin
              location_swap(left.location,right.location);
              toggleflag(nf_swapped);
            end;

          hlcg.location_force_reg(current_asmdata.currAsmList,right.location,right.resultdef,opdef,true);

          { left may be a register or a memory reference }
          if not(left.location.loc in [LOC_CREGISTER,LOC_REGISTER,LOC_CREFERENCE,LOC_REFERENCE]) then
            hlcg.location_force_reg(current_asmdata.currAsmList,left.location,left.resultdef,opdef,true);

          case left.location.loc of
            LOC_CREFERENCE,LOC_REFERENCE:
              emit_ref_reg_reg(A_ANDN,TCGSize2Opsize[opsize],left.location.reference,right.location.register,location.register);
            LOC_CREGISTER,LOC_REGISTER:
              emit_reg_reg_reg(A_ANDN,TCGSize2Opsize[opsize],left.location.register,right.location.register,location.register);
            else
              Internalerror(2018040201);
          end;
        end
      else
        begin
          { left must be a register }
          left_must_be_reg(opdef,opsize,noswap);
          emit_generic_code(op,opsize,true,extra_not,false);
          location_freetemp(current_asmdata.CurrAsmList,right.location);

          { left is always a register and contains the result }
          location:=left.location;
        end;

      { fix the changed opsize we did above because of the missing btsb }
      if int_cgsize(resultdef.size)<>opsize then
        hlcg.location_force_reg(current_asmdata.CurrAsmList,location,opdef,cgsize_orddef(int_cgsize(resultdef.size)),false);
    end;
  574. {$endif not i8086}
  { Generates code for comparisons of small sets; the result is returned in
    the CPU flags.  (In)equality is a plain CMP.  For lten/gten the operands
    are first arranged (swapping them if necessary), right is ANDed into the
    left register and the result is then compared for equality with right. }
  procedure tx86addnode.second_cmpsmallset;
    var
      opdef : tdef;
      opsize : TCGSize;
    begin
      pass_left_right;
      opdef:=left.resultdef;
      opsize:=int_cgsize(opdef.size);

      case nodetype of
        equaln,
        unequaln :
          { nothing to prepare, a plain compare is sufficient }
          ;
        lten,gten:
          begin
            { swap for an unswapped lten or a swapped gten }
            if (nodetype=lten)<>(nf_swapped in flags) then
              swapleftright;
            hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,opdef,false);
            emit_op_right_left(A_AND,opsize);
            { warning: ugly hack, we need a JE so change the node to equaln }
            nodetype:=equaln;
          end;
        else
          internalerror(2003042204);
      end;

      { left must be a register }
      left_must_be_reg(opdef,opsize,false);
      { every supported comparison ends in a CMP }
      emit_generic_code(A_CMP,opsize,true,false,false);

      location_freetemp(current_asmdata.CurrAsmList,right.location);
      location_freetemp(current_asmdata.CurrAsmList,left.location);

      { the result lives in the flags }
      location_reset(location,LOC_FLAGS,OS_NO);
      location.resflags:=getresflags(true);
    end;
  610. {*****************************************************************************
  611. AddMMX
  612. *****************************************************************************}
  613. {$ifdef SUPPORT_MMX}
  { Generates code for a binary MMX operation (addn, muln, subn, xorn, orn, andn).

    The packed opcode is chosen from the node type, the MMX element type of
    the left operand and, for add/sub, the cs_mmx_saturation local switch.
    The left operand is brought into an MMX register (swapping the operand
    locations when only the right one already lives in one) and the result
    register is stored in location.register.

    Raises internalerror for an unsupported node type (2003042214), for an
    element type that has no matching opcode (201408201) and for operand
    locations that are neither MMX registers nor memory references. }
  procedure tx86addnode.second_opmmx;
    var
      op : TAsmOp;
      cmpop : boolean;
      mmxbase : tmmxtype;
      hreg,
      hregister : tregister;
    begin
      pass_left_right;

      { no comparison is generated here, so cmpop stays false }
      cmpop:=false;
      { A_NOP marks "no opcode selected yet", checked after the case below }
      op:=A_NOP;

      mmxbase:=mmx_type(left.resultdef);
      location_reset(location,LOC_MMXREGISTER,def_cgsize(resultdef));
      case nodetype of
        addn :
          begin
            if (cs_mmx_saturation in current_settings.localswitches) then
              begin
                case mmxbase of
                  mmxs8bit:
                    op:=A_PADDSB;
                  mmxu8bit:
                    op:=A_PADDUSB;
                  mmxs16bit,mmxfixed16:
                    op:=A_PADDSW;
                  mmxu16bit:
                    op:=A_PADDUSW;
                  else
                    ;
                end;
              end
            else
              begin
                case mmxbase of
                  mmxs8bit,mmxu8bit:
                    op:=A_PADDB;
                  mmxs16bit,mmxu16bit,mmxfixed16:
                    op:=A_PADDW;
                  mmxs32bit,mmxu32bit:
                    op:=A_PADDD;
                  else
                    ;
                end;
              end;
          end;
        muln :
          begin
            case mmxbase of
              mmxs16bit,mmxu16bit:
                op:=A_PMULLW;
              mmxfixed16:
                op:=A_PMULHW;
              else
                ;
            end;
          end;
        subn :
          begin
            if (cs_mmx_saturation in current_settings.localswitches) then
              begin
                case mmxbase of
                  mmxs8bit:
                    op:=A_PSUBSB;
                  mmxu8bit:
                    op:=A_PSUBUSB;
                  mmxs16bit,mmxfixed16:
                    { signed saturating subtract on words, mirrors A_PADDSW
                      in the addn case (was the byte opcode A_PSUBSB) }
                    op:=A_PSUBSW;
                  mmxu16bit:
                    op:=A_PSUBUSW;
                  else
                    ;
                end;
              end
            else
              begin
                case mmxbase of
                  mmxs8bit,mmxu8bit:
                    op:=A_PSUBB;
                  mmxs16bit,mmxu16bit,mmxfixed16:
                    op:=A_PSUBW;
                  mmxs32bit,mmxu32bit:
                    op:=A_PSUBD;
                  else
                    ;
                end;
              end;
          end;
        xorn:
          op:=A_PXOR;
        orn:
          op:=A_POR;
        andn:
          op:=A_PAND;
        else
          internalerror(2003042214);
      end;

      { the element type had no matching opcode for this operation }
      if op = A_NOP then
        internalerror(201408201);

      { left and right no register? }
      { then one must be demanded    }
      if (left.location.loc<>LOC_MMXREGISTER) then
        begin
          if (right.location.loc=LOC_MMXREGISTER) then
            begin
              { reuse the register of the right operand; remember the swap
                because subn is not commutative }
              location_swap(left.location,right.location);
              toggleflag(nf_swapped);
            end
          else
            begin
              { register variable ? }
              if (left.location.loc=LOC_CMMXREGISTER) then
                begin
                  hregister:=tcgx86(cg).getmmxregister(current_asmdata.CurrAsmList);
                  emit_reg_reg(A_MOVQ,S_NO,left.location.register,hregister);
                end
              else
                begin
                  if not(left.location.loc in [LOC_REFERENCE,LOC_CREFERENCE]) then
                    internalerror(200203245);

                  hregister:=tcgx86(cg).getmmxregister(current_asmdata.CurrAsmList);
                  tcgx86(cg).make_simple_ref(current_asmdata.CurrAsmList,left.location.reference);
                  emit_ref_reg(A_MOVQ,S_NO,left.location.reference,hregister);
                end;

              location_reset(left.location,LOC_MMXREGISTER,OS_NO);
              left.location.register:=hregister;
            end;
        end;

      { at this point, left.location.loc should be LOC_MMXREGISTER }
      if right.location.loc<>LOC_MMXREGISTER then
        begin
          if (nodetype=subn) and (nf_swapped in flags) then
            begin
              { swapped subtraction: load right into a scratch register and
                subtract left from it, the scratch register is the result }
              hreg:=tcgx86(cg).getmmxregister(current_asmdata.CurrAsmList);
              if right.location.loc=LOC_CMMXREGISTER then
                begin
                  emit_reg_reg(A_MOVQ,S_NO,right.location.register,hreg);
                  emit_reg_reg(op,S_NO,left.location.register,hreg);
                end
              else
                begin
                  { left is an MMX register here, it is the right operand
                    that has to be a memory reference (the check used to
                    test left and therefore always failed) }
                  if not(right.location.loc in [LOC_REFERENCE,LOC_CREFERENCE]) then
                    internalerror(2002032412);
                  tcgx86(cg).make_simple_ref(current_asmdata.CurrAsmList,right.location.reference);
                  emit_ref_reg(A_MOVQ,S_NO,right.location.reference,hreg);
                  emit_reg_reg(op,S_NO,left.location.register,hreg);
                end;
              location.register:=hreg;
            end
          else
            begin
              if (right.location.loc=LOC_CMMXREGISTER) then
                emit_reg_reg(op,S_NO,right.location.register,left.location.register)
              else
                begin
                  if not(right.location.loc in [LOC_REFERENCE,LOC_CREFERENCE]) then
                    internalerror(200203246);
                  tcgx86(cg).make_simple_ref(current_asmdata.CurrAsmList,right.location.reference);
                  emit_ref_reg(op,S_NO,right.location.reference,left.location.register);
                end;
              location.register:=left.location.register;
            end;
        end
      else
        begin
          { right.location=LOC_MMXREGISTER }
          if (nodetype=subn) and (nf_swapped in flags) then
            begin
              { the result is produced in the right register, so swap the
                locations back to make left hold the result }
              emit_reg_reg(op,S_NO,left.location.register,right.location.register);
              location_swap(left.location,right.location);
              toggleflag(nf_swapped);
            end
          else
            begin
              emit_reg_reg(op,S_NO,right.location.register,left.location.register);
            end;
          location.register:=left.location.register;
        end;

      location_freetemp(current_asmdata.CurrAsmList,right.location);
      { cmpop is never set in this routine, so left is not released here }
      if cmpop then
        location_freetemp(current_asmdata.CurrAsmList,left.location);
    end;
  795. {$endif SUPPORT_MMX}
  796. {*****************************************************************************
  797. AddFloat
  798. *****************************************************************************}
  { Generates SSE code for a scalar floating point addn/muln/subn/slashn node.

    When the target fpu type is at least fpu_sse3 and the node is
    sqr(a)+sqr(b) or sqr(a)-sqr(b), both sqr() wrappers are removed and the
    expression is computed with one packed multiply followed by a horizontal
    add/subtract. Otherwise the operands are combined through the generic
    cg.a_loadmm_* / cg.a_opmm_* helpers; operands living on the x87 stack
    are written to memory first.

    The result is returned in an mm register (location.register).
    Raises internalerror(200312231) for any other node type. }
  procedure tx86addnode.second_addfloatsse;
    var
      cgop : topcg;
      squares_combined,
      both_on_x87 : boolean;
      inner : tnode;

    { Force a floating point register operand to be written to memory.
      It is not forced into an mm register because writing to memory
      probably allows shorter code: there is no direct fpu->mm register
      copy instruction. }
    procedure spill_x87_operand(operand : tnode);
      begin
        if operand.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER] then
          hlcg.location_force_mem(current_asmdata.CurrAsmList,operand.location,operand.resultdef);
      end;

    begin
      squares_combined:=
        (current_settings.fputype>=fpu_sse3) and
        use_vectorfpu(resultdef) and
        (nodetype in [addn,subn]) and
        (left.nodetype=inlinen) and (tinlinenode(left).inlinenumber=in_sqr_real) and
        (right.nodetype=inlinen) and (tinlinenode(right).inlinenumber=in_sqr_real);
      if squares_combined then
        begin
          { replace sqr(x) by x on the left side: detach the argument so
            that freeing the wrapper does not free it as well }
          inner:=tinlinenode(left).left;
          tinlinenode(left).left:=nil;
          left.free;
          left:=inner;

          { same for the right side }
          inner:=tinlinenode(right).left;
          tinlinenode(right).left:=nil;
          right.free;
          right:=inner;
        end;

      pass_left_right;

      both_on_x87:=
        (left.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER]) and
        (right.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER]);

      { fpu operands are always in reversed order on the stack }
      if both_on_x87 then
        toggleflag(nf_swapped);

      if (nf_swapped in flags) then
        begin
          { can't use swapleftright if both are on the fpu stack, since then }
          { both are "R_ST" -> nothing would change -> manually switch       }
          if both_on_x87 then
            emit_none(A_FXCH,S_NO)
          else
            swapleftright;
        end;

      { map the node type onto the generic code generator operation }
      case nodetype of
        addn :
          cgop:=OP_ADD;
        muln :
          cgop:=OP_MUL;
        subn :
          cgop:=OP_SUB;
        slashn :
          cgop:=OP_DIV;
        else
          internalerror(200312231);
      end;

      location_reset(location,LOC_MMREGISTER,def_cgsize(resultdef));

      if squares_combined then
        begin
          if nf_swapped in flags then
            swapleftright;

          hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,false);
          hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,right.location,right.resultdef,true);
          { the register of the left operand accumulates the result }
          location:=left.location;
          if is_double(resultdef) then
            begin
              { pack both operands into one register, square both lanes at
                once, then combine the two squares horizontally }
              current_asmdata.CurrAsmList.concat(taicpu.op_const_reg_reg(A_SHUFPD,S_NO,%00,right.location.register,location.register));
              current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_MULPD,S_NO,location.register,location.register));
              if nodetype=addn then
                current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_HADDPD,S_NO,location.register,location.register))
              else if nodetype=subn then
                current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_HSUBPD,S_NO,location.register,location.register))
              else
                internalerror(201108162);
            end
          else
            begin
              { interleave the two singles in the low half of the register }
              current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_UNPCKLPS,S_NO,right.location.register,location.register));
              { ensure that bits 64..127 contain valid values as well:
                SHUFPD with %00 copies the low quadword into the high one }
              current_asmdata.CurrAsmList.concat(taicpu.op_const_reg_reg(A_SHUFPD,S_NO,%00,location.register,location.register));
              current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_MULPS,S_NO,location.register,location.register));
              if nodetype=addn then
                current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_HADDPS,S_NO,location.register,location.register))
              else if nodetype=subn then
                current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_HSUBPS,S_NO,location.register,location.register))
              else
                internalerror(201108163);
            end
        end
      { we can use only right as left operand if the operation is commutative }
      else if (right.location.loc=LOC_MMREGISTER) and (cgop in [OP_ADD,OP_MUL]) then
        begin
          location.register:=cg.getmmregister(current_asmdata.CurrAsmList,location.size);
          cg.a_loadmm_reg_reg(current_asmdata.CurrAsmList,right.location.size,location.size,right.location.register,location.register,mms_movescalar);
          spill_x87_operand(left);
          cg.a_opmm_loc_reg(current_asmdata.CurrAsmList,cgop,location.size,left.location,location.register,mms_movescalar);
          if left.location.loc=LOC_REFERENCE then
            tg.ungetiftemp(current_asmdata.CurrAsmList,left.location.reference);
        end
      else
        begin
          if nf_swapped in flags then
            swapleftright;

          { load left into a fresh mm register ... }
          spill_x87_operand(left);
          location.register:=cg.getmmregister(current_asmdata.CurrAsmList,location.size);
          cg.a_loadmm_loc_reg(current_asmdata.CurrAsmList,location.size,left.location,location.register,mms_movescalar);
          if left.location.loc=LOC_REFERENCE then
            tg.ungetiftemp(current_asmdata.CurrAsmList,left.location.reference);

          { ... and apply the operation with right as second operand }
          spill_x87_operand(right);
          cg.a_opmm_loc_reg(current_asmdata.CurrAsmList,cgop,location.size,right.location,location.register,mms_movescalar);
          if right.location.loc=LOC_REFERENCE then
            tg.ungetiftemp(current_asmdata.CurrAsmList,right.location.reference);
        end;
    end;
  { Generates AVX code for a scalar floating point addn/muln/subn/slashn node.

    Uses the three operand forms of the code generator (cg.a_opmm_*_reg_reg),
    so the result always goes to a newly allocated mm register. A
    multiplication by the real constant 2 is emitted as an addition of the
    operand to itself. The sqr(a)+/-sqr(b) shortcut is only compiled in when
    the symbol "dummy" is defined; otherwise squares_combined stays false.

    Raises internalerror(2003122303) for any other node type. }
  procedure tx86addnode.second_addfloatavx;
    var
      cgop : topcg;
      squares_combined,
      both_on_x87 : boolean;
{$ifdef dummy}
      inner : tnode;
{$endif dummy}

    { True when the operand is a real constant node holding the number 2. }
    function is_const_two(operand : tnode) : boolean;
      begin
        result:=is_constrealnode(operand) and
          is_number_float(trealconstnode(operand).value_real) and
          (trealconstnode(operand).value_real=2);
      end;

    { Computes operand+operand into a fresh result register. }
    procedure emit_doubling(operand : tnode);
      begin
        location.register:=cg.getmmregister(current_asmdata.CurrAsmList,operand.location.size);
        hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,operand.location,operand.resultdef,true);
        cg.a_opmm_reg_reg_reg(current_asmdata.CurrAsmList,OP_ADD,location.size,
          operand.location.register,
          operand.location.register,
          location.register,
          mms_movescalar);
      end;

    { Force a floating point register operand to be written to memory.
      It is not forced into an mm register because writing to memory
      probably allows shorter code: there is no direct fpu->mm register
      copy instruction. }
    procedure spill_x87_operand(operand : tnode);
      begin
        if operand.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER] then
          hlcg.location_force_mem(current_asmdata.CurrAsmList,operand.location,operand.resultdef);
      end;

    begin
      squares_combined:=false;
{$ifdef dummy}
      if (current_settings.fputype>=fpu_sse3) and
         use_vectorfpu(resultdef) and
         (nodetype in [addn,subn]) and
         (left.nodetype=inlinen) and (tinlinenode(left).inlinenumber=in_sqr_real) and
         (right.nodetype=inlinen) and (tinlinenode(right).inlinenumber=in_sqr_real) then
        begin
          squares_combined:=true;
          { replace sqr(x) by x on both sides; the argument is detached
            first so that freeing the wrapper does not free it as well }
          inner:=tinlinenode(left).left;
          tinlinenode(left).left:=nil;
          left.free;
          left:=inner;

          inner:=tinlinenode(right).left;
          tinlinenode(right).left:=nil;
          right.free;
          right:=inner;
        end;
{$endif dummy}

      pass_left_right;

      both_on_x87:=
        (left.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER]) and
        (right.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER]);

      { fpu operands are always in reversed order on the stack }
      if both_on_x87 then
        toggleflag(nf_swapped);

      if (nf_swapped in flags) then
        begin
          { can't use swapleftright if both are on the fpu stack, since then }
          { both are "R_ST" -> nothing would change -> manually switch       }
          if both_on_x87 then
            emit_none(A_FXCH,S_NO)
          else
            swapleftright;
        end;

      { map the node type onto the generic code generator operation }
      case nodetype of
        addn :
          cgop:=OP_ADD;
        muln :
          cgop:=OP_MUL;
        subn :
          cgop:=OP_SUB;
        slashn :
          cgop:=OP_DIV;
        else
          internalerror(2003122303);
      end;

      location_reset(location,LOC_MMREGISTER,def_cgsize(resultdef));

      if squares_combined then
        begin
          if nf_swapped in flags then
            swapleftright;

          hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,false);
          hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,right.location,right.resultdef,true);
          { the register of the left operand accumulates the result }
          location:=left.location;
          if is_double(resultdef) then
            begin
              current_asmdata.CurrAsmList.concat(taicpu.op_const_reg_reg(A_SHUFPD,S_NO,%00,right.location.register,location.register));
              current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_MULPD,S_NO,location.register,location.register));
              if nodetype=addn then
                current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_HADDPD,S_NO,location.register,location.register))
              else if nodetype=subn then
                current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_HSUBPD,S_NO,location.register,location.register))
              else
                internalerror(2011081601);
            end
          else
            begin
              current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_UNPCKLPS,S_NO,right.location.register,location.register));
              { ensure that bits 64..127 contain valid values }
              current_asmdata.CurrAsmList.concat(taicpu.op_const_reg_reg(A_SHUFPD,S_NO,%00,location.register,location.register));
              current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_MULPS,S_NO,location.register,location.register));
              if nodetype=addn then
                current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_HADDPS,S_NO,location.register,location.register))
              else if nodetype=subn then
                current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_HSUBPS,S_NO,location.register,location.register))
              else
                internalerror(2011081604);
            end
        end
      { left*2 ? -> left+left }
      else if (nodetype=muln) and is_const_two(right) then
        emit_doubling(left)
      { 2*right ? -> right+right }
      else if (nodetype=muln) and is_const_two(left) then
        emit_doubling(right)
      { we can use only right as left operand if the operation is commutative }
      else if (right.location.loc in [LOC_MMREGISTER,LOC_CMMREGISTER]) and (cgop in [OP_ADD,OP_MUL]) then
        begin
          location.register:=cg.getmmregister(current_asmdata.CurrAsmList,left.location.size);
          spill_x87_operand(left);
          cg.a_opmm_loc_reg_reg(current_asmdata.CurrAsmList,cgop,location.size,
            left.location,
            right.location.register,
            location.register,
            mms_movescalar);
        end
      else
        begin
          if (nf_swapped in flags) then
            swapleftright;

          hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,true);
          location.register:=cg.getmmregister(current_asmdata.CurrAsmList,left.location.size);
          spill_x87_operand(right);
          cg.a_opmm_loc_reg_reg(current_asmdata.CurrAsmList,cgop,location.size,
            right.location,
            left.location.register,
            location.register,
            mms_movescalar);
        end;
    end;
  { First pass of the x86 add node.

    For floating point operations that do not use the vector fpu (neither
    operand nor the result), both operands are marked as not regable before
    the inherited pass_1 runs. Afterwards the expected location is corrected
    to LOC_MMREGISTER when the result uses the vector fpu.

    Returns whatever the inherited pass_1 returns. }
  function tx86addnode.pass_1: tnode;
    begin
      { on x86, we do not support fpu registers, so in case of operations using the x87, it
        is normally useful, not to put the operands into registers which would be mm register }
      if (left.resultdef.typ=floatdef) or (right.resultdef.typ=floatdef) then
        if not(use_vectorfpu(left.resultdef) or
               use_vectorfpu(right.resultdef) or
               use_vectorfpu(resultdef)) then
          begin
            make_not_regable(left,[ra_addr_regable]);
            make_not_regable(right,[ra_addr_regable]);
          end;

      Result:=inherited pass_1;

      { correct expectloc; it does not matter if Result is set, as another pass_1
        is run on it which will fix that one }
      if use_vectorfpu(resultdef) then
        expectloc:=LOC_MMREGISTER;
    end;
  { Node simplification for the x86 add node.

    With cs_opt_level1 enabled and overflow checking disabled, a comparison
    "(x mod d) = 0" or "(x mod d) <> 0" with a constant d >= 3 that is not a
    power of two is replaced by a multiplication with the multiplicative
    inverse of d followed by an unsigned comparison, so no division is
    needed. Everything else is handed to the inherited simplify.

    forinline is passed on unchanged to the inherited simplify.
    Returns the replacement node tree or the result of the inherited call. }
  function tx86addnode.simplify(forinline : boolean) : tnode;
    var
      mod_node, zero_node, tree, konst: TNode;
      left_kind, right_kind, cmp_kind: TNodeType;
      work_def: TDef;
      reciprocal, comparison, divisor: AWord;
      shift, bits: Byte;

    { Repeatedly removes a type conversion from the numerator of the mod
      node as long as the source type of the conversion has the requested
      signedness, is an integer of at least two bytes and its range contains
      the divisor; the mod node, its divisor constant and the zero constant
      are retyped to that source type.
      Returns False when the numerator is an ordinal constant (the caller
      then just simplifies into a constant), True otherwise. }
    function strip_numerator_conversions(want_signed : Boolean) : Boolean;
      var
        numerator: TNode;
        src_def: TDef;
        stripped: Boolean;
      begin
        Result:=True;
        repeat
          stripped:=False;
          numerator:=TModDivNode(mod_node).left;
          case numerator.nodetype of
            typeconvn:
              begin
                src_def:=TTypeConvNode(numerator).left.resultdef;
                { See if we can simplify things to a smaller ordinal to
                  reduce code size and increase speed }
                if (is_signed(src_def)=want_signed) and
                   is_integer(src_def) and
                   { Byte-sized multiplications can cause problems }
                   (src_def.size>=2) and
                   { Make sure the divisor is in range }
                   (divisor>=TOrdDef(src_def).low) and
                   (divisor<=TOrdDef(src_def).high) then
                  begin
                    TOrdConstNode(TModDivNode(mod_node).right).resultdef:=src_def;
                    TOrdConstNode(zero_node).resultdef:=src_def;
                    TModDivNode(mod_node).resultdef:=src_def;

                    { Destroy the typeconv node, but keep its operand }
                    TModDivNode(mod_node).left:=TTypeConvNode(numerator).left;
                    TTypeConvNode(numerator).left:=nil;
                    numerator.Free;
                    { look at the new numerator as well }
                    stripped:=True;
                  end;
              end;
            ordconstn:
              begin
                Result:=False;
                Exit;
              end;
            else
              ;
          end;
        until not stripped;
      end;

    begin
      { Load into local variables to reduce the number of pointer dereferences }
      right_kind:=right.nodetype;
      left_kind:=left.nodetype;
{$if defined(cpu64bitalu) or defined(cpu32bitalu) or defined(cpu16bitalu)}
      if (cs_opt_level1 in current_settings.optimizerswitches) and
         { The presence of overflow checks tends to cause internal errors with the multiplication nodes }
         not (cs_check_overflow in current_settings.localswitches) and
         (nodetype in [equaln,unequaln]) then
        begin
          { find the "mod" operand and the constant zero it is compared with }
          if (left_kind=modn) and (right_kind=ordconstn) and (TOrdConstNode(right).value.uvalue=0) then
            begin
              mod_node:=left;
              zero_node:=right;
            end
          else if (right_kind=modn) and (left_kind=ordconstn) and (TOrdConstNode(left).value.uvalue=0) then
            begin
              mod_node:=right;
              zero_node:=left;
            end
          else
            begin
              mod_node:=nil;
              zero_node:=nil;
            end;

          if Assigned(mod_node) and (TModDivNode(mod_node).right.nodetype=ordconstn) and
{$ifndef cpu64bitalu}
             { Converting Int64 and QWord division doesn't work under i386 }
{$ifndef cpu32bitalu}
             (TModDivNode(mod_node).resultdef.size < 4) and
{$else cpu32bitalu}
             (TModDivNode(mod_node).resultdef.size < 8) and
{$endif cpu32bitalu}
{$endif cpu64bitalu}
             (TOrdConstNode(TModDivNode(mod_node).right).value>=3) then
            begin
              divisor:=TOrdConstNode(TModDivNode(mod_node).right).value.uvalue;

              { Exclude powers of 2, as there are more efficient ways to handle those }
              if PopCnt(divisor)>1 then
                begin
                  if is_signed(TModDivNode(mod_node).left.resultdef) then
                    begin
                      { See pages 250-251 of Hacker's Delight, Second Edition
                        for an explanation and proof of the algorithm, but
                        essentially, we're doing the following:

                        - Convert the divisor d to the form k.2^b if it isn't
                          already odd (in which case, k = d and b = 0)
                        - Calculate r, the multiplicative inverse of k modulo 2^N
                        - Calculate c = floor(2^(N-1) / k) & -(2^b)
                        - Let q = ((n * r) + c) ror b (mod 2^N)
                        - Repurpose c to equal floor(2c / 2^b) = c shr (b - 1)
                          (some RISC platforms will benefit from doing this over
                          precalculating the modified constant. For x86,
                          it's better with the constant precalculated for
                          32-bit and under, but for 64-bit, use SHR. )
                        - If q is below or equal to c, then (n mod d) = 0
                      }
                      if not strip_numerator_conversions(True) then
                        begin
                          { Just simplify into a constant }
                          Result:=inherited simplify(forinline);
                          Exit;
                        end;

                      work_def:=TModDivNode(mod_node).left.resultdef;
                      if nodetype = equaln then
                        cmp_kind:=lten
                      else
                        cmp_kind:=gtn;

                      bits:=work_def.size*8;
                      calc_mul_inverse(bits, TOrdConstNode(TModDivNode(mod_node).right).value.uvalue, reciprocal, shift);

                      { Construct the following node tree for odd divisors:
                          <lten> (for equaln) or <gtn> (for unequaln)
                            <addn>
                              <muln>
                                <typeconv signed-to-unsigned>
                                  <numerator node (TModDivNode(mod_node).left)>
                                <reciprocal constant>
                              <comparison constant (effectively a signed shift)>
                            <comparison constant * 2>

                        For even divisors, convert them to the form k.2^b, with
                        odd k, then construct the following:
                          <lten> (for equaln) or <gtn> (for unequaln)
                            <ror>
                              (b)
                              <addn>
                                <muln>
                                  <typeconv signed-to-unsigned>
                                    <numerator node (TModDivNode(mod_node).left)>
                                  <reciprocal constant>
                                <comparison constant (effectively a signed shift)>
                            <comparison constant shr (b - 1)>
                      }
                      tree:=ctypeconvnode.create_internal(TModDivNode(mod_node).left, work_def);
                      TTypeConvNode(tree).convtype:=tc_int_2_int;
                      { everything from here on is typed unsigned }
                      work_def:=get_unsigned_inttype(work_def);
                      tree.resultdef:=work_def;

                      { the numerator now belongs to the new tree }
                      TModDivNode(mod_node).left:=nil;

                      konst:=cordconstnode.create(reciprocal, work_def, False);
                      konst.resultdef:=work_def;

                      tree:=caddnode.create_internal(muln, tree, konst);
                      tree.resultdef:=work_def;

{$push}
{$warnings off}
                      if shift>0 then
                        comparison:=((aWord(1) shl ((bits-1) and (SizeOf(aWord)*8-1))) div (divisor shr shift)) and -(1 shl shift)
                      else
                        comparison:=(aWord(1) shl ((bits-1) and (SizeOf(aWord)*8-1))) div divisor;
{$pop}
                      konst:=cordconstnode.create(comparison, work_def, False);
                      konst.resultdef:=work_def;

                      tree:=caddnode.create_internal(addn, tree, konst);
                      tree.resultdef:=work_def;

                      if shift>0 then
                        begin
                          konst:=cordconstnode.create(shift, u8inttype, False);
                          konst.resultdef:=u8inttype;

                          tree:=cinlinenode.createintern(in_ror_x_y,false,
                            ccallparanode.create(konst,
                            ccallparanode.create(tree, nil)));
                          tree.resultdef:=work_def;

                          konst:=cordconstnode.create(comparison shr (shift - 1), work_def, False);
                        end
                      else
                        konst:=cordconstnode.create(comparison*2, work_def, False);

                      konst.resultdef:=work_def;

                      Result:=CAddNode.create_internal(cmp_kind, tree, konst);
                      Result.resultdef:=resultdef;
                      Exit;
                    end
                  else
                    begin
                      { For bit length N, convert "(x mod d) = 0" or "(x mod d) <> 0", where
                        d is an odd-numbered integer constant, to "(x * r) <= m", where
                        dr = 1 (mod 2^N) and m = floor(2^N / d).

                        If d is even, convert to the form k.2^b, where k is odd, then
                        convert to "(x * r) ror b <= m", where kr = 1 (mod 2^N) and
                        m = floor(2^N / d) = floor(2^(N-b) / k) }
                      if not strip_numerator_conversions(False) then
                        begin
                          { Just simplify into a constant }
                          Result:=inherited simplify(forinline);
                          Exit;
                        end;

                      work_def:=TModDivNode(mod_node).left.resultdef;

                      { Construct the following node tree for odd divisors:
                          <lten> (for equaln) or <gtn> (for unequaln)
                            <muln>
                              <numerator node (TModDivNode(mod_node).left)>
                              <reciprocal constant>
                            (2^N / divisor)

                        For even divisors, convert them to the form k.2^b, with
                        odd k, then construct the following:
                          <lten> (for equaln) or <gtn> (for unequaln)
                            <ror>
                              (b)
                              <muln>
                                <numerator node (TModDivNode(mod_node).left)>
                                <reciprocal constant>
                            (2^N / divisor)
                      }
                      if nodetype=equaln then
                        cmp_kind:=lten
                      else
                        cmp_kind:=gtn;

                      bits:=work_def.size*8;
                      calc_mul_inverse(bits, TOrdConstNode(TModDivNode(mod_node).right).value.uvalue, reciprocal, shift);

                      konst:=cordconstnode.create(reciprocal, work_def, False);
                      konst.resultdef:=work_def;

                      tree:=caddnode.create_internal(muln, TModDivNode(mod_node).left, konst);
                      tree.resultdef:=work_def;

                      { the numerator now belongs to the new tree }
                      TModDivNode(mod_node).left:=nil;

                      if shift>0 then
                        begin
                          konst:=cordconstnode.create(shift, u8inttype, False);
                          konst.resultdef:=u8inttype;

                          tree:=cinlinenode.createintern(in_ror_x_y,false,
                            ccallparanode.create(konst,
                            ccallparanode.create(tree, nil)));
                          tree.resultdef:=work_def;

                          comparison:=(aWord(1) shl ((bits-shift) and (SizeOf(aWord)*8-1))) div (divisor shr shift);
                        end
                      else
                        begin
{$push}
{$warnings off}
                          { Because 2^N and divisor are relatively prime,
                            floor(2^N / divisor) = floor((2^N - 1) / divisor) }
                          comparison:=(aWord(not 0) shr (((SizeOf(aWord)*8)-bits) and (SizeOf(aWord)*8-1))) div divisor;
{$pop}
                        end;

                      konst:=cordconstnode.create(comparison, work_def, False);
                      konst.resultdef:=work_def;

                      Result:=CAddNode.create_internal(cmp_kind, tree, konst);
                      Result.resultdef:=resultdef;
                      Exit;
                    end;
                end;
            end;
        end;
{$ifend defined(cpu64bitalu) or defined(cpu32bitalu) or defined(cpu16bitalu)}

      Result:=inherited simplify(forinline);
    end;
  { Tells whether fused multiply-add code may be generated for this node.

    On i8086 the decision is left to the inherited implementation. On all
    other x86 targets the result is true when the result type uses the
    vector fpu and the selected fpu type offers FMA or FMA4. }
  function tx86addnode.use_fma : boolean;
    begin
{$ifdef i8086}
      Result:=inherited use_fma;
{$else i8086}
      { test if the result stays in an xmm register, fiddling with fpu registers and fma makes no sense }
      Result:=use_vectorfpu(resultdef) and
        ([FPUX86_HAS_FMA,FPUX86_HAS_FMA4]*fpu_capabilities[current_settings.fputype]<>[]);
{$endif i8086}
    end;
  { Generates a scalar floating point comparison with (V)COMISS/(V)COMISD.

    The opcode depends on the type of the left operand (single or double)
    and on UseAVX. The result is returned as LOC_FLAGS, with the flags
    taken from getfpuresflags. When the right operand already is in an mm
    register it is used as the register operand of the instruction and
    nf_swapped is toggled to account for the reversed operand order.

    Raises internalerror(200402222) when the left operand is neither single
    nor double, and internalerror(200402221/200402223) for unexpected
    operand locations. }
  procedure tx86addnode.second_cmpfloatvector;
    const
      { indexed by UseAVX }
      single_cmp: array[boolean] of tasmop = (A_COMISS,A_VCOMISS);
      double_cmp: array[boolean] of tasmop = (A_COMISD,A_VCOMISD);
    var
      cmp_op : tasmop;

    { Direct move fpu->mm register is not possible, so force an fpu operand
      to memory (not to an mm register because a memory location can be used
      directly in the compare instruction, yielding shorter code). }
    procedure spill_x87_operand(operand : tnode);
      begin
        if operand.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER] then
          hlcg.location_force_mem(current_asmdata.CurrAsmList,operand.location,operand.resultdef);
      end;

    begin
      if is_single(left.resultdef) then
        cmp_op:=single_cmp[UseAVX]
      else if is_double(left.resultdef) then
        cmp_op:=double_cmp[UseAVX]
      else
        internalerror(200402222);

      pass_left_right;

      { fpu operands are always in reversed order on the stack }
      if (left.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER]) and
         (right.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER]) then
        toggleflag(nf_swapped);

      location_reset(location,LOC_FLAGS,OS_NO);

      spill_x87_operand(left);
      spill_x87_operand(right);

      if (right.location.loc in [LOC_MMREGISTER,LOC_CMMREGISTER]) then
        begin
          { right is the register operand, left the source operand }
          case left.location.loc of
            LOC_REFERENCE,LOC_CREFERENCE:
              begin
                tcgx86(cg).make_simple_ref(current_asmdata.CurrAsmList,left.location.reference);
                current_asmdata.CurrAsmList.concat(taicpu.op_ref_reg(cmp_op,S_NO,left.location.reference,right.location.register));
              end;
            LOC_MMREGISTER,LOC_CMMREGISTER:
              current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(cmp_op,S_NO,left.location.register,right.location.register));
            else
              internalerror(200402221);
          end;
          { the operands were compared in reversed order }
          toggleflag(nf_swapped);
        end
      else
        begin
          { left becomes the register operand, right the source operand }
          hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,true);
          case right.location.loc of
            LOC_REFERENCE,LOC_CREFERENCE:
              begin
                tcgx86(cg).make_simple_ref(current_asmdata.CurrAsmList,right.location.reference);
                current_asmdata.CurrAsmList.concat(taicpu.op_ref_reg(cmp_op,S_NO,right.location.reference,left.location.register));
              end;
            LOC_MMREGISTER,LOC_CMMREGISTER:
              current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(cmp_op,S_NO,right.location.register,left.location.register));
            else
              internalerror(200402223);
          end;
        end;

      location.resflags:=getfpuresflags;
      location_freetemp(current_asmdata.CurrAsmList,left.location);
      location_freetemp(current_asmdata.CurrAsmList,right.location);
    end;
  { Generates code for addn/muln/subn/slashn on vector operands that fit
    into an mm register.

    With UseAVX the result goes to a newly allocated OS_VECTOR register
    (three operand form); otherwise the register of one of the operands is
    reused as destination. The operation size is derived from the float type
    of the vector's element type.

    Raises internalerror(200610071) for any other node type and
    internalerror(200610072) when the left operand type does not fit into
    an mm register (not yet supported). }
  procedure tx86addnode.second_opvector;
    var
      op : topcg;
    begin
      pass_left_right;
      if (nf_swapped in flags) then
        swapleftright;

      case nodetype of
        addn :
          op:=OP_ADD;
        muln :
          op:=OP_MUL;
        subn :
          op:=OP_SUB;
        slashn :
          op:=OP_DIV;
        else
          internalerror(200610071);
      end;

      if fits_in_mm_register(left.resultdef) then
        begin
          location_reset(location,LOC_MMREGISTER,def_cgsize(resultdef));
          { we can use only right as left operand if the operation is commutative }
          if (right.location.loc=LOC_MMREGISTER) and (op in [OP_ADD,OP_MUL]) then
            begin
              { left.resultdef is the vector type here, so the float type has
                to be taken from its element type, exactly as in the
                non-commutative branch below (the previous direct cast of
                left.resultdef to tfloatdef was inconsistent with it) }
              if UseAVX then
                begin
                  location.register:=cg.getmmregister(current_asmdata.CurrAsmList,OS_VECTOR);
                  cg.a_opmm_loc_reg_reg(current_asmdata.CurrAsmList,op,
                    tfloat2tcgsize[tfloatdef(tarraydef(left.resultdef).elementdef).floattype],left.location,right.location.register,location.register,nil);
                end
              else
                begin
                  location.register:=right.location.register;
                  cg.a_opmm_loc_reg(current_asmdata.CurrAsmList,op,
                    tfloat2tcgsize[tfloatdef(tarraydef(left.resultdef).elementdef).floattype],left.location,location.register,nil);
                end;
            end
          else
            begin
              location_force_mmreg(current_asmdata.CurrAsmList,left.location,false);
              if UseAVX then
                begin
                  location.register:=cg.getmmregister(current_asmdata.CurrAsmList,OS_VECTOR);
                  cg.a_opmm_loc_reg_reg(current_asmdata.CurrAsmList,op,
                    tfloat2tcgsize[tfloatdef(tarraydef(left.resultdef).elementdef).floattype],right.location,left.location.register,location.register,nil);
                end
              else
                begin
                  location.register:=left.location.register;
                  cg.a_opmm_loc_reg(current_asmdata.CurrAsmList,op,
                    tfloat2tcgsize[tfloatdef(tarraydef(left.resultdef).elementdef).floattype],right.location,location.register,nil);
                end;
            end;
        end
      else
        begin
          { not yet supported }
          internalerror(200610072);
        end
    end;
  { Generates code for a floating point addn/muln/subn/slashn.

    When the result type is handled by the vector fpu, the work is handed to
    second_addfloatavx or second_addfloatsse. Otherwise x87 code is emitted:
    a type conversion from single or double on either operand is removed,
    the operands are evaluated and prepared by prepare_x87_locations, and the
    instruction is emitted either with a memory operand (when
    prepare_x87_locations returned a node) or as the popping register form
    on ST/ST1. The result is left in NR_ST.

    Raises internalerror(2003042203) for an unexpected node type. }
  procedure tx86addnode.second_addfloat;
    const
      { every table is indexed by "an operand node was returned by
        prepare_x87_locations": false selects the popping register opcode,
        true the opcode that takes a memory operand }
      add_ops     : array[boolean] of TAsmOp = (A_FADDP,A_FADD);
      mul_ops     : array[boolean] of TAsmOp = (A_FMULP,A_FMUL);
      sub_ops     : array[boolean] of TAsmOp = (A_FSUBP,A_FSUB);
      sub_rev_ops : array[boolean] of TAsmOp = (A_FSUBRP,A_FSUBR);
      div_ops     : array[boolean] of TAsmOp = (A_FDIVP,A_FDIV);
      div_rev_ops : array[boolean] of TAsmOp = (A_FDIVRP,A_FDIVR);
    var
      fpuop : TAsmOp;
      memnode, convnode : tnode;
      innerdef : tdef;
      frommem, reversed : boolean;
    begin
      { vector fpu types are not handled here }
      if use_vectorfpu(resultdef) then
        begin
          if not UseAVX then
            second_addfloatsse
          else
            second_addfloatavx;
          exit;
        end;

      { can the operation do the conversion? if so, drop the conversion node
        on the left side and operate on its operand directly }
      if left.nodetype=typeconvn then
        begin
          innerdef:=ttypeconvnode(left).left.resultdef;
          if is_double(innerdef) or is_single(innerdef) then
            begin
              convnode:=left;
              left:=ttypeconvnode(convnode).left;
              { detach the operand so freeing the conversion keeps it alive }
              ttypeconvnode(convnode).left:=nil;
              convnode.Free;
            end;
        end;

      { same treatment for the right side }
      if right.nodetype=typeconvn then
        begin
          innerdef:=ttypeconvnode(right).left.resultdef;
          if is_double(innerdef) or is_single(innerdef) then
            begin
              convnode:=right;
              right:=ttypeconvnode(convnode).left;
              ttypeconvnode(convnode).left:=nil;
              convnode.Free;
            end;
        end;

      pass_left_right;
      prepare_x87_locations(memnode);
      frommem:=assigned(memnode);

      { the non commutative operations need the reversed opcode when the
        operands are marked as swapped }
      reversed:=nf_swapped in flags;
      case nodetype of
        addn :
          fpuop:=add_ops[frommem];
        muln :
          fpuop:=mul_ops[frommem];
        subn :
          begin
            if reversed then
              fpuop:=sub_rev_ops[frommem]
            else
              fpuop:=sub_ops[frommem];
          end;
        slashn :
          begin
            if reversed then
              fpuop:=div_rev_ops[frommem]
            else
              fpuop:=div_ops[frommem];
          end;
        else
          internalerror(2003042203);
      end;

      if not frommem then
        begin
          { register form: one tracked fpu stack entry goes away }
          emit_reg_reg(fpuop,S_NO,NR_ST,NR_ST1);
          tcgx86(cg).dec_fpu_stack;
        end
      else
        emit_ref(fpuop,tcgsize2opsize[memnode.location.size],memnode.location.reference);

      location_reset(location,LOC_FPUREGISTER,def_cgsize(resultdef));
      location.register:=NR_ST;
    end;
  { Generates code for a floating point comparison; the result is returned
    as LOC_FLAGS with the condition supplied by getfpuresflags.
    If either operand type uses the vector fpu, the work is delegated to
    second_cmpfloatvector. Otherwise both operands are forced into fpu
    registers and one of two x87 sequences is emitted:
    - on non x86_64 targets with a cpu type below cpu_Pentium2: FCOMPP, then
      the fpu status word is moved into the cpu flags through AX/AH and SAHF
    - in all other cases: FCOMIP followed by FSTP to remove the second operand
    In both sequences dec_fpu_stack is called twice. }
  1578. procedure tx86addnode.second_cmpfloat;
  1579. {$ifdef i8086}
  1580. var
  { two byte temp that receives the fpu status word on cpus below cpu_286 }
  1581. tmpref: treference;
  1582. {$endif i8086}
  1583. begin
  { vector fpu operands are handled by a separate routine }
  1584. if use_vectorfpu(left.resultdef) or use_vectorfpu(right.resultdef) then
  1585. begin
  1586. second_cmpfloatvector;
  1587. exit;
  1588. end;
  1589. pass_left_right;
  1590. force_left_and_right_fpureg;
  1591. {$ifndef x86_64}
  { cpu types below cpu_Pentium2 take the FCOMPP path instead of FCOMIP }
  1592. if current_settings.cputype<cpu_Pentium2 then
  1593. begin
  1594. emit_none(A_FCOMPP,S_NO);
  { both operands are gone from the fpu stack after the compare }
  1595. tcgx86(cg).dec_fpu_stack;
  1596. tcgx86(cg).dec_fpu_stack;
  1597. { load fpu flags }
  1598. {$ifdef i8086}
  { below cpu_286 the status word is stored to memory with FSTSW and its
    high byte (offset+1) is loaded into AH before SAHF }
  1599. if current_settings.cputype < cpu_286 then
  1600. begin
  1601. tg.gettemp(current_asmdata.CurrAsmList,2,2,tt_normal,tmpref);
  1602. emit_ref(A_FSTSW,S_NO,tmpref);
  1603. cg.getcpuregister(current_asmdata.CurrAsmList,NR_AX);
  { temporarily address the high byte of the stored status word }
  1604. inc(tmpref.offset);
  1605. emit_ref_reg(A_MOV,S_B,tmpref,NR_AH);
  { restore the offset so the temp is released with its original reference }
  1606. dec(tmpref.offset);
  1607. emit_none(A_SAHF,S_NO);
  1608. cg.ungetcpuregister(current_asmdata.CurrAsmList,NR_AX);
  1609. tg.ungettemp(current_asmdata.CurrAsmList,tmpref);
  1610. end
  1611. else
  1612. {$endif i8086}
  1613. begin
  { status word is stored directly into AX and copied to the flags with SAHF }
  1614. cg.getcpuregister(current_asmdata.CurrAsmList,NR_AX);
  1615. emit_reg(A_FNSTSW,S_NO,NR_AX);
  1616. emit_none(A_SAHF,S_NO);
  1617. cg.ungetcpuregister(current_asmdata.CurrAsmList,NR_AX);
  1618. end;
  { emit an explicit FWAIT when the cs_fpu_fwait local switch is active }
  1619. if cs_fpu_fwait in current_settings.localswitches then
  1620. current_asmdata.CurrAsmList.concat(Taicpu.Op_none(A_FWAIT,S_NO));
  1621. end
  1622. else
  1623. {$endif x86_64}
  1624. begin
  { FCOMIP path: always used on x86_64, elsewhere from cpu_Pentium2 on }
  1625. current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_FCOMIP,S_NO,NR_ST1,NR_ST0));
  1626. { fcomip pops only one fpu register }
  1627. current_asmdata.CurrAsmList.concat(taicpu.op_reg(A_FSTP,S_NO,NR_ST0));
  1628. tcgx86(cg).dec_fpu_stack;
  1629. tcgx86(cg).dec_fpu_stack;
  1630. end;
  { the comparison result lives in the cpu flags }
  1631. location_reset(location,LOC_FLAGS,OS_NO);
  1632. location.resflags:=getfpuresflags;
  1633. end;
  1634. {*****************************************************************************
  1635. Add64bit
  1636. *****************************************************************************}
  { 64 bit arithmetic: when cpu64bitalu is defined this is the same as the
    ordinal case; otherwise this class offers no implementation and raises
    internalerror(200402042). }
  procedure tx86addnode.second_add64bit;
    begin
{$ifndef cpu64bitalu}
      { must be implemented separate }
      internalerror(200402042);
{$else cpu64bitalu}
      second_addordinal;
{$endif cpu64bitalu}
    end;
  { 64 bit comparison: when cpu64bitalu is defined this is the same as the
    ordinal comparison; otherwise this class offers no implementation and
    raises internalerror(200402043). }
  procedure tx86addnode.second_cmp64bit;
    begin
{$ifndef cpu64bitalu}
      { must be implemented separate }
      internalerror(200402043);
{$else cpu64bitalu}
      second_cmpordinal;
{$endif cpu64bitalu}
    end;
  1655. {*****************************************************************************
  1656. AddOrdinal
  1657. *****************************************************************************}
  { Generates code for an ordinal addn, subn, muln, andn, orn or xorn.
    The node type is mapped to a generic opcode; for addn, subn and muln an
    overflow check is emitted when needoverflowcheck says so. Two code
    shapes exist:
    - a three operand form writing to a freshly set up result register
      (set_result_location_reg), chosen by the condition further below
    - a two operand form that re-uses the register of one operand as the
      destination; the result location is then a copy of left.location
    Any other node type raises internalerror(2015022501). }
  1658. procedure tx86addnode.second_addordinal;
  1659. var
  1660. opsize : tcgsize;
  1661. unsigned : boolean;
  1662. cgop : topcg;
  1663. checkoverflow : Boolean;
  1664. ovloc : tlocation;
  1665. tmpreg : TRegister;
  1666. begin
  1667. { determine if the operation will be unsigned; within this routine this only selects OP_MUL versus OP_IMUL }
  1668. unsigned:=not(is_signed(left.resultdef)) or
  1669. not(is_signed(right.resultdef));
  1670. { assume no overflow checking is required }
  1671. checkoverflow := false;
  1672. ovloc.loc:=LOC_VOID;
  { map the node type to the code generator opcode; only add, mul and sub
    are candidates for an overflow check }
  1673. case nodetype of
  1674. addn:
  1675. begin
  1676. cgop:=OP_ADD;
  1677. checkoverflow:=true;
  1678. end;
  1679. xorn :
  1680. begin
  1681. cgop:=OP_XOR;
  1682. end;
  1683. orn :
  1684. begin
  1685. cgop:=OP_OR;
  1686. end;
  1687. andn:
  1688. begin
  1689. cgop:=OP_AND;
  1690. end;
  1691. muln:
  1692. begin
  1693. checkoverflow:=true;
  1694. if unsigned then
  1695. cgop:=OP_MUL
  1696. else
  1697. cgop:=OP_IMUL;
  1698. end;
  1699. subn :
  1700. begin
  1701. checkoverflow:=true;
  1702. cgop:=OP_SUB;
  1703. end;
  1704. else
  1705. internalerror(2015022501);
  1706. end;
  { the check is only emitted if the operation can overflow AND overflow
    checking is actually requested for this node }
  1707. checkoverflow:=
  1708. checkoverflow and
  1709. needoverflowcheck;
  { the operation size is taken from the left operand, before the operands are evaluated }
  1710. opsize:=def_cgsize(left.resultdef);
  1711. pass_left_right;
  1712. { do we have to allocate a register? If yes, then three opcode instructions are better, however for sub three op code instructions
  1713. make no sense if right is a reference }
  { the three operand form is also taken for an addn whose operands are
    both registers or constants (last line of the condition) }
  1714. if ((left.location.loc<>LOC_REGISTER) and (right.location.loc<>LOC_REGISTER) and
  1715. ((nodetype<>subn) or not(right.location.loc in [LOC_REFERENCE,LOC_CREFERENCE])) and
  1716. { 3 op mul makes only sense if a constant is involved }
  1717. ((nodetype<>muln) or (left.location.loc=LOC_CONSTANT) or (right.location.loc=LOC_CONSTANT)
  1718. {$ifndef i8086}
  1719. or ((CPUX86_HAS_BMI2 in cpu_capabilities[current_settings.cputype]) and (not(needoverflowcheck))
  1720. )
  1721. {$endif i8086}
  1722. ) and
  1723. (not(nodetype in [orn,andn,xorn]))) or
  1724. ((nodetype=addn) and (left.location.loc in [LOC_REGISTER,LOC_CREGISTER,LOC_CONSTANT]) and (right.location.loc in [LOC_REGISTER,LOC_CREGISTER,LOC_CONSTANT])) then
  1725. begin
  1726. { allocate registers }
  1727. force_reg_left_right(false,true);
  1728. set_result_location_reg;
  1729. if nodetype<>subn then
  1730. begin
  { NR_DEFAULTFLAGS is allocated right before the operation whose overflow
    state is checked at the end of this routine }
  1731. if checkoverflow then
  1732. cg.a_reg_alloc(current_asmdata.CurrAsmList, NR_DEFAULTFLAGS);
  1733. if (right.location.loc<>LOC_CONSTANT) then
  1734. hlcg.a_op_reg_reg_reg_checkoverflow(current_asmdata.CurrAsmList,cgop,resultdef,
  1735. left.location.register,right.location.register,
  1736. location.register,checkoverflow,ovloc)
  1737. else
  1738. hlcg.a_op_const_reg_reg_checkoverflow(current_asmdata.CurrAsmList,cgop,resultdef,
  1739. right.location.value,left.location.register,
  1740. location.register,checkoverflow,ovloc);
  1741. end
  1742. else { subtract is a special case since it's not commutative }
  1743. begin
  { restore the source operand order before emitting the subtraction }
  1744. if (nf_swapped in flags) then
  1745. swapleftright;
  1746. if left.location.loc<>LOC_CONSTANT then
  1747. begin
  1748. if checkoverflow then
  1749. cg.a_reg_alloc(current_asmdata.CurrAsmList, NR_DEFAULTFLAGS);
  1750. if right.location.loc<>LOC_CONSTANT then
  1751. hlcg.a_op_reg_reg_reg_checkoverflow(current_asmdata.CurrAsmList,OP_SUB,resultdef,
  1752. right.location.register,left.location.register,
  1753. location.register,checkoverflow,ovloc)
  1754. else
  1755. hlcg.a_op_const_reg_reg_checkoverflow(current_asmdata.CurrAsmList,OP_SUB,resultdef,
  1756. right.location.value,left.location.register,
  1757. location.register,checkoverflow,ovloc);
  1758. end
  1759. else
  1760. begin
  { left is a constant: it is loaded into a temporary register first and
    that register takes the place of left.location.register }
  1761. tmpreg:=hlcg.getintregister(current_asmdata.CurrAsmList,resultdef);
  1762. hlcg.a_load_const_reg(current_asmdata.CurrAsmList,resultdef,
  1763. left.location.value,tmpreg);
  1764. if checkoverflow then
  1765. cg.a_reg_alloc(current_asmdata.CurrAsmList, NR_DEFAULTFLAGS);
  1766. hlcg.a_op_reg_reg_reg_checkoverflow(current_asmdata.CurrAsmList,OP_SUB,resultdef,
  1767. right.location.register,tmpreg,location.register,checkoverflow,ovloc);
  1768. end;
  1769. end
  1770. end
  1771. else
  1772. begin
  1773. { at least one location should be a register, if yes, try to re-use it, so we can try two operand opcodes }
  1774. if left.location.loc<>LOC_REGISTER then
  1775. begin
  1776. if right.location.loc<>LOC_REGISTER then
  1777. hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,left.resultdef,false)
  1778. else
  1779. begin
  { right already is a register: exchange the locations and record the swap }
  1780. location_swap(left.location,right.location);
  1781. toggleflag(nf_swapped);
  1782. end;
  1783. end;
  1784. { at this point, left.location.loc should be LOC_REGISTER }
  1785. if right.location.loc=LOC_REGISTER then
  1786. begin
  1787. if checkoverflow then
  1788. cg.a_reg_alloc(current_asmdata.CurrAsmList, NR_DEFAULTFLAGS);
  1789. { when swapped another result register }
  1790. if (nodetype=subn) and (nf_swapped in flags) then
  1791. begin
  { the operation is emitted with right.location.register as the second
    register operand; the locations are swapped back afterwards so that the
    location_copy from left.location below picks up that register }
  1792. cg.a_op_reg_reg(current_asmdata.CurrAsmList,cgop,opsize,
  1793. left.location.register,right.location.register);
  1794. location_swap(left.location,right.location);
  1795. toggleflag(nf_swapped);
  1796. end
  1797. else
  1798. cg.a_op_reg_reg(current_asmdata.CurrAsmList,cgop,opsize,
  1799. right.location.register,left.location.register);
  1800. end
  1801. else
  1802. begin
  1803. { right.location<>LOC_REGISTER }
  { subset locations are forced into a register before they are used as operand }
  1804. if right.location.loc in [LOC_CSUBSETREF,LOC_CSUBSETREG,LOC_SUBSETREF,LOC_SUBSETREG] then
  1805. hlcg.location_force_reg(current_asmdata.CurrAsmList,right.location,right.resultdef,left.resultdef,true);
  1806. if (nodetype=subn) and (nf_swapped in flags) then
  1807. begin
  { swapped subtraction: right is loaded into a new register which becomes
    left.location.register, and the previous left register (tmpreg) is the
    source operand of the operation }
  1808. tmpreg:=left.location.register;
  1809. left.location.register:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
  1810. cg.a_load_loc_reg(current_asmdata.CurrAsmList,opsize,right.location,left.location.register);
  1811. if checkoverflow then
  1812. cg.a_reg_alloc(current_asmdata.CurrAsmList, NR_DEFAULTFLAGS);
  1813. cg.a_op_reg_reg(current_asmdata.CurrAsmList,cgop,opsize,tmpreg,left.location.register);
  1814. end
  1815. else
  1816. begin
  1817. if checkoverflow then
  1818. cg.a_reg_alloc(current_asmdata.CurrAsmList, NR_DEFAULTFLAGS);
  1819. cg.a_op_loc_reg(current_asmdata.CurrAsmList,cgop,opsize,right.location,left.location.register);
  1820. end;
  1821. location_freetemp(current_asmdata.CurrAsmList,right.location);
  1822. end;
  { in the two operand form the result is whatever left.location now describes }
  1823. location_copy(location,left.location);
  1824. end;
  1825. { emit overflow check if required }
  1826. if checkoverflow then
  1827. cg.g_overflowcheck_loc(current_asmdata.CurrAsmList,Location,resultdef,ovloc);
  1828. end;
  { Generates code for a boolean operation.
    The inherited implementation is used in two situations: for an orn/andn
    node when full boolean evaluation is switched off or the node carries
    nf_short_bool, and for 64 bit operands. Everything else is treated like
    an ordinal operation. }
  procedure tx86addnode.second_addboolean;
    var
      short_eval : boolean;
    begin
      short_eval:=(nodetype in [orn,andn]) and
        ((nf_short_bool in flags) or
         not(cs_full_boolean_eval in current_settings.localswitches));
      { is_64bit is only consulted when short_eval is false }
      if short_eval or is_64bit(left.resultdef) then
        inherited second_addboolean
      else
        second_addordinal;
    end;
  { Generates code for an ordinal comparison; the result is returned as
    LOC_FLAGS with the condition from getresflags.
    A constant on the right compared against a memory reference on the left
    is emitted as a single CMP with an immediate and a memory operand. On
    x86_64 this shortcut is only taken for 64 bit operands when the constant
    lies within the longint range. In all other cases left is brought into
    a register and the generic code emitter is used. }
  procedure tx86addnode.second_cmpordinal;
    var
      cmpdef : tdef;
      cmpsize : tcgsize;
      is_unsigned,
      imm_vs_mem : boolean;
    begin
      { unsigned as soon as one of the operands is not signed }
      is_unsigned:=not(is_signed(left.resultdef) and is_signed(right.resultdef));
      cmpdef:=left.resultdef;
      cmpsize:=def_cgsize(cmpdef);
      pass_left_right;

      imm_vs_mem:=(right.location.loc=LOC_CONSTANT) and
        (left.location.loc in [LOC_REFERENCE,LOC_CREFERENCE]);
{$ifdef x86_64}
      { for 64 bit operands the constant has to fit into a longint }
      if imm_vs_mem and (cmpsize in [OS_64,OS_S64]) then
        imm_vs_mem:=(right.location.value>=low(longint)) and
          (right.location.value<=high(longint));
{$endif x86_64}

      if imm_vs_mem then
        begin
          emit_const_ref(A_CMP,TCGSize2Opsize[cmpsize],right.location.value,left.location.reference);
          location_freetemp(current_asmdata.CurrAsmList,left.location);
        end
      else
        begin
          left_must_be_reg(cmpdef,cmpsize,false);
          emit_generic_code(A_CMP,cmpsize,is_unsigned,false,false);
          location_freetemp(current_asmdata.CurrAsmList,right.location);
          location_freetemp(current_asmdata.CurrAsmList,left.location);
        end;

      location_reset(location,LOC_FLAGS,OS_NO);
      location.resflags:=getresflags(is_unsigned);
    end;
  { unit initialization: make tx86addnode the class stored in caddnode }
  1873. begin
  1874. caddnode:=tx86addnode;
  1875. end.