{
    Copyright (c) 1998-2002 by Florian Klaempfl

    Generate x86 code for math nodes

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

 ****************************************************************************
}
unit nx86mat;

{$i fpcdefs.inc}

interface

  uses
    node,ncgmat;

  type
    tx86unaryminusnode = class(tcgunaryminusnode)
{$ifdef SUPPORT_MMX}
      procedure second_mmx;override;
{$endif SUPPORT_MMX}
      procedure second_float;override;
      function pass_1:tnode;override;
    end;

    tx86notnode = class(tcgnotnode)
      procedure second_boolean;override;
{$ifdef SUPPORT_MMX}
      procedure second_mmx;override;
{$endif SUPPORT_MMX}
    end;

    tx86moddivnode = class(tcgmoddivnode)
      procedure pass_generate_code;override;
    end;

    tx86shlshrnode = class(tcgshlshrnode)
{$ifdef SUPPORT_MMX}
      procedure second_mmx;override;
{$endif SUPPORT_MMX}
    end;

implementation

  uses
    globtype,
    constexp,
    cutils,verbose,globals,
    symconst,symdef,
    aasmbase,aasmtai,aasmcpu,aasmdata,defutil,
    cgbase,pass_1,pass_2,
    ncon,
    cpubase,cpuinfo,
    cga,cgobj,hlcgobj,cgx86,cgutils,
    tgobj;

{*****************************************************************************
                           TX86UNARYMINUSNODE
*****************************************************************************}

    function tx86unaryminusnode.pass_1 : tnode;
      begin
        result:=nil;
        firstpass(left);
        if codegenerror then
          exit;

        if (left.resultdef.typ=floatdef) then
          begin
            if use_vectorfpu(left.resultdef) then
              expectloc:=LOC_MMREGISTER
            else
              expectloc:=LOC_FPUREGISTER;
          end
{$ifdef SUPPORT_MMX}
        else
          if (cs_mmx in current_settings.localswitches) and
             is_mmx_able_array(left.resultdef) then
            begin
              expectloc:=LOC_MMXREGISTER;
            end
{$endif SUPPORT_MMX}
        else
          inherited pass_1;
      end;

{$ifdef SUPPORT_MMX}
    procedure tx86unaryminusnode.second_mmx;
      var
        op : tasmop;
        hreg : tregister;
      begin
        op:=A_NONE;
        secondpass(left);
        location_reset(location,LOC_MMXREGISTER,OS_NO);
        hreg:=tcgx86(cg).getmmxregister(current_asmdata.CurrAsmList);
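        { MMX has no packed negate instruction, so -x is computed as 0-x:
          hreg is zeroed here and the operand is subtracted from it below. }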
        emit_reg_reg(A_PXOR,S_NO,hreg,hreg);
        case left.location.loc of
          LOC_MMXREGISTER:
            begin
              location.register:=left.location.register;
            end;
          LOC_CMMXREGISTER:
            begin
              location.register:=tcgx86(cg).getmmxregister(current_asmdata.CurrAsmList);
              emit_reg_reg(A_MOVQ,S_NO,left.location.register,location.register);
            end;
          LOC_REFERENCE,
          LOC_CREFERENCE:
            begin
              location.register:=tcgx86(cg).getmmxregister(current_asmdata.CurrAsmList);
              emit_ref_reg(A_MOVQ,S_NO,left.location.reference,location.register);
            end;
          else
            internalerror(200203225);
        end;
        if cs_mmx_saturation in current_settings.localswitches then
          case mmx_type(resultdef) of
            mmxs8bit:
              op:=A_PSUBSB;
            mmxu8bit:
              op:=A_PSUBUSB;
            mmxs16bit,mmxfixed16:
              op:=A_PSUBSW;
            mmxu16bit:
              op:=A_PSUBUSW;
            else
              ;
          end
        else
          case mmx_type(resultdef) of
            mmxs8bit,mmxu8bit:
              op:=A_PSUBB;
            mmxs16bit,mmxu16bit,mmxfixed16:
              op:=A_PSUBW;
            mmxs32bit,mmxu32bit:
              op:=A_PSUBD;
            else
              ;
          end;
        if op=A_NONE then
          internalerror(201408202);
        emit_reg_reg(op,S_NO,location.register,hreg);
        emit_reg_reg(A_MOVQ,S_NO,hreg,location.register);
      end;
{$endif SUPPORT_MMX}

    procedure tx86unaryminusnode.second_float;
      var
        l1 : TAsmLabel;
        href : treference;
        reg : tregister;
      begin
        secondpass(left);
        if expectloc=LOC_MMREGISTER then
          begin
            if cs_opt_fastmath in current_settings.optimizerswitches then
              begin
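                { With fastmath, -x is computed as 0-x; note that this maps
                  +0.0 to +0.0 instead of -0.0, which is why it is only done
                  when strict IEEE semantics are not required. }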
                if not(left.location.loc in [LOC_MMREGISTER,LOC_CMMREGISTER,LOC_CREFERENCE,LOC_REFERENCE]) then
                  hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,true);
                location_reset(location,LOC_MMREGISTER,def_cgsize(resultdef));
                location.register:=cg.getmmregister(current_asmdata.CurrAsmList,def_cgsize(resultdef));
                cg.a_opmm_reg_reg(current_asmdata.CurrAsmList,OP_XOR,location.size,location.register,location.register,nil);
                cg.a_opmm_loc_reg(current_asmdata.CurrAsmList,OP_SUB,location.size,left.location,location.register,mms_movescalar);
              end
            else
              begin
                location_reset(location,LOC_MMREGISTER,def_cgsize(resultdef));
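                { Exact negation flips only the sign bit: build an aligned
                  constant with just the sign bit set ($80000000 for single,
                  $8000000000000000 for double) and XOR the operand with it. }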
                current_asmdata.getglobaldatalabel(l1);
                new_section(current_asmdata.asmlists[al_typedconsts],sec_rodata_norel,l1.name,const_align(16));
                current_asmdata.asmlists[al_typedconsts].concat(Tai_label.Create(l1));
                case def_cgsize(resultdef) of
                  OS_F32:
                    current_asmdata.asmlists[al_typedconsts].concat(tai_const.create_32bit(longint(1 shl 31)));
                  OS_F64:
                    begin
                      current_asmdata.asmlists[al_typedconsts].concat(tai_const.create_32bit(0));
                      current_asmdata.asmlists[al_typedconsts].concat(tai_const.create_32bit(-(1 shl 31)));
                    end
                  else
                    internalerror(2004110215);
                end;
                reference_reset_symbol(href,l1,0,resultdef.alignment,[]);
                if UseAVX then
                  begin
                    if not(left.location.loc in [LOC_MMREGISTER,LOC_CMMREGISTER]) then
                      hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,true);
                    location.register:=cg.getmmregister(current_asmdata.CurrAsmList,def_cgsize(resultdef));
                    cg.a_opmm_ref_reg_reg(current_asmdata.CurrAsmList,OP_XOR,left.location.size,href,left.location.register,location.register,nil)
                  end
                else
                  begin
                    if not(left.location.loc=LOC_MMREGISTER) then
                      hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,false);
                    location.register:=left.location.register;
                    cg.a_opmm_ref_reg(current_asmdata.CurrAsmList,OP_XOR,left.location.size,href,location.register,mms_movescalar);
                  end;
              end;
          end
        else
          begin
            location_reset(location,LOC_FPUREGISTER,def_cgsize(resultdef));
            case left.location.loc of
              LOC_REFERENCE,
              LOC_CREFERENCE:
                begin
                  location.register:=NR_ST;
                  cg.a_loadfpu_ref_reg(current_asmdata.CurrAsmList,
                    left.location.size,location.size,
                    left.location.reference,location.register);
                  emit_none(A_FCHS,S_NO);
                end;
              LOC_FPUREGISTER,
              LOC_CFPUREGISTER:
                begin
                  { "load st,st" is ignored by the code generator }
                  cg.a_loadfpu_reg_reg(current_asmdata.CurrAsmList,left.location.size,location.size,left.location.register,NR_ST);
                  location.register:=NR_ST;
                  emit_none(A_FCHS,S_NO);
                end;
              else
                internalerror(200312241);
            end;
          end;
      end;

{*****************************************************************************
                               TX86NOTNODE
*****************************************************************************}

    procedure tx86notnode.second_boolean;
      var
        opsize : tcgsize;
{$if defined(cpu32bitalu) or defined(cpu16bitalu)}
        hreg : tregister;
{$endif}
      begin
        opsize:=def_cgsize(resultdef);

        secondpass(left);
        if not handle_locjump then
          begin
            case left.location.loc of
              LOC_FLAGS :
                begin
                  location_reset(location,LOC_FLAGS,OS_NO);
                  location.resflags:=left.location.resflags;
                  inverse_flags(location.resflags);
                end;
              LOC_CREFERENCE,
              LOC_REFERENCE:
                begin
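                  { On narrow ALUs a wide operand is tested for zero by
                    OR-ing its parts together: ZF is set afterwards iff
                    every part was zero. }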
{$if defined(cpu32bitalu)}
                  if is_64bit(resultdef) then
                    begin
                      hreg:=cg.GetIntRegister(current_asmdata.CurrAsmList,OS_32);
                      tcgx86(cg).make_simple_ref(current_asmdata.CurrAsmList,left.location.reference);
                      cg.a_load_ref_reg(current_asmdata.CurrAsmList,OS_32,OS_32,left.location.reference,hreg);
                      inc(left.location.reference.offset,4);
                      cg.a_reg_alloc(current_asmdata.CurrAsmList,NR_DEFAULTFLAGS);
                      cg.a_op_ref_reg(current_asmdata.CurrAsmList,OP_OR,OS_32,left.location.reference,hreg);
                    end
                  else
{$elseif defined(cpu16bitalu)}
                  if is_64bit(resultdef) then
                    begin
                      hreg:=cg.GetIntRegister(current_asmdata.CurrAsmList,OS_16);
                      tcgx86(cg).make_simple_ref(current_asmdata.CurrAsmList,left.location.reference);
                      cg.a_load_ref_reg(current_asmdata.CurrAsmList,OS_16,OS_16,left.location.reference,hreg);
                      inc(left.location.reference.offset,2);
                      cg.a_reg_alloc(current_asmdata.CurrAsmList,NR_DEFAULTFLAGS);
                      cg.a_op_ref_reg(current_asmdata.CurrAsmList,OP_OR,OS_16,left.location.reference,hreg);
                      inc(left.location.reference.offset,2);
                      cg.a_op_ref_reg(current_asmdata.CurrAsmList,OP_OR,OS_16,left.location.reference,hreg);
                      inc(left.location.reference.offset,2);
                      cg.a_op_ref_reg(current_asmdata.CurrAsmList,OP_OR,OS_16,left.location.reference,hreg);
                    end
                  else if is_32bit(resultdef) then
                    begin
                      hreg:=cg.GetIntRegister(current_asmdata.CurrAsmList,OS_16);
                      tcgx86(cg).make_simple_ref(current_asmdata.CurrAsmList,left.location.reference);
                      cg.a_load_ref_reg(current_asmdata.CurrAsmList,OS_16,OS_16,left.location.reference,hreg);
                      inc(left.location.reference.offset,2);
                      cg.a_reg_alloc(current_asmdata.CurrAsmList,NR_DEFAULTFLAGS);
                      cg.a_op_ref_reg(current_asmdata.CurrAsmList,OP_OR,OS_16,left.location.reference,hreg);
                    end
                  else
{$endif}
                    begin
                      cg.a_reg_alloc(current_asmdata.CurrAsmList,NR_DEFAULTFLAGS);
                      emit_const_ref(A_CMP,TCGSize2Opsize[opsize],0,left.location.reference);
                    end;
                  location_reset(location,LOC_FLAGS,OS_NO);
                  location.resflags:=F_E;
                end;
              LOC_CONSTANT,
              LOC_REGISTER,
              LOC_CREGISTER,
              LOC_SUBSETREG,
              LOC_CSUBSETREG,
              LOC_SUBSETREF,
              LOC_CSUBSETREF :
                begin
{$if defined(cpu32bitalu)}
                  if is_64bit(resultdef) then
                    begin
                      hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,resultdef,false);
                      cg.a_reg_alloc(current_asmdata.CurrAsmList,NR_DEFAULTFLAGS);
                      emit_reg_reg(A_OR,S_L,left.location.register64.reghi,left.location.register64.reglo);
                    end
                  else
{$elseif defined(cpu16bitalu)}
                  if is_64bit(resultdef) then
                    begin
                      hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,resultdef,false);
                      cg.a_reg_alloc(current_asmdata.CurrAsmList,NR_DEFAULTFLAGS);
                      emit_reg_reg(A_OR,S_W,cg.GetNextReg(left.location.register64.reghi),left.location.register64.reghi);
                      emit_reg_reg(A_OR,S_W,cg.GetNextReg(left.location.register64.reglo),left.location.register64.reglo);
                      emit_reg_reg(A_OR,S_W,left.location.register64.reghi,left.location.register64.reglo);
                    end
                  else if is_32bit(resultdef) then
                    begin
                      hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,resultdef,false);
                      cg.a_reg_alloc(current_asmdata.CurrAsmList,NR_DEFAULTFLAGS);
                      emit_reg_reg(A_OR,S_L,cg.GetNextReg(left.location.register),left.location.register);
                    end
                  else
{$endif}
                    begin
                      hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,resultdef,true);
                      cg.a_reg_alloc(current_asmdata.CurrAsmList,NR_DEFAULTFLAGS);
                      emit_reg_reg(A_TEST,TCGSize2Opsize[opsize],left.location.register,left.location.register);
                    end;
                  location_reset(location,LOC_FLAGS,OS_NO);
                  location.resflags:=F_E;
                end;
              else
                internalerror(200203224);
            end;
          end;
      end;

{$ifdef SUPPORT_MMX}
    procedure tx86notnode.second_mmx;
      var
        hreg,r : Tregister;
      begin
        secondpass(left);
        location_reset(location,LOC_MMXREGISTER,OS_NO);
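        { MMX has no packed NOT, so the operand is inverted by XOR-ing it
          with an all-ones mask; since MOVD loads only 32 bits, the mask is
          applied to the low dword, shifted up with PSLLQ, and applied again
          to the high dword. }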
        r:=cg.getintregister(current_asmdata.CurrAsmList,OS_INT);
        emit_const_reg(A_MOV,S_L,longint($ffffffff),r);
        { load operand }
        case left.location.loc of
          LOC_MMXREGISTER:
            location_copy(location,left.location);
          LOC_CMMXREGISTER:
            begin
              location.register:=tcgx86(cg).getmmxregister(current_asmdata.CurrAsmList);
              emit_reg_reg(A_MOVQ,S_NO,left.location.register,location.register);
            end;
          LOC_REFERENCE,
          LOC_CREFERENCE:
            begin
              location.register:=tcgx86(cg).getmmxregister(current_asmdata.CurrAsmList);
              emit_ref_reg(A_MOVQ,S_NO,left.location.reference,location.register);
            end;
          else
            internalerror(2019050906);
        end;
        { load mask }
        hreg:=tcgx86(cg).getmmxregister(current_asmdata.CurrAsmList);
        emit_reg_reg(A_MOVD,S_NO,r,hreg);
        { lower 32 bit }
        emit_reg_reg(A_PXOR,S_NO,hreg,location.register);
        { shift mask }
        emit_const_reg(A_PSLLQ,S_B,32,hreg);
        { higher 32 bit }
        emit_reg_reg(A_PXOR,S_NO,hreg,location.register);
      end;
{$endif SUPPORT_MMX}

{*****************************************************************************
                             TX86MODDIVNODE
*****************************************************************************}

    procedure tx86moddivnode.pass_generate_code;
      var
        hreg1,hreg2,hreg3,hreg4,rega,regd,tempreg : Tregister;
        power : longint;
        instr : TAiCpu;
        op : Tasmop;
        cgsize : TCgSize;
        opsize : topsize;
        e,sm : aint;
        d,m : aword;
        m_add,invertsign : boolean;
        s : byte;

      label
        DefaultDiv;

{$ifndef i8086}
      procedure DoBMI2ReciprocalDivision;
        var
          exp_regd : Tregister;
          exp_opsize : topsize;
          DoMod : Boolean;
          SubSize : TSubRegister;
          divsize : Byte;
        begin
          DoMod:=(nodetype=modn);

          { Extend 32-bit divides to 64-bit registers and 16-bit
            divides to 32-bit registers. Because the domain of
            the left input is only up to 2^(X/2 - 1) - 1, (i.e.
            2^31 - 1 for 64-bit and 2^15 - 1 for 32-bit), a much
            larger error in the reciprocal is permitted. }
          if (resultdef.size <= {$ifdef x86_64}4{$else x86_64}2{$endif x86_64}) then
            begin
{$ifdef x86_64}
              if resultdef.size=4 then
                divsize:=64
              else
{$endif x86_64}
                divsize:=32;
              calc_divconst_magic_unsigned(divsize,d,m,m_add,s);
              { Should never have a zero shift and a magic add together }
              if (s=0) and m_add then
                InternalError(2021090203);
              { Extend the input and output registers (the peephole optimizer
                should help clean up unnecessary MOVZX instructions) }
              hreg3:=hreg1;
              case resultdef.size of
{$ifdef x86_64}
                4:
                  begin
                    SubSize:=R_SUBQ;
                    setsubreg(hreg3,R_SUBQ);
                    { Make sure the upper 32 bits are zero; the peephole
                      optimizer will remove this instruction via MovAnd2Mov
                      if it's not needed }
                    emit_const_reg(A_AND,S_L,$FFFFFFFF,hreg1);
                    exp_regd:=NR_RDX;
                    exp_opsize:=S_Q;
                    if m_add then
                      { Append 1 to the tail end of the result }
                      m:=(m shr s) or ($8000000000000000 shr (s-1))
                    else
                      m:=m shr s;
                  end;
{$endif x86_64}
                1,2:
                  begin
                    { MULX doesn't have a 16-bit version }
                    SubSize:=R_SUBD;
                    setsubreg(hreg3,R_SUBD);
                    if resultdef.size=1 then
                      exp_opsize:=S_BL
                    else
                      exp_opsize:=S_WL;
                    emit_reg_reg(A_MOVZX,exp_opsize,hreg1,hreg3);
                    exp_regd:=NR_EDX;
                    exp_opsize:=S_L;
                    if m_add then
                      { Append 1 to the tail end of the result }
                      m:=(m shr s) or ($80000000 shr (s-1))
                    else
                      m:=m shr s;
                  end;
                else
                  InternalError(2021090211);
              end;
              Inc(m);
              cg.getcpuregister(current_asmdata.CurrAsmList,exp_regd);
              emit_const_reg(A_MOV,exp_opsize,aint(m),exp_regd);
              hreg2:=cg.getintregister(current_asmdata.CurrAsmList,cgsize);
              hreg4:=hreg2;
              setsubreg(hreg4,SubSize);
              cg.ungetcpuregister(current_asmdata.CurrAsmList,exp_regd);
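              { MULX multiplies its source by the implicit (E/R)DX register
                without affecting flags; when both destination operands are
                the same register, that register receives the high half of
                the product, which is all the reciprocal method needs. }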
              emit_reg_reg_reg(A_MULX,exp_opsize,hreg3,hreg4,hreg4);
            end
          else
            begin
              calc_divconst_magic_unsigned(resultdef.size*8,d,m,m_add,s);
              { Should never have a zero shift and a magic add together }
              if (s=0) and m_add then
                InternalError(2021090204);
              cg.getcpuregister(current_asmdata.CurrAsmList,regd);
              emit_const_reg(A_MOV,opsize,aint(m),regd);
              hreg2:=cg.getintregister(current_asmdata.CurrAsmList,cgsize);
              cg.ungetcpuregister(current_asmdata.CurrAsmList,regd);
              emit_reg_reg_reg(A_MULX,opsize,hreg1,hreg2,hreg2);
              if m_add then
                begin
                  { addition can overflow, shift first bit considering carry,
                    then shift remaining bits in regular way. }
                  cg.a_reg_alloc(current_asmdata.CurrAsmList,NR_DEFAULTFLAGS);
                  emit_reg_reg(A_ADD,opsize,hreg1,hreg2);
                  emit_const_reg(A_RCR,opsize,1,hreg2);
                  cg.a_reg_dealloc(current_asmdata.CurrAsmList,NR_DEFAULTFLAGS);
                  dec(s);
                end;
              if s<>0 then
                emit_const_reg(A_SHR,opsize,aint(s),hreg2);
            end;
          if DoMod then
            begin
              { Now multiply the quotient by the original denominator and
                subtract the product from the original numerator to get
                the remainder. }
{$ifdef x86_64}
              if (cgsize in [OS_64,OS_S64]) and (d>$7FFFFFFF) then
                { Cannot use 64-bit constants in IMUL }
                begin
                  hreg4:=cg.getintregister(current_asmdata.CurrAsmList,cgsize);
                  emit_const_reg(A_MOV,opsize,aint(d),hreg4);
                  emit_reg_reg(A_IMUL,opsize,hreg4,hreg2);
                end
              else
{$endif x86_64}
                emit_const_reg(A_IMUL,opsize,aint(d),hreg2);
              emit_reg_reg(A_SUB,opsize,hreg2,hreg1);
              location.register:=hreg1;
            end
          else
            location.register:=hreg2;
        end;
{$endif not i8086}

      procedure DoUnsignedReciprocalDivision;
        var
          exp_rega,exp_regd : Tregister;
          exp_opsize : topsize;
          DoMod : Boolean;
        begin
{$ifndef i8086}
          if (CPUX86_HAS_BMI2 in cpu_capabilities[current_settings.cputype]) then
            begin
              { If BMI2 is available, use more efficient instructions }
              DoBMI2ReciprocalDivision;
              Exit;
            end;
{$endif not i8086}
          DoMod:=(nodetype=modn);

          { Extend 32-bit divides to 64-bit registers and 16-bit
            divides to 32-bit registers. Because the domain of
            the left input is only up to 2^(X/2 - 1) - 1, (i.e.
            2^31 - 1 for 64-bit and 2^15 - 1 for 32-bit), a much
            larger error in the reciprocal is permitted. }
          if (resultdef.size <= {$ifdef x86_64}4{$else x86_64}2{$endif x86_64}) then
            begin
              calc_divconst_magic_unsigned(resultdef.size*2*8,d,m,m_add,s);
              { Should never have a zero shift and a magic add together }
              if (s=0) and m_add then
                InternalError(2021090201);
              { Extend the input register (the peephole optimizer should
                help clean up unnecessary MOVZX instructions) }
              hreg3:=hreg1;
              case resultdef.size of
{$ifdef x86_64}
                4:
                  begin
                    setsubreg(hreg3,R_SUBQ);
                    { Make sure the upper 32 bits are zero; the peephole
                      optimizer will remove this instruction via MovAnd2Mov
                      if it's not needed }
                    emit_const_reg(A_AND,S_L,$FFFFFFFF,hreg1);
                    exp_rega:=NR_RAX;
                    exp_regd:=NR_RDX;
                    exp_opsize:=S_Q;
                    if m_add then
                      { Append 1 to the tail end of the result }
                      m:=(m shr s) or ($8000000000000000 shr (s-1))
                    else
                      m:=m shr s;
                  end;
{$endif x86_64}
                2:
                  begin
                    setsubreg(hreg3,R_SUBD);
                    emit_reg_reg(A_MOVZX,S_WL,hreg1,hreg3);
                    exp_rega:=NR_EAX;
                    exp_regd:=NR_EDX;
                    exp_opsize:=S_L;
                    if m_add then
                      { Append 1 to the tail end of the result }
                      m:=(m shr s) or ($80000000 shr (s-1))
                    else
                      m:=m shr s;
                  end;
                1:
                  begin
                    setsubreg(hreg3,R_SUBW);
                    emit_reg_reg(A_MOVZX,S_BW,hreg1,hreg3);
                    exp_rega:=NR_AX;
                    exp_regd:=NR_DX;
                    regd:=NR_DL; { We need to change this from AH }
                    exp_opsize:=S_W;
                    if m_add then
                      { Append 1 to the tail end of the result }
                      m:=(m shr s) or ($8000 shr (s-1))
                    else
                      m:=m shr s;
                  end;
                else
                  InternalError(2021090210);
              end;
              Inc(m);
              cg.getcpuregister(current_asmdata.CurrAsmList,exp_rega);
              emit_const_reg(A_MOV,exp_opsize,aint(m),exp_rega);
              cg.getcpuregister(current_asmdata.CurrAsmList,exp_regd);
              emit_reg(A_MUL,exp_opsize,hreg3);
              cg.ungetcpuregister(current_asmdata.CurrAsmList,exp_rega);
              if DoMod then
                begin
                  hreg2:=cg.getintregister(current_asmdata.CurrAsmList,cgsize);
                  emit_reg_reg(A_MOV,opsize,hreg1,hreg2);
                end;
            end
          else
            begin
              calc_divconst_magic_unsigned(resultdef.size*8,d,m,m_add,s);
              { Should never have a zero shift and a magic add together }
              if (s=0) and m_add then
                InternalError(2021090202);
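              { e.g. for d=10 at 32 bits the standard magic-number algorithm
                yields m=$CCCCCCCD, m_add=false, s=3, so n div 10 is computed
                as the high dword of n*$CCCCCCCD, shifted right by 3. }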
              cg.getcpuregister(current_asmdata.CurrAsmList,rega);
              emit_const_reg(A_MOV,opsize,aint(m),rega);
              cg.getcpuregister(current_asmdata.CurrAsmList,regd);
              emit_reg(A_MUL,opsize,hreg1);
              cg.ungetcpuregister(current_asmdata.CurrAsmList,rega);
              if DoMod then
                begin
                  hreg2:=cg.getintregister(current_asmdata.CurrAsmList,cgsize);
                  emit_reg_reg(A_MOV,opsize,hreg1,hreg2);
                end;
              if m_add then
                begin
                  { addition can overflow, shift first bit considering carry,
                    then shift remaining bits in regular way. }
                  cg.a_reg_alloc(current_asmdata.CurrAsmList,NR_DEFAULTFLAGS);
                  emit_reg_reg(A_ADD,opsize,hreg1,regd);
                  emit_const_reg(A_RCR,opsize,1,regd);
                  cg.a_reg_dealloc(current_asmdata.CurrAsmList,NR_DEFAULTFLAGS);
                  dec(s);
                end;
              if s<>0 then
                emit_const_reg(A_SHR,opsize,aint(s),regd);
            end;
          if DoMod then
            begin
              { Now multiply the quotient by the original denominator and
                subtract the product from the original numerator to get
                the remainder. }
{$ifdef x86_64}
              if (cgsize in [OS_64,OS_S64]) and (d>$7FFFFFFF) then
                { Cannot use 64-bit constants in IMUL }
                begin
                  hreg3:=cg.getintregister(current_asmdata.CurrAsmList,cgsize);
                  emit_const_reg(A_MOV,opsize,aint(d),hreg3);
                  emit_reg_reg(A_IMUL,opsize,hreg3,regd);
                end
              else
{$endif x86_64}
                emit_const_reg(A_IMUL,opsize,aint(d),regd);
              emit_reg_reg(A_SUB,opsize,regd,hreg2);
            end;
          cg.ungetcpuregister(current_asmdata.CurrAsmList,regd);
          if not DoMod then
            begin
              hreg2:=cg.getintregister(current_asmdata.CurrAsmList,cgsize);
              cg.a_load_reg_reg(current_asmdata.CurrAsmList,cgsize,cgsize,regd,hreg2);
            end;
          location.register:=hreg2;
        end;

      begin
        secondpass(left);
        if codegenerror then
          exit;
        secondpass(right);
        if codegenerror then
          exit;

        { put numerator in register }
        cgsize:=def_cgsize(resultdef);
        opsize:=TCGSize2OpSize[cgsize];
        rega:=newreg(R_INTREGISTER,RS_EAX,cgsize2subreg(R_INTREGISTER,cgsize));
        if cgsize in [OS_8,OS_S8] then
          regd:=NR_AH
        else
          regd:=newreg(R_INTREGISTER,RS_EDX,cgsize2subreg(R_INTREGISTER,cgsize));
        location_reset(location,LOC_REGISTER,cgsize);
        hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,resultdef,false);
        hreg1:=left.location.register;

        if (nodetype=divn) and (right.nodetype=ordconstn) then
          begin
            if isabspowerof2(tordconstnode(right).value,power) then
              begin
                { for signed numbers, the numerator must be adjusted before the
                  shift instruction, but not with unsigned numbers! Otherwise,
                  "Cardinal($ffffffff) div 16" overflows! (JM) }
                if is_signed(left.resultdef) then
                  begin
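                    { A negative numerator must be biased by (2^power)-1 before
                      the arithmetic shift so the quotient rounds toward zero:
                      e.g. (-7) div 4 becomes (-7+3) sar 2 = -1 rather than -2. }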
                    invertsign:=tordconstnode(right).value<0;
                    { use a sequence without jumps, saw this in
                      comp.compilers (JM) }
                    { no jumps, but more operations }
                    hreg2:=cg.getintregister(current_asmdata.CurrAsmList,cgsize);
                    emit_reg_reg(A_MOV,opsize,hreg1,hreg2);
                    if power=1 then
                      begin
                        { If the left value is negative, hreg2=(1 shl power)-1=1, otherwise 0. }
                        cg.a_op_const_reg(current_asmdata.CurrAsmList,OP_SHR,cgsize,resultdef.size*8-1,hreg2);
                      end
                    else
                      begin
                        { If the left value is negative, hreg2=$ffffffff, otherwise 0. }
                        cg.a_op_const_reg(current_asmdata.CurrAsmList,OP_SAR,cgsize,resultdef.size*8-1,hreg2);
                        { If negative, hreg2=(1 shl power)-1, otherwise 0. }
                        { (don't use emit_const_reg, because if value>high(longint)
                          then it must first be loaded into a register) }
                        cg.a_op_const_reg(current_asmdata.CurrAsmList,OP_AND,cgsize,(aint(1) shl power)-1,hreg2);
                      end;
                    { add to the left value }
                    emit_reg_reg(A_ADD,opsize,hreg2,hreg1);
                    { do the shift }
                    cg.a_op_const_reg(current_asmdata.CurrAsmList,OP_SAR,cgsize,power,hreg1);
                    if invertsign then
                      emit_reg(A_NEG,opsize,hreg1);
                  end
                else
                  cg.a_op_const_reg(current_asmdata.CurrAsmList,OP_SHR,cgsize,power,hreg1);
                location.register:=hreg1;
              end
            else
              begin
                if is_signed(left.resultdef) then
                  begin
                    e:=tordconstnode(right).value.svalue;
                    calc_divconst_magic_signed(resultdef.size*8,e,sm,s);
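                    { e.g. for e=7 at 32 bits the signed magic constant is
                      sm=$92492493 (negative as a signed value) with s=2;
                      since sm<0 here, the dividend is added back to the high
                      half of the product below. }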
                    cg.getcpuregister(current_asmdata.CurrAsmList,rega);
                    emit_const_reg(A_MOV,opsize,sm,rega);
                    cg.getcpuregister(current_asmdata.CurrAsmList,regd);
                    emit_reg(A_IMUL,opsize,hreg1);
                    { only the high half of result is used }
                    cg.ungetcpuregister(current_asmdata.CurrAsmList,rega);
                    { add or subtract dividend }
                    if (e>0) and (sm<0) then
                      emit_reg_reg(A_ADD,opsize,hreg1,regd)
                    else if (e<0) and (sm>0) then
                      emit_reg_reg(A_SUB,opsize,hreg1,regd);
                    { shift if necessary }
                    if (s<>0) then
                      emit_const_reg(A_SAR,opsize,s,regd);
                    { extract and add the sign bit }
                    if (e<0) then
                      emit_reg_reg(A_MOV,opsize,regd,hreg1);
                    { if e>=0, hreg1 still contains dividend }
                    emit_const_reg(A_SHR,opsize,left.resultdef.size*8-1,hreg1);
                    emit_reg_reg(A_ADD,opsize,hreg1,regd);
                    cg.ungetcpuregister(current_asmdata.CurrAsmList,regd);
                    location.register:=cg.getintregister(current_asmdata.CurrAsmList,cgsize);
                    cg.a_load_reg_reg(current_asmdata.CurrAsmList,cgsize,cgsize,regd,location.register)
                  end
                else
                  begin
                    d:=tordconstnode(right).value.uvalue;
                    if d>=aword(1) shl (left.resultdef.size*8-1) then
                      begin
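                        { When d has its top bit set, the quotient can only
                          be 0 or 1, so it is computed directly as ord(n>=d). }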
                        location.register:=cg.getintregister(current_asmdata.CurrAsmList,cgsize);
                        { Ensure that the whole register is 0, since SETcc only sets the lowest byte }
                        { If the operands are 64 bits, this XOR routine will be shrunk by the
                          peephole optimizer. [Kit] }
                        emit_reg_reg(A_XOR,opsize,location.register,location.register);
                        if (cgsize in [OS_64,OS_S64]) then
                          { Cannot use 64-bit constants in CMP }
                          begin
                            hreg2:=cg.getintregister(current_asmdata.CurrAsmList,cgsize);
                            emit_const_reg(A_MOV,opsize,aint(d),hreg2);
                            cg.a_reg_alloc(current_asmdata.CurrAsmList,NR_DEFAULTFLAGS);
                            emit_reg_reg(A_CMP,opsize,hreg2,hreg1);
                          end
                        else
                          begin
                            cg.a_reg_alloc(current_asmdata.CurrAsmList,NR_DEFAULTFLAGS);
                            emit_const_reg(A_CMP,opsize,aint(d),hreg1);
                          end;
                        { NOTE: SBB and SETAE are both 3 bytes long without the REX prefix,
                          both use an ALU for their execution and take a single cycle to
                          run. The only difference is that SETAE does not modify the flags,
                          allowing for some possible reuse. [Kit] }
                        { Emit a SETcc instruction that depends on the carry bit being zero,
                          that is, the numerator is greater than or equal to the denominator. }
                        tempreg:=cg.makeregsize(current_asmdata.CurrAsmList,location.register,OS_8);
                        instr:=TAiCpu.op_reg(A_SETcc,S_B,tempreg);
                        instr.condition:=C_AE;
                        current_asmdata.CurrAsmList.concat(instr);
                        cg.a_reg_dealloc(current_asmdata.CurrAsmList,NR_DEFAULTFLAGS);
                      end
                    else
                      DoUnsignedReciprocalDivision;
                  end;
              end;
          end
        else if (nodetype=modn) and (right.nodetype=ordconstn) and not(is_signed(left.resultdef)) then
          begin
            { unsigned modulus by a (+/-)power-of-2 constant? }
            if isabspowerof2(tordconstnode(right).value,power) then
              begin
                emit_const_reg(A_AND,opsize,(aint(1) shl power)-1,hreg1);
                location.register:=hreg1;
              end
            else
              begin
                d:=tordconstnode(right).value.uvalue;
                if d>=aword(1) shl (left.resultdef.size*8-1) then
                  begin
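                    { Likewise the remainder is either n or n-d here; -d is
                      added via CMOV when n>=d, avoiding a conditional branch. }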
                    if not (CPUX86_HAS_CMOV in cpu_capabilities[current_settings.cputype]) then
                      goto DefaultDiv;
                    location.register:=cg.getintregister(current_asmdata.CurrAsmList,cgsize);
                    hreg3:=cg.getintregister(current_asmdata.CurrAsmList,cgsize);
                    m:=aword(-aint(d)); { Two's complement of d }
                    if (cgsize in [OS_64,OS_S64]) then
                      { Cannot use 64-bit constants in CMP }
                      begin
                        hreg2:=cg.getintregister(current_asmdata.CurrAsmList,cgsize);
                        emit_const_reg(A_MOV,opsize,aint(d),hreg2);
                        emit_const_reg(A_MOV,opsize,aint(m),hreg3);
                        emit_reg_reg(A_XOR,opsize,location.register,location.register);
                        cg.a_reg_alloc(current_asmdata.CurrAsmList,NR_DEFAULTFLAGS);
                        emit_reg_reg(A_CMP,opsize,hreg2,hreg1);
                      end
                    else
                      begin
                        emit_const_reg(A_MOV,opsize,aint(m),hreg3);
                        emit_reg_reg(A_XOR,opsize,location.register,location.register);
                        cg.a_reg_alloc(current_asmdata.CurrAsmList,NR_DEFAULTFLAGS);
                        emit_const_reg(A_CMP,opsize,aint(d),hreg1);
                      end;
                    { Emit a conditional move that depends on the carry flag being zero,
                      that is, the comparison result is above or equal }
                    instr:=TAiCpu.op_reg_reg(A_CMOVcc,opsize,hreg3,location.register);
                    instr.condition:=C_AE;
                    current_asmdata.CurrAsmList.concat(instr);
                    cg.a_reg_dealloc(current_asmdata.CurrAsmList,NR_DEFAULTFLAGS);
                    emit_reg_reg(A_ADD,opsize,hreg1,location.register);
                  end
                else
                  { Convert the division to a multiplication }
                  DoUnsignedReciprocalDivision;
              end;
          end
        else if (nodetype=modn) and (right.nodetype=ordconstn) and (is_signed(left.resultdef)) and
            isabspowerof2(tordconstnode(right).value,power) then
          begin
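            { Signed n mod 2^power is computed branchlessly as
              n - ((n + bias) and not((2^power)-1)), where bias is (2^power)-1
              for negative n and 0 otherwise; the remainder keeps the sign of
              the dividend. }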
            hreg2:=cg.getintregister(current_asmdata.CurrAsmList,cgsize);
            if power=1 then
              cg.a_op_const_reg_reg(current_asmdata.CurrAsmList,OP_SHR,cgsize,resultdef.size*8-power,hreg1,hreg2)
            else
              begin
                cg.a_op_const_reg_reg(current_asmdata.CurrAsmList,OP_SAR,cgsize,resultdef.size*8-1,hreg1,hreg2);
                cg.a_op_const_reg_reg(current_asmdata.CurrAsmList,OP_SHR,cgsize,resultdef.size*8-power,hreg2,hreg2);
              end;
            emit_reg_reg(A_ADD,opsize,hreg1,hreg2);
            cg.a_op_const_reg(current_asmdata.CurrAsmList,OP_AND,cgsize,not((aint(1) shl power)-1),hreg2);
            emit_reg_reg(A_SUB,opsize,hreg2,hreg1);
            location.register:=hreg1;
          end
        else
          begin
          DefaultDiv:
            { Load the numerator into rega; the denominator is handled below. }
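            { DIV/IDIV divide the double-width value in (E/R)DX:(E/R)AX by
              the operand, leaving the quotient in (E/R)AX and the remainder
              in (E/R)DX (AH for 8-bit operands). }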
            cg.getcpuregister(current_asmdata.CurrAsmList,rega);
            emit_reg_reg(A_MOV,opsize,hreg1,rega);
            cg.getcpuregister(current_asmdata.CurrAsmList,regd);
            { Sign extension depends on the left type. }
            if is_signed(left.resultdef) then
              case left.resultdef.size of
{$ifdef x86_64}
                8:
                  emit_none(A_CQO,S_NO);
{$endif x86_64}
                4:
                  emit_none(A_CDQ,S_NO);
                else
                  internalerror(2013102704);
              end
            else
              emit_reg_reg(A_XOR,opsize,regd,regd);

            { Division depends on the result type }
            if is_signed(resultdef) then
              op:=A_IDIV
            else
              op:=A_DIV;

            if right.location.loc in [LOC_REFERENCE,LOC_CREFERENCE] then
              emit_ref(op,opsize,right.location.reference)
            else if right.location.loc in [LOC_REGISTER,LOC_CREGISTER] then
              emit_reg(op,opsize,right.location.register)
            else
              begin
                hreg1:=cg.getintregister(current_asmdata.CurrAsmList,right.location.size);
                hlcg.a_load_loc_reg(current_asmdata.CurrAsmList,right.resultdef,right.resultdef,right.location,hreg1);
                emit_reg(op,opsize,hreg1);
              end;

            { Copy the result into a new register. Release R/EAX & R/EDX. }
            cg.ungetcpuregister(current_asmdata.CurrAsmList,regd);
            cg.ungetcpuregister(current_asmdata.CurrAsmList,rega);
            location.register:=cg.getintregister(current_asmdata.CurrAsmList,cgsize);
            if nodetype=divn then
              cg.a_load_reg_reg(current_asmdata.CurrAsmList,cgsize,cgsize,rega,location.register)
            else
              cg.a_load_reg_reg(current_asmdata.CurrAsmList,cgsize,cgsize,regd,location.register);
          end;
      end;

{*****************************************************************************
                             TX86SHLSHRNODE
*****************************************************************************}

{$ifdef SUPPORT_MMX}
    procedure tx86shlshrnode.second_mmx;
      var
        op : TAsmOp;
        mmxbase : tmmxtype;
        hregister : tregister;
      begin
        secondpass(left);
        if codegenerror then
          exit;
        secondpass(right);
        if codegenerror then
          exit;

        op:=A_NOP;
        mmxbase:=mmx_type(left.resultdef);
        location_reset(location,LOC_MMXREGISTER,def_cgsize(resultdef));
        case nodetype of
          shrn :
            case mmxbase of
              mmxs16bit,mmxu16bit,mmxfixed16:
                op:=A_PSRLW;
              mmxs32bit,mmxu32bit:
                op:=A_PSRLD;
              mmxs64bit,mmxu64bit:
                op:=A_PSRLQ;
              else
                Internalerror(2018022504);
            end;
          shln :
            case mmxbase of
              mmxs16bit,mmxu16bit,mmxfixed16:
                op:=A_PSLLW;
              mmxs32bit,mmxu32bit:
                op:=A_PSLLD;
              mmxs64bit,mmxu64bit:
                op:=A_PSLLQ;
              else
                Internalerror(2018022503);
            end;
          else
            internalerror(2018022502);
        end;

        { left and right not in registers? }
        { then one must be demanded }
        if (left.location.loc<>LOC_MMXREGISTER) then
          begin
            { register variable? }
            if (left.location.loc=LOC_CMMXREGISTER) then
              begin
                hregister:=tcgx86(cg).getmmxregister(current_asmdata.CurrAsmList);
                emit_reg_reg(A_MOVQ,S_NO,left.location.register,hregister);
              end
            else
              begin
                if not(left.location.loc in [LOC_REFERENCE,LOC_CREFERENCE]) then
                  internalerror(2018022505);
                hregister:=tcgx86(cg).getmmxregister(current_asmdata.CurrAsmList);
                tcgx86(cg).make_simple_ref(current_asmdata.CurrAsmList,left.location.reference);
                emit_ref_reg(A_MOVQ,S_NO,left.location.reference,hregister);
              end;
            location_reset(left.location,LOC_MMXREGISTER,OS_NO);
            left.location.register:=hregister;
          end;

        { at this point, left.location.loc should be LOC_MMXREGISTER }
        case right.location.loc of
          LOC_MMXREGISTER,LOC_CMMXREGISTER:
            begin
              emit_reg_reg(op,S_NO,right.location.register,left.location.register);
              location.register:=left.location.register;
            end;
          LOC_CONSTANT:
            emit_const_reg(op,S_NO,right.location.value,left.location.register);
          LOC_REFERENCE,LOC_CREFERENCE:
            begin
              tcgx86(cg).make_simple_ref(current_asmdata.CurrAsmList,right.location.reference);
              emit_ref_reg(op,S_NO,right.location.reference,left.location.register);
            end;
          else
            internalerror(2018022506);
        end;
        location.register:=left.location.register;
        location_freetemp(current_asmdata.CurrAsmList,right.location);
      end;
{$endif SUPPORT_MMX}

end.