cgcpu.pas 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511
  1. {
  2. Copyright (c) 2002 by Florian Klaempfl
  3. This unit implements the code generator for the x86-64.
  4. This program is free software; you can redistribute it and/or modify
  5. it under the terms of the GNU General Public License as published by
  6. the Free Software Foundation; either version 2 of the License, or
  7. (at your option) any later version.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. GNU General Public License for more details.
  12. You should have received a copy of the GNU General Public License
  13. along with this program; if not, write to the Free Software
  14. Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  15. ****************************************************************************
  16. }
  17. unit cgcpu;
  18. {$i fpcdefs.inc}
  19. interface
  20. uses
  21. cgbase,cgutils,cgobj,cgx86,
  22. aasmbase,aasmtai,aasmdata,aasmcpu,
  23. cpubase,cpuinfo,cpupara,parabase,
  24. symdef,
  25. node,symconst,rgx86,procinfo;
  type
    { Code generator class for x86-64. It derives from the shared x86
      generator (tcgx86) and overrides the methods listed below. }
    tcgx86_64 = class(tcgx86)
      { register allocator setup }
      procedure init_register_allocators; override;

      { x87 value transfers }
      procedure a_loadfpu_ref_cgpara(list: TAsmList; size: tcgsize; const ref: treference; const cgpara: TCGPara); override;
      procedure a_loadfpu_reg_ref(list: TAsmList; fromsize, tosize: tcgsize; reg: tregister; const ref: treference); override;

      { procedure prologue / epilogue and unwinding }
      procedure g_proc_entry(list : TAsmList; localsize: longint; nostackframe: boolean); override;
      procedure g_proc_exit(list : TAsmList; parasize: longint; nostackframe: boolean); override;
      procedure g_local_unwind(list: TAsmList; l: TAsmLabel); override;

      { saving / restoring of callee-saved registers }
      procedure g_save_registers(list: TAsmList); override;
      procedure g_restore_registers(list: TAsmList); override;

      { raw moves between integer and MM registers }
      procedure a_loadmm_intreg_reg(list: TAsmList; fromsize, tosize : tcgsize; intreg, mmreg: tregister; shuffle: pmmshuffle); override;
      procedure a_loadmm_reg_intreg(list: TAsmList; fromsize, tosize : tcgsize; mmreg, intreg: tregister; shuffle : pmmshuffle); override;
    private
      { true when callee-saved registers are pushed in the prologue }
      function use_push: boolean;
      { number of stack bytes needed for the XMM registers to be saved }
      function saved_xmm_reg_size: longint;
    end;
  42. procedure create_codegen;
  43. implementation
  44. uses
  45. globtype,globals,verbose,systems,cutils,cclasses,
  46. symsym,symtable,defutil,paramgr,fmodule,cpupi,
  47. rgobj,tgobj,rgcpu,ncgutil;
  { Fills the callee-saved register tables (saved_standard_registers and
    saved_mm_registers) for the current target and creates the integer, MM
    and x87 register allocators.

    The tables are refilled only when saved_standard_registers does not have
    the length expected for the target. That expected length is taken from
    the constant tables themselves: the previous hard-coded value for Win64
    was 7 while win64_saved_std_regs holds 8 entries, so the check could
    never succeed there and the tables were rebuilt on every call. }
  procedure Tcgx86_64.init_register_allocators;
    const
      win64_saved_std_regs : array[0..7] of tsuperregister = (RS_RBX,RS_RDI,RS_RSI,RS_R12,RS_R13,RS_R14,RS_R15,RS_RBP);
      others_saved_std_regs : array[0..4] of tsuperregister = (RS_RBX,RS_R12,RS_R13,RS_R14,RS_R15);
      win64_saved_xmm_regs : array[0..9] of tsuperregister = (RS_XMM6,RS_XMM7,
        RS_XMM8,RS_XMM9,RS_XMM10,RS_XMM11,RS_XMM12,RS_XMM13,RS_XMM14,RS_XMM15);
    var
      i : longint;
      is_win64 : boolean;
      expected_std_regs : longint;
    begin
      inherited init_register_allocators;

      is_win64:=(target_info.system=system_x86_64_win64);
      { keep the expected length in sync with the tables above }
      if is_win64 then
        expected_std_regs:=Length(win64_saved_std_regs)
      else
        expected_std_regs:=Length(others_saved_std_regs);

      if length(saved_standard_registers)<>expected_std_regs then
        begin
          if is_win64 then
            begin
              SetLength(saved_standard_registers,Length(win64_saved_std_regs));
              SetLength(saved_mm_registers,Length(win64_saved_xmm_regs));
              for i:=low(win64_saved_std_regs) to high(win64_saved_std_regs) do
                saved_standard_registers[i]:=win64_saved_std_regs[i];
              for i:=low(win64_saved_xmm_regs) to high(win64_saved_xmm_regs) do
                saved_mm_registers[i]:=win64_saved_xmm_regs[i];
            end
          else
            begin
              SetLength(saved_standard_registers,Length(others_saved_std_regs));
              SetLength(saved_mm_registers,0);
              for i:=low(others_saved_std_regs) to high(others_saved_std_regs) do
                saved_standard_registers[i]:=others_saved_std_regs[i];
            end;
        end;

      if is_win64 then
        begin
          { RBP is only handed to the allocator when cs_userbp is enabled and
            the current procedure uses the stack pointer as frame pointer }
          if (cs_userbp in current_settings.optimizerswitches) and assigned(current_procinfo) and (current_procinfo.framepointer=NR_STACK_POINTER_REG) then
            begin
              rg[R_INTREGISTER]:=trgcpu.create(R_INTREGISTER,R_SUBWHOLE,[RS_RAX,RS_RDX,RS_RCX,RS_R8,RS_R9,RS_R10,
                RS_R11,RS_RBX,RS_RSI,RS_RDI,RS_R12,RS_R13,RS_R14,RS_R15,RS_RBP],first_int_imreg,[]);
            end
          else
            rg[R_INTREGISTER]:=trgcpu.create(R_INTREGISTER,R_SUBWHOLE,[RS_RAX,RS_RDX,RS_RCX,RS_R8,RS_R9,RS_R10,
              RS_R11,RS_RBX,RS_RSI,RS_RDI,RS_R12,RS_R13,RS_R14,RS_R15],first_int_imreg,[])
        end
      else
        rg[R_INTREGISTER]:=trgcpu.create(R_INTREGISTER,R_SUBWHOLE,[RS_RAX,RS_RDX,RS_RCX,RS_RSI,RS_RDI,RS_R8,
          RS_R9,RS_R10,RS_R11,RS_RBX,RS_R12,RS_R13,RS_R14,RS_R15],first_int_imreg,[]);

      rg[R_MMREGISTER]:=trgcpu.create(R_MMREGISTER,R_SUBWHOLE,[RS_XMM0,RS_XMM1,RS_XMM2,RS_XMM3,RS_XMM4,RS_XMM5,RS_XMM6,RS_XMM7,
        RS_XMM8,RS_XMM9,RS_XMM10,RS_XMM11,RS_XMM12,RS_XMM13,RS_XMM14,RS_XMM15],first_mm_imreg,[]);
      rgfpu:=Trgx86fpu.create;
    end;
  { Loads an FPU value from ref into the parameter location cgpara.

    A record containing an extended value is returned on the x87 stack; in
    that case size arrives as OS_F128 (if not packed) while
    cgpara.location^.size holds the proper size, so the latter is used for
    the 128 bit sizes before delegating to the inherited implementation.
    In the future cgpara.location^.size should probably always be used, but
    that should only be tested/done after 2.8 is branched. }
  procedure tcgx86_64.a_loadfpu_ref_cgpara(list: TAsmList; size: tcgsize; const ref: treference; const cgpara: TCGPara);
    begin
      if (size=OS_128) or (size=OS_F128) then
        size:=cgpara.location^.size;
      inherited a_loadfpu_ref_cgpara(list,size,ref,cgpara);
    end;
  { Stores the FPU register reg to the memory location ref.

    Counterpart of a_loadfpu_ref_cgpara on the callee side, when the value
    is moved from the fpu register into a memory location: a 128 bit target
    size is replaced by OS_F80 before the inherited implementation runs. }
  procedure tcgx86_64.a_loadfpu_reg_ref(list: TAsmList; fromsize, tosize: tcgsize; reg: tregister; const ref: treference);
    begin
      if (tosize=OS_128) or (tosize=OS_F128) then
        tosize:=OS_F80;
      inherited a_loadfpu_reg_ref(list,fromsize,tosize,reg,ref);
    end;
  { Returns true when the current procedure uses the stack pointer as its
    frame pointer, or when it is an exception filter. }
  function tcgx86_64.use_push: boolean;
    begin
      if current_procinfo.framepointer=NR_STACK_POINTER_REG then
        result:=true
      else
        result:=(current_procinfo.procdef.proctypeoption=potype_exceptfilter);
    end;
  { Returns the number of bytes needed to store every register listed in
    saved_mm_registers that is used in the current procedure, counting one
    OS_VECTOR sized slot per register. The result is 0 for targets other
    than Win64 and when no MM register is used at all. }
  function tcgx86_64.saved_xmm_reg_size: longint;
    var
      idx: longint;
      total: longint;
    begin
      total:=0;
      if (target_info.system=system_x86_64_win64) and
         uses_registers(R_MMREGISTER) then
        for idx:=low(saved_mm_registers) to high(saved_mm_registers) do
          if saved_mm_registers[idx] in rg[R_MMREGISTER].used_in_proc then
            inc(total,tcgsize2size[OS_VECTOR]);
      result:=total;
    end;
  { Emits the procedure prologue into list.

    list         - assembler list receiving the prologue
    localsize    - requested size of the local area; adjusted here for
                   saved XMM registers and stack alignment
    nostackframe - when true no frame setup code is emitted at all

    When pi_has_unwind_info is set in current_procinfo.flags at the end of
    the prologue, the SEH directives describing it are emitted as well. }
  procedure tcgx86_64.g_proc_entry(list : TAsmList;localsize:longint;nostackframe:boolean);
    var
      anchor: tlinkedlistitem;
      endprologue_given: boolean;
      misalign: longint;
      xmmbytes: longint;

    { Pushes reg; on Win64 the push is also recorded as an SEH directive and
      the procedure is flagged as having unwind info. }
    procedure emit_push(reg: tregister);
      begin
        list.concat(taicpu.op_reg(A_PUSH,tcgsize2opsize[OS_ADDR],reg));
        if target_info.system=system_x86_64_win64 then
          begin
            list.concat(cai_seh_directive.create_reg(ash_pushreg,reg));
            include(current_procinfo.flags,pi_has_unwind_info);
          end;
      end;

    { Pushes every register of saved_standard_registers that is used in the
      procedure and is not volatile for its calling convention, keeping
      track of the resulting stack misalignment. }
    procedure push_callee_saved;
      var
        idx: longint;
        to_save: tcpuregisterset;
      begin
        to_save:=rg[R_INTREGISTER].used_in_proc-paramanager.get_volatile_registers_int(current_procinfo.procdef.proccalloption);
        for idx:=low(saved_standard_registers) to high(saved_standard_registers) do
          if saved_standard_registers[idx] in to_save then
            begin
              inc(misalign,sizeof(pint));
              emit_push(newreg(R_INTREGISTER,saved_standard_registers[idx],R_SUBWHOLE));
            end;
      end;

    { Stores the used registers of saved_mm_registers into consecutive
      OS_VECTOR sized slots starting at current_procinfo.save_regs_ref. }
    procedure store_xmm_regs;
      var
        idx: longint;
        slot: treference;
      begin
        slot:=current_procinfo.save_regs_ref;
        for idx:=low(saved_mm_registers) to high(saved_mm_registers) do
          if saved_mm_registers[idx] in rg[R_MMREGISTER].used_in_proc then
            begin
              a_loadmm_reg_ref(list,OS_VECTOR,OS_VECTOR,newreg(R_MMREGISTER,saved_mm_registers[idx],R_SUBMMWHOLE),slot,nil);
              inc(slot.offset,tcgsize2size[OS_VECTOR]);
            end;
      end;

    { Generates the unwind data for x86_64-win64. }
    procedure emit_unwind_info;
      var
        idx: longint;
        slot: treference;
        sehlist: TAsmList;
        rsp_bias: longint;
      begin
        list.insertafter(cai_seh_directive.create_name(ash_proc,current_procinfo.procdef.mangledname),anchor);
        sehlist:=TAsmList.Create;
        { There is no need to describe the position of register saves
          precisely: registers are not modified before they are saved and
          the saves do not change RSP, so 'logically' all saves can happen
          at the end of the prologue. }
        slot:=current_procinfo.save_regs_ref;
        if use_push then
          rsp_bias:=0
        else
          begin
            { Positive offsets from RSP have to be recorded; if registers
              are saved at negative offsets from RBP this must be
              accounted for. }
            rsp_bias:=current_procinfo.final_localsize;
            for idx:=low(saved_standard_registers) to high(saved_standard_registers) do
              if saved_standard_registers[idx] in rg[R_INTREGISTER].used_in_proc then
                begin
                  sehlist.concat(cai_seh_directive.create_reg_offset(ash_savereg,
                    newreg(R_INTREGISTER,saved_standard_registers[idx],R_SUBWHOLE),
                    slot.offset+rsp_bias));
                  inc(slot.offset,sizeof(aint));
                end;
          end;
        if uses_registers(R_MMREGISTER) then
          begin
            { advance to the next multiple of the vector size }
            if (slot.offset mod tcgsize2size[OS_VECTOR])<>0 then
              inc(slot.offset,tcgsize2size[OS_VECTOR]-(slot.offset mod tcgsize2size[OS_VECTOR]));
            for idx:=low(saved_mm_registers) to high(saved_mm_registers) do
              if saved_mm_registers[idx] in rg[R_MMREGISTER].used_in_proc then
                begin
                  sehlist.concat(cai_seh_directive.create_reg_offset(ash_savexmm,
                    newreg(R_MMREGISTER,saved_mm_registers[idx],R_SUBMMWHOLE),
                    slot.offset+rsp_bias));
                  inc(slot.offset,tcgsize2size[OS_VECTOR]);
                end;
          end;
        if not endprologue_given then
          sehlist.concat(cai_seh_directive.create(ash_endprologue));
        if assigned(current_procinfo.endprologue_ai) then
          current_procinfo.aktproccode.insertlistafter(current_procinfo.endprologue_ai,sehlist)
        else
          list.concatlist(sehlist);
        sehlist.free;
      end;

    begin
      anchor:=list.last;
      { pi_has_unwind_info may already be set here if the assembler body
        contains SEH directives. Then .seh_endprologue is expected to be
        among those directives and is not generated by this routine. }
      endprologue_given:=(pi_has_unwind_info in current_procinfo.flags);

      if not nostackframe then
        begin
          { the return address is on the stack already }
          misalign:=sizeof(pint);
          list.concat(tai_regalloc.alloc(current_procinfo.framepointer,nil));
          if current_procinfo.framepointer<>NR_STACK_POINTER_REG then
            begin
              { save the old frame pointer: push <frame_pointer> }
              inc(misalign,sizeof(pint));
              emit_push(NR_FRAME_POINTER_REG);
              { return address and frame pointer are both on the stack now }
              current_asmdata.asmcfi.cfa_def_cfa_offset(list,2*sizeof(pint));
              current_asmdata.asmcfi.cfa_offset(list,NR_FRAME_POINTER_REG,-(2*sizeof(pint)));
              if current_procinfo.procdef.proctypeoption=potype_exceptfilter then
                begin
                  push_callee_saved;
                  gen_load_frame_for_exceptfilter(list);
                  { Only as much stack space as needed for the calls is
                    required. Exception filters have no local variables of
                    their own and temps are 'mapped' to the parent
                    procedure. maxpushedparasize is already aligned, at
                    least on x86_64. }
                  localsize:=current_procinfo.maxpushedparasize;
                end
              else
                list.concat(Taicpu.op_reg_reg(A_MOV,tcgsize2opsize[OS_ADDR],NR_STACK_POINTER_REG,NR_FRAME_POINTER_REG));
              current_asmdata.asmcfi.cfa_def_cfa_register(list,NR_FRAME_POINTER_REG);
              {
                TODO: the current frame pointer handling is not compatible
                with Win64 at all: Win64 expects FP to point to the top or
                into the middle of the local area. In FPC it points to the
                bottom, which makes it impossible to generate an
                UWOP_SET_FPREG unwind code if the local area is > 240 bytes.
                So for now pretend there is never a frame pointer.
              }
            end
          else
            begin
              push_callee_saved;
              CGmessage(cg_d_stackframe_omited);
            end;

          { reserve room for the XMM registers above the locals }
          xmmbytes:=saved_xmm_reg_size;
          if use_push and (xmmbytes<>0) then
            begin
              localsize:=align(localsize,target_info.stackalign)+xmmbytes;
              reference_reset_base(current_procinfo.save_regs_ref,NR_STACK_POINTER_REG,
                localsize-xmmbytes,tcgsize2size[OS_VECTOR]);
            end;

          { allocate the stack frame space }
          if (localsize<>0) or
             ((target_info.stackalign>sizeof(pint)) and
              (misalign<>0) and
              ((pi_do_call in current_procinfo.flags) or
               (po_assembler in current_procinfo.procdef.procoptions))) then
            begin
              if target_info.stackalign>sizeof(pint) then
                localsize:=align(localsize+misalign,target_info.stackalign)-misalign;
              g_stackpointer_alloc(list,localsize);
              if current_procinfo.framepointer=NR_STACK_POINTER_REG then
                current_asmdata.asmcfi.cfa_def_cfa_offset(list,localsize+sizeof(pint));
              current_procinfo.final_localsize:=localsize;
              if target_info.system=system_x86_64_win64 then
                begin
                  if localsize<>0 then
                    list.concat(cai_seh_directive.create_offset(ash_stackalloc,localsize));
                  include(current_procinfo.flags,pi_has_unwind_info);
                  if use_push and (xmmbytes<>0) then
                    store_xmm_regs;
                end;
            end;
        end;

      if pi_has_unwind_info in current_procinfo.flags then
        emit_unwind_info;
    end;
  { Emits the procedure epilogue into list: removes the stack frame (unless
    nostackframe is set), emits the return instruction and, when the
    procedure has unwind info, closes the SEH description. }
  procedure tcgx86_64.g_proc_exit(list : TAsmList;parasize:longint;nostackframe:boolean);

    { Adds amount to the stack pointer. }
    procedure release_stack(amount : tcgint);
      var
        spref : treference;
      begin
        reference_reset_base(spref,NR_STACK_POINTER_REG,amount,0);
        { normally, lea is a better choice than an add }
        list.concat(Taicpu.op_ref_reg(A_LEA,TCGSize2OpSize[OS_ADDR],spref,NR_STACK_POINTER_REG));
      end;

    { Reloads the used registers of saved_mm_registers from consecutive
      OS_VECTOR sized slots starting at current_procinfo.save_regs_ref. }
    procedure reload_xmm_regs;
      var
        idx : longint;
        slot : treference;
        xmmreg : tregister;
      begin
        slot:=current_procinfo.save_regs_ref;
        for idx:=low(saved_mm_registers) to high(saved_mm_registers) do
          if saved_mm_registers[idx] in rg[R_MMREGISTER].used_in_proc then
            begin
              { allocate the register so the optimizer does not remove the load }
              xmmreg:=newreg(R_MMREGISTER,saved_mm_registers[idx],R_SUBMMWHOLE);
              a_reg_alloc(list,xmmreg);
              a_loadmm_ref_reg(list,OS_VECTOR,OS_VECTOR,slot,xmmreg,nil);
              inc(slot.offset,tcgsize2size[OS_VECTOR]);
            end;
      end;

    var
      fpref : treference;
    begin
      { Keep the return address of a possible call from ending up in the
        epilogue. When registers are restored, that happens before the
        epilogue and already provides the necessary padding. }
      if (pi_has_unwind_info in current_procinfo.flags) and
         (pi_do_call in current_procinfo.flags) and
         not (pi_has_saved_regs in current_procinfo.flags) then
        list.concat(Taicpu.op_none(A_NOP));

      { remove the stack frame }
      if not nostackframe then
        begin
          if use_push then
            begin
              if saved_xmm_reg_size<>0 then
                reload_xmm_regs;
              if current_procinfo.final_localsize<>0 then
                release_stack(current_procinfo.final_localsize);
              internal_restore_regs(list,true);
              if current_procinfo.procdef.proctypeoption=potype_exceptfilter then
                list.concat(Taicpu.op_reg(A_POP,tcgsize2opsize[OS_ADDR],NR_FRAME_POINTER_REG));
            end
          else if target_info.system<>system_x86_64_win64 then
            generate_leave(list)
          else
            begin
              { Comply with the Win64 unwinding mechanism, which only
                recognizes 'add $constant,%rsp' and 'lea offset(FPREG),%rsp'
                as belonging to the function epilog.
                Neither 'leave' nor even 'mov %FPREG,%rsp' are allowed. }
              reference_reset_base(fpref,current_procinfo.framepointer,0,sizeof(pint));
              list.concat(Taicpu.op_ref_reg(A_LEA,tcgsize2opsize[OS_ADDR],fpref,NR_STACK_POINTER_REG));
              list.concat(Taicpu.op_reg(A_POP,tcgsize2opsize[OS_ADDR],current_procinfo.framepointer));
            end;
          list.concat(tai_regalloc.dealloc(current_procinfo.framepointer,nil));
        end;

      list.concat(Taicpu.Op_none(A_RET,S_NO));

      if pi_has_unwind_info in current_procinfo.flags then
        begin
          tx86_64procinfo(current_procinfo).dump_scopes(list);
          list.concat(cai_seh_directive.create(ash_endproc));
        end;
    end;
  { Saves the callee-saved registers through the inherited implementation,
    except when use_push is true: then nothing is emitted here. }
  procedure tcgx86_64.g_save_registers(list: TAsmList);
    begin
      if use_push then
        exit;
      inherited g_save_registers(list);
    end;
  { Restores the callee-saved registers through the inherited
    implementation, except when use_push is true: then nothing is emitted
    here. }
  procedure tcgx86_64.g_restore_registers(list: TAsmList);
    begin
      if use_push then
        exit;
      inherited g_restore_registers(list);
    end;
  { Emits a local unwind to label l.

    On Win64 this is a call to the system procedure '_FPC_local_unwind'
    with the stack pointer as first and the address of l as second
    parameter; every other target uses the inherited implementation. }
  procedure tcgx86_64.g_local_unwind(list: TAsmList; l: TAsmLabel);
    var
      unwindproc: tprocdef;
      framepara,targetpara: tcgpara;
      targetref: treference;
    begin
      if target_info.system=system_x86_64_win64 then
        begin
          unwindproc:=search_system_proc('_fpc_local_unwind');
          framepara.init;
          targetpara.init;
          paramanager.getintparaloc(list,unwindproc,1,framepara);
          paramanager.getintparaloc(list,unwindproc,2,targetpara);
          reference_reset_symbol(targetref,l,0,1);
          { TODO: using RSP is correct only while the stack is fixed!!
            (true now, but will change if/when allocating from stack is
            implemented) }
          a_load_reg_cgpara(list,OS_ADDR,NR_STACK_POINTER_REG,framepara);
          a_loadaddr_ref_cgpara(list,targetref,targetpara);
          paramanager.freecgpara(list,targetpara);
          paramanager.freecgpara(list,framepara);
          g_call(list,'_FPC_local_unwind');
          targetpara.done;
          framepara.done;
        end
      else
        inherited g_local_unwind(list,l);
    end;
  { Moves the raw contents of the integer register intreg into the MM
    register mmreg, using MOVD for 32 bit and MOVQ for 64 bit sources.

    Only raw data transfers are supported, no conversions: both sizes must
    be equally large, tosize must be OS_F32, OS_F64 or OS_M64, and a given
    shuffle must be scalar. Anything else raises an internal error. }
  procedure tcgx86_64.a_loadmm_intreg_reg(list: TAsmList; fromsize, tosize : tcgsize; intreg, mmreg: tregister; shuffle: pmmshuffle);
    var
      movop: tasmop;
    begin
      if tcgsize2size[fromsize]<>tcgsize2size[tosize] then
        internalerror(2009112505);
      if not(tosize in [OS_F32,OS_F64,OS_M64]) then
        internalerror(2009112505);

      case fromsize of
        OS_32,OS_S32:
          movop:=A_MOVD;
        OS_64,OS_S64:
          movop:=A_MOVQ;
        else
          internalerror(2009112506);
      end;

      if assigned(shuffle) then
        if not shufflescalar(shuffle) then
          internalerror(2009112517);

      list.concat(taicpu.op_reg_reg(movop,S_NO,intreg,mmreg));
    end;
  { Moves the raw contents of the MM register mmreg into the integer
    register intreg, using MOVD for 32 bit and MOVQ for 64 bit targets.

    Only raw data transfers are supported, no conversions: both sizes must
    be equally large, fromsize must be OS_F32, OS_F64 or OS_M64, and a
    given shuffle must be scalar. Anything else raises an internal error. }
  procedure tcgx86_64.a_loadmm_reg_intreg(list: TAsmList; fromsize, tosize : tcgsize; mmreg, intreg: tregister;shuffle : pmmshuffle);
    var
      movop: tasmop;
    begin
      if tcgsize2size[fromsize]<>tcgsize2size[tosize] then
        internalerror(2009112507);
      if not(fromsize in [OS_F32,OS_F64,OS_M64]) then
        internalerror(2009112507);

      case tosize of
        OS_32,OS_S32:
          movop:=A_MOVD;
        OS_64,OS_S64:
          movop:=A_MOVQ;
        else
          internalerror(2009112408);
      end;

      if assigned(shuffle) then
        if not shufflescalar(shuffle) then
          internalerror(2009112515);

      list.concat(taicpu.op_reg_reg(movop,S_NO,mmreg,intreg));
    end;
  { Instantiates the code generator objects for this target: cg becomes a
    tcgx86_64 instance and cg128 a tcg128 instance. }
  procedure create_codegen;
    begin
      cg := tcgx86_64.create;
      cg128 := tcg128.create;
    end;
  448. end.