
{
    Copyright (c) 2014 by Jonas Maebe

    This unit implements the code generator for AArch64

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

 ****************************************************************************
}
unit cgcpu;

{$i fpcdefs.inc}

  interface

    uses
       globtype,parabase,
       cgbase,cgutils,cgobj,
       aasmbase,aasmtai,aasmdata,aasmcpu,
       cpubase,cpuinfo,
       node,symconst,symtype,symdef,
       rgcpu;

    type
      tcgaarch64=class(tcg)
       protected
        { changes register size without adding register allocation info }
        function makeregsize(reg: tregister; size: tcgsize): tregister; overload;
       public
        { simplifies "ref" so it can be used with "op". If "ref" can be used
          with a different load/store operation that has the same meaning as
          the original one, "op" will be replaced with the alternative }
        procedure make_simple_ref(list:TAsmList; var op: tasmop; size: tcgsize; oppostfix: toppostfix; var ref: treference; preferred_newbasereg: tregister);
        function getfpuregister(list: TAsmList; size: Tcgsize): Tregister; override;
        procedure handle_reg_imm12_reg(list: TAsmList; op: Tasmop; size: tcgsize; src: tregister; a: tcgint; dst: tregister; tmpreg: tregister; setflags, usedest: boolean);
        procedure init_register_allocators;override;
        procedure done_register_allocators;override;
        function getmmregister(list:TAsmList;size:tcgsize):tregister;override;
        function handle_load_store(list:TAsmList; op: tasmop; size: tcgsize; oppostfix: toppostfix; reg: tregister; ref: treference):treference;
        procedure a_call_name(list:TAsmList;const s:string; weak: boolean);override;
        procedure a_call_reg(list:TAsmList;Reg:tregister);override;
        { General purpose instructions }
        procedure maybeadjustresult(list: TAsmList; op: topcg; size: tcgsize; dst: tregister);
        procedure a_op_const_reg(list: TAsmList; op: topcg; size: tcgsize; a: tcgint; reg: tregister);override;
        procedure a_op_reg_reg(list: TAsmList; op: topcg; size: tcgsize; src, dst: tregister);override;
        procedure a_op_const_reg_reg(list: TAsmList; op: topcg; size: tcgsize; a: tcgint; src, dst: tregister);override;
        procedure a_op_reg_reg_reg(list: TAsmList; op: topcg; size: tcgsize; src1, src2, dst: tregister);override;
        procedure a_op_const_reg_reg_checkoverflow(list: TAsmList; op: topcg; size: tcgsize; a: tcgint; src, dst: tregister; setflags : boolean; var ovloc : tlocation);override;
        procedure a_op_reg_reg_reg_checkoverflow(list: TAsmList; op: topcg; size: tcgsize; src1, src2, dst: tregister; setflags : boolean; var ovloc : tlocation);override;
        { move instructions }
        procedure a_load_const_reg(list: TAsmList; size: tcgsize; a: tcgint; reg: tregister);override;
        procedure a_load_const_ref(list: TAsmList; size: tcgsize; a: tcgint; const ref: treference); override;
        procedure a_load_reg_ref(list: TAsmList; fromsize, tosize: tcgsize; reg: tregister;const ref: TReference);override;
        procedure a_load_reg_ref_unaligned(list: TAsmList; fromsize, tosize: tcgsize; register: tregister; const ref: treference); override;
        procedure a_load_ref_reg(list: TAsmList; fromsize, tosize: tcgsize; const ref: TReference; reg: tregister);override;
        procedure a_load_ref_reg_unaligned(list: TAsmList; fromsize, tosize: tcgsize; const ref: treference; register: tregister); override;
        procedure a_load_reg_reg(list: TAsmList; fromsize, tosize: tcgsize; reg1, reg2: tregister);override;
        procedure a_loadaddr_ref_reg(list: TAsmList; const ref: TReference; r: tregister);override;
        { fpu move instructions (not used, all floating point is vector unit-based) }
        procedure a_loadfpu_reg_reg(list: TAsmList; fromsize, tosize: tcgsize; reg1, reg2: tregister); override;
        procedure a_loadfpu_ref_reg(list: TAsmList; fromsize, tosize: tcgsize; const ref: treference; reg: tregister); override;
        procedure a_loadfpu_reg_ref(list: TAsmList; fromsize, tosize: tcgsize; reg: tregister; const ref: treference); override;
        procedure a_loadmm_reg_reg(list: TAsmList; fromsize, tosize: tcgsize; reg1, reg2: tregister;shuffle : pmmshuffle);override;
        procedure a_loadmm_ref_reg(list: TAsmList; fromsize, tosize: tcgsize; const ref: TReference; reg: tregister; shuffle: pmmshuffle);override;
        procedure a_loadmm_reg_ref(list: TAsmList; fromsize, tosize: tcgsize; reg: tregister; const ref: TReference; shuffle: pmmshuffle);override;
        procedure a_loadmm_intreg_reg(list: TAsmList; fromsize, tosize: tcgsize; intreg, mmreg: tregister; shuffle: pmmshuffle); override;
        procedure a_loadmm_reg_intreg(list: TAsmList; fromsize, tosize: tcgsize; mmreg, intreg: tregister; shuffle: pmmshuffle); override;
        procedure a_opmm_reg_reg(list: TAsmList; Op: TOpCG; size: tcgsize; src, dst: tregister; shuffle: pmmshuffle); override;
        procedure a_bit_scan_reg_reg(list: TAsmList; reverse: boolean; srcsize, dstsize: tcgsize; src, dst: TRegister); override;
        { comparison operations }
        procedure a_cmp_const_reg_label(list: TAsmList; size: tcgsize; cmp_op: topcmp; a: tcgint; reg: tregister; l: tasmlabel);override;
        procedure a_cmp_reg_reg_label(list: TAsmList; size: tcgsize; cmp_op: topcmp; reg1, reg2: tregister; l: tasmlabel);override;
        procedure a_jmp_always(list: TAsmList; l: TAsmLabel);override;
        procedure a_jmp_name(list: TAsmList; const s: string);override;
        procedure a_jmp_cond(list: TAsmList; cond: TOpCmp; l: tasmlabel);{ override;}
        procedure a_jmp_flags(list: TAsmList; const f: tresflags; l: tasmlabel);override;
        procedure g_flags2reg(list: TAsmList; size: tcgsize; const f:tresflags; reg: tregister);override;
        procedure g_overflowcheck(list: TAsmList; const loc: tlocation; def: tdef);override;
        procedure g_overflowcheck_loc(list: TAsmList; const loc: tlocation; def: tdef; ovloc: tlocation);override;
        procedure g_proc_entry(list: TAsmList; localsize: longint; nostackframe: boolean);override;
        procedure g_proc_exit(list: TAsmList; parasize: longint; nostackframe: boolean);override;
        procedure g_maybe_got_init(list: TAsmList); override;
        procedure g_restore_registers(list: TAsmList);override;
        procedure g_save_registers(list: TAsmList);override;
        procedure g_concatcopy_move(list: TAsmList; const source, dest: treference; len: tcgint);
        procedure g_concatcopy(list: TAsmList; const source, dest: treference; len: tcgint);override;
        procedure g_adjust_self_value(list: TAsmList; procdef: tprocdef; ioffset: tcgint);override;
        procedure g_check_for_fpu_exception(list: TAsmList; force, clear: boolean);override;
        procedure g_profilecode(list: TAsmList);override;
       private
        function save_regs(list: TAsmList; rt: tregistertype; lowsr, highsr: tsuperregister; sub: tsubregister): longint;
        procedure load_regs(list: TAsmList; rt: tregistertype; lowsr, highsr: tsuperregister; sub: tsubregister);
      end;

    procedure create_codegen;

    const
      TOpCG2AsmOpReg: array[topcg] of TAsmOp = (
        A_NONE,A_MOV,A_ADD,A_AND,A_UDIV,A_SDIV,A_MUL,A_MUL,A_NEG,A_MVN,A_ORR,A_ASRV,A_LSLV,A_LSRV,A_SUB,A_EOR,A_NONE,A_RORV
      );

      TOpCG2AsmOpImm: array[topcg] of TAsmOp = (
        A_NONE,A_MOV,A_ADD,A_AND,A_UDIV,A_SDIV,A_MUL,A_MUL,A_NEG,A_MVN,A_ORR,A_ASR,A_LSL,A_LSR,A_SUB,A_EOR,A_NONE,A_ROR
      );

      TOpCmp2AsmCond: array[topcmp] of TAsmCond = (C_NONE,C_EQ,C_GT,
        C_LT,C_GE,C_LE,C_NE,C_LS,C_CC,C_CS,C_HI
      );
  implementation

    uses
       globals,verbose,systems,cutils,cclasses,
       paramgr,fmodule,
       symtable,symsym,
       tgobj,
       ncgutil,
       procinfo,cpupi;
    procedure tcgaarch64.make_simple_ref(list:TAsmList; var op: tasmop; size: tcgsize; oppostfix: toppostfix; var ref: treference; preferred_newbasereg: tregister);
      var
        href: treference;
        so: tshifterop;
        accesssize: longint;
      begin
        if (ref.base=NR_NO) then
          begin
            if ref.shiftmode<>SM_None then
              internalerror(2014110701);
            ref.base:=ref.index;
            ref.index:=NR_NO;
          end;
        { no arbitrary scale factor support (the generic code doesn't set it,
          AArch64-specific code shouldn't either) }
        if not(ref.scalefactor in [0,1]) then
          internalerror(2014111002);
        case simple_ref_type(op,size,oppostfix,ref) of
          sr_simple:
            exit;
          sr_internal_illegal:
            internalerror(2014121702);
          sr_complex:
            { continue } ;
        end;
        if assigned(ref.symbol) then
          begin
            { internal "load symbol" instructions should already be valid }
            if assigned(ref.symboldata) or
               (ref.refaddr in [addr_pic,addr_gotpage,addr_gotpageoffset,addr_page,addr_pageoffset]) then
              internalerror(2014110802);
            { no relative symbol support (needed) yet }
            if assigned(ref.relsymbol) then
              internalerror(2014111001);
            { loading a symbol address (whether it's in the GOT or not) consists
              of two parts: first load the page on which it is located, then
              either add the offset within the page or load the value at that
              offset in the page. This final GOT-load can be relaxed by the
              linker in case the variable itself can be stored directly in
              the GOT }
            if (preferred_newbasereg=NR_NO) or
               (ref.base=preferred_newbasereg) or
               (ref.index=preferred_newbasereg) then
              preferred_newbasereg:=getaddressregister(list);
            { load the (GOT) page }
            reference_reset_symbol(href,ref.symbol,0,8,[]);
            if ((ref.symbol.typ in [AT_FUNCTION,AT_LABEL]) and
                (ref.symbol.bind in [AB_LOCAL,AB_GLOBAL])) or
               ((ref.symbol.typ=AT_DATA) and
                (ref.symbol.bind=AB_LOCAL)) or
               (target_info.system=system_aarch64_win64) then
              href.refaddr:=addr_page
            else
              href.refaddr:=addr_gotpage;
            list.concat(taicpu.op_reg_ref(A_ADRP,preferred_newbasereg,href));
            { load the GOT entry (= address of the variable) }
            reference_reset_base(href,preferred_newbasereg,0,ctempposinvalid,sizeof(pint),[]);
            href.symbol:=ref.symbol;
            { code symbols defined in the current compilation unit do not
              have to be accessed via the GOT }
            if ((ref.symbol.typ in [AT_FUNCTION,AT_LABEL]) and
                (ref.symbol.bind in [AB_LOCAL,AB_GLOBAL])) or
               ((ref.symbol.typ=AT_DATA) and
                (ref.symbol.bind=AB_LOCAL)) or
               (target_info.system=system_aarch64_win64) then
              begin
                href.base:=NR_NO;
                href.refaddr:=addr_pageoffset;
                list.concat(taicpu.op_reg_reg_ref(A_ADD,preferred_newbasereg,preferred_newbasereg,href));
              end
            else
              begin
                href.refaddr:=addr_gotpageoffset;
                { use a_load_ref_reg() rather than directly encoding the LDR,
                  so that we'll check the validity of the reference }
                a_load_ref_reg(list,OS_ADDR,OS_ADDR,href,preferred_newbasereg);
              end;
            { set as new base register }
            if ref.base=NR_NO then
              ref.base:=preferred_newbasereg
            else if ref.index=NR_NO then
              ref.index:=preferred_newbasereg
            else
              begin
                { make sure it's valid in case ref.base is SP -> make it
                  the second operand }
                a_op_reg_reg_reg(list,OP_ADD,OS_ADDR,preferred_newbasereg,ref.base,preferred_newbasereg);
                ref.base:=preferred_newbasereg
              end;
            ref.symbol:=nil;
          end;
        { base & index }
        if (ref.base<>NR_NO) and
           (ref.index<>NR_NO) then
          begin
            case op of
              A_LDR, A_STR:
                begin
                  if (ref.shiftmode=SM_None) and
                     (ref.shiftimm<>0) then
                    internalerror(2014110805);
                  { wrong shift? (possible in case of something like
                    array_of_2byte_rec[x].bytefield -> shift will be set to 1,
                    but the final load is 1 byte -> can't use the shift after
                    all) }
                  if (ref.shiftmode in [SM_LSL,SM_UXTW,SM_SXTW]) and
                     ((ref.shiftimm<>BsfDWord(tcgsizep2size[size])) or
                      (ref.offset<>0)) then
                    begin
                      if preferred_newbasereg=NR_NO then
                        preferred_newbasereg:=getaddressregister(list);
                      { "add" supports a superset of the shift modes supported
                        by load/store instructions }
                      shifterop_reset(so);
                      so.shiftmode:=ref.shiftmode;
                      so.shiftimm:=ref.shiftimm;
                      list.concat(taicpu.op_reg_reg_reg_shifterop(A_ADD,preferred_newbasereg,ref.base,ref.index,so));
                      reference_reset_base(ref,preferred_newbasereg,ref.offset,ref.temppos,ref.alignment,ref.volatility);
                      { possibly still an invalid offset -> fall through }
                    end
                  else if ref.offset<>0 then
                    begin
                      if (preferred_newbasereg=NR_NO) or
                         { we keep ref.index, so it must not be overwritten }
                         (ref.index=preferred_newbasereg) then
                        preferred_newbasereg:=getaddressregister(list);
                      { add to the base and not to the index, because the index
                        may be scaled; this works even if the base is SP }
                      a_op_const_reg_reg(list,OP_ADD,OS_ADDR,ref.offset,ref.base,preferred_newbasereg);
                      ref.offset:=0;
                      ref.base:=preferred_newbasereg;
                      { finished }
                      exit;
                    end
                  else
                    { valid -> exit }
                    exit;
                end;
              { todo }
              A_LD1,A_LD2,A_LD3,A_LD4,
              A_ST1,A_ST2,A_ST3,A_ST4:
                internalerror(2014110702);
              { these don't support base+index }
              A_LDUR,A_STUR,
              A_LDP,A_STP:
                begin
                  { these either don't support pre-/post-indexing, or don't
                    support it with base+index }
                  if ref.addressmode<>AM_OFFSET then
                    internalerror(2014110911);
                  if preferred_newbasereg=NR_NO then
                    preferred_newbasereg:=getaddressregister(list);
                  if ref.shiftmode<>SM_None then
                    begin
                      { "add" supports a superset of the shift modes supported
                        by load/store instructions }
                      shifterop_reset(so);
                      so.shiftmode:=ref.shiftmode;
                      so.shiftimm:=ref.shiftimm;
                      list.concat(taicpu.op_reg_reg_reg_shifterop(A_ADD,preferred_newbasereg,ref.base,ref.index,so));
                    end
                  else
                    a_op_reg_reg_reg(list,OP_ADD,OS_ADDR,ref.index,ref.base,preferred_newbasereg);
                  reference_reset_base(ref,preferred_newbasereg,ref.offset,ref.temppos,ref.alignment,ref.volatility);
                  { fall through to the handling of base + offset, since the
                    offset may still be too big }
                end;
              else
                internalerror(2014110903);
            end;
          end;
        { base + offset }
        if ref.base<>NR_NO then
          begin
            { valid offset for LDUR/STUR -> use that }
            if (ref.addressmode=AM_OFFSET) and
               (op in [A_LDR,A_STR]) and
               (ref.offset>=-256) and
               (ref.offset<=255) then
              begin
                if op=A_LDR then
                  op:=A_LDUR
                else
                  op:=A_STUR
              end
            { if it's not a valid LDUR/STUR, use LDR/STR }
            else if (op in [A_LDUR,A_STUR]) and
               ((ref.offset<-256) or
                (ref.offset>255) or
                (ref.addressmode<>AM_OFFSET)) then
              begin
                if op=A_LDUR then
                  op:=A_LDR
                else
                  op:=A_STR
              end;
            case op of
              A_LDR,A_STR:
                begin
                  case ref.addressmode of
                    AM_PREINDEXED:
                      begin
                        { since the loaded/stored register cannot be the same
                          as the base register, we can safely add the
                          offset to the base if it doesn't fit }
                        if (ref.offset<-256) or
                           (ref.offset>255) then
                          begin
                            a_op_const_reg(list,OP_ADD,OS_ADDR,ref.offset,ref.base);
                            ref.offset:=0;
                          end;
                      end;
                    AM_POSTINDEXED:
                      begin
                        { cannot emulate post-indexing if we have to fold the
                          offset into the base register }
                        if (ref.offset<-256) or
                           (ref.offset>255) then
                          internalerror(2014110909);
                        { ok }
                      end;
                    AM_OFFSET:
                      begin
                        { unsupported offset -> fold into base register }
                        accesssize:=1 shl tcgsizep2size[size];
                        if (ref.offset<0) or
                           (ref.offset>(((1 shl 12)-1)*accesssize)) or
                           ((ref.offset mod accesssize)<>0) then
                          begin
                            if preferred_newbasereg=NR_NO then
                              preferred_newbasereg:=getaddressregister(list);
                            { can we split the offset between an
                              "add/sub (imm12 shl 12)" and the load (also an
                              imm12)?
                              -- the offset from the load will always be added,
                              that's why the lower bound has a smaller range
                              than the upper bound; it must also be a multiple
                              of the access size }
                            if (ref.offset>=-(((1 shl 12)-1) shl 12)) and
                               (ref.offset<=((1 shl 12)-1) shl 12 + ((1 shl 12)-1)) and
                               ((ref.offset mod accesssize)=0) then
                              begin
                                a_op_const_reg_reg(list,OP_ADD,OS_ADDR,(ref.offset shr 12) shl 12,ref.base,preferred_newbasereg);
                                ref.offset:=ref.offset-(ref.offset shr 12) shl 12;
                              end
                            else
                              begin
                                a_op_const_reg_reg(list,OP_ADD,OS_ADDR,ref.offset,ref.base,preferred_newbasereg);
                                ref.offset:=0;
                              end;
                            reference_reset_base(ref,preferred_newbasereg,ref.offset,ref.temppos,ref.alignment,ref.volatility);
                          end;
                      end
                  end;
                end;
              A_LDP,A_STP:
                begin
                  { unsupported offset -> fold into base register (these
                    instructions support all addressmodes) }
                  if (ref.offset<-(1 shl (6+tcgsizep2size[size]))) or
                     (ref.offset>(1 shl (6+tcgsizep2size[size]))-1) then
                    begin
                      case ref.addressmode of
                        AM_POSTINDEXED:
                          { don't emulate post-indexing if we have to fold the
                            offset into the base register }
                          internalerror(2014110910);
                        AM_PREINDEXED:
                          { this means the offset must be added to the current
                            base register }
                          preferred_newbasereg:=ref.base;
                        AM_OFFSET:
                          if preferred_newbasereg=NR_NO then
                            preferred_newbasereg:=getaddressregister(list);
                      end;
                      a_op_const_reg_reg(list,OP_ADD,OS_ADDR,ref.offset,ref.base,preferred_newbasereg);
                      reference_reset_base(ref,preferred_newbasereg,0,ref.temppos,ref.alignment,ref.volatility);
                    end
                end;
              A_LDUR,A_STUR:
                begin
                  { valid, checked above }
                end;
              { todo }
              A_LD1,A_LD2,A_LD3,A_LD4,
              A_ST1,A_ST2,A_ST3,A_ST4:
                internalerror(2014110908);
              else
                internalerror(2014110708);
            end;
            { done }
            exit;
          end;
        { only an offset -> change to base (+ offset 0) }
        if preferred_newbasereg=NR_NO then
          preferred_newbasereg:=getaddressregister(list);
        a_load_const_reg(list,OS_ADDR,ref.offset,preferred_newbasereg);
        reference_reset_base(ref,preferred_newbasereg,0,ref.temppos,newalignment(8,ref.offset),ref.volatility);
      end;
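
    { Illustrative sketch, not from the original source: for a GOT-based
      access to a global variable on e.g. a Linux target, make_simple_ref
      lowers a symbolic reference to roughly

          adrp  xN, :got:sym             ; load the page of the GOT entry
          ldr   xN, [xN, :got_lo12:sym]  ; load the variable's address

      with xN a freshly allocated address register that then replaces the
      symbol as the base of "ref"; local data and code symbols instead get
      an adrp/add pair via addr_page/addr_pageoffset. }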
    function tcgaarch64.makeregsize(reg: tregister; size: tcgsize): tregister;
      var
        subreg:Tsubregister;
      begin
        subreg:=cgsize2subreg(getregtype(reg),size);
        result:=reg;
        setsubreg(result,subreg);
      end;

    function tcgaarch64.getfpuregister(list: TAsmList; size: Tcgsize): Tregister;
      begin
        internalerror(2014122110);
        { squash warning }
        result:=NR_NO;
      end;

    function tcgaarch64.handle_load_store(list: TAsmList; op: tasmop; size: tcgsize; oppostfix: toppostfix; reg: tregister; ref: treference):treference;
      begin
        make_simple_ref(list,op,size,oppostfix,ref,NR_NO);
        list.concat(setoppostfix(taicpu.op_reg_ref(op,reg,ref),oppostfix));
        result:=ref;
      end;

    procedure tcgaarch64.handle_reg_imm12_reg(list: TAsmList; op: Tasmop; size: tcgsize; src: tregister; a: tcgint; dst: tregister; tmpreg: tregister; setflags, usedest: boolean);
      var
        instr: taicpu;
        so: tshifterop;
        hadtmpreg: boolean;
      begin
        { imm12 }
        if (a>=0) and
           (a<=((1 shl 12)-1)) then
          if usedest then
            instr:=taicpu.op_reg_reg_const(op,dst,src,a)
          else
            instr:=taicpu.op_reg_const(op,src,a)
        { imm12 lsl 12 }
        else if (a and not(((tcgint(1) shl 12)-1) shl 12))=0 then
          begin
            so.shiftmode:=SM_LSL;
            so.shiftimm:=12;
            if usedest then
              instr:=taicpu.op_reg_reg_const_shifterop(op,dst,src,a shr 12,so)
            else
              instr:=taicpu.op_reg_const_shifterop(op,src,a shr 12,so)
          end
        else
          begin
            { todo: other possible optimizations (e.g. load 16 bit constant in
              register and then add/sub/cmp/cmn shifted the rest) }
            if tmpreg=NR_NO then
              begin
                hadtmpreg:=false;
                tmpreg:=getintregister(list,size);
              end
            else
              begin
                hadtmpreg:=true;
                getcpuregister(list,tmpreg);
              end;
            a_load_const_reg(list,size,a,tmpreg);
            if usedest then
              instr:=taicpu.op_reg_reg_reg(op,dst,src,tmpreg)
            else
              instr:=taicpu.op_reg_reg(op,src,tmpreg);
            if hadtmpreg then
              ungetcpuregister(list,tmpreg);
          end;
        if setflags then
          setoppostfix(instr,PF_S);
        list.concat(instr);
      end;
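
    { Illustrative sketch, not from the original source: for an add with the
      constant 4096, the imm12-shifted branch above emits

          add   x0, x1, #1, lsl #12

      whereas a constant such as $12345 fits neither imm12 form and is first
      materialised in a temporary register with a_load_const_reg, followed by
      a plain register-register add. }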
{****************************************************************************
                              Assembler code
 ****************************************************************************}

    procedure tcgaarch64.init_register_allocators;
      begin
        inherited init_register_allocators;

        rg[R_INTREGISTER]:=trgintcpu.create(R_INTREGISTER,R_SUBWHOLE,
            [RS_X0,RS_X1,RS_X2,RS_X3,RS_X4,RS_X5,RS_X6,RS_X7,RS_X8,
             RS_X9,RS_X10,RS_X11,RS_X12,RS_X13,RS_X14,RS_X15,RS_X16,RS_X17,
             RS_X19,RS_X20,RS_X21,RS_X22,RS_X23,RS_X24,RS_X25,RS_X26,RS_X27,RS_X28
             { maybe we can enable this in the future for leaf functions (it's
               the frame pointer)
             ,RS_X29 }],
            first_int_imreg,[]);

        rg[R_MMREGISTER]:=trgcpu.create(R_MMREGISTER,R_SUBMMD,
            [RS_Q0,RS_Q1,RS_Q2,RS_Q3,RS_Q4,RS_Q5,RS_Q6,RS_Q7,
             RS_Q8,RS_Q9,RS_Q10,RS_Q11,RS_Q12,RS_Q13,RS_Q14,RS_Q15,
             RS_Q16,RS_Q17,RS_Q18,RS_Q19,RS_Q20,RS_Q21,RS_Q22,RS_Q23,
             RS_Q24,RS_Q25,RS_Q26,RS_Q27,RS_Q28,RS_Q29,RS_Q30,RS_Q31],
            first_mm_imreg,[]);
      end;

    procedure tcgaarch64.done_register_allocators;
      begin
        rg[R_INTREGISTER].free;
        rg[R_FPUREGISTER].free;
        rg[R_MMREGISTER].free;
        inherited done_register_allocators;
      end;

    function tcgaarch64.getmmregister(list: TAsmList; size: tcgsize):tregister;
      begin
        case size of
          OS_F32:
            result:=rg[R_MMREGISTER].getregister(list,R_SUBMMS);
          OS_F64:
            result:=rg[R_MMREGISTER].getregister(list,R_SUBMMD)
          else
            internalerror(2014102701);
        end;
      end;

    procedure tcgaarch64.a_call_name(list: TAsmList; const s: string; weak: boolean);
      begin
        if not weak then
          list.concat(taicpu.op_sym(A_BL,current_asmdata.RefAsmSymbol(s,AT_FUNCTION)))
        else
          list.concat(taicpu.op_sym(A_BL,current_asmdata.WeakRefAsmSymbol(s,AT_FUNCTION)));
      end;

    procedure tcgaarch64.a_call_reg(list:TAsmList;Reg:tregister);
      begin
        list.concat(taicpu.op_reg(A_BLR,reg));
      end;
{********************** load instructions ********************}

    procedure tcgaarch64.a_load_const_reg(list: TAsmList; size: tcgsize; a: tcgint; reg : tregister);
      var
        opc: tasmop;
        shift: byte;
        so: tshifterop;
        reginited,doinverted,extendedsize: boolean;
        manipulated_a: tcgint;
        leftover_a: word;
      begin
{$ifdef extdebug}
        list.concat(tai_comment.Create(strpnew('Generating constant ' + tostr(a) + ' / $' + hexstr(a, 16))));
{$endif extdebug}
        extendedsize := (size in [OS_64,OS_S64]);

        case a of
          { Small positive number }
          $0..$FFFF:
            begin
              list.concat(taicpu.op_reg_const(A_MOVZ, reg, a));
              Exit;
            end;
          { Small negative number }
          -65536..-1:
            begin
              list.concat(taicpu.op_reg_const(A_MOVN, reg, Word(not a)));
              Exit;
            end;
          { Can be represented as a negative number more compactly }
          $FFFF0000..$FFFFFFFF:
            begin
              { if we load a value into a 32 bit register, it is automatically
                zero-extended to 64 bit }
              list.concat(taicpu.op_reg_const(A_MOVN, makeregsize(reg,OS_32), Word(not a)));
              Exit;
            end;
          else
            begin
              if not extendedsize then
                { Mostly so programmers don't get confused when they view the
                  disassembly and 'a' is sign-extended to 64-bit, say, but also
                  avoids potential problems with third-party assemblers if the
                  number is out of bounds for a given size }
                a := Cardinal(a);

              { Check whether a is a valid shifter constant that can be encoded
                in ORR as is }
              if is_shifter_const(a,size) then
                begin
                  { Use the synthetic "MOV" instruction instead of
                    "ORR reg,wzr,#a" (an alias), since AArch64 conventions
                    prefer this, and it's clearer in the disassembly }
                  list.concat(taicpu.op_reg_const(A_MOV,reg,a));
                  Exit;
                end;

              { If the value of a fits into 32 bits, it's fastest to use
                movz/movk regardless }
              if extendedsize and ((a shr 32) <> 0) then
                begin
                  { This determines whether this write can be performed with an
                    ORR followed by MOVK by copying the 3rd word to the 1st word
                    for the ORR constant, then overwriting the 1st word. The
                    alternative would require 4 instructions. This sequence is
                    common when division reciprocals are calculated (e.g. 3
                    produces AAAAAAAAAAAAAAAB). }
                  leftover_a := word(a and $FFFF);
                  manipulated_a := (a and $FFFFFFFFFFFF0000) or ((a shr 32) and $FFFF);
                  { if manipulated_a = a, don't check, because is_shifter_const
                    was already called for a and it returned False. Reduces
                    processing time. [Kit] }
                  if (manipulated_a <> a) and is_shifter_const(manipulated_a, OS_64) then
                    begin
                      { Encode value as:
                          orr  reg,xzr,manipulated_a
                          movk reg,#(leftover_a)
                        Use "orr" instead of "mov" here for the assembly dump so
                        it better implies that something special is happening
                        with the number arrangement. }
                      list.concat(taicpu.op_reg_reg_const(A_ORR, reg, NR_XZR, manipulated_a));
                      list.concat(taicpu.op_reg_const(A_MOVK, reg, leftover_a));
                      Exit;
                    end;

                  { This determines whether this write can be performed with an
                    ORR followed by MOVK by copying the 2nd word to the 4th word
                    for the ORR constant, then overwriting the 4th word. The
                    alternative would require 3 instructions }
                  leftover_a := word(a shr 48);
                  manipulated_a := (a and $0000FFFFFFFFFFFF);
                  if manipulated_a = $0000FFFFFFFFFFFF then
                    begin
                      { This is even better, as we can just use a single MOVN on
                        the last word }
                      shifterop_reset(so);
                      so.shiftmode := SM_LSL;
                      so.shiftimm := 48;
                      list.concat(taicpu.op_reg_const_shifterop(A_MOVN, reg, word(not leftover_a), so));
                      Exit;
                    end;

                  manipulated_a := manipulated_a or (((a shr 16) and $FFFF) shl 48);
                  { if manipulated_a = a, don't check, because is_shifter_const
                    was already called for a and it returned False. Reduces
                    processing time. [Kit] }
                  if (manipulated_a <> a) and is_shifter_const(manipulated_a, OS_64) then
                    begin
                      { Encode value as:
                          orr  reg,xzr,manipulated_a
                          movk reg,#(leftover_a),lsl #48
                        Use "orr" instead of "mov" here for the assembly dump so
                        it better implies that something special is happening
                        with the number arrangement. }
                      list.concat(taicpu.op_reg_reg_const(A_ORR, reg, NR_XZR, manipulated_a));
                      shifterop_reset(so);
                      so.shiftmode := SM_LSL;
                      so.shiftimm := 48;
                      list.concat(taicpu.op_reg_const_shifterop(A_MOVK, reg, leftover_a, so));
                      Exit;
                    end;

                  case a of
                    { If a is in the given negative range, it can be stored
                      more efficiently if it is inverted. }
                    TCgInt($FFFF000000000000)..-65537:
                      begin
                        { NOTE: This excluded range can be more efficiently
                          stored as the first 16 bits followed by a shifter
                          constant }
                        case a of
                          TCgInt($FFFF0000FFFF0000)..TCgInt($FFFF0000FFFFFFFF):
                            doinverted := False;
                          else
                            begin
                              doinverted := True;
                              a := not a;
                            end;
                        end;
                      end;
                    else
                      doinverted := False;
                  end;
                end
              else
                doinverted:=False;
            end;
        end;

        reginited:=false;
        shift:=0;

        if doinverted then
          opc:=A_MOVN
        else
          opc:=A_MOVZ;

        repeat
          { leftover is shifterconst? (don't check if we can represent it just
            as effectively with movz/movk, as this check is expensive) }
          if (word(a)<>0) then
            begin
              if not doinverted and
                 ((shift<tcgsize2size[size]*(8 div 2)) and
                  ((a shr 16)<>0)) and
                 is_shifter_const(a shl shift,size) then
                begin
                  if reginited then
                    list.concat(taicpu.op_reg_reg_const(A_ORR,reg,reg,a shl shift))
                  else
                    list.concat(taicpu.op_reg_reg_const(A_ORR,reg,makeregsize(NR_XZR,size),a shl shift));
                  exit;
                end;

              { set all 16 bit parts <> 0 }
              if shift=0 then
                begin
                  list.concat(taicpu.op_reg_const(opc,reg,word(a)));
                  reginited:=true;
                end
              else
                begin
                  shifterop_reset(so);
                  so.shiftmode:=SM_LSL;
                  so.shiftimm:=shift;
                  if not reginited then
                    begin
                      list.concat(taicpu.op_reg_const_shifterop(opc,reg,word(a),so));
                      reginited:=true;
                    end
                  else
                    begin
                      if doinverted then
                        list.concat(taicpu.op_reg_const_shifterop(A_MOVK,reg,word(not a),so))
                      else
                        list.concat(taicpu.op_reg_const_shifterop(A_MOVK,reg,word(a),so));
                    end;
                end;
            end;
          a:=a shr 16;
          inc(shift,16);
        until a = 0;

        if not reginited then
          internalerror(2014102702);
      end;
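
    { Illustrative sketch, not from the original source: the 64 bit constant
      $0000123400005678 is emitted in 16 bit chunks, skipping the all-zero
      ones:

          movz  x0, #$5678
          movk  x0, #$1234, lsl #32

      while a division reciprocal like $AAAAAAAAAAAAAAAB (mentioned above)
      takes the ORR+MOVK path:

          orr   x0, xzr, #$AAAAAAAAAAAAAAAA
          movk  x0, #$AAAB }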
    procedure tcgaarch64.a_load_const_ref(list: TAsmList; size: tcgsize; a: tcgint; const ref: treference);
      var
        reg: tregister;
        href: treference;
        i: Integer;
      begin
        { use the zero register if possible }
        if a=0 then
          begin
            href:=ref;
            inc(href.offset,tcgsize2size[size]-1);
            if (tcgsize2size[size]>1) and (ref.alignment=1) and
               (simple_ref_type(A_STUR,OS_8,PF_None,ref)=sr_simple) and
               (simple_ref_type(A_STUR,OS_8,PF_None,href)=sr_simple) then
              begin
                href:=ref;
                for i:=0 to tcgsize2size[size]-1 do
                  begin
                    a_load_const_ref(list,OS_8,0,href);
                    inc(href.offset);
                  end;
              end
            else
              begin
                if size in [OS_64,OS_S64] then
                  reg:=NR_XZR
                else
                  reg:=NR_WZR;
                a_load_reg_ref(list,size,size,reg,ref);
              end;
          end
        else
          inherited;
      end;
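
    { Illustrative sketch, not from the original source: storing the constant
      0 to an aligned 64 bit location simply stores the hardwired zero
      register,

          str   xzr, [x0]

      so no general purpose register has to be allocated for the constant. }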
    procedure tcgaarch64.a_load_reg_ref(list: TAsmList; fromsize, tosize: tcgsize; reg: tregister; const ref: treference);
      var
        oppostfix:toppostfix;
        hreg: tregister;
      begin
        if tcgsize2Size[fromsize]>=tcgsize2Size[tosize] then
          begin
            fromsize:=tosize;
            reg:=makeregsize(list,reg,fromsize);
          end
        { have a 32 bit register but need a 64 bit one? }
        else if tosize in [OS_64,OS_S64] then
          begin
            { sign extend if necessary }
            if fromsize in [OS_S8,OS_S16,OS_S32] then
              begin
                { can't overwrite reg, may be a constant reg }
                hreg:=getintregister(list,tosize);
                a_load_reg_reg(list,fromsize,tosize,reg,hreg);
                reg:=hreg;
              end
            else
              { top 32 bit are zero by default }
              reg:=makeregsize(reg,OS_64);
            fromsize:=tosize;
          end;
        if not(target_info.system=system_aarch64_darwin) and (ref.alignment<>0) and
           (ref.alignment<tcgsize2size[tosize]) then
          begin
            a_load_reg_ref_unaligned(list,fromsize,tosize,reg,ref);
          end
        else
          begin
            case tosize of
              { signed integer registers }
              OS_8,
              OS_S8:
                oppostfix:=PF_B;
              OS_16,
              OS_S16:
                oppostfix:=PF_H;
              OS_32,
              OS_S32,
              OS_64,
              OS_S64:
                oppostfix:=PF_None;
              else
                InternalError(200308299);
            end;
            handle_load_store(list,A_STR,tosize,oppostfix,reg,ref);
          end;
      end;
    procedure tcgaarch64.a_load_ref_reg(list: TAsmList; fromsize, tosize: tcgsize; const ref: treference; reg: tregister);
      var
        oppostfix:toppostfix;
      begin
        if tcgsize2Size[fromsize]>=tcgsize2Size[tosize] then
          fromsize:=tosize;
        { ensure that all bits of the 32/64 bit register are always correctly
          set:
          * default behaviour is always to zero-extend to the entire (64 bit)
            register -> unsigned 8/16/32 bit loads only exist with a 32 bit
            target register, as the upper 32 bit will be zeroed implicitly
            -> always make target register 32 bit
          * signed loads exist both with 32 and 64 bit target registers,
            depending on whether the value should be sign extended to 32 or
            to 64 bit (if sign extended to 32 bit, the upper 32 bits of the
            corresponding 64 bit register are again zeroed) -> no need to
            change anything (we only have 32 and 64 bit registers), except that
            when loading an OS_S32 to a 32 bit register, we don't need/can't
            use sign extension }
        if fromsize in [OS_8,OS_16,OS_32] then
          reg:=makeregsize(reg,OS_32);
        if not(target_info.system=system_aarch64_darwin) and (ref.alignment<>0) and
           (ref.alignment<tcgsize2size[fromsize]) then
          begin
            a_load_ref_reg_unaligned(list,fromsize,tosize,ref,reg);
            exit;
          end;
        case fromsize of
          { signed integer registers }
          OS_8:
            oppostfix:=PF_B;
          OS_S8:
            oppostfix:=PF_SB;
          OS_16:
            oppostfix:=PF_H;
          OS_S16:
            oppostfix:=PF_SH;
          OS_S32:
            if getsubreg(reg)=R_SUBD then
              oppostfix:=PF_NONE
            else
              oppostfix:=PF_SW;
          OS_32,
          OS_64,
          OS_S64:
            oppostfix:=PF_None;
          else
            InternalError(200308297);
        end;
        handle_load_store(list,A_LDR,fromsize,oppostfix,reg,ref);

        { clear upper 16 bits if the value was negative }
        if (fromsize=OS_S8) and (tosize=OS_16) then
          a_load_reg_reg(list,fromsize,tosize,reg,reg);
      end;
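
    { Illustrative sketch, not from the original source: the postfix selection
      above turns a shortint load into a 64 bit register into

          ldrsb x0, [x1]    ; PF_SB, sign-extends into the full register

      while an unsigned byte load always targets the 32 bit register form,

          ldrb  w0, [x1]    ; upper 32 bits are zeroed implicitly }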
    procedure tcgaarch64.a_load_ref_reg_unaligned(list: TAsmList; fromsize, tosize: tcgsize; const ref: treference; register: tregister);
      var
        href: treference;
        hreg1, hreg2, tmpreg,tmpreg2: tregister;
        i : Integer;
      begin
        case fromsize of
          OS_64,OS_S64:
            begin
              { split into two 32 bit loads }
              hreg1:=getintregister(list,OS_32);
              hreg2:=getintregister(list,OS_32);
              if target_info.endian=endian_big then
                begin
                  tmpreg:=hreg1;
                  hreg1:=hreg2;
                  hreg2:=tmpreg;
                end;
              { can we use LDP? }
              if (ref.alignment=4) and
                 (simple_ref_type(A_LDP,OS_32,PF_None,ref)=sr_simple) then
                list.concat(taicpu.op_reg_reg_ref(A_LDP,hreg1,hreg2,ref))
              else
                begin
                  a_load_ref_reg(list,OS_32,OS_32,ref,hreg1);
                  href:=ref;
                  inc(href.offset,4);
                  a_load_ref_reg(list,OS_32,OS_32,href,hreg2);
                end;
              a_load_reg_reg(list,OS_32,OS_64,hreg1,register);
              list.concat(taicpu.op_reg_reg_const_const(A_BFI,register,makeregsize(hreg2,OS_64),32,32));
            end;
          OS_16,OS_S16,
          OS_32,OS_S32:
            begin
              if ref.alignment=2 then
                begin
                  href:=ref;
                  if target_info.endian=endian_big then
                    inc(href.offset,tcgsize2size[fromsize]-2);
                  tmpreg:=getintregister(list,OS_32);
                  a_load_ref_reg(list,OS_16,OS_32,href,tmpreg);
                  tmpreg2:=getintregister(list,OS_32);
                  for i:=1 to (tcgsize2size[fromsize]-1) div 2 do
                    begin
                      if target_info.endian=endian_big then
                        dec(href.offset,2)
                      else
                        inc(href.offset,2);
                      a_load_ref_reg(list,OS_16,OS_32,href,tmpreg2);
                      list.concat(taicpu.op_reg_reg_const_const(A_BFI,tmpreg,tmpreg2,i*16,16));
                    end;
                  a_load_reg_reg(list,fromsize,tosize,tmpreg,register);
                end
              else
                begin
                  href:=ref;
                  if target_info.endian=endian_big then
                    inc(href.offset,tcgsize2size[fromsize]-1);
                  tmpreg:=getintregister(list,OS_32);
                  a_load_ref_reg(list,OS_8,OS_32,href,tmpreg);
                  tmpreg2:=getintregister(list,OS_32);
                  for i:=1 to tcgsize2size[fromsize]-1 do
                    begin
                      if target_info.endian=endian_big then
                        dec(href.offset)
                      else
                        inc(href.offset);
                      a_load_ref_reg(list,OS_8,OS_32,href,tmpreg2);
                      list.concat(taicpu.op_reg_reg_const_const(A_BFI,tmpreg,tmpreg2,i*8,8));
                    end;
                  a_load_reg_reg(list,fromsize,tosize,tmpreg,register);
                end;
            end;
          else
            inherited;
        end;
      end;
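
    { Illustrative sketch, not from the original source: a 32 bit load from a
      2-aligned reference on a little-endian target becomes

          ldrh  w2, [x0]           ; low halfword
          ldrh  w3, [x0, #2]       ; high halfword
          bfi   w2, w3, #16, #16   ; merge into one 32 bit value }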
    procedure tcgaarch64.a_load_reg_reg(list:TAsmList;fromsize,tosize:tcgsize;reg1,reg2:tregister);
      var
        instr: taicpu;
      begin
        { we use both 32 and 64 bit registers -> insert conversion when
          we have to truncate/sign extend inside the (32 or 64 bit) register
          holding the value, and when we sign extend from a 32 to a 64 bit
          register }
        if (tcgsize2size[fromsize]>tcgsize2size[tosize]) or
           ((tcgsize2size[fromsize]=tcgsize2size[tosize]) and
            (fromsize<>tosize) and
            not(fromsize in [OS_32,OS_S32,OS_64,OS_S64])) or
           ((fromsize in [OS_S8,OS_S16,OS_S32]) and
            (tosize in [OS_64,OS_S64])) or
           { needs to mask out the sign in the top 16 bits }
           ((fromsize=OS_S8) and
            (tosize=OS_16)) then
          begin
            case tosize of
              OS_8:
                list.concat(taicpu.op_reg_reg(A_UXTB,reg2,makeregsize(reg1,OS_32)));
              OS_16:
                list.concat(taicpu.op_reg_reg(A_UXTH,reg2,makeregsize(reg1,OS_32)));
              OS_S8:
                list.concat(taicpu.op_reg_reg(A_SXTB,reg2,makeregsize(reg1,OS_32)));
              OS_S16:
                list.concat(taicpu.op_reg_reg(A_SXTH,reg2,makeregsize(reg1,OS_32)));
              { while "mov wN, wM" automatically inserts a zero-extension and
                hence we could encode a 64->32 bit move like that, the problem
                is that we then can't distinguish 64->32 from 32->32 moves, and
                the 64->32 truncation could be removed altogether... So use a
                different instruction }
              OS_32,
              OS_S32:
                { in theory, reg1 should be 64 bit here (since fromsize>tosize),
                  but because of the way location_force_register() tries to
                  avoid superfluous zero/sign extensions, it's not always the
                  case -> also force reg1 to 64 bit }
                list.concat(taicpu.op_reg_reg_const_const(A_UBFIZ,makeregsize(reg2,OS_64),makeregsize(reg1,OS_64),0,32));
              OS_64,
              OS_S64:
                list.concat(taicpu.op_reg_reg(A_SXTW,reg2,makeregsize(reg1,OS_32)));
              else
                internalerror(2002090901);
            end;
          end
        else
          begin
            { 32 -> 32 bit move implies zero extension (sign extensions have
              been handled above) -> also use for 32 <-> 64 bit moves }
            if not(fromsize in [OS_64,OS_S64]) or
               not(tosize in [OS_64,OS_S64]) then
              instr:=taicpu.op_reg_reg(A_MOV,makeregsize(reg2,OS_32),makeregsize(reg1,OS_32))
            else
              instr:=taicpu.op_reg_reg(A_MOV,reg2,reg1);
            list.Concat(instr);
            { Notify the register allocator that we have written a move
              instruction so it can try to eliminate it. }
            add_move_instruction(instr);
          end;
      end;
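
    { Illustrative sketch, not from the original source: sign-extending a
      32 bit value into a 64 bit register yields

          sxtw  x0, w1

      while a 64 -> 32 bit truncation deliberately becomes

          ubfiz x0, x1, #0, #32

      rather than "mov w0, w1", so that later passes can still tell
      truncations apart from plain 32 bit moves (see the comment above). }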
    procedure tcgaarch64.a_loadaddr_ref_reg(list: TAsmList; const ref: treference; r: tregister);
      var
        href: treference;
        so: tshifterop;
        op: tasmop;
      begin
        op:=A_LDR;
        href:=ref;
        { simplify as if we're going to perform a regular 64 bit load, using
          "r" as the new base register if possible/necessary }
        make_simple_ref(list,op,OS_ADDR,PF_None,href,r);
        { load literal? }
        if assigned(href.symbol) then
          begin
            if (href.base<>NR_NO) or
               (href.index<>NR_NO) or
               not assigned(href.symboldata) then
              internalerror(2014110912);
            list.concat(taicpu.op_reg_sym_ofs(A_ADR,r,href.symbol,href.offset));
          end
        else
          begin
            if href.index<>NR_NO then
              begin
                if href.shiftmode<>SM_None then
                  begin
                    { "add" supports a superset of the shift modes supported by
                      load/store instructions }
                    shifterop_reset(so);
                    so.shiftmode:=href.shiftmode;
                    so.shiftimm:=href.shiftimm;
                    list.concat(taicpu.op_reg_reg_reg_shifterop(A_ADD,r,href.base,href.index,so));
                  end
                else
                  a_op_reg_reg_reg(list,OP_ADD,OS_ADDR,href.index,href.base,r);
              end
            else if href.offset<>0 then
              a_op_const_reg_reg(list,OP_ADD,OS_ADDR,href.offset,href.base,r)
            else
              a_load_reg_reg(list,OS_ADDR,OS_ADDR,href.base,r);
          end;
      end;
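
    { Illustrative sketch, not from the original source: taking the address of
      an array element via a scaled index reference results in

          add   x0, x1, x2, lsl #3   ; base + index shl 3

      and a plain base+offset reference degenerates into a single add, or a
      register move when the offset is 0. }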
    procedure tcgaarch64.a_loadfpu_reg_reg(list: TAsmList; fromsize, tosize: tcgsize; reg1, reg2: tregister);
      begin
        internalerror(2014122107)
      end;

    procedure tcgaarch64.a_loadfpu_ref_reg(list: TAsmList; fromsize, tosize: tcgsize; const ref: treference; reg: tregister);
      begin
        internalerror(2014122108)
      end;

    procedure tcgaarch64.a_loadfpu_reg_ref(list: TAsmList; fromsize, tosize: tcgsize; reg: tregister; const ref: treference);
      begin
        internalerror(2014122109)
      end;

    procedure tcgaarch64.a_loadmm_reg_reg(list: TAsmList; fromsize, tosize: tcgsize; reg1, reg2: tregister; shuffle: pmmshuffle);
      var
        instr: taicpu;
      begin
        if assigned(shuffle) and
           not shufflescalar(shuffle) then
          internalerror(2014122104);
        if fromsize=tosize then
          begin
            instr:=taicpu.op_reg_reg(A_FMOV,reg2,reg1);
            { Notify the register allocator that we have written a move
              instruction so it can try to eliminate it. }
            add_move_instruction(instr);
            { FMOV cannot generate a floating point exception }
          end
        else
          begin
            if (reg_cgsize(reg1)<>fromsize) or
               (reg_cgsize(reg2)<>tosize) then
              internalerror(2014110913);
            instr:=taicpu.op_reg_reg(A_FCVT,reg2,reg1);
            maybe_check_for_fpu_exception(list);
          end;
        list.Concat(instr);
      end;

    procedure tcgaarch64.a_loadmm_ref_reg(list: TAsmList; fromsize, tosize: tcgsize; const ref: treference; reg: tregister; shuffle: pmmshuffle);
      var
        tmpreg: tregister;
      begin
        if assigned(shuffle) and
           not shufflescalar(shuffle) then
          internalerror(2014122105);
        tmpreg:=NR_NO;
        if (fromsize<>tosize) then
          begin
            tmpreg:=reg;
            reg:=getmmregister(list,fromsize);
          end;
        handle_load_store(list,A_LDR,fromsize,PF_None,reg,ref);
        if (fromsize<>tosize) then
          a_loadmm_reg_reg(list,fromsize,tosize,reg,tmpreg,nil);
      end;

    procedure tcgaarch64.a_loadmm_reg_ref(list: TAsmList; fromsize, tosize: tcgsize; reg: tregister; const ref: treference; shuffle: pmmshuffle);
      var
        tmpreg: tregister;
      begin
        if assigned(shuffle) and
           not shufflescalar(shuffle) then
          internalerror(2014122106);
        if (fromsize<>tosize) then
          begin
            tmpreg:=getmmregister(list,tosize);
            a_loadmm_reg_reg(list,fromsize,tosize,reg,tmpreg,nil);
            reg:=tmpreg;
          end;
        handle_load_store(list,A_STR,tosize,PF_NONE,reg,ref);
      end;

    procedure tcgaarch64.a_loadmm_intreg_reg(list: TAsmList; fromsize, tosize: tcgsize; intreg, mmreg: tregister; shuffle: pmmshuffle);
      begin
        if not shufflescalar(shuffle) then
          internalerror(2014122801);
        if tcgsize2size[fromsize]<>tcgsize2size[tosize] then
          internalerror(2014122803);
        case tcgsize2size[tosize] of
          4:
            setsubreg(mmreg,R_SUBMMS);
          8:
            setsubreg(mmreg,R_SUBMMD);
          else
            internalerror(2020101310);
        end;
        list.concat(taicpu.op_indexedreg_reg(A_INS,mmreg,0,intreg));
      end;

    procedure tcgaarch64.a_loadmm_reg_intreg(list: TAsmList; fromsize, tosize: tcgsize; mmreg, intreg: tregister; shuffle: pmmshuffle);
      var
        r : tregister;
      begin
        if not shufflescalar(shuffle) then
          internalerror(2014122802);
        if tcgsize2size[fromsize]>tcgsize2size[tosize] then
          internalerror(2014122804);
        case tcgsize2size[fromsize] of
          4:
            setsubreg(mmreg,R_SUBMMS);
          8:
            setsubreg(mmreg,R_SUBMMD);
          else
            internalerror(2020101311);
        end;
        if tcgsize2size[fromsize]<tcgsize2size[tosize] then
          r:=makeregsize(intreg,fromsize)
        else
          r:=intreg;
        list.concat(taicpu.op_reg_indexedreg(A_UMOV,r,mmreg,0));
      end;
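
    { Illustrative sketch, not from the original source: moving a 32 bit value
      between the integer and vector register files uses the element moves
      emitted above,

          ins   v0.s[0], w1   ; a_loadmm_intreg_reg
          umov  w1, v0.s[0]   ; a_loadmm_reg_intreg }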
    procedure tcgaarch64.a_opmm_reg_reg(list: TAsmList; Op: TOpCG; size: tcgsize; src, dst: tregister; shuffle: pmmshuffle);
      begin
        case op of
          { "xor Vx,Vx" is used to initialize global regvars to 0 }
          OP_XOR:
            begin
              if shuffle=nil then
                begin
                  dst:=newreg(R_MMREGISTER,getsupreg(dst),R_SUBMM16B);
                  src:=newreg(R_MMREGISTER,getsupreg(src),R_SUBMM16B);
                  list.concat(taicpu.op_reg_reg_reg(A_EOR,dst,dst,src))
                end
              else if (src<>dst) or
                 (reg_cgsize(src)<>size) or
                 assigned(shuffle) then
                internalerror(2015011401)
              else
                case size of
                  OS_F32,
                  OS_F64:
                    list.concat(taicpu.op_reg_const(A_MOVI,makeregsize(dst,OS_F64),0));
                  else
                    internalerror(2015011402);
                end;
            end
          else
            internalerror(2015011403);
        end;
      end;

    procedure tcgaarch64.a_bit_scan_reg_reg(list: TAsmList; reverse: boolean; srcsize, dstsize: tcgsize; src, dst: TRegister);
      var
        bitsize: longint;
      begin
        if srcsize in [OS_64,OS_S64] then
          begin
            bitsize:=64;
          end
        else
          begin
            bitsize:=32;
          end;
        { source is 0 -> dst will have to become 255 }
        list.concat(taicpu.op_reg_const(A_CMP,src,0));
        if reverse then
          begin
            list.Concat(taicpu.op_reg_reg(A_CLZ,makeregsize(dst,srcsize),src));
            { xor 31/63 is the same as setting the lower 5/6 bits to
              "31/63-(lower 5/6 bits of dst)" }
            list.Concat(taicpu.op_reg_reg_const(A_EOR,dst,dst,bitsize-1));
          end
        else
          begin
            list.Concat(taicpu.op_reg_reg(A_RBIT,makeregsize(dst,srcsize),src));
            list.Concat(taicpu.op_reg_reg(A_CLZ,dst,dst));
          end;
        { set dst to -1 if src was 0 }
        list.Concat(taicpu.op_reg_reg_reg_cond(A_CSINV,dst,dst,makeregsize(NR_XZR,dstsize),C_NE));
        { mask the -1 to 255 if src was 0 (anyone find a two-instruction
          branch-free version? All of mine are 3...) }
        list.Concat(taicpu.op_reg_reg(A_UXTB,makeregsize(dst,OS_32),makeregsize(dst,OS_32)));
      end;

procedure tcgaarch64.a_load_reg_ref_unaligned(list: TAsmList; fromsize, tosize: tcgsize; register: tregister; const ref: treference);
  var
    href: treference;
    hreg1, hreg2, tmpreg: tregister;
  begin
    if fromsize in [OS_64,OS_S64] then
      begin
        { split into two 32 bit stores }
        hreg1:=getintregister(list,OS_32);
        hreg2:=getintregister(list,OS_32);
        a_load_reg_reg(list,OS_32,OS_32,makeregsize(register,OS_32),hreg1);
        a_op_const_reg_reg(list,OP_SHR,OS_64,32,register,makeregsize(hreg2,OS_64));
        if target_info.endian=endian_big then
          begin
            tmpreg:=hreg1;
            hreg1:=hreg2;
            hreg2:=tmpreg;
          end;
        { can we use STP? }
        if (ref.alignment=4) and
           (simple_ref_type(A_STP,OS_32,PF_None,ref)=sr_simple) then
          list.concat(taicpu.op_reg_reg_ref(A_STP,hreg1,hreg2,ref))
        else
          begin
            a_load_reg_ref(list,OS_32,OS_32,hreg1,ref);
            href:=ref;
            inc(href.offset,4);
            a_load_reg_ref(list,OS_32,OS_32,hreg2,href);
          end;
      end
    else
      inherited;
  end;
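
{ 8/16 bit operations are carried out in 32 bit registers, so operations
  that can carry/borrow/shift into the upper bits leave garbage there;
  maybeadjustresult re-extends the result afterwards so the full register
  contents stay well-defined. }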

procedure tcgaarch64.maybeadjustresult(list: TAsmList; op: topcg; size: tcgsize; dst: tregister);
  const
    overflowops = [OP_MUL,OP_IMUL,OP_SHL,OP_ADD,OP_SUB,OP_NOT,OP_NEG];
  begin
    if (op in overflowops) and
       (size in [OS_8,OS_S8,OS_16,OS_S16]) then
      a_load_reg_reg(list,OS_32,size,makeregsize(dst,OS_32),makeregsize(dst,OS_32))
  end;

procedure tcgaarch64.a_op_const_reg(list: TAsmList; op: topcg; size: tcgsize; a: tcgint; reg: tregister);
  begin
    optimize_op_const(size,op,a);
    case op of
      OP_NONE:
        exit;
      OP_MOVE:
        a_load_const_reg(list,size,a,reg);
      OP_NEG,OP_NOT:
        internalerror(200306011);
      else
        a_op_const_reg_reg(list,op,size,a,reg,reg);
    end;
  end;

procedure tcgaarch64.a_op_reg_reg(list:TAsmList;op:topcg;size:tcgsize;src,dst:tregister);
  begin
    case op of
      OP_NEG,
      OP_NOT:
        begin
          if (op=OP_NOT) and (size in [OS_8,OS_S8]) then
            list.concat(taicpu.op_reg_reg_const(A_EOR,dst,src,255))
          else
            begin
              list.concat(taicpu.op_reg_reg(TOpCG2AsmOpReg[op],dst,src));
              maybeadjustresult(list,op,size,dst);
            end;
        end
      else
        a_op_reg_reg_reg(list,op,size,src,dst,dst);
    end;
  end;

procedure tcgaarch64.a_op_const_reg_reg(list: TAsmList; op: topcg; size: tcgsize; a: tcgint; src, dst: tregister);
  var
    l: tlocation;
  begin
    a_op_const_reg_reg_checkoverflow(list,op,size,a,src,dst,false,l);
  end;
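
{ A rotate-left by a variable count is lowered below to a rotate-right,
  using rol(x,n) = ror(x,bitsize-n). Illustrative sketch for a 64 bit rol
  by the count in x1 (assuming "ror" assembles to the register form, i.e.
  rorv):
      mov x2, #64
      sub x2, x2, x1
      ror x0, x0, x2
}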

procedure tcgaarch64.a_op_reg_reg_reg(list: TAsmList; op: topcg; size: tcgsize; src1, src2, dst: tregister);
  var
    hreg: tregister;
  begin
    { no ROLV opcode... }
    if op=OP_ROL then
      begin
        case size of
          OS_32,OS_S32,
          OS_64,OS_S64:
            begin
              hreg:=getintregister(list,size);
              a_load_const_reg(list,size,tcgsize2size[size]*8,hreg);
              a_op_reg_reg(list,OP_SUB,size,src1,hreg);
              a_op_reg_reg_reg(list,OP_ROR,size,hreg,src2,dst);
              exit;
            end;
          else
            internalerror(2014111005);
        end;
      end
    else if (op=OP_ROR) and
       not(size in [OS_32,OS_S32,OS_64,OS_S64]) then
      internalerror(2014111006);
    if TOpCG2AsmOpReg[op]=A_NONE then
      internalerror(2014111007);
    list.concat(taicpu.op_reg_reg_reg(TOpCG2AsmOpReg[op],dst,src2,src1));
    maybeadjustresult(list,op,size,dst);
  end;

procedure tcgaarch64.a_op_const_reg_reg_checkoverflow(list: TAsmList; op: topcg; size: tcgsize; a: tcgint; src, dst: tregister; setflags : boolean; var ovloc : tlocation);
  var
    shiftcountmask: longint;
    constreg: tregister;
  begin
    { add/sub instructions have only positive immediate operands }
    if (op in [OP_ADD,OP_SUB]) and
       (a<0) then
      begin
        if op=OP_ADD then
          op:=OP_SUB
        else
          op:=OP_ADD;
        { avoid range/overflow error in case a = low(tcgint) }
        {$push}{$r-}{$q-}
        a:=-a;
        {$pop}
      end;
    ovloc.loc:=LOC_VOID;
    optimize_op_const(size,op,a);
    case op of
      OP_NONE:
        begin
          a_load_reg_reg(list,size,size,src,dst);
          exit;
        end;
      OP_MOVE:
        begin
          a_load_const_reg(list,size,a,dst);
          exit;
        end;
      else
        ;
    end;
    case op of
      OP_ADD,
      OP_SUB:
        begin
          handle_reg_imm12_reg(list,TOpCG2AsmOpImm[op],size,src,a,dst,NR_NO,setflags,true);
          { on a 64 bit target, overflows with smaller data types
            are handled via range errors }
          if setflags and
             (size in [OS_64,OS_S64]) then
            begin
              location_reset(ovloc,LOC_FLAGS,OS_8);
              if size=OS_64 then
                if op=OP_ADD then
                  ovloc.resflags:=F_CS
                else
                  ovloc.resflags:=F_CC
              else
                ovloc.resflags:=F_VS;
            end;
        end;
      OP_OR,
      OP_AND,
      OP_XOR:
        begin
          if not(size in [OS_64,OS_S64]) then
            a:=cardinal(a);
          if is_shifter_const(a,size) then
            list.concat(taicpu.op_reg_reg_const(TOpCG2AsmOpReg[op],dst,src,a))
          else
            begin
              constreg:=getintregister(list,size);
              a_load_const_reg(list,size,a,constreg);
              a_op_reg_reg_reg(list,op,size,constreg,src,dst);
            end;
        end;
      OP_SHL,
      OP_SHR,
      OP_SAR:
        begin
          if size in [OS_64,OS_S64] then
            shiftcountmask:=63
          else
            shiftcountmask:=31;
          if (a and shiftcountmask)<>0 then
            list.concat(taicpu.op_reg_reg_const(
              TOpCG2AsmOpImm[Op],dst,src,a and shiftcountmask))
          else
            a_load_reg_reg(list,size,size,src,dst);
          if (a and not(tcgint(shiftcountmask)))<>0 then
            internalError(2014112101);
        end;
      OP_ROL,
      OP_ROR:
        begin
          case size of
            OS_32,OS_S32:
              if (a and not(tcgint(31)))<>0 then
                internalError(2014112102);
            OS_64,OS_S64:
              if (a and not(tcgint(63)))<>0 then
                internalError(2014112103);
            else
              internalError(2014112104);
          end;
          { there's only a ror opcode }
          if op=OP_ROL then
            a:=(tcgsize2size[size]*8)-a;
          list.concat(taicpu.op_reg_reg_const(A_ROR,dst,src,a));
        end;
      OP_MUL,
      OP_IMUL,
      OP_DIV,
      OP_IDIV:
        begin
          constreg:=getintregister(list,size);
          a_load_const_reg(list,size,a,constreg);
          a_op_reg_reg_reg_checkoverflow(list,op,size,constreg,src,dst,setflags,ovloc);
        end;
      else
        internalerror(2014111403);
    end;
    maybeadjustresult(list,op,size,dst);
  end;
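
{ For flag-setting 64 bit multiplications, the routine below checks the
  upper half of the full 128 bit product. Illustrative sketch for the
  unsigned case (overflow <=> NE):
      umulh x2, x1, x0
      cmp   x2, #0
      mul   x0, x1, x0
}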

procedure tcgaarch64.a_op_reg_reg_reg_checkoverflow(list: TAsmList; op: topcg; size: tcgsize; src1, src2, dst: tregister; setflags : boolean; var ovloc : tlocation);
  var
    tmpreg1, tmpreg2: tregister;
  begin
    ovloc.loc:=LOC_VOID;
    { overflow can only occur with 64 bit calculations on 64 bit cpus }
    if setflags and
       (size in [OS_64,OS_S64]) then
      begin
        case op of
          OP_ADD,
          OP_SUB:
            begin
              list.concat(setoppostfix(taicpu.op_reg_reg_reg(TOpCG2AsmOpReg[op],dst,src2,src1),PF_S));
              ovloc.loc:=LOC_FLAGS;
              if size=OS_64 then
                if op=OP_ADD then
                  ovloc.resflags:=F_CS
                else
                  ovloc.resflags:=F_CC
              else
                ovloc.resflags:=F_VS;
              { finished }
              exit;
            end;
          OP_MUL:
            begin
              { check whether the upper 64 bit of the 128 bit product is 0 }
              tmpreg1:=getintregister(list,OS_64);
              list.concat(taicpu.op_reg_reg_reg(A_UMULH,tmpreg1,src2,src1));
              list.concat(taicpu.op_reg_const(A_CMP,tmpreg1,0));
              ovloc.loc:=LOC_FLAGS;
              ovloc.resflags:=F_NE;
              { still have to perform the actual multiplication }
            end;
          OP_IMUL:
            begin
              { check whether the upper 64 bits of the 128 bit multiplication
                result have the same value as the replicated sign bit of the
                lower 64 bits }
              tmpreg1:=getintregister(list,OS_64);
              list.concat(taicpu.op_reg_reg_reg(A_SMULH,tmpreg1,src2,src1));
              { calculate lower 64 bits (afterwards, because dst may be
                equal to src1 or src2) }
              a_op_reg_reg_reg(list,op,size,src1,src2,dst);
              { replicate sign bit }
              tmpreg2:=getintregister(list,OS_64);
              a_op_const_reg_reg(list,OP_SAR,OS_S64,63,dst,tmpreg2);
              list.concat(taicpu.op_reg_reg(A_CMP,tmpreg1,tmpreg2));
              ovloc.loc:=LOC_FLAGS;
              ovloc.resflags:=F_NE;
              { finished }
              exit;
            end;
          OP_IDIV,
          OP_DIV:
            begin
              { not handled here: needs a div-by-zero check (dividing by zero
                just gives a 0 result on aarch64) and a low(int64) div -1
                check for overflow }
              internalerror(2014122101);
            end;
          else
            internalerror(2019050936);
        end;
      end;
    a_op_reg_reg_reg(list,op,size,src1,src2,dst);
  end;

{*************** compare instructions ****************}
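
{ ADD/SUB-class immediates must be non-negative, so a comparison against
  a negative constant is emitted as CMN with the absolute value: e.g.
  comparing x0 with -5 becomes, as a sketch, "cmn x0, #5", which sets the
  flags exactly like "cmp x0, #-5" would. }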

procedure tcgaarch64.a_cmp_const_reg_label(list: TAsmList; size: tcgsize; cmp_op: topcmp; a: tcgint; reg: tregister; l: tasmlabel);
  var
    op: tasmop;
  begin
    if a>=0 then
      op:=A_CMP
    else
      op:=A_CMN;
    { avoid range/overflow error in case a=low(tcgint) }
    {$push}{$r-}{$q-}
    handle_reg_imm12_reg(list,op,size,reg,abs(a),NR_XZR,NR_NO,false,false);
    {$pop}
    a_jmp_cond(list,cmp_op,l);
  end;

procedure tcgaarch64.a_cmp_reg_reg_label(list: TAsmList; size: tcgsize; cmp_op: topcmp; reg1,reg2: tregister; l: tasmlabel);
  begin
    list.concat(taicpu.op_reg_reg(A_CMP,reg2,reg1));
    a_jmp_cond(list,cmp_op,l);
  end;

procedure tcgaarch64.a_jmp_always(list: TAsmList; l: TAsmLabel);
  var
    ai: taicpu;
  begin
    ai:=TAiCpu.op_sym(A_B,current_asmdata.RefAsmSymbol(l.name,AT_FUNCTION));
    ai.is_jmp:=true;
    list.Concat(ai);
  end;

procedure tcgaarch64.a_jmp_name(list: TAsmList; const s: string);
  var
    ai: taicpu;
  begin
    ai:=TAiCpu.op_sym(A_B,current_asmdata.RefAsmSymbol(s,AT_FUNCTION));
    ai.is_jmp:=true;
    list.Concat(ai);
  end;

procedure tcgaarch64.a_jmp_cond(list: TAsmList; cond: TOpCmp; l: TAsmLabel);
  var
    ai: taicpu;
  begin
    ai:=TAiCpu.op_sym(A_B,l);
    ai.is_jmp:=true;
    ai.SetCondition(TOpCmp2AsmCond[cond]);
    list.Concat(ai);
  end;

procedure tcgaarch64.a_jmp_flags(list: TAsmList; const f: tresflags; l: tasmlabel);
  var
    ai : taicpu;
  begin
    ai:=Taicpu.op_sym(A_B,l);
    ai.is_jmp:=true;
    ai.SetCondition(flags_to_cond(f));
    list.Concat(ai);
  end;

procedure tcgaarch64.g_flags2reg(list: TAsmList; size: tcgsize; const f: tresflags; reg: tregister);
  begin
    list.concat(taicpu.op_reg_cond(A_CSET,reg,flags_to_cond(f)));
  end;

procedure tcgaarch64.g_overflowcheck(list: TAsmList; const loc: tlocation; def: tdef);
  begin
    { we need an explicit overflow location, because there are many
      possibilities (not just the overflow flag, which is only used for
      signed add/sub) }
    internalerror(2014112303);
  end;

procedure tcgaarch64.g_overflowcheck_loc(list: TAsmList; const loc: tlocation; def: tdef; ovloc : tlocation);
  var
    hl : tasmlabel;
    hflags : tresflags;
  begin
    if not(cs_check_overflow in current_settings.localswitches) then
      exit;
    current_asmdata.getjumplabel(hl);
    case ovloc.loc of
      LOC_FLAGS:
        begin
          hflags:=ovloc.resflags;
          inverse_flags(hflags);
          cg.a_jmp_flags(list,hflags,hl);
        end;
      else
        internalerror(2014112304);
    end;
    a_call_name(list,'FPC_OVERFLOW',false);
    a_label(list,hl);
  end;

{ *********** entry/exit code and address loading ************ }
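
{ Callee-saved registers are pushed pairwise with pre-indexed stores so
  the stack pointer stays 16 byte aligned at every step. Illustrative
  sketch if x19..x22 were modified:
      stp x19, x20, [sp, #-16]!
      stp x21, x22, [sp, #-16]!
}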

function tcgaarch64.save_regs(list: TAsmList; rt: tregistertype; lowsr, highsr: tsuperregister; sub: tsubregister): longint;
  var
    ref: treference;
    sr: tsuperregister;
    pairreg: tregister;
    sehreg,sehregp : TAsmSehDirective;
  begin
    result:=0;
    reference_reset_base(ref,NR_SP,-16,ctempposinvalid,16,[]);
    ref.addressmode:=AM_PREINDEXED;
    pairreg:=NR_NO;
    { for SEH on Win64 we can only store consecutive register pairs, others
      need to be stored with STR }
    if target_info.system=system_aarch64_win64 then
      begin
        if rt=R_INTREGISTER then
          begin
            sehreg:=ash_savereg_x;
            sehregp:=ash_saveregp_x;
          end
        else if rt=R_MMREGISTER then
          begin
            sehreg:=ash_savefreg_x;
            sehregp:=ash_savefregp_x;
          end
        else
          internalerror(2020041304);
        for sr:=lowsr to highsr do
          if sr in rg[rt].used_in_proc then
            if pairreg=NR_NO then
              pairreg:=newreg(rt,sr,sub)
            else
              begin
                inc(result,16);
                if getsupreg(pairreg)=sr-1 then
                  begin
                    list.concat(taicpu.op_reg_reg_ref(A_STP,pairreg,newreg(rt,sr,sub),ref));
                    list.concat(cai_seh_directive.create_reg_offset(sehregp,pairreg,16));
                    pairreg:=NR_NO;
                  end
                else
                  begin
                    list.concat(taicpu.op_reg_ref(A_STR,pairreg,ref));
                    list.concat(cai_seh_directive.create_reg_offset(sehreg,pairreg,16));
                    pairreg:=newreg(rt,sr,sub);
                  end;
              end;
        if pairreg<>NR_NO then
          begin
            inc(result,16);
            list.concat(taicpu.op_reg_ref(A_STR,pairreg,ref));
            list.concat(cai_seh_directive.create_reg_offset(sehreg,pairreg,16));
          end;
      end
    else
      begin
        { store all used registers pairwise }
        for sr:=lowsr to highsr do
          if sr in rg[rt].used_in_proc then
            if pairreg=NR_NO then
              pairreg:=newreg(rt,sr,sub)
            else
              begin
                inc(result,16);
                list.concat(taicpu.op_reg_reg_ref(A_STP,pairreg,newreg(rt,sr,sub),ref));
                pairreg:=NR_NO
              end;
        { one left -> store twice (stack must be 16 bytes aligned) }
        if pairreg<>NR_NO then
          begin
            list.concat(taicpu.op_reg_reg_ref(A_STP,pairreg,pairreg,ref));
            inc(result,16);
          end;
      end;
  end;
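
{ Helper for g_proc_entry below: once the final frame size is known, it
  rewrites the location of stack-based parameters and locals from
  SP-relative to FP-relative (see the long comment in g_proc_entry). }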

procedure FixupOffsets(p:TObject;arg:pointer);
  var
    sym: tabstractnormalvarsym absolute p;
  begin
    if (tsym(p).typ in [paravarsym,localvarsym]) and
       (sym.localloc.loc=LOC_REFERENCE) and
       (sym.localloc.reference.base=NR_STACK_POINTER_REG) then
      begin
        sym.localloc.reference.base:=NR_FRAME_POINTER_REG;
        dec(sym.localloc.reference.offset,PLongint(arg)^);
      end;
  end;
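
{ Typical prologue shape produced below for a routine with a stack frame
  (illustrative sketch only):
      stp x29, x30, [sp, #-16]!   // save FP/LR
      mov x29, sp                 // set up the frame pointer
      ...                         // save_regs for x19..x28 / d8..d15
      sub sp, sp, #localsize      // allocate locals and temps
}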

procedure tcgaarch64.g_proc_entry(list: TAsmList; localsize: longint; nostackframe: boolean);
  var
    hitem: tlinkedlistitem;
    seh_proc: tai_seh_directive;
    templist: TAsmList;
    suppress_endprologue: boolean;
    ref: treference;
    totalstackframesize: longint;
  begin
    hitem:=list.last;
    { pi_has_unwind_info may already be set at this point if there are
      SEH directives in the assembler body. In this case, .seh_endprologue
      is expected to be one of those directives, and not generated here. }
    suppress_endprologue:=(pi_has_unwind_info in current_procinfo.flags);
    if not nostackframe then
      begin
        { stack pointer has to be aligned to 16 bytes at all times }
        localsize:=align(localsize,16);
        if target_info.system=system_aarch64_win64 then
          include(current_procinfo.flags,pi_has_unwind_info);
        { save stack pointer and return address }
        reference_reset_base(ref,NR_SP,-16,ctempposinvalid,16,[]);
        ref.addressmode:=AM_PREINDEXED;
        list.concat(taicpu.op_reg_reg_ref(A_STP,NR_FP,NR_LR,ref));
        current_asmdata.asmcfi.cfa_def_cfa_offset(list,2*sizeof(pint));
        current_asmdata.asmcfi.cfa_offset(list,NR_FP,-16);
        current_asmdata.asmcfi.cfa_offset(list,NR_LR,-8);
        if target_info.system=system_aarch64_win64 then
          list.concat(cai_seh_directive.create_offset(ash_savefplr_x,16));
        { initialise frame pointer }
        if current_procinfo.procdef.proctypeoption<>potype_exceptfilter then
          begin
            a_load_reg_reg(list,OS_ADDR,OS_ADDR,NR_SP,NR_FP);
            current_asmdata.asmcfi.cfa_def_cfa_register(list,NR_FP);
            if target_info.system=system_aarch64_win64 then
              list.concat(cai_seh_directive.create(ash_setfp));
          end
        else
          begin
            gen_load_frame_for_exceptfilter(list);
            localsize:=current_procinfo.maxpushedparasize;
          end;
        totalstackframesize:=localsize;
        { save modified integer registers }
        inc(totalstackframesize,
          save_regs(list,R_INTREGISTER,RS_X19,RS_X28,R_SUBWHOLE));
        { only the lower 64 bits of the modified vector registers need to be
          saved; if the caller needs the upper 64 bits, it has to save them
          itself }
        inc(totalstackframesize,
          save_regs(list,R_MMREGISTER,RS_D8,RS_D15,R_SUBMMD));
        { allocate stack space }
        if localsize<>0 then
          begin
            localsize:=align(localsize,16);
            current_procinfo.final_localsize:=localsize;
            handle_reg_imm12_reg(list,A_SUB,OS_ADDR,NR_SP,localsize,NR_SP,NR_IP0,false,true);
            if target_info.system=system_aarch64_win64 then
              list.concat(cai_seh_directive.create_offset(ash_stackalloc,localsize));
          end;
        { By default, we use the frame pointer to access parameters passed via
          the stack and the stack pointer to address local variables and temps
          because
          a) we can use bigger positive than negative offsets (so accessing
             locals via negative offsets from the frame pointer would be less
             efficient)
          b) we don't know the local size while generating the code, so
             accessing the parameters via the stack pointer is not possible
             without copying them
          The problem with this is the get_frame() intrinsic:
          a) it must return the same value as what we pass as parentfp
             parameter, since that's how it's used in the TP-style objects unit
          b) its return value must be usable to access all local data from a
             routine (locals and parameters), since that's all the nested
             routines have access to
          c) its return value must be usable to construct a backtrace, as it's
             also used by the exception handling routines
          The solution we use here, based on something similar that's done in
          the MIPS port, is to generate all accesses to locals in the routine
          itself SP-relative, and then after the code is generated and the
          local size is known (namely, here), we change all SP-relative
          variables/parameters into FP-relative ones. This means that they'll
          be accessed less efficiently from nested routines, but those
          accesses are indirect anyway and at least this way they can be
          accessed at all }
        if current_procinfo.has_nestedprocs or
           (
             (target_info.system=system_aarch64_win64) and
             (current_procinfo.flags*[pi_has_implicit_finally,pi_needs_implicit_finally,pi_uses_exceptions]<>[])
           ) then
          begin
            current_procinfo.procdef.localst.SymList.ForEachCall(@FixupOffsets,@totalstackframesize);
            current_procinfo.procdef.parast.SymList.ForEachCall(@FixupOffsets,@totalstackframesize);
          end;
      end;
    if not (pi_has_unwind_info in current_procinfo.flags) then
      exit;
    { Generate unwind data for aarch64-win64 }
    seh_proc:=cai_seh_directive.create_name(ash_proc,current_procinfo.procdef.mangledname);
    if assigned(hitem) then
      list.insertafter(seh_proc,hitem)
    else
      list.insert(seh_proc);
    { the directive creates another section }
    inc(list.section_count);
    templist:=TAsmList.Create;
    if not suppress_endprologue then
      begin
        templist.concat(cai_seh_directive.create(ash_endprologue));
      end;
    if assigned(current_procinfo.endprologue_ai) then
      current_procinfo.aktproccode.insertlistafter(current_procinfo.endprologue_ai,templist)
    else
      list.concatlist(templist);
    templist.free;
  end;

procedure tcgaarch64.g_maybe_got_init(list : TAsmList);
  begin
    { nothing to do on Darwin or Linux }
  end;

procedure tcgaarch64.g_restore_registers(list:TAsmList);
  begin
    { done in g_proc_exit }
  end;
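
{ Counterpart of save_regs: reloads the callee-saved registers with
  post-indexed loads in the reverse order of the stores, so the same
  "[sp], #16" reference can be reused for every pop. }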

procedure tcgaarch64.load_regs(list: TAsmList; rt: tregistertype; lowsr, highsr: tsuperregister; sub: tsubregister);
  var
    ref: treference;
    sr, highestsetsr: tsuperregister;
    pairreg: tregister;
    i,
    regcount: longint;
    aiarr : array of tai;
  begin
    reference_reset_base(ref,NR_SP,16,ctempposinvalid,16,[]);
    ref.addressmode:=AM_POSTINDEXED;
    regcount:=0;
    { due to SEH on Win64 we can only load consecutive register pairs, and
      single ones are done using LDR, so we need to handle this differently
      there }
    if target_info.system=system_aarch64_win64 then
      begin
        setlength(aiarr,highsr-lowsr+1);
        pairreg:=NR_NO;
        for sr:=lowsr to highsr do
          if sr in rg[rt].used_in_proc then
            begin
              if pairreg=NR_NO then
                pairreg:=newreg(rt,sr,sub)
              else
                begin
                  if getsupreg(pairreg)=sr-1 then
                    begin
                      aiarr[regcount]:=taicpu.op_reg_reg_ref(A_LDP,pairreg,newreg(rt,sr,sub),ref);
                      inc(regcount);
                      pairreg:=NR_NO;
                    end
                  else
                    begin
                      aiarr[regcount]:=taicpu.op_reg_ref(A_LDR,pairreg,ref);
                      inc(regcount);
                      pairreg:=newreg(rt,sr,sub);
                    end;
                end;
            end;
        if pairreg<>NR_NO then
          begin
            aiarr[regcount]:=taicpu.op_reg_ref(A_LDR,pairreg,ref);
            inc(regcount);
            pairreg:=NR_NO;
          end;
        for i:=regcount-1 downto 0 do
          list.concat(aiarr[i]);
      end
    else
      begin
        { highest reg stored twice? }
        highestsetsr:=RS_NO;
        for sr:=lowsr to highsr do
          if sr in rg[rt].used_in_proc then
            begin
              inc(regcount);
              highestsetsr:=sr;
            end;
        if odd(regcount) then
          begin
            list.concat(taicpu.op_reg_ref(A_LDR,newreg(rt,highestsetsr,sub),ref));
            highestsetsr:=pred(highestsetsr);
          end;
        { load all (other) used registers pairwise }
        pairreg:=NR_NO;
        for sr:=highestsetsr downto lowsr do
          if sr in rg[rt].used_in_proc then
            if pairreg=NR_NO then
              pairreg:=newreg(rt,sr,sub)
            else
              begin
                list.concat(taicpu.op_reg_reg_ref(A_LDP,newreg(rt,sr,sub),pairreg,ref));
                pairreg:=NR_NO
              end;
      end;
    { There can't be any register left }
    if pairreg<>NR_NO then
      internalerror(2014112602);
  end;

procedure tcgaarch64.g_proc_exit(list : TAsmList;parasize:longint;nostackframe:boolean);
  var
    ref: treference;
    regsstored: boolean;
    sr: tsuperregister;
  begin
    if not(nostackframe) and
       { we do not need an exit stack frame when we never return
         * the final ret is left in so the peephole optimizer can easily do
           call/ret -> jmp or call conversions
         * the entry stack frame must still be generated because the
           subroutine could be left via an exception, in which case the
           unwinding code might need to restore the registers stored by the
           entry code }
       not(po_noreturn in current_procinfo.procdef.procoptions) then
      begin
        { if no registers have been stored, we don't have to subtract the
          allocated temp space from the stack pointer }
        regsstored:=false;
        for sr:=RS_X19 to RS_X28 do
          if sr in rg[R_INTREGISTER].used_in_proc then
            begin
              regsstored:=true;
              break;
            end;
        if not regsstored then
          for sr:=RS_D8 to RS_D15 do
            if sr in rg[R_MMREGISTER].used_in_proc then
              begin
                regsstored:=true;
                break;
              end;
        { restore registers (and stack pointer) }
        if regsstored then
          begin
            if current_procinfo.final_localsize<>0 then
              handle_reg_imm12_reg(list,A_ADD,OS_ADDR,NR_SP,current_procinfo.final_localsize,NR_SP,NR_IP0,false,true);
            load_regs(list,R_MMREGISTER,RS_D8,RS_D15,R_SUBMMD);
            load_regs(list,R_INTREGISTER,RS_X19,RS_X28,R_SUBWHOLE);
          end
        else if current_procinfo.final_localsize<>0 then
          { restore stack pointer }
          a_load_reg_reg(list,OS_ADDR,OS_ADDR,NR_FP,NR_SP);
        { restore framepointer and return address }
        reference_reset_base(ref,NR_SP,16,ctempposinvalid,16,[]);
        ref.addressmode:=AM_POSTINDEXED;
        list.concat(taicpu.op_reg_reg_ref(A_LDP,NR_FP,NR_LR,ref));
      end;
    { return }
    list.concat(taicpu.op_none(A_RET));
    if (pi_has_unwind_info in current_procinfo.flags) then
      begin
        tcpuprocinfo(current_procinfo).dump_scopes(list);
        list.concat(cai_seh_directive.create(ash_endproc));
      end;
  end;

procedure tcgaarch64.g_save_registers(list : TAsmList);
  begin
    { done in g_proc_entry }
  end;

{ ************* concatcopy ************ }
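
{ Copies that are not worth unrolling or looping over inline are
  delegated to the RTL Move() routine (called via FPC_MOVE); the three
  parameters are materialised through the parameter manager in reverse
  order (len, dest, source). }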

procedure tcgaarch64.g_concatcopy_move(list : TAsmList;const source,dest : treference;len : tcgint);
  var
    paraloc1,paraloc2,paraloc3 : TCGPara;
    pd : tprocdef;
  begin
    pd:=search_system_proc('MOVE');
    paraloc1.init;
    paraloc2.init;
    paraloc3.init;
    paramanager.getcgtempparaloc(list,pd,1,paraloc1);
    paramanager.getcgtempparaloc(list,pd,2,paraloc2);
    paramanager.getcgtempparaloc(list,pd,3,paraloc3);
    a_load_const_cgpara(list,OS_SINT,len,paraloc3);
    a_loadaddr_ref_cgpara(list,dest,paraloc2);
    a_loadaddr_ref_cgpara(list,source,paraloc1);
    paramanager.freecgpara(list,paraloc3);
    paramanager.freecgpara(list,paraloc2);
    paramanager.freecgpara(list,paraloc1);
    alloccpuregisters(list,R_INTREGISTER,paramanager.get_volatile_registers_int(pocall_default));
    alloccpuregisters(list,R_MMREGISTER,paramanager.get_volatile_registers_mm(pocall_default));
    a_call_name(list,'FPC_MOVE',false);
    dealloccpuregisters(list,R_MMREGISTER,paramanager.get_volatile_registers_mm(pocall_default));
    dealloccpuregisters(list,R_INTREGISTER,paramanager.get_volatile_registers_int(pocall_default));
    paraloc3.done;
    paraloc2.done;
    paraloc1.done;
  end;

procedure tcgaarch64.g_concatcopy(list: TAsmList; const source, dest: treference; len: tcgint);
  var
    sourcebasereplaced, destbasereplaced: boolean;

  { get optimal memory operation to use for loading/storing data
    in an unrolled loop }
  procedure getmemop(scaledop, unscaledop: tasmop; const startref, endref: treference; opsize: tcgsize; postfix: toppostfix; out memop: tasmop; out needsimplify: boolean);
    begin
      if (simple_ref_type(scaledop,opsize,postfix,startref)=sr_simple) and
         (simple_ref_type(scaledop,opsize,postfix,endref)=sr_simple) then
        begin
          memop:=unscaledop;
          needsimplify:=true;
        end
      else if (unscaledop<>A_NONE) and
         (simple_ref_type(unscaledop,opsize,postfix,startref)=sr_simple) and
         (simple_ref_type(unscaledop,opsize,postfix,endref)=sr_simple) then
        begin
          memop:=unscaledop;
          needsimplify:=false;
        end
      else
        begin
          memop:=scaledop;
          needsimplify:=true;
        end;
    end;

  { adjust the offset and/or addressing mode after a load/store so it's
    correct for the next one of the same size }
  procedure updaterefafterloadstore(var ref: treference; oplen: longint);
    begin
      case ref.addressmode of
        AM_OFFSET:
          inc(ref.offset,oplen);
        AM_POSTINDEXED:
          { base register updated by instruction, next offset can remain
            the same }
          ;
        AM_PREINDEXED:
          begin
            { base register updated by instruction -> next instruction can
              use post-indexing with offset = sizeof(operation) }
            ref.offset:=0;
            ref.addressmode:=AM_OFFSET;
          end;
      end;
    end;

  { generate a load/store and adjust the reference offset to the next
    memory location if necessary }
  procedure genloadstore(list: TAsmList; op: tasmop; reg: tregister; var ref: treference; postfix: toppostfix; opsize: tcgsize);
    begin
      list.concat(setoppostfix(taicpu.op_reg_ref(op,reg,ref),postfix));
      updaterefafterloadstore(ref,tcgsize2size[opsize]);
    end;

  { generate a dual load/store (ldp/stp) and adjust the reference offset to
    the next memory location if necessary }
  procedure gendualloadstore(list: TAsmList; op: tasmop; reg1, reg2: tregister; var ref: treference; postfix: toppostfix; opsize: tcgsize);
    begin
      list.concat(setoppostfix(taicpu.op_reg_reg_ref(op,reg1,reg2,ref),postfix));
      updaterefafterloadstore(ref,tcgsize2size[opsize]*2);
    end;

  { turn a reference into a pre- or post-indexed reference for use in a
    load/store of a particular size }
  procedure makesimpleforcopy(list: TAsmList; var scaledop: tasmop; opsize: tcgsize; postfix: toppostfix; forcepostindexing: boolean; var ref: treference; var basereplaced: boolean);
    var
      tmpreg: tregister;
      scaledoffset: longint;
      orgaddressmode: taddressmode;
    begin
      scaledoffset:=tcgsize2size[opsize];
      if scaledop in [A_LDP,A_STP] then
        scaledoffset:=scaledoffset*2;
      { can we use the reference as post-indexed without changes? }
      if forcepostindexing then
        begin
          orgaddressmode:=ref.addressmode;
          ref.addressmode:=AM_POSTINDEXED;
          if (orgaddressmode=AM_POSTINDEXED) or
             ((ref.offset=0) and
              (simple_ref_type(scaledop,opsize,postfix,ref)=sr_simple)) then
            begin
              { just change the post-indexed offset to the access size }
              ref.offset:=scaledoffset;
              { and replace the base register if that didn't happen yet
                (could be sp or a regvar) }
              if not basereplaced then
                begin
                  tmpreg:=getaddressregister(list);
                  a_load_reg_reg(list,OS_ADDR,OS_ADDR,ref.base,tmpreg);
                  ref.base:=tmpreg;
                  basereplaced:=true;
                end;
              exit;
            end;
          ref.addressmode:=orgaddressmode;
        end;
{$ifdef dummy}
      This could in theory be useful in case you have a concatcopy from
      e.g. x1+255 to x1+267 *and* the reference is aligned, but this seems
      very unlikely. Disabled because it still needs fixes, as it
      also generates pre-indexed loads right now at the very end for the
      left-over gencopies

      { can we turn it into a pre-indexed reference for free? (after the
        first operation, it will be turned into an offset one) }
      if not forcepostindexing and
         (ref.offset<>0) then
        begin
          orgaddressmode:=ref.addressmode;
          ref.addressmode:=AM_PREINDEXED;
          tmpreg:=ref.base;
          if not basereplaced and
             (ref.base=tmpreg) then
            begin
              tmpreg:=getaddressregister(list);
              a_load_reg_reg(list,OS_ADDR,OS_ADDR,ref.base,tmpreg);
              ref.base:=tmpreg;
              basereplaced:=true;
            end;
          if simple_ref_type(scaledop,opsize,postfix,ref)<>sr_simple then
            make_simple_ref(list,scaledop,opsize,postfix,ref,NR_NO);
          exit;
        end;
{$endif dummy}
      if not forcepostindexing then
        begin
          ref.addressmode:=AM_OFFSET;
          make_simple_ref(list,scaledop,opsize,postfix,ref,NR_NO);
          { this may still cause problems if the final offset is no longer
            a simple ref; it's a bit complicated to pass all information
            through at all places and check that here, so play safe: we
            currently never generate unrolled copies for more than 64
            bytes (32 with non-double-register copies) }
          if ref.index=NR_NO then
            begin
              if ((scaledop in [A_LDP,A_STP]) and
                  (ref.offset<((64-8)*tcgsize2size[opsize]))) or
                 ((scaledop in [A_LDUR,A_STUR]) and
                  (ref.offset<(255-8*tcgsize2size[opsize]))) or
                 ((scaledop in [A_LDR,A_STR]) and
                  (ref.offset<((4096-8)*tcgsize2size[opsize]))) then
                exit;
            end;
        end;
      tmpreg:=getaddressregister(list);
      a_loadaddr_ref_reg(list,ref,tmpreg);
      basereplaced:=true;
      if forcepostindexing then
        begin
          reference_reset_base(ref,tmpreg,scaledoffset,ref.temppos,ref.alignment,ref.volatility);
          ref.addressmode:=AM_POSTINDEXED;
        end
      else
        begin
          reference_reset_base(ref,tmpreg,0,ref.temppos,ref.alignment,ref.volatility);
          ref.addressmode:=AM_OFFSET;
        end
    end;

  { prepare a reference for use by gencopy. This is done both after the
    unrolled and regular copy loop -> get rid of post-indexing mode, make
    sure ref is valid }
  procedure preparecopy(list: tasmlist; scaledop, unscaledop: tasmop; var ref: treference; opsize: tcgsize; postfix: toppostfix; out op: tasmop; var basereplaced: boolean);
    var
      simplify: boolean;
    begin
      if ref.addressmode=AM_POSTINDEXED then
        ref.offset:=tcgsize2size[opsize];
      getmemop(scaledop,scaledop,ref,ref,opsize,postfix,op,simplify);
      if simplify then
        begin
          makesimpleforcopy(list,scaledop,opsize,postfix,false,ref,basereplaced);
          op:=scaledop;
        end;
    end;

  { generate a copy from source to dest of size opsize/postfix }
  procedure gencopy(list: TAsmList; var source, dest: treference; postfix: toppostfix; opsize: tcgsize);
    var
      reg: tregister;
      loadop, storeop: tasmop;
    begin
      preparecopy(list,A_LDR,A_LDUR,source,opsize,postfix,loadop,sourcebasereplaced);
      preparecopy(list,A_STR,A_STUR,dest,opsize,postfix,storeop,destbasereplaced);
      reg:=getintregister(list,opsize);
      genloadstore(list,loadop,reg,source,postfix,opsize);
      genloadstore(list,storeop,reg,dest,postfix,opsize);
    end;

  { copy the leftovers after an unrolled or regular copy loop }
  procedure gencopyleftovers(list: TAsmList; var source, dest: treference; len: longint);
    begin
      { stop post-indexing if we did so in the loop, since in that case all
        offsets definitely can be represented now }
      if source.addressmode=AM_POSTINDEXED then
        begin
          source.addressmode:=AM_OFFSET;
          source.offset:=0;
        end;
      if dest.addressmode=AM_POSTINDEXED then
        begin
          dest.addressmode:=AM_OFFSET;
          dest.offset:=0;
        end;
      { transfer the leftovers }
      if len>=8 then
        begin
          dec(len,8);
          gencopy(list,source,dest,PF_NONE,OS_64);
        end;
      if len>=4 then
        begin
          dec(len,4);
          gencopy(list,source,dest,PF_NONE,OS_32);
        end;
      if len>=2 then
        begin
          dec(len,2);
          gencopy(list,source,dest,PF_H,OS_16);
        end;
      if len>=1 then
        begin
          dec(len);
          gencopy(list,source,dest,PF_B,OS_8);
        end;
    end;
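
  { Strategy overview for the main body below, summarising the code: a
    single load/store for lengths of 1/2/4/8 bytes with suitable
    alignment; an unrolled sequence of at most 4 loads plus 4 stores
    (8 registers with ldp/stp) for short copies; a count-register loop
    for medium ones; and a call to FPC_MOVE once the copy is too long
    for the inline code to pay off. }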

  const
    { load_length + loop dec + cbnz }
    loopoverhead=12;
    { loop overhead + load + store }
    totallooplen=loopoverhead + 8;
  var
    totalalign: longint;
    maxlenunrolled: tcgint;
    loadop, storeop: tasmop;
    opsize: tcgsize;
    postfix: toppostfix;
    tmpsource, tmpdest: treference;
    scaledstoreop, unscaledstoreop,
    scaledloadop, unscaledloadop: tasmop;
    regs: array[1..8] of tregister;
    countreg: tregister;
    i, regcount: longint;
    hl: tasmlabel;
    simplifysource, simplifydest: boolean;
  begin
    if len=0 then
      exit;
    sourcebasereplaced:=false;
    destbasereplaced:=false;
    { maximum common alignment }
    totalalign:=max(1,newalignment(source.alignment,dest.alignment));
    { use a simple load/store? }
    if (len in [1,2,4,8]) and
       ((totalalign>=(len div 2)) or
        (source.alignment=len) or
        (dest.alignment=len)) then
      begin
        opsize:=int_cgsize(len);
        a_load_ref_ref(list,opsize,opsize,source,dest);
        exit;
      end;
    { alignment > length is not useful, and would break some checks below }
    while totalalign>len do
      totalalign:=totalalign div 2;
    { operation sizes to use based on common alignment }
    case totalalign of
      1:
        begin
          postfix:=PF_B;
          opsize:=OS_8;
        end;
      2:
        begin
          postfix:=PF_H;
          opsize:=OS_16;
        end;
      4:
        begin
          postfix:=PF_None;
          opsize:=OS_32;
        end
      else
        begin
          totalalign:=8;
          postfix:=PF_None;
          opsize:=OS_64;
        end;
    end;
    { maximum length to be handled with an unrolled loop (4 loads + 4 stores) }
    maxlenunrolled:=min(totalalign,8)*4;
    { ldp/stp -> 2 registers per instruction }
    if (totalalign>=4) and
       (len>=totalalign*2) then
      begin
        maxlenunrolled:=maxlenunrolled*2;
        scaledstoreop:=A_STP;
        scaledloadop:=A_LDP;
        unscaledstoreop:=A_NONE;
        unscaledloadop:=A_NONE;
      end
    else
      begin
        scaledstoreop:=A_STR;
        scaledloadop:=A_LDR;
        unscaledstoreop:=A_STUR;
        unscaledloadop:=A_LDUR;
      end;
    { we only need 4 instructions extra to call FPC_MOVE }
    if cs_opt_size in current_settings.optimizerswitches then
      maxlenunrolled:=maxlenunrolled div 2;
    if (len>maxlenunrolled) and
       (len>totalalign*8) then
      begin
        g_concatcopy_move(list,source,dest,len);
        exit;
      end;
    simplifysource:=true;
    simplifydest:=true;
    tmpsource:=source;
    tmpdest:=dest;
    { can we directly encode all offsets in an unrolled loop? }
    if len<=maxlenunrolled then
      begin
{$ifdef extdebug}
        list.concat(tai_comment.Create(strpnew('concatcopy unrolled loop; len/opsize/align: '+tostr(len)+'/'+tostr(tcgsize2size[opsize])+'/'+tostr(totalalign))));
{$endif extdebug}
        { the leftovers will be handled separately -> -(len mod opsize) }
        inc(tmpsource.offset,len-(len mod tcgsize2size[opsize]));
        { additionally, the last regular load/store will be at
          offset+len-opsize (if len-(len mod opsize)>len) }
        if tmpsource.offset>source.offset then
          dec(tmpsource.offset,tcgsize2size[opsize]);
        getmemop(scaledloadop,unscaledloadop,source,tmpsource,opsize,postfix,loadop,simplifysource);
        inc(tmpdest.offset,len-(len mod tcgsize2size[opsize]));
        if tmpdest.offset>dest.offset then
          dec(tmpdest.offset,tcgsize2size[opsize]);
        getmemop(scaledstoreop,unscaledstoreop,dest,tmpdest,opsize,postfix,storeop,simplifydest);
        tmpsource:=source;
        tmpdest:=dest;
        { if we can't directly encode all offsets, simplify }
        if simplifysource then
          begin
            loadop:=scaledloadop;
            makesimpleforcopy(list,loadop,opsize,postfix,false,tmpsource,sourcebasereplaced);
          end;
        if simplifydest then
          begin
            storeop:=scaledstoreop;
            makesimpleforcopy(list,storeop,opsize,postfix,false,tmpdest,destbasereplaced);
          end;
        regcount:=len div tcgsize2size[opsize];
        { in case we transfer two registers at a time, we copy an even
          number of registers }
        if loadop=A_LDP then
          regcount:=regcount and not(1);
        { initialise for dfa }
        regs[low(regs)]:=NR_NO;
        { max 4 loads/stores -> max 8 registers (in case of ldp/stp) }
        for i:=1 to regcount do
          regs[i]:=getintregister(list,opsize);
        if loadop=A_LDP then
          begin
            { load registers }
            for i:=1 to (regcount div 2) do
              gendualloadstore(list,loadop,regs[i*2-1],regs[i*2],tmpsource,postfix,opsize);
            { store registers }
            for i:=1 to (regcount div 2) do
              gendualloadstore(list,storeop,regs[i*2-1],regs[i*2],tmpdest,postfix,opsize);
          end
        else
          begin
            for i:=1 to regcount do
              genloadstore(list,loadop,regs[i],tmpsource,postfix,opsize);
            for i:=1 to regcount do
              genloadstore(list,storeop,regs[i],tmpdest,postfix,opsize);
          end;
        { leftover }
        len:=len-regcount*tcgsize2size[opsize];
{$ifdef extdebug}
        list.concat(tai_comment.Create(strpnew('concatcopy unrolled loop leftover: '+tostr(len))));
{$endif extdebug}
      end
    else
      begin
{$ifdef extdebug}
        list.concat(tai_comment.Create(strpnew('concatcopy regular loop; len/align: '+tostr(len)+'/'+tostr(totalalign))));
{$endif extdebug}
        { regular loop -> definitely use post-indexing }
        loadop:=scaledloadop;
        makesimpleforcopy(list,loadop,opsize,postfix,true,tmpsource,sourcebasereplaced);
        storeop:=scaledstoreop;
        makesimpleforcopy(list,storeop,opsize,postfix,true,tmpdest,destbasereplaced);
        current_asmdata.getjumplabel(hl);
        countreg:=getintregister(list,OS_32);
        { ldp/stp transfer 2*opsize bytes per loop iteration }
        if loadop=A_LDP then
          a_load_const_reg(list,OS_32,len div (tcgsize2size[opsize]*2),countreg)
        else
          a_load_const_reg(list,OS_32,len div tcgsize2size[opsize],countreg);
        a_label(list,hl);
        a_op_const_reg(list,OP_SUB,OS_32,1,countreg);
        if loadop=A_LDP then
          begin
            regs[1]:=getintregister(list,opsize);
            regs[2]:=getintregister(list,opsize);
            gendualloadstore(list,loadop,regs[1],regs[2],tmpsource,postfix,opsize);
            gendualloadstore(list,storeop,regs[1],regs[2],tmpdest,postfix,opsize);
          end
        else
          begin
            regs[1]:=getintregister(list,opsize);
            genloadstore(list,loadop,regs[1],tmpsource,postfix,opsize);
            genloadstore(list,storeop,regs[1],tmpdest,postfix,opsize);
          end;
        list.concat(taicpu.op_reg_sym_ofs(A_CBNZ,countreg,hl,0));
        if loadop=A_LDP then
          len:=len mod (tcgsize2size[opsize]*2)
        else
          len:=len mod tcgsize2size[opsize];
      end;
    gencopyleftovers(list,tmpsource,tmpdest,len);
  end;

procedure tcgaarch64.g_adjust_self_value(list:TAsmList;procdef: tprocdef;ioffset: tcgint);
  begin
    { This method is integrated into g_intf_wrapper and shouldn't be
      called separately }
    InternalError(2013020102);
  end;
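
{ FPSR layout relied upon below (architecturally defined): bits 0..4
  ($1f) are the cumulative IOC/DZC/OFC/UFC/IXC exception flags and bit 7
  ($80) is IDC (input denormal); if either group is non-zero, the RTL
  exception handler is invoked. }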

procedure tcgaarch64.g_check_for_fpu_exception(list: TAsmList;force,clear : boolean);
  var
    r, tmpreg: TRegister;
    ai: taicpu;
    l1,l2: TAsmLabel;
  begin
    { so far, we assume all flavours of AArch64 need explicit floating
      point exception checking }
    if ((cs_check_fpu_exceptions in current_settings.localswitches) and
        (force or current_procinfo.FPUExceptionCheckNeeded)) then
      begin
        r:=getintregister(list,OS_INT);
        tmpreg:=getintregister(list,OS_INT);
        list.concat(taicpu.op_reg_reg(A_MRS,r,NR_FPSR));
        list.concat(taicpu.op_reg_reg_const(A_AND,tmpreg,r,$1f));
        current_asmdata.getjumplabel(l1);
        current_asmdata.getjumplabel(l2);
        ai:=taicpu.op_reg_sym_ofs(A_CBNZ,tmpreg,l1,0);
        ai.is_jmp:=true;
        list.concat(ai);
        list.concat(taicpu.op_reg_reg_const(A_AND,tmpreg,r,$80));
        ai:=taicpu.op_reg_sym_ofs(A_CBZ,tmpreg,l2,0);
        ai.is_jmp:=true;
        list.concat(ai);
        a_label(list,l1);
        alloccpuregisters(list,R_INTREGISTER,paramanager.get_volatile_registers_int(pocall_default));
        cg.a_call_name(list,'FPC_THROWFPUEXCEPTION',false);
        dealloccpuregisters(list,R_INTREGISTER,paramanager.get_volatile_registers_int(pocall_default));
        a_label(list,l2);
        if clear then
          current_procinfo.FPUExceptionCheckNeeded:=false;
      end;
  end;
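
{ gprof-style profiling: as implemented here for linux, _mcount receives
  the caller's return address, so x30 is copied to x0 before the call
  (sketch: "mov x0, x30"). }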

procedure tcgaarch64.g_profilecode(list : TAsmList);
  begin
    if target_info.system = system_aarch64_linux then
      begin
        list.concat(taicpu.op_reg_reg(A_MOV,NR_X0,NR_X30));
        a_call_name(list,'_mcount',false);
      end
    else
      internalerror(2020021901);
  end;

procedure create_codegen;
  begin
    cg:=tcgaarch64.Create;
    cg128:=tcg128.Create;
  end;

end.