{ cgcpu.pas — AArch64 code generator (listing-viewer artifacts removed) }
  1. {
  2. Copyright (c) 2014 by Jonas Maebe
  3. This unit implements the code generator for AArch64
  4. This program is free software; you can redistribute it and/or modify
  5. it under the terms of the GNU General Public License as published by
  6. the Free Software Foundation; either version 2 of the License, or
  7. (at your option) any later version.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. GNU General Public License for more details.
  12. You should have received a copy of the GNU General Public License
  13. along with this program; if not, write to the Free Software
  14. Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  15. ****************************************************************************
  16. }
  17. unit cgcpu;
  18. {$i fpcdefs.inc}
  19. interface
  20. uses
  21. globtype,parabase,
  22. cgbase,cgutils,cgobj,
  23. aasmbase,aasmtai,aasmdata,aasmcpu,
  24. cpubase,cpuinfo,
  25. node,symconst,SymType,symdef,
  26. rgcpu;
type
    { Code generator for AArch64: implements the virtual code-generation
      interface of tcg by emitting A64 instructions. }
    tcgaarch64=class(tcg)
     protected
      { changes register size without adding register allocation info }
      function makeregsize(reg: tregister; size: tcgsize): tregister; overload;
     public
      { simplifies "ref" so it can be used with "op". If "ref" can be used
        with a different load/Store operation that has the same meaning as the
        original one, "op" will be replaced with the alternative }
      procedure make_simple_ref(list:TAsmList; var op: tasmop; size: tcgsize; oppostfix: toppostfix; var ref: treference; preferred_newbasereg: tregister);
      function getfpuregister(list: TAsmList; size: Tcgsize): Tregister; override;
      procedure handle_reg_imm12_reg(list: TAsmList; op: Tasmop; size: tcgsize; src: tregister; a: tcgint; dst: tregister; tmpreg: tregister; setflags, usedest: boolean);
      procedure init_register_allocators;override;
      procedure done_register_allocators;override;
      function getmmregister(list:TAsmList;size:tcgsize):tregister;override;
      function handle_load_store(list:TAsmList; op: tasmop; size: tcgsize; oppostfix: toppostfix; reg: tregister; ref: treference):treference;
      procedure a_call_name(list:TAsmList;const s:string; weak: boolean);override;
      procedure a_call_reg(list:TAsmList;Reg:tregister);override;
      { General purpose instructions }
      procedure maybeadjustresult(list: TAsmList; op: topcg; size: tcgsize; dst: tregister);
      procedure a_op_const_reg(list: TAsmList; op: topcg; size: tcgsize; a: tcgint; reg: tregister);override;
      procedure a_op_reg_reg(list: TAsmList; op: topcg; size: tcgsize; src, dst: tregister);override;
      procedure a_op_const_reg_reg(list: TAsmList; op: topcg; size: tcgsize; a: tcgint; src, dst: tregister);override;
      procedure a_op_reg_reg_reg(list: TAsmList; op: topcg; size: tcgsize; src1, src2, dst: tregister);override;
      procedure a_op_const_reg_reg_checkoverflow(list: TAsmList; op: topcg; size: tcgsize; a: tcgint; src, dst: tregister; setflags : boolean; var ovloc : tlocation);override;
      procedure a_op_reg_reg_reg_checkoverflow(list: TAsmList; op: topcg; size: tcgsize; src1, src2, dst: tregister; setflags : boolean; var ovloc : tlocation);override;
      { move instructions }
      procedure a_load_const_reg(list: TAsmList; size: tcgsize; a: tcgint; reg: tregister);override;
      procedure a_load_const_ref(list: TAsmList; size: tcgsize; a: tcgint; const ref: treference); override;
      procedure a_load_reg_ref(list: TAsmList; fromsize, tosize: tcgsize; reg: tregister;const ref: TReference);override;
      procedure a_load_reg_ref_unaligned(list: TAsmList; fromsize, tosize: tcgsize; register: tregister; const ref: treference); override;
      procedure a_load_ref_reg(list: TAsmList; fromsize, tosize: tcgsize; const ref: TReference; reg: tregister);override;
      procedure a_load_ref_reg_unaligned(list: TAsmList; fromsize, tosize: tcgsize; const ref: treference; register: tregister); override;
      procedure a_load_reg_reg(list: TAsmList; fromsize, tosize: tcgsize; reg1, reg2: tregister);override;
      procedure a_loadaddr_ref_reg(list: TAsmList; const ref: TReference; r: tregister);override;
      { fpu move instructions (not used, all floating point is vector unit-based) }
      procedure a_loadfpu_reg_reg(list: TAsmList; fromsize, tosize: tcgsize; reg1, reg2: tregister); override;
      procedure a_loadfpu_ref_reg(list: TAsmList; fromsize, tosize: tcgsize; const ref: treference; reg: tregister); override;
      procedure a_loadfpu_reg_ref(list: TAsmList; fromsize, tosize: tcgsize; reg: tregister; const ref: treference); override;
      { vector register (mm) moves and operations }
      procedure a_loadmm_reg_reg(list: TAsmList; fromsize, tosize: tcgsize; reg1, reg2: tregister;shuffle : pmmshuffle);override;
      procedure a_loadmm_ref_reg(list: TAsmList; fromsize, tosize: tcgsize; const ref: TReference; reg: tregister; shuffle: pmmshuffle);override;
      procedure a_loadmm_reg_ref(list: TAsmList; fromsize, tosize: tcgsize; reg: tregister; const ref: TReference; shuffle: pmmshuffle);override;
      procedure a_loadmm_intreg_reg(list: TAsmList; fromsize, tosize: tcgsize; intreg, mmreg: tregister; shuffle: pmmshuffle); override;
      procedure a_loadmm_reg_intreg(list: TAsmList; fromsize, tosize: tcgsize; mmreg, intreg: tregister; shuffle: pmmshuffle); override;
      procedure a_opmm_reg_reg(list: TAsmList; Op: TOpCG; size: tcgsize; src, dst: tregister; shuffle: pmmshuffle); override;
      procedure a_bit_scan_reg_reg(list: TAsmList; reverse: boolean; srcsize, dstsize: tcgsize; src, dst: TRegister); override;
      { comparison operations }
      procedure a_cmp_const_reg_label(list: TAsmList; size: tcgsize; cmp_op: topcmp; a: tcgint; reg: tregister; l: tasmlabel);override;
      procedure a_cmp_reg_reg_label(list: TAsmList; size: tcgsize; cmp_op: topcmp; reg1, reg2: tregister; l: tasmlabel);override;
      procedure a_jmp_always(list: TAsmList; l: TAsmLabel);override;
      procedure a_jmp_name(list: TAsmList; const s: string);override;
      procedure a_jmp_cond(list: TAsmList; cond: TOpCmp; l: tasmlabel);{ override;}
      procedure a_jmp_flags(list: TAsmList; const f: tresflags; l: tasmlabel);override;
      procedure g_flags2reg(list: TAsmList; size: tcgsize; const f:tresflags; reg: tregister);override;
      { overflow checking and procedure frame management }
      procedure g_overflowcheck(list: TAsmList; const loc: tlocation; def: tdef);override;
      procedure g_overflowcheck_loc(list: TAsmList; const loc: tlocation; def: tdef; ovloc: tlocation);override;
      procedure g_proc_entry(list: TAsmList; localsize: longint; nostackframe: boolean);override;
      procedure g_proc_exit(list: TAsmList; parasize: longint; nostackframe: boolean);override;
      procedure g_maybe_got_init(list: TAsmList); override;
      procedure g_restore_registers(list: TAsmList);override;
      procedure g_save_registers(list: TAsmList);override;
      procedure g_concatcopy_move(list: TAsmList; const source, dest: treference; len: tcgint);
      procedure g_concatcopy(list: TAsmList; const source, dest: treference; len: tcgint);override;
      procedure g_adjust_self_value(list: TAsmList; procdef: tprocdef; ioffset: tcgint);override;
     private
      { helpers used by g_save_registers/g_restore_registers to spill and
        reload a consecutive range of callee-saved registers }
      function save_regs(list: TAsmList; rt: tregistertype; lowsr, highsr: tsuperregister; sub: tsubregister): longint;
      procedure load_regs(list: TAsmList; rt: tregistertype; lowsr, highsr: tsuperregister; sub: tsubregister);
    end;
  95. procedure create_codegen;
  const
    { topcg -> A64 opcode for the register-register operand form; the array
      is positional, so it must stay in sync with the declaration order of
      topcg. Shifts use the register-shift variants (ASRV/LSLV/LSRV/RORV). }
    TOpCG2AsmOpReg: array[topcg] of TAsmOp = (
      A_NONE,A_MOV,A_ADD,A_AND,A_UDIV,A_SDIV,A_MUL,A_MUL,A_NEG,A_MVN,A_ORR,A_ASRV,A_LSLV,A_LSRV,A_SUB,A_EOR,A_NONE,A_RORV
    );
    { topcg -> A64 opcode when the last operand is an immediate; identical to
      the table above except that shifts map to the immediate forms
      (ASR/LSL/LSR/ROR) }
    TOpCG2AsmOpImm: array[topcg] of TAsmOp = (
      A_NONE,A_MOV,A_ADD,A_AND,A_UDIV,A_SDIV,A_MUL,A_MUL,A_NEG,A_MVN,A_ORR,A_ASR,A_LSL,A_LSR,A_SUB,A_EOR,A_NONE,A_ROR
    );
    { topcmp -> A64 condition code; also positional with respect to topcmp.
      The unsigned comparisons map to LS/CC/CS/HI. }
    TOpCmp2AsmCond: array[topcmp] of TAsmCond = (C_NONE,C_EQ,C_GT,
      C_LT,C_GE,C_LE,C_NE,C_LS,C_CC,C_CS,C_HI
    );
  106. implementation
  107. uses
  108. globals,verbose,systems,cutils,
  109. paramgr,fmodule,
  110. symtable,symsym,
  111. tgobj,
  112. procinfo,cpupi;
{ Rewrites "ref" (and, when necessary, "op") into an addressing form that the
  given load/store instruction can actually encode:
  - resolves symbol references via ADRP + page offset (or a GOT load),
  - folds unencodable base+index or offset combinations into a scratch base
    register ("preferred_newbasereg"; NR_NO means "allocate one on demand"),
  - switches between LDR/STR and LDUR/STUR depending on the offset range.
  Exits as soon as the reference is legal for "op". }
procedure tcgaarch64.make_simple_ref(list:TAsmList; var op: tasmop; size: tcgsize; oppostfix: toppostfix; var ref: treference; preferred_newbasereg: tregister);
  var
    href: treference;
    so: tshifterop;
    accesssize: longint;
  begin
    { normalise: an index-only reference becomes a base-only reference; a
      shifted index cannot simply be moved to the base position }
    if (ref.base=NR_NO) then
      begin
        if ref.shiftmode<>SM_None then
          internalerror(2014110701);
        ref.base:=ref.index;
        ref.index:=NR_NO;
      end;
    { no abitrary scale factor support (the generic code doesn't set it,
      AArch-specific code shouldn't either) }
    if not(ref.scalefactor in [0,1]) then
      internalerror(2014111002);
    case simple_ref_type(op,size,oppostfix,ref) of
      sr_simple:
        exit;
      sr_internal_illegal:
        internalerror(2014121702);
      sr_complex:
        { continue } ;
    end;
    if assigned(ref.symbol) then
      begin
        { internal "load symbol" instructions should already be valid }
        if assigned(ref.symboldata) or
           (ref.refaddr in [addr_pic,addr_gotpage,addr_gotpageoffset,addr_page,addr_pageoffset]) then
          internalerror(2014110802);
        { no relative symbol support (needed) yet }
        if assigned(ref.relsymbol) then
          internalerror(2014111001);
        { loading a symbol address (whether it's in the GOT or not) consists
          of two parts: first load the page on which it is located, then
          either the offset in the page or load the value at that offset in
          the page. This final GOT-load can be relaxed by the linker in case
          the variable itself can be stored directly in the GOT }
        if (preferred_newbasereg=NR_NO) or
           (ref.base=preferred_newbasereg) or
           (ref.index=preferred_newbasereg) then
          preferred_newbasereg:=getaddressregister(list);
        { load the (GOT) page }
        reference_reset_symbol(href,ref.symbol,0,8,[]);
        if ((ref.symbol.typ in [AT_FUNCTION,AT_LABEL]) and
            (ref.symbol.bind in [AB_LOCAL,AB_GLOBAL])) or
           ((ref.symbol.typ=AT_DATA) and
            (ref.symbol.bind=AB_LOCAL)) then
          href.refaddr:=addr_page
        else
          href.refaddr:=addr_gotpage;
        list.concat(taicpu.op_reg_ref(A_ADRP,preferred_newbasereg,href));
        { load the GOT entry (= address of the variable) }
        reference_reset_base(href,preferred_newbasereg,0,ctempposinvalid,sizeof(pint),[]);
        href.symbol:=ref.symbol;
        { code symbols defined in the current compilation unit do not
          have to be accessed via the GOT }
        if ((ref.symbol.typ in [AT_FUNCTION,AT_LABEL]) and
            (ref.symbol.bind in [AB_LOCAL,AB_GLOBAL])) or
           ((ref.symbol.typ=AT_DATA) and
            (ref.symbol.bind=AB_LOCAL)) then
          begin
            href.base:=NR_NO;
            href.refaddr:=addr_pageoffset;
            list.concat(taicpu.op_reg_reg_ref(A_ADD,preferred_newbasereg,preferred_newbasereg,href));
          end
        else
          begin
            href.refaddr:=addr_gotpageoffset;
            { use a_load_ref_reg() rather than directly encoding the LDR,
              so that we'll check the validity of the reference }
            a_load_ref_reg(list,OS_ADDR,OS_ADDR,href,preferred_newbasereg);
          end;
        { set as new base register }
        if ref.base=NR_NO then
          ref.base:=preferred_newbasereg
        else if ref.index=NR_NO then
          ref.index:=preferred_newbasereg
        else
          begin
            { make sure it's valid in case ref.base is SP -> make it
              the second operand}
            a_op_reg_reg_reg(list,OP_ADD,OS_ADDR,preferred_newbasereg,ref.base,preferred_newbasereg);
            ref.base:=preferred_newbasereg
          end;
        ref.symbol:=nil;
      end;
    { base & index }
    if (ref.base<>NR_NO) and
       (ref.index<>NR_NO) then
      begin
        case op of
          A_LDR, A_STR:
            begin
              if (ref.shiftmode=SM_None) and
                 (ref.shiftimm<>0) then
                internalerror(2014110805);
              { wrong shift? (possible in case of something like
                array_of_2byte_rec[x].bytefield -> shift will be set 1, but
                the final load is a 1 byte -> can't use shift after all }
              if (ref.shiftmode in [SM_LSL,SM_UXTW,SM_SXTW]) and
                 ((ref.shiftimm<>BsfDWord(tcgsizep2size[size])) or
                  (ref.offset<>0)) then
                begin
                  if preferred_newbasereg=NR_NO then
                    preferred_newbasereg:=getaddressregister(list);
                  { "add" supports a superset of the shift modes supported by
                    load/store instructions }
                  shifterop_reset(so);
                  so.shiftmode:=ref.shiftmode;
                  so.shiftimm:=ref.shiftimm;
                  list.concat(taicpu.op_reg_reg_reg_shifterop(A_ADD,preferred_newbasereg,ref.base,ref.index,so));
                  reference_reset_base(ref,preferred_newbasereg,ref.offset,ref.temppos,ref.alignment,ref.volatility);
                  { possibly still an invalid offset -> fall through }
                end
              else if ref.offset<>0 then
                begin
                  if (preferred_newbasereg=NR_NO) or
                     { we keep ref.index, so it must not be overwritten }
                     (ref.index=preferred_newbasereg) then
                    preferred_newbasereg:=getaddressregister(list);
                  { add to the base and not to the index, because the index
                    may be scaled; this works even if the base is SP }
                  a_op_const_reg_reg(list,OP_ADD,OS_ADDR,ref.offset,ref.base,preferred_newbasereg);
                  ref.offset:=0;
                  ref.base:=preferred_newbasereg;
                  { finished }
                  exit;
                end
              else
                { valid -> exit }
                exit;
            end;
          { todo }
          A_LD1,A_LD2,A_LD3,A_LD4,
          A_ST1,A_ST2,A_ST3,A_ST4:
            internalerror(2014110704);
          { these don't support base+index }
          A_LDUR,A_STUR,
          A_LDP,A_STP:
            begin
              { these either don't support pre-/post-indexing, or don't
                support it with base+index }
              if ref.addressmode<>AM_OFFSET then
                internalerror(2014110911);
              if preferred_newbasereg=NR_NO then
                preferred_newbasereg:=getaddressregister(list);
              if ref.shiftmode<>SM_None then
                begin
                  { "add" supports a superset of the shift modes supported by
                    load/store instructions }
                  shifterop_reset(so);
                  so.shiftmode:=ref.shiftmode;
                  so.shiftimm:=ref.shiftimm;
                  list.concat(taicpu.op_reg_reg_reg_shifterop(A_ADD,preferred_newbasereg,ref.base,ref.index,so));
                end
              else
                a_op_reg_reg_reg(list,OP_ADD,OS_ADDR,ref.index,ref.base,preferred_newbasereg);
              reference_reset_base(ref,preferred_newbasereg,ref.offset,ref.temppos,ref.alignment,ref.volatility);
              { fall through to the handling of base + offset, since the
                offset may still be too big }
            end;
          else
            internalerror(2014110901);
        end;
      end;
    { base + offset }
    if ref.base<>NR_NO then
      begin
        { valid offset for LDUR/STUR -> use that
          (LDUR/STUR take an unscaled signed 9-bit offset: -256..255) }
        if (ref.addressmode=AM_OFFSET) and
           (op in [A_LDR,A_STR]) and
           (ref.offset>=-256) and
           (ref.offset<=255) then
          begin
            if op=A_LDR then
              op:=A_LDUR
            else
              op:=A_STUR
          end
        { if it's not a valid LDUR/STUR, use LDR/STR }
        else if (op in [A_LDUR,A_STUR]) and
           ((ref.offset<-256) or
            (ref.offset>255) or
            (ref.addressmode<>AM_OFFSET)) then
          begin
            if op=A_LDUR then
              op:=A_LDR
            else
              op:=A_STR
          end;
        case op of
          A_LDR,A_STR:
            begin
              case ref.addressmode of
                AM_PREINDEXED:
                  begin
                    { since the loaded/stored register cannot be the same
                      as the base register, we can safely add the
                      offset to the base if it doesn't fit}
                    if (ref.offset<-256) or
                       (ref.offset>255) then
                      begin
                        a_op_const_reg(list,OP_ADD,OS_ADDR,ref.offset,ref.base);
                        ref.offset:=0;
                      end;
                  end;
                AM_POSTINDEXED:
                  begin
                    { cannot emulate post-indexing if we have to fold the
                      offset into the base register }
                    if (ref.offset<-256) or
                       (ref.offset>255) then
                      internalerror(2014110909);
                    { ok }
                  end;
                AM_OFFSET:
                  begin
                    { unsupported offset -> fold into base register
                      (plain LDR/STR take an unsigned imm12 scaled by the
                      access size) }
                    accesssize:=1 shl tcgsizep2size[size];
                    if (ref.offset<0) or
                       (ref.offset>(((1 shl 12)-1)*accesssize)) or
                       ((ref.offset mod accesssize)<>0) then
                      begin
                        if preferred_newbasereg=NR_NO then
                          preferred_newbasereg:=getaddressregister(list);
                        { can we split the offset beween an
                          "add/sub (imm12 shl 12)" and the load (also an
                          imm12)?
                          -- the offset from the load will always be added,
                          that's why the lower bound has a smaller range
                          than the upper bound; it must also be a multiple
                          of the access size }
                        if (ref.offset>=-(((1 shl 12)-1) shl 12)) and
                           (ref.offset<=((1 shl 12)-1) shl 12 + ((1 shl 12)-1)) and
                           ((ref.offset mod accesssize)=0) then
                          begin
                            a_op_const_reg_reg(list,OP_ADD,OS_ADDR,(ref.offset shr 12) shl 12,ref.base,preferred_newbasereg);
                            ref.offset:=ref.offset-(ref.offset shr 12) shl 12;
                          end
                        else
                          begin
                            a_op_const_reg_reg(list,OP_ADD,OS_ADDR,ref.offset,ref.base,preferred_newbasereg);
                            ref.offset:=0;
                          end;
                        reference_reset_base(ref,preferred_newbasereg,ref.offset,ref.temppos,ref.alignment,ref.volatility);
                      end;
                  end
              end;
            end;
          A_LDP,A_STP:
            begin
              { unsupported offset -> fold into base register (these
                instructions support all addressmodes);
                LDP/STP take a signed scaled 7-bit offset }
              if (ref.offset<-(1 shl (6+tcgsizep2size[size]))) or
                 (ref.offset>(1 shl (6+tcgsizep2size[size]))-1) then
                begin
                  case ref.addressmode of
                    AM_POSTINDEXED:
                      { don't emulate post-indexing if we have to fold the
                        offset into the base register }
                      internalerror(2014110910);
                    AM_PREINDEXED:
                      { this means the offset must be added to the current
                        base register }
                      preferred_newbasereg:=ref.base;
                    AM_OFFSET:
                      if preferred_newbasereg=NR_NO then
                        preferred_newbasereg:=getaddressregister(list);
                  end;
                  a_op_const_reg_reg(list,OP_ADD,OS_ADDR,ref.offset,ref.base,preferred_newbasereg);
                  reference_reset_base(ref,preferred_newbasereg,0,ref.temppos,ref.alignment,ref.volatility);
                end
            end;
          A_LDUR,A_STUR:
            begin
              { valid, checked above }
            end;
          { todo }
          A_LD1,A_LD2,A_LD3,A_LD4,
          A_ST1,A_ST2,A_ST3,A_ST4:
            internalerror(2014110908);
          else
            internalerror(2014110708);
        end;
        { done }
        exit;
      end;
    { only an offset -> change to base (+ offset 0): materialise the absolute
      address in a register and use that as base }
    if preferred_newbasereg=NR_NO then
      preferred_newbasereg:=getaddressregister(list);
    a_load_const_reg(list,OS_ADDR,ref.offset,preferred_newbasereg);
    reference_reset_base(ref,preferred_newbasereg,0,ref.temppos,newalignment(8,ref.offset),ref.volatility);
  end;
  408. function tcgaarch64.makeregsize(reg: tregister; size: tcgsize): tregister;
  409. var
  410. subreg:Tsubregister;
  411. begin
  412. subreg:=cgsize2subreg(getregtype(reg),size);
  413. result:=reg;
  414. setsubreg(result,subreg);
  415. end;
  416. function tcgaarch64.getfpuregister(list: TAsmList; size: Tcgsize): Tregister;
  417. begin
  418. internalerror(2014122110);
  419. { squash warning }
  420. result:=NR_NO;
  421. end;
{ Legalises "ref" for the load/store "op" (possibly switching the opcode,
  e.g. LDR <-> LDUR), emits the instruction with the given postfix, and
  returns the possibly-rewritten reference so the caller can reuse it. }
function tcgaarch64.handle_load_store(list: TAsmList; op: tasmop; size: tcgsize; oppostfix: toppostfix; reg: tregister; ref: treference):treference;
  begin
    make_simple_ref(list,op,size,oppostfix,ref,NR_NO);
    list.concat(setoppostfix(taicpu.op_reg_ref(op,reg,ref),oppostfix));
    result:=ref;
  end;
{ Emits "op" with the constant "a" using the best available encoding:
  1. plain imm12 (0..4095),
  2. imm12 shifted left by 12,
  3. otherwise the constant is first loaded into tmpreg (allocated here when
     NR_NO is passed) and a register-register form is used.
  "usedest"=false emits the two-operand form (e.g. CMP/CMN); "setflags" adds
  the S postfix so the instruction updates the condition flags. }
procedure tcgaarch64.handle_reg_imm12_reg(list: TAsmList; op: Tasmop; size: tcgsize; src: tregister; a: tcgint; dst: tregister; tmpreg: tregister; setflags, usedest: boolean);
  var
    instr: taicpu;
    so: tshifterop;
    hadtmpreg: boolean;
  begin
    { imm12 }
    if (a>=0) and
       (a<=((1 shl 12)-1)) then
      if usedest then
        instr:=taicpu.op_reg_reg_const(op,dst,src,a)
      else
        instr:=taicpu.op_reg_const(op,src,a)
    { imm12 lsl 12 }
    else if (a and not(((tcgint(1) shl 12)-1) shl 12))=0 then
      begin
        so.shiftmode:=SM_LSL;
        so.shiftimm:=12;
        if usedest then
          instr:=taicpu.op_reg_reg_const_shifterop(op,dst,src,a shr 12,so)
        else
          instr:=taicpu.op_reg_const_shifterop(op,src,a shr 12,so)
      end
    else
      begin
        { todo: other possible optimizations (e.g. load 16 bit constant in
          register and then add/sub/cmp/cmn shifted the rest) }
        if tmpreg=NR_NO then
          begin
            hadtmpreg:=false;
            tmpreg:=getintregister(list,size);
          end
        else
          begin
            { caller-supplied scratch register: mark it in use for the
              duration of the constant load }
            hadtmpreg:=true;
            getcpuregister(list,tmpreg);
          end;
        a_load_const_reg(list,size,a,tmpreg);
        if usedest then
          instr:=taicpu.op_reg_reg_reg(op,dst,src,tmpreg)
        else
          instr:=taicpu.op_reg_reg(op,src,tmpreg);
        if hadtmpreg then
          ungetcpuregister(list,tmpreg);
      end;
    if setflags then
      setoppostfix(instr,PF_S);
    list.concat(instr);
  end;
  477. {****************************************************************************
  478. Assembler code
  479. ****************************************************************************}
procedure tcgaarch64.init_register_allocators;
  begin
    inherited init_register_allocators;
    { allocatable integer registers: X0..X17 and X19..X28.
      NOTE(review): RS_X18 is absent from the list — presumably reserved as
      the platform register per the AArch64 ABI; X29 (FP) and X30 (LR) are
      also excluded. Confirm against the target ABI documentation. }
    rg[R_INTREGISTER]:=trgintcpu.create(R_INTREGISTER,R_SUBWHOLE,
        [RS_X0,RS_X1,RS_X2,RS_X3,RS_X4,RS_X5,RS_X6,RS_X7,RS_X8,
         RS_X9,RS_X10,RS_X11,RS_X12,RS_X13,RS_X14,RS_X15,RS_X16,RS_X17,
         RS_X19,RS_X20,RS_X21,RS_X22,RS_X23,RS_X24,RS_X25,RS_X26,RS_X27,RS_X28
         { maybe we can enable this in the future for leaf functions (it's
           the frame pointer)
          ,RS_X29 }],
        first_int_imreg,[]);
    { all 32 vector registers are allocatable (floating point lives here;
      no separate fpu allocator is created) }
    rg[R_MMREGISTER]:=trgcpu.create(R_MMREGISTER,R_SUBMMD,
        [RS_Q0,RS_Q1,RS_Q2,RS_Q3,RS_Q4,RS_Q5,RS_Q6,RS_Q7,
         RS_Q8,RS_Q9,RS_Q10,RS_Q11,RS_Q12,RS_Q13,RS_Q14,RS_Q15,
         RS_Q16,RS_Q17,RS_Q18,RS_Q19,RS_Q20,RS_Q21,RS_Q22,RS_Q23,
         RS_Q24,RS_Q25,RS_Q26,RS_Q27,RS_Q28,RS_Q29,RS_Q30,RS_Q31],
        first_mm_imreg,[]);
  end;
procedure tcgaarch64.done_register_allocators;
  begin
    rg[R_INTREGISTER].free;
    { init_register_allocators never creates an R_FPUREGISTER allocator for
      this target; calling Free on a nil instance is a no-op in Object
      Pascal, so this line is harmless }
    rg[R_FPUREGISTER].free;
    rg[R_MMREGISTER].free;
    inherited done_register_allocators;
  end;
  505. function tcgaarch64.getmmregister(list: TAsmList; size: tcgsize):tregister;
  506. begin
  507. case size of
  508. OS_F32:
  509. result:=rg[R_MMREGISTER].getregister(list,R_SUBMMS);
  510. OS_F64:
  511. result:=rg[R_MMREGISTER].getregister(list,R_SUBMMD)
  512. else
  513. internalerror(2014102701);
  514. end;
  515. end;
  516. procedure tcgaarch64.a_call_name(list: TAsmList; const s: string; weak: boolean);
  517. begin
  518. if not weak then
  519. list.concat(taicpu.op_sym(A_BL,current_asmdata.RefAsmSymbol(s,AT_FUNCTION)))
  520. else
  521. list.concat(taicpu.op_sym(A_BL,current_asmdata.WeakRefAsmSymbol(s,AT_FUNCTION)));
  522. end;
{ Emits an indirect call through "reg" (BLR). }
procedure tcgaarch64.a_call_reg(list:TAsmList;Reg:tregister);
  begin
    list.concat(taicpu.op_reg(A_BLR,reg));
  end;
  527. {********************** load instructions ********************}
{ Loads the constant a into reg using as few instructions as possible.
  Strategies, tried in order:
    * a single (possibly shifted) movn when everything above one 16 bit
      chunk consists of all-one bits;
    * a single orr with the zero register when the value is a valid
      shifter constant (checked via is_shifter_const);
    * otherwise movz for the first non-zero 16 bit chunk followed by
      movk for each further non-zero chunk. }
procedure tcgaarch64.a_load_const_reg(list: TAsmList; size: tcgsize; a: tcgint; reg : tregister);
  var
    preva: tcgint;
    opc: tasmop;
    shift,maxshift: byte;
    so: tshifterop;
    reginited: boolean;
    mask: tcgint;
  begin
    { if we load a value into a 32 bit register, it is automatically
      zero-extended to 64 bit }
    if (hi(a)=0) and
       (size in [OS_64,OS_S64]) then
      begin
        size:=OS_32;
        reg:=makeregsize(reg,size);
      end;
    { values <= 32 bit are stored in a 32 bit register }
    if not(size in [OS_64,OS_S64]) then
      a:=cardinal(a);
    if size in [OS_64,OS_S64] then
      begin
        mask:=-1;
        maxshift:=64;
      end
    else
      begin
        mask:=$ffffffff;
        maxshift:=32;
      end;
    { single movn enough? (to be extended) }
    shift:=16;
    preva:=a;
    repeat
      if (a shr shift)=(mask shr shift) then
        begin
          { all bits above the current chunk are ones -> movn of the
            inverted low 16 bits, shifted into place when necessary }
          if shift=16 then
            list.concat(taicpu.op_reg_const(A_MOVN,reg,not(word(preva))))
          else
            begin
              shifterop_reset(so);
              so.shiftmode:=SM_LSL;
              so.shiftimm:=shift-16;
              list.concat(taicpu.op_reg_const_shifterop(A_MOVN,reg,not(word(preva)),so));
            end;
          exit;
        end;
      { only try the next 16 bits if the current one is all 1 bits, since
        the movn will set all lower bits to 1 }
      if word(a shr (shift-16))<>$ffff then
        break;
      inc(shift,16);
    until shift=maxshift;
    reginited:=false;
    shift:=0;
    { can be optimized later to use more movn }
    repeat
      { leftover is shifterconst? (don't check if we can represent it just
        as effectively with movz/movk, as this check is expensive) }
      if ((shift<tcgsize2size[size]*(8 div 2)) and
          (word(a)<>0) and
          ((a shr 16)<>0)) and
         is_shifter_const(a shl shift,size) then
        begin
          if reginited then
            list.concat(taicpu.op_reg_reg_const(A_ORR,reg,reg,a shl shift))
          else
            { orr with the zero register materialises the constant in a
              single instruction }
            list.concat(taicpu.op_reg_reg_const(A_ORR,reg,makeregsize(NR_XZR,size),a shl shift));
          exit;
        end;
      { set all 16 bit parts <> 0 }
      if (word(a)<>0) or
         ((shift=0) and
          (a=0)) then
        if shift=0 then
          begin
            { the first chunk uses movz, which also clears the rest of
              the register }
            list.concat(taicpu.op_reg_const(A_MOVZ,reg,word(a)));
            reginited:=true;
          end
        else
          begin
            shifterop_reset(so);
            so.shiftmode:=SM_LSL;
            so.shiftimm:=shift;
            { first written chunk -> movz; later chunks -> movk, which
              keeps the previously written chunks intact }
            if not reginited then
              begin
                opc:=A_MOVZ;
                reginited:=true;
              end
            else
              opc:=A_MOVK;
            list.concat(taicpu.op_reg_const_shifterop(opc,reg,word(a),so));
          end;
      preva:=a;
      a:=a shr 16;
      inc(shift,16);
    until word(preva)=preva;
    { at least one instruction must have been emitted }
    if not reginited then
      internalerror(2014102702);
  end;
  628. procedure tcgaarch64.a_load_const_ref(list: TAsmList; size: tcgsize; a: tcgint; const ref: treference);
  629. var
  630. reg: tregister;
  631. begin
  632. { use the zero register if possible }
  633. if a=0 then
  634. begin
  635. if size in [OS_64,OS_S64] then
  636. reg:=NR_XZR
  637. else
  638. reg:=NR_WZR;
  639. a_load_reg_ref(list,size,size,reg,ref);
  640. end
  641. else
  642. inherited;
  643. end;
{ Stores reg to memory at ref as a value of tosize bytes, first widening
  (zero/sign extending) the register when it is narrower than the store
  size, and falling back to an unaligned store sequence when the
  reference's alignment is too small for a single str. }
procedure tcgaarch64.a_load_reg_ref(list: TAsmList; fromsize, tosize: tcgsize; reg: tregister; const ref: treference);
  var
    oppostfix:toppostfix;
    hreg: tregister;
  begin
    if tcgsize2Size[fromsize]>=tcgsize2Size[tosize] then
      begin
        fromsize:=tosize;
        reg:=makeregsize(list,reg,fromsize);
      end
    { have a 32 bit register but need a 64 bit one? }
    else if tosize in [OS_64,OS_S64] then
      begin
        { sign extend if necessary }
        if fromsize in [OS_S8,OS_S16,OS_S32] then
          begin
            { can't overwrite reg, may be a constant reg }
            hreg:=getintregister(list,tosize);
            a_load_reg_reg(list,fromsize,tosize,reg,hreg);
            reg:=hreg;
          end
        else
          { top 32 bit are zero by default }
          reg:=makeregsize(reg,OS_64);
        fromsize:=tosize;
      end;
    { insufficiently aligned reference -> byte/halfword-wise store }
    if (ref.alignment<>0) and
       (ref.alignment<tcgsize2size[tosize]) then
      begin
        a_load_reg_ref_unaligned(list,fromsize,tosize,reg,ref);
      end
    else
      begin
        { choose the str postfix matching the store width }
        case tosize of
          { signed integer registers }
          OS_8,
          OS_S8:
            oppostfix:=PF_B;
          OS_16,
          OS_S16:
            oppostfix:=PF_H;
          OS_32,
          OS_S32,
          OS_64,
          OS_S64:
            oppostfix:=PF_None;
          else
            InternalError(200308299);
        end;
        handle_load_store(list,A_STR,tosize,oppostfix,reg,ref);
      end;
  end;
{ Loads a value of fromsize bytes from ref into reg, selecting the ldr
  postfix so the destination register ends up correctly zero or sign
  extended; insufficiently aligned references are delegated to the
  unaligned load helper. }
procedure tcgaarch64.a_load_ref_reg(list: TAsmList; fromsize, tosize: tcgsize; const ref: treference; reg: tregister);
  var
    oppostfix:toppostfix;
  begin
    if tcgsize2Size[fromsize]>=tcgsize2Size[tosize] then
      fromsize:=tosize;
    { ensure that all bits of the 32/64 register are always correctly set:
      * default behaviour is always to zero-extend to the entire (64 bit)
        register -> unsigned 8/16/32 bit loads only exist with a 32 bit
        target register, as the upper 32 bit will be zeroed implicitly
        -> always make target register 32 bit
      * signed loads exist both with 32 and 64 bit target registers,
        depending on whether the value should be sign extended to 32 or
        to 64 bit (if sign extended to 32 bit, the upper 32 bits of the
        corresponding 64 bit register are again zeroed) -> no need to
        change anything (we only have 32 and 64 bit registers), except that
        when loading an OS_S32 to a 32 bit register, we don't need/can't
        use sign extension
    }
    if fromsize in [OS_8,OS_16,OS_32] then
      reg:=makeregsize(reg,OS_32);
    if (ref.alignment<>0) and
       (ref.alignment<tcgsize2size[fromsize]) then
      begin
        a_load_ref_reg_unaligned(list,fromsize,tosize,ref,reg);
        exit;
      end;
    { choose the ldr postfix matching load width and signedness }
    case fromsize of
      { signed integer registers }
      OS_8:
        oppostfix:=PF_B;
      OS_S8:
        oppostfix:=PF_SB;
      OS_16:
        oppostfix:=PF_H;
      OS_S16:
        oppostfix:=PF_SH;
      OS_S32:
        if getsubreg(reg)=R_SUBD then
          oppostfix:=PF_NONE
        else
          oppostfix:=PF_SW;
      OS_32,
      OS_64,
      OS_S64:
        oppostfix:=PF_None;
      else
        InternalError(200308297);
    end;
    handle_load_store(list,A_LDR,fromsize,oppostfix,reg,ref);
    { clear upper 16 bits if the value was negative }
    if (fromsize=OS_S8) and (tosize=OS_16) then
      a_load_reg_reg(list,fromsize,tosize,reg,reg);
  end;
{ Loads a value from a reference whose alignment is smaller than the value
  size, assembling it from aligned 8 or 16 bit pieces with bfi (bit field
  insert); 64 bit values are split into two 32 bit loads (using ldp when
  the reference allows it). Handles both endiannesses. }
procedure tcgaarch64.a_load_ref_reg_unaligned(list: TAsmList; fromsize, tosize: tcgsize; const ref: treference; register: tregister);
  var
    href: treference;
    hreg1, hreg2, tmpreg,tmpreg2: tregister;
    i : Integer;
  begin
    case fromsize of
      OS_64,OS_S64:
        begin
          { split into two 32 bit loads }
          hreg1:=getintregister(list,OS_32);
          hreg2:=getintregister(list,OS_32);
          { on big endian the word at the lower address is the high half }
          if target_info.endian=endian_big then
            begin
              tmpreg:=hreg1;
              hreg1:=hreg2;
              hreg2:=tmpreg;
            end;
          { can we use LDP? }
          if (ref.alignment=4) and
             (simple_ref_type(A_LDP,OS_32,PF_None,ref)=sr_simple) then
            list.concat(taicpu.op_reg_reg_ref(A_LDP,hreg1,hreg2,ref))
          else
            begin
              a_load_ref_reg(list,OS_32,OS_32,ref,hreg1);
              href:=ref;
              inc(href.offset,4);
              a_load_ref_reg(list,OS_32,OS_32,href,hreg2);
            end;
          a_load_reg_reg(list,OS_32,OS_64,hreg1,register);
          { insert the upper 32 bits }
          list.concat(taicpu.op_reg_reg_const_const(A_BFI,register,makeregsize(hreg2,OS_64),32,32));
        end;
      OS_16,OS_S16,
      OS_32,OS_S32:
        begin
          if ref.alignment=2 then
            begin
              { assemble from 16 bit pieces }
              href:=ref;
              if target_info.endian=endian_big then
                inc(href.offset,tcgsize2size[fromsize]-2);
              tmpreg:=getintregister(list,OS_32);
              a_load_ref_reg(list,OS_16,OS_32,href,tmpreg);
              tmpreg2:=getintregister(list,OS_32);
              for i:=1 to (tcgsize2size[fromsize]-1) div 2 do
                begin
                  if target_info.endian=endian_big then
                    dec(href.offset,2)
                  else
                    inc(href.offset,2);
                  a_load_ref_reg(list,OS_16,OS_32,href,tmpreg2);
                  list.concat(taicpu.op_reg_reg_const_const(A_BFI,tmpreg,tmpreg2,i*16,16));
                end;
              a_load_reg_reg(list,fromsize,tosize,tmpreg,register);
            end
          else
            begin
              { assemble byte by byte }
              href:=ref;
              if target_info.endian=endian_big then
                inc(href.offset,tcgsize2size[fromsize]-1);
              tmpreg:=getintregister(list,OS_32);
              a_load_ref_reg(list,OS_8,OS_32,href,tmpreg);
              tmpreg2:=getintregister(list,OS_32);
              for i:=1 to tcgsize2size[fromsize]-1 do
                begin
                  if target_info.endian=endian_big then
                    dec(href.offset)
                  else
                    inc(href.offset);
                  a_load_ref_reg(list,OS_8,OS_32,href,tmpreg2);
                  list.concat(taicpu.op_reg_reg_const_const(A_BFI,tmpreg,tmpreg2,i*8,8));
                end;
              a_load_reg_reg(list,fromsize,tosize,tmpreg,register);
            end;
        end;
      else
        inherited;
    end;
  end;
{ Register-to-register copy with the zero/sign extension dictated by
  fromsize/tosize; plain same-width moves are registered with the register
  allocator so it can try to coalesce them away. }
procedure tcgaarch64.a_load_reg_reg(list:TAsmList;fromsize,tosize:tcgsize;reg1,reg2:tregister);
  var
    instr: taicpu;
  begin
    { we use both 32 and 64 bit registers -> insert conversion when when
      we have to truncate/sign extend inside the (32 or 64 bit) register
      holding the value, and when we sign extend from a 32 to a 64 bit
      register }
    if (tcgsize2size[fromsize]>tcgsize2size[tosize]) or
       ((tcgsize2size[fromsize]=tcgsize2size[tosize]) and
        (fromsize<>tosize) and
        not(fromsize in [OS_32,OS_S32,OS_64,OS_S64])) or
       ((fromsize in [OS_S8,OS_S16,OS_S32]) and
        (tosize in [OS_64,OS_S64])) or
       { needs to mask out the sign in the top 16 bits }
       ((fromsize=OS_S8) and
        (tosize=OS_16)) then
      begin
        case tosize of
          OS_8:
            list.concat(setoppostfix(taicpu.op_reg_reg(A_UXT,reg2,makeregsize(reg1,OS_32)),PF_B));
          OS_16:
            list.concat(setoppostfix(taicpu.op_reg_reg(A_UXT,reg2,makeregsize(reg1,OS_32)),PF_H));
          OS_S8:
            list.concat(setoppostfix(taicpu.op_reg_reg(A_SXT,reg2,makeregsize(reg1,OS_32)),PF_B));
          OS_S16:
            list.concat(setoppostfix(taicpu.op_reg_reg(A_SXT,reg2,makeregsize(reg1,OS_32)),PF_H));
          { while "mov wN, wM" automatically inserts a zero-extension and
            hence we could encode a 64->32 bit move like that, the problem
            is that we then can't distinguish 64->32 from 32->32 moves, and
            the 64->32 truncation could be removed altogether... So use a
            different instruction }
          OS_32,
          OS_S32:
            { in theory, reg1 should be 64 bit here (since fromsize>tosize),
              but because of the way location_force_register() tries to
              avoid superfluous zero/sign extensions, it's not always the
              case -> also force reg1 to to 64 bit }
            list.concat(taicpu.op_reg_reg_const_const(A_UBFIZ,makeregsize(reg2,OS_64),makeregsize(reg1,OS_64),0,32));
          OS_64,
          OS_S64:
            list.concat(setoppostfix(taicpu.op_reg_reg(A_SXT,reg2,makeregsize(reg1,OS_32)),PF_W));
          else
            internalerror(2002090901);
        end;
      end
    else
      begin
        { 32 -> 32 bit move implies zero extension (sign extensions have
          been handled above) -> also use for 32 <-> 64 bit moves }
        if not(fromsize in [OS_64,OS_S64]) or
           not(tosize in [OS_64,OS_S64]) then
          instr:=taicpu.op_reg_reg(A_MOV,makeregsize(reg2,OS_32),makeregsize(reg1,OS_32))
        else
          instr:=taicpu.op_reg_reg(A_MOV,reg2,reg1);
        list.Concat(instr);
        { Notify the register allocator that we have written a move instruction so
          it can try to eliminate it. }
        add_move_instruction(instr);
      end;
  end;
{ Loads the effective address of ref into r: an adr for literal pool
  symbols, otherwise add/mov instructions combining base, index and
  offset after the reference has been simplified. }
procedure tcgaarch64.a_loadaddr_ref_reg(list: TAsmList; const ref: treference; r: tregister);
  var
    href: treference;
    so: tshifterop;
    op: tasmop;
  begin
    op:=A_LDR;
    href:=ref;
    { simplify as if we're going to perform a regular 64 bit load, using
      "r" as the new base register if possible/necessary }
    make_simple_ref(list,op,OS_ADDR,PF_None,href,r);
    { load literal? }
    if assigned(href.symbol) then
      begin
        { literal pool entries must not have base/index registers and must
          carry symbol data }
        if (href.base<>NR_NO) or
           (href.index<>NR_NO) or
           not assigned(href.symboldata) then
          internalerror(2014110912);
        list.concat(taicpu.op_reg_sym_ofs(A_ADR,r,href.symbol,href.offset));
      end
    else
      begin
        if href.index<>NR_NO then
          begin
            if href.shiftmode<>SM_None then
              begin
                { "add" supports a superset of the shift modes supported by
                  load/store instructions }
                shifterop_reset(so);
                so.shiftmode:=href.shiftmode;
                so.shiftimm:=href.shiftimm;
                list.concat(taicpu.op_reg_reg_reg_shifterop(A_ADD,r,href.base,href.index,so));
              end
            else
              a_op_reg_reg_reg(list,OP_ADD,OS_ADDR,href.index,href.base,r);
          end
        else if href.offset<>0 then
          a_op_const_reg_reg(list,OP_ADD,OS_ADDR,href.offset,href.base,r)
        else
          a_load_reg_reg(list,OS_ADDR,OS_ADDR,href.base,r);
      end;
  end;
{ Plain FPU register moves are not used by this code generator: scalar
  floating point values go through the mm (SIMD/FP) register methods
  instead, so reaching this is an internal error. }
procedure tcgaarch64.a_loadfpu_reg_reg(list: TAsmList; fromsize, tosize: tcgsize; reg1, reg2: tregister);
  begin
    internalerror(2014122107)
  end;
{ Not used: scalar floating point loads go through the mm register
  methods (see a_loadmm_ref_reg); reaching this is an internal error. }
procedure tcgaarch64.a_loadfpu_ref_reg(list: TAsmList; fromsize, tosize: tcgsize; const ref: treference; reg: tregister);
  begin
    internalerror(2014122108)
  end;
{ Not used: scalar floating point stores go through the mm register
  methods (see a_loadmm_reg_ref); reaching this is an internal error. }
procedure tcgaarch64.a_loadfpu_reg_ref(list: TAsmList; fromsize, tosize: tcgsize; reg: tregister; const ref: treference);
  begin
    internalerror(2014122109)
  end;
{ Copies/converts between two mm registers: equal sizes emit an fmov that
  is registered with the register allocator (so it may be coalesced away),
  differing sizes emit an fcvt precision conversion. Only scalar shuffles
  are supported. }
procedure tcgaarch64.a_loadmm_reg_reg(list: TAsmList; fromsize, tosize: tcgsize; reg1, reg2: tregister; shuffle: pmmshuffle);
  var
    instr: taicpu;
  begin
    if assigned(shuffle) and
       not shufflescalar(shuffle) then
      internalerror(2014122104);
    if fromsize=tosize then
      begin
        instr:=taicpu.op_reg_reg(A_FMOV,reg2,reg1);
        { Notify the register allocator that we have written a move
          instruction so it can try to eliminate it. }
        add_move_instruction(instr);
      end
    else
      begin
        { the registers must match the sizes being converted between }
        if (reg_cgsize(reg1)<>fromsize) or
           (reg_cgsize(reg2)<>tosize) then
          internalerror(2014110913);
        instr:=taicpu.op_reg_reg(A_FCVT,reg2,reg1);
      end;
    list.Concat(instr);
  end;
  966. procedure tcgaarch64.a_loadmm_ref_reg(list: TAsmList; fromsize, tosize: tcgsize; const ref: treference; reg: tregister; shuffle: pmmshuffle);
  967. var
  968. tmpreg: tregister;
  969. begin
  970. if assigned(shuffle) and
  971. not shufflescalar(shuffle) then
  972. internalerror(2014122105);
  973. tmpreg:=NR_NO;
  974. if (fromsize<>tosize) then
  975. begin
  976. tmpreg:=reg;
  977. reg:=getmmregister(list,fromsize);
  978. end;
  979. handle_load_store(list,A_LDR,fromsize,PF_None,reg,ref);
  980. if (fromsize<>tosize) then
  981. a_loadmm_reg_reg(list,fromsize,tosize,reg,tmpreg,nil);
  982. end;
  983. procedure tcgaarch64.a_loadmm_reg_ref(list: TAsmList; fromsize, tosize: tcgsize; reg: tregister; const ref: treference; shuffle: pmmshuffle);
  984. var
  985. tmpreg: tregister;
  986. begin
  987. if assigned(shuffle) and
  988. not shufflescalar(shuffle) then
  989. internalerror(2014122106);
  990. if (fromsize<>tosize) then
  991. begin
  992. tmpreg:=getmmregister(list,tosize);
  993. a_loadmm_reg_reg(list,fromsize,tosize,reg,tmpreg,nil);
  994. reg:=tmpreg;
  995. end;
  996. handle_load_store(list,A_STR,tosize,PF_NONE,reg,ref);
  997. end;
  998. procedure tcgaarch64.a_loadmm_intreg_reg(list: TAsmList; fromsize, tosize: tcgsize; intreg, mmreg: tregister; shuffle: pmmshuffle);
  999. begin
  1000. if not shufflescalar(shuffle) then
  1001. internalerror(2014122801);
  1002. if not(tcgsize2size[fromsize] in [4,8]) or
  1003. (tcgsize2size[fromsize]<>tcgsize2size[tosize]) then
  1004. internalerror(2014122803);
  1005. list.concat(taicpu.op_reg_reg(A_INS,mmreg,intreg));
  1006. end;
  1007. procedure tcgaarch64.a_loadmm_reg_intreg(list: TAsmList; fromsize, tosize: tcgsize; mmreg, intreg: tregister; shuffle: pmmshuffle);
  1008. var
  1009. r : tregister;
  1010. begin
  1011. if not shufflescalar(shuffle) then
  1012. internalerror(2014122802);
  1013. if not(tcgsize2size[fromsize] in [4,8]) or
  1014. (tcgsize2size[fromsize]>tcgsize2size[tosize]) then
  1015. internalerror(2014122804);
  1016. if tcgsize2size[fromsize]<tcgsize2size[tosize] then
  1017. r:=makeregsize(intreg,fromsize)
  1018. else
  1019. r:=intreg;
  1020. list.concat(taicpu.op_reg_reg(A_UMOV,r,mmreg));
  1021. end;
{ Scalar operation on an mm register; currently only OP_XOR of a register
  with itself is supported (emitted as "movi ...,#0"), which is how global
  mm regvars are initialised to zero. }
procedure tcgaarch64.a_opmm_reg_reg(list: TAsmList; Op: TOpCG; size: tcgsize; src, dst: tregister; shuffle: pmmshuffle);
  begin
    case op of
      { "xor Vx,Vx" is used to initialize global regvars to 0 }
      OP_XOR:
        begin
          { only self-xor without shuffle is supported }
          if (src<>dst) or
             (reg_cgsize(src)<>size) or
             assigned(shuffle) then
            internalerror(2015011401);
          case size of
            OS_F32,
            OS_F64:
              list.concat(taicpu.op_reg_const(A_MOVI,makeregsize(dst,OS_F64),0));
            else
              internalerror(2015011402);
          end;
        end
      else
        internalerror(2015011403);
    end;
  end;
{ Bit scan: stores the index of the lowest (reverse=false, via rbit+clz)
  or highest (reverse=true, via clz+eor) set bit of src in dst; when src
  is 0, dst ends up as 255 (csinv produces -1, then uxtb masks it). }
procedure tcgaarch64.a_bit_scan_reg_reg(list: TAsmList; reverse: boolean; srcsize, dstsize: tcgsize; src, dst: TRegister);
  var
    bitsize: longint;
  begin
    if srcsize in [OS_64,OS_S64] then
      begin
        bitsize:=64;
      end
    else
      begin
        bitsize:=32;
      end;
    { source is 0 -> dst will have to become 255 }
    list.concat(taicpu.op_reg_const(A_CMP,src,0));
    if reverse then
      begin
        list.Concat(taicpu.op_reg_reg(A_CLZ,makeregsize(dst,srcsize),src));
        { xor 31/63 is the same as setting the lower 5/6 bits to
          "31/63-(lower 5/6 bits of dst)" }
        list.Concat(taicpu.op_reg_reg_const(A_EOR,dst,dst,bitsize-1));
      end
    else
      begin
        list.Concat(taicpu.op_reg_reg(A_RBIT,makeregsize(dst,srcsize),src));
        list.Concat(taicpu.op_reg_reg(A_CLZ,dst,dst));
      end;
    { set dst to -1 if src was 0 }
    list.Concat(taicpu.op_reg_reg_reg_cond(A_CSINV,dst,dst,makeregsize(NR_XZR,dstsize),C_NE));
    { mask the -1 to 255 if src was 0 (anyone find a two-instruction
      branch-free version? All of mine are 3...) }
    list.Concat(setoppostfix(taicpu.op_reg_reg(A_UXT,makeregsize(dst,OS_32),makeregsize(dst,OS_32)),PF_B));
  end;
{ Stores a 64 bit register to a reference that is only 4-byte aligned as
  two 32 bit stores (stp when the reference permits it); all other sizes
  are delegated to the generic implementation. }
procedure tcgaarch64.a_load_reg_ref_unaligned(list: TAsmList; fromsize, tosize: tcgsize; register: tregister; const ref: treference);
  var
    href: treference;
    hreg1, hreg2, tmpreg: tregister;
  begin
    if fromsize in [OS_64,OS_S64] then
      begin
        { split into two 32 bit stores }
        hreg1:=getintregister(list,OS_32);
        hreg2:=getintregister(list,OS_32);
        a_load_reg_reg(list,OS_32,OS_32,makeregsize(register,OS_32),hreg1);
        a_op_const_reg_reg(list,OP_SHR,OS_64,32,register,makeregsize(hreg2,OS_64));
        { on big endian the high half is stored at the lower address }
        if target_info.endian=endian_big then
          begin
            tmpreg:=hreg1;
            hreg1:=hreg2;
            hreg2:=tmpreg;
          end;
        { can we use STP? }
        if (ref.alignment=4) and
           (simple_ref_type(A_STP,OS_32,PF_None,ref)=sr_simple) then
          list.concat(taicpu.op_reg_reg_ref(A_STP,hreg1,hreg2,ref))
        else
          begin
            a_load_reg_ref(list,OS_32,OS_32,hreg1,ref);
            href:=ref;
            inc(href.offset,4);
            a_load_reg_ref(list,OS_32,OS_32,hreg2,href);
          end;
      end
    else
      inherited;
  end;
{ Truncates the result in dst back to an 8/16 bit size after operations
  whose result may carry into the higher bits of the register. }
procedure tcgaarch64.maybeadjustresult(list: TAsmList; op: topcg; size: tcgsize; dst: tregister);
  const
    { operations whose result can exceed an 8/16 bit range }
    overflowops = [OP_MUL,OP_IMUL,OP_SHL,OP_ADD,OP_SUB,OP_NOT,OP_NEG];
  begin
    if (op in overflowops) and
       (size in [OS_8,OS_S8,OS_16,OS_S16]) then
      a_load_reg_reg(list,OS_32,size,makeregsize(dst,OS_32),makeregsize(dst,OS_32))
  end;
{ Applies "reg := reg op a", first letting optimize_op_const simplify the
  operation (possibly into a no-op or a plain constant load). }
procedure tcgaarch64.a_op_const_reg(list: TAsmList; op: topcg; size: tcgsize; a: tcgint; reg: tregister);
  begin
    optimize_op_const(size,op,a);
    case op of
      OP_NONE:
        { operation simplified away entirely }
        exit;
      OP_MOVE:
        { operation simplified into a constant load }
        a_load_const_reg(list,size,a,reg);
      OP_NEG,OP_NOT:
        { unary operations cannot take a constant operand }
        internalerror(200306011);
      else
        a_op_const_reg_reg(list,op,size,a,reg,reg);
    end;
  end;
  1131. procedure tcgaarch64.a_op_reg_reg(list:TAsmList;op:topcg;size:tcgsize;src,dst:tregister);
  1132. begin
  1133. Case op of
  1134. OP_NEG,
  1135. OP_NOT:
  1136. begin
  1137. list.concat(taicpu.op_reg_reg(TOpCG2AsmOpReg[op],dst,src));
  1138. maybeadjustresult(list,op,size,dst);
  1139. end
  1140. else
  1141. a_op_reg_reg_reg(list,op,size,src,dst,dst);
  1142. end;
  1143. end;
  1144. procedure tcgaarch64.a_op_const_reg_reg(list: TAsmList; op: topcg; size: tcgsize; a: tcgint; src, dst: tregister);
  1145. var
  1146. l: tlocation;
  1147. begin
  1148. a_op_const_reg_reg_checkoverflow(list,op,size,a,src,dst,false,l);
  1149. end;
{ Implements "dst := src2 op src1" with the three-operand instruction from
  TOpCG2AsmOpReg; rol is emulated as ror with a complemented shift count
  since there is no register-shift rol instruction. }
procedure tcgaarch64.a_op_reg_reg_reg(list: TAsmList; op: topcg; size: tcgsize; src1, src2, dst: tregister);
  var
    hreg: tregister;
  begin
    { no ROLV opcode... }
    if op=OP_ROL then
      begin
        case size of
          OS_32,OS_S32,
          OS_64,OS_S64:
            begin
              { rol(x,n) = ror(x,bits-n) }
              hreg:=getintregister(list,size);
              a_load_const_reg(list,size,tcgsize2size[size]*8,hreg);
              a_op_reg_reg(list,OP_SUB,size,src1,hreg);
              a_op_reg_reg_reg(list,OP_ROR,size,hreg,src2,dst);
              exit;
            end;
          else
            internalerror(2014111005);
        end;
      end
    else if (op=OP_ROR) and
       not(size in [OS_32,OS_S32,OS_64,OS_S64]) then
      internalerror(2014111006);
    if TOpCG2AsmOpReg[op]=A_NONE then
      internalerror(2014111007);
    list.concat(taicpu.op_reg_reg_reg(TOpCG2AsmOpReg[op],dst,src2,src1));
    maybeadjustresult(list,op,size,dst);
  end;
{ Implements "dst := src op a", optionally (setflags) producing an
  overflow condition in ovloc; ovloc.loc stays LOC_VOID when no overflow
  information was generated. }
procedure tcgaarch64.a_op_const_reg_reg_checkoverflow(list: TAsmList; op: topcg; size: tcgsize; a: tcgint; src, dst: tregister; setflags : boolean; var ovloc : tlocation);
  var
    shiftcountmask: longint;
    constreg: tregister;
  begin
    { add/sub instructions have only positive immediate operands }
    if (op in [OP_ADD,OP_SUB]) and
       (a<0) then
      begin
        if op=OP_ADD then
          op:=op_SUB
        else
          op:=OP_ADD;
        { avoid range/overflow error in case a = low(tcgint) }
{$push}{$r-}{$q-}
        a:=-a;
{$pop}
      end;
    ovloc.loc:=LOC_VOID;
    optimize_op_const(size,op,a);
    case op of
      OP_NONE:
        begin
          { operation simplified into a plain copy }
          a_load_reg_reg(list,size,size,src,dst);
          exit;
        end;
      OP_MOVE:
        begin
          { operation simplified into a constant load }
          a_load_const_reg(list,size,a,dst);
          exit;
        end;
      else
        ;
    end;
    case op of
      OP_ADD,
      OP_SUB:
        begin
          handle_reg_imm12_reg(list,TOpCG2AsmOpImm[op],size,src,a,dst,NR_NO,setflags,true);
          { on a 64 bit target, overflows with smaller data types
            are handled via range errors }
          if setflags and
             (size in [OS_64,OS_S64]) then
            begin
              location_reset(ovloc,LOC_FLAGS,OS_8);
              if size=OS_64 then
                if op=OP_ADD then
                  ovloc.resflags:=F_CS
                else
                  ovloc.resflags:=F_CC
              else
                ovloc.resflags:=F_VS;
            end;
        end;
      OP_OR,
      OP_AND,
      OP_XOR:
        begin
          if not(size in [OS_64,OS_S64]) then
            a:=cardinal(a);
          { logical immediates must be valid shifter constants; otherwise
            materialise the constant in a register first }
          if is_shifter_const(a,size) then
            list.concat(taicpu.op_reg_reg_const(TOpCG2AsmOpReg[op],dst,src,a))
          else
            begin
              constreg:=getintregister(list,size);
              a_load_const_reg(list,size,a,constreg);
              a_op_reg_reg_reg(list,op,size,constreg,src,dst);
            end;
        end;
      OP_SHL,
      OP_SHR,
      OP_SAR:
        begin
          if size in [OS_64,OS_S64] then
            shiftcountmask:=63
          else
            shiftcountmask:=31;
          if (a and shiftcountmask)<>0 Then
            list.concat(taicpu.op_reg_reg_const(
              TOpCG2AsmOpImm[Op],dst,src,a and shiftcountmask))
          else
            a_load_reg_reg(list,size,size,src,dst);
          { shift counts outside the register width are invalid }
          if (a and not(tcgint(shiftcountmask)))<>0 then
            internalError(2014112101);
        end;
      OP_ROL,
      OP_ROR:
        begin
          case size of
            OS_32,OS_S32:
              if (a and not(tcgint(31)))<>0 then
                internalError(2014112102);
            OS_64,OS_S64:
              if (a and not(tcgint(63)))<>0 then
                internalError(2014112103);
            else
              internalError(2014112104);
          end;
          { there's only a ror opcode }
          if op=OP_ROL then
            a:=(tcgsize2size[size]*8)-a;
          list.concat(taicpu.op_reg_reg_const(A_ROR,dst,src,a));
        end;
      OP_MUL,
      OP_IMUL,
      OP_DIV,
      OP_IDIV:
        begin
          { no immediate forms -> materialise the constant in a register }
          constreg:=getintregister(list,size);
          a_load_const_reg(list,size,a,constreg);
          a_op_reg_reg_reg_checkoverflow(list,op,size,constreg,src,dst,setflags,ovloc);
        end;
      else
        internalerror(2014111403);
    end;
    maybeadjustresult(list,op,size,dst);
  end;
{ Implements "dst := src2 op src1" and, when setflags is true and the size
  is 64 bit, produces an overflow condition in ovloc (overflow with
  smaller sizes is handled via range errors instead). }
procedure tcgaarch64.a_op_reg_reg_reg_checkoverflow(list: TAsmList; op: topcg; size: tcgsize; src1, src2, dst: tregister; setflags : boolean; var ovloc : tlocation);
  var
    tmpreg1, tmpreg2: tregister;
  begin
    ovloc.loc:=LOC_VOID;
    { overflow can only occur with 64 bit calculations on 64 bit cpus }
    if setflags and
       (size in [OS_64,OS_S64]) then
      begin
        case op of
          OP_ADD,
          OP_SUB:
            begin
              { flag-setting form of add/sub }
              list.concat(setoppostfix(taicpu.op_reg_reg_reg(TOpCG2AsmOpReg[op],dst,src2,src1),PF_S));
              ovloc.loc:=LOC_FLAGS;
              if size=OS_64 then
                if op=OP_ADD then
                  ovloc.resflags:=F_CS
                else
                  ovloc.resflags:=F_CC
              else
                ovloc.resflags:=F_VS;
              { finished }
              exit;
            end;
          OP_MUL:
            begin
              { check whether the upper 64 bit of the 128 bit product is 0 }
              tmpreg1:=getintregister(list,OS_64);
              list.concat(taicpu.op_reg_reg_reg(A_UMULH,tmpreg1,src2,src1));
              list.concat(taicpu.op_reg_const(A_CMP,tmpreg1,0));
              ovloc.loc:=LOC_FLAGS;
              ovloc.resflags:=F_NE;
              { still have to perform the actual multiplication }
            end;
          OP_IMUL:
            begin
              { check whether the upper 64 bits of the 128 bit multiplication
                result have the same value as the replicated sign bit of the
                lower 64 bits }
              tmpreg1:=getintregister(list,OS_64);
              list.concat(taicpu.op_reg_reg_reg(A_SMULH,tmpreg1,src2,src1));
              { calculate lower 64 bits (afterwards, because dst may be
                equal to src1 or src2) }
              a_op_reg_reg_reg(list,op,size,src1,src2,dst);
              { replicate sign bit }
              tmpreg2:=getintregister(list,OS_64);
              a_op_const_reg_reg(list,OP_SAR,OS_S64,63,dst,tmpreg2);
              list.concat(taicpu.op_reg_reg(A_CMP,tmpreg1,tmpreg2));
              ovloc.loc:=LOC_FLAGS;
              ovloc.resflags:=F_NE;
              { finished }
              exit;
            end;
          OP_IDIV,
          OP_DIV:
            begin
              { not handled here, needs div-by-zero check (dividing by zero
                just gives a 0 result on aarch64), and low(int64) div -1
                check for overflow) }
              internalerror(2014122101);
            end;
          else
            internalerror(2019050936);
        end;
      end;
    a_op_reg_reg_reg(list,op,size,src1,src2,dst);
  end;
  1364. {*************** compare instructructions ****************}
{ Compares reg with the constant a and branches to l if cmp_op holds;
  negative constants are handled via cmn with the absolute value, since
  the compare immediate must be positive. }
procedure tcgaarch64.a_cmp_const_reg_label(list: TAsmList; size: tcgsize; cmp_op: topcmp; a: tcgint; reg: tregister; l: tasmlabel);
  var
    op: tasmop;
  begin
    if a>=0 then
      op:=A_CMP
    else
      op:=A_CMN;
    { avoid range/overflow error in case a=low(tcgint) }
{$push}{$r-}{$q-}
    handle_reg_imm12_reg(list,op,size,reg,abs(a),NR_XZR,NR_NO,false,false);
{$pop}
    a_jmp_cond(list,cmp_op,l);
  end;
  1379. procedure tcgaarch64.a_cmp_reg_reg_label(list: TAsmList; size: tcgsize; cmp_op: topcmp; reg1,reg2: tregister; l: tasmlabel);
  1380. begin
  1381. list.concat(taicpu.op_reg_reg(A_CMP,reg2,reg1));
  1382. a_jmp_cond(list,cmp_op,l);
  1383. end;
  1384. procedure tcgaarch64.a_jmp_always(list: TAsmList; l: TAsmLabel);
  1385. var
  1386. ai: taicpu;
  1387. begin
  1388. ai:=TAiCpu.op_sym(A_B,current_asmdata.RefAsmSymbol(l.name,AT_FUNCTION));
  1389. ai.is_jmp:=true;
  1390. list.Concat(ai);
  1391. end;
  1392. procedure tcgaarch64.a_jmp_name(list: TAsmList; const s: string);
  1393. var
  1394. ai: taicpu;
  1395. begin
  1396. ai:=TAiCpu.op_sym(A_B,current_asmdata.RefAsmSymbol(s,AT_FUNCTION));
  1397. ai.is_jmp:=true;
  1398. list.Concat(ai);
  1399. end;
  1400. procedure tcgaarch64.a_jmp_cond(list: TAsmList; cond: TOpCmp; l: TAsmLabel);
  1401. var
  1402. ai: taicpu;
  1403. begin
  1404. ai:=TAiCpu.op_sym(A_B,l);
  1405. ai.is_jmp:=true;
  1406. ai.SetCondition(TOpCmp2AsmCond[cond]);
  1407. list.Concat(ai);
  1408. end;
  1409. procedure tcgaarch64.a_jmp_flags(list: TAsmList; const f: tresflags; l: tasmlabel);
  1410. var
  1411. ai : taicpu;
  1412. begin
  1413. ai:=Taicpu.op_sym(A_B,l);
  1414. ai.is_jmp:=true;
  1415. ai.SetCondition(flags_to_cond(f));
  1416. list.Concat(ai);
  1417. end;
  1418. procedure tcgaarch64.g_flags2reg(list: TAsmList; size: tcgsize; const f: tresflags; reg: tregister);
  1419. begin
  1420. list.concat(taicpu.op_reg_cond(A_CSET,reg,flags_to_cond(f)));
  1421. end;
{ Not supported on this target: callers must use g_overflowcheck_loc with
  an explicit overflow location instead. }
procedure tcgaarch64.g_overflowcheck(list: TAsmList; const loc: tlocation; def: tdef);
  begin
    { we need an explicit overflow location, because there are many
      possibilities (not just the overflow flag, which is only used for
      signed add/sub) }
    internalerror(2014112303);
  end;
{ Emits a call to FPC_OVERFLOW when the overflow condition described by
  ovloc holds (only flags locations are supported); does nothing unless
  overflow checking is enabled in the current settings. }
procedure tcgaarch64.g_overflowcheck_loc(list: TAsmList; const loc: tlocation; def: tdef; ovloc : tlocation);
  var
    hl : tasmlabel;
    hflags : tresflags;
  begin
    if not(cs_check_overflow in current_settings.localswitches) then
      exit;
    current_asmdata.getjumplabel(hl);
    case ovloc.loc of
      LOC_FLAGS:
        begin
          { jump over the call when no overflow occurred }
          hflags:=ovloc.resflags;
          inverse_flags(hflags);
          cg.a_jmp_flags(list,hflags,hl);
        end;
      else
        internalerror(2014112304);
    end;
    a_call_name(list,'FPC_OVERFLOW',false);
    a_label(list,hl);
  end;
  1450. { *********** entry/exit code and address loading ************ }
{ Saves all registers of type rt in [lowsr..highsr] that are used in the
  current procedure, two at a time via "stp ...,[sp,#-16]!" (pre-indexed);
  a leftover register is stored twice to keep the stack 16 byte aligned.
  Returns the number of bytes pushed (a multiple of 16). }
function tcgaarch64.save_regs(list: TAsmList; rt: tregistertype; lowsr, highsr: tsuperregister; sub: tsubregister): longint;
  var
    ref: treference;
    sr: tsuperregister;
    pairreg: tregister;
  begin
    result:=0;
    reference_reset_base(ref,NR_SP,-16,ctempposinvalid,16,[]);
    ref.addressmode:=AM_PREINDEXED;
    pairreg:=NR_NO;
    { store all used registers pairwise }
    for sr:=lowsr to highsr do
      if sr in rg[rt].used_in_proc then
        if pairreg=NR_NO then
          pairreg:=newreg(rt,sr,sub)
        else
          begin
            inc(result,16);
            list.concat(taicpu.op_reg_reg_ref(A_STP,pairreg,newreg(rt,sr,sub),ref));
            pairreg:=NR_NO
          end;
    { one left -> store twice (stack must be 16 bytes aligned) }
    if pairreg<>NR_NO then
      begin
        list.concat(taicpu.op_reg_reg_ref(A_STP,pairreg,pairreg,ref));
        inc(result,16);
      end;
  end;
  1479. procedure FixupOffsets(p:TObject;arg:pointer);
  1480. var
  1481. sym: tabstractnormalvarsym absolute p;
  1482. begin
  1483. if (tsym(p).typ in [paravarsym,localvarsym]) and
  1484. (sym.localloc.loc=LOC_REFERENCE) and
  1485. (sym.localloc.reference.base=NR_STACK_POINTER_REG) then
  1486. begin
  1487. sym.localloc.reference.base:=NR_FRAME_POINTER_REG;
  1488. dec(sym.localloc.reference.offset,PLongint(arg)^);
  1489. end;
  1490. end;
{ Generates the procedure prologue: pushes FP/LR as a pair, establishes the
  frame pointer, saves the callee-saved integer (x19-x28) and vector
  (d8-d15, low halves only) registers that this procedure modifies, and
  allocates the local stack area. Finally, if the procedure contains nested
  routines, rewrites all SP-relative variable locations to FP-relative ones
  (see the long comment below for why). Does nothing when nostackframe is
  set. }
procedure tcgaarch64.g_proc_entry(list: TAsmList; localsize: longint; nostackframe: boolean);
  var
    ref: treference;
    totalstackframesize: longint;  { locals + saved-register area, for FixupOffsets }
  begin
    if nostackframe then
      exit;
    { stack pointer has to be aligned to 16 bytes at all times }
    localsize:=align(localsize,16);
    { save stack pointer and return address: stp fp, lr, [sp, #-16]! }
    reference_reset_base(ref,NR_SP,-16,ctempposinvalid,16,[]);
    ref.addressmode:=AM_PREINDEXED;
    list.concat(taicpu.op_reg_reg_ref(A_STP,NR_FP,NR_LR,ref));
    { initialise frame pointer }
    a_load_reg_reg(list,OS_ADDR,OS_ADDR,NR_SP,NR_FP);
    totalstackframesize:=localsize;
    { save modified integer registers }
    inc(totalstackframesize,
      save_regs(list,R_INTREGISTER,RS_X19,RS_X28,R_SUBWHOLE));
    { only the lower 64 bits of the modified vector registers need to be
      saved; if the caller needs the upper 64 bits, it has to save them
      itself }
    inc(totalstackframesize,
      save_regs(list,R_MMREGISTER,RS_D8,RS_D15,R_SUBMMD));
    { allocate stack space }
    if localsize<>0 then
      begin
        { already 16-aligned above; re-aligning here is a harmless no-op }
        localsize:=align(localsize,16);
        current_procinfo.final_localsize:=localsize;
        { may need a scratch register (IP0) for large immediates }
        handle_reg_imm12_reg(list,A_SUB,OS_ADDR,NR_SP,localsize,NR_SP,NR_IP0,false,true);
      end;
    { By default, we use the frame pointer to access parameters passed via
      the stack and the stack pointer to address local variables and temps
      because
       a) we can use bigger positive than negative offsets (so accessing
          locals via negative offsets from the frame pointer would be less
          efficient)
       b) we don't know the local size while generating the code, so
          accessing the parameters via the stack pointer is not possible
          without copying them
      The problem with this is the get_frame() intrinsic:
       a) it must return the same value as what we pass as parentfp
          parameter, since that's how it's used in the TP-style objects unit
       b) its return value must usable to access all local data from a
          routine (locals and parameters), since it's all the nested
          routines have access to
       c) its return value must be usable to construct a backtrace, as it's
          also used by the exception handling routines
      The solution we use here, based on something similar that's done in
      the MIPS port, is to generate all accesses to locals in the routine
      itself SP-relative, and then after the code is generated and the local
      size is known (namely, here), we change all SP-relative variables/
      parameters into FP-relative ones. This means that they'll be accessed
      less efficiently from nested routines, but those accesses are indirect
      anyway and at least this way they can be accessed at all
    }
    if current_procinfo.has_nestedprocs then
      begin
        current_procinfo.procdef.localst.SymList.ForEachCall(@FixupOffsets,@totalstackframesize);
        current_procinfo.procdef.parast.SymList.ForEachCall(@FixupOffsets,@totalstackframesize);
      end;
  end;
{ GOT pointer initialisation hook; intentionally empty for this target. }
procedure tcgaarch64.g_maybe_got_init(list : TAsmList);
  begin
    { nothing to do on Darwin or Linux }
  end;
{ Register restoring is folded into the epilogue; intentionally empty. }
procedure tcgaarch64.g_restore_registers(list:TAsmList);
  begin
    { done in g_proc_exit }
  end;
{ Reloads the callee-saved registers of type rt in [lowsr..highsr] that
  were pushed by save_regs, using post-indexed LDP/LDR so SP is popped by
  16 bytes per instruction. Must mirror save_regs exactly: since save_regs
  stored an odd trailing register twice, the highest used register is
  first reloaded alone here, and the remaining ones are reloaded pairwise
  in reverse order. }
procedure tcgaarch64.load_regs(list: TAsmList; rt: tregistertype; lowsr, highsr: tsuperregister; sub: tsubregister);
  var
    ref: treference;
    sr, highestsetsr: tsuperregister;
    pairreg: tregister;
    regcount: longint;
  begin
    { [sp], #16 : post-indexed pop of one 16-byte pair }
    reference_reset_base(ref,NR_SP,16,ctempposinvalid,16,[]);
    ref.addressmode:=AM_POSTINDEXED;
    { highest reg stored twice? }
    regcount:=0;
    highestsetsr:=RS_NO;
    for sr:=lowsr to highsr do
      if sr in rg[rt].used_in_proc then
        begin
          inc(regcount);
          highestsetsr:=sr;
        end;
    if odd(regcount) then
      begin
        { the duplicated slot: a single LDR still pops the full 16 bytes }
        list.concat(taicpu.op_reg_ref(A_LDR,newreg(rt,highestsetsr,sub),ref));
        highestsetsr:=pred(highestsetsr);
      end;
    { load all (other) used registers pairwise }
    pairreg:=NR_NO;
    for sr:=highestsetsr downto lowsr do
      if sr in rg[rt].used_in_proc then
        if pairreg=NR_NO then
          pairreg:=newreg(rt,sr,sub)
        else
          begin
            list.concat(taicpu.op_reg_reg_ref(A_LDP,newreg(rt,sr,sub),pairreg,ref));
            pairreg:=NR_NO
          end;
    { There can't be any register left }
    if pairreg<>NR_NO then
      internalerror(2014112602);
  end;
{ Generates the procedure epilogue: deallocates the local stack area,
  reloads the callee-saved registers saved in g_proc_entry (in reverse
  order: vector registers first, then integer registers), pops FP/LR and
  returns. When nostackframe is set, only the RET is emitted. }
procedure tcgaarch64.g_proc_exit(list : TAsmList;parasize:longint;nostackframe:boolean);
  var
    ref: treference;
    regsstored: boolean;
    sr: tsuperregister;
  begin
    if not nostackframe then
      begin
        { if no registers have been stored, we don't have to subtract the
          allocated temp space from the stack pointer }
        regsstored:=false;
        for sr:=RS_X19 to RS_X28 do
          if sr in rg[R_INTREGISTER].used_in_proc then
            begin
              regsstored:=true;
              break;
            end;
        if not regsstored then
          for sr:=RS_D8 to RS_D15 do
            if sr in rg[R_MMREGISTER].used_in_proc then
              begin
                regsstored:=true;
                break;
              end;
        { restore registers (and stack pointer) }
        if regsstored then
          begin
            { step over the locals so SP points at the saved-register area }
            if current_procinfo.final_localsize<>0 then
              handle_reg_imm12_reg(list,A_ADD,OS_ADDR,NR_SP,current_procinfo.final_localsize,NR_SP,NR_IP0,false,true);
            load_regs(list,R_MMREGISTER,RS_D8,RS_D15,R_SUBMMD);
            load_regs(list,R_INTREGISTER,RS_X19,RS_X28,R_SUBWHOLE);
          end
        else if current_procinfo.final_localsize<>0 then
          { restore stack pointer }
          { FP still points at the frame record, so copying it back to SP
            discards the locals in one move }
          a_load_reg_reg(list,OS_ADDR,OS_ADDR,NR_FP,NR_SP);
        { restore framepointer and return address: ldp fp, lr, [sp], #16 }
        reference_reset_base(ref,NR_SP,16,ctempposinvalid,16,[]);
        ref.addressmode:=AM_POSTINDEXED;
        list.concat(taicpu.op_reg_reg_ref(A_LDP,NR_FP,NR_LR,ref));
      end;
    { return }
    list.concat(taicpu.op_none(A_RET));
  end;
{ Register saving is folded into the prologue; intentionally empty. }
procedure tcgaarch64.g_save_registers(list : TAsmList);
  begin
    { done in g_proc_entry }
  end;
{ ************* concatcopy ************ }
  1647. procedure tcgaarch64.g_concatcopy_move(list : TAsmList;const source,dest : treference;len : tcgint);
  1648. var
  1649. paraloc1,paraloc2,paraloc3 : TCGPara;
  1650. pd : tprocdef;
  1651. begin
  1652. pd:=search_system_proc('MOVE');
  1653. paraloc1.init;
  1654. paraloc2.init;
  1655. paraloc3.init;
  1656. paramanager.getintparaloc(list,pd,1,paraloc1);
  1657. paramanager.getintparaloc(list,pd,2,paraloc2);
  1658. paramanager.getintparaloc(list,pd,3,paraloc3);
  1659. a_load_const_cgpara(list,OS_SINT,len,paraloc3);
  1660. a_loadaddr_ref_cgpara(list,dest,paraloc2);
  1661. a_loadaddr_ref_cgpara(list,source,paraloc1);
  1662. paramanager.freecgpara(list,paraloc3);
  1663. paramanager.freecgpara(list,paraloc2);
  1664. paramanager.freecgpara(list,paraloc1);
  1665. alloccpuregisters(list,R_INTREGISTER,paramanager.get_volatile_registers_int(pocall_default));
  1666. alloccpuregisters(list,R_MMREGISTER,paramanager.get_volatile_registers_mm(pocall_default));
  1667. a_call_name(list,'FPC_MOVE',false);
  1668. dealloccpuregisters(list,R_MMREGISTER,paramanager.get_volatile_registers_mm(pocall_default));
  1669. dealloccpuregisters(list,R_INTREGISTER,paramanager.get_volatile_registers_int(pocall_default));
  1670. paraloc3.done;
  1671. paraloc2.done;
  1672. paraloc1.done;
  1673. end;
  1674. procedure tcgaarch64.g_concatcopy(list: TAsmList; const source, dest: treference; len: tcgint);
  1675. var
  1676. sourcebasereplaced, destbasereplaced: boolean;
  1677. { get optimal memory operation to use for loading/storing data
  1678. in an unrolled loop }
  1679. procedure getmemop(scaledop, unscaledop: tasmop; const startref, endref: treference; opsize: tcgsize; postfix: toppostfix; out memop: tasmop; out needsimplify: boolean);
  1680. begin
  1681. if (simple_ref_type(scaledop,opsize,postfix,startref)=sr_simple) and
  1682. (simple_ref_type(scaledop,opsize,postfix,endref)=sr_simple) then
  1683. begin
  1684. memop:=unscaledop;
  1685. needsimplify:=true;
  1686. end
  1687. else if (unscaledop<>A_NONE) and
  1688. (simple_ref_type(unscaledop,opsize,postfix,startref)=sr_simple) and
  1689. (simple_ref_type(unscaledop,opsize,postfix,endref)=sr_simple) then
  1690. begin
  1691. memop:=unscaledop;
  1692. needsimplify:=false;
  1693. end
  1694. else
  1695. begin
  1696. memop:=scaledop;
  1697. needsimplify:=true;
  1698. end;
  1699. end;
  { adjust the offset and/or addressing mode after a load/store so it's
    correct for the next one of the same size }
  procedure updaterefafterloadstore(var ref: treference; oplen: longint);
    begin
      case ref.addressmode of
        AM_OFFSET:
          { plain offset addressing: simply step past the bytes just moved }
          inc(ref.offset,oplen);
        AM_POSTINDEXED:
          { base register updated by instruction, next offset can remain
            the same }
          ;
        AM_PREINDEXED:
          begin
            { base register updated by instruction -> next instruction can
              use post-indexing with offset = sizeof(operation) }
            ref.offset:=0;
            ref.addressmode:=AM_OFFSET;
          end;
      end;
    end;
  { generate a load/store and adjust the reference offset to the next
    memory location if necessary }
  procedure genloadstore(list: TAsmList; op: tasmop; reg: tregister; var ref: treference; postfix: toppostfix; opsize: tcgsize);
    begin
      list.concat(setoppostfix(taicpu.op_reg_ref(op,reg,ref),postfix));
      { advance ref by the number of bytes this single-register op moved }
      updaterefafterloadstore(ref,tcgsize2size[opsize]);
    end;
  { generate a dual load/store (ldp/stp) and adjust the reference offset to
    the next memory location if necessary }
  procedure gendualloadstore(list: TAsmList; op: tasmop; reg1, reg2: tregister; var ref: treference; postfix: toppostfix; opsize: tcgsize);
    begin
      list.concat(setoppostfix(taicpu.op_reg_reg_ref(op,reg1,reg2,ref),postfix));
      { a pair instruction moves two registers worth of data }
      updaterefafterloadstore(ref,tcgsize2size[opsize]*2);
    end;
  { turn a reference into a pre- or post-indexed reference for use in a
    load/store of a particular size. May rewrite scaledop, replace ref's
    base register with a freshly allocated temp (recording that fact in
    basereplaced), or materialise the address with a_loadaddr_ref_reg. }
  procedure makesimpleforcopy(list: TAsmList; var scaledop: tasmop; opsize: tcgsize; postfix: toppostfix; forcepostindexing: boolean; var ref: treference; var basereplaced: boolean);
    var
      tmpreg: tregister;
      scaledoffset: longint;      { bytes consumed per instruction }
      orgaddressmode: taddressmode;
    begin
      scaledoffset:=tcgsize2size[opsize];
      { pair instructions move twice the element size per operation }
      if scaledop in [A_LDP,A_STP] then
        scaledoffset:=scaledoffset*2;
      { can we use the reference as post-indexed without changes? }
      if forcepostindexing then
        begin
          orgaddressmode:=ref.addressmode;
          ref.addressmode:=AM_POSTINDEXED;
          if (orgaddressmode=AM_POSTINDEXED) or
             ((ref.offset=0) and
              (simple_ref_type(scaledop,opsize,postfix,ref)=sr_simple)) then
            begin
              { just change the post-indexed offset to the access size }
              ref.offset:=scaledoffset;
              { and replace the base register if that didn't happen yet
                (could be sp or a regvar) }
              if not basereplaced then
                begin
                  tmpreg:=getaddressregister(list);
                  a_load_reg_reg(list,OS_ADDR,OS_ADDR,ref.base,tmpreg);
                  ref.base:=tmpreg;
                  basereplaced:=true;
                end;
              exit;
            end;
          { couldn't reuse it as post-indexed -> restore and fall through }
          ref.addressmode:=orgaddressmode;
        end;
{$ifdef dummy}
      This could in theory be useful in case you have a concatcopy from
      e.g. x1+255 to x1+267 *and* the reference is aligned, but this seems
      very unlikely. Disabled because it still needs fixes, as it
      also generates pre-indexed loads right now at the very end for the
      left-over gencopies
      { can we turn it into a pre-indexed reference for free? (after the
        first operation, it will be turned into an offset one) }
      if not forcepostindexing and
         (ref.offset<>0) then
        begin
          orgaddressmode:=ref.addressmode;
          ref.addressmode:=AM_PREINDEXED;
          tmpreg:=ref.base;
          if not basereplaced and
             (ref.base=tmpreg) then
            begin
              tmpreg:=getaddressregister(list);
              a_load_reg_reg(list,OS_ADDR,OS_ADDR,ref.base,tmpreg);
              ref.base:=tmpreg;
              basereplaced:=true;
            end;
          if simple_ref_type(scaledop,opsize,postfix,ref)<>sr_simple then
            make_simple_ref(list,scaledop,opsize,postfix,ref,NR_NO);
          exit;
        end;
{$endif dummy}
      if not forcepostindexing then
        begin
          ref.addressmode:=AM_OFFSET;
          make_simple_ref(list,scaledop,opsize,postfix,ref,NR_NO);
          { this may still cause problems if the final offset is no longer
            a simple ref; it's a bit complicated to pass all information
            through at all places and check that here, so play safe: we
            currently never generate unrolled copies for more than 64
            bytes (32 with non-double-register copies) }
          if ref.index=NR_NO then
            begin
              if ((scaledop in [A_LDP,A_STP]) and
                  (ref.offset<((64-8)*tcgsize2size[opsize]))) or
                 ((scaledop in [A_LDUR,A_STUR]) and
                  (ref.offset<(255-8*tcgsize2size[opsize]))) or
                 ((scaledop in [A_LDR,A_STR]) and
                  (ref.offset<((4096-8)*tcgsize2size[opsize]))) then
                exit;
            end;
        end;
      { last resort: compute the full address into a temp register and
        restart from a zero (or post-indexed) offset }
      tmpreg:=getaddressregister(list);
      a_loadaddr_ref_reg(list,ref,tmpreg);
      basereplaced:=true;
      if forcepostindexing then
        begin
          reference_reset_base(ref,tmpreg,scaledoffset,ref.temppos,ref.alignment,ref.volatility);
          ref.addressmode:=AM_POSTINDEXED;
        end
      else
        begin
          reference_reset_base(ref,tmpreg,0,ref.temppos,ref.alignment,ref.volatility);
          ref.addressmode:=AM_OFFSET;
        end
    end;
  { prepare a reference for use by gencopy. This is done both after the
    unrolled and regular copy loop -> get rid of post-indexing mode, make
    sure ref is valid }
  procedure preparecopy(list: tasmlist; scaledop, unscaledop: tasmop; var ref: treference; opsize: tcgsize; postfix: toppostfix; out op: tasmop; var basereplaced: boolean);
    var
      simplify: boolean;
    begin
      { a post-indexed ref from the loop still points one element back;
        account for the pending increment via the offset }
      if ref.addressmode=AM_POSTINDEXED then
        ref.offset:=tcgsize2size[opsize];
      { note: scaledop is deliberately passed for both operands here }
      getmemop(scaledop,scaledop,ref,ref,opsize,postfix,op,simplify);
      if simplify then
        begin
          makesimpleforcopy(list,scaledop,opsize,postfix,false,ref,basereplaced);
          op:=scaledop;
        end;
    end;
  { generate a copy from source to dest of size opsize/postfix }
  procedure gencopy(list: TAsmList; var source, dest: treference; postfix: toppostfix; opsize: tcgsize);
    var
      reg: tregister;
      loadop, storeop: tasmop;
    begin
      { make both references encodable and pick the load/store opcodes }
      preparecopy(list,A_LDR,A_LDUR,source,opsize,postfix,loadop,sourcebasereplaced);
      preparecopy(list,A_STR,A_STUR,dest,opsize,postfix,storeop,destbasereplaced);
      reg:=getintregister(list,opsize);
      genloadstore(list,loadop,reg,source,postfix,opsize);
      genloadstore(list,storeop,reg,dest,postfix,opsize);
    end;
  { copy the leftovers after an unrolled or regular copy loop: emits at
    most one copy each of 8, 4, 2 and 1 bytes, so any remainder of up to
    15 bytes is handled }
  procedure gencopyleftovers(list: TAsmList; var source, dest: treference; len: longint);
    begin
      { stop post-indexing if we did so in the loop, since in that case all
        offsets definitely can be represented now }
      if source.addressmode=AM_POSTINDEXED then
        begin
          source.addressmode:=AM_OFFSET;
          source.offset:=0;
        end;
      if dest.addressmode=AM_POSTINDEXED then
        begin
          dest.addressmode:=AM_OFFSET;
          dest.offset:=0;
        end;
      { transfer the leftovers }
      if len>=8 then
        begin
          dec(len,8);
          gencopy(list,source,dest,PF_NONE,OS_64);
        end;
      if len>=4 then
        begin
          dec(len,4);
          gencopy(list,source,dest,PF_NONE,OS_32);
        end;
      if len>=2 then
        begin
          dec(len,2);
          gencopy(list,source,dest,PF_H,OS_16);
        end;
      if len>=1 then
        begin
          dec(len);
          gencopy(list,source,dest,PF_B,OS_8);
        end;
    end;
  const
    { estimated code size (bytes) of the regular loop scaffolding:
      load_length + loop dec + cbnz }
    loopoverhead=12;
    { loop overhead + load + store }
    totallooplen=loopoverhead + 8;
  var
    totalalign: longint;
    maxlenunrolled: tcgint;
    loadop, storeop: tasmop;
    opsize: tcgsize;
    postfix: toppostfix;
    tmpsource, tmpdest: treference;
    scaledstoreop, unscaledstoreop,
    scaledloadop, unscaledloadop: tasmop;
    regs: array[1..8] of tregister;
    countreg: tregister;
    i, regcount: longint;
    hl: tasmlabel;
    simplifysource, simplifydest: boolean;
  begin
    if len=0 then
      exit;
    sourcebasereplaced:=false;
    destbasereplaced:=false;
    { maximum common alignment }
    totalalign:=max(1,newalignment(source.alignment,dest.alignment));
    { use a simple load/store? }
    if (len in [1,2,4,8]) and
       ((totalalign>=(len div 2)) or
        (source.alignment=len) or
        (dest.alignment=len)) then
      begin
        opsize:=int_cgsize(len);
        a_load_ref_ref(list,opsize,opsize,source,dest);
        exit;
      end;
    { alignment > length is not useful, and would break some checks below }
    while totalalign>len do
      totalalign:=totalalign div 2;
    { operation sizes to use based on common alignment }
    case totalalign of
      1:
        begin
          postfix:=PF_B;
          opsize:=OS_8;
        end;
      2:
        begin
          postfix:=PF_H;
          opsize:=OS_16;
        end;
      4:
        begin
          postfix:=PF_None;
          opsize:=OS_32;
        end
      else
        begin
          totalalign:=8;
          postfix:=PF_None;
          opsize:=OS_64;
        end;
    end;
    { maximum length to handled with an unrolled loop (4 loads + 4 stores) }
    maxlenunrolled:=min(totalalign,8)*4;
    { ldp/stp -> 2 registers per instruction }
    if (totalalign>=4) and
       (len>=totalalign*2) then
      begin
        maxlenunrolled:=maxlenunrolled*2;
        scaledstoreop:=A_STP;
        scaledloadop:=A_LDP;
        { there are no unscaled pair instructions }
        unscaledstoreop:=A_NONE;
        unscaledloadop:=A_NONE;
      end
    else
      begin
        scaledstoreop:=A_STR;
        scaledloadop:=A_LDR;
        unscaledstoreop:=A_STUR;
        unscaledloadop:=A_LDUR;
      end;
    { we only need 4 instructions extra to call FPC_MOVE }
    if cs_opt_size in current_settings.optimizerswitches then
      maxlenunrolled:=maxlenunrolled div 2;
    if (len>maxlenunrolled) and
       (len>totalalign*8) then
      begin
        g_concatcopy_move(list,source,dest,len);
        exit;
      end;
    simplifysource:=true;
    simplifydest:=true;
    tmpsource:=source;
    tmpdest:=dest;
    { can we directly encode all offsets in an unrolled loop? }
    if len<=maxlenunrolled then
      begin
{$ifdef extdebug}
        list.concat(tai_comment.Create(strpnew('concatcopy unrolled loop; len/opsize/align: '+tostr(len)+'/'+tostr(tcgsize2size[opsize])+'/'+tostr(totalalign))));
{$endif extdebug}
        { the leftovers will be handled separately -> -(len mod opsize) }
        inc(tmpsource.offset,len-(len mod tcgsize2size[opsize]));
        { additionally, the last regular load/store will be at
          offset+len-opsize (if len-(len mod opsize)>len) }
        if tmpsource.offset>source.offset then
          dec(tmpsource.offset,tcgsize2size[opsize]);
        getmemop(scaledloadop,unscaledloadop,source,tmpsource,opsize,postfix,loadop,simplifysource);
        inc(tmpdest.offset,len-(len mod tcgsize2size[opsize]));
        if tmpdest.offset>dest.offset then
          dec(tmpdest.offset,tcgsize2size[opsize]);
        getmemop(scaledstoreop,unscaledstoreop,dest,tmpdest,opsize,postfix,storeop,simplifydest);
        tmpsource:=source;
        tmpdest:=dest;
        { if we can't directly encode all offsets, simplify }
        if simplifysource then
          begin
            loadop:=scaledloadop;
            makesimpleforcopy(list,loadop,opsize,postfix,false,tmpsource,sourcebasereplaced);
          end;
        if simplifydest then
          begin
            storeop:=scaledstoreop;
            makesimpleforcopy(list,storeop,opsize,postfix,false,tmpdest,destbasereplaced);
          end;
        regcount:=len div tcgsize2size[opsize];
        { in case we transfer two registers at a time, we copy an even
          number of registers }
        if loadop=A_LDP then
          regcount:=regcount and not(1);
        { initialise for dfa }
        regs[low(regs)]:=NR_NO;
        { max 4 loads/stores -> max 8 registers (in case of ldp/stdp) }
        for i:=1 to regcount do
          regs[i]:=getintregister(list,opsize);
        if loadop=A_LDP then
          begin
            { load registers }
            for i:=1 to (regcount div 2) do
              gendualloadstore(list,loadop,regs[i*2-1],regs[i*2],tmpsource,postfix,opsize);
            { store registers }
            for i:=1 to (regcount div 2) do
              gendualloadstore(list,storeop,regs[i*2-1],regs[i*2],tmpdest,postfix,opsize);
          end
        else
          begin
            for i:=1 to regcount do
              genloadstore(list,loadop,regs[i],tmpsource,postfix,opsize);
            for i:=1 to regcount do
              genloadstore(list,storeop,regs[i],tmpdest,postfix,opsize);
          end;
        { leftover }
        len:=len-regcount*tcgsize2size[opsize];
{$ifdef extdebug}
        list.concat(tai_comment.Create(strpnew('concatcopy unrolled loop leftover: '+tostr(len))));
{$endif extdebug}
      end
    else
      begin
{$ifdef extdebug}
        list.concat(tai_comment.Create(strpnew('concatcopy regular loop; len/align: '+tostr(len)+'/'+tostr(totalalign))));
{$endif extdebug}
        { regular loop -> definitely use post-indexing }
        loadop:=scaledloadop;
        makesimpleforcopy(list,loadop,opsize,postfix,true,tmpsource,sourcebasereplaced);
        storeop:=scaledstoreop;
        makesimpleforcopy(list,storeop,opsize,postfix,true,tmpdest,destbasereplaced);
        current_asmdata.getjumplabel(hl);
        countreg:=getintregister(list,OS_32);
        { each ldp/stp iteration moves tcgsize2size[opsize]*2 bytes, a
          plain ldr/str iteration moves tcgsize2size[opsize] bytes.
          (bug fix: this previously computed "len div size*2", which Pascal
          parses as "(len div size)*2" -- four times the correct iteration
          count for the pair case) }
        if loadop=A_LDP then
          a_load_const_reg(list,OS_32,len div (tcgsize2size[opsize]*2),countreg)
        else
          a_load_const_reg(list,OS_32,len div tcgsize2size[opsize],countreg);
        a_label(list,hl);
        a_op_const_reg(list,OP_SUB,OS_32,1,countreg);
        if loadop=A_LDP then
          begin
            regs[1]:=getintregister(list,opsize);
            regs[2]:=getintregister(list,opsize);
            gendualloadstore(list,loadop,regs[1],regs[2],tmpsource,postfix,opsize);
            gendualloadstore(list,storeop,regs[1],regs[2],tmpdest,postfix,opsize);
          end
        else
          begin
            regs[1]:=getintregister(list,opsize);
            genloadstore(list,loadop,regs[1],tmpsource,postfix,opsize);
            genloadstore(list,storeop,regs[1],tmpdest,postfix,opsize);
          end;
        list.concat(taicpu.op_reg_sym_ofs(A_CBNZ,countreg,hl,0));
        { the leftover must likewise account for twice the element size in
          the pair case (bug fix: previously always "len mod size", which
          left up to size bytes uncopied after an ldp/stp loop) }
        if loadop=A_LDP then
          len:=len mod (tcgsize2size[opsize]*2)
        else
          len:=len mod tcgsize2size[opsize];
      end;
    gencopyleftovers(list,tmpsource,tmpdest,len);
  end;
{ Self-pointer adjustment for interface wrappers; unsupported as a
  standalone operation on this target. }
procedure tcgaarch64.g_adjust_self_value(list:TAsmList;procdef: tprocdef;ioffset: tcgint);
  begin
    { This method is integrated into g_intf_wrapper and shouldn't be called separately }
    InternalError(2013020102);
  end;
{ Instantiates the AArch64 code generator objects: the main cg and the
  128-bit helper cg128. }
procedure create_codegen;
  begin
    cg:=tcgaarch64.Create;
    cg128:=tcg128.Create;
  end;
  2098. end.