{
    Copyright (c) 2014 by Jonas Maebe

    This unit implements the code generator for AArch64

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

 ****************************************************************************
}
unit cgcpu;

{$i fpcdefs.inc}

interface

uses
  globtype,parabase,
  cgbase,cgutils,cgobj,
  aasmbase,aasmtai,aasmdata,aasmcpu,
  cpubase,cpuinfo,
  node,symconst,SymType,symdef,
  rgcpu;

type
  tcgaarch64=class(tcg)
  protected
    { changes register size without adding register allocation info }
    function makeregsize(reg: tregister; size: tcgsize): tregister; overload;
  public
    { simplifies "ref" so it can be used with "op". If "ref" can be used
      with a different load/store operation that has the same meaning as
      the original one, "op" will be replaced with the alternative }
    procedure make_simple_ref(list:TAsmList; var op: tasmop; size: tcgsize; oppostfix: toppostfix; var ref: treference; preferred_newbasereg: tregister);
    function getfpuregister(list: TAsmList; size: Tcgsize): Tregister; override;
    procedure handle_reg_imm12_reg(list: TAsmList; op: Tasmop; size: tcgsize; src: tregister; a: tcgint; dst: tregister; tmpreg: tregister; setflags, usedest: boolean);
    procedure init_register_allocators;override;
    procedure done_register_allocators;override;
    function getmmregister(list:TAsmList;size:tcgsize):tregister;override;
    function handle_load_store(list:TAsmList; op: tasmop; size: tcgsize; oppostfix: toppostfix; reg: tregister; ref: treference):treference;
    procedure a_call_name(list:TAsmList;const s:string; weak: boolean);override;
    procedure a_call_reg(list:TAsmList;Reg:tregister);override;
    { General purpose instructions }
    procedure maybeadjustresult(list: TAsmList; op: topcg; size: tcgsize; dst: tregister);
    procedure a_op_const_reg(list: TAsmList; op: topcg; size: tcgsize; a: tcgint; reg: tregister);override;
    procedure a_op_reg_reg(list: TAsmList; op: topcg; size: tcgsize; src, dst: tregister);override;
    procedure a_op_const_reg_reg(list: TAsmList; op: topcg; size: tcgsize; a: tcgint; src, dst: tregister);override;
    procedure a_op_reg_reg_reg(list: TAsmList; op: topcg; size: tcgsize; src1, src2, dst: tregister);override;
    procedure a_op_const_reg_reg_checkoverflow(list: TAsmList; op: topcg; size: tcgsize; a: tcgint; src, dst: tregister; setflags : boolean; var ovloc : tlocation);override;
    procedure a_op_reg_reg_reg_checkoverflow(list: TAsmList; op: topcg; size: tcgsize; src1, src2, dst: tregister; setflags : boolean; var ovloc : tlocation);override;
    { move instructions }
    procedure a_load_const_reg(list: TAsmList; size: tcgsize; a: tcgint; reg: tregister);override;
    procedure a_load_const_ref(list: TAsmList; size: tcgsize; a: tcgint; const ref: treference); override;
    procedure a_load_reg_ref(list: TAsmList; fromsize, tosize: tcgsize; reg: tregister;const ref: TReference);override;
    procedure a_load_reg_ref_unaligned(list: TAsmList; fromsize, tosize: tcgsize; register: tregister; const ref: treference); override;
    procedure a_load_ref_reg(list: TAsmList; fromsize, tosize: tcgsize; const ref: TReference; reg: tregister);override;
    procedure a_load_ref_reg_unaligned(list: TAsmList; fromsize, tosize: tcgsize; const ref: treference; register: tregister); override;
    procedure a_load_reg_reg(list: TAsmList; fromsize, tosize: tcgsize; reg1, reg2: tregister);override;
    procedure a_loadaddr_ref_reg(list: TAsmList; const ref: TReference; r: tregister);override;
    { fpu move instructions (not used, all floating point is vector unit-based) }
    procedure a_loadfpu_reg_reg(list: TAsmList; fromsize, tosize: tcgsize; reg1, reg2: tregister); override;
    procedure a_loadfpu_ref_reg(list: TAsmList; fromsize, tosize: tcgsize; const ref: treference; reg: tregister); override;
    procedure a_loadfpu_reg_ref(list: TAsmList; fromsize, tosize: tcgsize; reg: tregister; const ref: treference); override;
    procedure a_loadmm_reg_reg(list: TAsmList; fromsize, tosize: tcgsize; reg1, reg2: tregister;shuffle : pmmshuffle);override;
    procedure a_loadmm_ref_reg(list: TAsmList; fromsize, tosize: tcgsize; const ref: TReference; reg: tregister; shuffle: pmmshuffle);override;
    procedure a_loadmm_reg_ref(list: TAsmList; fromsize, tosize: tcgsize; reg: tregister; const ref: TReference; shuffle: pmmshuffle);override;
    procedure a_loadmm_intreg_reg(list: TAsmList; fromsize, tosize: tcgsize; intreg, mmreg: tregister; shuffle: pmmshuffle); override;
    procedure a_loadmm_reg_intreg(list: TAsmList; fromsize, tosize: tcgsize; mmreg, intreg: tregister; shuffle: pmmshuffle); override;
    procedure a_opmm_reg_reg(list: TAsmList; Op: TOpCG; size: tcgsize; src, dst: tregister; shuffle: pmmshuffle); override;
    procedure a_bit_scan_reg_reg(list: TAsmList; reverse: boolean; srcsize, dstsize: tcgsize; src, dst: TRegister); override;
    { comparison operations }
    procedure a_cmp_const_reg_label(list: TAsmList; size: tcgsize; cmp_op: topcmp; a: tcgint; reg: tregister; l: tasmlabel);override;
    procedure a_cmp_reg_reg_label(list: TAsmList; size: tcgsize; cmp_op: topcmp; reg1, reg2: tregister; l: tasmlabel);override;
    procedure a_jmp_always(list: TAsmList; l: TAsmLabel);override;
    procedure a_jmp_name(list: TAsmList; const s: string);override;
    procedure a_jmp_cond(list: TAsmList; cond: TOpCmp; l: tasmlabel);{ override;}
    procedure a_jmp_flags(list: TAsmList; const f: tresflags; l: tasmlabel);override;
    procedure g_flags2reg(list: TAsmList; size: tcgsize; const f:tresflags; reg: tregister);override;
    procedure g_overflowcheck(list: TAsmList; const loc: tlocation; def: tdef);override;
    procedure g_overflowcheck_loc(list: TAsmList; const loc: tlocation; def: tdef; ovloc: tlocation);override;
    procedure g_proc_entry(list: TAsmList; localsize: longint; nostackframe: boolean);override;
    procedure g_proc_exit(list: TAsmList; parasize: longint; nostackframe: boolean);override;
    procedure g_maybe_got_init(list: TAsmList); override;
    procedure g_restore_registers(list: TAsmList);override;
    procedure g_save_registers(list: TAsmList);override;
    procedure g_concatcopy_move(list: TAsmList; const source, dest: treference; len: tcgint);
    procedure g_concatcopy(list: TAsmList; const source, dest: treference; len: tcgint);override;
    procedure g_adjust_self_value(list: TAsmList; procdef: tprocdef; ioffset: tcgint);override;
  private
    function save_regs(list: TAsmList; rt: tregistertype; lowsr, highsr: tsuperregister; sub: tsubregister): longint;
    procedure load_regs(list: TAsmList; rt: tregistertype; lowsr, highsr: tsuperregister; sub: tsubregister);
  end;

procedure create_codegen;

const
  TOpCG2AsmOpReg: array[topcg] of TAsmOp = (
    A_NONE,A_MOV,A_ADD,A_AND,A_UDIV,A_SDIV,A_MUL,A_MUL,A_NEG,A_MVN,A_ORR,A_ASRV,A_LSLV,A_LSRV,A_SUB,A_EOR,A_NONE,A_RORV
  );

  TOpCG2AsmOpImm: array[topcg] of TAsmOp = (
    A_NONE,A_MOV,A_ADD,A_AND,A_UDIV,A_SDIV,A_MUL,A_MUL,A_NEG,A_MVN,A_ORR,A_ASR,A_LSL,A_LSR,A_SUB,A_EOR,A_NONE,A_ROR
  );

  TOpCmp2AsmCond: array[topcmp] of TAsmCond = (
    C_NONE,C_EQ,C_GT,C_LT,C_GE,C_LE,C_NE,C_LS,C_CC,C_CS,C_HI
  );
implementation

uses
  globals,verbose,systems,cutils,
  paramgr,fmodule,
  symtable,symsym,
  tgobj,
  procinfo,cpupi;


procedure tcgaarch64.make_simple_ref(list:TAsmList; var op: tasmop; size: tcgsize; oppostfix: toppostfix; var ref: treference; preferred_newbasereg: tregister);
var
  href: treference;
  so: tshifterop;
  accesssize: longint;
begin
  if (ref.base=NR_NO) then
    begin
      if ref.shiftmode<>SM_None then
        internalerror(2014110701);
      ref.base:=ref.index;
      ref.index:=NR_NO;
    end;
  { no arbitrary scale factor support (the generic code doesn't set it,
    AArch64-specific code shouldn't either) }
  if not(ref.scalefactor in [0,1]) then
    internalerror(2014111002);
  case simple_ref_type(op,size,oppostfix,ref) of
    sr_simple:
      exit;
    sr_internal_illegal:
      internalerror(2014121702);
    sr_complex:
      { continue } ;
  end;
  if assigned(ref.symbol) then
    begin
      { internal "load symbol" instructions should already be valid }
      if assigned(ref.symboldata) or
         (ref.refaddr in [addr_pic,addr_gotpage,addr_gotpageoffset,addr_page,addr_pageoffset]) then
        internalerror(2014110802);
      { no relative symbol support (needed) yet }
      if assigned(ref.relsymbol) then
        internalerror(2014111001);
      { loading a symbol address (whether it's in the GOT or not) consists
        of two parts: first load the page on which it is located, then
        either add the offset in the page or load the value at that offset
        in the page. This final GOT-load can be relaxed by the linker in
        case the variable itself can be stored directly in the GOT }
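      { e.g. for a global "foo" accessed via the GOT this expands to
          adrp xN, :got:foo               // page of foo's GOT slot
          ldr  xN, [xN, :got_lo12:foo]    // load foo's address from the slot
        while page-local data is addressed directly:
          adrp xN, foo
          add  xN, xN, :lo12:foo
        (register number and relocation syntax are illustrative) }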
      if (preferred_newbasereg=NR_NO) or
         (ref.base=preferred_newbasereg) or
         (ref.index=preferred_newbasereg) then
        preferred_newbasereg:=getaddressregister(list);
      { load the (GOT) page }
      reference_reset_symbol(href,ref.symbol,0,8);
      if ((ref.symbol.typ in [AT_FUNCTION,AT_LABEL]) and
          (ref.symbol.bind in [AB_LOCAL,AB_GLOBAL])) or
         ((ref.symbol.typ=AT_DATA) and
          (ref.symbol.bind=AB_LOCAL)) then
        href.refaddr:=addr_page
      else
        href.refaddr:=addr_gotpage;
      list.concat(taicpu.op_reg_ref(A_ADRP,preferred_newbasereg,href));
      { load the GOT entry (= address of the variable) }
      reference_reset_base(href,preferred_newbasereg,0,sizeof(pint));
      href.symbol:=ref.symbol;
      { code symbols defined in the current compilation unit do not
        have to be accessed via the GOT }
      if ((ref.symbol.typ in [AT_FUNCTION,AT_LABEL]) and
          (ref.symbol.bind in [AB_LOCAL,AB_GLOBAL])) or
         ((ref.symbol.typ=AT_DATA) and
          (ref.symbol.bind=AB_LOCAL)) then
        begin
          href.base:=NR_NO;
          href.refaddr:=addr_pageoffset;
          list.concat(taicpu.op_reg_reg_ref(A_ADD,preferred_newbasereg,preferred_newbasereg,href));
        end
      else
        begin
          href.refaddr:=addr_gotpageoffset;
          { use a_load_ref_reg() rather than directly encoding the LDR,
            so that we'll check the validity of the reference }
          a_load_ref_reg(list,OS_ADDR,OS_ADDR,href,preferred_newbasereg);
        end;
      { set as new base register }
      if ref.base=NR_NO then
        ref.base:=preferred_newbasereg
      else if ref.index=NR_NO then
        ref.index:=preferred_newbasereg
      else
        begin
          { make sure it's valid in case ref.base is SP -> make it
            the second operand }
          a_op_reg_reg_reg(list,OP_ADD,OS_ADDR,preferred_newbasereg,ref.base,preferred_newbasereg);
          ref.base:=preferred_newbasereg
        end;
      ref.symbol:=nil;
    end;
  { base & index }
  if (ref.base<>NR_NO) and
     (ref.index<>NR_NO) then
    begin
      case op of
        A_LDR, A_STR:
          begin
            if (ref.shiftmode=SM_None) and
               (ref.shiftimm<>0) then
              internalerror(2014110805);
            { wrong shift? (possible in case of something like
              array_of_2byte_rec[x].bytefield -> shift will be set to 1,
              but the final load is 1 byte -> can't use the shift after
              all) }
            if (ref.shiftmode in [SM_LSL,SM_UXTW,SM_SXTW]) and
               ((ref.shiftimm<>BsfDWord(tcgsizep2size[size])) or
                (ref.offset<>0)) then
              begin
                if preferred_newbasereg=NR_NO then
                  preferred_newbasereg:=getaddressregister(list);
                { "add" supports a superset of the shift modes supported by
                  load/store instructions }
                shifterop_reset(so);
                so.shiftmode:=ref.shiftmode;
                so.shiftimm:=ref.shiftimm;
                list.concat(taicpu.op_reg_reg_reg_shifterop(A_ADD,preferred_newbasereg,ref.base,ref.index,so));
                reference_reset_base(ref,preferred_newbasereg,ref.offset,ref.alignment);
                { possibly still an invalid offset -> fall through }
              end
            else if ref.offset<>0 then
              begin
                if (preferred_newbasereg=NR_NO) or
                   { we keep ref.index, so it must not be overwritten }
                   (ref.index=preferred_newbasereg) then
                  preferred_newbasereg:=getaddressregister(list);
                { add to the base and not to the index, because the index
                  may be scaled; this works even if the base is SP }
                a_op_const_reg_reg(list,OP_ADD,OS_ADDR,ref.offset,ref.base,preferred_newbasereg);
                ref.offset:=0;
                ref.base:=preferred_newbasereg;
                { finished }
                exit;
              end
            else
              { valid -> exit }
              exit;
          end;
        { todo }
        A_LD1,A_LD2,A_LD3,A_LD4,
        A_ST1,A_ST2,A_ST3,A_ST4:
          internalerror(2014110704);
        { these don't support base+index }
        A_LDUR,A_STUR,
        A_LDP,A_STP:
          begin
            { these either don't support pre-/post-indexing, or don't
              support it with base+index }
            if ref.addressmode<>AM_OFFSET then
              internalerror(2014110911);
            if preferred_newbasereg=NR_NO then
              preferred_newbasereg:=getaddressregister(list);
            if ref.shiftmode<>SM_None then
              begin
                { "add" supports a superset of the shift modes supported by
                  load/store instructions }
                shifterop_reset(so);
                so.shiftmode:=ref.shiftmode;
                so.shiftimm:=ref.shiftimm;
                list.concat(taicpu.op_reg_reg_reg_shifterop(A_ADD,preferred_newbasereg,ref.base,ref.index,so));
              end
            else
              a_op_reg_reg_reg(list,OP_ADD,OS_ADDR,ref.index,ref.base,preferred_newbasereg);
            reference_reset_base(ref,preferred_newbasereg,ref.offset,ref.alignment);
            { fall through to the handling of base + offset, since the
              offset may still be too big }
          end;
        else
          internalerror(2014110901);
      end;
    end;
  { base + offset }
  if ref.base<>NR_NO then
    begin
      { valid offset for LDUR/STUR -> use that }
      if (ref.addressmode=AM_OFFSET) and
         (op in [A_LDR,A_STR]) and
         (ref.offset>=-256) and
         (ref.offset<=255) then
        begin
          if op=A_LDR then
            op:=A_LDUR
          else
            op:=A_STUR
        end
      { if it's not a valid LDUR/STUR, use LDR/STR }
      else if (op in [A_LDUR,A_STUR]) and
              ((ref.offset<-256) or
               (ref.offset>255) or
               (ref.addressmode<>AM_OFFSET)) then
        begin
          if op=A_LDUR then
            op:=A_LDR
          else
            op:=A_STR
        end;
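      { ldur/stur accept any byte offset in [-256,255] but no indexing,
        while ldr/str take an unsigned imm12 offset scaled by the access
        size (plus pre-/post-indexed forms with a signed 9 bit offset) }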
      case op of
        A_LDR,A_STR:
          begin
            case ref.addressmode of
              AM_PREINDEXED:
                begin
                  { since the loaded/stored register cannot be the same
                    as the base register, we can safely add the
                    offset to the base if it doesn't fit }
                  if (ref.offset<-256) or
                     (ref.offset>255) then
                    begin
                      a_op_const_reg(list,OP_ADD,OS_ADDR,ref.offset,ref.base);
                      ref.offset:=0;
                    end;
                end;
              AM_POSTINDEXED:
                begin
                  { cannot emulate post-indexing if we have to fold the
                    offset into the base register }
                  if (ref.offset<-256) or
                     (ref.offset>255) then
                    internalerror(2014110909);
                  { ok }
                end;
              AM_OFFSET:
                begin
                  { unsupported offset -> fold into base register }
                  accesssize:=1 shl tcgsizep2size[size];
                  if (ref.offset<0) or
                     (ref.offset>(((1 shl 12)-1)*accesssize)) or
                     ((ref.offset mod accesssize)<>0) then
                    begin
                      if preferred_newbasereg=NR_NO then
                        preferred_newbasereg:=getaddressregister(list);
                      { can we split the offset between an
                        "add/sub (imm12 shl 12)" and the load (also an
                        imm12)?
                        -- the offset from the load will always be added,
                        that's why the lower bound has a smaller range
                        than the upper bound; it must also be a multiple
                        of the access size }
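                      { e.g. a 4 byte access at offset 70000 ($11170):
                          add xN, base, #69632    // $11 shl 12, an imm12
                          ldr wD, [xN, #368]      // remainder fits the
                                                  // scaled imm12
                        (register numbers xN/wD are illustrative) }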
                      if (ref.offset>=-(((1 shl 12)-1) shl 12)) and
                         (ref.offset<=((1 shl 12)-1) shl 12 + ((1 shl 12)-1)) and
                         ((ref.offset mod accesssize)=0) then
                        begin
                          a_op_const_reg_reg(list,OP_ADD,OS_ADDR,(ref.offset shr 12) shl 12,ref.base,preferred_newbasereg);
                          ref.offset:=ref.offset-(ref.offset shr 12) shl 12;
                        end
                      else
                        begin
                          a_op_const_reg_reg(list,OP_ADD,OS_ADDR,ref.offset,ref.base,preferred_newbasereg);
                          ref.offset:=0;
                        end;
                      reference_reset_base(ref,preferred_newbasereg,ref.offset,ref.alignment);
                    end;
                end
              else
                internalerror(2014110904);
            end;
          end;
        A_LDP,A_STP:
          begin
            { unsupported offset -> fold into base register (these
              instructions support all addressmodes) }
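            { ldp/stp take a signed 7 bit offset scaled by the access size,
              e.g. multiples of 8 in [-512,504] for 64 bit registers }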
            if (ref.offset<-(1 shl (6+tcgsizep2size[size]))) or
               (ref.offset>(1 shl (6+tcgsizep2size[size]))-1) then
              begin
                case ref.addressmode of
                  AM_POSTINDEXED:
                    { don't emulate post-indexing if we have to fold the
                      offset into the base register }
                    internalerror(2014110910);
                  AM_PREINDEXED:
                    { this means the offset must be added to the current
                      base register }
                    preferred_newbasereg:=ref.base;
                  AM_OFFSET:
                    if preferred_newbasereg=NR_NO then
                      preferred_newbasereg:=getaddressregister(list);
                end;
                a_op_const_reg_reg(list,OP_ADD,OS_ADDR,ref.offset,ref.base,preferred_newbasereg);
                reference_reset_base(ref,preferred_newbasereg,0,ref.alignment);
              end
          end;
        A_LDUR,A_STUR:
          begin
            { valid, checked above }
          end;
        { todo }
        A_LD1,A_LD2,A_LD3,A_LD4,
        A_ST1,A_ST2,A_ST3,A_ST4:
          internalerror(2014110908);
        else
          internalerror(2014110708);
      end;
      { done }
      exit;
    end;
  { only an offset -> change to base (+ offset 0) }
  if preferred_newbasereg=NR_NO then
    preferred_newbasereg:=getaddressregister(list);
  a_load_const_reg(list,OS_ADDR,ref.offset,preferred_newbasereg);
  reference_reset_base(ref,preferred_newbasereg,0,newalignment(8,ref.offset));
end;


function tcgaarch64.makeregsize(reg: tregister; size: tcgsize): tregister;
var
  subreg:Tsubregister;
begin
  subreg:=cgsize2subreg(getregtype(reg),size);
  result:=reg;
  setsubreg(result,subreg);
end;


function tcgaarch64.getfpuregister(list: TAsmList; size: Tcgsize): Tregister;
begin
  internalerror(2014122110);
  { squash warning }
  result:=NR_NO;
end;


function tcgaarch64.handle_load_store(list: TAsmList; op: tasmop; size: tcgsize; oppostfix: toppostfix; reg: tregister; ref: treference):treference;
begin
  make_simple_ref(list,op,size,oppostfix,ref,NR_NO);
  list.concat(setoppostfix(taicpu.op_reg_ref(op,reg,ref),oppostfix));
  result:=ref;
end;


procedure tcgaarch64.handle_reg_imm12_reg(list: TAsmList; op: Tasmop; size: tcgsize; src: tregister; a: tcgint; dst: tregister; tmpreg: tregister; setflags, usedest: boolean);
var
  instr: taicpu;
  so: tshifterop;
  hadtmpreg: boolean;
begin
  { imm12 }
  if (a>=0) and
     (a<=((1 shl 12)-1)) then
    if usedest then
      instr:=taicpu.op_reg_reg_const(op,dst,src,a)
    else
      instr:=taicpu.op_reg_const(op,src,a)
  { imm12 lsl 12 }
  else if (a and not(((tcgint(1) shl 12)-1) shl 12))=0 then
    begin
      so.shiftmode:=SM_LSL;
      so.shiftimm:=12;
      if usedest then
        instr:=taicpu.op_reg_reg_const_shifterop(op,dst,src,a shr 12,so)
      else
        instr:=taicpu.op_reg_const_shifterop(op,src,a shr 12,so)
    end
  else
    begin
      { todo: other possible optimizations (e.g. load 16 bit constant in
        register and then add/sub/cmp/cmn shifted the rest) }
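      { e.g. a=$5000 is handled above as "op dst,src,#5,lsl #12", while
        a=$5001 has bits set in both halves and falls through to here,
        where it is first materialized in a temporary register }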
      if tmpreg=NR_NO then
        begin
          hadtmpreg:=false;
          tmpreg:=getintregister(list,size);
        end
      else
        begin
          hadtmpreg:=true;
          getcpuregister(list,tmpreg);
        end;
      a_load_const_reg(list,size,a,tmpreg);
      if usedest then
        instr:=taicpu.op_reg_reg_reg(op,dst,src,tmpreg)
      else
        instr:=taicpu.op_reg_reg(op,src,tmpreg);
      if hadtmpreg then
        ungetcpuregister(list,tmpreg);
    end;
  if setflags then
    setoppostfix(instr,PF_S);
  list.concat(instr);
end;


{****************************************************************************
                              Assembler code
 ****************************************************************************}

procedure tcgaarch64.init_register_allocators;
begin
  inherited init_register_allocators;
  rg[R_INTREGISTER]:=trgintcpu.create(R_INTREGISTER,R_SUBWHOLE,
    [RS_X0,RS_X1,RS_X2,RS_X3,RS_X4,RS_X5,RS_X6,RS_X7,RS_X8,
     RS_X9,RS_X10,RS_X11,RS_X12,RS_X13,RS_X14,RS_X15,RS_X16,RS_X17,
     RS_X19,RS_X20,RS_X21,RS_X22,RS_X23,RS_X24,RS_X25,RS_X26,RS_X27,RS_X28
     { maybe we can enable this in the future for leaf functions (it's
       the frame pointer)
      ,RS_X29 }],
    first_int_imreg,[]);
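  { note that RS_X18 is also left out of the list: it is the platform
    register, reserved on several targets; RS_X30 is the link register }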
  rg[R_MMREGISTER]:=trgcpu.create(R_MMREGISTER,R_SUBMMD,
    [RS_Q0,RS_Q1,RS_Q2,RS_Q3,RS_Q4,RS_Q5,RS_Q6,RS_Q7,
     RS_Q8,RS_Q9,RS_Q10,RS_Q11,RS_Q12,RS_Q13,RS_Q14,RS_Q15,
     RS_Q16,RS_Q17,RS_Q18,RS_Q19,RS_Q20,RS_Q21,RS_Q22,RS_Q23,
     RS_Q24,RS_Q25,RS_Q26,RS_Q27,RS_Q28,RS_Q29,RS_Q30,RS_Q31],
    first_mm_imreg,[]);
end;


procedure tcgaarch64.done_register_allocators;
begin
  rg[R_INTREGISTER].free;
  rg[R_FPUREGISTER].free;
  rg[R_MMREGISTER].free;
  inherited done_register_allocators;
end;


function tcgaarch64.getmmregister(list: TAsmList; size: tcgsize):tregister;
begin
  case size of
    OS_F32:
      result:=rg[R_MMREGISTER].getregister(list,R_SUBMMS);
    OS_F64:
      result:=rg[R_MMREGISTER].getregister(list,R_SUBMMD)
    else
      internalerror(2014102701);
  end;
end;


procedure tcgaarch64.a_call_name(list: TAsmList; const s: string; weak: boolean);
begin
  if not weak then
    list.concat(taicpu.op_sym(A_BL,current_asmdata.RefAsmSymbol(s,AT_FUNCTION)))
  else
    list.concat(taicpu.op_sym(A_BL,current_asmdata.WeakRefAsmSymbol(s,AT_FUNCTION)));
end;


procedure tcgaarch64.a_call_reg(list:TAsmList;Reg:tregister);
begin
  list.concat(taicpu.op_reg(A_BLR,reg));
end;


{********************** load instructions ********************}

procedure tcgaarch64.a_load_const_reg(list: TAsmList; size: tcgsize; a: tcgint; reg : tregister);
var
  preva: tcgint;
  opc: tasmop;
  shift,maxshift: byte;
  so: tshifterop;
  reginited: boolean;
  mask: tcgint;
begin
  { if we load a value into a 32 bit register, it is automatically
    zero-extended to 64 bit }
  if (high(a)=0) and
     (size in [OS_64,OS_S64]) then
    begin
      size:=OS_32;
      reg:=makeregsize(reg,size);
    end;
  { values <= 32 bit are stored in a 32 bit register }
  if not(size in [OS_64,OS_S64]) then
    a:=cardinal(a);
  if size in [OS_64,OS_S64] then
    begin
      mask:=-1;
      maxshift:=64;
    end
  else
    begin
      mask:=$ffffffff;
      maxshift:=32;
    end;
  { single movn enough? (to be extended) }
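  { e.g. $FFFFFFFFFFFF1234 needs just "movn reg,#$EDCB", while a value
    like $12345678 is built below as "movz reg,#$5678" followed by
    "movk reg,#$1234,lsl #16" }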
  shift:=16;
  preva:=a;
  repeat
    if (a shr shift)=(mask shr shift) then
      begin
        if shift=16 then
          list.concat(taicpu.op_reg_const(A_MOVN,reg,not(word(preva))))
        else
          begin
            shifterop_reset(so);
            so.shiftmode:=SM_LSL;
            so.shiftimm:=shift-16;
            list.concat(taicpu.op_reg_const_shifterop(A_MOVN,reg,not(word(preva)),so));
          end;
        exit;
      end;
    { only try the next 16 bits if the current one is all 1 bits, since
      the movn will set all lower bits to 1 }
    if word(a shr (shift-16))<>$ffff then
      break;
    inc(shift,16);
  until shift=maxshift;
  reginited:=false;
  shift:=0;
  { can be optimized later to use more movn }
  repeat
    { leftover is shifterconst? (don't check if we can represent it just
      as effectively with movz/movk, as this check is expensive) }
    if ((shift<tcgsize2size[size]*(8 div 2)) and
        (word(a)<>0) and
        ((a shr 16)<>0)) and
       is_shifter_const(a shl shift,size) then
      begin
        if reginited then
          list.concat(taicpu.op_reg_reg_const(A_ORR,reg,reg,a shl shift))
        else
          list.concat(taicpu.op_reg_reg_const(A_ORR,reg,makeregsize(NR_XZR,size),a shl shift));
        exit;
      end;
    { set all 16 bit parts <> 0 }
    if (word(a)<>0) or
       ((shift=0) and
        (a=0)) then
      if shift=0 then
        begin
          list.concat(taicpu.op_reg_const(A_MOVZ,reg,word(a)));
          reginited:=true;
        end
      else
        begin
          shifterop_reset(so);
          so.shiftmode:=SM_LSL;
          so.shiftimm:=shift;
          if not reginited then
            begin
              opc:=A_MOVZ;
              reginited:=true;
            end
          else
            opc:=A_MOVK;
          list.concat(taicpu.op_reg_const_shifterop(opc,reg,word(a),so));
        end;
    preva:=a;
    a:=a shr 16;
    inc(shift,16);
  until word(preva)=preva;
  if not reginited then
    internalerror(2014102702);
end;


procedure tcgaarch64.a_load_const_ref(list: TAsmList; size: tcgsize; a: tcgint; const ref: treference);
var
  reg: tregister;
begin
  { use the zero register if possible }
  if a=0 then
    begin
      if size in [OS_64,OS_S64] then
        reg:=NR_XZR
      else
        reg:=NR_WZR;
      a_load_reg_ref(list,size,size,reg,ref);
    end
  else
    inherited;
end;


procedure tcgaarch64.a_load_reg_ref(list: TAsmList; fromsize, tosize: tcgsize; reg: tregister; const ref: treference);
var
  oppostfix:toppostfix;
  hreg: tregister;
begin
  if tcgsize2Size[fromsize]>=tcgsize2Size[tosize] then
    fromsize:=tosize
  { have a 32 bit register but need a 64 bit one? }
  else if tosize in [OS_64,OS_S64] then
    begin
      { sign extend if necessary }
      if fromsize in [OS_S8,OS_S16,OS_S32] then
        begin
          { can't overwrite reg, may be a constant reg }
          hreg:=getintregister(list,tosize);
          a_load_reg_reg(list,fromsize,tosize,reg,hreg);
          reg:=hreg;
        end
      else
        { top 32 bits are zero by default }
        reg:=makeregsize(reg,OS_64);
      fromsize:=tosize;
    end;
  if (ref.alignment<>0) and
     (ref.alignment<tcgsize2size[tosize]) then
    begin
      a_load_reg_ref_unaligned(list,fromsize,tosize,reg,ref);
    end
  else
    begin
      case tosize of
        { signed integer registers }
        OS_8,
        OS_S8:
          oppostfix:=PF_B;
        OS_16,
        OS_S16:
          oppostfix:=PF_H;
        OS_32,
        OS_S32,
        OS_64,
        OS_S64:
          oppostfix:=PF_None;
        else
          InternalError(200308299);
      end;
      handle_load_store(list,A_STR,tosize,oppostfix,reg,ref);
    end;
end;


procedure tcgaarch64.a_load_ref_reg(list: TAsmList; fromsize, tosize: tcgsize; const ref: treference; reg: tregister);
var
  oppostfix:toppostfix;
begin
  if tcgsize2Size[fromsize]>=tcgsize2Size[tosize] then
    fromsize:=tosize;
  { ensure that all bits of the 32/64 register are always correctly set:
    * default behaviour is always to zero-extend to the entire (64 bit)
      register -> unsigned 8/16/32 bit loads only exist with a 32 bit
      target register, as the upper 32 bit will be zeroed implicitly
      -> always make target register 32 bit
    * signed loads exist both with 32 and 64 bit target registers,
      depending on whether the value should be sign extended to 32 or
      to 64 bit (if sign extended to 32 bit, the upper 32 bits of the
      corresponding 64 bit register are again zeroed) -> no need to
      change anything (we only have 32 and 64 bit registers), except that
      when loading an OS_S32 to a 32 bit register, we don't need/can't
      use sign extension }
  if fromsize in [OS_8,OS_16,OS_32] then
    reg:=makeregsize(reg,OS_32);
  if (ref.alignment<>0) and
     (ref.alignment<tcgsize2size[fromsize]) then
    begin
      a_load_ref_reg_unaligned(list,fromsize,tosize,ref,reg);
      exit;
    end;
  case fromsize of
    { signed integer registers }
    OS_8:
      oppostfix:=PF_B;
    OS_S8:
      oppostfix:=PF_SB;
    OS_16:
      oppostfix:=PF_H;
    OS_S16:
      oppostfix:=PF_SH;
    OS_S32:
      if getsubreg(reg)=R_SUBD then
        oppostfix:=PF_NONE
      else
        oppostfix:=PF_SW;
    OS_32,
    OS_64,
    OS_S64:
      oppostfix:=PF_None;
    else
      InternalError(200308297);
  end;
  handle_load_store(list,A_LDR,fromsize,oppostfix,reg,ref);
  { clear upper 16 bits if the value was negative }
  if (fromsize=OS_S8) and (tosize=OS_16) then
    a_load_reg_reg(list,fromsize,tosize,reg,reg);
end;
procedure tcgaarch64.a_load_ref_reg_unaligned(list: TAsmList; fromsize, tosize: tcgsize; const ref: treference; register: tregister);
var
  href: treference;
  hreg1, hreg2, tmpreg: tregister;
begin
  if fromsize in [OS_64,OS_S64] then
    begin
      { split into two 32 bit loads }
      hreg1:=getintregister(list,OS_32);
      hreg2:=getintregister(list,OS_32);
      if target_info.endian=endian_big then
        begin
          tmpreg:=hreg1;
          hreg1:=hreg2;
          hreg2:=tmpreg;
        end;
      { can we use LDP? }
      if (ref.alignment=4) and
         (simple_ref_type(A_LDP,OS_32,PF_None,ref)=sr_simple) then
        list.concat(taicpu.op_reg_reg_ref(A_LDP,hreg1,hreg2,ref))
      else
        begin
          a_load_ref_reg(list,OS_32,OS_32,ref,hreg1);
          href:=ref;
          inc(href.offset,4);
          a_load_ref_reg(list,OS_32,OS_32,href,hreg2);
        end;
      a_load_reg_reg(list,OS_32,OS_64,hreg1,register);
      list.concat(taicpu.op_reg_reg_const_const(A_BFI,register,makeregsize(hreg2,OS_64),32,32));
    end
  else
    inherited;
end;


procedure tcgaarch64.a_load_reg_reg(list:TAsmList;fromsize,tosize:tcgsize;reg1,reg2:tregister);
var
  instr: taicpu;
begin
  { we use both 32 and 64 bit registers -> insert conversion when we
    have to truncate/sign extend inside the (32 or 64 bit) register
    holding the value, and when we sign extend from a 32 to a 64 bit
    register }
  if (tcgsize2size[fromsize]>tcgsize2size[tosize]) or
     ((tcgsize2size[fromsize]=tcgsize2size[tosize]) and
      (fromsize<>tosize) and
      not(fromsize in [OS_32,OS_S32,OS_64,OS_S64])) or
     ((fromsize in [OS_S8,OS_S16,OS_S32]) and
      (tosize in [OS_64,OS_S64])) or
     { needs to mask out the sign in the top 16 bits }
     ((fromsize=OS_S8) and
      (tosize=OS_16)) then
    begin
      case tosize of
        OS_8:
          list.concat(setoppostfix(taicpu.op_reg_reg(A_UXT,reg2,makeregsize(reg1,OS_32)),PF_B));
        OS_16:
          list.concat(setoppostfix(taicpu.op_reg_reg(A_UXT,reg2,makeregsize(reg1,OS_32)),PF_H));
        OS_S8:
          list.concat(setoppostfix(taicpu.op_reg_reg(A_SXT,reg2,makeregsize(reg1,OS_32)),PF_B));
        OS_S16:
          list.concat(setoppostfix(taicpu.op_reg_reg(A_SXT,reg2,makeregsize(reg1,OS_32)),PF_H));
        { while "mov wN, wM" automatically inserts a zero-extension and
          hence we could encode a 64->32 bit move like that, the problem
          is that we then can't distinguish 64->32 from 32->32 moves, and
          the 64->32 truncation could be removed altogether... So use a
          different instruction }
        OS_32,
        OS_S32:
          { in theory, reg1 should be 64 bit here (since fromsize>tosize),
            but because of the way location_force_register() tries to
            avoid superfluous zero/sign extensions, it's not always the
            case -> also force reg1 to 64 bit }
          list.concat(taicpu.op_reg_reg_const_const(A_UBFIZ,makeregsize(reg2,OS_64),makeregsize(reg1,OS_64),0,32));
        OS_64,
        OS_S64:
          list.concat(setoppostfix(taicpu.op_reg_reg(A_SXT,reg2,makeregsize(reg1,OS_32)),PF_W));
        else
          internalerror(2002090901);
      end;
    end
  else
    begin
      { 32 -> 32 bit move implies zero extension (sign extensions have
        been handled above) -> also use for 32 <-> 64 bit moves }
      if not(fromsize in [OS_64,OS_S64]) or
         not(tosize in [OS_64,OS_S64]) then
        instr:=taicpu.op_reg_reg(A_MOV,makeregsize(reg2,OS_32),makeregsize(reg1,OS_32))
      else
        instr:=taicpu.op_reg_reg(A_MOV,reg2,reg1);
      list.Concat(instr);
      { Notify the register allocator that we have written a move
        instruction so it can try to eliminate it. }
      add_move_instruction(instr);
    end;
end;


procedure tcgaarch64.a_loadaddr_ref_reg(list: TAsmList; const ref: treference; r: tregister);
var
  href: treference;
  so: tshifterop;
  op: tasmop;
begin
  op:=A_LDR;
  href:=ref;
  { simplify as if we're going to perform a regular 64 bit load, using
    "r" as the new base register if possible/necessary }
  make_simple_ref(list,op,OS_ADDR,PF_None,href,r);
  { load literal? }
  if assigned(href.symbol) then
    begin
      if (href.base<>NR_NO) or
         (href.index<>NR_NO) or
         not assigned(href.symboldata) then
        internalerror(2014110912);
      list.concat(taicpu.op_reg_sym_ofs(A_ADR,r,href.symbol,href.offset));
    end
  else
    begin
      if href.index<>NR_NO then
        begin
          if href.shiftmode<>SM_None then
            begin
              { "add" supports a superset of the shift modes supported by
                load/store instructions }
              shifterop_reset(so);
              so.shiftmode:=href.shiftmode;
              so.shiftimm:=href.shiftimm;
              list.concat(taicpu.op_reg_reg_reg_shifterop(A_ADD,r,href.base,href.index,so));
            end
          else
            a_op_reg_reg_reg(list,OP_ADD,OS_ADDR,href.index,href.base,r);
        end
      else if href.offset<>0 then
        a_op_const_reg_reg(list,OP_ADD,OS_ADDR,href.offset,href.base,r)
      else
        a_load_reg_reg(list,OS_ADDR,OS_ADDR,href.base,r);
    end;
end;


procedure tcgaarch64.a_loadfpu_reg_reg(list: TAsmList; fromsize, tosize: tcgsize; reg1, reg2: tregister);
begin
  internalerror(2014122107)
end;


procedure tcgaarch64.a_loadfpu_ref_reg(list: TAsmList; fromsize, tosize: tcgsize; const ref: treference; reg: tregister);
begin
  internalerror(2014122108)
end;


procedure tcgaarch64.a_loadfpu_reg_ref(list: TAsmList; fromsize, tosize: tcgsize; reg: tregister; const ref: treference);
begin
  internalerror(2014122109)
end;


procedure tcgaarch64.a_loadmm_reg_reg(list: TAsmList; fromsize, tosize: tcgsize; reg1, reg2: tregister; shuffle: pmmshuffle);
var
  instr: taicpu;
begin
  if assigned(shuffle) and
     not shufflescalar(shuffle) then
    internalerror(2014122104);
  if fromsize=tosize then
    begin
      instr:=taicpu.op_reg_reg(A_FMOV,reg2,reg1);
      { Notify the register allocator that we have written a move
        instruction so it can try to eliminate it. }
      add_move_instruction(instr);
    end
  else
    begin
      if (reg_cgsize(reg1)<>fromsize) or
         (reg_cgsize(reg2)<>tosize) then
        internalerror(2014110913);
      instr:=taicpu.op_reg_reg(A_FCVT,reg2,reg1);
    end;
  list.Concat(instr);
end;


procedure tcgaarch64.a_loadmm_ref_reg(list: TAsmList; fromsize, tosize: tcgsize; const ref: treference; reg: tregister; shuffle: pmmshuffle);
var
  tmpreg: tregister;
begin
  if assigned(shuffle) and
     not shufflescalar(shuffle) then
    internalerror(2014122105);
  tmpreg:=NR_NO;
  if (fromsize<>tosize) then
    begin
      tmpreg:=reg;
      reg:=getmmregister(list,fromsize);
    end;
  handle_load_store(list,A_LDR,fromsize,PF_None,reg,ref);
  if (fromsize<>tosize) then
    a_loadmm_reg_reg(list,fromsize,tosize,reg,tmpreg,nil);
end;


procedure tcgaarch64.a_loadmm_reg_ref(list: TAsmList; fromsize, tosize: tcgsize; reg: tregister; const ref: treference; shuffle: pmmshuffle);
var
  tmpreg: tregister;
begin
  if assigned(shuffle) and
     not shufflescalar(shuffle) then
    internalerror(2014122106);
  if (fromsize<>tosize) then
    begin
      tmpreg:=getmmregister(list,tosize);
      a_loadmm_reg_reg(list,fromsize,tosize,reg,tmpreg,nil);
      reg:=tmpreg;
    end;
  handle_load_store(list,A_STR,tosize,PF_NONE,reg,ref);
end;
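{ ins/umov transfer a scalar directly between the integer and the vector
  register files (lane 0) without a round trip through memory }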
procedure tcgaarch64.a_loadmm_intreg_reg(list: TAsmList; fromsize, tosize: tcgsize; intreg, mmreg: tregister; shuffle: pmmshuffle);
begin
  if not shufflescalar(shuffle) then
    internalerror(2014122801);
  if not(tcgsize2size[fromsize] in [4,8]) or
     (tcgsize2size[fromsize]<>tcgsize2size[tosize]) then
    internalerror(2014122803);
  list.concat(taicpu.op_reg_reg(A_INS,mmreg,intreg));
end;


procedure tcgaarch64.a_loadmm_reg_intreg(list: TAsmList; fromsize, tosize: tcgsize; mmreg, intreg: tregister; shuffle: pmmshuffle);
begin
  if not shufflescalar(shuffle) then
    internalerror(2014122802);
  if not(tcgsize2size[fromsize] in [4,8]) or
     (tcgsize2size[fromsize]<>tcgsize2size[tosize]) then
    internalerror(2014122804);
  list.concat(taicpu.op_reg_reg(A_UMOV,intreg,mmreg));
end;


procedure tcgaarch64.a_opmm_reg_reg(list: TAsmList; Op: TOpCG; size: tcgsize; src, dst: tregister; shuffle: pmmshuffle);
begin
  case op of
    { "xor Vx,Vx" is used to initialize global regvars to 0 }
    OP_XOR:
      begin
        if (src<>dst) or
           (reg_cgsize(src)<>size) or
           assigned(shuffle) then
          internalerror(2015011401);
        case size of
          OS_F32,
          OS_F64:
            list.concat(taicpu.op_reg_const(A_MOVI,makeregsize(dst,OS_F64),0));
          else
            internalerror(2015011402);
        end;
      end
    else
      internalerror(2015011403);
  end;
end;


procedure tcgaarch64.a_bit_scan_reg_reg(list: TAsmList; reverse: boolean; srcsize, dstsize: tcgsize; src, dst: TRegister);
var
  bitsize,
  signbit: longint;
begin
  if srcsize in [OS_64,OS_S64] then
    begin
      bitsize:=64;
      signbit:=6;
    end
  else
    begin
      bitsize:=32;
      signbit:=5;
    end;
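  { e.g. a 32 bit forward scan of %1000: rbit moves the set bit to
    position 28 and clz then returns 3; a reverse scan of the same value:
    clz returns 28, and 28 xor 31 = 3 }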
  { source is 0 -> dst will have to become 255 }
  list.concat(taicpu.op_reg_const(A_CMP,src,0));
  if reverse then
    begin
      list.Concat(taicpu.op_reg_reg(A_CLZ,makeregsize(dst,srcsize),src));
      { xor 31/63 is the same as setting the lower 5/6 bits to
        "31/63-(lower 5/6 bits of dst)" }
      list.Concat(taicpu.op_reg_reg_const(A_EOR,dst,dst,bitsize-1));
    end
  else
    begin
      list.Concat(taicpu.op_reg_reg(A_RBIT,makeregsize(dst,srcsize),src));
      list.Concat(taicpu.op_reg_reg(A_CLZ,dst,dst));
    end;
  { set dst to -1 if src was 0 }
  list.Concat(taicpu.op_reg_reg_reg_cond(A_CSINV,dst,dst,makeregsize(NR_XZR,dstsize),C_NE));
  { mask the -1 to 255 if src was 0 (anyone find a two-instruction
    branch-free version? All of mine are 3...) }
  list.Concat(setoppostfix(taicpu.op_reg_reg(A_UXT,makeregsize(dst,OS_32),makeregsize(dst,OS_32)),PF_B));
end;


procedure tcgaarch64.a_load_reg_ref_unaligned(list: TAsmList; fromsize, tosize: tcgsize; register: tregister; const ref: treference);
var
  href: treference;
  hreg1, hreg2, tmpreg: tregister;
begin
  if fromsize in [OS_64,OS_S64] then
    begin
      { split into two 32 bit stores }
      hreg1:=getintregister(list,OS_32);
      hreg2:=getintregister(list,OS_32);
      a_load_reg_reg(list,OS_32,OS_32,makeregsize(register,OS_32),hreg1);
      a_op_const_reg_reg(list,OP_SHR,OS_64,32,register,makeregsize(hreg2,OS_64));
      if target_info.endian=endian_big then
        begin
          tmpreg:=hreg1;
          hreg1:=hreg2;
          hreg2:=tmpreg;
        end;
      { can we use STP? }
      if (ref.alignment=4) and
         (simple_ref_type(A_STP,OS_32,PF_None,ref)=sr_simple) then
        list.concat(taicpu.op_reg_reg_ref(A_STP,hreg1,hreg2,ref))
      else
        begin
          a_load_reg_ref(list,OS_32,OS_32,hreg1,ref);
          href:=ref;
          inc(href.offset,4);
          a_load_reg_ref(list,OS_32,OS_32,hreg2,href);
        end;
    end
  else
    inherited;
end;


procedure tcgaarch64.maybeadjustresult(list: TAsmList; op: topcg; size: tcgsize; dst: tregister);
const
  overflowops = [OP_MUL,OP_IMUL,OP_SHL,OP_ADD,OP_SUB,OP_NOT,OP_NEG];
begin
  if (op in overflowops) and
     (size in [OS_8,OS_S8,OS_16,OS_S16]) then
    a_load_reg_reg(list,OS_32,size,makeregsize(dst,OS_32),makeregsize(dst,OS_32))
end;


procedure tcgaarch64.a_op_const_reg(list: TAsmList; op: topcg; size: tcgsize; a: tcgint; reg: tregister);
begin
  optimize_op_const(size,op,a);
  case op of
    OP_NONE:
      exit;
    OP_MOVE:
      a_load_const_reg(list,size,a,reg);
    OP_NEG,OP_NOT:
      internalerror(200306011);
    else
      a_op_const_reg_reg(list,op,size,a,reg,reg);
  end;
end;


procedure tcgaarch64.a_op_reg_reg(list:TAsmList;op:topcg;size:tcgsize;src,dst:tregister);
begin
  case op of
    OP_NEG,
    OP_NOT:
      begin
        list.concat(taicpu.op_reg_reg(TOpCG2AsmOpReg[op],dst,src));
        maybeadjustresult(list,op,size,dst);
      end
    else
      a_op_reg_reg_reg(list,op,size,src,dst,dst);
  end;
end;


procedure tcgaarch64.a_op_const_reg_reg(list: TAsmList; op: topcg; size: tcgsize; a: tcgint; src, dst: tregister);
var
  l: tlocation;
begin
  a_op_const_reg_reg_checkoverflow(list,op,size,a,src,dst,false,l);
end;


procedure tcgaarch64.a_op_reg_reg_reg(list: TAsmList; op: topcg; size: tcgsize; src1, src2, dst: tregister);
var
  hreg: tregister;
begin
  { no ROLV opcode... }
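  { rol is therefore emulated as ror with the complemented count:
    rol(x,n) = ror(x,bits-n), e.g. a 32 bit rol by 8 = ror by 24 }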
  1106. if op=OP_ROL then
  1107. begin
  1108. case size of
  1109. OS_32,OS_S32,
  1110. OS_64,OS_S64:
  1111. begin
  1112. hreg:=getintregister(list,size);
  1113. a_load_const_reg(list,size,tcgsize2size[size]*8,hreg);
  1114. a_op_reg_reg(list,OP_SUB,size,src1,hreg);
  1115. a_op_reg_reg_reg(list,OP_ROR,size,hreg,src2,dst);
  1116. exit;
  1117. end;
  1118. else
  1119. internalerror(2014111005);
  1120. end;
  1121. end
  1122. else if (op=OP_ROR) and
  1123. not(size in [OS_32,OS_S32,OS_64,OS_S64]) then
  1124. internalerror(2014111006);
  1125. if TOpCG2AsmOpReg[op]=A_NONE then
  1126. internalerror(2014111007);
  1127. list.concat(taicpu.op_reg_reg_reg(TOpCG2AsmOpReg[op],dst,src2,src1));
  1128. maybeadjustresult(list,op,size,dst);
  1129. end;
  1130. procedure tcgaarch64.a_op_const_reg_reg_checkoverflow(list: TAsmList; op: topcg; size: tcgsize; a: tcgint; src, dst: tregister; setflags : boolean; var ovloc : tlocation);
  1131. var
  1132. shiftcountmask: longint;
  1133. constreg: tregister;
  1134. begin
  1135. { add/sub instructions have only positive immediate operands }
  1136. if (op in [OP_ADD,OP_SUB]) and
  1137. (a<0) then
  1138. begin
  1139. if op=OP_ADD then
  1140. op:=op_SUB
  1141. else
  1142. op:=OP_ADD;
  1143. { avoid range/overflow error in case a = low(tcgint) }
  1144. {$push}{$r-}{$q-}
  1145. a:=-a;
  1146. {$pop}
  1147. end;
  1148. ovloc.loc:=LOC_VOID;
  1149. optimize_op_const(size,op,a);
  1150. case op of
  1151. OP_NONE:
  1152. begin
  1153. a_load_reg_reg(list,size,size,src,dst);
  1154. exit;
  1155. end;
  1156. OP_MOVE:
  1157. begin
  1158. a_load_const_reg(list,size,a,dst);
  1159. exit;
  1160. end;
  1161. end;
  1162. case op of
  1163. OP_ADD,
  1164. OP_SUB:
  1165. begin
  1166. handle_reg_imm12_reg(list,TOpCG2AsmOpImm[op],size,src,a,dst,NR_NO,setflags,true);
  1167. { on a 64 bit target, overflows with smaller data types
  1168. are handled via range errors }
  1169. if setflags and
  1170. (size in [OS_64,OS_S64]) then
  1171. begin
  1172. location_reset(ovloc,LOC_FLAGS,OS_8);
  1173. if size=OS_64 then
  1174. if op=OP_ADD then
  1175. ovloc.resflags:=F_CS
  1176. else
  1177. ovloc.resflags:=F_CC
  1178. else
  1179. ovloc.resflags:=F_VS;
  1180. end;
  1181. end;
  1182. OP_OR,
  1183. OP_AND,
  1184. OP_XOR:
  1185. begin
  1186. if not(size in [OS_64,OS_S64]) then
  1187. a:=cardinal(a);
  1188. if is_shifter_const(a,size) then
  1189. list.concat(taicpu.op_reg_reg_const(TOpCG2AsmOpReg[op],dst,src,a))
  1190. else
  1191. begin
  1192. constreg:=getintregister(list,size);
  1193. a_load_const_reg(list,size,a,constreg);
  1194. a_op_reg_reg_reg(list,op,size,constreg,src,dst);
  1195. end;
  1196. end;
  1197. OP_SHL,
  1198. OP_SHR,
  1199. OP_SAR:
  1200. begin
  1201. if size in [OS_64,OS_S64] then
  1202. shiftcountmask:=63
  1203. else
  1204. shiftcountmask:=31;
  1205. if (a and shiftcountmask)<>0 Then
  1206. list.concat(taicpu.op_reg_reg_const(
  1207. TOpCG2AsmOpImm[Op],dst,src,a and shiftcountmask))
  1208. else
  1209. a_load_reg_reg(list,size,size,src,dst);
  1210. if (a and not(tcgint(shiftcountmask)))<>0 then
  1211. internalError(2014112101);
  1212. end;
  1213. OP_ROL,
  1214. OP_ROR:
  1215. begin
  1216. case size of
  1217. OS_32,OS_S32:
  1218. if (a and not(tcgint(31)))<>0 then
  1219. internalError(2014112102);
  1220. OS_64,OS_S64:
  1221. if (a and not(tcgint(63)))<>0 then
  1222. internalError(2014112103);
  1223. else
  1224. internalError(2014112104);
  1225. end;
  1226. { there's only a ror opcode }
  1227. if op=OP_ROL then
  1228. a:=(tcgsize2size[size]*8)-a;
  1229. list.concat(taicpu.op_reg_reg_const(A_ROR,dst,src,a));
  1230. end;
  1231. OP_MUL,
  1232. OP_IMUL,
  1233. OP_DIV,
  1234. OP_IDIV:
  1235. begin
  1236. constreg:=getintregister(list,size);
  1237. a_load_const_reg(list,size,a,constreg);
  1238. a_op_reg_reg_reg_checkoverflow(list,op,size,constreg,src,dst,setflags,ovloc);
  1239. end;
  1240. else
  1241. internalerror(2014111403);
  1242. end;
  1243. maybeadjustresult(list,op,size,dst);
  1244. end;
  1245. procedure tcgaarch64.a_op_reg_reg_reg_checkoverflow(list: TAsmList; op: topcg; size: tcgsize; src1, src2, dst: tregister; setflags : boolean; var ovloc : tlocation);
  1246. var
  1247. tmpreg1, tmpreg2: tregister;
  1248. begin
  1249. ovloc.loc:=LOC_VOID;
  1250. { overflow can only occur with 64 bit calculations on 64 bit cpus }
  1251. if setflags and
  1252. (size in [OS_64,OS_S64]) then
  1253. begin
  1254. case op of
  1255. OP_ADD,
  1256. OP_SUB:
  1257. begin
  1258. list.concat(setoppostfix(taicpu.op_reg_reg_reg(TOpCG2AsmOpReg[op],dst,src2,src1),PF_S));
  1259. ovloc.loc:=LOC_FLAGS;
  1260. if size=OS_64 then
  1261. if op=OP_ADD then
  1262. ovloc.resflags:=F_CS
  1263. else
  1264. ovloc.resflags:=F_CC
  1265. else
  1266. ovloc.resflags:=F_VS;
  1267. { finished }
  1268. exit;
  1269. end;
          OP_MUL:
            begin
              { check whether the upper 64 bit of the 128 bit product is 0 }
              tmpreg1:=getintregister(list,OS_64);
              list.concat(taicpu.op_reg_reg_reg(A_UMULH,tmpreg1,src2,src1));
              list.concat(taicpu.op_reg_const(A_CMP,tmpreg1,0));
              ovloc.loc:=LOC_FLAGS;
              ovloc.resflags:=F_NE;
              { still have to perform the actual multiplication }
            end;
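
          { Illustration (registers chosen for the example): for an unsigned
            64 bit "x0:=x1*x2" this arm plus the multiplication performed
            below yields roughly
              umulh x3, x2, x1
              cmp   x3, #0
              mul   x0, x2, x1
            with overflow signalled by "ne", i.e. the upper 64 bits of the
            full 128 bit product are non-zero; "mul" does not alter the
            flags, so the comparison result survives. }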
          OP_IMUL:
            begin
              { check whether the upper 64 bits of the 128 bit multiplication
                result have the same value as the replicated sign bit of the
                lower 64 bits }
              tmpreg1:=getintregister(list,OS_64);
              list.concat(taicpu.op_reg_reg_reg(A_SMULH,tmpreg1,src2,src1));
              { calculate lower 64 bits (afterwards, because dst may be
                equal to src1 or src2) }
              a_op_reg_reg_reg(list,op,size,src1,src2,dst);
              { replicate sign bit }
              tmpreg2:=getintregister(list,OS_64);
              a_op_const_reg_reg(list,OP_SAR,OS_S64,63,dst,tmpreg2);
              list.concat(taicpu.op_reg_reg(A_CMP,tmpreg1,tmpreg2));
              ovloc.loc:=LOC_FLAGS;
              ovloc.resflags:=F_NE;
              { finished }
              exit;
            end;
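
          { Illustration: for a signed 64 bit "x0:=x1*x2" the sequence above
            looks roughly like
              smulh x3, x2, x1
              mul   x0, x2, x1
              asr   x4, x0, #63
              cmp   x3, x4
            i.e. overflow ("ne") if the upper 64 bits differ from the sign
            extension of the lower 64 bits (registers chosen for the
            example). }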
          OP_IDIV,
          OP_DIV:
            begin
              { not handled here: it needs a division-by-zero check (dividing
                by zero just gives a 0 result on aarch64) and, for signed
                divisions, a low(int64) div -1 overflow check }
              internalerror(2014122101);
            end;
        end;
      end;
    a_op_reg_reg_reg(list,op,size,src1,src2,dst);
  end;

{*************** compare instructions ****************}

procedure tcgaarch64.a_cmp_const_reg_label(list: TAsmList; size: tcgsize; cmp_op: topcmp; a: tcgint; reg: tregister; l: tasmlabel);
  var
    op: tasmop;
  begin
    if a>=0 then
      op:=A_CMP
    else
      op:=A_CMN;
    { avoid range/overflow error in case a=low(tcgint) }
    {$push}{$r-}{$q-}
    handle_reg_imm12_reg(list,op,size,reg,abs(a),NR_XZR,NR_NO,false,false);
    {$pop}
    a_jmp_cond(list,cmp_op,l);
  end;
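
{ Note: "cmn reg,#imm" sets the flags for reg+imm, which is equivalent to
  comparing reg against -imm; comparing x0 against -5 can thus be encoded as
    cmn x0, #5
  which is why negative constants are handled with A_CMN above. }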

procedure tcgaarch64.a_cmp_reg_reg_label(list: TAsmList; size: tcgsize; cmp_op: topcmp; reg1,reg2: tregister; l: tasmlabel);
  begin
    list.concat(taicpu.op_reg_reg(A_CMP,reg2,reg1));
    a_jmp_cond(list,cmp_op,l);
  end;

procedure tcgaarch64.a_jmp_always(list: TAsmList; l: TAsmLabel);
  var
    ai: taicpu;
  begin
    ai:=TAiCpu.op_sym(A_B,current_asmdata.RefAsmSymbol(l.name,AT_FUNCTION));
    ai.is_jmp:=true;
    list.Concat(ai);
  end;

procedure tcgaarch64.a_jmp_name(list: TAsmList; const s: string);
  var
    ai: taicpu;
  begin
    ai:=TAiCpu.op_sym(A_B,current_asmdata.RefAsmSymbol(s,AT_FUNCTION));
    ai.is_jmp:=true;
    list.Concat(ai);
  end;

procedure tcgaarch64.a_jmp_cond(list: TAsmList; cond: TOpCmp; l: TAsmLabel);
  var
    ai: taicpu;
  begin
    ai:=TAiCpu.op_sym(A_B,l);
    ai.is_jmp:=true;
    ai.SetCondition(TOpCmp2AsmCond[cond]);
    list.Concat(ai);
  end;

procedure tcgaarch64.a_jmp_flags(list: TAsmList; const f: tresflags; l: tasmlabel);
  var
    ai : taicpu;
  begin
    ai:=Taicpu.op_sym(A_B,l);
    ai.is_jmp:=true;
    ai.SetCondition(flags_to_cond(f));
    list.Concat(ai);
  end;

procedure tcgaarch64.g_flags2reg(list: TAsmList; size: tcgsize; const f: tresflags; reg: tregister);
  begin
    list.concat(taicpu.op_reg_cond(A_CSET,reg,flags_to_cond(f)));
  end;

procedure tcgaarch64.g_overflowcheck(list: TAsmList; const loc: tlocation; def: tdef);
  begin
    { we need an explicit overflow location, because there are many
      possibilities (not just the overflow flag, which is only used for
      signed add/sub) }
    internalerror(2014112303);
  end;

procedure tcgaarch64.g_overflowcheck_loc(list: TAsmList; const loc: tlocation; def: tdef; ovloc : tlocation);
  var
    hl : tasmlabel;
    hflags : tresflags;
  begin
    if not(cs_check_overflow in current_settings.localswitches) then
      exit;
    current_asmdata.getjumplabel(hl);
    case ovloc.loc of
      LOC_FLAGS:
        begin
          hflags:=ovloc.resflags;
          inverse_flags(hflags);
          cg.a_jmp_flags(list,hflags,hl);
        end;
      else
        internalerror(2014112304);
    end;
    a_call_name(list,'FPC_OVERFLOW',false);
    a_label(list,hl);
  end;
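
{ Illustration: for a checked unsigned 64 bit addition, the recorded
  carry-set flag is inverted and jumped on, so the combined output of
  a_op_reg_reg_reg_checkoverflow and g_overflowcheck_loc is roughly
    adds x0, x1, x2
    b.cc .Lnooverflow
    bl   FPC_OVERFLOW
  .Lnooverflow:
  (registers and label name chosen for the example). }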

{ *********** entry/exit code and address loading ************ }

function tcgaarch64.save_regs(list: TAsmList; rt: tregistertype; lowsr, highsr: tsuperregister; sub: tsubregister): longint;
  var
    ref: treference;
    sr: tsuperregister;
    pairreg: tregister;
  begin
    result:=0;
    reference_reset_base(ref,NR_SP,-16,16);
    ref.addressmode:=AM_PREINDEXED;
    pairreg:=NR_NO;
    { store all used registers pairwise }
    for sr:=lowsr to highsr do
      if sr in rg[rt].used_in_proc then
        if pairreg=NR_NO then
          pairreg:=newreg(rt,sr,sub)
        else
          begin
            inc(result,16);
            list.concat(taicpu.op_reg_reg_ref(A_STP,pairreg,newreg(rt,sr,sub),ref));
            pairreg:=NR_NO
          end;
    { one left -> store twice (stack must be 16 bytes aligned) }
    if pairreg<>NR_NO then
      begin
        list.concat(taicpu.op_reg_reg_ref(A_STP,pairreg,pairreg,ref));
        inc(result,16);
      end;
  end;
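
{ Illustration: if x19..x23 are used in the procedure, save_regs emits
    stp x19, x20, [sp, #-16]!
    stp x21, x22, [sp, #-16]!
    stp x23, x23, [sp, #-16]!
  (the odd register is simply stored twice to keep sp 16 byte aligned)
  and returns 48 as the amount of stack space used. }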

procedure FixupOffsets(p:TObject;arg:pointer);
  var
    sym: tabstractnormalvarsym absolute p;
  begin
    if (tsym(p).typ in [paravarsym,localvarsym]) and
       (sym.localloc.loc=LOC_REFERENCE) and
       (sym.localloc.reference.base=NR_STACK_POINTER_REG) then
      begin
        sym.localloc.reference.base:=NR_FRAME_POINTER_REG;
        dec(sym.localloc.reference.offset,PLongint(arg)^);
      end;
  end;

procedure tcgaarch64.g_proc_entry(list: TAsmList; localsize: longint; nostackframe: boolean);
  var
    ref: treference;
    totalstackframesize: longint;
  begin
    if nostackframe then
      exit;
    { stack pointer has to be aligned to 16 bytes at all times }
    localsize:=align(localsize,16);
    { save stack pointer and return address }
    reference_reset_base(ref,NR_SP,-16,16);
    ref.addressmode:=AM_PREINDEXED;
    list.concat(taicpu.op_reg_reg_ref(A_STP,NR_FP,NR_LR,ref));
    { initialise frame pointer }
    a_load_reg_reg(list,OS_ADDR,OS_ADDR,NR_SP,NR_FP);
    totalstackframesize:=localsize;
    { save modified integer registers }
    inc(totalstackframesize,
      save_regs(list,R_INTREGISTER,RS_X19,RS_X28,R_SUBWHOLE));
    { only the lower 64 bits of the modified vector registers need to be
      saved; if the caller needs the upper 64 bits, it has to save them
      itself }
    inc(totalstackframesize,
      save_regs(list,R_MMREGISTER,RS_D8,RS_D15,R_SUBMMD));
    { allocate stack space }
    if localsize<>0 then
      begin
        localsize:=align(localsize,16);
        current_procinfo.final_localsize:=localsize;
        handle_reg_imm12_reg(list,A_SUB,OS_ADDR,NR_SP,localsize,NR_SP,NR_IP0,false,true);
      end;
    { By default, we use the frame pointer to access parameters passed via
      the stack and the stack pointer to address local variables and temps,
      because
      a) we can use bigger positive than negative offsets (so accessing
         locals via negative offsets from the frame pointer would be less
         efficient)
      b) we don't know the local size while generating the code, so
         accessing the parameters via the stack pointer is not possible
         without copying them
      The problem with this is the get_frame() intrinsic:
      a) it must return the same value as what we pass as the parentfp
         parameter, since that's how it's used in the TP-style objects unit
      b) its return value must be usable to access all local data of a
         routine (locals and parameters), since that's all the nested
         routines have access to
      c) its return value must be usable to construct a backtrace, as it's
         also used by the exception handling routines
      The solution we use here, based on something similar that's done in
      the MIPS port, is to generate all accesses to locals in the routine
      itself SP-relative, and then, after the code has been generated and
      the local size is known (namely, here), to change all SP-relative
      variables/parameters into FP-relative ones. This means that they'll be
      accessed less efficiently from nested routines, but those accesses are
      indirect anyway, and at least this way they can be accessed at all
    }
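
    { Illustration of the fixup: with 32 bytes of saved registers and an
      aligned localsize of 48, totalstackframesize is 80; a local that was
      addressed as [sp, #8] while generating the body is rewritten by
      FixupOffsets to [x29, #-72], because the frame pointer points 80
      bytes above the final stack pointer (numbers chosen for the
      example). }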
    if current_procinfo.has_nestedprocs then
      begin
        current_procinfo.procdef.localst.SymList.ForEachCall(@FixupOffsets,@totalstackframesize);
        current_procinfo.procdef.parast.SymList.ForEachCall(@FixupOffsets,@totalstackframesize);
      end;
  end;

procedure tcgaarch64.g_maybe_got_init(list : TAsmList);
  begin
    { nothing to do on Darwin or Linux }
  end;

procedure tcgaarch64.g_restore_registers(list:TAsmList);
  begin
    { done in g_proc_exit }
  end;

procedure tcgaarch64.load_regs(list: TAsmList; rt: tregistertype; lowsr, highsr: tsuperregister; sub: tsubregister);
  var
    ref: treference;
    sr, highestsetsr: tsuperregister;
    pairreg: tregister;
    regcount: longint;
  begin
    reference_reset_base(ref,NR_SP,16,16);
    ref.addressmode:=AM_POSTINDEXED;
    { highest reg stored twice? }
    regcount:=0;
    highestsetsr:=RS_NO;
    for sr:=lowsr to highsr do
      if sr in rg[rt].used_in_proc then
        begin
          inc(regcount);
          highestsetsr:=sr;
        end;
    if odd(regcount) then
      begin
        list.concat(taicpu.op_reg_ref(A_LDR,newreg(rt,highestsetsr,sub),ref));
        highestsetsr:=pred(highestsetsr);
      end;
    { load all (other) used registers pairwise }
    pairreg:=NR_NO;
    for sr:=highestsetsr downto lowsr do
      if sr in rg[rt].used_in_proc then
        if pairreg=NR_NO then
          pairreg:=newreg(rt,sr,sub)
        else
          begin
            list.concat(taicpu.op_reg_reg_ref(A_LDP,newreg(rt,sr,sub),pairreg,ref));
            pairreg:=NR_NO
          end;
    { there can't be any register left }
    if pairreg<>NR_NO then
      internalerror(2014112602);
  end;
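
{ Illustration: mirroring the save_regs example, for used registers
  x19..x23 the epilogue loads
    ldr x23, [sp], #16
    ldp x21, x22, [sp], #16
    ldp x19, x20, [sp], #16
  i.e. the register that was stored twice is reloaded once, and the pairs
  are reloaded in reverse order. }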

procedure tcgaarch64.g_proc_exit(list : TAsmList;parasize:longint;nostackframe:boolean);
  var
    ref: treference;
    regsstored: boolean;
    sr: tsuperregister;
  begin
    if not nostackframe then
      begin
        { if no registers have been stored, we don't have to subtract the
          allocated temp space from the stack pointer }
        regsstored:=false;
        for sr:=RS_X19 to RS_X28 do
          if sr in rg[R_INTREGISTER].used_in_proc then
            begin
              regsstored:=true;
              break;
            end;
        if not regsstored then
          for sr:=RS_D8 to RS_D15 do
            if sr in rg[R_MMREGISTER].used_in_proc then
              begin
                regsstored:=true;
                break;
              end;
        { restore registers (and stack pointer) }
        if regsstored then
          begin
            if current_procinfo.final_localsize<>0 then
              handle_reg_imm12_reg(list,A_ADD,OS_ADDR,NR_SP,current_procinfo.final_localsize,NR_SP,NR_IP0,false,true);
            load_regs(list,R_MMREGISTER,RS_D8,RS_D15,R_SUBMMD);
            load_regs(list,R_INTREGISTER,RS_X19,RS_X28,R_SUBWHOLE);
          end
        else if current_procinfo.final_localsize<>0 then
          { restore stack pointer }
          a_load_reg_reg(list,OS_ADDR,OS_ADDR,NR_FP,NR_SP);
        { restore framepointer and return address }
        reference_reset_base(ref,NR_SP,16,16);
        ref.addressmode:=AM_POSTINDEXED;
        list.concat(taicpu.op_reg_reg_ref(A_LDP,NR_FP,NR_LR,ref));
      end;
    { return }
    list.concat(taicpu.op_none(A_RET));
  end;

procedure tcgaarch64.g_save_registers(list : TAsmList);
  begin
    { done in g_proc_entry }
  end;

{ ************* concatcopy ************ }

procedure tcgaarch64.g_concatcopy_move(list : TAsmList;const source,dest : treference;len : tcgint);
  var
    paraloc1,paraloc2,paraloc3 : TCGPara;
    pd : tprocdef;
  begin
    pd:=search_system_proc('MOVE');
    paraloc1.init;
    paraloc2.init;
    paraloc3.init;
    paramanager.getintparaloc(list,pd,1,paraloc1);
    paramanager.getintparaloc(list,pd,2,paraloc2);
    paramanager.getintparaloc(list,pd,3,paraloc3);
    a_load_const_cgpara(list,OS_SINT,len,paraloc3);
    a_loadaddr_ref_cgpara(list,dest,paraloc2);
    a_loadaddr_ref_cgpara(list,source,paraloc1);
    paramanager.freecgpara(list,paraloc3);
    paramanager.freecgpara(list,paraloc2);
    paramanager.freecgpara(list,paraloc1);
    alloccpuregisters(list,R_INTREGISTER,paramanager.get_volatile_registers_int(pocall_default));
    alloccpuregisters(list,R_MMREGISTER,paramanager.get_volatile_registers_mm(pocall_default));
    a_call_name(list,'FPC_MOVE',false);
    dealloccpuregisters(list,R_MMREGISTER,paramanager.get_volatile_registers_mm(pocall_default));
    dealloccpuregisters(list,R_INTREGISTER,paramanager.get_volatile_registers_int(pocall_default));
    paraloc3.done;
    paraloc2.done;
    paraloc1.done;
  end;

procedure tcgaarch64.g_concatcopy(list: TAsmList; const source, dest: treference; len: tcgint);
  var
    sourcebasereplaced, destbasereplaced: boolean;

  { get optimal memory operation to use for loading/storing data
    in an unrolled loop }
  procedure getmemop(scaledop, unscaledop: tasmop; const startref, endref: treference; opsize: tcgsize; postfix: toppostfix; out memop: tasmop; out needsimplify: boolean);
    begin
      if (simple_ref_type(scaledop,opsize,postfix,startref)=sr_simple) and
         (simple_ref_type(scaledop,opsize,postfix,endref)=sr_simple) then
        begin
          { the scaled operation can directly encode both ends of the
            copied range }
          memop:=scaledop;
          needsimplify:=false;
        end
      else if (unscaledop<>A_NONE) and
         (simple_ref_type(unscaledop,opsize,postfix,startref)=sr_simple) and
         (simple_ref_type(unscaledop,opsize,postfix,endref)=sr_simple) then
        begin
          memop:=unscaledop;
          needsimplify:=false;
        end
      else
        begin
          memop:=scaledop;
          needsimplify:=true;
        end;
    end;
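
  { Background: ldr/str encode an unsigned, scaled 12 bit offset (a
    multiple of the access size), while ldur/stur encode a signed,
    unscaled 9 bit offset (-256..255), so the unscaled form can reach
    small offsets that the scaled form cannot, and vice versa. }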

  { adjust the offset and/or addressing mode after a load/store so it's
    correct for the next one of the same size }
  procedure updaterefafterloadstore(var ref: treference; oplen: longint);
    begin
      case ref.addressmode of
        AM_OFFSET:
          inc(ref.offset,oplen);
        AM_POSTINDEXED:
          { base register updated by instruction, next offset can remain
            the same }
          ;
        AM_PREINDEXED:
          begin
            { base register updated by instruction -> next instruction can
              use post-indexing with offset = sizeof(operation) }
            ref.offset:=0;
            ref.addressmode:=AM_OFFSET;
          end;
      end;
    end;
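
  { Illustration: three consecutive OS_64 loads through an AM_OFFSET
    reference based at x2 come out as
      ldr x3, [x2]
      ldr x4, [x2, #8]
      ldr x5, [x2, #16]
    because the offset is advanced by the operation length after each one
    (registers chosen for the example). }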

  { generate a load/store and adjust the reference offset to the next
    memory location if necessary }
  procedure genloadstore(list: TAsmList; op: tasmop; reg: tregister; var ref: treference; postfix: toppostfix; opsize: tcgsize);
    begin
      list.concat(setoppostfix(taicpu.op_reg_ref(op,reg,ref),postfix));
      updaterefafterloadstore(ref,tcgsize2size[opsize]);
    end;

  { generate a dual load/store (ldp/stp) and adjust the reference offset to
    the next memory location if necessary }
  procedure gendualloadstore(list: TAsmList; op: tasmop; reg1, reg2: tregister; var ref: treference; postfix: toppostfix; opsize: tcgsize);
    begin
      list.concat(setoppostfix(taicpu.op_reg_reg_ref(op,reg1,reg2,ref),postfix));
      updaterefafterloadstore(ref,tcgsize2size[opsize]*2);
    end;

  { turn a reference into a pre- or post-indexed reference for use in a
    load/store of a particular size }
  procedure makesimpleforcopy(list: TAsmList; var scaledop: tasmop; opsize: tcgsize; postfix: toppostfix; forcepostindexing: boolean; var ref: treference; var basereplaced: boolean);
    var
      tmpreg: tregister;
      scaledoffset: longint;
      orgaddressmode: taddressmode;
    begin
      scaledoffset:=tcgsize2size[opsize];
      if scaledop in [A_LDP,A_STP] then
        scaledoffset:=scaledoffset*2;
      { can we use the reference as post-indexed without changes? }
      if forcepostindexing then
        begin
          orgaddressmode:=ref.addressmode;
          ref.addressmode:=AM_POSTINDEXED;
          if (orgaddressmode=AM_POSTINDEXED) or
             ((ref.offset=0) and
              (simple_ref_type(scaledop,opsize,postfix,ref)=sr_simple)) then
            begin
              { just change the post-indexed offset to the access size }
              ref.offset:=scaledoffset;
              { and replace the base register if that didn't happen yet
                (could be sp or a regvar) }
              if not basereplaced then
                begin
                  tmpreg:=getaddressregister(list);
                  a_load_reg_reg(list,OS_ADDR,OS_ADDR,ref.base,tmpreg);
                  ref.base:=tmpreg;
                  basereplaced:=true;
                end;
              exit;
            end;
          ref.addressmode:=orgaddressmode;
        end;
{$ifdef dummy}
      This could in theory be useful in case you have a concatcopy from
      e.g. x1+255 to x1+267 *and* the reference is aligned, but this seems
      very unlikely. Disabled because it still needs fixes, as it
      also generates pre-indexed loads right now at the very end for the
      left-over gencopies
      { can we turn it into a pre-indexed reference for free? (after the
        first operation, it will be turned into an offset one) }
      if not forcepostindexing and
         (ref.offset<>0) then
        begin
          orgaddressmode:=ref.addressmode;
          ref.addressmode:=AM_PREINDEXED;
          tmpreg:=ref.base;
          if not basereplaced and
             (ref.base=tmpreg) then
            begin
              tmpreg:=getaddressregister(list);
              a_load_reg_reg(list,OS_ADDR,OS_ADDR,ref.base,tmpreg);
              ref.base:=tmpreg;
              basereplaced:=true;
            end;
          if simple_ref_type(scaledop,opsize,postfix,ref)<>sr_simple then
            make_simple_ref(list,scaledop,opsize,postfix,ref,NR_NO);
          exit;
        end;
{$endif dummy}
      if not forcepostindexing then
        begin
          ref.addressmode:=AM_OFFSET;
          make_simple_ref(list,scaledop,opsize,postfix,ref,NR_NO);
          { this may still cause problems if the final offset is no longer
            a simple ref; it's a bit complicated to pass all information
            through at all places and check that here, so play safe: we
            currently never generate unrolled copies for more than 64
            bytes (32 with non-double-register copies) }
          if ref.index=NR_NO then
            begin
              if ((scaledop in [A_LDP,A_STP]) and
                  (ref.offset<((64-8)*tcgsize2size[opsize]))) or
                 ((scaledop in [A_LDUR,A_STUR]) and
                  (ref.offset<(255-8*tcgsize2size[opsize]))) or
                 ((scaledop in [A_LDR,A_STR]) and
                  (ref.offset<((4096-8)*tcgsize2size[opsize]))) then
                exit;
            end;
        end;
      tmpreg:=getaddressregister(list);
      a_loadaddr_ref_reg(list,ref,tmpreg);
      basereplaced:=true;
      if forcepostindexing then
        begin
          reference_reset_base(ref,tmpreg,scaledoffset,ref.alignment);
          ref.addressmode:=AM_POSTINDEXED;
        end
      else
        begin
          reference_reset_base(ref,tmpreg,0,ref.alignment);
          ref.addressmode:=AM_OFFSET;
        end
    end;

  { prepare a reference for use by gencopy. This is done both after the
    unrolled and regular copy loop -> get rid of post-indexing mode, make
    sure ref is valid }
  procedure preparecopy(list: tasmlist; scaledop, unscaledop: tasmop; var ref: treference; opsize: tcgsize; postfix: toppostfix; out op: tasmop; var basereplaced: boolean);
    var
      simplify: boolean;
    begin
      if ref.addressmode=AM_POSTINDEXED then
        ref.offset:=tcgsize2size[opsize];
      getmemop(scaledop,scaledop,ref,ref,opsize,postfix,op,simplify);
      if simplify then
        begin
          makesimpleforcopy(list,scaledop,opsize,postfix,false,ref,basereplaced);
          op:=scaledop;
        end;
    end;

  { generate a copy from source to dest of size opsize/postfix }
  procedure gencopy(list: TAsmList; var source, dest: treference; postfix: toppostfix; opsize: tcgsize);
    var
      reg: tregister;
      loadop, storeop: tasmop;
    begin
      preparecopy(list,A_LDR,A_LDUR,source,opsize,postfix,loadop,sourcebasereplaced);
      preparecopy(list,A_STR,A_STUR,dest,opsize,postfix,storeop,destbasereplaced);
      reg:=getintregister(list,opsize);
      genloadstore(list,loadop,reg,source,postfix,opsize);
      genloadstore(list,storeop,reg,dest,postfix,opsize);
    end;

  { copy the leftovers after an unrolled or regular copy loop }
  procedure gencopyleftovers(list: TAsmList; var source, dest: treference; len: longint);
    begin
      { stop post-indexing if we did so in the loop, since in that case all
        offsets definitely can be represented now }
      if source.addressmode=AM_POSTINDEXED then
        begin
          source.addressmode:=AM_OFFSET;
          source.offset:=0;
        end;
      if dest.addressmode=AM_POSTINDEXED then
        begin
          dest.addressmode:=AM_OFFSET;
          dest.offset:=0;
        end;
      { transfer the leftovers }
      if len>=8 then
        begin
          dec(len,8);
          gencopy(list,source,dest,PF_NONE,OS_64);
        end;
      if len>=4 then
        begin
          dec(len,4);
          gencopy(list,source,dest,PF_NONE,OS_32);
        end;
      if len>=2 then
        begin
          dec(len,2);
          gencopy(list,source,dest,PF_H,OS_16);
        end;
      if len>=1 then
        begin
          dec(len);
          gencopy(list,source,dest,PF_B,OS_8);
        end;
    end;
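
  { Illustration: 7 leftover bytes come out as one 4 byte, one 2 byte and
    one 1 byte copy, e.g. with x1/x2 as (replaced) base registers:
      ldr  w3, [x1]          str  w3, [x2]
      ldrh w3, [x1, #4]      strh w3, [x2, #4]
      ldrb w3, [x1, #6]      strb w3, [x2, #6] }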

  const
    { load_length + loop dec + cbnz }
    loopoverhead=12;
    { loop overhead + load + store }
    totallooplen=loopoverhead + 8;
  var
    totalalign: longint;
    maxlenunrolled: tcgint;
    loadop, storeop: tasmop;
    opsize: tcgsize;
    postfix: toppostfix;
    tmpsource, tmpdest: treference;
    scaledstoreop, unscaledstoreop,
    scaledloadop, unscaledloadop: tasmop;
    regs: array[1..8] of tregister;
    countreg: tregister;
    i, regcount: longint;
    hl: tasmlabel;
    simplifysource, simplifydest: boolean;
  begin
    if len=0 then
      exit;
    sourcebasereplaced:=false;
    destbasereplaced:=false;
    { maximum common alignment }
    totalalign:=max(1,newalignment(source.alignment,dest.alignment));
    { use a simple load/store? }
    if (len in [1,2,4,8]) and
       ((totalalign>=(len div 2)) or
        (source.alignment=len) or
        (dest.alignment=len)) then
      begin
        opsize:=int_cgsize(len);
        a_load_ref_ref(list,opsize,opsize,source,dest);
        exit;
      end;
    { alignment > length is not useful, and would break some checks below }
    while totalalign>len do
      totalalign:=totalalign div 2;
    { operation sizes to use based on common alignment }
    case totalalign of
      1:
        begin
          postfix:=PF_B;
          opsize:=OS_8;
        end;
      2:
        begin
          postfix:=PF_H;
          opsize:=OS_16;
        end;
      4:
        begin
          postfix:=PF_NONE;
          opsize:=OS_32;
        end
      else
        begin
          totalalign:=8;
          postfix:=PF_NONE;
          opsize:=OS_64;
        end;
    end;
    { maximum length to be handled with an unrolled loop (4 loads + 4 stores) }
    maxlenunrolled:=min(totalalign,8)*4;
    { ldp/stp -> 2 registers per instruction }
    if (totalalign>=4) and
       (len>=totalalign*2) then
      begin
        maxlenunrolled:=maxlenunrolled*2;
        scaledstoreop:=A_STP;
        scaledloadop:=A_LDP;
        unscaledstoreop:=A_NONE;
        unscaledloadop:=A_NONE;
      end
    else
      begin
        scaledstoreop:=A_STR;
        scaledloadop:=A_LDR;
        unscaledstoreop:=A_STUR;
        unscaledloadop:=A_LDUR;
      end;
    { we only need 4 instructions extra to call FPC_MOVE }
    if cs_opt_size in current_settings.optimizerswitches then
      maxlenunrolled:=maxlenunrolled div 2;
    if (len>maxlenunrolled) and
       (len>totalalign*8) then
      begin
        g_concatcopy_move(list,source,dest,len);
        exit;
      end;
    simplifysource:=true;
    simplifydest:=true;
    tmpsource:=source;
    tmpdest:=dest;
    { can we directly encode all offsets in an unrolled loop? }
    if len<=maxlenunrolled then
      begin
{$ifdef extdebug}
        list.concat(tai_comment.Create(strpnew('concatcopy unrolled loop; len/opsize/align: '+tostr(len)+'/'+tostr(tcgsize2size[opsize])+'/'+tostr(totalalign))));
{$endif extdebug}
        { the leftovers will be handled separately -> -(len mod opsize) }
        inc(tmpsource.offset,len-(len mod tcgsize2size[opsize]));
        { additionally, the last regular load/store will be at
          offset+len-opsize (if len-(len mod opsize)>0) }
        if tmpsource.offset>source.offset then
          dec(tmpsource.offset,tcgsize2size[opsize]);
        getmemop(scaledloadop,unscaledloadop,source,tmpsource,opsize,postfix,loadop,simplifysource);
        inc(tmpdest.offset,len-(len mod tcgsize2size[opsize]));
        if tmpdest.offset>dest.offset then
          dec(tmpdest.offset,tcgsize2size[opsize]);
        getmemop(scaledstoreop,unscaledstoreop,dest,tmpdest,opsize,postfix,storeop,simplifydest);
        tmpsource:=source;
        tmpdest:=dest;
        { if we can't directly encode all offsets, simplify }
        if simplifysource then
          begin
            loadop:=scaledloadop;
            makesimpleforcopy(list,loadop,opsize,postfix,false,tmpsource,sourcebasereplaced);
          end;
        if simplifydest then
          begin
            storeop:=scaledstoreop;
            makesimpleforcopy(list,storeop,opsize,postfix,false,tmpdest,destbasereplaced);
          end;
        regcount:=len div tcgsize2size[opsize];
        { in case we transfer two registers at a time, we copy an even
          number of registers }
        if loadop=A_LDP then
          regcount:=regcount and not(1);
        { initialise for dfa }
        regs[low(regs)]:=NR_NO;
        { max 4 loads/stores -> max 8 registers (in case of ldp/stp) }
        for i:=1 to regcount do
          regs[i]:=getintregister(list,opsize);
        if loadop=A_LDP then
          begin
            { load registers }
            for i:=1 to (regcount div 2) do
              gendualloadstore(list,loadop,regs[i*2-1],regs[i*2],tmpsource,postfix,opsize);
            { store registers }
            for i:=1 to (regcount div 2) do
              gendualloadstore(list,storeop,regs[i*2-1],regs[i*2],tmpdest,postfix,opsize);
          end
        else
          begin
            for i:=1 to regcount do
              genloadstore(list,loadop,regs[i],tmpsource,postfix,opsize);
            for i:=1 to regcount do
              genloadstore(list,storeop,regs[i],tmpdest,postfix,opsize);
          end;
        { leftover }
        len:=len-regcount*tcgsize2size[opsize];
{$ifdef extdebug}
        list.concat(tai_comment.Create(strpnew('concatcopy unrolled loop leftover: '+tostr(len))));
{$endif extdebug}
      end
    else
      begin
{$ifdef extdebug}
        list.concat(tai_comment.Create(strpnew('concatcopy regular loop; len/align: '+tostr(len)+'/'+tostr(totalalign))));
{$endif extdebug}
        { regular loop -> definitely use post-indexing }
        loadop:=scaledloadop;
        makesimpleforcopy(list,loadop,opsize,postfix,true,tmpsource,sourcebasereplaced);
        storeop:=scaledstoreop;
        makesimpleforcopy(list,storeop,opsize,postfix,true,tmpdest,destbasereplaced);
        current_asmdata.getjumplabel(hl);
        countreg:=getintregister(list,OS_32);
        { ldp/stp transfer two registers per iteration }
        if loadop=A_LDP then
          a_load_const_reg(list,OS_32,len div (tcgsize2size[opsize]*2),countreg)
        else
          a_load_const_reg(list,OS_32,len div tcgsize2size[opsize],countreg);
        a_label(list,hl);
        a_op_const_reg(list,OP_SUB,OS_32,1,countreg);
        if loadop=A_LDP then
          begin
            regs[1]:=getintregister(list,opsize);
            regs[2]:=getintregister(list,opsize);
            gendualloadstore(list,loadop,regs[1],regs[2],tmpsource,postfix,opsize);
            gendualloadstore(list,storeop,regs[1],regs[2],tmpdest,postfix,opsize);
          end
        else
          begin
            regs[1]:=getintregister(list,opsize);
            genloadstore(list,loadop,regs[1],tmpsource,postfix,opsize);
            genloadstore(list,storeop,regs[1],tmpdest,postfix,opsize);
          end;
        list.concat(taicpu.op_reg_sym_ofs(A_CBNZ,countreg,hl,0));
        if loadop=A_LDP then
          len:=len mod (tcgsize2size[opsize]*2)
        else
          len:=len mod tcgsize2size[opsize];
      end;
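
    { Illustration: when optimising for size, a 48 byte copy with 8 byte
      common alignment is emitted as a post-indexed ldp/stp loop, roughly
        mov  w0, #3
      .Lloop:
        sub  w0, w0, #1
        ldp  x2, x3, [x4], #16
        stp  x2, x3, [x5], #16
        cbnz w0, .Lloop
      (registers and label chosen for the example). }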
    gencopyleftovers(list,tmpsource,tmpdest,len);
  end;

procedure tcgaarch64.g_adjust_self_value(list:TAsmList;procdef: tprocdef;ioffset: tcgint);
  begin
    { This method is integrated into g_intf_wrapper and shouldn't be called separately }
    InternalError(2013020102);
  end;

procedure create_codegen;
  begin
    cg:=tcgaarch64.Create;
    cg128:=tcg128.Create;
  end;

end.