cgcpu.pas 98 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351235223532354235523562357235823592360236123622363236423652366236723682369237023712372237323742375237623772378237923802381238223832384238523862387238823892390239123922393239423952396239723982399240024012402240324042405240624072408240924102411241224132414241524162417241824192420242124222423242424252426242724282429243024312432243324342435243624372438243924402441244224432444244524462447244824492450245124522453245424552456245724582459246024612462246324642465246624672468246924702471247224732474247524762477247824792480248124822483248424852486248724882489249024912492249324942495249624972498
  1. {
  2. Copyright (c) 2014 by Jonas Maebe
  3. This unit implements the code generator for AArch64
  4. This program is free software; you can redistribute it and/or modify
  5. it under the terms of the GNU General Public License as published by
  6. the Free Software Foundation; either version 2 of the License, or
  7. (at your option) any later version.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. GNU General Public License for more details.
  12. You should have received a copy of the GNU General Public License
  13. along with this program; if not, write to the Free Software
  14. Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  15. ****************************************************************************
  16. }
  17. unit cgcpu;
  18. {$i fpcdefs.inc}
  19. interface
  20. uses
  21. globtype,parabase,
  22. cgbase,cgutils,cgobj,
  23. aasmbase,aasmtai,aasmdata,aasmcpu,
  24. cpubase,cpuinfo,
  25. node,symconst,SymType,symdef,
  26. rgcpu;
type

    { AArch64 code generator: implements the virtual code generation
      interface of tcg on top of the A64 instruction set }
    tcgaarch64=class(tcg)
     protected
      { changes register size without adding register allocation info }
      function makeregsize(reg: tregister; size: tcgsize): tregister; overload;
     public
      { simplifies "ref" so it can be used with "op". If "ref" can be used
        with a different load/Store operation that has the same meaning as the
        original one, "op" will be replaced with the alternative }
      procedure make_simple_ref(list:TAsmList; var op: tasmop; size: tcgsize; oppostfix: toppostfix; var ref: treference; preferred_newbasereg: tregister);
      { not used on AArch64 (all floating point is vector unit-based);
        always triggers an internal error }
      function getfpuregister(list: TAsmList; size: Tcgsize): Tregister; override;
      { emits "op" with constant operand "a", encoding it as imm12 or
        imm12-shifted-left-12 when possible, otherwise materialising the
        constant in a (temp) register first }
      procedure handle_reg_imm12_reg(list: TAsmList; op: Tasmop; size: tcgsize; src: tregister; a: tcgint; dst: tregister; tmpreg: tregister; setflags, usedest: boolean);
      procedure init_register_allocators;override;
      procedure done_register_allocators;override;
      function getmmregister(list:TAsmList;size:tcgsize):tregister;override;
      { legalises "ref" via make_simple_ref and emits the load/store;
        returns the (possibly rewritten) reference that was used }
      function handle_load_store(list:TAsmList; op: tasmop; size: tcgsize; oppostfix: toppostfix; reg: tregister; ref: treference):treference;
      procedure a_call_name(list:TAsmList;const s:string; weak: boolean);override;
      procedure a_call_reg(list:TAsmList;Reg:tregister);override;
      { General purpose instructions }
      procedure maybeadjustresult(list: TAsmList; op: topcg; size: tcgsize; dst: tregister);
      procedure a_op_const_reg(list: TAsmList; op: topcg; size: tcgsize; a: tcgint; reg: tregister);override;
      procedure a_op_reg_reg(list: TAsmList; op: topcg; size: tcgsize; src, dst: tregister);override;
      procedure a_op_const_reg_reg(list: TAsmList; op: topcg; size: tcgsize; a: tcgint; src, dst: tregister);override;
      procedure a_op_reg_reg_reg(list: TAsmList; op: topcg; size: tcgsize; src1, src2, dst: tregister);override;
      procedure a_op_const_reg_reg_checkoverflow(list: TAsmList; op: topcg; size: tcgsize; a: tcgint; src, dst: tregister; setflags : boolean; var ovloc : tlocation);override;
      procedure a_op_reg_reg_reg_checkoverflow(list: TAsmList; op: topcg; size: tcgsize; src1, src2, dst: tregister; setflags : boolean; var ovloc : tlocation);override;
      { move instructions }
      procedure a_load_const_reg(list: TAsmList; size: tcgsize; a: tcgint; reg: tregister);override;
      procedure a_load_const_ref(list: TAsmList; size: tcgsize; a: tcgint; const ref: treference); override;
      procedure a_load_reg_ref(list: TAsmList; fromsize, tosize: tcgsize; reg: tregister;const ref: TReference);override;
      procedure a_load_reg_ref_unaligned(list: TAsmList; fromsize, tosize: tcgsize; register: tregister; const ref: treference); override;
      procedure a_load_ref_reg(list: TAsmList; fromsize, tosize: tcgsize; const ref: TReference; reg: tregister);override;
      procedure a_load_ref_reg_unaligned(list: TAsmList; fromsize, tosize: tcgsize; const ref: treference; register: tregister); override;
      procedure a_load_reg_reg(list: TAsmList; fromsize, tosize: tcgsize; reg1, reg2: tregister);override;
      procedure a_loadaddr_ref_reg(list: TAsmList; const ref: TReference; r: tregister);override;
      { fpu move instructions (not used, all floating point is vector unit-based) }
      procedure a_loadfpu_reg_reg(list: TAsmList; fromsize, tosize: tcgsize; reg1, reg2: tregister); override;
      procedure a_loadfpu_ref_reg(list: TAsmList; fromsize, tosize: tcgsize; const ref: treference; reg: tregister); override;
      procedure a_loadfpu_reg_ref(list: TAsmList; fromsize, tosize: tcgsize; reg: tregister; const ref: treference); override;
      procedure a_loadmm_reg_reg(list: TAsmList; fromsize, tosize: tcgsize; reg1, reg2: tregister;shuffle : pmmshuffle);override;
      procedure a_loadmm_ref_reg(list: TAsmList; fromsize, tosize: tcgsize; const ref: TReference; reg: tregister; shuffle: pmmshuffle);override;
      procedure a_loadmm_reg_ref(list: TAsmList; fromsize, tosize: tcgsize; reg: tregister; const ref: TReference; shuffle: pmmshuffle);override;
      procedure a_loadmm_intreg_reg(list: TAsmList; fromsize, tosize: tcgsize; intreg, mmreg: tregister; shuffle: pmmshuffle); override;
      procedure a_loadmm_reg_intreg(list: TAsmList; fromsize, tosize: tcgsize; mmreg, intreg: tregister; shuffle: pmmshuffle); override;
      procedure a_opmm_reg_reg(list: TAsmList; Op: TOpCG; size: tcgsize; src, dst: tregister; shuffle: pmmshuffle); override;
      procedure a_bit_scan_reg_reg(list: TAsmList; reverse: boolean; srcsize, dstsize: tcgsize; src, dst: TRegister); override;
      { comparison operations }
      procedure a_cmp_const_reg_label(list: TAsmList; size: tcgsize; cmp_op: topcmp; a: tcgint; reg: tregister; l: tasmlabel);override;
      procedure a_cmp_reg_reg_label(list: TAsmList; size: tcgsize; cmp_op: topcmp; reg1, reg2: tregister; l: tasmlabel);override;
      procedure a_jmp_always(list: TAsmList; l: TAsmLabel);override;
      procedure a_jmp_name(list: TAsmList; const s: string);override;
      procedure a_jmp_cond(list: TAsmList; cond: TOpCmp; l: tasmlabel);{ override;}
      procedure a_jmp_flags(list: TAsmList; const f: tresflags; l: tasmlabel);override;
      procedure g_flags2reg(list: TAsmList; size: tcgsize; const f:tresflags; reg: tregister);override;
      procedure g_overflowcheck(list: TAsmList; const loc: tlocation; def: tdef);override;
      procedure g_overflowcheck_loc(list: TAsmList; const loc: tlocation; def: tdef; ovloc: tlocation);override;
      procedure g_proc_entry(list: TAsmList; localsize: longint; nostackframe: boolean);override;
      procedure g_proc_exit(list: TAsmList; parasize: longint; nostackframe: boolean);override;
      procedure g_maybe_got_init(list: TAsmList); override;
      procedure g_restore_registers(list: TAsmList);override;
      procedure g_save_registers(list: TAsmList);override;
      procedure g_concatcopy_move(list: TAsmList; const source, dest: treference; len: tcgint);
      procedure g_concatcopy(list: TAsmList; const source, dest: treference; len: tcgint);override;
      procedure g_adjust_self_value(list: TAsmList; procdef: tprocdef; ioffset: tcgint);override;
      procedure g_check_for_fpu_exception(list: TAsmList; force, clear: boolean);override;
      procedure g_profilecode(list: TAsmList);override;
     private
      { push/pop helpers for the callee-saved registers in [lowsr..highsr];
        save_regs returns the stack space used }
      function save_regs(list: TAsmList; rt: tregistertype; lowsr, highsr: tsuperregister; sub: tsubregister): longint;
      procedure load_regs(list: TAsmList; rt: tregistertype; lowsr, highsr: tsuperregister; sub: tsubregister);
    end;

  procedure create_codegen;

  const
    { topcg -> A64 opcode for the register/register form (shift operations
      use the register-variant mnemonics ASRV/LSLV/LSRV/RORV) }
    TOpCG2AsmOpReg: array[topcg] of TAsmOp = (
      A_NONE,A_MOV,A_ADD,A_AND,A_UDIV,A_SDIV,A_MUL,A_MUL,A_NEG,A_MVN,A_ORR,A_ASRV,A_LSLV,A_LSRV,A_SUB,A_EOR,A_NONE,A_RORV
    );
    { topcg -> A64 opcode for the immediate form }
    TOpCG2AsmOpImm: array[topcg] of TAsmOp = (
      A_NONE,A_MOV,A_ADD,A_AND,A_UDIV,A_SDIV,A_MUL,A_MUL,A_NEG,A_MVN,A_ORR,A_ASR,A_LSL,A_LSR,A_SUB,A_EOR,A_NONE,A_ROR
    );
    { topcmp -> A64 condition code; the unsigned comparisons map to the
      carry-based conditions LS/CC/CS/HI }
    TOpCmp2AsmCond: array[topcmp] of TAsmCond = (C_NONE,C_EQ,C_GT,
      C_LT,C_GE,C_LE,C_NE,C_LS,C_CC,C_CS,C_HI
    );
  108. implementation
  109. uses
  110. globals,verbose,systems,cutils,cclasses,
  111. paramgr,fmodule,
  112. symtable,symsym,
  113. tgobj,
  114. ncgutil,
  115. procinfo,cpupi;
{ Rewrites "ref" (in place) into an addressing form that "op" with operand
  size "size" and postfix "oppostfix" can actually encode, emitting any
  needed address computations to "list". "op" itself may be replaced by an
  equivalent instruction (e.g. LDR <-> LDUR). "preferred_newbasereg" is used
  as scratch base register when valid, otherwise a new one is allocated. }
procedure tcgaarch64.make_simple_ref(list:TAsmList; var op: tasmop; size: tcgsize; oppostfix: toppostfix; var ref: treference; preferred_newbasereg: tregister);
  var
    href: treference;
    so: tshifterop;
    accesssize: longint;
  begin
    { normalise: if only an index register is set, turn it into the base
      (a shifted index cannot become a base) }
    if (ref.base=NR_NO) then
      begin
        if ref.shiftmode<>SM_None then
          internalerror(2014110701);
        ref.base:=ref.index;
        ref.index:=NR_NO;
      end;
    { no arbitrary scale factor support (the generic code doesn't set it,
      AArch-specific code shouldn't either) }
    if not(ref.scalefactor in [0,1]) then
      internalerror(2014111002);
    case simple_ref_type(op,size,oppostfix,ref) of
      sr_simple:
        exit;
      sr_internal_illegal:
        internalerror(2014121702);
      sr_complex:
        { continue } ;
    end;
    if assigned(ref.symbol) then
      begin
        { internal "load symbol" instructions should already be valid }
        if assigned(ref.symboldata) or
           (ref.refaddr in [addr_pic,addr_gotpage,addr_gotpageoffset,addr_page,addr_pageoffset]) then
          internalerror(2014110802);
        { no relative symbol support (needed) yet }
        if assigned(ref.relsymbol) then
          internalerror(2014111001);
        { loading a symbol address (whether it's in the GOT or not) consists
          of two parts: first load the page on which it is located, then
          either the offset in the page or load the value at that offset in
          the page. This final GOT-load can be relaxed by the linker in case
          the variable itself can be stored directly in the GOT }
        if (preferred_newbasereg=NR_NO) or
           (ref.base=preferred_newbasereg) or
           (ref.index=preferred_newbasereg) then
          preferred_newbasereg:=getaddressregister(list);
        { load the (GOT) page }
        reference_reset_symbol(href,ref.symbol,0,8,[]);
        if ((ref.symbol.typ in [AT_FUNCTION,AT_LABEL]) and
            (ref.symbol.bind in [AB_LOCAL,AB_GLOBAL])) or
           ((ref.symbol.typ=AT_DATA) and
            (ref.symbol.bind=AB_LOCAL)) or
           (target_info.system=system_aarch64_win64) then
          href.refaddr:=addr_page
        else
          href.refaddr:=addr_gotpage;
        list.concat(taicpu.op_reg_ref(A_ADRP,preferred_newbasereg,href));
        { load the GOT entry (= address of the variable) }
        reference_reset_base(href,preferred_newbasereg,0,ctempposinvalid,sizeof(pint),[]);
        href.symbol:=ref.symbol;
        { code symbols defined in the current compilation unit do not
          have to be accessed via the GOT }
        if ((ref.symbol.typ in [AT_FUNCTION,AT_LABEL]) and
            (ref.symbol.bind in [AB_LOCAL,AB_GLOBAL])) or
           ((ref.symbol.typ=AT_DATA) and
            (ref.symbol.bind=AB_LOCAL)) or
           (target_info.system=system_aarch64_win64) then
          begin
            { page-local symbol: ADD the low 12 bits of the address }
            href.base:=NR_NO;
            href.refaddr:=addr_pageoffset;
            list.concat(taicpu.op_reg_reg_ref(A_ADD,preferred_newbasereg,preferred_newbasereg,href));
          end
        else
          begin
            href.refaddr:=addr_gotpageoffset;
            { use a_load_ref_reg() rather than directly encoding the LDR,
              so that we'll check the validity of the reference }
            a_load_ref_reg(list,OS_ADDR,OS_ADDR,href,preferred_newbasereg);
          end;
        { set as new base register }
        if ref.base=NR_NO then
          ref.base:=preferred_newbasereg
        else if ref.index=NR_NO then
          ref.index:=preferred_newbasereg
        else
          begin
            { make sure it's valid in case ref.base is SP -> make it
              the second operand}
            a_op_reg_reg_reg(list,OP_ADD,OS_ADDR,preferred_newbasereg,ref.base,preferred_newbasereg);
            ref.base:=preferred_newbasereg
          end;
        ref.symbol:=nil;
      end;
    { base & index }
    if (ref.base<>NR_NO) and
       (ref.index<>NR_NO) then
      begin
        case op of
          A_LDR, A_STR:
            begin
              if (ref.shiftmode=SM_None) and
                 (ref.shiftimm<>0) then
                internalerror(2014110805);
              { wrong shift? (possible in case of something like
                array_of_2byte_rec[x].bytefield -> shift will be set 1, but
                the final load is a 1 byte -> can't use shift after all }
              if (ref.shiftmode in [SM_LSL,SM_UXTW,SM_SXTW]) and
                 ((ref.shiftimm<>BsfDWord(tcgsizep2size[size])) or
                  (ref.offset<>0)) then
                begin
                  if preferred_newbasereg=NR_NO then
                    preferred_newbasereg:=getaddressregister(list);
                  { "add" supports a superset of the shift modes supported by
                    load/store instructions }
                  shifterop_reset(so);
                  so.shiftmode:=ref.shiftmode;
                  so.shiftimm:=ref.shiftimm;
                  list.concat(taicpu.op_reg_reg_reg_shifterop(A_ADD,preferred_newbasereg,ref.base,ref.index,so));
                  reference_reset_base(ref,preferred_newbasereg,ref.offset,ref.temppos,ref.alignment,ref.volatility);
                  { possibly still an invalid offset -> fall through }
                end
              else if ref.offset<>0 then
                begin
                  if (preferred_newbasereg=NR_NO) or
                     { we keep ref.index, so it must not be overwritten }
                     (ref.index=preferred_newbasereg) then
                    preferred_newbasereg:=getaddressregister(list);
                  { add to the base and not to the index, because the index
                    may be scaled; this works even if the base is SP }
                  a_op_const_reg_reg(list,OP_ADD,OS_ADDR,ref.offset,ref.base,preferred_newbasereg);
                  ref.offset:=0;
                  ref.base:=preferred_newbasereg;
                  { finished }
                  exit;
                end
              else
                { valid -> exit }
                exit;
            end;
          { todo }
          A_LD1,A_LD2,A_LD3,A_LD4,
          A_ST1,A_ST2,A_ST3,A_ST4:
            internalerror(2014110704);
          { these don't support base+index }
          A_LDUR,A_STUR,
          A_LDP,A_STP:
            begin
              { these either don't support pre-/post-indexing, or don't
                support it with base+index }
              if ref.addressmode<>AM_OFFSET then
                internalerror(2014110911);
              if preferred_newbasereg=NR_NO then
                preferred_newbasereg:=getaddressregister(list);
              if ref.shiftmode<>SM_None then
                begin
                  { "add" supports a superset of the shift modes supported by
                    load/store instructions }
                  shifterop_reset(so);
                  so.shiftmode:=ref.shiftmode;
                  so.shiftimm:=ref.shiftimm;
                  list.concat(taicpu.op_reg_reg_reg_shifterop(A_ADD,preferred_newbasereg,ref.base,ref.index,so));
                end
              else
                a_op_reg_reg_reg(list,OP_ADD,OS_ADDR,ref.index,ref.base,preferred_newbasereg);
              reference_reset_base(ref,preferred_newbasereg,ref.offset,ref.temppos,ref.alignment,ref.volatility);
              { fall through to the handling of base + offset, since the
                offset may still be too big }
            end;
          else
            internalerror(2014110901);
        end;
      end;
    { base + offset }
    if ref.base<>NR_NO then
      begin
        { valid offset for LDUR/STUR (9-bit signed immediate) -> use that }
        if (ref.addressmode=AM_OFFSET) and
           (op in [A_LDR,A_STR]) and
           (ref.offset>=-256) and
           (ref.offset<=255) then
          begin
            if op=A_LDR then
              op:=A_LDUR
            else
              op:=A_STUR
          end
        { if it's not a valid LDUR/STUR, use LDR/STR }
        else if (op in [A_LDUR,A_STUR]) and
           ((ref.offset<-256) or
            (ref.offset>255) or
            (ref.addressmode<>AM_OFFSET)) then
          begin
            if op=A_LDUR then
              op:=A_LDR
            else
              op:=A_STR
          end;
        case op of
          A_LDR,A_STR:
            begin
              case ref.addressmode of
                AM_PREINDEXED:
                  begin
                    { since the loaded/stored register cannot be the same
                      as the base register, we can safely add the
                      offset to the base if it doesn't fit}
                    if (ref.offset<-256) or
                       (ref.offset>255) then
                      begin
                        a_op_const_reg(list,OP_ADD,OS_ADDR,ref.offset,ref.base);
                        ref.offset:=0;
                      end;
                  end;
                AM_POSTINDEXED:
                  begin
                    { cannot emulate post-indexing if we have to fold the
                      offset into the base register }
                    if (ref.offset<-256) or
                       (ref.offset>255) then
                      internalerror(2014110909);
                    { ok }
                  end;
                AM_OFFSET:
                  begin
                    { unsupported offset -> fold into base register;
                      plain LDR/STR take an unsigned, access-size-scaled
                      12-bit immediate }
                    accesssize:=1 shl tcgsizep2size[size];
                    if (ref.offset<0) or
                       (ref.offset>(((1 shl 12)-1)*accesssize)) or
                       ((ref.offset mod accesssize)<>0) then
                      begin
                        if preferred_newbasereg=NR_NO then
                          preferred_newbasereg:=getaddressregister(list);
                        { can we split the offset between an
                          "add/sub (imm12 shl 12)" and the load (also an
                          imm12)?
                          -- the offset from the load will always be added,
                          that's why the lower bound has a smaller range
                          than the upper bound; it must also be a multiple
                          of the access size }
                        if (ref.offset>=-(((1 shl 12)-1) shl 12)) and
                           (ref.offset<=((1 shl 12)-1) shl 12 + ((1 shl 12)-1)) and
                           ((ref.offset mod accesssize)=0) then
                          begin
                            a_op_const_reg_reg(list,OP_ADD,OS_ADDR,(ref.offset shr 12) shl 12,ref.base,preferred_newbasereg);
                            ref.offset:=ref.offset-(ref.offset shr 12) shl 12;
                          end
                        else
                          begin
                            a_op_const_reg_reg(list,OP_ADD,OS_ADDR,ref.offset,ref.base,preferred_newbasereg);
                            ref.offset:=0;
                          end;
                        reference_reset_base(ref,preferred_newbasereg,ref.offset,ref.temppos,ref.alignment,ref.volatility);
                      end;
                  end
              end;
            end;
          A_LDP,A_STP:
            begin
              { unsupported offset -> fold into base register (these
                instructions support all addressmodes) }
              if (ref.offset<-(1 shl (6+tcgsizep2size[size]))) or
                 (ref.offset>(1 shl (6+tcgsizep2size[size]))-1) then
                begin
                  case ref.addressmode of
                    AM_POSTINDEXED:
                      { don't emulate post-indexing if we have to fold the
                        offset into the base register }
                      internalerror(2014110910);
                    AM_PREINDEXED:
                      { this means the offset must be added to the current
                        base register }
                      preferred_newbasereg:=ref.base;
                    AM_OFFSET:
                      if preferred_newbasereg=NR_NO then
                        preferred_newbasereg:=getaddressregister(list);
                  end;
                  a_op_const_reg_reg(list,OP_ADD,OS_ADDR,ref.offset,ref.base,preferred_newbasereg);
                  reference_reset_base(ref,preferred_newbasereg,0,ref.temppos,ref.alignment,ref.volatility);
                end
            end;
          A_LDUR,A_STUR:
            begin
              { valid, checked above }
            end;
          { todo }
          A_LD1,A_LD2,A_LD3,A_LD4,
          A_ST1,A_ST2,A_ST3,A_ST4:
            internalerror(2014110908);
          else
            internalerror(2014110708);
        end;
        { done }
        exit;
      end;
    { only an offset -> change to base (+ offset 0) }
    if preferred_newbasereg=NR_NO then
      preferred_newbasereg:=getaddressregister(list);
    a_load_const_reg(list,OS_ADDR,ref.offset,preferred_newbasereg);
    reference_reset_base(ref,preferred_newbasereg,0,ref.temppos,newalignment(8,ref.offset),ref.volatility);
  end;
  413. function tcgaarch64.makeregsize(reg: tregister; size: tcgsize): tregister;
  414. var
  415. subreg:Tsubregister;
  416. begin
  417. subreg:=cgsize2subreg(getregtype(reg),size);
  418. result:=reg;
  419. setsubreg(result,subreg);
  420. end;
  421. function tcgaarch64.getfpuregister(list: TAsmList; size: Tcgsize): Tregister;
  422. begin
  423. internalerror(2014122110);
  424. { squash warning }
  425. result:=NR_NO;
  426. end;
  427. function tcgaarch64.handle_load_store(list: TAsmList; op: tasmop; size: tcgsize; oppostfix: toppostfix; reg: tregister; ref: treference):treference;
  428. begin
  429. make_simple_ref(list,op,size,oppostfix,ref,NR_NO);
  430. list.concat(setoppostfix(taicpu.op_reg_ref(op,reg,ref),oppostfix));
  431. result:=ref;
  432. end;
  433. procedure tcgaarch64.handle_reg_imm12_reg(list: TAsmList; op: Tasmop; size: tcgsize; src: tregister; a: tcgint; dst: tregister; tmpreg: tregister; setflags, usedest: boolean);
  434. var
  435. instr: taicpu;
  436. so: tshifterop;
  437. hadtmpreg: boolean;
  438. begin
  439. { imm12 }
  440. if (a>=0) and
  441. (a<=((1 shl 12)-1)) then
  442. if usedest then
  443. instr:=taicpu.op_reg_reg_const(op,dst,src,a)
  444. else
  445. instr:=taicpu.op_reg_const(op,src,a)
  446. { imm12 lsl 12 }
  447. else if (a and not(((tcgint(1) shl 12)-1) shl 12))=0 then
  448. begin
  449. so.shiftmode:=SM_LSL;
  450. so.shiftimm:=12;
  451. if usedest then
  452. instr:=taicpu.op_reg_reg_const_shifterop(op,dst,src,a shr 12,so)
  453. else
  454. instr:=taicpu.op_reg_const_shifterop(op,src,a shr 12,so)
  455. end
  456. else
  457. begin
  458. { todo: other possible optimizations (e.g. load 16 bit constant in
  459. register and then add/sub/cmp/cmn shifted the rest) }
  460. if tmpreg=NR_NO then
  461. begin
  462. hadtmpreg:=false;
  463. tmpreg:=getintregister(list,size);
  464. end
  465. else
  466. begin
  467. hadtmpreg:=true;
  468. getcpuregister(list,tmpreg);
  469. end;
  470. a_load_const_reg(list,size,a,tmpreg);
  471. if usedest then
  472. instr:=taicpu.op_reg_reg_reg(op,dst,src,tmpreg)
  473. else
  474. instr:=taicpu.op_reg_reg(op,src,tmpreg);
  475. if hadtmpreg then
  476. ungetcpuregister(list,tmpreg);
  477. end;
  478. if setflags then
  479. setoppostfix(instr,PF_S);
  480. list.concat(instr);
  481. end;
  482. {****************************************************************************
  483. Assembler code
  484. ****************************************************************************}
procedure tcgaarch64.init_register_allocators;
  begin
    inherited init_register_allocators;
    { allocatable integer registers: X0..X17 and X19..X28.
      X18 is deliberately excluded (reserved platform register on some
      targets -- TODO confirm against target ABIs), as are X29/X30
      (frame pointer / link register) }
    rg[R_INTREGISTER]:=trgintcpu.create(R_INTREGISTER,R_SUBWHOLE,
        [RS_X0,RS_X1,RS_X2,RS_X3,RS_X4,RS_X5,RS_X6,RS_X7,RS_X8,
         RS_X9,RS_X10,RS_X11,RS_X12,RS_X13,RS_X14,RS_X15,RS_X16,RS_X17,
         RS_X19,RS_X20,RS_X21,RS_X22,RS_X23,RS_X24,RS_X25,RS_X26,RS_X27,RS_X28
         { maybe we can enable this in the future for leaf functions (it's
           the frame pointer)
          ,RS_X29 }],
        first_int_imreg,[]);
    { all 32 vector registers are allocatable; default subregister is the
      64-bit (double) view }
    rg[R_MMREGISTER]:=trgcpu.create(R_MMREGISTER,R_SUBMMD,
        [RS_Q0,RS_Q1,RS_Q2,RS_Q3,RS_Q4,RS_Q5,RS_Q6,RS_Q7,
         RS_Q8,RS_Q9,RS_Q10,RS_Q11,RS_Q12,RS_Q13,RS_Q14,RS_Q15,
         RS_Q16,RS_Q17,RS_Q18,RS_Q19,RS_Q20,RS_Q21,RS_Q22,RS_Q23,
         RS_Q24,RS_Q25,RS_Q26,RS_Q27,RS_Q28,RS_Q29,RS_Q30,RS_Q31],
        first_mm_imreg,[]);
  end;
procedure tcgaarch64.done_register_allocators;
  begin
    rg[R_INTREGISTER].free;
    { no FPU allocator is created in init_register_allocators on this
      target; free is nil-safe, so this is harmless }
    rg[R_FPUREGISTER].free;
    rg[R_MMREGISTER].free;
    inherited done_register_allocators;
  end;
  510. function tcgaarch64.getmmregister(list: TAsmList; size: tcgsize):tregister;
  511. begin
  512. case size of
  513. OS_F32:
  514. result:=rg[R_MMREGISTER].getregister(list,R_SUBMMS);
  515. OS_F64:
  516. result:=rg[R_MMREGISTER].getregister(list,R_SUBMMD)
  517. else
  518. internalerror(2014102701);
  519. end;
  520. end;
  521. procedure tcgaarch64.a_call_name(list: TAsmList; const s: string; weak: boolean);
  522. begin
  523. if not weak then
  524. list.concat(taicpu.op_sym(A_BL,current_asmdata.RefAsmSymbol(s,AT_FUNCTION)))
  525. else
  526. list.concat(taicpu.op_sym(A_BL,current_asmdata.WeakRefAsmSymbol(s,AT_FUNCTION)));
  527. end;
{ Emits an indirect call: branch-with-link through the address held in Reg. }
procedure tcgaarch64.a_call_reg(list:TAsmList;Reg:tregister);
  begin
    list.concat(taicpu.op_reg(A_BLR,reg));
  end;
  532. {********************** load instructions ********************}
{ Loads the constant a into integer register reg using as few instructions
  as possible: a single (shifted) MOVN when the value is mostly 1-bits, a
  single ORR with the zero register when the remainder is a valid logical
  immediate, otherwise a MOVZ followed by MOVKs for each non-zero 16 bit
  chunk. }
procedure tcgaarch64.a_load_const_reg(list: TAsmList; size: tcgsize; a: tcgint; reg : tregister);
  var
    preva: tcgint;          { value before the last 16 bit shift }
    opc: tasmop;
    shift,maxshift: byte;
    so: tshifterop;
    reginited: boolean;     { has reg been written yet (MOVZ vs MOVK)? }
    mask: tcgint;           { all-ones pattern for the operation size }
  begin
    { if we load a value into a 32 bit register, it is automatically
      zero-extended to 64 bit }
    if (hi(a)=0) and
       (size in [OS_64,OS_S64]) then
      begin
        size:=OS_32;
        reg:=makeregsize(reg,size);
      end;
    { values <= 32 bit are stored in a 32 bit register }
    if not(size in [OS_64,OS_S64]) then
      a:=cardinal(a);
    if size in [OS_64,OS_S64] then
      begin
        mask:=-1;
        maxshift:=64;
      end
    else
      begin
        mask:=$ffffffff;
        maxshift:=32;
      end;
    { single movn enough? (to be extended) }
    shift:=16;
    preva:=a;
    repeat
      { all bits from "shift" upwards are 1 -> a MOVN of the inverted low
        16 bits (optionally shifted) materialises the whole value }
      if (a shr shift)=(mask shr shift) then
        begin
          if shift=16 then
            list.concat(taicpu.op_reg_const(A_MOVN,reg,not(word(preva))))
          else
            begin
              shifterop_reset(so);
              so.shiftmode:=SM_LSL;
              so.shiftimm:=shift-16;
              list.concat(taicpu.op_reg_const_shifterop(A_MOVN,reg,not(word(preva)),so));
            end;
          exit;
        end;
      { only try the next 16 bits if the current one is all 1 bits, since
        the movn will set all lower bits to 1 }
      if word(a shr (shift-16))<>$ffff then
        break;
      inc(shift,16);
    until shift=maxshift;
    reginited:=false;
    shift:=0;
    { can be optimized later to use more movn }
    repeat
      { leftover is shifterconst? (don't check if we can represent it just
        as effectively with movz/movk, as this check is expensive) }
      if ((shift<tcgsize2size[size]*(8 div 2)) and
          (word(a)<>0) and
          ((a shr 16)<>0)) and
         is_shifter_const(a shl shift,size) then
        begin
          if reginited then
            list.concat(taicpu.op_reg_reg_const(A_ORR,reg,reg,a shl shift))
          else
            { ORR with the zero register = plain load of the constant }
            list.concat(taicpu.op_reg_reg_const(A_ORR,reg,makeregsize(NR_XZR,size),a shl shift));
          exit;
        end;
      { set all 16 bit parts <> 0 }
      if (word(a)<>0) or
         ((shift=0) and
          (a=0)) then
        if shift=0 then
          begin
            { first chunk: MOVZ clears the rest of the register }
            list.concat(taicpu.op_reg_const(A_MOVZ,reg,word(a)));
            reginited:=true;
          end
        else
          begin
            shifterop_reset(so);
            so.shiftmode:=SM_LSL;
            so.shiftimm:=shift;
            { first written chunk uses MOVZ, later chunks MOVK so that the
              already-written parts are preserved }
            if not reginited then
              begin
                opc:=A_MOVZ;
                reginited:=true;
              end
            else
              opc:=A_MOVK;
            list.concat(taicpu.op_reg_const_shifterop(opc,reg,word(a),so));
          end;
      preva:=a;
      a:=a shr 16;
      inc(shift,16);
    until word(preva)=preva;
    { at least one instruction must have been emitted }
    if not reginited then
      internalerror(2014102702);
  end;
  633. procedure tcgaarch64.a_load_const_ref(list: TAsmList; size: tcgsize; a: tcgint; const ref: treference);
  634. var
  635. reg: tregister;
  636. href: treference;
  637. i: Integer;
  638. begin
  639. { use the zero register if possible }
  640. if a=0 then
  641. begin
  642. href:=ref;
  643. inc(href.offset,tcgsize2size[size]-1);
  644. if (tcgsize2size[size]>1) and (ref.alignment=1) and (simple_ref_type(A_STUR,OS_8,PF_None,ref)=sr_simple) and
  645. (simple_ref_type(A_STUR,OS_8,PF_None,href)=sr_simple) then
  646. begin
  647. href:=ref;
  648. for i:=0 to tcgsize2size[size]-1 do
  649. begin
  650. a_load_const_ref(list,OS_8,0,href);
  651. inc(href.offset);
  652. end;
  653. end
  654. else
  655. begin
  656. if size in [OS_64,OS_S64] then
  657. reg:=NR_XZR
  658. else
  659. reg:=NR_WZR;
  660. a_load_reg_ref(list,size,size,reg,ref);
  661. end;
  662. end
  663. else
  664. inherited;
  665. end;
  666. procedure tcgaarch64.a_load_reg_ref(list: TAsmList; fromsize, tosize: tcgsize; reg: tregister; const ref: treference);
  667. var
  668. oppostfix:toppostfix;
  669. hreg: tregister;
  670. begin
  671. if tcgsize2Size[fromsize]>=tcgsize2Size[tosize] then
  672. begin
  673. fromsize:=tosize;
  674. reg:=makeregsize(list,reg,fromsize);
  675. end
  676. { have a 32 bit register but need a 64 bit one? }
  677. else if tosize in [OS_64,OS_S64] then
  678. begin
  679. { sign extend if necessary }
  680. if fromsize in [OS_S8,OS_S16,OS_S32] then
  681. begin
  682. { can't overwrite reg, may be a constant reg }
  683. hreg:=getintregister(list,tosize);
  684. a_load_reg_reg(list,fromsize,tosize,reg,hreg);
  685. reg:=hreg;
  686. end
  687. else
  688. { top 32 bit are zero by default }
  689. reg:=makeregsize(reg,OS_64);
  690. fromsize:=tosize;
  691. end;
  692. if (ref.alignment<>0) and
  693. (ref.alignment<tcgsize2size[tosize]) then
  694. begin
  695. a_load_reg_ref_unaligned(list,fromsize,tosize,reg,ref);
  696. end
  697. else
  698. begin
  699. case tosize of
  700. { signed integer registers }
  701. OS_8,
  702. OS_S8:
  703. oppostfix:=PF_B;
  704. OS_16,
  705. OS_S16:
  706. oppostfix:=PF_H;
  707. OS_32,
  708. OS_S32,
  709. OS_64,
  710. OS_S64:
  711. oppostfix:=PF_None;
  712. else
  713. InternalError(200308299);
  714. end;
  715. handle_load_store(list,A_STR,tosize,oppostfix,reg,ref);
  716. end;
  717. end;
  718. procedure tcgaarch64.a_load_ref_reg(list: TAsmList; fromsize, tosize: tcgsize; const ref: treference; reg: tregister);
  719. var
  720. oppostfix:toppostfix;
  721. begin
  722. if tcgsize2Size[fromsize]>=tcgsize2Size[tosize] then
  723. fromsize:=tosize;
  724. { ensure that all bits of the 32/64 register are always correctly set:
  725. * default behaviour is always to zero-extend to the entire (64 bit)
  726. register -> unsigned 8/16/32 bit loads only exist with a 32 bit
  727. target register, as the upper 32 bit will be zeroed implicitly
  728. -> always make target register 32 bit
  729. * signed loads exist both with 32 and 64 bit target registers,
  730. depending on whether the value should be sign extended to 32 or
  731. to 64 bit (if sign extended to 32 bit, the upper 32 bits of the
  732. corresponding 64 bit register are again zeroed) -> no need to
  733. change anything (we only have 32 and 64 bit registers), except that
  734. when loading an OS_S32 to a 32 bit register, we don't need/can't
  735. use sign extension
  736. }
  737. if fromsize in [OS_8,OS_16,OS_32] then
  738. reg:=makeregsize(reg,OS_32);
  739. if (ref.alignment<>0) and
  740. (ref.alignment<tcgsize2size[fromsize]) then
  741. begin
  742. a_load_ref_reg_unaligned(list,fromsize,tosize,ref,reg);
  743. exit;
  744. end;
  745. case fromsize of
  746. { signed integer registers }
  747. OS_8:
  748. oppostfix:=PF_B;
  749. OS_S8:
  750. oppostfix:=PF_SB;
  751. OS_16:
  752. oppostfix:=PF_H;
  753. OS_S16:
  754. oppostfix:=PF_SH;
  755. OS_S32:
  756. if getsubreg(reg)=R_SUBD then
  757. oppostfix:=PF_NONE
  758. else
  759. oppostfix:=PF_SW;
  760. OS_32,
  761. OS_64,
  762. OS_S64:
  763. oppostfix:=PF_None;
  764. else
  765. InternalError(200308297);
  766. end;
  767. handle_load_store(list,A_LDR,fromsize,oppostfix,reg,ref);
  768. { clear upper 16 bits if the value was negative }
  769. if (fromsize=OS_S8) and (tosize=OS_16) then
  770. a_load_reg_reg(list,fromsize,tosize,reg,reg);
  771. end;
{ Loads a value from a reference whose alignment is smaller than the value
  size by combining several smaller (sufficiently aligned) loads with BFI
  bitfield inserts. }
procedure tcgaarch64.a_load_ref_reg_unaligned(list: TAsmList; fromsize, tosize: tcgsize; const ref: treference; register: tregister);
  var
    href: treference;
    hreg1, hreg2, tmpreg,tmpreg2: tregister;
    i : Integer;
  begin
    case fromsize of
      OS_64,OS_S64:
        begin
          { split into two 32 bit loads }
          hreg1:=getintregister(list,OS_32);
          hreg2:=getintregister(list,OS_32);
          { on big endian targets the half at the lower address is the most
            significant one -> swap the destination registers }
          if target_info.endian=endian_big then
            begin
              tmpreg:=hreg1;
              hreg1:=hreg2;
              hreg2:=tmpreg;
            end;
          { can we use LDP? }
          if (ref.alignment=4) and
             (simple_ref_type(A_LDP,OS_32,PF_None,ref)=sr_simple) then
            list.concat(taicpu.op_reg_reg_ref(A_LDP,hreg1,hreg2,ref))
          else
            begin
              a_load_ref_reg(list,OS_32,OS_32,ref,hreg1);
              href:=ref;
              inc(href.offset,4);
              a_load_ref_reg(list,OS_32,OS_32,href,hreg2);
            end;
          { zero-extended low half ... }
          a_load_reg_reg(list,OS_32,OS_64,hreg1,register);
          { ... plus the high half inserted into bits 32..63 }
          list.concat(taicpu.op_reg_reg_const_const(A_BFI,register,makeregsize(hreg2,OS_64),32,32));
        end;
      OS_16,OS_S16,
      OS_32,OS_S32:
        begin
          if ref.alignment=2 then
            begin
              { 2-byte aligned: assemble the value from 16 bit loads }
              href:=ref;
              { for big endian, start at the most significant part (lowest
                address) and walk the offset backwards in the loop below }
              if target_info.endian=endian_big then
                inc(href.offset,tcgsize2size[fromsize]-2);
              tmpreg:=getintregister(list,OS_32);
              a_load_ref_reg(list,OS_16,OS_32,href,tmpreg);
              tmpreg2:=getintregister(list,OS_32);
              for i:=1 to (tcgsize2size[fromsize]-1) div 2 do
                begin
                  if target_info.endian=endian_big then
                    dec(href.offset,2)
                  else
                    inc(href.offset,2);
                  a_load_ref_reg(list,OS_16,OS_32,href,tmpreg2);
                  { insert the freshly loaded 16 bits at bit position i*16 }
                  list.concat(taicpu.op_reg_reg_const_const(A_BFI,tmpreg,tmpreg2,i*16,16));
                end;
              { final zero/sign extension to the requested target size }
              a_load_reg_reg(list,fromsize,tosize,tmpreg,register);
            end
          else
            begin
              { completely unaligned: assemble the value from byte loads }
              href:=ref;
              if target_info.endian=endian_big then
                inc(href.offset,tcgsize2size[fromsize]-1);
              tmpreg:=getintregister(list,OS_32);
              a_load_ref_reg(list,OS_8,OS_32,href,tmpreg);
              tmpreg2:=getintregister(list,OS_32);
              for i:=1 to tcgsize2size[fromsize]-1 do
                begin
                  if target_info.endian=endian_big then
                    dec(href.offset)
                  else
                    inc(href.offset);
                  a_load_ref_reg(list,OS_8,OS_32,href,tmpreg2);
                  { insert the freshly loaded byte at bit position i*8 }
                  list.concat(taicpu.op_reg_reg_const_const(A_BFI,tmpreg,tmpreg2,i*8,8));
                end;
              a_load_reg_reg(list,fromsize,tosize,tmpreg,register);
            end;
        end;
      else
        inherited;
    end;
  end;
{ Register-to-register move, inserting the required zero/sign extension or
  truncation instruction when the sizes differ in a way that a plain MOV
  would not handle. }
procedure tcgaarch64.a_load_reg_reg(list:TAsmList;fromsize,tosize:tcgsize;reg1,reg2:tregister);
  var
    instr: taicpu;
  begin
    { we use both 32 and 64 bit registers -> insert conversion when
      we have to truncate/sign extend inside the (32 or 64 bit) register
      holding the value, and when we sign extend from a 32 to a 64 bit
      register }
    if (tcgsize2size[fromsize]>tcgsize2size[tosize]) or
       ((tcgsize2size[fromsize]=tcgsize2size[tosize]) and
        (fromsize<>tosize) and
        not(fromsize in [OS_32,OS_S32,OS_64,OS_S64])) or
       ((fromsize in [OS_S8,OS_S16,OS_S32]) and
        (tosize in [OS_64,OS_S64])) or
       { needs to mask out the sign in the top 16 bits }
       ((fromsize=OS_S8) and
        (tosize=OS_16)) then
      begin
        { pick the extension instruction based on the target size }
        case tosize of
          OS_8:
            list.concat(taicpu.op_reg_reg(A_UXTB,reg2,makeregsize(reg1,OS_32)));
          OS_16:
            list.concat(taicpu.op_reg_reg(A_UXTH,reg2,makeregsize(reg1,OS_32)));
          OS_S8:
            list.concat(taicpu.op_reg_reg(A_SXTB,reg2,makeregsize(reg1,OS_32)));
          OS_S16:
            list.concat(taicpu.op_reg_reg(A_SXTH,reg2,makeregsize(reg1,OS_32)));
          { while "mov wN, wM" automatically inserts a zero-extension and
            hence we could encode a 64->32 bit move like that, the problem
            is that we then can't distinguish 64->32 from 32->32 moves, and
            the 64->32 truncation could be removed altogether... So use a
            different instruction }
          OS_32,
          OS_S32:
            { in theory, reg1 should be 64 bit here (since fromsize>tosize),
              but because of the way location_force_register() tries to
              avoid superfluous zero/sign extensions, it's not always the
              case -> also force reg1 to 64 bit }
            list.concat(taicpu.op_reg_reg_const_const(A_UBFIZ,makeregsize(reg2,OS_64),makeregsize(reg1,OS_64),0,32));
          OS_64,
          OS_S64:
            { only the sign-extending 32->64 case reaches this branch }
            list.concat(taicpu.op_reg_reg(A_SXTW,reg2,makeregsize(reg1,OS_32)));
          else
            internalerror(2002090901);
        end;
      end
    else
      begin
        { 32 -> 32 bit move implies zero extension (sign extensions have
          been handled above) -> also use for 32 <-> 64 bit moves }
        if not(fromsize in [OS_64,OS_S64]) or
           not(tosize in [OS_64,OS_S64]) then
          instr:=taicpu.op_reg_reg(A_MOV,makeregsize(reg2,OS_32),makeregsize(reg1,OS_32))
        else
          instr:=taicpu.op_reg_reg(A_MOV,reg2,reg1);
        list.Concat(instr);
        { Notify the register allocator that we have written a move instruction so
          it can try to eliminate it. }
        add_move_instruction(instr);
      end;
  end;
{ Loads the address described by ref into register r ("LEA"). }
procedure tcgaarch64.a_loadaddr_ref_reg(list: TAsmList; const ref: treference; r: tregister);
  var
    href: treference;
    so: tshifterop;
    op: tasmop;
  begin
    op:=A_LDR;
    href:=ref;
    { simplify as if we're going to perform a regular 64 bit load, using
      "r" as the new base register if possible/necessary }
    make_simple_ref(list,op,OS_ADDR,PF_None,href,r);
    { load literal? }
    if assigned(href.symbol) then
      begin
        { after simplification only a bare pc-relative symbol reference
          with attached symbol data may remain -> ADR }
        if (href.base<>NR_NO) or
           (href.index<>NR_NO) or
           not assigned(href.symboldata) then
          internalerror(2014110912);
        list.concat(taicpu.op_reg_sym_ofs(A_ADR,r,href.symbol,href.offset));
      end
    else
      begin
        if href.index<>NR_NO then
          begin
            if href.shiftmode<>SM_None then
              begin
                { "add" supports a superset of the shift modes supported by
                  load/store instructions }
                shifterop_reset(so);
                so.shiftmode:=href.shiftmode;
                so.shiftimm:=href.shiftimm;
                list.concat(taicpu.op_reg_reg_reg_shifterop(A_ADD,r,href.base,href.index,so));
              end
            else
              a_op_reg_reg_reg(list,OP_ADD,OS_ADDR,href.index,href.base,r);
          end
        else if href.offset<>0 then
          a_op_const_reg_reg(list,OP_ADD,OS_ADDR,href.offset,href.base,r)
        else
          { plain base register -> simple register move }
          a_load_reg_reg(list,OS_ADDR,OS_ADDR,href.base,r);
      end;
  end;
{ Legacy FPU register move; unused on aarch64 (calling it is an internal
  error -- floating point values are handled via the a_loadmm_* methods). }
procedure tcgaarch64.a_loadfpu_reg_reg(list: TAsmList; fromsize, tosize: tcgsize; reg1, reg2: tregister);
  begin
    internalerror(2014122107)
  end;
{ Legacy FPU load; unused on aarch64 (calling it is an internal error --
  floating point values are handled via the a_loadmm_* methods). }
procedure tcgaarch64.a_loadfpu_ref_reg(list: TAsmList; fromsize, tosize: tcgsize; const ref: treference; reg: tregister);
  begin
    internalerror(2014122108)
  end;
{ Legacy FPU store; unused on aarch64 (calling it is an internal error --
  floating point values are handled via the a_loadmm_* methods). }
procedure tcgaarch64.a_loadfpu_reg_ref(list: TAsmList; fromsize, tosize: tcgsize; reg: tregister; const ref: treference);
  begin
    internalerror(2014122109)
  end;
  965. procedure tcgaarch64.a_loadmm_reg_reg(list: TAsmList; fromsize, tosize: tcgsize; reg1, reg2: tregister; shuffle: pmmshuffle);
  966. var
  967. instr: taicpu;
  968. begin
  969. if assigned(shuffle) and
  970. not shufflescalar(shuffle) then
  971. internalerror(2014122104);
  972. if fromsize=tosize then
  973. begin
  974. instr:=taicpu.op_reg_reg(A_FMOV,reg2,reg1);
  975. { Notify the register allocator that we have written a move
  976. instruction so it can try to eliminate it. }
  977. add_move_instruction(instr);
  978. { FMOV cannot generate a floating point exception }
  979. end
  980. else
  981. begin
  982. if (reg_cgsize(reg1)<>fromsize) or
  983. (reg_cgsize(reg2)<>tosize) then
  984. internalerror(2014110913);
  985. instr:=taicpu.op_reg_reg(A_FCVT,reg2,reg1);
  986. maybe_check_for_fpu_exception(list);
  987. end;
  988. list.Concat(instr);
  989. end;
  990. procedure tcgaarch64.a_loadmm_ref_reg(list: TAsmList; fromsize, tosize: tcgsize; const ref: treference; reg: tregister; shuffle: pmmshuffle);
  991. var
  992. tmpreg: tregister;
  993. begin
  994. if assigned(shuffle) and
  995. not shufflescalar(shuffle) then
  996. internalerror(2014122105);
  997. tmpreg:=NR_NO;
  998. if (fromsize<>tosize) then
  999. begin
  1000. tmpreg:=reg;
  1001. reg:=getmmregister(list,fromsize);
  1002. end;
  1003. handle_load_store(list,A_LDR,fromsize,PF_None,reg,ref);
  1004. if (fromsize<>tosize) then
  1005. a_loadmm_reg_reg(list,fromsize,tosize,reg,tmpreg,nil);
  1006. end;
  1007. procedure tcgaarch64.a_loadmm_reg_ref(list: TAsmList; fromsize, tosize: tcgsize; reg: tregister; const ref: treference; shuffle: pmmshuffle);
  1008. var
  1009. tmpreg: tregister;
  1010. begin
  1011. if assigned(shuffle) and
  1012. not shufflescalar(shuffle) then
  1013. internalerror(2014122106);
  1014. if (fromsize<>tosize) then
  1015. begin
  1016. tmpreg:=getmmregister(list,tosize);
  1017. a_loadmm_reg_reg(list,fromsize,tosize,reg,tmpreg,nil);
  1018. reg:=tmpreg;
  1019. end;
  1020. handle_load_store(list,A_STR,tosize,PF_NONE,reg,ref);
  1021. end;
{ Moves the bits of an integer register into an mm register via INS,
  without any value conversion.  Source and target size must be equal
  and 4 or 8 bytes; only scalar shuffles are supported. }
procedure tcgaarch64.a_loadmm_intreg_reg(list: TAsmList; fromsize, tosize: tcgsize; intreg, mmreg: tregister; shuffle: pmmshuffle);
  begin
    if not shufflescalar(shuffle) then
      internalerror(2014122801);
    if not(tcgsize2size[fromsize] in [4,8]) or
       (tcgsize2size[fromsize]<>tcgsize2size[tosize]) then
      internalerror(2014122803);
    list.concat(taicpu.op_reg_reg(A_INS,mmreg,intreg));
  end;
  1031. procedure tcgaarch64.a_loadmm_reg_intreg(list: TAsmList; fromsize, tosize: tcgsize; mmreg, intreg: tregister; shuffle: pmmshuffle);
  1032. var
  1033. r : tregister;
  1034. begin
  1035. if not shufflescalar(shuffle) then
  1036. internalerror(2014122802);
  1037. if not(tcgsize2size[fromsize] in [4,8]) or
  1038. (tcgsize2size[fromsize]>tcgsize2size[tosize]) then
  1039. internalerror(2014122804);
  1040. if tcgsize2size[fromsize]<tcgsize2size[tosize] then
  1041. r:=makeregsize(intreg,fromsize)
  1042. else
  1043. r:=intreg;
  1044. list.concat(taicpu.op_reg_reg(A_UMOV,r,mmreg));
  1045. end;
{ Applies op to an mm register.  Only OP_XOR is implemented, and only for
  zeroing a register (src=dst). }
procedure tcgaarch64.a_opmm_reg_reg(list: TAsmList; Op: TOpCG; size: tcgsize; src, dst: tregister; shuffle: pmmshuffle);
  begin
    case op of
      { "xor Vx,Vx" is used to initialize global regvars to 0 }
      OP_XOR:
        begin
          if shuffle=nil then
            begin
              { full-vector EOR of the register with itself }
              dst:=newreg(R_MMREGISTER,getsupreg(dst),R_SUBMM16B);
              src:=newreg(R_MMREGISTER,getsupreg(src),R_SUBMM16B);
              list.concat(taicpu.op_reg_reg_reg(A_EOR,dst,dst,src))
            end
          { NOTE(review): with shuffle<>nil the "assigned(shuffle)" term below
            is always true, so this branch always errors out and the MOVI
            case appears unreachable -- confirm intent }
          else if (src<>dst) or
             (reg_cgsize(src)<>size) or
             assigned(shuffle) then
            internalerror(2015011401)
          else
            case size of
              OS_F32,
              OS_F64:
                list.concat(taicpu.op_reg_const(A_MOVI,makeregsize(dst,OS_F64),0));
              else
                internalerror(2015011402);
            end;
        end
      else
        internalerror(2015011403);
    end;
  end;
{ Stores the index of the lowest (reverse=false) or highest (reverse=true)
  set bit of src in dst; if src is 0, dst is set to 255. }
procedure tcgaarch64.a_bit_scan_reg_reg(list: TAsmList; reverse: boolean; srcsize, dstsize: tcgsize; src, dst: TRegister);
  var
    bitsize: longint;
  begin
    if srcsize in [OS_64,OS_S64] then
      begin
        bitsize:=64;
      end
    else
      begin
        bitsize:=32;
      end;
    { source is 0 -> dst will have to become 255 }
    list.concat(taicpu.op_reg_const(A_CMP,src,0));
    if reverse then
      begin
        { index of highest set bit = bitsize-1 - (leading zero count) }
        list.Concat(taicpu.op_reg_reg(A_CLZ,makeregsize(dst,srcsize),src));
        { xor 31/63 is the same as setting the lower 5/6 bits to
          "31/63-(lower 5/6 bits of dst)" }
        list.Concat(taicpu.op_reg_reg_const(A_EOR,dst,dst,bitsize-1));
      end
    else
      begin
        { index of lowest set bit = leading zero count of the bit-reversed
          value }
        list.Concat(taicpu.op_reg_reg(A_RBIT,makeregsize(dst,srcsize),src));
        list.Concat(taicpu.op_reg_reg(A_CLZ,dst,dst));
      end;
    { set dst to -1 if src was 0 }
    list.Concat(taicpu.op_reg_reg_reg_cond(A_CSINV,dst,dst,makeregsize(NR_XZR,dstsize),C_NE));
    { mask the -1 to 255 if src was 0 (anyone find a two-instruction
      branch-free version? All of mine are 3...) }
    list.Concat(taicpu.op_reg_reg(A_UXTB,makeregsize(dst,OS_32),makeregsize(dst,OS_32)));
  end;
{ Stores a 64 bit register to an insufficiently aligned reference by
  splitting it into two 32 bit stores (STP when possible); smaller sizes
  are delegated to the generic implementation. }
procedure tcgaarch64.a_load_reg_ref_unaligned(list: TAsmList; fromsize, tosize: tcgsize; register: tregister; const ref: treference);
  var
    href: treference;
    hreg1, hreg2, tmpreg: tregister;
  begin
    if fromsize in [OS_64,OS_S64] then
      begin
        { split into two 32 bit stores }
        hreg1:=getintregister(list,OS_32);
        hreg2:=getintregister(list,OS_32);
        { hreg1 := low half, hreg2 := high half }
        a_load_reg_reg(list,OS_32,OS_32,makeregsize(register,OS_32),hreg1);
        a_op_const_reg_reg(list,OP_SHR,OS_64,32,register,makeregsize(hreg2,OS_64));
        { on big endian targets the most significant half goes to the
          lower address -> swap }
        if target_info.endian=endian_big then
          begin
            tmpreg:=hreg1;
            hreg1:=hreg2;
            hreg2:=tmpreg;
          end;
        { can we use STP? }
        if (ref.alignment=4) and
           (simple_ref_type(A_STP,OS_32,PF_None,ref)=sr_simple) then
          list.concat(taicpu.op_reg_reg_ref(A_STP,hreg1,hreg2,ref))
        else
          begin
            a_load_reg_ref(list,OS_32,OS_32,hreg1,ref);
            href:=ref;
            inc(href.offset,4);
            a_load_reg_ref(list,OS_32,OS_32,hreg2,href);
          end;
      end
    else
      inherited;
  end;
  1140. procedure tcgaarch64.maybeadjustresult(list: TAsmList; op: topcg; size: tcgsize; dst: tregister);
  1141. const
  1142. overflowops = [OP_MUL,OP_IMUL,OP_SHL,OP_ADD,OP_SUB,OP_NOT,OP_NEG];
  1143. begin
  1144. if (op in overflowops) and
  1145. (size in [OS_8,OS_S8,OS_16,OS_S16]) then
  1146. a_load_reg_reg(list,OS_32,size,makeregsize(dst,OS_32),makeregsize(dst,OS_32))
  1147. end;
  1148. procedure tcgaarch64.a_op_const_reg(list: TAsmList; op: topcg; size: tcgsize; a: tcgint; reg: tregister);
  1149. begin
  1150. optimize_op_const(size,op,a);
  1151. case op of
  1152. OP_NONE:
  1153. exit;
  1154. OP_MOVE:
  1155. a_load_const_reg(list,size,a,reg);
  1156. OP_NEG,OP_NOT:
  1157. internalerror(200306011);
  1158. else
  1159. a_op_const_reg_reg(list,op,size,a,reg,reg);
  1160. end;
  1161. end;
  1162. procedure tcgaarch64.a_op_reg_reg(list:TAsmList;op:topcg;size:tcgsize;src,dst:tregister);
  1163. begin
  1164. Case op of
  1165. OP_NEG,
  1166. OP_NOT:
  1167. begin
  1168. list.concat(taicpu.op_reg_reg(TOpCG2AsmOpReg[op],dst,src));
  1169. maybeadjustresult(list,op,size,dst);
  1170. end
  1171. else
  1172. a_op_reg_reg_reg(list,op,size,src,dst,dst);
  1173. end;
  1174. end;
{ dst := src op a, without setting flags/overflow information. }
procedure tcgaarch64.a_op_const_reg_reg(list: TAsmList; op: topcg; size: tcgsize; a: tcgint; src, dst: tregister);
  var
    l: tlocation;  { dummy overflow location, receives LOC_VOID }
  begin
    a_op_const_reg_reg_checkoverflow(list,op,size,a,src,dst,false,l);
  end;
  1181. procedure tcgaarch64.a_op_reg_reg_reg(list: TAsmList; op: topcg; size: tcgsize; src1, src2, dst: tregister);
  1182. var
  1183. hreg: tregister;
  1184. begin
  1185. { no ROLV opcode... }
  1186. if op=OP_ROL then
  1187. begin
  1188. case size of
  1189. OS_32,OS_S32,
  1190. OS_64,OS_S64:
  1191. begin
  1192. hreg:=getintregister(list,size);
  1193. a_load_const_reg(list,size,tcgsize2size[size]*8,hreg);
  1194. a_op_reg_reg(list,OP_SUB,size,src1,hreg);
  1195. a_op_reg_reg_reg(list,OP_ROR,size,hreg,src2,dst);
  1196. exit;
  1197. end;
  1198. else
  1199. internalerror(2014111005);
  1200. end;
  1201. end
  1202. else if (op=OP_ROR) and
  1203. not(size in [OS_32,OS_S32,OS_64,OS_S64]) then
  1204. internalerror(2014111006);
  1205. if TOpCG2AsmOpReg[op]=A_NONE then
  1206. internalerror(2014111007);
  1207. list.concat(taicpu.op_reg_reg_reg(TOpCG2AsmOpReg[op],dst,src2,src1));
  1208. maybeadjustresult(list,op,size,dst);
  1209. end;
{ dst := src op a, optionally setting flags so the caller can perform an
  overflow check via ovloc (only meaningful for 64 bit add/sub here; other
  overflow-checked constant operations go through the register variant). }
procedure tcgaarch64.a_op_const_reg_reg_checkoverflow(list: TAsmList; op: topcg; size: tcgsize; a: tcgint; src, dst: tregister; setflags : boolean; var ovloc : tlocation);
  var
    shiftcountmask: longint;
    constreg: tregister;
  begin
    { add/sub instructions have only positive immediate operands }
    if (op in [OP_ADD,OP_SUB]) and
       (a<0) then
      begin
        if op=OP_ADD then
          op:=op_SUB
        else
          op:=OP_ADD;
        { avoid range/overflow error in case a = low(tcgint) }
{$push}{$r-}{$q-}
        a:=-a;
{$pop}
      end;
    ovloc.loc:=LOC_VOID;
    { canonicalise: may degrade the operation to a plain move or no-op }
    optimize_op_const(size,op,a);
    case op of
      OP_NONE:
        begin
          { operation has no effect -> just copy src to dst }
          a_load_reg_reg(list,size,size,src,dst);
          exit;
        end;
      OP_MOVE:
        begin
          { result is the constant itself }
          a_load_const_reg(list,size,a,dst);
          exit;
        end;
      else
        ;
    end;
    case op of
      OP_ADD,
      OP_SUB:
        begin
          handle_reg_imm12_reg(list,TOpCG2AsmOpImm[op],size,src,a,dst,NR_NO,setflags,true);
          { on a 64 bit target, overflows with smaller data types
            are handled via range errors }
          if setflags and
             (size in [OS_64,OS_S64]) then
            begin
              location_reset(ovloc,LOC_FLAGS,OS_8);
              { unsigned 64 bit: carry set (add) / carry clear (sub);
                signed: overflow flag }
              if size=OS_64 then
                if op=OP_ADD then
                  ovloc.resflags:=F_CS
                else
                  ovloc.resflags:=F_CC
              else
                ovloc.resflags:=F_VS;
            end;
        end;
      OP_OR,
      OP_AND,
      OP_XOR:
        begin
          { logical immediates must be encodable as "shifter constants";
            otherwise load the constant into a register first }
          if not(size in [OS_64,OS_S64]) then
            a:=cardinal(a);
          if is_shifter_const(a,size) then
            list.concat(taicpu.op_reg_reg_const(TOpCG2AsmOpReg[op],dst,src,a))
          else
            begin
              constreg:=getintregister(list,size);
              a_load_const_reg(list,size,a,constreg);
              a_op_reg_reg_reg(list,op,size,constreg,src,dst);
            end;
        end;
      OP_SHL,
      OP_SHR,
      OP_SAR:
        begin
          if size in [OS_64,OS_S64] then
            shiftcountmask:=63
          else
            shiftcountmask:=31;
          { a shift by 0 degenerates to a move }
          if (a and shiftcountmask)<>0 Then
            list.concat(taicpu.op_reg_reg_const(
              TOpCG2AsmOpImm[Op],dst,src,a and shiftcountmask))
          else
            a_load_reg_reg(list,size,size,src,dst);
          { shift counts beyond the register width are invalid }
          if (a and not(tcgint(shiftcountmask)))<>0 then
            internalError(2014112101);
        end;
      OP_ROL,
      OP_ROR:
        begin
          case size of
            OS_32,OS_S32:
              if (a and not(tcgint(31)))<>0 then
                internalError(2014112102);
            OS_64,OS_S64:
              if (a and not(tcgint(63)))<>0 then
                internalError(2014112103);
            else
              internalError(2014112104);
          end;
          { there's only a ror opcode }
          if op=OP_ROL then
            a:=(tcgsize2size[size]*8)-a;
          list.concat(taicpu.op_reg_reg_const(A_ROR,dst,src,a));
        end;
      OP_MUL,
      OP_IMUL,
      OP_DIV,
      OP_IDIV:
        begin
          { no immediate forms -> materialise the constant and delegate
            to the register variant (which handles overflow checking) }
          constreg:=getintregister(list,size);
          a_load_const_reg(list,size,a,constreg);
          a_op_reg_reg_reg_checkoverflow(list,op,size,constreg,src,dst,setflags,ovloc);
        end;
      else
        internalerror(2014111403);
    end;
    maybeadjustresult(list,op,size,dst);
  end;
{ dst := src2 op src1, optionally producing overflow information in ovloc.
  Overflow flags are only generated for 64 bit operations; add/sub use the
  processor flags directly, mul/imul compare the upper 64 bits of the full
  128 bit product. }
procedure tcgaarch64.a_op_reg_reg_reg_checkoverflow(list: TAsmList; op: topcg; size: tcgsize; src1, src2, dst: tregister; setflags : boolean; var ovloc : tlocation);
  var
    tmpreg1, tmpreg2: tregister;
  begin
    ovloc.loc:=LOC_VOID;
    { overflow can only occur with 64 bit calculations on 64 bit cpus }
    if setflags and
       (size in [OS_64,OS_S64]) then
      begin
        case op of
          OP_ADD,
          OP_SUB:
            begin
              { flag-setting form (postfix S) of the instruction }
              list.concat(setoppostfix(taicpu.op_reg_reg_reg(TOpCG2AsmOpReg[op],dst,src2,src1),PF_S));
              ovloc.loc:=LOC_FLAGS;
              if size=OS_64 then
                if op=OP_ADD then
                  ovloc.resflags:=F_CS
                else
                  ovloc.resflags:=F_CC
              else
                ovloc.resflags:=F_VS;
              { finished }
              exit;
            end;
          OP_MUL:
            begin
              { check whether the upper 64 bit of the 128 bit product is 0 }
              tmpreg1:=getintregister(list,OS_64);
              list.concat(taicpu.op_reg_reg_reg(A_UMULH,tmpreg1,src2,src1));
              list.concat(taicpu.op_reg_const(A_CMP,tmpreg1,0));
              ovloc.loc:=LOC_FLAGS;
              ovloc.resflags:=F_NE;
              { still have to perform the actual multiplication }
            end;
          OP_IMUL:
            begin
              { check whether the upper 64 bits of the 128 bit multiplication
                result have the same value as the replicated sign bit of the
                lower 64 bits }
              tmpreg1:=getintregister(list,OS_64);
              list.concat(taicpu.op_reg_reg_reg(A_SMULH,tmpreg1,src2,src1));
              { calculate lower 64 bits (afterwards, because dst may be
                equal to src1 or src2) }
              a_op_reg_reg_reg(list,op,size,src1,src2,dst);
              { replicate sign bit }
              tmpreg2:=getintregister(list,OS_64);
              a_op_const_reg_reg(list,OP_SAR,OS_S64,63,dst,tmpreg2);
              list.concat(taicpu.op_reg_reg(A_CMP,tmpreg1,tmpreg2));
              ovloc.loc:=LOC_FLAGS;
              ovloc.resflags:=F_NE;
              { finished }
              exit;
            end;
          OP_IDIV,
          OP_DIV:
            begin
              { not handled here, needs div-by-zero check (dividing by zero
                just gives a 0 result on aarch64), and low(int64) div -1
                check for overflow) }
              internalerror(2014122101);
            end;
          else
            internalerror(2019050936);
        end;
      end;
    { the actual (non-flag-setting) operation }
    a_op_reg_reg_reg(list,op,size,src1,src2,dst);
  end;
  1395. {*************** compare instructructions ****************}
  1396. procedure tcgaarch64.a_cmp_const_reg_label(list: TAsmList; size: tcgsize; cmp_op: topcmp; a: tcgint; reg: tregister; l: tasmlabel);
  1397. var
  1398. op: tasmop;
  1399. begin
  1400. if a>=0 then
  1401. op:=A_CMP
  1402. else
  1403. op:=A_CMN;
  1404. { avoid range/overflow error in case a=low(tcgint) }
  1405. {$push}{$r-}{$q-}
  1406. handle_reg_imm12_reg(list,op,size,reg,abs(a),NR_XZR,NR_NO,false,false);
  1407. {$pop}
  1408. a_jmp_cond(list,cmp_op,l);
  1409. end;
{ Compares two registers and jumps to l if cmp_op holds.  Note the operand
  order: CMP reg2,reg1 is emitted, matching the condition produced by
  a_jmp_cond. }
procedure tcgaarch64.a_cmp_reg_reg_label(list: TAsmList; size: tcgsize; cmp_op: topcmp; reg1,reg2: tregister; l: tasmlabel);
  begin
    list.concat(taicpu.op_reg_reg(A_CMP,reg2,reg1));
    a_jmp_cond(list,cmp_op,l);
  end;
  1415. procedure tcgaarch64.a_jmp_always(list: TAsmList; l: TAsmLabel);
  1416. var
  1417. ai: taicpu;
  1418. begin
  1419. ai:=TAiCpu.op_sym(A_B,current_asmdata.RefAsmSymbol(l.name,AT_FUNCTION));
  1420. ai.is_jmp:=true;
  1421. list.Concat(ai);
  1422. end;
  1423. procedure tcgaarch64.a_jmp_name(list: TAsmList; const s: string);
  1424. var
  1425. ai: taicpu;
  1426. begin
  1427. ai:=TAiCpu.op_sym(A_B,current_asmdata.RefAsmSymbol(s,AT_FUNCTION));
  1428. ai.is_jmp:=true;
  1429. list.Concat(ai);
  1430. end;
  1431. procedure tcgaarch64.a_jmp_cond(list: TAsmList; cond: TOpCmp; l: TAsmLabel);
  1432. var
  1433. ai: taicpu;
  1434. begin
  1435. ai:=TAiCpu.op_sym(A_B,l);
  1436. ai.is_jmp:=true;
  1437. ai.SetCondition(TOpCmp2AsmCond[cond]);
  1438. list.Concat(ai);
  1439. end;
  1440. procedure tcgaarch64.a_jmp_flags(list: TAsmList; const f: tresflags; l: tasmlabel);
  1441. var
  1442. ai : taicpu;
  1443. begin
  1444. ai:=Taicpu.op_sym(A_B,l);
  1445. ai.is_jmp:=true;
  1446. ai.SetCondition(flags_to_cond(f));
  1447. list.Concat(ai);
  1448. end;
{ Materialises the flags value f into reg: CSET writes 1 if the condition
  derived from f holds and 0 otherwise. }
procedure tcgaarch64.g_flags2reg(list: TAsmList; size: tcgsize; const f: tresflags; reg: tregister);
  begin
    list.concat(taicpu.op_reg_cond(A_CSET,reg,flags_to_cond(f)));
  end;
{ Not supported on AArch64: callers must use g_overflowcheck_loc with an
  explicit overflow location instead (see comment below). }
procedure tcgaarch64.g_overflowcheck(list: TAsmList; const loc: tlocation; def: tdef);
  begin
    { we need an explicit overflow location, because there are many
      possibilities (not just the overflow flag, which is only used for
      signed add/sub) }
    internalerror(2014112303);
  end;
{ Emits an overflow check based on the explicit overflow location ovloc:
  if the (inverted) overflow condition holds, execution falls through to a
  call of the RTL helper FPC_OVERFLOW; otherwise it jumps past the call.
  Does nothing unless overflow checking is enabled for this code. }
procedure tcgaarch64.g_overflowcheck_loc(list: TAsmList; const loc: tlocation; def: tdef; ovloc : tlocation);
  var
    hl : tasmlabel;
    hflags : tresflags;
  begin
    if not(cs_check_overflow in current_settings.localswitches) then
      exit;
    current_asmdata.getjumplabel(hl);
    case ovloc.loc of
      LOC_FLAGS:
        begin
          { invert the flags: jump over the helper call when NO overflow
            occurred }
          hflags:=ovloc.resflags;
          inverse_flags(hflags);
          cg.a_jmp_flags(list,hflags,hl);
        end;
      else
        { only flag-based overflow locations are supported here }
        internalerror(2014112304);
    end;
    a_call_name(list,'FPC_OVERFLOW',false);
    a_label(list,hl);
  end;
  1481. { *********** entry/exit code and address loading ************ }
{ Stores all registers of type rt in the range [lowsr..highsr] that are
  modified by the current procedure onto the stack, 16 bytes per store
  (pre-indexed from SP so the stack stays 16-byte aligned), and returns
  the total number of bytes pushed.
  On win64, matching SEH directives are emitted alongside each store; SEH
  can only describe pairs of CONSECUTIVE registers, so non-consecutive
  registers are stored individually with STR there. }
function tcgaarch64.save_regs(list: TAsmList; rt: tregistertype; lowsr, highsr: tsuperregister; sub: tsubregister): longint;
  var
    ref: treference;
    sr: tsuperregister;
    pairreg: tregister;
    sehreg,sehregp : TAsmSehDirective;
  begin
    result:=0;
    { each store pre-decrements SP by 16 }
    reference_reset_base(ref,NR_SP,-16,ctempposinvalid,16,[]);
    ref.addressmode:=AM_PREINDEXED;
    pairreg:=NR_NO;
    { for SEH on Win64 we can only store consecutive register pairs, others
      need to be stored with STR }
    if target_info.system=system_aarch64_win64 then
      begin
        { pick the SEH directives matching the register class }
        if rt=R_INTREGISTER then
          begin
            sehreg:=ash_savereg_x;
            sehregp:=ash_saveregp_x;
          end
        else if rt=R_MMREGISTER then
          begin
            sehreg:=ash_savefreg_x;
            sehregp:=ash_savefregp_x;
          end
        else
          internalerror(2020041304);
        for sr:=lowsr to highsr do
          if sr in rg[rt].used_in_proc then
            if pairreg=NR_NO then
              { remember the first register of a potential pair }
              pairreg:=newreg(rt,sr,sub)
            else
              begin
                inc(result,16);
                if getsupreg(pairreg)=sr-1 then
                  begin
                    { consecutive registers -> STP pair + pair directive }
                    list.concat(taicpu.op_reg_reg_ref(A_STP,pairreg,newreg(rt,sr,sub),ref));
                    list.concat(cai_seh_directive.create_reg_offset(sehregp,pairreg,16));
                    pairreg:=NR_NO;
                  end
                else
                  begin
                    { gap in the register numbers -> store the pending one
                      alone and start a new pair with the current one }
                    list.concat(taicpu.op_reg_ref(A_STR,pairreg,ref));
                    list.concat(cai_seh_directive.create_reg_offset(sehreg,pairreg,16));
                    pairreg:=newreg(rt,sr,sub);
                  end;
              end;
        { flush a possibly pending single register }
        if pairreg<>NR_NO then
          begin
            inc(result,16);
            list.concat(taicpu.op_reg_ref(A_STR,pairreg,ref));
            list.concat(cai_seh_directive.create_reg_offset(sehreg,pairreg,16));
          end;
      end
    else
      begin
        { store all used registers pairwise }
        for sr:=lowsr to highsr do
          if sr in rg[rt].used_in_proc then
            if pairreg=NR_NO then
              pairreg:=newreg(rt,sr,sub)
            else
              begin
                inc(result,16);
                list.concat(taicpu.op_reg_reg_ref(A_STP,pairreg,newreg(rt,sr,sub),ref));
                pairreg:=NR_NO
              end;
        { one left -> store twice (stack must be 16 bytes aligned) }
        if pairreg<>NR_NO then
          begin
            list.concat(taicpu.op_reg_reg_ref(A_STP,pairreg,pairreg,ref));
            inc(result,16);
          end;
      end;
  end;
  1557. procedure FixupOffsets(p:TObject;arg:pointer);
  1558. var
  1559. sym: tabstractnormalvarsym absolute p;
  1560. begin
  1561. if (tsym(p).typ in [paravarsym,localvarsym]) and
  1562. (sym.localloc.loc=LOC_REFERENCE) and
  1563. (sym.localloc.reference.base=NR_STACK_POINTER_REG) then
  1564. begin
  1565. sym.localloc.reference.base:=NR_FRAME_POINTER_REG;
  1566. dec(sym.localloc.reference.offset,PLongint(arg)^);
  1567. end;
  1568. end;
{ Generates the procedure prologue: saves FP/LR, sets up the frame pointer,
  saves the callee-saved integer and vector registers modified by this
  procedure, allocates the local stack space, and (on win64) emits the
  matching SEH unwind directives. Finally, if nested routines or win64
  exception constructs need it, all SP-relative variable locations are
  rewritten to be FP-relative (see the long comment below). }
procedure tcgaarch64.g_proc_entry(list: TAsmList; localsize: longint; nostackframe: boolean);
  var
    hitem: tlinkedlistitem;
    seh_proc: tai_seh_directive;
    templist: TAsmList;
    suppress_endprologue: boolean;
    ref: treference;
    totalstackframesize: longint;
  begin
    { remember the current end of the list: the .seh_proc directive must be
      inserted before everything generated below }
    hitem:=list.last;
    { pi_has_unwind_info may already be set at this point if there are
      SEH directives in assembler body. In this case, .seh_endprologue
      is expected to be one of those directives, and not generated here. }
    suppress_endprologue:=(pi_has_unwind_info in current_procinfo.flags);
    if not nostackframe then
      begin
        { stack pointer has to be aligned to 16 bytes at all times }
        localsize:=align(localsize,16);
        if target_info.system=system_aarch64_win64 then
          include(current_procinfo.flags,pi_has_unwind_info);
        { save stack pointer and return address }
        reference_reset_base(ref,NR_SP,-16,ctempposinvalid,16,[]);
        ref.addressmode:=AM_PREINDEXED;
        list.concat(taicpu.op_reg_reg_ref(A_STP,NR_FP,NR_LR,ref));
        if target_info.system=system_aarch64_win64 then
          list.concat(cai_seh_directive.create_offset(ash_savefplr_x,16));
        { initialise frame pointer }
        if current_procinfo.procdef.proctypeoption<>potype_exceptfilter then
          begin
            a_load_reg_reg(list,OS_ADDR,OS_ADDR,NR_SP,NR_FP);
            if target_info.system=system_aarch64_win64 then
              list.concat(cai_seh_directive.create(ash_setfp));
          end
        else
          begin
            { exception filters inherit the frame of the parent routine }
            gen_load_frame_for_exceptfilter(list);
            localsize:=current_procinfo.maxpushedparasize;
          end;
        totalstackframesize:=localsize;
        { save modified integer registers }
        inc(totalstackframesize,
          save_regs(list,R_INTREGISTER,RS_X19,RS_X28,R_SUBWHOLE));
        { only the lower 64 bits of the modified vector registers need to be
          saved; if the caller needs the upper 64 bits, it has to save them
          itself }
        inc(totalstackframesize,
          save_regs(list,R_MMREGISTER,RS_D8,RS_D15,R_SUBMMD));
        { allocate stack space }
        if localsize<>0 then
          begin
            localsize:=align(localsize,16);
            current_procinfo.final_localsize:=localsize;
            handle_reg_imm12_reg(list,A_SUB,OS_ADDR,NR_SP,localsize,NR_SP,NR_IP0,false,true);
            if target_info.system=system_aarch64_win64 then
              list.concat(cai_seh_directive.create_offset(ash_stackalloc,localsize));
          end;
        { By default, we use the frame pointer to access parameters passed via
          the stack and the stack pointer to address local variables and temps
          because
           a) we can use bigger positive than negative offsets (so accessing
              locals via negative offsets from the frame pointer would be less
              efficient)
           b) we don't know the local size while generating the code, so
              accessing the parameters via the stack pointer is not possible
              without copying them
          The problem with this is the get_frame() intrinsic:
           a) it must return the same value as what we pass as parentfp
              parameter, since that's how it's used in the TP-style objects unit
           b) its return value must be usable to access all local data from a
              routine (locals and parameters), since that's all the nested
              routines have access to
           c) its return value must be usable to construct a backtrace, as it's
              also used by the exception handling routines
          The solution we use here, based on something similar that's done in
          the MIPS port, is to generate all accesses to locals in the routine
          itself SP-relative, and then after the code is generated and the local
          size is known (namely, here), we change all SP-relative variables/
          parameters into FP-relative ones. This means that they'll be accessed
          less efficiently from nested routines, but those accesses are indirect
          anyway and at least this way they can be accessed at all
        }
        if current_procinfo.has_nestedprocs or
           (
             (target_info.system=system_aarch64_win64) and
             (current_procinfo.flags*[pi_has_implicit_finally,pi_needs_implicit_finally,pi_uses_exceptions]<>[])
           ) then
          begin
            current_procinfo.procdef.localst.SymList.ForEachCall(@FixupOffsets,@totalstackframesize);
            current_procinfo.procdef.parast.SymList.ForEachCall(@FixupOffsets,@totalstackframesize);
          end;
      end;
    if not (pi_has_unwind_info in current_procinfo.flags) then
      exit;
    { Generate unwind data for aarch64-win64 }
    seh_proc:=cai_seh_directive.create_name(ash_proc,current_procinfo.procdef.mangledname);
    if assigned(hitem) then
      list.insertafter(seh_proc,hitem)
    else
      list.insert(seh_proc);
    { the directive creates another section }
    inc(list.section_count);
    templist:=TAsmList.Create;
    if not suppress_endprologue then
      begin
        templist.concat(cai_seh_directive.create(ash_endprologue));
      end;
    { place .seh_endprologue after any explicitly recorded end-of-prologue
      position, otherwise at the current end of the generated code }
    if assigned(current_procinfo.endprologue_ai) then
      current_procinfo.aktproccode.insertlistafter(current_procinfo.endprologue_ai,templist)
    else
      list.concatlist(templist);
    templist.free;
  end;
{ No GOT pointer initialisation is needed for the supported targets. }
procedure tcgaarch64.g_maybe_got_init(list : TAsmList);
  begin
    { nothing to do on Darwin or Linux }
  end;
{ Intentionally empty: register restoring is handled in g_proc_exit. }
procedure tcgaarch64.g_restore_registers(list:TAsmList);
  begin
    { done in g_proc_exit }
  end;
{ Counterpart of save_regs: reloads the callee-saved registers of type rt
  in [lowsr..highsr] that were stored by the prologue, 16 bytes at a time
  with post-indexed loads from SP.
  Since the prologue pushed them from low to high register number with the
  stack growing down, the loads must happen in the reverse order: on win64
  the instructions are first collected in aiarr and then emitted backwards,
  on other targets the registers are walked from high to low directly. }
procedure tcgaarch64.load_regs(list: TAsmList; rt: tregistertype; lowsr, highsr: tsuperregister; sub: tsubregister);
  var
    ref: treference;
    sr, highestsetsr: tsuperregister;
    pairreg: tregister;
    i,
    regcount: longint;
    aiarr : array of tai;
  begin
    { each load post-increments SP by 16 }
    reference_reset_base(ref,NR_SP,16,ctempposinvalid,16,[]);
    ref.addressmode:=AM_POSTINDEXED;
    regcount:=0;
    { due to SEH on Win64 we can only load consecutive registers and single
      ones are done using LDR, so we need to handle this differently there }
    if target_info.system=system_aarch64_win64 then
      begin
        setlength(aiarr,highsr-lowsr+1);
        pairreg:=NR_NO;
        { mirror the pairing logic of save_regs: consecutive registers were
          stored with STP, others individually with STR }
        for sr:=lowsr to highsr do
          if sr in rg[rt].used_in_proc then
            begin
              if pairreg=NR_NO then
                pairreg:=newreg(rt,sr,sub)
              else
                begin
                  if getsupreg(pairreg)=sr-1 then
                    begin
                      aiarr[regcount]:=taicpu.op_reg_reg_ref(A_LDP,pairreg,newreg(rt,sr,sub),ref);
                      inc(regcount);
                      pairreg:=NR_NO;
                    end
                  else
                    begin
                      aiarr[regcount]:=taicpu.op_reg_ref(A_LDR,pairreg,ref);
                      inc(regcount);
                      pairreg:=newreg(rt,sr,sub);
                    end;
                end;
            end;
        { load a possibly pending single register }
        if pairreg<>NR_NO then
          begin
            aiarr[regcount]:=taicpu.op_reg_ref(A_LDR,pairreg,ref);
            inc(regcount);
            pairreg:=NR_NO;
          end;
        { emit in reverse order, since the last stored register sits on top
          of the stack }
        for i:=regcount-1 downto 0 do
          list.concat(aiarr[i]);
      end
    else
      begin
        { highest reg stored twice? }
        highestsetsr:=RS_NO;
        for sr:=lowsr to highsr do
          if sr in rg[rt].used_in_proc then
            begin
              inc(regcount);
              highestsetsr:=sr;
            end;
        { an odd number of registers was saved with the highest one stored
          twice (see save_regs) -> reload it with a single LDR first }
        if odd(regcount) then
          begin
            list.concat(taicpu.op_reg_ref(A_LDR,newreg(rt,highestsetsr,sub),ref));
            highestsetsr:=pred(highestsetsr);
          end;
        { load all (other) used registers pairwise }
        pairreg:=NR_NO;
        for sr:=highestsetsr downto lowsr do
          if sr in rg[rt].used_in_proc then
            if pairreg=NR_NO then
              pairreg:=newreg(rt,sr,sub)
            else
              begin
                list.concat(taicpu.op_reg_reg_ref(A_LDP,newreg(rt,sr,sub),pairreg,ref));
                pairreg:=NR_NO
              end;
      end;
    { There can't be any register left }
    if pairreg<>NR_NO then
      internalerror(2014112602);
  end;
{ Generates the procedure epilogue: frees the local stack space, restores
  the callee-saved registers stored by g_proc_entry, restores FP/LR, and
  emits RET (plus the closing SEH directives when unwind info is active).
  For noreturn routines only the RET is kept (see comment below). }
procedure tcgaarch64.g_proc_exit(list : TAsmList;parasize:longint;nostackframe:boolean);
  var
    ref: treference;
    regsstored: boolean;
    sr: tsuperregister;
  begin
    if not(nostackframe) and
       { we do not need an exit stack frame when we never return
         * the final ret is left so the peephole optimizer can easily do call/ret -> jmp or call conversions
         * the entry stack frame must be normally generated because the subroutine could be still left by
           an exception and then the unwinding code might need to restore the registers stored by the entry code
       }
       not(po_noreturn in current_procinfo.procdef.procoptions) then
      begin
        { if no registers have been stored, we don't have to subtract the
          allocated temp space from the stack pointer }
        regsstored:=false;
        for sr:=RS_X19 to RS_X28 do
          if sr in rg[R_INTREGISTER].used_in_proc then
            begin
              regsstored:=true;
              break;
            end;
        if not regsstored then
          for sr:=RS_D8 to RS_D15 do
            if sr in rg[R_MMREGISTER].used_in_proc then
              begin
                regsstored:=true;
                break;
              end;
        { restore registers (and stack pointer) }
        if regsstored then
          begin
            { free the local stack space so SP points at the register save
              area again }
            if current_procinfo.final_localsize<>0 then
              handle_reg_imm12_reg(list,A_ADD,OS_ADDR,NR_SP,current_procinfo.final_localsize,NR_SP,NR_IP0,false,true);
            { restore in the reverse order of g_proc_entry: vector registers
              were saved last, so they are reloaded first }
            load_regs(list,R_MMREGISTER,RS_D8,RS_D15,R_SUBMMD);
            load_regs(list,R_INTREGISTER,RS_X19,RS_X28,R_SUBWHOLE);
          end
        else if current_procinfo.final_localsize<>0 then
          { restore stack pointer }
          a_load_reg_reg(list,OS_ADDR,OS_ADDR,NR_FP,NR_SP);
        { restore framepointer and return address }
        reference_reset_base(ref,NR_SP,16,ctempposinvalid,16,[]);
        ref.addressmode:=AM_POSTINDEXED;
        list.concat(taicpu.op_reg_reg_ref(A_LDP,NR_FP,NR_LR,ref));
      end;
    { return }
    list.concat(taicpu.op_none(A_RET));
    if (pi_has_unwind_info in current_procinfo.flags) then
      begin
        tcpuprocinfo(current_procinfo).dump_scopes(list);
        list.concat(cai_seh_directive.create(ash_endproc));
      end;
  end;
{ Intentionally empty: register saving is handled in g_proc_entry. }
procedure tcgaarch64.g_save_registers(list : TAsmList);
  begin
    { done in g_proc_entry }
  end;
  1826. { ************* concatcopy ************ }
{ Copies len bytes from source to dest by calling the RTL helper FPC_MOVE
  (the compiler-internal entry of Move()). Used by g_concatcopy for copies
  too large to inline. }
procedure tcgaarch64.g_concatcopy_move(list : TAsmList;const source,dest : treference;len : tcgint);
  var
    paraloc1,paraloc2,paraloc3 : TCGPara;
    pd : tprocdef;
  begin
    pd:=search_system_proc('MOVE');
    paraloc1.init;
    paraloc2.init;
    paraloc3.init;
    paramanager.getcgtempparaloc(list,pd,1,paraloc1);
    paramanager.getcgtempparaloc(list,pd,2,paraloc2);
    paramanager.getcgtempparaloc(list,pd,3,paraloc3);
    { load the parameters last-to-first: count, dest, source }
    a_load_const_cgpara(list,OS_SINT,len,paraloc3);
    a_loadaddr_ref_cgpara(list,dest,paraloc2);
    a_loadaddr_ref_cgpara(list,source,paraloc1);
    paramanager.freecgpara(list,paraloc3);
    paramanager.freecgpara(list,paraloc2);
    paramanager.freecgpara(list,paraloc1);
    { the helper call clobbers all volatile registers }
    alloccpuregisters(list,R_INTREGISTER,paramanager.get_volatile_registers_int(pocall_default));
    alloccpuregisters(list,R_MMREGISTER,paramanager.get_volatile_registers_mm(pocall_default));
    a_call_name(list,'FPC_MOVE',false);
    dealloccpuregisters(list,R_MMREGISTER,paramanager.get_volatile_registers_mm(pocall_default));
    dealloccpuregisters(list,R_INTREGISTER,paramanager.get_volatile_registers_int(pocall_default));
    paraloc3.done;
    paraloc2.done;
    paraloc1.done;
  end;
{ Copies len bytes from source to dest. Depending on length and alignment
  this generates a single load/store, an unrolled copy sequence, an inline
  copy loop, or a call to FPC_MOVE. }
procedure tcgaarch64.g_concatcopy(list: TAsmList; const source, dest: treference; len: tcgint);
  var
    { set to true once the source/dest base register has been replaced by a
      copy in a temporary register (done by the helpers below) }
    sourcebasereplaced, destbasereplaced: boolean;
  { get optimal memory operation to use for loading/storing data
    in an unrolled loop }
  1859. procedure getmemop(scaledop, unscaledop: tasmop; const startref, endref: treference; opsize: tcgsize; postfix: toppostfix; out memop: tasmop; out needsimplify: boolean);
  1860. begin
  1861. if (simple_ref_type(scaledop,opsize,postfix,startref)=sr_simple) and
  1862. (simple_ref_type(scaledop,opsize,postfix,endref)=sr_simple) then
  1863. begin
  1864. memop:=unscaledop;
  1865. needsimplify:=true;
  1866. end
  1867. else if (unscaledop<>A_NONE) and
  1868. (simple_ref_type(unscaledop,opsize,postfix,startref)=sr_simple) and
  1869. (simple_ref_type(unscaledop,opsize,postfix,endref)=sr_simple) then
  1870. begin
  1871. memop:=unscaledop;
  1872. needsimplify:=false;
  1873. end
  1874. else
  1875. begin
  1876. memop:=scaledop;
  1877. needsimplify:=true;
  1878. end;
  1879. end;
{ adjust the offset and/or addressing mode after a load/store so it's
  correct for the next one of the same size (oplen = bytes transferred) }
procedure updaterefafterloadstore(var ref: treference; oplen: longint);
  begin
    case ref.addressmode of
      AM_OFFSET:
        { plain offset addressing -> simply advance the offset }
        inc(ref.offset,oplen);
      AM_POSTINDEXED:
        { base register updated by instruction, next offset can remain
          the same }
        ;
      AM_PREINDEXED:
        begin
          { base register updated by instruction -> next instruction can
            use post-indexing with offset = sizeof(operation) }
          ref.offset:=0;
          ref.addressmode:=AM_OFFSET;
        end;
    end;
  end;
{ generate a load/store and adjust the reference offset to the next
  memory location if necessary }
procedure genloadstore(list: TAsmList; op: tasmop; reg: tregister; var ref: treference; postfix: toppostfix; opsize: tcgsize);
  begin
    list.concat(setoppostfix(taicpu.op_reg_ref(op,reg,ref),postfix));
    updaterefafterloadstore(ref,tcgsize2size[opsize]);
  end;
{ generate a dual load/store (ldp/stp) and adjust the reference offset to
  the next memory location if necessary (two registers -> twice opsize) }
procedure gendualloadstore(list: TAsmList; op: tasmop; reg1, reg2: tregister; var ref: treference; postfix: toppostfix; opsize: tcgsize);
  begin
    list.concat(setoppostfix(taicpu.op_reg_reg_ref(op,reg1,reg2,ref),postfix));
    updaterefafterloadstore(ref,tcgsize2size[opsize]*2);
  end;
{ turn a reference into a pre- or post-indexed reference for use in a
  load/store of a particular size; may replace the base register by a copy
  in a temporary register (reported via basereplaced) }
procedure makesimpleforcopy(list: TAsmList; var scaledop: tasmop; opsize: tcgsize; postfix: toppostfix; forcepostindexing: boolean; var ref: treference; var basereplaced: boolean);
  var
    tmpreg: tregister;
    scaledoffset: longint;
    orgaddressmode: taddressmode;
  begin
    { bytes transferred per operation (pairs transfer two registers) }
    scaledoffset:=tcgsize2size[opsize];
    if scaledop in [A_LDP,A_STP] then
      scaledoffset:=scaledoffset*2;
    { can we use the reference as post-indexed without changes? }
    if forcepostindexing then
      begin
        orgaddressmode:=ref.addressmode;
        ref.addressmode:=AM_POSTINDEXED;
        if (orgaddressmode=AM_POSTINDEXED) or
           ((ref.offset=0) and
            (simple_ref_type(scaledop,opsize,postfix,ref)=sr_simple)) then
          begin
            { just change the post-indexed offset to the access size }
            ref.offset:=scaledoffset;
            { and replace the base register if that didn't happen yet
              (could be sp or a regvar) }
            if not basereplaced then
              begin
                tmpreg:=getaddressregister(list);
                a_load_reg_reg(list,OS_ADDR,OS_ADDR,ref.base,tmpreg);
                ref.base:=tmpreg;
                basereplaced:=true;
              end;
            exit;
          end;
        ref.addressmode:=orgaddressmode;
      end;
{$ifdef dummy}
    This could in theory be useful in case you have a concatcopy from
    e.g. x1+255 to x1+267 *and* the reference is aligned, but this seems
    very unlikely. Disabled because it still needs fixes, as it
    also generates pre-indexed loads right now at the very end for the
    left-over gencopies
    { can we turn it into a pre-indexed reference for free? (after the
      first operation, it will be turned into an offset one) }
    if not forcepostindexing and
       (ref.offset<>0) then
      begin
        orgaddressmode:=ref.addressmode;
        ref.addressmode:=AM_PREINDEXED;
        tmpreg:=ref.base;
        if not basereplaced and
           (ref.base=tmpreg) then
          begin
            tmpreg:=getaddressregister(list);
            a_load_reg_reg(list,OS_ADDR,OS_ADDR,ref.base,tmpreg);
            ref.base:=tmpreg;
            basereplaced:=true;
          end;
        if simple_ref_type(scaledop,opsize,postfix,ref)<>sr_simple then
          make_simple_ref(list,scaledop,opsize,postfix,ref,NR_NO);
        exit;
      end;
{$endif dummy}
    if not forcepostindexing then
      begin
        ref.addressmode:=AM_OFFSET;
        make_simple_ref(list,scaledop,opsize,postfix,ref,NR_NO);
        { this may still cause problems if the final offset is no longer
          a simple ref; it's a bit complicated to pass all information
          through at all places and check that here, so play safe: we
          currently never generate unrolled copies for more than 64
          bytes (32 with non-double-register copies) }
        if ref.index=NR_NO then
          begin
            if ((scaledop in [A_LDP,A_STP]) and
                (ref.offset<((64-8)*tcgsize2size[opsize]))) or
               ((scaledop in [A_LDUR,A_STUR]) and
                (ref.offset<(255-8*tcgsize2size[opsize]))) or
               ((scaledop in [A_LDR,A_STR]) and
                (ref.offset<((4096-8)*tcgsize2size[opsize]))) then
              exit;
          end;
      end;
    { fall-back: materialise the full address in a temporary register and
      restart from offset 0 }
    tmpreg:=getaddressregister(list);
    a_loadaddr_ref_reg(list,ref,tmpreg);
    basereplaced:=true;
    if forcepostindexing then
      begin
        reference_reset_base(ref,tmpreg,scaledoffset,ref.temppos,ref.alignment,ref.volatility);
        ref.addressmode:=AM_POSTINDEXED;
      end
    else
      begin
        reference_reset_base(ref,tmpreg,0,ref.temppos,ref.alignment,ref.volatility);
        ref.addressmode:=AM_OFFSET;
      end
  end;
{ prepare a reference for use by gencopy. This is done both after the
  unrolled and regular copy loop -> get rid of post-indexing mode, make
  sure ref is valid.
  NOTE(review): the unscaledop parameter is currently unused — getmemop is
  called with scaledop for both candidates; verify whether this is
  intentional. }
procedure preparecopy(list: tasmlist; scaledop, unscaledop: tasmop; var ref: treference; opsize: tcgsize; postfix: toppostfix; out op: tasmop; var basereplaced: boolean);
  var
    simplify: boolean;
  begin
    if ref.addressmode=AM_POSTINDEXED then
      ref.offset:=tcgsize2size[opsize];
    getmemop(scaledop,scaledop,ref,ref,opsize,postfix,op,simplify);
    if simplify then
      begin
        makesimpleforcopy(list,scaledop,opsize,postfix,false,ref,basereplaced);
        op:=scaledop;
      end;
  end;
{ generate a copy from source to dest of size opsize/postfix: makes both
  references encodable, then loads into a temp register and stores }
procedure gencopy(list: TAsmList; var source, dest: treference; postfix: toppostfix; opsize: tcgsize);
  var
    reg: tregister;
    loadop, storeop: tasmop;
  begin
    preparecopy(list,A_LDR,A_LDUR,source,opsize,postfix,loadop,sourcebasereplaced);
    preparecopy(list,A_STR,A_STUR,dest,opsize,postfix,storeop,destbasereplaced);
    reg:=getintregister(list,opsize);
    genloadstore(list,loadop,reg,source,postfix,opsize);
    genloadstore(list,storeop,reg,dest,postfix,opsize);
  end;
{ copy the leftovers after an unrolled or regular copy loop (at most
  15 bytes: one 8-, 4-, 2- and 1-byte copy in descending order) }
procedure gencopyleftovers(list: TAsmList; var source, dest: treference; len: longint);
  begin
    { stop post-indexing if we did so in the loop, since in that case all
      offsets definitely can be represented now }
    if source.addressmode=AM_POSTINDEXED then
      begin
        source.addressmode:=AM_OFFSET;
        source.offset:=0;
      end;
    if dest.addressmode=AM_POSTINDEXED then
      begin
        dest.addressmode:=AM_OFFSET;
        dest.offset:=0;
      end;
    { transfer the leftovers }
    if len>=8 then
      begin
        dec(len,8);
        gencopy(list,source,dest,PF_NONE,OS_64);
      end;
    if len>=4 then
      begin
        dec(len,4);
        gencopy(list,source,dest,PF_NONE,OS_32);
      end;
    if len>=2 then
      begin
        dec(len,2);
        gencopy(list,source,dest,PF_H,OS_16);
      end;
    if len>=1 then
      begin
        dec(len);
        gencopy(list,source,dest,PF_B,OS_8);
      end;
  end;
  const
    { load_length + loop dec + cbnz }
    loopoverhead=12;
    { loop overhead + load + store }
    totallooplen=loopoverhead + 8;
  var
    totalalign: longint;
    maxlenunrolled: tcgint;
    loadop, storeop: tasmop;
    opsize: tcgsize;
    postfix: toppostfix;
    tmpsource, tmpdest: treference;
    scaledstoreop, unscaledstoreop,
    scaledloadop, unscaledloadop: tasmop;
    regs: array[1..8] of tregister;
    countreg: tregister;
    i, regcount: longint;
    hl: tasmlabel;
    simplifysource, simplifydest: boolean;
  begin
    if len=0 then
      exit;
    sourcebasereplaced:=false;
    destbasereplaced:=false;
    { maximum common alignment }
    totalalign:=max(1,newalignment(source.alignment,dest.alignment));
    { use a simple load/store? }
    if (len in [1,2,4,8]) and
       ((totalalign>=(len div 2)) or
        (source.alignment=len) or
        (dest.alignment=len)) then
      begin
        opsize:=int_cgsize(len);
        a_load_ref_ref(list,opsize,opsize,source,dest);
        exit;
      end;
    { alignment > length is not useful, and would break some checks below }
    while totalalign>len do
      totalalign:=totalalign div 2;
    { operation sizes to use based on common alignment }
    case totalalign of
      1:
        begin
          postfix:=PF_B;
          opsize:=OS_8;
        end;
      2:
        begin
          postfix:=PF_H;
          opsize:=OS_16;
        end;
      4:
        begin
          postfix:=PF_None;
          opsize:=OS_32;
        end
      else
        begin
          totalalign:=8;
          postfix:=PF_None;
          opsize:=OS_64;
        end;
    end;
    { maximum length to be handled with an unrolled loop (4 loads + 4 stores) }
    maxlenunrolled:=min(totalalign,8)*4;
    { ldp/stp -> 2 registers per instruction }
    if (totalalign>=4) and
       (len>=totalalign*2) then
      begin
        maxlenunrolled:=maxlenunrolled*2;
        scaledstoreop:=A_STP;
        scaledloadop:=A_LDP;
        unscaledstoreop:=A_NONE;
        unscaledloadop:=A_NONE;
      end
    else
      begin
        scaledstoreop:=A_STR;
        scaledloadop:=A_LDR;
        unscaledstoreop:=A_STUR;
        unscaledloadop:=A_LDUR;
      end;
    { we only need 4 instructions extra to call FPC_MOVE }
    if cs_opt_size in current_settings.optimizerswitches then
      maxlenunrolled:=maxlenunrolled div 2;
    if (len>maxlenunrolled) and
       (len>totalalign*8) then
      begin
        g_concatcopy_move(list,source,dest,len);
        exit;
      end;
    simplifysource:=true;
    simplifydest:=true;
    tmpsource:=source;
    tmpdest:=dest;
    { can we directly encode all offsets in an unrolled loop? }
    if len<=maxlenunrolled then
      begin
{$ifdef extdebug}
        list.concat(tai_comment.Create(strpnew('concatcopy unrolled loop; len/opsize/align: '+tostr(len)+'/'+tostr(tcgsize2size[opsize])+'/'+tostr(totalalign))));
{$endif extdebug}
        { the leftovers will be handled separately -> -(len mod opsize) }
        inc(tmpsource.offset,len-(len mod tcgsize2size[opsize]));
        { additionally, the last regular load/store will be at
          offset+len-opsize (if len-(len mod opsize)>len) }
        if tmpsource.offset>source.offset then
          dec(tmpsource.offset,tcgsize2size[opsize]);
        getmemop(scaledloadop,unscaledloadop,source,tmpsource,opsize,postfix,loadop,simplifysource);
        inc(tmpdest.offset,len-(len mod tcgsize2size[opsize]));
        if tmpdest.offset>dest.offset then
          dec(tmpdest.offset,tcgsize2size[opsize]);
        getmemop(scaledstoreop,unscaledstoreop,dest,tmpdest,opsize,postfix,storeop,simplifydest);
        tmpsource:=source;
        tmpdest:=dest;
        { if we can't directly encode all offsets, simplify }
        if simplifysource then
          begin
            loadop:=scaledloadop;
            makesimpleforcopy(list,loadop,opsize,postfix,false,tmpsource,sourcebasereplaced);
          end;
        if simplifydest then
          begin
            storeop:=scaledstoreop;
            makesimpleforcopy(list,storeop,opsize,postfix,false,tmpdest,destbasereplaced);
          end;
        regcount:=len div tcgsize2size[opsize];
        { in case we transfer two registers at a time, we copy an even
          number of registers }
        if loadop=A_LDP then
          regcount:=regcount and not(1);
        { initialise for dfa }
        regs[low(regs)]:=NR_NO;
        { max 4 loads/stores -> max 8 registers (in case of ldp/stp) }
        for i:=1 to regcount do
          regs[i]:=getintregister(list,opsize);
        if loadop=A_LDP then
          begin
            { load registers }
            for i:=1 to (regcount div 2) do
              gendualloadstore(list,loadop,regs[i*2-1],regs[i*2],tmpsource,postfix,opsize);
            { store registers }
            for i:=1 to (regcount div 2) do
              gendualloadstore(list,storeop,regs[i*2-1],regs[i*2],tmpdest,postfix,opsize);
          end
        else
          begin
            for i:=1 to regcount do
              genloadstore(list,loadop,regs[i],tmpsource,postfix,opsize);
            for i:=1 to regcount do
              genloadstore(list,storeop,regs[i],tmpdest,postfix,opsize);
          end;
        { leftover }
        len:=len-regcount*tcgsize2size[opsize];
{$ifdef extdebug}
        list.concat(tai_comment.Create(strpnew('concatcopy unrolled loop leftover: '+tostr(len))));
{$endif extdebug}
      end
    else
      begin
{$ifdef extdebug}
        list.concat(tai_comment.Create(strpnew('concatcopy regular loop; len/align: '+tostr(len)+'/'+tostr(totalalign))));
{$endif extdebug}
        { regular loop -> definitely use post-indexing }
        loadop:=scaledloadop;
        makesimpleforcopy(list,loadop,opsize,postfix,true,tmpsource,sourcebasereplaced);
        storeop:=scaledstoreop;
        makesimpleforcopy(list,storeop,opsize,postfix,true,tmpdest,destbasereplaced);
        current_asmdata.getjumplabel(hl);
        countreg:=getintregister(list,OS_32);
        { every iteration copies one operation's worth of data; with ldp/stp
          that is two registers, i.e. twice the operation size. Fixed: the
          previous "len div tcgsize2size[opsize]*2" parsed as
          (len div size)*2 iterations, copying far past the end of the
          buffer }
        if loadop=A_LDP then
          a_load_const_reg(list,OS_32,len div (tcgsize2size[opsize]*2),countreg)
        else
          a_load_const_reg(list,OS_32,len div tcgsize2size[opsize],countreg);
        a_label(list,hl);
        a_op_const_reg(list,OP_SUB,OS_32,1,countreg);
        if loadop=A_LDP then
          begin
            regs[1]:=getintregister(list,opsize);
            regs[2]:=getintregister(list,opsize);
            gendualloadstore(list,loadop,regs[1],regs[2],tmpsource,postfix,opsize);
            gendualloadstore(list,storeop,regs[1],regs[2],tmpdest,postfix,opsize);
          end
        else
          begin
            regs[1]:=getintregister(list,opsize);
            genloadstore(list,loadop,regs[1],tmpsource,postfix,opsize);
            genloadstore(list,storeop,regs[1],tmpdest,postfix,opsize);
          end;
        list.concat(taicpu.op_reg_sym_ofs(A_CBNZ,countreg,hl,0));
        { fixed accordingly: with ldp/stp the loop advances by twice the
          operation size per iteration, so the leftover is modulo that }
        if loadop=A_LDP then
          len:=len mod (tcgsize2size[opsize]*2)
        else
          len:=len mod tcgsize2size[opsize];
      end;
    gencopyleftovers(list,tmpsource,tmpdest,len);
  end;
  2268. procedure tcgaarch64.g_adjust_self_value(list:TAsmList;procdef: tprocdef;ioffset: tcgint);
  2269. begin
  2270. { This method is integrated into g_intf_wrapper and shouldn't be called separately }
  2271. InternalError(2013020102);
  2272. end;
  2273. procedure tcgaarch64.g_check_for_fpu_exception(list: TAsmList;force,clear : boolean);
  2274. var
  2275. r : TRegister;
  2276. ai: taicpu;
  2277. l1,l2: TAsmLabel;
  2278. begin
  2279. { so far, we assume all flavours of AArch64 need explicit floating point exception checking }
  2280. if ((cs_check_fpu_exceptions in current_settings.localswitches) and
  2281. (force or current_procinfo.FPUExceptionCheckNeeded)) then
  2282. begin
  2283. r:=getintregister(list,OS_INT);
  2284. list.concat(taicpu.op_reg_reg(A_MRS,r,NR_FPSR));
  2285. list.concat(taicpu.op_reg_const(A_TST,r,$1f));
  2286. current_asmdata.getjumplabel(l1);
  2287. current_asmdata.getjumplabel(l2);
  2288. ai:=taicpu.op_sym(A_B,l1);
  2289. ai.is_jmp:=true;
  2290. ai.condition:=C_NE;
  2291. list.concat(ai);
  2292. list.concat(taicpu.op_reg_const(A_TST,r,$80));
  2293. ai:=taicpu.op_sym(A_B,l2);
  2294. ai.is_jmp:=true;
  2295. ai.condition:=C_EQ;
  2296. list.concat(ai);
  2297. a_label(list,l1);
  2298. alloccpuregisters(list,R_INTREGISTER,paramanager.get_volatile_registers_int(pocall_default));
  2299. cg.a_call_name(list,'FPC_THROWFPUEXCEPTION',false);
  2300. dealloccpuregisters(list,R_INTREGISTER,paramanager.get_volatile_registers_int(pocall_default));
  2301. a_label(list,l2);
  2302. if clear then
  2303. current_procinfo.FPUExceptionCheckNeeded:=false;
  2304. end;
  2305. end;
  2306. procedure tcgaarch64.g_profilecode(list : TAsmList);
  2307. begin
  2308. if target_info.system = system_aarch64_linux then
  2309. begin
  2310. list.concat(taicpu.op_reg_reg(A_MOV,NR_X0,NR_X30));
  2311. a_call_name(list,'_mcount',false);
  2312. end
  2313. else
  2314. internalerror(2020021901);
  2315. end;
  2316. procedure create_codegen;
  2317. begin
  2318. cg:=tcgaarch64.Create;
  2319. cg128:=tcg128.Create;
  2320. end;
  2321. end.