{
    Copyright (c) 2014 by Jonas Maebe

    This unit implements the code generator for AArch64

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

 ****************************************************************************
}
unit cgcpu;

{$i fpcdefs.inc}

interface

uses
  globtype,parabase,
  cgbase,cgutils,cgobj,
  aasmbase,aasmtai,aasmdata,aasmcpu,
  cpubase,cpuinfo,
  node,symconst,SymType,symdef,
  rgcpu;

type
  tcgaarch64=class(tcg)
   protected
    { changes register size without adding register allocation info }
    function makeregsize(reg: tregister; size: tcgsize): tregister; overload;
   public
    { simplifies "ref" so it can be used with "op". If "ref" can be used
      with a different load/store operation that has the same meaning as
      the original one, "op" will be replaced with the alternative }
    procedure make_simple_ref(list:TAsmList; var op: tasmop; size: tcgsize; oppostfix: toppostfix; var ref: treference; preferred_newbasereg: tregister);
    function getfpuregister(list: TAsmList; size: Tcgsize): Tregister; override;
    procedure handle_reg_imm12_reg(list: TAsmList; op: Tasmop; size: tcgsize; src: tregister; a: tcgint; dst: tregister; tmpreg: tregister; setflags, usedest: boolean);
    procedure init_register_allocators;override;
    procedure done_register_allocators;override;
    function getmmregister(list:TAsmList;size:tcgsize):tregister;override;
    function handle_load_store(list:TAsmList; op: tasmop; size: tcgsize; oppostfix: toppostfix; reg: tregister; ref: treference):treference;
    procedure a_call_name(list:TAsmList;const s:string; weak: boolean);override;
    procedure a_call_reg(list:TAsmList;Reg:tregister);override;
    { General purpose instructions }
    procedure maybeadjustresult(list: TAsmList; op: topcg; size: tcgsize; dst: tregister);
    procedure a_op_const_reg(list: TAsmList; op: topcg; size: tcgsize; a: tcgint; reg: tregister);override;
    procedure a_op_reg_reg(list: TAsmList; op: topcg; size: tcgsize; src, dst: tregister);override;
    procedure a_op_const_reg_reg(list: TAsmList; op: topcg; size: tcgsize; a: tcgint; src, dst: tregister);override;
    procedure a_op_reg_reg_reg(list: TAsmList; op: topcg; size: tcgsize; src1, src2, dst: tregister);override;
    procedure a_op_const_reg_reg_checkoverflow(list: TAsmList; op: topcg; size: tcgsize; a: tcgint; src, dst: tregister; setflags : boolean; var ovloc : tlocation);override;
    procedure a_op_reg_reg_reg_checkoverflow(list: TAsmList; op: topcg; size: tcgsize; src1, src2, dst: tregister; setflags : boolean; var ovloc : tlocation);override;
    { move instructions }
    procedure a_load_const_reg(list: TAsmList; size: tcgsize; a: tcgint; reg: tregister);override;
    procedure a_load_const_ref(list: TAsmList; size: tcgsize; a: tcgint; const ref: treference); override;
    procedure a_load_reg_ref(list: TAsmList; fromsize, tosize: tcgsize; reg: tregister;const ref: TReference);override;
    procedure a_load_reg_ref_unaligned(list: TAsmList; fromsize, tosize: tcgsize; register: tregister; const ref: treference); override;
    procedure a_load_ref_reg(list: TAsmList; fromsize, tosize: tcgsize; const ref: TReference; reg: tregister);override;
    procedure a_load_ref_reg_unaligned(list: TAsmList; fromsize, tosize: tcgsize; const ref: treference; register: tregister); override;
    procedure a_load_reg_reg(list: TAsmList; fromsize, tosize: tcgsize; reg1, reg2: tregister);override;
    procedure a_loadaddr_ref_reg(list: TAsmList; const ref: TReference; r: tregister);override;
    { fpu move instructions (not used, all floating point is vector unit-based) }
    procedure a_loadfpu_reg_reg(list: TAsmList; fromsize, tosize: tcgsize; reg1, reg2: tregister); override;
    procedure a_loadfpu_ref_reg(list: TAsmList; fromsize, tosize: tcgsize; const ref: treference; reg: tregister); override;
    procedure a_loadfpu_reg_ref(list: TAsmList; fromsize, tosize: tcgsize; reg: tregister; const ref: treference); override;
    procedure a_loadmm_reg_reg(list: TAsmList; fromsize, tosize: tcgsize; reg1, reg2: tregister;shuffle : pmmshuffle);override;
    procedure a_loadmm_ref_reg(list: TAsmList; fromsize, tosize: tcgsize; const ref: TReference; reg: tregister; shuffle: pmmshuffle);override;
    procedure a_loadmm_reg_ref(list: TAsmList; fromsize, tosize: tcgsize; reg: tregister; const ref: TReference; shuffle: pmmshuffle);override;
    procedure a_loadmm_intreg_reg(list: TAsmList; fromsize, tosize: tcgsize; intreg, mmreg: tregister; shuffle: pmmshuffle); override;
    procedure a_loadmm_reg_intreg(list: TAsmList; fromsize, tosize: tcgsize; mmreg, intreg: tregister; shuffle: pmmshuffle); override;
    procedure a_opmm_reg_reg(list: TAsmList; Op: TOpCG; size: tcgsize; src, dst: tregister; shuffle: pmmshuffle); override;
    procedure a_bit_scan_reg_reg(list: TAsmList; reverse: boolean; srcsize, dstsize: tcgsize; src, dst: TRegister); override;
    { comparison operations }
    procedure a_cmp_const_reg_label(list: TAsmList; size: tcgsize; cmp_op: topcmp; a: tcgint; reg: tregister; l: tasmlabel);override;
    procedure a_cmp_reg_reg_label(list: TAsmList; size: tcgsize; cmp_op: topcmp; reg1, reg2: tregister; l: tasmlabel);override;
    procedure a_jmp_always(list: TAsmList; l: TAsmLabel);override;
    procedure a_jmp_name(list: TAsmList; const s: string);override;
    procedure a_jmp_cond(list: TAsmList; cond: TOpCmp; l: tasmlabel);{ override;}
    procedure a_jmp_flags(list: TAsmList; const f: tresflags; l: tasmlabel);override;
    procedure g_flags2reg(list: TAsmList; size: tcgsize; const f:tresflags; reg: tregister);override;
    procedure g_overflowcheck(list: TAsmList; const loc: tlocation; def: tdef);override;
    procedure g_overflowcheck_loc(list: TAsmList; const loc: tlocation; def: tdef; ovloc: tlocation);override;
    procedure g_proc_entry(list: TAsmList; localsize: longint; nostackframe: boolean);override;
    procedure g_proc_exit(list: TAsmList; parasize: longint; nostackframe: boolean);override;
    procedure g_maybe_got_init(list: TAsmList); override;
    procedure g_restore_registers(list: TAsmList);override;
    procedure g_save_registers(list: TAsmList);override;
    procedure g_concatcopy_move(list: TAsmList; const source, dest: treference; len: tcgint);
    procedure g_concatcopy(list: TAsmList; const source, dest: treference; len: tcgint);override;
    procedure g_adjust_self_value(list: TAsmList; procdef: tprocdef; ioffset: tcgint);override;
    procedure g_check_for_fpu_exception(list: TAsmList; force, clear: boolean);override;
   private
    function save_regs(list: TAsmList; rt: tregistertype; lowsr, highsr: tsuperregister; sub: tsubregister): longint;
    procedure load_regs(list: TAsmList; rt: tregistertype; lowsr, highsr: tsuperregister; sub: tsubregister);
  end;

procedure create_codegen;

const
  TOpCG2AsmOpReg: array[topcg] of TAsmOp = (
    A_NONE,A_MOV,A_ADD,A_AND,A_UDIV,A_SDIV,A_MUL,A_MUL,A_NEG,A_MVN,A_ORR,A_ASRV,A_LSLV,A_LSRV,A_SUB,A_EOR,A_NONE,A_RORV
  );

  TOpCG2AsmOpImm: array[topcg] of TAsmOp = (
    A_NONE,A_MOV,A_ADD,A_AND,A_UDIV,A_SDIV,A_MUL,A_MUL,A_NEG,A_MVN,A_ORR,A_ASR,A_LSL,A_LSR,A_SUB,A_EOR,A_NONE,A_ROR
  );

  TOpCmp2AsmCond: array[topcmp] of TAsmCond = (C_NONE,C_EQ,C_GT,
    C_LT,C_GE,C_LE,C_NE,C_LS,C_CC,C_CS,C_HI
  );
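
{ note (added for clarity): the two opcode tables above differ only in
  their shift entries: with a register shift count AArch64 uses the
  ASRV/LSLV/LSRV/RORV opcodes, while the immediate forms use the
  ASR/LSL/LSR/ROR aliases. OP_ROL maps to A_NONE in both tables because
  the architecture only provides a rotate-right; rotate-left is
  synthesised in the a_op_* routines below }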
implementation

uses
  globals,verbose,systems,cutils,
  paramgr,fmodule,
  symtable,symsym,
  tgobj,
  procinfo,cpupi;
procedure tcgaarch64.make_simple_ref(list:TAsmList; var op: tasmop; size: tcgsize; oppostfix: toppostfix; var ref: treference; preferred_newbasereg: tregister);
  var
    href: treference;
    so: tshifterop;
    accesssize: longint;
  begin
    if (ref.base=NR_NO) then
      begin
        if ref.shiftmode<>SM_None then
          internalerror(2014110701);
        ref.base:=ref.index;
        ref.index:=NR_NO;
      end;
    { no arbitrary scale factor support (the generic code doesn't set it,
      AArch64-specific code shouldn't either) }
    if not(ref.scalefactor in [0,1]) then
      internalerror(2014111002);
    case simple_ref_type(op,size,oppostfix,ref) of
      sr_simple:
        exit;
      sr_internal_illegal:
        internalerror(2014121702);
      sr_complex:
        { continue } ;
    end;
    if assigned(ref.symbol) then
      begin
        { internal "load symbol" instructions should already be valid }
        if assigned(ref.symboldata) or
           (ref.refaddr in [addr_pic,addr_gotpage,addr_gotpageoffset,addr_page,addr_pageoffset]) then
          internalerror(2014110802);
        { no relative symbol support (needed) yet }
        if assigned(ref.relsymbol) then
          internalerror(2014111001);
        { loading a symbol address (whether it's in the GOT or not) consists
          of two parts: first load the page on which it is located, then
          either the offset in the page or load the value at that offset in
          the page. This final GOT-load can be relaxed by the linker in case
          the variable itself can be stored directly in the GOT }
        if (preferred_newbasereg=NR_NO) or
           (ref.base=preferred_newbasereg) or
           (ref.index=preferred_newbasereg) then
          preferred_newbasereg:=getaddressregister(list);
        { load the (GOT) page }
        reference_reset_symbol(href,ref.symbol,0,8,[]);
        if ((ref.symbol.typ in [AT_FUNCTION,AT_LABEL]) and
            (ref.symbol.bind in [AB_LOCAL,AB_GLOBAL])) or
           ((ref.symbol.typ=AT_DATA) and
            (ref.symbol.bind=AB_LOCAL)) then
          href.refaddr:=addr_page
        else
          href.refaddr:=addr_gotpage;
        list.concat(taicpu.op_reg_ref(A_ADRP,preferred_newbasereg,href));
        { load the GOT entry (= address of the variable) }
        reference_reset_base(href,preferred_newbasereg,0,ctempposinvalid,sizeof(pint),[]);
        href.symbol:=ref.symbol;
        { code symbols defined in the current compilation unit do not
          have to be accessed via the GOT }
        if ((ref.symbol.typ in [AT_FUNCTION,AT_LABEL]) and
            (ref.symbol.bind in [AB_LOCAL,AB_GLOBAL])) or
           ((ref.symbol.typ=AT_DATA) and
            (ref.symbol.bind=AB_LOCAL)) then
          begin
            href.base:=NR_NO;
            href.refaddr:=addr_pageoffset;
            list.concat(taicpu.op_reg_reg_ref(A_ADD,preferred_newbasereg,preferred_newbasereg,href));
          end
        else
          begin
            href.refaddr:=addr_gotpageoffset;
            { use a_load_ref_reg() rather than directly encoding the LDR,
              so that we'll check the validity of the reference }
            a_load_ref_reg(list,OS_ADDR,OS_ADDR,href,preferred_newbasereg);
          end;
        { set as new base register }
        if ref.base=NR_NO then
          ref.base:=preferred_newbasereg
        else if ref.index=NR_NO then
          ref.index:=preferred_newbasereg
        else
          begin
            { make sure it's valid in case ref.base is SP -> make it
              the second operand }
            a_op_reg_reg_reg(list,OP_ADD,OS_ADDR,preferred_newbasereg,ref.base,preferred_newbasereg);
            ref.base:=preferred_newbasereg
          end;
        ref.symbol:=nil;
      end;
    { base & index }
    if (ref.base<>NR_NO) and
       (ref.index<>NR_NO) then
      begin
        case op of
          A_LDR, A_STR:
            begin
              if (ref.shiftmode=SM_None) and
                 (ref.shiftimm<>0) then
                internalerror(2014110805);
              { wrong shift? (possible in case of something like
                array_of_2byte_rec[x].bytefield -> shift will be set to 1,
                but the final load is 1 byte -> can't use the shift after
                all) }
              if (ref.shiftmode in [SM_LSL,SM_UXTW,SM_SXTW]) and
                 ((ref.shiftimm<>BsfDWord(tcgsizep2size[size])) or
                  (ref.offset<>0)) then
                begin
                  if preferred_newbasereg=NR_NO then
                    preferred_newbasereg:=getaddressregister(list);
                  { "add" supports a superset of the shift modes supported by
                    load/store instructions }
                  shifterop_reset(so);
                  so.shiftmode:=ref.shiftmode;
                  so.shiftimm:=ref.shiftimm;
                  list.concat(taicpu.op_reg_reg_reg_shifterop(A_ADD,preferred_newbasereg,ref.base,ref.index,so));
                  reference_reset_base(ref,preferred_newbasereg,ref.offset,ref.temppos,ref.alignment,ref.volatility);
                  { possibly still an invalid offset -> fall through }
                end
              else if ref.offset<>0 then
                begin
                  if (preferred_newbasereg=NR_NO) or
                     { we keep ref.index, so it must not be overwritten }
                     (ref.index=preferred_newbasereg) then
                    preferred_newbasereg:=getaddressregister(list);
                  { add to the base and not to the index, because the index
                    may be scaled; this works even if the base is SP }
                  a_op_const_reg_reg(list,OP_ADD,OS_ADDR,ref.offset,ref.base,preferred_newbasereg);
                  ref.offset:=0;
                  ref.base:=preferred_newbasereg;
                  { finished }
                  exit;
                end
              else
                { valid -> exit }
                exit;
            end;
          { todo }
          A_LD1,A_LD2,A_LD3,A_LD4,
          A_ST1,A_ST2,A_ST3,A_ST4:
            internalerror(2014110704);
          { these don't support base+index }
          A_LDUR,A_STUR,
          A_LDP,A_STP:
            begin
              { these either don't support pre-/post-indexing, or don't
                support it with base+index }
              if ref.addressmode<>AM_OFFSET then
                internalerror(2014110911);
              if preferred_newbasereg=NR_NO then
                preferred_newbasereg:=getaddressregister(list);
              if ref.shiftmode<>SM_None then
                begin
                  { "add" supports a superset of the shift modes supported by
                    load/store instructions }
                  shifterop_reset(so);
                  so.shiftmode:=ref.shiftmode;
                  so.shiftimm:=ref.shiftimm;
                  list.concat(taicpu.op_reg_reg_reg_shifterop(A_ADD,preferred_newbasereg,ref.base,ref.index,so));
                end
              else
                a_op_reg_reg_reg(list,OP_ADD,OS_ADDR,ref.index,ref.base,preferred_newbasereg);
              reference_reset_base(ref,preferred_newbasereg,ref.offset,ref.temppos,ref.alignment,ref.volatility);
              { fall through to the handling of base + offset, since the
                offset may still be too big }
            end;
          else
            internalerror(2014110901);
        end;
      end;
    { base + offset }
    if ref.base<>NR_NO then
      begin
        { valid offset for LDUR/STUR -> use that }
        if (ref.addressmode=AM_OFFSET) and
           (op in [A_LDR,A_STR]) and
           (ref.offset>=-256) and
           (ref.offset<=255) then
          begin
            if op=A_LDR then
              op:=A_LDUR
            else
              op:=A_STUR
          end
        { if it's not a valid LDUR/STUR, use LDR/STR }
        else if (op in [A_LDUR,A_STUR]) and
           ((ref.offset<-256) or
            (ref.offset>255) or
            (ref.addressmode<>AM_OFFSET)) then
          begin
            if op=A_LDUR then
              op:=A_LDR
            else
              op:=A_STR
          end;
        case op of
          A_LDR,A_STR:
            begin
              case ref.addressmode of
                AM_PREINDEXED:
                  begin
                    { since the loaded/stored register cannot be the same
                      as the base register, we can safely add the
                      offset to the base if it doesn't fit }
                    if (ref.offset<-256) or
                       (ref.offset>255) then
                      begin
                        a_op_const_reg(list,OP_ADD,OS_ADDR,ref.offset,ref.base);
                        ref.offset:=0;
                      end;
                  end;
                AM_POSTINDEXED:
                  begin
                    { cannot emulate post-indexing if we have to fold the
                      offset into the base register }
                    if (ref.offset<-256) or
                       (ref.offset>255) then
                      internalerror(2014110909);
                    { ok }
                  end;
                AM_OFFSET:
                  begin
                    { unsupported offset -> fold into base register }
                    accesssize:=1 shl tcgsizep2size[size];
                    if (ref.offset<0) or
                       (ref.offset>(((1 shl 12)-1)*accesssize)) or
                       ((ref.offset mod accesssize)<>0) then
                      begin
                        if preferred_newbasereg=NR_NO then
                          preferred_newbasereg:=getaddressregister(list);
                        { can we split the offset between an
                          "add/sub (imm12 shl 12)" and the load (also an
                          imm12)?
                          -- the offset from the load will always be added,
                          that's why the lower bound has a smaller range
                          than the upper bound; it must also be a multiple
                          of the access size }
                        if (ref.offset>=-(((1 shl 12)-1) shl 12)) and
                           (ref.offset<=((1 shl 12)-1) shl 12 + ((1 shl 12)-1)) and
                           ((ref.offset mod accesssize)=0) then
                          begin
                            a_op_const_reg_reg(list,OP_ADD,OS_ADDR,(ref.offset shr 12) shl 12,ref.base,preferred_newbasereg);
                            ref.offset:=ref.offset-(ref.offset shr 12) shl 12;
                          end
                        else
                          begin
                            a_op_const_reg_reg(list,OP_ADD,OS_ADDR,ref.offset,ref.base,preferred_newbasereg);
                            ref.offset:=0;
                          end;
                        reference_reset_base(ref,preferred_newbasereg,ref.offset,ref.temppos,ref.alignment,ref.volatility);
                      end;
                  end
              end;
            end;
          A_LDP,A_STP:
            begin
              { unsupported offset -> fold into base register (these
                instructions support all addressmodes) }
              if (ref.offset<-(1 shl (6+tcgsizep2size[size]))) or
                 (ref.offset>(1 shl (6+tcgsizep2size[size]))-1) then
                begin
                  case ref.addressmode of
                    AM_POSTINDEXED:
                      { don't emulate post-indexing if we have to fold the
                        offset into the base register }
                      internalerror(2014110910);
                    AM_PREINDEXED:
                      { this means the offset must be added to the current
                        base register }
                      preferred_newbasereg:=ref.base;
                    AM_OFFSET:
                      if preferred_newbasereg=NR_NO then
                        preferred_newbasereg:=getaddressregister(list);
                  end;
                  a_op_const_reg_reg(list,OP_ADD,OS_ADDR,ref.offset,ref.base,preferred_newbasereg);
                  reference_reset_base(ref,preferred_newbasereg,0,ref.temppos,ref.alignment,ref.volatility);
                end
            end;
          A_LDUR,A_STUR:
            begin
              { valid, checked above }
            end;
          { todo }
          A_LD1,A_LD2,A_LD3,A_LD4,
          A_ST1,A_ST2,A_ST3,A_ST4:
            internalerror(2014110908);
          else
            internalerror(2014110708);
        end;
        { done }
        exit;
      end;
    { only an offset -> change to base (+ offset 0) }
    if preferred_newbasereg=NR_NO then
      preferred_newbasereg:=getaddressregister(list);
    a_load_const_reg(list,OS_ADDR,ref.offset,preferred_newbasereg);
    reference_reset_base(ref,preferred_newbasereg,0,ref.temppos,newalignment(8,ref.offset),ref.volatility);
  end;
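
{ worked example (added, hypothetical registers): for a byte-sized LDR
  from "x1 + x2 lsl #1 + 8", the scaled index cannot be kept (a 1 byte
  access requires shift 0), so make_simple_ref folds base and index
  into a temporary register first, roughly

      add  x3, x1, x2, lsl #1
      ldrb w0, [x3, #8]

  and the remaining offset is then re-checked against the encoding
  limits of the chosen load/store opcode }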
function tcgaarch64.makeregsize(reg: tregister; size: tcgsize): tregister;
  var
    subreg: Tsubregister;
  begin
    subreg:=cgsize2subreg(getregtype(reg),size);
    result:=reg;
    setsubreg(result,subreg);
  end;


function tcgaarch64.getfpuregister(list: TAsmList; size: Tcgsize): Tregister;
  begin
    internalerror(2014122110);
    { squash warning }
    result:=NR_NO;
  end;


function tcgaarch64.handle_load_store(list: TAsmList; op: tasmop; size: tcgsize; oppostfix: toppostfix; reg: tregister; ref: treference):treference;
  begin
    make_simple_ref(list,op,size,oppostfix,ref,NR_NO);
    list.concat(setoppostfix(taicpu.op_reg_ref(op,reg,ref),oppostfix));
    result:=ref;
  end;
procedure tcgaarch64.handle_reg_imm12_reg(list: TAsmList; op: Tasmop; size: tcgsize; src: tregister; a: tcgint; dst: tregister; tmpreg: tregister; setflags, usedest: boolean);
  var
    instr: taicpu;
    so: tshifterop;
    hadtmpreg: boolean;
  begin
    { imm12 }
    if (a>=0) and
       (a<=((1 shl 12)-1)) then
      if usedest then
        instr:=taicpu.op_reg_reg_const(op,dst,src,a)
      else
        instr:=taicpu.op_reg_const(op,src,a)
    { imm12 lsl 12 }
    else if (a and not(((tcgint(1) shl 12)-1) shl 12))=0 then
      begin
        so.shiftmode:=SM_LSL;
        so.shiftimm:=12;
        if usedest then
          instr:=taicpu.op_reg_reg_const_shifterop(op,dst,src,a shr 12,so)
        else
          instr:=taicpu.op_reg_const_shifterop(op,src,a shr 12,so)
      end
    else
      begin
        { todo: other possible optimizations (e.g. load 16 bit constant in
          register and then add/sub/cmp/cmn shifted the rest) }
        if tmpreg=NR_NO then
          begin
            hadtmpreg:=false;
            tmpreg:=getintregister(list,size);
          end
        else
          begin
            hadtmpreg:=true;
            getcpuregister(list,tmpreg);
          end;
        a_load_const_reg(list,size,a,tmpreg);
        if usedest then
          instr:=taicpu.op_reg_reg_reg(op,dst,src,tmpreg)
        else
          instr:=taicpu.op_reg_reg(op,src,tmpreg);
        if hadtmpreg then
          ungetcpuregister(list,tmpreg);
      end;
    if setflags then
      setoppostfix(instr,PF_S);
    list.concat(instr);
  end;
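
{ example of the three cases above (added): AArch64 arithmetic
  immediates are a 12 bit unsigned value, optionally shifted left by
  12 bits. "add dst,src,#3" is thus encoded directly, #$5000 uses the
  "imm12 lsl 12" form, and a value like #$12345 fits neither pattern,
  so it is first materialised in a temporary register via
  a_load_const_reg and combined with a register-register operation }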
{****************************************************************************
                              Assembler code
 ****************************************************************************}

procedure tcgaarch64.init_register_allocators;
  begin
    inherited init_register_allocators;

    rg[R_INTREGISTER]:=trgintcpu.create(R_INTREGISTER,R_SUBWHOLE,
      [RS_X0,RS_X1,RS_X2,RS_X3,RS_X4,RS_X5,RS_X6,RS_X7,RS_X8,
       RS_X9,RS_X10,RS_X11,RS_X12,RS_X13,RS_X14,RS_X15,RS_X16,RS_X17,
       RS_X19,RS_X20,RS_X21,RS_X22,RS_X23,RS_X24,RS_X25,RS_X26,RS_X27,RS_X28
       { maybe we can enable this in the future for leaf functions (it's
         the frame pointer)
       ,RS_X29 }],
      first_int_imreg,[]);
    rg[R_MMREGISTER]:=trgcpu.create(R_MMREGISTER,R_SUBMMD,
      [RS_Q0,RS_Q1,RS_Q2,RS_Q3,RS_Q4,RS_Q5,RS_Q6,RS_Q7,
       RS_Q8,RS_Q9,RS_Q10,RS_Q11,RS_Q12,RS_Q13,RS_Q14,RS_Q15,
       RS_Q16,RS_Q17,RS_Q18,RS_Q19,RS_Q20,RS_Q21,RS_Q22,RS_Q23,
       RS_Q24,RS_Q25,RS_Q26,RS_Q27,RS_Q28,RS_Q29,RS_Q30,RS_Q31],
      first_mm_imreg,[]);
  end;
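
{ note (added): X18 is deliberately absent from the integer allocation
  order above because several AArch64 ABIs reserve it as the platform
  register, and X30 is absent because it is the link register; X29 is
  the frame pointer, as the comment in the list itself notes }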
procedure tcgaarch64.done_register_allocators;
  begin
    rg[R_INTREGISTER].free;
    rg[R_FPUREGISTER].free;
    rg[R_MMREGISTER].free;
    inherited done_register_allocators;
  end;


function tcgaarch64.getmmregister(list: TAsmList; size: tcgsize):tregister;
  begin
    case size of
      OS_F32:
        result:=rg[R_MMREGISTER].getregister(list,R_SUBMMS);
      OS_F64:
        result:=rg[R_MMREGISTER].getregister(list,R_SUBMMD)
      else
        internalerror(2014102701);
    end;
  end;


procedure tcgaarch64.a_call_name(list: TAsmList; const s: string; weak: boolean);
  begin
    if not weak then
      list.concat(taicpu.op_sym(A_BL,current_asmdata.RefAsmSymbol(s,AT_FUNCTION)))
    else
      list.concat(taicpu.op_sym(A_BL,current_asmdata.WeakRefAsmSymbol(s,AT_FUNCTION)));
  end;


procedure tcgaarch64.a_call_reg(list:TAsmList;Reg:tregister);
  begin
    list.concat(taicpu.op_reg(A_BLR,reg));
  end;


{********************** load instructions ********************}
procedure tcgaarch64.a_load_const_reg(list: TAsmList; size: tcgsize; a: tcgint; reg : tregister);
  var
    preva: tcgint;
    opc: tasmop;
    shift,maxshift: byte;
    so: tshifterop;
    reginited: boolean;
    mask: tcgint;
  begin
    { if we load a value into a 32 bit register, it is automatically
      zero-extended to 64 bit }
    if (hi(a)=0) and
       (size in [OS_64,OS_S64]) then
      begin
        size:=OS_32;
        reg:=makeregsize(reg,size);
      end;
    { values <= 32 bit are stored in a 32 bit register }
    if not(size in [OS_64,OS_S64]) then
      a:=cardinal(a);

    if size in [OS_64,OS_S64] then
      begin
        mask:=-1;
        maxshift:=64;
      end
    else
      begin
        mask:=$ffffffff;
        maxshift:=32;
      end;
    { single movn enough? (to be extended) }
    shift:=16;
    preva:=a;
    repeat
      if (a shr shift)=(mask shr shift) then
        begin
          if shift=16 then
            list.concat(taicpu.op_reg_const(A_MOVN,reg,not(word(preva))))
          else
            begin
              shifterop_reset(so);
              so.shiftmode:=SM_LSL;
              so.shiftimm:=shift-16;
              list.concat(taicpu.op_reg_const_shifterop(A_MOVN,reg,not(word(preva)),so));
            end;
          exit;
        end;
      { only try the next 16 bits if the current one is all 1 bits, since
        the movn will set all lower bits to 1 }
      if word(a shr (shift-16))<>$ffff then
        break;
      inc(shift,16);
    until shift=maxshift;

    reginited:=false;
    shift:=0;
    { can be optimized later to use more movn }
    repeat
      { leftover is shifterconst? (don't check if we can represent it just
        as effectively with movz/movk, as this check is expensive) }
      if ((shift<tcgsize2size[size]*(8 div 2)) and
          (word(a)<>0) and
          ((a shr 16)<>0)) and
         is_shifter_const(a shl shift,size) then
        begin
          if reginited then
            list.concat(taicpu.op_reg_reg_const(A_ORR,reg,reg,a shl shift))
          else
            list.concat(taicpu.op_reg_reg_const(A_ORR,reg,makeregsize(NR_XZR,size),a shl shift));
          exit;
        end;
      { set all 16 bit parts <> 0 }
      if (word(a)<>0) or
         ((shift=0) and
          (a=0)) then
        if shift=0 then
          begin
            list.concat(taicpu.op_reg_const(A_MOVZ,reg,word(a)));
            reginited:=true;
          end
        else
          begin
            shifterop_reset(so);
            so.shiftmode:=SM_LSL;
            so.shiftimm:=shift;
            if not reginited then
              begin
                opc:=A_MOVZ;
                reginited:=true;
              end
            else
              opc:=A_MOVK;
            list.concat(taicpu.op_reg_const_shifterop(opc,reg,word(a),so));
          end;
      preva:=a;
      a:=a shr 16;
      inc(shift,16);
    until word(preva)=preva;
    if not reginited then
      internalerror(2014102702);
  end;
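
{ illustration of the constant-loading strategy above (added,
  hypothetical register):
    $12345678          -> movz w0,#$5678
                          movk w0,#$1234,lsl #16
                          (writing w0 zero-extends into x0)
    $ffffffffffffff0f  -> movn x0,#$00f0
                          (movn sets all bits above the moved halfword
                           to 1)
    $00ff00ff00ff00ff  -> orr x0,xzr,#$00ff00ff00ff00ff
                          (a repeating bit pattern accepted by
                           is_shifter_const) }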
procedure tcgaarch64.a_load_const_ref(list: TAsmList; size: tcgsize; a: tcgint; const ref: treference);
  var
    reg: tregister;
  begin
    { use the zero register if possible }
    if a=0 then
      begin
        if size in [OS_64,OS_S64] then
          reg:=NR_XZR
        else
          reg:=NR_WZR;
        a_load_reg_ref(list,size,size,reg,ref);
      end
    else
      inherited;
  end;


procedure tcgaarch64.a_load_reg_ref(list: TAsmList; fromsize, tosize: tcgsize; reg: tregister; const ref: treference);
  var
    oppostfix: toppostfix;
    hreg: tregister;
  begin
    if tcgsize2Size[fromsize]>=tcgsize2Size[tosize] then
      begin
        fromsize:=tosize;
        reg:=makeregsize(list,reg,fromsize);
      end
    { have a 32 bit register but need a 64 bit one? }
    else if tosize in [OS_64,OS_S64] then
      begin
        { sign extend if necessary }
        if fromsize in [OS_S8,OS_S16,OS_S32] then
          begin
            { can't overwrite reg, may be a constant reg }
            hreg:=getintregister(list,tosize);
            a_load_reg_reg(list,fromsize,tosize,reg,hreg);
            reg:=hreg;
          end
        else
          { top 32 bit are zero by default }
          reg:=makeregsize(reg,OS_64);
        fromsize:=tosize;
      end;
    if (ref.alignment<>0) and
       (ref.alignment<tcgsize2size[tosize]) then
      begin
        a_load_reg_ref_unaligned(list,fromsize,tosize,reg,ref);
      end
    else
      begin
        case tosize of
          { signed integer registers }
          OS_8,
          OS_S8:
            oppostfix:=PF_B;
          OS_16,
          OS_S16:
            oppostfix:=PF_H;
          OS_32,
          OS_S32,
          OS_64,
          OS_S64:
            oppostfix:=PF_None;
          else
            InternalError(200308299);
        end;
        handle_load_store(list,A_STR,tosize,oppostfix,reg,ref);
      end;
  end;
procedure tcgaarch64.a_load_ref_reg(list: TAsmList; fromsize, tosize: tcgsize; const ref: treference; reg: tregister);
  var
    oppostfix: toppostfix;
  begin
    if tcgsize2Size[fromsize]>=tcgsize2Size[tosize] then
      fromsize:=tosize;
    { ensure that all bits of the 32/64 register are always correctly set:
      * default behaviour is always to zero-extend to the entire (64 bit)
        register -> unsigned 8/16/32 bit loads only exist with a 32 bit
        target register, as the upper 32 bit will be zeroed implicitly
        -> always make target register 32 bit
      * signed loads exist both with 32 and 64 bit target registers,
        depending on whether the value should be sign extended to 32 or
        to 64 bit (if sign extended to 32 bit, the upper 32 bits of the
        corresponding 64 bit register are again zeroed) -> no need to
        change anything (we only have 32 and 64 bit registers), except that
        when loading an OS_S32 to a 32 bit register, we don't need/can't
        use sign extension
    }
    if fromsize in [OS_8,OS_16,OS_32] then
      reg:=makeregsize(reg,OS_32);
    if (ref.alignment<>0) and
       (ref.alignment<tcgsize2size[fromsize]) then
      begin
        a_load_ref_reg_unaligned(list,fromsize,tosize,ref,reg);
        exit;
      end;
    case fromsize of
      { signed integer registers }
      OS_8:
        oppostfix:=PF_B;
      OS_S8:
        oppostfix:=PF_SB;
      OS_16:
        oppostfix:=PF_H;
      OS_S16:
        oppostfix:=PF_SH;
      OS_S32:
        if getsubreg(reg)=R_SUBD then
          oppostfix:=PF_NONE
        else
          oppostfix:=PF_SW;
      OS_32,
      OS_64,
      OS_S64:
        oppostfix:=PF_None;
      else
        InternalError(200308297);
    end;
    handle_load_store(list,A_LDR,fromsize,oppostfix,reg,ref);

    { clear upper 16 bits if the value was negative }
    if (fromsize=OS_S8) and (tosize=OS_16) then
      a_load_reg_reg(list,fromsize,tosize,reg,reg);
  end;
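
{ note (added): the postfixes chosen above select the load variant,
  i.e. PF_B -> ldrb, PF_SB -> ldrsb, PF_H -> ldrh, PF_SH -> ldrsh,
  PF_SW -> ldrsw and PF_None -> plain ldr; together with the 32/64 bit
  target subregister this fully determines the zero- or sign-extension
  behaviour of the emitted instruction }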
procedure tcgaarch64.a_load_ref_reg_unaligned(list: TAsmList; fromsize, tosize: tcgsize; const ref: treference; register: tregister);
  var
    href: treference;
    hreg1, hreg2, tmpreg, tmpreg2: tregister;
    i: Integer;
  begin
    case fromsize of
      OS_64,OS_S64:
        begin
          { split into two 32 bit loads }
          hreg1:=getintregister(list,OS_32);
          hreg2:=getintregister(list,OS_32);
          if target_info.endian=endian_big then
            begin
              tmpreg:=hreg1;
              hreg1:=hreg2;
              hreg2:=tmpreg;
            end;
          { can we use LDP? }
          if (ref.alignment=4) and
             (simple_ref_type(A_LDP,OS_32,PF_None,ref)=sr_simple) then
            list.concat(taicpu.op_reg_reg_ref(A_LDP,hreg1,hreg2,ref))
          else
            begin
              a_load_ref_reg(list,OS_32,OS_32,ref,hreg1);
              href:=ref;
              inc(href.offset,4);
              a_load_ref_reg(list,OS_32,OS_32,href,hreg2);
            end;
          a_load_reg_reg(list,OS_32,OS_64,hreg1,register);
          list.concat(taicpu.op_reg_reg_const_const(A_BFI,register,makeregsize(hreg2,OS_64),32,32));
        end;
      OS_16,OS_S16,
      OS_32,OS_S32:
        begin
          if ref.alignment=2 then
            begin
              href:=ref;
              if target_info.endian=endian_big then
                inc(href.offset,tcgsize2size[fromsize]-2);
              tmpreg:=getintregister(list,OS_32);
              a_load_ref_reg(list,OS_16,OS_32,href,tmpreg);
              tmpreg2:=getintregister(list,OS_32);
              for i:=1 to (tcgsize2size[fromsize]-1) div 2 do
                begin
                  if target_info.endian=endian_big then
                    dec(href.offset,2)
                  else
                    inc(href.offset,2);
                  a_load_ref_reg(list,OS_16,OS_32,href,tmpreg2);
                  list.concat(taicpu.op_reg_reg_const_const(A_BFI,tmpreg,tmpreg2,i*16,16));
                end;
              a_load_reg_reg(list,fromsize,tosize,tmpreg,register);
            end
          else
            begin
              href:=ref;
              if target_info.endian=endian_big then
                inc(href.offset,tcgsize2size[fromsize]-1);
              tmpreg:=getintregister(list,OS_32);
              a_load_ref_reg(list,OS_8,OS_32,href,tmpreg);
              tmpreg2:=getintregister(list,OS_32);
              for i:=1 to tcgsize2size[fromsize]-1 do
                begin
                  if target_info.endian=endian_big then
                    dec(href.offset)
                  else
                    inc(href.offset);
                  a_load_ref_reg(list,OS_8,OS_32,href,tmpreg2);
                  list.concat(taicpu.op_reg_reg_const_const(A_BFI,tmpreg,tmpreg2,i*8,8));
                end;
              a_load_reg_reg(list,fromsize,tosize,tmpreg,register);
            end;
        end;
      else
        inherited;
    end;
  end;
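
{ illustration of the byte-merging loop above (added, hypothetical
  registers): a 4 byte little-endian load from a byte-aligned ref
  comes out as roughly

      ldrb w1, [base]
      ldrb w2, [base, #1]
      bfi  w1, w2, #8, #8
      ldrb w2, [base, #2]
      bfi  w1, w2, #16, #8
      ldrb w2, [base, #3]
      bfi  w1, w2, #24, #8

  BFI inserts the low bits of the source at the given bit position, so
  the value is assembled without any alignment requirement }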
procedure tcgaarch64.a_load_reg_reg(list:TAsmList;fromsize,tosize:tcgsize;reg1,reg2:tregister);
  var
    instr: taicpu;
  begin
    { we use both 32 and 64 bit registers -> insert conversion when we
      have to truncate/sign extend inside the (32 or 64 bit) register
      holding the value, and when we sign extend from a 32 to a 64 bit
      register }
    if (tcgsize2size[fromsize]>tcgsize2size[tosize]) or
       ((tcgsize2size[fromsize]=tcgsize2size[tosize]) and
        (fromsize<>tosize) and
        not(fromsize in [OS_32,OS_S32,OS_64,OS_S64])) or
       ((fromsize in [OS_S8,OS_S16,OS_S32]) and
        (tosize in [OS_64,OS_S64])) or
       { needs to mask out the sign in the top 16 bits }
       ((fromsize=OS_S8) and
        (tosize=OS_16)) then
      begin
        case tosize of
          OS_8:
            list.concat(setoppostfix(taicpu.op_reg_reg(A_UXT,reg2,makeregsize(reg1,OS_32)),PF_B));
          OS_16:
            list.concat(setoppostfix(taicpu.op_reg_reg(A_UXT,reg2,makeregsize(reg1,OS_32)),PF_H));
          OS_S8:
            list.concat(setoppostfix(taicpu.op_reg_reg(A_SXT,reg2,makeregsize(reg1,OS_32)),PF_B));
          OS_S16:
            list.concat(setoppostfix(taicpu.op_reg_reg(A_SXT,reg2,makeregsize(reg1,OS_32)),PF_H));
          { while "mov wN, wM" automatically inserts a zero-extension and
            hence we could encode a 64->32 bit move like that, the problem
            is that we then can't distinguish 64->32 from 32->32 moves, and
            the 64->32 truncation could be removed altogether... So use a
            different instruction }
          OS_32,
          OS_S32:
            { in theory, reg1 should be 64 bit here (since fromsize>tosize),
              but because of the way location_force_register() tries to
              avoid superfluous zero/sign extensions, it's not always the
              case -> also force reg1 to 64 bit }
            list.concat(taicpu.op_reg_reg_const_const(A_UBFIZ,makeregsize(reg2,OS_64),makeregsize(reg1,OS_64),0,32));
          OS_64,
          OS_S64:
            list.concat(setoppostfix(taicpu.op_reg_reg(A_SXT,reg2,makeregsize(reg1,OS_32)),PF_W));
          else
            internalerror(2002090901);
        end;
      end
    else
      begin
        { 32 -> 32 bit move implies zero extension (sign extensions have
          been handled above) -> also use for 32 <-> 64 bit moves }
        if not(fromsize in [OS_64,OS_S64]) or
           not(tosize in [OS_64,OS_S64]) then
          instr:=taicpu.op_reg_reg(A_MOV,makeregsize(reg2,OS_32),makeregsize(reg1,OS_32))
        else
          instr:=taicpu.op_reg_reg(A_MOV,reg2,reg1);
        list.Concat(instr);
        { Notify the register allocator that we have written a move
          instruction so it can try to eliminate it. }
        add_move_instruction(instr);
      end;
  end;
procedure tcgaarch64.a_loadaddr_ref_reg(list: TAsmList; const ref: treference; r: tregister);
  var
    href: treference;
    so: tshifterop;
    op: tasmop;
  begin
    op:=A_LDR;
    href:=ref;
    { simplify as if we're going to perform a regular 64 bit load, using
      "r" as the new base register if possible/necessary }
    make_simple_ref(list,op,OS_ADDR,PF_None,href,r);
    { load literal? }
    if assigned(href.symbol) then
      begin
        if (href.base<>NR_NO) or
           (href.index<>NR_NO) or
           not assigned(href.symboldata) then
          internalerror(2014110912);
        list.concat(taicpu.op_reg_sym_ofs(A_ADR,r,href.symbol,href.offset));
      end
    else
      begin
        if href.index<>NR_NO then
          begin
            if href.shiftmode<>SM_None then
              begin
                { "add" supports a superset of the shift modes supported by
                  load/store instructions }
                shifterop_reset(so);
                so.shiftmode:=href.shiftmode;
                so.shiftimm:=href.shiftimm;
                list.concat(taicpu.op_reg_reg_reg_shifterop(A_ADD,r,href.base,href.index,so));
              end
            else
              a_op_reg_reg_reg(list,OP_ADD,OS_ADDR,href.index,href.base,r);
          end
        else if href.offset<>0 then
          a_op_const_reg_reg(list,OP_ADD,OS_ADDR,href.offset,href.base,r)
        else
          a_load_reg_reg(list,OS_ADDR,OS_ADDR,href.base,r);
      end;
  end;
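
{ illustration (added): for a global symbol the sequence produced via
  make_simple_ref is typically an ADRP of the symbol's 4KB page plus
  either an ADD of the page offset (locally defined symbols) or an LDR
  from the GOT entry (imported symbols), e.g. (exact relocation syntax
  depends on the target assembler)

      adrp x0, sym
      add  x0, x0, :lo12:sym }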
procedure tcgaarch64.a_loadfpu_reg_reg(list: TAsmList; fromsize, tosize: tcgsize; reg1, reg2: tregister);
  begin
    internalerror(2014122107)
  end;


procedure tcgaarch64.a_loadfpu_ref_reg(list: TAsmList; fromsize, tosize: tcgsize; const ref: treference; reg: tregister);
  begin
    internalerror(2014122108)
  end;


procedure tcgaarch64.a_loadfpu_reg_ref(list: TAsmList; fromsize, tosize: tcgsize; reg: tregister; const ref: treference);
  begin
    internalerror(2014122109)
  end;


procedure tcgaarch64.a_loadmm_reg_reg(list: TAsmList; fromsize, tosize: tcgsize; reg1, reg2: tregister; shuffle: pmmshuffle);
  var
    instr: taicpu;
  begin
    if assigned(shuffle) and
       not shufflescalar(shuffle) then
      internalerror(2014122104);
    if fromsize=tosize then
      begin
        instr:=taicpu.op_reg_reg(A_FMOV,reg2,reg1);
        { Notify the register allocator that we have written a move
          instruction so it can try to eliminate it. }
        add_move_instruction(instr);
        { FMOV cannot generate a floating point exception }
      end
    else
      begin
        if (reg_cgsize(reg1)<>fromsize) or
           (reg_cgsize(reg2)<>tosize) then
          internalerror(2014110913);
        instr:=taicpu.op_reg_reg(A_FCVT,reg2,reg1);
        maybe_check_for_fpu_exception(list);
      end;
    list.Concat(instr);
  end;


procedure tcgaarch64.a_loadmm_ref_reg(list: TAsmList; fromsize, tosize: tcgsize; const ref: treference; reg: tregister; shuffle: pmmshuffle);
  var
    tmpreg: tregister;
  begin
    if assigned(shuffle) and
       not shufflescalar(shuffle) then
      internalerror(2014122105);
    tmpreg:=NR_NO;
    if (fromsize<>tosize) then
      begin
        tmpreg:=reg;
        reg:=getmmregister(list,fromsize);
      end;
    handle_load_store(list,A_LDR,fromsize,PF_None,reg,ref);
    if (fromsize<>tosize) then
      a_loadmm_reg_reg(list,fromsize,tosize,reg,tmpreg,nil);
  end;


procedure tcgaarch64.a_loadmm_reg_ref(list: TAsmList; fromsize, tosize: tcgsize; reg: tregister; const ref: treference; shuffle: pmmshuffle);
  var
    tmpreg: tregister;
  begin
    if assigned(shuffle) and
       not shufflescalar(shuffle) then
      internalerror(2014122106);
    if (fromsize<>tosize) then
      begin
        tmpreg:=getmmregister(list,tosize);
        a_loadmm_reg_reg(list,fromsize,tosize,reg,tmpreg,nil);
        reg:=tmpreg;
      end;
    handle_load_store(list,A_STR,tosize,PF_NONE,reg,ref);
  end;
procedure tcgaarch64.a_loadmm_intreg_reg(list: TAsmList; fromsize, tosize: tcgsize; intreg, mmreg: tregister; shuffle: pmmshuffle);
  begin
    if not shufflescalar(shuffle) then
      internalerror(2014122801);
    if not(tcgsize2size[fromsize] in [4,8]) or
       (tcgsize2size[fromsize]<>tcgsize2size[tosize]) then
      internalerror(2014122803);
    list.concat(taicpu.op_reg_reg(A_INS,mmreg,intreg));
  end;


procedure tcgaarch64.a_loadmm_reg_intreg(list: TAsmList; fromsize, tosize: tcgsize; mmreg, intreg: tregister; shuffle: pmmshuffle);
  var
    r: tregister;
  begin
    if not shufflescalar(shuffle) then
      internalerror(2014122802);
    if not(tcgsize2size[fromsize] in [4,8]) or
       (tcgsize2size[fromsize]>tcgsize2size[tosize]) then
      internalerror(2014122804);
    if tcgsize2size[fromsize]<tcgsize2size[tosize] then
      r:=makeregsize(intreg,fromsize)
    else
      r:=intreg;
    list.concat(taicpu.op_reg_reg(A_UMOV,r,mmreg));
  end;


procedure tcgaarch64.a_opmm_reg_reg(list: TAsmList; Op: TOpCG; size: tcgsize; src, dst: tregister; shuffle: pmmshuffle);
  begin
    case op of
      { "xor Vx,Vx" is used to initialize global regvars to 0 }
      OP_XOR:
        begin
          if (src<>dst) or
             (reg_cgsize(src)<>size) or
             assigned(shuffle) then
            internalerror(2015011401);
          case size of
            OS_F32,
            OS_F64:
              list.concat(taicpu.op_reg_const(A_MOVI,makeregsize(dst,OS_F64),0));
            else
              internalerror(2015011402);
          end;
        end
      else
        internalerror(2015011403);
    end;
  end;
procedure tcgaarch64.a_bit_scan_reg_reg(list: TAsmList; reverse: boolean; srcsize, dstsize: tcgsize; src, dst: TRegister);
  var
    bitsize: longint;
  begin
    if srcsize in [OS_64,OS_S64] then
      begin
        bitsize:=64;
      end
    else
      begin
        bitsize:=32;
      end;
    { source is 0 -> dst will have to become 255 }
    list.concat(taicpu.op_reg_const(A_CMP,src,0));
    if reverse then
      begin
        list.Concat(taicpu.op_reg_reg(A_CLZ,makeregsize(dst,srcsize),src));
        { xor 31/63 is the same as setting the lower 5/6 bits to
          "31/63-(lower 5/6 bits of dst)" }
        list.Concat(taicpu.op_reg_reg_const(A_EOR,dst,dst,bitsize-1));
      end
    else
      begin
        list.Concat(taicpu.op_reg_reg(A_RBIT,makeregsize(dst,srcsize),src));
        list.Concat(taicpu.op_reg_reg(A_CLZ,dst,dst));
      end;
    { set dst to -1 if src was 0 }
    list.Concat(taicpu.op_reg_reg_reg_cond(A_CSINV,dst,dst,makeregsize(NR_XZR,dstsize),C_NE));
    { mask the -1 to 255 if src was 0 (anyone find a two-instruction
      branch-free version? All of mine are 3...) }
    list.Concat(setoppostfix(taicpu.op_reg_reg(A_UXT,makeregsize(dst,OS_32),makeregsize(dst,OS_32)),PF_B));
  end;
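
{ illustration of the forward scan case above (added, hypothetical
  registers):

      cmp   w1, #0
      rbit  w0, w1            // lowest set bit becomes highest
      clz   w0, w0            // so clz now counts trailing zeros
      csinv w0, w0, wzr, ne   // w0 := not(wzr) = -1 if w1 was 0
      uxtb  w0, w0            // ... and mask that -1 down to 255

  the rbit+clz pair is needed because the base AArch64 instruction set
  has a count-leading-zeros but no count-trailing-zeros instruction }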
procedure tcgaarch64.a_load_reg_ref_unaligned(list: TAsmList; fromsize, tosize: tcgsize; register: tregister; const ref: treference);
  var
    href: treference;
    hreg1, hreg2, tmpreg: tregister;
  begin
    if fromsize in [OS_64,OS_S64] then
      begin
        { split into two 32 bit stores }
        hreg1:=getintregister(list,OS_32);
        hreg2:=getintregister(list,OS_32);
        a_load_reg_reg(list,OS_32,OS_32,makeregsize(register,OS_32),hreg1);
        a_op_const_reg_reg(list,OP_SHR,OS_64,32,register,makeregsize(hreg2,OS_64));
        if target_info.endian=endian_big then
          begin
            tmpreg:=hreg1;
            hreg1:=hreg2;
            hreg2:=tmpreg;
          end;
        { can we use STP? }
        if (ref.alignment=4) and
           (simple_ref_type(A_STP,OS_32,PF_None,ref)=sr_simple) then
          list.concat(taicpu.op_reg_reg_ref(A_STP,hreg1,hreg2,ref))
        else
          begin
            a_load_reg_ref(list,OS_32,OS_32,hreg1,ref);
            href:=ref;
            inc(href.offset,4);
            a_load_reg_ref(list,OS_32,OS_32,hreg2,href);
          end;
      end
    else
      inherited;
  end;


procedure tcgaarch64.maybeadjustresult(list: TAsmList; op: topcg; size: tcgsize; dst: tregister);
  const
    overflowops = [OP_MUL,OP_IMUL,OP_SHL,OP_ADD,OP_SUB,OP_NOT,OP_NEG];
  begin
    if (op in overflowops) and
       (size in [OS_8,OS_S8,OS_16,OS_S16]) then
      a_load_reg_reg(list,OS_32,size,makeregsize(dst,OS_32),makeregsize(dst,OS_32))
  end;
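
{ example (added): an OP_ADD on OS_8 operands is carried out in a full
  32 bit register, so 200+100 leaves $12C there; the a_load_reg_reg
  call above re-extends the low byte ($2C) so that later code can rely
  on the register contents matching the declared 8/16 bit size }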
procedure tcgaarch64.a_op_const_reg(list: TAsmList; op: topcg; size: tcgsize; a: tcgint; reg: tregister);
  begin
    optimize_op_const(size,op,a);
    case op of
      OP_NONE:
        exit;
      OP_MOVE:
        a_load_const_reg(list,size,a,reg);
      OP_NEG,OP_NOT:
        internalerror(200306011);
      else
        a_op_const_reg_reg(list,op,size,a,reg,reg);
    end;
  end;


procedure tcgaarch64.a_op_reg_reg(list:TAsmList;op:topcg;size:tcgsize;src,dst:tregister);
  begin
    Case op of
      OP_NEG,
      OP_NOT:
        begin
          list.concat(taicpu.op_reg_reg(TOpCG2AsmOpReg[op],dst,src));
          maybeadjustresult(list,op,size,dst);
        end
      else
        a_op_reg_reg_reg(list,op,size,src,dst,dst);
    end;
  end;


procedure tcgaarch64.a_op_const_reg_reg(list: TAsmList; op: topcg; size: tcgsize; a: tcgint; src, dst: tregister);
  var
    l: tlocation;
  begin
    a_op_const_reg_reg_checkoverflow(list,op,size,a,src,dst,false,l);
  end;


procedure tcgaarch64.a_op_reg_reg_reg(list: TAsmList; op: topcg; size: tcgsize; src1, src2, dst: tregister);
  var
    hreg: tregister;
  begin
    { no ROLV opcode... }
    if op=OP_ROL then
      begin
        case size of
          OS_32,OS_S32,
          OS_64,OS_S64:
            begin
              hreg:=getintregister(list,size);
              a_load_const_reg(list,size,tcgsize2size[size]*8,hreg);
              a_op_reg_reg(list,OP_SUB,size,src1,hreg);
              a_op_reg_reg_reg(list,OP_ROR,size,hreg,src2,dst);
              exit;
            end;
          else
            internalerror(2014111005);
        end;
      end
    else if (op=OP_ROR) and
       not(size in [OS_32,OS_S32,OS_64,OS_S64]) then
      internalerror(2014111006);
    if TOpCG2AsmOpReg[op]=A_NONE then
      internalerror(2014111007);
    list.concat(taicpu.op_reg_reg_reg(TOpCG2AsmOpReg[op],dst,src2,src1));
    maybeadjustresult(list,op,size,dst);
  end;
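
{ illustration of the OP_ROL synthesis above (added, hypothetical
  registers): since there is no ROLV, rotating a 32 bit value left by
  a count in a register becomes roughly

      mov  w3, #32
      sub  w3, w3, w1         // 32 - rotate count
      rorv w0, w2, w3         // ror by (32-n) == rol by n

  where the final instruction is TOpCG2AsmOpReg[OP_ROR], i.e. RORV }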
    procedure tcgaarch64.a_op_const_reg_reg_checkoverflow(list: TAsmList; op: topcg; size: tcgsize; a: tcgint; src, dst: tregister; setflags : boolean; var ovloc : tlocation);
      var
        shiftcountmask: longint;
        constreg: tregister;
      begin
        { add/sub instructions have only positive immediate operands }
        if (op in [OP_ADD,OP_SUB]) and
           (a<0) then
          begin
            if op=OP_ADD then
              op:=OP_SUB
            else
              op:=OP_ADD;
            { avoid range/overflow error in case a = low(tcgint) }
{$push}{$r-}{$q-}
            a:=-a;
{$pop}
          end;
        ovloc.loc:=LOC_VOID;
        optimize_op_const(size,op,a);
        case op of
          OP_NONE:
            begin
              a_load_reg_reg(list,size,size,src,dst);
              exit;
            end;
          OP_MOVE:
            begin
              a_load_const_reg(list,size,a,dst);
              exit;
            end;
          else
            ;
        end;
        case op of
          OP_ADD,
          OP_SUB:
            begin
              handle_reg_imm12_reg(list,TOpCG2AsmOpImm[op],size,src,a,dst,NR_NO,setflags,true);
              { on a 64 bit target, overflows with smaller data types
                are handled via range errors }
              if setflags and
                 (size in [OS_64,OS_S64]) then
                begin
                  location_reset(ovloc,LOC_FLAGS,OS_8);
                  if size=OS_64 then
                    if op=OP_ADD then
                      ovloc.resflags:=F_CS
                    else
                      ovloc.resflags:=F_CC
                  else
                    ovloc.resflags:=F_VS;
                end;
            end;
          OP_OR,
          OP_AND,
          OP_XOR:
            begin
              if not(size in [OS_64,OS_S64]) then
                a:=cardinal(a);
              if is_shifter_const(a,size) then
                list.concat(taicpu.op_reg_reg_const(TOpCG2AsmOpReg[op],dst,src,a))
              else
                begin
                  constreg:=getintregister(list,size);
                  a_load_const_reg(list,size,a,constreg);
                  a_op_reg_reg_reg(list,op,size,constreg,src,dst);
                end;
            end;
          OP_SHL,
          OP_SHR,
          OP_SAR:
            begin
              if size in [OS_64,OS_S64] then
                shiftcountmask:=63
              else
                shiftcountmask:=31;
              if (a and shiftcountmask)<>0 then
                list.concat(taicpu.op_reg_reg_const(
                  TOpCG2AsmOpImm[op],dst,src,a and shiftcountmask))
              else
                a_load_reg_reg(list,size,size,src,dst);
              if (a and not(tcgint(shiftcountmask)))<>0 then
                internalerror(2014112101);
            end;
          OP_ROL,
          OP_ROR:
            begin
              case size of
                OS_32,OS_S32:
                  if (a and not(tcgint(31)))<>0 then
                    internalerror(2014112102);
                OS_64,OS_S64:
                  if (a and not(tcgint(63)))<>0 then
                    internalerror(2014112103);
                else
                  internalerror(2014112104);
              end;
              { there's only a ror opcode }
              if op=OP_ROL then
                a:=(tcgsize2size[size]*8)-a;
              list.concat(taicpu.op_reg_reg_const(A_ROR,dst,src,a));
            end;
          OP_MUL,
          OP_IMUL,
          OP_DIV,
          OP_IDIV:
            begin
              constreg:=getintregister(list,size);
              a_load_const_reg(list,size,a,constreg);
              a_op_reg_reg_reg_checkoverflow(list,op,size,constreg,src,dst,setflags,ovloc);
            end;
          else
            internalerror(2014111403);
        end;
        maybeadjustresult(list,op,size,dst);
      end;
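
    { register/register version with optional overflow checking: add/sub use
      the flag-setting instruction forms, while (i)mul compare the upper
      64 bits of the 128 bit product (umulh/smulh) against the value they
      must have if no overflow occurred }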
    procedure tcgaarch64.a_op_reg_reg_reg_checkoverflow(list: TAsmList; op: topcg; size: tcgsize; src1, src2, dst: tregister; setflags : boolean; var ovloc : tlocation);
      var
        tmpreg1, tmpreg2: tregister;
      begin
        ovloc.loc:=LOC_VOID;
        { overflow can only occur with 64 bit calculations on 64 bit cpus }
        if setflags and
           (size in [OS_64,OS_S64]) then
          begin
            case op of
              OP_ADD,
              OP_SUB:
                begin
                  list.concat(setoppostfix(taicpu.op_reg_reg_reg(TOpCG2AsmOpReg[op],dst,src2,src1),PF_S));
                  ovloc.loc:=LOC_FLAGS;
                  if size=OS_64 then
                    if op=OP_ADD then
                      ovloc.resflags:=F_CS
                    else
                      ovloc.resflags:=F_CC
                  else
                    ovloc.resflags:=F_VS;
                  { finished }
                  exit;
                end;
              OP_MUL:
                begin
                  { check whether the upper 64 bit of the 128 bit product is 0 }
                  tmpreg1:=getintregister(list,OS_64);
                  list.concat(taicpu.op_reg_reg_reg(A_UMULH,tmpreg1,src2,src1));
                  list.concat(taicpu.op_reg_const(A_CMP,tmpreg1,0));
                  ovloc.loc:=LOC_FLAGS;
                  ovloc.resflags:=F_NE;
                  { still have to perform the actual multiplication }
                end;
              OP_IMUL:
                begin
                  { check whether the upper 64 bits of the 128 bit multiplication
                    result have the same value as the replicated sign bit of the
                    lower 64 bits }
                  tmpreg1:=getintregister(list,OS_64);
                  list.concat(taicpu.op_reg_reg_reg(A_SMULH,tmpreg1,src2,src1));
                  { calculate lower 64 bits (afterwards, because dst may be
                    equal to src1 or src2) }
                  a_op_reg_reg_reg(list,op,size,src1,src2,dst);
                  { replicate sign bit }
                  tmpreg2:=getintregister(list,OS_64);
                  a_op_const_reg_reg(list,OP_SAR,OS_S64,63,dst,tmpreg2);
                  list.concat(taicpu.op_reg_reg(A_CMP,tmpreg1,tmpreg2));
                  ovloc.loc:=LOC_FLAGS;
                  ovloc.resflags:=F_NE;
                  { finished }
                  exit;
                end;
              OP_IDIV,
              OP_DIV:
                begin
                  { not handled here; needs a div-by-zero check (dividing by
                    zero just gives a 0 result on aarch64) and a
                    low(int64) div -1 overflow check }
                  internalerror(2014122101);
                end;
              else
                internalerror(2019050936);
            end;
          end;
        a_op_reg_reg_reg(list,op,size,src1,src2,dst);
      end;

    { *************** compare instructions **************** }
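
    { negative constants are compared via CMN (compare negative), so the
      immediate passed on is always positive; handle_reg_imm12_reg takes
      care of encoding it }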
    procedure tcgaarch64.a_cmp_const_reg_label(list: TAsmList; size: tcgsize; cmp_op: topcmp; a: tcgint; reg: tregister; l: tasmlabel);
      var
        op: tasmop;
      begin
        if a>=0 then
          op:=A_CMP
        else
          op:=A_CMN;
        { avoid range/overflow error in case a=low(tcgint) }
{$push}{$r-}{$q-}
        handle_reg_imm12_reg(list,op,size,reg,abs(a),NR_XZR,NR_NO,false,false);
{$pop}
        a_jmp_cond(list,cmp_op,l);
      end;

    procedure tcgaarch64.a_cmp_reg_reg_label(list: TAsmList; size: tcgsize; cmp_op: topcmp; reg1,reg2: tregister; l: tasmlabel);
      begin
        list.concat(taicpu.op_reg_reg(A_CMP,reg2,reg1));
        a_jmp_cond(list,cmp_op,l);
      end;

    procedure tcgaarch64.a_jmp_always(list: TAsmList; l: TAsmLabel);
      var
        ai: taicpu;
      begin
        ai:=TAiCpu.op_sym(A_B,current_asmdata.RefAsmSymbol(l.name,AT_FUNCTION));
        ai.is_jmp:=true;
        list.Concat(ai);
      end;

    procedure tcgaarch64.a_jmp_name(list: TAsmList; const s: string);
      var
        ai: taicpu;
      begin
        ai:=TAiCpu.op_sym(A_B,current_asmdata.RefAsmSymbol(s,AT_FUNCTION));
        ai.is_jmp:=true;
        list.Concat(ai);
      end;

    procedure tcgaarch64.a_jmp_cond(list: TAsmList; cond: TOpCmp; l: TAsmLabel);
      var
        ai: taicpu;
      begin
        ai:=TAiCpu.op_sym(A_B,l);
        ai.is_jmp:=true;
        ai.SetCondition(TOpCmp2AsmCond[cond]);
        list.Concat(ai);
      end;

    procedure tcgaarch64.a_jmp_flags(list: TAsmList; const f: tresflags; l: tasmlabel);
      var
        ai : taicpu;
      begin
        ai:=Taicpu.op_sym(A_B,l);
        ai.is_jmp:=true;
        ai.SetCondition(flags_to_cond(f));
        list.Concat(ai);
      end;

    procedure tcgaarch64.g_flags2reg(list: TAsmList; size: tcgsize; const f: tresflags; reg: tregister);
      begin
        list.concat(taicpu.op_reg_cond(A_CSET,reg,flags_to_cond(f)));
      end;

    procedure tcgaarch64.g_overflowcheck(list: TAsmList; const loc: tlocation; def: tdef);
      begin
        { we need an explicit overflow location, because there are many
          possibilities (not just the overflow flag, which is only used for
          signed add/sub) }
        internalerror(2014112303);
      end;

    procedure tcgaarch64.g_overflowcheck_loc(list: TAsmList; const loc: tlocation; def: tdef; ovloc : tlocation);
      var
        hl : tasmlabel;
        hflags : tresflags;
      begin
        if not(cs_check_overflow in current_settings.localswitches) then
          exit;
        current_asmdata.getjumplabel(hl);
        case ovloc.loc of
          LOC_FLAGS:
            begin
              hflags:=ovloc.resflags;
              inverse_flags(hflags);
              cg.a_jmp_flags(list,hflags,hl);
            end;
          else
            internalerror(2014112304);
        end;
        a_call_name(list,'FPC_OVERFLOW',false);
        a_label(list,hl);
      end;

    { *********** entry/exit code and address loading ************ }
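
    { stores all used callee-saved registers of the given type pairwise with
      "stp reg1, reg2, [sp, #-16]!" and returns the number of bytes pushed;
      an odd leftover register is stored twice so that sp stays 16-byte
      aligned }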
    function tcgaarch64.save_regs(list: TAsmList; rt: tregistertype; lowsr, highsr: tsuperregister; sub: tsubregister): longint;
      var
        ref: treference;
        sr: tsuperregister;
        pairreg: tregister;
      begin
        result:=0;
        reference_reset_base(ref,NR_SP,-16,ctempposinvalid,16,[]);
        ref.addressmode:=AM_PREINDEXED;
        pairreg:=NR_NO;
        { store all used registers pairwise }
        for sr:=lowsr to highsr do
          if sr in rg[rt].used_in_proc then
            if pairreg=NR_NO then
              pairreg:=newreg(rt,sr,sub)
            else
              begin
                inc(result,16);
                list.concat(taicpu.op_reg_reg_ref(A_STP,pairreg,newreg(rt,sr,sub),ref));
                pairreg:=NR_NO
              end;
        { one left -> store twice (stack must be 16 bytes aligned) }
        if pairreg<>NR_NO then
          begin
            list.concat(taicpu.op_reg_reg_ref(A_STP,pairreg,pairreg,ref));
            inc(result,16);
          end;
      end;
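
    { symbol-table callback used by g_proc_entry: rebases SP-relative
      parameters and locals to the frame pointer once the final stack frame
      size (passed via arg) is known }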
    procedure FixupOffsets(p:TObject;arg:pointer);
      var
        sym: tabstractnormalvarsym absolute p;
      begin
        if (tsym(p).typ in [paravarsym,localvarsym]) and
           (sym.localloc.loc=LOC_REFERENCE) and
           (sym.localloc.reference.base=NR_STACK_POINTER_REG) then
          begin
            sym.localloc.reference.base:=NR_FRAME_POINTER_REG;
            dec(sym.localloc.reference.offset,PLongint(arg)^);
          end;
      end;
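
    { prologue: save fp/lr with a pre-indexed stp, set up the frame pointer,
      save the modified callee-saved integer and vector registers, and
      allocate the (16-byte aligned) local stack space }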
    procedure tcgaarch64.g_proc_entry(list: TAsmList; localsize: longint; nostackframe: boolean);
      var
        ref: treference;
        totalstackframesize: longint;
      begin
        if nostackframe then
          exit;
        { stack pointer has to be aligned to 16 bytes at all times }
        localsize:=align(localsize,16);
        { save stack pointer and return address }
        reference_reset_base(ref,NR_SP,-16,ctempposinvalid,16,[]);
        ref.addressmode:=AM_PREINDEXED;
        list.concat(taicpu.op_reg_reg_ref(A_STP,NR_FP,NR_LR,ref));
        { initialise frame pointer }
        a_load_reg_reg(list,OS_ADDR,OS_ADDR,NR_SP,NR_FP);
        totalstackframesize:=localsize;
        { save modified integer registers }
        inc(totalstackframesize,
          save_regs(list,R_INTREGISTER,RS_X19,RS_X28,R_SUBWHOLE));
        { only the lower 64 bits of the modified vector registers need to be
          saved; if the caller needs the upper 64 bits, it has to save them
          itself }
        inc(totalstackframesize,
          save_regs(list,R_MMREGISTER,RS_D8,RS_D15,R_SUBMMD));
        { allocate stack space }
        if localsize<>0 then
          begin
            localsize:=align(localsize,16);
            current_procinfo.final_localsize:=localsize;
            handle_reg_imm12_reg(list,A_SUB,OS_ADDR,NR_SP,localsize,NR_SP,NR_IP0,false,true);
          end;
        { By default, we use the frame pointer to access parameters passed via
          the stack and the stack pointer to address local variables and temps,
          because
           a) we can use bigger positive than negative offsets (so accessing
              locals via negative offsets from the frame pointer would be less
              efficient)
           b) we don't know the local size while generating the code, so
              accessing the parameters via the stack pointer is not possible
              without copying them
          The problem with this is the get_frame() intrinsic:
           a) it must return the same value as what we pass as the parentfp
              parameter, since that's how it's used in the TP-style objects unit
           b) its return value must be usable to access all local data from a
              routine (locals and parameters), since it's all that the nested
              routines have access to
           c) its return value must be usable to construct a backtrace, as it's
              also used by the exception handling routines
          The solution we use here, based on something similar that's done in
          the MIPS port, is to generate all accesses to locals in the routine
          itself SP-relative, and then after the code is generated and the
          local size is known (namely, here), we change all SP-relative
          variables/parameters into FP-relative ones. This means that they'll
          be accessed less efficiently from nested routines, but those accesses
          are indirect anyway, and at least this way they can be accessed at
          all }
        if current_procinfo.has_nestedprocs then
          begin
            current_procinfo.procdef.localst.SymList.ForEachCall(@FixupOffsets,@totalstackframesize);
            current_procinfo.procdef.parast.SymList.ForEachCall(@FixupOffsets,@totalstackframesize);
          end;
      end;

    procedure tcgaarch64.g_maybe_got_init(list : TAsmList);
      begin
        { nothing to do on Darwin or Linux }
      end;

    procedure tcgaarch64.g_restore_registers(list:TAsmList);
      begin
        { done in g_proc_exit }
      end;
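
    { counterpart of save_regs: restores the registers in reverse order with
      post-indexed ldp; if an odd number of registers was saved, the highest
      one was stored twice and is reloaded first with a single ldr }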
    procedure tcgaarch64.load_regs(list: TAsmList; rt: tregistertype; lowsr, highsr: tsuperregister; sub: tsubregister);
      var
        ref: treference;
        sr, highestsetsr: tsuperregister;
        pairreg: tregister;
        regcount: longint;
      begin
        reference_reset_base(ref,NR_SP,16,ctempposinvalid,16,[]);
        ref.addressmode:=AM_POSTINDEXED;
        { highest reg stored twice? }
        regcount:=0;
        highestsetsr:=RS_NO;
        for sr:=lowsr to highsr do
          if sr in rg[rt].used_in_proc then
            begin
              inc(regcount);
              highestsetsr:=sr;
            end;
        if odd(regcount) then
          begin
            list.concat(taicpu.op_reg_ref(A_LDR,newreg(rt,highestsetsr,sub),ref));
            highestsetsr:=pred(highestsetsr);
          end;
        { load all (other) used registers pairwise }
        pairreg:=NR_NO;
        for sr:=highestsetsr downto lowsr do
          if sr in rg[rt].used_in_proc then
            if pairreg=NR_NO then
              pairreg:=newreg(rt,sr,sub)
            else
              begin
                list.concat(taicpu.op_reg_reg_ref(A_LDP,newreg(rt,sr,sub),pairreg,ref));
                pairreg:=NR_NO
              end;
        { there can't be any register left }
        if pairreg<>NR_NO then
          internalerror(2014112602);
      end;
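
    { epilogue: release the local stack space, reload the callee-saved
      registers stored by g_proc_entry, restore fp/lr and return }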
    procedure tcgaarch64.g_proc_exit(list: TAsmList; parasize: longint; nostackframe: boolean);
      var
        ref: treference;
        regsstored: boolean;
        sr: tsuperregister;
      begin
        if not nostackframe then
          begin
            { if no registers have been stored, we don't have to subtract the
              allocated temp space from the stack pointer }
            regsstored:=false;
            for sr:=RS_X19 to RS_X28 do
              if sr in rg[R_INTREGISTER].used_in_proc then
                begin
                  regsstored:=true;
                  break;
                end;
            if not regsstored then
              for sr:=RS_D8 to RS_D15 do
                if sr in rg[R_MMREGISTER].used_in_proc then
                  begin
                    regsstored:=true;
                    break;
                  end;
            { restore registers (and stack pointer) }
            if regsstored then
              begin
                if current_procinfo.final_localsize<>0 then
                  handle_reg_imm12_reg(list,A_ADD,OS_ADDR,NR_SP,current_procinfo.final_localsize,NR_SP,NR_IP0,false,true);
                load_regs(list,R_MMREGISTER,RS_D8,RS_D15,R_SUBMMD);
                load_regs(list,R_INTREGISTER,RS_X19,RS_X28,R_SUBWHOLE);
              end
            else if current_procinfo.final_localsize<>0 then
              { restore stack pointer }
              a_load_reg_reg(list,OS_ADDR,OS_ADDR,NR_FP,NR_SP);
            { restore framepointer and return address }
            reference_reset_base(ref,NR_SP,16,ctempposinvalid,16,[]);
            ref.addressmode:=AM_POSTINDEXED;
            list.concat(taicpu.op_reg_reg_ref(A_LDP,NR_FP,NR_LR,ref));
          end;
        { return }
        list.concat(taicpu.op_none(A_RET));
      end;

    procedure tcgaarch64.g_save_registers(list : TAsmList);
      begin
        { done in g_proc_entry }
      end;

    { ************* concatcopy ************ }
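
    { copies len bytes from source to dest by calling the FPC_MOVE runtime
      helper; used when an inline copy would be too large }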
    procedure tcgaarch64.g_concatcopy_move(list : TAsmList; const source, dest: treference; len: tcgint);
      var
        paraloc1, paraloc2, paraloc3: TCGPara;
        pd: tprocdef;
      begin
        pd:=search_system_proc('MOVE');
        paraloc1.init;
        paraloc2.init;
        paraloc3.init;
        paramanager.getintparaloc(list,pd,1,paraloc1);
        paramanager.getintparaloc(list,pd,2,paraloc2);
        paramanager.getintparaloc(list,pd,3,paraloc3);
        a_load_const_cgpara(list,OS_SINT,len,paraloc3);
        a_loadaddr_ref_cgpara(list,dest,paraloc2);
        a_loadaddr_ref_cgpara(list,source,paraloc1);
        paramanager.freecgpara(list,paraloc3);
        paramanager.freecgpara(list,paraloc2);
        paramanager.freecgpara(list,paraloc1);
        alloccpuregisters(list,R_INTREGISTER,paramanager.get_volatile_registers_int(pocall_default));
        alloccpuregisters(list,R_MMREGISTER,paramanager.get_volatile_registers_mm(pocall_default));
        a_call_name(list,'FPC_MOVE',false);
        dealloccpuregisters(list,R_MMREGISTER,paramanager.get_volatile_registers_mm(pocall_default));
        dealloccpuregisters(list,R_INTREGISTER,paramanager.get_volatile_registers_int(pocall_default));
        paraloc3.done;
        paraloc2.done;
        paraloc1.done;
      end;
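
    { inline memory copy: depending on length and alignment this emits a
      single load/store, a fully unrolled copy (optionally with ldp/stp
      register pairs), a compact cbnz-based copy loop, or falls back to
      g_concatcopy_move; leftover bytes are copied with successively smaller
      operation sizes }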
    procedure tcgaarch64.g_concatcopy(list: TAsmList; const source, dest: treference; len: tcgint);
      var
        sourcebasereplaced, destbasereplaced: boolean;

      { get optimal memory operation to use for loading/storing data
        in an unrolled loop }
      procedure getmemop(scaledop, unscaledop: tasmop; const startref, endref: treference; opsize: tcgsize; postfix: toppostfix; out memop: tasmop; out needsimplify: boolean);
        begin
          if (simple_ref_type(scaledop,opsize,postfix,startref)=sr_simple) and
             (simple_ref_type(scaledop,opsize,postfix,endref)=sr_simple) then
            begin
              memop:=scaledop;
              needsimplify:=false;
            end
          else if (unscaledop<>A_NONE) and
             (simple_ref_type(unscaledop,opsize,postfix,startref)=sr_simple) and
             (simple_ref_type(unscaledop,opsize,postfix,endref)=sr_simple) then
            begin
              memop:=unscaledop;
              needsimplify:=false;
            end
          else
            begin
              memop:=scaledop;
              needsimplify:=true;
            end;
        end;

      { adjust the offset and/or addressing mode after a load/store so it's
        correct for the next one of the same size }
      procedure updaterefafterloadstore(var ref: treference; oplen: longint);
        begin
          case ref.addressmode of
            AM_OFFSET:
              inc(ref.offset,oplen);
            AM_POSTINDEXED:
              { base register updated by instruction, next offset can remain
                the same }
              ;
            AM_PREINDEXED:
              begin
                { base register updated by instruction -> next instruction can
                  use post-indexing with offset = sizeof(operation) }
                ref.offset:=0;
                ref.addressmode:=AM_OFFSET;
              end;
          end;
        end;

      { generate a load/store and adjust the reference offset to the next
        memory location if necessary }
      procedure genloadstore(list: TAsmList; op: tasmop; reg: tregister; var ref: treference; postfix: toppostfix; opsize: tcgsize);
        begin
          list.concat(setoppostfix(taicpu.op_reg_ref(op,reg,ref),postfix));
          updaterefafterloadstore(ref,tcgsize2size[opsize]);
        end;

      { generate a dual load/store (ldp/stp) and adjust the reference offset
        to the next memory location if necessary }
      procedure gendualloadstore(list: TAsmList; op: tasmop; reg1, reg2: tregister; var ref: treference; postfix: toppostfix; opsize: tcgsize);
        begin
          list.concat(setoppostfix(taicpu.op_reg_reg_ref(op,reg1,reg2,ref),postfix));
          updaterefafterloadstore(ref,tcgsize2size[opsize]*2);
        end;

      { turn a reference into a pre- or post-indexed reference for use in a
        load/store of a particular size }
      procedure makesimpleforcopy(list: TAsmList; var scaledop: tasmop; opsize: tcgsize; postfix: toppostfix; forcepostindexing: boolean; var ref: treference; var basereplaced: boolean);
        var
          tmpreg: tregister;
          scaledoffset: longint;
          orgaddressmode: taddressmode;
        begin
          scaledoffset:=tcgsize2size[opsize];
          if scaledop in [A_LDP,A_STP] then
            scaledoffset:=scaledoffset*2;
          { can we use the reference as post-indexed without changes? }
          if forcepostindexing then
            begin
              orgaddressmode:=ref.addressmode;
              ref.addressmode:=AM_POSTINDEXED;
              if (orgaddressmode=AM_POSTINDEXED) or
                 ((ref.offset=0) and
                  (simple_ref_type(scaledop,opsize,postfix,ref)=sr_simple)) then
                begin
                  { just change the post-indexed offset to the access size }
                  ref.offset:=scaledoffset;
                  { and replace the base register if that didn't happen yet
                    (could be sp or a regvar) }
                  if not basereplaced then
                    begin
                      tmpreg:=getaddressregister(list);
                      a_load_reg_reg(list,OS_ADDR,OS_ADDR,ref.base,tmpreg);
                      ref.base:=tmpreg;
                      basereplaced:=true;
                    end;
                  exit;
                end;
              ref.addressmode:=orgaddressmode;
            end;
{$ifdef dummy}
          This could in theory be useful in case you have a concatcopy from
          e.g. x1+255 to x1+267 *and* the reference is aligned, but this seems
          very unlikely. Disabled because it still needs fixes, as it
          also generates pre-indexed loads right now at the very end for the
          left-over gencopies

          { can we turn it into a pre-indexed reference for free? (after the
            first operation, it will be turned into an offset one) }
          if not forcepostindexing and
             (ref.offset<>0) then
            begin
              orgaddressmode:=ref.addressmode;
              ref.addressmode:=AM_PREINDEXED;
              tmpreg:=ref.base;
              if not basereplaced and
                 (ref.base=tmpreg) then
                begin
                  tmpreg:=getaddressregister(list);
                  a_load_reg_reg(list,OS_ADDR,OS_ADDR,ref.base,tmpreg);
                  ref.base:=tmpreg;
                  basereplaced:=true;
                end;
              if simple_ref_type(scaledop,opsize,postfix,ref)<>sr_simple then
                make_simple_ref(list,scaledop,opsize,postfix,ref,NR_NO);
              exit;
            end;
{$endif dummy}
          if not forcepostindexing then
            begin
              ref.addressmode:=AM_OFFSET;
              make_simple_ref(list,scaledop,opsize,postfix,ref,NR_NO);
              { this may still cause problems if the final offset is no longer
                a simple ref; it's a bit complicated to pass all information
                through at all places and check that here, so play safe: we
                currently never generate unrolled copies for more than 64
                bytes (32 with non-double-register copies) }
              if ref.index=NR_NO then
                begin
                  if ((scaledop in [A_LDP,A_STP]) and
                      (ref.offset<((64-8)*tcgsize2size[opsize]))) or
                     ((scaledop in [A_LDUR,A_STUR]) and
                      (ref.offset<(255-8*tcgsize2size[opsize]))) or
                     ((scaledop in [A_LDR,A_STR]) and
                      (ref.offset<((4096-8)*tcgsize2size[opsize]))) then
                    exit;
                end;
            end;
          tmpreg:=getaddressregister(list);
          a_loadaddr_ref_reg(list,ref,tmpreg);
          basereplaced:=true;
          if forcepostindexing then
            begin
              reference_reset_base(ref,tmpreg,scaledoffset,ref.temppos,ref.alignment,ref.volatility);
              ref.addressmode:=AM_POSTINDEXED;
            end
          else
            begin
              reference_reset_base(ref,tmpreg,0,ref.temppos,ref.alignment,ref.volatility);
              ref.addressmode:=AM_OFFSET;
            end
        end;

      { prepare a reference for use by gencopy. This is done both after the
        unrolled and regular copy loop -> get rid of post-indexing mode, make
        sure ref is valid }
      procedure preparecopy(list: tasmlist; scaledop, unscaledop: tasmop; var ref: treference; opsize: tcgsize; postfix: toppostfix; out op: tasmop; var basereplaced: boolean);
        var
          simplify: boolean;
        begin
          if ref.addressmode=AM_POSTINDEXED then
            ref.offset:=tcgsize2size[opsize];
          getmemop(scaledop,scaledop,ref,ref,opsize,postfix,op,simplify);
          if simplify then
            begin
              makesimpleforcopy(list,scaledop,opsize,postfix,false,ref,basereplaced);
              op:=scaledop;
            end;
        end;

      { generate a copy from source to dest of size opsize/postfix }
      procedure gencopy(list: TAsmList; var source, dest: treference; postfix: toppostfix; opsize: tcgsize);
        var
          reg: tregister;
          loadop, storeop: tasmop;
        begin
          preparecopy(list,A_LDR,A_LDUR,source,opsize,postfix,loadop,sourcebasereplaced);
          preparecopy(list,A_STR,A_STUR,dest,opsize,postfix,storeop,destbasereplaced);
          reg:=getintregister(list,opsize);
          genloadstore(list,loadop,reg,source,postfix,opsize);
          genloadstore(list,storeop,reg,dest,postfix,opsize);
        end;

      { copy the leftovers after an unrolled or regular copy loop }
      procedure gencopyleftovers(list: TAsmList; var source, dest: treference; len: longint);
        begin
          { stop post-indexing if we did so in the loop, since in that case all
            offsets definitely can be represented now }
          if source.addressmode=AM_POSTINDEXED then
            begin
              source.addressmode:=AM_OFFSET;
              source.offset:=0;
            end;
          if dest.addressmode=AM_POSTINDEXED then
            begin
              dest.addressmode:=AM_OFFSET;
              dest.offset:=0;
            end;
          { transfer the leftovers }
          if len>=8 then
            begin
              dec(len,8);
              gencopy(list,source,dest,PF_NONE,OS_64);
            end;
          if len>=4 then
            begin
              dec(len,4);
              gencopy(list,source,dest,PF_NONE,OS_32);
            end;
          if len>=2 then
            begin
              dec(len,2);
              gencopy(list,source,dest,PF_H,OS_16);
            end;
          if len>=1 then
            begin
              dec(len);
              gencopy(list,source,dest,PF_B,OS_8);
            end;
        end;

      const
        { load_length + loop dec + cbnz }
        loopoverhead=12;
        { loop overhead + load + store }
        totallooplen=loopoverhead + 8;
      var
        totalalign: longint;
        maxlenunrolled: tcgint;
        loadop, storeop: tasmop;
        opsize: tcgsize;
        postfix: toppostfix;
        tmpsource, tmpdest: treference;
        scaledstoreop, unscaledstoreop,
        scaledloadop, unscaledloadop: tasmop;
        regs: array[1..8] of tregister;
        countreg: tregister;
        i, regcount: longint;
        hl: tasmlabel;
        simplifysource, simplifydest: boolean;
      begin
        if len=0 then
          exit;
        sourcebasereplaced:=false;
        destbasereplaced:=false;
        { maximum common alignment }
        totalalign:=max(1,newalignment(source.alignment,dest.alignment));
        { use a simple load/store? }
        if (len in [1,2,4,8]) and
           ((totalalign>=(len div 2)) or
            (source.alignment=len) or
            (dest.alignment=len)) then
          begin
            opsize:=int_cgsize(len);
            a_load_ref_ref(list,opsize,opsize,source,dest);
            exit;
          end;
        { alignment > length is not useful, and would break some checks below }
        while totalalign>len do
          totalalign:=totalalign div 2;
        { operation sizes to use based on common alignment }
        case totalalign of
          1:
            begin
              postfix:=PF_B;
              opsize:=OS_8;
            end;
          2:
            begin
              postfix:=PF_H;
              opsize:=OS_16;
            end;
          4:
            begin
              postfix:=PF_None;
              opsize:=OS_32;
            end
          else
            begin
              totalalign:=8;
              postfix:=PF_None;
              opsize:=OS_64;
            end;
        end;
        { maximum length to be handled with an unrolled loop (4 loads + 4 stores) }
        maxlenunrolled:=min(totalalign,8)*4;
        { ldp/stp -> 2 registers per instruction }
        if (totalalign>=4) and
           (len>=totalalign*2) then
          begin
            maxlenunrolled:=maxlenunrolled*2;
            scaledstoreop:=A_STP;
            scaledloadop:=A_LDP;
            unscaledstoreop:=A_NONE;
            unscaledloadop:=A_NONE;
          end
        else
          begin
            scaledstoreop:=A_STR;
            scaledloadop:=A_LDR;
            unscaledstoreop:=A_STUR;
            unscaledloadop:=A_LDUR;
          end;
        { we only need 4 instructions extra to call FPC_MOVE }
        if cs_opt_size in current_settings.optimizerswitches then
          maxlenunrolled:=maxlenunrolled div 2;
        if (len>maxlenunrolled) and
           (len>totalalign*8) then
          begin
            g_concatcopy_move(list,source,dest,len);
            exit;
          end;
        simplifysource:=true;
        simplifydest:=true;
        tmpsource:=source;
        tmpdest:=dest;
        { can we directly encode all offsets in an unrolled loop? }
        if len<=maxlenunrolled then
          begin
{$ifdef extdebug}
            list.concat(tai_comment.Create(strpnew('concatcopy unrolled loop; len/opsize/align: '+tostr(len)+'/'+tostr(tcgsize2size[opsize])+'/'+tostr(totalalign))));
{$endif extdebug}
            { the leftovers will be handled separately -> -(len mod opsize) }
            inc(tmpsource.offset,len-(len mod tcgsize2size[opsize]));
            { additionally, the last regular load/store will be at
              offset+len-opsize (if len-(len mod opsize)>0) }
            if tmpsource.offset>source.offset then
              dec(tmpsource.offset,tcgsize2size[opsize]);
            getmemop(scaledloadop,unscaledloadop,source,tmpsource,opsize,postfix,loadop,simplifysource);
            inc(tmpdest.offset,len-(len mod tcgsize2size[opsize]));
            if tmpdest.offset>dest.offset then
              dec(tmpdest.offset,tcgsize2size[opsize]);
            getmemop(scaledstoreop,unscaledstoreop,dest,tmpdest,opsize,postfix,storeop,simplifydest);
            tmpsource:=source;
            tmpdest:=dest;
            { if we can't directly encode all offsets, simplify }
            if simplifysource then
              begin
                loadop:=scaledloadop;
                makesimpleforcopy(list,loadop,opsize,postfix,false,tmpsource,sourcebasereplaced);
              end;
            if simplifydest then
              begin
                storeop:=scaledstoreop;
                makesimpleforcopy(list,storeop,opsize,postfix,false,tmpdest,destbasereplaced);
              end;
            regcount:=len div tcgsize2size[opsize];
            { in case we transfer two registers at a time, we copy an even
              number of registers }
            if loadop=A_LDP then
              regcount:=regcount and not(1);
            { initialise for dfa }
            regs[low(regs)]:=NR_NO;
            { max 4 loads/stores -> max 8 registers (in case of ldp/stp) }
            for i:=1 to regcount do
              regs[i]:=getintregister(list,opsize);
            if loadop=A_LDP then
              begin
                { load registers }
                for i:=1 to (regcount div 2) do
                  gendualloadstore(list,loadop,regs[i*2-1],regs[i*2],tmpsource,postfix,opsize);
                { store registers }
                for i:=1 to (regcount div 2) do
                  gendualloadstore(list,storeop,regs[i*2-1],regs[i*2],tmpdest,postfix,opsize);
              end
            else
              begin
                for i:=1 to regcount do
                  genloadstore(list,loadop,regs[i],tmpsource,postfix,opsize);
                for i:=1 to regcount do
                  genloadstore(list,storeop,regs[i],tmpdest,postfix,opsize);
              end;
            { leftover }
            len:=len-regcount*tcgsize2size[opsize];
{$ifdef extdebug}
            list.concat(tai_comment.Create(strpnew('concatcopy unrolled loop leftover: '+tostr(len))));
{$endif extdebug}
          end
        else
          begin
{$ifdef extdebug}
            list.concat(tai_comment.Create(strpnew('concatcopy regular loop; len/align: '+tostr(len)+'/'+tostr(totalalign))));
{$endif extdebug}
            { regular loop -> definitely use post-indexing }
            loadop:=scaledloadop;
            makesimpleforcopy(list,loadop,opsize,postfix,true,tmpsource,sourcebasereplaced);
            storeop:=scaledstoreop;
            makesimpleforcopy(list,storeop,opsize,postfix,true,tmpdest,destbasereplaced);
            current_asmdata.getjumplabel(hl);
            countreg:=getintregister(list,OS_32);
            { an ldp/stp iteration moves two registers' worth of data, so the
              iteration count has to be divided by 2*opsize, not opsize }
            if loadop=A_LDP then
              a_load_const_reg(list,OS_32,len div (tcgsize2size[opsize]*2),countreg)
            else
              a_load_const_reg(list,OS_32,len div tcgsize2size[opsize],countreg);
            a_label(list,hl);
            a_op_const_reg(list,OP_SUB,OS_32,1,countreg);
            if loadop=A_LDP then
              begin
                regs[1]:=getintregister(list,opsize);
                regs[2]:=getintregister(list,opsize);
                gendualloadstore(list,loadop,regs[1],regs[2],tmpsource,postfix,opsize);
                gendualloadstore(list,storeop,regs[1],regs[2],tmpdest,postfix,opsize);
              end
            else
              begin
                regs[1]:=getintregister(list,opsize);
                genloadstore(list,loadop,regs[1],tmpsource,postfix,opsize);
                genloadstore(list,storeop,regs[1],tmpdest,postfix,opsize);
              end;
            list.concat(taicpu.op_reg_sym_ofs(A_CBNZ,countreg,hl,0));
            { the leftover likewise depends on the amount copied per iteration }
            if loadop=A_LDP then
              len:=len mod (tcgsize2size[opsize]*2)
            else
              len:=len mod tcgsize2size[opsize];
          end;
        gencopyleftovers(list,tmpsource,tmpdest,len);
      end;

    procedure tcgaarch64.g_adjust_self_value(list: TAsmList; procdef: tprocdef; ioffset: tcgint);
      begin
        { This method is integrated into g_intf_wrapper and shouldn't be
          called separately }
        internalerror(2013020102);
      end;
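
    { tests the cumulative exception flags in the FPSR: bits 0..4 ($1f) are
      IOC/DZC/OFC/UFC/IXC and bit 7 ($80) is IDC (input denormal); if any of
      them is set, FPC_THROWFPUEXCEPTION is called }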
    procedure tcgaarch64.g_check_for_fpu_exception(list: TAsmList; force, clear: boolean);
      var
        r : TRegister;
        ai: taicpu;
        l1,l2: TAsmLabel;
      begin
        { so far, we assume all flavours of AArch64 need explicit floating
          point exception checking }
        if ((cs_check_fpu_exceptions in current_settings.localswitches) and
            (force or current_procinfo.FPUExceptionCheckNeeded)) then
          begin
            r:=getintregister(list,OS_INT);
            list.concat(taicpu.op_reg_reg(A_MRS,r,NR_FPSR));
            list.concat(taicpu.op_reg_const(A_TST,r,$1f));
            current_asmdata.getjumplabel(l1);
            current_asmdata.getjumplabel(l2);
            ai:=taicpu.op_sym(A_B,l1);
            ai.is_jmp:=true;
            ai.condition:=C_NE;
            list.concat(ai);
            list.concat(taicpu.op_reg_const(A_TST,r,$80));
            ai:=taicpu.op_sym(A_B,l2);
            ai.is_jmp:=true;
            ai.condition:=C_EQ;
            list.concat(ai);
            a_label(list,l1);
            alloccpuregisters(list,R_INTREGISTER,paramanager.get_volatile_registers_int(pocall_default));
            cg.a_call_name(list,'FPC_THROWFPUEXCEPTION',false);
            dealloccpuregisters(list,R_INTREGISTER,paramanager.get_volatile_registers_int(pocall_default));
            a_label(list,l2);
            if clear then
              current_procinfo.FPUExceptionCheckNeeded:=false;
          end;
      end;

    procedure create_codegen;
      begin
        cg:=tcgaarch64.Create;
        cg128:=tcg128.Create;
      end;

end.