{
Copyright (c) 2014 by Jonas Maebe
This unit implements the code generator for AArch64
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
****************************************************************************
}
unit cgcpu;
{$i fpcdefs.inc}
interface
uses
globtype,parabase,
cgbase,cgutils,cgobj,
aasmbase,aasmtai,aasmdata,aasmcpu,
cpubase,cpuinfo,
node,symconst,SymType,symdef,
rgcpu;
type
{ tcgaarch64 }
tcgaarch64=class(tcg)
protected
{ changes register size without adding register allocation info }
function makeregsize(reg: tregister; size: tcgsize): tregister; overload;
public
{ simplifies "ref" so it can be used with "op". If "ref" can be used
with a different load/store operation that has the same meaning as the
original one, "op" will be replaced with the alternative }
procedure make_simple_ref(list:TAsmList; var op: tasmop; size: tcgsize; oppostfix: toppostfix; var ref: treference; preferred_newbasereg: tregister);
function getfpuregister(list: TAsmList; size: Tcgsize): Tregister; override;
procedure handle_reg_imm12_reg(list: TAsmList; op: Tasmop; size: tcgsize; src: tregister; a: tcgint; dst: tregister; tmpreg: tregister; setflags, usedest: boolean);
procedure init_register_allocators;override;
procedure done_register_allocators;override;
function getmmregister(list:TAsmList;size:tcgsize):tregister;override;
function handle_load_store(list:TAsmList; op: tasmop; size: tcgsize; oppostfix: toppostfix; reg: tregister; ref: treference):treference;
procedure a_call_name(list:TAsmList;const s:string; weak: boolean);override;
procedure a_call_reg(list:TAsmList;Reg:tregister);override;
{ General purpose instructions }
procedure maybeadjustresult(list: TAsmList; op: topcg; size: tcgsize; dst: tregister);
procedure a_op_const_reg(list: TAsmList; op: topcg; size: tcgsize; a: tcgint; reg: tregister);override;
procedure a_op_reg_reg(list: TAsmList; op: topcg; size: tcgsize; src, dst: tregister);override;
procedure a_op_const_reg_reg(list: TAsmList; op: topcg; size: tcgsize; a: tcgint; src, dst: tregister);override;
procedure a_op_reg_reg_reg(list: TAsmList; op: topcg; size: tcgsize; src1, src2, dst: tregister);override;
procedure a_op_const_reg_reg_checkoverflow(list: TAsmList; op: topcg; size: tcgsize; a: tcgint; src, dst: tregister; setflags : boolean; var ovloc : tlocation);override;
procedure a_op_reg_reg_reg_checkoverflow(list: TAsmList; op: topcg; size: tcgsize; src1, src2, dst: tregister; setflags : boolean; var ovloc : tlocation);override;
{ move instructions }
procedure a_load_const_reg(list: TAsmList; size: tcgsize; a: tcgint; reg: tregister);override;
procedure a_load_const_ref(list: TAsmList; size: tcgsize; a: tcgint; const ref: treference); override;
procedure a_load_reg_ref(list: TAsmList; fromsize, tosize: tcgsize; reg: tregister;const ref: TReference);override;
procedure a_load_reg_ref_unaligned(list: TAsmList; fromsize, tosize: tcgsize; register: tregister; const ref: treference); override;
procedure a_load_ref_reg(list: TAsmList; fromsize, tosize: tcgsize; const ref: TReference; reg: tregister);override;
procedure a_load_ref_reg_unaligned(list: TAsmList; fromsize, tosize: tcgsize; const ref: treference; register: tregister); override;
procedure a_load_reg_reg(list: TAsmList; fromsize, tosize: tcgsize; reg1, reg2: tregister);override;
procedure a_loadaddr_ref_reg(list: TAsmList; const ref: TReference; r: tregister);override;
{ fpu move instructions (not used, all floating point is vector unit-based) }
procedure a_loadfpu_reg_reg(list: TAsmList; fromsize, tosize: tcgsize; reg1, reg2: tregister); override;
procedure a_loadfpu_ref_reg(list: TAsmList; fromsize, tosize: tcgsize; const ref: treference; reg: tregister); override;
procedure a_loadfpu_reg_ref(list: TAsmList; fromsize, tosize: tcgsize; reg: tregister; const ref: treference); override;
procedure a_loadmm_reg_reg(list: TAsmList; fromsize, tosize: tcgsize; reg1, reg2: tregister;shuffle : pmmshuffle);override;
procedure a_loadmm_ref_reg(list: TAsmList; fromsize, tosize: tcgsize; const ref: TReference; reg: tregister; shuffle: pmmshuffle);override;
procedure a_loadmm_reg_ref(list: TAsmList; fromsize, tosize: tcgsize; reg: tregister; const ref: TReference; shuffle: pmmshuffle);override;
procedure a_loadmm_intreg_reg(list: TAsmList; fromsize, tosize: tcgsize; intreg, mmreg: tregister; shuffle: pmmshuffle); override;
procedure a_loadmm_reg_intreg(list: TAsmList; fromsize, tosize: tcgsize; mmreg, intreg: tregister; shuffle: pmmshuffle); override;
procedure a_opmm_reg_reg(list: TAsmList; Op: TOpCG; size: tcgsize; src, dst: tregister; shuffle: pmmshuffle); override;
procedure a_bit_scan_reg_reg(list: TAsmList; reverse,not_zero: boolean; srcsize, dstsize: tcgsize; src, dst: TRegister); override;
{ comparison operations }
procedure a_cmp_const_reg_label(list: TAsmList; size: tcgsize; cmp_op: topcmp; a: tcgint; reg: tregister; l: tasmlabel);override;
procedure a_cmp_reg_reg_label(list: TAsmList; size: tcgsize; cmp_op: topcmp; reg1, reg2: tregister; l: tasmlabel);override;
procedure a_jmp_always(list: TAsmList; l: TAsmLabel);override;
procedure a_jmp_name(list: TAsmList; const s: string);override;
procedure a_jmp_cond(list: TAsmList; cond: TOpCmp; l: tasmlabel);{ override;}
procedure a_jmp_flags(list: TAsmList; const f: tresflags; l: tasmlabel);override;
procedure g_flags2reg(list: TAsmList; size: tcgsize; const f:tresflags; reg: tregister);override;
procedure g_overflowcheck(list: TAsmList; const loc: tlocation; def: tdef);override;
procedure g_overflowcheck_loc(list: TAsmList; const loc: tlocation; def: tdef; ovloc: tlocation);override;
procedure g_stackpointer_alloc(list: TAsmList; localsize: longint);override;
procedure g_proc_entry(list: TAsmList; localsize: longint; nostackframe: boolean);override;
procedure g_proc_exit(list: TAsmList; parasize: longint; nostackframe: boolean);override;
procedure g_maybe_got_init(list: TAsmList); override;
procedure g_restore_registers(list: TAsmList);override;
procedure g_save_registers(list: TAsmList);override;
procedure g_concatcopy(list: TAsmList; const source, dest: treference; len: tcgint);override;
procedure g_adjust_self_value(list: TAsmList; procdef: tprocdef; ioffset: tcgint);override;
procedure g_check_for_fpu_exception(list: TAsmList; force, clear: boolean);override;
procedure g_profilecode(list: TAsmList);override;
private
function save_regs(list: TAsmList; rt: tregistertype; lowsr, highsr: tsuperregister; sub: tsubregister): longint;
procedure load_regs(list: TAsmList; rt: tregistertype; lowsr, highsr: tsuperregister; sub: tsubregister);
end;
procedure create_codegen;
const
TOpCG2AsmOpReg: array[topcg] of TAsmOp = (
A_NONE,A_MOV,A_ADD,A_AND,A_UDIV,A_SDIV,A_MUL,A_MUL,A_NEG,A_MVN,A_ORR,A_ASRV,A_LSLV,A_LSRV,A_SUB,A_EOR,A_NONE,A_RORV
);
TOpCG2AsmOpImm: array[topcg] of TAsmOp = (
A_NONE,A_MOV,A_ADD,A_AND,A_UDIV,A_SDIV,A_MUL,A_MUL,A_NEG,A_MVN,A_ORR,A_ASR,A_LSL,A_LSR,A_SUB,A_EOR,A_NONE,A_ROR
);
TOpCmp2AsmCond: array[topcmp] of TAsmCond = (C_NONE,C_EQ,C_GT,
C_LT,C_GE,C_LE,C_NE,C_LS,C_CC,C_CS,C_HI
);
winstackpagesize = 4096;
implementation
uses
globals,verbose,systems,cutils,cclasses,
paramgr,fmodule,
symtable,symsym,
tgobj,
ncgutil,
procinfo,cpupi;
procedure tcgaarch64.make_simple_ref(list:TAsmList; var op: tasmop; size: tcgsize; oppostfix: toppostfix; var ref: treference; preferred_newbasereg: tregister);
var
href: treference;
so: tshifterop;
accesssize: longint;
begin
if (ref.base=NR_NO) then
begin
if ref.shiftmode<>SM_None then
internalerror(2014110701);
ref.base:=ref.index;
ref.index:=NR_NO;
end;
{ no arbitrary scale factor support (the generic code doesn't set it,
AArch64-specific code shouldn't either) }
if not(ref.scalefactor in [0,1]) then
internalerror(2014111002);
case simple_ref_type(op,size,oppostfix,ref) of
sr_simple:
exit;
sr_internal_illegal:
internalerror(2014121702);
sr_complex:
{ continue } ;
end;
if assigned(ref.symbol) then
begin
{ internal "load symbol" instructions should already be valid }
if assigned(ref.symboldata) or
(ref.refaddr in [addr_pic,addr_gotpage,addr_gotpageoffset,addr_page,addr_pageoffset]) then
internalerror(2014110802);
{ no relative symbol support (needed) yet }
if assigned(ref.relsymbol) then
internalerror(2014111001);
{ loading a symbol address (whether it's in the GOT or not) consists
of two parts: first load the page on which it is located, then
either the offset in the page or load the value at that offset in
the page. This final GOT-load can be relaxed by the linker in case
the variable itself can be stored directly in the GOT }
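{ Illustrative sequences (not emitted literally here; the actual
instructions are constructed below via addr_page/addr_pageoffset
and addr_gotpage/addr_gotpageoffset; relocation spellings vary
by assembler):
adrp x16, sym ; add x16, x16, :lo12:sym -- direct address
adrp x16, :got:sym ; ldr x16, [x16, :got_lo12:sym] -- via the GOT }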
if (preferred_newbasereg=NR_NO) or
(ref.base=preferred_newbasereg) or
(ref.index=preferred_newbasereg) then
preferred_newbasereg:=getaddressregister(list);
{ load the (GOT) page }
reference_reset_symbol(href,ref.symbol,0,8,[]);
if ((ref.symbol.typ in [AT_FUNCTION,AT_LABEL,AT_DATA]) and
(ref.symbol.bind in [AB_LOCAL,AB_GLOBAL])) or
((ref.symbol.typ=AT_DATA) and
(ref.symbol.bind=AB_LOCAL)) or
(target_info.system=system_aarch64_win64) then
href.refaddr:=addr_page
else
href.refaddr:=addr_gotpage;
list.concat(taicpu.op_reg_ref(A_ADRP,preferred_newbasereg,href));
{ load the GOT entry (= address of the variable) }
reference_reset_base(href,preferred_newbasereg,0,ctempposinvalid,sizeof(pint),[]);
href.symbol:=ref.symbol;
{ code symbols defined in the current compilation unit do not
have to be accessed via the GOT }
if ((ref.symbol.typ in [AT_FUNCTION,AT_LABEL,AT_DATA]) and
(ref.symbol.bind in [AB_LOCAL,AB_GLOBAL])) or
((ref.symbol.typ=AT_DATA) and
(ref.symbol.bind=AB_LOCAL)) or
(target_info.system=system_aarch64_win64) then
begin
href.base:=NR_NO;
href.refaddr:=addr_pageoffset;
list.concat(taicpu.op_reg_reg_ref(A_ADD,preferred_newbasereg,preferred_newbasereg,href));
end
else
begin
href.refaddr:=addr_gotpageoffset;
{ use a_load_ref_reg() rather than directly encoding the LDR,
so that we'll check the validity of the reference }
a_load_ref_reg(list,OS_ADDR,OS_ADDR,href,preferred_newbasereg);
end;
{ set as new base register }
if ref.base=NR_NO then
ref.base:=preferred_newbasereg
else if ref.index=NR_NO then
ref.index:=preferred_newbasereg
else
begin
{ make sure it's valid in case ref.base is SP -> make it
the second operand }
a_op_reg_reg_reg(list,OP_ADD,OS_ADDR,preferred_newbasereg,ref.base,preferred_newbasereg);
ref.base:=preferred_newbasereg
end;
ref.symbol:=nil;
end;
{ base & index }
if (ref.base<>NR_NO) and
(ref.index<>NR_NO) then
begin
case op of
A_LDR, A_STR:
begin
if (ref.shiftmode=SM_None) and
(ref.shiftimm<>0) then
internalerror(2014110805);
{ wrong shift? (possible in case of something like
array_of_2byte_rec[x].bytefield -> shift will be set to 1, but
the final load is 1 byte -> can't use the shift after all) }
if (ref.shiftmode in [SM_LSL,SM_UXTW,SM_SXTW]) and
((ref.shiftimm<>BsfDWord(tcgsizep2size[size])) or
(ref.offset<>0)) then
begin
if preferred_newbasereg=NR_NO then
preferred_newbasereg:=getaddressregister(list);
{ "add" supports a superset of the shift modes supported by
load/store instructions }
shifterop_reset(so);
so.shiftmode:=ref.shiftmode;
so.shiftimm:=ref.shiftimm;
list.concat(taicpu.op_reg_reg_reg_shifterop(A_ADD,preferred_newbasereg,ref.base,ref.index,so));
reference_reset_base(ref,preferred_newbasereg,ref.offset,ref.temppos,ref.alignment,ref.volatility);
{ possibly still an invalid offset -> fall through }
end
else if ref.offset<>0 then
begin
if (preferred_newbasereg=NR_NO) or
{ we keep ref.index, so it must not be overwritten }
(ref.index=preferred_newbasereg) then
preferred_newbasereg:=getaddressregister(list);
{ add to the base and not to the index, because the index
may be scaled; this works even if the base is SP }
a_op_const_reg_reg(list,OP_ADD,OS_ADDR,ref.offset,ref.base,preferred_newbasereg);
ref.offset:=0;
ref.base:=preferred_newbasereg;
{ finished }
exit;
end
else
{ valid -> exit }
exit;
end;
{ todo }
A_LD1,A_LD2,A_LD3,A_LD4,
A_ST1,A_ST2,A_ST3,A_ST4:
internalerror(2014110702);
{ these don't support base+index }
A_LDUR,A_STUR,
A_LDP,A_STP:
begin
{ these either don't support pre-/post-indexing, or don't
support it with base+index }
if ref.addressmode<>AM_OFFSET then
internalerror(2014110911);
if preferred_newbasereg=NR_NO then
preferred_newbasereg:=getaddressregister(list);
if ref.shiftmode<>SM_None then
begin
{ "add" supports a superset of the shift modes supported by
load/store instructions }
shifterop_reset(so);
so.shiftmode:=ref.shiftmode;
so.shiftimm:=ref.shiftimm;
list.concat(taicpu.op_reg_reg_reg_shifterop(A_ADD,preferred_newbasereg,ref.base,ref.index,so));
end
else
a_op_reg_reg_reg(list,OP_ADD,OS_ADDR,ref.index,ref.base,preferred_newbasereg);
reference_reset_base(ref,preferred_newbasereg,ref.offset,ref.temppos,ref.alignment,ref.volatility);
{ fall through to the handling of base + offset, since the
offset may still be too big }
end;
else
internalerror(2014110903);
end;
end;
{ base + offset }
if ref.base<>NR_NO then
begin
if ref.offset=0 then
exit;
{ valid offset for LDUR/STUR -> use that }
if (ref.addressmode=AM_OFFSET) and
(op in [A_LDR,A_STR]) and
(ref.offset>=-256) and
(ref.offset<=255) then
begin
if op=A_LDR then
op:=A_LDUR
else
op:=A_STUR
end
{ if it's not a valid LDUR/STUR, use LDR/STR }
else if (op in [A_LDUR,A_STUR]) and
((ref.offset<-256) or
(ref.offset>255) or
(ref.addressmode<>AM_OFFSET)) then
begin
if op=A_LDUR then
op:=A_LDR
else
op:=A_STR
end;
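{ Note (illustrative): LDUR/STUR take a signed, unscaled 9 bit
immediate (-256..255), while LDR/STR take an unsigned 12 bit
immediate scaled by the access size -- so e.g. a negative offset
like "ldr x0,[x1,#-8]" must become "ldur x0,[x1,#-8]", and vice
versa for offsets outside the LDUR range }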
case op of
A_LDR,A_STR:
begin
case ref.addressmode of
AM_PREINDEXED:
begin
{ since the loaded/stored register cannot be the same
as the base register, we can safely add the
offset to the base if it doesn't fit }
if (ref.offset<-256) or
(ref.offset>255) then
begin
a_op_const_reg(list,OP_ADD,OS_ADDR,ref.offset,ref.base);
ref.offset:=0;
end;
end;
AM_POSTINDEXED:
begin
{ cannot emulate post-indexing if we have to fold the
offset into the base register }
if (ref.offset<-256) or
(ref.offset>255) then
internalerror(2014110909);
{ ok }
end;
AM_OFFSET:
begin
{ unsupported offset -> fold into base register }
accesssize:=1 shl tcgsizep2size[size];
if (ref.offset<0) or
(ref.offset>(((1 shl 12)-1)*accesssize)) or
((ref.offset mod accesssize)<>0) then
begin
if preferred_newbasereg=NR_NO then
preferred_newbasereg:=getaddressregister(list);
{ can we split the offset between an
"add/sub (imm12 shl 12)" and the load (also an
imm12)?
-- the offset from the load will always be added,
that's why the lower bound has a smaller range
than the upper bound; it must also be a multiple
of the access size }
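{ worked example (illustrative): a 4 byte load at offset $12344
is split into "add tmp,base,#$12000" plus a load at offset
$344 from tmp; both parts fit in an imm12 }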
if (ref.offset>=-(((1 shl 12)-1) shl 12)) and
(ref.offset<=((1 shl 12)-1) shl 12 + ((1 shl 12)-1)) and
((ref.offset mod accesssize)=0) then
begin
a_op_const_reg_reg(list,OP_ADD,OS_ADDR,(ref.offset shr 12) shl 12,ref.base,preferred_newbasereg);
ref.offset:=ref.offset-(ref.offset shr 12) shl 12;
end
else
begin
a_op_const_reg_reg(list,OP_ADD,OS_ADDR,ref.offset,ref.base,preferred_newbasereg);
ref.offset:=0;
end;
reference_reset_base(ref,preferred_newbasereg,ref.offset,ref.temppos,ref.alignment,ref.volatility);
end;
end
end;
end;
A_LDP,A_STP:
begin
{ unsupported offset -> fold into base register (these
instructions support all addressmodes) }
if (ref.offset<-(1 shl (6+tcgsizep2size[size]))) or
(ref.offset>(1 shl (6+tcgsizep2size[size]))-1) then
begin
case ref.addressmode of
AM_POSTINDEXED:
{ don't emulate post-indexing if we have to fold the
offset into the base register }
internalerror(2014110910);
AM_PREINDEXED:
{ this means the offset must be added to the current
base register }
preferred_newbasereg:=ref.base;
AM_OFFSET:
if preferred_newbasereg=NR_NO then
preferred_newbasereg:=getaddressregister(list);
end;
a_op_const_reg_reg(list,OP_ADD,OS_ADDR,ref.offset,ref.base,preferred_newbasereg);
reference_reset_base(ref,preferred_newbasereg,0,ref.temppos,ref.alignment,ref.volatility);
end
end;
A_LDUR,A_STUR:
begin
{ valid, checked above }
end;
{ todo }
A_LD1,A_LD2,A_LD3,A_LD4,
A_ST1,A_ST2,A_ST3,A_ST4:
internalerror(2014110908);
else
internalerror(2014110708);
end;
{ done }
exit;
end;
{ only an offset -> change to base (+ offset 0) }
if preferred_newbasereg=NR_NO then
preferred_newbasereg:=getaddressregister(list);
a_load_const_reg(list,OS_ADDR,ref.offset,preferred_newbasereg);
reference_reset_base(ref,preferred_newbasereg,0,ref.temppos,newalignment(8,ref.offset),ref.volatility);
end;
function tcgaarch64.makeregsize(reg: tregister; size: tcgsize): tregister;
var
subreg:Tsubregister;
begin
subreg:=cgsize2subreg(getregtype(reg),size);
result:=reg;
setsubreg(result,subreg);
end;
function tcgaarch64.getfpuregister(list: TAsmList; size: Tcgsize): Tregister;
begin
internalerror(2014122110);
{ squash warning }
result:=NR_NO;
end;
function tcgaarch64.handle_load_store(list: TAsmList; op: tasmop; size: tcgsize; oppostfix: toppostfix; reg: tregister; ref: treference):treference;
begin
make_simple_ref(list,op,size,oppostfix,ref,NR_NO);
list.concat(setoppostfix(taicpu.op_reg_ref(op,reg,ref),oppostfix));
result:=ref;
end;
procedure tcgaarch64.handle_reg_imm12_reg(list: TAsmList; op: Tasmop; size: tcgsize; src: tregister; a: tcgint; dst: tregister; tmpreg: tregister; setflags, usedest: boolean);
var
instr: taicpu;
so: tshifterop;
hadtmpreg: boolean;
begin
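{ AArch64 arithmetic immediates (illustrative): a 12 bit unsigned
value, optionally shifted left by 12, e.g.
add x0,x1,#$FFF -- imm12
add x0,x1,#$FFF000 -- imm12, lsl #12
anything else has to be materialised in a register first }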
{ imm12 }
if (a>=0) and
(a<=((1 shl 12)-1)) then
if usedest then
instr:=taicpu.op_reg_reg_const(op,dst,src,a)
else
instr:=taicpu.op_reg_const(op,src,a)
{ imm12 lsl 12 }
else if (a and not(((tcgint(1) shl 12)-1) shl 12))=0 then
begin
so.shiftmode:=SM_LSL;
so.shiftimm:=12;
if usedest then
instr:=taicpu.op_reg_reg_const_shifterop(op,dst,src,a shr 12,so)
else
instr:=taicpu.op_reg_const_shifterop(op,src,a shr 12,so)
end
else
begin
{ todo: other possible optimizations (e.g. load 16 bit constant in
register and then add/sub/cmp/cmn shifted the rest) }
if tmpreg=NR_NO then
begin
hadtmpreg:=false;
tmpreg:=getintregister(list,size);
end
else
begin
hadtmpreg:=true;
getcpuregister(list,tmpreg);
end;
a_load_const_reg(list,size,a,tmpreg);
if usedest then
instr:=taicpu.op_reg_reg_reg(op,dst,src,tmpreg)
else
instr:=taicpu.op_reg_reg(op,src,tmpreg);
if hadtmpreg then
ungetcpuregister(list,tmpreg);
end;
if setflags then
setoppostfix(instr,PF_S);
list.concat(instr);
end;
{****************************************************************************
Assembler code
****************************************************************************}
procedure tcgaarch64.init_register_allocators;
begin
inherited init_register_allocators;
rg[R_INTREGISTER]:=trgintcpu.create(R_INTREGISTER,R_SUBWHOLE,
[RS_X0,RS_X1,RS_X2,RS_X3,RS_X4,RS_X5,RS_X6,RS_X7,RS_X8,
RS_X9,RS_X10,RS_X11,RS_X12,RS_X13,RS_X14,RS_X15,RS_X16,RS_X17,
RS_X19,RS_X20,RS_X21,RS_X22,RS_X23,RS_X24,RS_X25,RS_X26,RS_X27,RS_X28
{ maybe we can enable this in the future for leaf functions (it's
the frame pointer)
,RS_X29 }],
first_int_imreg,[]);
rg[R_MMREGISTER]:=trgcpu.create(R_MMREGISTER,R_SUBMMD,
[RS_Q0,RS_Q1,RS_Q2,RS_Q3,RS_Q4,RS_Q5,RS_Q6,RS_Q7,
RS_Q8,RS_Q9,RS_Q10,RS_Q11,RS_Q12,RS_Q13,RS_Q14,RS_Q15,
RS_Q16,RS_Q17,RS_Q18,RS_Q19,RS_Q20,RS_Q21,RS_Q22,RS_Q23,
RS_Q24,RS_Q25,RS_Q26,RS_Q27,RS_Q28,RS_Q29,RS_Q30,RS_Q31],
first_mm_imreg,[]);
end;
procedure tcgaarch64.done_register_allocators;
begin
rg[R_INTREGISTER].free;
rg[R_FPUREGISTER].free;
rg[R_MMREGISTER].free;
inherited done_register_allocators;
end;
function tcgaarch64.getmmregister(list: TAsmList; size: tcgsize):tregister;
begin
case size of
OS_F32:
result:=rg[R_MMREGISTER].getregister(list,R_SUBMMS);
OS_F64:
result:=rg[R_MMREGISTER].getregister(list,R_SUBMMD)
else
internalerror(2014102701);
end;
end;
procedure tcgaarch64.a_call_name(list: TAsmList; const s: string; weak: boolean);
begin
if not weak then
list.concat(taicpu.op_sym(A_BL,current_asmdata.RefAsmSymbol(s,AT_FUNCTION)))
else
list.concat(taicpu.op_sym(A_BL,current_asmdata.WeakRefAsmSymbol(s,AT_FUNCTION)));
end;
procedure tcgaarch64.a_call_reg(list:TAsmList;Reg:tregister);
begin
list.concat(taicpu.op_reg(A_BLR,reg));
end;
{********************** load instructions ********************}
procedure tcgaarch64.a_load_const_reg(list: TAsmList; size: tcgsize; a: tcgint; reg : tregister);
var
opc: tasmop;
shift: byte;
so: tshifterop;
reginited,doinverted,extendedsize: boolean;
manipulated_a: tcgint;
leftover_a: word;
begin
{$ifdef extdebug}
list.concat(tai_comment.Create(strpnew('Generating constant ' + tostr(a) + ' / $' + hexstr(a, 16))));
{$endif extdebug}
extendedsize := (size in [OS_64,OS_S64]);
case a of
{ Small positive number }
$0..$FFFF:
begin
list.concat(taicpu.op_reg_const(A_MOVZ, reg, a));
Exit;
end;
{ Small negative number }
-65536..-1:
begin
list.concat(taicpu.op_reg_const(A_MOVN, reg, Word(not a)));
Exit;
end;
{ Can be represented as a negative number more compactly }
$FFFF0000..$FFFFFFFF:
begin
{ if we load a value into a 32 bit register, it is automatically
zero-extended to 64 bit }
list.concat(taicpu.op_reg_const(A_MOVN, makeregsize(reg,OS_32), Word(not a)));
Exit;
end;
else
begin
if not extendedsize then
{ Mostly so programmers don't get confused when they view the disassembly and
'a' is sign-extended to 64-bit, say, but also avoids potential problems with
third-party assemblers if the number is out of bounds for a given size }
a := Cardinal(a);
{ Check to see if a is a valid shifter constant that can be encoded in ORR as is }
if is_shifter_const(a,size) then
begin
{ Use synthetic "MOV" instruction instead of "ORR reg,wzr,#a" (an alias),
since AArch64 conventions prefer this, and it's clearer in the
disassembly }
list.concat(taicpu.op_reg_const(A_MOV,reg,a));
Exit;
end;
{ If the value of a fits into 32 bits, it's fastest to use movz/movk regardless }
if extendedsize and ((a shr 32) <> 0) then
begin
{ This determines whether this write can be performed with an ORR followed by MOVK
by copying the 3rd word to the 1st word for the ORR constant, then overwriting
the 1st word. The alternative would require 4 instructions. This sequence is
common when division reciprocals are calculated (e.g. 3 produces $AAAAAAAAAAAAAAAB). }
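{ Worked example (illustrative): a = $AAAAAAAAAAAAAAAB gives
leftover_a = $AAAB and manipulated_a = $AAAAAAAAAAAAAAAA, which
is a valid bitmask immediate, so two instructions suffice:
orr reg,xzr,#$AAAAAAAAAAAAAAAA
movk reg,#$AAAB }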
leftover_a := word(a and $FFFF);
manipulated_a := (a and $FFFFFFFFFFFF0000) or ((a shr 32) and $FFFF);
{ if manipulated_a = a, don't check, because is_shifter_const was already
called for a and it returned False. Reduces processing time. [Kit] }
if (manipulated_a <> a) and is_shifter_const(manipulated_a, OS_64) then
begin
{ Encode value as:
orr reg,xzr,manipulated_a
movk reg,#(leftover_a)
Use "orr" instead of "mov" here for the assembly dump so it better
implies that something special is happening with the number arrangement.
}
list.concat(taicpu.op_reg_reg_const(A_ORR, reg, NR_XZR, manipulated_a));
list.concat(taicpu.op_reg_const(A_MOVK, reg, leftover_a));
Exit;
end;
{ This determines whether this write can be performed with an ORR followed by MOVK
by copying the 2nd word to the 4th word for the ORR constant, then overwriting
the 4th word. The alternative would require 3 instructions }
leftover_a := word(a shr 48);
manipulated_a := (a and $0000FFFFFFFFFFFF);
if manipulated_a = $0000FFFFFFFFFFFF then
begin
{ This is even better, as we can just use a single MOVN on the last word }
shifterop_reset(so);
so.shiftmode := SM_LSL;
so.shiftimm := 48;
list.concat(taicpu.op_reg_const_shifterop(A_MOVN, reg, word(not leftover_a), so));
Exit;
end;
manipulated_a := manipulated_a or (((a shr 16) and $FFFF) shl 48);
{ if manipulated_a = a, don't check, because is_shifter_const was already
called for a and it returned False. Reduces processing time. [Kit] }
if (manipulated_a <> a) and is_shifter_const(manipulated_a, OS_64) then
begin
{ Encode value as:
orr reg,xzr,manipulated_a
movk reg,#(leftover_a),lsl #48
Use "orr" instead of "mov" here for the assembly dump so it better
implies that something special is happening with the number arrangement.
}
list.concat(taicpu.op_reg_reg_const(A_ORR, reg, NR_XZR, manipulated_a));
shifterop_reset(so);
so.shiftmode := SM_LSL;
so.shiftimm := 48;
list.concat(taicpu.op_reg_const_shifterop(A_MOVK, reg, leftover_a, so));
Exit;
end;
case a of
{ If a is in the given negative range, it can be stored
more efficiently if it is inverted. }
TCgInt($FFFF000000000000)..-65537:
begin
{ NOTE: This excluded range can be more efficiently
stored as the first 16 bits followed by a shifter constant }
case a of
TCgInt($FFFF0000FFFF0000)..TCgInt($FFFF0000FFFFFFFF):
doinverted := False;
else
begin
doinverted := True;
a := not a;
end;
end;
end;
else
doinverted := False;
end;
end
else
doinverted:=False;
end;
end;
reginited:=false;
shift:=0;
if doinverted then
opc:=A_MOVN
else
opc:=A_MOVZ;
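{ Fallback (illustrative): build the constant 16 bits at a time with
MOVZ/MOVN plus up to three MOVKs, e.g. a = $0001000200030004:
movz reg,#4
movk reg,#3,lsl #16
movk reg,#2,lsl #32
movk reg,#1,lsl #48 }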
repeat
{ leftover is shifterconst? (don't check if we can represent it just
as effectively with movz/movk, as this check is expensive) }
if (word(a)<>0) then
begin
if not doinverted and
((shift<tcgsize2size[size]*(8 div 2)) and
((a shr 16)<>0)) and
is_shifter_const(a shl shift,size) then
begin
if reginited then
list.concat(taicpu.op_reg_reg_const(A_ORR,reg,reg,a shl shift))
else
list.concat(taicpu.op_reg_reg_const(A_ORR,reg,makeregsize(NR_XZR,size),a shl shift));
exit;
end;
{ set all 16 bit parts <> 0 }
if shift=0 then
begin
list.concat(taicpu.op_reg_const(opc,reg,word(a)));
reginited:=true;
end
else
begin
shifterop_reset(so);
so.shiftmode:=SM_LSL;
so.shiftimm:=shift;
if not reginited then
begin
list.concat(taicpu.op_reg_const_shifterop(opc,reg,word(a),so));
reginited:=true;
end
else
begin
if doinverted then
list.concat(taicpu.op_reg_const_shifterop(A_MOVK,reg,word(not a),so))
else
list.concat(taicpu.op_reg_const_shifterop(A_MOVK,reg,word(a),so));
end;
end;
end;
a:=a shr 16;
inc(shift,16);
until a = 0;
if not reginited then
internalerror(2014102702);
end;
procedure tcgaarch64.a_load_const_ref(list: TAsmList; size: tcgsize; a: tcgint; const ref: treference);
var
reg: tregister;
href: treference;
i: Integer;
begin
{ use the zero register if possible }
if a=0 then
begin
href:=ref;
inc(href.offset,tcgsize2size[size]-1);
if (tcgsize2size[size]>1) and (ref.alignment=1) and (simple_ref_type(A_STUR,OS_8,PF_None,ref)=sr_simple) and
(simple_ref_type(A_STUR,OS_8,PF_None,href)=sr_simple) then
begin
href:=ref;
for i:=0 to tcgsize2size[size]-1 do
begin
a_load_const_ref(list,OS_8,0,href);
inc(href.offset);
end;
end
else
begin
if size in [OS_64,OS_S64] then
reg:=NR_XZR
else
reg:=NR_WZR;
a_load_reg_ref(list,size,size,reg,ref);
end;
end
else
inherited;
end;
procedure tcgaarch64.a_load_reg_ref(list: TAsmList; fromsize, tosize: tcgsize; reg: tregister; const ref: treference);
var
oppostfix:toppostfix;
hreg: tregister;
begin
if tcgsize2Size[fromsize]>=tcgsize2Size[tosize] then
begin
fromsize:=tosize;
reg:=makeregsize(list,reg,fromsize);
end
{ have a 32 bit register but need a 64 bit one? }
else if tosize in [OS_64,OS_S64] then
begin
{ sign extend if necessary }
if fromsize in [OS_S8,OS_S16,OS_S32] then
begin
{ can't overwrite reg, may be a constant reg }
hreg:=getintregister(list,tosize);
a_load_reg_reg(list,fromsize,tosize,reg,hreg);
reg:=hreg;
end
else
{ top 32 bit are zero by default }
reg:=makeregsize(reg,OS_64);
fromsize:=tosize;
end;
if not(target_info.system=system_aarch64_darwin) and (ref.alignment<>0) and
(ref.alignment<tcgsize2size[tosize]) then
begin
a_load_reg_ref_unaligned(list,fromsize,tosize,reg,ref);
end
else
begin
case tosize of
{ signed integer registers }
OS_8,
OS_S8:
oppostfix:=PF_B;
OS_16,
OS_S16:
oppostfix:=PF_H;
OS_32,
OS_S32,
OS_64,
OS_S64:
oppostfix:=PF_None;
else
InternalError(200308299);
end;
handle_load_store(list,A_STR,tosize,oppostfix,reg,ref);
end;
end;
procedure tcgaarch64.a_load_ref_reg(list: TAsmList; fromsize, tosize: tcgsize; const ref: treference; reg: tregister);
var
oppostfix:toppostfix;
begin
if tcgsize2Size[fromsize]>=tcgsize2Size[tosize] then
fromsize:=tosize;
{ ensure that all bits of the 32/64 register are always correctly set:
* default behaviour is always to zero-extend to the entire (64 bit)
register -> unsigned 8/16/32 bit loads only exist with a 32 bit
target register, as the upper 32 bit will be zeroed implicitly
-> always make target register 32 bit
* signed loads exist both with 32 and 64 bit target registers,
depending on whether the value should be sign extended to 32 or
to 64 bit (if sign extended to 32 bit, the upper 32 bits of the
corresponding 64 bit register are again zeroed) -> no need to
change anything (we only have 32 and 64 bit registers), except that
when loading an OS_S32 to a 32 bit register, we don't need/can't
use sign extension
}
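{ e.g. (illustrative):
ldrb w0,[x1] -- zero-extends the byte into all of x0
ldrsb w0,[x1] -- sign-extends to 32 bit, zeroes the upper 32 bits
ldrsb x0,[x1] -- sign-extends into the full 64 bit register }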
if fromsize in [OS_8,OS_16,OS_32] then
reg:=makeregsize(reg,OS_32);
if not(target_info.system=system_aarch64_darwin) and (ref.alignment<>0) and
(ref.alignment<tcgsize2size[fromsize]) then
begin
a_load_ref_reg_unaligned(list,fromsize,tosize,ref,reg);
exit;
end;
case fromsize of
{ signed integer registers }
OS_8:
oppostfix:=PF_B;
OS_S8:
oppostfix:=PF_SB;
OS_16:
oppostfix:=PF_H;
OS_S16:
oppostfix:=PF_SH;
OS_S32:
if getsubreg(reg)=R_SUBD then
oppostfix:=PF_NONE
else
oppostfix:=PF_SW;
OS_32,
OS_64,
OS_S64:
oppostfix:=PF_None;
else
InternalError(200308297);
end;
handle_load_store(list,A_LDR,fromsize,oppostfix,reg,ref);
{ clear upper 16 bits if the value was negative }
if (fromsize=OS_S8) and (tosize=OS_16) then
a_load_reg_reg(list,fromsize,tosize,reg,reg);
end;
procedure tcgaarch64.a_load_ref_reg_unaligned(list: TAsmList; fromsize, tosize: tcgsize; const ref: treference; register: tregister);
var
href: treference;
hreg1, hreg2, tmpreg,tmpreg2: tregister;
i : Integer;
begin
case fromsize of
OS_64,OS_S64:
begin
{ split into two 32 bit loads }
hreg1:=getintregister(list,OS_32);
hreg2:=getintregister(list,OS_32);
if target_info.endian=endian_big then
begin
tmpreg:=hreg1;
hreg1:=hreg2;
hreg2:=tmpreg;
end;
{ can we use LDP? }
if (ref.alignment=4) and
(simple_ref_type(A_LDP,OS_32,PF_None,ref)=sr_simple) then
list.concat(taicpu.op_reg_reg_ref(A_LDP,hreg1,hreg2,ref))
else
begin
a_load_ref_reg(list,OS_32,OS_32,ref,hreg1);
href:=ref;
inc(href.offset,4);
a_load_ref_reg(list,OS_32,OS_32,href,hreg2);
end;
a_load_reg_reg(list,OS_32,OS_64,hreg1,register);
list.concat(taicpu.op_reg_reg_const_const(A_BFI,register,makeregsize(hreg2,OS_64),32,32));
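{ BFI inserts the low 32 bits of hreg2 into bits 32..63 of the
destination, combining both halves into the full 64 bit value }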
  890. end;
  891. OS_16,OS_S16,
  892. OS_32,OS_S32:
  893. begin
  894. if ref.alignment=2 then
  895. begin
  896. href:=ref;
  897. if target_info.endian=endian_big then
  898. inc(href.offset,tcgsize2size[fromsize]-2);
  899. tmpreg:=getintregister(list,OS_32);
  900. a_load_ref_reg(list,OS_16,OS_32,href,tmpreg);
  901. tmpreg2:=getintregister(list,OS_32);
  902. for i:=1 to (tcgsize2size[fromsize]-1) div 2 do
  903. begin
  904. if target_info.endian=endian_big then
  905. dec(href.offset,2)
  906. else
  907. inc(href.offset,2);
  908. a_load_ref_reg(list,OS_16,OS_32,href,tmpreg2);
  909. list.concat(taicpu.op_reg_reg_const_const(A_BFI,tmpreg,tmpreg2,i*16,16));
  910. end;
  911. a_load_reg_reg(list,fromsize,tosize,tmpreg,register);
  912. end
  913. else
  914. begin
  915. href:=ref;
  916. if target_info.endian=endian_big then
  917. inc(href.offset,tcgsize2size[fromsize]-1);
  918. tmpreg:=getintregister(list,OS_32);
  919. a_load_ref_reg(list,OS_8,OS_32,href,tmpreg);
  920. tmpreg2:=getintregister(list,OS_32);
  921. for i:=1 to tcgsize2size[fromsize]-1 do
  922. begin
  923. if target_info.endian=endian_big then
  924. dec(href.offset)
  925. else
  926. inc(href.offset);
  927. a_load_ref_reg(list,OS_8,OS_32,href,tmpreg2);
  928. list.concat(taicpu.op_reg_reg_const_const(A_BFI,tmpreg,tmpreg2,i*8,8));
  929. end;
  930. if (tosize in [OS_S8,OS_S16]) then
  931. list.concat(taicpu.op_reg_reg(A_SXTH,tmpreg,tmpreg));
  932. a_load_reg_reg(list,fromsize,tosize,tmpreg,register);
  933. end;
  934. end;
  935. else
  936. inherited;
  937. end;
  938. end;
  939. procedure tcgaarch64.a_load_reg_reg(list:TAsmList;fromsize,tosize:tcgsize;reg1,reg2:tregister);
  940. var
  941. instr: taicpu;
  942. begin
{ we use both 32 and 64 bit registers -> insert a conversion when
we have to truncate/sign extend inside the (32 or 64 bit) register
holding the value, and when we sign extend from a 32 to a 64 bit
register }
if (tcgsize2size[fromsize]>tcgsize2size[tosize]) or
((tcgsize2size[fromsize]=tcgsize2size[tosize]) and
(fromsize<>tosize) and
not(fromsize in [OS_32,OS_S32,OS_64,OS_S64])) or
((fromsize in [OS_S8,OS_S16,OS_S32]) and
(tosize in [OS_64,OS_S64])) or
{ needs to mask out the sign in the top 16 bits }
((fromsize=OS_S8) and
(tosize=OS_16)) then
begin
case tosize of
OS_8:
list.concat(taicpu.op_reg_reg(A_UXTB,reg2,makeregsize(reg1,OS_32)));
OS_16:
list.concat(taicpu.op_reg_reg(A_UXTH,reg2,makeregsize(reg1,OS_32)));
OS_S8:
list.concat(taicpu.op_reg_reg(A_SXTB,reg2,makeregsize(reg1,OS_32)));
OS_S16:
list.concat(taicpu.op_reg_reg(A_SXTH,reg2,makeregsize(reg1,OS_32)));
{ while "mov wN, wM" automatically inserts a zero-extension and
hence we could encode a 64->32 bit move like that, the problem
is that we then can't distinguish 64->32 from 32->32 moves, and
the 64->32 truncation could be removed altogether... So use a
different instruction }
OS_32,
OS_S32:
{ in theory, reg1 should be 64 bit here (since fromsize>tosize),
but because of the way location_force_register() tries to
avoid superfluous zero/sign extensions, it's not always the
case -> also force reg1 to 64 bit }
list.concat(taicpu.op_reg_reg_const_const(A_UBFIZ,makeregsize(reg2,OS_64),makeregsize(reg1,OS_64),0,32));
OS_64,
OS_S64:
case fromsize of
OS_8:
list.concat(taicpu.op_reg_reg(A_UXTB,reg2,makeregsize(reg1,OS_64)));
OS_S8:
list.concat(taicpu.op_reg_reg(A_SXTB,reg2,makeregsize(reg1,OS_32)));
OS_16:
list.concat(taicpu.op_reg_reg(A_UXTH,reg2,makeregsize(reg1,OS_64)));
OS_S16:
list.concat(taicpu.op_reg_reg(A_SXTH,reg2,makeregsize(reg1,OS_32)));
OS_32:
list.concat(taicpu.op_reg_reg_const_const(A_UBFIZ,makeregsize(reg2,OS_64),makeregsize(reg1,OS_64),0,32));
OS_S32:
list.concat(taicpu.op_reg_reg(A_SXTW,reg2,makeregsize(reg1,OS_32)));
else
internalerror(2024070701);
end;
else
internalerror(2002090901);
end;
end
else
begin
{ 32 -> 32 bit move implies zero extension (sign extensions have
been handled above) -> also use for 32 <-> 64 bit moves }
if not(fromsize in [OS_64,OS_S64]) or
not(tosize in [OS_64,OS_S64]) then
instr:=taicpu.op_reg_reg(A_MOV,makeregsize(reg2,OS_32),makeregsize(reg1,OS_32))
else
instr:=taicpu.op_reg_reg(A_MOV,reg2,reg1);
list.Concat(instr);
{ Notify the register allocator that we have written a move instruction so
it can try to eliminate it. }
add_move_instruction(instr);
end;
end;
procedure tcgaarch64.a_loadaddr_ref_reg(list: TAsmList; const ref: treference; r: tregister);
var
href: treference;
so: tshifterop;
op: tasmop;
begin
op:=A_LDR;
href:=ref;
{ simplify as if we're going to perform a regular 64 bit load, using
"r" as the new base register if possible/necessary }
make_simple_ref(list,op,OS_ADDR,PF_None,href,r);
{ load literal? }
if assigned(href.symbol) then
begin
if (href.base<>NR_NO) or
(href.index<>NR_NO) or
not assigned(href.symboldata) then
internalerror(2014110912);
list.concat(taicpu.op_reg_sym_ofs(A_ADR,r,href.symbol,href.offset));
end
else
begin
if href.index<>NR_NO then
begin
if href.shiftmode<>SM_None then
begin
  1041. { "add" supports a supperset of the shift modes supported by
  1042. load/store instructions }
shifterop_reset(so);
so.shiftmode:=href.shiftmode;
so.shiftimm:=href.shiftimm;
list.concat(taicpu.op_reg_reg_reg_shifterop(A_ADD,r,href.base,href.index,so));
end
else
a_op_reg_reg_reg(list,OP_ADD,OS_ADDR,href.index,href.base,r);
end
else if href.offset<>0 then
a_op_const_reg_reg(list,OP_ADD,OS_ADDR,href.offset,href.base,r)
else
a_load_reg_reg(list,OS_ADDR,OS_ADDR,href.base,r);
end;
end;
procedure tcgaarch64.a_loadfpu_reg_reg(list: TAsmList; fromsize, tosize: tcgsize; reg1, reg2: tregister);
begin
internalerror(2014122107)
end;
procedure tcgaarch64.a_loadfpu_ref_reg(list: TAsmList; fromsize, tosize: tcgsize; const ref: treference; reg: tregister);
begin
internalerror(2014122108)
end;
procedure tcgaarch64.a_loadfpu_reg_ref(list: TAsmList; fromsize, tosize: tcgsize; reg: tregister; const ref: treference);
begin
internalerror(2014122109)
end;
procedure tcgaarch64.a_loadmm_reg_reg(list: TAsmList; fromsize, tosize: tcgsize; reg1, reg2: tregister; shuffle: pmmshuffle);
var
instr: taicpu;
begin
if assigned(shuffle) and
not shufflescalar(shuffle) then
internalerror(2014122104);
if fromsize=tosize then
begin
instr:=taicpu.op_reg_reg(A_FMOV,reg2,reg1);
{ Notify the register allocator that we have written a move
instruction so it can try to eliminate it. }
add_move_instruction(instr);
{ FMOV cannot generate a floating point exception }
end
else
begin
if (reg_cgsize(reg1)<>fromsize) or
(reg_cgsize(reg2)<>tosize) then
internalerror(2014110913);
instr:=taicpu.op_reg_reg(A_FCVT,reg2,reg1);
maybe_check_for_fpu_exception(list);
end;
list.Concat(instr);
end;
procedure tcgaarch64.a_loadmm_ref_reg(list: TAsmList; fromsize, tosize: tcgsize; const ref: treference; reg: tregister; shuffle: pmmshuffle);
var
tmpreg: tregister;
begin
if assigned(shuffle) and
not shufflescalar(shuffle) then
internalerror(2014122105);
tmpreg:=NR_NO;
if (fromsize<>tosize) then
begin
tmpreg:=reg;
reg:=getmmregister(list,fromsize);
end;
handle_load_store(list,A_LDR,fromsize,PF_None,reg,ref);
if (fromsize<>tosize) then
a_loadmm_reg_reg(list,fromsize,tosize,reg,tmpreg,nil);
end;
procedure tcgaarch64.a_loadmm_reg_ref(list: TAsmList; fromsize, tosize: tcgsize; reg: tregister; const ref: treference; shuffle: pmmshuffle);
var
tmpreg: tregister;
begin
if assigned(shuffle) and
not shufflescalar(shuffle) then
internalerror(2014122106);
if (fromsize<>tosize) then
begin
tmpreg:=getmmregister(list,tosize);
a_loadmm_reg_reg(list,fromsize,tosize,reg,tmpreg,nil);
reg:=tmpreg;
end;
handle_load_store(list,A_STR,tosize,PF_NONE,reg,ref);
end;
procedure tcgaarch64.a_loadmm_intreg_reg(list: TAsmList; fromsize, tosize: tcgsize; intreg, mmreg: tregister; shuffle: pmmshuffle);
begin
if not shufflescalar(shuffle) then
internalerror(2014122801);
if tcgsize2size[fromsize]<>tcgsize2size[tosize] then
internalerror(2014122803);
case tcgsize2size[tosize] of
4:
setsubreg(mmreg,R_SUBMMS);
8:
setsubreg(mmreg,R_SUBMMD);
else
internalerror(2020101310);
end;
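{ INS copies the integer register into lane 0 of the vector register
(illustrative: "ins v0.s[0], w0" for a 4 byte value) }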
  1140. list.concat(taicpu.op_indexedreg_reg(A_INS,mmreg,0,intreg));
  1141. end;
  1142. procedure tcgaarch64.a_loadmm_reg_intreg(list: TAsmList; fromsize, tosize: tcgsize; mmreg, intreg: tregister; shuffle: pmmshuffle);
  1143. var
  1144. r : tregister;
  1145. begin
  1146. if not shufflescalar(shuffle) then
  1147. internalerror(2014122802);
  1148. if tcgsize2size[fromsize]>tcgsize2size[tosize] then
  1149. internalerror(2014122804);
  1150. case tcgsize2size[fromsize] of
  1151. 4:
  1152. setsubreg(mmreg,R_SUBMMS);
  1153. 8:
  1154. setsubreg(mmreg,R_SUBMMD);
  1155. else
  1156. internalerror(2020101311);
  1157. end;
  1158. if tcgsize2size[fromsize]<tcgsize2size[tosize] then
  1159. r:=makeregsize(intreg,fromsize)
  1160. else
  1161. r:=intreg;
  1162. list.concat(taicpu.op_reg_reg(A_FMOV,r,mmreg));
  1163. end;
procedure tcgaarch64.a_opmm_reg_reg(list: TAsmList; Op: TOpCG; size: tcgsize; src, dst: tregister; shuffle: pmmshuffle);
  begin
    case op of
      { "xor Vx,Vx" is used to initialize global regvars to 0 }
      OP_XOR:
        begin
          if shuffle=nil then
            begin
              dst:=newreg(R_MMREGISTER,getsupreg(dst),R_SUBMM16B);
              src:=newreg(R_MMREGISTER,getsupreg(src),R_SUBMM16B);
              list.concat(taicpu.op_reg_reg_reg(A_EOR,dst,dst,src))
            end
          else if (src<>dst) or
             (reg_cgsize(src)<>size) or
             not shufflescalar(shuffle) then
            internalerror(2015011401)
          else
            case size of
              OS_F32,
              OS_F64:
                list.concat(taicpu.op_reg_const(A_MOVI,makeregsize(dst,OS_F64),0));
              else
                internalerror(2015011402);
            end;
        end
      else
        internalerror(2015011403);
    end;
  end;

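{ Bit scan via RBIT/CLZ: AArch64 has no forward bit scan instruction, so
  a forward scan reverses the bit order first and then counts leading
  zeros. For example, for a 32 bit src=%1000, RBIT turns the lowest set
  bit (index 3) into bit 28, and CLZ of the result then returns 3. }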
procedure tcgaarch64.a_bit_scan_reg_reg(list: TAsmList; reverse,not_zero: boolean; srcsize, dstsize: tcgsize; src, dst: TRegister);
  var
    bitsize: longint;
  begin
    if srcsize in [OS_64,OS_S64] then
      begin
        bitsize:=64;
      end
    else
      begin
        bitsize:=32;
      end;
    if not(not_zero) then
      { source is 0 -> dst will have to become 255 }
      list.concat(taicpu.op_reg_const(A_CMP,src,0));
    if reverse then
      begin
        list.Concat(taicpu.op_reg_reg(A_CLZ,makeregsize(dst,srcsize),src));
        { xor 31/63 is the same as setting the lower 5/6 bits to
          "31/63-(lower 5/6 bits of dst)" }
        list.Concat(taicpu.op_reg_reg_const(A_EOR,dst,dst,bitsize-1));
      end
    else
      begin
        list.Concat(taicpu.op_reg_reg(A_RBIT,makeregsize(dst,srcsize),src));
        list.Concat(taicpu.op_reg_reg(A_CLZ,dst,dst));
      end;
    { set dst to -1 if src was 0 }
    if not(not_zero) then
      begin
        list.Concat(taicpu.op_reg_reg_reg_cond(A_CSINV,dst,dst,makeregsize(NR_XZR,dstsize),C_NE));
        { mask the -1 to 255 if src was 0 (anyone find a two-instruction
          branch-free version? All of mine are 3...) }
        list.Concat(taicpu.op_reg_reg(A_UXTB,makeregsize(dst,OS_32),makeregsize(dst,OS_32)));
      end;
  end;

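{ Stores a register to a potentially unaligned memory location. Only the
  64 bit case is handled here, by splitting the value into two 32 bit
  halves; smaller sizes fall through to the generic implementation. }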
procedure tcgaarch64.a_load_reg_ref_unaligned(list: TAsmList; fromsize, tosize: tcgsize; register: tregister; const ref: treference);
  var
    href: treference;
    hreg1, hreg2, tmpreg: tregister;
  begin
    if fromsize in [OS_64,OS_S64] then
      begin
        { split into two 32 bit stores }
        hreg1:=getintregister(list,OS_32);
        hreg2:=getintregister(list,OS_32);
        a_load_reg_reg(list,OS_32,OS_32,makeregsize(register,OS_32),hreg1);
        a_op_const_reg_reg(list,OP_SHR,OS_64,32,register,makeregsize(hreg2,OS_64));
        if target_info.endian=endian_big then
          begin
            tmpreg:=hreg1;
            hreg1:=hreg2;
            hreg2:=tmpreg;
          end;
        { can we use STP? }
        if (ref.alignment=4) and
           (simple_ref_type(A_STP,OS_32,PF_None,ref)=sr_simple) then
          list.concat(taicpu.op_reg_reg_ref(A_STP,hreg1,hreg2,ref))
        else
          begin
            a_load_reg_ref(list,OS_32,OS_32,hreg1,ref);
            href:=ref;
            inc(href.offset,4);
            a_load_reg_ref(list,OS_32,OS_32,hreg2,href);
          end;
      end
    else
      inherited;
  end;

procedure tcgaarch64.maybeadjustresult(list: TAsmList; op: topcg; size: tcgsize; dst: tregister);
  const
    overflowops = [OP_MUL,OP_IMUL,OP_SHL,OP_ADD,OP_SUB,OP_NOT,OP_NEG];
  begin
    if (op in overflowops) and
       (size in [OS_8,OS_S8,OS_16,OS_S16]) then
      a_load_reg_reg(list,OS_32,size,makeregsize(dst,OS_32),makeregsize(dst,OS_32))
  end;

procedure tcgaarch64.a_op_const_reg(list: TAsmList; op: topcg; size: tcgsize; a: tcgint; reg: tregister);
  begin
    optimize_op_const(size,op,a);
    case op of
      OP_NONE:
        exit;
      OP_MOVE:
        a_load_const_reg(list,size,a,reg);
      OP_NEG,OP_NOT:
        internalerror(200306011);
      else
        a_op_const_reg_reg(list,op,size,a,reg,reg);
    end;
  end;

procedure tcgaarch64.a_op_reg_reg(list:TAsmList;op:topcg;size:tcgsize;src,dst:tregister);
  begin
    case op of
      OP_NEG,
      OP_NOT:
        begin
          if (op=OP_NOT) and (size in [OS_8,OS_S8]) then
            list.concat(taicpu.op_reg_reg_const(A_EOR,dst,src,255))
          else
            begin
              list.concat(taicpu.op_reg_reg(TOpCG2AsmOpReg[op],dst,src));
              maybeadjustresult(list,op,size,dst);
            end;
        end
      else
        a_op_reg_reg_reg(list,op,size,src,dst,dst);
    end;
  end;

procedure tcgaarch64.a_op_const_reg_reg(list: TAsmList; op: topcg; size: tcgsize; a: tcgint; src, dst: tregister);
  var
    l: tlocation;
  begin
    a_op_const_reg_reg_checkoverflow(list,op,size,a,src,dst,false,l);
  end;

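{ Note on rotation: AArch64 only has a register-count rotate right (RORV),
  so a left rotation by a variable count is implemented below as
  rol(x,n) = ror(x,width-n); e.g. rotating a 32 bit value left by 3 is the
  same as rotating it right by 29. }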
procedure tcgaarch64.a_op_reg_reg_reg(list: TAsmList; op: topcg; size: tcgsize; src1, src2, dst: tregister);
  var
    hreg: tregister;
  begin
    { no ROLV opcode... }
    if op=OP_ROL then
      begin
        case size of
          OS_32,OS_S32,
          OS_64,OS_S64:
            begin
              hreg:=getintregister(list,size);
              a_load_const_reg(list,size,tcgsize2size[size]*8,hreg);
              a_op_reg_reg(list,OP_SUB,size,src1,hreg);
              a_op_reg_reg_reg(list,OP_ROR,size,hreg,src2,dst);
              exit;
            end;
          else
            internalerror(2014111005);
        end;
      end
    else if (op=OP_ROR) and
       not(size in [OS_32,OS_S32,OS_64,OS_S64]) then
      internalerror(2014111006);
    if TOpCG2AsmOpReg[op]=A_NONE then
      internalerror(2014111007);
    list.concat(taicpu.op_reg_reg_reg(TOpCG2AsmOpReg[op],dst,src2,src1));
    maybeadjustresult(list,op,size,dst);
  end;

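{ Performs "dst := src op a". Negative add/sub immediates are folded by
  negating the constant and swapping ADD and SUB, since the aarch64
  add/sub immediate encoding only allows positive values; e.g.
  "add dst,src,#-1" becomes "sub dst,src,#1". }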
procedure tcgaarch64.a_op_const_reg_reg_checkoverflow(list: TAsmList; op: topcg; size: tcgsize; a: tcgint; src, dst: tregister; setflags : boolean; var ovloc : tlocation);
  var
    shiftcountmask: longint;
    constreg: tregister;
  begin
    { add/sub instructions have only positive immediate operands }
    if (op in [OP_ADD,OP_SUB]) and
       (a<0) and
       { this might result in a false positive overflow in case of a+0 }
       (a<>$8000000000000000) then
      begin
        if op=OP_ADD then
          op:=OP_SUB
        else
          op:=OP_ADD;
        { avoid range/overflow error in case a = low(tcgint) }
{$push}{$r-}{$q-}
        a:=-a;
{$pop}
      end;
    ovloc.loc:=LOC_VOID;
    optimize_op_const(size,op,a);
    case op of
      OP_NONE:
        begin
          a_load_reg_reg(list,size,size,src,dst);
          exit;
        end;
      OP_MOVE:
        begin
          a_load_const_reg(list,size,a,dst);
          exit;
        end;
      else
        ;
    end;
    case op of
      OP_ADD,
      OP_SUB:
        begin
          handle_reg_imm12_reg(list,TOpCG2AsmOpImm[op],size,src,a,dst,NR_NO,setflags,true);
          { on a 64 bit target, overflows with smaller data types
            are handled via range errors }
          if setflags and
             (size in [OS_64,OS_S64]) then
            begin
              location_reset(ovloc,LOC_FLAGS,OS_8);
              if size=OS_64 then
                if op=OP_ADD then
                  ovloc.resflags:=F_CS
                else
                  ovloc.resflags:=F_CC
              else
                ovloc.resflags:=F_VS;
            end;
        end;
      OP_OR,
      OP_AND,
      OP_XOR:
        begin
          if not(size in [OS_64,OS_S64]) then
            a:=cardinal(a);
          if is_shifter_const(a,size) then
            list.concat(taicpu.op_reg_reg_const(TOpCG2AsmOpReg[op],dst,src,a))
          else
            begin
              constreg:=getintregister(list,size);
              a_load_const_reg(list,size,a,constreg);
              a_op_reg_reg_reg(list,op,size,constreg,src,dst);
            end;
        end;
      OP_SHL,
      OP_SHR,
      OP_SAR:
        begin
          if size in [OS_64,OS_S64] then
            shiftcountmask:=63
          else
            shiftcountmask:=31;
          if (a and shiftcountmask)<>0 then
            list.concat(taicpu.op_reg_reg_const(
              TOpCG2AsmOpImm[op],dst,src,a and shiftcountmask))
          else
            a_load_reg_reg(list,size,size,src,dst);
          if (a and not(tcgint(shiftcountmask)))<>0 then
            internalerror(2014112101);
        end;
      OP_ROL,
      OP_ROR:
        begin
          case size of
            OS_32,OS_S32:
              if (a and not(tcgint(31)))<>0 then
                internalerror(2014112102);
            OS_64,OS_S64:
              if (a and not(tcgint(63)))<>0 then
                internalerror(2014112103);
            else
              internalerror(2014112104);
          end;
          { there's only a ror opcode }
          if op=OP_ROL then
            a:=(tcgsize2size[size]*8)-a;
          list.concat(taicpu.op_reg_reg_const(A_ROR,dst,src,a));
        end;
      OP_MUL,
      OP_IMUL,
      OP_DIV,
      OP_IDIV:
        begin
          constreg:=getintregister(list,size);
          a_load_const_reg(list,size,a,constreg);
          a_op_reg_reg_reg_checkoverflow(list,op,size,constreg,src,dst,setflags,ovloc);
        end;
      else
        internalerror(2014111403);
    end;
    maybeadjustresult(list,op,size,dst);
  end;

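{ Same as a_op_reg_reg_reg, but additionally materializes an overflow
  check when setflags is true: add/sub use the S-suffixed instruction and
  test the carry/overflow flag, while 64 bit multiplications compare the
  upper half of the 128 bit product (UMULH/SMULH) against the expected
  value. }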
procedure tcgaarch64.a_op_reg_reg_reg_checkoverflow(list: TAsmList; op: topcg; size: tcgsize; src1, src2, dst: tregister; setflags : boolean; var ovloc : tlocation);
  var
    tmpreg1, tmpreg2: tregister;
  begin
    ovloc.loc:=LOC_VOID;
    { overflow can only occur with 64 bit calculations on 64 bit cpus }
    if setflags and
       (size in [OS_64,OS_S64]) then
      begin
        case op of
          OP_ADD,
          OP_SUB:
            begin
              list.concat(setoppostfix(taicpu.op_reg_reg_reg(TOpCG2AsmOpReg[op],dst,src2,src1),PF_S));
              ovloc.loc:=LOC_FLAGS;
              if size=OS_64 then
                if op=OP_ADD then
                  ovloc.resflags:=F_CS
                else
                  ovloc.resflags:=F_CC
              else
                ovloc.resflags:=F_VS;
              { finished }
              exit;
            end;
          OP_MUL:
            begin
              { check whether the upper 64 bit of the 128 bit product is 0 }
              tmpreg1:=getintregister(list,OS_64);
              list.concat(taicpu.op_reg_reg_reg(A_UMULH,tmpreg1,src2,src1));
              list.concat(taicpu.op_reg_const(A_CMP,tmpreg1,0));
              ovloc.loc:=LOC_FLAGS;
              ovloc.resflags:=F_NE;
              { still have to perform the actual multiplication }
            end;
          OP_IMUL:
            begin
              { check whether the upper 64 bits of the 128 bit multiplication
                result have the same value as the replicated sign bit of the
                lower 64 bits }
              tmpreg1:=getintregister(list,OS_64);
              list.concat(taicpu.op_reg_reg_reg(A_SMULH,tmpreg1,src2,src1));
              { calculate lower 64 bits (afterwards, because dst may be
                equal to src1 or src2) }
              a_op_reg_reg_reg(list,op,size,src1,src2,dst);
              { replicate sign bit }
              tmpreg2:=getintregister(list,OS_64);
              a_op_const_reg_reg(list,OP_SAR,OS_S64,63,dst,tmpreg2);
              list.concat(taicpu.op_reg_reg(A_CMP,tmpreg1,tmpreg2));
              ovloc.loc:=LOC_FLAGS;
              ovloc.resflags:=F_NE;
              { finished }
              exit;
            end;
          OP_IDIV,
          OP_DIV:
            begin
              { not handled here: needs a div-by-zero check (dividing by
                zero just gives a 0 result on aarch64) and, for signed
                division, a low(int64) div -1 overflow check }
              internalerror(2014122101);
            end;
          else
            internalerror(2019050936);
        end;
      end;
    a_op_reg_reg_reg(list,op,size,src1,src2,dst);
  end;

{*************** compare instructions ****************}

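{ Compare-with-constant: negative constants are compared using CMN
  (compare negative), which adds the negated constant and sets the flags,
  so the immediate can always be encoded as a positive imm12 value. }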
procedure tcgaarch64.a_cmp_const_reg_label(list: TAsmList; size: tcgsize; cmp_op: topcmp; a: tcgint; reg: tregister; l: tasmlabel);
  var
    op: tasmop;
  begin
    if a>=0 then
      op:=A_CMP
    else
      op:=A_CMN;
    { avoid range/overflow error in case a=low(tcgint) }
{$push}{$r-}{$q-}
    handle_reg_imm12_reg(list,op,size,reg,abs(a),NR_XZR,NR_NO,false,false);
{$pop}
    a_jmp_cond(list,cmp_op,l);
  end;

procedure tcgaarch64.a_cmp_reg_reg_label(list: TAsmList; size: tcgsize; cmp_op: topcmp; reg1,reg2: tregister; l: tasmlabel);
  begin
    list.concat(taicpu.op_reg_reg(A_CMP,reg2,reg1));
    a_jmp_cond(list,cmp_op,l);
  end;

procedure tcgaarch64.a_jmp_always(list: TAsmList; l: TAsmLabel);
  var
    ai: taicpu;
  begin
    ai:=TAiCpu.op_sym(A_B,current_asmdata.RefAsmSymbol(l.name,AT_FUNCTION));
    ai.is_jmp:=true;
    list.Concat(ai);
  end;

procedure tcgaarch64.a_jmp_name(list: TAsmList; const s: string);
  var
    ai: taicpu;
  begin
    ai:=TAiCpu.op_sym(A_B,current_asmdata.RefAsmSymbol(s,AT_FUNCTION));
    ai.is_jmp:=true;
    list.Concat(ai);
  end;

procedure tcgaarch64.a_jmp_cond(list: TAsmList; cond: TOpCmp; l: TAsmLabel);
  var
    ai: taicpu;
  begin
    ai:=TAiCpu.op_sym(A_B,l);
    ai.is_jmp:=true;
    ai.SetCondition(TOpCmp2AsmCond[cond]);
    list.Concat(ai);
  end;

procedure tcgaarch64.a_jmp_flags(list: TAsmList; const f: tresflags; l: tasmlabel);
  var
    ai : taicpu;
  begin
    ai:=Taicpu.op_sym(A_B,l);
    ai.is_jmp:=true;
    ai.SetCondition(flags_to_cond(f));
    list.Concat(ai);
  end;

procedure tcgaarch64.g_flags2reg(list: TAsmList; size: tcgsize; const f: tresflags; reg: tregister);
  begin
    list.concat(taicpu.op_reg_cond(A_CSET,reg,flags_to_cond(f)));
  end;

procedure tcgaarch64.g_overflowcheck(list: TAsmList; const loc: tlocation; def: tdef);
  begin
    { we need an explicit overflow location, because there are many
      possibilities (not just the overflow flag, which is only used for
      signed add/sub) }
    internalerror(2014112303);
  end;

procedure tcgaarch64.g_overflowcheck_loc(list: TAsmList; const loc: tlocation; def: tdef; ovloc : tlocation);
  var
    hl : tasmlabel;
    hflags : tresflags;
  begin
    if not(cs_check_overflow in current_settings.localswitches) then
      exit;
    current_asmdata.getjumplabel(hl);
    case ovloc.loc of
      LOC_FLAGS:
        begin
          hflags:=ovloc.resflags;
          inverse_flags(hflags);
          cg.a_jmp_flags(list,hflags,hl);
        end;
      else
        internalerror(2014112304);
    end;
    a_call_name(list,'FPC_OVERFLOW',false);
    a_label(list,hl);
  end;

{ *********** entry/exit code and address loading ************ }

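{ Saves the callee-saved registers in [lowsr..highsr] that are used in the
  current procedure, two at a time with pre-indexed STP so the stack stays
  16 byte aligned; returns the number of bytes pushed. On win64, SEH
  directives are emitted alongside, and non-consecutive registers are
  stored individually with STR because the SEH pair directive only covers
  consecutive registers. }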
function tcgaarch64.save_regs(list: TAsmList; rt: tregistertype; lowsr, highsr: tsuperregister; sub: tsubregister): longint;
  var
    ref: treference;
    sr: tsuperregister;
    pairreg: tregister;
    sehreg,sehregp : TAsmSehDirective;
  begin
    result:=0;
    reference_reset_base(ref,NR_SP,-16,ctempposinvalid,16,[]);
    ref.addressmode:=AM_PREINDEXED;
    pairreg:=NR_NO;
    { for SEH on Win64 we can only store consecutive register pairs; other
      registers need to be stored with STR }
    if target_info.system=system_aarch64_win64 then
      begin
        if rt=R_INTREGISTER then
          begin
            sehreg:=ash_savereg_x;
            sehregp:=ash_saveregp_x;
          end
        else if rt=R_MMREGISTER then
          begin
            sehreg:=ash_savefreg_x;
            sehregp:=ash_savefregp_x;
          end
        else
          internalerror(2020041304);
        for sr:=lowsr to highsr do
          if sr in rg[rt].used_in_proc then
            if pairreg=NR_NO then
              pairreg:=newreg(rt,sr,sub)
            else
              begin
                inc(result,16);
                if getsupreg(pairreg)=sr-1 then
                  begin
                    list.concat(taicpu.op_reg_reg_ref(A_STP,pairreg,newreg(rt,sr,sub),ref));
                    list.concat(cai_seh_directive.create_reg_offset(sehregp,pairreg,16));
                    pairreg:=NR_NO;
                  end
                else
                  begin
                    list.concat(taicpu.op_reg_ref(A_STR,pairreg,ref));
                    list.concat(cai_seh_directive.create_reg_offset(sehreg,pairreg,16));
                    pairreg:=newreg(rt,sr,sub);
                  end;
              end;
        if pairreg<>NR_NO then
          begin
            inc(result,16);
            list.concat(taicpu.op_reg_ref(A_STR,pairreg,ref));
            list.concat(cai_seh_directive.create_reg_offset(sehreg,pairreg,16));
          end;
      end
    else
      begin
        { store all used registers pairwise }
        for sr:=lowsr to highsr do
          if sr in rg[rt].used_in_proc then
            if pairreg=NR_NO then
              pairreg:=newreg(rt,sr,sub)
            else
              begin
                inc(result,16);
                list.concat(taicpu.op_reg_reg_ref(A_STP,pairreg,newreg(rt,sr,sub),ref));
                pairreg:=NR_NO
              end;
        { one left -> store twice (stack must be 16 bytes aligned) }
        if pairreg<>NR_NO then
          begin
            list.concat(taicpu.op_reg_reg_ref(A_STP,pairreg,pairreg,ref));
            inc(result,16);
          end;
      end;
  end;

procedure FixupOffsets(p:TObject;arg:pointer);
  var
    sym: tabstractnormalvarsym absolute p;
  begin
    if (tsym(p).typ in [paravarsym,localvarsym]) and
       (sym.localloc.loc=LOC_REFERENCE) and
       (sym.localloc.reference.base=NR_STACK_POINTER_REG) then
      begin
        sym.localloc.reference.base:=NR_FRAME_POINTER_REG;
        dec(sym.localloc.reference.offset,PLongint(arg)^);
      end;
  end;

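{ Allocates localsize bytes of stack space. On win64 the stack is grown
  page by page: Windows only guards a few pages below the current stack
  pointer, so every page of a large frame must be touched in order, either
  with an unrolled sequence of stores (few pages) or with a probe loop. }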
procedure tcgaarch64.g_stackpointer_alloc(list : TAsmList;localsize : longint);
  var
    href : treference;
    i : integer;
    again : tasmlabel;
  begin
    if localsize>0 then
      begin
        { windows guards only a few pages for stack growth,
          so we have to access every page first }
        if (target_info.system=system_aarch64_win64) and
           (localsize>=winstackpagesize) then
          begin
            if localsize div winstackpagesize<=4 then
              begin
                handle_reg_imm12_reg(list,A_SUB,OS_ADDR,NR_SP,localsize,NR_SP,NR_IP0,false,true);
                for i:=1 to localsize div winstackpagesize do
                  begin
                    reference_reset_base(href,NR_SP,localsize-i*winstackpagesize+4,ctempposinvalid,4,[]);
                    list.concat(Taicpu.op_reg_ref(A_STR,NR_WZR,href));
                  end;
                reference_reset_base(href,NR_SP,0,ctempposinvalid,4,[]);
                list.concat(Taicpu.op_reg_ref(A_STR,NR_WZR,href));
              end
            else
              begin
                current_asmdata.getjumplabel(again);
                getcpuregister(list,NR_IP0);
                a_load_const_reg(list,OS_ADDR,localsize div winstackpagesize,NR_IP0);
                a_label(list,again);
                handle_reg_imm12_reg(list,A_SUB,OS_ADDR,NR_SP,winstackpagesize,NR_SP,NR_IP1,false,true);
                reference_reset_base(href,NR_SP,0,ctempposinvalid,4,[]);
                list.concat(Taicpu.op_reg_ref(A_STR,NR_WZR,href));
                list.concat(setoppostfix(Taicpu.op_reg_reg_const(A_SUB,NR_IP0,NR_IP0,1),PF_S));
                a_jmp_cond(list,OC_NE,again);
                handle_reg_imm12_reg(list,A_SUB,OS_ADDR,NR_SP,localsize mod winstackpagesize,NR_SP,NR_IP1,false,true);
                ungetcpuregister(list,NR_IP0);
              end
          end
        else
          begin
            handle_reg_imm12_reg(list,A_SUB,OS_ADDR,NR_SP,localsize,NR_SP,NR_IP0,false,true);
            if target_info.system=system_aarch64_win64 then
              list.concat(cai_seh_directive.create_offset(ash_stackalloc,localsize));
          end;
      end;
  end;

procedure tcgaarch64.g_proc_entry(list: TAsmList; localsize: longint; nostackframe: boolean);
  var
    hitem: tlinkedlistitem;
    seh_proc: tai_seh_directive;
    templist: TAsmList;
    genloadframeforexcept,
    suppress_endprologue: boolean;
    ref: treference;
    totalstackframesize: longint;
  begin
    { on aarch64, we need to store the link register and generate a frame pointer if the subroutine either
      - receives parameters on the stack
      - is not a leaf procedure
      - has nested procedures
      - helpers retrieve the stack pointer
    }
    hitem:=list.last;
    { pi_has_unwind_info may already be set at this point if there are
      SEH directives in the assembler body. In this case, .seh_endprologue
      is expected to be one of those directives, and not generated here. }
    suppress_endprologue:=(pi_has_unwind_info in current_procinfo.flags);
    genloadframeforexcept:=false;
    if not nostackframe then
      begin
        { stack pointer has to be aligned to 16 bytes at all times }
        localsize:=align(localsize,16);
        if target_info.system=system_aarch64_win64 then
          include(current_procinfo.flags,pi_has_unwind_info);
        if not(pi_no_framepointer_needed in current_procinfo.flags) then
          begin
            { save stack pointer and return address }
            reference_reset_base(ref,NR_SP,-16,ctempposinvalid,16,[]);
            ref.addressmode:=AM_PREINDEXED;
            list.concat(taicpu.op_reg_reg_ref(A_STP,NR_FP,NR_LR,ref));
            current_asmdata.asmcfi.cfa_def_cfa_offset(list,2*sizeof(pint));
            current_asmdata.asmcfi.cfa_offset(list,NR_FP,-16);
            current_asmdata.asmcfi.cfa_offset(list,NR_LR,-8);
            if target_info.system=system_aarch64_win64 then
              list.concat(cai_seh_directive.create_offset(ash_savefplr_x,16));
            { initialise frame pointer }
            if current_procinfo.procdef.proctypeoption<>potype_exceptfilter then
              begin
                a_load_reg_reg(list,OS_ADDR,OS_ADDR,NR_SP,NR_FP);
                current_asmdata.asmcfi.cfa_def_cfa_register(list,NR_FP);
                if target_info.system=system_aarch64_win64 then
                  list.concat(cai_seh_directive.create(ash_setfp));
              end
            else
              begin
                { do this after the prologue is done for aarch64-win64 as
                  there is no SEH directive for setting FP to a register }
                if target_info.system<>system_aarch64_win64 then
                  gen_load_frame_for_exceptfilter(list)
                else
                  genloadframeforexcept:=true;
                localsize:=current_procinfo.maxpushedparasize;
              end;
          end;
        totalstackframesize:=localsize;
        { save modified integer registers }
        inc(totalstackframesize,
          save_regs(list,R_INTREGISTER,RS_X19,RS_X28,R_SUBWHOLE));
        { only the lower 64 bits of the modified vector registers need to be
          saved; if the caller needs the upper 64 bits, it has to save them
          itself }
        inc(totalstackframesize,
          save_regs(list,R_MMREGISTER,RS_D8,RS_D15,R_SUBMMD));
        { allocate stack space }
        if localsize<>0 then
          begin
            localsize:=align(localsize,16);
            current_procinfo.final_localsize:=localsize;
            g_stackpointer_alloc(list,localsize);
          end;
        { By default, we use the frame pointer to access parameters passed via
          the stack and the stack pointer to address local variables and temps
          because
          a) we can use bigger positive than negative offsets (so accessing
             locals via negative offsets from the frame pointer would be less
             efficient)
          b) we don't know the local size while generating the code, so
             accessing the parameters via the stack pointer is not possible
             without copying them
          The problem with this is the get_frame() intrinsic:
          a) it must return the same value as what we pass as the parentfp
             parameter, since that's how it's used in the TP-style objects unit
          b) its return value must be usable to access all local data from a
             routine (locals and parameters), since that's all the nested
             routines have access to
          c) its return value must be usable to construct a backtrace, as it's
             also used by the exception handling routines
          The solution we use here, based on something similar that's done in
          the MIPS port, is to generate all accesses to locals in the routine
          itself SP-relative, and then after the code is generated and the local
          size is known (namely, here), we change all SP-relative variables/
          parameters into FP-relative ones. This means that they'll be accessed
          less efficiently from nested routines, but those accesses are indirect
          anyway and at least this way they can be accessed at all
        }
        if current_procinfo.has_nestedprocs or
           (
             (target_info.system=system_aarch64_win64) and
             (current_procinfo.flags*[pi_has_implicit_finally,pi_needs_implicit_finally,pi_uses_exceptions]<>[])
           ) then
          begin
            current_procinfo.procdef.localst.SymList.ForEachCall(@FixupOffsets,@totalstackframesize);
            current_procinfo.procdef.parast.SymList.ForEachCall(@FixupOffsets,@totalstackframesize);
          end;
      end;
    if not (pi_has_unwind_info in current_procinfo.flags) then
      begin
        if genloadframeforexcept then
          gen_load_frame_for_exceptfilter(list);
        exit;
      end;
    { Generate unwind data for aarch64-win64 }
    seh_proc:=cai_seh_directive.create_name(ash_proc,current_procinfo.procdef.mangledname);
    if assigned(hitem) then
      list.insertafter(seh_proc,hitem)
    else
      list.insert(seh_proc);
    { the directive creates another section }
    inc(list.section_count);
    templist:=TAsmList.Create;
    if not suppress_endprologue then
      begin
        templist.concat(cai_seh_directive.create(ash_endprologue));
      end;
    if assigned(current_procinfo.endprologue_ai) then
      current_procinfo.aktproccode.insertlistafter(current_procinfo.endprologue_ai,templist)
    else
      list.concatlist(templist);
    templist.free;
    if genloadframeforexcept then
      gen_load_frame_for_exceptfilter(list);
  end;

procedure tcgaarch64.g_maybe_got_init(list : TAsmList);
  begin
    { nothing to do on Darwin or Linux }
  end;

procedure tcgaarch64.g_restore_registers(list:TAsmList);
  begin
    { done in g_proc_exit }
  end;

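{ Counterpart to save_regs: reloads the used callee-saved registers with
  post-indexed LDP/LDR. The pairing scan runs from the lowest register
  upwards, but the loads must be emitted in the opposite order of the
  stores, so on win64 the instructions are first collected in an array and
  then concatenated in reverse. }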
procedure tcgaarch64.load_regs(list: TAsmList; rt: tregistertype; lowsr, highsr: tsuperregister; sub: tsubregister);
  var
    ref: treference;
    sr, highestsetsr: tsuperregister;
    pairreg: tregister;
    i,
    regcount: longint;
    aiarr : array of tai;
  begin
    reference_reset_base(ref,NR_SP,16,ctempposinvalid,16,[]);
    ref.addressmode:=AM_POSTINDEXED;
    regcount:=0;
    { due to SEH on Win64 we can only load consecutive register pairs, and
      single registers are loaded using LDR, so we need to handle this
      differently there }
    if target_info.system=system_aarch64_win64 then
      begin
        setlength(aiarr,highsr-lowsr+1);
        pairreg:=NR_NO;
        for sr:=lowsr to highsr do
          if sr in rg[rt].used_in_proc then
            begin
              if pairreg=NR_NO then
                pairreg:=newreg(rt,sr,sub)
              else
                begin
                  if getsupreg(pairreg)=sr-1 then
                    begin
                      aiarr[regcount]:=taicpu.op_reg_reg_ref(A_LDP,pairreg,newreg(rt,sr,sub),ref);
                      inc(regcount);
                      pairreg:=NR_NO;
                    end
                  else
                    begin
                      aiarr[regcount]:=taicpu.op_reg_ref(A_LDR,pairreg,ref);
                      inc(regcount);
                      pairreg:=newreg(rt,sr,sub);
                    end;
                end;
            end;
        if pairreg<>NR_NO then
          begin
            aiarr[regcount]:=taicpu.op_reg_ref(A_LDR,pairreg,ref);
            inc(regcount);
            pairreg:=NR_NO;
          end;
        for i:=regcount-1 downto 0 do
          list.concat(aiarr[i]);
      end
    else
      begin
        { highest reg stored twice? }
        highestsetsr:=RS_NO;
        for sr:=lowsr to highsr do
          if sr in rg[rt].used_in_proc then
            begin
              inc(regcount);
              highestsetsr:=sr;
            end;
        if odd(regcount) then
          begin
            list.concat(taicpu.op_reg_ref(A_LDR,newreg(rt,highestsetsr,sub),ref));
            highestsetsr:=pred(highestsetsr);
          end;
        { load all (other) used registers pairwise }
        pairreg:=NR_NO;
        for sr:=highestsetsr downto lowsr do
          if sr in rg[rt].used_in_proc then
            if pairreg=NR_NO then
              pairreg:=newreg(rt,sr,sub)
            else
              begin
                list.concat(taicpu.op_reg_reg_ref(A_LDP,newreg(rt,sr,sub),pairreg,ref));
                pairreg:=NR_NO
              end;
      end;
    { There can't be any register left }
    if pairreg<>NR_NO then
      internalerror(2014112602);
  end;

procedure tcgaarch64.g_proc_exit(list : TAsmList;parasize:longint;nostackframe:boolean);
  var
    ref: treference;
    regsstored: boolean;
    sr: tsuperregister;
  begin
    if not(nostackframe) and
       { we do not need an exit stack frame when we never return
         * the final ret is left so the peephole optimizer can easily do call/ret -> jmp or call conversions
         * the entry stack frame must still be generated because the subroutine could be exited through
           an exception, in which case the unwinding code might need to restore the registers stored by the entry code
       }
       not(po_noreturn in current_procinfo.procdef.procoptions) then
      begin
        { if no registers have been stored, we don't have to subtract the
          allocated temp space from the stack pointer }
        regsstored:=false;
        for sr:=RS_X19 to RS_X28 do
          if sr in rg[R_INTREGISTER].used_in_proc then
            begin
              regsstored:=true;
              break;
            end;
        if not regsstored then
          for sr:=RS_D8 to RS_D15 do
            if sr in rg[R_MMREGISTER].used_in_proc then
              begin
                regsstored:=true;
                break;
              end;
        { restore registers (and stack pointer) }
        if regsstored then
          begin
            if current_procinfo.final_localsize<>0 then
              handle_reg_imm12_reg(list,A_ADD,OS_ADDR,NR_SP,current_procinfo.final_localsize,NR_SP,NR_IP0,false,true);
            load_regs(list,R_MMREGISTER,RS_D8,RS_D15,R_SUBMMD);
            load_regs(list,R_INTREGISTER,RS_X19,RS_X28,R_SUBWHOLE);
            { on Windows, also restore SP from FP even though the ADD should
              be enough, so that the exit sequence matches the entry sequence }
            if target_info.system=system_aarch64_win64 then
              a_load_reg_reg(list,OS_ADDR,OS_ADDR,NR_FP,NR_SP);
          end
        else if current_procinfo.final_localsize<>0 then
          begin
            { restore stack pointer }
            { Note: for Windows we need to restore the stack using an ADD
              and to set FP back to SP }
            if target_info.system=system_aarch64_win64 then
              begin
                handle_reg_imm12_reg(list,A_ADD,OS_ADDR,current_procinfo.framepointer,current_procinfo.final_localsize,
                  current_procinfo.framepointer,NR_IP0,false,true);
                if not (pi_no_framepointer_needed in current_procinfo.flags) then
                  a_load_reg_reg(list,OS_ADDR,OS_ADDR,NR_FP,NR_SP);
              end
            else if pi_no_framepointer_needed in current_procinfo.flags then
              handle_reg_imm12_reg(list,A_ADD,OS_ADDR,current_procinfo.framepointer,current_procinfo.final_localsize,
                current_procinfo.framepointer,NR_IP0,false,true)
            else
              a_load_reg_reg(list,OS_ADDR,OS_ADDR,NR_FP,NR_SP);
          end;
        if not(pi_no_framepointer_needed in current_procinfo.flags) then
          begin
            { restore framepointer and return address }
            reference_reset_base(ref,NR_SP,16,ctempposinvalid,16,[]);
            ref.addressmode:=AM_POSTINDEXED;
            list.concat(taicpu.op_reg_reg_ref(A_LDP,NR_FP,NR_LR,ref));
          end;
      end;
    { return }
    list.concat(taicpu.op_none(A_RET));
    if (pi_has_unwind_info in current_procinfo.flags) then
      begin
        tcpuprocinfo(current_procinfo).dump_scopes(list);
        list.concat(cai_seh_directive.create(ash_endproc));
      end;
  end;

procedure tcgaarch64.g_save_registers(list : TAsmList);
  begin
    { done in g_proc_entry }
  end;

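{ Block copy of len bytes from source to dest. Depending on length and
  alignment this uses a single load/store, an unrolled sequence of
  (paired) loads and stores, a counted copy loop, or a call to FPC_MOVE
  for large copies in routines that already contain calls. }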
procedure tcgaarch64.g_concatcopy(list: TAsmList; const source, dest: treference; len: tcgint);
  var
    sourcebasereplaced, destbasereplaced: boolean;

  { get optimal memory operation to use for loading/storing data
    in an unrolled loop }
  procedure getmemop(scaledop, unscaledop: tasmop; const startref, endref: treference; opsize: tcgsize; postfix: toppostfix; out memop: tasmop; out needsimplify: boolean);
    begin
      if (simple_ref_type(scaledop,opsize,postfix,startref)=sr_simple) and
         (simple_ref_type(scaledop,opsize,postfix,endref)=sr_simple) then
        begin
          memop:=scaledop;
          needsimplify:=false;
        end
      else if (unscaledop<>A_NONE) and
         (simple_ref_type(unscaledop,opsize,postfix,startref)=sr_simple) and
         (simple_ref_type(unscaledop,opsize,postfix,endref)=sr_simple) then
        begin
          memop:=unscaledop;
          needsimplify:=false;
        end
      else
        begin
          memop:=scaledop;
          needsimplify:=true;
        end;
    end;

  { adjust the offset and/or addressing mode after a load/store so it's
    correct for the next one of the same size }
  procedure updaterefafterloadstore(var ref: treference; oplen: longint);
    begin
      case ref.addressmode of
        AM_OFFSET:
          inc(ref.offset,oplen);
        AM_POSTINDEXED:
          { base register updated by instruction, next offset can remain
            the same }
          ;
        AM_PREINDEXED:
          begin
            { base register updated by instruction -> next instruction can
              use post-indexing with offset = sizeof(operation) }
            ref.offset:=0;
            ref.addressmode:=AM_OFFSET;
          end;
      end;
    end;

  { generate a load/store and adjust the reference offset to the next
    memory location if necessary }
  procedure genloadstore(list: TAsmList; op: tasmop; reg: tregister; var ref: treference; postfix: toppostfix; opsize: tcgsize);
    begin
      list.concat(setoppostfix(taicpu.op_reg_ref(op,reg,ref),postfix));
      updaterefafterloadstore(ref,tcgsize2size[opsize]);
    end;

  { generate a dual load/store (ldp/stp) and adjust the reference offset to
    the next memory location if necessary }
  procedure gendualloadstore(list: TAsmList; op: tasmop; reg1, reg2: tregister; var ref: treference; postfix: toppostfix; opsize: tcgsize);
    begin
      list.concat(setoppostfix(taicpu.op_reg_reg_ref(op,reg1,reg2,ref),postfix));
      updaterefafterloadstore(ref,tcgsize2size[opsize]*2);
    end;

  { turn a reference into a pre- or post-indexed reference for use in a
    load/store of a particular size }
  procedure makesimpleforcopy(list: TAsmList; var scaledop: tasmop; opsize: tcgsize; postfix: toppostfix; forcepostindexing: boolean; var ref: treference; var basereplaced: boolean);
    var
      tmpreg: tregister;
      scaledoffset: longint;
      orgaddressmode: taddressmode;
    begin
      scaledoffset:=tcgsize2size[opsize];
      if scaledop in [A_LDP,A_STP] then
        scaledoffset:=scaledoffset*2;
      { can we use the reference as post-indexed without changes? }
      if forcepostindexing then
        begin
          orgaddressmode:=ref.addressmode;
          ref.addressmode:=AM_POSTINDEXED;
          if (orgaddressmode=AM_POSTINDEXED) or
             ((ref.offset=0) and
              (simple_ref_type(scaledop,opsize,postfix,ref)=sr_simple)) then
            begin
              { just change the post-indexed offset to the access size }
              ref.offset:=scaledoffset;
              { and replace the base register if that didn't happen yet
                (could be sp or a regvar) }
              if not basereplaced then
                begin
                  tmpreg:=getaddressregister(list);
                  a_load_reg_reg(list,OS_ADDR,OS_ADDR,ref.base,tmpreg);
                  ref.base:=tmpreg;
                  basereplaced:=true;
                end;
              exit;
            end;
          ref.addressmode:=orgaddressmode;
        end;
{$ifdef dummy}
      This could in theory be useful in case you have a concatcopy from
      e.g. x1+255 to x1+267 *and* the reference is aligned, but this seems
      very unlikely. Disabled because it still needs fixes, as it
      also generates pre-indexed loads right now at the very end for the
      left-over gencopies

      { can we turn it into a pre-indexed reference for free? (after the
        first operation, it will be turned into an offset one) }
      if not forcepostindexing and
         (ref.offset<>0) then
        begin
          orgaddressmode:=ref.addressmode;
          ref.addressmode:=AM_PREINDEXED;
          tmpreg:=ref.base;
          if not basereplaced and
             (ref.base=tmpreg) then
            begin
              tmpreg:=getaddressregister(list);
              a_load_reg_reg(list,OS_ADDR,OS_ADDR,ref.base,tmpreg);
              ref.base:=tmpreg;
              basereplaced:=true;
            end;
          if simple_ref_type(scaledop,opsize,postfix,ref)<>sr_simple then
            make_simple_ref(list,scaledop,opsize,postfix,ref,NR_NO);
          exit;
        end;
{$endif dummy}
      if not forcepostindexing then
        begin
          ref.addressmode:=AM_OFFSET;
          make_simple_ref(list,scaledop,opsize,postfix,ref,NR_NO);
          { this may still cause problems if the final offset is no longer
            a simple ref; it's a bit complicated to pass all information
            through at all places and check that here, so play safe: we
            currently never generate unrolled copies for more than 64
            bytes (32 with non-double-register copies) }
          if ref.index=NR_NO then
            begin
              if ((scaledop in [A_LDP,A_STP]) and
                  (ref.offset<((64-8)*tcgsize2size[opsize]))) or
                 ((scaledop in [A_LDUR,A_STUR]) and
                  (ref.offset<(255-8*tcgsize2size[opsize]))) or
                 ((scaledop in [A_LDR,A_STR]) and
                  (ref.offset<((4096-8)*tcgsize2size[opsize]))) then
                exit;
            end;
        end;
      tmpreg:=getaddressregister(list);
      a_loadaddr_ref_reg(list,ref,tmpreg);
      basereplaced:=true;
      if forcepostindexing then
        begin
          reference_reset_base(ref,tmpreg,scaledoffset,ref.temppos,ref.alignment,ref.volatility);
          ref.addressmode:=AM_POSTINDEXED;
        end
      else
        begin
          reference_reset_base(ref,tmpreg,0,ref.temppos,ref.alignment,ref.volatility);
          ref.addressmode:=AM_OFFSET;
        end
    end;

  { prepare a reference for use by gencopy. This is done both after the
    unrolled and regular copy loop -> get rid of post-indexing mode, make
    sure ref is valid }
  procedure preparecopy(list: tasmlist; scaledop, unscaledop: tasmop; var ref: treference; opsize: tcgsize; postfix: toppostfix; out op: tasmop; var basereplaced: boolean);
    var
      simplify: boolean;
    begin
      if ref.addressmode=AM_POSTINDEXED then
        ref.offset:=tcgsize2size[opsize];
      getmemop(scaledop,scaledop,ref,ref,opsize,postfix,op,simplify);
      if simplify then
        begin
          makesimpleforcopy(list,scaledop,opsize,postfix,false,ref,basereplaced);
          op:=scaledop;
        end;
    end;

  { generate a copy from source to dest of size opsize/postfix }
  procedure gencopy(list: TAsmList; var source, dest: treference; postfix: toppostfix; opsize: tcgsize);
    var
      reg: tregister;
      loadop, storeop: tasmop;
    begin
      preparecopy(list,A_LDR,A_LDUR,source,opsize,postfix,loadop,sourcebasereplaced);
      preparecopy(list,A_STR,A_STUR,dest,opsize,postfix,storeop,destbasereplaced);
      reg:=getintregister(list,opsize);
      genloadstore(list,loadop,reg,source,postfix,opsize);
      genloadstore(list,storeop,reg,dest,postfix,opsize);
    end;

  { copy the leftovers after an unrolled or regular copy loop }
  procedure gencopyleftovers(list: TAsmList; var source, dest: treference; len: longint);
    begin
      { stop post-indexing if we did so in the loop, since in that case all
        offsets definitely can be represented now }
      if source.addressmode=AM_POSTINDEXED then
        begin
          source.addressmode:=AM_OFFSET;
          source.offset:=0;
        end;
      if dest.addressmode=AM_POSTINDEXED then
        begin
          dest.addressmode:=AM_OFFSET;
          dest.offset:=0;
        end;
      { transfer the leftovers }
      if len>=8 then
        begin
          dec(len,8);
          gencopy(list,source,dest,PF_NONE,OS_64);
        end;
      if len>=4 then
        begin
          dec(len,4);
          gencopy(list,source,dest,PF_NONE,OS_32);
        end;
      if len>=2 then
        begin
          dec(len,2);
          gencopy(list,source,dest,PF_H,OS_16);
        end;
      if len>=1 then
        begin
          dec(len);
          gencopy(list,source,dest,PF_B,OS_8);
        end;
    end;

  const
    { load_length + loop dec + cbnz }
    loopoverhead=12;
    { loop overhead + load + store }
    totallooplen=loopoverhead + 8;
  var
    totalalign: longint;
    maxlenunrolled: tcgint;
    loadop, storeop: tasmop;
    opsize: tcgsize;
    postfix: toppostfix;
    tmpsource, tmpdest: treference;
    scaledstoreop, unscaledstoreop,
    scaledloadop, unscaledloadop: tasmop;
    regs: array[1..8] of tregister;
    countreg: tregister;
    i, regcount: longint;
    hl: tasmlabel;
    simplifysource, simplifydest: boolean;
  begin
    if len=0 then
      exit;
    sourcebasereplaced:=false;
    destbasereplaced:=false;
    { maximum common alignment }
    totalalign:=max(1,newalignment(source.alignment,dest.alignment));
    { use a simple load/store? }
    if (len in [1,2,4,8]) and
       ((totalalign>=(len div 2)) or
        (source.alignment=len) or
        (dest.alignment=len)) then
      begin
        opsize:=int_cgsize(len);
        a_load_ref_ref(list,opsize,opsize,source,dest);
        exit;
      end;
    { alignment > length is not useful, and would break some checks below }
    while totalalign>len do
      totalalign:=totalalign div 2;
    { operation sizes to use based on common alignment }
    case totalalign of
      1:
        begin
          postfix:=PF_B;
          opsize:=OS_8;
        end;
      2:
        begin
          postfix:=PF_H;
          opsize:=OS_16;
        end;
      4:
        begin
          postfix:=PF_None;
          opsize:=OS_32;
        end
      else
        begin
          totalalign:=8;
          postfix:=PF_None;
          opsize:=OS_64;
        end;
    end;
    { maximum length to be handled with an unrolled loop (4 loads + 4 stores) }
    maxlenunrolled:=min(totalalign,8)*4;
    { ldp/stp -> 2 registers per instruction }
    if (totalalign>=4) and
       (len>=totalalign*2) then
      begin
        maxlenunrolled:=maxlenunrolled*2;
        scaledstoreop:=A_STP;
        scaledloadop:=A_LDP;
        unscaledstoreop:=A_NONE;
        unscaledloadop:=A_NONE;
      end
    else
      begin
        scaledstoreop:=A_STR;
        scaledloadop:=A_LDR;
        unscaledstoreop:=A_STUR;
        unscaledloadop:=A_LDUR;
      end;
    { we only need 4 instructions extra to call FPC_MOVE }
    if cs_opt_size in current_settings.optimizerswitches then
      maxlenunrolled:=maxlenunrolled div 2;
    if (len>maxlenunrolled) and
       (len>totalalign*8) and
       (pi_do_call in current_procinfo.flags) then
      begin
        g_concatcopy_move(list,source,dest,len);
        exit;
      end;
    simplifysource:=true;
    simplifydest:=true;
    tmpsource:=source;
    tmpdest:=dest;
    { can we directly encode all offsets in an unrolled loop? }
    if len<=maxlenunrolled then
      begin
{$ifdef extdebug}
        list.concat(tai_comment.Create(strpnew('concatcopy unrolled loop; len/opsize/align: '+tostr(len)+'/'+tostr(tcgsize2size[opsize])+'/'+tostr(totalalign))));
{$endif extdebug}
        { the leftovers will be handled separately -> -(len mod opsize) }
        inc(tmpsource.offset,len-(len mod tcgsize2size[opsize]));
        { additionally, the last regular load/store will be at
          offset+len-opsize (if len-(len mod opsize)>0) }
        if tmpsource.offset>source.offset then
          dec(tmpsource.offset,tcgsize2size[opsize]);
        getmemop(scaledloadop,unscaledloadop,source,tmpsource,opsize,postfix,loadop,simplifysource);
        inc(tmpdest.offset,len-(len mod tcgsize2size[opsize]));
        if tmpdest.offset>dest.offset then
          dec(tmpdest.offset,tcgsize2size[opsize]);
        getmemop(scaledstoreop,unscaledstoreop,dest,tmpdest,opsize,postfix,storeop,simplifydest);
        tmpsource:=source;
        tmpdest:=dest;
        { if we can't directly encode all offsets, simplify }
        if simplifysource then
          begin
            loadop:=scaledloadop;
            makesimpleforcopy(list,loadop,opsize,postfix,false,tmpsource,sourcebasereplaced);
          end;
        if simplifydest then
          begin
            storeop:=scaledstoreop;
            makesimpleforcopy(list,storeop,opsize,postfix,false,tmpdest,destbasereplaced);
          end;
        regcount:=len div tcgsize2size[opsize];
        { in case we transfer two registers at a time, we copy an even
          number of registers }
        if loadop=A_LDP then
          regcount:=regcount and not(1);
        { initialise for dfa }
        regs[low(regs)]:=NR_NO;
        { max 4 loads/stores -> max 8 registers (in case of ldp/stp) }
        for i:=1 to regcount do
          regs[i]:=getintregister(list,opsize);
        if loadop=A_LDP then
          begin
            { load registers }
            for i:=1 to (regcount div 2) do
              gendualloadstore(list,loadop,regs[i*2-1],regs[i*2],tmpsource,postfix,opsize);
            { store registers }
            for i:=1 to (regcount div 2) do
              gendualloadstore(list,storeop,regs[i*2-1],regs[i*2],tmpdest,postfix,opsize);
          end
        else
          begin
            for i:=1 to regcount do
              genloadstore(list,loadop,regs[i],tmpsource,postfix,opsize);
            for i:=1 to regcount do
              genloadstore(list,storeop,regs[i],tmpdest,postfix,opsize);
          end;
        { leftover }
        len:=len-regcount*tcgsize2size[opsize];
{$ifdef extdebug}
        list.concat(tai_comment.Create(strpnew('concatcopy unrolled loop leftover: '+tostr(len))));
{$endif extdebug}
      end
    else
      begin
{$ifdef extdebug}
        list.concat(tai_comment.Create(strpnew('concatcopy regular loop; len/align: '+tostr(len)+'/'+tostr(totalalign))));
{$endif extdebug}
        { regular loop -> definitely use post-indexing }
        loadop:=scaledloadop;
        makesimpleforcopy(list,loadop,opsize,postfix,true,tmpsource,sourcebasereplaced);
        storeop:=scaledstoreop;
        makesimpleforcopy(list,storeop,opsize,postfix,true,tmpdest,destbasereplaced);
        current_asmdata.getjumplabel(hl);
        countreg:=getintregister(list,OS_32);
        if loadop=A_LDP then
          a_load_const_reg(list,OS_32,len div (tcgsize2size[opsize]*2),countreg)
        else
          a_load_const_reg(list,OS_32,len div tcgsize2size[opsize],countreg);
        a_label(list,hl);
        a_op_const_reg(list,OP_SUB,OS_32,1,countreg);
        if loadop=A_LDP then
          begin
            regs[1]:=getintregister(list,opsize);
            regs[2]:=getintregister(list,opsize);
            gendualloadstore(list,loadop,regs[1],regs[2],tmpsource,postfix,opsize);
            gendualloadstore(list,storeop,regs[1],regs[2],tmpdest,postfix,opsize);
          end
        else
          begin
            regs[1]:=getintregister(list,opsize);
            genloadstore(list,loadop,regs[1],tmpsource,postfix,opsize);
            genloadstore(list,storeop,regs[1],tmpdest,postfix,opsize);
          end;
        list.concat(taicpu.op_reg_sym_ofs(A_CBNZ,countreg,hl,0));
        if loadop=A_LDP then
          len:=len mod (tcgsize2size[opsize]*2)
        else
          len:=len mod tcgsize2size[opsize];
      end;
    gencopyleftovers(list,tmpsource,tmpdest,len);
  end;

procedure tcgaarch64.g_adjust_self_value(list:TAsmList;procdef: tprocdef;ioffset: tcgint);
  begin
    { This method is integrated into g_intf_wrapper and shouldn't be called separately }
    InternalError(2013020102);
  end;

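{ Emits an FPU exception check: reads FPSR and calls FPC_THROWFPUEXCEPTION
  if any of the cumulative exception bits (IOC/DZC/OFC/UFC/IXC in the low
  five bits, or IDC in bit 7) is set. }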
procedure tcgaarch64.g_check_for_fpu_exception(list: TAsmList;force,clear : boolean);
  var
    r, tmpreg: TRegister;
    ai: taicpu;
    l1,l2: TAsmLabel;
  begin
    { so far, we assume all flavours of AArch64 need explicit floating point exception checking }
    if ((cs_check_fpu_exceptions in current_settings.localswitches) and
        (force or current_procinfo.FPUExceptionCheckNeeded)) then
      begin
        r:=getintregister(list,OS_INT);
        tmpreg:=getintregister(list,OS_INT);
        list.concat(taicpu.op_reg_reg(A_MRS,r,NR_FPSR));
        list.concat(taicpu.op_reg_reg_const(A_AND,tmpreg,r,$1f));
        current_asmdata.getjumplabel(l1);
        current_asmdata.getjumplabel(l2);
        ai:=taicpu.op_reg_sym_ofs(A_CBNZ,tmpreg,l1,0);
        ai.is_jmp:=true;
        list.concat(ai);
        list.concat(taicpu.op_reg_reg_const(A_AND,tmpreg,r,$80));
        ai:=taicpu.op_reg_sym_ofs(A_CBZ,tmpreg,l2,0);
        ai.is_jmp:=true;
        list.concat(ai);
        a_label(list,l1);
        alloccpuregisters(list,R_INTREGISTER,paramanager.get_volatile_registers_int(pocall_default));
        cg.a_call_name(list,'FPC_THROWFPUEXCEPTION',false);
        dealloccpuregisters(list,R_INTREGISTER,paramanager.get_volatile_registers_int(pocall_default));
        a_label(list,l2);
        if clear then
          current_procinfo.FPUExceptionCheckNeeded:=false;
      end;
  end;

procedure tcgaarch64.g_profilecode(list : TAsmList);
  begin
    if target_info.system=system_aarch64_linux then
      begin
        list.concat(taicpu.op_reg_reg(A_MOV,NR_X0,NR_X30));
        a_call_name(list,'_mcount',false);
      end
    else
      internalerror(2020021901);
  end;

procedure create_codegen;
  begin
    cg:=tcgaarch64.Create;
    cg128:=tcg128.Create;
  end;

end.