  1. {
  2. Copyright (c) 2014 by Jonas Maebe
  3. This unit implements the code generator for AArch64
  4. This program is free software; you can redistribute it and/or modify
  5. it under the terms of the GNU General Public License as published by
  6. the Free Software Foundation; either version 2 of the License, or
  7. (at your option) any later version.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. GNU General Public License for more details.
  12. You should have received a copy of the GNU General Public License
  13. along with this program; if not, write to the Free Software
  14. Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  15. ****************************************************************************
  16. }
  17. unit cgcpu;
  18. {$i fpcdefs.inc}
  19. interface
  20. uses
  21. globtype,parabase,
  22. cgbase,cgutils,cgobj,
  23. aasmbase,aasmtai,aasmdata,aasmcpu,
  24. cpubase,cpuinfo,
  25. node,symconst,symtype,symdef,
  26. rgcpu;
  27. type
  28. tcgaarch64=class(tcg)
  29. protected
  30. { changes register size without adding register allocation info }
  31. function makeregsize(reg: tregister; size: tcgsize): tregister; overload;
  32. public
  33. { simplifies "ref" so it can be used with "op". If "ref" can be used
  34. with a different load/store operation that has the same meaning as the
  35. original one, "op" will be replaced with the alternative }
  36. procedure make_simple_ref(list:TAsmList; var op: tasmop; size: tcgsize; oppostfix: toppostfix; var ref: treference; preferred_newbasereg: tregister);
  37. function getfpuregister(list: TAsmList; size: Tcgsize): Tregister; override;
  38. procedure handle_reg_imm12_reg(list: TAsmList; op: Tasmop; size: tcgsize; src: tregister; a: tcgint; dst: tregister; tmpreg: tregister; setflags, usedest: boolean);
  39. procedure init_register_allocators;override;
  40. procedure done_register_allocators;override;
  41. function getmmregister(list:TAsmList;size:tcgsize):tregister;override;
  42. function handle_load_store(list:TAsmList; op: tasmop; size: tcgsize; oppostfix: toppostfix; reg: tregister; ref: treference):treference;
  43. procedure a_call_name(list:TAsmList;const s:string; weak: boolean);override;
  44. procedure a_call_reg(list:TAsmList;Reg:tregister);override;
  45. { General purpose instructions }
  46. procedure maybeadjustresult(list: TAsmList; op: topcg; size: tcgsize; dst: tregister);
  47. procedure a_op_const_reg(list: TAsmList; op: topcg; size: tcgsize; a: tcgint; reg: tregister);override;
  48. procedure a_op_reg_reg(list: TAsmList; op: topcg; size: tcgsize; src, dst: tregister);override;
  49. procedure a_op_const_reg_reg(list: TAsmList; op: topcg; size: tcgsize; a: tcgint; src, dst: tregister);override;
  50. procedure a_op_reg_reg_reg(list: TAsmList; op: topcg; size: tcgsize; src1, src2, dst: tregister);override;
  51. procedure a_op_const_reg_reg_checkoverflow(list: TAsmList; op: topcg; size: tcgsize; a: tcgint; src, dst: tregister; setflags : boolean; var ovloc : tlocation);override;
  52. procedure a_op_reg_reg_reg_checkoverflow(list: TAsmList; op: topcg; size: tcgsize; src1, src2, dst: tregister; setflags : boolean; var ovloc : tlocation);override;
  53. { move instructions }
  54. procedure a_load_const_reg(list: TAsmList; size: tcgsize; a: tcgint; reg: tregister);override;
  55. procedure a_load_const_ref(list: TAsmList; size: tcgsize; a: tcgint; const ref: treference); override;
  56. procedure a_load_reg_ref(list: TAsmList; fromsize, tosize: tcgsize; reg: tregister;const ref: TReference);override;
  57. procedure a_load_reg_ref_unaligned(list: TAsmList; fromsize, tosize: tcgsize; register: tregister; const ref: treference); override;
  58. procedure a_load_ref_reg(list: TAsmList; fromsize, tosize: tcgsize; const ref: TReference; reg: tregister);override;
  59. procedure a_load_ref_reg_unaligned(list: TAsmList; fromsize, tosize: tcgsize; const ref: treference; register: tregister); override;
  60. procedure a_load_reg_reg(list: TAsmList; fromsize, tosize: tcgsize; reg1, reg2: tregister);override;
  61. procedure a_loadaddr_ref_reg(list: TAsmList; const ref: TReference; r: tregister);override;
  62. { fpu move instructions (not used, all floating point is vector unit-based) }
  63. procedure a_loadfpu_reg_reg(list: TAsmList; fromsize, tosize: tcgsize; reg1, reg2: tregister); override;
  64. procedure a_loadfpu_ref_reg(list: TAsmList; fromsize, tosize: tcgsize; const ref: treference; reg: tregister); override;
  65. procedure a_loadfpu_reg_ref(list: TAsmList; fromsize, tosize: tcgsize; reg: tregister; const ref: treference); override;
  66. procedure a_loadmm_reg_reg(list: TAsmList; fromsize, tosize: tcgsize; reg1, reg2: tregister;shuffle : pmmshuffle);override;
  67. procedure a_loadmm_ref_reg(list: TAsmList; fromsize, tosize: tcgsize; const ref: TReference; reg: tregister; shuffle: pmmshuffle);override;
  68. procedure a_loadmm_reg_ref(list: TAsmList; fromsize, tosize: tcgsize; reg: tregister; const ref: TReference; shuffle: pmmshuffle);override;
  69. procedure a_loadmm_intreg_reg(list: TAsmList; fromsize, tosize: tcgsize; intreg, mmreg: tregister; shuffle: pmmshuffle); override;
  70. procedure a_loadmm_reg_intreg(list: TAsmList; fromsize, tosize: tcgsize; mmreg, intreg: tregister; shuffle: pmmshuffle); override;
  71. procedure a_opmm_reg_reg(list: TAsmList; Op: TOpCG; size: tcgsize; src, dst: tregister; shuffle: pmmshuffle); override;
  72. procedure a_bit_scan_reg_reg(list: TAsmList; reverse: boolean; srcsize, dstsize: tcgsize; src, dst: TRegister); override;
  73. { comparison operations }
  74. procedure a_cmp_const_reg_label(list: TAsmList; size: tcgsize; cmp_op: topcmp; a: tcgint; reg: tregister; l: tasmlabel);override;
  75. procedure a_cmp_reg_reg_label(list: TAsmList; size: tcgsize; cmp_op: topcmp; reg1, reg2: tregister; l: tasmlabel);override;
  76. procedure a_jmp_always(list: TAsmList; l: TAsmLabel);override;
  77. procedure a_jmp_name(list: TAsmList; const s: string);override;
  78. procedure a_jmp_cond(list: TAsmList; cond: TOpCmp; l: tasmlabel);{ override;}
  79. procedure a_jmp_flags(list: TAsmList; const f: tresflags; l: tasmlabel);override;
  80. procedure g_flags2reg(list: TAsmList; size: tcgsize; const f:tresflags; reg: tregister);override;
  81. procedure g_overflowcheck(list: TAsmList; const loc: tlocation; def: tdef);override;
  82. procedure g_overflowcheck_loc(list: TAsmList; const loc: tlocation; def: tdef; ovloc: tlocation);override;
  83. procedure g_proc_entry(list: TAsmList; localsize: longint; nostackframe: boolean);override;
  84. procedure g_proc_exit(list: TAsmList; parasize: longint; nostackframe: boolean);override;
  85. procedure g_maybe_got_init(list: TAsmList); override;
  86. procedure g_restore_registers(list: TAsmList);override;
  87. procedure g_save_registers(list: TAsmList);override;
  88. procedure g_concatcopy_move(list: TAsmList; const source, dest: treference; len: tcgint);
  89. procedure g_concatcopy(list: TAsmList; const source, dest: treference; len: tcgint);override;
  90. procedure g_adjust_self_value(list: TAsmList; procdef: tprocdef; ioffset: tcgint);override;
  91. procedure g_check_for_fpu_exception(list: TAsmList; force, clear: boolean);override;
  92. private
  93. function save_regs(list: TAsmList; rt: tregistertype; lowsr, highsr: tsuperregister; sub: tsubregister): longint;
  94. procedure load_regs(list: TAsmList; rt: tregistertype; lowsr, highsr: tsuperregister; sub: tsubregister);
  95. end;
  96. procedure create_codegen;
  97. const
  98. TOpCG2AsmOpReg: array[topcg] of TAsmOp = (
  99. A_NONE,A_MOV,A_ADD,A_AND,A_UDIV,A_SDIV,A_MUL,A_MUL,A_NEG,A_MVN,A_ORR,A_ASRV,A_LSLV,A_LSRV,A_SUB,A_EOR,A_NONE,A_RORV
  100. );
  101. TOpCG2AsmOpImm: array[topcg] of TAsmOp = (
  102. A_NONE,A_MOV,A_ADD,A_AND,A_UDIV,A_SDIV,A_MUL,A_MUL,A_NEG,A_MVN,A_ORR,A_ASR,A_LSL,A_LSR,A_SUB,A_EOR,A_NONE,A_ROR
  103. );
  104. TOpCmp2AsmCond: array[topcmp] of TAsmCond = (C_NONE,C_EQ,C_GT,
  105. C_LT,C_GE,C_LE,C_NE,C_LS,C_CC,C_CS,C_HI
  106. );
  107. implementation
  108. uses
  109. globals,verbose,systems,cutils,
  110. paramgr,fmodule,
  111. symtable,symsym,
  112. tgobj,
  113. procinfo,cpupi;
  114. procedure tcgaarch64.make_simple_ref(list:TAsmList; var op: tasmop; size: tcgsize; oppostfix: toppostfix; var ref: treference; preferred_newbasereg: tregister);
  115. var
  116. href: treference;
  117. so: tshifterop;
  118. accesssize: longint;
  119. begin
  120. if (ref.base=NR_NO) then
  121. begin
  122. if ref.shiftmode<>SM_None then
  123. internalerror(2014110701);
  124. ref.base:=ref.index;
  125. ref.index:=NR_NO;
  126. end;
  127. { no arbitrary scale factor support (the generic code doesn't set it,
  128. AArch64-specific code shouldn't either) }
  129. if not(ref.scalefactor in [0,1]) then
  130. internalerror(2014111002);
  131. case simple_ref_type(op,size,oppostfix,ref) of
  132. sr_simple:
  133. exit;
  134. sr_internal_illegal:
  135. internalerror(2014121702);
  136. sr_complex:
  137. { continue } ;
  138. end;
  139. if assigned(ref.symbol) then
  140. begin
  141. { internal "load symbol" instructions should already be valid }
  142. if assigned(ref.symboldata) or
  143. (ref.refaddr in [addr_pic,addr_gotpage,addr_gotpageoffset,addr_page,addr_pageoffset]) then
  144. internalerror(2014110802);
  145. { no relative symbol support (not needed yet) }
  146. if assigned(ref.relsymbol) then
  147. internalerror(2014111001);
  148. { loading a symbol address (whether it's in the GOT or not) consists
  149. of two parts: first load the page on which it is located, then
  150. either the offset in the page or load the value at that offset in
  151. the page. This final GOT-load can be relaxed by the linker in case
  152. the variable itself can be stored directly in the GOT }
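        { illustrative sketch (assembler syntax and register/symbol names
          are examples only): for a global variable "gvar" accessed via the
          GOT this typically expands to
              adrp  x16, :got:gvar
              ldr   x16, [x16, :got_lo12:gvar]
          while a locally defined symbol uses the page offset directly:
              adrp  x16, gvar
              add   x16, x16, #:lo12:gvar }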
  153. if (preferred_newbasereg=NR_NO) or
  154. (ref.base=preferred_newbasereg) or
  155. (ref.index=preferred_newbasereg) then
  156. preferred_newbasereg:=getaddressregister(list);
  157. { load the (GOT) page }
  158. reference_reset_symbol(href,ref.symbol,0,8,[]);
  159. if ((ref.symbol.typ in [AT_FUNCTION,AT_LABEL]) and
  160. (ref.symbol.bind in [AB_LOCAL,AB_GLOBAL])) or
  161. ((ref.symbol.typ=AT_DATA) and
  162. (ref.symbol.bind=AB_LOCAL)) then
  163. href.refaddr:=addr_page
  164. else
  165. href.refaddr:=addr_gotpage;
  166. list.concat(taicpu.op_reg_ref(A_ADRP,preferred_newbasereg,href));
  167. { load the GOT entry (= address of the variable) }
  168. reference_reset_base(href,preferred_newbasereg,0,ctempposinvalid,sizeof(pint),[]);
  169. href.symbol:=ref.symbol;
  170. { code symbols defined in the current compilation unit do not
  171. have to be accessed via the GOT }
  172. if ((ref.symbol.typ in [AT_FUNCTION,AT_LABEL]) and
  173. (ref.symbol.bind in [AB_LOCAL,AB_GLOBAL])) or
  174. ((ref.symbol.typ=AT_DATA) and
  175. (ref.symbol.bind=AB_LOCAL)) then
  176. begin
  177. href.base:=NR_NO;
  178. href.refaddr:=addr_pageoffset;
  179. list.concat(taicpu.op_reg_reg_ref(A_ADD,preferred_newbasereg,preferred_newbasereg,href));
  180. end
  181. else
  182. begin
  183. href.refaddr:=addr_gotpageoffset;
  184. { use a_load_ref_reg() rather than directly encoding the LDR,
  185. so that we'll check the validity of the reference }
  186. a_load_ref_reg(list,OS_ADDR,OS_ADDR,href,preferred_newbasereg);
  187. end;
  188. { set as new base register }
  189. if ref.base=NR_NO then
  190. ref.base:=preferred_newbasereg
  191. else if ref.index=NR_NO then
  192. ref.index:=preferred_newbasereg
  193. else
  194. begin
  195. { make sure it's valid in case ref.base is SP -> make it
  196. the second operand}
  197. a_op_reg_reg_reg(list,OP_ADD,OS_ADDR,preferred_newbasereg,ref.base,preferred_newbasereg);
  198. ref.base:=preferred_newbasereg
  199. end;
  200. ref.symbol:=nil;
  201. end;
  202. { base & index }
  203. if (ref.base<>NR_NO) and
  204. (ref.index<>NR_NO) then
  205. begin
  206. case op of
  207. A_LDR, A_STR:
  208. begin
  209. if (ref.shiftmode=SM_None) and
  210. (ref.shiftimm<>0) then
  211. internalerror(2014110805);
  212. { wrong shift? (possible in case of something like
  213. array_of_2byte_rec[x].bytefield -> shift will be set to 1, but
  214. the final load is only 1 byte -> can't use the shift after all) }
  215. if (ref.shiftmode in [SM_LSL,SM_UXTW,SM_SXTW]) and
  216. ((ref.shiftimm<>BsfDWord(tcgsizep2size[size])) or
  217. (ref.offset<>0)) then
  218. begin
  219. if preferred_newbasereg=NR_NO then
  220. preferred_newbasereg:=getaddressregister(list);
  221. { "add" supports a superset of the shift modes supported by
  222. load/store instructions }
  223. shifterop_reset(so);
  224. so.shiftmode:=ref.shiftmode;
  225. so.shiftimm:=ref.shiftimm;
  226. list.concat(taicpu.op_reg_reg_reg_shifterop(A_ADD,preferred_newbasereg,ref.base,ref.index,so));
  227. reference_reset_base(ref,preferred_newbasereg,ref.offset,ref.temppos,ref.alignment,ref.volatility);
  228. { possibly still an invalid offset -> fall through }
  229. end
  230. else if ref.offset<>0 then
  231. begin
  232. if (preferred_newbasereg=NR_NO) or
  233. { we keep ref.index, so it must not be overwritten }
  234. (ref.index=preferred_newbasereg) then
  235. preferred_newbasereg:=getaddressregister(list);
  236. { add to the base and not to the index, because the index
  237. may be scaled; this works even if the base is SP }
  238. a_op_const_reg_reg(list,OP_ADD,OS_ADDR,ref.offset,ref.base,preferred_newbasereg);
  239. ref.offset:=0;
  240. ref.base:=preferred_newbasereg;
  241. { finished }
  242. exit;
  243. end
  244. else
  245. { valid -> exit }
  246. exit;
  247. end;
  248. { todo }
  249. A_LD1,A_LD2,A_LD3,A_LD4,
  250. A_ST1,A_ST2,A_ST3,A_ST4:
  251. internalerror(2014110704);
  252. { these don't support base+index }
  253. A_LDUR,A_STUR,
  254. A_LDP,A_STP:
  255. begin
  256. { these either don't support pre-/post-indexing, or don't
  257. support it with base+index }
  258. if ref.addressmode<>AM_OFFSET then
  259. internalerror(2014110911);
  260. if preferred_newbasereg=NR_NO then
  261. preferred_newbasereg:=getaddressregister(list);
  262. if ref.shiftmode<>SM_None then
  263. begin
  264. { "add" supports a superset of the shift modes supported by
  265. load/store instructions }
  266. shifterop_reset(so);
  267. so.shiftmode:=ref.shiftmode;
  268. so.shiftimm:=ref.shiftimm;
  269. list.concat(taicpu.op_reg_reg_reg_shifterop(A_ADD,preferred_newbasereg,ref.base,ref.index,so));
  270. end
  271. else
  272. a_op_reg_reg_reg(list,OP_ADD,OS_ADDR,ref.index,ref.base,preferred_newbasereg);
  273. reference_reset_base(ref,preferred_newbasereg,ref.offset,ref.temppos,ref.alignment,ref.volatility);
  274. { fall through to the handling of base + offset, since the
  275. offset may still be too big }
  276. end;
  277. else
  278. internalerror(2014110901);
  279. end;
  280. end;
  281. { base + offset }
  282. if ref.base<>NR_NO then
  283. begin
  284. { valid offset for LDUR/STUR -> use that }
  285. if (ref.addressmode=AM_OFFSET) and
  286. (op in [A_LDR,A_STR]) and
  287. (ref.offset>=-256) and
  288. (ref.offset<=255) then
  289. begin
  290. if op=A_LDR then
  291. op:=A_LDUR
  292. else
  293. op:=A_STUR
  294. end
  295. { if it's not a valid LDUR/STUR, use LDR/STR }
  296. else if (op in [A_LDUR,A_STUR]) and
  297. ((ref.offset<-256) or
  298. (ref.offset>255) or
  299. (ref.addressmode<>AM_OFFSET)) then
  300. begin
  301. if op=A_LDUR then
  302. op:=A_LDR
  303. else
  304. op:=A_STR
  305. end;
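        { illustrative examples of the conversion above (an assembly sketch,
          not emitted verbatim):
            ldr  x0, [x1, #-8]   cannot be encoded (LDR needs a non-negative,
                                 scaled offset) -> ldur x0, [x1, #-8]
            ldur x0, [x1, #512]  is outside the signed 9 bit range
                                 -> ldr x0, [x1, #512] }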
  306. case op of
  307. A_LDR,A_STR:
  308. begin
  309. case ref.addressmode of
  310. AM_PREINDEXED:
  311. begin
  312. { since the loaded/stored register cannot be the same
  313. as the base register, we can safely add the
  314. offset to the base if it doesn't fit}
  315. if (ref.offset<-256) or
  316. (ref.offset>255) then
  317. begin
  318. a_op_const_reg(list,OP_ADD,OS_ADDR,ref.offset,ref.base);
  319. ref.offset:=0;
  320. end;
  321. end;
  322. AM_POSTINDEXED:
  323. begin
  324. { cannot emulate post-indexing if we have to fold the
  325. offset into the base register }
  326. if (ref.offset<-256) or
  327. (ref.offset>255) then
  328. internalerror(2014110909);
  329. { ok }
  330. end;
  331. AM_OFFSET:
  332. begin
  333. { unsupported offset -> fold into base register }
  334. accesssize:=1 shl tcgsizep2size[size];
  335. if (ref.offset<0) or
  336. (ref.offset>(((1 shl 12)-1)*accesssize)) or
  337. ((ref.offset mod accesssize)<>0) then
  338. begin
  339. if preferred_newbasereg=NR_NO then
  340. preferred_newbasereg:=getaddressregister(list);
  341. { can we split the offset between an
  342. "add/sub (imm12 shl 12)" and the load (also an
  343. imm12)?
  344. -- the offset from the load will always be added,
  345. that's why the lower bound has a smaller range
  346. than the upper bound; it must also be a multiple
  347. of the access size }
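        { worked example (values are assumptions): an 8 byte access at
          offset $13008 is split into "add tmp,base,#$13,lsl #12"
          (= $13000) plus a load with the remaining imm12 offset #8 }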
  348. if (ref.offset>=-(((1 shl 12)-1) shl 12)) and
  349. (ref.offset<=((1 shl 12)-1) shl 12 + ((1 shl 12)-1)) and
  350. ((ref.offset mod accesssize)=0) then
  351. begin
  352. a_op_const_reg_reg(list,OP_ADD,OS_ADDR,(ref.offset shr 12) shl 12,ref.base,preferred_newbasereg);
  353. ref.offset:=ref.offset-(ref.offset shr 12) shl 12;
  354. end
  355. else
  356. begin
  357. a_op_const_reg_reg(list,OP_ADD,OS_ADDR,ref.offset,ref.base,preferred_newbasereg);
  358. ref.offset:=0;
  359. end;
  360. reference_reset_base(ref,preferred_newbasereg,ref.offset,ref.temppos,ref.alignment,ref.volatility);
  361. end;
  362. end
  363. else
  364. internalerror(2014110904);
  365. end;
  366. end;
  367. A_LDP,A_STP:
  368. begin
  369. { unsupported offset -> fold into base register (these
  370. instructions support all addressmodes) }
  371. if (ref.offset<-(1 shl (6+tcgsizep2size[size]))) or
  372. (ref.offset>(1 shl (6+tcgsizep2size[size]))-1) then
  373. begin
  374. case ref.addressmode of
  375. AM_POSTINDEXED:
  376. { don't emulate post-indexing if we have to fold the
  377. offset into the base register }
  378. internalerror(2014110910);
  379. AM_PREINDEXED:
  380. { this means the offset must be added to the current
  381. base register }
  382. preferred_newbasereg:=ref.base;
  383. AM_OFFSET:
  384. if preferred_newbasereg=NR_NO then
  385. preferred_newbasereg:=getaddressregister(list);
  386. end;
  387. a_op_const_reg_reg(list,OP_ADD,OS_ADDR,ref.offset,ref.base,preferred_newbasereg);
  388. reference_reset_base(ref,preferred_newbasereg,0,ref.temppos,ref.alignment,ref.volatility);
  389. end
  390. end;
  391. A_LDUR,A_STUR:
  392. begin
  393. { valid, checked above }
  394. end;
  395. { todo }
  396. A_LD1,A_LD2,A_LD3,A_LD4,
  397. A_ST1,A_ST2,A_ST3,A_ST4:
  398. internalerror(2014110908);
  399. else
  400. internalerror(2014110708);
  401. end;
  402. { done }
  403. exit;
  404. end;
  405. { only an offset -> change to base (+ offset 0) }
  406. if preferred_newbasereg=NR_NO then
  407. preferred_newbasereg:=getaddressregister(list);
  408. a_load_const_reg(list,OS_ADDR,ref.offset,preferred_newbasereg);
  409. reference_reset_base(ref,preferred_newbasereg,0,ref.temppos,newalignment(8,ref.offset),ref.volatility);
  410. end;
  411. function tcgaarch64.makeregsize(reg: tregister; size: tcgsize): tregister;
  412. var
  413. subreg:Tsubregister;
  414. begin
  415. subreg:=cgsize2subreg(getregtype(reg),size);
  416. result:=reg;
  417. setsubreg(result,subreg);
  418. end;
  419. function tcgaarch64.getfpuregister(list: TAsmList; size: Tcgsize): Tregister;
  420. begin
  421. internalerror(2014122110);
  422. { squash warning }
  423. result:=NR_NO;
  424. end;
  425. function tcgaarch64.handle_load_store(list: TAsmList; op: tasmop; size: tcgsize; oppostfix: toppostfix; reg: tregister; ref: treference):treference;
  426. begin
  427. make_simple_ref(list,op,size,oppostfix,ref,NR_NO);
  428. list.concat(setoppostfix(taicpu.op_reg_ref(op,reg,ref),oppostfix));
  429. result:=ref;
  430. end;
  431. procedure tcgaarch64.handle_reg_imm12_reg(list: TAsmList; op: Tasmop; size: tcgsize; src: tregister; a: tcgint; dst: tregister; tmpreg: tregister; setflags, usedest: boolean);
  432. var
  433. instr: taicpu;
  434. so: tshifterop;
  435. hadtmpreg: boolean;
  436. begin
  437. { imm12 }
  438. if (a>=0) and
  439. (a<=((1 shl 12)-1)) then
  440. if usedest then
  441. instr:=taicpu.op_reg_reg_const(op,dst,src,a)
  442. else
  443. instr:=taicpu.op_reg_const(op,src,a)
  444. { imm12 lsl 12 }
  445. else if (a and not(((tcgint(1) shl 12)-1) shl 12))=0 then
  446. begin
  447. so.shiftmode:=SM_LSL;
  448. so.shiftimm:=12;
  449. if usedest then
  450. instr:=taicpu.op_reg_reg_const_shifterop(op,dst,src,a shr 12,so)
  451. else
  452. instr:=taicpu.op_reg_const_shifterop(op,src,a shr 12,so)
  453. end
  454. else
  455. begin
  456. { todo: other possible optimizations (e.g. load 16 bit constant in
  457. register and then add/sub/cmp/cmn shifted the rest) }
  458. if tmpreg=NR_NO then
  459. begin
  460. hadtmpreg:=false;
  461. tmpreg:=getintregister(list,size);
  462. end
  463. else
  464. begin
  465. hadtmpreg:=true;
  466. getcpuregister(list,tmpreg);
  467. end;
  468. a_load_const_reg(list,size,a,tmpreg);
  469. if usedest then
  470. instr:=taicpu.op_reg_reg_reg(op,dst,src,tmpreg)
  471. else
  472. instr:=taicpu.op_reg_reg(op,src,tmpreg);
  473. if hadtmpreg then
  474. ungetcpuregister(list,tmpreg);
  475. end;
  476. if setflags then
  477. setoppostfix(instr,PF_S);
  478. list.concat(instr);
  479. end;
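  { illustrative expansions of handle_reg_imm12_reg for OP_ADD (a sketch,
    register names are examples):
      a = $234    ->  add x0, x1, #$234
      a = $5000   ->  add x0, x1, #5, lsl #12
      a = $123456 ->  materialise the constant in a temp register first,
                      then  add x0, x1, x2 }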
  480. {****************************************************************************
  481. Assembler code
  482. ****************************************************************************}
  483. procedure tcgaarch64.init_register_allocators;
  484. begin
  485. inherited init_register_allocators;
  486. rg[R_INTREGISTER]:=trgintcpu.create(R_INTREGISTER,R_SUBWHOLE,
  487. [RS_X0,RS_X1,RS_X2,RS_X3,RS_X4,RS_X5,RS_X6,RS_X7,RS_X8,
  488. RS_X9,RS_X10,RS_X11,RS_X12,RS_X13,RS_X14,RS_X15,RS_X16,RS_X17,
  489. RS_X19,RS_X20,RS_X21,RS_X22,RS_X23,RS_X24,RS_X25,RS_X26,RS_X27,RS_X28
  490. { maybe we can enable this in the future for leaf functions (it's
  491. the frame pointer)
  492. ,RS_X29 }],
  493. first_int_imreg,[]);
  494. rg[R_MMREGISTER]:=trgcpu.create(R_MMREGISTER,R_SUBMMD,
  495. [RS_Q0,RS_Q1,RS_Q2,RS_Q3,RS_Q4,RS_Q5,RS_Q6,RS_Q7,
  496. RS_Q8,RS_Q9,RS_Q10,RS_Q11,RS_Q12,RS_Q13,RS_Q14,RS_Q15,
  497. RS_Q16,RS_Q17,RS_Q18,RS_Q19,RS_Q20,RS_Q21,RS_Q22,RS_Q23,
  498. RS_Q24,RS_Q25,RS_Q26,RS_Q27,RS_Q28,RS_Q29,RS_Q30,RS_Q31],
  499. first_mm_imreg,[]);
  500. end;
  501. procedure tcgaarch64.done_register_allocators;
  502. begin
  503. rg[R_INTREGISTER].free;
  504. rg[R_FPUREGISTER].free;
  505. rg[R_MMREGISTER].free;
  506. inherited done_register_allocators;
  507. end;
  508. function tcgaarch64.getmmregister(list: TAsmList; size: tcgsize):tregister;
  509. begin
  510. case size of
  511. OS_F32:
  512. result:=rg[R_MMREGISTER].getregister(list,R_SUBMMS);
  513. OS_F64:
  514. result:=rg[R_MMREGISTER].getregister(list,R_SUBMMD)
  515. else
  516. internalerror(2014102701);
  517. end;
  518. end;
  519. procedure tcgaarch64.a_call_name(list: TAsmList; const s: string; weak: boolean);
  520. begin
  521. if not weak then
  522. list.concat(taicpu.op_sym(A_BL,current_asmdata.RefAsmSymbol(s,AT_FUNCTION)))
  523. else
  524. list.concat(taicpu.op_sym(A_BL,current_asmdata.WeakRefAsmSymbol(s,AT_FUNCTION)));
  525. end;
  526. procedure tcgaarch64.a_call_reg(list:TAsmList;Reg:tregister);
  527. begin
  528. list.concat(taicpu.op_reg(A_BLR,reg));
  529. end;
  530. {********************** load instructions ********************}
  531. procedure tcgaarch64.a_load_const_reg(list: TAsmList; size: tcgsize; a: tcgint; reg : tregister);
  532. var
  533. opc: tasmop;
  534. shift: byte;
  535. so: tshifterop;
  536. reginited,doinverted: boolean;
  537. manipulated_a: tcgint;
  538. leftover_a: word;
  539. begin
  540. {$ifdef extdebug}
  541. list.concat(tai_comment.Create(strpnew('Generating constant ' + tostr(a) + ' / $' + hexstr(a, 16))));
  542. {$endif extdebug}
  543. case a of
  544. { Small positive number }
  545. $0..$FFFF:
  546. begin
  547. list.concat(taicpu.op_reg_const(A_MOVZ, reg, a));
  548. Exit;
  549. end;
  550. { Small negative number }
  551. -65536..-1:
  552. begin
  553. list.concat(taicpu.op_reg_const(A_MOVN, reg, Word(not a)));
  554. Exit;
  555. end;
  556. { Can be represented as a negative number more compactly }
  557. $FFFF0000..$FFFFFFFF:
  558. begin
  559. { if we load a value into a 32 bit register, it is automatically
  560. zero-extended to 64 bit }
  561. list.concat(taicpu.op_reg_const(A_MOVN, makeregsize(reg,OS_32), Word(not a)));
  562. Exit;
  563. end;
  564. else
  565. begin
  566. if size in [OS_64,OS_S64] then
  567. begin
  568. { Check to see if a is a valid shifter constant that can be encoded in ORR as is }
  569. if is_shifter_const(a,size) then
  570. begin
  571. list.concat(taicpu.op_reg_reg_const(A_ORR,reg,makeregsize(NR_XZR,size),a));
  572. Exit;
  573. end;
  574. { This determines whether this write can be performed with an ORR followed by MOVK
  575. by copying the 2nd word to the 4th word for the ORR constant, then overwriting
  576. the 4th word with the MOVK (the alternative would require 3 instructions) }
  577. leftover_a := word(a shr 48);
  578. manipulated_a := (a and $0000FFFFFFFFFFFF);
  579. if manipulated_a = $0000FFFFFFFFFFFF then
  580. begin
  581. { This is even better, as we can just use a single MOVN on the last word }
  582. shifterop_reset(so);
  583. so.shiftmode := SM_LSL;
  584. so.shiftimm := 48;
  585. list.concat(taicpu.op_reg_const_shifterop(A_MOVN, reg, word(not leftover_a), so));
  586. Exit;
  587. end;
  588. manipulated_a := manipulated_a or (((a shr 16) and $FFFF) shl 48);
  589. { if manipulated_a = a, don't check, because is_shifter_const was already
  590. called for a and it returned False. Reduces processing time. [Kit] }
  591. if (manipulated_a <> a) and is_shifter_const(manipulated_a, size) then
  592. begin
  593. { Encode value as:
  594. orr reg,xzr,manipulated_a
  595. movk reg,#(leftover_a),lsl #48
  596. }
  597. list.concat(taicpu.op_reg_reg_const(A_ORR, reg, makeregsize(NR_XZR, size), manipulated_a));
  598. shifterop_reset(so);
  599. so.shiftmode := SM_LSL;
  600. so.shiftimm := 48;
  601. list.concat(taicpu.op_reg_const_shifterop(A_MOVK, reg, leftover_a, so));
  602. Exit;
  603. end;
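  { worked example of the ORR+MOVK trick above (the constant is an
    assumption): a = $ABCD0000FFFF0000 is not a valid logical immediate,
    but copying the 2nd word ($FFFF) into the 4th word yields
    $FFFF0000FFFF0000, which is one, so we can emit
        orr   x0, xzr, #$FFFF0000FFFF0000
        movk  x0, #$ABCD, lsl #48 }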
  604. case a of
  605. { If a is in the given negative range, it can be stored
  606. more efficiently if it is inverted. }
  607. TCgInt($FFFF000000000000)..-65537:
  608. begin
  609. { NOTE: This excluded range can be more efficiently
  610. stored as the first 16 bits followed by a shifter constant }
  611. case a of
  612. TCgInt($FFFF0000FFFF0000)..TCgInt($FFFF0000FFFFFFFF):
  613. doinverted := False;
  614. else
  615. begin
  616. doinverted := True;
  617. a := not a;
  618. end;
  619. end;
  620. end;
  621. else
  622. doinverted := False;
  623. end;
  624. end
  625. else
  626. begin
  627. a:=cardinal(a);
  628. doinverted:=False;
  629. end;
  630. end;
  631. end;
  632. reginited:=false;
  633. shift:=0;
  634. if doinverted then
  635. opc:=A_MOVN
  636. else
  637. opc:=A_MOVZ;
  638. repeat
  639. { leftover is shifterconst? (don't check if we can represent it just
  640. as effectively with movz/movk, as this check is expensive) }
  641. if (word(a)<>0) then
  642. begin
  643. if not doinverted and
  644. ((shift<tcgsize2size[size]*(8 div 2)) and
  645. ((a shr 16)<>0)) and
  646. is_shifter_const(a shl shift,size) then
  647. begin
  648. if reginited then
  649. list.concat(taicpu.op_reg_reg_const(A_ORR,reg,reg,a shl shift))
  650. else
  651. list.concat(taicpu.op_reg_reg_const(A_ORR,reg,makeregsize(NR_XZR,size),a shl shift));
  652. exit;
  653. end;
  654. { set all 16 bit parts that are <> 0 }
  655. if shift=0 then
  656. begin
  657. list.concat(taicpu.op_reg_const(opc,reg,word(a)));
  658. reginited:=true;
  659. end
  660. else
  661. begin
  662. shifterop_reset(so);
  663. so.shiftmode:=SM_LSL;
  664. so.shiftimm:=shift;
  665. if not reginited then
  666. begin
  667. list.concat(taicpu.op_reg_const_shifterop(opc,reg,word(a),so));
  668. reginited:=true;
  669. end
  670. else
  671. begin
  672. if doinverted then
  673. list.concat(taicpu.op_reg_const_shifterop(A_MOVK,reg,word(not a),so))
  674. else
  675. list.concat(taicpu.op_reg_const_shifterop(A_MOVK,reg,word(a),so));
  676. end;
  677. end;
  678. end;
  679. a:=a shr 16;
  680. inc(shift,16);
  681. until a = 0;
  682. if not reginited then
  683. internalerror(2014102702);
  684. end;
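  { illustrative example of the movz/movk fallback (the constant is an
    assumption): a = $0011223344556677 is emitted 16 bits at a time,
    skipping zero words:
        movz  x0, #$6677
        movk  x0, #$4455, lsl #16
        movk  x0, #$2233, lsl #32
        movk  x0, #$0011, lsl #48 }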
  685. procedure tcgaarch64.a_load_const_ref(list: TAsmList; size: tcgsize; a: tcgint; const ref: treference);
  686. var
  687. reg: tregister;
  688. begin
  689. { use the zero register if possible }
  690. if a=0 then
  691. begin
  692. if size in [OS_64,OS_S64] then
  693. reg:=NR_XZR
  694. else
  695. reg:=NR_WZR;
  696. a_load_reg_ref(list,size,size,reg,ref);
  697. end
  698. else
  699. inherited;
  700. end;
  701. procedure tcgaarch64.a_load_reg_ref(list: TAsmList; fromsize, tosize: tcgsize; reg: tregister; const ref: treference);
  702. var
  703. oppostfix:toppostfix;
  704. hreg: tregister;
  705. begin
  706. if tcgsize2Size[fromsize]>=tcgsize2Size[tosize] then
  707. begin
  708. fromsize:=tosize;
  709. reg:=makeregsize(list,reg,fromsize);
  710. end
  711. { have a 32 bit register but need a 64 bit one? }
  712. else if tosize in [OS_64,OS_S64] then
  713. begin
  714. { sign extend if necessary }
  715. if fromsize in [OS_S8,OS_S16,OS_S32] then
  716. begin
  717. { can't overwrite reg, may be a constant reg }
  718. hreg:=getintregister(list,tosize);
  719. a_load_reg_reg(list,fromsize,tosize,reg,hreg);
  720. reg:=hreg;
  721. end
  722. else
  723. { top 32 bits are zero by default }
  724. reg:=makeregsize(reg,OS_64);
  725. fromsize:=tosize;
  726. end;
  727. if (ref.alignment<>0) and
  728. (ref.alignment<tcgsize2size[tosize]) then
  729. begin
  730. a_load_reg_ref_unaligned(list,fromsize,tosize,reg,ref);
  731. end
  732. else
  733. begin
  734. case tosize of
  735. { integer registers }
  736. OS_8,
  737. OS_S8:
  738. oppostfix:=PF_B;
  739. OS_16,
  740. OS_S16:
  741. oppostfix:=PF_H;
  742. OS_32,
  743. OS_S32,
  744. OS_64,
  745. OS_S64:
  746. oppostfix:=PF_None;
  747. else
  748. InternalError(200308299);
  749. end;
  750. handle_load_store(list,A_STR,tosize,oppostfix,reg,ref);
  751. end;
  752. end;
  753. procedure tcgaarch64.a_load_ref_reg(list: TAsmList; fromsize, tosize: tcgsize; const ref: treference; reg: tregister);
  754. var
  755. oppostfix:toppostfix;
  756. begin
  757. if tcgsize2Size[fromsize]>=tcgsize2Size[tosize] then
  758. fromsize:=tosize;
  759. { ensure that all bits of the 32/64 register are always correctly set:
  760. * default behaviour is always to zero-extend to the entire (64 bit)
  761. register -> unsigned 8/16/32 bit loads only exist with a 32 bit
  762. target register, as the upper 32 bit will be zeroed implicitly
  763. -> always make target register 32 bit
  764. * signed loads exist both with 32 and 64 bit target registers,
  765. depending on whether the value should be sign extended to 32 or
  766. to 64 bit (if sign extended to 32 bit, the upper 32 bits of the
  767. corresponding 64 bit register are again zeroed) -> no need to
  768. change anything (we only have 32 and 64 bit registers), except that
  769. when loading an OS_S32 to a 32 bit register, we don't need/can't
  770. use sign extension
  771. }
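        { illustrative examples (an assembly sketch only):
            OS_8   into a w register:  ldrb  w0, [...]  (bits 8..63 are
                                       implicitly zeroed)
            OS_S8  into an x register: ldrsb x0, [...]
            OS_S32 into a w register:  plain ldr w0, [...] (no sign
                                       extension needed within 32 bit) }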
  772. if fromsize in [OS_8,OS_16,OS_32] then
  773. reg:=makeregsize(reg,OS_32);
  774. if (ref.alignment<>0) and
  775. (ref.alignment<tcgsize2size[fromsize]) then
  776. begin
  777. a_load_ref_reg_unaligned(list,fromsize,tosize,ref,reg);
  778. exit;
  779. end;
  780. case fromsize of
  781. { integer registers }
  782. OS_8:
  783. oppostfix:=PF_B;
  784. OS_S8:
  785. oppostfix:=PF_SB;
  786. OS_16:
  787. oppostfix:=PF_H;
  788. OS_S16:
  789. oppostfix:=PF_SH;
  790. OS_S32:
  791. if getsubreg(reg)=R_SUBD then
  792. oppostfix:=PF_NONE
  793. else
  794. oppostfix:=PF_SW;
  795. OS_32,
  796. OS_64,
  797. OS_S64:
  798. oppostfix:=PF_None;
  799. else
  800. InternalError(200308297);
  801. end;
  802. handle_load_store(list,A_LDR,fromsize,oppostfix,reg,ref);
  803. { clear upper 16 bits if the value was negative }
  804. if (fromsize=OS_S8) and (tosize=OS_16) then
  805. a_load_reg_reg(list,fromsize,tosize,reg,reg);
  806. end;
  807. procedure tcgaarch64.a_load_ref_reg_unaligned(list: TAsmList; fromsize, tosize: tcgsize; const ref: treference; register: tregister);
  808. var
  809. href: treference;
  810. hreg1, hreg2, tmpreg: tregister;
  811. begin
  812. if fromsize in [OS_64,OS_S64] then
  813. begin
  814. { split into two 32 bit loads }
  815. hreg1:=getintregister(list,OS_32);
  816. hreg2:=getintregister(list,OS_32);
  817. if target_info.endian=endian_big then
  818. begin
  819. tmpreg:=hreg1;
  820. hreg1:=hreg2;
  821. hreg2:=tmpreg;
  822. end;
  823. { can we use LDP? }
  824. if (ref.alignment=4) and
  825. (simple_ref_type(A_LDP,OS_32,PF_None,ref)=sr_simple) then
  826. list.concat(taicpu.op_reg_reg_ref(A_LDP,hreg1,hreg2,ref))
  827. else
  828. begin
  829. a_load_ref_reg(list,OS_32,OS_32,ref,hreg1);
  830. href:=ref;
  831. inc(href.offset,4);
  832. a_load_ref_reg(list,OS_32,OS_32,href,hreg2);
  833. end;
  834. a_load_reg_reg(list,OS_32,OS_64,hreg1,register);
  835. list.concat(taicpu.op_reg_reg_const_const(A_BFI,register,makeregsize(hreg2,OS_64),32,32));
  836. end
  837. else
  838. inherited;
  839. end;
  840. procedure tcgaarch64.a_load_reg_reg(list:TAsmList;fromsize,tosize:tcgsize;reg1,reg2:tregister);
  841. var
  842. instr: taicpu;
  843. begin
  844. { we use both 32 and 64 bit registers -> insert a conversion when
  845. we have to truncate/sign extend inside the (32 or 64 bit) register
  846. holding the value, and when we sign extend from a 32 to a 64 bit
  847. register }
  848. if (tcgsize2size[fromsize]>tcgsize2size[tosize]) or
  849. ((tcgsize2size[fromsize]=tcgsize2size[tosize]) and
  850. (fromsize<>tosize) and
  851. not(fromsize in [OS_32,OS_S32,OS_64,OS_S64])) or
  852. ((fromsize in [OS_S8,OS_S16,OS_S32]) and
  853. (tosize in [OS_64,OS_S64])) or
  854. { needs to mask out the sign in the top 16 bits }
  855. ((fromsize=OS_S8) and
  856. (tosize=OS_16)) then
  857. begin
  858. case tosize of
  859. OS_8:
  860. list.concat(setoppostfix(taicpu.op_reg_reg(A_UXT,reg2,makeregsize(reg1,OS_32)),PF_B));
  861. OS_16:
  862. list.concat(setoppostfix(taicpu.op_reg_reg(A_UXT,reg2,makeregsize(reg1,OS_32)),PF_H));
  863. OS_S8:
  864. list.concat(setoppostfix(taicpu.op_reg_reg(A_SXT,reg2,makeregsize(reg1,OS_32)),PF_B));
  865. OS_S16:
  866. list.concat(setoppostfix(taicpu.op_reg_reg(A_SXT,reg2,makeregsize(reg1,OS_32)),PF_H));
  867. { while "mov wN, wM" automatically inserts a zero-extension and
  868. hence we could encode a 64->32 bit move like that, the problem
  869. is that we then can't distinguish 64->32 from 32->32 moves, and
  870. the 64->32 truncation could be removed altogether... So use a
  871. different instruction }
  872. OS_32,
  873. OS_S32:
  874. { in theory, reg1 should be 64 bit here (since fromsize>tosize),
  875. but because of the way location_force_register() tries to
  876. avoid superfluous zero/sign extensions, it's not always the
  877. case -> also force reg1 to 64 bit }
  878. list.concat(taicpu.op_reg_reg_const_const(A_UBFIZ,makeregsize(reg2,OS_64),makeregsize(reg1,OS_64),0,32));
  879. OS_64,
  880. OS_S64:
  881. list.concat(setoppostfix(taicpu.op_reg_reg(A_SXT,reg2,makeregsize(reg1,OS_32)),PF_W));
  882. else
  883. internalerror(2002090901);
  884. end;
  885. end
  886. else
  887. begin
  888. { 32 -> 32 bit move implies zero extension (sign extensions have
  889. been handled above) -> also use for 32 <-> 64 bit moves }
  890. if not(fromsize in [OS_64,OS_S64]) or
  891. not(tosize in [OS_64,OS_S64]) then
  892. instr:=taicpu.op_reg_reg(A_MOV,makeregsize(reg2,OS_32),makeregsize(reg1,OS_32))
  893. else
  894. instr:=taicpu.op_reg_reg(A_MOV,reg2,reg1);
  895. list.Concat(instr);
  896. { Notify the register allocator that we have written a move instruction so
  897. it can try to eliminate it. }
  898. add_move_instruction(instr);
  899. end;
  900. end;
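  { illustrative expansions of the conversions above (a sketch, register
    names are examples):
      truncate to OS_S8:  sxtb  w0, w1
      truncate to OS_16:  uxth  w0, w1
      OS_S32 -> OS_S64:   sxtw  x0, w1
      OS_64  -> OS_32:    ubfiz x0, x1, #0, #32 }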
  901. procedure tcgaarch64.a_loadaddr_ref_reg(list: TAsmList; const ref: treference; r: tregister);
  902. var
  903. href: treference;
  904. so: tshifterop;
  905. op: tasmop;
  906. begin
  907. op:=A_LDR;
  908. href:=ref;
  909. { simplify as if we're going to perform a regular 64 bit load, using
  910. "r" as the new base register if possible/necessary }
  911. make_simple_ref(list,op,OS_ADDR,PF_None,href,r);
  912. { load literal? }
  913. if assigned(href.symbol) then
  914. begin
  915. if (href.base<>NR_NO) or
  916. (href.index<>NR_NO) or
  917. not assigned(href.symboldata) then
  918. internalerror(2014110912);
  919. list.concat(taicpu.op_reg_sym_ofs(A_ADR,r,href.symbol,href.offset));
  920. end
  921. else
  922. begin
  923. if href.index<>NR_NO then
  924. begin
  925. if href.shiftmode<>SM_None then
  926. begin
  927. { "add" supports a supperset of the shift modes supported by
  928. load/store instructions }
  929. shifterop_reset(so);
  930. so.shiftmode:=href.shiftmode;
  931. so.shiftimm:=href.shiftimm;
  932. list.concat(taicpu.op_reg_reg_reg_shifterop(A_ADD,r,href.base,href.index,so));
  933. end
  934. else
  935. a_op_reg_reg_reg(list,OP_ADD,OS_ADDR,href.index,href.base,r);
  936. end
  937. else if href.offset<>0 then
  938. a_op_const_reg_reg(list,OP_ADD,OS_ADDR,href.offset,href.base,r)
  939. else
  940. a_load_reg_reg(list,OS_ADDR,OS_ADDR,href.base,r);
  941. end;
  942. end;
  943. procedure tcgaarch64.a_loadfpu_reg_reg(list: TAsmList; fromsize, tosize: tcgsize; reg1, reg2: tregister);
  944. begin
  945. internalerror(2014122107)
  946. end;
  947. procedure tcgaarch64.a_loadfpu_ref_reg(list: TAsmList; fromsize, tosize: tcgsize; const ref: treference; reg: tregister);
  948. begin
  949. internalerror(2014122108)
  950. end;
  951. procedure tcgaarch64.a_loadfpu_reg_ref(list: TAsmList; fromsize, tosize: tcgsize; reg: tregister; const ref: treference);
  952. begin
  953. internalerror(2014122109)
  954. end;
  955. procedure tcgaarch64.a_loadmm_reg_reg(list: TAsmList; fromsize, tosize: tcgsize; reg1, reg2: tregister; shuffle: pmmshuffle);
  956. var
  957. instr: taicpu;
  958. begin
  959. if assigned(shuffle) and
  960. not shufflescalar(shuffle) then
  961. internalerror(2014122104);
  962. if fromsize=tosize then
  963. begin
  964. instr:=taicpu.op_reg_reg(A_FMOV,reg2,reg1);
  965. { Notify the register allocator that we have written a move
  966. instruction so it can try to eliminate it. }
  967. add_move_instruction(instr);
  968. { FMOV cannot generate a floating point exception }
  969. end
  970. else
  971. begin
  972. if (reg_cgsize(reg1)<>fromsize) or
  973. (reg_cgsize(reg2)<>tosize) then
  974. internalerror(2014110913);
  975. instr:=taicpu.op_reg_reg(A_FCVT,reg2,reg1);
  976. maybe_check_for_fpu_exception(list);
  977. end;
  978. list.Concat(instr);
  979. end;
  980. procedure tcgaarch64.a_loadmm_ref_reg(list: TAsmList; fromsize, tosize: tcgsize; const ref: treference; reg: tregister; shuffle: pmmshuffle);
  981. var
  982. tmpreg: tregister;
  983. begin
  984. if assigned(shuffle) and
  985. not shufflescalar(shuffle) then
  986. internalerror(2014122105);
  987. tmpreg:=NR_NO;
  988. if (fromsize<>tosize) then
  989. begin
  990. tmpreg:=reg;
  991. reg:=getmmregister(list,fromsize);
  992. end;
  993. handle_load_store(list,A_LDR,fromsize,PF_None,reg,ref);
  994. if (fromsize<>tosize) then
  995. a_loadmm_reg_reg(list,fromsize,tosize,reg,tmpreg,nil);
  996. end;
  997. procedure tcgaarch64.a_loadmm_reg_ref(list: TAsmList; fromsize, tosize: tcgsize; reg: tregister; const ref: treference; shuffle: pmmshuffle);
  998. var
  999. tmpreg: tregister;
  1000. begin
  1001. if assigned(shuffle) and
  1002. not shufflescalar(shuffle) then
  1003. internalerror(2014122106);
  1004. if (fromsize<>tosize) then
  1005. begin
  1006. tmpreg:=getmmregister(list,tosize);
  1007. a_loadmm_reg_reg(list,fromsize,tosize,reg,tmpreg,nil);
  1008. reg:=tmpreg;
  1009. end;
  1010. handle_load_store(list,A_STR,tosize,PF_NONE,reg,ref);
  1011. end;
  1012. procedure tcgaarch64.a_loadmm_intreg_reg(list: TAsmList; fromsize, tosize: tcgsize; intreg, mmreg: tregister; shuffle: pmmshuffle);
  1013. begin
  1014. if not shufflescalar(shuffle) then
  1015. internalerror(2014122801);
  1016. if not(tcgsize2size[fromsize] in [4,8]) or
  1017. (tcgsize2size[fromsize]<>tcgsize2size[tosize]) then
  1018. internalerror(2014122803);
  1019. list.concat(taicpu.op_reg_reg(A_INS,mmreg,intreg));
  1020. end;
  1021. procedure tcgaarch64.a_loadmm_reg_intreg(list: TAsmList; fromsize, tosize: tcgsize; mmreg, intreg: tregister; shuffle: pmmshuffle);
  1022. var
  1023. r : tregister;
  1024. begin
  1025. if not shufflescalar(shuffle) then
  1026. internalerror(2014122802);
  1027. if not(tcgsize2size[fromsize] in [4,8]) or
  1028. (tcgsize2size[fromsize]>tcgsize2size[tosize]) then
  1029. internalerror(2014122804);
  1030. if tcgsize2size[fromsize]<tcgsize2size[tosize] then
  1031. r:=makeregsize(intreg,fromsize)
  1032. else
  1033. r:=intreg;
  1034. list.concat(taicpu.op_reg_reg(A_UMOV,r,mmreg));
  1035. end;
  1036. procedure tcgaarch64.a_opmm_reg_reg(list: TAsmList; Op: TOpCG; size: tcgsize; src, dst: tregister; shuffle: pmmshuffle);
  1037. begin
  1038. case op of
  1039. { "xor Vx,Vx" is used to initialize global regvars to 0 }
  1040. OP_XOR:
  1041. begin
  1042. if (src<>dst) or
  1043. (reg_cgsize(src)<>size) or
  1044. assigned(shuffle) then
  1045. internalerror(2015011401);
  1046. case size of
  1047. OS_F32,
  1048. OS_F64:
  1049. list.concat(taicpu.op_reg_const(A_MOVI,makeregsize(dst,OS_F64),0));
  1050. else
  1051. internalerror(2015011402);
  1052. end;
  1053. end
  1054. else
  1055. internalerror(2015011403);
  1056. end;
  1057. end;
  1058. procedure tcgaarch64.a_bit_scan_reg_reg(list: TAsmList; reverse: boolean; srcsize, dstsize: tcgsize; src, dst: TRegister);
  1059. var
  1060. bitsize,
  1061. signbit: longint;
  1062. begin
  1063. if srcsize in [OS_64,OS_S64] then
  1064. begin
  1065. bitsize:=64;
  1066. signbit:=6;
  1067. end
  1068. else
  1069. begin
  1070. bitsize:=32;
  1071. signbit:=5;
  1072. end;
  1073. { source is 0 -> dst will have to become 255 }
  1074. list.concat(taicpu.op_reg_const(A_CMP,src,0));
  1075. if reverse then
  1076. begin
  1077. list.Concat(taicpu.op_reg_reg(A_CLZ,makeregsize(dst,srcsize),src));
  1078. { xor 31/63 is the same as setting the lower 5/6 bits to
  1079. "31/63-(lower 5/6 bits of dst)" }
  1080. list.Concat(taicpu.op_reg_reg_const(A_EOR,dst,dst,bitsize-1));
  1081. end
  1082. else
  1083. begin
  1084. list.Concat(taicpu.op_reg_reg(A_RBIT,makeregsize(dst,srcsize),src));
  1085. list.Concat(taicpu.op_reg_reg(A_CLZ,dst,dst));
  1086. end;
  1087. { set dst to -1 if src was 0 }
  1088. list.Concat(taicpu.op_reg_reg_reg_cond(A_CSINV,dst,dst,makeregsize(NR_XZR,dstsize),C_NE));
  1089. { mask the -1 to 255 if src was 0 (anyone find a two-instruction
  1090. branch-free version? All of mine are 3...) }
  1091. list.Concat(setoppostfix(taicpu.op_reg_reg(A_UXT,makeregsize(dst,OS_32),makeregsize(dst,OS_32)),PF_B));
  1092. end;
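  { illustrative expansion for a forward 32 bit scan (a sketch, register
    names are examples):
        cmp   w1, #0
        rbit  w0, w1
        clz   w0, w0
        csinv w0, w0, wzr, ne    <- becomes -1 if the source was 0
        uxtb  w0, w0             <- ... and is masked to 255 }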
  1093. procedure tcgaarch64.a_load_reg_ref_unaligned(list: TAsmList; fromsize, tosize: tcgsize; register: tregister; const ref: treference);
  1094. var
  1095. href: treference;
  1096. hreg1, hreg2, tmpreg: tregister;
  1097. begin
  1098. if fromsize in [OS_64,OS_S64] then
  1099. begin
  1100. { split into two 32 bit stores }
  1101. hreg1:=getintregister(list,OS_32);
  1102. hreg2:=getintregister(list,OS_32);
  1103. a_load_reg_reg(list,OS_32,OS_32,makeregsize(register,OS_32),hreg1);
  1104. a_op_const_reg_reg(list,OP_SHR,OS_64,32,register,makeregsize(hreg2,OS_64));
  1105. if target_info.endian=endian_big then
  1106. begin
  1107. tmpreg:=hreg1;
  1108. hreg1:=hreg2;
  1109. hreg2:=tmpreg;
  1110. end;
  1111. { can we use STP? }
  1112. if (ref.alignment=4) and
  1113. (simple_ref_type(A_STP,OS_32,PF_None,ref)=sr_simple) then
  1114. list.concat(taicpu.op_reg_reg_ref(A_STP,hreg1,hreg2,ref))
  1115. else
  1116. begin
  1117. a_load_reg_ref(list,OS_32,OS_32,hreg1,ref);
  1118. href:=ref;
  1119. inc(href.offset,4);
  1120. a_load_reg_ref(list,OS_32,OS_32,hreg2,href);
  1121. end;
  1122. end
  1123. else
  1124. inherited;
  1125. end;
  1126. procedure tcgaarch64.maybeadjustresult(list: TAsmList; op: topcg; size: tcgsize; dst: tregister);
  1127. const
  1128. overflowops = [OP_MUL,OP_IMUL,OP_SHL,OP_ADD,OP_SUB,OP_NOT,OP_NEG];
  1129. begin
  1130. if (op in overflowops) and
  1131. (size in [OS_8,OS_S8,OS_16,OS_S16]) then
  1132. a_load_reg_reg(list,OS_32,size,makeregsize(dst,OS_32),makeregsize(dst,OS_32))
  1133. end;
  1134. procedure tcgaarch64.a_op_const_reg(list: TAsmList; op: topcg; size: tcgsize; a: tcgint; reg: tregister);
  1135. begin
  1136. optimize_op_const(size,op,a);
  1137. case op of
  1138. OP_NONE:
  1139. exit;
  1140. OP_MOVE:
  1141. a_load_const_reg(list,size,a,reg);
  1142. OP_NEG,OP_NOT:
  1143. internalerror(200306011);
  1144. else
  1145. a_op_const_reg_reg(list,op,size,a,reg,reg);
  1146. end;
  1147. end;
  1148. procedure tcgaarch64.a_op_reg_reg(list:TAsmList;op:topcg;size:tcgsize;src,dst:tregister);
  1149. begin
  1150. Case op of
  1151. OP_NEG,
  1152. OP_NOT:
  1153. begin
  1154. list.concat(taicpu.op_reg_reg(TOpCG2AsmOpReg[op],dst,src));
  1155. maybeadjustresult(list,op,size,dst);
  1156. end
  1157. else
  1158. a_op_reg_reg_reg(list,op,size,src,dst,dst);
  1159. end;
  1160. end;
  1161. procedure tcgaarch64.a_op_const_reg_reg(list: TAsmList; op: topcg; size: tcgsize; a: tcgint; src, dst: tregister);
  1162. var
  1163. l: tlocation;
  1164. begin
  1165. a_op_const_reg_reg_checkoverflow(list,op,size,a,src,dst,false,l);
  1166. end;
  1167. procedure tcgaarch64.a_op_reg_reg_reg(list: TAsmList; op: topcg; size: tcgsize; src1, src2, dst: tregister);
  1168. var
  1169. hreg: tregister;
  1170. begin
  1171. { no ROLV opcode... }
  1172. if op=OP_ROL then
  1173. begin
  1174. case size of
  1175. OS_32,OS_S32,
  1176. OS_64,OS_S64:
  1177. begin
  1178. hreg:=getintregister(list,size);
  1179. a_load_const_reg(list,size,tcgsize2size[size]*8,hreg);
  1180. a_op_reg_reg(list,OP_SUB,size,src1,hreg);
  1181. a_op_reg_reg_reg(list,OP_ROR,size,hreg,src2,dst);
  1182. exit;
  1183. end;
  1184. else
  1185. internalerror(2014111005);
  1186. end;
  1187. end
  1188. else if (op=OP_ROR) and
  1189. not(size in [OS_32,OS_S32,OS_64,OS_S64]) then
  1190. internalerror(2014111006);
  1191. if TOpCG2AsmOpReg[op]=A_NONE then
  1192. internalerror(2014111007);
  1193. list.concat(taicpu.op_reg_reg_reg(TOpCG2AsmOpReg[op],dst,src2,src1));
  1194. maybeadjustresult(list,op,size,dst);
  1195. end;
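  { illustrative expansion of the OP_ROL emulation above for a 64 bit
    rotate (a sketch, register names are examples):
        movz  x2, #64
        sub   x2, x2, x1     <- 64 - rotate count
        rorv  x0, x3, x2 }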
  1196. procedure tcgaarch64.a_op_const_reg_reg_checkoverflow(list: TAsmList; op: topcg; size: tcgsize; a: tcgint; src, dst: tregister; setflags : boolean; var ovloc : tlocation);
  1197. var
  1198. shiftcountmask: longint;
  1199. constreg: tregister;
  1200. begin
  1201. { add/sub instructions have only positive immediate operands }
    if (op in [OP_ADD,OP_SUB]) and
       (a<0) then
      begin
        if op=OP_ADD then
          op:=OP_SUB
        else
          op:=OP_ADD;
        { avoid range/overflow error in case a = low(tcgint) }
{$push}{$r-}{$q-}
        a:=-a;
{$pop}
      end;
    ovloc.loc:=LOC_VOID;
    optimize_op_const(size,op,a);
    case op of
      OP_NONE:
        begin
          a_load_reg_reg(list,size,size,src,dst);
          exit;
        end;
      OP_MOVE:
        begin
          a_load_const_reg(list,size,a,dst);
          exit;
        end;
    end;
    case op of
      OP_ADD,
      OP_SUB:
        begin
          handle_reg_imm12_reg(list,TOpCG2AsmOpImm[op],size,src,a,dst,NR_NO,setflags,true);
          { on a 64 bit target, overflows with smaller data types
            are handled via range errors }
          if setflags and
             (size in [OS_64,OS_S64]) then
            begin
              location_reset(ovloc,LOC_FLAGS,OS_8);
              if size=OS_64 then
                if op=OP_ADD then
                  ovloc.resflags:=F_CS
                else
                  ovloc.resflags:=F_CC
              else
                ovloc.resflags:=F_VS;
            end;
        end;
      OP_OR,
      OP_AND,
      OP_XOR:
        begin
          if not(size in [OS_64,OS_S64]) then
            a:=cardinal(a);
          if is_shifter_const(a,size) then
            list.concat(taicpu.op_reg_reg_const(TOpCG2AsmOpReg[op],dst,src,a))
          else
            begin
              constreg:=getintregister(list,size);
              a_load_const_reg(list,size,a,constreg);
              a_op_reg_reg_reg(list,op,size,constreg,src,dst);
            end;
        end;
      OP_SHL,
      OP_SHR,
      OP_SAR:
        begin
          if size in [OS_64,OS_S64] then
            shiftcountmask:=63
          else
            shiftcountmask:=31;
          if (a and shiftcountmask)<>0 then
            list.concat(taicpu.op_reg_reg_const(
              TOpCG2AsmOpImm[op],dst,src,a and shiftcountmask))
          else
            a_load_reg_reg(list,size,size,src,dst);
          if (a and not(tcgint(shiftcountmask)))<>0 then
            internalError(2014112101);
        end;
      OP_ROL,
      OP_ROR:
        begin
          case size of
            OS_32,OS_S32:
              if (a and not(tcgint(31)))<>0 then
                internalError(2014112102);
            OS_64,OS_S64:
              if (a and not(tcgint(63)))<>0 then
                internalError(2014112103);
            else
              internalError(2014112104);
          end;
          { there's only a ror opcode }
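          { e.g. a 64-bit rotate-left by 12 is emitted as "ror x0,x1,#52"
            (illustrative) }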
          if op=OP_ROL then
            a:=(tcgsize2size[size]*8)-a;
          list.concat(taicpu.op_reg_reg_const(A_ROR,dst,src,a));
        end;
      OP_MUL,
      OP_IMUL,
      OP_DIV,
      OP_IDIV:
        begin
          constreg:=getintregister(list,size);
          a_load_const_reg(list,size,a,constreg);
          a_op_reg_reg_reg_checkoverflow(list,op,size,constreg,src,dst,setflags,ovloc);
        end;
      else
        internalerror(2014111403);
    end;
    maybeadjustresult(list,op,size,dst);
  end;

procedure tcgaarch64.a_op_reg_reg_reg_checkoverflow(list: TAsmList; op: topcg; size: tcgsize; src1, src2, dst: tregister; setflags: boolean; var ovloc: tlocation);
  var
    tmpreg1, tmpreg2: tregister;
  begin
    ovloc.loc:=LOC_VOID;
    { overflow can only occur with 64 bit calculations on 64 bit cpus }
    if setflags and
       (size in [OS_64,OS_S64]) then
      begin
        case op of
          OP_ADD,
          OP_SUB:
            begin
              list.concat(setoppostfix(taicpu.op_reg_reg_reg(TOpCG2AsmOpReg[op],dst,src2,src1),PF_S));
              ovloc.loc:=LOC_FLAGS;
              if size=OS_64 then
                if op=OP_ADD then
                  ovloc.resflags:=F_CS
                else
                  ovloc.resflags:=F_CC
              else
                ovloc.resflags:=F_VS;
              { finished }
              exit;
            end;
          OP_MUL:
            begin
              { check whether the upper 64 bit of the 128 bit product is 0 }
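              { assumed sequence (register names illustrative):
                  umulh x2,x0,x1
                  cmp   x2,#0
                  mul   x3,x0,x1
                the multiplication overflowed if the flags say "not equal" }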
              tmpreg1:=getintregister(list,OS_64);
              list.concat(taicpu.op_reg_reg_reg(A_UMULH,tmpreg1,src2,src1));
              list.concat(taicpu.op_reg_const(A_CMP,tmpreg1,0));
              ovloc.loc:=LOC_FLAGS;
              ovloc.resflags:=F_NE;
              { still have to perform the actual multiplication }
            end;
          OP_IMUL:
            begin
              { check whether the upper 64 bits of the 128 bit multiplication
                result have the same value as the replicated sign bit of the
                lower 64 bits }
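              { assumed sequence (register names illustrative):
                  smulh x2,x0,x1
                  mul   x3,x0,x1
                  asr   x4,x3,#63
                  cmp   x2,x4
                the multiplication overflowed if the flags say "not equal" }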
              tmpreg1:=getintregister(list,OS_64);
              list.concat(taicpu.op_reg_reg_reg(A_SMULH,tmpreg1,src2,src1));
              { calculate lower 64 bits (afterwards, because dst may be
                equal to src1 or src2) }
              a_op_reg_reg_reg(list,op,size,src1,src2,dst);
              { replicate sign bit }
              tmpreg2:=getintregister(list,OS_64);
              a_op_const_reg_reg(list,OP_SAR,OS_S64,63,dst,tmpreg2);
              list.concat(taicpu.op_reg_reg(A_CMP,tmpreg1,tmpreg2));
              ovloc.loc:=LOC_FLAGS;
              ovloc.resflags:=F_NE;
              { finished }
              exit;
            end;
          OP_IDIV,
          OP_DIV:
            begin
              { not handled here: needs a div-by-zero check (dividing by
                zero just gives a 0 result on aarch64) and a low(int64)
                div -1 overflow check }
              internalerror(2014122101);
            end;
        end;
      end;
    a_op_reg_reg_reg(list,op,size,src1,src2,dst);
  end;

{*************** compare instructions ****************}

procedure tcgaarch64.a_cmp_const_reg_label(list: TAsmList; size: tcgsize; cmp_op: topcmp; a: tcgint; reg: tregister; l: tasmlabel);
  var
    op: tasmop;
  begin
    if a>=0 then
      op:=A_CMP
    else
      op:=A_CMN;
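    { e.g. a comparison against -5 becomes "cmn x0,#5", since cmp has no
      negative immediate encoding (illustrative) }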
    { avoid range/overflow error in case a=low(tcgint) }
{$push}{$r-}{$q-}
    handle_reg_imm12_reg(list,op,size,reg,abs(a),NR_XZR,NR_NO,false,false);
{$pop}
    a_jmp_cond(list,cmp_op,l);
  end;

procedure tcgaarch64.a_cmp_reg_reg_label(list: TAsmList; size: tcgsize; cmp_op: topcmp; reg1, reg2: tregister; l: tasmlabel);
  begin
    list.concat(taicpu.op_reg_reg(A_CMP,reg2,reg1));
    a_jmp_cond(list,cmp_op,l);
  end;

procedure tcgaarch64.a_jmp_always(list: TAsmList; l: TAsmLabel);
  var
    ai: taicpu;
  begin
    ai:=TAiCpu.op_sym(A_B,current_asmdata.RefAsmSymbol(l.name,AT_FUNCTION));
    ai.is_jmp:=true;
    list.Concat(ai);
  end;

procedure tcgaarch64.a_jmp_name(list: TAsmList; const s: string);
  var
    ai: taicpu;
  begin
    ai:=TAiCpu.op_sym(A_B,current_asmdata.RefAsmSymbol(s,AT_FUNCTION));
    ai.is_jmp:=true;
    list.Concat(ai);
  end;

procedure tcgaarch64.a_jmp_cond(list: TAsmList; cond: TOpCmp; l: TAsmLabel);
  var
    ai: taicpu;
  begin
    ai:=TAiCpu.op_sym(A_B,l);
    ai.is_jmp:=true;
    ai.SetCondition(TOpCmp2AsmCond[cond]);
    list.Concat(ai);
  end;

procedure tcgaarch64.a_jmp_flags(list: TAsmList; const f: tresflags; l: tasmlabel);
  var
    ai: taicpu;
  begin
    ai:=Taicpu.op_sym(A_B,l);
    ai.is_jmp:=true;
    ai.SetCondition(flags_to_cond(f));
    list.Concat(ai);
  end;

procedure tcgaarch64.g_flags2reg(list: TAsmList; size: tcgsize; const f: tresflags; reg: tregister);
  begin
    list.concat(taicpu.op_reg_cond(A_CSET,reg,flags_to_cond(f)));
  end;

procedure tcgaarch64.g_overflowcheck(list: TAsmList; const loc: tlocation; def: tdef);
  begin
    { we need an explicit overflow location, because there are many
      possibilities (not just the overflow flag, which is only used for
      signed add/sub) }
    internalerror(2014112303);
  end;

procedure tcgaarch64.g_overflowcheck_loc(list: TAsmList; const loc: tlocation; def: tdef; ovloc: tlocation);
  var
    hl: tasmlabel;
    hflags: tresflags;
  begin
    if not(cs_check_overflow in current_settings.localswitches) then
      exit;
    current_asmdata.getjumplabel(hl);
    case ovloc.loc of
      LOC_FLAGS:
        begin
          hflags:=ovloc.resflags;
          inverse_flags(hflags);
          cg.a_jmp_flags(list,hflags,hl);
        end;
      else
        internalerror(2014112304);
    end;
    a_call_name(list,'FPC_OVERFLOW',false);
    a_label(list,hl);
  end;

{ *********** entry/exit code and address loading ************ }

function tcgaarch64.save_regs(list: TAsmList; rt: tregistertype; lowsr, highsr: tsuperregister; sub: tsubregister): longint;
  var
    ref: treference;
    sr: tsuperregister;
    pairreg: tregister;
  begin
    result:=0;
    reference_reset_base(ref,NR_SP,-16,ctempposinvalid,16,[]);
    ref.addressmode:=AM_PREINDEXED;
    pairreg:=NR_NO;
    { store all used registers pairwise }
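    { e.g. if x19, x20 and x24 are used in the routine, this loop emits
      (illustrative)
          stp x19,x20,[sp,#-16]!
      and the leftover x24 is stored twice below to keep sp 16-byte aligned:
          stp x24,x24,[sp,#-16]! }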
    for sr:=lowsr to highsr do
      if sr in rg[rt].used_in_proc then
        if pairreg=NR_NO then
          pairreg:=newreg(rt,sr,sub)
        else
          begin
            inc(result,16);
            list.concat(taicpu.op_reg_reg_ref(A_STP,pairreg,newreg(rt,sr,sub),ref));
            pairreg:=NR_NO
          end;
    { one left -> store twice (stack must be 16 bytes aligned) }
    if pairreg<>NR_NO then
      begin
        list.concat(taicpu.op_reg_reg_ref(A_STP,pairreg,pairreg,ref));
        inc(result,16);
      end;
  end;

procedure FixupOffsets(p: TObject; arg: pointer);
  var
    sym: tabstractnormalvarsym absolute p;
  begin
    if (tsym(p).typ in [paravarsym,localvarsym]) and
       (sym.localloc.loc=LOC_REFERENCE) and
       (sym.localloc.reference.base=NR_STACK_POINTER_REG) then
      begin
        sym.localloc.reference.base:=NR_FRAME_POINTER_REG;
        dec(sym.localloc.reference.offset,PLongint(arg)^);
      end;
  end;

procedure tcgaarch64.g_proc_entry(list: TAsmList; localsize: longint; nostackframe: boolean);
  var
    ref: treference;
    totalstackframesize: longint;
  begin
    if nostackframe then
      exit;
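    { overall prologue shape emitted below (illustrative; register pairs and
      localsize depend on the routine):
          stp x29,x30,[sp,#-16]!
          mov x29,sp
          stp x19,x20,[sp,#-16]!    (saved register pairs, if any)
          sub sp,sp,#localsize }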
    { stack pointer has to be aligned to 16 bytes at all times }
    localsize:=align(localsize,16);
    { save frame pointer and return address }
    reference_reset_base(ref,NR_SP,-16,ctempposinvalid,16,[]);
    ref.addressmode:=AM_PREINDEXED;
    list.concat(taicpu.op_reg_reg_ref(A_STP,NR_FP,NR_LR,ref));
    { initialise frame pointer }
    a_load_reg_reg(list,OS_ADDR,OS_ADDR,NR_SP,NR_FP);
    totalstackframesize:=localsize;
    { save modified integer registers }
    inc(totalstackframesize,
      save_regs(list,R_INTREGISTER,RS_X19,RS_X28,R_SUBWHOLE));
    { only the lower 64 bits of the modified vector registers need to be
      saved; if the caller needs the upper 64 bits, it has to save them
      itself }
    inc(totalstackframesize,
      save_regs(list,R_MMREGISTER,RS_D8,RS_D15,R_SUBMMD));
    { allocate stack space }
    if localsize<>0 then
      begin
        localsize:=align(localsize,16);
        current_procinfo.final_localsize:=localsize;
        handle_reg_imm12_reg(list,A_SUB,OS_ADDR,NR_SP,localsize,NR_SP,NR_IP0,false,true);
      end;
    { By default, we use the frame pointer to access parameters passed via
      the stack and the stack pointer to address local variables and temps,
      because
      a) we can use bigger positive than negative offsets (so accessing
         locals via negative offsets from the frame pointer would be less
         efficient)
      b) we don't know the local size while generating the code, so
         accessing the parameters via the stack pointer is not possible
         without copying them
      The problem with this is the get_frame() intrinsic:
      a) it must return the same value as what we pass as parentfp
         parameter, since that's how it's used in the TP-style objects unit
      b) its return value must be usable to access all local data from a
         routine (locals and parameters), since that's all the nested
         routines have access to
      c) its return value must be usable to construct a backtrace, as it's
         also used by the exception handling routines
      The solution we use here, based on something similar that's done in
      the MIPS port, is to generate all accesses to locals in the routine
      itself SP-relative, and then after the code is generated and the local
      size is known (namely, here), we change all SP-relative variables/
      parameters into FP-relative ones. This means that they'll be accessed
      less efficiently from nested routines, but those accesses are indirect
      anyway and at least this way they can be accessed at all
    }
    if current_procinfo.has_nestedprocs then
      begin
        current_procinfo.procdef.localst.SymList.ForEachCall(@FixupOffsets,@totalstackframesize);
        current_procinfo.procdef.parast.SymList.ForEachCall(@FixupOffsets,@totalstackframesize);
      end;
  end;

procedure tcgaarch64.g_maybe_got_init(list: TAsmList);
  begin
    { nothing to do on Darwin or Linux }
  end;

procedure tcgaarch64.g_restore_registers(list: TAsmList);
  begin
    { done in g_proc_exit }
  end;

procedure tcgaarch64.load_regs(list: TAsmList; rt: tregistertype; lowsr, highsr: tsuperregister; sub: tsubregister);
  var
    ref: treference;
    sr, highestsetsr: tsuperregister;
    pairreg: tregister;
    regcount: longint;
  begin
    reference_reset_base(ref,NR_SP,16,ctempposinvalid,16,[]);
    ref.addressmode:=AM_POSTINDEXED;
    { highest reg stored twice? }
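    { save_regs stores an odd leftover register twice to keep sp 16-byte
      aligned; mirror that here: e.g. for used regs x19, x20 and x24 the
      restore sequence is (illustrative)
          ldr x24,[sp],#16
          ldp x19,x20,[sp],#16 }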
    regcount:=0;
    highestsetsr:=RS_NO;
    for sr:=lowsr to highsr do
      if sr in rg[rt].used_in_proc then
        begin
          inc(regcount);
          highestsetsr:=sr;
        end;
    if odd(regcount) then
      begin
        list.concat(taicpu.op_reg_ref(A_LDR,newreg(rt,highestsetsr,sub),ref));
        highestsetsr:=pred(highestsetsr);
      end;
    { load all (other) used registers pairwise }
    pairreg:=NR_NO;
    for sr:=highestsetsr downto lowsr do
      if sr in rg[rt].used_in_proc then
        if pairreg=NR_NO then
          pairreg:=newreg(rt,sr,sub)
        else
          begin
            list.concat(taicpu.op_reg_reg_ref(A_LDP,newreg(rt,sr,sub),pairreg,ref));
            pairreg:=NR_NO
          end;
    { there can't be any register left }
    if pairreg<>NR_NO then
      internalerror(2014112602);
  end;

procedure tcgaarch64.g_proc_exit(list: TAsmList; parasize: longint; nostackframe: boolean);
  var
    ref: treference;
    regsstored: boolean;
    sr: tsuperregister;
  begin
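    { overall epilogue shape emitted below (illustrative; mirrors
      g_proc_entry):
          add sp,sp,#localsize      (or mov sp,x29 if no regs were saved)
          ldp x19,x20,[sp],#16      (restored register pairs, if any)
          ldp x29,x30,[sp],#16
          ret }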
    if not nostackframe then
      begin
        { if no registers have been stored, we don't have to subtract the
          allocated temp space from the stack pointer }
        regsstored:=false;
        for sr:=RS_X19 to RS_X28 do
          if sr in rg[R_INTREGISTER].used_in_proc then
            begin
              regsstored:=true;
              break;
            end;
        if not regsstored then
          for sr:=RS_D8 to RS_D15 do
            if sr in rg[R_MMREGISTER].used_in_proc then
              begin
                regsstored:=true;
                break;
              end;
        { restore registers (and stack pointer) }
        if regsstored then
          begin
            if current_procinfo.final_localsize<>0 then
              handle_reg_imm12_reg(list,A_ADD,OS_ADDR,NR_SP,current_procinfo.final_localsize,NR_SP,NR_IP0,false,true);
            load_regs(list,R_MMREGISTER,RS_D8,RS_D15,R_SUBMMD);
            load_regs(list,R_INTREGISTER,RS_X19,RS_X28,R_SUBWHOLE);
          end
        else if current_procinfo.final_localsize<>0 then
          { restore stack pointer }
          a_load_reg_reg(list,OS_ADDR,OS_ADDR,NR_FP,NR_SP);
        { restore framepointer and return address }
        reference_reset_base(ref,NR_SP,16,ctempposinvalid,16,[]);
        ref.addressmode:=AM_POSTINDEXED;
        list.concat(taicpu.op_reg_reg_ref(A_LDP,NR_FP,NR_LR,ref));
      end;
    { return }
    list.concat(taicpu.op_none(A_RET));
  end;

procedure tcgaarch64.g_save_registers(list: TAsmList);
  begin
    { done in g_proc_entry }
  end;

{ ************* concatcopy ************ }

procedure tcgaarch64.g_concatcopy_move(list: TAsmList; const source, dest: treference; len: tcgint);
  var
    paraloc1, paraloc2, paraloc3: TCGPara;
    pd: tprocdef;
  begin
    pd:=search_system_proc('MOVE');
    paraloc1.init;
    paraloc2.init;
    paraloc3.init;
    paramanager.getintparaloc(list,pd,1,paraloc1);
    paramanager.getintparaloc(list,pd,2,paraloc2);
    paramanager.getintparaloc(list,pd,3,paraloc3);
    a_load_const_cgpara(list,OS_SINT,len,paraloc3);
    a_loadaddr_ref_cgpara(list,dest,paraloc2);
    a_loadaddr_ref_cgpara(list,source,paraloc1);
    paramanager.freecgpara(list,paraloc3);
    paramanager.freecgpara(list,paraloc2);
    paramanager.freecgpara(list,paraloc1);
    alloccpuregisters(list,R_INTREGISTER,paramanager.get_volatile_registers_int(pocall_default));
    alloccpuregisters(list,R_MMREGISTER,paramanager.get_volatile_registers_mm(pocall_default));
    a_call_name(list,'FPC_MOVE',false);
    dealloccpuregisters(list,R_MMREGISTER,paramanager.get_volatile_registers_mm(pocall_default));
    dealloccpuregisters(list,R_INTREGISTER,paramanager.get_volatile_registers_int(pocall_default));
    paraloc3.done;
    paraloc2.done;
    paraloc1.done;
  end;

procedure tcgaarch64.g_concatcopy(list: TAsmList; const source, dest: treference; len: tcgint);
  var
    sourcebasereplaced, destbasereplaced: boolean;

  { get optimal memory operation to use for loading/storing data
    in an unrolled loop }
  procedure getmemop(scaledop, unscaledop: tasmop; const startref, endref: treference; opsize: tcgsize; postfix: toppostfix; out memop: tasmop; out needsimplify: boolean);
    begin
      if (simple_ref_type(scaledop,opsize,postfix,startref)=sr_simple) and
         (simple_ref_type(scaledop,opsize,postfix,endref)=sr_simple) then
        begin
          memop:=scaledop;
          needsimplify:=false;
        end
      else if (unscaledop<>A_NONE) and
         (simple_ref_type(unscaledop,opsize,postfix,startref)=sr_simple) and
         (simple_ref_type(unscaledop,opsize,postfix,endref)=sr_simple) then
        begin
          memop:=unscaledop;
          needsimplify:=false;
        end
      else
        begin
          memop:=scaledop;
          needsimplify:=true;
        end;
    end;

  { adjust the offset and/or addressing mode after a load/store so it's
    correct for the next one of the same size }
  procedure updaterefafterloadstore(var ref: treference; oplen: longint);
    begin
      case ref.addressmode of
        AM_OFFSET:
          inc(ref.offset,oplen);
        AM_POSTINDEXED:
          { base register updated by instruction, next offset can remain
            the same }
          ;
        AM_PREINDEXED:
          begin
            { base register updated by instruction -> next instruction can
              use post-indexing with offset = sizeof(operation) }
            ref.offset:=0;
            ref.addressmode:=AM_OFFSET;
          end;
      end;
    end;

  { generate a load/store and adjust the reference offset to the next
    memory location if necessary }
  procedure genloadstore(list: TAsmList; op: tasmop; reg: tregister; var ref: treference; postfix: toppostfix; opsize: tcgsize);
    begin
      list.concat(setoppostfix(taicpu.op_reg_ref(op,reg,ref),postfix));
      updaterefafterloadstore(ref,tcgsize2size[opsize]);
    end;

  { generate a dual load/store (ldp/stp) and adjust the reference offset to
    the next memory location if necessary }
  procedure gendualloadstore(list: TAsmList; op: tasmop; reg1, reg2: tregister; var ref: treference; postfix: toppostfix; opsize: tcgsize);
    begin
      list.concat(setoppostfix(taicpu.op_reg_reg_ref(op,reg1,reg2,ref),postfix));
      updaterefafterloadstore(ref,tcgsize2size[opsize]*2);
    end;

  { turn a reference into a pre- or post-indexed reference for use in a
    load/store of a particular size }
  procedure makesimpleforcopy(list: TAsmList; var scaledop: tasmop; opsize: tcgsize; postfix: toppostfix; forcepostindexing: boolean; var ref: treference; var basereplaced: boolean);
    var
      tmpreg: tregister;
      scaledoffset: longint;
      orgaddressmode: taddressmode;
    begin
      scaledoffset:=tcgsize2size[opsize];
      if scaledop in [A_LDP,A_STP] then
        scaledoffset:=scaledoffset*2;
      { can we use the reference as post-indexed without changes? }
      if forcepostindexing then
        begin
          orgaddressmode:=ref.addressmode;
          ref.addressmode:=AM_POSTINDEXED;
          if (orgaddressmode=AM_POSTINDEXED) or
             ((ref.offset=0) and
              (simple_ref_type(scaledop,opsize,postfix,ref)=sr_simple)) then
            begin
              { just change the post-indexed offset to the access size }
              ref.offset:=scaledoffset;
              { and replace the base register if that didn't happen yet
                (could be sp or a regvar) }
              if not basereplaced then
                begin
                  tmpreg:=getaddressregister(list);
                  a_load_reg_reg(list,OS_ADDR,OS_ADDR,ref.base,tmpreg);
                  ref.base:=tmpreg;
                  basereplaced:=true;
                end;
              exit;
            end;
          ref.addressmode:=orgaddressmode;
        end;
{$ifdef dummy}
      { This could in theory be useful in case you have a concatcopy from
        e.g. x1+255 to x1+267 *and* the reference is aligned, but this seems
        very unlikely. Disabled because it still needs fixes, as it
        also generates pre-indexed loads right now at the very end for the
        left-over gencopies }
      { can we turn it into a pre-indexed reference for free? (after the
        first operation, it will be turned into an offset one) }
      if not forcepostindexing and
         (ref.offset<>0) then
        begin
          orgaddressmode:=ref.addressmode;
          ref.addressmode:=AM_PREINDEXED;
          tmpreg:=ref.base;
          if not basereplaced and
             (ref.base=tmpreg) then
            begin
              tmpreg:=getaddressregister(list);
              a_load_reg_reg(list,OS_ADDR,OS_ADDR,ref.base,tmpreg);
              ref.base:=tmpreg;
              basereplaced:=true;
            end;
          if simple_ref_type(scaledop,opsize,postfix,ref)<>sr_simple then
            make_simple_ref(list,scaledop,opsize,postfix,ref,NR_NO);
          exit;
        end;
{$endif dummy}
      if not forcepostindexing then
        begin
          ref.addressmode:=AM_OFFSET;
          make_simple_ref(list,scaledop,opsize,postfix,ref,NR_NO);
          { this may still cause problems if the final offset is no longer
            a simple ref; it's a bit complicated to pass all information
            through at all places and check that here, so play safe: we
            currently never generate unrolled copies for more than 64
            bytes (32 with non-double-register copies) }
          if ref.index=NR_NO then
            begin
              if ((scaledop in [A_LDP,A_STP]) and
                  (ref.offset<((64-8)*tcgsize2size[opsize]))) or
                 ((scaledop in [A_LDUR,A_STUR]) and
                  (ref.offset<(255-8*tcgsize2size[opsize]))) or
                 ((scaledop in [A_LDR,A_STR]) and
                  (ref.offset<((4096-8)*tcgsize2size[opsize]))) then
                exit;
            end;
        end;
      tmpreg:=getaddressregister(list);
      a_loadaddr_ref_reg(list,ref,tmpreg);
      basereplaced:=true;
      if forcepostindexing then
        begin
          reference_reset_base(ref,tmpreg,scaledoffset,ref.temppos,ref.alignment,ref.volatility);
          ref.addressmode:=AM_POSTINDEXED;
        end
      else
        begin
          reference_reset_base(ref,tmpreg,0,ref.temppos,ref.alignment,ref.volatility);
          ref.addressmode:=AM_OFFSET;
        end
    end;

  { prepare a reference for use by gencopy. This is done both after the
    unrolled and regular copy loop -> get rid of post-indexing mode, make
    sure ref is valid }
  procedure preparecopy(list: tasmlist; scaledop, unscaledop: tasmop; var ref: treference; opsize: tcgsize; postfix: toppostfix; out op: tasmop; var basereplaced: boolean);
    var
      simplify: boolean;
    begin
      if ref.addressmode=AM_POSTINDEXED then
        ref.offset:=tcgsize2size[opsize];
      getmemop(scaledop,scaledop,ref,ref,opsize,postfix,op,simplify);
      if simplify then
        begin
          makesimpleforcopy(list,scaledop,opsize,postfix,false,ref,basereplaced);
          op:=scaledop;
        end;
    end;

  { generate a copy from source to dest of size opsize/postfix }
  procedure gencopy(list: TAsmList; var source, dest: treference; postfix: toppostfix; opsize: tcgsize);
    var
      reg: tregister;
      loadop, storeop: tasmop;
    begin
      preparecopy(list,A_LDR,A_LDUR,source,opsize,postfix,loadop,sourcebasereplaced);
      preparecopy(list,A_STR,A_STUR,dest,opsize,postfix,storeop,destbasereplaced);
      reg:=getintregister(list,opsize);
      genloadstore(list,loadop,reg,source,postfix,opsize);
      genloadstore(list,storeop,reg,dest,postfix,opsize);
    end;

  { copy the leftovers after an unrolled or regular copy loop }
  procedure gencopyleftovers(list: TAsmList; var source, dest: treference; len: longint);
    begin
      { stop post-indexing if we did so in the loop, since in that case all
        offsets definitely can be represented now }
      if source.addressmode=AM_POSTINDEXED then
        begin
          source.addressmode:=AM_OFFSET;
          source.offset:=0;
        end;
      if dest.addressmode=AM_POSTINDEXED then
        begin
          dest.addressmode:=AM_OFFSET;
          dest.offset:=0;
        end;
      { transfer the leftovers }
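      { e.g. len=7 results in one 4-byte, one 2-byte and one 1-byte copy
        (illustrative) }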
      if len>=8 then
        begin
          dec(len,8);
          gencopy(list,source,dest,PF_NONE,OS_64);
        end;
      if len>=4 then
        begin
          dec(len,4);
          gencopy(list,source,dest,PF_NONE,OS_32);
        end;
      if len>=2 then
        begin
          dec(len,2);
          gencopy(list,source,dest,PF_H,OS_16);
        end;
      if len>=1 then
        begin
          dec(len);
          gencopy(list,source,dest,PF_B,OS_8);
        end;
    end;

  const
    { load_length + loop dec + cbnz }
    loopoverhead=12;
    { loop overhead + load + store }
    totallooplen=loopoverhead + 8;
  var
    totalalign: longint;
    maxlenunrolled: tcgint;
    loadop, storeop: tasmop;
    opsize: tcgsize;
    postfix: toppostfix;
    tmpsource, tmpdest: treference;
    scaledstoreop, unscaledstoreop,
    scaledloadop, unscaledloadop: tasmop;
    regs: array[1..8] of tregister;
    countreg: tregister;
    i, regcount: longint;
    hl: tasmlabel;
    simplifysource, simplifydest: boolean;
  begin
    if len=0 then
      exit;
    sourcebasereplaced:=false;
    destbasereplaced:=false;
    { maximum common alignment }
    totalalign:=max(1,newalignment(source.alignment,dest.alignment));
    { use a simple load/store? }
    if (len in [1,2,4,8]) and
       ((totalalign>=(len div 2)) or
        (source.alignment=len) or
        (dest.alignment=len)) then
      begin
        opsize:=int_cgsize(len);
        a_load_ref_ref(list,opsize,opsize,source,dest);
        exit;
      end;
    { alignment > length is not useful, and would break some checks below }
    while totalalign>len do
      totalalign:=totalalign div 2;
    { operation sizes to use based on common alignment }
    case totalalign of
      1:
        begin
          postfix:=PF_B;
          opsize:=OS_8;
        end;
      2:
        begin
          postfix:=PF_H;
          opsize:=OS_16;
        end;
      4:
        begin
          postfix:=PF_None;
          opsize:=OS_32;
        end
      else
        begin
          totalalign:=8;
          postfix:=PF_None;
          opsize:=OS_64;
        end;
    end;
    { maximum length to handle with an unrolled loop (4 loads + 4 stores) }
    maxlenunrolled:=min(totalalign,8)*4;
    { ldp/stp -> 2 registers per instruction }
    if (totalalign>=4) and
       (len>=totalalign*2) then
      begin
        maxlenunrolled:=maxlenunrolled*2;
        scaledstoreop:=A_STP;
        scaledloadop:=A_LDP;
        unscaledstoreop:=A_NONE;
        unscaledloadop:=A_NONE;
      end
    else
      begin
        scaledstoreop:=A_STR;
        scaledloadop:=A_LDR;
        unscaledstoreop:=A_STUR;
        unscaledloadop:=A_LDUR;
      end;
    { we only need 4 instructions extra to call FPC_MOVE }
    if cs_opt_size in current_settings.optimizerswitches then
      maxlenunrolled:=maxlenunrolled div 2;
    if (len>maxlenunrolled) and
       (len>totalalign*8) then
      begin
        g_concatcopy_move(list,source,dest,len);
        exit;
      end;
    simplifysource:=true;
    simplifydest:=true;
    tmpsource:=source;
    tmpdest:=dest;
    { can we directly encode all offsets in an unrolled loop? }
    if len<=maxlenunrolled then
      begin
{$ifdef extdebug}
        list.concat(tai_comment.Create(strpnew('concatcopy unrolled loop; len/opsize/align: '+tostr(len)+'/'+tostr(tcgsize2size[opsize])+'/'+tostr(totalalign))));
{$endif extdebug}
        { the leftovers will be handled separately -> -(len mod opsize) }
        inc(tmpsource.offset,len-(len mod tcgsize2size[opsize]));
        { additionally, the last regular load/store will be at
          offset+len-opsize (if len-(len mod opsize)>0) }
        if tmpsource.offset>source.offset then
          dec(tmpsource.offset,tcgsize2size[opsize]);
        getmemop(scaledloadop,unscaledloadop,source,tmpsource,opsize,postfix,loadop,simplifysource);
        inc(tmpdest.offset,len-(len mod tcgsize2size[opsize]));
        if tmpdest.offset>dest.offset then
          dec(tmpdest.offset,tcgsize2size[opsize]);
        getmemop(scaledstoreop,unscaledstoreop,dest,tmpdest,opsize,postfix,storeop,simplifydest);
        tmpsource:=source;
        tmpdest:=dest;
        { if we can't directly encode all offsets, simplify }
        if simplifysource then
          begin
            loadop:=scaledloadop;
            makesimpleforcopy(list,loadop,opsize,postfix,false,tmpsource,sourcebasereplaced);
          end;
        if simplifydest then
          begin
            storeop:=scaledstoreop;
            makesimpleforcopy(list,storeop,opsize,postfix,false,tmpdest,destbasereplaced);
          end;
        regcount:=len div tcgsize2size[opsize];
        { in case we transfer two registers at a time, we copy an even
          number of registers }
        if loadop=A_LDP then
          regcount:=regcount and not(1);
        { initialise for dfa }
        regs[low(regs)]:=NR_NO;
        { max 4 loads/stores -> max 8 registers (in case of ldp/stp) }
        for i:=1 to regcount do
          regs[i]:=getintregister(list,opsize);
        if loadop=A_LDP then
          begin
            { load registers }
            for i:=1 to (regcount div 2) do
              gendualloadstore(list,loadop,regs[i*2-1],regs[i*2],tmpsource,postfix,opsize);
            { store registers }
            for i:=1 to (regcount div 2) do
              gendualloadstore(list,storeop,regs[i*2-1],regs[i*2],tmpdest,postfix,opsize);
          end
        else
          begin
            for i:=1 to regcount do
              genloadstore(list,loadop,regs[i],tmpsource,postfix,opsize);
            for i:=1 to regcount do
              genloadstore(list,storeop,regs[i],tmpdest,postfix,opsize);
          end;
        { leftover }
        len:=len-regcount*tcgsize2size[opsize];
{$ifdef extdebug}
        list.concat(tai_comment.Create(strpnew('concatcopy unrolled loop leftover: '+tostr(len))));
{$endif extdebug}
      end
    else
      begin
{$ifdef extdebug}
        list.concat(tai_comment.Create(strpnew('concatcopy regular loop; len/align: '+tostr(len)+'/'+tostr(totalalign))));
{$endif extdebug}
        { regular loop -> definitely use post-indexing }
        loadop:=scaledloadop;
        makesimpleforcopy(list,loadop,opsize,postfix,true,tmpsource,sourcebasereplaced);
        storeop:=scaledstoreop;
        makesimpleforcopy(list,storeop,opsize,postfix,true,tmpdest,destbasereplaced);
        current_asmdata.getjumplabel(hl);
        countreg:=getintregister(list,OS_32);
        { an ldp/stp iteration transfers two registers, i.e. 2*opsize bytes }
        if loadop=A_LDP then
          a_load_const_reg(list,OS_32,len div (tcgsize2size[opsize]*2),countreg)
        else
          a_load_const_reg(list,OS_32,len div tcgsize2size[opsize],countreg);
        a_label(list,hl);
        a_op_const_reg(list,OP_SUB,OS_32,1,countreg);
        if loadop=A_LDP then
          begin
            regs[1]:=getintregister(list,opsize);
            regs[2]:=getintregister(list,opsize);
            gendualloadstore(list,loadop,regs[1],regs[2],tmpsource,postfix,opsize);
            gendualloadstore(list,storeop,regs[1],regs[2],tmpdest,postfix,opsize);
          end
        else
          begin
            regs[1]:=getintregister(list,opsize);
            genloadstore(list,loadop,regs[1],tmpsource,postfix,opsize);
            genloadstore(list,storeop,regs[1],tmpdest,postfix,opsize);
          end;
        list.concat(taicpu.op_reg_sym_ofs(A_CBNZ,countreg,hl,0));
        if loadop=A_LDP then
          len:=len mod (tcgsize2size[opsize]*2)
        else
          len:=len mod tcgsize2size[opsize];
      end;
    gencopyleftovers(list,tmpsource,tmpdest,len);
  end;

procedure tcgaarch64.g_adjust_self_value(list: TAsmList; procdef: tprocdef; ioffset: tcgint);
  begin
    { This method is integrated into g_intf_wrapper and shouldn't be called separately }
    InternalError(2013020102);
  end;

procedure tcgaarch64.g_check_for_fpu_exception(list: TAsmList; force, clear: boolean);
  var
    r, tmpreg: TRegister;
    ai: taicpu;
    l1, l2: TAsmLabel;
  begin
    { so far, we assume all flavours of AArch64 need explicit floating point exception checking }
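    { FPSR bits tested below (assumed bit layout): bits 0..4 ($1f) are the
      IOC/DZC/OFC/UFC/IXC cumulative exception flags and bit 7 ($80) is IDC;
      if any of them is set, FPC_THROWFPUEXCEPTION is called }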
    if ((cs_check_fpu_exceptions in current_settings.localswitches) and
        (force or current_procinfo.FPUExceptionCheckNeeded)) then
      begin
        r:=getintregister(list,OS_INT);
        tmpreg:=getintregister(list,OS_INT);
        list.concat(taicpu.op_reg_reg(A_MRS,r,NR_FPSR));
        list.concat(taicpu.op_reg_reg_const(A_AND,tmpreg,r,$1f));
        current_asmdata.getjumplabel(l1);
        current_asmdata.getjumplabel(l2);
        ai:=taicpu.op_reg_sym_ofs(A_CBNZ,tmpreg,l1,0);
        ai.is_jmp:=true;
        list.concat(ai);
        list.concat(taicpu.op_reg_reg_const(A_AND,tmpreg,r,$80));
        ai:=taicpu.op_reg_sym_ofs(A_CBZ,tmpreg,l2,0);
        ai.is_jmp:=true;
        list.concat(ai);
        a_label(list,l1);
        alloccpuregisters(list,R_INTREGISTER,paramanager.get_volatile_registers_int(pocall_default));
        cg.a_call_name(list,'FPC_THROWFPUEXCEPTION',false);
        dealloccpuregisters(list,R_INTREGISTER,paramanager.get_volatile_registers_int(pocall_default));
        a_label(list,l2);
        if clear then
          current_procinfo.FPUExceptionCheckNeeded:=false;
      end;
  end;

procedure create_codegen;
  begin
    cg:=tcgaarch64.Create;
    cg128:=tcg128.Create;
  end;

end.