optcse.pas 38 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857
  1. {
  2. Common subexpression elimination on base blocks
  3. Copyright (c) 2005-2012 by Florian Klaempfl
  4. This program is free software; you can redistribute it and/or modify
  5. it under the terms of the GNU General Public License as published by
  6. the Free Software Foundation; either version 2 of the License, or
  7. (at your option) any later version.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. GNU General Public License for more details.
  12. You should have received a copy of the GNU General Public License
  13. along with this program; if not, write to the Free Software
  14. Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  15. ****************************************************************************
  16. }
  17. unit optcse;
  18. {$i fpcdefs.inc}
  19. { $define csedebug}
  20. { $define csestats}
  21. interface
  22. uses
  23. sysutils,node;
  24. {
  25. the function creates non optimal code so far:
  26. - call para nodes are cse barriers because they can be reordered and thus the
  27. temp. creation could be done too late
  28. - the cse knows nothing about register pressure. In case of high register pressure, cse might
  29. have a negative impact
  30. - the list of cseinvariant node types and inline numbers is not complete yet
  31. Further, it could be done probably in a faster way though the complexity can't probably not reduced
  32. }
  33. function do_optcse(var rootnode : tnode) : tnode;
  34. function do_consttovar(var rootnode : tnode) : tnode;
  35. implementation
  36. uses
  37. globtype,globals,
  38. systems,
  39. cutils,cclasses,
  40. nutils,compinnr,
  41. nbas,nld,ninl,ncal,nadd,nmem,ncnv,
  42. pass_1,
  43. procinfo,
  44. paramgr,
  45. cpubase,
  46. symconst,symdef,symsym,symtable,symtype,
  47. defutil,
  48. optbase;
  49. const
  50. cseinvariant : set of tnodetype = [addn,muln,subn,divn,slashn,modn,andn,orn,xorn,notn,vecn,
  51. derefn,equaln,unequaln,ltn,gtn,lten,gten,typeconvn,subscriptn,
  52. inn,symdifn,shrn,shln,ordconstn,realconstn,unaryminusn,pointerconstn,stringconstn,setconstn,niln,
  53. setelementn,{arrayconstructorn,arrayconstructorrangen,}
  54. isn,asn,starstarn,nothingn,temprefn,loadparentfpn {,callparan},assignn,addrn];
  55. function searchsubdomain(var n:tnode; arg: pointer) : foreachnoderesult;
  56. begin
  57. if (n.nodetype in cseinvariant) or
  58. ((n.nodetype=inlinen) and
  59. (tinlinenode(n).inlinenumber in [in_length_x,in_assigned_x,in_sqr_real,in_sqrt_real,in_sin_real,in_cos_real,in_abs_long,
  60. in_abs_real,in_exp_real,in_ln_real,in_pi_real,in_popcnt_x,in_arctan_real,in_round_real,in_trunc_real,
  61. { cse on fma will still not work because it would require proper handling of call nodes
  62. with more than one parameter }
  63. in_fma_single,in_fma_double,in_fma_extended,in_fma_float128,
  64. in_min_single,in_min_double,in_max_single,in_max_double,
  65. in_max_longint,in_max_dword,in_min_longint,in_min_dword,
  66. in_max_int64,in_max_qword,in_min_int64,in_min_qword
  67. ])
  68. ) or
  69. ((n.nodetype=callparan) and not(assigned(tcallparanode(n).right))) or
  70. ((n.nodetype=loadn) and
  71. not((tloadnode(n).symtableentry.typ in [staticvarsym,localvarsym,paravarsym]) and
  72. (vo_volatile in tabstractvarsym(tloadnode(n).symtableentry).varoptions))
  73. ) then
  74. result:=fen_true
  75. else
  76. begin
  77. pboolean(arg)^:=false;
  78. result:=fen_norecurse_true;
  79. end;
  80. end;
  81. type
  82. tlists = record
  83. nodelist : tfplist;
  84. locationlist : tfplist;
  85. equalto : tfplist;
  86. refs : tfplist;
  87. avail : TDFASet;
  88. end;
  89. plists = ^tlists;
  90. { collectnodes needs the address of itself to call foreachnodestatic,
  91. so we need a wrapper because @<func> inside <func doesn't work }
  92. function collectnodes(var n:tnode; arg: pointer) : foreachnoderesult;forward;
  93. function collectnodes2(var n:tnode; arg: pointer) : foreachnoderesult;
  94. begin
  95. result:=collectnodes(n,arg);
  96. end;
  97. function collectnodes(var n:tnode; arg: pointer) : foreachnoderesult;
  98. { when compiling a tree like
  99. and
  100. / \
  101. and C
  102. / \
  103. A B
  104. all expressions of B are available during evaluation of C. However considerung the whole expression,
  105. values of B and C might not be available due to short boolean evaluation.
  106. So recurseintobooleanchain detectes such chained and/or expressions and makes sub-expressions of B
  107. available during the evaluation of C
  108. firstleftend is later used to remove all sub expressions of B and C by storing the expression count
  109. in the cse table after handling A
  110. }
  111. var
  112. firstleftend : longint;
  113. procedure recurseintobooleanchain(t : tnodetype;n : tnode);
  114. begin
  115. if (tbinarynode(n).left.nodetype=t) and is_boolean(tbinarynode(n).left.resultdef) then
  116. recurseintobooleanchain(t,tbinarynode(n).left)
  117. else
  118. foreachnodestatic(pm_postprocess,tbinarynode(n).left,@collectnodes2,arg);
  119. firstleftend:=min(plists(arg)^.nodelist.count,firstleftend);
  120. foreachnodestatic(pm_postprocess,tbinarynode(n).right,@collectnodes2,arg);
  121. end;
  122. var
  123. i : longint;
  124. tempdef : tdef;
  125. begin
  126. result:=fen_false;
  127. { don't add the tree below an untyped const parameter: there is
  128. no information available that this kind of tree actually needs
  129. to be addresable, this could be improved }
  130. { the nodes below a type conversion node created for an absolute
  131. reference cannot be handled separately, because the absolute reference
  132. may have special requirements (no regability, must be in memory, ...)
  133. }
  134. if (((n.nodetype=callparan) and
  135. (tcallparanode(n).left.resultdef.typ=formaldef) and
  136. (tcallparanode(n).parasym.varspez=vs_const)) or
  137. ((n.nodetype=typeconvn) and
  138. (nf_absolute in n.flags))
  139. ) then
  140. begin
  141. result:=fen_norecurse_false;
  142. exit;
  143. end;
  144. if
  145. { node possible to add? }
  146. assigned(n.resultdef) and
  147. ((
  148. { regable expressions }
  149. (actualtargetnode(@n)^.flags*[nf_write,nf_modify,nf_address_taken]=[]) and
  150. (
  151. tstoreddef(n.resultdef).is_intregable or
  152. tstoreddef(n.resultdef).is_fpuregable or
  153. tstoreddef(n.resultdef).is_const_intregable
  154. ) and
  155. { is_int/fpuregable allows arrays and records to be in registers, cse cannot handle this }
  156. (
  157. not(n.resultdef.typ in [arraydef,recorddef]) or
  158. (
  159. (
  160. (n.resultdef.typ = recorddef) and
  161. tabstractrecordsymtable(tabstractrecorddef(n.resultdef).symtable).has_single_field(tempdef)
  162. ) or
  163. is_dynamic_array(n.resultdef) or
  164. (
  165. not(is_special_array(tstoreddef(n.resultdef))) and
  166. not(tstoreddef(n.resultdef).is_intregable) and
  167. not(tstoreddef(n.resultdef).is_fpuregable)
  168. )
  169. )
  170. ) and
  171. { same for voiddef }
  172. not(is_void(n.resultdef)) and
  173. { adding tempref and callpara nodes itself is worthless but
  174. their complexity is probably <= 1 anyways
  175. neither add setelementn nodes because the compiler sometimes depends on the fact
  176. that a certain node stays a setelementn, this does not hurt either because
  177. setelementn nodes itself generate no real code (except moving data into register) }
  178. not(n.nodetype in [temprefn,callparan,setelementn]) and
  179. { node worth to add?
  180. We consider almost every node because even loading a variables from
  181. a register instead of memory is more beneficial. This behaviour should
  182. not increase register pressure because if a variable is already
  183. in a register, the reg. allocator can merge the nodes. If a variable
  184. is loaded from memory, loading this variable and spilling another register
  185. should not add a speed penalty.
  186. }
  187. {
  188. load nodes are not considered if they load para or local symbols from the
  189. current stack frame, those are in registers anyways if possible
  190. }
  191. (not(actualtargetnode(@n)^.nodetype=loadn) or
  192. not(tloadnode(actualtargetnode(@n)^).symtableentry.typ in [paravarsym,localvarsym,staticvarsym]) or
  193. { apply cse on non-regable variables }
  194. ((tloadnode(actualtargetnode(@n)^).symtableentry.typ in [paravarsym,localvarsym,staticvarsym]) and
  195. not(tabstractvarsym(tloadnode(actualtargetnode(@n)^).symtableentry).is_regvar(true)) and
  196. not(vo_volatile in tabstractvarsym(tloadnode(actualtargetnode(@n)^).symtableentry).varoptions)) or
  197. (node_complexity(n)>1)
  198. ) and
  199. {
  200. Const nodes however are only considered if their complexity is >1
  201. This might be the case for the risc architectures if they need
  202. more than one instruction to load this particular value
  203. }
  204. (not(is_constnode(n)) or (node_complexity(n)>1)))
  205. {$if not(defined(i386)) and not(defined(i8086))}
  206. or
  207. { store reference of expression? }
  208. { loading the address of a global symbol takes typically more than
  209. one instruction on every platform except i8086/i386
  210. so consider in this case loading the address of the data
  211. }
  212. (((n.resultdef.typ in [arraydef,recorddef]) or is_object(n.resultdef)) and not(is_dynamic_array(n.resultdef)) and
  213. (n.nodetype=loadn) and
  214. (tloadnode(n).symtableentry.typ=staticvarsym)
  215. )
  216. {$endif not(defined(i386)) and not(defined(i8086))}
  217. ) then
  218. begin
  219. plists(arg)^.nodelist.Add(n);
  220. plists(arg)^.locationlist.Add(@n);
  221. plists(arg)^.refs.Add(nil);
  222. plists(arg)^.equalto.Add(pointer(-1));
  223. DFASetInclude(plists(arg)^.avail,plists(arg)^.nodelist.count-1);
  224. for i:=0 to plists(arg)^.nodelist.count-2 do
  225. begin
  226. if tnode(plists(arg)^.nodelist[i]).isequal(n) and DFASetIn(plists(arg)^.avail,i) then
  227. begin
  228. { use always the first occurence }
  229. if plists(arg)^.equalto[i]<>pointer(-1) then
  230. plists(arg)^.equalto[plists(arg)^.nodelist.count-1]:=plists(arg)^.equalto[i]
  231. else
  232. plists(arg)^.equalto[plists(arg)^.nodelist.count-1]:=pointer(ptrint(i));
  233. plists(arg)^.refs[i]:=pointer(plists(arg)^.refs[i])+1;
  234. { tree has been found, no need to search further,
  235. sub-trees have been added by the first occurence of
  236. the tree already }
  237. result:=fen_norecurse_false;
  238. break;
  239. end;
  240. end;
  241. end;
  242. { boolean and/or require a special handling: after evaluating the and/or node,
  243. the expressions of the right side might not be available due to short boolean
  244. evaluation, so after handling the right side, mark those expressions
  245. as unavailable }
  246. if (n.nodetype in [orn,andn]) and is_boolean(taddnode(n).left.resultdef) then
  247. begin
  248. firstleftend:=high(longint);
  249. recurseintobooleanchain(n.nodetype,n);
  250. for i:=firstleftend to plists(arg)^.nodelist.count-1 do
  251. DFASetExclude(plists(arg)^.avail,i);
  252. result:=fen_norecurse_false;
  253. end;
  254. {$ifdef cpuhighleveltarget}
  255. { The high level targets use the functionality from ncgnstld for
  256. nested accesses, and that one stores the complete location of the
  257. nested variable in tloadnode.left rather than only the location of
  258. the parent context containing it. This causes problems with the
  259. CSE in case the nested variable is used as an lvalue, so disable
  260. CSE in that case
  261. }
  262. if (n.nodetype=loadn) and assigned(tloadnode(n).left) then
  263. result:=fen_norecurse_false;
  264. {$endif}
  265. end;
  266. function searchcsedomain(var n: tnode; arg: pointer) : foreachnoderesult;
  267. var
  268. csedomain : boolean;
  269. lists : tlists;
  270. templist : tfplist;
  271. i : longint;
  272. def : tstoreddef;
  273. nodes : tblocknode;
  274. creates,
  275. statements : tstatementnode;
  276. deletetemp : ttempdeletenode;
  277. hp : ttempcreatenode;
  278. addrstored : boolean;
  279. hp2 : tnode;
  280. begin
  281. result:=fen_false;
  282. nodes:=nil;
  283. if (n.nodetype in cseinvariant) and
  284. { a setelement node is cseinvariant, but it might not be replaced by a block so
  285. it cannot be the root of the cse search }
  286. (n.nodetype<>setelementn) then
  287. begin
  288. csedomain:=true;
  289. foreachnodestatic(pm_postprocess,n,@searchsubdomain,@csedomain);
  290. if not(csedomain) then
  291. begin
  292. { try to transform the tree to get better cse domains, consider:
  293. + (1)
  294. / \
  295. (2) + C
  296. / \
  297. A B
  298. if A is not cse'able but B and C are, then the compiler cannot do cse so the tree is transformed into
  299. +
  300. / \
  301. A +
  302. / \
  303. B C
  304. Because A could be another tree of this kind, the whole process is done in a while loop
  305. }
  306. if (n.nodetype in [andn,orn,addn,muln]) and
  307. (n.nodetype=tbinarynode(n).left.nodetype) and
  308. { do is optimizations only for integers, reals (no currency!), vectors, sets or booleans }
  309. (is_integer(n.resultdef) or is_real(n.resultdef) or is_vector(n.resultdef) or is_set(n.resultdef) or
  310. is_boolean(n.resultdef)) and
  311. { either if fastmath is on }
  312. ((cs_opt_fastmath in current_settings.optimizerswitches) or
  313. { or for the logical operators, they cannot overflow }
  314. (n.nodetype in [andn,orn]) or
  315. { or for integers if range checking is off }
  316. ((is_integer(n.resultdef) and
  317. (n.localswitches*[cs_check_range,cs_check_overflow]=[]) and
  318. (tbinarynode(n).left.localswitches*[cs_check_range,cs_check_overflow]=[]))) or
  319. { for sets, we can do this always }
  320. (is_set(n.resultdef))
  321. ) then
  322. while (n.nodetype=tbinarynode(n).left.nodetype) and
  323. { if not both or none of the nodes is short/full boolean evaluated, we cannot do this optimization as it might change
  324. semantics, there are border cases where this might not apply, but so far it is not worth the effort to check }
  325. (not(is_boolean(n.resultdef)) or not(n.nodetype in [andn,orn]) or (doshortbooleval(n)=doshortbooleval(tbinarynode(n).left))) and
  326. { the resulttypes of the operands we'll swap must be equal,
  327. required in case of a 32x32->64 multiplication, then we
  328. cannot swap out one of the 32 bit operands for a 64 bit one
  329. }
  330. (tbinarynode(tbinarynode(n).left).left.resultdef=tbinarynode(n).left.resultdef) and
  331. (tbinarynode(n).left.resultdef=tbinarynode(n).right.resultdef) do
  332. begin
  333. csedomain:=true;
  334. foreachnodestatic(pm_postprocess,tbinarynode(n).right,@searchsubdomain,@csedomain);
  335. if csedomain then
  336. begin
  337. csedomain:=true;
  338. foreachnodestatic(pm_postprocess,tbinarynode(tbinarynode(n).left).right,@searchsubdomain,@csedomain);
  339. if csedomain then
  340. begin
  341. {
  342. move the full boolean evaluation of (2) to (1), if it was there (so it again applies to A and
  343. what follows) }
  344. { this cannot happen anymore, see above, so the condition is not needed anymore
  345. if not(doshortbooleval(tbinarynode(n).left)) and
  346. doshortbooleval(n) then }
  347. begin
  348. n.localswitches:=n.localswitches+(tbinarynode(n).left.localswitches*[cs_full_boolean_eval]);
  349. exclude(tbinarynode(n).left.localswitches,cs_full_boolean_eval);
  350. if (n.nodetype in [orn,andn]) then
  351. begin
  352. if (tbinarynode(n).left.nodetype in [orn,andn]) then
  353. taddnode(tbinarynode(n).left).addnodeflags:=taddnode(tbinarynode(n).left).addnodeflags+
  354. (taddnode(n).addnodeflags*[anf_short_bool]);
  355. exclude(taddnode(n).addnodeflags,anf_short_bool);
  356. end;
  357. end;
  358. hp2:=tbinarynode(tbinarynode(n).left).left;
  359. tbinarynode(tbinarynode(n).left).left:=tbinarynode(tbinarynode(n).left).right;
  360. tbinarynode(tbinarynode(n).left).right:=tbinarynode(n).right;
  361. tbinarynode(n).right:=tbinarynode(n).left;
  362. tbinarynode(n).left:=hp2;
  363. { the transformed tree could result in new possibilities to fold constants
  364. so force a firstpass on the root node }
  365. exclude(tbinarynode(n).right.transientflags,tnf_pass1_done);
  366. do_firstpass(tbinarynode(n).right);
  367. end
  368. else
  369. break;
  370. end
  371. else
  372. break;
  373. end;
  374. end
  375. else
  376. begin
  377. statements:=nil;
  378. result:=fen_norecurse_true;
  379. {$ifdef csedebug}
  380. writeln('============ cse domain ==================');
  381. printnode(output,n);
  382. writeln('Complexity: ',node_complexity(n));
  383. {$endif csedebug}
  384. lists.nodelist:=tfplist.create;
  385. lists.locationlist:=tfplist.create;
  386. lists.equalto:=tfplist.create;
  387. lists.refs:=tfplist.create;
  388. foreachnodestatic(pm_postprocess,n,@collectnodes,@lists);
  389. templist:=tfplist.create;
  390. templist.count:=lists.nodelist.count;
  391. { check all nodes if one is used more than once }
  392. for i:=0 to lists.nodelist.count-1 do
  393. begin
  394. { current node used more than once? }
  395. if assigned(lists.refs[i]) then
  396. begin
  397. if not(assigned(statements)) then
  398. begin
  399. nodes:=internalstatements(statements);
  400. addstatement(statements,internalstatements(creates));
  401. end;
  402. def:=tstoreddef(tnode(lists.nodelist[i]).resultdef);
  403. { we cannot handle register stored records or array in CSE yet
  404. but we can store their reference }
  405. addrstored:=((def.typ in [arraydef,recorddef]) or is_object(def)) and not(is_dynamic_array(def));
  406. if addrstored then
  407. templist[i]:=ctempcreatenode.create_value(cpointerdef.getreusable(def),voidpointertype.size,tt_persistent,
  408. true,caddrnode.create_internal(tnode(lists.nodelist[i])))
  409. else
  410. templist[i]:=ctempcreatenode.create_value(def,def.size,tt_persistent,
  411. def.is_intregable or def.is_fpuregable or def.is_const_intregable,tnode(lists.nodelist[i]));
  412. { the value described by the temp. is immutable and the temp. can be always in register
  413. ttempcreatenode.create normally takes care of the register location but it does not
  414. know about immutability so it cannot take care of managed types }
  415. ttempcreatenode(templist[i]).includetempflag(ti_const);
  416. ttempcreatenode(templist[i]).includetempflag(ti_may_be_in_reg);
  417. { make debugging easier and set temp. location to the original location }
  418. tnode(templist[i]).fileinfo:=tnode(lists.nodelist[i]).fileinfo;
  419. addstatement(creates,tnode(templist[i]));
  420. { the delete node has no semantic use yet, it is just used to clean up memory }
  421. deletetemp:=ctempdeletenode.create(ttempcreatenode(templist[i]));
  422. deletetemp.includetempflag(ti_cleanup_only);
  423. addstatement(tstatementnode(arg^),deletetemp);
  424. { make debugging easier and set temp. location to the original location }
  425. creates.fileinfo:=tnode(lists.nodelist[i]).fileinfo;
  426. hp:=ttempcreatenode(templist[i]);
  427. do_firstpass(tnode(hp));
  428. templist[i]:=hp;
  429. if addrstored then
  430. pnode(lists.locationlist[i])^:=cderefnode.Create(ctemprefnode.create(ttempcreatenode(templist[i])))
  431. else
  432. pnode(lists.locationlist[i])^:=ctemprefnode.create(ttempcreatenode(templist[i]));
  433. { make debugging easier and set temp. location to the original location }
  434. pnode(lists.locationlist[i])^.fileinfo:=tnode(lists.nodelist[i]).fileinfo;
  435. do_firstpass(pnode(lists.locationlist[i])^);
  436. {$ifdef csedebug}
  437. printnode(output,statements);
  438. {$endif csedebug}
  439. end
  440. { current node reference to another node? }
  441. else if lists.equalto[i]<>pointer(-1) then
  442. begin
  443. def:=tstoreddef(tnode(lists.nodelist[i]).resultdef);
  444. { we cannot handle register stored records or array in CSE yet
  445. but we can store their reference }
  446. addrstored:=((def.typ in [arraydef,recorddef]) or is_object(def)) and not(is_dynamic_array(def));
  447. {$if defined(csedebug) or defined(csestats)}
  448. writeln;
  449. writeln('!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!');
  450. writeln('Complexity: ',node_complexity(tnode(lists.nodelist[i])),' Node ',i,' equals Node ',ptrint(lists.equalto[i]));
  451. printnode(output,tnode(lists.nodelist[i]));
  452. printnode(output,tnode(lists.nodelist[ptrint(lists.equalto[i])]));
  453. writeln('!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!');
  454. writeln;
  455. {$endif defined(csedebug) or defined(csestats)}
  456. templist[i]:=templist[ptrint(lists.equalto[i])];
  457. if addrstored then
  458. pnode(lists.locationlist[i])^:=cderefnode.Create(ctemprefnode.create(ttempcreatenode(templist[ptrint(lists.equalto[i])])))
  459. else
  460. pnode(lists.locationlist[i])^:=ctemprefnode.create(ttempcreatenode(templist[ptrint(lists.equalto[i])]));
  461. { make debugging easier and set temp. location to the original location }
  462. pnode(lists.locationlist[i])^.fileinfo:=tnode(lists.nodelist[i]).fileinfo;
  463. do_firstpass(pnode(lists.locationlist[i])^);
  464. end;
  465. end;
  466. { clean up unused trees }
  467. for i:=0 to lists.nodelist.count-1 do
  468. if lists.equalto[i]<>pointer(-1) then
  469. tnode(lists.nodelist[i]).free; // no nil needed
  470. {$ifdef csedebug}
  471. writeln('nodes: ',lists.nodelist.count);
  472. writeln('==========================================');
  473. {$endif csedebug}
  474. FreeAndNil(lists.nodelist);
  475. FreeAndNil(lists.locationlist);
  476. FreeAndNil(lists.equalto);
  477. FreeAndNil(lists.refs);
  478. templist.free;
  479. templist := nil;
  480. if assigned(statements) then
  481. begin
  482. { call para nodes need a special handling because
  483. they can be only children nodes of call nodes
  484. so the initialization code is inserted below the
  485. call para node
  486. }
  487. if n.nodetype=callparan then
  488. begin
  489. addstatement(statements,tcallparanode(n).left);
  490. tcallparanode(n).left:=nodes;
  491. do_firstpass(tcallparanode(n).left);
  492. end
  493. else
  494. begin
  495. addstatement(statements,n);
  496. n:=nodes;
  497. do_firstpass(n);
  498. end;
  499. {$ifdef csedebug}
  500. printnode(output,nodes);
  501. {$endif csedebug}
  502. end;
  503. end
  504. end;
  505. end;
  506. function do_optcse(var rootnode : tnode) : tnode;
  507. var
  508. deletes,
  509. statements : tstatementnode;
  510. deleteblock,
  511. rootblock : tblocknode;
  512. begin
  513. {$ifdef csedebug}
  514. writeln('====================================================================================');
  515. writeln('CSE optimization pass started');
  516. writeln('====================================================================================');
  517. printnode(rootnode);
  518. writeln('====================================================================================');
  519. writeln;
  520. {$endif csedebug}
  521. deleteblock:=internalstatements(deletes);
  522. foreachnodestatic(pm_postprocess,rootnode,@searchcsedomain,@deletes);
  523. rootblock:=internalstatements(statements);
  524. addstatement(statements,rootnode);
  525. addstatement(statements,deleteblock);
  526. rootnode:=rootblock;
  527. do_firstpass(rootnode);
  528. {$ifdef csedebug}
  529. writeln('====================================================================================');
  530. writeln('CSE optimization result');
  531. writeln('====================================================================================');
  532. printnode(rootnode);
  533. writeln('====================================================================================');
  534. writeln;
  535. {$endif csedebug}
  536. result:=nil;
  537. end;
  538. type
  539. tconstentry = record
  540. valuenode : tnode;
  541. weight : TCGInt;
  542. temp : ttempcreatenode;
  543. end;
  544. pconstentry = ^tconstentry;
  545. tconstentries = array of tconstentry;
  546. pconstentries = ^tconstentries;
  547. function CSEOnReference(n : tnode) : Boolean;
  548. begin
  549. Result:=(n.nodetype=loadn) and (tloadnode(n).symtableentry.typ=staticvarsym)
  550. and ((vo_is_thread_var in tstaticvarsym(tloadnode(n).symtableentry).varoptions) or
  551. (cs_create_pic in current_settings.moduleswitches)
  552. {$if defined(aarch64) or defined(sparc) or defined(sparc64) or defined(riscv)}
  553. or (not(tabstractvarsym(tloadnode(n).symtableentry).is_regvar(false)))
  554. {$endif defined(aarch64) or defined(sparc) or defined(sparc64) or defined(riscv)}
  555. );
  556. end;
  557. function collectconsts(var n:tnode; arg: pointer) : foreachnoderesult;
  558. var
  559. consts: pconstentries;
  560. found: Boolean;
  561. i: Integer;
  562. begin
  563. result:=fen_true;
  564. consts:=pconstentries(arg);
  565. if ((n.nodetype=realconstn)
  566. {$ifdef x86}
  567. { x87 consts would end up in memory, so loading them in temps. makes no sense }
  568. and use_vectorfpu(n.resultdef)
  569. {$endif x86}
  570. ) or
  571. CSEOnReference(n) then
  572. begin
  573. found:=false;
  574. i:=0;
  575. for i:=0 to High(consts^) do
  576. begin
  577. if tnode(consts^[i].valuenode).isequal(n) then
  578. begin
  579. found:=true;
  580. break;
  581. end;
  582. end;
  583. if found then
  584. begin
  585. inc(consts^[i].weight);
  586. { non-sectioned threadvars really hurt so do more aggressive cse on them }
  587. if not(tf_section_threadvars in target_info.flags) and (n.nodetype=loadn) and (tloadnode(n).symtableentry.typ=staticvarsym) and (vo_is_thread_var in tstaticvarsym(tloadnode(n).symtableentry).varoptions) then
  588. inc(consts^[i].weight);
  589. end
  590. else
  591. begin
  592. SetLength(consts^,length(consts^)+1);
  593. with consts^[high(consts^)] do
  594. begin
  595. valuenode:=n.getcopy;
  596. valuenode.fileinfo:=current_procinfo.entrypos;
  597. weight:=1;
  598. end;
  599. end;
  600. end;
  601. end;
  602. function replaceconsts(var n:tnode; arg: pointer) : foreachnoderesult;
  603. var
  604. hp: tnode;
  605. begin
  606. result:=fen_true;
  607. if tnode(pconstentry(arg)^.valuenode).isequal(n) then
  608. begin
  609. { shall we take the address? }
  610. if CSEOnReference(pconstentry(arg)^.valuenode) then
  611. begin
  612. hp:=ctypeconvnode.create_internal(cderefnode.create(ctemprefnode.create(pconstentry(arg)^.temp)),pconstentry(arg)^.valuenode.resultdef);
  613. ttypeconvnode(hp).left.fileinfo:=n.fileinfo;
  614. end
  615. else
  616. hp:=ctemprefnode.create(pconstentry(arg)^.temp);
  617. hp.fileinfo:=n.fileinfo;
  618. n.Free;
  619. n:=hp;
  620. do_firstpass(n);
  621. end;
  622. end;
  623. function do_consttovar(var rootnode : tnode) : tnode;
  624. var
  625. constentries : tconstentries;
  626. Procedure QuickSort(L, R : Longint);
  627. var
  628. I, J, P: Longint;
  629. Item, Q : tconstentry;
  630. begin
  631. repeat
  632. I := L;
  633. J := R;
  634. P := (L + R) div 2;
  635. repeat
  636. Item := constentries[P];
  637. while Item.weight>constentries[I].weight do
  638. I := I + 1;
  639. while Item.weight<constentries[J].weight do
  640. J := J - 1;
  641. If I <= J then
  642. begin
  643. Q := constentries[I];
  644. constentries[I] := constentries[J];
  645. constentries[J] := Q;
  646. if P = I then
  647. P := J
  648. else if P = J then
  649. P := I;
  650. I := I + 1;
  651. J := J - 1;
  652. end;
  653. until I > J;
  654. if L < J then
  655. QuickSort(L, J);
  656. L := I;
  657. until I >= R;
  658. end;
  659. var
  660. creates,
  661. deletes,
  662. statements : tstatementnode;
  663. createblock,
  664. deleteblock,
  665. rootblock : tblocknode;
  666. i, max_fpu_regs_assigned, fpu_regs_assigned,
  667. max_int_regs_assigned, int_regs_assigned: Integer;
  668. old_current_filepos: tfileposinfo;
  669. begin
  670. {$ifdef csedebug}
  671. writeln('====================================================================================');
  672. writeln('Const optimization pass started');
  673. writeln('====================================================================================');
  674. printnode(rootnode);
  675. writeln('====================================================================================');
  676. writeln;
  677. {$endif csedebug}
  678. foreachnodestatic(pm_postprocess,rootnode,@collectconsts,@constentries);
  679. createblock:=nil;
  680. deleteblock:=nil;
  681. rootblock:=nil;
  682. { estimate how many int registers can be used }
  683. if pi_do_call in current_procinfo.flags then
  684. max_int_regs_assigned:=length(paramanager.get_saved_registers_int(current_procinfo.procdef.proccalloption))
  685. { we store only addresses, so take care of the relation between address sizes and register sizes }
  686. div max(sizeof(PtrUInt) div sizeof(ALUUInt),1)
  687. { heuristics, just use a quarter of all registers at maximum }
  688. div 4
  689. else
  690. max_int_regs_assigned:=max(first_int_imreg div 4,1);
  691. {$if defined(x86) or defined(aarch64) or defined(arm)}
  692. { x86, aarch64 and arm (neglecting fpa) use mm registers for floats }
  693. if pi_do_call in current_procinfo.flags then
  694. { heuristics, just use a fifth of all registers at maximum }
  695. max_fpu_regs_assigned:=length(paramanager.get_saved_registers_mm(current_procinfo.procdef.proccalloption)) div 5
  696. else
  697. max_fpu_regs_assigned:=max(first_mm_imreg div 5,1);
  698. {$else defined(x86) or defined(aarch64) or defined(arm)}
  699. if pi_do_call in current_procinfo.flags then
  700. { heuristics, just use a fifth of all registers at maximum }
  701. max_fpu_regs_assigned:=length(paramanager.get_saved_registers_fpu(current_procinfo.procdef.proccalloption)) div 5
  702. else
  703. max_fpu_regs_assigned:=max(first_fpu_imreg div 5,1);
  704. {$endif defined(x86) or defined(aarch64) or defined(arm)}
  705. fpu_regs_assigned:=0;
  706. int_regs_assigned:=0;
  707. if Length(constentries)>0 then
  708. begin
  709. { sort entries by weight }
  710. QuickSort(0,High(constentries));
  711. { assign only the constants with the highest weight to a register }
  712. for i:=High(constentries) downto 0 do
  713. begin
  714. if (constentries[i].valuenode.nodetype=realconstn) and
  715. { if there is a call, we need most likely to save/restore a register }
  716. ((constentries[i].weight>3) or
  717. ((constentries[i].weight>1) and not(pi_do_call in current_procinfo.flags)))
  718. then
  719. begin
  720. if fpu_regs_assigned>=max_fpu_regs_assigned then
  721. break;
  722. old_current_filepos:=current_filepos;
  723. current_filepos:=current_procinfo.entrypos;
  724. if not(assigned(createblock)) then
  725. begin
  726. rootblock:=internalstatements(statements);
  727. createblock:=internalstatements(creates);
  728. deleteblock:=internalstatements(deletes);
  729. end;
  730. constentries[i].temp:=ctempcreatenode.create(constentries[i].valuenode.resultdef,
  731. constentries[i].valuenode.resultdef.size,tt_persistent,true);
  732. addstatement(creates,constentries[i].temp);
  733. addstatement(creates,cassignmentnode.create_internal(ctemprefnode.create(constentries[i].temp),constentries[i].valuenode));
  734. current_filepos:=old_current_filepos;
  735. foreachnodestatic(pm_postprocess,rootnode,@replaceconsts,@constentries[i]);
  736. inc(fpu_regs_assigned);
  737. end
  738. else if CSEOnReference(constentries[i].valuenode) and
  739. { if there is a call, we need most likely to save/restore a register }
  740. ((constentries[i].weight>2) or
  741. ((constentries[i].weight>1) and not(pi_do_call in current_procinfo.flags)))
  742. then
  743. begin
  744. if int_regs_assigned>=max_int_regs_assigned then
  745. break;
  746. old_current_filepos:=current_filepos;
  747. current_filepos:=current_procinfo.entrypos;
  748. if not(assigned(createblock)) then
  749. begin
  750. rootblock:=internalstatements(statements);
  751. createblock:=internalstatements(creates);
  752. deleteblock:=internalstatements(deletes);
  753. end;
  754. constentries[i].temp:=ctempcreatenode.create(cpointerdef.getreusable(constentries[i].valuenode.resultdef),
  755. voidpointertype.size,tt_persistent,true);
  756. addstatement(creates,constentries[i].temp);
  757. addstatement(creates,cassignmentnode.create_internal(ctemprefnode.create(constentries[i].temp),
  758. caddrnode.create_internal(constentries[i].valuenode)));
  759. current_filepos:=old_current_filepos;
  760. foreachnodestatic(pm_postprocess,rootnode,@replaceconsts,@constentries[i]);
  761. inc(int_regs_assigned);
  762. end;
  763. end;
  764. end;
  765. if assigned(createblock) then
  766. begin
  767. addstatement(statements,createblock);
  768. addstatement(statements,rootnode);
  769. addstatement(statements,deleteblock);
  770. rootnode:=rootblock;
  771. do_firstpass(rootnode);
  772. end;
  773. {$ifdef csedebug}
  774. writeln('====================================================================================');
  775. writeln('Const optimization result');
  776. writeln('====================================================================================');
  777. printnode(rootnode);
  778. writeln('====================================================================================');
  779. writeln;
  780. {$endif csedebug}
  781. result:=nil;
  782. end;
  783. end.