optcse.pas 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490
  1. {
  2. Common subexpression elimination on base blocks
  3. Copyright (c) 2005-2012 by Florian Klaempfl
  4. This program is free software; you can redistribute it and/or modify
  5. it under the terms of the GNU General Public License as published by
  6. the Free Software Foundation; either version 2 of the License, or
  7. (at your option) any later version.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. GNU General Public License for more details.
  12. You should have received a copy of the GNU General Public License
  13. along with this program; if not, write to the Free Software
  14. Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  15. ****************************************************************************
  16. }
  17. unit optcse;
  18. {$i fpcdefs.inc}
  19. { $define csedebug}
  20. { $define csestats}
  21. interface
  22. uses
  23. node;
  24. {
  25. the function creates non optimal code so far:
  26. - call para nodes are cse barriers because they can be reordered and thus the
  27. temp. creation could be done too late
  28. - the cse knows nothing about register pressure. In case of high register pressure, cse might
  29. have a negative impact
  30. - the list of cseinvariant node types and inline numbers is not complete yet
  31. Further, it could be done probably in a faster way though the complexity can't probably not reduced
  32. }
  33. function do_optcse(var rootnode : tnode) : tnode;
  34. implementation
  35. uses
  36. globtype,globals,
  37. cutils,cclasses,
  38. nutils,
  39. nbas,nld,ninl,ncal,nadd,nmem,
  40. pass_1,
  41. symconst,symdef,symsym,
  42. defutil,
  43. optbase;
  44. const
  45. cseinvariant : set of tnodetype = [addn,muln,subn,divn,slashn,modn,andn,orn,xorn,notn,vecn,
  46. derefn,equaln,unequaln,ltn,gtn,lten,gten,typeconvn,subscriptn,
  47. inn,symdifn,shrn,shln,ordconstn,realconstn,unaryminusn,pointerconstn,stringconstn,setconstn,niln,
  48. setelementn,{arrayconstructorn,arrayconstructorrangen,}
  49. isn,asn,starstarn,nothingn,temprefn,loadparentfpn {,callparan},assignn];
  50. function searchsubdomain(var n:tnode; arg: pointer) : foreachnoderesult;
  51. begin
  52. if (n.nodetype in cseinvariant) or
  53. ((n.nodetype=inlinen) and
  54. (tinlinenode(n).inlinenumber in [in_assigned_x])
  55. ) or
  56. ((n.nodetype=callparan) and not(assigned(tcallparanode(n).right))) or
  57. ((n.nodetype=loadn) and
  58. not((tloadnode(n).symtableentry.typ in [staticvarsym,localvarsym,paravarsym]) and
  59. (vo_volatile in tabstractvarsym(tloadnode(n).symtableentry).varoptions))
  60. ) then
  61. result:=fen_true
  62. else
  63. begin
  64. pboolean(arg)^:=false;
  65. result:=fen_norecurse_true;
  66. end;
  67. end;
  68. type
  69. tlists = record
  70. nodelist : tfplist;
  71. locationlist : tfplist;
  72. equalto : tfplist;
  73. refs : tfplist;
  74. avail : TDFASet;
  75. end;
  76. plists = ^tlists;
  77. { collectnodes needs the address of itself to call foreachnodestatic,
  78. so we need a wrapper because @<func> inside <func doesn't work }
  79. function collectnodes(var n:tnode; arg: pointer) : foreachnoderesult;forward;
  80. function collectnodes2(var n:tnode; arg: pointer) : foreachnoderesult;
  81. begin
  82. result:=collectnodes(n,arg);
  83. end;
  84. function collectnodes(var n:tnode; arg: pointer) : foreachnoderesult;
  85. { when compiling a tree like
  86. and
  87. / \
  88. and C
  89. / \
  90. A B
  91. all expressions of B are available during evaluation of C. However considerung the whole expression,
  92. values of B and C might not be available due to short boolean evaluation.
  93. So recurseintobooleanchain detectes such chained and/or expressions and makes sub-expressions of B
  94. available during the evaluation of C
  95. firstleftend is later used to remove all sub expressions of B and C by storing the expression count
  96. in the cse table after handling A
  97. }
  98. var
  99. firstleftend : longint;
  100. procedure recurseintobooleanchain(t : tnodetype;n : tnode);
  101. begin
  102. if (tbinarynode(n).left.nodetype=t) and is_boolean(tbinarynode(n).left.resultdef) then
  103. recurseintobooleanchain(t,tbinarynode(n).left)
  104. else
  105. foreachnodestatic(pm_postprocess,tbinarynode(n).left,@collectnodes2,arg);
  106. firstleftend:=min(plists(arg)^.nodelist.count,firstleftend);
  107. foreachnodestatic(pm_postprocess,tbinarynode(n).right,@collectnodes2,arg);
  108. end;
  109. var
  110. i : longint;
  111. begin
  112. result:=fen_false;
  113. { don't add the tree below an untyped const parameter: there is
  114. no information available that this kind of tree actually needs
  115. to be addresable, this could be improved }
  116. if ((n.nodetype=callparan) and
  117. (tcallparanode(n).left.resultdef.typ=formaldef) and
  118. (tcallparanode(n).parasym.varspez=vs_const)) then
  119. begin
  120. result:=fen_norecurse_false;
  121. exit;
  122. end;
  123. if
  124. { node possible to add? }
  125. assigned(n.resultdef) and
  126. (
  127. { regable expressions }
  128. (actualtargetnode(@n)^.flags*[nf_write,nf_modify]=[]) and
  129. ((tstoreddef(n.resultdef).is_intregable or tstoreddef(n.resultdef).is_fpuregable) and
  130. { is_int/fpuregable allows arrays and records to be in registers, cse cannot handle this }
  131. (not(n.resultdef.typ in [arraydef,recorddef])) and
  132. { same for voiddef }
  133. not(is_void(n.resultdef)) and
  134. { adding tempref and callpara nodes itself is worthless but
  135. their complexity is probably <= 1 anyways
  136. neither add setelementn nodes because the compiler sometimes depends on the fact
  137. that a certain node stays a setelementn, this does not hurt either because
  138. setelementn nodes itself generate no real code (except moving data into register) }
  139. not(n.nodetype in [temprefn,callparan,setelementn]) and
  140. { node worth to add?
  141. We consider almost every node because even loading a variables from
  142. a register instead of memory is more beneficial. This behaviour should
  143. not increase register pressure because if a variable is already
  144. in a register, the reg. allocator can merge the nodes. If a variable
  145. is loaded from memory, loading this variable and spilling another register
  146. should not add a speed penalty.
  147. }
  148. {
  149. load nodes are not considered if they load para or local symbols from the
  150. current stack frame, those are in registers anyways if possible
  151. }
  152. (not(actualtargetnode(@n)^.nodetype=loadn) or
  153. not(tloadnode(actualtargetnode(@n)^).symtableentry.typ in [paravarsym,localvarsym,staticvarsym]) or
  154. { apply cse on non-regable variables }
  155. ((tloadnode(actualtargetnode(@n)^).symtableentry.typ in [paravarsym,localvarsym,staticvarsym]) and
  156. not(tabstractvarsym(tloadnode(actualtargetnode(@n)^).symtableentry).is_regvar(false)) and
  157. not(vo_volatile in tabstractvarsym(tloadnode(actualtargetnode(@n)^).symtableentry).varoptions)) or
  158. (node_complexity(n)>1)
  159. ) and
  160. {
  161. Const nodes however are only considered if their complexity is >1
  162. This might be the case for the risc architectures if they need
  163. more than one instruction to load this particular value
  164. }
  165. (not(is_constnode(n)) or (node_complexity(n)>1)))
  166. {$ifndef x86}
  167. or
  168. { store reference of expression? }
  169. { loading the address of a global symbol takes typically more than
  170. one instruction on every platform except x86
  171. so consider in this case loading the address of the data
  172. }
  173. (((n.resultdef.typ in [arraydef,recorddef]) or is_object(n.resultdef)) and
  174. (n.nodetype=loadn) and
  175. (tloadnode(n).symtableentry.typ=staticvarsym)
  176. )
  177. {$endif x86}
  178. ) then
  179. begin
  180. plists(arg)^.nodelist.Add(n);
  181. plists(arg)^.locationlist.Add(@n);
  182. plists(arg)^.refs.Add(nil);
  183. plists(arg)^.equalto.Add(pointer(-1));
  184. DFASetInclude(plists(arg)^.avail,plists(arg)^.nodelist.count-1);
  185. for i:=0 to plists(arg)^.nodelist.count-2 do
  186. begin
  187. if tnode(plists(arg)^.nodelist[i]).isequal(n) and DFASetIn(plists(arg)^.avail,i) then
  188. begin
  189. { use always the first occurence }
  190. if plists(arg)^.equalto[i]<>pointer(-1) then
  191. plists(arg)^.equalto[plists(arg)^.nodelist.count-1]:=plists(arg)^.equalto[i]
  192. else
  193. plists(arg)^.equalto[plists(arg)^.nodelist.count-1]:=pointer(ptrint(i));
  194. plists(arg)^.refs[i]:=pointer(plists(arg)^.refs[i])+1;
  195. break;
  196. end;
  197. end;
  198. end;
  199. { boolean and/or require a special handling: after evaluating the and/or node,
  200. the expressions of the right side might not be available due to short boolean
  201. evaluation, so after handling the right side, mark those expressions
  202. as unavailable }
  203. if (n.nodetype in [orn,andn]) and is_boolean(taddnode(n).left.resultdef) then
  204. begin
  205. firstleftend:=high(longint);
  206. recurseintobooleanchain(n.nodetype,n);
  207. for i:=firstleftend to plists(arg)^.nodelist.count-1 do
  208. DFASetExclude(plists(arg)^.avail,i);
  209. result:=fen_norecurse_false;
  210. end;
  211. {$ifdef cpuhighleveltarget}
  212. { The high level targets use the functionality from ncgnstld for
  213. nested accesses, and that one stores the complete location of the
  214. nested variable in tloadnode.left rather than only the location of
  215. the parent context containing it. This causes problems with the
  216. CSE in case the nested variable is used as an lvalue, so disable
  217. CSE in that case
  218. }
  219. if (n.nodetype=loadn) and assigned(tloadnode(n).left) then
  220. result:=fen_norecurse_false;
  221. {$endif}
  222. end;
  223. function searchcsedomain(var n: tnode; arg: pointer) : foreachnoderesult;
  224. var
  225. csedomain : boolean;
  226. lists : tlists;
  227. templist : tfplist;
  228. i : longint;
  229. def : tstoreddef;
  230. nodes : tblocknode;
  231. creates,
  232. statements : tstatementnode;
  233. hp : ttempcreatenode;
  234. addrstored : boolean;
  235. hp2 : tnode;
  236. begin
  237. result:=fen_false;
  238. nodes:=nil;
  239. if n.nodetype in cseinvariant then
  240. begin
  241. csedomain:=true;
  242. foreachnodestatic(pm_postprocess,n,@searchsubdomain,@csedomain);
  243. if not(csedomain) then
  244. begin
  245. { try to transform the tree to get better cse domains, consider:
  246. +
  247. / \
  248. + C
  249. / \
  250. A B
  251. if A is not cse'able but B and C are, then the compiler cannot do cse so the tree is transformed into
  252. +
  253. / \
  254. A +
  255. / \
  256. B C
  257. Because A could be another tree of this kind, the whole process is done in a while loop
  258. }
  259. if (n.nodetype in [andn,orn,addn,muln]) and
  260. (n.nodetype=tbinarynode(n).left.nodetype) and
  261. { do is optimizations only for integers, reals (no currency!), vectors, sets or booleans }
  262. (is_integer(n.resultdef) or is_real(n.resultdef) or is_vector(n.resultdef) or is_set(n.resultdef) or
  263. is_boolean(n.resultdef)) and
  264. { either if fastmath is on }
  265. ((cs_opt_fastmath in current_settings.optimizerswitches) or
  266. { or for the logical operators, they cannot overflow }
  267. (n.nodetype in [andn,orn]) or
  268. { or for integers if range checking is off }
  269. ((is_integer(n.resultdef) and
  270. (n.localswitches*[cs_check_range,cs_check_overflow]=[]) and
  271. (tbinarynode(n).left.localswitches*[cs_check_range,cs_check_overflow]=[]))) or
  272. { for sets, we can do this always }
  273. (is_set(n.resultdef))
  274. ) then
  275. while n.nodetype=tbinarynode(n).left.nodetype do
  276. begin
  277. csedomain:=true;
  278. foreachnodestatic(pm_postprocess,tbinarynode(n).right,@searchsubdomain,@csedomain);
  279. if csedomain then
  280. begin
  281. csedomain:=true;
  282. foreachnodestatic(pm_postprocess,tbinarynode(tbinarynode(n).left).right,@searchsubdomain,@csedomain);
  283. if csedomain then
  284. begin
  285. hp2:=tbinarynode(tbinarynode(n).left).left;
  286. tbinarynode(tbinarynode(n).left).left:=tbinarynode(tbinarynode(n).left).right;
  287. tbinarynode(tbinarynode(n).left).right:=tbinarynode(n).right;
  288. tbinarynode(n).right:=tbinarynode(n).left;
  289. tbinarynode(n).left:=hp2;
  290. { the transformed tree could result in new possibilities to fold constants
  291. so force a firstpass on the root node }
  292. exclude(tbinarynode(n).right.flags,nf_pass1_done);
  293. do_firstpass(tbinarynode(n).right);
  294. end
  295. else
  296. break;
  297. end
  298. else
  299. break;
  300. end;
  301. end
  302. else
  303. begin
  304. statements:=nil;
  305. result:=fen_norecurse_true;
  306. {$ifdef csedebug}
  307. writeln('============ cse domain ==================');
  308. printnode(output,n);
  309. writeln('Complexity: ',node_complexity(n));
  310. {$endif csedebug}
  311. lists.nodelist:=tfplist.create;
  312. lists.locationlist:=tfplist.create;
  313. lists.equalto:=tfplist.create;
  314. lists.refs:=tfplist.create;
  315. foreachnodestatic(pm_postprocess,n,@collectnodes,@lists);
  316. templist:=tfplist.create;
  317. templist.count:=lists.nodelist.count;
  318. { check all nodes if one is used more than once }
  319. for i:=0 to lists.nodelist.count-1 do
  320. begin
  321. { current node used more than once? }
  322. if assigned(lists.refs[i]) then
  323. begin
  324. if not(assigned(statements)) then
  325. begin
  326. nodes:=internalstatements(statements);
  327. addstatement(statements,internalstatements(creates));
  328. end;
  329. def:=tstoreddef(tnode(lists.nodelist[i]).resultdef);
  330. { we cannot handle register stored records or array in CSE yet
  331. but we can store their reference }
  332. addrstored:=(def.typ in [arraydef,recorddef]) or is_object(def);
  333. if addrstored then
  334. templist[i]:=ctempcreatenode.create_value(getpointerdef(def),voidpointertype.size,tt_persistent,
  335. true,caddrnode.create(tnode(lists.nodelist[i])))
  336. else
  337. templist[i]:=ctempcreatenode.create_value(def,def.size,tt_persistent,
  338. def.is_intregable or def.is_fpuregable,tnode(lists.nodelist[i]));
  339. { make debugging easier and set temp. location to the original location }
  340. tnode(templist[i]).fileinfo:=tnode(lists.nodelist[i]).fileinfo;
  341. addstatement(creates,tnode(templist[i]));
  342. { make debugging easier and set temp. location to the original location }
  343. creates.fileinfo:=tnode(lists.nodelist[i]).fileinfo;
  344. hp:=ttempcreatenode(templist[i]);
  345. do_firstpass(tnode(hp));
  346. templist[i]:=hp;
  347. if addrstored then
  348. pnode(lists.locationlist[i])^:=cderefnode.Create(ctemprefnode.create(ttempcreatenode(templist[i])))
  349. else
  350. pnode(lists.locationlist[i])^:=ctemprefnode.create(ttempcreatenode(templist[i]));
  351. { make debugging easier and set temp. location to the original location }
  352. pnode(lists.locationlist[i])^.fileinfo:=tnode(lists.nodelist[i]).fileinfo;
  353. do_firstpass(pnode(lists.locationlist[i])^);
  354. {$ifdef csedebug}
  355. printnode(output,statements);
  356. {$endif csedebug}
  357. end
  358. { current node reference to another node? }
  359. else if lists.equalto[i]<>pointer(-1) then
  360. begin
  361. def:=tstoreddef(tnode(lists.nodelist[i]).resultdef);
  362. { we cannot handle register stored records or array in CSE yet
  363. but we can store their reference }
  364. addrstored:=(def.typ in [arraydef,recorddef]) or is_object(def);
  365. {$if defined(csedebug) or defined(csestats)}
  366. writeln;
  367. writeln('!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!');
  368. writeln('Complexity: ',node_complexity(tnode(lists.nodelist[i])),' Node ',i,' equals Node ',ptrint(lists.equalto[i]));
  369. printnode(output,tnode(lists.nodelist[i]));
  370. printnode(output,tnode(lists.nodelist[ptrint(lists.equalto[i])]));
  371. writeln('!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!');
  372. writeln;
  373. {$endif defined(csedebug) or defined(csestats)}
  374. templist[i]:=templist[ptrint(lists.equalto[i])];
  375. if addrstored then
  376. pnode(lists.locationlist[i])^:=cderefnode.Create(ctemprefnode.create(ttempcreatenode(templist[ptrint(lists.equalto[i])])))
  377. else
  378. pnode(lists.locationlist[i])^:=ctemprefnode.create(ttempcreatenode(templist[ptrint(lists.equalto[i])]));
  379. { make debugging easier and set temp. location to the original location }
  380. pnode(lists.locationlist[i])^.fileinfo:=tnode(lists.nodelist[i]).fileinfo;
  381. do_firstpass(pnode(lists.locationlist[i])^);
  382. end;
  383. end;
  384. { clean up unused trees }
  385. for i:=0 to lists.nodelist.count-1 do
  386. if lists.equalto[i]<>pointer(-1) then
  387. tnode(lists.nodelist[i]).free;
  388. {$ifdef csedebug}
  389. writeln('nodes: ',lists.nodelist.count);
  390. writeln('==========================================');
  391. {$endif csedebug}
  392. lists.nodelist.free;
  393. lists.locationlist.free;
  394. lists.equalto.free;
  395. lists.refs.free;
  396. templist.free;
  397. if assigned(statements) then
  398. begin
  399. { call para nodes need a special handling because
  400. they can be only children nodes of call nodes
  401. so the initialization code is inserted below the
  402. call para node
  403. }
  404. if n.nodetype=callparan then
  405. begin
  406. addstatement(statements,tcallparanode(n).left);
  407. tcallparanode(n).left:=nodes;
  408. do_firstpass(tcallparanode(n).left);
  409. end
  410. else
  411. begin
  412. addstatement(statements,n);
  413. n:=nodes;
  414. do_firstpass(n);
  415. end;
  416. {$ifdef csedebug}
  417. printnode(output,nodes);
  418. {$endif csedebug}
  419. end;
  420. end
  421. end;
  422. end;
  423. function do_optcse(var rootnode : tnode) : tnode;
  424. begin
  425. {$ifdef csedebug}
  426. writeln('====================================================================================');
  427. writeln('CSE optimization pass started');
  428. writeln('====================================================================================');
  429. printnode(rootnode);
  430. writeln('====================================================================================');
  431. writeln;
  432. {$endif csedebug}
  433. foreachnodestatic(pm_postprocess,rootnode,@searchcsedomain,nil);
  434. result:=nil;
  435. end;
  436. end.