mem.c 9.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488
  1. #include "all.h"
  2. typedef struct Range Range;
  3. typedef struct Store Store;
  4. typedef struct Slot Slot;
  5. /* require use, maintains use counts */
  6. void
  7. promote(Fn *fn)
  8. {
  9. Blk *b;
  10. Ins *i, *l;
  11. Tmp *t;
  12. Use *u, *ue;
  13. int s, k;
  14. /* promote uniform stack slots to temporaries */
  15. b = fn->start;
  16. for (i=b->ins; i<&b->ins[b->nins]; i++) {
  17. if (Oalloc > i->op || i->op > Oalloc1)
  18. continue;
  19. /* specific to NAlign == 3 */
  20. assert(rtype(i->to) == RTmp);
  21. t = &fn->tmp[i->to.val];
  22. if (t->ndef != 1)
  23. goto Skip;
  24. k = -1;
  25. s = -1;
  26. for (u=t->use; u<&t->use[t->nuse]; u++) {
  27. if (u->type != UIns)
  28. goto Skip;
  29. l = u->u.ins;
  30. if (isload(l->op))
  31. if (s == -1 || s == loadsz(l)) {
  32. s = loadsz(l);
  33. continue;
  34. }
  35. if (isstore(l->op))
  36. if (req(i->to, l->arg[1]) && !req(i->to, l->arg[0]))
  37. if (s == -1 || s == storesz(l))
  38. if (k == -1 || k == optab[l->op].argcls[0][0]) {
  39. s = storesz(l);
  40. k = optab[l->op].argcls[0][0];
  41. continue;
  42. }
  43. goto Skip;
  44. }
  45. /* get rid of the alloc and replace uses */
  46. *i = (Ins){.op = Onop};
  47. t->ndef--;
  48. ue = &t->use[t->nuse];
  49. for (u=t->use; u!=ue; u++) {
  50. l = u->u.ins;
  51. if (isstore(l->op)) {
  52. l->cls = k;
  53. l->op = Ocopy;
  54. l->to = l->arg[1];
  55. l->arg[1] = R;
  56. t->nuse--;
  57. t->ndef++;
  58. } else {
  59. if (k == -1)
  60. err("slot %%%s is read but never stored to",
  61. fn->tmp[l->arg[0].val].name);
  62. /* try to turn loads into copies so we
  63. * can eliminate them later */
  64. switch(l->op) {
  65. case Oloadsw:
  66. case Oloaduw:
  67. if (k == Kl)
  68. goto Extend;
  69. /* fall through */
  70. case Oload:
  71. if (KBASE(k) != KBASE(l->cls))
  72. l->op = Ocast;
  73. else
  74. l->op = Ocopy;
  75. break;
  76. default:
  77. Extend:
  78. l->op = Oextsb + (l->op - Oloadsb);
  79. break;
  80. }
  81. }
  82. }
  83. Skip:;
  84. }
  85. if (debug['M']) {
  86. fprintf(stderr, "\n> After slot promotion:\n");
  87. printfn(fn, stderr);
  88. }
  89. }
  90. /* [a, b) with 0 <= a */
  91. struct Range {
  92. int a, b;
  93. };
  94. struct Store {
  95. int ip;
  96. Ins *i;
  97. };
  98. struct Slot {
  99. int t;
  100. int sz;
  101. bits m;
  102. bits l;
  103. Range r;
  104. Slot *s;
  105. Store *st;
  106. int nst;
  107. };
  108. static inline int
  109. rin(Range r, int n)
  110. {
  111. return r.a <= n && n < r.b;
  112. }
  113. static inline int
  114. rovlap(Range r0, Range r1)
  115. {
  116. return r0.b && r1.b && r0.a < r1.b && r1.a < r0.b;
  117. }
  118. static void
  119. radd(Range *r, int n)
  120. {
  121. if (!r->b)
  122. *r = (Range){n, n+1};
  123. else if (n < r->a)
  124. r->a = n;
  125. else if (n >= r->b)
  126. r->b = n+1;
  127. }
  128. static int
  129. slot(Slot **ps, int64_t *off, Ref r, Fn *fn, Slot *sl)
  130. {
  131. Alias a;
  132. Tmp *t;
  133. getalias(&a, r, fn);
  134. if (a.type != ALoc)
  135. return 0;
  136. t = &fn->tmp[a.base];
  137. if (t->visit < 0)
  138. return 0;
  139. *off = a.offset;
  140. *ps = &sl[t->visit];
  141. return 1;
  142. }
  143. static void
  144. load(Ref r, bits x, int ip, Fn *fn, Slot *sl)
  145. {
  146. int64_t off;
  147. Slot *s;
  148. if (slot(&s, &off, r, fn, sl)) {
  149. s->l |= x << off;
  150. s->l &= s->m;
  151. if (s->l)
  152. radd(&s->r, ip);
  153. }
  154. }
  155. static void
  156. store(Ref r, bits x, int ip, Ins *i, Fn *fn, Slot *sl)
  157. {
  158. int64_t off;
  159. Slot *s;
  160. if (slot(&s, &off, r, fn, sl)) {
  161. if (s->l) {
  162. radd(&s->r, ip);
  163. s->l &= ~(x << off);
  164. } else {
  165. vgrow(&s->st, ++s->nst);
  166. s->st[s->nst-1].ip = ip;
  167. s->st[s->nst-1].i = i;
  168. }
  169. }
  170. }
  171. static int
  172. scmp(const void *pa, const void *pb)
  173. {
  174. Slot *a, *b;
  175. a = (Slot *)pa, b = (Slot *)pb;
  176. if (a->sz != b->sz)
  177. return b->sz - a->sz;
  178. return a->r.a - b->r.a;
  179. }
  180. static void
  181. maxrpo(Blk *hd, Blk *b)
  182. {
  183. if (hd->loop < (int)b->id)
  184. hd->loop = b->id;
  185. }
  186. void
  187. coalesce(Fn *fn)
  188. {
  189. Range r, *br;
  190. Slot *s, *s0, *sl;
  191. Blk *b, **ps, *succ[3];
  192. Ins *i, **bl;
  193. Use *u;
  194. Tmp *t, *ts;
  195. Ref *arg;
  196. bits x;
  197. int64_t off0, off1;
  198. int n, m, ip, sz, nsl, nbl, *stk;
  199. uint total, freed, fused;
  200. /* minimize the stack usage
  201. * by coalescing slots
  202. */
  203. nsl = 0;
  204. sl = vnew(0, sizeof sl[0], PHeap);
  205. for (n=Tmp0; n<fn->ntmp; n++) {
  206. t = &fn->tmp[n];
  207. t->visit = -1;
  208. if (t->alias.type == ALoc)
  209. if (t->alias.slot == &t->alias)
  210. if (t->bid == fn->start->id)
  211. if (t->alias.u.loc.sz != -1) {
  212. t->visit = nsl;
  213. vgrow(&sl, ++nsl);
  214. s = &sl[nsl-1];
  215. s->t = n;
  216. s->sz = t->alias.u.loc.sz;
  217. s->m = t->alias.u.loc.m;
  218. s->s = 0;
  219. s->st = vnew(0, sizeof s->st[0], PHeap);
  220. s->nst = 0;
  221. }
  222. }
  223. /* one-pass liveness analysis */
  224. for (b=fn->start; b; b=b->link)
  225. b->loop = -1;
  226. loopiter(fn, maxrpo);
  227. nbl = 0;
  228. bl = vnew(0, sizeof bl[0], PHeap);
  229. br = emalloc(fn->nblk * sizeof br[0]);
  230. ip = INT_MAX - 1;
  231. for (n=fn->nblk-1; n>=0; n--) {
  232. b = fn->rpo[n];
  233. succ[0] = b->s1;
  234. succ[1] = b->s2;
  235. succ[2] = 0;
  236. br[n].b = ip--;
  237. for (s=sl; s<&sl[nsl]; s++) {
  238. s->l = 0;
  239. for (ps=succ; *ps; ps++) {
  240. m = (*ps)->id;
  241. if (m > n && rin(s->r, br[m].a)) {
  242. s->l = s->m;
  243. radd(&s->r, ip);
  244. }
  245. }
  246. }
  247. if (b->jmp.type == Jretc)
  248. load(b->jmp.arg, -1, --ip, fn, sl);
  249. for (i=&b->ins[b->nins]; i!=b->ins;) {
  250. --i;
  251. arg = i->arg;
  252. if (i->op == Oargc) {
  253. load(arg[1], -1, --ip, fn, sl);
  254. }
  255. if (isload(i->op)) {
  256. x = BIT(loadsz(i)) - 1;
  257. load(arg[0], x, --ip, fn, sl);
  258. }
  259. if (isstore(i->op)) {
  260. x = BIT(storesz(i)) - 1;
  261. store(arg[1], x, ip--, i, fn, sl);
  262. }
  263. if (i->op == Oblit0) {
  264. assert((i+1)->op == Oblit1);
  265. assert(rtype((i+1)->arg[0]) == RInt);
  266. sz = abs(rsval((i+1)->arg[0]));
  267. x = sz >= NBit ? (bits)-1 : BIT(sz) - 1;
  268. store(arg[1], x, ip--, i, fn, sl);
  269. load(arg[0], x, ip, fn, sl);
  270. vgrow(&bl, ++nbl);
  271. bl[nbl-1] = i;
  272. }
  273. }
  274. for (s=sl; s<&sl[nsl]; s++)
  275. if (s->l) {
  276. radd(&s->r, ip);
  277. if (b->loop != -1) {
  278. assert(b->loop >= n);
  279. radd(&s->r, br[b->loop].b - 1);
  280. }
  281. }
  282. br[n].a = ip;
  283. }
  284. free(br);
  285. /* kill dead stores */
  286. for (s=sl; s<&sl[nsl]; s++)
  287. for (n=0; n<s->nst; n++)
  288. if (!rin(s->r, s->st[n].ip)) {
  289. i = s->st[n].i;
  290. if (i->op == Oblit0)
  291. *(i+1) = (Ins){.op = Onop};
  292. *i = (Ins){.op = Onop};
  293. }
  294. /* kill slots with an empty live range */
  295. total = 0;
  296. freed = 0;
  297. stk = vnew(0, sizeof stk[0], PHeap);
  298. n = 0;
  299. for (s=s0=sl; s<&sl[nsl]; s++) {
  300. total += s->sz;
  301. if (!s->r.b) {
  302. vfree(s->st);
  303. vgrow(&stk, ++n);
  304. stk[n-1] = s->t;
  305. freed += s->sz;
  306. } else
  307. *s0++ = *s;
  308. }
  309. nsl = s0-sl;
  310. if (debug['M']) {
  311. fputs("\n> Slot coalescing:\n", stderr);
  312. if (n) {
  313. fputs("\tkill [", stderr);
  314. for (m=0; m<n; m++)
  315. fprintf(stderr, " %%%s",
  316. fn->tmp[stk[m]].name);
  317. fputs(" ]\n", stderr);
  318. }
  319. }
  320. while (n--) {
  321. t = &fn->tmp[stk[n]];
  322. assert(t->ndef == 1 && t->def);
  323. i = t->def;
  324. if (isload(i->op)) {
  325. i->op = Ocopy;
  326. i->arg[0] = UNDEF;
  327. continue;
  328. }
  329. *i = (Ins){.op = Onop};
  330. for (u=t->use; u<&t->use[t->nuse]; u++) {
  331. if (u->type == UJmp) {
  332. b = fn->rpo[u->bid];
  333. assert(isret(b->jmp.type));
  334. b->jmp.type = Jret0;
  335. b->jmp.arg = R;
  336. continue;
  337. }
  338. assert(u->type == UIns);
  339. i = u->u.ins;
  340. if (!req(i->to, R)) {
  341. assert(rtype(i->to) == RTmp);
  342. vgrow(&stk, ++n);
  343. stk[n-1] = i->to.val;
  344. } else if (isarg(i->op)) {
  345. assert(i->op == Oargc);
  346. i->arg[1] = CON_Z; /* crash */
  347. } else {
  348. if (i->op == Oblit0)
  349. *(i+1) = (Ins){.op = Onop};
  350. *i = (Ins){.op = Onop};
  351. }
  352. }
  353. }
  354. vfree(stk);
  355. /* fuse slots by decreasing size */
  356. qsort(sl, nsl, sizeof *sl, scmp);
  357. fused = 0;
  358. for (n=0; n<nsl; n++) {
  359. s0 = &sl[n];
  360. if (s0->s)
  361. continue;
  362. s0->s = s0;
  363. r = s0->r;
  364. for (s=s0+1; s<&sl[nsl]; s++) {
  365. if (s->s || !s->r.b)
  366. goto Skip;
  367. if (rovlap(r, s->r))
  368. /* O(n); can be approximated
  369. * by 'goto Skip;' if need be
  370. */
  371. for (m=n; &sl[m]<s; m++)
  372. if (sl[m].s == s0)
  373. if (rovlap(sl[m].r, s->r))
  374. goto Skip;
  375. radd(&r, s->r.a);
  376. radd(&r, s->r.b - 1);
  377. s->s = s0;
  378. fused += s->sz;
  379. Skip:;
  380. }
  381. }
  382. /* substitute fused slots */
  383. for (s=sl; s<&sl[nsl]; s++) {
  384. t = &fn->tmp[s->t];
  385. /* the visit link is stale,
  386. * reset it before the slot()
  387. * calls below
  388. */
  389. t->visit = s-sl;
  390. assert(t->ndef == 1 && t->def);
  391. if (s->s == s)
  392. continue;
  393. *t->def = (Ins){.op = Onop};
  394. ts = &fn->tmp[s->s->t];
  395. assert(t->bid == ts->bid);
  396. if (t->def < ts->def) {
  397. /* make sure the slot we
  398. * selected has a def that
  399. * dominates its new uses
  400. */
  401. *t->def = *ts->def;
  402. *ts->def = (Ins){.op = Onop};
  403. ts->def = t->def;
  404. }
  405. for (u=t->use; u<&t->use[t->nuse]; u++) {
  406. if (u->type == UJmp) {
  407. b = fn->rpo[u->bid];
  408. b->jmp.arg = TMP(s->s->t);
  409. continue;
  410. }
  411. assert(u->type == UIns);
  412. arg = u->u.ins->arg;
  413. for (n=0; n<2; n++)
  414. if (req(arg[n], TMP(s->t)))
  415. arg[n] = TMP(s->s->t);
  416. }
  417. }
  418. /* fix newly overlapping blits */
  419. for (n=0; n<nbl; n++) {
  420. i = bl[n];
  421. if (i->op == Oblit0)
  422. if (slot(&s, &off0, i->arg[0], fn, sl))
  423. if (slot(&s0, &off1, i->arg[1], fn, sl))
  424. if (s->s == s0->s) {
  425. if (off0 < off1) {
  426. sz = rsval((i+1)->arg[0]);
  427. assert(sz >= 0);
  428. (i+1)->arg[0] = INT(-sz);
  429. } else if (off0 == off1) {
  430. *i = (Ins){.op = Onop};
  431. *(i+1) = (Ins){.op = Onop};
  432. }
  433. }
  434. }
  435. vfree(bl);
  436. if (debug['M']) {
  437. for (s0=sl; s0<&sl[nsl]; s0++) {
  438. if (s0->s != s0)
  439. continue;
  440. fprintf(stderr, "\tfuse (% 3db) [", s0->sz);
  441. for (s=s0; s<&sl[nsl]; s++) {
  442. if (s->s != s0)
  443. continue;
  444. fprintf(stderr, " %%%s", fn->tmp[s->t].name);
  445. if (s->r.b)
  446. fprintf(stderr, "[%d,%d)",
  447. s->r.a-ip, s->r.b-ip);
  448. else
  449. fputs("{}", stderr);
  450. }
  451. fputs(" ]\n", stderr);
  452. }
  453. fprintf(stderr, "\tsums %u/%u/%u (killed/fused/total)\n\n",
  454. freed, fused, total);
  455. printfn(fn, stderr);
  456. }
  457. for (s=sl; s<&sl[nsl]; s++)
  458. vfree(s->st);
  459. vfree(sl);
  460. }