| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537 |
- #include "all.h"
- static void
- aggreg(Blk *hd, Blk *b)
- {
- int k;
- /* aggregate looping information at
- * loop headers */
- bsunion(hd->gen, b->gen);
- for (k=0; k<2; k++)
- if (b->nlive[k] > hd->nlive[k])
- hd->nlive[k] = b->nlive[k];
- }
- static void
- tmpuse(Ref r, int use, int loop, Fn *fn)
- {
- Mem *m;
- Tmp *t;
- if (rtype(r) == RMem) {
- m = &fn->mem[r.val];
- tmpuse(m->base, 1, loop, fn);
- tmpuse(m->index, 1, loop, fn);
- }
- else if (rtype(r) == RTmp && r.val >= Tmp0) {
- t = &fn->tmp[r.val];
- t->nuse += use;
- t->ndef += !use;
- t->cost += loop;
- }
- }
- /* evaluate spill costs of temporaries,
- * this also fills usage information
- * requires rpo, preds
- */
- void
- fillcost(Fn *fn)
- {
- int n;
- uint a;
- Blk *b;
- Ins *i;
- Tmp *t;
- Phi *p;
- loopiter(fn, aggreg);
- if (debug['S']) {
- fprintf(stderr, "\n> Loop information:\n");
- for (b=fn->start; b; b=b->link) {
- for (a=0; a<b->npred; ++a)
- if (b->id <= b->pred[a]->id)
- break;
- if (a != b->npred) {
- fprintf(stderr, "\t%-10s", b->name);
- fprintf(stderr, " (% 3d ", b->nlive[0]);
- fprintf(stderr, "% 3d) ", b->nlive[1]);
- dumpts(b->gen, fn->tmp, stderr);
- }
- }
- }
- for (t=fn->tmp; t-fn->tmp < fn->ntmp; t++) {
- t->cost = t-fn->tmp < Tmp0 ? UINT_MAX : 0;
- t->nuse = 0;
- t->ndef = 0;
- }
- for (b=fn->start; b; b=b->link) {
- for (p=b->phi; p; p=p->link) {
- t = &fn->tmp[p->to.val];
- tmpuse(p->to, 0, 0, fn);
- for (a=0; a<p->narg; a++) {
- n = p->blk[a]->loop;
- t->cost += n;
- tmpuse(p->arg[a], 1, n, fn);
- }
- }
- n = b->loop;
- for (i=b->ins; i<&b->ins[b->nins]; i++) {
- tmpuse(i->to, 0, n, fn);
- tmpuse(i->arg[0], 1, n, fn);
- tmpuse(i->arg[1], 1, n, fn);
- }
- tmpuse(b->jmp.arg, 1, n, fn);
- }
- if (debug['S']) {
- fprintf(stderr, "\n> Spill costs:\n");
- for (n=Tmp0; n<fn->ntmp; n++)
- fprintf(stderr, "\t%-10s %d\n",
- fn->tmp[n].name,
- fn->tmp[n].cost);
- fprintf(stderr, "\n");
- }
- }
- static BSet *fst; /* temps to prioritize in registers (for tcmp1) */
- static Tmp *tmp; /* current temporaries (for tcmpX) */
- static int ntmp; /* current # of temps (for limit) */
- static int locs; /* stack size used by locals */
- static int slot4; /* next slot of 4 bytes */
- static int slot8; /* ditto, 8 bytes */
- static BSet mask[2][1]; /* class masks */
- static int
- tcmp0(const void *pa, const void *pb)
- {
- uint ca, cb;
- ca = tmp[*(int *)pa].cost;
- cb = tmp[*(int *)pb].cost;
- return (cb < ca) ? -1 : (cb > ca);
- }
- static int
- tcmp1(const void *pa, const void *pb)
- {
- int c;
- c = bshas(fst, *(int *)pb) - bshas(fst, *(int *)pa);
- return c ? c : tcmp0(pa, pb);
- }
- static Ref
- slot(int t)
- {
- int s;
- assert(t >= Tmp0 && "cannot spill register");
- s = tmp[t].slot;
- if (s == -1) {
- /* specific to NAlign == 3 */
- /* nice logic to pack stack slots
- * on demand, there can be only
- * one hole and slot4 points to it
- *
- * invariant: slot4 <= slot8
- */
- if (KWIDE(tmp[t].cls)) {
- s = slot8;
- if (slot4 == slot8)
- slot4 += 2;
- slot8 += 2;
- } else {
- s = slot4;
- if (slot4 == slot8) {
- slot8 += 2;
- slot4 += 1;
- } else
- slot4 = slot8;
- }
- s += locs;
- tmp[t].slot = s;
- }
- return SLOT(s);
- }
- /* restricts b to hold at most k
- * temporaries, preferring those
- * present in f (if given), then
- * those with the largest spill
- * cost
- */
- static void
- limit(BSet *b, int k, BSet *f)
- {
- static int *tarr, maxt;
- int i, t, nt;
- nt = bscount(b);
- if (nt <= k)
- return;
- if (nt > maxt) {
- free(tarr);
- tarr = emalloc(nt * sizeof tarr[0]);
- maxt = nt;
- }
- for (i=0, t=0; bsiter(b, &t); t++) {
- bsclr(b, t);
- tarr[i++] = t;
- }
- if (nt > 1) {
- if (!f)
- qsort(tarr, nt, sizeof tarr[0], tcmp0);
- else {
- fst = f;
- qsort(tarr, nt, sizeof tarr[0], tcmp1);
- }
- }
- for (i=0; i<k && i<nt; i++)
- bsset(b, tarr[i]);
- for (; i<nt; i++)
- slot(tarr[i]);
- }
- /* spills temporaries to fit the
- * target limits using the same
- * preferences as limit(); assumes
- * that k1 gprs and k2 fprs are
- * currently in use
- */
- static void
- limit2(BSet *b1, int k1, int k2, BSet *f)
- {
- BSet b2[1];
- bsinit(b2, ntmp); /* todo, free those */
- bscopy(b2, b1);
- bsinter(b1, mask[0]);
- bsinter(b2, mask[1]);
- limit(b1, T.ngpr - k1, f);
- limit(b2, T.nfpr - k2, f);
- bsunion(b1, b2);
- }
- static void
- sethint(BSet *u, bits r)
- {
- int t;
- for (t=Tmp0; bsiter(u, &t); t++)
- tmp[phicls(t, tmp)].hint.m |= r;
- }
- /* reloads temporaries in u that are
- * not in v from their slots
- */
- static void
- reloads(BSet *u, BSet *v)
- {
- int t;
- for (t=Tmp0; bsiter(u, &t); t++)
- if (!bshas(v, t))
- emit(Oload, tmp[t].cls, TMP(t), slot(t), R);
- }
- static void
- store(Ref r, int s)
- {
- if (s != -1)
- emit(Ostorew + tmp[r.val].cls, 0, R, r, SLOT(s));
- }
- static int
- regcpy(Ins *i)
- {
- return i->op == Ocopy && isreg(i->arg[0]);
- }
- static Ins *
- dopm(Blk *b, Ins *i, BSet *v)
- {
- int n, t;
- BSet u[1];
- Ins *i1;
- bits r;
- bsinit(u, ntmp); /* todo, free those */
- /* consecutive copies from
- * registers need to be handled
- * as one large instruction
- *
- * fixme: there is an assumption
- * that calls are always followed
- * by copy instructions here, this
- * might not be true if previous
- * passes change
- */
- i1 = ++i;
- do {
- i--;
- t = i->to.val;
- if (!req(i->to, R))
- if (bshas(v, t)) {
- bsclr(v, t);
- store(i->to, tmp[t].slot);
- }
- bsset(v, i->arg[0].val);
- } while (i != b->ins && regcpy(i-1));
- bscopy(u, v);
- if (i != b->ins && (i-1)->op == Ocall) {
- v->t[0] &= ~T.retregs((i-1)->arg[1], 0);
- limit2(v, T.nrsave[0], T.nrsave[1], 0);
- for (n=0, r=0; T.rsave[n]>=0; n++)
- r |= BIT(T.rsave[n]);
- v->t[0] |= T.argregs((i-1)->arg[1], 0);
- } else {
- limit2(v, 0, 0, 0);
- r = v->t[0];
- }
- sethint(v, r);
- reloads(u, v);
- do
- emiti(*--i1);
- while (i1 != i);
- return i;
- }
- static void
- merge(BSet *u, Blk *bu, BSet *v, Blk *bv)
- {
- int t;
- if (bu->loop <= bv->loop)
- bsunion(u, v);
- else
- for (t=0; bsiter(v, &t); t++)
- if (tmp[t].slot == -1)
- bsset(u, t);
- }
- /* spill code insertion
- * requires spill costs, rpo, liveness
- *
- * Note: this will replace liveness
- * information (in, out) with temporaries
- * that must be in registers at block
- * borders
- *
- * Be careful with:
- * - Ocopy instructions to ensure register
- * constraints
- */
- void
- spill(Fn *fn)
- {
- Blk *b, *s1, *s2, *hd, **bp;
- int j, l, t, k, lvarg[2];
- uint n;
- BSet u[1], v[1], w[1];
- Ins *i;
- Phi *p;
- Mem *m;
- bits r;
- tmp = fn->tmp;
- ntmp = fn->ntmp;
- bsinit(u, ntmp);
- bsinit(v, ntmp);
- bsinit(w, ntmp);
- bsinit(mask[0], ntmp);
- bsinit(mask[1], ntmp);
- locs = fn->slot;
- slot4 = 0;
- slot8 = 0;
- for (t=0; t<ntmp; t++) {
- k = 0;
- if (t >= T.fpr0 && t < T.fpr0 + T.nfpr)
- k = 1;
- if (t >= Tmp0)
- k = KBASE(tmp[t].cls);
- bsset(mask[k], t);
- }
- for (bp=&fn->rpo[fn->nblk]; bp!=fn->rpo;) {
- b = *--bp;
- /* invariant: all blocks with bigger rpo got
- * their in,out updated. */
- /* 1. find temporaries in registers at
- * the end of the block (put them in v) */
- curi = 0;
- s1 = b->s1;
- s2 = b->s2;
- hd = 0;
- if (s1 && s1->id <= b->id)
- hd = s1;
- if (s2 && s2->id <= b->id)
- if (!hd || s2->id >= hd->id)
- hd = s2;
- if (hd) {
- /* back-edge */
- bszero(v);
- hd->gen->t[0] |= T.rglob; /* don't spill registers */
- for (k=0; k<2; k++) {
- n = k == 0 ? T.ngpr : T.nfpr;
- bscopy(u, b->out);
- bsinter(u, mask[k]);
- bscopy(w, u);
- bsinter(u, hd->gen);
- bsdiff(w, hd->gen);
- if (bscount(u) < n) {
- j = bscount(w); /* live through */
- l = hd->nlive[k];
- limit(w, n - (l - j), 0);
- bsunion(u, w);
- } else
- limit(u, n, 0);
- bsunion(v, u);
- }
- } else if (s1) {
- /* avoid reloading temporaries
- * in the middle of loops */
- bszero(v);
- liveon(w, b, s1);
- merge(v, b, w, s1);
- if (s2) {
- liveon(u, b, s2);
- merge(v, b, u, s2);
- bsinter(w, u);
- }
- limit2(v, 0, 0, w);
- } else {
- bscopy(v, b->out);
- if (rtype(b->jmp.arg) == RCall)
- v->t[0] |= T.retregs(b->jmp.arg, 0);
- }
- for (t=Tmp0; bsiter(b->out, &t); t++)
- if (!bshas(v, t))
- slot(t);
- bscopy(b->out, v);
- /* 2. process the block instructions */
- if (rtype(b->jmp.arg) == RTmp) {
- t = b->jmp.arg.val;
- assert(KBASE(tmp[t].cls) == 0);
- lvarg[0] = bshas(v, t);
- bsset(v, t);
- bscopy(u, v);
- limit2(v, 0, 0, NULL);
- if (!bshas(v, t)) {
- if (!lvarg[0])
- bsclr(u, t);
- b->jmp.arg = slot(t);
- }
- reloads(u, v);
- }
- curi = &insb[NIns];
- for (i=&b->ins[b->nins]; i!=b->ins;) {
- i--;
- if (regcpy(i)) {
- i = dopm(b, i, v);
- continue;
- }
- bszero(w);
- if (!req(i->to, R)) {
- assert(rtype(i->to) == RTmp);
- t = i->to.val;
- if (bshas(v, t))
- bsclr(v, t);
- else {
- /* make sure we have a reg
- * for the result */
- assert(t >= Tmp0 && "dead reg");
- bsset(v, t);
- bsset(w, t);
- }
- }
- j = T.memargs(i->op);
- for (n=0; n<2; n++)
- if (rtype(i->arg[n]) == RMem)
- j--;
- for (n=0; n<2; n++)
- switch (rtype(i->arg[n])) {
- case RMem:
- t = i->arg[n].val;
- m = &fn->mem[t];
- if (rtype(m->base) == RTmp) {
- bsset(v, m->base.val);
- bsset(w, m->base.val);
- }
- if (rtype(m->index) == RTmp) {
- bsset(v, m->index.val);
- bsset(w, m->index.val);
- }
- break;
- case RTmp:
- t = i->arg[n].val;
- lvarg[n] = bshas(v, t);
- bsset(v, t);
- if (j-- <= 0)
- bsset(w, t);
- break;
- }
- bscopy(u, v);
- limit2(v, 0, 0, w);
- for (n=0; n<2; n++)
- if (rtype(i->arg[n]) == RTmp) {
- t = i->arg[n].val;
- if (!bshas(v, t)) {
- /* do not reload if the
- * argument is dead
- */
- if (!lvarg[n])
- bsclr(u, t);
- i->arg[n] = slot(t);
- }
- }
- reloads(u, v);
- if (!req(i->to, R)) {
- t = i->to.val;
- store(i->to, tmp[t].slot);
- if (t >= Tmp0)
- /* in case i->to was a
- * dead temporary */
- bsclr(v, t);
- }
- emiti(*i);
- r = v->t[0]; /* Tmp0 is NBit */
- if (r)
- sethint(v, r);
- }
- if (b == fn->start)
- assert(v->t[0] == (T.rglob | fn->reg));
- else
- assert(v->t[0] == T.rglob);
- for (p=b->phi; p; p=p->link) {
- assert(rtype(p->to) == RTmp);
- t = p->to.val;
- if (bshas(v, t)) {
- bsclr(v, t);
- store(p->to, tmp[t].slot);
- } else if (bshas(b->in, t))
- /* only if the phi is live */
- p->to = slot(p->to.val);
- }
- bscopy(b->in, v);
- idup(b, curi, &insb[NIns]-curi);
- }
- /* align the locals to a 16 byte boundary */
- /* specific to NAlign == 3 */
- slot8 += slot8 & 3;
- fn->slot += slot8;
- if (debug['S']) {
- fprintf(stderr, "\n> Block information:\n");
- for (b=fn->start; b; b=b->link) {
- fprintf(stderr, "\t%-10s (% 5d) ", b->name, b->loop);
- dumpts(b->out, fn->tmp, stderr);
- }
- fprintf(stderr, "\n> After spilling:\n");
- printfn(fn, stderr);
- }
- }
|