Browse Source

gvn/gcm review

- Many stylistic nits.
- Removed blkmerge().
- Some minor bug fixes.
- GCM reassoc is now "sink"; a pass that
  moves trivial ops in their target block
  with the same goal of reducing register
  pressure, but starting from instructions
  that benefit from having their inputs
  close.
Quentin Carbonneaux 9 months ago
parent
commit
f3ca257737
16 changed files with 785 additions and 1151 deletions
  1. 12 22
      all.h
  2. 8 1
      amd64/emit.c
  3. 64 280
      cfg.c
  4. 219 149
      copy.c
  5. 1 1
      fold.c
  6. 167 194
      gcm.c
  7. 218 462
      gvn.c
  8. 1 0
      load.c
  9. 0 3
      main.c
  10. 12 2
      ops.h
  11. 2 2
      parse.c
  12. 1 2
      ssa.c
  13. 15 0
      test/copy.ssa
  14. 0 0
      test/gvn1.ssa
  15. 0 0
      test/gvn2.ssa
  16. 65 33
      util.c

+ 12 - 22
all.h

@@ -223,7 +223,7 @@ struct Op {
 	uint cmpeqwl:1;   /* Kl/Kw cmp eq/ne? */
 	uint cmpeqwl:1;   /* Kl/Kw cmp eq/ne? */
 	uint cmplgtewl:1; /* Kl/Kw cmp lt/gt/le/ge? */
 	uint cmplgtewl:1; /* Kl/Kw cmp lt/gt/le/ge? */
 	uint eqval:1;     /* 1 for eq; 0 for ne */
 	uint eqval:1;     /* 1 for eq; 0 for ne */
-	uint ispinned:1;  /* GCM pinned op? */
+	uint pinned:1;    /* GCM pinned op? */
 };
 };
 
 
 struct Ins {
 struct Ins {
@@ -260,9 +260,9 @@ struct Blk {
 
 
 	Blk *idom;
 	Blk *idom;
 	Blk *dom, *dlink;
 	Blk *dom, *dlink;
-	int domdpth;
 	Blk **fron;
 	Blk **fron;
 	uint nfron;
 	uint nfron;
+	int depth;
 
 
 	Blk **pred;
 	Blk **pred;
 	uint npred;
 	uint npred;
@@ -489,13 +489,14 @@ char *str(uint32_t);
 int argcls(Ins *, int);
 int argcls(Ins *, int);
 int isreg(Ref);
 int isreg(Ref);
 int iscmp(int, int *, int *);
 int iscmp(int, int *, int *);
-int invcmpwl(int);
+void igroup(Blk *, Ins *, Ins **, Ins **);
 void emit(int, int, Ref, Ref, Ref);
 void emit(int, int, Ref, Ref, Ref);
 void emiti(Ins);
 void emiti(Ins);
 void idup(Blk *, Ins *, ulong);
 void idup(Blk *, Ins *, ulong);
 Ins *icpy(Ins *, Ins *, ulong);
 Ins *icpy(Ins *, Ins *, ulong);
 int cmpop(int);
 int cmpop(int);
 int cmpneg(int);
 int cmpneg(int);
+int cmpwlneg(int);
 int clsmerge(short *, short);
 int clsmerge(short *, short);
 int phicls(int, Tmp *);
 int phicls(int, Tmp *);
 uint phiargn(Phi *, Blk *);
 uint phiargn(Phi *, Blk *);
@@ -507,7 +508,6 @@ Ref newcon(Con *, Fn *);
 Ref getcon(int64_t, Fn *);
 Ref getcon(int64_t, Fn *);
 int addcon(Con *, Con *, int);
 int addcon(Con *, Con *, int);
 int isconbits(Fn *fn, Ref r, int64_t *v);
 int isconbits(Fn *fn, Ref r, int64_t *v);
-int istmpconbits(Fn *fn, Ins *i, int64_t *v);
 void salloc(Ref, Ref, Fn *);
 void salloc(Ref, Ref, Fn *);
 void dumpts(BSet *, Tmp *, FILE *);
 void dumpts(BSet *, Tmp *, FILE *);
 void runmatch(uchar *, Num *, Ref, Ref *);
 void runmatch(uchar *, Num *, Ref, Ref *);
@@ -549,19 +549,11 @@ int sdom(Blk *, Blk *);
 int dom(Blk *, Blk *);
 int dom(Blk *, Blk *);
 void fillfron(Fn *);
 void fillfron(Fn *);
 void loopiter(Fn *, void (*)(Blk *, Blk *));
 void loopiter(Fn *, void (*)(Blk *, Blk *));
-void filldomdpth(Fn *);
+void filldepth(Fn *);
 Blk *lca(Blk *, Blk *);
 Blk *lca(Blk *, Blk *);
 void fillloop(Fn *);
 void fillloop(Fn *);
 void simpljmp(Fn *);
 void simpljmp(Fn *);
-void replacepreds(Blk *, Blk *, Blk *);
-void killblks(Fn *);
-void blkmerge(Fn *);
-int ifgraph(Blk *, Blk **, Blk **, Blk **);
-int ifjoin(Blk *, Blk **, Blk **, Blk **);
-int emptyblk(Blk *);
-void ifelim(Fn *);
-void clrbvisit(Fn *);
-int reaches(Fn *,Blk *, Blk *);
+int reaches(Fn *, Blk *, Blk *);
 int reachesnotvia(Fn *, Blk *, Blk *, Blk *);
 int reachesnotvia(Fn *, Blk *, Blk *, Blk *);
 
 
 /* mem.c */
 /* mem.c */
@@ -586,25 +578,23 @@ void ssa(Fn *);
 void ssacheck(Fn *);
 void ssacheck(Fn *);
 
 
 /* copy.c */
 /* copy.c */
-int iswu1(Fn *, Ref);
-void narrowpars(Fn *);
+void narrowpars(Fn *fn);
 Ref copyref(Fn *, Blk *, Ins *);
 Ref copyref(Fn *, Blk *, Ins *);
+Ref phicopyref(Fn *, Blk *, Phi *);
 
 
 /* fold.c */
 /* fold.c */
+int foldint(Con *, int, int, Con *, Con *);
 Ref foldref(Fn *, Ins *);
 Ref foldref(Fn *, Ins *);
 
 
 /* gvn.c */
 /* gvn.c */
-extern Ref con01[2];
-int is0non0(Fn *, Blk *, Ref, int, int *);
+extern Ref con01[2];  /* 0 and 1 */
+int zeroval(Fn *, Blk *, Ref, int, int *);
 void gvn(Fn *);
 void gvn(Fn *);
 
 
 /* gcm.c */
 /* gcm.c */
-int isfixed(Fn *, Ins *);
+int pinned(Ins *);
 void gcm(Fn *);
 void gcm(Fn *);
 
 
-/* reassoc.c */
-void reassoc(Fn *);
-
 /* simpl.c */
 /* simpl.c */
 void simpl(Fn *);
 void simpl(Fn *);
 
 

+ 8 - 1
amd64/emit.c

@@ -612,6 +612,7 @@ amd64_emitfn(Fn *fn, FILE *f)
 	Blk *b, *s;
 	Blk *b, *s;
 	Ins *i, itmp;
 	Ins *i, itmp;
 	int *r, c, o, n, lbl;
 	int *r, c, o, n, lbl;
+	uint p;
 	E *e;
 	E *e;
 
 
 	e = &(E){.f = f, .fn = fn};
 	e = &(E){.f = f, .fn = fn};
@@ -640,8 +641,14 @@ amd64_emitfn(Fn *fn, FILE *f)
 		}
 		}
 
 
 	for (lbl=0, b=fn->start; b; b=b->link) {
 	for (lbl=0, b=fn->start; b; b=b->link) {
-		if (lbl || b->npred > 1)
+		if (lbl || b->npred > 1) {
+			for (p=0; p<b->npred; p++)
+				if (b->pred[p]->id >= b->id)
+					break;
+			if (p != b->npred)
+				fprintf(f, ".p2align 4\n");
 			fprintf(f, "%sbb%d:\n", T.asloc, id0+b->id);
 			fprintf(f, "%sbb%d:\n", T.asloc, id0+b->id);
+		}
 		for (i=b->ins; i!=&b->ins[b->nins]; i++)
 		for (i=b->ins; i!=&b->ins[b->nins]; i++)
 			emitins(*i, e);
 			emitins(*i, e);
 		lbl = 1;
 		lbl = 1;

+ 64 - 280
cfg.c

@@ -8,49 +8,40 @@ newblk()
 
 
 	b = alloc(sizeof *b);
 	b = alloc(sizeof *b);
 	*b = z;
 	*b = z;
+	b->ins = vnew(0, sizeof b->ins[0], PFn);
+	b->pred = vnew(0, sizeof b->pred[0], PFn);
 	return b;
 	return b;
 }
 }
 
 
-/* TODO - this never seems to do anything??? */
 static void
 static void
-prunephis(Fn *f)
+fixphis(Fn *f)
 {
 {
 	Blk *b;
 	Blk *b;
-	Phi *p, **pp;
-	uint na, na0;
+	Phi *p;
+	uint n, n0;
 
 
-	for (b = f->start; b; b = b->link) {
+	for (b=f->start; b; b=b->link) {
 		assert(b->id < f->nblk);
 		assert(b->id < f->nblk);
-		for (pp = &b->phi; (*pp);) {
-			p = *pp;
-			for (na = na0 = 0; na < p->narg; na++)
-				if (p->blk[na]->id != -1u) {
-					p->blk[na0] = p->blk[na];
-					p->arg[na0] = p->arg[na];
-					na0++;
+		for (p=b->phi; p; p=p->link) {
+			for (n=n0=0; n<p->narg; n++)
+				if (p->blk[n]->id != -1u) {
+					p->blk[n0] = p->blk[n];
+					p->arg[n0] = p->arg[n];
+					n0++;
 				}
 				}
-			if (na0 == 0) {
-				*pp = p->link;
-			} else {
-				p->narg = na0;
-				pp = &p->link;
-			}
+			assert(n0 > 0);
+			p->narg = n0;
 		}
 		}
 	}
 	}
 }
 }
 
 
 static void
 static void
-addpred(Blk *bp, Blk *bc)
+addpred(Blk *bp, Blk *b)
 {
 {
-	++bc->npred;
-	if (bc->pred)
-		vgrow(&bc->pred, bc->npred);
-	else
-		bc->pred = vnew(bc->npred, sizeof bc->pred[0], PFn);
-	bc->pred[bc->npred-1] = bp;
+	vgrow(&b->pred, ++b->npred);
+	b->pred[b->npred-1] = bp;
 }
 }
 
 
-/* fill predecessors information in blocks; prune dead phi refs */
 void
 void
 fillpreds(Fn *f)
 fillpreds(Fn *f)
 {
 {
@@ -64,8 +55,6 @@ fillpreds(Fn *f)
 		if (b->s2 && b->s2 != b->s1)
 		if (b->s2 && b->s2 != b->s1)
 			addpred(b, b->s2);
 			addpred(b, b->s2);
 	}
 	}
-	/* TODO - this never seems to do anything??? */
-	prunephis(f);
 }
 }
 
 
 static void
 static void
@@ -87,7 +76,6 @@ porec(Blk *b, uint *npo)
 	b->id = (*npo)++;
 	b->id = (*npo)++;
 }
 }
 
 
-/* fill the rpo information; prune dead blks */
 static void
 static void
 fillrpo(Fn *f)
 fillrpo(Fn *f)
 {
 {
@@ -97,27 +85,25 @@ fillrpo(Fn *f)
 		b->id = -1u;
 		b->id = -1u;
 	f->nblk = 0;
 	f->nblk = 0;
 	porec(f->start, &f->nblk);
 	porec(f->start, &f->nblk);
-	if (f->rpo)
-		vgrow(&f->rpo, f->nblk);
-	else
-		f->rpo = vnew(f->nblk, sizeof f->rpo[0], PFn);
+	vgrow(&f->rpo, f->nblk);
 	for (p=&f->start; (b=*p);) {
 	for (p=&f->start; (b=*p);) {
 		if (b->id == -1u) {
 		if (b->id == -1u) {
 			*p = b->link;
 			*p = b->link;
 		} else {
 		} else {
-			b->id = f->nblk-b->id-1; /* po -> rpo */
+			b->id = f->nblk-b->id-1;
 			f->rpo[b->id] = b;
 			f->rpo[b->id] = b;
 			p = &b->link;
 			p = &b->link;
 		}
 		}
 	}
 	}
 }
 }
 
 
-/* fill rpo, preds; prune dead blks, prune dead blk refs from phis */
+/* fill rpo, preds; prune dead blks */
 void
 void
 fillcfg(Fn *f)
 fillcfg(Fn *f)
 {
 {
 	fillrpo(f);
 	fillrpo(f);
 	fillpreds(f);
 	fillpreds(f);
+	fixphis(f);
 }
 }
 
 
 /* for dominators computation, read
 /* for dominators computation, read
@@ -260,34 +246,28 @@ loopiter(Fn *fn, void f(Blk *, Blk *))
 	}
 	}
 }
 }
 
 
+/* dominator tree depth */
 void
 void
-multloop(Blk *hd, Blk *b)
-{
-	(void)hd;
-	b->loop *= 10;
-}
-
-void
-filldomdpth(Fn *fn)
+filldepth(Fn *fn)
 {
 {
-	Blk *b, *dom;
-	uint dpth;
+	Blk *b, *d;
+	int depth;
 
 
 	for (b=fn->start; b; b=b->link)
 	for (b=fn->start; b; b=b->link)
-		b->domdpth = -1;
+		b->depth = -1;
 
 
-	fn->start->domdpth = 0;
+	fn->start->depth = 0;
 
 
 	for (b=fn->start; b; b=b->link) {
 	for (b=fn->start; b; b=b->link) {
-		if (b->domdpth != -1)
+		if (b->depth != -1)
 			continue;
 			continue;
-		dpth = 1;
-		for (dom = b->idom; dom->domdpth == -1; dom = dom->idom)
-			dpth++;
-		dpth += dom->domdpth;
-		b->domdpth = dpth;
-		for (dom = b->idom; dom->domdpth == -1; dom = dom->idom)
-			dom->domdpth = --dpth;
+		depth = 1;
+		for (d=b->idom; d->depth==-1; d=d->idom)
+			depth++;
+		depth += d->depth;
+		b->depth = depth;
+		for (d=b->idom; d->depth==-1; d=d->idom)
+			d->depth = --depth;
 	}
 	}
 }
 }
 
 
@@ -299,9 +279,9 @@ lca(Blk *b1, Blk *b2)
 		return b2;
 		return b2;
 	if (!b2)
 	if (!b2)
 		return b1;
 		return b1;
-	while (b1->domdpth > b2->domdpth)
+	while (b1->depth > b2->depth)
 		b1 = b1->idom;
 		b1 = b1->idom;
-	while (b2->domdpth > b1->domdpth)
+	while (b2->depth > b1->depth)
 		b2 = b2->idom;
 		b2 = b2->idom;
 	while (b1 != b2) {
 	while (b1 != b2) {
 		b1 = b1->idom;
 		b1 = b1->idom;
@@ -310,6 +290,13 @@ lca(Blk *b1, Blk *b2)
 	return b1;
 	return b1;
 }
 }
 
 
+void
+multloop(Blk *hd, Blk *b)
+{
+	(void)hd;
+	b->loop *= 10;
+}
+
 void
 void
 fillloop(Fn *fn)
 fillloop(Fn *fn)
 {
 {
@@ -371,244 +358,41 @@ simpljmp(Fn *fn)
 	free(uf);
 	free(uf);
 }
 }
 
 
-static void
-replacepred(Blk **blks, uint nblk, Blk *to, Blk *from)
-{
-	uint n;
-	for(n=0; n<nblk; n++)
-		if (blks[n] == from) {
-			blks[n] = to;
-			break;
-		}
-	assert(n != nblk);
-}
-
-/* replace b->pred[] and p->blk[] entries */
-void
-replacepreds(Blk *s, Blk *to, Blk *from)
-{
-	Phi *p;
-
-	if (!s)
-		return;
-	assert(s->npred);
-	replacepred(s->pred, s->npred, to, from);
-	for (p = s->phi; p; p = p->link) {
-		assert(p->narg == s->npred);
-		replacepred(p->blk, p->narg, to, from);
-	}
-}
-
-/* remove marked-dead blks - marked as fn->rpo[id] == 0 */
-void
-killblks(Fn *fn)
-{
-	Blk **pb;
-
-	for (pb = &fn->start; *pb;)
-		if (fn->rpo[(*pb)->id])
-			pb = &(*pb)->link;
-		else
-			*pb = (*pb)->link;
-}
-
-/* merge linear jmp chains */
-/* requires rpo pred, breaks cfg use */
-void
-blkmerge(Fn *fn)
-{
-	uint bid, nins;
-	Blk *curb, *b;
-	Ins *vins;
-
-	if (debug['B'])
-		fputs("\n> Block merge:\n", stderr);
-
-	vins = vnew(0, sizeof vins[0], PFn);
-	curb = 0;
-	/* linear jmp chains will be consecutive in rpo */
-	for (bid=0; bid<fn->nblk; bid++) {
-		b = fn->rpo[bid];
-		if (curb == 0) {
-			curb = b;
-			nins = 0;
-		} else
-			fn->rpo[bid] = 0;
-		addbins(b, &vins, &nins);
-		/* note - there are cases where GVN does not eliminate singleton phis */
-		if (b->jmp.type != Jjmp || b->s1->npred != 1 || b->s1->phi) {
-			idup(curb, vins, nins);
-			curb->nins = nins;
-			curb->jmp = b->jmp;
-			replacepreds(b->s1, curb, b);
-			if (b->s1 != b->s2)
-				replacepreds(b->s2, curb, b);
-			curb->s1 = b->s1;
-			curb->s2 = b->s2;
-			curb = 0;
-		} else {
-			assert(b->s1->id == bid+1);
-			if (debug['B'])
-				fprintf(stderr, "    merging blocks @%s -> @%s\n", b->name, b->s1->name);
-		}
-	}
-	assert(curb == 0);
-	killblks(fn);
-}
-
-int
-lonesucc(Blk *b, Blk *s)
+static int
+reachrec(Blk *b, Blk *to)
 {
 {
-	assert(s);
-	if (s != b)
-	if (s->npred == 1)
-	if (s->pred[0] == b)
-	if (s->phi == 0)
+	if (b == to)
 		return 1;
 		return 1;
-	return 0;
-}
-
-int
-lonejmpsucc(Blk *b, Blk *s)
-{
-	return s->jmp.type == Jjmp && lonesucc(b, s);
-}
-
-/* (otherwise) isolated if-then[-else] graphlet */
-int
-ifgraph(Blk *ifb, Blk **pthenb, Blk **pelseb, Blk **pjoinb)
-{
-	if (ifb->jmp.type != Jjnz)
+	if (!b || b->visit)
 		return 0;
 		return 0;
-	assert(ifb->s1 && ifb->s2);
-	assert(ifb->s1 != ifb->s2); /* dubious */
-	*pthenb = ifb->s1;
-	*pelseb = ifb->s2;
-	*pjoinb = ifb->s1->s1;
-	if (ifb->s1 == ifb->s2->s1) {
-		/* empty then */
-		*pthenb = ifb;
-		*pjoinb = ifb->s1;
-	}
-	if (ifb->s1->s1 == ifb->s2)
-		/* empty else */
-		*pelseb = ifb;
-
-	if (*pthenb == ifb ||
-	    ((*pthenb)->s1 == *pjoinb && lonejmpsucc(ifb, *pthenb)))
-	if (*pelseb == ifb ||
-	    ((*pelseb)->s1 == *pjoinb && lonejmpsucc(ifb, *pelseb)))
-	/* there are cases where npred == 2 is not strictly needed - ifconvert for example */
-	if ((*pjoinb)->npred == 2)
-		return 1;
-
-	return 0;
-}
-
-/* join blk of if-then[-else] graphlet */
-int
-ifjoin(Blk *joinb, Blk **pifb, Blk **pthenb, Blk **pelseb)
-{
-	Blk *joinb1;
 
 
-	if (joinb->npred)
-	if (ifgraph(joinb->pred[0], pthenb, pelseb, &joinb1))
-	if (joinb == joinb1) {
-		*pifb = joinb->pred[0];
+	b->visit = 1;
+	if (reachrec(b->s1, to))
 		return 1;
 		return 1;
-	}
-	if (joinb->npred && joinb->pred[0]->npred)
-	if (ifgraph(joinb->pred[0]->pred[0], pthenb, pelseb, &joinb1))
-	if (joinb == joinb1) {
-		*pifb = joinb->pred[0]->pred[0];
+	if (reachrec(b->s2, to))
 		return 1;
 		return 1;
-	}
+
 	return 0;
 	return 0;
 }
 }
 
 
+/* Blk.visit needs to be clear at entry */
 int
 int
-emptyblk(Blk *b)
+reaches(Fn *fn, Blk *b, Blk *to)
 {
 {
-	Ins *i;
-
-	for (i = b->ins; i < &b->ins[b->nins]; i++)
-		if (i->op != Onop)
-		if (i->op != Odbgloc)
-			return 0;
-	return 1;
-}
+	int r;
 
 
-/* remove empty jnz graphlets */
-/* needs rpo; breaks cfg */
-void
-ifelim(Fn *fn)
-{
-	uint bid;
-	Blk *ifb, *thenb, *elseb, *joinb;
-
-	for (bid = 0; bid < fn->nblk; bid++) {
-		ifb = fn->rpo[bid];
-		if (ifb == 0)
-			continue;
-		if (ifgraph(ifb, &thenb, &elseb, &joinb))
-		if (joinb->phi == 0)
-		if (thenb == ifb || emptyblk(thenb))
-		if (elseb == ifb || emptyblk(elseb)) {
-			if (debug['B'])
-				fprintf(stderr, "    eliminating empty if @%s -> @%s, @%s -> @%s\n",
-					ifb->name, thenb->name, elseb->name, joinb->name);
-			if (thenb != ifb)
-				fn->rpo[thenb->id] = 0;
-			if (elseb != ifb)
-				fn->rpo[elseb->id] = 0;
-			ifb->jmp.type = Jjmp;
-			ifb->jmp.arg = R;
-			ifb->s1 = joinb;
-			ifb->s2 = 0;
-		}
-	}
-	killblks(fn);
-}
-
-void
-clrbvisit(Fn *fn)
-{
-	Blk *b;
-	for (b = fn->start; b; b = b->link)
+	assert(to);
+	r = reachrec(b, to);
+	for (b=fn->start; b; b=b->link)
 		b->visit = 0;
 		b->visit = 0;
+	return r;
 }
 }
 
 
-static int
-reaches1(Blk *b1, Blk *b2)
-{
-	assert(b2);
-	if (b1 == b2)
-		return 1;
-	if (b1 == 0 || b1->visit)
-		return 0;
-	b1->visit = 1;
-	return reaches1(b1->s1, b2) || reaches1(b1->s2, b2);
-}
-
-/* path from b1 to b2 not thru bnotvia? */
-/* uses b->visit */
+/* can b reach 'to' not through excl
+ * Blk.visit needs to be clear at entry */
 int
 int
-reachesnotvia(Fn *fn, Blk *b1, Blk *b2, Blk *bnotvia)
+reachesnotvia(Fn *fn, Blk *b, Blk *to, Blk *excl)
 {
 {
-	int rc;
-
-	if (bnotvia)
-		bnotvia->visit = 1;
-	rc = reaches1(b1, b2);
-	clrbvisit(fn);
-	return rc;
+	excl->visit = 1;
+	return reaches(fn, b, to);
 }
 }
-
-/* path from b1 to b2? */
-/* uses b->visit */
-int
-reaches(Fn *fn, Blk *b1, Blk *b2)
-{
-	return reachesnotvia(fn, b1, b2, 0);
-}
-

+ 219 - 149
copy.c

@@ -1,9 +1,35 @@
 #include "all.h"
 #include "all.h"
 
 
-static uint
-u64_wbits(uint64_t v)
+typedef struct Ext Ext;
+
+struct Ext {
+	char zext;
+	char nopw; /* is a no-op if arg width is <= nopw */
+	char usew; /* uses only the low usew bits of arg */
+};
+
+static int
+ext(Ins *i, Ext *e)
 {
 {
-	uint n;
+	static Ext tbl[] = {
+		/*extsb*/ {0,  7,  8},
+		/*extub*/ {1,  8,  8},
+		/*extsh*/ {0, 15, 16},
+		/*extuh*/ {1, 16, 16},
+		/*extsw*/ {0, 31, 32},
+		/*extuw*/ {1, 32, 32},
+	};
+
+	if (!isext(i->op))
+		return 0;
+	*e = tbl[i->op - Oextsb];
+	return 1;
+}
+
+static int
+bitwidth(uint64_t v)
+{
+	int n;
 
 
 	n = 0;
 	n = 0;
 	if (v >> 32) { n += 32; v >>= 32; }
 	if (v >> 32) { n += 32; v >>= 32; }
@@ -15,44 +41,29 @@ u64_wbits(uint64_t v)
 	return n+v;
 	return n+v;
 }
 }
 
 
+/* no more than w bits are used */
 static int
 static int
-EXTSIGNED[] = { /*extsb*/1, /*extub*/0, /*extsh*/1, /*extuh*/0, /*extsw*/1, /*extuw*/0 };
-
-static uint
-EXTMAXW[] = { /*extsb*/7, /*extub*/8, /*extsh*/15, /*extuh*/16, /*extsw*/31, /*extuw*/32 };
-
-static uint
-EXTW[] = { /*extsb*/8, /*extub*/8, /*extsh*/16, /*extuh*/16, /*extsw*/32, /*extuw*/32 };
-
-static uint
-STW[] = { /*storeb*/8, /*storeh*/16, /*storew*/32, /*storel*/64, /*stores*/32, /*stored*/64 };
-
-/* is the ref used only as a narrow value? */
-static int
-usewidthle(Fn *fn, Ref r, uint wbits)
+usewidthle(Fn *fn, Ref r, int w)
 {
 {
+	Ext e;
 	Tmp *t;
 	Tmp *t;
 	Use *u;
 	Use *u;
 	Phi *p;
 	Phi *p;
-	int b;
 	Ins *i;
 	Ins *i;
 	Ref rc;
 	Ref rc;
 	int64_t v;
 	int64_t v;
+	int b;
 
 
-	if (isconbits(fn, r, &v))
-	if (u64_wbits(v) <= wbits)
-		return 1;
-	if (rtype(r) != RTmp)
-		return 0;
+	assert(rtype(r) == RTmp);
 	t = &fn->tmp[r.val];
 	t = &fn->tmp[r.val];
-	for (u = t->use; u < &t->use[t->nuse]; u++) {
+	for (u=t->use; u<&t->use[t->nuse]; u++) {
 		switch (u->type) {
 		switch (u->type) {
 		case UPhi:
 		case UPhi:
 			p = u->u.phi;
 			p = u->u.phi;
 			if (p->visit)
 			if (p->visit)
 				continue;
 				continue;
 			p->visit = 1;
 			p->visit = 1;
-			b = usewidthle(fn, p->to, wbits);
+			b = usewidthle(fn, p->to, w);
 			p->visit = 0;
 			p->visit = 0;
 			if (b)
 			if (b)
 				continue;
 				continue;
@@ -61,14 +72,13 @@ usewidthle(Fn *fn, Ref r, uint wbits)
 			i = u->u.ins;
 			i = u->u.ins;
 			assert(i != 0);
 			assert(i != 0);
 			if (i->op == Ocopy)
 			if (i->op == Ocopy)
-				if (usewidthle(fn, i->to, wbits))
+				if (usewidthle(fn, i->to, w))
 					continue;
 					continue;
-			if (isext(i->op)) {
-				if (EXTW[i->op - Oextsb] <= wbits)
+			if (ext(i, &e)) {
+				if (e.usew <= w)
+					continue;
+				if (usewidthle(fn, i->to, w))
 					continue;
 					continue;
-				else
-					if (usewidthle(fn, i->to, wbits))
-						continue;;
 			}
 			}
 			if (i->op == Oand) {
 			if (i->op == Oand) {
 				if (req(r, i->arg[0]))
 				if (req(r, i->arg[0]))
@@ -77,15 +87,11 @@ usewidthle(Fn *fn, Ref r, uint wbits)
 					assert(req(r, i->arg[1]));
 					assert(req(r, i->arg[1]));
 					rc = i->arg[0];
 					rc = i->arg[0];
 				}
 				}
-				if (isconbits(fn, rc, &v))
-				if (u64_wbits(v) <= wbits)
+				if (isconbits(fn, rc, &v)
+				&& bitwidth(v) <= w)
 					continue;
 					continue;
 				break;
 				break;
 			}
 			}
-			if (isstore(i->op))
-			if (req(r, i->arg[1]))
-			if (STW[i->op - Ostoreb] > wbits)
-				continue;
 			break;
 			break;
 		default:
 		default:
 			break;
 			break;
@@ -95,27 +101,17 @@ usewidthle(Fn *fn, Ref r, uint wbits)
 	return 1;
 	return 1;
 }
 }
 
 
-static Phi*
-findphi(Fn *fn, uint bid, Ref to)
-{
-	Phi *p;
-	for (p = fn->rpo[bid]->phi; p; p = p->link)
-		if (req(p->to, to))
-			break;
-	assert(p);
-	return p;
-}
-
-static uint
-uint_min(uint v1, uint v2)
+static int
+min(int v1, int v2)
 {
 {
 	return v1 < v2 ? v1 : v2;
 	return v1 < v2 ? v1 : v2;
 }
 }
 
 
-/* is the ref def a narrow value? */
+/* is the ref narrower than w bits */
 static int
 static int
-defwidthle(Fn *fn, Ref r, uint wbits)
+defwidthle(Fn *fn, Ref r, int w)
 {
 {
+	Ext e;
 	Tmp *t;
 	Tmp *t;
 	Phi *p;
 	Phi *p;
 	Ins *i;
 	Ins *i;
@@ -123,93 +119,99 @@ defwidthle(Fn *fn, Ref r, uint wbits)
 	int64_t v;
 	int64_t v;
 	int x;
 	int x;
 
 
-	if (isconbits(fn, r, &v))
-	if (u64_wbits(v) <= wbits)
+	if (isconbits(fn, r, &v)
+	&& bitwidth(v) <= w)
 		return 1;
 		return 1;
 	if (rtype(r) != RTmp)
 	if (rtype(r) != RTmp)
 		return 0;
 		return 0;
 	t = &fn->tmp[r.val];
 	t = &fn->tmp[r.val];
 	if (t->cls != Kw)
 	if (t->cls != Kw)
 		return 0;
 		return 0;
-	i = t->def;
-	if (i == 0) {
+
+	if (!t->def) {
 		/* phi def */
 		/* phi def */
-		p = findphi(fn, t->bid, r);
+		for (p=fn->rpo[t->bid]->phi; p; p=p->link)
+			if (req(p->to, r))
+				break;
+		assert(p);
 		if (p->visit)
 		if (p->visit)
 			return 1;
 			return 1;
 		p->visit = 1;
 		p->visit = 1;
-		for (n = 0; n < p->narg; n++)
-			if (!defwidthle(fn, p->arg[n], wbits)) {
+		for (n=0; n<p->narg; n++)
+			if (!defwidthle(fn, p->arg[n], w)) {
 				p->visit = 0;
 				p->visit = 0;
 				return 0;
 				return 0;
 			}
 			}
 		p->visit = 0;
 		p->visit = 0;
 		return 1;
 		return 1;
 	}
 	}
-	/* ins def */
+
+	i = t->def;
 	if (i->op == Ocopy)
 	if (i->op == Ocopy)
-                return defwidthle(fn, i->arg[0], wbits);
+                return defwidthle(fn, i->arg[0], w);
 	if (i->op == Oshr || i->op == Osar) {
 	if (i->op == Oshr || i->op == Osar) {
 		if (isconbits(fn, i->arg[1], &v))
 		if (isconbits(fn, i->arg[1], &v))
 		if (0 < v && v <= 32) {
 		if (0 < v && v <= 32) {
-			if (i->op == Oshr && 32-v <= wbits)
+			if (i->op == Oshr && w+v >= 32)
 				return 1;
 				return 1;
-			if (0 <= v && v < 32 && wbits < 32)
-				return defwidthle(fn, i->arg[0], uint_min((i->op == Osar ? 31 : 32), wbits+v));
+			if (w < 32) {
+				if (i->op == Osar)
+					w = min(31, w+v);
+				else
+					w = min(32, w+v);
+			}
 		}
 		}
-		return defwidthle(fn, i->arg[0], wbits);
+		return defwidthle(fn, i->arg[0], w);
 	}
 	}
 	if (iscmp(i->op, &x, &x))
 	if (iscmp(i->op, &x, &x))
-		return wbits >= 1;
-	if (i->op == Oand)
-		return defwidthle(fn, i->arg[0], wbits) || defwidthle(fn, i->arg[1], wbits);
-	if (i->op == Oor || i->op == Oxor)
-		return defwidthle(fn, i->arg[0], wbits) && defwidthle(fn, i->arg[1], wbits);
-	if (isext(i->op)) {
-		if (EXTSIGNED[i->op - Oextsb])
-			return defwidthle(fn, i->arg[0], uint_min(wbits, EXTMAXW[i->op - Oextsb]));
-		if (EXTW[i->op - Oextsb] <= wbits)
+		return w >= 1;
+	if (i->op == Oand) {
+		if (defwidthle(fn, i->arg[0], w)
+		|| defwidthle(fn, i->arg[1], w))
 			return 1;
 			return 1;
-		return defwidthle(fn, i->arg[0], wbits);
+		return 0;
+	}
+	if (i->op == Oor || i->op == Oxor) {
+		if (defwidthle(fn, i->arg[0], w)
+		&& defwidthle(fn, i->arg[1], w))
+			return 1;
+		return 0;
+	}
+	if (ext(i, &e)) {
+		if (e.zext && e.usew <= w)
+			return 1;
+		w = min(w, e.nopw);
+		return defwidthle(fn, i->arg[0], w);
 	}
 	}
-	return 0;
-}
 
 
-/* is the ref a boolean - 0, 1 - value? */
-int
-iswu1(Fn *fn, Ref r)
-{
-	return defwidthle(fn, r, 1);
+	return 0;
 }
 }
 
 
 static int
 static int
-isnarrowpar(Fn *fn, Ref r)
+isw1(Fn *fn, Ref r)
 {
 {
-	Tmp *t;
-
-	if (rtype(r) != RTmp)
-		return 0;
-	t = &fn->tmp[r.val];
-	if (t->bid != fn->start->id || t->def == 0)
-		return 0;
-	return ispar(t->def->op);
+	return defwidthle(fn, r, 1);
 }
 }
 
 
-/* Insert extub/extuh instructions in start for pars used only narrowly */
-/* needs use; breaks use */
+/* insert early extub/extuh instructions
+ * for pars used only narrowly; this
+ * helps factoring extensions out of
+ * loops
+ *
+ * needs use; breaks use
+ */
 void
 void
 narrowpars(Fn *fn)
 narrowpars(Fn *fn)
 {
 {
 	Blk *b;
 	Blk *b;
 	int loop;
 	int loop;
-	Ins *i, *ins;
+	Ins ext, *i, *ins;
 	uint npar, nins;
 	uint npar, nins;
-	enum O extop;
 	Ref r;
 	Ref r;
 
 
 	/* only useful for functions with loops */
 	/* only useful for functions with loops */
 	loop = 0;
 	loop = 0;
-	for (b = fn->start; b; b = b->link)
+	for (b=fn->start; b; b=b->link)
 		if (b->loop > 1) {
 		if (b->loop > 1) {
 			loop = 1;
 			loop = 1;
 			break;
 			break;
@@ -218,49 +220,47 @@ narrowpars(Fn *fn)
 		return;
 		return;
 
 
 	b = fn->start;
 	b = fn->start;
-	npar = 0;
 
 
-	for (i = b->ins; i < &b->ins[b->nins]; i++) {
+	npar = 0;
+	for (i=b->ins; i<&b->ins[b->nins]; i++) {
 		if (!ispar(i->op))
 		if (!ispar(i->op))
 			break;
 			break;
 		npar++;
 		npar++;
 	}
 	}
-
 	if (npar == 0)
 	if (npar == 0)
 		return;
 		return;
 
 
 	nins = b->nins + npar;
 	nins = b->nins + npar;
-	ins = vnew(nins, sizeof ins[0], PFn); //alloc(nins * sizeof ins[0]);
-	memcpy(ins, b->ins, npar * sizeof ins[0]);
-	memcpy(ins + 2*npar, b->ins + npar, (b->nins - npar) * sizeof ins[0]);
+	ins = vnew(nins, sizeof ins[0], PFn);
+	icpy(ins, b->ins, npar);
+	icpy(ins + 2*npar, b->ins+npar, b->nins-npar);
 	b->ins = ins;
 	b->ins = ins;
 	b->nins = nins;
 	b->nins = nins;
 
 
-	for (i = b->ins; i < &b->ins[b->nins]; i++) {
+	for (i=b->ins; i<&b->ins[b->nins]; i++) {
 		if (!ispar(i->op))
 		if (!ispar(i->op))
 			break;
 			break;
-		extop = Onop;
+		ext = (Ins){.op = Onop};
 		if (i->cls == Kw)
 		if (i->cls == Kw)
-			if (usewidthle(fn, i->to, 16)) {
-				if (usewidthle(fn, i->to, 8))
-					extop = Oextub;
-				else
-					extop = Oextuh;
-			}
-		if (extop == Onop) {
-			*(i+npar) = (Ins) {.op = Onop};
-		} else {
+		if (usewidthle(fn, i->to, 16)) {
+			ext.op = Oextuh;
+			if (usewidthle(fn, i->to, 8))
+				ext.op = Oextub;
 			r = newtmp("vw", i->cls, fn);
 			r = newtmp("vw", i->cls, fn);
-			*(i+npar) = (Ins) {.op = extop, .cls = i->cls, .to = i->to, .arg = {r}};
+			ext.cls = i->cls;
+			ext.to = i->to;
+			ext.arg[0] = r;
 			i->to = r;
 			i->to = r;
 		}
 		}
+		*(i+npar) = ext;
 	}
 	}
 }
 }
 
 
-/* used by GVN */
 Ref
 Ref
 copyref(Fn *fn, Blk *b, Ins *i)
 copyref(Fn *fn, Blk *b, Ins *i)
 {
 {
+	/* which extensions are copies for a given
+	 * argument width */
 	static bits extcpy[] = {
 	static bits extcpy[] = {
 		[WFull] = 0,
 		[WFull] = 0,
 		[Wsb] = BIT(Wsb) | BIT(Wsh) | BIT(Wsw),
 		[Wsb] = BIT(Wsb) | BIT(Wsh) | BIT(Wsw),
@@ -270,63 +270,133 @@ copyref(Fn *fn, Blk *b, Ins *i)
 		[Wsw] = BIT(Wsw),
 		[Wsw] = BIT(Wsw),
 		[Wuw] = BIT(Wuw),
 		[Wuw] = BIT(Wuw),
 	};
 	};
-	bits bext;
+	Ext e;
 	Tmp *t;
 	Tmp *t;
 	int64_t v;
 	int64_t v;
-	int is0;
+	int w, z;
 
 
 	if (i->op == Ocopy)
 	if (i->op == Ocopy)
 		return i->arg[0];
 		return i->arg[0];
 
 
 	/* op identity value */
 	/* op identity value */
-	if (optab[i->op].hasid)
-	if (KBASE(i->cls) == 0) /* integer only - fp NaN! */
-	if (req(i->arg[1], con01[optab[i->op].idval]))
-	if (!optab[i->op].cmpeqwl || iswu1(fn, i->arg[0]))
+	if (optab[i->op].hasid
+	&& KBASE(i->cls) == 0 /* integer only - fp NaN! */
+	&& req(i->arg[1], con01[optab[i->op].idval])
+	&& (!optab[i->op].cmpeqwl || isw1(fn, i->arg[0])))
 		return i->arg[0];
 		return i->arg[0];
 
 
 	/* idempotent op with identical args */
 	/* idempotent op with identical args */
-	if (optab[i->op].idemp)
-	if (req(i->arg[0], i->arg[1]))
+	if (optab[i->op].idemp
+	&& req(i->arg[0], i->arg[1]))
 		return i->arg[0];
 		return i->arg[0];
 
 
 	/* integer cmp with identical args */
 	/* integer cmp with identical args */
-	if (optab[i->op].cmpeqwl || optab[i->op].cmplgtewl)
-	if (req(i->arg[0], i->arg[1]))
+	if ((optab[i->op].cmpeqwl || optab[i->op].cmplgtewl)
+	&& req(i->arg[0], i->arg[1]))
 		return con01[optab[i->op].eqval];
 		return con01[optab[i->op].eqval];
 
 
-	/* cmpeq/ne 0 with 0/non-0 inference from dominating jnz */
-	if (optab[i->op].cmpeqwl)
-	if (req(i->arg[1], con01[0]))
-	if (is0non0(fn, b, i->arg[0], argcls(i,0), &is0))
-		return con01[optab[i->op].eqval^is0^1];
+	/* cmpeq/ne 0 with 0/non-0 inference */
+	if (optab[i->op].cmpeqwl
+	&& req(i->arg[1], CON_Z)
+	&& zeroval(fn, b, i->arg[0], argcls(i, 0), &z))
+		return con01[optab[i->op].eqval^z^1];
 
 
 	/* redundant and mask */
 	/* redundant and mask */
-	if (i->op == Oand)
-	if (isconbits(fn, i->arg[1], &v))
-	if (((v+1) & v) == 0) /* v == 2^N-1 */
-	if (defwidthle(fn, i->arg[0], u64_wbits(v)))
+	if (i->op == Oand
+	&& isconbits(fn, i->arg[1], &v)
+	&& (v > 0 && ((v+1) & v) == 0)
+	&& defwidthle(fn, i->arg[0], bitwidth(v)))
 		return i->arg[0];
 		return i->arg[0];
 
 
-	if (!isext(i->op) || rtype(i->arg[0]) != RTmp)
-		return R;
-	if (i->op == Oextsw || i->op == Oextuw)
-	if (i->cls == Kw)
+	if (i->cls == Kw
+	&& (i->op == Oextsw || i->op == Oextuw))
 		return i->arg[0];
 		return i->arg[0];
 
 
-	t = &fn->tmp[i->arg[0].val];
-	assert(KBASE(t->cls) == 0);
-	if (i->cls == Kl && t->cls == Kw)
+	if (ext(i, &e) && rtype(i->arg[0]) == RTmp) {
+		t = &fn->tmp[i->arg[0].val];
+		assert(KBASE(t->cls) == 0);
+
+		/* do not break typing by returning
+		 * a narrower temp */
+		if (KWIDE(i->cls) > KWIDE(t->cls))
+			return R;
+
+		w  = Wsb + (i->op - Oextsb);
+		if (BIT(w) & extcpy[t->width])
+			return i->arg[0];
+
+		/* avoid eliding extensions of params
+		 * inserted in the start block; their
+		 * point is to make further extensions
+		 * redundant */
+		if ((!t->def || !ispar(t->def->op))
+		&& usewidthle(fn, i->to, e.usew))
+			return i->arg[0];
+
+		if (defwidthle(fn, i->arg[0], e.nopw))
+			return i->arg[0];
+	}
+
+	return R;
+}
+
+static int
+phieq(Phi *pa, Phi *pb)
+{
+	Ref r;
+	uint n;
+
+	assert(pa->narg == pb->narg);
+	for (n=0; n<pa->narg; n++) {
+		r = phiarg(pb, pa->blk[n]);
+		if (!req(pa->arg[n], r))
+			return 0;
+	}
+	return 1;
+}
+
+Ref
+phicopyref(Fn *fn, Blk *b, Phi *p)
+{
+	Blk *d, **s;
+	Phi *p1;
+	uint n, c;
+
+	/* identical args */
+	for (n=0; n<p->narg-1; n++)
+		if (!req(p->arg[n], p->arg[n+1]))
+			break;
+	if (n == p->narg-1)
+		return p->arg[n];
+
+	/* same as a previous phi */
+	for (p1=b->phi; p1!=p; p1=p1->link) {
+		assert(p1);
+		if (phieq(p1, p))
+			return p1->to;
+	}
+
+	/* can be replaced by a
+	 * dominating jnz arg */
+	d = b->idom;
+	if (p->narg != 2
+	|| d->jmp.type != Jjnz
+	|| !isw1(fn, d->jmp.arg))
 		return R;
 		return R;
-	bext = extcpy[t->width];
-	if ((BIT(Wsb + (i->op-Oextsb)) & bext) != 0)
-		return i->arg[0];
 
 
-	if (!isnarrowpar(fn, i->arg[0]))
-	if (usewidthle(fn, i->to, EXTW[i->op - Oextsb]))
-		return i->arg[0];
-	if (defwidthle(fn, i->arg[0], EXTMAXW[i->op - Oextsb]))
-		return i->arg[0];
+	s = (Blk*[]){0, 0};
+	for (n=0; n<2; n++)
+		for (c=0; c<2; c++)
+			if (req(p->arg[n], con01[c]))
+				s[c] = p->blk[n];
+
+	/* if s1 ends with a jnz on either b
+	 * or s2; the inference below is wrong
+	 * without the jump type checks */
+	if (d->s1 == s[1] && d->s2 == s[0]
+	&& d->s1->jmp.type == Jjmp
+	&& d->s2->jmp.type == Jjmp)
+		return d->jmp.arg;
 
 
 	return R;
 	return R;
 }
 }

+ 1 - 1
fold.c

@@ -13,7 +13,7 @@ iscon(Con *c, int w, uint64_t k)
 		return (uint32_t)c->bits.i == (uint32_t)k;
 		return (uint32_t)c->bits.i == (uint32_t)k;
 }
 }
 
 
-static int
+int
 foldint(Con *res, int op, int w, Con *cl, Con *cr)
 foldint(Con *res, int op, int w, Con *cl, Con *cr)
 {
 {
 	union {
 	union {

+ 167 - 194
gcm.c

@@ -2,36 +2,35 @@
 
 
 #define NOBID (-1u)
 #define NOBID (-1u)
 
 
-/* ins can trap at runtime */
 static int
 static int
-istrapping(Fn *fn, Ins *i)
+isdivwl(Ins *i)
 {
 {
-	int64_t v;
-
-	if (KBASE(i->cls) == 0)
-	if (INRANGE(i->op, Odiv, Ourem))
-	if (!isconbits(fn, i->arg[1], &v) || v == 0)
-		return 1;
-	return 0;
+	switch (i->op) {
+	case Odiv:
+	case Orem:
+	case Oudiv:
+	case Ourem:
+		return KBASE(i->cls) == 0;
+	default:
+		return 0;
+	}
 }
 }
 
 
-/* fixed ins that can be eliminated if unused */
-static int
-canelim(Fn *fn, Ins *i)
+int
+pinned(Ins *i)
 {
 {
-	return isload(i->op) || isalloc(i->op) || istrapping(fn, i);
+	return optab[i->op].pinned || isdivwl(i);
 }
 }
 
 
-/* ins must stay in def blk */
-int
-isfixed(Fn *fn, Ins *i)
+/* pinned ins that can be eliminated if unused */
+static int
+canelim(Ins *i)
 {
 {
-	return optab[i->op].ispinned || istrapping(fn, i);
+	return isload(i->op) || isalloc(i->op) || isdivwl(i);
 }
 }
 
 
 static uint earlyins(Fn *, Blk *, Ins *);
 static uint earlyins(Fn *, Blk *, Ins *);
 
 
-/* return earlybid of ref def ins */
 static uint
 static uint
 schedearly(Fn *fn, Ref r)
 schedearly(Fn *fn, Ref r)
 {
 {
@@ -39,43 +38,39 @@ schedearly(Fn *fn, Ref r)
 	Blk *b;
 	Blk *b;
 
 
 	if (rtype(r) != RTmp)
 	if (rtype(r) != RTmp)
-		return 0 /* root/start */;
+		return 0;
 
 
 	t = &fn->tmp[r.val];
 	t = &fn->tmp[r.val];
 	if (t->gcmbid != NOBID)
 	if (t->gcmbid != NOBID)
-		return t->gcmbid; /* already visited/visiting */
+		return t->gcmbid;
 
 
 	b = fn->rpo[t->bid];
 	b = fn->rpo[t->bid];
 	if (t->def) {
 	if (t->def) {
-		/* def is an ins */
 		assert(b->ins <= t->def && t->def < &b->ins[b->nins]);
 		assert(b->ins <= t->def && t->def < &b->ins[b->nins]);
-		t->gcmbid = 0; /* mark visiting root/start blk */
-		t->gcmbid = earlyins(fn, b, t->def); /* schedule ins input defs */
+		t->gcmbid = 0;  /* mark as visiting */
+		t->gcmbid = earlyins(fn, b, t->def);
 	} else {
 	} else {
-		/* def is a phi - it stays in its def blk */
+		/* phis do not move */
 		t->gcmbid = t->bid;
 		t->gcmbid = t->bid;
 	}
 	}
 
 
 	return t->gcmbid;
 	return t->gcmbid;
 }
 }
 
 
-/* return deepest domdpth bid of arg defs */
 static uint
 static uint
-earlyins(Fn *fn, Blk *b, Ins* i)
+earlyins(Fn *fn, Blk *b, Ins *i)
 {
 {
-	uint earlybid, arg1earlybid;
-
-	earlybid = schedearly(fn, i->arg[0]);
-	assert(earlybid != NOBID);
-	arg1earlybid = schedearly(fn, i->arg[1]);
-	assert(arg1earlybid != NOBID);
-	if (fn->rpo[earlybid]->domdpth < fn->rpo[arg1earlybid]->domdpth) {
-		assert(dom(fn->rpo[earlybid], fn->rpo[arg1earlybid]));
-		earlybid = arg1earlybid;
+	uint b0, b1;
+
+	b0 = schedearly(fn, i->arg[0]);
+	assert(b0 != NOBID);
+	b1 = schedearly(fn, i->arg[1]);
+	assert(b1 != NOBID);
+	if (fn->rpo[b0]->depth < fn->rpo[b1]->depth) {
+		assert(dom(fn->rpo[b0], fn->rpo[b1]));
+		b0 = b1;
 	}
 	}
-
-	/* fixed ins remain in their defining blk */
-	return isfixed(fn, i) ? b->id : earlybid;
+	return pinned(i) ? b->id : b0;
 }
 }
 
 
 static void
 static void
@@ -87,12 +82,14 @@ earlyblk(Fn *fn, uint bid)
 	uint n;
 	uint n;
 
 
 	b = fn->rpo[bid];
 	b = fn->rpo[bid];
-	for (p = b->phi; p; p = p->link)
-		for (n = 0; n < p->narg; n++)
+	for (p=b->phi; p; p=p->link)
+		for (n=0; n<p->narg; n++)
 			schedearly(fn, p->arg[n]);
 			schedearly(fn, p->arg[n]);
-	for (i = b->ins; i < &b->ins[b->nins]; i++)
-		if (isfixed(fn, i))
-			earlyins(fn, b, i);
+	for (i=b->ins; i<&b->ins[b->nins]; i++)
+		if (pinned(i)) {
+			schedearly(fn, i->arg[0]);
+			schedearly(fn, i->arg[1]);
+		}
 	schedearly(fn, b->jmp.arg);
 	schedearly(fn, b->jmp.arg);
 }
 }
 
 
@@ -154,16 +151,17 @@ schedlate(Fn *fn, Ref r)
 
 
 	t = &fn->tmp[r.val];
 	t = &fn->tmp[r.val];
 	if (t->visit)
 	if (t->visit)
-		return t->gcmbid; /* already visited/visiting */
+		return t->gcmbid;
 
 
-	t->visit = 1; /* mark visiting/visited */
+	t->visit = 1;
 	earlybid = t->gcmbid;
 	earlybid = t->gcmbid;
 	if (earlybid == NOBID)
 	if (earlybid == NOBID)
 		return NOBID; /* not used */
 		return NOBID; /* not used */
-	t->gcmbid = t->bid; /* t->gcmbid is now latebid */
 
 
+	/* reuse gcmbid for late bid */
+	t->gcmbid = t->bid;
 	latebid = NOBID;
 	latebid = NOBID;
-	for (u = t->use; u < &t->use[t->nuse]; u++) {
+	for (u=t->use; u<&t->use[t->nuse]; u++) {
 		assert(u->bid < fn->nblk);
 		assert(u->bid < fn->nblk);
 		b = fn->rpo[u->bid];
 		b = fn->rpo[u->bid];
 		switch (u->type) {
 		switch (u->type) {
@@ -182,39 +180,37 @@ schedlate(Fn *fn, Ref r)
 		}
 		}
 		latebid = lcabid(fn, latebid, uselatebid);
 		latebid = lcabid(fn, latebid, uselatebid);
 	}
 	}
+	/* latebid may be NOBID if the temp is used
+	 * in fixed instructions that may be eliminated
+	 * and are themselves unused transitively */
 
 
-	/* phis stay in their def blk */
-	if (t->def) {
-		/* allow elimination of unused load/alloc/trapping ins */
-		if (latebid == NOBID && canelim(fn, t->def))
-			t->gcmbid = NOBID;
-		/* ... otherwise fixed ins stay in defining blk */
-		else if(!isfixed(fn, t->def))
-			t->gcmbid = bestbid(fn, earlybid, latebid);
-	}
+	if (t->def && !pinned(t->def))
+		t->gcmbid = bestbid(fn, earlybid, latebid);
+	/* else, keep the early one */
 
 
+	/* now, gcmbid is the best bid */
 	return t->gcmbid;
 	return t->gcmbid;
 }
 }
 
 
-/* return lca bid of uses, or NOBID if no active uses */
+/* returns lca bid of uses or NOBID if
+ * the definition can be eliminated */
 static uint
 static uint
 lateins(Fn *fn, Blk *b, Ins *i, Ref r)
 lateins(Fn *fn, Blk *b, Ins *i, Ref r)
 {
 {
 	uint latebid;
 	uint latebid;
 
 
-	assert(i->op == Onop || req(i->arg[0], r) || req(i->arg[1], r));
-	if (i->op == Onop)
-		return NOBID; /* already eliminated */
-
 	assert(b->ins <= i && i < &b->ins[b->nins]);
 	assert(b->ins <= i && i < &b->ins[b->nins]);
+	assert(req(i->arg[0], r) || req(i->arg[1], r));
 
 
 	latebid = schedlate(fn, i->to);
 	latebid = schedlate(fn, i->to);
-	/* allow elimination of unused load/alloc/trapping ins */
-	if (latebid == NOBID)
-	if (canelim(fn, i))
-		return NOBID;
-	/* ... otherwise fixed ins stay in defining blk */
-	return isfixed(fn, i) ? b->id : latebid;
+	if (pinned(i)) {
+		if (latebid == NOBID)
+		if (canelim(i))
+			return NOBID;
+		return b->id;
+	}
+
+	return latebid;
 }
 }
 
 
 static uint
 static uint
@@ -254,17 +250,16 @@ lateblk(Fn *fn, uint bid)
 	Ins *i;
 	Ins *i;
 
 
 	b = fn->rpo[bid];
 	b = fn->rpo[bid];
-	for (pp = &b->phi; *(pp);)
+	for (pp=&b->phi; *(pp);)
 		if (schedlate(fn, (*pp)->to) == NOBID) {
 		if (schedlate(fn, (*pp)->to) == NOBID) {
-			/* unused */
 			(*pp)->narg = 0; /* mark unused */
 			(*pp)->narg = 0; /* mark unused */
 			*pp = (*pp)->link; /* remove phi */
 			*pp = (*pp)->link; /* remove phi */
 		} else
 		} else
 			pp = &(*pp)->link;
 			pp = &(*pp)->link;
 
 
-	for (i = b->ins; i < &b->ins[b->nins]; i++)
-		if (isfixed(fn, i))
-			lateins(fn, b, i, i->arg[0]);
+	for (i=b->ins; i<&b->ins[b->nins]; i++)
+		if (pinned(i))
+			schedlate(fn, i->to);
 }
 }
 
 
 static void
 static void
@@ -274,7 +269,7 @@ addgcmins(Fn *fn, Ins *vins, uint nins)
 	Tmp *t;
 	Tmp *t;
 	Blk *b;
 	Blk *b;
 
 
-	for (i = vins; i < &vins[nins]; i++) {
+	for (i=vins; i<&vins[nins]; i++) {
 		assert(rtype(i->to) == RTmp);
 		assert(rtype(i->to) == RTmp);
 		t = &fn->tmp[i->to.val];
 		t = &fn->tmp[i->to.val];
 		b = fn->rpo[t->gcmbid];
 		b = fn->rpo[t->gcmbid];
@@ -282,9 +277,10 @@ addgcmins(Fn *fn, Ins *vins, uint nins)
 	}
 	}
 }
 }
 
 
-/* remove unused instructions */
-/* move instructions to (end of) target block */
-/* use-before-def is fixed up afterwards */
+/* move live instructions to the
+ * end of their target block; use-
+ * before-def errors are fixed by
+ * schedblk */
 static void
 static void
 gcmmove(Fn *fn)
 gcmmove(Fn *fn)
 {
 {
@@ -295,13 +291,13 @@ gcmmove(Fn *fn)
 	nins = 0;
 	nins = 0;
 	vins = vnew(nins, sizeof vins[0], PFn);
 	vins = vnew(nins, sizeof vins[0], PFn);
 
 
-	for (t=&fn->tmp[Tmp0]; t < &fn->tmp[fn->ntmp]; t++) {
+	for (t=fn->tmp; t<&fn->tmp[fn->ntmp]; t++) {
 		if (t->def == 0)
 		if (t->def == 0)
 			continue;
 			continue;
 		if (t->bid == t->gcmbid)
 		if (t->bid == t->gcmbid)
 			continue;
 			continue;
 		i = t->def;
 		i = t->def;
-		if (isfixed(fn, i) && !canelim(fn, i))
+		if (pinned(i) && !canelim(i))
 			continue;
 			continue;
 		assert(rtype(i->to) == RTmp);
 		assert(rtype(i->to) == RTmp);
 		assert(t == &fn->tmp[i->to.val]);
 		assert(t == &fn->tmp[i->to.val]);
@@ -309,176 +305,153 @@ gcmmove(Fn *fn)
 			addins(&vins, &nins, i);
 			addins(&vins, &nins, i);
 		*i = (Ins){.op = Onop};
 		*i = (Ins){.op = Onop};
 	}
 	}
-
 	addgcmins(fn, vins, nins);
 	addgcmins(fn, vins, nins);
 }
 }
 
 
-static void
-schedins(Fn *fn, Blk *b, Ins *i0, Ins **pvins, uint *pnins)
+/* dfs ordering */
+static Ins *
+schedins(Fn *fn, Blk *b, Ins *i, Ins **pvins, uint *pnins)
 {
 {
-	Ins *i, *i1;
+	Ins *i0, *i1;
 	Tmp *t;
 	Tmp *t;
-	uint na;
+	uint n;
 
 
-	if (i0->op == Onop)
-		return;
-	/* arg...call have to stay together */
-	/* TODO - sel0...sel1 too */
-	for (i1 = i0; isarg(i1->op); i1++) {}
-	for (i = i0; i <= i1; i++)
-		for (na = 0; na < 2; na++) {
-			if (rtype(i->arg[na]) != RTmp)
-				continue;
-			t = &fn->tmp[i->arg[na].val];
-			/* def in different blk, or phi in this blk */
-			if (t->bid != b->id || t->def == 0)
+	igroup(b, i, &i0, &i1);
+	for (i=i0; i<i1; i++)
+		for (n=0; n<2; n++) {
+			if (rtype(i->arg[n]) != RTmp)
 				continue;
 				continue;
-			/* already scheduled */
-			if (t->def->op == Onop) {
+			t = &fn->tmp[i->arg[n].val];
+			if (t->bid != b->id || !t->def)
 				continue;
 				continue;
-			}
 			schedins(fn, b, t->def, pvins, pnins);
 			schedins(fn, b, t->def, pvins, pnins);
 		}
 		}
-	for (i = i0; i <= i1; i++) {
+	for (i=i0; i<i1; i++) {
 		addins(pvins, pnins, i);
 		addins(pvins, pnins, i);
 		*i = (Ins){.op = Onop};
 		*i = (Ins){.op = Onop};
 	}
 	}
+	return i1;
 }
 }
 
 
+/* order ins within a block */
 static void
 static void
-fixbub4d(Fn *fn, Blk *b, Ins **pvins)
-{
-	Ins *i;
-	uint nins;
-
-	nins = 0;
-	for (i = b->ins; i < &b->ins[b->nins]; i++)
-		schedins(fn, b, i, pvins, &nins);
-	idup(b, *pvins, nins);
-}
-
-static void
-fixub4d(Fn *fn)
+schedblk(Fn *fn)
 {
 {
 	Blk *b;
 	Blk *b;
-	Ins *vins; // TODO insb
+	Ins *i, *vins;
+	uint nins;
 
 
-	vins = vnew(0, sizeof vins[0], PFn);
-	for (b = fn->start; b; b = b->link)
-		fixbub4d(fn, b, &vins);
+	vins = vnew(0, sizeof vins[0], PHeap);
+	for (b=fn->start; b; b=b->link) {
+		nins = 0;
+		for (i=b->ins; i<&b->ins[b->nins];)
+			i = schedins(fn, b, i, &vins, &nins);
+		idup(b, vins, nins);
+	}
+	vfree(vins);
 }
 }
 
 
 static int
 static int
-iskladdcon(Fn *fn, Ins *i, int64_t *v)
+cheap(Ins *i)
 {
 {
-	if (i->op == Oadd)
-	if (i->cls == Kl)
-	if (rtype(i->arg[0]) == RTmp)
-	if (isconbits(fn, i->arg[1], v))
-		return 1;
-	return 0;
-}
-
-static void
-ireassoc(Fn *fn, Blk *b, Ins *i, Ins **pvins, uint *pnins) 
-{
-	Blk *b2;
-	Tmp *t, *t2;
-	Use *u;
-	Ref r2;
-	int64_t v;
 	int x;
 	int x;
 
 
-	assert(b->ins <= i && i < &b->ins[b->nins]);
-	if (!iscmp(i->op, &x, &x))
-	if (!iskladdcon(fn, i, &v))
-		return;
-
-	assert(rtype(i->to) == RTmp);
-	t = &fn->tmp[i->to.val];
-	for (u = t->use; u < &t->use[t->nuse]; u++) {
-		if (u->type == UPhi)
-			continue;
-		b2 = fn->rpo[u->bid];
-		addins(pvins, pnins, i);
-		/* careful, can move fn->tmp */
-		r2 = newtmp("rea", t->cls, fn);
-		t = &fn->tmp[i->to.val];
-		t2 = &fn->tmp[r2.val];
-		t2->gcmbid = u->bid;
-		(*pvins)[(*pnins)-1].to = r2;
-		if (u->type == UIns) {
-			assert(req(u->u.ins->arg[0], i->to)
-			       || req(u->u.ins->arg[1], i->to));
-			if (req(u->u.ins->arg[0], i->to))
-				u->u.ins->arg[0] = r2;
-			if (req(u->u.ins->arg[1], i->to))
-				u->u.ins->arg[1] = r2;
-		} else {
-			assert(u->type == UJmp);
-			assert(req(b2->jmp.arg, i->to));
-			b2->jmp.arg = r2;
-		}
+	if (KBASE(i->cls) != 0)
+		return 0;
+	switch (i->op) {
+	case Oneg:
+	case Oadd:
+	case Osub:
+	case Omul:
+	case Oand:
+	case Oor:
+	case Oxor:
+	case Osar:
+	case Oshr:
+	case Oshl:
+		return 1;
+	default:
+		return iscmp(i->op, &x, &x);
 	}
 	}
 }
 }
 
 
-/* Redistribute trivial ops to point of use. */
-/* Reduces register pressure. */
-/* needs rpo, use; breaks use */
-void
-reassoc(Fn *fn)
+static void
+sinkref(Fn *fn, Blk *b, Ref *pr)
 {
 {
-	Blk *b;
-	Ins *vins, *i;
-	uint nins;
-
-	nins = 0;
-	vins = vnew(nins, sizeof vins[0], PFn);
+	Ins i;
+	Tmp *t;
+	Ref r;
 
 
-	/* identify trivial ins */
-	for (b = fn->start; b; b = b->link) {
-		for (i = b->ins; i < &b->ins[b->nins]; i++)
-			ireassoc(fn, b, i, &vins, &nins);
-	}
+	if (rtype(*pr) != RTmp)
+		return;
+	t = &fn->tmp[pr->val];
+	if (!t->def
+	|| t->bid == b->id
+	|| pinned(t->def)
+	|| !cheap(t->def))
+		return;
 
 
-	addgcmins(fn, vins, nins);
+	/* sink t->def to b */
+	i = *t->def;
+	r = newtmp("snk", t->cls, fn);
+	t = 0;  /* invalidated */
+	*pr = r;
+	i.to = r;
+	fn->tmp[r.val].gcmbid = b->id;
+	emiti(i);
+	sinkref(fn, b, &i.arg[0]);
+	sinkref(fn, b, &i.arg[1]);
 }
 }
 
 
+/* redistribute trivial ops to point of
+ * use to reduce register pressure
+ * requires rpo, use; breaks use
+ */
 static void
 static void
-cleartmps(Fn *fn)
+sink(Fn *fn)
 {
 {
-	Tmp *t;
+	Blk *b;
+	Ins *i;
 
 
-	for (t=&fn->tmp[Tmp0]; t < &fn->tmp[fn->ntmp]; t++) {
-		t->visit = 0;
-		t->gcmbid = NOBID;
+	for (b=fn->start; b; b=b->link) {
+		for (i=b->ins; i<&b->ins[b->nins]; i++)
+			if (isload(i->op))
+				sinkref(fn, b, &i->arg[0]);
+			else if (isstore(i->op))
+				sinkref(fn, b, &i->arg[1]);
+		sinkref(fn, b, &b->jmp.arg);
 	}
 	}
+	addgcmins(fn, curi, &insb[NIns] - curi);
 }
 }
 
 
-/* https://courses.cs.washington.edu/courses/cse501/06wi/reading/click-pldi95.pdf */
-/* require use dom */
-/* maintains rpo pred dom */
-/* breaks use */
+/* requires use dom
+ * maintains rpo pred dom
+ * breaks use
+ */
 void
 void
 gcm(Fn *fn)
 gcm(Fn *fn)
 {
 {
+	Tmp *t;
 	uint bid;
 	uint bid;
 
 
-	filldomdpth(fn);
+	filldepth(fn);
 	fillloop(fn);
 	fillloop(fn);
 
 
-	cleartmps(fn);
+	for (t=fn->tmp; t<&fn->tmp[fn->ntmp]; t++) {
+		t->visit = 0;
+		t->gcmbid = NOBID;
+	}
 	for (bid=0; bid<fn->nblk; bid++)
 	for (bid=0; bid<fn->nblk; bid++)
 		earlyblk(fn, bid);
 		earlyblk(fn, bid);
 	for (bid=0; bid<fn->nblk; bid++)
 	for (bid=0; bid<fn->nblk; bid++)
 		lateblk(fn, bid);
 		lateblk(fn, bid);
 
 
 	gcmmove(fn);
 	gcmmove(fn);
-	cleartmps(fn); /* filluse() uses visit */
 	filluse(fn);
 	filluse(fn);
-	reassoc(fn);
+	curi = &insb[NIns];
+	sink(fn);
 	filluse(fn);
 	filluse(fn);
-	fixub4d(fn);
+	schedblk(fn);
 	
 	
 	if (debug['G']) {
 	if (debug['G']) {
 		fprintf(stderr, "\n> After GCM:\n");
 		fprintf(stderr, "\n> After GCM:\n");

+ 218 - 462
gvn.c

@@ -1,6 +1,5 @@
 #include "all.h"
 #include "all.h"
 
 
-/* literal constants 0, 1 */
 Ref con01[2];
 Ref con01[2];
 
 
 static inline uint
 static inline uint
@@ -18,121 +17,91 @@ rhash(Ref r)
 static uint
 static uint
 ihash(Ins *i)
 ihash(Ins *i)
 {
 {
-	uint a0h, a1h, ah, h;
+	uint h;
 
 
-	a0h = rhash(i->arg[0]);
-	a1h = rhash(i->arg[1]);
-	ah = mix(a0h, a1h);
-	h = mix(i->cls, i->op);
-	h = mix(h, ah);
+	h = mix(i->op, i->cls);
+	h = mix(h, rhash(i->arg[0]));
+	h = mix(h, rhash(i->arg[1]));
 
 
 	return h;
 	return h;
-
 }
 }
 
 
 static int
 static int
 ieq(Ins *ia, Ins *ib)
 ieq(Ins *ia, Ins *ib)
 {
 {
-	if (ia->cls == ib->cls)
 	if (ia->op == ib->op)
 	if (ia->op == ib->op)
+	if (ia->cls == ib->cls)
 	if (req(ia->arg[0], ib->arg[0]))
 	if (req(ia->arg[0], ib->arg[0]))
 	if (req(ia->arg[1], ib->arg[1]))
 	if (req(ia->arg[1], ib->arg[1]))
 		return 1;
 		return 1;
 	return 0;
 	return 0;
 }
 }
 
 
-static Ins** gvntbl;
+static Ins **gvntbl;
 static uint gvntbln;
 static uint gvntbln;
-static uint lookupn;
-static uint proben;
-static uint maxproben;
 
 
 static Ins *
 static Ins *
-gvndup(Ins *i, int insert) {
+gvndup(Ins *i, int insert)
+{
 	uint idx, n;
 	uint idx, n;
-	Ins *i2;
-
-	lookupn++;
+	Ins *ii;
 
 
 	idx = ihash(i) % gvntbln;
 	idx = ihash(i) % gvntbln;
-	for (n = 1;; n++) {
-		proben++;
-		if (n > maxproben)
-			maxproben = n;
-		i2 = gvntbl[idx];
-		if (!i2)
+	for (n=1;; n++) {
+		ii = gvntbl[idx];
+		if (!ii)
 			break;
 			break;
-		if (ieq(i, i2))
-			return i2;
+		if (ieq(i, ii))
+			return ii;
 
 
 		idx++;
 		idx++;
 		if (gvntbln <= idx)
 		if (gvntbln <= idx)
 			idx = 0;
 			idx = 0;
 	}
 	}
-	/* not found */
-	if (insert) {
+	if (insert)
 		gvntbl[idx] = i;
 		gvntbl[idx] = i;
-	}
 	return 0;
 	return 0;
 }
 }
 
 
 static void
 static void
-replaceref(Ref *r, Ref r1, Ref r2)
-{
-	if (req(*r, r1))
-		*r = r2;
-}
-
-static void
-replacepuse(Phi *p, Ref r1, Ref r2)
-{
-	Ref *a;
-
-	for (a = p->arg; a < &p->arg[p->narg]; a++)
-		replaceref(a, r1, r2);
-}
-
-static void
-replaceiuse(Ins *i, Ref r1, Ref r2)
-{
-	replaceref(&i->arg[0], r1, r2);
-	replaceref(&i->arg[1], r1, r2);
-}
-
-static void
-replacejuse(Blk* b, Ref r1, Ref r2)
-{
-	replaceref(&b->jmp.arg, r1, r2);
-}
-
-static void
-replaceuse(Fn *fn, Use* u, Ref r1, Ref r2)
+replaceuse(Fn *fn, Use *u, Ref r1, Ref r2)
 {
 {
-	Tmp *t2;
 	Blk *b;
 	Blk *b;
+	Ins *i;
+	Phi *p;
+	Ref *pr;
+	Tmp *t2;
+	int n;
 
 
-	t2 = rtype(r2) == RTmp ? &fn->tmp[r2.val] : 0;
+	t2 = 0;
+	if (rtype(r2) == RTmp)
+		t2 = &fn->tmp[r2.val];
 	b = fn->rpo[u->bid];
 	b = fn->rpo[u->bid];
-
 	switch (u->type) {
 	switch (u->type) {
-	case UXXX:
-		die("unreachable");
-		break;
 	case UPhi:
 	case UPhi:
-		replacepuse(u->u.phi, r1, r2);
+		p = u->u.phi;
+		for (pr=p->arg; pr<&p->arg[p->narg]; pr++)
+			if (req(*pr, r1))
+				*pr = r2;
 		if (t2)
 		if (t2)
-			adduse(t2, UPhi, b, u->u.phi);
+			adduse(t2, UPhi, b, p);
 		break;
 		break;
 	case UIns:
 	case UIns:
-		replaceiuse(u->u.ins, r1, r2);
+		i = u->u.ins;
+		for (n=0; n<2; n++)
+			if (req(i->arg[n], r1))
+				i->arg[n] = r2;
 		if (t2)
 		if (t2)
-			adduse(t2, UIns, b, u->u.ins);
+			adduse(t2, UIns, b, i);
 		break;
 		break;
 	case UJmp:
 	case UJmp:
-		replacejuse(fn->rpo[u->bid], r1, r2);
+		if (req(b->jmp.arg, r1))
+			b->jmp.arg = r2;
 		if (t2)
 		if (t2)
 			adduse(t2, UJmp, b);
 			adduse(t2, UJmp, b);
 		break;
 		break;
+	case UXXX:
+		die("unreachable");
 	}
 	}
 }
 }
 
 
@@ -144,216 +113,33 @@ replaceuses(Fn *fn, Ref r1, Ref r2)
 
 
 	assert(rtype(r1) == RTmp);
 	assert(rtype(r1) == RTmp);
 	t1 = &fn->tmp[r1.val];
 	t1 = &fn->tmp[r1.val];
-
-	for (u = t1->use; u < &t1->use[t1->nuse]; u++)
+	for (u=t1->use; u<&t1->use[t1->nuse]; u++)
 		replaceuse(fn, u, r1, r2);
 		replaceuse(fn, u, r1, r2);
-
 	t1->nuse = 0;
 	t1->nuse = 0;
 }
 }
 
 
 static void
 static void
-rmuse(Fn *fn, Blk *b, uint ty, Ins *i, Phi *p, Ref r, int strict)
-{
-	Tmp *t;
-	Use *u;
-	int found, rm;
-
-	if (rtype(r) != RTmp)
-		return;
-	found = 0;
-	t = &fn->tmp[r.val];
-	for (u = t->use; u < &t->use[t->nuse];) {
-		rm = 0;
-		if (u->type == ty) {
-			switch (ty) {
-			case UXXX:
-				die("unreachable");
-				break;
-			case UIns:
-				assert(p == 0);
-				assert(i);
-				rm = u->u.ins == i;
-				break;
-			case UPhi:
-				assert(i == 0);
-				assert(p);
-				rm = u->u.phi == p;
-				break;
-			case UJmp:
-				assert(i == 0);
-				assert(p == 0);
-				rm = u->bid == b->id;
-				break;
-			default:
-				die("unreachable");
-				break;
-			}
-		}
-		if (rm) {
-			found++;
-			assert(u < &t->use[t->nuse]);
-			assert(u->bid == b->id);
-			/* careful - deleting below iterator */
-			memcpy(u, u+1, ((t->nuse) - ((u+1)-t->use)) * sizeof u[0]);
-			t->nuse--;
-		} else
-			u++;
-	}
-	if (strict)
-		assert(found);
-}
-
-static int
-phieq(Phi *pa, Phi *pb)
-{
-	uint n;
-
-	assert(pa->narg == pb->narg);
-	for (n=0; n<pa->narg; n++) {
-		if (!req(pa->arg[n], phiarg(pb, pa->blk[n])))
-			return 0;
-	}
-	return 1;
-}
-
-static void
-killphi(Fn *fn, Blk *b, Phi **pp, Ref r)
-{
-	Phi *p;
-	uint n;
-
-	p = *pp;
-	replaceuses(fn, p->to, r);
-	assert(b->npred == p->narg);
-	for (n = 0; n < p->narg; n++)
-		rmuse(fn, b, UPhi, 0, p, p->arg[n], 0/*!strict TODO dups*/);
-	p->narg = 0; /* mark as unused - TODO should not be necessary with rmuse*/
-	*pp = p->link;
-}
-
-static void
-ifphiopt(Fn *fn, Blk *b)
+dedupphi(Fn *fn, Blk *b)
 {
 {
-	Blk *ifb, *thenb, *elseb, *tmp;
-	Phi *p, **pp;
-	Tmp *t;
-	Ref argt, argf, r1;
-	int k, x, caninv;
-
-	/* is b the join blk of an if[-then][-else] graphlet? */
-	if (!ifjoin(b, &ifb, &thenb, &elseb))
-		return;
-
-	assert(ifb->jmp.type == Jjnz);
-	assert(ifb->s1 == b || (ifb->s1 == thenb && thenb->s1 == b));
-	assert(ifb->s2 == b || (ifb->s2 == elseb && elseb->s1 == b));
-	if (!iswu1(fn, ifb->jmp.arg))
-		return; /* no opportunity */
-
-	caninv = 1;
-	for (p = b->phi; p; p = p->link)
-		if (req(phiarg(p, elseb), con01[0])) {
-			caninv = 0;
-			break;
-		}
-
-	/* phi bool 1/0 "copy" */
-	for (pp = &b->phi; *pp;) {
-		p = *pp;
-		assert(p->narg == 2);
-		argt = phiarg(p, thenb);
-		argf = phiarg(p, elseb);
-		/* jnz jmp.arg in phi is 1/0 */
-		if (req(argt, ifb->jmp.arg)) {
-			if (!req(argf, ifb->jmp.arg))
-				rmuse(fn, b, UPhi, 0, p, argt, 1/*strict*/);
-			argt = p->arg[phiargn(p, thenb)] = con01[1];
-		}
-		if (req(argf, ifb->jmp.arg)) {
-			rmuse(fn, b, UPhi, 0, p, argf, 1/*strict*/);
-			argf = p->arg[phiargn(p, elseb)] = con01[0];
-		}
-		/* prefer 0 as argf and/or 1 as argt, so try to invert the cmp */
-		if (caninv &&
-		    ((req(argt, con01[0]) && !req(argf, con01[0])) ||
-		     (req(argf, con01[1]) && !req(argt, con01[1])))) {
-			assert(rtype(ifb->jmp.arg) == RTmp);
-			t = &fn->tmp[ifb->jmp.arg.val];
-			if (t->nuse == 1)
-			if (t->def && iscmp(t->def->op, &k, &x) && KBASE(k) == 0) {
-				assert(t->use[0].type == UJmp);
-				assert(t->bid == ifb->id);
-				t->def->op = invcmpwl(t->def->op);
-				tmp = ifb->s1;
-				ifb->s1 = ifb->s2;
-				ifb->s2 = tmp;
-				r1 = argt; argt = argf; argf = r1;
-				caninv = 0; /* only once */
-			}
-		}
-		if (req(argt, con01[1]) && req(argf, con01[0])) {
-			killphi(fn, b, pp, ifb->jmp.arg);
-			caninv = 0; /* used already */
-			continue;
-		}
-		pp = &(*pp)->link;
-	}
-}
-
-/* phi "copy" - singleton or all args identical */
-static void
-copyphielim(Fn *fn, Blk *b) {
 	Phi *p, **pp;
 	Phi *p, **pp;
-	uint n;
-
-	for (pp = &b->phi; *pp;) {
-		p = *pp;
-		assert(p->narg == b->npred);
-		for (n = 0; n < p->narg-1; n++) {
-			if (!req(p->arg[n], p->arg[n+1]))
-				goto Skip;
-		}
-		killphi(fn, b, pp, p->arg[0]);
-		continue;
-	Skip:;
-		pp = &(*pp)->link;
-	}
-}
+	Ref r;
 
 
-/* redundant phis */
-static void
-dupphielim(Fn *fn, Blk *b) {
-	Phi *p, *p2, **pp;
-
-	/* O(N^2.M) but meh */
-	for (pp = &b->phi; *pp;) {
-		p = *pp;
-		assert(p->narg == b->npred);
-		for (p2 = p->link; p2; p2 = p2->link) {
-			assert(p2->narg == b->npred);
-			if (phieq(p, p2)) {
-				killphi(fn, b, pp, p2->to);
-				goto Skip;
-			}
-		}
-		pp = &(*pp)->link;
-	Skip:;
+	for (pp=&b->phi; (p=*pp);) {
+		r = phicopyref(fn, b, p);
+		if (!req(r, R)) {
+			replaceuses(fn, p->to, r);
+			p->to = R;
+			*pp = p->link;
+		} else
+			pp = &p->link;
 	}
 	}
-	return;
-}
-
-static void
-dedupphis(Fn *fn, Blk *b) {
-	ifphiopt(fn, b);
-	copyphielim(fn,b);
-	dupphielim(fn, b);
 }
 }
 
 
 static int
 static int
 rcmp(Ref a, Ref b)
 rcmp(Ref a, Ref b)
 {
 {
 	if (rtype(a) != rtype(b))
 	if (rtype(a) != rtype(b))
-		return rtype(a)-rtype(b);
+		return rtype(a) - rtype(b);
 	return a.val - b.val;
 	return a.val - b.val;
 }
 }
 
 
@@ -364,14 +150,17 @@ normins(Fn *fn, Ins *i)
 	int64_t v;
 	int64_t v;
 	Ref r;
 	Ref r;
 
 
-	/* truncate constant bits to 32 bits for "s"/w" uses */
-	for (n = 0; n < 2; n++) {
+	/* truncate constant bits to
+	 * 32 bits for s/w uses */
+	for (n=0; n<2; n++) {
 		if (!KWIDE(argcls(i, n)))
 		if (!KWIDE(argcls(i, n)))
 		if (isconbits(fn, i->arg[n], &v))
 		if (isconbits(fn, i->arg[n], &v))
 		if ((v & 0xffffffff) != v)
 		if ((v & 0xffffffff) != v)
 			i->arg[n] = getcon(v & 0xffffffff, fn);
 			i->arg[n] = getcon(v & 0xffffffff, fn);
 	}
 	}
-	/* order arg[0] <= arg[1] for commutative ops, preferring RTmp in arg[0] */
+	/* order arg[0] <= arg[1] for
+	 * commutative ops, preferring
+	 * RTmp in arg[0] */
 	if (optab[i->op].commutes)
 	if (optab[i->op].commutes)
 	if (rcmp(i->arg[0], i->arg[1]) > 0) {
 	if (rcmp(i->arg[0], i->arg[1]) > 0) {
 		r = i->arg[1];
 		r = i->arg[1];
@@ -380,140 +169,106 @@ normins(Fn *fn, Ins *i)
 	}
 	}
 }
 }
 
 
-static Ref
-negcon(Fn *fn, int cls, Ref r)
+static int
+negcon(int cls, Con *c)
 {
 {
-	int64_t v, v1;
-
-	assert(isconbits(fn, r, &v));
-	assert(KBASE(cls) == 0);
-	v1 = -v;
-	if (cls == Kw)
-		v1 = ((int64_t)(-(int32_t)v)) & 0xffffffff;
-	return getcon(v1, fn);
+	static Con z = {.type = CBits, .bits.i = 0};
+
+	return foldint(c, Osub, cls, &z, c);
 }
 }
 
 
 static void
 static void
 assoccon(Fn *fn, Blk *b, Ins *i1)
 assoccon(Fn *fn, Blk *b, Ins *i1)
 {
 {
 	Tmp *t2;
 	Tmp *t2;
-	Ins *i2, i;
-	Ref r0, r1, rc;
-	int op1, op2;
-	int64_t v1, v2, vc;
-
-	op1 = i1->op;
-	if (op1 == Osub)
-		op1 = Oadd;
-	if (!optab[op1].assoc || KBASE(i1->cls) != 0 || rtype(i1->arg[0]) != RTmp
-	    || !isconbits(fn, i1->arg[1], &v1) || (optab[op1].hasid && v1 == optab[op1].idval))
+	Ins *i2;
+	int op, fail;
+	Con c, c1, c2;
+
+	op = i1->op;
+	if (op == Osub)
+		op = Oadd;
+
+	if (!optab[op].assoc
+	|| KBASE(i1->cls) != 0
+	|| rtype(i1->arg[0]) != RTmp
+	|| rtype(i1->arg[1]) != RCon)
 		return;
 		return;
+	c1 = fn->con[i1->arg[1].val];
+
 	t2 = &fn->tmp[i1->arg[0].val];
 	t2 = &fn->tmp[i1->arg[0].val];
 	if (t2->def == 0)
 	if (t2->def == 0)
 		return;
 		return;
 	i2 = t2->def;
 	i2 = t2->def;
-	op2 = i2->op;
-	if (op2 == Osub)
-		op2 = Oadd;
-	if (op1 != op2 || rtype(i2->arg[0]) != RTmp || !isconbits(fn, i2->arg[1], &v2))
+
+	if (op != (i2->op == Osub ? Oadd : i2->op)
+	|| rtype(i2->arg[1]) != RCon)
 		return;
 		return;
+	c2 = fn->con[i2->arg[1].val];
+
 	assert(KBASE(i2->cls) == 0);
 	assert(KBASE(i2->cls) == 0);
-	assert(i1->cls == Kl || i2->cls == Kw);
-	r0 = i1->arg[1];
-	if (i1->op == Osub)
-		r0 = negcon(fn, i1->cls, r0);
-	r1 = i2->arg[1];
-	if (i2->op == Osub)
-		r1 = negcon(fn, i2->cls, r1);
-	i = (Ins){.to = i2->to, .op = op2, .cls = i2->cls, .arg = {r0, r1}};
-	rc = foldref(fn, &i);
-	assert(isconbits(fn, rc, &vc));
-	if (op1 == Oadd) {
-		if (i2->cls == Kw) {
-			if ((int32_t)vc < 0) {
-				op1 = Osub;
-				rc = negcon(fn, Kw, rc);
-			}
-		} else {
-			assert(i2->cls == Kl);
-			if (vc < 0) {
-				op1 = Osub;
-				rc = negcon(fn, Kl, rc);
-			}
-		}
+	assert(KWIDE(i2->cls) >= KWIDE(i1->cls));
+
+	if (i1->op == Osub && negcon(i1->cls, &c1))
+		return;
+	if (i2->op == Osub && negcon(i2->cls, &c2))
+		return;
+	if (foldint(&c, op, i1->cls, &c1, &c2))
+		return;
+
+	if (op == Oadd && c.type == CBits)
+	if ((i1->cls == Kl  && c.bits.i < 0)
+	|| (i1->cls == Kw && (int32_t)c.bits.i < 0)) {
+		fail = negcon(i1->cls, &c);
+		assert(fail == 0);
+		op = Osub;
 	}
 	}
-	i1->op = op1;
-	rmuse(fn, b, UIns, i1, 0, i1->arg[0], 1/*strict*/);
+
+	i1->op = op;
 	i1->arg[0] = i2->arg[0];
 	i1->arg[0] = i2->arg[0];
+	i1->arg[1] = newcon(&c, fn);
 	adduse(&fn->tmp[i1->arg[0].val], UIns, b, i1);
 	adduse(&fn->tmp[i1->arg[0].val], UIns, b, i1);
-	i1->arg[1] = rc;
 }
 }
 
 
 static void
 static void
-killins(Fn *fn, Blk *b, Ins *i, Ref r1, Ref r2)
+killins(Fn *fn, Ins *i, Ref r)
 {
 {
-	replaceuses(fn, r1, r2);
-	rmuse(fn, b, UIns, i, 0, i->arg[0], 1/*strict*/);
-	if (!req(i->arg[0], i->arg[1]))
-		rmuse(fn, b, UIns, i, 0, i->arg[1], 1/*strict*/);
+	replaceuses(fn, i->to, r);
 	*i = (Ins){.op = Onop};
 	*i = (Ins){.op = Onop};
 }
 }
 
 
 static void
 static void
-dedupi(Fn *fn, Blk *b, Ins *i)
+dedupins(Fn *fn, Blk *b, Ins *i)
 {
 {
-	Ref r2;
-	Ins *i2;
-
-	if (i->op == Onop || i->op == Odbgloc)
-		return;
+	Ref r;
+	Ins *i1;
 
 
 	normins(fn, i);
 	normins(fn, i);
-
-	if (optab[i->op].ispinned)
+	if (i->op == Onop || pinned(i))
 		return;
 		return;
-	assert(rtype(i->to) == RTmp);
 
 
-	/* merge associative ops with constant arg[1] */
+	assert(!req(i->to, R));
 	assoccon(fn, b, i);
 	assoccon(fn, b, i);
 
 
-	/* effective copy? */
-	r2 = copyref(fn, b, i);
-	if (!req(r2, R)) {
-		killins(fn, b, i, i->to, r2);
+	r = copyref(fn, b, i);
+	if (!req(r, R)) {
+		killins(fn, i, r);
 		return;
 		return;
 	}
 	}
-
-	/* effective constant? */
-	r2 = foldref(fn, i);
-	if (!req(r2, R)) {
-		killins(fn, b, i, i->to, r2);
+	r = foldref(fn, i);
+	if (!req(r, R)) {
+		killins(fn, i, r);
 		return;
 		return;
 	}
 	}
-
-	/* do not dedup (trapping) ins that GCM will not move */
-	if (isfixed(fn, i))
-		return;
-
-	/* duplicate? */
-	i2 = gvndup(i, 1);
-	if (i2) {
-		killins(fn, b, i, i->to, i2->to);
+	i1 = gvndup(i, 1);
+	if (i1) {
+		killins(fn, i, i1->to);
 		return;
 		return;
 	}
 	}
 }
 }
 
 
-static void
-dedupins(Fn *fn, Blk *b)
-{
-	Ins *i;
-
-	for (i = b->ins; i < &b->ins[b->nins]; i++)
-		dedupi(fn, b, i);
-}
-
 int
 int
-cmpeq0def(Fn *fn, Ref r, Ref *arg0, int *cls, int *eqval)
+cmpeqz(Fn *fn, Ref r, Ref *arg, int *cls, int *eqval)
 {
 {
 	Ins *i;
 	Ins *i;
 
 
@@ -522,8 +277,8 @@ cmpeq0def(Fn *fn, Ref r, Ref *arg0, int *cls, int *eqval)
 	i = fn->tmp[r.val].def;
 	i = fn->tmp[r.val].def;
 	if (i)
 	if (i)
 	if (optab[i->op].cmpeqwl)
 	if (optab[i->op].cmpeqwl)
-	if (req(i->arg[1], con01[0])) {
-		*arg0 = i->arg[0];
+	if (req(i->arg[1], CON_Z)) {
+		*arg = i->arg[0];
 		*cls = argcls(i, 0);
 		*cls = argcls(i, 0);
 		*eqval = optab[i->op].eqval;
 		*eqval = optab[i->op].eqval;
 		return 1;
 		return 1;
@@ -534,23 +289,25 @@ cmpeq0def(Fn *fn, Ref r, Ref *arg0, int *cls, int *eqval)
 static int
 static int
 branchdom(Fn *fn, Blk *bif, Blk *bbr1, Blk *bbr2, Blk *b)
 branchdom(Fn *fn, Blk *bif, Blk *bbr1, Blk *bbr2, Blk *b)
 {
 {
-	if (bif->jmp.type == Jjnz)
-	if (b != bif)
-	if (dom(bbr1, b))
-	if (!reachesnotvia(fn, bbr2, b, bif))
+	assert(bif->jmp.type == Jjnz);
+
+	if (b != bif
+	&& dom(bbr1, b)
+	&& !reachesnotvia(fn, bbr2, b, bif))
 		return 1;
 		return 1;
+
 	return 0;
 	return 0;
 }
 }
 
 
 static int
 static int
-dom0non0(Fn *fn, Blk *bdom, Blk *b, int *is0)
+domzero(Fn *fn, Blk *d, Blk *b, int *z)
 {
 {
-	if (branchdom(fn, bdom, bdom->s1, bdom->s2, b)) {
-		*is0 = 0;
+	if (branchdom(fn, d, d->s1, d->s2, b)) {
+		*z = 0;
 		return 1;
 		return 1;
 	}
 	}
-	if (branchdom(fn, bdom, bdom->s2, bdom->s1, b)) {
-		*is0 = 1;
+	if (branchdom(fn, d, d->s2, d->s1, b)) {
+		*z = 1;
 		return 1;
 		return 1;
 	}
 	}
 	return 0;
 	return 0;
@@ -558,24 +315,25 @@ dom0non0(Fn *fn, Blk *bdom, Blk *b, int *is0)
 
 
 /* infer 0/non-0 value from dominating jnz */
 /* infer 0/non-0 value from dominating jnz */
 int
 int
-is0non0(Fn *fn, Blk *b, Ref r, int cls, int *is0)
+zeroval(Fn *fn, Blk *b, Ref r, int cls, int *z)
 {
 {
-	Blk *bdom;
-	Ref arg0;
+	Blk *d;
+	Ref arg;
 	int cls1, eqval;
 	int cls1, eqval;
 
 
-	for (bdom = b->idom; bdom; bdom = bdom->idom) {
-		if (bdom->jmp.type != Jjnz)
+	for (d=b->idom; d; d=d->idom) {
+		if (d->jmp.type != Jjnz)
 			continue;
 			continue;
-		if (cls == Kw)
-		if (req(r, bdom->jmp.arg))
-		if (dom0non0(fn, bdom, b, is0))
+		if (req(r, d->jmp.arg)
+		&& cls == Kw
+		&& domzero(fn, d, b, z)) {
 			return 1;
 			return 1;
-		if (cmpeq0def(fn, bdom->jmp.arg, &arg0, &cls1, &eqval))
-		if (cls == cls1)
-		if (req(r, arg0))
-		if (dom0non0(fn, bdom, b, is0)) {
-			*is0 = *is0 ^ eqval;
+		}
+		if (cmpeqz(fn, d->jmp.arg, &arg, &cls1, &eqval)
+		&& req(r, arg)
+		&& cls == cls1
+		&& domzero(fn, d, b, z)) {
+			*z ^= eqval;
 			return 1;
 			return 1;
 		}
 		}
 	}
 	}
@@ -583,19 +341,27 @@ is0non0(Fn *fn, Blk *b, Ref r, int cls, int *is0)
 }
 }
 
 
 static int
 static int
-usecls(Use *u, Ref r)
+usecls(Use *u, Ref r, int cls)
 {
 {
+	int k;
+
 	switch (u->type) {
 	switch (u->type) {
-	case UXXX: break;
 	case UIns:
 	case UIns:
-		/* safe to take arg[0] cls if both args == r, even for store */
+		k = Kx;  /* widest use */
 		if (req(u->u.ins->arg[0], r))
 		if (req(u->u.ins->arg[0], r))
-			return argcls(u->u.ins, 0);
+			k = argcls(u->u.ins, 0);
 		if (req(u->u.ins->arg[1], r))
 		if (req(u->u.ins->arg[1], r))
-			return argcls(u->u.ins, 1);
+		if (k == Kx || !KWIDE(k))
+			k = argcls(u->u.ins, 1);
+		return k == Kx ? cls : k;
+	case UPhi:
+		if (req(u->u.phi->to, R))
+			return cls; /* eliminated */
+		return u->u.phi->cls;
+	case UJmp:
+		return Kw;
+	default:
 		break;
 		break;
-	case UPhi: return u->u.phi->cls;
-	case UJmp: return Kw;
 	}
 	}
 	die("unreachable");
 	die("unreachable");
 }
 }
@@ -610,136 +376,126 @@ propjnz0(Fn *fn, Blk *bif, Blk *s0, Blk *snon0, Ref r, int cls)
 	if (s0->npred != 1 || rtype(r) != RTmp)
 	if (s0->npred != 1 || rtype(r) != RTmp)
 		return;
 		return;
 	t = &fn->tmp[r.val];
 	t = &fn->tmp[r.val];
-	for (u = t->use; u < &t->use[t->nuse]; u++) {
+	for (u=t->use; u<&t->use[t->nuse]; u++) {
 		b = fn->rpo[u->bid];
 		b = fn->rpo[u->bid];
-		if (usecls(u, r) == cls)
+		/* we may compare an l temp with a w
+		 * comparison; so check that the use
+		 * does not involve high bits */
+		if (usecls(u, r, cls) == cls)
 		if (branchdom(fn, bif, s0, snon0, b))
 		if (branchdom(fn, bif, s0, snon0, b))
-			replaceuse(fn, u, r, con01[0]);
+			replaceuse(fn, u, r, CON_Z);
 	}
 	}
 }
 }
 
 
 static void
 static void
-dedupjmp(Fn *fn, Blk *b) {
-	Blk *s1s2[2];
+dedupjmp(Fn *fn, Blk *b)
+{
+	Blk **ps;
 	int64_t v;
 	int64_t v;
-	Ref arg0;
-	int cls, eqval, is0;
+	Ref arg;
+	int cls, eqval, z;
 
 
 	if (b->jmp.type != Jjnz)
 	if (b->jmp.type != Jjnz)
 		return;
 		return;
 
 
-	/* propagate jmp arg as 0 thru s2 */
+	/* propagate jmp arg as 0 through s2 */
 	propjnz0(fn, b, b->s2, b->s1, b->jmp.arg, Kw);
 	propjnz0(fn, b, b->s2, b->s1, b->jmp.arg, Kw);
 	/* propagate cmp eq/ne 0 def of jmp arg as 0 */
 	/* propagate cmp eq/ne 0 def of jmp arg as 0 */
-	s1s2[0] = b->s1; s1s2[1] = b->s2;
-	if (cmpeq0def(fn, b->jmp.arg, &arg0, &cls, &eqval))
-		propjnz0(fn, b, s1s2[eqval^1], s1s2[eqval], arg0, cls);
+	if (cmpeqz(fn, b->jmp.arg, &arg, &cls, &eqval)) {
+		ps = (Blk*[]){b->s1, b->s2};
+		propjnz0(fn, b, ps[eqval^1], ps[eqval], arg, cls);
+	}
 
 
 	/* collapse trivial/constant jnz to jmp */
 	/* collapse trivial/constant jnz to jmp */
 	v = 1;
 	v = 1;
-	is0 = 0;
+	z = 0;
 	if (b->s1 == b->s2
 	if (b->s1 == b->s2
-	    || isconbits(fn, b->jmp.arg, &v)
-	    || is0non0(fn, b, b->jmp.arg, Kw, &is0)) {
-		if (v == 0 || is0)
+	|| isconbits(fn, b->jmp.arg, &v)
+	|| zeroval(fn, b, b->jmp.arg, Kw, &z)) {
+		if (v == 0 || z)
 			b->s1 = b->s2;
 			b->s1 = b->s2;
 		/* we later move active ins out of dead blks */
 		/* we later move active ins out of dead blks */
 		b->s2 = 0;
 		b->s2 = 0;
 		b->jmp.type = Jjmp;
 		b->jmp.type = Jjmp;
-		rmuse(fn, b, UJmp, 0, 0, b->jmp.arg, 1/*strict*/);
 		b->jmp.arg = R;
 		b->jmp.arg = R;
-		return;
 	}
 	}
 }
 }
 
 
-/* rebuild rpo pred use */
-/* careful not to lose active ins in dead blks */
 static void
 static void
-rebuildcfg(Fn *fn) {
-	uint n, prevnblk;
-	Blk **prevrpo;
-	Blk *b;
+rebuildcfg(Fn *fn)
+{
+	uint n, nblk;
+	Blk *b, *s, **rpo;
 	Ins *i;
 	Ins *i;
 
 
-	prevnblk = fn->nblk;
-	prevrpo = emalloc(prevnblk * sizeof prevrpo[0]);
-	memcpy(prevrpo, fn->rpo, prevnblk * sizeof prevrpo[0]);
-
-	fillcfg(fn); // TODO - OK?
-
-	for (n=0; n<prevnblk; n++) {
-		b = prevrpo[n];
-		if (b->id == -1u) {
-			assert(b != fn->start);
-			/* blk unreachable after GVN */
-			for (i = b->ins; i < &b->ins[b->nins]; i++)
-				if (i->op != Onop)
-				if (!optab[i->op].ispinned)
-				if (gvndup(i, 0) == i)
-					/* (possibly) active ins - add to start blk */
-					addins(&fn->start->ins, &fn->start->nins, i);
-		}
+	nblk = fn->nblk;
+	rpo = emalloc(nblk * sizeof rpo[0]);
+	memcpy(rpo, fn->rpo, nblk * sizeof rpo[0]);
+
+	fillcfg(fn);
+
+	/* move instructions that were in
+	 * killed blocks and may be active
+	 * in the computation in the start
+	 * block */
+	s = fn->start;
+	for (n=0; n<nblk; n++) {
+		b = rpo[n];
+		if (b->id != -1u)
+			continue;
+		/* blk unreachable after GVN */
+		assert(b != s);
+		for (i=b->ins; i<&b->ins[b->nins]; i++)
+			if (!optab[i->op].pinned)
+			if (gvndup(i, 0) == i)
+				addins(&s->ins, &s->nins, i);
 	}
 	}
-	free(prevrpo);
+	free(rpo);
 }
 }
 
 
-/* https://courses.cs.washington.edu/courses/cse501/06wi/reading/click-pldi95.pdf */
-/* require rpo pred ssa use */
-/* recreates rpo */
-/* breaks pred use dom ssa (GCM fixes ssa) */
+/* requires rpo pred ssa use
+ * recreates rpo preds
+ * breaks pred use dom ssa (GCM fixes ssa)
+ */
 void
 void
 gvn(Fn *fn)
 gvn(Fn *fn)
 {
 {
-	uint n, nins;
 	Blk *b;
 	Blk *b;
 	Phi *p;
 	Phi *p;
-
-	/* facilitate jnz simplification */
-	blkmerge(fn);
-	fillcfg(fn);
-	filluse(fn);
-	filldom(fn);
+	Ins *i;
+	uint n, nins;
 
 
 	con01[0] = getcon(0, fn);
 	con01[0] = getcon(0, fn);
 	con01[1] = getcon(1, fn);
 	con01[1] = getcon(1, fn);
 
 
-	nins = 0;
+	/* copy.c uses the visit bit */
 	for (b=fn->start; b; b=b->link)
 	for (b=fn->start; b; b=b->link)
-		for (p = b->phi; p; p = p->link)
+		for (p=b->phi; p; p=p->link)
 			p->visit = 0;
 			p->visit = 0;
 
 
-	/* facilitate ext elimination */
 	fillloop(fn);
 	fillloop(fn);
 	narrowpars(fn);
 	narrowpars(fn);
 	filluse(fn);
 	filluse(fn);
 	ssacheck(fn);
 	ssacheck(fn);
 
 
-	for (b=fn->start; b; b=b->link)
+	nins = 0;
+	for (b=fn->start; b; b=b->link) {
+		b->visit = 0;
 		nins += b->nins;
 		nins += b->nins;
+	}
 
 
 	gvntbln = nins + nins/2;
 	gvntbln = nins + nins/2;
 	gvntbl = emalloc(gvntbln * sizeof gvntbl[0]);
 	gvntbl = emalloc(gvntbln * sizeof gvntbl[0]);
-	lookupn = 0;
-	proben = 0;
-	maxproben = 0;
-
-	/* GVN */
-	clrbvisit(fn);
 	for (n=0; n<fn->nblk; n++) {
 	for (n=0; n<fn->nblk; n++) {
 		b = fn->rpo[n];
 		b = fn->rpo[n];
-		dedupphis(fn, b);
-		dedupins(fn, b);
+		dedupphi(fn, b);
+		for (i=b->ins; i<&b->ins[b->nins]; i++)
+			dedupins(fn, b, i);
 		dedupjmp(fn, b);
 		dedupjmp(fn, b);
 	}
 	}
-
 	rebuildcfg(fn);
 	rebuildcfg(fn);
-
 	free(gvntbl);
 	free(gvntbl);
 	gvntbl = 0;
 	gvntbl = 0;
-	gvntbln = 0;
-	lookupn = 0;
-	proben = 0;
-	maxproben = 0;
 
 
 	if (debug['G']) {
 	if (debug['G']) {
 		fprintf(stderr, "\n> After GVN:\n");
 		fprintf(stderr, "\n> After GVN:\n");

+ 1 - 0
load.c

@@ -380,6 +380,7 @@ def(Slice sl, bits msk, Blk *b, Ins *i, Loc *il)
 			goto Load;
 			goto Load;
 		p->arg[np] = r1;
 		p->arg[np] = r1;
 		p->blk[np] = bp;
 		p->blk[np] = bp;
+		/* XXX - multiplicity in predecessors!!! */
 	}
 	}
 	if (msk != msks)
 	if (msk != msks)
 		mask(cls, &r, msk, il);
 		mask(cls, &r, msk, il);

+ 0 - 3
main.c

@@ -77,9 +77,6 @@ func(Fn *fn)
 	gvn(fn);
 	gvn(fn);
 	fillcfg(fn);
 	fillcfg(fn);
 	filluse(fn);
 	filluse(fn);
-	ifelim(fn);
-	fillcfg(fn);
-	filluse(fn);
 	filldom(fn);
 	filldom(fn);
 	gcm(fn);
 	gcm(fn);
 	filluse(fn);
 	filluse(fn);

+ 12 - 2
ops.h

@@ -7,7 +7,7 @@
 #endif
 #endif
 
 
 #ifndef F
 #ifndef F
-#define F(CanFold, HasId, IdVal, Commutes, Associates, Idemp, IsCmpEq, IsCmpLgte, CmpEqVal, IsPinned)
+#define F(a,b,c,d,e,f,g,h,i,j)
 #endif
 #endif
 
 
 #define T(a,b,c,d,e,f,g,h) {                          \
 #define T(a,b,c,d,e,f,g,h) {                          \
@@ -19,7 +19,17 @@
 /* PUBLIC OPERATIONS */
 /* PUBLIC OPERATIONS */
 /*********************/
 /*********************/
 
 
-/* Arithmetic and Bits */
+/*                                can fold                        */
+/*                                | has identity                  */
+/*                                | | identity value for arg[1]   */
+/*                                | | | commutative               */
+/*                                | | | | associative             */
+/*                                | | | | | idempotent            */
+/*                                | | | | | | c{eq,ne}[wl]        */
+/*                                | | | | | | | c[us][gl][et][wl] */
+/*                                | | | | | | | | value if = args */
+/*                                | | | | | | | | | pinned        */
+/* Arithmetic and Bits            v v v v v v v v v v             */
 O(add,     T(w,l,s,d, w,l,s,d), F(1,1,0,1,1,0,0,0,0,0)) X(2,1,0) V(1)
 O(add,     T(w,l,s,d, w,l,s,d), F(1,1,0,1,1,0,0,0,0,0)) X(2,1,0) V(1)
 O(sub,     T(w,l,s,d, w,l,s,d), F(1,1,0,0,0,0,0,0,0,0)) X(2,1,0) V(0)
 O(sub,     T(w,l,s,d, w,l,s,d), F(1,1,0,0,0,0,0,0,0,0)) X(2,1,0) V(0)
 O(neg,     T(w,l,s,d, x,x,x,x), F(1,0,0,0,0,0,0,0,0,0)) X(1,1,0) V(0)
 O(neg,     T(w,l,s,d, x,x,x,x), F(1,0,0,0,0,0,0,0,0,0)) X(1,1,0) V(0)

+ 2 - 2
parse.c

@@ -22,7 +22,7 @@ Op optab[NOp] = {
 	.commutes = co, .assoc = as, \
 	.commutes = co, .assoc = as, \
 	.idemp = im, \
 	.idemp = im, \
 	.cmpeqwl = ic, .cmplgtewl = lg, .eqval = cv, \
 	.cmpeqwl = ic, .cmplgtewl = lg, .eqval = cv, \
-	.ispinned = pn
+	.pinned = pn
 #define O(op, k, flags) [O##op]={.name = #op, .argcls = k, flags},
 #define O(op, k, flags) [O##op]={.name = #op, .argcls = k, flags},
 	#include "ops.h"
 	#include "ops.h"
 #undef F
 #undef F
@@ -940,7 +940,7 @@ parsefn(Lnk *lnk)
 	curf->mem = vnew(0, sizeof curf->mem[0], PFn);
 	curf->mem = vnew(0, sizeof curf->mem[0], PFn);
 	curf->nmem = 0;
 	curf->nmem = 0;
 	curf->nblk = nblk;
 	curf->nblk = nblk;
-	curf->rpo = 0;
+	curf->rpo = vnew(nblk, sizeof curf->rpo[0], PFn);
 	for (b=curf->start; b; b=b->link)
 	for (b=curf->start; b; b=b->link)
 		b->dlink = 0; /* was trashed by findblk() */
 		b->dlink = 0; /* was trashed by findblk() */
 	for (i=0; i<BMask+1; ++i)
 	for (i=0; i<BMask+1; ++i)

+ 1 - 2
ssa.c

@@ -44,7 +44,6 @@ filluse(Fn *fn)
 	uint a;
 	uint a;
 	Tmp *tmp;
 	Tmp *tmp;
 
 
-	/* todo, is this the correct file? */
 	tmp = fn->tmp;
 	tmp = fn->tmp;
 	for (t=Tmp0; t<fn->ntmp; t++) {
 	for (t=Tmp0; t<fn->ntmp; t++) {
 		tmp[t].def = 0;
 		tmp[t].def = 0;
@@ -145,7 +144,7 @@ phiins(Fn *fn)
 				continue;
 				continue;
 		}
 		}
 		bszero(u);
 		bszero(u);
-		k = -1;
+		k = Kx;
 		bp = be;
 		bp = be;
 		for (b=fn->start; b; b=b->link) {
 		for (b=fn->start; b; b=b->link) {
 			b->visit = 0;
 			b->visit = 0;

+ 15 - 0
test/copy.ssa

@@ -0,0 +1,15 @@
+export function w $f() {
+@start
+	%x0 =w loadsb $a
+	# the extension must not be eliminated
+	# even though the load already extended
+	%x1 =l extsb %x0
+	%c =w ceql %x1, -1
+	ret %c
+}
+
+# >>> driver
+# char a = -1;
+# extern int f();
+# int main() { return !(f() == 1); }
+# <<<

+ 0 - 0
test/gvn-live-dead.ssa → test/gvn1.ssa


+ 0 - 0
test/non0jnz.ssa → test/gvn2.ssa


+ 65 - 33
util.c

@@ -168,7 +168,7 @@ addbins(Blk *b, Ins **pvins, uint *pnins)
 {
 {
 	Ins *i;
 	Ins *i;
 
 
-	for (i = b->ins; i < &b->ins[b->nins]; i++)
+	for (i=b->ins; i<&b->ins[b->nins]; i++)
 		addins(pvins, pnins, i);
 		addins(pvins, pnins, i);
 }
 }
 
 
@@ -247,23 +247,50 @@ iscmp(int op, int *pk, int *pc)
 	return 1;
 	return 1;
 }
 }
 
 
-static int INVCMPWL[] = {
-	/*Oceqw*/Ocnew, /*Ocnew*/Oceqw,
-	/*Ocsgew*/Ocsltw, /*Ocsgtw*/Ocslew, /*Ocslew*/Ocsgtw, /*Ocsltw*/Ocsgew,
-	/*Ocugew*/Ocultw, /*Ocugtw*/Oculew, /*Oculew*/Ocugtw, /*Ocultw*/Ocugew,
-	/*Oceql*/Ocnel, /*Ocnel*/Oceql,
-	/*Ocsgel*/Ocsltl, /*Ocsgtl*/Ocslel, /*Ocslel*/Ocsgtl, /*Ocsltl*/Ocsgel,
-	/*Ocugel*/Ocultl, /*Ocugtl*/Oculel, /*Oculel*/Ocugtl, /*Ocultl*/Ocugel,
-};
-
-int
-invcmpwl(int cmp)
+void
+igroup(Blk *b, Ins *i, Ins **i0, Ins **i1)
 {
 {
-	assert(Oceqw <= cmp && cmp <= Ocultl);
-	return INVCMPWL[cmp - Oceqw];
-}
-
+	Ins *ib, *ie;
 
 
+	ib = b->ins;
+	ie = ib + b->nins;
+	switch (i->op) {
+	case Oblit0:
+		*i0 = i;
+		*i1 = i + 2;
+		return;
+	case Oblit1:
+		*i0 = i - 1;
+		*i1 = i + 1;
+		return;
+	case_Opar:
+		for (; i>ib && ispar((i-1)->op); i--)
+			;
+		*i0 = i;
+		for (; i<ie && ispar(i->op); i++)
+			;
+		*i1 = i;
+		return;
+	case Ocall:
+	case_Oarg:
+		for (; i>ib && isarg((i-1)->op); i--)
+			;
+		*i0 = i;
+		for (; i<ie && i->op != Ocall; i++)
+			;
+		assert(i < ie);
+		*i1 = i + 1;
+		return;
+	default:
+		if (ispar(i->op))
+			goto case_Opar;
+		if (isarg(i->op))
+			goto case_Oarg;
+		*i0 = i;
+		*i1 = i + 1;
+		return;
+	}
+}
 
 
 int
 int
 argcls(Ins *i, int n)
 argcls(Ins *i, int n)
@@ -291,12 +318,8 @@ emiti(Ins i)
 void
 void
 idup(Blk *b, Ins *s, ulong n)
 idup(Blk *b, Ins *s, ulong n)
 {
 {
-	if (b->ins)
-		vgrow(&b->ins, n);
-	else
-		b->ins = vnew(n, sizeof(Ins), PFn);
-	if (n)
-		memcpy(b->ins, s, n * sizeof(Ins));
+	vgrow(&b->ins, n);
+	icpy(b->ins, s, n);
 	b->nins = n;
 	b->nins = n;
 }
 }
 
 
@@ -344,6 +367,16 @@ cmpop(int c)
 	return cmptab[c][1];
 	return cmptab[c][1];
 }
 }
 
 
+int
+cmpwlneg(int op)
+{
+	if (INRANGE(op, Ocmpw, Ocmpw1))
+		return cmpneg(op - Ocmpw) + Ocmpw;
+	if (INRANGE(op, Ocmpl, Ocmpl1))
+		return cmpneg(op - Ocmpl) + Ocmpl;
+	die("not a wl comparison");
+}
+
 int
 int
 clsmerge(short *pk, short k)
 clsmerge(short *pk, short k)
 {
 {
@@ -379,16 +412,21 @@ phiargn(Phi *p, Blk *b)
 {
 {
 	uint n;
 	uint n;
 
 
-	for (n = 0; n < p->narg; n++)
-		if (p->blk[n] == b)
-			return n;
-	die("unreachable");
+	if (p)
+		for (n=0; n<p->narg; n++)
+			if (p->blk[n] == b)
+				return n;
+	return -1;
 }
 }
 
 
 Ref
 Ref
 phiarg(Phi *p, Blk *b)
 phiarg(Phi *p, Blk *b)
 {
 {
-	return p->arg[phiargn(p, b)];
+	uint n;
+
+	n = phiargn(p, b);
+	assert(n != -1u && "block not found");
+	return p->arg[n];
 }
 }
 
 
 Ref
 Ref
@@ -489,12 +527,6 @@ isconbits(Fn *fn, Ref r, int64_t *v)
 	return 0;
 	return 0;
 }
 }
 
 
-int
-istmpconbits(Fn *fn, Ins *i, int64_t *v)
-{
-	return rtype(i->arg[0]) == RTmp && isconbits(fn, i->arg[1], v);
-}
-
 void
 void
 salloc(Ref rt, Ref rs, Fn *fn)
 salloc(Ref rt, Ref rs, Fn *fn)
 {
 {