
Global Value Numbering / Global Code Motion

More or less as proposed, now in its ninth iteration, with the
addition of a gcmmove() function to restore coherent local
schedules.

Changes since RFC 8:

Features:

 - generalization of phi 1/0 detection
 - collapse linear jmp chains before GVN; simplifies the if-graph
     detection used in 0/non-0 value inference and ifelim()
 - infer 0/non-0 values from dominating blk jnz; eliminates
     redundant cmp eq/ne 0 and associated jnz/blocks, for example
     redundant null pointer checks (the hare codebase likes this);
     see the IL sketch after this list
 - remove (emergent) empty if-then-else graphlets between GVN and
     GCM; improves GCM instruction placement, particularly cmps.
 - merge %addr =l add %addr1, N sequences - reduces tmp count and
     register pressure
 - squash consecutive associative ops with constant args, e.g.
     t1 = add t, N ... t2 = add t1, M -> t2 = add t, N+M
     (see the IL sketch after this list)
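
To make the 0/non-0 inference and the constant-offset squashing above
concrete, here is a minimal hand-written QBE IL sketch (the function
and temporaries are hypothetical, not taken from this patch):

    function w $get(l %p) {
    @check
        %isnil =w ceql %p, 0
        jnz %isnil, @fail, @body
    @body
        # %p is dominated by the non-0 edge of @check, so a second
        # 'ceql %p, 0' here folds to 0 and its jnz becomes a plain jmp
        %a1 =l add %p, 8
        %a2 =l add %a1, 16    # squashed to: %a2 =l add %p, 24
        %v =w loadw %a2
        ret %v
    @fail
        ret 0
    }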

Bug Fixes:

 - remove "cmp eq/ne of non-identical RCons" handling in copyref();
   RCons are not guaranteed to be deduplicated, and symbols can
   alias.

Codebase:

  - moved several CFG helpers, including blkmerge(), into cfg.c
  - some refactoring in gvn.c
  - simplification of reassoc.c - always reassoc all cmp ops
      and Kl add %t, N. Better on coremark, smaller codebase.
  - minor simplification of movins() - use vins

Testing - standard QBE, cproc, hare, harec, coremark
          [still have Rust build issues with latest roland]

Benchmark
- coremark is ~15%+ faster than master
- hare "HARETEST_INCLUDE='slow' make check" ~8% faster
    (crypto::sha1::sha1_1gb is biggest obvious win - ~25% faster)

Changes since RFC 7:

Bug fixes:

- remove isbad4gcm() in GVN/GCM - it is unsound due to different state
    at GVN vs GCM time; replace with "reassociation" pass after GCM
- fix intra-blk use-before-def after GCM
- prevent GVN from deduping trapping instructions because GCM will not
    move them
- remove cmp eq/ne identical arg copy detection for floating point; it
    is not valid for NaN (see the sketch after this list)
- fix cges/cged flagged as commutative in ops.h instead of cnes/cned
    respectively; just a typo
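
A two-line sketch of the NaN issue (temporary names hypothetical):

    %same =w ceqd %x, %x   # 0 when %x is NaN, so identical-arg fp
                           # compares must not be folded to 1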

Minor features:

- copy detection handles cmp le/lt/ge/gt with identical args
- treat (integer) div/rem by non-zero constant as non-trapping
- eliminate add N/sub N pairs in copy detection
- maintain accurate tmp use in GVN; not strictly necessary but enables
    interim global state sanity checking
- "reassociation" of trivial constant offset load/store addresses and
    cmp ops to their point of use, in a pass after GCM
- normalise commutative op arg order - e.g. op con, tmp -> op tmp, con -
    to simplify copy detection and GVN instruction dedup (see the sketch
    after this list)
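
Sketch of the commutative normalisation (hypothetical temporaries):

    %a =w add 1, %x        # con, tmp
    %b =w add %x, 1        # tmp, con
    # both normalise to 'add %x, 1', so GVN dedups them and rewrites
    # uses of %b to %a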

Codebase:

- split core copy detection and constant folding back out into copy.c
    and fold.c respectively; gvn.c was getting monolithic
- generic support for instruction moving in ins.c - used by GCM and
    reassoc
- new reassociation pass in reassoc.c
- other minor clean-up/refactor

Changes since RFC 6:

- More ext elimination in GVN by examination of def and use bit width
- elimination of redundant and masks by bit width examination (see the
    sketch after this list)
- Incorporation of Song's patch
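
Sketch of the bit-width based elimination (hypothetical temporaries):

    %c =w loadub %p        # def width is 8 bits
    %m =w and %c, 255      # mask is redundant -> copy of %c
    %e =w extub %m         # %m already fits in 8 bits -> ext is a copy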

Changes since RFC 5:

- avoidance of "bad" candidates for GVN/GCM - trivial address offset
    calculations, and comparisons
- more copy detection mostly around boolean values
- allow elimination of unused load, alloc, trapping instructions
- detection of trivial boolean v ? 1 : 0 phi patterns (see the sketch
    after this list)
- bug fix: removed the "chg" optimisation in ins recreation - it missed
    removing unused instructions in some cases
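
Sketch of the trivial boolean phi pattern (hypothetical function; the
phi is detected as a copy of %v and the graphlet then collapses):

    function w $isneg(w %x) {
    @start
        %v =w csltw %x, 0
        jnz %v, @t, @f
    @t
        jmp @join
    @f
        jmp @join
    @join
        %b =w phi @t 1, @f 0   # copy of %v
        ret %b
    }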

ifelim() between GVN and GCM; deeper nopunused()
Roland Paterson-Jones, 1 year ago
commit c2ff93e75e
20 files changed, 2612 insertions and 691 deletions
   1. .gitignore               +1    -0
   2. Makefile                 +2    -1
   3. all.h                   +70    -7
   4. cfg.c                  +285   -31
   5. copy.c                 +295  -180
   6. fold.c                  +29  -318
   7. gcm.c                  +507    -0
   8. gvn.c                  +748    -0
   9. ins.c                  +195    -0
  10. main.c                  +10    -2
  11. ops.h                  +144  -147
  12. parse.c                 +10    -4
  13. reassoc.c              +101    -0
  14. ssa.c                    +1    -1
  15. test/_gcm1.ssa          +48    -0
  16. test/_gcm2.ssa          +43    -0
  17. test/_load-elim.ssa     +17    -0
  18. test/gvn-live-dead.ssa  +19    -0
  19. test/non0jnz.ssa        +31    -0
  20. util.c                  +56    -0

+ 1 - 0
.gitignore

@@ -3,3 +3,4 @@ qbe
 config.h
 .comfile
 *.out
+*~

+ 2 - 1
Makefile

@@ -5,7 +5,8 @@ PREFIX = /usr/local
 BINDIR = $(PREFIX)/bin
 
 COMMOBJ  = main.o util.o parse.o abi.o cfg.o mem.o ssa.o alias.o load.o \
-           copy.o fold.o simpl.o live.o spill.o rega.o emit.o
+           copy.o fold.o gvn.o gcm.o ins.o simpl.o live.o \
+           spill.o rega.o emit.o
 AMD64OBJ = amd64/targ.o amd64/sysv.o amd64/isel.o amd64/emit.o
 ARM64OBJ = arm64/targ.o arm64/abi.o arm64/isel.o arm64/emit.o
 RV64OBJ  = rv64/targ.o rv64/abi.o rv64/isel.o rv64/emit.o

+ 70 - 7
all.h

@@ -191,6 +191,7 @@ enum {
 #define INRANGE(x, l, u) ((unsigned)(x) - l <= u - l) /* linear in x */
 #define isstore(o) INRANGE(o, Ostoreb, Ostored)
 #define isload(o) INRANGE(o, Oloadsb, Oload)
+#define isalloc(o) INRANGE(o, Oalloc4, Oalloc16)
 #define isext(o) INRANGE(o, Oextsb, Oextuw)
 #define ispar(o) INRANGE(o, Opar, Opare)
 #define isarg(o) INRANGE(o, Oarg, Oargv)
@@ -214,8 +215,15 @@ struct Op {
 	char *name;
 	short argcls[2][4];
 	uint canfold:1;
-	uint hasid:1;
-	uint idval:1; /* identity value 0/1 */
+	uint hasid:1;     /* op identity value? */
+	uint idval:1;     /* identity value 0/1 */
+	uint commutes:1;  /* commutative op? */
+	uint assoc:1;     /* associative op? */
+	uint idemp:1;     /* idempotent op? */
+	uint cmpeqwl:1;   /* Kl/Kw cmp eq/ne? */
+	uint cmplgtewl:1; /* Kl/Kw cmp lt/gt/le/ge? */
+	uint eqval:1;     /* 1 for eq; 0 for ne */
+	uint ispinned:1;  /* GCM pinned op? */
 };
 
 struct Ins {
@@ -230,7 +238,8 @@ struct Phi {
 	Ref *arg;
 	Blk **blk;
 	uint narg;
-	int cls;
+	short cls;
+	uint visit:1;
 	Phi *link;
 };
 
@@ -251,6 +260,7 @@ struct Blk {
 
 	Blk *idom;
 	Blk *dom, *dlink;
+	int domdpth;
 	Blk **fron;
 	uint nfron;
 
@@ -344,6 +354,9 @@ struct Tmp {
 		Wuw
 	} width;
 	int visit;
+	uint gcmbid;
+	uint gcminsn; // TODO get rid
+	uint gcmdefinsn; // TODO get rid
 };
 
 struct Con {
@@ -450,6 +463,20 @@ struct Dat {
 	char isstr;
 };
 
+typedef struct InsLoc InsLoc;
+
+struct InsLoc {
+	uint bid;
+	uint insn;
+};
+
+typedef struct InsMov InsMov;
+
+struct InsMov {
+	InsLoc from;
+	InsLoc to;
+};
+
 /* main.c */
 extern Target T;
 extern char debug['Z'+1];
@@ -476,6 +503,7 @@ char *str(uint32_t);
 int argcls(Ins *, int);
 int isreg(Ref);
 int iscmp(int, int *, int *);
+int invcmpwl(int);
 void emit(int, int, Ref, Ref, Ref);
 void emiti(Ins);
 void idup(Blk *, Ins *, ulong);
@@ -484,16 +512,19 @@ int cmpop(int);
 int cmpneg(int);
 int clsmerge(short *, short);
 int phicls(int, Tmp *);
+uint phiargn(Phi *, Blk *);
+Ref phiarg(Phi *, Blk *);
 Ref newtmp(char *, int, Fn *);
 void chuse(Ref, int, Fn *);
 int symeq(Sym, Sym);
 Ref newcon(Con *, Fn *);
 Ref getcon(int64_t, Fn *);
 int addcon(Con *, Con *, int);
+int isconbits(Fn *fn, Ref r, int64_t *v);
+int istmpconbits(Fn *fn, Ins *i, int64_t *v);
 void salloc(Ref, Ref, Fn *);
 void dumpts(BSet *, Tmp *, FILE *);
 void runmatch(uchar *, Num *, Ref, Ref *);
-
 void bsinit(BSet *, uint);
 void bszero(BSet *);
 uint bscount(BSet *);
@@ -525,7 +556,6 @@ void elimsb(Fn *);
 
 /* cfg.c */
 Blk *newblk(void);
-void edgedel(Blk *, Blk **);
 void fillpreds(Fn *);
 void fillcfg(Fn *);
 void filldom(Fn *);
@@ -533,8 +563,20 @@ int sdom(Blk *, Blk *);
 int dom(Blk *, Blk *);
 void fillfron(Fn *);
 void loopiter(Fn *, void (*)(Blk *, Blk *));
+void filldomdpth(Fn *);
+Blk *lca(Blk *, Blk *);
 void fillloop(Fn *);
 void simpljmp(Fn *);
+void replacepreds(Blk *, Blk *, Blk *);
+void killblks(Fn *);
+void blkmerge(Fn *);
+int ifgraph(Blk *, Blk **, Blk **, Blk **);
+int ifjoin(Blk *, Blk **, Blk **, Blk **);
+int emptyblk(Blk *);
+void ifelim(Fn *);
+void clrbvisit(Fn *);
+int reaches(Fn *,Blk *, Blk *);
+int reachesnotvia(Fn *, Blk *, Blk *, Blk *);
 
 /* mem.c */
 void promote(Fn *);
@@ -552,15 +594,36 @@ int storesz(Ins *);
 void loadopt(Fn *);
 
 /* ssa.c */
+void adduse(Tmp *, int, Blk *, ...);
 void filluse(Fn *);
 void ssa(Fn *);
 void ssacheck(Fn *);
 
 /* copy.c */
-void copy(Fn *);
+int iswu1(Fn *, Ref);
+void narrowpars(Fn *);
+Ref copyref(Fn *, Blk *, Ins *);
 
 /* fold.c */
-void fold(Fn *);
+Ref foldref(Fn *, Ins *);
+
+/* gvn.c */
+extern Ref con01[2];
+int is0non0(Fn *, Blk *, Ref, int, int *);
+void gvn(Fn *);
+
+/* gcm.c */
+int isfixed(Fn *, Ins *);
+void gcm(Fn *);
+
+/* ins.c */
+void addins(Ins **, uint *, Ins *);
+void addbins(Blk *, Ins **, uint *);
+void nopunused(Fn *);
+void movins(Fn *, InsMov *, uint, int);
+
+/* reassoc.c */
+void reassoc(Fn *);
 
 /* simpl.c */
 void simpl(Fn *);

+ 285 - 31
cfg.c

@@ -39,37 +39,6 @@ prunephis(Fn *f)
 	}
 }
 
-void
-edgedel(Blk *bs, Blk **pbd)
-{
-	Blk *bd;
-	Phi *p;
-	uint a;
-	int mult;
-
-	bd = *pbd;
-	mult = 1 + (bs->s1 == bs->s2);
-	*pbd = 0;
-	if (!bd || mult > 1)
-		return;
-	for (p=bd->phi; p; p=p->link) {
-		for (a=0; p->blk[a]!=bs; a++)
-			assert(a+1<p->narg);
-		p->narg--;
-		memmove(&p->blk[a], &p->blk[a+1],
-			sizeof p->blk[0] * (p->narg-a));
-		memmove(&p->arg[a], &p->arg[a+1],
-			sizeof p->arg[0] * (p->narg-a));
-	}
-	if (bd->npred != 0) {
-		for (a=0; bd->pred[a]!=bs; a++)
-			assert(a+1<bd->npred);
-		bd->npred--;
-		memmove(&bd->pred[a], &bd->pred[a+1],
-			sizeof bd->pred[0] * (bd->npred-a));
-	}
-}
-
 static void
 addpred(Blk *bp, Blk *bc)
 {
@@ -298,6 +267,49 @@ multloop(Blk *hd, Blk *b)
 	b->loop *= 10;
 }
 
+void
+filldomdpth(Fn *fn)
+{
+	Blk *b, *dom;
+	uint dpth;
+
+	for (b=fn->start; b; b=b->link)
+		b->domdpth = -1;
+
+	fn->start->domdpth = 0;
+
+	for (b=fn->start; b; b=b->link) {
+		if (b->domdpth != -1)
+			continue;
+		dpth = 1;
+		for (dom = b->idom; dom->domdpth == -1; dom = dom->idom)
+			dpth++;
+		dpth += dom->domdpth;
+		b->domdpth = dpth;
+		for (dom = b->idom; dom->domdpth == -1; dom = dom->idom)
+			dom->domdpth = --dpth;
+	}
+}
+
+/* least common ancestor in dom tree */
+Blk *
+lca(Blk *b1, Blk *b2)
+{
+	if (!b1)
+		return b2;
+	if (!b2)
+		return b1;
+	while (b1->domdpth > b2->domdpth)
+		b1 = b1->idom;
+	while (b2->domdpth > b1->domdpth)
+		b2 = b2->idom;
+	while (b1 != b2) {
+		b1 = b1->idom;
+		b2 = b2->idom;
+	}
+	return b1;
+}
+
 void
 fillloop(Fn *fn)
 {
@@ -358,3 +370,245 @@ simpljmp(Fn *fn)
 	*p = ret;
 	free(uf);
 }
+
+static void
+replacepred(Blk **blks, uint nblk, Blk *to, Blk *from)
+{
+	uint n;
+	for(n=0; n<nblk; n++)
+		if (blks[n] == from) {
+			blks[n] = to;
+			break;
+		}
+	assert(n != nblk);
+}
+
+/* replace b->pred[] and p->blk[] entries */
+void
+replacepreds(Blk *s, Blk *to, Blk *from)
+{
+	Phi *p;
+
+	if (!s)
+		return;
+	assert(s->npred);
+	replacepred(s->pred, s->npred, to, from);
+	for (p = s->phi; p; p = p->link) {
+		assert(p->narg == s->npred);
+		replacepred(p->blk, p->narg, to, from);
+	}
+}
+
+/* remove marked-dead blks - marked as fn->rpo[id] == 0 */
+void
+killblks(Fn *fn)
+{
+	Blk **pb;
+
+	for (pb = &fn->start; *pb;)
+		if (fn->rpo[(*pb)->id])
+			pb = &(*pb)->link;
+		else
+			*pb = (*pb)->link;
+}
+
+/* merge linear jmp chains */
+/* requires rpo pred, breaks cfg use */
+void
+blkmerge(Fn *fn)
+{
+	uint bid, nins;
+	Blk *curb, *b;
+	Ins *vins;
+
+	if (debug['B'])
+		fputs("\n> Block merge:\n", stderr);
+
+	vins = vnew(0, sizeof vins[0], PFn);
+	curb = 0;
+	/* linear jmp chains will be consecutive in rpo */
+	for (bid=0; bid<fn->nblk; bid++) {
+		b = fn->rpo[bid];
+		if (curb == 0) {
+			curb = b;
+			nins = 0;
+		} else
+			fn->rpo[bid] = 0;
+		addbins(b, &vins, &nins);
+		/* note - there are cases where GVN does not eliminate singleton phis */
+		if (b->jmp.type != Jjmp || b->s1->npred != 1 || b->s1->phi) {
+			idup(curb, vins, nins);
+			curb->nins = nins;
+			curb->jmp = b->jmp;
+			replacepreds(b->s1, curb, b);
+			if (b->s1 != b->s2)
+				replacepreds(b->s2, curb, b);
+			curb->s1 = b->s1;
+			curb->s2 = b->s2;
+			curb = 0;
+		} else {
+			assert(b->s1->id == bid+1);
+			if (debug['B'])
+				fprintf(stderr, "    merging blocks @%s -> @%s\n", b->name, b->s1->name);
+		}
+	}
+	assert(curb == 0);
+	killblks(fn);
+}
+
+int
+lonesucc(Blk *b, Blk *s)
+{
+	assert(s);
+	if (s != b)
+	if (s->npred == 1)
+	if (s->pred[0] == b)
+	if (s->phi == 0)
+		return 1;
+	return 0;
+}
+
+int
+lonejmpsucc(Blk *b, Blk *s)
+{
+	return s->jmp.type == Jjmp && lonesucc(b, s);
+}
+
+/* (otherwise) isolated if-then[-else] graphlet */
+int
+ifgraph(Blk *ifb, Blk **pthenb, Blk **pelseb, Blk **pjoinb)
+{
+	if (ifb->jmp.type != Jjnz)
+		return 0;
+	assert(ifb->s1 && ifb->s2);
+	assert(ifb->s1 != ifb->s2); /* dubious */
+	*pthenb = ifb->s1;
+	*pelseb = ifb->s2;
+	*pjoinb = ifb->s1->s1;
+	if (ifb->s1 == ifb->s2->s1) {
+		/* empty then */
+		*pthenb = ifb;
+		*pjoinb = ifb->s1;
+	}
+	if (ifb->s1->s1 == ifb->s2)
+		/* empty else */
+		*pelseb = ifb;
+
+	if (*pthenb == ifb ||
+	    ((*pthenb)->s1 == *pjoinb && lonejmpsucc(ifb, *pthenb)))
+	if (*pelseb == ifb ||
+	    ((*pelseb)->s1 == *pjoinb && lonejmpsucc(ifb, *pelseb)))
+	/* there are cases where npred == 2 is not strictly needed - ifconvert for example */
+	if ((*pjoinb)->npred == 2)
+		return 1;
+
+	return 0;
+}
+
+/* join blk of if-then[-else] graphlet */
+int
+ifjoin(Blk *joinb, Blk **pifb, Blk **pthenb, Blk **pelseb)
+{
+	Blk *joinb1;
+
+	if (joinb->npred)
+	if (ifgraph(joinb->pred[0], pthenb, pelseb, &joinb1))
+	if (joinb == joinb1) {
+		*pifb = joinb->pred[0];
+		return 1;
+	}
+	if (joinb->npred && joinb->pred[0]->npred)
+	if (ifgraph(joinb->pred[0]->pred[0], pthenb, pelseb, &joinb1))
+	if (joinb == joinb1) {
+		*pifb = joinb->pred[0]->pred[0];
+		return 1;
+	}
+	return 0;
+}
+
+int
+emptyblk(Blk *b)
+{
+	Ins *i;
+
+	for (i = b->ins; i < &b->ins[b->nins]; i++)
+		if (i->op != Onop)
+		if (i->op != Odbgloc)
+			return 0;
+	return 1;
+}
+
+/* remove empty jnz graphlets */
+/* needs rpo; breaks cfg */
+void
+ifelim(Fn *fn)
+{
+	uint bid;
+	Blk *ifb, *thenb, *elseb, *joinb;
+
+	for (bid = 0; bid < fn->nblk; bid++) {
+		ifb = fn->rpo[bid];
+		if (ifb == 0)
+			continue;
+		if (ifgraph(ifb, &thenb, &elseb, &joinb))
+		if (joinb->phi == 0)
+		if (thenb == ifb || emptyblk(thenb))
+		if (elseb == ifb || emptyblk(elseb)) {
+			if (debug['B'])
+				fprintf(stderr, "    eliminating empty if @%s -> @%s, @%s -> @%s\n",
+					ifb->name, thenb->name, elseb->name, joinb->name);
+			if (thenb != ifb)
+				fn->rpo[thenb->id] = 0;
+			if (elseb != ifb)
+				fn->rpo[elseb->id] = 0;
+			ifb->jmp.type = Jjmp;
+			ifb->jmp.arg = R;
+			ifb->s1 = joinb;
+			ifb->s2 = 0;
+		}
+	}
+	killblks(fn);
+}
+
+void
+clrbvisit(Fn *fn)
+{
+	Blk *b;
+	for (b = fn->start; b; b = b->link)
+		b->visit = 0;
+}
+
+static int
+reaches1(Blk *b1, Blk *b2)
+{
+	assert(b2);
+	if (b1 == b2)
+		return 1;
+	if (b1 == 0 || b1->visit)
+		return 0;
+	b1->visit = 1;
+	return reaches1(b1->s1, b2) || reaches1(b1->s2, b2);
+}
+
+/* path from b1 to b2 not thru bnotvia? */
+/* uses b->visit */
+int
+reachesnotvia(Fn *fn, Blk *b1, Blk *b2, Blk *bnotvia)
+{
+	int rc;
+
+	if (bnotvia)
+		bnotvia->visit = 1;
+	rc = reaches1(b1, b2);
+	clrbvisit(fn);
+	return rc;
+}
+
+/* path from b1 to b2? */
+/* uses b->visit */
+int
+reaches(Fn *fn, Blk *b1, Blk *b2)
+{
+	return reachesnotvia(fn, b1, b2, 0);
+}
+

+ 295 - 180
copy.c

@@ -1,217 +1,332 @@
 #include "all.h"
 
-static int
-iscon(Ref r, int64_t bits, Fn *fn)
+static uint
+u64_wbits(uint64_t v)
 {
-	return rtype(r) == RCon
-		&& fn->con[r.val].type == CBits
-		&& fn->con[r.val].bits.i == bits;
+	uint n;
+
+	n = 0;
+	if (v >> 32) { n += 32; v >>= 32; }
+	if (v >> 16) { n += 16; v >>= 16; }
+	if (v >>  8) { n +=  8; v >>=  8; }
+	if (v >>  4) { n +=  4; v >>=  4; }
+	if (v >>  2) { n +=  2; v >>=  2; }
+	if (v >>  1) { n +=  1; v >>=  1; }
+	return n+v;
 }
 
 static int
-iscopy(Ins *i, Ref r, Fn *fn)
+EXTSIGNED[] = { /*extsb*/1, /*extub*/0, /*extsh*/1, /*extuh*/0, /*extsw*/1, /*extuw*/0 };
+
+static uint
+EXTMAXW[] = { /*extsb*/7, /*extub*/8, /*extsh*/15, /*extuh*/16, /*extsw*/31, /*extuw*/32 };
+
+static uint
+EXTW[] = { /*extsb*/8, /*extub*/8, /*extsh*/16, /*extuh*/16, /*extsw*/32, /*extuw*/32 };
+
+static uint
+STW[] = { /*storeb*/8, /*storeh*/16, /*storew*/32, /*storel*/64, /*stores*/32, /*stored*/64 };
+
+/* is the ref used only as a narrow value? */
+static int
+usewidthle(Fn *fn, Ref r, uint wbits)
 {
-	static bits extcpy[] = {
-		[WFull] = 0,
-		[Wsb] = BIT(Wsb) | BIT(Wsh) | BIT(Wsw),
-		[Wub] = BIT(Wub) | BIT(Wuh) | BIT(Wuw),
-		[Wsh] = BIT(Wsh) | BIT(Wsw),
-		[Wuh] = BIT(Wuh) | BIT(Wuw),
-		[Wsw] = BIT(Wsw),
-		[Wuw] = BIT(Wuw),
-	};
-	Op *op;
-	bits b;
 	Tmp *t;
+	Use *u;
+	Phi *p;
+	int b;
+	Ins *i;
+	Ref rc;
+	int64_t v;
 
-	if (i->op == Ocopy)
+	if (isconbits(fn, r, &v))
+	if (u64_wbits(v) <= wbits)
 		return 1;
-	op = &optab[i->op];
-	if (op->hasid && KBASE(i->cls) == 0)
-		return iscon(i->arg[1], op->idval, fn);
-	if (!isext(i->op) || rtype(r) != RTmp)
+	if (rtype(r) != RTmp)
 		return 0;
-	if (i->op == Oextsw || i->op == Oextuw)
-	if (i->cls == Kw)
-		return 1;
-
 	t = &fn->tmp[r.val];
-	assert(KBASE(t->cls) == 0);
-	if (i->cls == Kl && t->cls == Kw)
+	for (u = t->use; u < &t->use[t->nuse]; u++) {
+		switch (u->type) {
+		case UPhi:
+			p = u->u.phi;
+			if (p->visit)
+				continue;
+			p->visit = 1;
+			b = usewidthle(fn, p->to, wbits);
+			p->visit = 0;
+			if (b)
+				continue;
+			break;
+		case UIns:
+			i = u->u.ins;
+			assert(i != 0);
+			if (i->op == Ocopy)
+				if (usewidthle(fn, i->to, wbits))
+					continue;
+			if (isext(i->op)) {
+				if (EXTW[i->op - Oextsb] <= wbits)
+					continue;
+				else
+					if (usewidthle(fn, i->to, wbits))
+						continue;;
+			}
+			if (i->op == Oand) {
+				if (req(r, i->arg[0]))
+					rc = i->arg[1];
+				else {
+					assert(req(r, i->arg[1]));
+					rc = i->arg[0];
+				}
+				if (isconbits(fn, rc, &v))
+				if (u64_wbits(v) <= wbits)
+					continue;
+				break;
+			}
+			if (isstore(i->op))
+			if (req(r, i->arg[1]))
+			if (STW[i->op - Ostoreb] > wbits)
+				continue;
+			break;
+		default:
+			break;
+		}
 		return 0;
-	b = extcpy[t->width];
-	return (BIT(Wsb + (i->op-Oextsb)) & b) != 0;
+	}
+	return 1;
 }
 
-static Ref
-copyof(Ref r, Ref *cpy)
+static Phi*
+findphi(Fn *fn, uint bid, Ref to)
 {
-	if (rtype(r) == RTmp && !req(cpy[r.val], R))
-		return cpy[r.val];
-	return r;
+	Phi *p;
+	for (p = fn->rpo[bid]->phi; p; p = p->link)
+		if (req(p->to, to))
+			break;
+	assert(p);
+	return p;
 }
 
-/* detects a cluster of phis/copies redundant with 'r';
- * the algorithm is inspired by Section 3.2 of "Simple
- * and Efficient SSA Construction" by Braun M. et al.
- */
-static void
-phisimpl(Phi *p, Ref r, Ref *cpy, Use ***pstk, BSet *ts, BSet *as, Fn *fn)
+static uint
+uint_min(uint v1, uint v2)
 {
-	Use **stk, *u, *u1;
-	uint nstk, a;
-	int t;
-	Ref r1;
-	Phi *p0;
-
-	bszero(ts);
-	bszero(as);
-	p0 = &(Phi){.narg = 0};
-	stk = *pstk;
-	nstk = 1;
-	stk[0] = &(Use){.type = UPhi, .u.phi = p};
-	while (nstk) {
-		u = stk[--nstk];
-		if (u->type == UIns && iscopy(u->u.ins, r, fn)) {
-			p = p0;
-			t = u->u.ins->to.val;
-		}
-		else if (u->type == UPhi) {
-			p = u->u.phi;
-			t = p->to.val;
-		}
-		else
-			continue;
-		if (bshas(ts, t))
-			continue;
-		bsset(ts, t);
-		for (a=0; a<p->narg; a++) {
-			r1 = copyof(p->arg[a], cpy);
-			if (req(r1, r))
-				continue;
-			if (rtype(r1) != RTmp)
-				return;
-			bsset(as, r1.val);
+	return v1 < v2 ? v1 : v2;
+}
+
+/* is the ref def a narrow value? */
+static int
+defwidthle(Fn *fn, Ref r, uint wbits)
+{
+	Tmp *t;
+	Phi *p;
+	Ins *i;
+	uint n;
+	int64_t v;
+	int x;
+
+	if (isconbits(fn, r, &v))
+	if (u64_wbits(v) <= wbits)
+		return 1;
+	if (rtype(r) != RTmp)
+		return 0;
+	t = &fn->tmp[r.val];
+	if (t->cls != Kw)
+		return 0;
+	i = t->def;
+	if (i == 0) {
+		/* phi def */
+		p = findphi(fn, t->bid, r);
+		if (p->visit)
+			return 1;
+		p->visit = 1;
+		for (n = 0; n < p->narg; n++)
+			if (!defwidthle(fn, p->arg[n], wbits)) {
+				p->visit = 0;
+				return 0;
+			}
+		p->visit = 0;
+		return 1;
+	}
+	/* ins def */
+	if (i->op == Ocopy)
+                return defwidthle(fn, i->arg[0], wbits);
+	if (i->op == Oshr || i->op == Osar) {
+		if (isconbits(fn, i->arg[1], &v))
+		if (0 < v && v <= 32) {
+			if (i->op == Oshr && 32-v <= wbits)
+				return 1;
+			if (0 <= v && v < 32 && wbits < 32)
+				return defwidthle(fn, i->arg[0], uint_min((i->op == Osar ? 31 : 32), wbits+v));
 		}
-		u = fn->tmp[t].use;
-		u1 = &u[fn->tmp[t].nuse];
-		vgrow(pstk, nstk+(u1-u));
-		stk = *pstk;
-		for (; u<u1; u++)
-			stk[nstk++] = u;
+		return defwidthle(fn, i->arg[0], wbits);
+	}
+	if (iscmp(i->op, &x, &x))
+		return wbits >= 1;
+	if (i->op == Oand)
+		return defwidthle(fn, i->arg[0], wbits) || defwidthle(fn, i->arg[1], wbits);
+	if (i->op == Oor || i->op == Oxor)
+		return defwidthle(fn, i->arg[0], wbits) && defwidthle(fn, i->arg[1], wbits);
+	if (isext(i->op)) {
+		if (EXTSIGNED[i->op - Oextsb])
+			return defwidthle(fn, i->arg[0], uint_min(wbits, EXTMAXW[i->op - Oextsb]));
+		if (EXTW[i->op - Oextsb] <= wbits)
+			return 1;
+		return defwidthle(fn, i->arg[0], wbits);
 	}
-	bsdiff(as, ts);
-	if (!bscount(as))
-		for (t=0; bsiter(ts, &t); t++)
-			cpy[t] = r;
+	return 0;
+}
+
+/* is the ref a boolean - 0, 1 - value? */
+int
+iswu1(Fn *fn, Ref r)
+{
+	return defwidthle(fn, r, 1);
 }
 
-static void
-subst(Ref *pr, Ref *cpy)
+static int
+isnarrowpar(Fn *fn, Ref r)
 {
-	assert(rtype(*pr) != RTmp || !req(cpy[pr->val], R));
-	*pr = copyof(*pr, cpy);
+	Tmp *t;
+
+	if (rtype(r) != RTmp)
+		return 0;
+	t = &fn->tmp[r.val];
+	if (t->bid != fn->start->id || t->def == 0)
+		return 0;
+	return ispar(t->def->op);
 }
 
-/* requires use and dom, breaks use */
+/* Insert extub/extuh instructions in start for pars used only narrowly */
+/* needs use; breaks use */
 void
-copy(Fn *fn)
+narrowpars(Fn *fn)
 {
-	BSet ts[1], as[1];
-	Use **stk;
-	Phi *p, **pp;
-	Ins *i;
 	Blk *b;
-	uint n, a, eq;
-	Ref *cpy, r, r1;
-	int t;
-
-	bsinit(ts, fn->ntmp);
-	bsinit(as, fn->ntmp);
-	cpy = emalloc(fn->ntmp * sizeof cpy[0]);
-	stk = vnew(10, sizeof stk[0], PHeap);
-
-	/* 1. build the copy-of map */
-	for (n=0; n<fn->nblk; n++) {
-		b = fn->rpo[n];
-		for (p=b->phi; p; p=p->link) {
-			assert(rtype(p->to) == RTmp);
-			if (!req(cpy[p->to.val], R))
-				continue;
-			eq = 0;
-			r = R;
-			for (a=0; a<p->narg; a++)
-				if (p->blk[a]->id < n) {
-					r1 = copyof(p->arg[a], cpy);
-					if (req(r, R) || req(r, UNDEF))
-						r = r1;
-					if (req(r1, r) || req(r1, UNDEF))
-						eq++;
-				}
-			assert(!req(r, R));
-			if (rtype(r) == RTmp
-			&& !dom(fn->rpo[fn->tmp[r.val].bid], b))
-				cpy[p->to.val] = p->to;
-			else if (eq == p->narg)
-				cpy[p->to.val] = r;
-			else {
-				cpy[p->to.val] = p->to;
-				phisimpl(p, r, cpy, &stk, ts, as, fn);
-			}
-		}
-		for (i=b->ins; i<&b->ins[b->nins]; i++) {
-			assert(rtype(i->to) <= RTmp);
-			if (!req(cpy[i->to.val], R))
-				continue;
-			r = copyof(i->arg[0], cpy);
-			if (iscopy(i, r, fn))
-				cpy[i->to.val] = r;
-			else
-				cpy[i->to.val] = i->to;
-		}
-	}
+	int loop;
+	Ins *i, *ins;
+	uint npar, nins;
+	enum O extop;
+	Ref r;
 
-	/* 2. remove redundant phis/copies
-	 * and rewrite their uses */
-	for (b=fn->start; b; b=b->link) {
-		for (pp=&b->phi; (p=*pp);) {
-			r = cpy[p->to.val];
-			if (!req(r, p->to)) {
-				*pp = p->link;
-				continue;
-			}
-			for (a=0; a<p->narg; a++)
-				subst(&p->arg[a], cpy);
-			pp=&p->link;
-		}
-		for (i=b->ins; i<&b->ins[b->nins]; i++) {
-			r = cpy[i->to.val];
-			if (!req(r, i->to)) {
-				*i = (Ins){.op = Onop};
-				continue;
-			}
-			subst(&i->arg[0], cpy);
-			subst(&i->arg[1], cpy);
+	/* only useful for functions with loops */
+	loop = 0;
+	for (b = fn->start; b; b = b->link)
+		if (b->loop > 1) {
+			loop = 1;
+			break;
 		}
-		subst(&b->jmp.arg, cpy);
+	if (!loop)
+		return;
+
+	b = fn->start;
+	npar = 0;
+
+	for (i = b->ins; i < &b->ins[b->nins]; i++) {
+		if (!ispar(i->op))
+			break;
+		npar++;
 	}
 
-	if (debug['C']) {
-		fprintf(stderr, "\n> Copy information:");
-		for (t=Tmp0; t<fn->ntmp; t++) {
-			if (req(cpy[t], R)) {
-				fprintf(stderr, "\n%10s not seen!",
-					fn->tmp[t].name);
-			}
-			else if (!req(cpy[t], TMP(t))) {
-				fprintf(stderr, "\n%10s copy of ",
-					fn->tmp[t].name);
-				printref(cpy[t], fn, stderr);
+	if (npar == 0)
+		return;
+
+	nins = b->nins + npar;
+	ins = vnew(nins, sizeof ins[0], PFn); //alloc(nins * sizeof ins[0]);
+	memcpy(ins, b->ins, npar * sizeof ins[0]);
+	memcpy(ins + 2*npar, b->ins + npar, (b->nins - npar) * sizeof ins[0]);
+	b->ins = ins;
+	b->nins = nins;
+
+	for (i = b->ins; i < &b->ins[b->nins]; i++) {
+		if (!ispar(i->op))
+			break;
+		extop = Onop;
+		if (i->cls == Kw)
+			if (usewidthle(fn, i->to, 16)) {
+				if (usewidthle(fn, i->to, 8))
+					extop = Oextub;
+				else
+					extop = Oextuh;
 			}
+		if (extop == Onop) {
+			*(i+npar) = (Ins) {.op = Onop};
+		} else {
+			r = newtmp("vw", i->cls, fn);
+			*(i+npar) = (Ins) {.op = extop, .cls = i->cls, .to = i->to, .arg = {r}};
+			i->to = r;
 		}
-		fprintf(stderr, "\n\n> After copy elimination:\n");
-		printfn(fn, stderr);
 	}
-	vfree(stk);
-	free(cpy);
+}
+
+/* used by GVN */
+Ref
+copyref(Fn *fn, Blk *b, Ins *i)
+{
+	static bits extcpy[] = {
+		[WFull] = 0,
+		[Wsb] = BIT(Wsb) | BIT(Wsh) | BIT(Wsw),
+		[Wub] = BIT(Wub) | BIT(Wuh) | BIT(Wuw),
+		[Wsh] = BIT(Wsh) | BIT(Wsw),
+		[Wuh] = BIT(Wuh) | BIT(Wuw),
+		[Wsw] = BIT(Wsw),
+		[Wuw] = BIT(Wuw),
+	};
+	bits bext;
+	Tmp *t;
+	int64_t v;
+	int is0;
+
+	if (i->op == Ocopy)
+		return i->arg[0];
+
+	/* op identity value */
+	if (optab[i->op].hasid)
+	if (KBASE(i->cls) == 0) /* integer only - fp NaN! */
+	if (req(i->arg[1], con01[optab[i->op].idval]))
+	if (!optab[i->op].cmpeqwl || iswu1(fn, i->arg[0]))
+		return i->arg[0];
+
+	/* idempotent op with identical args */
+	if (optab[i->op].idemp)
+	if (req(i->arg[0], i->arg[1]))
+		return i->arg[0];
+
+	/* integer cmp with identical args */
+	if (optab[i->op].cmpeqwl || optab[i->op].cmplgtewl)
+	if (req(i->arg[0], i->arg[1]))
+		return con01[optab[i->op].eqval];
+
+	/* cmpeq/ne 0 with 0/non-0 inference from dominating jnz */
+	if (optab[i->op].cmpeqwl)
+	if (req(i->arg[1], con01[0]))
+	if (is0non0(fn, b, i->arg[0], argcls(i,0), &is0))
+		return con01[optab[i->op].eqval^is0^1];
+
+	/* redundant and mask */
+	if (i->op == Oand)
+	if (isconbits(fn, i->arg[1], &v))
+	if (((v+1) & v) == 0) /* v == 2^N-1 */
+	if (defwidthle(fn, i->arg[0], u64_wbits(v)))
+		return i->arg[0];
+
+	if (!isext(i->op) || rtype(i->arg[0]) != RTmp)
+		return R;
+	if (i->op == Oextsw || i->op == Oextuw)
+	if (i->cls == Kw)
+		return i->arg[0];
+
+	t = &fn->tmp[i->arg[0].val];
+	assert(KBASE(t->cls) == 0);
+	if (i->cls == Kl && t->cls == Kw)
+		return R;
+	bext = extcpy[t->width];
+	if ((BIT(Wsb + (i->op-Oextsb)) & bext) != 0)
+		return i->arg[0];
+
+	if (!isnarrowpar(fn, i->arg[0]))
+	if (usewidthle(fn, i->to, EXTW[i->op - Oextsb]))
+		return i->arg[0];
+	if (defwidthle(fn, i->arg[0], EXTMAXW[i->op - Oextsb]))
+		return i->arg[0];
+
+	return R;
 }

+ 29 - 318
fold.c

@@ -1,22 +1,6 @@
 #include "all.h"
 
-enum {
-	Bot = -1, /* lattice bottom */
-	Top = 0,  /* lattice top (matches UNDEF) */
-};
-
-typedef struct Edge Edge;
-
-struct Edge {
-	int dest;
-	int dead;
-	Edge *work;
-};
-
-static int *val;
-static Edge *flowrk, (*edge)[2];
-static Use **usewrk;
-static uint nuse;
+/* boring folding code */
 
 static int
 iscon(Con *c, int w, uint64_t k)
@@ -29,304 +13,6 @@ iscon(Con *c, int w, uint64_t k)
 		return (uint32_t)c->bits.i == (uint32_t)k;
 }
 
-static int
-latval(Ref r)
-{
-	switch (rtype(r)) {
-	case RTmp:
-		return val[r.val];
-	case RCon:
-		return r.val;
-	default:
-		die("unreachable");
-	}
-}
-
-static int
-latmerge(int v, int m)
-{
-	return m == Top ? v : (v == Top || v == m) ? m : Bot;
-}
-
-static void
-update(int t, int m, Fn *fn)
-{
-	Tmp *tmp;
-	uint u;
-
-	m = latmerge(val[t], m);
-	if (m != val[t]) {
-		tmp = &fn->tmp[t];
-		for (u=0; u<tmp->nuse; u++) {
-			vgrow(&usewrk, ++nuse);
-			usewrk[nuse-1] = &tmp->use[u];
-		}
-		val[t] = m;
-	}
-}
-
-static int
-deadedge(int s, int d)
-{
-	Edge *e;
-
-	e = edge[s];
-	if (e[0].dest == d && !e[0].dead)
-		return 0;
-	if (e[1].dest == d && !e[1].dead)
-		return 0;
-	return 1;
-}
-
-static void
-visitphi(Phi *p, int n, Fn *fn)
-{
-	int v;
-	uint a;
-
-	v = Top;
-	for (a=0; a<p->narg; a++)
-		if (!deadedge(p->blk[a]->id, n))
-			v = latmerge(v, latval(p->arg[a]));
-	update(p->to.val, v, fn);
-}
-
-static int opfold(int, int, Con *, Con *, Fn *);
-
-static void
-visitins(Ins *i, Fn *fn)
-{
-	int v, l, r;
-
-	if (rtype(i->to) != RTmp)
-		return;
-	if (optab[i->op].canfold) {
-		l = latval(i->arg[0]);
-		if (!req(i->arg[1], R))
-			r = latval(i->arg[1]);
-		else
-			r = CON_Z.val;
-		if (l == Bot || r == Bot)
-			v = Bot;
-		else if (l == Top || r == Top)
-			v = Top;
-		else
-			v = opfold(i->op, i->cls, &fn->con[l], &fn->con[r], fn);
-	} else
-		v = Bot;
-	/* fprintf(stderr, "\nvisiting %s (%p)", optab[i->op].name, (void *)i); */
-	update(i->to.val, v, fn);
-}
-
-static void
-visitjmp(Blk *b, int n, Fn *fn)
-{
-	int l;
-
-	switch (b->jmp.type) {
-	case Jjnz:
-		l = latval(b->jmp.arg);
-		if (l == Bot) {
-			edge[n][1].work = flowrk;
-			edge[n][0].work = &edge[n][1];
-			flowrk = &edge[n][0];
-		}
-		else if (iscon(&fn->con[l], 0, 0)) {
-			assert(edge[n][0].dead);
-			edge[n][1].work = flowrk;
-			flowrk = &edge[n][1];
-		}
-		else {
-			assert(edge[n][1].dead);
-			edge[n][0].work = flowrk;
-			flowrk = &edge[n][0];
-		}
-		break;
-	case Jjmp:
-		edge[n][0].work = flowrk;
-		flowrk = &edge[n][0];
-		break;
-	case Jhlt:
-		break;
-	default:
-		if (isret(b->jmp.type))
-			break;
-		die("unreachable");
-	}
-}
-
-static void
-initedge(Edge *e, Blk *s)
-{
-	if (s)
-		e->dest = s->id;
-	else
-		e->dest = -1;
-	e->dead = 1;
-	e->work = 0;
-}
-
-static int
-renref(Ref *r)
-{
-	int l;
-
-	if (rtype(*r) == RTmp)
-		if ((l=val[r->val]) != Bot) {
-			*r = CON(l);
-			return 1;
-		}
-	return 0;
-}
-
-/* require rpo, use, pred */
-void
-fold(Fn *fn)
-{
-	Edge *e, start;
-	Use *u;
-	Blk *b, **pb;
-	Phi *p, **pp;
-	Ins *i;
-	int t, d;
-	uint n, a;
-
-	val = emalloc(fn->ntmp * sizeof val[0]);
-	edge = emalloc(fn->nblk * sizeof edge[0]);
-	usewrk = vnew(0, sizeof usewrk[0], PHeap);
-
-	for (t=0; t<fn->ntmp; t++)
-		val[t] = Top;
-	for (n=0; n<fn->nblk; n++) {
-		b = fn->rpo[n];
-		b->visit = 0;
-		initedge(&edge[n][0], b->s1);
-		initedge(&edge[n][1], b->s2);
-	}
-	initedge(&start, fn->start);
-	flowrk = &start;
-	nuse = 0;
-
-	/* 1. find out constants and dead cfg edges */
-	for (;;) {
-		e = flowrk;
-		if (e) {
-			flowrk = e->work;
-			e->work = 0;
-			if (e->dest == -1 || !e->dead)
-				continue;
-			e->dead = 0;
-			n = e->dest;
-			b = fn->rpo[n];
-			for (p=b->phi; p; p=p->link)
-				visitphi(p, n, fn);
-			if (b->visit == 0) {
-				for (i=b->ins; i<&b->ins[b->nins]; i++)
-					visitins(i, fn);
-				visitjmp(b, n, fn);
-			}
-			b->visit++;
-			assert(b->jmp.type != Jjmp
-				|| !edge[n][0].dead
-				|| flowrk == &edge[n][0]);
-		}
-		else if (nuse) {
-			u = usewrk[--nuse];
-			n = u->bid;
-			b = fn->rpo[n];
-			if (b->visit == 0)
-				continue;
-			switch (u->type) {
-			case UPhi:
-				visitphi(u->u.phi, u->bid, fn);
-				break;
-			case UIns:
-				visitins(u->u.ins, fn);
-				break;
-			case UJmp:
-				visitjmp(b, n, fn);
-				break;
-			default:
-				die("unreachable");
-			}
-		}
-		else
-			break;
-	}
-
-	if (debug['F']) {
-		fprintf(stderr, "\n> SCCP findings:");
-		for (t=Tmp0; t<fn->ntmp; t++) {
-			if (val[t] == Bot)
-				continue;
-			fprintf(stderr, "\n%10s: ", fn->tmp[t].name);
-			if (val[t] == Top)
-				fprintf(stderr, "Top");
-			else
-				printref(CON(val[t]), fn, stderr);
-		}
-		fprintf(stderr, "\n dead code: ");
-	}
-
-	/* 2. trim dead code, replace constants */
-	d = 0;
-	for (pb=&fn->start; (b=*pb);) {
-		if (b->visit == 0) {
-			d = 1;
-			if (debug['F'])
-				fprintf(stderr, "%s ", b->name);
-			edgedel(b, &b->s1);
-			edgedel(b, &b->s2);
-			*pb = b->link;
-			continue;
-		}
-		for (pp=&b->phi; (p=*pp);)
-			if (val[p->to.val] != Bot)
-				*pp = p->link;
-			else {
-				for (a=0; a<p->narg; a++)
-					if (!deadedge(p->blk[a]->id, b->id))
-						renref(&p->arg[a]);
-				pp = &p->link;
-			}
-		for (i=b->ins; i<&b->ins[b->nins]; i++)
-			if (renref(&i->to))
-				*i = (Ins){.op = Onop};
-			else {
-				for (n=0; n<2; n++)
-					renref(&i->arg[n]);
-				if (isstore(i->op))
-				if (req(i->arg[0], UNDEF))
-					*i = (Ins){.op = Onop};
-			}
-		renref(&b->jmp.arg);
-		if (b->jmp.type == Jjnz && rtype(b->jmp.arg) == RCon) {
-				if (iscon(&fn->con[b->jmp.arg.val], 0, 0)) {
-					edgedel(b, &b->s1);
-					b->s1 = b->s2;
-					b->s2 = 0;
-				} else
-					edgedel(b, &b->s2);
-				b->jmp.type = Jjmp;
-				b->jmp.arg = R;
-		}
-		pb = &b->link;
-	}
-
-	if (debug['F']) {
-		if (!d)
-			fprintf(stderr, "(none)");
-		fprintf(stderr, "\n\n> After constant folding:\n");
-		printfn(fn, stderr);
-	}
-
-	free(val);
-	free(edge);
-	vfree(usewrk);
-}
-
-/* boring folding code */
-
 static int
 foldint(Con *res, int op, int w, Con *cl, Con *cr)
 {
@@ -516,7 +202,7 @@ foldflt(Con *res, int op, int w, Con *cl, Con *cr)
 	}
 }
 
-static int
+static Ref
 opfold(int op, int cls, Con *cl, Con *cr, Fn *fn)
 {
 	Ref r;
@@ -524,12 +210,37 @@ opfold(int op, int cls, Con *cl, Con *cr, Fn *fn)
 
 	if (cls == Kw || cls == Kl) {
 		if (foldint(&c, op, cls == Kl, cl, cr))
-			return Bot;
+			return R;
 	} else
 		foldflt(&c, op, cls == Kd, cl, cr);
 	if (!KWIDE(cls))
 		c.bits.i &= 0xffffffff;
 	r = newcon(&c, fn);
 	assert(!(cls == Ks || cls == Kd) || c.flt);
-	return r.val;
+	return r;
+}
+
+/* used by GVN */
+Ref
+foldref(Fn *fn, Ins *i)
+{
+	Ref rr;
+	Con *cl, *cr;
+
+	if (rtype(i->to) != RTmp)
+		return R;
+	if (optab[i->op].canfold) {
+		if (rtype(i->arg[0]) != RCon)
+			return R;
+		cl = &fn->con[i->arg[0].val];
+		rr = i->arg[1];
+		if (req(rr, R))
+		    rr = CON_Z;
+		if (rtype(rr) != RCon)
+			return R;
+		cr = &fn->con[rr.val];
+
+		return opfold(i->op, i->cls, cl, cr, fn);
+	}
+	return R;
 }

+ 507 - 0
gcm.c

@@ -0,0 +1,507 @@
+#include "all.h"
+
+#define NOBID (-1u)
+
+/* ins can trap at runtime */
+static int
+istrapping(Fn *fn, Ins *i)
+{
+	int64_t v;
+
+	if (KBASE(i->cls) == 0)
+	if (INRANGE(i->op, Odiv, Ourem))
+	if (!isconbits(fn, i->arg[1], &v) || v == 0)
+		return 1;
+	return 0;
+}
+
+/* fixed ins that can be eliminated if unused */
+static int
+canelim(Fn *fn, Ins *i)
+{
+	return isload(i->op) || isalloc(i->op) || istrapping(fn, i);
+}
+
+/* ins must stay in def blk */
+int
+isfixed(Fn *fn, Ins *i)
+{
+	return optab[i->op].ispinned || istrapping(fn, i);
+}
+
+static uint earlyins(Fn *, Blk *, Ins *);
+
+/* return earlybid of ref def ins */
+static uint
+schedearly(Fn *fn, Ref r)
+{
+	Tmp *t;
+	Blk *b;
+
+	if (rtype(r) != RTmp)
+		return 0 /* root/start */;
+
+	t = &fn->tmp[r.val];
+	if (t->gcmbid != NOBID)
+		return t->gcmbid; /* already visited/visiting */
+
+	b = fn->rpo[t->bid];
+	if (t->def) {
+		/* def is an ins */
+		assert(b->ins <= t->def && t->def < &b->ins[b->nins]);
+		t->gcmbid = 0; /* mark visiting root/start blk */
+		t->gcmbid = earlyins(fn, b, t->def); /* schedule ins input defs */
+	} else {
+		/* def is a phi - it stays in its def blk */
+		t->gcmbid = t->bid;
+	}
+
+	return t->gcmbid;
+}
+
+/* return deepest domdpth bid of arg defs */
+static uint
+earlyins(Fn *fn, Blk *b, Ins* i)
+{
+	uint earlybid, arg1earlybid;
+
+	earlybid = schedearly(fn, i->arg[0]);
+	assert(earlybid != NOBID);
+	arg1earlybid = schedearly(fn, i->arg[1]);
+	assert(arg1earlybid != NOBID);
+	if (fn->rpo[earlybid]->domdpth < fn->rpo[arg1earlybid]->domdpth) {
+		assert(dom(fn->rpo[earlybid], fn->rpo[arg1earlybid]));
+		earlybid = arg1earlybid;
+	}
+
+	/* fixed ins remain in their defining blk */
+	return isfixed(fn, i) ? b->id : earlybid;
+}
+
+static void
+earlyblk(Fn *fn, uint bid)
+{
+	Blk *b;
+	Phi *p;
+	Ins *i;
+	uint n;
+
+	b = fn->rpo[bid];
+	for (p = b->phi; p; p = p->link)
+		for (n = 0; n < p->narg; n++)
+			schedearly(fn, p->arg[n]);
+	for (i = b->ins; i < &b->ins[b->nins]; i++)
+		if (isfixed(fn, i))
+			earlyins(fn, b, i);
+	schedearly(fn, b->jmp.arg);
+}
+
+/* least common ancestor in dom tree */
+static uint
+lcabid(Fn *fn, uint bid1, uint bid2)
+{
+	Blk *b;
+
+	if (bid1 == NOBID)
+		return bid2;
+	if (bid2 == NOBID)
+		return bid1;
+
+	b = lca(fn->rpo[bid1], fn->rpo[bid2]);
+	assert(b);
+	return b->id;
+}
+
+static uint
+bestbid(Fn *fn, uint earlybid, uint latebid)
+{
+	Blk *curb, *earlyb, *bestb;
+
+	if (latebid == NOBID)
+		return NOBID; /* unused */
+
+	assert(earlybid != NOBID);
+
+	earlyb = fn->rpo[earlybid];
+	bestb = curb = fn->rpo[latebid];
+	assert(dom(earlyb, curb));
+
+	while (curb != earlyb) {
+		curb = curb->idom;
+		if (curb->loop < bestb->loop)
+			bestb = curb;
+	}
+	return bestb->id;
+}
+
+static uint lateins(Fn *, Blk *, Ins *, Ref r);
+static uint latephi(Fn *, Phi *, Ref r);
+static uint latejmp(Blk *, Ref r);
+
+/* return lca bid of ref uses */
+static uint
+schedlate(Fn *fn, Ref r)
+{
+	Tmp *t;
+	Blk *b;
+	Use *u;
+	uint earlybid;
+	uint latebid;
+	uint uselatebid;
+
+	if (rtype(r) != RTmp)
+		return NOBID;
+
+	t = &fn->tmp[r.val];
+	if (t->visit)
+		return t->gcmbid; /* already visited/visiting */
+
+	t->visit = 1; /* mark visiting/visited */
+	earlybid = t->gcmbid;
+	if (earlybid == NOBID)
+		return NOBID; /* not used */
+	t->gcmbid = t->bid; /* t->gcmbid is now latebid */
+
+	latebid = NOBID;
+	for (u = t->use; u < &t->use[t->nuse]; u++) {
+		assert(u->bid < fn->nblk);
+		b = fn->rpo[u->bid];
+		switch (u->type) {
+		case UXXX:
+			die("unreachable");
+			break;
+		case UPhi:
+			uselatebid = latephi(fn, u->u.phi, r);
+			break;
+		case UIns:
+			uselatebid = lateins(fn, b, u->u.ins, r);
+			break;
+		case UJmp:
+			uselatebid = latejmp(b, r);
+			break;
+		}
+		latebid = lcabid(fn, latebid, uselatebid);
+	}
+
+	/* phis stay in their def blk */
+	if (t->def) {
+		/* allow elimination of unused load/alloc/trapping ins */
+		if (latebid == NOBID && canelim(fn, t->def))
+			t->gcmbid = NOBID;
+		/* ... otherwise fixed ins stay in defining blk */
+		else if(!isfixed(fn, t->def))
+			t->gcmbid = bestbid(fn, earlybid, latebid);
+	}
+
+	return t->gcmbid;
+}
+
+/* return lca bid of uses, or NOBID if no active uses */
+static uint
+lateins(Fn *fn, Blk *b, Ins *i, Ref r)
+{
+	uint latebid;
+
+	assert(i->op == Onop || req(i->arg[0], r) || req(i->arg[1], r));
+	if (i->op == Onop)
+		return NOBID; /* already eliminated */
+
+	assert(b->ins <= i && i < &b->ins[b->nins]);
+
+	latebid = schedlate(fn, i->to);
+	/* allow elimination of unused load/alloc/trapping ins */
+	if (latebid == NOBID)
+	if (canelim(fn, i))
+		return NOBID;
+	/* ... otherwise fixed ins stay in defining blk */
+	return isfixed(fn, i) ? b->id : latebid;
+}
+
+static uint
+latephi(Fn *fn, Phi *p, Ref r)
+{
+	uint n;
+	uint latebid;
+
+	if (!p->narg)
+		return NOBID; /* marked as unused */
+
+	latebid = NOBID;
+	for (n = 0; n < p->narg; n++)
+		if (req(p->arg[n], r))
+			latebid = lcabid(fn, latebid, p->blk[n]->id);
+
+	assert(latebid != NOBID);
+	return latebid;
+}
+
+static uint
+latejmp(Blk *b, Ref r)
+{
+	if (req(b->jmp.arg, R))
+		return NOBID;
+	else {
+		assert(req(b->jmp.arg, r));
+		return b->id;
+	}
+}
+
+static void
+lateblk(Fn *fn, uint bid)
+{
+	Blk *b;
+	Phi **pp;
+	Ins *i;
+
+	b = fn->rpo[bid];
+	for (pp = &b->phi; *(pp);)
+		if (schedlate(fn, (*pp)->to) == NOBID) {
+			/* unused */
+			(*pp)->narg = 0; /* mark unused */
+			*pp = (*pp)->link; /* remove phi */
+		} else
+			pp = &(*pp)->link;
+
+	for (i = b->ins; i < &b->ins[b->nins]; i++)
+		if (isfixed(fn, i))
+			lateins(fn, b, i, i->arg[0]);
+}
+
+static void
+addgcmins(Fn *fn, Ins *vins, uint nins)
+{
+	Ins *i;
+	Tmp *t;
+	Blk *b;
+
+	for (i = vins; i < &vins[nins]; i++) {
+		assert(rtype(i->to) == RTmp);
+		t = &fn->tmp[i->to.val];
+		b = fn->rpo[t->gcmbid];
+		addins(&b->ins, &b->nins, i);
+	}
+}
+
+/* remove unused instructions */
+/* move instructions to (end of) target block */
+/* use-before-def is fixed up afterwards */
+static void
+gcmmove(Fn *fn)
+{
+	Tmp *t;
+	Ins *vins, *i;
+	uint nins;
+
+	nins = 0;
+	vins = vnew(nins, sizeof vins[0], PFn);
+
+	for (t=&fn->tmp[Tmp0]; t < &fn->tmp[fn->ntmp]; t++) {
+		if (t->def == 0)
+			continue;
+		if (t->bid == t->gcmbid)
+			continue;
+		i = t->def;
+		if (isfixed(fn, i) && !canelim(fn, i))
+			continue;
+		assert(rtype(i->to) == RTmp);
+		assert(t == &fn->tmp[i->to.val]);
+		if (t->gcmbid != NOBID)
+			addins(&vins, &nins, i);
+		*i = (Ins){.op = Onop};
+	}
+
+	addgcmins(fn, vins, nins);
+}
+
+static void
+schedins(Fn *fn, Blk *b, Ins *i0, Ins **pvins, uint *pnins)
+{
+	Ins *i, *i1;
+	Tmp *t;
+	uint na;
+
+	if (i0->op == Onop)
+		return;
+	/* arg...call have to stay together */
+	/* TODO - sel0...sel1 too */
+	for (i1 = i0; isarg(i1->op); i1++) {}
+	for (i = i0; i <= i1; i++)
+		for (na = 0; na < 2; na++) {
+			if (rtype(i->arg[na]) != RTmp)
+				continue;
+			t = &fn->tmp[i->arg[na].val];
+			/* def in different blk, or phi in this blk */
+			if (t->bid != b->id || t->def == 0)
+				continue;
+			/* already scheduled */
+			if (t->def->op == Onop) {
+				continue;
+			}
+			schedins(fn, b, t->def, pvins, pnins);
+		}
+	for (i = i0; i <= i1; i++) {
+		addins(pvins, pnins, i);
+		*i = (Ins){.op = Onop};
+	}
+}
+
+static void
+fixbub4d(Fn *fn, Blk *b, Ins **pvins)
+{
+	Ins *i;
+	uint nins;
+
+	nins = 0;
+	for (i = b->ins; i < &b->ins[b->nins]; i++)
+		schedins(fn, b, i, pvins, &nins);
+	idup(b, *pvins, nins);
+}
+
+static void
+fixub4d(Fn *fn)
+{
+	Blk *b;
+	Ins *vins; // TODO insb
+
+	vins = vnew(0, sizeof vins[0], PFn);
+	for (b = fn->start; b; b = b->link)
+		fixbub4d(fn, b, &vins);
+}
+
+static int
+iskladdcon(Fn *fn, Ins *i, int64_t *v)
+{
+	if (i->op == Oadd)
+	if (i->cls == Kl)
+	if (rtype(i->arg[0]) == RTmp)
+	if (isconbits(fn, i->arg[1], v))
+		return 1;
+	return 0;
+}
+
+static void
+ireassoc(Fn *fn, Blk *b, Ins *i, Ins **pvins, uint *pnins) 
+{
+	Blk *b2;
+	Tmp *t, *t2;
+	Use *u;
+	Ref r2;
+	int64_t v;
+	int x;
+
+	assert(b->ins <= i && i < &b->ins[b->nins]);
+	if (!iscmp(i->op, &x, &x))
+	if (!iskladdcon(fn, i, &v))
+		return;
+
+	assert(rtype(i->to) == RTmp);
+	t = &fn->tmp[i->to.val];
+	for (u = t->use; u < &t->use[t->nuse]; u++) {
+		if (u->type == UPhi)
+			continue;
+		b2 = fn->rpo[u->bid];
+		addins(pvins, pnins, i);
+		/* careful, can move fn->tmp */
+		r2 = newtmp("rea", t->cls, fn);
+		t = &fn->tmp[i->to.val];
+		t2 = &fn->tmp[r2.val];
+		t2->gcmbid = u->bid;
+		(*pvins)[(*pnins)-1].to = r2;
+		if (u->type == UIns) {
+			assert(req(u->u.ins->arg[0], i->to)
+			       || req(u->u.ins->arg[1], i->to));
+			if (req(u->u.ins->arg[0], i->to))
+				u->u.ins->arg[0] = r2;
+			if (req(u->u.ins->arg[1], i->to))
+				u->u.ins->arg[1] = r2;
+		} else {
+			assert(u->type == UJmp);
+			assert(req(b2->jmp.arg, i->to));
+			b2->jmp.arg = r2;
+		}
+	}
+}
+
+/* Redistribute trivial ops to point of use. */
+/* Reduces register pressure. */
+/* needs rpo, use; breaks use */
+void
+reassoc(Fn *fn)
+{
+	Blk *b;
+	Ins *vins, *i;
+	uint nins;
+
+	nins = 0;
+	vins = vnew(nins, sizeof vins[0], PFn);
+
+	/* identify trivial ins */
+	for (b = fn->start; b; b = b->link) {
+		for (i = b->ins; i < &b->ins[b->nins]; i++)
+			ireassoc(fn, b, i, &vins, &nins);
+	}
+
+	addgcmins(fn, vins, nins);
+}
+
+static void
+cleartmps(Fn *fn)
+{
+	Tmp *t;
+
+	for (t=&fn->tmp[Tmp0]; t < &fn->tmp[fn->ntmp]; t++) {
+		t->visit = 0;
+		t->gcmbid = NOBID;
+		t->gcminsn = -1u; // TODO - get rid
+	}
+}
+
+/* https://courses.cs.washington.edu/courses/cse501/06wi/reading/click-pldi95.pdf */
+/* require use dom */
+/* maintains rpo pred dom */
+/* breaks use */
+void
+gcm(Fn *fn)
+{
+	uint bid;
+
+	/* fprintf(stderr, "\n\nBefore gcm:\n\n"); */
+	/* printfn(fn, stderr); */
+
+	filldomdpth(fn);
+	fillloop(fn);
+
+	cleartmps(fn);
+	for (bid=0; bid<fn->nblk; bid++)
+		earlyblk(fn, bid);
+	for (bid=0; bid<fn->nblk; bid++)
+		lateblk(fn, bid);
+
+	/* fprintf(stderr, "\n\nBefore gcmmove/fixub4d:\n\n"); */
+	/* printfn(fn, stderr); */
+
+	gcmmove(fn);
+	cleartmps(fn);
+	/* filluse(fn); */
+	/* fixub4d(fn); */
+
+	/* fprintf(stderr, "\n\nAfter gcmmove/fixub4d:\n\n"); */
+	/* printfn(fn, stderr); */
+
+	//fillcfg(fn);
+	filluse(fn);
+	reassoc(fn);
+
+	/* cleartmps(fn); */
+	filluse(fn);
+	fixub4d(fn);
+	
+	/* delete (now) unused ins  - already done later??? */
+	filluse(fn);
+	nopunused(fn);
+	
+	if (debug['G']) {
+		fprintf(stderr, "\n> After GCM:\n");
+		printfn(fn, stderr);
+	}
+}

+ 748 - 0
gvn.c

@@ -0,0 +1,748 @@
+#include "all.h"
+
+/* literal constants 0, 1 */
+Ref con01[2];
+
+static inline uint
+mix(uint x0, uint x1)
+{
+	return x0 + 17*x1;
+}
+
+static inline uint
+rhash(Ref r)
+{
+	return mix(r.type, r.val);
+}
+
+static uint
+ihash(Ins *i)
+{
+	uint a0h, a1h, ah, h;
+
+	a0h = rhash(i->arg[0]);
+	a1h = rhash(i->arg[1]);
+	ah = mix(a0h, a1h);
+	h = mix(i->cls, i->op);
+	h = mix(h, ah);
+
+	return h;
+
+}
+
+static int
+ieq(Ins *ia, Ins *ib)
+{
+	if (ia->cls == ib->cls)
+	if (ia->op == ib->op)
+	if (req(ia->arg[0], ib->arg[0]))
+	if (req(ia->arg[1], ib->arg[1]))
+		return 1;
+	return 0;
+}
+
+static Ins** gvntbl;
+static uint gvntbln;
+static uint lookupn;
+static uint proben;
+static uint maxproben;
+
+static Ins *
+gvndup(Ins *i, int insert) {
+	uint idx, n;
+	Ins *i2;
+
+	lookupn++;
+
+	idx = ihash(i) % gvntbln;
+	for (n = 1;; n++) {
+		proben++;
+		if (n > maxproben)
+			maxproben = n;
+		i2 = gvntbl[idx];
+		if (!i2)
+			break;
+		if (ieq(i, i2))
+			return i2;
+
+		idx++;
+		if (gvntbln <= idx)
+			idx = 0;
+	}
+	/* not found */
+	if (insert) {
+		gvntbl[idx] = i;
+	}
+	return 0;
+}
+
+static void
+replaceref(Ref *r, Ref r1, Ref r2)
+{
+	if (req(*r, r1))
+		*r = r2;
+}
+
+static void
+replacepuse(Phi *p, Ref r1, Ref r2)
+{
+	Ref *a;
+
+	for (a = p->arg; a < &p->arg[p->narg]; a++)
+		replaceref(a, r1, r2);
+}
+
+static void
+replaceiuse(Ins *i, Ref r1, Ref r2)
+{
+	replaceref(&i->arg[0], r1, r2);
+	replaceref(&i->arg[1], r1, r2);
+}
+
+static void
+replacejuse(Blk* b, Ref r1, Ref r2)
+{
+	replaceref(&b->jmp.arg, r1, r2);
+}
+
+static void
+replaceuse(Fn *fn, Use* u, Ref r1, Ref r2)
+{
+	Tmp *t2;
+	Blk *b;
+
+	t2 = rtype(r2) == RTmp ? &fn->tmp[r2.val] : 0;
+	b = fn->rpo[u->bid];
+
+	switch (u->type) {
+	case UXXX:
+		die("unreachable");
+		break;
+	case UPhi:
+		replacepuse(u->u.phi, r1, r2);
+		if (t2)
+			adduse(t2, UPhi, b, u->u.phi);
+		break;
+	case UIns:
+		replaceiuse(u->u.ins, r1, r2);
+		if (t2)
+			adduse(t2, UIns, b, u->u.ins);
+		break;
+	case UJmp:
+		replacejuse(fn->rpo[u->bid], r1, r2);
+		if (t2)
+			adduse(t2, UJmp, b);
+		break;
+	}
+}
+
+static void
+replaceuses(Fn *fn, Ref r1, Ref r2)
+{
+	Tmp *t1;
+	Use *u;
+
+	assert(rtype(r1) == RTmp);
+	t1 = &fn->tmp[r1.val];
+
+	for (u = t1->use; u < &t1->use[t1->nuse]; u++)
+		replaceuse(fn, u, r1, r2);
+
+	t1->nuse = 0;
+}
+
+static void
+rmuse(Fn *fn, Blk *b, uint ty, Ins *i, Phi *p, Ref r, int strict)
+{
+	Tmp *t;
+	Use *u;
+	int found, rm;
+
+	if (rtype(r) != RTmp)
+		return;
+	found = 0;
+	t = &fn->tmp[r.val];
+	for (u = t->use; u < &t->use[t->nuse];) {
+		rm = 0;
+		if (u->type == ty) {
+			switch (ty) {
+			case UXXX:
+				die("unreachable");
+				break;
+			case UIns:
+				assert(p == 0);
+				assert(i);
+				rm = u->u.ins == i;
+				break;
+			case UPhi:
+				assert(i == 0);
+				assert(p);
+				rm = u->u.phi == p;
+				break;
+			case UJmp:
+				assert(i == 0);
+				assert(p == 0);
+				rm = u->bid == b->id;
+				break;
+			default:
+				die("unreachable");
+				break;
+			}
+		}
+		if (rm) {
+			found++;
+			assert(u < &t->use[t->nuse]);
+			assert(u->bid == b->id);
+			/* careful - deleting below iterator */
+			memcpy(u, u+1, ((t->nuse) - ((u+1)-t->use)) * sizeof u[0]);
+			t->nuse--;
+		} else
+			u++;
+	}
+	if (strict)
+		assert(found);
+}
+
+static int
+phieq(Phi *pa, Phi *pb)
+{
+	uint n;
+
+	assert(pa->narg == pb->narg);
+	for (n=0; n<pa->narg; n++) {
+		if (!req(pa->arg[n], phiarg(pb, pa->blk[n])))
+			return 0;
+	}
+	return 1;
+}
+
+static void
+killphi(Fn *fn, Blk *b, Phi **pp, Ref r)
+{
+	Phi *p;
+	uint n;
+
+	p = *pp;
+	replaceuses(fn, p->to, r);
+	assert(b->npred == p->narg);
+	for (n = 0; n < p->narg; n++)
+		rmuse(fn, b, UPhi, 0, p, p->arg[n], 0/*!strict TODO dups*/);
+	p->narg = 0; /* mark as unused - TODO should not be necessary with rmuse*/
+	*pp = p->link;
+}
+
+static void
+ifphiopt(Fn *fn, Blk *b)
+{
+	Blk *ifb, *thenb, *elseb, *tmp;
+	Phi *p, **pp;
+	Tmp *t;
+	Ref argt, argf, r1;
+	int k, x, caninv;
+
+	/* is b the join blk of an if[-then][-else] graphlet? */
+	if (!ifjoin(b, &ifb, &thenb, &elseb))
+		return;
+
+	assert(ifb->jmp.type == Jjnz);
+	assert(ifb->s1 == b || (ifb->s1 == thenb && thenb->s1 == b));
+	assert(ifb->s2 == b || (ifb->s2 == elseb && elseb->s1 == b));
+	if (!iswu1(fn, ifb->jmp.arg))
+		return; /* no opportunity */
+
+	caninv = 1;
+	for (p = b->phi; p; p = p->link)
+		if (req(phiarg(p, elseb), con01[0])) {
+			caninv = 0;
+			break;
+		}
+
+	/* phi bool 1/0 "copy" */
+	for (pp = &b->phi; *pp;) {
+		p = *pp;
+		assert(p->narg == 2);
+		argt = phiarg(p, thenb);
+		argf = phiarg(p, elseb);
+		/* jnz jmp.arg in phi is 1/0 */
+		if (req(argt, ifb->jmp.arg)) {
+			if (!req(argf, ifb->jmp.arg))
+				rmuse(fn, b, UPhi, 0, p, argt, 1/*strict*/);
+			argt = p->arg[phiargn(p, thenb)] = con01[1];
+		}
+		if (req(argf, ifb->jmp.arg)) {
+			rmuse(fn, b, UPhi, 0, p, argf, 1/*strict*/);
+			argf = p->arg[phiargn(p, elseb)] = con01[0];
+		}
+		/* prefer 0 as argf and/or 1 as argt, so try to invert the cmp */
+		if (caninv &&
+		    ((req(argt, con01[0]) && !req(argf, con01[0])) ||
+		     (req(argf, con01[1]) && !req(argt, con01[1])))) {
+			assert(rtype(ifb->jmp.arg) == RTmp);
+			t = &fn->tmp[ifb->jmp.arg.val];
+			if (t->nuse == 1)
+			if (t->def && iscmp(t->def->op, &k, &x) && KBASE(k) == 0) {
+				assert(t->use[0].type == UJmp);
+				assert(t->bid == ifb->id);
+				t->def->op = invcmpwl(t->def->op);
+				tmp = ifb->s1;
+				ifb->s1 = ifb->s2;
+				ifb->s2 = tmp;
+				r1 = argt; argt = argf; argf = r1;
+				caninv = 0; /* only once */
+			}
+		}
+		if (req(argt, con01[1]) && req(argf, con01[0])) {
+			killphi(fn, b, pp, ifb->jmp.arg);
+			caninv = 0; /* used already */
+			continue;
+		}
+		pp = &(*pp)->link;
+	}
+}
+
+/* phi "copy" - singleton or all args identical */
+static void
+copyphielim(Fn *fn, Blk *b) {
+	Phi *p, **pp;
+	uint n;
+
+	for (pp = &b->phi; *pp;) {
+		p = *pp;
+		assert(p->narg == b->npred);
+		for (n = 0; n < p->narg-1; n++) {
+			if (!req(p->arg[n], p->arg[n+1]))
+				goto Skip;
+		}
+		killphi(fn, b, pp, p->arg[0]);
+		continue;
+	Skip:;
+		pp = &(*pp)->link;
+	}
+}
+
+/* redundant phis */
+static void
+dupphielim(Fn *fn, Blk *b) {
+	Phi *p, *p2, **pp;
+
+	/* O(N^2.M) but meh */
+	for (pp = &b->phi; *pp;) {
+		p = *pp;
+		assert(p->narg == b->npred);
+		for (p2 = p->link; p2; p2 = p2->link) {
+			assert(p2->narg == b->npred);
+			if (phieq(p, p2)) {
+				killphi(fn, b, pp, p2->to);
+				goto Skip;
+			}
+		}
+		pp = &(*pp)->link;
+	Skip:;
+	}
+	return;
+}
+
+static void
+dedupphis(Fn *fn, Blk *b) {
+	ifphiopt(fn, b);
+	copyphielim(fn,b);
+	dupphielim(fn, b);
+}
+
+static int
+rcmp(Ref a, Ref b)
+{
+	if (rtype(a) != rtype(b))
+		return rtype(a)-rtype(b);
+	return a.val - b.val;
+}
+
+static void
+normins(Fn *fn, Ins *i)
+{
+	uint n;
+	int64_t v;
+	Ref r;
+
+	/* truncate constant bits to 32 bits for "s"/w" uses */
+	for (n = 0; n < 2; n++) {
+		if (!KWIDE(argcls(i, n)))
+		if (isconbits(fn, i->arg[n], &v))
+		if ((v & 0xffffffff) != v)
+			i->arg[n] = getcon(v & 0xffffffff, fn);
+	}
+	/* order arg[0] <= arg[1] for commutative ops, preferring RTmp in arg[0] */
+	if (optab[i->op].commutes)
+	if (rcmp(i->arg[0], i->arg[1]) > 0) {
+		r = i->arg[1];
+		i->arg[1] = i->arg[0];
+		i->arg[0] = r;
+	}
+}
+
+static Ref
+negcon(Fn *fn, int cls, Ref r)
+{
+	int64_t v, v1;
+
+	assert(isconbits(fn, r, &v));
+	assert(KBASE(cls) == 0);
+	v1 = -v;
+	if (cls == Kw)
+		v1 = ((int64_t)(-(int32_t)v)) & 0xffffffff;
+	return getcon(v1, fn);
+}
+
+static void
+assoccon(Fn *fn, Blk *b, Ins *i1)
+{
+	Tmp *t2;
+	Ins *i2, i;
+	Ref r0, r1, rc;
+	int op1, op2;
+	int64_t v1, v2, vc;
+
+	op1 = i1->op;
+	if (op1 == Osub)
+		op1 = Oadd;
+	if (!optab[op1].assoc || KBASE(i1->cls) != 0 || rtype(i1->arg[0]) != RTmp
+	    || !isconbits(fn, i1->arg[1], &v1) || (optab[op1].hasid && v1 == optab[op1].idval))
+		return;
+	t2 = &fn->tmp[i1->arg[0].val];
+	if (t2->def == 0)
+		return;
+	i2 = t2->def;
+	op2 = i2->op;
+	if (op2 == Osub)
+		op2 = Oadd;
+	if (op1 != op2 || rtype(i2->arg[0]) != RTmp || !isconbits(fn, i2->arg[1], &v2))
+		return;
+	assert(KBASE(i2->cls) == 0);
+	assert(i1->cls == Kl || i2->cls == Kw);
+	r0 = i1->arg[1];
+	if (i1->op == Osub)
+		r0 = negcon(fn, i1->cls, r0);
+	r1 = i2->arg[1];
+	if (i2->op == Osub)
+		r1 = negcon(fn, i2->cls, r1);
+	i = (Ins){.to = i2->to, .op = op2, .cls = i2->cls, .arg = {r0, r1}};
+	rc = foldref(fn, &i);
+	assert(isconbits(fn, rc, &vc));
+	if (op1 == Oadd) {
+		if (i2->cls == Kw) {
+			if ((int32_t)vc < 0) {
+				op1 = Osub;
+				rc = negcon(fn, Kw, rc);
+			}
+		} else {
+			assert(i2->cls == Kl);
+			if (vc < 0) {
+				op1 = Osub;
+				rc = negcon(fn, Kl, rc);
+			}
+		}
+	}
+	i1->op = op1;
+	rmuse(fn, b, UIns, i1, 0, i1->arg[0], 1/*strict*/);
+	i1->arg[0] = i2->arg[0];
+	adduse(&fn->tmp[i1->arg[0].val], UIns, b, i1);
+	i1->arg[1] = rc;
+}
+
+static void
+killins(Fn *fn, Blk *b, Ins *i, Ref r1, Ref r2)
+{
+	replaceuses(fn, r1, r2);
+	rmuse(fn, b, UIns, i, 0, i->arg[0], 1/*strict*/);
+	if (!req(i->arg[0], i->arg[1]))
+		rmuse(fn, b, UIns, i, 0, i->arg[1], 1/*strict*/);
+	*i = (Ins){.op = Onop};
+}
+
+static void
+dedupi(Fn *fn, Blk *b, Ins *i)
+{
+	Ref r2;
+	Ins *i2;
+
+	if (i->op == Onop || i->op == Odbgloc)
+		return;
+
+	normins(fn, i);
+
+	if (optab[i->op].ispinned)
+		return;
+	assert(rtype(i->to) == RTmp);
+
+	/* merge associative ops with constant arg[1] */
+	assoccon(fn, b, i);
+
+	/* effective copy? */
+	r2 = copyref(fn, b, i);
+	if (!req(r2, R)) {
+		killins(fn, b, i, i->to, r2);
+		return;
+	}
+
+	/* effective constant? */
+	r2 = foldref(fn, i);
+	if (!req(r2, R)) {
+		killins(fn, b, i, i->to, r2);
+		return;
+	}
+
+	/* do not dedup (trapping) ins that GCM will not move */
+	if (isfixed(fn, i))
+		return;
+
+	/* duplicate? */
+	i2 = gvndup(i, 1);
+	if (i2) {
+		killins(fn, b, i, i->to, i2->to);
+		return;
+	}
+}
+
+static void
+dedupins(Fn *fn, Blk *b)
+{
+	Ins *i;
+
+	for (i = b->ins; i < &b->ins[b->nins]; i++)
+		dedupi(fn, b, i);
+}
+
+int
+cmpeq0def(Fn *fn, Ref r, Ref *arg0, int *cls, int *eqval)
+{
+	Ins *i;
+
+	if (rtype(r) != RTmp)
+		return 0;
+	i = fn->tmp[r.val].def;
+	if (i)
+	if (optab[i->op].cmpeqwl)
+	if (req(i->arg[1], con01[0])) {
+		*arg0 = i->arg[0];
+		*cls = argcls(i, 0);
+		*eqval = optab[i->op].eqval;
+		return 1;
+	}
+	return 0;
+}
+
+static int
+branchdom(Fn *fn, Blk *bif, Blk *bbr1, Blk *bbr2, Blk *b)
+{
+	if (bif->jmp.type == Jjnz)
+	if (b != bif)
+	if (dom(bbr1, b))
+	if (!reachesnotvia(fn, bbr2, b, bif))
+		return 1;
+	return 0;
+}
+
+static int
+dom0non0(Fn *fn, Blk *bdom, Blk *b, int *is0)
+{
+	if (branchdom(fn, bdom, bdom->s1, bdom->s2, b)) {
+		*is0 = 0;
+		return 1;
+	}
+	if (branchdom(fn, bdom, bdom->s2, bdom->s1, b)) {
+		*is0 = 1;
+		return 1;
+	}
+	return 0;
+}
+
+/* infer 0/non-0 value from dominating jnz */
+int
+is0non0(Fn *fn, Blk *b, Ref r, int cls, int *is0)
+{
+	Blk *bdom;
+	Ref arg0;
+	int cls1, eqval;
+
+	for (bdom = b->idom; bdom; bdom = bdom->idom) {
+		if (bdom->jmp.type != Jjnz)
+			continue;
+		if (cls == Kw)
+		if (req(r, bdom->jmp.arg))
+		if (dom0non0(fn, bdom, b, is0))
+			return 1;
+		if (cmpeq0def(fn, bdom->jmp.arg, &arg0, &cls1, &eqval))
+		if (cls == cls1)
+		if (req(r, arg0))
+		if (dom0non0(fn, bdom, b, is0)) {
+			*is0 = *is0 ^ eqval;
+			return 1;
+		}
+	}
+	return 0;
+}
+
+static int
+usecls(Use *u, Ref r)
+{
+	switch (u->type) {
+	case UXXX: break;
+	case UIns:
+		/* safe to take arg[0] cls if both args == r, even for store */
+		if (req(u->u.ins->arg[0], r))
+			return argcls(u->u.ins, 0);
+		if (req(u->u.ins->arg[1], r))
+			return argcls(u->u.ins, 1);
+		break;
+	case UPhi: return u->u.phi->cls;
+	case UJmp: return Kw;
+	}
+	die("unreachable");
+}
+
+static void
+propjnz0(Fn *fn, Blk *bif, Blk *s0, Blk *snon0, Ref r, int cls)
+{
+	Blk *b;
+	Tmp *t;
+	Use *u;
+
+	if (s0->npred != 1 || rtype(r) != RTmp)
+		return;
+	t = &fn->tmp[r.val];
+	for (u = t->use; u < &t->use[t->nuse]; u++) {
+		b = fn->rpo[u->bid];
+		if (usecls(u, r) == cls)
+		if (branchdom(fn, bif, s0, snon0, b))
+			replaceuse(fn, u, r, con01[0]);
+	}
+}
+
+static void
+dedupjmp(Fn *fn, Blk *b) {
+	Blk *s1s2[2];
+	int64_t v;
+	Ref arg0;
+	int cls, eqval, is0;
+
+	if (b->jmp.type != Jjnz)
+		return;
+
+	/* propagate jmp arg as 0 thru s2 */
+	propjnz0(fn, b, b->s2, b->s1, b->jmp.arg, Kw);
+	/* propagate cmp eq/ne 0 def of jmp arg as 0 */
+	s1s2[0] = b->s1; s1s2[1] = b->s2;
+	if (cmpeq0def(fn, b->jmp.arg, &arg0, &cls, &eqval))
+		propjnz0(fn, b, s1s2[eqval^1], s1s2[eqval], arg0, cls);
+
+	/* collapse trivial/constant jnz to jmp */
+	v = 1;
+	is0 = 0;
+	if (b->s1 == b->s2
+	    || isconbits(fn, b->jmp.arg, &v)
+	    || is0non0(fn, b, b->jmp.arg, Kw, &is0)) {
+		if (v == 0 || is0)
+			b->s1 = b->s2;
+		/* we later move active ins out of dead blks */
+		b->s2 = 0;
+		b->jmp.type = Jjmp;
+		rmuse(fn, b, UJmp, 0, 0, b->jmp.arg, 1/*strict*/);
+		b->jmp.arg = R;
+		return;
+	}
+}
+
+/* rebuild rpo pred use */
+/* careful not to lose active ins in dead blks */
+static void
+rebuildcfg(Fn *fn)
+{
+	uint n, prevnblk;
+	Blk **prevrpo;
+	Blk *b;
+	Ins *i;
+
+	prevnblk = fn->nblk;
+	prevrpo = emalloc(prevnblk * sizeof prevrpo[0]);
+	memcpy(prevrpo, fn->rpo, prevnblk * sizeof prevrpo[0]);
+
+	fillcfg(fn); /* TODO - OK? */
+
+	for (n=0; n<prevnblk; n++) {
+		b = prevrpo[n];
+		if (b->id == -1u) {
+			assert(b != fn->start);
+			/* blk unreachable after GVN */
+			for (i = b->ins; i < &b->ins[b->nins]; i++)
+				if (i->op != Onop)
+				if (!optab[i->op].ispinned)
+				if (gvndup(i, 0) == i)
+					/* (possibly) active ins - add to start blk */
+					addins(&fn->start->ins, &fn->start->nins, i);
+		}
+	}
+	free(prevrpo);
+}
+
+/* https://courses.cs.washington.edu/courses/cse501/06wi/reading/click-pldi95.pdf */
+/* require rpo pred ssa use */
+/* recreates rpo */
+/* breaks pred use dom ssa (GCM fixes ssa) */
+void
+gvn(Fn *fn)
+{
+	uint n, nins;
+	Blk *b;
+	Phi *p;
+
+	/* facilitate jnz simplification */
+	blkmerge(fn);
+	fillcfg(fn);
+	filluse(fn);
+	filldom(fn);
+
+	con01[0] = getcon(0, fn);
+	con01[1] = getcon(1, fn);
+
+	nins = 0;
+	for (b=fn->start; b; b=b->link)
+		for (p = b->phi; p; p = p->link)
+			p->visit = 0;
+
+	/* facilitate ext elimination */
+	fillloop(fn);
+	narrowpars(fn);
+	filluse(fn);
+	ssacheck(fn);
+
+	for (b=fn->start; b; b=b->link)
+		nins += b->nins;
+
+	gvntbln = nins + nins/2;
+	gvntbl = emalloc(gvntbln * sizeof gvntbl[0]);
+	lookupn = 0;
+	proben = 0;
+	maxproben = 0;
+
+	/* GVN */
+	clrbvisit(fn);
+	for (n=0; n<fn->nblk; n++) {
+		b = fn->rpo[n];
+		dedupphis(fn, b);
+		dedupins(fn, b);
+		dedupjmp(fn, b);
+	}
+
+	rebuildcfg(fn);
+
+	free(gvntbl);
+	gvntbl = 0;
+	gvntbln = 0;
+	lookupn = 0;
+	proben = 0;
+	maxproben = 0;
+
+	if (debug['G']) {
+		fprintf(stderr, "\n> After GVN:\n");
+		printfn(fn, stderr);
+	}
+}

ins.c | +195 -0

@@ -0,0 +1,195 @@
+#include "all.h"
+
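+/* append a copy of i to a growable ins buffer, dropping nops */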
+void
+addins(Ins **pvins, uint *pnins, Ins *i)
+{
+	if (i->op == Onop)
+		return;
+	vgrow(pvins, ++(*pnins));
+	(*pvins)[(*pnins)-1] = *i;
+}
+
+void
+addbins(Blk *b, Ins **pvins, uint *pnins)
+{
+	Ins *i;
+
+	for (i = b->ins; i < &b->ins[b->nins]; i++)
+		addins(pvins, pnins, i);
+}
+
+static int
+unusedins(Fn *fn, Ins *i);
+
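+/* a tmp is unused if it has no use, or its only use is itself an unused ins */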
+static int
+unusedtmp(Fn *fn, Tmp *t)
+{
+	if (t->nuse == 0)
+		return 1;
+	if (t->nuse == 1)
+	if (t->use[0].type == UIns)
+		return unusedins(fn, t->use[0].u.ins);
+	return 0;
+}
+
+static int
+unusedtmpref(Fn *fn, Ref r)
+{
+	if (rtype(r) != RTmp)
+		return 0;
+	return unusedtmp(fn, &fn->tmp[r.val]);
+}
+
+static int
+unusedins(Fn *fn, Ins *i)
+{
+	if (!INRANGE(i->op, Opar, Ocall))
+	if (unusedtmpref(fn, i->to))
+		return 1;
+	return 0;
+}
+
+/* replace unused ins with nop */
+/* needs use; breaks use */
+void
+nopunused(Fn *fn)
+{
+	Blk *b;
+	Ins *i;
+
+	for (b = fn->start; b; b = b->link)
+		for (i = b->ins; i < &b->ins[b->nins]; i++)
+			if (unusedins(fn, i))
+				*i = (Ins){.op = Onop};
+}
+
+typedef struct FromLoc FromLoc;
+struct FromLoc {
+	InsLoc from;
+	uint ton;
+};
+
+static int
+loccmp(InsLoc *la, InsLoc *lb)
+{
+	if (la->bid != lb->bid)
+		return (int)la->bid - (int)lb->bid;
+	return la->insn - lb->insn;
+}
+
+static int
+tocmp(const void *a, const void *b)
+{
+	InsMov *ma, *mb;
+
+	ma = (InsMov*)a;
+	mb = (InsMov*)b;
+	return loccmp(&ma->to, &mb->to);
+}
+
+static int
+fromcmp(const void *a, const void *b)
+{
+	FromLoc *fa, *fb;
+
+	fa = (FromLoc*)a;
+	fb = (FromLoc*)b;
+	return loccmp(&fa->from, &fb->from);
+}
+
+static int
+loceq(InsLoc *a, InsLoc *b)
+{
+	return a->bid == b->bid && a->insn == b->insn;
+}
+
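+/* insert a copy of each ins at mov[].from before the ins at mov[].to */
+/* (or at blk end); if del, the originals are replaced with nops */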
+/* after, mov is sorted by to, and to.insn, from.insn are updated */
+void
+movins(Fn *fn, InsMov *mov, uint nmov, int del)
+{
+	Blk *b, *b2;
+	uint bid, n, fromn, ton, ifromn, nins;
+	FromLoc *from;
+	uint *newbnins;
+	Ins **newbins, *vins;
+
+	qsort(mov, nmov, sizeof mov[0], tocmp);
+
+	from = emalloc(nmov * sizeof from[0]);
+	for (n = 0; n < nmov; n++) {
+		from[n].from = mov[n].from;
+		from[n].ton = n;
+	}
+	qsort(from, nmov, sizeof from[0], fromcmp);
+
+	nins = 0;
+	vins = vnew(nins, sizeof vins[0], PFn);
+
+	newbnins = emalloc(fn->nblk * sizeof newbnins[0]);
+	newbins = emalloc(fn->nblk * sizeof newbins[0]);
+
+	/* populate new ins buffers */
+	/* update mov to.insn, from insn */
+	fromn = ton = 0;
+	for (bid = 0; bid < fn->nblk; bid++) {
+		/* no moves to this block */
+		if (nmov <= ton || mov[ton].to.bid != bid) {
+			while (fromn < nmov && from[fromn].from.bid == bid)
+				fromn++;
+			continue;
+		}
+		b = fn->rpo[bid];
+		nins = 0;
+		for (ifromn = 0; ifromn < b->nins; ifromn++) {
+			/* insert new ins, update to */
+			while (ton < nmov && loceq(&mov[ton].to, &(InsLoc){.bid = bid, .insn = ifromn})) {
+				b2 = fn->rpo[mov[ton].from.bid];
+				assert(mov[ton].from.insn < b2->nins);
+				addins(&vins, &nins, &b2->ins[mov[ton].from.insn]);
+				mov[ton++].to.insn = nins-1;
+			}
+			/* update from */
+			while (fromn < nmov && loceq(&from[fromn].from, &(InsLoc){.bid = bid, .insn = ifromn}))
+				from[fromn++].from.insn = nins;
+			/* copy original ins */
+			addins(&vins, &nins, &b->ins[ifromn]);
+		}
+		/* append new ins, update to */
+		while (ton < nmov && mov[ton].to.bid == bid) {
+			assert(mov[ton].to.insn == b->nins);
+			b2 = fn->rpo[mov[ton].from.bid];
+			assert(mov[ton].from.insn < b2->nins);
+			addins(&vins, &nins, &b2->ins[mov[ton].from.insn]);
+			mov[ton++].to.insn = nins-1;
+		}
+		assert(ifromn == b->nins);
+		/* idup(&newbins[bid], vins, nins); */
+		newbins[bid] = vins;
+		vins = vnew(0, sizeof vins[0], PFn);
+		newbnins[bid] = nins;
+	}
+	assert(fromn == nmov);
+	assert(ton == nmov);
+
+	/* install new b->ins */
+	for (bid = 0; bid < fn->nblk; bid++) {
+		if (newbnins[bid] == 0)
+			continue;
+		b = fn->rpo[bid];
+		b->ins = newbins[bid];
+		b->nins = newbnins[bid];
+	}
+
+	/* remove from ins, update mov from insn */
+	for (fromn = 0; fromn < nmov; fromn++) {
+		b = fn->rpo[from[fromn].from.bid];
+		assert(from[fromn].from.insn < b->nins);
+		if (del)
+			b->ins[from[fromn].from.insn] = (Ins){.op = Onop};
+		mov[from[fromn].ton].from.insn = from[fromn].from.insn;
+	}
+
+	free(from);
+	free(newbins);
+	free(newbnins);
+}

main.c | +10 -2

@@ -72,10 +72,18 @@ func(Fn *fn)
 	fillalias(fn);
 	coalesce(fn);
 	filluse(fn);
+	filldom(fn);
 	ssacheck(fn);
-	copy(fn);
+	gvn(fn);
+	fillcfg(fn);
+	filluse(fn);
+	ifelim(fn);
+	fillcfg(fn);
 	filluse(fn);
-	fold(fn);
+	filldom(fn);
+	gcm(fn);
+	filluse(fn);
+	ssacheck(fn);
 	T.abi1(fn);
 	simpl(fn);
 	fillcfg(fn);

ops.h | +144 -147

@@ -6,188 +6,185 @@
 	#define V(Imm)
 #endif
 
-#ifndef P
-	#define P(CanFold, HasId, IdVal)
+#ifndef F
+#define F(CanFold, HasId, IdVal, Commutes, Associates, Idemp, IsCmpEq, IsCmpLgte, CmpEqVal, IsPinned)
 #endif
 
-
 #define T(a,b,c,d,e,f,g,h) {                          \
 	{[Kw]=K##a, [Kl]=K##b, [Ks]=K##c, [Kd]=K##d}, \
 	{[Kw]=K##e, [Kl]=K##f, [Ks]=K##g, [Kd]=K##h}  \
 }
 
-
 /*********************/
 /* PUBLIC OPERATIONS */
 /*********************/
 
 /* Arithmetic and Bits */
-O(add,     T(w,l,s,d, w,l,s,d), P(1,1,0)) X(2,1,0) V(1)
-O(sub,     T(w,l,s,d, w,l,s,d), P(1,1,0)) X(2,1,0) V(0)
-O(neg,     T(w,l,s,d, x,x,x,x), P(1,0,0)) X(1,1,0) V(0)
-O(div,     T(w,l,s,d, w,l,s,d), P(1,1,1)) X(0,0,0) V(0)
-O(rem,     T(w,l,e,e, w,l,e,e), P(1,0,0)) X(0,0,0) V(0)
-O(udiv,    T(w,l,e,e, w,l,e,e), P(1,1,1)) X(0,0,0) V(0)
-O(urem,    T(w,l,e,e, w,l,e,e), P(1,0,0)) X(0,0,0) V(0)
-O(mul,     T(w,l,s,d, w,l,s,d), P(1,1,1)) X(2,0,0) V(0)
-O(and,     T(w,l,e,e, w,l,e,e), P(1,0,0)) X(2,1,0) V(1)
-O(or,      T(w,l,e,e, w,l,e,e), P(1,1,0)) X(2,1,0) V(1)
-O(xor,     T(w,l,e,e, w,l,e,e), P(1,1,0)) X(2,1,0) V(1)
-O(sar,     T(w,l,e,e, w,w,e,e), P(1,1,0)) X(1,1,0) V(1)
-O(shr,     T(w,l,e,e, w,w,e,e), P(1,1,0)) X(1,1,0) V(1)
-O(shl,     T(w,l,e,e, w,w,e,e), P(1,1,0)) X(1,1,0) V(1)
+O(add,     T(w,l,s,d, w,l,s,d), F(1,1,0,1,1,0,0,0,0,0)) X(2,1,0) V(1)
+O(sub,     T(w,l,s,d, w,l,s,d), F(1,1,0,0,0,0,0,0,0,0)) X(2,1,0) V(0)
+O(neg,     T(w,l,s,d, x,x,x,x), F(1,0,0,0,0,0,0,0,0,0)) X(1,1,0) V(0)
+O(div,     T(w,l,s,d, w,l,s,d), F(1,1,1,0,0,0,0,0,0,0)) X(0,0,0) V(0)
+O(rem,     T(w,l,e,e, w,l,e,e), F(1,0,0,0,0,0,0,0,0,0)) X(0,0,0) V(0)
+O(udiv,    T(w,l,e,e, w,l,e,e), F(1,1,1,0,0,0,0,0,0,0)) X(0,0,0) V(0)
+O(urem,    T(w,l,e,e, w,l,e,e), F(1,0,0,0,0,0,0,0,0,0)) X(0,0,0) V(0)
+O(mul,     T(w,l,s,d, w,l,s,d), F(1,1,1,1,0,0,0,0,0,0)) X(2,0,0) V(0)
+O(and,     T(w,l,e,e, w,l,e,e), F(1,0,0,1,1,1,0,0,0,0)) X(2,1,0) V(1)
+O(or,      T(w,l,e,e, w,l,e,e), F(1,1,0,1,1,1,0,0,0,0)) X(2,1,0) V(1)
+O(xor,     T(w,l,e,e, w,l,e,e), F(1,1,0,1,1,0,0,0,0,0)) X(2,1,0) V(1)
+O(sar,     T(w,l,e,e, w,w,e,e), F(1,1,0,0,0,0,0,0,0,0)) X(1,1,0) V(1)
+O(shr,     T(w,l,e,e, w,w,e,e), F(1,1,0,0,0,0,0,0,0,0)) X(1,1,0) V(1)
+O(shl,     T(w,l,e,e, w,w,e,e), F(1,1,0,0,0,0,0,0,0,0)) X(1,1,0) V(1)
 
 /* Comparisons */
-O(ceqw,    T(w,w,e,e, w,w,e,e), P(1,0,0)) X(0,1,0) V(0)
-O(cnew,    T(w,w,e,e, w,w,e,e), P(1,0,0)) X(0,1,0) V(0)
-O(csgew,   T(w,w,e,e, w,w,e,e), P(1,0,0)) X(0,1,0) V(0)
-O(csgtw,   T(w,w,e,e, w,w,e,e), P(1,0,0)) X(0,1,0) V(0)
-O(cslew,   T(w,w,e,e, w,w,e,e), P(1,0,0)) X(0,1,0) V(0)
-O(csltw,   T(w,w,e,e, w,w,e,e), P(1,0,0)) X(0,1,0) V(1)
-O(cugew,   T(w,w,e,e, w,w,e,e), P(1,0,0)) X(0,1,0) V(0)
-O(cugtw,   T(w,w,e,e, w,w,e,e), P(1,0,0)) X(0,1,0) V(0)
-O(culew,   T(w,w,e,e, w,w,e,e), P(1,0,0)) X(0,1,0) V(0)
-O(cultw,   T(w,w,e,e, w,w,e,e), P(1,0,0)) X(0,1,0) V(1)
-
-O(ceql,    T(l,l,e,e, l,l,e,e), P(1,0,0)) X(0,1,0) V(0)
-O(cnel,    T(l,l,e,e, l,l,e,e), P(1,0,0)) X(0,1,0) V(0)
-O(csgel,   T(l,l,e,e, l,l,e,e), P(1,0,0)) X(0,1,0) V(0)
-O(csgtl,   T(l,l,e,e, l,l,e,e), P(1,0,0)) X(0,1,0) V(0)
-O(cslel,   T(l,l,e,e, l,l,e,e), P(1,0,0)) X(0,1,0) V(0)
-O(csltl,   T(l,l,e,e, l,l,e,e), P(1,0,0)) X(0,1,0) V(1)
-O(cugel,   T(l,l,e,e, l,l,e,e), P(1,0,0)) X(0,1,0) V(0)
-O(cugtl,   T(l,l,e,e, l,l,e,e), P(1,0,0)) X(0,1,0) V(0)
-O(culel,   T(l,l,e,e, l,l,e,e), P(1,0,0)) X(0,1,0) V(0)
-O(cultl,   T(l,l,e,e, l,l,e,e), P(1,0,0)) X(0,1,0) V(1)
-
-O(ceqs,    T(s,s,e,e, s,s,e,e), P(1,0,0)) X(0,1,0) V(0)
-O(cges,    T(s,s,e,e, s,s,e,e), P(1,0,0)) X(0,1,0) V(0)
-O(cgts,    T(s,s,e,e, s,s,e,e), P(1,0,0)) X(0,1,0) V(0)
-O(cles,    T(s,s,e,e, s,s,e,e), P(1,0,0)) X(0,1,0) V(0)
-O(clts,    T(s,s,e,e, s,s,e,e), P(1,0,0)) X(0,1,0) V(0)
-O(cnes,    T(s,s,e,e, s,s,e,e), P(1,0,0)) X(0,1,0) V(0)
-O(cos,     T(s,s,e,e, s,s,e,e), P(1,0,0)) X(0,1,0) V(0)
-O(cuos,    T(s,s,e,e, s,s,e,e), P(1,0,0)) X(0,1,0) V(0)
-
-O(ceqd,    T(d,d,e,e, d,d,e,e), P(1,0,0)) X(0,1,0) V(0)
-O(cged,    T(d,d,e,e, d,d,e,e), P(1,0,0)) X(0,1,0) V(0)
-O(cgtd,    T(d,d,e,e, d,d,e,e), P(1,0,0)) X(0,1,0) V(0)
-O(cled,    T(d,d,e,e, d,d,e,e), P(1,0,0)) X(0,1,0) V(0)
-O(cltd,    T(d,d,e,e, d,d,e,e), P(1,0,0)) X(0,1,0) V(0)
-O(cned,    T(d,d,e,e, d,d,e,e), P(1,0,0)) X(0,1,0) V(0)
-O(cod,     T(d,d,e,e, d,d,e,e), P(1,0,0)) X(0,1,0) V(0)
-O(cuod,    T(d,d,e,e, d,d,e,e), P(1,0,0)) X(0,1,0) V(0)
+O(ceqw,    T(w,w,e,e, w,w,e,e), F(1,1,1,1,0,0,1,0,1,0)) X(0,1,0) V(0)
+O(cnew,    T(w,w,e,e, w,w,e,e), F(1,1,0,1,0,0,1,0,0,0)) X(0,1,0) V(0)
+O(csgew,   T(w,w,e,e, w,w,e,e), F(1,0,0,0,0,0,0,1,1,0)) X(0,1,0) V(0)
+O(csgtw,   T(w,w,e,e, w,w,e,e), F(1,0,0,0,0,0,0,1,0,0)) X(0,1,0) V(0)
+O(cslew,   T(w,w,e,e, w,w,e,e), F(1,0,0,0,0,0,0,1,1,0)) X(0,1,0) V(0)
+O(csltw,   T(w,w,e,e, w,w,e,e), F(1,0,0,0,0,0,0,1,0,0)) X(0,1,0) V(1)
+O(cugew,   T(w,w,e,e, w,w,e,e), F(1,0,0,0,0,0,0,1,1,0)) X(0,1,0) V(0)
+O(cugtw,   T(w,w,e,e, w,w,e,e), F(1,0,0,0,0,0,0,1,0,0)) X(0,1,0) V(0)
+O(culew,   T(w,w,e,e, w,w,e,e), F(1,0,0,0,0,0,0,1,1,0)) X(0,1,0) V(0)
+O(cultw,   T(w,w,e,e, w,w,e,e), F(1,0,0,0,0,0,0,1,0,0)) X(0,1,0) V(1)
+
+O(ceql,    T(l,l,e,e, l,l,e,e), F(1,0,0,1,0,0,1,0,1,0)) X(0,1,0) V(0)
+O(cnel,    T(l,l,e,e, l,l,e,e), F(1,0,0,1,0,0,1,0,0,0)) X(0,1,0) V(0)
+O(csgel,   T(l,l,e,e, l,l,e,e), F(1,0,0,0,0,0,0,1,1,0)) X(0,1,0) V(0)
+O(csgtl,   T(l,l,e,e, l,l,e,e), F(1,0,0,0,0,0,0,1,0,0)) X(0,1,0) V(0)
+O(cslel,   T(l,l,e,e, l,l,e,e), F(1,0,0,0,0,0,0,1,1,0)) X(0,1,0) V(0)
+O(csltl,   T(l,l,e,e, l,l,e,e), F(1,0,0,0,0,0,0,1,0,0)) X(0,1,0) V(1)
+O(cugel,   T(l,l,e,e, l,l,e,e), F(1,0,0,0,0,0,0,1,1,0)) X(0,1,0) V(0)
+O(cugtl,   T(l,l,e,e, l,l,e,e), F(1,0,0,0,0,0,0,1,0,0)) X(0,1,0) V(0)
+O(culel,   T(l,l,e,e, l,l,e,e), F(1,0,0,0,0,0,0,1,1,0)) X(0,1,0) V(0)
+O(cultl,   T(l,l,e,e, l,l,e,e), F(1,0,0,0,0,0,0,1,0,0)) X(0,1,0) V(1)
+
+O(ceqs,    T(s,s,e,e, s,s,e,e), F(1,0,0,1,0,0,0,0,0,0)) X(0,1,0) V(0)
+O(cges,    T(s,s,e,e, s,s,e,e), F(1,0,0,0,0,0,0,0,0,0)) X(0,1,0) V(0)
+O(cgts,    T(s,s,e,e, s,s,e,e), F(1,0,0,0,0,0,0,0,0,0)) X(0,1,0) V(0)
+O(cles,    T(s,s,e,e, s,s,e,e), F(1,0,0,0,0,0,0,0,0,0)) X(0,1,0) V(0)
+O(clts,    T(s,s,e,e, s,s,e,e), F(1,0,0,0,0,0,0,0,0,0)) X(0,1,0) V(0)
+O(cnes,    T(s,s,e,e, s,s,e,e), F(1,0,0,1,0,0,0,0,0,0)) X(0,1,0) V(0)
+O(cos,     T(s,s,e,e, s,s,e,e), F(1,0,0,1,0,0,0,0,0,0)) X(0,1,0) V(0)
+O(cuos,    T(s,s,e,e, s,s,e,e), F(1,0,0,1,0,0,0,0,0,0)) X(0,1,0) V(0)
+
+O(ceqd,    T(d,d,e,e, d,d,e,e), F(1,0,0,1,0,0,0,0,0,0)) X(0,1,0) V(0)
+O(cged,    T(d,d,e,e, d,d,e,e), F(1,0,0,0,0,0,0,0,0,0)) X(0,1,0) V(0)
+O(cgtd,    T(d,d,e,e, d,d,e,e), F(1,0,0,0,0,0,0,0,0,0)) X(0,1,0) V(0)
+O(cled,    T(d,d,e,e, d,d,e,e), F(1,0,0,0,0,0,0,0,0,0)) X(0,1,0) V(0)
+O(cltd,    T(d,d,e,e, d,d,e,e), F(1,0,0,0,0,0,0,0,0,0)) X(0,1,0) V(0)
+O(cned,    T(d,d,e,e, d,d,e,e), F(1,0,0,1,0,0,0,0,0,0)) X(0,1,0) V(0)
+O(cod,     T(d,d,e,e, d,d,e,e), F(1,0,0,1,0,0,0,0,0,0)) X(0,1,0) V(0)
+O(cuod,    T(d,d,e,e, d,d,e,e), F(1,0,0,1,0,0,0,0,0,0)) X(0,1,0) V(0)
 
 /* Memory */
-O(storeb,  T(w,e,e,e, m,e,e,e), P(0,0,0)) X(0,0,1) V(0)
-O(storeh,  T(w,e,e,e, m,e,e,e), P(0,0,0)) X(0,0,1) V(0)
-O(storew,  T(w,e,e,e, m,e,e,e), P(0,0,0)) X(0,0,1) V(0)
-O(storel,  T(l,e,e,e, m,e,e,e), P(0,0,0)) X(0,0,1) V(0)
-O(stores,  T(s,e,e,e, m,e,e,e), P(0,0,0)) X(0,0,1) V(0)
-O(stored,  T(d,e,e,e, m,e,e,e), P(0,0,0)) X(0,0,1) V(0)
-
-O(loadsb,  T(m,m,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
-O(loadub,  T(m,m,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
-O(loadsh,  T(m,m,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
-O(loaduh,  T(m,m,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
-O(loadsw,  T(m,m,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
-O(loaduw,  T(m,m,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
-O(load,    T(m,m,m,m, x,x,x,x), P(0,0,0)) X(0,0,1) V(0)
+O(storeb,  T(w,e,e,e, m,e,e,e), F(0,0,0,0,0,0,0,0,0,1)) X(0,0,1) V(0)
+O(storeh,  T(w,e,e,e, m,e,e,e), F(0,0,0,0,0,0,0,0,0,1)) X(0,0,1) V(0)
+O(storew,  T(w,e,e,e, m,e,e,e), F(0,0,0,0,0,0,0,0,0,1)) X(0,0,1) V(0)
+O(storel,  T(l,e,e,e, m,e,e,e), F(0,0,0,0,0,0,0,0,0,1)) X(0,0,1) V(0)
+O(stores,  T(s,e,e,e, m,e,e,e), F(0,0,0,0,0,0,0,0,0,1)) X(0,0,1) V(0)
+O(stored,  T(d,e,e,e, m,e,e,e), F(0,0,0,0,0,0,0,0,0,1)) X(0,0,1) V(0)
+
+O(loadsb,  T(m,m,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0,1)) X(0,0,1) V(0)
+O(loadub,  T(m,m,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0,1)) X(0,0,1) V(0)
+O(loadsh,  T(m,m,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0,1)) X(0,0,1) V(0)
+O(loaduh,  T(m,m,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0,1)) X(0,0,1) V(0)
+O(loadsw,  T(m,m,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0,1)) X(0,0,1) V(0)
+O(loaduw,  T(m,m,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0,1)) X(0,0,1) V(0)
+O(load,    T(m,m,m,m, x,x,x,x), F(0,0,0,0,0,0,0,0,0,1)) X(0,0,1) V(0)
 
 /* Extensions and Truncations */
-O(extsb,   T(w,w,e,e, x,x,e,e), P(1,0,0)) X(0,0,1) V(0)
-O(extub,   T(w,w,e,e, x,x,e,e), P(1,0,0)) X(0,0,1) V(0)
-O(extsh,   T(w,w,e,e, x,x,e,e), P(1,0,0)) X(0,0,1) V(0)
-O(extuh,   T(w,w,e,e, x,x,e,e), P(1,0,0)) X(0,0,1) V(0)
-O(extsw,   T(e,w,e,e, e,x,e,e), P(1,0,0)) X(0,0,1) V(0)
-O(extuw,   T(e,w,e,e, e,x,e,e), P(1,0,0)) X(0,0,1) V(0)
-
-O(exts,    T(e,e,e,s, e,e,e,x), P(1,0,0)) X(0,0,1) V(0)
-O(truncd,  T(e,e,d,e, e,e,x,e), P(1,0,0)) X(0,0,1) V(0)
-O(stosi,   T(s,s,e,e, x,x,e,e), P(1,0,0)) X(0,0,1) V(0)
-O(stoui,   T(s,s,e,e, x,x,e,e), P(1,0,0)) X(0,0,1) V(0)
-O(dtosi,   T(d,d,e,e, x,x,e,e), P(1,0,0)) X(0,0,1) V(0)
-O(dtoui,   T(d,d,e,e, x,x,e,e), P(1,0,0)) X(0,0,1) V(0)
-O(swtof,   T(e,e,w,w, e,e,x,x), P(1,0,0)) X(0,0,1) V(0)
-O(uwtof,   T(e,e,w,w, e,e,x,x), P(1,0,0)) X(0,0,1) V(0)
-O(sltof,   T(e,e,l,l, e,e,x,x), P(1,0,0)) X(0,0,1) V(0)
-O(ultof,   T(e,e,l,l, e,e,x,x), P(1,0,0)) X(0,0,1) V(0)
-O(cast,    T(s,d,w,l, x,x,x,x), P(1,0,0)) X(0,0,1) V(0)
+O(extsb,   T(w,w,e,e, x,x,e,e), F(1,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
+O(extub,   T(w,w,e,e, x,x,e,e), F(1,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
+O(extsh,   T(w,w,e,e, x,x,e,e), F(1,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
+O(extuh,   T(w,w,e,e, x,x,e,e), F(1,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
+O(extsw,   T(e,w,e,e, e,x,e,e), F(1,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
+O(extuw,   T(e,w,e,e, e,x,e,e), F(1,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
+
+O(exts,    T(e,e,e,s, e,e,e,x), F(1,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
+O(truncd,  T(e,e,d,e, e,e,x,e), F(1,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
+O(stosi,   T(s,s,e,e, x,x,e,e), F(1,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
+O(stoui,   T(s,s,e,e, x,x,e,e), F(1,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
+O(dtosi,   T(d,d,e,e, x,x,e,e), F(1,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
+O(dtoui,   T(d,d,e,e, x,x,e,e), F(1,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
+O(swtof,   T(e,e,w,w, e,e,x,x), F(1,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
+O(uwtof,   T(e,e,w,w, e,e,x,x), F(1,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
+O(sltof,   T(e,e,l,l, e,e,x,x), F(1,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
+O(ultof,   T(e,e,l,l, e,e,x,x), F(1,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
+O(cast,    T(s,d,w,l, x,x,x,x), F(1,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
 
 /* Stack Allocation */
-O(alloc4,  T(e,l,e,e, e,x,e,e), P(0,0,0)) X(0,0,0) V(0)
-O(alloc8,  T(e,l,e,e, e,x,e,e), P(0,0,0)) X(0,0,0) V(0)
-O(alloc16, T(e,l,e,e, e,x,e,e), P(0,0,0)) X(0,0,0) V(0)
+O(alloc4,  T(e,l,e,e, e,x,e,e), F(0,0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0)
+O(alloc8,  T(e,l,e,e, e,x,e,e), F(0,0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0)
+O(alloc16, T(e,l,e,e, e,x,e,e), F(0,0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0)
 
 /* Variadic Function Helpers */
-O(vaarg,   T(m,m,m,m, x,x,x,x), P(0,0,0)) X(0,0,0) V(0)
-O(vastart, T(m,e,e,e, x,e,e,e), P(0,0,0)) X(0,0,0) V(0)
+O(vaarg,   T(m,m,m,m, x,x,x,x), F(0,0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0)
+O(vastart, T(m,e,e,e, x,e,e,e), F(0,0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0)
 
-O(copy,    T(w,l,s,d, x,x,x,x), P(0,0,0)) X(0,0,1) V(0)
+O(copy,    T(w,l,s,d, x,x,x,x), F(0,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
 
 /* Debug */
-O(dbgloc,  T(w,e,e,e, w,e,e,e), P(0,0,0)) X(0,0,1) V(0)
+O(dbgloc,  T(w,e,e,e, w,e,e,e), F(0,0,0,0,0,0,0,0,0,1)) X(0,0,1) V(0)
 
 /****************************************/
 /* INTERNAL OPERATIONS (keep nop first) */
 /****************************************/
 
 /* Miscellaneous and Architecture-Specific Operations */
-O(nop,     T(x,x,x,x, x,x,x,x), P(0,0,0)) X(0,0,1) V(0)
-O(addr,    T(m,m,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
-O(blit0,   T(m,e,e,e, m,e,e,e), P(0,0,0)) X(0,1,0) V(0)
-O(blit1,   T(w,e,e,e, x,e,e,e), P(0,0,0)) X(0,1,0) V(0)
-O(swap,    T(w,l,s,d, w,l,s,d), P(0,0,0)) X(1,0,0) V(0)
-O(sign,    T(w,l,e,e, x,x,e,e), P(0,0,0)) X(0,0,0) V(0)
-O(salloc,  T(e,l,e,e, e,x,e,e), P(0,0,0)) X(0,0,0) V(0)
-O(xidiv,   T(w,l,e,e, x,x,e,e), P(0,0,0)) X(1,0,0) V(0)
-O(xdiv,    T(w,l,e,e, x,x,e,e), P(0,0,0)) X(1,0,0) V(0)
-O(xcmp,    T(w,l,s,d, w,l,s,d), P(0,0,0)) X(1,1,0) V(0)
-O(xtest,   T(w,l,e,e, w,l,e,e), P(0,0,0)) X(1,1,0) V(0)
-O(acmp,    T(w,l,e,e, w,l,e,e), P(0,0,0)) X(0,0,0) V(0)
-O(acmn,    T(w,l,e,e, w,l,e,e), P(0,0,0)) X(0,0,0) V(0)
-O(afcmp,   T(e,e,s,d, e,e,s,d), P(0,0,0)) X(0,0,0) V(0)
-O(reqz,    T(w,l,e,e, x,x,e,e), P(0,0,0)) X(0,0,0) V(0)
-O(rnez,    T(w,l,e,e, x,x,e,e), P(0,0,0)) X(0,0,0) V(0)
+O(nop,     T(x,x,x,x, x,x,x,x), F(0,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
+O(addr,    T(m,m,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
+O(blit0,   T(m,e,e,e, m,e,e,e), F(0,0,0,0,0,0,0,0,0,1)) X(0,1,0) V(0)
+O(blit1,   T(w,e,e,e, x,e,e,e), F(0,0,0,0,0,0,0,0,0,1)) X(0,1,0) V(0)
+O(swap,    T(w,l,s,d, w,l,s,d), F(0,0,0,0,0,0,0,0,0,0)) X(1,0,0) V(0)
+O(sign,    T(w,l,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0,0)) X(0,0,0) V(0)
+O(salloc,  T(e,l,e,e, e,x,e,e), F(0,0,0,0,0,0,0,0,0,0)) X(0,0,0) V(0)
+O(xidiv,   T(w,l,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0,0)) X(1,0,0) V(0)
+O(xdiv,    T(w,l,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0,0)) X(1,0,0) V(0)
+O(xcmp,    T(w,l,s,d, w,l,s,d), F(0,0,0,0,0,0,0,0,0,0)) X(1,1,0) V(0)
+O(xtest,   T(w,l,e,e, w,l,e,e), F(0,0,0,0,0,0,0,0,0,0)) X(1,1,0) V(0)
+O(acmp,    T(w,l,e,e, w,l,e,e), F(0,0,0,0,0,0,0,0,0,0)) X(0,0,0) V(0)
+O(acmn,    T(w,l,e,e, w,l,e,e), F(0,0,0,0,0,0,0,0,0,0)) X(0,0,0) V(0)
+O(afcmp,   T(e,e,s,d, e,e,s,d), F(0,0,0,0,0,0,0,0,0,0)) X(0,0,0) V(0)
+O(reqz,    T(w,l,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0,0)) X(0,0,0) V(0)
+O(rnez,    T(w,l,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0,0)) X(0,0,0) V(0)
 
 /* Arguments, Parameters, and Calls */
-O(par,     T(x,x,x,x, x,x,x,x), P(0,0,0)) X(0,0,0) V(0)
-O(parsb,   T(x,x,x,x, x,x,x,x), P(0,0,0)) X(0,0,0) V(0)
-O(parub,   T(x,x,x,x, x,x,x,x), P(0,0,0)) X(0,0,0) V(0)
-O(parsh,   T(x,x,x,x, x,x,x,x), P(0,0,0)) X(0,0,0) V(0)
-O(paruh,   T(x,x,x,x, x,x,x,x), P(0,0,0)) X(0,0,0) V(0)
-O(parc,    T(e,x,e,e, e,x,e,e), P(0,0,0)) X(0,0,0) V(0)
-O(pare,    T(e,x,e,e, e,x,e,e), P(0,0,0)) X(0,0,0) V(0)
-O(arg,     T(w,l,s,d, x,x,x,x), P(0,0,0)) X(0,0,0) V(0)
-O(argsb,   T(w,e,e,e, x,x,x,x), P(0,0,0)) X(0,0,0) V(0)
-O(argub,   T(w,e,e,e, x,x,x,x), P(0,0,0)) X(0,0,0) V(0)
-O(argsh,   T(w,e,e,e, x,x,x,x), P(0,0,0)) X(0,0,0) V(0)
-O(arguh,   T(w,e,e,e, x,x,x,x), P(0,0,0)) X(0,0,0) V(0)
-O(argc,    T(e,x,e,e, e,l,e,e), P(0,0,0)) X(0,0,0) V(0)
-O(arge,    T(e,l,e,e, e,x,e,e), P(0,0,0)) X(0,0,0) V(0)
-O(argv,    T(x,x,x,x, x,x,x,x), P(0,0,0)) X(0,0,0) V(0)
-O(call,    T(m,m,m,m, x,x,x,x), P(0,0,0)) X(0,0,0) V(0)
+O(par,     T(x,x,x,x, x,x,x,x), F(0,0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0)
+O(parsb,   T(x,x,x,x, x,x,x,x), F(0,0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0)
+O(parub,   T(x,x,x,x, x,x,x,x), F(0,0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0)
+O(parsh,   T(x,x,x,x, x,x,x,x), F(0,0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0)
+O(paruh,   T(x,x,x,x, x,x,x,x), F(0,0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0)
+O(parc,    T(e,x,e,e, e,x,e,e), F(0,0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0)
+O(pare,    T(e,x,e,e, e,x,e,e), F(0,0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0)
+O(arg,     T(w,l,s,d, x,x,x,x), F(0,0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0)
+O(argsb,   T(w,e,e,e, x,x,x,x), F(0,0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0)
+O(argub,   T(w,e,e,e, x,x,x,x), F(0,0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0)
+O(argsh,   T(w,e,e,e, x,x,x,x), F(0,0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0)
+O(arguh,   T(w,e,e,e, x,x,x,x), F(0,0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0)
+O(argc,    T(e,x,e,e, e,l,e,e), F(0,0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0)
+O(arge,    T(e,l,e,e, e,x,e,e), F(0,0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0)
+O(argv,    T(x,x,x,x, x,x,x,x), F(0,0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0)
+O(call,    T(m,m,m,m, x,x,x,x), F(0,0,0,0,0,0,0,0,0,1)) X(0,0,0) V(0)
 
 /* Flags Setting */
-O(flagieq,  T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
-O(flagine,  T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
-O(flagisge, T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
-O(flagisgt, T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
-O(flagisle, T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
-O(flagislt, T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
-O(flagiuge, T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
-O(flagiugt, T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
-O(flagiule, T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
-O(flagiult, T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
-O(flagfeq,  T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
-O(flagfge,  T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
-O(flagfgt,  T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
-O(flagfle,  T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
-O(flagflt,  T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
-O(flagfne,  T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
-O(flagfo,   T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
-O(flagfuo,  T(x,x,e,e, x,x,e,e), P(0,0,0)) X(0,0,1) V(0)
-
+O(flagieq,  T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
+O(flagine,  T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
+O(flagisge, T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
+O(flagisgt, T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
+O(flagisle, T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
+O(flagislt, T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
+O(flagiuge, T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
+O(flagiugt, T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
+O(flagiule, T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
+O(flagiult, T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
+O(flagfeq,  T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
+O(flagfge,  T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
+O(flagfgt,  T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
+O(flagfle,  T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
+O(flagflt,  T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
+O(flagfne,  T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
+O(flagfo,   T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
+O(flagfuo,  T(x,x,e,e, x,x,e,e), F(0,0,0,0,0,0,0,0,0,0)) X(0,0,1) V(0)
 
 #undef T
 #undef X

parse.c | +10 -4

@@ -15,11 +15,17 @@ enum {
 };
 
 Op optab[NOp] = {
-#undef P
-#define P(cf, hi, id) .canfold = cf, .hasid = hi, .idval = id
-#define O(op, t, p) [O##op]={.name = #op, .argcls = t, p},
+#undef F
+#define F(cf, hi, id, co, as, im, ic, lg, cv, pn) \
+	.canfold = cf, \
+	.hasid = hi, .idval = id, \
+	.commutes = co, .assoc = as, \
+	.idemp = im, \
+	.cmpeqwl = ic, .cmplgtewl = lg, .eqval = cv, \
+	.ispinned = pn
+#define O(op, k, flags) [O##op]={.name = #op, .argcls = k, flags},
 	#include "ops.h"
-#undef P
+#undef F
 };
 
 typedef enum {

reassoc.c | +101 -0

@@ -0,0 +1,101 @@
+#include "all.h"
+
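+/* record a move to each point of use for trivial ins: cmps and Kl add %t, N */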
+static void
+ireassoc(Fn *fn, Blk *b, Ins *i, uint *pnim, InsMov **pim)
+{
+	Blk *b2;
+	Tmp *t;
+	Use *u;
+	int64_t v;
+	int x;
+
+	assert(b->ins <= i && i < &b->ins[b->nins]);
+	if (!iscmp(i->op, &x, &x))
+	if (!iskladdcon(fn, i, &v))
+		return;
+
+	assert(rtype(i->to) == RTmp);
+	t = &fn->tmp[i->to.val];
+	for (u = t->use; u < &t->use[t->nuse]; u++) {
+		if (u->type == UPhi)
+			continue;
+		if (u->type == UIns)
+		if (INRANGE(u->u.ins->op, Oarg, Ocall))
+			continue;
+		b2 = fn->rpo[u->bid];
+		vgrow(pim, ++(*pnim));
+		(*pim)[(*pnim)-1] = (InsMov){
+			.from = {.bid = b->id, .insn = i-b->ins},
+			.to = {
+				.bid = u->bid,
+				.insn = u->type == UJmp ? b2->nins : u->u.ins-b2->ins
+			}};
+	}
+}
+
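+/* give the duplicated ins a fresh tmp and rewire the use that follows (next ins or blk jmp) */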
+static void
+trealloc(Fn *fn, Blk *b, Ins *i)
+{
+	Ins *i2;
+	Ref r;
+
+	assert(b->ins <= i && i < &b->ins[b->nins]);
+	r = newtmp("rea", i->cls, fn);
+	if (i < &b->ins[b->nins-1]) {
+		i2 = i+1;
+		/* special case of both args of target instruction reassociated */
+		if (!req(i->to, i2->arg[0]) && !req(i->to, i2->arg[1])) {
+			assert(i < &b->ins[b->nins-2]);
+			i2 = i+2;
+		}
+		assert(req(i->to, i2->arg[0]) || req(i->to, i2->arg[1]));
+		if (req(i->to, i2->arg[0]))
+			i2->arg[0] = r;
+		if (req(i->to, i2->arg[1]))
+			i2->arg[1] = r;
+	} else {
+		assert(req(i->to, b->jmp.arg));
+		b->jmp.arg = r;
+	}
+	i->to = r;
+}
+
+/* Redistribute trivial ops to point of use. */
+/* Reduces register pressure. */
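+/* e.g. a cmp or address offset computed once but used in several blks is */
+/* re-issued next to each use under a fresh tmp; unused originals become nops */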
+/* needs rpo, use; breaks use */
+void
+reassoc(Fn *fn)
+{
+	Blk *b;
+	Ins *i;
+	InsMov *im;
+	uint n, nim;
+
+	nim = 0;
+	im = vnew(nim, sizeof im[0], PHeap);
+
+	/* identify trivial ins */
+	for (b = fn->start; b; b = b->link) {
+		for (i = b->ins; i < &b->ins[b->nins]; i++)
+			ireassoc(fn, b, i, &nim, &im);
+	}
+
+	/* duplicate trivial ins */
+	movins(fn, im, nim, 0/*!del*/);
+
+	/* create new tmps for dups */
+	for (n = 0; n < nim; n++) {
+		b = fn->rpo[im[n].to.bid];
+		i = &b->ins[im[n].to.insn];
+		trealloc(fn, b, i);
+	}
+
+	/* delete (now) unused ins */
+	filluse(fn);
+	nopunused(fn);
+
+	if (debug['G']) {
+		fprintf(stderr, "\n> After Reassociation:\n");
+		printfn(fn, stderr);
+	}
+}

ssa.c | +1 -1

@@ -1,7 +1,7 @@
 #include "all.h"
 #include <stdarg.h>
 
-static void
+void
 adduse(Tmp *tmp, int ty, Blk *b, ...)
 {
 	Use *u;

test/_gcm1.ssa | +48 -0

@@ -0,0 +1,48 @@
+export
+function w $ifmv(w %p1, w %p2, w %p3) {
+@start
+@entry
+	%rt =w add %p2, %p3    # gcm moves to @true
+	%rf =w sub %p2, %p3    # gcm moves to @false
+	jnz %p1, @true, @false
+@true
+	%r =w copy %rt
+	jmp @exit
+@false
+	%r =w copy %rf
+	jmp @exit
+@exit
+	ret %r
+}
+
+export
+function w $hoist1(w %p1, w %p2, w %p3) {
+@start
+@entry
+	%n =w copy 0
+	%i =w copy %p1
+@loop
+	%base =w add %p2, %p3   # gcm moves to @exit
+	%i =w sub %i, 1
+	%n =w add %n, 1
+	jnz %i, @loop, @exit
+@exit
+	%r =w add %base, %n
+	ret %r
+}
+
+export
+function w $hoist2(w %p1, w %p2, w %p3) {
+@start
+@entry
+	%n =w copy 0
+	%i =w copy %p1
+@loop
+	%base =w add %p2, %p3   # gcm moves to @entry
+	%i =w sub %i, 1
+	%n =w add %n, %base
+	jnz %i, @loop, @exit
+@exit
+	%r =w add %base, %n
+	ret %r
+}

test/_gcm2.ssa | +43 -0

@@ -0,0 +1,43 @@
+# Programs from "Global Code Motion / Global Value Numbering" by Cliff Click
+# https://courses.cs.washington.edu/courses/cse501/06wi/reading/click-pldi95.pdf
+
+# GCM program in Figure 1
+
+function w $gcm_test(w %a){
+@start
+	%i.0 =w copy 0
+@loop
+	%i.1 =w phi @start %i.0, @loop %i.2
+	%b =w add %a, 1                     # early schedule moves to @start
+	%i.2 =w add %i.1, %b
+	%c =w mul %i.2, 2                   #  late schedule moves to @end
+	%x =w csltw %i.2, 10
+	jnz %x, @loop, @end
+@end
+	ret %c
+}
+
+# GCM program in "Figure 3 x's definition does not dominate its use"
+#
+# SSA construction will insert a phi instruction for "x" in @if_false,
+# preventing the "add" in @if_false from being moved to @if_true
+
+function $gcm_test2 (w %a){
+@start
+	%f =w copy 1
+	%x =w copy 0
+	%s.0 =w copy 0
+@loop
+	%s.1 =w phi @start %s.0, @if_false %s.2
+	jnz %a, @if, @end
+@if
+	jnz %f, @if_true, @if_false
+@if_true
+	%f =w copy 0
+	%x =w add %x, 1
+@if_false
+	%s.2 =w add %s.1, %x
+	jmp @loop
+@end
+	ret
+}

test/_load-elim.ssa | +17 -0

@@ -0,0 +1,17 @@
+# GCM can eliminate unused add/load instructions
+
+export
+function w $f(l %p, w %c) {
+@start
+	jnz %c, @true, @false
+@true
+	%p1 =l add %p, 4
+	%v1 =w loaduw %p1
+	jmp @end
+@false
+	%p2 =l add %p, 4
+	%v2 =w loaduw %p2
+	jmp @end
+@end
+	ret 0
+}

test/gvn-live-dead.ssa | +19 -0

@@ -0,0 +1,19 @@
+export
+function w $test(w %p1, w %p2) {
+@start
+@entry
+	%t1 =w copy 1
+	jnz %t1, @live, @dead1
+@live
+	%t2 =w add %p1, %p2
+	ret %t2
+@dead1
+	%t2 =w add %p1, %p2  # live ins in dead blk
+@dead2
+	jnz %t1, @live, @dead1
+}
+
+# >>> driver
+# extern int test(int p1, int p2);
+# int main() { return test(1, 2) != 3; }
+# <<<

test/non0jnz.ssa | +31 -0

@@ -0,0 +1,31 @@
+# GVN 0/non-0 inference removes @yesyes, @yesno, @noyes, @nono
+
+export
+function w $test(w %c) {
+@start
+	jnz %c, @yes, @no
+@yes
+	%c0 =w cnew %c, 0
+	jnz %c0, @yesyes, @yesno
+@yesyes
+	%rc =w copy 1
+	jmp @end
+@yesno
+	%rc =w copy 111
+	jmp @end
+@no
+	%c1 =w cnew %c, 0
+	jnz %c1, @noyes, @nono
+@noyes
+	%rc =w copy 222
+	jmp @end
+@nono
+	%rc =w copy 0
+@end
+	ret %rc
+}
+
+# >>> driver
+# int test(int);
+# int main(void) { return test(0); }
+# <<<

util.c | +56 -0

@@ -229,6 +229,24 @@ iscmp(int op, int *pk, int *pc)
 	return 1;
 }
 
+static int INVCMPWL[] = {
+	/*Oceqw*/Ocnew, /*Ocnew*/Oceqw,
+	/*Ocsgew*/Ocsltw, /*Ocsgtw*/Ocslew, /*Ocslew*/Ocsgtw, /*Ocsltw*/Ocsgew,
+	/*Ocugew*/Ocultw, /*Ocugtw*/Oculew, /*Oculew*/Ocugtw, /*Ocultw*/Ocugew,
+	/*Oceql*/Ocnel, /*Ocnel*/Oceql,
+	/*Ocsgel*/Ocsltl, /*Ocsgtl*/Ocslel, /*Ocslel*/Ocsgtl, /*Ocsltl*/Ocsgel,
+	/*Ocugel*/Ocultl, /*Ocugtl*/Oculel, /*Oculel*/Ocugtl, /*Ocultl*/Ocugel,
+};
+
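+/* negation of an integer w/l cmp op */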
+int
+invcmpwl(int cmp)
+{
+	assert(Oceqw <= cmp && cmp <= Ocultl);
+	return INVCMPWL[cmp - Oceqw];
+}
+
+
 int
 argcls(Ins *i, int n)
 {
@@ -338,6 +356,23 @@ phicls(int t, Tmp *tmp)
 	return t1;
 }
 
+uint
+phiargn(Phi *p, Blk *b)
+{
+	uint n;
+
+	for (n = 0; n < p->narg; n++)
+		if (p->blk[n] == b)
+			return n;
+	die("unreachable");
+}
+
+Ref
+phiarg(Phi *p, Blk *b)
+{
+	return p->arg[phiargn(p, b)];
+}
+
 Ref
 newtmp(char *prfx, int k,  Fn *fn)
 {
@@ -421,6 +456,27 @@ addcon(Con *c0, Con *c1, int m)
 	return 1;
 }
 
+int
+isconbits(Fn *fn, Ref r, int64_t *v)
+{
+	Con *c;
+
+	if (rtype(r) == RCon) {
+		c = &fn->con[r.val];
+		if (c->type == CBits) {
+			*v = c->bits.i;
+			return 1;
+		}
+	}
+	return 0;
+}
+
+int
+istmpconbits(Fn *fn, Ins *i, int64_t *v)
+{
+	return rtype(i->arg[0]) == RTmp && isconbits(fn, i->arg[1], v);
+}
+
 void
 salloc(Ref rt, Ref rs, Fn *fn)
 {