gingerBill пре 2 година
родитељ
комит
984a95b8c7
8 измењених фајлова са 239 додато и 116 уклоњено
  1. 208 97
      src/tilde/tb.h
  2. BIN
      src/tilde/tb.lib
  3. 3 3
      src/tilde/tb_arena.h
  4. 3 3
      src/tilde_builtin.cpp
  5. 3 0
      src/tilde_expr.cpp
  6. 4 4
      src/tilde_proc.cpp
  7. 12 8
      src/tilde_stmt.cpp
  8. 6 1
      src/tilde_type_info.cpp

+ 208 - 97
src/tilde/tb.h

@@ -1,3 +1,12 @@
+// Glossary (because i don't know where else to put it)
+//   IR   - intermediate representation
+//   SoN  - sea of nodes (https://www.oracle.com/technetwork/java/javase/tech/c2-ir95-150110.pdf)
+//   SSA  - single static assignment
+//   GVN  - global value numbering
+//   CSE  - common subexpression elimination
+//   DSE  - dead store elimination
+//   GCM  - global code motion
+//   SROA - scalar replacement of aggregates
 #ifndef TB_CORE_H
 #define TB_CORE_H
 
@@ -150,18 +159,19 @@ typedef enum TB_ISelMode {
 
 typedef enum TB_DataTypeEnum {
     // Integers, note void is an i0 and bool is an i1
-    //   i(0-2047)
+    //   i(0-64)
     TB_INT,
     // Floating point numbers
     //   f{32,64}
     TB_FLOAT,
     // Pointers
-    //   ptr(0-2047)
     TB_PTR,
     // Tuples, these cannot be used in memory ops, just accessed via projections
     TB_TUPLE,
-    // represents control flow as a kind of data
+    // represents control flow for REGION, BRANCH
     TB_CONTROL,
+    // represents memory (and I/O)
+    TB_MEMORY,
 } TB_DataTypeEnum;
 
 typedef enum TB_FloatFormat {
@@ -179,6 +189,7 @@ typedef union TB_DataType {
     };
     uint32_t raw;
 } TB_DataType;
+static_assert(sizeof(TB_DataType) == 4, "im expecting this to be a uint32_t");
 
 // classify data types
 #define TB_IS_VOID_TYPE(x)     ((x).type == TB_INT && (x).data == 0)
@@ -192,75 +203,136 @@ typedef union TB_DataType {
 #define TB_GET_FLOAT_FORMAT(x) ((x).data)
 #define TB_GET_PTR_ADDRSPACE(x) ((x).data)
 
+////////////////////////////////
+// ANNOTATIONS
+////////////////////////////////
+//
+//   (A, B) -> (C, D)
+//
+//   node takes A and B, produces C, D. if there's multiple
+//   results we need to use projections and the indices are
+//   based on the order seen here, proj0 is C, proj1 is D.
+//
+//   (A, B) & C -> Int
+//
+//   nodes takes A and B along with C in it's extra data. this is
+//   where non-node inputs fit.
+//
 typedef enum TB_NodeTypeEnum {
     TB_NULL = 0,
 
-    // Immediates
+    ////////////////////////////////
+    // CONSTANTS
+    ////////////////////////////////
     TB_INTEGER_CONST,
     TB_FLOAT32_CONST,
     TB_FLOAT64_CONST,
 
-    // only one per function
-    TB_START, // fn()
-
-    // regions represent the begining of BBs
-    TB_REGION, // fn(preds: []region)
-
-    // projection
-    TB_PROJ,
-
-    TB_CALL,    // normal call
-    TB_SYSCALL, // system call
-
-    // Managed ops
-    TB_SAFEPOINT,
-
-    // Memory operations
-    TB_STORE, // fn(r: control, addr: data, src: data)
-    TB_MEMCPY,
-    TB_MEMSET,
-
-    // Atomics
-    TB_ATOMIC_TEST_AND_SET,
-    TB_ATOMIC_CLEAR,
-
-    TB_ATOMIC_LOAD,
-    TB_ATOMIC_XCHG,
-    TB_ATOMIC_ADD,
-    TB_ATOMIC_SUB,
-    TB_ATOMIC_AND,
-    TB_ATOMIC_XOR,
-    TB_ATOMIC_OR,
-
-    TB_ATOMIC_CMPXCHG,
-    TB_DEBUGBREAK,
-
-    // Terminators
-    TB_BRANCH,
-    TB_RET,
-    TB_UNREACHABLE,
-    TB_TRAP,
-
-    TB_POISON,
-
-    // Load
-    TB_LOAD,
-
-    // Pointers
-    TB_LOCAL,
-
-    TB_GET_SYMBOL_ADDRESS,
-
-    TB_MEMBER_ACCESS,
-    TB_ARRAY_ACCESS,
+    ////////////////////////////////
+    // MISCELLANEOUS
+    ////////////////////////////////
+    // this is an unspecified value, usually generated by the optimizer
+    // when malformed input is folded into an operation.
+    TB_POISON,        // () -> Any
+    // projections just extract a single field of a tuple
+    TB_PROJ,          // Tuple & Int -> Any
+    // this is a simple way to embed machine code into the code
+    TB_MACHINE_OP,    // (Control, Memory) & Buffer -> (Control, Memory)
+    // reads the TSC on x64
+    TB_CYCLE_COUNTER, // (Control) -> Int64
+
+    ////////////////////////////////
+    // CONTROL
+    ////////////////////////////////
+    //   there's only one START and STOP per function
+    TB_START,      // () -> (Control, Memory, Data...)
+    TB_END,        // (Control, Memory, Data?) -> ()
+    //   regions are used to represent paths which have multiple entries.
+    //   each input is a predecessor.
+    TB_REGION,     // (Control...) -> (Control)
+    //   phi nodes work the same as in SSA CFG, the value is based on which predecessor was taken.
+    //   each input lines up with the regions such that region.in[i] will use phi.in[i+1] as the
+    //   subsequent data.
+    TB_PHI,        // (Control, Data...) -> Data
+    //   branch is used to implement most control flow, it acts like a switch
+    //   statement in C usually. they take a key and match against some cases,
+    //   if they match, it'll jump to that successor, if none match it'll take
+    //   the default successor.
+    //
+    //   if (cond) { A; } else { B; }    is just     switch (cond) { case 0: B; default: A; }
+    //
+    //   it's possible to not pass a key and the default successor is always called, this is
+    //   a GOTO. tb_inst_goto, tb_inst_if can handle common cases for you.
+    TB_BRANCH,      // (Control, Data?) -> (Control...)
+    //   debugbreak will trap in a continuable manner.
+    TB_DEBUGBREAK,  // (Control, Memory) -> (Control)
+    //   trap will not be continuable but will stop execution.
+    TB_TRAP,        // (Control) -> (Control)
+    //   unreachable means it won't trap or be continuable.
+    TB_UNREACHABLE, // (Control) -> (Control)
+
+    ////////////////////////////////
+    // CONTROL + MEMORY
+    ////////////////////////////////
+    //   nothing special, it's just a function call, 3rd argument here is the
+    //   target pointer (or syscall number) and the rest are just data args.
+    TB_CALL,           // (Control, Memory, Data, Data...) -> (Control, Memory, Data)
+    TB_SYSCALL,        // (Control, Memory, Data, Data...) -> (Control, Memory, Data)
+    //   safepoint polls are the same except they only trigger if the poll site
+    //   says to (platform specific but almost always just the page being made
+    //   unmapped/guard), 3rd argument is the poll site.
+    TB_SAFEPOINT_POLL, // (Control, Memory, Ptr, Data...)  -> (Control)
+
+    ////////////////////////////////
+    // MEMORY
+    ////////////////////////////////
+    //   LOAD and STORE are standard memory accesses, they can be folded away.
+    TB_LOAD,    // (Memory, Ptr)       -> Data
+    TB_STORE,   // (Memory, Ptr, Data) -> Memory
+    //   bulk memory ops.
+    TB_MEMCPY,  // (Memory, Ptr, Ptr, Size)  -> Memory
+    TB_MEMSET,  // (Memory, Ptr, Int8, Size) -> Memory
+    //   these memory accesses represent "volatile" which means
+    //   they may produce side effects and thus cannot be eliminated.
+    TB_READ,    // (Memory, Ptr)       -> (Memory, Data)
+    TB_WRITE,   // (Memory, Ptr, Data) -> (Memory, Data)
+    //   atomics have multiple observers (if not they wouldn't need to
+    //   be atomic) and thus produce side effects everywhere just like
+    //   volatiles except they have synchronization guarentees. the atomic
+    //   data ops will return the value before the operation is performed.
+    //   Atomic CAS return the old value and a boolean for success (true if
+    //   the value was changed)
+    TB_ATOMIC_LOAD, // (Memory, Ptr)        -> (Memory, Data)
+    TB_ATOMIC_XCHG, // (Memory, Ptr, Data)  -> (Memory, Data)
+    TB_ATOMIC_ADD,  // (Memory, Ptr, Data)  -> (Memory, Data)
+    TB_ATOMIC_SUB,  // (Memory, Ptr, Data)  -> (Memory, Data)
+    TB_ATOMIC_AND,  // (Memory, Ptr, Data)  -> (Memory, Data)
+    TB_ATOMIC_XOR,  // (Memory, Ptr, Data)  -> (Memory, Data)
+    TB_ATOMIC_OR,   // (Memory, Ptr, Data)  -> (Memory, Data)
+    TB_ATOMIC_CAS,  // (Memory, Data, Data) -> (Memory, Data, Bool)
+
+    ////////////////////////////////
+    // POINTERS
+    ////////////////////////////////
+    //   LOCAL will statically allocate stack space
+    TB_LOCAL,         // () & (Int, Int) -> Ptr
+    //   SYMBOL will return a pointer to a TB_Symbol
+    TB_SYMBOL,        // () & TB_Symbol* -> Ptr
+    //   offsets pointer by constant value
+    TB_MEMBER_ACCESS, // Ptr & Int -> Ptr
+    //   arguments represent base, index, and stride respectively
+    //   and will perform `base + index*stride`
+    TB_ARRAY_ACCESS,  // (Ptr, Int) & Int -> Ptr
+    //   converts an integer to a pointer
+    TB_INT2PTR,       // Int -> Ptr
+    //   converts a pointer to an integer
+    TB_PTR2INT,       // Ptr -> Int
 
     // Conversions
     TB_TRUNCATE,
     TB_FLOAT_EXT,
     TB_SIGN_EXT,
     TB_ZERO_EXT,
-    TB_INT2PTR,
-    TB_PTR2INT,
     TB_UINT2FLOAT,
     TB_FLOAT2UINT,
     TB_INT2FLOAT,
@@ -315,18 +387,17 @@ typedef enum TB_NodeTypeEnum {
     TB_CMP_FLE,
 
     // Special ops
-    // does full multiplication (64x64=128 and so on) returning
-    // the low and high values in separate projections
+    //   adds two paired integers to two other paired integers and returns
+    //   a low and high value
+    TB_ADDPAIR,
+    //   does full multiplication (64x64=128 and so on) returning
+    //   the low and high values in separate projections
     TB_MULPAIR,
 
-    // PHI
-    TB_PHI, // fn(r: region, x: []data)
-
     // variadic
     TB_VA_START,
 
     // x86 intrinsics
-    TB_X86INTRIN_RDTSC,
     TB_X86INTRIN_LDMXCSR,
     TB_X86INTRIN_STMXCSR,
     TB_X86INTRIN_SQRT,
@@ -372,6 +443,9 @@ typedef struct TB_FunctionPrototype TB_FunctionPrototype;
 
 typedef struct TB_Attrib            TB_Attrib;
 
+// target-specific, just a unique ID for the registers
+typedef int TB_PhysicalReg;
+
 // Refers generically to objects within a module
 //
 // TB_Function, TB_Global, and TB_External are all subtypes of TB_Symbol
@@ -412,12 +486,11 @@ typedef struct TB_Symbol {
 typedef struct TB_Node TB_Node;
 struct TB_Node {
     TB_NodeType type;
+    uint16_t input_count; // number of node inputs.
     TB_DataType dt;
-    uint16_t input_count; // number of node inputs
-    uint16_t extra_count; // number of bytes for extra operand data
 
-    // local to the TB_Passes
-    uint32_t lattice_id;
+    // makes it easier to track in graph walks
+    size_t gvn;
 
     TB_Attrib* attribs;
     TB_Node** inputs;
@@ -442,9 +515,8 @@ typedef struct { // TB_PROJ
     int index;
 } TB_NodeProj;
 
-typedef struct { // TB_INT
-    uint64_t num_words;
-    uint64_t words[];
+typedef struct { // TB_INTEGER_CONST
+    uint64_t value;
 } TB_NodeInt;
 
 typedef struct { // any compare operator
@@ -457,17 +529,26 @@ typedef struct { // any integer binary operator
 
 typedef struct { // TB_MULPAIR
     TB_Node *lo, *hi;
-} TB_NodeMulPair;
+} TB_NodeArithPair;
 
 typedef struct {
     TB_CharUnits align;
-    bool is_volatile;
 } TB_NodeMemAccess;
 
 typedef struct {
     TB_CharUnits size, align;
 } TB_NodeLocal;
 
+typedef struct {
+    // this is the raw buffer
+    size_t length;
+    const uint8_t* data;
+
+    // represents the outputs, inputs and temporaries in that order
+    size_t outs, ins, tmps;
+    TB_PhysicalReg regs[];
+} TB_NodeMachineOp;
+
 typedef struct {
     float value;
 } TB_NodeFloat32;
@@ -491,6 +572,8 @@ typedef struct {
 typedef struct {
     TB_MemoryOrder order;
     TB_MemoryOrder order2;
+    TB_Node* proj0;
+    TB_Node* proj1;
 } TB_NodeAtomic;
 
 typedef struct {
@@ -506,9 +589,18 @@ typedef struct {
     TB_Node* end;
     const char* tag;
 
+    // position in a postorder walk
+    int postorder_id;
     // immediate dominator (can be approximate)
     int dom_depth;
     TB_Node* dom;
+
+    // used for IR building only, stale after that.
+    //
+    // this represents the first and last memory values within a region,
+    // if a region ever has multiple predecessors we apply a join on these
+    // memory.
+    TB_Node *mem_in, *mem_out;
 } TB_NodeRegion;
 
 typedef struct TB_MultiOutput {
@@ -558,7 +650,7 @@ typedef struct {
 #define TB_TYPE_F64     TB_DataType{ { TB_FLOAT, 0, TB_FLT_64 } }
 #define TB_TYPE_BOOL    TB_DataType{ { TB_INT,   0, 1 } }
 #define TB_TYPE_PTR     TB_DataType{ { TB_PTR,   0, 0 } }
-
+#define TB_TYPE_MEMORY  TB_DataType{ { TB_MEMORY,0, 0 } }
 #define TB_TYPE_INTN(N) TB_DataType{ { TB_INT,   0, (N) } }
 #define TB_TYPE_PTRN(N) TB_DataType{ { TB_PTR,   0, (N) } }
 
@@ -575,8 +667,9 @@ typedef struct {
 #define TB_TYPE_F64     (TB_DataType){ { TB_FLOAT, 0, TB_FLT_64 } }
 #define TB_TYPE_BOOL    (TB_DataType){ { TB_INT,   0, 1 } }
 #define TB_TYPE_PTR     (TB_DataType){ { TB_PTR,   0, 0 } }
-#define TB_TYPE_INTN(N) (TB_DataType){ { TB_INT,  0, (N) } }
-#define TB_TYPE_PTRN(N) (TB_DataType){ { TB_PTR,  0, (N) } }
+#define TB_TYPE_MEMORY  (TB_DataType){ { TB_MEMORY,0, 0 } }
+#define TB_TYPE_INTN(N) (TB_DataType){ { TB_INT,   0, (N) } }
+#define TB_TYPE_PTRN(N) (TB_DataType){ { TB_PTR,   0, (N) } }
 
 #endif
 
@@ -737,7 +830,7 @@ TB_API TB_External* tb_next_external(TB_External* e);
 
 // this is used JIT scenarios to tell the compiler what externals map to
 TB_API TB_ExternalType tb_extern_get_type(TB_External* e);
-TB_Global* tb_extern_transmute(TB_External* e, TB_DebugType* dbg_type, TB_Linkage linkage);
+TB_API TB_Global* tb_extern_transmute(TB_External* e, TB_DebugType* dbg_type, TB_Linkage linkage);
 
 TB_API TB_External* tb_extern_create(TB_Module* m, ptrdiff_t len, const char* name, TB_ExternalType type);
 
@@ -884,6 +977,11 @@ TB_API void tb_default_print_callback(void* user_data, const char* fmt, ...);
 TB_API void tb_inst_set_location(TB_Function* f, TB_SourceFile* file, int line, int column);
 TB_API void tb_inst_reset_location(TB_Function* f);
 
+// this is where the STOP will be
+TB_API void tb_inst_set_exit_location(TB_Function* f, TB_SourceFile* file, int line, int column);
+
+TB_API bool tb_has_effects(TB_Node* n);
+
 // if section is NULL, default to .text
 TB_API TB_Function* tb_function_create(TB_Module* m, ptrdiff_t len, const char* name, TB_Linkage linkage, TB_ComdatType comdat);
 
@@ -927,9 +1025,12 @@ TB_API TB_Node* tb_inst_float2int(TB_Function* f, TB_Node* src, TB_DataType dt,
 TB_API TB_Node* tb_inst_bitcast(TB_Function* f, TB_Node* src, TB_DataType dt);
 
 TB_API TB_Node* tb_inst_local(TB_Function* f, TB_CharUnits size, TB_CharUnits align);
+
 TB_API TB_Node* tb_inst_load(TB_Function* f, TB_DataType dt, TB_Node* addr, TB_CharUnits align, bool is_volatile);
 TB_API void tb_inst_store(TB_Function* f, TB_DataType dt, TB_Node* addr, TB_Node* val, TB_CharUnits align, bool is_volatile);
 
+TB_API void tb_inst_safepoint_poll(TB_Function* f, TB_Node* addr, int input_count, TB_Node** inputs);
+
 TB_API TB_Node* tb_inst_bool(TB_Function* f, bool imm);
 TB_API TB_Node* tb_inst_sint(TB_Function* f, TB_DataType dt, int64_t imm);
 TB_API TB_Node* tb_inst_uint(TB_Function* f, TB_DataType dt, uint64_t imm);
@@ -939,14 +1040,14 @@ TB_API TB_Node* tb_inst_cstring(TB_Function* f, const char* str);
 TB_API TB_Node* tb_inst_string(TB_Function* f, size_t len, const char* str);
 
 // write 'val' over 'count' bytes on 'dst'
-TB_API void tb_inst_memset(TB_Function* f, TB_Node* dst, TB_Node* val, TB_Node* count, TB_CharUnits align, bool is_volatile);
+TB_API void tb_inst_memset(TB_Function* f, TB_Node* dst, TB_Node* val, TB_Node* count, TB_CharUnits align);
 
 // zero 'count' bytes on 'dst'
-TB_API void tb_inst_memzero(TB_Function* f, TB_Node* dst, TB_Node* count, TB_CharUnits align, bool is_volatile);
+TB_API void tb_inst_memzero(TB_Function* f, TB_Node* dst, TB_Node* count, TB_CharUnits align);
 
 // performs a copy of 'count' elements from one memory location to another
 // both locations cannot overlap.
-TB_API void tb_inst_memcpy(TB_Function* f, TB_Node* dst, TB_Node* src, TB_Node* count, TB_CharUnits align, bool is_volatile);
+TB_API void tb_inst_memcpy(TB_Function* f, TB_Node* dst, TB_Node* src, TB_Node* count, TB_CharUnits align);
 
 // result = base + (index * stride)
 TB_API TB_Node* tb_inst_array_access(TB_Function* f, TB_Node* base, TB_Node* index, int64_t stride);
@@ -1033,9 +1134,9 @@ TB_API TB_Node* tb_inst_cmp_fge(TB_Function* f, TB_Node* a, TB_Node* b);
 
 // General intrinsics
 TB_API TB_Node* tb_inst_va_start(TB_Function* f, TB_Node* a);
+TB_API TB_Node* tb_inst_cycle_counter(TB_Function* f);
 
 // x86 Intrinsics
-TB_API TB_Node* tb_inst_x86_rdtsc(TB_Function* f);
 TB_API TB_Node* tb_inst_x86_ldmxcsr(TB_Function* f, TB_Node* a);
 TB_API TB_Node* tb_inst_x86_stmxcsr(TB_Function* f);
 TB_API TB_Node* tb_inst_x86_sqrt(TB_Function* f, TB_Node* a);
@@ -1061,6 +1162,19 @@ TB_API void tb_inst_ret(TB_Function* f, size_t count, TB_Node** values);
 ////////////////////////////////
 // Passes
 ////////////////////////////////
+typedef enum {
+    // allowed to remove PHIs nodes, this is
+    // helpful because the default IR building
+    // will produce tons of useless memory PHIs.
+    TB_PEEPHOLE_PHI = 1,
+
+    // it's allowed to fold memory operations (store or load elimination)
+    TB_PEEPHOLE_MEMORY = 2,
+
+    // just do every reduction rule i can provide you
+    TB_PEEPHOLE_ALL = 7,
+} TB_PeepholeFlags;
+
 // Function analysis, optimizations, and codegen are all part of this
 typedef struct TB_Passes TB_Passes;
 
@@ -1069,35 +1183,32 @@ TB_API TB_Passes* tb_pass_enter(TB_Function* f, TB_Arena* arena);
 TB_API void tb_pass_exit(TB_Passes* opt);
 
 // transformation passes:
-//   peephole: runs most simple reductions on the code,
-//     should be run after any bigger passes (it's incremental
-//     so it's not that bad)
+//   peephole: 99% of the optimizer, i'm sea of nodes pilled so i
+//     break down most optimizations into local rewrites, it's
+//     incremental and recommended to run after any non-peephole
+//     pass.
 //
-//   mem2reg: lowers TB_LOCALs into SSA values, this makes more
+//   mem2reg: lowers TB_LOCALs into SoN values, this makes more
 //     data flow analysis possible on the code and allows to codegen
 //     to place variables into registers.
 //
-//   cfg: performs simplifications on the CFG like `a && b => select(a, b, 0)`
-//     or removing redundant branches.
-//
-//   loop: NOT READY
-//
-TB_API bool tb_pass_peephole(TB_Passes* opt);
+//   SROA: splits LOCALs into multiple to allow for more dataflow
+//     analysis later on.
+TB_API void tb_pass_peephole(TB_Passes* opt, TB_PeepholeFlags flags);
+TB_API void tb_pass_sroa(TB_Passes* opt);
 TB_API bool tb_pass_mem2reg(TB_Passes* opt);
-TB_API bool tb_pass_loop(TB_Passes* opt);
-TB_API bool tb_pass_cfg(TB_Passes* opt);
+
+TB_API void tb_pass_schedule(TB_Passes* opt);
 
 // analysis
 //   print: prints IR in a flattened text form.
 TB_API bool tb_pass_print(TB_Passes* opt);
 
-TB_API void tb_pass_schedule(TB_Passes* opt);
-
 // codegen
 TB_API TB_FunctionOutput* tb_pass_codegen(TB_Passes* opt, bool emit_asm);
 
 TB_API void tb_pass_kill_node(TB_Passes* opt, TB_Node* n);
-TB_API bool tb_pass_mark(TB_Passes* opt, TB_Node* n);
+TB_API void tb_pass_mark(TB_Passes* opt, TB_Node* n);
 TB_API void tb_pass_mark_users(TB_Passes* opt, TB_Node* n);
 
 ////////////////////////////////


+ 3 - 3
src/tilde/tb_arena.h

@@ -20,9 +20,9 @@
 #endif
 
 enum {
-    TB_ARENA_SMALL_CHUNK_SIZE  =        4 * 1024,
-    TB_ARENA_MEDIUM_CHUNK_SIZE =      512 * 1024,
-    TB_ARENA_LARGE_CHUNK_SIZE  = 2 * 1024 * 1024,
+    TB_ARENA_SMALL_CHUNK_SIZE  =         4 * 1024,
+    TB_ARENA_MEDIUM_CHUNK_SIZE =       512 * 1024,
+    TB_ARENA_LARGE_CHUNK_SIZE  = 16 * 1024 * 1024,
 
     TB_ARENA_ALIGNMENT = 16,
 };

+ 3 - 3
src/tilde_builtin.cpp

@@ -230,7 +230,7 @@ gb_internal cgValue cg_builtin_clamp(cgProcedure *p, Type *t, cgValue const &x,
 gb_internal cgValue cg_builtin_mem_zero(cgProcedure *p, cgValue const &ptr, cgValue const &len) {
 	GB_ASSERT(ptr.kind == cgValue_Value);
 	GB_ASSERT(len.kind == cgValue_Value);
-	tb_inst_memzero(p->func, ptr.node, len.node, 1, false);
+	tb_inst_memzero(p->func, ptr.node, len.node, 1);
 	return ptr;
 }
 
@@ -239,7 +239,7 @@ gb_internal cgValue cg_builtin_mem_copy(cgProcedure *p, cgValue const &dst, cgVa
 	GB_ASSERT(src.kind == cgValue_Value);
 	GB_ASSERT(len.kind == cgValue_Value);
 	// TODO(bill): This needs to be memmove
-	tb_inst_memcpy(p->func, dst.node, src.node, len.node, 1, false);
+	tb_inst_memcpy(p->func, dst.node, src.node, len.node, 1);
 	return dst;
 }
 
@@ -247,7 +247,7 @@ gb_internal cgValue cg_builtin_mem_copy_non_overlapping(cgProcedure *p, cgValue
 	GB_ASSERT(dst.kind == cgValue_Value);
 	GB_ASSERT(src.kind == cgValue_Value);
 	GB_ASSERT(len.kind == cgValue_Value);
-	tb_inst_memcpy(p->func, dst.node, src.node, len.node, 1, false);
+	tb_inst_memcpy(p->func, dst.node, src.node, len.node, 1);
 	return dst;
 }
 

+ 3 - 0
src/tilde_expr.cpp

@@ -1193,6 +1193,9 @@ gb_internal cgValue cg_emit_conv(cgProcedure *p, cgValue value, Type *t) {
 		GB_ASSERT(is_type_typed(st));
 
 		data = cg_emit_conv(p, data, t_rawptr);
+		if (p->name == "main@main") {
+			GB_PANIC("HERE %s %llu", type_to_string(st), cg_typeid_as_u64(p->module, value.type));
+		}
 
 		cgValue id = cg_typeid(p, st);
 		cgValue data_ptr = cg_emit_struct_ep(p, result.addr, 0);

+ 4 - 4
src/tilde_proc.cpp

@@ -373,9 +373,9 @@ gb_internal WORKER_TASK_PROC(cg_procedure_compile_worker_proc) {
 
 	// optimization passes
 	if (false) {
-		tb_pass_peephole(opt);
+		tb_pass_peephole(opt, TB_PEEPHOLE_ALL);
 		tb_pass_mem2reg(opt);
-		tb_pass_peephole(opt);
+		tb_pass_peephole(opt, TB_PEEPHOLE_ALL);
 	}
 
 	bool emit_asm = false;
@@ -572,7 +572,7 @@ gb_internal cgValue cg_emit_call(cgProcedure * p, cgValue value, Slice<cgValue>
 			TB_CharUnits size = cast(TB_CharUnits)type_size_of(return_type);
 			TB_CharUnits align = cast(TB_CharUnits)gb_max(type_align_of(return_type), 16);
 			TB_Node *local = tb_inst_local(p->func, size, align);
-			tb_inst_memzero(p->func, local, tb_inst_uint(p->func, TB_TYPE_INT, size), align, false);
+			tb_inst_memzero(p->func, local, tb_inst_uint(p->func, TB_TYPE_INT, size), align);
 			params[param_index++] = local;
 		}
 	}
@@ -626,7 +626,7 @@ gb_internal cgValue cg_emit_call(cgProcedure * p, cgValue value, Slice<cgValue>
 			TB_CharUnits align = cast(TB_CharUnits)gb_max(type_align_of(result), 16);
 			TB_Node *local = tb_inst_local(p->func, size, align);
 			// TODO(bill): Should this need to be zeroed any way?
-			tb_inst_memzero(p->func, local, tb_inst_uint(p->func, TB_TYPE_INT, size), align, false);
+			tb_inst_memzero(p->func, local, tb_inst_uint(p->func, TB_TYPE_INT, size), align);
 			params[param_index++] = local;
 		}
 	}

+ 12 - 8
src/tilde_stmt.cpp

@@ -42,6 +42,8 @@ gb_internal cgValue cg_emit_load(cgProcedure *p, cgValue const &ptr, bool is_vol
 			return cg_lvalue_addr(tb_inst_get_symbol_address(p->func, ptr.symbol), type);
 		}
 	}
+	GB_ASSERT(dt.type != TB_MEMORY);
+	GB_ASSERT(dt.type != TB_TUPLE);
 
 	// use the natural alignment
 	// if people need a special alignment, they can use `intrinsics.unaligned_load`
@@ -118,7 +120,7 @@ gb_internal void cg_emit_store(cgProcedure *p, cgValue dst, cgValue src, bool is
 		// IMPORTANT TODO(bill): needs to be memmove
 		i64 sz = type_size_of(dst_type);
 		TB_Node *count = tb_inst_uint(p->func, TB_TYPE_INT, cast(u64)sz);
-		tb_inst_memcpy(p->func, dst_ptr, src_ptr, count, alignment, is_volatile);
+		tb_inst_memcpy(p->func, dst_ptr, src_ptr, count, alignment/*, is_volatile*/);
 		return;
 	}
 
@@ -159,7 +161,7 @@ gb_internal cgValue cg_address_from_load(cgProcedure *p, cgValue value) {
 		{
 			TB_Node *load_inst = value.node;
 			GB_ASSERT_MSG(load_inst->type == TB_LOAD, "expected a load instruction");
-			TB_Node *ptr = load_inst->inputs[1];
+			TB_Node *ptr = load_inst->inputs[2];
 			return cg_value(ptr, alloc_type_pointer(value.type));
 		}
 	case cgValue_Addr:
@@ -813,9 +815,10 @@ gb_internal cgAddr cg_add_local(cgProcedure *p, Type *type, Entity *e, bool zero
 
 	if (zero_init) {
 		bool is_volatile = false;
+		gb_unused(is_volatile);
 		TB_Node *zero = tb_inst_uint(p->func, TB_TYPE_I8, 0);
 		TB_Node *count = tb_inst_uint(p->func, TB_TYPE_I32, cast(u64)size);
-		tb_inst_memset(p->func, local, zero, count, alignment, is_volatile);
+		tb_inst_memset(p->func, local, zero, count, alignment/*, is_volatile*/);
 	}
 
 	cgAddr addr = cg_addr(cg_value(local, alloc_type_pointer(type)));
@@ -861,7 +864,7 @@ gb_internal cgValue cg_copy_value_to_ptr(cgProcedure *p, cgValue value, Type *or
 		tb_inst_store(p->func, cg_data_type(original_type), copy, value.node, align, false);
 	} else {
 		GB_ASSERT(value.kind == cgValue_Addr);
-		tb_inst_memcpy(p->func, copy, value.node, tb_inst_uint(p->func, TB_TYPE_INT, size), align, false);
+		tb_inst_memcpy(p->func, copy, value.node, tb_inst_uint(p->func, TB_TYPE_INT, size), align);
 	}
 
 	return cg_value(copy, alloc_type_pointer(original_type));
@@ -871,7 +874,7 @@ gb_internal cgValue cg_address_from_load_or_generate_local(cgProcedure *p, cgVal
 	switch (value.kind) {
 	case cgValue_Value:
 		if (value.node->type == TB_LOAD) {
-			TB_Node *ptr = value.node->inputs[1];
+			TB_Node *ptr = value.node->inputs[2];
 			return cg_value(ptr, alloc_type_pointer(value.type));
 		}
 		break;
@@ -1042,7 +1045,7 @@ gb_internal void cg_build_assignment(cgProcedure *p, Array<cgAddr> const &lvals,
 		    	TB_CharUnits size  = cast(TB_CharUnits)type_size_of(type);
 		    	TB_CharUnits align = cast(TB_CharUnits)type_align_of(type);
 		    	TB_Node *copy = tb_inst_local(p->func, size, align);
-		    	tb_inst_memcpy(p->func, copy, init.node, tb_inst_uint(p->func, TB_TYPE_INT, size), align, false);
+		    	tb_inst_memcpy(p->func, copy, init.node, tb_inst_uint(p->func, TB_TYPE_INT, size), align);
 		    	// use the copy instead
 		    	init.node = copy;
 		}
@@ -2399,8 +2402,7 @@ gb_internal void cg_build_type_switch_stmt(cgProcedure *p, Ast *node) {
 				               backing_ptr, // dst
 				               data.node,   // src
 				               tb_inst_uint(p->func, TB_TYPE_INT, size),
-				               cast(TB_CharUnits)align,
-				               false
+				               cast(TB_CharUnits)align
 				);
 
 				ptr = cg_value(backing_ptr, ct_ptr);
@@ -2522,6 +2524,8 @@ gb_internal void cg_build_mutable_value_decl(cgProcedure *p, Ast *node) {
 
 	TEMPORARY_ALLOCATOR_GUARD();
 
+
+
 	auto inits = array_make<cgValue>(temporary_allocator(), 0, vd->values.count != 0 ? vd->names.count : 0);
 	for (Ast *rhs : vd->values) {
 		cgValue init = cg_build_expr(p, rhs);

+ 6 - 1
src/tilde_type_info.cpp

@@ -118,6 +118,11 @@ gb_internal u64 cg_typeid_as_u64(cgModule *m, Type *type) {
 		data |= (special  &~ (1ull<<1))  << 62ull; // special
 		data |= (reserved &~ (1ull<<1))  << 63ull; // reserved
 	}
+
+	if (type == t_string) {
+		gb_printf_err("%llu\n", data);
+	}
+
 	return data;
 }
 
@@ -449,7 +454,7 @@ gb_internal void cg_setup_type_info_data(cgModule *m) {
 		u32 flags = type_info_flags_of_type(t);
 		u64 id    = cg_typeid_as_u64(m, t);
 
-		void *size_ptr  = tb_global_add_region(m->mod,  global, offset+size_offset, build_context.int_size);
+		void *size_ptr  = tb_global_add_region(m->mod, global, offset+size_offset,  build_context.int_size);
 		void *align_ptr = tb_global_add_region(m->mod, global, offset+align_offset, build_context.int_size);
 		void *flags_ptr = tb_global_add_region(m->mod, global, offset+flags_offset, 4);
 		void *id_ptr    = tb_global_add_region(m->mod, global, offset+id_offset,    build_context.ptr_size);