瀏覽代碼

Merge remote-tracking branch 'upstream/master'

Tohei Ichikawa 2 周之前
父節點
當前提交
6ed9351955

+ 42 - 22
base/runtime/core_builtin.odin

@@ -54,7 +54,12 @@ container_of :: #force_inline proc "contextless" (ptr: $P/^$Field_Type, $T: type
 
 
 when !NO_DEFAULT_TEMP_ALLOCATOR {
-	@thread_local global_default_temp_allocator_data: Default_Temp_Allocator
+	when ODIN_ARCH == .i386 && ODIN_OS == .Windows {
+		// Thread-local storage is problematic on Windows i386, so this target falls back to a plain global
+		global_default_temp_allocator_data: Default_Temp_Allocator // NOTE(review): no @thread_local here, so presumably every thread shares this one instance - confirm callers tolerate that
+	} else {
+		@thread_local global_default_temp_allocator_data: Default_Temp_Allocator // every other target keeps the thread-local declaration
+	}
 }
 
 @(builtin, disabled=NO_DEFAULT_TEMP_ALLOCATOR)
@@ -166,11 +171,17 @@ remove_range :: proc(array: ^$D/[dynamic]$T, #any_int lo, hi: int, loc := #calle
 @builtin
 pop :: proc(array: ^$T/[dynamic]$E, loc := #caller_location) -> (res: E) #no_bounds_check {
 	assert(len(array) > 0, loc=loc) // popping an empty array fails this assertion, reported with the caller's location
-	res = array[len(array)-1]
-	(^Raw_Dynamic_Array)(array).len -= 1
+	_pop_type_erased(&res, (^Raw_Dynamic_Array)(array), size_of(E)) // the non-generic helper copies out the last element and decrements len
 	return res
 }
 
+_pop_type_erased :: proc(res: rawptr, array: ^Raw_Dynamic_Array, elem_size: int, loc := #caller_location) { // Type-erased body of `pop`; does no emptiness check of its own, and `loc` is not used in the body
+	end := rawptr(uintptr(array.data) + uintptr(elem_size*(array.len-1))) // address of the last element: data + elem_size*(len-1)
+	intrinsics.mem_copy_non_overlapping(res, end, elem_size) // copy that element's bytes into the caller's result slot
+	array.len -= 1 // only the length changes; nothing else in the array header is written here
+}
+
+
 
 // `pop_safe` tries to remove and return the end value of dynamic array `array` and reduces the length of `array` by 1.
 // If the operation is not possible, it will return false.
@@ -334,20 +345,19 @@ delete :: proc{
 // The new built-in procedure allocates memory. The first argument is a type, not a value, and the value
 // return is a pointer to a newly allocated value of that type using the specified allocator, default is context.allocator
 @(builtin, require_results)
-new :: proc($T: typeid, allocator := context.allocator, loc := #caller_location) -> (^T, Allocator_Error) #optional_allocator_error {
-	return new_aligned(T, align_of(T), allocator, loc)
+new :: proc($T: typeid, allocator := context.allocator, loc := #caller_location) -> (t: ^T, err: Allocator_Error) #optional_allocator_error {
+	t = (^T)(raw_data(mem_alloc_bytes(size_of(T), align_of(T), allocator, loc) or_return)) // allocates size_of(T) bytes at align_of(T) directly instead of calling new_aligned; or_return hands an allocation error back to the caller
+	return
 }
 @(require_results)
 new_aligned :: proc($T: typeid, alignment: int, allocator := context.allocator, loc := #caller_location) -> (t: ^T, err: Allocator_Error) { // Like `new`, but the caller chooses the alignment
-	data := mem_alloc_bytes(size_of(T), alignment, allocator, loc) or_return
-	t = (^T)(raw_data(data))
+	t = (^T)(raw_data(mem_alloc_bytes(size_of(T), alignment, allocator, loc) or_return)) // same allocation as before, folded into one expression; or_return still returns early on an allocator error
 	return
 }
 
 @(builtin, require_results)
 new_clone :: proc(data: $T, allocator := context.allocator, loc := #caller_location) -> (t: ^T, err: Allocator_Error) #optional_allocator_error {
-	t_data := mem_alloc_bytes(size_of(T), align_of(T), allocator, loc) or_return
-	t = (^T)(raw_data(t_data))
+	t = (^T)(raw_data(mem_alloc_bytes(size_of(T), align_of(T), allocator, loc) or_return))
 	if t != nil {
 		t^ = data
 	}
@@ -357,14 +367,21 @@ new_clone :: proc(data: $T, allocator := context.allocator, loc := #caller_locat
 DEFAULT_DYNAMIC_ARRAY_CAPACITY :: 8
 
 @(require_results)
-make_aligned :: proc($T: typeid/[]$E, #any_int len: int, alignment: int, allocator := context.allocator, loc := #caller_location) -> (T, Allocator_Error) #optional_allocator_error {
+make_aligned :: proc($T: typeid/[]$E, #any_int len: int, alignment: int, allocator := context.allocator, loc := #caller_location) -> (res: T, err: Allocator_Error) #optional_allocator_error { // Generic front end: only size_of(E) is type-specific, the work happens in the type-erased helper
+	err = _make_aligned_type_erased(&res, size_of(E), len, alignment, allocator, loc) // the helper fills in `res` through the pointer
+	return
+}
+
+@(require_results)
+_make_aligned_type_erased :: proc(slice: rawptr, elem_size: int, len: int, alignment: int, allocator: Allocator, loc := #caller_location) -> Allocator_Error { // Allocates elem_size*len bytes and writes the result into `slice`, which is accessed as a ^Raw_Slice
 	make_slice_error_loc(loc, len)
-	data, err := mem_alloc_bytes(size_of(E)*len, alignment, allocator, loc)
-	if data == nil && size_of(E) != 0 {
-		return nil, err
+	data, err := mem_alloc_bytes(elem_size*len, alignment, allocator, loc)
+	if data == nil && elem_size != 0 { // nil allocation for a non-zero element size: leave `slice` unwritten and report the allocator's error
+		return err
 	}
-	s := Raw_Slice{raw_data(data), len}
-	return transmute(T)s, err
+	(^Raw_Slice)(slice).data = raw_data(data)
+	(^Raw_Slice)(slice).len  = len
+	return err
 }
 
 // `make_slice` allocates and initializes a slice. Like `new`, the first argument is a type, not a value.
@@ -372,24 +389,27 @@ make_aligned :: proc($T: typeid/[]$E, #any_int len: int, alignment: int, allocat
 //
 // Note: Prefer using the procedure group `make`.
 @(builtin, require_results)
-make_slice :: proc($T: typeid/[]$E, #any_int len: int, allocator := context.allocator, loc := #caller_location) -> (T, Allocator_Error) #optional_allocator_error {
-	return make_aligned(T, len, align_of(E), allocator, loc)
+make_slice :: proc($T: typeid/[]$E, #any_int len: int, allocator := context.allocator, loc := #caller_location) -> (res: T, err: Allocator_Error) #optional_allocator_error {
+	err = _make_aligned_type_erased(&res, size_of(E), len, align_of(E), allocator, loc) // calls the type-erased helper directly with align_of(E) instead of going through make_aligned
+	return
 }
 // `make_dynamic_array` allocates and initializes a dynamic array. Like `new`, the first argument is a type, not a value.
 // Unlike `new`, `make`'s return value is the same as the type of its argument, not a pointer to it.
 //
 // Note: Prefer using the procedure group `make`.
 @(builtin, require_results)
-make_dynamic_array :: proc($T: typeid/[dynamic]$E, allocator := context.allocator, loc := #caller_location) -> (T, Allocator_Error) #optional_allocator_error {
-	return make_dynamic_array_len_cap(T, 0, 0, allocator, loc)
+make_dynamic_array :: proc($T: typeid/[dynamic]$E, allocator := context.allocator, loc := #caller_location) -> (array: T, err: Allocator_Error) #optional_allocator_error {
+	err = _make_dynamic_array_len_cap((^Raw_Dynamic_Array)(&array), size_of(E), align_of(E), 0, 0, allocator, loc) // passes 0 for both trailing size arguments (presumably len and cap, going by the helper's name); the helper writes `array` through the pointer
+	return
 }
 // `make_dynamic_array_len` allocates and initializes a dynamic array. Like `new`, the first argument is a type, not a value.
 // Unlike `new`, `make`'s return value is the same as the type of its argument, not a pointer to it.
 //
 // Note: Prefer using the procedure group `make`.
 @(builtin, require_results)
-make_dynamic_array_len :: proc($T: typeid/[dynamic]$E, #any_int len: int, allocator := context.allocator, loc := #caller_location) -> (T, Allocator_Error) #optional_allocator_error {
-	return make_dynamic_array_len_cap(T, len, len, allocator, loc)
+make_dynamic_array_len :: proc($T: typeid/[dynamic]$E, #any_int len: int, allocator := context.allocator, loc := #caller_location) -> (array: T, err: Allocator_Error) #optional_allocator_error {
+	err = _make_dynamic_array_len_cap((^Raw_Dynamic_Array)(&array), size_of(E), align_of(E), len, len, allocator, loc) // `len` is passed for both size arguments, matching the removed call's (len, len)
+	return
 }
 // `make_dynamic_array_len_cap` allocates and initializes a dynamic array. Like `new`, the first argument is a type, not a value.
 // Unlike `new`, `make`'s return value is the same as the type of its argument, not a pointer to it.
@@ -494,7 +514,7 @@ clear_map :: proc "contextless" (m: ^$T/map[$K]$V) {
 // Note: Prefer the procedure group `reserve`
 @builtin
 reserve_map :: proc(m: ^$T/map[$K]$V, #any_int capacity: int, loc := #caller_location) -> Allocator_Error {
-	return __dynamic_map_reserve((^Raw_Map)(m), map_info(T), uint(capacity), loc) if m != nil else nil
+	return __dynamic_map_reserve((^Raw_Map)(m), map_info(T), uint(capacity), loc) // no nil test at this level any more; __dynamic_map_reserve returns nil for a nil map itself
 }
 
 // Shrinks the capacity of a map down to the current length.

+ 3 - 0
base/runtime/dynamic_map_internal.odin

@@ -985,6 +985,9 @@ __dynamic_map_entry :: proc "odin" (#no_alias m: ^Raw_Map, #no_alias info: ^Map_
 // IMPORTANT: USED WITHIN THE COMPILER
 @(private)
 __dynamic_map_reserve :: proc "odin" (#no_alias m: ^Raw_Map, #no_alias info: ^Map_Info, new_capacity: uint, loc := #caller_location) -> Allocator_Error {
+	if m == nil { // a nil map pointer is treated as a no-op that reports no error
+		return nil
+	}
 	return map_reserve_dynamic(m, info, uintptr(new_capacity), loc)
 }
 

+ 13 - 1
base/runtime/entry_windows.odin

@@ -28,7 +28,19 @@ when ODIN_BUILD_MODE == .Dynamic {
 		return true
 	}
 } else when !ODIN_TEST && !ODIN_NO_ENTRY_POINT {
-	when ODIN_ARCH == .i386 || ODIN_NO_CRT {
+	when ODIN_ARCH == .i386 && !ODIN_NO_CRT {
+		// Windows i386 with CRT: libcmt provides mainCRTStartup which calls _main
+		// Note: "c" calling convention adds underscore prefix automatically on i386
+		@(link_name="main", linkage="strong", require)
+		main :: proc "c" (argc: i32, argv: [^]cstring) -> i32 { // C-convention entry point for Windows i386 builds that link the CRT
+			args__ = argv[:argc] // keep the argument vector as a slice of argc entries
+			context = default_context()
+			#force_no_inline _startup_runtime() // runtime start-up runs before the program's entry point
+			intrinsics.__entry_point()
+			#force_no_inline _cleanup_runtime() // runtime clean-up runs after the entry point returns
+			return 0 // the exit status is always 0 on this path
+		}
+	} else when ODIN_NO_CRT {
 		@(link_name="mainCRTStartup", linkage="strong", require)
 		mainCRTStartup :: proc "system" () -> i32 {
 			context = default_context()

+ 3 - 3
core/container/small_array/small_array.odin

@@ -169,7 +169,7 @@ Output:
 	x
 
 */
-get_safe :: proc(a: $A/Small_Array($N, $T), index: int) -> (T, bool) #no_bounds_check {
+get_safe :: proc "contextless" (a: $A/Small_Array($N, $T), index: int) -> (T, bool) #no_bounds_check {
 	if index < 0 || index >= a.len {
 		return {}, false
 	}
@@ -183,11 +183,11 @@ Get a pointer to the item at the specified position.
 - `a`: A pointer to the small-array
 - `index`: The position of the item to get
 
-**Returns** 
+**Returns**
 - the pointer to the element at the specified position
 - true if element exists, false otherwise
 */
-get_ptr_safe :: proc(a: ^$A/Small_Array($N, $T), index: int) -> (^T, bool) #no_bounds_check {
+get_ptr_safe :: proc "contextless" (a: ^$A/Small_Array($N, $T), index: int) -> (^T, bool) #no_bounds_check {
 	if index < 0 || index >= a.len {
 		return {}, false
 	}

+ 12 - 4
core/testing/signal_handler_libc.odin

@@ -24,10 +24,18 @@ import "core:terminal/ansi"
 @(private="file") stop_test_passed: libc.sig_atomic_t
 @(private="file") stop_test_alert:  libc.sig_atomic_t
 
-@(private="file", thread_local)
-local_test_index: libc.sig_atomic_t
-@(private="file", thread_local)
-local_test_index_set: bool
+when ODIN_ARCH == .i386 && ODIN_OS == .Windows {
+	// Thread-local storage is problematic on Windows i386, so both variables are plain file-private globals on this target
+	@(private="file")
+	local_test_index: libc.sig_atomic_t // NOTE(review): without thread_local this is presumably shared by every test thread - confirm the signal handler still identifies the right test
+	@(private="file")
+	local_test_index_set: bool
+} else {
+	@(private="file", thread_local)
+	local_test_index: libc.sig_atomic_t // all other targets keep the thread-local declarations
+	@(private="file", thread_local)
+	local_test_index_set: bool
+}
 
 // Windows does not appear to have a SIGTRAP, so this is defined here, instead
 // of in the libc package, just so there's no confusion about it being

+ 4 - 0
src/build_settings.cpp

@@ -418,6 +418,7 @@ enum LinkerChoice : i32 {
 	Linker_Default = 0,
 	Linker_lld,
 	Linker_radlink,
+	Linker_mold,
 
 	Linker_COUNT,
 };
@@ -433,6 +434,7 @@ String linker_choices[Linker_COUNT] = {
 	str_lit("default"),
 	str_lit("lld"),
 	str_lit("radlink"),
+	str_lit("mold"),
 };
 
 enum IntegerDivisionByZeroKind : u8 {
@@ -554,6 +556,8 @@ struct BuildContext {
 
 	bool internal_no_inline;
 	bool internal_by_value;
+	bool internal_weak_monomorphization;
+	bool internal_ignore_llvm_verification;
 
 	bool   no_threaded_checker;
 

+ 5 - 5
src/check_decl.cpp

@@ -1549,7 +1549,7 @@ gb_internal void check_proc_decl(CheckerContext *ctx, Entity *e, DeclInfo *d) {
 				      "\tother at %s",
 				      LIT(name), token_pos_to_string(pos));
 			} else if (name == "main") {
-				if (d->entity->pkg->kind != Package_Runtime) {
+				if (d->entity.load()->pkg->kind != Package_Runtime) {
 					error(d->proc_lit, "The link name 'main' is reserved for internal use");
 				}
 			} else {
@@ -1565,7 +1565,7 @@ gb_internal void check_proc_decl(CheckerContext *ctx, Entity *e, DeclInfo *d) {
 	}
 }
 
-gb_internal void check_global_variable_decl(CheckerContext *ctx, Entity *&e, Ast *type_expr, Ast *init_expr) {
+gb_internal void check_global_variable_decl(CheckerContext *ctx, Entity *e, Ast *type_expr, Ast *init_expr) {
 	GB_ASSERT(e->type == nullptr);
 	GB_ASSERT(e->kind == Entity_Variable);
 
@@ -1967,8 +1967,8 @@ gb_internal bool check_proc_body(CheckerContext *ctx_, Token token, DeclInfo *de
 	ctx->curr_proc_sig  = type;
 	ctx->curr_proc_calling_convention = type->Proc.calling_convention;
 
-	if (decl->parent && decl->entity && decl->parent->entity) {
-		decl->entity->parent_proc_decl = decl->parent;
+	if (decl->parent && decl->entity.load() && decl->parent->entity) {
+		decl->entity.load()->parent_proc_decl = decl->parent;
 	}
 
 	if (ctx->pkg->name != "runtime") {
@@ -2072,7 +2072,7 @@ gb_internal bool check_proc_body(CheckerContext *ctx_, Token token, DeclInfo *de
 		GB_ASSERT(decl->proc_checked_state != ProcCheckedState_Checked);
 		if (decl->defer_use_checked) {
 			GB_ASSERT(is_type_polymorphic(type, true));
-			error(token, "Defer Use Checked: %.*s", LIT(decl->entity->token.string));
+			error(token, "Defer Use Checked: %.*s", LIT(decl->entity.load()->token.string));
 			GB_ASSERT(decl->defer_use_checked == false);
 		}
 

+ 31 - 5
src/check_expr.cpp

@@ -608,7 +608,7 @@ gb_internal bool find_or_generate_polymorphic_procedure(CheckerContext *old_c, E
 		entity->flags |= EntityFlag_Disabled;
 	}
 
-	d->entity = entity;
+	d->entity.store(entity);
 
 	AstFile *file = nullptr;
 	{
@@ -3500,6 +3500,24 @@ gb_internal bool check_is_castable_to(CheckerContext *c, Operand *operand, Type
 	return false;
 }
 
+gb_internal bool is_type_union_constantable(Type *type) { // Decides, from the variant types alone, whether a union type is accepted as "constantable"; asserts the base type is a union
+	Type *bt = base_type(type);
+	GB_ASSERT(bt->kind == Type_Union);
+
+	if (bt->Union.variants.count == 0) {
+		return true; // a union with no variants is accepted
+	} else if (bt->Union.variants.count == 1) {
+		return is_type_constant_type(bt->Union.variants[0]); // single variant: the answer is that variant's own constant-ness
+	}
+
+	for (Type *v : bt->Union.variants) {
+		if (!is_type_constant_type(v)) {
+			return false;
+		}
+	}
+	return false; // NOTE(review): both exits of the loop yield false, so a union with two or more variants is never constantable - confirm whether `true` was intended when every variant is constant
+}
+
 gb_internal bool check_cast_internal(CheckerContext *c, Operand *x, Type *type) {
 	bool is_const_expr = x->mode == Addressing_Constant;
 
@@ -3524,6 +3542,9 @@ gb_internal bool check_cast_internal(CheckerContext *c, Operand *x, Type *type)
 		} else if (is_type_slice(type) && is_type_string(x->type)) {
 			x->mode = Addressing_Value;
 		} else if (is_type_union(type)) {
+			if (is_type_union_constantable(type)) {
+				return true;
+			}
 			x->mode = Addressing_Value;
 		}
 		if (x->mode == Addressing_Value) {
@@ -3582,7 +3603,11 @@ gb_internal void check_cast(CheckerContext *c, Operand *x, Type *type, bool forb
 		Type *final_type = type;
 		if (is_const_expr && !is_type_constant_type(type)) {
 			if (is_type_union(type)) {
-				convert_to_typed(c, x, type);
+				if (is_type_union_constantable(type)) {
+
+				} else {
+					convert_to_typed(c, x, type);
+				}
 			}
 			final_type = default_type(x->type);
 		}
@@ -8339,9 +8364,10 @@ gb_internal ExprKind check_call_expr(CheckerContext *c, Operand *operand, Ast *c
 				if (c->curr_proc_decl == nullptr) {
 					error(call, "Calling a '#force_inline' procedure that enables target features is not allowed at file scope");
 				} else {
-					GB_ASSERT(c->curr_proc_decl->entity);
-					GB_ASSERT(c->curr_proc_decl->entity->type->kind == Type_Proc);
-					String scope_features = c->curr_proc_decl->entity->type->Proc.enable_target_feature;
+					Entity *e = c->curr_proc_decl->entity.load();
+					GB_ASSERT(e);
+					GB_ASSERT(e->type->kind == Type_Proc);
+					String scope_features = e->type->Proc.enable_target_feature;
 					if (!check_target_feature_is_superset_of(scope_features, pt->Proc.enable_target_feature, &invalid)) {
 						ERROR_BLOCK();
 						error(call, "Inlined procedure enables target feature '%.*s', this requires the calling procedure to at least enable the same feature", LIT(invalid));

+ 1 - 1
src/checker.cpp

@@ -2055,8 +2055,8 @@ gb_internal void add_entity_and_decl_info(CheckerContext *c, Ast *identifier, En
 	add_entity_definition(info, identifier, e);
 	GB_ASSERT(e->decl_info == nullptr);
 	e->decl_info = d;
-	d->entity = e;
 	e->pkg = c->pkg;
+	d->entity.store(e);
 
 	isize queue_count = -1;
 	bool is_lazy = false;

+ 1 - 1
src/checker.hpp

@@ -209,7 +209,7 @@ struct DeclInfo {
 
 	Scope *       scope;
 
-	Entity *entity;
+	std::atomic<Entity *> entity;
 
 	Ast *         decl_node;
 	Ast *         type_expr;

+ 4 - 3
src/common_memory.cpp

@@ -481,15 +481,16 @@ gb_internal gbAllocator permanent_allocator() {
 }
 
 gb_internal gbAllocator temporary_allocator() {
-	return {thread_arena_allocator_proc, cast(void *)cast(uintptr)ThreadArena_Permanent};
+	// return {thread_arena_allocator_proc, cast(void *)cast(uintptr)ThreadArena_Temporary};
+	return permanent_allocator(); // "temporary" requests are served by the permanent allocator; the ThreadArena_Temporary variant above is left disabled
 }
 
 
 #define TEMP_ARENA_GUARD(arena) ArenaTempGuard GB_DEFER_3(_arena_guard_){arena}
 
 
-// #define TEMPORARY_ALLOCATOR_GUARD()
-#define TEMPORARY_ALLOCATOR_GUARD() TEMP_ARENA_GUARD(get_arena(ThreadArena_Temporary))
+// #define TEMPORARY_ALLOCATOR_GUARD() TEMP_ARENA_GUARD(get_arena(ThreadArena_Temporary))
+#define TEMPORARY_ALLOCATOR_GUARD()
 #define PERMANENT_ALLOCATOR_GUARD()
 
 

+ 27 - 8
src/linker.cpp

@@ -161,21 +161,32 @@ gb_internal i32 linker_stage(LinkerData *gen) {
 try_cross_linking:;
 
 	#if defined(GB_SYSTEM_WINDOWS)
+		String section_name = str_lit("msvc-link");
 		bool is_windows = build_context.metrics.os == TargetOs_windows;
 	#else
+		String section_name = str_lit("lld-link");
 		bool is_windows = false;
 	#endif
 
 		bool is_osx = build_context.metrics.os == TargetOs_darwin;
 
 
+		switch (build_context.linker_choice) {
+		case Linker_Default:  break;
+		case Linker_lld:      section_name = str_lit("lld-link"); break;
+	#if defined(GB_SYSTEM_LINUX)
+		case Linker_mold:     section_name = str_lit("mold-link"); break;
+	#endif
+	#if defined(GB_SYSTEM_WINDOWS)
+		case Linker_radlink:  section_name = str_lit("rad-link"); break;
+	#endif
+		default:
+			gb_printf_err("'%.*s' linker is not support for this platform\n", LIT(linker_choices[build_context.linker_choice]));
+			return 1;
+		}
+
+
 		if (is_windows) {
-			String section_name = str_lit("msvc-link");
-			switch (build_context.linker_choice) {
-			case Linker_Default:  break;
-			case Linker_lld:      section_name = str_lit("lld-link"); break;
-			case Linker_radlink:  section_name = str_lit("rad-link"); break;
-			}
 			timings_start_section(timings, section_name);
 
 			gbString lib_str = gb_string_make(heap_allocator(), "");
@@ -281,7 +292,11 @@ try_cross_linking:;
 					link_settings = gb_string_append_fmt(link_settings, " /NOENTRY");
 				}
 			} else {
-				link_settings = gb_string_append_fmt(link_settings, " /ENTRY:mainCRTStartup");
+				// For i386 with CRT, libcmt provides the entry point
+				// For other cases or no_crt, we need to specify the entry point
+				if (!(build_context.metrics.arch == TargetArch_i386 && !build_context.no_crt)) {
+					link_settings = gb_string_append_fmt(link_settings, " /ENTRY:mainCRTStartup");
+				}
 			}
 
 			if (build_context.build_paths[BuildPath_Symbols].name != "") {
@@ -419,7 +434,8 @@ try_cross_linking:;
 			}
 			}
 		} else {
-			timings_start_section(timings, str_lit("ld-link"));
+
+			timings_start_section(timings, section_name);
 
 			int const ODIN_ANDROID_API_LEVEL = build_context.ODIN_ANDROID_API_LEVEL;
 
@@ -952,6 +968,9 @@ try_cross_linking:;
 			if (build_context.linker_choice == Linker_lld) {
 				link_command_line = gb_string_append_fmt(link_command_line, " -fuse-ld=lld");
 				result = system_exec_command_line_app("lld-link", link_command_line);
+			} else if (build_context.linker_choice == Linker_mold) {
+				link_command_line = gb_string_append_fmt(link_command_line, " -fuse-ld=mold");
+				result = system_exec_command_line_app("mold-link", link_command_line);
 			} else {
 				result = system_exec_command_line_app("ld-link", link_command_line);
 			}

+ 17 - 17
src/llvm_abi.cpp

@@ -522,6 +522,23 @@ namespace lbAbiAmd64Win64 {
 	}
 };
 
+
+gb_internal bool is_llvm_type_slice_like(LLVMTypeRef type) { // True only for a two-element struct whose first element is a pointer and whose second is an integer with lb_sizeof == 8
+	if (!lb_is_type_kind(type, LLVMStructTypeKind)) {
+		return false;
+	}
+	if (LLVMCountStructElementTypes(type) != 2) {
+		return false;
+	}
+	LLVMTypeRef fields[2] = {};
+	LLVMGetStructElementTypes(type, fields); // safe to fill the fixed array: the element count was checked to be exactly 2 above
+	if (!lb_is_type_kind(fields[0], LLVMPointerTypeKind)) {
+		return false;
+	}
+	return lb_is_type_kind(fields[1], LLVMIntegerTypeKind) && lb_sizeof(fields[1]) == 8;
+
+}
+
 // NOTE(bill): I hate `namespace` in C++ but this is just because I don't want to prefix everything
 namespace lbAbiAmd64SysV {
 	enum RegClass {
@@ -652,23 +669,6 @@ namespace lbAbiAmd64SysV {
 		return false;
 	}
 
-	gb_internal bool is_llvm_type_slice_like(LLVMTypeRef type) {
-		if (!lb_is_type_kind(type, LLVMStructTypeKind)) {
-			return false;
-		}
-		if (LLVMCountStructElementTypes(type) != 2) {
-			return false;
-		}
-		LLVMTypeRef fields[2] = {};
-		LLVMGetStructElementTypes(type, fields);
-		if (!lb_is_type_kind(fields[0], LLVMPointerTypeKind)) {
-			return false;
-		}
-		return lb_is_type_kind(fields[1], LLVMIntegerTypeKind) && lb_sizeof(fields[1]) == 8;
-
-	}
-
-
 	gb_internal bool is_aggregate(LLVMTypeRef type) {
 		LLVMTypeKind kind = LLVMGetTypeKind(type);
 		switch (kind) {

+ 220 - 165
src/llvm_backend.cpp

@@ -8,7 +8,11 @@
 #endif
 
 #ifndef LLVM_IGNORE_VERIFICATION
-#define LLVM_IGNORE_VERIFICATION 0
+#define LLVM_IGNORE_VERIFICATION build_context.internal_ignore_llvm_verification
+#endif
+
+#ifndef LLVM_WEAK_MONOMORPHIZATION
+#define LLVM_WEAK_MONOMORPHIZATION (USE_SEPARATE_MODULES && build_context.internal_weak_monomorphization)
 #endif
 
 
@@ -242,26 +246,12 @@ gb_internal String lb_internal_gen_name_from_type(char const *prefix, Type *type
 	return proc_name;
 }
 
-
-gb_internal lbValue lb_equal_proc_for_type(lbModule *m, Type *type) {
-	type = base_type(type);
-	GB_ASSERT(is_type_comparable(type));
+gb_internal void lb_equal_proc_generate_body(lbModule *m, lbProcedure *p) {
+	Type *type = p->internal_gen_type;
 
 	Type *pt = alloc_type_pointer(type);
 	LLVMTypeRef ptr_type = lb_type(m, pt);
 
-	String proc_name = lb_internal_gen_name_from_type("__$equal", type);
-	lbProcedure **found = string_map_get(&m->gen_procs, proc_name);
-	lbProcedure *compare_proc = nullptr;
-	if (found) {
-		compare_proc = *found;
-		GB_ASSERT(compare_proc != nullptr);
-		return {compare_proc->value, compare_proc->type};
-	}
-
-
-	lbProcedure *p = lb_create_dummy_procedure(m, proc_name, t_equal_proc);
-	string_map_set(&m->gen_procs, proc_name, p);
 	lb_begin_procedure_body(p);
 
 	LLVMSetLinkage(p->value, LLVMInternalLinkage);
@@ -389,9 +379,29 @@ gb_internal lbValue lb_equal_proc_for_type(lbModule *m, Type *type) {
 	}
 
 	lb_end_procedure_body(p);
+}
+
+gb_internal lbValue lb_equal_proc_for_type(lbModule *m, Type *type) { // Returns the module's `__$equal` procedure for `type`, creating and registering it on first request
+	type = base_type(type);
+	GB_ASSERT(is_type_comparable(type));
+
+	String proc_name = lb_internal_gen_name_from_type("__$equal", type);
+	lbProcedure **found = string_map_get(&m->gen_procs, proc_name);
+	if (found) { // already registered in this module: reuse it
+		lbProcedure *p = *found;
+		GB_ASSERT(p != nullptr);
+		return {p->value, p->type};
+	}
+
+	lbProcedure *p = lb_create_dummy_procedure(m, proc_name, t_equal_proc);
+	string_map_set(&m->gen_procs, proc_name, p); // registered before the body exists, so later lookups find this declaration
+	p->internal_gen_type = type; // read back by lb_equal_proc_generate_body
+	p->generate_body = lb_equal_proc_generate_body;
 
-	compare_proc = p;
-	return {compare_proc->value, compare_proc->type};
+	// p->generate_body(m, p);
+	mpsc_enqueue(&m->procedures_to_generate, p); // the body is not built here; the procedure is queued on the module instead of calling generate_body inline
+
+	return {p->value, p->type};
 }
 
 gb_internal lbValue lb_simple_compare_hash(lbProcedure *p, Type *type, lbValue data, lbValue seed) {
@@ -620,6 +630,7 @@ gb_internal lbValue lb_hasher_proc_for_type(lbModule *m, Type *type) {
 
 #define LLVM_SET_VALUE_NAME(value, name) LLVMSetValueName2((value), (name), gb_count_of((name))-1);
 
+
 gb_internal lbValue lb_map_get_proc_for_type(lbModule *m, Type *type) {
 	GB_ASSERT(!build_context.dynamic_map_calls);
 	type = base_type(type);
@@ -634,6 +645,9 @@ gb_internal lbValue lb_map_get_proc_for_type(lbModule *m, Type *type) {
 
 	lbProcedure *p = lb_create_dummy_procedure(m, proc_name, t_map_get_proc);
 	string_map_set(&m->gen_procs, proc_name, p);
+
+	p->internal_gen_type = type;
+
 	lb_begin_procedure_body(p);
 	defer (lb_end_procedure_body(p));
 
@@ -1153,15 +1167,6 @@ gb_internal lbValue lb_dynamic_map_reserve(lbProcedure *p, lbValue const &map_pt
 	return lb_emit_runtime_call(p, "__dynamic_map_reserve", args);
 }
 
-
-struct lbGlobalVariable {
-	lbValue var;
-	lbValue init;
-	DeclInfo *decl;
-	bool is_initialized;
-};
-
-
 gb_internal lbProcedure *lb_create_objc_names(lbModule *main_module) {
 	if (build_context.metrics.os != TargetOs_darwin) {
 		return nullptr;
@@ -1900,12 +1905,16 @@ gb_internal void lb_verify_function(lbModule *m, lbProcedure *p, bool dump_ll=fa
 }
 
 gb_internal WORKER_TASK_PROC(lb_llvm_module_verification_worker_proc) {
+	if (LLVM_IGNORE_VERIFICATION) {
+		return 0;
+	}
+
 	char *llvm_error = nullptr;
 	defer (LLVMDisposeMessage(llvm_error));
 	lbModule *m = cast(lbModule *)data;
 
 	if (LLVMVerifyModule(m->mod, LLVMReturnStatusAction, &llvm_error)) {
-		gb_printf_err("LLVM Error:\n%s\n", llvm_error);
+		gb_printf_err("LLVM Error in module %s:\n%s\n", m->module_name, llvm_error);
 		if (build_context.keep_temp_files) {
 			TIME_SECTION("LLVM Print Module to File");
 			String filepath_ll = lb_filepath_ll_for_module(m);
@@ -1921,118 +1930,151 @@ gb_internal WORKER_TASK_PROC(lb_llvm_module_verification_worker_proc) {
 	return 0;
 }
 
+gb_internal bool lb_init_global_var(lbModule *m, lbProcedure *p, Entity *e, Ast *init_expr, lbGlobalVariable &var) { // Initializes one global: returns true when a static LLVM initializer was installed, false otherwise (including when a store was emitted into `p`)
+	if (init_expr != nullptr)  {
+		lbValue init = lb_build_expr(p, init_expr);
+		if (init.value == nullptr) {
+			LLVMTypeRef global_type = llvm_addr_type(p->module, var.var);
+			if (is_type_untyped_nil(init.type)) { // untyped nil: install a null constant as the initializer
+				LLVMSetInitializer(var.var.value, LLVMConstNull(global_type));
+				var.is_initialized = true;
 
+				if (e->Variable.is_rodata) {
+					LLVMSetGlobalConstant(var.var.value, true);
+				}
+				return true;
+			}
+			GB_PANIC("Invalid init value, got %s", expr_to_string(init_expr));
+		}
 
-gb_internal lbProcedure *lb_create_startup_runtime(lbModule *main_module, lbProcedure *objc_names, Array<lbGlobalVariable> &global_variables) { // Startup Runtime
-	Type *proc_type = alloc_type_proc(nullptr, nullptr, 0, nullptr, 0, false, ProcCC_Odin);
+		if (is_type_any(e->type) || is_type_union(e->type)) { // `any` and union globals always go through the store path below
+			var.init = init;
+		} else if (lb_is_const_or_global(init)) {
+			if (!var.is_initialized) {
+				if (is_type_proc(init.type)) {
+					init.value = LLVMConstPointerCast(init.value, lb_type(p->module, init.type));
+				}
+				LLVMSetInitializer(var.var.value, init.value);
+				var.is_initialized = true;
 
-	lbProcedure *p = lb_create_dummy_procedure(main_module, str_lit(LB_STARTUP_RUNTIME_PROC_NAME), proc_type);
-	p->is_startup = true;
-	lb_add_attribute_to_proc(p->module, p->value, "optnone");
-	lb_add_attribute_to_proc(p->module, p->value, "noinline");
+				if (e->Variable.is_rodata) {
+					LLVMSetGlobalConstant(var.var.value, true);
+				}
+				return true;
+			}
+		} else {
+			var.init = init;
+		}
+	}
 
-	// Make sure shared libraries call their own runtime startup on Linux.
-	LLVMSetVisibility(p->value, LLVMHiddenVisibility);
-	LLVMSetLinkage(p->value, LLVMWeakAnyLinkage);
+	if (var.init.value != nullptr) { // a pending init value: emit code into `p` that stores it into the global
+		GB_ASSERT(!var.is_initialized);
+		Type *t = type_deref(var.var.type);
+
+		if (is_type_any(t)) {
+			// NOTE(bill): Edge case for 'any' type
+			Type *var_type = default_type(var.init.type);
+			gbString var_name = gb_string_make(permanent_allocator(), "__$global_any::");
+			gbString e_str = string_canonical_entity_name(temporary_allocator(), e);
+			var_name = gb_string_append_length(var_name, e_str, gb_strlen(e_str));
+			lbAddr g = lb_add_global_generated_with_name(m, var_type, {}, make_string_c(var_name)); // backing global that holds the concrete value
+			lb_addr_store(p, g, var.init);
+			lbValue gp = lb_addr_get_ptr(p, g);
 
+			lbValue data = lb_emit_struct_ep(p, var.var, 0);
+			lbValue ti   = lb_emit_struct_ep(p, var.var, 1);
+			lb_emit_store(p, data, lb_emit_conv(p, gp, t_rawptr)); // field 0 receives the pointer to the backing global
+			lb_emit_store(p, ti,   lb_typeid(p->module, var_type)); // field 1 receives the typeid of the stored value
+		} else {
+			LLVMTypeRef vt = llvm_addr_type(p->module, var.var);
+			lbValue src0 = lb_emit_conv(p, var.init, t);
+			LLVMValueRef src = OdinLLVMBuildTransmute(p, src0.value, vt);
+			LLVMValueRef dst = var.var.value;
+			LLVMBuildStore(p->builder, src, dst);
+		}
+
+		var.is_initialized = true;
+	}
+	return false;
+}
+
+
+gb_internal void lb_create_startup_runtime_generate_body(lbModule *m, lbProcedure *p) { // Builds the body of the startup procedure `p`: type info setup, optional objc_names call, global initialization, then the init procedures
 	lb_begin_procedure_body(p);
 
-	lb_setup_type_info_data(main_module);
+	lb_setup_type_info_data(m);
 
-	if (objc_names) {
-		LLVMBuildCall2(p->builder, lb_type_internal_for_procedures_raw(main_module, objc_names->type), objc_names->value, nullptr, 0, "");
+	if (p->objc_names) {
+		LLVMBuildCall2(p->builder, lb_type_internal_for_procedures_raw(m, p->objc_names->type), p->objc_names->value, nullptr, 0, "");
 	}
+	Type *dummy_type = alloc_type_proc(nullptr, nullptr, 0, nullptr, 0, false, ProcCC_Odin); // signature shared by the per-global `__$startup$` procedures created below
+	LLVMTypeRef raw_dummy_type = lb_type_internal_for_procedures_raw(m, dummy_type);
 
-	for (auto &var : global_variables) {
+	for (auto &var : *p->global_variables) {
 		if (var.is_initialized) {
 			continue;
 		}
 
-		lbModule *entity_module = main_module;
+		lbModule *entity_module = m;
 
 		Entity *e = var.decl->entity;
 		GB_ASSERT(e->kind == Entity_Variable);
 		e->code_gen_module = entity_module;
-
 		Ast *init_expr = var.decl->init_expr;
-		if (init_expr != nullptr)  {
-			lbValue init = lb_build_expr(p, init_expr);
-			if (init.value == nullptr) {
-				LLVMTypeRef global_type = llvm_addr_type(p->module, var.var);
-				if (is_type_untyped_nil(init.type)) {
-					LLVMSetInitializer(var.var.value, LLVMConstNull(global_type));
-					var.is_initialized = true;
-
-					if (e->Variable.is_rodata) {
-						LLVMSetGlobalConstant(var.var.value, true);
-					}
-					continue;
-				}
-				GB_PANIC("Invalid init value, got %s", expr_to_string(init_expr));
-			}
 
-			if (is_type_any(e->type) || is_type_union(e->type)) {
-				var.init = init;
-			} else if (lb_is_const_or_global(init)) {
-				if (!var.is_initialized) {
-					if (is_type_proc(init.type)) {
-						init.value = LLVMConstPointerCast(init.value, lb_type(p->module, init.type));
-					}
-					LLVMSetInitializer(var.var.value, init.value);
-					var.is_initialized = true;
-
-					if (e->Variable.is_rodata) {
-						LLVMSetGlobalConstant(var.var.value, true);
-					}
-					continue;
-				}
-			} else {
-				var.init = init;
-			}
+		if (init_expr == nullptr && var.init.value == nullptr) { // neither an initializer expression nor a pending init value: nothing to emit
+			continue;
 		}
 
-		if (var.init.value != nullptr) {
-			GB_ASSERT(!var.is_initialized);
-			Type *t = type_deref(var.var.type);
-
-			if (is_type_any(t)) {
-				// NOTE(bill): Edge case for 'any' type
-				Type *var_type = default_type(var.init.type);
-				gbString var_name = gb_string_make(permanent_allocator(), "__$global_any::");
-				gbString e_str = string_canonical_entity_name(temporary_allocator(), e);
-				var_name = gb_string_append_length(var_name, e_str, gb_strlen(e_str));
-				lbAddr g = lb_add_global_generated_with_name(main_module, var_type, {}, make_string_c(var_name));
-				lb_addr_store(p, g, var.init);
-				lbValue gp = lb_addr_get_ptr(p, g);
-
-				lbValue data = lb_emit_struct_ep(p, var.var, 0);
-				lbValue ti   = lb_emit_struct_ep(p, var.var, 1);
-				lb_emit_store(p, data, lb_emit_conv(p, gp, t_rawptr));
-				lb_emit_store(p, ti,   lb_typeid(p->module, var_type));
-			} else {
-				LLVMTypeRef vt = llvm_addr_type(p->module, var.var);
-				lbValue src0 = lb_emit_conv(p, var.init, t);
-				LLVMValueRef src = OdinLLVMBuildTransmute(p, src0.value, vt);
-				LLVMValueRef dst = var.var.value;
-				LLVMBuildStore(p->builder, src, dst);
-			}
+		if (type_size_of(e->type) > 8) { // NOTE(review): globals larger than 8 (per type_size_of) get their own startup procedure - the reason for this threshold is not stated here; confirm
+			String ename = lb_get_entity_name(m, e);
+			gbString name = gb_string_make(permanent_allocator(), "");
+			name = gb_string_appendc(name, "__$startup$");
+			name = gb_string_append_length(name, ename.text, ename.len);
 
-			var.is_initialized = true;
-		}
+			lbProcedure *dummy = lb_create_dummy_procedure(m, make_string_c(name), dummy_type);
+			LLVMSetVisibility(dummy->value, LLVMHiddenVisibility);
+			LLVMSetLinkage(dummy->value, LLVMWeakAnyLinkage);
 
+			lb_begin_procedure_body(dummy);
+			lb_init_global_var(m, dummy, e, init_expr, var); // the bool result is ignored; any emitted stores land in `dummy`, not in `p`
+			lb_end_procedure_body(dummy);
 
+			LLVMValueRef context_ptr = lb_find_or_generate_context_ptr(p).addr.value;
+			LLVMBuildCall2(p->builder, raw_dummy_type, dummy->value, &context_ptr, 1, ""); // NOTE(review): one argument is passed although dummy_type declares no parameters - presumably the Odin calling convention's implicit context pointer; confirm
+		} else {
+			lb_init_global_var(m, p, e, init_expr, var); // small globals are initialized inline in the startup procedure; the bool result is ignored
+		}
 	}
-	CheckerInfo *info = main_module->gen->info;
-	
+	CheckerInfo *info = m->gen->info;
+
 	for (Entity *e : info->init_procedures) {
-		lbValue value = lb_find_procedure_value_from_entity(main_module, e);
+		lbValue value = lb_find_procedure_value_from_entity(m, e);
 		lb_emit_call(p, value, {}, ProcInlining_none);
 	}
 
 
 	lb_end_procedure_body(p);
+}
+
+
+gb_internal lbProcedure *lb_create_startup_runtime(lbModule *main_module, lbProcedure *objc_names, Array<lbGlobalVariable> &global_variables) { // Startup Runtime: declares the startup procedure, attaches its inputs to it, then builds its body
+	Type *proc_type = alloc_type_proc(nullptr, nullptr, 0, nullptr, 0, false, ProcCC_Odin);
+
+	lbProcedure *p = lb_create_dummy_procedure(main_module, str_lit(LB_STARTUP_RUNTIME_PROC_NAME), proc_type);
+	p->is_startup = true;
+	lb_add_attribute_to_proc(p->module, p->value, "optnone");
+	lb_add_attribute_to_proc(p->module, p->value, "noinline");
+
+	// Make sure shared libraries call their own runtime startup on Linux.
+	LLVMSetVisibility(p->value, LLVMHiddenVisibility);
+	LLVMSetLinkage(p->value, LLVMWeakAnyLinkage);
+
+	p->global_variables = &global_variables; // stores the address of the caller's array, so that array must outlive body generation
+	p->objc_names       = objc_names;
+
+	lb_create_startup_runtime_generate_body(main_module, p); // the body is built immediately here, not deferred
 
-	lb_verify_function(main_module, p);
 	return p;
 }
 
@@ -2073,7 +2115,7 @@ gb_internal WORKER_TASK_PROC(lb_generate_procedures_and_types_per_module) {
 
 	for (Entity *e : m->global_procedures_to_create) {
 		(void)lb_get_entity_name(m, e);
-		array_add(&m->procedures_to_generate, lb_create_procedure(m, e));
+		mpsc_enqueue(&m->procedures_to_generate, lb_create_procedure(m, e));
 	}
 	return 0;
 }
@@ -2145,7 +2187,7 @@ gb_internal void lb_create_global_procedures_and_types(lbGenerator *gen, Checker
 
 		lbModule *m = &gen->default_module;
 		if (USE_SEPARATE_MODULES) {
-			m = lb_module_of_entity(gen, e);
+			m = lb_module_of_entity(gen, e, m);
 		}
 		GB_ASSERT(m != nullptr);
 
@@ -2261,7 +2303,7 @@ gb_internal WORKER_TASK_PROC(lb_llvm_function_pass_per_module) {
 		lb_llvm_function_pass_per_function_internal(m, m->gen->objc_names);
 	}
 
-	for (lbProcedure *p : m->procedures_to_generate) {
+	MUTEX_GUARD_BLOCK(&m->generated_procedures_mutex) for (lbProcedure *p : m->generated_procedures) {
 		if (p->body != nullptr) { // Build Procedure
 			lbFunctionPassManagerKind pass_manager_kind = lbFunctionPassManager_default;
 			if (p->flags & lbProcedureFlag_WithoutMemcpyPass) {
@@ -2300,17 +2342,23 @@ gb_internal WORKER_TASK_PROC(lb_llvm_function_pass_per_module) {
 }
 
 
+// Walks every module registered on `gen` and, per module, runs the
+// unused-function removal pass followed by the unused-globals removal pass.
+void lb_remove_unused_functions_and_globals(lbGenerator *gen) {
+	for (auto &module_entry : gen->modules) {
+		lbModule *current = module_entry.value;
+
+		// Functions are pruned before globals, one module at a time.
+		lb_run_remove_unused_function_pass(current);
+		lb_run_remove_unused_globals_pass(current);
+	}
+}
+
 struct lbLLVMModulePassWorkerData { // argument bundle handed to lb_llvm_module_pass_worker_proc
 	lbModule *m; // module whose LLVM module the passes run on
 	LLVMTargetMachineRef target_machine; // used when populating the module pass manager
+	bool do_threading; // true: verification is queued on the thread pool; false: verification runs inline in the worker
 };
 
 gb_internal WORKER_TASK_PROC(lb_llvm_module_pass_worker_proc) {
 	auto wd = cast(lbLLVMModulePassWorkerData *)data;
 
-	lb_run_remove_unused_function_pass(wd->m);
-	lb_run_remove_unused_globals_pass(wd->m);
-
 	LLVMPassManagerRef module_pass_manager = LLVMCreatePassManager();
 	lb_populate_module_pass_manager(wd->target_machine, module_pass_manager, build_context.optimization_level);
 	LLVMRunPassManager(module_pass_manager, wd->m->mod);
@@ -2386,6 +2434,17 @@ gb_internal WORKER_TASK_PROC(lb_llvm_module_pass_worker_proc) {
 		return 1;
 	}
 #endif
+
+	if (LLVM_IGNORE_VERIFICATION) {
+		return 0;
+	}
+
+	if (wd->do_threading) {
+		thread_pool_add_task(lb_llvm_module_verification_worker_proc, wd->m);
+	} else {
+		lb_llvm_module_verification_worker_proc(wd->m);
+	}
+
 	return 0;
 }
 
@@ -2393,8 +2452,7 @@ gb_internal WORKER_TASK_PROC(lb_llvm_module_pass_worker_proc) {
 
 gb_internal WORKER_TASK_PROC(lb_generate_procedures_worker_proc) {
 	lbModule *m = cast(lbModule *)data;
-	for (isize i = 0; i < m->procedures_to_generate.count; i++) {
-		lbProcedure *p = m->procedures_to_generate[i];
+	for (lbProcedure *p = nullptr; mpsc_dequeue(&m->procedures_to_generate, &p); /**/) { // pops until the queue reports empty; there is no increment because dequeue advances the loop
 		lb_generate_procedure(p->module, p); // generated in the procedure's own module (p->module), not necessarily `m`
 	}
 	return 0;
 }
@@ -2418,10 +2476,15 @@ gb_internal void lb_generate_procedures(lbGenerator *gen, bool do_threading) {
 
 gb_internal WORKER_TASK_PROC(lb_generate_missing_procedures_to_check_worker_proc) {
 	lbModule *m = cast(lbModule *)data;
-	for (isize i = 0; i < m->missing_procedures_to_check.count; i++) {
-		lbProcedure *p = m->missing_procedures_to_check[i];
-		debugf("Generate missing procedure: %.*s module %p\n", LIT(p->name), m);
-		lb_generate_procedure(m, p);
+	for (lbProcedure *p = nullptr; mpsc_dequeue(&m->missing_procedures_to_check, &p); /**/) { // keeps popping until empty, so entries re-queued by the inner loop below are processed too
+		if (!p->is_done.load(std::memory_order_relaxed)) { // skip procedures whose body has already been generated
+			debugf("Generate missing procedure: %.*s module %p\n", LIT(p->name), m);
+			lb_generate_procedure(m, p);
+		}
+
+		for (lbProcedure *nested = nullptr; mpsc_dequeue(&m->procedures_to_generate, &nested); /**/) { // anything sitting in procedures_to_generate at this point is moved onto the queue the outer loop drains
+			mpsc_enqueue(&m->missing_procedures_to_check, nested);
+		}
 	}
 	return 0;
 }
@@ -2441,6 +2504,12 @@ gb_internal void lb_generate_missing_procedures(lbGenerator *gen, bool do_thread
 			lb_generate_missing_procedures_to_check_worker_proc(m);
 		}
 	}
+
+	for (auto const &entry : gen->modules) {
+		lbModule *m = entry.value;
+		GB_ASSERT(m->missing_procedures_to_check.count == 0);
+		GB_ASSERT(m->procedures_to_generate.count == 0);
+	}
 }
 
 gb_internal void lb_debug_info_complete_types_and_finalize(lbGenerator *gen) {
@@ -2468,19 +2537,16 @@ gb_internal void lb_llvm_function_passes(lbGenerator *gen, bool do_threading) {
 }
 
 
-gb_internal void lb_llvm_module_passes(lbGenerator *gen, bool do_threading) {
+gb_internal void lb_llvm_module_passes_and_verification(lbGenerator *gen, bool do_threading) {
 	if (do_threading) {
 		for (auto const &entry : gen->modules) {
 			lbModule *m = entry.value;
 			auto wd = gb_alloc_item(permanent_allocator(), lbLLVMModulePassWorkerData);
 			wd->m = m;
 			wd->target_machine = m->target_machine;
+			wd->do_threading = true;
 
-			if (do_threading) {
-				thread_pool_add_task(lb_llvm_module_pass_worker_proc, wd);
-			} else {
-				lb_llvm_module_pass_worker_proc(wd);
-			}
+			thread_pool_add_task(lb_llvm_module_pass_worker_proc, wd);
 		}
 		thread_pool_wait();
 	} else {
@@ -2489,6 +2555,7 @@ gb_internal void lb_llvm_module_passes(lbGenerator *gen, bool do_threading) {
 			auto wd = gb_alloc_item(permanent_allocator(), lbLLVMModulePassWorkerData);
 			wd->m = m;
 			wd->target_machine = m->target_machine;
+			wd->do_threading = false;
 			lb_llvm_module_pass_worker_proc(wd);
 		}
 	}
@@ -2569,31 +2636,6 @@ gb_internal String lb_filepath_obj_for_module(lbModule *m) {
 
 }
 
-
-gb_internal bool lb_llvm_module_verification(lbGenerator *gen, bool do_threading) {
-	if (LLVM_IGNORE_VERIFICATION) {
-		return true;
-	}
-
-	if (do_threading) {
-		for (auto const &entry : gen->modules) {
-			lbModule *m = entry.value;
-			thread_pool_add_task(lb_llvm_module_verification_worker_proc, m);
-		}
-		thread_pool_wait();
-
-	} else {
-		for (auto const &entry : gen->modules) {
-			lbModule *m = entry.value;
-			if (lb_llvm_module_verification_worker_proc(m)) {
-				return false;
-			}
-		}
-	}
-
-	return true;
-}
-
 gb_internal void lb_add_foreign_library_paths(lbGenerator *gen) {
 	for (auto const &entry : gen->modules) {
 		lbModule *m = entry.value;
@@ -2689,8 +2731,15 @@ gb_internal lbProcedure *lb_create_main_procedure(lbModule *m, lbProcedure *star
 		params->Tuple.variables[1] = alloc_entity_param(nullptr, make_token_ident("fdwReason"),  t_u32,    false, true);
 		params->Tuple.variables[2] = alloc_entity_param(nullptr, make_token_ident("lpReserved"), t_rawptr, false, true);
 		call_cleanup = false;
-	} else if (build_context.metrics.os == TargetOs_windows && (build_context.metrics.arch == TargetArch_i386 || build_context.no_crt)) {
+	} else if (build_context.metrics.os == TargetOs_windows && build_context.no_crt) {
 		name = str_lit("mainCRTStartup");
+	} else if (build_context.metrics.os == TargetOs_windows && build_context.metrics.arch == TargetArch_i386 && !build_context.no_crt) {
+		// Windows i386 with CRT: libcmt expects _main (main with underscore prefix)
+		name = str_lit("main");
+		has_args = true;
+		slice_init(&params->Tuple.variables, permanent_allocator(), 2);
+		params->Tuple.variables[0] = alloc_entity_param(nullptr, make_token_ident("argc"), t_i32, false, true);
+		params->Tuple.variables[1] = alloc_entity_param(nullptr, make_token_ident("argv"), t_ptr_cstring, false, true);
 	} else if (is_arch_wasm()) {
 		name = str_lit("_start");
 		call_cleanup = false;
@@ -2816,16 +2865,19 @@ gb_internal lbProcedure *lb_create_main_procedure(lbModule *m, lbProcedure *star
 }
 
 gb_internal void lb_generate_procedure(lbModule *m, lbProcedure *p) {
-	if (p->is_done) {
+	if (p->is_done.load(std::memory_order_relaxed)) {
 		return;
 	}
+
 	if (p->body != nullptr) { // Build Procedure
 		m->curr_procedure = p;
 		lb_begin_procedure_body(p);
 		lb_build_stmt(p, p->body);
 		lb_end_procedure_body(p);
-		p->is_done = true;
+		p->is_done.store(true, std::memory_order_relaxed);
 		m->curr_procedure = nullptr;
+	} else if (p->generate_body != nullptr) {
+		p->generate_body(m, p);
 	}
 
 	// Add Flags
@@ -2834,6 +2886,9 @@ gb_internal void lb_generate_procedure(lbModule *m, lbProcedure *p) {
 	}
 
 	lb_verify_function(m, p, true);
+
+	MUTEX_GUARD(&m->generated_procedures_mutex);
+	array_add(&m->generated_procedures, p);
 }
 
 
@@ -3163,6 +3218,7 @@ gb_internal bool lb_generate_code(lbGenerator *gen) {
 				String link_name = e->Procedure.link_name;
 				if (e->pkg->kind == Package_Runtime) {
 					if (link_name == "main"           ||
+					    link_name == "_main"          ||
 					    link_name == "DllMain"        ||
 					    link_name == "WinMain"        ||
 					    link_name == "wWinMain"       ||
@@ -3476,16 +3532,20 @@ gb_internal bool lb_generate_code(lbGenerator *gen) {
 		}
 	}
 
+	TIME_SECTION("LLVM Add Foreign Library Paths");
+	lb_add_foreign_library_paths(gen);
+
 	TIME_SECTION("LLVM Function Pass");
 	lb_llvm_function_passes(gen, do_threading && !build_context.ODIN_DEBUG);
 
-	TIME_SECTION("LLVM Module Pass");
-	lb_llvm_module_passes(gen, do_threading);
+	TIME_SECTION("LLVM Remove Unused Functions and Globals");
+	lb_remove_unused_functions_and_globals(gen);
 
-	TIME_SECTION("LLVM Module Verification");
-	if (!lb_llvm_module_verification(gen, do_threading)) {
-		return false;
-	}
+	TIME_SECTION("LLVM Module Pass and Verification");
+	lb_llvm_module_passes_and_verification(gen, do_threading);
+
+	TIME_SECTION("LLVM Correct Entity Linkage");
+	lb_correct_entity_linkage(gen);
 
 	llvm_error = nullptr;
 	defer (LLVMDisposeMessage(llvm_error));
@@ -3513,11 +3573,6 @@ gb_internal bool lb_generate_code(lbGenerator *gen) {
 		}
 	}
 
-	TIME_SECTION("LLVM Add Foreign Library Paths");
-	lb_add_foreign_library_paths(gen);
-
-	TIME_SECTION("LLVM Correct Entity Linkage");
-	lb_correct_entity_linkage(gen);
 
 	////////////////////////////////////////////
 	for (auto const &entry: gen->modules) {

+ 28 - 7
src/llvm_backend.hpp

@@ -147,9 +147,13 @@ struct lbModule {
 	LLVMModuleRef mod;
 	LLVMContextRef ctx;
 
+	Checker *checker;
+
 	struct lbGenerator *gen;
 	LLVMTargetMachineRef target_machine;
 
+	lbModule *polymorphic_module;
+
 	CheckerInfo *info;
 	AstPackage *pkg; // possibly associated
 	AstFile *file;   // possibly associated
@@ -171,7 +175,8 @@ struct lbModule {
 	StringMap<lbValue>  members;
 	StringMap<lbProcedure *> procedures;
 	PtrMap<LLVMValueRef, Entity *> procedure_values;
-	Array<lbProcedure *> missing_procedures_to_check;
+
+	MPSCQueue<lbProcedure *> missing_procedures_to_check;
 
 	StringMap<LLVMValueRef>   const_strings;
 	String16Map<LLVMValueRef> const_string16s;
@@ -180,10 +185,13 @@ struct lbModule {
 
 	StringMap<lbProcedure *> gen_procs;   // key is the canonicalized name
 
-	Array<lbProcedure *> procedures_to_generate;
+	MPSCQueue<lbProcedure *> procedures_to_generate;
 	Array<Entity *> global_procedures_to_create;
 	Array<Entity *> global_types_to_create;
 
+	BlockingMutex generated_procedures_mutex;
+	Array<lbProcedure *> generated_procedures;
+
 	lbProcedure *curr_procedure;
 
 	LLVMBuilderRef const_dummy_builder;
@@ -232,8 +240,7 @@ struct lbGenerator : LinkerData {
 	PtrMap<LLVMContextRef, lbModule *> modules_through_ctx; 
 	lbModule default_module;
 
-	RecursiveMutex anonymous_proc_lits_mutex;
-	PtrMap<Ast *, lbProcedure *> anonymous_proc_lits; 
+	lbModule *equal_module;
 
 	isize used_module_count;
 
@@ -329,6 +336,14 @@ struct lbVariadicReuseSlices {
 	lbAddr slice_addr;
 };
 
+struct lbGlobalVariable { // element type of the array attached to lbProcedure::global_variables
+	lbValue var; // presumably the global itself - TODO confirm against the code that fills this array
+	lbValue init; // presumably the initial value to store into `var` - TODO confirm
+	DeclInfo *decl; // NOTE(review): looks like the declaration the global came from - confirm
+	bool is_initialized; // NOTE(review): appears to mark globals that need no further startup initialisation - confirm
+};
+
+
 struct lbProcedure {
 	u32 flags;
 	u16 state_flags;
@@ -351,9 +366,9 @@ struct lbProcedure {
 
 	lbFunctionType *abi_function_type;
 
-	LLVMValueRef    value;
-	LLVMBuilderRef  builder;
-	bool            is_done;
+	LLVMValueRef      value;
+	LLVMBuilderRef    builder;
+	std::atomic<bool> is_done;
 
 	lbAddr           return_ptr;
 	Array<lbDefer>   defer_stmts;
@@ -391,6 +406,12 @@ struct lbProcedure {
 	PtrMap<LLVMValueRef, lbTupleFix> tuple_fix_map;
 
 	Array<lbValue> asan_stack_locals;
+
+	void (*generate_body)(lbModule *m, lbProcedure *p);
+	Array<lbGlobalVariable> *global_variables;
+	lbProcedure *objc_names;
+
+	Type *internal_gen_type; // map_set, map_get, etc.
 };
 
 

+ 281 - 1
src/llvm_backend_const.cpp

@@ -168,7 +168,7 @@ gb_internal LLVMValueRef llvm_const_named_struct(lbModule *m, Type *t, LLVMValue
 		return llvm_const_named_struct_internal(struct_type, values, value_count_);
 	}
 	Type *bt = base_type(t);
-	GB_ASSERT(bt->kind == Type_Struct);
+	GB_ASSERT(bt->kind == Type_Struct || bt->kind == Type_Union);
 	
 	GB_ASSERT(value_count_ == bt->Struct.fields.count);
 	
@@ -537,6 +537,245 @@ gb_internal bool lb_is_nested_possibly_constant(Type *ft, Selection const &sel,
 	return lb_is_elem_const(elem, ft);
 }
 
+// Tries to express the constant `variant_value` (of Odin type `variant_type`) as a
+// sequence of constants intended to be used as elements of LLVM type `elem`.
+//
+// Returns the flattened elements on success, or an empty slice when the value
+// cannot be represented that way. The returned storage comes from the temporary
+// allocator.
+//
+// Handled shapes:
+//   - single-field structs: unwrapped recursively
+//   - slice-like values: {ptr-as-int, len} when `elem` is pointer sized
+//   - array-like values: each element flattened individually (must yield exactly one value each)
+//   - integers: passed through, or widened when `elem` is larger and endianness matches the target
+//   - f16/f32/f64: reinterpreted as their bit pattern when `elem` is an integer type
+gb_internal Slice<LLVMValueRef> lb_construct_const_union_flatten_values(lbModule *m, LLVMValueRef variant_value, Type *variant_type, LLVMTypeRef elem) {
+	LLVMTypeRef llvm_variant_type = lb_type(m, variant_type);
+	LLVMTypeKind variant_kind = LLVMGetTypeKind(llvm_variant_type);
+	LLVMTypeKind elem_kind = LLVMGetTypeKind(elem);
+
+	if (is_type_struct(variant_type)) {
+		Type *st = base_type(variant_type);
+		GB_ASSERT(st->kind == Type_Struct);
+		if (st->Struct.fields.count == 1) {
+			// A struct wrapping exactly one field flattens to whatever that field flattens to.
+			LLVMValueRef f = llvm_const_extract_value(m, variant_value, 0);
+			return lb_construct_const_union_flatten_values(m, f, st->Struct.fields[0]->type, elem);
+		}
+	} else if (is_llvm_type_slice_like(llvm_variant_type)) {
+		if (lb_sizeof(elem) == build_context.ptr_size) {
+			LLVMValueRef *elems = temporary_alloc_array<LLVMValueRef>(2);
+			elems[0] = llvm_const_extract_value(m, variant_value, 0);
+			elems[0] = LLVMConstPtrToInt(elems[0], elem);
+
+			elems[1] = llvm_const_extract_value(m, variant_value, 1);
+
+			return {elems, 2};
+		}
+	} else if (is_type_array_like(variant_type)) {
+		Type *array_elem = base_array_type(variant_type);
+		isize array_count = get_array_type_count(variant_type);
+		Slice<LLVMValueRef> array = temporary_slice_make<LLVMValueRef>(array_count);
+		for (isize i = 0; i < array_count; i++) {
+			// Extract the i-th element. Using a fixed index of 0 here would copy the
+			// first element into every slot of the flattened result.
+			LLVMValueRef v = llvm_const_extract_value(m, variant_value, cast(unsigned)i);
+			auto res = lb_construct_const_union_flatten_values(m, v, array_elem, elem);
+			if (res.count != 1) {
+				// An element that does not map onto exactly one `elem` cannot be flattened.
+				return {};
+			}
+			array[i] = res[0];
+		}
+		return array;
+	} else if (variant_kind == LLVMIntegerTypeKind) {
+		if (elem == llvm_variant_type) {
+			LLVMValueRef *elems = temporary_alloc_array<LLVMValueRef>(1);
+			elems[0] = variant_value;
+			return {elems, 1};
+		} else if (!is_type_different_to_arch_endianness(variant_type)) {
+			i64 elem_size = lb_sizeof(elem);
+			i64 variant_size = lb_sizeof(llvm_variant_type);
+			if (elem_size > variant_size) {
+				// Widen by zero-extension into the larger element type.
+				u64 val = LLVMConstIntGetZExtValue(variant_value);
+
+				LLVMValueRef *elems = temporary_alloc_array<LLVMValueRef>(1);
+				elems[0] = LLVMConstInt(elem, val, false);
+				return {elems, 1};
+			}
+		}
+	} else if (!is_type_different_to_arch_endianness(variant_type) &&
+	           elem_kind == LLVMIntegerTypeKind) {
+		// Floating-point values are stored as their raw bit pattern in an integer element.
+		switch (variant_kind) {
+		case LLVMHalfTypeKind:
+			{
+				LLVMBool loses = false;
+				f64 res = LLVMConstRealGetDouble(variant_value, &loses);
+				u16 val = f32_to_f16(cast(f32)res);
+
+				LLVMValueRef *elems = temporary_alloc_array<LLVMValueRef>(1);
+				elems[0] = LLVMConstInt(elem, val, false);
+				return {elems, 1};
+			}
+			break;
+		case LLVMFloatTypeKind:
+			{
+				LLVMBool loses = false;
+				f64 res = LLVMConstRealGetDouble(variant_value, &loses);
+				union { f32 f; u32 i; } val = {};
+				val.f = cast(f32)res;
+
+				LLVMValueRef *elems = temporary_alloc_array<LLVMValueRef>(1);
+				elems[0] = LLVMConstInt(elem, val.i, false);
+				return {elems, 1};
+			}
+			break;
+		case LLVMDoubleTypeKind:
+			{
+				LLVMBool loses = false;
+				f64 res = LLVMConstRealGetDouble(variant_value, &loses);
+				union { f64 f; u64 i; } val = {};
+				val.f = res;
+
+				LLVMValueRef *elems = temporary_alloc_array<LLVMValueRef>(1);
+				elems[0] = LLVMConstInt(elem, val.i, false);
+				return {elems, 1};
+			}
+			break;
+		}
+	}
+
+	// No flattening rule applies.
+	return {};
+}
+
+// Builds an LLVM constant of `union_type` that holds `variant_value`, a constant of
+// `variant_type`.
+//
+// Returns the union constant, or nullptr when the variant cannot be expressed in the
+// union's block type as a constant (the caller must then use a non-constant path).
+//
+// The produced struct is laid out as {block, tag[, padding]}; maybe-pointer unions are
+// instead a plain bitcast of the variant value.
+gb_internal LLVMValueRef lb_construct_const_union(lbModule *m, LLVMValueRef variant_value, Type *variant_type, Type *union_type) {
+	Type *bt = base_type(union_type);
+	GB_ASSERT(bt->kind == Type_Union);
+	GB_ASSERT(lb_type(m, variant_type) == LLVMTypeOf(variant_value));
+
+	LLVMTypeRef llvm_type = lb_type(m, union_type);
+
+	bool variant_is_zero = !!LLVMIsNull(variant_value);
+
+	if (variant_is_zero) {
+		// An all-zero union is only the right constant when all-zero bits denote this
+		// variant: the union has no variants, it is a maybe-pointer union (null pointer),
+		// or this variant's tag is 0. Otherwise a zero-valued variant (e.g. integer 0)
+		// must still carry its own non-zero tag, so fall through and build it properly.
+		if (bt->Union.variants.count == 0 ||
+		    is_type_union_maybe_pointer(bt) ||
+		    union_variant_index(union_type, variant_type) == 0) {
+			return LLVMConstNull(llvm_type);
+		}
+	}
+
+	if (bt->Union.variants.count == 0) {
+		GB_ASSERT(LLVMIsNull(variant_value));
+		return variant_value;
+	}
+
+	i64 block_size = bt->Union.variant_block_size;
+	i64 variant_size = type_size_of(variant_type);
+
+	LLVMTypeRef llvm_variant_type = lb_type(m, variant_type);
+
+	if (is_type_union_maybe_pointer(bt)) {
+		GB_ASSERT(lb_sizeof(LLVMTypeOf(variant_value)) == lb_sizeof(llvm_type));
+		return LLVMConstBitCast(variant_value, llvm_type);
+	}
+
+	if (bt->Union.variants.count == 1) {
+		// Single variant: the value is stored as-is, followed by the tag and optional padding.
+		unsigned long long the_tag = cast(unsigned long long)union_variant_index(union_type, variant_type);
+		LLVMTypeRef tag_type = lb_type(m, union_tag_type(bt));
+
+		LLVMValueRef values[3] = {};
+		unsigned i = 0;
+		values[i++] = variant_value;
+		values[i++] = LLVMConstInt(tag_type, the_tag, false);
+
+		i64 used_size = block_size + lb_sizeof(tag_type);
+		i64 padding = type_size_of(union_type) - used_size;
+		i64 align = type_align_of(union_type);
+		if (padding > 0) {
+			LLVMTypeRef padding_type = lb_type_padding_filler(m, padding, align);
+			values[i++] = LLVMConstNull(padding_type);
+		}
+
+		return LLVMConstNamedStruct(llvm_type, values, i);
+	}
+
+	LLVMTypeRef block_type = LLVMStructGetTypeAtIndex(llvm_type, 0);
+
+	LLVMTypeRef tag_type = lb_type(m, union_tag_type(bt));
+
+	i64 used_size = block_size + lb_sizeof(tag_type);
+	i64 padding = type_size_of(union_type) - used_size;
+	i64 align = type_align_of(union_type);
+	LLVMTypeRef padding_type = nullptr;
+	if (padding > 0) {
+		padding_type = lb_type_padding_filler(m, padding, align);
+	}
+
+
+	unsigned i = 0;
+	LLVMValueRef values[3] = {};
+
+	LLVMValueRef block_value = variant_value;
+
+	if (block_size == 0) {
+		block_value = LLVMConstNull(block_type);
+	} else if (lb_sizeof(llvm_variant_type) == 0) {
+		block_value = LLVMConstNull(block_type);
+	} else if (block_type != llvm_variant_type) {
+		if (variant_is_zero) {
+			// A zero variant is a zero block regardless of the block's shape; only the tag differs.
+			block_value = LLVMConstNull(block_type);
+			goto assign_value_wrapped;
+		}
+
+		LLVMTypeKind block_kind   = LLVMGetTypeKind(block_type);
+		LLVMTypeKind variant_kind = LLVMGetTypeKind(llvm_variant_type);
+
+
+		if (block_kind == LLVMArrayTypeKind) {
+			LLVMTypeRef elem = LLVMGetElementType(block_type);
+			unsigned count = LLVMGetArrayLength(block_type);
+
+			Slice<LLVMValueRef> partial_elems = lb_construct_const_union_flatten_values(m, variant_value, variant_type, elem);
+			if (partial_elems.count == cast(isize)count) {
+				block_value = LLVMConstArray(elem, partial_elems.data, count);
+				goto assign_value_wrapped;
+			}
+
+			// The value here is known to be non-zero. If it could not be flattened at all,
+			// or does not fit in the block, zero-filling/truncating would silently produce
+			// a different constant, so report failure instead.
+			if (partial_elems.count == 0 || partial_elems.count > cast(isize)count) {
+				return nullptr;
+			}
+
+			// Fewer elements than the block holds: the remainder of the block is zero.
+			Slice<LLVMValueRef> full_elems = temporary_slice_make<LLVMValueRef>(count);
+			slice_copy(&full_elems, partial_elems);
+			for (isize j = partial_elems.count; j < cast(isize)count; j++) {
+				full_elems[j] = LLVMConstNull(elem);
+			}
+			block_value = LLVMConstArray(elem, full_elems.data, count);
+			goto assign_value_wrapped;
+
+		} else if (block_size != variant_size) {
+			if (block_kind == LLVMIntegerTypeKind && !is_type_different_to_arch_endianness(variant_type)) {
+				Slice<LLVMValueRef> partial_elems = lb_construct_const_union_flatten_values(m, variant_value, variant_type, block_type);
+				if (partial_elems.count == 1) {
+					block_value = partial_elems[0];
+					goto assign_value_wrapped;
+				}
+			}
+
+			return nullptr;
+		}
+		if (block_kind == LLVMIntegerTypeKind) {
+			GB_ASSERT(block_size == variant_size);
+
+			// Same size, integer block: reinterpret floats by bitcast and pointers by ptrtoint.
+			switch (variant_kind) {
+			case LLVMHalfTypeKind:
+			case LLVMFloatTypeKind:
+			case LLVMDoubleTypeKind:
+				block_value = LLVMConstBitCast(block_value, block_type);
+				goto assign_value_wrapped;
+			case LLVMPointerTypeKind:
+				block_value = LLVMConstPtrToInt(block_value, block_type);
+				goto assign_value_wrapped;
+			}
+		}
+
+		return nullptr;
+	}
+
+assign_value_wrapped:;
+	values[i++] = block_value;
+
+	unsigned long long the_tag = cast(unsigned long long)union_variant_index(union_type, variant_type);
+	values[i++] = LLVMConstInt(tag_type, the_tag, false);
+	if (padding > 0) {
+		values[i++] = LLVMConstNull(padding_type);
+	}
+	return LLVMConstNamedStruct(llvm_type, values, i);
+}
+
+// If `*value` is a constant and it can be wrapped into a constant of `union_type`
+// (as the `variant_type` variant), overwrites `*value` with that union constant and
+// returns true. Otherwise leaves `*value` untouched and returns false.
+gb_internal bool lb_try_construct_const_union(lbModule *m, lbValue *value, Type *variant_type, Type *union_type) {
+	if (!lb_is_const(*value)) {
+		return false;
+	}
+
+	LLVMValueRef wrapped = lb_construct_const_union(m, value->value, variant_type, union_type);
+	if (wrapped == nullptr) {
+		// No constant form available for this variant/union pair.
+		// gb_printf_err("%s -> %s\n", LLVMPrintValueToString(value->value), LLVMPrintTypeToString(lb_type(m, union_type)));
+		return false;
+	}
+
+	*value = {wrapped, union_type};
+	return true;
+}
+
+
 gb_internal lbValue lb_const_value(lbModule *m, Type *type, ExactValue value, lbConstContext cc) {
 	if (cc.allow_local) {
 		cc.is_rodata = false;
@@ -585,6 +824,47 @@ gb_internal lbValue lb_const_value(lbModule *m, Type *type, ExactValue value, lb
 
 	bool is_local = cc.allow_local && m->curr_procedure != nullptr;
 
+	if (is_type_union(type) && is_type_union_constantable(type)) {
+		Type *bt = base_type(type);
+		GB_ASSERT(bt->kind == Type_Union);
+		GB_ASSERT(bt->Union.variants.count <= 1);
+		if (bt->Union.variants.count == 0) {
+			return lb_const_nil(m, original_type);
+		} else if (bt->Union.variants.count == 1) {
+			Type *t = bt->Union.variants[0];
+			lbValue cv =  lb_const_value(m, t, value, cc);
+			GB_ASSERT(LLVMIsConstant(cv.value));
+
+			LLVMTypeRef llvm_type = lb_type(m, original_type);
+
+			if (is_type_union_maybe_pointer(type)) {
+				LLVMValueRef values[1] = {cv.value};
+				res.value = llvm_const_named_struct_internal(llvm_type, values, 1);
+				res.type = original_type;
+				return res;
+			} else {
+
+				unsigned tag_value = 1;
+				if (bt->Union.kind == UnionType_no_nil) {
+					tag_value = 0;
+				}
+				LLVMValueRef tag = LLVMConstInt(LLVMStructGetTypeAtIndex(llvm_type, 1), tag_value, false);
+				LLVMValueRef padding = nullptr;
+				LLVMValueRef values[3] = {cv.value, tag, padding};
+
+				isize value_count = 2;
+				if (LLVMCountStructElementTypes(llvm_type) > 2) {
+					value_count = 3;
+					padding = LLVMConstNull(LLVMStructGetTypeAtIndex(llvm_type, 2));
+				}
+				res.value = llvm_const_named_struct_internal(llvm_type, values, value_count);
+				res.type = original_type;
+				return res;
+			}
+		}
+
+	}
+
 	// GB_ASSERT_MSG(is_type_typed(type), "%s", type_to_string(type));
 
 	if (is_type_slice(type)) {

+ 1 - 1
src/llvm_backend_debug.cpp

@@ -1327,7 +1327,7 @@ gb_internal void lb_add_debug_info_for_global_constant_from_entity(lbGenerator *
 	}
 	lbModule *m = &gen->default_module;
 	if (USE_SEPARATE_MODULES) {
-		m = lb_module_of_entity(gen, e);
+		m = lb_module_of_entity(gen, e, m);
 	}
 	GB_ASSERT(m != nullptr);
 

+ 15 - 0
src/llvm_backend_expr.cpp

@@ -2495,6 +2495,11 @@ gb_internal lbValue lb_emit_conv(lbProcedure *p, lbValue value, Type *t) {
 			Type *vt = dst->Union.variants[0];
 			if (internal_check_is_assignable_to(src_type, vt)) {
 				value = lb_emit_conv(p, value, vt);
+				if (lb_is_const(value)) {
+					LLVMValueRef res = lb_construct_const_union(m, value.value, vt, t);
+					return {res, t};
+				}
+
 				lbAddr parent = lb_add_local_generated(p, t, true);
 				lb_emit_store_union_variant(p, parent.addr, value, vt);
 				return lb_addr_load(p, parent);
@@ -2503,11 +2508,18 @@ gb_internal lbValue lb_emit_conv(lbProcedure *p, lbValue value, Type *t) {
 		for (Type *vt : dst->Union.variants) {
 			if (src_type == t_llvm_bool && is_type_boolean(vt)) {
 				value = lb_emit_conv(p, value, vt);
+				if (lb_try_construct_const_union(m, &value, vt, t)) {
+					return value;
+				}
+
 				lbAddr parent = lb_add_local_generated(p, t, true);
 				lb_emit_store_union_variant(p, parent.addr, value, vt);
 				return lb_addr_load(p, parent);
 			}
 			if (are_types_identical(src_type, vt)) {
+				if (lb_try_construct_const_union(m, &value, vt, t)) {
+					return value;
+				}
 				lbAddr parent = lb_add_local_generated(p, t, true);
 				lb_emit_store_union_variant(p, parent.addr, value, vt);
 				return lb_addr_load(p, parent);
@@ -2545,6 +2557,9 @@ gb_internal lbValue lb_emit_conv(lbProcedure *p, lbValue value, Type *t) {
 		if (valid_count == 1) {
 			Type *vt = dst->Union.variants[first_success_index];
 			value = lb_emit_conv(p, value, vt);
+			if (lb_try_construct_const_union(m, &value, vt, t)) {
+					return value;
+				}
 			lbAddr parent = lb_add_local_generated(p, t, true);
 			lb_emit_store_union_variant(p, parent.addr, value, vt);
 			return lb_addr_load(p, parent);

+ 239 - 57
src/llvm_backend_general.cpp

@@ -15,7 +15,9 @@ gb_global isize lb_global_type_info_member_offsets_index = 0;
 gb_global isize lb_global_type_info_member_usings_index  = 0;
 gb_global isize lb_global_type_info_member_tags_index    = 0;
 
-gb_internal void lb_init_module(lbModule *m, Checker *c) {
+gb_internal WORKER_TASK_PROC(lb_init_module_worker_proc) {
+	lbModule *m = cast(lbModule *)data;
+	Checker *c = m->checker;
 	m->info = &c->info;
 
 
@@ -46,6 +48,12 @@ gb_internal void lb_init_module(lbModule *m, Checker *c) {
 		}
 		module_name = gb_string_appendc(module_name, "builtin");
 	}
+	if (m->polymorphic_module == m) {
+		if (gb_string_length(module_name)) {
+			module_name = gb_string_appendc(module_name, "-");
+		}
+		module_name = gb_string_appendc(module_name, "$parapoly");
+	}
 
 	m->module_name = module_name;
 	m->ctx = LLVMContextCreate();
@@ -89,15 +97,19 @@ gb_internal void lb_init_module(lbModule *m, Checker *c) {
 	map_init(&m->function_type_map);
 	string_map_init(&m->gen_procs);
 	if (USE_SEPARATE_MODULES) {
-		array_init(&m->procedures_to_generate, a, 0, 1<<10);
+		mpsc_init(&m->procedures_to_generate, a);
 		map_init(&m->procedure_values,               1<<11);
+		array_init(&m->generated_procedures,   a, 0, 1<<10);
 	} else {
-		array_init(&m->procedures_to_generate, a, 0, c->info.all_procedures.count);
+		mpsc_init(&m->procedures_to_generate, a);
 		map_init(&m->procedure_values,               c->info.all_procedures.count*2);
+		array_init(&m->generated_procedures,   a, 0, c->info.all_procedures.count*2);
 	}
+
+
 	array_init(&m->global_procedures_to_create, a, 0, 1024);
 	array_init(&m->global_types_to_create, a, 0, 1024);
-	array_init(&m->missing_procedures_to_check, a, 0, 16);
+	mpsc_init(&m->missing_procedures_to_check, a);
 	map_init(&m->debug_values);
 
 	string_map_init(&m->objc_classes);
@@ -113,6 +125,15 @@ gb_internal void lb_init_module(lbModule *m, Checker *c) {
 
 	m->const_dummy_builder = LLVMCreateBuilderInContext(m->ctx);
 
+	return 0;
+}
+
+// Initialises module `m`, either inline or as a thread-pool task.
+// `m->checker` must already be set, since the worker reads it. When `do_threading`
+// is true this only queues the work; it does not wait for it to finish.
+gb_internal void lb_init_module(lbModule *m, bool do_threading) {
+	if (!do_threading) {
+		lb_init_module_worker_proc(m);
+		return;
+	}
+	thread_pool_add_task(lb_init_module_worker_proc, m);
 }
 
 gb_internal bool lb_init_generator(lbGenerator *gen, Checker *c) {
@@ -125,6 +146,10 @@ gb_internal bool lb_init_generator(lbGenerator *gen, Checker *c) {
 		return false;
 	}
 
+	isize thread_count = gb_max(build_context.thread_count, 1);
+	isize worker_count = thread_count-1;
+	bool do_threading = !!(LLVMIsMultithreaded() && USE_SEPARATE_MODULES && MULTITHREAD_OBJECT_GENERATION && worker_count > 0);
+
 	String init_fullpath = c->parser->init_fullpath;
 	linker_data_init(gen, &c->info, init_fullpath);
 
@@ -136,7 +161,6 @@ gb_internal bool lb_init_generator(lbGenerator *gen, Checker *c) {
 
 	map_init(&gen->modules, gen->info->packages.count*2);
 	map_init(&gen->modules_through_ctx, gen->info->packages.count*2);
-	map_init(&gen->anonymous_proc_lits, 1024);
 
 	if (USE_SEPARATE_MODULES) {
 		bool module_per_file = build_context.module_per_file && build_context.optimization_level <= 0;
@@ -145,26 +169,71 @@ gb_internal bool lb_init_generator(lbGenerator *gen, Checker *c) {
 			auto m = gb_alloc_item(permanent_allocator(), lbModule);
 			m->pkg = pkg;
 			m->gen = gen;
+			m->checker = c;
 			map_set(&gen->modules, cast(void *)pkg, m);
-			lb_init_module(m, c);
-			if (!module_per_file) {
+			lb_init_module(m, do_threading);
+
+			if (LLVM_WEAK_MONOMORPHIZATION) {
+				auto pm = gb_alloc_item(permanent_allocator(), lbModule);
+				pm->pkg = pkg;
+				pm->gen = gen;
+				pm->checker = c;
+				m->polymorphic_module  = pm;
+				pm->polymorphic_module = pm;
+
+				map_set(&gen->modules, cast(void *)pm, pm); // point to itself just add it to the list
+
+				lb_init_module(pm, do_threading);
+			}
+
+			if (pkg->kind == Package_Runtime) {
+				// allow this to be per file
+			} else if (!module_per_file) {
 				continue;
 			}
 			// NOTE(bill): Probably per file is not a good idea, so leave this for later
 			for (AstFile *file : pkg->files) {
-				auto m = gb_alloc_item(permanent_allocator(), lbModule);
+				auto m  = gb_alloc_item(permanent_allocator(), lbModule);
 				m->file = file;
-				m->pkg = pkg;
-				m->gen = gen;
+				m->pkg  = pkg;
+				m->gen  = gen;
+				m->checker = c;
 				map_set(&gen->modules, cast(void *)file, m);
-				lb_init_module(m, c);
+				lb_init_module(m, do_threading);
+
+
+				if (LLVM_WEAK_MONOMORPHIZATION) {
+					auto pm  = gb_alloc_item(permanent_allocator(), lbModule);
+					pm->file = file;
+					pm->pkg  = pkg;
+					pm->gen  = gen;
+					pm->checker = c;
+					m->polymorphic_module  = pm;
+					pm->polymorphic_module = pm;
+
+					map_set(&gen->modules, cast(void *)pm, pm); // point to itself just add it to the list
+
+					lb_init_module(pm, do_threading);
+				}
 			}
 		}
+
+		if (LLVM_WEAK_MONOMORPHIZATION) {
+			lbModule *m = gb_alloc_item(permanent_allocator(), lbModule);
+			gen->equal_module = m;
+			m->gen            = gen;
+			m->checker        = c;
+			map_set(&gen->modules, cast(void *)m, m); // point to itself just add it to the list
+			lb_init_module(m, do_threading);
+		}
 	}
 
 	gen->default_module.gen = gen;
+	gen->default_module.checker = c;
 	map_set(&gen->modules, cast(void *)1, &gen->default_module);
-	lb_init_module(&gen->default_module, c);
+	lb_init_module(&gen->default_module, do_threading);
+
+	thread_pool_wait();
 
 	for (auto const &entry : gen->modules) {
 		lbModule *m = entry.value;
@@ -403,9 +472,9 @@ gb_internal lbModule *lb_module_of_expr(lbGenerator *gen, Ast *expr) {
 	return &gen->default_module;
 }
 
-gb_internal lbModule *lb_module_of_entity(lbGenerator *gen, Entity *e) {
-	GB_ASSERT(e != nullptr);
+gb_internal lbModule *lb_module_of_entity_internal(lbGenerator *gen, Entity *e, lbModule *curr_module) {
 	lbModule **found = nullptr;
+
 	if (e->kind == Entity_Procedure &&
 	    e->decl_info &&
 	    e->decl_info->code_gen_module) {
@@ -428,6 +497,22 @@ gb_internal lbModule *lb_module_of_entity(lbGenerator *gen, Entity *e) {
 	return &gen->default_module;
 }
 
+
+// Resolves the module that entity `e` belongs to. With separate modules enabled,
+// a procedure generated from a polymorphic one is redirected to the resolved
+// module's `polymorphic_module` when that module has one.
+gb_internal lbModule *lb_module_of_entity(lbGenerator *gen, Entity *e, lbModule *curr_module) {
+	GB_ASSERT(e != nullptr);
+	GB_ASSERT(curr_module != nullptr);
+
+	lbModule *owner = lb_module_of_entity_internal(gen, e, curr_module);
+	if (!USE_SEPARATE_MODULES) {
+		return owner;
+	}
+
+	bool from_polymorphic = e->kind == Entity_Procedure && e->Procedure.generated_from_polymorphic;
+	if (from_polymorphic && owner->polymorphic_module) {
+		return owner->polymorphic_module;
+	}
+	return owner;
+}
+
 gb_internal lbAddr lb_addr(lbValue addr) {
 	lbAddr v = {lbAddr_Default, addr};
 	return v;
@@ -1634,8 +1719,92 @@ gb_internal LLVMTypeRef lb_type_internal_for_procedures_raw(lbModule *m, Type *t
 	map_set(&m->func_raw_types, type, new_abi_fn_type);
 
 	return new_abi_fn_type;
+}
 
+
+// Chooses the LLVM type used for the variant block of the union `type`.
+//
+// Decision order:
+//   - no variants                                  -> nullptr
+//   - exactly one variant                          -> that variant's LLVM type
+//   - variant_block_size of zero                   -> padding filler
+//   - pointer-sized alignment and every variant is
+//     internally pointer-like                      -> LLVM type of t_rawptr
+//   - every variant identical to the first         -> that variant's LLVM type
+//   - all non-zero-sized variants identical        -> that variant's LLVM type
+//   - otherwise                                    -> padding filler of block_size/align
+gb_internal LLVMTypeRef lb_type_internal_union_block_type(lbModule *m, Type *type) {
+	GB_ASSERT(type->kind == Type_Union);
+
+	isize variant_count = type->Union.variants.count;
+	if (variant_count <= 0) {
+		return nullptr;
+	}
+	if (variant_count == 1) {
+		return lb_type(m, type->Union.variants[0]);
+	}
+
+	i64 align = type_align_of(type);
+	unsigned block_size = cast(unsigned)type->Union.variant_block_size;
+	if (block_size == 0) {
+		return lb_type_padding_filler(m, block_size, align);
+	}
+
+	// Pointer-aligned union whose variants are all pointer-like.
+	if (align == build_context.ptr_size) {
+		bool every_variant_pointer_like = true;
+		for (isize i = 0; i < variant_count; i++) {
+			if (!is_type_internally_pointer_like(type->Union.variants[i])) {
+				every_variant_pointer_like = false;
+				break;
+			}
+		}
+		if (every_variant_pointer_like) {
+			return lb_type(m, t_rawptr);
+		}
+	}
+
+	// Every variant is the same type as the first one.
+	{
+		Type *first_variant = type->Union.variants[0];
+		bool all_identical = true;
+		for (isize i = 1; i < variant_count; i++) {
+			if (!are_types_identical(first_variant, type->Union.variants[i])) {
+				all_identical = false;
+				break;
+			}
+		}
+		if (all_identical) {
+			return lb_type(m, first_variant);
+		}
+	}
+
+	// Ignoring zero-sized variants, all remaining variants are one and the same type.
+	{
+		Type *sole_sized_variant = nullptr;
+		bool found_conflict = false;
+		for (isize i = 0; i < variant_count; i++) {
+			Type *candidate = type->Union.variants[i];
+			if (type_size_of(candidate) == 0) {
+				continue;
+			}
+			if (sole_sized_variant == nullptr) {
+				sole_sized_variant = candidate;
+			} else if (!are_types_identical(sole_sized_variant, candidate)) {
+				found_conflict = true;
+				break;
+			}
+		}
+		if (!found_conflict && sole_sized_variant != nullptr) {
+			return lb_type(m, sole_sized_variant);
+		}
+	}
+
+	// NOTE: disabled variant of the check above that compares LLVM types instead; kept for reference.
+	// {
+	// 	LLVMTypeRef first_different = nullptr;
+	// 	for (isize i = 0; i < type->Union.variants.count; i++) {
+	// 		Type *t = type->Union.variants[i];
+	// 		if (type_size_of(t) == 0) {
+	// 			continue;
+	// 		}
+	// 		if (first_different == nullptr) {
+	// 			first_different = lb_type(m, base_type(t));
+	// 		} else {
+	// 			LLVMTypeRef llvm_t = lb_type(m, base_type(t));
+	// 			if (llvm_t != first_different) {
+	// 				goto end_rest_zero_except_one_llvm_like;
+	// 			}
+	// 		}
+	// 	}
+	// 	if (first_different != nullptr) {
+	// 		return first_different;
+	// 	}
+	// } end_rest_zero_except_one_llvm_like:;
+
+
+	// Fallback: opaque filler of the block's size and alignment.
+	return lb_type_padding_filler(m, block_size, align);
 }
+
+
 gb_internal LLVMTypeRef lb_type_internal(lbModule *m, Type *type) {
 	LLVMContextRef ctx = m->ctx;
 	i64 size = type_size_of(type); // Check size
@@ -2148,27 +2317,24 @@ gb_internal LLVMTypeRef lb_type_internal(lbModule *m, Type *type) {
 				return LLVMStructTypeInContext(ctx, fields, gb_count_of(fields), false);
 			}
 
-			unsigned block_size = cast(unsigned)type->Union.variant_block_size;
-
 			auto fields = array_make<LLVMTypeRef>(temporary_allocator(), 0, 3);
 			if (is_type_union_maybe_pointer(type)) {
 				LLVMTypeRef variant = lb_type(m, type->Union.variants[0]);
 				array_add(&fields, variant);
-			} else {
-				LLVMTypeRef block_type = nullptr;
+			} else if (type->Union.variants.count == 1) {
+				LLVMTypeRef block_type = lb_type(m, type->Union.variants[0]);
 
-				bool all_pointers = align == build_context.ptr_size;
-				for (isize i = 0; all_pointers && i < type->Union.variants.count; i++) {
-					Type *t = type->Union.variants[i];
-					if (!is_type_internally_pointer_like(t)) {
-						all_pointers = false;
-					}
-				}
-				if (all_pointers) {
-					block_type = lb_type(m, t_rawptr);
-				} else {
-					block_type = lb_type_padding_filler(m, block_size, align);
+				LLVMTypeRef tag_type = lb_type(m, union_tag_type(type));
+				array_add(&fields, block_type);
+				array_add(&fields, tag_type);
+				i64 used_size = lb_sizeof(block_type) + lb_sizeof(tag_type);
+				i64 padding = size - used_size;
+				if (padding > 0) {
+					LLVMTypeRef padding_type = lb_type_padding_filler(m, padding, align);
+					array_add(&fields, padding_type);
 				}
+			} else {
+				LLVMTypeRef block_type = lb_type_internal_union_block_type(m, type);
 
 				LLVMTypeRef tag_type = lb_type(m, union_tag_type(type));
 				array_add(&fields, block_type);
@@ -2914,7 +3080,7 @@ gb_internal lbValue lb_find_ident(lbProcedure *p, lbModule *m, Entity *e, Ast *e
 		return lb_find_procedure_value_from_entity(m, e);
 	}
 	if (USE_SEPARATE_MODULES) {
-		lbModule *other_module = lb_module_of_entity(m->gen, e);
+		lbModule *other_module = lb_module_of_entity(m->gen, e, m);
 		if (other_module != m) {
 			String name = lb_get_entity_name(other_module, e);
 
@@ -2962,7 +3128,7 @@ gb_internal lbValue lb_find_procedure_value_from_entity(lbModule *m, Entity *e)
 
 	lbModule *other_module = m;
 	if (USE_SEPARATE_MODULES) {
-		other_module = lb_module_of_entity(gen, e);
+		other_module = lb_module_of_entity(gen, e, m);
 	}
 	if (other_module == m) {
 		debugf("Missing Procedure (lb_find_procedure_value_from_entity): %.*s module %p\n", LIT(e->token.string), m);
@@ -2979,9 +3145,6 @@ gb_internal lbValue lb_find_procedure_value_from_entity(lbModule *m, Entity *e)
 	}
 
 	if (ignore_body) {
-		mutex_lock(&gen->anonymous_proc_lits_mutex);
-		defer (mutex_unlock(&gen->anonymous_proc_lits_mutex));
-
 		GB_ASSERT(other_module != nullptr);
 		rw_mutex_shared_lock(&other_module->values_mutex);
 		auto *found = map_get(&other_module->values, e);
@@ -2989,10 +3152,10 @@ gb_internal lbValue lb_find_procedure_value_from_entity(lbModule *m, Entity *e)
 		if (found == nullptr) {
 			// THIS IS THE RACE CONDITION
 			lbProcedure *missing_proc_in_other_module = lb_create_procedure(other_module, e, false);
-			array_add(&other_module->missing_procedures_to_check, missing_proc_in_other_module);
+			mpsc_enqueue(&other_module->missing_procedures_to_check, missing_proc_in_other_module);
 		}
 	} else {
-		array_add(&m->missing_procedures_to_check, missing_proc);
+		mpsc_enqueue(&m->missing_procedures_to_check, missing_proc);
 	}
 
 	rw_mutex_shared_lock(&m->values_mutex);
@@ -3010,17 +3173,15 @@ gb_internal lbValue lb_find_procedure_value_from_entity(lbModule *m, Entity *e)
 
 gb_internal lbValue lb_generate_anonymous_proc_lit(lbModule *m, String const &prefix_name, Ast *expr, lbProcedure *parent) {
 	lbGenerator *gen = m->gen;
+	gb_unused(gen);
 
-	mutex_lock(&gen->anonymous_proc_lits_mutex);
-	defer (mutex_unlock(&gen->anonymous_proc_lits_mutex));
+	ast_node(pl, ProcLit, expr);
 
-	TokenPos pos = ast_token(expr).pos;
-	lbProcedure **found = map_get(&gen->anonymous_proc_lits, expr);
-	if (found) {
-		return lb_find_procedure_value_from_entity(m, (*found)->entity);
+	if (pl->decl->entity.load() != nullptr) {
+		return lb_find_procedure_value_from_entity(m, pl->decl->entity.load());
 	}
 
-	ast_node(pl, ProcLit, expr);
+	TokenPos pos = ast_token(expr).pos;
 
 	// NOTE(bill): Generate a new name
 	// parent$count
@@ -3039,30 +3200,51 @@ gb_internal lbValue lb_generate_anonymous_proc_lit(lbModule *m, String const &pr
 	token.string = name;
 	Entity *e = alloc_entity_procedure(nullptr, token, type, pl->tags);
 	e->file = expr->file();
+	e->scope = e->file->scope;
+
+	lbModule *target_module = m;
+	GB_ASSERT(target_module != nullptr);
 
 	// NOTE(bill): this is to prevent a race condition since these procedure literals can be created anywhere at any time
-	pl->decl->code_gen_module = m;
+	pl->decl->code_gen_module = target_module;
 	e->decl_info = pl->decl;
-	pl->decl->entity = e;
 	e->parent_proc_decl = pl->decl->parent;
 	e->Procedure.is_anonymous = true;
 	e->flags |= EntityFlag_ProcBodyChecked;
 
-	lbProcedure *p = lb_create_procedure(m, e);
-	GB_ASSERT(e->code_gen_module == m);
+	pl->decl->entity.store(e);
 
-	lbValue value = {};
-	value.value = p->value;
-	value.type = p->type;
 
-	map_set(&gen->anonymous_proc_lits, expr, p);
-	array_add(&m->procedures_to_generate, p);
-	if (parent != nullptr) {
-		array_add(&parent->children, p);
+	if (target_module != m) {
+		rw_mutex_shared_lock(&target_module->values_mutex);
+		lbValue *found = map_get(&target_module->values, e);
+		rw_mutex_shared_unlock(&target_module->values_mutex);
+		if (found == nullptr) {
+			lbProcedure *missing_proc_in_target_module = lb_create_procedure(target_module, e, false);
+			mpsc_enqueue(&target_module->missing_procedures_to_check, missing_proc_in_target_module);
+		}
+
+		lbProcedure *p = lb_create_procedure(m, e, true);
+
+		lbValue value = {};
+		value.value = p->value;
+		value.type = p->type;
+		return value;
 	} else {
-		string_map_set(&m->members, name, value);
+		lbProcedure *p = lb_create_procedure(m, e);
+
+		lbValue value = {};
+		value.value = p->value;
+		value.type = p->type;
+
+		mpsc_enqueue(&m->procedures_to_generate, p);
+		if (parent != nullptr) {
+			array_add(&parent->children, p);
+		} else {
+			string_map_set(&m->members, name, value);
+		}
+		return value;
 	}
-	return value;
 }
 
 
@@ -3145,7 +3327,7 @@ gb_internal lbValue lb_find_value_from_entity(lbModule *m, Entity *e) {
 	}
 
 	if (USE_SEPARATE_MODULES) {
-		lbModule *other_module = lb_module_of_entity(m->gen, e);
+		lbModule *other_module = lb_module_of_entity(m->gen, e, m);
 
 		bool is_external = other_module != m;
 		if (!is_external) {

+ 3 - 4
src/llvm_backend_proc.cpp

@@ -84,7 +84,7 @@ gb_internal lbProcedure *lb_create_procedure(lbModule *m, Entity *entity, bool i
 	String link_name = {};
 
 	if (ignore_body) {
-		lbModule *other_module = lb_module_of_entity(m->gen, entity);
+		lbModule *other_module = lb_module_of_entity(m->gen, entity, m);
 		link_name = lb_get_entity_name(other_module, entity);
 	} else {
 		link_name = lb_get_entity_name(m, entity);
@@ -99,7 +99,6 @@ gb_internal lbProcedure *lb_create_procedure(lbModule *m, Entity *entity, bool i
 		}
 	}
 
-
 	lbProcedure *p = gb_alloc_item(permanent_allocator(), lbProcedure);
 
 	p->module = m;
@@ -835,7 +834,7 @@ gb_internal void lb_build_nested_proc(lbProcedure *p, AstProcLit *pd, Entity *e)
 
 	lb_add_entity(m, e, value);
 	array_add(&p->children, nested_proc);
-	array_add(&m->procedures_to_generate, nested_proc);
+	mpsc_enqueue(&m->procedures_to_generate, nested_proc);
 }
 
 
@@ -2211,7 +2210,7 @@ gb_internal lbValue lb_build_builtin_proc(lbProcedure *p, Ast *expr, TypeAndValu
 				GB_ASSERT(e != nullptr);
 
 				if (e->parent_proc_decl != nullptr && e->parent_proc_decl->entity != nullptr) {
-					procedure = e->parent_proc_decl->entity->token.string;
+					procedure = e->parent_proc_decl->entity.load()->token.string;
 				} else {
 					procedure = str_lit("");
 				}

+ 1 - 1
src/llvm_backend_stmt.cpp

@@ -92,7 +92,7 @@ gb_internal void lb_build_constant_value_decl(lbProcedure *p, AstValueDecl *vd)
 			value.value = nested_proc->value;
 			value.type = nested_proc->type;
 
-			array_add(&p->module->procedures_to_generate, nested_proc);
+			mpsc_enqueue(&p->module->procedures_to_generate, nested_proc);
 			array_add(&p->children, nested_proc);
 			string_map_set(&p->module->members, name, value);
 		}

+ 3 - 2
src/llvm_backend_type.cpp

@@ -394,8 +394,9 @@ gb_internal void lb_setup_type_info_data_giant_array(lbModule *m, i64 global_typ
 			String proc_name = {};
 			if (t->Named.type_name->parent_proc_decl) {
 				DeclInfo *decl = t->Named.type_name->parent_proc_decl;
-				if (decl->entity && decl->entity->kind == Entity_Procedure) {
-					proc_name = decl->entity->token.string;
+				Entity *e = decl->entity.load();
+				if (e && e->kind == Entity_Procedure) {
+					proc_name = e->token.string;
 				}
 			}
 			TokenPos pos = t->Named.type_name->token.pos;

+ 16 - 0
src/main.cpp

@@ -403,6 +403,8 @@ enum BuildFlagKind {
 	BuildFlag_InternalCached,
 	BuildFlag_InternalNoInline,
 	BuildFlag_InternalByValue,
+	BuildFlag_InternalWeakMonomorphization,
+	BuildFlag_InternalLLVMVerification,
 
 	BuildFlag_Tilde,
 
@@ -626,6 +628,8 @@ gb_internal bool parse_build_flags(Array<String> args) {
 	add_flag(&build_flags, BuildFlag_InternalCached,          str_lit("internal-cached"),           BuildFlagParam_None,    Command_all);
 	add_flag(&build_flags, BuildFlag_InternalNoInline,        str_lit("internal-no-inline"),        BuildFlagParam_None,    Command_all);
 	add_flag(&build_flags, BuildFlag_InternalByValue,         str_lit("internal-by-value"),         BuildFlagParam_None,    Command_all);
+	add_flag(&build_flags, BuildFlag_InternalWeakMonomorphization, str_lit("internal-weak-monomorphization"), BuildFlagParam_None, Command_all);
+	add_flag(&build_flags, BuildFlag_InternalLLVMVerification, str_lit("internal-ignore-llvm-verification"), BuildFlagParam_None, Command_all);
 
 #if ALLOW_TILDE
 	add_flag(&build_flags, BuildFlag_Tilde,                   str_lit("tilde"),                     BuildFlagParam_None,    Command__does_build);
@@ -1584,6 +1588,13 @@ gb_internal bool parse_build_flags(Array<String> args) {
 						case BuildFlag_InternalByValue:
 							build_context.internal_by_value = true;
 							break;
+						case BuildFlag_InternalWeakMonomorphization:
+							build_context.internal_weak_monomorphization = true;
+							break;
+						case BuildFlag_InternalLLVMVerification:
+							build_context.internal_ignore_llvm_verification = true;
+							break;
+
 
 						case BuildFlag_Tilde:
 							build_context.tilde_backend = true;
@@ -3618,6 +3629,11 @@ int main(int arg_count, char const **arg_ptr) {
 	// 	print_usage_line(0, "%.*s 32-bit is not yet supported for this platform", LIT(args[0]));
 	// 	return 1;
 	// }
+	
+	// Warn about Windows i386 thread-local storage limitations
+	if (build_context.metrics.arch == TargetArch_i386 && build_context.metrics.os == TargetOs_windows) {
+		gb_printf_err("Warning: Thread-local storage is disabled on Windows i386.\n");
+	}
 
 	// Check chosen microarchitecture. If not found or ?, print list.
 	bool print_microarch_list = true;

+ 15 - 10
src/string.cpp

@@ -633,23 +633,28 @@ gb_internal String normalize_path(gbAllocator a, String const &path, String cons
 		return WideCharToMultiByte(CP_UTF8, WC_ERR_INVALID_CHARS, widechar_input, input_length, output, output_size, nullptr, nullptr);
 	}
 #elif defined(GB_SYSTEM_UNIX) || defined(GB_SYSTEM_OSX)
-
-	#include <iconv.h>
+	#include <wchar.h>
 
 	gb_internal int convert_multibyte_to_widechar(char const *multibyte_input, usize input_length, wchar_t *output, usize output_size) {
-		iconv_t conv = iconv_open("WCHAR_T", "UTF-8");
-		size_t result = iconv(conv, cast(char **)&multibyte_input, &input_length, cast(char **)&output, &output_size);
-		iconv_close(conv);
+		/* mbsrtowcs needs a NUL-terminated source, so convert from a terminated heap copy of the input. */
+		String terminated = copy_string(heap_allocator(), make_string(cast(u8 const *)multibyte_input, input_length));
+		char const *cursor = cast(char const *)terminated.text;
+
+		mbstate_t state = { 0 };
+		size_t converted = mbsrtowcs(output, &cursor, output_size, &state);
 
-		return cast(int)result;
+		gb_free(heap_allocator(), terminated.text);
+		/* (size_t)-1 is the error return of mbsrtowcs; report it as -1. */
+		if (converted == (size_t)-1) {
+			return -1;
+		}
+		return (int)converted;
 	}
 
 	gb_internal int convert_widechar_to_multibyte(wchar_t const *widechar_input, usize input_length, char* output, usize output_size) {
-		iconv_t conv = iconv_open("UTF-8", "WCHAR_T");
-		size_t result = iconv(conv, cast(char**) &widechar_input, &input_length, cast(char **)&output, &output_size);
-		iconv_close(conv);
+		/*
+		 * Converts `input_length` wide characters from `widechar_input` into the multibyte
+		 * buffer `output` (at most `output_size` bytes). Returns the value reported by
+		 * wcsrtombs, or -1 if the conversion fails.
+		 *
+		 * wcsrtombs reads wchar_t elements until it meets a wide NUL (L'\0'), so the input
+		 * is copied as whole wchar_t elements and terminated with L'\0'. Copying only
+		 * `input_length` bytes with a one-byte terminator would truncate the data and let
+		 * wcsrtombs read past the end of the copy. `input_length` is treated as a count of
+		 * wide characters, which is how the Windows implementation passes it to
+		 * WideCharToMultiByte.
+		 */
+		wchar_t *buffer = gb_alloc_array(heap_allocator(), wchar_t, input_length + 1);
+		gb_memmove(buffer, widechar_input, cast(isize)(input_length * sizeof(wchar_t)));
+		buffer[input_length] = L'\0';
+		wchar_t const *input = buffer; /* wcsrtombs advances this cursor; `buffer` keeps the allocation for gb_free */
+
+		mbstate_t	ps = { 0 };
+		size_t	result = wcsrtombs(output, &input, output_size, &ps);
 
-		return cast(int)result;
+		gb_free(heap_allocator(), buffer);
+		return (result == (size_t)-1) ? -1 : (int)result;
 	}
 #else
 #error Implement system

+ 14 - 6
src/thread_pool.cpp

@@ -19,6 +19,11 @@ enum GrabState {
 	Grab_Failed  = 2,
 };
 
+enum BroadcastWaitState { // values stored in ThreadPool::tasks_available to record whether a worker is about to sleep on it
+	Nobody_Waiting  = 0, // set by thread_pool_queue_push (via compare-exchange, just before it broadcasts) and by thread_pool_destroy
+	Someone_Waiting = 1, // set by a worker thread immediately before it calls futex_wait on tasks_available
+};
+
 struct ThreadPool {
 	gbAllocator       threads_allocator;
 	Slice<Thread>     threads;
@@ -54,8 +59,8 @@ gb_internal void thread_pool_destroy(ThreadPool *pool) {
 
 	for_array_off(i, 1, pool->threads) {
 		Thread *t = &pool->threads[i];
-		pool->tasks_available.fetch_add(1, std::memory_order_acquire);
-		futex_broadcast(&pool->tasks_available);
+		pool->tasks_available.store(Nobody_Waiting);
+		futex_broadcast(&t->pool->tasks_available);
 		thread_join_and_destroy(t);
 	}
 
@@ -87,8 +92,10 @@ void thread_pool_queue_push(Thread *thread, WorkerTask task) {
 	thread->queue.bottom.store(bot + 1, std::memory_order_relaxed);
 
 	thread->pool->tasks_left.fetch_add(1, std::memory_order_release);
-	thread->pool->tasks_available.fetch_add(1, std::memory_order_relaxed);
-	futex_broadcast(&thread->pool->tasks_available);
+	i32 state = Someone_Waiting;
+	if (thread->pool->tasks_available.compare_exchange_strong(state, Nobody_Waiting)) {
+		futex_broadcast(&thread->pool->tasks_available);
+	}
 }
 
 GrabState thread_pool_queue_take(Thread *thread, WorkerTask *task) {
@@ -230,12 +237,13 @@ gb_internal THREAD_PROC(thread_pool_thread_proc) {
 		}
 
 		// if we've done all our work, and there's nothing to steal, go to sleep
-		state = pool->tasks_available.load(std::memory_order_acquire);
+		pool->tasks_available.store(Someone_Waiting);
 		if (!pool->running) { break; }
-		futex_wait(&pool->tasks_available, state);
+		futex_wait(&pool->tasks_available, Someone_Waiting);
 
 		main_loop_continue:;
 	}
 
 	return 0;
 }
+

+ 10 - 2
src/threading.cpp

@@ -195,7 +195,13 @@ gb_internal void mutex_lock(RecursiveMutex *m) {
 			// inside the lock
 			return;
 		}
-		futex_wait(&m->owner, prev_owner);
+
+		// NOTE(lucas): we spin (yielding the thread) instead of waiting on the futex, since a futex signal
+		// is expensive on OSX. The recursive locks are very short lived, so we don't hit this path very often,
+		// and I see no performance regression on Windows (with a performance uplift on OSX).
+
+		//futex_wait(&m->owner, prev_owner);
+		yield_thread();
 	}
 }
 gb_internal bool mutex_try_lock(RecursiveMutex *m) {
@@ -216,7 +222,9 @@ gb_internal void mutex_unlock(RecursiveMutex *m) {
 		return;
 	}
 	m->owner.exchange(0, std::memory_order_release);
-	futex_signal(&m->owner);
+	// NOTE(lucas): see comment about spin lock in mutex_lock above
+
+	// futex_signal(&m->owner);
 	// outside the lock
 }