瀏覽代碼

Add `runtime.conditional_mem_zero` to improve `heap_allocator` performance on non-Windows systems

gingerBill 2 周之前
父節點
當前提交
ac01d1b5bf
共有 2 個文件被更改,包括 56 次插入4 次删除
  1. 6 4
      base/runtime/heap_allocator.odin
  2. 50 0
      base/runtime/internal.odin

+ 6 - 4
base/runtime/heap_allocator.odin

@@ -71,10 +71,12 @@ heap_allocator_proc :: proc(allocator_data: rawptr, mode: Allocator_Mode,
 
 
 		new_memory = aligned_alloc(new_size, new_alignment, p, old_size, zero_memory) or_return
 		new_memory = aligned_alloc(new_size, new_alignment, p, old_size, zero_memory) or_return
 
 
-		// NOTE: heap_resize does not zero the new memory, so we do it
-		if zero_memory && new_size > old_size {
-			new_region := raw_data(new_memory[old_size:])
-			intrinsics.mem_zero(new_region, new_size - old_size)
+		when ODIN_OS != .Windows {
+			// NOTE: heap_resize does not zero the new memory, so we do it
+			if zero_memory && new_size > old_size {
+				new_region := raw_data(new_memory[old_size:])
+				conditional_mem_zero(new_region, new_size - old_size)
+			}
 		}
 		}
 		return
 		return
 	}
 	}

+ 50 - 0
base/runtime/internal.odin

@@ -230,6 +230,56 @@ non_zero_mem_resize :: proc(ptr: rawptr, old_size, new_size: int, alignment: int
 	return _mem_resize(ptr, old_size, new_size, alignment, allocator, false, loc)
 	return _mem_resize(ptr, old_size, new_size, alignment, allocator, false, loc)
 }
 }
 
 
+// conditional_mem_zero makes the `n_` bytes starting at `data` zero, but only
+// writes to the words/bytes that are not already zero. A non-positive `n_`
+// is a no-op.
+conditional_mem_zero :: proc "contextless" (data: rawptr, n_: int) #no_bounds_check {
+	// When acquiring memory from the OS for the first time it's likely that the
+	// OS already gives the zero page mapped multiple times for the request. The
+	// actual allocation does not have physical pages allocated to it until those
+	// pages are written to which causes a page-fault. This is often called COW
+	// (Copy on Write)
+	//
+	// You do not want to actually zero out memory in this case because it would
+	// cause a bunch of page faults decreasing the speed of allocations and
+	// increase the amount of actual resident physical memory used.
+	//
+	// Instead a better technique is to check if memory is zeroed before zeroing
+	// it. This turns out to be an important optimization in practice, saving
+	// nearly half (or more) the amount of physical memory used by an application.
+	// This is why every implementation of calloc in libc does this optimization.
+	//
+	// It may seem counter-intuitive but most allocations in an application are
+	// wasted and never used. When you consider something like a [dynamic]T which
+	// always doubles in capacity on resize but you rarely ever actually use the
+	// full capacity of a dynamic array it means you have a lot of resident waste
+	// if you actually zeroed the remainder of the memory.
+	//
+	// Keep in mind the OS is already guaranteed to give you zeroed memory by
+	// mapping in this zero page multiple times so in the best case there is no
+	// need to actually zero anything. As for testing all this memory for a zero
+	// value, it costs nothing because the same zero page is used for the
+	// whole allocation and will exist in L1 cache for the entire zero checking
+	// process.
+
+	if n_ <= 0 {
+		return
+	}
+	// n_ is strictly positive past the guard above, so converting to uint
+	// cannot wrap.
+	n := uint(n_)
+
+	// Split the range into whole uintptr-sized words followed by a byte tail
+	// of fewer than size_of(uintptr) bytes.
+	n_words := n / size_of(uintptr)
+	// NOTE(review): n_bytes is not read anywhere below; the tail slice is
+	// derived directly from n_words and n. It looks removable - confirm.
+	n_bytes := n % size_of(uintptr)
+	// NOTE(review): this reinterprets `data` as an array of uintptr; presumably
+	// callers pass memory for which word-sized access is acceptable - confirm.
+	p_words := ([^]uintptr)(data)[:n_words]
+	p_bytes := ([^]byte)(data)[size_of(uintptr) * n_words:n]
+	// Read each word first and store only when it is non-zero, so memory that
+	// is already zero is never written to.
+	for &p_word in p_words {
+		if p_word != 0 {
+			p_word = 0
+		}
+	}
+	// Same read-before-write treatment for the trailing bytes.
+	for &p_byte in p_bytes {
+		if p_byte != 0 {
+			p_byte = 0
+		}
+	}
+}
+
 memory_equal :: proc "contextless" (x, y: rawptr, n: int) -> bool {
 memory_equal :: proc "contextless" (x, y: rawptr, n: int) -> bool {
 	switch {
 	switch {
 	case n == 0: return true
 	case n == 0: return true