
Merge pull request #3919 from colrdavidson/growing_workpool

move to a growing workstealing queue
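
The new per-thread queue is essentially a Chase-Lev style work-stealing deque: the owning thread pushes and takes tasks at the bottom index, other workers steal from the top index, and the ring buffer now grows instead of asserting when it fills. The following is a minimal, fixed-capacity C++ sketch of that protocol for orientation only; the type and function names are illustrative rather than the compiler's own, and the committed version below additionally swaps in a ring of double the size when the buffer is full.

#include <atomic>
#include <cstddef>

struct Task { void (*do_work)(void *); void *data; };

constexpr ptrdiff_t CAP = 1 << 10; // power-of-two capacity; the PR grows this instead

struct Deque {
	std::atomic<ptrdiff_t> top{0};
	std::atomic<ptrdiff_t> bottom{0};
	Task buffer[CAP];
};

// Owner only: write the slot first, then publish it by bumping bottom.
void push(Deque &q, Task t) {
	ptrdiff_t b = q.bottom.load(std::memory_order_relaxed);
	// The committed queue grows the ring here when (b - top) reaches capacity;
	// this sketch simply assumes the buffer never fills.
	q.buffer[b % CAP] = t;
	std::atomic_thread_fence(std::memory_order_release);
	q.bottom.store(b + 1, std::memory_order_relaxed);
}

// Owner only: pop from the bottom; a CAS on top arbitrates the final element.
bool take(Deque &q, Task *out) {
	ptrdiff_t b = q.bottom.load(std::memory_order_relaxed) - 1;
	q.bottom.store(b, std::memory_order_relaxed);
	std::atomic_thread_fence(std::memory_order_seq_cst);
	ptrdiff_t t = q.top.load(std::memory_order_relaxed);
	if (t > b) { // deque was already empty; undo the decrement
		q.bottom.store(b + 1, std::memory_order_relaxed);
		return false;
	}
	*out = q.buffer[b % CAP];
	if (t == b) { // last element: race against thieves for it
		bool won = q.top.compare_exchange_strong(t, t + 1,
				std::memory_order_seq_cst, std::memory_order_relaxed);
		q.bottom.store(b + 1, std::memory_order_relaxed);
		return won;
	}
	return true;
}

// Any other worker: steal from the top; a failed CAS means someone else won the slot.
bool steal(Deque &q, Task *out) {
	ptrdiff_t t = q.top.load(std::memory_order_acquire);
	std::atomic_thread_fence(std::memory_order_seq_cst);
	ptrdiff_t b = q.bottom.load(std::memory_order_acquire);
	if (t >= b) return false; // nothing visible to steal
	*out = q.buffer[t % CAP];
	return q.top.compare_exchange_strong(t, t + 1,
			std::memory_order_seq_cst, std::memory_order_relaxed);
}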
gingerBill, 1 year ago
commit eff46c2e5f

2 changed files with 114 additions and 49 deletions
  1. src/thread_pool.cpp  +85 -39
  2. src/threading.cpp    +29 -10

+ 85 - 39
src/thread_pool.cpp

@@ -10,13 +10,18 @@ gb_internal void thread_pool_destroy(ThreadPool *pool);
 gb_internal bool thread_pool_add_task(ThreadPool *pool, WorkerTaskProc *proc, void *data);
 gb_internal void thread_pool_wait(ThreadPool *pool);

+enum GrabState {
+	GrabSuccess = 0,
+	GrabEmpty   = 1,
+	GrabFailed  = 2,
+};
+
 struct ThreadPool {
 	gbAllocator threads_allocator;
 	Slice<Thread> threads;
 	std::atomic<bool> running;

 	Futex tasks_available;
-
 	Futex tasks_left;
 };

@@ -46,7 +51,7 @@ gb_internal void thread_pool_destroy(ThreadPool *pool) {

 	for_array_off(i, 1, pool->threads) {
 		Thread *t = &pool->threads[i];
-		pool->tasks_available.fetch_add(1, std::memory_order_relaxed);
+		pool->tasks_available.fetch_add(1, std::memory_order_acquire);
 		futex_broadcast(&pool->tasks_available);
 		thread_join_and_destroy(t);
 	}
@@ -54,51 +59,86 @@ gb_internal void thread_pool_destroy(ThreadPool *pool) {
 	gb_free(pool->threads_allocator, pool->threads.data);
 }

-void thread_pool_queue_push(Thread *thread, WorkerTask task) {
-	u64 capture;
-	u64 new_capture;
-	do {
-		capture = thread->head_and_tail.load();
-
-		u64 mask = thread->capacity - 1;
-		u64 head = (capture >> 32) & mask;
-		u64 tail = ((u32)capture) & mask;
+TaskRingBuffer *taskring_grow(TaskRingBuffer *ring, isize bottom, isize top) {
+	TaskRingBuffer *new_ring = taskring_init(ring->size * 2);
+	for (isize i = top; i < bottom; i++) {
+		new_ring->buffer[i % new_ring->size] = ring->buffer[i % ring->size];
+	}
+	return new_ring;
+}

-		u64 new_head = (head + 1) & mask;
-		GB_ASSERT_MSG(new_head != tail, "Thread Queue Full!");
+void thread_pool_queue_push(Thread *thread, WorkerTask task) {
+	isize bot                = thread->queue.bottom.load(std::memory_order_relaxed);
+	isize top                = thread->queue.top.load(std::memory_order_acquire);
+	TaskRingBuffer *cur_ring   = thread->queue.ring.load(std::memory_order_relaxed);
+
+	isize size = bot - top;
+	if (size > (cur_ring->size - 1)) {
+		// Queue is full
+		thread->queue.ring = taskring_grow(thread->queue.ring, bot, top);
+		cur_ring = thread->queue.ring.load(std::memory_order_relaxed);
+	}

-		// This *must* be done in here, to avoid a potential race condition where we no longer own the slot by the time we're assigning
-		thread->queue[head] = task;
-		new_capture = (new_head << 32) | tail;
-	} while (!thread->head_and_tail.compare_exchange_weak(capture, new_capture));
+	cur_ring->buffer[bot % cur_ring->size] = task;
+	std::atomic_thread_fence(std::memory_order_release);
+	thread->queue.bottom.store(bot + 1, std::memory_order_relaxed);

 	thread->pool->tasks_left.fetch_add(1, std::memory_order_release);
 	thread->pool->tasks_available.fetch_add(1, std::memory_order_relaxed);
 	futex_broadcast(&thread->pool->tasks_available);
 }

-bool thread_pool_queue_pop(Thread *thread, WorkerTask *task) {
-	u64 capture;
-	u64 new_capture;
-	do {
-		capture = thread->head_and_tail.load(std::memory_order_acquire);
-
-		u64 mask = thread->capacity - 1;
-		u64 head = (capture >> 32) & mask;
-		u64 tail = ((u32)capture) & mask;
+GrabState thread_pool_queue_take(Thread *thread, WorkerTask *task) {
+	isize bot = thread->queue.bottom.load(std::memory_order_relaxed) - 1;
+	TaskRingBuffer *cur_ring = thread->queue.ring.load(std::memory_order_relaxed);
+	thread->queue.bottom.store(bot, std::memory_order_relaxed);
+	std::atomic_thread_fence(std::memory_order_seq_cst);
+
+	isize top = thread->queue.top.load(std::memory_order_relaxed);
+	if (top <= bot) {
+
+		// Queue is not empty
+		*task = cur_ring->buffer[bot % cur_ring->size];
+		if (top == bot) {
+			// Only one entry left in queue
+			if (!thread->queue.top.compare_exchange_strong(top, top + 1, std::memory_order_seq_cst, std::memory_order_relaxed)) {
+				// Race failed
+				thread->queue.bottom.store(bot + 1, std::memory_order_relaxed);
+				return GrabEmpty;
+			}

-		u64 new_tail = (tail + 1) & mask;
-		if (tail == head) {
-			return false;
+			thread->queue.bottom.store(bot + 1, std::memory_order_relaxed);
+			return GrabSuccess;
 		}

-		// Making a copy of the task before we increment the tail, avoiding the same potential race condition as above
-		*task = thread->queue[tail];
-
-		new_capture = (head << 32) | new_tail;
-	} while (!thread->head_and_tail.compare_exchange_weak(capture, new_capture, std::memory_order_release));
+		// We got a task without hitting a race
+		return GrabSuccess;
+	} else {
+		// Queue is empty
+		thread->queue.bottom.store(bot + 1, std::memory_order_relaxed);
+		return GrabEmpty;
+	}
+}

-	return true;
+GrabState thread_pool_queue_steal(Thread *thread, WorkerTask *task) {
+	isize top = thread->queue.top.load(std::memory_order_acquire);
+	std::atomic_thread_fence(std::memory_order_seq_cst);
+	isize bot = thread->queue.bottom.load(std::memory_order_acquire);
+
+	GrabState ret = GrabEmpty;
+	if (top < bot) {
+		// Queue is not empty
+		TaskRingBuffer *cur_ring = thread->queue.ring.load(std::memory_order_consume);
+		*task = cur_ring->buffer[top % cur_ring->size];
+
+		if (!thread->queue.top.compare_exchange_strong(top, top + 1, std::memory_order_seq_cst, std::memory_order_relaxed)) {
+			// Race failed
+			ret = GrabFailed;
+		} else {
+			ret = GrabSuccess;
+		}
+	}
+	return ret;
 }

 gb_internal bool thread_pool_add_task(ThreadPool *pool, WorkerTaskProc *proc, void *data) {
@@ -115,12 +155,11 @@ gb_internal void thread_pool_wait(ThreadPool *pool) {

 	while (pool->tasks_left.load(std::memory_order_acquire)) {
 		// if we've got tasks on our queue, run them
-		while (thread_pool_queue_pop(current_thread, &task)) {
+		while (!thread_pool_queue_take(current_thread, &task)) {
 			task.do_work(task.data);
 			pool->tasks_left.fetch_sub(1, std::memory_order_release);
 		}

-
 		// is this mem-barriered enough?
 		// This *must* be executed in this order, so the futex wakes immediately
 		// if rem_tasks has changed since we checked last, otherwise the program
@@ -145,7 +184,7 @@ gb_internal THREAD_PROC(thread_pool_thread_proc) {
 		usize finished_tasks = 0;
 		i32 state;

-		while (thread_pool_queue_pop(current_thread, &task)) {
+		while (!thread_pool_queue_take(current_thread, &task)) {
 			task.do_work(task.data);
 			pool->tasks_left.fetch_sub(1, std::memory_order_release);

@@ -167,7 +206,13 @@ gb_internal THREAD_PROC(thread_pool_thread_proc) {

 				Thread *thread = &pool->threads.data[idx];
 				WorkerTask task;
-				if (thread_pool_queue_pop(thread, &task)) {
+
+				GrabState ret = thread_pool_queue_steal(thread, &task);
+				if (ret == GrabFailed) {
+					goto main_loop_continue;
+				} else if (ret == GrabEmpty) {
+					continue;
+				} else if (ret == GrabSuccess) {
 					task.do_work(task.data);
 					pool->tasks_left.fetch_sub(1, std::memory_order_release);

@@ -182,6 +227,7 @@ gb_internal THREAD_PROC(thread_pool_thread_proc) {

 		// if we've done all our work, and there's nothing to steal, go to sleep
 		state = pool->tasks_available.load(std::memory_order_acquire);
+		if (!pool->running) { break; }
 		futex_wait(&pool->tasks_available, state);

 		main_loop_continue:;
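
One detail worth noting in the worker loop above: thread_pool_queue_steal distinguishes an empty victim queue (GrabEmpty, try the next thread) from a lost compare-exchange race (GrabFailed, jump back to main_loop_continue), so a worker does not go to sleep merely because it lost a race for a task that still exists. Below is a simplified restatement of that dispatch with the goto replaced by a loop, for readability only; steal_and_run and self_idx are hypothetical names, while thread_pool_queue_steal, the GrabState values, and tasks_left come from the diff above.

// Returns true once a stolen task has been run, false if every victim queue
// looked empty and the worker may consider sleeping.
bool steal_and_run(ThreadPool *pool, isize self_idx) {
	for (;;) {
		bool raced = false;
		for (isize i = 0; i < pool->threads.count; i++) {
			if (i == self_idx) continue;
			WorkerTask task;
			switch (thread_pool_queue_steal(&pool->threads[i], &task)) {
			case GrabSuccess:
				task.do_work(task.data);
				pool->tasks_left.fetch_sub(1, std::memory_order_release);
				return true;
			case GrabFailed:
				raced = true;   // lost the CAS; in the committed code this restarts the main loop
				break;
			case GrabEmpty:
				break;          // this victim had nothing; move on to the next one
			}
		}
		if (!raced) return false;   // every queue looked empty; safe to back off
	}
}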

+ 29 - 10
src/threading.cpp

@@ -46,6 +46,18 @@ typedef struct WorkerTask {
 	void           *data;
 } WorkerTask;

+typedef struct TaskRingBuffer {
+	std::atomic<isize> size;
+	std::atomic<WorkerTask *> buffer;
+} TaskRingBuffer;
+
+typedef struct TaskQueue {
+	std::atomic<isize> top;
+	std::atomic<isize> bottom;
+
+	std::atomic<TaskRingBuffer *> ring;
+} TaskQueue;
+
 struct Thread {
 #if defined(GB_SYSTEM_WINDOWS)
 	void *win32_handle;
@@ -54,12 +66,9 @@ struct Thread {
 #endif

 	isize idx;
-
-	WorkerTask *queue;
-	size_t capacity;
-	std::atomic<uint64_t> head_and_tail;
-
 	isize  stack_size;
+
+	struct TaskQueue queue;
 	struct ThreadPool *pool;
 };

@@ -551,6 +560,18 @@ gb_internal void *internal_thread_proc(void *arg) {
 }
 #endif

+TaskRingBuffer *taskring_init(isize size) {
+	TaskRingBuffer *ring = (TaskRingBuffer *)gb_alloc(heap_allocator(), sizeof(TaskRingBuffer));
+	ring->size = size;
+	ring->buffer = (WorkerTask *)gb_alloc_array(heap_allocator(), WorkerTask, ring->size);
+	return ring;
+}
+
+void thread_queue_destroy(TaskQueue *q) {
+	gb_free(heap_allocator(), (*q->ring).buffer);
+	gb_free(heap_allocator(), q->ring);
+}
+
 gb_internal void thread_init(ThreadPool *pool, Thread *t, isize idx) {
 	gb_zero_item(t);
 #if defined(GB_SYSTEM_WINDOWS)
@@ -559,14 +580,12 @@ gb_internal void thread_init(ThreadPool *pool, Thread *t, isize idx) {
 	t->posix_handle = 0;
 #endif

-	t->capacity = 1 << 14; // must be a power of 2
-	t->queue = gb_alloc_array(heap_allocator(), WorkerTask, t->capacity);
-	t->head_and_tail = 0;
+	// Size must be a power of 2
+	t->queue.ring = taskring_init(1 << 14);
 	t->pool = pool;
 	t->idx = idx;
 }

-
 gb_internal void thread_init_and_start(ThreadPool *pool, Thread *t, isize idx) {
 	thread_init(pool, t, idx);
 	isize stack_size = 0;
@@ -598,7 +617,7 @@ gb_internal void thread_join_and_destroy(Thread *t) {
 	t->posix_handle = 0;
 #endif

-	gb_free(heap_allocator(), t->queue);
+	thread_queue_destroy(&t->queue);
 }

 gb_internal void thread_set_name(Thread *t, char const *name) {