
QuadTree / FixedSizeFreeList: Reorder variable layout to reduce false sharing & thread syncs (#1136)

* FixedSizeFreeList: Reorder variable layout to reduce false sharing/syncs

Currently, the bookkeeping variables used in `Get()` calls reside in
the same cache line as the atomic variables which are modified upon
allocation. If one thread allocates a new object and another thread
calls `Get()`, a cross-thread cache sync has to occur, causing a stall
and reducing performance.

By reordering variables in the allocator, we can move the variables used
in `Get()` to a different cache line, so threads don't need to sync
for object reads after an allocation occurs.
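To make the layout idea concrete outside the diff: below is a minimal, hypothetical sketch of the technique. The field names echo Jolt's but this is not Jolt's actual class, and standard C++'s std::hardware_destructive_interference_size (C++17) stands in for Jolt's JPH_CACHE_LINE_SIZE.

    #include <atomic>
    #include <cstdint>
    #include <new> // std::hardware_destructive_interference_size

    struct FreeListLayoutSketch
    {
        // Read-mostly constants that Get() uses to index into the page table.
        // They share a cache line with each other, which is harmless: nothing
        // writes to them after construction.
        uint32_t                mPageShift;
        uint32_t                mObjectMask;
        void **                 mPages = nullptr;

        // Write-hot allocation state starts on a fresh cache line, so stores
        // here no longer invalidate the line holding the fields above.
        alignas(std::hardware_destructive_interference_size)
        std::atomic<uint64_t>   mFirstFreeObjectAndTag { 0 };
        std::atomic<uint32_t>   mFirstFreeObjectInNewPage { 0 };
    };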

* QuadTree: Reorder variable layout to reduce false sharing/thread syncs

Testing across thread counts on modern desktop x86 processors using the built-in PerformanceTest (compiled with LLVM 17 via Clang-CL on Windows), this seems to give around a 5% average improvement in the general case. On other, heavily memory-bound workloads I've tested, I've seen improvements of up to 15% at specific thread counts. This has also been tested on Apple silicon with no regressions found.
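A cheap way to keep a layout change like this from silently regressing (a suggestion, not something this commit adds) is a compile-time check that the read path and the write path really land on different cache lines. A sketch on a simplified stand-in struct, not Jolt's class:

    #include <atomic>
    #include <cstddef> // offsetof
    #include <cstdint>
    #include <new>     // std::hardware_destructive_interference_size

    struct LayoutGuardSketch
    {
        uint32_t                mPageShift; // read by Get()
        alignas(std::hardware_destructive_interference_size)
        std::atomic<uint64_t>   mFirstFreeObjectAndTag; // written on alloc/free
    };

    // If someone reorders the members back together, this fails to compile.
    static_assert(offsetof(LayoutGuardSketch, mFirstFreeObjectAndTag)
                      - offsetof(LayoutGuardSketch, mPageShift)
                      >= std::hardware_destructive_interference_size,
                  "allocation state shares a cache line with Get()'s fields");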
Abel Briggs, 1 year ago
Parent commit: d3c4e3058a

2 changed files with 36 additions and 32 deletions:

  1. Jolt/Core/FixedSizeFreeList.h (+17, -15)
  2. Jolt/Physics/Collision/BroadPhase/QuadTree.h (+19, -17)

Jolt/Core/FixedSizeFreeList.h (+17, -15)

@@ -34,17 +34,6 @@ private:
 	const ObjectStorage &	GetStorage(uint32 inObjectIndex) const	{ return mPages[inObjectIndex >> mPageShift][inObjectIndex & mObjectMask]; }
 	ObjectStorage &			GetStorage(uint32 inObjectIndex)		{ return mPages[inObjectIndex >> mPageShift][inObjectIndex & mObjectMask]; }
 
-	/// Number of objects that we currently have in the free list / new pages
-#ifdef JPH_ENABLE_ASSERTS
-	atomic<uint32>			mNumFreeObjects;
-#endif // JPH_ENABLE_ASSERTS
-
-	/// Simple counter that makes the first free object pointer update with every CAS so that we don't suffer from the ABA problem
-	atomic<uint32>			mAllocationTag;
-
-	/// Index of first free object, the first 32 bits of an object are used to point to the next free object
-	atomic<uint64>			mFirstFreeObjectAndTag;
-
 	/// Size (in objects) of a single page
 	uint32					mPageSize;
 
@@ -60,14 +49,27 @@ private:
 	/// Total number of objects that have been allocated
 	uint32					mNumObjectsAllocated;
 
-	/// The first free object to use when the free list is empty (may need to allocate a new page)
-	atomic<uint32>			mFirstFreeObjectInNewPage;
-
 	/// Array of pages of objects
 	ObjectStorage **		mPages = nullptr;
 
 	/// Mutex that is used to allocate a new page if the storage runs out
-	Mutex					mPageMutex;
+	/// This variable is aligned to the cache line to prevent false sharing with
+	/// the constants used to index into the list via `Get()`.
+	alignas(JPH_CACHE_LINE_SIZE) Mutex mPageMutex;
+
+	/// Number of objects that we currently have in the free list / new pages
+#ifdef JPH_ENABLE_ASSERTS
+	atomic<uint32>			mNumFreeObjects;
+#endif // JPH_ENABLE_ASSERTS
+
+	/// Simple counter that makes the first free object pointer update with every CAS so that we don't suffer from the ABA problem
+	atomic<uint32>			mAllocationTag;
+
+	/// Index of first free object, the first 32 bits of an object are used to point to the next free object
+	atomic<uint64>			mFirstFreeObjectAndTag;
+
+	/// The first free object to use when the free list is empty (may need to allocate a new page)
+	atomic<uint32>			mFirstFreeObjectInNewPage;
 
 public:
 	/// Invalid index

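For context on the mFirstFreeObjectAndTag / mAllocationTag pair being moved above: a minimal sketch of the tagged CAS idea the comments describe. This illustrates the technique, it is not Jolt's actual implementation; the inNextFreeObject array is a stand-in for the next-free links Jolt stores inside the objects themselves.

    #include <atomic>
    #include <cstdint>

    static constexpr uint32_t cInvalidIndex = 0xffffffff;

    // Pop the first free object. The low 32 bits of the head hold the object
    // index, the high 32 bits hold a tag that changes on every successful CAS,
    // so a recycled index can't be mistaken for an unchanged list (the ABA
    // problem).
    uint32_t PopFreeObject(std::atomic<uint64_t> &ioFirstFreeObjectAndTag,
                           std::atomic<uint32_t> &ioAllocationTag,
                           const uint32_t *inNextFreeObject)
    {
        uint64_t old_head = ioFirstFreeObjectAndTag.load(std::memory_order_acquire);
        for (;;)
        {
            uint32_t first_free = uint32_t(old_head); // low 32 bits: object index
            if (first_free == cInvalidIndex)
                return cInvalidIndex; // free list is empty

            // New head: successor in the low bits, fresh tag in the high bits
            uint64_t new_head = uint64_t(inNextFreeObject[first_free])
                              | (uint64_t(ioAllocationTag.fetch_add(1, std::memory_order_relaxed)) << 32);

            if (ioFirstFreeObjectAndTag.compare_exchange_weak(old_head, new_head, std::memory_order_acquire))
                return first_free; // claimed this object
            // CAS failed: old_head was refreshed with the current value, retry
        }
    }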
Jolt/Physics/Collision/BroadPhase/QuadTree.h (+19, -17)

@@ -325,6 +325,25 @@ private:
 	void						DumpTree(const NodeID &inRoot, const char *inFileNamePrefix) const;
 #endif
 
+	/// Allocator that controls adding / freeing nodes
+	Allocator *					mAllocator = nullptr;
+
+	/// This is a list of nodes that must be deleted after the trees are swapped and the old tree is no longer in use
+	Allocator::Batch			mFreeNodeBatch;
+
+	/// Number of bodies currently in the tree
+	/// This is aligned to be in a different cache line from the `Allocator` pointer to prevent cross-thread syncs
+	/// when reading nodes.
+	alignas(JPH_CACHE_LINE_SIZE) atomic<uint32> mNumBodies { 0 };
+
+	/// We alternate between two tree root nodes. When updating, we activate the new tree and we keep the old tree alive
+	/// for queries that are in progress until the next time DiscardOldTree() is called.
+	RootNode					mRootNode[2];
+	atomic<uint32>				mRootNodeIndex { 0 };
+
+	/// Flag to keep track of changes to the broadphase, if false, we don't need to UpdatePrepare/Finalize()
+	atomic<bool>				mIsDirty = false;
+
 #ifdef JPH_TRACK_BROADPHASE_STATS
 	/// Mutex protecting the various LayerToStats members
 	mutable Mutex				mStatsMutex;
@@ -366,23 +385,6 @@ private:
 	/// Name of this tree for debugging purposes
 	const char *				mName = "Layer";
 #endif // JPH_EXTERNAL_PROFILE || JPH_PROFILE_ENABLED
-
-	/// Number of bodies currently in the tree
-	atomic<uint32>				mNumBodies { 0 };
-
-	/// We alternate between two tree root nodes. When updating, we activate the new tree and we keep the old tree alive
-	/// for queries that are in progress until the next time DiscardOldTree() is called.
-	RootNode					mRootNode[2];
-	atomic<uint32>				mRootNodeIndex { 0 };
-
-	/// Allocator that controls adding / freeing nodes
-	Allocator *					mAllocator = nullptr;
-
-	/// This is a list of nodes that must be deleted after the trees are swapped and the old tree is no longer in use
-	Allocator::Batch			mFreeNodeBatch;
-
-	/// Flag to keep track of changes to the broadphase, if false, we don't need to UpdatePrepare/Finalize()
-	atomic<bool>				mIsDirty = false;
 };
 
 JPH_NAMESPACE_END
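Finally, a minimal sketch (again not Jolt's actual code) of the two-root scheme that the moved mRootNode[2] / mRootNodeIndex members implement: writers publish a new root by flipping an atomic index, while in-flight queries keep using the root they already grabbed.

    #include <atomic>
    #include <cstdint>

    struct TwoRootSketch
    {
        struct RootNode { /* root of one tree version */ };

        RootNode              mRootNode[2];
        std::atomic<uint32_t> mRootNodeIndex { 0 };

        // Queries snapshot the active root; a root that was active when a
        // query started stays valid until the next DiscardOldTree()-style
        // cleanup, so readers never take a lock.
        const RootNode &      GetCurrentRoot() const
        {
            return mRootNode[mRootNodeIndex.load(std::memory_order_acquire) & 1];
        }

        // Update step: build into the inactive root, then flip the index so
        // new queries see it while in-flight queries keep the old one.
        void                  PublishNewRoot()
        {
            mRootNodeIndex.fetch_add(1, std::memory_order_release);
        }
    };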