
Relaxed memory ordering to improve performance on ARM (#148)

* Implemented memory order for RefTarget class
* AtomicMin/Max: Read can be relaxed
* JobSystem: Made compare exchange weak and added memory ordering arguments
* Memory order for FixedSizeFreeList
Jorrit Rouwe · 3 years ago · commit 305fba2a99
5 changed files with 70 additions and 46 deletions
  1. Jolt/Core/Atomics.h (+2 -2)
  2. Jolt/Core/FixedSizeFreeList.inl (+25 -24)
  3. Jolt/Core/JobSystem.h (+22 -10)
  4. Jolt/Core/JobSystem.inl (+2 -2)
  5. Jolt/Core/Reference.h (+19 -8)

+ 2 - 2
Jolt/Core/Atomics.h

@@ -13,7 +13,7 @@ JPH_NAMESPACE_BEGIN
 template <class T>
 bool AtomicMin(atomic<T> &ioAtomic, const T inValue, const memory_order inMemoryOrder = memory_order_seq_cst)
 {
-	T cur_value = ioAtomic;
+	T cur_value = ioAtomic.load(memory_order_relaxed);
 	while (cur_value > inValue)
 		if (ioAtomic.compare_exchange_weak(cur_value, inValue, inMemoryOrder))
 			return true;
@@ -24,7 +24,7 @@ bool AtomicMin(atomic<T> &ioAtomic, const T inValue, const memory_order inMemory
 template <class T>
 bool AtomicMax(atomic<T> &ioAtomic, const T inValue, const memory_order inMemoryOrder = memory_order_seq_cst)
 {
-	T cur_value = ioAtomic;
+	T cur_value = ioAtomic.load(memory_order_relaxed);
 	while (cur_value < inValue)
 		if (ioAtomic.compare_exchange_weak(cur_value, inValue, inMemoryOrder))
 			return true;
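
The initial read can be relaxed because it is only a starting guess: compare_exchange_weak reloads the current value into cur_value on every failed attempt, so a stale first read costs at most one extra loop iteration. A minimal usage sketch, assuming Jolt/Jolt.h and Jolt/Core/Atomics.h are on the include path (the variable and function below are made up for illustration):

    #include <atomic>
    #include <cfloat>

    #include <Jolt/Jolt.h>
    #include <Jolt/Core/Atomics.h>

    // Hypothetical example: track the smallest contact distance seen by any worker thread
    std::atomic<float> sMinDistance { FLT_MAX };

    void RecordDistance(float inDistance)
    {
        // Only the initial read inside AtomicMin is relaxed (the change above);
        // the compare-exchange itself still defaults to memory_order_seq_cst
        JPH::AtomicMin(sMinDistance, inDistance);
    }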

+ 25 - 24
Jolt/Core/FixedSizeFreeList.inl

@@ -7,7 +7,7 @@ template <typename Object>
 FixedSizeFreeList<Object>::~FixedSizeFreeList()
 {
 	// Ensure everything is freed before the freelist is destructed
-	JPH_ASSERT(mNumFreeObjects == mNumPages * mPageSize);
+	JPH_ASSERT(mNumFreeObjects.load(memory_order_relaxed) == mNumPages * mPageSize);
 
 	// Free memory for pages
 	uint32 num_pages = mNumObjectsAllocated / mPageSize;
@@ -51,12 +51,12 @@ uint32 FixedSizeFreeList<Object>::ConstructObject(Parameters &&... inParameters)
 	for (;;)
 	{
 		// Get first object from the linked list
-		uint64 first_free_object_and_tag = mFirstFreeObjectAndTag;
+		uint64 first_free_object_and_tag = mFirstFreeObjectAndTag.load(memory_order_acquire);
 		uint32 first_free = uint32(first_free_object_and_tag);
 		if (first_free == cInvalidObjectIndex)
 		{
 			// The free list is empty, we take an object from the page that has never been used before
-			first_free = mFirstFreeObjectInNewPage++;
+			first_free = mFirstFreeObjectInNewPage.fetch_add(1, memory_order_relaxed);
 			if (first_free >= mNumObjectsAllocated)
 			{
 				// Allocate new page
@@ -72,28 +72,28 @@ uint32 FixedSizeFreeList<Object>::ConstructObject(Parameters &&... inParameters)
 			}
 
 			// Allocation successful
-			JPH_IF_ENABLE_ASSERTS(--mNumFreeObjects;)
+			JPH_IF_ENABLE_ASSERTS(mNumFreeObjects.fetch_sub(1, memory_order_relaxed);)
 			ObjectStorage &storage = GetStorage(first_free);
 			new (&storage.mData) Object(forward<Parameters>(inParameters)...);
-			storage.mNextFreeObject = first_free;
+			storage.mNextFreeObject.store(first_free, memory_order_release);
 			return first_free;
 		}
 		else
 		{
 			// Load next pointer
-			uint32 new_first_free = GetStorage(first_free).mNextFreeObject;
+			uint32 new_first_free = GetStorage(first_free).mNextFreeObject.load(memory_order_acquire);
 
 			// Construct a new first free object tag
-			uint64 new_first_free_object_and_tag = uint64(new_first_free) + (uint64(mAllocationTag++) << 32);
+			uint64 new_first_free_object_and_tag = uint64(new_first_free) + (uint64(mAllocationTag.fetch_add(1, memory_order_relaxed)) << 32);
 
 			// Compare and swap
-			if (mFirstFreeObjectAndTag.compare_exchange_strong(first_free_object_and_tag, new_first_free_object_and_tag))
+			if (mFirstFreeObjectAndTag.compare_exchange_weak(first_free_object_and_tag, new_first_free_object_and_tag, memory_order_release))
 			{
 				// Allocation successful
-				JPH_IF_ENABLE_ASSERTS(--mNumFreeObjects;)
+				JPH_IF_ENABLE_ASSERTS(mNumFreeObjects.fetch_sub(1, memory_order_relaxed);)
 				ObjectStorage &storage = GetStorage(first_free);
 				new (&storage.mData) Object(forward<Parameters>(inParameters)...);
-				storage.mNextFreeObject = first_free;
+				storage.mNextFreeObject.store(first_free, memory_order_release);
 				return first_free;
 			}
 		}
@@ -103,14 +103,14 @@ uint32 FixedSizeFreeList<Object>::ConstructObject(Parameters &&... inParameters)
 template <typename Object>
 void FixedSizeFreeList<Object>::AddObjectToBatch(Batch &ioBatch, uint32 inObjectIndex)
 {
-	JPH_ASSERT(GetStorage(inObjectIndex).mNextFreeObject == inObjectIndex, "Trying to add an object to the batch that is already in a free list");
+	JPH_ASSERT(GetStorage(inObjectIndex).mNextFreeObject.load(memory_order_relaxed) == inObjectIndex, "Trying to add an object to the batch that is already in a free list");
 	JPH_ASSERT(ioBatch.mNumObjects != uint32(-1), "Trying to reuse a batch that has already been freed");
 
 	// Link object in batch to free
 	if (ioBatch.mFirstObjectIndex == cInvalidObjectIndex)
 		ioBatch.mFirstObjectIndex = inObjectIndex;
 	else
-		GetStorage(ioBatch.mLastObjectIndex).mNextFreeObject = inObjectIndex;
+		GetStorage(ioBatch.mLastObjectIndex).mNextFreeObject.store(inObjectIndex, memory_order_release);
 	ioBatch.mLastObjectIndex = inObjectIndex;
 	ioBatch.mNumObjects++;
 }
@@ -128,29 +128,30 @@ void FixedSizeFreeList<Object>::DestructObjectBatch(Batch &ioBatch)
 			{
 				ObjectStorage &storage = GetStorage(object_idx);
 				reinterpret_cast<Object &>(storage.mData).~Object();
-				object_idx = storage.mNextFreeObject;
+				object_idx = storage.mNextFreeObject.load(memory_order_relaxed);
 			}
 			while (object_idx != cInvalidObjectIndex);
 		}
 
 		// Add to objects free list
+		ObjectStorage &storage = GetStorage(ioBatch.mLastObjectIndex);
 		for (;;)
 		{
 			// Get first object from the list
-			uint64 first_free_object_and_tag = mFirstFreeObjectAndTag;
+			uint64 first_free_object_and_tag = mFirstFreeObjectAndTag.load(memory_order_acquire);
 			uint32 first_free = uint32(first_free_object_and_tag);
 
 			// Make it the next pointer of the last object in the batch that is to be freed
-			GetStorage(ioBatch.mLastObjectIndex).mNextFreeObject = first_free;
+			storage.mNextFreeObject.store(first_free, memory_order_release);
 
 			// Construct a new first free object tag
-			uint64 new_first_free_object_and_tag = uint64(ioBatch.mFirstObjectIndex) + (uint64(mAllocationTag++) << 32);
+			uint64 new_first_free_object_and_tag = uint64(ioBatch.mFirstObjectIndex) + (uint64(mAllocationTag.fetch_add(1, memory_order_relaxed)) << 32);
 
 			// Compare and swap
-			if (mFirstFreeObjectAndTag.compare_exchange_strong(first_free_object_and_tag, new_first_free_object_and_tag))
+			if (mFirstFreeObjectAndTag.compare_exchange_weak(first_free_object_and_tag, new_first_free_object_and_tag, memory_order_release))
 			{
 				// Free successful
-				JPH_IF_ENABLE_ASSERTS(mNumFreeObjects += ioBatch.mNumObjects;)
+				JPH_IF_ENABLE_ASSERTS(mNumFreeObjects.fetch_add(ioBatch.mNumObjects, memory_order_relaxed);)
 
 				// Mark the batch as freed
 #ifdef JPH_ENABLE_ASSERTS
@@ -175,20 +176,20 @@ void FixedSizeFreeList<Object>::DestructObject(uint32 inObjectIndex)
 	for (;;)
 	{
 		// Get first object from the list
-		uint64 first_free_object_and_tag = mFirstFreeObjectAndTag;
+		uint64 first_free_object_and_tag = mFirstFreeObjectAndTag.load(memory_order_acquire);
 		uint32 first_free = uint32(first_free_object_and_tag);
 
 		// Make it the next pointer of the last object in the batch that is to be freed
-		storage.mNextFreeObject = first_free;
+		storage.mNextFreeObject.store(first_free, memory_order_release);
 
 		// Construct a new first free object tag
-		uint64 new_first_free_object_and_tag = uint64(inObjectIndex) + (uint64(mAllocationTag++) << 32);
+		uint64 new_first_free_object_and_tag = uint64(inObjectIndex) + (uint64(mAllocationTag.fetch_add(1, memory_order_relaxed)) << 32);
 
 		// Compare and swap
-		if (mFirstFreeObjectAndTag.compare_exchange_strong(first_free_object_and_tag, new_first_free_object_and_tag))
+		if (mFirstFreeObjectAndTag.compare_exchange_weak(first_free_object_and_tag, new_first_free_object_and_tag, memory_order_release))
 		{
 			// Free successful
-			JPH_IF_ENABLE_ASSERTS(mNumFreeObjects++;)
+			JPH_IF_ENABLE_ASSERTS(mNumFreeObjects.fetch_add(1, memory_order_relaxed);)
 			return;
 		}
 	}
@@ -197,7 +198,7 @@ void FixedSizeFreeList<Object>::DestructObject(uint32 inObjectIndex)
 template<typename Object>
 inline void FixedSizeFreeList<Object>::DestructObject(Object *inObject)
 {
-	uint32 index = reinterpret_cast<ObjectStorage *>(inObject)->mNextFreeObject;
+	uint32 index = reinterpret_cast<ObjectStorage *>(inObject)->mNextFreeObject.load(memory_order_relaxed);
 	JPH_ASSERT(index < mNumObjectsAllocated);
 	DestructObject(index);
 }
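
The pattern here: the acquire load of mFirstFreeObjectAndTag pairs with the release CAS, so the thread that pops an object sees the mNextFreeObject value written by whoever freed it; the allocation tag in the upper 32 bits guards against ABA; and compare_exchange_weak is safe because every CAS already sits in a retry loop. A standalone sketch of the same tagged-head idea, simplified and with made-up names (not the Jolt implementation):

    #include <array>
    #include <atomic>
    #include <cstdint>

    // Simplified tagged free-list head: the low 32 bits hold the head index, the high
    // 32 bits a tag that is bumped on every update so that an index that was popped,
    // reused and pushed again does not fool the CAS (the classic ABA problem).
    struct TaggedFreeList
    {
        static constexpr uint32_t cCapacity = 1024;         // arbitrary for the sketch
        static constexpr uint32_t cInvalidIndex = 0xffffffff;

        std::array<std::atomic<uint32_t>, cCapacity> mNextFree { };
        std::atomic<uint64_t> mHeadAndTag { uint64_t(cInvalidIndex) };
        std::atomic<uint32_t> mTag { 1 };

        bool TryPop(uint32_t &outIndex)
        {
            // Acquire pairs with the release CAS in Push, making mNextFree[head] visible
            uint64_t head_and_tag = mHeadAndTag.load(std::memory_order_acquire);
            for (;;)
            {
                uint32_t head = uint32_t(head_and_tag);
                if (head == cInvalidIndex)
                    return false;
                uint32_t next = mNextFree[head].load(std::memory_order_acquire);
                uint64_t new_head_and_tag = uint64_t(next) | (uint64_t(mTag.fetch_add(1, std::memory_order_relaxed)) << 32);
                // Weak CAS is fine inside a retry loop: a spurious failure just goes
                // around again, and on a real failure head_and_tag is refreshed
                if (mHeadAndTag.compare_exchange_weak(head_and_tag, new_head_and_tag, std::memory_order_release, std::memory_order_acquire))
                {
                    outIndex = head;
                    return true;
                }
            }
        }

        void Push(uint32_t inIndex)
        {
            uint64_t head_and_tag = mHeadAndTag.load(std::memory_order_acquire);
            for (;;)
            {
                // Publish the old head as our next pointer before the release CAS below
                mNextFree[inIndex].store(uint32_t(head_and_tag), std::memory_order_release);
                uint64_t new_head_and_tag = uint64_t(inIndex) | (uint64_t(mTag.fetch_add(1, std::memory_order_relaxed)) << 32);
                if (mHeadAndTag.compare_exchange_weak(head_and_tag, new_head_and_tag, std::memory_order_release, std::memory_order_acquire))
                    return;
            }
        }
    };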

+ 22 - 10
Jolt/Core/JobSystem.h

@@ -157,8 +157,21 @@ protected:
 		inline JobSystem *	GetJobSystem()								{ return mJobSystem; }
 
 		/// Add or release a reference to this object
-		inline void			AddRef() 									{ ++mReferenceCount; }
-		inline void			Release()									{ if (--mReferenceCount == 0) mJobSystem->FreeJob(this); }
+		inline void			AddRef()
+		{
+			// Adding a reference can use relaxed memory ordering
+			mReferenceCount.fetch_add(1, memory_order_relaxed);
+		} 
+		inline void			Release()
+		{
+			// Releasing a reference must use release semantics...
+			if (mReferenceCount.fetch_sub(1, memory_order_release) == 1)
+			{
+				// ... so that we can use acquire to ensure that we see any updates from other threads that released a ref before freeing the job
+				atomic_thread_fence(memory_order_acquire);
+				mJobSystem->FreeJob(this);
+			}
+		}
 
 		/// Add to the dependency counter. 
 		inline void			AddDependency(int inCount);
@@ -175,7 +188,7 @@ protected:
 		inline bool			SetBarrier(Barrier *inBarrier)
 		{ 
 			intptr_t barrier = 0; 
-			if (mBarrier.compare_exchange_strong(barrier, reinterpret_cast<intptr_t>(inBarrier)))
+			if (mBarrier.compare_exchange_strong(barrier, reinterpret_cast<intptr_t>(inBarrier), memory_order_relaxed))
 				return true; 
 			JPH_ASSERT(barrier == cBarrierDoneState, "A job can only belong to 1 barrier");
 			return false;
@@ -186,7 +199,7 @@ protected:
 		{
 			// Transition job to executing state
 			uint32 state = 0; // We can only start running with a dependency counter of 0
-			if (!mNumDependencies.compare_exchange_strong(state, cExecutingState))
+			if (!mNumDependencies.compare_exchange_strong(state, cExecutingState, memory_order_acquire))
 				return state; // state is updated by compare_exchange_strong to the current value
 
 			// Run the job function
@@ -196,18 +209,17 @@ protected:
 			}
 
 			// Fetch the barrier pointer and exchange it for the done state, so we're sure that no barrier gets set after we want to call the callback
-			intptr_t barrier;
+			intptr_t barrier = mBarrier.load(memory_order_relaxed);
 			for (;;)
 			{
-				barrier = mBarrier;
-				if (mBarrier.compare_exchange_strong(barrier, cBarrierDoneState))
+				if (mBarrier.compare_exchange_weak(barrier, cBarrierDoneState, memory_order_relaxed))
 					break;
 			}
 			JPH_ASSERT(barrier != cBarrierDoneState);
 
 			// Mark job as done
 			state = cExecutingState;
-			mNumDependencies.compare_exchange_strong(state, cDoneState);
+			mNumDependencies.compare_exchange_strong(state, cDoneState, memory_order_relaxed);
 			JPH_ASSERT(state == cExecutingState);
 
 			// Notify the barrier after we've changed the job to the done state so that any thread reading the state after receiving the callback will see that the job has finished
@@ -218,10 +230,10 @@ protected:
 		}
 
 		/// Test if the job can be executed
-		inline bool			CanBeExecuted() const						{ return mNumDependencies == 0; }
+		inline bool			CanBeExecuted() const						{ return mNumDependencies.load(memory_order_relaxed) == 0; }
 
 		/// Test if the job finished executing
-		inline bool			IsDone() const								{ return mNumDependencies == cDoneState; }
+		inline bool			IsDone() const								{ return mNumDependencies.load(memory_order_relaxed) == cDoneState; }
 
 		static constexpr uint32 cExecutingState = 0xe0e0e0e0;			///< Value of mNumDependencies when job is executing
 		static constexpr uint32 cDoneState		= 0xd0d0d0d0;			///< Value of mNumDependencies when job is done executing
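
Two things change in Execute's barrier loop: the expected value is loaded once up front, and the CAS becomes weak. Both rely on the same property of compare_exchange_weak: on failure it writes the currently stored value back into its first argument, so the loop needs no separate reload, and a spurious failure simply retries. A small sketch of the idiom with made-up names (not the Jolt Job class):

    #include <atomic>
    #include <cstdint>

    // Swap whatever barrier pointer is currently stored for a sentinel 'done' value and
    // return the previous value; 'expected' is refreshed by the failed CAS itself
    std::intptr_t ExchangeForDone(std::atomic<std::intptr_t> &ioBarrier, std::intptr_t inDoneState)
    {
        std::intptr_t expected = ioBarrier.load(std::memory_order_relaxed);
        while (!ioBarrier.compare_exchange_weak(expected, inDoneState, std::memory_order_relaxed))
        {
            // The weak variant may fail spuriously; simply retry
        }
        return expected;    // the value that was stored before the job was marked done
    }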

+ 2 - 2
Jolt/Core/JobSystem.inl

@@ -5,13 +5,13 @@ JPH_NAMESPACE_BEGIN
 
 void JobSystem::Job::AddDependency(int inCount)
 {
-	JPH_IF_ENABLE_ASSERTS(uint32 old_value =) mNumDependencies.fetch_add(inCount);
+	JPH_IF_ENABLE_ASSERTS(uint32 old_value =) mNumDependencies.fetch_add(inCount, memory_order_relaxed);
 	JPH_ASSERT(old_value > 0 && old_value != cExecutingState && old_value != cDoneState, "Job is queued, running or done, it is not allowed to add a dependency to a running job");
 }
 
 bool JobSystem::Job::RemoveDependency(int inCount)
 {
-	uint32 old_value = mNumDependencies.fetch_sub(inCount);
+	uint32 old_value = mNumDependencies.fetch_sub(inCount, memory_order_release);
 	JPH_ASSERT(old_value != cExecutingState && old_value != cDoneState, "Job is running or done, it is not allowed to add a dependency to a running job");
 	uint32 new_value = old_value - inCount;
 	JPH_ASSERT(old_value > new_value, "Test wrap around, this is a logic error");
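
The relaxed fetch_add in AddDependency is fine because adding a dependency publishes no data, but RemoveDependency must use release: it pairs with the acquire compare_exchange_strong in Execute, so whatever the last dependency wrote before decrementing the counter is visible to the thread that runs the job. A minimal sketch of that pairing with made-up names (not the Jolt Job class):

    #include <atomic>
    #include <cstdint>

    // Minimal dependency counter: the release decrement in FinishDependency pairs
    // with the acquire CAS in TryExecute, so mPayload is visible to the executor
    struct MiniJob
    {
        static constexpr uint32_t cExecutingState = 0xe0e0e0e0;

        std::atomic<uint32_t> mNumDependencies { 1 };
        int mPayload = 0;                                   // data produced by the dependency

        void FinishDependency(int inResult)
        {
            mPayload = inResult;                                            // plain write...
            mNumDependencies.fetch_sub(1, std::memory_order_release);       // ...published by the release decrement
        }

        bool TryExecute(int &outResult)
        {
            uint32_t expected = 0;                          // can only start with zero dependencies left
            if (!mNumDependencies.compare_exchange_strong(expected, cExecutingState, std::memory_order_acquire))
                return false;
            outResult = mPayload;                           // safe: acquire synchronized with the release above
            return true;
        }
    };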

+ 19 - 8
Jolt/Core/Reference.h

@@ -38,29 +38,40 @@ public:
 	/// Constructor
 	inline					RefTarget() = default;
 	inline					RefTarget(const RefTarget &)					{ /* Do not copy refcount */ }
-	inline					~RefTarget()									{ JPH_ASSERT(mRefCount == 0 || mRefCount == cEmbedded); } ///< assert no one is referencing us
+	inline					~RefTarget()									{ JPH_IF_ENABLE_ASSERTS(uint32 value = mRefCount.load(memory_order_relaxed);) JPH_ASSERT(value == 0 || value == cEmbedded); } ///< assert no one is referencing us
 
 	/// Mark this class as embedded, this means the type can be used in a compound or constructed on the stack.
 	/// The Release function will never destruct the object, it is assumed the destructor will be called by whoever allocated
 	/// the object and at that point in time it is checked that no references are left to the structure.
-	inline void				SetEmbedded()									{ JPH_ASSERT(mRefCount < cEmbedded); mRefCount += cEmbedded; }
+	inline void				SetEmbedded()									{ JPH_IF_ENABLE_ASSERTS(uint32 old = ) mRefCount.fetch_add(cEmbedded, memory_order_relaxed); JPH_ASSERT(old < cEmbedded); }
 
 	/// Assignment operator
 	inline RefTarget &		operator = (const RefTarget &)					{ /* Don't copy refcount */ return *this; }
 
 	/// Get current refcount of this object
-	uint32					GetRefCount() const								{ return mRefCount; }
+	uint32					GetRefCount() const								{ return mRefCount.load(memory_order_relaxed); }
 
 	/// Add or release a reference to this object
-	inline void				AddRef() const									{ ++mRefCount; }
-	inline void				Release() const									{ if (--mRefCount == 0) delete static_cast<const T *>(this); }
+	inline void				AddRef() const
+	{
+		// Adding a reference can use relaxed memory ordering
+		mRefCount.fetch_add(1, memory_order_relaxed);
+	}
+
+	inline void				Release() const
+	{
+		// Releasing a reference must use release semantics...
+		if (mRefCount.fetch_sub(1, memory_order_release) == 1)
+		{
+			// ... so that we can use acquire to ensure that we see any updates from other threads that released a ref before deleting the object
+			atomic_thread_fence(memory_order_acquire);
+			delete static_cast<const T *>(this);
+		}
+	}
 
 	/// INTERNAL HELPER FUNCTION USED BY SERIALIZATION
 	static int				sInternalGetRefCountOffset()					{ return offsetof(T, mRefCount); }
 
-	/// INTERNAL HELPER FUNCTION TO OVERRIDE THE REFCOUNT OF AN OBJECT. USE WITH GREAT CARE! 
-	inline void				SetRefCountInternal(uint32 inRefCount)			{ JPH_ASSERT(inRefCount >= 0); mRefCount = inRefCount; }
-
 protected:
 	static constexpr uint32 cEmbedded = 0x0ebedded;							///< A large value that gets added to the refcount to mark the object as embedded
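
This is the standard split-ordering reference count: increments only need to be atomic, not ordered, because taking a reference publishes nothing by itself; the decrement uses release so that every write to the object happens-before the count reaching zero, and the acquire fence before delete makes those writes visible to the destroying thread. A standalone sketch of the same pattern (simplified; it omits RefTarget's cEmbedded handling and serialization helpers):

    #include <atomic>
    #include <cstdint>

    class MiniRefCounted
    {
    public:
        virtual ~MiniRefCounted() = default;

        void AddRef() const
        {
            // Atomicity is enough here; no other memory needs to be ordered
            mRefCount.fetch_add(1, std::memory_order_relaxed);
        }

        void Release() const
        {
            // Release: all prior writes to the object happen-before the final decrement
            if (mRefCount.fetch_sub(1, std::memory_order_release) == 1)
            {
                // Acquire: make those writes visible before running the destructor
                std::atomic_thread_fence(std::memory_order_acquire);
                delete this;
            }
        }

    private:
        mutable std::atomic<uint32_t> mRefCount { 0 };
    };

The practical gain is on weakly ordered CPUs such as ARM: sequentially consistent atomics there require extra barrier instructions, while relaxed and release/acquire operations map to cheaper loads, stores and read-modify-writes, which is presumably where the speedup in the commit title comes from.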