Browse Source

Better handling of large MeshShapes (#1355)

* GetPessimisticMemoryEstimate was very pessimistic, sometimes overestimating by a factor of 4, leading to out-of-memory issues.
* SubShapeIDs need to address a triangle block, so we can use the highest block address instead of the total size of the tree. This often saves 1 bit and makes larger mesh shapes possible.
* TriangleCodecIndexed8BitPackSOA4Flags could silently overflow OFFSET_TO_VERTICES_MASK leading to hangs/crashes.
* Vertices are always 4-byte aligned, meaning we can drop the lowest 2 bits of the offset, which makes larger mesh shapes possible.
* Reduced the number of allocations that are needed to build a MeshShape.
* In the LargeMesh PerformanceTest this means we can now go up to 110M triangles per mesh. Before this change creating a mesh of more than 60M triangles would cause a crash/hang. Note that the actual number of allowed triangles depends on the layout/connectivity of the triangles. The test scenario uses a sloping terrain, which is one of the best cases for vertex reuse and connectivity.

Fixed #1349
Jorrit Rouwe 8 months ago
parent
commit
c738b3490c

+ 2 - 2
.github/workflows/determinism_check.yml

@@ -1,8 +1,8 @@
 name: Determinism Check
 
 env:
-  CONVEX_VS_MESH_HASH: '0x16ca5bf7f9da5f74'
-  RAGDOLL_HASH: '0xa50ce2dc5684626d'
+  CONVEX_VS_MESH_HASH: '0x610d538e15420778'
+  RAGDOLL_HASH: '0x275057ded572c916'
   PYRAMID_HASH: '0x198b8eeaee57e29a'
   EMSCRIPTEN_VERSION: 3.1.64
   UBUNTU_CLANG_VERSION: clang++-15

+ 103 - 46
Jolt/AABBTree/AABBTreeToBuffer.h

@@ -33,18 +33,87 @@ public:
 	/// Convert AABB tree. Returns false if failed.
 	bool							Convert(const Array<IndexedTriangle> &inTriangles, const Array<AABBTreeBuilder::Node> &inNodes, const VertexList &inVertices, const AABBTreeBuilder::Node *inRoot, bool inStoreUserData, const char *&outError)
 	{
-		const typename NodeCodec::EncodingContext node_ctx;
+		typename NodeCodec::EncodingContext node_ctx;
 		typename TriangleCodec::EncodingContext tri_ctx(inVertices);
 
-		// Estimate the amount of memory required
-		uint tri_count = inRoot->GetTriangleCountInTree(inNodes);
-		uint node_count = inRoot->GetNodeCount(inNodes);
-		uint nodes_size = node_ctx.GetPessimisticMemoryEstimate(node_count);
-		uint total_size = HeaderSize + TriangleHeaderSize + nodes_size + tri_ctx.GetPessimisticMemoryEstimate(tri_count, inStoreUserData);
-		mTree.reserve(total_size);
+		// Child nodes out of loop so we don't constantly realloc it
+		Array<const AABBTreeBuilder::Node *> child_nodes;
+		child_nodes.reserve(NumChildrenPerNode);
+
+		// First calculate how big the tree is going to be.
+		// Since the tree can be huge for very large meshes, we don't want
+		// to reallocate the buffer as it may cause out of memory situations.
+		// This loop mimics the construction loop below.
+		uint64 total_size = HeaderSize + TriangleHeaderSize;
+		size_t node_count = 1; // Start with root node
+		size_t to_process_max_size = 1; // Track size of queues so we can do a single reserve below
+		size_t to_process_triangles_max_size = 0;
+		{	// A scope to free the memory associated with to_estimate and to_estimate_triangles
+			Array<const AABBTreeBuilder::Node *> to_estimate;
+			Array<const AABBTreeBuilder::Node *> to_estimate_triangles;
+			to_estimate.push_back(inRoot);
+			for (;;)
+			{
+				while (!to_estimate.empty())
+				{
+					// Get the next node to process
+					const AABBTreeBuilder::Node *node = to_estimate.back();
+					to_estimate.pop_back();
+
+					// Update total size
+					node_ctx.PrepareNodeAllocate(node, total_size);
+
+					if (node->HasChildren())
+					{
+						// Collect the first NumChildrenPerNode sub-nodes in the tree
+						child_nodes.clear(); // Won't free the memory
+						node->GetNChildren(inNodes, NumChildrenPerNode, child_nodes);
+
+						// Increment the number of nodes we're going to store
+						node_count += child_nodes.size();
+
+						// Insert in reverse order so we estimate left child first when taking nodes from the back
+						for (int idx = int(child_nodes.size()) - 1; idx >= 0; --idx)
+						{
+							// Store triangles in separate list so we process them last
+							const AABBTreeBuilder::Node *child = child_nodes[idx];
+							if (child->HasChildren())
+							{
+								to_estimate.push_back(child);
+								to_process_max_size = max(to_estimate.size(), to_process_max_size);
+							}
+							else
+							{
+								to_estimate_triangles.push_back(child);
+								to_process_triangles_max_size = max(to_estimate_triangles.size(), to_process_triangles_max_size);
+							}
+						}
+					}
+					else
+					{
+						// Update total size
+						tri_ctx.PreparePack(&inTriangles[node->mTrianglesBegin], node->mNumTriangles, inStoreUserData, total_size);
+					}
+				}
+
+				// If we've got triangles to estimate, loop again with just the triangles
+				if (to_estimate_triangles.empty())
+					break;
+				else
+					to_estimate.swap(to_estimate_triangles);
+			}
+		}
 
-		// Reset counters
-		mNodesSize = 0;
+		// Finalize the prepare stage for the triangle context
+		tri_ctx.FinalizePreparePack(total_size);
+
+		// Reserve the buffer
+		if (size_t(total_size) != total_size)
+		{
+			outError = "AABBTreeToBuffer: Out of memory!";
+			return false;
+		}
+		mTree.reserve(size_t(total_size));
 
 		// Add headers
 		NodeHeader *header = HeaderSize > 0? mTree.Allocate<NodeHeader>() : nullptr;
@@ -55,19 +124,20 @@ public:
 			const AABBTreeBuilder::Node *	mNode = nullptr;							// Node that this entry belongs to
 			Vec3							mNodeBoundsMin;								// Quantized node bounds
 			Vec3							mNodeBoundsMax;
-			uint							mNodeStart = uint(-1);						// Start of node in mTree
-			uint							mTriangleStart = uint(-1);					// Start of the triangle data in mTree
+			size_t							mNodeStart = size_t(-1);					// Start of node in mTree
+			size_t							mTriangleStart = size_t(-1);				// Start of the triangle data in mTree
+			size_t							mChildNodeStart[NumChildrenPerNode];		// Start of the children of the node in mTree
+			size_t							mChildTrianglesStart[NumChildrenPerNode];	// Start of the triangle data in mTree
+			size_t *						mParentChildNodeStart = nullptr;			// Where to store mNodeStart (to patch mChildNodeStart of my parent)
+			size_t *						mParentTrianglesStart = nullptr;			// Where to store mTriangleStart (to patch mChildTrianglesStart of my parent)
 			uint							mNumChildren = 0;							// Number of children
-			uint							mChildNodeStart[NumChildrenPerNode];		// Start of the children of the node in mTree
-			uint							mChildTrianglesStart[NumChildrenPerNode];	// Start of the triangle data in mTree
-			uint *							mParentChildNodeStart = nullptr;			// Where to store mNodeStart (to patch mChildNodeStart of my parent)
-			uint *							mParentTrianglesStart = nullptr;			// Where to store mTriangleStart (to patch mChildTrianglesStart of my parent)
 		};
 
 		Array<NodeData *> to_process;
+		to_process.reserve(to_process_max_size);
 		Array<NodeData *> to_process_triangles;
+		to_process_triangles.reserve(to_process_triangles_max_size);
 		Array<NodeData> node_list;
-
 		node_list.reserve(node_count); // Needed to ensure that array is not reallocated, so we can keep pointers in the array
 
 		NodeData root;
@@ -77,10 +147,6 @@ public:
 		node_list.push_back(root);
 		to_process.push_back(&node_list.back());
 
-		// Child nodes out of loop so we don't constantly realloc it
-		Array<const AABBTreeBuilder::Node *> child_nodes;
-		child_nodes.reserve(NumChildrenPerNode);
-
 		for (;;)
 		{
 			while (!to_process.empty())
@@ -112,37 +178,31 @@ public:
 					}
 
 				// Start a new node
-				uint old_size = (uint)mTree.size();
 				node_data->mNodeStart = node_ctx.NodeAllocate(node_data->mNode, node_data->mNodeBoundsMin, node_data->mNodeBoundsMax, child_nodes, child_bounds_min, child_bounds_max, mTree, outError);
-				if (node_data->mNodeStart == uint(-1))
+				if (node_data->mNodeStart == size_t(-1))
 					return false;
-				mNodesSize += (uint)mTree.size() - old_size;
 
 				if (node_data->mNode->HasChildren())
 				{
 					// Insert in reverse order so we process left child first when taking nodes from the back
 					for (int idx = int(child_nodes.size()) - 1; idx >= 0; --idx)
 					{
+						const AABBTreeBuilder::Node *child_node = child_nodes[idx];
+
 						// Due to quantization box could have become bigger, not smaller
-						JPH_ASSERT(AABox(child_bounds_min[idx], child_bounds_max[idx]).Contains(child_nodes[idx]->mBounds), "AABBTreeToBuffer: Bounding box became smaller!");
+						JPH_ASSERT(AABox(child_bounds_min[idx], child_bounds_max[idx]).Contains(child_node->mBounds), "AABBTreeToBuffer: Bounding box became smaller!");
 
 						// Add child to list of nodes to be processed
 						NodeData child;
-						child.mNode = child_nodes[idx];
+						child.mNode = child_node;
 						child.mNodeBoundsMin = child_bounds_min[idx];
 						child.mNodeBoundsMax = child_bounds_max[idx];
 						child.mParentChildNodeStart = &node_data->mChildNodeStart[idx];
 						child.mParentTrianglesStart = &node_data->mChildTrianglesStart[idx];
-						NodeData *old = &node_list[0];
 						node_list.push_back(child);
-						if (old != &node_list[0])
-						{
-							outError = "Internal Error: Array reallocated, memory corruption!";
-							return false;
-						}
 
 						// Store triangles in separate list so we process them last
-						if (node_list.back().mNode->HasChildren())
+						if (child_node->HasChildren())
 							to_process.push_back(&node_list.back());
 						else
 							to_process_triangles.push_back(&node_list.back());
@@ -152,7 +212,7 @@ public:
 				{
 					// Add triangles
 					node_data->mTriangleStart = tri_ctx.Pack(&inTriangles[node_data->mNode->mTrianglesBegin], node_data->mNode->mNumTriangles, inStoreUserData, mTree, outError);
-					if (node_data->mTriangleStart == uint(-1))
+					if (node_data->mTriangleStart == size_t(-1))
 						return false;
 				}
 
@@ -171,6 +231,10 @@ public:
 				to_process.swap(to_process_triangles);
 		}
 
+		// Assert that our reservation was correct (we don't know if we swapped the arrays or not)
+		JPH_ASSERT(to_process_max_size == to_process.capacity() || to_process_triangles_max_size == to_process.capacity());
+		JPH_ASSERT(to_process_max_size == to_process_triangles.capacity() || to_process_triangles_max_size == to_process_triangles.capacity());
+
 		// Finalize all nodes
 		for (NodeData &n : node_list)
 			if (!node_ctx.NodeFinalize(n.mNode, n.mNodeStart, n.mNumChildren, n.mChildNodeStart, n.mChildTrianglesStart, mTree, outError))
@@ -179,26 +243,20 @@ public:
 		// Finalize the triangles
 		tri_ctx.Finalize(inVertices, triangle_header, mTree);
 
-		// Validate that we reserved enough memory
-		if (nodes_size < mNodesSize)
+		// Validate that our reservations were correct
+		if (node_count != node_list.size())
 		{
-			outError = "Internal Error: Not enough memory reserved for nodes!";
+			outError = "Internal Error: Node memory estimate was incorrect, memory corruption!";
 			return false;
 		}
-		if (total_size < (uint)mTree.size())
+		if (total_size != mTree.size())
 		{
-			outError = "Internal Error: Not enough memory reserved for triangles!";
+			outError = "Internal Error: Tree memory estimate was incorrect, memory corruption!";
 			return false;
 		}
 
 		// Finalize the nodes
-		if (!node_ctx.Finalize(header, inRoot, node_list[0].mNodeStart, node_list[0].mTriangleStart, outError))
-			return false;
-
-		// Shrink the tree, this will invalidate the header and triangle_header variables
-		mTree.shrink_to_fit();
-
-		return true;
+		return node_ctx.Finalize(header, inRoot, node_list[0].mNodeStart, node_list[0].mTriangleStart, outError);
 	}
 
 	/// Get resulting data
@@ -233,7 +291,6 @@ public:
 
 private:
 	ByteBuffer						mTree;									///< Resulting tree structure
-	uint							mNodesSize;								///< Size in bytes of the nodes in the buffer
 };
 
 JPH_NAMESPACE_END

+ 50 - 24
Jolt/AABBTree/NodeCodec/NodeCodecQuadTreeHalfFloat.h

@@ -10,7 +10,6 @@
 
 JPH_NAMESPACE_BEGIN
 
-template <int Alignment>
 class NodeCodecQuadTreeHalfFloat
 {
 public:
@@ -23,6 +22,8 @@ public:
 		Float3							mRootBoundsMin;
 		Float3							mRootBoundsMax;
 		uint32							mRootProperties;
+		uint8							mBlockIDBits;			///< Number of bits to address a triangle block
+		uint8							mPadding[3] = { 0 };
 	};
 
 	/// Size of the header (an empty struct is always > 0 bytes so this needs a separate variable)
@@ -61,26 +62,30 @@ public:
 	class EncodingContext
 	{
 	public:
-		/// Get an upper bound on the amount of bytes needed for a node tree with inNodeCount nodes
-		uint							GetPessimisticMemoryEstimate(uint inNodeCount) const
+		/// Mimics the size a call to NodeAllocate() would add to the buffer
+		void							PrepareNodeAllocate(const AABBTreeBuilder::Node *inNode, uint64 &ioBufferSize) const
 		{
-			return inNodeCount * (sizeof(Node) + Alignment - 1);
+			// We don't emit nodes for leafs
+			if (!inNode->HasChildren())
+				return;
+
+			// Add size of node
+			ioBufferSize += sizeof(Node);
 		}
 
 		/// Allocate a new node for inNode.
 		/// Algorithm can modify the order of ioChildren to indicate in which order children should be compressed
 		/// Algorithm can enlarge the bounding boxes of the children during compression and returns these in outChildBoundsMin, outChildBoundsMax
 		/// inNodeBoundsMin, inNodeBoundsMax is the bounding box if inNode possibly widened by compressing the parent node
-		/// Returns uint(-1) on error and reports the error in outError
-		uint							NodeAllocate(const AABBTreeBuilder::Node *inNode, Vec3Arg inNodeBoundsMin, Vec3Arg inNodeBoundsMax, Array<const AABBTreeBuilder::Node *> &ioChildren, Vec3 outChildBoundsMin[NumChildrenPerNode], Vec3 outChildBoundsMax[NumChildrenPerNode], ByteBuffer &ioBuffer, const char *&outError) const
+		/// Returns size_t(-1) on error and reports the error in outError
+		size_t							NodeAllocate(const AABBTreeBuilder::Node *inNode, Vec3Arg inNodeBoundsMin, Vec3Arg inNodeBoundsMax, Array<const AABBTreeBuilder::Node *> &ioChildren, Vec3 outChildBoundsMin[NumChildrenPerNode], Vec3 outChildBoundsMax[NumChildrenPerNode], ByteBuffer &ioBuffer, const char *&outError) const
 		{
 			// We don't emit nodes for leafs
 			if (!inNode->HasChildren())
-				return (uint)ioBuffer.size();
+				return ioBuffer.size();
 
-			// Align the buffer
-			ioBuffer.Align(Alignment);
-			uint node_start = (uint)ioBuffer.size();
+			// Remember the start of the node
+			size_t node_start = ioBuffer.size();
 
 			// Fill in bounds
 			Node *node = ioBuffer.Allocate<Node>();
@@ -104,7 +109,7 @@ public:
 					if (this_node->GetTriangleCount() >= TRIANGLE_COUNT_MASK)
 					{
 						outError = "NodeCodecQuadTreeHalfFloat: Too many triangles";
-						return uint(-1);
+						return size_t(-1);
 					}
 				}
 				else
@@ -133,7 +138,7 @@ public:
 		}
 
 		/// Once all nodes have been added, this call finalizes all nodes by patching in the offsets of the child nodes (that were added after the node itself was added)
-		bool						NodeFinalize(const AABBTreeBuilder::Node *inNode, uint inNodeStart, uint inNumChildren, const uint *inChildrenNodeStart, const uint *inChildrenTrianglesStart, ByteBuffer &ioBuffer, const char *&outError) const
+		bool						NodeFinalize(const AABBTreeBuilder::Node *inNode, size_t inNodeStart, uint inNumChildren, const size_t *inChildrenNodeStart, const size_t *inChildrenTrianglesStart, ByteBuffer &ioBuffer, const char *&outError)
 		{
 			if (!inNode->HasChildren())
 				return true;
@@ -141,46 +146,64 @@ public:
 			Node *node = ioBuffer.Get<Node>(inNodeStart);
 			for (uint i = 0; i < inNumChildren; ++i)
 			{
-				// If there are triangles, use the triangle offset otherwise use the node offset
-				uint offset = node->mNodeProperties[i] != 0? inChildrenTrianglesStart[i] : inChildrenNodeStart[i];
+				size_t offset;
+				if (node->mNodeProperties[i] != 0)
+				{
+					// This is a triangle block
+					offset = inChildrenTrianglesStart[i];
+
+					// Store highest block with triangles so we can count the number of bits we need
+					mHighestTriangleBlock = max(mHighestTriangleBlock, offset);
+				}
+				else
+				{
+					// This is a node block
+					offset = inChildrenNodeStart[i];
+				}
+
+				// Store offset of next node / triangles
 				if (offset & OFFSET_NON_SIGNIFICANT_MASK)
 				{
 					outError = "NodeCodecQuadTreeHalfFloat: Internal Error: Offset has non-significant bits set";
 					return false;
 				}
 				offset >>= OFFSET_NON_SIGNIFICANT_BITS;
-				if (offset & ~OFFSET_MASK)
+				if (offset > OFFSET_MASK)
 				{
 					outError = "NodeCodecQuadTreeHalfFloat: Offset too large. Too much data.";
 					return false;
 				}
-
-				// Store offset of next node / triangles
-				node->mNodeProperties[i] |= offset;
+				node->mNodeProperties[i] |= uint32(offset);
 			}
 
 			return true;
 		}
 
 		/// Once all nodes have been finalized, this will finalize the header of the nodes
-		bool						Finalize(Header *outHeader, const AABBTreeBuilder::Node *inRoot, uint inRootNodeStart, uint inRootTrianglesStart, const char *&outError) const
+		bool						Finalize(Header *outHeader, const AABBTreeBuilder::Node *inRoot, size_t inRootNodeStart, size_t inRootTrianglesStart, const char *&outError) const
 		{
-			uint offset = inRoot->HasChildren()? inRootNodeStart : inRootTrianglesStart;
+			// Check if we can address the root node
+			size_t offset = inRoot->HasChildren()? inRootNodeStart : inRootTrianglesStart;
 			if (offset & OFFSET_NON_SIGNIFICANT_MASK)
 			{
 				outError = "NodeCodecQuadTreeHalfFloat: Internal Error: Offset has non-significant bits set";
 				return false;
 			}
 			offset >>= OFFSET_NON_SIGNIFICANT_BITS;
-			if (offset & ~OFFSET_MASK)
+			if (offset > OFFSET_MASK)
 			{
 				outError = "NodeCodecQuadTreeHalfFloat: Offset too large. Too much data.";
 				return false;
 			}
 
+			// If the root has triangles, we need to take that offset instead since the mHighestTriangleBlock will be zero
+			size_t highest_triangle_block = inRootTrianglesStart != size_t(-1)? inRootTrianglesStart : mHighestTriangleBlock;
+			highest_triangle_block >>= OFFSET_NON_SIGNIFICANT_BITS;
+
 			inRoot->mBounds.mMin.StoreFloat3(&outHeader->mRootBoundsMin);
 			inRoot->mBounds.mMax.StoreFloat3(&outHeader->mRootBoundsMax);
-			outHeader->mRootProperties = offset + (inRoot->GetTriangleCount() << TRIANGLE_COUNT_SHIFT);
+			outHeader->mRootProperties = uint32(offset) + (inRoot->GetTriangleCount() << TRIANGLE_COUNT_SHIFT);
+			outHeader->mBlockIDBits = uint8(32 - CountLeadingZeros(uint32(highest_triangle_block)));
 			if (inRoot->GetTriangleCount() >= TRIANGLE_COUNT_MASK)
 			{
 				outError = "NodeCodecQuadTreeHalfFloat: Too many triangles";
@@ -189,6 +212,9 @@ public:
 
 			return true;
 		}
+
+	private:
+		size_t						mHighestTriangleBlock = 0;
 	};
 
 	/// This class decodes and decompresses quad tree nodes
@@ -196,9 +222,9 @@ public:
 	{
 	public:
 		/// Get the amount of bits needed to store an ID to a triangle block
-		inline static uint			sTriangleBlockIDBits(const ByteBuffer &inTree)
+		inline static uint			sTriangleBlockIDBits(const Header *inHeader)
 		{
-			return 32 - CountLeadingZeros((uint32)inTree.size()) - OFFSET_NON_SIGNIFICANT_BITS;
+			return inHeader->mBlockIDBits;
 		}
 
 		/// Convert a triangle block ID to the start of the triangle buffer

+ 91 - 40
Jolt/AABBTree/TriangleCodec/TriangleCodecIndexed8BitPackSOA4Flags.h

@@ -82,6 +82,8 @@ public:
 	{
 		OFFSET_TO_VERTICES_BITS = 29,							///< Offset from current block to start of vertices in bytes
 		OFFSET_TO_VERTICES_MASK = (1 << OFFSET_TO_VERTICES_BITS) - 1,
+		OFFSET_NON_SIGNIFICANT_BITS = 2,						///< The offset from the current block to the start of the vertices must be a multiple of 4 bytes
+		OFFSET_NON_SIGNIFICANT_MASK = (1 << OFFSET_NON_SIGNIFICANT_BITS) - 1,
 		OFFSET_TO_USERDATA_BITS = 3,							///< When user data is stored, this is the number of blocks to skip to get to the user data (0 = no user data)
 		OFFSET_TO_USERDATA_MASK = (1 << OFFSET_TO_USERDATA_BITS) - 1,
 	};
@@ -89,7 +91,7 @@ public:
 	/// A triangle header, will be followed by one or more TriangleBlocks
 	struct TriangleBlockHeader
 	{
-		const VertexData *			GetVertexData() const		{ return reinterpret_cast<const VertexData *>(reinterpret_cast<const uint8 *>(this) + (mFlags & OFFSET_TO_VERTICES_MASK)); }
+		const VertexData *			GetVertexData() const		{ return reinterpret_cast<const VertexData *>(reinterpret_cast<const uint8 *>(this) + ((mFlags & OFFSET_TO_VERTICES_MASK) << OFFSET_NON_SIGNIFICANT_BITS)); }
 		const TriangleBlock *		GetTriangleBlock() const	{ return reinterpret_cast<const TriangleBlock *>(reinterpret_cast<const uint8 *>(this) + sizeof(TriangleBlockHeader)); }
 		const uint32 *				GetUserData() const			{ uint32 offset = mFlags >> OFFSET_TO_VERTICES_BITS; return offset == 0? nullptr : reinterpret_cast<const uint32 *>(GetTriangleBlock() + offset); }
 
@@ -132,32 +134,80 @@ public:
 	class EncodingContext
 	{
 	public:
+		/// Indicates a vertex hasn't been seen yet in the triangle list
+		static constexpr uint32		cNotFound = 0xffffffff;
+
 		/// Construct the encoding context
 		explicit					EncodingContext(const VertexList &inVertices) :
-			mVertexMap(inVertices.size(), 0xffffffff) // Fill vertex map with 'not found'
+			mVertexMap(inVertices.size(), cNotFound)
+		{
+		}
+
+		/// Mimics the size a call to Pack() would add to the buffer
+		void						PreparePack(const IndexedTriangle *inTriangles, uint inNumTriangles, bool inStoreUserData, uint64 &ioBufferSize)
 		{
-			// Reserve for worst case to avoid allocating in the inner loop
-			mVertices.reserve(inVertices.size());
+			// Add triangle block header
+			ioBufferSize += sizeof(TriangleBlockHeader);
+
+			// Compute first vertex that this batch will use (ensuring there's enough room if none of the vertices are shared)
+			uint start_vertex = Clamp((int)mVertexCount - 256 + (int)inNumTriangles * 3, 0, (int)mVertexCount);
+
+			// Pack vertices
+			uint padded_triangle_count = AlignUp(inNumTriangles, 4);
+			for (uint t = 0; t < padded_triangle_count; t += 4)
+			{
+				// Add triangle block (one block stores 4 triangles)
+				ioBufferSize += sizeof(TriangleBlock);
+
+				for (uint vertex_nr = 0; vertex_nr < 3; ++vertex_nr)
+					for (uint block_tri_idx = 0; block_tri_idx < 4; ++block_tri_idx)
+					{
+						// Fetch vertex index. Create degenerate triangles for padding triangles.
+						bool triangle_available = t + block_tri_idx < inNumTriangles;
+						uint32 src_vertex_index = triangle_available? inTriangles[t + block_tri_idx].mIdx[vertex_nr] : inTriangles[inNumTriangles - 1].mIdx[0];
+
+						// Check if we've seen this vertex before and if it is in the range that we can encode
+						uint32 &vertex_index = mVertexMap[src_vertex_index];
+						if (vertex_index == cNotFound || vertex_index < start_vertex)
+						{
+							// Add vertex
+							vertex_index = mVertexCount;
+							mVertexCount++;
+						}
+					}
+			}
+
+			// Add user data
+			if (inStoreUserData)
+				ioBufferSize += inNumTriangles * sizeof(uint32);
 		}
 
-		/// Get an upper bound on the amount of bytes needed to store inTriangleCount triangles
-		uint						GetPessimisticMemoryEstimate(uint inTriangleCount, bool inStoreUserData) const
+		/// Mimics the size the Finalize() call would add to ioBufferSize
+		void						FinalizePreparePack(uint64 &ioBufferSize)
 		{
-			// Worst case each triangle is alone in a block, none of the vertices are shared and we need to add 3 bytes to align the vertices
-			return inTriangleCount * (sizeof(TriangleBlockHeader) + sizeof(TriangleBlock) + (inStoreUserData? sizeof(uint32) : 0) + 3 * sizeof(VertexData)) + 3;
+			// Remember where the vertices are going to start in the output buffer
+			JPH_ASSERT(IsAligned(ioBufferSize, 4));
+			mVerticesStartIdx = size_t(ioBufferSize);
+
+			// Add vertices to buffer
+			ioBufferSize += uint64(mVertexCount) * sizeof(VertexData);
+
+			// Reserve the amount of memory we need for the vertices
+			mVertices.reserve(mVertexCount);
+
+			// Set vertex map back to 'not found'
+			for (uint32 &v : mVertexMap)
+				v = cNotFound;
 		}
 
 		/// Pack the triangles in inContainer to ioBuffer. This stores the mMaterialIndex of a triangle in the 8 bit flags.
-		/// Returns uint(-1) on error.
-		uint						Pack(const IndexedTriangle *inTriangles, uint inNumTriangles, bool inStoreUserData, ByteBuffer &ioBuffer, const char *&outError)
+		/// Returns size_t(-1) on error.
+		size_t						Pack(const IndexedTriangle *inTriangles, uint inNumTriangles, bool inStoreUserData, ByteBuffer &ioBuffer, const char *&outError)
 		{
 			JPH_ASSERT(inNumTriangles > 0);
 
 			// Determine position of triangles start
-			uint offset = (uint)ioBuffer.size();
-
-			// Update stats
-			mNumTriangles += inNumTriangles;
+			size_t triangle_block_start = ioBuffer.size();
 
 			// Allocate triangle block header
 			TriangleBlockHeader *header = ioBuffer.Allocate<TriangleBlockHeader>();
@@ -165,10 +215,20 @@ public:
 			// Compute first vertex that this batch will use (ensuring there's enough room if none of the vertices are shared)
 			uint start_vertex = Clamp((int)mVertices.size() - 256 + (int)inNumTriangles * 3, 0, (int)mVertices.size());
 
-			// Store the start vertex offset, this will later be patched to give the delta offset relative to the triangle block
-			mOffsetsToPatch.push_back(uint((uint8 *)&header->mFlags - &ioBuffer[0]));
-			header->mFlags = start_vertex * sizeof(VertexData);
-			JPH_ASSERT(header->mFlags <= OFFSET_TO_VERTICES_MASK, "Offset to vertices doesn't fit");
+			// Store the start vertex offset relative to TriangleBlockHeader
+			size_t offset_to_vertices = mVerticesStartIdx - triangle_block_start + size_t(start_vertex) * sizeof(VertexData);
+			if (offset_to_vertices & OFFSET_NON_SIGNIFICANT_MASK)
+			{
+				outError = "TriangleCodecIndexed8BitPackSOA4Flags: Internal Error: Offset has non-significant bits set";
+				return size_t(-1);
+			}
+			offset_to_vertices >>= OFFSET_NON_SIGNIFICANT_BITS;
+			if (offset_to_vertices > OFFSET_TO_VERTICES_MASK)
+			{
+				outError = "TriangleCodecIndexed8BitPackSOA4Flags: Offset to vertices doesn't fit. Too much data.";
+				return size_t(-1);
+			}
+			header->mFlags = uint32(offset_to_vertices);
 
 			// When we store user data we need to store the offset to the user data in TriangleBlocks
 			uint padded_triangle_count = AlignUp(inNumTriangles, 4);
@@ -192,7 +252,7 @@ public:
 
 						// Check if we've seen this vertex before and if it is in the range that we can encode
 						uint32 &vertex_index = mVertexMap[src_vertex_index];
-						if (vertex_index == 0xffffffff || vertex_index < start_vertex)
+						if (vertex_index == cNotFound || vertex_index < start_vertex)
 						{
 							// Add vertex
 							vertex_index = (uint32)mVertices.size();
@@ -204,7 +264,7 @@ public:
 						if (vertex_offset > 0xff)
 						{
 							outError = "TriangleCodecIndexed8BitPackSOA4Flags: Offset doesn't fit in 8 bit";
-							return uint(-1);
+							return size_t(-1);
 						}
 						block->mIndices[vertex_nr][block_tri_idx] = (uint8)vertex_offset;
 
@@ -213,7 +273,7 @@ public:
 						if (flags > 0xff)
 						{
 							outError = "TriangleCodecIndexed8BitPackSOA4Flags: Material index doesn't fit in 8 bit";
-							return uint(-1);
+							return size_t(-1);
 						}
 						block->mFlags[block_tri_idx] = (uint8)flags;
 					}
@@ -227,29 +287,20 @@ public:
 					user_data[t] = inTriangles[t].mUserData;
 			}
 
-			return offset;
+			return triangle_block_start;
 		}
 
 		/// After all triangles have been packed, this finalizes the header and triangle buffer
 		void						Finalize(const VertexList &inVertices, TriangleHeader *ioHeader, ByteBuffer &ioBuffer) const
 		{
+			// Assert that our reservations were correct
+			JPH_ASSERT(mVertices.size() == mVertexCount);
+			JPH_ASSERT(ioBuffer.size() == mVerticesStartIdx);
+
 			// Check if anything to do
 			if (mVertices.empty())
 				return;
-
-			// Align buffer to 4 bytes
-			uint vertices_idx = (uint)ioBuffer.Align(4);
-
-			// Patch the offsets
-			for (uint o : mOffsetsToPatch)
-			{
-				uint32 *flags = ioBuffer.Get<uint32>(o);
-				uint32 delta = vertices_idx - o;
-				if ((*flags & OFFSET_TO_VERTICES_MASK) + delta > OFFSET_TO_VERTICES_MASK)
-					JPH_ASSERT(false, "Offset to vertices doesn't fit");
-				*flags += delta;
-			}
-
+					
 			// Calculate bounding box
 			AABox bounds;
 			for (uint32 v : mVertices)
@@ -277,10 +328,10 @@ public:
 	private:
 		using VertexMap = Array<uint32>;
 
-		uint						mNumTriangles = 0;
-		Array<uint32>				mVertices;				///< Output vertices as an index into the original vertex list (inVertices), sorted according to occurrence
-		VertexMap					mVertexMap;				///< Maps from the original mesh vertex index (inVertices) to the index in our output vertices (mVertices)
-		Array<uint>					mOffsetsToPatch;		///< Offsets to the vertex buffer that need to be patched in once all nodes have been packed
+		uint32						mVertexCount = 0;			///< Number of vertices calculated during PreparePack
+		size_t						mVerticesStartIdx = 0;		///< Start of the vertices in the output buffer, calculated during PreparePack
+		Array<uint32>				mVertices;					///< Output vertices as an index into the original vertex list (inVertices), sorted according to occurrence
+		VertexMap					mVertexMap;					///< Maps from the original mesh vertex index (inVertices) to the index in our output vertices (mVertices)
 	};
 
 	/// This class is used to decode and decompress triangle data packed by the EncodingContext

+ 5 - 5
Jolt/Physics/Collision/Shape/MeshShape.cpp

@@ -59,7 +59,7 @@ JPH_IMPLEMENT_SERIALIZABLE_VIRTUAL(MeshShapeSettings)
 
 // Codecs this mesh shape is using
 using TriangleCodec = TriangleCodecIndexed8BitPackSOA4Flags;
-using NodeCodec = NodeCodecQuadTreeHalfFloat<1>;
+using NodeCodec = NodeCodecQuadTreeHalfFloat;
 
 // Get header for tree
 static JPH_INLINE const NodeCodec::Header *sGetNodeHeader(const ByteBuffer &inTree)
@@ -360,7 +360,7 @@ void MeshShape::DecodeSubShapeID(const SubShapeID &inSubShapeID, const void *&ou
 {
 	// Get block
 	SubShapeID triangle_idx_subshape_id;
-	uint32 block_id = inSubShapeID.PopID(NodeCodec::DecodingContext::sTriangleBlockIDBits(mTree), triangle_idx_subshape_id);
+	uint32 block_id = inSubShapeID.PopID(NodeCodec::DecodingContext::sTriangleBlockIDBits(sGetNodeHeader(mTree)), triangle_idx_subshape_id);
 	outTriangleBlock = NodeCodec::DecodingContext::sGetTriangleBlockStart(&mTree[0], block_id);
 
 	// Fetch the triangle index
@@ -438,7 +438,7 @@ AABox MeshShape::GetLocalBounds() const
 
 uint MeshShape::GetSubShapeIDBitsRecursive() const
 {
-	return NodeCodec::DecodingContext::sTriangleBlockIDBits(mTree) + NumTriangleBits;
+	return NodeCodec::DecodingContext::sTriangleBlockIDBits(sGetNodeHeader(mTree)) + NumTriangleBits;
 }
 
 template <class Visitor>
@@ -512,7 +512,7 @@ JPH_INLINE void MeshShape::WalkTreePerTriangle(const SubShapeIDCreator &inSubSha
 		uint				mTriangleBlockIDBits;
 	};
 
-	ChainedVisitor visitor(ioVisitor, inSubShapeIDCreator2, NodeCodec::DecodingContext::sTriangleBlockIDBits(mTree));
+	ChainedVisitor visitor(ioVisitor, inSubShapeIDCreator2, NodeCodec::DecodingContext::sTriangleBlockIDBits(sGetNodeHeader(mTree)));
 	WalkTree(visitor);
 }
 
@@ -714,7 +714,7 @@ bool MeshShape::CastRay(const RayCast &inRay, const SubShapeIDCreator &inSubShap
 	visitor.mRayOrigin = inRay.mOrigin;
 	visitor.mRayDirection = inRay.mDirection;
 	visitor.mRayInvDirection.Set(inRay.mDirection);
-	visitor.mTriangleBlockIDBits = NodeCodec::DecodingContext::sTriangleBlockIDBits(mTree);
+	visitor.mTriangleBlockIDBits = NodeCodec::DecodingContext::sTriangleBlockIDBits(sGetNodeHeader(mTree));
 	visitor.mSubShapeIDCreator = inSubShapeIDCreator;
 	WalkTree(visitor);