浏览代码

Added Bullet Physics (bullet3) dependencies and compiled libraries
Added PhysicsScene implementation
Added PhysicsObject as the main object of Physics system
Added PhysicsObject component to GameObject
Added CollisionShape and RigidBody components for Physics system
Added custom PhysicsMotionState for translating Bullet spatial data to GLM
Added PhysicsData container for Physics object data
Added PhysicsDataManager to handle Physics data updates
Added PhysicsDataManagerObject for sharing Physics data via inheritance
Added translations between GLM and Bullet LinearMath
Added more error codes and strings
Added more change types for BitMask
Added 'sprint' in camera movement

Paul A 3 年之前
父节点
当前提交
fe26e80e43
共有 100 个文件被更改，包括 29118 次插入和 0 次删除
  1. 1 0
      .gitignore
  2. 二进制
      Builds/x64/Debug/Praxis3D.exe
  3. 38 0
      Dependencies/include/bullet3/Bullet3Collision/BroadPhaseCollision/b3BroadphaseCallback.h
  4. 1352 0
      Dependencies/include/bullet3/Bullet3Collision/BroadPhaseCollision/b3DynamicBvh.cpp
  5. 1332 0
      Dependencies/include/bullet3/Bullet3Collision/BroadPhaseCollision/b3DynamicBvh.h
  6. 808 0
      Dependencies/include/bullet3/Bullet3Collision/BroadPhaseCollision/b3DynamicBvhBroadphase.cpp
  7. 197 0
      Dependencies/include/bullet3/Bullet3Collision/BroadPhaseCollision/b3DynamicBvhBroadphase.h
  8. 70 0
      Dependencies/include/bullet3/Bullet3Collision/BroadPhaseCollision/b3OverlappingPair.h
  9. 559 0
      Dependencies/include/bullet3/Bullet3Collision/BroadPhaseCollision/b3OverlappingPairCache.cpp
  10. 427 0
      Dependencies/include/bullet3/Bullet3Collision/BroadPhaseCollision/b3OverlappingPairCache.h
  11. 56 0
      Dependencies/include/bullet3/Bullet3Collision/BroadPhaseCollision/shared/b3Aabb.h
  12. 93 0
      Dependencies/include/bullet3/Bullet3Collision/CMakeLists.txt
  13. 39 0
      Dependencies/include/bullet3/Bullet3Collision/NarrowPhaseCollision/b3Config.h
  14. 55 0
      Dependencies/include/bullet3/Bullet3Collision/NarrowPhaseCollision/b3Contact4.h
  15. 500 0
      Dependencies/include/bullet3/Bullet3Collision/NarrowPhaseCollision/b3ConvexUtility.cpp
  16. 55 0
      Dependencies/include/bullet3/Bullet3Collision/NarrowPhaseCollision/b3ConvexUtility.h
  17. 297 0
      Dependencies/include/bullet3/Bullet3Collision/NarrowPhaseCollision/b3CpuNarrowPhase.cpp
  18. 92 0
      Dependencies/include/bullet3/Bullet3Collision/NarrowPhaseCollision/b3CpuNarrowPhase.h
  19. 25 0
      Dependencies/include/bullet3/Bullet3Collision/NarrowPhaseCollision/b3RaycastInfo.h
  20. 28 0
      Dependencies/include/bullet3/Bullet3Collision/NarrowPhaseCollision/b3RigidBodyCL.h
  21. 19 0
      Dependencies/include/bullet3/Bullet3Collision/NarrowPhaseCollision/shared/b3BvhSubtreeInfoData.h
  22. 123 0
      Dependencies/include/bullet3/Bullet3Collision/NarrowPhaseCollision/shared/b3BvhTraversal.h
  23. 171 0
      Dependencies/include/bullet3/Bullet3Collision/NarrowPhaseCollision/shared/b3ClipFaces.h
  24. 69 0
      Dependencies/include/bullet3/Bullet3Collision/NarrowPhaseCollision/shared/b3Collidable.h
  25. 36 0
      Dependencies/include/bullet3/Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h
  26. 486 0
      Dependencies/include/bullet3/Bullet3Collision/NarrowPhaseCollision/shared/b3ContactConvexConvexSAT.h
  27. 153 0
      Dependencies/include/bullet3/Bullet3Collision/NarrowPhaseCollision/shared/b3ContactSphereSphere.h
  28. 38 0
      Dependencies/include/bullet3/Bullet3Collision/NarrowPhaseCollision/shared/b3ConvexPolyhedronData.h
  29. 797 0
      Dependencies/include/bullet3/Bullet3Collision/NarrowPhaseCollision/shared/b3FindConcaveSatAxis.h
  30. 197 0
      Dependencies/include/bullet3/Bullet3Collision/NarrowPhaseCollision/shared/b3FindSeparatingAxis.h
  31. 888 0
      Dependencies/include/bullet3/Bullet3Collision/NarrowPhaseCollision/shared/b3MprPenetration.h
  32. 175 0
      Dependencies/include/bullet3/Bullet3Collision/NarrowPhaseCollision/shared/b3NewContactReduction.h
  33. 88 0
      Dependencies/include/bullet3/Bullet3Collision/NarrowPhaseCollision/shared/b3QuantizedBvhNodeData.h
  34. 89 0
      Dependencies/include/bullet3/Bullet3Collision/NarrowPhaseCollision/shared/b3ReduceContacts.h
  35. 31 0
      Dependencies/include/bullet3/Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h
  36. 35 0
      Dependencies/include/bullet3/Bullet3Collision/NarrowPhaseCollision/shared/b3UpdateAabbs.h
  37. 16 0
      Dependencies/include/bullet3/Bullet3Collision/premake4.lua
  38. 63 0
      Dependencies/include/bullet3/Bullet3Common/CMakeLists.txt
  39. 186 0
      Dependencies/include/bullet3/Bullet3Common/b3AlignedAllocator.cpp
  40. 110 0
      Dependencies/include/bullet3/Bullet3Common/b3AlignedAllocator.h
  41. 522 0
      Dependencies/include/bullet3/Bullet3Common/b3AlignedObjectArray.h
  42. 106 0
      Dependencies/include/bullet3/Bullet3Common/b3CommandLineArgs.h
  43. 133 0
      Dependencies/include/bullet3/Bullet3Common/b3FileUtils.h
  44. 462 0
      Dependencies/include/bullet3/Bullet3Common/b3HashMap.h
  45. 145 0
      Dependencies/include/bullet3/Bullet3Common/b3Logging.cpp
  46. 74 0
      Dependencies/include/bullet3/Bullet3Common/b3Logging.h
  47. 1354 0
      Dependencies/include/bullet3/Bullet3Common/b3Matrix3x3.h
  48. 69 0
      Dependencies/include/bullet3/Bullet3Common/b3MinMax.h
  49. 121 0
      Dependencies/include/bullet3/Bullet3Common/b3PoolAllocator.h
  50. 242 0
      Dependencies/include/bullet3/Bullet3Common/b3QuadWord.h
  51. 908 0
      Dependencies/include/bullet3/Bullet3Common/b3Quaternion.h
  52. 46 0
      Dependencies/include/bullet3/Bullet3Common/b3Random.h
  53. 171 0
      Dependencies/include/bullet3/Bullet3Common/b3ResizablePool.h
  54. 689 0
      Dependencies/include/bullet3/Bullet3Common/b3Scalar.h
  55. 118 0
      Dependencies/include/bullet3/Bullet3Common/b3StackAlloc.h
  56. 286 0
      Dependencies/include/bullet3/Bullet3Common/b3Transform.h
  57. 210 0
      Dependencies/include/bullet3/Bullet3Common/b3TransformUtil.h
  58. 1637 0
      Dependencies/include/bullet3/Bullet3Common/b3Vector3.cpp
  59. 1303 0
      Dependencies/include/bullet3/Bullet3Common/b3Vector3.h
  60. 16 0
      Dependencies/include/bullet3/Bullet3Common/premake4.lua
  61. 90 0
      Dependencies/include/bullet3/Bullet3Common/shared/b3Float4.h
  62. 63 0
      Dependencies/include/bullet3/Bullet3Common/shared/b3Int2.h
  63. 71 0
      Dependencies/include/bullet3/Bullet3Common/shared/b3Int4.h
  64. 157 0
      Dependencies/include/bullet3/Bullet3Common/shared/b3Mat3x3.h
  65. 41 0
      Dependencies/include/bullet3/Bullet3Common/shared/b3PlatformDefinitions.h
  66. 100 0
      Dependencies/include/bullet3/Bullet3Common/shared/b3Quat.h
  67. 61 0
      Dependencies/include/bullet3/Bullet3Dynamics/CMakeLists.txt
  68. 149 0
      Dependencies/include/bullet3/Bullet3Dynamics/ConstraintSolver/b3ContactSolverInfo.h
  69. 103 0
      Dependencies/include/bullet3/Bullet3Dynamics/ConstraintSolver/b3FixedConstraint.cpp
  70. 34 0
      Dependencies/include/bullet3/Bullet3Dynamics/ConstraintSolver/b3FixedConstraint.h
  71. 737 0
      Dependencies/include/bullet3/Bullet3Dynamics/ConstraintSolver/b3Generic6DofConstraint.cpp
  72. 517 0
      Dependencies/include/bullet3/Bullet3Dynamics/ConstraintSolver/b3Generic6DofConstraint.h
  73. 150 0
      Dependencies/include/bullet3/Bullet3Dynamics/ConstraintSolver/b3JacobianEntry.h
  74. 1696 0
      Dependencies/include/bullet3/Bullet3Dynamics/ConstraintSolver/b3PgsJacobiSolver.cpp
  75. 133 0
      Dependencies/include/bullet3/Bullet3Dynamics/ConstraintSolver/b3PgsJacobiSolver.h
  76. 190 0
      Dependencies/include/bullet3/Bullet3Dynamics/ConstraintSolver/b3Point2PointConstraint.cpp
  77. 153 0
      Dependencies/include/bullet3/Bullet3Dynamics/ConstraintSolver/b3Point2PointConstraint.h
  78. 281 0
      Dependencies/include/bullet3/Bullet3Dynamics/ConstraintSolver/b3SolverBody.h
  79. 73 0
      Dependencies/include/bullet3/Bullet3Dynamics/ConstraintSolver/b3SolverConstraint.h
  80. 151 0
      Dependencies/include/bullet3/Bullet3Dynamics/ConstraintSolver/b3TypedConstraint.cpp
  81. 469 0
      Dependencies/include/bullet3/Bullet3Dynamics/ConstraintSolver/b3TypedConstraint.h
  82. 447 0
      Dependencies/include/bullet3/Bullet3Dynamics/b3CpuRigidBodyPipeline.cpp
  83. 62 0
      Dependencies/include/bullet3/Bullet3Dynamics/b3CpuRigidBodyPipeline.h
  84. 18 0
      Dependencies/include/bullet3/Bullet3Dynamics/premake4.lua
  85. 31 0
      Dependencies/include/bullet3/Bullet3Dynamics/shared/b3ContactConstraint4.h
  86. 148 0
      Dependencies/include/bullet3/Bullet3Dynamics/shared/b3ConvertConstraint4.h
  87. 14 0
      Dependencies/include/bullet3/Bullet3Dynamics/shared/b3Inertia.h
  88. 106 0
      Dependencies/include/bullet3/Bullet3Dynamics/shared/b3IntegrateTransforms.h
  89. 47 0
      Dependencies/include/bullet3/Bullet3Geometry/CMakeLists.txt
  90. 217 0
      Dependencies/include/bullet3/Bullet3Geometry/b3AabbUtil.h
  91. 2745 0
      Dependencies/include/bullet3/Bullet3Geometry/b3ConvexHullComputer.cpp
  92. 99 0
      Dependencies/include/bullet3/Bullet3Geometry/b3ConvexHullComputer.h
  93. 174 0
      Dependencies/include/bullet3/Bullet3Geometry/b3GeometryUtil.cpp
  94. 36 0
      Dependencies/include/bullet3/Bullet3Geometry/b3GeometryUtil.h
  95. 116 0
      Dependencies/include/bullet3/Bullet3Geometry/b3GrahamScan2dConvexHull.h
  96. 16 0
      Dependencies/include/bullet3/Bullet3Geometry/premake4.lua
  97. 42 0
      Dependencies/include/bullet3/Bullet3OpenCL/BroadphaseCollision/b3GpuBroadphaseInterface.h
  98. 338 0
      Dependencies/include/bullet3/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.cpp
  99. 80 0
      Dependencies/include/bullet3/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.h
  100. 557 0
      Dependencies/include/bullet3/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.cpp

+ 1 - 0
.gitignore

@@ -32,3 +32,4 @@ Builds/x64/Debug/Praxis3D.exp
 Builds/x64/Debug/Praxis3D.lib
 *.sarif
 Praxis3D/x64/Debug/CodeAnalysisResultManifest.txt
+.vs/

二进制
Builds/x64/Debug/Praxis3D.exe


+ 38 - 0
Dependencies/include/bullet3/Bullet3Collision/BroadPhaseCollision/b3BroadphaseCallback.h

@@ -0,0 +1,38 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2013 Erwin Coumans  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef B3_BROADPHASE_CALLBACK_H
+#define B3_BROADPHASE_CALLBACK_H
+
+#include "Bullet3Common/b3Vector3.h"
+struct b3BroadphaseProxy;
+
+/// Abstract callback interface: implementers override process(), which is
+/// handed one broadphase proxy at a time.
+struct b3BroadphaseAabbCallback
+{
+	virtual ~b3BroadphaseAabbCallback() {}
+	// NOTE(review): the meaning of the bool result is not defined in this header — confirm with the callers.
+	virtual bool process(const b3BroadphaseProxy* proxy) = 0;
+};
+
+/// Ray-query flavour of b3BroadphaseAabbCallback. It adds no new virtual
+/// functions, only per-ray data members; process() stays pure virtual.
+struct b3BroadphaseRayCallback : public b3BroadphaseAabbCallback
+{
+	///added some cached data to accelerate ray-AABB tests
+	b3Vector3 m_rayDirectionInverse;  // presumably the component-wise reciprocal of the ray direction — TODO confirm
+	unsigned int m_signs[3];          // one entry per axis; presumably derived from the sign of m_rayDirectionInverse — TODO confirm
+	b3Scalar m_lambda_max;            // presumably the upper bound of the ray parameter — TODO confirm
+
+	virtual ~b3BroadphaseRayCallback() {}
+};
+
+#endif  //B3_BROADPHASE_CALLBACK_H

+ 1352 - 0
Dependencies/include/bullet3/Bullet3Collision/BroadPhaseCollision/b3DynamicBvh.cpp

@@ -0,0 +1,1352 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2013 Erwin Coumans  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+///b3DynamicBvh implementation by Nathanael Presson
+
+#include "b3DynamicBvh.h"
+
+//
+// Shorthand array types used by the helpers in this file: one holding mutable
+// node pointers, one holding pointers to const nodes.
+typedef b3AlignedObjectArray<b3DbvtNode*> b3NodeArray;
+typedef b3AlignedObjectArray<const b3DbvtNode*> b3ConstNodeArray;
+
+//
+/// ICollide policy that simply records, in visiting order, every node it is
+/// handed through the single-node Process() callback.
+struct b3DbvtNodeEnumerator : b3DynamicBvh::ICollide
+{
+	b3ConstNodeArray nodes;
+
+	void Process(const b3DbvtNode* visited)
+	{
+		nodes.push_back(visited);
+	}
+};
+
+//
+/// Returns the slot `node` occupies in its parent's childs[] array:
+/// 1 when it is childs[1], otherwise 0. `node` must have a parent.
+static B3_DBVT_INLINE int b3IndexOf(const b3DbvtNode* node)
+{
+	const b3DbvtNode* const secondChild = node->parent->childs[1];
+	return (secondChild == node) ? 1 : 0;
+}
+
+//
+/// Value-returning convenience wrapper around the three-argument b3Merge():
+/// merges `a` and `b` into a local volume and returns it by value.
+static B3_DBVT_INLINE b3DbvtVolume b3Merge(const b3DbvtVolume& a,
+										   const b3DbvtVolume& b)
+{
+#if (B3_DBVT_MERGE_IMPL == B3_DBVT_IMPL_SSE)
+	// SSE build: the result lives in an explicitly 16-byte aligned raw buffer
+	// that is reinterpreted as a volume.
+	B3_ATTRIBUTE_ALIGNED16(char locals[sizeof(b3DbvtAabbMm)]);
+	b3DbvtVolume& res = *(b3DbvtVolume*)locals;
+#else
+	b3DbvtVolume res;
+#endif
+	b3Merge(a, b, res);
+	return (res);
+}
+
+// volume+edge lengths
+/// Cost metric for a volume: the product of its three edge lengths plus
+/// their sum. b3BottomUp() uses it to pick the cheapest pair to merge.
+static B3_DBVT_INLINE b3Scalar b3Size(const b3DbvtVolume& a)
+{
+	const b3Vector3 edges = a.Lengths();
+	return (edges.x * edges.y * edges.z +
+			edges.x + edges.y + edges.z);
+}
+
+//
+/// Recursively walks the subtree under `node` and raises `maxdepth` to the
+/// largest `depth` value seen at any non-internal node. `depth` is the depth
+/// assigned to `node` itself; children are visited with depth + 1.
+static void b3GetMaxDepth(const b3DbvtNode* node, int depth, int& maxdepth)
+{
+	if (!node->isinternal())
+	{
+		maxdepth = b3Max(maxdepth, depth);
+		return;
+	}
+
+	const int childDepth = depth + 1;
+	b3GetMaxDepth(node->childs[0], childDepth, maxdepth);
+	b3GetMaxDepth(node->childs[1], childDepth, maxdepth);
+}
+
+//
+/// "Deletes" `node` by parking it in the tree's one-slot cache (m_free).
+/// The node that was cached before is the one actually released, so memory
+/// for `node` itself stays allocated until the next delete or clear().
+/// b3CreateNode() hands the cached node out again instead of allocating.
+static B3_DBVT_INLINE void b3DeleteNode(b3DynamicBvh* pdbvt,
+										b3DbvtNode* node)
+{
+	b3AlignedFree(pdbvt->m_free);  // release the previously cached node (m_free may be 0)
+	pdbvt->m_free = node;
+}
+
+//
+/// Deletes the whole subtree rooted at `node`, children first. If `node` is
+/// the tree's root, the root pointer is cleared before the node is deleted.
+static void b3RecurseDeleteNode(b3DynamicBvh* pdbvt,
+								b3DbvtNode* node)
+{
+	const bool hasChildren = !node->isleaf();
+	if (hasChildren)
+	{
+		for (int slot = 0; slot < 2; ++slot)
+		{
+			b3RecurseDeleteNode(pdbvt, node->childs[slot]);
+		}
+	}
+
+	if (pdbvt->m_root == node)
+	{
+		pdbvt->m_root = 0;
+	}
+	b3DeleteNode(pdbvt, node);
+}
+
+//
+/// Produces a node with the given parent and data and with childs[1] cleared.
+/// The node cached in pdbvt->m_free is reused when there is one (and the
+/// cache is emptied); otherwise a new node is placement-constructed in
+/// 16-byte aligned storage. The node's volume is left untouched.
+static B3_DBVT_INLINE b3DbvtNode* b3CreateNode(b3DynamicBvh* pdbvt,
+											   b3DbvtNode* parent,
+											   void* data)
+{
+	b3DbvtNode* result = pdbvt->m_free;
+	if (result)
+	{
+		// Take ownership of the cached node.
+		pdbvt->m_free = 0;
+	}
+	else
+	{
+		result = new (b3AlignedAlloc(sizeof(b3DbvtNode), 16)) b3DbvtNode();
+	}
+
+	result->parent = parent;
+	result->data = data;
+	result->childs[1] = 0;
+	return result;
+}
+
+//
+/// Same as the (pdbvt, parent, data) overload, and additionally copies
+/// `volume` into the new node.
+static B3_DBVT_INLINE b3DbvtNode* b3CreateNode(b3DynamicBvh* pdbvt,
+											   b3DbvtNode* parent,
+											   const b3DbvtVolume& volume,
+											   void* data)
+{
+	b3DbvtNode* const created = b3CreateNode(pdbvt, parent, data);
+	created->volume = volume;
+	return created;
+}
+
+//
+/// Same as the (pdbvt, parent, data) overload, and additionally sets the new
+/// node's volume to the merge of `volume0` and `volume1`.
+static B3_DBVT_INLINE b3DbvtNode* b3CreateNode(b3DynamicBvh* pdbvt,
+											   b3DbvtNode* parent,
+											   const b3DbvtVolume& volume0,
+											   const b3DbvtVolume& volume1,
+											   void* data)
+{
+	b3DbvtNode* const created = b3CreateNode(pdbvt, parent, data);
+	b3Merge(volume0, volume1, created->volume);
+	return created;
+}
+
+//
+/// Links `leaf` into the tree, starting the search for an insertion point at
+/// `root`.
+///  - Empty tree: `leaf` becomes the root.
+///  - Otherwise: descend from `root` (child chosen by b3Select) down to a leaf,
+///    then replace that leaf with a new internal node whose children are the
+///    old leaf (slot 0) and `leaf` (slot 1). Ancestor volumes are then refitted
+///    upwards until one already contains the volume below it.
+/// `root` is only dereferenced when the tree is non-empty.
+static void b3InsertLeaf(b3DynamicBvh* pdbvt,
+						 b3DbvtNode* root,
+						 b3DbvtNode* leaf)
+{
+	if (!pdbvt->m_root)
+	{
+		pdbvt->m_root = leaf;
+		leaf->parent = 0;
+	}
+	else
+	{
+		if (!root->isleaf())
+		{
+			// Walk down to the leaf that will become the new sibling.
+			do
+			{
+				root = root->childs[b3Select(leaf->volume,
+											 root->childs[0]->volume,
+											 root->childs[1]->volume)];
+			} while (!root->isleaf());
+		}
+		b3DbvtNode* prev = root->parent;
+		// New internal node, bounding both the inserted leaf and its sibling.
+		b3DbvtNode* node = b3CreateNode(pdbvt, prev, leaf->volume, root->volume, 0);
+		if (prev)
+		{
+			prev->childs[b3IndexOf(root)] = node;
+			node->childs[0] = root;
+			root->parent = node;
+			node->childs[1] = leaf;
+			leaf->parent = node;
+			// Refit ancestors; stop at the first one that already encloses its child.
+			do
+			{
+				if (!prev->volume.Contain(node->volume))
+					b3Merge(prev->childs[0]->volume, prev->childs[1]->volume, prev->volume);
+				else
+					break;
+				node = prev;
+			} while (0 != (prev = node->parent));
+		}
+		else
+		{
+			// The sibling was the root: the new internal node becomes the root.
+			node->childs[0] = root;
+			root->parent = node;
+			node->childs[1] = leaf;
+			leaf->parent = node;
+			pdbvt->m_root = node;
+		}
+	}
+}
+
+//
+/// Unlinks `leaf` from the tree without deleting it. The leaf's parent is
+/// replaced by the leaf's sibling and handed to b3DeleteNode().
+/// Returns 0 when `leaf` was the root (the tree is now empty). Otherwise
+/// returns the node at which upward refitting stopped, or the tree root when
+/// refitting ran past the top or the removed parent was the root.
+/// Note: leaf->parent is not modified here.
+static b3DbvtNode* b3RemoveLeaf(b3DynamicBvh* pdbvt,
+								b3DbvtNode* leaf)
+{
+	if (leaf == pdbvt->m_root)
+	{
+		pdbvt->m_root = 0;
+		return (0);
+	}
+	else
+	{
+		b3DbvtNode* parent = leaf->parent;
+		b3DbvtNode* prev = parent->parent;
+		b3DbvtNode* sibling = parent->childs[1 - b3IndexOf(leaf)];
+		if (prev)
+		{
+			// Grandparent adopts the sibling in place of the removed parent.
+			prev->childs[b3IndexOf(parent)] = sibling;
+			sibling->parent = prev;
+			b3DeleteNode(pdbvt, parent);
+			// Refit ancestors until a recomputed volume comes out unchanged.
+			while (prev)
+			{
+				const b3DbvtVolume pb = prev->volume;
+				b3Merge(prev->childs[0]->volume, prev->childs[1]->volume, prev->volume);
+				if (b3NotEqual(pb, prev->volume))
+				{
+					prev = prev->parent;
+				}
+				else
+					break;
+			}
+			return (prev ? prev : pdbvt->m_root);
+		}
+		else
+		{
+			// The parent was the root: the sibling becomes the new root.
+			pdbvt->m_root = sibling;
+			sibling->parent = 0;
+			b3DeleteNode(pdbvt, parent);
+			return (pdbvt->m_root);
+		}
+	}
+}
+
+//
+/// Collects the nodes under `root` into `leaves` and deletes the internal
+/// nodes that were descended through. Descent stops at non-internal nodes or
+/// when `depth` reaches 0; the node where it stops is appended as-is. The
+/// default depth of -1 never reaches 0 on the way down, so it does not limit
+/// the descent.
+static void b3FetchLeaves(b3DynamicBvh* pdbvt,
+						  b3DbvtNode* root,
+						  b3NodeArray& leaves,
+						  int depth = -1)
+{
+	if (!root->isinternal() || depth == 0)
+	{
+		leaves.push_back(root);
+		return;
+	}
+
+	const int remaining = depth - 1;
+	b3FetchLeaves(pdbvt, root->childs[0], leaves, remaining);
+	b3FetchLeaves(pdbvt, root->childs[1], leaves, remaining);
+	b3DeleteNode(pdbvt, root);
+}
+
+/// True when the centre of the node's volume, taken relative to `org`, has a
+/// non-positive projection onto `axis`.
+static bool b3LeftOfAxis(const b3DbvtNode* node,
+						 const b3Vector3& org,
+						 const b3Vector3& axis)
+{
+	const b3Vector3 offset = node->volume.Center() - org;
+	return b3Dot(axis, offset) <= 0;
+}
+
+// Partitions leaves in place such that leaves[0, n) are on the
+// left of axis (per b3LeftOfAxis), and leaves[n, count) are on
+// the right of axis. Returns n. The relative order of leaves
+// within each side is not preserved.
+static int b3Split(b3DbvtNode** leaves,
+				   int count,
+				   const b3Vector3& org,
+				   const b3Vector3& axis)
+{
+	int begin = 0;
+	int end = count;
+	for (;;)
+	{
+		// Skip leading leaves that are already on the left.
+		while (begin != end && b3LeftOfAxis(leaves[begin], org, axis))
+		{
+			++begin;
+		}
+
+		if (begin == end)
+		{
+			break;
+		}
+
+		// Skip trailing leaves that are already on the right.
+		while (begin != end && !b3LeftOfAxis(leaves[end - 1], org, axis))
+		{
+			--end;
+		}
+
+		if (begin == end)
+		{
+			break;
+		}
+
+		// swap out of place nodes
+		--end;
+		b3DbvtNode* temp = leaves[begin];
+		leaves[begin] = leaves[end];
+		leaves[end] = temp;
+		++begin;
+	}
+
+	return begin;
+}
+
+//
+/// Returns the merged volume of leaves[0 .. count). leaves[0] is read
+/// unconditionally, so `count` must be at least 1.
+static b3DbvtVolume b3Bounds(b3DbvtNode** leaves,
+							 int count)
+{
+#if B3_DBVT_MERGE_IMPL == B3_DBVT_IMPL_SSE
+	// SSE build: accumulate into an explicitly 16-byte aligned raw buffer.
+	B3_ATTRIBUTE_ALIGNED16(char locals[sizeof(b3DbvtVolume)]);
+	b3DbvtVolume& volume = *(b3DbvtVolume*)locals;
+	volume = leaves[0]->volume;
+#else
+	b3DbvtVolume volume = leaves[0]->volume;
+#endif
+	for (int i = 1, ni = count; i < ni; ++i)
+	{
+		b3Merge(volume, leaves[i]->volume, volume);
+	}
+	return (volume);
+}
+
+//
+/// Builds a subtree over leaves[0 .. count) by repeatedly pairing the two
+/// entries whose merged volume has the smallest b3Size(). Each round scans
+/// every pair, joins the winners under a new parent node and shrinks the
+/// working set by one. When the loop ends the subtree root is in leaves[0];
+/// the array contents are overwritten along the way.
+static void b3BottomUp(b3DynamicBvh* pdbvt,
+					   b3DbvtNode** leaves,
+					   int count)
+{
+	while (count > 1)
+	{
+		b3Scalar minsize = B3_INFINITY;
+		int minidx[2] = {-1, -1};
+		for (int i = 0; i < count; ++i)
+		{
+			for (int j = i + 1; j < count; ++j)
+			{
+				const b3Scalar sz = b3Size(b3Merge(leaves[i]->volume, leaves[j]->volume));
+				if (sz < minsize)
+				{
+					minsize = sz;
+					minidx[0] = i;
+					minidx[1] = j;
+				}
+			}
+		}
+		// NOTE(review): minidx stays {-1, -1} if no pair compares below B3_INFINITY (e.g. NaN sizes) — confirm inputs are finite.
+		b3DbvtNode* n[] = {leaves[minidx[0]], leaves[minidx[1]]};
+		b3DbvtNode* p = b3CreateNode(pdbvt, 0, n[0]->volume, n[1]->volume, 0);
+		p->childs[0] = n[0];
+		p->childs[1] = n[1];
+		n[0]->parent = p;
+		n[1]->parent = p;
+		// The new parent takes the first slot; the last entry fills the second.
+		leaves[minidx[0]] = p;
+		leaves[minidx[1]] = leaves[count - 1];
+		--count;
+	}
+}
+
+//
+/// Recursively builds a subtree over leaves[0 .. count) and returns its root
+/// (the returned node's parent is 0 for freshly created nodes; callers link it).
+///  - count <= 1: returns leaves[0] unchanged.
+///  - count <= bu_treshold: delegates to b3BottomUp().
+///  - otherwise: splits the leaves about the centre of their common bounds
+///    along the coordinate axis that gives the most even two-sided split,
+///    and recurses on both halves. If no axis separates the leaves, the
+///    array is cut at count / 2 + 1 instead.
+static b3DbvtNode* b3TopDown(b3DynamicBvh* pdbvt,
+							 b3DbvtNode** leaves,
+							 int count,
+							 int bu_treshold)
+{
+	static const b3Vector3 axis[] = {b3MakeVector3(1, 0, 0),
+									 b3MakeVector3(0, 1, 0),
+									 b3MakeVector3(0, 0, 1)};
+	b3Assert(bu_treshold > 1);
+	if (count > 1)
+	{
+		if (count > bu_treshold)
+		{
+			const b3DbvtVolume vol = b3Bounds(leaves, count);
+			const b3Vector3 org = vol.Center();
+			int partition;
+			int bestaxis = -1;
+			int bestmidp = count;
+			// splitcount[axis][side]: how many leaf centres fall on each side of org.
+			int splitcount[3][2] = {{0, 0}, {0, 0}, {0, 0}};
+			int i;
+			for (i = 0; i < count; ++i)
+			{
+				const b3Vector3 x = leaves[i]->volume.Center() - org;
+				for (int j = 0; j < 3; ++j)
+				{
+					++splitcount[j][b3Dot(x, axis[j]) > 0 ? 1 : 0];
+				}
+			}
+			// Pick the axis with leaves on both sides and the smallest imbalance.
+			for (i = 0; i < 3; ++i)
+			{
+				if ((splitcount[i][0] > 0) && (splitcount[i][1] > 0))
+				{
+					const int midp = (int)b3Fabs(b3Scalar(splitcount[i][0] - splitcount[i][1]));
+					if (midp < bestmidp)
+					{
+						bestaxis = i;
+						bestmidp = midp;
+					}
+				}
+			}
+			if (bestaxis >= 0)
+			{
+				partition = b3Split(leaves, count, org, axis[bestaxis]);
+				b3Assert(partition != 0 && partition != count);
+			}
+			else
+			{
+				partition = count / 2 + 1;
+			}
+			b3DbvtNode* node = b3CreateNode(pdbvt, 0, vol, 0);
+			node->childs[0] = b3TopDown(pdbvt, &leaves[0], partition, bu_treshold);
+			node->childs[1] = b3TopDown(pdbvt, &leaves[partition], count - partition, bu_treshold);
+			node->childs[0]->parent = node;
+			node->childs[1]->parent = node;
+			return (node);
+		}
+		else
+		{
+			b3BottomUp(pdbvt, leaves, count);
+			return (leaves[0]);
+		}
+	}
+	return (leaves[0]);
+}
+
+//
+/// If the parent of internal node `n` sits at a higher memory address than
+/// `n`, exchanges the two nodes' positions in the tree (links and volumes) so
+/// that `n` ends up above its former parent, and returns the former parent.
+/// Otherwise leaves the tree alone and returns `n`. `r` is the tree's root
+/// pointer and is updated when `n` takes over the root position.
+static B3_DBVT_INLINE b3DbvtNode* b3Sort(b3DbvtNode* n, b3DbvtNode*& r)
+{
+	b3DbvtNode* p = n->parent;
+	b3Assert(n->isinternal());
+	if (p > n)  // address comparison; false when p is 0 (n is the root)
+	{
+		const int i = b3IndexOf(n);
+		const int j = 1 - i;
+		b3DbvtNode* s = p->childs[j];  // n's sibling
+		b3DbvtNode* q = p->parent;     // n's grandparent, may be 0
+		b3Assert(n == p->childs[i]);
+		if (q)
+			q->childs[b3IndexOf(p)] = n;
+		else
+			r = n;
+		s->parent = n;
+		p->parent = n;
+		n->parent = q;
+		// p inherits n's former children ...
+		p->childs[0] = n->childs[0];
+		p->childs[1] = n->childs[1];
+		n->childs[0]->parent = p;
+		n->childs[1]->parent = p;
+		// ... and n takes p's former place, keeping p and s in their original slots.
+		n->childs[i] = p;
+		n->childs[j] = s;
+		b3Swap(p->volume, n->volume);
+		return (p);
+	}
+	return (n);
+}
+
+// Disabled helper (compiled out by #if 0): follows parent links upwards from
+// n for at most `count` steps, stopping early at a null parent.
+#if 0
+static B3_DBVT_INLINE b3DbvtNode*	walkup(b3DbvtNode* n,int count)
+{
+	while(n&&(count--)) n=n->parent;
+	return(n);
+}
+#endif
+
+//
+// Api
+//
+
+//
+/// Constructs an empty tree: no root, no cached free node, zero leaves,
+/// optimisation path reset and lookahead set to -1.
+b3DynamicBvh::b3DynamicBvh()
+{
+	// Node pointers.
+	m_root = m_free = 0;
+
+	// Counters and tuning state.
+	m_leaves = 0;
+	m_opath = 0;
+	m_lkhd = -1;
+}
+
+//
+/// Releases every node still owned by the tree by delegating to clear().
+b3DynamicBvh::~b3DynamicBvh()
+{
+	clear();
+}
+
+//
+/// Deletes every node of the tree and returns the object to its
+/// freshly-constructed state. Any node pointers previously handed out by
+/// insert() are invalid afterwards.
+void b3DynamicBvh::clear()
+{
+	if (m_root)
+		b3RecurseDeleteNode(this, m_root);
+	// b3DeleteNode() keeps the most recently deleted node cached in m_free;
+	// release that one too.
+	b3AlignedFree(m_free);
+	m_free = 0;
+	m_lkhd = -1;
+	m_stkStack.clear();
+	m_opath = 0;
+	// The tree is empty now. Without this reset the leaf counter kept its old
+	// value, which inflated later reserve() hints and the default pass count
+	// of optimizeIncremental() once the tree was refilled.
+	m_leaves = 0;
+}
+
+//
+/// Rebuilds the whole tree bottom-up: gathers all leaves (discarding the
+/// current internal nodes) and re-pairs them with b3BottomUp(). Does nothing
+/// for an empty tree.
+void b3DynamicBvh::optimizeBottomUp()
+{
+	if (!m_root)
+	{
+		return;
+	}
+
+	b3NodeArray collected;
+	collected.reserve(m_leaves);
+	b3FetchLeaves(this, m_root, collected);
+	b3BottomUp(this, &collected[0], collected.size());
+	// b3BottomUp leaves the new root in the first array slot.
+	m_root = collected[0];
+}
+
+//
+/// Rebuilds the whole tree top-down: gathers all leaves (discarding the
+/// current internal nodes) and rebuilds with b3TopDown(), which switches to
+/// bottom-up pairing for groups of at most `bu_treshold` leaves. Does nothing
+/// for an empty tree.
+void b3DynamicBvh::optimizeTopDown(int bu_treshold)
+{
+	if (!m_root)
+	{
+		return;
+	}
+
+	b3NodeArray collected;
+	collected.reserve(m_leaves);
+	b3FetchLeaves(this, m_root, collected);
+	b3DbvtNode* const rebuilt = b3TopDown(this, &collected[0], collected.size(), bu_treshold);
+	m_root = rebuilt;
+}
+
+//
+/// Runs `passes` incremental optimisation steps; a negative value means one
+/// pass per leaf (m_leaves). Each pass walks from the root down to a leaf,
+/// applying b3Sort() to every internal node on the way, then removes and
+/// re-inserts that leaf via update(). The branch taken at each level comes
+/// from successive bits of m_opath, which is incremented after every pass so
+/// that consecutive passes follow different paths.
+void b3DynamicBvh::optimizeIncremental(int passes)
+{
+	if (passes < 0) passes = m_leaves;
+	if (m_root && (passes > 0))
+	{
+		do
+		{
+			b3DbvtNode* node = m_root;
+			unsigned bit = 0;
+			while (node->isinternal())
+			{
+				node = b3Sort(node, m_root)->childs[(m_opath >> bit) & 1];
+				// Wrap the bit index so the shift stays below the width of unsigned.
+				bit = (bit + 1) & (sizeof(unsigned) * 8 - 1);
+			}
+			update(node);
+			++m_opath;
+		} while (--passes);
+	}
+}
+
+//
+/// Creates a leaf carrying `volume` and `data`, links it into the tree
+/// (searching from the root), bumps the leaf counter and returns the leaf.
+b3DbvtNode* b3DynamicBvh::insert(const b3DbvtVolume& volume, void* data)
+{
+	b3DbvtNode* const created = b3CreateNode(this, 0, volume, data);
+	b3InsertLeaf(this, m_root, created);
+	m_leaves += 1;
+	return created;
+}
+
+//
+/// Removes `leaf` and re-inserts it with its current volume. Re-insertion
+/// starts `lookahead` levels above the node returned by the removal (clamped
+/// at the top of the tree); a negative `lookahead` restarts from the root.
+void b3DynamicBvh::update(b3DbvtNode* leaf, int lookahead)
+{
+	b3DbvtNode* start = b3RemoveLeaf(this, leaf);
+	if (start)
+	{
+		if (lookahead < 0)
+		{
+			start = m_root;
+		}
+		else
+		{
+			int climbed = 0;
+			while ((climbed < lookahead) && start->parent)
+			{
+				start = start->parent;
+				++climbed;
+			}
+		}
+	}
+	b3InsertLeaf(this, start, leaf);
+}
+
+//
+/// Removes `leaf`, assigns it `volume` and re-inserts it. Re-insertion starts
+/// m_lkhd levels above the node returned by the removal (clamped at the top
+/// of the tree); a negative m_lkhd restarts from the root.
+void b3DynamicBvh::update(b3DbvtNode* leaf, b3DbvtVolume& volume)
+{
+	b3DbvtNode* start = b3RemoveLeaf(this, leaf);
+	if (start)
+	{
+		if (m_lkhd < 0)
+		{
+			start = m_root;
+		}
+		else
+		{
+			int climbed = 0;
+			while ((climbed < m_lkhd) && start->parent)
+			{
+				start = start->parent;
+				++climbed;
+			}
+		}
+	}
+	leaf->volume = volume;
+	b3InsertLeaf(this, start, leaf);
+}
+
+//
+/// Updates `leaf` only if its stored volume no longer contains `volume`.
+/// In that case `volume` is modified in place — grown by `margin` on every
+/// axis, then SignedExpand()-ed by `velocity` — and the leaf is re-inserted
+/// with it. Returns true when the leaf was re-inserted, false otherwise.
+bool b3DynamicBvh::update(b3DbvtNode* leaf, b3DbvtVolume& volume, const b3Vector3& velocity, b3Scalar margin)
+{
+	const bool stillContained = leaf->volume.Contain(volume);
+	if (stillContained)
+	{
+		return false;
+	}
+
+	volume.Expand(b3MakeVector3(margin, margin, margin));
+	volume.SignedExpand(velocity);
+	update(leaf, volume);
+	return true;
+}
+
+//
+/// Updates `leaf` only if its stored volume no longer contains `volume`.
+/// In that case `volume` is SignedExpand()-ed by `velocity` in place and the
+/// leaf is re-inserted with it. Returns true when the leaf was re-inserted.
+bool b3DynamicBvh::update(b3DbvtNode* leaf, b3DbvtVolume& volume, const b3Vector3& velocity)
+{
+	const bool stillContained = leaf->volume.Contain(volume);
+	if (stillContained)
+	{
+		return false;
+	}
+
+	volume.SignedExpand(velocity);
+	update(leaf, volume);
+	return true;
+}
+
+//
+/// Updates `leaf` only if its stored volume no longer contains `volume`.
+/// In that case `volume` is grown by `margin` on every axis in place and the
+/// leaf is re-inserted with it. Returns true when the leaf was re-inserted.
+bool b3DynamicBvh::update(b3DbvtNode* leaf, b3DbvtVolume& volume, b3Scalar margin)
+{
+	const bool stillContained = leaf->volume.Contain(volume);
+	if (stillContained)
+	{
+		return false;
+	}
+
+	volume.Expand(b3MakeVector3(margin, margin, margin));
+	update(leaf, volume);
+	return true;
+}
+
+//
+/// Unlinks `leaf` from the tree, hands it to b3DeleteNode() and decrements
+/// the leaf counter. `leaf` must not be used by the caller afterwards.
+void b3DynamicBvh::remove(b3DbvtNode* leaf)
+{
+	b3RemoveLeaf(this, leaf);
+	b3DeleteNode(this, leaf);
+	--m_leaves;
+}
+
+//
+/// Serialises the tree through `iwriter`: enumerates all nodes, calls
+/// Prepare() with the root and node count, then emits one WriteNode() or
+/// WriteLeaf() per node. Nodes are identified by their index in the
+/// enumeration; a parent index of -1 marks a node without parent.
+/// Indices are resolved with a linear search per lookup.
+// NOTE(review): m_root is passed to enumNodes() without a null check — confirm enumNodes() tolerates an empty tree.
+void b3DynamicBvh::write(IWriter* iwriter) const
+{
+	b3DbvtNodeEnumerator nodes;
+	nodes.nodes.reserve(m_leaves * 2);
+	enumNodes(m_root, nodes);
+	iwriter->Prepare(m_root, nodes.nodes.size());
+	for (int i = 0; i < nodes.nodes.size(); ++i)
+	{
+		const b3DbvtNode* n = nodes.nodes[i];
+		int p = -1;
+		if (n->parent) p = nodes.nodes.findLinearSearch(n->parent);
+		if (n->isinternal())
+		{
+			const int c0 = nodes.nodes.findLinearSearch(n->childs[0]);
+			const int c1 = nodes.nodes.findLinearSearch(n->childs[1]);
+			iwriter->WriteNode(n, i, p, c0, c1);
+		}
+		else
+		{
+			iwriter->WriteLeaf(n, i, p);
+		}
+	}
+}
+
+//
+/// Replaces the contents of `dest` with a structural copy of this tree.
+/// `dest` is cleared first; every node is then re-created with the source
+/// node's volume and data, and `iclone->CloneLeaf()` is called once for each
+/// copied leaf. `iclone` is dereferenced unconditionally when the tree has
+/// leaves. The leaf counter of `dest` is set to match the copied tree.
+void b3DynamicBvh::clone(b3DynamicBvh& dest, IClone* iclone) const
+{
+	dest.clear();
+	if (m_root != 0)
+	{
+		// Explicit stack of (source node, already-cloned parent) pairs.
+		b3AlignedObjectArray<sStkCLN> stack;
+		stack.reserve(m_leaves);
+		stack.push_back(sStkCLN(m_root, 0));
+		do
+		{
+			const int i = stack.size() - 1;
+			const sStkCLN e = stack[i];
+			b3DbvtNode* n = b3CreateNode(&dest, e.parent, e.node->volume, e.node->data);
+			stack.pop_back();
+			// Two siblings always sit at adjacent stack indices, so i & 1 gives
+			// each a distinct slot in the parent; the slot is not necessarily
+			// the one the child occupies in the source tree.
+			if (e.parent != 0)
+				e.parent->childs[i & 1] = n;
+			else
+				dest.m_root = n;
+			if (e.node->isinternal())
+			{
+				stack.push_back(sStkCLN(e.node->childs[0], n));
+				stack.push_back(sStkCLN(e.node->childs[1], n));
+			}
+			else
+			{
+				iclone->CloneLeaf(n);
+			}
+		} while (stack.size() > 0);
+	}
+	// The nodes were created directly rather than through insert(), so the
+	// leaf counter has to be carried over by hand; otherwise the clone would
+	// report a stale count and optimizeIncremental(-1) on it would run the
+	// wrong number of passes.
+	dest.m_leaves = (m_root != 0) ? m_leaves : 0;
+}
+
+//
+/// Returns the depth of the deepest leaf under `node`, counting `node`
+/// itself as depth 1. A null `node` yields 0.
+int b3DynamicBvh::maxdepth(const b3DbvtNode* node)
+{
+	int deepest = 0;
+	if (node != 0)
+	{
+		b3GetMaxDepth(node, 1, deepest);
+	}
+	return deepest;
+}
+
+//
+/// Returns the number of leaves in the subtree rooted at `node`.
+/// A null `node` (e.g. the root of an empty tree) yields 0, matching the
+/// null handling of maxdepth(); previously it was dereferenced.
+int b3DynamicBvh::countLeaves(const b3DbvtNode* node)
+{
+	if (!node)
+		return (0);
+	if (node->isinternal())
+		return (countLeaves(node->childs[0]) + countLeaves(node->childs[1]));
+	else
+		return (1);
+}
+
+//
+/// Appends every leaf of the subtree rooted at `node` to `leaves`, visiting
+/// childs[0] before childs[1]. A null `node` (e.g. the root of an empty
+/// tree) appends nothing; previously it was dereferenced.
+void b3DynamicBvh::extractLeaves(const b3DbvtNode* node, b3AlignedObjectArray<const b3DbvtNode*>& leaves)
+{
+	if (!node)
+	{
+		return;
+	}
+	if (node->isinternal())
+	{
+		extractLeaves(node->childs[0], leaves);
+		extractLeaves(node->childs[1], leaves);
+	}
+	else
+	{
+		leaves.push_back(node);
+	}
+}
+
+//
+#if B3_DBVT_ENABLE_BENCHMARK
+
+#include <stdio.h>
+#include <stdlib.h>
+
+/*
+q6600,2.4ghz
+
+/Ox /Ob2 /Oi /Ot /I "." /I "..\.." /I "..\..\src" /D "NDEBUG" /D "_LIB" /D "_WINDOWS" /D "_CRT_SECURE_NO_DEPRECATE" /D "_CRT_NONSTDC_NO_DEPRECATE" /D "WIN32"
+/GF /FD /MT /GS- /Gy /arch:SSE2 /Zc:wchar_t- /Fp"..\..\out\release8\build\libbulletcollision\libbulletcollision.pch"
+/Fo"..\..\out\release8\build\libbulletcollision\\"
+/Fd"..\..\out\release8\build\libbulletcollision\bulletcollision.pdb"
+/W3 /nologo /c /Wp64 /Zi /errorReport:prompt
+
+Benchmarking dbvt...
+World scale: 100.000000
+Extents base: 1.000000
+Extents range: 4.000000
+Leaves: 8192
+sizeof(b3DbvtVolume): 32 bytes
+sizeof(b3DbvtNode):   44 bytes
+[1] b3DbvtVolume intersections: 3499 ms (-1%)
+[2] b3DbvtVolume merges: 1934 ms (0%)
+[3] b3DynamicBvh::collideTT: 5485 ms (-21%)
+[4] b3DynamicBvh::collideTT self: 2814 ms (-20%)
+[5] b3DynamicBvh::collideTT xform: 7379 ms (-1%)
+[6] b3DynamicBvh::collideTT xform,self: 7270 ms (-2%)
+[7] b3DynamicBvh::rayTest: 6314 ms (0%),(332143 r/s)
+[8] insert/remove: 2093 ms (0%),(1001983 ir/s)
+[9] updates (teleport): 1879 ms (-3%),(1116100 u/s)
+[10] updates (jitter): 1244 ms (-4%),(1685813 u/s)
+[11] optimize (incremental): 2514 ms (0%),(1668000 o/s)
+[12] b3DbvtVolume notequal: 3659 ms (0%)
+[13] culling(OCL+fullsort): 2218 ms (0%),(461 t/s)
+[14] culling(OCL+qsort): 3688 ms (5%),(2221 t/s)
+[15] culling(KDOP+qsort): 1139 ms (-1%),(7192 t/s)
+[16] insert/remove batch(256): 5092 ms (0%),(823704 bir/s)
+[17] b3DbvtVolume select: 3419 ms (0%)
+*/
+
+/// Collision policies and random-data generators for the benchmark code,
+/// which is only compiled when B3_DBVT_ENABLE_BENCHMARK is set.
+struct b3DbvtBenchmark
+{
+	/// Policy that only counts callbacks. The depth-carrying overload also
+	/// checks (when m_checksort is set) that depths arrive in non-decreasing
+	/// order and prints a message when they do not.
+	struct NilPolicy : b3DynamicBvh::ICollide
+	{
+		NilPolicy() : m_pcount(0), m_depth(-B3_INFINITY), m_checksort(true) {}
+		void Process(const b3DbvtNode*, const b3DbvtNode*) { ++m_pcount; }
+		void Process(const b3DbvtNode*) { ++m_pcount; }
+		void Process(const b3DbvtNode*, b3Scalar depth)
+		{
+			++m_pcount;
+			if (m_checksort)
+			{
+				if (depth >= m_depth)
+					m_depth = depth;
+				else
+					printf("wrong depth: %f (should be >= %f)\r\n", depth, m_depth);
+			}
+		}
+		int m_pcount;
+		b3Scalar m_depth;
+		bool m_checksort;
+	};
+	/// Policy that records each reported leaf together with the depth it was
+	/// reported at, so the records can be sorted afterwards with sortfnc().
+	struct P14 : b3DynamicBvh::ICollide
+	{
+		struct Node
+		{
+			const b3DbvtNode* leaf;
+			b3Scalar depth;
+		};
+		void Process(const b3DbvtNode* leaf, b3Scalar depth)
+		{
+			Node n;
+			n.leaf = leaf;
+			n.depth = depth;
+			// Store the record; it used to be built and then dropped, which
+			// left m_nodes permanently empty.
+			m_nodes.push_back(n);
+		}
+		/// Orders records by descending depth.
+		static int sortfnc(const Node& a, const Node& b)
+		{
+			if (a.depth < b.depth) return (+1);
+			if (a.depth > b.depth) return (-1);
+			return (0);
+		}
+		b3AlignedObjectArray<Node> m_nodes;
+	};
+	/// Policy that records each reported leaf together with the projection of
+	/// its volume centre onto m_axis, so the records can be sorted afterwards.
+	struct P15 : b3DynamicBvh::ICollide
+	{
+		struct Node
+		{
+			const b3DbvtNode* leaf;
+			b3Scalar depth;
+		};
+		void Process(const b3DbvtNode* leaf)
+		{
+			Node n;
+			n.leaf = leaf;
+			// b3Dot is the dot-product helper used by the rest of this file.
+			n.depth = b3Dot(leaf->volume.Center(), m_axis);
+			// Store the record; it used to be built and then dropped, which
+			// left m_nodes permanently empty.
+			m_nodes.push_back(n);
+		}
+		/// Orders records by descending depth.
+		static int sortfnc(const Node& a, const Node& b)
+		{
+			if (a.depth < b.depth) return (+1);
+			if (a.depth > b.depth) return (-1);
+			return (0);
+		}
+		b3AlignedObjectArray<Node> m_nodes;
+		b3Vector3 m_axis;
+	};
+	/// Random scalar in [0, 1] drawn from rand().
+	static b3Scalar RandUnit()
+	{
+		return (rand() / (b3Scalar)RAND_MAX);
+	}
+	/// Random vector with each component in [0, 1]. Vectors are built with
+	/// b3MakeVector3, like everywhere else in this file.
+	static b3Vector3 RandVector3()
+	{
+		return (b3MakeVector3(RandUnit(), RandUnit(), RandUnit()));
+	}
+	/// Random vector with each component in [-cs / 2, cs / 2].
+	static b3Vector3 RandVector3(b3Scalar cs)
+	{
+		return (RandVector3() * cs - b3MakeVector3(cs, cs, cs) / 2);
+	}
+	/// Random volume built with FromCE() from a random centre (scale `cs`)
+	/// and extents of `eb` plus a random amount of up to `es` per axis.
+	static b3DbvtVolume RandVolume(b3Scalar cs, b3Scalar eb, b3Scalar es)
+	{
+		return (b3DbvtVolume::FromCE(RandVector3(cs), b3MakeVector3(eb, eb, eb) + RandVector3() * es));
+	}
+	/// Random transform: origin from RandVector3(cs), rotation from three
+	/// random angles in [0, 2*pi].
+	static b3Transform RandTransform(b3Scalar cs)
+	{
+		b3Transform t;
+		t.setOrigin(RandVector3(cs));
+		t.setRotation(b3Quaternion(RandUnit() * B3_PI * 2, RandUnit() * B3_PI * 2, RandUnit() * B3_PI * 2).normalized());
+		return (t);
+	}
+	/// Clears `dbvt` and fills it with `leaves` random volumes (data = 0).
+	static void RandTree(b3Scalar cs, b3Scalar eb, b3Scalar es, int leaves, b3DynamicBvh& dbvt)
+	{
+		dbvt.clear();
+		for (int i = 0; i < leaves; ++i)
+		{
+			dbvt.insert(RandVolume(cs, eb, es), 0);
+		}
+	}
+};
+
+// Signed percentage by which `time` deviates from `reference`, relative to `time`.
+// Returns 0 when no whole millisecond elapsed, so the report never divides by zero.
+static int b3DbvtBenchmarkPercent(int time, int reference)
+{
+	return ((time > 0) ? ((time - reference) * 100 / time) : 0);
+}
+
+// Operations per second for `count` operations completed in `time` milliseconds.
+// Evaluated in double so that count * 1000 cannot overflow a 32-bit integer and no
+// precision is lost by dividing before scaling. Returns 0 when `time` is 0.
+static unsigned b3DbvtBenchmarkRate(double count, int time)
+{
+	return ((time > 0) ? (unsigned)(count * 1000.0 / time) : 0u);
+}
+
+// Runs the seventeen dbvt micro-benchmarks configured below and prints, for each one,
+// the elapsed wall-clock time in milliseconds, its deviation from the recorded
+// reference time in percent and, where applicable, a throughput figure.
+// Every benchmark reseeds rand() with the same constant before generating its input.
+void b3DynamicBvh::benchmark()
+{
+	static const b3Scalar cfgVolumeCenterScale = 100;
+	static const b3Scalar cfgVolumeExentsBase = 1;
+	static const b3Scalar cfgVolumeExentsScale = 4;
+	static const int cfgLeaves = 8192;
+	static const bool cfgEnable = true;
+
+	//[1] b3DbvtVolume intersections
+	bool cfgBenchmark1_Enable = cfgEnable;
+	static const int cfgBenchmark1_Iterations = 8;
+	static const int cfgBenchmark1_Reference = 3499;
+	//[2] b3DbvtVolume merges
+	bool cfgBenchmark2_Enable = cfgEnable;
+	static const int cfgBenchmark2_Iterations = 4;
+	static const int cfgBenchmark2_Reference = 1945;
+	//[3] b3DynamicBvh::collideTT
+	bool cfgBenchmark3_Enable = cfgEnable;
+	static const int cfgBenchmark3_Iterations = 512;
+	static const int cfgBenchmark3_Reference = 5485;
+	//[4] b3DynamicBvh::collideTT self
+	bool cfgBenchmark4_Enable = cfgEnable;
+	static const int cfgBenchmark4_Iterations = 512;
+	static const int cfgBenchmark4_Reference = 2814;
+	//[5] b3DynamicBvh::collideTT xform
+	bool cfgBenchmark5_Enable = cfgEnable;
+	static const int cfgBenchmark5_Iterations = 512;
+	static const b3Scalar cfgBenchmark5_OffsetScale = 2;
+	static const int cfgBenchmark5_Reference = 7379;
+	//[6] b3DynamicBvh::collideTT xform,self
+	bool cfgBenchmark6_Enable = cfgEnable;
+	static const int cfgBenchmark6_Iterations = 512;
+	static const b3Scalar cfgBenchmark6_OffsetScale = 2;
+	static const int cfgBenchmark6_Reference = 7270;
+	//[7] b3DynamicBvh::rayTest
+	bool cfgBenchmark7_Enable = cfgEnable;
+	static const int cfgBenchmark7_Passes = 32;
+	static const int cfgBenchmark7_Iterations = 65536;
+	static const int cfgBenchmark7_Reference = 6307;
+	//[8] insert/remove
+	bool cfgBenchmark8_Enable = cfgEnable;
+	static const int cfgBenchmark8_Passes = 32;
+	static const int cfgBenchmark8_Iterations = 65536;
+	static const int cfgBenchmark8_Reference = 2105;
+	//[9] updates (teleport)
+	bool cfgBenchmark9_Enable = cfgEnable;
+	static const int cfgBenchmark9_Passes = 32;
+	static const int cfgBenchmark9_Iterations = 65536;
+	static const int cfgBenchmark9_Reference = 1879;
+	//[10] updates (jitter)
+	bool cfgBenchmark10_Enable = cfgEnable;
+	static const b3Scalar cfgBenchmark10_Scale = cfgVolumeCenterScale / 10000;
+	static const int cfgBenchmark10_Passes = 32;
+	static const int cfgBenchmark10_Iterations = 65536;
+	static const int cfgBenchmark10_Reference = 1244;
+	//[11] optimize (incremental)
+	bool cfgBenchmark11_Enable = cfgEnable;
+	static const int cfgBenchmark11_Passes = 64;
+	static const int cfgBenchmark11_Iterations = 65536;
+	static const int cfgBenchmark11_Reference = 2510;
+	//[12] b3DbvtVolume notequal
+	bool cfgBenchmark12_Enable = cfgEnable;
+	static const int cfgBenchmark12_Iterations = 32;
+	static const int cfgBenchmark12_Reference = 3677;
+	//[13] culling(OCL+fullsort)
+	bool cfgBenchmark13_Enable = cfgEnable;
+	static const int cfgBenchmark13_Iterations = 1024;
+	static const int cfgBenchmark13_Reference = 2231;
+	//[14] culling(OCL+qsort)
+	bool cfgBenchmark14_Enable = cfgEnable;
+	static const int cfgBenchmark14_Iterations = 8192;
+	static const int cfgBenchmark14_Reference = 3500;
+	//[15] culling(KDOP+qsort)
+	bool cfgBenchmark15_Enable = cfgEnable;
+	static const int cfgBenchmark15_Iterations = 8192;
+	static const int cfgBenchmark15_Reference = 1151;
+	//[16] insert/remove batch
+	bool cfgBenchmark16_Enable = cfgEnable;
+	static const int cfgBenchmark16_BatchCount = 256;
+	static const int cfgBenchmark16_Passes = 16384;
+	static const int cfgBenchmark16_Reference = 5138;
+	//[17] select
+	bool cfgBenchmark17_Enable = cfgEnable;
+	static const int cfgBenchmark17_Iterations = 4;
+	static const int cfgBenchmark17_Reference = 3390;
+
+	b3Clock wallclock;
+	printf("Benchmarking dbvt...\r\n");
+	printf("\tWorld scale: %f\r\n", cfgVolumeCenterScale);
+	printf("\tExtents base: %f\r\n", cfgVolumeExentsBase);
+	printf("\tExtents range: %f\r\n", cfgVolumeExentsScale);
+	printf("\tLeaves: %u\r\n", cfgLeaves);
+	// sizeof yields size_t, which does not match %u on 64-bit targets; convert explicitly.
+	printf("\tsizeof(b3DbvtVolume): %u bytes\r\n", (unsigned)sizeof(b3DbvtVolume));
+	printf("\tsizeof(b3DbvtNode):   %u bytes\r\n", (unsigned)sizeof(b3DbvtNode));
+	if (cfgBenchmark1_Enable)
+	{  // Benchmark 1
+		srand(380843);
+		b3AlignedObjectArray<b3DbvtVolume> volumes;
+		b3AlignedObjectArray<bool> results;
+		volumes.resize(cfgLeaves);
+		results.resize(cfgLeaves);
+		for (int i = 0; i < cfgLeaves; ++i)
+		{
+			volumes[i] = b3DbvtBenchmark::RandVolume(cfgVolumeCenterScale, cfgVolumeExentsBase, cfgVolumeExentsScale);
+		}
+		printf("[1] b3DbvtVolume intersections: ");
+		wallclock.reset();
+		for (int i = 0; i < cfgBenchmark1_Iterations; ++i)
+		{
+			for (int j = 0; j < cfgLeaves; ++j)
+			{
+				for (int k = 0; k < cfgLeaves; ++k)
+				{
+					results[k] = Intersect(volumes[j], volumes[k]);
+				}
+			}
+		}
+		const int time = (int)wallclock.getTimeMilliseconds();
+		printf("%u ms (%i%%)\r\n", time, b3DbvtBenchmarkPercent(time, cfgBenchmark1_Reference));
+	}
+	if (cfgBenchmark2_Enable)
+	{  // Benchmark 2
+		srand(380843);
+		b3AlignedObjectArray<b3DbvtVolume> volumes;
+		b3AlignedObjectArray<b3DbvtVolume> results;
+		volumes.resize(cfgLeaves);
+		results.resize(cfgLeaves);
+		for (int i = 0; i < cfgLeaves; ++i)
+		{
+			volumes[i] = b3DbvtBenchmark::RandVolume(cfgVolumeCenterScale, cfgVolumeExentsBase, cfgVolumeExentsScale);
+		}
+		printf("[2] b3DbvtVolume merges: ");
+		wallclock.reset();
+		for (int i = 0; i < cfgBenchmark2_Iterations; ++i)
+		{
+			for (int j = 0; j < cfgLeaves; ++j)
+			{
+				for (int k = 0; k < cfgLeaves; ++k)
+				{
+					Merge(volumes[j], volumes[k], results[k]);
+				}
+			}
+		}
+		const int time = (int)wallclock.getTimeMilliseconds();
+		printf("%u ms (%i%%)\r\n", time, b3DbvtBenchmarkPercent(time, cfgBenchmark2_Reference));
+	}
+	if (cfgBenchmark3_Enable)
+	{  // Benchmark 3
+		srand(380843);
+		b3DynamicBvh dbvt[2];
+		b3DbvtBenchmark::NilPolicy policy;
+		b3DbvtBenchmark::RandTree(cfgVolumeCenterScale, cfgVolumeExentsBase, cfgVolumeExentsScale, cfgLeaves, dbvt[0]);
+		b3DbvtBenchmark::RandTree(cfgVolumeCenterScale, cfgVolumeExentsBase, cfgVolumeExentsScale, cfgLeaves, dbvt[1]);
+		dbvt[0].optimizeTopDown();
+		dbvt[1].optimizeTopDown();
+		printf("[3] b3DynamicBvh::collideTT: ");
+		wallclock.reset();
+		for (int i = 0; i < cfgBenchmark3_Iterations; ++i)
+		{
+			b3DynamicBvh::collideTT(dbvt[0].m_root, dbvt[1].m_root, policy);
+		}
+		const int time = (int)wallclock.getTimeMilliseconds();
+		printf("%u ms (%i%%)\r\n", time, b3DbvtBenchmarkPercent(time, cfgBenchmark3_Reference));
+	}
+	if (cfgBenchmark4_Enable)
+	{  // Benchmark 4
+		srand(380843);
+		b3DynamicBvh dbvt;
+		b3DbvtBenchmark::NilPolicy policy;
+		b3DbvtBenchmark::RandTree(cfgVolumeCenterScale, cfgVolumeExentsBase, cfgVolumeExentsScale, cfgLeaves, dbvt);
+		dbvt.optimizeTopDown();
+		printf("[4] b3DynamicBvh::collideTT self: ");
+		wallclock.reset();
+		for (int i = 0; i < cfgBenchmark4_Iterations; ++i)
+		{
+			b3DynamicBvh::collideTT(dbvt.m_root, dbvt.m_root, policy);
+		}
+		const int time = (int)wallclock.getTimeMilliseconds();
+		printf("%u ms (%i%%)\r\n", time, b3DbvtBenchmarkPercent(time, cfgBenchmark4_Reference));
+	}
+	if (cfgBenchmark5_Enable)
+	{  // Benchmark 5
+		srand(380843);
+		b3DynamicBvh dbvt[2];
+		b3AlignedObjectArray<b3Transform> transforms;
+		b3DbvtBenchmark::NilPolicy policy;
+		transforms.resize(cfgBenchmark5_Iterations);
+		for (int i = 0; i < transforms.size(); ++i)
+		{
+			transforms[i] = b3DbvtBenchmark::RandTransform(cfgVolumeCenterScale * cfgBenchmark5_OffsetScale);
+		}
+		b3DbvtBenchmark::RandTree(cfgVolumeCenterScale, cfgVolumeExentsBase, cfgVolumeExentsScale, cfgLeaves, dbvt[0]);
+		b3DbvtBenchmark::RandTree(cfgVolumeCenterScale, cfgVolumeExentsBase, cfgVolumeExentsScale, cfgLeaves, dbvt[1]);
+		dbvt[0].optimizeTopDown();
+		dbvt[1].optimizeTopDown();
+		printf("[5] b3DynamicBvh::collideTT xform: ");
+		wallclock.reset();
+		for (int i = 0; i < cfgBenchmark5_Iterations; ++i)
+		{
+			b3DynamicBvh::collideTT(dbvt[0].m_root, dbvt[1].m_root, transforms[i], policy);
+		}
+		const int time = (int)wallclock.getTimeMilliseconds();
+		printf("%u ms (%i%%)\r\n", time, b3DbvtBenchmarkPercent(time, cfgBenchmark5_Reference));
+	}
+	if (cfgBenchmark6_Enable)
+	{  // Benchmark 6
+		srand(380843);
+		b3DynamicBvh dbvt;
+		b3AlignedObjectArray<b3Transform> transforms;
+		b3DbvtBenchmark::NilPolicy policy;
+		transforms.resize(cfgBenchmark6_Iterations);
+		for (int i = 0; i < transforms.size(); ++i)
+		{
+			transforms[i] = b3DbvtBenchmark::RandTransform(cfgVolumeCenterScale * cfgBenchmark6_OffsetScale);
+		}
+		b3DbvtBenchmark::RandTree(cfgVolumeCenterScale, cfgVolumeExentsBase, cfgVolumeExentsScale, cfgLeaves, dbvt);
+		dbvt.optimizeTopDown();
+		printf("[6] b3DynamicBvh::collideTT xform,self: ");
+		wallclock.reset();
+		for (int i = 0; i < cfgBenchmark6_Iterations; ++i)
+		{
+			b3DynamicBvh::collideTT(dbvt.m_root, dbvt.m_root, transforms[i], policy);
+		}
+		const int time = (int)wallclock.getTimeMilliseconds();
+		printf("%u ms (%i%%)\r\n", time, b3DbvtBenchmarkPercent(time, cfgBenchmark6_Reference));
+	}
+	if (cfgBenchmark7_Enable)
+	{  // Benchmark 7
+		srand(380843);
+		b3DynamicBvh dbvt;
+		b3AlignedObjectArray<b3Vector3> rayorg;
+		b3AlignedObjectArray<b3Vector3> raydir;
+		b3DbvtBenchmark::NilPolicy policy;
+		rayorg.resize(cfgBenchmark7_Iterations);
+		raydir.resize(cfgBenchmark7_Iterations);
+		for (int i = 0; i < rayorg.size(); ++i)
+		{
+			rayorg[i] = b3DbvtBenchmark::RandVector3(cfgVolumeCenterScale * 2);
+			raydir[i] = b3DbvtBenchmark::RandVector3(cfgVolumeCenterScale * 2);
+		}
+		b3DbvtBenchmark::RandTree(cfgVolumeCenterScale, cfgVolumeExentsBase, cfgVolumeExentsScale, cfgLeaves, dbvt);
+		dbvt.optimizeTopDown();
+		printf("[7] b3DynamicBvh::rayTest: ");
+		wallclock.reset();
+		for (int i = 0; i < cfgBenchmark7_Passes; ++i)
+		{
+			for (int j = 0; j < cfgBenchmark7_Iterations; ++j)
+			{
+				b3DynamicBvh::rayTest(dbvt.m_root, rayorg[j], rayorg[j] + raydir[j], policy);
+			}
+		}
+		const int time = (int)wallclock.getTimeMilliseconds();
+		unsigned rays = cfgBenchmark7_Passes * cfgBenchmark7_Iterations;
+		printf("%u ms (%i%%),(%u r/s)\r\n", time, b3DbvtBenchmarkPercent(time, cfgBenchmark7_Reference), b3DbvtBenchmarkRate(rays, time));
+	}
+	if (cfgBenchmark8_Enable)
+	{  // Benchmark 8
+		srand(380843);
+		b3DynamicBvh dbvt;
+		b3DbvtBenchmark::RandTree(cfgVolumeCenterScale, cfgVolumeExentsBase, cfgVolumeExentsScale, cfgLeaves, dbvt);
+		dbvt.optimizeTopDown();
+		printf("[8] insert/remove: ");
+		wallclock.reset();
+		for (int i = 0; i < cfgBenchmark8_Passes; ++i)
+		{
+			for (int j = 0; j < cfgBenchmark8_Iterations; ++j)
+			{
+				dbvt.remove(dbvt.insert(b3DbvtBenchmark::RandVolume(cfgVolumeCenterScale, cfgVolumeExentsBase, cfgVolumeExentsScale), 0));
+			}
+		}
+		const int time = (int)wallclock.getTimeMilliseconds();
+		const int ir = cfgBenchmark8_Passes * cfgBenchmark8_Iterations;
+		printf("%u ms (%i%%),(%u ir/s)\r\n", time, b3DbvtBenchmarkPercent(time, cfgBenchmark8_Reference), b3DbvtBenchmarkRate(ir, time));
+	}
+	if (cfgBenchmark9_Enable)
+	{  // Benchmark 9
+		srand(380843);
+		b3DynamicBvh dbvt;
+		b3AlignedObjectArray<const b3DbvtNode*> leaves;
+		b3DbvtBenchmark::RandTree(cfgVolumeCenterScale, cfgVolumeExentsBase, cfgVolumeExentsScale, cfgLeaves, dbvt);
+		dbvt.optimizeTopDown();
+		dbvt.extractLeaves(dbvt.m_root, leaves);
+		printf("[9] updates (teleport): ");
+		wallclock.reset();
+		for (int i = 0; i < cfgBenchmark9_Passes; ++i)
+		{
+			for (int j = 0; j < cfgBenchmark9_Iterations; ++j)
+			{
+				dbvt.update(const_cast<b3DbvtNode*>(leaves[rand() % cfgLeaves]),
+							b3DbvtBenchmark::RandVolume(cfgVolumeCenterScale, cfgVolumeExentsBase, cfgVolumeExentsScale));
+			}
+		}
+		const int time = (int)wallclock.getTimeMilliseconds();
+		const int up = cfgBenchmark9_Passes * cfgBenchmark9_Iterations;
+		printf("%u ms (%i%%),(%u u/s)\r\n", time, b3DbvtBenchmarkPercent(time, cfgBenchmark9_Reference), b3DbvtBenchmarkRate(up, time));
+	}
+	if (cfgBenchmark10_Enable)
+	{  // Benchmark 10
+		srand(380843);
+		b3DynamicBvh dbvt;
+		b3AlignedObjectArray<const b3DbvtNode*> leaves;
+		b3AlignedObjectArray<b3Vector3> vectors;
+		vectors.resize(cfgBenchmark10_Iterations);
+		for (int i = 0; i < vectors.size(); ++i)
+		{
+			vectors[i] = (b3DbvtBenchmark::RandVector3() * 2 - b3Vector3(1, 1, 1)) * cfgBenchmark10_Scale;
+		}
+		b3DbvtBenchmark::RandTree(cfgVolumeCenterScale, cfgVolumeExentsBase, cfgVolumeExentsScale, cfgLeaves, dbvt);
+		dbvt.optimizeTopDown();
+		dbvt.extractLeaves(dbvt.m_root, leaves);
+		printf("[10] updates (jitter): ");
+		wallclock.reset();
+
+		for (int i = 0; i < cfgBenchmark10_Passes; ++i)
+		{
+			for (int j = 0; j < cfgBenchmark10_Iterations; ++j)
+			{
+				const b3Vector3& d = vectors[j];
+				b3DbvtNode* l = const_cast<b3DbvtNode*>(leaves[rand() % cfgLeaves]);
+				b3DbvtVolume v = b3DbvtVolume::FromMM(l->volume.Mins() + d, l->volume.Maxs() + d);
+				dbvt.update(l, v);
+			}
+		}
+		const int time = (int)wallclock.getTimeMilliseconds();
+		const int up = cfgBenchmark10_Passes * cfgBenchmark10_Iterations;
+		printf("%u ms (%i%%),(%u u/s)\r\n", time, b3DbvtBenchmarkPercent(time, cfgBenchmark10_Reference), b3DbvtBenchmarkRate(up, time));
+	}
+	if (cfgBenchmark11_Enable)
+	{  // Benchmark 11
+		srand(380843);
+		b3DynamicBvh dbvt;
+		b3DbvtBenchmark::RandTree(cfgVolumeCenterScale, cfgVolumeExentsBase, cfgVolumeExentsScale, cfgLeaves, dbvt);
+		dbvt.optimizeTopDown();
+		printf("[11] optimize (incremental): ");
+		wallclock.reset();
+		for (int i = 0; i < cfgBenchmark11_Passes; ++i)
+		{
+			dbvt.optimizeIncremental(cfgBenchmark11_Iterations);
+		}
+		const int time = (int)wallclock.getTimeMilliseconds();
+		const int op = cfgBenchmark11_Passes * cfgBenchmark11_Iterations;
+		// The rate used to be op / time * 1000, which truncated before scaling.
+		printf("%u ms (%i%%),(%u o/s)\r\n", time, b3DbvtBenchmarkPercent(time, cfgBenchmark11_Reference), b3DbvtBenchmarkRate(op, time));
+	}
+	if (cfgBenchmark12_Enable)
+	{  // Benchmark 12
+		srand(380843);
+		b3AlignedObjectArray<b3DbvtVolume> volumes;
+		b3AlignedObjectArray<bool> results;
+		volumes.resize(cfgLeaves);
+		results.resize(cfgLeaves);
+		for (int i = 0; i < cfgLeaves; ++i)
+		{
+			volumes[i] = b3DbvtBenchmark::RandVolume(cfgVolumeCenterScale, cfgVolumeExentsBase, cfgVolumeExentsScale);
+		}
+		printf("[12] b3DbvtVolume notequal: ");
+		wallclock.reset();
+		for (int i = 0; i < cfgBenchmark12_Iterations; ++i)
+		{
+			for (int j = 0; j < cfgLeaves; ++j)
+			{
+				for (int k = 0; k < cfgLeaves; ++k)
+				{
+					results[k] = NotEqual(volumes[j], volumes[k]);
+				}
+			}
+		}
+		const int time = (int)wallclock.getTimeMilliseconds();
+		printf("%u ms (%i%%)\r\n", time, b3DbvtBenchmarkPercent(time, cfgBenchmark12_Reference));
+	}
+	if (cfgBenchmark13_Enable)
+	{  // Benchmark 13
+		srand(380843);
+		b3DynamicBvh dbvt;
+		b3AlignedObjectArray<b3Vector3> vectors;
+		b3DbvtBenchmark::NilPolicy policy;
+		vectors.resize(cfgBenchmark13_Iterations);
+		for (int i = 0; i < vectors.size(); ++i)
+		{
+			vectors[i] = (b3DbvtBenchmark::RandVector3() * 2 - b3Vector3(1, 1, 1)).normalized();
+		}
+		b3DbvtBenchmark::RandTree(cfgVolumeCenterScale, cfgVolumeExentsBase, cfgVolumeExentsScale, cfgLeaves, dbvt);
+		dbvt.optimizeTopDown();
+		printf("[13] culling(OCL+fullsort): ");
+		wallclock.reset();
+		for (int i = 0; i < cfgBenchmark13_Iterations; ++i)
+		{
+			static const b3Scalar offset = 0;
+			policy.m_depth = -B3_INFINITY;
+			dbvt.collideOCL(dbvt.m_root, &vectors[i], &offset, vectors[i], 1, policy);
+		}
+		const int time = (int)wallclock.getTimeMilliseconds();
+		const int t = cfgBenchmark13_Iterations;
+		printf("%u ms (%i%%),(%u t/s)\r\n", time, b3DbvtBenchmarkPercent(time, cfgBenchmark13_Reference), b3DbvtBenchmarkRate(t, time));
+	}
+	if (cfgBenchmark14_Enable)
+	{  // Benchmark 14
+		srand(380843);
+		b3DynamicBvh dbvt;
+		b3AlignedObjectArray<b3Vector3> vectors;
+		b3DbvtBenchmark::P14 policy;
+		vectors.resize(cfgBenchmark14_Iterations);
+		for (int i = 0; i < vectors.size(); ++i)
+		{
+			vectors[i] = (b3DbvtBenchmark::RandVector3() * 2 - b3Vector3(1, 1, 1)).normalized();
+		}
+		b3DbvtBenchmark::RandTree(cfgVolumeCenterScale, cfgVolumeExentsBase, cfgVolumeExentsScale, cfgLeaves, dbvt);
+		dbvt.optimizeTopDown();
+		policy.m_nodes.reserve(cfgLeaves);
+		printf("[14] culling(OCL+qsort): ");
+		wallclock.reset();
+		for (int i = 0; i < cfgBenchmark14_Iterations; ++i)
+		{
+			static const b3Scalar offset = 0;
+			policy.m_nodes.resize(0);
+			dbvt.collideOCL(dbvt.m_root, &vectors[i], &offset, vectors[i], 1, policy, false);
+			policy.m_nodes.quickSort(b3DbvtBenchmark::P14::sortfnc);
+		}
+		const int time = (int)wallclock.getTimeMilliseconds();
+		const int t = cfgBenchmark14_Iterations;
+		printf("%u ms (%i%%),(%u t/s)\r\n", time, b3DbvtBenchmarkPercent(time, cfgBenchmark14_Reference), b3DbvtBenchmarkRate(t, time));
+	}
+	if (cfgBenchmark15_Enable)
+	{  // Benchmark 15
+		srand(380843);
+		b3DynamicBvh dbvt;
+		b3AlignedObjectArray<b3Vector3> vectors;
+		b3DbvtBenchmark::P15 policy;
+		vectors.resize(cfgBenchmark15_Iterations);
+		for (int i = 0; i < vectors.size(); ++i)
+		{
+			vectors[i] = (b3DbvtBenchmark::RandVector3() * 2 - b3Vector3(1, 1, 1)).normalized();
+		}
+		b3DbvtBenchmark::RandTree(cfgVolumeCenterScale, cfgVolumeExentsBase, cfgVolumeExentsScale, cfgLeaves, dbvt);
+		dbvt.optimizeTopDown();
+		policy.m_nodes.reserve(cfgLeaves);
+		printf("[15] culling(KDOP+qsort): ");
+		wallclock.reset();
+		for (int i = 0; i < cfgBenchmark15_Iterations; ++i)
+		{
+			static const b3Scalar offset = 0;
+			policy.m_nodes.resize(0);
+			policy.m_axis = vectors[i];
+			dbvt.collideKDOP(dbvt.m_root, &vectors[i], &offset, 1, policy);
+			policy.m_nodes.quickSort(b3DbvtBenchmark::P15::sortfnc);
+		}
+		const int time = (int)wallclock.getTimeMilliseconds();
+		const int t = cfgBenchmark15_Iterations;
+		printf("%u ms (%i%%),(%u t/s)\r\n", time, b3DbvtBenchmarkPercent(time, cfgBenchmark15_Reference), b3DbvtBenchmarkRate(t, time));
+	}
+	if (cfgBenchmark16_Enable)
+	{  // Benchmark 16
+		srand(380843);
+		b3DynamicBvh dbvt;
+		b3AlignedObjectArray<b3DbvtNode*> batch;
+		b3DbvtBenchmark::RandTree(cfgVolumeCenterScale, cfgVolumeExentsBase, cfgVolumeExentsScale, cfgLeaves, dbvt);
+		dbvt.optimizeTopDown();
+		batch.reserve(cfgBenchmark16_BatchCount);
+		printf("[16] insert/remove batch(%u): ", cfgBenchmark16_BatchCount);
+		wallclock.reset();
+		for (int i = 0; i < cfgBenchmark16_Passes; ++i)
+		{
+			for (int j = 0; j < cfgBenchmark16_BatchCount; ++j)
+			{
+				batch.push_back(dbvt.insert(b3DbvtBenchmark::RandVolume(cfgVolumeCenterScale, cfgVolumeExentsBase, cfgVolumeExentsScale), 0));
+			}
+			for (int j = 0; j < cfgBenchmark16_BatchCount; ++j)
+			{
+				dbvt.remove(batch[j]);
+			}
+			batch.resize(0);
+		}
+		const int time = (int)wallclock.getTimeMilliseconds();
+		const int ir = cfgBenchmark16_Passes * cfgBenchmark16_BatchCount;
+		printf("%u ms (%i%%),(%u bir/s)\r\n", time, b3DbvtBenchmarkPercent(time, cfgBenchmark16_Reference), b3DbvtBenchmarkRate(ir, time));
+	}
+	if (cfgBenchmark17_Enable)
+	{  // Benchmark 17
+		srand(380843);
+		b3AlignedObjectArray<b3DbvtVolume> volumes;
+		b3AlignedObjectArray<int> results;
+		b3AlignedObjectArray<int> indices;
+		volumes.resize(cfgLeaves);
+		results.resize(cfgLeaves);
+		indices.resize(cfgLeaves);
+		for (int i = 0; i < cfgLeaves; ++i)
+		{
+			indices[i] = i;
+			volumes[i] = b3DbvtBenchmark::RandVolume(cfgVolumeCenterScale, cfgVolumeExentsBase, cfgVolumeExentsScale);
+		}
+		for (int i = 0; i < cfgLeaves; ++i)
+		{
+			b3Swap(indices[i], indices[rand() % cfgLeaves]);
+		}
+		printf("[17] b3DbvtVolume select: ");
+		wallclock.reset();
+		for (int i = 0; i < cfgBenchmark17_Iterations; ++i)
+		{
+			for (int j = 0; j < cfgLeaves; ++j)
+			{
+				for (int k = 0; k < cfgLeaves; ++k)
+				{
+					const int idx = indices[k];
+					results[idx] = Select(volumes[idx], volumes[j], volumes[k]);
+				}
+			}
+		}
+		const int time = (int)wallclock.getTimeMilliseconds();
+		printf("%u ms (%i%%)\r\n", time, b3DbvtBenchmarkPercent(time, cfgBenchmark17_Reference));
+	}
+	printf("\r\n\r\n");
+}
+#endif

+ 1332 - 0
Dependencies/include/bullet3/Bullet3Collision/BroadPhaseCollision/b3DynamicBvh.h

@@ -0,0 +1,1332 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2013 Erwin Coumans  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+///b3DynamicBvh implementation by Nathanael Presson
+
+#ifndef B3_DYNAMIC_BOUNDING_VOLUME_TREE_H
+#define B3_DYNAMIC_BOUNDING_VOLUME_TREE_H
+
+#include "Bullet3Common/b3AlignedObjectArray.h"
+#include "Bullet3Common/b3Vector3.h"
+#include "Bullet3Common/b3Transform.h"
+#include "Bullet3Geometry/b3AabbUtil.h"
+
+//
+// Compile time configuration
+//
+
+// Implementation profiles
+#define B3_DBVT_IMPL_GENERIC 0  // Generic implementation
+#define B3_DBVT_IMPL_SSE 1      // SSE
+
+// Template implementation of ICollide
+#ifdef _WIN32
+#if (defined(_MSC_VER) && _MSC_VER >= 1400)
+#define B3_DBVT_USE_TEMPLATE 1
+#else
+#define B3_DBVT_USE_TEMPLATE 0
+#endif
+#else
+#define B3_DBVT_USE_TEMPLATE 0
+#endif
+
+// Use only intrinsics instead of inline asm
+#define B3_DBVT_USE_INTRINSIC_SSE 1
+
+// Use memmove for collideOCL
+#define B3_DBVT_USE_MEMMOVE 1
+
+// Enable benchmarking code
+#define B3_DBVT_ENABLE_BENCHMARK 0
+
+// Inlining
+#define B3_DBVT_INLINE B3_FORCE_INLINE
+
+// Specific methods implementation
+
+//SSE gives errors on a MSVC 7.1
+#if defined(B3_USE_SSE)  //&& defined (_WIN32)
+#define B3_DBVT_SELECT_IMPL B3_DBVT_IMPL_SSE
+#define B3_DBVT_MERGE_IMPL B3_DBVT_IMPL_SSE
+#define B3_DBVT_INT0_IMPL B3_DBVT_IMPL_SSE
+#else
+#define B3_DBVT_SELECT_IMPL B3_DBVT_IMPL_GENERIC
+#define B3_DBVT_MERGE_IMPL B3_DBVT_IMPL_GENERIC
+#define B3_DBVT_INT0_IMPL B3_DBVT_IMPL_GENERIC
+#endif
+
+#if (B3_DBVT_SELECT_IMPL == B3_DBVT_IMPL_SSE) || \
+	(B3_DBVT_MERGE_IMPL == B3_DBVT_IMPL_SSE) ||  \
+	(B3_DBVT_INT0_IMPL == B3_DBVT_IMPL_SSE)
+#include <emmintrin.h>
+#endif
+
+//
+// Auto config and checks
+//
+
+#if B3_DBVT_USE_TEMPLATE
+#define B3_DBVT_VIRTUAL
+#define B3_DBVT_VIRTUAL_DTOR(a)
+#define B3_DBVT_PREFIX template <typename T>
+#define B3_DBVT_IPOLICY T& policy
+#define B3_DBVT_CHECKTYPE                        \
+	static const ICollide& typechecker = *(T*)1; \
+	(void)typechecker;
+#else
+#define B3_DBVT_VIRTUAL_DTOR(a) \
+	virtual ~a() {}
+#define B3_DBVT_VIRTUAL virtual
+#define B3_DBVT_PREFIX
+#define B3_DBVT_IPOLICY ICollide& policy
+#define B3_DBVT_CHECKTYPE
+#endif
+
+#if B3_DBVT_USE_MEMMOVE
+#if !defined(__CELLOS_LV2__) && !defined(__MWERKS__)
+#include <memory.h>
+#endif
+#include <string.h>
+#endif
+
+#ifndef B3_DBVT_USE_TEMPLATE
+#error "B3_DBVT_USE_TEMPLATE undefined"
+#endif
+
+#ifndef B3_DBVT_USE_MEMMOVE
+#error "B3_DBVT_USE_MEMMOVE undefined"
+#endif
+
+#ifndef B3_DBVT_ENABLE_BENCHMARK
+#error "B3_DBVT_ENABLE_BENCHMARK undefined"
+#endif
+
+#ifndef B3_DBVT_SELECT_IMPL
+#error "B3_DBVT_SELECT_IMPL undefined"
+#endif
+
+#ifndef B3_DBVT_MERGE_IMPL
+#error "B3_DBVT_MERGE_IMPL undefined"
+#endif
+
+#ifndef B3_DBVT_INT0_IMPL
+#error "B3_DBVT_INT0_IMPL undefined"
+#endif
+
+//
+// Default volumes
+//
+
+/* b3DbvtAabbMm: axis-aligned box stored as two corners, mi (minimum) and
+   mx (maximum). Used as the tree's volume type through the b3DbvtVolume typedef. */
+struct b3DbvtAabbMm
+{
+	B3_DBVT_INLINE b3Vector3 Center() const { return ((mi + mx) / 2); }  // midpoint of the two corners
+	B3_DBVT_INLINE b3Vector3 Lengths() const { return (mx - mi); }  // full size along each axis
+	B3_DBVT_INLINE b3Vector3 Extents() const { return ((mx - mi) / 2); }  // half size along each axis
+	B3_DBVT_INLINE const b3Vector3& Mins() const { return (mi); }
+	B3_DBVT_INLINE const b3Vector3& Maxs() const { return (mx); }
+	static inline b3DbvtAabbMm FromCE(const b3Vector3& c, const b3Vector3& e);  // from centre and half-extents
+	static inline b3DbvtAabbMm FromCR(const b3Vector3& c, b3Scalar r);  // from centre and one half-extent used on all axes
+	static inline b3DbvtAabbMm FromMM(const b3Vector3& mi, const b3Vector3& mx);  // from the two corners
+	static inline b3DbvtAabbMm FromPoints(const b3Vector3* pts, int n);  // bounds of n points; reads pts[0] unconditionally
+	static inline b3DbvtAabbMm FromPoints(const b3Vector3** ppts, int n);  // same, through an array of pointers
+	B3_DBVT_INLINE void Expand(const b3Vector3& e);  // moves mi by -e and mx by +e
+	B3_DBVT_INLINE void SignedExpand(const b3Vector3& e);  // per axis: adds e to mx when positive, otherwise to mi
+	B3_DBVT_INLINE bool Contain(const b3DbvtAabbMm& a) const;  // true when a lies inside this box (bounds inclusive)
+	B3_DBVT_INLINE int Classify(const b3Vector3& n, b3Scalar o, int s) const;
+	B3_DBVT_INLINE b3Scalar ProjectMinimum(const b3Vector3& v, unsigned signs) const;
+	B3_DBVT_INLINE friend bool b3Intersect(const b3DbvtAabbMm& a,
+										   const b3DbvtAabbMm& b);
+
+	B3_DBVT_INLINE friend bool b3Intersect(const b3DbvtAabbMm& a,
+										   const b3Vector3& b);
+
+	B3_DBVT_INLINE friend b3Scalar b3Proximity(const b3DbvtAabbMm& a,
+											   const b3DbvtAabbMm& b);
+	B3_DBVT_INLINE friend int b3Select(const b3DbvtAabbMm& o,
+									   const b3DbvtAabbMm& a,
+									   const b3DbvtAabbMm& b);
+	B3_DBVT_INLINE friend void b3Merge(const b3DbvtAabbMm& a,
+									   const b3DbvtAabbMm& b,
+									   b3DbvtAabbMm& r);
+	B3_DBVT_INLINE friend bool b3NotEqual(const b3DbvtAabbMm& a,
+										  const b3DbvtAabbMm& b);
+
+	B3_DBVT_INLINE b3Vector3& tMins() { return (mi); }  // mutable access to the minimum corner
+	B3_DBVT_INLINE b3Vector3& tMaxs() { return (mx); }  // mutable access to the maximum corner
+
+private:
+	B3_DBVT_INLINE void AddSpan(const b3Vector3& d, b3Scalar& smi, b3Scalar& smx) const;
+
+private:
+	b3Vector3 mi, mx;  // minimum and maximum corners
+};
+
+// Types
+typedef b3DbvtAabbMm b3DbvtVolume;
+
+/* b3DbvtNode: one node of the tree. The union overlays the child pointers of
+   an internal node with the user payload of a leaf. */
+struct b3DbvtNode
+{
+	b3DbvtVolume volume;
+	b3DbvtNode* parent;
+	B3_DBVT_INLINE bool isleaf() const { return (childs[1] == 0); }  // a node whose second child pointer is null is a leaf
+	B3_DBVT_INLINE bool isinternal() const { return (!isleaf()); }
+	union {
+		b3DbvtNode* childs[2];
+		void* data;  // shares storage with childs
+		int dataAsInt;  // shares storage with childs
+	};
+};
+
+///The b3DynamicBvh class implements a fast dynamic bounding volume tree based on axis aligned bounding boxes (aabb tree).
+///This b3DynamicBvh is used for soft body collision detection and for the b3DynamicBvhBroadphase. It has a fast insert, remove and update of nodes.
+///Unlike the b3QuantizedBvh, nodes can be dynamically moved around, which allows for change in topology of the underlying data structure.
+struct b3DynamicBvh
+{
+	/* Stack element	*/
+	struct sStkNN  // pair of nodes
+	{
+		const b3DbvtNode* a;
+		const b3DbvtNode* b;
+		sStkNN() {}  // leaves both pointers uninitialized
+		sStkNN(const b3DbvtNode* na, const b3DbvtNode* nb) : a(na), b(nb) {}
+	};
+	struct sStkNP  // node plus mask
+	{
+		const b3DbvtNode* node;
+		int mask;
+		sStkNP(const b3DbvtNode* n, unsigned m) : node(n), mask(m) {}  // m is converted from unsigned to int
+	};
+	struct sStkNPS  // node, mask and scalar value
+	{
+		const b3DbvtNode* node;
+		int mask;
+		b3Scalar value;
+		sStkNPS() {}  // leaves all members uninitialized
+		sStkNPS(const b3DbvtNode* n, unsigned m, b3Scalar v) : node(n), mask(m), value(v) {}  // m is converted from unsigned to int
+	};
+	struct sStkCLN  // node plus parent
+	{
+		const b3DbvtNode* node;
+		b3DbvtNode* parent;
+		sStkCLN(const b3DbvtNode* n, b3DbvtNode* p) : node(n), parent(p) {}
+	};
+	// Policies/Interfaces
+
+	/* ICollide	*/
+	struct ICollide
+	{
+		B3_DBVT_VIRTUAL_DTOR(ICollide)
+		B3_DBVT_VIRTUAL void Process(const b3DbvtNode*, const b3DbvtNode*) {}  // default: does nothing
+		B3_DBVT_VIRTUAL void Process(const b3DbvtNode*) {}  // default: does nothing
+		B3_DBVT_VIRTUAL void Process(const b3DbvtNode* n, b3Scalar) { Process(n); }  // default: ignores the scalar and forwards to Process(n)
+		B3_DBVT_VIRTUAL bool Descent(const b3DbvtNode*) { return (true); }  // default: always true
+		B3_DBVT_VIRTUAL bool AllLeaves(const b3DbvtNode*) { return (true); }  // default: always true
+	};
+	/* IWriter	*/
+	struct IWriter
+	{
+		virtual ~IWriter() {}
+		virtual void Prepare(const b3DbvtNode* root, int numnodes) = 0;
+		virtual void WriteNode(const b3DbvtNode*, int index, int parent, int child0, int child1) = 0;
+		virtual void WriteLeaf(const b3DbvtNode*, int index, int parent) = 0;
+	};
+	/* IClone	*/
+	struct IClone
+	{
+		virtual ~IClone() {}
+		virtual void CloneLeaf(b3DbvtNode*) {}  // default: does nothing
+	};
+
+	// Constants
+	enum
+	{
+		B3_SIMPLE_STACKSIZE = 64,
+		B3_DOUBLE_STACKSIZE = B3_SIMPLE_STACKSIZE * 2
+	};
+
+	// Fields
+	b3DbvtNode* m_root;  // null when the tree is empty (see empty())
+	b3DbvtNode* m_free;
+	int m_lkhd;
+	int m_leaves;
+	unsigned m_opath;
+
+	b3AlignedObjectArray<sStkNN> m_stkStack;
+	mutable b3AlignedObjectArray<const b3DbvtNode*> m_rayTestStack;  // mutable, so const member functions may modify it
+
+	// Methods
+	b3DynamicBvh();
+	~b3DynamicBvh();
+	void clear();
+	bool empty() const { return (0 == m_root); }  // true when there is no root node
+	void optimizeBottomUp();
+	void optimizeTopDown(int bu_treshold = 128);
+	void optimizeIncremental(int passes);
+	b3DbvtNode* insert(const b3DbvtVolume& box, void* data);
+	void update(b3DbvtNode* leaf, int lookahead = -1);
+	void update(b3DbvtNode* leaf, b3DbvtVolume& volume);
+	bool update(b3DbvtNode* leaf, b3DbvtVolume& volume, const b3Vector3& velocity, b3Scalar margin);
+	bool update(b3DbvtNode* leaf, b3DbvtVolume& volume, const b3Vector3& velocity);
+	bool update(b3DbvtNode* leaf, b3DbvtVolume& volume, b3Scalar margin);
+	void remove(b3DbvtNode* leaf);
+	void write(IWriter* iwriter) const;
+	void clone(b3DynamicBvh& dest, IClone* iclone = 0) const;
+	static int maxdepth(const b3DbvtNode* node);
+	static int countLeaves(const b3DbvtNode* node);
+	static void extractLeaves(const b3DbvtNode* node, b3AlignedObjectArray<const b3DbvtNode*>& leaves);
+#if B3_DBVT_ENABLE_BENCHMARK
+	static void benchmark();
+#else
+	static void benchmark()  // empty stub used when benchmarking is compiled out
+	{
+	}
+#endif
+	// B3_DBVT_IPOLICY must support ICollide policy/interface
+	B3_DBVT_PREFIX
+	static void enumNodes(const b3DbvtNode* root,
+						  B3_DBVT_IPOLICY);
+	B3_DBVT_PREFIX
+	static void enumLeaves(const b3DbvtNode* root,
+						   B3_DBVT_IPOLICY);
+	B3_DBVT_PREFIX
+	void collideTT(const b3DbvtNode* root0,
+				   const b3DbvtNode* root1,
+				   B3_DBVT_IPOLICY);
+
+	B3_DBVT_PREFIX
+	void collideTTpersistentStack(const b3DbvtNode* root0,
+								  const b3DbvtNode* root1,
+								  B3_DBVT_IPOLICY);
+#if 0
+	B3_DBVT_PREFIX
+		void		collideTT(	const b3DbvtNode* root0,
+		const b3DbvtNode* root1,
+		const b3Transform& xform,
+		B3_DBVT_IPOLICY);
+	B3_DBVT_PREFIX
+		void		collideTT(	const b3DbvtNode* root0,
+		const b3Transform& xform0,
+		const b3DbvtNode* root1,
+		const b3Transform& xform1,
+		B3_DBVT_IPOLICY);
+#endif
+
+	B3_DBVT_PREFIX
+	void collideTV(const b3DbvtNode* root,
+				   const b3DbvtVolume& volume,
+				   B3_DBVT_IPOLICY) const;
+	///rayTest is a re-entrant ray test, and can be called in parallel as long as the b3AlignedAlloc is thread-safe (uses locking etc)
+	///rayTest is slower than rayTestInternal, because it builds a local stack, using memory allocations, and it recomputes signs/rayDirectionInverses each time
+	B3_DBVT_PREFIX
+	static void rayTest(const b3DbvtNode* root,
+						const b3Vector3& rayFrom,
+						const b3Vector3& rayTo,
+						B3_DBVT_IPOLICY);
+	///rayTestInternal is faster than rayTest, because it uses a persistent stack (to reduce dynamic memory allocations to a minimum) and it uses precomputed signs/rayInverseDirections
+	///rayTestInternal is used by b3DynamicBvhBroadphase to accelerate world ray casts
+	B3_DBVT_PREFIX
+	void rayTestInternal(const b3DbvtNode* root,
+						 const b3Vector3& rayFrom,
+						 const b3Vector3& rayTo,
+						 const b3Vector3& rayDirectionInverse,
+						 unsigned int signs[3],
+						 b3Scalar lambda_max,
+						 const b3Vector3& aabbMin,
+						 const b3Vector3& aabbMax,
+						 B3_DBVT_IPOLICY) const;
+
+	B3_DBVT_PREFIX
+	static void collideKDOP(const b3DbvtNode* root,
+							const b3Vector3* normals,
+							const b3Scalar* offsets,
+							int count,
+							B3_DBVT_IPOLICY);
+	B3_DBVT_PREFIX
+	static void collideOCL(const b3DbvtNode* root,
+						   const b3Vector3* normals,
+						   const b3Scalar* offsets,
+						   const b3Vector3& sortaxis,
+						   int count,
+						   B3_DBVT_IPOLICY,
+						   bool fullsort = true);
+	B3_DBVT_PREFIX
+	static void collideTU(const b3DbvtNode* root,
+						  B3_DBVT_IPOLICY);
+	// Helpers
+	static B3_DBVT_INLINE int nearest(const int* i, const b3DynamicBvh::sStkNPS* a, b3Scalar v, int l, int h)  // binary search over the index array i within [l, h)
+	{
+		int m = 0;
+		while (l < h)
+		{
+			m = (l + h) >> 1;
+			if (a[i[m]].value >= v)  // entries with value >= v are skipped, so the result is the first position whose value is < v; presumes descending order - TODO confirm
+				l = m + 1;
+			else
+				h = m;
+		}
+		return (h);
+	}
+	static B3_DBVT_INLINE int allocate(b3AlignedObjectArray<int>& ifree,
+									   b3AlignedObjectArray<sStkNPS>& stock,
+									   const sStkNPS& value)  // stores value in stock and returns the index it was stored at
+	{
+		int i;
+		if (ifree.size() > 0)
+		{
+			i = ifree[ifree.size() - 1];  // reuse the most recently freed slot
+			ifree.pop_back();
+			stock[i] = value;
+		}
+		else
+		{
+			i = stock.size();  // no free slot: append at the end
+			stock.push_back(value);
+		}
+		return (i);
+	}
+	//
+private:
+	b3DynamicBvh(const b3DynamicBvh&) {}  // private and empty: copy construction is unavailable outside the struct and copies nothing
+};
+
+//
+// Inline implementations: b3DbvtAabbMm and free helper functions
+//
+
+// Builds the box with centre c and half-extents e: minimum = c - e, maximum = c + e.
+inline b3DbvtAabbMm b3DbvtAabbMm::FromCE(const b3Vector3& c, const b3Vector3& e)
+{
+	b3DbvtAabbMm result;
+	result.mx = c + e;
+	result.mi = c - e;
+	return result;
+}
+
+// Builds the box with centre c and the same half-extent r on every axis (delegates to FromCE).
+inline b3DbvtAabbMm b3DbvtAabbMm::FromCR(const b3Vector3& c, b3Scalar r)
+{
+	const b3Vector3 halfExtents = b3MakeVector3(r, r, r);
+	return FromCE(c, halfExtents);
+}
+
+// Builds the box directly from a minimum corner mi and a maximum corner mx; the corners are stored as given.
+inline b3DbvtAabbMm b3DbvtAabbMm::FromMM(const b3Vector3& mi, const b3Vector3& mx)
+{
+	b3DbvtAabbMm result;
+	result.mx = mx;
+	result.mi = mi;
+	return result;
+}
+
+// Builds the smallest box containing the n points in pts. pts[0] is always read, so at least one point is required.
+inline b3DbvtAabbMm b3DbvtAabbMm::FromPoints(const b3Vector3* pts, int n)
+{
+	b3DbvtAabbMm result;
+	result.mx = pts[0];
+	result.mi = result.mx;
+	int index = 1;
+	while (index < n)
+	{
+		const b3Vector3& point = pts[index];
+		result.mi.setMin(point);
+		result.mx.setMax(point);
+		++index;
+	}
+	return result;
+}
+
+// Builds the smallest box containing the n points referenced by ppts. *ppts[0] is always read, so at least one point is required.
+inline b3DbvtAabbMm b3DbvtAabbMm::FromPoints(const b3Vector3** ppts, int n)
+{
+	b3DbvtAabbMm result;
+	result.mx = *ppts[0];
+	result.mi = result.mx;
+	int index = 1;
+	while (index < n)
+	{
+		const b3Vector3& point = *ppts[index];
+		result.mi.setMin(point);
+		result.mx.setMax(point);
+		++index;
+	}
+	return result;
+}
+
+// Grows the box in place: e is subtracted from the minimum corner and added to the maximum corner.
+B3_DBVT_INLINE void b3DbvtAabbMm::Expand(const b3Vector3& e)
+{
+	mi -= e;
+	mx += e;
+}
+
+// Extends the box along e, one axis at a time: a strictly positive component is added to the
+// maximum corner, any other component is added to the minimum corner.
+B3_DBVT_INLINE void b3DbvtAabbMm::SignedExpand(const b3Vector3& e)
+{
+	b3Vector3& cornerX = (e.x > 0) ? mx : mi;
+	cornerX.setX(cornerX.x + e[0]);
+
+	b3Vector3& cornerY = (e.y > 0) ? mx : mi;
+	cornerY.setY(cornerY.y + e[1]);
+
+	b3Vector3& cornerZ = (e.z > 0) ? mx : mi;
+	cornerZ.setZ(cornerZ.z + e[2]);
+}
+
+// Returns true when box a lies entirely inside this box (shared faces count as inside).
+B3_DBVT_INLINE bool b3DbvtAabbMm::Contain(const b3DbvtAabbMm& a) const
+{
+	// Each guard is written as a negated original comparison so unordered (NaN) values still yield false.
+	if (!(mi.x <= a.mi.x)) return false;
+	if (!(mi.y <= a.mi.y)) return false;
+	if (!(mi.z <= a.mi.z)) return false;
+	if (!(mx.x >= a.mx.x)) return false;
+	if (!(mx.y >= a.mx.y)) return false;
+	if (!(mx.z >= a.mx.z)) return false;
+	return true;
+}
+
+// Classifies the box against the plane dot(n, p) + o = 0.
+//   n, o : plane normal and offset.
+//   s    : corner selector; bit 0 / 1 / 2 chooses the maximum (bit set) or minimum (bit clear)
+//          x / y / z coordinate for the tested corner px. The opposite corner pi takes the other
+//          bound on every axis. Callers in this file build s from the signs of n.
+// Returns -1 when px is on the negative side, +1 when pi is on the non-negative side, 0 otherwise.
+// The corners are derived from the low three bits of s, so they are always initialised (the
+// previous switch left them indeterminate for any s outside 0..7).
+B3_DBVT_INLINE int b3DbvtAabbMm::Classify(const b3Vector3& n, b3Scalar o, int s) const
+{
+	const bool maxX = (s & 1) != 0;
+	const bool maxY = (s & 2) != 0;
+	const bool maxZ = (s & 4) != 0;
+	const b3Vector3 px = b3MakeVector3(maxX ? mx.x : mi.x,
+									   maxY ? mx.y : mi.y,
+									   maxZ ? mx.z : mi.z);
+	const b3Vector3 pi = b3MakeVector3(maxX ? mi.x : mx.x,
+									   maxY ? mi.y : mx.y,
+									   maxZ ? mi.z : mx.z);
+	if ((b3Dot(n, px) + o) < 0) return (-1);
+	if ((b3Dot(n, pi) + o) >= 0) return (+1);
+	return (0);
+}
+
+// Projects one corner of the box onto v. For each axis the matching bit of signs picks the
+// minimum coordinate when set and the maximum coordinate when clear (bit 0 = x, 1 = y, 2 = z).
+B3_DBVT_INLINE b3Scalar b3DbvtAabbMm::ProjectMinimum(const b3Vector3& v, unsigned signs) const
+{
+	const b3Scalar cornerX = (signs & 1u) ? mi.x : mx.x;
+	const b3Scalar cornerY = (signs & 2u) ? mi.y : mx.y;
+	const b3Scalar cornerZ = (signs & 4u) ? mi.z : mx.z;
+	const b3Vector3 corner = b3MakeVector3(cornerX, cornerY, cornerZ);
+	return b3Dot(corner, v);
+}
+
+// Accumulates the extent of the box along direction d: the smallest projection is added to smi
+// and the largest to smx. Both outputs are incremented, not overwritten.
+B3_DBVT_INLINE void b3DbvtAabbMm::AddSpan(const b3Vector3& d, b3Scalar& smi, b3Scalar& smx) const
+{
+	for (int axis = 0; axis != 3; ++axis)
+	{
+		// A negative direction component swaps which corner yields the smaller product.
+		const bool negative = d[axis] < 0;
+		const b3Vector3& lowCorner = negative ? mx : mi;
+		const b3Vector3& highCorner = negative ? mi : mx;
+		smi += lowCorner[axis] * d[axis];
+		smx += highCorner[axis] * d[axis];
+	}
+}
+
+// Returns true when boxes a and b overlap on all three axes; boxes that merely touch count as overlapping.
+B3_DBVT_INLINE bool b3Intersect(const b3DbvtAabbMm& a,
+								const b3DbvtAabbMm& b)
+{
+#if B3_DBVT_INT0_IMPL == B3_DBVT_IMPL_SSE
+	const __m128 rt(_mm_or_ps(_mm_cmplt_ps(_mm_load_ps(b.mx), _mm_load_ps(a.mi)),
+							  _mm_cmplt_ps(_mm_load_ps(a.mx), _mm_load_ps(b.mi))));  // a lane is set when b.mx < a.mi or a.mx < b.mi in that lane
+#if defined(_WIN32)
+	const __int32* pu((const __int32*)&rt);
+#else
+	const int* pu((const int*)&rt);
+#endif
+	return ((pu[0] | pu[1] | pu[2]) == 0);  // only the first three lanes are inspected
+#else
+	return ((a.mi.x <= b.mx.x) &&
+			(a.mx.x >= b.mi.x) &&
+			(a.mi.y <= b.mx.y) &&
+			(a.mx.y >= b.mi.y) &&
+			(a.mi.z <= b.mx.z) &&
+			(a.mx.z >= b.mi.z));
+#endif
+}
+
+// Returns true when point b lies inside box a; points on the boundary count as inside.
+B3_DBVT_INLINE bool b3Intersect(const b3DbvtAabbMm& a,
+								const b3Vector3& b)
+{
+	const bool withinX = (b.x >= a.mi.x) && (b.x <= a.mx.x);
+	const bool withinY = (b.y >= a.mi.y) && (b.y <= a.mx.y);
+	const bool withinZ = (b.z >= a.mi.z) && (b.z <= a.mx.z);
+	return withinX && withinY && withinZ;
+}
+
+//////////////////////////////////////
+
+// Distance measure between two boxes: the sum over x, y, z of |(a.mi + a.mx) - (b.mi + b.mx)|.
+B3_DBVT_INLINE b3Scalar b3Proximity(const b3DbvtAabbMm& a,
+									const b3DbvtAabbMm& b)
+{
+	const b3Vector3 cornerSumA = a.mi + a.mx;
+	const b3Vector3 cornerSumB = b.mi + b.mx;
+	const b3Vector3 delta = cornerSumA - cornerSumB;
+	b3Scalar total = b3Fabs(delta.x);
+	total += b3Fabs(delta.y);
+	total += b3Fabs(delta.z);
+	return total;
+}
+
+// Chooses which of a and b is nearer to o: the generic path returns 0 when b3Proximity(o, a) < b3Proximity(o, b) and 1 otherwise; the SSE paths compare sums of masked differences of (mi + mx).
+B3_DBVT_INLINE int b3Select(const b3DbvtAabbMm& o,
+							const b3DbvtAabbMm& a,
+							const b3DbvtAabbMm& b)
+{
+#if B3_DBVT_SELECT_IMPL == B3_DBVT_IMPL_SSE
+
+#if defined(_WIN32)
+	static B3_ATTRIBUTE_ALIGNED16(const unsigned __int32) mask[] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};  // AND-ing with 0x7fffffff clears the float sign bit
+#else
+	static B3_ATTRIBUTE_ALIGNED16(const unsigned int) mask[] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x00000000 /*0x7fffffff*/};  // same mask, but the fourth lane is zeroed here
+#endif
+	///@todo: the intrinsic version is 11% slower
+#if B3_DBVT_USE_INTRINSIC_SSE
+
+	union b3SSEUnion  ///NOTE: if we use more intrinsics, move b3SSEUnion into the LinearMath directory
+	{
+		__m128 ssereg;
+		float floats[4];
+		int ints[4];
+	};
+
+	__m128 omi(_mm_load_ps(o.mi));
+	omi = _mm_add_ps(omi, _mm_load_ps(o.mx));
+	__m128 ami(_mm_load_ps(a.mi));
+	ami = _mm_add_ps(ami, _mm_load_ps(a.mx));
+	ami = _mm_sub_ps(ami, omi);
+	ami = _mm_and_ps(ami, _mm_load_ps((const float*)mask));
+	__m128 bmi(_mm_load_ps(b.mi));
+	bmi = _mm_add_ps(bmi, _mm_load_ps(b.mx));
+	bmi = _mm_sub_ps(bmi, omi);
+	bmi = _mm_and_ps(bmi, _mm_load_ps((const float*)mask));
+	__m128 t0(_mm_movehl_ps(ami, ami));
+	ami = _mm_add_ps(ami, t0);
+	ami = _mm_add_ss(ami, _mm_shuffle_ps(ami, ami, 1));
+	__m128 t1(_mm_movehl_ps(bmi, bmi));
+	bmi = _mm_add_ps(bmi, t1);
+	bmi = _mm_add_ss(bmi, _mm_shuffle_ps(bmi, bmi, 1));
+
+	b3SSEUnion tmp;
+	tmp.ssereg = _mm_cmple_ss(bmi, ami);  // lowest lane is all ones when b's sum <= a's sum
+	return tmp.ints[0] & 1;
+
+#else
+	B3_ATTRIBUTE_ALIGNED16(__int32 r[1]);
+	__asm
+	{
+		mov		eax,o
+			mov		ecx,a
+			mov		edx,b
+			movaps	xmm0,[eax]
+		movaps	xmm5,mask
+			addps	xmm0,[eax+16]	
+		movaps	xmm1,[ecx]
+		movaps	xmm2,[edx]
+		addps	xmm1,[ecx+16]
+		addps	xmm2,[edx+16]
+		subps	xmm1,xmm0
+			subps	xmm2,xmm0
+			andps	xmm1,xmm5
+			andps	xmm2,xmm5
+			movhlps	xmm3,xmm1
+			movhlps	xmm4,xmm2
+			addps	xmm1,xmm3
+			addps	xmm2,xmm4
+			pshufd	xmm3,xmm1,1
+			pshufd	xmm4,xmm2,1
+			addss	xmm1,xmm3
+			addss	xmm2,xmm4
+			cmpless	xmm2,xmm1
+			movss	r,xmm2
+	}
+	return (r[0] & 1);
+#endif
+#else
+	return (b3Proximity(o, a) < b3Proximity(o, b) ? 0 : 1);  // ties select 1
+#endif
+}
+
+// Writes into r the smallest box enclosing both a and b (component-wise minimum of the
+// minimum corners, component-wise maximum of the maximum corners).
+B3_DBVT_INLINE void b3Merge(const b3DbvtAabbMm& a,
+							const b3DbvtAabbMm& b,
+							b3DbvtAabbMm& r)
+{
+#if B3_DBVT_MERGE_IMPL == B3_DBVT_IMPL_SSE
+	__m128 ami(_mm_load_ps(a.mi));
+	__m128 amx(_mm_load_ps(a.mx));
+	__m128 bmi(_mm_load_ps(b.mi));
+	__m128 bmx(_mm_load_ps(b.mx));
+	ami = _mm_min_ps(ami, bmi);
+	amx = _mm_max_ps(amx, bmx);
+	_mm_store_ps(r.mi, ami);
+	_mm_store_ps(r.mx, amx);
+#else
+	for (int axis = 0; axis != 3; ++axis)
+	{
+		// When the comparison is false (including ties) b's component is taken.
+		r.mi[axis] = (a.mi[axis] < b.mi[axis]) ? a.mi[axis] : b.mi[axis];
+		r.mx[axis] = (a.mx[axis] > b.mx[axis]) ? a.mx[axis] : b.mx[axis];
+	}
+#endif
+}
+
+// Returns true when any x/y/z component of the two boxes' corners differs.
+B3_DBVT_INLINE bool b3NotEqual(const b3DbvtAabbMm& a,
+							   const b3DbvtAabbMm& b)
+{
+	const bool sameMin = (a.mi.x == b.mi.x) && (a.mi.y == b.mi.y) && (a.mi.z == b.mi.z);
+	const bool sameMax = (a.mx.x == b.mx.x) && (a.mx.y == b.mx.y) && (a.mx.z == b.mx.z);
+	return !(sameMin && sameMax);
+}
+
+//
+// Inline implementations: b3DynamicBvh traversal templates
+//
+
+// Calls policy.Process on every node of the subtree at root: the node itself first, then child 0, then child 1. root must not be null.
+B3_DBVT_PREFIX
+inline void b3DynamicBvh::enumNodes(const b3DbvtNode* root,
+									B3_DBVT_IPOLICY)
+{
+	B3_DBVT_CHECKTYPE
+	policy.Process(root);
+	if (!root->isinternal()) return;
+	for (int child = 0; child < 2; ++child)
+	{
+		enumNodes(root->childs[child], policy);
+	}
+}
+
+// Calls policy.Process on every leaf of the subtree at root, visiting child 0 before child 1. root must not be null.
+B3_DBVT_PREFIX
+inline void b3DynamicBvh::enumLeaves(const b3DbvtNode* root,
+									 B3_DBVT_IPOLICY)
+{
+	B3_DBVT_CHECKTYPE
+	if (!root->isinternal())
+	{
+		policy.Process(root);
+		return;
+	}
+	for (int child = 0; child < 2; ++child)
+	{
+		enumLeaves(root->childs[child], policy);
+	}
+}
+
+// Tree-vs-tree query using a locally allocated stack: calls policy.Process(a, b) for every pair
+// of leaves (a from root0's tree, b from root1's tree) whose volumes intersect. A pair made of
+// the same node twice is expanded into its child combinations without a volume test. Does
+// nothing when either root is null.
+B3_DBVT_PREFIX
+inline void b3DynamicBvh::collideTT(const b3DbvtNode* root0,
+									const b3DbvtNode* root1,
+									B3_DBVT_IPOLICY)
+{
+	B3_DBVT_CHECKTYPE
+	if (!root0 || !root1) return;
+
+	b3AlignedObjectArray<sStkNN> pending;
+	pending.resize(B3_DOUBLE_STACKSIZE);
+	pending[0] = sStkNN(root0, root1);
+	int top = 1;
+	int growLimit = B3_DOUBLE_STACKSIZE - 4;  // an iteration pushes at most four pairs
+	while (top != 0)
+	{
+		const sStkNN current = pending[--top];
+		if (top > growLimit)
+		{
+			pending.resize(pending.size() * 2);
+			growLimit = pending.size() - 4;
+		}
+		const b3DbvtNode* const nodeA = current.a;
+		const b3DbvtNode* const nodeB = current.b;
+
+		if (nodeA == nodeB)
+		{
+			// Same node on both sides: test each child against itself and against its sibling.
+			if (nodeA->isinternal())
+			{
+				pending[top++] = sStkNN(nodeA->childs[0], nodeA->childs[0]);
+				pending[top++] = sStkNN(nodeA->childs[1], nodeA->childs[1]);
+				pending[top++] = sStkNN(nodeA->childs[0], nodeA->childs[1]);
+			}
+			continue;
+		}
+		if (!b3Intersect(nodeA->volume, nodeB->volume)) continue;
+
+		// Each side contributes either itself (leaf) or its two children (internal node).
+		const b3DbvtNode* sideA[2] = {nodeA, nodeA};
+		const b3DbvtNode* sideB[2] = {nodeB, nodeB};
+		int countA = 1;
+		int countB = 1;
+		if (nodeA->isinternal())
+		{
+			sideA[0] = nodeA->childs[0];
+			sideA[1] = nodeA->childs[1];
+			countA = 2;
+		}
+		if (nodeB->isinternal())
+		{
+			sideB[0] = nodeB->childs[0];
+			sideB[1] = nodeB->childs[1];
+			countB = 2;
+		}
+		if ((countA == 1) && (countB == 1))
+		{
+			policy.Process(nodeA, nodeB);
+			continue;
+		}
+		for (int ib = 0; ib < countB; ++ib)
+		{
+			for (int ia = 0; ia < countA; ++ia)
+			{
+				pending[top++] = sStkNN(sideA[ia], sideB[ib]);
+			}
+		}
+	}
+}
+
+// Same traversal as collideTT, but the work stack is the member m_stkStack, which is resized
+// to B3_DOUBLE_STACKSIZE on every call and grown when needed. Calls policy.Process(a, b) for
+// every intersecting leaf pair; does nothing when either root is null.
+B3_DBVT_PREFIX
+inline void b3DynamicBvh::collideTTpersistentStack(const b3DbvtNode* root0,
+												   const b3DbvtNode* root1,
+												   B3_DBVT_IPOLICY)
+{
+	B3_DBVT_CHECKTYPE
+	if (!root0 || !root1) return;
+
+	m_stkStack.resize(B3_DOUBLE_STACKSIZE);
+	m_stkStack[0] = sStkNN(root0, root1);
+	int top = 1;
+	int growLimit = B3_DOUBLE_STACKSIZE - 4;  // an iteration pushes at most four pairs
+	while (top != 0)
+	{
+		const sStkNN current = m_stkStack[--top];
+		if (top > growLimit)
+		{
+			m_stkStack.resize(m_stkStack.size() * 2);
+			growLimit = m_stkStack.size() - 4;
+		}
+		const b3DbvtNode* const nodeA = current.a;
+		const b3DbvtNode* const nodeB = current.b;
+
+		if (nodeA == nodeB)
+		{
+			// Same node on both sides: test each child against itself and against its sibling.
+			if (nodeA->isinternal())
+			{
+				m_stkStack[top++] = sStkNN(nodeA->childs[0], nodeA->childs[0]);
+				m_stkStack[top++] = sStkNN(nodeA->childs[1], nodeA->childs[1]);
+				m_stkStack[top++] = sStkNN(nodeA->childs[0], nodeA->childs[1]);
+			}
+			continue;
+		}
+		if (!b3Intersect(nodeA->volume, nodeB->volume)) continue;
+
+		// Each side contributes either itself (leaf) or its two children (internal node).
+		const b3DbvtNode* sideA[2] = {nodeA, nodeA};
+		const b3DbvtNode* sideB[2] = {nodeB, nodeB};
+		int countA = 1;
+		int countB = 1;
+		if (nodeA->isinternal())
+		{
+			sideA[0] = nodeA->childs[0];
+			sideA[1] = nodeA->childs[1];
+			countA = 2;
+		}
+		if (nodeB->isinternal())
+		{
+			sideB[0] = nodeB->childs[0];
+			sideB[1] = nodeB->childs[1];
+			countB = 2;
+		}
+		if ((countA == 1) && (countB == 1))
+		{
+			policy.Process(nodeA, nodeB);
+			continue;
+		}
+		for (int ib = 0; ib < countB; ++ib)
+		{
+			for (int ia = 0; ia < countA; ++ia)
+			{
+				m_stkStack[top++] = sStkNN(sideA[ia], sideB[ib]);
+			}
+		}
+	}
+}
+
+#if 0
+//
+B3_DBVT_PREFIX
+inline void		b3DynamicBvh::collideTT(	const b3DbvtNode* root0,
+								  const b3DbvtNode* root1,
+								  const b3Transform& xform,
+								  B3_DBVT_IPOLICY)
+{
+	B3_DBVT_CHECKTYPE
+		if(root0&&root1)
+		{
+			int								depth=1;
+			int								treshold=B3_DOUBLE_STACKSIZE-4;
+			b3AlignedObjectArray<sStkNN>	stkStack;
+			stkStack.resize(B3_DOUBLE_STACKSIZE);
+			stkStack[0]=sStkNN(root0,root1);
+			do	{
+				sStkNN	p=stkStack[--depth];
+				if(b3Intersect(p.a->volume,p.b->volume,xform))
+				{
+					if(depth>treshold)
+					{
+						stkStack.resize(stkStack.size()*2);
+						treshold=stkStack.size()-4;
+					}
+					if(p.a->isinternal())
+					{
+						if(p.b->isinternal())
+						{					
+							stkStack[depth++]=sStkNN(p.a->childs[0],p.b->childs[0]);
+							stkStack[depth++]=sStkNN(p.a->childs[1],p.b->childs[0]);
+							stkStack[depth++]=sStkNN(p.a->childs[0],p.b->childs[1]);
+							stkStack[depth++]=sStkNN(p.a->childs[1],p.b->childs[1]);
+						}
+						else
+						{
+							stkStack[depth++]=sStkNN(p.a->childs[0],p.b);
+							stkStack[depth++]=sStkNN(p.a->childs[1],p.b);
+						}
+					}
+					else
+					{
+						if(p.b->isinternal())
+						{
+							stkStack[depth++]=sStkNN(p.a,p.b->childs[0]);
+							stkStack[depth++]=sStkNN(p.a,p.b->childs[1]);
+						}
+						else
+						{
+							policy.Process(p.a,p.b);
+						}
+					}
+				}
+			} while(depth);
+		}
+}
+//
+B3_DBVT_PREFIX
+inline void		b3DynamicBvh::collideTT(	const b3DbvtNode* root0,
+								  const b3Transform& xform0,
+								  const b3DbvtNode* root1,
+								  const b3Transform& xform1,
+								  B3_DBVT_IPOLICY)
+{
+	const b3Transform	xform=xform0.inverse()*xform1;
+	collideTT(root0,root1,xform,policy);
+}
+#endif
+
+// Tree-vs-volume query: calls policy.Process(leaf) for every leaf under root whose volume
+// intersects vol. Subtrees whose volume misses vol are skipped. Does nothing when root is null.
+B3_DBVT_PREFIX
+inline void b3DynamicBvh::collideTV(const b3DbvtNode* root,
+									const b3DbvtVolume& vol,
+									B3_DBVT_IPOLICY) const
+{
+	B3_DBVT_CHECKTYPE
+	if (!root) return;
+
+	// Work on a 16-byte aligned copy of the query volume.
+	B3_ATTRIBUTE_ALIGNED16(b3DbvtVolume)
+	query(vol);
+	b3AlignedObjectArray<const b3DbvtNode*> pending;
+	pending.resize(0);
+	pending.reserve(B3_SIMPLE_STACKSIZE);
+	pending.push_back(root);
+	while (pending.size() > 0)
+	{
+		const b3DbvtNode* node = pending[pending.size() - 1];
+		pending.pop_back();
+		if (!b3Intersect(node->volume, query)) continue;
+		if (!node->isinternal())
+		{
+			policy.Process(node);
+			continue;
+		}
+		pending.push_back(node->childs[0]);
+		pending.push_back(node->childs[1]);
+	}
+}
+
+// Ray query with caller-supplied precomputed data (inverse direction, signs, lambda_max), using
+// the member stack m_rayTestStack. Each node's bounds are widened by the given aabbMin/aabbMax
+// (mins - aabbMax, maxs - aabbMin) before the b3RayAabb2 test; policy.Process(leaf) is called
+// for every leaf that passes. rayTo is unused. Does nothing when root is null.
+B3_DBVT_PREFIX
+inline void b3DynamicBvh::rayTestInternal(const b3DbvtNode* root,
+										  const b3Vector3& rayFrom,
+										  const b3Vector3& rayTo,
+										  const b3Vector3& rayDirectionInverse,
+										  unsigned int signs[3],
+										  b3Scalar lambda_max,
+										  const b3Vector3& aabbMin,
+										  const b3Vector3& aabbMax,
+										  B3_DBVT_IPOLICY) const
+{
+	(void)rayTo;
+	B3_DBVT_CHECKTYPE
+	if (!root) return;
+
+	b3AlignedObjectArray<const b3DbvtNode*>& pending = m_rayTestStack;
+	pending.resize(B3_DOUBLE_STACKSIZE);
+	pending[0] = root;
+	int top = 1;
+	int growLimit = B3_DOUBLE_STACKSIZE - 2;  // an iteration pushes at most two nodes
+	b3Vector3 widened[2];
+	while (top != 0)
+	{
+		const b3DbvtNode* node = pending[--top];
+		widened[0] = node->volume.Mins() - aabbMax;
+		widened[1] = node->volume.Maxs() - aabbMin;
+		b3Scalar tmin = 1.f;
+		b3Scalar lambda_min = 0.f;
+		const unsigned int hit = b3RayAabb2(rayFrom, rayDirectionInverse, signs, widened, tmin, lambda_min, lambda_max);
+		if (!hit) continue;
+		if (!node->isinternal())
+		{
+			policy.Process(node);
+			continue;
+		}
+		if (top > growLimit)
+		{
+			pending.resize(pending.size() * 2);
+			growLimit = pending.size() - 2;
+		}
+		pending[top++] = node->childs[0];
+		pending[top++] = node->childs[1];
+	}
+}
+
+// Self-contained ray query from rayFrom to rayTo: computes the normalised direction, its
+// per-axis inverse and signs, then walks the tree with a locally allocated stack and calls
+// policy.Process(leaf) for every leaf whose volume passes b3RayAabb2. Does nothing when root is null.
+B3_DBVT_PREFIX
+inline void b3DynamicBvh::rayTest(const b3DbvtNode* root,
+								  const b3Vector3& rayFrom,
+								  const b3Vector3& rayTo,
+								  B3_DBVT_IPOLICY)
+{
+	B3_DBVT_CHECKTYPE
+	if (!root) return;
+
+	b3Vector3 rayDir = (rayTo - rayFrom);
+	rayDir.normalize();
+
+	// A zero direction component would divide by zero, so its inverse is set to B3_LARGE_FLOAT instead.
+	b3Vector3 rayDirectionInverse;
+	unsigned int signs[3];
+	for (int axis = 0; axis < 3; ++axis)
+	{
+		if (rayDir[axis] == b3Scalar(0.0))
+			rayDirectionInverse[axis] = b3Scalar(B3_LARGE_FLOAT);
+		else
+			rayDirectionInverse[axis] = b3Scalar(1.0) / rayDir[axis];
+		signs[axis] = rayDirectionInverse[axis] < 0.0;
+	}
+
+	b3Scalar lambda_max = rayDir.dot(rayTo - rayFrom);
+#ifdef COMPARE_BTRAY_AABB2
+	b3Vector3 resultNormal;
+#endif  //COMPARE_BTRAY_AABB2
+
+	b3AlignedObjectArray<const b3DbvtNode*> pending;
+	pending.resize(B3_DOUBLE_STACKSIZE);
+	pending[0] = root;
+	int top = 1;
+	int growLimit = B3_DOUBLE_STACKSIZE - 2;  // an iteration pushes at most two nodes
+	b3Vector3 bounds[2];
+	while (top != 0)
+	{
+		const b3DbvtNode* node = pending[--top];
+
+		bounds[0] = node->volume.Mins();
+		bounds[1] = node->volume.Maxs();
+
+		b3Scalar tmin = 1.f;
+		b3Scalar lambda_min = 0.f;
+		unsigned int result1 = b3RayAabb2(rayFrom, rayDirectionInverse, signs, bounds, tmin, lambda_min, lambda_max);
+
+#ifdef COMPARE_BTRAY_AABB2
+		b3Scalar param = 1.f;
+		bool result2 = b3RayAabb(rayFrom, rayTo, node->volume.Mins(), node->volume.Maxs(), param, resultNormal);
+		b3Assert(result1 == result2);
+#endif  //COMPARE_BTRAY_AABB2
+
+		if (!result1) continue;
+		if (!node->isinternal())
+		{
+			policy.Process(node);
+			continue;
+		}
+		if (top > growLimit)
+		{
+			pending.resize(pending.size() * 2);
+			growLimit = pending.size() - 2;
+		}
+		pending[top++] = node->childs[0];
+		pending[top++] = node->childs[1];
+	}
+}
+
+// Culls the tree against count planes (normals[k], offsets[k]). A node is dropped as soon as
+// Classify returns -1 for a plane; a +1 result marks that plane in the node's mask so that
+// descendants skip it. When every plane is marked, or the node is a leaf, policy.AllLeaves(node)
+// decides whether all leaves below it are passed to policy.Process via enumLeaves.
+// count must be smaller than the number of bits in an unsigned (asserted).
+B3_DBVT_PREFIX
+inline void b3DynamicBvh::collideKDOP(const b3DbvtNode* root,
+									  const b3Vector3* normals,
+									  const b3Scalar* offsets,
+									  int count,
+									  B3_DBVT_IPOLICY)
+{
+	B3_DBVT_CHECKTYPE
+	if (!root) return;
+
+	const int allPlanes = (1 << count) - 1;
+	b3AlignedObjectArray<sStkNP> pending;
+	int cornerSigns[sizeof(unsigned) * 8];
+	b3Assert(count < int(sizeof(cornerSigns) / sizeof(cornerSigns[0])));
+	for (int plane = 0; plane < count; ++plane)
+	{
+		// Corner selector handed to Classify: one bit per non-negative normal component.
+		int selector = 0;
+		if (normals[plane].x >= 0) selector += 1;
+		if (normals[plane].y >= 0) selector += 2;
+		if (normals[plane].z >= 0) selector += 4;
+		cornerSigns[plane] = selector;
+	}
+	pending.reserve(B3_SIMPLE_STACKSIZE);
+	pending.push_back(sStkNP(root, 0));
+	while (pending.size())
+	{
+		sStkNP entry = pending[pending.size() - 1];
+		pending.pop_back();
+
+		bool culled = false;
+		for (int plane = 0; plane < count; ++plane)
+		{
+			const int bit = 1 << plane;
+			if (entry.mask & bit) continue;  // already known to be on the positive side
+			const int side = entry.node->volume.Classify(normals[plane], offsets[plane], cornerSigns[plane]);
+			if (side == -1)
+			{
+				culled = true;
+				break;
+			}
+			if (side == +1) entry.mask |= bit;
+		}
+		if (culled) continue;
+
+		if ((entry.mask != allPlanes) && (entry.node->isinternal()))
+		{
+			pending.push_back(sStkNP(entry.node->childs[0], entry.mask));
+			pending.push_back(sStkNP(entry.node->childs[1], entry.mask));
+		}
+		else if (policy.AllLeaves(entry.node))
+		{
+			enumLeaves(entry.node, policy);
+		}
+	}
+}
+
+// Plane-culled traversal ordered along sortaxis: nodes for which Classify returns -1 on any of the count planes are dropped, policy.Descent gates further work on a node, and leaves are reported through policy.Process(node, value).
+B3_DBVT_PREFIX
+inline void b3DynamicBvh::collideOCL(const b3DbvtNode* root,
+									 const b3Vector3* normals,
+									 const b3Scalar* offsets,
+									 const b3Vector3& sortaxis,
+									 int count,
+									 B3_DBVT_IPOLICY,
+									 bool fsort)
+{
+	B3_DBVT_CHECKTYPE
+	if (root)
+	{
+		const unsigned srtsgns = (sortaxis[0] >= 0 ? 1 : 0) +
+								 (sortaxis[1] >= 0 ? 2 : 0) +
+								 (sortaxis[2] >= 0 ? 4 : 0);  // corner selector passed to ProjectMinimum
+		const int inside = (1 << count) - 1;  // mask value with one bit set per plane
+		b3AlignedObjectArray<sStkNPS> stock;  // storage for the pending entries
+		b3AlignedObjectArray<int> ifree;      // indices of stock slots that can be reused
+		b3AlignedObjectArray<int> stack;      // traversal stack of indices into stock
+		int signs[sizeof(unsigned) * 8];
+		b3Assert(count < int(sizeof(signs) / sizeof(signs[0])));
+		for (int i = 0; i < count; ++i)
+		{
+			signs[i] = ((normals[i].x >= 0) ? 1 : 0) +
+					   ((normals[i].y >= 0) ? 2 : 0) +
+					   ((normals[i].z >= 0) ? 4 : 0);
+		}
+		stock.reserve(B3_SIMPLE_STACKSIZE);
+		stack.reserve(B3_SIMPLE_STACKSIZE);
+		ifree.reserve(B3_SIMPLE_STACKSIZE);
+		stack.push_back(allocate(ifree, stock, sStkNPS(root, 0, root->volume.ProjectMinimum(sortaxis, srtsgns))));
+		do
+		{
+			const int id = stack[stack.size() - 1];
+			sStkNPS se = stock[id];  // copied out because the slot is released just below
+			stack.pop_back();
+			ifree.push_back(id);
+			if (se.mask != inside)
+			{
+				bool out = false;
+				for (int i = 0, j = 1; (!out) && (i < count); ++i, j <<= 1)
+				{
+					if (0 == (se.mask & j))
+					{
+						const int side = se.node->volume.Classify(normals[i], offsets[i], signs[i]);
+						switch (side)
+						{
+							case -1:
+								out = true;
+								break;
+							case +1:
+								se.mask |= j;  // plane i needs no further tests below this node
+								break;
+						}
+					}
+				}
+				if (out) continue;
+			}
+			if (policy.Descent(se.node))
+			{
+				if (se.node->isinternal())
+				{
+					const b3DbvtNode* pns[] = {se.node->childs[0], se.node->childs[1]};
+					sStkNPS nes[] = {sStkNPS(pns[0], se.mask, pns[0]->volume.ProjectMinimum(sortaxis, srtsgns)),
+									 sStkNPS(pns[1], se.mask, pns[1]->volume.ProjectMinimum(sortaxis, srtsgns))};
+					const int q = nes[0].value < nes[1].value ? 1 : 0;  // q indexes the child with the larger (or equal) value
+					int j = stack.size();
+					if (fsort && (j > 0))
+					{
+						/* Insert 0	*/
+						j = nearest(&stack[0], &stock[0], nes[q].value, 0, stack.size());  // insertion position found by binary search on value
+						stack.push_back(0);  // grow by one, then shift the tail up to open slot j
+#if B3_DBVT_USE_MEMMOVE
+						memmove(&stack[j + 1], &stack[j], sizeof(int) * (stack.size() - j - 1));
+#else
+						for (int k = stack.size() - 1; k > j; --k) stack[k] = stack[k - 1];
+#endif
+						stack[j] = allocate(ifree, stock, nes[q]);
+						/* Insert 1	*/
+						j = nearest(&stack[0], &stock[0], nes[1 - q].value, j, stack.size());  // search resumes at the previous insertion position
+						stack.push_back(0);
+#if B3_DBVT_USE_MEMMOVE
+						memmove(&stack[j + 1], &stack[j], sizeof(int) * (stack.size() - j - 1));
+#else
+						for (int k = stack.size() - 1; k > j; --k) stack[k] = stack[k - 1];
+#endif
+						stack[j] = allocate(ifree, stock, nes[1 - q]);
+					}
+					else
+					{
+						stack.push_back(allocate(ifree, stock, nes[q]));
+						stack.push_back(allocate(ifree, stock, nes[1 - q]));  // pushed last, so it is popped first
+					}
+				}
+				else
+				{
+					policy.Process(se.node, se.value);
+				}
+			}
+		} while (stack.size());
+	}
+}
+
+// Policy-driven traversal: a node is examined only when policy.Descent(node) returns true;
+// internal nodes then have both children queued and leaves are passed to policy.Process.
+// Does nothing when root is null.
+B3_DBVT_PREFIX
+inline void b3DynamicBvh::collideTU(const b3DbvtNode* root,
+									B3_DBVT_IPOLICY)
+{
+	B3_DBVT_CHECKTYPE
+	if (!root) return;
+
+	b3AlignedObjectArray<const b3DbvtNode*> pending;
+	pending.reserve(B3_SIMPLE_STACKSIZE);
+	pending.push_back(root);
+	while (pending.size() > 0)
+	{
+		const b3DbvtNode* node = pending[pending.size() - 1];
+		pending.pop_back();
+		if (!policy.Descent(node)) continue;
+		if (!node->isinternal())
+		{
+			policy.Process(node);
+			continue;
+		}
+		pending.push_back(node->childs[0]);
+		pending.push_back(node->childs[1]);
+	}
+}
+
+//
+// PP Cleanup
+//
+
+#undef B3_DBVT_USE_MEMMOVE
+#undef B3_DBVT_USE_TEMPLATE
+#undef B3_DBVT_VIRTUAL_DTOR
+#undef B3_DBVT_VIRTUAL
+#undef B3_DBVT_PREFIX
+#undef B3_DBVT_IPOLICY
+#undef B3_DBVT_CHECKTYPE
+#undef B3_DBVT_IMPL_GENERIC
+#undef B3_DBVT_IMPL_SSE
+#undef B3_DBVT_USE_INTRINSIC_SSE
+#undef B3_DBVT_SELECT_IMPL
+#undef B3_DBVT_MERGE_IMPL
+#undef B3_DBVT_INT0_IMPL
+
+#endif

+ 808 - 0
Dependencies/include/bullet3/Bullet3Collision/BroadPhaseCollision/b3DynamicBvhBroadphase.cpp

@@ -0,0 +1,808 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2013 Erwin Coumans  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+///b3DynamicBvhBroadphase implementation by Nathanael Presson
+
+#include "b3DynamicBvhBroadphase.h"
+#include "b3OverlappingPair.h"
+
+//
+// Profiling
+//
+
+#if B3_DBVT_BP_PROFILE || B3_DBVT_BP_ENABLE_BENCHMARK
+#include <stdio.h>
+#endif
+
+#if B3_DBVT_BP_PROFILE
+/// Scoped timer: stores the clock reading taken at construction and, on destruction,
+/// adds the microseconds elapsed since then to the referenced counter.
+struct b3ProfileScope
+{
+	b3Clock* m_clock;
+	unsigned long* m_value;
+	unsigned long m_base;
+
+	__forceinline b3ProfileScope(b3Clock& clock, unsigned long& value)
+		: m_clock(&clock),
+		  m_value(&value),
+		  m_base(clock.getTimeMicroseconds())
+	{
+	}
+	__forceinline ~b3ProfileScope()
+	{
+		const unsigned long elapsed = m_clock->getTimeMicroseconds() - m_base;
+		*m_value += elapsed;
+	}
+};
+/// Times the rest of the enclosing scope into _value_ (expands to nothing when profiling is off).
+#define b3SPC(_value_) b3ProfileScope spc_scope(m_clock, _value_)
+#else
+#define b3SPC(_value_)
+#endif
+
+//
+// Helpers
+//
+
+// Pushes item at the front of the doubly linked list whose head pointer is list
+// (links[0] = previous element, links[1] = next element); list is updated to point at item.
+template <typename T>
+static inline void b3ListAppend(T* item, T*& list)
+{
+	item->links[0] = 0;
+	T* const formerHead = list;
+	item->links[1] = formerHead;
+	if (formerHead != 0)
+	{
+		formerHead->links[0] = item;
+	}
+	list = item;
+}
+
+// Unlinks item from the doubly linked list whose head pointer is list. When item has no
+// predecessor the head moves to item's successor. item's own links are left untouched.
+template <typename T>
+static inline void b3ListRemove(T* item, T*& list)
+{
+	T* const previous = item->links[0];
+	T* const next = item->links[1];
+	if (previous)
+		previous->links[1] = next;
+	else
+		list = next;
+	if (next)
+	{
+		next->links[0] = previous;
+	}
+}
+
+// Counts the elements reachable from root by following links[1]; returns 0 for a null root.
+template <typename T>
+static inline int b3ListCount(T* root)
+{
+	int count = 0;
+	for (T* node = root; node; node = node->links[1])
+	{
+		++count;
+	}
+	return count;
+}
+
+// Overwrites value with a function-local static instance of a type derived from T (presumably relied upon to be zero-filled through static initialisation -- confirm for the T in use).
+template <typename T>
+static inline void b3Clear(T& value)
+{
+	static const struct ZeroDummy : T
+	{
+	} zerodummy;
+	value = zerodummy;  // copy-assigns the T part of the static object
+}
+
+//
+// Colliders
+//
+
+/* Tree collider	*/
+/// Collision policy that records every reported pair of distinct leaves in the
+/// broadphase's pair cache and counts it in m_newpairs.
+struct b3DbvtTreeCollider : b3DynamicBvh::ICollide
+{
+	/// Broadphase whose pair cache and new-pair counter are updated.
+	b3DynamicBvhBroadphase* pbp;
+	/// Proxy used as the second leaf by the single-node Process overload; it must be
+	/// assigned before that overload is used.
+	b3DbvtProxy* proxy;
+	/// proxy starts out null instead of indeterminate, so using the single-node
+	/// overload without assigning it fails on a null pointer rather than on garbage.
+	b3DbvtTreeCollider(b3DynamicBvhBroadphase* p) : pbp(p), proxy(0) {}
+	/// Adds the pair (na, nb) to the pair cache unless both arguments are the same node.
+	void Process(const b3DbvtNode* na, const b3DbvtNode* nb)
+	{
+		if (na != nb)
+		{
+			b3DbvtProxy* pa = (b3DbvtProxy*)na->data;
+			b3DbvtProxy* pb = (b3DbvtProxy*)nb->data;
+#if B3_DBVT_BP_SORTPAIRS
+			// Keep the pair ordered by unique id.
+			if (pa->m_uniqueId > pb->m_uniqueId)
+				b3Swap(pa, pb);
+#endif
+			pbp->m_paircache->addOverlappingPair(pa->getUid(), pb->getUid());
+			++pbp->m_newpairs;
+		}
+	}
+	/// Single-node form: pairs n with the leaf of proxy.
+	void Process(const b3DbvtNode* n)
+	{
+		Process(n, proxy->leaf);
+	}
+};
+
+//
+// b3DynamicBvhBroadphase
+//
+
+// Sets up an empty broadphase with storage for proxyCapacity proxies. When paircache is null a
+// b3HashedOverlappingPairCache is created in 16-byte aligned memory and flagged for release in the destructor.
+b3DynamicBvhBroadphase::b3DynamicBvhBroadphase(int proxyCapacity, b3OverlappingPairCache* paircache)
+{
+	m_deferedcollide = false;
+	m_needcleanup = true;
+	// The cache is released by this object only when it was created here.
+	m_releasepaircache = (paircache == 0);
+	m_prediction = 0;
+	m_stageCurrent = 0;
+	m_fixedleft = 0;
+	m_fupdates = 1;
+	m_dupdates = 0;
+	m_cupdates = 10;
+	m_newpairs = 1;
+	m_updates_call = 0;
+	m_updates_done = 0;
+	m_updates_ratio = 0;
+	if (paircache)
+	{
+		m_paircache = paircache;
+	}
+	else
+	{
+		void* const storage = b3AlignedAlloc(sizeof(b3HashedOverlappingPairCache), 16);
+		m_paircache = new (storage) b3HashedOverlappingPairCache();
+	}
+
+	m_pid = 0;
+	m_cid = 0;
+	// STAGECOUNT + 1 list heads: the extra entry at index STAGECOUNT is included.
+	for (int stage = 0; stage < STAGECOUNT + 1; ++stage)
+	{
+		m_stageRoots[stage] = 0;
+	}
+#if B3_DBVT_BP_PROFILE
+	b3Clear(m_profiling);
+#endif
+	m_proxies.resize(proxyCapacity);
+}
+
+// Destroys and frees the pair cache, but only when m_releasepaircache is set.
+b3DynamicBvhBroadphase::~b3DynamicBvhBroadphase()
+{
+	if (!m_releasepaircache) return;
+
+	// Explicit destructor call followed by b3AlignedFree, mirroring the placement-new creation.
+	m_paircache->~b3OverlappingPairCache();
+	b3AlignedFree(m_paircache);
+}
+
+// Constructs the proxy for objectId in place inside m_proxies, inserts its leaf into tree set 0,
+// links it into the current stage list and, unless collision is deferred, collects overlapping
+// pairs against both tree sets. Returns the new proxy.
+b3BroadphaseProxy* b3DynamicBvhBroadphase::createProxy(const b3Vector3& aabbMin,
+													   const b3Vector3& aabbMax,
+													   int objectId,
+													   void* userPtr,
+													   int collisionFilterGroup,
+													   int collisionFilterMask)
+{
+	// The proxy lives in the slot indexed by objectId.
+	b3DbvtProxy* const slot = &m_proxies[objectId];
+	b3DbvtProxy* const proxy = new (slot) b3DbvtProxy(aabbMin, aabbMax, userPtr,
+													  collisionFilterGroup,
+													  collisionFilterMask);
+
+	b3DbvtAabbMm aabb = b3DbvtVolume::FromMM(aabbMin, aabbMax);
+
+	proxy->stage = m_stageCurrent;
+	proxy->m_uniqueId = objectId;
+	proxy->leaf = m_sets[0].insert(aabb, proxy);
+	b3ListAppend(proxy, m_stageRoots[m_stageCurrent]);
+
+	if (m_deferedcollide)
+	{
+		return (proxy);
+	}
+
+	b3DbvtTreeCollider collider(this);
+	collider.proxy = proxy;
+	for (int set = 0; set < 2; ++set)
+	{
+		m_sets[set].collideTV(m_sets[set].m_root, aabb, collider);
+	}
+	return (proxy);
+}
+
+// Removes the proxy's leaf from the tree set it lives in (set 1 when its stage equals STAGECOUNT,
+// otherwise set 0), unlinks it from its stage list, drops all cached pairs that contain it and
+// flags the broadphase for cleanup.
+void b3DynamicBvhBroadphase::destroyProxy(b3BroadphaseProxy* absproxy,
+										  b3Dispatcher* dispatcher)
+{
+	b3DbvtProxy* proxy = (b3DbvtProxy*)absproxy;
+	const int setIndex = (proxy->stage == STAGECOUNT) ? 1 : 0;
+	m_sets[setIndex].remove(proxy->leaf);
+	b3ListRemove(proxy, m_stageRoots[proxy->stage]);
+	m_paircache->removeOverlappingPairsContainingProxy(proxy->getUid(), dispatcher);
+
+	m_needcleanup = true;
+}
+
+// Copies the AABB stored in the proxy for objectId into aabbMin / aabbMax.
+void b3DynamicBvhBroadphase::getAabb(int objectId, b3Vector3& aabbMin, b3Vector3& aabbMax) const
+{
+	const b3DbvtProxy& stored = m_proxies[objectId];
+	aabbMin = stored.m_aabbMin;
+	aabbMax = stored.m_aabbMax;
+}
+/*
+void	b3DynamicBvhBroadphase::getAabb(b3BroadphaseProxy* absproxy,b3Vector3& aabbMin, b3Vector3& aabbMax ) const
+{
+	b3DbvtProxy*						proxy=(b3DbvtProxy*)absproxy;
+	aabbMin = proxy->m_aabbMin;
+	aabbMax = proxy->m_aabbMax;
+}
+*/
+
+/// Collision policy that forwards the proxy stored in each reported leaf to a ray callback.
+struct BroadphaseRayTester : b3DynamicBvh::ICollide
+{
+	b3BroadphaseRayCallback& m_rayCallback;
+
+	BroadphaseRayTester(b3BroadphaseRayCallback& orgCallback) : m_rayCallback(orgCallback) {}
+
+	/// The leaf's data pointer is interpreted as the b3DbvtProxy it was inserted with.
+	void Process(const b3DbvtNode* leaf)
+	{
+		b3DbvtProxy* const hitProxy = (b3DbvtProxy*)leaf->data;
+		m_rayCallback.process(hitProxy);
+	}
+};
+
+// Casts the ray through tree set 0 and then tree set 1, using the inverse direction, signs and
+// lambda_max carried by rayCallback; every leaf that passes is reported via rayCallback.process.
+void b3DynamicBvhBroadphase::rayTest(const b3Vector3& rayFrom, const b3Vector3& rayTo, b3BroadphaseRayCallback& rayCallback, const b3Vector3& aabbMin, const b3Vector3& aabbMax)
+{
+	BroadphaseRayTester callback(rayCallback);
+
+	for (int set = 0; set < 2; ++set)
+	{
+		m_sets[set].rayTestInternal(m_sets[set].m_root,
+									rayFrom,
+									rayTo,
+									rayCallback.m_rayDirectionInverse,
+									rayCallback.m_signs,
+									rayCallback.m_lambda_max,
+									aabbMin,
+									aabbMax,
+									callback);
+	}
+}
+
+/// Collision policy that forwards the proxy stored in each reported leaf to an AABB callback.
+struct BroadphaseAabbTester : b3DynamicBvh::ICollide
+{
+	b3BroadphaseAabbCallback& m_aabbCallback;
+
+	BroadphaseAabbTester(b3BroadphaseAabbCallback& orgCallback) : m_aabbCallback(orgCallback) {}
+
+	/// The leaf's data pointer is interpreted as the b3DbvtProxy it was inserted with.
+	void Process(const b3DbvtNode* leaf)
+	{
+		b3DbvtProxy* const hitProxy = (b3DbvtProxy*)leaf->data;
+		m_aabbCallback.process(hitProxy);
+	}
+};
+
+// Reports, through aabbCallback.process, every proxy in either tree set whose leaf volume
+// intersects the box [aabbMin, aabbMax]; set 0 is queried before set 1.
+void b3DynamicBvhBroadphase::aabbTest(const b3Vector3& aabbMin, const b3Vector3& aabbMax, b3BroadphaseAabbCallback& aabbCallback)
+{
+	BroadphaseAabbTester callback(aabbCallback);
+
+	const B3_ATTRIBUTE_ALIGNED16(b3DbvtVolume) bounds = b3DbvtVolume::FromMM(aabbMin, aabbMax);
+	for (int set = 0; set < 2; ++set)
+	{
+		m_sets[set].collideTV(m_sets[set].m_root, bounds, callback);
+	}
+}
+
+// Stores a new AABB for the proxy of objectId: its tree leaf is re-inserted or updated, the proxy moves to the current stage list and, unless collision is deferred, new overlapping pairs are collected.
+void b3DynamicBvhBroadphase::setAabb(int objectId,
+									 const b3Vector3& aabbMin,
+									 const b3Vector3& aabbMax,
+									 b3Dispatcher* /*dispatcher*/)
+{
+	b3DbvtProxy* proxy = &m_proxies[objectId];
+	//	b3DbvtProxy*						proxy=(b3DbvtProxy*)absproxy;
+	B3_ATTRIBUTE_ALIGNED16(b3DbvtVolume)
+	aabb = b3DbvtVolume::FromMM(aabbMin, aabbMax);
+#if B3_DBVT_BP_PREVENTFALSEUPDATE
+	if (b3NotEqual(aabb, proxy->leaf->volume))  // when enabled, skips everything below if the volume is unchanged
+#endif
+	{
+		bool docollide = false;
+		if (proxy->stage == STAGECOUNT)
+		{ /* fixed -> dynamic set	*/
+			m_sets[1].remove(proxy->leaf);
+			proxy->leaf = m_sets[0].insert(aabb, proxy);
+			docollide = true;
+		}
+		else
+		{ /* dynamic set				*/
+			++m_updates_call;
+			if (b3Intersect(proxy->leaf->volume, aabb))
+			{ /* Moving				*/
+
+				const b3Vector3 delta = aabbMin - proxy->m_aabbMin;
+				b3Vector3 velocity(((proxy->m_aabbMax - proxy->m_aabbMin) / 2) * m_prediction);  // half of the stored extents scaled by m_prediction
+				if (delta[0] < 0) velocity[0] = -velocity[0];  // each component takes the sign of the movement of the minimum corner
+				if (delta[1] < 0) velocity[1] = -velocity[1];
+				if (delta[2] < 0) velocity[2] = -velocity[2];
+				if (
+#ifdef B3_DBVT_BP_MARGIN
+					m_sets[0].update(proxy->leaf, aabb, velocity, B3_DBVT_BP_MARGIN)
+#else
+					m_sets[0].update(proxy->leaf, aabb, velocity)
+#endif
+				)
+				{
+					++m_updates_done;  // counted only when update() returns true
+					docollide = true;
+				}
+			}
+			else
+			{ /* Teleporting			*/
+				m_sets[0].update(proxy->leaf, aabb);
+				++m_updates_done;
+				docollide = true;
+			}
+		}
+		b3ListRemove(proxy, m_stageRoots[proxy->stage]);  // unlink from the old stage list before the stage is overwritten
+		proxy->m_aabbMin = aabbMin;
+		proxy->m_aabbMax = aabbMax;
+		proxy->stage = m_stageCurrent;
+		b3ListAppend(proxy, m_stageRoots[m_stageCurrent]);
+		if (docollide)
+		{
+			m_needcleanup = true;
+			if (!m_deferedcollide)
+			{
+				b3DbvtTreeCollider collider(this);  // collider.proxy is not assigned; only the two-node Process overload is reached from collideTTpersistentStack
+				m_sets[1].collideTTpersistentStack(m_sets[1].m_root, proxy->leaf, collider);
+				m_sets[0].collideTTpersistentStack(m_sets[0].m_root, proxy->leaf, collider);
+			}
+		}
+	}
+}
+
+//
+/// Unconditionally replace the proxy's bounds with [aabbMin, aabbMax].
+/// Unlike setAabb there is no "moving" shortcut: a proxy in the fixed set is
+/// re-inserted into the dynamic set, any other proxy gets a plain tree update,
+/// and pair collection is always requested. The dispatcher argument is unused.
+void b3DynamicBvhBroadphase::setAabbForceUpdate(b3BroadphaseProxy* absproxy,
+												const b3Vector3& aabbMin,
+												const b3Vector3& aabbMax,
+												b3Dispatcher* /*dispatcher*/)
+{
+	b3DbvtProxy* dbvtProxy = (b3DbvtProxy*)absproxy;
+	B3_ATTRIBUTE_ALIGNED16(b3DbvtVolume)
+	newVolume = b3DbvtVolume::FromMM(aabbMin, aabbMax);
+
+	const bool parkedInFixedSet = (dbvtProxy->stage == STAGECOUNT);
+	if (parkedInFixedSet)
+	{
+		// Fixed set -> dynamic set.
+		m_sets[1].remove(dbvtProxy->leaf);
+		dbvtProxy->leaf = m_sets[0].insert(newVolume, dbvtProxy);
+	}
+	else
+	{
+		// Already dynamic: treat the change as a teleport.
+		++m_updates_call;
+		m_sets[0].update(dbvtProxy->leaf, newVolume);
+		++m_updates_done;
+	}
+
+	// Re-link the proxy into the list of the current stage.
+	b3ListRemove(dbvtProxy, m_stageRoots[dbvtProxy->stage]);
+	dbvtProxy->m_aabbMin = aabbMin;
+	dbvtProxy->m_aabbMax = aabbMax;
+	dbvtProxy->stage = m_stageCurrent;
+	b3ListAppend(dbvtProxy, m_stageRoots[m_stageCurrent]);
+
+	// Both branches above changed the tree, so cleanup is always needed.
+	m_needcleanup = true;
+	if (m_deferedcollide)
+	{
+		return;
+	}
+
+	b3DbvtTreeCollider pairCollector(this);
+	m_sets[1].collideTTpersistentStack(m_sets[1].m_root, dbvtProxy->leaf, pairCollector);
+	m_sets[0].collideTTpersistentStack(m_sets[0].m_root, dbvtProxy->leaf, pairCollector);
+}
+
+//
+/// Run one broadphase step: collide() refreshes the trees and the pair cache,
+/// then performDeferredRemoval() purges stale and duplicate pairs.
+/// When B3_DBVT_BP_PROFILE is non-zero, a timing report is printed every
+/// B3_DBVT_BP_PROFILING_RATE calls and the profiling counters are reset.
+void b3DynamicBvhBroadphase::calculateOverlappingPairs(b3Dispatcher* dispatcher)
+{
+	collide(dispatcher);
+#if B3_DBVT_BP_PROFILE
+	if (0 == (m_pid % B3_DBVT_BP_PROFILING_RATE))
+	{
+		printf("fixed(%u) dynamics(%u) pairs(%u)\r\n", m_sets[1].m_leaves, m_sets[0].m_leaves, m_paircache->getNumOverlappingPairs());
+		// The profiling counters are unsigned long. Keep the arithmetic in that
+		// type (no truncation to unsigned int) and print with %lu, since passing
+		// an unsigned long to %u is undefined where long is wider than int.
+		unsigned long total = m_profiling.m_total;
+		if (total == 0) total = 1;  // guard the percentage divisions below
+		printf("ddcollide: %lu%% (%luus)\r\n", (50 + m_profiling.m_ddcollide * 100) / total, m_profiling.m_ddcollide / B3_DBVT_BP_PROFILING_RATE);
+		printf("fdcollide: %lu%% (%luus)\r\n", (50 + m_profiling.m_fdcollide * 100) / total, m_profiling.m_fdcollide / B3_DBVT_BP_PROFILING_RATE);
+		printf("cleanup:   %lu%% (%luus)\r\n", (50 + m_profiling.m_cleanup * 100) / total, m_profiling.m_cleanup / B3_DBVT_BP_PROFILING_RATE);
+		printf("total:     %luus\r\n", total / B3_DBVT_BP_PROFILING_RATE);
+		const unsigned long sum = m_profiling.m_ddcollide +
+								  m_profiling.m_fdcollide +
+								  m_profiling.m_cleanup;
+		printf("leaked: %lu%% (%luus)\r\n", 100 - ((50 + sum * 100) / total), (total - sum) / B3_DBVT_BP_PROFILING_RATE);
+		// With both trees empty the original divisor was zero; clamp it to 1.
+		unsigned long jobDivisor = static_cast<unsigned long>(m_sets[0].m_leaves + m_sets[1].m_leaves) * B3_DBVT_BP_PROFILING_RATE;
+		if (jobDivisor == 0) jobDivisor = 1;
+		printf("job counts: %lu%%\r\n", (m_profiling.m_jobcount * 100) / jobDivisor);
+		b3Clear(m_profiling);
+		m_clock.reset();
+	}
+#endif
+
+	performDeferredRemoval(dispatcher);
+}
+
+/// If the pair cache uses deferred removal, drop every pair that is either a
+/// duplicate of its predecessor (after sorting) or whose two proxies no longer
+/// have intersecting tree volumes, then shrink the pair array accordingly.
+void b3DynamicBvhBroadphase::performDeferredRemoval(b3Dispatcher* dispatcher)
+{
+	if (!m_paircache->hasDeferredRemoval())
+	{
+		return;
+	}
+
+	b3BroadphasePairArray& pairArray = m_paircache->getOverlappingPairArray();
+
+	// First sort: puts equal pairs next to each other so duplicates can be
+	// detected by comparing with the previous entry.
+	pairArray.quickSort(b3BroadphasePairSortPredicate());
+
+	int removedCount = 0;
+	b3BroadphasePair lastSeen = b3MakeBroadphasePair(-1, -1);
+
+	for (int idx = 0; idx < pairArray.size(); ++idx)
+	{
+		b3BroadphasePair& candidate = pairArray[idx];
+
+		// Remember the pair before it is possibly invalidated below.
+		const bool repeatsPrevious = (candidate == lastSeen);
+		lastSeen = candidate;
+
+		bool stale = repeatsPrevious;
+		if (!stale)
+		{
+			// Use the same AABB test as the broadphase itself for consistency.
+			b3DbvtProxy* first = &m_proxies[candidate.x];
+			b3DbvtProxy* second = &m_proxies[candidate.y];
+			stale = !b3Intersect(first->leaf->volume, second->leaf->volume);
+		}
+
+		if (!stale)
+		{
+			continue;
+		}
+
+		m_paircache->cleanOverlappingPair(candidate, dispatcher);
+
+		// Mark the slot invalid; the second sort moves it to the tail.
+		candidate.x = -1;
+		candidate.y = -1;
+		++removedCount;
+	}
+
+	// Second sort: the predicate orders descending, so the (-1, -1) entries end
+	// up last and can be cut off with a resize.
+	pairArray.quickSort(b3BroadphasePairSortPredicate());
+	pairArray.resize(pairArray.size() - removedCount);
+}
+
+//
+/// Perform the per-step maintenance of the broadphase:
+///  1. incrementally optimize the dynamic tree (and the fixed tree while
+///     m_fixedleft is non-zero),
+///  2. advance the stage counter and move every proxy still linked to the new
+///     current stage from the dynamic set into the fixed set,
+///  3. when collision is deferred, collide dynamic-vs-fixed and
+///     dynamic-vs-dynamic,
+///  4. when cleanup was requested, check a slice of the pair cache and remove
+///     pairs whose volumes no longer intersect,
+///  5. refresh the update statistics.
+/// NOTE(review): b3SPC is defined outside this view; it looks like a profiling
+/// hook that is only active when B3_DBVT_BP_PROFILE is set - confirm.
+void b3DynamicBvhBroadphase::collide(b3Dispatcher* dispatcher)
+{
+	/*printf("---------------------------------------------------------\n");
+	printf("m_sets[0].m_leaves=%d\n",m_sets[0].m_leaves);
+	printf("m_sets[1].m_leaves=%d\n",m_sets[1].m_leaves);
+	printf("numPairs = %d\n",getOverlappingPairCache()->getNumOverlappingPairs());
+	{
+		int i;
+		for (i=0;i<getOverlappingPairCache()->getNumOverlappingPairs();i++)
+		{
+			printf("pair[%d]=(%d,%d),",i,getOverlappingPairCache()->getOverlappingPairArray()[i].m_pProxy0->getUid(),
+				getOverlappingPairCache()->getOverlappingPairArray()[i].m_pProxy1->getUid());
+		}
+		printf("\n");
+	}
+*/
+
+	b3SPC(m_profiling.m_total);
+	/* optimize				*/
+	// The argument is 1 plus m_dupdates percent of the dynamic leaf count.
+	m_sets[0].optimizeIncremental(1 + (m_sets[0].m_leaves * m_dupdates) / 100);
+	if (m_fixedleft)
+	{
+		// Same formula for the fixed tree with m_fupdates; m_fixedleft is
+		// reduced by that amount and clamped at zero.
+		const int count = 1 + (m_sets[1].m_leaves * m_fupdates) / 100;
+		m_sets[1].optimizeIncremental(1 + (m_sets[1].m_leaves * m_fupdates) / 100);
+		m_fixedleft = b3Max<int>(0, m_fixedleft - count);
+	}
+	/* dynamic -> fixed set	*/
+	// Proxies still linked to the list of the stage we are about to reuse were
+	// not re-linked by setAabb for a full cycle of STAGECOUNT steps.
+	m_stageCurrent = (m_stageCurrent + 1) % STAGECOUNT;
+	b3DbvtProxy* current = m_stageRoots[m_stageCurrent];
+	if (current)
+	{
+		b3DbvtTreeCollider collider(this);
+		do
+		{
+			// Fetch the successor first: the list links change below.
+			b3DbvtProxy* next = current->links[1];
+			b3ListRemove(current, m_stageRoots[current->stage]);
+			b3ListAppend(current, m_stageRoots[STAGECOUNT]);
+			// NOTE(review): this optional branch reads current->aabb, a field
+			// that is commented out in b3DbvtProxy, and passes a proxy pointer
+			// where the pair cache method takes an int id; it probably does not
+			// compile if B3_DBVT_BP_ACCURATESLEEPING is enabled - confirm.
+#if B3_DBVT_BP_ACCURATESLEEPING
+			m_paircache->removeOverlappingPairsContainingProxy(current, dispatcher);
+			collider.proxy = current;
+			b3DynamicBvh::collideTV(m_sets[0].m_root, current->aabb, collider);
+			b3DynamicBvh::collideTV(m_sets[1].m_root, current->aabb, collider);
+#endif
+			// Re-home the leaf: out of the dynamic tree, into the fixed tree,
+			// using the bounds stored on the proxy.
+			m_sets[0].remove(current->leaf);
+			B3_ATTRIBUTE_ALIGNED16(b3DbvtVolume)
+			curAabb = b3DbvtVolume::FromMM(current->m_aabbMin, current->m_aabbMax);
+			current->leaf = m_sets[1].insert(curAabb, current);
+			current->stage = STAGECOUNT;
+			current = next;
+		} while (current);
+		// The fixed tree changed, so schedule it for incremental optimization.
+		m_fixedleft = m_sets[1].m_leaves;
+		m_needcleanup = true;
+	}
+	/* collide dynamics		*/
+	{
+		b3DbvtTreeCollider collider(this);
+		// Two separate blocks so that each b3SPC invocation sits in its own scope.
+		if (m_deferedcollide)
+		{
+			// dynamic tree vs fixed tree
+			b3SPC(m_profiling.m_fdcollide);
+			m_sets[0].collideTTpersistentStack(m_sets[0].m_root, m_sets[1].m_root, collider);
+		}
+		if (m_deferedcollide)
+		{
+			// dynamic tree vs itself
+			b3SPC(m_profiling.m_ddcollide);
+			m_sets[0].collideTTpersistentStack(m_sets[0].m_root, m_sets[0].m_root, collider);
+		}
+	}
+	/* clean up				*/
+	if (m_needcleanup)
+	{
+		b3SPC(m_profiling.m_cleanup);
+		b3BroadphasePairArray& pairs = m_paircache->getOverlappingPairArray();
+		if (pairs.size() > 0)
+		{
+			// Inspect at most ni pairs per step, starting at the rotating cursor
+			// m_cid: the larger of m_newpairs and m_cupdates percent of the
+			// cache, capped at the cache size.
+			int ni = b3Min(pairs.size(), b3Max<int>(m_newpairs, (pairs.size() * m_cupdates) / 100));
+			for (int i = 0; i < ni; ++i)
+			{
+				b3BroadphasePair& p = pairs[(m_cid + i) % pairs.size()];
+				b3DbvtProxy* pa = &m_proxies[p.x];
+				b3DbvtProxy* pb = &m_proxies[p.y];
+				if (!b3Intersect(pa->leaf->volume, pb->leaf->volume))
+				{
+#if B3_DBVT_BP_SORTPAIRS
+					if (pa->m_uniqueId > pb->m_uniqueId)
+						b3Swap(pa, pb);
+#endif
+					m_paircache->removeOverlappingPair(pa->getUid(), pb->getUid(), dispatcher);
+					// The array is one shorter now: shrink the budget and
+					// re-visit the same index on the next iteration.
+					--ni;
+					--i;
+				}
+			}
+			if (pairs.size() > 0)
+				m_cid = (m_cid + ni) % pairs.size();
+			else
+				m_cid = 0;
+		}
+	}
+	++m_pid;
+	m_newpairs = 1;
+	m_needcleanup = false;
+	// Publish the done/call ratio, then halve both counters.
+	if (m_updates_call > 0)
+	{
+		m_updates_ratio = m_updates_done / (b3Scalar)m_updates_call;
+	}
+	else
+	{
+		m_updates_ratio = 0;
+	}
+	m_updates_done /= 2;
+	m_updates_call /= 2;
+}
+
+//
+/// Run optimizeTopDown() on both trees, set 0 first and set 1 second.
+void b3DynamicBvhBroadphase::optimize()
+{
+	for (int setIndex = 0; setIndex < 2; ++setIndex)
+	{
+		m_sets[setIndex].optimizeTopDown();
+	}
+}
+
+//
+/// Mutable access to the pair cache used by this broadphase.
+b3OverlappingPairCache* b3DynamicBvhBroadphase::getOverlappingPairCache()
+{
+	return m_paircache;
+}
+
+//
+/// Read-only access to the pair cache used by this broadphase.
+const b3OverlappingPairCache* b3DynamicBvhBroadphase::getOverlappingPairCache() const
+{
+	return m_paircache;
+}
+
+//
+/// Write the bounds of the whole broadphase to aabbMin / aabbMax: the merged
+/// root volumes of both trees, the root volume of the only non-empty tree, or
+/// a zero-radius volume at the origin when both trees are empty.
+void b3DynamicBvhBroadphase::getBroadphaseAabb(b3Vector3& aabbMin, b3Vector3& aabbMax) const
+{
+	B3_ATTRIBUTE_ALIGNED16(b3DbvtVolume)
+	enclosing;
+
+	const bool haveDynamic = !m_sets[0].empty();
+	const bool haveFixed = !m_sets[1].empty();
+
+	if (haveDynamic && haveFixed)
+	{
+		b3Merge(m_sets[0].m_root->volume,
+				m_sets[1].m_root->volume, enclosing);
+	}
+	else if (haveDynamic)
+	{
+		enclosing = m_sets[0].m_root->volume;
+	}
+	else if (haveFixed)
+	{
+		enclosing = m_sets[1].m_root->volume;
+	}
+	else
+	{
+		enclosing = b3DbvtVolume::FromCR(b3MakeVector3(0, 0, 0), 0);
+	}
+
+	aabbMin = enclosing.Mins();
+	aabbMax = enclosing.Maxs();
+}
+
+/// Restore the broadphase bookkeeping to the values set below, but only when
+/// neither tree holds any leaf; otherwise this is a no-op.
+/// The dispatcher argument is not used.
+void b3DynamicBvhBroadphase::resetPool(b3Dispatcher* dispatcher)
+{
+	const int leafTotal = m_sets[0].m_leaves + m_sets[1].m_leaves;
+	if (leafTotal != 0)
+	{
+		return;
+	}
+
+	// Reset the internal dynamic tree data structures.
+	m_sets[0].clear();
+	m_sets[1].clear();
+
+	// Flags.
+	m_deferedcollide = false;
+	m_needcleanup = true;
+
+	// Stage and optimization state.
+	m_stageCurrent = 0;
+	m_fixedleft = 0;
+
+	// Per-frame budgets (percentages) and pair counter.
+	m_fupdates = 1;
+	m_dupdates = 0;
+	m_cupdates = 10;
+	m_newpairs = 1;
+
+	// Update statistics.
+	m_updates_call = 0;
+	m_updates_done = 0;
+	m_updates_ratio = 0;
+
+	// Parse id and cleanup cursor.
+	m_pid = 0;
+	m_cid = 0;
+
+	// Empty every stage list, including the extra STAGECOUNT slot.
+	for (int stage = 0; stage <= STAGECOUNT; ++stage)
+	{
+		m_stageRoots[stage] = 0;
+	}
+}
+
+//
+/// Statistics hook; this implementation prints nothing.
+void b3DynamicBvhBroadphase::printStats()
+{
+	// Intentionally empty.
+}
+
+//
+#if B3_DBVT_BP_ENABLE_BENCHMARK
+
+/// Helper types and utilities for the built-in broadphase benchmark
+/// (compiled only when B3_DBVT_BP_ENABLE_BENCHMARK is non-zero).
+struct b3BroadphaseBenchmark
+{
+	/// Parameters of a single benchmark run.
+	struct Experiment
+	{
+		const char* name;    ///< label printed in the report
+		int object_count;    ///< number of objects created
+		int update_count;    ///< percentage of objects updated per iteration
+		int spawn_count;     ///< percentage of object_count reported as "Spawn"
+		int iterations;      ///< number of update rounds
+		b3Scalar speed;      ///< time increment applied per object update
+		b3Scalar amplitude;  ///< scale of the generated motion
+	};
+	/// One benchmark body: an axis-aligned box moving along a closed curve.
+	struct Object
+	{
+		b3Vector3 center;
+		b3Vector3 extents;
+		b3BroadphaseProxy* proxy;
+		b3Scalar time;
+		/// Advance time by speed, recompute the center from trigonometric
+		/// functions of time and push the resulting AABB to the broadphase.
+		void update(b3Scalar speed, b3Scalar amplitude, b3BroadphaseInterface* pbi)
+		{
+			time += speed;
+			center[0] = b3Cos(time * (b3Scalar)2.17) * amplitude +
+						b3Sin(time) * amplitude / 2;
+			center[1] = b3Cos(time * (b3Scalar)1.38) * amplitude +
+						b3Sin(time) * amplitude;
+			center[2] = b3Sin(time * (b3Scalar)0.777) * amplitude;
+			pbi->setAabb(proxy, center - extents, center + extents, 0);
+		}
+	};
+	/// rand() reduced to the inclusive range [0, range].
+	static int UnsignedRand(int range = RAND_MAX - 1) { return (rand() % (range + 1)); }
+	/// Pseudo-random scalar in [0, 1] with a resolution of 1/16384.
+	static b3Scalar UnitRand() { return (UnsignedRand(16384) / (b3Scalar)16384); }
+	/// Print the elapsed time of clock c and, if count is non-zero, the
+	/// resulting rate in operations per second.
+	static void OutputTime(const char* name, b3Clock& c, unsigned count = 0)
+	{
+		const unsigned long us = c.getTimeMicroseconds();
+		const unsigned long ms = (us + 500) / 1000;  // rounded to nearest ms
+		const b3Scalar sec = us / (b3Scalar)(1000 * 1000);
+		// us and ms are unsigned long, so they are printed with %lu; passing
+		// them to %u is undefined where long is wider than int.
+		if (count > 0)
+			printf("%s : %lu us (%lu ms), %.2f/s\r\n", name, us, ms, count / sec);
+		else
+			printf("%s : %lu us (%lu ms)\r\n", name, us, ms);
+	}
+};
+
+/// Run the built-in benchmark against the broadphase pbi.
+/// For every entry of the experiments table it creates the objects, performs
+/// one update of all of them, then runs `iterations` rounds in which the first
+/// update_count objects are moved and calculateOverlappingPairs is called,
+/// and finally destroys everything. The wall-clock time of each phase is
+/// printed through b3BroadphaseBenchmark::OutputTime.
+/// NOTE(review): the matching declaration in the header is commented out, so
+/// this definition presumably does not build as-is when
+/// B3_DBVT_BP_ENABLE_BENCHMARK is enabled - confirm.
+void b3DynamicBvhBroadphase::benchmark(b3BroadphaseInterface* pbi)
+{
+	// Fields: name, objects, update %, spawn %, iterations, speed, amplitude.
+	static const b3BroadphaseBenchmark::Experiment experiments[] =
+		{
+			{"1024o.10%", 1024, 10, 0, 8192, (b3Scalar)0.005, (b3Scalar)100},
+			/*{"4096o.10%",4096,10,0,8192,(b3Scalar)0.005,(b3Scalar)100},
+		{"8192o.10%",8192,10,0,8192,(b3Scalar)0.005,(b3Scalar)100},*/
+		};
+	static const int nexperiments = sizeof(experiments) / sizeof(experiments[0]);
+	b3AlignedObjectArray<b3BroadphaseBenchmark::Object*> objects;
+	b3Clock wallclock;
+	/* Begin			*/
+	for (int iexp = 0; iexp < nexperiments; ++iexp)
+	{
+		const b3BroadphaseBenchmark::Experiment& experiment = experiments[iexp];
+		const int object_count = experiment.object_count;
+		// update_count / spawn_count in the table are percentages of object_count.
+		const int update_count = (object_count * experiment.update_count) / 100;
+		const int spawn_count = (object_count * experiment.spawn_count) / 100;
+		const b3Scalar speed = experiment.speed;
+		const b3Scalar amplitude = experiment.amplitude;
+		printf("Experiment #%u '%s':\r\n", iexp, experiment.name);
+		printf("\tObjects: %u\r\n", object_count);
+		printf("\tUpdate: %u\r\n", update_count);
+		printf("\tSpawn: %u\r\n", spawn_count);
+		printf("\tSpeed: %f\r\n", speed);
+		printf("\tAmplitude: %f\r\n", amplitude);
+		// Fixed seed so that every run generates the same objects.
+		srand(180673);
+		/* Create objects	*/
+		wallclock.reset();
+		objects.reserve(object_count);
+		for (int i = 0; i < object_count; ++i)
+		{
+			// Centers in [0, 50] per axis, half extents in [2, 4], random phase.
+			b3BroadphaseBenchmark::Object* po = new b3BroadphaseBenchmark::Object();
+			po->center[0] = b3BroadphaseBenchmark::UnitRand() * 50;
+			po->center[1] = b3BroadphaseBenchmark::UnitRand() * 50;
+			po->center[2] = b3BroadphaseBenchmark::UnitRand() * 50;
+			po->extents[0] = b3BroadphaseBenchmark::UnitRand() * 2 + 2;
+			po->extents[1] = b3BroadphaseBenchmark::UnitRand() * 2 + 2;
+			po->extents[2] = b3BroadphaseBenchmark::UnitRand() * 2 + 2;
+			po->time = b3BroadphaseBenchmark::UnitRand() * 2000;
+			po->proxy = pbi->createProxy(po->center - po->extents, po->center + po->extents, 0, po, 1, 1, 0, 0);
+			objects.push_back(po);
+		}
+		b3BroadphaseBenchmark::OutputTime("\tInitialization", wallclock);
+		/* First update		*/
+		wallclock.reset();
+		for (int i = 0; i < objects.size(); ++i)
+		{
+			objects[i]->update(speed, amplitude, pbi);
+		}
+		b3BroadphaseBenchmark::OutputTime("\tFirst update", wallclock);
+		/* Updates			*/
+		wallclock.reset();
+		for (int i = 0; i < experiment.iterations; ++i)
+		{
+			// Only the first update_count objects move in each round.
+			for (int j = 0; j < update_count; ++j)
+			{
+				objects[j]->update(speed, amplitude, pbi);
+			}
+			pbi->calculateOverlappingPairs(0);
+		}
+		b3BroadphaseBenchmark::OutputTime("\tUpdate", wallclock, experiment.iterations);
+		/* Clean up			*/
+		wallclock.reset();
+		for (int i = 0; i < objects.size(); ++i)
+		{
+			// Destroy the proxy before freeing the object that owns the pointer.
+			pbi->destroyProxy(objects[i]->proxy, 0);
+			delete objects[i];
+		}
+		objects.resize(0);
+		b3BroadphaseBenchmark::OutputTime("\tRelease", wallclock);
+	}
+}
+#else
+/*void							b3DynamicBvhBroadphase::benchmark(b3BroadphaseInterface*)
+{}
+*/
+#endif
+
+#if B3_DBVT_BP_PROFILE
+#undef b3SPC
+#endif

+ 197 - 0
Dependencies/include/bullet3/Bullet3Collision/BroadPhaseCollision/b3DynamicBvhBroadphase.h

@@ -0,0 +1,197 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2013 Erwin Coumans  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+///b3DynamicBvhBroadphase implementation by Nathanael Presson
+#ifndef B3_DBVT_BROADPHASE_H
+#define B3_DBVT_BROADPHASE_H
+
+#include "Bullet3Collision/BroadPhaseCollision/b3DynamicBvh.h"
+#include "Bullet3Collision/BroadPhaseCollision/b3OverlappingPairCache.h"
+#include "Bullet3Common/b3AlignedObjectArray.h"
+
+#include "b3BroadphaseCallback.h"
+
+//
+// Compile time config
+//
+
+// Non-zero adds the m_clock / m_profiling members and the periodic printf
+// report in calculateOverlappingPairs.
+#define B3_DBVT_BP_PROFILE 0
+// Left undefined. When non-zero, collide() orders a pair's two proxies by
+// m_uniqueId before removing the pair from the cache.
+//#define B3_DBVT_BP_SORTPAIRS				1
+// Non-zero makes setAabb skip the update when the new volume equals the one
+// already stored in the leaf.
+#define B3_DBVT_BP_PREVENTFALSEUPDATE 0
+// Non-zero enables extra pair handling in collide() when a proxy is moved to
+// the fixed set.
+#define B3_DBVT_BP_ACCURATESLEEPING 0
+// Non-zero compiles the b3BroadphaseBenchmark helpers and benchmark().
+#define B3_DBVT_BP_ENABLE_BENCHMARK 0
+// Margin argument handed to b3DynamicBvh::update by setAabb for moving proxies.
+#define B3_DBVT_BP_MARGIN (b3Scalar)0.05
+
+#if B3_DBVT_BP_PROFILE
+// The profiling report is printed once every this many collide() calls.
+#define B3_DBVT_BP_PROFILING_RATE 256
+
+#endif
+
+/// Base record kept by a broadphase for each object: client pointer,
+/// collision filter group/mask, a unique id and the object's AABB.
+B3_ATTRIBUTE_ALIGNED16(struct)
+b3BroadphaseProxy
+{
+	B3_DECLARE_ALIGNED_ALLOCATOR();
+
+	///optional filtering to cull potential collisions
+	enum CollisionFilterGroups
+	{
+		DefaultFilter = 1,
+		StaticFilter = 2,
+		KinematicFilter = 4,
+		DebrisFilter = 8,
+		SensorTrigger = 16,
+		CharacterFilter = 32,
+		AllFilter = -1  //all bits set, i.e. every filter group above (and any other bit)
+	};
+
+	//Usually the client b3CollisionObject or Rigidbody class
+	void* m_clientObject;
+	int m_collisionFilterGroup;
+	int m_collisionFilterMask;
+	int m_uniqueId;  //m_uniqueId is introduced for paircache. could get rid of this, by calculating the address offset etc.
+
+	b3Vector3 m_aabbMin;
+	b3Vector3 m_aabbMax;
+
+	/// Returns m_uniqueId.
+	B3_FORCE_INLINE int getUid() const
+	{
+		return m_uniqueId;
+	}
+
+	//used for memory pools
+	// Only m_clientObject is initialized here; every other member is left unset.
+	b3BroadphaseProxy() : m_clientObject(0)
+	{
+	}
+
+	/// Initializes the client pointer, filter group/mask and AABB.
+	/// m_uniqueId is NOT set by this constructor.
+	b3BroadphaseProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, void* userPtr, int collisionFilterGroup, int collisionFilterMask)
+		: m_clientObject(userPtr),
+		  m_collisionFilterGroup(collisionFilterGroup),
+		  m_collisionFilterMask(collisionFilterMask),
+		  m_aabbMin(aabbMin),
+		  m_aabbMax(aabbMax)
+	{
+	}
+};
+
+//
+// b3DbvtProxy
+//
+/// Proxy type used by b3DynamicBvhBroadphase: a b3BroadphaseProxy extended
+/// with the tree leaf holding it, the links of its stage list and its stage.
+struct b3DbvtProxy : b3BroadphaseProxy
+{
+	/* Fields		*/
+	//b3DbvtAabbMm	aabb;
+	b3DbvtNode* leaf;       // leaf of the tree (m_sets[0] or m_sets[1]) that stores this proxy
+	b3DbvtProxy* links[2];  // stage-list links; collide() follows links[1] to reach the next proxy
+	int stage;              // index of the stage list this proxy is in; STAGECOUNT means the fixed set
+	/* ctor			*/
+
+	// Leaves leaf, links and stage uninitialized.
+	explicit b3DbvtProxy() {}
+	// Forwards to the base constructor and clears both links; leaf and stage
+	// are left unset.
+	// NOTE(review): presumably the creator (createProxy) fills them in - confirm.
+	b3DbvtProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, void* userPtr, int collisionFilterGroup, int collisionFilterMask) : b3BroadphaseProxy(aabbMin, aabbMax, userPtr, collisionFilterGroup, collisionFilterMask)
+	{
+		links[0] = links[1] = 0;
+	}
+};
+
+typedef b3AlignedObjectArray<b3DbvtProxy*> b3DbvtProxyArray;
+
+///The b3DynamicBvhBroadphase implements a broadphase using two dynamic AABB bounding volume hierarchies/trees (see b3DynamicBvh).
+///One tree is used for static/non-moving objects, and another tree is used for dynamic objects. Objects can move from one tree to the other.
+///This is a very fast broadphase, especially for very dynamic worlds where many objects are moving. Its insert/add and remove of objects is generally faster than the sweep and prune broadphases b3AxisSweep3 and b332BitAxisSweep3.
+/// Broadphase built on two b3DynamicBvh trees: m_sets[DYNAMIC_SET] for proxies
+/// that were updated recently and m_sets[FIXED_SET] for proxies that were not.
+/// Proxies are addressed by their index in m_proxies.
+struct b3DynamicBvhBroadphase
+{
+	/* Config		*/
+	enum
+	{
+		DYNAMIC_SET = 0, /* Dynamic set index	*/
+		FIXED_SET = 1,   /* Fixed set index		*/
+		STAGECOUNT = 2   /* Number of stages		*/
+	};
+	/* Fields		*/
+	b3DynamicBvh m_sets[2];                     // Dbvt sets
+	b3DbvtProxy* m_stageRoots[STAGECOUNT + 1];  // Stages list; the extra slot [STAGECOUNT] holds proxies in the fixed set
+
+	b3AlignedObjectArray<b3DbvtProxy> m_proxies;  // Proxy storage, indexed by object id
+	b3OverlappingPairCache* m_paircache;  // Pair cache
+	b3Scalar m_prediction;                // Velocity prediction
+	int m_stageCurrent;                   // Current stage
+	int m_fupdates;                       // % of fixed updates per frame
+	int m_dupdates;                       // % of dynamic updates per frame
+	int m_cupdates;                       // % of cleanup updates per frame
+	int m_newpairs;                       // Number of pairs created
+	int m_fixedleft;                      // Fixed optimization left
+	unsigned m_updates_call;              // Number of update calls
+	unsigned m_updates_done;              // Number of updates done
+	b3Scalar m_updates_ratio;             // m_updates_done/m_updates_call
+	int m_pid;                            // Parse id
+	int m_cid;                            // Cleanup index
+	bool m_releasepaircache;              // Release pair cache on delete
+	bool m_deferedcollide;                // Defer dynamic/static collision to the collide() call
+	bool m_needcleanup;                   // Need to run cleanup?
+#if B3_DBVT_BP_PROFILE
+	b3Clock m_clock;
+	// Accumulated profiling counters, printed and cleared by calculateOverlappingPairs.
+	struct
+	{
+		unsigned long m_total;
+		unsigned long m_ddcollide;
+		unsigned long m_fdcollide;
+		unsigned long m_cleanup;
+		unsigned long m_jobcount;
+	} m_profiling;
+#endif
+	/* Methods		*/
+	b3DynamicBvhBroadphase(int proxyCapacity, b3OverlappingPairCache* paircache = 0);
+	virtual ~b3DynamicBvhBroadphase();
+	/// Per-step maintenance: optimize trees, migrate stale proxies to the fixed set, collide, clean up pairs.
+	void collide(b3Dispatcher* dispatcher);
+	/// Runs optimizeTopDown() on both trees.
+	void optimize();
+
+	/* b3BroadphaseInterface Implementation	*/
+	// (this struct declares no base class; the functions below are its own API)
+	b3BroadphaseProxy* createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int objectIndex, void* userPtr, int collisionFilterGroup, int collisionFilterMask);
+	virtual void destroyProxy(b3BroadphaseProxy* proxy, b3Dispatcher* dispatcher);
+	/// Updates the AABB of the proxy at index objectId.
+	virtual void setAabb(int objectId, const b3Vector3& aabbMin, const b3Vector3& aabbMax, b3Dispatcher* dispatcher);
+	virtual void rayTest(const b3Vector3& rayFrom, const b3Vector3& rayTo, b3BroadphaseRayCallback& rayCallback, const b3Vector3& aabbMin = b3MakeVector3(0, 0, 0), const b3Vector3& aabbMax = b3MakeVector3(0, 0, 0));
+	/// Reports every proxy overlapping the given box to callback.
+	virtual void aabbTest(const b3Vector3& aabbMin, const b3Vector3& aabbMax, b3BroadphaseAabbCallback& callback);
+
+	//virtual void					getAabb(b3BroadphaseProxy* proxy,b3Vector3& aabbMin, b3Vector3& aabbMax ) const;
+	virtual void getAabb(int objectId, b3Vector3& aabbMin, b3Vector3& aabbMax) const;
+	/// Calls collide() and then performDeferredRemoval().
+	virtual void calculateOverlappingPairs(b3Dispatcher* dispatcher = 0);
+	virtual b3OverlappingPairCache* getOverlappingPairCache();
+	virtual const b3OverlappingPairCache* getOverlappingPairCache() const;
+	/// Bounds enclosing the root volumes of both trees.
+	virtual void getBroadphaseAabb(b3Vector3& aabbMin, b3Vector3& aabbMax) const;
+	virtual void printStats();
+
+	///reset broadphase internal structures, to ensure determinism/reproducibility
+	///(only takes effect while both trees are empty)
+	virtual void resetPool(b3Dispatcher* dispatcher);
+
+	/// Removes duplicate and no-longer-overlapping pairs when the pair cache uses deferred removal.
+	void performDeferredRemoval(b3Dispatcher* dispatcher);
+
+	void setVelocityPrediction(b3Scalar prediction)
+	{
+		m_prediction = prediction;
+	}
+	b3Scalar getVelocityPrediction() const
+	{
+		return m_prediction;
+	}
+
+	///this setAabbForceUpdate is similar to setAabb but always forces the aabb update.
+	///it is not part of the b3BroadphaseInterface but specific to b3DynamicBvhBroadphase.
+	///it bypasses certain optimizations that prevent aabb updates (when the aabb shrinks), see
+	///http://code.google.com/p/bullet/issues/detail?id=223
+	void setAabbForceUpdate(b3BroadphaseProxy* absproxy, const b3Vector3& aabbMin, const b3Vector3& aabbMax, b3Dispatcher* /*dispatcher*/);
+
+	//static void						benchmark(b3BroadphaseInterface*);
+};
+
+#endif

+ 70 - 0
Dependencies/include/bullet3/Bullet3Collision/BroadPhaseCollision/b3OverlappingPair.h

@@ -0,0 +1,70 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2013 Erwin Coumans  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef B3_OVERLAPPING_PAIR_H
+#define B3_OVERLAPPING_PAIR_H
+
+#include "Bullet3Common/shared/b3Int4.h"
+
+#define B3_NEW_PAIR_MARKER -1
+#define B3_REMOVED_PAIR_MARKER -2
+
+typedef b3Int4 b3BroadphasePair;
+
+/// Build a pair whose x holds the smaller and y the larger of the two ids;
+/// z and w are both set to B3_NEW_PAIR_MARKER.
+inline b3Int4 b3MakeBroadphasePair(int xx, int yy)
+{
+	const bool alreadyOrdered = xx < yy;
+
+	b3Int4 result;
+	result.x = alreadyOrdered ? xx : yy;
+	result.y = alreadyOrdered ? yy : xx;
+	result.z = B3_NEW_PAIR_MARKER;
+	result.w = B3_NEW_PAIR_MARKER;
+	return result;
+}
+
+/*struct b3BroadphasePair : public b3Int4
+{
+	explicit b3BroadphasePair(){}
+	
+};
+*/
+
+/// Ordering used when sorting the pair array: descending by x and, for equal
+/// x, descending by y.
+class b3BroadphasePairSortPredicate
+{
+public:
+	bool operator()(const b3BroadphasePair& a, const b3BroadphasePair& b) const
+	{
+		// The first id decides unless both pairs share it.
+		if (a.x != b.x)
+		{
+			return a.x > b.x;
+		}
+		return a.y > b.y;
+	}
+};
+
+/// Two pairs are equal when both ids match; z and w are ignored.
+B3_FORCE_INLINE bool operator==(const b3BroadphasePair& a, const b3BroadphasePair& b)
+{
+	if (a.x != b.x)
+	{
+		return false;
+	}
+	return a.y == b.y;
+}
+
+#endif  //B3_OVERLAPPING_PAIR_H

+ 559 - 0
Dependencies/include/bullet3/Bullet3Collision/BroadPhaseCollision/b3OverlappingPairCache.cpp

@@ -0,0 +1,559 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2013 Erwin Coumans  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include "b3OverlappingPairCache.h"
+
+//#include "b3Dispatcher.h"
+//#include "b3CollisionAlgorithm.h"
+#include "Bullet3Geometry/b3AabbUtil.h"
+
+#include <stdio.h>
+
+// Global debug counters. Within the code visible here only b3g_findPairs
+// (incremented on every findPair call) and b3g_removePairs (incremented on
+// every removeOverlappingPair call) are modified.
+int b3g_overlappingPairs = 0;
+int b3g_removePairs = 0;
+int b3g_addedPairs = 0;
+int b3g_findPairs = 0;
+
+/// Start with no filter callback, reserve a small pair array and size the
+/// hash tables to match its capacity.
+b3HashedOverlappingPairCache::b3HashedOverlappingPairCache() : m_overlapFilterCallback(0)
+{
+	const int startingCapacity = 2;
+	m_overlappingPairArray.reserve(startingCapacity);
+
+	// Bring m_hashTable / m_next in line with the capacity reserved above.
+	growTables();
+}
+
+/// The destructor body is empty.
+b3HashedOverlappingPairCache::~b3HashedOverlappingPairCache()
+{
+	// Intentionally empty.
+}
+
+/// Hook invoked before a pair is discarded. It currently does nothing: the
+/// former body, which released a per-pair collision algorithm through the
+/// dispatcher, is kept below for reference only.
+void b3HashedOverlappingPairCache::cleanOverlappingPair(b3BroadphasePair& pair, b3Dispatcher* dispatcher)
+{
+	// Disabled legacy implementation:
+	//
+	//	if (pair.m_algorithm)
+	//	{
+	//		pair.m_algorithm->~b3CollisionAlgorithm();
+	//		dispatcher->freeCollisionAlgorithm(pair.m_algorithm);
+	//		pair.m_algorithm=0;
+	//	}
+}
+
+/// Call cleanOverlappingPair on every cached pair that references `proxy`.
+/// No pair is removed: the visitor always answers false.
+void b3HashedOverlappingPairCache::cleanProxyFromPairs(int proxy, b3Dispatcher* dispatcher)
+{
+	// Visitor applied to each pair by processAllOverlappingPairs.
+	struct PairCleaner : public b3OverlapCallback
+	{
+		int m_targetId;
+		b3OverlappingPairCache* m_cache;
+		b3Dispatcher* m_dispatcher;
+
+		PairCleaner(int targetId, b3OverlappingPairCache* cache, b3Dispatcher* disp)
+			: m_targetId(targetId),
+			  m_cache(cache),
+			  m_dispatcher(disp)
+		{
+		}
+
+		virtual bool processOverlap(b3BroadphasePair& pair)
+		{
+			const bool touchesTarget = (pair.x == m_targetId) || (pair.y == m_targetId);
+			if (touchesTarget)
+			{
+				m_cache->cleanOverlappingPair(pair, m_dispatcher);
+			}
+			return false;
+		}
+	};
+
+	PairCleaner cleaner(proxy, this, dispatcher);
+	processAllOverlappingPairs(&cleaner, dispatcher);
+}
+
+/// Walk all cached pairs with a visitor that answers true for every pair
+/// referencing `proxy` and false for all others.
+void b3HashedOverlappingPairCache::removeOverlappingPairsContainingProxy(int proxy, b3Dispatcher* dispatcher)
+{
+	struct ObsoletePairMatcher : public b3OverlapCallback
+	{
+		int m_obsoleteId;
+
+		ObsoletePairMatcher(int obsoleteId) : m_obsoleteId(obsoleteId) {}
+
+		virtual bool processOverlap(b3BroadphasePair& pair)
+		{
+			if (pair.x == m_obsoleteId)
+			{
+				return true;
+			}
+			return pair.y == m_obsoleteId;
+		}
+	};
+
+	ObsoletePairMatcher matcher(proxy);
+	processAllOverlappingPairs(&matcher, dispatcher);
+}
+
+/// Look up the cached pair for the two proxy ids (in either order).
+/// @return pointer into the pair array, or NULL when the pair is not cached
+b3BroadphasePair* b3HashedOverlappingPairCache::findPair(int proxy0, int proxy1)
+{
+	b3g_findPairs++;
+
+	// Pairs are stored with the smaller id first.
+	if (proxy0 > proxy1)
+		b3Swap(proxy0, proxy1);
+	const int lowId = proxy0;
+	const int highId = proxy1;
+
+	const int bucket = static_cast<int>(getHash(static_cast<unsigned int>(lowId), static_cast<unsigned int>(highId)) & (m_overlappingPairArray.capacity() - 1));
+	if (bucket >= m_hashTable.size())
+	{
+		return NULL;
+	}
+
+	// Follow the bucket's chain until the pair is found or the chain ends.
+	for (int cursor = m_hashTable[bucket]; cursor != B3_NULL_PAIR; cursor = m_next[cursor])
+	{
+		if (equalsPair(m_overlappingPairArray[cursor], lowId, highId))
+		{
+			b3Assert(cursor < m_overlappingPairArray.size());
+			return &m_overlappingPairArray[cursor];
+		}
+	}
+
+	return NULL;
+}
+
+//#include <stdio.h>
+
+/// Grow m_hashTable and m_next to the pair array's capacity (if they are
+/// smaller) and re-insert the existing entries under the new hash mask.
+void b3HashedOverlappingPairCache::growTables()
+{
+	const int targetCapacity = m_overlappingPairArray.capacity();
+	if (m_hashTable.size() >= targetCapacity)
+	{
+		return;
+	}
+
+	// Number of entries to re-hash: the table size before growing.
+	const int previousTableSize = m_hashTable.size();
+
+	m_hashTable.resize(targetCapacity);
+	m_next.resize(targetCapacity);
+
+	// Wipe both tables completely before rebuilding the chains.
+	for (int slot = 0; slot < targetCapacity; ++slot)
+	{
+		m_hashTable[slot] = B3_NULL_PAIR;
+		m_next[slot] = B3_NULL_PAIR;
+	}
+
+	for (int pairIndex = 0; pairIndex < previousTableSize; ++pairIndex)
+	{
+		const b3BroadphasePair& stored = m_overlappingPairArray[pairIndex];
+		// Hash with the mask derived from the new capacity.
+		const int bucket = static_cast<int>(getHash(static_cast<unsigned int>(stored.x), static_cast<unsigned int>(stored.y)) & (m_overlappingPairArray.capacity() - 1));
+		// Push the entry at the head of its bucket's chain.
+		m_next[pairIndex] = m_hashTable[bucket];
+		m_hashTable[bucket] = pairIndex;
+	}
+}
+
+/// Return the cached pair for (proxy0, proxy1), creating it if necessary.
+/// A new pair is appended to the pair array and linked at the head of its
+/// hash bucket; when appending grows the array, the hash tables are grown and
+/// the bucket is recomputed with the new mask.
+/// @return pointer to the existing or newly created pair
+b3BroadphasePair* b3HashedOverlappingPairCache::internalAddPair(int proxy0, int proxy1)
+{
+	// Pairs are stored with the smaller id first.
+	if (proxy0 > proxy1)
+		b3Swap(proxy0, proxy1);
+	int proxyId1 = proxy0;
+	int proxyId2 = proxy1;
+
+	/*if (proxyId1 > proxyId2) 
+		b3Swap(proxyId1, proxyId2);*/
+
+	int hash = static_cast<int>(getHash(static_cast<unsigned int>(proxyId1), static_cast<unsigned int>(proxyId2)) & (m_overlappingPairArray.capacity() - 1));  // New hash value with new mask
+
+	// Nothing to add if the pair is already cached.
+	b3BroadphasePair* pair = internalFindPair(proxy0, proxy1, hash);
+	if (pair != NULL)
+	{
+		return pair;
+	}
+	/*for(int i=0;i<m_overlappingPairArray.size();++i)
+		{
+		if(	(m_overlappingPairArray[i].m_pProxy0==proxy0)&&
+			(m_overlappingPairArray[i].m_pProxy1==proxy1))
+			{
+			printf("Adding duplicated %u<>%u\r\n",proxyId1,proxyId2);
+			internalFindPair(proxy0, proxy1, hash);
+			}
+		}*/
+	// `count` is the index the new pair will occupy; capture the capacity
+	// before appending so that growth can be detected afterwards.
+	int count = m_overlappingPairArray.size();
+	int oldCapacity = m_overlappingPairArray.capacity();
+	pair = &m_overlappingPairArray.expandNonInitializing();
+
+	//this is where we add an actual pair, so also call the 'ghost'
+	//	if (m_ghostPairCallback)
+	//		m_ghostPairCallback->addOverlappingPair(proxy0,proxy1);
+
+	int newCapacity = m_overlappingPairArray.capacity();
+
+	if (oldCapacity < newCapacity)
+	{
+		growTables();
+		//hash with new capacity
+		hash = static_cast<int>(getHash(static_cast<unsigned int>(proxyId1), static_cast<unsigned int>(proxyId2)) & (m_overlappingPairArray.capacity() - 1));
+	}
+
+	// The slot was appended uninitialized; fill it in now.
+	*pair = b3MakeBroadphasePair(proxy0, proxy1);
+
+	//	pair->m_pProxy0 = proxy0;
+	//	pair->m_pProxy1 = proxy1;
+	//pair->m_algorithm = 0;
+	//pair->m_internalTmpValue = 0;
+
+	// Link the new entry at the head of its bucket's chain.
+	m_next[count] = m_hashTable[hash];
+	m_hashTable[hash] = count;
+
+	return pair;
+}
+
+/// Removes the pair formed by proxy0 and proxy1 from the hashed cache, if it is stored.
+///
+/// The ids are ordered so that proxy0 <= proxy1 before hashing and lookup. A found pair is
+/// handed to cleanOverlappingPair(), unlinked from its hash bucket chain, and its array slot
+/// is then filled with the last element of m_overlappingPairArray (whose chain links are
+/// rewritten for the new index) before the array is shortened by one.
+/// b3g_removePairs is incremented on every call, even when no pair is found.
+///
+/// @param proxy0     one proxy id of the pair (either order is accepted)
+/// @param proxy1     the other proxy id of the pair
+/// @param dispatcher forwarded unchanged to cleanOverlappingPair()
+/// @return always 0, whether or not a pair was removed
+void* b3HashedOverlappingPairCache::removeOverlappingPair(int proxy0, int proxy1, b3Dispatcher* dispatcher)
+{
+	b3g_removePairs++;
+	if (proxy0 > proxy1)
+		b3Swap(proxy0, proxy1);
+	int proxyId1 = proxy0;
+	int proxyId2 = proxy1;
+
+	/*if (proxyId1 > proxyId2) 
+		b3Swap(proxyId1, proxyId2);*/
+
+	// Bucket index: the hash is masked with (capacity - 1).
+	// NOTE(review): this masking presumably relies on the capacity being a power of two - confirm.
+	int hash = static_cast<int>(getHash(static_cast<unsigned int>(proxyId1), static_cast<unsigned int>(proxyId2)) & (m_overlappingPairArray.capacity() - 1));
+
+	b3BroadphasePair* pair = internalFindPair(proxy0, proxy1, hash);
+	if (pair == NULL)
+	{
+		return 0;
+	}
+
+	cleanOverlappingPair(*pair, dispatcher);
+
+	// Index of the found pair inside the contiguous pair array.
+	int pairIndex = int(pair - &m_overlappingPairArray[0]);
+	b3Assert(pairIndex < m_overlappingPairArray.size());
+
+	// Remove the pair from the hash table.
+	int index = m_hashTable[hash];
+	b3Assert(index != B3_NULL_PAIR);
+
+	// Walk the bucket chain until the pair's index is reached, remembering its predecessor.
+	int previous = B3_NULL_PAIR;
+	while (index != pairIndex)
+	{
+		previous = index;
+		index = m_next[index];
+	}
+
+	if (previous != B3_NULL_PAIR)
+	{
+		// The pair is in the middle or at the end of the chain: bypass it.
+		b3Assert(m_next[previous] == pairIndex);
+		m_next[previous] = m_next[pairIndex];
+	}
+	else
+	{
+		// The pair is the head of the chain: the bucket now starts at its successor.
+		m_hashTable[hash] = m_next[pairIndex];
+	}
+
+	// We now move the last pair into spot of the
+	// pair being removed. We need to fix the hash
+	// table indices to support the move.
+
+	int lastPairIndex = m_overlappingPairArray.size() - 1;
+
+	//if (m_ghostPairCallback)
+	//	m_ghostPairCallback->removeOverlappingPair(proxy0, proxy1,dispatcher);
+
+	// If the removed pair is the last pair, we are done.
+	if (lastPairIndex == pairIndex)
+	{
+		m_overlappingPairArray.pop_back();
+		return 0;
+	}
+
+	// Remove the last pair from the hash table.
+	const b3BroadphasePair* last = &m_overlappingPairArray[lastPairIndex];
+	/* missing swap here too, Nat. */
+	int lastHash = static_cast<int>(getHash(static_cast<unsigned int>(last->x), static_cast<unsigned int>(last->y)) & (m_overlappingPairArray.capacity() - 1));
+
+	index = m_hashTable[lastHash];
+	b3Assert(index != B3_NULL_PAIR);
+
+	// Same chain walk as above, this time looking for the last pair's index.
+	previous = B3_NULL_PAIR;
+	while (index != lastPairIndex)
+	{
+		previous = index;
+		index = m_next[index];
+	}
+
+	if (previous != B3_NULL_PAIR)
+	{
+		b3Assert(m_next[previous] == lastPairIndex);
+		m_next[previous] = m_next[lastPairIndex];
+	}
+	else
+	{
+		m_hashTable[lastHash] = m_next[lastPairIndex];
+	}
+
+	// Copy the last pair into the remove pair's spot.
+	m_overlappingPairArray[pairIndex] = m_overlappingPairArray[lastPairIndex];
+
+	// Insert the last pair into the hash table
+	// (it becomes the new head of its bucket, now stored at pairIndex).
+	m_next[pairIndex] = m_hashTable[lastHash];
+	m_hashTable[lastHash] = pairIndex;
+
+	m_overlappingPairArray.pop_back();
+
+	return 0;
+}
+//#include <stdio.h>
+
+/// Offers every stored pair to the callback and removes those it asks to delete.
+///
+/// @param callback   processOverlap() is called once per visited pair; a true result
+///                   requests removal of that pair
+/// @param dispatcher forwarded to removeOverlappingPair() for removed pairs
+void b3HashedOverlappingPairCache::processAllOverlappingPairs(b3OverlapCallback* callback, b3Dispatcher* dispatcher)
+{
+	int cursor = 0;
+
+	while (cursor < m_overlappingPairArray.size())
+	{
+		b3BroadphasePair& candidate = m_overlappingPairArray[cursor];
+
+		if (!callback->processOverlap(candidate))
+		{
+			// Pair is kept: move on to the next slot.
+			++cursor;
+			continue;
+		}
+
+		// Removal moves the last pair into this slot, so the cursor stays put and the
+		// same index is examined again on the next iteration.
+		removeOverlappingPair(candidate.x, candidate.y, dispatcher);
+		b3g_overlappingPairs--;
+	}
+}
+
+/// Re-orders the stored pairs by rebuilding the whole cache.
+///
+/// The hash table refers to pairs by array index, so the pairs are copied out, removed
+/// one by one, sorted with b3BroadphasePairSortPredicate and then added back in order.
+///
+/// @param dispatcher forwarded to removeOverlappingPair() while the cache is emptied
+void b3HashedOverlappingPairCache::sortOverlappingPairs(b3Dispatcher* dispatcher)
+{
+	// Take a private copy of every pair currently stored.
+	b3BroadphasePairArray snapshot;
+	for (int src = 0; src < m_overlappingPairArray.size(); ++src)
+	{
+		snapshot.push_back(m_overlappingPairArray[src]);
+	}
+
+	const int pairCount = snapshot.size();
+
+	// Empty the cache through the regular removal path.
+	for (int k = 0; k < pairCount; ++k)
+	{
+		removeOverlappingPair(snapshot[k].x, snapshot[k].y, dispatcher);
+	}
+
+	// Reset every chain link before re-inserting.
+	for (int link = 0; link < m_next.size(); ++link)
+	{
+		m_next[link] = B3_NULL_PAIR;
+	}
+
+	snapshot.quickSort(b3BroadphasePairSortPredicate());
+
+	// Re-insert in sorted order so array order and hash table agree again.
+	for (int k = 0; k < pairCount; ++k)
+	{
+		addOverlappingPair(snapshot[k].x, snapshot[k].y);
+	}
+}
+
+/// Removes the pair (proxy0, proxy1) from the sorted cache, unless removal is deferred.
+///
+/// When hasDeferredRemoval() returns true this function does nothing. Otherwise the pair
+/// is located with a linear search, passed to cleanOverlappingPair(), swapped with the
+/// last stored pair and popped from the array.
+///
+/// @param proxy0     first proxy id of the pair
+/// @param proxy1     second proxy id of the pair
+/// @param dispatcher forwarded unchanged to cleanOverlappingPair()
+/// @return always 0, whether or not a pair was removed
+void* b3SortedOverlappingPairCache::removeOverlappingPair(int proxy0, int proxy1, b3Dispatcher* dispatcher)
+{
+	if (!hasDeferredRemoval())
+	{
+		b3BroadphasePair findPair = b3MakeBroadphasePair(proxy0, proxy1);
+
+		// An index equal to size() is treated as "not found" by the check below.
+		int findIndex = m_overlappingPairArray.findLinearSearch(findPair);
+		if (findIndex < m_overlappingPairArray.size())
+		{
+			b3g_overlappingPairs--;
+			b3BroadphasePair& pair = m_overlappingPairArray[findIndex];
+
+			cleanOverlappingPair(pair, dispatcher);
+			//if (m_ghostPairCallback)
+			//	m_ghostPairCallback->removeOverlappingPair(proxy0, proxy1,dispatcher);
+
+			// Swap with the last *stored* pair (size() - 1). The previous code used
+			// capacity() - 1, which addresses an unused slot whenever the array has spare
+			// capacity, so pop_back() then discarded a different, still-valid pair.
+			m_overlappingPairArray.swap(findIndex, m_overlappingPairArray.size() - 1);
+			m_overlappingPairArray.pop_back();
+			return 0;
+		}
+	}
+
+	return 0;
+}
+
+/// Appends a new pair for (proxy0, proxy1) to the sorted cache.
+///
+/// No duplicate check is performed here. The pair is only stored when
+/// needsBroadphaseCollision() accepts it; both global pair counters are bumped on success.
+///
+/// @param proxy0 first proxy id; must differ from proxy1 (asserted)
+/// @param proxy1 second proxy id
+/// @return pointer to the stored pair, or 0 when the pair was filtered out
+b3BroadphasePair* b3SortedOverlappingPairCache::addOverlappingPair(int proxy0, int proxy1)
+{
+	// A proxy never overlaps with itself.
+	b3Assert(proxy0 != proxy1);
+
+	if (needsBroadphaseCollision(proxy0, proxy1))
+	{
+		b3BroadphasePair& slot = m_overlappingPairArray.expandNonInitializing();
+		slot = b3MakeBroadphasePair(proxy0, proxy1);
+
+		++b3g_overlappingPairs;
+		++b3g_addedPairs;
+
+		//	if (m_ghostPairCallback)
+		//		m_ghostPairCallback->addOverlappingPair(proxy0, proxy1);
+		return &slot;
+	}
+
+	return 0;
+}
+
+///this findPair becomes really slow. Either sort the list to speedup the query, or
+///use a different solution. It is mainly used for Removing overlapping pairs. Removal could be delayed.
+///we could keep a linked list in each proxy, and store pair in one of the proxies (with lowest memory address)
+///Also we can use a 2D bitmap, which can be useful for a future GPU implementation
+/// Looks up the stored pair for (proxy0, proxy1) with a linear search.
+///
+/// @param proxy0 first proxy id of the pair
+/// @param proxy1 second proxy id of the pair
+/// @return pointer to the stored pair, or 0 when the pair is filtered out by
+///         needsBroadphaseCollision() or is not stored
+b3BroadphasePair* b3SortedOverlappingPairCache::findPair(int proxy0, int proxy1)
+{
+	if (needsBroadphaseCollision(proxy0, proxy1))
+	{
+		b3BroadphasePair wanted = b3MakeBroadphasePair(proxy0, proxy1);
+		int hit = m_overlappingPairArray.findLinearSearch(wanted);
+
+		// Only indices below size() refer to a stored pair.
+		if (hit < m_overlappingPairArray.size())
+		{
+			//b3Assert(it != m_overlappingPairSet.end());
+			return &m_overlappingPairArray[hit];
+		}
+	}
+
+	return 0;
+}
+
+//#include <stdio.h>
+
+/// Offers every stored pair to the callback and drops those it asks to delete.
+///
+/// A pair to be dropped is cleaned, has both ids set to -1, is swapped to the end of the
+/// array and popped; the same index is then examined again because it now holds the pair
+/// that used to be last.
+///
+/// @param callback   processOverlap() returning true requests removal of that pair
+/// @param dispatcher forwarded to cleanOverlappingPair() for removed pairs
+void b3SortedOverlappingPairCache::processAllOverlappingPairs(b3OverlapCallback* callback, b3Dispatcher* dispatcher)
+{
+	int cursor = 0;
+
+	while (cursor < m_overlappingPairArray.size())
+	{
+		b3BroadphasePair& candidate = m_overlappingPairArray[cursor];
+
+		if (!callback->processOverlap(candidate))
+		{
+			// Pair is kept: advance.
+			++cursor;
+			continue;
+		}
+
+		cleanOverlappingPair(candidate, dispatcher);
+		candidate.x = -1;
+		candidate.y = -1;
+		m_overlappingPairArray.swap(cursor, m_overlappingPairArray.size() - 1);
+		m_overlappingPairArray.pop_back();
+		b3g_overlappingPairs--;
+	}
+}
+
+/// Creates an empty sorted pair cache: not blocked for changes, deferred removal enabled,
+/// no overlap filter callback, and room reserved for two pairs.
+b3SortedOverlappingPairCache::b3SortedOverlappingPairCache()
+	: m_blockedForChanges(false),
+	  m_hasDeferredRemoval(true),
+	  m_overlapFilterCallback(0)
+{
+	const int startingCapacity = 2;
+	m_overlappingPairArray.reserve(startingCapacity);
+}
+
+/// Destructor with an empty body: no explicit cleanup is performed here.
+b3SortedOverlappingPairCache::~b3SortedOverlappingPairCache()
+{
+}
+
+/// Hook called before a pair is removed or when a proxy is cleaned from its pairs.
+/// Currently a no-op: the collision-algorithm release code below is commented out, so
+/// neither `pair` nor `dispatcher` is used.
+void b3SortedOverlappingPairCache::cleanOverlappingPair(b3BroadphasePair& pair, b3Dispatcher* dispatcher)
+{
+	/*	if (pair.m_algorithm)
+	{
+		{
+			pair.m_algorithm->~b3CollisionAlgorithm();
+			dispatcher->freeCollisionAlgorithm(pair.m_algorithm);
+			pair.m_algorithm=0;
+			b3g_removePairs--;
+		}
+	}
+	*/
+}
+
+/// Calls cleanOverlappingPair() on every stored pair that references `proxy`.
+/// No pair is removed from the cache by this function.
+///
+/// @param proxy      proxy id whose pairs should be cleaned
+/// @param dispatcher forwarded to cleanOverlappingPair() and processAllOverlappingPairs()
+void b3SortedOverlappingPairCache::cleanProxyFromPairs(int proxy, b3Dispatcher* dispatcher)
+{
+	// Visitor that cleans matching pairs and never requests their deletion.
+	class ProxyCleaner : public b3OverlapCallback
+	{
+		int m_targetProxy;
+		b3OverlappingPairCache* m_owner;
+		b3Dispatcher* m_dispatch;
+
+	public:
+		ProxyCleaner(int targetProxy, b3OverlappingPairCache* owner, b3Dispatcher* dispatch)
+			: m_targetProxy(targetProxy),
+			  m_owner(owner),
+			  m_dispatch(dispatch)
+		{
+		}
+
+		virtual bool processOverlap(b3BroadphasePair& pair)
+		{
+			const bool touchesProxy = (pair.x == m_targetProxy) || (pair.y == m_targetProxy);
+			if (touchesProxy)
+			{
+				m_owner->cleanOverlappingPair(pair, m_dispatch);
+			}
+			// false == keep the pair in the cache
+			return false;
+		}
+	};
+
+	ProxyCleaner cleaner(proxy, this, dispatcher);
+	processAllOverlappingPairs(&cleaner, dispatcher);
+}
+
+/// Removes every stored pair that references `proxy`.
+///
+/// @param proxy      proxy id that is no longer valid
+/// @param dispatcher forwarded to processAllOverlappingPairs()
+void b3SortedOverlappingPairCache::removeOverlappingPairsContainingProxy(int proxy, b3Dispatcher* dispatcher)
+{
+	// Visitor that requests deletion of each pair containing the given proxy.
+	class StaleProxyMatcher : public b3OverlapCallback
+	{
+		int m_staleProxy;
+
+	public:
+		StaleProxyMatcher(int staleProxy)
+			: m_staleProxy(staleProxy)
+		{
+		}
+
+		virtual bool processOverlap(b3BroadphasePair& pair)
+		{
+			if (pair.x == m_staleProxy)
+			{
+				return true;
+			}
+			return pair.y == m_staleProxy;
+		}
+	};
+
+	StaleProxyMatcher matcher(proxy);
+	processAllOverlappingPairs(&matcher, dispatcher);
+}
+
+/// Intentionally empty: this implementation performs no sorting and ignores `dispatcher`.
+void b3SortedOverlappingPairCache::sortOverlappingPairs(b3Dispatcher* dispatcher)
+{
+	//should already be sorted
+}

+ 427 - 0
Dependencies/include/bullet3/Bullet3Collision/BroadPhaseCollision/b3OverlappingPairCache.h

@@ -0,0 +1,427 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2013 Erwin Coumans  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef B3_OVERLAPPING_PAIR_CACHE_H
+#define B3_OVERLAPPING_PAIR_CACHE_H
+
+#include "Bullet3Common/shared/b3Int2.h"
+#include "Bullet3Common/b3AlignedObjectArray.h"
+
+class b3Dispatcher;
+#include "b3OverlappingPair.h"
+
+typedef b3AlignedObjectArray<b3BroadphasePair> b3BroadphasePairArray;
+
+/// Visitor interface used by processAllOverlappingPairs() to inspect stored pairs.
+struct b3OverlapCallback
+{
+	virtual ~b3OverlapCallback()
+	{
+	}
+	//return true for deletion of the pair
+	virtual bool processOverlap(b3BroadphasePair& pair) = 0;
+};
+
+/// User-supplied filter consulted by the pair caches' needsBroadphaseCollision() when a
+/// callback has been installed with setOverlapFilterCallback().
+struct b3OverlapFilterCallback
+{
+	virtual ~b3OverlapFilterCallback()
+	{
+	}
+	// return true when pairs need collision
+	virtual bool needBroadphaseCollision(int proxy0, int proxy1) const = 0;
+};
+
+// Global counters defined elsewhere; the pair cache implementations increment them
+// when pairs are removed and added.
+extern int b3g_removePairs;
+extern int b3g_addedPairs;
+extern int b3g_findPairs;
+
+// Sentinel index meaning "no pair": marks the end of a hash bucket chain.
+// NOTE(review): 0xffffffff does not fit in a signed int; the stored value is presumably -1
+// on two's-complement targets - confirm before relying on it.
+const int B3_NULL_PAIR = 0xffffffff;
+
+///The b3OverlappingPairCache provides an interface for overlapping pair management (add, remove, storage), used by the b3BroadphaseInterface broadphases.
+///The b3HashedOverlappingPairCache and b3SortedOverlappingPairCache classes are two implementations.
+class b3OverlappingPairCache
+{
+public:
+	virtual ~b3OverlappingPairCache() {}  // this is needed so we can get to the derived class destructor
+
+	// Pointer to the first stored pair (mutable and read-only variants).
+	virtual b3BroadphasePair* getOverlappingPairArrayPtr() = 0;
+
+	virtual const b3BroadphasePair* getOverlappingPairArrayPtr() const = 0;
+
+	// Direct access to the array that holds the pairs.
+	virtual b3BroadphasePairArray& getOverlappingPairArray() = 0;
+
+	// Per-pair cleanup hook, invoked by implementations before a pair is dropped.
+	virtual void cleanOverlappingPair(b3BroadphasePair& pair, b3Dispatcher* dispatcher) = 0;
+
+	virtual int getNumOverlappingPairs() const = 0;
+
+	// Runs the cleanup hook for the pairs that reference `proxy`.
+	virtual void cleanProxyFromPairs(int proxy, b3Dispatcher* dispatcher) = 0;
+
+	virtual void setOverlapFilterCallback(b3OverlapFilterCallback* callback) = 0;
+
+	// Visits the stored pairs; the callback returns true to request deletion of a pair.
+	virtual void processAllOverlappingPairs(b3OverlapCallback*, b3Dispatcher* dispatcher) = 0;
+
+	virtual b3BroadphasePair* findPair(int proxy0, int proxy1) = 0;
+
+	virtual bool hasDeferredRemoval() = 0;
+
+	//virtual	void	setInternalGhostPairCallback(b3OverlappingPairCallback* ghostPairCallback)=0;
+
+	virtual b3BroadphasePair* addOverlappingPair(int proxy0, int proxy1) = 0;
+	virtual void* removeOverlappingPair(int proxy0, int proxy1, b3Dispatcher* dispatcher) = 0;
+	virtual void removeOverlappingPairsContainingProxy(int /*proxy0*/, b3Dispatcher* /*dispatcher*/) = 0;
+
+	virtual void sortOverlappingPairs(b3Dispatcher* dispatcher) = 0;
+};
+
+/// Hash-space based Pair Cache, thanks to Erin Catto, Box2D, http://www.box2d.org, and Pierre Terdiman, Codercorner, http://codercorner.com
+class b3HashedOverlappingPairCache : public b3OverlappingPairCache
+{
+	// Contiguous storage of all pairs; m_hashTable and m_next refer to pairs by index into it.
+	b3BroadphasePairArray m_overlappingPairArray;
+	// Optional user filter; when null every pair is accepted (see needsBroadphaseCollision).
+	b3OverlapFilterCallback* m_overlapFilterCallback;
+	//	bool		m_blockedForChanges;
+
+public:
+	b3HashedOverlappingPairCache();
+	virtual ~b3HashedOverlappingPairCache();
+
+	virtual void removeOverlappingPairsContainingProxy(int proxy, b3Dispatcher* dispatcher);
+
+	virtual void* removeOverlappingPair(int proxy0, int proxy1, b3Dispatcher* dispatcher);
+
+	// Returns the filter callback's verdict when one is installed, otherwise true.
+	B3_FORCE_INLINE bool needsBroadphaseCollision(int proxy0, int proxy1) const
+	{
+		if (m_overlapFilterCallback)
+			return m_overlapFilterCallback->needBroadphaseCollision(proxy0, proxy1);
+
+		bool collides = true;  //(proxy0->m_collisionFilterGroup & proxy1->m_collisionFilterMask) != 0;
+		//collides = collides && (proxy1->m_collisionFilterGroup & proxy0->m_collisionFilterMask);
+
+		return collides;
+	}
+
+	// Add a pair and return the new pair. If the pair already exists,
+	// no new pair is created and the old one is returned.
+	// Returns 0 when the pair is rejected by needsBroadphaseCollision();
+	// b3g_addedPairs is incremented on every call, including rejected ones.
+	virtual b3BroadphasePair* addOverlappingPair(int proxy0, int proxy1)
+	{
+		b3g_addedPairs++;
+
+		if (!needsBroadphaseCollision(proxy0, proxy1))
+			return 0;
+
+		return internalAddPair(proxy0, proxy1);
+	}
+
+	void cleanProxyFromPairs(int proxy, b3Dispatcher* dispatcher);
+
+	virtual void processAllOverlappingPairs(b3OverlapCallback*, b3Dispatcher* dispatcher);
+
+	// NOTE(review): both pointer accessors index element 0 unconditionally; behavior on an
+	// empty array depends on b3AlignedObjectArray::operator[] - confirm before calling.
+	virtual b3BroadphasePair* getOverlappingPairArrayPtr()
+	{
+		return &m_overlappingPairArray[0];
+	}
+
+	const b3BroadphasePair* getOverlappingPairArrayPtr() const
+	{
+		return &m_overlappingPairArray[0];
+	}
+
+	b3BroadphasePairArray& getOverlappingPairArray()
+	{
+		return m_overlappingPairArray;
+	}
+
+	const b3BroadphasePairArray& getOverlappingPairArray() const
+	{
+		return m_overlappingPairArray;
+	}
+
+	void cleanOverlappingPair(b3BroadphasePair& pair, b3Dispatcher* dispatcher);
+
+	b3BroadphasePair* findPair(int proxy0, int proxy1);
+
+	// Number of stored pairs (same value as getNumOverlappingPairs()).
+	int GetCount() const { return m_overlappingPairArray.size(); }
+	//	b3BroadphasePair* GetPairs() { return m_pairs; }
+
+	b3OverlapFilterCallback* getOverlapFilterCallback()
+	{
+		return m_overlapFilterCallback;
+	}
+
+	void setOverlapFilterCallback(b3OverlapFilterCallback* callback)
+	{
+		m_overlapFilterCallback = callback;
+	}
+
+	int getNumOverlappingPairs() const
+	{
+		return m_overlappingPairArray.size();
+	}
+
+private:
+	b3BroadphasePair* internalAddPair(int proxy0, int proxy1);
+
+	void growTables();
+
+	// Exact, order-sensitive comparison: (x, y) must equal (proxyId1, proxyId2).
+	B3_FORCE_INLINE bool equalsPair(const b3BroadphasePair& pair, int proxyId1, int proxyId2)
+	{
+		return pair.x == proxyId1 && pair.y == proxyId2;
+	}
+
+	/*
+	// Thomas Wang's hash, see: http://www.concentric.net/~Ttwang/tech/inthash.htm
+	// This assumes proxyId1 and proxyId2 are 16-bit.
+	B3_FORCE_INLINE int getHash(int proxyId1, int proxyId2)
+	{
+		int key = (proxyId2 << 16) | proxyId1;
+		key = ~key + (key << 15);
+		key = key ^ (key >> 12);
+		key = key + (key << 2);
+		key = key ^ (key >> 4);
+		key = key * 2057;
+		key = key ^ (key >> 16);
+		return key;
+	}
+	*/
+
+	// Mixes the two ids into one hash value. Callers mask the result with
+	// (capacity - 1) to obtain a bucket index.
+	B3_FORCE_INLINE unsigned int getHash(unsigned int proxyId1, unsigned int proxyId2)
+	{
+		// Combine both ids into one key: proxyId1 OR'ed with proxyId2 shifted left by 16 bits.
+		int key = static_cast<int>(((unsigned int)proxyId1) | (((unsigned int)proxyId2) << 16));
+		// Thomas Wang's hash
+
+		key += ~(key << 15);
+		key ^= (key >> 10);
+		key += (key << 3);
+		key ^= (key >> 6);
+		key += ~(key << 11);
+		key ^= (key >> 16);
+		return static_cast<unsigned int>(key);
+	}
+
+	// Walks the chain of bucket `hash` and returns the pair equal to (proxy0, proxy1),
+	// or NULL when the chain ends without a match. The ids are compared as given, so
+	// callers must pass them in the same order that was used when the pair was stored.
+	B3_FORCE_INLINE b3BroadphasePair* internalFindPair(int proxy0, int proxy1, int hash)
+	{
+		int proxyId1 = proxy0;
+		int proxyId2 = proxy1;
+#if 0  // wrong, 'equalsPair' use unsorted uids, copy-past devil striked again. Nat.
+		if (proxyId1 > proxyId2) 
+			b3Swap(proxyId1, proxyId2);
+#endif
+
+		int index = m_hashTable[hash];
+
+		while (index != B3_NULL_PAIR && equalsPair(m_overlappingPairArray[index], proxyId1, proxyId2) == false)
+		{
+			index = m_next[index];
+		}
+
+		if (index == B3_NULL_PAIR)
+		{
+			return NULL;
+		}
+
+		b3Assert(index < m_overlappingPairArray.size());
+
+		return &m_overlappingPairArray[index];
+	}
+
+	// The hashed cache removes pairs immediately.
+	virtual bool hasDeferredRemoval()
+	{
+		return false;
+	}
+
+	/*	virtual	void	setInternalGhostPairCallback(b3OverlappingPairCallback* ghostPairCallback)
+	{
+		m_ghostPairCallback = ghostPairCallback;
+	}
+	*/
+
+	virtual void sortOverlappingPairs(b3Dispatcher* dispatcher);
+
+protected:
+	// m_hashTable[bucket] is the index of the first pair in that bucket's chain;
+	// m_next[i] is the index of the pair following pair i in its chain.
+	// B3_NULL_PAIR terminates a chain.
+	b3AlignedObjectArray<int> m_hashTable;
+	b3AlignedObjectArray<int> m_next;
+	//	b3OverlappingPairCallback*	m_ghostPairCallback;
+};
+
+///b3SortedOverlappingPairCache maintains the objects with overlapping AABB
+///Typically managed by the Broadphase, Axis3Sweep or b3SimpleBroadphase
+class b3SortedOverlappingPairCache : public b3OverlappingPairCache
+{
+protected:
+	//avoid brute-force finding all the time
+	b3BroadphasePairArray m_overlappingPairArray;
+
+	//during the dispatch, check that user doesn't destroy/create proxy
+	bool m_blockedForChanges;
+
+	///by default, do the removal during the pair traversal
+	bool m_hasDeferredRemoval;
+
+	//if set, use the callback instead of the built in filter in needBroadphaseCollision
+	b3OverlapFilterCallback* m_overlapFilterCallback;
+
+	//		b3OverlappingPairCallback*	m_ghostPairCallback;
+
+public:
+	b3SortedOverlappingPairCache();
+	virtual ~b3SortedOverlappingPairCache();
+
+	virtual void processAllOverlappingPairs(b3OverlapCallback*, b3Dispatcher* dispatcher);
+
+	// Does nothing while hasDeferredRemoval() is true; always returns 0.
+	void* removeOverlappingPair(int proxy0, int proxy1, b3Dispatcher* dispatcher);
+
+	void cleanOverlappingPair(b3BroadphasePair& pair, b3Dispatcher* dispatcher);
+
+	// Appends a pair without checking for duplicates; returns 0 when filtered out.
+	b3BroadphasePair* addOverlappingPair(int proxy0, int proxy1);
+
+	// Linear search over the stored pairs; returns 0 when nothing matches.
+	b3BroadphasePair* findPair(int proxy0, int proxy1);
+
+	void cleanProxyFromPairs(int proxy, b3Dispatcher* dispatcher);
+
+	virtual void removeOverlappingPairsContainingProxy(int proxy, b3Dispatcher* dispatcher);
+
+	// Returns the filter callback's verdict when one is installed, otherwise true.
+	inline bool needsBroadphaseCollision(int proxy0, int proxy1) const
+	{
+		if (m_overlapFilterCallback)
+			return m_overlapFilterCallback->needBroadphaseCollision(proxy0, proxy1);
+
+		bool collides = true;  //(proxy0->m_collisionFilterGroup & proxy1->m_collisionFilterMask) != 0;
+		//collides = collides && (proxy1->m_collisionFilterGroup & proxy0->m_collisionFilterMask);
+
+		return collides;
+	}
+
+	b3BroadphasePairArray& getOverlappingPairArray()
+	{
+		return m_overlappingPairArray;
+	}
+
+	const b3BroadphasePairArray& getOverlappingPairArray() const
+	{
+		return m_overlappingPairArray;
+	}
+
+	// NOTE(review): both pointer accessors index element 0 unconditionally; behavior on an
+	// empty array depends on b3AlignedObjectArray::operator[] - confirm before calling.
+	b3BroadphasePair* getOverlappingPairArrayPtr()
+	{
+		return &m_overlappingPairArray[0];
+	}
+
+	const b3BroadphasePair* getOverlappingPairArrayPtr() const
+	{
+		return &m_overlappingPairArray[0];
+	}
+
+	int getNumOverlappingPairs() const
+	{
+		return m_overlappingPairArray.size();
+	}
+
+	b3OverlapFilterCallback* getOverlapFilterCallback()
+	{
+		return m_overlapFilterCallback;
+	}
+
+	void setOverlapFilterCallback(b3OverlapFilterCallback* callback)
+	{
+		m_overlapFilterCallback = callback;
+	}
+
+	// Reports the m_hasDeferredRemoval flag.
+	virtual bool hasDeferredRemoval()
+	{
+		return m_hasDeferredRemoval;
+	}
+
+	/*		virtual	void	setInternalGhostPairCallback(b3OverlappingPairCallback* ghostPairCallback)
+		{
+			m_ghostPairCallback = ghostPairCallback;
+		}
+		*/
+	virtual void sortOverlappingPairs(b3Dispatcher* dispatcher);
+};
+
+///b3NullPairCache skips add/removal of overlapping pairs. Useful for benchmarking and unit testing.
+class b3NullPairCache : public b3OverlappingPairCache
+{
+	// Nothing in this class ever adds to this array; it only backs the array accessors.
+	b3BroadphasePairArray m_overlappingPairArray;
+
+public:
+	// NOTE(review): both pointer accessors index element 0 of an array this class never
+	// fills; behavior depends on b3AlignedObjectArray::operator[] - confirm before calling.
+	virtual b3BroadphasePair* getOverlappingPairArrayPtr()
+	{
+		return &m_overlappingPairArray[0];
+	}
+	const b3BroadphasePair* getOverlappingPairArrayPtr() const
+	{
+		return &m_overlappingPairArray[0];
+	}
+	b3BroadphasePairArray& getOverlappingPairArray()
+	{
+		return m_overlappingPairArray;
+	}
+
+	// No-op.
+	virtual void cleanOverlappingPair(b3BroadphasePair& /*pair*/, b3Dispatcher* /*dispatcher*/)
+	{
+	}
+
+	// Always reports zero pairs.
+	virtual int getNumOverlappingPairs() const
+	{
+		return 0;
+	}
+
+	// No-op.
+	virtual void cleanProxyFromPairs(int /*proxy*/, b3Dispatcher* /*dispatcher*/)
+	{
+	}
+
+	// No-op: the callback is not stored.
+	virtual void setOverlapFilterCallback(b3OverlapFilterCallback* /*callback*/)
+	{
+	}
+
+	// No-op: the callback is never invoked.
+	virtual void processAllOverlappingPairs(b3OverlapCallback*, b3Dispatcher* /*dispatcher*/)
+	{
+	}
+
+	// Never finds a pair.
+	virtual b3BroadphasePair* findPair(int /*proxy0*/, int /*proxy1*/)
+	{
+		return 0;
+	}
+
+	virtual bool hasDeferredRemoval()
+	{
+		return true;
+	}
+
+	//	virtual	void	setInternalGhostPairCallback(b3OverlappingPairCallback* /* ghostPairCallback */)
+	//	{
+	//
+	//	}
+
+	// Stores nothing and returns 0.
+	virtual b3BroadphasePair* addOverlappingPair(int /*proxy0*/, int /*proxy1*/)
+	{
+		return 0;
+	}
+
+	// Removes nothing and returns 0.
+	virtual void* removeOverlappingPair(int /*proxy0*/, int /*proxy1*/, b3Dispatcher* /*dispatcher*/)
+	{
+		return 0;
+	}
+
+	// No-op.
+	virtual void removeOverlappingPairsContainingProxy(int /*proxy0*/, b3Dispatcher* /*dispatcher*/)
+	{
+	}
+
+	// No-op; the cast only marks the parameter as intentionally unused.
+	virtual void sortOverlappingPairs(b3Dispatcher* dispatcher)
+	{
+		(void)dispatcher;
+	}
+};
+
+#endif  //B3_OVERLAPPING_PAIR_CACHE_H

+ 56 - 0
Dependencies/include/bullet3/Bullet3Collision/BroadPhaseCollision/shared/b3Aabb.h

@@ -0,0 +1,56 @@
+
+#ifndef B3_AABB_H
+#define B3_AABB_H
+
+#include "Bullet3Common/shared/b3Float4.h"
+#include "Bullet3Common/shared/b3Mat3x3.h"
+
+typedef struct b3Aabb b3Aabb_t;
+
+// Axis-aligned bounding box stored as a minimum and a maximum corner. Each corner is a
+// union, so the same storage can be read as four floats, as a b3Float4, or as four ints.
+struct b3Aabb
+{
+	// Minimum corner.
+	union {
+		float m_min[4];
+		b3Float4 m_minVec;
+		int m_minIndices[4];
+	};
+	// Maximum corner.
+	union {
+		float m_max[4];
+		b3Float4 m_maxVec;
+		int m_signedMaxIndices[4];
+	};
+};
+
+/// Computes the bounds of a local-space AABB after it is grown by `margin` and
+/// transformed by position `pos` and orientation `orn`.
+///
+/// The local half extents (plus margin on x, y, z) are projected onto the rows of the
+/// absolute rotation matrix to obtain the transformed extent around the transformed center.
+///
+/// @param localAabbMin minimum corner of the local box
+/// @param localAabbMax maximum corner of the local box
+/// @param margin       amount added to each of the three half extents
+/// @param pos          translation applied to the box center
+/// @param orn          rotation applied to the box
+/// @param aabbMinOut   receives center - extent
+/// @param aabbMaxOut   receives center + extent
+inline void b3TransformAabb2(b3Float4ConstArg localAabbMin, b3Float4ConstArg localAabbMax, float margin,
+							 b3Float4ConstArg pos,
+							 b3QuatConstArg orn,
+							 b3Float4* aabbMinOut, b3Float4* aabbMaxOut)
+{
+	// Half size of the local box, padded by the margin (w stays untouched by the padding).
+	b3Float4 halfSize = 0.5f * (localAabbMax - localAabbMin);
+	halfSize += b3MakeFloat4(margin, margin, margin, 0.f);
+
+	b3Float4 midPoint = 0.5f * (localAabbMax + localAabbMin);
+
+	b3Mat3x3 rotation = b3QuatGetRotationMatrix(orn);
+	b3Mat3x3 absRotation = b3AbsoluteMat3x3(rotation);
+
+	b3Float4 worldCenter = b3TransformPoint(midPoint, pos, orn);
+
+	// Extent along each axis: half size projected onto the matching absolute rotation row.
+	float extentX = b3Dot3F4(halfSize, b3GetRow(absRotation, 0));
+	float extentY = b3Dot3F4(halfSize, b3GetRow(absRotation, 1));
+	float extentZ = b3Dot3F4(halfSize, b3GetRow(absRotation, 2));
+	b3Float4 worldExtent = b3MakeFloat4(extentX, extentY, extentZ, 0.f);
+
+	*aabbMinOut = worldCenter - worldExtent;
+	*aabbMaxOut = worldCenter + worldExtent;
+}
+
+/// conservative test for overlap between two aabbs
+/// Reports whether two AABBs overlap on all three axes.
+/// Boxes that merely touch (equal coordinates on an axis) count as overlapping.
+///
+/// @param aabbMin1 minimum corner of the first box
+/// @param aabbMax1 maximum corner of the first box
+/// @param aabbMin2 minimum corner of the second box
+/// @param aabbMax2 maximum corner of the second box
+/// @return false as soon as the boxes are separated on x, z or y; true otherwise
+inline bool b3TestAabbAgainstAabb(b3Float4ConstArg aabbMin1, b3Float4ConstArg aabbMax1,
+								  b3Float4ConstArg aabbMin2, b3Float4ConstArg aabbMax2)
+{
+	if (aabbMin1.x > aabbMax2.x || aabbMax1.x < aabbMin2.x)
+	{
+		return false;
+	}
+	if (aabbMin1.z > aabbMax2.z || aabbMax1.z < aabbMin2.z)
+	{
+		return false;
+	}
+	if (aabbMin1.y > aabbMax2.y || aabbMax1.y < aabbMin2.y)
+	{
+		return false;
+	}
+	return true;
+}
+
+#endif  //B3_AABB_H

+ 93 - 0
Dependencies/include/bullet3/Bullet3Collision/CMakeLists.txt

@@ -0,0 +1,93 @@
+
+# Build script for the Bullet3Collision library: lists its sources and headers, creates
+# the library target, and installs it when INSTALL_LIBS is set.
+INCLUDE_DIRECTORIES(
+	${BULLET_PHYSICS_SOURCE_DIR}/src
+)
+
+# Translation units compiled into the library.
+SET(Bullet3Collision_SRCS
+	BroadPhaseCollision/b3DynamicBvh.cpp
+	BroadPhaseCollision/b3DynamicBvhBroadphase.cpp
+	BroadPhaseCollision/b3OverlappingPairCache.cpp
+	NarrowPhaseCollision/b3ConvexUtility.cpp
+	NarrowPhaseCollision/b3CpuNarrowPhase.cpp
+)
+
+# Header lists, grouped by sub-directory.
+SET(Bullet3CollisionBroadPhase_HDRS
+	BroadPhaseCollision/b3BroadphaseCallback.h
+	BroadPhaseCollision/b3DynamicBvh.h
+	BroadPhaseCollision/b3DynamicBvhBroadphase.h
+	BroadPhaseCollision/b3OverlappingPair.h
+	BroadPhaseCollision/b3OverlappingPairCache.h
+)
+SET(Bullet3CollisionBroadPhaseShared_HDRS
+	BroadPhaseCollision/shared/b3Aabb.h
+)
+
+SET(Bullet3CollisionNarrowPhase_HDRS
+	NarrowPhaseCollision/b3Config.h
+	NarrowPhaseCollision/b3Contact4.h
+	NarrowPhaseCollision/b3ConvexUtility.h
+	NarrowPhaseCollision/b3CpuNarrowPhase.h
+	NarrowPhaseCollision/b3RaycastInfo.h
+	NarrowPhaseCollision/b3RigidBodyCL.h
+)
+SET(Bullet3CollisionNarrowPhaseShared_HDRS
+
+	NarrowPhaseCollision/shared/b3BvhSubtreeInfoData.h
+	NarrowPhaseCollision/shared/b3BvhTraversal.h
+	NarrowPhaseCollision/shared/b3ClipFaces.h
+	NarrowPhaseCollision/shared/b3Collidable.h
+	NarrowPhaseCollision/shared/b3Contact4Data.h
+	NarrowPhaseCollision/shared/b3ContactConvexConvexSAT.h
+	NarrowPhaseCollision/shared/b3ContactSphereSphere.h
+	NarrowPhaseCollision/shared/b3ConvexPolyhedronData.h
+	NarrowPhaseCollision/shared/b3FindConcaveSatAxis.h
+	NarrowPhaseCollision/shared/b3FindSeparatingAxis.h
+	NarrowPhaseCollision/shared/b3MprPenetration.h
+	NarrowPhaseCollision/shared/b3NewContactReduction.h
+	NarrowPhaseCollision/shared/b3QuantizedBvhNodeData.h
+	NarrowPhaseCollision/shared/b3ReduceContacts.h
+	NarrowPhaseCollision/shared/b3RigidBodyData.h
+	NarrowPhaseCollision/shared/b3UpdateAabbs.h
+)
+
+# All headers combined; attached to the target and used as PUBLIC_HEADER for frameworks.
+SET(Bullet3Collision_HDRS
+	${Bullet3CollisionBroadPhase_HDRS}
+	${Bullet3CollisionBroadPhaseShared_HDRS}
+	${Bullet3CollisionNarrowPhaseShared_HDRS}
+	${Bullet3CollisionNarrowPhase_HDRS}
+)
+
+ADD_LIBRARY(Bullet3Collision ${Bullet3Collision_SRCS} ${Bullet3Collision_HDRS})
+# Shared builds link against Bullet3Geometry explicitly.
+if (BUILD_SHARED_LIBS)
+  target_link_libraries(Bullet3Collision Bullet3Geometry)
+endif ()
+SET_TARGET_PROPERTIES(Bullet3Collision PROPERTIES VERSION ${BULLET_VERSION})
+SET_TARGET_PROPERTIES(Bullet3Collision PROPERTIES SOVERSION ${BULLET_VERSION})
+
+IF (INSTALL_LIBS)
+	IF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
+		#FILES_MATCHING requires CMake 2.6
+		# NOTE(review): this compares "<major>.<minor>" with the numeric GREATER operator;
+		# VERSION_GREATER may be the intended comparison - confirm.
+		IF (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 2.5)
+			IF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+				INSTALL(TARGETS Bullet3Collision DESTINATION .)
+			ELSE (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+				INSTALL(TARGETS Bullet3Collision
+					RUNTIME DESTINATION bin
+					LIBRARY DESTINATION lib${LIB_SUFFIX}
+					ARCHIVE DESTINATION lib${LIB_SUFFIX})
+				INSTALL(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+DESTINATION ${INCLUDE_INSTALL_DIR} FILES_MATCHING PATTERN "*.h"  PATTERN
+".svn" EXCLUDE PATTERN "CMakeFiles" EXCLUDE)
+			ENDIF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+		ENDIF (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 2.5)
+
+		IF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+			SET_TARGET_PROPERTIES(Bullet3Collision PROPERTIES FRAMEWORK true)
+			SET_TARGET_PROPERTIES(Bullet3Collision PROPERTIES PUBLIC_HEADER "${Bullet3Collision_HDRS}")
+			# Have to list out sub-directories manually:
+			#todo
+			#SET_PROPERTY(SOURCE ${Bullet3CollisionBroadPhase_HDRS} PROPERTY MACOSX_PACKAGE_LOCATION Headers/BroadPhaseCollision)
+
+		ENDIF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+	ENDIF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
+ENDIF (INSTALL_LIBS)

+ 39 - 0
Dependencies/include/bullet3/Bullet3Collision/NarrowPhaseCollision/b3Config.h

@@ -0,0 +1,39 @@
+#ifndef B3_CONFIG_H
+#define B3_CONFIG_H
+
+// Capacity limits with built-in defaults. Several defaults are derived from
+// m_maxConvexBodies; all members are plain ints and can be overwritten after construction.
+struct b3Config
+{
+	int m_maxConvexBodies;
+	int m_maxConvexShapes;
+	int m_maxBroadphasePairs;
+	int m_maxContactCapacity;
+	int m_compoundPairCapacity;
+
+	int m_maxVerticesPerFace;
+	int m_maxFacesPerShape;
+	int m_maxConvexVertices;
+	int m_maxConvexIndices;
+	int m_maxConvexUniqueEdges;
+
+	int m_maxCompoundChildShapes;
+
+	int m_maxTriConvexPairCapacity;
+
+	// Initializers are listed in declaration order, which is also the order in which
+	// they run, so the derived values below read already-initialized members.
+	b3Config()
+		: m_maxConvexBodies(128 * 1024),
+		  m_maxConvexShapes(m_maxConvexBodies),          // one shape per body
+		  m_maxBroadphasePairs(16 * m_maxConvexBodies),  // 16 pairs per body
+		  m_maxContactCapacity(m_maxBroadphasePairs),    // one contact per pair
+		  m_compoundPairCapacity(1024 * 1024),
+		  m_maxVerticesPerFace(64),
+		  m_maxFacesPerShape(12),
+		  m_maxConvexVertices(8192),
+		  m_maxConvexIndices(81920),
+		  m_maxConvexUniqueEdges(8192),
+		  m_maxCompoundChildShapes(8192),
+		  m_maxTriConvexPairCapacity(256 * 1024)
+	{
+	}
+};
+
+#endif  //B3_CONFIG_H

+ 55 - 0
Dependencies/include/bullet3/Bullet3Collision/NarrowPhaseCollision/b3Contact4.h

@@ -0,0 +1,55 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2013 Erwin Coumans  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef B3_CONTACT4_H
+#define B3_CONTACT4_H
+
+#include "Bullet3Common/b3Vector3.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h"
+
+// Accessor layer over b3Contact4Data: body ids with a "fixed" flag carried in the sign,
+// 16-bit encoded restitution/friction coefficients, and contact point queries.
+B3_ATTRIBUTE_ALIGNED16(struct)
+b3Contact4 : public b3Contact4Data
+{
+	B3_DECLARE_ALIGNED_ALLOCATOR();
+
+	// Body ids are the magnitudes of the stored values.
+	int getBodyA() const
+	{
+		return abs(m_bodyAPtrAndSignBit);
+	}
+	int getBodyB() const
+	{
+		return abs(m_bodyBPtrAndSignBit);
+	}
+
+	// A negative stored value marks the body as fixed.
+	bool isBodyAFixed() const
+	{
+		return m_bodyAPtrAndSignBit < 0;
+	}
+	bool isBodyBFixed() const
+	{
+		return m_bodyBPtrAndSignBit < 0;
+	}
+
+	//	todo. make it safer
+	int& getBatchIdx()
+	{
+		return m_batchIdx;
+	}
+	const int& getBatchIdx() const
+	{
+		return m_batchIdx;
+	}
+
+	// Coefficients are stored scaled by 0xffff; getters divide the stored value by 0xffff.
+	float getRestituitionCoeff() const
+	{
+		return static_cast<float>(m_restituitionCoeffCmp) / static_cast<float>(0xffff);
+	}
+	void setRestituitionCoeff(float c)
+	{
+		b3Assert(c >= 0.f && c <= 1.f);
+		m_restituitionCoeffCmp = static_cast<unsigned short>(c * 0xffff);
+	}
+
+	float getFrictionCoeff() const
+	{
+		return static_cast<float>(m_frictionCoeffCmp) / static_cast<float>(0xffff);
+	}
+	void setFrictionCoeff(float c)
+	{
+		b3Assert(c >= 0.f && c <= 1.f);
+		m_frictionCoeffCmp = static_cast<unsigned short>(c * 0xffff);
+	}
+
+	//float& getNPoints() { return m_worldNormal[3]; }
+	// The point count is read from the w component of m_worldNormalOnB.
+	int getNPoints() const
+	{
+		return static_cast<int>(m_worldNormalOnB.w);
+	}
+
+	// The penetration of point idx is read from the w component of m_worldPosB[idx].
+	float getPenetration(int idx) const
+	{
+		return m_worldPosB[idx].w;
+	}
+
+	// A contact is invalid when either body id is zero.
+	bool isInvalid() const
+	{
+		if (getBodyA() == 0)
+		{
+			return true;
+		}
+		return getBodyB() == 0;
+	}
+};
+
+#endif  //B3_CONTACT4_H

+ 500 - 0
Dependencies/include/bullet3/Bullet3Collision/NarrowPhaseCollision/b3ConvexUtility.cpp

@@ -0,0 +1,500 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Erwin Coumans
+
+#include "b3ConvexUtility.h"
+#include "Bullet3Geometry/b3ConvexHullComputer.h"
+#include "Bullet3Geometry/b3GrahamScan2dConvexHull.h"
+#include "Bullet3Common/b3Quaternion.h"
+#include "Bullet3Common/b3HashMap.h"
+
+// Out-of-line destructor with an empty body; the member arrays clean up
+// after themselves.
+b3ConvexUtility::~b3ConvexUtility() {}
+
+// Builds the polyhedral description (m_vertices, m_faces) of the convex hull
+// of the given points, then calls initialize() to derive the remaining data.
+//
+// orgVertices            input points; read with a stride of sizeof(b3Vector3)
+// numPoints              number of input points
+// mergeCoplanarTriangles not read anywhere in this body: the coplanar merge
+//                        below is always attempted
+//
+// Always returns true. m_vertices is resized and overwritten, while m_faces
+// is only appended to (push_back) and is not cleared here.
+bool b3ConvexUtility::initializePolyhedralFeatures(const b3Vector3* orgVertices, int numPoints, bool mergeCoplanarTriangles)
+{
+	b3ConvexHullComputer conv;
+	conv.compute(&orgVertices[0].getX(), sizeof(b3Vector3), numPoints, 0.f, 0.f);
+
+	b3AlignedObjectArray<b3Vector3> faceNormals;
+	int numFaces = conv.faces.size();
+	faceNormals.resize(numFaces);
+	b3ConvexHullComputer* convexUtil = &conv;
+
+	b3AlignedObjectArray<b3MyFace> tmpFaces;
+	tmpFaces.resize(numFaces);
+
+	// Copy the hull vertices; face indices below refer to this array.
+	int numVertices = convexUtil->vertices.size();
+	m_vertices.resize(numVertices);
+	for (int p = 0; p < numVertices; p++)
+	{
+		m_vertices[p] = convexUtil->vertices[p];
+	}
+
+	// Pass 1: for every hull face collect its vertex loop and plane equation.
+	for (int i = 0; i < numFaces; i++)
+	{
+		int face = convexUtil->faces[i];
+		//printf("face=%d\n",face);
+		const b3ConvexHullComputer::Edge* firstEdge = &convexUtil->edges[face];
+		const b3ConvexHullComputer::Edge* edge = firstEdge;
+
+		b3Vector3 edges[3];
+		int numEdges = 0;
+		//compute face normals
+
+		// Walk the edge loop of the face once. Every source vertex is recorded,
+		// but only the first two normalized edges are kept for the normal.
+		do
+		{
+			int src = edge->getSourceVertex();
+			tmpFaces[i].m_indices.push_back(src);
+			int targ = edge->getTargetVertex();
+			b3Vector3 wa = convexUtil->vertices[src];
+
+			b3Vector3 wb = convexUtil->vertices[targ];
+			b3Vector3 newEdge = wb - wa;
+			newEdge.normalize();
+			if (numEdges < 2)
+				edges[numEdges++] = newEdge;
+
+			edge = edge->getNextEdgeOfFace();
+		} while (edge != firstEdge);
+
+		// Large sentinel; replaced below by the minimum vertex/normal dot product.
+		b3Scalar planeEq = 1e30f;
+
+		if (numEdges == 2)
+		{
+			faceNormals[i] = edges[0].cross(edges[1]);
+			faceNormals[i].normalize();
+			tmpFaces[i].m_plane[0] = faceNormals[i].getX();
+			tmpFaces[i].m_plane[1] = faceNormals[i].getY();
+			tmpFaces[i].m_plane[2] = faceNormals[i].getZ();
+			tmpFaces[i].m_plane[3] = planeEq;
+		}
+		else
+		{
+			// NOTE(review): this branch does not assign m_plane[0..2]; b3MyFace
+			// declares no constructor, so they appear to stay unset here.
+			b3Assert(0);  //degenerate?
+			faceNormals[i].setZero();
+		}
+
+		// Plane constant = -(smallest projection of the face vertices on the normal).
+		for (int v = 0; v < tmpFaces[i].m_indices.size(); v++)
+		{
+			b3Scalar eq = m_vertices[tmpFaces[i].m_indices[v]].dot(faceNormals[i]);
+			if (planeEq > eq)
+			{
+				planeEq = eq;
+			}
+		}
+		tmpFaces[i].m_plane[3] = -planeEq;
+	}
+
+	//merge coplanar faces and copy them to m_polyhedron
+
+	// Pass 2: repeatedly take the last pending face, gather all other pending
+	// faces whose normal is nearly parallel (dot > faceWeldThreshold), and try
+	// to replace the group with a single polygon.
+	b3Scalar faceWeldThreshold = 0.999f;
+	b3AlignedObjectArray<int> todoFaces;
+	for (int i = 0; i < tmpFaces.size(); i++)
+		todoFaces.push_back(i);
+
+	while (todoFaces.size())
+	{
+		b3AlignedObjectArray<int> coplanarFaceGroup;
+		int refFace = todoFaces[todoFaces.size() - 1];
+
+		coplanarFaceGroup.push_back(refFace);
+		b3MyFace& faceA = tmpFaces[refFace];
+		todoFaces.pop_back();
+
+		b3Vector3 faceNormalA = b3MakeVector3(faceA.m_plane[0], faceA.m_plane[1], faceA.m_plane[2]);
+		// Iterates from the back while removing matching entries from todoFaces.
+		for (int j = todoFaces.size() - 1; j >= 0; j--)
+		{
+			int i = todoFaces[j];
+			b3MyFace& faceB = tmpFaces[i];
+			b3Vector3 faceNormalB = b3MakeVector3(faceB.m_plane[0], faceB.m_plane[1], faceB.m_plane[2]);
+			if (faceNormalA.dot(faceNormalB) > faceWeldThreshold)
+			{
+				coplanarFaceGroup.push_back(i);
+				todoFaces.remove(i);
+			}
+		}
+
+		bool did_merge = false;
+		if (coplanarFaceGroup.size() > 1)
+		{
+			//do the merge: use Graham Scan 2d convex hull
+
+			b3AlignedObjectArray<b3GrahamVector3> orgpoints;
+			b3Vector3 averageFaceNormal = b3MakeVector3(0, 0, 0);
+
+			// Collect the distinct vertices of the group (deduplicated by original
+			// index) and accumulate the group's normals.
+			for (int i = 0; i < coplanarFaceGroup.size(); i++)
+			{
+				//				m_polyhedron->m_faces.push_back(tmpFaces[coplanarFaceGroup[i]]);
+
+				b3MyFace& face = tmpFaces[coplanarFaceGroup[i]];
+				b3Vector3 faceNormal = b3MakeVector3(face.m_plane[0], face.m_plane[1], face.m_plane[2]);
+				averageFaceNormal += faceNormal;
+				for (int f = 0; f < face.m_indices.size(); f++)
+				{
+					int orgIndex = face.m_indices[f];
+					b3Vector3 pt = m_vertices[orgIndex];
+
+					bool found = false;
+
+					// Note: this loop variable shadows the enclosing loop's i.
+					for (int i = 0; i < orgpoints.size(); i++)
+					{
+						//if ((orgpoints[i].m_orgIndex == orgIndex) || ((rotatedPt-orgpoints[i]).length2()<0.0001))
+						if (orgpoints[i].m_orgIndex == orgIndex)
+						{
+							found = true;
+							break;
+						}
+					}
+					if (!found)
+						orgpoints.push_back(b3GrahamVector3(pt, orgIndex));
+				}
+			}
+
+			// The merged face reuses the plane of the first face in the group.
+			b3MyFace combinedFace;
+			for (int i = 0; i < 4; i++)
+				combinedFace.m_plane[i] = tmpFaces[coplanarFaceGroup[0]].m_plane[i];
+
+			b3AlignedObjectArray<b3GrahamVector3> hull;
+
+			averageFaceNormal.normalize();
+			b3GrahamScanConvexHull2D(orgpoints, hull, averageFaceNormal);
+
+			// Hull points become the merged face's vertex loop; each one is marked
+			// in orgpoints (m_orgIndex = -1) so leftovers can be detected below.
+			for (int i = 0; i < hull.size(); i++)
+			{
+				combinedFace.m_indices.push_back(hull[i].m_orgIndex);
+				for (int k = 0; k < orgpoints.size(); k++)
+				{
+					if (orgpoints[k].m_orgIndex == hull[i].m_orgIndex)
+					{
+						orgpoints[k].m_orgIndex = -1;  // invalidate...
+						break;
+					}
+				}
+			}
+
+			// are there rejected vertices?
+			// A vertex dropped by the 2D hull may only disappear if no face outside
+			// the current group still references it; otherwise the merge is refused.
+			bool reject_merge = false;
+
+			for (int i = 0; i < orgpoints.size(); i++)
+			{
+				if (orgpoints[i].m_orgIndex == -1)
+					continue;  // this is in the hull...
+				// this vertex is rejected -- is anybody else using this vertex?
+				for (int j = 0; j < tmpFaces.size(); j++)
+				{
+					b3MyFace& face = tmpFaces[j];
+					// is this a face of the current coplanar group?
+					bool is_in_current_group = false;
+					for (int k = 0; k < coplanarFaceGroup.size(); k++)
+					{
+						if (coplanarFaceGroup[k] == j)
+						{
+							is_in_current_group = true;
+							break;
+						}
+					}
+					if (is_in_current_group)  // ignore this face...
+						continue;
+					// does this face use this rejected vertex?
+					for (int v = 0; v < face.m_indices.size(); v++)
+					{
+						if (face.m_indices[v] == orgpoints[i].m_orgIndex)
+						{
+							// this rejected vertex is used in another face -- reject merge
+							reject_merge = true;
+							break;
+						}
+					}
+					if (reject_merge)
+						break;
+				}
+				if (reject_merge)
+					break;
+			}
+
+			if (!reject_merge)
+			{
+				// do this merge!
+				did_merge = true;
+				m_faces.push_back(combinedFace);
+			}
+		}
+		// No merge (single face or rejected): emit the group's faces unchanged.
+		if (!did_merge)
+		{
+			for (int i = 0; i < coplanarFaceGroup.size(); i++)
+			{
+				b3MyFace face = tmpFaces[coplanarFaceGroup[i]];
+				m_faces.push_back(face);
+			}
+		}
+	}
+
+	initialize();
+
+	return true;
+}
+
+// True when no component of v has a magnitude above 1e-6.
+// Written as a negated "any component too large" test so that the result for
+// NaN components stays the same as with the individual '>' comparisons.
+inline bool IsAlmostZero(const b3Vector3& v)
+{
+	const bool anyComponentTooLarge =
+		fabsf(v.getX()) > 1e-6 ||
+		fabsf(v.getY()) > 1e-6 ||
+		fabsf(v.getZ()) > 1e-6;
+	return !anyComponentTooLarge;
+}
+
+// Hash-map key for an undirected edge: the two vertex indices are stored in
+// a canonical order (larger one in m_v0, smaller one in m_v1), so (a, b) and
+// (b, a) produce the same key.
+struct b3InternalVertexPair
+{
+	b3InternalVertexPair(short int v0, short int v1)
+		: m_v0(v1 > v0 ? v1 : v0),
+		  m_v1(v1 > v0 ? v0 : v1)
+	{
+	}
+
+	short int m_v0;
+	short int m_v1;
+
+	// Combines both indices into one int: m_v1 in the upper 16 bits, m_v0 added on top.
+	int getHash() const
+	{
+		const int upperPart = m_v1 << 16;
+		return m_v0 + upperPart;
+	}
+
+	// Two keys match when both stored indices match.
+	bool equals(const b3InternalVertexPair& other) const
+	{
+		if (m_v0 != other.m_v0)
+			return false;
+		return m_v1 == other.m_v1;
+	}
+};
+
+// Value stored per edge in the edge hash map: the indices of the (up to two)
+// faces that share the edge. -1 means "no face recorded yet".
+struct b3InternalEdge
+{
+	short int m_face0;
+	short int m_face1;
+
+	b3InternalEdge()
+	{
+		m_face0 = -1;
+		m_face1 = -1;
+	}
+};
+
+//
+
+#ifdef TEST_INTERNAL_OBJECTS
+// Returns true when all eight corners of the box centred at m_localCenter
+// with half-sizes m_extents lie on or behind every face plane of the hull.
+bool b3ConvexUtility::testContainment() const
+{
+	for (int corner = 0; corner < 8; corner++)
+	{
+		// Bits 2, 1 and 0 of the corner number select a negative x, y and z
+		// offset respectively (corner 0 is +,+,+ and corner 7 is -,-,-).
+		const b3Scalar offsetX = (corner & 4) ? -m_extents[0] : m_extents[0];
+		const b3Scalar offsetY = (corner & 2) ? -m_extents[1] : m_extents[1];
+		const b3Scalar offsetZ = (corner & 1) ? -m_extents[2] : m_extents[2];
+		const b3Vector3 LocalPt = m_localCenter + b3Vector3(offsetX, offsetY, offsetZ);
+
+		const int faceCount = m_faces.size();
+		for (int f = 0; f < faceCount; f++)
+		{
+			const b3MyFace& face = m_faces[f];
+			const b3Vector3 Normal(face.m_plane[0], face.m_plane[1], face.m_plane[2]);
+			const b3Scalar signedDistance = LocalPt.dot(Normal) + face.m_plane[3];
+			// A positive signed distance puts the corner in front of this face.
+			if (signedDistance > 0.0f)
+				return false;
+		}
+	}
+	return true;
+}
+#endif
+
+// Derives secondary data from m_vertices / m_faces:
+//  - m_uniqueEdges: one entry per distinct edge direction (a direction and
+//    its opposite count as the same); entries are only appended here
+//  - m_localCenter: area-weighted average of the centroids of the triangles
+//    obtained by fanning each face from its first vertex
+//  - with USE_CONNECTED_FACES: per-face neighbour indices
+//  - with TEST_INTERNAL_OBJECTS: m_radius, mC, mE and m_extents
+// NOTE(review): m_localCenter is divided by TotalArea, which stays 0 when
+// there are no faces (or only zero-area triangles) — confirm callers never
+// reach this with an empty hull.
+void b3ConvexUtility::initialize()
+{
+	// Maps an undirected vertex pair to the faces sharing that edge. In this
+	// function it is only read back inside the USE_CONNECTED_FACES section.
+	b3HashMap<b3InternalVertexPair, b3InternalEdge> edges;
+
+	b3Scalar TotalArea = 0.0f;
+
+	m_localCenter.setValue(0, 0, 0);
+	for (int i = 0; i < m_faces.size(); i++)
+	{
+		int numVertices = m_faces[i].m_indices.size();
+		int NbTris = numVertices;
+		for (int j = 0; j < NbTris; j++)
+		{
+			// Edge from vertex j to the next vertex of the face (wrapping around).
+			int k = (j + 1) % numVertices;
+			// b3InternalVertexPair takes short int arguments, so the int indices
+			// are narrowed here.
+			b3InternalVertexPair vp(m_faces[i].m_indices[j], m_faces[i].m_indices[k]);
+			b3InternalEdge* edptr = edges.find(vp);
+			b3Vector3 edge = m_vertices[vp.m_v1] - m_vertices[vp.m_v0];
+			edge.normalize();
+
+			bool found = false;
+			b3Vector3 diff, diff2;
+
+			// Linear search for an already recorded direction equal to +edge or -edge.
+			for (int p = 0; p < m_uniqueEdges.size(); p++)
+			{
+				diff = m_uniqueEdges[p] - edge;
+				diff2 = m_uniqueEdges[p] + edge;
+
+				//	if ((diff.length2()==0.f) ||
+				//	(diff2.length2()==0.f))
+
+				if (IsAlmostZero(diff) ||
+					IsAlmostZero(diff2))
+				{
+					found = true;
+					break;
+				}
+			}
+
+			if (!found)
+			{
+				m_uniqueEdges.push_back(edge);
+			}
+
+			// First face seen for an edge goes to m_face0, a later one to m_face1.
+			if (edptr)
+			{
+				//TBD: figure out why I added this assert
+				//				b3Assert(edptr->m_face0>=0);
+				//			b3Assert(edptr->m_face1<0);
+				edptr->m_face1 = i;
+			}
+			else
+			{
+				b3InternalEdge ed;
+				ed.m_face0 = i;
+				edges.insert(vp, ed);
+			}
+		}
+	}
+
+#ifdef USE_CONNECTED_FACES
+	// For every face edge, store the index of the face on the other side.
+	for (int i = 0; i < m_faces.size(); i++)
+	{
+		int numVertices = m_faces[i].m_indices.size();
+		m_faces[i].m_connectedFaces.resize(numVertices);
+
+		for (int j = 0; j < numVertices; j++)
+		{
+			int k = (j + 1) % numVertices;
+			b3InternalVertexPair vp(m_faces[i].m_indices[j], m_faces[i].m_indices[k]);
+			b3InternalEdge* edptr = edges.find(vp);
+			b3Assert(edptr);
+			b3Assert(edptr->m_face0 >= 0);
+			b3Assert(edptr->m_face1 >= 0);
+
+			int connectedFace = (edptr->m_face0 == i) ? edptr->m_face1 : edptr->m_face0;
+			m_faces[i].m_connectedFaces[j] = connectedFace;
+		}
+	}
+#endif  //USE_CONNECTED_FACES
+
+	// Fan-triangulate each face around its first vertex and accumulate
+	// area-weighted triangle centroids.
+	for (int i = 0; i < m_faces.size(); i++)
+	{
+		int numVertices = m_faces[i].m_indices.size();
+		int NbTris = numVertices - 2;
+
+		const b3Vector3& p0 = m_vertices[m_faces[i].m_indices[0]];
+		for (int j = 1; j <= NbTris; j++)
+		{
+			int k = (j + 1) % numVertices;
+			const b3Vector3& p1 = m_vertices[m_faces[i].m_indices[j]];
+			const b3Vector3& p2 = m_vertices[m_faces[i].m_indices[k]];
+			b3Scalar Area = ((p0 - p1).cross(p0 - p2)).length() * 0.5f;
+			b3Vector3 Center = (p0 + p1 + p2) / 3.0f;
+			m_localCenter += Area * Center;
+			TotalArea += Area;
+		}
+	}
+	m_localCenter /= TotalArea;
+
+#ifdef TEST_INTERNAL_OBJECTS
+	if (1)
+	{
+		// m_radius = smallest distance from m_localCenter to any face plane.
+		m_radius = FLT_MAX;
+		for (int i = 0; i < m_faces.size(); i++)
+		{
+			const b3Vector3 Normal(m_faces[i].m_plane[0], m_faces[i].m_plane[1], m_faces[i].m_plane[2]);
+			const b3Scalar dist = b3Fabs(m_localCenter.dot(Normal) + m_faces[i].m_plane[3]);
+			if (dist < m_radius)
+				m_radius = dist;
+		}
+
+		// Axis-aligned bounds of all vertices.
+		b3Scalar MinX = FLT_MAX;
+		b3Scalar MinY = FLT_MAX;
+		b3Scalar MinZ = FLT_MAX;
+		b3Scalar MaxX = -FLT_MAX;
+		b3Scalar MaxY = -FLT_MAX;
+		b3Scalar MaxZ = -FLT_MAX;
+		for (int i = 0; i < m_vertices.size(); i++)
+		{
+			const b3Vector3& pt = m_vertices[i];
+			if (pt.getX() < MinX) MinX = pt.getX();
+			if (pt.getX() > MaxX) MaxX = pt.getX();
+			if (pt.getY() < MinY) MinY = pt.getY();
+			if (pt.getY() > MaxY) MaxY = pt.getY();
+			if (pt.getZ() < MinZ) MinZ = pt.getZ();
+			if (pt.getZ() > MaxZ) MaxZ = pt.getZ();
+		}
+		// mC holds max + min and mE holds max - min per axis (neither is halved).
+		mC.setValue(MaxX + MinX, MaxY + MinY, MaxZ + MinZ);
+		mE.setValue(MaxX - MinX, MaxY - MinY, MaxZ - MinZ);
+
+		// Start from a cube of half-size r, stretched along the longest axis to
+		// half the bounding size, then shrink that axis in up to 1024 steps until
+		// testContainment() accepts the box.
+		//		const b3Scalar r = m_radius / sqrtf(2.0f);
+		const b3Scalar r = m_radius / sqrtf(3.0f);
+		const int LargestExtent = mE.maxAxis();
+		const b3Scalar Step = (mE[LargestExtent] * 0.5f - r) / 1024.0f;
+		m_extents[0] = m_extents[1] = m_extents[2] = r;
+		m_extents[LargestExtent] = mE[LargestExtent] * 0.5f;
+		bool FoundBox = false;
+		for (int j = 0; j < 1024; j++)
+		{
+			if (testContainment())
+			{
+				FoundBox = true;
+				break;
+			}
+
+			m_extents[LargestExtent] -= Step;
+		}
+		if (!FoundBox)
+		{
+			// Fall back to the plain cube of half-size r.
+			m_extents[0] = m_extents[1] = m_extents[2] = r;
+		}
+		else
+		{
+			// Refine the box
+			const b3Scalar Step = (m_radius - r) / 1024.0f;
+			// (1 << axis) & 3 maps 0 -> 1, 1 -> 2, 2 -> 0, so e0 and e1 are the
+			// two axes other than LargestExtent.
+			const int e0 = (1 << LargestExtent) & 3;
+			const int e1 = (1 << e0) & 3;
+
+			// Grow both remaining axes together until containment fails, then
+			// restore the last accepted sizes.
+			for (int j = 0; j < 1024; j++)
+			{
+				const b3Scalar Saved0 = m_extents[e0];
+				const b3Scalar Saved1 = m_extents[e1];
+				m_extents[e0] += Step;
+				m_extents[e1] += Step;
+
+				if (!testContainment())
+				{
+					m_extents[e0] = Saved0;
+					m_extents[e1] = Saved1;
+					break;
+				}
+			}
+		}
+	}
+#endif
+}

+ 55 - 0
Dependencies/include/bullet3/Bullet3Collision/NarrowPhaseCollision/b3ConvexUtility.h

@@ -0,0 +1,55 @@
+
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Erwin Coumans
+
+#ifndef _BT_CONVEX_UTILITY_H
+#define _BT_CONVEX_UTILITY_H
+
+#include "Bullet3Common/b3AlignedObjectArray.h"
+#include "Bullet3Common/b3Transform.h"
+
+// One polygonal face of a convex hull as used by b3ConvexUtility.
+struct b3MyFace
+{
+	// Indices of the face's vertices, in loop order.
+	b3AlignedObjectArray<int> m_indices;
+	// Plane of the face: [0..2] hold the normal's x, y, z and [3] the plane constant.
+	b3Scalar m_plane[4];
+};
+
+// Holds the vertices, faces and unique edge directions of a convex hull,
+// plus a few derived quantities (centre, internal box, radius).
+B3_ATTRIBUTE_ALIGNED16(class)
+b3ConvexUtility
+{
+public:
+	B3_DECLARE_ALIGNED_ALLOCATOR();
+
+	// Centre of the hull; computed by initialize().
+	b3Vector3 m_localCenter;
+	// The following four values are only computed by initialize() when
+	// TEST_INTERNAL_OBJECTS is defined; otherwise they keep the zero values
+	// set by the constructor.
+	b3Vector3 m_extents;
+	b3Vector3 mC;
+	b3Vector3 mE;
+	b3Scalar m_radius;
+
+	b3AlignedObjectArray<b3Vector3> m_vertices;
+	b3AlignedObjectArray<b3MyFace> m_faces;
+	b3AlignedObjectArray<b3Vector3> m_uniqueEdges;
+
+	// Starts with well-defined (zero) derived values. Consumers copy these
+	// fields even when initialize() never assigned them, so they must not be
+	// left indeterminate.
+	b3ConvexUtility()
+		: m_radius(0.f)
+	{
+		m_localCenter.setValue(0, 0, 0);
+		m_extents.setValue(0, 0, 0);
+		mC.setValue(0, 0, 0);
+		mE.setValue(0, 0, 0);
+	}
+	virtual ~b3ConvexUtility();
+
+	// Computes the convex hull of the given points and fills m_vertices and
+	// m_faces, then calls initialize().
+	bool initializePolyhedralFeatures(const b3Vector3* orgVertices, int numVertices, bool mergeCoplanarTriangles = true);
+
+	// Derives m_uniqueEdges, m_localCenter and the optional internal-box data
+	// from m_vertices / m_faces.
+	void initialize();
+	// NOTE(review): only defined when TEST_INTERNAL_OBJECTS is set in the
+	// accompanying .cpp; calling it otherwise would fail to link.
+	bool testContainment() const;
+};
+#endif

+ 297 - 0
Dependencies/include/bullet3/Bullet3Collision/NarrowPhaseCollision/b3CpuNarrowPhase.cpp

@@ -0,0 +1,297 @@
+#include "b3CpuNarrowPhase.h"
+#include "Bullet3Collision/NarrowPhaseCollision/b3ConvexUtility.h"
+#include "Bullet3Collision/NarrowPhaseCollision/b3Config.h"
+
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3ConvexPolyhedronData.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3ContactConvexConvexSAT.h"
+
+// Private state of b3CpuNarrowPhase, kept behind a pointer so the header
+// does not need the full definitions.
+struct b3CpuNarrowPhaseInternalData
+{
+	// One AABB is appended per successfully registered convex hull.
+	b3AlignedObjectArray<b3Aabb> m_localShapeAABBCPU;
+	// Grows by one entry per allocateCollidable() call.
+	b3AlignedObjectArray<b3Collidable> m_collidablesCPU;
+	// Pointer passed in at registration, indexed by shape index (not owned here).
+	b3AlignedObjectArray<b3ConvexUtility*> m_convexData;
+	b3Config m_config;
+
+	// Flattened convex data: each b3ConvexPolyhedronData stores offsets/counts
+	// into the shared edge, vertex, index and face arrays below.
+	b3AlignedObjectArray<b3ConvexPolyhedronData> m_convexPolyhedra;
+	b3AlignedObjectArray<b3Vector3> m_uniqueEdges;
+	b3AlignedObjectArray<b3Vector3> m_convexVertices;
+	b3AlignedObjectArray<int> m_convexIndices;
+	b3AlignedObjectArray<b3GpuFace> m_convexFaces;
+
+	// Output of the most recent computeContacts() call.
+	b3AlignedObjectArray<b3Contact4Data> m_contacts;
+
+	// Number of registered convex shapes; also the next shape index to hand out.
+	int m_numAcceleratedShapes;
+};
+
+// Read-only view of the contacts produced by the last computeContacts() call.
+const b3AlignedObjectArray<b3Contact4Data>& b3CpuNarrowPhase::getContacts() const
+{
+	const b3AlignedObjectArray<b3Contact4Data>& contacts = m_data->m_contacts;
+	return contacts;
+}
+
+// Mutable access to the collidable stored at collidableIndex.
+b3Collidable& b3CpuNarrowPhase::getCollidableCpu(int collidableIndex)
+{
+	b3AlignedObjectArray<b3Collidable>& collidables = m_data->m_collidablesCPU;
+	return collidables[collidableIndex];
+}
+
+// Read-only access to the collidable stored at collidableIndex.
+const b3Collidable& b3CpuNarrowPhase::getCollidableCpu(int collidableIndex) const
+{
+	const b3Collidable& collidable = m_data->m_collidablesCPU[collidableIndex];
+	return collidable;
+}
+
+// Creates the narrow phase with a private copy of the given configuration
+// and no registered shapes.
+// The three index members previously stayed uninitialized although
+// getStatic0Index() hands one of them out; they now start at -1 ("none").
+b3CpuNarrowPhase::b3CpuNarrowPhase(const struct b3Config& config)
+	: m_data(0),
+	  m_acceleratedCompanionShapeIndex(-1),
+	  m_planeBodyIndex(-1),
+	  m_static0Index(-1)
+{
+	m_data = new b3CpuNarrowPhaseInternalData;
+	m_data->m_config = config;
+	m_data->m_numAcceleratedShapes = 0;
+}
+
+// Releases the internal data block allocated by the constructor.
+b3CpuNarrowPhase::~b3CpuNarrowPhase()
+{
+	b3CpuNarrowPhaseInternalData* internalData = m_data;
+	delete internalData;
+}
+
+// Generates contacts for the given broadphase pairs.
+//
+// pairs            x / y hold the two body indices; z receives the contact
+//                  index when a contact was produced for the pair
+// aabbsWorldSpace  not used by this implementation
+// bodies           rigid bodies; only m_collidableIdx is read here
+//
+// Only convex-hull vs convex-hull pairs are processed. The other shape
+// combinations (sphere/convex, plane/convex, compound/compound and
+// plane/compound, in either order) currently produce no contacts.
+// After the call the contact array holds exactly the generated contacts.
+void b3CpuNarrowPhase::computeContacts(b3AlignedObjectArray<b3Int4>& pairs, b3AlignedObjectArray<b3Aabb>& aabbsWorldSpace, b3AlignedObjectArray<b3RigidBodyData>& bodies)
+{
+	int pairCount = pairs.size();
+	int numContacts = 0;
+	int maxContactCapacity = m_data->m_config.m_maxContactCapacity;
+
+	// Make room for the worst case; trimmed to the real count at the end.
+	m_data->m_contacts.resize(maxContactCapacity);
+
+	for (int pairIndex = 0; pairIndex < pairCount; pairIndex++)
+	{
+		int bodyIndexA = pairs[pairIndex].x;
+		int bodyIndexB = pairs[pairIndex].y;
+		int collidableIndexA = bodies[bodyIndexA].m_collidableIdx;
+		int collidableIndexB = bodies[bodyIndexB].m_collidableIdx;
+
+		const b3Collidable& collidableA = m_data->m_collidablesCPU[collidableIndexA];
+		const b3Collidable& collidableB = m_data->m_collidablesCPU[collidableIndexB];
+
+		const bool bothConvexHulls = collidableA.m_shapeType == SHAPE_CONVEX_HULL &&
+									 collidableB.m_shapeType == SHAPE_CONVEX_HULL;
+		if (!bothConvexHulls)
+			continue;  // no contact generation for this shape combination
+
+		int contactIndex = b3ContactConvexConvexSAT(pairIndex, bodyIndexA, bodyIndexB, collidableIndexA, collidableIndexB, bodies,
+													m_data->m_collidablesCPU, m_data->m_convexPolyhedra, m_data->m_convexVertices, m_data->m_uniqueEdges, m_data->m_convexIndices, m_data->m_convexFaces, m_data->m_contacts, numContacts, maxContactCapacity);
+
+		// A negative result leaves the pair's z component untouched.
+		if (contactIndex >= 0)
+		{
+			pairs[pairIndex].z = contactIndex;
+		}
+	}
+
+	m_data->m_contacts.resize(numContacts);
+}
+
+// Registers an already built convex hull as a new collidable.
+//
+// utilPtr  hull description; its m_localCenter is overwritten with the
+//          average of its vertices before the data is copied
+// Returns the new collidable index, or a negative value when no collidable
+// could be allocated.
+int b3CpuNarrowPhase::registerConvexHullShape(b3ConvexUtility* utilPtr)
+{
+	int collidableIndex = allocateCollidable();
+	if (collidableIndex < 0)
+		return collidableIndex;
+
+	b3Collidable& col = m_data->m_collidablesCPU[collidableIndex];
+	col.m_shapeType = SHAPE_CONVEX_HULL;
+	col.m_shapeIndex = -1;
+
+	const int numVertices = utilPtr->m_vertices.size();
+
+	{
+		b3Vector3 localCenter = b3MakeVector3(0, 0, 0);
+		for (int i = 0; i < numVertices; i++)
+			localCenter += utilPtr->m_vertices[i];
+		// An empty hull (the float-array overload allows numVertices == 0) would
+		// otherwise scale by 1/0 and turn the centre into NaN; keep it at zero.
+		if (numVertices > 0)
+			localCenter *= (1.f / numVertices);
+		utilPtr->m_localCenter = localCenter;
+
+		col.m_shapeIndex = registerConvexHullShapeInternal(utilPtr, col);
+	}
+
+	if (col.m_shapeIndex >= 0)
+	{
+		b3Aabb aabb;
+
+		// Start with an inverted box so the first vertex defines both bounds.
+		// With no vertices the box stays inverted.
+		b3Vector3 myAabbMin = b3MakeVector3(1e30f, 1e30f, 1e30f);
+		b3Vector3 myAabbMax = b3MakeVector3(-1e30f, -1e30f, -1e30f);
+
+		for (int i = 0; i < numVertices; i++)
+		{
+			myAabbMin.setMin(utilPtr->m_vertices[i]);
+			myAabbMax.setMax(utilPtr->m_vertices[i]);
+		}
+		aabb.m_min[0] = myAabbMin[0];
+		aabb.m_min[1] = myAabbMin[1];
+		aabb.m_min[2] = myAabbMin[2];
+		aabb.m_minIndices[3] = 0;
+
+		aabb.m_max[0] = myAabbMax[0];
+		aabb.m_max[1] = myAabbMax[1];
+		aabb.m_max[2] = myAabbMax[2];
+		aabb.m_signedMaxIndices[3] = 0;
+
+		m_data->m_localShapeAABBCPU.push_back(aabb);
+	}
+
+	return collidableIndex;
+}
+
+// Appends one collidable slot and returns its index, or logs an error and
+// returns -1 once the configured maximum number of convex shapes is reached.
+int b3CpuNarrowPhase::allocateCollidable()
+{
+	const int newIndex = m_data->m_collidablesCPU.size();
+	if (newIndex >= m_data->m_config.m_maxConvexShapes)
+	{
+		b3Error("allocateCollidable out-of-range %d\n", m_data->m_config.m_maxConvexShapes);
+		return -1;
+	}
+
+	m_data->m_collidablesCPU.expand();
+	return newIndex;
+}
+
+// Builds a convex hull from a strided float array and registers it.
+//
+// vertices       first float of the first vertex (x, y, z are read per vertex)
+// strideInBytes  distance in bytes between consecutive vertices
+// numVertices    number of vertices to read
+// scaling        three per-axis factors applied to every vertex
+// Returns the collidable index reported by the b3ConvexUtility overload.
+//
+// NOTE(review): registerConvexHullShapeInternal() stores the utility pointer
+// in m_convexData, and the utility is deleted below, so that stored pointer
+// must not be dereferenced afterwards — confirm nothing does.
+int b3CpuNarrowPhase::registerConvexHullShape(const float* vertices, int strideInBytes, int numVertices, const float* scaling)
+{
+	b3AlignedObjectArray<b3Vector3> scaledPoints;
+
+	// Step through the input byte-wise so arbitrary strides are supported.
+	const unsigned char* rawBytes = (const unsigned char*)vertices;
+	for (int v = 0; v < numVertices; v++)
+	{
+		const float* xyz = (const float*)(rawBytes + v * strideInBytes);
+		scaledPoints.push_back(b3MakeVector3(xyz[0] * scaling[0], xyz[1] * scaling[1], xyz[2] * scaling[2]));
+	}
+
+	b3ConvexUtility* hull = new b3ConvexUtility();
+	if (numVertices != 0)
+	{
+		// Coplanar-triangle merging is always requested.
+		hull->initializePolyhedralFeatures(&scaledPoints[0], scaledPoints.size(), true);
+	}
+
+	const int collidableIndex = registerConvexHullShape(hull);
+
+	delete hull;
+	return collidableIndex;
+}
+
+// Copies a convex hull into the flattened internal arrays and returns the
+// shape index assigned to it.
+//
+// convexPtr  source hull; the pointer itself is also stored in m_convexData
+// col        not used by this implementation
+int b3CpuNarrowPhase::registerConvexHullShapeInternal(b3ConvexUtility* convexPtr, b3Collidable& col)
+{
+	b3CpuNarrowPhaseInternalData& data = *m_data;
+	const int shapeIndex = data.m_numAcceleratedShapes;
+
+	data.m_convexData.resize(shapeIndex + 1);
+	data.m_convexPolyhedra.resize(shapeIndex + 1);
+
+	// The freshly added (last) polyhedron record describes this shape.
+	b3ConvexPolyhedronData& convex = data.m_convexPolyhedra.at(data.m_convexPolyhedra.size() - 1);
+	convex.mC = convexPtr->mC;
+	convex.mE = convexPtr->mE;
+	convex.m_extents = convexPtr->m_extents;
+	convex.m_localCenter = convexPtr->m_localCenter;
+	convex.m_radius = convexPtr->m_radius;
+
+	// Unique edges: append to the shared array, remember offset and count.
+	const int edgeCount = convexPtr->m_uniqueEdges.size();
+	const int edgeOffset = data.m_uniqueEdges.size();
+	convex.m_numUniqueEdges = edgeCount;
+	convex.m_uniqueEdgesOffset = edgeOffset;
+	data.m_uniqueEdges.resize(edgeOffset + convex.m_numUniqueEdges);
+	for (int e = 0; e < edgeCount; e++)
+	{
+		data.m_uniqueEdges[edgeOffset + e] = convexPtr->m_uniqueEdges[e];
+	}
+
+	// Faces: one b3GpuFace per source face, with its indices appended to the
+	// shared index array.
+	const int faceCount = convexPtr->m_faces.size();
+	const int faceOffset = data.m_convexFaces.size();
+	convex.m_faceOffset = faceOffset;
+	convex.m_numFaces = faceCount;
+	data.m_convexFaces.resize(faceOffset + convex.m_numFaces);
+	for (int f = 0; f < faceCount; f++)
+	{
+		const b3MyFace& srcFace = convexPtr->m_faces[f];
+		b3GpuFace& dstFace = data.m_convexFaces[convex.m_faceOffset + f];
+
+		dstFace.m_plane = b3MakeVector3(srcFace.m_plane[0],
+										srcFace.m_plane[1],
+										srcFace.m_plane[2],
+										srcFace.m_plane[3]);
+
+		const int indexOffset = data.m_convexIndices.size();
+		const int numIndices = srcFace.m_indices.size();
+		dstFace.m_numIndices = numIndices;
+		dstFace.m_indexOffset = indexOffset;
+		data.m_convexIndices.resize(indexOffset + numIndices);
+		for (int p = 0; p < numIndices; p++)
+		{
+			data.m_convexIndices[indexOffset + p] = srcFace.m_indices[p];
+		}
+	}
+
+	// Vertices: append to the shared array, remember offset and count.
+	const int vertexCount = convexPtr->m_vertices.size();
+	const int vertexOffset = data.m_convexVertices.size();
+	convex.m_numVertices = vertexCount;
+	convex.m_vertexOffset = vertexOffset;
+	data.m_convexVertices.resize(vertexOffset + convex.m_numVertices);
+	for (int v = 0; v < vertexCount; v++)
+	{
+		data.m_convexVertices[vertexOffset + v] = convexPtr->m_vertices[v];
+	}
+
+	// Remember the source object and hand out the next shape index.
+	data.m_convexData[shapeIndex] = convexPtr;
+	data.m_numAcceleratedShapes = shapeIndex + 1;
+	return shapeIndex;
+}
+
+// Returns the stored AABB at position collidableIndex of the local-shape
+// AABB array.
+const b3Aabb& b3CpuNarrowPhase::getLocalSpaceAabb(int collidableIndex) const
+{
+	const b3AlignedObjectArray<b3Aabb>& localAabbs = m_data->m_localShapeAABBCPU;
+	return localAabbs[collidableIndex];
+}

+ 92 - 0
Dependencies/include/bullet3/Bullet3Collision/NarrowPhaseCollision/b3CpuNarrowPhase.h

@@ -0,0 +1,92 @@
+#ifndef B3_CPU_NARROWPHASE_H
+#define B3_CPU_NARROWPHASE_H
+
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Collidable.h"
+#include "Bullet3Common/b3AlignedObjectArray.h"
+#include "Bullet3Common/b3Vector3.h"
+#include "Bullet3Collision/BroadPhaseCollision/shared/b3Aabb.h"
+#include "Bullet3Common/shared/b3Int4.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h"
+
+// CPU narrow phase: keeps registered collision shapes and computes contacts
+// for broadphase pairs.
+// NOTE(review): b3CpuNarrowPhase.cpp in this commit defines only the
+// constructor, destructor, computeContacts, both registerConvexHullShape
+// overloads, registerConvexHullShapeInternal, allocateCollidable,
+// getContacts, getCollidableCpu and getLocalSpaceAabb. The remaining
+// non-inline members declared here have no definition in that file.
+// NOTE(review): the class owns m_data through a raw pointer and declares no
+// copy constructor or assignment operator, so copying an instance would
+// delete the same data twice.
+class b3CpuNarrowPhase
+{
+protected:
+	// Owned internal state; allocated in the constructor, deleted in the destructor.
+	struct b3CpuNarrowPhaseInternalData* m_data;
+	int m_acceleratedCompanionShapeIndex;
+	int m_planeBodyIndex;
+	int m_static0Index;
+
+	// Copies the hull into the internal arrays and returns its shape index.
+	int registerConvexHullShapeInternal(class b3ConvexUtility* convexPtr, b3Collidable& col);
+	int registerConcaveMeshShape(b3AlignedObjectArray<b3Vector3>* vertices, b3AlignedObjectArray<int>* indices, b3Collidable& col, const float* scaling);
+
+public:
+	// Copies the given configuration into freshly allocated internal state.
+	b3CpuNarrowPhase(const struct b3Config& config);
+
+	virtual ~b3CpuNarrowPhase(void);
+
+	int registerSphereShape(float radius);
+	int registerPlaneShape(const b3Vector3& planeNormal, float planeConstant);
+
+	int registerCompoundShape(b3AlignedObjectArray<b3GpuChildShape>* childShapes);
+	int registerFace(const b3Vector3& faceNormal, float faceConstant);
+
+	int registerConcaveMesh(b3AlignedObjectArray<b3Vector3>* vertices, b3AlignedObjectArray<int>* indices, const float* scaling);
+
+	//do they need to be merged?
+
+	// Both overloads return the new collidable index, or a negative value
+	// when no collidable could be allocated.
+	int registerConvexHullShape(b3ConvexUtility* utilPtr);
+	int registerConvexHullShape(const float* vertices, int strideInBytes, int numVertices, const float* scaling);
+
+	//int registerRigidBody(int collidableIndex, float mass, const float* position, const float* orientation, const float* aabbMin, const float* aabbMax,bool writeToGpu);
+	void setObjectTransform(const float* position, const float* orientation, int bodyIndex);
+
+	void writeAllBodiesToGpu();
+	void reset();
+	void readbackAllBodiesToCpu();
+	bool getObjectTransformFromCpu(float* position, float* orientation, int bodyIndex) const;
+
+	void setObjectTransformCpu(float* position, float* orientation, int bodyIndex);
+	void setObjectVelocityCpu(float* linVel, float* angVel, int bodyIndex);
+
+	//virtual void computeContacts(cl_mem broadphasePairs, int numBroadphasePairs, cl_mem aabbsWorldSpace, int numObjects);
+	// Fills the contact array for the given pairs; the z component of a pair
+	// receives the contact index when one was generated.
+	virtual void computeContacts(b3AlignedObjectArray<b3Int4>& pairs, b3AlignedObjectArray<b3Aabb>& aabbsWorldSpace, b3AlignedObjectArray<b3RigidBodyData>& bodies);
+
+	const struct b3RigidBodyData* getBodiesCpu() const;
+	//struct b3RigidBodyData* getBodiesCpu();
+
+	int getNumBodiesGpu() const;
+
+	int getNumBodyInertiasGpu() const;
+
+	const struct b3Collidable* getCollidablesCpu() const;
+	int getNumCollidablesGpu() const;
+
+	/*const struct b3Contact4* getContactsCPU() const;
+
+	
+	int	getNumContactsGpu() const;
+	*/
+
+	// Contacts produced by the most recent computeContacts() call.
+	const b3AlignedObjectArray<b3Contact4Data>& getContacts() const;
+
+	int getNumRigidBodies() const;
+
+	// Adds one collidable slot; returns its index or -1 when the configured
+	// maximum is reached.
+	int allocateCollidable();
+
+	int getStatic0Index() const
+	{
+		return m_static0Index;
+	}
+	b3Collidable& getCollidableCpu(int collidableIndex);
+	const b3Collidable& getCollidableCpu(int collidableIndex) const;
+
+	// Exposes the internal state read-only.
+	const b3CpuNarrowPhaseInternalData* getInternalData() const
+	{
+		return m_data;
+	}
+
+	const struct b3Aabb& getLocalSpaceAabb(int collidableIndex) const;
+};
+
+#endif  //B3_CPU_NARROWPHASE_H

+ 25 - 0
Dependencies/include/bullet3/Bullet3Collision/NarrowPhaseCollision/b3RaycastInfo.h

@@ -0,0 +1,25 @@
+
+#ifndef B3_RAYCAST_INFO_H
+#define B3_RAYCAST_INFO_H
+
+#include "Bullet3Common/b3Vector3.h"
+
+// Input record for a ray cast, 16-byte aligned.
+// NOTE(review): presumably m_from / m_to are the start and end points of the
+// ray segment — not used in this part of the file, confirm against callers.
+B3_ATTRIBUTE_ALIGNED16(struct)
+b3RayInfo
+{
+	b3Vector3 m_from;
+	b3Vector3 m_to;
+};
+
+// Result record for a ray cast, 16-byte aligned.
+// NOTE(review): field meanings below are inferred from the names only; the
+// code that fills this struct is not part of this file section.
+B3_ATTRIBUTE_ALIGNED16(struct)
+b3RayHit
+{
+	// presumably the fraction along the ray at which the hit occurred — TODO confirm
+	b3Scalar m_hitFraction;
+	// presumably the index of the body that was hit — TODO confirm
+	int m_hitBody;
+	int m_hitResult1;
+	int m_hitResult2;
+	b3Vector3 m_hitPoint;
+	b3Vector3 m_hitNormal;
+};
+
+#endif  //B3_RAYCAST_INFO_H

+ 28 - 0
Dependencies/include/bullet3/Bullet3Collision/NarrowPhaseCollision/b3RigidBodyCL.h

@@ -0,0 +1,28 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2013 Erwin Coumans  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef B3_RIGID_BODY_CL
+#define B3_RIGID_BODY_CL
+
+#include "Bullet3Common/b3Scalar.h"
+#include "Bullet3Common/b3Matrix3x3.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h"
+
+// Accessor: returns the m_invMass field of a rigid body record.
+inline float b3GetInvMass(const b3RigidBodyData& body)
+{
+	const float invMass = body.m_invMass;
+	return invMass;
+}
+
+#endif  //B3_RIGID_BODY_CL

+ 19 - 0
Dependencies/include/bullet3/Bullet3Collision/NarrowPhaseCollision/shared/b3BvhSubtreeInfoData.h

@@ -0,0 +1,19 @@
+
+#ifndef B3_BVH_SUBTREE_INFO_DATA_H
+#define B3_BVH_SUBTREE_INFO_DATA_H
+
+typedef struct b3BvhSubtreeInfoData b3BvhSubtreeInfoData_t;
+
+struct b3BvhSubtreeInfoData  // header for one BVH subtree: quantized bounds, root node index and node count
+{
+	//12 bytes: quantized AABB corners, three 16-bit values each
+	unsigned short int m_quantizedAabbMin[3];
+	unsigned short int m_quantizedAabbMax[3];
+	//4 bytes, points to the root of the subtree
+	int m_rootNodeIndex;
+	//4 bytes; b3BvhTraversal visits node indices in [m_rootNodeIndex, m_rootNodeIndex + m_subtreeSize)
+	int m_subtreeSize;
+	int m_padding[3];  // padding ints; not read by the traversal code in this commit
+};
+
+#endif  //B3_BVH_SUBTREE_INFO_DATA_H

+ 123 - 0
Dependencies/include/bullet3/Bullet3Collision/NarrowPhaseCollision/shared/b3BvhTraversal.h

@@ -0,0 +1,123 @@
+
+
+#include "Bullet3Common/shared/b3Int4.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Collidable.h"
+#include "Bullet3Collision/BroadPhaseCollision/shared/b3Aabb.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3BvhSubtreeInfoData.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3QuantizedBvhNodeData.h"
+
+// work-in-progress: for the broadphase pair pairs[id], walks the quantized BVH of a concave trimesh (body A) and appends one (bodyA, bodyB, triangleIndex, childShapeB) record to concavePairsOut for every leaf node whose quantized AABB overlaps body B's AABB.
+void b3BvhTraversal(__global const b3Int4* pairs,
+					__global const b3RigidBodyData* rigidBodies,
+					__global const b3Collidable* collidables,
+					__global b3Aabb* aabbs,
+					__global b3Int4* concavePairsOut,
+					__global volatile int* numConcavePairsOut,
+					__global const b3BvhSubtreeInfo* subtreeHeadersRoot,
+					__global const b3QuantizedBvhNode* quantizedNodesRoot,
+					__global const b3BvhInfo* bvhInfos,
+					int numPairs,  // not read in this function; id is used without a range check
+					int maxNumConcavePairsCapacity,
+					int id)
+{
+	int bodyIndexA = pairs[id].x;
+	int bodyIndexB = pairs[id].y;
+	int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;
+	int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;
+
+	//once the broadphase avoids static-static pairs, we can remove this test
+	if ((rigidBodies[bodyIndexA].m_invMass == 0) && (rigidBodies[bodyIndexB].m_invMass == 0))
+	{
+		return;
+	}
+
+	if (collidables[collidableIndexA].m_shapeType != SHAPE_CONCAVE_TRIMESH)  // only the A side is tested for being a trimesh
+		return;
+
+	int shapeTypeB = collidables[collidableIndexB].m_shapeType;
+
+	if (shapeTypeB != SHAPE_CONVEX_HULL &&
+		shapeTypeB != SHAPE_SPHERE &&
+		shapeTypeB != SHAPE_COMPOUND_OF_CONVEX_HULLS)
+		return;
+
+	b3BvhInfo bvhInfo = bvhInfos[collidables[collidableIndexA].m_numChildShapes];  // m_numChildShapes shares a union with m_bvhIndex in b3Collidable, so this value is used as the BVH index
+
+	b3Float4 bvhAabbMin = bvhInfo.m_aabbMin;
+	b3Float4 bvhAabbMax = bvhInfo.m_aabbMax;
+	b3Float4 bvhQuantization = bvhInfo.m_quantization;
+	int numSubtreeHeaders = bvhInfo.m_numSubTrees;
+	__global const b3BvhSubtreeInfoData* subtreeHeaders = &subtreeHeadersRoot[bvhInfo.m_subTreeOffset];
+	__global const b3QuantizedBvhNodeData* quantizedNodes = &quantizedNodesRoot[bvhInfo.m_nodeOffset];
+
+	unsigned short int quantizedQueryAabbMin[3];
+	unsigned short int quantizedQueryAabbMax[3];
+	b3QuantizeWithClamp(quantizedQueryAabbMin, aabbs[bodyIndexB].m_minVec, false, bvhAabbMin, bvhAabbMax, bvhQuantization);  // body B's AABB is used as-is; no transform into A's frame happens in this function
+	b3QuantizeWithClamp(quantizedQueryAabbMax, aabbs[bodyIndexB].m_maxVec, true, bvhAabbMin, bvhAabbMax, bvhQuantization);
+
+	for (int i = 0; i < numSubtreeHeaders; i++)
+	{
+		b3BvhSubtreeInfoData subtree = subtreeHeaders[i];
+
+		int overlap = b3TestQuantizedAabbAgainstQuantizedAabbSlow(quantizedQueryAabbMin, quantizedQueryAabbMax, subtree.m_quantizedAabbMin, subtree.m_quantizedAabbMax);
+		if (overlap != 0)
+		{
+			int startNodeIndex = subtree.m_rootNodeIndex;
+			int endNodeIndex = subtree.m_rootNodeIndex + subtree.m_subtreeSize;
+			int curIndex = startNodeIndex;
+			int escapeIndex;
+			int isLeafNode;
+			int aabbOverlap;
+			while (curIndex < endNodeIndex)  // nodes are visited in array order, without a stack
+			{
+				b3QuantizedBvhNodeData rootNode = quantizedNodes[curIndex];
+				aabbOverlap = b3TestQuantizedAabbAgainstQuantizedAabbSlow(quantizedQueryAabbMin, quantizedQueryAabbMax, rootNode.m_quantizedAabbMin, rootNode.m_quantizedAabbMax);
+				isLeafNode = b3IsLeaf(&rootNode);
+				if (aabbOverlap)
+				{
+					if (isLeafNode)
+					{
+						int triangleIndex = b3GetTriangleIndex(&rootNode);
+						if (shapeTypeB == SHAPE_COMPOUND_OF_CONVEX_HULLS)
+						{
+							int numChildrenB = collidables[collidableIndexB].m_numChildShapes;
+							int pairIdx = b3AtomicAdd(numConcavePairsOut, numChildrenB);  // reserves numChildrenB slots; the counter advances even when the slots lie beyond capacity
+							for (int b = 0; b < numChildrenB; b++)
+							{
+								if ((pairIdx + b) < maxNumConcavePairsCapacity)
+								{
+									int childShapeIndexB = collidables[collidableIndexB].m_shapeIndex + b;
+									b3Int4 newPair = b3MakeInt4(bodyIndexA, bodyIndexB, triangleIndex, childShapeIndexB);
+									concavePairsOut[pairIdx + b] = newPair;
+								}
+							}
+						}
+						else
+						{
+							int pairIdx = b3AtomicInc(numConcavePairsOut);  // as above: incremented before the capacity test
+							if (pairIdx < maxNumConcavePairsCapacity)
+							{
+								b3Int4 newPair = b3MakeInt4(bodyIndexA, bodyIndexB, triangleIndex, 0);
+								concavePairsOut[pairIdx] = newPair;
+							}
+						}
+					}
+					curIndex++;  // overlapping node (leaf or internal): continue with the next node in the array
+				}
+				else
+				{
+					if (isLeafNode)
+					{
+						curIndex++;
+					}
+					else
+					{
+						escapeIndex = b3GetEscapeIndex(&rootNode);
+						curIndex += escapeIndex;  // non-overlapping internal node: jump ahead by its escape index
+					}
+				}
+			}
+		}
+	}
+}

+ 171 - 0
Dependencies/include/bullet3/Bullet3Collision/NarrowPhaseCollision/shared/b3ClipFaces.h

@@ -0,0 +1,171 @@
+#ifndef B3_CLIP_FACES_H
+#define B3_CLIP_FACES_H
+
+#include "Bullet3Common/shared/b3Int4.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Collidable.h"
+#include "Bullet3Collision/BroadPhaseCollision/shared/b3Aabb.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3BvhSubtreeInfoData.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3QuantizedBvhNodeData.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3ConvexPolyhedronData.h"
+
+// Linear interpolation of the x, y and z components: yields a at t = 0 and b at t = 1. The w component is set to 0.
+inline b3Float4 b3Lerp3(b3Float4ConstArg a, b3Float4ConstArg b, float t)
+{
+	return b3MakeFloat4(t * (b.x - a.x) + a.x,
+						t * (b.y - a.y) + a.y,
+						t * (b.z - a.z) + a.z,
+						0.f);
+}
+
+// Clips the polygon pVtxIn[0..numVertsIn) to the back of a plane: a vertex v is kept when
+// dot(planeNormalWS, v) + planeEqWS < 0, and every edge that crosses the plane contributes the crossing point.
+// Returns the number of vertices written to ppVtxOut (0 for an empty input polygon).
+// ppVtxOut is not bounds-checked; one input edge can emit up to two vertices, so the caller must size it accordingly.
+int clipFaceGlobal(__global const b3Float4* pVtxIn, int numVertsIn, b3Float4ConstArg planeNormalWS, float planeEqWS, __global b3Float4* ppVtxOut)
+{
+	int ve;
+	float ds, de;
+	int numVertsOut = 0;
+	// An empty polygon has no "last vertex": return before pVtxIn[numVertsIn - 1] becomes an
+	// out-of-range read. The kernel in this file reaches this state once an earlier plane has
+	// clipped every vertex away. The result for non-empty input is unchanged.
+	if (numVertsIn < 1)
+		return 0;
+
+	// Edges are walked as (firstVertex -> endVertex), starting with the closing edge last -> first.
+	b3Float4 firstVertex = pVtxIn[numVertsIn - 1];
+	b3Float4 endVertex = pVtxIn[0];
+
+	ds = b3Dot(planeNormalWS, firstVertex) + planeEqWS;
+
+	for (ve = 0; ve < numVertsIn; ve++)
+	{
+		endVertex = pVtxIn[ve];
+		de = b3Dot(planeNormalWS, endVertex) + planeEqWS;
+		if (ds < 0)
+		{
+			if (de < 0)
+			{
+				// Start < 0, end < 0, so output endVertex
+				ppVtxOut[numVertsOut++] = endVertex;
+			}
+			else
+			{
+				// Start < 0, end >= 0, so output intersection
+				ppVtxOut[numVertsOut++] = b3Lerp3(firstVertex, endVertex, (ds * 1.f / (ds - de)));
+			}
+		}
+		else
+		{
+			if (de < 0)
+			{
+				// Start >= 0, end < 0 so output intersection and end
+				ppVtxOut[numVertsOut++] = b3Lerp3(firstVertex, endVertex, (ds * 1.f / (ds - de)));
+				ppVtxOut[numVertsOut++] = endVertex;
+			}
+		}
+		firstVertex = endVertex;
+		ds = de;
+	}
+	return numVertsOut;
+}
+
+__kernel void clipFacesAndFindContactsKernel(__global const b3Float4* separatingNormals,  // not read in this function
+											 __global const int* hasSeparatingAxis,  // per-pair flag; nothing is done for a pair whose entry is 0
+											 __global b3Int4* clippingFacesOut,  // per pair, in: x = closest face on A, z = vertex count of A's face, w = vertex count of B's face; out: w = contact count
+											 __global b3Float4* worldVertsA1,  // per-pair slab (stride vertexFaceCapacity) holding the vertices of A's face
+											 __global b3Float4* worldNormalsA1,  // one normal per pair for A's face
+											 __global b3Float4* worldVertsB1,  // per-pair slab holding B's face on entry; overwritten
+											 __global b3Float4* worldVertsB2,  // per-pair scratch slab; overwritten
+											 int vertexFaceCapacity,
+											 int pairIndex)
+{
+	// For one pair: clips B's face polygon against the side planes of A's face, then keeps the clipped
+	// points whose signed distance to A's face plane is <= maxDist as contacts (x, y, z, depth).
+	// pairIndex is passed in explicitly here; the earlier form was: int i = get_global_id(0); int pairIndex = i;
+	int i = pairIndex;
+
+	float minDist = -1e30f;
+	float maxDist = 0.02f;
+
+	//	if (i<numPairs)
+	{
+		if (hasSeparatingAxis[i])
+		{
+			//			int bodyIndexA = pairs[i].x;
+			//		int bodyIndexB = pairs[i].y;
+
+			int numLocalContactsOut = 0;
+
+			int capacityWorldVertsB2 = vertexFaceCapacity;
+
+			__global b3Float4* pVtxIn = &worldVertsB1[pairIndex * capacityWorldVertsB2];
+			__global b3Float4* pVtxOut = &worldVertsB2[pairIndex * capacityWorldVertsB2];
+
+			{
+				__global b3Int4* clippingFaces = clippingFacesOut;
+
+				int closestFaceA = clippingFaces[pairIndex].x;
+				// int closestFaceB = clippingFaces[pairIndex].y;
+				int numVertsInA = clippingFaces[pairIndex].z;
+				int numVertsInB = clippingFaces[pairIndex].w;
+
+				int numVertsOut = 0;
+
+				if (closestFaceA >= 0)
+				{
+					// clip polygon to back of planes of all faces of hull A that are adjacent to witness face
+
+					for (int e0 = 0; e0 < numVertsInA; e0++)
+					{
+						const b3Float4 aw = worldVertsA1[pairIndex * capacityWorldVertsB2 + e0];
+						const b3Float4 bw = worldVertsA1[pairIndex * capacityWorldVertsB2 + ((e0 + 1) % numVertsInA)];
+						const b3Float4 WorldEdge0 = aw - bw;
+						b3Float4 worldPlaneAnormal1 = worldNormalsA1[pairIndex];
+						b3Float4 planeNormalWS1 = -b3Cross(WorldEdge0, worldPlaneAnormal1);  // side plane through edge e0, built from the edge direction and the face normal
+						b3Float4 worldA1 = aw;
+						float planeEqWS1 = -b3Dot(worldA1, planeNormalWS1);
+						b3Float4 planeNormalWS = planeNormalWS1;
+						float planeEqWS = planeEqWS1;
+						numVertsOut = clipFaceGlobal(pVtxIn, numVertsInB, planeNormalWS, planeEqWS, pVtxOut);  // the output size is not checked against vertexFaceCapacity here
+						__global b3Float4* tmp = pVtxOut;  // ping-pong: the clipped polygon becomes the input of the next plane
+						pVtxOut = pVtxIn;
+						pVtxIn = tmp;
+						numVertsInB = numVertsOut;
+						numVertsOut = 0;
+					}
+
+					b3Float4 planeNormalWS = worldNormalsA1[pairIndex];
+					float planeEqWS = -b3Dot(planeNormalWS, worldVertsA1[pairIndex * capacityWorldVertsB2]);  // plane of A's face through its first vertex
+
+					for (int i = 0; i < numVertsInB; i++)  // this i shadows the outer i
+					{
+						float depth = b3Dot(planeNormalWS, pVtxIn[i]) + planeEqWS;
+						if (depth <= minDist)
+						{
+							depth = minDist;
+						}
+						/*
+						static float maxDepth = 0.f;
+						if (depth < maxDepth)
+						{
+							maxDepth = depth;
+							if (maxDepth < -10)
+							{
+								printf("error at framecount %d?\n",myframecount);
+							}
+							printf("maxDepth = %f\n", maxDepth);
+
+						}
+*/
+						if (depth <= maxDist)
+						{
+							b3Float4 pointInWorld = pVtxIn[i];
+							pVtxOut[numLocalContactsOut++] = b3MakeFloat4(pointInWorld.x, pointInWorld.y, pointInWorld.z, depth);  // w carries the depth
+						}
+					}
+				}
+				clippingFaces[pairIndex].w = numLocalContactsOut;
+			}
+
+			for (int i = 0; i < numLocalContactsOut; i++)  // mirror the contacts into the other slab, so both slabs hold them afterwards
+				pVtxIn[i] = pVtxOut[i];
+
+		}  //		if (hasSeparatingAxis[i])
+	}      //	if (i<numPairs)
+}
+
+#endif  //B3_CLIP_FACES_H

+ 69 - 0
Dependencies/include/bullet3/Bullet3Collision/NarrowPhaseCollision/shared/b3Collidable.h

@@ -0,0 +1,69 @@
+
+#ifndef B3_COLLIDABLE_H
+#define B3_COLLIDABLE_H
+
+#include "Bullet3Common/shared/b3Float4.h"
+#include "Bullet3Common/shared/b3Quat.h"
+
+enum b3ShapeTypes  // shape tags; b3BvhTraversal compares b3Collidable::m_shapeType against these
+{
+	SHAPE_HEIGHT_FIELD = 1,
+
+	SHAPE_CONVEX_HULL = 3,  // note: no enumerator is assigned the value 2
+	SHAPE_PLANE = 4,
+	SHAPE_CONCAVE_TRIMESH = 5,
+	SHAPE_COMPOUND_OF_CONVEX_HULLS = 6,
+	SHAPE_SPHERE = 7,
+	MAX_NUM_SHAPE_TYPES,  // implicit value 8, one past SHAPE_SPHERE
+};
+
+typedef struct b3Collidable b3Collidable_t;
+
+struct b3Collidable  // shape record; each union overlays two fields, and which one is meaningful presumably depends on m_shapeType
+{
+	union {
+		int m_numChildShapes;  // b3BvhTraversal reads this as the child count of a SHAPE_COMPOUND_OF_CONVEX_HULLS
+		int m_bvhIndex;        // same storage; b3BvhTraversal indexes bvhInfos with this value (through the m_numChildShapes name) for a trimesh
+	};
+	union {
+		float m_radius;  // read as the sphere radius by computeContactSphereConvex
+		int m_compoundBvhIndex;
+	};
+
+	int m_shapeType;  // compared against b3ShapeTypes values
+	union {
+		int m_shapeIndex;  // index into the convex shape array; for a compound, b3BvhTraversal treats it as the first child index
+		float m_height;
+	};
+};
+
+typedef struct b3GpuChildShape b3GpuChildShape_t;
+struct b3GpuChildShape  // one child of a compound shape: child position and orientation plus shape fields overlaid in unions
+{
+	b3Float4 m_childPosition;
+	b3Quat m_childOrientation;
+	union {
+		int m_shapeIndex;  //used for SHAPE_COMPOUND_OF_CONVEX_HULLS
+		int m_capsuleAxis;
+	};
+	union {
+		float m_radius;        //used for childshape of SHAPE_COMPOUND_OF_SPHERES or SHAPE_COMPOUND_OF_CAPSULES (neither tag is in the b3ShapeTypes enum of this header)
+		int m_numChildShapes;  //used for compound shape
+	};
+	union {
+		float m_height;  //used for childshape of SHAPE_COMPOUND_OF_CAPSULES
+		int m_collidableShapeIndex;
+	};
+	int m_shapeType;
+};
+
+struct b3CompoundOverlappingPair  // four ints naming two bodies and one child shape on each; presumably one candidate pair between compound children -- confirm with the producer
+{
+	int m_bodyIndexA;
+	int m_bodyIndexB;
+	//	int	m_pairType;
+	int m_childShapeIndexA;
+	int m_childShapeIndexB;
+};
+
+#endif  //B3_COLLIDABLE_H

+ 36 - 0
Dependencies/include/bullet3/Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h

@@ -0,0 +1,36 @@
+#ifndef B3_CONTACT4DATA_H
+#define B3_CONTACT4DATA_H
+
+#include "Bullet3Common/shared/b3Float4.h"
+
+typedef struct b3Contact4Data b3Contact4Data_t;
+
+struct b3Contact4Data  // contact record for one body pair, holding up to four points
+{
+	b3Float4 m_worldPosB[4];  // contact points; b3ClipHullHullSingle stores the penetration depth in each point's w
+	//	b3Float4	m_localPosA[4];
+	//	b3Float4	m_localPosB[4];
+	b3Float4 m_worldNormalOnB;  //	w: m_nPoints
+	unsigned short m_restituitionCoeffCmp;  // b3ClipHullHullSingle writes 0 here
+	unsigned short m_frictionCoeffCmp;  // b3ClipHullHullSingle writes 45874 here; presumably a coefficient scaled to 16 bits -- confirm
+	int m_batchIdx;
+	int m_bodyAPtrAndSignBit;  //x:m_bodyAPtr, y:m_bodyBPtr -- b3ClipHullHullSingle stores the body index, negated when the body's m_invMass is 0
+	int m_bodyBPtrAndSignBit;
+
+	int m_childIndexA;
+	int m_childIndexB;
+	int m_unused1;
+	int m_unused2;
+};
+
+// Returns the number of contact points, which is stored as a float in the w component of m_worldNormalOnB.
+inline int b3Contact4Data_getNumPoints(const struct b3Contact4Data* contact)
+{
+	return (int)contact->m_worldNormalOnB.w;
+}
+
+// Stores the number of contact points as a float in the w component of m_worldNormalOnB.
+inline void b3Contact4Data_setNumPoints(struct b3Contact4Data* contact, int numPoints)
+{
+	contact->m_worldNormalOnB.w = (float)numPoints;
+}
+
+#endif  //B3_CONTACT4DATA_H

+ 486 - 0
Dependencies/include/bullet3/Bullet3Collision/NarrowPhaseCollision/shared/b3ContactConvexConvexSAT.h

@@ -0,0 +1,486 @@
+
+#ifndef B3_CONTACT_CONVEX_CONVEX_SAT_H
+#define B3_CONTACT_CONVEX_CONVEX_SAT_H
+
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3FindSeparatingAxis.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3ReduceContacts.h"
+
+#define B3_MAX_VERTS 1024
+
+// Linear interpolation of the x, y and z components: yields a at t = 0 and b at t = 1. The fourth component is set to 0.
+inline b3Float4 b3Lerp3(const b3Float4& a, const b3Float4& b, float t)
+{
+	return b3MakeVector3(t * (b.x - a.x) + a.x,
+						 t * (b.y - a.y) + a.y,
+						 t * (b.z - a.z) + a.z,
+						 0.f);
+}
+
+// Clips the polygon pVtxIn[0..numVertsIn) against a single plane. A vertex v is kept when
+// dot(planeNormalWS, v) + planeEqWS < 0; an edge that crosses the plane contributes the crossing point.
+// Returns the number of vertices written to ppVtxOut (0 when fewer than two vertices come in).
+inline int b3ClipFace(const b3Float4* pVtxIn, int numVertsIn, b3Float4& planeNormalWS, float planeEqWS, b3Float4* ppVtxOut)
+{
+	if (numVertsIn < 2)
+		return 0;
+
+	int outCount = 0;
+
+	// Walk the closed loop of edges (prev -> cur), beginning with the closing edge last -> first.
+	b3Float4 prevVertex = pVtxIn[numVertsIn - 1];
+	float prevDist = b3Dot3F4(planeNormalWS, prevVertex) + planeEqWS;
+
+	for (int idx = 0; idx < numVertsIn; idx++)
+	{
+		b3Float4 curVertex = pVtxIn[idx];
+		float curDist = b3Dot3F4(planeNormalWS, curVertex) + planeEqWS;
+
+		bool prevBehind = prevDist < 0;
+		bool curBehind = curDist < 0;
+
+		if (prevBehind != curBehind)
+		{
+			// The edge changes sides: emit the point where it meets the plane.
+			ppVtxOut[outCount++] = b3Lerp3(prevVertex, curVertex, (prevDist * 1.f / (prevDist - curDist)));
+		}
+		if (curBehind)
+		{
+			// The edge ends behind the plane: keep its end vertex.
+			ppVtxOut[outCount++] = curVertex;
+		}
+
+		prevVertex = curVertex;
+		prevDist = curDist;
+	}
+	return outCount;
+}
+
+// Clips the world-space polygon worldVertsB1[0..numWorldVertsB1) against the side planes of one face of
+// hull A (the face whose world normal has the smallest dot product with separatingNormal), then emits
+// the surviving points whose signed distance to that face is <= maxDist.
+//
+// worldVertsB1 and worldVertsB2 serve as ping-pong scratch buffers and are overwritten.
+// capacityWorldVertsB2 is accepted for interface compatibility but is not checked here.
+// Each entry written to contactsOut is (x, y, z, depth), with depth clamped from below to minDist.
+// Returns the number of contacts written, which never exceeds contactCapacity; contacts that do not
+// fit are dropped and reported through b3Error.
+inline int b3ClipFaceAgainstHull(const b3Float4& separatingNormal, const b3ConvexPolyhedronData* hullA,
+								 const b3Float4& posA, const b3Quaternion& ornA, b3Float4* worldVertsB1, int numWorldVertsB1,
+								 b3Float4* worldVertsB2, int capacityWorldVertsB2,
+								 const float minDist, float maxDist,
+								 const b3AlignedObjectArray<b3Float4>& verticesA, const b3AlignedObjectArray<b3GpuFace>& facesA, const b3AlignedObjectArray<int>& indicesA,
+								 //const b3Float4* verticesB,	const b3GpuFace* facesB,	const int* indicesB,
+								 b3Float4* contactsOut,
+								 int contactCapacity)
+{
+	int numContactsOut = 0;
+
+	b3Float4* pVtxIn = worldVertsB1;
+	b3Float4* pVtxOut = worldVertsB2;
+
+	int numVertsIn = numWorldVertsB1;
+	int numVertsOut = 0;
+
+	// Pick the face of A whose world-space normal has the smallest dot product with the separating normal.
+	int closestFaceA = -1;
+	{
+		float dmin = FLT_MAX;
+		for (int face = 0; face < hullA->m_numFaces; face++)
+		{
+			const b3Float4 Normal = b3MakeVector3(
+				facesA[hullA->m_faceOffset + face].m_plane.x,
+				facesA[hullA->m_faceOffset + face].m_plane.y,
+				facesA[hullA->m_faceOffset + face].m_plane.z, 0.f);
+			const b3Float4 faceANormalWS = b3QuatRotate(ornA, Normal);
+
+			float d = b3Dot3F4(faceANormalWS, separatingNormal);
+			if (d < dmin)
+			{
+				dmin = d;
+				closestFaceA = face;
+			}
+		}
+	}
+	if (closestFaceA < 0)
+		return numContactsOut;
+
+	b3GpuFace polyA = facesA[hullA->m_faceOffset + closestFaceA];
+
+	// clip polygon to back of planes of all faces of hull A that are adjacent to witness face
+	int numVerticesA = polyA.m_numIndices;
+	for (int e0 = 0; e0 < numVerticesA; e0++)
+	{
+		const b3Float4 a = verticesA[hullA->m_vertexOffset + indicesA[polyA.m_indexOffset + e0]];
+		const b3Float4 b = verticesA[hullA->m_vertexOffset + indicesA[polyA.m_indexOffset + ((e0 + 1) % numVerticesA)]];
+		const b3Float4 edge0 = a - b;
+		const b3Float4 WorldEdge0 = b3QuatRotate(ornA, edge0);
+		b3Float4 planeNormalA = b3MakeFloat4(polyA.m_plane.x, polyA.m_plane.y, polyA.m_plane.z, 0.f);
+		b3Float4 worldPlaneAnormal1 = b3QuatRotate(ornA, planeNormalA);
+
+		// Side plane through edge e0, built from the edge direction and the face normal.
+		b3Float4 planeNormalWS1 = -b3Cross3(WorldEdge0, worldPlaneAnormal1);
+		b3Float4 worldA1 = b3TransformPoint(a, posA, ornA);
+		float planeEqWS1 = -b3Dot3F4(worldA1, planeNormalWS1);
+
+		b3Float4 planeNormalWS = planeNormalWS1;
+		float planeEqWS = planeEqWS1;
+
+		numVertsOut = b3ClipFace(pVtxIn, numVertsIn, planeNormalWS, planeEqWS, pVtxOut);
+
+		// Ping-pong: the clipped polygon becomes the input for the next plane.
+		b3Float4* tmp = pVtxOut;
+		pVtxOut = pVtxIn;
+		pVtxIn = tmp;
+		numVertsIn = numVertsOut;
+		numVertsOut = 0;
+	}
+
+	// only keep points that are behind the witness face
+	{
+		b3Float4 localPlaneNormal = b3MakeFloat4(polyA.m_plane.x, polyA.m_plane.y, polyA.m_plane.z, 0.f);
+		float localPlaneEq = polyA.m_plane.w;
+		b3Float4 planeNormalWS = b3QuatRotate(ornA, localPlaneNormal);
+		float planeEqWS = localPlaneEq - b3Dot3F4(planeNormalWS, posA);
+		for (int i = 0; i < numVertsIn; i++)
+		{
+			float depth = b3Dot3F4(planeNormalWS, pVtxIn[i]) + planeEqWS;
+			if (depth <= minDist)
+			{
+				depth = minDist;
+			}
+			// Written as a negated <= so that a NaN depth is rejected, exactly as before.
+			if (!(depth <= maxDist))
+			{
+				continue;
+			}
+			// Only a point that qualifies as a contact can overflow the output, so the capacity
+			// error is raised here and not for points that were going to be rejected anyway.
+			if (numContactsOut >= contactCapacity)
+			{
+				b3Error("exceeding contact capacity (%d,%d)\n", numContactsOut, contactCapacity);
+				continue;
+			}
+			b3Float4 pointInWorld = pVtxIn[i];
+			contactsOut[numContactsOut++] = b3MakeVector3(pointInWorld.x, pointInWorld.y, pointInWorld.z, depth);
+		}
+	}
+
+	return numContactsOut;
+}
+
+// Generates contact candidates between two convex hulls along separatingNormal:
+//  1. picks the face of hull B whose world normal has the largest dot product with separatingNormal,
+//  2. transforms that face's vertices to world space into worldVertsB1,
+//  3. clips the polygon against hull A with b3ClipFaceAgainstHull.
+// worldVertsB1 / worldVertsB2 are scratch buffers of capacityWorldVerts entries each and are overwritten.
+// Returns the number of entries written to contactsOut; 0 when no face of B could be selected or when
+// the selected face has more vertices than capacityWorldVerts.
+inline int b3ClipHullAgainstHull(const b3Float4& separatingNormal,
+								 const b3ConvexPolyhedronData& hullA, const b3ConvexPolyhedronData& hullB,
+								 const b3Float4& posA, const b3Quaternion& ornA, const b3Float4& posB, const b3Quaternion& ornB,
+								 b3Float4* worldVertsB1, b3Float4* worldVertsB2, int capacityWorldVerts,
+								 const float minDist, float maxDist,
+								 const b3AlignedObjectArray<b3Float4>& verticesA, const b3AlignedObjectArray<b3GpuFace>& facesA, const b3AlignedObjectArray<int>& indicesA,
+								 const b3AlignedObjectArray<b3Float4>& verticesB, const b3AlignedObjectArray<b3GpuFace>& facesB, const b3AlignedObjectArray<int>& indicesB,
+
+								 b3Float4* contactsOut,
+								 int contactCapacity)
+{
+	int numContactsOut = 0;
+	int numWorldVertsB1 = 0;
+
+	B3_PROFILE("clipHullAgainstHull");
+
+	int closestFaceB = -1;
+	float dmax = -FLT_MAX;
+
+	{
+		// Only read by the BT_DEBUG_SAT_FACE diagnostics below.
+		static bool once = true;
+
+		for (int face = 0; face < hullB.m_numFaces; face++)
+		{
+#ifdef BT_DEBUG_SAT_FACE
+			if (once)
+				printf("face %d\n", face);
+			const b3GpuFace* faceB = &facesB[hullB.m_faceOffset + face];
+			if (once)
+			{
+				for (int i = 0; i < faceB->m_numIndices; i++)
+				{
+					b3Float4 vert = verticesB[hullB.m_vertexOffset + indicesB[faceB->m_indexOffset + i]];
+					printf("vert[%d] = %f,%f,%f\n", i, vert.x, vert.y, vert.z);
+				}
+			}
+#endif  //BT_DEBUG_SAT_FACE
+			{
+				const b3Float4 Normal = b3MakeVector3(facesB[hullB.m_faceOffset + face].m_plane.x,
+													  facesB[hullB.m_faceOffset + face].m_plane.y, facesB[hullB.m_faceOffset + face].m_plane.z, 0.f);
+				const b3Float4 WorldNormal = b3QuatRotate(ornB, Normal);
+#ifdef BT_DEBUG_SAT_FACE
+				if (once)
+					printf("faceNormal = %f,%f,%f\n", Normal.x, Normal.y, Normal.z);
+#endif
+				float d = b3Dot3F4(WorldNormal, separatingNormal);
+				if (d > dmax)
+				{
+					dmax = d;
+					closestFaceB = face;
+				}
+			}
+		}
+		once = false;
+	}
+
+	b3Assert(closestFaceB >= 0);
+	if (closestFaceB < 0)
+	{
+		// No face was selected (hull B has no faces, or no dot product compared greater than -FLT_MAX).
+		// With asserts compiled out, indexing facesB with -1 would read out of range, so stop here.
+		return numContactsOut;
+	}
+
+	{
+		const b3GpuFace& polyB = facesB[hullB.m_faceOffset + closestFaceB];
+		const int numVertices = polyB.m_numIndices;
+		if (numVertices > capacityWorldVerts)
+		{
+			// The face would not fit into the caller's scratch buffer.
+			b3Error("exceeding world vertex capacity (%d/%d)\n", numVertices, capacityWorldVerts);
+			return numContactsOut;
+		}
+		for (int e0 = 0; e0 < numVertices; e0++)
+		{
+			const b3Float4& b = verticesB[hullB.m_vertexOffset + indicesB[polyB.m_indexOffset + e0]];
+			worldVertsB1[numWorldVertsB1++] = b3TransformPoint(b, posB, ornB);
+		}
+	}
+
+	numContactsOut = b3ClipFaceAgainstHull(separatingNormal, &hullA,
+										   posA, ornA,
+										   worldVertsB1, numWorldVertsB1, worldVertsB2, capacityWorldVerts, minDist, maxDist,
+										   verticesA, facesA, indicesA,
+										   contactsOut, contactCapacity);
+
+	return numContactsOut;
+}
+
+inline int b3ClipHullHullSingle(  // runs b3ClipHullAgainstHull for one pair, reduces the result with b3ReduceContacts and appends at most one b3Contact4Data; returns that contact's index, or -1 if nothing was appended
+	int bodyIndexA, int bodyIndexB,
+	const b3Float4& posA,
+	const b3Quaternion& ornA,
+	const b3Float4& posB,
+	const b3Quaternion& ornB,
+
+	int collidableIndexA, int collidableIndexB,
+
+	const b3AlignedObjectArray<b3RigidBodyData>* bodyBuf,
+	b3AlignedObjectArray<b3Contact4Data>* globalContactOut,
+	int& nContacts,  // in/out: current number of contacts; incremented when one is appended
+
+	const b3AlignedObjectArray<b3ConvexPolyhedronData>& hostConvexDataA,
+	const b3AlignedObjectArray<b3ConvexPolyhedronData>& hostConvexDataB,
+
+	const b3AlignedObjectArray<b3Vector3>& verticesA,
+	const b3AlignedObjectArray<b3Vector3>& uniqueEdgesA,  // not read in this function
+	const b3AlignedObjectArray<b3GpuFace>& facesA,
+	const b3AlignedObjectArray<int>& indicesA,
+
+	const b3AlignedObjectArray<b3Vector3>& verticesB,
+	const b3AlignedObjectArray<b3Vector3>& uniqueEdgesB,  // not read in this function
+	const b3AlignedObjectArray<b3GpuFace>& facesB,
+	const b3AlignedObjectArray<int>& indicesB,
+
+	const b3AlignedObjectArray<b3Collidable>& hostCollidablesA,
+	const b3AlignedObjectArray<b3Collidable>& hostCollidablesB,
+	const b3Vector3& sepNormalWorldSpace,
+	int maxContactCapacity)
+{
+	int contactIndex = -1;
+	b3ConvexPolyhedronData hullA, hullB;  // assigned below but never read afterwards; the clip call looks the hulls up again through shapeA / shapeB
+
+	b3Collidable colA = hostCollidablesA[collidableIndexA];
+	hullA = hostConvexDataA[colA.m_shapeIndex];
+	//printf("numvertsA = %d\n",hullA.m_numVertices);
+
+	b3Collidable colB = hostCollidablesB[collidableIndexB];
+	hullB = hostConvexDataB[colB.m_shapeIndex];
+	//printf("numvertsB = %d\n",hullB.m_numVertices);
+
+	b3Float4 contactsOut[B3_MAX_VERTS];  // one of three B3_MAX_VERTS-sized local arrays in this function
+	int localContactCapacity = B3_MAX_VERTS;
+
+#ifdef _WIN32
+	b3Assert(_finite(bodyBuf->at(bodyIndexA).m_pos.x));
+	b3Assert(_finite(bodyBuf->at(bodyIndexB).m_pos.x));
+#endif
+
+	{
+		b3Float4 worldVertsB1[B3_MAX_VERTS];
+		b3Float4 worldVertsB2[B3_MAX_VERTS];
+		int capacityWorldVerts = B3_MAX_VERTS;
+
+		b3Float4 hostNormal = b3MakeFloat4(sepNormalWorldSpace.x, sepNormalWorldSpace.y, sepNormalWorldSpace.z, 0.f);
+		int shapeA = hostCollidablesA[collidableIndexA].m_shapeIndex;
+		int shapeB = hostCollidablesB[collidableIndexB].m_shapeIndex;
+
+		b3Scalar minDist = -1;  // depths below this value are clamped to it by b3ClipFaceAgainstHull
+		b3Scalar maxDist = 0.;  // points with depth above this value are not reported by b3ClipFaceAgainstHull
+
+		b3Transform trA, trB;
+		{
+			//B3_PROFILE("b3TransformPoint computation");
+			//trA.setIdentity();
+			trA.setOrigin(b3MakeVector3(posA.x, posA.y, posA.z));
+			trA.setRotation(b3Quaternion(ornA.x, ornA.y, ornA.z, ornA.w));
+
+			//trB.setIdentity();
+			trB.setOrigin(b3MakeVector3(posB.x, posB.y, posB.z));
+			trB.setRotation(b3Quaternion(ornB.x, ornB.y, ornB.z, ornB.w));
+		}
+
+		b3Quaternion trAorn = trA.getRotation();  // orientation read back from the transform, not taken directly from ornA
+		b3Quaternion trBorn = trB.getRotation();
+
+		int numContactsOut = b3ClipHullAgainstHull(hostNormal,
+												   hostConvexDataA.at(shapeA),
+												   hostConvexDataB.at(shapeB),
+												   (b3Float4&)trA.getOrigin(), (b3Quaternion&)trAorn,
+												   (b3Float4&)trB.getOrigin(), (b3Quaternion&)trBorn,
+												   worldVertsB1, worldVertsB2, capacityWorldVerts,
+												   minDist, maxDist,
+												   verticesA, facesA, indicesA,
+												   verticesB, facesB, indicesB,
+
+												   contactsOut, localContactCapacity);
+
+		if (numContactsOut > 0)
+		{
+			B3_PROFILE("overlap");
+
+			b3Float4 normalOnSurfaceB = (b3Float4&)hostNormal;
+			//			b3Float4 centerOut;
+
+			b3Int4 contactIdx;  // preset to 0..3 and handed to b3ReduceContacts; afterwards used as indices into contactsOut
+			contactIdx.x = 0;
+			contactIdx.y = 1;
+			contactIdx.z = 2;
+			contactIdx.w = 3;
+
+			int numPoints = 0;
+
+			{
+				B3_PROFILE("extractManifold");
+				numPoints = b3ReduceContacts(contactsOut, numContactsOut, normalOnSurfaceB, &contactIdx);
+			}
+
+			b3Assert(numPoints);
+
+			if (nContacts < maxContactCapacity)
+			{
+				contactIndex = nContacts;
+				globalContactOut->expand();  // NOTE(review): the at(nContacts) below assumes the array held exactly nContacts entries before this call -- confirm with callers
+				b3Contact4Data& contact = globalContactOut->at(nContacts);
+				contact.m_batchIdx = 0;  //i;
+				contact.m_bodyAPtrAndSignBit = (bodyBuf->at(bodyIndexA).m_invMass == 0) ? -bodyIndexA : bodyIndexA;  // body index, negated when m_invMass is 0
+				contact.m_bodyBPtrAndSignBit = (bodyBuf->at(bodyIndexB).m_invMass == 0) ? -bodyIndexB : bodyIndexB;
+
+				contact.m_frictionCoeffCmp = 45874;  // presumably 0.7 scaled to the 16-bit range (0.7 * 65535 is about 45874) -- confirm
+				contact.m_restituitionCoeffCmp = 0;
+
+				//	float distance = 0.f;
+				for (int p = 0; p < numPoints; p++)
+				{
+					contact.m_worldPosB[p] = contactsOut[contactIdx.s[p]];  //check if it is actually on B
+					contact.m_worldNormalOnB = normalOnSurfaceB;  // same value on every iteration
+				}
+				//printf("bodyIndexA %d,bodyIndexB %d,normal=%f,%f,%f numPoints %d\n",bodyIndexA,bodyIndexB,normalOnSurfaceB.x,normalOnSurfaceB.y,normalOnSurfaceB.z,numPoints);
+				contact.m_worldNormalOnB.w = (b3Scalar)numPoints;  // the point count is packed into w of the normal
+				nContacts++;
+			}
+			else
+			{
+				b3Error("Error: exceeding contact capacity (%d/%d)\n", nContacts, maxContactCapacity);
+			}
+		}
+	}
+	return contactIndex;
+}
+
+// Convex-vs-convex narrowphase for a single body pair. Asks b3FindSeparatingAxis for an axis and,
+// when it reports true, passes the pair on to b3ClipHullHullSingle to produce a contact.
+// Returns whatever b3ClipHullHullSingle returns, or -1 when it was not called.
+// pairIndex is not used by this function.
+inline int b3ContactConvexConvexSAT(
+	int pairIndex,
+	int bodyIndexA, int bodyIndexB,
+	int collidableIndexA, int collidableIndexB,
+	const b3AlignedObjectArray<b3RigidBodyData>& rigidBodies,
+	const b3AlignedObjectArray<b3Collidable>& collidables,
+	const b3AlignedObjectArray<b3ConvexPolyhedronData>& convexShapes,
+	const b3AlignedObjectArray<b3Float4>& convexVertices,
+	const b3AlignedObjectArray<b3Float4>& uniqueEdges,
+	const b3AlignedObjectArray<int>& convexIndices,
+	const b3AlignedObjectArray<b3GpuFace>& faces,
+	b3AlignedObjectArray<b3Contact4Data>& globalContactsOut,
+	int& nGlobalContactsOut,
+	int maxContactCapacity)
+{
+	// World-space pose of both bodies.
+	b3Float4 originA = rigidBodies[bodyIndexA].m_pos;
+	b3Quaternion rotationA = rigidBodies[bodyIndexA].m_quat;
+	b3Float4 originB = rigidBodies[bodyIndexB].m_pos;
+	b3Quaternion rotationB = rigidBodies[bodyIndexB].m_quat;
+
+	// Convex hull descriptions, looked up through each collidable's shape index.
+	b3Collidable collidableA = collidables[collidableIndexA];
+	b3ConvexPolyhedronData convexA = convexShapes[collidableA.m_shapeIndex];
+
+	b3Collidable collidableB = collidables[collidableIndexB];
+	b3ConvexPolyhedronData convexB = convexShapes[collidableB.m_shapeIndex];
+
+#ifdef _WIN32
+	b3Assert(_finite(rigidBodies[bodyIndexA].m_pos.x));
+	b3Assert(_finite(rigidBodies[bodyIndexB].m_pos.x));
+#endif
+
+	// Both hulls live in the same vertex/edge/face/index arrays, so each array is passed twice.
+	b3Float4 axisWorld;
+	bool axisFound = b3FindSeparatingAxis(convexA, convexB,
+										  originA,
+										  rotationA,
+										  originB,
+										  rotationB,
+
+										  convexVertices, uniqueEdges, faces, convexIndices,
+										  convexVertices, uniqueEdges, faces, convexIndices,
+
+										  axisWorld);
+
+	if (!axisFound)
+		return -1;
+
+	return b3ClipHullHullSingle(
+		bodyIndexA, bodyIndexB,
+		originA, rotationA,
+		originB, rotationB,
+		collidableIndexA, collidableIndexB,
+		&rigidBodies,
+		&globalContactsOut,
+		nGlobalContactsOut,
+
+		convexShapes,
+		convexShapes,
+
+		convexVertices,
+		uniqueEdges,
+		faces,
+		convexIndices,
+
+		convexVertices,
+		uniqueEdges,
+		faces,
+		convexIndices,
+
+		collidables,
+		collidables,
+		axisWorld,
+		maxContactCapacity);
+}
+
+#endif  //B3_CONTACT_CONVEX_CONVEX_SAT_H

+ 153 - 0
Dependencies/include/bullet3/Bullet3Collision/NarrowPhaseCollision/shared/b3ContactSphereSphere.h

@@ -0,0 +1,153 @@
+
+#ifndef B3_CONTACT_SPHERE_SPHERE_H
+#define B3_CONTACT_SPHERE_SPHERE_H
+
+/// Tests a sphere (body A) against a convex polyhedron (body B) and, when the
+/// two penetrate, appends a single contact point to globalContactsOut.
+///
+/// The sphere centre is moved into B's local frame and compared against every
+/// face plane of the hull. The face (or face boundary point) with the largest
+/// signed distance becomes the contact feature.
+///
+/// @param pairIndex            Stored in the contact's m_batchIdx.
+/// @param bodyIndexA           Index of the sphere body in rigidBodies.
+/// @param bodyIndexB           Index of the convex body in rigidBodies.
+/// @param collidableIndexA     Collidable that supplies the sphere radius.
+/// @param collidableIndexB     Not read; the hull's collidable is taken from
+///                             rigidBodies[bodyIndexB].m_collidableIdx instead.
+/// @param rigidBodies          Body array (position, orientation, inverse mass).
+/// @param collidables          Collidable array (radius / shape index).
+/// @param convexShapes         Convex hull descriptors indexed by shape index.
+/// @param convexVertices       Shared vertex pool for all hulls.
+/// @param convexIndices        Shared index pool, forwarded to IsPointInPolygon.
+/// @param faces                Shared face pool for all hulls.
+/// @param globalContactsOut    Output contact array.
+/// @param nGlobalContactsOut   In/out number of contacts already stored; incremented
+///                             when a contact is written.
+/// @param maxContactCapacity   Capacity of globalContactsOut; a contact that does
+///                             not fit is silently dropped.
+void computeContactSphereConvex(int pairIndex,
+								int bodyIndexA, int bodyIndexB,
+								int collidableIndexA, int collidableIndexB,
+								const b3RigidBodyData* rigidBodies,
+								const b3Collidable* collidables,
+								const b3ConvexPolyhedronData* convexShapes,
+								const b3Vector3* convexVertices,
+								const int* convexIndices,
+								const b3GpuFace* faces,
+								b3Contact4* globalContactsOut,
+								int& nGlobalContactsOut,
+								int maxContactCapacity)
+{
+	float radius = collidables[collidableIndexA].m_radius;
+	float4 spherePos1 = rigidBodies[bodyIndexA].m_pos;
+
+	float4 pos = rigidBodies[bodyIndexB].m_pos;
+
+	b3Quaternion quat = rigidBodies[bodyIndexB].m_quat;
+
+	b3Transform tr;
+	tr.setIdentity();
+	tr.setOrigin(pos);
+	tr.setRotation(quat);
+	b3Transform trInv = tr.inverse();
+
+	// Sphere centre expressed in the convex body's local frame.
+	float4 spherePos = trInv(spherePos1);
+
+	int collidableIndex = rigidBodies[bodyIndexB].m_collidableIdx;
+	int shapeIndex = collidables[collidableIndex].m_shapeIndex;
+	int numFaces = convexShapes[shapeIndex].m_numFaces;
+	float4 closestPnt = b3MakeVector3(0, 0, 0, 0);
+	float minDist = -1000000.f;  // TODO: What is the largest/smallest float?
+	bool bCollide = true;
+	// Zero-initialised so the value is never indeterminate, even for a hull without faces.
+	float4 localHitNormal = b3MakeVector3(0, 0, 0, 0);
+	for (int f = 0; f < numFaces; f++)
+	{
+		b3GpuFace face = faces[convexShapes[shapeIndex].m_faceOffset + f];
+		float4 planeEqn;
+		float4 localPlaneNormal = b3MakeVector3(face.m_plane.x, face.m_plane.y, face.m_plane.z, 0.f);
+		float4 n1 = localPlaneNormal;  //quatRotate(quat,localPlaneNormal);
+		planeEqn = n1;
+		planeEqn[3] = face.m_plane.w;
+
+		float4 pntReturn;
+		float dist = signedDistanceFromPointToPlane(spherePos, planeEqn, &pntReturn);
+
+		// Centre is further than one radius outside this face plane: no contact possible.
+		if (dist > radius)
+		{
+			bCollide = false;
+			break;
+		}
+
+		if (dist > 0)
+		{
+			//might hit an edge or vertex
+			b3Vector3 out;
+
+			bool isInPoly = IsPointInPolygon(spherePos,
+											 &face,
+											 &convexVertices[convexShapes[shapeIndex].m_vertexOffset],
+											 convexIndices,
+											 &out);
+			if (isInPoly)
+			{
+				if (dist > minDist)
+				{
+					minDist = dist;
+					closestPnt = pntReturn;
+					localHitNormal = planeEqn;
+				}
+			}
+			else
+			{
+				// Use the point reported by IsPointInPolygon as the closest feature.
+				b3Vector3 tmp = spherePos - out;
+				b3Scalar l2 = tmp.length2();
+				if (l2 < radius * radius)
+				{
+					dist = b3Sqrt(l2);
+					if (dist > minDist)
+					{
+						minDist = dist;
+						closestPnt = out;
+						localHitNormal = tmp / dist;
+					}
+				}
+				else
+				{
+					bCollide = false;
+					break;
+				}
+			}
+		}
+		else
+		{
+			// Centre is behind this face plane; keep the least-negative distance.
+			if (dist > minDist)
+			{
+				minDist = dist;
+				closestPnt = pntReturn;
+				localHitNormal = planeEqn;
+			}
+		}
+	}
+
+	if (bCollide && minDist > -10000)
+	{
+		// Back to world space.
+		float4 normalOnSurfaceB1 = tr.getBasis() * localHitNormal;  //-hitNormalWorld;
+		float4 pOnB1 = tr(closestPnt);
+		float actualDepth = minDist - radius;
+		if (actualDepth < 0)
+		{
+			// Penetration depth travels in the w component of the contact position.
+			pOnB1[3] = actualDepth;
+
+			if (nGlobalContactsOut < maxContactCapacity)
+			{
+				const int dstIdx = nGlobalContactsOut;
+				nGlobalContactsOut++;
+
+				b3Contact4* c = &globalContactsOut[dstIdx];
+				c->m_worldNormalOnB = normalOnSurfaceB1;
+				c->setFrictionCoeff(0.7f);
+				c->setRestituitionCoeff(0.f);
+
+				c->m_batchIdx = pairIndex;
+				// A body with zero inverse mass is stored with a negated index.
+				c->m_bodyAPtrAndSignBit = rigidBodies[bodyIndexA].m_invMass == 0 ? -bodyIndexA : bodyIndexA;
+				c->m_bodyBPtrAndSignBit = rigidBodies[bodyIndexB].m_invMass == 0 ? -bodyIndexB : bodyIndexB;
+				c->m_worldPosB[0] = pOnB1;
+				// The number of contact points is stored in the w component of the normal.
+				int numPoints = 1;
+				c->m_worldNormalOnB.w = (b3Scalar)numPoints;
+			}  //if (dstIdx < numPairs)
+		}
+	}  //if (hasCollision)
+}
+#endif  //B3_CONTACT_SPHERE_SPHERE_H

+ 38 - 0
Dependencies/include/bullet3/Bullet3Collision/NarrowPhaseCollision/shared/b3ConvexPolyhedronData.h

@@ -0,0 +1,38 @@
+
+#ifndef B3_CONVEX_POLYHEDRON_DATA_H
+#define B3_CONVEX_POLYHEDRON_DATA_H
+
+#include "Bullet3Common/shared/b3Float4.h"
+#include "Bullet3Common/shared/b3Quat.h"
+
+// One polygonal face of a convex hull, stored in a shared face array.
+typedef struct b3GpuFace b3GpuFace_t;
+struct b3GpuFace
+{
+	// Face plane: x/y/z are read as the face normal, w as the plane constant.
+	b3Float4 m_plane;
+	// Position of this face's first entry in the shared index array.
+	int m_indexOffset;
+	// Number of index entries (polygon vertices) belonging to this face.
+	int m_numIndices;
+	// Unused padding (presumably keeps the struct size fixed — confirm before changing the layout).
+	int m_unusedPadding1;
+	int m_unusedPadding2;
+};
+
+// Descriptor of one convex hull. The geometry itself lives in shared vertex,
+// face and unique-edge arrays; this struct only stores offsets and counts into them.
+typedef struct b3ConvexPolyhedronData b3ConvexPolyhedronData_t;
+
+struct b3ConvexPolyhedronData
+{
+	// Hull centre in the hull's local frame.
+	b3Float4 m_localCenter;
+	// NOTE(review): m_extents, mC and mE are not read in this part of the file;
+	// presumably bounding-box extents/centre data — confirm against their users.
+	b3Float4 m_extents;
+	b3Float4 mC;
+	b3Float4 mE;
+
+	// NOTE(review): presumably a bounding radius — confirm.
+	float m_radius;
+	// First face of this hull in the shared face array, and how many follow.
+	int m_faceOffset;
+	int m_numFaces;
+	// Number of vertices, starting at m_vertexOffset in the shared vertex array.
+	int m_numVertices;
+
+	int m_vertexOffset;
+	// First unique edge of this hull in the shared edge array, and how many follow.
+	int m_uniqueEdgesOffset;
+	int m_numUniqueEdges;
+	int m_unused;
+};
+
+#endif  //B3_CONVEX_POLYHEDRON_DATA_H

+ 797 - 0
Dependencies/include/bullet3/Bullet3Collision/NarrowPhaseCollision/shared/b3FindConcaveSatAxis.h

@@ -0,0 +1,797 @@
+#ifndef B3_FIND_CONCAVE_SEPARATING_AXIS_H
+#define B3_FIND_CONCAVE_SEPARATING_AXIS_H
+
+#define B3_TRIANGLE_NUM_CONVEX_FACES 5
+
+#include "Bullet3Common/shared/b3Int4.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Collidable.h"
+#include "Bullet3Collision/BroadPhaseCollision/shared/b3Aabb.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3BvhSubtreeInfoData.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3QuantizedBvhNodeData.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3ConvexPolyhedronData.h"
+
+// Projects all vertices of `hull`, posed at (pos, orn), onto the axis *dir and
+// writes the covered interval to min[0] / max[0].
+inline void b3Project(__global const b3ConvexPolyhedronData* hull, b3Float4ConstArg pos, b3QuatConstArg orn,
+					  const b3Float4* dir, __global const b3Float4* vertices, float* min, float* max)
+{
+	// Rotate the axis into the hull's local frame once instead of rotating every vertex.
+	const b3Float4 axisLocal = b3QuatRotate(b3QuatInverse(orn), *dir);
+	// Contribution of the hull position along the axis, added back at the end.
+	const float shift = b3Dot(pos, *dir);
+
+	const int vertexCount = hull->m_numVertices;
+	float lo = FLT_MAX;
+	float hi = -FLT_MAX;
+	for (int v = 0; v < vertexCount; ++v)
+	{
+		const float proj = b3Dot(vertices[hull->m_vertexOffset + v], axisLocal);
+		if (proj < lo)
+		{
+			lo = proj;
+		}
+		if (proj > hi)
+		{
+			hi = proj;
+		}
+	}
+
+	// Only reachable when no vertex updated the bounds; keeps lo <= hi.
+	if (lo > hi)
+	{
+		const float swapped = lo;
+		lo = hi;
+		hi = swapped;
+	}
+
+	*min = lo + shift;
+	*max = hi + shift;
+}
+
+// Tests one candidate axis. Returns false when the projections of the two hulls
+// onto *sep_axis do not overlap (the axis separates them); otherwise returns true
+// and stores the smaller of the two overlap amounts in *depth.
+inline bool b3TestSepAxis(const b3ConvexPolyhedronData* hullA, __global const b3ConvexPolyhedronData* hullB,
+						  b3Float4ConstArg posA, b3QuatConstArg ornA,
+						  b3Float4ConstArg posB, b3QuatConstArg ornB,
+						  b3Float4* sep_axis, const b3Float4* verticesA, __global const b3Float4* verticesB, float* depth)
+{
+	float minA, maxA;
+	b3Project(hullA, posA, ornA, sep_axis, verticesA, &minA, &maxA);
+
+	float minB, maxB;
+	b3Project(hullB, posB, ornB, sep_axis, verticesB, &minB, &maxB);
+
+	const bool disjoint = (maxA < minB || maxB < minA);
+	if (disjoint)
+	{
+		return false;
+	}
+
+	const float overlapAB = maxA - minB;
+	const float overlapBA = maxB - minA;
+	if (overlapAB < overlapBA)
+	{
+		*depth = overlapAB;
+	}
+	else
+	{
+		*depth = overlapBA;
+	}
+	return true;
+}
+
+// Tests every face normal of hullA as a candidate separating axis.
+//
+// Returns false as soon as one normal separates the hulls. Otherwise returns true,
+// and whenever a normal yields a smaller overlap than *dmin, both *dmin and *sep
+// are updated. Before returning true, *sep is flipped if it has a positive dot
+// product with -DeltaC2.
+// uniqueEdgesA/B, indicesA/B, facesB and the edge data are accepted but not read here.
+bool b3FindSeparatingAxis(const b3ConvexPolyhedronData* hullA, __global const b3ConvexPolyhedronData* hullB,
+						  b3Float4ConstArg posA1,
+						  b3QuatConstArg ornA,
+						  b3Float4ConstArg posB1,
+						  b3QuatConstArg ornB,
+						  b3Float4ConstArg DeltaC2,
+
+						  const b3Float4* verticesA,
+						  const b3Float4* uniqueEdgesA,
+						  const b3GpuFace* facesA,
+						  const int* indicesA,
+
+						  __global const b3Float4* verticesB,
+						  __global const b3Float4* uniqueEdgesB,
+						  __global const b3GpuFace* facesB,
+						  __global const int* indicesB,
+						  b3Float4* sep,
+						  float* dmin)
+{
+	// Work on copies of the positions with the w component cleared.
+	b3Float4 originA = posA1;
+	originA.w = 0.f;
+	b3Float4 originB = posB1;
+	originB.w = 0.f;
+
+	const int faceCountA = hullA->m_numFaces;
+	for (int faceIdx = 0; faceIdx < faceCountA; ++faceIdx)
+	{
+		const b3Float4 localPlane = facesA[hullA->m_faceOffset + faceIdx].m_plane;
+		b3Float4 axisWS = b3QuatRotate(ornA, localPlane);
+
+		// Orient the candidate axis consistently with DeltaC2.
+		if (b3Dot(DeltaC2, axisWS) < 0)
+		{
+			axisWS *= -1.f;
+		}
+
+		float overlap;
+		const bool overlapping = b3TestSepAxis(hullA, hullB, originA, ornA, originB, ornB, &axisWS, verticesA, verticesB, &overlap);
+		if (!overlapping)
+		{
+			return false;
+		}
+
+		if (overlap < *dmin)
+		{
+			*dmin = overlap;
+			*sep = axisWS;
+		}
+	}
+
+	if ((b3Dot(-DeltaC2, *sep)) > 0.0f)
+	{
+		*sep = -(*sep);
+	}
+	return true;
+}
+
+// Fixed table of 162 directions. b3FindSeparatingAxisEdgeEdge walks this table
+// as its set of candidate axes when searchAllEdgeEdge is false, instead of
+// testing every edge-edge cross product.
+// NOTE(review): the entries appear to be unit-length points spread over a sphere — confirm.
+b3Vector3 unitSphere162[] =
+	{
+		b3MakeVector3(0.000000, -1.000000, 0.000000),
+		b3MakeVector3(0.203181, -0.967950, 0.147618),
+		b3MakeVector3(-0.077607, -0.967950, 0.238853),
+		b3MakeVector3(0.723607, -0.447220, 0.525725),
+		b3MakeVector3(0.609547, -0.657519, 0.442856),
+		b3MakeVector3(0.812729, -0.502301, 0.295238),
+		b3MakeVector3(-0.251147, -0.967949, 0.000000),
+		b3MakeVector3(-0.077607, -0.967950, -0.238853),
+		b3MakeVector3(0.203181, -0.967950, -0.147618),
+		b3MakeVector3(0.860698, -0.251151, 0.442858),
+		b3MakeVector3(-0.276388, -0.447220, 0.850649),
+		b3MakeVector3(-0.029639, -0.502302, 0.864184),
+		b3MakeVector3(-0.155215, -0.251152, 0.955422),
+		b3MakeVector3(-0.894426, -0.447216, 0.000000),
+		b3MakeVector3(-0.831051, -0.502299, 0.238853),
+		b3MakeVector3(-0.956626, -0.251149, 0.147618),
+		b3MakeVector3(-0.276388, -0.447220, -0.850649),
+		b3MakeVector3(-0.483971, -0.502302, -0.716565),
+		b3MakeVector3(-0.436007, -0.251152, -0.864188),
+		b3MakeVector3(0.723607, -0.447220, -0.525725),
+		b3MakeVector3(0.531941, -0.502302, -0.681712),
+		b3MakeVector3(0.687159, -0.251152, -0.681715),
+		b3MakeVector3(0.687159, -0.251152, 0.681715),
+		b3MakeVector3(-0.436007, -0.251152, 0.864188),
+		b3MakeVector3(-0.956626, -0.251149, -0.147618),
+		b3MakeVector3(-0.155215, -0.251152, -0.955422),
+		b3MakeVector3(0.860698, -0.251151, -0.442858),
+		b3MakeVector3(0.276388, 0.447220, 0.850649),
+		b3MakeVector3(0.483971, 0.502302, 0.716565),
+		b3MakeVector3(0.232822, 0.657519, 0.716563),
+		b3MakeVector3(-0.723607, 0.447220, 0.525725),
+		b3MakeVector3(-0.531941, 0.502302, 0.681712),
+		b3MakeVector3(-0.609547, 0.657519, 0.442856),
+		b3MakeVector3(-0.723607, 0.447220, -0.525725),
+		b3MakeVector3(-0.812729, 0.502301, -0.295238),
+		b3MakeVector3(-0.609547, 0.657519, -0.442856),
+		b3MakeVector3(0.276388, 0.447220, -0.850649),
+		b3MakeVector3(0.029639, 0.502302, -0.864184),
+		b3MakeVector3(0.232822, 0.657519, -0.716563),
+		b3MakeVector3(0.894426, 0.447216, 0.000000),
+		b3MakeVector3(0.831051, 0.502299, -0.238853),
+		b3MakeVector3(0.753442, 0.657515, 0.000000),
+		b3MakeVector3(-0.232822, -0.657519, 0.716563),
+		b3MakeVector3(-0.162456, -0.850654, 0.499995),
+		b3MakeVector3(0.052790, -0.723612, 0.688185),
+		b3MakeVector3(0.138199, -0.894429, 0.425321),
+		b3MakeVector3(0.262869, -0.525738, 0.809012),
+		b3MakeVector3(0.361805, -0.723611, 0.587779),
+		b3MakeVector3(0.531941, -0.502302, 0.681712),
+		b3MakeVector3(0.425323, -0.850654, 0.309011),
+		b3MakeVector3(0.812729, -0.502301, -0.295238),
+		b3MakeVector3(0.609547, -0.657519, -0.442856),
+		b3MakeVector3(0.850648, -0.525736, 0.000000),
+		b3MakeVector3(0.670817, -0.723611, -0.162457),
+		b3MakeVector3(0.670817, -0.723610, 0.162458),
+		b3MakeVector3(0.425323, -0.850654, -0.309011),
+		b3MakeVector3(0.447211, -0.894428, 0.000001),
+		b3MakeVector3(-0.753442, -0.657515, 0.000000),
+		b3MakeVector3(-0.525730, -0.850652, 0.000000),
+		b3MakeVector3(-0.638195, -0.723609, 0.262864),
+		b3MakeVector3(-0.361801, -0.894428, 0.262864),
+		b3MakeVector3(-0.688189, -0.525736, 0.499997),
+		b3MakeVector3(-0.447211, -0.723610, 0.525729),
+		b3MakeVector3(-0.483971, -0.502302, 0.716565),
+		b3MakeVector3(-0.232822, -0.657519, -0.716563),
+		b3MakeVector3(-0.162456, -0.850654, -0.499995),
+		b3MakeVector3(-0.447211, -0.723611, -0.525727),
+		b3MakeVector3(-0.361801, -0.894429, -0.262863),
+		b3MakeVector3(-0.688189, -0.525736, -0.499997),
+		b3MakeVector3(-0.638195, -0.723609, -0.262863),
+		b3MakeVector3(-0.831051, -0.502299, -0.238853),
+		b3MakeVector3(0.361804, -0.723612, -0.587779),
+		b3MakeVector3(0.138197, -0.894429, -0.425321),
+		b3MakeVector3(0.262869, -0.525738, -0.809012),
+		b3MakeVector3(0.052789, -0.723611, -0.688186),
+		b3MakeVector3(-0.029639, -0.502302, -0.864184),
+		b3MakeVector3(0.956626, 0.251149, 0.147618),
+		b3MakeVector3(0.956626, 0.251149, -0.147618),
+		b3MakeVector3(0.951058, -0.000000, 0.309013),
+		b3MakeVector3(1.000000, 0.000000, 0.000000),
+		b3MakeVector3(0.947213, -0.276396, 0.162458),
+		b3MakeVector3(0.951058, 0.000000, -0.309013),
+		b3MakeVector3(0.947213, -0.276396, -0.162458),
+		b3MakeVector3(0.155215, 0.251152, 0.955422),
+		b3MakeVector3(0.436007, 0.251152, 0.864188),
+		b3MakeVector3(-0.000000, -0.000000, 1.000000),
+		b3MakeVector3(0.309017, 0.000000, 0.951056),
+		b3MakeVector3(0.138199, -0.276398, 0.951055),
+		b3MakeVector3(0.587786, 0.000000, 0.809017),
+		b3MakeVector3(0.447216, -0.276398, 0.850648),
+		b3MakeVector3(-0.860698, 0.251151, 0.442858),
+		b3MakeVector3(-0.687159, 0.251152, 0.681715),
+		b3MakeVector3(-0.951058, -0.000000, 0.309013),
+		b3MakeVector3(-0.809018, 0.000000, 0.587783),
+		b3MakeVector3(-0.861803, -0.276396, 0.425324),
+		b3MakeVector3(-0.587786, 0.000000, 0.809017),
+		b3MakeVector3(-0.670819, -0.276397, 0.688191),
+		b3MakeVector3(-0.687159, 0.251152, -0.681715),
+		b3MakeVector3(-0.860698, 0.251151, -0.442858),
+		b3MakeVector3(-0.587786, -0.000000, -0.809017),
+		b3MakeVector3(-0.809018, -0.000000, -0.587783),
+		b3MakeVector3(-0.670819, -0.276397, -0.688191),
+		b3MakeVector3(-0.951058, 0.000000, -0.309013),
+		b3MakeVector3(-0.861803, -0.276396, -0.425324),
+		b3MakeVector3(0.436007, 0.251152, -0.864188),
+		b3MakeVector3(0.155215, 0.251152, -0.955422),
+		b3MakeVector3(0.587786, -0.000000, -0.809017),
+		b3MakeVector3(0.309017, -0.000000, -0.951056),
+		b3MakeVector3(0.447216, -0.276398, -0.850648),
+		b3MakeVector3(0.000000, 0.000000, -1.000000),
+		b3MakeVector3(0.138199, -0.276398, -0.951055),
+		b3MakeVector3(0.670820, 0.276396, 0.688190),
+		b3MakeVector3(0.809019, -0.000002, 0.587783),
+		b3MakeVector3(0.688189, 0.525736, 0.499997),
+		b3MakeVector3(0.861804, 0.276394, 0.425323),
+		b3MakeVector3(0.831051, 0.502299, 0.238853),
+		b3MakeVector3(-0.447216, 0.276397, 0.850649),
+		b3MakeVector3(-0.309017, -0.000001, 0.951056),
+		b3MakeVector3(-0.262869, 0.525738, 0.809012),
+		b3MakeVector3(-0.138199, 0.276397, 0.951055),
+		b3MakeVector3(0.029639, 0.502302, 0.864184),
+		b3MakeVector3(-0.947213, 0.276396, -0.162458),
+		b3MakeVector3(-1.000000, 0.000001, 0.000000),
+		b3MakeVector3(-0.850648, 0.525736, -0.000000),
+		b3MakeVector3(-0.947213, 0.276397, 0.162458),
+		b3MakeVector3(-0.812729, 0.502301, 0.295238),
+		b3MakeVector3(-0.138199, 0.276397, -0.951055),
+		b3MakeVector3(-0.309016, -0.000000, -0.951057),
+		b3MakeVector3(-0.262869, 0.525738, -0.809012),
+		b3MakeVector3(-0.447215, 0.276397, -0.850649),
+		b3MakeVector3(-0.531941, 0.502302, -0.681712),
+		b3MakeVector3(0.861804, 0.276396, -0.425322),
+		b3MakeVector3(0.809019, 0.000000, -0.587782),
+		b3MakeVector3(0.688189, 0.525736, -0.499997),
+		b3MakeVector3(0.670821, 0.276397, -0.688189),
+		b3MakeVector3(0.483971, 0.502302, -0.716565),
+		b3MakeVector3(0.077607, 0.967950, 0.238853),
+		b3MakeVector3(0.251147, 0.967949, 0.000000),
+		b3MakeVector3(0.000000, 1.000000, 0.000000),
+		b3MakeVector3(0.162456, 0.850654, 0.499995),
+		b3MakeVector3(0.361800, 0.894429, 0.262863),
+		b3MakeVector3(0.447209, 0.723612, 0.525728),
+		b3MakeVector3(0.525730, 0.850652, 0.000000),
+		b3MakeVector3(0.638194, 0.723610, 0.262864),
+		b3MakeVector3(-0.203181, 0.967950, 0.147618),
+		b3MakeVector3(-0.425323, 0.850654, 0.309011),
+		b3MakeVector3(-0.138197, 0.894430, 0.425320),
+		b3MakeVector3(-0.361804, 0.723612, 0.587778),
+		b3MakeVector3(-0.052790, 0.723612, 0.688185),
+		b3MakeVector3(-0.203181, 0.967950, -0.147618),
+		b3MakeVector3(-0.425323, 0.850654, -0.309011),
+		b3MakeVector3(-0.447210, 0.894429, 0.000000),
+		b3MakeVector3(-0.670817, 0.723611, -0.162457),
+		b3MakeVector3(-0.670817, 0.723611, 0.162457),
+		b3MakeVector3(0.077607, 0.967950, -0.238853),
+		b3MakeVector3(0.162456, 0.850654, -0.499995),
+		b3MakeVector3(-0.138197, 0.894430, -0.425320),
+		b3MakeVector3(-0.052790, 0.723612, -0.688185),
+		b3MakeVector3(-0.361804, 0.723612, -0.587778),
+		b3MakeVector3(0.361800, 0.894429, -0.262863),
+		b3MakeVector3(0.638194, 0.723610, -0.262864),
+		b3MakeVector3(0.447209, 0.723612, -0.525728)};
+
+// Edge-edge stage of the separating-axis search.
+//
+// When searchAllEdgeEdge is true, every cross product of a unique edge of hullA
+// with a unique edge of hullB (both rotated to world space) is tested as a
+// candidate axis; near-zero cross products are skipped. When it is false, the
+// fixed directions in unitSphere162 are tested instead.
+//
+// Returns false as soon as one candidate axis separates the hulls. Otherwise
+// returns true, having lowered *dmin and replaced *sep whenever a candidate gave
+// a smaller overlap. Before returning true, *sep is flipped if it has a positive
+// dot product with -DeltaC2.
+// facesA/B and indicesA/B are accepted for signature symmetry but not read here.
+//
+// The previous implementation kept a function-local static high-water mark and
+// printf'ed it; that wrote to stdout from library code and modified shared state
+// without synchronisation, so it has been removed.
+bool b3FindSeparatingAxisEdgeEdge(const b3ConvexPolyhedronData* hullA, __global const b3ConvexPolyhedronData* hullB,
+								  b3Float4ConstArg posA1,
+								  b3QuatConstArg ornA,
+								  b3Float4ConstArg posB1,
+								  b3QuatConstArg ornB,
+								  b3Float4ConstArg DeltaC2,
+								  const b3Float4* verticesA,
+								  const b3Float4* uniqueEdgesA,
+								  const b3GpuFace* facesA,
+								  const int* indicesA,
+								  __global const b3Float4* verticesB,
+								  __global const b3Float4* uniqueEdgesB,
+								  __global const b3GpuFace* facesB,
+								  __global const int* indicesB,
+								  b3Float4* sep,
+								  float* dmin,
+								  bool searchAllEdgeEdge)
+{
+	// Work on copies of the positions with the w component cleared.
+	b3Float4 posA = posA1;
+	posA.w = 0.f;
+	b3Float4 posB = posB1;
+	posB.w = 0.f;
+
+	if (searchAllEdgeEdge)
+	{
+		for (int e0 = 0; e0 < hullA->m_numUniqueEdges; e0++)
+		{
+			const b3Float4 edge0 = uniqueEdgesA[hullA->m_uniqueEdgesOffset + e0];
+			b3Float4 edge0World = b3QuatRotate(ornA, edge0);
+
+			for (int e1 = 0; e1 < hullB->m_numUniqueEdges; e1++)
+			{
+				const b3Float4 edge1 = uniqueEdgesB[hullB->m_uniqueEdgesOffset + e1];
+				b3Float4 edge1World = b3QuatRotate(ornB, edge1);
+
+				b3Float4 crossje = b3Cross(edge0World, edge1World);
+
+				// (Nearly) parallel edges do not define a usable axis.
+				if (b3IsAlmostZero(crossje))
+					continue;
+
+				crossje = b3Normalized(crossje);
+				if (b3Dot(DeltaC2, crossje) < 0)
+					crossje *= -1.f;
+
+				float dist;
+				if (!b3TestSepAxis(hullA, hullB, posA, ornA, posB, ornB, &crossje, verticesA, verticesB, &dist))
+					return false;
+
+				if (dist < *dmin)
+				{
+					*dmin = dist;
+					*sep = crossje;
+				}
+			}
+		}
+	}
+	else
+	{
+		const int numDirections = sizeof(unitSphere162) / sizeof(b3Vector3);
+
+		for (int i = 0; i < numDirections; i++)
+		{
+			b3Float4 crossje = unitSphere162[i];
+
+			float dist;
+			if (!b3TestSepAxis(hullA, hullB, posA, ornA, posB, ornB, &crossje, verticesA, verticesB, &dist))
+				return false;
+
+			if (dist < *dmin)
+			{
+				*dmin = dist;
+				*sep = crossje;
+			}
+		}
+	}
+
+	if ((b3Dot(-DeltaC2, *sep)) > 0.0f)
+	{
+		*sep = -(*sep);
+	}
+	return true;
+}
+
+// Selects the pair of faces used for clipping and writes their world-space
+// vertices into the per-pair output slots.
+//
+//  - On hullB, the face whose world normal has the largest dot product with
+//    separatingNormal is chosen; its vertices go to
+//    worldVertsB1[pairIndex * capacityWorldVerts + k].
+//  - On hullA, the face whose world normal has the smallest dot product is chosen;
+//    its world normal goes to worldNormalsA1[pairIndex] and its vertices to
+//    worldVertsA1[pairIndex * capacityWorldVerts + k].
+//  - clippingFaces[pairIndex] receives (faceA, faceB, numVertsA, numVertsB).
+//
+// Each pair owns exactly capacityWorldVerts vertex entries, so the number of
+// vertices written per face is clamped to that capacity; an unclamped write would
+// run into the next pair's slot. A hull for which no face is selected (for
+// example a hull without faces) contributes zero vertices and a face index of -1
+// instead of indexing the face array at offset - 1.
+//
+// minDist and maxDist are accepted but not used. The return value is always 0.
+inline int b3FindClippingFaces(b3Float4ConstArg separatingNormal,
+							   __global const b3ConvexPolyhedronData_t* hullA, __global const b3ConvexPolyhedronData_t* hullB,
+							   b3Float4ConstArg posA, b3QuatConstArg ornA, b3Float4ConstArg posB, b3QuatConstArg ornB,
+							   __global b3Float4* worldVertsA1,
+							   __global b3Float4* worldNormalsA1,
+							   __global b3Float4* worldVertsB1,
+							   int capacityWorldVerts,
+							   const float minDist, float maxDist,
+							   __global const b3Float4* verticesA,
+							   __global const b3GpuFace_t* facesA,
+							   __global const int* indicesA,
+							   __global const b3Float4* verticesB,
+							   __global const b3GpuFace_t* facesB,
+							   __global const int* indicesB,
+
+							   __global b3Int4* clippingFaces, int pairIndex)
+{
+	int numContactsOut = 0;
+	int numWorldVertsB1 = 0;
+
+	// Face of B most aligned with the separating normal.
+	int closestFaceB = -1;
+	float dmax = -FLT_MAX;
+
+	{
+		for (int face = 0; face < hullB->m_numFaces; face++)
+		{
+			const b3Float4 Normal = b3MakeFloat4(facesB[hullB->m_faceOffset + face].m_plane.x,
+												 facesB[hullB->m_faceOffset + face].m_plane.y, facesB[hullB->m_faceOffset + face].m_plane.z, 0.f);
+			const b3Float4 WorldNormal = b3QuatRotate(ornB, Normal);
+			float d = b3Dot(WorldNormal, separatingNormal);
+			if (d > dmax)
+			{
+				dmax = d;
+				closestFaceB = face;
+			}
+		}
+	}
+
+	if (closestFaceB >= 0)
+	{
+		const b3GpuFace_t polyB = facesB[hullB->m_faceOffset + closestFaceB];
+		int numVertices = polyB.m_numIndices;
+		// Never write past this pair's slot.
+		if (numVertices > capacityWorldVerts)
+			numVertices = capacityWorldVerts;
+		for (int e0 = 0; e0 < numVertices; e0++)
+		{
+			const b3Float4 b = verticesB[hullB->m_vertexOffset + indicesB[polyB.m_indexOffset + e0]];
+			worldVertsB1[pairIndex * capacityWorldVerts + numWorldVertsB1++] = b3TransformPoint(b, posB, ornB);
+		}
+	}
+
+	// Face of A most opposed to the separating normal.
+	int closestFaceA = -1;
+	{
+		float dmin = FLT_MAX;
+		for (int face = 0; face < hullA->m_numFaces; face++)
+		{
+			const b3Float4 Normal = b3MakeFloat4(
+				facesA[hullA->m_faceOffset + face].m_plane.x,
+				facesA[hullA->m_faceOffset + face].m_plane.y,
+				facesA[hullA->m_faceOffset + face].m_plane.z,
+				0.f);
+			const b3Float4 faceANormalWS = b3QuatRotate(ornA, Normal);
+
+			float d = b3Dot(faceANormalWS, separatingNormal);
+			if (d < dmin)
+			{
+				dmin = d;
+				closestFaceA = face;
+				worldNormalsA1[pairIndex] = faceANormalWS;
+			}
+		}
+	}
+
+	int numVerticesA = 0;
+	if (closestFaceA >= 0)
+	{
+		numVerticesA = facesA[hullA->m_faceOffset + closestFaceA].m_numIndices;
+		// Never write past this pair's slot.
+		if (numVerticesA > capacityWorldVerts)
+			numVerticesA = capacityWorldVerts;
+		for (int e0 = 0; e0 < numVerticesA; e0++)
+		{
+			const b3Float4 a = verticesA[hullA->m_vertexOffset + indicesA[facesA[hullA->m_faceOffset + closestFaceA].m_indexOffset + e0]];
+			worldVertsA1[pairIndex * capacityWorldVerts + e0] = b3TransformPoint(a, posA, ornA);
+		}
+	}
+
+	clippingFaces[pairIndex].x = closestFaceA;
+	clippingFaces[pairIndex].y = closestFaceB;
+	clippingFaces[pairIndex].z = numVerticesA;
+	clippingFaces[pairIndex].w = numWorldVertsB1;
+
+	return numContactsOut;
+}
+
+// Processes one concave-vs-convex pair (selected by pairIdx): one triangle of the
+// concave shape A is tested against the convex shape B.
+//
+// Pair layout as read/written here: concavePairs[i].x / .y are the body indices of
+// A and B, .z is the face index of the triangle inside shape A, and .w is read as
+// the compound child index when B is a compound shape. .w is set to -1 to mark
+// the pair inactive.
+//
+// The triangle is wrapped in a temporary 5-face convex polyhedron (front, back and
+// three edge planes) and run through the face-normal tests of A, the face-normal
+// tests of B, and the full edge-edge test. The b3FindSeparatingAxis* helpers return
+// false as soon as an axis separates the shapes, so hasSeparatingAxis == 1 below
+// means that no tested axis separated them and sepAxis holds the axis with the
+// smallest overlap (dmin). In that case hasSeparatingNormals[i] is set to 1,
+// the axis (with w = dmin) is stored in concaveSeparatingNormalsOut[pairIdx], and
+// b3FindClippingFaces fills the clipping-face outputs. Otherwise the pair is
+// marked inactive.
+//
+// numConcavePairs is only referenced by the commented-out get_global_id path.
+__kernel void b3FindConcaveSeparatingAxisKernel(__global b3Int4* concavePairs,
+												__global const b3RigidBodyData* rigidBodies,
+												__global const b3Collidable* collidables,
+												__global const b3ConvexPolyhedronData* convexShapes,
+												__global const b3Float4* vertices,
+												__global const b3Float4* uniqueEdges,
+												__global const b3GpuFace* faces,
+												__global const int* indices,
+												__global const b3GpuChildShape* gpuChildShapes,
+												__global b3Aabb* aabbs,
+												__global b3Float4* concaveSeparatingNormalsOut,
+												__global b3Int4* clippingFacesOut,
+												__global b3Vector3* worldVertsA1Out,
+												__global b3Vector3* worldNormalsA1Out,
+												__global b3Vector3* worldVertsB1Out,
+												__global int* hasSeparatingNormals,
+												int vertexFaceCapacity,
+												int numConcavePairs,
+												int pairIdx)
+{
+	int i = pairIdx;
+	/*	int i = get_global_id(0);
+	if (i>=numConcavePairs)
+		return;
+	int pairIdx = i;
+	*/
+
+	int bodyIndexA = concavePairs[i].x;
+	int bodyIndexB = concavePairs[i].y;
+
+	int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;
+	int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;
+
+	int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;
+	int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;
+
+	// Only convex hulls and compounds of convex hulls are handled for B.
+	// Note that this early exit happens before hasSeparatingNormals[i] is cleared.
+	if (collidables[collidableIndexB].m_shapeType != SHAPE_CONVEX_HULL &&
+		collidables[collidableIndexB].m_shapeType != SHAPE_COMPOUND_OF_CONVEX_HULLS)
+	{
+		concavePairs[pairIdx].w = -1;
+		return;
+	}
+
+	hasSeparatingNormals[i] = 0;
+
+	//	int numFacesA = convexShapes[shapeIndexA].m_numFaces;
+	int numActualConcaveConvexTests = 0;
+
+	// Index of the triangle (face) inside concave shape A.
+	int f = concavePairs[i].z;
+
+	bool overlap = false;
+
+	// Temporary polyhedron describing the triangle; only the fields assigned below are set.
+	b3ConvexPolyhedronData convexPolyhedronA;
+
+	//add 3 vertices of the triangle
+	convexPolyhedronA.m_numVertices = 3;
+	convexPolyhedronA.m_vertexOffset = 0;
+	b3Float4 localCenter = b3MakeFloat4(0.f, 0.f, 0.f, 0.f);
+
+	b3GpuFace face = faces[convexShapes[shapeIndexA].m_faceOffset + f];
+	b3Aabb triAabb;
+	triAabb.m_minVec = b3MakeFloat4(1e30f, 1e30f, 1e30f, 0.f);
+	triAabb.m_maxVec = b3MakeFloat4(-1e30f, -1e30f, -1e30f, 0.f);
+
+	// Gather the triangle vertices, their sum (for the centroid) and their AABB.
+	// This loop variable shadows the outer `i`.
+	b3Float4 verticesA[3];
+	for (int i = 0; i < 3; i++)
+	{
+		int index = indices[face.m_indexOffset + i];
+		b3Float4 vert = vertices[convexShapes[shapeIndexA].m_vertexOffset + index];
+		verticesA[i] = vert;
+		localCenter += vert;
+
+		triAabb.m_minVec = b3MinFloat4(triAabb.m_minVec, vert);
+		triAabb.m_maxVec = b3MaxFloat4(triAabb.m_maxVec, vert);
+	}
+
+	// Per-axis AABB rejection against body B's AABB.
+	// NOTE(review): the triangle vertices are used untransformed here; presumably
+	// aabbs[bodyIndexB] is expressed in a matching space — confirm.
+	overlap = true;
+	overlap = (triAabb.m_minVec.x > aabbs[bodyIndexB].m_maxVec.x || triAabb.m_maxVec.x < aabbs[bodyIndexB].m_minVec.x) ? false : overlap;
+	overlap = (triAabb.m_minVec.z > aabbs[bodyIndexB].m_maxVec.z || triAabb.m_maxVec.z < aabbs[bodyIndexB].m_minVec.z) ? false : overlap;
+	overlap = (triAabb.m_minVec.y > aabbs[bodyIndexB].m_maxVec.y || triAabb.m_maxVec.y < aabbs[bodyIndexB].m_minVec.y) ? false : overlap;
+
+	if (overlap)
+	{
+		float dmin = FLT_MAX;
+		int hasSeparatingAxis = 5;
+		b3Float4 sepAxis = b3MakeFloat4(1, 2, 3, 4);
+
+		//	int localCC=0;
+		numActualConcaveConvexTests++;
+
+		//a triangle has 3 unique edges
+		convexPolyhedronA.m_numUniqueEdges = 3;
+		convexPolyhedronA.m_uniqueEdgesOffset = 0;
+		b3Float4 uniqueEdgesA[3];
+
+		uniqueEdgesA[0] = (verticesA[1] - verticesA[0]);
+		uniqueEdgesA[1] = (verticesA[2] - verticesA[1]);
+		uniqueEdgesA[2] = (verticesA[0] - verticesA[2]);
+
+		convexPolyhedronA.m_faceOffset = 0;
+
+		b3Float4 normal = b3MakeFloat4(face.m_plane.x, face.m_plane.y, face.m_plane.z, 0.f);
+
+		// Index storage: 3 (front) + 3 (back) + 2 per edge plane.
+		b3GpuFace facesA[B3_TRIANGLE_NUM_CONVEX_FACES];
+		int indicesA[3 + 3 + 2 + 2 + 2];
+		int curUsedIndices = 0;
+		int fidx = 0;
+
+		//front size of triangle
+		{
+			facesA[fidx].m_indexOffset = curUsedIndices;
+			indicesA[0] = 0;
+			indicesA[1] = 1;
+			indicesA[2] = 2;
+			curUsedIndices += 3;
+			float c = face.m_plane.w;
+			facesA[fidx].m_plane.x = normal.x;
+			facesA[fidx].m_plane.y = normal.y;
+			facesA[fidx].m_plane.z = normal.z;
+			facesA[fidx].m_plane.w = c;
+			facesA[fidx].m_numIndices = 3;
+		}
+		fidx++;
+		//back size of triangle
+		{
+			facesA[fidx].m_indexOffset = curUsedIndices;
+			indicesA[3] = 2;
+			indicesA[4] = 1;
+			indicesA[5] = 0;
+			curUsedIndices += 3;
+			// Negated normal with reversed winding; plane constant recomputed from vertex 0.
+			float c = b3Dot(normal, verticesA[0]);
+			//	float c1 = -face.m_plane.w;
+			facesA[fidx].m_plane.x = -normal.x;
+			facesA[fidx].m_plane.y = -normal.y;
+			facesA[fidx].m_plane.z = -normal.z;
+			facesA[fidx].m_plane.w = c;
+			facesA[fidx].m_numIndices = 3;
+		}
+		fidx++;
+
+		// One two-vertex "face" per triangle edge, with a normal perpendicular to
+		// both the triangle normal and the edge.
+		bool addEdgePlanes = true;
+		if (addEdgePlanes)
+		{
+			int numVertices = 3;
+			int prevVertex = numVertices - 1;
+			for (int i = 0; i < numVertices; i++)
+			{
+				b3Float4 v0 = verticesA[i];
+				b3Float4 v1 = verticesA[prevVertex];
+
+				b3Float4 edgeNormal = b3Normalized(b3Cross(normal, v1 - v0));
+				float c = -b3Dot(edgeNormal, v0);
+
+				facesA[fidx].m_numIndices = 2;
+				facesA[fidx].m_indexOffset = curUsedIndices;
+				indicesA[curUsedIndices++] = i;
+				indicesA[curUsedIndices++] = prevVertex;
+
+				facesA[fidx].m_plane.x = edgeNormal.x;
+				facesA[fidx].m_plane.y = edgeNormal.y;
+				facesA[fidx].m_plane.z = edgeNormal.z;
+				facesA[fidx].m_plane.w = c;
+				fidx++;
+				prevVertex = i;
+			}
+		}
+		convexPolyhedronA.m_numFaces = B3_TRIANGLE_NUM_CONVEX_FACES;
+		// Centroid of the triangle.
+		convexPolyhedronA.m_localCenter = localCenter * (1.f / 3.f);
+
+		b3Float4 posA = rigidBodies[bodyIndexA].m_pos;
+		posA.w = 0.f;
+		b3Float4 posB = rigidBodies[bodyIndexB].m_pos;
+		posB.w = 0.f;
+
+		b3Quaternion ornA = rigidBodies[bodyIndexA].m_quat;
+		b3Quaternion ornB = rigidBodies[bodyIndexB].m_quat;
+
+		///////////////////
+		///compound shape support
+
+		// For a compound B, replace B's pose and shape with those of the child
+		// selected by concavePairs[pairIdx].w.
+		if (collidables[collidableIndexB].m_shapeType == SHAPE_COMPOUND_OF_CONVEX_HULLS)
+		{
+			int compoundChild = concavePairs[pairIdx].w;
+			int childShapeIndexB = compoundChild;  //collidables[collidableIndexB].m_shapeIndex+compoundChild;
+			int childColIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;
+			b3Float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;
+			b3Quaternion childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;
+			b3Float4 newPosB = b3TransformPoint(childPosB, posB, ornB);
+			b3Quaternion newOrnB = b3QuatMul(ornB, childOrnB);
+			posB = newPosB;
+			ornB = newOrnB;
+			shapeIndexB = collidables[childColIndexB].m_shapeIndex;
+		}
+		//////////////////
+
+		// Vector between the two world-space centres; used to orient candidate axes.
+		b3Float4 c0local = convexPolyhedronA.m_localCenter;
+		b3Float4 c0 = b3TransformPoint(c0local, posA, ornA);
+		b3Float4 c1local = convexShapes[shapeIndexB].m_localCenter;
+		b3Float4 c1 = b3TransformPoint(c1local, posB, ornB);
+		const b3Float4 DeltaC2 = c0 - c1;
+
+		// Stage 1: face normals of the triangle polyhedron.
+		bool sepA = b3FindSeparatingAxis(&convexPolyhedronA, &convexShapes[shapeIndexB],
+										 posA, ornA,
+										 posB, ornB,
+										 DeltaC2,
+										 verticesA, uniqueEdgesA, facesA, indicesA,
+										 vertices, uniqueEdges, faces, indices,
+										 &sepAxis, &dmin);
+		hasSeparatingAxis = 4;
+		if (!sepA)
+		{
+			hasSeparatingAxis = 0;
+		}
+		else
+		{
+			// Stage 2: face normals of B (hulls swapped, DeltaC2 passed unchanged).
+			bool sepB = b3FindSeparatingAxis(&convexShapes[shapeIndexB], &convexPolyhedronA,
+											 posB, ornB,
+											 posA, ornA,
+											 DeltaC2,
+											 vertices, uniqueEdges, faces, indices,
+											 verticesA, uniqueEdgesA, facesA, indicesA,
+											 &sepAxis, &dmin);
+
+			if (!sepB)
+			{
+				hasSeparatingAxis = 0;
+			}
+			else
+			{
+				// Stage 3: all edge-edge cross products.
+				bool sepEE = b3FindSeparatingAxisEdgeEdge(&convexPolyhedronA, &convexShapes[shapeIndexB],
+														  posA, ornA,
+														  posB, ornB,
+														  DeltaC2,
+														  verticesA, uniqueEdgesA, facesA, indicesA,
+														  vertices, uniqueEdges, faces, indices,
+														  &sepAxis, &dmin, true);
+
+				if (!sepEE)
+				{
+					hasSeparatingAxis = 0;
+				}
+				else
+				{
+					hasSeparatingAxis = 1;
+				}
+			}
+		}
+
+		if (hasSeparatingAxis)
+		{
+			hasSeparatingNormals[i] = 1;
+			// The smallest overlap found travels in the w component of the axis.
+			sepAxis.w = dmin;
+			concaveSeparatingNormalsOut[pairIdx] = sepAxis;
+
+			//now compute clipping faces A and B, and world-space clipping vertices A and B...
+
+			float minDist = -1e30f;
+			float maxDist = 0.02f;
+
+			b3FindClippingFaces(sepAxis,
+								&convexPolyhedronA,
+								&convexShapes[shapeIndexB],
+								posA, ornA,
+								posB, ornB,
+								worldVertsA1Out,
+								worldNormalsA1Out,
+								worldVertsB1Out,
+								vertexFaceCapacity,
+								minDist, maxDist,
+								verticesA,
+								facesA,
+								indicesA,
+
+								vertices,
+								faces,
+								indices,
+								clippingFacesOut, pairIdx);
+		}
+		else
+		{
+			//mark this pair as in-active
+			concavePairs[pairIdx].w = -1;
+		}
+	}
+	else
+	{
+		//mark this pair as in-active
+		concavePairs[pairIdx].w = -1;
+	}
+}
+
+#endif  //B3_FIND_CONCAVE_SEPARATING_AXIS_H

+ 197 - 0
Dependencies/include/bullet3/Bullet3Collision/NarrowPhaseCollision/shared/b3FindSeparatingAxis.h

@@ -0,0 +1,197 @@
+#ifndef B3_FIND_SEPARATING_AXIS_H
+#define B3_FIND_SEPARATING_AXIS_H
+
+// Projects every vertex of `hull` onto the world-space axis `dir` and reports
+// the resulting interval through `min` / `max`.
+//
+// The axis is rotated into hull-local space once, so the vertices themselves
+// are never transformed; the contribution of the hull position is added to
+// both interval ends afterwards.
+inline void b3ProjectAxis(const b3ConvexPolyhedronData& hull, const b3Float4& pos, const b3Quaternion& orn, const b3Float4& dir, const b3AlignedObjectArray<b3Vector3>& vertices, b3Scalar& min, b3Scalar& max)
+{
+	const b3Float4 axisLocal = b3QuatRotate(orn.inverse(), dir);
+	const b3Scalar shift = b3Dot3F4(pos, dir);
+
+	b3Scalar lowest = FLT_MAX;
+	b3Scalar highest = -FLT_MAX;
+
+	const int vertexCount = hull.m_numVertices;
+	for (int v = 0; v < vertexCount; v++)
+	{
+		const b3Scalar projected = b3Dot3F4((b3Float4&)vertices[hull.m_vertexOffset + v], axisLocal);
+		if (projected < lowest) lowest = projected;
+		if (projected > highest) highest = projected;
+	}
+
+	// Only true when the loop above never ran (the seeds are still inverted).
+	if (lowest > highest)
+	{
+		const b3Scalar swapped = lowest;
+		lowest = highest;
+		highest = swapped;
+	}
+
+	min = lowest + shift;
+	max = highest + shift;
+}
+
+// Tests whether the projections of two hulls onto `sep_axis` overlap.
+//
+// Returns false (leaving `depth` untouched) when the two projected intervals
+// are disjoint. Otherwise returns true and writes the smaller of the two
+// interval overlaps to `depth`.
+inline bool b3TestSepAxis(const b3ConvexPolyhedronData& hullA, const b3ConvexPolyhedronData& hullB,
+						  const b3Float4& posA, const b3Quaternion& ornA,
+						  const b3Float4& posB, const b3Quaternion& ornB,
+						  const b3Float4& sep_axis, const b3AlignedObjectArray<b3Vector3>& verticesA, const b3AlignedObjectArray<b3Vector3>& verticesB, b3Scalar& depth)
+{
+	b3Scalar minA, maxA;
+	b3ProjectAxis(hullA, posA, ornA, sep_axis, verticesA, minA, maxA);
+
+	b3Scalar minB, maxB;
+	b3ProjectAxis(hullB, posB, ornB, sep_axis, verticesB, minB, maxB);
+
+	const bool disjoint = (maxA < minB) || (maxB < minA);
+	if (disjoint)
+	{
+		return false;
+	}
+
+	const b3Scalar overlapAB = maxA - minB;
+	b3Assert(overlapAB >= 0.0f);
+	const b3Scalar overlapBA = maxB - minA;
+	b3Assert(overlapBA >= 0.0f);
+
+	if (overlapAB < overlapBA)
+	{
+		depth = overlapAB;
+	}
+	else
+	{
+		depth = overlapBA;
+	}
+	return true;
+}
+
+// Separating-axis test between two convex hulls.
+//
+// Candidate axes are tested in this order: the world-space face normals of
+// hullA, the world-space face normals of hullB, and the normalized cross
+// product of every (unique edge of A, unique edge of B) pair whose cross
+// product is not almost zero. Each candidate is flipped so that its dot
+// product with deltaC2 (transformed centre of A minus transformed centre of B)
+// is not negative before it is tested.
+//
+// Returns false as soon as b3TestSepAxis reports disjoint projections for a
+// candidate. Otherwise returns true and `sep` holds the candidate with the
+// smallest overlap depth.
+//
+// NOTE: `sep` is only assigned when at least one candidate is tested; with no
+// faces and no usable edge pairs the final sign check reads the caller's value.
+inline bool b3FindSeparatingAxis(const b3ConvexPolyhedronData& hullA, const b3ConvexPolyhedronData& hullB,
+								 const b3Float4& posA1,
+								 const b3Quaternion& ornA,
+								 const b3Float4& posB1,
+								 const b3Quaternion& ornB,
+								 const b3AlignedObjectArray<b3Vector3>& verticesA,
+								 const b3AlignedObjectArray<b3Vector3>& uniqueEdgesA,
+								 const b3AlignedObjectArray<b3GpuFace>& facesA,
+								 const b3AlignedObjectArray<int>& indicesA,
+								 const b3AlignedObjectArray<b3Vector3>& verticesB,
+								 const b3AlignedObjectArray<b3Vector3>& uniqueEdgesB,
+								 const b3AlignedObjectArray<b3GpuFace>& facesB,
+								 const b3AlignedObjectArray<int>& indicesB,
+
+								 b3Vector3& sep)
+{
+	B3_PROFILE("findSeparatingAxis");
+
+	// Work on copies of the positions with the w component cleared.
+	b3Float4 posA = posA1;
+	posA.w = 0.f;
+	b3Float4 posB = posB1;
+	posB.w = 0.f;
+
+	// Vector between the transformed hull centres; used to orient every axis.
+	b3Float4 c0local = (b3Float4&)hullA.m_localCenter;
+	b3Float4 c0 = b3TransformPoint(c0local, posA, ornA);
+	b3Float4 c1local = (b3Float4&)hullB.m_localCenter;
+	b3Float4 c1 = b3TransformPoint(c1local, posB, ornB);
+	const b3Float4 deltaC2 = c0 - c1;
+
+	// Smallest overlap depth found so far.
+	b3Scalar dmin = FLT_MAX;
+
+	// NOTE(review): the TEST_INTERNAL_OBJECTS branches below refer to names
+	// that are not declared in this function (transA, transB, DeltaC2, Cross,
+	// gUseInternalObject, ...). They are kept as-is; presumably the macro is
+	// never defined for this header - confirm before enabling it.
+
+	int numFacesA = hullA.m_numFaces;
+	// Test normals from hullA
+	for (int i = 0; i < numFacesA; i++)
+	{
+		const b3Float4& normal = (b3Float4&)facesA[hullA.m_faceOffset + i].m_plane;
+		b3Float4 faceANormalWS = b3QuatRotate(ornA, normal);
+
+		if (b3Dot3F4(deltaC2, faceANormalWS) < 0)
+			faceANormalWS *= -1.f;
+
+#ifdef TEST_INTERNAL_OBJECTS
+		gExpectedNbTests++;
+		if (gUseInternalObject && !TestInternalObjects(transA, transB, DeltaC2, faceANormalWS, hullA, hullB, dmin))
+			continue;
+		gActualNbTests++;
+#endif
+
+		b3Scalar d;
+		if (!b3TestSepAxis(hullA, hullB, posA, ornA, posB, ornB, faceANormalWS, verticesA, verticesB, d))
+			return false;
+
+		if (d < dmin)
+		{
+			dmin = d;
+			sep = (b3Vector3&)faceANormalWS;
+		}
+	}
+
+	int numFacesB = hullB.m_numFaces;
+	// Test normals from hullB
+	for (int i = 0; i < numFacesB; i++)
+	{
+		b3Float4 normal = (b3Float4&)facesB[hullB.m_faceOffset + i].m_plane;
+		b3Float4 WorldNormal = b3QuatRotate(ornB, normal);
+
+		if (b3Dot3F4(deltaC2, WorldNormal) < 0)
+		{
+			WorldNormal *= -1.f;
+		}
+#ifdef TEST_INTERNAL_OBJECTS
+		gExpectedNbTests++;
+		if (gUseInternalObject && !TestInternalObjects(transA, transB, DeltaC2, WorldNormal, hullA, hullB, dmin))
+			continue;
+		gActualNbTests++;
+#endif
+
+		b3Scalar d;
+		if (!b3TestSepAxis(hullA, hullB, posA, ornA, posB, ornB, WorldNormal, verticesA, verticesB, d))
+			return false;
+
+		if (d < dmin)
+		{
+			dmin = d;
+			sep = (b3Vector3&)WorldNormal;
+		}
+	}
+
+	// Test edges: every unique edge of A against every unique edge of B.
+	for (int e0 = 0; e0 < hullA.m_numUniqueEdges; e0++)
+	{
+		const b3Float4& edge0 = (const b3Float4&)uniqueEdgesA[hullA.m_uniqueEdgesOffset + e0];
+		// Rotated once per outer iteration; it does not depend on e1.
+		b3Float4 edge0World = b3QuatRotate(ornA, edge0);
+
+		for (int e1 = 0; e1 < hullB.m_numUniqueEdges; e1++)
+		{
+			// Bound by reference, like edge0, instead of copying the edge.
+			const b3Float4& edge1 = (const b3Float4&)uniqueEdgesB[hullB.m_uniqueEdgesOffset + e1];
+			b3Float4 edge1World = b3QuatRotate(ornB, edge1);
+
+			b3Float4 crossje = b3Cross3(edge0World, edge1World);
+
+			// (Nearly) parallel edges give no usable axis.
+			if (!b3IsAlmostZero((b3Vector3&)crossje))
+			{
+				crossje = b3FastNormalized3(crossje);
+				if (b3Dot3F4(deltaC2, crossje) < 0)
+					crossje *= -1.f;
+
+#ifdef TEST_INTERNAL_OBJECTS
+				gExpectedNbTests++;
+				if (gUseInternalObject && !TestInternalObjects(transA, transB, DeltaC2, Cross, hullA, hullB, dmin))
+					continue;
+				gActualNbTests++;
+#endif
+
+				b3Scalar dist;
+				if (!b3TestSepAxis(hullA, hullB, posA, ornA, posB, ornB, crossje, verticesA, verticesB, dist))
+					return false;
+
+				if (dist < dmin)
+				{
+					dmin = dist;
+					sep = (b3Vector3&)crossje;
+				}
+			}
+		}
+	}
+
+	// Final orientation check: flip `sep` if it points against deltaC2.
+	if ((b3Dot3F4(-deltaC2, (b3Float4&)sep)) > 0.0f)
+		sep = -sep;
+
+	return true;
+}
+
+#endif  //B3_FIND_SEPARATING_AXIS_H

+ 888 - 0
Dependencies/include/bullet3/Bullet3Collision/NarrowPhaseCollision/shared/b3MprPenetration.h

@@ -0,0 +1,888 @@
+
+/***
+ * ---------------------------------
+ * Copyright (c)2012 Daniel Fiser <danfis@danfis.cz>
+ *
+ *  This file was ported from mpr.c file, part of libccd.
+ *  The Minkowski Portal Refinement implementation was ported
+ *  to OpenCL by Erwin Coumans for the Bullet 3 Physics library.
+ *  at http://github.com/erwincoumans/bullet3
+ *
+ *  Distributed under the OSI-approved BSD License (the "License");
+ *  see <http://www.opensource.org/licenses/bsd-license.php>.
+ *  This software is distributed WITHOUT ANY WARRANTY; without even the
+ *  implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *  See the License for more information.
+ */
+
+#ifndef B3_MPR_PENETRATION_H
+#define B3_MPR_PENETRATION_H
+
+#include "Bullet3Common/shared/b3PlatformDefinitions.h"
+#include "Bullet3Common/shared/b3Float4.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3ConvexPolyhedronData.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Collidable.h"
+
+#ifdef __cplusplus
+#define B3_MPR_SQRT sqrtf
+#else
+#define B3_MPR_SQRT sqrt
+#endif
+#define B3_MPR_FMIN(x, y) ((x) < (y) ? (x) : (y))
+#define B3_MPR_FABS fabs
+
+#define B3_MPR_TOLERANCE 1E-6f
+#define B3_MPR_MAX_ITERATIONS 1000
+
+// One support sample used by the MPR routines: the support points of both
+// objects plus their combination (b3MprSupport stores v = v1 - v2).
+struct _b3MprSupport_t
+{
+	b3Float4 v;   //!< Combined support point (b3MprSupport sets it to v1 - v2)
+	b3Float4 v1;  //!< Support point in obj1
+	b3Float4 v2;  //!< Support point in obj2
+};
+typedef struct _b3MprSupport_t b3MprSupport_t;
+
+// Fixed-capacity simplex (at most four support samples) used as the MPR portal.
+// The number of valid entries is last + 1 (see b3MprSimplexSize).
+struct _b3MprSimplex_t
+{
+	b3MprSupport_t ps[4];
+	int last;  //!< index of last added point
+};
+typedef struct _b3MprSimplex_t b3MprSimplex_t;
+
+// Writable access to simplex entry `idx`; the index is not range-checked.
+inline b3MprSupport_t *b3MprSimplexPointW(b3MprSimplex_t *s, int idx)
+{
+	b3MprSupport_t *entry = s->ps + idx;
+	return entry;
+}
+
+// Records that the simplex holds `size` entries by storing the index of the
+// last one.
+inline void b3MprSimplexSetSize(b3MprSimplex_t *s, int size)
+{
+	const int lastIndex = size - 1;
+	s->last = lastIndex;
+}
+
+// Number of entries currently stored in the simplex (last index plus one).
+inline int b3MprSimplexSize(const b3MprSimplex_t *s)
+{
+	const int count = s->last + 1;
+	return count;
+}
+
+// Read-only access to simplex entry `idx`.
+inline const b3MprSupport_t *b3MprSimplexPoint(const b3MprSimplex_t *s, int idx)
+{
+	// The index is deliberately not range-checked.
+	const b3MprSupport_t *entry = s->ps + idx;
+	return entry;
+}
+
+// Copies the whole support sample `*s` (v, v1 and v2) into `*d`.
+inline void b3MprSupportCopy(b3MprSupport_t *d, const b3MprSupport_t *s)
+{
+	*d = *s;
+}
+
+// Overwrites simplex entry `pos` with a copy of `*a`; the stored size is not
+// changed.
+inline void b3MprSimplexSet(b3MprSimplex_t *s, size_t pos, const b3MprSupport_t *a)
+{
+	b3MprSupport_t *slot = &s->ps[pos];
+	b3MprSupportCopy(slot, a);
+}
+
+// Exchanges the simplex entries at `pos1` and `pos2`.
+inline void b3MprSimplexSwap(b3MprSimplex_t *s, size_t pos1, size_t pos2)
+{
+	b3MprSupport_t *first = &s->ps[pos1];
+	b3MprSupport_t *second = &s->ps[pos2];
+
+	// Classic three-copy swap through a temporary.
+	b3MprSupport_t saved;
+	b3MprSupportCopy(&saved, first);
+	b3MprSupportCopy(first, second);
+	b3MprSupportCopy(second, &saved);
+}
+
+// Returns 1 when |val| is strictly below FLT_EPSILON, otherwise 0.
+inline int b3MprIsZero(float val)
+{
+	return (B3_MPR_FABS(val) < FLT_EPSILON) ? 1 : 0;
+}
+
+// Approximate float equality.
+//
+// Returns 1 when the absolute difference is below FLT_EPSILON, or when it is
+// below FLT_EPSILON scaled by the larger of the two magnitudes; otherwise 0.
+inline int b3MprEq(float _a, float _b)
+{
+	const float diff = B3_MPR_FABS(_a - _b);
+
+	// Absolute test first (diff is already non-negative).
+	if (diff < FLT_EPSILON)
+		return 1;
+
+	// Relative test against the larger magnitude.
+	const float magA = B3_MPR_FABS(_a);
+	const float magB = B3_MPR_FABS(_b);
+	const float scale = (magB > magA) ? magB : magA;
+	return diff < FLT_EPSILON * scale;
+}
+
+// Component-wise approximate equality of the x, y and z components (w is not
+// compared). Returns 1 when all three match according to b3MprEq, else 0.
+inline int b3MprVec3Eq(const b3Float4 *a, const b3Float4 *b)
+{
+	if (!b3MprEq(a->x, b->x))
+		return 0;
+	if (!b3MprEq(a->y, b->y))
+		return 0;
+	return b3MprEq(a->z, b->z) ? 1 : 0;
+}
+
+// Returns the hull vertex selected by b3MaxDot for direction `supportVec`
+// (both in the hull's local space). A hull without vertices yields the zero
+// vector.
+inline b3Float4 b3LocalGetSupportVertex(b3Float4ConstArg supportVec, __global const b3ConvexPolyhedronData_t *hull, b3ConstArray(b3Float4) verticesA)
+{
+	if (hull->m_numVertices <= 0)
+	{
+		return b3MakeFloat4(0, 0, 0, 0);
+	}
+
+	// b3MaxDot also reports the winning dot product; only the index is used.
+	float bestDot = -B3_LARGE_FLOAT;
+	const int bestIndex = b3MaxDot(supportVec, &verticesA[hull->m_vertexOffset], hull->m_numVertices, &bestDot);
+	return verticesA[hull->m_vertexOffset + bestIndex];
+}
+
+// Computes the world-space support point of one body for direction `_dir`.
+//
+// The direction (world space, w ignored) is rotated into the body's local
+// space, the support vertex of the body's convex hull is looked up there, and
+// the vertex is transformed back to world space and written to `*outp`.
+//
+// `pairIndex`, `sepAxis` and `logme` are not used by the body; they are kept
+// so that existing callers keep compiling.
+B3_STATIC void b3MprConvexSupport(int pairIndex, int bodyIndex, b3ConstArray(b3RigidBodyData_t) cpuBodyBuf,
+								  b3ConstArray(b3ConvexPolyhedronData_t) cpuConvexData,
+								  b3ConstArray(b3Collidable_t) cpuCollidables,
+								  b3ConstArray(b3Float4) cpuVertices,
+								  __global b3Float4 *sepAxis,
+								  const b3Float4 *_dir, b3Float4 *outp, int logme)
+{
+	//dir is in worldspace, move to local space
+
+	b3Float4 pos = cpuBodyBuf[bodyIndex].m_pos;
+	b3Quat orn = cpuBodyBuf[bodyIndex].m_quat;
+
+	b3Float4 dir = b3MakeFloat4((*_dir).x, (*_dir).y, (*_dir).z, 0.f);
+
+	const b3Float4 localDir = b3QuatRotate(b3QuatInverse(orn), dir);
+
+	//find local support vertex
+	int colIndex = cpuBodyBuf[bodyIndex].m_collidableIdx;
+
+	b3Assert(cpuCollidables[colIndex].m_shapeType == SHAPE_CONVEX_HULL);
+	__global const b3ConvexPolyhedronData_t *hull = &cpuConvexData[cpuCollidables[colIndex].m_shapeIndex];
+
+	// The `logme != 0` path used to repeat this lookup inline, but left the
+	// result uninitialised for a hull without vertices. Using the helper for
+	// both paths gives the zero-vector fallback in every case.
+	const b3Float4 pInA = b3LocalGetSupportVertex(localDir, hull, cpuVertices);
+
+	//move vertex to world space
+	*outp = b3TransformPoint(pInA, pos, orn);
+}
+
+// Fills `supp` with a support sample for direction `_dir`: v1 is the support
+// point of body A along `_dir`, v2 the support point of body B along the
+// opposite direction, and v their difference (v1 - v2).
+inline void b3MprSupport(int pairIndex, int bodyIndexA, int bodyIndexB, b3ConstArray(b3RigidBodyData_t) cpuBodyBuf,
+						 b3ConstArray(b3ConvexPolyhedronData_t) cpuConvexData,
+						 b3ConstArray(b3Collidable_t) cpuCollidables,
+						 b3ConstArray(b3Float4) cpuVertices,
+						 __global b3Float4 *sepAxis,
+						 const b3Float4 *_dir, b3MprSupport_t *supp)
+{
+	// Body A is sampled along the requested direction ...
+	b3Float4 forward = *_dir;
+	b3MprConvexSupport(pairIndex, bodyIndexA, cpuBodyBuf, cpuConvexData, cpuCollidables, cpuVertices, sepAxis, &forward, &supp->v1, 0);
+
+	// ... and body B along the reversed one.
+	b3Float4 backward = *_dir * -1.f;
+	b3MprConvexSupport(pairIndex, bodyIndexB, cpuBodyBuf, cpuConvexData, cpuCollidables, cpuVertices, sepAxis, &backward, &supp->v2, 0);
+
+	supp->v = supp->v1 - supp->v2;
+}
+
+// Seeds `center` from the two body positions: v1 = position of body A,
+// v2 = position of body B, v = v1 - v2.
+inline void b3FindOrigin(int bodyIndexA, int bodyIndexB, b3ConstArray(b3RigidBodyData_t) cpuBodyBuf, b3MprSupport_t *center)
+{
+	b3MprSupport_t *out = center;
+	out->v1 = cpuBodyBuf[bodyIndexA].m_pos;
+	out->v2 = cpuBodyBuf[bodyIndexB].m_pos;
+	out->v = out->v1 - out->v2;
+}
+
+// Stores (x, y, z) in `*v` and clears its w component.
+inline void b3MprVec3Set(b3Float4 *v, float x, float y, float z)
+{
+	v->w = 0.f;
+	v->x = x;
+	v->y = y;
+	v->z = z;
+}
+
+// Adds the x, y and z components of `*w` to `*v`; the w component of `*v` is
+// left as it is.
+inline void b3MprVec3Add(b3Float4 *v, const b3Float4 *w)
+{
+	v->x += w->x;
+	v->y += w->y;
+	v->z += w->z;
+}
+
+// Copies the whole vector `*w` into `*v` by plain assignment.
+inline void b3MprVec3Copy(b3Float4 *v, const b3Float4 *w)
+{
+	*v = *w;
+}
+
+// Scales `*d` in place by `k` using the vector type's own `*=` operator.
+// NOTE(review): whether w is scaled too depends on that operator - confirm.
+inline void b3MprVec3Scale(b3Float4 *d, float k)
+{
+	*d *= k;
+}
+
+// Dot product of `*a` and `*b` as computed by b3Dot3F4.
+inline float b3MprVec3Dot(const b3Float4 *a, const b3Float4 *b)
+{
+	const float result = b3Dot3F4(*a, *b);
+	return result;
+}
+
+// Squared length of `*v` (its dot product with itself).
+inline float b3MprVec3Len2(const b3Float4 *v)
+{
+	const float lengthSquared = b3MprVec3Dot(v, v);
+	return lengthSquared;
+}
+
+// Scales `*d` in place by the reciprocal of its length.
+// There is no guard for a zero-length input.
+inline void b3MprVec3Normalize(b3Float4 *d)
+{
+	const float lengthSquared = b3MprVec3Len2(d);
+	const float inverseLength = 1.f / B3_MPR_SQRT(lengthSquared);
+	b3MprVec3Scale(d, inverseLength);
+}
+
+// Stores the cross product of `*a` and `*b` (as computed by b3Cross3) in `*d`.
+inline void b3MprVec3Cross(b3Float4 *d, const b3Float4 *a, const b3Float4 *b)
+{
+	*d = b3Cross3(*a, *b);
+}
+
+// Stores the difference `*v - *w` in `*d`.
+inline void b3MprVec3Sub2(b3Float4 *d, const b3Float4 *v, const b3Float4 *w)
+{
+	*d = *v - *w;
+}
+
+// Writes to `*dir` the normalized normal of the portal triangle (v1, v2, v3),
+// computed as (v2 - v1) x (v3 - v1).
+inline void b3PortalDir(const b3MprSimplex_t *portal, b3Float4 *dir)
+{
+	const b3Float4 *p1 = &b3MprSimplexPoint(portal, 1)->v;
+	const b3Float4 *p2 = &b3MprSimplexPoint(portal, 2)->v;
+	const b3Float4 *p3 = &b3MprSimplexPoint(portal, 3)->v;
+
+	b3Float4 edge12;
+	b3Float4 edge13;
+	b3MprVec3Sub2(&edge12, p2, p1);
+	b3MprVec3Sub2(&edge13, p3, p1);
+
+	b3MprVec3Cross(dir, &edge12, &edge13);
+	b3MprVec3Normalize(dir);
+}
+
+// Returns non-zero when the dot product of `dir` with portal point v1 is
+// positive or approximately zero.
+inline int portalEncapsulesOrigin(const b3MprSimplex_t *portal,
+								  const b3Float4 *dir)
+{
+	const float along = b3MprVec3Dot(dir, &b3MprSimplexPoint(portal, 1)->v);
+	if (b3MprIsZero(along))
+		return 1;
+	return along > 0.f;
+}
+
+// Returns non-zero when the new support point `v4` lies no further than
+// B3_MPR_TOLERANCE beyond the portal along `dir`.
+//
+// The measure is the smallest of dot(v4 - vi, dir) for the portal points
+// v1, v2 and v3.
+inline int portalReachTolerance(const b3MprSimplex_t *portal,
+								const b3MprSupport_t *v4,
+								const b3Float4 *dir)
+{
+	const float proj1 = b3MprVec3Dot(&b3MprSimplexPoint(portal, 1)->v, dir);
+	const float proj2 = b3MprVec3Dot(&b3MprSimplexPoint(portal, 2)->v, dir);
+	const float proj3 = b3MprVec3Dot(&b3MprSimplexPoint(portal, 3)->v, dir);
+	const float proj4 = b3MprVec3Dot(&v4->v, dir);
+
+	const float gap1 = proj4 - proj1;
+	const float gap2 = proj4 - proj2;
+	const float gap3 = proj4 - proj3;
+
+	float smallest = B3_MPR_FMIN(gap1, gap2);
+	smallest = B3_MPR_FMIN(smallest, gap3);
+
+	if (b3MprEq(smallest, B3_MPR_TOLERANCE))
+		return 1;
+	return smallest < B3_MPR_TOLERANCE;
+}
+
+// Returns non-zero when the dot product of support point `v4` with `dir` is
+// positive or approximately zero. `portal` itself is not inspected.
+inline int portalCanEncapsuleOrigin(const b3MprSimplex_t *portal,
+									const b3MprSupport_t *v4,
+									const b3Float4 *dir)
+{
+	const float along = b3MprVec3Dot(&v4->v, dir);
+	if (b3MprIsZero(along))
+		return 1;
+	return along > 0.f;
+}
+
+// Replaces one of the portal points v1..v3 with the new support point `v4`.
+//
+// The slot is chosen from the signs of the portal points' dot products with
+// (v4 x v0).
+inline void b3ExpandPortal(b3MprSimplex_t *portal,
+						   const b3MprSupport_t *v4)
+{
+	b3Float4 planeNormal;
+	b3MprVec3Cross(&planeNormal, &v4->v, &b3MprSimplexPoint(portal, 0)->v);
+
+	size_t slot;
+	if (b3MprVec3Dot(&b3MprSimplexPoint(portal, 1)->v, &planeNormal) > 0.f)
+	{
+		// v1 on the positive side: v2 decides between slots 1 and 3.
+		const float side2 = b3MprVec3Dot(&b3MprSimplexPoint(portal, 2)->v, &planeNormal);
+		slot = (side2 > 0.f) ? 1 : 3;
+	}
+	else
+	{
+		// v1 not on the positive side: v3 decides between slots 2 and 1.
+		const float side3 = b3MprVec3Dot(&b3MprSimplexPoint(portal, 3)->v, &planeNormal);
+		slot = (side3 > 0.f) ? 2 : 1;
+	}
+
+	b3MprSimplexSet(portal, slot, v4);
+}
+
+// MPR phase 1: builds the initial portal (v0 plus three support points) for
+// the body pair.
+//
+// Return value:
+//   0  - a portal with four points was stored in `portal`
+//   1  - the origin lies on v1
+//   2  - the origin lies on the v0-v1 segment
+//  -1  - a support point shows the origin cannot be reached (no intersection)
+//
+// `sepAxis` is only forwarded to b3MprSupport; `hasSepAxis` is not referenced
+// by the body. The final while loop has no explicit iteration cap: it ends
+// when the simplex reaches four points or a support test returns -1.
+B3_STATIC int b3DiscoverPortal(int pairIndex, int bodyIndexA, int bodyIndexB, b3ConstArray(b3RigidBodyData_t) cpuBodyBuf,
+							   b3ConstArray(b3ConvexPolyhedronData_t) cpuConvexData,
+							   b3ConstArray(b3Collidable_t) cpuCollidables,
+							   b3ConstArray(b3Float4) cpuVertices,
+							   __global b3Float4 *sepAxis,
+							   __global int *hasSepAxis,
+							   b3MprSimplex_t *portal)
+{
+	b3Float4 dir, va, vb;
+	float dot;
+	int cont;
+
+	// vertex 0 is center of portal
+	b3FindOrigin(bodyIndexA, bodyIndexB, cpuBodyBuf, b3MprSimplexPointW(portal, 0));
+	// the simplex now holds v0 only
+	b3MprSimplexSetSize(portal, 1);
+
+	b3Float4 zero = b3MakeFloat4(0, 0, 0, 0);
+	b3Float4 *b3mpr_vec3_origin = &zero;
+
+	if (b3MprVec3Eq(&b3MprSimplexPoint(portal, 0)->v, b3mpr_vec3_origin))
+	{
+		// Portal's center lies on origin (0,0,0) => we know that objects
+		// intersect but we would need to know penetration info.
+		// So move center little bit...
+		b3MprVec3Set(&va, FLT_EPSILON * 10.f, 0.f, 0.f);
+		b3MprVec3Add(&b3MprSimplexPointW(portal, 0)->v, &va);
+	}
+
+	// vertex 1 = support in direction of origin
+	b3MprVec3Copy(&dir, &b3MprSimplexPoint(portal, 0)->v);
+	b3MprVec3Scale(&dir, -1.f);
+	b3MprVec3Normalize(&dir);
+
+	b3MprSupport(pairIndex, bodyIndexA, bodyIndexB, cpuBodyBuf, cpuConvexData, cpuCollidables, cpuVertices, sepAxis, &dir, b3MprSimplexPointW(portal, 1));
+
+	b3MprSimplexSetSize(portal, 2);
+
+	// test if origin isn't outside of v1
+	dot = b3MprVec3Dot(&b3MprSimplexPoint(portal, 1)->v, &dir);
+
+	if (b3MprIsZero(dot) || dot < 0.f)
+		return -1;
+
+	// vertex 2
+	b3MprVec3Cross(&dir, &b3MprSimplexPoint(portal, 0)->v,
+				   &b3MprSimplexPoint(portal, 1)->v);
+	if (b3MprIsZero(b3MprVec3Len2(&dir)))
+	{
+		// v0 and v1 are (nearly) parallel, so no search direction exists
+		if (b3MprVec3Eq(&b3MprSimplexPoint(portal, 1)->v, b3mpr_vec3_origin))
+		{
+			// origin lies on v1
+			return 1;
+		}
+		else
+		{
+			// origin lies on v0-v1 segment
+			return 2;
+		}
+	}
+
+	b3MprVec3Normalize(&dir);
+	b3MprSupport(pairIndex, bodyIndexA, bodyIndexB, cpuBodyBuf, cpuConvexData, cpuCollidables, cpuVertices, sepAxis, &dir, b3MprSimplexPointW(portal, 2));
+
+	dot = b3MprVec3Dot(&b3MprSimplexPoint(portal, 2)->v, &dir);
+	if (b3MprIsZero(dot) || dot < 0.f)
+		return -1;
+
+	b3MprSimplexSetSize(portal, 3);
+
+	// vertex 3 direction
+	b3MprVec3Sub2(&va, &b3MprSimplexPoint(portal, 1)->v,
+				  &b3MprSimplexPoint(portal, 0)->v);
+	b3MprVec3Sub2(&vb, &b3MprSimplexPoint(portal, 2)->v,
+				  &b3MprSimplexPoint(portal, 0)->v);
+	b3MprVec3Cross(&dir, &va, &vb);
+	b3MprVec3Normalize(&dir);
+
+	// it is better to form portal faces to be oriented "outside" origin
+	dot = b3MprVec3Dot(&dir, &b3MprSimplexPoint(portal, 0)->v);
+	if (dot > 0.f)
+	{
+		b3MprSimplexSwap(portal, 1, 2);
+		b3MprVec3Scale(&dir, -1.f);
+	}
+
+	// keep requesting a candidate v3 until neither side test below rejects it
+	while (b3MprSimplexSize(portal) < 4)
+	{
+		b3MprSupport(pairIndex, bodyIndexA, bodyIndexB, cpuBodyBuf, cpuConvexData, cpuCollidables, cpuVertices, sepAxis, &dir, b3MprSimplexPointW(portal, 3));
+
+		dot = b3MprVec3Dot(&b3MprSimplexPoint(portal, 3)->v, &dir);
+		if (b3MprIsZero(dot) || dot < 0.f)
+			return -1;
+
+		cont = 0;
+
+		// test if origin is outside (v1, v0, v3) - set v2 as v3 and
+		// continue
+		b3MprVec3Cross(&va, &b3MprSimplexPoint(portal, 1)->v,
+					   &b3MprSimplexPoint(portal, 3)->v);
+		dot = b3MprVec3Dot(&va, &b3MprSimplexPoint(portal, 0)->v);
+		if (dot < 0.f && !b3MprIsZero(dot))
+		{
+			b3MprSimplexSet(portal, 2, b3MprSimplexPoint(portal, 3));
+			cont = 1;
+		}
+
+		if (!cont)
+		{
+			// test if origin is outside (v3, v0, v2) - set v1 as v3 and
+			// continue
+			b3MprVec3Cross(&va, &b3MprSimplexPoint(portal, 3)->v,
+						   &b3MprSimplexPoint(portal, 2)->v);
+			dot = b3MprVec3Dot(&va, &b3MprSimplexPoint(portal, 0)->v);
+			if (dot < 0.f && !b3MprIsZero(dot))
+			{
+				b3MprSimplexSet(portal, 1, b3MprSimplexPoint(portal, 3));
+				cont = 1;
+			}
+		}
+
+		if (cont)
+		{
+			// a point was replaced: recompute the search direction and retry
+			b3MprVec3Sub2(&va, &b3MprSimplexPoint(portal, 1)->v,
+						  &b3MprSimplexPoint(portal, 0)->v);
+			b3MprVec3Sub2(&vb, &b3MprSimplexPoint(portal, 2)->v,
+						  &b3MprSimplexPoint(portal, 0)->v);
+			b3MprVec3Cross(&dir, &va, &vb);
+			b3MprVec3Normalize(&dir);
+		}
+		else
+		{
+			// v3 accepted: the portal is complete
+			b3MprSimplexSetSize(portal, 4);
+		}
+	}
+
+	return 0;
+}
+
+// MPR phase 2: repeatedly pushes the portal outwards until it either encloses
+// the origin (returns 0) or it is clear that it cannot (returns -1).
+//
+// Also returns -1 when B3_MPR_MAX_ITERATIONS passes finish without a decision.
+B3_STATIC int b3RefinePortal(int pairIndex, int bodyIndexA, int bodyIndexB, b3ConstArray(b3RigidBodyData_t) cpuBodyBuf,
+							 b3ConstArray(b3ConvexPolyhedronData_t) cpuConvexData,
+							 b3ConstArray(b3Collidable_t) cpuCollidables,
+							 b3ConstArray(b3Float4) cpuVertices,
+							 __global b3Float4 *sepAxis,
+							 b3MprSimplex_t *portal)
+{
+	b3Float4 outward;
+	b3MprSupport_t candidate;
+
+	int pass = 0;
+	while (pass < B3_MPR_MAX_ITERATIONS)
+	{
+		// Direction leaving the portal through the (v1, v2, v3) face.
+		b3PortalDir(portal, &outward);
+
+		// Done as soon as the origin is inside the portal.
+		if (portalEncapsulesOrigin(portal, &outward))
+		{
+			return 0;
+		}
+
+		// Next support point along the outward direction.
+		b3MprSupport(pairIndex, bodyIndexA, bodyIndexB, cpuBodyBuf, cpuConvexData, cpuCollidables, cpuVertices, sepAxis, &outward, &candidate);
+
+		// Give up if the new point cannot bring the origin inside ...
+		if (!portalCanEncapsuleOrigin(portal, &candidate, &outward))
+		{
+			return -1;
+		}
+		// ... or if the expansion no longer makes progress beyond tolerance.
+		if (portalReachTolerance(portal, &candidate, &outward))
+		{
+			return -1;
+		}
+
+		// Swap the candidate into the portal so the triangle keeps facing
+		// away from v0.
+		b3ExpandPortal(portal, &candidate);
+
+		pass++;
+	}
+
+	return -1;
+}
+
+// Estimates a contact position from the portal.
+//
+// Four weights b[0..3] are computed for the portal points. The v1 (object 1)
+// and v2 (object 2) support points are blended with those weights into p1 and
+// p2, and `*pos` receives the midpoint of p1 and p2.
+//
+// NOTE(review): `sum` is not re-checked after the fallback weights are
+// computed, so a zero sum there would make `inv` non-finite.
+B3_STATIC void b3FindPos(const b3MprSimplex_t *portal, b3Float4 *pos)
+{
+	b3Float4 zero = b3MakeFloat4(0, 0, 0, 0);
+	b3Float4 *b3mpr_vec3_origin = &zero;
+
+	b3Float4 dir;
+	size_t i;
+	float b[4], sum, inv;
+	b3Float4 vec, p1, p2;
+
+	b3PortalDir(portal, &dir);
+
+	// use barycentric coordinates of tetrahedron to find origin
+	b3MprVec3Cross(&vec, &b3MprSimplexPoint(portal, 1)->v,
+				   &b3MprSimplexPoint(portal, 2)->v);
+	b[0] = b3MprVec3Dot(&vec, &b3MprSimplexPoint(portal, 3)->v);
+
+	b3MprVec3Cross(&vec, &b3MprSimplexPoint(portal, 3)->v,
+				   &b3MprSimplexPoint(portal, 2)->v);
+	b[1] = b3MprVec3Dot(&vec, &b3MprSimplexPoint(portal, 0)->v);
+
+	b3MprVec3Cross(&vec, &b3MprSimplexPoint(portal, 0)->v,
+				   &b3MprSimplexPoint(portal, 1)->v);
+	b[2] = b3MprVec3Dot(&vec, &b3MprSimplexPoint(portal, 3)->v);
+
+	b3MprVec3Cross(&vec, &b3MprSimplexPoint(portal, 2)->v,
+				   &b3MprSimplexPoint(portal, 1)->v);
+	b[3] = b3MprVec3Dot(&vec, &b3MprSimplexPoint(portal, 0)->v);
+
+	sum = b[0] + b[1] + b[2] + b[3];
+
+	// fallback: drop v0 and weight only v1..v3, using the portal direction
+	if (b3MprIsZero(sum) || sum < 0.f)
+	{
+		b[0] = 0.f;
+
+		b3MprVec3Cross(&vec, &b3MprSimplexPoint(portal, 2)->v,
+					   &b3MprSimplexPoint(portal, 3)->v);
+		b[1] = b3MprVec3Dot(&vec, &dir);
+		b3MprVec3Cross(&vec, &b3MprSimplexPoint(portal, 3)->v,
+					   &b3MprSimplexPoint(portal, 1)->v);
+		b[2] = b3MprVec3Dot(&vec, &dir);
+		b3MprVec3Cross(&vec, &b3MprSimplexPoint(portal, 1)->v,
+					   &b3MprSimplexPoint(portal, 2)->v);
+		b[3] = b3MprVec3Dot(&vec, &dir);
+
+		sum = b[1] + b[2] + b[3];
+	}
+
+	inv = 1.f / sum;
+
+	// accumulate the weighted support points of each object separately
+	b3MprVec3Copy(&p1, b3mpr_vec3_origin);
+	b3MprVec3Copy(&p2, b3mpr_vec3_origin);
+	for (i = 0; i < 4; i++)
+	{
+		b3MprVec3Copy(&vec, &b3MprSimplexPoint(portal, i)->v1);
+		b3MprVec3Scale(&vec, b[i]);
+		b3MprVec3Add(&p1, &vec);
+
+		b3MprVec3Copy(&vec, &b3MprSimplexPoint(portal, i)->v2);
+		b3MprVec3Scale(&vec, b[i]);
+		b3MprVec3Add(&p2, &vec);
+	}
+	b3MprVec3Scale(&p1, inv);
+	b3MprVec3Scale(&p2, inv);
+
+	// result is the midpoint between the two per-object points
+	b3MprVec3Copy(pos, &p1);
+	b3MprVec3Add(pos, &p2);
+	b3MprVec3Scale(pos, 0.5);
+}
+
+// Squared distance between `*a` and `*b`.
+inline float b3MprVec3Dist2(const b3Float4 *a, const b3Float4 *b)
+{
+	b3Float4 delta;
+	b3MprVec3Sub2(&delta, a, b);
+
+	const float distanceSquared = b3MprVec3Len2(&delta);
+	return distanceSquared;
+}
+
+// Squared distance from point `P` to the segment from `x0` to `b`.
+//
+// When `witness` is non-null it receives the closest point on the segment.
+// The segment is assumed to have non-zero length: the parameter t is divided
+// by the squared segment length without a guard.
+inline float _b3MprVec3PointSegmentDist2(const b3Float4 *P,
+										 const b3Float4 *x0,
+										 const b3Float4 *b,
+										 b3Float4 *witness)
+{
+	// The computation comes from solving equation of segment:
+	//      S(t) = x0 + t.d
+	//          where - x0 is initial point of segment
+	//                - d is direction of segment from x0 (|d| > 0)
+	//                - t belongs to <0, 1> interval
+	//
+	// Then, the distance from a segment to some point P can be expressed:
+	//      D(t) = |x0 + t.d - P|^2
+	//          which is distance from any point on segment. Minimization
+	//          of this function brings distance from P to segment.
+	// Minimization of D(t) leads to a simple quadratic equation that is
+	// straightforward to solve.
+	//
+	// Bonus of this method is witness point for free.
+
+	float dist, t;
+	b3Float4 d, a;
+
+	// direction of segment
+	b3MprVec3Sub2(&d, b, x0);
+
+	// precompute vector from P to x0
+	b3MprVec3Sub2(&a, x0, P);
+
+	t = -1.f * b3MprVec3Dot(&a, &d);
+	t /= b3MprVec3Len2(&d);
+
+	if (t < 0.f || b3MprIsZero(t))
+	{
+		// closest point is the segment start
+		dist = b3MprVec3Dist2(x0, P);
+		if (witness)
+			b3MprVec3Copy(witness, x0);
+	}
+	else if (t > 1.f || b3MprEq(t, 1.f))
+	{
+		// closest point is the segment end
+		dist = b3MprVec3Dist2(b, P);
+		if (witness)
+			b3MprVec3Copy(witness, b);
+	}
+	else
+	{
+		// closest point lies strictly inside the segment
+		if (witness)
+		{
+			b3MprVec3Copy(witness, &d);
+			b3MprVec3Scale(witness, t);
+			b3MprVec3Add(witness, x0);
+			dist = b3MprVec3Dist2(witness, P);
+		}
+		else
+		{
+			// recycling variables
+			b3MprVec3Scale(&d, t);
+			b3MprVec3Add(&d, &a);
+			dist = b3MprVec3Len2(&d);
+		}
+	}
+
+	return dist;
+}
+
+// Squared distance from point `P` to the triangle (x0, B, C).
+//
+// When `witness` is non-null it receives the closest point found. If the
+// unconstrained minimum (s, t) falls outside the triangle, the result is the
+// smallest of the distances to the three edges.
+//
+// NOTE(review): the divisions by (w * v - r * r) and by w are unguarded, so a
+// degenerate triangle produces non-finite s / t - confirm callers avoid this.
+inline float b3MprVec3PointTriDist2(const b3Float4 *P,
+									const b3Float4 *x0, const b3Float4 *B,
+									const b3Float4 *C,
+									b3Float4 *witness)
+{
+	// Computation comes from analytic expression for triangle (x0, B, C)
+	//      T(s, t) = x0 + s.d1 + t.d2, where d1 = B - x0 and d2 = C - x0 and
+	// Then equation for distance is:
+	//      D(s, t) = | T(s, t) - P |^2
+	// This leads to minimization of quadratic function of two variables.
+	// The solution is taken only if s is between 0 and 1, t is
+	// between 0 and 1 and t + s < 1, otherwise distance from segment is
+	// computed.
+
+	b3Float4 d1, d2, a;
+	float u, v, w, p, q, r;
+	float s, t, dist, dist2;
+	b3Float4 witness2;
+
+	b3MprVec3Sub2(&d1, B, x0);
+	b3MprVec3Sub2(&d2, C, x0);
+	b3MprVec3Sub2(&a, x0, P);
+
+	// coefficients of the quadratic D(s, t)
+	u = b3MprVec3Dot(&a, &a);
+	v = b3MprVec3Dot(&d1, &d1);
+	w = b3MprVec3Dot(&d2, &d2);
+	p = b3MprVec3Dot(&a, &d1);
+	q = b3MprVec3Dot(&a, &d2);
+	r = b3MprVec3Dot(&d1, &d2);
+
+	s = (q * r - w * p) / (w * v - r * r);
+	t = (-s * r - q) / w;
+
+	if ((b3MprIsZero(s) || s > 0.f) && (b3MprEq(s, 1.f) || s < 1.f) && (b3MprIsZero(t) || t > 0.f) && (b3MprEq(t, 1.f) || t < 1.f) && (b3MprEq(t + s, 1.f) || t + s < 1.f))
+	{
+		// minimum lies inside the triangle (tolerances included)
+		if (witness)
+		{
+			b3MprVec3Scale(&d1, s);
+			b3MprVec3Scale(&d2, t);
+			b3MprVec3Copy(witness, x0);
+			b3MprVec3Add(witness, &d1);
+			b3MprVec3Add(witness, &d2);
+
+			dist = b3MprVec3Dist2(witness, P);
+		}
+		else
+		{
+			// evaluate D(s, t) directly from the precomputed coefficients
+			dist = s * s * v;
+			dist += t * t * w;
+			dist += 2.f * s * t * r;
+			dist += 2.f * s * p;
+			dist += 2.f * t * q;
+			dist += u;
+		}
+	}
+	else
+	{
+		// minimum lies outside: take the closest of the three edges
+		dist = _b3MprVec3PointSegmentDist2(P, x0, B, witness);
+
+		dist2 = _b3MprVec3PointSegmentDist2(P, x0, C, &witness2);
+		if (dist2 < dist)
+		{
+			dist = dist2;
+			if (witness)
+				b3MprVec3Copy(witness, &witness2);
+		}
+
+		dist2 = _b3MprVec3PointSegmentDist2(P, B, C, &witness2);
+		if (dist2 < dist)
+		{
+			dist = dist2;
+			if (witness)
+				b3MprVec3Copy(witness, &witness2);
+		}
+	}
+
+	return dist;
+}
+
+// MPR phase 3: expands the portal until the tolerance is reached (or the
+// final permitted pass is hit) and then derives the penetration data.
+//
+// Outputs: `*depth` is the distance from the origin to the portal triangle,
+// `*pdir` the normalized direction to the closest point on that triangle
+// (the portal direction is used when that point is approximately the
+// origin), and `*pos` the position computed by b3FindPos.
+B3_STATIC void b3FindPenetr(int pairIndex, int bodyIndexA, int bodyIndexB, b3ConstArray(b3RigidBodyData_t) cpuBodyBuf,
+							b3ConstArray(b3ConvexPolyhedronData_t) cpuConvexData,
+							b3ConstArray(b3Collidable_t) cpuCollidables,
+							b3ConstArray(b3Float4) cpuVertices,
+							__global b3Float4 *sepAxis,
+							b3MprSimplex_t *portal,
+							float *depth, b3Float4 *pdir, b3Float4 *pos)
+{
+	b3Float4 origin = b3MakeFloat4(0, 0, 0, 0);
+
+	b3Float4 outward;
+	b3MprSupport_t candidate;
+
+	// `pass` counts from 1 so the last permitted pass always produces a result.
+	for (int pass = 1; pass <= B3_MPR_MAX_ITERATIONS; pass++)
+	{
+		// Portal direction and the next support point along it.
+		b3PortalDir(portal, &outward);
+		b3MprSupport(pairIndex, bodyIndexA, bodyIndexB, cpuBodyBuf, cpuConvexData, cpuCollidables, cpuVertices, sepAxis, &outward, &candidate);
+
+		if (portalReachTolerance(portal, &candidate, &outward) || pass == B3_MPR_MAX_ITERATIONS)
+		{
+			// Depth is the distance from the origin to the portal triangle;
+			// the closest point doubles as the raw direction.
+			const float distanceSquared = b3MprVec3PointTriDist2(&origin, &b3MprSimplexPoint(portal, 1)->v, &b3MprSimplexPoint(portal, 2)->v, &b3MprSimplexPoint(portal, 3)->v, pdir);
+			*depth = distanceSquared;
+			*depth = B3_MPR_SQRT(*depth);
+
+			// A (near) zero closest point carries no direction: use the
+			// portal normal instead.
+			if (b3MprIsZero((*pdir).x) && b3MprIsZero((*pdir).y) && b3MprIsZero((*pdir).z))
+			{
+				*pdir = outward;
+			}
+			b3MprVec3Normalize(pdir);
+
+			// Position from barycentric weights.
+			b3FindPos(portal, pos);
+			return;
+		}
+
+		b3ExpandPortal(portal, &candidate);
+	}
+}
+
+// Penetration data for a touching contact on portal point v1: depth is zero,
+// the direction is the zero vector and the position is the midpoint of v1's
+// two per-object support points.
+B3_STATIC void b3FindPenetrTouch(b3MprSimplex_t *portal, float *depth, b3Float4 *dir, b3Float4 *pos)
+{
+	const b3MprSupport_t *contact = b3MprSimplexPoint(portal, 1);
+	b3Float4 origin = b3MakeFloat4(0, 0, 0, 0);
+
+	*depth = 0.f;
+	b3MprVec3Copy(dir, &origin);
+
+	// pos = (v1 + v2) / 2
+	b3MprVec3Copy(pos, &contact->v1);
+	b3MprVec3Add(pos, &contact->v2);
+	b3MprVec3Scale(pos, 0.5);
+}
+
+// Penetration data for the case where the origin lies on the v0-v1 segment:
+// the position is the midpoint of v1's two per-object support points, the
+// depth is the length of v1 and the direction is v1 normalized.
+B3_STATIC void b3FindPenetrSegment(b3MprSimplex_t *portal,
+								   float *depth, b3Float4 *dir, b3Float4 *pos)
+{
+	const b3MprSupport_t *contact = b3MprSimplexPoint(portal, 1);
+
+	// pos = (v1 + v2) / 2
+	b3MprVec3Copy(pos, &contact->v1);
+	b3MprVec3Add(pos, &contact->v2);
+	b3MprVec3Scale(pos, 0.5f);
+
+	// The depth has to be taken before the direction is normalized.
+	b3MprVec3Copy(dir, &contact->v);
+	const float lengthSquared = b3MprVec3Len2(dir);
+	*depth = B3_MPR_SQRT(lengthSquared);
+	b3MprVec3Normalize(dir);
+}
+
+// Runs the three MPR phases for one body pair.
+//
+// Returns 0 when the bodies intersect, with `*depthOut`, `*dirOut` and
+// `*posOut` filled in, and -1 when no intersection is found.
+//
+// `hasSepAxis[pairIndex]` is cleared on entry and set to 1 only on the full
+// refinement path, where `sepAxis[pairIndex]` also receives the negated
+// direction. The touching and segment cases leave it at 0.
+inline int b3MprPenetration(int pairIndex, int bodyIndexA, int bodyIndexB,
+							b3ConstArray(b3RigidBodyData_t) cpuBodyBuf,
+							b3ConstArray(b3ConvexPolyhedronData_t) cpuConvexData,
+							b3ConstArray(b3Collidable_t) cpuCollidables,
+							b3ConstArray(b3Float4) cpuVertices,
+							__global b3Float4 *sepAxis,
+							__global int *hasSepAxis,
+							float *depthOut, b3Float4 *dirOut, b3Float4 *posOut)
+{
+	b3MprSimplex_t portal;
+
+	hasSepAxis[pairIndex] = 0;
+
+	// Phase 1: Portal discovery
+	const int discovery = b3DiscoverPortal(pairIndex, bodyIndexA, bodyIndexB, cpuBodyBuf, cpuConvexData, cpuCollidables, cpuVertices, sepAxis, hasSepAxis, &portal);
+
+	if (discovery == 1)
+	{
+		// Touching contact on portal's v1.
+		b3FindPenetrTouch(&portal, depthOut, dirOut, posOut);
+		return 0;
+	}
+
+	if (discovery == 2)
+	{
+		// Origin lies on the v0-v1 segment.
+		b3FindPenetrSegment(&portal, depthOut, dirOut, posOut);
+		return 0;
+	}
+
+	if (discovery != 0)
+	{
+		// Origin isn't inside portal - no collision.
+		hasSepAxis[pairIndex] = 0;
+		return -1;
+	}
+
+	// Phase 2: Portal refinement
+	const int refinement = b3RefinePortal(pairIndex, bodyIndexA, bodyIndexB, cpuBodyBuf, cpuConvexData, cpuCollidables, cpuVertices, sepAxis, &portal);
+	if (refinement < 0)
+	{
+		return -1;
+	}
+
+	// Phase 3. Penetration info
+	b3FindPenetr(pairIndex, bodyIndexA, bodyIndexB, cpuBodyBuf, cpuConvexData, cpuCollidables, cpuVertices, sepAxis, &portal, depthOut, dirOut, posOut);
+	hasSepAxis[pairIndex] = 1;
+	sepAxis[pairIndex] = -*dirOut;
+
+	return 0;
+}
+
+#endif  //B3_MPR_PENETRATION_H

+ 175 - 0
Dependencies/include/bullet3/Bullet3Collision/NarrowPhaseCollision/shared/b3NewContactReduction.h

@@ -0,0 +1,175 @@
+
+#ifndef B3_NEW_CONTACT_REDUCTION_H
+#define B3_NEW_CONTACT_REDUCTION_H
+
+#include "Bullet3Common/shared/b3Float4.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h"
+
+#define GET_NPOINTS(x) (x).m_worldNormalOnB.w
+
+// Picks up to four representative contact points out of p[0..nPoints).
+// With four or fewer points nothing is written and nPoints is returned as is.
+// Otherwise at most the first 64 points are scanned, the chosen indices are
+// written to contactIdx[0].x/.y/.z/.w and 4 is returned.
+int b3ExtractManifoldSequentialGlobal(__global const b3Float4* p, int nPoints, b3Float4ConstArg nearNormal, b3Int4* contactIdx)
+{
+	if (nPoints <= 4)
+	{
+		return nPoints;
+	}
+
+	const int count = (nPoints > 64) ? 64 : nPoints;
+
+	// Average of the scanned points.
+	b3Float4 centroid = b3MakeFloat4(0, 0, 0, 0);
+	for (int k = 0; k < count; k++)
+	{
+		centroid += p[k];
+	}
+	centroid /= (float)count;
+
+	// Two sampling axes built from cross products with nearNormal
+	// (axisV is derived from the not-yet-normalized axisU).
+	b3Float4 axisU = b3Cross(nearNormal, p[0] - centroid);
+	b3Float4 axisV = b3Cross(nearNormal, axisU);
+	axisU = b3Normalized(axisU);
+	axisV = b3Normalized(axisV);
+
+	// Point with the smallest w (the original comment calls this the deepest penetration).
+	float smallestW = FLT_MAX;
+	int smallestWIdx = -1;
+
+	// Each threshold starts at FLT_MIN and is only ever lowered by the '<' tests below.
+	float bestU = FLT_MIN, bestNegU = FLT_MIN, bestV = FLT_MIN, bestNegV = FLT_MIN;
+
+	for (int k = 0; k < count; k++)
+	{
+		if (p[k].w < smallestW)
+		{
+			smallestW = p[k].w;
+			smallestWIdx = k;
+		}
+
+		const b3Float4 offset = p[k] - centroid;
+
+		const float alongU = b3Dot(axisU, offset);
+		if (alongU < bestU)
+		{
+			bestU = alongU;
+			contactIdx->x = k;
+		}
+
+		const float alongNegU = b3Dot(-axisU, offset);
+		if (alongNegU < bestNegU)
+		{
+			bestNegU = alongNegU;
+			contactIdx->y = k;
+		}
+
+		const float alongV = b3Dot(axisV, offset);
+		if (alongV < bestV)
+		{
+			bestV = alongV;
+			contactIdx->z = k;
+		}
+
+		const float alongNegV = b3Dot(-axisV, offset);
+		if (alongNegV < bestNegV)
+		{
+			bestNegV = alongNegV;
+			contactIdx->w = k;
+		}
+	}
+
+	// Make sure the smallest-w point is part of the result
+	// (todo from the original: replace the contact with least penetration instead of the first).
+	const int alreadySelected = (contactIdx->x == smallestWIdx) || (contactIdx->y == smallestWIdx) ||
+								(contactIdx->z == smallestWIdx) || (contactIdx->w == smallestWIdx);
+	if (!alreadySelected)
+	{
+		contactIdx->x = smallestWIdx;
+	}
+
+	return 4;
+}
+
+// Per-pair contact reduction: for a pair that has a separating axis and at least
+// one clipped vertex, reduces the candidate points to at most four, reserves one
+// slot in globalContactsOut via an atomic increment and fills it.
+// The slot index is also stored in pairs[pairIndex].w.
+__kernel void b3NewContactReductionKernel(__global b3Int4* pairs,
+										  __global const b3RigidBodyData_t* rigidBodies,
+										  __global const b3Float4* separatingNormals,
+										  __global const int* hasSeparatingAxis,
+										  __global struct b3Contact4Data* globalContactsOut,
+										  __global b3Int4* clippingFaces,
+										  __global b3Float4* worldVertsB2,
+										  volatile __global int* nGlobalContactsOut,
+										  int vertexFaceCapacity,
+										  int contactCapacity,
+										  int numPairs,
+										  int pairIndex)
+{
+	// The pair index is passed in explicitly (the get_global_id lookup is disabled).
+	const int i = pairIndex;
+
+	// Default selection, used unchanged when the reduction returns early with <= 4 points.
+	b3Int4 contactIdx = b3MakeInt4(0, 1, 2, 3);
+
+	if (i >= numPairs)
+	{
+		return;
+	}
+	if (!hasSeparatingAxis[i])
+	{
+		return;
+	}
+
+	const int nPoints = clippingFaces[pairIndex].w;
+	if (nPoints <= 0)
+	{
+		return;
+	}
+
+	__global b3Float4* pointsIn = &worldVertsB2[pairIndex * vertexFaceCapacity];
+	b3Float4 normal = -separatingNormals[i];
+
+	const int nReducedContacts = b3ExtractManifoldSequentialGlobal(pointsIn, nPoints, normal, &contactIdx);
+
+	// Reserve an output slot; the counter is bumped even if the slot turns out to be past capacity.
+	const int dstIdx = b3AtomicInc(nGlobalContactsOut);
+
+	b3Assert(dstIdx < contactCapacity);
+	if (dstIdx >= contactCapacity)
+	{
+		return;
+	}
+
+	__global struct b3Contact4Data* c = &globalContactsOut[dstIdx];
+	c->m_worldNormalOnB = -normal;
+	c->m_restituitionCoeffCmp = (0.f * 0xffff);
+	c->m_frictionCoeffCmp = (0.7f * 0xffff);
+	c->m_batchIdx = pairIndex;
+
+	const int bodyA = pairs[pairIndex].x;
+	const int bodyB = pairs[pairIndex].y;
+
+	pairs[pairIndex].w = dstIdx;
+
+	// Bodies whose inverse mass is exactly zero are stored with a negated index.
+	c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass == 0 ? -bodyA : bodyA;
+	c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass == 0 ? -bodyB : bodyB;
+	c->m_childIndexA = -1;
+	c->m_childIndexB = -1;
+
+	// Copy the selected points, highest slot first; counts outside 1..4 copy nothing.
+	if (nReducedContacts >= 1 && nReducedContacts <= 4)
+	{
+		if (nReducedContacts >= 4)
+		{
+			c->m_worldPosB[3] = pointsIn[contactIdx.w];
+		}
+		if (nReducedContacts >= 3)
+		{
+			c->m_worldPosB[2] = pointsIn[contactIdx.z];
+		}
+		if (nReducedContacts >= 2)
+		{
+			c->m_worldPosB[1] = pointsIn[contactIdx.y];
+		}
+		c->m_worldPosB[0] = pointsIn[contactIdx.x];
+	}
+
+	// The point count is kept in the w component of the stored normal.
+	GET_NPOINTS(*c) = nReducedContacts;
+}
+#endif

+ 88 - 0
Dependencies/include/bullet3/Bullet3Collision/NarrowPhaseCollision/shared/b3QuantizedBvhNodeData.h

@@ -0,0 +1,88 @@
+
+
+#ifndef B3_QUANTIZED_BVH_NODE_H
+#define B3_QUANTIZED_BVH_NODE_H
+
+#include "Bullet3Common/shared/b3Float4.h"
+
+#define B3_MAX_NUM_PARTS_IN_BITS 10
+
+///b3QuantizedBvhNodeData is a compressed aabb node, 16 bytes.
+///Node can be used for leafnode or internal node. Leafnodes can point to 32-bit triangle index (non-negative range).
+typedef struct b3QuantizedBvhNodeData b3QuantizedBvhNodeData_t;
+
+struct b3QuantizedBvhNodeData  // one BVH node: quantized AABB plus a combined leaf/internal index
+{
+	//12 bytes: quantized AABB min and max corners, three unsigned short values each
+	unsigned short int m_quantizedAabbMin[3];
+	unsigned short int m_quantizedAabbMax[3];
+	//4 bytes: >= 0 marks a leaf (b3IsLeaf) whose low bits hold the triangle index (b3GetTriangleIndex); a negative value is the negated escape index of an internal node (b3GetEscapeIndex)
+	int m_escapeIndexOrTriangleIndex;
+};
+
+// Returns the low bits of m_escapeIndexOrTriangleIndex, i.e. the value with its
+// top bits (from bit 31 - B3_MAX_NUM_PARTS_IN_BITS upwards) cleared.
+inline int b3GetTriangleIndex(const b3QuantizedBvhNodeData* rootNode)
+{
+	// All ones shifted left marks the high bits that do not belong to the triangle index.
+	const unsigned int highBits = (~0u) << (31 - B3_MAX_NUM_PARTS_IN_BITS);
+	return (rootNode->m_escapeIndexOrTriangleIndex & ~highBits);
+}
+
+// Returns 1 for a leaf node and 0 for an internal node.
+inline int b3IsLeaf(const b3QuantizedBvhNodeData* rootNode)
+{
+	// A negative value is a skip index (internal node); a triangle index is >= 0 (leaf).
+	if (rootNode->m_escapeIndexOrTriangleIndex < 0)
+	{
+		return 0;
+	}
+	return 1;
+}
+
+// Returns the escape index of a node, which is stored negated in the node.
+inline int b3GetEscapeIndex(const b3QuantizedBvhNodeData* rootNode)
+{
+	const int stored = rootNode->m_escapeIndexOrTriangleIndex;
+	return -stored;
+}
+
+// Quantizes point2 into three unsigned shorts written to out[0..2].
+// The point is first clamped to [bvhAabbMin, bvhAabbMax], then offset by
+// bvhAabbMin and scaled by bvhQuantization. For a max corner (isMax != 0) each
+// value is bumped by one and forced odd; for a min corner it is forced even.
+inline void b3QuantizeWithClamp(unsigned short* out, b3Float4ConstArg point2, int isMax, b3Float4ConstArg bvhAabbMin, b3Float4ConstArg bvhAabbMax, b3Float4ConstArg bvhQuantization)
+{
+	const b3Float4 inBounds = b3MinFloat4(b3MaxFloat4(point2, bvhAabbMin), bvhAabbMax);
+	const b3Float4 scaled = (inBounds - bvhAabbMin) * bvhQuantization;
+	const float perAxis[3] = {scaled.x, scaled.y, scaled.z};
+
+	for (int axis = 0; axis < 3; axis++)
+	{
+		if (isMax)
+		{
+			out[axis] = (unsigned short)(((unsigned short)(perAxis[axis] + 1.f) | 1));
+		}
+		else
+		{
+			out[axis] = (unsigned short)(((unsigned short)(perAxis[axis]) & 0xfffe));
+		}
+	}
+}
+
+// Overlap test between two quantized AABBs given as 3-component min/max arrays.
+// Returns 1 when the boxes overlap (touching counts as overlap) on all three
+// axes and 0 as soon as one axis separates them.
+inline int b3TestQuantizedAabbAgainstQuantizedAabbSlow(
+	const unsigned short int* aabbMin1,
+	const unsigned short int* aabbMax1,
+	const unsigned short int* aabbMin2,
+	const unsigned short int* aabbMax2)
+{
+	for (int axis = 0; axis < 3; ++axis)
+	{
+		const int separated = (aabbMin1[axis] > aabbMax2[axis]) || (aabbMax1[axis] < aabbMin2[axis]);
+		if (separated)
+		{
+			return 0;
+		}
+	}
+	return 1;
+}
+
+#endif  //B3_QUANTIZED_BVH_NODE_H

+ 89 - 0
Dependencies/include/bullet3/Bullet3Collision/NarrowPhaseCollision/shared/b3ReduceContacts.h

@@ -0,0 +1,89 @@
+#ifndef B3_REDUCE_CONTACTS_H
+#define B3_REDUCE_CONTACTS_H
+
+// Picks up to four representative contact points out of p[0..nPoints).
+// With four or fewer points nothing is written and nPoints is returned as is.
+// Otherwise at most the first 64 points are scanned, the chosen indices are
+// written to contactIdx[0].x/.y/.z/.w and 4 is returned.
+inline int b3ReduceContacts(const b3Float4* p, int nPoints, const b3Float4& nearNormal, b3Int4* contactIdx)
+{
+	if (nPoints <= 4)
+	{
+		return nPoints;
+	}
+
+	const int count = (nPoints > 64) ? 64 : nPoints;
+
+	// Average of the scanned points.
+	b3Float4 centroid = b3MakeFloat4(0, 0, 0, 0);
+	for (int k = 0; k < count; k++)
+	{
+		centroid += p[k];
+	}
+	centroid /= (float)count;
+
+	// Two sampling axes built from cross products with nearNormal
+	// (axisV is derived from the not-yet-normalized axisU).
+	b3Float4 axisU = b3Cross3(nearNormal, p[0] - centroid);
+	b3Float4 axisV = b3Cross3(nearNormal, axisU);
+	axisU = b3FastNormalized3(axisU);
+	axisV = b3FastNormalized3(axisV);
+
+	// Point with the smallest w (the original comment calls this the deepest penetration).
+	float smallestW = FLT_MAX;
+	int smallestWIdx = -1;
+
+	// Each threshold starts at FLT_MIN and is only ever lowered by the '<' tests below.
+	float bestU = FLT_MIN, bestNegU = FLT_MIN, bestV = FLT_MIN, bestNegV = FLT_MIN;
+
+	for (int k = 0; k < count; k++)
+	{
+		if (p[k].w < smallestW)
+		{
+			smallestW = p[k].w;
+			smallestWIdx = k;
+		}
+
+		const b3Float4 offset = p[k] - centroid;
+
+		const float alongU = b3Dot3F4(axisU, offset);
+		if (alongU < bestU)
+		{
+			bestU = alongU;
+			contactIdx->x = k;
+		}
+
+		const float alongNegU = b3Dot3F4(-axisU, offset);
+		if (alongNegU < bestNegU)
+		{
+			bestNegU = alongNegU;
+			contactIdx->y = k;
+		}
+
+		const float alongV = b3Dot3F4(axisV, offset);
+		if (alongV < bestV)
+		{
+			bestV = alongV;
+			contactIdx->z = k;
+		}
+
+		const float alongNegV = b3Dot3F4(-axisV, offset);
+		if (alongNegV < bestNegV)
+		{
+			bestNegV = alongNegV;
+			contactIdx->w = k;
+		}
+	}
+
+	// Make sure the smallest-w point is part of the result
+	// (todo from the original: replace the contact with least penetration instead of the first).
+	const bool alreadySelected = (contactIdx->x == smallestWIdx) || (contactIdx->y == smallestWIdx) ||
+								 (contactIdx->z == smallestWIdx) || (contactIdx->w == smallestWIdx);
+	if (!alreadySelected)
+	{
+		contactIdx->x = smallestWIdx;
+	}
+
+	return 4;
+}
+
+#endif  //B3_REDUCE_CONTACTS_H

+ 31 - 0
Dependencies/include/bullet3/Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h

@@ -0,0 +1,31 @@
+#ifndef B3_RIGIDBODY_DATA_H
+#define B3_RIGIDBODY_DATA_H
+
+#include "Bullet3Common/shared/b3Float4.h"
+#include "Bullet3Common/shared/b3Quat.h"
+#include "Bullet3Common/shared/b3Mat3x3.h"
+
+typedef struct b3RigidBodyData b3RigidBodyData_t;
+
+struct b3RigidBodyData  // per-body record read by the shared collision helpers
+{
+	b3Float4 m_pos;     // position; b3ComputeWorldAabb uses it as the translation of the body transform
+	b3Quat m_quat;      // orientation; used together with m_pos by b3ComputeWorldAabb
+	b3Float4 m_linVel;  // presumably linear velocity - not read by the code visible here
+	b3Float4 m_angVel;  // presumably angular velocity - not read by the code visible here
+
+	int m_collidableIdx;        // index into the collidables array (see b3ComputeWorldAabb)
+	float m_invMass;            // callers test this against 0 to flag a body; presumably 0 means static/immovable - confirm
+	float m_restituitionCoeff;  // presumably restitution coefficient (identifier spelling kept as is)
+	float m_frictionCoeff;      // presumably friction coefficient
+};
+
+typedef struct b3InertiaData b3InertiaData_t;
+
+struct b3InertiaData  // two 3x3 matrices of inertia data; its users are not visible in this part of the file
+{
+	b3Mat3x3 m_invInertiaWorld;  // presumably the inverse inertia tensor in world space - TODO confirm
+	b3Mat3x3 m_initInvInertia;   // presumably the initial (local-space) inverse inertia tensor - TODO confirm
+};
+
+#endif  //B3_RIGIDBODY_DATA_H

+ 35 - 0
Dependencies/include/bullet3/Bullet3Collision/NarrowPhaseCollision/shared/b3UpdateAabbs.h

@@ -0,0 +1,35 @@
+#ifndef B3_UPDATE_AABBS_H
+#define B3_UPDATE_AABBS_H
+
+#include "Bullet3Collision/BroadPhaseCollision/shared/b3Aabb.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Collidable.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h"
+
+// Recomputes the world-space AABB of body `bodyId` and stores it in worldAabbs[bodyId].
+// The local AABB is looked up with the body's collidable index and transformed by the
+// body's position and orientation (margin 0). Nothing is written when the
+// collidable's shape index is negative.
+// The stored AABB also carries the body id (m_minIndices[3]) and a flag in
+// m_signedMaxIndices[3] that is 0 when the body's inverse mass is 0 and 1 otherwise.
+void b3ComputeWorldAabb(int bodyId, __global const b3RigidBodyData_t* bodies, __global const b3Collidable_t* collidables, __global const b3Aabb_t* localShapeAABB, __global b3Aabb_t* worldAabbs)
+{
+	__global const b3RigidBodyData_t* body = &bodies[bodyId];
+
+	b3Float4 position = body->m_pos;
+	b3Quat orientation = body->m_quat;
+
+	int collidableIndex = body->m_collidableIdx;
+	int shapeIndex = collidables[collidableIndex].m_shapeIndex;
+
+	if (shapeIndex >= 0)
+	{
+		b3Aabb_t localAabb = localShapeAABB[collidableIndex];
+		b3Aabb_t worldAabb;
+
+		b3Float4 aabbAMinOut, aabbAMaxOut;
+		float margin = 0.f;
+		b3TransformAabb2(localAabb.m_minVec, localAabb.m_maxVec, margin, position, orientation, &aabbAMinOut, &aabbAMaxOut);
+
+		worldAabb.m_minVec = aabbAMinOut;
+		worldAabb.m_minIndices[3] = bodyId;
+		worldAabb.m_maxVec = aabbAMaxOut;
+		// `body` already points at bodies[bodyId]; indexing it with bodyId again
+		// would read bodies[2 * bodyId] (a different body, possibly out of range).
+		worldAabb.m_signedMaxIndices[3] = body->m_invMass == 0.f ? 0 : 1;
+		worldAabbs[bodyId] = worldAabb;
+	}
+}
+
+#endif  //B3_UPDATE_AABBS_H

+ 16 - 0
Dependencies/include/bullet3/Bullet3Collision/premake4.lua

@@ -0,0 +1,16 @@
+	project "Bullet3Collision"  -- premake project for the Bullet3Collision library
+
+	language "C++"
+				
+	kind "StaticLib"  -- built as a static library
+		
+	includedirs {".."}  -- parent directory goes on the include path
+
+    if os.is("Linux") then  -- extra compiler flag on Linux only
+        buildoptions{"-fPIC"}
+    end
+
+	files {  -- all .cpp and .h files matched by the ** wildcard patterns
+		"**.cpp",
+		"**.h"
+	}

+ 63 - 0
Dependencies/include/bullet3/Bullet3Common/CMakeLists.txt

@@ -0,0 +1,63 @@
+
+INCLUDE_DIRECTORIES(
+	${BULLET_PHYSICS_SOURCE_DIR}/src
+)  # headers are resolved relative to the Bullet src directory
+
+SET(Bullet3Common_SRCS  # translation units compiled into the library
+	b3AlignedAllocator.cpp
+	b3Vector3.cpp
+	b3Logging.cpp
+)
+
+SET(Bullet3Common_HDRS  # headers listed with the target (also used as PUBLIC_HEADER for frameworks below)
+	b3AlignedAllocator.h
+	b3AlignedObjectArray.h
+	b3CommandLineArgs.h
+	b3HashMap.h
+	b3Logging.h
+	b3Matrix3x3.h
+	b3MinMax.h
+	b3PoolAllocator.h
+	b3QuadWord.h
+	b3Quaternion.h
+	b3Random.h
+	b3Scalar.h
+	b3StackAlloc.h
+	b3Transform.h
+	b3TransformUtil.h
+	b3Vector3.h
+	shared/b3Float4.h
+	shared/b3Int2.h
+	shared/b3Int4.h
+	shared/b3Mat3x3.h
+	shared/b3PlatformDefinitions.h
+	shared/b3Quat.h
+)
+
+ADD_LIBRARY(Bullet3Common ${Bullet3Common_SRCS} ${Bullet3Common_HDRS})
+SET_TARGET_PROPERTIES(Bullet3Common PROPERTIES VERSION ${BULLET_VERSION})
+SET_TARGET_PROPERTIES(Bullet3Common PROPERTIES SOVERSION ${BULLET_VERSION})
+
+IF (INSTALL_LIBS)  # install rules are opt-in
+	IF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
+		#FILES_MATCHING requires CMake 2.6
+		IF (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 2.5)
+			IF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+				INSTALL(TARGETS Bullet3Common DESTINATION .)
+			ELSE (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+				INSTALL(TARGETS Bullet3Common
+					RUNTIME DESTINATION bin
+					LIBRARY DESTINATION lib${LIB_SUFFIX}
+					ARCHIVE DESTINATION lib${LIB_SUFFIX})
+				INSTALL(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+DESTINATION ${INCLUDE_INSTALL_DIR} FILES_MATCHING PATTERN "*.h"  PATTERN
+".svn" EXCLUDE PATTERN "CMakeFiles" EXCLUDE)  # installs the *.h files, skipping .svn and CMakeFiles
+			ENDIF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+		ENDIF (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 2.5)
+
+		IF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)  # framework packaging for shared Apple builds
+			SET_TARGET_PROPERTIES(Bullet3Common PROPERTIES FRAMEWORK true)
+			SET_TARGET_PROPERTIES(Bullet3Common PROPERTIES PUBLIC_HEADER "${Bullet3Common_HDRS}")
+		ENDIF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+	ENDIF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
+ENDIF (INSTALL_LIBS)

+ 186 - 0
Dependencies/include/bullet3/Bullet3Common/b3AlignedAllocator.cpp

@@ -0,0 +1,186 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2013 Erwin Coumans  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include "b3AlignedAllocator.h"
+
+#ifdef B3_ALLOCATOR_STATISTICS
+int b3g_numAlignedAllocs = 0;
+int b3g_numAlignedFree = 0;
+int b3g_totalBytesAlignedAllocs = 0;  //detect memory leaks
+#endif
+
+// Default unaligned allocation hook: hands the request straight to malloc.
+static void *b3AllocDefault(size_t size)
+{
+	void *block = malloc(size);
+	return block;
+}
+
+// Default unaligned release hook: hands the pointer straight to free.
+static void b3FreeDefault(void *ptr)
+{
+	free(ptr);
+}
+
+static b3AllocFunc *b3s_allocFunc = b3AllocDefault;
+static b3FreeFunc *b3s_freeFunc = b3FreeDefault;
+
+#if defined(B3_HAS_ALIGNED_ALLOCATOR)
+#include <malloc.h>
+// Aligned allocation through _aligned_malloc / _aligned_free.
+static void *b3AlignedAllocDefault(size_t size, int alignment)
+{
+	const size_t requestedAlignment = (size_t)alignment;
+	return _aligned_malloc(size, requestedAlignment);
+}
+
+static void b3AlignedFreeDefault(void *ptr)
+{
+	_aligned_free(ptr);
+}
+#elif defined(__CELLOS_LV2__)
+#include <stdlib.h>
+
+// Aligned allocation through memalign; blocks are released with free.
+static inline void *b3AlignedAllocDefault(size_t size, int alignment)
+{
+	void *block = memalign(alignment, size);
+	return block;
+}
+
+static inline void b3AlignedFreeDefault(void *ptr)
+{
+	free(ptr);
+}
+#else
+
+// Fallback built on the unaligned hook: over-allocates by one pointer plus
+// (alignment - 1) bytes, aligns inside that block and saves the unaligned
+// pointer in the slot directly in front of the returned address.
+// Returns null when the underlying allocation fails.
+static inline void *b3AlignedAllocDefault(size_t size, int alignment)
+{
+	char *raw = (char *)b3s_allocFunc(size + sizeof(void *) + (alignment - 1));
+	if (!raw)
+	{
+		return (void *)(raw);
+	}
+
+	void *aligned = b3AlignPointer(raw + sizeof(void *), alignment);
+	((void **)aligned)[-1] = (void *)(raw);
+	return aligned;
+}
+
+// Releases a block from the fallback allocator by reading back the unaligned
+// pointer saved in front of it. A null pointer is ignored.
+static inline void b3AlignedFreeDefault(void *ptr)
+{
+	if (!ptr)
+	{
+		return;
+	}
+
+	void *raw = ((void **)ptr)[-1];
+	b3s_freeFunc(raw);
+}
+#endif
+
+static b3AlignedAllocFunc *b3s_alignedAllocFunc = b3AlignedAllocDefault;
+static b3AlignedFreeFunc *b3s_alignedFreeFunc = b3AlignedFreeDefault;
+
+// Installs custom aligned allocate/free hooks. A null argument restores the
+// corresponding built-in default; the two hooks are handled independently.
+void b3AlignedAllocSetCustomAligned(b3AlignedAllocFunc *allocFunc, b3AlignedFreeFunc *freeFunc)
+{
+	if (allocFunc)
+	{
+		b3s_alignedAllocFunc = allocFunc;
+	}
+	else
+	{
+		b3s_alignedAllocFunc = b3AlignedAllocDefault;
+	}
+
+	if (freeFunc)
+	{
+		b3s_alignedFreeFunc = freeFunc;
+	}
+	else
+	{
+		b3s_alignedFreeFunc = b3AlignedFreeDefault;
+	}
+}
+
+// Installs custom unaligned allocate/free hooks. A null argument restores the
+// corresponding built-in default; the two hooks are handled independently.
+void b3AlignedAllocSetCustom(b3AllocFunc *allocFunc, b3FreeFunc *freeFunc)
+{
+	if (allocFunc)
+	{
+		b3s_allocFunc = allocFunc;
+	}
+	else
+	{
+		b3s_allocFunc = b3AllocDefault;
+	}
+
+	if (freeFunc)
+	{
+		b3s_freeFunc = freeFunc;
+	}
+	else
+	{
+		b3s_freeFunc = b3FreeDefault;
+	}
+}
+
+#ifdef B3_DEBUG_MEMORY_ALLOCATIONS
+//this generic allocator provides the total allocated number of bytes
+#include <stdio.h>
+
+// Debug allocator: allocates `size` bytes aligned to `alignment` through the
+// unaligned hook, logs the call site and keeps a small header in front of the
+// returned block (in pointer-sized slots):
+//   slot -1: the unaligned pointer returned by b3s_allocFunc
+//   slot -2: the requested size, stored as an int
+// Returns null when the underlying allocation fails.
+void *b3AlignedAllocInternal(size_t size, int alignment, int line, char *filename)
+{
+	void *ret;
+	char *real;
+#ifdef B3_ALLOCATOR_STATISTICS
+	b3g_totalBytesAlignedAllocs += size;
+	b3g_numAlignedAllocs++;
+#endif
+	real = (char *)b3s_allocFunc(size + 2 * sizeof(void *) + (alignment - 1));
+	if (real)
+	{
+		ret = (void *)b3AlignPointer(real + 2 * sizeof(void *), alignment);
+		*((void **)(ret)-1) = (void *)(real);
+		// The size goes into the second pointer-sized slot. Addressing it as
+		// ((int *)ret - 2) would land inside slot -1 when pointers are wider
+		// than int and overwrite part of the saved pointer.
+		*((int *)((void **)(ret)-2)) = (int)size;
+	}
+	else
+	{
+		ret = (void *)(real);  //??
+	}
+
+	// %p for the address and an explicit int for the size keep the arguments in line with the format.
+	b3Printf("allocation#%d at address %p, from %s,line %d, size %d\n", b3g_numAlignedAllocs, (void *)real, filename, line, (int)size);
+
+	// Marker value written into the first int of the new block (kept from the
+	// original code); skipped when there is no block or it is too small.
+	if (ret && size >= sizeof(int))
+	{
+		int *ptr = (int *)ret;
+		*ptr = 12;
+	}
+	return (ret);
+}
+
+// Debug counterpart of the allocator above: reads the header in front of `ptr`,
+// logs the call site and releases the underlying block. A null pointer is only logged.
+void b3AlignedFreeInternal(void *ptr, int line, char *filename)
+{
+	void *real;
+#ifdef B3_ALLOCATOR_STATISTICS
+	b3g_numAlignedFree++;
+#endif
+	if (ptr)
+	{
+		real = *((void **)(ptr)-1);
+		// Must mirror the slot used by the allocator for the stored size.
+		int size = *((int *)((void **)(ptr)-2));
+#ifdef B3_ALLOCATOR_STATISTICS
+		b3g_totalBytesAlignedAllocs -= size;
+#endif
+		b3Printf("free #%d at address %p, from %s,line %d, size %d\n", b3g_numAlignedFree, real, filename, line, size);
+
+		b3s_freeFunc(real);
+	}
+	else
+	{
+		b3Printf("NULL ptr\n");
+	}
+}
+
+#else  //B3_DEBUG_MEMORY_ALLOCATIONS
+
+// Allocates `size` bytes with the requested alignment through the currently
+// installed aligned-allocation hook and returns whatever the hook returns.
+void *b3AlignedAllocInternal(size_t size, int alignment)
+{
+#ifdef B3_ALLOCATOR_STATISTICS
+	b3g_numAlignedAllocs++;
+#endif
+	return b3s_alignedAllocFunc(size, alignment);
+}
+
+// Releases a block through the currently installed aligned-free hook.
+// A null pointer is ignored and not counted in the statistics.
+void b3AlignedFreeInternal(void *ptr)
+{
+	if (ptr)
+	{
+#ifdef B3_ALLOCATOR_STATISTICS
+		b3g_numAlignedFree++;
+#endif
+		b3s_alignedFreeFunc(ptr);
+	}
+}
+
+#endif  //B3_DEBUG_MEMORY_ALLOCATIONS

+ 110 - 0
Dependencies/include/bullet3/Bullet3Common/b3AlignedAllocator.h

@@ -0,0 +1,110 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2013 Erwin Coumans  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef B3_ALIGNED_ALLOCATOR
+#define B3_ALIGNED_ALLOCATOR
+
+///we probably replace this with our own aligned memory allocator
+///so we replace _aligned_malloc and _aligned_free with our own
+///that is better portable and more predictable
+
+#include "b3Scalar.h"
+//#define B3_DEBUG_MEMORY_ALLOCATIONS 1
+#ifdef B3_DEBUG_MEMORY_ALLOCATIONS
+
+#define b3AlignedAlloc(a, b) \
+	b3AlignedAllocInternal(a, b, __LINE__, __FILE__)
+
+#define b3AlignedFree(ptr) \
+	b3AlignedFreeInternal(ptr, __LINE__, __FILE__)
+
+void* b3AlignedAllocInternal(size_t size, int alignment, int line, char* filename);
+
+void b3AlignedFreeInternal(void* ptr, int line, char* filename);
+
+#else
+void* b3AlignedAllocInternal(size_t size, int alignment);
+void b3AlignedFreeInternal(void* ptr);
+
+#define b3AlignedAlloc(size, alignment) b3AlignedAllocInternal(size, alignment)
+#define b3AlignedFree(ptr) b3AlignedFreeInternal(ptr)
+
+#endif
+typedef int btSizeType;
+
+typedef void*(b3AlignedAllocFunc)(size_t size, int alignment);
+typedef void(b3AlignedFreeFunc)(void* memblock);
+typedef void*(b3AllocFunc)(size_t size);
+typedef void(b3FreeFunc)(void* memblock);
+
+///The developer can let all Bullet memory allocations go through a custom memory allocator, using b3AlignedAllocSetCustom
+void b3AlignedAllocSetCustom(b3AllocFunc* allocFunc, b3FreeFunc* freeFunc);
+///If the developer already has a custom aligned allocator, then b3AlignedAllocSetCustomAligned can be used. The default aligned allocator pre-allocates extra memory using the non-aligned allocator, and instruments it.
+void b3AlignedAllocSetCustomAligned(b3AlignedAllocFunc* allocFunc, b3AlignedFreeFunc* freeFunc);
+
+///The b3AlignedAllocator is a portable class for aligned memory allocations.
+///It holds no state of its own: every request is forwarded to b3AlignedAlloc / b3AlignedFree,
+///whose default implementations can be overridden with b3AlignedAllocSetCustom and b3AlignedAllocSetCustomAligned.
+template <typename T, unsigned Alignment>
+class b3AlignedAllocator
+{
+	typedef b3AlignedAllocator<T, Alignment> self_type;
+
+public:
+	typedef T value_type;
+	typedef T* pointer;
+	typedef const T* const_pointer;
+	typedef T& reference;
+	typedef const T& const_reference;
+
+	///Gives the same allocator for another element type, keeping the alignment.
+	template <typename O>
+	struct rebind
+	{
+		typedef b3AlignedAllocator<O, Alignment> other;
+	};
+
+	b3AlignedAllocator() {}
+
+	///Conversion from an allocator of another element type; there is nothing to copy.
+	template <typename Other>
+	b3AlignedAllocator(const b3AlignedAllocator<Other, Alignment>&)
+	{
+	}
+
+	///Assignment from an allocator of another element type; there is nothing to copy.
+	template <typename O>
+	self_type& operator=(const b3AlignedAllocator<O, Alignment>&)
+	{
+		return *this;
+	}
+
+	pointer address(reference ref) const { return &ref; }
+	const_pointer address(const_reference ref) const { return &ref; }
+
+	///Returns raw storage for n objects of type T aligned to Alignment bytes; the hint is ignored.
+	pointer allocate(btSizeType n, const_pointer* hint = 0)
+	{
+		(void)hint;
+		void* raw = b3AlignedAlloc(sizeof(value_type) * n, Alignment);
+		return reinterpret_cast<pointer>(raw);
+	}
+
+	///Releases storage obtained from allocate().
+	void deallocate(pointer ptr)
+	{
+		void* raw = reinterpret_cast<void*>(ptr);
+		b3AlignedFree(raw);
+	}
+
+	///Copy-constructs value into already allocated storage at ptr.
+	void construct(pointer ptr, const value_type& value) { new (ptr) value_type(value); }
+
+	///Runs the destructor of the object at ptr without releasing its storage.
+	void destroy(pointer ptr) { ptr->~value_type(); }
+
+	///Two allocators of the same type always compare equal.
+	friend bool operator==(const self_type&, const self_type&) { return true; }
+};
+
+#endif  //B3_ALIGNED_ALLOCATOR

+ 522 - 0
Dependencies/include/bullet3/Bullet3Common/b3AlignedObjectArray.h

@@ -0,0 +1,522 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2013 Erwin Coumans  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef B3_OBJECT_ARRAY__
+#define B3_OBJECT_ARRAY__
+
+#include "b3Scalar.h"  // has definitions like B3_FORCE_INLINE
+#include "b3AlignedAllocator.h"
+
+///If the platform doesn't support placement new, you can disable B3_USE_PLACEMENT_NEW
+///then the b3AlignedObjectArray doesn't support objects with virtual methods, and non-trivial constructors/destructors
+///You can enable B3_USE_MEMCPY, then swapping elements in the array will use memcpy instead of operator=
+///see discussion here: https://bulletphysics.org/phpBB2/viewtopic.php?t=1231 and
+///http://www.continuousphysics.com/Bullet/phpBB2/viewtopic.php?t=1240
+
+#define B3_USE_PLACEMENT_NEW 1
+//#define B3_USE_MEMCPY 1 //disable, because it is cumbersome to find out for each platform where memcpy is defined. It can be in <memory.h> or <string.h> or otherwise...
+#define B3_ALLOW_ARRAY_COPY_OPERATOR  // enabling this can accidentally perform deep copies of data if you are not careful
+
+#ifdef B3_USE_MEMCPY
+#include <memory.h>
+#include <string.h>
+#endif  //B3_USE_MEMCPY
+
+#ifdef B3_USE_PLACEMENT_NEW
+#include <new>  //for placement new
+#endif          //B3_USE_PLACEMENT_NEW
+
+///The b3AlignedObjectArray template class uses a subset of the stl::vector interface for its methods
+///It is developed to replace stl::vector to avoid portability issues, including STL alignment issues to add SIMD/SSE data
+///Storage comes from a b3AlignedAllocator<T, 16>. The array can also wrap a caller-provided buffer
+///(see initializeFromBuffer); such a buffer is never freed by the array.
+template <typename T>
+//template <class T>
+class b3AlignedObjectArray
+{
+	b3AlignedAllocator<T, 16> m_allocator;
+
+	int m_size;      // number of elements currently in the array
+	int m_capacity;  // number of elements m_data has room for
+	T* m_data;
+	//PCK: added this line
+	bool m_ownsMemory;  // false while m_data points at a buffer passed to initializeFromBuffer
+
+#ifdef B3_ALLOW_ARRAY_COPY_OPERATOR
+public:
+	///Deep copy of the other array's elements (see copyFromArray).
+	B3_FORCE_INLINE b3AlignedObjectArray<T>& operator=(const b3AlignedObjectArray<T>& other)
+	{
+		copyFromArray(other);
+		return *this;
+	}
+#else   //B3_ALLOW_ARRAY_COPY_OPERATOR
+private:
+	B3_FORCE_INLINE b3AlignedObjectArray<T>& operator=(const b3AlignedObjectArray<T>& other);
+#endif  //B3_ALLOW_ARRAY_COPY_OPERATOR
+
+protected:
+	///Growth policy used when the array is full: double the size, or 1 for an empty array.
+	B3_FORCE_INLINE int allocSize(int size)
+	{
+		return (size ? size * 2 : 1);
+	}
+	///Copies elements [start, end) of this array to the same indices of dest.
+	///With placement new enabled the destination slots are copy-constructed, so they must not hold live objects.
+	B3_FORCE_INLINE void copy(int start, int end, T* dest) const
+	{
+		int i;
+		for (i = start; i < end; ++i)
+#ifdef B3_USE_PLACEMENT_NEW
+			new (&dest[i]) T(m_data[i]);
+#else
+			dest[i] = m_data[i];
+#endif  //B3_USE_PLACEMENT_NEW
+	}
+
+	///Resets the bookkeeping to the empty, memory-owning state without releasing anything.
+	B3_FORCE_INLINE void init()
+	{
+		//PCK: added this line
+		m_ownsMemory = true;
+		m_data = 0;
+		m_size = 0;
+		m_capacity = 0;
+	}
+	///Runs the destructor of the elements in [first, last).
+	B3_FORCE_INLINE void destroy(int first, int last)
+	{
+		int i;
+		for (i = first; i < last; i++)
+		{
+			m_data[i].~T();
+		}
+	}
+
+	///Returns raw storage for size elements, or 0 when size is 0.
+	B3_FORCE_INLINE void* allocate(int size)
+	{
+		if (size)
+			return m_allocator.allocate(size);
+		return 0;
+	}
+
+	///Releases m_data when the array owns it and forgets the pointer in either case.
+	B3_FORCE_INLINE void deallocate()
+	{
+		if (m_data)
+		{
+			//PCK: enclosed the deallocation in this block
+			if (m_ownsMemory)
+			{
+				m_allocator.deallocate(m_data);
+			}
+			m_data = 0;
+		}
+	}
+
+public:
+	b3AlignedObjectArray()
+	{
+		init();
+	}
+
+	~b3AlignedObjectArray()
+	{
+		clear();
+	}
+
+	///Generally it is best to avoid using the copy constructor of an b3AlignedObjectArray, and use a (const) reference to the array instead.
+	b3AlignedObjectArray(const b3AlignedObjectArray& otherArray)
+	{
+		init();
+		copyFromArray(otherArray);
+	}
+
+	/// return the number of elements in the array
+	B3_FORCE_INLINE int size() const
+	{
+		return m_size;
+	}
+
+	///Element access; the index is only checked by b3Assert.
+	B3_FORCE_INLINE const T& at(int n) const
+	{
+		b3Assert(n >= 0);
+		b3Assert(n < size());
+		return m_data[n];
+	}
+
+	B3_FORCE_INLINE T& at(int n)
+	{
+		b3Assert(n >= 0);
+		b3Assert(n < size());
+		return m_data[n];
+	}
+
+	B3_FORCE_INLINE const T& operator[](int n) const
+	{
+		b3Assert(n >= 0);
+		b3Assert(n < size());
+		return m_data[n];
+	}
+
+	B3_FORCE_INLINE T& operator[](int n)
+	{
+		b3Assert(n >= 0);
+		b3Assert(n < size());
+		return m_data[n];
+	}
+
+	///clear the array, deallocated memory. Generally it is better to use array.resize(0), to reduce performance overhead of run-time memory (de)allocations.
+	B3_FORCE_INLINE void clear()
+	{
+		destroy(0, size());
+
+		deallocate();
+
+		init();
+	}
+
+	///Removes the last element and runs its destructor; the array must not be empty.
+	B3_FORCE_INLINE void pop_back()
+	{
+		b3Assert(m_size > 0);
+		m_size--;
+		m_data[m_size].~T();
+	}
+
+	///Changes the number of elements without constructing new elements or destroying removed ones.
+	///Storage is grown when needed; slots added this way are left uninitialized.
+	B3_FORCE_INLINE void resizeNoInitialize(int newsize)
+	{
+		if (newsize > size())
+		{
+			reserve(newsize);
+		}
+		//leave the added range uninitialized
+		m_size = newsize;
+	}
+
+	///resize changes the number of elements in the array. If the new size is larger, the new elements will be constructed using the optional second argument.
+	///when the new number of elements is smaller, the destructor will be called, but memory will not be freed, to reduce performance overhead of run-time memory (de)allocations.
+	B3_FORCE_INLINE void resize(int newsize, const T& fillData = T())
+	{
+		int curSize = size();
+
+		if (newsize < curSize)
+		{
+			for (int i = newsize; i < curSize; i++)
+			{
+				m_data[i].~T();
+			}
+		}
+		else
+		{
+			if (newsize > size())
+			{
+				reserve(newsize);
+			}
+#ifdef B3_USE_PLACEMENT_NEW
+			for (int i = curSize; i < newsize; i++)
+			{
+				new (&m_data[i]) T(fillData);
+			}
+#endif  //B3_USE_PLACEMENT_NEW
+		}
+
+		m_size = newsize;
+	}
+	///Appends one slot without constructing an object in it and returns a reference to that slot.
+	B3_FORCE_INLINE T& expandNonInitializing()
+	{
+		int sz = size();
+		if (sz == capacity())
+		{
+			reserve(allocSize(size()));
+		}
+		m_size++;
+
+		return m_data[sz];
+	}
+
+	///Appends a copy of fillValue and returns a reference to the new element.
+	B3_FORCE_INLINE T& expand(const T& fillValue = T())
+	{
+		int sz = size();
+		if (sz == capacity())
+		{
+			reserve(allocSize(size()));
+		}
+		m_size++;
+#ifdef B3_USE_PLACEMENT_NEW
+		new (&m_data[sz]) T(fillValue);  //use the in-place new (not really allocating heap memory)
+#endif
+
+		return m_data[sz];
+	}
+
+	///Appends a copy of _Val, growing the storage first when the array is full.
+	B3_FORCE_INLINE void push_back(const T& _Val)
+	{
+		int sz = size();
+		if (sz == capacity())
+		{
+			reserve(allocSize(size()));
+		}
+
+#ifdef B3_USE_PLACEMENT_NEW
+		new (&m_data[m_size]) T(_Val);
+#else
+		m_data[size()] = _Val;
+#endif  //B3_USE_PLACEMENT_NEW
+
+		m_size++;
+	}
+
+	/// return the pre-allocated (reserved) elements, this is at least as large as the total number of elements,see size() and reserve()
+	B3_FORCE_INLINE int capacity() const
+	{
+		return m_capacity;
+	}
+
+	///Makes room for at least _Count elements. Existing elements are copied into the new storage,
+	///after which the array owns its memory. When the allocation fails the error is reported and the
+	///array ends up empty with no storage.
+	B3_FORCE_INLINE void reserve(int _Count)
+	{  // determine new minimum length of allocated storage
+		if (capacity() < _Count)
+		{  // not enough room, reallocate
+			T* s = (T*)allocate(_Count);
+			b3Assert(s);
+			if (s == 0)
+			{
+				b3Error("b3AlignedObjectArray reserve out-of-memory\n");
+				_Count = 0;
+				m_size = 0;
+			}
+			copy(0, size(), s);
+
+			destroy(0, size());
+
+			deallocate();
+
+			//PCK: added this line
+			m_ownsMemory = true;
+
+			m_data = s;
+
+			m_capacity = _Count;
+		}
+	}
+
+	///Default ordering predicate for the sort routines, based on operator<.
+	class less
+	{
+	public:
+		// const so the predicate can be called through the const reference the sort routines receive
+		bool operator()(const T& a, const T& b) const
+		{
+			return (a < b);
+		}
+	};
+
+	///Recursive quicksort of the index range [lo, hi] (both inclusive) using CompareFunc as "less than".
+	template <typename L>
+	void quickSortInternal(const L& CompareFunc, int lo, int hi)
+	{
+		//  lo is the lower index, hi is the upper index
+		//  of the region of array a that is to be sorted
+		int i = lo, j = hi;
+		T x = m_data[(lo + hi) / 2];
+
+		//  partition
+		do
+		{
+			while (CompareFunc(m_data[i], x))
+				i++;
+			while (CompareFunc(x, m_data[j]))
+				j--;
+			if (i <= j)
+			{
+				swap(i, j);
+				i++;
+				j--;
+			}
+		} while (i <= j);
+
+		//  recursion
+		if (lo < j)
+			quickSortInternal(CompareFunc, lo, j);
+		if (i < hi)
+			quickSortInternal(CompareFunc, i, hi);
+	}
+
+	///Sorts the whole array with quicksort using CompareFunc as "less than".
+	template <typename L>
+	void quickSort(const L& CompareFunc)
+	{
+		//don't sort 0 or 1 elements
+		if (size() > 1)
+		{
+			quickSortInternal(CompareFunc, 0, size() - 1);
+		}
+	}
+
+	///heap sort from http://www.csse.monash.edu.au/~lloyd/tildeAlgDS/Sort/Heap/
+	///Sifts element k (1-based) down within the first n elements of pArr.
+	template <typename L>
+	void downHeap(T* pArr, int k, int n, const L& CompareFunc)
+	{
+		/*  PRE: a[k+1..N] is a heap */
+		/* POST:  a[k..N]  is a heap */
+
+		T temp = pArr[k - 1];
+		/* k has child(s) */
+		while (k <= n / 2)
+		{
+			int child = 2 * k;
+
+			if ((child < n) && CompareFunc(pArr[child - 1], pArr[child]))
+			{
+				child++;
+			}
+			/* pick larger child */
+			if (CompareFunc(temp, pArr[child - 1]))
+			{
+				/* move child up */
+				pArr[k - 1] = pArr[child - 1];
+				k = child;
+			}
+			else
+			{
+				break;
+			}
+		}
+		pArr[k - 1] = temp;
+	} /*downHeap*/
+
+	///Exchanges the elements at the two indices.
+	void swap(int index0, int index1)
+	{
+#ifdef B3_USE_MEMCPY
+		char temp[sizeof(T)];
+		memcpy(temp, &m_data[index0], sizeof(T));
+		memcpy(&m_data[index0], &m_data[index1], sizeof(T));
+		memcpy(&m_data[index1], temp, sizeof(T));
+#else
+		T temp = m_data[index0];
+		m_data[index0] = m_data[index1];
+		m_data[index1] = temp;
+#endif  //B3_USE_MEMCPY
+	}
+
+	///Sorts the whole array with heap sort using CompareFunc as "less than".
+	template <typename L>
+	void heapSort(const L& CompareFunc)
+	{
+		/* sort a[0..N-1],  N.B. 0 to N-1 */
+		int k;
+		int n = m_size;
+		for (k = n / 2; k > 0; k--)
+		{
+			downHeap(m_data, k, n, CompareFunc);
+		}
+
+		/* a[1..N] is now a heap */
+		while (n >= 1)
+		{
+			swap(0, n - 1); /* largest of a[0..n-1] */
+
+			n = n - 1;
+			/* restore a[1..i-1] heap */
+			downHeap(m_data, 1, n, CompareFunc);
+		}
+	}
+
+	///non-recursive binary search, assumes sorted array
+	///Returns the index of an element equal to key, or size() when there is none.
+	int findBinarySearch(const T& key) const
+	{
+		int first = 0;
+		int last = size() - 1;
+
+		//assume sorted array
+		while (first <= last)
+		{
+			// midpoint written so that first + last is never formed (no int overflow on huge arrays)
+			int mid = first + (last - first) / 2;
+			if (key > m_data[mid])
+				first = mid + 1;  // repeat search in top half.
+			else if (key < m_data[mid])
+				last = mid - 1;  // repeat search in bottom half.
+			else
+				return mid;  // found it. return position /////
+		}
+		return size();  // failed to find key
+	}
+
+	///Returns the index of the first element equal to key, or size() when there is none.
+	int findLinearSearch(const T& key) const
+	{
+		int index = size();
+		int i;
+
+		for (i = 0; i < size(); i++)
+		{
+			if (m_data[i] == key)
+			{
+				index = i;
+				break;
+			}
+		}
+		return index;
+	}
+
+	///Returns the index of the first element equal to key, or -1 when there is none.
+	int findLinearSearch2(const T& key) const
+	{
+		int index = -1;
+		int i;
+
+		for (i = 0; i < size(); i++)
+		{
+			if (m_data[i] == key)
+			{
+				index = i;
+				break;
+			}
+		}
+		return index;
+	}
+
+	///Removes the first element equal to key by swapping it with the last element and popping it,
+	///so the order of the remaining elements is not preserved.
+	void remove(const T& key)
+	{
+		int findIndex = findLinearSearch(key);
+		if (findIndex < size())
+		{
+			swap(findIndex, size() - 1);
+			pop_back();
+		}
+	}
+
+	//PCK: whole function
+	///Drops the current contents and makes the array use the given buffer, which it will not free.
+	void initializeFromBuffer(void* buffer, int size, int capacity)
+	{
+		clear();
+		m_ownsMemory = false;
+		m_data = (T*)buffer;
+		m_size = size;
+		m_capacity = capacity;
+	}
+
+	///Replaces the contents of this array with copies of the elements of otherArray.
+	///Copying an array onto itself is a no-op.
+	void copyFromArray(const b3AlignedObjectArray& otherArray)
+	{
+		if (this == &otherArray)
+		{
+			return;
+		}
+
+		int otherSize = otherArray.size();
+
+		// Destroy the current elements first: copy() below copy-constructs in place,
+		// which must not happen on top of objects that are still alive.
+		destroy(0, size());
+		m_size = 0;
+
+		reserve(otherSize);
+		if (capacity() < otherSize)
+		{
+			// allocation failed; reserve() already reported it and left the array empty
+			return;
+		}
+
+		otherArray.copy(0, otherSize, m_data);
+		m_size = otherSize;
+	}
+
+	///Removes the element at index by swapping it with the last element and popping it,
+	///so the order of the remaining elements is not preserved. Out-of-range indices are ignored.
+	void removeAtIndex(int index)
+	{
+		if (index >= 0 && index < size())
+		{
+			swap(index, size() - 1);
+			pop_back();
+		}
+	}
+};
+
+#endif  //B3_OBJECT_ARRAY__

+ 106 - 0
Dependencies/include/bullet3/Bullet3Common/b3CommandLineArgs.h

@@ -0,0 +1,106 @@
+#ifndef COMMAND_LINE_ARGS_H
+#define COMMAND_LINE_ARGS_H
+
+/******************************************************************************
+ * Command-line parsing
+ ******************************************************************************/
+#include <algorithm>
+#include <cstdlib>
+#include <cstring>
+#include <map>
+#include <sstream>
+#include <string>
+/// Parses "--key" and "--key=value" style command-line arguments into a
+/// key/value table that can be queried by name.
+class b3CommandLineArgs
+{
+protected:
+	/// Parsed arguments: key (without the leading "--") -> value ("" for bare flags).
+	std::map<std::string, std::string> pairs;
+
+public:
+	/// Builds the table from a main()-style argument vector.
+	b3CommandLineArgs(int argc, char **argv)
+	{
+		addArgs(argc, argv);
+	}
+
+	/// Adds argv[1] .. argv[argc-1] to the table.
+	/// Arguments that do not start with "--" are skipped, and a key that is
+	/// already present keeps its existing value.
+	void addArgs(int argc, char **argv)
+	{
+		for (int i = 1; i < argc; i++)
+		{
+			const std::string arg = argv[i];
+
+			if ((arg.length() < 2) || (arg[0] != '-') || (arg[1] != '-'))
+			{
+				continue;
+			}
+
+			std::string key, val;
+			const std::string::size_type pos = arg.find('=');
+			if (pos == std::string::npos)
+			{
+				// Bare flag: everything after "--" is the key, value stays empty.
+				key = arg.substr(2);
+			}
+			else
+			{
+				key = arg.substr(2, pos - 2);
+				val = arg.substr(pos + 1);
+			}
+
+			// insert() leaves an existing entry untouched, so only new keys are added.
+			pairs.insert(std::make_pair(key, val));
+		}
+	}
+
+	/// @return true when arg_name was given on the command line (with or without a value).
+	bool CheckCmdLineFlag(const char *arg_name) const
+	{
+		return pairs.find(arg_name) != pairs.end();
+	}
+
+	/// Looks up arg_name and, when present, converts its value into val.
+	/// @return true when the key exists.
+	template <typename T>
+	bool GetCmdLineArgument(const char *arg_name, T &val);
+
+	/// @return number of distinct keys parsed so far.
+	int ParsedArgc() const
+	{
+		return static_cast<int>(pairs.size());
+	}
+};
+
+template <typename T>
+inline bool b3CommandLineArgs::GetCmdLineArgument(const char *arg_name, T &val)
+{
+	// Look the key up; when present, stream-extract its text into val.
+	// The return value reports only whether the key exists, not whether
+	// the extraction succeeded.
+	const std::map<std::string, std::string>::iterator found = pairs.find(arg_name);
+	if (found == pairs.end())
+	{
+		return false;
+	}
+	std::istringstream parser(found->second);
+	parser >> val;
+	return true;
+}
+
+/// Specialization for C strings: on success `val` receives a malloc()-allocated,
+/// NUL-terminated copy of the value, which the caller must release with free().
+/// `val` is set to NULL and false is returned when the key is absent or the
+/// allocation fails.
+template <>
+inline bool b3CommandLineArgs::GetCmdLineArgument<char *>(const char *arg_name, char *&val)
+{
+	val = NULL;
+	const std::map<std::string, std::string>::iterator itr = pairs.find(arg_name);
+	if (itr == pairs.end())
+	{
+		return false;
+	}
+
+	const std::string &s = itr->second;
+	char *copy = static_cast<char *>(std::malloc(sizeof(char) * (s.length() + 1)));
+	if (copy == NULL)
+	{
+		// Out of memory: report failure instead of copying through a null pointer.
+		return false;
+	}
+	std::strcpy(copy, s.c_str());
+	val = copy;
+	return true;
+}
+
+#endif  //COMMAND_LINE_ARGS_H

+ 133 - 0
Dependencies/include/bullet3/Bullet3Common/b3FileUtils.h

@@ -0,0 +1,133 @@
+#ifndef B3_FILE_UTILS_H
+#define B3_FILE_UTILS_H
+
+#include <stdio.h>
+#include "b3Scalar.h"
+#include <stddef.h>  //ptrdiff_h
+#include <string.h>
+
+/// Small collection of static helpers for locating files and manipulating
+/// path strings.
+struct b3FileUtils
+{
+	b3FileUtils()
+	{
+	}
+	virtual ~b3FileUtils()
+	{
+	}
+
+	/// Tries to open orgFileName as given and then under a fixed list of
+	/// directory prefixes relative to the current working directory.
+	/// @param orgFileName file name to look for
+	/// @param relativeFileName buffer that receives the path that could be opened
+	/// @param maxRelativeFileNameMaxLen capacity of relativeFileName in bytes, terminator included
+	/// @return true when one of the candidates could be opened for reading.
+	///         Candidates that do not fit into relativeFileName are skipped
+	///         rather than written past the end of the buffer.
+	static bool findFile(const char* orgFileName, char* relativeFileName, int maxRelativeFileNameMaxLen)
+	{
+		if (!orgFileName || !relativeFileName || maxRelativeFileNameMaxLen <= 0)
+		{
+			return false;
+		}
+
+		// The empty first entry tries the name exactly as given.
+		const char* prefix[] = {"", "./", "./data/", "../data/", "../../data/", "../../../data/", "../../../../data/"};
+		const int numPrefixes = sizeof(prefix) / sizeof(const char*);
+
+		for (int i = 0; i < numPrefixes; i++)
+		{
+			// snprintf never writes more than the given capacity; a result outside
+			// [0, capacity) means the candidate was truncated and is not a usable path.
+			const int written = snprintf(relativeFileName, maxRelativeFileNameMaxLen, "%s%s", prefix[i], orgFileName);
+			if (written < 0 || written >= maxRelativeFileNameMaxLen)
+			{
+				continue;
+			}
+
+			FILE* f = fopen(relativeFileName, "rb");
+			if (f)
+			{
+				fclose(f);
+				return true;
+			}
+		}
+		return false;
+	}
+
+	/// Skips over every occurrence of `pattern` found while scanning `name`
+	/// from left to right and returns the position just past the last one.
+	/// Returns `name` itself when the pattern does not occur or is empty.
+	static const char* strip2(const char* name, const char* pattern)
+	{
+		size_t const patlen = strlen(pattern);
+		if (patlen == 0)
+		{
+			// strstr() matches an empty pattern at every position without advancing.
+			return name;
+		}
+
+		const char* oriptr = name;
+		const char* patloc;
+		while ((patloc = strstr(oriptr, pattern)) != 0)
+		{
+			oriptr = patloc + patlen;
+		}
+		return oriptr;
+	}
+
+	/// Copies the leading directory part of fileName into path. The directory
+	/// part ends after the last '/' and, behind that, the last '\\'.
+	/// @param maxPathLength capacity of path in bytes
+	/// @return number of characters copied; 0 (with path set to "") when there
+	///         is no directory part or it does not fit.
+	static int extractPath(const char* fileName, char* path, int maxPathLength)
+	{
+		const char* stripped = strip2(fileName, "/");
+		stripped = strip2(stripped, "\\");
+
+		ptrdiff_t len = stripped - fileName;
+		b3Assert((len + 1) < maxPathLength);
+
+		if (len && ((len + 1) < maxPathLength))
+		{
+			for (ptrdiff_t i = 0; i < len; i++)
+			{
+				path[i] = fileName[i];
+			}
+			path[len] = 0;
+		}
+		else
+		{
+			len = 0;
+			b3Assert(maxPathLength > 0);
+			if (maxPathLength > 0)
+			{
+				path[len] = 0;
+			}
+		}
+		return static_cast<int>(len);
+	}
+
+	/// Maps 'A'..'Z' to 'a'..'z'; every other character is returned unchanged.
+	static char toLowerChar(const char t)
+	{
+		if (t >= 'A' && t <= 'Z')
+			return static_cast<char>(t + ('a' - 'A'));
+		else
+			return t;
+	}
+
+	/// Lower-cases the NUL-terminated string str in place.
+	static void toLower(char* str)
+	{
+		for (char* c = str; *c; ++c)
+		{
+			*c = toLowerChar(*c);
+		}
+	}
+};
+#endif  //B3_FILE_UTILS_H

+ 462 - 0
Dependencies/include/bullet3/Bullet3Common/b3HashMap.h

@@ -0,0 +1,462 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2013 Erwin Coumans  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef B3_HASH_MAP_H
+#define B3_HASH_MAP_H
+
+#include "b3AlignedObjectArray.h"
+
+#include <string>
+
+///very basic hashable string implementation, compatible with b3HashMap
+struct b3HashString
+{
+	std::string m_string;  ///< copy of the string passed to the constructor
+	unsigned int m_hash;   ///< FNV hash of m_string, computed once in the constructor
+
+	B3_FORCE_INLINE unsigned int getHash() const
+	{
+		return m_hash;
+	}
+
+	/// Stores a copy of `name` and precomputes its hash. A null pointer is
+	/// treated as the empty string.
+	b3HashString(const char* name)
+		: m_string(name ? name : "")
+	{
+		/* magic numbers from http://www.isthe.com/chongo/tech/comp/fnv/ */
+		static const unsigned int InitialFNV = 2166136261u;
+		static const unsigned int FNVMultiple = 16777619u;
+
+		/* Fowler / Noll / Vo (FNV) Hash */
+		unsigned int hash = InitialFNV;
+		const std::string::size_type len = m_string.length();
+		for (std::string::size_type i = 0; i < len; i++)
+		{
+			// Go through unsigned char so that bytes >= 0x80 are not sign-extended
+			// where plain char is signed; only the low 8 bits are mixed in.
+			hash = hash ^ static_cast<unsigned char>(m_string[i]);
+			hash = hash * FNVMultiple; /* multiply by the magic number */
+		}
+		m_hash = hash;
+	}
+
+	/// strcmp-style comparison on unsigned bytes.
+	/// @return -1, 0 or 1 when src sorts before, equal to, or after dst.
+	int portableStringCompare(const char* src, const char* dst) const
+	{
+		int ret = 0;
+
+		while (!(ret = *(const unsigned char*)src - *(const unsigned char*)dst) && *dst)
+			++src, ++dst;
+
+		if (ret < 0)
+			ret = -1;
+		else if (ret > 0)
+			ret = 1;
+
+		return (ret);
+	}
+
+	/// Two hash strings are equal when their stored strings are equal.
+	bool equals(const b3HashString& other) const
+	{
+		return (m_string == other.m_string);
+	}
+};
+
+const int B3_HASH_NULL = 0xffffffff;
+
+class b3HashInt
+{
+	int m_uid;  // integer key wrapped by this object
+
+public:
+	b3HashInt(int uid) : m_uid(uid) {}
+
+	int getUid1() const { return m_uid; }
+
+	void setUid1(int uid) { m_uid = uid; }
+
+	// Two keys are equal when they wrap the same integer.
+	bool equals(const b3HashInt& other) const
+	{
+		return m_uid == other.m_uid;
+	}
+
+	// Scrambles the integer key with Thomas Wang's shift/add/xor sequence.
+	B3_FORCE_INLINE unsigned int getHash() const
+	{
+		int h = m_uid;
+		h = h + ~(h << 15);
+		h = h ^ (h >> 10);
+		h = h + (h << 3);
+		h = h ^ (h >> 6);
+		h = h + ~(h << 11);
+		h = h ^ (h >> 16);
+		return h;
+	}
+};
+
+class b3HashPtr
+{
+	// The pointer shares its storage with two ints so that getHash() can
+	// read the pointer's bits as integers.
+	union {
+		const void* m_pointer;
+		int m_hashValues[2];
+	};
+
+public:
+	b3HashPtr(const void* ptr)
+		: m_pointer(ptr)
+	{
+	}
+
+	const void* getPointer() const { return m_pointer; }
+
+	// Keys are equal when they wrap the same address.
+	bool equals(const b3HashPtr& other) const
+	{
+		return m_pointer == other.m_pointer;
+	}
+
+	// Folds the pointer into one int (both halves are summed when pointers
+	// are 8 bytes wide) and scrambles it with Thomas Wang's hash.
+	B3_FORCE_INLINE unsigned int getHash() const
+	{
+		const bool pointerIs8Bytes = (sizeof(void*) == 8);
+
+		int h = pointerIs8Bytes ? m_hashValues[0] + m_hashValues[1] : m_hashValues[0];
+
+		h = h + ~(h << 15);
+		h = h ^ (h >> 10);
+		h = h + (h << 3);
+		h = h ^ (h >> 6);
+		h = h + ~(h << 11);
+		h = h ^ (h >> 16);
+		return h;
+	}
+};
+
+template <class Value>
+class b3HashKeyPtr
+{
+	int m_uid;  // integer key wrapped by this object
+
+public:
+	b3HashKeyPtr(int uid) : m_uid(uid) {}
+
+	int getUid1() const { return m_uid; }
+
+	// Two keys are equal when they wrap the same integer.
+	bool equals(const b3HashKeyPtr<Value>& other) const
+	{
+		return m_uid == other.m_uid;
+	}
+
+	// Scrambles the integer key with Thomas Wang's shift/add/xor sequence.
+	B3_FORCE_INLINE unsigned int getHash() const
+	{
+		int h = m_uid;
+		h = h + ~(h << 15);
+		h = h ^ (h >> 10);
+		h = h + (h << 3);
+		h = h ^ (h >> 6);
+		h = h + ~(h << 11);
+		h = h ^ (h >> 16);
+		return h;
+	}
+};
+
+template <class Value>
+class b3HashKey
+{
+	int m_uid;  // integer key wrapped by this object
+
+public:
+	b3HashKey(int uid) : m_uid(uid) {}
+
+	int getUid1() const { return m_uid; }
+
+	// Two keys are equal when they wrap the same integer.
+	bool equals(const b3HashKey<Value>& other) const
+	{
+		return m_uid == other.m_uid;
+	}
+
+	// Scrambles the integer key with Thomas Wang's shift/add/xor sequence.
+	B3_FORCE_INLINE unsigned int getHash() const
+	{
+		int h = m_uid;
+		h = h + ~(h << 15);
+		h = h ^ (h >> 10);
+		h = h + (h << 3);
+		h = h ^ (h >> 6);
+		h = h + ~(h << 11);
+		h = h ^ (h >> 16);
+		return h;
+	}
+};
+
+///The b3HashMap template class implements a generic and lightweight hashmap.
+///A basic sample of how to use b3HashMap is located in Demos\BasicDemo\main.cpp
+template <class Key, class Value>
+class b3HashMap
+{
+protected:
+	// m_hashTable[bucket] holds the index of the first entry of that bucket's
+	// chain and m_next[entry] the index of the following entry; B3_HASH_NULL
+	// terminates a chain.
+	b3AlignedObjectArray<int> m_hashTable;
+	b3AlignedObjectArray<int> m_next;
+
+	// Values and keys live in parallel arrays addressed by entry index.
+	b3AlignedObjectArray<Value> m_valueArray;
+	b3AlignedObjectArray<Key> m_keyArray;
+
+	// Brings the bucket and chain tables up to the value array's capacity and
+	// re-links the entries that were covered by the old table.
+	void growTables(const Key& /*key*/)
+	{
+		const int capacity = m_valueArray.capacity();
+		if (m_hashTable.size() >= capacity)
+		{
+			return;
+		}
+
+		const int oldTableSize = m_hashTable.size();
+
+		m_hashTable.resize(capacity);
+		m_next.resize(capacity);
+
+		// Start from empty buckets and empty chains.
+		for (int slot = 0; slot < capacity; ++slot)
+		{
+			m_hashTable[slot] = B3_HASH_NULL;
+			m_next[slot] = B3_HASH_NULL;
+		}
+
+		// Push every previously covered entry onto the head of its bucket,
+		// using the mask derived from the new capacity.
+		for (int entry = 0; entry < oldTableSize; ++entry)
+		{
+			const int bucket = m_keyArray[entry].getHash() & (capacity - 1);
+			m_next[entry] = m_hashTable[bucket];
+			m_hashTable[bucket] = entry;
+		}
+	}
+
+	// Takes `entry` out of the chain of `bucket`. The entry must be on that chain.
+	void unlinkFromChain(int bucket, int entry)
+	{
+		int cursor = m_hashTable[bucket];
+		b3Assert(cursor != B3_HASH_NULL);
+
+		int before = B3_HASH_NULL;
+		while (cursor != entry)
+		{
+			before = cursor;
+			cursor = m_next[cursor];
+		}
+
+		if (before == B3_HASH_NULL)
+		{
+			// Entry was the head of the chain.
+			m_hashTable[bucket] = m_next[entry];
+		}
+		else
+		{
+			b3Assert(m_next[before] == entry);
+			m_next[before] = m_next[entry];
+		}
+	}
+
+public:
+	// Stores value under key; an existing key only has its value overwritten.
+	void insert(const Key& key, const Value& value)
+	{
+		const int existing = findIndex(key);
+		if (existing != B3_HASH_NULL)
+		{
+			m_valueArray[existing] = value;
+			return;
+		}
+
+		const int entry = m_valueArray.size();
+		const int capacityBefore = m_valueArray.capacity();
+		m_valueArray.push_back(value);
+		m_keyArray.push_back(key);
+
+		if (capacityBefore < m_valueArray.capacity())
+		{
+			growTables(key);
+		}
+
+		// The bucket is computed against the (possibly grown) capacity.
+		const int bucket = key.getHash() & (m_valueArray.capacity() - 1);
+		m_next[entry] = m_hashTable[bucket];
+		m_hashTable[bucket] = entry;
+	}
+
+	// Removes key if present. The last entry is moved into the freed slot so
+	// that the key/value arrays stay dense.
+	void remove(const Key& key)
+	{
+		const int pairIndex = findIndex(key);
+		if (pairIndex == B3_HASH_NULL)
+		{
+			return;
+		}
+
+		// Remove the pair from the hash table.
+		const int hash = key.getHash() & (m_valueArray.capacity() - 1);
+		unlinkFromChain(hash, pairIndex);
+
+		const int lastPairIndex = m_valueArray.size() - 1;
+		if (lastPairIndex != pairIndex)
+		{
+			// Detach the last pair from its chain, copy it into the freed
+			// slot, then link that slot in as the head of the same bucket.
+			const int lastHash = m_keyArray[lastPairIndex].getHash() & (m_valueArray.capacity() - 1);
+			unlinkFromChain(lastHash, lastPairIndex);
+
+			m_valueArray[pairIndex] = m_valueArray[lastPairIndex];
+			m_keyArray[pairIndex] = m_keyArray[lastPairIndex];
+
+			m_next[pairIndex] = m_hashTable[lastHash];
+			m_hashTable[lastHash] = pairIndex;
+		}
+
+		m_valueArray.pop_back();
+		m_keyArray.pop_back();
+	}
+
+	// Number of stored key/value pairs.
+	int size() const
+	{
+		return m_valueArray.size();
+	}
+
+	// Address of the value stored at entry `index`.
+	const Value* getAtIndex(int index) const
+	{
+		b3Assert(index < m_valueArray.size());
+		return &m_valueArray[index];
+	}
+
+	Value* getAtIndex(int index)
+	{
+		b3Assert(index < m_valueArray.size());
+		return &m_valueArray[index];
+	}
+
+	// Copy of the key stored at entry `index`.
+	Key getKeyAtIndex(int index)
+	{
+		b3Assert(index < m_keyArray.size());
+		return m_keyArray[index];
+	}
+
+	const Key getKeyAtIndex(int index) const
+	{
+		b3Assert(index < m_keyArray.size());
+		return m_keyArray[index];
+	}
+
+	// Same as find(): a pointer to the value, or NULL when the key is absent.
+	Value* operator[](const Key& key)
+	{
+		return find(key);
+	}
+
+	// Pointer to the value stored under key, or NULL when the key is absent.
+	const Value* find(const Key& key) const
+	{
+		const int index = findIndex(key);
+		if (index != B3_HASH_NULL)
+		{
+			return &m_valueArray[index];
+		}
+		return NULL;
+	}
+
+	Value* find(const Key& key)
+	{
+		const int index = findIndex(key);
+		if (index != B3_HASH_NULL)
+		{
+			return &m_valueArray[index];
+		}
+		return NULL;
+	}
+
+	// Entry index of key, or B3_HASH_NULL when the key is not stored.
+	int findIndex(const Key& key) const
+	{
+		const unsigned int bucket = key.getHash() & (m_valueArray.capacity() - 1);
+		if (bucket >= (unsigned int)m_hashTable.size())
+		{
+			return B3_HASH_NULL;
+		}
+
+		// Walk the bucket's chain until the key matches or the chain ends.
+		int index = m_hashTable[bucket];
+		for (; index != B3_HASH_NULL; index = m_next[index])
+		{
+			if (key.equals(m_keyArray[index]))
+			{
+				break;
+			}
+		}
+		return index;
+	}
+
+	// Empties all four underlying arrays.
+	void clear()
+	{
+		m_keyArray.clear();
+		m_valueArray.clear();
+		m_next.clear();
+		m_hashTable.clear();
+	}
+};
+
+#endif  //B3_HASH_MAP_H

+ 145 - 0
Dependencies/include/bullet3/Bullet3Common/b3Logging.cpp

@@ -0,0 +1,145 @@
+/*
+Copyright (c) 2013 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Erwin Coumans
+
+#include "b3Logging.h"
+
+#include <stdio.h>
+#include <stdarg.h>
+
+#ifdef _WIN32
+#include <windows.h>
+#endif  //_WIN32
+
+void b3PrintfFuncDefault(const char* msg)
+{
+	// Default b3Printf sink: the text goes to the debugger output on Windows
+	// and to stdout everywhere, which is flushed after every message.
+#ifdef _WIN32
+	OutputDebugStringA(msg);
+#endif
+	printf("%s", msg);
+	fflush(stdout);
+}
+
+void b3WarningMessageFuncDefault(const char* msg)
+{
+	// Default b3Warning sink: the text goes to the debugger output on Windows
+	// and to stdout everywhere, which is flushed after every message.
+#ifdef _WIN32
+	OutputDebugStringA(msg);
+#endif
+	printf("%s", msg);
+	fflush(stdout);
+}
+
+void b3ErrorMessageFuncDefault(const char* msg)
+{
+	// Default b3Error sink: the text goes to the debugger output on Windows
+	// and to stdout everywhere, which is flushed after every message.
+#ifdef _WIN32
+	OutputDebugStringA(msg);
+#endif
+	printf("%s", msg);
+	fflush(stdout);
+}
+
+static b3PrintfFunc* b3s_printfFunc = b3PrintfFuncDefault;
+static b3WarningMessageFunc* b3s_warningMessageFunc = b3WarningMessageFuncDefault;
+static b3ErrorMessageFunc* b3s_errorMessageFunc = b3ErrorMessageFuncDefault;
+
+///The developer can route b3Printf output using their own implementation
+void b3SetCustomPrintfFunc(b3PrintfFunc* printfFunc)
+{
+	// From now on b3Printf output is handed to printfFunc.
+	b3s_printfFunc = printfFunc;
+}
+// Routes b3Warning output to warningMessageFunc. The parameter uses the
+// b3WarningMessageFunc typedef so the definition matches the declaration in
+// b3Logging.h.
+void b3SetCustomWarningMessageFunc(b3WarningMessageFunc* warningMessageFunc)
+{
+	b3s_warningMessageFunc = warningMessageFunc;
+}
+// Routes b3Error output to errorMessageFunc. The parameter uses the
+// b3ErrorMessageFunc typedef so the definition matches the declaration in
+// b3Logging.h.
+void b3SetCustomErrorMessageFunc(b3ErrorMessageFunc* errorMessageFunc)
+{
+	b3s_errorMessageFunc = errorMessageFunc;
+}
+
+//#define B3_MAX_DEBUG_STRING_LENGTH 2048
+#define B3_MAX_DEBUG_STRING_LENGTH 32768
+
+void b3OutputPrintfVarArgsInternal(const char* str, ...)
+{
+	char strDebug[B3_MAX_DEBUG_STRING_LENGTH] = {0};
+	va_list argList;
+	va_start(argList, str);
+#ifdef _MSC_VER
+	vsprintf_s(strDebug, B3_MAX_DEBUG_STRING_LENGTH, str, argList);
+#else
+	vsnprintf(strDebug, B3_MAX_DEBUG_STRING_LENGTH, str, argList);
+#endif
+	(b3s_printfFunc)(strDebug);
+	va_end(argList);
+}
+void b3OutputWarningMessageVarArgsInternal(const char* str, ...)
+{
+	char strDebug[B3_MAX_DEBUG_STRING_LENGTH] = {0};
+	va_list argList;
+	va_start(argList, str);
+#ifdef _MSC_VER
+	vsprintf_s(strDebug, B3_MAX_DEBUG_STRING_LENGTH, str, argList);
+#else
+	vsnprintf(strDebug, B3_MAX_DEBUG_STRING_LENGTH, str, argList);
+#endif
+	(b3s_warningMessageFunc)(strDebug);
+	va_end(argList);
+}
+void b3OutputErrorMessageVarArgsInternal(const char* str, ...)
+{
+	char strDebug[B3_MAX_DEBUG_STRING_LENGTH] = {0};
+	va_list argList;
+	va_start(argList, str);
+#ifdef _MSC_VER
+	vsprintf_s(strDebug, B3_MAX_DEBUG_STRING_LENGTH, str, argList);
+#else
+	vsnprintf(strDebug, B3_MAX_DEBUG_STRING_LENGTH, str, argList);
+#endif
+	(b3s_errorMessageFunc)(strDebug);
+	va_end(argList);
+}
+
+void b3EnterProfileZoneDefault(const char* /*name*/)
+{
+	// Default enter hook: intentionally does nothing.
+}
+void b3LeaveProfileZoneDefault()
+{
+	// Default leave hook: intentionally does nothing.
+}
+static b3EnterProfileZoneFunc* b3s_enterFunc = b3EnterProfileZoneDefault;
+static b3LeaveProfileZoneFunc* b3s_leaveFunc = b3LeaveProfileZoneDefault;
+void b3EnterProfileZone(const char* name)
+{
+	// Forward to the currently installed enter hook.
+	b3s_enterFunc(name);
+}
+void b3LeaveProfileZone()
+{
+	// Forward to the currently installed leave hook.
+	b3s_leaveFunc();
+}
+
+void b3SetCustomEnterProfileZoneFunc(b3EnterProfileZoneFunc* enterFunc)
+{
+	// From now on b3EnterProfileZone calls enterFunc.
+	b3s_enterFunc = enterFunc;
+}
+void b3SetCustomLeaveProfileZoneFunc(b3LeaveProfileZoneFunc* leaveFunc)
+{
+	// From now on b3LeaveProfileZone calls leaveFunc.
+	b3s_leaveFunc = leaveFunc;
+}
+
+#ifndef _MSC_VER
+#undef vsprintf_s
+#endif

+ 74 - 0
Dependencies/include/bullet3/Bullet3Common/b3Logging.h

@@ -0,0 +1,74 @@
+
+#ifndef B3_LOGGING_H
+#define B3_LOGGING_H
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+///b3Warning and b3Error wrap their two calls in do { ... } while (0) so that a statement such as
+///"if (condition) b3Warning("test"); else {...}" still compiles and behaves as a single statement.
+///You can also customize b3Printf by uncommenting a different definition below
+#define b3Printf(...) b3OutputPrintfVarArgsInternal(__VA_ARGS__)
+	//#define b3Printf(...) do {b3OutputPrintfVarArgsInternal("b3Printf[%s,%d]:",__FILE__,__LINE__);b3OutputPrintfVarArgsInternal(__VA_ARGS__); } while(0)
+	//#define b3Printf b3OutputPrintfVarArgsInternal
+	//#define b3Printf(...) printf(__VA_ARGS__)
+	//#define b3Printf(...)
+#define b3Warning(...) do{	b3OutputWarningMessageVarArgsInternal("b3Warning[%s,%d]:\n", __FILE__, __LINE__);b3OutputWarningMessageVarArgsInternal(__VA_ARGS__);} while (0)
+#define b3Error(...)do	{b3OutputErrorMessageVarArgsInternal("b3Error[%s,%d]:\n", __FILE__, __LINE__);b3OutputErrorMessageVarArgsInternal(__VA_ARGS__);} while (0)
+#ifndef B3_NO_PROFILE
+
+	void b3EnterProfileZone(const char* name);
+	void b3LeaveProfileZone();
+#ifdef __cplusplus
+
+	// Scope guard for profiling: enters the named zone on construction and
+	// leaves it again on destruction.
+	class b3ProfileZone
+	{
+	public:
+		b3ProfileZone(const char* name) { b3EnterProfileZone(name); }
+
+		~b3ProfileZone() { b3LeaveProfileZone(); }
+	};
+
+#define B3_PROFILE(name) b3ProfileZone __profile(name)
+#endif
+
+#else  //B3_NO_PROFILE
+
+#define B3_PROFILE(name)
+#define b3StartProfile(a)
+#define b3StopProfile
+
+#endif  //#ifndef B3_NO_PROFILE
+
+	typedef void(b3PrintfFunc)(const char* msg);
+	typedef void(b3WarningMessageFunc)(const char* msg);
+	typedef void(b3ErrorMessageFunc)(const char* msg);
+	typedef void(b3EnterProfileZoneFunc)(const char* msg);
+	typedef void(b3LeaveProfileZoneFunc)();
+
+	///The developer can route b3Printf output using their own implementation
+	void b3SetCustomPrintfFunc(b3PrintfFunc* printfFunc);
+	void b3SetCustomWarningMessageFunc(b3WarningMessageFunc* warningMsgFunc);
+	void b3SetCustomErrorMessageFunc(b3ErrorMessageFunc* errorMsgFunc);
+
+	///Set custom profile zone functions (zones can be nested)
+	void b3SetCustomEnterProfileZoneFunc(b3EnterProfileZoneFunc* enterFunc);
+	void b3SetCustomLeaveProfileZoneFunc(b3LeaveProfileZoneFunc* leaveFunc);
+
+	///Don't use those internal functions directly, use the b3Printf or b3SetCustomPrintfFunc instead (or warning/error version)
+	void b3OutputPrintfVarArgsInternal(const char* str, ...);
+	void b3OutputWarningMessageVarArgsInternal(const char* str, ...);
+	void b3OutputErrorMessageVarArgsInternal(const char* str, ...);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  //B3_LOGGING_H

+ 1354 - 0
Dependencies/include/bullet3/Bullet3Common/b3Matrix3x3.h

@@ -0,0 +1,1354 @@
+/*
+Copyright (c) 2003-2013 Gino van den Bergen / Erwin Coumans  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef B3_MATRIX3x3_H
+#define B3_MATRIX3x3_H
+
+#include "b3Vector3.h"
+#include "b3Quaternion.h"
+#include <stdio.h>
+
+#ifdef B3_USE_SSE
+//const __m128 B3_ATTRIBUTE_ALIGNED16(b3v2220) = {2.0f, 2.0f, 2.0f, 0.0f};
+const __m128 B3_ATTRIBUTE_ALIGNED16(b3vMPPP) = {-0.0f, +0.0f, +0.0f, +0.0f};
+#endif
+
+#if defined(B3_USE_SSE) || defined(B3_USE_NEON)
+const b3SimdFloat4 B3_ATTRIBUTE_ALIGNED16(b3v1000) = {1.0f, 0.0f, 0.0f, 0.0f};
+const b3SimdFloat4 B3_ATTRIBUTE_ALIGNED16(b3v0100) = {0.0f, 1.0f, 0.0f, 0.0f};
+const b3SimdFloat4 B3_ATTRIBUTE_ALIGNED16(b3v0010) = {0.0f, 0.0f, 1.0f, 0.0f};
+#endif
+
+#ifdef B3_USE_DOUBLE_PRECISION
+#define b3Matrix3x3Data b3Matrix3x3DoubleData
+#else
+#define b3Matrix3x3Data b3Matrix3x3FloatData
+#endif  //B3_USE_DOUBLE_PRECISION
+
+/**@brief The b3Matrix3x3 class implements a 3x3 rotation matrix, to perform linear algebra in combination with b3Quaternion, b3Transform and b3Vector3.
+* Make sure to only include a pure orthogonal matrix without scaling. */
+B3_ATTRIBUTE_ALIGNED16(class)
+b3Matrix3x3
+{
+	///Data storage for the matrix, each vector is a row of the matrix
+	b3Vector3 m_el[3];
+
+public:
+	/** @brief No-initialization constructor */
+	b3Matrix3x3() {}
+
+	//		explicit b3Matrix3x3(const b3Scalar *m) { setFromOpenGLSubMatrix(m); }
+
+	/**@brief Constructor from Quaternion */
+	explicit b3Matrix3x3(const b3Quaternion& q) { setRotation(q); }
+	/*
+	template <typename b3Scalar>
+	Matrix3x3(const b3Scalar& yaw, const b3Scalar& pitch, const b3Scalar& roll)
+	{ 
+	setEulerYPR(yaw, pitch, roll);
+	}
+	*/
+	/** @brief Constructor with row major formatting */
+	b3Matrix3x3(const b3Scalar& xx, const b3Scalar& xy, const b3Scalar& xz,
+				const b3Scalar& yx, const b3Scalar& yy, const b3Scalar& yz,
+				const b3Scalar& zx, const b3Scalar& zy, const b3Scalar& zz)
+	{
+		setValue(xx, xy, xz,
+				 yx, yy, yz,
+				 zx, zy, zz);
+	}
+
+#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)) || defined(B3_USE_NEON)
+	B3_FORCE_INLINE b3Matrix3x3(const b3SimdFloat4 v0, const b3SimdFloat4 v1, const b3SimdFloat4 v2)
+	{
+		m_el[0].mVec128 = v0;
+		m_el[1].mVec128 = v1;
+		m_el[2].mVec128 = v2;
+	}
+
+	B3_FORCE_INLINE b3Matrix3x3(const b3Vector3& v0, const b3Vector3& v1, const b3Vector3& v2)
+	{
+		m_el[0] = v0;
+		m_el[1] = v1;
+		m_el[2] = v2;
+	}
+
+	// Copy constructor
+	B3_FORCE_INLINE b3Matrix3x3(const b3Matrix3x3& rhs)
+	{
+		m_el[0].mVec128 = rhs.m_el[0].mVec128;
+		m_el[1].mVec128 = rhs.m_el[1].mVec128;
+		m_el[2].mVec128 = rhs.m_el[2].mVec128;
+	}
+
+	// Assignment Operator
+	B3_FORCE_INLINE b3Matrix3x3& operator=(const b3Matrix3x3& m)
+	{
+		m_el[0].mVec128 = m.m_el[0].mVec128;
+		m_el[1].mVec128 = m.m_el[1].mVec128;
+		m_el[2].mVec128 = m.m_el[2].mVec128;
+
+		return *this;
+	}
+
+#else
+
+	/** @brief Copy constructor */
+	B3_FORCE_INLINE b3Matrix3x3(const b3Matrix3x3& other)
+	{
+		m_el[0] = other.m_el[0];
+		m_el[1] = other.m_el[1];
+		m_el[2] = other.m_el[2];
+	}
+
+	/** @brief Assignment Operator */
+	B3_FORCE_INLINE b3Matrix3x3& operator=(const b3Matrix3x3& other)
+	{
+		m_el[0] = other.m_el[0];
+		m_el[1] = other.m_el[1];
+		m_el[2] = other.m_el[2];
+		return *this;
+	}
+
+#endif
+
+	/** @brief Get a column of the matrix as a vector 
+	*  @param i Column number 0 indexed */
+	B3_FORCE_INLINE b3Vector3 getColumn(int i) const
+	{
+		return b3MakeVector3(m_el[0][i], m_el[1][i], m_el[2][i]);
+	}
+
+	/** @brief Get a row of the matrix as a vector 
+	*  @param i Row number 0 indexed */
+	B3_FORCE_INLINE const b3Vector3& getRow(int i) const
+	{
+		b3FullAssert(0 <= i && i < 3);
+		return m_el[i];
+	}
+
+	/** @brief Get a mutable reference to a row of the matrix as a vector 
+	*  @param i Row number 0 indexed */
+	B3_FORCE_INLINE b3Vector3& operator[](int i)
+	{
+		b3FullAssert(0 <= i && i < 3);
+		return m_el[i];
+	}
+
+	/** @brief Get a const reference to a row of the matrix as a vector 
+	*  @param i Row number 0 indexed */
+	B3_FORCE_INLINE const b3Vector3& operator[](int i) const
+	{
+		b3FullAssert(0 <= i && i < 3);
+		return m_el[i];
+	}
+
+	/** @brief Multiply by the target matrix on the right
+	*  @param m Rotation matrix to be applied
+	* Equivalent to this = this * m */
+	b3Matrix3x3& operator*=(const b3Matrix3x3& m);
+
+	/** @brief Adds the target matrix on the right
+	*  @param m matrix to be applied
+	* Equivalent to this = this + m */
+	b3Matrix3x3& operator+=(const b3Matrix3x3& m);
+
+	/** @brief Subtracts the target matrix on the right
+	*  @param m matrix to be applied
+	* Equivalent to this = this - m */
+	b3Matrix3x3& operator-=(const b3Matrix3x3& m);
+
+	/** @brief Set from the rotational part of a 4x4 OpenGL matrix
+	*  @param m A pointer to the beginning of the array of scalars*/
+	void setFromOpenGLSubMatrix(const b3Scalar* m)
+	{
+		m_el[0].setValue(m[0], m[4], m[8]);
+		m_el[1].setValue(m[1], m[5], m[9]);
+		m_el[2].setValue(m[2], m[6], m[10]);
+	}
+	/** @brief Set the values of the matrix explicitly (row major)
+	*  @param xx Top left
+	*  @param xy Top Middle
+	*  @param xz Top Right
+	*  @param yx Middle Left
+	*  @param yy Middle Middle
+	*  @param yz Middle Right
+	*  @param zx Bottom Left
+	*  @param zy Bottom Middle
+	*  @param zz Bottom Right*/
+	void setValue(const b3Scalar& xx, const b3Scalar& xy, const b3Scalar& xz,
+				  const b3Scalar& yx, const b3Scalar& yy, const b3Scalar& yz,
+				  const b3Scalar& zx, const b3Scalar& zy, const b3Scalar& zz)
+	{
+		m_el[0].setValue(xx, xy, xz);
+		m_el[1].setValue(yx, yy, yz);
+		m_el[2].setValue(zx, zy, zz);
+	}
+
+	/** @brief Set the matrix from a quaternion
+	*  @param q The Quaternion to match */
+	void setRotation(const b3Quaternion& q)
+	{
+		b3Scalar d = q.length2();
+		b3FullAssert(d != b3Scalar(0.0));
+		b3Scalar s = b3Scalar(2.0) / d;
+
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
+		__m128 vs, Q = q.get128();
+		__m128i Qi = b3CastfTo128i(Q);
+		__m128 Y, Z;
+		__m128 V1, V2, V3;
+		__m128 V11, V21, V31;
+		__m128 NQ = _mm_xor_ps(Q, b3vMzeroMask);
+		__m128i NQi = b3CastfTo128i(NQ);
+
+		V1 = b3CastiTo128f(_mm_shuffle_epi32(Qi, B3_SHUFFLE(1, 0, 2, 3)));  // Y X Z W
+		V2 = _mm_shuffle_ps(NQ, Q, B3_SHUFFLE(0, 0, 1, 3));                 // -X -X  Y  W
+		V3 = b3CastiTo128f(_mm_shuffle_epi32(Qi, B3_SHUFFLE(2, 1, 0, 3)));  // Z Y X W
+		V1 = _mm_xor_ps(V1, b3vMPPP);                                       //	change the sign of the first element
+
+		V11 = b3CastiTo128f(_mm_shuffle_epi32(Qi, B3_SHUFFLE(1, 1, 0, 3)));  // Y Y X W
+		V21 = _mm_unpackhi_ps(Q, Q);                                         //  Z  Z  W  W
+		V31 = _mm_shuffle_ps(Q, NQ, B3_SHUFFLE(0, 2, 0, 3));                 //  X  Z -X -W
+
+		V2 = V2 * V1;   //
+		V1 = V1 * V11;  //
+		V3 = V3 * V31;  //
+
+		V11 = _mm_shuffle_ps(NQ, Q, B3_SHUFFLE(2, 3, 1, 3));                //	-Z -W  Y  W
+		V11 = V11 * V21;                                                    //
+		V21 = _mm_xor_ps(V21, b3vMPPP);                                     //	change the sign of the first element
+		V31 = _mm_shuffle_ps(Q, NQ, B3_SHUFFLE(3, 3, 1, 3));                //	 W  W -Y -W
+		V31 = _mm_xor_ps(V31, b3vMPPP);                                     //	change the sign of the first element
+		Y = b3CastiTo128f(_mm_shuffle_epi32(NQi, B3_SHUFFLE(3, 2, 0, 3)));  // -W -Z -X -W
+		Z = b3CastiTo128f(_mm_shuffle_epi32(Qi, B3_SHUFFLE(1, 0, 1, 3)));   //  Y  X  Y  W
+
+		vs = _mm_load_ss(&s);
+		V21 = V21 * Y;
+		V31 = V31 * Z;
+
+		V1 = V1 + V11;
+		V2 = V2 + V21;
+		V3 = V3 + V31;
+
+		vs = b3_splat3_ps(vs, 0);
+		//	s ready
+		V1 = V1 * vs;
+		V2 = V2 * vs;
+		V3 = V3 * vs;
+
+		V1 = V1 + b3v1000;
+		V2 = V2 + b3v0100;
+		V3 = V3 + b3v0010;
+
+		m_el[0] = b3MakeVector3(V1);
+		m_el[1] = b3MakeVector3(V2);
+		m_el[2] = b3MakeVector3(V3);
+#else
+		b3Scalar xs = q.getX() * s, ys = q.getY() * s, zs = q.getZ() * s;
+		b3Scalar wx = q.getW() * xs, wy = q.getW() * ys, wz = q.getW() * zs;
+		b3Scalar xx = q.getX() * xs, xy = q.getX() * ys, xz = q.getX() * zs;
+		b3Scalar yy = q.getY() * ys, yz = q.getY() * zs, zz = q.getZ() * zs;
+		setValue(
+			b3Scalar(1.0) - (yy + zz), xy - wz, xz + wy,
+			xy + wz, b3Scalar(1.0) - (xx + zz), yz - wx,
+			xz - wy, yz + wx, b3Scalar(1.0) - (xx + yy));
+#endif
+	}
+
+	/** @brief Set the matrix from euler angles using YPR around YXZ respectively
+	*  @param yaw Yaw about Y axis
+	*  @param pitch Pitch about X axis
+	*  @param roll Roll about Z axis 
+	*/
+	void setEulerYPR(const b3Scalar& yaw, const b3Scalar& pitch, const b3Scalar& roll)
+	{
+		setEulerZYX(roll, pitch, yaw);
+	}
+
+	/** @brief Set the matrix from euler angles YPR around ZYX axes
+	* @param eulerX Roll about X axis
+	* @param eulerY Pitch around Y axis
+	* @param eulerZ Yaw about Z axis
+	*
+	* These angles are used to produce a rotation matrix. The euler
+	* angles are applied in ZYX order, i.e. a vector is first rotated
+	* about X, then Y and then Z.
+	**/
+	void setEulerZYX(b3Scalar eulerX, b3Scalar eulerY, b3Scalar eulerZ)
+	{
+		///@todo proposed to reverse this since it's labeled zyx but takes arguments xyz and it will match all other parts of the code
+		b3Scalar ci(b3Cos(eulerX));
+		b3Scalar cj(b3Cos(eulerY));
+		b3Scalar ch(b3Cos(eulerZ));
+		b3Scalar si(b3Sin(eulerX));
+		b3Scalar sj(b3Sin(eulerY));
+		b3Scalar sh(b3Sin(eulerZ));
+		b3Scalar cc = ci * ch;
+		b3Scalar cs = ci * sh;
+		b3Scalar sc = si * ch;
+		b3Scalar ss = si * sh;
+
+		setValue(cj * ch, sj * sc - cs, sj * cc + ss,
+				 cj * sh, sj * ss + cc, sj * cs - sc,
+				 -sj, cj * si, cj * ci);
+	}
+
+	/**@brief Reset this matrix to the 3x3 identity */
+	void setIdentity()
+	{
+#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)) || defined(B3_USE_NEON)
+		// SIMD builds: each row is loaded from one of the b3v1000 / b3v0100 / b3v0010 constants.
+		m_el[2] = b3MakeVector3(b3v0010);
+		m_el[1] = b3MakeVector3(b3v0100);
+		m_el[0] = b3MakeVector3(b3v1000);
+#else
+		const b3Scalar one(1.0);
+		const b3Scalar zero(0.0);
+		setValue(
+			one, zero, zero,
+			zero, one, zero,
+			zero, zero, one);
+#endif
+	}
+
+	/**@brief Access a shared identity matrix held in a function-local static
+	* @return A const reference to the identity matrix */
+	static const b3Matrix3x3& getIdentity()
+	{
+		// Only the constructor arguments differ between the SIMD and the scalar build.
+		static const b3Matrix3x3 identityMatrix
+#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)) || defined(B3_USE_NEON)
+			(b3v1000, b3v0100, b3v0010);
+#else
+			(b3Scalar(1.0), b3Scalar(0.0), b3Scalar(0.0),
+			 b3Scalar(0.0), b3Scalar(1.0), b3Scalar(0.0),
+			 b3Scalar(0.0), b3Scalar(0.0), b3Scalar(1.0));
+#endif
+		return identityMatrix;
+	}
+
+	/**@brief Fill the rotational part of an OpenGL matrix and clear the shear/perspective
+	* @param m The array to be filled; twelve scalars (m[0]..m[11]) are written
+	*
+	* The output is laid out as three groups of four scalars. Group c holds
+	* component c of rows 0, 1 and 2 of this matrix (i.e. column c), followed by a zero. */
+	void getOpenGLSubMatrix(b3Scalar * m) const
+	{
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
+		__m128 v0 = m_el[0].mVec128;
+		__m128 v1 = m_el[1].mVec128;
+		__m128 v2 = m_el[2].mVec128;  //  x2 y2 z2 w2
+		// NOTE(review): the result is stored through a __m128*, which presumably
+		// requires m to be 16-byte aligned -- confirm against callers.
+		__m128* vm = (__m128*)m;
+		__m128 vT;
+
+		v2 = _mm_and_ps(v2, b3vFFF0fMask);  //  x2 y2 z2 0
+
+		vT = _mm_unpackhi_ps(v0, v1);  //	z0 z1 * *
+		v0 = _mm_unpacklo_ps(v0, v1);  //	x0 x1 y0 y1
+
+		// v1 must be built before v0 is overwritten: both shuffles read the interleaved v0.
+		v1 = _mm_shuffle_ps(v0, v2, B3_SHUFFLE(2, 3, 1, 3));                    // y0 y1 y2 0
+		v0 = _mm_shuffle_ps(v0, v2, B3_SHUFFLE(0, 1, 0, 3));                    // x0 x1 x2 0
+		v2 = b3CastdTo128f(_mm_move_sd(b3CastfTo128d(v2), b3CastfTo128d(vT)));  // z0 z1 z2 0
+
+		vm[0] = v0;
+		vm[1] = v1;
+		vm[2] = v2;
+#elif defined(B3_USE_NEON)
+		// note: zeros the w channel. We can preserve it at the cost of two more vtrn instructions.
+		static const uint32x2_t zMask = (const uint32x2_t){-1, 0};
+		float32x4_t* vm = (float32x4_t*)m;
+		float32x4x2_t top = vtrnq_f32(m_el[0].mVec128, m_el[1].mVec128);               // {x0 x1 z0 z1}, {y0 y1 w0 w1}
+		float32x2x2_t bl = vtrn_f32(vget_low_f32(m_el[2].mVec128), vdup_n_f32(0.0f));  // {x2  0 }, {y2 0}
+		float32x4_t v0 = vcombine_f32(vget_low_f32(top.val[0]), bl.val[0]);
+		float32x4_t v1 = vcombine_f32(vget_low_f32(top.val[1]), bl.val[1]);
+		// Keep z2 and clear w2 of the last row before combining it with z0 z1.
+		float32x2_t q = (float32x2_t)vand_u32((uint32x2_t)vget_high_f32(m_el[2].mVec128), zMask);
+		float32x4_t v2 = vcombine_f32(vget_high_f32(top.val[0]), q);  // z0 z1 z2  0
+
+		vm[0] = v0;
+		vm[1] = v1;
+		vm[2] = v2;
+#else
+		// Scalar path: write the transposed 3x3 block, with a zero after every three values.
+		m[0] = b3Scalar(m_el[0].getX());
+		m[1] = b3Scalar(m_el[1].getX());
+		m[2] = b3Scalar(m_el[2].getX());
+		m[3] = b3Scalar(0.0);
+		m[4] = b3Scalar(m_el[0].getY());
+		m[5] = b3Scalar(m_el[1].getY());
+		m[6] = b3Scalar(m_el[2].getY());
+		m[7] = b3Scalar(0.0);
+		m[8] = b3Scalar(m_el[0].getZ());
+		m[9] = b3Scalar(m_el[1].getZ());
+		m[10] = b3Scalar(m_el[2].getZ());
+		m[11] = b3Scalar(0.0);
+#endif
+	}
+
+	/**@brief Get the matrix represented as a quaternion 
+	* @param q The quaternion which will be set
+	*
+	* Two cases are handled: when the trace is positive the w component is taken
+	* from the trace; otherwise the component belonging to the largest diagonal
+	* element is computed first and the remaining three are derived from it.
+	* NOTE(review): the formulas presumably expect a pure rotation matrix -- confirm. */
+	void getRotation(b3Quaternion & q) const
+	{
+#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)) || defined(B3_USE_NEON)
+		b3Scalar trace = m_el[0].getX() + m_el[1].getY() + m_el[2].getZ();
+		b3Scalar s, x;
+
+		// The four unscaled quaternion components are collected here as scalars
+		// and handed to the quaternion as one SIMD value.
+		union {
+			b3SimdFloat4 vec;
+			b3Scalar f[4];
+		} temp;
+
+		if (trace > b3Scalar(0.0))
+		{
+			x = trace + b3Scalar(1.0);
+
+			temp.f[0] = m_el[2].getY() - m_el[1].getZ();
+			temp.f[1] = m_el[0].getZ() - m_el[2].getX();
+			temp.f[2] = m_el[1].getX() - m_el[0].getY();
+			temp.f[3] = x;
+			//temp.f[3]= s * b3Scalar(0.5);
+		}
+		else
+		{
+			// i indexes the largest diagonal element; j and k follow it cyclically.
+			int i, j, k;
+			if (m_el[0].getX() < m_el[1].getY())
+			{
+				if (m_el[1].getY() < m_el[2].getZ())
+				{
+					i = 2;
+					j = 0;
+					k = 1;
+				}
+				else
+				{
+					i = 1;
+					j = 2;
+					k = 0;
+				}
+			}
+			else
+			{
+				if (m_el[0].getX() < m_el[2].getZ())
+				{
+					i = 2;
+					j = 0;
+					k = 1;
+				}
+				else
+				{
+					i = 0;
+					j = 1;
+					k = 2;
+				}
+			}
+
+			x = m_el[i][i] - m_el[j][j] - m_el[k][k] + b3Scalar(1.0);
+
+			temp.f[3] = (m_el[k][j] - m_el[j][k]);
+			temp.f[j] = (m_el[j][i] + m_el[i][j]);
+			temp.f[k] = (m_el[k][i] + m_el[i][k]);
+			temp.f[i] = x;
+			//temp.f[i] = s * b3Scalar(0.5);
+		}
+
+		// Both cases share the same final scale: every component is multiplied by 0.5 / sqrt(x).
+		s = b3Sqrt(x);
+		q.set128(temp.vec);
+		s = b3Scalar(0.5) / s;
+
+		q *= s;
+#else
+		b3Scalar trace = m_el[0].getX() + m_el[1].getY() + m_el[2].getZ();
+
+		// temp[0..2] receive x, y, z and temp[3] receives w.
+		b3Scalar temp[4];
+
+		if (trace > b3Scalar(0.0))
+		{
+			b3Scalar s = b3Sqrt(trace + b3Scalar(1.0));
+			temp[3] = (s * b3Scalar(0.5));
+			s = b3Scalar(0.5) / s;
+
+			temp[0] = ((m_el[2].getY() - m_el[1].getZ()) * s);
+			temp[1] = ((m_el[0].getZ() - m_el[2].getX()) * s);
+			temp[2] = ((m_el[1].getX() - m_el[0].getY()) * s);
+		}
+		else
+		{
+			// i indexes the largest diagonal element; j and k follow it cyclically.
+			int i = m_el[0].getX() < m_el[1].getY() ? (m_el[1].getY() < m_el[2].getZ() ? 2 : 1) : (m_el[0].getX() < m_el[2].getZ() ? 2 : 0);
+			int j = (i + 1) % 3;
+			int k = (i + 2) % 3;
+
+			b3Scalar s = b3Sqrt(m_el[i][i] - m_el[j][j] - m_el[k][k] + b3Scalar(1.0));
+			temp[i] = s * b3Scalar(0.5);
+			s = b3Scalar(0.5) / s;
+
+			temp[3] = (m_el[k][j] - m_el[j][k]) * s;
+			temp[j] = (m_el[j][i] + m_el[i][j]) * s;
+			temp[k] = (m_el[k][i] + m_el[i][k]) * s;
+		}
+		q.setValue(temp[0], temp[1], temp[2], temp[3]);
+#endif
+	}
+
+	/**@brief Extract yaw, pitch and roll from the matrix, roundtrip with setEulerYPR
+	* @param yaw Receives atan2 of the first-column entries of rows 1 and 0
+	* @param pitch Receives asin of the negated first-column entry of row 2
+	* @param roll Receives atan2 of the Y and Z entries of row 2 */
+	void getEulerYPR(b3Scalar & yaw, b3Scalar & pitch, b3Scalar & roll) const
+	{
+		// Regular extraction first.
+		yaw = b3Scalar(b3Atan2(m_el[1].getX(), m_el[0].getX()));
+		pitch = b3Scalar(b3Asin(-m_el[2].getX()));
+		roll = b3Scalar(b3Atan2(m_el[2].getY(), m_el[2].getZ()));
+
+		// Nothing more to do unless pitch sits exactly on +/-HalfPI.
+		if (b3Fabs(pitch) != B3_HALF_PI)
+		{
+			return;
+		}
+
+		// On the singularity both remaining angles are moved by PI towards zero.
+		yaw += (yaw > 0) ? -B3_PI : B3_PI;
+		roll += (roll > 0) ? -B3_PI : B3_PI;
+	}
+
+	/**@brief Get the matrix represented as euler angles around ZYX, roundtrip with setEulerZYX
+	* @param yaw Yaw around Z axis
+	* @param pitch Pitch around Y axis
+	* @param roll Roll around X axis
+	* @param solution_number Which of the two possible solutions to return: 1 selects the first, any other value the second
+	*
+	* With the matrix layout produced by setEulerZYX, element [2][0] equals
+	* -sin(pitch). When its magnitude reaches 1 the decomposition is singular
+	* (gimbal lock): yaw is then fixed to 0 and the whole remaining rotation is
+	* reported as roll, so both solutions are identical. */
+	void getEulerZYX(b3Scalar & yaw, b3Scalar & pitch, b3Scalar & roll, unsigned int solution_number = 1) const
+	{
+		struct Euler
+		{
+			b3Scalar yaw;
+			b3Scalar pitch;
+			b3Scalar roll;
+		};
+
+		Euler euler_out;
+		Euler euler_out2;  //second solution
+
+		// Check that pitch is not at a singularity
+		if (b3Fabs(m_el[2].getX()) >= 1)
+		{
+			euler_out.yaw = 0;
+
+			// At the singularity cos(pitch) == 0, so rows 0 and 1 have a zero first
+			// column and the only usable information is in [0][1] and [0][2]:
+			//   pitch = +pi/2:  [0][1] =  sin(roll - yaw), [0][2] =  cos(roll - yaw)
+			//   pitch = -pi/2:  [0][1] = -sin(roll + yaw), [0][2] = -cos(roll + yaw)
+			if (m_el[2].getX() < 0)  // -sin(pitch) == -1  ->  pitch = +pi/2
+			{
+				euler_out.pitch = B3_PI / b3Scalar(2.0);
+				euler_out.roll = b3Atan2(m_el[0].getY(), m_el[0].getZ());
+			}
+			else  // -sin(pitch) == +1  ->  pitch = -pi/2
+			{
+				euler_out.pitch = -B3_PI / b3Scalar(2.0);
+				euler_out.roll = b3Atan2(-m_el[0].getY(), -m_el[0].getZ());
+			}
+
+			// Only one solution exists here; report it for either solution_number.
+			euler_out2 = euler_out;
+		}
+		else
+		{
+			euler_out.pitch = -b3Asin(m_el[2].getX());
+			euler_out2.pitch = B3_PI - euler_out.pitch;
+
+			// Dividing by cos(pitch) restores the sign of the atan2 arguments,
+			// which differs between the two solutions.
+			const b3Scalar cosPitch = b3Cos(euler_out.pitch);
+			const b3Scalar cosPitch2 = b3Cos(euler_out2.pitch);
+
+			euler_out.roll = b3Atan2(m_el[2].getY() / cosPitch,
+									 m_el[2].getZ() / cosPitch);
+			euler_out2.roll = b3Atan2(m_el[2].getY() / cosPitch2,
+									  m_el[2].getZ() / cosPitch2);
+
+			euler_out.yaw = b3Atan2(m_el[1].getX() / cosPitch,
+									m_el[0].getX() / cosPitch);
+			euler_out2.yaw = b3Atan2(m_el[1].getX() / cosPitch2,
+									 m_el[0].getX() / cosPitch2);
+		}
+
+		const Euler& chosen = (solution_number == 1) ? euler_out : euler_out2;
+		yaw = chosen.yaw;
+		pitch = chosen.pitch;
+		roll = chosen.roll;
+	}
+
+	/**@brief Return a copy of this matrix with its columns scaled
+	* @param s Scaling vector; column c of the result is column c of this matrix times component c of s */
+	b3Matrix3x3 scaled(const b3Vector3& s) const
+	{
+#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)) || defined(B3_USE_NEON)
+		// Component-wise product of every row with s.
+		return b3Matrix3x3(m_el[0] * s, m_el[1] * s, m_el[2] * s);
+#else
+		const b3Scalar sx = s.getX();
+		const b3Scalar sy = s.getY();
+		const b3Scalar sz = s.getZ();
+		return b3Matrix3x3(
+			m_el[0].getX() * sx, m_el[0].getY() * sy, m_el[0].getZ() * sz,
+			m_el[1].getX() * sx, m_el[1].getY() * sy, m_el[1].getZ() * sz,
+			m_el[2].getX() * sx, m_el[2].getY() * sy, m_el[2].getZ() * sz);
+#endif
+	}
+
+	/**@brief Return the determinant of the matrix */
+	b3Scalar determinant() const;
+	/**@brief Return the adjoint of the matrix */
+	b3Matrix3x3 adjoint() const;
+	/**@brief Return the matrix with all values non negative */
+	b3Matrix3x3 absolute() const;
+	/**@brief Return the transpose of the matrix */
+	b3Matrix3x3 transpose() const;
+	/**@brief Return the inverse of the matrix */
+	b3Matrix3x3 inverse() const;
+
+	/**@brief Return the product transpose(this) * m
+	* @param m The right-hand matrix */
+	b3Matrix3x3 transposeTimes(const b3Matrix3x3& m) const;
+	/**@brief Return the product this * transpose(m)
+	* @param m The matrix whose transpose is the right-hand operand */
+	b3Matrix3x3 timesTranspose(const b3Matrix3x3& m) const;
+
+	/**@brief Dot product of the first column of this matrix with v
+	* @param v The vector to project */
+	B3_FORCE_INLINE b3Scalar tdotx(const b3Vector3& v) const
+	{
+		const b3Scalar col0 = m_el[0].getX();
+		const b3Scalar col1 = m_el[1].getX();
+		const b3Scalar col2 = m_el[2].getX();
+		return col0 * v.getX() + col1 * v.getY() + col2 * v.getZ();
+	}
+	/**@brief Dot product of the second column of this matrix with v
+	* @param v The vector to project */
+	B3_FORCE_INLINE b3Scalar tdoty(const b3Vector3& v) const
+	{
+		const b3Scalar col0 = m_el[0].getY();
+		const b3Scalar col1 = m_el[1].getY();
+		const b3Scalar col2 = m_el[2].getY();
+		return col0 * v.getX() + col1 * v.getY() + col2 * v.getZ();
+	}
+	/**@brief Dot product of the third column of this matrix with v
+	* @param v The vector to project */
+	B3_FORCE_INLINE b3Scalar tdotz(const b3Vector3& v) const
+	{
+		const b3Scalar col0 = m_el[0].getZ();
+		const b3Scalar col1 = m_el[1].getZ();
+		const b3Scalar col2 = m_el[2].getZ();
+		return col0 * v.getX() + col1 * v.getY() + col2 * v.getZ();
+	}
+
+	/**@brief Diagonalize this matrix in place with Jacobi rotations.
+	* @param rot Receives the accumulated rotation from the coordinate system in which the
+	* matrix is diagonal to the original one, i.e., old_this = rot * new_this * rot^T.
+	* @param threshold Relative tolerance: iteration winds down once the largest off-diagonal
+	* magnitude is no greater than threshold times the sum of the absolute diagonal values.
+	* @param maxSteps Upper bound on the number of Jacobi rotations that are applied.
+	*
+	* Note that this matrix is assumed to be symmetric.
+	*/
+	void diagonalize(b3Matrix3x3 & rot, b3Scalar threshold, int maxSteps)
+	{
+		rot.setIdentity();
+		for (int remaining = maxSteps; remaining > 0; remaining--)
+		{
+			// Locate the off-diagonal element [p][q] of greatest magnitude; r is the index left over.
+			int p = 0;
+			int q = 1;
+			int r = 2;
+			b3Scalar largest = b3Fabs(m_el[0][1]);
+			const b3Scalar abs02 = b3Fabs(m_el[0][2]);
+			if (abs02 > largest)
+			{
+				q = 2;
+				r = 1;
+				largest = abs02;
+			}
+			const b3Scalar abs12 = b3Fabs(m_el[1][2]);
+			if (abs12 > largest)
+			{
+				p = 1;
+				q = 2;
+				r = 0;
+				largest = abs12;
+			}
+
+			const b3Scalar tolerance = threshold * (b3Fabs(m_el[0][0]) + b3Fabs(m_el[1][1]) + b3Fabs(m_el[2][2]));
+			if (largest <= tolerance)
+			{
+				if (largest <= B3_EPSILON * tolerance)
+				{
+					return;
+				}
+				// Within tolerance: apply this one last rotation, then leave the loop.
+				remaining = 1;
+			}
+
+			// Jacobi rotation J that annihilates element [p][q].
+			const b3Scalar mpq = m_el[p][q];
+			const b3Scalar theta = (m_el[q][q] - m_el[p][p]) / (2 * mpq);
+			const b3Scalar theta2 = theta * theta;
+			b3Scalar tanPhi;
+			b3Scalar cosPhi;
+			if (theta2 * theta2 < b3Scalar(10 / B3_EPSILON))
+			{
+				const b3Scalar root = b3Sqrt(1 + theta2);
+				if (theta >= 0)
+				{
+					tanPhi = 1 / (theta + root);
+				}
+				else
+				{
+					tanPhi = 1 / (theta - root);
+				}
+				cosPhi = 1 / b3Sqrt(1 + tanPhi * tanPhi);
+			}
+			else
+			{
+				// Large theta means a nearly diagonal matrix; use the series approximation.
+				tanPhi = 1 / (theta * (2 + b3Scalar(0.5) / theta2));
+				cosPhi = 1 - b3Scalar(0.5) * tanPhi * tanPhi;
+			}
+			const b3Scalar sinPhi = cosPhi * tanPhi;
+
+			// this = J^T * this * J, written out for the symmetric case.
+			m_el[q][p] = 0;
+			m_el[p][q] = 0;
+			m_el[p][p] -= tanPhi * mpq;
+			m_el[q][q] += tanPhi * mpq;
+			const b3Scalar oldRp = m_el[r][p];
+			const b3Scalar oldRq = m_el[r][q];
+			m_el[r][p] = m_el[p][r] = cosPhi * oldRp - sinPhi * oldRq;
+			m_el[r][q] = m_el[q][r] = cosPhi * oldRq + sinPhi * oldRp;
+
+			// rot = rot * J: only columns p and q of every row change.
+			for (int i = 0; i < 3; i++)
+			{
+				b3Vector3& rotRow = rot[i];
+				const b3Scalar oldP = rotRow[p];
+				const b3Scalar oldQ = rotRow[q];
+				rotRow[p] = cosPhi * oldP - sinPhi * oldQ;
+				rotRow[q] = cosPhi * oldQ + sinPhi * oldP;
+			}
+		}
+	}
+
+	/**@brief Calculate the determinant of the 2x2 sub-matrix picked by two rows and two columns
+	* @param r1 The first row to use for calculating the cofactor
+	* @param c1 The first column to use for calculating the cofactor
+	* @param r2 The second row to use for calculating the cofactor
+	* @param c2 The second column to use for calculating the cofactor
+	* See http://en.wikipedia.org/wiki/Cofactor_(linear_algebra) for more details
+	*/
+	b3Scalar cofac(int r1, int c1, int r2, int c2) const
+	{
+		const b3Scalar mainProduct = m_el[r1][c1] * m_el[r2][c2];
+		const b3Scalar crossProduct = m_el[r1][c2] * m_el[r2][c1];
+		return mainProduct - crossProduct;
+	}
+
+	/**@brief Serialize the three rows into dataOut.m_el[0..2] */
+	void serialize(struct b3Matrix3x3Data & dataOut) const;
+
+	/**@brief Serialize the three rows into dataOut.m_el[0..2] via b3Vector3::serializeFloat */
+	void serializeFloat(struct b3Matrix3x3FloatData & dataOut) const;
+
+	/**@brief Load the three rows from dataIn.m_el[0..2] */
+	void deSerialize(const struct b3Matrix3x3Data& dataIn);
+
+	/**@brief Load the three rows from dataIn.m_el[0..2] via b3Vector3::deSerializeFloat */
+	void deSerializeFloat(const struct b3Matrix3x3FloatData& dataIn);
+
+	/**@brief Load the three rows from dataIn.m_el[0..2] via b3Vector3::deSerializeDouble */
+	void deSerializeDouble(const struct b3Matrix3x3DoubleData& dataIn);
+};
+
+/**@brief Multiply this matrix by m on the right, in place (this = this * m)
+* @param m The right-hand matrix
+* @return A reference to this matrix after the multiplication
+*
+* Every result row is a linear combination of the rows of m, weighted by the
+* x, y and z components of the corresponding row of this matrix. */
+B3_FORCE_INLINE b3Matrix3x3&
+b3Matrix3x3::operator*=(const b3Matrix3x3& m)
+{
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
+	__m128 rv00, rv01, rv02;
+	__m128 rv10, rv11, rv12;
+	__m128 rv20, rv21, rv22;
+	__m128 mv0, mv1, mv2;
+
+	// rvN2 temporarily hold the original rows; they are overwritten by their own z splat below.
+	rv02 = m_el[0].mVec128;
+	rv12 = m_el[1].mVec128;
+	rv22 = m_el[2].mVec128;
+
+	// Rows of m with the fourth lane cleared.
+	mv0 = _mm_and_ps(m[0].mVec128, b3vFFF0fMask);
+	mv1 = _mm_and_ps(m[1].mVec128, b3vFFF0fMask);
+	mv2 = _mm_and_ps(m[2].mVec128, b3vFFF0fMask);
+
+	// rv0
+	rv00 = b3_splat_ps(rv02, 0);
+	rv01 = b3_splat_ps(rv02, 1);
+	rv02 = b3_splat_ps(rv02, 2);
+
+	rv00 = _mm_mul_ps(rv00, mv0);
+	rv01 = _mm_mul_ps(rv01, mv1);
+	rv02 = _mm_mul_ps(rv02, mv2);
+
+	// rv1
+	rv10 = b3_splat_ps(rv12, 0);
+	rv11 = b3_splat_ps(rv12, 1);
+	rv12 = b3_splat_ps(rv12, 2);
+
+	rv10 = _mm_mul_ps(rv10, mv0);
+	rv11 = _mm_mul_ps(rv11, mv1);
+	rv12 = _mm_mul_ps(rv12, mv2);
+
+	// rv2
+	rv20 = b3_splat_ps(rv22, 0);
+	rv21 = b3_splat_ps(rv22, 1);
+	rv22 = b3_splat_ps(rv22, 2);
+
+	rv20 = _mm_mul_ps(rv20, mv0);
+	rv21 = _mm_mul_ps(rv21, mv1);
+	rv22 = _mm_mul_ps(rv22, mv2);
+
+	// Sum the three partial products of every row.
+	rv00 = _mm_add_ps(rv00, rv01);
+	rv10 = _mm_add_ps(rv10, rv11);
+	rv20 = _mm_add_ps(rv20, rv21);
+
+	m_el[0].mVec128 = _mm_add_ps(rv00, rv02);
+	m_el[1].mVec128 = _mm_add_ps(rv10, rv12);
+	m_el[2].mVec128 = _mm_add_ps(rv20, rv22);
+
+#elif defined(B3_USE_NEON)
+
+	float32x4_t rv0, rv1, rv2;
+	float32x4_t v0, v1, v2;
+	float32x4_t mv0, mv1, mv2;
+
+	v0 = m_el[0].mVec128;
+	v1 = m_el[1].mVec128;
+	v2 = m_el[2].mVec128;
+
+	// Rows of m with the fourth lane cleared.
+	mv0 = (float32x4_t)vandq_s32((int32x4_t)m[0].mVec128, b3vFFF0Mask);
+	mv1 = (float32x4_t)vandq_s32((int32x4_t)m[1].mVec128, b3vFFF0Mask);
+	mv2 = (float32x4_t)vandq_s32((int32x4_t)m[2].mVec128, b3vFFF0Mask);
+
+	// x component of every row of this times row 0 of m ...
+	rv0 = vmulq_lane_f32(mv0, vget_low_f32(v0), 0);
+	rv1 = vmulq_lane_f32(mv0, vget_low_f32(v1), 0);
+	rv2 = vmulq_lane_f32(mv0, vget_low_f32(v2), 0);
+
+	// ... plus the y component times row 1 of m ...
+	rv0 = vmlaq_lane_f32(rv0, mv1, vget_low_f32(v0), 1);
+	rv1 = vmlaq_lane_f32(rv1, mv1, vget_low_f32(v1), 1);
+	rv2 = vmlaq_lane_f32(rv2, mv1, vget_low_f32(v2), 1);
+
+	// ... plus the z component times row 2 of m.
+	rv0 = vmlaq_lane_f32(rv0, mv2, vget_high_f32(v0), 0);
+	rv1 = vmlaq_lane_f32(rv1, mv2, vget_high_f32(v1), 0);
+	rv2 = vmlaq_lane_f32(rv2, mv2, vget_high_f32(v2), 0);
+
+	m_el[0].mVec128 = rv0;
+	m_el[1].mVec128 = rv1;
+	m_el[2].mVec128 = rv2;
+#else
+	// Element [r][c] is the dot product of row r of this matrix with column c of m.
+	setValue(
+		m.tdotx(m_el[0]), m.tdoty(m_el[0]), m.tdotz(m_el[0]),
+		m.tdotx(m_el[1]), m.tdoty(m_el[1]), m.tdotz(m_el[1]),
+		m.tdotx(m_el[2]), m.tdoty(m_el[2]), m.tdotz(m_el[2]));
+#endif
+	return *this;
+}
+
+/**@brief Add m to this matrix element by element
+* @param m The matrix to add
+* @return A reference to this matrix after the addition */
+B3_FORCE_INLINE b3Matrix3x3&
+b3Matrix3x3::operator+=(const b3Matrix3x3& m)
+{
+#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)) || defined(B3_USE_NEON)
+	// One vector addition per row.
+	for (int row = 0; row < 3; ++row)
+	{
+		m_el[row].mVec128 = m_el[row].mVec128 + m.m_el[row].mVec128;
+	}
+#else
+	// All nine sums are formed before setValue stores them.
+	const b3Scalar xx = m_el[0][0] + m.m_el[0][0];
+	const b3Scalar xy = m_el[0][1] + m.m_el[0][1];
+	const b3Scalar xz = m_el[0][2] + m.m_el[0][2];
+	const b3Scalar yx = m_el[1][0] + m.m_el[1][0];
+	const b3Scalar yy = m_el[1][1] + m.m_el[1][1];
+	const b3Scalar yz = m_el[1][2] + m.m_el[1][2];
+	const b3Scalar zx = m_el[2][0] + m.m_el[2][0];
+	const b3Scalar zy = m_el[2][1] + m.m_el[2][1];
+	const b3Scalar zz = m_el[2][2] + m.m_el[2][2];
+	setValue(xx, xy, xz, yx, yy, yz, zx, zy, zz);
+#endif
+	return *this;
+}
+
+/**@brief Scale every element of a matrix by a scalar
+* @param m The matrix to scale
+* @param k The scale factor
+* @return A new matrix whose rows are the rows of m multiplied by k */
+B3_FORCE_INLINE b3Matrix3x3
+operator*(const b3Matrix3x3& m, const b3Scalar& k)
+{
+#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE))
+	// Broadcast k into lanes x, y, z and leave lane w at zero. 0x80 is a complete
+	// shuffle immediate (source lanes 0, 0, 0, 2; lane 2 is zero after
+	// _mm_load_ss), not a lane index. The other b3_splat_ps call sites in this
+	// file pass lane indices 0..2, so the shuffle is issued directly here
+	// instead of going through that macro.
+	const __m128 ks = _mm_load_ss((const float*)&k);
+	const __m128 vk = _mm_shuffle_ps(ks, ks, 0x80);
+	return b3Matrix3x3(
+		_mm_mul_ps(m[0].mVec128, vk),
+		_mm_mul_ps(m[1].mVec128, vk),
+		_mm_mul_ps(m[2].mVec128, vk));
+#elif defined(B3_USE_NEON)
+	return b3Matrix3x3(
+		vmulq_n_f32(m[0].mVec128, k),
+		vmulq_n_f32(m[1].mVec128, k),
+		vmulq_n_f32(m[2].mVec128, k));
+#else
+	return b3Matrix3x3(
+		m[0].getX() * k, m[0].getY() * k, m[0].getZ() * k,
+		m[1].getX() * k, m[1].getY() * k, m[1].getZ() * k,
+		m[2].getX() * k, m[2].getY() * k, m[2].getZ() * k);
+#endif
+}
+
+/**@brief Element-wise sum of two matrices
+* @param m1 The left operand
+* @param m2 The right operand
+* @return A new matrix holding m1 + m2 */
+B3_FORCE_INLINE b3Matrix3x3
+operator+(const b3Matrix3x3& m1, const b3Matrix3x3& m2)
+{
+#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)) || defined(B3_USE_NEON)
+	// One vector addition per row.
+	const b3SimdFloat4 sum0 = m1[0].mVec128 + m2[0].mVec128;
+	const b3SimdFloat4 sum1 = m1[1].mVec128 + m2[1].mVec128;
+	const b3SimdFloat4 sum2 = m1[2].mVec128 + m2[2].mVec128;
+	return b3Matrix3x3(sum0, sum1, sum2);
+#else
+	return b3Matrix3x3(
+		m1[0].getX() + m2[0].getX(), m1[0].getY() + m2[0].getY(), m1[0].getZ() + m2[0].getZ(),
+		m1[1].getX() + m2[1].getX(), m1[1].getY() + m2[1].getY(), m1[1].getZ() + m2[1].getZ(),
+		m1[2].getX() + m2[2].getX(), m1[2].getY() + m2[2].getY(), m1[2].getZ() + m2[2].getZ());
+#endif
+}
+
+/**@brief Element-wise difference of two matrices
+* @param m1 The left operand
+* @param m2 The right operand
+* @return A new matrix holding m1 - m2 */
+B3_FORCE_INLINE b3Matrix3x3
+operator-(const b3Matrix3x3& m1, const b3Matrix3x3& m2)
+{
+#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)) || defined(B3_USE_NEON)
+	// One vector subtraction per row.
+	const b3SimdFloat4 diff0 = m1[0].mVec128 - m2[0].mVec128;
+	const b3SimdFloat4 diff1 = m1[1].mVec128 - m2[1].mVec128;
+	const b3SimdFloat4 diff2 = m1[2].mVec128 - m2[2].mVec128;
+	return b3Matrix3x3(diff0, diff1, diff2);
+#else
+	return b3Matrix3x3(
+		m1[0].getX() - m2[0].getX(), m1[0].getY() - m2[0].getY(), m1[0].getZ() - m2[0].getZ(),
+		m1[1].getX() - m2[1].getX(), m1[1].getY() - m2[1].getY(), m1[1].getZ() - m2[1].getZ(),
+		m1[2].getX() - m2[2].getX(), m1[2].getY() - m2[2].getY(), m1[2].getZ() - m2[2].getZ());
+#endif
+}
+
+/**@brief Subtract m from this matrix element by element
+* @param m The matrix to subtract
+* @return A reference to this matrix after the subtraction */
+B3_FORCE_INLINE b3Matrix3x3&
+b3Matrix3x3::operator-=(const b3Matrix3x3& m)
+{
+#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)) || defined(B3_USE_NEON)
+	// One vector subtraction per row.
+	for (int row = 0; row < 3; ++row)
+	{
+		m_el[row].mVec128 = m_el[row].mVec128 - m.m_el[row].mVec128;
+	}
+#else
+	// All nine differences are formed before setValue stores them.
+	const b3Scalar xx = m_el[0][0] - m.m_el[0][0];
+	const b3Scalar xy = m_el[0][1] - m.m_el[0][1];
+	const b3Scalar xz = m_el[0][2] - m.m_el[0][2];
+	const b3Scalar yx = m_el[1][0] - m.m_el[1][0];
+	const b3Scalar yy = m_el[1][1] - m.m_el[1][1];
+	const b3Scalar yz = m_el[1][2] - m.m_el[1][2];
+	const b3Scalar zx = m_el[2][0] - m.m_el[2][0];
+	const b3Scalar zy = m_el[2][1] - m.m_el[2][1];
+	const b3Scalar zz = m_el[2][2] - m.m_el[2][2];
+	setValue(xx, xy, xz, yx, yy, yz, zx, zy, zz);
+#endif
+	return *this;
+}
+
+/**@brief Determinant, computed as the b3Triple product of the three rows */
+B3_FORCE_INLINE b3Scalar
+b3Matrix3x3::determinant() const
+{
+	const b3Matrix3x3& self = *this;
+	const b3Vector3& row0 = self[0];
+	const b3Vector3& row1 = self[1];
+	const b3Vector3& row2 = self[2];
+	return b3Triple(row0, row1, row2);
+}
+
+/**@brief Return a copy of this matrix in which every element is replaced by its absolute value */
+B3_FORCE_INLINE b3Matrix3x3
+b3Matrix3x3::absolute() const
+{
+#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE))
+	// Bitwise AND of every row with b3vAbsfMask.
+	const __m128 abs0 = _mm_and_ps(m_el[0].mVec128, b3vAbsfMask);
+	const __m128 abs1 = _mm_and_ps(m_el[1].mVec128, b3vAbsfMask);
+	const __m128 abs2 = _mm_and_ps(m_el[2].mVec128, b3vAbsfMask);
+	return b3Matrix3x3(abs0, abs1, abs2);
+#elif defined(B3_USE_NEON)
+	// Bitwise AND of every row with b3v3AbsMask.
+	const float32x4_t abs0 = (float32x4_t)vandq_s32((int32x4_t)m_el[0].mVec128, b3v3AbsMask);
+	const float32x4_t abs1 = (float32x4_t)vandq_s32((int32x4_t)m_el[1].mVec128, b3v3AbsMask);
+	const float32x4_t abs2 = (float32x4_t)vandq_s32((int32x4_t)m_el[2].mVec128, b3v3AbsMask);
+	return b3Matrix3x3(abs0, abs1, abs2);
+#else
+	const b3Vector3& row0 = m_el[0];
+	const b3Vector3& row1 = m_el[1];
+	const b3Vector3& row2 = m_el[2];
+	return b3Matrix3x3(
+		b3Fabs(row0.getX()), b3Fabs(row0.getY()), b3Fabs(row0.getZ()),
+		b3Fabs(row1.getX()), b3Fabs(row1.getY()), b3Fabs(row1.getZ()),
+		b3Fabs(row2.getX()), b3Fabs(row2.getY()), b3Fabs(row2.getZ()));
+#endif
+}
+
+/**@brief Return the transpose of this matrix
+*
+* Both SIMD paths leave a zero in the fourth lane of every result row, as the
+* lane comments below show. */
+B3_FORCE_INLINE b3Matrix3x3
+b3Matrix3x3::transpose() const
+{
+#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE))
+	__m128 v0 = m_el[0].mVec128;
+	__m128 v1 = m_el[1].mVec128;
+	__m128 v2 = m_el[2].mVec128;  //  x2 y2 z2 w2
+	__m128 vT;
+
+	v2 = _mm_and_ps(v2, b3vFFF0fMask);  //  x2 y2 z2 0
+
+	vT = _mm_unpackhi_ps(v0, v1);  //	z0 z1 * *
+	v0 = _mm_unpacklo_ps(v0, v1);  //	x0 x1 y0 y1
+
+	// v1 must be built before v0 is overwritten: both shuffles read the interleaved v0.
+	v1 = _mm_shuffle_ps(v0, v2, B3_SHUFFLE(2, 3, 1, 3));                    // y0 y1 y2 0
+	v0 = _mm_shuffle_ps(v0, v2, B3_SHUFFLE(0, 1, 0, 3));                    // x0 x1 x2 0
+	v2 = b3CastdTo128f(_mm_move_sd(b3CastfTo128d(v2), b3CastfTo128d(vT)));  // z0 z1 z2 0
+
+	return b3Matrix3x3(v0, v1, v2);
+#elif defined(B3_USE_NEON)
+	// note: zeros the w channel. We can preserve it at the cost of two more vtrn instructions.
+	static const uint32x2_t zMask = (const uint32x2_t){-1, 0};
+	float32x4x2_t top = vtrnq_f32(m_el[0].mVec128, m_el[1].mVec128);               // {x0 x1 z0 z1}, {y0 y1 w0 w1}
+	float32x2x2_t bl = vtrn_f32(vget_low_f32(m_el[2].mVec128), vdup_n_f32(0.0f));  // {x2  0 }, {y2 0}
+	float32x4_t v0 = vcombine_f32(vget_low_f32(top.val[0]), bl.val[0]);
+	float32x4_t v1 = vcombine_f32(vget_low_f32(top.val[1]), bl.val[1]);
+	// Keep z2 and clear w2 of the last row before combining it with z0 z1.
+	float32x2_t q = (float32x2_t)vand_u32((uint32x2_t)vget_high_f32(m_el[2].mVec128), zMask);
+	float32x4_t v2 = vcombine_f32(vget_high_f32(top.val[0]), q);  // z0 z1 z2  0
+	return b3Matrix3x3(v0, v1, v2);
+#else
+	// Row r of the result is column r of this matrix.
+	return b3Matrix3x3(m_el[0].getX(), m_el[1].getX(), m_el[2].getX(),
+					   m_el[0].getY(), m_el[1].getY(), m_el[2].getY(),
+					   m_el[0].getZ(), m_el[1].getZ(), m_el[2].getZ());
+#endif
+}
+
+/**@brief Return the adjoint (adjugate) of this matrix
+*
+* Entry [r][c] of the result is the signed 2x2 minor obtained by deleting
+* row c and column r, i.e. the transposed cofactor matrix. */
+B3_FORCE_INLINE b3Matrix3x3
+b3Matrix3x3::adjoint() const
+{
+	// First result row.
+	const b3Scalar a00 = cofac(1, 1, 2, 2);
+	const b3Scalar a01 = cofac(0, 2, 2, 1);
+	const b3Scalar a02 = cofac(0, 1, 1, 2);
+	// Second result row.
+	const b3Scalar a10 = cofac(1, 2, 2, 0);
+	const b3Scalar a11 = cofac(0, 0, 2, 2);
+	const b3Scalar a12 = cofac(0, 2, 1, 0);
+	// Third result row.
+	const b3Scalar a20 = cofac(1, 0, 2, 1);
+	const b3Scalar a21 = cofac(0, 1, 2, 0);
+	const b3Scalar a22 = cofac(0, 0, 1, 1);
+
+	return b3Matrix3x3(a00, a01, a02,
+					   a10, a11, a12,
+					   a20, a21, a22);
+}
+
+/**@brief Return the inverse of this matrix, computed as adjugate / determinant
+*
+* A zero determinant is checked with b3FullAssert only. */
+B3_FORCE_INLINE b3Matrix3x3
+b3Matrix3x3::inverse() const
+{
+	// Cofactors of the first row; their dot product with that row is the determinant.
+	const b3Vector3 firstRowCofactors = b3MakeVector3(cofac(1, 1, 2, 2), cofac(1, 2, 2, 0), cofac(1, 0, 2, 1));
+	const b3Scalar det = (*this)[0].dot(firstRowCofactors);
+	b3FullAssert(det != b3Scalar(0.0));
+	const b3Scalar invDet = b3Scalar(1.0) / det;
+
+	// The first column reuses the cofactors computed above.
+	return b3Matrix3x3(
+		firstRowCofactors.getX() * invDet, cofac(0, 2, 2, 1) * invDet, cofac(0, 1, 1, 2) * invDet,
+		firstRowCofactors.getY() * invDet, cofac(0, 0, 2, 2) * invDet, cofac(0, 2, 1, 0) * invDet,
+		firstRowCofactors.getZ() * invDet, cofac(0, 1, 2, 0) * invDet, cofac(0, 0, 1, 1) * invDet);
+}
+
+/**@brief Return the product transpose(this) * m without forming the transpose
+* @param m The right-hand matrix
+*
+* Result row r is the sum over k of (component r of row k of this matrix)
+* times (row k of m). */
+B3_FORCE_INLINE b3Matrix3x3
+b3Matrix3x3::transposeTimes(const b3Matrix3x3& m) const
+{
+#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE))
+	// zeros w
+	//    static const __m128i xyzMask = (const __m128i){ -1ULL, 0xffffffffULL };
+	__m128 row = m_el[0].mVec128;
+	__m128 m0 = _mm_and_ps(m.getRow(0).mVec128, b3vFFF0fMask);
+	__m128 m1 = _mm_and_ps(m.getRow(1).mVec128, b3vFFF0fMask);
+	__m128 m2 = _mm_and_ps(m.getRow(2).mVec128, b3vFFF0fMask);
+	// Shuffle immediates 0, 0x55 and 0xaa broadcast lane x, y and z of `row` respectively.
+	__m128 r0 = _mm_mul_ps(m0, _mm_shuffle_ps(row, row, 0));
+	__m128 r1 = _mm_mul_ps(m0, _mm_shuffle_ps(row, row, 0x55));
+	__m128 r2 = _mm_mul_ps(m0, _mm_shuffle_ps(row, row, 0xaa));
+	row = m_el[1].mVec128;
+	r0 = _mm_add_ps(r0, _mm_mul_ps(m1, _mm_shuffle_ps(row, row, 0)));
+	r1 = _mm_add_ps(r1, _mm_mul_ps(m1, _mm_shuffle_ps(row, row, 0x55)));
+	r2 = _mm_add_ps(r2, _mm_mul_ps(m1, _mm_shuffle_ps(row, row, 0xaa)));
+	row = m_el[2].mVec128;
+	r0 = _mm_add_ps(r0, _mm_mul_ps(m2, _mm_shuffle_ps(row, row, 0)));
+	r1 = _mm_add_ps(r1, _mm_mul_ps(m2, _mm_shuffle_ps(row, row, 0x55)));
+	r2 = _mm_add_ps(r2, _mm_mul_ps(m2, _mm_shuffle_ps(row, row, 0xaa)));
+	return b3Matrix3x3(r0, r1, r2);
+
+#elif defined B3_USE_NEON
+	// zeros w
+	static const uint32x4_t xyzMask = (const uint32x4_t){-1, -1, -1, 0};
+	float32x4_t m0 = (float32x4_t)vandq_u32((uint32x4_t)m.getRow(0).mVec128, xyzMask);
+	float32x4_t m1 = (float32x4_t)vandq_u32((uint32x4_t)m.getRow(1).mVec128, xyzMask);
+	float32x4_t m2 = (float32x4_t)vandq_u32((uint32x4_t)m.getRow(2).mVec128, xyzMask);
+	float32x4_t row = m_el[0].mVec128;
+	// Lane 0/1 of the low half select x/y of `row`; lane 0 of the high half selects z.
+	float32x4_t r0 = vmulq_lane_f32(m0, vget_low_f32(row), 0);
+	float32x4_t r1 = vmulq_lane_f32(m0, vget_low_f32(row), 1);
+	float32x4_t r2 = vmulq_lane_f32(m0, vget_high_f32(row), 0);
+	row = m_el[1].mVec128;
+	r0 = vmlaq_lane_f32(r0, m1, vget_low_f32(row), 0);
+	r1 = vmlaq_lane_f32(r1, m1, vget_low_f32(row), 1);
+	r2 = vmlaq_lane_f32(r2, m1, vget_high_f32(row), 0);
+	row = m_el[2].mVec128;
+	r0 = vmlaq_lane_f32(r0, m2, vget_low_f32(row), 0);
+	r1 = vmlaq_lane_f32(r1, m2, vget_low_f32(row), 1);
+	r2 = vmlaq_lane_f32(r2, m2, vget_high_f32(row), 0);
+	return b3Matrix3x3(r0, r1, r2);
+#else
+	// Element [r][c] is the dot product of column r of this matrix with column c of m.
+	return b3Matrix3x3(
+		m_el[0].getX() * m[0].getX() + m_el[1].getX() * m[1].getX() + m_el[2].getX() * m[2].getX(),
+		m_el[0].getX() * m[0].getY() + m_el[1].getX() * m[1].getY() + m_el[2].getX() * m[2].getY(),
+		m_el[0].getX() * m[0].getZ() + m_el[1].getX() * m[1].getZ() + m_el[2].getX() * m[2].getZ(),
+		m_el[0].getY() * m[0].getX() + m_el[1].getY() * m[1].getX() + m_el[2].getY() * m[2].getX(),
+		m_el[0].getY() * m[0].getY() + m_el[1].getY() * m[1].getY() + m_el[2].getY() * m[2].getY(),
+		m_el[0].getY() * m[0].getZ() + m_el[1].getY() * m[1].getZ() + m_el[2].getY() * m[2].getZ(),
+		m_el[0].getZ() * m[0].getX() + m_el[1].getZ() * m[1].getX() + m_el[2].getZ() * m[2].getX(),
+		m_el[0].getZ() * m[0].getY() + m_el[1].getZ() * m[1].getY() + m_el[2].getZ() * m[2].getY(),
+		m_el[0].getZ() * m[0].getZ() + m_el[1].getZ() * m[1].getZ() + m_el[2].getZ() * m[2].getZ());
+#endif
+}
+
+/**@brief Return the product this * transpose(m)
+* @param m The matrix whose transpose is the right-hand operand
+*
+* Element [r][c] of the result is the dot product of row r of this matrix
+* with row c of m. */
+B3_FORCE_INLINE b3Matrix3x3
+b3Matrix3x3::timesTranspose(const b3Matrix3x3& m) const
+{
+#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE))
+	__m128 a0 = m_el[0].mVec128;
+	__m128 a1 = m_el[1].mVec128;
+	__m128 a2 = m_el[2].mVec128;
+
+	b3Matrix3x3 mT = m.transpose();  // we rely on transpose() zeroing w channel so that we don't have to do it here
+	__m128 mx = mT[0].mVec128;
+	__m128 my = mT[1].mVec128;
+	__m128 mz = mT[2].mVec128;
+
+	// Shuffle immediates 0x00, 0x55 and 0xaa broadcast lane x, y and z of a row respectively.
+	__m128 r0 = _mm_mul_ps(mx, _mm_shuffle_ps(a0, a0, 0x00));
+	__m128 r1 = _mm_mul_ps(mx, _mm_shuffle_ps(a1, a1, 0x00));
+	__m128 r2 = _mm_mul_ps(mx, _mm_shuffle_ps(a2, a2, 0x00));
+	r0 = _mm_add_ps(r0, _mm_mul_ps(my, _mm_shuffle_ps(a0, a0, 0x55)));
+	r1 = _mm_add_ps(r1, _mm_mul_ps(my, _mm_shuffle_ps(a1, a1, 0x55)));
+	r2 = _mm_add_ps(r2, _mm_mul_ps(my, _mm_shuffle_ps(a2, a2, 0x55)));
+	r0 = _mm_add_ps(r0, _mm_mul_ps(mz, _mm_shuffle_ps(a0, a0, 0xaa)));
+	r1 = _mm_add_ps(r1, _mm_mul_ps(mz, _mm_shuffle_ps(a1, a1, 0xaa)));
+	r2 = _mm_add_ps(r2, _mm_mul_ps(mz, _mm_shuffle_ps(a2, a2, 0xaa)));
+	return b3Matrix3x3(r0, r1, r2);
+
+#elif defined B3_USE_NEON
+	float32x4_t a0 = m_el[0].mVec128;
+	float32x4_t a1 = m_el[1].mVec128;
+	float32x4_t a2 = m_el[2].mVec128;
+
+	b3Matrix3x3 mT = m.transpose();  // we rely on transpose() zeroing w channel so that we don't have to do it here
+	float32x4_t mx = mT[0].mVec128;
+	float32x4_t my = mT[1].mVec128;
+	float32x4_t mz = mT[2].mVec128;
+
+	// Lane 0/1 of the low half select x/y of a row; lane 0 of the high half selects z.
+	float32x4_t r0 = vmulq_lane_f32(mx, vget_low_f32(a0), 0);
+	float32x4_t r1 = vmulq_lane_f32(mx, vget_low_f32(a1), 0);
+	float32x4_t r2 = vmulq_lane_f32(mx, vget_low_f32(a2), 0);
+	r0 = vmlaq_lane_f32(r0, my, vget_low_f32(a0), 1);
+	r1 = vmlaq_lane_f32(r1, my, vget_low_f32(a1), 1);
+	r2 = vmlaq_lane_f32(r2, my, vget_low_f32(a2), 1);
+	r0 = vmlaq_lane_f32(r0, mz, vget_high_f32(a0), 0);
+	r1 = vmlaq_lane_f32(r1, mz, vget_high_f32(a1), 0);
+	r2 = vmlaq_lane_f32(r2, mz, vget_high_f32(a2), 0);
+	return b3Matrix3x3(r0, r1, r2);
+
+#else
+	return b3Matrix3x3(
+		m_el[0].dot(m[0]), m_el[0].dot(m[1]), m_el[0].dot(m[2]),
+		m_el[1].dot(m[0]), m_el[1].dot(m[1]), m_el[1].dot(m[2]),
+		m_el[2].dot(m[0]), m_el[2].dot(m[1]), m_el[2].dot(m[2]));
+#endif
+}
+
+/**@brief Multiply a matrix by a column vector
+* @param m The matrix
+* @param v The vector
+* @return The vector whose components are the dot products of the rows of m with v */
+B3_FORCE_INLINE b3Vector3
+operator*(const b3Matrix3x3& m, const b3Vector3& v)
+{
+	const b3Vector3& row0 = m[0];
+	const b3Vector3& row1 = m[1];
+	const b3Vector3& row2 = m[2];
+#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)) || defined(B3_USE_NEON)
+	// SIMD builds evaluate the three dot products in a single dot3 call.
+	return v.dot3(row0, row1, row2);
+#else
+	return b3MakeVector3(row0.dot(v), row1.dot(v), row2.dot(v));
+#endif
+}
+
+/**@brief Multiply a row vector by a matrix (v^T * m)
+* @param v The vector
+* @param m The matrix
+* @return The vector whose components are the dot products of v with the columns of m */
+B3_FORCE_INLINE b3Vector3
+operator*(const b3Vector3& v, const b3Matrix3x3& m)
+{
+#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE))
+
+	const __m128 vv = v.mVec128;
+
+	// Broadcast each component of v, then combine the rows of m weighted by them.
+	__m128 c0 = b3_splat_ps(vv, 0);
+	__m128 c1 = b3_splat_ps(vv, 1);
+	__m128 c2 = b3_splat_ps(vv, 2);
+
+	c0 = _mm_mul_ps(c0, _mm_and_ps(m[0].mVec128, b3vFFF0fMask));
+	c1 = _mm_mul_ps(c1, _mm_and_ps(m[1].mVec128, b3vFFF0fMask));
+	c0 = _mm_add_ps(c0, c1);
+	c2 = _mm_mul_ps(c2, _mm_and_ps(m[2].mVec128, b3vFFF0fMask));
+
+	return b3MakeVector3(_mm_add_ps(c0, c2));
+#elif defined(B3_USE_NEON)
+	const float32x4_t vv = v.mVec128;
+	const float32x2_t vlo = vget_low_f32(vv);
+	const float32x2_t vhi = vget_high_f32(vv);
+
+	float32x4_t c0, c1, c2;
+
+	// Rows of m with the fourth lane cleared.
+	c0 = (float32x4_t)vandq_s32((int32x4_t)m[0].mVec128, b3vFFF0Mask);
+	c1 = (float32x4_t)vandq_s32((int32x4_t)m[1].mVec128, b3vFFF0Mask);
+	c2 = (float32x4_t)vandq_s32((int32x4_t)m[2].mVec128, b3vFFF0Mask);
+
+	// Weight row 0, 1, 2 by the x, y, z component of v and sum them.
+	c0 = vmulq_lane_f32(c0, vlo, 0);
+	c1 = vmulq_lane_f32(c1, vlo, 1);
+	c2 = vmulq_lane_f32(c2, vhi, 0);
+	c0 = vaddq_f32(c0, c1);
+	c0 = vaddq_f32(c0, c2);
+
+	return b3MakeVector3(c0);
+#else
+	return b3MakeVector3(m.tdotx(v), m.tdoty(v), m.tdotz(v));
+#endif
+}
+
+/**@brief Matrix product m1 * m2
+* @param m1 The left operand
+* @param m2 The right operand
+* @return A new matrix; every result row is the combination of the rows of m2
+* weighted by the x, y and z components of the corresponding row of m1 */
+B3_FORCE_INLINE b3Matrix3x3
+operator*(const b3Matrix3x3& m1, const b3Matrix3x3& m2)
+{
+#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE))
+
+	__m128 m10 = m1[0].mVec128;
+	__m128 m11 = m1[1].mVec128;
+	__m128 m12 = m1[2].mVec128;
+
+	// m2v holds one row of m2 at a time, with the fourth lane cleared.
+	__m128 m2v = _mm_and_ps(m2[0].mVec128, b3vFFF0fMask);
+
+	// x component of every row of m1 times row 0 of m2.
+	__m128 c0 = b3_splat_ps(m10, 0);
+	__m128 c1 = b3_splat_ps(m11, 0);
+	__m128 c2 = b3_splat_ps(m12, 0);
+
+	c0 = _mm_mul_ps(c0, m2v);
+	c1 = _mm_mul_ps(c1, m2v);
+	c2 = _mm_mul_ps(c2, m2v);
+
+	m2v = _mm_and_ps(m2[1].mVec128, b3vFFF0fMask);
+
+	// y component of every row of m1 times row 1 of m2.
+	__m128 c0_1 = b3_splat_ps(m10, 1);
+	__m128 c1_1 = b3_splat_ps(m11, 1);
+	__m128 c2_1 = b3_splat_ps(m12, 1);
+
+	c0_1 = _mm_mul_ps(c0_1, m2v);
+	c1_1 = _mm_mul_ps(c1_1, m2v);
+	c2_1 = _mm_mul_ps(c2_1, m2v);
+
+	m2v = _mm_and_ps(m2[2].mVec128, b3vFFF0fMask);
+
+	c0 = _mm_add_ps(c0, c0_1);
+	c1 = _mm_add_ps(c1, c1_1);
+	c2 = _mm_add_ps(c2, c2_1);
+
+	// z component of every row of m1 times row 2 of m2; the m1 row registers are reused.
+	m10 = b3_splat_ps(m10, 2);
+	m11 = b3_splat_ps(m11, 2);
+	m12 = b3_splat_ps(m12, 2);
+
+	m10 = _mm_mul_ps(m10, m2v);
+	m11 = _mm_mul_ps(m11, m2v);
+	m12 = _mm_mul_ps(m12, m2v);
+
+	c0 = _mm_add_ps(c0, m10);
+	c1 = _mm_add_ps(c1, m11);
+	c2 = _mm_add_ps(c2, m12);
+
+	return b3Matrix3x3(c0, c1, c2);
+
+#elif defined(B3_USE_NEON)
+
+	float32x4_t rv0, rv1, rv2;
+	float32x4_t v0, v1, v2;
+	float32x4_t mv0, mv1, mv2;
+
+	v0 = m1[0].mVec128;
+	v1 = m1[1].mVec128;
+	v2 = m1[2].mVec128;
+
+	// Rows of m2 with the fourth lane cleared.
+	mv0 = (float32x4_t)vandq_s32((int32x4_t)m2[0].mVec128, b3vFFF0Mask);
+	mv1 = (float32x4_t)vandq_s32((int32x4_t)m2[1].mVec128, b3vFFF0Mask);
+	mv2 = (float32x4_t)vandq_s32((int32x4_t)m2[2].mVec128, b3vFFF0Mask);
+
+	// x component of every row of m1 times row 0 of m2 ...
+	rv0 = vmulq_lane_f32(mv0, vget_low_f32(v0), 0);
+	rv1 = vmulq_lane_f32(mv0, vget_low_f32(v1), 0);
+	rv2 = vmulq_lane_f32(mv0, vget_low_f32(v2), 0);
+
+	// ... plus the y component times row 1 of m2 ...
+	rv0 = vmlaq_lane_f32(rv0, mv1, vget_low_f32(v0), 1);
+	rv1 = vmlaq_lane_f32(rv1, mv1, vget_low_f32(v1), 1);
+	rv2 = vmlaq_lane_f32(rv2, mv1, vget_low_f32(v2), 1);
+
+	// ... plus the z component times row 2 of m2.
+	rv0 = vmlaq_lane_f32(rv0, mv2, vget_high_f32(v0), 0);
+	rv1 = vmlaq_lane_f32(rv1, mv2, vget_high_f32(v1), 0);
+	rv2 = vmlaq_lane_f32(rv2, mv2, vget_high_f32(v2), 0);
+
+	return b3Matrix3x3(rv0, rv1, rv2);
+
+#else
+	// Element [r][c] is the dot product of row r of m1 with column c of m2.
+	return b3Matrix3x3(
+		m2.tdotx(m1[0]), m2.tdoty(m1[0]), m2.tdotz(m1[0]),
+		m2.tdotx(m1[1]), m2.tdoty(m1[1]), m2.tdotz(m1[1]),
+		m2.tdotx(m1[2]), m2.tdoty(m1[2]), m2.tdotz(m1[2]));
+#endif
+}
+
+/*
+B3_FORCE_INLINE b3Matrix3x3 b3MultTransposeLeft(const b3Matrix3x3& m1, const b3Matrix3x3& m2) {
+return b3Matrix3x3(
+m1[0][0] * m2[0][0] + m1[1][0] * m2[1][0] + m1[2][0] * m2[2][0],
+m1[0][0] * m2[0][1] + m1[1][0] * m2[1][1] + m1[2][0] * m2[2][1],
+m1[0][0] * m2[0][2] + m1[1][0] * m2[1][2] + m1[2][0] * m2[2][2],
+m1[0][1] * m2[0][0] + m1[1][1] * m2[1][0] + m1[2][1] * m2[2][0],
+m1[0][1] * m2[0][1] + m1[1][1] * m2[1][1] + m1[2][1] * m2[2][1],
+m1[0][1] * m2[0][2] + m1[1][1] * m2[1][2] + m1[2][1] * m2[2][2],
+m1[0][2] * m2[0][0] + m1[1][2] * m2[1][0] + m1[2][2] * m2[2][0],
+m1[0][2] * m2[0][1] + m1[1][2] * m2[1][1] + m1[2][2] * m2[2][1],
+m1[0][2] * m2[0][2] + m1[1][2] * m2[1][2] + m1[2][2] * m2[2][2]);
+}
+*/
+
+/**@brief Equality operator between two matrices
+* It will test that all nine elements are equal. In the SSE build the fourth
+* (padding) lane of every row is excluded from the comparison.
+* @param m1 The left operand
+* @param m2 The right operand
+* @return true when every element of m1 compares equal to the matching element of m2 */
+B3_FORCE_INLINE bool operator==(const b3Matrix3x3& m1, const b3Matrix3x3& m2)
+{
+#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE))
+
+	__m128 c0, c1, c2;
+
+	c0 = _mm_cmpeq_ps(m1[0].mVec128, m2[0].mVec128);
+	c1 = _mm_cmpeq_ps(m1[1].mVec128, m2[1].mVec128);
+	c2 = _mm_cmpeq_ps(m1[2].mVec128, m2[2].mVec128);
+
+	// A lane stays all-ones only if it compared equal in all three rows.
+	c0 = _mm_and_ps(c0, c1);
+	c0 = _mm_and_ps(c0, c2);
+
+	// _mm_movemask_ps returns one bit per lane (x = bit 0 ... w = bit 3). Only
+	// x, y and z belong to the matrix, so the w bit is masked off before the
+	// test; otherwise equal matrices whose padding lanes also compare equal
+	// yield 0xF and would be reported as different.
+	const int equalLanes = _mm_movemask_ps(c0);
+	return (0x7 == (equalLanes & 0x7));
+#else
+	return (m1[0][0] == m2[0][0] && m1[1][0] == m2[1][0] && m1[2][0] == m2[2][0] &&
+			m1[0][1] == m2[0][1] && m1[1][1] == m2[1][1] && m1[2][1] == m2[2][1] &&
+			m1[0][2] == m2[0][2] && m1[1][2] == m2[1][2] && m1[2][2] == m2[2][2]);
+#endif
+}
+
+///for serialization: holds the three rows of a b3Matrix3x3 as b3Vector3FloatData,
+///written by b3Matrix3x3::serializeFloat and read by b3Matrix3x3::deSerializeFloat
+struct b3Matrix3x3FloatData
+{
+	b3Vector3FloatData m_el[3];  // one entry per matrix row
+};
+
+///for serialization: holds the three rows of a b3Matrix3x3 as b3Vector3DoubleData,
+///read by b3Matrix3x3::deSerializeDouble
+struct b3Matrix3x3DoubleData
+{
+	b3Vector3DoubleData m_el[3];  // one entry per matrix row
+};
+
+/// Serialize each of the three rows into the matching entry of dataOut.m_el.
+B3_FORCE_INLINE void b3Matrix3x3::serialize(struct b3Matrix3x3Data& dataOut) const
+{
+	m_el[0].serialize(dataOut.m_el[0]);
+	m_el[1].serialize(dataOut.m_el[1]);
+	m_el[2].serialize(dataOut.m_el[2]);
+}
+
+/// Serialize each of the three rows into the matching entry of dataOut.m_el via serializeFloat.
+B3_FORCE_INLINE void b3Matrix3x3::serializeFloat(struct b3Matrix3x3FloatData& dataOut) const
+{
+	m_el[0].serializeFloat(dataOut.m_el[0]);
+	m_el[1].serializeFloat(dataOut.m_el[1]);
+	m_el[2].serializeFloat(dataOut.m_el[2]);
+}
+
+/// Load each of the three rows from the matching entry of dataIn.m_el.
+B3_FORCE_INLINE void b3Matrix3x3::deSerialize(const struct b3Matrix3x3Data& dataIn)
+{
+	m_el[0].deSerialize(dataIn.m_el[0]);
+	m_el[1].deSerialize(dataIn.m_el[1]);
+	m_el[2].deSerialize(dataIn.m_el[2]);
+}
+
+/// Load each of the three rows from the matching entry of dataIn.m_el via deSerializeFloat.
+B3_FORCE_INLINE void b3Matrix3x3::deSerializeFloat(const struct b3Matrix3x3FloatData& dataIn)
+{
+	m_el[0].deSerializeFloat(dataIn.m_el[0]);
+	m_el[1].deSerializeFloat(dataIn.m_el[1]);
+	m_el[2].deSerializeFloat(dataIn.m_el[2]);
+}
+
+/// Load each of the three rows from the matching entry of dataIn.m_el via deSerializeDouble.
+B3_FORCE_INLINE void b3Matrix3x3::deSerializeDouble(const struct b3Matrix3x3DoubleData& dataIn)
+{
+	m_el[0].deSerializeDouble(dataIn.m_el[0]);
+	m_el[1].deSerializeDouble(dataIn.m_el[1]);
+	m_el[2].deSerializeDouble(dataIn.m_el[2]);
+}
+
+#endif  //B3_MATRIX3x3_H

+ 69 - 0
Dependencies/include/bullet3/Bullet3Common/b3MinMax.h

@@ -0,0 +1,69 @@
+/*
+Copyright (c) 2003-2013 Gino van den Bergen / Erwin Coumans  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef B3_GEN_MINMAX_H
+#define B3_GEN_MINMAX_H
+
+#include "b3Scalar.h"
+
+/// Return a reference to the smaller of a and b (b when neither is less than the other).
+template <class T>
+B3_FORCE_INLINE const T& b3Min(const T& a, const T& b)
+{
+	if (a < b)
+	{
+		return a;
+	}
+	return b;
+}
+
+/// Return a reference to the larger of a and b (b when a is not greater than b).
+template <class T>
+B3_FORCE_INLINE const T& b3Max(const T& a, const T& b)
+{
+	if (a > b)
+	{
+		return a;
+	}
+	return b;
+}
+
+/// Return a reference to a limited to [lb, ub]: lb when a < lb, ub when ub < a, otherwise a itself.
+template <class T>
+B3_FORCE_INLINE const T& b3Clamped(const T& a, const T& lb, const T& ub)
+{
+	// The lower bound is tested first, exactly as in the nested conditional form.
+	if (a < lb)
+	{
+		return lb;
+	}
+	if (ub < a)
+	{
+		return ub;
+	}
+	return a;
+}
+
+/// Lower a to b in place when b is less than a; otherwise leave a untouched.
+template <class T>
+B3_FORCE_INLINE void b3SetMin(T& a, const T& b)
+{
+	if (!(b < a))
+	{
+		return;
+	}
+	a = b;
+}
+
+/// Raise a to b in place when a is less than b; otherwise leave a untouched.
+template <class T>
+B3_FORCE_INLINE void b3SetMax(T& a, const T& b)
+{
+	if (!(a < b))
+	{
+		return;
+	}
+	a = b;
+}
+
+/// Limit a in place to [lb, ub]; the lower bound is checked first.
+template <class T>
+B3_FORCE_INLINE void b3Clamp(T& a, const T& lb, const T& ub)
+{
+	if (a < lb)
+	{
+		a = lb;
+		return;
+	}
+	if (ub < a)
+	{
+		a = ub;
+	}
+}
+
+#endif  //B3_GEN_MINMAX_H

+ 121 - 0
Dependencies/include/bullet3/Bullet3Common/b3PoolAllocator.h

@@ -0,0 +1,121 @@
+/*
+Copyright (c) 2003-2013 Gino van den Bergen / Erwin Coumans  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+// Guard renamed from _BT_POOL_ALLOCATOR_H: that spelling is a reserved identifier
+// (leading underscore + capital) and carries the BT prefix rather than this library's B3 prefix.
+#ifndef B3_POOL_ALLOCATOR_H
+#define B3_POOL_ALLOCATOR_H
+
+#include "b3Scalar.h"
+#include "b3AlignedAllocator.h"
+
+///The b3PoolAllocator class allows to efficiently allocate a large pool of objects, instead of dynamically allocating them separately.
+///All elements live in one buffer obtained from b3AlignedAlloc(..., 16). Free elements are
+///chained through a singly linked list whose "next" pointer is stored in the first bytes of
+///each free element, so elemSize must be at least sizeof(void*).
+///The instance owns the buffer and releases it in the destructor; it must not be copied
+///(an implicit copy would share m_pool and free it twice).
+class b3PoolAllocator
+{
+	int m_elemSize;
+	int m_maxElements;
+	int m_freeCount;
+	void* m_firstFree;
+	unsigned char* m_pool;
+
+public:
+	/// Create a pool of maxElements slots of elemSize bytes each.
+	/// A non-positive size/count, or a failed allocation, yields an empty pool
+	/// (getFreeCount() == 0, allocate() returns 0) instead of walking out of bounds.
+	b3PoolAllocator(int elemSize, int maxElements)
+		: m_elemSize(elemSize),
+		  m_maxElements(maxElements),
+		  m_freeCount(0),
+		  m_firstFree(0),
+		  m_pool(0)
+	{
+		if (m_elemSize <= 0 || m_maxElements <= 0)
+		{
+			return;
+		}
+		// Each free slot stores the pointer to the next free slot in its first bytes
+		b3Assert(m_elemSize >= static_cast<int>(sizeof(void*)));
+
+		m_pool = (unsigned char*)b3AlignedAlloc(static_cast<unsigned int>(m_elemSize * m_maxElements), 16);
+		if (!m_pool)
+		{
+			return;
+		}
+
+		// Thread every slot onto the free list, in address order
+		unsigned char* p = m_pool;
+		m_firstFree = p;
+		m_freeCount = m_maxElements;
+		int count = m_maxElements;
+		while (--count)
+		{
+			*(void**)p = (p + m_elemSize);
+			p += m_elemSize;
+		}
+		// Last slot terminates the list
+		*(void**)p = 0;
+	}
+
+	~b3PoolAllocator()
+	{
+		if (m_pool)
+		{
+			b3AlignedFree(m_pool);
+		}
+	}
+
+	/// Number of slots currently on the free list.
+	int getFreeCount() const
+	{
+		return m_freeCount;
+	}
+
+	/// Number of slots currently handed out.
+	int getUsedCount() const
+	{
+		return m_maxElements - m_freeCount;
+	}
+
+	/// Capacity requested at construction.
+	int getMaxCount() const
+	{
+		return m_maxElements;
+	}
+
+	/// Pop one slot from the free list.
+	/// @param size Requested size; only checked (in assert-enabled builds) against the element size
+	/// @return The slot, or 0 when the pool is exhausted. The exhaustion assert is kept for
+	///         debug builds; builds without asserts used to dereference a null list head here.
+	void* allocate(int size)
+	{
+		// release mode fix
+		(void)size;
+		b3Assert(!size || size <= m_elemSize);
+		b3Assert(m_freeCount > 0);
+		void* result = m_firstFree;
+		if (result)
+		{
+			m_firstFree = *(void**)result;
+			--m_freeCount;
+		}
+		return result;
+	}
+
+	/// Whether ptr is non-null and lies inside this pool's buffer.
+	bool validPtr(void* ptr)
+	{
+		if (ptr)
+		{
+			if (((unsigned char*)ptr >= m_pool && (unsigned char*)ptr < m_pool + m_maxElements * m_elemSize))
+			{
+				return true;
+			}
+		}
+		return false;
+	}
+
+	/// Push a slot back onto the free list; a null ptr is ignored.
+	/// The pointer is only range-checked by an assert.
+	void freeMemory(void* ptr)
+	{
+		if (ptr)
+		{
+			b3Assert((unsigned char*)ptr >= m_pool && (unsigned char*)ptr < m_pool + m_maxElements * m_elemSize);
+
+			*(void**)ptr = m_firstFree;
+			m_firstFree = ptr;
+			++m_freeCount;
+		}
+	}
+
+	/// Size in bytes of one slot.
+	int getElementSize() const
+	{
+		return m_elemSize;
+	}
+
+	/// Start of the pool buffer.
+	unsigned char* getPoolAddress()
+	{
+		return m_pool;
+	}
+
+	/// Start of the pool buffer (read-only).
+	const unsigned char* getPoolAddress() const
+	{
+		return m_pool;
+	}
+};
+
+#endif  //B3_POOL_ALLOCATOR_H

+ 242 - 0
Dependencies/include/bullet3/Bullet3Common/b3QuadWord.h

@@ -0,0 +1,242 @@
+/*
+Copyright (c) 2003-2013 Gino van den Bergen / Erwin Coumans  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef B3_SIMD_QUADWORD_H
+#define B3_SIMD_QUADWORD_H
+
+#include "b3Scalar.h"
+#include "b3MinMax.h"
+
+#if defined(__CELLOS_LV2) && defined(__SPU__)
+#include <altivec.h>
+#endif
+
+/**@brief The b3QuadWord class is base class for b3Vector3 and b3Quaternion. 
+ * It stores four b3Scalar values (x, y, z, w) in m_floats[0..3]; when SSE or NEON
+ * is enabled the same storage is also reachable as the SIMD register mVec128.
+ * Some issues under PS3 Linux with IBM 2.1 SDK, gcc compiler prevent from using aligned quadword.
+ */
+#ifndef USE_LIBSPE2
+B3_ATTRIBUTE_ALIGNED16(class)
+b3QuadWord
+#else
+class b3QuadWord
+#endif
+{
+protected:
+#if defined(__SPU__) && defined(__CELLOS_LV2__)
+	union {
+		vec_float4 mVec128;
+		b3Scalar m_floats[4];
+	};
+
+public:
+	vec_float4 get128() const
+	{
+		return mVec128;
+	}
+
+#else  //__CELLOS_LV2__ __SPU__
+
+#if defined(B3_USE_SSE) || defined(B3_USE_NEON)
+public:
+	union {
+		b3SimdFloat4 mVec128;
+		b3Scalar m_floats[4];
+		struct
+		{
+			b3Scalar x, y, z, w;
+		};
+	};
+
+public:
+	/**@brief Return the four values as one SIMD register */
+	B3_FORCE_INLINE b3SimdFloat4 get128() const
+	{
+		return mVec128;
+	}
+	/**@brief Overwrite all four values from one SIMD register */
+	B3_FORCE_INLINE void set128(b3SimdFloat4 v128)
+	{
+		mVec128 = v128;
+	}
+#else
+public:
+	union {
+		b3Scalar m_floats[4];
+		struct
+		{
+			b3Scalar x, y, z, w;
+		};
+	};
+#endif  // B3_USE_SSE
+
+#endif  //__CELLOS_LV2__ __SPU__
+
+public:
+#if defined(B3_USE_SSE) || defined(B3_USE_NEON)
+
+	// Set Vector
+	B3_FORCE_INLINE b3QuadWord(const b3SimdFloat4 vec)
+	{
+		mVec128 = vec;
+	}
+
+	// Copy constructor
+	B3_FORCE_INLINE b3QuadWord(const b3QuadWord& rhs)
+	{
+		mVec128 = rhs.mVec128;
+	}
+
+	// Assignment Operator
+	B3_FORCE_INLINE b3QuadWord&
+	operator=(const b3QuadWord& v)
+	{
+		mVec128 = v.mVec128;
+
+		return *this;
+	}
+
+#endif
+
+	/**@brief Return the x value */
+	B3_FORCE_INLINE const b3Scalar& getX() const { return m_floats[0]; }
+	/**@brief Return the y value */
+	B3_FORCE_INLINE const b3Scalar& getY() const { return m_floats[1]; }
+	/**@brief Return the z value */
+	B3_FORCE_INLINE const b3Scalar& getZ() const { return m_floats[2]; }
+	/**@brief Return the w value */
+	B3_FORCE_INLINE const b3Scalar& getW() const { return m_floats[3]; }
+	/**@brief Set the x value */
+	B3_FORCE_INLINE void setX(b3Scalar _x) { m_floats[0] = _x; }
+	/**@brief Set the y value */
+	B3_FORCE_INLINE void setY(b3Scalar _y) { m_floats[1] = _y; }
+	/**@brief Set the z value */
+	B3_FORCE_INLINE void setZ(b3Scalar _z) { m_floats[2] = _z; }
+	/**@brief Set the w value */
+	B3_FORCE_INLINE void setW(b3Scalar _w) { m_floats[3] = _w; }
+
+	//B3_FORCE_INLINE b3Scalar&       operator[](int i)       { return (&m_floats[0])[i];	}
+	//B3_FORCE_INLINE const b3Scalar& operator[](int i) const { return (&m_floats[0])[i]; }
+	///operator b3Scalar*() replaces operator[], using implicit conversion. We added operator != and operator == to avoid pointer comparisons.
+	B3_FORCE_INLINE operator b3Scalar*() { return &m_floats[0]; }
+	B3_FORCE_INLINE operator const b3Scalar*() const { return &m_floats[0]; }
+
+	/**@brief Exact comparison of all four values (x, y, z and w) */
+	B3_FORCE_INLINE bool operator==(const b3QuadWord& other) const
+	{
+#ifdef B3_USE_SSE
+		// All four lanes must compare equal, i.e. the 4-bit mask must be 0xf
+		return (0xf == _mm_movemask_ps((__m128)_mm_cmpeq_ps(mVec128, other.mVec128)));
+#else
+		return ((m_floats[3] == other.m_floats[3]) &&
+				(m_floats[2] == other.m_floats[2]) &&
+				(m_floats[1] == other.m_floats[1]) &&
+				(m_floats[0] == other.m_floats[0]));
+#endif
+	}
+
+	/**@brief Negation of operator== */
+	B3_FORCE_INLINE bool operator!=(const b3QuadWord& other) const
+	{
+		return !(*this == other);
+	}
+
+	/**@brief Set x,y,z and zero w 
+   * @param _x Value of x
+   * @param _y Value of y
+   * @param _z Value of z
+   */
+	B3_FORCE_INLINE void setValue(const b3Scalar& _x, const b3Scalar& _y, const b3Scalar& _z)
+	{
+		m_floats[0] = _x;
+		m_floats[1] = _y;
+		m_floats[2] = _z;
+		m_floats[3] = 0.f;
+	}
+
+	/*		void getValue(b3Scalar *m) const 
+		{
+			m[0] = m_floats[0];
+			m[1] = m_floats[1];
+			m[2] = m_floats[2];
+		}
+*/
+	/**@brief Set the values 
+   * @param _x Value of x
+   * @param _y Value of y
+   * @param _z Value of z
+   * @param _w Value of w
+   */
+	B3_FORCE_INLINE void setValue(const b3Scalar& _x, const b3Scalar& _y, const b3Scalar& _z, const b3Scalar& _w)
+	{
+		m_floats[0] = _x;
+		m_floats[1] = _y;
+		m_floats[2] = _z;
+		m_floats[3] = _w;
+	}
+	/**@brief No initialization constructor; the four values are left uninitialized */
+	B3_FORCE_INLINE b3QuadWord()
+	//	:m_floats[0](b3Scalar(0.)),m_floats[1](b3Scalar(0.)),m_floats[2](b3Scalar(0.)),m_floats[3](b3Scalar(0.))
+	{
+	}
+
+	/**@brief Three argument constructor (zeros w)
+   * @param _x Value of x
+   * @param _y Value of y
+   * @param _z Value of z
+   */
+	B3_FORCE_INLINE b3QuadWord(const b3Scalar& _x, const b3Scalar& _y, const b3Scalar& _z)
+	{
+		m_floats[0] = _x, m_floats[1] = _y, m_floats[2] = _z, m_floats[3] = 0.0f;
+	}
+
+	/**@brief Initializing constructor
+   * @param _x Value of x
+   * @param _y Value of y
+   * @param _z Value of z
+   * @param _w Value of w
+   */
+	B3_FORCE_INLINE b3QuadWord(const b3Scalar& _x, const b3Scalar& _y, const b3Scalar& _z, const b3Scalar& _w)
+	{
+		m_floats[0] = _x, m_floats[1] = _y, m_floats[2] = _z, m_floats[3] = _w;
+	}
+
+	/**@brief Set each element to the max of the current values and the values of another b3QuadWord
+   * @param other The other b3QuadWord to compare with 
+   */
+	B3_FORCE_INLINE void setMax(const b3QuadWord& other)
+	{
+#ifdef B3_USE_SSE
+		mVec128 = _mm_max_ps(mVec128, other.mVec128);
+#elif defined(B3_USE_NEON)
+		mVec128 = vmaxq_f32(mVec128, other.mVec128);
+#else
+		b3SetMax(m_floats[0], other.m_floats[0]);
+		b3SetMax(m_floats[1], other.m_floats[1]);
+		b3SetMax(m_floats[2], other.m_floats[2]);
+		b3SetMax(m_floats[3], other.m_floats[3]);
+#endif
+	}
+	/**@brief Set each element to the min of the current values and the values of another b3QuadWord
+   * @param other The other b3QuadWord to compare with 
+   */
+	B3_FORCE_INLINE void setMin(const b3QuadWord& other)
+	{
+#ifdef B3_USE_SSE
+		mVec128 = _mm_min_ps(mVec128, other.mVec128);
+#elif defined(B3_USE_NEON)
+		mVec128 = vminq_f32(mVec128, other.mVec128);
+#else
+		b3SetMin(m_floats[0], other.m_floats[0]);
+		b3SetMin(m_floats[1], other.m_floats[1]);
+		b3SetMin(m_floats[2], other.m_floats[2]);
+		b3SetMin(m_floats[3], other.m_floats[3]);
+#endif
+	}
+};
+
+#endif  //B3_SIMD_QUADWORD_H

+ 908 - 0
Dependencies/include/bullet3/Bullet3Common/b3Quaternion.h

@@ -0,0 +1,908 @@
+/*
+Copyright (c) 2003-2013 Gino van den Bergen / Erwin Coumans  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef B3_SIMD__QUATERNION_H_
+#define B3_SIMD__QUATERNION_H_
+
+#include "b3Vector3.h"
+#include "b3QuadWord.h"
+
+#ifdef B3_USE_SSE
+
+// All four lanes set to 1.0f; b3Quaternion::normalize() divides it by the length to get the reciprocal.
+const __m128 B3_ATTRIBUTE_ALIGNED16(b3vOnes) = {1.0f, 1.0f, 1.0f, 1.0f};
+
+#endif
+
+#if defined(B3_USE_SSE) || defined(B3_USE_NEON)
+
+// Masks meant to be XOR-ed onto a quaternion register: a -0.0f lane flips the sign of that
+// component, a +0.0f lane leaves it unchanged.
+// b3vQInv negates x, y, z (used by b3Quaternion::inverse()).
+const b3SimdFloat4 B3_ATTRIBUTE_ALIGNED16(b3vQInv) = {-0.0f, -0.0f, -0.0f, +0.0f};
+// b3vPPPM negates only w (used by the SIMD quaternion products).
+const b3SimdFloat4 B3_ATTRIBUTE_ALIGNED16(b3vPPPM) = {+0.0f, +0.0f, +0.0f, -0.0f};
+
+#endif
+
+/**@brief The b3Quaternion implements quaternion to perform linear algebra rotations in combination with b3Matrix3x3, b3Vector3 and b3Transform.
+ * Components are kept in the inherited m_floats array as (x, y, z, w), with w in m_floats[3].
+ * Most operations have three variants selected at compile time: SSE, NEON and plain scalar code. */
+class b3Quaternion : public b3QuadWord
+{
+public:
+	/**@brief No initialization constructor */
+	b3Quaternion() {}
+
+#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)) || defined(B3_USE_NEON)
+	// Set Vector
+	B3_FORCE_INLINE b3Quaternion(const b3SimdFloat4 vec)
+	{
+		mVec128 = vec;
+	}
+
+	// Copy constructor
+	B3_FORCE_INLINE b3Quaternion(const b3Quaternion& rhs)
+	{
+		mVec128 = rhs.mVec128;
+	}
+
+	// Assignment Operator
+	B3_FORCE_INLINE b3Quaternion&
+	operator=(const b3Quaternion& v)
+	{
+		mVec128 = v.mVec128;
+
+		return *this;
+	}
+
+#endif
+
+	//		template <typename b3Scalar>
+	//		explicit Quaternion(const b3Scalar *v) : Tuple4<b3Scalar>(v) {}
+	/**@brief Constructor from scalars
+   * @param _x Value of x
+   * @param _y Value of y
+   * @param _z Value of z
+   * @param _w Value of w */
+	b3Quaternion(const b3Scalar& _x, const b3Scalar& _y, const b3Scalar& _z, const b3Scalar& _w)
+		: b3QuadWord(_x, _y, _z, _w)
+	{
+		//b3Assert(!((_x==1.f) && (_y==0.f) && (_z==0.f) && (_w==0.f)));
+	}
+	/**@brief Axis angle Constructor
+   * @param _axis The axis which the rotation is around
+   * @param _angle The magnitude of the rotation around the axis (Radians) */
+	b3Quaternion(const b3Vector3& _axis, const b3Scalar& _angle)
+	{
+		setRotation(_axis, _angle);
+	}
+	/**@brief Constructor from Euler angles
+   * @param yaw Angle around Y unless B3_EULER_DEFAULT_ZYX defined then Z
+   * @param pitch Angle around X unless B3_EULER_DEFAULT_ZYX defined then Y
+   * @param roll Angle around Z unless B3_EULER_DEFAULT_ZYX defined then X */
+	b3Quaternion(const b3Scalar& yaw, const b3Scalar& pitch, const b3Scalar& roll)
+	{
+#ifndef B3_EULER_DEFAULT_ZYX
+		setEuler(yaw, pitch, roll);
+#else
+		setEulerZYX(yaw, pitch, roll);
+#endif
+	}
+	/**@brief Set the rotation using axis angle notation 
+   * @param axis1 The axis around which to rotate; a copy is passed through safeNormalize() first
+   * @param _angle The magnitude of the rotation in Radians
+   * If the length of that normalized copy is below B3_EPSILON the identity (0, 0, 0, 1) is set instead. */
+	void setRotation(const b3Vector3& axis1, const b3Scalar& _angle)
+	{
+		b3Vector3 axis = axis1;
+		axis.safeNormalize();
+		
+		b3Scalar d = axis.length();
+		b3Assert(d != b3Scalar(0.0));
+		if (d < B3_EPSILON)
+		{
+			setValue(0, 0, 0, 1);
+		}
+		else
+		{
+			// (x, y, z) = axis * sin(angle / 2) / |axis|, w = cos(angle / 2)
+			b3Scalar s = b3Sin(_angle * b3Scalar(0.5)) / d;
+			setValue(axis.getX() * s, axis.getY() * s, axis.getZ() * s,
+				b3Cos(_angle * b3Scalar(0.5)));
+		}
+	}
+	/**@brief Set the quaternion using Euler angles
+   * @param yaw Angle around Y
+   * @param pitch Angle around X
+   * @param roll Angle around Z */
+	void setEuler(const b3Scalar& yaw, const b3Scalar& pitch, const b3Scalar& roll)
+	{
+		b3Scalar halfYaw = b3Scalar(yaw) * b3Scalar(0.5);
+		b3Scalar halfPitch = b3Scalar(pitch) * b3Scalar(0.5);
+		b3Scalar halfRoll = b3Scalar(roll) * b3Scalar(0.5);
+		b3Scalar cosYaw = b3Cos(halfYaw);
+		b3Scalar sinYaw = b3Sin(halfYaw);
+		b3Scalar cosPitch = b3Cos(halfPitch);
+		b3Scalar sinPitch = b3Sin(halfPitch);
+		b3Scalar cosRoll = b3Cos(halfRoll);
+		b3Scalar sinRoll = b3Sin(halfRoll);
+		setValue(cosRoll * sinPitch * cosYaw + sinRoll * cosPitch * sinYaw,
+				 cosRoll * cosPitch * sinYaw - sinRoll * sinPitch * cosYaw,
+				 sinRoll * cosPitch * cosYaw - cosRoll * sinPitch * sinYaw,
+				 cosRoll * cosPitch * cosYaw + sinRoll * sinPitch * sinYaw);
+	}
+
+	/**@brief Set the quaternion using euler angles; the result is normalized afterwards
+   * @param yawZ Angle around Z
+   * @param pitchY Angle around Y
+   * @param rollX Angle around X */
+	void setEulerZYX(const b3Scalar& yawZ, const b3Scalar& pitchY, const b3Scalar& rollX)
+	{
+		b3Scalar halfYaw = b3Scalar(yawZ) * b3Scalar(0.5);
+		b3Scalar halfPitch = b3Scalar(pitchY) * b3Scalar(0.5);
+		b3Scalar halfRoll = b3Scalar(rollX) * b3Scalar(0.5);
+		b3Scalar cosYaw = b3Cos(halfYaw);
+		b3Scalar sinYaw = b3Sin(halfYaw);
+		b3Scalar cosPitch = b3Cos(halfPitch);
+		b3Scalar sinPitch = b3Sin(halfPitch);
+		b3Scalar cosRoll = b3Cos(halfRoll);
+		b3Scalar sinRoll = b3Sin(halfRoll);
+		setValue(sinRoll * cosPitch * cosYaw - cosRoll * sinPitch * sinYaw,   //x
+				 cosRoll * sinPitch * cosYaw + sinRoll * cosPitch * sinYaw,   //y
+				 cosRoll * cosPitch * sinYaw - sinRoll * sinPitch * cosYaw,   //z
+				 cosRoll * cosPitch * cosYaw + sinRoll * sinPitch * sinYaw);  //formerly yzx
+		normalize();
+	}
+
+	/**@brief Get the euler angles from this quaternion
+	   * @param yawZ Receives the angle around Z
+	   * @param pitchY Receives the angle around Y; set to -PI/2 or +PI/2 when the asin argument reaches -1 or 1
+	   * @param rollX Receives the angle around X */
+	void getEulerZYX(b3Scalar& yawZ, b3Scalar& pitchY, b3Scalar& rollX) const
+	{
+		b3Scalar squ;
+		b3Scalar sqx;
+		b3Scalar sqy;
+		b3Scalar sqz;
+		b3Scalar sarg;
+		sqx = m_floats[0] * m_floats[0];
+		sqy = m_floats[1] * m_floats[1];
+		sqz = m_floats[2] * m_floats[2];
+		squ = m_floats[3] * m_floats[3];
+		rollX = b3Atan2(2 * (m_floats[1] * m_floats[2] + m_floats[3] * m_floats[0]), squ - sqx - sqy + sqz);
+		sarg = b3Scalar(-2.) * (m_floats[0] * m_floats[2] - m_floats[3] * m_floats[1]);
+		pitchY = sarg <= b3Scalar(-1.0) ? b3Scalar(-0.5) * B3_PI : (sarg >= b3Scalar(1.0) ? b3Scalar(0.5) * B3_PI : b3Asin(sarg));
+		yawZ = b3Atan2(2 * (m_floats[0] * m_floats[1] + m_floats[3] * m_floats[2]), squ + sqx - sqy - sqz);
+	}
+
+	/**@brief Add two quaternions
+   * @param q The quaternion to add to this one */
+	B3_FORCE_INLINE b3Quaternion& operator+=(const b3Quaternion& q)
+	{
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
+		mVec128 = _mm_add_ps(mVec128, q.mVec128);
+#elif defined(B3_USE_NEON)
+		mVec128 = vaddq_f32(mVec128, q.mVec128);
+#else
+		m_floats[0] += q.getX();
+		m_floats[1] += q.getY();
+		m_floats[2] += q.getZ();
+		m_floats[3] += q.m_floats[3];
+#endif
+		return *this;
+	}
+
+	/**@brief Subtract out a quaternion
+   * @param q The quaternion to subtract from this one */
+	b3Quaternion& operator-=(const b3Quaternion& q)
+	{
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
+		mVec128 = _mm_sub_ps(mVec128, q.mVec128);
+#elif defined(B3_USE_NEON)
+		mVec128 = vsubq_f32(mVec128, q.mVec128);
+#else
+		m_floats[0] -= q.getX();
+		m_floats[1] -= q.getY();
+		m_floats[2] -= q.getZ();
+		m_floats[3] -= q.m_floats[3];
+#endif
+		return *this;
+	}
+
+	/**@brief Scale this quaternion
+   * @param s The scalar to scale by */
+	b3Quaternion& operator*=(const b3Scalar& s)
+	{
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
+		__m128 vs = _mm_load_ss(&s);  //	(S 0 0 0)
+		vs = b3_pshufd_ps(vs, 0);     //	(S S S S)
+		mVec128 = _mm_mul_ps(mVec128, vs);
+#elif defined(B3_USE_NEON)
+		mVec128 = vmulq_n_f32(mVec128, s);
+#else
+		m_floats[0] *= s;
+		m_floats[1] *= s;
+		m_floats[2] *= s;
+		m_floats[3] *= s;
+#endif
+		return *this;
+	}
+
+	/**@brief Multiply this quaternion by q on the right
+   * @param q The other quaternion 
+   * Equivalent to this = this * q */
+	b3Quaternion& operator*=(const b3Quaternion& q)
+	{
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
+		__m128 vQ2 = q.get128();
+
+		__m128 A1 = b3_pshufd_ps(mVec128, B3_SHUFFLE(0, 1, 2, 0));
+		__m128 B1 = b3_pshufd_ps(vQ2, B3_SHUFFLE(3, 3, 3, 0));
+
+		A1 = A1 * B1;
+
+		__m128 A2 = b3_pshufd_ps(mVec128, B3_SHUFFLE(1, 2, 0, 1));
+		__m128 B2 = b3_pshufd_ps(vQ2, B3_SHUFFLE(2, 0, 1, 1));
+
+		A2 = A2 * B2;
+
+		B1 = b3_pshufd_ps(mVec128, B3_SHUFFLE(2, 0, 1, 2));
+		B2 = b3_pshufd_ps(vQ2, B3_SHUFFLE(1, 2, 0, 2));
+
+		B1 = B1 * B2;  //	A3 *= B3
+
+		mVec128 = b3_splat_ps(mVec128, 3);  //	A0
+		mVec128 = mVec128 * vQ2;            //	A0 * B0
+
+		A1 = A1 + A2;                  //	AB12
+		mVec128 = mVec128 - B1;        //	AB03 = AB0 - AB3
+		A1 = _mm_xor_ps(A1, b3vPPPM);  //	change sign of the last element
+		mVec128 = mVec128 + A1;        //	AB03 + AB12
+
+#elif defined(B3_USE_NEON)
+
+		float32x4_t vQ1 = mVec128;
+		float32x4_t vQ2 = q.get128();
+		float32x4_t A0, A1, B1, A2, B2, A3, B3;
+		float32x2_t vQ1zx, vQ2wx, vQ1yz, vQ2zx, vQ2yz, vQ2xz;
+
+		{
+			float32x2x2_t tmp;
+			tmp = vtrn_f32(vget_high_f32(vQ1), vget_low_f32(vQ1));  // {z x}, {w y}
+			vQ1zx = tmp.val[0];
+
+			tmp = vtrn_f32(vget_high_f32(vQ2), vget_low_f32(vQ2));  // {z x}, {w y}
+			vQ2zx = tmp.val[0];
+		}
+		vQ2wx = vext_f32(vget_high_f32(vQ2), vget_low_f32(vQ2), 1);
+
+		vQ1yz = vext_f32(vget_low_f32(vQ1), vget_high_f32(vQ1), 1);
+
+		vQ2yz = vext_f32(vget_low_f32(vQ2), vget_high_f32(vQ2), 1);
+		vQ2xz = vext_f32(vQ2zx, vQ2zx, 1);
+
+		A1 = vcombine_f32(vget_low_f32(vQ1), vQ1zx);                     // X Y  z x
+		B1 = vcombine_f32(vdup_lane_f32(vget_high_f32(vQ2), 1), vQ2wx);  // W W  W X
+
+		A2 = vcombine_f32(vQ1yz, vget_low_f32(vQ1));
+		B2 = vcombine_f32(vQ2zx, vdup_lane_f32(vget_low_f32(vQ2), 1));
+
+		A3 = vcombine_f32(vQ1zx, vQ1yz);  // Z X Y Z
+		B3 = vcombine_f32(vQ2yz, vQ2xz);  // Y Z x z
+
+		A1 = vmulq_f32(A1, B1);
+		A2 = vmulq_f32(A2, B2);
+		A3 = vmulq_f32(A3, B3);                           //	A3 *= B3
+		A0 = vmulq_lane_f32(vQ2, vget_high_f32(vQ1), 1);  //	A0 * B0
+
+		A1 = vaddq_f32(A1, A2);  //	AB12 = AB1 + AB2
+		A0 = vsubq_f32(A0, A3);  //	AB03 = AB0 - AB3
+
+		//	change the sign of the last element
+		A1 = (b3SimdFloat4)veorq_s32((int32x4_t)A1, (int32x4_t)b3vPPPM);
+		A0 = vaddq_f32(A0, A1);  //	AB03 + AB12
+
+		mVec128 = A0;
+#else
+		// Scalar form of the product; setValue receives all four results before storing any of them
+		setValue(
+			m_floats[3] * q.getX() + m_floats[0] * q.m_floats[3] + m_floats[1] * q.getZ() - m_floats[2] * q.getY(),
+			m_floats[3] * q.getY() + m_floats[1] * q.m_floats[3] + m_floats[2] * q.getX() - m_floats[0] * q.getZ(),
+			m_floats[3] * q.getZ() + m_floats[2] * q.m_floats[3] + m_floats[0] * q.getY() - m_floats[1] * q.getX(),
+			m_floats[3] * q.m_floats[3] - m_floats[0] * q.getX() - m_floats[1] * q.getY() - m_floats[2] * q.getZ());
+#endif
+		return *this;
+	}
+	/**@brief Return the dot product between this quaternion and another
+   * @param q The other quaternion
+   * All four components (x, y, z, w) take part in the sum. */
+	b3Scalar dot(const b3Quaternion& q) const
+	{
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
+		__m128 vd;
+
+		vd = _mm_mul_ps(mVec128, q.mVec128);
+
+		// Horizontal add of the four products into lane 0
+		__m128 t = _mm_movehl_ps(vd, vd);
+		vd = _mm_add_ps(vd, t);
+		t = _mm_shuffle_ps(vd, vd, 0x55);
+		vd = _mm_add_ss(vd, t);
+
+		return _mm_cvtss_f32(vd);
+#elif defined(B3_USE_NEON)
+		float32x4_t vd = vmulq_f32(mVec128, q.mVec128);
+		float32x2_t x = vpadd_f32(vget_low_f32(vd), vget_high_f32(vd));
+		x = vpadd_f32(x, x);
+		return vget_lane_f32(x, 0);
+#else
+		return m_floats[0] * q.getX() +
+			   m_floats[1] * q.getY() +
+			   m_floats[2] * q.getZ() +
+			   m_floats[3] * q.m_floats[3];
+#endif
+	}
+
+	/**@brief Return the length squared of the quaternion */
+	b3Scalar length2() const
+	{
+		return dot(*this);
+	}
+
+	/**@brief Return the length of the quaternion */
+	b3Scalar length() const
+	{
+		return b3Sqrt(length2());
+	}
+
+	/**@brief Normalize the quaternion 
+   * Such that x^2 + y^2 + z^2 +w^2 = 1
+   * There is no zero-length guard in the SSE branch; the other branch goes through operator/=, which asserts on a zero divisor. */
+	b3Quaternion& normalize()
+	{
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
+		__m128 vd;
+
+		vd = _mm_mul_ps(mVec128, mVec128);
+
+		// Horizontal add: lane 0 ends up holding the squared length
+		__m128 t = _mm_movehl_ps(vd, vd);
+		vd = _mm_add_ps(vd, t);
+		t = _mm_shuffle_ps(vd, vd, 0x55);
+		vd = _mm_add_ss(vd, t);
+
+		// 1 / length in lane 0, then broadcast and scale all components
+		vd = _mm_sqrt_ss(vd);
+		vd = _mm_div_ss(b3vOnes, vd);
+		vd = b3_pshufd_ps(vd, 0);  // splat
+		mVec128 = _mm_mul_ps(mVec128, vd);
+
+		return *this;
+#else
+		return *this /= length();
+#endif
+	}
+
+	/**@brief Return a scaled version of this quaternion
+   * @param s The scale factor */
+	B3_FORCE_INLINE b3Quaternion
+	operator*(const b3Scalar& s) const
+	{
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
+		__m128 vs = _mm_load_ss(&s);  //	(S 0 0 0)
+		vs = b3_pshufd_ps(vs, 0x00);  //	(S S S S)
+
+		return b3Quaternion(_mm_mul_ps(mVec128, vs));
+#elif defined(B3_USE_NEON)
+		return b3Quaternion(vmulq_n_f32(mVec128, s));
+#else
+		return b3Quaternion(getX() * s, getY() * s, getZ() * s, m_floats[3] * s);
+#endif
+	}
+
+	/**@brief Return an inversely scaled version of this quaternion
+   * @param s The inverse scale factor; asserted to be non-zero */
+	b3Quaternion operator/(const b3Scalar& s) const
+	{
+		b3Assert(s != b3Scalar(0.0));
+		return *this * (b3Scalar(1.0) / s);
+	}
+
+	/**@brief Inversely scale this quaternion
+   * @param s The scale factor; asserted to be non-zero */
+	b3Quaternion& operator/=(const b3Scalar& s)
+	{
+		b3Assert(s != b3Scalar(0.0));
+		return *this *= b3Scalar(1.0) / s;
+	}
+
+	/**@brief Return a normalized version of this quaternion */
+	b3Quaternion normalized() const
+	{
+		return *this / length();
+	}
+	/**@brief Return the angle between this quaternion and the other 
+   * @param q The other quaternion
+   * Computed as acos(dot(q) / sqrt(length2() * q.length2())), so the inputs need not be unit length. */
+	b3Scalar angle(const b3Quaternion& q) const
+	{
+		b3Scalar s = b3Sqrt(length2() * q.length2());
+		b3Assert(s != b3Scalar(0.0));
+		return b3Acos(dot(q) / s);
+	}
+	/**@brief Return the angle of rotation represented by this quaternion
+   * Computed as 2 * acos(w). */
+	b3Scalar getAngle() const
+	{
+		b3Scalar s = b3Scalar(2.) * b3Acos(m_floats[3]);
+		return s;
+	}
+
+	/**@brief Return the axis of the rotation represented by this quaternion
+   * Returns (x, y, z) / sqrt(1 - w^2), or the arbitrary axis (1, 0, 0) when 1 - w^2 is below 10 * B3_EPSILON. */
+	b3Vector3 getAxis() const
+	{
+		b3Scalar s_squared = 1.f - m_floats[3] * m_floats[3];
+
+		if (s_squared < b3Scalar(10.) * B3_EPSILON)  //Check for divide by zero
+			return b3MakeVector3(1.0, 0.0, 0.0);     // Arbitrary
+		b3Scalar s = 1.f / b3Sqrt(s_squared);
+		return b3MakeVector3(m_floats[0] * s, m_floats[1] * s, m_floats[2] * s);
+	}
+
+	/**@brief Return the inverse of this quaternion
+   * Only the signs of x, y and z are flipped (the conjugate); no division by the squared length is done here. */
+	b3Quaternion inverse() const
+	{
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
+		return b3Quaternion(_mm_xor_ps(mVec128, b3vQInv));
+#elif defined(B3_USE_NEON)
+		return b3Quaternion((b3SimdFloat4)veorq_s32((int32x4_t)mVec128, (int32x4_t)b3vQInv));
+#else
+		return b3Quaternion(-m_floats[0], -m_floats[1], -m_floats[2], m_floats[3]);
+#endif
+	}
+
+	/**@brief Return the sum of this quaternion and the other 
+   * @param q2 The other quaternion */
+	B3_FORCE_INLINE b3Quaternion
+	operator+(const b3Quaternion& q2) const
+	{
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
+		return b3Quaternion(_mm_add_ps(mVec128, q2.mVec128));
+#elif defined(B3_USE_NEON)
+		return b3Quaternion(vaddq_f32(mVec128, q2.mVec128));
+#else
+		const b3Quaternion& q1 = *this;
+		return b3Quaternion(q1.getX() + q2.getX(), q1.getY() + q2.getY(), q1.getZ() + q2.getZ(), q1.m_floats[3] + q2.m_floats[3]);
+#endif
+	}
+
+	/**@brief Return the difference between this quaternion and the other 
+   * @param q2 The other quaternion */
+	B3_FORCE_INLINE b3Quaternion
+	operator-(const b3Quaternion& q2) const
+	{
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
+		return b3Quaternion(_mm_sub_ps(mVec128, q2.mVec128));
+#elif defined(B3_USE_NEON)
+		return b3Quaternion(vsubq_f32(mVec128, q2.mVec128));
+#else
+		const b3Quaternion& q1 = *this;
+		return b3Quaternion(q1.getX() - q2.getX(), q1.getY() - q2.getY(), q1.getZ() - q2.getZ(), q1.m_floats[3] - q2.m_floats[3]);
+#endif
+	}
+
+	/**@brief Return the negative of this quaternion 
+   * This simply negates each element */
+	B3_FORCE_INLINE b3Quaternion operator-() const
+	{
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
+		return b3Quaternion(_mm_xor_ps(mVec128, b3vMzeroMask));
+#elif defined(B3_USE_NEON)
+		return b3Quaternion((b3SimdFloat4)veorq_s32((int32x4_t)mVec128, (int32x4_t)b3vMzeroMask));
+#else
+		const b3Quaternion& q2 = *this;
+		return b3Quaternion(-q2.getX(), -q2.getY(), -q2.getZ(), -q2.m_floats[3]);
+#endif
+	}
+	/**@brief Of the two candidates qd and -qd, return the one farther from this quaternion
+   * @param qd The quaternion to compare against
+   * Compares |this - qd|^2 with |this + qd|^2; qd is returned only when the former is strictly larger, otherwise -qd. */
+	B3_FORCE_INLINE b3Quaternion farthest(const b3Quaternion& qd) const
+	{
+		b3Quaternion diff, sum;
+		diff = *this - qd;
+		sum = *this + qd;
+		if (diff.dot(diff) > sum.dot(sum))
+			return qd;
+		return (-qd);
+	}
+
+	/**@brief Of the two candidates qd and -qd, return the one nearer to this quaternion
+   * @param qd The quaternion to compare against
+   * Compares |this - qd|^2 with |this + qd|^2; qd is returned only when the former is strictly smaller, otherwise -qd. */
+	B3_FORCE_INLINE b3Quaternion nearest(const b3Quaternion& qd) const
+	{
+		b3Quaternion diff, sum;
+		diff = *this - qd;
+		sum = *this + qd;
+		if (diff.dot(diff) < sum.dot(sum))
+			return qd;
+		return (-qd);
+	}
+
+	/**@brief Return the quaternion which is the result of Spherical Linear Interpolation between this and the other quaternion
+   * @param q The other quaternion to interpolate with 
+   * @param t The ratio between this and q to interpolate.  If t = 0 the result is this, if t=1 the result is q.
+   * Slerp interpolates assuming constant velocity.
+   * When the normalized dot product has magnitude 1 or more, *this is returned unchanged. */
+	b3Quaternion slerp(const b3Quaternion& q, const b3Scalar& t) const
+	{
+		b3Scalar magnitude = b3Sqrt(length2() * q.length2());
+		b3Assert(magnitude > b3Scalar(0));
+
+		b3Scalar product = dot(q) / magnitude;
+		// NOTE(review): the test uses no epsilon margin, so for nearly parallel inputs
+		// theta is small and 1 / sin(theta) becomes large - confirm the precision is acceptable.
+		if (b3Fabs(product) < b3Scalar(1))
+		{
+			// Take care of long angle case see http://en.wikipedia.org/wiki/Slerp
+			const b3Scalar sign = (product < 0) ? b3Scalar(-1) : b3Scalar(1);
+
+			const b3Scalar theta = b3Acos(sign * product);
+			const b3Scalar s1 = b3Sin(sign * t * theta);
+			const b3Scalar d = b3Scalar(1.0) / b3Sin(theta);
+			const b3Scalar s0 = b3Sin((b3Scalar(1.0) - t) * theta);
+
+			return b3Quaternion(
+				(m_floats[0] * s0 + q.getX() * s1) * d,
+				(m_floats[1] * s0 + q.getY() * s1) * d,
+				(m_floats[2] * s0 + q.getZ() * s1) * d,
+				(m_floats[3] * s0 + q.m_floats[3] * s1) * d);
+		}
+		else
+		{
+			return *this;
+		}
+	}
+
+	/**@brief Return a reference to a function-local static identity quaternion (0, 0, 0, 1) */
+	static const b3Quaternion& getIdentity()
+	{
+		static const b3Quaternion identityQuat(b3Scalar(0.), b3Scalar(0.), b3Scalar(0.), b3Scalar(1.));
+		return identityQuat;
+	}
+
+	/**@brief Return the w value */
+	B3_FORCE_INLINE const b3Scalar& getW() const { return m_floats[3]; }
+};
+
+/**@brief Return the product of two quaternions
+ * @param q1 Left-hand operand
+ * @param q2 Right-hand operand
+ * The scalar branch at the bottom spells out the four component formulas; the SSE and NEON
+ * branches build the same four groups of products (A0*B0, AB1, AB2, AB3) with lane shuffles. */
+B3_FORCE_INLINE b3Quaternion
+operator*(const b3Quaternion& q1, const b3Quaternion& q2)
+{
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
+	__m128 vQ1 = q1.get128();
+	__m128 vQ2 = q2.get128();
+	__m128 A0, A1, B1, A2, B2;
+
+	A1 = b3_pshufd_ps(vQ1, B3_SHUFFLE(0, 1, 2, 0));  // X Y  z x     //      vtrn
+	B1 = b3_pshufd_ps(vQ2, B3_SHUFFLE(3, 3, 3, 0));  // W W  W X     // vdup vext
+
+	A1 = A1 * B1;
+
+	A2 = b3_pshufd_ps(vQ1, B3_SHUFFLE(1, 2, 0, 1));  // Y Z  X Y     // vext
+	B2 = b3_pshufd_ps(vQ2, B3_SHUFFLE(2, 0, 1, 1));  // z x  Y Y     // vtrn vdup
+
+	A2 = A2 * B2;
+
+	B1 = b3_pshufd_ps(vQ1, B3_SHUFFLE(2, 0, 1, 2));  // z x Y Z      // vtrn vext
+	B2 = b3_pshufd_ps(vQ2, B3_SHUFFLE(1, 2, 0, 2));  // Y Z x z      // vext vtrn
+
+	B1 = B1 * B2;  //	A3 *= B3
+
+	A0 = b3_splat_ps(vQ1, 3);  //	A0
+	A0 = A0 * vQ2;             //	A0 * B0
+
+	A1 = A1 + A2;  //	AB12
+	A0 = A0 - B1;  //	AB03 = AB0 - AB3
+
+	A1 = _mm_xor_ps(A1, b3vPPPM);  //	change sign of the last element
+	A0 = A0 + A1;                  //	AB03 + AB12
+
+	return b3Quaternion(A0);
+
+#elif defined(B3_USE_NEON)
+
+	float32x4_t vQ1 = q1.get128();
+	float32x4_t vQ2 = q2.get128();
+	float32x4_t A0, A1, B1, A2, B2, A3, B3;
+	float32x2_t vQ1zx, vQ2wx, vQ1yz, vQ2zx, vQ2yz, vQ2xz;
+
+	{
+		float32x2x2_t tmp;
+		tmp = vtrn_f32(vget_high_f32(vQ1), vget_low_f32(vQ1));  // {z x}, {w y}
+		vQ1zx = tmp.val[0];
+
+		tmp = vtrn_f32(vget_high_f32(vQ2), vget_low_f32(vQ2));  // {z x}, {w y}
+		vQ2zx = tmp.val[0];
+	}
+	vQ2wx = vext_f32(vget_high_f32(vQ2), vget_low_f32(vQ2), 1);
+
+	vQ1yz = vext_f32(vget_low_f32(vQ1), vget_high_f32(vQ1), 1);
+
+	vQ2yz = vext_f32(vget_low_f32(vQ2), vget_high_f32(vQ2), 1);
+	vQ2xz = vext_f32(vQ2zx, vQ2zx, 1);
+
+	A1 = vcombine_f32(vget_low_f32(vQ1), vQ1zx);                     // X Y  z x
+	B1 = vcombine_f32(vdup_lane_f32(vget_high_f32(vQ2), 1), vQ2wx);  // W W  W X
+
+	A2 = vcombine_f32(vQ1yz, vget_low_f32(vQ1));
+	B2 = vcombine_f32(vQ2zx, vdup_lane_f32(vget_low_f32(vQ2), 1));
+
+	A3 = vcombine_f32(vQ1zx, vQ1yz);  // Z X Y Z
+	B3 = vcombine_f32(vQ2yz, vQ2xz);  // Y Z x z
+
+	A1 = vmulq_f32(A1, B1);
+	A2 = vmulq_f32(A2, B2);
+	A3 = vmulq_f32(A3, B3);                           //	A3 *= B3
+	A0 = vmulq_lane_f32(vQ2, vget_high_f32(vQ1), 1);  //	A0 * B0
+
+	A1 = vaddq_f32(A1, A2);  //	AB12 = AB1 + AB2
+	A0 = vsubq_f32(A0, A3);  //	AB03 = AB0 - AB3
+
+	//	change the sign of the last element
+	A1 = (b3SimdFloat4)veorq_s32((int32x4_t)A1, (int32x4_t)b3vPPPM);
+	A0 = vaddq_f32(A0, A1);  //	AB03 + AB12
+
+	return b3Quaternion(A0);
+
+#else
+	// Arguments are, in order, the x, y, z and w components of the result
+	return b3Quaternion(
+		q1.getW() * q2.getX() + q1.getX() * q2.getW() + q1.getY() * q2.getZ() - q1.getZ() * q2.getY(),
+		q1.getW() * q2.getY() + q1.getY() * q2.getW() + q1.getZ() * q2.getX() - q1.getX() * q2.getZ(),
+		q1.getW() * q2.getZ() + q1.getZ() * q2.getW() + q1.getX() * q2.getY() - q1.getY() * q2.getX(),
+		q1.getW() * q2.getW() - q1.getX() * q2.getX() - q1.getY() * q2.getY() - q1.getZ() * q2.getZ());
+#endif
+}
+
+/**@brief Return the product of a quaternion and a vector
+ * @param q Left operand (quaternion)
+ * @param w Right operand (vector)
+ * The scalar branch is the quaternion product formula with every term that would involve
+ * a w component of the right operand left out, i.e. the vector is used as if it were a
+ * quaternion whose w component is zero. The SSE and NEON branches are vectorized forms. */
+B3_FORCE_INLINE b3Quaternion
+operator*(const b3Quaternion& q, const b3Vector3& w)
+{
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
+	__m128 vQ1 = q.get128();
+	__m128 vQ2 = w.get128();
+	__m128 A1, B1, A2, B2, A3, B3;
+
+	A1 = b3_pshufd_ps(vQ1, B3_SHUFFLE(3, 3, 3, 0));
+	B1 = b3_pshufd_ps(vQ2, B3_SHUFFLE(0, 1, 2, 0));
+
+	A1 = A1 * B1;
+
+	A2 = b3_pshufd_ps(vQ1, B3_SHUFFLE(1, 2, 0, 1));
+	B2 = b3_pshufd_ps(vQ2, B3_SHUFFLE(2, 0, 1, 1));
+
+	A2 = A2 * B2;
+
+	A3 = b3_pshufd_ps(vQ1, B3_SHUFFLE(2, 0, 1, 2));
+	B3 = b3_pshufd_ps(vQ2, B3_SHUFFLE(1, 2, 0, 2));
+
+	A3 = A3 * B3;  //	A3 *= B3
+
+	A1 = A1 + A2;                  //	AB12
+	A1 = _mm_xor_ps(A1, b3vPPPM);  //	change sign of the last element
+	A1 = A1 - A3;                  //	AB123 = AB12 - AB3
+
+	return b3Quaternion(A1);
+
+#elif defined(B3_USE_NEON)
+
+	float32x4_t vQ1 = q.get128();
+	float32x4_t vQ2 = w.get128();
+	float32x4_t A1, B1, A2, B2, A3, B3;
+	float32x2_t vQ1wx, vQ2zx, vQ1yz, vQ2yz, vQ1zx, vQ2xz;
+
+	vQ1wx = vext_f32(vget_high_f32(vQ1), vget_low_f32(vQ1), 1);
+	{
+		float32x2x2_t tmp;
+
+		tmp = vtrn_f32(vget_high_f32(vQ2), vget_low_f32(vQ2));  // {z x}, {w y}
+		vQ2zx = tmp.val[0];
+
+		tmp = vtrn_f32(vget_high_f32(vQ1), vget_low_f32(vQ1));  // {z x}, {w y}
+		vQ1zx = tmp.val[0];
+	}
+
+	vQ1yz = vext_f32(vget_low_f32(vQ1), vget_high_f32(vQ1), 1);
+
+	vQ2yz = vext_f32(vget_low_f32(vQ2), vget_high_f32(vQ2), 1);
+	vQ2xz = vext_f32(vQ2zx, vQ2zx, 1);
+
+	A1 = vcombine_f32(vdup_lane_f32(vget_high_f32(vQ1), 1), vQ1wx);  // W W  W X
+	B1 = vcombine_f32(vget_low_f32(vQ2), vQ2zx);                     // X Y  z x
+
+	A2 = vcombine_f32(vQ1yz, vget_low_f32(vQ1));
+	B2 = vcombine_f32(vQ2zx, vdup_lane_f32(vget_low_f32(vQ2), 1));
+
+	A3 = vcombine_f32(vQ1zx, vQ1yz);  // Z X Y Z
+	B3 = vcombine_f32(vQ2yz, vQ2xz);  // Y Z x z
+
+	A1 = vmulq_f32(A1, B1);
+	A2 = vmulq_f32(A2, B2);
+	A3 = vmulq_f32(A3, B3);  //	A3 *= B3
+
+	A1 = vaddq_f32(A1, A2);  //	AB12 = AB1 + AB2
+
+	//	change the sign of the last element
+	A1 = (b3SimdFloat4)veorq_s32((int32x4_t)A1, (int32x4_t)b3vPPPM);
+
+	A1 = vsubq_f32(A1, A3);  //	AB123 = AB12 - AB3
+
+	return b3Quaternion(A1);
+
+#else
+	// Scalar reference form: x, y, z components first, w component last.
+	return b3Quaternion(
+		q.getW() * w.getX() + q.getY() * w.getZ() - q.getZ() * w.getY(),
+		q.getW() * w.getY() + q.getZ() * w.getX() - q.getX() * w.getZ(),
+		q.getW() * w.getZ() + q.getX() * w.getY() - q.getY() * w.getX(),
+		-q.getX() * w.getX() - q.getY() * w.getY() - q.getZ() * w.getZ());
+#endif
+}
+
+/**@brief Return the product of a vector and a quaternion
+ * @param w Left operand (vector)
+ * @param q Right operand (quaternion)
+ * The scalar branch is the quaternion product formula with every term that would involve
+ * a w component of the left operand left out, i.e. the vector is used as if it were a
+ * quaternion whose w component is zero. The SSE and NEON branches are vectorized forms. */
+B3_FORCE_INLINE b3Quaternion
+operator*(const b3Vector3& w, const b3Quaternion& q)
+{
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
+	__m128 vQ1 = w.get128();
+	__m128 vQ2 = q.get128();
+	__m128 A1, B1, A2, B2, A3, B3;
+
+	A1 = b3_pshufd_ps(vQ1, B3_SHUFFLE(0, 1, 2, 0));  // X Y  z x
+	B1 = b3_pshufd_ps(vQ2, B3_SHUFFLE(3, 3, 3, 0));  // W W  W X
+
+	A1 = A1 * B1;
+
+	A2 = b3_pshufd_ps(vQ1, B3_SHUFFLE(1, 2, 0, 1));
+	B2 = b3_pshufd_ps(vQ2, B3_SHUFFLE(2, 0, 1, 1));
+
+	A2 = A2 * B2;
+
+	A3 = b3_pshufd_ps(vQ1, B3_SHUFFLE(2, 0, 1, 2));
+	B3 = b3_pshufd_ps(vQ2, B3_SHUFFLE(1, 2, 0, 2));
+
+	A3 = A3 * B3;  //	A3 *= B3
+
+	A1 = A1 + A2;                  //	AB12
+	A1 = _mm_xor_ps(A1, b3vPPPM);  //	change sign of the last element
+	A1 = A1 - A3;                  //	AB123 = AB12 - AB3
+
+	return b3Quaternion(A1);
+
+#elif defined(B3_USE_NEON)
+
+	float32x4_t vQ1 = w.get128();
+	float32x4_t vQ2 = q.get128();
+	float32x4_t A1, B1, A2, B2, A3, B3;
+	float32x2_t vQ1zx, vQ2wx, vQ1yz, vQ2zx, vQ2yz, vQ2xz;
+
+	{
+		float32x2x2_t tmp;
+
+		tmp = vtrn_f32(vget_high_f32(vQ1), vget_low_f32(vQ1));  // {z x}, {w y}
+		vQ1zx = tmp.val[0];
+
+		tmp = vtrn_f32(vget_high_f32(vQ2), vget_low_f32(vQ2));  // {z x}, {w y}
+		vQ2zx = tmp.val[0];
+	}
+	vQ2wx = vext_f32(vget_high_f32(vQ2), vget_low_f32(vQ2), 1);
+
+	vQ1yz = vext_f32(vget_low_f32(vQ1), vget_high_f32(vQ1), 1);
+
+	vQ2yz = vext_f32(vget_low_f32(vQ2), vget_high_f32(vQ2), 1);
+	vQ2xz = vext_f32(vQ2zx, vQ2zx, 1);
+
+	A1 = vcombine_f32(vget_low_f32(vQ1), vQ1zx);                     // X Y  z x
+	B1 = vcombine_f32(vdup_lane_f32(vget_high_f32(vQ2), 1), vQ2wx);  // W W  W X
+
+	A2 = vcombine_f32(vQ1yz, vget_low_f32(vQ1));
+	B2 = vcombine_f32(vQ2zx, vdup_lane_f32(vget_low_f32(vQ2), 1));
+
+	A3 = vcombine_f32(vQ1zx, vQ1yz);  // Z X Y Z
+	B3 = vcombine_f32(vQ2yz, vQ2xz);  // Y Z x z
+
+	A1 = vmulq_f32(A1, B1);
+	A2 = vmulq_f32(A2, B2);
+	A3 = vmulq_f32(A3, B3);  //	A3 *= B3
+
+	A1 = vaddq_f32(A1, A2);  //	AB12 = AB1 + AB2
+
+	//	change the sign of the last element
+	A1 = (b3SimdFloat4)veorq_s32((int32x4_t)A1, (int32x4_t)b3vPPPM);
+
+	A1 = vsubq_f32(A1, A3);  //	AB123 = AB12 - AB3
+
+	return b3Quaternion(A1);
+
+#else
+	// Scalar reference form: x, y, z components first, w component last.
+	return b3Quaternion(
+		+w.getX() * q.getW() + w.getY() * q.getZ() - w.getZ() * q.getY(),
+		+w.getY() * q.getW() + w.getZ() * q.getX() - w.getX() * q.getZ(),
+		+w.getZ() * q.getW() + w.getX() * q.getY() - w.getY() * q.getX(),
+		-w.getX() * q.getX() - w.getY() * q.getY() - w.getZ() * q.getZ());
+#endif
+}
+
+/**@brief Calculate the dot product between two quaternions (forwards to q1.dot(q2)) */
+B3_FORCE_INLINE b3Scalar
+b3Dot(const b3Quaternion& q1, const b3Quaternion& q2)
+{
+	return q1.dot(q2);
+}
+
+/**@brief Return the length of a quaternion (forwards to q.length()) */
+B3_FORCE_INLINE b3Scalar
+b3Length(const b3Quaternion& q)
+{
+	return q.length();
+}
+
+/**@brief Return the angle between two quaternions (forwards to q1.angle(q2)) */
+B3_FORCE_INLINE b3Scalar
+b3Angle(const b3Quaternion& q1, const b3Quaternion& q2)
+{
+	return q1.angle(q2);
+}
+
+/**@brief Return the inverse of a quaternion (forwards to q.inverse()) */
+B3_FORCE_INLINE b3Quaternion
+b3Inverse(const b3Quaternion& q)
+{
+	return q.inverse();
+}
+
+/**@brief Return the result of spherical linear interpolation between two quaternions
+ * @param q1 The first quaternion
+ * @param q2 The second quaternion
+ * @param t The ratio between q1 and q2. t = 0 returns q1, t = 1 returns q2
+ * Slerp assumes constant velocity between positions. */
+B3_FORCE_INLINE b3Quaternion
+b3Slerp(const b3Quaternion& q1, const b3Quaternion& q2, const b3Scalar& t)
+{
+	return q1.slerp(q2, t);
+}
+
+/**@brief Return the quaternion product rot0 * rot1 (free-function spelling of operator*) */
+B3_FORCE_INLINE b3Quaternion
+b3QuatMul(const b3Quaternion& rot0, const b3Quaternion& rot1)
+{
+	return rot0 * rot1;
+}
+
+/**@brief Return orn.normalized(); the argument itself is not modified */
+B3_FORCE_INLINE b3Quaternion
+b3QuatNormalized(const b3Quaternion& orn)
+{
+	return orn.normalized();
+}
+
+/**@brief Rotate a vector by a quaternion
+ * @param rotation The rotation to apply
+ * @param v The vector to rotate
+ * Forms rotation * v, multiplies the result by rotation.inverse(), and returns the
+ * x, y, z components of that quaternion as a vector. */
+B3_FORCE_INLINE b3Vector3
+b3QuatRotate(const b3Quaternion& rotation, const b3Vector3& v)
+{
+	b3Quaternion q = rotation * v;
+	q *= rotation.inverse();
+	// The SIMD branches mask the 128-bit value before building the vector
+	// (presumably clearing the fourth lane - confirm against the mask definitions).
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
+	return b3MakeVector3(_mm_and_ps(q.get128(), b3vFFF0fMask));
+#elif defined(B3_USE_NEON)
+	return b3MakeVector3((float32x4_t)vandq_s32((int32x4_t)q.get128(), b3vFFF0Mask));
+#else
+	return b3MakeVector3(q.getX(), q.getY(), q.getZ());
+#endif
+}
+
+/**@brief Build the shortest-arc quaternion between two directions (Game Programming Gems 2.10).
+ * @param v0 The first direction; the caller must make sure it is normalized
+ * @param v1 The second direction; the caller must make sure it is normalized
+ * When the dot product of the inputs is below -1 + B3_EPSILON, a vector orthogonal to v0
+ * (obtained from b3PlaneSpace1) is used for x, y, z together with w = 0. */
+B3_FORCE_INLINE b3Quaternion
+b3ShortestArcQuat(const b3Vector3& v0, const b3Vector3& v1)
+{
+	const b3Vector3 axis = v0.cross(v1);
+	const b3Scalar cosAngle = v0.dot(v1);
+
+	if (!(cosAngle < -1.0 + B3_EPSILON))
+	{
+		// regular case: scale the cross product and derive w from the dot product
+		const b3Scalar s = b3Sqrt((1.0f + cosAngle) * 2.0f);
+		const b3Scalar invS = 1.0f / s;
+		return b3Quaternion(axis.getX() * invS, axis.getY() * invS, axis.getZ() * invS, s * 0.5f);
+	}
+
+	// the cross product is unusable here, so just pick any vector that is orthogonal to v0
+	b3Vector3 n, unused;
+	b3PlaneSpace1(v0, n, unused);
+	return b3Quaternion(n.getX(), n.getY(), n.getZ(), 0.0f);
+}
+
+/**@brief Normalize both vectors, then return b3ShortestArcQuat(v0, v1)
+ * @param v0 The first direction; normalize() is called on it, so the caller's vector is modified
+ * @param v1 The second direction; normalize() is called on it, so the caller's vector is modified */
+B3_FORCE_INLINE b3Quaternion
+b3ShortestArcQuatNormalize2(b3Vector3& v0, b3Vector3& v1)
+{
+	v0.normalize();
+	v1.normalize();
+	return b3ShortestArcQuat(v0, v1);
+}
+
+#endif  //B3_SIMD__QUATERNION_H_

+ 46 - 0
Dependencies/include/bullet3/Bullet3Common/b3Random.h

@@ -0,0 +1,46 @@
+/*
+Copyright (c) 2003-2013 Gino van den Bergen / Erwin Coumans  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef B3_GEN_RANDOM_H
+#define B3_GEN_RANDOM_H
+
+#include "b3Scalar.h"
+
+#ifdef MT19937
+
+#include <limits.h>
+#include <mt19937.h>
+
+#define B3_RAND_MAX UINT_MAX
+
+// MT19937 build: seed through init_genrand() and draw through genrand_int32().
+B3_FORCE_INLINE void b3Srand(unsigned int seed) { init_genrand(seed); }
+B3_FORCE_INLINE unsigned int b3rand() { return genrand_int32(); }
+
+#else
+
+#include <stdlib.h>
+
+#define B3_RAND_MAX RAND_MAX
+
+// Default build: seed through srand() and draw through rand() from <stdlib.h>.
+B3_FORCE_INLINE void b3Srand(unsigned int seed) { srand(seed); }
+B3_FORCE_INLINE unsigned int b3rand() { return rand(); }
+
+#endif
+
+/// Draw one value from b3rand(), scale it by 1 / (B3_RAND_MAX + 1), and map it linearly
+/// onto the interval that starts at minRange and has width (maxRange - minRange).
+inline b3Scalar b3RandRange(b3Scalar minRange, b3Scalar maxRange)
+{
+	const b3Scalar unit = b3rand() / (b3Scalar(B3_RAND_MAX) + b3Scalar(1.0));
+	const b3Scalar width = maxRange - minRange;
+	return unit * width + minRange;
+}
+
+#endif  //B3_GEN_RANDOM_H

+ 171 - 0
Dependencies/include/bullet3/Bullet3Common/b3ResizablePool.h

@@ -0,0 +1,171 @@
+
+#ifndef B3_RESIZABLE_POOL_H
+#define B3_RESIZABLE_POOL_H
+
+#include "Bullet3Common/b3AlignedObjectArray.h"
+
+/// Sentinel values stored in a pool handle's "next free" field by b3ResizablePool.
+enum
+{
+	B3_POOL_HANDLE_TERMINAL_FREE = -1,  // end of the free list
+	B3_POOL_HANDLE_TERMINAL_USED = -2   // the handle is currently allocated
+};
+
+/// Adds the free-list link that b3ResizablePool reads and writes to a user type U.
+/// The link is not initialized here; it holds whatever was last passed to setNextFree().
+template <typename U>
+struct b3PoolBodyHandle : public U
+{
+	B3_DECLARE_ALIGNED_ALLOCATOR();
+
+	int m_nextFreeHandle;  // free-list link, as maintained through the accessors below
+
+	int getNextFree() const { return m_nextFreeHandle; }
+	void setNextFree(int next) { m_nextFreeHandle = next; }
+};
+
+/// Growable pool of T addressed by integer handles (indices into m_bodyHandles).
+/// Free slots are chained into a singly linked list through T::getNextFree()/setNextFree();
+/// allocated slots carry B3_POOL_HANDLE_TERMINAL_USED in that field instead.
+/// As used below, T must provide getNextFree(), setNextFree(int) and clear().
+/// Growing the pool resizes the underlying array, so pointers returned by getHandle()
+/// should not be kept across allocHandle()/increaseHandleCapacity() calls.
+template <typename T>
+class b3ResizablePool
+{
+protected:
+	b3AlignedObjectArray<T> m_bodyHandles;
+	int m_numUsedHandles;   // number of active handles
+	int m_firstFreeHandle;  // head of the free list, B3_POOL_HANDLE_TERMINAL_FREE when it is empty
+
+	/// Unchecked access; the caller guarantees that handle is a valid index.
+	T* getHandleInternal(int handle)
+	{
+		return &m_bodyHandles[handle];
+	}
+	const T* getHandleInternal(int handle) const
+	{
+		return &m_bodyHandles[handle];
+	}
+
+	/// True when handle is a valid index whose slot is currently allocated.
+	bool isUsedHandle(int handle) const
+	{
+		if ((handle < 0) || (handle >= m_bodyHandles.size()))
+		{
+			return false;
+		}
+		return m_bodyHandles[handle].getNextFree() == B3_POOL_HANDLE_TERMINAL_USED;
+	}
+
+public:
+	b3ResizablePool()
+	{
+		initHandles();
+	}
+
+	virtual ~b3ResizablePool()
+	{
+		exitHandles();
+	}
+	///handle management
+
+	/// Total number of slots (used and free).
+	int getNumHandles() const
+	{
+		return m_bodyHandles.size();
+	}
+
+	/// Append the index of every allocated slot to usedHandles (existing content is kept).
+	void getUsedHandles(b3AlignedObjectArray<int>& usedHandles) const
+	{
+		for (int i = 0; i < m_bodyHandles.size(); i++)
+		{
+			if (m_bodyHandles[i].getNextFree() == B3_POOL_HANDLE_TERMINAL_USED)
+			{
+				usedHandles.push_back(i);
+			}
+		}
+	}
+
+	/// Return the slot for handle, or 0 when handle is out of range or not allocated.
+	T* getHandle(int handle)
+	{
+		b3Assert(handle >= 0);
+		b3Assert(handle < m_bodyHandles.size());
+		return isUsedHandle(handle) ? &m_bodyHandles[handle] : 0;
+	}
+	const T* getHandle(int handle) const
+	{
+		b3Assert(handle >= 0);
+		b3Assert(handle < m_bodyHandles.size());
+		return isUsedHandle(handle) ? &m_bodyHandles[handle] : 0;
+	}
+
+	/// Add extraCapacity free slots. A non-positive extraCapacity is ignored, because
+	/// shrinking or a zero-sized growth would leave the free-list head pointing at an
+	/// invalid or in-use slot.
+	void increaseHandleCapacity(int extraCapacity)
+	{
+		if (extraCapacity <= 0)
+		{
+			return;
+		}
+
+		const int curCapacity = m_bodyHandles.size();
+		const int newCapacity = curCapacity + extraCapacity;
+		m_bodyHandles.resize(newCapacity);
+
+		// chain the new slots together ...
+		for (int i = curCapacity; i < newCapacity - 1; i++)
+		{
+			m_bodyHandles[i].setNextFree(i + 1);
+		}
+		// ... and hook the last one onto whatever was free before, so slots that were
+		// still free are not lost (this is B3_POOL_HANDLE_TERMINAL_FREE when none were)
+		m_bodyHandles[newCapacity - 1].setNextFree(m_firstFreeHandle);
+		m_firstFreeHandle = curCapacity;
+	}
+
+	/// Reset the counters and create the first free slot.
+	void initHandles()
+	{
+		m_numUsedHandles = 0;
+		m_firstFreeHandle = B3_POOL_HANDLE_TERMINAL_FREE;
+
+		increaseHandleCapacity(1);
+	}
+
+	/// Drop all slots; every previously returned handle becomes invalid.
+	void exitHandles()
+	{
+		m_bodyHandles.resize(0);
+		m_firstFreeHandle = B3_POOL_HANDLE_TERMINAL_FREE;
+		m_numUsedHandles = 0;
+	}
+
+	/// Take a slot from the free list, clear() it, mark it used, and return its index.
+	int allocHandle()
+	{
+		if (m_firstFreeHandle < 0)
+		{
+			// No free slot at all, e.g. after exitHandles(): create some instead of
+			// indexing the array with a negative handle.
+			const int curCapacity = m_bodyHandles.size();
+			increaseHandleCapacity(curCapacity > 0 ? curCapacity : 1);
+		}
+
+		const int handle = m_firstFreeHandle;
+		m_firstFreeHandle = getHandleInternal(handle)->getNextFree();
+		m_numUsedHandles++;
+
+		if (m_firstFreeHandle < 0)
+		{
+			// That was the last free slot: double the capacity right away so that the
+			// free list is non-empty again when this call returns.
+			increaseHandleCapacity(m_bodyHandles.size());
+		}
+
+		// look the slot up only now, the resize above may have moved the storage
+		T* slot = getHandleInternal(handle);
+		slot->setNextFree(B3_POOL_HANDLE_TERMINAL_USED);
+		slot->clear();
+		return handle;
+	}
+
+	/// Return an allocated slot to the free list. Handles that are out of range or not
+	/// currently allocated are ignored.
+	void freeHandle(int handle)
+	{
+		b3Assert(handle >= 0);
+		b3Assert(handle < m_bodyHandles.size());
+
+		if (isUsedHandle(handle))
+		{
+			T* slot = getHandleInternal(handle);
+			slot->clear();
+			slot->setNextFree(m_firstFreeHandle);
+			m_firstFreeHandle = handle;
+			m_numUsedHandles--;
+		}
+	}
+};
+///end handle management
+
+#endif  //B3_RESIZABLE_POOL_H

+ 689 - 0
Dependencies/include/bullet3/Bullet3Common/b3Scalar.h

@@ -0,0 +1,689 @@
+/*
+Copyright (c) 2003-2013 Gino van den Bergen / Erwin Coumans  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef B3_SCALAR_H
+#define B3_SCALAR_H
+
+#ifdef B3_MANAGED_CODE
+//Aligned data types not supported in managed code
+#pragma unmanaged
+#endif
+
+#include <math.h>
+#include <stdlib.h>  //size_t for MSVC 6.0
+#include <float.h>
+
+//Original repository is at http://github.com/erwincoumans/bullet3
+#define B3_BULLET_VERSION 300
+
+/// Return the value of the B3_BULLET_VERSION macro this header was compiled with.
+inline int b3GetVersion()
+{
+	return B3_BULLET_VERSION;
+}
+
+#if defined(DEBUG) || defined(_DEBUG)
+#define B3_DEBUG
+#endif
+
+#include "b3Logging.h"  //for b3Error
+
+#ifdef _WIN32
+
+#if  defined(__GNUC__)	// it should handle both MINGW and CYGWIN
+#define B3_FORCE_INLINE             __inline__ __attribute__((always_inline))
+#define B3_ATTRIBUTE_ALIGNED16(a)   a __attribute__((aligned(16)))
+#define B3_ATTRIBUTE_ALIGNED64(a)   a __attribute__((aligned(64)))
+#define B3_ATTRIBUTE_ALIGNED128(a)  a __attribute__((aligned(128)))
+#elif ( defined(_MSC_VER) && _MSC_VER < 1300 )
+#define B3_FORCE_INLINE inline
+#define B3_ATTRIBUTE_ALIGNED16(a) a
+#define B3_ATTRIBUTE_ALIGNED64(a) a
+#define B3_ATTRIBUTE_ALIGNED128(a) a
+#else
+//#define B3_HAS_ALIGNED_ALLOCATOR
+#pragma warning(disable : 4324)  // disable padding warning
+//			#pragma warning(disable:4530) // Disable the exception disable but used in MSCV Stl warning.
+#pragma warning(disable : 4996)  //Turn off warnings about deprecated C routines
+//			#pragma warning(disable:4786) // Disable the "debug name too long" warning
+
+#define B3_FORCE_INLINE __forceinline
+#define B3_ATTRIBUTE_ALIGNED16(a) __declspec(align(16)) a
+#define B3_ATTRIBUTE_ALIGNED64(a) __declspec(align(64)) a
+#define B3_ATTRIBUTE_ALIGNED128(a) __declspec(align(128)) a
+#ifdef _XBOX
+#define B3_USE_VMX128
+
+#include <ppcintrinsics.h>
+#define B3_HAVE_NATIVE_FSEL
+#define b3Fsel(a, b, c) __fsel((a), (b), (c))
+#else
+
+#if (defined(_WIN32) && (_MSC_VER) && _MSC_VER >= 1400) && (!defined(B3_USE_DOUBLE_PRECISION))
+#if (defined(_M_IX86) || defined(_M_X64))
+
+
+#ifdef __clang__
+//#define B3_NO_SIMD_OPERATOR_OVERLOADS
+#define B3_DISABLE_SSE
+#endif //__clang__
+
+#ifndef B3_DISABLE_SSE
+#define B3_USE_SSE
+#endif //B3_DISABLE_SSE
+
+#ifdef B3_USE_SSE
+//B3_USE_SSE_IN_API is disabled under Windows by default, because
+//it makes it harder to integrate Bullet into your application under Windows
+//(structures embedding Bullet structs/classes need to be 16-byte aligned)
+//with relatively little performance gain
+//If you are not embedding Bullet data in your classes, or you make sure that you align those classes on 16-byte boundaries,
+//you can manually enable this line or set it in the build system for a bit of performance gain (a few percent, dependent on usage)
+//#define B3_USE_SSE_IN_API
+#endif  //B3_USE_SSE
+#include <emmintrin.h>
+#endif
+#endif
+
+#endif  //_XBOX
+
+#endif  //__MINGW32__
+
+#ifdef B3_DEBUG
+#ifdef _MSC_VER
+#include <stdio.h>
+#define b3Assert(x) { if(!(x)){b3Error("Assert " __FILE__ ":%u (%s)\n", __LINE__, #x);__debugbreak();	}}
+#else  //_MSC_VER
+#include <assert.h>
+#define b3Assert assert
+#endif  //_MSC_VER
+#else
+#define b3Assert(x)
+#endif
+//b3FullAssert is optional, slows down a lot
+#define b3FullAssert(x)
+
+#define b3Likely(_c) _c
+#define b3Unlikely(_c) _c
+
+#else
+
+#if defined(__CELLOS_LV2__)
+#define B3_FORCE_INLINE inline __attribute__((always_inline))
+#define B3_ATTRIBUTE_ALIGNED16(a) a __attribute__((aligned(16)))
+#define B3_ATTRIBUTE_ALIGNED64(a) a __attribute__((aligned(64)))
+#define B3_ATTRIBUTE_ALIGNED128(a) a __attribute__((aligned(128)))
+#ifndef assert
+#include <assert.h>
+#endif
+#ifdef B3_DEBUG
+#ifdef __SPU__
+#include <spu_printf.h>
+#define printf spu_printf
+#define b3Assert(x)               \
+	{                             \
+		if (!(x))                 \
+		{                         \
+			b3Error(              \
+				"Assert "__FILE__ \
+				":%u (" #x ")\n", \
+				__LINE__);        \
+			spu_hcmpeq(0, 0);     \
+		}                         \
+	}
+#else
+#define b3Assert assert
+#endif
+
+#else
+#define b3Assert(x)
+#endif
+//b3FullAssert is optional, slows down a lot
+#define b3FullAssert(x)
+
+#define b3Likely(_c) _c
+#define b3Unlikely(_c) _c
+
+#else
+
+#ifdef USE_LIBSPE2
+
+#define B3_FORCE_INLINE __inline
+#define B3_ATTRIBUTE_ALIGNED16(a) a __attribute__((aligned(16)))
+#define B3_ATTRIBUTE_ALIGNED64(a) a __attribute__((aligned(64)))
+#define B3_ATTRIBUTE_ALIGNED128(a) a __attribute__((aligned(128)))
+#ifndef assert
+#include <assert.h>
+#endif
+#ifdef B3_DEBUG
+#define b3Assert assert
+#else
+#define b3Assert(x)
+#endif
+//b3FullAssert is optional, slows down a lot
+#define b3FullAssert(x)
+
+#define b3Likely(_c) __builtin_expect((_c), 1)
+#define b3Unlikely(_c) __builtin_expect((_c), 0)
+
+#else
+//non-windows systems
+
+#if (defined(__APPLE__) && (!defined(B3_USE_DOUBLE_PRECISION)))
+#if defined(__i386__) || defined(__x86_64__)
+#define B3_USE_SSE
+//B3_USE_SSE_IN_API is enabled on Mac OSX by default, because memory is automatically aligned on 16-byte boundaries
+//if apps run into issues, we will disable the next line
+#define B3_USE_SSE_IN_API
+#ifdef B3_USE_SSE
+// include appropriate SSE level
+#if defined(__SSE4_1__)
+#include <smmintrin.h>
+#elif defined(__SSSE3__)
+#include <tmmintrin.h>
+#elif defined(__SSE3__)
+#include <pmmintrin.h>
+#else
+#include <emmintrin.h>
+#endif
+#endif  //B3_USE_SSE
+#elif defined(__armv7__)
+#ifdef __clang__
+#define B3_USE_NEON 1
+
+#if defined B3_USE_NEON && defined(__clang__)
+#include <arm_neon.h>
+#endif  //B3_USE_NEON
+#endif  //__clang__
+#endif  //__arm__
+
+#define B3_FORCE_INLINE inline __attribute__((always_inline))
+///@todo: check out alignment methods for other platforms/compilers
+#define B3_ATTRIBUTE_ALIGNED16(a) a __attribute__((aligned(16)))
+#define B3_ATTRIBUTE_ALIGNED64(a) a __attribute__((aligned(64)))
+#define B3_ATTRIBUTE_ALIGNED128(a) a __attribute__((aligned(128)))
+#ifndef assert
+#include <assert.h>
+#endif
+
+#if defined(DEBUG) || defined(_DEBUG)
+#if defined(__i386__) || defined(__x86_64__)
+#include <stdio.h>
+#define b3Assert(x)                                                             \
+	{                                                                           \
+		if (!(x))                                                               \
+		{                                                                       \
+			b3Error("Assert %s in line %d, file %s\n", #x, __LINE__, __FILE__); \
+			asm volatile("int3");                                               \
+		}                                                                       \
+	}
+#else  //defined (__i386__) || defined (__x86_64__)
+#define b3Assert assert
+#endif  //defined (__i386__) || defined (__x86_64__)
+#else   //defined(DEBUG) || defined (_DEBUG)
+#define b3Assert(x)
+#endif  //defined(DEBUG) || defined (_DEBUG)
+
+//b3FullAssert is optional, slows down a lot
+#define b3FullAssert(x)
+#define b3Likely(_c) _c
+#define b3Unlikely(_c) _c
+
+#else
+
+#define B3_FORCE_INLINE inline
+///@todo: check out alignment methods for other platforms/compilers
+#define B3_ATTRIBUTE_ALIGNED16(a) a __attribute__((aligned(16)))
+#define B3_ATTRIBUTE_ALIGNED64(a) a __attribute__((aligned(64)))
+#define B3_ATTRIBUTE_ALIGNED128(a) a __attribute__((aligned(128)))
+///#define B3_ATTRIBUTE_ALIGNED16(a) a
+///#define B3_ATTRIBUTE_ALIGNED64(a) a
+///#define B3_ATTRIBUTE_ALIGNED128(a) a
+#ifndef assert
+#include <assert.h>
+#endif
+
+#if defined(DEBUG) || defined(_DEBUG)
+#define b3Assert assert
+#else
+#define b3Assert(x)
+#endif
+
+//b3FullAssert is optional, slows down a lot
+#define b3FullAssert(x)
+#define b3Likely(_c) _c
+#define b3Unlikely(_c) _c
+#endif  //__APPLE__
+
+#endif  // LIBSPE2
+
+#endif  //__CELLOS_LV2__
+#endif
+
+///The b3Scalar type abstracts floating point numbers, to easily switch between double and single floating point precision.
+#if defined(B3_USE_DOUBLE_PRECISION)
+typedef double b3Scalar;
+//this number could be bigger in double precision
+#define B3_LARGE_FLOAT 1e30
+#else
+typedef float b3Scalar;
+//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX
+#define B3_LARGE_FLOAT 1e18f
+#endif
+
+#ifdef B3_USE_SSE
+typedef __m128 b3SimdFloat4;
+#endif  //B3_USE_SSE
+
+#if defined B3_USE_SSE_IN_API && defined(B3_USE_SSE)
+#ifdef _WIN32
+
+#ifndef B3_NAN
+static int b3NanMask = 0x7F800001;
+#define B3_NAN (*(float *)&b3NanMask)
+#endif
+
+#ifndef B3_INFINITY_MASK
+static int b3InfinityMask = 0x7F800000;
+#define B3_INFINITY_MASK (*(float *)&b3InfinityMask)
+#endif
+#ifndef B3_NO_SIMD_OPERATOR_OVERLOADS
+inline __m128 operator+(const __m128 A, const __m128 B)
+{
+	return _mm_add_ps(A, B);
+}
+
+inline __m128 operator-(const __m128 A, const __m128 B)
+{
+	return _mm_sub_ps(A, B);
+}
+
+inline __m128 operator*(const __m128 A, const __m128 B)
+{
+	return _mm_mul_ps(A, B);
+}
+#endif //B3_NO_SIMD_OPERATOR_OVERLOADS
+#define b3CastfTo128i(a) (_mm_castps_si128(a))
+#define b3CastfTo128d(a) (_mm_castps_pd(a))
+#define b3CastiTo128f(a) (_mm_castsi128_ps(a))
+#define b3CastdTo128f(a) (_mm_castpd_ps(a))
+#define b3CastdTo128i(a) (_mm_castpd_si128(a))
+#define b3Assign128(r0, r1, r2, r3) _mm_setr_ps(r0, r1, r2, r3)
+
+#else  //_WIN32
+
+#define b3CastfTo128i(a) ((__m128i)(a))
+#define b3CastfTo128d(a) ((__m128d)(a))
+#define b3CastiTo128f(a) ((__m128)(a))
+#define b3CastdTo128f(a) ((__m128)(a))
+#define b3CastdTo128i(a) ((__m128i)(a))
+#define b3Assign128(r0, r1, r2, r3) \
+	(__m128) { r0, r1, r2, r3 }
+#endif  //_WIN32
+#endif  //B3_USE_SSE_IN_API
+
+#ifdef B3_USE_NEON
+#include <arm_neon.h>
+
+typedef float32x4_t b3SimdFloat4;
+#define B3_INFINITY INFINITY
+#define B3_NAN NAN
+#define b3Assign128(r0, r1, r2, r3) \
+	(float32x4_t) { r0, r1, r2, r3 }
+#endif
+
+#define B3_DECLARE_ALIGNED_ALLOCATOR()                                                                   \
+	B3_FORCE_INLINE void *operator new(size_t sizeInBytes) { return b3AlignedAlloc(sizeInBytes, 16); }   \
+	B3_FORCE_INLINE void operator delete(void *ptr) { b3AlignedFree(ptr); }                              \
+	B3_FORCE_INLINE void *operator new(size_t, void *ptr) { return ptr; }                                \
+	B3_FORCE_INLINE void operator delete(void *, void *) {}                                              \
+	B3_FORCE_INLINE void *operator new[](size_t sizeInBytes) { return b3AlignedAlloc(sizeInBytes, 16); } \
+	B3_FORCE_INLINE void operator delete[](void *ptr) { b3AlignedFree(ptr); }                            \
+	B3_FORCE_INLINE void *operator new[](size_t, void *ptr) { return ptr; }                              \
+	B3_FORCE_INLINE void operator delete[](void *, void *) {}
+
+#if defined(B3_USE_DOUBLE_PRECISION) || defined(B3_FORCE_DOUBLE_FUNCTIONS)
+
+// Math wrappers used when B3_USE_DOUBLE_PRECISION or B3_FORCE_DOUBLE_FUNCTIONS is defined:
+// each one forwards to the double-precision <math.h> function of the same name.
+B3_FORCE_INLINE b3Scalar b3Sqrt(b3Scalar x)
+{
+	return sqrt(x);
+}
+B3_FORCE_INLINE b3Scalar b3Fabs(b3Scalar x) { return fabs(x); }
+B3_FORCE_INLINE b3Scalar b3Cos(b3Scalar x) { return cos(x); }
+B3_FORCE_INLINE b3Scalar b3Sin(b3Scalar x) { return sin(x); }
+B3_FORCE_INLINE b3Scalar b3Tan(b3Scalar x) { return tan(x); }
+// b3Acos / b3Asin clamp the argument to [-1, 1] before calling acos / asin.
+B3_FORCE_INLINE b3Scalar b3Acos(b3Scalar x)
+{
+	if (x < b3Scalar(-1)) x = b3Scalar(-1);
+	if (x > b3Scalar(1)) x = b3Scalar(1);
+	return acos(x);
+}
+B3_FORCE_INLINE b3Scalar b3Asin(b3Scalar x)
+{
+	if (x < b3Scalar(-1)) x = b3Scalar(-1);
+	if (x > b3Scalar(1)) x = b3Scalar(1);
+	return asin(x);
+}
+B3_FORCE_INLINE b3Scalar b3Atan(b3Scalar x) { return atan(x); }
+// NOTE: the parameters are passed to atan2 in declaration order, so the first
+// parameter (named x here) is the one atan2 treats as its first argument.
+B3_FORCE_INLINE b3Scalar b3Atan2(b3Scalar x, b3Scalar y) { return atan2(x, y); }
+B3_FORCE_INLINE b3Scalar b3Exp(b3Scalar x) { return exp(x); }
+B3_FORCE_INLINE b3Scalar b3Log(b3Scalar x) { return log(x); }
+B3_FORCE_INLINE b3Scalar b3Pow(b3Scalar x, b3Scalar y) { return pow(x, y); }
+B3_FORCE_INLINE b3Scalar b3Fmod(b3Scalar x, b3Scalar y) { return fmod(x, y); }
+
+#else
+
+/// Square root of y for builds that do not use the double-precision wrappers.
+/// Forwards to sqrtf() unless USE_APPROXIMATION is defined; in that case an estimate
+/// of 1/sqrt(y) is refined by five iterations and multiplied by y.
+B3_FORCE_INLINE b3Scalar b3Sqrt(b3Scalar y)
+{
+#ifdef USE_APPROXIMATION
+	double x, z, tempf;
+	// Address the second 32-bit word of the double. This uses unsigned int rather than
+	// unsigned long: where unsigned long is 64 bits wide (LP64 targets), "+ 1" on an
+	// unsigned long pointer would point past the end of tempf.
+	// NOTE(review): this still relies on a little-endian double layout and on
+	// type punning through a pointer cast.
+	unsigned int *tfptr = ((unsigned int *)&tempf) + 1;
+
+	tempf = y;
+	*tfptr = (0xbfcdd90a - *tfptr) >> 1; /* estimate of 1/sqrt(y) */
+	x = tempf;
+	z = y * b3Scalar(0.5);
+	x = (b3Scalar(1.5) * x) - (x * x) * (x * z); /* iteration formula     */
+	x = (b3Scalar(1.5) * x) - (x * x) * (x * z);
+	x = (b3Scalar(1.5) * x) - (x * x) * (x * z);
+	x = (b3Scalar(1.5) * x) - (x * x) * (x * z);
+	x = (b3Scalar(1.5) * x) - (x * x) * (x * z);
+	return x * y; /* y * (1/sqrt(y)) */
+#else
+	return sqrtf(y);
+#endif
+}
+// Single-precision math wrappers: each one forwards to the float <math.h>
+// function of the same name (fabsf, cosf, ...).
+B3_FORCE_INLINE b3Scalar b3Fabs(b3Scalar x) { return fabsf(x); }
+B3_FORCE_INLINE b3Scalar b3Cos(b3Scalar x) { return cosf(x); }
+B3_FORCE_INLINE b3Scalar b3Sin(b3Scalar x) { return sinf(x); }
+B3_FORCE_INLINE b3Scalar b3Tan(b3Scalar x) { return tanf(x); }
+// b3Acos / b3Asin clamp the argument to [-1, 1] before calling acosf / asinf.
+B3_FORCE_INLINE b3Scalar b3Acos(b3Scalar x)
+{
+	if (x < b3Scalar(-1))
+		x = b3Scalar(-1);
+	if (x > b3Scalar(1))
+		x = b3Scalar(1);
+	return acosf(x);
+}
+B3_FORCE_INLINE b3Scalar b3Asin(b3Scalar x)
+{
+	if (x < b3Scalar(-1))
+		x = b3Scalar(-1);
+	if (x > b3Scalar(1))
+		x = b3Scalar(1);
+	return asinf(x);
+}
+B3_FORCE_INLINE b3Scalar b3Atan(b3Scalar x) { return atanf(x); }
+// NOTE: the parameters are passed to atan2f in declaration order, so the first
+// parameter (named x here) is the one atan2f treats as its first argument.
+B3_FORCE_INLINE b3Scalar b3Atan2(b3Scalar x, b3Scalar y) { return atan2f(x, y); }
+B3_FORCE_INLINE b3Scalar b3Exp(b3Scalar x) { return expf(x); }
+B3_FORCE_INLINE b3Scalar b3Log(b3Scalar x) { return logf(x); }
+B3_FORCE_INLINE b3Scalar b3Pow(b3Scalar x, b3Scalar y) { return powf(x, y); }
+B3_FORCE_INLINE b3Scalar b3Fmod(b3Scalar x, b3Scalar y) { return fmodf(x, y); }
+
+#endif
+
+#define B3_2_PI b3Scalar(6.283185307179586232)
+#define B3_PI (B3_2_PI * b3Scalar(0.5))
+#define B3_HALF_PI (B3_2_PI * b3Scalar(0.25))
+#define B3_RADS_PER_DEG (B3_2_PI / b3Scalar(360.0))
+#define B3_DEGS_PER_RAD (b3Scalar(360.0) / B3_2_PI)
+#define B3_SQRT12 b3Scalar(0.7071067811865475244008443621048490)
+
+#define b3RecipSqrt(x) ((b3Scalar)(b3Scalar(1.0) / b3Sqrt(b3Scalar(x)))) /* reciprocal square root */
+
+#ifdef B3_USE_DOUBLE_PRECISION
+#define B3_EPSILON DBL_EPSILON
+#define B3_INFINITY DBL_MAX
+#else
+#define B3_EPSILON FLT_EPSILON
+#define B3_INFINITY FLT_MAX
+#endif
+
+/// Approximate atan2(y, x) from the ratio of (x -/+ |y|) terms, without calling
+/// a trigonometric function.
+/// @param y First atan2 argument; its sign decides the sign of the result
+/// @param x Second atan2 argument; its sign selects which of the two formulas is used
+/// @return The approximated angle; 0 when both arguments are zero
+B3_FORCE_INLINE b3Scalar b3Atan2Fast(b3Scalar y, b3Scalar x)
+{
+	b3Scalar coeff_1 = B3_PI / 4.0f;
+	b3Scalar coeff_2 = 3.0f * coeff_1;
+	b3Scalar abs_y = b3Fabs(y);
+	if (x == b3Scalar(0.0) && abs_y == b3Scalar(0.0))
+	{
+		// both formulas below would evaluate 0 / 0 here and hand back NaN
+		return b3Scalar(0.0);
+	}
+	b3Scalar angle;
+	if (x >= 0.0f)
+	{
+		b3Scalar r = (x - abs_y) / (x + abs_y);
+		angle = coeff_1 - coeff_1 * r;
+	}
+	else
+	{
+		// x < 0, so the denominator abs_y - x is strictly positive
+		b3Scalar r = (x + abs_y) / (abs_y - x);
+		angle = coeff_2 - coeff_1 * r;
+	}
+	return (y < 0.0f) ? -angle : angle;
+}
+
+/// True when |x| is smaller than B3_EPSILON.
+B3_FORCE_INLINE bool b3FuzzyZero(b3Scalar x) { return b3Fabs(x) < B3_EPSILON; }
+
+/// True when a lies within [-eps, eps]. Note that this compares a against zero
+/// with tolerance eps; it does not compare two values with each other.
+B3_FORCE_INLINE bool b3Equal(b3Scalar a, b3Scalar eps)
+{
+	return (((a) <= eps) && !((a) < -eps));
+}
+/// True when the test (a <= eps) fails, i.e. for a strictly greater than eps
+/// (and also whenever that comparison is false because an operand is NaN).
+B3_FORCE_INLINE bool b3GreaterEqual(b3Scalar a, b3Scalar eps)
+{
+	return (!((a) <= eps));
+}
+
+/// 1 when x is less than zero, otherwise 0.
+B3_FORCE_INLINE int b3IsNegative(b3Scalar x)
+{
+	return x < b3Scalar(0.0) ? 1 : 0;
+}
+
+/// Degrees to radians (multiplies by B3_RADS_PER_DEG) and radians to degrees
+/// (multiplies by B3_DEGS_PER_RAD).
+B3_FORCE_INLINE b3Scalar b3Radians(b3Scalar x) { return x * B3_RADS_PER_DEG; }
+B3_FORCE_INLINE b3Scalar b3Degrees(b3Scalar x) { return x * B3_DEGS_PER_RAD; }
+
+#define B3_DECLARE_HANDLE(name) \
+	typedef struct name##__     \
+	{                           \
+		int unused;             \
+	} * name
+
+#ifndef b3Fsel
+/// Portable stand-in used when no b3Fsel macro is defined: returns b when a >= 0, otherwise c.
+B3_FORCE_INLINE b3Scalar b3Fsel(b3Scalar a, b3Scalar b, b3Scalar c)
+{
+	if (a >= 0)
+	{
+		return b;
+	}
+	return c;
+}
+#endif
+#define b3Fsels(a, b, c) (b3Scalar) b3Fsel(a, b, c)
+
+/// Detect the byte order at run time by storing the integer 1 and inspecting the
+/// byte at its lowest address.
+B3_FORCE_INLINE bool b3MachineIsLittleEndian()
+{
+	const long int probe = 1;
+	const char *firstByte = (const char *)&probe;
+	// Lowest address contains the least significant byte on a little-endian machine
+	return firstByte[0] == 1;
+}
+
+///b3Select avoids branches, which makes performance much better for consoles like Playstation 3 and XBox 360
+///Thanks Phil Knight. See also http://www.cellperformance.com/articles/2006/04/more_techniques_for_eliminatin_1.html
+///Returns valueIfConditionNonZero when condition != 0, otherwise valueIfConditionZero.
+B3_FORCE_INLINE unsigned b3Select(unsigned condition, unsigned valueIfConditionNonZero, unsigned valueIfConditionZero)
+{
+	// Set testNz to 0xFFFFFFFF if condition is nonzero, 0x00000000 if condition is zero.
+	// A nonzero value or'ed with its negation has the top bit set; zero or'ed with its
+	// negation (still zero) has it clear. Everything is done in unsigned arithmetic:
+	// negating (int)condition overflows for 0x80000000 and right-shifting a negative int
+	// is implementation-defined, whereas unsigned wrap-around and shifts are well defined.
+	// Like the shift count itself, this assumes a 32-bit unsigned.
+	unsigned topBit = (condition | (0u - condition)) >> 31;  // 1 if nonzero, else 0
+	unsigned testNz = 0u - topBit;                           // spread that bit over all 32 bits
+	unsigned testEqz = ~testNz;
+	return ((valueIfConditionNonZero & testNz) | (valueIfConditionZero & testEqz));
+}
+/// Branch-free select for int values: returns valueIfConditionNonZero when
+/// condition != 0, otherwise valueIfConditionZero.
+B3_FORCE_INLINE int b3Select(unsigned condition, int valueIfConditionNonZero, int valueIfConditionZero)
+{
+	// Build an all-ones / all-zeros mask in unsigned arithmetic only: negating
+	// (int)condition overflows for 0x80000000 and right-shifting a negative int is
+	// implementation-defined. Like the shift count, this assumes a 32-bit unsigned.
+	unsigned topBit = (condition | (0u - condition)) >> 31;  // 1 if nonzero, else 0
+	unsigned testNz = 0u - topBit;                           // 0xFFFFFFFF if nonzero, else 0
+	unsigned testEqz = ~testNz;
+	return static_cast<int>((valueIfConditionNonZero & testNz) | (valueIfConditionZero & testEqz));
+}
+/// Float overload: picks valueIfConditionNonZero when condition is nonzero, else valueIfConditionZero.
+B3_FORCE_INLINE float b3Select(unsigned condition, float valueIfConditionNonZero, float valueIfConditionZero)
+{
+#ifdef B3_HAVE_NATIVE_FSEL
+	// (condition - 1) is >= 0 only for a nonzero condition, which is the case b3Fsel maps to its first value.
+	return (float)b3Fsel((b3Scalar)condition - b3Scalar(1.0f), valueIfConditionNonZero, valueIfConditionZero);
+#else
+	if (condition != 0)
+	{
+		return valueIfConditionNonZero;
+	}
+	return valueIfConditionZero;
+#endif
+}
+
+/// Exchanges the values of a and b through one temporary copy.
+template <typename T>
+B3_FORCE_INLINE void b3Swap(T &a, T &b)
+{
+	T previousA = a;
+	a = b;
+	b = previousA;
+}
+
+//PCK: endian swapping functions
+/// Reverses the order of the four low-order bytes of val.
+B3_FORCE_INLINE unsigned b3SwapEndian(unsigned val)
+{
+	const unsigned byte3ToByte0 = (val & 0xff000000) >> 24;
+	const unsigned byte2ToByte1 = (val & 0x00ff0000) >> 8;
+	const unsigned byte1ToByte2 = (val & 0x0000ff00) << 8;
+	const unsigned byte0ToByte3 = (val & 0x000000ff) << 24;
+	return (byte3ToByte0 | byte2ToByte1 | byte1ToByte2 | byte0ToByte3);
+}
+
+/// Exchanges the two bytes of a 16-bit value.
+B3_FORCE_INLINE unsigned short b3SwapEndian(unsigned short val)
+{
+	const int highToLow = (val & 0xff00) >> 8;
+	const int lowToHigh = (val & 0x00ff) << 8;
+	return static_cast<unsigned short>(highToLow | lowToHigh);
+}
+
+/// Signed convenience overload: converts val to unsigned and forwards to the unsigned overload.
+B3_FORCE_INLINE unsigned b3SwapEndian(int val)
+{
+	const unsigned bits = (unsigned)val;
+	return b3SwapEndian(bits);
+}
+
+/// Signed convenience overload: converts val to unsigned short and forwards to the unsigned short overload.
+B3_FORCE_INLINE unsigned short b3SwapEndian(short val)
+{
+	const unsigned short bits = (unsigned short)val;
+	return b3SwapEndian(bits);
+}
+
+///b3SwapEndianFloat reverses the byte order of a float using char pointers.
+///It deliberately does NOT return a float, because the machine might 'correct' invalid floating point values:
+///not all values of sign/exponent/mantissa are valid floating point numbers according to IEEE 754.
+///When a floating point unit is faced with an invalid value, it may actually change the value, or worse, throw an exception.
+///In most systems, running user mode code, you wouldn't get an exception, but instead the hardware/os/runtime will 'fix' the number for you.
+///So the swapped bytes are returned packed in an unsigned int (only the first four bytes of each object are touched).
+B3_FORCE_INLINE unsigned int b3SwapEndianFloat(float d)
+{
+	unsigned int swapped = 0;
+	unsigned char *out = (unsigned char *)&swapped;
+	const unsigned char *in = (const unsigned char *)&d;
+
+	for (int i = 0; i < 4; ++i)
+	{
+		out[i] = in[3 - i];
+	}
+	return swapped;
+}
+
+// Rebuilds a float from a byte-swapped 32-bit pattern by reversing its four bytes (char pointer copy).
+B3_FORCE_INLINE float b3UnswapEndianFloat(unsigned int a)
+{
+	float restored = 0.0f;
+	const unsigned char *in = (const unsigned char *)&a;
+	unsigned char *out = (unsigned char *)&restored;
+
+	for (int i = 0; i < 4; ++i)
+	{
+		out[i] = in[3 - i];
+	}
+
+	return restored;
+}
+
+// Writes the eight bytes of d into dst in reversed order; dst must have room for 8 bytes.
+B3_FORCE_INLINE void b3SwapEndianDouble(double d, unsigned char *dst)
+{
+	const unsigned char *in = (const unsigned char *)&d;
+
+	for (int i = 0; i < 8; ++i)
+	{
+		dst[i] = in[7 - i];
+	}
+}
+
+// Rebuilds a double from eight byte-swapped bytes at src by copying them in reversed order.
+B3_FORCE_INLINE double b3UnswapEndianDouble(const unsigned char *src)
+{
+	double restored = 0.0;
+	unsigned char *out = (unsigned char *)&restored;
+
+	for (int i = 0; i < 8; ++i)
+	{
+		out[i] = src[7 - i];
+	}
+
+	return restored;
+}
+
+// returns normalized value in range [-B3_PI, B3_PI]:
+// the angle is first reduced with b3Fmod by B3_2_PI, then shifted by one full turn if it still lies outside.
+B3_FORCE_INLINE b3Scalar b3NormalizeAngle(b3Scalar angleInRadians)
+{
+	const b3Scalar reduced = b3Fmod(angleInRadians, B3_2_PI);
+	if (reduced < -B3_PI)
+	{
+		return reduced + B3_2_PI;
+	}
+	if (reduced > B3_PI)
+	{
+		return reduced - B3_2_PI;
+	}
+	return reduced;
+}
+
+///Minimal struct that carries an integer type tag
+struct b3TypedObject
+{
+	/// Tag supplied at construction time.
+	int m_objectType;
+
+	b3TypedObject(int objectType)
+	{
+		m_objectType = objectType;
+	}
+
+	/// Returns the stored type tag.
+	int getObjectType() const { return m_objectType; }
+};
+
+///align a pointer to the provided alignment, upwards
+///`alignment` must be a non-zero power of two: the rounding relies on the mask ~(alignment - 1).
+///A pointer that is already aligned is returned unchanged.
+template <typename T>
+T *b3AlignPointer(T *unalignedPtr, size_t alignment)
+{
+	// Go through an integer with reinterpret_cast instead of writing one union member and reading
+	// another, which is undefined behaviour in C++.
+	const size_t bit_mask = ~(alignment - 1);
+	size_t address = reinterpret_cast<size_t>(unalignedPtr);
+	// Bump past the next boundary, then clear the low bits to land exactly on it.
+	address += alignment - 1;
+	address &= bit_mask;
+	return reinterpret_cast<T *>(address);
+}
+
+#endif  //B3_SCALAR_H

+ 118 - 0
Dependencies/include/bullet3/Bullet3Common/b3StackAlloc.h

@@ -0,0 +1,118 @@
+/*
+Copyright (c) 2003-2013 Gino van den Bergen / Erwin Coumans  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+/*
+StackAlloc extracted from GJK-EPA collision solver by Nathanael Presson
+Nov.2006
+*/
+
+#ifndef B3_STACK_ALLOC
+#define B3_STACK_ALLOC
+
+#include "b3Scalar.h"  //for b3Assert
+#include "b3AlignedAllocator.h"
+
+///The b3Block class is an internal structure for the b3StackAlloc memory allocator.
+struct b3Block
+{
+	b3Block* previous;  // block that was current when this one was begun; b3StackAlloc::endBlock makes it current again
+	unsigned char* address;  // stack top (data + usedsize) recorded right after this header was allocated
+};
+
+///The StackAlloc class provides some fast stack-based memory allocator (LIFO last-in first-out)
+///All memory is carved from a single buffer obtained with b3AlignedAlloc(size, 16).
+///beginBlock()/endBlock() bracket a group of allocations: endBlock() releases everything allocated
+///since the matching beginBlock(). The allocator owns its buffer and therefore cannot be copied.
+class b3StackAlloc
+{
+public:
+	/// Creates an allocator backed by a buffer of `size` bytes.
+	b3StackAlloc(unsigned int size)
+	{
+		ctor();
+		create(size);
+	}
+	~b3StackAlloc() { destroy(); }
+
+	/// Releases the current buffer and allocates a new one of `size` bytes.
+	/// If allocations are still outstanding (destroy() refused to release), the existing buffer is kept.
+	inline void create(unsigned int size)
+	{
+		destroy();
+		if (usedsize != 0)
+		{
+			// Still in use: overwriting `data` here would leak the buffer and orphan live allocations.
+			return;
+		}
+		data = (unsigned char*)b3AlignedAlloc(size, 16);
+		// Only advertise capacity that was actually obtained.
+		totalsize = data ? size : 0;
+	}
+	/// Frees the buffer. Asserts (and does nothing) while allocations are still outstanding.
+	inline void destroy()
+	{
+		b3Assert(usedsize == 0);
+		//Raise(L"StackAlloc is still in use");
+
+		if (usedsize == 0)
+		{
+			if (!ischild && data)
+				b3AlignedFree(data);
+
+			// Reset the complete state so that a later allocate() cannot hand out
+			// pointers computed from a null buffer with a stale capacity.
+			data = 0;
+			totalsize = 0;
+			usedsize = 0;
+			current = 0;
+		}
+	}
+
+	/// Returns the number of bytes between the current stack top and the end of the buffer.
+	int getAvailableMemory() const
+	{
+		return static_cast<int>(totalsize - usedsize);
+	}
+
+	/// Reserves `size` bytes on top of the stack.
+	/// Returns a pointer to the reserved bytes, or 0 (after asserting) when the request does not fit.
+	unsigned char* allocate(unsigned int size)
+	{
+		// Compare against the remaining space so that usedsize + size cannot wrap around.
+		// The comparison is strict: a request that would fill the buffer exactly is rejected.
+		if (data && size < totalsize - usedsize)
+		{
+			unsigned char* const result = data + usedsize;
+			usedsize += size;
+			return result;
+		}
+		b3Assert(0);
+		//&& (L"Not enough memory"));
+
+		return (0);
+	}
+	/// Pushes a block header on the stack and makes it the current block.
+	/// Returns 0 when there is no room left for the header.
+	B3_FORCE_INLINE b3Block* beginBlock()
+	{
+		b3Block* pb = (b3Block*)allocate(sizeof(b3Block));
+		if (!pb)
+		{
+			// allocate() already asserted; do not write through a null header.
+			return 0;
+		}
+		pb->previous = current;
+		pb->address = data + usedsize;
+		current = pb;
+		return (pb);
+	}
+	/// Pops `block`, which must be the current block, and releases everything allocated since
+	/// the matching beginBlock(), including the header itself. Mismatched or null blocks are ignored.
+	B3_FORCE_INLINE void endBlock(b3Block* block)
+	{
+		b3Assert(block == current);
+		//Raise(L"Unmatched blocks");
+		if (block && block == current)
+		{
+			current = block->previous;
+			usedsize = (unsigned int)((block->address - data) - sizeof(b3Block));
+		}
+	}
+
+private:
+	// Not copyable: a copy would free the same buffer a second time. Declared but never defined.
+	b3StackAlloc(const b3StackAlloc&);
+	b3StackAlloc& operator=(const b3StackAlloc&);
+
+	// Puts every member into the empty state; called once from the constructor.
+	void ctor()
+	{
+		data = 0;
+		totalsize = 0;
+		usedsize = 0;
+		current = 0;
+		ischild = false;
+	}
+	unsigned char* data;     // start of the buffer, 0 when none is held
+	unsigned int totalsize;  // capacity of the buffer in bytes
+	unsigned int usedsize;   // bytes currently handed out (offset of the stack top)
+	b3Block* current;        // innermost open block, 0 when none
+	bool ischild;            // when true the buffer is not freed by destroy(); only ever set to false here
+};
+
+#endif  //B3_STACK_ALLOC

+ 286 - 0
Dependencies/include/bullet3/Bullet3Common/b3Transform.h

@@ -0,0 +1,286 @@
+/*
+Copyright (c) 2003-2013 Gino van den Bergen / Erwin Coumans  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef B3_TRANSFORM_H
+#define B3_TRANSFORM_H
+
+#include "b3Matrix3x3.h"
+
+#ifdef B3_USE_DOUBLE_PRECISION
+#define b3TransformData b3TransformDoubleData
+#else
+#define b3TransformData b3TransformFloatData
+#endif
+
+/**@brief The b3Transform class supports rigid transforms with only translation and rotation and no scaling/shear.
+ *It can be used in combination with b3Vector3, b3Quaternion and b3Matrix3x3 linear algebra classes.
+ *The rotation is stored in m_basis and the translation in m_origin; inverse() and invXform() use the
+ *transpose of m_basis as its inverse, so the basis is expected to stay a pure rotation. */
+B3_ATTRIBUTE_ALIGNED16(class)
+b3Transform
+{
+	///Storage for the rotation
+	b3Matrix3x3 m_basis;
+	///Storage for the translation
+	b3Vector3 m_origin;
+
+public:
+	/**@brief No initialization constructor: neither m_basis nor m_origin is assigned here */
+	b3Transform() {}
+	/**@brief Constructor from b3Quaternion (optional b3Vector3 )
+   * @param q Rotation from quaternion 
+   * @param c Translation from Vector (default 0,0,0) */
+	explicit B3_FORCE_INLINE b3Transform(const b3Quaternion& q,
+										 const b3Vector3& c = b3MakeVector3(b3Scalar(0), b3Scalar(0), b3Scalar(0)))
+		: m_basis(q),
+		  m_origin(c)
+	{
+	}
+
+	/**@brief Constructor from b3Matrix3x3 (optional b3Vector3)
+   * @param b Rotation from Matrix 
+   * @param c Translation from Vector default (0,0,0)*/
+	explicit B3_FORCE_INLINE b3Transform(const b3Matrix3x3& b,
+										 const b3Vector3& c = b3MakeVector3(b3Scalar(0), b3Scalar(0), b3Scalar(0)))
+		: m_basis(b),
+		  m_origin(c)
+	{
+	}
+	/**@brief Copy constructor */
+	B3_FORCE_INLINE b3Transform(const b3Transform& other)
+		: m_basis(other.m_basis),
+		  m_origin(other.m_origin)
+	{
+	}
+	/**@brief Assignment Operator */
+	B3_FORCE_INLINE b3Transform& operator=(const b3Transform& other)
+	{
+		m_basis = other.m_basis;
+		m_origin = other.m_origin;
+		return *this;
+	}
+
+	/**@brief Set the current transform as the value of the product of two transforms
+   * @param t1 Transform 1
+   * @param t2 Transform 2
+   * This = Transform1 * Transform2 */
+	B3_FORCE_INLINE void mult(const b3Transform& t1, const b3Transform& t2)
+	{
+		m_basis = t1.m_basis * t2.m_basis;
+		m_origin = t1(t2.m_origin);  // t2's origin expressed through t1 becomes the combined translation
+	}
+
+	/*		void multInverseLeft(const b3Transform& t1, const b3Transform& t2) {
+			b3Vector3 v = t2.m_origin - t1.m_origin;
+			m_basis = b3MultTransposeLeft(t1.m_basis, t2.m_basis);
+			m_origin = v * t1.m_basis;
+		}
+		*/
+
+	/**@brief Return the transform of the vector: x.dot3() against the three rows of m_basis, plus m_origin */
+	B3_FORCE_INLINE b3Vector3 operator()(const b3Vector3& x) const
+	{
+		return x.dot3(m_basis[0], m_basis[1], m_basis[2]) + m_origin;
+	}
+
+	/**@brief Return the transform of the vector (same result as operator()) */
+	B3_FORCE_INLINE b3Vector3 operator*(const b3Vector3& x) const
+	{
+		return (*this)(x);
+	}
+
+	/**@brief Return the transform of the b3Quaternion: this transform's rotation multiplied by q (the translation is not involved) */
+	B3_FORCE_INLINE b3Quaternion operator*(const b3Quaternion& q) const
+	{
+		return getRotation() * q;
+	}
+
+	/**@brief Return the basis matrix for the rotation (mutable reference) */
+	B3_FORCE_INLINE b3Matrix3x3& getBasis() { return m_basis; }
+	/**@brief Return the basis matrix for the rotation (read-only reference) */
+	B3_FORCE_INLINE const b3Matrix3x3& getBasis() const { return m_basis; }
+
+	/**@brief Return the origin vector translation (mutable reference) */
+	B3_FORCE_INLINE b3Vector3& getOrigin() { return m_origin; }
+	/**@brief Return the origin vector translation (read-only reference) */
+	B3_FORCE_INLINE const b3Vector3& getOrigin() const { return m_origin; }
+
+	/**@brief Return a quaternion representing the rotation, extracted from m_basis on every call */
+	b3Quaternion getRotation() const
+	{
+		b3Quaternion q;
+		m_basis.getRotation(q);
+		return q;
+	}
+
+	/**@brief Set from an array 
+   * @param m A pointer to an array of at least 15 elements. The array is handed to
+   * b3Matrix3x3::setFromOpenGLSubMatrix for the rotation (presumably its first 12 elements, a 3x3
+   * matrix padded to 4 entries per group - confirm against b3Matrix3x3), and m[12], m[13], m[14]
+   * are read as the translation. m[15] is not read. */
+	void setFromOpenGLMatrix(const b3Scalar* m)
+	{
+		m_basis.setFromOpenGLSubMatrix(m);
+		m_origin.setValue(m[12], m[13], m[14]);
+	}
+
+	/**@brief Fill an array representation
+   * @param m A pointer to an array of at least 16 elements. The rotation is written by
+   * b3Matrix3x3::getOpenGLSubMatrix (presumably into the first 12 elements - confirm against
+   * b3Matrix3x3), the translation goes to m[12], m[13], m[14], and m[15] is set to 1. */
+	void getOpenGLMatrix(b3Scalar * m) const
+	{
+		m_basis.getOpenGLSubMatrix(m);
+		m[12] = m_origin.getX();
+		m[13] = m_origin.getY();
+		m[14] = m_origin.getZ();
+		m[15] = b3Scalar(1.0);
+	}
+
+	/**@brief Set the translational element
+   * @param origin The vector to set the translation to */
+	B3_FORCE_INLINE void setOrigin(const b3Vector3& origin)
+	{
+		m_origin = origin;
+	}
+
+	B3_FORCE_INLINE b3Vector3 invXform(const b3Vector3& inVec) const;  // applies the inverse transform to inVec; defined after the class
+
+	/**@brief Set the rotational element by b3Matrix3x3 */
+	B3_FORCE_INLINE void setBasis(const b3Matrix3x3& basis)
+	{
+		m_basis = basis;
+	}
+
+	/**@brief Set the rotational element by b3Quaternion */
+	B3_FORCE_INLINE void setRotation(const b3Quaternion& q)
+	{
+		m_basis.setRotation(q);
+	}
+
+	/**@brief Set this transformation to the identity */
+	void setIdentity()
+	{
+		m_basis.setIdentity();
+		m_origin.setValue(b3Scalar(0.0), b3Scalar(0.0), b3Scalar(0.0));
+	}
+
+	/**@brief Multiply this Transform by another(this = this * another) 
+   * @param t The other transform */
+	b3Transform& operator*=(const b3Transform& t)
+	{
+		m_origin += m_basis * t.m_origin;  // must use the basis from before the update on the next line
+		m_basis *= t.m_basis;
+		return *this;
+	}
+
+	/**@brief Return the inverse of this transform, built from the transpose of m_basis and the negated origin rotated by that transpose */
+	b3Transform inverse() const
+	{
+		b3Matrix3x3 inv = m_basis.transpose();
+		return b3Transform(inv, inv * -m_origin);
+	}
+
+	/**@brief Return the inverse of this transform times the other transform
+   * @param t The other transform 
+   * return this.inverse() * the other */
+	b3Transform inverseTimes(const b3Transform& t) const;
+
+	/**@brief Return the product of this transform and the other */
+	b3Transform operator*(const b3Transform& t) const;
+
+	/**@brief Return an identity transform (a function-local static built from b3Matrix3x3::getIdentity() with the default zero translation) */
+	static const b3Transform& getIdentity()
+	{
+		static const b3Transform identityTransform(b3Matrix3x3::getIdentity());
+		return identityTransform;
+	}
+
+	void serialize(struct b3TransformData & dataOut) const;  // b3TransformData is #defined to the float or double data struct depending on B3_USE_DOUBLE_PRECISION
+
+	void serializeFloat(struct b3TransformFloatData & dataOut) const;  // writes basis and origin through their serializeFloat members
+
+	void deSerialize(const struct b3TransformData& dataIn);  // counterpart of serialize()
+
+	void deSerializeDouble(const struct b3TransformDoubleData& dataIn);  // reads basis and origin through their deSerializeDouble members
+
+	void deSerializeFloat(const struct b3TransformFloatData& dataIn);  // reads basis and origin through their deSerializeFloat members
+};
+
+/// Applies the inverse of this transform to inVec: subtracts the origin, then multiplies by the transposed basis.
+B3_FORCE_INLINE b3Vector3
+b3Transform::invXform(const b3Vector3& inVec) const
+{
+	b3Vector3 untranslated = inVec - m_origin;
+	const b3Matrix3x3 basisTransposed = m_basis.transpose();
+	return (basisTransposed * untranslated);
+}
+
+/// Builds the transform made of m_basis.transposeTimes(t.m_basis) as rotation and
+/// (t's origin - this origin) * m_basis as translation, i.e. this->inverse() * t without forming the inverse.
+B3_FORCE_INLINE b3Transform
+b3Transform::inverseTimes(const b3Transform& t) const
+{
+	b3Vector3 originDelta = t.getOrigin() - m_origin;
+	const b3Matrix3x3 relativeBasis = m_basis.transposeTimes(t.m_basis);
+	return b3Transform(relativeBasis, originDelta * m_basis);
+}
+
+/// Composes two transforms: the bases are multiplied and t's origin is mapped through this transform.
+B3_FORCE_INLINE b3Transform
+	b3Transform::operator*(const b3Transform& t) const
+{
+	const b3Matrix3x3 combinedBasis = m_basis * t.m_basis;
+	const b3Vector3 combinedOrigin = (*this)(t.m_origin);
+	return b3Transform(combinedBasis, combinedOrigin);
+}
+
+/**@brief Test if two transforms have all elements equal (bases are compared first, origins only if the bases match) */
+B3_FORCE_INLINE bool operator==(const b3Transform& t1, const b3Transform& t2)
+{
+	if (!(t1.getBasis() == t2.getBasis()))
+	{
+		return false;
+	}
+	return (t1.getOrigin() == t2.getOrigin());
+}
+
+///for serialization: single-precision image of a b3Transform, filled by serializeFloat() and read by deSerializeFloat()
+struct b3TransformFloatData
+{
+	b3Matrix3x3FloatData m_basis;  // serialized form of b3Transform::m_basis
+	b3Vector3FloatData m_origin;  // serialized form of b3Transform::m_origin
+};
+
+struct b3TransformDoubleData  // double-precision serialization image of a b3Transform, read by deSerializeDouble()
+{
+	b3Matrix3x3DoubleData m_basis;  // serialized form of b3Transform::m_basis
+	b3Vector3DoubleData m_origin;  // serialized form of b3Transform::m_origin
+};
+
+B3_FORCE_INLINE void b3Transform::serialize(b3TransformData& dataOut) const
+{  // Writes this transform into dataOut in the build's native precision (b3TransformData is a macro for the float or double struct).
+	m_basis.serialize(dataOut.m_basis);  // rotation part
+	m_origin.serialize(dataOut.m_origin);  // translation part
+}
+
+B3_FORCE_INLINE void b3Transform::serializeFloat(b3TransformFloatData& dataOut) const
+{  // Writes this transform into the single-precision struct dataOut, whatever precision the build uses.
+	m_basis.serializeFloat(dataOut.m_basis);  // rotation part
+	m_origin.serializeFloat(dataOut.m_origin);  // translation part
+}
+
+B3_FORCE_INLINE void b3Transform::deSerialize(const b3TransformData& dataIn)
+{  // Loads this transform from dataIn in the build's native precision (b3TransformData is a macro for the float or double struct).
+	m_basis.deSerialize(dataIn.m_basis);  // rotation part
+	m_origin.deSerialize(dataIn.m_origin);  // translation part
+}
+
+B3_FORCE_INLINE void b3Transform::deSerializeFloat(const b3TransformFloatData& dataIn)
+{  // Loads this transform from the single-precision struct dataIn.
+	m_basis.deSerializeFloat(dataIn.m_basis);  // rotation part
+	m_origin.deSerializeFloat(dataIn.m_origin);  // translation part
+}
+
+B3_FORCE_INLINE void b3Transform::deSerializeDouble(const b3TransformDoubleData& dataIn)
+{  // Loads this transform from the double-precision struct dataIn.
+	m_basis.deSerializeDouble(dataIn.m_basis);  // rotation part
+	m_origin.deSerializeDouble(dataIn.m_origin);  // translation part
+}
+
+#endif  //B3_TRANSFORM_H

+ 210 - 0
Dependencies/include/bullet3/Bullet3Common/b3TransformUtil.h

@@ -0,0 +1,210 @@
+/*
+Copyright (c) 2003-2013 Gino van den Bergen / Erwin Coumans  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef B3_TRANSFORM_UTIL_H
+#define B3_TRANSFORM_UTIL_H
+
+#include "b3Transform.h"
+// Largest rotation angle applied in a single integration step (half of B3_HALF_PI).
+// The body is parenthesized so the macro behaves as one operand in any expression,
+// e.g. as the divisor in `x / B3_ANGULAR_MOTION_THRESHOLD`.
+#define B3_ANGULAR_MOTION_THRESHOLD (b3Scalar(0.5) * B3_HALF_PI)
+
+/// Picks the corner of an origin-centred box with the given half extents that lies furthest along supportDir:
+/// each component is the half extent, negated where the matching supportDir component is below zero.
+B3_FORCE_INLINE b3Vector3 b3AabbSupport(const b3Vector3& halfExtents, const b3Vector3& supportDir)
+{
+	b3Scalar cornerX = halfExtents.getX();
+	if (supportDir.getX() < b3Scalar(0.0))
+	{
+		cornerX = -halfExtents.getX();
+	}
+
+	b3Scalar cornerY = halfExtents.getY();
+	if (supportDir.getY() < b3Scalar(0.0))
+	{
+		cornerY = -halfExtents.getY();
+	}
+
+	b3Scalar cornerZ = halfExtents.getZ();
+	if (supportDir.getZ() < b3Scalar(0.0))
+	{
+		cornerZ = -halfExtents.getZ();
+	}
+
+	return b3MakeVector3(cornerX, cornerY, cornerZ);
+}
+
+/// Utils related to temporal transforms: integrating a transform over a time step and recovering linear/angular velocity from two poses
+class b3TransformUtil
+{
+public:
+	static void integrateTransform(const b3Transform& curTrans, const b3Vector3& linvel, const b3Vector3& angvel, b3Scalar timeStep, b3Transform& predictedTransform)
+	{  // Advances curTrans by linvel and angvel over timeStep and stores the result in predictedTransform.
+		predictedTransform.setOrigin(curTrans.getOrigin() + linvel * timeStep);
+		//	#define QUATERNION_DERIVATIVE
+#ifdef QUATERNION_DERIVATIVE
+		b3Quaternion predictedOrn = curTrans.getRotation();
+		predictedOrn += (angvel * predictedOrn) * (timeStep * b3Scalar(0.5));
+		predictedOrn.normalize();
+#else
+		//Exponential map
+		//google for "Practical Parameterization of Rotations Using the Exponential Map", F. Sebastian Grassia
+
+		b3Vector3 axis;
+		b3Scalar fAngle = angvel.length();
+		//limit the angular motion: cap the angle covered in this step at B3_ANGULAR_MOTION_THRESHOLD
+		if (fAngle * timeStep > B3_ANGULAR_MOTION_THRESHOLD)
+		{
+			fAngle = B3_ANGULAR_MOTION_THRESHOLD / timeStep;
+		}
+
+		if (fAngle < b3Scalar(0.001))
+		{
+			// use the Taylor expansion of sin(0.5 * fAngle * timeStep) / fAngle ~= 0.5 * timeStep - timeStep^3 * fAngle^2 / 48, which avoids dividing by a tiny fAngle
+			axis = angvel * (b3Scalar(0.5) * timeStep - (timeStep * timeStep * timeStep) * (b3Scalar(0.020833333333)) * fAngle * fAngle);
+		}
+		else
+		{
+			// exact scale factor: sin(0.5 * fAngle * timeStep) / fAngle
+			axis = angvel * (b3Sin(b3Scalar(0.5) * fAngle * timeStep) / fAngle);
+		}
+		b3Quaternion dorn(axis.getX(), axis.getY(), axis.getZ(), b3Cos(fAngle * timeStep * b3Scalar(0.5)));
+		b3Quaternion orn0 = curTrans.getRotation();
+
+		b3Quaternion predictedOrn = dorn * orn0;  // incremental rotation applied on the left of the current orientation
+		predictedOrn.normalize();
+#endif
+		predictedTransform.setRotation(predictedOrn);
+	}
+
+	static void calculateVelocityQuaternion(const b3Vector3& pos0, const b3Vector3& pos1, const b3Quaternion& orn0, const b3Quaternion& orn1, b3Scalar timeStep, b3Vector3& linVel, b3Vector3& angVel)
+	{  // Finite-difference velocities between two poses given as position + quaternion; timeStep is used as a divisor and is not checked for zero.
+		linVel = (pos1 - pos0) / timeStep;
+		b3Vector3 axis;
+		b3Scalar angle;
+		if (orn0 != orn1)
+		{
+			calculateDiffAxisAngleQuaternion(orn0, orn1, axis, angle);
+			angVel = axis * angle / timeStep;
+		}
+		else
+		{
+			angVel.setValue(0, 0, 0);  // identical orientations: no angular velocity
+		}
+	}
+
+	static void calculateDiffAxisAngleQuaternion(const b3Quaternion& orn0, const b3Quaternion& orn1a, b3Vector3& axis, b3Scalar& angle)
+	{  // Outputs the axis (unit length) and angle of the rotation orn1 * inverse(orn0), with orn1 = orn0.nearest(orn1a).
+		b3Quaternion orn1 = orn0.nearest(orn1a);
+		b3Quaternion dorn = orn1 * orn0.inverse();
+		angle = dorn.getAngle();
+		axis = b3MakeVector3(dorn.getX(), dorn.getY(), dorn.getZ());
+		axis[3] = b3Scalar(0.);
+		//check for axis length; fall back to the x axis when it is too short to normalize
+		b3Scalar len = axis.length2();
+		if (len < B3_EPSILON * B3_EPSILON)
+			axis = b3MakeVector3(b3Scalar(1.), b3Scalar(0.), b3Scalar(0.));
+		else
+			axis /= b3Sqrt(len);
+	}
+
+	static void calculateVelocity(const b3Transform& transform0, const b3Transform& transform1, b3Scalar timeStep, b3Vector3& linVel, b3Vector3& angVel)
+	{  // Finite-difference velocities between two transforms; timeStep is used as a divisor and is not checked for zero.
+		linVel = (transform1.getOrigin() - transform0.getOrigin()) / timeStep;
+		b3Vector3 axis;
+		b3Scalar angle;
+		calculateDiffAxisAngle(transform0, transform1, axis, angle);
+		angVel = axis * angle / timeStep;
+	}
+
+	static void calculateDiffAxisAngle(const b3Transform& transform0, const b3Transform& transform1, b3Vector3& axis, b3Scalar& angle)
+	{  // Outputs the axis (unit length) and angle of the rotation basis1 * inverse(basis0).
+		b3Matrix3x3 dmat = transform1.getBasis() * transform0.getBasis().inverse();
+		b3Quaternion dorn;
+		dmat.getRotation(dorn);
+
+		///floating point inaccuracy can lead to w component > 1..., which presumably breaks the angle extraction below, hence the normalize
+		dorn.normalize();
+
+		angle = dorn.getAngle();
+		axis = b3MakeVector3(dorn.getX(), dorn.getY(), dorn.getZ());
+		axis[3] = b3Scalar(0.);
+		//check for axis length; fall back to the x axis when it is too short to normalize
+		b3Scalar len = axis.length2();
+		if (len < B3_EPSILON * B3_EPSILON)
+			axis = b3MakeVector3(b3Scalar(1.), b3Scalar(0.), b3Scalar(0.));
+		else
+			axis /= b3Sqrt(len);
+	}
+};
+
+///The b3ConvexSeparatingDistanceUtil can help speed up convex collision detection
+///by conservatively updating a cached separating distance/vector instead of re-calculating the closest distance.
+///initSeparatingDistance() stores an exact distance together with both poses; every later
+///updateSeparatingDistance() shrinks the cached distance by an upper bound on the motion since the stored poses.
+class b3ConvexSeparatingDistanceUtil
+{
+	// Poses recorded by the most recent init/update call.
+	b3Quaternion m_ornA;
+	b3Quaternion m_ornB;
+	b3Vector3 m_posA;
+	b3Vector3 m_posB;
+
+	// Direction along which the relative linear motion is measured.
+	b3Vector3 m_separatingNormal;
+
+	b3Scalar m_boundingRadiusA;
+	b3Scalar m_boundingRadiusB;
+	b3Scalar m_separatingDistance;
+
+public:
+	/// Stores both bounding radii and starts with a cached distance of zero.
+	b3ConvexSeparatingDistanceUtil(b3Scalar boundingRadiusA, b3Scalar boundingRadiusB)
+		: m_boundingRadiusA(boundingRadiusA),
+		  m_boundingRadiusB(boundingRadiusB),
+		  m_separatingDistance(0.f)
+	{
+	}
+
+	/// Returns the cached separating distance.
+	b3Scalar getConservativeSeparatingDistance()
+	{
+		return m_separatingDistance;
+	}
+
+	/// Reduces the cached distance by the motion bound between the stored poses and the given ones
+	/// (only while the cached distance is positive), then stores the given poses.
+	void updateSeparatingDistance(const b3Transform& transA, const b3Transform& transB)
+	{
+		const b3Vector3& currentPosA = transA.getOrigin();
+		const b3Vector3& currentPosB = transB.getOrigin();
+		b3Quaternion currentOrnA = transA.getRotation();
+		b3Quaternion currentOrnB = transB.getRotation();
+
+		if (m_separatingDistance > 0.f)
+		{
+			// Velocities over a unit time step are simply the pose deltas.
+			b3Vector3 deltaLinA, deltaAngA, deltaLinB, deltaAngB;
+			b3TransformUtil::calculateVelocityQuaternion(m_posA, currentPosA, m_ornA, currentOrnA, b3Scalar(1.), deltaLinA, deltaAngA);
+			b3TransformUtil::calculateVelocityQuaternion(m_posB, currentPosB, m_ornB, currentOrnB, b3Scalar(1.), deltaLinB, deltaAngB);
+
+			b3Scalar angularBound = deltaAngA.length() * m_boundingRadiusA + deltaAngB.length() * m_boundingRadiusB;
+			b3Vector3 relativeLinear = (deltaLinB - deltaLinA);
+			b3Scalar alongNormal = relativeLinear.dot(m_separatingNormal);
+			// A negative projection is clamped to zero so it never increases the cached distance.
+			if (alongNormal < 0.f)
+			{
+				alongNormal = 0.f;
+			}
+
+			b3Scalar motionBound = angularBound + alongNormal;
+			m_separatingDistance -= motionBound;
+		}
+
+		m_posA = currentPosA;
+		m_posB = currentPosB;
+		m_ornA = currentOrnA;
+		m_ornB = currentOrnB;
+	}
+
+	/// Caches a freshly computed separating distance; the normal and both poses are stored only when it is positive.
+	void initSeparatingDistance(const b3Vector3& separatingVector, b3Scalar separatingDistance, const b3Transform& transA, const b3Transform& transB)
+	{
+		m_separatingDistance = separatingDistance;
+
+		if (!(m_separatingDistance > 0.f))
+		{
+			return;
+		}
+
+		m_separatingNormal = separatingVector;
+
+		b3Quaternion currentOrnA = transA.getRotation();
+		b3Quaternion currentOrnB = transB.getRotation();
+		m_posA = transA.getOrigin();
+		m_posB = transB.getOrigin();
+		m_ornA = currentOrnA;
+		m_ornB = currentOrnB;
+	}
+};
+
+#endif  //B3_TRANSFORM_UTIL_H

+ 1637 - 0
Dependencies/include/bullet3/Bullet3Common/b3Vector3.cpp

@@ -0,0 +1,1637 @@
+/*
+ Copyright (c) 2011-213 Apple Inc. http://bulletphysics.org
+
+ This software is provided 'as-is', without any express or implied warranty.
+ In no event will the authors be held liable for any damages arising from the use of this software.
+ Permission is granted to anyone to use this software for any purpose,
+ including commercial applications, and to alter it and redistribute it freely,
+ subject to the following restrictions:
+
+ 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+ 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+ 3. This notice may not be removed or altered from any source distribution.
+
+ This source version has been altered.
+ */
+
+#if defined(_WIN32) || defined(__i386__)
+#define B3_USE_SSE_IN_API
+#endif
+
+#include "b3Vector3.h"
+
+#if defined(B3_USE_SSE) || defined(B3_USE_NEON)
+
+#ifdef __APPLE__
+#include <stdint.h>
+typedef float float4 __attribute__((vector_size(16)));
+#else
+#define float4 __m128
+#endif
+//typedef  uint32_t uint4 __attribute__ ((vector_size(16)));
+
+#if defined B3_USE_SSE || defined _WIN32
+
+#define LOG2_ARRAY_SIZE 6
+#define STACK_ARRAY_COUNT (1UL << LOG2_ARRAY_SIZE)
+
+#include <emmintrin.h>
+
+long b3_maxdot_large(const float *vv, const float *vec, unsigned long count, float *dotResult);
+long b3_maxdot_large(const float *vv, const float *vec, unsigned long count, float *dotResult)
+{
+	const float4 *vertices = (const float4 *)vv;
+	static const unsigned char indexTable[16] = {(unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0};
+	float4 dotMax = b3Assign128(-B3_INFINITY, -B3_INFINITY, -B3_INFINITY, -B3_INFINITY);
+	float4 vvec = _mm_loadu_ps(vec);
+	float4 vHi = b3CastiTo128f(_mm_shuffle_epi32(b3CastfTo128i(vvec), 0xaa));  /// zzzz
+	float4 vLo = _mm_movelh_ps(vvec, vvec);                                    /// xyxy
+
+	long maxIndex = -1L;
+
+	size_t segment = 0;
+	float4 stack_array[STACK_ARRAY_COUNT];
+
+#if DEBUG
+	// memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) );
+#endif
+
+	size_t index;
+	float4 max;
+	// Faster loop without cleanup code for full tiles
+	for (segment = 0; segment + STACK_ARRAY_COUNT * 4 <= count; segment += STACK_ARRAY_COUNT * 4)
+	{
+		max = dotMax;
+
+		for (index = 0; index < STACK_ARRAY_COUNT; index += 4)
+		{  // do four dot products at a time. Carefully avoid touching the w element.
+			float4 v0 = vertices[0];
+			float4 v1 = vertices[1];
+			float4 v2 = vertices[2];
+			float4 v3 = vertices[3];
+			vertices += 4;
+
+			float4 lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
+			float4 hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
+			float4 lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
+			float4 hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
+
+			lo0 = lo0 * vLo;
+			lo1 = lo1 * vLo;
+			float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
+			float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
+			float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+			z = z * vHi;
+			x = x + y;
+			x = x + z;
+			stack_array[index] = x;
+			max = _mm_max_ps(x, max);  // control the order here so that max is never NaN even if x is nan
+
+			v0 = vertices[0];
+			v1 = vertices[1];
+			v2 = vertices[2];
+			v3 = vertices[3];
+			vertices += 4;
+
+			lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
+			hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
+			lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
+			hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
+
+			lo0 = lo0 * vLo;
+			lo1 = lo1 * vLo;
+			z = _mm_shuffle_ps(hi0, hi1, 0x88);
+			x = _mm_shuffle_ps(lo0, lo1, 0x88);
+			y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+			z = z * vHi;
+			x = x + y;
+			x = x + z;
+			stack_array[index + 1] = x;
+			max = _mm_max_ps(x, max);  // control the order here so that max is never NaN even if x is nan
+
+			v0 = vertices[0];
+			v1 = vertices[1];
+			v2 = vertices[2];
+			v3 = vertices[3];
+			vertices += 4;
+
+			lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
+			hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
+			lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
+			hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
+
+			lo0 = lo0 * vLo;
+			lo1 = lo1 * vLo;
+			z = _mm_shuffle_ps(hi0, hi1, 0x88);
+			x = _mm_shuffle_ps(lo0, lo1, 0x88);
+			y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+			z = z * vHi;
+			x = x + y;
+			x = x + z;
+			stack_array[index + 2] = x;
+			max = _mm_max_ps(x, max);  // control the order here so that max is never NaN even if x is nan
+
+			v0 = vertices[0];
+			v1 = vertices[1];
+			v2 = vertices[2];
+			v3 = vertices[3];
+			vertices += 4;
+
+			lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
+			hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
+			lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
+			hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
+
+			lo0 = lo0 * vLo;
+			lo1 = lo1 * vLo;
+			z = _mm_shuffle_ps(hi0, hi1, 0x88);
+			x = _mm_shuffle_ps(lo0, lo1, 0x88);
+			y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+			z = z * vHi;
+			x = x + y;
+			x = x + z;
+			stack_array[index + 3] = x;
+			max = _mm_max_ps(x, max);  // control the order here so that max is never NaN even if x is nan
+
+			// It is too costly to keep the index of the max here. We will look for it again later.  We save a lot of work this way.
+		}
+
+		// If we found a new max
+		if (0xf != _mm_movemask_ps((float4)_mm_cmpeq_ps(max, dotMax)))
+		{
+			// copy the new max across all lanes of our max accumulator
+			max = _mm_max_ps(max, (float4)_mm_shuffle_ps(max, max, 0x4e));
+			max = _mm_max_ps(max, (float4)_mm_shuffle_ps(max, max, 0xb1));
+
+			dotMax = max;
+
+			// find first occurrence of that max
+			size_t test;
+			for (index = 0; 0 == (test = _mm_movemask_ps(_mm_cmpeq_ps(stack_array[index], max))); index++)  // local_count must be a multiple of 4
+			{
+			}
+			// record where it is.
+			maxIndex = 4 * index + segment + indexTable[test];
+		}
+	}
+
+	// account for work we've already done
+	count -= segment;
+
+	// Deal with the last < STACK_ARRAY_COUNT vectors
+	max = dotMax;
+	index = 0;
+
+	if (b3Unlikely(count > 16))
+	{
+		for (; index + 4 <= count / 4; index += 4)
+		{  // do four dot products at a time. Carefully avoid touching the w element.
+			float4 v0 = vertices[0];
+			float4 v1 = vertices[1];
+			float4 v2 = vertices[2];
+			float4 v3 = vertices[3];
+			vertices += 4;
+
+			float4 lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
+			float4 hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
+			float4 lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
+			float4 hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
+
+			lo0 = lo0 * vLo;
+			lo1 = lo1 * vLo;
+			float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
+			float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
+			float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+			z = z * vHi;
+			x = x + y;
+			x = x + z;
+			stack_array[index] = x;
+			max = _mm_max_ps(x, max);  // control the order here so that max is never NaN even if x is nan
+
+			v0 = vertices[0];
+			v1 = vertices[1];
+			v2 = vertices[2];
+			v3 = vertices[3];
+			vertices += 4;
+
+			lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
+			hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
+			lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
+			hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
+
+			lo0 = lo0 * vLo;
+			lo1 = lo1 * vLo;
+			z = _mm_shuffle_ps(hi0, hi1, 0x88);
+			x = _mm_shuffle_ps(lo0, lo1, 0x88);
+			y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+			z = z * vHi;
+			x = x + y;
+			x = x + z;
+			stack_array[index + 1] = x;
+			max = _mm_max_ps(x, max);  // control the order here so that max is never NaN even if x is nan
+
+			v0 = vertices[0];
+			v1 = vertices[1];
+			v2 = vertices[2];
+			v3 = vertices[3];
+			vertices += 4;
+
+			lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
+			hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
+			lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
+			hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
+
+			lo0 = lo0 * vLo;
+			lo1 = lo1 * vLo;
+			z = _mm_shuffle_ps(hi0, hi1, 0x88);
+			x = _mm_shuffle_ps(lo0, lo1, 0x88);
+			y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+			z = z * vHi;
+			x = x + y;
+			x = x + z;
+			stack_array[index + 2] = x;
+			max = _mm_max_ps(x, max);  // control the order here so that max is never NaN even if x is nan
+
+			v0 = vertices[0];
+			v1 = vertices[1];
+			v2 = vertices[2];
+			v3 = vertices[3];
+			vertices += 4;
+
+			lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
+			hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
+			lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
+			hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
+
+			lo0 = lo0 * vLo;
+			lo1 = lo1 * vLo;
+			z = _mm_shuffle_ps(hi0, hi1, 0x88);
+			x = _mm_shuffle_ps(lo0, lo1, 0x88);
+			y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+			z = z * vHi;
+			x = x + y;
+			x = x + z;
+			stack_array[index + 3] = x;
+			max = _mm_max_ps(x, max);  // control the order here so that max is never NaN even if x is nan
+
+			// It is too costly to keep the index of the max here. We will look for it again later.  We save a lot of work this way.
+		}
+	}
+
+	size_t localCount = (count & -4L) - 4 * index;
+	if (localCount)
+	{
+#ifdef __APPLE__
+		float4 t0, t1, t2, t3, t4;
+		float4 *sap = &stack_array[index + localCount / 4];
+		vertices += localCount;  // counter the offset
+		size_t byteIndex = -(localCount) * sizeof(float);
+		//AT&T Code style assembly
+		asm volatile(
+			".align 4                                                                   \n\
+             0: movaps  %[max], %[t2]                            // move max out of the way to avoid propagating NaNs in max \n\
+          movaps  (%[vertices], %[byteIndex], 4),    %[t0]    // vertices[0]      \n\
+          movaps  16(%[vertices], %[byteIndex], 4),  %[t1]    // vertices[1]      \n\
+          movaps  %[t0], %[max]                               // vertices[0]      \n\
+          movlhps %[t1], %[max]                               // x0y0x1y1         \n\
+         movaps  32(%[vertices], %[byteIndex], 4),  %[t3]    // vertices[2]      \n\
+         movaps  48(%[vertices], %[byteIndex], 4),  %[t4]    // vertices[3]      \n\
+          mulps   %[vLo], %[max]                              // x0y0x1y1 * vLo   \n\
+         movhlps %[t0], %[t1]                                // z0w0z1w1         \n\
+         movaps  %[t3], %[t0]                                // vertices[2]      \n\
+         movlhps %[t4], %[t0]                                // x2y2x3y3         \n\
+         mulps   %[vLo], %[t0]                               // x2y2x3y3 * vLo   \n\
+          movhlps %[t3], %[t4]                                // z2w2z3w3         \n\
+          shufps  $0x88, %[t4], %[t1]                         // z0z1z2z3         \n\
+          mulps   %[vHi], %[t1]                               // z0z1z2z3 * vHi   \n\
+         movaps  %[max], %[t3]                               // x0y0x1y1 * vLo   \n\
+         shufps  $0x88, %[t0], %[max]                        // x0x1x2x3 * vLo.x \n\
+         shufps  $0xdd, %[t0], %[t3]                         // y0y1y2y3 * vLo.y \n\
+         addps   %[t3], %[max]                               // x + y            \n\
+         addps   %[t1], %[max]                               // x + y + z        \n\
+         movaps  %[max], (%[sap], %[byteIndex])              // record result for later scrutiny \n\
+         maxps   %[t2], %[max]                               // record max, restore max   \n\
+         add     $16, %[byteIndex]                           // advance loop counter\n\
+         jnz     0b                                          \n\
+     "
+			: [max] "+x"(max), [t0] "=&x"(t0), [t1] "=&x"(t1), [t2] "=&x"(t2), [t3] "=&x"(t3), [t4] "=&x"(t4), [byteIndex] "+r"(byteIndex)
+			: [vLo] "x"(vLo), [vHi] "x"(vHi), [vertices] "r"(vertices), [sap] "r"(sap)
+			: "memory", "cc");
+		index += localCount / 4;
+#else
+		{
+			for (unsigned int i = 0; i < localCount / 4; i++, index++)
+			{  // do four dot products at a time. Carefully avoid touching the w element.
+				float4 v0 = vertices[0];
+				float4 v1 = vertices[1];
+				float4 v2 = vertices[2];
+				float4 v3 = vertices[3];
+				vertices += 4;
+
+				float4 lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
+				float4 hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
+				float4 lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
+				float4 hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
+
+				lo0 = lo0 * vLo;
+				lo1 = lo1 * vLo;
+				float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
+				float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
+				float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+				z = z * vHi;
+				x = x + y;
+				x = x + z;
+				stack_array[index] = x;
+				max = _mm_max_ps(x, max);  // control the order here so that max is never NaN even if x is nan
+			}
+		}
+#endif  //__APPLE__
+	}
+
+	// process the last few points
+	if (count & 3)
+	{
+		float4 v0, v1, v2, x, y, z;
+		switch (count & 3)
+		{
+			case 3:
+			{
+				v0 = vertices[0];
+				v1 = vertices[1];
+				v2 = vertices[2];
+
+				// Calculate 3 dot products, transpose, duplicate v2
+				float4 lo0 = _mm_movelh_ps(v0, v1);  // xyxy.lo
+				float4 hi0 = _mm_movehl_ps(v1, v0);  // z?z?.lo
+				lo0 = lo0 * vLo;
+				z = _mm_shuffle_ps(hi0, v2, 0xa8);  // z0z1z2z2
+				z = z * vHi;
+				float4 lo1 = _mm_movelh_ps(v2, v2);  // xyxy
+				lo1 = lo1 * vLo;
+				x = _mm_shuffle_ps(lo0, lo1, 0x88);
+				y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+			}
+			break;
+			case 2:
+			{
+				v0 = vertices[0];
+				v1 = vertices[1];
+				float4 xy = _mm_movelh_ps(v0, v1);
+				z = _mm_movehl_ps(v1, v0);
+				xy = xy * vLo;
+				z = _mm_shuffle_ps(z, z, 0xa8);
+				x = _mm_shuffle_ps(xy, xy, 0xa8);
+				y = _mm_shuffle_ps(xy, xy, 0xfd);
+				z = z * vHi;
+			}
+			break;
+			case 1:
+			{
+				float4 xy = vertices[0];
+				z = _mm_shuffle_ps(xy, xy, 0xaa);
+				xy = xy * vLo;
+				z = z * vHi;
+				x = _mm_shuffle_ps(xy, xy, 0);
+				y = _mm_shuffle_ps(xy, xy, 0x55);
+			}
+			break;
+		}
+		x = x + y;
+		x = x + z;
+		stack_array[index] = x;
+		max = _mm_max_ps(x, max);  // control the order here so that max is never NaN even if x is nan
+		index++;
+	}
+
+	// if we found a new max.
+	if (0 == segment || 0xf != _mm_movemask_ps((float4)_mm_cmpeq_ps(max, dotMax)))
+	{  // we found a new max. Search for it
+		// find max across the max vector, place in all elements of max -- big latency hit here
+		max = _mm_max_ps(max, (float4)_mm_shuffle_ps(max, max, 0x4e));
+		max = _mm_max_ps(max, (float4)_mm_shuffle_ps(max, max, 0xb1));
+
+		// It is slightly faster to do this part in scalar code when count < 8. However, the common case for
+		// this where it actually makes a difference is handled in the early out at the top of the function,
+		// so it is less than a 1% difference here. I opted for improved code size, fewer branches and reduced
+		// complexity, and removed it.
+
+		dotMax = max;
+
+		// scan for the first occurence of max in the array
+		size_t test;
+		for (index = 0; 0 == (test = _mm_movemask_ps(_mm_cmpeq_ps(stack_array[index], max))); index++)  // local_count must be a multiple of 4
+		{
+		}
+		maxIndex = 4 * index + segment + indexTable[test];
+	}
+
+	_mm_store_ss(dotResult, dotMax);
+	return maxIndex;
+}
+
+long b3_mindot_large(const float *vv, const float *vec, unsigned long count, float *dotResult);
+/**
+ * SSE implementation: finds the vertex whose dot product with `vec` is smallest.
+ *
+ * Only x, y and z of each vertex and of `vec` enter the dot product; the fourth float of every
+ * 4-float vertex slot is never selected by the shuffles below.
+ *
+ * @param vv         Vertex data, read through a `const float4 *` (one 4-float slot per vertex).
+ *                   NOTE(review): the float4 dereferences presumably need 16-byte alignment - confirm.
+ * @param vec        Direction vector; four floats are loaded with an unaligned load.
+ * @param count      Number of vertices in `vv`.
+ * @param dotResult  Receives the smallest dot product found (lane 0 of `dotmin`).
+ * @return Index of the first occurrence of the minimum. `minIndex` starts at -1 and is only
+ *         overwritten by the index searches below.
+ *
+ * NOTE(review): with count == 0 the final search scans `stack_array` although nothing has been
+ * stored in it; callers presumably never pass an empty array - confirm.
+ */
+long b3_mindot_large(const float *vv, const float *vec, unsigned long count, float *dotResult)
+{
+	const float4 *vertices = (const float4 *)vv;
+	static const unsigned char indexTable[16] = {(unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0};  // movemask value -> lane of its lowest set bit; the scans below only index it with a non-zero mask
+	// dotmin carries the best value so far in all four lanes; vHi / vLo are vec rearranged as zzzz / xyxy.
+	float4 dotmin = b3Assign128(B3_INFINITY, B3_INFINITY, B3_INFINITY, B3_INFINITY);
+	float4 vvec = _mm_loadu_ps(vec);
+	float4 vHi = b3CastiTo128f(_mm_shuffle_epi32(b3CastfTo128i(vvec), 0xaa));  /// zzzz
+	float4 vLo = _mm_movelh_ps(vvec, vvec);                                    /// xyxy
+
+	long minIndex = -1L;
+
+	size_t segment = 0;  // index of the first vertex of the tile currently being processed
+	float4 stack_array[STACK_ARRAY_COUNT];  // dot products of the current tile, four per entry
+
+#if DEBUG
+	// memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) );
+#endif
+
+	size_t index;
+	float4 min;
+	// Faster loop without cleanup code for full tiles (STACK_ARRAY_COUNT * 4 vertices each)
+	for (segment = 0; segment + STACK_ARRAY_COUNT * 4 <= count; segment += STACK_ARRAY_COUNT * 4)
+	{
+		min = dotmin;
+		// Four unrolled copies of the same step; each computes four dot products into one stack_array entry.
+		for (index = 0; index < STACK_ARRAY_COUNT; index += 4)
+		{  // do four dot products at a time. Carefully avoid touching the w element.
+			float4 v0 = vertices[0];
+			float4 v1 = vertices[1];
+			float4 v2 = vertices[2];
+			float4 v3 = vertices[3];
+			vertices += 4;
+			// Pair up the low halves (x, y) and the high halves (z and the unused fourth float).
+			float4 lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
+			float4 hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
+			float4 lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
+			float4 hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
+			// Multiply by vec, then shuffle so that x, y and z each hold one component of four vertices.
+			lo0 = lo0 * vLo;
+			lo1 = lo1 * vLo;
+			float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
+			float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
+			float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+			z = z * vHi;
+			x = x + y;
+			x = x + z;
+			stack_array[index] = x;
+			min = _mm_min_ps(x, min);  // control the order here so that min is never NaN even if x is nan
+			// second group of four
+			v0 = vertices[0];
+			v1 = vertices[1];
+			v2 = vertices[2];
+			v3 = vertices[3];
+			vertices += 4;
+
+			lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
+			hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
+			lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
+			hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
+
+			lo0 = lo0 * vLo;
+			lo1 = lo1 * vLo;
+			z = _mm_shuffle_ps(hi0, hi1, 0x88);
+			x = _mm_shuffle_ps(lo0, lo1, 0x88);
+			y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+			z = z * vHi;
+			x = x + y;
+			x = x + z;
+			stack_array[index + 1] = x;
+			min = _mm_min_ps(x, min);  // control the order here so that min is never NaN even if x is nan
+			// third group of four
+			v0 = vertices[0];
+			v1 = vertices[1];
+			v2 = vertices[2];
+			v3 = vertices[3];
+			vertices += 4;
+
+			lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
+			hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
+			lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
+			hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
+
+			lo0 = lo0 * vLo;
+			lo1 = lo1 * vLo;
+			z = _mm_shuffle_ps(hi0, hi1, 0x88);
+			x = _mm_shuffle_ps(lo0, lo1, 0x88);
+			y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+			z = z * vHi;
+			x = x + y;
+			x = x + z;
+			stack_array[index + 2] = x;
+			min = _mm_min_ps(x, min);  // control the order here so that min is never NaN even if x is nan
+			// fourth group of four
+			v0 = vertices[0];
+			v1 = vertices[1];
+			v2 = vertices[2];
+			v3 = vertices[3];
+			vertices += 4;
+
+			lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
+			hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
+			lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
+			hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
+
+			lo0 = lo0 * vLo;
+			lo1 = lo1 * vLo;
+			z = _mm_shuffle_ps(hi0, hi1, 0x88);
+			x = _mm_shuffle_ps(lo0, lo1, 0x88);
+			y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+			z = z * vHi;
+			x = x + y;
+			x = x + z;
+			stack_array[index + 3] = x;
+			min = _mm_min_ps(x, min);  // control the order here so that min is never NaN even if x is nan
+
+			// It is too costly to keep the index of the min here. We will look for it again later.  We save a lot of work this way.
+		}
+
+		// If we found a new min (min started as dotmin in every lane, so any differing lane means a smaller value)
+		if (0xf != _mm_movemask_ps((float4)_mm_cmpeq_ps(min, dotmin)))
+		{
+			// copy the new min across all lanes of our min accumulator
+			min = _mm_min_ps(min, (float4)_mm_shuffle_ps(min, min, 0x4e));
+			min = _mm_min_ps(min, (float4)_mm_shuffle_ps(min, min, 0xb1));
+
+			dotmin = min;
+
+			// find first occurrence of that min
+			size_t test;
+			for (index = 0; 0 == (test = _mm_movemask_ps(_mm_cmpeq_ps(stack_array[index], min))); index++)  // local_count must be a multiple of 4
+			{
+			}
+			// record where it is.
+			minIndex = 4 * index + segment + indexTable[test];
+		}
+	}
+
+	// account for work we've already done
+	count -= segment;
+
+	// Deal with the last < STACK_ARRAY_COUNT vectors
+	min = dotmin;
+	index = 0;
+	// Unrolled pass: each iteration consumes 16 vertices and fills four stack_array entries.
+	if (b3Unlikely(count > 16))
+	{
+		for (; index + 4 <= count / 4; index += 4)
+		{  // do four dot products at a time. Carefully avoid touching the w element.
+			float4 v0 = vertices[0];
+			float4 v1 = vertices[1];
+			float4 v2 = vertices[2];
+			float4 v3 = vertices[3];
+			vertices += 4;
+
+			float4 lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
+			float4 hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
+			float4 lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
+			float4 hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
+
+			lo0 = lo0 * vLo;
+			lo1 = lo1 * vLo;
+			float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
+			float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
+			float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+			z = z * vHi;
+			x = x + y;
+			x = x + z;
+			stack_array[index] = x;
+			min = _mm_min_ps(x, min);  // control the order here so that min is never NaN even if x is nan
+
+			v0 = vertices[0];
+			v1 = vertices[1];
+			v2 = vertices[2];
+			v3 = vertices[3];
+			vertices += 4;
+
+			lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
+			hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
+			lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
+			hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
+
+			lo0 = lo0 * vLo;
+			lo1 = lo1 * vLo;
+			z = _mm_shuffle_ps(hi0, hi1, 0x88);
+			x = _mm_shuffle_ps(lo0, lo1, 0x88);
+			y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+			z = z * vHi;
+			x = x + y;
+			x = x + z;
+			stack_array[index + 1] = x;
+			min = _mm_min_ps(x, min);  // control the order here so that min is never NaN even if x is nan
+
+			v0 = vertices[0];
+			v1 = vertices[1];
+			v2 = vertices[2];
+			v3 = vertices[3];
+			vertices += 4;
+
+			lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
+			hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
+			lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
+			hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
+
+			lo0 = lo0 * vLo;
+			lo1 = lo1 * vLo;
+			z = _mm_shuffle_ps(hi0, hi1, 0x88);
+			x = _mm_shuffle_ps(lo0, lo1, 0x88);
+			y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+			z = z * vHi;
+			x = x + y;
+			x = x + z;
+			stack_array[index + 2] = x;
+			min = _mm_min_ps(x, min);  // control the order here so that min is never NaN even if x is nan
+
+			v0 = vertices[0];
+			v1 = vertices[1];
+			v2 = vertices[2];
+			v3 = vertices[3];
+			vertices += 4;
+
+			lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
+			hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
+			lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
+			hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
+
+			lo0 = lo0 * vLo;
+			lo1 = lo1 * vLo;
+			z = _mm_shuffle_ps(hi0, hi1, 0x88);
+			x = _mm_shuffle_ps(lo0, lo1, 0x88);
+			y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+			z = z * vHi;
+			x = x + y;
+			x = x + z;
+			stack_array[index + 3] = x;
+			min = _mm_min_ps(x, min);  // control the order here so that min is never NaN even if x is nan
+
+			// It is too costly to keep the index of the min here. We will look for it again later.  We save a lot of work this way.
+		}
+	}
+	// Vertices still to be handled in whole groups of four (4 * index of them are already done).
+	size_t localCount = (count & -4L) - 4 * index;
+	if (localCount)
+	{
+#ifdef __APPLE__
+		vertices += localCount;  // counter the offset
+		float4 t0, t1, t2, t3, t4;
+		size_t byteIndex = -(localCount) * sizeof(float);
+		float4 *sap = &stack_array[index + localCount / 4];
+		/* Hand-scheduled form of the loop in the #else branch. byteIndex runs from
+		 * -localCount * sizeof(float) up to zero in steps of 16: scaled by 4 it addresses four
+		 * vertices per iteration relative to the already advanced `vertices`, unscaled it
+		 * addresses one stack_array entry relative to `sap`. */
+		asm volatile(
+			".align 4                                                                   \n\
+             0: movaps  %[min], %[t2]                            // move min out of the way to avoid propagating NaNs in min \n\
+             movaps  (%[vertices], %[byteIndex], 4),    %[t0]    // vertices[0]      \n\
+             movaps  16(%[vertices], %[byteIndex], 4),  %[t1]    // vertices[1]      \n\
+             movaps  %[t0], %[min]                               // vertices[0]      \n\
+             movlhps %[t1], %[min]                               // x0y0x1y1         \n\
+             movaps  32(%[vertices], %[byteIndex], 4),  %[t3]    // vertices[2]      \n\
+             movaps  48(%[vertices], %[byteIndex], 4),  %[t4]    // vertices[3]      \n\
+             mulps   %[vLo], %[min]                              // x0y0x1y1 * vLo   \n\
+             movhlps %[t0], %[t1]                                // z0w0z1w1         \n\
+             movaps  %[t3], %[t0]                                // vertices[2]      \n\
+             movlhps %[t4], %[t0]                                // x2y2x3y3         \n\
+             movhlps %[t3], %[t4]                                // z2w2z3w3         \n\
+             mulps   %[vLo], %[t0]                               // x2y2x3y3 * vLo   \n\
+             shufps  $0x88, %[t4], %[t1]                         // z0z1z2z3         \n\
+             mulps   %[vHi], %[t1]                               // z0z1z2z3 * vHi   \n\
+             movaps  %[min], %[t3]                               // x0y0x1y1 * vLo   \n\
+             shufps  $0x88, %[t0], %[min]                        // x0x1x2x3 * vLo.x \n\
+             shufps  $0xdd, %[t0], %[t3]                         // y0y1y2y3 * vLo.y \n\
+             addps   %[t3], %[min]                               // x + y            \n\
+             addps   %[t1], %[min]                               // x + y + z        \n\
+             movaps  %[min], (%[sap], %[byteIndex])              // record result for later scrutiny \n\
+             minps   %[t2], %[min]                               // record min, restore min   \n\
+             add     $16, %[byteIndex]                           // advance loop counter\n\
+             jnz     0b                                          \n\
+             "
+			: [min] "+x"(min), [t0] "=&x"(t0), [t1] "=&x"(t1), [t2] "=&x"(t2), [t3] "=&x"(t3), [t4] "=&x"(t4), [byteIndex] "+r"(byteIndex)
+			: [vLo] "x"(vLo), [vHi] "x"(vHi), [vertices] "r"(vertices), [sap] "r"(sap)
+			: "memory", "cc");
+		index += localCount / 4;
+#else
+		{
+			for (unsigned int i = 0; i < localCount / 4; i++, index++)
+			{  // do four dot products at a time. Carefully avoid touching the w element.
+				float4 v0 = vertices[0];
+				float4 v1 = vertices[1];
+				float4 v2 = vertices[2];
+				float4 v3 = vertices[3];
+				vertices += 4;
+
+				float4 lo0 = _mm_movelh_ps(v0, v1);  // x0y0x1y1
+				float4 hi0 = _mm_movehl_ps(v1, v0);  // z0?0z1?1
+				float4 lo1 = _mm_movelh_ps(v2, v3);  // x2y2x3y3
+				float4 hi1 = _mm_movehl_ps(v3, v2);  // z2?2z3?3
+
+				lo0 = lo0 * vLo;
+				lo1 = lo1 * vLo;
+				float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
+				float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
+				float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+				z = z * vHi;
+				x = x + y;
+				x = x + z;
+				stack_array[index] = x;
+				min = _mm_min_ps(x, min);  // control the order here so that min is never NaN even if x is nan
+			}
+		}
+
+#endif
+	}
+
+	// process the last few points (1-3 leftover vertices); the spare lanes repeat the last vertex
+	if (count & 3)
+	{
+		float4 v0, v1, v2, x, y, z;
+		switch (count & 3)
+		{
+			case 3:
+			{
+				v0 = vertices[0];
+				v1 = vertices[1];
+				v2 = vertices[2];
+
+				// Calculate 3 dot products, transpose, duplicate v2
+				float4 lo0 = _mm_movelh_ps(v0, v1);  // xyxy.lo
+				float4 hi0 = _mm_movehl_ps(v1, v0);  // z?z?.lo
+				lo0 = lo0 * vLo;
+				z = _mm_shuffle_ps(hi0, v2, 0xa8);  // z0z1z2z2
+				z = z * vHi;
+				float4 lo1 = _mm_movelh_ps(v2, v2);  // xyxy
+				lo1 = lo1 * vLo;
+				x = _mm_shuffle_ps(lo0, lo1, 0x88);
+				y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+			}
+			break;
+			case 2:
+			{
+				v0 = vertices[0];
+				v1 = vertices[1];
+				float4 xy = _mm_movelh_ps(v0, v1);
+				z = _mm_movehl_ps(v1, v0);
+				xy = xy * vLo;
+				z = _mm_shuffle_ps(z, z, 0xa8);
+				x = _mm_shuffle_ps(xy, xy, 0xa8);
+				y = _mm_shuffle_ps(xy, xy, 0xfd);
+				z = z * vHi;
+			}
+			break;
+			case 1:
+			{
+				float4 xy = vertices[0];
+				z = _mm_shuffle_ps(xy, xy, 0xaa);
+				xy = xy * vLo;
+				z = z * vHi;
+				x = _mm_shuffle_ps(xy, xy, 0);
+				y = _mm_shuffle_ps(xy, xy, 0x55);
+			}
+			break;
+		}
+		x = x + y;
+		x = x + z;
+		stack_array[index] = x;
+		min = _mm_min_ps(x, min);  // control the order here so that min is never NaN even if x is nan
+		index++;
+	}
+
+	// if we found a new min. The search also runs unconditionally when no full tile was processed (segment == 0).
+	if (0 == segment || 0xf != _mm_movemask_ps((float4)_mm_cmpeq_ps(min, dotmin)))
+	{  // we found a new min. Search for it
+		// find min across the min vector, place in all elements of min -- big latency hit here
+		min = _mm_min_ps(min, (float4)_mm_shuffle_ps(min, min, 0x4e));
+		min = _mm_min_ps(min, (float4)_mm_shuffle_ps(min, min, 0xb1));
+
+		// It is slightly faster to do this part in scalar code when count < 8. However, the common case for
+		// this where it actually makes a difference is handled in the early out at the top of the function,
+		// so it is less than a 1% difference here. I opted for improved code size, fewer branches and reduced
+		// complexity, and removed it.
+
+		dotmin = min;
+
+		// scan for the first occurrence of min in the array
+		size_t test;
+		for (index = 0; 0 == (test = _mm_movemask_ps(_mm_cmpeq_ps(stack_array[index], min))); index++)  // local_count must be a multiple of 4
+		{
+		}
+		minIndex = 4 * index + segment + indexTable[test];
+	}
+
+	_mm_store_ss(dotResult, dotmin);
+	return minIndex;
+}
+
+#elif defined B3_USE_NEON
+#define ARM_NEON_GCC_COMPATIBILITY 1
+#include <arm_neon.h>
+
+static long b3_maxdot_large_v0(const float *vv, const float *vec, unsigned long count, float *dotResult);
+static long b3_maxdot_large_v1(const float *vv, const float *vec, unsigned long count, float *dotResult);
+static long b3_maxdot_large_sel(const float *vv, const float *vec, unsigned long count, float *dotResult);
+static long b3_mindot_large_v0(const float *vv, const float *vec, unsigned long count, float *dotResult);
+static long b3_mindot_large_v1(const float *vv, const float *vec, unsigned long count, float *dotResult);
+static long b3_mindot_large_sel(const float *vv, const float *vec, unsigned long count, float *dotResult);
+/* In this branch b3_maxdot_large / b3_mindot_large are function pointers, not functions.
+ * Each one initially points at its *_sel routine, which reassigns the pointer when called. */
+long (*b3_maxdot_large)(const float *vv, const float *vec, unsigned long count, float *dotResult) = b3_maxdot_large_sel;
+long (*b3_mindot_large)(const float *vv, const float *vec, unsigned long count, float *dotResult) = b3_mindot_large_sel;
+// Declaration only (C linkage); the *_sel routines test bit 0x2000 of the value it returns.
+extern "C"
+{
+	int _get_cpu_capabilities(void);
+}
+
+/// Selector used as the initial target of the b3_maxdot_large function pointer.
+/// Picks the _v1 or _v0 implementation from the CPU capability word, stores the choice in
+/// b3_maxdot_large so that later calls go straight to it, then forwards the current call.
+/// NOTE(review): which capability bit 0x2000 names is not visible here - confirm against the
+/// platform's capability header.
+static long b3_maxdot_large_sel(const float *vv, const float *vec, unsigned long count, float *dotResult)
+{
+	// The implementations are declared as b3_maxdot_large_v0 / b3_maxdot_large_v1. The unprefixed
+	// names previously assigned here (_maxdot_large_v0 / _maxdot_large_v1) are not declared, so
+	// this branch could not compile.
+	if (_get_cpu_capabilities() & 0x2000)
+		b3_maxdot_large = b3_maxdot_large_v1;
+	else
+		b3_maxdot_large = b3_maxdot_large_v0;
+
+	return b3_maxdot_large(vv, vec, count, dotResult);
+}
+
+/// Selector used as the initial target of the b3_mindot_large function pointer.
+/// Picks the _v1 or _v0 implementation from the CPU capability word, stores the choice in
+/// b3_mindot_large so that later calls go straight to it, then forwards the current call.
+/// NOTE(review): which capability bit 0x2000 names is not visible here - confirm against the
+/// platform's capability header.
+static long b3_mindot_large_sel(const float *vv, const float *vec, unsigned long count, float *dotResult)
+{
+	// The implementations are declared as b3_mindot_large_v0 / b3_mindot_large_v1. The unprefixed
+	// names previously assigned here (_mindot_large_v0 / _mindot_large_v1) are not declared, so
+	// this branch could not compile.
+	if (_get_cpu_capabilities() & 0x2000)
+		b3_mindot_large = b3_mindot_large_v1;
+	else
+		b3_mindot_large = b3_mindot_large_v0;
+
+	return b3_mindot_large(vv, vec, count, dotResult);
+}
+/* Loads one float32x4_t from _ptr using vld1.f32 with a 128-bit alignment qualifier and address
+ * write-back ('!'); _ptr is a "+r" operand, so the caller's pointer variable is updated by the
+ * instruction. Written as a ({ ... }) statement expression whose value is the loaded vector.
+ * NOTE(review): the asm declares no memory operand or clobber for the data it reads - confirm the
+ * compiler cannot move it across stores to that memory. */
+#define vld1q_f32_aligned_postincrement(_ptr) ({ float32x4_t _r; asm( "vld1.f32  {%0}, [%1, :128]!\n" : "=w" (_r), "+r" (_ptr) ); /*return*/ _r; })
+/* NEON implementation built on 2-lane vectors: finds the vertex with the largest dot product
+ * against vec.
+ *
+ * vv         vertices, one 4-float slot each, fetched with vld1q_f32_aligned_postincrement; only
+ *            the low pair (x, y) and lane 0 of the high pair (z) are used.
+ * vec        direction vector, fetched with the same aligned load.
+ * count      number of vertices.
+ * dotResult  receives the largest dot product.
+ * returns    lane 0 of the winning index vector; the index lanes start at (uint32)-1.
+ *
+ * All comparisons are strict (vcgt), so when values are equal the lane that already holds the
+ * value keeps its index.
+ */
+long b3_maxdot_large_v0(const float *vv, const float *vec, unsigned long count, float *dotResult)
+{
+	unsigned long i = 0;
+	float32x4_t vvec = vld1q_f32_aligned_postincrement(vec);
+	float32x2_t vLo = vget_low_f32(vvec);  // (vec.x, vec.y)
+	float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0);  // (vec.z, vec.z)
+	float32x2_t dotMaxLo = (float32x2_t){-B3_INFINITY, -B3_INFINITY};  // best values seen for vertices 0,1 of each group of four
+	float32x2_t dotMaxHi = (float32x2_t){-B3_INFINITY, -B3_INFINITY};  // best values seen for vertices 2,3 of each group of four
+	uint32x2_t indexLo = (uint32x2_t){0, 1};  // indices of the vertices about to be compared (Lo lanes)
+	uint32x2_t indexHi = (uint32x2_t){2, 3};  // indices of the vertices about to be compared (Hi lanes)
+	uint32x2_t iLo = (uint32x2_t){-1, -1};  // indices belonging to dotMaxLo
+	uint32x2_t iHi = (uint32x2_t){-1, -1};  // indices belonging to dotMaxHi
+	const uint32x2_t four = (uint32x2_t){4, 4};
+	// Main loop: two groups of four vertices per iteration.
+	for (; i + 8 <= count; i += 8)
+	{
+		float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
+		float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
+		float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
+		float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
+		// (x * vec.x, y * vec.y) for each of the four vertices
+		float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
+		float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
+		float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
+		float32x2_t xy3 = vmul_f32(vget_low_f32(v3), vLo);
+		// val[0] of the transpose pairs lane 0 (z) of two high halves: (z0, z1) and (z2, z3)
+		float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
+		float32x2x2_t z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
+		float32x2_t zLo = vmul_f32(z0.val[0], vHi);
+		float32x2_t zHi = vmul_f32(z1.val[0], vHi);
+		// pairwise add gives (x0 + y0, x1 + y1); adding the z term completes the dot products
+		float32x2_t rLo = vpadd_f32(xy0, xy1);
+		float32x2_t rHi = vpadd_f32(xy2, xy3);
+		rLo = vadd_f32(rLo, zLo);
+		rHi = vadd_f32(rHi, zHi);
+		// keep strictly larger results together with their indices, then step the candidate indices by four
+		uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo);
+		uint32x2_t maskHi = vcgt_f32(rHi, dotMaxHi);
+		dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
+		dotMaxHi = vbsl_f32(maskHi, rHi, dotMaxHi);
+		iLo = vbsl_u32(maskLo, indexLo, iLo);
+		iHi = vbsl_u32(maskHi, indexHi, iHi);
+		indexLo = vadd_u32(indexLo, four);
+		indexHi = vadd_u32(indexHi, four);
+		// second group of four, same steps
+		v0 = vld1q_f32_aligned_postincrement(vv);
+		v1 = vld1q_f32_aligned_postincrement(vv);
+		v2 = vld1q_f32_aligned_postincrement(vv);
+		v3 = vld1q_f32_aligned_postincrement(vv);
+
+		xy0 = vmul_f32(vget_low_f32(v0), vLo);
+		xy1 = vmul_f32(vget_low_f32(v1), vLo);
+		xy2 = vmul_f32(vget_low_f32(v2), vLo);
+		xy3 = vmul_f32(vget_low_f32(v3), vLo);
+
+		z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
+		z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
+		zLo = vmul_f32(z0.val[0], vHi);
+		zHi = vmul_f32(z1.val[0], vHi);
+
+		rLo = vpadd_f32(xy0, xy1);
+		rHi = vpadd_f32(xy2, xy3);
+		rLo = vadd_f32(rLo, zLo);
+		rHi = vadd_f32(rHi, zHi);
+
+		maskLo = vcgt_f32(rLo, dotMaxLo);
+		maskHi = vcgt_f32(rHi, dotMaxHi);
+		dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
+		dotMaxHi = vbsl_f32(maskHi, rHi, dotMaxHi);
+		iLo = vbsl_u32(maskLo, indexLo, iLo);
+		iHi = vbsl_u32(maskHi, indexHi, iHi);
+		indexLo = vadd_u32(indexLo, four);
+		indexHi = vadd_u32(indexHi, four);
+	}
+	// At most one whole group of four is left after the loop above.
+	for (; i + 4 <= count; i += 4)
+	{
+		float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
+		float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
+		float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
+		float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
+
+		float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
+		float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
+		float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
+		float32x2_t xy3 = vmul_f32(vget_low_f32(v3), vLo);
+
+		float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
+		float32x2x2_t z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
+		float32x2_t zLo = vmul_f32(z0.val[0], vHi);
+		float32x2_t zHi = vmul_f32(z1.val[0], vHi);
+
+		float32x2_t rLo = vpadd_f32(xy0, xy1);
+		float32x2_t rHi = vpadd_f32(xy2, xy3);
+		rLo = vadd_f32(rLo, zLo);
+		rHi = vadd_f32(rHi, zHi);
+
+		uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo);
+		uint32x2_t maskHi = vcgt_f32(rHi, dotMaxHi);
+		dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
+		dotMaxHi = vbsl_f32(maskHi, rHi, dotMaxHi);
+		iLo = vbsl_u32(maskLo, indexLo, iLo);
+		iHi = vbsl_u32(maskHi, indexHi, iHi);
+		indexLo = vadd_u32(indexLo, four);
+		indexHi = vadd_u32(indexHi, four);
+	}
+	// 1-3 leftover vertices
+	switch (count & 3)
+	{
+		case 3:
+		{
+			float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
+			float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
+			float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
+
+			float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
+			float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
+			float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
+
+			float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
+			float32x2_t zLo = vmul_f32(z0.val[0], vHi);
+			float32x2_t zHi = vmul_f32(vdup_lane_f32(vget_high_f32(v2), 0), vHi);
+			// both lanes of the Hi result are computed from v2
+			float32x2_t rLo = vpadd_f32(xy0, xy1);
+			float32x2_t rHi = vpadd_f32(xy2, xy2);
+			rLo = vadd_f32(rLo, zLo);
+			rHi = vadd_f32(rHi, zHi);
+
+			uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo);
+			uint32x2_t maskHi = vcgt_f32(rHi, dotMaxHi);
+			dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
+			dotMaxHi = vbsl_f32(maskHi, rHi, dotMaxHi);
+			iLo = vbsl_u32(maskLo, indexLo, iLo);
+			iHi = vbsl_u32(maskHi, indexHi, iHi);
+		}
+		break;
+		case 2:
+		{
+			float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
+			float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
+
+			float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
+			float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
+
+			float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
+			float32x2_t zLo = vmul_f32(z0.val[0], vHi);
+
+			float32x2_t rLo = vpadd_f32(xy0, xy1);
+			rLo = vadd_f32(rLo, zLo);
+
+			uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo);
+			dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
+			iLo = vbsl_u32(maskLo, indexLo, iLo);
+		}
+		break;
+		case 1:
+		{
+			float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
+			float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
+			float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0);
+			float32x2_t zLo = vmul_f32(z0, vHi);
+			float32x2_t rLo = vpadd_f32(xy0, xy0);  // both lanes are computed from v0
+			rLo = vadd_f32(rLo, zLo);
+			uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo);
+			dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
+			iLo = vbsl_u32(maskLo, indexLo, iLo);
+		}
+		break;
+
+		default:
+			break;
+	}
+
+	// select best answer between hi and lo results
+	uint32x2_t mask = vcgt_f32(dotMaxHi, dotMaxLo);
+	dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
+	iLo = vbsl_u32(mask, iHi, iLo);
+
+	// select best answer between even and odd results
+	dotMaxHi = vdup_lane_f32(dotMaxLo, 1);
+	iHi = vdup_lane_u32(iLo, 1);
+	mask = vcgt_f32(dotMaxHi, dotMaxLo);
+	dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
+	iLo = vbsl_u32(mask, iHi, iLo);
+
+	*dotResult = vget_lane_f32(dotMaxLo, 0);
+	return vget_lane_u32(iLo, 0);
+}
+
+long b3_maxdot_large_v1(const float *vv, const float *vec, unsigned long count, float *dotResult)
+{
+	float32x4_t vvec = vld1q_f32_aligned_postincrement(vec);
+	float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec));
+	float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0);
+	const uint32x4_t four = (uint32x4_t){4, 4, 4, 4};
+	uint32x4_t local_index = (uint32x4_t){0, 1, 2, 3};
+	uint32x4_t index = (uint32x4_t){-1, -1, -1, -1};
+	float32x4_t maxDot = (float32x4_t){-B3_INFINITY, -B3_INFINITY, -B3_INFINITY, -B3_INFINITY};
+
+	unsigned long i = 0;
+	for (; i + 8 <= count; i += 8)
+	{
+		float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
+		float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
+		float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
+		float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
+
+		// the next two lines should resolve to a single vswp d, d
+		float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
+		float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));
+		// the next two lines should resolve to a single vswp d, d
+		float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
+		float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));
+
+		xy0 = vmulq_f32(xy0, vLo);
+		xy1 = vmulq_f32(xy1, vLo);
+
+		float32x4x2_t zb = vuzpq_f32(z0, z1);
+		float32x4_t z = vmulq_f32(zb.val[0], vHi);
+		float32x4x2_t xy = vuzpq_f32(xy0, xy1);
+		float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
+		x = vaddq_f32(x, z);
+
+		uint32x4_t mask = vcgtq_f32(x, maxDot);
+		maxDot = vbslq_f32(mask, x, maxDot);
+		index = vbslq_u32(mask, local_index, index);
+		local_index = vaddq_u32(local_index, four);
+
+		v0 = vld1q_f32_aligned_postincrement(vv);
+		v1 = vld1q_f32_aligned_postincrement(vv);
+		v2 = vld1q_f32_aligned_postincrement(vv);
+		v3 = vld1q_f32_aligned_postincrement(vv);
+
+		// the next two lines should resolve to a single vswp d, d
+		xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
+		xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));
+		// the next two lines should resolve to a single vswp d, d
+		z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
+		z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));
+
+		xy0 = vmulq_f32(xy0, vLo);
+		xy1 = vmulq_f32(xy1, vLo);
+
+		zb = vuzpq_f32(z0, z1);
+		z = vmulq_f32(zb.val[0], vHi);
+		xy = vuzpq_f32(xy0, xy1);
+		x = vaddq_f32(xy.val[0], xy.val[1]);
+		x = vaddq_f32(x, z);
+
+		mask = vcgtq_f32(x, maxDot);
+		maxDot = vbslq_f32(mask, x, maxDot);
+		index = vbslq_u32(mask, local_index, index);
+		local_index = vaddq_u32(local_index, four);
+	}
+
+	for (; i + 4 <= count; i += 4)
+	{
+		float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
+		float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
+		float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
+		float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
+
+		// the next two lines should resolve to a single vswp d, d
+		float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
+		float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));
+		// the next two lines should resolve to a single vswp d, d
+		float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
+		float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));
+
+		xy0 = vmulq_f32(xy0, vLo);
+		xy1 = vmulq_f32(xy1, vLo);
+
+		float32x4x2_t zb = vuzpq_f32(z0, z1);
+		float32x4_t z = vmulq_f32(zb.val[0], vHi);
+		float32x4x2_t xy = vuzpq_f32(xy0, xy1);
+		float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
+		x = vaddq_f32(x, z);
+
+		uint32x4_t mask = vcgtq_f32(x, maxDot);
+		maxDot = vbslq_f32(mask, x, maxDot);
+		index = vbslq_u32(mask, local_index, index);
+		local_index = vaddq_u32(local_index, four);
+	}
+
+	switch (count & 3)
+	{
+		case 3:
+		{
+			float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
+			float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
+			float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
+
+			// the next two lines should resolve to a single vswp d, d
+			float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
+			float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v2));
+			// the next two lines should resolve to a single vswp d, d
+			float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
+			float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v2));
+
+			xy0 = vmulq_f32(xy0, vLo);
+			xy1 = vmulq_f32(xy1, vLo);
+
+			float32x4x2_t zb = vuzpq_f32(z0, z1);
+			float32x4_t z = vmulq_f32(zb.val[0], vHi);
+			float32x4x2_t xy = vuzpq_f32(xy0, xy1);
+			float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
+			x = vaddq_f32(x, z);
+
+			uint32x4_t mask = vcgtq_f32(x, maxDot);
+			maxDot = vbslq_f32(mask, x, maxDot);
+			index = vbslq_u32(mask, local_index, index);
+			local_index = vaddq_u32(local_index, four);
+		}
+		break;
+
+		case 2:
+		{
+			float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
+			float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
+
+			// the next two lines should resolve to a single vswp d, d
+			float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
+			// the next two lines should resolve to a single vswp d, d
+			float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
+
+			xy0 = vmulq_f32(xy0, vLo);
+
+			float32x4x2_t zb = vuzpq_f32(z0, z0);
+			float32x4_t z = vmulq_f32(zb.val[0], vHi);
+			float32x4x2_t xy = vuzpq_f32(xy0, xy0);
+			float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
+			x = vaddq_f32(x, z);
+
+			uint32x4_t mask = vcgtq_f32(x, maxDot);
+			maxDot = vbslq_f32(mask, x, maxDot);
+			index = vbslq_u32(mask, local_index, index);
+			local_index = vaddq_u32(local_index, four);
+		}
+		break;
+
+		case 1:
+		{
+			float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
+
+			// the next two lines should resolve to a single vswp d, d
+			float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v0));
+			// the next two lines should resolve to a single vswp d, d
+			float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0);
+
+			xy0 = vmulq_f32(xy0, vLo);
+
+			z = vmulq_f32(z, vHi);
+			float32x4x2_t xy = vuzpq_f32(xy0, xy0);
+			float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
+			x = vaddq_f32(x, z);
+
+			uint32x4_t mask = vcgtq_f32(x, maxDot);
+			maxDot = vbslq_f32(mask, x, maxDot);
+			index = vbslq_u32(mask, local_index, index);
+			local_index = vaddq_u32(local_index, four);
+		}
+		break;
+
+		default:
+			break;
+	}
+
+	// select best answer between hi and lo results
+	uint32x2_t mask = vcgt_f32(vget_high_f32(maxDot), vget_low_f32(maxDot));
+	float32x2_t maxDot2 = vbsl_f32(mask, vget_high_f32(maxDot), vget_low_f32(maxDot));
+	uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));
+
+	// select best answer between even and odd results
+	float32x2_t maxDotO = vdup_lane_f32(maxDot2, 1);
+	uint32x2_t indexHi = vdup_lane_u32(index2, 1);
+	mask = vcgt_f32(maxDotO, maxDot2);
+	maxDot2 = vbsl_f32(mask, maxDotO, maxDot2);
+	index2 = vbsl_u32(mask, indexHi, index2);
+
+	*dotResult = vget_lane_f32(maxDot2, 0);
+	return vget_lane_u32(index2, 0);
+}
+/* Finds the vector with the smallest dot product against `vec` (NEON, 2-lane registers).
+ *
+ * vv        - input vectors; each vector is fetched as one 4-float load of which only the
+ *             first three floats (x, y, z) enter the dot product.
+ * vec       - query vector, fetched with the same 4-float load; only x, y, z are used.
+ * count     - number of vectors to scan.
+ * dotResult - receives the smallest dot product found (B3_INFINITY when count is 0).
+ *
+ * Returns the index stored with the winning dot product. Comparisons are strict, so an
+ * equal dot product never replaces a candidate a lane already holds. When count is 0 no
+ * lane is ever updated and the initial all-ones index value is returned.
+ * NOTE(review): vld1q_f32_aligned_postincrement is defined outside this view; it
+ * presumably needs 16-byte-aligned pointers and advances them by four floats - confirm.
+ */
+long b3_mindot_large_v0(const float *vv, const float *vec, unsigned long count, float *dotResult)
+{
+	unsigned long i = 0;
+	float32x4_t vvec = vld1q_f32_aligned_postincrement(vec);
+	float32x2_t vLo = vget_low_f32(vvec);  // (vec.x, vec.y)
+	float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0);  // (vec.z, vec.z)
+	float32x2_t dotMinLo = (float32x2_t){B3_INFINITY, B3_INFINITY};  // running minima for vectors 0 and 1 of each group of four
+	float32x2_t dotMinHi = (float32x2_t){B3_INFINITY, B3_INFINITY};  // running minima for vectors 2 and 3 of each group of four
+	uint32x2_t indexLo = (uint32x2_t){0, 1};  // indices of the vectors currently compared in the Lo lanes
+	uint32x2_t indexHi = (uint32x2_t){2, 3};  // indices of the vectors currently compared in the Hi lanes
+	uint32x2_t iLo = (uint32x2_t){-1, -1};  // indices of the current minima; all-ones until a lane is first updated
+	uint32x2_t iHi = (uint32x2_t){-1, -1};
+	const uint32x2_t four = (uint32x2_t){4, 4};
+	// Main loop: eight vectors per iteration, processed as two groups of four.
+	for (; i + 8 <= count; i += 8)
+	{
+		float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
+		float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
+		float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
+		float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
+		// x*vec.x and y*vec.y of each vector, still in separate lanes
+		float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
+		float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
+		float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
+		float32x2_t xy3 = vmul_f32(vget_low_f32(v3), vLo);
+		// gather the z components pairwise, (z0, z1) and (z2, z3), and scale them by vec.z
+		float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
+		float32x2x2_t z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
+		float32x2_t zLo = vmul_f32(z0.val[0], vHi);
+		float32x2_t zHi = vmul_f32(z1.val[0], vHi);
+		// pairwise add gives x*vec.x + y*vec.y per vector; adding the z term completes the dot products
+		float32x2_t rLo = vpadd_f32(xy0, xy1);
+		float32x2_t rHi = vpadd_f32(xy2, xy3);
+		rLo = vadd_f32(rLo, zLo);
+		rHi = vadd_f32(rHi, zHi);
+		// keep each lane's smaller value together with the index it came from
+		uint32x2_t maskLo = vclt_f32(rLo, dotMinLo);
+		uint32x2_t maskHi = vclt_f32(rHi, dotMinHi);
+		dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
+		dotMinHi = vbsl_f32(maskHi, rHi, dotMinHi);
+		iLo = vbsl_u32(maskLo, indexLo, iLo);
+		iHi = vbsl_u32(maskHi, indexHi, iHi);
+		indexLo = vadd_u32(indexLo, four);
+		indexHi = vadd_u32(indexHi, four);
+		// second group of four: same steps as above
+		v0 = vld1q_f32_aligned_postincrement(vv);
+		v1 = vld1q_f32_aligned_postincrement(vv);
+		v2 = vld1q_f32_aligned_postincrement(vv);
+		v3 = vld1q_f32_aligned_postincrement(vv);
+
+		xy0 = vmul_f32(vget_low_f32(v0), vLo);
+		xy1 = vmul_f32(vget_low_f32(v1), vLo);
+		xy2 = vmul_f32(vget_low_f32(v2), vLo);
+		xy3 = vmul_f32(vget_low_f32(v3), vLo);
+
+		z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
+		z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
+		zLo = vmul_f32(z0.val[0], vHi);
+		zHi = vmul_f32(z1.val[0], vHi);
+
+		rLo = vpadd_f32(xy0, xy1);
+		rHi = vpadd_f32(xy2, xy3);
+		rLo = vadd_f32(rLo, zLo);
+		rHi = vadd_f32(rHi, zHi);
+
+		maskLo = vclt_f32(rLo, dotMinLo);
+		maskHi = vclt_f32(rHi, dotMinHi);
+		dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
+		dotMinHi = vbsl_f32(maskHi, rHi, dotMinHi);
+		iLo = vbsl_u32(maskLo, indexLo, iLo);
+		iHi = vbsl_u32(maskHi, indexHi, iHi);
+		indexLo = vadd_u32(indexLo, four);
+		indexHi = vadd_u32(indexHi, four);
+	}
+	// At most one full group of four vectors is left at this point.
+	for (; i + 4 <= count; i += 4)
+	{
+		float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
+		float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
+		float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
+		float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
+
+		float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
+		float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
+		float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
+		float32x2_t xy3 = vmul_f32(vget_low_f32(v3), vLo);
+
+		float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
+		float32x2x2_t z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
+		float32x2_t zLo = vmul_f32(z0.val[0], vHi);
+		float32x2_t zHi = vmul_f32(z1.val[0], vHi);
+
+		float32x2_t rLo = vpadd_f32(xy0, xy1);
+		float32x2_t rHi = vpadd_f32(xy2, xy3);
+		rLo = vadd_f32(rLo, zLo);
+		rHi = vadd_f32(rHi, zHi);
+
+		uint32x2_t maskLo = vclt_f32(rLo, dotMinLo);
+		uint32x2_t maskHi = vclt_f32(rHi, dotMinHi);
+		dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
+		dotMinHi = vbsl_f32(maskHi, rHi, dotMinHi);
+		iLo = vbsl_u32(maskLo, indexLo, iLo);
+		iHi = vbsl_u32(maskHi, indexHi, iHi);
+		indexLo = vadd_u32(indexLo, four);
+		indexHi = vadd_u32(indexHi, four);
+	}
+	switch (count & 3)  // 1 to 3 trailing vectors
+	{
+		case 3:
+		{
+			float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
+			float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
+			float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
+
+			float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
+			float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
+			float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
+
+			float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
+			float32x2_t zLo = vmul_f32(z0.val[0], vHi);
+			float32x2_t zHi = vmul_f32(vdup_lane_f32(vget_high_f32(v2), 0), vHi);
+			// the third vector fills both Hi lanes, so its dot product appears twice in rHi
+			float32x2_t rLo = vpadd_f32(xy0, xy1);
+			float32x2_t rHi = vpadd_f32(xy2, xy2);
+			rLo = vadd_f32(rLo, zLo);
+			rHi = vadd_f32(rHi, zHi);
+
+			uint32x2_t maskLo = vclt_f32(rLo, dotMinLo);
+			uint32x2_t maskHi = vclt_f32(rHi, dotMinHi);
+			dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
+			dotMinHi = vbsl_f32(maskHi, rHi, dotMinHi);
+			iLo = vbsl_u32(maskLo, indexLo, iLo);
+			iHi = vbsl_u32(maskHi, indexHi, iHi);
+		}
+		break;
+		case 2:
+		{
+			float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
+			float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
+
+			float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
+			float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
+
+			float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
+			float32x2_t zLo = vmul_f32(z0.val[0], vHi);
+
+			float32x2_t rLo = vpadd_f32(xy0, xy1);
+			rLo = vadd_f32(rLo, zLo);
+
+			uint32x2_t maskLo = vclt_f32(rLo, dotMinLo);
+			dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
+			iLo = vbsl_u32(maskLo, indexLo, iLo);
+		}
+		break;
+		case 1:
+		{
+			float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
+			float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
+			float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0);
+			float32x2_t zLo = vmul_f32(z0, vHi);
+			float32x2_t rLo = vpadd_f32(xy0, xy0);  // the single vector's dot product lands in both lanes
+			rLo = vadd_f32(rLo, zLo);
+			uint32x2_t maskLo = vclt_f32(rLo, dotMinLo);
+			dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
+			iLo = vbsl_u32(maskLo, indexLo, iLo);
+		}
+		break;
+
+		default:
+			break;
+	}
+
+	// select best answer between hi and lo results
+	uint32x2_t mask = vclt_f32(dotMinHi, dotMinLo);
+	dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
+	iLo = vbsl_u32(mask, iHi, iLo);
+
+	// select best answer between even and odd results
+	dotMinHi = vdup_lane_f32(dotMinLo, 1);
+	iHi = vdup_lane_u32(iLo, 1);
+	mask = vclt_f32(dotMinHi, dotMinLo);
+	dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
+	iLo = vbsl_u32(mask, iHi, iLo);
+	// lane 0 now carries the overall minimum and the index stored with it
+	*dotResult = vget_lane_f32(dotMinLo, 0);
+	return vget_lane_u32(iLo, 0);
+}
+/* Finds the vector with the smallest dot product against `vec` (NEON, 4-lane registers).
+ *
+ * vv        - input vectors; each vector is fetched as one 4-float load of which only the
+ *             first three floats (x, y, z) enter the dot product.
+ * vec       - query vector, fetched with the same 4-float load; only x, y, z are used.
+ * count     - number of vectors to scan.
+ * dotResult - receives the smallest dot product found (B3_INFINITY when count is 0).
+ *
+ * Returns the index stored with the winning dot product. Four dot products are evaluated
+ * per step, one per lane, and each lane keeps its own running minimum; the lanes are
+ * reduced to a single answer at the end. Comparisons are strict, so an equal dot product
+ * never replaces a candidate a lane already holds. When count is 0 no lane is ever
+ * updated and the initial all-ones index value is returned.
+ * NOTE(review): vld1q_f32_aligned_postincrement is defined outside this view; it
+ * presumably needs 16-byte-aligned pointers and advances them by four floats - confirm.
+ */
+long b3_mindot_large_v1(const float *vv, const float *vec, unsigned long count, float *dotResult)
+{
+	float32x4_t vvec = vld1q_f32_aligned_postincrement(vec);
+	float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec));  // (vec.x, vec.y, vec.x, vec.y)
+	float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0);  // vec.z in every lane
+	const uint32x4_t four = (uint32x4_t){4, 4, 4, 4};
+	uint32x4_t local_index = (uint32x4_t){0, 1, 2, 3};  // indices of the four vectors currently being compared
+	uint32x4_t index = (uint32x4_t){-1, -1, -1, -1};  // indices of the per-lane minima; all-ones until first updated
+	float32x4_t minDot = (float32x4_t){B3_INFINITY, B3_INFINITY, B3_INFINITY, B3_INFINITY};
+	// Main loop below: eight vectors per iteration, processed as two groups of four.
+	unsigned long i = 0;
+	for (; i + 8 <= count; i += 8)
+	{
+		float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
+		float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
+		float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
+		float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
+		// regroup the four loads into their (x, y) halves and their (z, w) halves
+		// the next two lines should resolve to a single vswp d, d
+		float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
+		float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));
+		// the next two lines should resolve to a single vswp d, d
+		float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
+		float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));
+		// multiply the interleaved (x, y) pairs by (vec.x, vec.y)
+		xy0 = vmulq_f32(xy0, vLo);
+		xy1 = vmulq_f32(xy1, vLo);
+		// unzip: even lanes of the (z, w) halves are the four z values; even/odd lanes of xy are the x and y products
+		float32x4x2_t zb = vuzpq_f32(z0, z1);
+		float32x4_t z = vmulq_f32(zb.val[0], vHi);
+		float32x4x2_t xy = vuzpq_f32(xy0, xy1);
+		float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
+		x = vaddq_f32(x, z);
+		// x now holds four complete dot products; keep the smaller value and its index per lane
+		uint32x4_t mask = vcltq_f32(x, minDot);
+		minDot = vbslq_f32(mask, x, minDot);
+		index = vbslq_u32(mask, local_index, index);
+		local_index = vaddq_u32(local_index, four);
+		// second group of four: same steps as above
+		v0 = vld1q_f32_aligned_postincrement(vv);
+		v1 = vld1q_f32_aligned_postincrement(vv);
+		v2 = vld1q_f32_aligned_postincrement(vv);
+		v3 = vld1q_f32_aligned_postincrement(vv);
+
+		// the next two lines should resolve to a single vswp d, d
+		xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
+		xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));
+		// the next two lines should resolve to a single vswp d, d
+		z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
+		z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));
+
+		xy0 = vmulq_f32(xy0, vLo);
+		xy1 = vmulq_f32(xy1, vLo);
+
+		zb = vuzpq_f32(z0, z1);
+		z = vmulq_f32(zb.val[0], vHi);
+		xy = vuzpq_f32(xy0, xy1);
+		x = vaddq_f32(xy.val[0], xy.val[1]);
+		x = vaddq_f32(x, z);
+
+		mask = vcltq_f32(x, minDot);
+		minDot = vbslq_f32(mask, x, minDot);
+		index = vbslq_u32(mask, local_index, index);
+		local_index = vaddq_u32(local_index, four);
+	}
+	// At most one full group of four vectors is left at this point.
+	for (; i + 4 <= count; i += 4)
+	{
+		float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
+		float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
+		float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
+		float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
+
+		// the next two lines should resolve to a single vswp d, d
+		float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
+		float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));
+		// the next two lines should resolve to a single vswp d, d
+		float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
+		float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));
+
+		xy0 = vmulq_f32(xy0, vLo);
+		xy1 = vmulq_f32(xy1, vLo);
+
+		float32x4x2_t zb = vuzpq_f32(z0, z1);
+		float32x4_t z = vmulq_f32(zb.val[0], vHi);
+		float32x4x2_t xy = vuzpq_f32(xy0, xy1);
+		float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
+		x = vaddq_f32(x, z);
+
+		uint32x4_t mask = vcltq_f32(x, minDot);
+		minDot = vbslq_f32(mask, x, minDot);
+		index = vbslq_u32(mask, local_index, index);
+		local_index = vaddq_u32(local_index, four);
+	}
+
+	switch (count & 3)  // 1 to 3 trailing vectors
+	{
+		case 3:
+		{
+			float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
+			float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
+			float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
+			// the third vector is duplicated into the fourth lane
+			// the next two lines should resolve to a single vswp d, d
+			float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
+			float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v2));
+			// the next two lines should resolve to a single vswp d, d
+			float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
+			float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v2));
+
+			xy0 = vmulq_f32(xy0, vLo);
+			xy1 = vmulq_f32(xy1, vLo);
+
+			float32x4x2_t zb = vuzpq_f32(z0, z1);
+			float32x4_t z = vmulq_f32(zb.val[0], vHi);
+			float32x4x2_t xy = vuzpq_f32(xy0, xy1);
+			float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
+			x = vaddq_f32(x, z);
+
+			uint32x4_t mask = vcltq_f32(x, minDot);
+			minDot = vbslq_f32(mask, x, minDot);
+			index = vbslq_u32(mask, local_index, index);
+			local_index = vaddq_u32(local_index, four);
+		}
+		break;
+
+		case 2:
+		{
+			float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
+			float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
+			// the two vectors are repeated so that they fill all four lanes
+			// the next two lines should resolve to a single vswp d, d
+			float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
+			// the next two lines should resolve to a single vswp d, d
+			float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
+
+			xy0 = vmulq_f32(xy0, vLo);
+
+			float32x4x2_t zb = vuzpq_f32(z0, z0);
+			float32x4_t z = vmulq_f32(zb.val[0], vHi);
+			float32x4x2_t xy = vuzpq_f32(xy0, xy0);
+			float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
+			x = vaddq_f32(x, z);
+
+			uint32x4_t mask = vcltq_f32(x, minDot);
+			minDot = vbslq_f32(mask, x, minDot);
+			index = vbslq_u32(mask, local_index, index);
+			local_index = vaddq_u32(local_index, four);
+		}
+		break;
+
+		case 1:
+		{
+			float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
+			// the single vector is replicated into every lane
+			// the next two lines should resolve to a single vswp d, d
+			float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v0));
+			// the next two lines should resolve to a single vswp d, d
+			float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0);
+
+			xy0 = vmulq_f32(xy0, vLo);
+
+			z = vmulq_f32(z, vHi);
+			float32x4x2_t xy = vuzpq_f32(xy0, xy0);
+			float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
+			x = vaddq_f32(x, z);
+
+			uint32x4_t mask = vcltq_f32(x, minDot);
+			minDot = vbslq_f32(mask, x, minDot);
+			index = vbslq_u32(mask, local_index, index);
+			local_index = vaddq_u32(local_index, four);
+		}
+		break;
+
+		default:
+			break;
+	}
+
+	// select best answer between hi and lo results
+	uint32x2_t mask = vclt_f32(vget_high_f32(minDot), vget_low_f32(minDot));
+	float32x2_t minDot2 = vbsl_f32(mask, vget_high_f32(minDot), vget_low_f32(minDot));
+	uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));
+
+	// select best answer between even and odd results
+	float32x2_t minDotO = vdup_lane_f32(minDot2, 1);
+	uint32x2_t indexHi = vdup_lane_u32(index2, 1);
+	mask = vclt_f32(minDotO, minDot2);
+	minDot2 = vbsl_f32(mask, minDotO, minDot2);
+	index2 = vbsl_u32(mask, indexHi, index2);
+	// lane 0 now carries the overall minimum and the index stored with it
+	*dotResult = vget_lane_f32(minDot2, 0);
+	return vget_lane_u32(index2, 0);
+}
+
+#else
+#error Unhandled __APPLE__ arch
+#endif
+
+#endif /* __APPLE__ */

+ 1303 - 0
Dependencies/include/bullet3/Bullet3Common/b3Vector3.h

@@ -0,0 +1,1303 @@
+/*
+Copyright (c) 2003-2013 Gino van den Bergen / Erwin Coumans  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef B3_VECTOR3_H
+#define B3_VECTOR3_H
+
+//#include <stdint.h>
+#include "b3Scalar.h"
+#include "b3MinMax.h"
+#include "b3AlignedAllocator.h"
+
+#ifdef B3_USE_DOUBLE_PRECISION
+#define b3Vector3Data b3Vector3DoubleData
+#define b3Vector3DataName "b3Vector3DoubleData"
+#else
+#define b3Vector3Data b3Vector3FloatData
+#define b3Vector3DataName "b3Vector3FloatData"
+#endif  //B3_USE_DOUBLE_PRECISION
+
+#if defined B3_USE_SSE
+// SSE-only shuffle helpers and constant vectors used by b3Vector3 below.
+//typedef  uint32_t __m128i __attribute__ ((vector_size(16)));
+
+#ifdef _MSC_VER
+#pragma warning(disable : 4556)  // value of intrinsic immediate argument '4294967239' is out of range '0 - 255'
+#endif
+// B3_SHUFFLE packs four 2-bit lane selectors (x in bits 0-1 up to w in bits 6-7) into one byte; the b3_*_ps macros shuffle a register with itself.
+#define B3_SHUFFLE(x, y, z, w) (((w) << 6 | (z) << 4 | (y) << 2 | (x)) & 0xff)
+//#define b3_pshufd_ps( _a, _mask ) (__m128) _mm_shuffle_epi32((__m128i)(_a), (_mask) )
+#define b3_pshufd_ps(_a, _mask) _mm_shuffle_ps((_a), (_a), (_mask))
+#define b3_splat3_ps(_a, _i) b3_pshufd_ps((_a), B3_SHUFFLE(_i, _i, _i, 3))
+#define b3_splat_ps(_a, _i) b3_pshufd_ps((_a), B3_SHUFFLE(_i, _i, _i, _i))
+// Integer bit masks (_mm_set_epi32 lists its arguments from the highest lane down) and the same masks passed through b3CastiTo128f, which is defined elsewhere.
+#define b3v3AbsiMask (_mm_set_epi32(0x00000000, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF))
+#define b3vAbsMask (_mm_set_epi32(0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF))
+#define b3vFFF0Mask (_mm_set_epi32(0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF))
+#define b3v3AbsfMask b3CastiTo128f(b3v3AbsiMask)
+#define b3vFFF0fMask b3CastiTo128f(b3vFFF0Mask)
+#define b3vxyzMaskf b3vFFF0fMask
+#define b3vAbsfMask b3CastiTo128f(b3vAbsMask)
+// Constant float vectors; b3v1_5 and b3vHalf feed the reciprocal-square-root refinement in b3Vector3::normalize().
+const __m128 B3_ATTRIBUTE_ALIGNED16(b3vMzeroMask) = {-0.0f, -0.0f, -0.0f, -0.0f};
+const __m128 B3_ATTRIBUTE_ALIGNED16(b3v1110) = {1.0f, 1.0f, 1.0f, 0.0f};
+const __m128 B3_ATTRIBUTE_ALIGNED16(b3vHalf) = {0.5f, 0.5f, 0.5f, 0.5f};
+const __m128 B3_ATTRIBUTE_ALIGNED16(b3v1_5) = {1.5f, 1.5f, 1.5f, 1.5f};
+
+#endif
+
+#ifdef B3_USE_NEON
+// NEON constant vectors: a negative-zero vector and integer bit masks; b3vFFF0Mask has a zero w lane and is used by b3Vector3::cross() to clear w.
+const float32x4_t B3_ATTRIBUTE_ALIGNED16(b3vMzeroMask) = (float32x4_t){-0.0f, -0.0f, -0.0f, -0.0f};
+const int32x4_t B3_ATTRIBUTE_ALIGNED16(b3vFFF0Mask) = (int32x4_t){0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x0};
+const int32x4_t B3_ATTRIBUTE_ALIGNED16(b3vAbsMask) = (int32x4_t){0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF};
+const int32x4_t B3_ATTRIBUTE_ALIGNED16(b3v3AbsMask) = (int32x4_t){0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x0};
+
+#endif
+
+class b3Vector3;
+class b3Vector4;
+
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
+//#if defined (B3_USE_SSE) || defined (B3_USE_NEON)
+inline b3Vector3 b3MakeVector3(b3SimdFloat4 v);
+inline b3Vector4 b3MakeVector4(b3SimdFloat4 vec);
+#endif
+
+inline b3Vector3 b3MakeVector3(b3Scalar x, b3Scalar y, b3Scalar z);
+inline b3Vector3 b3MakeVector3(b3Scalar x, b3Scalar y, b3Scalar z, b3Scalar w);
+inline b3Vector4 b3MakeVector4(b3Scalar x, b3Scalar y, b3Scalar z, b3Scalar w);
+
+/**@brief b3Vector3 can be used to represent 3D points and vectors.
+ * It has an un-used w component to suit 16-byte alignment when b3Vector3 is stored in containers. This extra component can be used by derived classes (Quaternion?) or by user
+ * Ideally, this class should be replaced by a platform optimized SIMD version that keeps the data in registers
+ */
+B3_ATTRIBUTE_ALIGNED16(class)
+b3Vector3
+{
+public:
+#if defined(B3_USE_SSE) || defined(B3_USE_NEON)  // _WIN32 || ARM
+	union {
+		b3SimdFloat4 mVec128;
+		float m_floats[4];
+		struct
+		{
+			float x, y, z, w;
+		};
+	};
+#else
+	union {
+		float m_floats[4];
+		struct
+		{
+			float x, y, z, w;
+		};
+	};
+#endif
+
+public:
+	B3_DECLARE_ALIGNED_ALLOCATOR();
+
+#if defined(B3_USE_SSE) || defined(B3_USE_NEON)  // _WIN32 || ARM
+
+	/*B3_FORCE_INLINE		b3Vector3()
+	{
+	}
+	*/
+
+	/**@brief Return the four components as one SIMD register value */
+	B3_FORCE_INLINE b3SimdFloat4 get128() const
+	{
+		return this->mVec128;
+	}
+	/**@brief Overwrite all four components from one SIMD register value
+	 * @param v128 The register value to store */
+	B3_FORCE_INLINE void set128(b3SimdFloat4 v128)
+	{
+		this->mVec128 = v128;
+	}
+#endif
+
+public:
+	/**@brief Component-wise in-place addition
+	 * @param v The vector whose components are added to this one
+	 * @return Reference to this vector after the addition */
+	B3_FORCE_INLINE b3Vector3& operator+=(const b3Vector3& v)
+	{
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
+		// a single packed add covers all four lanes
+		mVec128 = _mm_add_ps(mVec128, v.mVec128);
+#elif defined(B3_USE_NEON)
+		mVec128 = vaddq_f32(mVec128, v.mVec128);
+#else
+		// scalar fallback: only x, y and z are touched, w is left alone
+		for (int axis = 0; axis < 3; ++axis)
+		{
+			m_floats[axis] += v.m_floats[axis];
+		}
+#endif
+		return *this;
+	}
+
+	/**@brief Component-wise in-place subtraction
+	 * @param v The vector whose components are subtracted from this one
+	 * @return Reference to this vector after the subtraction */
+	B3_FORCE_INLINE b3Vector3& operator-=(const b3Vector3& v)
+	{
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
+		// a single packed subtract covers all four lanes
+		mVec128 = _mm_sub_ps(mVec128, v.mVec128);
+#elif defined(B3_USE_NEON)
+		mVec128 = vsubq_f32(mVec128, v.mVec128);
+#else
+		// scalar fallback: only x, y and z are touched, w is left alone
+		for (int axis = 0; axis < 3; ++axis)
+		{
+			m_floats[axis] -= v.m_floats[axis];
+		}
+#endif
+		return *this;
+	}
+
+	/**@brief Multiply the vector by a scalar in place
+	 * @param s The factor applied to the components
+	 * @return Reference to this vector after scaling */
+	B3_FORCE_INLINE b3Vector3& operator*=(const b3Scalar& s)
+	{
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
+		__m128 factor = _mm_load_ss(&s);      // (s 0 0 0)
+		factor = b3_pshufd_ps(factor, 0x80);  // (s s s 0), so w is multiplied by zero
+		mVec128 = _mm_mul_ps(mVec128, factor);
+#elif defined(B3_USE_NEON)
+		// multiplies all four lanes, w included
+		mVec128 = vmulq_n_f32(mVec128, s);
+#else
+		// scalar fallback: only x, y and z are touched, w is left alone
+		for (int axis = 0; axis < 3; ++axis)
+		{
+			m_floats[axis] *= s;
+		}
+#endif
+		return *this;
+	}
+
+	/**@brief Divide the vector by a scalar in place
+	 * @param s The divisor; asserted to be non-zero via b3FullAssert
+	 * @return Reference to this vector after scaling */
+	B3_FORCE_INLINE b3Vector3& operator/=(const b3Scalar& s)
+	{
+		b3FullAssert(s != b3Scalar(0.0));
+
+		// The division is done once and then forwarded to operator*=.
+		// (A dedicated SSE variant used to sit here, disabled, with the remark that it was not faster.)
+		const b3Scalar reciprocal = b3Scalar(1.0) / s;
+		return (*this) *= reciprocal;
+	}
+
+	/**@brief Compute the three-component dot product with another vector
+	 * @param v The second operand of the dot product
+	 * @return x*v.x + y*v.y + z*v.z (w does not take part) */
+	B3_FORCE_INLINE b3Scalar dot(const b3Vector3& v) const
+	{
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
+		__m128 products = _mm_mul_ps(mVec128, v.mVec128);
+		__m128 zTerm = _mm_movehl_ps(products, products);         // z product moved to lane 0
+		__m128 yTerm = _mm_shuffle_ps(products, products, 0x55);  // y product in every lane
+		products = _mm_add_ss(products, yTerm);
+		products = _mm_add_ss(products, zTerm);
+		return _mm_cvtss_f32(products);
+#elif defined(B3_USE_NEON)
+		float32x4_t products = vmulq_f32(mVec128, v.mVec128);
+		// pairwise add of the low half yields x + y, then the high half adds z in lane 0
+		float32x2_t sum = vpadd_f32(vget_low_f32(products), vget_low_f32(products));
+		sum = vadd_f32(sum, vget_high_f32(products));
+		return vget_lane_f32(sum, 0);
+#else
+		return m_floats[0] * v.m_floats[0] + m_floats[1] * v.m_floats[1] + m_floats[2] * v.m_floats[2];
+#endif
+	}
+
+	/**@brief Squared length, i.e. the dot product of the vector with itself */
+	B3_FORCE_INLINE b3Scalar length2() const
+	{
+		const b3Vector3& self = *this;
+		return self.dot(self);
+	}
+
+	/**@brief Euclidean length: the square root of length2() */
+	B3_FORCE_INLINE b3Scalar length() const
+	{
+		const b3Scalar squared = length2();
+		return b3Sqrt(squared);
+	}
+
+	/**@brief Return the distance squared between the ends of this and another vector
+   * This is semantically treating the vector like a point */
+	B3_FORCE_INLINE b3Scalar distance2(const b3Vector3& v) const;
+
+	/**@brief Return the distance between the ends of this and another vector
+   * This is semantically treating the vector like a point */
+	B3_FORCE_INLINE b3Scalar distance(const b3Vector3& v) const;
+
+	/**@brief Normalize in place, falling back to (1, 0, 0) for near-zero vectors
+	 * @return Reference to this vector */
+	B3_FORCE_INLINE b3Vector3& safeNormalize()
+	{
+		const b3Scalar lengthSquared = length2();
+
+		// Written as a negated >= so that a NaN length also takes the fallback branch.
+		if (!(lengthSquared >= B3_EPSILON * B3_EPSILON))
+		{
+			setValue(1, 0, 0);
+			return *this;
+		}
+
+		(*this) /= b3Sqrt(lengthSquared);
+		return *this;
+	}
+
+	/**@brief Scale this vector in place so that x^2 + y^2 + z^2 = 1
+	 * The SSE path multiplies by a refined reciprocal-square-root estimate;
+	 * every other build divides by length().
+	 * @return Reference to this vector */
+	B3_FORCE_INLINE b3Vector3& normalize()
+	{
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
+		// squared length accumulated in lane 0
+		__m128 acc = _mm_mul_ps(mVec128, mVec128);
+		__m128 corr = _mm_movehl_ps(acc, acc);
+		__m128 est = _mm_shuffle_ps(acc, acc, 0x55);
+		acc = _mm_add_ss(acc, est);
+		acc = _mm_add_ss(acc, corr);
+
+		// initial estimate y0 of 1/sqrt(len2)
+		est = _mm_rsqrt_ss(acc);
+
+		// one Newton-Raphson step: y1 = y0 * (1.5 - 0.5 * len2 * y0 * y0)
+		corr = b3v1_5;
+		acc = _mm_mul_ss(acc, b3vHalf);
+		acc = _mm_mul_ss(acc, est);
+		acc = _mm_mul_ss(acc, est);
+		corr = _mm_sub_ss(corr, acc);
+		est = _mm_mul_ss(est, corr);
+
+		// spread the refined factor over the lanes and apply it
+		est = b3_splat_ps(est, 0x80);
+		mVec128 = _mm_mul_ps(mVec128, est);
+
+		return *this;
+#else
+		return *this /= length();
+#endif
+	}
+
+	/**@brief Return a normalized version of this vector */
+	B3_FORCE_INLINE b3Vector3 normalized() const;
+
+	/**@brief Return a rotated version of this vector
+   * @param wAxis The axis to rotate about
+   * @param angle The angle to rotate by */
+	B3_FORCE_INLINE b3Vector3 rotate(const b3Vector3& wAxis, const b3Scalar angle) const;
+
+	/**@brief Angle between this vector and another one
+	 * @param v The other vector
+	 * @return b3Acos of the dot product divided by the product of both lengths */
+	B3_FORCE_INLINE b3Scalar angle(const b3Vector3& v) const
+	{
+		const b3Scalar lengthProduct = b3Sqrt(length2() * v.length2());
+		b3FullAssert(lengthProduct != b3Scalar(0.0));
+
+		const b3Scalar cosine = dot(v) / lengthProduct;
+		return b3Acos(cosine);
+	}
+
+	/**@brief Return a vector with the absolute values of each element */
+	B3_FORCE_INLINE b3Vector3 absolute() const
+	{
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
+		// masking with b3v3AbsfMask drops the sign bits of x, y, z and zeroes w
+		const __m128 magnitudes = _mm_and_ps(mVec128, b3v3AbsfMask);
+		return b3MakeVector3(magnitudes);
+#elif defined(B3_USE_NEON)
+		return b3Vector3(vabsq_f32(mVec128));
+#else
+		const b3Scalar absX = b3Fabs(m_floats[0]);
+		const b3Scalar absY = b3Fabs(m_floats[1]);
+		const b3Scalar absZ = b3Fabs(m_floats[2]);
+		return b3MakeVector3(absX, absY, absZ);
+#endif
+	}
+
+	/**@brief Cross product of this vector with another one
+	 * @param v The right-hand operand
+	 * @return this x v */
+	B3_FORCE_INLINE b3Vector3 cross(const b3Vector3& v) const
+	{
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
+		// both operands rotated to (Y Z X W)
+		const __m128 selfRotated = b3_pshufd_ps(mVec128, B3_SHUFFLE(1, 2, 0, 3));
+		const __m128 otherRotated = b3_pshufd_ps(v.mVec128, B3_SHUFFLE(1, 2, 0, 3));
+
+		const __m128 forward = _mm_mul_ps(otherRotated, mVec128);
+		const __m128 backward = _mm_mul_ps(selfRotated, v.mVec128);
+		__m128 result = _mm_sub_ps(forward, backward);
+
+		// one more rotation puts the components into x, y, z order
+		result = b3_pshufd_ps(result, B3_SHUFFLE(1, 2, 0, 3));
+		return b3MakeVector3(result);
+#elif defined(B3_USE_NEON)
+		// both operands rotated to (Y, Z, X, _)
+		const float32x2_t selfLow = vget_low_f32(mVec128);
+		const float32x2_t otherLow = vget_low_f32(v.mVec128);
+		const float32x4_t selfRotated = vcombine_f32(vext_f32(selfLow, vget_high_f32(mVec128), 1), selfLow);
+		const float32x4_t otherRotated = vcombine_f32(vext_f32(otherLow, vget_high_f32(v.mVec128), 1), otherLow);
+
+		const float32x4_t forward = vmulq_f32(otherRotated, mVec128);
+		const float32x4_t backward = vmulq_f32(selfRotated, v.mVec128);
+		float32x4_t result = vsubq_f32(forward, backward);
+
+		// rotate once more to (Y, Z, X, _) and clear the w lane
+		const float32x2_t resultLow = vget_low_f32(result);
+		result = vcombine_f32(vext_f32(resultLow, vget_high_f32(result), 1), resultLow);
+		result = (float32x4_t)vandq_s32((int32x4_t)result, b3vFFF0Mask);
+
+		return b3Vector3(result);
+#else
+		return b3MakeVector3(
+			m_floats[1] * v.m_floats[2] - m_floats[2] * v.m_floats[1],
+			m_floats[2] * v.m_floats[0] - m_floats[0] * v.m_floats[2],
+			m_floats[0] * v.m_floats[1] - m_floats[1] * v.m_floats[0]);
+#endif
+	}
+
+	/**@brief Scalar triple product: this . (v1 x v2)
+	 * @param v1 Left operand of the cross product
+	 * @param v2 Right operand of the cross product */
+	B3_FORCE_INLINE b3Scalar triple(const b3Vector3& v1, const b3Vector3& v2) const
+	{
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
+		// cross product of v1 and v2, operands rotated to (Y Z X W)
+		const __m128 firstRotated = _mm_shuffle_ps(v1.mVec128, v1.mVec128, B3_SHUFFLE(1, 2, 0, 3));
+		const __m128 secondRotated = _mm_shuffle_ps(v2.mVec128, v2.mVec128, B3_SHUFFLE(1, 2, 0, 3));
+
+		const __m128 forward = _mm_mul_ps(secondRotated, v1.mVec128);
+		const __m128 backward = _mm_mul_ps(firstRotated, v2.mVec128);
+		__m128 acc = _mm_sub_ps(forward, backward);
+
+		acc = _mm_shuffle_ps(acc, acc, B3_SHUFFLE(1, 2, 0, 3));
+
+		// dot product of that cross product with this vector
+		acc = _mm_mul_ps(acc, mVec128);
+		const __m128 zTerm = _mm_movehl_ps(acc, acc);
+		const __m128 yTerm = _mm_shuffle_ps(acc, acc, 0x55);
+		acc = _mm_add_ss(acc, yTerm);
+		acc = _mm_add_ss(acc, zTerm);
+		return _mm_cvtss_f32(acc);
+
+#elif defined(B3_USE_NEON)
+		// cross product of v1 and v2, operands rotated to (Y, Z, X, _)
+		const float32x2_t firstLow = vget_low_f32(v1.mVec128);
+		const float32x2_t secondLow = vget_low_f32(v2.mVec128);
+		const float32x4_t firstRotated = vcombine_f32(vext_f32(firstLow, vget_high_f32(v1.mVec128), 1), firstLow);
+		const float32x4_t secondRotated = vcombine_f32(vext_f32(secondLow, vget_high_f32(v2.mVec128), 1), secondLow);
+
+		const float32x4_t forward = vmulq_f32(secondRotated, v1.mVec128);
+		const float32x4_t backward = vmulq_f32(firstRotated, v2.mVec128);
+		float32x4_t acc = vsubq_f32(forward, backward);
+		const float32x2_t accLow = vget_low_f32(acc);
+		// rotate once more to (Y, Z, X, _)
+		acc = vcombine_f32(vext_f32(accLow, vget_high_f32(acc), 1), accLow);
+
+		// dot product of that cross product with this vector
+		acc = vmulq_f32(mVec128, acc);
+		float32x2_t sum = vpadd_f32(vget_low_f32(acc), vget_low_f32(acc));
+		sum = vadd_f32(sum, vget_high_f32(acc));
+		return vget_lane_f32(sum, 0);
+#else
+		return m_floats[0] * (v1.m_floats[1] * v2.m_floats[2] - v1.m_floats[2] * v2.m_floats[1]) +
+			   m_floats[1] * (v1.m_floats[2] * v2.m_floats[0] - v1.m_floats[0] * v2.m_floats[2]) +
+			   m_floats[2] * (v1.m_floats[0] * v2.m_floats[1] - v1.m_floats[1] * v2.m_floats[0]);
+#endif
+	}
+
+	/**@brief Index of the component with the smallest value
+	 * @return 0, 1 or 2 for x, y or z */
+	B3_FORCE_INLINE int minAxis() const
+	{
+		if (m_floats[0] < m_floats[1])
+		{
+			// x beats y, so the answer is x or z
+			return m_floats[0] < m_floats[2] ? 0 : 2;
+		}
+		// otherwise the answer is y or z
+		return m_floats[1] < m_floats[2] ? 1 : 2;
+	}
+
+	/**@brief Index of the component with the largest value
+	 * @return 0, 1 or 2 for x, y or z */
+	B3_FORCE_INLINE int maxAxis() const
+	{
+		if (m_floats[0] < m_floats[1])
+		{
+			// y beats x, so the answer is y or z
+			return m_floats[1] < m_floats[2] ? 2 : 1;
+		}
+		// otherwise the answer is x or z
+		return m_floats[0] < m_floats[2] ? 2 : 0;
+	}
+
+	/**@brief Return minAxis() of absolute(), i.e. the index (0, 1 or 2) of the component with the smallest magnitude
+   * NOTE(review): absolute() is declared outside this view; presumably it returns the componentwise absolute value */
+	B3_FORCE_INLINE int furthestAxis() const
+	{
+		return absolute().minAxis();
+	}
+
+	/**@brief Return maxAxis() of absolute(), i.e. the index (0, 1 or 2) of the component with the largest magnitude
+   * NOTE(review): absolute() is declared outside this view; presumably it returns the componentwise absolute value */
+	B3_FORCE_INLINE int closestAxis() const
+	{
+		return absolute().maxAxis();
+	}
+
+	/**@brief Set this vector to the linear interpolation between v0 and v1
+   * @param v0 Value taken when rt = 0
+   * @param v1 Value taken when rt = 1
+   * @param rt Interpolation ratio
+   * The SSE and scalar branches evaluate (1 - rt) * v0 + rt * v1; the NEON branch evaluates v0 + (v1 - v0) * rt.
+   * The scalar branch leaves the w component untouched. */
+	B3_FORCE_INLINE void setInterpolate3(const b3Vector3& v0, const b3Vector3& v1, b3Scalar rt)
+	{
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
+		__m128 vrt = _mm_load_ss(&rt);  //	(rt 0 0 0)
+		b3Scalar s = b3Scalar(1.0) - rt;
+		__m128 vs = _mm_load_ss(&s);  //	(S 0 0 0)
+		vs = b3_pshufd_ps(vs, 0x80);  //	(S S S 0.0)
+		__m128 r0 = _mm_mul_ps(v0.mVec128, vs);
+		vrt = b3_pshufd_ps(vrt, 0x80);  //	(rt rt rt 0.0)
+		__m128 r1 = _mm_mul_ps(v1.mVec128, vrt);
+		__m128 tmp3 = _mm_add_ps(r0, r1);
+		mVec128 = tmp3;
+#elif defined(B3_USE_NEON)
+		// v0 + (v1 - v0) * rt, applied to all four lanes
+		float32x4_t vl = vsubq_f32(v1.mVec128, v0.mVec128);
+		vl = vmulq_n_f32(vl, rt);
+		mVec128 = vaddq_f32(vl, v0.mVec128);
+#else
+		b3Scalar s = b3Scalar(1.0) - rt;
+		m_floats[0] = s * v0.m_floats[0] + rt * v1.m_floats[0];
+		m_floats[1] = s * v0.m_floats[1] + rt * v1.m_floats[1];
+		m_floats[2] = s * v0.m_floats[2] + rt * v1.m_floats[2];
+		//don't do the unused w component
+		//		m_co[3] = s * v0[3] + rt * v1[3];
+#endif
+	}
+
+	/**@brief Return the linear interpolation between this and another vector, computed as this + (v - this) * t
+   * @param v The other vector
+   * @param t The ratio of this to v (t = 0 => return this, t = 1 => return other) */
+	B3_FORCE_INLINE b3Vector3 lerp(const b3Vector3& v, const b3Scalar& t) const
+	{
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
+		__m128 vt = _mm_load_ss(&t);  //	(t 0 0 0)
+		vt = b3_pshufd_ps(vt, 0x80);  //	(t t t 0.0)
+		__m128 vl = _mm_sub_ps(v.mVec128, mVec128);
+		vl = _mm_mul_ps(vl, vt);
+		vl = _mm_add_ps(vl, mVec128);
+
+		return b3MakeVector3(vl);
+#elif defined(B3_USE_NEON)
+		float32x4_t vl = vsubq_f32(v.mVec128, mVec128);
+		vl = vmulq_n_f32(vl, t);
+		vl = vaddq_f32(vl, mVec128);
+
+		return b3Vector3(vl);
+#else
+		return b3MakeVector3(m_floats[0] + (v.m_floats[0] - m_floats[0]) * t,
+							 m_floats[1] + (v.m_floats[1] - m_floats[1]) * t,
+							 m_floats[2] + (v.m_floats[2] - m_floats[2]) * t);
+#endif
+	}
+
+	/**@brief Elementwise multiply this vector by the other
+   * @param v The other vector
+   * @return Reference to this vector
+   * The SIMD branches multiply all four lanes; the scalar branch multiplies only x, y and z. */
+	B3_FORCE_INLINE b3Vector3& operator*=(const b3Vector3& v)
+	{
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
+		mVec128 = _mm_mul_ps(mVec128, v.mVec128);
+#elif defined(B3_USE_NEON)
+		mVec128 = vmulq_f32(mVec128, v.mVec128);
+#else
+		m_floats[0] *= v.m_floats[0];
+		m_floats[1] *= v.m_floats[1];
+		m_floats[2] *= v.m_floats[2];
+#endif
+		return *this;
+	}
+
+	/**@brief Return the x value (m_floats[0]) by const reference */
+	B3_FORCE_INLINE const b3Scalar& getX() const { return m_floats[0]; }
+	/**@brief Return the y value (m_floats[1]) by const reference */
+	B3_FORCE_INLINE const b3Scalar& getY() const { return m_floats[1]; }
+	/**@brief Return the z value (m_floats[2]) by const reference */
+	B3_FORCE_INLINE const b3Scalar& getZ() const { return m_floats[2]; }
+	/**@brief Return the w value (m_floats[3]) by const reference */
+	B3_FORCE_INLINE const b3Scalar& getW() const { return m_floats[3]; }
+
+	/**@brief Set the x value (m_floats[0]); the other components are untouched */
+	B3_FORCE_INLINE void setX(b3Scalar _x) { m_floats[0] = _x; };
+	/**@brief Set the y value (m_floats[1]); the other components are untouched */
+	B3_FORCE_INLINE void setY(b3Scalar _y) { m_floats[1] = _y; };
+	/**@brief Set the z value (m_floats[2]); the other components are untouched */
+	B3_FORCE_INLINE void setZ(b3Scalar _z) { m_floats[2] = _z; };
+	/**@brief Set the w value (m_floats[3]); the other components are untouched */
+	B3_FORCE_INLINE void setW(b3Scalar _w) { m_floats[3] = _w; };
+
+	//B3_FORCE_INLINE b3Scalar&       operator[](int i)       { return (&m_floats[0])[i];	}
+	//B3_FORCE_INLINE const b3Scalar& operator[](int i) const { return (&m_floats[0])[i]; }
+	///operator b3Scalar*() replaces operator[], using implicit conversion. We added operator != and operator == to avoid pointer comparisons.
+	///Indexing a vector (v[i]) therefore goes through these conversions and is not bounds-checked.
+	B3_FORCE_INLINE operator b3Scalar*() { return &m_floats[0]; }
+	B3_FORCE_INLINE operator const b3Scalar*() const { return &m_floats[0]; }
+
+	/**@brief Exact componentwise equality of all four components (w included)
+   * @param other The vector to compare with */
+	B3_FORCE_INLINE bool operator==(const b3Vector3& other) const
+	{
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
+		// movemask yields 0xf only when all four lanes compared equal
+		return (0xf == _mm_movemask_ps((__m128)_mm_cmpeq_ps(mVec128, other.mVec128)));
+#else
+		return ((m_floats[3] == other.m_floats[3]) &&
+				(m_floats[2] == other.m_floats[2]) &&
+				(m_floats[1] == other.m_floats[1]) &&
+				(m_floats[0] == other.m_floats[0]));
+#endif
+	}
+
+	/**@brief Negation of operator==: true when any of the four components differs
+   * @param other The vector to compare with */
+	B3_FORCE_INLINE bool operator!=(const b3Vector3& other) const
+	{
+		const bool identical = (*this == other);
+		return !identical;
+	}
+
+	/**@brief Set each element to the max of the current values and the values of another b3Vector3
+   * @param other The other b3Vector3 to compare with
+   * All four components (including w) are processed in every branch.
+   */
+	B3_FORCE_INLINE void setMax(const b3Vector3& other)
+	{
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
+		mVec128 = _mm_max_ps(mVec128, other.mVec128);
+#elif defined(B3_USE_NEON)
+		mVec128 = vmaxq_f32(mVec128, other.mVec128);
+#else
+		b3SetMax(m_floats[0], other.m_floats[0]);
+		b3SetMax(m_floats[1], other.m_floats[1]);
+		b3SetMax(m_floats[2], other.m_floats[2]);
+		b3SetMax(m_floats[3], other.m_floats[3]);
+#endif
+	}
+
+	/**@brief Set each element to the min of the current values and the values of another b3Vector3
+   * @param other The other b3Vector3 to compare with
+   * All four components (including w) are processed in every branch.
+   */
+	B3_FORCE_INLINE void setMin(const b3Vector3& other)
+	{
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
+		mVec128 = _mm_min_ps(mVec128, other.mVec128);
+#elif defined(B3_USE_NEON)
+		mVec128 = vminq_f32(mVec128, other.mVec128);
+#else
+		b3SetMin(m_floats[0], other.m_floats[0]);
+		b3SetMin(m_floats[1], other.m_floats[1]);
+		b3SetMin(m_floats[2], other.m_floats[2]);
+		b3SetMin(m_floats[3], other.m_floats[3]);
+#endif
+	}
+
+	/**@brief Set x, y and z and reset w to zero
+   * @param _x Value of x
+   * @param _y Value of y
+   * @param _z Value of z */
+	B3_FORCE_INLINE void setValue(const b3Scalar& _x, const b3Scalar& _y, const b3Scalar& _z)
+	{
+		b3Scalar* const components = &m_floats[0];
+		components[0] = _x;
+		components[1] = _y;
+		components[2] = _z;
+		// the padding component is always cleared
+		components[3] = b3Scalar(0.f);
+	}
+
+	/**@brief Write the three rows of the skew-symmetric (cross-product) matrix of this vector
+   * @param v0 Receives row 0: ( 0, -z,  y)
+   * @param v1 Receives row 1: ( z,  0, -x)
+   * @param v2 Receives row 2: (-y,  x,  0)
+   * All three pointers are dereferenced unconditionally and must be valid. */
+	void getSkewSymmetricMatrix(b3Vector3 * v0, b3Vector3 * v1, b3Vector3 * v2) const
+	{
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
+
+		// V is this vector masked with b3vFFF0fMask, V0 is V xor'ed with b3vMzeroMask;
+		// the shuffles below pick components from V and V0 to assemble the rows
+		__m128 V = _mm_and_ps(mVec128, b3vFFF0fMask);
+		__m128 V0 = _mm_xor_ps(b3vMzeroMask, V);
+		__m128 V2 = _mm_movelh_ps(V0, V);
+
+		__m128 V1 = _mm_shuffle_ps(V, V0, 0xCE);
+
+		V0 = _mm_shuffle_ps(V0, V, 0xDB);
+		V2 = _mm_shuffle_ps(V2, V, 0xF9);
+
+		v0->mVec128 = V0;
+		v1->mVec128 = V1;
+		v2->mVec128 = V2;
+#else
+		v0->setValue(0., -getZ(), getY());
+		v1->setValue(getZ(), 0., -getX());
+		v2->setValue(-getY(), getX(), 0.);
+#endif
+	}
+
+	/**@brief Set all four components (x, y, z and w) to zero */
+	void setZero()
+	{
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
+		// x xor x gives all-zero bits in every lane
+		mVec128 = (__m128)_mm_xor_ps(mVec128, mVec128);
+#elif defined(B3_USE_NEON)
+		int32x4_t vi = vdupq_n_s32(0);
+		mVec128 = vreinterpretq_f32_s32(vi);
+#else
+		setValue(b3Scalar(0.), b3Scalar(0.), b3Scalar(0.));
+#endif
+	}
+
+	/**@brief Return true when x, y and z all compare exactly equal to zero (w is not inspected) */
+	B3_FORCE_INLINE bool isZero() const
+	{
+		const b3Scalar zero = b3Scalar(0);
+		for (int axis = 0; axis < 3; ++axis)
+		{
+			if (!(m_floats[axis] == zero))
+			{
+				return false;
+			}
+		}
+		return true;
+	}
+
+	/**@brief Return true when length2() is below B3_EPSILON
+   * Note the threshold is applied to the value returned by length2(), not to length(). */
+	B3_FORCE_INLINE bool fuzzyZero() const
+	{
+		return length2() < B3_EPSILON;
+	}
+
+	B3_FORCE_INLINE void serialize(struct b3Vector3Data & dataOut) const;
+
+	B3_FORCE_INLINE void deSerialize(const struct b3Vector3Data& dataIn);
+
+	B3_FORCE_INLINE void serializeFloat(struct b3Vector3FloatData & dataOut) const;
+
+	B3_FORCE_INLINE void deSerializeFloat(const struct b3Vector3FloatData& dataIn);
+
+	B3_FORCE_INLINE void serializeDouble(struct b3Vector3DoubleData & dataOut) const;
+
+	B3_FORCE_INLINE void deSerializeDouble(const struct b3Vector3DoubleData& dataIn);
+
+	/**@brief returns index of maximum dot product between this and vectors in array[]
+         * @param array The other vectors
+         * @param array_count The number of other vectors
+         * @param dotOut The maximum dot product */
+	B3_FORCE_INLINE long maxDot(const b3Vector3* array, long array_count, b3Scalar& dotOut) const;
+
+	/**@brief returns index of minimum dot product between this and vectors in array[]
+         * @param array The other vectors
+         * @param array_count The number of other vectors
+         * @param dotOut The minimum dot product */
+	B3_FORCE_INLINE long minDot(const b3Vector3* array, long array_count, b3Scalar& dotOut) const;
+
+	/**@brief Create a vector as b3Vector3( this->dot(v0), this->dot(v1), this->dot(v2) )
+   * @param v0 Vector whose dot product with this becomes the x component
+   * @param v1 Vector whose dot product with this becomes the y component
+   * @param v2 Vector whose dot product with this becomes the z component */
+	B3_FORCE_INLINE b3Vector3 dot3(const b3Vector3& v0, const b3Vector3& v1, const b3Vector3& v2) const
+	{
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
+
+		// lane-wise products, then a transpose-and-add so each result lane sums one product vector
+		__m128 a0 = _mm_mul_ps(v0.mVec128, this->mVec128);
+		__m128 a1 = _mm_mul_ps(v1.mVec128, this->mVec128);
+		__m128 a2 = _mm_mul_ps(v2.mVec128, this->mVec128);
+		__m128 b0 = _mm_unpacklo_ps(a0, a1);
+		__m128 b1 = _mm_unpackhi_ps(a0, a1);
+		__m128 b2 = _mm_unpacklo_ps(a2, _mm_setzero_ps());
+		__m128 r = _mm_movelh_ps(b0, b2);
+		r = _mm_add_ps(r, _mm_movehl_ps(b2, b0));
+		a2 = _mm_and_ps(a2, b3vxyzMaskf);
+		r = _mm_add_ps(r, b3CastdTo128f(_mm_move_sd(b3CastfTo128d(a2), b3CastfTo128d(b1))));
+		return b3MakeVector3(r);
+
+#elif defined(B3_USE_NEON)
+		static const uint32x4_t xyzMask = (const uint32x4_t){-1, -1, -1, 0};
+		float32x4_t a0 = vmulq_f32(v0.mVec128, this->mVec128);
+		float32x4_t a1 = vmulq_f32(v1.mVec128, this->mVec128);
+		float32x4_t a2 = vmulq_f32(v2.mVec128, this->mVec128);
+		float32x2x2_t zLo = vtrn_f32(vget_high_f32(a0), vget_high_f32(a1));
+		// clear the w lane of a2 so it does not leak into the pairwise sum below
+		a2 = (float32x4_t)vandq_u32((uint32x4_t)a2, xyzMask);
+		float32x2_t b0 = vadd_f32(vpadd_f32(vget_low_f32(a0), vget_low_f32(a1)), zLo.val[0]);
+		float32x2_t b1 = vpadd_f32(vpadd_f32(vget_low_f32(a2), vget_high_f32(a2)), vdup_n_f32(0.0f));
+		return b3Vector3(vcombine_f32(b0, b1));
+#else
+		return b3MakeVector3(dot(v0), dot(v1), dot(v2));
+#endif
+	}
+};
+
+/**@brief Return the sum of two vectors (Point semantics)
+ * The SIMD branches add all four lanes; the scalar branch adds x, y and z and builds the result with b3MakeVector3. */
+B3_FORCE_INLINE b3Vector3
+operator+(const b3Vector3& v1, const b3Vector3& v2)
+{
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
+	return b3MakeVector3(_mm_add_ps(v1.mVec128, v2.mVec128));
+#elif defined(B3_USE_NEON)
+	return b3MakeVector3(vaddq_f32(v1.mVec128, v2.mVec128));
+#else
+	return b3MakeVector3(
+		v1.m_floats[0] + v2.m_floats[0],
+		v1.m_floats[1] + v2.m_floats[1],
+		v1.m_floats[2] + v2.m_floats[2]);
+#endif
+}
+
+/**@brief Return the elementwise product of two vectors
+ * The SIMD branches multiply all four lanes; the scalar branch multiplies x, y and z only. */
+B3_FORCE_INLINE b3Vector3
+operator*(const b3Vector3& v1, const b3Vector3& v2)
+{
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
+	return b3MakeVector3(_mm_mul_ps(v1.mVec128, v2.mVec128));
+#elif defined(B3_USE_NEON)
+	return b3MakeVector3(vmulq_f32(v1.mVec128, v2.mVec128));
+#else
+	return b3MakeVector3(
+		v1.m_floats[0] * v2.m_floats[0],
+		v1.m_floats[1] * v2.m_floats[1],
+		v1.m_floats[2] * v2.m_floats[2]);
+#endif
+}
+
+/**@brief Return the difference between two vectors (v1 - v2)
+ * The SIMD branches mask the subtraction result (b3vFFF0fMask / b3vFFF0Mask) before returning it. */
+B3_FORCE_INLINE b3Vector3
+operator-(const b3Vector3& v1, const b3Vector3& v2)
+{
+#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE))
+
+	//	without _mm_and_ps this code causes slowdown in Concave moving
+	__m128 r = _mm_sub_ps(v1.mVec128, v2.mVec128);
+	return b3MakeVector3(_mm_and_ps(r, b3vFFF0fMask));
+#elif defined(B3_USE_NEON)
+	float32x4_t r = vsubq_f32(v1.mVec128, v2.mVec128);
+	return b3MakeVector3((float32x4_t)vandq_s32((int32x4_t)r, b3vFFF0Mask));
+#else
+	return b3MakeVector3(
+		v1.m_floats[0] - v2.m_floats[0],
+		v1.m_floats[1] - v2.m_floats[1],
+		v1.m_floats[2] - v2.m_floats[2]);
+#endif
+}
+
+/**@brief Return the negative of the vector
+ * The SIMD branches negate by xor-ing with b3vMzeroMask; the SSE branch additionally masks the result with b3vFFF0fMask. */
+B3_FORCE_INLINE b3Vector3
+operator-(const b3Vector3& v)
+{
+#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE))
+	__m128 r = _mm_xor_ps(v.mVec128, b3vMzeroMask);
+	return b3MakeVector3(_mm_and_ps(r, b3vFFF0fMask));
+#elif defined(B3_USE_NEON)
+	return b3MakeVector3((b3SimdFloat4)veorq_s32((int32x4_t)v.mVec128, (int32x4_t)b3vMzeroMask));
+#else
+	return b3MakeVector3(-v.m_floats[0], -v.m_floats[1], -v.m_floats[2]);
+#endif
+}
+
+/**@brief Return the vector scaled by s
+ * @param v The vector to scale
+ * @param s The scale factor applied to x, y and z */
+B3_FORCE_INLINE b3Vector3
+operator*(const b3Vector3& v, const b3Scalar& s)
+{
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
+	__m128 vs = _mm_load_ss(&s);  //	(S 0 0 0)
+	vs = b3_pshufd_ps(vs, 0x80);  //	(S S S 0.0)
+	return b3MakeVector3(_mm_mul_ps(v.mVec128, vs));
+#elif defined(B3_USE_NEON)
+	float32x4_t r = vmulq_n_f32(v.mVec128, s);
+	return b3MakeVector3((float32x4_t)vandq_s32((int32x4_t)r, b3vFFF0Mask));
+#else
+	return b3MakeVector3(v.m_floats[0] * s, v.m_floats[1] * s, v.m_floats[2] * s);
+#endif
+}
+
+/**@brief Return the vector scaled by s (scalar-first overload, forwards to v * s)
+ * @param s The scale factor
+ * @param v The vector to scale */
+B3_FORCE_INLINE b3Vector3
+operator*(const b3Scalar& s, const b3Vector3& v)
+{
+	return v * s;
+}
+
+/**@brief Return the vector inversely scaled by s
+ * @param v The vector to scale
+ * @param s The divisor; b3FullAssert checks that it is non-zero */
+B3_FORCE_INLINE b3Vector3
+operator/(const b3Vector3& v, const b3Scalar& s)
+{
+	b3FullAssert(s != b3Scalar(0.0));
+	// One scalar division followed by a vector-scalar multiply. An SSE variant of this
+	// routine used to live here behind "#if 0" with the remark that it was not faster.
+	const b3Scalar reciprocal = b3Scalar(1.0) / s;
+	return v * reciprocal;
+}
+
+/**@brief Return the elementwise quotient v1 / v2
+ * @param v1 The numerators
+ * @param v2 The denominators; no zero check is performed
+ * The NEON branch approximates the division with a reciprocal estimate refined by Newton-Raphson steps. */
+B3_FORCE_INLINE b3Vector3
+operator/(const b3Vector3& v1, const b3Vector3& v2)
+{
+#if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE))
+	__m128 vec = _mm_div_ps(v1.mVec128, v2.mVec128);
+	vec = _mm_and_ps(vec, b3vFFF0fMask);
+	return b3MakeVector3(vec);
+#elif defined(B3_USE_NEON)
+	float32x4_t x, y, v, m;
+
+	x = v1.mVec128;
+	y = v2.mVec128;
+
+	v = vrecpeq_f32(y);     // v ~ 1/y
+	m = vrecpsq_f32(y, v);  // m = (2-v*y)
+	v = vmulq_f32(v, m);    // vv = v*m ~~ 1/y
+	m = vrecpsq_f32(y, v);  // mm = (2-vv*y)
+	v = vmulq_f32(v, x);    // x*vv
+	v = vmulq_f32(v, m);    // (x*vv)*(2-vv*y) = x*(vv(2-vv*y)) ~~~ x/y
+
+	return b3Vector3(v);
+#else
+	return b3MakeVector3(
+		v1.m_floats[0] / v2.m_floats[0],
+		v1.m_floats[1] / v2.m_floats[1],
+		v1.m_floats[2] / v2.m_floats[2]);
+#endif
+}
+
+/**@brief Return the dot product between two vectors (forwards to v1.dot(v2)) */
+B3_FORCE_INLINE b3Scalar
+b3Dot(const b3Vector3& v1, const b3Vector3& v2)
+{
+	return v1.dot(v2);
+}
+
+/**@brief Return the distance squared between two vectors (forwards to v1.distance2(v2)) */
+B3_FORCE_INLINE b3Scalar
+b3Distance2(const b3Vector3& v1, const b3Vector3& v2)
+{
+	return v1.distance2(v2);
+}
+
+/**@brief Return the distance between two vectors (forwards to v1.distance(v2)) */
+B3_FORCE_INLINE b3Scalar
+b3Distance(const b3Vector3& v1, const b3Vector3& v2)
+{
+	return v1.distance(v2);
+}
+
+/**@brief Return the angle between two vectors (forwards to v1.angle(v2)) */
+B3_FORCE_INLINE b3Scalar
+b3Angle(const b3Vector3& v1, const b3Vector3& v2)
+{
+	return v1.angle(v2);
+}
+
+/**@brief Return the cross product of two vectors (forwards to v1.cross(v2)) */
+B3_FORCE_INLINE b3Vector3
+b3Cross(const b3Vector3& v1, const b3Vector3& v2)
+{
+	return v1.cross(v2);
+}
+
+/**@brief Return the scalar triple product v1 . (v2 x v3) (forwards to v1.triple(v2, v3)) */
+B3_FORCE_INLINE b3Scalar
+b3Triple(const b3Vector3& v1, const b3Vector3& v2, const b3Vector3& v3)
+{
+	return v1.triple(v2, v3);
+}
+
+/**@brief Return the linear interpolation between two vectors
+ * @param v1 One vector
+ * @param v2 The other vector
+ * @param t The ratio of v1 to v2 (t = 0 => return v1, t = 1 => return v2) */
+B3_FORCE_INLINE b3Vector3
+b3Lerp(const b3Vector3& v1, const b3Vector3& v2, const b3Scalar& t)
+{
+	return v1.lerp(v2, t);
+}
+
+/**@brief Return the squared distance between this vector and v, i.e. the squared length of (v - *this) */
+B3_FORCE_INLINE b3Scalar b3Vector3::distance2(const b3Vector3& v) const
+{
+	const b3Vector3 offset = v - *this;
+	return offset.length2();
+}
+
+/**@brief Return the distance between this vector and v, i.e. the length of (v - *this) */
+B3_FORCE_INLINE b3Scalar b3Vector3::distance(const b3Vector3& v) const
+{
+	const b3Vector3 offset = v - *this;
+	return offset.length();
+}
+
+/**@brief Return a normalized copy of this vector; this vector itself is not modified
+ * The SSE branch normalizes a copy via normalize(); the other branch divides by length().
+ * NOTE(review): the non-SSE branch has no visible guard against length() == 0 (b3FullAssert in operator/ only) - confirm callers never pass a zero vector. */
+B3_FORCE_INLINE b3Vector3 b3Vector3::normalized() const
+{
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
+	b3Vector3 norm = *this;
+
+	return norm.normalize();
+#else
+	return *this / length();
+#endif
+}
+
+/**@brief Return this vector rotated about wAxis by _angle
+ * @param wAxis The rotation axis; must be a unit length vector
+ * @param _angle The rotation angle, passed to b3Sin / b3Cos
+ * The result is o + x * cos(_angle) + y * sin(_angle), where o is the component of this vector
+ * along wAxis, x = this - o, and y = wAxis x this. */
+B3_FORCE_INLINE b3Vector3 b3Vector3::rotate(const b3Vector3& wAxis, const b3Scalar _angle) const
+{
+	// wAxis must be a unit length vector
+
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
+
+	__m128 O = _mm_mul_ps(wAxis.mVec128, mVec128);
+	b3Scalar ssin = b3Sin(_angle);
+	__m128 C = wAxis.cross(b3MakeVector3(mVec128)).mVec128;
+	O = _mm_and_ps(O, b3vFFF0fMask);
+	b3Scalar scos = b3Cos(_angle);
+
+	__m128 vsin = _mm_load_ss(&ssin);  //	(S 0 0 0)
+	__m128 vcos = _mm_load_ss(&scos);  //	(S 0 0 0)
+
+	// summing O with its two rotations puts wAxis.dot(this) into the x, y and z lanes
+	__m128 Y = b3_pshufd_ps(O, 0xC9);  //	(Y Z X 0)
+	__m128 Z = b3_pshufd_ps(O, 0xD2);  //	(Z X Y 0)
+	O = _mm_add_ps(O, Y);
+	vsin = b3_pshufd_ps(vsin, 0x80);  //	(S S S 0)
+	O = _mm_add_ps(O, Z);
+	vcos = b3_pshufd_ps(vcos, 0x80);  //	(S S S 0)
+
+	vsin = vsin * C;
+	O = O * wAxis.mVec128;
+	__m128 X = mVec128 - O;
+
+	O = O + vsin;
+	vcos = vcos * X;
+	O = O + vcos;
+
+	return b3MakeVector3(O);
+#else
+	b3Vector3 o = wAxis * wAxis.dot(*this);
+	b3Vector3 _x = *this - o;
+	b3Vector3 _y;
+
+	_y = wAxis.cross(*this);
+
+	return (o + _x * b3Cos(_angle) + _y * b3Sin(_angle));
+#endif
+}
+
+/**@brief Return the index of the vector in array[] with the maximum dot product with this vector
+ * @param array The candidate vectors
+ * @param array_count The number of candidate vectors
+ * @param dotOut Receives the maximum dot product (-B3_INFINITY when no candidate was examined)
+ * @return Index of the best candidate; 0 if the scalar loop examined no candidate
+ * Small arrays (below scalar_cutoff) use the scalar loop; larger ones are forwarded to
+ * b3_maxdot_large when SSE or NEON is enabled. */
+B3_FORCE_INLINE long b3Vector3::maxDot(const b3Vector3* array, long array_count, b3Scalar& dotOut) const
+{
+#if defined(B3_USE_SSE) || defined(B3_USE_NEON)
+#if defined _WIN32 || defined(B3_USE_SSE)
+	const long scalar_cutoff = 10;
+	long b3_maxdot_large(const float* array, const float* vec, unsigned long array_count, float* dotOut);
+#elif defined B3_USE_NEON
+	const long scalar_cutoff = 4;
+	// Declared under the same name the call below uses (and that minDot uses for its
+	// counterpart); the previous "_maxdot_large" declaration left the call undeclared.
+	extern long (*b3_maxdot_large)(const float* array, const float* vec, unsigned long array_count, float* dotOut);
+#endif
+	if (array_count < scalar_cutoff)
+#endif  //B3_USE_SSE || B3_USE_NEON
+	{
+		b3Scalar maxDot = -B3_INFINITY;
+		long ptIndex = -1;
+		// long counter: array_count is a long, an int counter could overflow before reaching it
+		for (long i = 0; i < array_count; i++)
+		{
+			b3Scalar dot = array[i].dot(*this);
+
+			if (dot > maxDot)
+			{
+				maxDot = dot;
+				ptIndex = i;
+			}
+		}
+
+		b3Assert(ptIndex >= 0);
+		if (ptIndex < 0)
+		{
+			// nothing beat -B3_INFINITY (e.g. empty array): fall back to index 0
+			ptIndex = 0;
+		}
+		dotOut = maxDot;
+		return ptIndex;
+	}
+#if defined(B3_USE_SSE) || defined(B3_USE_NEON)
+	return b3_maxdot_large((float*)array, (float*)&m_floats[0], array_count, &dotOut);
+#endif
+}
+
+/**@brief Return the index of the vector in array[] with the minimum dot product with this vector
+ * @param array The candidate vectors
+ * @param array_count The number of candidate vectors
+ * @param dotOut Receives the minimum dot product (B3_INFINITY when no candidate was examined)
+ * @return Index of the best candidate; -1 if the scalar loop examined no candidate
+ * Small arrays (below scalar_cutoff) use the scalar loop; larger ones are forwarded to
+ * b3_mindot_large when SSE or NEON is enabled. */
+B3_FORCE_INLINE long b3Vector3::minDot(const b3Vector3* array, long array_count, b3Scalar& dotOut) const
+{
+#if defined(B3_USE_SSE) || defined(B3_USE_NEON)
+#if defined B3_USE_SSE
+	const long scalar_cutoff = 10;
+	long b3_mindot_large(const float* array, const float* vec, unsigned long array_count, float* dotOut);
+#elif defined B3_USE_NEON
+	const long scalar_cutoff = 4;
+	extern long (*b3_mindot_large)(const float* array, const float* vec, unsigned long array_count, float* dotOut);
+#else
+#error unhandled arch!
+#endif
+
+	if (array_count < scalar_cutoff)
+#endif  //B3_USE_SSE || B3_USE_NEON
+	{
+		b3Scalar minDot = B3_INFINITY;
+		long ptIndex = -1;
+
+		// long counter: array_count is a long, an int counter could overflow before reaching it
+		for (long i = 0; i < array_count; i++)
+		{
+			b3Scalar dot = array[i].dot(*this);
+
+			if (dot < minDot)
+			{
+				minDot = dot;
+				ptIndex = i;
+			}
+		}
+
+		dotOut = minDot;
+
+		return ptIndex;
+	}
+#if defined(B3_USE_SSE) || defined(B3_USE_NEON)
+	return b3_mindot_large((float*)array, (float*)&m_floats[0], array_count, &dotOut);
+#endif
+}
+
+/**@brief b3Vector3 extended with helpers that treat w as a fourth, meaningful component */
+class b3Vector4 : public b3Vector3
+{
+public:
+	/**@brief Return a copy holding the absolute value of all four components */
+	B3_FORCE_INLINE b3Vector4 absolute4() const
+	{
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
+		return b3MakeVector4(_mm_and_ps(mVec128, b3vAbsfMask));
+#elif defined(B3_USE_NEON)
+		return b3Vector4(vabsq_f32(mVec128));
+#else
+		return b3MakeVector4(
+			b3Fabs(m_floats[0]),
+			b3Fabs(m_floats[1]),
+			b3Fabs(m_floats[2]),
+			b3Fabs(m_floats[3]));
+#endif
+	}
+
+	/**@brief Return the w value by value */
+	b3Scalar getW() const { return m_floats[3]; }
+
+	/**@brief Return the index (0..3) of the largest component
+   * @return -1 when no component is greater than -B3_LARGE_FLOAT; on ties the first index wins */
+	B3_FORCE_INLINE int maxAxis4() const
+	{
+		int bestAxis = -1;
+		b3Scalar bestValue = b3Scalar(-B3_LARGE_FLOAT);
+		for (int axis = 0; axis < 4; ++axis)
+		{
+			if (m_floats[axis] > bestValue)
+			{
+				bestAxis = axis;
+				bestValue = m_floats[axis];
+			}
+		}
+		return bestAxis;
+	}
+
+	/**@brief Return the index (0..3) of the smallest component
+   * @return -1 when no component is less than B3_LARGE_FLOAT; on ties the first index wins */
+	B3_FORCE_INLINE int minAxis4() const
+	{
+		int bestAxis = -1;
+		b3Scalar bestValue = b3Scalar(B3_LARGE_FLOAT);
+		for (int axis = 0; axis < 4; ++axis)
+		{
+			if (m_floats[axis] < bestValue)
+			{
+				bestAxis = axis;
+				bestValue = m_floats[axis];
+			}
+		}
+		return bestAxis;
+	}
+
+	/**@brief Return maxAxis4() of absolute4(), i.e. the index of the component with the largest magnitude */
+	B3_FORCE_INLINE int closestAxis4() const
+	{
+		const b3Vector4 magnitudes = absolute4();
+		return magnitudes.maxAxis4();
+	}
+
+	/*		void getValue(b3Scalar *m) const
+		{
+			m[0] = m_floats[0];
+			m[1] = m_floats[1];
+			m[2] =m_floats[2];
+		}
+*/
+	/**@brief Set all four values
+   * @param _x Value of x
+   * @param _y Value of y
+   * @param _z Value of z
+   * @param _w Value of w
+   */
+	B3_FORCE_INLINE void setValue(const b3Scalar& _x, const b3Scalar& _y, const b3Scalar& _z, const b3Scalar& _w)
+	{
+		b3Scalar* const components = &m_floats[0];
+		components[0] = _x;
+		components[1] = _y;
+		components[2] = _z;
+		components[3] = _w;
+	}
+};
+
+///b3SwapScalarEndian writes the bytes of sourceVal into destVal in reverse order, useful for network and cross-platform serialization.
+///8 bytes are swapped when B3_USE_DOUBLE_PRECISION is defined, 4 bytes otherwise.
+B3_FORCE_INLINE void b3SwapScalarEndian(const b3Scalar& sourceVal, b3Scalar& destVal)
+{
+#ifdef B3_USE_DOUBLE_PRECISION
+	const int byteCount = 8;
+#else
+	const int byteCount = 4;
+#endif  //B3_USE_DOUBLE_PRECISION
+	unsigned char* out = (unsigned char*)&destVal;
+	unsigned char* in = (unsigned char*)&sourceVal;
+	// out[0] takes the last input byte, out[1] the one before it, and so on
+	for (int byteIndex = 0; byteIndex < byteCount; byteIndex++)
+	{
+		out[byteIndex] = in[byteCount - 1 - byteIndex];
+	}
+}
+///b3SwapVector3Endian byte-swaps all four scalars of sourceVec into destVec, useful for network and cross-platform serialization
+B3_FORCE_INLINE void b3SwapVector3Endian(const b3Vector3& sourceVec, b3Vector3& destVec)
+{
+	// the vectors convert implicitly to pointers to their first scalar
+	const b3Scalar* from = sourceVec;
+	b3Scalar* to = destVec;
+	for (int component = 0; component < 4; ++component)
+	{
+		b3SwapScalarEndian(from[component], to[component]);
+	}
+}
+
+///b3UnSwapVector3Endian byte-swaps all four scalars of vector in place, useful for network and cross-platform serialization
+B3_FORCE_INLINE void b3UnSwapVector3Endian(b3Vector3& vector)
+{
+	// swap into a scratch vector first so source and destination bytes never overlap
+	b3Vector3 scratch;
+	b3Scalar* from = vector;
+	b3Scalar* to = scratch;
+	for (int component = 0; component < 4; ++component)
+	{
+		b3SwapScalarEndian(from[component], to[component]);
+	}
+	vector = scratch;
+}
+
+/**@brief Compute two vectors p and q that span the plane perpendicular to n
+ * @param n The plane normal (indexed with [0..2]); the formulas assume it is normalized - TODO confirm with callers
+ * @param p Receives the first tangent vector
+ * @param q Receives the second tangent vector, computed as n x p
+ * When |n.z| exceeds B3_SQRT12, p is chosen in the y-z plane, otherwise in the x-y plane. */
+template <class T>
+B3_FORCE_INLINE void b3PlaneSpace1(const T& n, T& p, T& q)
+{
+	const bool zDominates = b3Fabs(n[2]) > B3_SQRT12;
+	if (!zDominates)
+	{
+		// choose p in x-y plane
+		b3Scalar lenSq = n[0] * n[0] + n[1] * n[1];
+		b3Scalar invLen = b3RecipSqrt(lenSq);
+		p[0] = -n[1] * invLen;
+		p[1] = n[0] * invLen;
+		p[2] = 0;
+		// set q = n x p
+		q[0] = -n[2] * p[1];
+		q[1] = n[2] * p[0];
+		q[2] = lenSq * invLen;
+		return;
+	}
+
+	// choose p in y-z plane
+	b3Scalar lenSq = n[1] * n[1] + n[2] * n[2];
+	b3Scalar invLen = b3RecipSqrt(lenSq);
+	p[0] = 0;
+	p[1] = -n[2] * invLen;
+	p[2] = n[1] * invLen;
+	// set q = n x p
+	q[0] = lenSq * invLen;
+	q[1] = -n[0] * p[2];
+	q[2] = n[0] * p[1];
+}
+
+///Single-precision storage for the four b3Vector3 components, filled by b3Vector3::serializeFloat
+struct b3Vector3FloatData
+{
+	float m_floats[4];
+};
+
+///Double-precision storage for the four b3Vector3 components, filled by b3Vector3::serializeDouble
+struct b3Vector3DoubleData
+{
+	double m_floats[4];
+};
+
+///Copy all four components into dataOut, converting each to float
+B3_FORCE_INLINE void b3Vector3::serializeFloat(struct b3Vector3FloatData& dataOut) const
+{
+	///could also do a memcpy, check if it is worth it
+	for (int component = 0; component < 4; ++component)
+	{
+		dataOut.m_floats[component] = float(m_floats[component]);
+	}
+}
+
+///Load all four components from dataIn, converting each float to b3Scalar
+B3_FORCE_INLINE void b3Vector3::deSerializeFloat(const struct b3Vector3FloatData& dataIn)
+{
+	for (int component = 0; component < 4; ++component)
+	{
+		m_floats[component] = b3Scalar(dataIn.m_floats[component]);
+	}
+}
+
+///Copy all four components into dataOut, converting each to double
+B3_FORCE_INLINE void b3Vector3::serializeDouble(struct b3Vector3DoubleData& dataOut) const
+{
+	///could also do a memcpy, check if it is worth it
+	for (int component = 0; component < 4; ++component)
+	{
+		dataOut.m_floats[component] = double(m_floats[component]);
+	}
+}
+
+///Load all four components from dataIn, converting each double to b3Scalar
+B3_FORCE_INLINE void b3Vector3::deSerializeDouble(const struct b3Vector3DoubleData& dataIn)
+{
+	for (int component = 0; component < 4; ++component)
+	{
+		m_floats[component] = b3Scalar(dataIn.m_floats[component]);
+	}
+}
+
+///Copy all four components into dataOut without an explicit conversion
+B3_FORCE_INLINE void b3Vector3::serialize(struct b3Vector3Data& dataOut) const
+{
+	///could also do a memcpy, check if it is worth it
+	for (int component = 0; component < 4; ++component)
+	{
+		dataOut.m_floats[component] = m_floats[component];
+	}
+}
+
+///Load all four components from dataIn without an explicit conversion
+B3_FORCE_INLINE void b3Vector3::deSerialize(const struct b3Vector3Data& dataIn)
+{
+	for (int component = 0; component < 4; ++component)
+	{
+		m_floats[component] = dataIn.m_floats[component];
+	}
+}
+
+///Build a b3Vector3 from x, y and z; w is zeroed by the three-argument setValue
+inline b3Vector3 b3MakeVector3(b3Scalar x, b3Scalar y, b3Scalar z)
+{
+	b3Vector3 result;
+	result.setValue(x, y, z);
+	return result;
+}
+
+///Build a b3Vector3 from x, y, z and an explicit w
+inline b3Vector3 b3MakeVector3(b3Scalar x, b3Scalar y, b3Scalar z, b3Scalar w)
+{
+	b3Vector3 result;
+	result.setValue(x, y, z);
+	// setValue cleared w, so it is assigned afterwards
+	result.w = w;
+	return result;
+}
+
+///Build a b3Vector4 from its four components
+inline b3Vector4 b3MakeVector4(b3Scalar x, b3Scalar y, b3Scalar z, b3Scalar w)
+{
+	b3Vector4 result;
+	result.setValue(x, y, z, w);
+	return result;
+}
+
+#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
+
+///Build a b3Vector3 directly from a SIMD register via set128 (only compiled when the SSE API is enabled)
+inline b3Vector3 b3MakeVector3(b3SimdFloat4 v)
+{
+	b3Vector3 result;
+	result.set128(v);
+	return result;
+}
+
+///Build a b3Vector4 directly from a SIMD register via set128 (only compiled when the SSE API is enabled)
+inline b3Vector4 b3MakeVector4(b3SimdFloat4 vec)
+{
+	b3Vector4 result;
+	result.set128(vec);
+	return result;
+}
+
+#endif
+
+#endif  //B3_VECTOR3_H

+ 16 - 0
Dependencies/include/bullet3/Bullet3Common/premake4.lua

@@ -0,0 +1,16 @@
+	-- premake4 project definition: builds Bullet3Common as a C++ static library
+	project "Bullet3Common"
+
+	language "C++"
+				
+	kind "StaticLib"
+		
+	-- on Linux the objects are compiled as position-independent code
+	if os.is("Linux") then
+	    buildoptions{"-fPIC"}
+	end
+
+	-- the parent directory is on the include path, so sources include "Bullet3Common/..." headers
+	includedirs {".."}
+
+	-- every source and header in this directory belongs to the project
+	files {
+		"*.cpp",
+		"*.h"
+	}

+ 90 - 0
Dependencies/include/bullet3/Bullet3Common/shared/b3Float4.h

@@ -0,0 +1,90 @@
+#ifndef B3_FLOAT4_H
+#define B3_FLOAT4_H
+
+#include "Bullet3Common/shared/b3PlatformDefinitions.h"
+
+#ifdef __cplusplus
+#include "Bullet3Common/b3Vector3.h"
+#define b3Float4 b3Vector3
+#define b3Float4ConstArg const b3Vector3&
+#define b3Dot3F4 b3Dot
+#define b3Cross3 b3Cross
+#define b3MakeFloat4 b3MakeVector3
+///C++ side of the shared b3Normalized helper: returns vec.normalized()
+inline b3Vector3 b3Normalized(const b3Vector3& vec)
+{
+	return vec.normalized();
+}
+
+///Returns v.normalized(); in this C++ build it is the same computation as b3Normalized
+inline b3Float4 b3FastNormalized3(b3Float4ConstArg v)
+{
+	return v.normalized();
+}
+
+///Return the componentwise maximum of a and b (copy of a updated with setMax(b))
+inline b3Float4 b3MaxFloat4(const b3Float4& a, const b3Float4& b)
+{
+	b3Float4 upper = a;
+	upper.setMax(b);
+	return upper;
+}
+///Return the componentwise minimum of a and b (copy of a updated with setMin(b))
+inline b3Float4 b3MinFloat4(const b3Float4& a, const b3Float4& b)
+{
+	b3Float4 lower = a;
+	lower.setMin(b);
+	return lower;
+}
+
+#else
+typedef float4 b3Float4;
+#define b3Float4ConstArg const b3Float4
+#define b3MakeFloat4 (float4)
+// Non-C++ (kernel) variant: 3-component dot product; w is zeroed on both operands before calling dot()
+float b3Dot3F4(b3Float4ConstArg v0, b3Float4ConstArg v1)
+{
+	float4 lhs = b3MakeFloat4(v0.xyz, 0.f);
+	float4 rhs = b3MakeFloat4(v1.xyz, 0.f);
+	return dot(lhs, rhs);
+}
+// Non-C++ (kernel) variant: 3-component cross product; w is zeroed on both operands before calling cross()
+b3Float4 b3Cross3(b3Float4ConstArg v0, b3Float4ConstArg v1)
+{
+	float4 lhs = b3MakeFloat4(v0.xyz, 0.f);
+	float4 rhs = b3MakeFloat4(v1.xyz, 0.f);
+	return cross(lhs, rhs);
+}
+#define b3MinFloat4 min
+#define b3MaxFloat4 max
+
+#define b3Normalized(a) normalize(a)
+
+#endif
+
+// Returns true unless the magnitude of x, y or z exceeds 1e-6 (w is not inspected)
+inline bool b3IsAlmostZero(b3Float4ConstArg v)
+{
+	return !(b3Fabs(v.x) > 1e-6 || b3Fabs(v.y) > 1e-6 || b3Fabs(v.z) > 1e-6);
+}
+
+// Returns the index of the entry in vecArray with the largest b3Dot3F4 against vec and
+// stores that dot product in *dotOut. When no entry beats -B3_INFINITY (e.g. vecLen <= 0)
+// the assert fires where enabled and index 0 is returned.
+inline int b3MaxDot(b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut)
+{
+	float bestDot = -B3_INFINITY;
+	int bestIndex = -1;
+	int idx;
+	for (idx = 0; idx < vecLen; idx++)
+	{
+		float candidate = b3Dot3F4(vecArray[idx], vec);
+		if (candidate > bestDot)
+		{
+			bestDot = candidate;
+			bestIndex = idx;
+		}
+	}
+	b3Assert(bestIndex >= 0);
+	*dotOut = bestDot;
+	return bestIndex < 0 ? 0 : bestIndex;
+}
+
+#endif  //B3_FLOAT4_H

+ 63 - 0
Dependencies/include/bullet3/Bullet3Common/shared/b3Int2.h

@@ -0,0 +1,63 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2013 Erwin Coumans  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef B3_INT2_H
+#define B3_INT2_H
+
+#ifdef __cplusplus
+
+///Pair of unsigned ints whose storage can be accessed either as named fields (x, y) or as the array s[2]
+struct b3UnsignedInt2
+{
+	union {
+		struct
+		{
+			unsigned int x, y;
+		};
+		struct
+		{
+			unsigned int s[2];
+		};
+	};
+};
+
+///Pair of ints whose storage can be accessed either as named fields (x, y) or as the array s[2]
+struct b3Int2
+{
+	union {
+		struct
+		{
+			int x, y;
+		};
+		struct
+		{
+			int s[2];
+		};
+	};
+};
+
+///Build a b3Int2 holding (x, y)
+inline b3Int2 b3MakeInt2(int x, int y)
+{
+	b3Int2 pair;
+	pair.s[0] = x;
+	pair.s[1] = y;
+	return pair;
+}
+#else
+
+#define b3UnsignedInt2 uint2
+#define b3Int2 int2
+#define b3MakeInt2 (int2)
+
+#endif  //__cplusplus
+#endif

+ 71 - 0
Dependencies/include/bullet3/Bullet3Common/shared/b3Int4.h

@@ -0,0 +1,71 @@
+#ifndef B3_INT4_H
+#define B3_INT4_H
+
+#ifdef __cplusplus
+
+#include "Bullet3Common/b3Scalar.h"
+
+///Four unsigned ints, declared through B3_ATTRIBUTE_ALIGNED16, accessible either as x, y, z, w or as the array s[4]
+B3_ATTRIBUTE_ALIGNED16(struct)
+b3UnsignedInt4
+{
+	B3_DECLARE_ALIGNED_ALLOCATOR();
+
+	union {
+		struct
+		{
+			unsigned int x, y, z, w;
+		};
+		struct
+		{
+			unsigned int s[4];
+		};
+	};
+};
+
+///Four ints, declared through B3_ATTRIBUTE_ALIGNED16, accessible either as x, y, z, w or as the array s[4]
+B3_ATTRIBUTE_ALIGNED16(struct)
+b3Int4
+{
+	B3_DECLARE_ALIGNED_ALLOCATOR();
+
+	union {
+		struct
+		{
+			int x, y, z, w;
+		};
+		struct
+		{
+			int s[4];
+		};
+	};
+};
+
+///Build a b3Int4 holding (x, y, z, w); w defaults to 0
+B3_FORCE_INLINE b3Int4 b3MakeInt4(int x, int y, int z, int w = 0)
+{
+	b3Int4 quad;
+	quad.s[0] = x;
+	quad.s[1] = y;
+	quad.s[2] = z;
+	quad.s[3] = w;
+	return quad;
+}
+
+///Build a b3UnsignedInt4 holding (x, y, z, w); w defaults to 0
+B3_FORCE_INLINE b3UnsignedInt4 b3MakeUnsignedInt4(unsigned int x, unsigned int y, unsigned int z, unsigned int w = 0)
+{
+	b3UnsignedInt4 quad;
+	quad.s[0] = x;
+	quad.s[1] = y;
+	quad.s[2] = z;
+	quad.s[3] = w;
+	return quad;
+}
+
+#else
+
+#define b3UnsignedInt4 uint4
+#define b3Int4 int4
+#define b3MakeInt4 (int4)
+#define b3MakeUnsignedInt4 (uint4)
+
+#endif  //__cplusplus
+
+#endif  //B3_INT4_H

+ 157 - 0
Dependencies/include/bullet3/Bullet3Common/shared/b3Mat3x3.h

@@ -0,0 +1,157 @@
+
+#ifndef B3_MAT3x3_H
+#define B3_MAT3x3_H
+
+#include "Bullet3Common/shared/b3Quat.h"
+
+#ifdef __cplusplus
+
+#include "Bullet3Common/b3Matrix3x3.h"
+
+#define b3Mat3x3 b3Matrix3x3
+#define b3Mat3x3ConstArg const b3Matrix3x3&
+
+///C++ variant: build the matrix by passing quat to the b3Mat3x3 (b3Matrix3x3) constructor
+inline b3Mat3x3 b3QuatGetRotationMatrix(b3QuatConstArg quat)
+{
+	return b3Mat3x3(quat);
+}
+
+///C++ variant: forwards to mat.absolute()
+inline b3Mat3x3 b3AbsoluteMat3x3(b3Mat3x3ConstArg mat)
+{
+	return mat.absolute();
+}
+
+#define b3GetRow(m, row) m.getRow(row)
+
+///C++ variant of mtMul3: returns the product b * a.
+///NOTE(review): the non-C++ variant further down in this header dots `a` with the columns of `b`
+///(row-vector times matrix); confirm that `b * a` here yields the same result and not the transposed product.
+__inline b3Float4 mtMul3(b3Float4ConstArg a, b3Mat3x3ConstArg b)
+{
+	return b * a;
+}
+
+#else
+
+// Non-C++ (kernel) matrix type: three b3Float4 rows; the helpers below keep each row's w at zero
+typedef struct
+{
+	b3Float4 m_row[3];
+} b3Mat3x3;
+
+#define b3Mat3x3ConstArg const b3Mat3x3
+#define b3GetRow(m, row) (m.m_row[row])
+
+// Non-C++ (kernel) variant: expand the quaternion (x, y, z, w) into a 3x3 rotation matrix.
+// Every row's w component is set to zero. No normalization of quat is performed here.
+inline b3Mat3x3 b3QuatGetRotationMatrix(b3Quat quat)
+{
+	// squared x, y, z components, reused on the diagonal
+	b3Float4 quat2 = (b3Float4)(quat.x * quat.x, quat.y * quat.y, quat.z * quat.z, 0.f);
+	b3Mat3x3 out;
+
+	out.m_row[0].x = 1 - 2 * quat2.y - 2 * quat2.z;
+	out.m_row[0].y = 2 * quat.x * quat.y - 2 * quat.w * quat.z;
+	out.m_row[0].z = 2 * quat.x * quat.z + 2 * quat.w * quat.y;
+	out.m_row[0].w = 0.f;
+
+	out.m_row[1].x = 2 * quat.x * quat.y + 2 * quat.w * quat.z;
+	out.m_row[1].y = 1 - 2 * quat2.x - 2 * quat2.z;
+	out.m_row[1].z = 2 * quat.y * quat.z - 2 * quat.w * quat.x;
+	out.m_row[1].w = 0.f;
+
+	out.m_row[2].x = 2 * quat.x * quat.z - 2 * quat.w * quat.y;
+	out.m_row[2].y = 2 * quat.y * quat.z + 2 * quat.w * quat.x;
+	out.m_row[2].z = 1 - 2 * quat2.x - 2 * quat2.y;
+	out.m_row[2].w = 0.f;
+
+	return out;
+}
+
+// Non-C++ (kernel) variant: apply fabs to each of the three rows
+inline b3Mat3x3 b3AbsoluteMat3x3(b3Mat3x3ConstArg matIn)
+{
+	b3Mat3x3 result;
+	for (int row = 0; row < 3; row++)
+	{
+		result.m_row[row] = fabs(matIn.m_row[row]);
+	}
+	return result;
+}
+
+__inline b3Mat3x3 mtZero();
+
+__inline b3Mat3x3 mtIdentity();
+
+__inline b3Mat3x3 mtTranspose(b3Mat3x3 m);
+
+__inline b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b);
+
+__inline b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b);
+
+__inline b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b);
+
+// Return a matrix whose three rows are all (b3Float4)(0.f)
+__inline b3Mat3x3 mtZero()
+{
+	b3Mat3x3 zero;
+	zero.m_row[0] = (b3Float4)(0.f);
+	zero.m_row[1] = (b3Float4)(0.f);
+	zero.m_row[2] = (b3Float4)(0.f);
+	return zero;
+}
+
+// Return the 3x3 identity matrix (each row's w is zero)
+__inline b3Mat3x3 mtIdentity()
+{
+	b3Mat3x3 identity;
+	identity.m_row[0] = (b3Float4)(1, 0, 0, 0);
+	identity.m_row[1] = (b3Float4)(0, 1, 0, 0);
+	identity.m_row[2] = (b3Float4)(0, 0, 1, 0);
+	return identity;
+}
+
+// Return the transpose of m: row i of the result is column i of m, with w set to zero
+__inline b3Mat3x3 mtTranspose(b3Mat3x3 m)
+{
+	b3Mat3x3 transposed;
+	transposed.m_row[0] = (b3Float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);
+	transposed.m_row[1] = (b3Float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);
+	transposed.m_row[2] = (b3Float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);
+	return transposed;
+}
+
+// Return the matrix product a * b: entry (i, j) is the 3-component dot product of
+// row i of a with column j of b (taken as row j of the transposed b).
+__inline b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b)
+{
+	b3Mat3x3 transB;
+	transB = mtTranspose(b);
+	b3Mat3x3 ans;
+	//	why this doesn't run when 0ing in the for{}
+	// (a is a by-value copy, so clearing its w components does not affect the caller)
+	a.m_row[0].w = 0.f;
+	a.m_row[1].w = 0.f;
+	a.m_row[2].w = 0.f;
+	for (int i = 0; i < 3; i++)
+	{
+		//	a.m_row[i].w = 0.f;
+		ans.m_row[i].x = b3Dot3F4(a.m_row[i], transB.m_row[0]);
+		ans.m_row[i].y = b3Dot3F4(a.m_row[i], transB.m_row[1]);
+		ans.m_row[i].z = b3Dot3F4(a.m_row[i], transB.m_row[2]);
+		ans.m_row[i].w = 0.f;
+	}
+	return ans;
+}
+
+// Return the matrix-vector product a * b: each component is the dot product of one row of a with b; w is zero
+__inline b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b)
+{
+	b3Float4 product;
+	product.x = b3Dot3F4(a.m_row[0], b);
+	product.y = b3Dot3F4(a.m_row[1], b);
+	product.z = b3Dot3F4(a.m_row[2], b);
+	product.w = 0.f;
+	return product;
+}
+
+// Return the vector-matrix product a * b: each component is the 3-component dot product of a
+// with one column of b. The w component of the result is zero.
+__inline b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b)
+{
+	b3Float4 colx = b3MakeFloat4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);
+	b3Float4 coly = b3MakeFloat4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);
+	b3Float4 colz = b3MakeFloat4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);
+
+	b3Float4 ans;
+	ans.x = b3Dot3F4(a, colx);
+	ans.y = b3Dot3F4(a, coly);
+	ans.z = b3Dot3F4(a, colz);
+	// w was previously left unwritten, returning an indeterminate value; zero it like mtMul1 does
+	ans.w = 0.f;
+	return ans;
+}
+
+#endif
+
+#endif  //B3_MAT3x3_H

+ 41 - 0
Dependencies/include/bullet3/Bullet3Common/shared/b3PlatformDefinitions.h

@@ -0,0 +1,41 @@
+#ifndef B3_PLATFORM_DEFINITIONS_H
+#define B3_PLATFORM_DEFINITIONS_H
+
+struct MyTest  // Placeholder struct; nothing else in this header refers to it.
+{
+	int bla;  // Only member; no use of it is visible in this header.
+};
+
+#ifdef __cplusplus
+//#define b3ConstArray(a) const b3AlignedObjectArray<a>&
+#define b3ConstArray(a) const a *
+#define b3AtomicInc(a) ((*a)++)
+
+// Host-side stand-in for the device atomic_add: adds val to *p and returns the value
+// *p held beforehand. NOTE: this is a plain read followed by a plain write, so it is
+// not atomic with respect to other threads.
+inline int b3AtomicAdd(volatile int *p, int val)
+{
+	const int previous = *p;
+	*p = previous + val;
+	return previous;
+}
+
+#define __global
+
+#define B3_STATIC static
+#else
+//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX
+#define B3_LARGE_FLOAT 1e18f
+#define B3_INFINITY 1e18f
+#define b3Assert(a)
+#define b3ConstArray(a) __global const a *
+#define b3AtomicInc atomic_inc
+#define b3AtomicAdd atomic_add
+#define b3Fabs fabs
+#define b3Sqrt native_sqrt
+#define b3Sin native_sin
+#define b3Cos native_cos
+
+#define B3_STATIC
+#endif
+
+#endif

+ 100 - 0
Dependencies/include/bullet3/Bullet3Common/shared/b3Quat.h

@@ -0,0 +1,100 @@
+#ifndef B3_QUAT_H
+#define B3_QUAT_H
+
+#include "Bullet3Common/shared/b3PlatformDefinitions.h"
+#include "Bullet3Common/shared/b3Float4.h"
+
+#ifdef __cplusplus
+#include "Bullet3Common/b3Quaternion.h"
+#include "Bullet3Common/b3Transform.h"
+
+#define b3Quat b3Quaternion
+#define b3QuatConstArg const b3Quaternion&
+// C++ build: returns the inverse of orn by delegating to b3Quaternion::inverse().
+inline b3Quat b3QuatInverse(b3QuatConstArg orn)
+{
+	const b3Quat inverted = orn.inverse();
+	return inverted;
+}
+
+// C++ build: applies the transform made of (orientation, translation) to point.
+inline b3Float4 b3TransformPoint(b3Float4ConstArg point, b3Float4ConstArg translation, b3QuatConstArg orientation)
+{
+	b3Transform frame;
+	frame.setRotation(orientation);
+	frame.setOrigin(translation);
+	return frame(point);
+}
+
+#else
+typedef float4 b3Quat;
+#define b3QuatConstArg const b3Quat
+
+// Zeroes the w lane of v, then normalizes the result with fast_normalize.
+inline float4 b3FastNormalize4(float4 v)
+{
+	const float4 xyzOnly = (float4)(v.xyz, 0.f);
+	return fast_normalize(xyzOnly);
+}
+
+inline b3Quat b3QuatMul(b3Quat a, b3Quat b);
+inline b3Quat b3QuatNormalized(b3QuatConstArg in);
+inline b3Quat b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec);
+inline b3Quat b3QuatInvert(b3QuatConstArg q);
+inline b3Quat b3QuatInverse(b3QuatConstArg q);
+
+// Quaternion product a * b, with the vector part stored in xyz and the scalar part in w.
+inline b3Quat b3QuatMul(b3QuatConstArg a, b3QuatConstArg b)
+{
+	// Vector part: cross(a, b) + (a.w * b + b.w * a). The w lane computed here is overwritten below.
+	b3Quat product = b3Cross3(a, b);
+	const b3Quat scaled = a.w * b + b.w * a;
+	product += scaled;
+
+	// Scalar part: a.w * b.w - dot3(a, b).
+	product.w = a.w * b.w - b3Dot3F4(a, b);
+	return product;
+}
+
+// Returns in scaled to unit length. When the computed length is not greater than zero,
+// the identity quaternion (0, 0, 0, 1) is returned instead.
+inline b3Quat b3QuatNormalized(b3QuatConstArg in)
+{
+	b3Quat result = in;
+	const float length = native_sqrt(dot(result, result));
+	if (!(length > 0.f))
+	{
+		result.x = 0.f;
+		result.y = 0.f;
+		result.z = 0.f;
+		result.w = 1.f;
+		return result;
+	}
+	result *= 1.f / length;
+	return result;
+}
+// Rotates vec by q: computes q * (vec.xyz, 0) * b3QuatInvert(q).
+inline float4 b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec)
+{
+	const b3Quat conjugate = b3QuatInvert(q);
+	float4 pureVec = vec;
+	pureVec.w = 0.f;
+	const float4 halfRotated = b3QuatMul(q, pureVec);
+	return b3QuatMul(halfRotated, conjugate);
+}
+
+// Returns q with its xyz lanes negated and w unchanged (same result as b3QuatInvert).
+inline b3Quat b3QuatInverse(b3QuatConstArg q)
+{
+	return (b3Quat)(-q.x, -q.y, -q.z, q.w);
+}
+
+// Returns q with its xyz lanes negated and w unchanged.
+inline b3Quat b3QuatInvert(b3QuatConstArg q)
+{
+	return (b3Quat)(-q.x, -q.y, -q.z, q.w);
+}
+
+// Rotates vec by the inverted quaternion b3QuatInvert(q).
+inline float4 b3QuatInvRotate(b3QuatConstArg q, b3QuatConstArg vec)
+{
+	const b3Quat inverted = b3QuatInvert(q);
+	return b3QuatRotate(inverted, vec);
+}
+
+// Rotates point by orientation, then adds translation.
+inline b3Float4 b3TransformPoint(b3Float4ConstArg point, b3Float4ConstArg translation, b3QuatConstArg orientation)
+{
+	const b3Float4 rotated = b3QuatRotate(orientation, point);
+	return rotated + (translation);
+}
+
+#endif
+
+#endif  //B3_QUAT_H

+ 61 - 0
Dependencies/include/bullet3/Bullet3Dynamics/CMakeLists.txt

@@ -0,0 +1,61 @@
+
+INCLUDE_DIRECTORIES(
+	${BULLET_PHYSICS_SOURCE_DIR}/src
+)
+
+SET(Bullet3Dynamics_SRCS
+	b3CpuRigidBodyPipeline.cpp
+	ConstraintSolver/b3FixedConstraint.cpp
+	ConstraintSolver/b3Generic6DofConstraint.cpp
+	ConstraintSolver/b3PgsJacobiSolver.cpp
+	ConstraintSolver/b3Point2PointConstraint.cpp
+	ConstraintSolver/b3TypedConstraint.cpp
+)
+
+SET(Bullet3Dynamics_HDRS
+	  b3CpuRigidBodyPipeline.h
+	ConstraintSolver/b3ContactSolverInfo.h
+	ConstraintSolver/b3FixedConstraint.h
+	ConstraintSolver/b3Generic6DofConstraint.h
+	ConstraintSolver/b3JacobianEntry.h
+	ConstraintSolver/b3PgsJacobiSolver.h
+	ConstraintSolver/b3Point2PointConstraint.h
+	ConstraintSolver/b3SolverBody.h
+	ConstraintSolver/b3SolverConstraint.h
+	ConstraintSolver/b3TypedConstraint.h
+	shared/b3ContactConstraint4.h
+	shared/b3ConvertConstraint4.h
+	shared/b3Inertia.h
+	shared/b3IntegrateTransforms.h
+)
+
+ADD_LIBRARY(Bullet3Dynamics ${Bullet3Dynamics_SRCS} ${Bullet3Dynamics_HDRS})  # built from the source and header lists set above
+if (BUILD_SHARED_LIBS)  # Bullet3Collision is linked explicitly only for shared builds
+  target_link_libraries(Bullet3Dynamics Bullet3Collision)
+endif ()
+SET_TARGET_PROPERTIES(Bullet3Dynamics PROPERTIES VERSION ${BULLET_VERSION})  # VERSION and SOVERSION both come from BULLET_VERSION
+SET_TARGET_PROPERTIES(Bullet3Dynamics PROPERTIES SOVERSION ${BULLET_VERSION})
+
+IF (INSTALL_LIBS)  # every install rule below is skipped unless INSTALL_LIBS is set
+	IF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)  # also skipped when this option is set
+		#FILES_MATCHING requires CMake 2.6
+		IF (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 2.5)  # NOTE(review): GREATER compares "major.minor" as a number, not as a version
+			IF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)  # framework build: install the target into the prefix root
+				INSTALL(TARGETS Bullet3Dynamics DESTINATION .)
+			ELSE (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)  # otherwise: binaries to bin, libraries and archives to lib${LIB_SUFFIX}
+				INSTALL(TARGETS Bullet3Dynamics
+					RUNTIME DESTINATION bin
+					LIBRARY DESTINATION lib${LIB_SUFFIX}
+					ARCHIVE DESTINATION lib${LIB_SUFFIX})
+				INSTALL(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+DESTINATION ${INCLUDE_INSTALL_DIR} FILES_MATCHING PATTERN "*.h"  PATTERN
+".svn" EXCLUDE PATTERN "CMakeFiles" EXCLUDE)  # installs the *.h files, excluding .svn and CMakeFiles directories
+			ENDIF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+		ENDIF (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 2.5)
+
+		IF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)  # framework build: mark the target and expose its headers
+			SET_TARGET_PROPERTIES(Bullet3Dynamics PROPERTIES FRAMEWORK true)
+			SET_TARGET_PROPERTIES(Bullet3Dynamics PROPERTIES PUBLIC_HEADER "${Bullet3Dynamics_HDRS}")
+		ENDIF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+	ENDIF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
+ENDIF (INSTALL_LIBS)

+ 149 - 0
Dependencies/include/bullet3/Bullet3Dynamics/ConstraintSolver/b3ContactSolverInfo.h

@@ -0,0 +1,149 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  https://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef B3_CONTACT_SOLVER_INFO
+#define B3_CONTACT_SOLVER_INFO
+
+#include "Bullet3Common/b3Scalar.h"
+
+/// Bit flags that are OR-ed together into b3ContactSolverInfoData::m_solverMode.
+/// Bit 3 (value 8) is not assigned to any flag.
+enum b3SolverMode
+{
+	B3_SOLVER_RANDMIZE_ORDER = 1 << 0,
+	B3_SOLVER_FRICTION_SEPARATE = 1 << 1,
+	B3_SOLVER_USE_WARMSTARTING = 1 << 2,
+	B3_SOLVER_USE_2_FRICTION_DIRECTIONS = 1 << 4,
+	B3_SOLVER_ENABLE_FRICTION_DIRECTION_CACHING = 1 << 5,
+	B3_SOLVER_DISABLE_VELOCITY_DEPENDENT_FRICTION_DIRECTION = 1 << 6,
+	B3_SOLVER_CACHE_FRIENDLY = 1 << 7,
+	B3_SOLVER_SIMD = 1 << 8,
+	B3_SOLVER_INTERLEAVE_CONTACT_AND_FRICTION_CONSTRAINTS = 1 << 9,
+	B3_SOLVER_ALLOW_ZERO_LENGTH_FRICTION_DIRECTIONS = 1 << 10
+};
+
+struct b3ContactSolverInfoData  // Plain settings record for the contact solver; b3ContactSolverInfo supplies the defaults.
+{
+	b3Scalar m_tau;  // defaulted to 0.6 by b3ContactSolverInfo
+	b3Scalar m_damping;  //global non-contact constraint damping, can be locally overridden by constraints during 'getInfo2'.
+	b3Scalar m_friction;  // defaulted to 0.3 by b3ContactSolverInfo
+	b3Scalar m_timeStep;  // defaulted to 1/60 by b3ContactSolverInfo
+	b3Scalar m_restitution;  // defaulted to 0 by b3ContactSolverInfo
+	int m_numIterations;  // defaulted to 10 by b3ContactSolverInfo
+	b3Scalar m_maxErrorReduction;  // defaulted to 20 by b3ContactSolverInfo
+	b3Scalar m_sor;  // defaulted to 1 by b3ContactSolverInfo
+	b3Scalar m_erp;        //used as Baumgarte factor
+	b3Scalar m_erp2;       //used in Split Impulse
+	b3Scalar m_globalCfm;  //constraint force mixing
+	int m_splitImpulse;  // int used as a flag; b3ContactSolverInfo assigns true
+	b3Scalar m_splitImpulsePenetrationThreshold;  // defaulted to -0.04 by b3ContactSolverInfo
+	b3Scalar m_splitImpulseTurnErp;  // defaulted to 0.1 by b3ContactSolverInfo
+	b3Scalar m_linearSlop;  // defaulted to 0 by b3ContactSolverInfo
+	b3Scalar m_warmstartingFactor;  // defaulted to 0.85 by b3ContactSolverInfo
+
+	int m_solverMode;  // OR-ed b3SolverMode flags
+	int m_restingContactRestitutionThreshold;  // marked "unused as of 2.81" where the default is assigned
+	int m_minimumSolverBatchSize;  // islands are combined until the constraint count reaches this limit
+	b3Scalar m_maxGyroscopicForce;  // clamp for gyroscopic forces, per the note where the default is assigned
+	b3Scalar m_singleAxisRollingFrictionThreshold;  // velocity above which a single constraint row is used instead of three
+};
+
+/// b3ContactSolverInfoData with every field set to its default by the constructor.
+struct b3ContactSolverInfo : public b3ContactSolverInfoData
+{
+	/// Assigns defaults, in the same order as the fields are declared in the base struct.
+	inline b3ContactSolverInfo()
+	{
+		m_tau = b3Scalar(0.6);
+		m_damping = b3Scalar(1.0);
+		m_friction = b3Scalar(0.3);
+		m_timeStep = b3Scalar(1.f / 60.f);
+		m_restitution = b3Scalar(0.);
+		m_numIterations = 10;
+		m_maxErrorReduction = b3Scalar(20.);
+		m_sor = b3Scalar(1.);
+		m_erp = b3Scalar(0.2);
+		m_erp2 = b3Scalar(0.8);
+		m_globalCfm = b3Scalar(0.);
+		m_splitImpulse = true;
+		m_splitImpulsePenetrationThreshold = -.04f;
+		m_splitImpulseTurnErp = 0.1f;
+		m_linearSlop = b3Scalar(0.0);
+		m_warmstartingFactor = b3Scalar(0.85);
+
+		// Only warm starting and SIMD are enabled by default; B3_SOLVER_RANDMIZE_ORDER and the
+		// friction-direction flags are left off.
+		m_solverMode = B3_SOLVER_USE_WARMSTARTING | B3_SOLVER_SIMD;
+		// Unused as of 2.81.
+		m_restingContactRestitutionThreshold = 2;
+		// Try to combine islands until the amount of constraints reaches this limit.
+		m_minimumSolverBatchSize = 128;
+		// Only used to clamp forces for bodies that have their B3_ENABLE_GYROPSCOPIC_FORCE flag set (using b3RigidBody::setFlag).
+		m_maxGyroscopicForce = 100.f;
+		// If the velocity is above this threshold, a single constraint row (axis) is used, otherwise 3 rows.
+		m_singleAxisRollingFrictionThreshold = 1e30f;
+	}
+};
+
+///do not change those serialization structures, it requires an updated sBulletDNAstr/sBulletDNAstr64
+struct b3ContactSolverInfoDoubleData  // Serialization record: 16 doubles, then 5 ints, then 4 padding bytes.
+{
+	double m_tau;
+	double m_damping;  //global non-contact constraint damping, can be locally overridden by constraints during 'getInfo2'.
+	double m_friction;
+	double m_timeStep;
+	double m_restitution;
+	double m_maxErrorReduction;
+	double m_sor;
+	double m_erp;        //used as Baumgarte factor
+	double m_erp2;       //used in Split Impulse
+	double m_globalCfm;  //constraint force mixing
+	double m_splitImpulsePenetrationThreshold;
+	double m_splitImpulseTurnErp;
+	double m_linearSlop;
+	double m_warmstartingFactor;
+	double m_maxGyroscopicForce;
+	double m_singleAxisRollingFrictionThreshold;
+
+	int m_numIterations;  // the int fields are grouped after all floating-point fields
+	int m_solverMode;
+	int m_restingContactRestitutionThreshold;
+	int m_minimumSolverBatchSize;
+	int m_splitImpulse;
+	char m_padding[4];  // explicit padding following the five int fields
+};
+///do not change those serialization structures, it requires an updated sBulletDNAstr/sBulletDNAstr64
+struct b3ContactSolverInfoFloatData  // Serialization record: 16 floats, then 5 ints, then 4 padding bytes.
+{
+	float m_tau;
+	float m_damping;  //global non-contact constraint damping, can be locally overridden by constraints during 'getInfo2'.
+	float m_friction;
+	float m_timeStep;
+
+	float m_restitution;
+	float m_maxErrorReduction;
+	float m_sor;
+	float m_erp;  //used as Baumgarte factor
+
+	float m_erp2;       //used in Split Impulse
+	float m_globalCfm;  //constraint force mixing
+	float m_splitImpulsePenetrationThreshold;
+	float m_splitImpulseTurnErp;
+
+	float m_linearSlop;
+	float m_warmstartingFactor;
+	float m_maxGyroscopicForce;
+	float m_singleAxisRollingFrictionThreshold;
+
+	int m_numIterations;  // the int fields are grouped after all floating-point fields
+	int m_solverMode;
+	int m_restingContactRestitutionThreshold;
+	int m_minimumSolverBatchSize;
+
+	int m_splitImpulse;
+	char m_padding[4];  // explicit padding following the five int fields
+};
+
+#endif  //B3_CONTACT_SOLVER_INFO

+ 103 - 0
Dependencies/include/bullet3/Bullet3Dynamics/ConstraintSolver/b3FixedConstraint.cpp

@@ -0,0 +1,103 @@
+
+#include "b3FixedConstraint.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h"
+#include "Bullet3Common/b3TransformUtil.h"
+#include <new>
+
+/// Stores the origins of both frames as pivots and records the target relative rotation
+/// rotation(frameInA) * inverse(rotation(frameInB)).
+b3FixedConstraint::b3FixedConstraint(int rbA, int rbB, const b3Transform& frameInA, const b3Transform& frameInB)
+	: b3TypedConstraint(B3_FIXED_CONSTRAINT_TYPE, rbA, rbB)
+{
+	const b3Quaternion rotationA = frameInA.getRotation();
+	const b3Quaternion rotationB = frameInB.getRotation();
+	m_relTargetAB = rotationA * rotationB.inverse();
+
+	m_pivotInA = frameInA.getOrigin();
+	m_pivotInB = frameInB.getOrigin();
+}
+
+/// Nothing to release in the destructor body.
+b3FixedConstraint::~b3FixedConstraint()
+{
+	// Intentionally empty.
+}
+
+/// Reports a fixed count of six constraint rows; `bodies` is not consulted.
+void b3FixedConstraint::getInfo1(b3ConstraintInfo1* info, const b3RigidBodyData* bodies)
+{
+	const int fixedRowCount = 6;
+	info->m_numConstraintRows = fixedRowCount;
+	info->nub = fixedRowCount;
+}
+
+/// Fills the Jacobian entries and right-hand sides for the six rows of the fixed constraint.
+/// Rows 0-2 act on the pivot positions; rows 3-5 act on the relative rotation, whose error is
+/// derived from m_relTargetAB and the current relative rotation of the two bodies.
+/// The body-B Jacobian arrays (m_J2linearAxis, m_J2angularAxis) are only written when non-null.
+void b3FixedConstraint::getInfo2(b3ConstraintInfo2* info, const b3RigidBodyData* bodies)
+{
+	const b3Vector3& worldPosA = bodies[m_rbA].m_pos;
+	const b3Quaternion& worldOrnA = bodies[m_rbA].m_quat;
+	const b3Vector3& worldPosB = bodies[m_rbB].m_pos;
+	const b3Quaternion& worldOrnB = bodies[m_rbB].m_quat;
+
+	const int s = info->rowskip;
+
+	//fix the 3 linear degrees of freedom
+	info->m_J1linearAxis[0] = 1;
+	info->m_J1linearAxis[s + 1] = 1;
+	info->m_J1linearAxis[2 * s + 2] = 1;
+
+	// Pivot A rotated by body A's orientation; its negated skew-symmetric matrix fills the angular part of rows 0-2.
+	b3Vector3 a1 = b3QuatRotate(worldOrnA, m_pivotInA);
+	{
+		b3Vector3* angular0 = (b3Vector3*)(info->m_J1angularAxis);
+		b3Vector3* angular1 = (b3Vector3*)(info->m_J1angularAxis + s);
+		b3Vector3* angular2 = (b3Vector3*)(info->m_J1angularAxis + 2 * s);
+		b3Vector3 a1neg = -a1;
+		a1neg.getSkewSymmetricMatrix(angular0, angular1, angular2);
+	}
+
+	if (info->m_J2linearAxis)
+	{
+		info->m_J2linearAxis[0] = -1;
+		info->m_J2linearAxis[s + 1] = -1;
+		info->m_J2linearAxis[2 * s + 2] = -1;
+	}
+
+	b3Vector3 a2 = b3QuatRotate(worldOrnB, m_pivotInB);
+
+	// m_J2angularAxis is null-checked before rows 3-5 are written below, so the same guard is
+	// applied here; previously this write went through the pointer unconditionally.
+	if (info->m_J2angularAxis)
+	{
+		b3Vector3* angular0 = (b3Vector3*)(info->m_J2angularAxis);
+		b3Vector3* angular1 = (b3Vector3*)(info->m_J2angularAxis + s);
+		b3Vector3* angular2 = (b3Vector3*)(info->m_J2angularAxis + 2 * s);
+		a2.getSkewSymmetricMatrix(angular0, angular1, angular2);
+	}
+
+	// set right hand side for the linear dofs
+	b3Scalar k = info->fps * info->erp;
+	b3Vector3 linearError = k * (a2 + worldPosB - a1 - worldPosA);
+	int j;
+	for (j = 0; j < 3; j++)
+	{
+		info->m_constraintError[j * s] = linearError[j];
+	}
+
+	//fix the 3 angular degrees of freedom
+	const int start_row = 3;
+	const int start_index = start_row * s;
+
+	// 3 rows to make body rotations equal
+	info->m_J1angularAxis[start_index] = 1;
+	info->m_J1angularAxis[start_index + s + 1] = 1;
+	info->m_J1angularAxis[start_index + s * 2 + 2] = 1;
+	if (info->m_J2angularAxis)
+	{
+		info->m_J2angularAxis[start_index] = -1;
+		info->m_J2angularAxis[start_index + s + 1] = -1;
+		info->m_J2angularAxis[start_index + s * 2 + 2] = -1;
+	}
+
+	// set right hand side for the angular dofs
+	b3Vector3 diff;
+	b3Scalar angle;
+	b3Quaternion qrelCur = worldOrnA * worldOrnB.inverse();
+
+	b3TransformUtil::calculateDiffAxisAngleQuaternion(m_relTargetAB, qrelCur, diff, angle);
+	diff *= -angle;
+	for (j = 0; j < 3; j++)
+	{
+		info->m_constraintError[(start_row + j) * s] = k * diff[j];
+	}
+}

+ 34 - 0
Dependencies/include/bullet3/Bullet3Dynamics/ConstraintSolver/b3FixedConstraint.h

@@ -0,0 +1,34 @@
+
+#ifndef B3_FIXED_CONSTRAINT_H
+#define B3_FIXED_CONSTRAINT_H
+
+#include "b3TypedConstraint.h"
+
+B3_ATTRIBUTE_ALIGNED16(class)  // Constraint between bodies rbA and rbB built from two frames; see b3FixedConstraint.cpp.
+b3FixedConstraint : public b3TypedConstraint
+{
+	b3Vector3 m_pivotInA;  // origin of frameInA, stored by the constructor
+	b3Vector3 m_pivotInB;  // origin of frameInB, stored by the constructor
+	b3Quaternion m_relTargetAB;  // rotation(frameInA) * inverse(rotation(frameInB)), stored by the constructor
+
+public:
+	b3FixedConstraint(int rbA, int rbB, const b3Transform& frameInA, const b3Transform& frameInB);
+
+	virtual ~b3FixedConstraint();
+
+	virtual void getInfo1(b3ConstraintInfo1 * info, const b3RigidBodyData* bodies);  // reports 6 rows
+
+	virtual void getInfo2(b3ConstraintInfo2 * info, const b3RigidBodyData* bodies);  // fills Jacobian rows and constraint errors
+
+	virtual void setParam(int num, b3Scalar value, int axis = -1)  // no parameter is handled here
+	{
+		b3Assert(0);  // asserts unconditionally
+	}
+	virtual b3Scalar getParam(int num, int axis = -1) const  // no parameter is handled here
+	{
+		b3Assert(0);  // asserts unconditionally
+		return 0.f;  // value returned when the assert does not stop execution
+	}
+};
+
+#endif  //B3_FIXED_CONSTRAINT_H

+ 737 - 0
Dependencies/include/bullet3/Bullet3Dynamics/ConstraintSolver/b3Generic6DofConstraint.cpp

@@ -0,0 +1,737 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  https://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+/*
+2007-09-09
+Refactored by Francisco León
+email: [email protected]
+http://gimpact.sf.net
+*/
+
+#include "b3Generic6DofConstraint.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h"
+
+#include "Bullet3Common/b3TransformUtil.h"
+#include "Bullet3Common/b3TransformUtil.h"
+#include <new>
+
+#define D6_USE_OBSOLETE_METHOD false
+#define D6_USE_FRAME_OFFSET true
+
+/// Stores both constraint frames and the reference-frame choice, enables the frame-offset
+/// mode (D6_USE_FRAME_OFFSET), clears the flags, and computes the derived constraint
+/// state from the current body data.
+b3Generic6DofConstraint::b3Generic6DofConstraint(int rbA, int rbB, const b3Transform& frameInA, const b3Transform& frameInB, bool useLinearReferenceFrameA, const b3RigidBodyData* bodies)
+	: b3TypedConstraint(B3_D6_CONSTRAINT_TYPE, rbA, rbB),
+	  m_frameInA(frameInA),
+	  m_frameInB(frameInB),
+	  m_useLinearReferenceFrameA(useLinearReferenceFrameA),
+	  m_useOffsetForConstraintFrame(D6_USE_FRAME_OFFSET),
+	  m_flags(0)
+{
+	calculateTransforms(bodies);
+}
+
+#define GENERIC_D6_DISABLE_WARMSTARTING 1
+
+/// Maps a flat index in [0, 8] onto a matrix element: returns mat[index % 3][index / 3].
+b3Scalar btGetMatrixElem(const b3Matrix3x3& mat, int index);
+b3Scalar btGetMatrixElem(const b3Matrix3x3& mat, int index)
+{
+	const int first = index % 3;
+	const int second = index / 3;
+	return mat[first][second];
+}
+
+///MatrixToEulerXYZ from http://www.geometrictools.com/LibFoundation/Mathematics/Wm4Matrix3.inl.html
+/// Extracts XYZ Euler angles from mat into xyz.
+/// Returns true when the decomposition is unique, false in the two degenerate cases where
+/// element 2 is at or beyond +/-1 (xyz[1] is then +/-B3_HALF_PI and xyz[2] is forced to zero).
+///
+///	rot =  cy*cz          -cy*sz           sy
+///	       cz*sx*sy+cx*sz  cx*cz-sx*sy*sz -cy*sx
+///	      -cx*cz*sy+sx*sz  cz*sx+cx*sy*sz  cx*cy
+bool matrixToEulerXYZ(const b3Matrix3x3& mat, b3Vector3& xyz);
+bool matrixToEulerXYZ(const b3Matrix3x3& mat, b3Vector3& xyz)
+{
+	const b3Scalar sinY = btGetMatrixElem(mat, 2);
+
+	if (!(sinY < b3Scalar(1.0f)))
+	{
+		// WARNING.  Not unique.  XAngle + ZAngle = atan2(r10,r11)
+		xyz[0] = b3Atan2(btGetMatrixElem(mat, 3), btGetMatrixElem(mat, 4));
+		xyz[1] = B3_HALF_PI;
+		xyz[2] = 0.0;
+		return false;
+	}
+
+	if (!(sinY > b3Scalar(-1.0f)))
+	{
+		// WARNING.  Not unique.  XA - ZA = -atan2(r10,r11)
+		xyz[0] = -b3Atan2(btGetMatrixElem(mat, 3), btGetMatrixElem(mat, 4));
+		xyz[1] = -B3_HALF_PI;
+		xyz[2] = b3Scalar(0.0);
+		return false;
+	}
+
+	// Regular case: all three angles are recoverable.
+	xyz[0] = b3Atan2(-btGetMatrixElem(mat, 5), btGetMatrixElem(mat, 8));
+	xyz[1] = b3Asin(btGetMatrixElem(mat, 2));
+	xyz[2] = b3Atan2(-btGetMatrixElem(mat, 1), btGetMatrixElem(mat, 0));
+	return true;
+}
+
+//////////////////////////// b3RotationalLimitMotor ////////////////////////////////////
+
+/// Classifies test_value against [m_loLimit, m_hiLimit] and stores the result in m_currentLimit.
+/// Returns 0 when free (also when m_loLimit > m_hiLimit), 1 for a low-limit violation and
+/// 2 for a high-limit violation. On a violation m_currentLimitError receives the signed
+/// distance to the violated limit, shifted by B3_2_PI when it lies outside [-B3_PI, B3_PI].
+int b3RotationalLimitMotor::testLimitValue(b3Scalar test_value)
+{
+	int violation = 0;
+	b3Scalar violatedLimit = b3Scalar(0.);
+
+	if (!(m_loLimit > m_hiLimit))
+	{
+		if (test_value < m_loLimit)
+		{
+			violation = 1;  //low limit violation
+			violatedLimit = m_loLimit;
+		}
+		else if (test_value > m_hiLimit)
+		{
+			violation = 2;  //High limit violation
+			violatedLimit = m_hiLimit;
+		}
+	}
+
+	m_currentLimit = violation;
+	if (violation == 0)
+	{
+		// Free from violation: the stored error is left untouched.
+		return 0;
+	}
+
+	b3Scalar error = test_value - violatedLimit;
+	if (error > B3_PI)
+	{
+		error -= B3_2_PI;
+	}
+	else if (error < -B3_PI)
+	{
+		error += B3_2_PI;
+	}
+	m_currentLimitError = error;
+	return violation;
+}
+
+//////////////////////////// End b3RotationalLimitMotor ////////////////////////////////////
+
+//////////////////////////// b3TranslationalLimitMotor ////////////////////////////////////
+
+/// Classifies test_value against the limits of axis limitIndex and stores both the state and
+/// the signed error for that axis. Returns 0 when free (also when lower > upper), 2 for a
+/// low-limit violation and 1 for a high-limit violation. Note that these codes are the
+/// reverse of the ones b3RotationalLimitMotor::testLimitValue uses.
+int b3TranslationalLimitMotor::testLimitValue(int limitIndex, b3Scalar test_value)
+{
+	const b3Scalar lower = m_lowerLimit[limitIndex];
+	const b3Scalar upper = m_upperLimit[limitIndex];
+
+	int state = 0;  //Free from violation
+	b3Scalar error = b3Scalar(0.f);
+
+	if (!(lower > upper))
+	{
+		if (test_value < lower)
+		{
+			state = 2;  //low limit violation
+			error = test_value - lower;
+		}
+		else if (test_value > upper)
+		{
+			state = 1;  //High limit violation
+			error = test_value - upper;
+		}
+	}
+
+	m_currentLimit[limitIndex] = state;
+	m_currentLimitError[limitIndex] = error;
+	return state;
+}
+
+//////////////////////////// b3TranslationalLimitMotor ////////////////////////////////////
+
+/// Updates m_calculatedAxisAngleDiff (Euler angles of frame B relative to frame A) and the
+/// three normalized constraint axes in m_calculatedAxis.
+void b3Generic6DofConstraint::calculateAngleInfo()
+{
+	const b3Matrix3x3 relativeBasis = m_calculatedTransformA.getBasis().inverse() * m_calculatedTransformB.getBasis();
+	matrixToEulerXYZ(relativeBasis, m_calculatedAxisAngleDiff);
+
+	// In euler angle mode the angular velocity is not constrained along axis[0] and
+	// axis[2] themselves (axis[1] is used directly):
+	//
+	//    to get			constrain w2-w1 along		...not
+	//    ------			---------------------		------
+	//    d(angle[0])/dt = 0	ax[1] x ax[2]			ax[0]
+	//    d(angle[1])/dt = 0	ax[1]
+	//    d(angle[2])/dt = 0	ax[0] x ax[1]			ax[2]
+	//
+	// Constraining w2-w1 along an axis 'a' means that a'*(w2-w1)=0.
+	// For angle[0], write its expression from GetInfo1 and take the derivative; for
+	// angle[2] it is easier to take the euler rate expression for d(angle[2])/dt with
+	// respect to the components of w and set that to 0.
+	const b3Vector3 xOfB = m_calculatedTransformB.getBasis().getColumn(0);
+	const b3Vector3 zOfA = m_calculatedTransformA.getBasis().getColumn(2);
+
+	const b3Vector3 middle = zOfA.cross(xOfB);
+	m_calculatedAxis[1] = middle;
+	m_calculatedAxis[0] = middle.cross(zOfA);
+	m_calculatedAxis[2] = xOfB.cross(middle);
+
+	for (int i = 0; i < 3; i++)
+	{
+		m_calculatedAxis[i].normalize();
+	}
+}
+
+/// Builds a transform from the body's orientation (m_quat) and position (m_pos).
+static b3Transform getCenterOfMassTransform(const b3RigidBodyData& body)
+{
+	return b3Transform(body.m_quat, body.m_pos);
+}
+
+/// Convenience overload: reads the transforms of bodies m_rbA and m_rbB from `bodies`
+/// and forwards to the three-argument overload.
+void b3Generic6DofConstraint::calculateTransforms(const b3RigidBodyData* bodies)
+{
+	const b3Transform bodyTransformA = getCenterOfMassTransform(bodies[m_rbA]);
+	const b3Transform bodyTransformB = getCenterOfMassTransform(bodies[m_rbB]);
+	calculateTransforms(bodyTransformA, bodyTransformB, bodies);
+}
+
+/// Recomputes the constraint frames (body transform * local frame), the linear and angular
+/// differences, and - in frame-offset mode - the mass-dependent weights m_factA / m_factB
+/// together with m_hasStaticBody.
+void b3Generic6DofConstraint::calculateTransforms(const b3Transform& transA, const b3Transform& transB, const b3RigidBodyData* bodies)
+{
+	m_calculatedTransformA = transA * m_frameInA;
+	m_calculatedTransformB = transB * m_frameInB;
+	calculateLinearInfo();
+	calculateAngleInfo();
+
+	if (!m_useOffsetForConstraintFrame)
+	{
+		return;
+	}
+
+	//  get weight factors depending on masses
+	const b3Scalar invMassA = bodies[m_rbA].m_invMass;
+	const b3Scalar invMassB = bodies[m_rbB].m_invMass;
+	m_hasStaticBody = (invMassA < B3_EPSILON) || (invMassB < B3_EPSILON);
+
+	const b3Scalar invMassSum = invMassA + invMassB;
+	// With a zero sum the weight is split evenly.
+	m_factA = (invMassSum > b3Scalar(0.f)) ? (invMassB / invMassSum) : b3Scalar(0.5f);
+	m_factB = b3Scalar(1.0f) - m_factA;
+}
+
+/// Adjusts the current angle of the given axis to its limits, stores it as the motor's
+/// current position, runs the limit test, and reports whether torques need to be applied.
+bool b3Generic6DofConstraint::testAngularLimitMotor(int axis_index)
+{
+	const b3Scalar rawAngle = m_calculatedAxisAngleDiff[axis_index];
+	const b3Scalar adjusted = b3AdjustAngleToLimits(rawAngle, m_angularLimits[axis_index].m_loLimit, m_angularLimits[axis_index].m_hiLimit);
+
+	m_angularLimits[axis_index].m_currentPosition = adjusted;
+	//test limits
+	m_angularLimits[axis_index].testLimitValue(adjusted);
+
+	return m_angularLimits[axis_index].needApplyTorques();
+}
+
+/// Refreshes the constraint state from the current body transforms and reports one
+/// constraint row per linear or angular axis that currently needs one; nub is six minus that count.
+void b3Generic6DofConstraint::getInfo1(b3ConstraintInfo1* info, const b3RigidBodyData* bodies)
+{
+	//prepare constraint
+	calculateTransforms(getCenterOfMassTransform(bodies[m_rbA]), getCenterOfMassTransform(bodies[m_rbB]), bodies);
+
+	int activeRows = 0;
+
+	//test linear limits
+	for (int axis = 0; axis < 3; axis++)
+	{
+		if (m_linearLimits.needApplyForce(axis))
+		{
+			activeRows++;
+		}
+	}
+
+	//test angular limits (this also refreshes each angular motor's limit state)
+	for (int axis = 0; axis < 3; axis++)
+	{
+		if (testAngularLimitMotor(axis))
+		{
+			activeRows++;
+		}
+	}
+
+	info->m_numConstraintRows = activeRows;
+	info->nub = 6 - activeRows;
+}
+
+/// Reserves all six rows up front without inspecting the limits; `bodies` is not consulted.
+void b3Generic6DofConstraint::getInfo1NonVirtual(b3ConstraintInfo1* info, const b3RigidBodyData* bodies)
+{
+	//pre-allocate all 6
+	info->nub = 0;
+	info->m_numConstraintRows = 6;
+}
+
+/// Fills the constraint rows using the current transforms and velocities of both bodies.
+/// In frame-offset mode the angular rows are written first, otherwise the linear rows are.
+void b3Generic6DofConstraint::getInfo2(b3ConstraintInfo2* info, const b3RigidBodyData* bodies)
+{
+	const b3RigidBodyData& bodyA = bodies[m_rbA];
+	const b3RigidBodyData& bodyB = bodies[m_rbB];
+
+	b3Transform transA = getCenterOfMassTransform(bodyA);
+	b3Transform transB = getCenterOfMassTransform(bodyB);
+	const b3Vector3& linVelA = bodyA.m_linVel;
+	const b3Vector3& linVelB = bodyB.m_linVel;
+	const b3Vector3& angVelA = bodyA.m_angVel;
+	const b3Vector3& angVelB = bodyB.m_angVel;
+
+	if (!m_useOffsetForConstraintFrame)
+	{
+		// leave old version for compatibility
+		const int nextRow = setLinearLimits(info, 0, transA, transB, linVelA, linVelB, angVelA, angVelB);
+		setAngularLimits(info, nextRow, transA, transB, linVelA, linVelB, angVelA, angVelB);
+		return;
+	}
+
+	// for stability better to solve angular limits first
+	const int nextRow = setAngularLimits(info, 0, transA, transB, linVelA, linVelB, angVelA, angVelB);
+	setLinearLimits(info, nextRow, transA, transB, linVelA, linVelB, angVelA, angVelB);
+}
+
+/// Variant of getInfo2 that takes transforms and velocities explicitly. It first refreshes
+/// the constraint state and the angular limit tests, then fills the rows (angular first in
+/// frame-offset mode, linear first otherwise).
+void b3Generic6DofConstraint::getInfo2NonVirtual(b3ConstraintInfo2* info, const b3Transform& transA, const b3Transform& transB, const b3Vector3& linVelA, const b3Vector3& linVelB, const b3Vector3& angVelA, const b3Vector3& angVelB, const b3RigidBodyData* bodies)
+{
+	//prepare constraint
+	calculateTransforms(transA, transB, bodies);
+
+	for (int axis = 0; axis < 3; axis++)
+	{
+		testAngularLimitMotor(axis);
+	}
+
+	if (!m_useOffsetForConstraintFrame)
+	{
+		// leave old version for compatibility
+		const int nextRow = setLinearLimits(info, 0, transA, transB, linVelA, linVelB, angVelA, angVelB);
+		setAngularLimits(info, nextRow, transA, transB, linVelA, linVelB, angVelA, angVelB);
+		return;
+	}
+
+	// for stability better to solve angular limits first
+	const int nextRow = setAngularLimits(info, 0, transA, transB, linVelA, linVelB, angVelA, angVelB);
+	setLinearLimits(info, nextRow, transA, transB, linVelA, linVelB, angVelA, angVelB);
+}
+
+/// Writes one constraint row for every linear axis that needs a force, starting at `row`.
+/// Each axis is expressed through a scratch b3RotationalLimitMotor so the shared
+/// get_limit_motor_info2 code can be reused. Returns the first unused row index.
+int b3Generic6DofConstraint::setLinearLimits(b3ConstraintInfo2* info, int row, const b3Transform& transA, const b3Transform& transB, const b3Vector3& linVelA, const b3Vector3& linVelB, const b3Vector3& angVelA, const b3Vector3& angVelB)
+{
+	b3RotationalLimitMotor scratch;
+	for (int axis = 0; axis < 3; axis++)
+	{
+		if (!m_linearLimits.needApplyForce(axis))
+		{
+			continue;
+		}
+
+		// re-use rotational motor code: copy the per-axis linear state into the scratch motor
+		scratch.m_loLimit = m_linearLimits.m_lowerLimit[axis];
+		scratch.m_hiLimit = m_linearLimits.m_upperLimit[axis];
+		scratch.m_currentLimit = m_linearLimits.m_currentLimit[axis];
+		scratch.m_currentLimitError = m_linearLimits.m_currentLimitError[axis];
+		scratch.m_currentPosition = m_linearLimits.m_currentLinearDiff[axis];
+		scratch.m_enableMotor = m_linearLimits.m_enableMotor[axis];
+		scratch.m_targetVelocity = m_linearLimits.m_targetVelocity[axis];
+		scratch.m_maxMotorForce = m_linearLimits.m_maxMotorForce[axis];
+		scratch.m_damping = m_linearLimits.m_damping;
+		scratch.m_limitSoftness = m_linearLimits.m_limitSoftness;
+		scratch.m_bounce = b3Scalar(0.f);
+		scratch.m_maxLimitForce = b3Scalar(0.f);
+
+		// Per-axis overrides fall back to the solver-wide cfm / erp when the matching flag is not set.
+		const int axisFlags = m_flags >> (axis * B3_6DOF_FLAGS_AXIS_SHIFT);
+		scratch.m_normalCFM = (axisFlags & B3_6DOF_FLAGS_CFM_NORM) ? m_linearLimits.m_normalCFM[axis] : info->cfm[0];
+		scratch.m_stopCFM = (axisFlags & B3_6DOF_FLAGS_CFM_STOP) ? m_linearLimits.m_stopCFM[axis] : info->cfm[0];
+		scratch.m_stopERP = (axisFlags & B3_6DOF_FLAGS_ERP_STOP) ? m_linearLimits.m_stopERP[axis] : info->erp;
+
+		b3Vector3 axisDir = m_calculatedTransformA.getBasis().getColumn(axis);
+
+		if (!m_useOffsetForConstraintFrame)
+		{
+			row += get_limit_motor_info2(&scratch, transA, transB, linVelA, linVelB, angVelA, angVelB, info, row, axisDir, 0);
+			continue;
+		}
+
+		// rotations around orthos to current axis are disallowed only when both other angular axes are at a limit
+		const int otherA = (axis + 1) % 3;
+		const int otherB = (axis + 2) % 3;
+		int rotAllowed = 1;
+		if (m_angularLimits[otherA].m_currentLimit && m_angularLimits[otherB].m_currentLimit)
+		{
+			rotAllowed = 0;
+		}
+		row += get_limit_motor_info2(&scratch, transA, transB, linVelA, linVelB, angVelA, angVelB, info, row, axisDir, 0, rotAllowed);
+	}
+	return row;
+}
+
+/// Writes one constraint row for every angular axis whose motor needs torques, starting at
+/// row_offset. Axes without the matching override flag have their CFM / ERP values replaced
+/// by the solver-wide ones. Returns the first unused row index.
+int b3Generic6DofConstraint::setAngularLimits(b3ConstraintInfo2* info, int row_offset, const b3Transform& transA, const b3Transform& transB, const b3Vector3& linVelA, const b3Vector3& linVelB, const b3Vector3& angVelA, const b3Vector3& angVelB)
+{
+	int nextRow = row_offset;
+	//solve angular limits
+	for (int axis = 0; axis < 3; axis++)
+	{
+		if (!getRotationalLimitMotor(axis)->needApplyTorques())
+		{
+			continue;
+		}
+
+		b3Vector3 axisDir = getAxis(axis);
+
+		// The angular axes use the flag groups after the three linear ones.
+		const int axisFlags = m_flags >> ((axis + 3) * B3_6DOF_FLAGS_AXIS_SHIFT);
+		const bool hasNormalCfm = (axisFlags & B3_6DOF_FLAGS_CFM_NORM) != 0;
+		const bool hasStopCfm = (axisFlags & B3_6DOF_FLAGS_CFM_STOP) != 0;
+		const bool hasStopErp = (axisFlags & B3_6DOF_FLAGS_ERP_STOP) != 0;
+
+		if (!hasNormalCfm)
+		{
+			m_angularLimits[axis].m_normalCFM = info->cfm[0];
+		}
+		if (!hasStopCfm)
+		{
+			m_angularLimits[axis].m_stopCFM = info->cfm[0];
+		}
+		if (!hasStopErp)
+		{
+			m_angularLimits[axis].m_stopERP = info->erp;
+		}
+
+		nextRow += get_limit_motor_info2(getRotationalLimitMotor(axis),
+										 transA, transB, linVelA, linVelB, angVelA, angVelB, info, nextRow, axisDir, 1);
+	}
+
+	return nextRow;
+}
+
+/// Does nothing; the parameter is referenced only to mark it as intentionally unused.
+void b3Generic6DofConstraint::updateRHS(b3Scalar timeStep)
+{
+	static_cast<void>(timeStep);
+}
+
+/// Replaces both local constraint frames and recomputes the derived constraint state
+/// from the current body data.
+void b3Generic6DofConstraint::setFrames(const b3Transform& frameA, const b3Transform& frameB, const b3RigidBodyData* bodies)
+{
+	m_frameInB = frameB;
+	m_frameInA = frameA;
+
+	// Derived state depends on both frames, so refresh it after both have been stored.
+	calculateTransforms(bodies);
+}
+
+/// Returns a copy of the constraint axis most recently computed by calculateAngleInfo().
+b3Vector3 b3Generic6DofConstraint::getAxis(int axis_index) const
+{
+	const b3Vector3& axis = m_calculatedAxis[axis_index];
+	return axis;
+}
+
+/// Returns component axisIndex of the linear difference most recently computed by calculateLinearInfo().
+b3Scalar b3Generic6DofConstraint::getRelativePivotPosition(int axisIndex) const
+{
+	const b3Scalar offset = m_calculatedLinearDiff[axisIndex];
+	return offset;
+}
+
+/// Returns component axisIndex of the Euler-angle difference most recently computed by calculateAngleInfo().
+b3Scalar b3Generic6DofConstraint::getAngle(int axisIndex) const
+{
+	const b3Scalar angle = m_calculatedAxisAngleDiff[axisIndex];
+	return angle;
+}
+
+/// Sets m_AnchorPos to a blend of the two constraint frame origins. Frame A's origin gets
+/// weight invMassA / (invMassA + invMassB), or the full weight when body B's inverse mass is zero.
+void b3Generic6DofConstraint::calcAnchorPos(const b3RigidBodyData* bodies)
+{
+	const b3Scalar invMassA = bodies[m_rbA].m_invMass;
+	const b3Scalar invMassB = bodies[m_rbB].m_invMass;
+
+	const b3Scalar weightA = (invMassB == b3Scalar(0.0)) ? b3Scalar(1.0) : invMassA / (invMassA + invMassB);
+
+	const b3Vector3& originA = m_calculatedTransformA.getOrigin();
+	const b3Vector3& originB = m_calculatedTransformB.getOrigin();
+	m_AnchorPos = originA * weightA + originB * (b3Scalar(1.0) - weightA);
+}
+
+/// Computes the offset from frame A's origin to frame B's origin, expressed through the
+/// inverse of frame A's basis, then copies each component into the linear limits and runs
+/// the per-axis limit test.
+void b3Generic6DofConstraint::calculateLinearInfo()
+{
+	const b3Vector3 originDelta = m_calculatedTransformB.getOrigin() - m_calculatedTransformA.getOrigin();
+	m_calculatedLinearDiff = m_calculatedTransformA.getBasis().inverse() * originDelta;
+
+	for (int axis = 0; axis < 3; axis++)
+	{
+		const b3Scalar component = m_calculatedLinearDiff[axis];
+		m_linearLimits.m_currentLinearDiff[axis] = component;
+		m_linearLimits.testLimitValue(axis, component);
+	}
+}
+
+/// Fills one solver row of `info` for a single limit/motor and reports whether a row was written.
+///
+/// \param limot       limit/motor state for the axis being processed
+/// \param transA      transform of body A (read only when building a linear row)
+/// \param transB      transform of body B (read only when building a linear row)
+/// \param linVelA     linear velocity of body A (read only by the bounce term of a linear row)
+/// \param linVelB     linear velocity of body B (read only by the bounce term of a linear row)
+/// \param angVelA     angular velocity of body A (read only by the bounce term of an angular row)
+/// \param angVelB     angular velocity of body B (read only by the bounce term of an angular row)
+/// \param info        row storage being filled; the row starts at offset row * info->rowskip
+/// \param row         index of the row to fill
+/// \param ax1         constraint axis written into the Jacobian (negated for the second body)
+/// \param rotational  non-zero for an angular row, zero for a linear row
+/// \param rotAllowed  when zero and m_hasStaticBody is set, the angular parts of a linear row
+///                    are scaled by m_factA / m_factB (offset-frame path only)
+/// \return 1 when the motor is enabled or a limit is active (a row was filled), otherwise 0
+int b3Generic6DofConstraint::get_limit_motor_info2(
+	b3RotationalLimitMotor* limot,
+	const b3Transform& transA, const b3Transform& transB, const b3Vector3& linVelA, const b3Vector3& linVelB, const b3Vector3& angVelA, const b3Vector3& angVelB,
+	b3ConstraintInfo2* info, int row, b3Vector3& ax1, int rotational, int rotAllowed)
+{
+	int srow = row * info->rowskip;  // offset of this row inside the flat row arrays
+	bool powered = limot->m_enableMotor;
+	int limit = limot->m_currentLimit;
+	if (powered || limit)
+	{  // if the joint is powered, or has joint limits, add in the extra row
+		b3Scalar* J1 = rotational ? info->m_J1angularAxis : info->m_J1linearAxis;
+		b3Scalar* J2 = rotational ? info->m_J2angularAxis : info->m_J2linearAxis;
+		if (J1)
+		{
+			J1[srow + 0] = ax1[0];
+			J1[srow + 1] = ax1[1];
+			J1[srow + 2] = ax1[2];
+		}
+		if (J2)
+		{
+			J2[srow + 0] = -ax1[0];
+			J2[srow + 1] = -ax1[1];
+			J2[srow + 2] = -ax1[2];
+		}
+		// NOTE(review): unlike J1/J2 above, the linear branch below writes to
+		// info->m_J1angularAxis / info->m_J2angularAxis without a null check - confirm callers always provide them.
+		if ((!rotational))
+		{
+			if (m_useOffsetForConstraintFrame)
+			{
+				b3Vector3 tmpA, tmpB, relA, relB;
+				// get vector from bodyB to frameB in WCS
+				relB = m_calculatedTransformB.getOrigin() - transB.getOrigin();
+				// get its projection to constraint axis
+				b3Vector3 projB = ax1 * relB.dot(ax1);
+				// get vector directed from bodyB to constraint axis (and orthogonal to it)
+				b3Vector3 orthoB = relB - projB;
+				// same for bodyA
+				relA = m_calculatedTransformA.getOrigin() - transA.getOrigin();
+				b3Vector3 projA = ax1 * relA.dot(ax1);
+				b3Vector3 orthoA = relA - projA;
+				// get desired offset between frames A and B along constraint axis
+				b3Scalar desiredOffs = limot->m_currentPosition - limot->m_currentLimitError;
+				// desired vector from projection of center of bodyA to projection of center of bodyB to constraint axis
+				b3Vector3 totalDist = projA + ax1 * desiredOffs - projB;
+				// get offset vectors relA and relB
+				relA = orthoA + totalDist * m_factA;
+				relB = orthoB - totalDist * m_factB;
+				tmpA = relA.cross(ax1);
+				tmpB = relB.cross(ax1);
+				if (m_hasStaticBody && (!rotAllowed))
+				{
+					tmpA *= m_factA;
+					tmpB *= m_factB;
+				}
+				int i;
+				for (i = 0; i < 3; i++) info->m_J1angularAxis[srow + i] = tmpA[i];
+				for (i = 0; i < 3; i++) info->m_J2angularAxis[srow + i] = -tmpB[i];
+			}
+			else
+			{
+				// both lever arms are measured to the origin of calculated frame B
+				b3Vector3 ltd;  // Linear Torque Decoupling vector
+				b3Vector3 c = m_calculatedTransformB.getOrigin() - transA.getOrigin();
+				ltd = c.cross(ax1);
+				info->m_J1angularAxis[srow + 0] = ltd[0];
+				info->m_J1angularAxis[srow + 1] = ltd[1];
+				info->m_J1angularAxis[srow + 2] = ltd[2];
+
+				c = m_calculatedTransformB.getOrigin() - transB.getOrigin();
+				ltd = -c.cross(ax1);
+				info->m_J2angularAxis[srow + 0] = ltd[0];
+				info->m_J2angularAxis[srow + 1] = ltd[1];
+				info->m_J2angularAxis[srow + 2] = ltd[2];
+			}
+		}
+		// if we're limited low and high simultaneously, the joint motor is
+		// ineffective
+		if (limit && (limot->m_loLimit == limot->m_hiLimit)) powered = false;
+		info->m_constraintError[srow] = b3Scalar(0.f);
+		if (powered)
+		{
+			info->cfm[srow] = limot->m_normalCFM;
+			if (!limit)
+			{
+				// the target velocity is negated for linear rows before being passed to getMotorFactor
+				b3Scalar tag_vel = rotational ? limot->m_targetVelocity : -limot->m_targetVelocity;
+
+				b3Scalar mot_fact = getMotorFactor(limot->m_currentPosition,
+												   limot->m_loLimit,
+												   limot->m_hiLimit,
+												   tag_vel,
+												   info->fps * limot->m_stopERP);
+				// NOTE(review): the error term uses m_targetVelocity directly, not the
+				// sign-adjusted tag_vel computed above - confirm this is intended.
+				info->m_constraintError[srow] += mot_fact * limot->m_targetVelocity;
+				// motor force limit converted to a per-step bound using info->fps
+				info->m_lowerLimit[srow] = -limot->m_maxMotorForce / info->fps;
+				info->m_upperLimit[srow] = limot->m_maxMotorForce / info->fps;
+			}
+		}
+		if (limit)
+		{
+			b3Scalar k = info->fps * limot->m_stopERP;
+			// the limit error is added with opposite sign for linear and angular rows
+			if (!rotational)
+			{
+				info->m_constraintError[srow] += k * limot->m_currentLimitError;
+			}
+			else
+			{
+				info->m_constraintError[srow] += -k * limot->m_currentLimitError;
+			}
+			info->cfm[srow] = limot->m_stopCFM;
+			if (limot->m_loLimit == limot->m_hiLimit)
+			{  // limited low and high simultaneously
+				info->m_lowerLimit[srow] = -B3_INFINITY;
+				info->m_upperLimit[srow] = B3_INFINITY;
+			}
+			else
+			{
+				// one-sided bound: limit == 1 allows only non-negative values, any other limit only non-positive
+				if (limit == 1)
+				{
+					info->m_lowerLimit[srow] = 0;
+					info->m_upperLimit[srow] = B3_INFINITY;
+				}
+				else
+				{
+					info->m_lowerLimit[srow] = -B3_INFINITY;
+					info->m_upperLimit[srow] = 0;
+				}
+				// deal with bounce
+				if (limot->m_bounce > 0)
+				{
+					// calculate joint velocity
+					b3Scalar vel;
+					if (rotational)
+					{
+						vel = angVelA.dot(ax1);
+						//make sure that if no body -> angVelB == zero vec
+						//                        if (body1)
+						vel -= angVelB.dot(ax1);
+					}
+					else
+					{
+						vel = linVelA.dot(ax1);
+						//make sure that if no body -> angVelB == zero vec
+						//                        if (body1)
+						vel -= linVelB.dot(ax1);
+					}
+					// only apply bounce if the velocity is incoming, and if the
+					// resulting c[] exceeds what we already have.
+					if (limit == 1)
+					{
+						if (vel < 0)
+						{
+							b3Scalar newc = -limot->m_bounce * vel;
+							if (newc > info->m_constraintError[srow])
+								info->m_constraintError[srow] = newc;
+						}
+					}
+					else
+					{
+						if (vel > 0)
+						{
+							b3Scalar newc = -limot->m_bounce * vel;
+							if (newc < info->m_constraintError[srow])
+								info->m_constraintError[srow] = newc;
+						}
+					}
+				}
+			}
+		}
+		return 1;
+	}
+	else
+		return 0;
+}
+
+/// Stores a per-axis solver parameter and records in m_flags that it was overridden.
+/// \param num   B3_CONSTRAINT_STOP_ERP, B3_CONSTRAINT_STOP_CFM or B3_CONSTRAINT_CFM
+/// \param value new value of the parameter
+/// \param axis  0..2 selects a linear axis, 3..5 selects an angular axis
+/// Any other axis or parameter id only triggers b3AssertConstrParams(0); nothing is stored.
+void b3Generic6DofConstraint::setParam(int num, b3Scalar value, int axis)
+{
+	const bool linearAxis = (axis >= 0) && (axis < 3);
+	const bool angularAxis = (axis >= 3) && (axis < 6);
+	if (!linearAxis && !angularAxis)
+	{
+		b3AssertConstrParams(0);
+		return;
+	}
+
+	// each axis owns B3_6DOF_FLAGS_AXIS_SHIFT consecutive bits of m_flags
+	const int flagShift = axis * B3_6DOF_FLAGS_AXIS_SHIFT;
+	switch (num)
+	{
+		case B3_CONSTRAINT_STOP_ERP:
+			if (linearAxis)
+				m_linearLimits.m_stopERP[axis] = value;
+			else
+				m_angularLimits[axis - 3].m_stopERP = value;
+			m_flags |= B3_6DOF_FLAGS_ERP_STOP << flagShift;
+			break;
+		case B3_CONSTRAINT_STOP_CFM:
+			if (linearAxis)
+				m_linearLimits.m_stopCFM[axis] = value;
+			else
+				m_angularLimits[axis - 3].m_stopCFM = value;
+			m_flags |= B3_6DOF_FLAGS_CFM_STOP << flagShift;
+			break;
+		case B3_CONSTRAINT_CFM:
+			if (linearAxis)
+				m_linearLimits.m_normalCFM[axis] = value;
+			else
+				m_angularLimits[axis - 3].m_normalCFM = value;
+			m_flags |= B3_6DOF_FLAGS_CFM_NORM << flagShift;
+			break;
+		default:
+			b3AssertConstrParams(0);
+	}
+}
+
+/// Reads back a per-axis solver parameter.
+/// \param num  B3_CONSTRAINT_STOP_ERP, B3_CONSTRAINT_STOP_CFM or B3_CONSTRAINT_CFM
+/// \param axis 0..2 selects a linear axis, 3..5 selects an angular axis
+/// \return the stored value; 0 when the axis or the parameter id is not recognised
+/// b3AssertConstrParams is invoked when the axis or parameter id is invalid, and also
+/// when the matching override bit in m_flags is not set.
+b3Scalar b3Generic6DofConstraint::getParam(int num, int axis) const
+{
+	const bool linearAxis = (axis >= 0) && (axis < 3);
+	const bool angularAxis = (axis >= 3) && (axis < 6);
+	if (!linearAxis && !angularAxis)
+	{
+		b3AssertConstrParams(0);
+		return 0;
+	}
+
+	// each axis owns B3_6DOF_FLAGS_AXIS_SHIFT consecutive bits of m_flags
+	const int flagShift = axis * B3_6DOF_FLAGS_AXIS_SHIFT;
+	b3Scalar result = 0;
+	switch (num)
+	{
+		case B3_CONSTRAINT_STOP_ERP:
+			b3AssertConstrParams(m_flags & (B3_6DOF_FLAGS_ERP_STOP << flagShift));
+			result = linearAxis ? m_linearLimits.m_stopERP[axis] : m_angularLimits[axis - 3].m_stopERP;
+			break;
+		case B3_CONSTRAINT_STOP_CFM:
+			b3AssertConstrParams(m_flags & (B3_6DOF_FLAGS_CFM_STOP << flagShift));
+			result = linearAxis ? m_linearLimits.m_stopCFM[axis] : m_angularLimits[axis - 3].m_stopCFM;
+			break;
+		case B3_CONSTRAINT_CFM:
+			b3AssertConstrParams(m_flags & (B3_6DOF_FLAGS_CFM_NORM << flagShift));
+			result = linearAxis ? m_linearLimits.m_normalCFM[axis] : m_angularLimits[axis - 3].m_normalCFM;
+			break;
+		default:
+			b3AssertConstrParams(0);
+	}
+	return result;
+}
+
+/// Rebuilds both constraint frames from two axis directions.
+/// axis1 (normalized) becomes the Z column and axis2 (normalized) the Y column of the frame basis;
+/// the X column is their cross product Y x Z. The frame keeps an identity origin and is expressed
+/// in each body's local space through the inverse of getCenterOfMassTransform(), after which the
+/// cached transforms are recomputed.
+void b3Generic6DofConstraint::setAxis(const b3Vector3& axis1, const b3Vector3& axis2, const b3RigidBodyData* bodies)
+{
+	const b3Vector3 zDir = axis1.normalized();
+	const b3Vector3 yDir = axis2.normalized();
+	const b3Vector3 xDir = yDir.cross(zDir);  // completes a right-handed basis
+
+	b3Transform worldFrame;
+	worldFrame.setIdentity();
+	worldFrame.getBasis().setValue(
+		xDir[0], yDir[0], zDir[0],
+		xDir[1], yDir[1], zDir[1],
+		xDir[2], yDir[2], zDir[2]);
+
+	// express the frame in the local coordinate system of each body
+	const b3Transform invBodyA = getCenterOfMassTransform(bodies[m_rbA]).inverse();
+	const b3Transform invBodyB = getCenterOfMassTransform(bodies[m_rbB]).inverse();
+	m_frameInA = invBodyA * worldFrame;
+	m_frameInB = invBodyB * worldFrame;
+
+	calculateTransforms(bodies);
+}

+ 517 - 0
Dependencies/include/bullet3/Bullet3Dynamics/ConstraintSolver/b3Generic6DofConstraint.h

@@ -0,0 +1,517 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  https://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+/// 2009 March: b3Generic6DofConstraint refactored by Roman Ponomarev
+/// Added support for generic constraint solver through getInfo1/getInfo2 methods
+
+/*
+2007-09-09
+b3Generic6DofConstraint Refactored by Francisco León
+email: [email protected]
+http://gimpact.sf.net
+*/
+
+#ifndef B3_GENERIC_6DOF_CONSTRAINT_H
+#define B3_GENERIC_6DOF_CONSTRAINT_H
+
+#include "Bullet3Common/b3Vector3.h"
+#include "b3JacobianEntry.h"
+#include "b3TypedConstraint.h"
+
+struct b3RigidBodyData;
+
+//! Rotation Limit structure for generic joints
+/*!
+Holds the limit range, the motor settings and the per-step solver state of one angular axis.
+The limit counts as "not limited" while m_loLimit > m_hiLimit, which is the default state.
+*/
+class b3RotationalLimitMotor
+{
+public:
+	//! limit_parameters
+	//!@{
+	b3Scalar m_loLimit;         //!< joint limit
+	b3Scalar m_hiLimit;         //!< joint limit
+	b3Scalar m_targetVelocity;  //!< target motor velocity
+	b3Scalar m_maxMotorForce;   //!< max force on motor
+	b3Scalar m_maxLimitForce;   //!< max force on limit
+	b3Scalar m_damping;         //!< Damping.
+	b3Scalar m_limitSoftness;   //!< Relaxation factor
+	b3Scalar m_normalCFM;       //!< Constraint force mixing factor
+	b3Scalar m_stopERP;         //!< Error tolerance factor when joint is at limit
+	b3Scalar m_stopCFM;         //!< Constraint force mixing factor when joint is at limit
+	b3Scalar m_bounce;          //!< restitution factor
+	bool m_enableMotor;
+
+	//!@}
+
+	//! temp_variables
+	//!@{
+	b3Scalar m_currentLimitError;  //!< How much this limit is violated
+	b3Scalar m_currentPosition;    //!< current value of angle
+	int m_currentLimit;            //!< 0=free, 1=at lo limit, 2=at hi limit
+	b3Scalar m_accumulatedImpulse;
+	//!@}
+
+	//! Creates a free (lo > hi), unpowered limit with every member initialized.
+	b3RotationalLimitMotor()
+	{
+		m_accumulatedImpulse = 0.f;
+		m_targetVelocity = 0;
+		m_maxMotorForce = 6.0f;
+		m_maxLimitForce = 300.0f;
+		m_loLimit = 1.0f;
+		m_hiLimit = -1.0f;
+		m_normalCFM = 0.f;
+		m_stopERP = 0.2f;
+		m_stopCFM = 0.f;
+		m_bounce = 0.0f;
+		m_damping = 1.0f;
+		m_limitSoftness = 0.5f;
+		m_currentLimit = 0;
+		m_currentLimitError = 0;
+		// previously left uninitialized; it is read by get_limit_motor_info2
+		m_currentPosition = 0;
+		m_enableMotor = false;
+	}
+
+	//! Copies every member, including the temporary solver state.
+	/*!
+	m_maxLimitForce, m_damping, m_currentPosition and m_accumulatedImpulse used to be skipped,
+	which left them holding indeterminate values in the copy.
+	*/
+	b3RotationalLimitMotor(const b3RotationalLimitMotor& limot)
+	{
+		m_loLimit = limot.m_loLimit;
+		m_hiLimit = limot.m_hiLimit;
+		m_targetVelocity = limot.m_targetVelocity;
+		m_maxMotorForce = limot.m_maxMotorForce;
+		m_maxLimitForce = limot.m_maxLimitForce;
+		m_damping = limot.m_damping;
+		m_limitSoftness = limot.m_limitSoftness;
+		m_normalCFM = limot.m_normalCFM;
+		m_stopERP = limot.m_stopERP;
+		m_stopCFM = limot.m_stopCFM;
+		m_bounce = limot.m_bounce;
+		m_enableMotor = limot.m_enableMotor;
+		m_currentLimitError = limot.m_currentLimitError;
+		m_currentPosition = limot.m_currentPosition;
+		m_currentLimit = limot.m_currentLimit;
+		m_accumulatedImpulse = limot.m_accumulatedImpulse;
+	}
+
+	//! Is limited
+	/*!
+	Returns false only when m_loLimit > m_hiLimit.
+	*/
+	bool isLimited()
+	{
+		if (m_loLimit > m_hiLimit) return false;
+		return true;
+	}
+
+	//! Need apply correction
+	/*!
+	Returns false only when no limit is active (m_currentLimit == 0) and the motor is disabled.
+	*/
+	bool needApplyTorques()
+	{
+		if (m_currentLimit == 0 && m_enableMotor == false) return false;
+		return true;
+	}
+
+	//! calculates  error
+	/*!
+	calculates m_currentLimit and m_currentLimitError.
+	*/
+	int testLimitValue(b3Scalar test_value);
+
+	//! apply the correction impulses for two bodies
+	b3Scalar solveAngularLimits(b3Scalar timeStep, b3Vector3& axis, b3Scalar jacDiagABInv, b3RigidBodyData* body0, b3RigidBodyData* body1);
+};
+
+//! Limit and motor settings for the three linear axes of a generic joint.
+/*!
+Vector members hold one value per linear axis (index 0..2).
+*/
+class b3TranslationalLimitMotor
+{
+public:
+	b3Vector3 m_lowerLimit;  //!< the constraint lower limits
+	b3Vector3 m_upperLimit;  //!< the constraint upper limits
+	b3Vector3 m_accumulatedImpulse;
+	//! Linear_Limit_parameters
+	//!@{
+	b3Vector3 m_normalCFM;          //!< Constraint force mixing factor
+	b3Vector3 m_stopERP;            //!< Error tolerance factor when joint is at limit
+	b3Vector3 m_stopCFM;            //!< Constraint force mixing factor when joint is at limit
+	b3Vector3 m_targetVelocity;     //!< target motor velocity
+	b3Vector3 m_maxMotorForce;      //!< max force on motor
+	b3Vector3 m_currentLimitError;  //!< How much this limit is violated
+	b3Vector3 m_currentLinearDiff;  //!< Current relative offset of constraint frames
+	b3Scalar m_limitSoftness;       //!< Softness for linear limit
+	b3Scalar m_damping;             //!< Damping for linear limit
+	b3Scalar m_restitution;         //!< Bounce parameter for linear limit
+	//!@}
+	bool m_enableMotor[3];
+	int m_currentLimit[3];  //!< 0=free, 1=at lower limit, 2=at upper limit
+
+	//! Creates a locked (lower == upper == 0), unpowered limit with every member initialized.
+	b3TranslationalLimitMotor()
+	{
+		m_lowerLimit.setValue(0.f, 0.f, 0.f);
+		m_upperLimit.setValue(0.f, 0.f, 0.f);
+		m_accumulatedImpulse.setValue(0.f, 0.f, 0.f);
+		m_normalCFM.setValue(0.f, 0.f, 0.f);
+		m_stopERP.setValue(0.2f, 0.2f, 0.2f);
+		m_stopCFM.setValue(0.f, 0.f, 0.f);
+		// previously left uninitialized until the first testLimitValue() call
+		m_currentLimitError.setValue(0.f, 0.f, 0.f);
+		m_currentLinearDiff.setValue(0.f, 0.f, 0.f);
+
+		m_limitSoftness = 0.7f;
+		m_damping = b3Scalar(1.0f);
+		m_restitution = b3Scalar(0.5f);
+		for (int i = 0; i < 3; i++)
+		{
+			m_enableMotor[i] = false;
+			m_currentLimit[i] = 0;  // needApplyForce() reads this, so it must start as "free"
+			m_targetVelocity[i] = b3Scalar(0.f);
+			m_maxMotorForce[i] = b3Scalar(0.f);
+		}
+	}
+
+	//! Copies every member, including the per-step limit state.
+	/*!
+	m_currentLimit, m_currentLimitError and m_currentLinearDiff used to be skipped,
+	which left them holding indeterminate values in the copy.
+	*/
+	b3TranslationalLimitMotor(const b3TranslationalLimitMotor& other)
+	{
+		m_lowerLimit = other.m_lowerLimit;
+		m_upperLimit = other.m_upperLimit;
+		m_accumulatedImpulse = other.m_accumulatedImpulse;
+
+		m_limitSoftness = other.m_limitSoftness;
+		m_damping = other.m_damping;
+		m_restitution = other.m_restitution;
+		m_normalCFM = other.m_normalCFM;
+		m_stopERP = other.m_stopERP;
+		m_stopCFM = other.m_stopCFM;
+		m_currentLimitError = other.m_currentLimitError;
+		m_currentLinearDiff = other.m_currentLinearDiff;
+
+		for (int i = 0; i < 3; i++)
+		{
+			m_enableMotor[i] = other.m_enableMotor[i];
+			m_currentLimit[i] = other.m_currentLimit[i];
+			m_targetVelocity[i] = other.m_targetVelocity[i];
+			m_maxMotorForce[i] = other.m_maxMotorForce[i];
+		}
+	}
+
+	//! Test limit
+	/*!
+    - free means upper < lower,
+    - locked means upper == lower
+    - limited means upper > lower
+    - limitIndex: linear axis index, 0..2
+    Returns true for both the locked and the limited case.
+    */
+	inline bool isLimited(int limitIndex)
+	{
+		return (m_upperLimit[limitIndex] >= m_lowerLimit[limitIndex]);
+	}
+	//! Returns false only when the axis is free (m_currentLimit == 0) and its motor is disabled.
+	inline bool needApplyForce(int limitIndex)
+	{
+		if (m_currentLimit[limitIndex] == 0 && m_enableMotor[limitIndex] == false) return false;
+		return true;
+	}
+	int testLimitValue(int limitIndex, b3Scalar test_value);
+
+	b3Scalar solveLinearAxis(
+		b3Scalar timeStep,
+		b3Scalar jacDiagABInv,
+		b3RigidBodyData& body1, const b3Vector3& pointInA,
+		b3RigidBodyData& body2, const b3Vector3& pointInB,
+		int limit_index,
+		const b3Vector3& axis_normal_on_a,
+		const b3Vector3& anchorPos);
+};
+
+/// Bits recording which per-axis solver parameters were overridden through setParam().
+/// The bit for axis N is the value below shifted left by N * B3_6DOF_FLAGS_AXIS_SHIFT.
+enum b36DofFlags
+{
+	B3_6DOF_FLAGS_CFM_NORM = 1 << 0,
+	B3_6DOF_FLAGS_CFM_STOP = 1 << 1,
+	B3_6DOF_FLAGS_ERP_STOP = 1 << 2
+};
+#define B3_6DOF_FLAGS_AXIS_SHIFT 3  // bits per axis
+
+/// b3Generic6DofConstraint between two rigidbodies each with a pivotpoint that describes the axis location in local space
+/*!
+b3Generic6DofConstraint can leave any of the 6 degree of freedom 'free' or 'locked'.
+currently this limit supports rotational motors<br>
+<ul>
+<li> For Linear limits, use b3Generic6DofConstraint.setLinearUpperLimit, b3Generic6DofConstraint.setLinearLowerLimit. You can set the parameters with the b3TranslationalLimitMotor structure accessible through the b3Generic6DofConstraint.getTranslationalLimitMotor method.
+At this moment translational motors are not supported. May be in the future. </li>
+
+<li> For Angular limits, use the b3RotationalLimitMotor structure for configuring the limit.
+This is accessible through the b3Generic6DofConstraint.getRotationalLimitMotor method.
+This brings support for limit parameters and motors. </li>
+
+<li> Angular limits have these possible ranges:
+<table border=1 >
+<tr>
+	<td><b>AXIS</b></td>
+	<td><b>MIN ANGLE</b></td>
+	<td><b>MAX ANGLE</b></td>
+</tr><tr>
+	<td>X</td>
+	<td>-PI</td>
+	<td>PI</td>
+</tr><tr>
+	<td>Y</td>
+	<td>-PI/2</td>
+	<td>PI/2</td>
+</tr><tr>
+	<td>Z</td>
+	<td>-PI</td>
+	<td>PI</td>
+</tr>
+</table>
+</li>
+</ul>
+
+*/
+B3_ATTRIBUTE_ALIGNED16(class)
+b3Generic6DofConstraint : public b3TypedConstraint
+{
+protected:
+	//! relative_frames
+	//!@{
+	b3Transform m_frameInA;  //!< the constraint space w.r.t body A
+	b3Transform m_frameInB;  //!< the constraint space w.r.t body B
+	//!@}
+
+	//! Jacobians
+	//!@{
+	//    b3JacobianEntry	m_jacLinear[3];//!< 3 orthogonal linear constraints
+	//    b3JacobianEntry	m_jacAng[3];//!< 3 orthogonal angular constraints
+	//!@}
+
+	//! Linear_Limit_parameters
+	//!@{
+	b3TranslationalLimitMotor m_linearLimits;
+	//!@}
+
+	//! hinge_parameters
+	//!@{
+	b3RotationalLimitMotor m_angularLimits[3];
+	//!@}
+
+protected:
+	//! temporal variables
+	//!@{
+	b3Transform m_calculatedTransformA;   //!< cached transform of the constraint frame for body A
+	b3Transform m_calculatedTransformB;   //!< cached transform of the constraint frame for body B
+	b3Vector3 m_calculatedAxisAngleDiff;  //!< per-axis angle difference returned by getAngle()
+	b3Vector3 m_calculatedAxis[3];        //!< constraint axes returned by getAxis()
+	b3Vector3 m_calculatedLinearDiff;     //!< per-axis linear offset returned by getRelativePivotPosition()
+	b3Scalar m_timeStep;
+	b3Scalar m_factA;  //!< weight applied to body A in get_limit_motor_info2
+	b3Scalar m_factB;  //!< weight applied to body B in get_limit_motor_info2
+	bool m_hasStaticBody;
+
+	b3Vector3 m_AnchorPos;  // point between pivots of bodies A and B to solve linear axes
+
+	bool m_useLinearReferenceFrameA;
+	bool m_useOffsetForConstraintFrame;
+
+	int m_flags;  //!< b36DofFlags bits, B3_6DOF_FLAGS_AXIS_SHIFT bits per axis (see setParam)
+
+	//!@}
+
+	//! Assignment is not supported: this operator only asserts and returns *this unchanged.
+	b3Generic6DofConstraint& operator=(b3Generic6DofConstraint& other)
+	{
+		b3Assert(0);
+		(void)other;
+		return *this;
+	}
+
+	int setAngularLimits(b3ConstraintInfo2 * info, int row_offset, const b3Transform& transA, const b3Transform& transB, const b3Vector3& linVelA, const b3Vector3& linVelB, const b3Vector3& angVelA, const b3Vector3& angVelB);
+
+	int setLinearLimits(b3ConstraintInfo2 * info, int row, const b3Transform& transA, const b3Transform& transB, const b3Vector3& linVelA, const b3Vector3& linVelB, const b3Vector3& angVelA, const b3Vector3& angVelB);
+
+	// tests linear limits
+	void calculateLinearInfo();
+
+	//! calcs the euler angles between the two bodies.
+	void calculateAngleInfo();
+
+public:
+	B3_DECLARE_ALIGNED_ALLOCATOR();
+
+	b3Generic6DofConstraint(int rbA, int rbB, const b3Transform& frameInA, const b3Transform& frameInB, bool useLinearReferenceFrameA, const b3RigidBodyData* bodies);
+
+	//! Calcs global transform of the offsets
+	/*!
+	Calcs the global transform for the joint offset for body A and B, and also calcs the angle differences between the bodies.
+	\sa b3Generic6DofConstraint.getCalculatedTransformA , b3Generic6DofConstraint.getCalculatedTransformB, b3Generic6DofConstraint.calculateAngleInfo
+	*/
+	void calculateTransforms(const b3Transform& transA, const b3Transform& transB, const b3RigidBodyData* bodies);
+
+	void calculateTransforms(const b3RigidBodyData* bodies);
+
+	//! Gets the global transform of the offset for body A
+	/*!
+    \sa b3Generic6DofConstraint.getFrameOffsetA, b3Generic6DofConstraint.getFrameOffsetB, b3Generic6DofConstraint.calculateAngleInfo.
+    */
+	const b3Transform& getCalculatedTransformA() const
+	{
+		return m_calculatedTransformA;
+	}
+
+	//! Gets the global transform of the offset for body B
+	/*!
+    \sa b3Generic6DofConstraint.getFrameOffsetA, b3Generic6DofConstraint.getFrameOffsetB, b3Generic6DofConstraint.calculateAngleInfo.
+    */
+	const b3Transform& getCalculatedTransformB() const
+	{
+		return m_calculatedTransformB;
+	}
+
+	//! Read-only access to the constraint frame stored for body A
+	const b3Transform& getFrameOffsetA() const
+	{
+		return m_frameInA;
+	}
+
+	//! Read-only access to the constraint frame stored for body B
+	const b3Transform& getFrameOffsetB() const
+	{
+		return m_frameInB;
+	}
+
+	//! Mutable access to the constraint frame stored for body A (cached transforms are not recomputed here)
+	b3Transform& getFrameOffsetA()
+	{
+		return m_frameInA;
+	}
+
+	//! Mutable access to the constraint frame stored for body B (cached transforms are not recomputed here)
+	b3Transform& getFrameOffsetB()
+	{
+		return m_frameInB;
+	}
+
+	virtual void getInfo1(b3ConstraintInfo1 * info, const b3RigidBodyData* bodies);
+
+	void getInfo1NonVirtual(b3ConstraintInfo1 * info, const b3RigidBodyData* bodies);
+
+	virtual void getInfo2(b3ConstraintInfo2 * info, const b3RigidBodyData* bodies);
+
+	void getInfo2NonVirtual(b3ConstraintInfo2 * info, const b3Transform& transA, const b3Transform& transB, const b3Vector3& linVelA, const b3Vector3& linVelB, const b3Vector3& angVelA, const b3Vector3& angVelB, const b3RigidBodyData* bodies);
+
+	void updateRHS(b3Scalar timeStep);
+
+	//! Get the rotation axis in global coordinates
+	b3Vector3 getAxis(int axis_index) const;
+
+	//! Get the relative Euler angle
+	/*!
+	\pre b3Generic6DofConstraint::calculateTransforms() must be called previously.
+	*/
+	b3Scalar getAngle(int axis_index) const;
+
+	//! Get the relative position of the constraint pivot
+	/*!
+	\pre b3Generic6DofConstraint::calculateTransforms() must be called previously.
+	*/
+	b3Scalar getRelativePivotPosition(int axis_index) const;
+
+	//! Replaces both constraint frames and recomputes the cached transforms.
+	void setFrames(const b3Transform& frameA, const b3Transform& frameB, const b3RigidBodyData* bodies);
+
+	//! Test angular limit.
+	/*!
+	Calculates angular correction and returns true if limit needs to be corrected.
+	\pre b3Generic6DofConstraint::calculateTransforms() must be called previously.
+	*/
+	bool testAngularLimitMotor(int axis_index);
+
+	//! Sets the lower limit of all three linear axes at once.
+	void setLinearLowerLimit(const b3Vector3& linearLower)
+	{
+		m_linearLimits.m_lowerLimit = linearLower;
+	}
+
+	//! Copies the lower limit of the linear axes into linearLower.
+	void getLinearLowerLimit(b3Vector3 & linearLower)
+	{
+		linearLower = m_linearLimits.m_lowerLimit;
+	}
+
+	//! Sets the upper limit of all three linear axes at once.
+	void setLinearUpperLimit(const b3Vector3& linearUpper)
+	{
+		m_linearLimits.m_upperLimit = linearUpper;
+	}
+
+	//! Copies the upper limit of the linear axes into linearUpper.
+	void getLinearUpperLimit(b3Vector3 & linearUpper)
+	{
+		linearUpper = m_linearLimits.m_upperLimit;
+	}
+
+	//! Sets the lower limit of the three angular axes; each value is passed through b3NormalizeAngle first.
+	void setAngularLowerLimit(const b3Vector3& angularLower)
+	{
+		for (int i = 0; i < 3; i++)
+			m_angularLimits[i].m_loLimit = b3NormalizeAngle(angularLower[i]);
+	}
+
+	//! Copies the stored (already normalized) lower angular limits into angularLower.
+	void getAngularLowerLimit(b3Vector3 & angularLower)
+	{
+		for (int i = 0; i < 3; i++)
+			angularLower[i] = m_angularLimits[i].m_loLimit;
+	}
+
+	//! Sets the upper limit of the three angular axes; each value is passed through b3NormalizeAngle first.
+	void setAngularUpperLimit(const b3Vector3& angularUpper)
+	{
+		for (int i = 0; i < 3; i++)
+			m_angularLimits[i].m_hiLimit = b3NormalizeAngle(angularUpper[i]);
+	}
+
+	//! Copies the stored (already normalized) upper angular limits into angularUpper.
+	void getAngularUpperLimit(b3Vector3 & angularUpper)
+	{
+		for (int i = 0; i < 3; i++)
+			angularUpper[i] = m_angularLimits[i].m_hiLimit;
+	}
+
+	//! Retrieves the angular limit information (index 0..2, not range-checked)
+	b3RotationalLimitMotor* getRotationalLimitMotor(int index)
+	{
+		return &m_angularLimits[index];
+	}
+
+	//! Retrieves the linear limit information
+	b3TranslationalLimitMotor* getTranslationalLimitMotor()
+	{
+		return &m_linearLimits;
+	}
+
+	//first 3 are linear, next 3 are angular
+	//angular limits are passed through b3NormalizeAngle before being stored; axis is not range-checked
+	void setLimit(int axis, b3Scalar lo, b3Scalar hi)
+	{
+		if (axis < 3)
+		{
+			m_linearLimits.m_lowerLimit[axis] = lo;
+			m_linearLimits.m_upperLimit[axis] = hi;
+		}
+		else
+		{
+			lo = b3NormalizeAngle(lo);
+			hi = b3NormalizeAngle(hi);
+			m_angularLimits[axis - 3].m_loLimit = lo;
+			m_angularLimits[axis - 3].m_hiLimit = hi;
+		}
+	}
+
+	//! Test limit
+	/*!
+    - free means upper < lower,
+    - locked means upper == lower
+    - limited means upper > lower
+    - limitIndex: first 3 are linear, next 3 are angular
+    */
+	bool isLimited(int limitIndex)
+	{
+		if (limitIndex < 3)
+		{
+			return m_linearLimits.isLimited(limitIndex);
+		}
+		return m_angularLimits[limitIndex - 3].isLimited();
+	}
+
+	virtual void calcAnchorPos(const b3RigidBodyData* bodies);  // overridable
+
+	//! Fills one solver row for the given limit/motor; returns 1 when a row was written, 0 otherwise.
+	int get_limit_motor_info2(b3RotationalLimitMotor * limot,
+							  const b3Transform& transA, const b3Transform& transB, const b3Vector3& linVelA, const b3Vector3& linVelB, const b3Vector3& angVelA, const b3Vector3& angVelB,
+							  b3ConstraintInfo2* info, int row, b3Vector3& ax1, int rotational, int rotAllowed = false);
+
+	// access for UseFrameOffset
+	bool getUseFrameOffset() { return m_useOffsetForConstraintFrame; }
+	void setUseFrameOffset(bool frameOffsetOnOff) { m_useOffsetForConstraintFrame = frameOffsetOnOff; }
+
+	///override the default global value of a parameter (such as ERP or CFM) for one axis (0..5).
+	///NOTE(review): the default axis of -1 is outside 0..5, which the implementation treats as an invalid axis.
+	virtual void setParam(int num, b3Scalar value, int axis = -1);
+	///return the local value of parameter
+	virtual b3Scalar getParam(int num, int axis = -1) const;
+
+	//! Rebuilds both constraint frames from two axis directions and recomputes the cached transforms.
+	void setAxis(const b3Vector3& axis1, const b3Vector3& axis2, const b3RigidBodyData* bodies);
+};
+
+#endif  //B3_GENERIC_6DOF_CONSTRAINT_H

+ 150 - 0
Dependencies/include/bullet3/Bullet3Dynamics/ConstraintSolver/b3JacobianEntry.h

@@ -0,0 +1,150 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  https://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef B3_JACOBIAN_ENTRY_H
+#define B3_JACOBIAN_ENTRY_H
+
+#include "Bullet3Common/b3Matrix3x3.h"
+
+//notes:
+// Another memory optimization would be to store m_1MinvJt in the remaining 3 w components
+// which makes the b3JacobianEntry memory layout 16 bytes
+// if you only are interested in angular part, just feed massInvA and massInvB zero
+
+/// Jacobian entry is an abstraction that allows to describe constraints
+/// it can be used in combination with a constraint solver
+/// Can be used to relate the effect of an impulse to the constraint error
+///
+/// Every constructor fills the Jacobian vectors (m_aJ, m_bJ), their products with the
+/// supplied inverse inertia (m_0MinvJt, m_1MinvJt) and the diagonal term m_Adiag,
+/// and asserts that m_Adiag is strictly positive.
+B3_ATTRIBUTE_ALIGNED16(class)
+b3JacobianEntry
+{
+public:
+	// default construction leaves all members unset
+	b3JacobianEntry(){};
+	//constraint between two different rigidbodies
+	// rel_pos1 / rel_pos2 are crossed with +jointAxis / -jointAxis and multiplied by world2A / world2B
+	b3JacobianEntry(
+		const b3Matrix3x3& world2A,
+		const b3Matrix3x3& world2B,
+		const b3Vector3& rel_pos1, const b3Vector3& rel_pos2,
+		const b3Vector3& jointAxis,
+		const b3Vector3& inertiaInvA,
+		const b3Scalar massInvA,
+		const b3Vector3& inertiaInvB,
+		const b3Scalar massInvB)
+		: m_linearJointAxis(jointAxis)
+	{
+		m_aJ = world2A * (rel_pos1.cross(m_linearJointAxis));
+		m_bJ = world2B * (rel_pos2.cross(-m_linearJointAxis));
+		m_0MinvJt = inertiaInvA * m_aJ;
+		m_1MinvJt = inertiaInvB * m_bJ;
+		m_Adiag = massInvA + m_0MinvJt.dot(m_aJ) + massInvB + m_1MinvJt.dot(m_bJ);
+
+		b3Assert(m_Adiag > b3Scalar(0.0));
+	}
+
+	//angular constraint between two different rigidbodies
+	// the linear axis is zero, so no inverse-mass term enters m_Adiag
+	b3JacobianEntry(const b3Vector3& jointAxis,
+					const b3Matrix3x3& world2A,
+					const b3Matrix3x3& world2B,
+					const b3Vector3& inertiaInvA,
+					const b3Vector3& inertiaInvB)
+		: m_linearJointAxis(b3MakeVector3(b3Scalar(0.), b3Scalar(0.), b3Scalar(0.)))
+	{
+		m_aJ = world2A * jointAxis;
+		m_bJ = world2B * -jointAxis;
+		m_0MinvJt = inertiaInvA * m_aJ;
+		m_1MinvJt = inertiaInvB * m_bJ;
+		m_Adiag = m_0MinvJt.dot(m_aJ) + m_1MinvJt.dot(m_bJ);
+
+		b3Assert(m_Adiag > b3Scalar(0.0));
+	}
+
+	//angular constraint between two different rigidbodies
+	// variant taking the axis for each body directly (axisInA is stored as is, axisInB negated)
+	b3JacobianEntry(const b3Vector3& axisInA,
+					const b3Vector3& axisInB,
+					const b3Vector3& inertiaInvA,
+					const b3Vector3& inertiaInvB)
+		: m_linearJointAxis(b3MakeVector3(b3Scalar(0.), b3Scalar(0.), b3Scalar(0.))), m_aJ(axisInA), m_bJ(-axisInB)
+	{
+		m_0MinvJt = inertiaInvA * m_aJ;
+		m_1MinvJt = inertiaInvB * m_bJ;
+		m_Adiag = m_0MinvJt.dot(m_aJ) + m_1MinvJt.dot(m_bJ);
+
+		b3Assert(m_Adiag > b3Scalar(0.0));
+	}
+
+	//constraint on one rigidbody
+	// m_bJ is still computed (using world2A), but m_1MinvJt is zero and only body A contributes to m_Adiag
+	b3JacobianEntry(
+		const b3Matrix3x3& world2A,
+		const b3Vector3& rel_pos1, const b3Vector3& rel_pos2,
+		const b3Vector3& jointAxis,
+		const b3Vector3& inertiaInvA,
+		const b3Scalar massInvA)
+		: m_linearJointAxis(jointAxis)
+	{
+		m_aJ = world2A * (rel_pos1.cross(jointAxis));
+		m_bJ = world2A * (rel_pos2.cross(-jointAxis));
+		m_0MinvJt = inertiaInvA * m_aJ;
+		m_1MinvJt = b3MakeVector3(b3Scalar(0.), b3Scalar(0.), b3Scalar(0.));
+		m_Adiag = massInvA + m_0MinvJt.dot(m_aJ);
+
+		b3Assert(m_Adiag > b3Scalar(0.0));
+	}
+
+	// returns the diagonal term computed by the constructor
+	b3Scalar getDiagonal() const { return m_Adiag; }
+
+	// for two constraints on the same rigidbody (for example vehicle friction)
+	b3Scalar getNonDiagonal(const b3JacobianEntry& jacB, const b3Scalar massInvA) const
+	{
+		const b3JacobianEntry& jacA = *this;
+		b3Scalar lin = massInvA * jacA.m_linearJointAxis.dot(jacB.m_linearJointAxis);
+		b3Scalar ang = jacA.m_0MinvJt.dot(jacB.m_aJ);
+		return lin + ang;
+	}
+
+	// for two constraints on sharing two same rigidbodies (for example two contact points between two rigidbodies)
+	// NOTE(review): presumably b3Vector3 * b3Vector3 is a component-wise product here, so summing
+	// the three components of `sum` yields the dot products - confirm against b3Vector3.h
+	b3Scalar getNonDiagonal(const b3JacobianEntry& jacB, const b3Scalar massInvA, const b3Scalar massInvB) const
+	{
+		const b3JacobianEntry& jacA = *this;
+		b3Vector3 lin = jacA.m_linearJointAxis * jacB.m_linearJointAxis;
+		b3Vector3 ang0 = jacA.m_0MinvJt * jacB.m_aJ;
+		b3Vector3 ang1 = jacA.m_1MinvJt * jacB.m_bJ;
+		b3Vector3 lin0 = massInvA * lin;
+		b3Vector3 lin1 = massInvB * lin;
+		b3Vector3 sum = ang0 + ang1 + lin0 + lin1;
+		return sum[0] + sum[1] + sum[2];
+	}
+
+	// combines the linear velocity difference and both angular velocities with the stored Jacobian vectors;
+	// B3_EPSILON is added to the returned value
+	b3Scalar getRelativeVelocity(const b3Vector3& linvelA, const b3Vector3& angvelA, const b3Vector3& linvelB, const b3Vector3& angvelB)
+	{
+		b3Vector3 linrel = linvelA - linvelB;
+		b3Vector3 angvela = angvelA * m_aJ;
+		b3Vector3 angvelb = angvelB * m_bJ;
+		linrel *= m_linearJointAxis;
+		angvela += angvelb;
+		angvela += linrel;
+		b3Scalar rel_vel2 = angvela[0] + angvela[1] + angvela[2];
+		return rel_vel2 + B3_EPSILON;
+	}
+	//private:
+
+	b3Vector3 m_linearJointAxis;  // joint axis for linear constraints, zero for the angular-only constructors
+	b3Vector3 m_aJ;               // Jacobian vector for body A
+	b3Vector3 m_bJ;               // Jacobian vector for body B
+	b3Vector3 m_0MinvJt;          // inertiaInvA * m_aJ
+	b3Vector3 m_1MinvJt;          // inertiaInvB * m_bJ (zero for the one-body constructor)
+	//Optimization: can be stored in the w/last component of one of the vectors
+	b3Scalar m_Adiag;
+};
+
+#endif  //B3_JACOBIAN_ENTRY_H

+ 1696 - 0
Dependencies/include/bullet3/Bullet3Dynamics/ConstraintSolver/b3PgsJacobiSolver.cpp

@@ -0,0 +1,1696 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2012 Erwin Coumans  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+//enable B3_SOLVER_DEBUG if you experience solver crashes
+//#define B3_SOLVER_DEBUG
+//#define COMPUTE_IMPULSE_DENOM 1
+//It is not necessary (redundant) to refresh contact manifolds, this refresh has been moved to the collision algorithms.
+
+//#define DISABLE_JOINTS
+
+#include "b3PgsJacobiSolver.h"
+#include "Bullet3Common/b3MinMax.h"
+#include "b3TypedConstraint.h"
+#include <new>
+#include "Bullet3Common/b3StackAlloc.h"
+
+//#include "b3SolverBody.h"
+//#include "b3SolverConstraint.h"
+#include "Bullet3Common/b3AlignedObjectArray.h"
+#include <string.h>  //for memset
+//#include "../../dynamics/basic_demo/Stubs/AdlContact4.h"
+#include "Bullet3Collision/NarrowPhaseCollision/b3Contact4.h"
+
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h"
+
+// Builds a b3Transform from a rigid body's position (m_pos) and orientation (m_quat).
+static b3Transform getWorldTransform(b3RigidBodyData* rb)
+{
+	b3Transform bodyTransform;
+	bodyTransform.setOrigin(rb->m_pos);
+	bodyTransform.setRotation(rb->m_quat);
+	return bodyTransform;
+}
+
+// Accessor for the m_invInertiaWorld member of a b3InertiaData (returned by reference, no copy).
+static const b3Matrix3x3& getInvInertiaTensorWorld(b3InertiaData* inertia)
+{
+	const b3Matrix3x3& worldInvInertia = inertia->m_invInertiaWorld;
+	return worldInvInertia;
+}
+
+// Accessor for a rigid body's linear velocity (m_linVel), returned by reference.
+static const b3Vector3& getLinearVelocity(b3RigidBodyData* rb)
+{
+	const b3Vector3& linearVelocity = rb->m_linVel;
+	return linearVelocity;
+}
+
+// Accessor for a rigid body's angular velocity (m_angVel), returned by reference.
+static const b3Vector3& getAngularVelocity(b3RigidBodyData* rb)
+{
+	const b3Vector3& angularVelocity = rb->m_angVel;
+	return angularVelocity;
+}
+
+// Velocity of the point at offset rel_pos on the body: linear velocity plus angular velocity crossed with rel_pos.
+// No mass check is made, so this is computed for kinematic objects as well.
+static b3Vector3 getVelocityInLocalPoint(b3RigidBodyData* rb, const b3Vector3& rel_pos)
+{
+	const b3Vector3& linearPart = getLinearVelocity(rb);
+	const b3Vector3 rotationalPart = getAngularVelocity(rb).cross(rel_pos);
+	return linearPart + rotationalPart;
+}
+
+/// Per-contact-point data used by the solver in this file; getContactPoint() fills one from a b3Contact4.
+/// Members are not default-initialised: callers must assign every field they later read.
+struct b3ContactPoint
+{
+	b3Vector3 m_positionWorldOnA;
+	b3Vector3 m_positionWorldOnB;
+	b3Vector3 m_normalWorldOnB;
+	b3Scalar m_appliedImpulse;
+	b3Scalar m_distance;
+	b3Scalar m_combinedRestitution;
+
+	///information related to friction
+	b3Scalar m_combinedFriction;
+	b3Vector3 m_lateralFrictionDir1;
+	b3Vector3 m_lateralFrictionDir2;
+	b3Scalar m_appliedImpulseLateral1;
+	b3Scalar m_appliedImpulseLateral2;
+	b3Scalar m_combinedRollingFriction;
+	b3Scalar m_contactMotion1;
+	b3Scalar m_contactMotion2;
+	b3Scalar m_contactCFM1;
+	b3Scalar m_contactCFM2;
+
+	bool m_lateralFrictionInitialized;
+
+	// The accessors only read state, so they are const-qualified; this lets them be
+	// called through const references while remaining usable from non-const code.
+
+	/// Returns a copy of the contact position on body A (world space by name).
+	b3Vector3 getPositionWorldOnA() const
+	{
+		return m_positionWorldOnA;
+	}
+	/// Returns a copy of the contact position on body B (world space by name).
+	b3Vector3 getPositionWorldOnB() const
+	{
+		return m_positionWorldOnB;
+	}
+	/// Returns the stored contact distance (m_distance).
+	b3Scalar getDistance() const
+	{
+		return m_distance;
+	}
+};
+
+// Converts point `contactIndex` of a b3Contact4 into the solver-side b3ContactPoint `pointOut`.
+// Every field of pointOut is assigned; accumulated impulses and CFM/motion terms start at zero.
+void getContactPoint(b3Contact4* contact, int contactIndex, b3ContactPoint& pointOut)
+{
+	// Accumulators and optional terms are reset.
+	pointOut.m_appliedImpulse = 0.f;
+	pointOut.m_appliedImpulseLateral1 = 0.f;
+	pointOut.m_appliedImpulseLateral2 = 0.f;
+
+	// Material coefficients come straight from the manifold; rolling friction is not provided, so it is zero.
+	pointOut.m_combinedFriction = contact->getFrictionCoeff();
+	pointOut.m_combinedRestitution = contact->getRestituitionCoeff();
+	pointOut.m_combinedRollingFriction = 0.f;
+
+	pointOut.m_contactCFM1 = 0.f;
+	pointOut.m_contactCFM2 = 0.f;
+	pointOut.m_contactMotion1 = 0.f;
+	pointOut.m_contactMotion2 = 0.f;
+
+	pointOut.m_distance = contact->getPenetration(contactIndex);  //??0.01f
+
+	// Contact frame: the (re-normalised) normal on B plus two lateral directions from b3PlaneSpace1.
+	b3Vector3 unitNormal = contact->m_worldNormalOnB;
+	unitNormal.normalize();  //is this needed?
+
+	b3Vector3 lateral1, lateral2;
+	b3PlaneSpace1(unitNormal, lateral1, lateral2);
+
+	pointOut.m_normalWorldOnB = unitNormal;
+	pointOut.m_lateralFrictionDir1 = lateral1;
+	pointOut.m_lateralFrictionDir2 = lateral2;
+	pointOut.m_lateralFrictionInitialized = true;
+
+	// The manifold stores the point on B; the point on A is offset along the normal by the stored distance.
+	const b3Vector3 pointOnB = contact->m_worldPosB[contactIndex];
+	pointOut.m_positionWorldOnB = pointOnB;
+	pointOut.m_positionWorldOnA = pointOnB + unitNormal * pointOut.m_distance;
+}
+
+// Returns the number of contact points reported by the manifold (b3Contact4::getNPoints).
+int getNumContacts(b3Contact4* contact)
+{
+	const int pointCount = contact->getNPoints();
+	return pointCount;
+}
+
+b3PgsJacobiSolver::b3PgsJacobiSolver(bool usePgs)  // usePgs is stored in m_usePgs; setup code in this file branches on it (shared solver bodies when true, per-use solver bodies and count-scaled impulses when false)
+	: m_usePgs(usePgs),
+	  m_numSplitImpulseRecoveries(0),  // counter incremented by the split-penetration resolve functions
+	  m_btSeed2(0)  // state of the linear congruential generator in b3Rand2()
+{
+}
+
+b3PgsJacobiSolver::~b3PgsJacobiSolver()  // nothing to release explicitly here; members clean up through their own destructors
+{
+}
+
+/// Convenience entry point: builds a b3ContactSolverInfo with hard-coded settings and runs solveGroup().
+/// Settings applied here: split impulse disabled, time step 1/60, 4 iterations, and
+/// B3_SOLVER_USE_2_FRICTION_DIRECTIONS OR-ed into the solver mode that b3ContactSolverInfo starts with.
+/// The scalar returned by solveGroup() is ignored.
+void b3PgsJacobiSolver::solveContacts(int numBodies, b3RigidBodyData* bodies, b3InertiaData* inertias, int numContacts, b3Contact4* contacts, int numConstraints, b3TypedConstraint** constraints)
+{
+	b3ContactSolverInfo infoGlobal;
+	infoGlobal.m_splitImpulse = false;
+	infoGlobal.m_timeStep = 1.f / 60.f;
+	infoGlobal.m_numIterations = 4;
+	infoGlobal.m_solverMode |= B3_SOLVER_USE_2_FRICTION_DIRECTIONS;
+
+	// Note the argument order: solveGroup() takes each array before its element count.
+	solveGroup(bodies, inertias, numBodies, contacts, numContacts, constraints, numConstraints, infoGlobal);
+}
+
+/// Runs the three solver phases in order: solveGroupCacheFriendlySetup, solveGroupCacheFriendlyIterations
+/// and solveGroupCacheFriendlyFinish. Their return values are not used; this function always returns 0.
+/// Callers are expected to provide at least some bodies.
+b3Scalar b3PgsJacobiSolver::solveGroup(b3RigidBodyData* bodies, b3InertiaData* inertias, int numBodies,
+									   b3Contact4* manifoldPtr, int numManifolds,
+									   b3TypedConstraint** constraints, int numConstraints,
+									   const b3ContactSolverInfo& infoGlobal)
+{
+	B3_PROFILE("solveGroup");
+
+	// Phase 1: build solver bodies and constraint rows.
+	solveGroupCacheFriendlySetup(bodies, inertias, numBodies, manifoldPtr, numManifolds, constraints, numConstraints, infoGlobal);
+	// Phase 2: iterate over the rows.
+	solveGroupCacheFriendlyIterations(constraints, numConstraints, infoGlobal);
+	// Phase 3: finish step using the original bodies and inertias.
+	solveGroupCacheFriendlyFinish(bodies, inertias, numBodies, infoGlobal);
+
+	return 0.f;
+}
+
+#ifdef USE_SIMD  // everything in this block is only compiled for the SSE code path
+#include <emmintrin.h>  // provides __m128 and the _mm_* intrinsics used below
+#define b3VecSplat(x, e) _mm_shuffle_ps(x, x, _MM_SHUFFLE(e, e, e, e))
+static inline __m128 b3SimdDot3(__m128 vec0, __m128 vec1)  // 3-component dot product; the result is replicated in all four lanes
+{
+	__m128 result = _mm_mul_ps(vec0, vec1);  // per-lane products
+	return _mm_add_ps(b3VecSplat(result, 0), _mm_add_ps(b3VecSplat(result, 1), b3VecSplat(result, 2)));  // lane0 + (lane1 + lane2), broadcast via b3VecSplat; lane 3 is ignored
+}
+#endif  //USE_SIMD
+
+// Projected Gauss-Seidel (equivalently, sequential impulse) update of one constraint row, clamped to [m_lowerLimit, m_upperLimit]; SSE version that falls back to resolveSingleConstraintRowGeneric when USE_SIMD is not defined
+void b3PgsJacobiSolver::resolveSingleConstraintRowGenericSIMD(b3SolverBody& body1, b3SolverBody& body2, const b3SolverConstraint& c)
+{
+#ifdef USE_SIMD
+	__m128 cpAppliedImp = _mm_set1_ps(c.m_appliedImpulse);  // scalars are broadcast to all four lanes
+	__m128 lowerLimit1 = _mm_set1_ps(c.m_lowerLimit);
+	__m128 upperLimit1 = _mm_set1_ps(c.m_upperLimit);
+	__m128 deltaImpulse = _mm_sub_ps(_mm_set1_ps(c.m_rhs), _mm_mul_ps(_mm_set1_ps(c.m_appliedImpulse), _mm_set1_ps(c.m_cfm)));  // rhs - appliedImpulse * cfm
+	__m128 deltaVel1Dotn = _mm_add_ps(b3SimdDot3(c.m_contactNormal.mVec128, body1.internalGetDeltaLinearVelocity().mVec128), b3SimdDot3(c.m_relpos1CrossNormal.mVec128, body1.internalGetDeltaAngularVelocity().mVec128));
+	__m128 deltaVel2Dotn = _mm_sub_ps(b3SimdDot3(c.m_relpos2CrossNormal.mVec128, body2.internalGetDeltaAngularVelocity().mVec128), b3SimdDot3((c.m_contactNormal).mVec128, body2.internalGetDeltaLinearVelocity().mVec128));  // body2's linear term is subtracted (opposite normal direction)
+	deltaImpulse = _mm_sub_ps(deltaImpulse, _mm_mul_ps(deltaVel1Dotn, _mm_set1_ps(c.m_jacDiagABInv)));
+	deltaImpulse = _mm_sub_ps(deltaImpulse, _mm_mul_ps(deltaVel2Dotn, _mm_set1_ps(c.m_jacDiagABInv)));
+	b3SimdScalar sum = _mm_add_ps(cpAppliedImp, deltaImpulse);  // candidate accumulated impulse before clamping
+	b3SimdScalar resultLowerLess, resultUpperLess;
+	resultLowerLess = _mm_cmplt_ps(sum, lowerLimit1);  // lane mask: sum < lower limit
+	resultUpperLess = _mm_cmplt_ps(sum, upperLimit1);  // lane mask: sum < upper limit (i.e. the upper clamp is NOT needed)
+	__m128 lowMinApplied = _mm_sub_ps(lowerLimit1, cpAppliedImp);
+	deltaImpulse = _mm_or_ps(_mm_and_ps(resultLowerLess, lowMinApplied), _mm_andnot_ps(resultLowerLess, deltaImpulse));  // branch-free select: (lower - applied) where below the lower limit, else unchanged
+	c.m_appliedImpulse = _mm_or_ps(_mm_and_ps(resultLowerLess, lowerLimit1), _mm_andnot_ps(resultLowerLess, sum));  // written through a const reference, so the member is presumably declared mutable
+	__m128 upperMinApplied = _mm_sub_ps(upperLimit1, cpAppliedImp);
+	deltaImpulse = _mm_or_ps(_mm_and_ps(resultUpperLess, deltaImpulse), _mm_andnot_ps(resultUpperLess, upperMinApplied));  // select: keep where sum < upper, else (upper - applied)
+	c.m_appliedImpulse = _mm_or_ps(_mm_and_ps(resultUpperLess, c.m_appliedImpulse), _mm_andnot_ps(resultUpperLess, upperLimit1));
+	__m128 linearComponentA = _mm_mul_ps(c.m_contactNormal.mVec128, body1.internalGetInvMass().mVec128);
+	__m128 linearComponentB = _mm_mul_ps((c.m_contactNormal).mVec128, body2.internalGetInvMass().mVec128);
+	__m128 impulseMagnitude = deltaImpulse;
+	body1.internalGetDeltaLinearVelocity().mVec128 = _mm_add_ps(body1.internalGetDeltaLinearVelocity().mVec128, _mm_mul_ps(linearComponentA, impulseMagnitude));
+	body1.internalGetDeltaAngularVelocity().mVec128 = _mm_add_ps(body1.internalGetDeltaAngularVelocity().mVec128, _mm_mul_ps(c.m_angularComponentA.mVec128, impulseMagnitude));
+	body2.internalGetDeltaLinearVelocity().mVec128 = _mm_sub_ps(body2.internalGetDeltaLinearVelocity().mVec128, _mm_mul_ps(linearComponentB, impulseMagnitude));  // body2 receives the linear impulse with opposite sign
+	body2.internalGetDeltaAngularVelocity().mVec128 = _mm_add_ps(body2.internalGetDeltaAngularVelocity().mVec128, _mm_mul_ps(c.m_angularComponentB.mVec128, impulseMagnitude));
+#else
+	resolveSingleConstraintRowGeneric(body1, body2, c);  // scalar fallback
+#endif
+}
+
+// Projected Gauss-Seidel (equivalently, sequential impulse) update of one constraint row.
+// The accumulated impulse is clamped to [c.m_lowerLimit, c.m_upperLimit] and only the resulting
+// change is applied to both solver bodies. c.m_appliedImpulse is updated even though c is passed
+// by const reference (so the member is presumably declared mutable).
+void b3PgsJacobiSolver::resolveSingleConstraintRowGeneric(b3SolverBody& body1, b3SolverBody& body2, const b3SolverConstraint& c)
+{
+	// Velocity change already accumulated on each body, projected onto this row.
+	const b3Scalar projectedDeltaA = c.m_contactNormal.dot(body1.internalGetDeltaLinearVelocity()) + c.m_relpos1CrossNormal.dot(body1.internalGetDeltaAngularVelocity());
+	const b3Scalar projectedDeltaB = -c.m_contactNormal.dot(body2.internalGetDeltaLinearVelocity()) + c.m_relpos2CrossNormal.dot(body2.internalGetDeltaAngularVelocity());
+
+	// Unclamped impulse step: rhs minus the CFM term minus both projected velocity changes.
+	b3Scalar impulseStep = c.m_rhs - b3Scalar(c.m_appliedImpulse) * c.m_cfm;
+	impulseStep -= projectedDeltaA * c.m_jacDiagABInv;
+	impulseStep -= projectedDeltaB * c.m_jacDiagABInv;
+
+	const b3Scalar candidate = b3Scalar(c.m_appliedImpulse) + impulseStep;
+	if (candidate < c.m_lowerLimit)
+	{
+		// Clamp to the lower limit and apply only the remaining distance to it.
+		impulseStep = c.m_lowerLimit - c.m_appliedImpulse;
+		c.m_appliedImpulse = c.m_lowerLimit;
+	}
+	else if (candidate > c.m_upperLimit)
+	{
+		// Clamp to the upper limit likewise.
+		impulseStep = c.m_upperLimit - c.m_appliedImpulse;
+		c.m_appliedImpulse = c.m_upperLimit;
+	}
+	else
+	{
+		c.m_appliedImpulse = candidate;
+	}
+
+	// Body 2 receives the linear impulse along the negated normal.
+	body1.internalApplyImpulse(c.m_contactNormal * body1.internalGetInvMass(), c.m_angularComponentA, impulseStep);
+	body2.internalApplyImpulse(-c.m_contactNormal * body2.internalGetInvMass(), c.m_angularComponentB, impulseStep);
+}
+
+// SSE version of resolveSingleConstraintRowLowerLimit: updates one constraint row and clamps the
+// accumulated impulse against c.m_lowerLimit only (no upper clamp). Falls back to the scalar
+// implementation when USE_SIMD is not defined.
+void b3PgsJacobiSolver::resolveSingleConstraintRowLowerLimitSIMD(b3SolverBody& body1, b3SolverBody& body2, const b3SolverConstraint& c)
+{
+#ifdef USE_SIMD
+	// Scalars are broadcast to all four lanes. The upper limit is deliberately not loaded:
+	// this variant never compares against it.
+	__m128 cpAppliedImp = _mm_set1_ps(c.m_appliedImpulse);
+	__m128 lowerLimit1 = _mm_set1_ps(c.m_lowerLimit);
+	// rhs - appliedImpulse * cfm
+	__m128 deltaImpulse = _mm_sub_ps(_mm_set1_ps(c.m_rhs), _mm_mul_ps(_mm_set1_ps(c.m_appliedImpulse), _mm_set1_ps(c.m_cfm)));
+	__m128 deltaVel1Dotn = _mm_add_ps(b3SimdDot3(c.m_contactNormal.mVec128, body1.internalGetDeltaLinearVelocity().mVec128), b3SimdDot3(c.m_relpos1CrossNormal.mVec128, body1.internalGetDeltaAngularVelocity().mVec128));
+	__m128 deltaVel2Dotn = _mm_sub_ps(b3SimdDot3(c.m_relpos2CrossNormal.mVec128, body2.internalGetDeltaAngularVelocity().mVec128), b3SimdDot3((c.m_contactNormal).mVec128, body2.internalGetDeltaLinearVelocity().mVec128));
+	deltaImpulse = _mm_sub_ps(deltaImpulse, _mm_mul_ps(deltaVel1Dotn, _mm_set1_ps(c.m_jacDiagABInv)));
+	deltaImpulse = _mm_sub_ps(deltaImpulse, _mm_mul_ps(deltaVel2Dotn, _mm_set1_ps(c.m_jacDiagABInv)));
+	b3SimdScalar sum = _mm_add_ps(cpAppliedImp, deltaImpulse);
+	// Lane mask: candidate impulse is below the lower limit.
+	b3SimdScalar resultLowerLess = _mm_cmplt_ps(sum, lowerLimit1);
+	__m128 lowMinApplied = _mm_sub_ps(lowerLimit1, cpAppliedImp);
+	// Branch-free select: clamp where the mask is set, keep the unclamped values elsewhere.
+	deltaImpulse = _mm_or_ps(_mm_and_ps(resultLowerLess, lowMinApplied), _mm_andnot_ps(resultLowerLess, deltaImpulse));
+	c.m_appliedImpulse = _mm_or_ps(_mm_and_ps(resultLowerLess, lowerLimit1), _mm_andnot_ps(resultLowerLess, sum));
+	__m128 linearComponentA = _mm_mul_ps(c.m_contactNormal.mVec128, body1.internalGetInvMass().mVec128);
+	__m128 linearComponentB = _mm_mul_ps((c.m_contactNormal).mVec128, body2.internalGetInvMass().mVec128);
+	__m128 impulseMagnitude = deltaImpulse;
+	// Body 2's linear delta is subtracted (opposite normal direction); angular deltas are both added.
+	body1.internalGetDeltaLinearVelocity().mVec128 = _mm_add_ps(body1.internalGetDeltaLinearVelocity().mVec128, _mm_mul_ps(linearComponentA, impulseMagnitude));
+	body1.internalGetDeltaAngularVelocity().mVec128 = _mm_add_ps(body1.internalGetDeltaAngularVelocity().mVec128, _mm_mul_ps(c.m_angularComponentA.mVec128, impulseMagnitude));
+	body2.internalGetDeltaLinearVelocity().mVec128 = _mm_sub_ps(body2.internalGetDeltaLinearVelocity().mVec128, _mm_mul_ps(linearComponentB, impulseMagnitude));
+	body2.internalGetDeltaAngularVelocity().mVec128 = _mm_add_ps(body2.internalGetDeltaAngularVelocity().mVec128, _mm_mul_ps(c.m_angularComponentB.mVec128, impulseMagnitude));
+#else
+	resolveSingleConstraintRowLowerLimit(body1, body2, c);
+#endif
+}
+
+// Projected Gauss-Seidel (equivalently, sequential impulse) update of one constraint row that is
+// bounded from below only: the accumulated impulse is clamped against c.m_lowerLimit and
+// c.m_upperLimit is never consulted.
+void b3PgsJacobiSolver::resolveSingleConstraintRowLowerLimit(b3SolverBody& body1, b3SolverBody& body2, const b3SolverConstraint& c)
+{
+	// Velocity change already accumulated on each body, projected onto this row.
+	const b3Scalar projectedDeltaA = c.m_contactNormal.dot(body1.internalGetDeltaLinearVelocity()) + c.m_relpos1CrossNormal.dot(body1.internalGetDeltaAngularVelocity());
+	const b3Scalar projectedDeltaB = -c.m_contactNormal.dot(body2.internalGetDeltaLinearVelocity()) + c.m_relpos2CrossNormal.dot(body2.internalGetDeltaAngularVelocity());
+
+	b3Scalar impulseStep = c.m_rhs - b3Scalar(c.m_appliedImpulse) * c.m_cfm;
+	impulseStep -= projectedDeltaA * c.m_jacDiagABInv;
+	impulseStep -= projectedDeltaB * c.m_jacDiagABInv;
+
+	const b3Scalar candidate = b3Scalar(c.m_appliedImpulse) + impulseStep;
+	if (candidate < c.m_lowerLimit)
+	{
+		// Clamp: only the remaining distance to the lower limit is applied.
+		impulseStep = c.m_lowerLimit - c.m_appliedImpulse;
+		c.m_appliedImpulse = c.m_lowerLimit;
+	}
+	else
+	{
+		c.m_appliedImpulse = candidate;
+	}
+
+	// Body 2 receives the linear impulse along the negated normal.
+	body1.internalApplyImpulse(c.m_contactNormal * body1.internalGetInvMass(), c.m_angularComponentA, impulseStep);
+	body2.internalApplyImpulse(-c.m_contactNormal * body2.internalGetInvMass(), c.m_angularComponentB, impulseStep);
+}
+
+// Split-impulse penetration recovery for one row. Mirrors the lower-limit row update, but works on
+// the push/turn velocities and c.m_appliedPushImpulse, with c.m_rhsPenetration as the right-hand side.
+// Rows whose m_rhsPenetration is zero are skipped entirely and are not counted.
+void b3PgsJacobiSolver::resolveSplitPenetrationImpulseCacheFriendly(
+	b3SolverBody& body1,
+	b3SolverBody& body2,
+	const b3SolverConstraint& c)
+{
+	if (!c.m_rhsPenetration)
+	{
+		return;
+	}
+
+	m_numSplitImpulseRecoveries++;
+
+	// Push/turn velocity already accumulated on each body, projected onto this row.
+	const b3Scalar projectedPushA = c.m_contactNormal.dot(body1.internalGetPushVelocity()) + c.m_relpos1CrossNormal.dot(body1.internalGetTurnVelocity());
+	const b3Scalar projectedPushB = -c.m_contactNormal.dot(body2.internalGetPushVelocity()) + c.m_relpos2CrossNormal.dot(body2.internalGetTurnVelocity());
+
+	b3Scalar impulseStep = c.m_rhsPenetration - b3Scalar(c.m_appliedPushImpulse) * c.m_cfm;
+	impulseStep -= projectedPushA * c.m_jacDiagABInv;
+	impulseStep -= projectedPushB * c.m_jacDiagABInv;
+
+	const b3Scalar candidate = b3Scalar(c.m_appliedPushImpulse) + impulseStep;
+	if (candidate < c.m_lowerLimit)
+	{
+		// Clamp: only the remaining distance to the lower limit is applied.
+		impulseStep = c.m_lowerLimit - c.m_appliedPushImpulse;
+		c.m_appliedPushImpulse = c.m_lowerLimit;
+	}
+	else
+	{
+		c.m_appliedPushImpulse = candidate;
+	}
+
+	body1.internalApplyPushImpulse(c.m_contactNormal * body1.internalGetInvMass(), c.m_angularComponentA, impulseStep);
+	body2.internalApplyPushImpulse(-c.m_contactNormal * body2.internalGetInvMass(), c.m_angularComponentB, impulseStep);
+}
+
+// SSE version of resolveSplitPenetrationImpulseCacheFriendly: split-impulse penetration recovery for
+// one row, acting on push/turn velocities and c.m_appliedPushImpulse and clamped against
+// c.m_lowerLimit only. Rows with a zero m_rhsPenetration are skipped. Falls back to the scalar
+// implementation when USE_SIMD is not defined.
+void b3PgsJacobiSolver::resolveSplitPenetrationSIMD(b3SolverBody& body1, b3SolverBody& body2, const b3SolverConstraint& c)
+{
+#ifdef USE_SIMD
+	if (!c.m_rhsPenetration)
+		return;
+
+	m_numSplitImpulseRecoveries++;
+
+	// Scalars are broadcast to all four lanes. The upper limit is deliberately not loaded:
+	// this routine never compares against it.
+	__m128 cpAppliedImp = _mm_set1_ps(c.m_appliedPushImpulse);
+	__m128 lowerLimit1 = _mm_set1_ps(c.m_lowerLimit);
+	// rhsPenetration - appliedPushImpulse * cfm
+	__m128 deltaImpulse = _mm_sub_ps(_mm_set1_ps(c.m_rhsPenetration), _mm_mul_ps(_mm_set1_ps(c.m_appliedPushImpulse), _mm_set1_ps(c.m_cfm)));
+	__m128 deltaVel1Dotn = _mm_add_ps(b3SimdDot3(c.m_contactNormal.mVec128, body1.internalGetPushVelocity().mVec128), b3SimdDot3(c.m_relpos1CrossNormal.mVec128, body1.internalGetTurnVelocity().mVec128));
+	__m128 deltaVel2Dotn = _mm_sub_ps(b3SimdDot3(c.m_relpos2CrossNormal.mVec128, body2.internalGetTurnVelocity().mVec128), b3SimdDot3((c.m_contactNormal).mVec128, body2.internalGetPushVelocity().mVec128));
+	deltaImpulse = _mm_sub_ps(deltaImpulse, _mm_mul_ps(deltaVel1Dotn, _mm_set1_ps(c.m_jacDiagABInv)));
+	deltaImpulse = _mm_sub_ps(deltaImpulse, _mm_mul_ps(deltaVel2Dotn, _mm_set1_ps(c.m_jacDiagABInv)));
+	b3SimdScalar sum = _mm_add_ps(cpAppliedImp, deltaImpulse);
+	// Lane mask: candidate push impulse is below the lower limit.
+	b3SimdScalar resultLowerLess = _mm_cmplt_ps(sum, lowerLimit1);
+	__m128 lowMinApplied = _mm_sub_ps(lowerLimit1, cpAppliedImp);
+	// Branch-free select: clamp where the mask is set, keep the unclamped values elsewhere.
+	deltaImpulse = _mm_or_ps(_mm_and_ps(resultLowerLess, lowMinApplied), _mm_andnot_ps(resultLowerLess, deltaImpulse));
+	c.m_appliedPushImpulse = _mm_or_ps(_mm_and_ps(resultLowerLess, lowerLimit1), _mm_andnot_ps(resultLowerLess, sum));
+	__m128 linearComponentA = _mm_mul_ps(c.m_contactNormal.mVec128, body1.internalGetInvMass().mVec128);
+	__m128 linearComponentB = _mm_mul_ps((c.m_contactNormal).mVec128, body2.internalGetInvMass().mVec128);
+	__m128 impulseMagnitude = deltaImpulse;
+	// Body 2's push velocity is reduced (opposite normal direction); turn velocities are both added.
+	body1.internalGetPushVelocity().mVec128 = _mm_add_ps(body1.internalGetPushVelocity().mVec128, _mm_mul_ps(linearComponentA, impulseMagnitude));
+	body1.internalGetTurnVelocity().mVec128 = _mm_add_ps(body1.internalGetTurnVelocity().mVec128, _mm_mul_ps(c.m_angularComponentA.mVec128, impulseMagnitude));
+	body2.internalGetPushVelocity().mVec128 = _mm_sub_ps(body2.internalGetPushVelocity().mVec128, _mm_mul_ps(linearComponentB, impulseMagnitude));
+	body2.internalGetTurnVelocity().mVec128 = _mm_add_ps(body2.internalGetTurnVelocity().mVec128, _mm_mul_ps(c.m_angularComponentB.mVec128, impulseMagnitude));
+#else
+	resolveSplitPenetrationImpulseCacheFriendly(body1, body2, c);
+#endif
+}
+
+// Advances the solver's linear congruential generator (multiplier 1664525, increment 1013904223,
+// result masked to 32 bits) and returns the new state, which is also stored in m_btSeed2.
+unsigned long b3PgsJacobiSolver::b3Rand2()
+{
+	const unsigned long nextState = (1664525L * m_btSeed2 + 1013904223L) & 0xffffffff;
+	m_btSeed2 = nextState;
+	return m_btSeed2;
+}
+
+//See ODE: adam's all-int straightforward(?) dRandInt (0..n-1)
+// Draws one value from b3Rand2(), xor-folds its high bits into the low bits according to how
+// small n is, and reduces the result modulo n. n must be non-zero: the final step divides by it.
+int b3PgsJacobiSolver::b3RandInt2(int n)
+{
+	const unsigned long range = static_cast<unsigned long>(n);
+	unsigned long bits = b3Rand2();
+
+	// The thresholds shrink from one test to the next, so any range that passes a later test has
+	// also passed every earlier one; these sequential guards therefore fold exactly as the
+	// equivalent nested form would.
+	// note: probably more aggressive than it needs to be -- might be
+	//       able to get away without one or two of the innermost folds.
+	if (range <= 0x00010000UL)
+	{
+		bits ^= (bits >> 16);
+	}
+	if (range <= 0x00000100UL)
+	{
+		bits ^= (bits >> 8);
+	}
+	if (range <= 0x00000010UL)
+	{
+		bits ^= (bits >> 4);
+	}
+	if (range <= 0x00000004UL)
+	{
+		bits ^= (bits >> 2);
+	}
+	if (range <= 0x00000002UL)
+	{
+		bits ^= (bits >> 1);
+	}
+
+	return (int)(bits % range);
+}
+
+// Initialises *solverBody for one solve. Delta and push/turn velocities are always cleared and
+// m_originalBodyIndex is always set to bodyIndex. With a rigid body, transform, inverse mass and
+// velocities are copied from it; with rb == 0 the solver body becomes a static placeholder
+// (identity transform, zero inverse mass, zero velocities). Linear and angular factors are (1,1,1)
+// in both cases.
+void b3PgsJacobiSolver::initSolverBody(int bodyIndex, b3SolverBody* solverBody, b3RigidBodyData* rb)
+{
+	b3SolverBody& target = *solverBody;
+
+	target.m_deltaLinearVelocity.setValue(0.f, 0.f, 0.f);
+	target.m_deltaAngularVelocity.setValue(0.f, 0.f, 0.f);
+	target.internalGetPushVelocity().setValue(0.f, 0.f, 0.f);
+	target.internalGetTurnVelocity().setValue(0.f, 0.f, 0.f);
+
+	if (!rb)
+	{
+		target.m_worldTransform.setIdentity();
+		target.internalSetInvMass(b3MakeVector3(0, 0, 0));
+		target.m_originalBodyIndex = bodyIndex;
+		target.m_angularFactor.setValue(1, 1, 1);
+		target.m_linearFactor.setValue(1, 1, 1);
+		target.m_linearVelocity.setValue(0, 0, 0);
+		target.m_angularVelocity.setValue(0, 0, 0);
+		return;
+	}
+
+	target.m_worldTransform = getWorldTransform(rb);
+	// The scalar inverse mass is replicated on all three axes.
+	target.internalSetInvMass(b3MakeVector3(rb->m_invMass, rb->m_invMass, rb->m_invMass));
+	target.m_originalBodyIndex = bodyIndex;
+	target.m_angularFactor = b3MakeVector3(1, 1, 1);
+	target.m_linearFactor = b3MakeVector3(1, 1, 1);
+	target.m_linearVelocity = getLinearVelocity(rb);
+	target.m_angularVelocity = getAngularVelocity(rb);
+}
+
+// Linear restitution response: the restitution coefficient times the negated relative velocity.
+b3Scalar b3PgsJacobiSolver::restitutionCurve(b3Scalar rel_vel, b3Scalar restitution)
+{
+	return restitution * -rel_vel;
+}
+
+void b3PgsJacobiSolver::setupFrictionConstraint(b3RigidBodyData* bodies, b3InertiaData* inertias, b3SolverConstraint& solverConstraint, const b3Vector3& normalAxis, int solverBodyIdA, int solverBodyIdB, b3ContactPoint& cp, const b3Vector3& rel_pos1, const b3Vector3& rel_pos2, b3RigidBodyData* colObj0, b3RigidBodyData* colObj1, b3Scalar relaxation, b3Scalar desiredVelocity, b3Scalar cfmSlip)
+{  // Fills solverConstraint as a friction row along normalAxis between solver bodies solverBodyIdA and solverBodyIdB; colObj0 and colObj1 are not used in this body
+	solverConstraint.m_contactNormal = normalAxis;
+	b3SolverBody& solverBodyA = m_tmpSolverBodyPool[solverBodyIdA];
+	b3SolverBody& solverBodyB = m_tmpSolverBodyPool[solverBodyIdB];
+
+	b3RigidBodyData* body0 = &bodies[solverBodyA.m_originalBodyIndex];  // address of an array element, so non-null whenever bodies itself is valid
+	b3RigidBodyData* body1 = &bodies[solverBodyB.m_originalBodyIndex];
+
+	solverConstraint.m_solverBodyIdA = solverBodyIdA;
+	solverConstraint.m_solverBodyIdB = solverBodyIdB;
+
+	solverConstraint.m_friction = cp.m_combinedFriction;
+	solverConstraint.m_originalContactPoint = 0;
+
+	solverConstraint.m_appliedImpulse = 0.f;  // friction rows start without accumulated impulse
+	solverConstraint.m_appliedPushImpulse = 0.f;
+
+	{
+		b3Vector3 ftorqueAxis1 = rel_pos1.cross(solverConstraint.m_contactNormal);  // angular Jacobian for body A
+		solverConstraint.m_relpos1CrossNormal = ftorqueAxis1;
+		solverConstraint.m_angularComponentA = body0 ? getInvInertiaTensorWorld(&inertias[solverBodyA.m_originalBodyIndex]) * ftorqueAxis1 : b3MakeVector3(0, 0, 0);
+	}
+	{
+		b3Vector3 ftorqueAxis1 = rel_pos2.cross(-solverConstraint.m_contactNormal);  // body B uses the negated axis
+		solverConstraint.m_relpos2CrossNormal = ftorqueAxis1;
+		solverConstraint.m_angularComponentB = body1 ? getInvInertiaTensorWorld(&inertias[solverBodyB.m_originalBodyIndex]) * ftorqueAxis1 : b3MakeVector3(0, 0, 0);
+	}
+
+	b3Scalar scaledDenom;  // scale applied to the velocity error when computing m_rhs below
+
+	{
+		b3Vector3 vec;
+		b3Scalar denom0 = 0.f;
+		b3Scalar denom1 = 0.f;
+		if (body0)
+		{
+			vec = (solverConstraint.m_angularComponentA).cross(rel_pos1);
+			denom0 = body0->m_invMass + normalAxis.dot(vec);  // body A's inverse-mass contribution along the axis
+		}
+		if (body1)
+		{
+			vec = (-solverConstraint.m_angularComponentB).cross(rel_pos2);
+			denom1 = body1->m_invMass + normalAxis.dot(vec);  // body B's inverse-mass contribution along the axis
+		}
+
+		b3Scalar denom;
+		if (m_usePgs)
+		{
+			scaledDenom = denom = relaxation / (denom0 + denom1);  // PGS: the rhs scale equals the stored inverse diagonal
+		}
+		else
+		{
+			denom = relaxation / (denom0 + denom1);
+			b3Scalar countA = body0->m_invMass ? b3Scalar(m_bodyCount[solverBodyA.m_originalBodyIndex]) : 1.f;  // weight is 1 for a body with zero inverse mass
+			b3Scalar countB = body1->m_invMass ? b3Scalar(m_bodyCount[solverBodyB.m_originalBodyIndex]) : 1.f;
+
+			scaledDenom = relaxation / (denom0 * countA + denom1 * countB);  // Jacobi: each body's term is weighted by its m_bodyCount entry
+		}
+
+		solverConstraint.m_jacDiagABInv = denom;  // the unscaled value is stored in both modes
+	}
+
+	{
+		b3Scalar rel_vel;
+		b3Scalar vel1Dotn = solverConstraint.m_contactNormal.dot(body0 ? solverBodyA.m_linearVelocity : b3MakeVector3(0, 0, 0)) + solverConstraint.m_relpos1CrossNormal.dot(body0 ? solverBodyA.m_angularVelocity : b3MakeVector3(0, 0, 0));
+		b3Scalar vel2Dotn = -solverConstraint.m_contactNormal.dot(body1 ? solverBodyB.m_linearVelocity : b3MakeVector3(0, 0, 0)) + solverConstraint.m_relpos2CrossNormal.dot(body1 ? solverBodyB.m_angularVelocity : b3MakeVector3(0, 0, 0));
+
+		rel_vel = vel1Dotn + vel2Dotn;  // current relative velocity along the friction row
+
+		//		b3Scalar positionalError = 0.f;
+
+		b3SimdScalar velocityError = desiredVelocity - rel_vel;
+		b3SimdScalar velocityImpulse = velocityError * b3SimdScalar(scaledDenom);  //solverConstraint.m_jacDiagABInv);
+		solverConstraint.m_rhs = velocityImpulse;
+		solverConstraint.m_cfm = cfmSlip;
+		solverConstraint.m_lowerLimit = 0;  // NOTE(review): these limits look like placeholders, presumably replaced before the row is solved - confirm in the iteration code
+		solverConstraint.m_upperLimit = 1e10f;
+	}
+}
+
+// Appends one (uninitialised) row to m_tmpSolverContactFrictionConstraintPool, tags it with
+// frictionIndex, fills it through setupFrictionConstraint() and returns a reference to it.
+b3SolverConstraint& b3PgsJacobiSolver::addFrictionConstraint(b3RigidBodyData* bodies, b3InertiaData* inertias, const b3Vector3& normalAxis, int solverBodyIdA, int solverBodyIdB, int frictionIndex, b3ContactPoint& cp, const b3Vector3& rel_pos1, const b3Vector3& rel_pos2, b3RigidBodyData* colObj0, b3RigidBodyData* colObj1, b3Scalar relaxation, b3Scalar desiredVelocity, b3Scalar cfmSlip)
+{
+	b3SolverConstraint& frictionRow = m_tmpSolverContactFrictionConstraintPool.expandNonInitializing();
+	frictionRow.m_frictionIndex = frictionIndex;
+
+	setupFrictionConstraint(bodies, inertias, frictionRow, normalAxis,
+							solverBodyIdA, solverBodyIdB,
+							cp, rel_pos1, rel_pos2,
+							colObj0, colObj1,
+							relaxation, desiredVelocity, cfmSlip);
+
+	return frictionRow;
+}
+
+void b3PgsJacobiSolver::setupRollingFrictionConstraint(b3RigidBodyData* bodies, b3InertiaData* inertias, b3SolverConstraint& solverConstraint, const b3Vector3& normalAxis1, int solverBodyIdA, int solverBodyIdB,
+													   b3ContactPoint& cp, const b3Vector3& rel_pos1, const b3Vector3& rel_pos2,
+													   b3RigidBodyData* colObj0, b3RigidBodyData* colObj1, b3Scalar relaxation,
+													   b3Scalar desiredVelocity, b3Scalar cfmSlip)
+
+{  // Fills solverConstraint as a purely angular (rolling friction) row about normalAxis1; rel_pos1, rel_pos2, colObj0, colObj1 and relaxation are not used in this body
+	b3Vector3 normalAxis = b3MakeVector3(0, 0, 0);  // the linear part of the row is zero, so only angular velocity is constrained
+
+	solverConstraint.m_contactNormal = normalAxis;
+	b3SolverBody& solverBodyA = m_tmpSolverBodyPool[solverBodyIdA];
+	b3SolverBody& solverBodyB = m_tmpSolverBodyPool[solverBodyIdB];
+
+	b3RigidBodyData* body0 = &bodies[m_tmpSolverBodyPool[solverBodyIdA].m_originalBodyIndex];  // address of an array element, so non-null whenever bodies itself is valid
+	b3RigidBodyData* body1 = &bodies[m_tmpSolverBodyPool[solverBodyIdB].m_originalBodyIndex];
+
+	solverConstraint.m_solverBodyIdA = solverBodyIdA;
+	solverConstraint.m_solverBodyIdB = solverBodyIdB;
+
+	solverConstraint.m_friction = cp.m_combinedRollingFriction;
+	solverConstraint.m_originalContactPoint = 0;
+
+	solverConstraint.m_appliedImpulse = 0.f;  // rolling friction rows start without accumulated impulse
+	solverConstraint.m_appliedPushImpulse = 0.f;
+
+	{
+		b3Vector3 ftorqueAxis1 = -normalAxis1;  // body A's angular Jacobian is the negated axis
+		solverConstraint.m_relpos1CrossNormal = ftorqueAxis1;
+		solverConstraint.m_angularComponentA = body0 ? getInvInertiaTensorWorld(&inertias[solverBodyA.m_originalBodyIndex]) * ftorqueAxis1 : b3MakeVector3(0, 0, 0);
+	}
+	{
+		b3Vector3 ftorqueAxis1 = normalAxis1;  // body B's angular Jacobian is the axis itself
+		solverConstraint.m_relpos2CrossNormal = ftorqueAxis1;
+		solverConstraint.m_angularComponentB = body1 ? getInvInertiaTensorWorld(&inertias[solverBodyB.m_originalBodyIndex]) * ftorqueAxis1 : b3MakeVector3(0, 0, 0);
+	}
+
+	{
+		b3Vector3 iMJaA = body0 ? getInvInertiaTensorWorld(&inertias[solverBodyA.m_originalBodyIndex]) * solverConstraint.m_relpos1CrossNormal : b3MakeVector3(0, 0, 0);
+		b3Vector3 iMJaB = body1 ? getInvInertiaTensorWorld(&inertias[solverBodyB.m_originalBodyIndex]) * solverConstraint.m_relpos2CrossNormal : b3MakeVector3(0, 0, 0);
+		b3Scalar sum = 0;
+		sum += iMJaA.dot(solverConstraint.m_relpos1CrossNormal);
+		sum += iMJaB.dot(solverConstraint.m_relpos2CrossNormal);
+		solverConstraint.m_jacDiagABInv = b3Scalar(1.) / sum;  // no guard against sum == 0 (e.g. both inverse inertia tensors zero)
+	}
+
+	{
+		b3Scalar rel_vel;
+		b3Scalar vel1Dotn = solverConstraint.m_contactNormal.dot(body0 ? solverBodyA.m_linearVelocity : b3MakeVector3(0, 0, 0)) + solverConstraint.m_relpos1CrossNormal.dot(body0 ? solverBodyA.m_angularVelocity : b3MakeVector3(0, 0, 0));
+		b3Scalar vel2Dotn = -solverConstraint.m_contactNormal.dot(body1 ? solverBodyB.m_linearVelocity : b3MakeVector3(0, 0, 0)) + solverConstraint.m_relpos2CrossNormal.dot(body1 ? solverBodyB.m_angularVelocity : b3MakeVector3(0, 0, 0));
+
+		rel_vel = vel1Dotn + vel2Dotn;  // the linear terms vanish because m_contactNormal is zero here
+
+		//		b3Scalar positionalError = 0.f;
+
+		b3SimdScalar velocityError = desiredVelocity - rel_vel;
+		b3SimdScalar velocityImpulse = velocityError * b3SimdScalar(solverConstraint.m_jacDiagABInv);
+		solverConstraint.m_rhs = velocityImpulse;
+		solverConstraint.m_cfm = cfmSlip;
+		solverConstraint.m_lowerLimit = 0;  // NOTE(review): these limits look like placeholders, presumably replaced before the row is solved - confirm in the iteration code
+		solverConstraint.m_upperLimit = 1e10f;
+	}
+}
+
+// Appends one (uninitialised) row to m_tmpSolverContactRollingFrictionConstraintPool, tags it with
+// frictionIndex, fills it through setupRollingFrictionConstraint() and returns a reference to it.
+b3SolverConstraint& b3PgsJacobiSolver::addRollingFrictionConstraint(b3RigidBodyData* bodies, b3InertiaData* inertias, const b3Vector3& normalAxis, int solverBodyIdA, int solverBodyIdB, int frictionIndex, b3ContactPoint& cp, const b3Vector3& rel_pos1, const b3Vector3& rel_pos2, b3RigidBodyData* colObj0, b3RigidBodyData* colObj1, b3Scalar relaxation, b3Scalar desiredVelocity, b3Scalar cfmSlip)
+{
+	b3SolverConstraint& rollingRow = m_tmpSolverContactRollingFrictionConstraintPool.expandNonInitializing();
+	rollingRow.m_frictionIndex = frictionIndex;
+
+	setupRollingFrictionConstraint(bodies, inertias, rollingRow, normalAxis,
+								   solverBodyIdA, solverBodyIdB,
+								   cp, rel_pos1, rel_pos2,
+								   colObj0, colObj1,
+								   relaxation, desiredVelocity, cfmSlip);
+
+	return rollingRow;
+}
+
+// Returns the index in m_tmpSolverBodyPool of a solver body for bodies[bodyIndex].
+//  - When m_usePgs is set, or the body has zero inverse mass, one solver body is shared by all
+//    users: it is created on first request and its pool index is remembered in
+//    m_bodyCount[bodyIndex] (a negative entry means "not created yet").
+//  - Otherwise every call creates a fresh solver body and bumps m_bodyCountCheck[bodyIndex];
+//    m_bodyCount[bodyIndex] is then expected to be positive and is left untouched.
+// The inertias parameter is not used in this body.
+int b3PgsJacobiSolver::getOrInitSolverBody(int bodyIndex, b3RigidBodyData* bodies, b3InertiaData* inertias)
+{
+	//b3Assert(bodyIndex< m_tmpSolverBodyPool.size());
+
+	b3RigidBodyData& body = bodies[bodyIndex];
+	const bool sharesSolverBody = m_usePgs || body.m_invMass == 0.f;
+
+	// Shared solver body that already exists: reuse its recorded index (non-negative by this test).
+	if (sharesSolverBody && !(m_bodyCount[bodyIndex] < 0))
+	{
+		return m_bodyCount[bodyIndex];
+	}
+
+	if (!sharesSolverBody)
+	{
+		b3Assert(m_bodyCount[bodyIndex] > 0);
+		m_bodyCountCheck[bodyIndex]++;
+	}
+
+	// Either way a new solver body is appended and initialised from the rigid body.
+	const int newIndex = m_tmpSolverBodyPool.size();
+	b3SolverBody& solverBody = m_tmpSolverBodyPool.expand();
+	initSolverBody(bodyIndex, &solverBody, &body);
+	solverBody.m_originalBodyIndex = bodyIndex;
+
+	if (sharesSolverBody)
+	{
+		// Remember the shared body's index for later requests.
+		m_bodyCount[bodyIndex] = newIndex;
+	}
+
+	b3Assert(newIndex >= 0);
+	return newIndex;
+}
+#include <stdio.h>
+
+void b3PgsJacobiSolver::setupContactConstraint(b3RigidBodyData* bodies, b3InertiaData* inertias, b3SolverConstraint& solverConstraint,
+											   int solverBodyIdA, int solverBodyIdB,
+											   b3ContactPoint& cp, const b3ContactSolverInfo& infoGlobal,
+											   b3Vector3& vel, b3Scalar& rel_vel, b3Scalar& relaxation,
+											   b3Vector3& rel_pos1, b3Vector3& rel_pos2)
+{
+	// Fills the normal row `solverConstraint` for contact point `cp` between the
+	// solver bodies solverBodyIdA and solverBodyIdB.
+	//
+	// Values handed back through the reference parameters:
+	//   rel_pos1 / rel_pos2 : contact positions minus the origin of each solver body's world transform
+	//   vel                 : velocity of body A at the contact minus that of body B
+	//   rel_vel             : component of `vel` along cp.m_normalWorldOnB
+	//   relaxation          : always set to 1
+	b3SolverBody* solverA = &m_tmpSolverBodyPool[solverBodyIdA];
+	b3SolverBody* solverB = &m_tmpSolverBodyPool[solverBodyIdB];
+	b3RigidBodyData* rb0 = &bodies[solverA->m_originalBodyIndex];
+	b3RigidBodyData* rb1 = &bodies[solverB->m_originalBodyIndex];
+
+	const b3Vector3& worldPosA = cp.getPositionWorldOnA();
+	const b3Vector3& worldPosB = cp.getPositionWorldOnB();
+	const b3Vector3& normal = cp.m_normalWorldOnB;
+
+	rel_pos1 = worldPosA - solverA->getWorldTransform().getOrigin();
+	rel_pos2 = worldPosB - solverB->getWorldTransform().getOrigin();
+	relaxation = 1.f;
+
+	// Angular parts of the row: inverse world inertia applied to (arm x normal).
+	const b3Vector3 armCrossNormalA = rel_pos1.cross(normal);
+	if (rb0)
+	{
+		solverConstraint.m_angularComponentA = getInvInertiaTensorWorld(&inertias[solverA->m_originalBodyIndex]) * armCrossNormalA;
+	}
+	else
+	{
+		solverConstraint.m_angularComponentA = b3MakeVector3(0, 0, 0);
+	}
+
+	const b3Vector3 armCrossNormalB = rel_pos2.cross(normal);
+	if (rb1)
+	{
+		solverConstraint.m_angularComponentB = getInvInertiaTensorWorld(&inertias[solverB->m_originalBodyIndex]) * -armCrossNormalB;
+	}
+	else
+	{
+		solverConstraint.m_angularComponentB = b3MakeVector3(0, 0, 0);
+	}
+
+	// Per-body contribution to the impulse denominator.
+#ifdef COMPUTE_IMPULSE_DENOM
+	b3Scalar denomA = rb0->computeImpulseDenominator(worldPosA, normal);
+	b3Scalar denomB = rb1->computeImpulseDenominator(worldPosB, normal);
+#else
+	b3Scalar denomA = 0.f;
+	b3Scalar denomB = 0.f;
+	if (rb0)
+	{
+		const b3Vector3 leverA = (solverConstraint.m_angularComponentA).cross(rel_pos1);
+		denomA = rb0->m_invMass + normal.dot(leverA);
+	}
+	if (rb1)
+	{
+		const b3Vector3 leverB = (-solverConstraint.m_angularComponentB).cross(rel_pos2);
+		denomB = rb1->m_invMass + normal.dot(leverB);
+	}
+#endif  //COMPUTE_IMPULSE_DENOM
+
+	// m_jacDiagABInv always uses the plain denominator. The right-hand side below uses
+	// `scaledDenom`, which differs only when m_usePgs is false: each body's term is then
+	// weighted by its m_bodyCount entry (or by 1 when the body's m_invMass is zero).
+	const b3Scalar denom = relaxation / (denomA + denomB);
+	b3Scalar scaledDenom = denom;
+	if (!m_usePgs)
+	{
+		const b3Scalar countA = rb0->m_invMass ? b3Scalar(m_bodyCount[solverA->m_originalBodyIndex]) : 1.f;
+		const b3Scalar countB = rb1->m_invMass ? b3Scalar(m_bodyCount[solverB->m_originalBodyIndex]) : 1.f;
+		scaledDenom = relaxation / (denomA * countA + denomB * countB);
+	}
+	solverConstraint.m_jacDiagABInv = denom;
+
+	solverConstraint.m_contactNormal = normal;
+	solverConstraint.m_relpos1CrossNormal = armCrossNormalA;
+	solverConstraint.m_relpos2CrossNormal = -armCrossNormalB;
+
+	const b3Scalar penetration = cp.getDistance() + infoGlobal.m_linearSlop;
+
+	// Relative velocity at the contact, reported to the caller through vel / rel_vel.
+	const b3Vector3 pointVelA = rb0 ? getVelocityInLocalPoint(rb0, rel_pos1) : b3MakeVector3(0, 0, 0);
+	const b3Vector3 pointVelB = rb1 ? getVelocityInLocalPoint(rb1, rel_pos2) : b3MakeVector3(0, 0, 0);
+	vel = pointVelA - pointVelB;
+	rel_vel = normal.dot(vel);
+
+	solverConstraint.m_friction = cp.m_combinedFriction;
+
+	// Restitution term; values at or below zero are replaced by zero.
+	b3Scalar restitution = restitutionCurve(rel_vel, cp.m_combinedRestitution);
+	if (restitution <= b3Scalar(0.))
+	{
+		restitution = 0.f;
+	}
+
+	///warm starting (or zero if disabled)
+	if (infoGlobal.m_solverMode & B3_SOLVER_USE_WARMSTARTING)
+	{
+		solverConstraint.m_appliedImpulse = cp.m_appliedImpulse * infoGlobal.m_warmstartingFactor;
+		if (rb0)
+		{
+			solverA->internalApplyImpulse(solverConstraint.m_contactNormal * solverA->internalGetInvMass(), solverConstraint.m_angularComponentA, solverConstraint.m_appliedImpulse);
+		}
+		if (rb1)
+		{
+			solverB->internalApplyImpulse(solverConstraint.m_contactNormal * solverB->internalGetInvMass(), -solverConstraint.m_angularComponentB, -(b3Scalar)solverConstraint.m_appliedImpulse);
+		}
+	}
+	else
+	{
+		solverConstraint.m_appliedImpulse = 0.f;
+	}
+
+	solverConstraint.m_appliedPushImpulse = 0.f;
+
+	// Velocity along the row, taken from the solver bodies' stored velocities.
+	// Named differently from the rel_vel out-parameter, which must stay untouched here.
+	const b3Scalar rowVelA = solverConstraint.m_contactNormal.dot(rb0 ? solverA->m_linearVelocity : b3MakeVector3(0, 0, 0)) + solverConstraint.m_relpos1CrossNormal.dot(rb0 ? solverA->m_angularVelocity : b3MakeVector3(0, 0, 0));
+	const b3Scalar rowVelB = -solverConstraint.m_contactNormal.dot(rb1 ? solverB->m_linearVelocity : b3MakeVector3(0, 0, 0)) + solverConstraint.m_relpos2CrossNormal.dot(rb1 ? solverB->m_angularVelocity : b3MakeVector3(0, 0, 0));
+	const b3Scalar rowRelVel = rowVelA + rowVelB;
+
+	// Position and velocity are merged into m_rhs unless split impulse is enabled and the
+	// penetration is at or below the split-impulse threshold; the same test selects the erp.
+	const bool mergePositionIntoRhs = !infoGlobal.m_splitImpulse || (penetration > infoGlobal.m_splitImpulsePenetrationThreshold);
+	const b3Scalar erp = mergePositionIntoRhs ? infoGlobal.m_erp : infoGlobal.m_erp2;
+
+	b3Scalar positionalError = 0.f;
+	b3Scalar velocityError = restitution - rowRelVel;
+	if (penetration > 0)
+	{
+		positionalError = 0;
+		velocityError -= penetration / infoGlobal.m_timeStep;
+	}
+	else
+	{
+		positionalError = -penetration * erp / infoGlobal.m_timeStep;
+	}
+
+	const b3Scalar penetrationImpulse = positionalError * scaledDenom;
+	const b3Scalar velocityImpulse = velocityError * scaledDenom;
+
+	if (mergePositionIntoRhs)
+	{
+		//combine position and velocity into rhs
+		solverConstraint.m_rhs = penetrationImpulse + velocityImpulse;
+		solverConstraint.m_rhsPenetration = 0.f;
+	}
+	else
+	{
+		//split position and velocity into rhs and m_rhsPenetration
+		solverConstraint.m_rhs = velocityImpulse;
+		solverConstraint.m_rhsPenetration = penetrationImpulse;
+	}
+
+	solverConstraint.m_cfm = 0.f;
+	solverConstraint.m_lowerLimit = 0;
+	solverConstraint.m_upperLimit = 1e10f;
+}
+
+void b3PgsJacobiSolver::setFrictionConstraintImpulse(b3RigidBodyData* bodies, b3InertiaData* inertias, b3SolverConstraint& solverConstraint,
+													 int solverBodyIdA, int solverBodyIdB,
+													 b3ContactPoint& cp, const b3ContactSolverInfo& infoGlobal)
+{
+	// Initialises m_appliedImpulse of the friction row(s) that belong to the contact row
+	// `solverConstraint`. The rows are read from m_tmpSolverContactFrictionConstraintPool
+	// starting at solverConstraint.m_frictionIndex: one row, or two when
+	// B3_SOLVER_USE_2_FRICTION_DIRECTIONS is set.
+	//
+	// With B3_SOLVER_USE_WARMSTARTING the impulse is the contact point's stored lateral
+	// impulse scaled by m_warmstartingFactor, and it is applied to every body whose
+	// m_invMass is non-zero. Otherwise the impulse is set to zero.
+	// `inertias` is not used by this function.
+	b3SolverBody* solverA = &m_tmpSolverBodyPool[solverBodyIdA];
+	b3SolverBody* solverB = &m_tmpSolverBodyPool[solverBodyIdB];
+
+	const bool warmStart = (infoGlobal.m_solverMode & B3_SOLVER_USE_WARMSTARTING) != 0;
+	const int rowCount = (infoGlobal.m_solverMode & B3_SOLVER_USE_2_FRICTION_DIRECTIONS) ? 2 : 1;
+
+	for (int row = 0; row < rowCount; ++row)
+	{
+		b3SolverConstraint& frictionRow = m_tmpSolverContactFrictionConstraintPool[solverConstraint.m_frictionIndex + row];
+		if (!warmStart)
+		{
+			frictionRow.m_appliedImpulse = 0.f;
+			continue;
+		}
+
+		// Row 0 pairs with m_appliedImpulseLateral1, row 1 with m_appliedImpulseLateral2.
+		frictionRow.m_appliedImpulse = ((row == 0) ? cp.m_appliedImpulseLateral1 : cp.m_appliedImpulseLateral2) * infoGlobal.m_warmstartingFactor;
+
+		const b3RigidBodyData& dataA = bodies[solverA->m_originalBodyIndex];
+		if (dataA.m_invMass)
+		{
+			solverA->internalApplyImpulse(frictionRow.m_contactNormal * dataA.m_invMass, frictionRow.m_angularComponentA, frictionRow.m_appliedImpulse);
+		}
+
+		const b3RigidBodyData& dataB = bodies[solverB->m_originalBodyIndex];
+		if (dataB.m_invMass)
+		{
+			solverB->internalApplyImpulse(frictionRow.m_contactNormal * dataB.m_invMass, -frictionRow.m_angularComponentB, -(b3Scalar)frictionRow.m_appliedImpulse);
+		}
+	}
+}
+
+void b3PgsJacobiSolver::convertContact(b3RigidBodyData* bodies, b3InertiaData* inertias, b3Contact4* manifold, const b3ContactSolverInfo& infoGlobal)
+{ /*
+	 * Converts one contact manifold into solver rows.
+	 *
+	 * Looks up (or creates) the solver bodies for the manifold's two bodies and returns
+	 * without adding anything when both have a zero m_invMass. Otherwise, for every contact
+	 * point whose distance is within getContactProcessingThreshold(manifold), it:
+	 *   - appends a row to m_tmpSolverContactConstraintPool and fills it via setupContactConstraint,
+	 *   - records the current size of m_tmpSolverContactFrictionConstraintPool in m_frictionIndex,
+	 *   - calls addRollingFrictionConstraint for at most one contact point of the manifold,
+	 *   - calls addFrictionConstraint once, or twice with B3_SOLVER_USE_2_FRICTION_DIRECTIONS.
+	 */
+	b3RigidBodyData *colObj0 = 0, *colObj1 = 0;  // never reassigned here; both are handed to the add*Constraint calls as null
+
+	int solverBodyIdA = getOrInitSolverBody(manifold->getBodyA(), bodies, inertias);
+	int solverBodyIdB = getOrInitSolverBody(manifold->getBodyB(), bodies, inertias);
+
+	//	b3RigidBody* bodyA = b3RigidBody::upcast(colObj0);
+	//	b3RigidBody* bodyB = b3RigidBody::upcast(colObj1);
+
+	b3SolverBody* solverBodyA = &m_tmpSolverBodyPool[solverBodyIdA];  // pointers are taken only after both getOrInitSolverBody calls
+	b3SolverBody* solverBodyB = &m_tmpSolverBodyPool[solverBodyIdB];
+
+	///avoid collision response between two static objects
+	if (solverBodyA->m_invMass.isZero() && solverBodyB->m_invMass.isZero())
+		return;
+
+	int rollingFriction = 1;  // budget: decremented by the first contact point with a positive m_combinedRollingFriction
+	int numContacts = getNumContacts(manifold);
+	for (int j = 0; j < numContacts; j++)
+	{
+		b3ContactPoint cp;  // loop-local copy, filled by getContactPoint
+		getContactPoint(manifold, j, cp);
+
+		if (cp.getDistance() <= getContactProcessingThreshold(manifold))
+		{
+			b3Vector3 rel_pos1;  // the five locals below are outputs of setupContactConstraint
+			b3Vector3 rel_pos2;
+			b3Scalar relaxation;
+			b3Scalar rel_vel;
+			b3Vector3 vel;
+
+			int frictionIndex = m_tmpSolverContactConstraintPool.size();  // index the new contact row gets in the contact pool
+			b3SolverConstraint& solverConstraint = m_tmpSolverContactConstraintPool.expandNonInitializing();
+			//			b3RigidBody* rb0 = b3RigidBody::upcast(colObj0);
+			//			b3RigidBody* rb1 = b3RigidBody::upcast(colObj1);
+			solverConstraint.m_solverBodyIdA = solverBodyIdA;
+			solverConstraint.m_solverBodyIdB = solverBodyIdB;
+
+			solverConstraint.m_originalContactPoint = &cp;  // NOTE(review): cp is a loop-local, so this address is only valid during this iteration; confirm nothing dereferences it later
+
+			setupContactConstraint(bodies, inertias, solverConstraint, solverBodyIdA, solverBodyIdB, cp, infoGlobal, vel, rel_vel, relaxation, rel_pos1, rel_pos2);
+
+			//			const b3Vector3& pos1 = cp.getPositionWorldOnA();
+			//			const b3Vector3& pos2 = cp.getPositionWorldOnB();
+
+			/////setup the friction constraints
+
+			solverConstraint.m_frictionIndex = m_tmpSolverContactFrictionConstraintPool.size();  // friction pool size before this contact's friction rows are added
+
+			b3Vector3 angVelA, angVelB;
+			solverBodyA->getAngularVelocity(angVelA);
+			solverBodyB->getAngularVelocity(angVelB);
+			b3Vector3 relAngVel = angVelB - angVelA;
+
+			if ((cp.m_combinedRollingFriction > 0.f) && (rollingFriction > 0))
+			{
+				//only a single rollingFriction per manifold
+				rollingFriction--;
+				if (relAngVel.length() > infoGlobal.m_singleAxisRollingFrictionThreshold)
+				{
+					// above the threshold: a single rolling friction row along the normalized relative angular velocity
+					relAngVel.normalize();
+					if (relAngVel.length() > 0.001)
+						addRollingFrictionConstraint(bodies, inertias, relAngVel, solverBodyIdA, solverBodyIdB, frictionIndex, cp, rel_pos1, rel_pos2, colObj0, colObj1, relaxation);
+				}
+				else
+				{
+					// otherwise: one row along the contact normal plus up to two along the axes from b3PlaneSpace1
+					addRollingFrictionConstraint(bodies, inertias, cp.m_normalWorldOnB, solverBodyIdA, solverBodyIdB, frictionIndex, cp, rel_pos1, rel_pos2, colObj0, colObj1, relaxation);
+					b3Vector3 axis0, axis1;
+					b3PlaneSpace1(cp.m_normalWorldOnB, axis0, axis1);
+					if (axis0.length() > 0.001)
+						addRollingFrictionConstraint(bodies, inertias, axis0, solverBodyIdA, solverBodyIdB, frictionIndex, cp, rel_pos1, rel_pos2, colObj0, colObj1, relaxation);
+					if (axis1.length() > 0.001)
+						addRollingFrictionConstraint(bodies, inertias, axis1, solverBodyIdA, solverBodyIdB, frictionIndex, cp, rel_pos1, rel_pos2, colObj0, colObj1, relaxation);
+				}
+			}
+
+			///Bullet has several options to set the friction directions
+			///By default, each contact has only a single friction direction that is recomputed automatically every frame
+			///based on the relative linear velocity.
+			///If the relative velocity is zero, it will automatically compute a friction direction.
+
+			///You can also enable two friction directions, using the B3_SOLVER_USE_2_FRICTION_DIRECTIONS.
+			///In that case, the second friction direction will be orthogonal to both contact normal and first friction direction.
+			///
+			///If you choose B3_SOLVER_DISABLE_VELOCITY_DEPENDENT_FRICTION_DIRECTION, then the friction will be independent from the relative projected velocity.
+			///
+			///The user can manually override the friction directions for certain contacts using a contact callback,
+			///and set the cp.m_lateralFrictionInitialized to true
+			///In that case, you can set the target relative motion in each friction direction (cp.m_contactMotion1 and cp.m_contactMotion2)
+			///this will give a conveyor belt effect
+			///
+			if (!(infoGlobal.m_solverMode & B3_SOLVER_ENABLE_FRICTION_DIRECTION_CACHING) || !cp.m_lateralFrictionInitialized)
+			{
+				cp.m_lateralFrictionDir1 = vel - cp.m_normalWorldOnB * rel_vel;  // relative velocity with its normal component removed
+				b3Scalar lat_rel_vel = cp.m_lateralFrictionDir1.length2();  // squared length, hence the b3Sqrt below
+				if (!(infoGlobal.m_solverMode & B3_SOLVER_DISABLE_VELOCITY_DEPENDENT_FRICTION_DIRECTION) && lat_rel_vel > B3_EPSILON)
+				{
+					cp.m_lateralFrictionDir1 *= 1.f / b3Sqrt(lat_rel_vel);
+					if ((infoGlobal.m_solverMode & B3_SOLVER_USE_2_FRICTION_DIRECTIONS))
+					{
+						cp.m_lateralFrictionDir2 = cp.m_lateralFrictionDir1.cross(cp.m_normalWorldOnB);
+						cp.m_lateralFrictionDir2.normalize();  //??
+						addFrictionConstraint(bodies, inertias, cp.m_lateralFrictionDir2, solverBodyIdA, solverBodyIdB, frictionIndex, cp, rel_pos1, rel_pos2, colObj0, colObj1, relaxation);
+					}
+
+					addFrictionConstraint(bodies, inertias, cp.m_lateralFrictionDir1, solverBodyIdA, solverBodyIdB, frictionIndex, cp, rel_pos1, rel_pos2, colObj0, colObj1, relaxation);
+				}
+				else
+				{
+					b3PlaneSpace1(cp.m_normalWorldOnB, cp.m_lateralFrictionDir1, cp.m_lateralFrictionDir2);  // directions derived from the normal alone
+
+					if ((infoGlobal.m_solverMode & B3_SOLVER_USE_2_FRICTION_DIRECTIONS))
+					{
+						addFrictionConstraint(bodies, inertias, cp.m_lateralFrictionDir2, solverBodyIdA, solverBodyIdB, frictionIndex, cp, rel_pos1, rel_pos2, colObj0, colObj1, relaxation);
+					}
+
+					addFrictionConstraint(bodies, inertias, cp.m_lateralFrictionDir1, solverBodyIdA, solverBodyIdB, frictionIndex, cp, rel_pos1, rel_pos2, colObj0, colObj1, relaxation);
+
+					if ((infoGlobal.m_solverMode & B3_SOLVER_USE_2_FRICTION_DIRECTIONS) && (infoGlobal.m_solverMode & B3_SOLVER_DISABLE_VELOCITY_DEPENDENT_FRICTION_DIRECTION))
+					{
+						cp.m_lateralFrictionInitialized = true;  // written to the loop-local cp only; nothing in this function copies it back to the manifold
+					}
+				}
+			}
+			else
+			{
+				// friction directions already stored on the contact point: reuse them together with the contact motion / CFM values
+				addFrictionConstraint(bodies, inertias, cp.m_lateralFrictionDir1, solverBodyIdA, solverBodyIdB, frictionIndex, cp, rel_pos1, rel_pos2, colObj0, colObj1, relaxation, cp.m_contactMotion1, cp.m_contactCFM1);
+
+				if ((infoGlobal.m_solverMode & B3_SOLVER_USE_2_FRICTION_DIRECTIONS))
+					addFrictionConstraint(bodies, inertias, cp.m_lateralFrictionDir2, solverBodyIdA, solverBodyIdB, frictionIndex, cp, rel_pos1, rel_pos2, colObj0, colObj1, relaxation, cp.m_contactMotion2, cp.m_contactCFM2);
+
+				setFrictionConstraintImpulse(bodies, inertias, solverConstraint, solverBodyIdA, solverBodyIdB, cp, infoGlobal);
+			}
+		}
+	}
+}
+
+b3Scalar b3PgsJacobiSolver::solveGroupCacheFriendlySetup(b3RigidBodyData* bodies, b3InertiaData* inertias, int numBodies, b3Contact4* manifoldPtr, int numManifolds, b3TypedConstraint** constraints, int numConstraints, const b3ContactSolverInfo& infoGlobal)
+{ /*
+	 * Prepares all solver data for one solve. In order, it:
+	 *   1. empties m_tmpSolverBodyPool and re-creates m_bodyCount, m_bodyCountCheck and the
+	 *      two delta-velocity arrays with numBodies zeroed entries,
+	 *   2. fills m_bodyCount from the constraints and manifolds,
+	 *   3. zeroes every constraint's applied impulse,
+	 *   4. asks each constraint for its row count (getInfo1) and row data (getInfo2) and
+	 *      finalises the rows in m_tmpSolverNonContactConstraintPool,
+	 *   5. calls convertContact for every manifold,
+	 *   6. fills the three order arrays with the identity permutation.
+	 * Always returns 0.
+	 * NOTE(review): the contact, friction and rolling-friction pools are not cleared in this
+	 * function; presumably that happens elsewhere - confirm.
+	 */
+	B3_PROFILE("solveGroupCacheFriendlySetup");
+
+	m_maxOverrideNumSolverIterations = 0;
+
+	m_tmpSolverBodyPool.resize(0);
+
+	m_bodyCount.resize(0);  // resize(0) followed by resize(numBodies, 0) leaves every entry at 0
+	m_bodyCount.resize(numBodies, 0);
+	m_bodyCountCheck.resize(0);
+	m_bodyCountCheck.resize(numBodies, 0);
+
+	m_deltaLinearVelocities.resize(0);
+	m_deltaLinearVelocities.resize(numBodies, b3MakeVector3(0, 0, 0));
+	m_deltaAngularVelocities.resize(0);
+	m_deltaAngularVelocities.resize(numBodies, b3MakeVector3(0, 0, 0));
+
+	//int totalBodies = 0;
+
+	for (int i = 0; i < numConstraints; i++)
+	{
+		int bodyIndexA = constraints[i]->getRigidBodyA();
+		int bodyIndexB = constraints[i]->getRigidBodyB();
+		if (m_usePgs)
+		{
+			m_bodyCount[bodyIndexA] = -1;  // with m_usePgs set, bodies are marked with -1 rather than counted
+			m_bodyCount[bodyIndexB] = -1;
+		}
+		else
+		{
+			//didn't implement joints with Jacobi version yet
+			b3Assert(0);
+		}
+	}
+	for (int i = 0; i < numManifolds; i++)
+	{
+		int bodyIndexA = manifoldPtr[i].getBodyA();
+		int bodyIndexB = manifoldPtr[i].getBodyB();
+		if (m_usePgs)
+		{
+			m_bodyCount[bodyIndexA] = -1;
+			m_bodyCount[bodyIndexB] = -1;
+		}
+		else
+		{
+			// without m_usePgs: count one per manifold for bodies with non-zero m_invMass, mark the others with -1
+			if (bodies[bodyIndexA].m_invMass)
+			{
+				//m_bodyCount[bodyIndexA]+=manifoldPtr[i].getNPoints();
+				m_bodyCount[bodyIndexA]++;
+			}
+			else
+				m_bodyCount[bodyIndexA] = -1;
+
+			if (bodies[bodyIndexB].m_invMass)
+				//	m_bodyCount[bodyIndexB]+=manifoldPtr[i].getNPoints();
+				m_bodyCount[bodyIndexB]++;
+			else
+				m_bodyCount[bodyIndexB] = -1;
+		}
+	}
+
+	if (1)
+	{
+		int j;
+		for (j = 0; j < numConstraints; j++)
+		{
+			b3TypedConstraint* constraint = constraints[j];
+
+			constraint->internalSetAppliedImpulse(0.0f);
+		}
+	}
+
+	//b3RigidBody* rb0=0,*rb1=0;
+	//if (1)
+	{
+		{
+			int totalNumRows = 0;
+			int i;
+
+			m_tmpConstraintSizesPool.resizeNoInitialize(numConstraints);
+			//calculate the total number of constraint rows
+			for (i = 0; i < numConstraints; i++)
+			{
+				b3TypedConstraint::b3ConstraintInfo1& info1 = m_tmpConstraintSizesPool[i];
+				b3JointFeedback* fb = constraints[i]->getJointFeedback();
+				if (fb)
+				{
+					// clear the feedback accumulators when the constraint provides a feedback struct
+					fb->m_appliedForceBodyA.setZero();
+					fb->m_appliedTorqueBodyA.setZero();
+					fb->m_appliedForceBodyB.setZero();
+					fb->m_appliedTorqueBodyB.setZero();
+				}
+
+				if (constraints[i]->isEnabled())  // NOTE(review): this first check has an empty body; only the repeated check below does work
+				{
+				}
+				if (constraints[i]->isEnabled())
+				{
+					constraints[i]->getInfo1(&info1, bodies);
+				}
+				else
+				{
+					info1.m_numConstraintRows = 0;  // disabled constraints contribute no rows
+					info1.nub = 0;
+				}
+				totalNumRows += info1.m_numConstraintRows;
+			}
+			m_tmpSolverNonContactConstraintPool.resizeNoInitialize(totalNumRows);
+
+#ifndef DISABLE_JOINTS
+			///setup the b3SolverConstraints
+			int currentRow = 0;  // index of the first row of constraint i inside m_tmpSolverNonContactConstraintPool
+
+			for (i = 0; i < numConstraints; i++)
+			{
+				const b3TypedConstraint::b3ConstraintInfo1& info1 = m_tmpConstraintSizesPool[i];
+
+				if (info1.m_numConstraintRows)
+				{
+					b3Assert(currentRow < totalNumRows);
+
+					b3SolverConstraint* currentConstraintRow = &m_tmpSolverNonContactConstraintPool[currentRow];
+					b3TypedConstraint* constraint = constraints[i];
+
+					b3RigidBodyData& rbA = bodies[constraint->getRigidBodyA()];
+					//b3RigidBody& rbA = constraint->getRigidBodyA();
+					//				b3RigidBody& rbB = constraint->getRigidBodyB();
+					b3RigidBodyData& rbB = bodies[constraint->getRigidBodyB()];
+
+					int solverBodyIdA = getOrInitSolverBody(constraint->getRigidBodyA(), bodies, inertias);
+					int solverBodyIdB = getOrInitSolverBody(constraint->getRigidBodyB(), bodies, inertias);
+
+					b3SolverBody* bodyAPtr = &m_tmpSolverBodyPool[solverBodyIdA];  // pointers are taken only after both getOrInitSolverBody calls
+					b3SolverBody* bodyBPtr = &m_tmpSolverBodyPool[solverBodyIdB];
+
+					// a positive per-constraint override replaces infoGlobal.m_numIterations; the largest value seen is kept
+					int overrideNumSolverIterations = constraint->getOverrideNumSolverIterations() > 0 ? constraint->getOverrideNumSolverIterations() : infoGlobal.m_numIterations;
+					if (overrideNumSolverIterations > m_maxOverrideNumSolverIterations)
+						m_maxOverrideNumSolverIterations = overrideNumSolverIterations;
+
+					int j;
+					for (j = 0; j < info1.m_numConstraintRows; j++)
+					{
+						memset(&currentConstraintRow[j], 0, sizeof(b3SolverConstraint));  // rows come from resizeNoInitialize, so zero them first
+						currentConstraintRow[j].m_lowerLimit = -B3_INFINITY;
+						currentConstraintRow[j].m_upperLimit = B3_INFINITY;
+						currentConstraintRow[j].m_appliedImpulse = 0.f;
+						currentConstraintRow[j].m_appliedPushImpulse = 0.f;
+						currentConstraintRow[j].m_solverBodyIdA = solverBodyIdA;
+						currentConstraintRow[j].m_solverBodyIdB = solverBodyIdB;
+						currentConstraintRow[j].m_overrideNumSolverIterations = overrideNumSolverIterations;
+					}
+
+					bodyAPtr->internalGetDeltaLinearVelocity().setValue(0.f, 0.f, 0.f);
+					bodyAPtr->internalGetDeltaAngularVelocity().setValue(0.f, 0.f, 0.f);
+					bodyAPtr->internalGetPushVelocity().setValue(0.f, 0.f, 0.f);
+					bodyAPtr->internalGetTurnVelocity().setValue(0.f, 0.f, 0.f);
+					bodyBPtr->internalGetDeltaLinearVelocity().setValue(0.f, 0.f, 0.f);
+					bodyBPtr->internalGetDeltaAngularVelocity().setValue(0.f, 0.f, 0.f);
+					bodyBPtr->internalGetPushVelocity().setValue(0.f, 0.f, 0.f);
+					bodyBPtr->internalGetTurnVelocity().setValue(0.f, 0.f, 0.f);
+
+					// info2 points into the first row of this constraint; rowskip is the distance to the next row in b3Scalar units
+					b3TypedConstraint::b3ConstraintInfo2 info2;
+					info2.fps = 1.f / infoGlobal.m_timeStep;
+					info2.erp = infoGlobal.m_erp;
+					info2.m_J1linearAxis = currentConstraintRow->m_contactNormal;
+					info2.m_J1angularAxis = currentConstraintRow->m_relpos1CrossNormal;
+					info2.m_J2linearAxis = 0;
+					info2.m_J2angularAxis = currentConstraintRow->m_relpos2CrossNormal;
+					info2.rowskip = sizeof(b3SolverConstraint) / sizeof(b3Scalar);  //check this
+																					///the size of b3SolverConstraint needs to be a multiple of b3Scalar
+					b3Assert(info2.rowskip * sizeof(b3Scalar) == sizeof(b3SolverConstraint));
+					info2.m_constraintError = &currentConstraintRow->m_rhs;
+					currentConstraintRow->m_cfm = infoGlobal.m_globalCfm;
+					info2.m_damping = infoGlobal.m_damping;
+					info2.cfm = &currentConstraintRow->m_cfm;
+					info2.m_lowerLimit = &currentConstraintRow->m_lowerLimit;
+					info2.m_upperLimit = &currentConstraintRow->m_upperLimit;
+					info2.m_numIterations = infoGlobal.m_numIterations;
+					constraints[i]->getInfo2(&info2, bodies);
+
+					///finalize the constraint setup
+					for (j = 0; j < info1.m_numConstraintRows; j++)
+					{
+						b3SolverConstraint& solverConstraint = currentConstraintRow[j];
+
+						// clamp both limits to +/- the constraint's breaking impulse threshold
+						if (solverConstraint.m_upperLimit >= constraints[i]->getBreakingImpulseThreshold())
+						{
+							solverConstraint.m_upperLimit = constraints[i]->getBreakingImpulseThreshold();
+						}
+
+						if (solverConstraint.m_lowerLimit <= -constraints[i]->getBreakingImpulseThreshold())
+						{
+							solverConstraint.m_lowerLimit = -constraints[i]->getBreakingImpulseThreshold();
+						}
+
+						solverConstraint.m_originalContactPoint = constraint;  // for joint rows this field holds the owning constraint
+
+						b3Matrix3x3& invInertiaWorldA = inertias[constraint->getRigidBodyA()].m_invInertiaWorld;
+						{
+							//b3Vector3 angularFactorA(1,1,1);
+							const b3Vector3& ftorqueAxis1 = solverConstraint.m_relpos1CrossNormal;
+							solverConstraint.m_angularComponentA = invInertiaWorldA * ftorqueAxis1;  //*angularFactorA;
+						}
+
+						b3Matrix3x3& invInertiaWorldB = inertias[constraint->getRigidBodyB()].m_invInertiaWorld;
+						{
+							const b3Vector3& ftorqueAxis2 = solverConstraint.m_relpos2CrossNormal;
+							solverConstraint.m_angularComponentB = invInertiaWorldB * ftorqueAxis2;  //*constraint->getRigidBodyB().getAngularFactor();
+						}
+
+						{
+							//it is ok to use solverConstraint.m_contactNormal instead of -solverConstraint.m_contactNormal
+							//because it gets multiplied iMJlB
+							b3Vector3 iMJlA = solverConstraint.m_contactNormal * rbA.m_invMass;
+							b3Vector3 iMJaA = invInertiaWorldA * solverConstraint.m_relpos1CrossNormal;
+							b3Vector3 iMJlB = solverConstraint.m_contactNormal * rbB.m_invMass;  //sign of normal?
+							b3Vector3 iMJaB = invInertiaWorldB * solverConstraint.m_relpos2CrossNormal;
+
+							b3Scalar sum = iMJlA.dot(solverConstraint.m_contactNormal);
+							sum += iMJaA.dot(solverConstraint.m_relpos1CrossNormal);
+							sum += iMJlB.dot(solverConstraint.m_contactNormal);
+							sum += iMJaB.dot(solverConstraint.m_relpos2CrossNormal);
+							b3Scalar fsum = b3Fabs(sum);
+							b3Assert(fsum > B3_EPSILON);
+							solverConstraint.m_jacDiagABInv = fsum > B3_EPSILON ? b3Scalar(1.) / sum : 0.f;  // falls back to 0 instead of dividing by a near-zero sum
+						}
+
+						///fix rhs
+						///todo: add force/torque accelerators
+						{
+							b3Scalar rel_vel;
+							b3Scalar vel1Dotn = solverConstraint.m_contactNormal.dot(rbA.m_linVel) + solverConstraint.m_relpos1CrossNormal.dot(rbA.m_angVel);
+							b3Scalar vel2Dotn = -solverConstraint.m_contactNormal.dot(rbB.m_linVel) + solverConstraint.m_relpos2CrossNormal.dot(rbB.m_angVel);
+
+							rel_vel = vel1Dotn + vel2Dotn;
+
+							b3Scalar restitution = 0.f;
+							b3Scalar positionalError = solverConstraint.m_rhs;  //already filled in by getConstraintInfo2
+							b3Scalar velocityError = restitution - rel_vel * info2.m_damping;
+							b3Scalar penetrationImpulse = positionalError * solverConstraint.m_jacDiagABInv;
+							b3Scalar velocityImpulse = velocityError * solverConstraint.m_jacDiagABInv;
+							solverConstraint.m_rhs = penetrationImpulse + velocityImpulse;
+							solverConstraint.m_appliedImpulse = 0.f;
+						}
+					}
+				}
+				currentRow += m_tmpConstraintSizesPool[i].m_numConstraintRows;  // advance past this constraint's rows (0 for disabled constraints)
+			}
+#endif  //DISABLE_JOINTS
+		}
+
+		{
+			int i;
+
+			for (i = 0; i < numManifolds; i++)
+			{
+				b3Contact4& manifold = manifoldPtr[i];
+				convertContact(bodies, inertias, &manifold, infoGlobal);
+			}
+		}
+	}
+
+	//	b3ContactSolverInfo info = infoGlobal;
+
+	int numNonContactPool = m_tmpSolverNonContactConstraintPool.size();
+	int numConstraintPool = m_tmpSolverContactConstraintPool.size();
+	int numFrictionPool = m_tmpSolverContactFrictionConstraintPool.size();
+
+	///@todo: use stack allocator for such temporarily memory, same for solver bodies/constraints
+	m_orderNonContactConstraintPool.resizeNoInitialize(numNonContactPool);
+	if ((infoGlobal.m_solverMode & B3_SOLVER_USE_2_FRICTION_DIRECTIONS))
+		m_orderTmpConstraintPool.resizeNoInitialize(numConstraintPool * 2);  // only the first numConstraintPool entries are initialised below
+	else
+		m_orderTmpConstraintPool.resizeNoInitialize(numConstraintPool);
+
+	m_orderFrictionConstraintPool.resizeNoInitialize(numFrictionPool);
+	{
+		// start every order array as the identity permutation
+		int i;
+		for (i = 0; i < numNonContactPool; i++)
+		{
+			m_orderNonContactConstraintPool[i] = i;
+		}
+		for (i = 0; i < numConstraintPool; i++)
+		{
+			m_orderTmpConstraintPool[i] = i;
+		}
+		for (i = 0; i < numFrictionPool; i++)
+		{
+			m_orderFrictionConstraintPool[i] = i;
+		}
+	}
+
+	return 0.f;
+}
+
+b3Scalar b3PgsJacobiSolver::solveSingleIteration(int iteration, b3TypedConstraint** constraints, int numConstraints, const b3ContactSolverInfo& infoGlobal)
+{ /*
+	 * Runs solver iteration number `iteration` over the rows built during setup.
+	 *
+	 * With B3_SOLVER_RANDMIZE_ORDER the order arrays are shuffled first. Joint rows are then
+	 * solved while iteration < the row's m_overrideNumSolverIterations. Contact, friction and
+	 * rolling-friction rows are solved only while iteration < infoGlobal.m_numIterations.
+	 * B3_SOLVER_SIMD selects the *SIMD resolve functions; otherwise the scalar ones are used.
+	 *
+	 * `constraints` and `numConstraints` are not used in this function. Always returns 0.
+	 */
+	int numNonContactPool = m_tmpSolverNonContactConstraintPool.size();
+	int numConstraintPool = m_tmpSolverContactConstraintPool.size();
+	int numFrictionPool = m_tmpSolverContactFrictionConstraintPool.size();
+
+	if (infoGlobal.m_solverMode & B3_SOLVER_RANDMIZE_ORDER)
+	{
+		if (1)  // uncomment this for a bit less random ((iteration & 7) == 0)
+		{
+			// each loop below swaps entry j with the entry at index b3RandInt2(j + 1)
+			for (int j = 0; j < numNonContactPool; ++j)
+			{
+				int tmp = m_orderNonContactConstraintPool[j];
+				int swapi = b3RandInt2(j + 1);
+				m_orderNonContactConstraintPool[j] = m_orderNonContactConstraintPool[swapi];
+				m_orderNonContactConstraintPool[swapi] = tmp;
+			}
+
+			//contact/friction constraints are not solved more than infoGlobal.m_numIterations times, so stop shuffling them after that
+			if (iteration < infoGlobal.m_numIterations)
+			{
+				for (int j = 0; j < numConstraintPool; ++j)
+				{
+					int tmp = m_orderTmpConstraintPool[j];
+					int swapi = b3RandInt2(j + 1);
+					m_orderTmpConstraintPool[j] = m_orderTmpConstraintPool[swapi];
+					m_orderTmpConstraintPool[swapi] = tmp;
+				}
+
+				for (int j = 0; j < numFrictionPool; ++j)
+				{
+					int tmp = m_orderFrictionConstraintPool[j];
+					int swapi = b3RandInt2(j + 1);
+					m_orderFrictionConstraintPool[j] = m_orderFrictionConstraintPool[swapi];
+					m_orderFrictionConstraintPool[swapi] = tmp;
+				}
+			}
+		}
+	}
+
+	if (infoGlobal.m_solverMode & B3_SOLVER_SIMD)
+	{
+		///solve all joint constraints, using SIMD, if available
+		for (int j = 0; j < m_tmpSolverNonContactConstraintPool.size(); j++)
+		{
+			b3SolverConstraint& constraint = m_tmpSolverNonContactConstraintPool[m_orderNonContactConstraintPool[j]];
+			if (iteration < constraint.m_overrideNumSolverIterations)
+				resolveSingleConstraintRowGenericSIMD(m_tmpSolverBodyPool[constraint.m_solverBodyIdA], m_tmpSolverBodyPool[constraint.m_solverBodyIdB], constraint);
+		}
+
+		if (iteration < infoGlobal.m_numIterations)
+		{
+			///solve all contact constraints using SIMD, if available
+			if (infoGlobal.m_solverMode & B3_SOLVER_INTERLEAVE_CONTACT_AND_FRICTION_CONSTRAINTS)
+			{
+				// interleaved path: each contact row is followed immediately by one or two friction rows; the rolling-friction pool is not visited here
+				int numPoolConstraints = m_tmpSolverContactConstraintPool.size();
+				int multiplier = (infoGlobal.m_solverMode & B3_SOLVER_USE_2_FRICTION_DIRECTIONS) ? 2 : 1;
+
+				for (int c = 0; c < numPoolConstraints; c++)
+				{
+					b3Scalar totalImpulse = 0;
+
+					{
+						const b3SolverConstraint& solveManifold = m_tmpSolverContactConstraintPool[m_orderTmpConstraintPool[c]];
+						resolveSingleConstraintRowLowerLimitSIMD(m_tmpSolverBodyPool[solveManifold.m_solverBodyIdA], m_tmpSolverBodyPool[solveManifold.m_solverBodyIdB], solveManifold);
+						totalImpulse = solveManifold.m_appliedImpulse;
+					}
+					bool applyFriction = true;
+					if (applyFriction)
+					{
+						{
+							// NOTE(review): the friction row is picked by position c * multiplier in the separately shuffled friction order, not through the contact's m_frictionIndex - confirm it belongs to the contact solved above
+							b3SolverConstraint& solveManifold = m_tmpSolverContactFrictionConstraintPool[m_orderFrictionConstraintPool[c * multiplier]];
+
+							if (totalImpulse > b3Scalar(0))
+							{
+								solveManifold.m_lowerLimit = -(solveManifold.m_friction * totalImpulse);
+								solveManifold.m_upperLimit = solveManifold.m_friction * totalImpulse;
+
+								resolveSingleConstraintRowGenericSIMD(m_tmpSolverBodyPool[solveManifold.m_solverBodyIdA], m_tmpSolverBodyPool[solveManifold.m_solverBodyIdB], solveManifold);
+							}
+						}
+
+						if (infoGlobal.m_solverMode & B3_SOLVER_USE_2_FRICTION_DIRECTIONS)
+						{
+							b3SolverConstraint& solveManifold = m_tmpSolverContactFrictionConstraintPool[m_orderFrictionConstraintPool[c * multiplier + 1]];
+
+							if (totalImpulse > b3Scalar(0))
+							{
+								solveManifold.m_lowerLimit = -(solveManifold.m_friction * totalImpulse);
+								solveManifold.m_upperLimit = solveManifold.m_friction * totalImpulse;
+
+								resolveSingleConstraintRowGenericSIMD(m_tmpSolverBodyPool[solveManifold.m_solverBodyIdA], m_tmpSolverBodyPool[solveManifold.m_solverBodyIdB], solveManifold);
+							}
+						}
+					}
+				}
+			}
+			else  //B3_SOLVER_INTERLEAVE_CONTACT_AND_FRICTION_CONSTRAINTS
+			{
+				//solve the friction constraints after all contact constraints, don't interleave them
+				int numPoolConstraints = m_tmpSolverContactConstraintPool.size();
+				int j;
+
+				for (j = 0; j < numPoolConstraints; j++)
+				{
+					const b3SolverConstraint& solveManifold = m_tmpSolverContactConstraintPool[m_orderTmpConstraintPool[j]];
+					resolveSingleConstraintRowLowerLimitSIMD(m_tmpSolverBodyPool[solveManifold.m_solverBodyIdA], m_tmpSolverBodyPool[solveManifold.m_solverBodyIdB], solveManifold);
+				}
+
+				if (!m_usePgs)
+					averageVelocities();  // within this function, called only on this SIMD non-interleaved path
+
+				///solve all friction constraints, using SIMD, if available
+
+				int numFrictionPoolConstraints = m_tmpSolverContactFrictionConstraintPool.size();
+				for (j = 0; j < numFrictionPoolConstraints; j++)
+				{
+					b3SolverConstraint& solveManifold = m_tmpSolverContactFrictionConstraintPool[m_orderFrictionConstraintPool[j]];
+					b3Scalar totalImpulse = m_tmpSolverContactConstraintPool[solveManifold.m_frictionIndex].m_appliedImpulse;  // here m_frictionIndex indexes the contact pool
+
+					if (totalImpulse > b3Scalar(0))
+					{
+						// friction limits are +/- (m_friction * applied impulse of the contact row); rows are skipped while that impulse is not positive
+						solveManifold.m_lowerLimit = -(solveManifold.m_friction * totalImpulse);
+						solveManifold.m_upperLimit = solveManifold.m_friction * totalImpulse;
+
+						resolveSingleConstraintRowGenericSIMD(m_tmpSolverBodyPool[solveManifold.m_solverBodyIdA], m_tmpSolverBodyPool[solveManifold.m_solverBodyIdB], solveManifold);
+					}
+				}
+
+				int numRollingFrictionPoolConstraints = m_tmpSolverContactRollingFrictionConstraintPool.size();
+				for (j = 0; j < numRollingFrictionPoolConstraints; j++)
+				{
+					b3SolverConstraint& rollingFrictionConstraint = m_tmpSolverContactRollingFrictionConstraintPool[j];  // rolling rows are visited in pool order, not through an order array
+					b3Scalar totalImpulse = m_tmpSolverContactConstraintPool[rollingFrictionConstraint.m_frictionIndex].m_appliedImpulse;
+					if (totalImpulse > b3Scalar(0))
+					{
+						// limit magnitude is m_friction * totalImpulse, capped at m_friction
+						b3Scalar rollingFrictionMagnitude = rollingFrictionConstraint.m_friction * totalImpulse;
+						if (rollingFrictionMagnitude > rollingFrictionConstraint.m_friction)
+							rollingFrictionMagnitude = rollingFrictionConstraint.m_friction;
+
+						rollingFrictionConstraint.m_lowerLimit = -rollingFrictionMagnitude;
+						rollingFrictionConstraint.m_upperLimit = rollingFrictionMagnitude;
+
+						resolveSingleConstraintRowGenericSIMD(m_tmpSolverBodyPool[rollingFrictionConstraint.m_solverBodyIdA], m_tmpSolverBodyPool[rollingFrictionConstraint.m_solverBodyIdB], rollingFrictionConstraint);
+					}
+				}
+			}
+		}
+	}
+	else
+	{
+		//non-SIMD version
+		///solve all joint constraints
+		for (int j = 0; j < m_tmpSolverNonContactConstraintPool.size(); j++)
+		{
+			b3SolverConstraint& constraint = m_tmpSolverNonContactConstraintPool[m_orderNonContactConstraintPool[j]];
+			if (iteration < constraint.m_overrideNumSolverIterations)
+				resolveSingleConstraintRowGeneric(m_tmpSolverBodyPool[constraint.m_solverBodyIdA], m_tmpSolverBodyPool[constraint.m_solverBodyIdB], constraint);
+		}
+
+		if (iteration < infoGlobal.m_numIterations)
+		{
+			///solve all contact constraints
+			int numPoolConstraints = m_tmpSolverContactConstraintPool.size();
+			for (int j = 0; j < numPoolConstraints; j++)
+			{
+				const b3SolverConstraint& solveManifold = m_tmpSolverContactConstraintPool[m_orderTmpConstraintPool[j]];
+				resolveSingleConstraintRowLowerLimit(m_tmpSolverBodyPool[solveManifold.m_solverBodyIdA], m_tmpSolverBodyPool[solveManifold.m_solverBodyIdB], solveManifold);
+			}
+			///solve all friction constraints
+			int numFrictionPoolConstraints = m_tmpSolverContactFrictionConstraintPool.size();
+			for (int j = 0; j < numFrictionPoolConstraints; j++)
+			{
+				b3SolverConstraint& solveManifold = m_tmpSolverContactFrictionConstraintPool[m_orderFrictionConstraintPool[j]];
+				b3Scalar totalImpulse = m_tmpSolverContactConstraintPool[solveManifold.m_frictionIndex].m_appliedImpulse;  // applied impulse of the contact row this friction row refers to
+
+				if (totalImpulse > b3Scalar(0))
+				{
+					solveManifold.m_lowerLimit = -(solveManifold.m_friction * totalImpulse);
+					solveManifold.m_upperLimit = solveManifold.m_friction * totalImpulse;
+
+					resolveSingleConstraintRowGeneric(m_tmpSolverBodyPool[solveManifold.m_solverBodyIdA], m_tmpSolverBodyPool[solveManifold.m_solverBodyIdB], solveManifold);
+				}
+			}
+
+			int numRollingFrictionPoolConstraints = m_tmpSolverContactRollingFrictionConstraintPool.size();
+			for (int j = 0; j < numRollingFrictionPoolConstraints; j++)
+			{
+				b3SolverConstraint& rollingFrictionConstraint = m_tmpSolverContactRollingFrictionConstraintPool[j];
+				b3Scalar totalImpulse = m_tmpSolverContactConstraintPool[rollingFrictionConstraint.m_frictionIndex].m_appliedImpulse;
+				if (totalImpulse > b3Scalar(0))
+				{
+					// same capping rule as the SIMD path: magnitude is m_friction * totalImpulse, at most m_friction
+					b3Scalar rollingFrictionMagnitude = rollingFrictionConstraint.m_friction * totalImpulse;
+					if (rollingFrictionMagnitude > rollingFrictionConstraint.m_friction)
+						rollingFrictionMagnitude = rollingFrictionConstraint.m_friction;
+
+					rollingFrictionConstraint.m_lowerLimit = -rollingFrictionMagnitude;
+					rollingFrictionConstraint.m_upperLimit = rollingFrictionMagnitude;
+
+					resolveSingleConstraintRowGeneric(m_tmpSolverBodyPool[rollingFrictionConstraint.m_solverBodyIdA], m_tmpSolverBodyPool[rollingFrictionConstraint.m_solverBodyIdB], rollingFrictionConstraint);
+				}
+			}
+		}
+	}
+	return 0.f;
+}
+
+/// Runs the split-impulse penetration pass over every contact row.
+/// Nothing happens unless infoGlobal.m_splitImpulse is set. Otherwise the
+/// contact pool is swept infoGlobal.m_numIterations times, in the order given
+/// by m_orderTmpConstraintPool, using the SIMD row solver when B3_SOLVER_SIMD
+/// is present in infoGlobal.m_solverMode and the scalar one otherwise.
+/// The constraints and numConstraints arguments are not read here.
+void b3PgsJacobiSolver::solveGroupCacheFriendlySplitImpulseIterations(b3TypedConstraint** constraints, int numConstraints, const b3ContactSolverInfo& infoGlobal)
+{
+	if (!infoGlobal.m_splitImpulse)
+	{
+		return;
+	}
+
+	// The solver flavour is fixed for the whole call, so decide it once.
+	const bool simdPath = (infoGlobal.m_solverMode & B3_SOLVER_SIMD) != 0;
+
+	for (int sweep = 0; sweep < infoGlobal.m_numIterations; ++sweep)
+	{
+		const int contactCount = m_tmpSolverContactConstraintPool.size();
+		for (int k = 0; k < contactCount; ++k)
+		{
+			const b3SolverConstraint& row = m_tmpSolverContactConstraintPool[m_orderTmpConstraintPool[k]];
+			b3SolverBody& first = m_tmpSolverBodyPool[row.m_solverBodyIdA];
+			b3SolverBody& second = m_tmpSolverBodyPool[row.m_solverBodyIdB];
+
+			if (simdPath)
+			{
+				resolveSplitPenetrationSIMD(first, second, row);
+			}
+			else
+			{
+				resolveSplitPenetrationImpulseCacheFriendly(first, second, row);
+			}
+		}
+	}
+}
+
+/// Drives the iterative part of the solve.
+/// First runs the split-impulse penetration pass, then calls
+/// solveSingleIteration once per iteration, where the iteration count is the
+/// larger of m_maxOverrideNumSolverIterations and infoGlobal.m_numIterations.
+/// When the solver is not in PGS mode the velocities are averaged after every
+/// iteration. Always returns 0.
+b3Scalar b3PgsJacobiSolver::solveGroupCacheFriendlyIterations(b3TypedConstraint** constraints, int numConstraints, const b3ContactSolverInfo& infoGlobal)
+{
+	B3_PROFILE("solveGroupCacheFriendlyIterations");
+
+	///this is a special step to resolve penetrations (just for contacts)
+	solveGroupCacheFriendlySplitImpulseIterations(constraints, numConstraints, infoGlobal);
+
+	int iterationBudget = infoGlobal.m_numIterations;
+	if (m_maxOverrideNumSolverIterations > iterationBudget)
+	{
+		iterationBudget = m_maxOverrideNumSolverIterations;
+	}
+
+	for (int pass = 0; pass < iterationBudget; ++pass)
+	{
+		solveSingleIteration(pass, constraints, numConstraints, infoGlobal);
+
+		if (m_usePgs)
+		{
+			continue;
+		}
+		averageVelocities();
+	}
+
+	return 0.f;
+}
+
+/// Averaging step used when the solver is not running as PGS.
+/// Sums the delta linear/angular velocities of every solver body with a
+/// non-zero inverse mass into per-original-body accumulators, then overwrites
+/// each such solver body's deltas with the accumulated value divided by
+/// m_bodyCount for its original body.
+void b3PgsJacobiSolver::averageVelocities()
+{
+	B3_PROFILE("averaging");
+
+	const int originalBodyCount = m_bodyCount.size();
+	const b3Vector3 zero = b3MakeVector3(0, 0, 0);
+
+	// Reset both accumulators to originalBodyCount zero vectors.
+	m_deltaLinearVelocities.resize(0);
+	m_deltaLinearVelocities.resize(originalBodyCount, zero);
+	m_deltaAngularVelocities.resize(0);
+	m_deltaAngularVelocities.resize(originalBodyCount, zero);
+
+	const int solverBodyCount = m_tmpSolverBodyPool.size();
+
+	// Pass 1: accumulate the deltas per original body.
+	for (int s = 0; s < solverBodyCount; ++s)
+	{
+		b3SolverBody& solverBody = m_tmpSolverBodyPool[s];
+		if (solverBody.m_invMass.isZero())
+		{
+			continue;
+		}
+
+		const int owner = solverBody.m_originalBodyIndex;
+		m_deltaLinearVelocities[owner] += solverBody.getDeltaLinearVelocity();
+		m_deltaAngularVelocities[owner] += solverBody.getDeltaAngularVelocity();
+	}
+
+	// Pass 2: hand the averaged deltas back to every contributing solver body.
+	for (int s = 0; s < solverBodyCount; ++s)
+	{
+		b3SolverBody& solverBody = m_tmpSolverBodyPool[s];
+		if (solverBody.m_invMass.isZero())
+		{
+			continue;
+		}
+
+		const int owner = solverBody.m_originalBodyIndex;
+		b3Assert(m_bodyCount[owner] == m_bodyCountCheck[owner]);
+
+		const b3Scalar scale = 1.f / b3Scalar(m_bodyCount[owner]);
+
+		solverBody.m_deltaLinearVelocity = m_deltaLinearVelocities[owner] * scale;
+		solverBody.m_deltaAngularVelocity = m_deltaAngularVelocities[owner] * scale;
+	}
+}
+
+b3Scalar b3PgsJacobiSolver::solveGroupCacheFriendlyFinish(b3RigidBodyData* bodies, b3InertiaData* inertias, int numBodies, const b3ContactSolverInfo& infoGlobal)  // writes solver results back to contact points, joints and bodies, then empties the temporary pools; inertias and numBodies are not read; always returns 0
+{
+	B3_PROFILE("solveGroupCacheFriendlyFinish");
+	int numPoolConstraints = m_tmpSolverContactConstraintPool.size();
+	int i, j;
+	// Step 1 (warm starting only): store the applied impulses on the original contact points.
+	if (infoGlobal.m_solverMode & B3_SOLVER_USE_WARMSTARTING)
+	{
+		for (j = 0; j < numPoolConstraints; j++)
+		{
+			const b3SolverConstraint& solveManifold = m_tmpSolverContactConstraintPool[j];
+			b3ContactPoint* pt = (b3ContactPoint*)solveManifold.m_originalContactPoint;
+			b3Assert(pt);
+			pt->m_appliedImpulse = solveManifold.m_appliedImpulse;
+			//	float f = m_tmpSolverContactFrictionConstraintPool[solveManifold.m_frictionIndex].m_appliedImpulse;
+			//	printf("pt->m_appliedImpulseLateral1 = %f\n", f);
+			pt->m_appliedImpulseLateral1 = m_tmpSolverContactFrictionConstraintPool[solveManifold.m_frictionIndex].m_appliedImpulse;
+			//printf("pt->m_appliedImpulseLateral1 = %f\n", pt->m_appliedImpulseLateral1);
+			if ((infoGlobal.m_solverMode & B3_SOLVER_USE_2_FRICTION_DIRECTIONS))
+			{
+				pt->m_appliedImpulseLateral2 = m_tmpSolverContactFrictionConstraintPool[solveManifold.m_frictionIndex + 1].m_appliedImpulse;  // second friction row is read from the slot right after the first
+			}
+			//do a callback here?
+		}
+	}
+	// Step 2: joint rows - accumulate feedback, record the applied impulse, break over-stressed joints.
+	numPoolConstraints = m_tmpSolverNonContactConstraintPool.size();
+	for (j = 0; j < numPoolConstraints; j++)
+	{
+		const b3SolverConstraint& solverConstr = m_tmpSolverNonContactConstraintPool[j];
+		b3TypedConstraint* constr = (b3TypedConstraint*)solverConstr.m_originalContactPoint;  // for joint rows this field carries the owning constraint pointer
+		b3JointFeedback* fb = constr->getJointFeedback();
+		if (fb)
+		{
+			b3SolverBody* bodyA = &m_tmpSolverBodyPool[solverConstr.m_solverBodyIdA];
+			b3SolverBody* bodyB = &m_tmpSolverBodyPool[solverConstr.m_solverBodyIdB];
+			// Feedback terms are the row impulse divided by the time step, accumulated over all rows.
+			fb->m_appliedForceBodyA += solverConstr.m_contactNormal * solverConstr.m_appliedImpulse * bodyA->m_linearFactor / infoGlobal.m_timeStep;
+			fb->m_appliedForceBodyB += -solverConstr.m_contactNormal * solverConstr.m_appliedImpulse * bodyB->m_linearFactor / infoGlobal.m_timeStep;
+			fb->m_appliedTorqueBodyA += solverConstr.m_relpos1CrossNormal * bodyA->m_angularFactor * solverConstr.m_appliedImpulse / infoGlobal.m_timeStep;
+			fb->m_appliedTorqueBodyB += -solverConstr.m_relpos1CrossNormal * bodyB->m_angularFactor * solverConstr.m_appliedImpulse / infoGlobal.m_timeStep;  // NOTE(review): uses negated m_relpos1CrossNormal although the row also stores m_relpos2CrossNormal - confirm this is intended for body B
+		}
+		// The applied impulse is recorded on the constraint whether or not feedback is requested.
+		constr->internalSetAppliedImpulse(solverConstr.m_appliedImpulse);
+		if (b3Fabs(solverConstr.m_appliedImpulse) >= constr->getBreakingImpulseThreshold())
+		{
+			constr->setEnabled(false);  // joint is disabled once its impulse magnitude reaches the breaking threshold
+		}
+	}
+	// Step 3: copy velocities (and, with split impulse, transforms) back to the rigid body array.
+	{
+		B3_PROFILE("write back velocities and transforms");
+		for (i = 0; i < m_tmpSolverBodyPool.size(); i++)
+		{
+			int bodyIndex = m_tmpSolverBodyPool[i].m_originalBodyIndex;
+			//b3Assert(i==bodyIndex);
+			// Bodies whose m_invMass is zero are left untouched.
+			b3RigidBodyData* body = &bodies[bodyIndex];
+			if (body->m_invMass)
+			{
+				if (infoGlobal.m_splitImpulse)
+					m_tmpSolverBodyPool[i].writebackVelocityAndTransform(infoGlobal.m_timeStep, infoGlobal.m_splitImpulseTurnErp);
+				else
+					m_tmpSolverBodyPool[i].writebackVelocity();
+				// PGS copies the solver body's velocities; otherwise the averaged deltas are added instead.
+				if (m_usePgs)
+				{
+					body->m_linVel = m_tmpSolverBodyPool[i].m_linearVelocity;
+					body->m_angVel = m_tmpSolverBodyPool[i].m_angularVelocity;
+				}
+				else
+				{
+					b3Scalar factor = 1.f / b3Scalar(m_bodyCount[bodyIndex]);
+					// m_deltaLinearVelocities / m_deltaAngularVelocities hold the sums left by the last averageVelocities() call.
+					b3Vector3 deltaLinVel = m_deltaLinearVelocities[bodyIndex] * factor;
+					b3Vector3 deltaAngVel = m_deltaAngularVelocities[bodyIndex] * factor;
+					//printf("body %d\n",bodyIndex);
+					//printf("deltaLinVel = %f,%f,%f\n",deltaLinVel.getX(),deltaLinVel.getY(),deltaLinVel.getZ());
+					//printf("deltaAngVel = %f,%f,%f\n",deltaAngVel.getX(),deltaAngVel.getY(),deltaAngVel.getZ());
+					// NOTE(review): this addition runs once per solver body that maps to bodyIndex - confirm that is the intended accumulation.
+					body->m_linVel += deltaLinVel;
+					body->m_angVel += deltaAngVel;
+				}
+				// With split impulse the corrected transform is copied back as position + quaternion.
+				if (infoGlobal.m_splitImpulse)
+				{
+					body->m_pos = m_tmpSolverBodyPool[i].m_worldTransform.getOrigin();
+					b3Quaternion orn;
+					orn = m_tmpSolverBodyPool[i].m_worldTransform.getRotation();
+					body->m_quat = orn;
+				}
+			}
+		}
+	}
+	// Step 4: empty every temporary pool by resizing it to zero.
+	m_tmpSolverContactConstraintPool.resizeNoInitialize(0);
+	m_tmpSolverNonContactConstraintPool.resizeNoInitialize(0);
+	m_tmpSolverContactFrictionConstraintPool.resizeNoInitialize(0);
+	m_tmpSolverContactRollingFrictionConstraintPool.resizeNoInitialize(0);
+
+	m_tmpSolverBodyPool.resizeNoInitialize(0);
+	return 0.f;
+}
+
+void b3PgsJacobiSolver::reset()  // sets the seed used for re-ordering constraint rows back to 0
+{
+	m_btSeed2 = 0;  // the only member written here; no pool or cache is cleared in this function
+}

+ 133 - 0
Dependencies/include/bullet3/Bullet3Dynamics/ConstraintSolver/b3PgsJacobiSolver.h

@@ -0,0 +1,133 @@
+#ifndef B3_PGS_JACOBI_SOLVER
+#define B3_PGS_JACOBI_SOLVER
+
+struct b3Contact4;
+struct b3ContactPoint;
+
+class b3Dispatcher;
+
+#include "b3TypedConstraint.h"
+#include "b3ContactSolverInfo.h"
+#include "b3SolverBody.h"
+#include "b3SolverConstraint.h"
+
+struct b3RigidBodyData;
+struct b3InertiaData;
+
+class b3PgsJacobiSolver  // iterative contact/joint solver; runs as PGS when m_usePgs is true, otherwise averages velocities after every iteration
+{
+protected:
+	b3AlignedObjectArray<b3SolverBody> m_tmpSolverBodyPool;  // temporary per-solve bodies; emptied by solveGroupCacheFriendlyFinish
+	b3ConstraintArray m_tmpSolverContactConstraintPool;  // contact (normal) rows
+	b3ConstraintArray m_tmpSolverNonContactConstraintPool;  // joint rows
+	b3ConstraintArray m_tmpSolverContactFrictionConstraintPool;  // friction rows, looked up through a contact row's m_frictionIndex
+	b3ConstraintArray m_tmpSolverContactRollingFrictionConstraintPool;  // rolling friction rows
+	// Index arrays giving the order in which the rows of the pools above are visited.
+	b3AlignedObjectArray<int> m_orderTmpConstraintPool;
+	b3AlignedObjectArray<int> m_orderNonContactConstraintPool;
+	b3AlignedObjectArray<int> m_orderFrictionConstraintPool;
+	b3AlignedObjectArray<b3TypedConstraint::b3ConstraintInfo1> m_tmpConstraintSizesPool;
+	// Per original body counts; m_bodyCount is the divisor used when averaging velocities.
+	b3AlignedObjectArray<int> m_bodyCount;
+	b3AlignedObjectArray<int> m_bodyCountCheck;  // asserted equal to m_bodyCount in averageVelocities()
+	// Per original body accumulators filled by averageVelocities().
+	b3AlignedObjectArray<b3Vector3> m_deltaLinearVelocities;
+	b3AlignedObjectArray<b3Vector3> m_deltaAngularVelocities;
+
+	bool m_usePgs;  // false: averageVelocities() runs after each solver iteration
+	void averageVelocities();
+
+	int m_maxOverrideNumSolverIterations;  // the iteration loop runs max(this, infoGlobal.m_numIterations) times
+
+	int m_numSplitImpulseRecoveries;
+	// Returns a fixed threshold of 0.02; the contact argument is ignored.
+	b3Scalar getContactProcessingThreshold(b3Contact4* contact)
+	{
+		return 0.02f;
+	}
+	void setupFrictionConstraint(b3RigidBodyData* bodies, b3InertiaData* inertias, b3SolverConstraint& solverConstraint, const b3Vector3& normalAxis, int solverBodyIdA, int solverBodyIdB,
+								 b3ContactPoint& cp, const b3Vector3& rel_pos1, const b3Vector3& rel_pos2,
+								 b3RigidBodyData* colObj0, b3RigidBodyData* colObj1, b3Scalar relaxation,
+								 b3Scalar desiredVelocity = 0., b3Scalar cfmSlip = 0.);
+
+	void setupRollingFrictionConstraint(b3RigidBodyData* bodies, b3InertiaData* inertias, b3SolverConstraint& solverConstraint, const b3Vector3& normalAxis, int solverBodyIdA, int solverBodyIdB,
+										b3ContactPoint& cp, const b3Vector3& rel_pos1, const b3Vector3& rel_pos2,
+										b3RigidBodyData* colObj0, b3RigidBodyData* colObj1, b3Scalar relaxation,
+										b3Scalar desiredVelocity = 0., b3Scalar cfmSlip = 0.);
+
+	b3SolverConstraint& addFrictionConstraint(b3RigidBodyData* bodies, b3InertiaData* inertias, const b3Vector3& normalAxis, int solverBodyIdA, int solverBodyIdB, int frictionIndex, b3ContactPoint& cp, const b3Vector3& rel_pos1, const b3Vector3& rel_pos2, b3RigidBodyData* colObj0, b3RigidBodyData* colObj1, b3Scalar relaxation, b3Scalar desiredVelocity = 0., b3Scalar cfmSlip = 0.);
+	b3SolverConstraint& addRollingFrictionConstraint(b3RigidBodyData* bodies, b3InertiaData* inertias, const b3Vector3& normalAxis, int solverBodyIdA, int solverBodyIdB, int frictionIndex, b3ContactPoint& cp, const b3Vector3& rel_pos1, const b3Vector3& rel_pos2, b3RigidBodyData* colObj0, b3RigidBodyData* colObj1, b3Scalar relaxation, b3Scalar desiredVelocity = 0, b3Scalar cfmSlip = 0.f);
+
+	void setupContactConstraint(b3RigidBodyData* bodies, b3InertiaData* inertias,
+								b3SolverConstraint& solverConstraint, int solverBodyIdA, int solverBodyIdB, b3ContactPoint& cp,
+								const b3ContactSolverInfo& infoGlobal, b3Vector3& vel, b3Scalar& rel_vel, b3Scalar& relaxation,
+								b3Vector3& rel_pos1, b3Vector3& rel_pos2);
+
+	void setFrictionConstraintImpulse(b3RigidBodyData* bodies, b3InertiaData* inertias, b3SolverConstraint& solverConstraint, int solverBodyIdA, int solverBodyIdB,
+									  b3ContactPoint& cp, const b3ContactSolverInfo& infoGlobal);
+
+	///m_btSeed2 is used for re-arranging the constraint rows. improves convergence/quality of friction
+	unsigned long m_btSeed2;  // read and written through getRandSeed()/setRandSeed(); zeroed by reset()
+
+	b3Scalar restitutionCurve(b3Scalar rel_vel, b3Scalar restitution);
+
+	void convertContact(b3RigidBodyData* bodies, b3InertiaData* inertias, b3Contact4* manifold, const b3ContactSolverInfo& infoGlobal);
+	// Split-impulse penetration row solvers (SIMD and scalar flavours).
+	void resolveSplitPenetrationSIMD(
+		b3SolverBody& bodyA, b3SolverBody& bodyB,
+		const b3SolverConstraint& contactConstraint);
+
+	void resolveSplitPenetrationImpulseCacheFriendly(
+		b3SolverBody& bodyA, b3SolverBody& bodyB,
+		const b3SolverConstraint& contactConstraint);
+
+	//internal method
+	int getOrInitSolverBody(int bodyIndex, b3RigidBodyData* bodies, b3InertiaData* inertias);
+	void initSolverBody(int bodyIndex, b3SolverBody* solverBody, b3RigidBodyData* collisionObject);
+	// Single-row solvers: "Generic" is used for joint and friction rows, "LowerLimit" for contact rows; each has a SIMD twin.
+	void resolveSingleConstraintRowGeneric(b3SolverBody& bodyA, b3SolverBody& bodyB, const b3SolverConstraint& contactConstraint);
+
+	void resolveSingleConstraintRowGenericSIMD(b3SolverBody& bodyA, b3SolverBody& bodyB, const b3SolverConstraint& contactConstraint);
+
+	void resolveSingleConstraintRowLowerLimit(b3SolverBody& bodyA, b3SolverBody& bodyB, const b3SolverConstraint& contactConstraint);
+
+	void resolveSingleConstraintRowLowerLimitSIMD(b3SolverBody& bodyA, b3SolverBody& bodyB, const b3SolverConstraint& contactConstraint);
+
+protected:
+	virtual b3Scalar solveGroupCacheFriendlySetup(b3RigidBodyData* bodies, b3InertiaData* inertias, int numBodies, b3Contact4* manifoldPtr, int numManifolds, b3TypedConstraint** constraints, int numConstraints, const b3ContactSolverInfo& infoGlobal);
+	// Runs the split-impulse pass, then the main iteration loop; returns 0.
+	virtual b3Scalar solveGroupCacheFriendlyIterations(b3TypedConstraint** constraints, int numConstraints, const b3ContactSolverInfo& infoGlobal);
+	virtual void solveGroupCacheFriendlySplitImpulseIterations(b3TypedConstraint** constraints, int numConstraints, const b3ContactSolverInfo& infoGlobal);  // no-op unless infoGlobal.m_splitImpulse is set
+	b3Scalar solveSingleIteration(int iteration, b3TypedConstraint** constraints, int numConstraints, const b3ContactSolverInfo& infoGlobal);
+	// Writes results back to contacts, joints and bodies and empties the temporary pools; returns 0.
+	virtual b3Scalar solveGroupCacheFriendlyFinish(b3RigidBodyData* bodies, b3InertiaData* inertias, int numBodies, const b3ContactSolverInfo& infoGlobal);
+
+public:
+	B3_DECLARE_ALIGNED_ALLOCATOR();
+
+	b3PgsJacobiSolver(bool usePgs);
+	virtual ~b3PgsJacobiSolver();
+
+	//	void	solveContacts(int numBodies, b3RigidBodyData* bodies, b3InertiaData* inertias, int numContacts, b3Contact4* contacts);
+	void solveContacts(int numBodies, b3RigidBodyData* bodies, b3InertiaData* inertias, int numContacts, b3Contact4* contacts, int numConstraints, b3TypedConstraint** constraints);
+
+	b3Scalar solveGroup(b3RigidBodyData* bodies, b3InertiaData* inertias, int numBodies, b3Contact4* manifoldPtr, int numManifolds, b3TypedConstraint** constraints, int numConstraints, const b3ContactSolverInfo& infoGlobal);
+
+	///reset the random seed (the implementation in b3PgsJacobiSolver.cpp only sets m_btSeed2 to 0)
+	virtual void reset();
+
+	unsigned long b3Rand2();
+
+	int b3RandInt2(int n);
+	// Accessors for the row-shuffling seed m_btSeed2.
+	void setRandSeed(unsigned long seed)
+	{
+		m_btSeed2 = seed;
+	}
+	unsigned long getRandSeed() const
+	{
+		return m_btSeed2;
+	}
+};
+
+#endif  //B3_PGS_JACOBI_SOLVER

+ 190 - 0
Dependencies/include/bullet3/Bullet3Dynamics/ConstraintSolver/b3Point2PointConstraint.cpp

@@ -0,0 +1,190 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  https://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include "b3Point2PointConstraint.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h"
+
+#include <new>
+
+/// Creates a point-to-point constraint between the bodies with indices rbA
+/// and rbB, anchored at pivotInA / pivotInB (each given in the local space of
+/// its body).
+/// No ERP/CFM override is active after construction (m_flags == 0). m_erp and
+/// m_cfm are still given a defined value of zero, because getParam() reads
+/// them guarded only by b3AssertConstrParams, whose definition is not visible
+/// here; if that check compiles out, the read would otherwise be of an
+/// uninitialized member.
+b3Point2PointConstraint::b3Point2PointConstraint(int rbA, int rbB, const b3Vector3& pivotInA, const b3Vector3& pivotInB)
+	: b3TypedConstraint(B3_POINT2POINT_CONSTRAINT_TYPE, rbA, rbB), m_pivotInA(pivotInA), m_pivotInB(pivotInB), m_flags(0), m_erp(b3Scalar(0)), m_cfm(b3Scalar(0))
+{
+}
+
+/*
+b3Point2PointConstraint::b3Point2PointConstraint(int  rbA,const b3Vector3& pivotInA)
+:b3TypedConstraint(B3_POINT2POINT_CONSTRAINT_TYPE,rbA),m_pivotInA(pivotInA),m_pivotInB(rbA.getCenterOfMassTransform()(pivotInA)),
+m_flags(0),
+m_useSolveConstraintObsolete(false)
+{
+	
+}
+*/
+
+void b3Point2PointConstraint::getInfo1(b3ConstraintInfo1* info, const b3RigidBodyData* bodies)  // virtual entry point; forwards both arguments unchanged to getInfo1NonVirtual
+{
+	getInfo1NonVirtual(info, bodies);
+}
+
+void b3Point2PointConstraint::getInfo1NonVirtual(b3ConstraintInfo1* info, const b3RigidBodyData* bodies)  // reports the constraint size: always 3 rows; bodies is not read
+{
+	info->m_numConstraintRows = 3;  // getInfo2NonVirtual fills exactly three rows, one per axis
+	info->nub = 3;  // presumably the number of unbounded rows - TODO confirm against b3ConstraintInfo1
+}
+
+/// Builds a world transform for each attached body from its stored position
+/// (m_pos) and orientation (m_quat) in the bodies array, then hands both to
+/// getInfo2NonVirtual to fill the constraint rows.
+void b3Point2PointConstraint::getInfo2(b3ConstraintInfo2* info, const b3RigidBodyData* bodies)
+{
+	const b3RigidBodyData& first = bodies[m_rbA];
+	const b3RigidBodyData& second = bodies[m_rbB];
+
+	b3Transform worldA;
+	worldA.setIdentity();
+	worldA.setOrigin(first.m_pos);
+	worldA.setRotation(first.m_quat);
+
+	b3Transform worldB;
+	worldB.setIdentity();
+	worldB.setOrigin(second.m_pos);
+	worldB.setRotation(second.m_quat);
+
+	getInfo2NonVirtual(info, worldA, worldB);
+}
+
+/// Fills the three rows of this constraint in info.
+/// Linear Jacobians are +identity for body A and -identity for body B (the
+/// latter only when info->m_J2linearAxis is non-null); angular Jacobians are
+/// the skew-symmetric matrices of the negated world-space pivot arm of A and
+/// of the world-space pivot arm of B. The right-hand side is the pivot
+/// separation scaled by fps * ERP, where ERP is m_erp when B3_P2P_FLAGS_ERP is
+/// set and info->erp otherwise. CFM is written only when B3_P2P_FLAGS_CFM is
+/// set, and impulse limits only when m_setting.m_impulseClamp is positive.
+void b3Point2PointConstraint::getInfo2NonVirtual(b3ConstraintInfo2* info, const b3Transform& body0_trans, const b3Transform& body1_trans)
+{
+	// Body A, linear part: identity, one diagonal entry per row.
+	for (int row = 0; row < 3; ++row)
+	{
+		info->m_J1linearAxis[row * info->rowskip + row] = 1;
+	}
+
+	// Pivot arm of A rotated into world orientation.
+	b3Vector3 a1 = body0_trans.getBasis() * getPivotInA();
+
+	{
+		b3Vector3* rowX = (b3Vector3*)(info->m_J1angularAxis);
+		b3Vector3* rowY = (b3Vector3*)(info->m_J1angularAxis + info->rowskip);
+		b3Vector3* rowZ = (b3Vector3*)(info->m_J1angularAxis + 2 * info->rowskip);
+		b3Vector3 negatedArm = -a1;
+		negatedArm.getSkewSymmetricMatrix(rowX, rowY, rowZ);
+	}
+
+	// Body B, linear part: negative identity (optional output).
+	if (info->m_J2linearAxis)
+	{
+		for (int row = 0; row < 3; ++row)
+		{
+			info->m_J2linearAxis[row * info->rowskip + row] = -1;
+		}
+	}
+
+	// Pivot arm of B rotated into world orientation.
+	b3Vector3 a2 = body1_trans.getBasis() * getPivotInB();
+
+	{
+		b3Vector3* rowX = (b3Vector3*)(info->m_J2angularAxis);
+		b3Vector3* rowY = (b3Vector3*)(info->m_J2angularAxis + info->rowskip);
+		b3Vector3* rowZ = (b3Vector3*)(info->m_J2angularAxis + 2 * info->rowskip);
+		a2.getSkewSymmetricMatrix(rowX, rowY, rowZ);
+	}
+
+	// Right-hand side: positional error scaled by fps * ERP.
+	b3Scalar activeErp;
+	if (m_flags & B3_P2P_FLAGS_ERP)
+	{
+		activeErp = m_erp;
+	}
+	else
+	{
+		activeErp = info->erp;
+	}
+	b3Scalar gain = info->fps * activeErp;
+	for (int axis = 0; axis < 3; ++axis)
+	{
+		info->m_constraintError[axis * info->rowskip] = gain * (a2[axis] + body1_trans.getOrigin()[axis] - a1[axis] - body0_trans.getOrigin()[axis]);
+	}
+
+	// Per-constraint CFM override.
+	if (m_flags & B3_P2P_FLAGS_CFM)
+	{
+		for (int axis = 0; axis < 3; ++axis)
+		{
+			info->cfm[axis * info->rowskip] = m_cfm;
+		}
+	}
+
+	// Symmetric impulse limits, written only for a positive clamp value.
+	b3Scalar clampValue = m_setting.m_impulseClamp;
+	if (m_setting.m_impulseClamp > 0)
+	{
+		for (int axis = 0; axis < 3; ++axis)
+		{
+			info->m_lowerLimit[axis * info->rowskip] = -clampValue;
+			info->m_upperLimit[axis * info->rowskip] = clampValue;
+		}
+	}
+
+	info->m_damping = m_setting.m_damping;
+}
+
+void b3Point2PointConstraint::updateRHS(b3Scalar timeStep)  // no-op: the body only discards its argument
+{
+	(void)timeStep;  // cast to void so the unused parameter is referenced
+}
+
+///override the default global value of a parameter (ERP or CFM) for this constraint.
+///Only the default axis (axis == -1) is accepted: any other axis, or a parameter id
+///other than the ERP/CFM ones, ends in b3AssertConstrParams(0) and stores nothing.
+///Storing a value also sets the matching B3_P2P_FLAGS_* bit in m_flags.
+void b3Point2PointConstraint::setParam(int num, b3Scalar value, int axis)
+{
+	if (axis != -1)
+	{
+		b3AssertConstrParams(0);
+		return;
+	}
+
+	if (num == B3_CONSTRAINT_ERP || num == B3_CONSTRAINT_STOP_ERP)
+	{
+		m_erp = value;
+		m_flags |= B3_P2P_FLAGS_ERP;
+	}
+	else if (num == B3_CONSTRAINT_CFM || num == B3_CONSTRAINT_STOP_CFM)
+	{
+		m_cfm = value;
+		m_flags |= B3_P2P_FLAGS_CFM;
+	}
+	else
+	{
+		b3AssertConstrParams(0);
+	}
+}
+
+///return the local value of parameter
+///Yields m_erp or m_cfm for the ERP/CFM parameter ids, after asserting that the
+///matching override flag is set. For a non-default axis or any other parameter id
+///it asserts via b3AssertConstrParams(0) and returns B3_INFINITY.
+b3Scalar b3Point2PointConstraint::getParam(int num, int axis) const
+{
+	if (axis != -1)
+	{
+		b3AssertConstrParams(0);
+		return b3Scalar(B3_INFINITY);
+	}
+
+	if (num == B3_CONSTRAINT_ERP || num == B3_CONSTRAINT_STOP_ERP)
+	{
+		b3AssertConstrParams(m_flags & B3_P2P_FLAGS_ERP);
+		return m_erp;
+	}
+
+	if (num == B3_CONSTRAINT_CFM || num == B3_CONSTRAINT_STOP_CFM)
+	{
+		b3AssertConstrParams(m_flags & B3_P2P_FLAGS_CFM);
+		return m_cfm;
+	}
+
+	b3AssertConstrParams(0);
+	return b3Scalar(B3_INFINITY);
+}

+ 153 - 0
Dependencies/include/bullet3/Bullet3Dynamics/ConstraintSolver/b3Point2PointConstraint.h

@@ -0,0 +1,153 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  https://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef B3_POINT2POINTCONSTRAINT_H
+#define B3_POINT2POINTCONSTRAINT_H
+
+#include "Bullet3Common/b3Vector3.h"
+//#include "b3JacobianEntry.h"
+#include "b3TypedConstraint.h"
+
+class b3RigidBody;
+
+#ifdef B3_USE_DOUBLE_PRECISION
+#define b3Point2PointConstraintData b3Point2PointConstraintDoubleData
+#define b3Point2PointConstraintDataName "b3Point2PointConstraintDoubleData"
+#else
+#define b3Point2PointConstraintData b3Point2PointConstraintFloatData
+#define b3Point2PointConstraintDataName "b3Point2PointConstraintFloatData"
+#endif  //B3_USE_DOUBLE_PRECISION
+
+struct b3ConstraintSetting  // tuning values carried by b3Point2PointConstraint in its public m_setting member
+{
+	b3ConstraintSetting() : m_tau(b3Scalar(0.3)),  // defaults: tau 0.3, damping 1, impulse clamp 0
+							m_damping(b3Scalar(1.)),
+							m_impulseClamp(b3Scalar(0.))
+	{
+	}
+	b3Scalar m_tau;  // not read by the b3Point2PointConstraint code visible in b3Point2PointConstraint.cpp
+	b3Scalar m_damping;  // copied to info->m_damping in getInfo2NonVirtual
+	b3Scalar m_impulseClamp;  // when > 0, getInfo2NonVirtual writes it as a symmetric lower/upper limit on every row
+};
+
+/// Bit flags kept in b3Point2PointConstraint::m_flags; each bit marks that the
+/// matching per-constraint override (m_erp / m_cfm) has been set via setParam.
+enum b3Point2PointFlags
+{
+	B3_P2P_FLAGS_ERP = 1 << 0,
+	B3_P2P_FLAGS_CFM = 1 << 1
+};
+
+/// point to point constraint between two rigidbodies each with a pivotpoint that describes the 'ballsocket' location in local space
+B3_ATTRIBUTE_ALIGNED16(class)
+b3Point2PointConstraint : public b3TypedConstraint
+{
+#ifdef IN_PARALLELL_SOLVER
+public:  // the data members below are private unless IN_PARALLELL_SOLVER is defined
+#endif
+	// Pivot of each body; rotated by the body's basis in getInfo2NonVirtual to get the world-space arm.
+	b3Vector3 m_pivotInA;
+	b3Vector3 m_pivotInB;
+
+	int m_flags;  // bitmask of b3Point2PointFlags; set to 0 by the constructor
+	b3Scalar m_erp;  // used instead of info->erp only while B3_P2P_FLAGS_ERP is set
+	b3Scalar m_cfm;  // written to info->cfm only while B3_P2P_FLAGS_CFM is set
+
+public:
+	B3_DECLARE_ALIGNED_ALLOCATOR();
+
+	b3ConstraintSetting m_setting;  // damping and impulse clamp read by getInfo2NonVirtual
+	// rbA / rbB are indices into the rigid body array later passed to getInfo2.
+	b3Point2PointConstraint(int rbA, int rbB, const b3Vector3& pivotInA, const b3Vector3& pivotInB);
+
+	//b3Point2PointConstraint(int  rbA,const b3Vector3& pivotInA);
+
+	virtual void getInfo1(b3ConstraintInfo1 * info, const b3RigidBodyData* bodies);  // forwards to getInfo1NonVirtual
+
+	void getInfo1NonVirtual(b3ConstraintInfo1 * info, const b3RigidBodyData* bodies);  // always reports 3 rows
+
+	virtual void getInfo2(b3ConstraintInfo2 * info, const b3RigidBodyData* bodies);  // builds both body transforms, then calls getInfo2NonVirtual
+
+	void getInfo2NonVirtual(b3ConstraintInfo2 * info, const b3Transform& body0_trans, const b3Transform& body1_trans);  // fills Jacobians, error term, optional CFM and impulse limits
+
+	void updateRHS(b3Scalar timeStep);  // no-op in the implementation file
+
+	void setPivotA(const b3Vector3& pivotA)
+	{
+		m_pivotInA = pivotA;
+	}
+
+	void setPivotB(const b3Vector3& pivotB)
+	{
+		m_pivotInB = pivotB;
+	}
+
+	const b3Vector3& getPivotInA() const
+	{
+		return m_pivotInA;
+	}
+
+	const b3Vector3& getPivotInB() const
+	{
+		return m_pivotInB;
+	}
+
+	///override the default global value of a parameter (ERP or CFM).
+	///The implementation only accepts the default axis (-1); any other axis triggers b3AssertConstrParams(0).
+	virtual void setParam(int num, b3Scalar value, int axis = -1);
+	///return the local value of parameter
+	virtual b3Scalar getParam(int num, int axis = -1) const;
+
+	//	virtual	int	calculateSerializeBufferSize() const;
+
+	///fills the dataBuffer and returns the struct name (and 0 on failure)
+	//	virtual	const char*	serialize(void* dataBuffer, b3Serializer* serializer) const;
+};
+
+///do not change these serialization structures: changing them requires an updated sBulletDNAstr/sBulletDNAstr64
+struct b3Point2PointConstraintFloatData  // selected as b3Point2PointConstraintData when B3_USE_DOUBLE_PRECISION is not defined
+{
+	b3TypedConstraintData m_typeConstraintData;
+	b3Vector3FloatData m_pivotInA;
+	b3Vector3FloatData m_pivotInB;
+};
+
+///do not change these serialization structures: changing them requires an updated sBulletDNAstr/sBulletDNAstr64
+struct b3Point2PointConstraintDoubleData  // selected as b3Point2PointConstraintData when B3_USE_DOUBLE_PRECISION is defined
+{
+	b3TypedConstraintData m_typeConstraintData;
+	b3Vector3DoubleData m_pivotInA;
+	b3Vector3DoubleData m_pivotInB;
+};
+
+/*
+B3_FORCE_INLINE	int	b3Point2PointConstraint::calculateSerializeBufferSize() const
+{
+	return sizeof(b3Point2PointConstraintData);
+
+}
+
+	///fills the dataBuffer and returns the struct name (and 0 on failure)
+B3_FORCE_INLINE	const char*	b3Point2PointConstraint::serialize(void* dataBuffer, b3Serializer* serializer) const
+{
+	b3Point2PointConstraintData* p2pData = (b3Point2PointConstraintData*)dataBuffer;
+
+	b3TypedConstraint::serialize(&p2pData->m_typeConstraintData,serializer);
+	m_pivotInA.serialize(p2pData->m_pivotInA);
+	m_pivotInB.serialize(p2pData->m_pivotInB);
+
+	return b3Point2PointConstraintDataName;
+}
+*/
+
+#endif  //B3_POINT2POINTCONSTRAINT_H

+ 281 - 0
Dependencies/include/bullet3/Bullet3Dynamics/ConstraintSolver/b3SolverBody.h

@@ -0,0 +1,281 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  https://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef B3_SOLVER_BODY_H
+#define B3_SOLVER_BODY_H
+
+#include "Bullet3Common/b3Vector3.h"
+#include "Bullet3Common/b3Matrix3x3.h"
+
+#include "Bullet3Common/b3AlignedAllocator.h"
+#include "Bullet3Common/b3TransformUtil.h"
+
+///Until we get other contributions, only use SIMD on Windows, when using Visual Studio 2008 or later, and not double precision
+#ifdef B3_USE_SSE
+#define USE_SIMD 1
+#endif  //
+
+#ifdef USE_SIMD
+
+struct b3SimdScalar  // wrapper around one __m128 register; only compiled when USE_SIMD is defined, otherwise b3SimdScalar is a macro for b3Scalar
+{
+	B3_FORCE_INLINE b3SimdScalar()  // leaves the register contents uninitialized
+	{
+	}
+
+	B3_FORCE_INLINE b3SimdScalar(float fl)  // implicit conversion from float; _mm_set1_ps copies fl into all four lanes
+		: m_vec128(_mm_set1_ps(fl))
+	{
+	}
+
+	B3_FORCE_INLINE b3SimdScalar(__m128 v128)
+		: m_vec128(v128)
+	{
+	}
+	union {  // every member below overlays the same storage
+		__m128 m_vec128;
+		float m_floats[4];
+		float x, y, z, w;  // four separate union members, so x, y, z and w all alias m_floats[0], not four lanes
+		int m_ints[4];
+		b3Scalar m_unusedPadding;
+	};
+	B3_FORCE_INLINE __m128 get128()
+	{
+		return m_vec128;
+	}
+
+	B3_FORCE_INLINE const __m128 get128() const
+	{
+		return m_vec128;
+	}
+
+	B3_FORCE_INLINE void set128(__m128 v128)
+	{
+		m_vec128 = v128;
+	}
+
+	B3_FORCE_INLINE operator __m128()
+	{
+		return m_vec128;
+	}
+	B3_FORCE_INLINE operator const __m128() const
+	{
+		return m_vec128;
+	}
+
+	B3_FORCE_INLINE operator float() const  // scalar view: returns the first lane only
+	{
+		return m_floats[0];
+	}
+};
+
+///@brief Return the elementwise (lane-by-lane) product of two b3SimdScalar, computed with _mm_mul_ps
+B3_FORCE_INLINE b3SimdScalar
+operator*(const b3SimdScalar& v1, const b3SimdScalar& v2)
+{
+	return b3SimdScalar(_mm_mul_ps(v1.get128(), v2.get128()));
+}
+
+///@brief Return the elementwise (lane-by-lane) sum of two b3SimdScalar, computed with _mm_add_ps
+B3_FORCE_INLINE b3SimdScalar
+operator+(const b3SimdScalar& v1, const b3SimdScalar& v2)
+{
+	return b3SimdScalar(_mm_add_ps(v1.get128(), v2.get128()));
+}
+
+#else
+#define b3SimdScalar b3Scalar
+#endif
+
+///The b3SolverBody is an internal datastructure for the constraint solver. Only necessary data is packed to increase cache coherence/performance.
+B3_ATTRIBUTE_ALIGNED16(struct)
+b3SolverBody
+{
+	B3_DECLARE_ALIGNED_ALLOCATOR();
+	b3Transform m_worldTransform;  // replaced by writebackVelocityAndTransform when push/turn velocities are non-zero
+	b3Vector3 m_deltaLinearVelocity;  // velocity change accumulated by applyImpulse / internalApplyImpulse
+	b3Vector3 m_deltaAngularVelocity;
+	b3Vector3 m_angularFactor;  // component-wise scale applied to every angular impulse
+	b3Vector3 m_linearFactor;  // component-wise scale applied to every linear impulse
+	b3Vector3 m_invMass;
+	b3Vector3 m_pushVelocity;  // split-impulse linear correction, accumulated by internalApplyPushImpulse
+	b3Vector3 m_turnVelocity;  // split-impulse angular correction
+	b3Vector3 m_linearVelocity;
+	b3Vector3 m_angularVelocity;
+	// NOTE(review): both members share storage while several methods below test m_originalBody for null; whether a body stored as index 0 passes that test depends on the remaining pointer bytes - confirm.
+	union {
+		void* m_originalBody;
+		int m_originalBodyIndex;
+	};
+
+	int padding[3];
+
+	void setWorldTransform(const b3Transform& worldTransform)
+	{
+		m_worldTransform = worldTransform;
+	}
+
+	const b3Transform& getWorldTransform() const
+	{
+		return m_worldTransform;
+	}
+	// Velocity (including pending deltas) of the point at rel_pos; yields zero when m_originalBody is null.
+	B3_FORCE_INLINE void getVelocityInLocalPointObsolete(const b3Vector3& rel_pos, b3Vector3& velocity) const
+	{
+		if (m_originalBody)
+			velocity = m_linearVelocity + m_deltaLinearVelocity + (m_angularVelocity + m_deltaAngularVelocity).cross(rel_pos);
+		else
+			velocity.setValue(0, 0, 0);
+	}
+	// Angular velocity including the pending delta; yields zero when m_originalBody is null.
+	B3_FORCE_INLINE void getAngularVelocity(b3Vector3 & angVel) const
+	{
+		if (m_originalBody)
+			angVel = m_angularVelocity + m_deltaAngularVelocity;
+		else
+			angVel.setValue(0, 0, 0);
+	}
+
+	//Optimization for the iterative solver: avoid calculating constant terms involving inertia, normal, relative position
+	B3_FORCE_INLINE void applyImpulse(const b3Vector3& linearComponent, const b3Vector3& angularComponent, const b3Scalar impulseMagnitude)  // adds the scaled components to the delta velocities; does nothing when m_originalBody is null
+	{
+		if (m_originalBody)
+		{
+			m_deltaLinearVelocity += linearComponent * impulseMagnitude * m_linearFactor;
+			m_deltaAngularVelocity += angularComponent * (impulseMagnitude * m_angularFactor);
+		}
+	}
+	// Same shape as applyImpulse, but accumulates into the push/turn (split-impulse) velocities.
+	B3_FORCE_INLINE void internalApplyPushImpulse(const b3Vector3& linearComponent, const b3Vector3& angularComponent, b3Scalar impulseMagnitude)
+	{
+		if (m_originalBody)
+		{
+			m_pushVelocity += linearComponent * impulseMagnitude * m_linearFactor;
+			m_turnVelocity += angularComponent * (impulseMagnitude * m_angularFactor);
+		}
+	}
+
+	const b3Vector3& getDeltaLinearVelocity() const
+	{
+		return m_deltaLinearVelocity;
+	}
+
+	const b3Vector3& getDeltaAngularVelocity() const
+	{
+		return m_deltaAngularVelocity;
+	}
+
+	const b3Vector3& getPushVelocity() const
+	{
+		return m_pushVelocity;
+	}
+
+	const b3Vector3& getTurnVelocity() const
+	{
+		return m_turnVelocity;
+	}
+
+	////////////////////////////////////////////////
+	///some internal methods, don't use them
+	// The internal* accessors below return mutable references or skip the m_originalBody check.
+	b3Vector3& internalGetDeltaLinearVelocity()
+	{
+		return m_deltaLinearVelocity;
+	}
+
+	b3Vector3& internalGetDeltaAngularVelocity()
+	{
+		return m_deltaAngularVelocity;
+	}
+
+	const b3Vector3& internalGetAngularFactor() const
+	{
+		return m_angularFactor;
+	}
+
+	const b3Vector3& internalGetInvMass() const
+	{
+		return m_invMass;
+	}
+
+	void internalSetInvMass(const b3Vector3& invMass)
+	{
+		m_invMass = invMass;
+	}
+
+	b3Vector3& internalGetPushVelocity()
+	{
+		return m_pushVelocity;
+	}
+
+	b3Vector3& internalGetTurnVelocity()
+	{
+		return m_turnVelocity;
+	}
+	// Unchecked variant of getVelocityInLocalPointObsolete: no m_originalBody test.
+	B3_FORCE_INLINE void internalGetVelocityInLocalPointObsolete(const b3Vector3& rel_pos, b3Vector3& velocity) const
+	{
+		velocity = m_linearVelocity + m_deltaLinearVelocity + (m_angularVelocity + m_deltaAngularVelocity).cross(rel_pos);
+	}
+	// Unchecked variant of getAngularVelocity.
+	B3_FORCE_INLINE void internalGetAngularVelocity(b3Vector3 & angVel) const
+	{
+		angVel = m_angularVelocity + m_deltaAngularVelocity;
+	}
+
+	//Optimization for the iterative solver: avoid calculating constant terms involving inertia, normal, relative position
+	B3_FORCE_INLINE void internalApplyImpulse(const b3Vector3& linearComponent, const b3Vector3& angularComponent, const b3Scalar impulseMagnitude)  // unchecked variant of applyImpulse: the m_originalBody test is commented out
+	{
+		//if (m_originalBody)
+		{
+			m_deltaLinearVelocity += linearComponent * impulseMagnitude * m_linearFactor;
+			m_deltaAngularVelocity += angularComponent * (impulseMagnitude * m_angularFactor);
+		}
+	}
+	// Folds the accumulated deltas into the velocities unconditionally; the deltas themselves are not cleared here.
+	void writebackVelocity()
+	{
+		//if (m_originalBody>=0)
+		{
+			m_linearVelocity += m_deltaLinearVelocity;
+			m_angularVelocity += m_deltaAngularVelocity;
+
+			//m_originalBody->setCompanionId(-1);
+		}
+	}
+	// Like writebackVelocity but only when m_originalBody is non-null, and additionally integrates the transform by the push/turn velocities.
+	void writebackVelocityAndTransform(b3Scalar timeStep, b3Scalar splitImpulseTurnErp)
+	{
+		(void)timeStep;
+		if (m_originalBody)
+		{
+			m_linearVelocity += m_deltaLinearVelocity;
+			m_angularVelocity += m_deltaAngularVelocity;
+
+			//correct the position/orientation based on push/turn recovery
+			b3Transform newTransform;
+			if (m_pushVelocity[0] != 0.f || m_pushVelocity[1] != 0 || m_pushVelocity[2] != 0 || m_turnVelocity[0] != 0.f || m_turnVelocity[1] != 0 || m_turnVelocity[2] != 0)  // integrate only when some push/turn component is non-zero
+			{
+				//	b3Quaternion orn = m_worldTransform.getRotation();
+				b3TransformUtil::integrateTransform(m_worldTransform, m_pushVelocity, m_turnVelocity * splitImpulseTurnErp, timeStep, newTransform);  // the turn velocity is scaled by splitImpulseTurnErp before integration
+				m_worldTransform = newTransform;
+			}
+			//m_worldTransform.setRotation(orn);
+			//m_originalBody->setCompanionId(-1);
+		}
+	}
+};
+
+#endif  //B3_SOLVER_BODY_H

+ 73 - 0
Dependencies/include/bullet3/Bullet3Dynamics/ConstraintSolver/b3SolverConstraint.h

@@ -0,0 +1,73 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  https://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef B3_SOLVER_CONSTRAINT_H
+#define B3_SOLVER_CONSTRAINT_H
+
+#include "Bullet3Common/b3Vector3.h"
+#include "Bullet3Common/b3Matrix3x3.h"
+//#include "b3JacobianEntry.h"
+#include "Bullet3Common/b3AlignedObjectArray.h"
+
+//#define NO_FRICTION_TANGENTIALS 1
+#include "b3SolverBody.h"
+
+///1D constraint along a normal axis between bodyA and bodyB. It can be combined to solve contact and friction constraints.
+B3_ATTRIBUTE_ALIGNED16(struct)
+b3SolverConstraint
+{
+	B3_DECLARE_ALIGNED_ALLOCATOR();
+
+	b3Vector3 m_relpos1CrossNormal;  // presumably (relative position on body A) x normal — confirm against the solver setup code
+	b3Vector3 m_contactNormal;
+
+	b3Vector3 m_relpos2CrossNormal;  // counterpart of m_relpos1CrossNormal for body B (same caveat)
+	//b3Vector3		m_contactNormal2;//usually m_contactNormal2 == -m_contactNormal
+
+	b3Vector3 m_angularComponentA;
+	b3Vector3 m_angularComponentB;
+
+	mutable b3SimdScalar m_appliedPushImpulse;  // mutable: can be updated through a const reference to the constraint
+	mutable b3SimdScalar m_appliedImpulse;      // mutable: can be updated through a const reference to the constraint
+	int m_padding1;  // explicit padding; member order/size is part of the struct layout, do not reorder
+	int m_padding2;
+	b3Scalar m_friction;
+	b3Scalar m_jacDiagABInv;
+	b3Scalar m_rhs;
+	b3Scalar m_cfm;
+
+	b3Scalar m_lowerLimit;
+	b3Scalar m_upperLimit;
+	b3Scalar m_rhsPenetration;
+	union {  // the pointer and the scalar share storage; only one of them is meaningful at a time
+		void* m_originalContactPoint;
+		b3Scalar m_unusedPadding4;
+	};
+
+	int m_overrideNumSolverIterations;
+	int m_frictionIndex;
+	int m_solverBodyIdA;  // presumably an index into the solver-body array — confirm against the solver
+	int m_solverBodyIdB;
+
+	enum b3SolverConstraintType  // no member of this struct stores this enum; it only declares the two constants
+	{
+		B3_SOLVER_CONTACT_1D = 0,
+		B3_SOLVER_FRICTION_1D  // == 1
+	};
+};
+
+typedef b3AlignedObjectArray<b3SolverConstraint> b3ConstraintArray;
+
+#endif  //B3_SOLVER_CONSTRAINT_H

+ 151 - 0
Dependencies/include/bullet3/Bullet3Dynamics/ConstraintSolver/b3TypedConstraint.cpp

@@ -0,0 +1,151 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  https://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include "b3TypedConstraint.h"
+//#include "Bullet3Common/b3Serializer.h"
+
+#define B3_DEFAULT_DEBUGDRAW_SIZE b3Scalar(0.3f)
+
+b3TypedConstraint::b3TypedConstraint(b3TypedConstraintType type, int rbA, int rbB)  // rbA/rbB are stored as integer body identifiers, not pointers
+	: b3TypedObject(type),  // the constraint type is forwarded to the b3TypedObject base
+	  m_userConstraintType(-1),
+	  m_userConstraintPtr((void*)-1),  // shares a union with m_userConstraintId, so this also sets the id's storage
+	  m_breakingImpulseThreshold(B3_INFINITY),  // infinite threshold by default
+	  m_isEnabled(true),
+	  m_needsFeedback(false),
+	  m_overrideNumSolverIterations(-1),  // -1 selects the default iteration count (see setOverrideNumSolverIterations)
+	  m_rbA(rbA),
+	  m_rbB(rbB),
+	  m_appliedImpulse(b3Scalar(0.)),
+	  m_dbgDrawSize(B3_DEFAULT_DEBUGDRAW_SIZE),
+	  m_jointFeedback(0)  // no feedback struct attached until setJointFeedback() is called
+{
+}
+
+/// Returns a factor that scales a motor velocity depending on where pos lies relative to [lowLim, uppLim].
+/// - lowLim > uppLim: returns 1.
+/// - lowLim == uppLim: returns 0.
+/// - Otherwise the per-step displacement vel / timeFact decides: 0 when pos is already past the limit being
+///   approached, a ratio of the remaining distance to the displacement when pos is within one step of that
+///   limit, and 1 when it is further away. A zero displacement yields 0.
+b3Scalar b3TypedConstraint::getMotorFactor(b3Scalar pos, b3Scalar lowLim, b3Scalar uppLim, b3Scalar vel, b3Scalar timeFact)
+{
+	const b3Scalar fullFactor = b3Scalar(1.0f);
+	const b3Scalar zeroFactor = b3Scalar(0.0f);
+
+	if (lowLim > uppLim)
+	{
+		return fullFactor;
+	}
+	if (lowLim == uppLim)
+	{
+		return zeroFactor;
+	}
+
+	const b3Scalar stepDelta = vel / timeFact;
+
+	if (stepDelta < zeroFactor)
+	{
+		// Moving toward the lower limit.
+		if ((pos >= lowLim) && (pos < (lowLim - stepDelta)))
+		{
+			return (lowLim - pos) / stepDelta;
+		}
+		return (pos < lowLim) ? zeroFactor : fullFactor;
+	}
+
+	if (stepDelta > zeroFactor)
+	{
+		// Moving toward the upper limit.
+		if ((pos <= uppLim) && (pos > (uppLim - stepDelta)))
+		{
+			return (uppLim - pos) / stepDelta;
+		}
+		return (pos > uppLim) ? zeroFactor : fullFactor;
+	}
+
+	// No displacement this step.
+	return zeroFactor;
+}
+
+/// Stores the limit as a normalized center angle plus a half range, together with the three tuning factors.
+/// A negative half range (low > high) makes test() and fit() leave the limit inactive.
+void b3AngularLimit::set(b3Scalar low, b3Scalar high, b3Scalar _softness, b3Scalar _biasFactor, b3Scalar _relaxationFactor)
+{
+	const b3Scalar halfRange = (high - low) / 2.0f;
+	m_halfRange = halfRange;
+	m_center = b3NormalizeAngle(low + halfRange);
+
+	m_relaxationFactor = _relaxationFactor;
+	m_biasFactor = _biasFactor;
+	m_softness = _softness;
+}
+
+/// Checks angle against the limit and records the outcome in m_solveLimit, m_correction and m_sign.
+/// The three outputs are always reset first; they stay at zero/false when the half range is negative
+/// or when the angle lies within the range.
+void b3AngularLimit::test(const b3Scalar angle)
+{
+	m_solveLimit = false;
+	m_sign = 0.0f;
+	m_correction = 0.0f;
+
+	if (!(m_halfRange >= 0.0f))
+	{
+		return;
+	}
+
+	const b3Scalar offsetFromCenter = b3NormalizeAngle(angle - m_center);
+	if (offsetFromCenter < -m_halfRange)
+	{
+		// Below the lower bound.
+		m_correction = -(offsetFromCenter + m_halfRange);
+		m_sign = +1.0f;
+		m_solveLimit = true;
+	}
+	else if (offsetFromCenter > m_halfRange)
+	{
+		// Above the upper bound.
+		m_correction = m_halfRange - offsetFromCenter;
+		m_sign = -1.0f;
+		m_solveLimit = true;
+	}
+}
+
+/// Returns the correction computed by the last test() call multiplied by its sign.
+b3Scalar b3AngularLimit::getError() const
+{
+	const b3Scalar signedCorrection = m_correction * m_sign;
+	return signedCorrection;
+}
+
+/// Replaces angle with getHigh() or getLow() when the limit has a positive half range and the
+/// b3Equal(relativeAngle, m_halfRange) check fails; a positive relative angle selects the high limit.
+/// angle is left untouched otherwise.
+void b3AngularLimit::fit(b3Scalar& angle) const
+{
+	if (!(m_halfRange > 0.0f))
+	{
+		return;
+	}
+
+	const b3Scalar offsetFromCenter = b3NormalizeAngle(angle - m_center);
+	if (b3Equal(offsetFromCenter, m_halfRange))
+	{
+		return;
+	}
+
+	angle = (offsetFromCenter > 0.0f) ? getHigh() : getLow();
+}
+
+/// Returns the lower limit angle: center minus half range, normalized.
+b3Scalar b3AngularLimit::getLow() const
+{
+	const b3Scalar lowerBound = m_center - m_halfRange;
+	return b3NormalizeAngle(lowerBound);
+}
+
+/// Returns the upper limit angle: center plus half range, normalized.
+b3Scalar b3AngularLimit::getHigh() const
+{
+	const b3Scalar upperBound = m_center + m_halfRange;
+	return b3NormalizeAngle(upperBound);
+}

+ 469 - 0
Dependencies/include/bullet3/Bullet3Dynamics/ConstraintSolver/b3TypedConstraint.h

@@ -0,0 +1,469 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2010 Erwin Coumans  https://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef B3_TYPED_CONSTRAINT_H
+#define B3_TYPED_CONSTRAINT_H
+
+#include "Bullet3Common/b3Scalar.h"
+#include "b3SolverConstraint.h"
+
+class b3Serializer;
+
+//Don't change any of the existing enum values; add new enum types at the end (before B3_MAX_CONSTRAINT_TYPE) for serialization compatibility
+enum b3TypedConstraintType
+{
+	B3_POINT2POINT_CONSTRAINT_TYPE = 3,  // numbering starts at 3; the following values are sequential
+	B3_HINGE_CONSTRAINT_TYPE,            // 4
+	B3_CONETWIST_CONSTRAINT_TYPE,        // 5
+	B3_D6_CONSTRAINT_TYPE,               // 6
+	B3_SLIDER_CONSTRAINT_TYPE,           // 7
+	B3_CONTACT_CONSTRAINT_TYPE,          // 8
+	B3_D6_SPRING_CONSTRAINT_TYPE,        // 9
+	B3_GEAR_CONSTRAINT_TYPE,             // 10
+	B3_FIXED_CONSTRAINT_TYPE,            // 11
+	B3_MAX_CONSTRAINT_TYPE               // 12, one past the last real type
+};
+
+enum b3ConstraintParams  // parameter selectors; presumably the `num` argument of setParam()/getParam() — confirm in the concrete constraints
+{
+	B3_CONSTRAINT_ERP = 1,   // 1
+	B3_CONSTRAINT_STOP_ERP,  // 2
+	B3_CONSTRAINT_CFM,       // 3
+	B3_CONSTRAINT_STOP_CFM   // 4
+};
+
+#if 1  // parameter assertions enabled: b3AssertConstrParams forwards to b3Assert
+#define b3AssertConstrParams(_par) b3Assert(_par)
+#else  // change the condition above to 0 to compile the parameter assertions out
+#define b3AssertConstrParams(_par)
+#endif
+
+B3_ATTRIBUTE_ALIGNED16(struct)  // plain data holder attached to a constraint via b3TypedConstraint::setJointFeedback()
+b3JointFeedback
+{
+	b3Vector3 m_appliedForceBodyA;   // NOTE(review): presumably filled in by the solver; nothing in this header writes these fields
+	b3Vector3 m_appliedTorqueBodyA;
+	b3Vector3 m_appliedForceBodyB;
+	b3Vector3 m_appliedTorqueBodyB;
+};
+
+struct b3RigidBodyData;
+
+///TypedConstraint is the baseclass for Bullet constraints and vehicles.
+///Bodies are referenced by integer identifiers (m_rbA / m_rbB); subclasses must implement getInfo1, getInfo2, setParam and getParam.
+B3_ATTRIBUTE_ALIGNED16(class)
+b3TypedConstraint : public b3TypedObject
+{
+	int m_userConstraintType;
+
+	union {  // id and pointer share storage: writing one overwrites the other
+		int m_userConstraintId;
+		void* m_userConstraintPtr;
+	};
+
+	b3Scalar m_breakingImpulseThreshold;
+	bool m_isEnabled;
+	bool m_needsFeedback;
+	int m_overrideNumSolverIterations;
+
+	b3TypedConstraint& operator=(b3TypedConstraint& other)  // private: assignment is not supported and asserts if it is ever reached
+	{
+		b3Assert(0);
+		(void)other;
+		return *this;
+	}
+
+protected:
+	int m_rbA;
+	int m_rbB;
+	b3Scalar m_appliedImpulse;
+	b3Scalar m_dbgDrawSize;
+	b3JointFeedback* m_jointFeedback;  // not deleted by this class's destructor
+
+	///internal method used by the constraint solver, don't use them directly
+	b3Scalar getMotorFactor(b3Scalar pos, b3Scalar lowLim, b3Scalar uppLim, b3Scalar vel, b3Scalar timeFact);
+
+public:
+	B3_DECLARE_ALIGNED_ALLOCATOR();
+
+	virtual ~b3TypedConstraint(){};
+	b3TypedConstraint(b3TypedConstraintType type, int bodyA, int bodyB);
+
+	struct b3ConstraintInfo1
+	{
+		int m_numConstraintRows, nub;
+	};
+
+	struct b3ConstraintInfo2
+	{
+		// integrator parameters: frames per second (1/stepsize), default error
+		// reduction parameter (0..1).
+		b3Scalar fps, erp;
+
+		// for the first and second body, pointers to two (linear and angular)
+		// n*3 jacobian sub matrices, stored by rows. these matrices will have
+		// been initialized to 0 on entry. if the second body is zero then the
+		// J2xx pointers may be 0.
+		b3Scalar *m_J1linearAxis, *m_J1angularAxis, *m_J2linearAxis, *m_J2angularAxis;
+
+		// elements to jump from one row to the next in J's
+		int rowskip;
+
+		// right hand sides of the equation J*v = c + cfm * lambda. cfm is the
+		// "constraint force mixing" vector. c is set to zero on entry, cfm is
+		// set to a constant value (typically very small or zero) value on entry.
+		b3Scalar *m_constraintError, *cfm;
+
+		// lo and hi limits for variables (set to -/+ infinity on entry).
+		b3Scalar *m_lowerLimit, *m_upperLimit;
+
+		// findex vector for variables. see the LCP solver interface for a
+		// description of what this does. this is set to -1 on entry.
+		// note that the returned indexes are relative to the first index of
+		// the constraint.
+		int* findex;
+		// number of solver iterations
+		int m_numIterations;
+
+		//damping of the velocity
+		b3Scalar m_damping;
+	};
+
+	int getOverrideNumSolverIterations() const
+	{
+		return m_overrideNumSolverIterations;
+	}
+
+	///override the number of constraint solver iterations used to solve this constraint
+	///-1 will use the default number of iterations, as specified in SolverInfo.m_numIterations
+	void setOverrideNumSolverIterations(int overideNumIterations)
+	{
+		m_overrideNumSolverIterations = overideNumIterations;
+	}
+
+	///internal method used by the constraint solver, don't use them directly
+	virtual void setupSolverConstraint(b3ConstraintArray & ca, int solverBodyA, int solverBodyB, b3Scalar timeStep)  // default implementation is a no-op
+	{
+		(void)ca;
+		(void)solverBodyA;
+		(void)solverBodyB;
+		(void)timeStep;
+	}
+
+	///internal method used by the constraint solver, don't use them directly
+	virtual void getInfo1(b3ConstraintInfo1 * info, const b3RigidBodyData* bodies) = 0;
+
+	///internal method used by the constraint solver, don't use them directly
+	virtual void getInfo2(b3ConstraintInfo2 * info, const b3RigidBodyData* bodies) = 0;
+
+	///internal method used by the constraint solver, don't use them directly
+	void internalSetAppliedImpulse(b3Scalar appliedImpulse)
+	{
+		m_appliedImpulse = appliedImpulse;
+	}
+	///internal method used by the constraint solver, don't use them directly
+	b3Scalar internalGetAppliedImpulse()  // unlike getAppliedImpulse(), does not assert that feedback is enabled
+	{
+		return m_appliedImpulse;
+	}
+
+	b3Scalar getBreakingImpulseThreshold() const
+	{
+		return m_breakingImpulseThreshold;
+	}
+
+	void setBreakingImpulseThreshold(b3Scalar threshold)
+	{
+		m_breakingImpulseThreshold = threshold;
+	}
+
+	bool isEnabled() const
+	{
+		return m_isEnabled;
+	}
+
+	void setEnabled(bool enabled)
+	{
+		m_isEnabled = enabled;
+	}
+
+	///internal method used by the constraint solver, don't use them directly
+	virtual void solveConstraintObsolete(b3SolverBody& /*bodyA*/, b3SolverBody& /*bodyB*/, b3Scalar /*timeStep*/){};
+
+	int getRigidBodyA() const
+	{
+		return m_rbA;
+	}
+	int getRigidBodyB() const
+	{
+		return m_rbB;
+	}
+
+	int getRigidBodyA()  // non-const overload; returns the same value as the const version
+	{
+		return m_rbA;
+	}
+	int getRigidBodyB()  // non-const overload; returns the same value as the const version
+	{
+		return m_rbB;
+	}
+
+	int getUserConstraintType() const
+	{
+		return m_userConstraintType;
+	}
+
+	void setUserConstraintType(int userConstraintType)
+	{
+		m_userConstraintType = userConstraintType;
+	};
+
+	void setUserConstraintId(int uid)  // shares storage with the user pointer (union)
+	{
+		m_userConstraintId = uid;
+	}
+
+	int getUserConstraintId() const
+	{
+		return m_userConstraintId;
+	}
+
+	void setUserConstraintPtr(void* ptr)  // shares storage with the user id (union)
+	{
+		m_userConstraintPtr = ptr;
+	}
+
+	void* getUserConstraintPtr()
+	{
+		return m_userConstraintPtr;
+	}
+
+	void setJointFeedback(b3JointFeedback * jointFeedback)
+	{
+		m_jointFeedback = jointFeedback;
+	}
+
+	const b3JointFeedback* getJointFeedback() const
+	{
+		return m_jointFeedback;
+	}
+
+	b3JointFeedback* getJointFeedback()
+	{
+		return m_jointFeedback;
+	}
+
+	int getUid() const  // returns the same member as getUserConstraintId()
+	{
+		return m_userConstraintId;
+	}
+
+	bool needsFeedback() const
+	{
+		return m_needsFeedback;
+	}
+
+	///enableFeedback will allow to read the applied linear and angular impulse
+	///use getAppliedImpulse, getAppliedLinearImpulse and getAppliedAngularImpulse to read feedback information
+	void enableFeedback(bool needsFeedback)
+	{
+		m_needsFeedback = needsFeedback;
+	}
+
+	///getAppliedImpulse is an estimated total applied impulse.
+	///This feedback could be used to determine breaking constraints or playing sounds.
+	///Asserts that feedback has been enabled via enableFeedback(true).
+	b3Scalar getAppliedImpulse() const
+	{
+		b3Assert(m_needsFeedback);
+		return m_appliedImpulse;
+	}
+
+	b3TypedConstraintType getConstraintType() const
+	{
+		return b3TypedConstraintType(m_objectType);
+	}
+
+	void setDbgDrawSize(b3Scalar dbgDrawSize)
+	{
+		m_dbgDrawSize = dbgDrawSize;
+	}
+	b3Scalar getDbgDrawSize()
+	{
+		return m_dbgDrawSize;
+	}
+
+	///override the default global value of a parameter (such as ERP or CFM), optionally provide the axis (0..5).
+	///If no axis is provided, it uses the default axis for this constraint.
+	virtual void setParam(int num, b3Scalar value, int axis = -1) = 0;
+
+	///return the local value of parameter
+	virtual b3Scalar getParam(int num, int axis = -1) const = 0;
+
+	//	virtual	int	calculateSerializeBufferSize() const;
+
+	///fills the dataBuffer and returns the struct name (and 0 on failure)
+	//virtual	const char*	serialize(void* dataBuffer, b3Serializer* serializer) const;
+};
+
+// Returns the angle, possibly shifted by one full turn (B3_2_PI), in the range [-B3_2_PI, B3_2_PI],
+// so that it is closest to one of the limits.
+// All arguments should be normalized angles (i.e. in range [-B3_PI, B3_PI]).
+// The angle is returned unchanged when the limits are empty/inverted or when it already lies between them.
+B3_FORCE_INLINE b3Scalar b3AdjustAngleToLimits(b3Scalar angleInRadians, b3Scalar angleLowerLimitInRadians, b3Scalar angleUpperLimitInRadians)
+{
+	if (angleLowerLimitInRadians >= angleUpperLimitInRadians)
+	{
+		return angleInRadians;
+	}
+
+	if (angleInRadians < angleLowerLimitInRadians)
+	{
+		// Below the range: keep the angle if the lower limit is nearer, otherwise wrap up by a full turn.
+		const b3Scalar distToLower = b3Fabs(b3NormalizeAngle(angleLowerLimitInRadians - angleInRadians));
+		const b3Scalar distToUpper = b3Fabs(b3NormalizeAngle(angleUpperLimitInRadians - angleInRadians));
+		if (distToLower < distToUpper)
+		{
+			return angleInRadians;
+		}
+		return angleInRadians + B3_2_PI;
+	}
+
+	if (angleInRadians > angleUpperLimitInRadians)
+	{
+		// Above the range: wrap down by a full turn if the lower limit is nearer, otherwise keep the angle.
+		const b3Scalar distToUpper = b3Fabs(b3NormalizeAngle(angleInRadians - angleUpperLimitInRadians));
+		const b3Scalar distToLower = b3Fabs(b3NormalizeAngle(angleInRadians - angleLowerLimitInRadians));
+		if (distToLower < distToUpper)
+		{
+			return angleInRadians - B3_2_PI;
+		}
+		return angleInRadians;
+	}
+
+	return angleInRadians;
+}
+
+// clang-format off
+///do not change those serialization structures, it requires an updated sBulletDNAstr/sBulletDNAstr64
+struct	b3TypedConstraintData  // field order and types are part of the serialized layout; booleans are stored as int
+{
+	int		m_bodyA;
+	int		m_bodyB;
+	char	*m_name;
+
+	int	m_objectType;
+	int	m_userConstraintType;
+	int	m_userConstraintId;
+	int	m_needsFeedback;
+
+	float	m_appliedImpulse;  // stored as float regardless of the b3Scalar precision
+	float	m_dbgDrawSize;
+
+	int	m_disableCollisionsBetweenLinkedBodies;
+	int	m_overrideNumSolverIterations;
+
+	float	m_breakingImpulseThreshold;
+	int		m_isEnabled;
+	
+};
+
+// clang-format on
+
+/*B3_FORCE_INLINE	int	b3TypedConstraint::calculateSerializeBufferSize() const
+{
+	return sizeof(b3TypedConstraintData);
+}
+*/
+
+class b3AngularLimit  // angular limit stored as a center angle plus a half range; a negative half range means inactive
+{
+private:
+	b3Scalar
+		m_center,
+		m_halfRange,
+		m_softness,
+		m_biasFactor,
+		m_relaxationFactor,
+		m_correction,
+		m_sign;
+
+	bool
+		m_solveLimit;
+
+public:
+	/// Default constructor initializes limit as inactive, allowing free constraint movement
+	b3AngularLimit()
+		: m_center(0.0f),
+		  m_halfRange(-1.0f),  // negative half range: test() and fit() do nothing
+		  m_softness(0.9f),
+		  m_biasFactor(0.3f),
+		  m_relaxationFactor(1.0f),
+		  m_correction(0.0f),
+		  m_sign(0.0f),
+		  m_solveLimit(false)
+	{
+	}
+
+	/// Sets all limit's parameters.
+	/// When low > high the limit becomes inactive.
+	/// When high - low > 2PI the limit is ineffective too, because no angle can exceed the limit
+	void set(b3Scalar low, b3Scalar high, b3Scalar _softness = 0.9f, b3Scalar _biasFactor = 0.3f, b3Scalar _relaxationFactor = 1.0f);
+
+	/// Checks constraint angle against limit. If limit is active and the angle violates the limit
+	/// correction is calculated.
+	void test(const b3Scalar angle);
+
+	/// Returns limit's softness
+	inline b3Scalar getSoftness() const
+	{
+		return m_softness;
+	}
+
+	/// Returns limit's bias factor
+	inline b3Scalar getBiasFactor() const
+	{
+		return m_biasFactor;
+	}
+
+	/// Returns limit's relaxation factor
+	inline b3Scalar getRelaxationFactor() const
+	{
+		return m_relaxationFactor;
+	}
+
+	/// Returns correction value evaluated when test() was invoked
+	inline b3Scalar getCorrection() const
+	{
+		return m_correction;
+	}
+
+	/// Returns sign value evaluated when test() was invoked
+	inline b3Scalar getSign() const
+	{
+		return m_sign;
+	}
+
+	/// Gives half of the distance between min and max limit angle
+	inline b3Scalar getHalfRange() const
+	{
+		return m_halfRange;
+	}
+
+	/// Returns true when the last test() invocation recognized limit violation
+	inline bool isLimit() const
+	{
+		return m_solveLimit;
+	}
+
+	/// Checks given angle against limit. If limit is active and angle doesn't fit it, the angle
+	/// returned is modified so it equals the limit closest to the given angle.
+	void fit(b3Scalar& angle) const;
+
+	/// Returns correction value multiplied by sign value
+	b3Scalar getError() const;
+
+	b3Scalar getLow() const;  // normalized (center - half range)
+
+	b3Scalar getHigh() const;  // normalized (center + half range)
+};
+
+#endif  //B3_TYPED_CONSTRAINT_H

+ 447 - 0
Dependencies/include/bullet3/Bullet3Dynamics/b3CpuRigidBodyPipeline.cpp

@@ -0,0 +1,447 @@
+#include "b3CpuRigidBodyPipeline.h"
+
+#include "Bullet3Dynamics/shared/b3IntegrateTransforms.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h"
+#include "Bullet3Collision/BroadPhaseCollision/b3DynamicBvhBroadphase.h"
+#include "Bullet3Collision/NarrowPhaseCollision/b3Config.h"
+#include "Bullet3Collision/NarrowPhaseCollision/b3CpuNarrowPhase.h"
+#include "Bullet3Collision/BroadPhaseCollision/shared/b3Aabb.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Collidable.h"
+#include "Bullet3Common/b3Vector3.h"
+#include "Bullet3Dynamics/shared/b3ContactConstraint4.h"
+#include "Bullet3Dynamics/shared/b3Inertia.h"
+
+struct b3CpuRigidBodyPipelineInternalData  // private state of b3CpuRigidBodyPipeline, allocated in its constructor
+{
+	b3AlignedObjectArray<b3RigidBodyData> m_rigidBodies;  // appended to by registerPhysicsInstance()
+	b3AlignedObjectArray<b3Inertia> m_inertias;            // NOTE(review): nothing in this file adds entries to this array
+	b3AlignedObjectArray<b3Aabb> m_aabbWorldSpace;         // indexed by body index in updateAabbWorldSpace()
+
+	b3DynamicBvhBroadphase* m_bp;  // passed in by the caller; not deleted by the pipeline destructor
+	b3CpuNarrowPhase* m_np;        // passed in by the caller; not deleted by the pipeline destructor
+	b3Config m_config;             // copy of the config passed to the constructor
+};
+
+/// Allocates the internal data block and stores the narrowphase, broadphase and a copy of the config in it.
+/// The narrowphase/broadphase pointers are only stored, not owned (the destructor does not delete them).
+b3CpuRigidBodyPipeline::b3CpuRigidBodyPipeline(class b3CpuNarrowPhase* narrowphase, struct b3DynamicBvhBroadphase* broadphaseDbvt, const b3Config& config)
+{
+	b3CpuRigidBodyPipelineInternalData* internalData = new b3CpuRigidBodyPipelineInternalData;
+	internalData->m_config = config;
+	internalData->m_bp = broadphaseDbvt;
+	internalData->m_np = narrowphase;
+	m_data = internalData;
+}
+
+b3CpuRigidBodyPipeline::~b3CpuRigidBodyPipeline()  // releases the internal data allocated in the constructor
+{
+	delete m_data;  // the m_bp / m_np pointers inside are raw pointers and are not deleted here
+}
+
+/// Recomputes the world-space AABB of every registered body whose collidable has a valid shape index
+/// and pushes the result to the broadphase. Bodies with a negative shape index are skipped.
+void b3CpuRigidBodyPipeline::updateAabbWorldSpace()
+{
+	for (int bodyIndex = 0; bodyIndex < this->getNumBodies(); bodyIndex++)
+	{
+		const b3RigidBodyData& body = m_data->m_rigidBodies[bodyIndex];
+		b3Float4 position = body.m_pos;
+		b3Quat orientation = body.m_quat;
+
+		b3Collidable& collidable = m_data->m_np->getCollidableCpu(body.m_collidableIdx);
+		const int shapeIndex = collidable.m_shapeIndex;
+		if (shapeIndex < 0)
+		{
+			continue;
+		}
+
+		// Transform the shape's local AABB by the body pose (no extra margin) and update the broadphase proxy.
+		b3Aabb localAabb = m_data->m_np->getLocalSpaceAabb(shapeIndex);
+		b3Aabb& worldAabb = m_data->m_aabbWorldSpace[bodyIndex];
+		float margin = 0.f;
+		b3TransformAabb2(localAabb.m_minVec, localAabb.m_maxVec, margin, position, orientation, &worldAabb.m_minVec, &worldAabb.m_maxVec);
+		m_data->m_bp->setAabb(bodyIndex, worldAabb.m_minVec, worldAabb.m_maxVec, 0);
+	}
+}
+
+void b3CpuRigidBodyPipeline::computeOverlappingPairs()  // asks the broadphase to recompute its overlapping pairs
+{
+	int numPairs = m_data->m_bp->getOverlappingPairCache()->getNumOverlappingPairs();  // NOTE(review): this pre-update count is overwritten below without being read
+	m_data->m_bp->calculateOverlappingPairs();
+	numPairs = m_data->m_bp->getOverlappingPairCache()->getNumOverlappingPairs();
+	printf("numPairs=%d\n", numPairs);  // NOTE(review): prints to stdout on every call — looks like leftover debug output
+}
+
+/// Hands the broadphase's overlapping pairs, the world-space AABBs and the rigid bodies to the narrowphase
+/// so it can compute contacts.
+void b3CpuRigidBodyPipeline::computeContactPoints()
+{
+	b3AlignedObjectArray<b3Int4>& overlappingPairs = m_data->m_bp->getOverlappingPairCache()->getOverlappingPairArray();
+	m_data->m_np->computeContacts(overlappingPairs, m_data->m_aabbWorldSpace, m_data->m_rigidBodies);
+}
+/// Advances the simulation by deltaTime: refreshes the world-space AABBs, recomputes the broadphase pairs,
+/// computes contacts and integrates the body transforms.
+/// NOTE(review): solveContactConstraints() is not invoked from here; the solve stage is only a placeholder.
+void b3CpuRigidBodyPipeline::stepSimulation(float deltaTime)
+{
+	// 1. world-space bounding boxes
+	updateAabbWorldSpace();
+
+	// 2. broadphase
+	computeOverlappingPairs();
+
+	// 3. narrowphase
+	computeContactPoints();
+
+	// 4. contact solve: not performed in this version
+
+	// 5. integration of transforms
+	integrate(deltaTime);
+}
+
+/// Sums the four dot products l0.linVel0 + a0.angVel0 + l1.linVel1 + a1.angVel1 (in that order).
+static inline float b3CalcRelVel(const b3Vector3& l0, const b3Vector3& l1, const b3Vector3& a0, const b3Vector3& a1,
+								 const b3Vector3& linVel0, const b3Vector3& angVel0, const b3Vector3& linVel1, const b3Vector3& angVel1)
+{
+	// Body 0 contributes its linear and angular terms first; body 1's terms are added afterwards,
+	// keeping the original summation order.
+	return (b3Dot(l0, linVel0) + b3Dot(a0, angVel0)) + b3Dot(l1, linVel1) + b3Dot(a1, angVel1);
+}
+
+static inline void b3SetLinearAndAngular(const b3Vector3& n, const b3Vector3& r0, const b3Vector3& r1,  // inputs: direction n and the two offsets r0, r1
+										 b3Vector3& linear, b3Vector3& angular0, b3Vector3& angular1)  // outputs, all overwritten
+{
+	linear = -n;                  // negated direction
+	angular0 = -b3Cross(r0, n);   // negated cross product for the first offset
+	angular1 = b3Cross(r1, n);    // cross product for the second offset (not negated)
+}
+
+static inline void b3SolveContact(b3ContactConstraint4& cs,  // processes up to 4 contact rows of cs, updating both bodies' velocities in place
+								  const b3Vector3& posA, b3Vector3& linVelA, b3Vector3& angVelA, float invMassA, const b3Matrix3x3& invInertiaA,
+								  const b3Vector3& posB, b3Vector3& linVelB, b3Vector3& angVelB, float invMassB, const b3Matrix3x3& invInertiaB,
+								  float maxRambdaDt[4], float minRambdaDt[4])  // per-row clamp bounds for the accumulated impulse
+{
+	b3Vector3 dLinVelA;  // NOTE(review): the four d*Vel* locals are zeroed but never read afterwards
+	dLinVelA.setZero();
+	b3Vector3 dAngVelA;
+	dAngVelA.setZero();
+	b3Vector3 dLinVelB;
+	dLinVelB.setZero();
+	b3Vector3 dAngVelB;
+	dAngVelB.setZero();
+
+	for (int ic = 0; ic < 4; ic++)
+	{
+		//	rows with a zero inverse Jacobian coefficient are skipped: their impulse would be scaled to 0 anyway
+		if (cs.m_jacCoeffInv[ic] == 0.f) continue;
+
+		{
+			b3Vector3 angular0, angular1, linear;
+			b3Vector3 r0 = cs.m_worldPos[ic] - (b3Vector3&)posA;  // offset from body A's position to the contact point
+			b3Vector3 r1 = cs.m_worldPos[ic] - (b3Vector3&)posB;  // offset from body B's position to the contact point
+			b3SetLinearAndAngular((const b3Vector3&)-cs.m_linear, (const b3Vector3&)r0, (const b3Vector3&)r1, linear, angular0, angular1);
+
+			float rambdaDt = b3CalcRelVel((const b3Vector3&)cs.m_linear, (const b3Vector3&)-cs.m_linear, angular0, angular1,
+										  linVelA, angVelA, linVelB, angVelB) +
+							 cs.m_b[ic];
+			rambdaDt *= cs.m_jacCoeffInv[ic];
+
+			{  // clamp the accumulated value to [minRambdaDt, maxRambdaDt] and apply only the difference
+				float prevSum = cs.m_appliedRambdaDt[ic];
+				float updated = prevSum;
+				updated += rambdaDt;
+				updated = b3Max(updated, minRambdaDt[ic]);
+				updated = b3Min(updated, maxRambdaDt[ic]);
+				rambdaDt = updated - prevSum;
+				cs.m_appliedRambdaDt[ic] = updated;
+			}
+
+			b3Vector3 linImp0 = invMassA * linear * rambdaDt;
+			b3Vector3 linImp1 = invMassB * (-linear) * rambdaDt;
+			b3Vector3 angImp0 = (invInertiaA * angular0) * rambdaDt;
+			b3Vector3 angImp1 = (invInertiaB * angular1) * rambdaDt;
+#ifdef _WIN32
+			b3Assert(_finite(linImp0.getX()));  // finite check is only compiled on Windows
+			b3Assert(_finite(linImp1.getX()));
+#endif
+			{
+				linVelA += linImp0;
+				angVelA += angImp0;
+				linVelB += linImp1;
+				angVelB += angImp1;
+			}
+		}
+	}
+}
+
+/// Applies friction for one contact constraint along two tangent directions built from the contact normal,
+/// then damps the angular velocity about the normal in the point-contact case.
+/// Both bodies' linear and angular velocities are updated in place; cs.m_fAppliedRambdaDt accumulates the
+/// clamped friction impulses. maxRambdaDt/minRambdaDt give the clamp bounds, indexed by tangent (0 and 1).
+static inline void b3SolveFriction(b3ContactConstraint4& cs,
+								   const b3Vector3& posA, b3Vector3& linVelA, b3Vector3& angVelA, float invMassA, const b3Matrix3x3& invInertiaA,
+								   const b3Vector3& posB, b3Vector3& linVelB, b3Vector3& angVelB, float invMassB, const b3Matrix3x3& invInertiaB,
+								   float maxRambdaDt[4], float minRambdaDt[4])
+{
+	// Nothing to solve only when BOTH tangent directions have a zero inverse Jacobian coefficient.
+	// (The original tested index 0 twice, which skipped tangent 1 whenever tangent 0 was zero.)
+	if (cs.m_fJacCoeffInv[0] == 0 && cs.m_fJacCoeffInv[1] == 0) return;
+	const b3Vector3& center = (const b3Vector3&)cs.m_center;
+
+	b3Vector3 n = -(const b3Vector3&)cs.m_linear;
+
+	b3Vector3 tangent[2];
+
+	// Build two tangent directions from the normal.
+	b3PlaneSpace1(n, tangent[0], tangent[1]);
+
+	b3Vector3 angular0, angular1, linear;
+	b3Vector3 r0 = center - posA;
+	b3Vector3 r1 = center - posB;
+	for (int i = 0; i < 2; i++)
+	{
+		b3SetLinearAndAngular(tangent[i], r0, r1, linear, angular0, angular1);
+		float rambdaDt = b3CalcRelVel(linear, -linear, angular0, angular1,
+									  linVelA, angVelA, linVelB, angVelB);
+		rambdaDt *= cs.m_fJacCoeffInv[i];
+
+		{
+			// Clamp the accumulated impulse and apply only the difference to the previous sum.
+			float prevSum = cs.m_fAppliedRambdaDt[i];
+			float updated = prevSum;
+			updated += rambdaDt;
+			updated = b3Max(updated, minRambdaDt[i]);
+			updated = b3Min(updated, maxRambdaDt[i]);
+			rambdaDt = updated - prevSum;
+			cs.m_fAppliedRambdaDt[i] = updated;
+		}
+
+		b3Vector3 linImp0 = invMassA * linear * rambdaDt;
+		b3Vector3 linImp1 = invMassB * (-linear) * rambdaDt;
+		b3Vector3 angImp0 = (invInertiaA * angular0) * rambdaDt;
+		b3Vector3 angImp1 = (invInertiaB * angular1) * rambdaDt;
+#ifdef _WIN32
+		b3Assert(_finite(linImp0.getX()));
+		b3Assert(_finite(linImp1.getX()));
+#endif
+		linVelA += linImp0;
+		angVelA += angImp0;
+		linVelB += linImp1;
+		angVelB += angImp1;
+	}
+
+	{  //	angular damping for point constraint
+		b3Vector3 ab = (posB - posA).normalized();
+		b3Vector3 ac = (center - posA).normalized();
+		// Damp when the contact center lies roughly on the line between the bodies, or when either body has zero inverse mass.
+		if (b3Dot(ab, ac) > 0.95f || (invMassA == 0.f || invMassB == 0.f))
+		{
+			float angNA = b3Dot(n, angVelA);
+			float angNB = b3Dot(n, angVelB);
+
+			// Remove 10% of the angular velocity component about the normal.
+			angVelA -= (angNA * 0.1f) * n;
+			angVelB -= (angNB * 0.1f) * n;
+		}
+	}
+}
+
+struct b3SolveTask  // : public ThreadPool::Task — solves a range of contact constraints, batch by batch, for either contact or friction
+{
+	b3SolveTask(b3AlignedObjectArray<b3RigidBodyData>& bodies,
+				b3AlignedObjectArray<b3Inertia>& shapes,
+				b3AlignedObjectArray<b3ContactConstraint4>& constraints,
+				int start, int nConstraints,
+				int maxNumBatches,
+				b3AlignedObjectArray<int>* wgUsedBodies, int curWgidx)
+		: m_bodies(bodies), m_shapes(shapes), m_constraints(constraints), m_wgUsedBodies(wgUsedBodies), m_curWgidx(curWgidx), m_start(start), m_nConstraints(nConstraints), m_solveFriction(true), m_maxNumBatches(maxNumBatches)
+	{
+	}
+
+	unsigned short int getType() { return 0; }
+
+	void run(int tIdx)  // tIdx is not used in the body
+	{
+		b3AlignedObjectArray<int> usedBodies;
+		//printf("run..............\n");
+
+		for (int bb = 0; bb < m_maxNumBatches; bb++)  // one pass over the whole constraint range per batch index
+		{
+			usedBodies.resize(0);
+			for (int ic = m_nConstraints - 1; ic >= 0; ic--)  // constraints are visited in reverse order
+			//for(int ic=0; ic<m_nConstraints; ic++)
+			{
+				int i = m_start + ic;
+				if (m_constraints[i].m_batchIdx != bb)  // only constraints belonging to the current batch are solved
+					continue;
+
+				float frictionCoeff = b3GetFrictionCoeff(&m_constraints[i]);  // NOTE(review): overwritten with 0.7f in the friction branch below
+				int aIdx = (int)m_constraints[i].m_bodyA;
+				int bIdx = (int)m_constraints[i].m_bodyB;
+				//int localBatch = m_constraints[i].m_batchIdx;
+				b3RigidBodyData& bodyA = m_bodies[aIdx];
+				b3RigidBodyData& bodyB = m_bodies[bIdx];
+
+#if 0
+				if ((bodyA.m_invMass) && (bodyB.m_invMass))
+				{
+				//	printf("aIdx=%d, bIdx=%d\n", aIdx,bIdx);
+				}
+				if (bIdx==10)
+				{
+					//printf("ic(b)=%d, localBatch=%d\n",ic,localBatch);
+				}
+#endif
+				if (aIdx == 10)  // leftover debug hook; the body is empty
+				{
+					//printf("ic(a)=%d, localBatch=%d\n",ic,localBatch);
+				}
+				if (usedBodies.size() < (aIdx + 1))  // grow the per-batch usage table so aIdx is a valid index
+				{
+					usedBodies.resize(aIdx + 1, 0);
+				}
+
+				if (usedBodies.size() < (bIdx + 1))  // same for bIdx
+				{
+					usedBodies.resize(bIdx + 1, 0);
+				}
+
+				if (bodyA.m_invMass)  // bodies with non-zero inverse mass must appear at most once per batch
+				{
+					b3Assert(usedBodies[aIdx] == 0);
+					usedBodies[aIdx]++;
+				}
+
+				if (bodyB.m_invMass)
+				{
+					b3Assert(usedBodies[bIdx] == 0);
+					usedBodies[bIdx]++;
+				}
+
+				if (!m_solveFriction)  // contact pass: impulses clamped to [0, FLT_MAX]
+				{
+					float maxRambdaDt[4] = {FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX};
+					float minRambdaDt[4] = {0.f, 0.f, 0.f, 0.f};
+
+					b3SolveContact(m_constraints[i], (b3Vector3&)bodyA.m_pos, (b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass, (const b3Matrix3x3&)m_shapes[aIdx].m_invInertiaWorld,
+								   (b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass, (const b3Matrix3x3&)m_shapes[bIdx].m_invInertiaWorld,
+								   maxRambdaDt, minRambdaDt);
+				}
+				else  // friction pass: impulses clamped to +/- frictionCoeff * (sum of the applied contact impulses)
+				{
+					float maxRambdaDt[4] = {FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX};
+					float minRambdaDt[4] = {0.f, 0.f, 0.f, 0.f};
+
+					float sum = 0;
+					for (int j = 0; j < 4; j++)
+					{
+						sum += m_constraints[i].m_appliedRambdaDt[j];
+					}
+					frictionCoeff = 0.7f;  // hard-coded coefficient replaces the per-constraint value read above
+					for (int j = 0; j < 4; j++)
+					{
+						maxRambdaDt[j] = frictionCoeff * sum;
+						minRambdaDt[j] = -maxRambdaDt[j];
+					}
+
+					b3SolveFriction(m_constraints[i], (b3Vector3&)bodyA.m_pos, (b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass, (const b3Matrix3x3&)m_shapes[aIdx].m_invInertiaWorld,
+									(b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass, (const b3Matrix3x3&)m_shapes[bIdx].m_invInertiaWorld,
+									maxRambdaDt, minRambdaDt);
+				}
+			}
+
+			if (m_wgUsedBodies)  // optional bookkeeping: record which bodies this work group touched
+			{
+				if (m_wgUsedBodies[m_curWgidx].size() < usedBodies.size())
+				{
+					m_wgUsedBodies[m_curWgidx].resize(usedBodies.size());
+				}
+				for (int i = 0; i < usedBodies.size(); i++)
+				{
+					if (usedBodies[i])
+					{
+						//printf("cell %d uses body %d\n", m_curWgidx,i);
+						m_wgUsedBodies[m_curWgidx][i] = 1;
+					}
+				}
+			}
+		}
+	}
+
+	b3AlignedObjectArray<b3RigidBodyData>& m_bodies;
+	b3AlignedObjectArray<b3Inertia>& m_shapes;  // indexed by body index to fetch m_invInertiaWorld
+	b3AlignedObjectArray<b3ContactConstraint4>& m_constraints;
+	b3AlignedObjectArray<int>* m_wgUsedBodies;  // may be null; indexed by m_curWgidx when set
+	int m_curWgidx;
+	int m_start;         // index of the first constraint handled by this task
+	int m_nConstraints;  // number of constraints handled, starting at m_start
+	bool m_solveFriction;  // true: friction pass, false: contact pass (the constructor defaults it to true)
+	int m_maxNumBatches;
+};
+
+/// Runs a fixed number of contact iterations followed by the same number of friction iterations over the
+/// contact constraints, using a single-threaded b3SolveTask for each iteration.
+/// NOTE(review): the constraint array is created empty here and the contact conversion is only a placeholder,
+/// so every task is constructed with a constraint count of zero.
+void b3CpuRigidBodyPipeline::solveContactConstraints()
+{
+	const int numIterations = 4;
+	const int maxNumBatches = 250;
+
+	b3AlignedObjectArray<b3ContactConstraint4> contactConstraints;
+	//	const b3AlignedObjectArray<b3Contact4Data>& contacts = m_data->m_np->getContacts();
+	//convert contacts...
+	const int numConstraints = contactConstraints.size();
+
+	// Contact (normal) pass.
+	for (int iter = 0; iter < numIterations; iter++)
+	{
+		b3SolveTask contactTask(m_data->m_rigidBodies, m_data->m_inertias, contactConstraints, 0, numConstraints, maxNumBatches, 0, 0);
+		contactTask.m_solveFriction = false;
+		contactTask.run(0);
+	}
+
+	// Friction pass.
+	for (int iter = 0; iter < numIterations; iter++)
+	{
+		b3SolveTask frictionTask(m_data->m_rigidBodies, m_data->m_inertias, contactConstraints, 0, numConstraints, maxNumBatches, 0, 0);
+		frictionTask.m_solveFriction = true;
+		frictionTask.run(0);
+	}
+}
+
+/// Integrates every registered body over deltaTime with zero angular damping and a hard-coded
+/// gravity acceleration of (0, -9, 0).
+void b3CpuRigidBodyPipeline::integrate(float deltaTime)
+{
+	b3Vector3 gravityAcceleration = b3MakeVector3(0, -9, 0);
+	float angDamping = 0.f;
+
+	//integrate transforms (external forces/gravity should be moved into constraint solver)
+	for (int bodyIndex = 0; bodyIndex < m_data->m_rigidBodies.size(); bodyIndex++)
+	{
+		b3RigidBodyData* body = &m_data->m_rigidBodies[bodyIndex];
+		b3IntegrateTransform(body, deltaTime, angDamping, gravityAcceleration);
+	}
+}
+
+/// Appends a new rigid body and, for a non-negative collidableIndex, creates its world AABB and broadphase proxy.
+/// @param mass            body mass; 0 produces an inverse mass of 0
+/// @param position        3 floats (x, y, z)
+/// @param orientation     4 floats passed straight to the quaternion setters
+/// @param collidableIndex index used to look up the local-space AABB; negative values only log an error
+/// @param userData        not used by this implementation
+/// @return the index of the new body in the rigid body array (returned even when collidableIndex is invalid)
+/// NOTE(review): with a negative collidableIndex the body is still added but no AABB entry is appended,
+/// so the AABB array can fall out of step with the body array — confirm this is acceptable for callers.
+int b3CpuRigidBodyPipeline::registerPhysicsInstance(float mass, const float* position, const float* orientation, int collidableIndex, int userData)
+{
+	const int bodyIndex = m_data->m_rigidBodies.size();
+
+	b3RigidBodyData body;
+	body.m_pos.setValue(position[0], position[1], position[2]);
+	body.m_quat.setValue(orientation[0], orientation[1], orientation[2], orientation[3]);
+	body.m_linVel.setValue(0, 0, 0);
+	body.m_angVel.setValue(0, 0, 0);
+	body.m_invMass = mass ? 1.f / mass : 0.f;
+	body.m_collidableIdx = collidableIndex;
+	body.m_frictionCoeff = 0.3f;
+	body.m_restituitionCoeff = 0.f;
+	m_data->m_rigidBodies.push_back(body);
+
+	if (collidableIndex < 0)
+	{
+		b3Error("registerPhysicsInstance using invalid collidableIndex\n");
+		return bodyIndex;
+	}
+
+	b3Aabb& worldAabb = m_data->m_aabbWorldSpace.expand();
+
+	b3Aabb localAabb = m_data->m_np->getLocalSpaceAabb(collidableIndex);
+	b3Vector3 localAabbMin = b3MakeVector3(localAabb.m_min[0], localAabb.m_min[1], localAabb.m_min[2]);
+	b3Vector3 localAabbMax = b3MakeVector3(localAabb.m_max[0], localAabb.m_max[1], localAabb.m_max[2]);
+
+	// Build the body's world transform and use it (plus a small margin) to place the AABB in world space.
+	b3Transform worldTransform;
+	worldTransform.setIdentity();
+	worldTransform.setOrigin(b3MakeVector3(position[0], position[1], position[2]));
+	worldTransform.setRotation(b3Quaternion(orientation[0], orientation[1], orientation[2], orientation[3]));
+	b3Scalar margin = 0.01f;
+	b3TransformAabb(localAabbMin, localAabbMax, margin, worldTransform, worldAabb.m_minVec, worldAabb.m_maxVec);
+
+	m_data->m_bp->createProxy(worldAabb.m_minVec, worldAabb.m_maxVec, bodyIndex, 0, 1, 1);
+	//		b3Vector3 aabbMin,aabbMax;
+	//	m_data->m_bp->getAabb(bodyIndex,aabbMin,aabbMax);
+
+	return bodyIndex;
+}
+
+/// Returns a pointer to the first registered body, or 0 when no body has been registered.
+const struct b3RigidBodyData* b3CpuRigidBodyPipeline::getBodyBuffer() const
+{
+	if (!m_data->m_rigidBodies.size())
+	{
+		return 0;
+	}
+	return &m_data->m_rigidBodies[0];
+}
+
+/// Returns the number of bodies registered so far.
+int b3CpuRigidBodyPipeline::getNumBodies() const
+{
+	const int bodyCount = m_data->m_rigidBodies.size();
+	return bodyCount;
+}

+ 62 - 0
Dependencies/include/bullet3/Bullet3Dynamics/b3CpuRigidBodyPipeline.h

@@ -0,0 +1,62 @@
+/*
+Copyright (c) 2013 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Erwin Coumans
+
+#ifndef B3_CPU_RIGIDBODY_PIPELINE_H
+#define B3_CPU_RIGIDBODY_PIPELINE_H
+
+#include "Bullet3Common/b3AlignedObjectArray.h"
+#include "Bullet3Collision/NarrowPhaseCollision/b3RaycastInfo.h"
+
+class b3CpuRigidBodyPipeline
+{  /* Rigid-body pipeline interface. All instance state is reached through the opaque
+	  m_data pointer, whose struct is only forward-declared in this header.
+	  NOTE(review): the virtual stage methods are presumably the steps driven by
+	  stepSimulation -- confirm against the implementation file. */
+protected:
+	struct b3CpuRigidBodyPipelineInternalData* m_data;  // opaque internal state; struct is defined outside this header
+
+	int allocateCollidable();
+
+public:
+	b3CpuRigidBodyPipeline(class b3CpuNarrowPhase* narrowphase, struct b3DynamicBvhBroadphase* broadphaseDbvt, const struct b3Config& config);
+	virtual ~b3CpuRigidBodyPipeline();
+
+	virtual void stepSimulation(float deltaTime);
+	virtual void integrate(float timeStep);
+	virtual void updateAabbWorldSpace();
+	virtual void computeOverlappingPairs();
+	virtual void computeContactPoints();
+	virtual void solveContactConstraints();
+
+	int registerConvexPolyhedron(class b3ConvexUtility* convex);
+
+	int registerPhysicsInstance(float mass, const float* position, const float* orientation, int collisionShapeIndex, int userData);  // position: 3 floats, orientation: 4 floats; returns the index of the newly added body
+	void writeAllInstancesToGpu();
+	void copyConstraintsToHost();
+	void setGravity(const float* grav);
+	void reset();
+
+	int createPoint2PointConstraint(int bodyA, int bodyB, const float* pivotInA, const float* pivotInB, float breakingThreshold);
+	int createFixedConstraint(int bodyA, int bodyB, const float* pivotInA, const float* pivotInB, const float* relTargetAB, float breakingThreshold);
+	void removeConstraintByUid(int uid);
+
+	void addConstraint(class b3TypedConstraint* constraint);
+	void removeConstraint(b3TypedConstraint* constraint);
+
+	void castRays(const b3AlignedObjectArray<b3RayInfo>& rays, b3AlignedObjectArray<b3RayHit>& hitResults);
+
+	const struct b3RigidBodyData* getBodyBuffer() const;  // pointer to the first body, or 0 when no body is registered
+
+	int getNumBodies() const;  // number of registered bodies
+};
+
+#endif  //B3_CPU_RIGIDBODY_PIPELINE_H

+ 18 - 0
Dependencies/include/bullet3/Bullet3Dynamics/premake4.lua

@@ -0,0 +1,18 @@
+	project "Bullet3Dynamics"  -- premake4 project definition for the Bullet3Dynamics library
+
+	language "C++"
+				
+	kind "StaticLib"  -- produced as a static library
+
+	includedirs {
+		".."  -- parent directory; presumably so includes such as "Bullet3Common/..." resolve (confirm layout)
+	}		
+	
+    if os.is("Linux") then
+        buildoptions{"-fPIC"}  -- position-independent code, only requested on Linux
+    end
+
+	files {
+		"**.cpp",  -- recursive wildcards: every .cpp and .h below this directory
+		"**.h"
+	}

+ 31 - 0
Dependencies/include/bullet3/Bullet3Dynamics/shared/b3ContactConstraint4.h

@@ -0,0 +1,31 @@
+#ifndef B3_CONTACT_CONSTRAINT5_H
+#define B3_CONTACT_CONSTRAINT5_H
+
+#include "Bullet3Common/shared/b3Float4.h"
+
+typedef struct b3ContactConstraint4 b3ContactConstraint4_t;
+
+struct b3ContactConstraint4
+{  /* Solver-side contact constraint between two bodies. The per-point arrays have four
+	  slots and the friction arrays have two slots. */
+	b3Float4 m_linear;  //normal? (the .w component is what b3GetFrictionCoeff returns)
+	b3Float4 m_worldPos[4];  // one entry per contact point slot
+	b3Float4 m_center;  //	friction
+	float m_jacCoeffInv[4];
+	float m_b[4];
+	float m_appliedRambdaDt[4];
+	float m_fJacCoeffInv[2];      //	friction
+	float m_fAppliedRambdaDt[2];  //	friction
+
+	unsigned int m_bodyA;  // index of the first body
+	unsigned int m_bodyB;  // index of the second body
+	int m_batchIdx;
+	unsigned int m_paddings;
+};
+
+//inline	void setFrictionCoeff(float value) { m_linear[3] = value; }
+// Returns the friction coefficient of a contact constraint, which is kept in the
+// w component of its m_linear vector.
+inline float b3GetFrictionCoeff(b3ContactConstraint4_t* constraint)
+{
+	const float frictionCoeff = constraint->m_linear.w;
+	return frictionCoeff;
+}
+
+#endif  //B3_CONTACT_CONSTRAINT5_H

+ 148 - 0
Dependencies/include/bullet3/Bullet3Dynamics/shared/b3ConvertConstraint4.h

@@ -0,0 +1,148 @@
+
+
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h"
+#include "Bullet3Dynamics/shared/b3ContactConstraint4.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h"
+
+// Computes two vectors p and q for the direction n: p is built perpendicular to n inside
+// a coordinate plane and scaled to unit length there, and q is set to the cross product
+// n x p. Only the x, y and z components of p[0] and q[0] are written.
+void b3PlaneSpace1(b3Float4ConstArg n, b3Float4* p, b3Float4* q);
+void b3PlaneSpace1(b3Float4ConstArg n, b3Float4* p, b3Float4* q)
+{
+	// 0.70710678 is about 1/sqrt(2): pick the plane by how strongly n points along z.
+	if (b3Fabs(n.z) > 0.70710678f)
+	{
+		// p lies in the y-z plane
+		float lenSqYZ = n.y * n.y + n.z * n.z;
+		float invLenYZ = 1.f / sqrt(lenSqYZ);
+		p->x = 0;
+		p->y = -n.z * invLenYZ;
+		p->z = n.y * invLenYZ;
+		// q = n x p
+		q->x = lenSqYZ * invLenYZ;
+		q->y = -n.x * p->z;
+		q->z = n.x * p->y;
+		return;
+	}
+
+	// p lies in the x-y plane
+	float lenSqXY = n.x * n.x + n.y * n.y;
+	float invLenXY = 1.f / sqrt(lenSqXY);
+	p->x = -n.y * invLenXY;
+	p->y = n.x * invLenXY;
+	p->z = 0;
+	// q = n x p
+	q->x = -n.z * p->y;
+	q->y = n.z * p->x;
+	q->z = lenSqXY * invLenXY;
+}
+
+// Fills one Jacobian row for the direction n: the linear part is n with w set to 0, the
+// angular part for the first body is r0 x n and for the second body -(r1 x n).
+void setLinearAndAngular(b3Float4ConstArg n, b3Float4ConstArg r0, b3Float4ConstArg r1, b3Float4* linear, b3Float4* angular0, b3Float4* angular1)
+{
+	linear[0] = b3MakeFloat4(n.x, n.y, n.z, 0.f);
+
+	const b3Float4 armCrossFirst = b3Cross3(r0, n);
+	angular0[0] = armCrossFirst;
+
+	const b3Float4 armCrossSecond = b3Cross3(r1, n);
+	angular1[0] = -armCrossSecond;
+}
+
+// Returns the sum of four 3-component dot products: each body's linear velocity with its
+// linear Jacobian part (l0, l1) and each body's angular velocity with its angular part (a0, a1).
+float calcRelVel(b3Float4ConstArg l0, b3Float4ConstArg l1, b3Float4ConstArg a0, b3Float4ConstArg a1, b3Float4ConstArg linVel0,
+				 b3Float4ConstArg angVel0, b3Float4ConstArg linVel1, b3Float4ConstArg angVel1)
+{
+	const float linearPart0 = b3Dot3F4(l0, linVel0);
+	const float angularPart0 = b3Dot3F4(a0, angVel0);
+	const float linearPart1 = b3Dot3F4(l1, linVel1);
+	const float angularPart1 = b3Dot3F4(a1, angVel1);
+	// summed left to right, in the same order as the terms are listed above
+	return linearPart0 + angularPart0 + linearPart1 + angularPart1;
+}
+
+// Returns -1 / (invMass0 + a0.(I0^-1 a0) + invMass1 + a1.(I1^-1 a1)) for one Jacobian row.
+// linear0 and linear1 are not read: they are taken to be normalized, so their contribution
+// reduces to the plain inverse masses.
+float calcJacCoeff(b3Float4ConstArg linear0, b3Float4ConstArg linear1, b3Float4ConstArg angular0, b3Float4ConstArg angular1,
+				   float invMass0, const b3Mat3x3* invInertia0, float invMass1, const b3Mat3x3* invInertia1)
+{
+	const float angularTerm0 = b3Dot3F4(mtMul3(angular0, *invInertia0), angular0);
+	const float angularTerm1 = b3Dot3F4(mtMul3(angular1, *invInertia1), angular1);
+	const float denominator = invMass0 + angularTerm0 + invMass1 + angularTerm1;
+	return -1.f / denominator;
+}
+
+void setConstraint4(b3Float4ConstArg posA, b3Float4ConstArg linVelA, b3Float4ConstArg angVelA, float invMassA, b3Mat3x3ConstArg invInertiaA,
+					b3Float4ConstArg posB, b3Float4ConstArg linVelB, b3Float4ConstArg angVelB, float invMassB, b3Mat3x3ConstArg invInertiaB,
+					__global struct b3Contact4Data* src, float dt, float positionDrift, float positionConstraintCoeff,
+					b3ContactConstraint4_t* dstC)
+{  /* Converts the contact data in src into the solver constraint dstC for bodies A and B.
+	  The number of contact points is read from src->m_worldNormalOnB.w (marked npoints
+	  below); the penetration of point ic is read from src->m_worldPosB[ic].w.
+	  Per used point it writes m_jacCoeffInv, m_b and m_appliedRambdaDt; when there is at
+	  least one point it also writes the two friction rows and m_center (the average of
+	  the contact positions). Unused m_worldPos slots are zeroed.
+	  NOTE(review): m_b[ic] is not written for ic >= npoints, m_center and
+	  m_fAppliedRambdaDt are only written when npoints > 0, and m_batchIdx / m_paddings
+	  are never written here -- confirm callers do not read them uninitialized. */
+	dstC->m_bodyA = abs(src->m_bodyAPtrAndSignBit);  // body index with the sign removed
+	dstC->m_bodyB = abs(src->m_bodyBPtrAndSignBit);
+
+	float dtInv = 1.f / dt;
+	for (int ic = 0; ic < 4; ic++)
+	{
+		dstC->m_appliedRambdaDt[ic] = 0.f;
+	}
+	dstC->m_fJacCoeffInv[0] = dstC->m_fJacCoeffInv[1] = 0.f;
+
+	dstC->m_linear = src->m_worldNormalOnB;
+	dstC->m_linear.w = 0.7f;  //src->getFrictionCoeff() ); -- the friction coefficient is hard-coded here
+	for (int ic = 0; ic < 4; ic++)
+	{
+		b3Float4 r0 = src->m_worldPosB[ic] - posA;  // contact position relative to each body position
+		b3Float4 r1 = src->m_worldPosB[ic] - posB;
+
+		if (ic >= src->m_worldNormalOnB.w)  //npoints
+		{
+			dstC->m_jacCoeffInv[ic] = 0.f;  // unused slot
+			continue;
+		}
+
+		float relVelN;
+		{
+			b3Float4 linear, angular0, angular1;
+			setLinearAndAngular(src->m_worldNormalOnB, r0, r1, &linear, &angular0, &angular1);
+
+			dstC->m_jacCoeffInv[ic] = calcJacCoeff(linear, -linear, angular0, angular1,
+												   invMassA, &invInertiaA, invMassB, &invInertiaB);
+
+			relVelN = calcRelVel(linear, -linear, angular0, angular1,
+								 linVelA, angVelA, linVelB, angVelB);
+
+			float e = 0.f;  //src->getRestituitionCoeff();
+			if (relVelN * relVelN < 0.004f) e = 0.f;  // has no effect while e is hard-coded to 0 above
+
+			dstC->m_b[ic] = e * relVelN;
+			//float penetration = src->m_worldPosB[ic].w;
+			dstC->m_b[ic] += (src->m_worldPosB[ic].w + positionDrift) * positionConstraintCoeff * dtInv;
+			dstC->m_appliedRambdaDt[ic] = 0.f;
+		}
+	}
+
+	if (src->m_worldNormalOnB.w > 0)  //npoints
+	{                                 //	prepare friction
+		b3Float4 center = b3MakeFloat4(0.f, 0.f, 0.f, 0.f);
+		for (int i = 0; i < src->m_worldNormalOnB.w; i++)
+			center += src->m_worldPosB[i];
+		center /= (float)src->m_worldNormalOnB.w;  // average of the contact positions
+
+		b3Float4 tangent[2];
+		b3PlaneSpace1(src->m_worldNormalOnB, &tangent[0], &tangent[1]);
+
+		b3Float4 r[2];
+		r[0] = center - posA;
+		r[1] = center - posB;
+
+		for (int i = 0; i < 2; i++)  // one friction row per tangent direction
+		{
+			b3Float4 linear, angular0, angular1;
+			setLinearAndAngular(tangent[i], r[0], r[1], &linear, &angular0, &angular1);
+
+			dstC->m_fJacCoeffInv[i] = calcJacCoeff(linear, -linear, angular0, angular1,
+												   invMassA, &invInertiaA, invMassB, &invInertiaB);
+			dstC->m_fAppliedRambdaDt[i] = 0.f;
+		}
+		dstC->m_center = center;
+	}
+
+	for (int i = 0; i < 4; i++)
+	{
+		if (i < src->m_worldNormalOnB.w)
+		{
+			dstC->m_worldPos[i] = src->m_worldPosB[i];
+		}
+		else
+		{
+			dstC->m_worldPos[i] = b3MakeFloat4(0.f, 0.f, 0.f, 0.f);  // zero the unused slots
+		}
+	}
+}

+ 14 - 0
Dependencies/include/bullet3/Bullet3Dynamics/shared/b3Inertia.h

@@ -0,0 +1,14 @@
+
+
+#ifndef B3_INERTIA_H
+#define B3_INERTIA_H
+
+#include "Bullet3Common/shared/b3Mat3x3.h"
+
+struct b3Inertia
+{  /* Pair of 3x3 matrices describing a body's inertia. */
+	b3Mat3x3 m_invInertiaWorld;  // NOTE(review): presumably the inverse inertia tensor in world space -- confirm
+	b3Mat3x3 m_initInvInertia;   // NOTE(review): presumably the initial (local) inverse inertia tensor -- confirm
+};
+
+#endif  //B3_INERTIA_H

+ 106 - 0
Dependencies/include/bullet3/Bullet3Dynamics/shared/b3IntegrateTransforms.h

@@ -0,0 +1,106 @@
+
+
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h"
+
+inline void integrateSingleTransform(__global b3RigidBodyData_t* bodies, int nodeID, float timeStep, float angularDamping, b3Float4ConstArg gravityAcceleration)
+{  /* Advances bodies[nodeID] by timeStep. Bodies whose m_invMass is 0 are left untouched.
+	  The angular velocity is scaled by angularDamping, the orientation is rotated by a
+	  quaternion built from it, the position is advanced with the current linear velocity
+	  and only afterwards gravity is added to the linear velocity. b3IntegrateTransform
+	  in this file applies gravity before the position update instead. */
+	if (bodies[nodeID].m_invMass != 0.f)
+	{
+		float BT_GPU_ANGULAR_MOTION_THRESHOLD = (0.25f * 3.14159254f);  // a quarter of pi: largest rotation angle used per step
+
+		//angular velocity
+		{
+			b3Float4 axis;
+			//add some hardcoded angular damping
+			bodies[nodeID].m_angVel.x *= angularDamping;
+			bodies[nodeID].m_angVel.y *= angularDamping;
+			bodies[nodeID].m_angVel.z *= angularDamping;
+
+			b3Float4 angvel = bodies[nodeID].m_angVel;
+
+			float fAngle = b3Sqrt(b3Dot3F4(angvel, angvel));  // magnitude of the angular velocity
+
+			//limit the angular motion
+			if (fAngle * timeStep > BT_GPU_ANGULAR_MOTION_THRESHOLD)
+			{
+				fAngle = BT_GPU_ANGULAR_MOTION_THRESHOLD / timeStep;
+			}
+			if (fAngle < 0.001f)
+			{
+				// Taylor expansion of sin(0.5 * fAngle * timeStep) / fAngle for small fAngle; 0.0208333... is 1/48
+				axis = angvel * (0.5f * timeStep - (timeStep * timeStep * timeStep) * 0.020833333333f * fAngle * fAngle);
+			}
+			else
+			{
+				// scale angvel by sin(0.5 * fAngle * timeStep) / fAngle to get the quaternion vector part
+				axis = angvel * (b3Sin(0.5f * fAngle * timeStep) / fAngle);
+			}
+
+			b3Quat dorn;  // rotation applied during this step
+			dorn.x = axis.x;
+			dorn.y = axis.y;
+			dorn.z = axis.z;
+			dorn.w = b3Cos(fAngle * timeStep * 0.5f);
+			b3Quat orn0 = bodies[nodeID].m_quat;
+			b3Quat predictedOrn = b3QuatMul(dorn, orn0);
+			predictedOrn = b3QuatNormalized(predictedOrn);
+			bodies[nodeID].m_quat = predictedOrn;
+		}
+		//linear velocity
+		bodies[nodeID].m_pos += bodies[nodeID].m_linVel * timeStep;
+
+		//apply gravity
+		bodies[nodeID].m_linVel += gravityAcceleration * timeStep;
+	}
+}
+
+inline void b3IntegrateTransform(__global b3RigidBodyData_t* body, float timeStep, float angularDamping, b3Float4ConstArg gravityAcceleration)
+{  /* Advances one body by timeStep. A body whose m_invMass is 0 is left untouched.
+	  The angular velocity is scaled by angularDamping, the orientation is rotated by a
+	  quaternion built from it, gravity is added to the linear velocity and the position
+	  is then advanced with that updated velocity. */
+	float BT_GPU_ANGULAR_MOTION_THRESHOLD = (0.25f * 3.14159254f);  // a quarter of pi: largest rotation angle used per step
+
+	if ((body->m_invMass != 0.f))
+	{
+		//angular velocity
+		{
+			b3Float4 axis;
+			//add some hardcoded angular damping
+			body->m_angVel.x *= angularDamping;
+			body->m_angVel.y *= angularDamping;
+			body->m_angVel.z *= angularDamping;
+
+			b3Float4 angvel = body->m_angVel;
+			float fAngle = b3Sqrt(b3Dot3F4(angvel, angvel));  // magnitude of the angular velocity
+			//limit the angular motion
+			if (fAngle * timeStep > BT_GPU_ANGULAR_MOTION_THRESHOLD)
+			{
+				fAngle = BT_GPU_ANGULAR_MOTION_THRESHOLD / timeStep;
+			}
+			if (fAngle < 0.001f)
+			{
+				// Taylor expansion of sin(0.5 * fAngle * timeStep) / fAngle for small fAngle; 0.0208333... is 1/48
+				axis = angvel * (0.5f * timeStep - (timeStep * timeStep * timeStep) * 0.020833333333f * fAngle * fAngle);
+			}
+			else
+			{
+				// scale angvel by sin(0.5 * fAngle * timeStep) / fAngle to get the quaternion vector part
+				axis = angvel * (b3Sin(0.5f * fAngle * timeStep) / fAngle);
+			}
+			b3Quat dorn;  // rotation applied during this step
+			dorn.x = axis.x;
+			dorn.y = axis.y;
+			dorn.z = axis.z;
+			dorn.w = b3Cos(fAngle * timeStep * 0.5f);
+			b3Quat orn0 = body->m_quat;
+
+			b3Quat predictedOrn = b3QuatMul(dorn, orn0);
+			predictedOrn = b3QuatNormalized(predictedOrn);
+			body->m_quat = predictedOrn;
+		}
+
+		//apply gravity
+		body->m_linVel += gravityAcceleration * timeStep;
+
+		//linear velocity
+		body->m_pos += body->m_linVel * timeStep;
+	}
+}

+ 47 - 0
Dependencies/include/bullet3/Bullet3Geometry/CMakeLists.txt

@@ -0,0 +1,47 @@
+
+INCLUDE_DIRECTORIES(
+	${BULLET_PHYSICS_SOURCE_DIR}/src
+)  # header search path for the sources listed below
+
+SET(Bullet3Geometry_SRCS
+	b3ConvexHullComputer.cpp
+	b3GeometryUtil.cpp
+)  # compiled sources of the library
+
+SET(Bullet3Geometry_HDRS
+	b3AabbUtil.h
+	b3ConvexHullComputer.h
+	b3GeometryUtil.h
+	b3GrahamScan2dConvexHull.h
+)  # headers, also used as PUBLIC_HEADER for the framework build below
+
+ADD_LIBRARY(Bullet3Geometry ${Bullet3Geometry_SRCS} ${Bullet3Geometry_HDRS})
+if (BUILD_SHARED_LIBS)
+  target_link_libraries(Bullet3Geometry Bullet3Common)  # only linked explicitly for shared builds
+endif()
+SET_TARGET_PROPERTIES(Bullet3Geometry PROPERTIES VERSION ${BULLET_VERSION})
+SET_TARGET_PROPERTIES(Bullet3Geometry PROPERTIES SOVERSION ${BULLET_VERSION})
+
+IF (INSTALL_LIBS)  # install rules are skipped entirely unless INSTALL_LIBS is set
+	IF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
+		#FILES_MATCHING requires CMake 2.6
+		IF (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 2.5)
+			IF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+				INSTALL(TARGETS Bullet3Geometry DESTINATION .)
+			ELSE (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+				INSTALL(TARGETS Bullet3Geometry
+					RUNTIME DESTINATION bin
+					LIBRARY DESTINATION lib${LIB_SUFFIX}
+					ARCHIVE DESTINATION lib${LIB_SUFFIX})
+				INSTALL(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+DESTINATION ${INCLUDE_INSTALL_DIR} FILES_MATCHING PATTERN "*.h"  PATTERN
+".svn" EXCLUDE PATTERN "CMakeFiles" EXCLUDE)  # installs the *.h files, skipping .svn and CMakeFiles directories
+			ENDIF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+		ENDIF (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 2.5)
+
+		IF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+			SET_TARGET_PROPERTIES(Bullet3Geometry PROPERTIES FRAMEWORK true)
+			SET_TARGET_PROPERTIES(Bullet3Geometry PROPERTIES PUBLIC_HEADER "${Bullet3Geometry_HDRS}")
+		ENDIF (APPLE AND BUILD_SHARED_LIBS AND FRAMEWORK)
+	ENDIF (NOT INTERNAL_CREATE_DISTRIBUTABLE_MSVC_PROJECTFILES)
+ENDIF (INSTALL_LIBS)

+ 217 - 0
Dependencies/include/bullet3/Bullet3Geometry/b3AabbUtil.h

@@ -0,0 +1,217 @@
+/*
+Copyright (c) 2003-2006 Gino van den Bergen / Erwin Coumans  https://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef B3_AABB_UTIL2
+#define B3_AABB_UTIL2
+
+#include "Bullet3Common/b3Transform.h"
+#include "Bullet3Common/b3Vector3.h"
+#include "Bullet3Common/b3MinMax.h"
+
+/// Grows an aabb in place: expansionMin is added to the minimum corner and expansionMax
+/// is added to the maximum corner.
+B3_FORCE_INLINE void b3AabbExpand(b3Vector3& aabbMin,
+								  b3Vector3& aabbMax,
+								  const b3Vector3& expansionMin,
+								  const b3Vector3& expansionMax)
+{
+	const b3Vector3 grownMin = aabbMin + expansionMin;
+	aabbMin = grownMin;
+
+	const b3Vector3 grownMax = aabbMax + expansionMax;
+	aabbMax = grownMax;
+}
+
+/// Tests whether a point lies inside an aabb; a point exactly on the bounds counts as inside.
+B3_FORCE_INLINE bool b3TestPointAgainstAabb2(const b3Vector3& aabbMin1, const b3Vector3& aabbMax1,
+											 const b3Vector3& point)
+{
+	// The point is outside as soon as it leaves the box interval on any single axis.
+	if (aabbMin1.getX() > point.getX() || aabbMax1.getX() < point.getX())
+	{
+		return false;
+	}
+	if (aabbMin1.getZ() > point.getZ() || aabbMax1.getZ() < point.getZ())
+	{
+		return false;
+	}
+	if (aabbMin1.getY() > point.getY() || aabbMax1.getY() < point.getY())
+	{
+		return false;
+	}
+	return true;
+}
+
+/// Tests two aabbs for overlap; boxes that merely touch count as overlapping.
+B3_FORCE_INLINE bool b3TestAabbAgainstAabb2(const b3Vector3& aabbMin1, const b3Vector3& aabbMax1,
+											const b3Vector3& aabbMin2, const b3Vector3& aabbMax2)
+{
+	// The boxes are separated as soon as their intervals are disjoint on any single axis.
+	if (aabbMin1.getX() > aabbMax2.getX() || aabbMax1.getX() < aabbMin2.getX())
+	{
+		return false;
+	}
+	if (aabbMin1.getZ() > aabbMax2.getZ() || aabbMax1.getZ() < aabbMin2.getZ())
+	{
+		return false;
+	}
+	if (aabbMin1.getY() > aabbMax2.getY() || aabbMax1.getY() < aabbMin2.getY())
+	{
+		return false;
+	}
+	return true;
+}
+
+/// Conservative triangle-versus-aabb test: compares the per-axis extent of the three
+/// vertices against the box and returns false only when they are disjoint on some axis.
+B3_FORCE_INLINE bool b3TestTriangleAgainstAabb2(const b3Vector3* vertices,
+												const b3Vector3& aabbMin, const b3Vector3& aabbMax)
+{
+	const b3Vector3& v0 = vertices[0];
+	const b3Vector3& v1 = vertices[1];
+	const b3Vector3& v2 = vertices[2];
+
+	// Axes are visited in the order x, z, y.
+	const int axisOrder[3] = {0, 2, 1};
+	for (int k = 0; k < 3; ++k)
+	{
+		const int axis = axisOrder[k];
+		if (b3Min(b3Min(v0[axis], v1[axis]), v2[axis]) > aabbMax[axis])
+		{
+			return false;
+		}
+		if (b3Max(b3Max(v0[axis], v1[axis]), v2[axis]) < aabbMin[axis])
+		{
+			return false;
+		}
+	}
+	return true;
+}
+
+/// Returns a 6-bit code telling on which sides a point lies outside a box centred at the
+/// origin with the given half extents: 0x01/0x02/0x04 for below -halfExtent on x/y/z and
+/// 0x08/0x10/0x20 for above +halfExtent on x/y/z. A result of 0 means inside or on the box.
+B3_FORCE_INLINE int b3Outcode(const b3Vector3& p, const b3Vector3& halfExtent)
+{
+	int outcode = 0x0;
+	if (p.getX() < -halfExtent.getX())
+	{
+		outcode |= 0x01;
+	}
+	if (p.getX() > halfExtent.getX())
+	{
+		outcode |= 0x08;
+	}
+	if (p.getY() < -halfExtent.getY())
+	{
+		outcode |= 0x02;
+	}
+	if (p.getY() > halfExtent.getY())
+	{
+		outcode |= 0x10;
+	}
+	if (p.getZ() < -halfExtent.getZ())
+	{
+		outcode |= 0x4;
+	}
+	if (p.getZ() > halfExtent.getZ())
+	{
+		outcode |= 0x20;
+	}
+	return outcode;
+}
+
+/// Slab test of a ray against the box bounds[0]..bounds[1]. rayInvDirection holds the
+/// per-axis reciprocal of the ray direction and raySign[i] selects which bound is the near
+/// one on axis i. tmin is written with the entry parameter as it is refined, also on
+/// paths that return false. Returns true when the interval [tmin, exit] overlaps the open
+/// range (lambda_min, lambda_max).
+B3_FORCE_INLINE bool b3RayAabb2(const b3Vector3& rayFrom,
+								const b3Vector3& rayInvDirection,
+								const unsigned int raySign[3],
+								const b3Vector3 bounds[2],
+								b3Scalar& tmin,
+								b3Scalar lambda_min,
+								b3Scalar lambda_max)
+{
+	b3Scalar tExit, yEnter, yExit, zEnter, zExit;
+
+	// x slab
+	tmin = (bounds[raySign[0]].getX() - rayFrom.getX()) * rayInvDirection.getX();
+	tExit = (bounds[1 - raySign[0]].getX() - rayFrom.getX()) * rayInvDirection.getX();
+
+	// y slab
+	yEnter = (bounds[raySign[1]].getY() - rayFrom.getY()) * rayInvDirection.getY();
+	yExit = (bounds[1 - raySign[1]].getY() - rayFrom.getY()) * rayInvDirection.getY();
+	if ((tmin > yExit) || (yEnter > tExit))
+	{
+		return false;
+	}
+	if (yEnter > tmin)
+	{
+		tmin = yEnter;
+	}
+	if (yExit < tExit)
+	{
+		tExit = yExit;
+	}
+
+	// z slab
+	zEnter = (bounds[raySign[2]].getZ() - rayFrom.getZ()) * rayInvDirection.getZ();
+	zExit = (bounds[1 - raySign[2]].getZ() - rayFrom.getZ()) * rayInvDirection.getZ();
+	if ((tmin > zExit) || (zEnter > tExit))
+	{
+		return false;
+	}
+	if (zEnter > tmin)
+	{
+		tmin = zEnter;
+	}
+	if (zExit < tExit)
+	{
+		tExit = zExit;
+	}
+
+	return ((tmin < lambda_max) && (tExit > lambda_min));
+}
+
+B3_FORCE_INLINE bool b3RayAabb(const b3Vector3& rayFrom,
+							   const b3Vector3& rayTo,
+							   const b3Vector3& aabbMin,
+							   const b3Vector3& aabbMax,
+							   b3Scalar& param, b3Vector3& normal)
+{  /* Tests the segment rayFrom -> rayTo against an aabb using outcodes.
+	  param is in/out: its incoming value is the initial exit bound, and on success it is
+	  overwritten with the entry fraction along the segment. normal is written only on
+	  success; it has a single non-zero component on the axis of the last entry plane,
+	  or is all zero when no entry plane was crossed. Returns false, leaving param and
+	  normal untouched, when no hit is found. */
+	b3Vector3 aabbHalfExtent = (aabbMax - aabbMin) * b3Scalar(0.5);
+	b3Vector3 aabbCenter = (aabbMax + aabbMin) * b3Scalar(0.5);
+	b3Vector3 source = rayFrom - aabbCenter;  // both end points expressed relative to the box centre
+	b3Vector3 target = rayTo - aabbCenter;
+	int sourceOutcode = b3Outcode(source, aabbHalfExtent);
+	int targetOutcode = b3Outcode(target, aabbHalfExtent);
+	if ((sourceOutcode & targetOutcode) == 0x0)  // a shared bit means both end points are beyond the same face
+	{
+		b3Scalar lambda_enter = b3Scalar(0.0);
+		b3Scalar lambda_exit = param;
+		b3Vector3 r = target - source;
+		int i;
+		b3Scalar normSign = 1;
+		b3Vector3 hitNormal = b3MakeVector3(0, 0, 0);
+		int bit = 1;  // walks the six outcode bits in the order used by b3Outcode
+
+		for (int j = 0; j < 2; j++)  // first pass: bits 0x01/0x02/0x04, second pass: bits 0x08/0x10/0x20
+		{
+			for (i = 0; i != 3; ++i)
+			{
+				if (sourceOutcode & bit)
+				{
+					b3Scalar lambda = (-source[i] - aabbHalfExtent[i] * normSign) / r[i];
+					if (lambda_enter <= lambda)
+					{
+						lambda_enter = lambda;  // keep the latest entry plane and remember its axis
+						hitNormal.setValue(0, 0, 0);
+						hitNormal[i] = normSign;
+					}
+				}
+				else if (targetOutcode & bit)
+				{
+					b3Scalar lambda = (-source[i] - aabbHalfExtent[i] * normSign) / r[i];
+					b3SetMin(lambda_exit, lambda);  // keep the earliest exit plane
+				}
+				bit <<= 1;
+			}
+			normSign = b3Scalar(-1.);
+		}
+		if (lambda_enter <= lambda_exit)
+		{
+			param = lambda_enter;
+			normal = hitNormal;
+			return true;
+		}
+	}
+	return false;
+}
+
+/// Computes the world-space aabb of a box given by its half extents, centred at the origin
+/// of transform t. margin is added to every half extent before the box is transformed.
+B3_FORCE_INLINE void b3TransformAabb(const b3Vector3& halfExtents, b3Scalar margin, const b3Transform& t, b3Vector3& aabbMinOut, b3Vector3& aabbMaxOut)
+{
+	b3Vector3 paddedHalfExtents = halfExtents + b3MakeVector3(margin, margin, margin);
+	// The absolute value of the rotation basis gives the extent of the rotated box per world axis.
+	b3Matrix3x3 absBasis = t.getBasis().absolute();
+	b3Vector3 worldCenter = t.getOrigin();
+	b3Vector3 worldExtent = paddedHalfExtents.dot3(absBasis[0], absBasis[1], absBasis[2]);
+
+	aabbMinOut = worldCenter - worldExtent;
+	aabbMaxOut = worldCenter + worldExtent;
+}
+
+/// Computes the world-space aabb that encloses a local-space aabb after applying trans.
+/// margin is added to every local half extent before the box is transformed.
+/// The local bounds are not checked for min <= max.
+B3_FORCE_INLINE void b3TransformAabb(const b3Vector3& localAabbMin, const b3Vector3& localAabbMax, b3Scalar margin, const b3Transform& trans, b3Vector3& aabbMinOut, b3Vector3& aabbMaxOut)
+{
+	// half size of the local box, padded by the margin
+	b3Vector3 paddedHalfExtents = b3Scalar(0.5) * (localAabbMax - localAabbMin);
+	paddedHalfExtents += b3MakeVector3(margin, margin, margin);
+
+	// centre of the local box, moved into world space
+	b3Vector3 centerLocal = b3Scalar(0.5) * (localAabbMax + localAabbMin);
+	b3Matrix3x3 absBasis = trans.getBasis().absolute();
+	b3Vector3 centerWorld = trans(centerLocal);
+
+	b3Vector3 worldExtent = paddedHalfExtents.dot3(absBasis[0], absBasis[1], absBasis[2]);
+	aabbMinOut = centerWorld - worldExtent;
+	aabbMaxOut = centerWorld + worldExtent;
+}
+
+#define B3_USE_BANCHLESS 1  // always defined here, so the branch-free variant below is the one compiled
+#ifdef B3_USE_BANCHLESS
+//This block replaces the block below and uses no branches, and replaces the 8 bit return with a 32 bit return for improved performance (~3x on XBox 360)
+B3_FORCE_INLINE unsigned b3TestQuantizedAabbAgainstQuantizedAabb(const unsigned short int* aabbMin1, const unsigned short int* aabbMax1, const unsigned short int* aabbMin2, const unsigned short int* aabbMax2)
+{  /* Overlap test for two aabbs with quantized (unsigned short) bounds, three values each.
+	  All six interval comparisons are combined with bitwise AND; the result is 1 when the
+	  boxes overlap or touch on every axis and 0 otherwise. */
+	return static_cast<unsigned int>(b3Select((unsigned)((aabbMin1[0] <= aabbMax2[0]) & (aabbMax1[0] >= aabbMin2[0]) & (aabbMin1[2] <= aabbMax2[2]) & (aabbMax1[2] >= aabbMin2[2]) & (aabbMin1[1] <= aabbMax2[1]) & (aabbMax1[1] >= aabbMin2[1])),
+											  1, 0));
+}
+#else
+B3_FORCE_INLINE bool b3TestQuantizedAabbAgainstQuantizedAabb(const unsigned short int* aabbMin1, const unsigned short int* aabbMax1, const unsigned short int* aabbMin2, const unsigned short int* aabbMax2)
+{  /* Branching variant of the same test: overlap is cleared when the intervals are
+	  disjoint on any axis. */
+	bool overlap = true;
+	overlap = (aabbMin1[0] > aabbMax2[0] || aabbMax1[0] < aabbMin2[0]) ? false : overlap;
+	overlap = (aabbMin1[2] > aabbMax2[2] || aabbMax1[2] < aabbMin2[2]) ? false : overlap;
+	overlap = (aabbMin1[1] > aabbMax2[1] || aabbMax1[1] < aabbMin2[1]) ? false : overlap;
+	return overlap;
+}
+#endif  //B3_USE_BANCHLESS
+
+#endif  //B3_AABB_UTIL2

+ 2745 - 0
Dependencies/include/bullet3/Bullet3Geometry/b3ConvexHullComputer.cpp

@@ -0,0 +1,2745 @@
+/*
+Copyright (c) 2011 Ole Kniemeyer, MAXON, www.maxon.net
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include <string.h>
+
+#include "b3ConvexHullComputer.h"
+#include "Bullet3Common/b3AlignedObjectArray.h"
+#include "Bullet3Common/b3MinMax.h"
+#include "Bullet3Common/b3Vector3.h"
+
+#ifdef __GNUC__  // fixed-width integer aliases used by the integer hull arithmetic below
+#include <stdint.h>
+typedef int32_t btInt32_t;
+typedef int64_t btInt64_t;
+typedef uint32_t btUint32_t;
+typedef uint64_t btUint64_t;
+#elif defined(_MSC_VER)  // MSVC: compiler-specific sized integer types
+typedef __int32 btInt32_t;
+typedef __int64 btInt64_t;
+typedef unsigned __int32 btUint32_t;
+typedef unsigned __int64 btUint64_t;
+#else  // fallback: assumes int is 32 bits and long long is 64 bits
+typedef int btInt32_t;
+typedef long long int btInt64_t;
+typedef unsigned int btUint32_t;
+typedef unsigned long long int btUint64_t;
+#endif
+
+//The definition of USE_X86_64_ASM is moved into the build system. You can enable it manually by commenting out the following lines
+//#if (defined(__GNUC__) && defined(__x86_64__) && !defined(__ICL))  // || (defined(__ICL) && defined(_M_X64))   bug in Intel compiler, disable inline assembly
+//	#define USE_X86_64_ASM
+//#endif
+
+//#define DEBUG_CONVEX_HULL
+//#define SHOW_ITERATIONS
+
+#if defined(DEBUG_CONVEX_HULL) || defined(SHOW_ITERATIONS)
+#include <stdio.h>
+#endif
+
+// Convex hull implementation based on Preparata and Hong
+// Ole Kniemeyer, MAXON Computer GmbH
+class b3ConvexHullInternal
+{
+public:
+	// Integer 3D vector with 64-bit components.
+	class Point64
+	{
+	public:
+		btInt64_t x;
+		btInt64_t y;
+		btInt64_t z;
+
+		Point64(btInt64_t x, btInt64_t y, btInt64_t z) : x(x), y(y), z(z)
+		{
+		}
+
+		// True when all three components are zero. Const-qualified so that it can also
+		// be called on const objects and on temporaries bound to const references.
+		bool isZero() const
+		{
+			return (x == 0) && (y == 0) && (z == 0);
+		}
+
+		// 64-bit dot product; the products are not checked for overflow.
+		btInt64_t dot(const Point64& b) const
+		{
+			return x * b.x + y * b.y + z * b.z;
+		}
+	};
+
+	// Integer 3D point with 32-bit components plus an index tag.
+	// Points built from coordinates get index -1; the default constructor leaves every
+	// member uninitialized.
+	class Point32
+	{
+	public:
+		btInt32_t x;
+		btInt32_t y;
+		btInt32_t z;
+		int index;
+
+		Point32()
+		{
+		}
+
+		Point32(btInt32_t x, btInt32_t y, btInt32_t z) : x(x), y(y), z(z), index(-1)
+		{
+		}
+
+		// Equality compares the coordinates only; index is ignored.
+		bool operator==(const Point32& b) const
+		{
+			return (x == b.x) && (y == b.y) && (z == b.z);
+		}
+
+		bool operator!=(const Point32& b) const
+		{
+			return (x != b.x) || (y != b.y) || (z != b.z);
+		}
+
+		// True when all three coordinates are zero.
+		bool isZero() const
+		{
+			return (x == 0) && (y == 0) && (z == 0);
+		}
+
+		// Cross product with another 32-bit point, returned with 64-bit components.
+		// The left operand of every product is widened first: multiplying two 32-bit
+		// values directly would be carried out in 32-bit arithmetic and could overflow
+		// before the result reaches the 64-bit destination.
+		Point64 cross(const Point32& b) const
+		{
+			return Point64((btInt64_t)y * b.z - (btInt64_t)z * b.y,
+						   (btInt64_t)z * b.x - (btInt64_t)x * b.z,
+						   (btInt64_t)x * b.y - (btInt64_t)y * b.x);
+		}
+
+		// Cross product with a 64-bit point; b's components already force 64-bit arithmetic.
+		Point64 cross(const Point64& b) const
+		{
+			return Point64(y * b.z - z * b.y, z * b.x - x * b.z, x * b.y - y * b.x);
+		}
+
+		// Dot product with another 32-bit point, computed in 64-bit arithmetic
+		// (operands are widened for the same reason as in cross above).
+		btInt64_t dot(const Point32& b) const
+		{
+			return (btInt64_t)x * b.x + (btInt64_t)y * b.y + (btInt64_t)z * b.z;
+		}
+
+		// Dot product with a 64-bit point.
+		btInt64_t dot(const Point64& b) const
+		{
+			return x * b.x + y * b.y + z * b.z;
+		}
+
+		// Component-wise sum and difference in 32-bit arithmetic; the result has index -1.
+		Point32 operator+(const Point32& b) const
+		{
+			return Point32(x + b.x, y + b.y, z + b.z);
+		}
+
+		Point32 operator-(const Point32& b) const
+		{
+			return Point32(x - b.x, y - b.y, z - b.z);
+		}
+	};
+
+	class Int128
+	{  /* 128-bit integer kept as two 64-bit halves (low, high). Negative values are held in
+		  two's complement form: getSign() and toScalar() read the top bit of high as the
+		  sign, and unary minus negates across both halves. */
+	public:
+		btUint64_t low;
+		btUint64_t high;
+
+		Int128()  // leaves low and high uninitialized
+		{
+		}
+
+		Int128(btUint64_t low, btUint64_t high) : low(low), high(high)
+		{
+		}
+
+		Int128(btUint64_t low) : low(low), high(0)  // zero-extends an unsigned 64-bit value
+		{
+		}
+
+		Int128(btInt64_t value) : low(value), high((value >= 0) ? 0 : (btUint64_t)-1LL)  // sign-extends a signed 64-bit value
+		{
+		}
+
+		static Int128 mul(btInt64_t a, btInt64_t b);  // defined outside this class body
+
+		static Int128 mul(btUint64_t a, btUint64_t b);  // defined outside this class body
+
+		Int128 operator-() const  // two's complement negation; ~high gains a carry only when low is 0
+		{
+			return Int128((btUint64_t) - (btInt64_t)low, ~high + (low == 0));
+		}
+
+		Int128 operator+(const Int128& b) const
+		{
+#ifdef USE_X86_64_ASM
+			Int128 result;
+			__asm__(
+				"addq %[bl], %[rl]\n\t"
+				"adcq %[bh], %[rh]\n\t"
+				: [rl] "=r"(result.low), [rh] "=r"(result.high)
+				: "0"(low), "1"(high), [bl] "g"(b.low), [bh] "g"(b.high)
+				: "cc");
+			return result;
+#else
+			btUint64_t lo = low + b.low;
+			return Int128(lo, high + b.high + (lo < low));  // (lo < low) is the carry out of the low half
+#endif
+		}
+
+		Int128 operator-(const Int128& b) const
+		{
+#ifdef USE_X86_64_ASM
+			Int128 result;
+			__asm__(
+				"subq %[bl], %[rl]\n\t"
+				"sbbq %[bh], %[rh]\n\t"
+				: [rl] "=r"(result.low), [rh] "=r"(result.high)
+				: "0"(low), "1"(high), [bl] "g"(b.low), [bh] "g"(b.high)
+				: "cc");
+			return result;
+#else
+			return *this + -b;  // subtraction as addition of the negated operand
+#endif
+		}
+
+		Int128& operator+=(const Int128& b)
+		{
+#ifdef USE_X86_64_ASM
+			__asm__(
+				"addq %[bl], %[rl]\n\t"
+				"adcq %[bh], %[rh]\n\t"
+				: [rl] "=r"(low), [rh] "=r"(high)
+				: "0"(low), "1"(high), [bl] "g"(b.low), [bh] "g"(b.high)
+				: "cc");
+#else
+			btUint64_t lo = low + b.low;
+			if (lo < low)  // the low half wrapped around: propagate the carry
+			{
+				++high;
+			}
+			low = lo;
+			high += b.high;
+#endif
+			return *this;
+		}
+
+		Int128& operator++()  // pre-increment with carry into the high half
+		{
+			if (++low == 0)
+			{
+				++high;
+			}
+			return *this;
+		}
+
+		Int128 operator*(btInt64_t b) const;  // defined outside this class body
+
+		b3Scalar toScalar() const  // non-negative: high * 2^64 + low; negative: minus the scalar of the negated value
+		{
+			return ((btInt64_t)high >= 0) ? b3Scalar(high) * (b3Scalar(0x100000000LL) * b3Scalar(0x100000000LL)) + b3Scalar(low)
+										  : -(-*this).toScalar();
+		}
+
+		int getSign() const  // -1, 0 or 1
+		{
+			return ((btInt64_t)high < 0) ? -1 : (high || low) ? 1 : 0;
+		}
+
+		bool operator<(const Int128& b) const  // compares both halves as unsigned values; NOTE(review): confirm callers only need that ordering
+		{
+			return (high < b.high) || ((high == b.high) && (low < b.low));
+		}
+
+		int ucmp(const Int128& b) const  // unsigned three-way comparison: -1, 0 or 1
+		{
+			if (high < b.high)
+			{
+				return -1;
+			}
+			if (high > b.high)
+			{
+				return 1;
+			}
+			if (low < b.low)
+			{
+				return -1;
+			}
+			if (low > b.low)
+			{
+				return 1;
+			}
+			return 0;
+		}
+	};
+
+	// Rational number stored as a sign (-1, 0 or 1) together with an unsigned 64-bit
+	// numerator and denominator. A zero denominator encodes an infinity when the sign is
+	// non-zero and NaN when the sign is zero.
+	class Rational64
+	{
+	private:
+		btUint64_t m_numerator;
+		btUint64_t m_denominator;
+		int sign;
+
+	public:
+		Rational64(btInt64_t numerator, btInt64_t denominator)
+		{
+			// magnitude and sign of the numerator
+			const bool numeratorIsNegative = numerator < 0;
+			m_numerator = numeratorIsNegative ? (btUint64_t)-numerator : (btUint64_t)numerator;
+			sign = numeratorIsNegative ? -1 : ((numerator > 0) ? 1 : 0);
+
+			// a negative denominator flips the overall sign
+			if (denominator < 0)
+			{
+				sign = -sign;
+				m_denominator = (btUint64_t)-denominator;
+			}
+			else
+			{
+				m_denominator = (btUint64_t)denominator;
+			}
+		}
+
+		bool isNegativeInfinity() const
+		{
+			return (m_denominator == 0) && (sign < 0);
+		}
+
+		bool isNaN() const
+		{
+			return (m_denominator == 0) && (sign == 0);
+		}
+
+		int compare(const Rational64& b) const;
+
+		// Floating-point value; a zero denominator yields sign * B3_INFINITY.
+		b3Scalar toScalar() const
+		{
+			return sign * ((m_denominator == 0) ? B3_INFINITY : (b3Scalar)m_numerator / m_denominator);
+		}
+	};
+
+	class Rational128
+	{  /* Rational number stored as a sign (-1, 0 or 1) plus Int128 numerator and denominator
+		  that are both kept non-negative. isInt64 records which constructor built the
+		  value: true for a plain 64-bit integer, false for a numerator/denominator pair. */
+	private:
+		Int128 numerator;
+		Int128 denominator;
+		int sign;
+		bool isInt64;
+
+	public:
+		Rational128(btInt64_t value)  // integer value: the denominator is set to 1
+		{
+			if (value > 0)
+			{
+				sign = 1;
+				this->numerator = value;
+			}
+			else if (value < 0)
+			{
+				sign = -1;
+				this->numerator = -value;  // store the magnitude; the sign is kept separately
+			}
+			else
+			{
+				sign = 0;
+				this->numerator = (btUint64_t)0;
+			}
+			this->denominator = (btUint64_t)1;
+			isInt64 = true;
+		}
+
+		Rational128(const Int128& numerator, const Int128& denominator)
+		{
+			sign = numerator.getSign();
+			if (sign >= 0)
+			{
+				this->numerator = numerator;
+			}
+			else
+			{
+				this->numerator = -numerator;
+			}
+			int dsign = denominator.getSign();
+			if (dsign >= 0)
+			{
+				this->denominator = denominator;
+			}
+			else
+			{
+				sign = -sign;  // a negative denominator flips the overall sign
+				this->denominator = -denominator;
+			}
+			isInt64 = false;
+		}
+
+		int compare(const Rational128& b) const;  // defined outside this class body
+
+		int compare(btInt64_t b) const;  // defined outside this class body
+
+		b3Scalar toScalar() const  // a zero denominator yields sign * B3_INFINITY
+		{
+			return sign * ((denominator.getSign() == 0) ? B3_INFINITY : numerator.toScalar() / denominator.toScalar());
+		}
+	};
+
+	// 3D point with rational coordinates: x, y and z are 128-bit numerators that share a
+	// single 128-bit denominator. The default constructor leaves all members unset.
+	class PointR128
+	{
+	public:
+		Int128 x;
+		Int128 y;
+		Int128 z;
+		Int128 denominator;
+
+		PointR128()
+		{
+		}
+
+		PointR128(Int128 x, Int128 y, Int128 z, Int128 denominator) : x(x), y(y), z(z), denominator(denominator)
+		{
+		}
+
+		// Floating-point value of each coordinate: numerator divided by the shared denominator.
+		b3Scalar xvalue() const
+		{
+			const b3Scalar numeratorX = x.toScalar();
+			return numeratorX / denominator.toScalar();
+		}
+
+		b3Scalar yvalue() const
+		{
+			const b3Scalar numeratorY = y.toScalar();
+			return numeratorY / denominator.toScalar();
+		}
+
+		b3Scalar zvalue() const
+		{
+			const b3Scalar numeratorZ = z.toScalar();
+			return numeratorZ / denominator.toScalar();
+		}
+	};
+
+	class Edge;
+	class Face;
+
+	// Hull vertex. Vertices are chained through next/prev, `edges` is the head
+	// of the circular list of edges leaving the vertex, and
+	// firstNearbyFace/lastNearbyFace delimit the list of faces that name this
+	// vertex as their nearbyVertex.
+	class Vertex
+	{
+	public:
+		Vertex* next;
+		Vertex* prev;
+		Edge* edges;
+		Face* firstNearbyFace;
+		Face* lastNearbyFace;
+		PointR128 point128;  // coordinates used when point.index < 0
+		Point32 point;       // coordinates used when point.index >= 0
+		int copy;
+
+		Vertex() : next(NULL), prev(NULL), edges(NULL), firstNearbyFace(NULL), lastNearbyFace(NULL), copy(-1)
+		{
+		}
+
+#ifdef DEBUG_CONVEX_HULL
+		// Prints the index and the 32-bit coordinates of this vertex.
+		void print()
+		{
+			b3Printf("V%d (%d, %d, %d)", point.index, point.x, point.y, point.z);
+		}
+
+		void printGraph();
+#endif
+
+		// Difference of the 32-bit coordinates of two vertices.
+		Point32 operator-(const Vertex& b) const
+		{
+			return point - b.point;
+		}
+
+		// Dot product with b, taken from the 32-bit point when it has a
+		// non-negative index and from the 128-bit rational point otherwise.
+		Rational128 dot(const Point64& b) const
+		{
+			if (point.index < 0)
+			{
+				return Rational128(point128.x * b.x + point128.y * b.y + point128.z * b.z, point128.denominator);
+			}
+			return Rational128(point.dot(b));
+		}
+
+		b3Scalar xvalue() const
+		{
+			if (point.index >= 0)
+			{
+				return b3Scalar(point.x);
+			}
+			return point128.xvalue();
+		}
+
+		b3Scalar yvalue() const
+		{
+			if (point.index >= 0)
+			{
+				return b3Scalar(point.y);
+			}
+			return point128.yvalue();
+		}
+
+		b3Scalar zvalue() const
+		{
+			if (point.index >= 0)
+			{
+				return b3Scalar(point.z);
+			}
+			return point128.zvalue();
+		}
+
+		// Appends src's nearby-face list to this vertex's list, re-targets
+		// every moved face at this vertex and leaves src with an empty list.
+		void receiveNearbyFaces(Vertex* src)
+		{
+			Face* const incoming = src->firstNearbyFace;
+			if (!lastNearbyFace)
+			{
+				firstNearbyFace = incoming;
+			}
+			else
+			{
+				lastNearbyFace->nextWithSameNearbyVertex = incoming;
+			}
+			if (src->lastNearbyFace)
+			{
+				lastNearbyFace = src->lastNearbyFace;
+			}
+			Face* f = incoming;
+			while (f)
+			{
+				b3Assert(f->nearbyVertex == src);
+				f->nearbyVertex = this;
+				f = f->nextWithSameNearbyVertex;
+			}
+			src->firstNearbyFace = NULL;
+			src->lastNearbyFace = NULL;
+		}
+	};
+
+	// Directed half-edge. `reverse` is the oppositely directed twin, `target`
+	// is the vertex the edge points at, and next/prev chain the edges that
+	// leave the same origin vertex (the origin is reverse->target).
+	class Edge
+	{
+	public:
+		Edge* next;
+		Edge* prev;
+		Edge* reverse;
+		Vertex* target;
+		Face* face;
+		int copy;
+
+		// Clears every pointer so a released edge no longer references the graph.
+		~Edge()
+		{
+			face = NULL;
+			target = NULL;
+			reverse = NULL;
+			prev = NULL;
+			next = NULL;
+		}
+
+		// Makes n the successor of this edge; both must leave the same vertex.
+		void link(Edge* n)
+		{
+			b3Assert(reverse->target == n->reverse->target);
+			n->prev = this;
+			next = n;
+		}
+
+#ifdef DEBUG_CONVEX_HULL
+		// Prints the edge address, its end-point indices, its neighbours and
+		// the coordinates of both end points.
+		void print()
+		{
+			Vertex* from = reverse->target;
+			b3Printf("E%p : %d -> %d,  n=%p p=%p   (0 %d\t%d\t%d) -> (%d %d %d)", this, from->point.index, target->point.index, next, prev,
+					 from->point.x, from->point.y, from->point.z, target->point.x, target->point.y, target->point.z);
+		}
+#endif
+	};
+
+	// Face described by an origin point and two direction vectors. Faces that
+	// share the same nearbyVertex are chained through
+	// nextWithSameNearbyVertex.
+	class Face
+	{
+	public:
+		Face* next;
+		Vertex* nearbyVertex;
+		Face* nextWithSameNearbyVertex;
+		Point32 origin;
+		Point32 dir0;
+		Point32 dir1;
+
+		Face() : next(NULL), nearbyVertex(NULL), nextWithSameNearbyVertex(NULL)
+		{
+		}
+
+		// Sets the face up from the triangle (a, b, c) and appends it to the
+		// nearby-face list of a.
+		void init(Vertex* a, Vertex* b, Vertex* c)
+		{
+			origin = a->point;
+			dir0 = *b - *a;
+			dir1 = *c - *a;
+			nearbyVertex = a;
+
+			Face* tail = a->lastNearbyFace;
+			if (!tail)
+			{
+				a->firstNearbyFace = this;
+			}
+			else
+			{
+				tail->nextWithSameNearbyVertex = this;
+			}
+			a->lastNearbyFace = this;
+		}
+
+		// Cross product of the two direction vectors.
+		Point64 getNormal()
+		{
+			Point64 normal = dir0.cross(dir1);
+			return normal;
+		}
+	};
+
+	// Full-width multiplication of two UWord values into a (low, high) pair of
+	// UWord results. The private helpers are overloaded for the two word sizes
+	// the file instantiates: btUint64_t (32-bit halves) and Int128 (64-bit
+	// halves). UHWord is not referenced inside the class body.
+	template <typename UWord, typename UHWord>
+	class DMul
+	{
+	private:
+		// --- helpers for UWord == btUint64_t ---
+		static btUint32_t high(btUint64_t value)
+		{
+			return (btUint32_t)(value >> 32);
+		}
+
+		static btUint32_t low(btUint64_t value)
+		{
+			return (btUint32_t)value;
+		}
+
+		static btUint64_t mul(btUint32_t a, btUint32_t b)
+		{
+			return (btUint64_t)a * (btUint64_t)b;
+		}
+
+		static void shlHalf(btUint64_t& value)
+		{
+			value <<= 32;
+		}
+
+		// --- helpers for UWord == Int128 ---
+		static btUint64_t high(Int128 value)
+		{
+			return value.high;
+		}
+
+		static btUint64_t low(Int128 value)
+		{
+			return value.low;
+		}
+
+		static Int128 mul(btUint64_t a, btUint64_t b)
+		{
+			return Int128::mul(a, b);
+		}
+
+		static void shlHalf(Int128& value)
+		{
+			value.high = value.low;
+			value.low = 0;
+		}
+
+	public:
+		// Computes a * b and stores the lower word in resLow and the upper
+		// word in resHigh, using the four half-word partial products.
+		static void mul(UWord a, UWord b, UWord& resLow, UWord& resHigh)
+		{
+			UWord lowLow = mul(low(a), low(b));
+			UWord lowHigh = mul(low(a), high(b));
+			UWord highLow = mul(high(a), low(b));
+			UWord highHigh = mul(high(a), high(b));
+
+			// Sum of the low halves of the two cross products; its own upper
+			// half carries into the high word.
+			UWord cross = UWord(low(lowHigh)) + UWord(low(highLow));
+			highHigh += high(lowHigh);
+			highHigh += high(highLow);
+			highHigh += high(cross);
+
+			shlHalf(cross);
+			lowLow += cross;
+			if (lowLow < cross)
+			{
+				// the addition into the low word wrapped around
+				++highHigh;
+			}
+			resLow = lowLow;
+			resHigh = highHigh;
+		}
+	};
+
+private:
+	// Handle to a partial hull: the vertices that are extreme when ordered by
+	// (x, then y) and by (y, then x). All four are NULL for an empty hull.
+	class IntermediateHull
+	{
+	public:
+		Vertex* minXy;
+		Vertex* maxXy;
+		Vertex* minYx;
+		Vertex* maxYx;
+
+		IntermediateHull()
+		{
+			minXy = maxXy = minYx = maxYx = NULL;
+		}
+
+		void print();
+	};
+
+	// Result of getOrientation(): relative order of two edges around their
+	// common origin vertex.
+	enum Orientation
+	{
+		NONE = 0,
+		CLOCKWISE = 1,
+		COUNTER_CLOCKWISE = 2
+	};
+
+	// One chunk of raw, 16-byte aligned storage for `size` objects of type T.
+	// Chunks are chained through `next`. No T constructors or destructors are
+	// run by this class.
+	template <typename T>
+	class PoolArray
+	{
+	private:
+		T* array;
+		int size;
+
+	public:
+		PoolArray<T>* next;
+
+		PoolArray(int size) : size(size), next(NULL)
+		{
+			array = (T*)b3AlignedAlloc(sizeof(T) * size, 16);
+		}
+
+		~PoolArray()
+		{
+			b3AlignedFree(array);
+		}
+
+		// Threads every slot of the chunk into a singly linked free list
+		// (through T::next, last slot terminated with NULL) and returns the
+		// first slot.
+		T* init()
+		{
+			for (int i = 0; i < size; ++i)
+			{
+				T* slot = array + i;
+				slot->next = (i + 1 < size) ? slot + 1 : NULL;
+			}
+			return array;
+		}
+	};
+
+	// Free-list allocator for T built on a chain of PoolArray chunks.
+	// reset() makes the already allocated chunks available again without
+	// releasing them (and without running any destructors); chunks are only
+	// freed by the Pool destructor.
+	template <typename T>
+	class Pool
+	{
+	private:
+		PoolArray<T>* arrays;     // every chunk owned by the pool
+		PoolArray<T>* nextArray;  // next already-allocated chunk to hand out after a reset()
+		T* freeObjects;           // head of the free-slot list
+		int arraySize;            // slot count used for newly allocated chunks
+
+	public:
+		Pool() : arrays(NULL), nextArray(NULL), freeObjects(NULL), arraySize(256)
+		{
+		}
+
+		~Pool()
+		{
+			for (PoolArray<T>* p = arrays; p; p = arrays)
+			{
+				arrays = p->next;
+				p->~PoolArray<T>();
+				b3AlignedFree(p);
+			}
+		}
+
+		void reset()
+		{
+			freeObjects = NULL;
+			nextArray = arrays;
+		}
+
+		// Only affects chunks allocated after this call.
+		void setArraySize(int arraySize)
+		{
+			this->arraySize = arraySize;
+		}
+
+		// Returns a default-constructed T placed in a pooled slot.
+		T* newObject()
+		{
+			if (!freeObjects)
+			{
+				// Free list exhausted: reuse the next existing chunk, or
+				// allocate a new one and put it at the head of the chain.
+				PoolArray<T>* chunk = nextArray;
+				if (chunk)
+				{
+					nextArray = chunk->next;
+				}
+				else
+				{
+					chunk = new (b3AlignedAlloc(sizeof(PoolArray<T>), 16)) PoolArray<T>(arraySize);
+					chunk->next = arrays;
+					arrays = chunk;
+				}
+				freeObjects = chunk->init();
+			}
+			T* slot = freeObjects;
+			freeObjects = slot->next;
+			return new (slot) T();
+		}
+
+		// Destroys the object and pushes its slot onto the free list.
+		void freeObject(T* object)
+		{
+			object->~T();
+			object->next = freeObjects;
+			freeObjects = object;
+		}
+	};
+
+	b3Vector3 scaling;  // set in compute(): input extent divided by 10216, possibly negated
+	b3Vector3 center;  // set in compute(): midpoint of the input bounding box
+	Pool<Vertex> vertexPool;
+	Pool<Edge> edgePool;
+	Pool<Face> facePool;
+	b3AlignedObjectArray<Vertex*> originalVertices;  // filled by compute(): one vertex per input point, in sorted point order
+	int mergeStamp;  // decremented by merge(); newEdgePair() stores it in Edge::copy
+	int minAxis;  // minAxis/medAxis/maxAxis: axis indices chosen in compute() from the input extent
+	int medAxis;
+	int maxAxis;
+	int usedEdgePairs;  // incremented by newEdgePair(), decremented by removeEdgePair()
+	int maxUsedEdgePairs;  // largest value usedEdgePairs has reached
+
+	static Orientation getOrientation(const Edge* prev, const Edge* next, const Point32& s, const Point32& t);
+	Edge* findMaxAngle(bool ccw, const Vertex* start, const Point32& s, const Point64& rxs, const Point64& sxrxs, Rational64& minCot);
+	void findEdgeForCoplanarFaces(Vertex* c0, Vertex* c1, Edge*& e0, Edge*& e1, Vertex* stop0, Vertex* stop1);
+
+	Edge* newEdgePair(Vertex* from, Vertex* to);
+
+	// Unlinks `edge` and its reverse from the edge rings of their origin
+	// vertices, fixes up the vertices' `edges` heads (NULL when the ring
+	// becomes empty) and returns both edges to the pool.
+	void removeEdgePair(Edge* edge)
+	{
+		Edge* rev = edge->reverse;
+		Edge* succ = edge->next;
+
+		b3Assert(edge->target && rev->target);
+
+		// Ring around the origin of `edge` (which is rev->target).
+		const bool edgeHasSiblings = (succ != edge);
+		if (edgeHasSiblings)
+		{
+			succ->prev = edge->prev;
+			edge->prev->next = succ;
+		}
+		rev->target->edges = edgeHasSiblings ? succ : NULL;
+
+		// Ring around the origin of the reverse edge (which is edge->target).
+		succ = rev->next;
+		const bool revHasSiblings = (succ != rev);
+		if (revHasSiblings)
+		{
+			succ->prev = rev->prev;
+			rev->prev->next = succ;
+		}
+		edge->target->edges = revHasSiblings ? succ : NULL;
+
+		edgePool.freeObject(edge);
+		edgePool.freeObject(rev);
+		--usedEdgePairs;
+	}
+
+	void computeInternal(int start, int end, IntermediateHull& result);
+
+	bool mergeProjection(IntermediateHull& h0, IntermediateHull& h1, Vertex*& c0, Vertex*& c1);
+
+	void merge(IntermediateHull& h0, IntermediateHull& h1);
+
+	b3Vector3 toBtVector(const Point32& v);
+
+	b3Vector3 getBtNormal(Face* face);
+
+	bool shiftFace(Face* face, b3Scalar amount, b3AlignedObjectArray<Vertex*> stack);
+
+public:
+	Vertex* vertexList;
+
+	void compute(const void* coords, bool doubleCoords, int stride, int count);
+
+	b3Vector3 getCoordinates(const Vertex* v);
+
+	b3Scalar shrink(b3Scalar amount, b3Scalar clampAmount);
+};
+
+// Multiplies this 128-bit value by a signed 64-bit factor. The product is
+// formed on the magnitudes and the combined sign is applied at the end; only
+// the lower 128 bits of the result are kept.
+b3ConvexHullInternal::Int128 b3ConvexHullInternal::Int128::operator*(btInt64_t b) const
+{
+	const bool selfNegative = (btInt64_t)high < 0;
+	bool flipSign = selfNegative;
+	if (b < 0)
+	{
+		flipSign = !flipSign;
+		b = -b;
+	}
+	Int128 magnitude = selfNegative ? -*this : *this;
+	Int128 product = mul(magnitude.low, (btUint64_t)b);
+	product.high += magnitude.high * (btUint64_t)b;
+	return flipSign ? -product : product;
+}
+
+// Signed 64 x 64 -> 128 bit multiplication. Uses a single imulq when
+// USE_X86_64_ASM is defined, otherwise multiplies the magnitudes with DMul
+// and negates the result when exactly one operand is negative.
+b3ConvexHullInternal::Int128 b3ConvexHullInternal::Int128::mul(btInt64_t a, btInt64_t b)
+{
+	Int128 result;
+
+#ifdef USE_X86_64_ASM
+	__asm__("imulq %[b]"
+			: "=a"(result.low), "=d"(result.high)
+			: "0"(a), [b] "r"(b)
+			: "cc");
+	return result;
+
+#else
+	const bool flipSign = (a < 0) != (b < 0);
+	const btUint64_t magA = (btUint64_t)((a < 0) ? -a : a);
+	const btUint64_t magB = (btUint64_t)((b < 0) ? -b : b);
+	DMul<btUint64_t, btUint32_t>::mul(magA, magB, result.low, result.high);
+	return flipSign ? -result : result;
+#endif
+}
+
+// Unsigned 64 x 64 -> 128 bit multiplication: a single mulq when
+// USE_X86_64_ASM is defined, the portable DMul routine otherwise.
+b3ConvexHullInternal::Int128 b3ConvexHullInternal::Int128::mul(btUint64_t a, btUint64_t b)
+{
+	Int128 product;
+
+#ifdef USE_X86_64_ASM
+	__asm__("mulq %[b]"
+			: "=a"(product.low), "=d"(product.high)
+			: "0"(a), [b] "r"(b)
+			: "cc");
+#else
+	DMul<btUint64_t, btUint32_t>::mul(a, b, product.low, product.high);
+#endif
+
+	return product;
+}
+
+// Three-way comparison of two rationals: the result is negative, zero or
+// positive when *this is smaller than, equal to or larger than b.
+// Values with different signs are ordered by sign alone; otherwise the
+// cross products m_numerator * b.m_denominator and
+// m_denominator * b.m_numerator are compared as unsigned 128-bit numbers and
+// the outcome is oriented by the common sign.
+int b3ConvexHullInternal::Rational64::compare(const Rational64& b) const
+{
+	if (sign != b.sign)
+	{
+		return sign - b.sign;
+	}
+	else if (sign == 0)
+	{
+		return 0;
+	}
+
+#ifdef USE_X86_64_ASM
+
+	// The operands below must name the actual data members (m_numerator /
+	// m_denominator); the unprefixed names do not exist in Rational64.
+	int result;
+	btInt64_t tmp;
+	btInt64_t dummy;
+	__asm__(
+		"mulq %[bn]\n\t"
+		"movq %%rax, %[tmp]\n\t"
+		"movq %%rdx, %%rbx\n\t"
+		"movq %[tn], %%rax\n\t"
+		"mulq %[bd]\n\t"
+		"subq %[tmp], %%rax\n\t"
+		"sbbq %%rbx, %%rdx\n\t"  // rdx:rax contains 128-bit-difference "numerator*b.denominator - b.numerator*denominator"
+		"setnsb %%bh\n\t"        // bh=1 if difference is non-negative, bh=0 otherwise
+		"orq %%rdx, %%rax\n\t"
+		"setnzb %%bl\n\t"      // bl=1 if difference if non-zero, bl=0 if it is zero
+		"decb %%bh\n\t"        // now bx=0x0000 if difference is zero, 0xff01 if it is negative, 0x0001 if it is positive (i.e., same sign as difference)
+		"shll $16, %%ebx\n\t"  // ebx has same sign as difference
+		: "=&b"(result), [tmp] "=&r"(tmp), "=a"(dummy)
+		: "a"(m_denominator), [bn] "g"(b.m_numerator), [tn] "g"(m_numerator), [bd] "g"(b.m_denominator)
+		: "%rdx", "cc");
+	return result ? result ^ sign  // if sign is +1, only bit 0 of result is inverted, which does not change the sign of result (and cannot result in zero)
+								   // if sign is -1, all bits of result are inverted, which changes the sign of result (and again cannot result in zero)
+				  : 0;
+
+#else
+
+	return sign * Int128::mul(m_numerator, b.m_denominator).ucmp(Int128::mul(m_denominator, b.m_numerator));
+
+#endif
+}
+
+// Three-way comparison against another Rational128. Differing signs decide
+// immediately; integer values are delegated to compare(btInt64_t); otherwise
+// the 256-bit cross products numerator * b.denominator and
+// denominator * b.numerator are compared word by word.
+int b3ConvexHullInternal::Rational128::compare(const Rational128& b) const
+{
+	if (sign != b.sign)
+	{
+		return sign - b.sign;
+	}
+	if (sign == 0)
+	{
+		return 0;
+	}
+	if (isInt64)
+	{
+		// Compare b against this integer and invert the answer.
+		return -b.compare(sign * (btInt64_t)numerator.low);
+	}
+
+	Int128 lhsLow, lhsHigh, rhsLow, rhsHigh;
+	DMul<Int128, btUint64_t>::mul(numerator, b.denominator, lhsLow, lhsHigh);
+	DMul<Int128, btUint64_t>::mul(denominator, b.numerator, rhsLow, rhsHigh);
+
+	int order = lhsHigh.ucmp(rhsHigh);
+	if (order == 0)
+	{
+		order = lhsLow.ucmp(rhsLow);
+	}
+	return order * sign;
+}
+
+// Three-way comparison against a 64-bit integer. Integer-valued rationals are
+// compared directly; otherwise the signs are checked first and, when they
+// agree, the numerator is compared with denominator * |b|.
+int b3ConvexHullInternal::Rational128::compare(btInt64_t b) const
+{
+	if (isInt64)
+	{
+		const btInt64_t self = sign * (btInt64_t)numerator.low;
+		if (self > b)
+		{
+			return 1;
+		}
+		return (self < b) ? -1 : 0;
+	}
+
+	if (b == 0)
+	{
+		return sign;
+	}
+	if (b > 0)
+	{
+		if (sign <= 0)
+		{
+			return -1;
+		}
+	}
+	else
+	{
+		if (sign >= 0)
+		{
+			return 1;
+		}
+		b = -b;
+	}
+
+	return numerator.ucmp(denominator * b) * sign;
+}
+
+// Allocates the two half-edges from -> to and to -> from, cross-links them
+// through `reverse`, stamps both with the current mergeStamp and returns the
+// from -> to edge. next/prev are left for the caller to set.
+b3ConvexHullInternal::Edge* b3ConvexHullInternal::newEdgePair(Vertex* from, Vertex* to)
+{
+	b3Assert(from && to);
+	Edge* forward = edgePool.newObject();
+	Edge* backward = edgePool.newObject();
+
+	forward->reverse = backward;
+	forward->target = to;
+	forward->copy = mergeStamp;
+	forward->face = NULL;
+
+	backward->reverse = forward;
+	backward->target = from;
+	backward->copy = mergeStamp;
+	backward->face = NULL;
+
+	// Keep the live count and its high-water mark up to date.
+	++usedEdgePairs;
+	if (maxUsedEdgePairs < usedEdgePairs)
+	{
+		maxUsedEdgePairs = usedEdgePairs;
+	}
+	return forward;
+}
+
+bool b3ConvexHullInternal::mergeProjection(IntermediateHull& h0, IntermediateHull& h1, Vertex*& c0, Vertex*& c1)  // joins the circular vertex lists of h0 and h1 into h0 and reports a connecting vertex pair in c0/c1
+{  // returns false (lists left unjoined) only when h1 is a single vertex sharing (x, y) with h0.maxYx; NOTE(review): looks like a 2D (x, y) hull merge via common tangents - confirm
+	Vertex* v0 = h0.maxYx;
+	Vertex* v1 = h1.minYx;
+	if ((v0->point.x == v1->point.x) && (v0->point.y == v1->point.y))  // the two extreme vertices coincide in (x, y)
+	{
+		b3Assert(v0->point.z < v1->point.z);
+		Vertex* v1p = v1->prev;
+		if (v1p == v1)  // v1 is the only vertex in h1's list
+		{
+			c0 = v0;
+			if (v1->edges)
+			{
+				b3Assert(v1->edges->next == v1->edges);
+				v1 = v1->edges->target;  // continue from the vertex at the other end of v1's single edge
+				b3Assert(v1->edges->next == v1->edges);
+			}
+			c1 = v1;
+			return false;
+		}
+		Vertex* v1n = v1->next;
+		v1p->next = v1n;  // unlink v1 from h1's circular list
+		v1n->prev = v1p;
+		if (v1 == h1.minXy)  // v1 was an extreme of h1: pick the better of its two neighbours instead
+		{
+			if ((v1n->point.x < v1p->point.x) || ((v1n->point.x == v1p->point.x) && (v1n->point.y < v1p->point.y)))
+			{
+				h1.minXy = v1n;
+			}
+			else
+			{
+				h1.minXy = v1p;
+			}
+		}
+		if (v1 == h1.maxXy)
+		{
+			if ((v1n->point.x > v1p->point.x) || ((v1n->point.x == v1p->point.x) && (v1n->point.y > v1p->point.y)))
+			{
+				h1.maxXy = v1n;
+			}
+			else
+			{
+				h1.maxXy = v1p;
+			}
+		}
+	}
+
+	v0 = h0.maxXy;
+	v1 = h1.maxXy;
+	Vertex* v00 = NULL;  // v00/v10 remember the vertex pair found on side 0
+	Vertex* v10 = NULL;
+	btInt32_t sign = 1;  // +1 on side 0, -1 on side 1: mirrors every x difference
+
+	for (int side = 0; side <= 1; side++)  // side 0 starts at the maxXy vertices, side 1 at the minXy vertices
+	{
+		btInt32_t dx = (v1->point.x - v0->point.x) * sign;
+		if (dx > 0)
+		{
+			while (true)  // alternately advance v0 and v1 until neither step condition holds
+			{
+				btInt32_t dy = v1->point.y - v0->point.y;
+
+				Vertex* w0 = side ? v0->next : v0->prev;  // walking direction along the list depends on the side
+				if (w0 != v0)
+				{
+					btInt32_t dx0 = (w0->point.x - v0->point.x) * sign;
+					btInt32_t dy0 = w0->point.y - v0->point.y;
+					if ((dy0 <= 0) && ((dx0 == 0) || ((dx0 < 0) && (dy0 * dx <= dy * dx0))))  // slope test done by cross-multiplication, no division
+					{
+						v0 = w0;
+						dx = (v1->point.x - v0->point.x) * sign;
+						continue;
+					}
+				}
+
+				Vertex* w1 = side ? v1->next : v1->prev;
+				if (w1 != v1)
+				{
+					btInt32_t dx1 = (w1->point.x - v1->point.x) * sign;
+					btInt32_t dy1 = w1->point.y - v1->point.y;
+					btInt32_t dxn = (w1->point.x - v0->point.x) * sign;
+					if ((dxn > 0) && (dy1 < 0) && ((dx1 == 0) || ((dx1 < 0) && (dy1 * dx < dy * dx1))))
+					{
+						v1 = w1;
+						dx = dxn;
+						continue;
+					}
+				}
+
+				break;
+			}
+		}
+		else if (dx < 0)
+		{
+			while (true)  // same search as above with the roles of v0 and v1 exchanged
+			{
+				btInt32_t dy = v1->point.y - v0->point.y;
+
+				Vertex* w1 = side ? v1->prev : v1->next;
+				if (w1 != v1)
+				{
+					btInt32_t dx1 = (w1->point.x - v1->point.x) * sign;
+					btInt32_t dy1 = w1->point.y - v1->point.y;
+					if ((dy1 >= 0) && ((dx1 == 0) || ((dx1 < 0) && (dy1 * dx <= dy * dx1))))
+					{
+						v1 = w1;
+						dx = (v1->point.x - v0->point.x) * sign;
+						continue;
+					}
+				}
+
+				Vertex* w0 = side ? v0->prev : v0->next;
+				if (w0 != v0)
+				{
+					btInt32_t dx0 = (w0->point.x - v0->point.x) * sign;
+					btInt32_t dy0 = w0->point.y - v0->point.y;
+					btInt32_t dxn = (v1->point.x - w0->point.x) * sign;
+					if ((dxn < 0) && (dy0 > 0) && ((dx0 == 0) || ((dx0 < 0) && (dy0 * dx < dy * dx0))))
+					{
+						v0 = w0;
+						dx = dxn;
+						continue;
+					}
+				}
+
+				break;
+			}
+		}
+		else  // dx == 0: both start vertices have the same x
+		{
+			btInt32_t x = v0->point.x;
+			btInt32_t y0 = v0->point.y;
+			Vertex* w0 = v0;
+			Vertex* t;
+			while (((t = side ? w0->next : w0->prev) != v0) && (t->point.x == x) && (t->point.y <= y0))  // slide v0 along vertices with this x while y does not increase
+			{
+				w0 = t;
+				y0 = t->point.y;
+			}
+			v0 = w0;
+
+			btInt32_t y1 = v1->point.y;
+			Vertex* w1 = v1;
+			while (((t = side ? w1->prev : w1->next) != v1) && (t->point.x == x) && (t->point.y >= y1))  // slide v1 along vertices with this x while y does not decrease
+			{
+				w1 = t;
+				y1 = t->point.y;
+			}
+			v1 = w1;
+		}
+
+		if (side == 0)
+		{
+			v00 = v0;  // keep the side-0 pair and restart from the minXy vertices for side 1
+			v10 = v1;
+
+			v0 = h0.minXy;
+			v1 = h1.minXy;
+			sign = -1;
+		}
+	}
+
+	v0->prev = v1;  // splice the two circular lists together at the side-1 pair ...
+	v1->next = v0;
+
+	v00->next = v10;  // ... and at the side-0 pair
+	v10->prev = v00;
+
+	if (h1.minXy->point.x < h0.minXy->point.x)  // h0 now describes the joined list: refresh its extremes
+	{
+		h0.minXy = h1.minXy;
+	}
+	if (h1.maxXy->point.x >= h0.maxXy->point.x)
+	{
+		h0.maxXy = h1.maxXy;
+	}
+
+	h0.maxYx = h1.maxYx;
+
+	c0 = v00;
+	c1 = v10;
+
+	return true;
+}
+
+// Recursively builds the hull of originalVertices[start, end) into `result`.
+// Ranges holding 0, 1 or 2 vertices are handled directly; larger ranges are
+// split in half, each half is solved recursively and the two partial hulls
+// are combined with merge().
+void b3ConvexHullInternal::computeInternal(int start, int end, IntermediateHull& result)
+{
+	int n = end - start;
+	switch (n)
+	{
+		case 0:
+			result.minXy = NULL;
+			result.maxXy = NULL;
+			result.minYx = NULL;
+			result.maxYx = NULL;
+			return;
+		case 2:
+		{
+			Vertex* v = originalVertices[start];
+			// Fetch the second vertex by index rather than as `v + 1`: pool
+			// objects are only adjacent in memory while they come from the
+			// same pool chunk, and Pool::reset() reuses chunks whose size was
+			// fixed when they were first allocated.
+			Vertex* w = originalVertices[start + 1];
+			if (v->point != w->point)
+			{
+				btInt32_t dx = v->point.x - w->point.x;
+				btInt32_t dy = v->point.y - w->point.y;
+
+				if ((dx == 0) && (dy == 0))
+				{
+					// Same (x, y), different z: only the vertex with the
+					// smaller z enters the vertex list; the other one is
+					// reachable through the edge created below.
+					if (v->point.z > w->point.z)
+					{
+						Vertex* t = w;
+						w = v;
+						v = t;
+					}
+					b3Assert(v->point.z < w->point.z);
+					v->next = v;
+					v->prev = v;
+					result.minXy = v;
+					result.maxXy = v;
+					result.minYx = v;
+					result.maxYx = v;
+				}
+				else
+				{
+					// Two-element circular list.
+					v->next = w;
+					v->prev = w;
+					w->next = v;
+					w->prev = v;
+
+					// Extremes ordered by x first, y as tie-breaker.
+					if ((dx < 0) || ((dx == 0) && (dy < 0)))
+					{
+						result.minXy = v;
+						result.maxXy = w;
+					}
+					else
+					{
+						result.minXy = w;
+						result.maxXy = v;
+					}
+
+					// Extremes ordered by y first, x as tie-breaker.
+					if ((dy < 0) || ((dy == 0) && (dx < 0)))
+					{
+						result.minYx = v;
+						result.maxYx = w;
+					}
+					else
+					{
+						result.minYx = w;
+						result.maxYx = v;
+					}
+				}
+
+				// Connect the two vertices; each half-edge forms a ring of
+				// length one around its origin.
+				Edge* e = newEdgePair(v, w);
+				e->link(e);
+				v->edges = e;
+
+				e = e->reverse;
+				e->link(e);
+				w->edges = e;
+
+				return;
+			}
+		}
+		// Both points are identical: treat the pair as a single vertex.
+		// lint -fallthrough
+		case 1:
+		{
+			Vertex* v = originalVertices[start];
+			v->edges = NULL;
+			v->next = v;
+			v->prev = v;
+
+			result.minXy = v;
+			result.maxXy = v;
+			result.minYx = v;
+			result.maxYx = v;
+
+			return;
+		}
+	}
+
+	// Split in the middle; the second half starts after any vertices whose
+	// point equals the last point of the first half.
+	int split0 = start + n / 2;
+	Point32 p = originalVertices[split0 - 1]->point;
+	int split1 = split0;
+	while ((split1 < end) && (originalVertices[split1]->point == p))
+	{
+		split1++;
+	}
+	computeInternal(start, split0, result);
+	IntermediateHull hull1;
+	computeInternal(split1, end, hull1);
+#ifdef DEBUG_CONVEX_HULL
+	b3Printf("\n\nMerge\n");
+	result.print();
+	hull1.print();
+#endif
+	merge(result, hull1);
+#ifdef DEBUG_CONVEX_HULL
+	b3Printf("\n  Result\n");
+	result.print();
+#endif
+}
+
+#ifdef DEBUG_CONVEX_HULL
+// Debug dump: walks the vertex list once starting at minXy, tagging the
+// extreme vertices and any next/prev inconsistency, then prints the edge
+// graph reachable from minXy.
+void b3ConvexHullInternal::IntermediateHull::print()
+{
+	b3Printf("    Hull\n");
+	Vertex* v = minXy;
+	while (v)
+	{
+		b3Printf("      ");
+		v->print();
+		if (v == maxXy)
+		{
+			b3Printf(" maxXy");
+		}
+		if (v == minYx)
+		{
+			b3Printf(" minYx");
+		}
+		if (v == maxYx)
+		{
+			b3Printf(" maxYx");
+		}
+		if (v->next->prev != v)
+		{
+			b3Printf(" Inconsistency");
+		}
+		b3Printf("\n");
+		v = v->next;
+		if (v == minXy)
+		{
+			// back at the start of the circular list
+			break;
+		}
+	}
+	if (!minXy)
+	{
+		return;
+	}
+	// Toggle the visit marker between -1 and -2 so printGraph() can tell
+	// which vertices it has already printed during this dump.
+	minXy->copy = (minXy->copy == -1) ? -2 : -1;
+	minXy->printGraph();
+}
+
+// Debug dump: prints this vertex and its edges, then recurses into every
+// neighbouring vertex whose `copy` marker differs from this vertex's marker
+// (the marker is propagated before recursing, so each vertex is printed once).
+void b3ConvexHullInternal::Vertex::printGraph()
+{
+	print();
+	b3Printf("\nEdges\n");
+	if (!edges)
+	{
+		return;
+	}
+
+	Edge* e = edges;
+	do
+	{
+		e->print();
+		b3Printf("\n");
+		e = e->next;
+	} while (e != edges);
+
+	e = edges;
+	do
+	{
+		Vertex* neighbour = e->target;
+		if (neighbour->copy != copy)
+		{
+			neighbour->copy = copy;
+			neighbour->printGraph();
+		}
+		e = e->next;
+	} while (e != edges);
+}
+#endif
+
+// Classifies how `next` is placed relative to `prev` in the edge ring of
+// their common origin: COUNTER_CLOCKWISE when it is prev's successor,
+// CLOCKWISE when it is prev's predecessor, NONE when it is neither. When it
+// is both (a ring of two edges) the sign of (t x s) . m decides, where m is
+// the cross product of the two edge directions.
+b3ConvexHullInternal::Orientation b3ConvexHullInternal::getOrientation(const Edge* prev, const Edge* next, const Point32& s, const Point32& t)
+{
+	b3Assert(prev->reverse->target == next->reverse->target);
+
+	const bool isSuccessor = (prev->next == next);
+	const bool isPredecessor = (prev->prev == next);
+
+	if (!isSuccessor)
+	{
+		return isPredecessor ? CLOCKWISE : NONE;
+	}
+	if (!isPredecessor)
+	{
+		return COUNTER_CLOCKWISE;
+	}
+
+	// `next` is both neighbour of `prev`: fall back to geometry.
+	Point64 n = t.cross(s);
+	Point64 m = (*prev->target - *next->reverse->target).cross(*next->target - *next->reverse->target);
+	b3Assert(!m.isZero());
+	btInt64_t dot = n.dot(m);
+	b3Assert(dot != 0);
+	if (dot > 0)
+	{
+		return COUNTER_CLOCKWISE;
+	}
+	return CLOCKWISE;
+}
+
+// Scans the edges leaving `start` whose copy stamp is greater than the
+// current mergeStamp and returns the one with the smallest cotangent
+// (t . sxrxs) / (t . rxs), t being the edge direction. Ties are resolved with
+// getOrientation() according to `ccw`. The winning cotangent is written to
+// minCot; NULL is returned (and minCot left untouched) when no edge qualifies.
+b3ConvexHullInternal::Edge* b3ConvexHullInternal::findMaxAngle(bool ccw, const Vertex* start, const Point32& s, const Point64& rxs, const Point64& sxrxs, Rational64& minCot)
+{
+	Edge* best = NULL;
+
+#ifdef DEBUG_CONVEX_HULL
+	b3Printf("find max edge for %d\n", start->point.index);
+#endif
+	Edge* const first = start->edges;
+	if (!first)
+	{
+		return best;
+	}
+
+	Edge* e = first;
+	do
+	{
+		if (e->copy > mergeStamp)
+		{
+			Point32 t = *e->target - *start;
+			Rational64 cot(t.dot(sxrxs), t.dot(rxs));
+#ifdef DEBUG_CONVEX_HULL
+			b3Printf("      Angle is %f (%d) for ", (float)b3Atan(cot.toScalar()), (int)cot.isNaN());
+			e->print();
+#endif
+			if (!cot.isNaN())
+			{
+				if (best == NULL)
+				{
+					// first usable candidate
+					minCot = cot;
+					best = e;
+				}
+				else
+				{
+					const int cmp = cot.compare(minCot);
+					if (cmp < 0)
+					{
+						minCot = cot;
+						best = e;
+					}
+					else if ((cmp == 0) && (ccw == (getOrientation(best, e, s, t) == COUNTER_CLOCKWISE)))
+					{
+						// equal cotangent: prefer the edge on the requested side
+						best = e;
+					}
+				}
+			}
+			else
+			{
+				b3Assert(ccw ? (t.dot(s) < 0) : (t.dot(s) > 0));
+			}
+#ifdef DEBUG_CONVEX_HULL
+			b3Printf("\n");
+#endif
+		}
+		e = e->next;
+	} while (e != first);
+
+	return best;
+}
+
+void b3ConvexHullInternal::findEdgeForCoplanarFaces(Vertex* c0, Vertex* c1, Edge*& e0, Edge*& e1, Vertex* stop0, Vertex* stop1)  // advances e0 (leaving c0) and e1 (leaving c1) in place along edges lying in one common plane; either may be set to NULL
+{  // NOTE(review): appears to pick the connecting edges when the faces next to c0-c1 are coplanar - confirm against callers
+	Edge* start0 = e0;
+	Edge* start1 = e1;
+	Point32 et0 = start0 ? start0->target->point : c0->point;  // current end points on each side; fall back to c0/c1 when no edge was supplied
+	Point32 et1 = start1 ? start1->target->point : c1->point;
+	Point32 s = c1->point - c0->point;
+	Point64 normal = ((start0 ? start0 : start1)->target->point - c0->point).cross(s);  // normal of the plane through c0, c1 and the first supplied edge target
+	btInt64_t dist = c0->point.dot(normal);  // plane offset: points p in the plane satisfy p.dot(normal) == dist
+	b3Assert(!start1 || (start1->target->point.dot(normal) == dist));
+	Point64 perp = s.cross(normal);  // in-plane direction perpendicular to s
+	b3Assert(!perp.isZero());
+
+#ifdef DEBUG_CONVEX_HULL
+	b3Printf("   Advancing %d %d  (%p %p, %d %d)\n", c0->point.index, c1->point.index, start0, start1, start0 ? start0->target->point.index : -1, start1 ? start1->target->point.index : -1);
+#endif
+
+	btInt64_t maxDot0 = et0.dot(perp);
+	if (e0)
+	{
+		while (e0->target != stop0)  // walk e0 while the next target stays in the plane and moves further along perp
+		{
+			Edge* e = e0->reverse->prev;
+			if (e->target->point.dot(normal) < dist)
+			{
+				break;
+			}
+			b3Assert(e->target->point.dot(normal) == dist);
+			if (e->copy == mergeStamp)  // do not walk onto edges created during the current merge
+			{
+				break;
+			}
+			btInt64_t dot = e->target->point.dot(perp);
+			if (dot <= maxDot0)
+			{
+				break;
+			}
+			maxDot0 = dot;
+			e0 = e;
+			et0 = e->target->point;
+		}
+	}
+
+	btInt64_t maxDot1 = et1.dot(perp);
+	if (e1)
+	{
+		while (e1->target != stop1)  // same walk on the c1 side, stepping through reverse->next instead of reverse->prev
+		{
+			Edge* e = e1->reverse->next;
+			if (e->target->point.dot(normal) < dist)
+			{
+				break;
+			}
+			b3Assert(e->target->point.dot(normal) == dist);
+			if (e->copy == mergeStamp)
+			{
+				break;
+			}
+			btInt64_t dot = e->target->point.dot(perp);
+			if (dot <= maxDot1)
+			{
+				break;
+			}
+			maxDot1 = dot;
+			e1 = e;
+			et1 = e->target->point;
+		}
+	}
+
+#ifdef DEBUG_CONVEX_HULL
+	b3Printf("   Starting at %d %d\n", et0.index, et1.index);
+#endif
+
+	btInt64_t dx = maxDot1 - maxDot0;  // in-plane coordinates: dx is measured along perp, dy along s
+	if (dx > 0)
+	{
+		while (true)  // alternately step on the c0 and the c1 side until neither step condition holds
+		{
+			btInt64_t dy = (et1 - et0).dot(s);
+
+			if (e0 && (e0->target != stop0))
+			{
+				Edge* f0 = e0->next->reverse;
+				if (f0->copy > mergeStamp)  // only edges that existed before the current merge are considered
+				{
+					btInt64_t dx0 = (f0->target->point - et0).dot(perp);
+					btInt64_t dy0 = (f0->target->point - et0).dot(s);
+					if ((dx0 == 0) ? (dy0 < 0) : ((dx0 < 0) && (Rational64(dy0, dx0).compare(Rational64(dy, dx)) >= 0)))  // exact slope comparison through Rational64
+					{
+						et0 = f0->target->point;
+						dx = (et1 - et0).dot(perp);
+						e0 = (e0 == start0) ? NULL : f0;  // stepping back past the edge we started from leaves no edge on this side
+						continue;
+					}
+				}
+			}
+
+			if (e1 && (e1->target != stop1))
+			{
+				Edge* f1 = e1->reverse->next;
+				if (f1->copy > mergeStamp)
+				{
+					Point32 d1 = f1->target->point - et1;
+					if (d1.dot(normal) == 0)  // candidate target lies in the plane
+					{
+						btInt64_t dx1 = d1.dot(perp);
+						btInt64_t dy1 = d1.dot(s);
+						btInt64_t dxn = (f1->target->point - et0).dot(perp);
+						if ((dxn > 0) && ((dx1 == 0) ? (dy1 < 0) : ((dx1 < 0) && (Rational64(dy1, dx1).compare(Rational64(dy, dx)) > 0))))
+						{
+							e1 = f1;
+							et1 = e1->target->point;
+							dx = dxn;
+							continue;
+						}
+					}
+					else
+					{
+						b3Assert((e1 == start1) && (d1.dot(normal) < 0));
+					}
+				}
+			}
+
+			break;
+		}
+	}
+	else if (dx < 0)
+	{
+		while (true)  // mirrored version of the loop above with the two sides exchanged
+		{
+			btInt64_t dy = (et1 - et0).dot(s);
+
+			if (e1 && (e1->target != stop1))
+			{
+				Edge* f1 = e1->prev->reverse;
+				if (f1->copy > mergeStamp)
+				{
+					btInt64_t dx1 = (f1->target->point - et1).dot(perp);
+					btInt64_t dy1 = (f1->target->point - et1).dot(s);
+					if ((dx1 == 0) ? (dy1 > 0) : ((dx1 < 0) && (Rational64(dy1, dx1).compare(Rational64(dy, dx)) <= 0)))
+					{
+						et1 = f1->target->point;
+						dx = (et1 - et0).dot(perp);
+						e1 = (e1 == start1) ? NULL : f1;
+						continue;
+					}
+				}
+			}
+
+			if (e0 && (e0->target != stop0))
+			{
+				Edge* f0 = e0->reverse->prev;
+				if (f0->copy > mergeStamp)
+				{
+					Point32 d0 = f0->target->point - et0;
+					if (d0.dot(normal) == 0)
+					{
+						btInt64_t dx0 = d0.dot(perp);
+						btInt64_t dy0 = d0.dot(s);
+						btInt64_t dxn = (et1 - f0->target->point).dot(perp);
+						if ((dxn < 0) && ((dx0 == 0) ? (dy0 > 0) : ((dx0 < 0) && (Rational64(dy0, dx0).compare(Rational64(dy, dx)) < 0))))
+						{
+							e0 = f0;
+							et0 = e0->target->point;
+							dx = dxn;
+							continue;
+						}
+					}
+					else
+					{
+						b3Assert((e0 == start0) && (d0.dot(normal) < 0));
+					}
+				}
+			}
+
+			break;
+		}
+	}
+#ifdef DEBUG_CONVEX_HULL
+	b3Printf("   Advanced edges to %d %d\n", et0.index, et1.index);
+#endif
+}
+
+void b3ConvexHullInternal::merge(IntermediateHull& h0, IntermediateHull& h1)  // merges hull h1 into h0; h0 describes the combined hull afterwards
+{  // NOTE(review): looks like the wrapping step of a divide-and-conquer 3D hull - confirm against the algorithm reference
+	if (!h1.maxXy)  // h1 is empty: nothing to do
+	{
+		return;
+	}
+	if (!h0.maxXy)  // h0 is empty: the result is simply h1
+	{
+		h0 = h1;
+		return;
+	}
+
+	mergeStamp--;  // edges created from here on carry the new, smaller stamp; older edges satisfy copy > mergeStamp
+
+	Vertex* c0 = NULL;  // current vertex on the h0 side
+	Edge* toPrev0 = NULL;  // reverse of the edge most recently followed on the h0 side
+	Edge* firstNew0 = NULL;
+	Edge* pendingHead0 = NULL;  // new c0 -> c1 edges created but not yet linked into c0's edge ring
+	Edge* pendingTail0 = NULL;
+	Vertex* c1 = NULL;  // the same bookkeeping for the h1 side
+	Edge* toPrev1 = NULL;
+	Edge* firstNew1 = NULL;
+	Edge* pendingHead1 = NULL;
+	Edge* pendingTail1 = NULL;
+	Point32 prevPoint;
+
+	if (mergeProjection(h0, h1, c0, c1))  // true: the vertex lists were joined and c0/c1 is the starting pair
+	{
+		Point32 s = *c1 - *c0;
+		Point64 normal = Point32(0, 0, -1).cross(s);
+		Point64 t = s.cross(normal);
+		b3Assert(!t.isZero());
+
+		Edge* e = c0->edges;
+		Edge* start0 = NULL;
+		if (e)
+		{
+			do  // look among c0's edges for one lying in the plane of `normal` and pointing along t
+			{
+				btInt64_t dot = (*e->target - *c0).dot(normal);
+				b3Assert(dot <= 0);
+				if ((dot == 0) && ((*e->target - *c0).dot(t) > 0))
+				{
+					if (!start0 || (getOrientation(start0, e, s, Point32(0, 0, -1)) == CLOCKWISE))
+					{
+						start0 = e;
+					}
+				}
+				e = e->next;
+			} while (e != c0->edges);
+		}
+
+		e = c1->edges;
+		Edge* start1 = NULL;
+		if (e)
+		{
+			do  // same search around c1, preferring the counter-clockwise candidate
+			{
+				btInt64_t dot = (*e->target - *c1).dot(normal);
+				b3Assert(dot <= 0);
+				if ((dot == 0) && ((*e->target - *c1).dot(t) > 0))
+				{
+					if (!start1 || (getOrientation(start1, e, s, Point32(0, 0, -1)) == COUNTER_CLOCKWISE))
+					{
+						start1 = e;
+					}
+				}
+				e = e->next;
+			} while (e != c1->edges);
+		}
+
+		if (start0 || start1)  // an in-plane edge exists: move the starting pair along it
+		{
+			findEdgeForCoplanarFaces(c0, c1, start0, start1, NULL, NULL);
+			if (start0)
+			{
+				c0 = start0->target;
+			}
+			if (start1)
+			{
+				c1 = start1->target;
+			}
+		}
+
+		prevPoint = c1->point;
+		prevPoint.z++;
+	}
+	else
+	{
+		prevPoint = c1->point;
+		prevPoint.x++;
+	}
+
+	Vertex* first0 = c0;  // the loop below ends once the pair (c0, c1) returns to this starting pair
+	Vertex* first1 = c1;
+	bool firstRun = true;
+
+	while (true)
+	{
+		Point32 s = *c1 - *c0;
+		Point32 r = prevPoint - c0->point;
+		Point64 rxs = r.cross(s);
+		Point64 sxrxs = s.cross(rxs);
+
+#ifdef DEBUG_CONVEX_HULL
+		b3Printf("\n  Checking %d %d\n", c0->point.index, c1->point.index);
+#endif
+		Rational64 minCot0(0, 0);
+		Edge* min0 = findMaxAngle(false, c0, s, rxs, sxrxs, minCot0);  // best pre-existing edge leaving c0, or NULL
+		Rational64 minCot1(0, 0);
+		Edge* min1 = findMaxAngle(true, c1, s, rxs, sxrxs, minCot1);  // best pre-existing edge leaving c1, or NULL
+		if (!min0 && !min1)  // no candidate on either side: join c0 and c1 by a single edge pair and finish
+		{
+			Edge* e = newEdgePair(c0, c1);
+			e->link(e);
+			c0->edges = e;
+
+			e = e->reverse;
+			e->link(e);
+			c1->edges = e;
+			return;
+		}
+		else
+		{
+			int cmp = !min0 ? 1 : !min1 ? -1 : minCot0.compare(minCot1);  // cmp >= 0 advances the h1 side, cmp <= 0 advances the h0 side (see below)
+#ifdef DEBUG_CONVEX_HULL
+			b3Printf("    -> Result %d\n", cmp);
+#endif
+			if (firstRun || ((cmp >= 0) ? !minCot1.isNegativeInfinity() : !minCot0.isNegativeInfinity()))
+			{
+				Edge* e = newEdgePair(c0, c1);  // new connecting edge c0 -> c1; queued until the surrounding ring position is known
+				if (pendingTail0)
+				{
+					pendingTail0->prev = e;
+				}
+				else
+				{
+					pendingHead0 = e;
+				}
+				e->next = pendingTail0;
+				pendingTail0 = e;
+
+				e = e->reverse;
+				if (pendingTail1)
+				{
+					pendingTail1->next = e;
+				}
+				else
+				{
+					pendingHead1 = e;
+				}
+				e->prev = pendingTail1;
+				pendingTail1 = e;
+			}
+
+			Edge* e0 = min0;
+			Edge* e1 = min1;
+
+#ifdef DEBUG_CONVEX_HULL
+			b3Printf("   Found min edges to %d %d\n", e0 ? e0->target->point.index : -1, e1 ? e1->target->point.index : -1);
+#endif
+
+			if (cmp == 0)  // both candidates have the same cotangent: let findEdgeForCoplanarFaces adjust e0/e1
+			{
+				findEdgeForCoplanarFaces(c0, c1, e0, e1, NULL, NULL);
+			}
+
+			if ((cmp >= 0) && e1)  // advance on the h1 side
+			{
+				if (toPrev1)
+				{
+					for (Edge *e = toPrev1->next, *n = NULL; e != min1; e = n)  // drop the edges between the previously followed edge and min1
+					{
+						n = e->next;
+						removeEdgePair(e);
+					}
+				}
+
+				if (pendingTail1)  // link the queued connecting edges into c1's ring just before min1
+				{
+					if (toPrev1)
+					{
+						toPrev1->link(pendingHead1);
+					}
+					else
+					{
+						min1->prev->link(pendingHead1);
+						firstNew1 = pendingHead1;
+					}
+					pendingTail1->link(min1);
+					pendingHead1 = NULL;
+					pendingTail1 = NULL;
+				}
+				else if (!toPrev1)
+				{
+					firstNew1 = min1;
+				}
+
+				prevPoint = c1->point;
+				c1 = e1->target;
+				toPrev1 = e1->reverse;
+			}
+
+			if ((cmp <= 0) && e0)  // advance on the h0 side (mirror image of the block above, walking prev instead of next)
+			{
+				if (toPrev0)
+				{
+					for (Edge *e = toPrev0->prev, *n = NULL; e != min0; e = n)
+					{
+						n = e->prev;
+						removeEdgePair(e);
+					}
+				}
+
+				if (pendingTail0)
+				{
+					if (toPrev0)
+					{
+						pendingHead0->link(toPrev0);
+					}
+					else
+					{
+						pendingHead0->link(min0->next);
+						firstNew0 = pendingHead0;
+					}
+					min0->link(pendingTail0);
+					pendingHead0 = NULL;
+					pendingTail0 = NULL;
+				}
+				else if (!toPrev0)
+				{
+					firstNew0 = min0;
+				}
+
+				prevPoint = c0->point;
+				c0 = e0->target;
+				toPrev0 = e0->reverse;
+			}
+		}
+
+		if ((c0 == first0) && (c1 == first1))  // back at the starting pair: close both edge rings and finish
+		{
+			if (toPrev0 == NULL)  // c0 never moved: its ring consists of the queued edges only
+			{
+				pendingHead0->link(pendingTail0);
+				c0->edges = pendingTail0;
+			}
+			else
+			{
+				for (Edge *e = toPrev0->prev, *n = NULL; e != firstNew0; e = n)
+				{
+					n = e->prev;
+					removeEdgePair(e);
+				}
+				if (pendingTail0)
+				{
+					pendingHead0->link(toPrev0);
+					firstNew0->link(pendingTail0);
+				}
+			}
+
+			if (toPrev1 == NULL)  // c1 never moved: its ring consists of the queued edges only
+			{
+				pendingTail1->link(pendingHead1);
+				c1->edges = pendingTail1;
+			}
+			else
+			{
+				for (Edge *e = toPrev1->next, *n = NULL; e != firstNew1; e = n)
+				{
+					n = e->next;
+					removeEdgePair(e);
+				}
+				if (pendingTail1)
+				{
+					toPrev1->link(pendingHead1);
+					pendingTail1->link(firstNew1);
+				}
+			}
+
+			return;
+		}
+
+		firstRun = false;
+	}
+}
+
+// Strict ordering used to sort the input points: by y first, then x, then z.
+static bool b3PointCmp(const b3ConvexHullInternal::Point32& p, const b3ConvexHullInternal::Point32& q)
+{
+	if (p.y != q.y)
+	{
+		return p.y < q.y;
+	}
+	if (p.x != q.x)
+	{
+		return p.x < q.x;
+	}
+	return p.z < q.z;
+}
+
+// Builds the internal (integer lattice) convex hull of "count" input points.
+//   coords       - first coordinate of the first point; each point supplies three consecutive values
+//   doubleCoords - true if the values are doubles, false if they are floats
+//   stride       - distance in bytes between consecutive points
+//   count        - number of points
+// On return vertexList refers to the hull produced by computeInternal().
+void b3ConvexHullInternal::compute(const void* coords, bool doubleCoords, int stride, int count)
+{
+	// Pass 1: axis-aligned bounding box of the input.
+	b3Vector3 min = b3MakeVector3(b3Scalar(1e30), b3Scalar(1e30), b3Scalar(1e30)), max = b3MakeVector3(b3Scalar(-1e30), b3Scalar(-1e30), b3Scalar(-1e30));
+	const char* ptr = (const char*)coords;
+	if (doubleCoords)
+	{
+		for (int i = 0; i < count; i++)
+		{
+			const double* v = (const double*)ptr;
+			b3Vector3 p = b3MakeVector3((b3Scalar)v[0], (b3Scalar)v[1], (b3Scalar)v[2]);
+			ptr += stride;
+			min.setMin(p);
+			max.setMax(p);
+		}
+	}
+	else
+	{
+		for (int i = 0; i < count; i++)
+		{
+			const float* v = (const float*)ptr;
+			b3Vector3 p = b3MakeVector3(v[0], v[1], v[2]);
+			ptr += stride;
+			min.setMin(p);
+			max.setMax(p);
+		}
+	}
+
+	// Pick the axes with the largest and smallest extent; medAxis is the remaining one.
+	b3Vector3 s = max - min;
+	maxAxis = s.maxAxis();
+	minAxis = s.minAxis();
+	if (minAxis == maxAxis)
+	{
+		// Both queries returned the same axis; force two distinct axes.
+		minAxis = (maxAxis + 1) % 3;
+	}
+	medAxis = 3 - maxAxis - minAxis;
+
+	// One lattice unit is extent / 10216, so quantized coordinates span about +-5108 around the center.
+	s /= b3Scalar(10216);
+	if (((medAxis + 1) % 3) != maxAxis)
+	{
+		// NOTE(review): presumably the sign flip keeps the orientation consistent when the
+		// (med, max, min) axis permutation is not cyclic - confirm.
+		s *= -1;
+	}
+	scaling = s;
+
+	// From here on s holds the inverse scale (world -> lattice); zero extents stay zero.
+	if (s[0] != 0)
+	{
+		s[0] = b3Scalar(1) / s[0];
+	}
+	if (s[1] != 0)
+	{
+		s[1] = b3Scalar(1) / s[1];
+	}
+	if (s[2] != 0)
+	{
+		s[2] = b3Scalar(1) / s[2];
+	}
+
+	center = (min + max) * b3Scalar(0.5);
+
+	// Pass 2: quantize every point to integer coordinates, storing the med/max/min axes
+	// as x/y/z and remembering the original index.
+	b3AlignedObjectArray<Point32> points;
+	points.resize(count);
+	ptr = (const char*)coords;
+	if (doubleCoords)
+	{
+		for (int i = 0; i < count; i++)
+		{
+			const double* v = (const double*)ptr;
+			b3Vector3 p = b3MakeVector3((b3Scalar)v[0], (b3Scalar)v[1], (b3Scalar)v[2]);
+			ptr += stride;
+			p = (p - center) * s;
+			points[i].x = (btInt32_t)p[medAxis];
+			points[i].y = (btInt32_t)p[maxAxis];
+			points[i].z = (btInt32_t)p[minAxis];
+			points[i].index = i;
+		}
+	}
+	else
+	{
+		for (int i = 0; i < count; i++)
+		{
+			const float* v = (const float*)ptr;
+			b3Vector3 p = b3MakeVector3(v[0], v[1], v[2]);
+			ptr += stride;
+			p = (p - center) * s;
+			points[i].x = (btInt32_t)p[medAxis];
+			points[i].y = (btInt32_t)p[maxAxis];
+			points[i].z = (btInt32_t)p[minAxis];
+			points[i].index = i;
+		}
+	}
+	// Sort by (y, x, z), see b3PointCmp.
+	points.quickSort(b3PointCmp);
+
+	// One pooled vertex per sorted point; originalVertices keeps them in sorted order.
+	vertexPool.reset();
+	vertexPool.setArraySize(count);
+	originalVertices.resize(count);
+	for (int i = 0; i < count; i++)
+	{
+		Vertex* v = vertexPool.newObject();
+		v->edges = NULL;
+		v->point = points[i];
+		v->copy = -1;
+		originalVertices[i] = v;
+	}
+
+	points.clear();
+
+	edgePool.reset();
+	edgePool.setArraySize(6 * count);
+
+	usedEdgePairs = 0;
+	maxUsedEdgePairs = 0;
+
+	mergeStamp = -3;
+
+	// Build the hull over the whole sorted range [0, count).
+	IntermediateHull hull;
+	computeInternal(0, count, hull);
+	vertexList = hull.minXy;
+#ifdef DEBUG_CONVEX_HULL
+	b3Printf("max. edges %d (3v = %d)", maxUsedEdgePairs, 3 * count);
+#endif
+}
+
+// Converts a lattice direction back to a scaled vector: the lattice x/y/z components are
+// written to the med/max/min axes and multiplied by "scaling". The hull center is not added.
+b3Vector3 b3ConvexHullInternal::toBtVector(const Point32& v)
+{
+	b3Vector3 unscaled;
+	unscaled[minAxis] = b3Scalar(v.z);
+	unscaled[maxAxis] = b3Scalar(v.y);
+	unscaled[medAxis] = b3Scalar(v.x);
+	return unscaled * scaling;
+}
+
+// Unit normal of a face: normalized cross product of its two direction vectors after
+// conversion with toBtVector().
+b3Vector3 b3ConvexHullInternal::getBtNormal(Face* face)
+{
+	const b3Vector3 firstDir = toBtVector(face->dir0);
+	const b3Vector3 secondDir = toBtVector(face->dir1);
+	return firstDir.cross(secondDir).normalized();
+}
+
+// Position of a hull vertex in input coordinates: the vertex's x/y/z values are written to
+// the med/max/min axes, multiplied by "scaling" and offset by "center".
+b3Vector3 b3ConvexHullInternal::getCoordinates(const Vertex* v)
+{
+	b3Vector3 lattice;
+	lattice[minAxis] = v->zvalue();
+	lattice[maxAxis] = v->yvalue();
+	lattice[medAxis] = v->xvalue();
+	return lattice * scaling + center;
+}
+
+// Shifts every face of the hull by "amount" via shiftFace() and returns the amount used.
+// If clampAmount is positive, amount is first limited to clampAmount times the smallest
+// face distance from the hull center.
+// Returns 0 when there is no hull, when the accumulated volume is not positive, or when the
+// smallest face distance is not positive. Returns -amount as soon as shiftFace() fails.
+b3Scalar b3ConvexHullInternal::shrink(b3Scalar amount, b3Scalar clampAmount)
+{
+	if (!vertexList)
+	{
+		return 0;
+	}
+	// Fresh stamp; vertices and edges whose "copy" equals it have already been visited.
+	int stamp = --mergeStamp;
+	b3AlignedObjectArray<Vertex*> stack;
+	vertexList->copy = stamp;
+	stack.push_back(vertexList);
+	b3AlignedObjectArray<Face*> faces;
+
+	// Exact integer accumulators for the volume-weighted center, relative to "ref".
+	Point32 ref = vertexList->point;
+	Int128 hullCenterX(0, 0);
+	Int128 hullCenterY(0, 0);
+	Int128 hullCenterZ(0, 0);
+	Int128 volume(0, 0);
+
+	// Visit every vertex reachable through edges; create one Face per unvisited face loop.
+	while (stack.size() > 0)
+	{
+		Vertex* v = stack[stack.size() - 1];
+		stack.pop_back();
+		Edge* e = v->edges;
+		if (e)
+		{
+			do
+			{
+				if (e->target->copy != stamp)
+				{
+					e->target->copy = stamp;
+					stack.push_back(e->target);
+				}
+				if (e->copy != stamp)
+				{
+					Face* face = facePool.newObject();
+					face->init(e->target, e->reverse->prev->target, v);
+					faces.push_back(face);
+					Edge* f = e;
+
+					// a and b trail the walk around the face; once both are set, (v, a, b)
+					// is one triangle of a fan anchored at v.
+					Vertex* a = NULL;
+					Vertex* b = NULL;
+					do
+					{
+						if (a && b)
+						{
+							// Scalar triple product of the triangle corners relative to ref.
+							btInt64_t vol = (v->point - ref).dot((a->point - ref).cross(b->point - ref));
+							b3Assert(vol >= 0);
+							// Sum of the four corner points; divided by 4 * volume further below.
+							Point32 c = v->point + a->point + b->point + ref;
+							hullCenterX += vol * c.x;
+							hullCenterY += vol * c.y;
+							hullCenterZ += vol * c.z;
+							volume += vol;
+						}
+
+						b3Assert(f->copy != stamp);
+						f->copy = stamp;
+						f->face = face;
+
+						a = b;
+						b = f->target;
+
+						// Step to the next edge of the same face loop.
+						f = f->reverse->prev;
+					} while (f != e);
+				}
+				e = e->next;
+			} while (e != v->edges);
+		}
+	}
+
+	if (volume.getSign() <= 0)
+	{
+		return 0;
+	}
+
+	// Convert the accumulated sums to a scaled center; like toBtVector(), "center" is not added.
+	b3Vector3 hullCenter;
+	hullCenter[medAxis] = hullCenterX.toScalar();
+	hullCenter[maxAxis] = hullCenterY.toScalar();
+	hullCenter[minAxis] = hullCenterZ.toScalar();
+	hullCenter /= 4 * volume.toScalar();
+	hullCenter *= scaling;
+
+	int faceCount = faces.size();
+
+	if (clampAmount > 0)
+	{
+		// Smallest distance from the hull center to any face plane.
+		b3Scalar minDist = B3_INFINITY;
+		for (int i = 0; i < faceCount; i++)
+		{
+			b3Vector3 normal = getBtNormal(faces[i]);
+			b3Scalar dist = normal.dot(toBtVector(faces[i]->origin) - hullCenter);
+			if (dist < minDist)
+			{
+				minDist = dist;
+			}
+		}
+
+		if (minDist <= 0)
+		{
+			return 0;
+		}
+
+		amount = b3Min(amount, minDist * clampAmount);
+	}
+
+	// Deterministic pseudo-random reordering of the faces (fixed seed, linear congruential step).
+	unsigned int seed = 243703;
+	for (int i = 0; i < faceCount; i++, seed = 1664525 * seed + 1013904223)
+	{
+		b3Swap(faces[i], faces[seed % faceCount]);
+	}
+
+	// "stack" is empty at this point (drained by the traversal above) and is passed by value.
+	for (int i = 0; i < faceCount; i++)
+	{
+		if (!shiftFace(faces[i], amount, stack))
+		{
+			return -amount;
+		}
+	}
+
+	return amount;
+}
+
+// Moves the plane of "face" by "amount" against its normal and cuts the hull with the
+// shifted plane, rewiring edges and creating new vertices where edges cross it.
+// Returns true on success, including the cases where the shift rounds to zero lattice units
+// or no edge crossing the shifted plane is found on the upward search. Returns false when the
+// shifted origin is not strictly below the original plane or the downward search finds no
+// crossing edge.
+// NOTE(review): "stack" is taken by value, so it is a private scratch copy; pushes made here
+// are not visible to the caller.
+bool b3ConvexHullInternal::shiftFace(Face* face, b3Scalar amount, b3AlignedObjectArray<Vertex*> stack)
+{
+	// Shift vector in scaled units, converted to lattice units by dividing by the non-zero scale factors.
+	b3Vector3 origShift = getBtNormal(face) * -amount;
+	if (scaling[0] != 0)
+	{
+		origShift[0] /= scaling[0];
+	}
+	if (scaling[1] != 0)
+	{
+		origShift[1] /= scaling[1];
+	}
+	if (scaling[2] != 0)
+	{
+		origShift[2] /= scaling[2];
+	}
+	// Truncate to integer lattice coordinates; a zero shift means there is nothing to do.
+	Point32 shift((btInt32_t)origShift[medAxis], (btInt32_t)origShift[maxAxis], (btInt32_t)origShift[minAxis]);
+	if (shift.isZero())
+	{
+		return true;
+	}
+	Point64 normal = face->getNormal();
+#ifdef DEBUG_CONVEX_HULL
+	b3Printf("\nShrinking face (%d %d %d) (%d %d %d) (%d %d %d) by (%d %d %d)\n",
+			 face->origin.x, face->origin.y, face->origin.z, face->dir0.x, face->dir0.y, face->dir0.z, face->dir1.x, face->dir1.y, face->dir1.z, shift.x, shift.y, shift.z);
+#endif
+	// Plane offsets along the integer normal before and after the shift.
+	btInt64_t origDot = face->origin.dot(normal);
+	Point32 shiftedOrigin = face->origin + shift;
+	btInt64_t shiftedDot = shiftedOrigin.dot(normal);
+	b3Assert(shiftedDot <= origDot);
+	if (shiftedDot >= origDot)
+	{
+		return false;
+	}
+
+	Edge* intersection = NULL;
+
+	// Phase 1: starting at the face's nearby vertex, walk along edges to find an edge that
+	// crosses (or touches) the shifted plane. cmp is the sign of (vertex dot - shiftedDot).
+	Edge* startEdge = face->nearbyVertex->edges;
+#ifdef DEBUG_CONVEX_HULL
+	b3Printf("Start edge is ");
+	startEdge->print();
+	b3Printf(", normal is (%lld %lld %lld), shifted dot is %lld\n", normal.x, normal.y, normal.z, shiftedDot);
+#endif
+	Rational128 optDot = face->nearbyVertex->dot(normal);
+	int cmp = optDot.compare(shiftedDot);
+#ifdef SHOW_ITERATIONS
+	int n = 0;
+#endif
+	if (cmp >= 0)
+	{
+		// Start vertex is on or above the shifted plane: descend towards smaller dot values
+		// until an edge target falls strictly below the plane.
+		Edge* e = startEdge;
+		do
+		{
+#ifdef SHOW_ITERATIONS
+			n++;
+#endif
+			Rational128 dot = e->target->dot(normal);
+			b3Assert(dot.compare(origDot) <= 0);
+#ifdef DEBUG_CONVEX_HULL
+			b3Printf("Moving downwards, edge is ");
+			e->print();
+			b3Printf(", dot is %f (%f %lld)\n", (float)dot.toScalar(), (float)optDot.toScalar(), shiftedDot);
+#endif
+			if (dot.compare(optDot) < 0)
+			{
+				int c = dot.compare(shiftedDot);
+				optDot = dot;
+				e = e->reverse;
+				startEdge = e;
+				if (c < 0)
+				{
+					intersection = e;
+					break;
+				}
+				cmp = c;
+			}
+			e = e->prev;
+		} while (e != startEdge);
+
+		if (!intersection)
+		{
+			return false;
+		}
+	}
+	else
+	{
+		// Start vertex is below the shifted plane: ascend towards larger dot values until an
+		// edge target reaches or passes the plane.
+		Edge* e = startEdge;
+		do
+		{
+#ifdef SHOW_ITERATIONS
+			n++;
+#endif
+			Rational128 dot = e->target->dot(normal);
+			b3Assert(dot.compare(origDot) <= 0);
+#ifdef DEBUG_CONVEX_HULL
+			b3Printf("Moving upwards, edge is ");
+			e->print();
+			b3Printf(", dot is %f (%f %lld)\n", (float)dot.toScalar(), (float)optDot.toScalar(), shiftedDot);
+#endif
+			if (dot.compare(optDot) > 0)
+			{
+				cmp = dot.compare(shiftedDot);
+				if (cmp >= 0)
+				{
+					intersection = e;
+					break;
+				}
+				optDot = dot;
+				e = e->reverse;
+				startEdge = e;
+			}
+			e = e->prev;
+		} while (e != startEdge);
+
+		if (!intersection)
+		{
+			// Nothing reaches the shifted plane, so nothing has to be cut.
+			return true;
+		}
+	}
+
+#ifdef SHOW_ITERATIONS
+	b3Printf("Needed %d iterations to find initial intersection\n", n);
+#endif
+
+	// The intersection target lies exactly on the shifted plane: if no edge around it leads
+	// to a vertex strictly above the plane, there is nothing to cut.
+	if (cmp == 0)
+	{
+		Edge* e = intersection->reverse->next;
+#ifdef SHOW_ITERATIONS
+		n = 0;
+#endif
+		while (e->target->dot(normal).compare(shiftedDot) <= 0)
+		{
+#ifdef SHOW_ITERATIONS
+			n++;
+#endif
+			e = e->next;
+			if (e == intersection->reverse)
+			{
+				return true;
+			}
+#ifdef DEBUG_CONVEX_HULL
+			b3Printf("Checking for outwards edge, current edge is ");
+			e->print();
+			b3Printf("\n");
+#endif
+		}
+#ifdef SHOW_ITERATIONS
+		b3Printf("Needed %d iterations to check for complete containment\n", n);
+#endif
+	}
+
+	Edge* firstIntersection = NULL;
+	Edge* faceEdge = NULL;
+	Edge* firstFaceEdge = NULL;
+
+	// Phase 2: go from one intersecting edge to the next until the first one is reached
+	// again, creating the edges of the shifted face along the way.
+#ifdef SHOW_ITERATIONS
+	int m = 0;
+#endif
+	while (true)
+	{
+#ifdef SHOW_ITERATIONS
+		m++;
+#endif
+#ifdef DEBUG_CONVEX_HULL
+		b3Printf("Intersecting edge is ");
+		intersection->print();
+		b3Printf("\n");
+#endif
+		if (cmp == 0)
+		{
+			// Target is on the plane: advance around it to the last edge whose far end is
+			// still strictly below the plane.
+			Edge* e = intersection->reverse->next;
+			startEdge = e;
+#ifdef SHOW_ITERATIONS
+			n = 0;
+#endif
+			while (true)
+			{
+#ifdef SHOW_ITERATIONS
+				n++;
+#endif
+				if (e->target->dot(normal).compare(shiftedDot) >= 0)
+				{
+					break;
+				}
+				intersection = e->reverse;
+				e = e->next;
+				if (e == startEdge)
+				{
+					return true;
+				}
+			}
+#ifdef SHOW_ITERATIONS
+			b3Printf("Needed %d iterations to advance intersection\n", n);
+#endif
+		}
+
+#ifdef DEBUG_CONVEX_HULL
+		b3Printf("Advanced intersecting edge to ");
+		intersection->print();
+		b3Printf(", cmp = %d\n", cmp);
+#endif
+
+		// The walk is complete once it returns to the first intersecting edge.
+		if (!firstIntersection)
+		{
+			firstIntersection = intersection;
+		}
+		else if (intersection == firstIntersection)
+		{
+			break;
+		}
+
+		int prevCmp = cmp;
+		Edge* prevIntersection = intersection;
+		Edge* prevFaceEdge = faceEdge;
+
+		// Walk around the adjacent face until an edge target is on or above the shifted plane.
+		Edge* e = intersection->reverse;
+#ifdef SHOW_ITERATIONS
+		n = 0;
+#endif
+		while (true)
+		{
+#ifdef SHOW_ITERATIONS
+			n++;
+#endif
+			e = e->reverse->prev;
+			b3Assert(e != intersection->reverse);
+			cmp = e->target->dot(normal).compare(shiftedDot);
+#ifdef DEBUG_CONVEX_HULL
+			b3Printf("Testing edge ");
+			e->print();
+			b3Printf(" -> cmp = %d\n", cmp);
+#endif
+			if (cmp >= 0)
+			{
+				intersection = e;
+				break;
+			}
+		}
+#ifdef SHOW_ITERATIONS
+		b3Printf("Needed %d iterations to find other intersection of face\n", n);
+#endif
+
+		if (cmp > 0)
+		{
+			// The edge target is strictly above the plane: detach the edge from that vertex
+			// and retarget it to a new vertex created below.
+			Vertex* removed = intersection->target;
+			e = intersection->reverse;
+			if (e->prev == e)
+			{
+				removed->edges = NULL;
+			}
+			else
+			{
+				removed->edges = e->prev;
+				e->prev->link(e->next);
+				e->link(e);
+			}
+#ifdef DEBUG_CONVEX_HULL
+			b3Printf("1: Removed part contains (%d %d %d)\n", removed->point.x, removed->point.y, removed->point.z);
+#endif
+
+			// NOTE(review): the expressions below appear to solve a 2x2 linear system (matrix
+			// m00..m11, right-hand side r0/r1, determinant det) for the point where the shifted
+			// plane meets the two faces adjacent to the edge - confirm. The result is stored as
+			// an exact rational (point128) with an integer copy in point.
+			Point64 n0 = intersection->face->getNormal();
+			Point64 n1 = intersection->reverse->face->getNormal();
+			btInt64_t m00 = face->dir0.dot(n0);
+			btInt64_t m01 = face->dir1.dot(n0);
+			btInt64_t m10 = face->dir0.dot(n1);
+			btInt64_t m11 = face->dir1.dot(n1);
+			btInt64_t r0 = (intersection->face->origin - shiftedOrigin).dot(n0);
+			btInt64_t r1 = (intersection->reverse->face->origin - shiftedOrigin).dot(n1);
+			Int128 det = Int128::mul(m00, m11) - Int128::mul(m01, m10);
+			b3Assert(det.getSign() != 0);
+			Vertex* v = vertexPool.newObject();
+			v->point.index = -1;
+			v->copy = -1;
+			v->point128 = PointR128(Int128::mul(face->dir0.x * r0, m11) - Int128::mul(face->dir0.x * r1, m01) + Int128::mul(face->dir1.x * r1, m00) - Int128::mul(face->dir1.x * r0, m10) + det * shiftedOrigin.x,
+									Int128::mul(face->dir0.y * r0, m11) - Int128::mul(face->dir0.y * r1, m01) + Int128::mul(face->dir1.y * r1, m00) - Int128::mul(face->dir1.y * r0, m10) + det * shiftedOrigin.y,
+									Int128::mul(face->dir0.z * r0, m11) - Int128::mul(face->dir0.z * r1, m01) + Int128::mul(face->dir1.z * r1, m00) - Int128::mul(face->dir1.z * r0, m10) + det * shiftedOrigin.z,
+									det);
+			v->point.x = (btInt32_t)v->point128.xvalue();
+			v->point.y = (btInt32_t)v->point128.yvalue();
+			v->point.z = (btInt32_t)v->point128.zvalue();
+			intersection->target = v;
+			v->edges = e;
+
+			// Stack entries are groups of: kept vertex, removed vertices..., NULL terminator.
+			stack.push_back(v);
+			stack.push_back(removed);
+			stack.push_back(NULL);
+		}
+
+		// Create (or reuse) the edge of the shifted face between the previous and the current
+		// intersection and link it into the edge rings of its end points.
+		if (cmp || prevCmp || (prevIntersection->reverse->next->target != intersection->target))
+		{
+			faceEdge = newEdgePair(prevIntersection->target, intersection->target);
+			if (prevCmp == 0)
+			{
+				faceEdge->link(prevIntersection->reverse->next);
+			}
+			if ((prevCmp == 0) || prevFaceEdge)
+			{
+				prevIntersection->reverse->link(faceEdge);
+			}
+			if (cmp == 0)
+			{
+				intersection->reverse->prev->link(faceEdge->reverse);
+			}
+			faceEdge->reverse->link(intersection->reverse);
+		}
+		else
+		{
+			// Both end points lie on the plane and an edge between them already exists.
+			faceEdge = prevIntersection->reverse->next;
+		}
+
+		if (prevFaceEdge)
+		{
+			if (prevCmp > 0)
+			{
+				faceEdge->link(prevFaceEdge->reverse);
+			}
+			else if (faceEdge != prevFaceEdge->reverse)
+			{
+				// Remove the edges between the two face edges and queue their targets for removal.
+				stack.push_back(prevFaceEdge->target);
+				while (faceEdge->next != prevFaceEdge->reverse)
+				{
+					Vertex* removed = faceEdge->next->target;
+					removeEdgePair(faceEdge->next);
+					stack.push_back(removed);
+#ifdef DEBUG_CONVEX_HULL
+					b3Printf("2: Removed part contains (%d %d %d)\n", removed->point.x, removed->point.y, removed->point.z);
+#endif
+				}
+				stack.push_back(NULL);
+			}
+		}
+		faceEdge->face = face;
+		faceEdge->reverse->face = intersection->face;
+
+		if (!firstFaceEdge)
+		{
+			firstFaceEdge = faceEdge;
+		}
+	}
+#ifdef SHOW_ITERATIONS
+	b3Printf("Needed %d iterations to process all intersections\n", m);
+#endif
+
+	// Close the loop between the last and the first face edge (same cases as inside the loop).
+	if (cmp > 0)
+	{
+		firstFaceEdge->reverse->target = faceEdge->target;
+		firstIntersection->reverse->link(firstFaceEdge);
+		firstFaceEdge->link(faceEdge->reverse);
+	}
+	else if (firstFaceEdge != faceEdge->reverse)
+	{
+		stack.push_back(faceEdge->target);
+		while (firstFaceEdge->next != faceEdge->reverse)
+		{
+			Vertex* removed = firstFaceEdge->next->target;
+			removeEdgePair(firstFaceEdge->next);
+			stack.push_back(removed);
+#ifdef DEBUG_CONVEX_HULL
+			b3Printf("3: Removed part contains (%d %d %d)\n", removed->point.x, removed->point.y, removed->point.z);
+#endif
+		}
+		stack.push_back(NULL);
+	}
+
+	// The first stack entry is a kept vertex; use it as the new entry point of the hull.
+	b3Assert(stack.size() > 0);
+	vertexList = stack[0];
+
+#ifdef DEBUG_CONVEX_HULL
+	b3Printf("Removing part\n");
+#endif
+#ifdef SHOW_ITERATIONS
+	n = 0;
+#endif
+	// Phase 3: process the (kept, removed..., NULL) groups. Each removed vertex hands its
+	// nearby faces to the kept vertex, and all of its remaining edges are deleted; the
+	// targets of those edges are appended as a new group so that removal spreads further.
+	int pos = 0;
+	while (pos < stack.size())
+	{
+		int end = stack.size();
+		while (pos < end)
+		{
+			Vertex* kept = stack[pos++];
+#ifdef DEBUG_CONVEX_HULL
+			kept->print();
+#endif
+			bool deeper = false;
+			Vertex* removed;
+			while ((removed = stack[pos++]) != NULL)
+			{
+#ifdef SHOW_ITERATIONS
+				n++;
+#endif
+				kept->receiveNearbyFaces(removed);
+				while (removed->edges)
+				{
+					if (!deeper)
+					{
+						deeper = true;
+						stack.push_back(kept);
+					}
+					stack.push_back(removed->edges->target);
+					removeEdgePair(removed->edges);
+				}
+			}
+			if (deeper)
+			{
+				stack.push_back(NULL);
+			}
+		}
+	}
+#ifdef SHOW_ITERATIONS
+	b3Printf("Needed %d iterations to remove part\n", n);
+#endif
+
+	stack.resize(0);
+	face->origin = shiftedOrigin;
+
+	return true;
+}
+
+// Returns the output index stored in vertex->copy. A vertex whose copy is still negative is
+// appended to "vertices" and gets its position in that array as its index.
+static int getVertexCopy(b3ConvexHullInternal::Vertex* vertex, b3AlignedObjectArray<b3ConvexHullInternal::Vertex*>& vertices)
+{
+	if (vertex->copy >= 0)
+	{
+		return vertex->copy;
+	}
+
+	const int assigned = vertices.size();
+	vertex->copy = assigned;
+	vertices.push_back(vertex);
+#ifdef DEBUG_CONVEX_HULL
+	b3Printf("Vertex %d gets index *%d\n", vertex->point.index, assigned);
+#endif
+	return assigned;
+}
+
+// Computes the convex hull of the input points and stores it in "vertices", "edges" and "faces".
+//   coords / doubleCoords / stride / count - forwarded unchanged to b3ConvexHullInternal::compute()
+//   shrink      - if positive, the hull is shrunk via b3ConvexHullInternal::shrink()
+//   shrinkClamp - forwarded to shrink() as its clamp factor
+// Returns the value reported by shrink() (0 if no shrinking was requested). For count <= 0, or
+// when shrink() returns a negative value, the three output arrays are left empty.
+b3Scalar b3ConvexHullComputer::compute(const void* coords, bool doubleCoords, int stride, int count, b3Scalar shrink, b3Scalar shrinkClamp)
+{
+	if (count <= 0)
+	{
+		vertices.clear();
+		edges.clear();
+		faces.clear();
+		return 0;
+	}
+
+	b3ConvexHullInternal hull;
+	hull.compute(coords, doubleCoords, stride, count);
+
+	b3Scalar shift = 0;
+	if ((shrink > 0) && ((shift = hull.shrink(shrink, shrinkClamp)) < 0))
+	{
+		vertices.clear();
+		edges.clear();
+		faces.clear();
+		return shift;
+	}
+
+	vertices.resize(0);
+	edges.resize(0);
+	faces.resize(0);
+
+	// Pass 1: copy vertices and edges. oldVertices grows while it is being traversed, because
+	// getVertexCopy() appends every newly encountered edge target.
+	b3AlignedObjectArray<b3ConvexHullInternal::Vertex*> oldVertices;
+	getVertexCopy(hull.vertexList, oldVertices);
+	int copied = 0;
+	while (copied < oldVertices.size())
+	{
+		b3ConvexHullInternal::Vertex* v = oldVertices[copied];
+		vertices.push_back(hull.getCoordinates(v));
+		b3ConvexHullInternal::Edge* firstEdge = v->edges;
+		if (firstEdge)
+		{
+			int firstCopy = -1;
+			int prevCopy = -1;
+			b3ConvexHullInternal::Edge* e = firstEdge;
+			do
+			{
+				if (e->copy < 0)
+				{
+					// An edge and its reverse are stored next to each other at s and s + 1;
+					// "reverse" is the relative offset between the two entries.
+					int s = edges.size();
+					edges.push_back(Edge());
+					edges.push_back(Edge());
+					Edge* c = &edges[s];
+					Edge* r = &edges[s + 1];
+					e->copy = s;
+					e->reverse->copy = s + 1;
+					c->reverse = 1;
+					r->reverse = -1;
+					c->targetVertex = getVertexCopy(e->target, oldVertices);
+					r->targetVertex = copied;
+#ifdef DEBUG_CONVEX_HULL
+					b3Printf("      CREATE: Vertex *%d has edge to *%d\n", copied, c->getTargetVertex());
+#endif
+				}
+				// "next" is a relative offset too; it points to the edge visited just before
+				// this one in the vertex's edge ring.
+				if (prevCopy >= 0)
+				{
+					edges[e->copy].next = prevCopy - e->copy;
+				}
+				else
+				{
+					firstCopy = e->copy;
+				}
+				prevCopy = e->copy;
+				e = e->next;
+			} while (e != firstEdge);
+			// Close the ring: the first edge links to the last one visited.
+			edges[firstCopy].next = prevCopy - firstCopy;
+		}
+		copied++;
+	}
+
+	// Pass 2: emit one entry per face. After a face has been recorded, copy is reset to -1 on
+	// all edges of its loop so that the face is not recorded again.
+	for (int i = 0; i < copied; i++)
+	{
+		b3ConvexHullInternal::Vertex* v = oldVertices[i];
+		b3ConvexHullInternal::Edge* firstEdge = v->edges;
+		if (firstEdge)
+		{
+			b3ConvexHullInternal::Edge* e = firstEdge;
+			do
+			{
+				if (e->copy >= 0)
+				{
+#ifdef DEBUG_CONVEX_HULL
+					b3Printf("Vertex *%d has edge to *%d\n", i, edges[e->copy].getTargetVertex());
+#endif
+					faces.push_back(e->copy);
+					b3ConvexHullInternal::Edge* f = e;
+					do
+					{
+#ifdef DEBUG_CONVEX_HULL
+						b3Printf("   Face *%d\n", edges[f->copy].getTargetVertex());
+#endif
+						f->copy = -1;
+						f = f->reverse->prev;
+					} while (f != e);
+				}
+				e = e->next;
+			} while (e != firstEdge);
+		}
+	}
+
+	return shift;
+}

+ 99 - 0
Dependencies/include/bullet3/Bullet3Geometry/b3ConvexHullComputer.h

@@ -0,0 +1,99 @@
+/*
+Copyright (c) 2011 Ole Kniemeyer, MAXON, www.maxon.net
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef B3_CONVEX_HULL_COMPUTER_H
+#define B3_CONVEX_HULL_COMPUTER_H
+
+#include "Bullet3Common/b3Vector3.h"
+#include "Bullet3Common/b3AlignedObjectArray.h"
+
+/// Convex hull implementation based on Preparata and Hong
+/// See http://code.google.com/p/bullet/issues/detail?id=275
+/// Ole Kniemeyer, MAXON Computer GmbH
+class b3ConvexHullComputer
+{
+private:
+	// Shared implementation behind the public float/double overloads; "doubleCoords" tells
+	// whether "coords" points to doubles (true) or floats (false).
+	b3Scalar compute(const void* coords, bool doubleCoords, int stride, int count, b3Scalar shrink, b3Scalar shrinkClamp);
+
+public:
+	// Half-edge of the output hull. Edges live in the "edges" array and refer to one another
+	// through offsets relative to their own position in that array.
+	class Edge
+	{
+	private:
+		int next;          // relative offset of the next edge around the same vertex
+		int reverse;       // relative offset of the oppositely directed edge
+		int targetVertex;  // index into "vertices" of the vertex this edge points to
+
+		friend class b3ConvexHullComputer;
+
+	public:
+		// Index of the vertex this edge starts at (the target of the reverse edge).
+		int getSourceVertex() const
+		{
+			return getReverseEdge()->getTargetVertex();
+		}
+
+		// Index of the vertex this edge points to.
+		int getTargetVertex() const
+		{
+			return targetVertex;
+		}
+
+		// Next entry in the clockwise list of all edges of a vertex.
+		const Edge* getNextEdgeOfVertex() const
+		{
+			return this + next;
+		}
+
+		// Next entry in the counter-clockwise list of all edges of a face.
+		const Edge* getNextEdgeOfFace() const
+		{
+			return getReverseEdge()->getNextEdgeOfVertex();
+		}
+
+		// The oppositely directed edge between the same two vertices.
+		const Edge* getReverseEdge() const
+		{
+			return this + reverse;
+		}
+	};
+
+	// Vertices of the output hull
+	b3AlignedObjectArray<b3Vector3> vertices;
+
+	// Edges of the output hull
+	b3AlignedObjectArray<Edge> edges;
+
+	// Faces of the convex hull. Each entry is an index into the "edges" array pointing to an edge of the face. Faces are planar n-gons
+	b3AlignedObjectArray<int> faces;
+
+	/*
+		Computes the convex hull of the "count" vertices stored in "coords" (single precision).
+		"stride" is the distance in bytes between the addresses of consecutive vertices.
+
+		A positive "shrink" shrinks the hull by that amount: every face is moved by "shrink" length
+		units towards the center along its normal. A positive "shrinkClamp" limits "shrink" to at most
+		"shrinkClamp * innerRadius", with "innerRadius" being the minimum distance of a face to the
+		center of the convex hull.
+
+		Returns the amount by which the hull has been shrunken. A negative return value means the
+		amount was so large that the resulting convex hull is empty.
+
+		The result is available in the member variables "vertices", "edges" and "faces".
+		*/
+	b3Scalar compute(const float* coords, int stride, int count, b3Scalar shrink, b3Scalar shrinkClamp)
+	{
+		const bool coordsAreDouble = false;
+		return compute(coords, coordsAreDouble, stride, count, shrink, shrinkClamp);
+	}
+
+	// Double precision variant of the overload above; all other parameters and the return
+	// value have the same meaning.
+	b3Scalar compute(const double* coords, int stride, int count, b3Scalar shrink, b3Scalar shrinkClamp)
+	{
+		const bool coordsAreDouble = true;
+		return compute(coords, coordsAreDouble, stride, count, shrink, shrinkClamp);
+	}
+};
+
+#endif  //B3_CONVEX_HULL_COMPUTER_H

+ 174 - 0
Dependencies/include/bullet3/Bullet3Geometry/b3GeometryUtil.cpp

@@ -0,0 +1,174 @@
+/*
+Copyright (c) 2003-2006 Gino van den Bergen / Erwin Coumans  https://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include "b3GeometryUtil.h"
+
+/*
+  Make sure this dummy function never changes so that it
+  can be used by probes that are checking whether the
+  library is actually installed.
+*/
+extern "C"
+{
+	// C linkage keeps the symbol name unmangled; the function has an intentionally empty body.
+	void b3BulletMathProbe();
+
+	void b3BulletMathProbe() {}
+}
+
+// Returns true if "point" is not in front of any plane by more than "margin", i.e. if
+// n.dot(point) + w - margin <= 0 holds for every plane (n in xyz, w in the fourth component).
+// An empty plane list yields true.
+bool b3GeometryUtil::isPointInsidePlanes(const b3AlignedObjectArray<b3Vector3>& planeEquations, const b3Vector3& point, b3Scalar margin)
+{
+	const int planeCount = planeEquations.size();
+	int planeIndex = 0;
+	while (planeIndex < planeCount)
+	{
+		const b3Vector3& plane = planeEquations[planeIndex];
+		const b3Scalar signedDist = b3Scalar(plane.dot(point)) + b3Scalar(plane[3]) - margin;
+		if (signedDist > b3Scalar(0.))
+		{
+			return false;
+		}
+		++planeIndex;
+	}
+	return true;
+}
+
+// Returns true if no vertex is in front of the plane by more than "margin", i.e. if
+// n.dot(vertex) + w - margin <= 0 holds for every vertex (n in xyz of planeNormal, w in its
+// fourth component). An empty vertex list yields true.
+bool b3GeometryUtil::areVerticesBehindPlane(const b3Vector3& planeNormal, const b3AlignedObjectArray<b3Vector3>& vertices, b3Scalar margin)
+{
+	const int vertexCount = vertices.size();
+	int vertexIndex = 0;
+	while (vertexIndex < vertexCount)
+	{
+		const b3Vector3& vertex = vertices[vertexIndex];
+		const b3Scalar signedDist = b3Scalar(planeNormal.dot(vertex)) + b3Scalar(planeNormal[3]) - margin;
+		if (signedDist > b3Scalar(0.))
+		{
+			return false;
+		}
+		++vertexIndex;
+	}
+	return true;
+}
+
+bool notExist(const b3Vector3& planeEquation, const b3AlignedObjectArray<b3Vector3>& planeEquations);
+
+// Returns false as soon as one stored plane has a dot product with "planeEquation" above
+// 0.999 (treated as the same plane direction), and true if no such plane is found.
+bool notExist(const b3Vector3& planeEquation, const b3AlignedObjectArray<b3Vector3>& planeEquations)
+{
+	const b3Scalar sameDirectionThreshold = b3Scalar(0.999);
+	const int storedCount = planeEquations.size();
+	for (int idx = 0; idx != storedCount; ++idx)
+	{
+		const b3Vector3& stored = planeEquations[idx];
+		const bool alreadyPresent = planeEquation.dot(stored) > sameDirectionThreshold;
+		if (alreadyPresent)
+		{
+			return false;
+		}
+	}
+	return true;
+}
+
+// Appends to "planeEquationsOut" the planes that bound the given vertices.
+// Every vertex triple is tried with both normal orientations (brute force). A candidate plane
+// (unit normal in xyz, -n.dot(p) in the fourth component) is kept if its direction is not
+// already present (see notExist) and all vertices lie behind it within a margin of 0.01.
+void b3GeometryUtil::getPlaneEquationsFromVertices(b3AlignedObjectArray<b3Vector3>& vertices, b3AlignedObjectArray<b3Vector3>& planeEquationsOut)
+{
+	const int vertexCount = vertices.size();
+	for (int a = 0; a < vertexCount; a++)
+	{
+		const b3Vector3& cornerA = vertices[a];
+
+		for (int b = a + 1; b < vertexCount; b++)
+		{
+			const b3Vector3& cornerB = vertices[b];
+
+			for (int c = b + 1; c < vertexCount; c++)
+			{
+				const b3Vector3& cornerC = vertices[c];
+
+				const b3Vector3 edgeAB = cornerB - cornerA;
+				const b3Vector3 edgeAC = cornerC - cornerA;
+
+				// first pass uses the normal as computed, second pass the flipped one
+				for (int pass = 0; pass < 2; pass++)
+				{
+					const b3Scalar orientation = (pass == 0) ? b3Scalar(1.) : b3Scalar(-1.);
+					b3Vector3 candidate = orientation * edgeAB.cross(edgeAC);
+
+					// degenerate (nearly collinear) triple
+					if (!(candidate.length2() > b3Scalar(0.0001)))
+					{
+						continue;
+					}
+					candidate.normalize();
+					if (!notExist(candidate, planeEquationsOut))
+					{
+						continue;
+					}
+					candidate[3] = -candidate.dot(cornerA);
+
+					// keep the plane only if the whole vertex set is behind it
+					if (areVerticesBehindPlane(candidate, vertices, b3Scalar(0.01)))
+					{
+						planeEquationsOut.push_back(candidate);
+					}
+				}
+			}
+		}
+	}
+}
+
+// Appends to "verticesOut" the corner points defined by the given planes.
+// Every plane triple is intersected (brute force); the intersection point is kept if it lies
+// inside all planes within a margin of 0.01.
+void b3GeometryUtil::getVerticesFromPlaneEquations(const b3AlignedObjectArray<b3Vector3>& planeEquations, b3AlignedObjectArray<b3Vector3>& verticesOut)
+{
+	const int planeCount = planeEquations.size();
+	const b3Scalar minCrossLength2 = b3Scalar(0.0001);
+
+	for (int a = 0; a < planeCount; a++)
+	{
+		const b3Vector3& planeA = planeEquations[a];
+
+		for (int b = a + 1; b < planeCount; b++)
+		{
+			const b3Vector3& planeB = planeEquations[b];
+
+			for (int c = b + 1; c < planeCount; c++)
+			{
+				const b3Vector3& planeC = planeEquations[c];
+
+				b3Vector3 crossBC = planeB.cross(planeC);
+				b3Vector3 crossCA = planeC.cross(planeA);
+				b3Vector3 crossAB = planeA.cross(planeB);
+
+				// skip triples that contain a (nearly) parallel pair of normals
+				if (!(crossBC.length2() > minCrossLength2) ||
+					!(crossCA.length2() > minCrossLength2) ||
+					!(crossAB.length2() > minCrossLength2))
+				{
+					continue;
+				}
+
+				//point P out of 3 plane equations:
+
+				//	d1 ( N2 * N3 ) + d2 ( N3 * N1 ) + d3 ( N1 * N2 )
+				//P =  -------------------------------------------------------------------------
+				//   N1 . ( N2 * N3 )
+
+				b3Scalar quotient = planeA.dot(crossBC);
+				if (!(b3Fabs(quotient) > b3Scalar(0.000001)))
+				{
+					continue;
+				}
+
+				quotient = b3Scalar(-1.) / quotient;
+				crossBC *= planeA[3];
+				crossCA *= planeB[3];
+				crossAB *= planeC[3];
+				b3Vector3 candidate = crossBC;
+				candidate += crossCA;
+				candidate += crossAB;
+				candidate *= quotient;
+
+				// keep the point only if it is inside every plane
+				if (isPointInsidePlanes(planeEquations, candidate, b3Scalar(0.01)))
+				{
+					verticesOut.push_back(candidate);
+				}
+			}
+		}
+	}
+}

+ 36 - 0
Dependencies/include/bullet3/Bullet3Geometry/b3GeometryUtil.h

@@ -0,0 +1,36 @@
+/*
+Copyright (c) 2003-2006 Gino van den Bergen / Erwin Coumans  https://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef B3_GEOMETRY_UTIL_H
+#define B3_GEOMETRY_UTIL_H
+
+#include "Bullet3Common/b3Vector3.h"
+#include "Bullet3Common/b3AlignedObjectArray.h"
+
+///The b3GeometryUtil helper class provides a few methods to convert between plane equations and vertices.
+///Plane equations are stored in a b3Vector3: normal in xyz, plane constant in the fourth component.
+class b3GeometryUtil
+{
+public:
+	/// Appends the bounding planes of "vertices" to "planeEquationsOut" (brute force over all
+	/// vertex triples; a plane is kept if every vertex lies behind it within a margin of 0.01).
+	static void getPlaneEquationsFromVertices(b3AlignedObjectArray<b3Vector3>& vertices, b3AlignedObjectArray<b3Vector3>& planeEquationsOut);
+
+	/// Appends the intersection points of all plane triples that lie inside every plane
+	/// (margin 0.01) to "verticesOut" (brute force over all plane triples).
+	static void getVerticesFromPlaneEquations(const b3AlignedObjectArray<b3Vector3>& planeEquations, b3AlignedObjectArray<b3Vector3>& verticesOut);
+
+	/// NOTE(review): declared here, but b3GeometryUtil.cpp as added in this change contains no
+	/// definition of isInside - calling it would fail to link unless it is defined elsewhere.
+	static bool isInside(const b3AlignedObjectArray<b3Vector3>& vertices, const b3Vector3& planeNormal, b3Scalar margin);
+
+	/// Returns true if n.dot(point) + w - margin <= 0 for every plane in "planeEquations".
+	static bool isPointInsidePlanes(const b3AlignedObjectArray<b3Vector3>& planeEquations, const b3Vector3& point, b3Scalar margin);
+
+	/// Returns true if n.dot(vertex) + w - margin <= 0 for every vertex, with n and w taken
+	/// from "planeNormal".
+	static bool areVerticesBehindPlane(const b3Vector3& planeNormal, const b3AlignedObjectArray<b3Vector3>& vertices, b3Scalar margin);
+};
+
+#endif  //B3_GEOMETRY_UTIL_H

+ 116 - 0
Dependencies/include/bullet3/Bullet3Geometry/b3GrahamScan2dConvexHull.h

@@ -0,0 +1,116 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2011 Advanced Micro Devices, Inc.  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef B3_GRAHAM_SCAN_2D_CONVEX_HULL_H
+#define B3_GRAHAM_SCAN_2D_CONVEX_HULL_H
+
+#include "Bullet3Common/b3Vector3.h"
+#include "Bullet3Common/b3AlignedObjectArray.h"
+
+// Point used by the 2D Graham scan: a b3Vector3 plus the sort key computed by the scan
+// (m_angle) and the index the point had in the caller's original array (m_orgIndex).
+struct b3GrahamVector3 : public b3Vector3
+{
+	b3GrahamVector3(const b3Vector3& org, int orgIndex)
+		: b3Vector3(org),
+		  // Start from a defined value: the scan copies points before it has assigned an
+		  // angle (e.g. when there is only a single input point), and copying an
+		  // uninitialized scalar is undefined behavior.
+		  m_angle(b3Scalar(0.)),
+		  m_orgIndex(orgIndex)
+	{
+	}
+	b3Scalar m_angle;
+	int m_orgIndex;
+};
+
+// Sort predicate for the Graham scan: orders points by m_angle, then by squared distance
+// to the anchor, then by original index.
+struct b3AngleCompareFunc
+{
+	b3Vector3 m_anchor;
+	b3AngleCompareFunc(const b3Vector3& anchor)
+		: m_anchor(anchor)
+	{
+	}
+	bool operator()(const b3GrahamVector3& a, const b3GrahamVector3& b) const
+	{
+		if (a.m_angle != b.m_angle)
+		{
+			return a.m_angle < b.m_angle;
+		}
+
+		// same angle: the point closer to the anchor comes first
+		const b3Scalar distA = (a - m_anchor).length2();
+		const b3Scalar distB = (b - m_anchor).length2();
+		return (distA != distB) ? (distA < distB) : (a.m_orgIndex < b.m_orgIndex);
+	}
+};
+
+// Computes the 2D convex hull of "originalPoints" in the plane perpendicular to "normalAxis"
+// using a Graham scan and appends the hull points, in scan order, to "hull".
+//   originalPoints - input points; reordered in place and their m_angle fields overwritten
+//   hull           - output array; hull points are appended
+//   normalAxis     - normal of the plane the scan works in (assumed to be unit length)
+// With zero or one input point, the input is copied to "hull" unchanged.
+inline void b3GrahamScanConvexHull2D(b3AlignedObjectArray<b3GrahamVector3>& originalPoints, b3AlignedObjectArray<b3GrahamVector3>& hull, const b3Vector3& normalAxis)
+{
+	b3Vector3 axis0, axis1;
+	b3PlaneSpace1(normalAxis, axis0, axis1);
+
+	const int numPoints = originalPoints.size();
+	if (numPoints <= 1)
+	{
+		for (int i = 0; i < numPoints; i++)
+			hull.push_back(originalPoints[0]);
+		return;
+	}
+	//step1 : find anchor point with smallest projection on axis0 and move it to first location
+	for (int i = 0; i < numPoints; i++)
+	{
+		b3Scalar projL = originalPoints[i].dot(axis0);
+		b3Scalar projR = originalPoints[0].dot(axis0);
+		if (projL < projR)
+		{
+			originalPoints.swap(0, i);
+		}
+	}
+
+	//also precompute angles
+	originalPoints[0].m_angle = -1e30f;
+	for (int i = 1; i < numPoints; i++)
+	{
+		b3Vector3 ar = originalPoints[i] - originalPoints[0];
+		b3Scalar arLength = ar.length();
+		if (arLength > b3Scalar(0.))
+		{
+			originalPoints[i].m_angle = b3Cross(axis0, ar).dot(normalAxis) / arLength;
+		}
+		else
+		{
+			// The point coincides with the anchor, so it has no direction and 0 / 0 would
+			// produce NaN, which breaks the ordering used by the sort below. With unit-length
+			// axes the measure above never drops below -1, so this value places such points
+			// directly after the anchor.
+			originalPoints[i].m_angle = b3Scalar(-1.);
+		}
+	}
+
+	//step 2: sort all points, based on 'angle' with this anchor
+	b3AngleCompareFunc comp(originalPoints[0]);
+	originalPoints.quickSortInternal(comp, 1, numPoints - 1);
+
+	int i;
+	for (i = 0; i < 2; i++)
+		hull.push_back(originalPoints[i]);
+
+	//step 3: keep all 'convex' points and discard concave points (using back tracking)
+	for (; i != numPoints; i++)
+	{
+		bool isConvex = false;
+		while (!isConvex && hull.size() > 1)
+		{
+			b3Vector3& a = hull[hull.size() - 2];
+			b3Vector3& b = hull[hull.size() - 1];
+			isConvex = b3Cross(a - b, a - originalPoints[i]).dot(normalAxis) > 0;
+			if (!isConvex)
+				hull.pop_back();
+			else
+				hull.push_back(originalPoints[i]);
+		}
+
+		// Back tracking can pop everything except the anchor. In that case the loop above
+		// ends without having pushed the current point, so add it here; otherwise the point
+		// would be dropped from the hull.
+		if (hull.size() == 1)
+		{
+			hull.push_back(originalPoints[i]);
+		}
+	}
+}
+
+#endif  //B3_GRAHAM_SCAN_2D_CONVEX_HULL_H

+ 16 - 0
Dependencies/include/bullet3/Bullet3Geometry/premake4.lua

@@ -0,0 +1,16 @@
+	-- premake4 project definition: builds the Bullet3Geometry sources as a static C++ library.
+	project "Bullet3Geometry"
+
+	kind "StaticLib"
+	language "C++"
+
+	-- include paths are resolved relative to the parent directory
+	includedirs { ".." }
+
+	-- -fPIC is only passed when building on Linux
+	if os.is("Linux") then
+		buildoptions { "-fPIC" }
+	end
+
+	-- every .cpp and .h file matched by the ** patterns belongs to this project
+	files { "**.cpp", "**.h" }

+ 42 - 0
Dependencies/include/bullet3/Bullet3OpenCL/BroadphaseCollision/b3GpuBroadphaseInterface.h

@@ -0,0 +1,42 @@
+
+#ifndef B3_GPU_BROADPHASE_INTERFACE_H
+#define B3_GPU_BROADPHASE_INTERFACE_H
+
+#include "Bullet3OpenCL/Initialize/b3OpenCLInclude.h"
+#include "Bullet3Common/b3Vector3.h"
+#include "b3SapAabb.h"
+#include "Bullet3Common/shared/b3Int2.h"
+#include "Bullet3Common/shared/b3Int4.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
+
+class b3GpuBroadphaseInterface  //abstract interface of an OpenCL broadphase: every operation below is pure virtual
+{
+public:
+	typedef class b3GpuBroadphaseInterface*(CreateFunc)(cl_context ctx, cl_device_id device, cl_command_queue q);  //factory function type: returns a broadphase for the given OpenCL context, device and command queue
+
+	virtual ~b3GpuBroadphaseInterface()  //virtual so that implementations can be destroyed through an interface pointer
+	{
+	}
+
+	virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask) = 0;  //adds an AABB given by its min/max corners; userPtr is an integer handle supplied by the caller
+	virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask) = 0;  //same arguments as createProxy; presumably for AABBs that are tracked separately from the small ones - confirm against implementations
+
+	virtual void calculateOverlappingPairs(int maxPairs) = 0;      //maxPairs: upper bound on the number of pairs to produce
+	virtual void calculateOverlappingPairsHost(int maxPairs) = 0;  //presumably the host (CPU) counterpart of calculateOverlappingPairs - confirm against implementations
+
+	//call writeAabbsToGpu after done making all changes (createProxy etc)
+	virtual void writeAabbsToGpu() = 0;
+
+	virtual cl_mem getAabbBufferWS() = 0;  //NOTE(review): "WS" presumably stands for world space - confirm
+	virtual int getNumOverlap() = 0;       //presumably the number of pairs found by the last calculateOverlappingPairs call - confirm
+	virtual cl_mem getOverlappingPairBuffer() = 0;
+
+	virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU() = 0;         //device-side AABB array
+	virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU() = 0;  //host-side AABB array
+
+	virtual b3OpenCLArray<b3Int4>& getOverlappingPairsGPU() = 0;
+	virtual b3OpenCLArray<int>& getSmallAabbIndicesGPU() = 0;
+	virtual b3OpenCLArray<int>& getLargeAabbIndicesGPU() = 0;
+};
+
+#endif  //B3_GPU_BROADPHASE_INTERFACE_H

+ 338 - 0
Dependencies/include/bullet3/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.cpp

@@ -0,0 +1,338 @@
+
+#include "b3GpuGridBroadphase.h"
+#include "Bullet3Geometry/b3AabbUtil.h"
+#include "kernels/gridBroadphaseKernels.h"
+#include "kernels/sapKernels.h"
+//#include "kernels/gridBroadphase.cl"
+
+#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
+
+#define B3_BROADPHASE_SAP_PATH "src/Bullet3OpenCL/BroadphaseCollision/kernels/sap.cl"
+#define B3_GRID_BROADPHASE_PATH "src/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphase.cl"
+
+cl_kernel kCalcHashAABB;          //file-scope kernel handles: assigned in the b3GpuGridBroadphase constructor and released in its destructor
+cl_kernel kClearCellStart;        //NOTE(review): these are shared by all instances, so a second b3GpuGridBroadphase overwrites them and both destructors release the same handles
+cl_kernel kFindCellStart;
+cl_kernel kFindOverlappingPairs;
+cl_kernel m_copyAabbsKernel;      //despite the m_ prefix this is a global, not a class member
+cl_kernel m_sap2Kernel;           //despite the m_ prefix this is a global, not a class member
+
+//int maxPairsPerBody = 64;
+int maxBodiesPerCell = 256;  //copied into the grid parameters (m_gridSize[3]) by the b3GpuGridBroadphase constructor
+
+//Fills in the grid parameters, uploads them to the GPU, compiles the SAP and grid broadphase
+//kernels and creates the radix sorter that is run on the hash array in calculateOverlappingPairs.
+//
+//ctx, device, q: OpenCL context, device and command queue used for all GPU arrays and kernels.
+b3GpuGridBroadphase::b3GpuGridBroadphase(cl_context ctx, cl_device_id device, cl_command_queue q)
+	: m_context(ctx),
+	  m_device(device),
+	  m_queue(q),
+	  m_allAabbsGPU1(ctx, q),
+	  m_smallAabbsMappingGPU(ctx, q),
+	  m_largeAabbsMappingGPU(ctx, q),
+	  m_gpuPairs(ctx, q),
+
+	  m_hashGpu(ctx, q),
+
+	  m_cellStartGpu(ctx, q),
+	  m_paramsGPU(ctx, q)
+{
+	//Size of one grid cell along x, y and z; only its reciprocal is stored in the parameters.
+	b3Vector3 cellSize = b3MakeVector3(3, 3, 3);
+	b3Vector3 invCellSize = b3MakeVector3(1.f / cellSize[0], 1.f / cellSize[1], 1.f / cellSize[2]);
+
+	//128 x 128 x 128 cells; the 4th component of m_gridSize carries the per-cell body limit
+	m_paramsCPU.m_gridSize[0] = 128;
+	m_paramsCPU.m_gridSize[1] = 128;
+	m_paramsCPU.m_gridSize[2] = 128;
+	m_paramsCPU.setMaxBodiesPerCell(maxBodiesPerCell);
+	m_paramsCPU.m_invCellSize[0] = invCellSize[0];
+	m_paramsCPU.m_invCellSize[1] = invCellSize[1];
+	m_paramsCPU.m_invCellSize[2] = invCellSize[2];
+	m_paramsCPU.m_invCellSize[3] = 0.f;
+	m_paramsGPU.push_back(m_paramsCPU);
+
+	cl_int errNum = 0;
+
+	{
+		const char* sapSrc = sapCL;
+		cl_program sapProg = b3OpenCLUtils::compileCLProgramFromString(m_context, m_device, sapSrc, &errNum, "", B3_BROADPHASE_SAP_PATH);
+		b3Assert(errNum == CL_SUCCESS);
+		m_copyAabbsKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "copyAabbsKernel", &errNum, sapProg);
+		//check here as well: the next call overwrites errNum
+		b3Assert(errNum == CL_SUCCESS);
+		m_sap2Kernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "computePairsKernelTwoArrays", &errNum, sapProg);
+		b3Assert(errNum == CL_SUCCESS);
+
+		//The program handle is not stored anywhere, so drop our reference now instead of leaking it.
+		//Per the OpenCL specification the program object is only deleted once the kernels created
+		//from it have been released too.
+		if (sapProg)
+		{
+			clReleaseProgram(sapProg);
+		}
+	}
+
+	{
+		cl_program gridProg = b3OpenCLUtils::compileCLProgramFromString(m_context, m_device, gridBroadphaseCL, &errNum, "", B3_GRID_BROADPHASE_PATH);
+		b3Assert(errNum == CL_SUCCESS);
+
+		kCalcHashAABB = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, gridBroadphaseCL, "kCalcHashAABB", &errNum, gridProg);
+		b3Assert(errNum == CL_SUCCESS);
+
+		kClearCellStart = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, gridBroadphaseCL, "kClearCellStart", &errNum, gridProg);
+		b3Assert(errNum == CL_SUCCESS);
+
+		kFindCellStart = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, gridBroadphaseCL, "kFindCellStart", &errNum, gridProg);
+		b3Assert(errNum == CL_SUCCESS);
+
+		kFindOverlappingPairs = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, gridBroadphaseCL, "kFindOverlappingPairs", &errNum, gridProg);
+		b3Assert(errNum == CL_SUCCESS);
+
+		//same as for sapProg above: release our reference, the kernels keep the program alive
+		if (gridProg)
+		{
+			clReleaseProgram(gridProg);
+		}
+	}
+
+	//owned by this object; deleted in the destructor
+	m_sorter = new b3RadixSort32CL(m_context, m_device, m_queue);
+}
+//Releases the six broadphase kernels and deletes the radix sorter created by the constructor.
+b3GpuGridBroadphase::~b3GpuGridBroadphase()
+{
+	//same release order as the kernels are listed here
+	cl_kernel kernels[] = {
+		kCalcHashAABB,
+		kClearCellStart,
+		kFindCellStart,
+		kFindOverlappingPairs,
+		m_sap2Kernel,
+		m_copyAabbsKernel};
+	const int numKernels = sizeof(kernels) / sizeof(kernels[0]);
+	for (int k = 0; k < numKernels; k++)
+	{
+		clReleaseKernel(kernels[k]);
+	}
+
+	delete m_sorter;
+}
+
+//Appends an AABB to the host-side AABB array and records its array index in the small-AABB mapping.
+//userPtr is stored in m_minIndices[3]; collisionFilterGroup and collisionFilterMask are not used.
+//The GPU copies are only refreshed by writeAabbsToGpu().
+void b3GpuGridBroadphase::createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask)
+{
+	//index the new AABB will have in m_allAabbsCPU1
+	const int newIndex = m_allAabbsCPU1.size();
+
+	b3SapAabb proxy;
+	proxy.m_minVec = aabbMin;
+	proxy.m_maxVec = aabbMax;
+	//the 4th components are written after the vectors so that they are not overwritten by them
+	proxy.m_minIndices[3] = userPtr;
+	proxy.m_signedMaxIndices[3] = newIndex;  //NOT userPtr;
+
+	m_smallAabbsMappingCPU.push_back(newIndex);
+	m_allAabbsCPU1.push_back(proxy);
+}
+//Appends an AABB to the host-side AABB array and records its array index in the large-AABB mapping.
+//userPtr is stored in m_minIndices[3]; collisionFilterGroup and collisionFilterMask are not used.
+//The GPU copies are only refreshed by writeAabbsToGpu().
+void b3GpuGridBroadphase::createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask)
+{
+	//index the new AABB will have in m_allAabbsCPU1
+	const int newIndex = m_allAabbsCPU1.size();
+
+	b3SapAabb proxy;
+	proxy.m_minVec = aabbMin;
+	proxy.m_maxVec = aabbMax;
+	//the 4th components are written after the vectors so that they are not overwritten by them
+	proxy.m_minIndices[3] = userPtr;
+	proxy.m_signedMaxIndices[3] = newIndex;  //NOT userPtr;
+
+	m_largeAabbsMappingCPU.push_back(newIndex);
+	m_allAabbsCPU1.push_back(proxy);
+}
+
+//Computes the overlapping AABB pairs on the GPU and leaves them in m_gpuPairs.
+//
+//Two stages write into the same pair buffer through one shared pair counter:
+//  1. large vs small AABBs with the two-array SAP kernel (only when both kinds exist),
+//  2. small vs small AABBs with the grid kernels (hash, sort, find cell start, find pairs).
+//
+//maxPairs: capacity of the pair buffer. If the counter ends up above it, an error is logged and
+//          the result size is clamped to maxPairs.
+//
+//On return m_gpuPairs.size() (see getNumOverlap()) is the number of pairs found.
+void b3GpuGridBroadphase::calculateOverlappingPairs(int maxPairs)
+{
+	B3_PROFILE("b3GpuGridBroadphase::calculateOverlappingPairs");
+
+	//For debugging, calculateOverlappingPairsHost(maxPairs) can be called here (followed by a
+	//return) to compute the pairs on the host instead.
+
+	int numSmallAabbs = m_smallAabbsMappingGPU.size();
+
+	//single-element GPU counter shared by both stages
+	b3OpenCLArray<int> pairCount(m_context, m_queue);
+	pairCount.push_back(0);
+	m_gpuPairs.resize(maxPairs);  //numSmallAabbs*maxPairsPerBody);
+
+	{
+		int numLargeAabbs = m_largeAabbsMappingGPU.size();
+		if (numLargeAabbs && numSmallAabbs)
+		{
+			B3_PROFILE("sap2Kernel");
+			b3BufferInfoCL bInfo[] = {
+				b3BufferInfoCL(m_allAabbsGPU1.getBufferCL()),
+				b3BufferInfoCL(m_largeAabbsMappingGPU.getBufferCL()),
+				b3BufferInfoCL(m_smallAabbsMappingGPU.getBufferCL()),
+				b3BufferInfoCL(m_gpuPairs.getBufferCL()),
+				b3BufferInfoCL(pairCount.getBufferCL())};
+			b3LauncherCL launcher(m_queue, m_sap2Kernel, "m_sap2Kernel");
+			launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
+			launcher.setConst(numLargeAabbs);
+			launcher.setConst(numSmallAabbs);
+			launcher.setConst(0);  //axis is not used
+			launcher.setConst(maxPairs);
+			//@todo: use actual maximum work item sizes of the device instead of hardcoded values
+			launcher.launch2D(numLargeAabbs, numSmallAabbs, 4, 64);
+
+			//only reported here; the final clamp happens at the end of this function
+			int numLargeSmallPairs = pairCount.at(0);
+			if (numLargeSmallPairs > maxPairs)
+			{
+				b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numLargeSmallPairs, maxPairs);
+			}
+		}
+	}
+
+	if (numSmallAabbs)
+	{
+		B3_PROFILE("gridKernel");
+		m_hashGpu.resize(numSmallAabbs);
+		{
+			B3_PROFILE("kCalcHashAABB");
+			b3LauncherCL launch(m_queue, kCalcHashAABB, "kCalcHashAABB");
+			launch.setConst(numSmallAabbs);
+			launch.setBuffer(m_allAabbsGPU1.getBufferCL());
+			launch.setBuffer(m_smallAabbsMappingGPU.getBufferCL());
+			launch.setBuffer(m_hashGpu.getBufferCL());
+			launch.setBuffer(this->m_paramsGPU.getBufferCL());
+			launch.launch1D(numSmallAabbs);
+		}
+
+		m_sorter->execute(m_hashGpu);
+
+		//one cell-start entry per grid cell
+		int numCells = this->m_paramsCPU.m_gridSize[0] * this->m_paramsCPU.m_gridSize[1] * this->m_paramsCPU.m_gridSize[2];
+		m_cellStartGpu.resize(numCells);
+
+		{
+			B3_PROFILE("kClearCellStart");
+			b3LauncherCL launch(m_queue, kClearCellStart, "kClearCellStart");
+			launch.setConst(numCells);
+			launch.setBuffer(m_cellStartGpu.getBufferCL());
+			launch.launch1D(numCells);
+		}
+
+		{
+			B3_PROFILE("kFindCellStart");
+			b3LauncherCL launch(m_queue, kFindCellStart, "kFindCellStart");
+			launch.setConst(numSmallAabbs);
+			launch.setBuffer(m_hashGpu.getBufferCL());
+			launch.setBuffer(m_cellStartGpu.getBufferCL());
+			launch.launch1D(numSmallAabbs);
+		}
+
+		{
+			B3_PROFILE("kFindOverlappingPairs");
+
+			b3LauncherCL launch(m_queue, kFindOverlappingPairs, "kFindOverlappingPairs");
+			launch.setConst(numSmallAabbs);
+			launch.setBuffer(m_allAabbsGPU1.getBufferCL());
+			launch.setBuffer(m_smallAabbsMappingGPU.getBufferCL());
+			launch.setBuffer(m_hashGpu.getBufferCL());
+			launch.setBuffer(m_cellStartGpu.getBufferCL());
+
+			launch.setBuffer(m_paramsGPU.getBufferCL());
+			launch.setBuffer(pairCount.getBufferCL());
+			launch.setBuffer(m_gpuPairs.getBufferCL());
+
+			launch.setConst(maxPairs);
+			launch.launch1D(numSmallAabbs);
+		}
+	}
+
+	//Shrink the pair buffer to the number of pairs that were produced. This must also happen when
+	//there are no small AABBs: the buffer was resized to maxPairs above, so skipping this step
+	//would make getNumOverlap() report maxPairs uninitialized pairs.
+	int numPairs = pairCount.at(0);
+	if (numPairs > maxPairs)
+	{
+		b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs);
+		numPairs = maxPairs;
+	}
+
+	m_gpuPairs.resize(numPairs);
+}
+//Brute-force host version: copies the AABBs back from the GPU, tests every pair (i, j) with i < j
+//and uploads the overlapping pairs to m_gpuPairs.
+//
+//maxPairs: at most this many pairs are stored; further overlaps are silently ignored.
+//Each pair holds the two m_minIndices[3] values (smaller one in x, larger one in y);
+//z and w of the pair are not written.
+void b3GpuGridBroadphase::calculateOverlappingPairsHost(int maxPairs)
+{
+	m_hostPairs.resize(0);
+	m_allAabbsGPU1.copyToHost(m_allAabbsCPU1);
+
+	for (int first = 0; first < m_allAabbsCPU1.size(); first++)
+	{
+		const b3SapAabb& aabbA = m_allAabbsCPU1[first];
+
+		for (int second = first + 1; second < m_allAabbsCPU1.size(); second++)
+		{
+			const b3SapAabb& aabbB = m_allAabbsCPU1[second];
+
+			if (!b3TestAabbAgainstAabb2(aabbA.m_minVec, aabbA.m_maxVec,
+										aabbB.m_minVec, aabbB.m_maxVec))
+			{
+				continue;
+			}
+
+			//pair buffer is full: keep scanning but store nothing more
+			if (m_hostPairs.size() >= maxPairs)
+			{
+				continue;
+			}
+
+			//store the original index in the unsorted aabb array, smaller index first
+			const int idA = aabbA.m_minIndices[3];
+			const int idB = aabbB.m_minIndices[3];
+			const bool secondIsSmaller = (idB <= idA);
+
+			b3Int4 pair;
+			pair.x = secondIsSmaller ? idB : idA;
+			pair.y = secondIsSmaller ? idA : idB;
+			m_hostPairs.push_back(pair);
+		}
+	}
+
+	m_gpuPairs.copyFromHost(m_hostPairs);
+}
+
+//Uploads the host-side AABB array and both index mappings to their GPU arrays; call writeAabbsToGpu after done making all changes (createProxy etc)
+void b3GpuGridBroadphase::writeAabbsToGpu()
+{
+	m_allAabbsGPU1.copyFromHost(m_allAabbsCPU1);                  //all AABBs, small and large
+	m_smallAabbsMappingGPU.copyFromHost(m_smallAabbsMappingCPU);  //indices (into the AABB array) added by createProxy
+	m_largeAabbsMappingGPU.copyFromHost(m_largeAabbsMappingCPU);  //indices (into the AABB array) added by createLargeProxy
+}
+
+//Returns the OpenCL buffer behind the GPU array that holds all AABBs.
+cl_mem b3GpuGridBroadphase::getAabbBufferWS()
+{
+	return m_allAabbsGPU1.getBufferCL();
+}
+//Returns the current number of elements in the GPU pair array.
+int b3GpuGridBroadphase::getNumOverlap()
+{
+	return this->m_gpuPairs.size();
+}
+//Returns the OpenCL buffer behind the GPU pair array.
+cl_mem b3GpuGridBroadphase::getOverlappingPairBuffer()
+{
+	return this->m_gpuPairs.getBufferCL();
+}
+
+//Gives direct (mutable) access to the GPU array that holds all AABBs.
+b3OpenCLArray<b3SapAabb>& b3GpuGridBroadphase::getAllAabbsGPU()
+{
+	return this->m_allAabbsGPU1;
+}
+
+//Gives direct (mutable) access to the host-side array that holds all AABBs.
+b3AlignedObjectArray<b3SapAabb>& b3GpuGridBroadphase::getAllAabbsCPU()
+{
+	return this->m_allAabbsCPU1;
+}
+
+//Gives direct (mutable) access to the GPU pair array.
+b3OpenCLArray<b3Int4>& b3GpuGridBroadphase::getOverlappingPairsGPU()
+{
+	return this->m_gpuPairs;
+}
+//Gives direct (mutable) access to the GPU copy of the small-AABB index mapping.
+b3OpenCLArray<int>& b3GpuGridBroadphase::getSmallAabbIndicesGPU()
+{
+	return this->m_smallAabbsMappingGPU;
+}
+//Gives direct (mutable) access to the GPU copy of the large-AABB index mapping.
+b3OpenCLArray<int>& b3GpuGridBroadphase::getLargeAabbIndicesGPU()
+{
+	return this->m_largeAabbsMappingGPU;
+}

+ 80 - 0
Dependencies/include/bullet3/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.h

@@ -0,0 +1,80 @@
+#ifndef B3_GPU_GRID_BROADPHASE_H
+#define B3_GPU_GRID_BROADPHASE_H
+
+#include "b3GpuBroadphaseInterface.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h"
+
+//Parameter block of the grid broadphase. The member order and sizes are kept as they are because
+//the struct is also stored in a GPU array (b3OpenCLArray<b3ParamsGridBroadphaseCL>).
+struct b3ParamsGridBroadphaseCL
+{
+	float m_invCellSize[4];
+	int m_gridSize[4];  //the 4th entry doubles as the "max bodies per cell" value, see below
+
+	//Reads the per-cell body limit from the 4th grid-size entry.
+	int getMaxBodiesPerCell() const { return m_gridSize[3]; }
+
+	//Writes the per-cell body limit into the 4th grid-size entry.
+	void setMaxBodiesPerCell(int maxOverlap) { m_gridSize[3] = maxOverlap; }
+};
+
+class b3GpuGridBroadphase : public b3GpuBroadphaseInterface  //grid-based implementation of the GPU broadphase interface
+{
+protected:
+	cl_context m_context;      //OpenCL handles passed to the constructor
+	cl_device_id m_device;
+	cl_command_queue m_queue;
+
+	b3OpenCLArray<b3SapAabb> m_allAabbsGPU1;         //GPU copy of m_allAabbsCPU1
+	b3AlignedObjectArray<b3SapAabb> m_allAabbsCPU1;  //all AABBs (small and large) in creation order
+
+	b3OpenCLArray<int> m_smallAabbsMappingGPU;         //GPU copy of m_smallAabbsMappingCPU
+	b3AlignedObjectArray<int> m_smallAabbsMappingCPU;  //indices into the AABB array of the small AABBs
+
+	b3OpenCLArray<int> m_largeAabbsMappingGPU;         //GPU copy of m_largeAabbsMappingCPU
+	b3AlignedObjectArray<int> m_largeAabbsMappingCPU;  //indices into the AABB array of the large AABBs
+
+	b3AlignedObjectArray<b3Int4> m_hostPairs;  //pairs computed by calculateOverlappingPairsHost
+	b3OpenCLArray<b3Int4> m_gpuPairs;          //resulting overlapping pairs
+
+	b3OpenCLArray<b3SortData> m_hashGpu;  //one hash entry per small AABB, sorted by m_sorter
+	b3OpenCLArray<int> m_cellStartGpu;    //one entry per grid cell
+
+	b3ParamsGridBroadphaseCL m_paramsCPU;                 //grid parameters, filled in by the constructor
+	b3OpenCLArray<b3ParamsGridBroadphaseCL> m_paramsGPU;  //GPU copy of m_paramsCPU
+
+	class b3RadixSort32CL* m_sorter;  //owned: created in the constructor, deleted in the destructor
+
+public:
+	b3GpuGridBroadphase(cl_context ctx, cl_device_id device, cl_command_queue q);
+	virtual ~b3GpuGridBroadphase();
+
+	static b3GpuBroadphaseInterface* CreateFunc(cl_context ctx, cl_device_id device, cl_command_queue q)  //factory matching b3GpuBroadphaseInterface::CreateFunc
+	{
+		return new b3GpuGridBroadphase(ctx, device, q);  //ownership of the new object passes to the caller
+	}
+
+	virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask);
+	virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask);
+
+	virtual void calculateOverlappingPairs(int maxPairs);
+	virtual void calculateOverlappingPairsHost(int maxPairs);
+
+	//call writeAabbsToGpu after done making all changes (createProxy etc)
+	virtual void writeAabbsToGpu();
+
+	virtual cl_mem getAabbBufferWS();
+	virtual int getNumOverlap();
+	virtual cl_mem getOverlappingPairBuffer();
+
+	virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU();
+	virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU();
+
+	virtual b3OpenCLArray<b3Int4>& getOverlappingPairsGPU();
+	virtual b3OpenCLArray<int>& getSmallAabbIndicesGPU();
+	virtual b3OpenCLArray<int>& getLargeAabbIndicesGPU();
+};
+
+#endif  //B3_GPU_GRID_BROADPHASE_H

+ 557 - 0
Dependencies/include/bullet3/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.cpp

@@ -0,0 +1,557 @@
+/*
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Initial Author Jackson Lee, 2014
+
+#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
+
+#include "b3GpuParallelLinearBvh.h"
+
+//Binds all GPU arrays to the given context/queue, sizes the single-element GPU scalars and
+//compiles the parallel linear BVH program together with every kernel used by this class.
+//Each compiled handle is checked with b3Assert.
+b3GpuParallelLinearBvh::b3GpuParallelLinearBvh(cl_context context, cl_device_id device, cl_command_queue queue)
+	: m_queue(queue),
+	  m_radixSorter(context, device, queue),
+
+	  m_rootNodeIndex(context, queue),
+	  m_maxDistanceFromRoot(context, queue),
+	  m_temp(context, queue),
+
+	  m_internalNodeAabbs(context, queue),
+	  m_internalNodeLeafIndexRanges(context, queue),
+	  m_internalNodeChildNodes(context, queue),
+	  m_internalNodeParentNodes(context, queue),
+
+	  m_commonPrefixes(context, queue),
+	  m_commonPrefixLengths(context, queue),
+	  m_distanceFromRoot(context, queue),
+
+	  m_leafNodeParentNodes(context, queue),
+	  m_mortonCodesAndAabbIndicies(context, queue),
+	  m_mergedAabb(context, queue),
+	  m_leafNodeAabbs(context, queue),
+
+	  m_largeAabbs(context, queue)
+{
+	//GPU-side scalars: one element each
+	m_rootNodeIndex.resize(1);
+	m_maxDistanceFromRoot.resize(1);
+	m_temp.resize(1);
+
+	//Program source and compile settings
+	const char programPath[] = "src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl";
+	const char* source = parallelLinearBvhCL;  //parallelLinearBvhCL.h
+	char* macros = 0;                          //no additional macros
+	cl_int status;
+
+	m_parallelLinearBvhProgram = b3OpenCLUtils::compileCLProgramFromString(context, device, source, &status, macros, programPath);
+	b3Assert(m_parallelLinearBvhProgram);
+	cl_program program = m_parallelLinearBvhProgram;
+
+	//Kernels used at the start of build()
+	m_separateAabbsKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, source, "separateAabbs", &status, program, macros);
+	b3Assert(m_separateAabbsKernel);
+	m_findAllNodesMergedAabbKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, source, "findAllNodesMergedAabb", &status, program, macros);
+	b3Assert(m_findAllNodesMergedAabbKernel);
+	m_assignMortonCodesAndAabbIndiciesKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, source, "assignMortonCodesAndAabbIndicies", &status, program, macros);
+	b3Assert(m_assignMortonCodesAndAabbIndiciesKernel);
+
+	//Binary radix tree kernels
+	m_computeAdjacentPairCommonPrefixKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, source, "computeAdjacentPairCommonPrefix", &status, program, macros);
+	b3Assert(m_computeAdjacentPairCommonPrefixKernel);
+	m_buildBinaryRadixTreeLeafNodesKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, source, "buildBinaryRadixTreeLeafNodes", &status, program, macros);
+	b3Assert(m_buildBinaryRadixTreeLeafNodesKernel);
+	m_buildBinaryRadixTreeInternalNodesKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, source, "buildBinaryRadixTreeInternalNodes", &status, program, macros);
+	b3Assert(m_buildBinaryRadixTreeInternalNodesKernel);
+	m_findDistanceFromRootKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, source, "findDistanceFromRoot", &status, program, macros);
+	b3Assert(m_findDistanceFromRootKernel);
+	m_buildBinaryRadixTreeAabbsRecursiveKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, source, "buildBinaryRadixTreeAabbsRecursive", &status, program, macros);
+	b3Assert(m_buildBinaryRadixTreeAabbsRecursiveKernel);
+
+	//Leaf index range kernel (launched at the end of build())
+	m_findLeafIndexRangesKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, source, "findLeafIndexRanges", &status, program, macros);
+	b3Assert(m_findLeafIndexRangesKernel);
+
+	//Query kernels: AABB overlap and ray tests
+	m_plbvhCalculateOverlappingPairsKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, source, "plbvhCalculateOverlappingPairs", &status, program, macros);
+	b3Assert(m_plbvhCalculateOverlappingPairsKernel);
+	m_plbvhRayTraverseKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, source, "plbvhRayTraverse", &status, program, macros);
+	b3Assert(m_plbvhRayTraverseKernel);
+	m_plbvhLargeAabbAabbTestKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, source, "plbvhLargeAabbAabbTest", &status, program, macros);
+	b3Assert(m_plbvhLargeAabbAabbTestKernel);
+	m_plbvhLargeAabbRayTestKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, source, "plbvhLargeAabbRayTest", &status, program, macros);
+	b3Assert(m_plbvhLargeAabbRayTestKernel);
+}
+
+//Releases every kernel compiled by the constructor, then the program they were created from.
+b3GpuParallelLinearBvh::~b3GpuParallelLinearBvh()
+{
+	//released in the order listed
+	cl_kernel kernels[] = {
+		m_separateAabbsKernel,
+		m_findAllNodesMergedAabbKernel,
+		m_assignMortonCodesAndAabbIndiciesKernel,
+
+		m_computeAdjacentPairCommonPrefixKernel,
+		m_buildBinaryRadixTreeLeafNodesKernel,
+		m_buildBinaryRadixTreeInternalNodesKernel,
+		m_findDistanceFromRootKernel,
+		m_buildBinaryRadixTreeAabbsRecursiveKernel,
+
+		m_findLeafIndexRangesKernel,
+
+		m_plbvhCalculateOverlappingPairsKernel,
+		m_plbvhRayTraverseKernel,
+		m_plbvhLargeAabbAabbTestKernel,
+		m_plbvhLargeAabbRayTestKernel};
+	const int numKernels = sizeof(kernels) / sizeof(kernels[0]);
+	for (int k = 0; k < numKernels; k++)
+	{
+		clReleaseKernel(kernels[k]);
+	}
+
+	clReleaseProgram(m_parallelLinearBvhProgram);
+}
+
+void b3GpuParallelLinearBvh::build(const b3OpenCLArray<b3SapAabb>& worldSpaceAabbs, const b3OpenCLArray<int>& smallAabbIndices,
+								   const b3OpenCLArray<int>& largeAabbIndices)  //rebuilds the BVH over the small AABBs; the large AABBs are only copied into m_largeAabbs
+{
+	B3_PROFILE("b3ParallelLinearBvh::build()");
+
+	int numLargeAabbs = largeAabbIndices.size();
+	int numSmallAabbs = smallAabbIndices.size();
+
+	//Since all AABBs(both large and small) are input as a contiguous array,
+	//with 2 additional arrays used to indicate the indices of large and small AABBs,
+	//it is necessary to separate the AABBs so that the large AABBs will not degrade the quality of the BVH.
+	{
+		B3_PROFILE("Separate large and small AABBs");
+
+		m_largeAabbs.resize(numLargeAabbs);
+		m_leafNodeAabbs.resize(numSmallAabbs);
+
+		//Write large AABBs into m_largeAabbs
+		{
+			b3BufferInfoCL bufferInfo[] =
+				{
+					b3BufferInfoCL(worldSpaceAabbs.getBufferCL()),
+					b3BufferInfoCL(largeAabbIndices.getBufferCL()),
+
+					b3BufferInfoCL(m_largeAabbs.getBufferCL())};
+
+			b3LauncherCL launcher(m_queue, m_separateAabbsKernel, "m_separateAabbsKernel");
+			launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
+			launcher.setConst(numLargeAabbs);
+
+			launcher.launch1D(numLargeAabbs);
+		}
+
+		//Write small AABBs into m_leafNodeAabbs
+		{
+			b3BufferInfoCL bufferInfo[] =
+				{
+					b3BufferInfoCL(worldSpaceAabbs.getBufferCL()),
+					b3BufferInfoCL(smallAabbIndices.getBufferCL()),
+
+					b3BufferInfoCL(m_leafNodeAabbs.getBufferCL())};
+
+			b3LauncherCL launcher(m_queue, m_separateAabbsKernel, "m_separateAabbsKernel");
+			launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
+			launcher.setConst(numSmallAabbs);
+
+			launcher.launch1D(numSmallAabbs);
+		}
+
+		clFinish(m_queue);
+	}
+
+	//Node counts of the tree; 0 or 1 leaves are handled below without constructing a tree
+	int numLeaves = numSmallAabbs;  //Number of leaves in the BVH == Number of rigid bodies with small AABBs
+	int numInternalNodes = numLeaves - 1;
+
+	if (numLeaves < 2)
+	{
+		//Number of leaf nodes is checked in calculateOverlappingPairs() and testRaysAgainstBvhAabbs(),
+		//so it does not matter if numLeaves == 0 and rootNodeIndex == -1
+		int rootNodeIndex = numLeaves - 1;
+		m_rootNodeIndex.copyFromHostPointer(&rootNodeIndex, 1);
+
+		//Since the AABBs need to be rearranged(sorted) for the BVH construction algorithm,
+		//m_mortonCodesAndAabbIndicies.m_value is used to map a sorted AABB index to the unsorted AABB index
+		//instead of directly moving the AABBs. It needs to be set for the ray cast traversal kernel to work.
+		//( m_mortonCodesAndAabbIndicies[].m_value == unsorted index == index of m_leafNodeAabbs )
+		if (numLeaves == 1)
+		{
+			b3SortData leaf;
+			leaf.m_value = 0;  //1 leaf so index is always 0; leaf.m_key does not need to be set
+
+			m_mortonCodesAndAabbIndicies.resize(1);
+			m_mortonCodesAndAabbIndicies.copyFromHostPointer(&leaf, 1);
+		}
+
+		return;  //the per-node arrays below are left untouched in this case
+	}
+
+	//Size the per-internal-node and per-leaf GPU arrays for this build
+	{
+		m_internalNodeAabbs.resize(numInternalNodes);
+		m_internalNodeLeafIndexRanges.resize(numInternalNodes);
+		m_internalNodeChildNodes.resize(numInternalNodes);
+		m_internalNodeParentNodes.resize(numInternalNodes);
+
+		m_commonPrefixes.resize(numInternalNodes);
+		m_commonPrefixLengths.resize(numInternalNodes);
+		m_distanceFromRoot.resize(numInternalNodes);
+
+		m_leafNodeParentNodes.resize(numLeaves);
+		m_mortonCodesAndAabbIndicies.resize(numLeaves);
+		m_mergedAabb.resize(numLeaves);
+	}
+
+	//Find the merged AABB of all small AABBs; this is used to define the size of
+	//each cell in the virtual grid for the next kernel(2^10 cells in each dimension).
+	{
+		B3_PROFILE("Find AABB of merged nodes");
+
+		m_mergedAabb.copyFromOpenCLArray(m_leafNodeAabbs);  //Need to make a copy since the kernel modifies the array
+
+		for (int numAabbsNeedingMerge = numLeaves; numAabbsNeedingMerge >= 2;
+			 numAabbsNeedingMerge = numAabbsNeedingMerge / 2 + numAabbsNeedingMerge % 2)  //the count is halved (rounded up) after every launch until a single AABB is left
+		{
+			b3BufferInfoCL bufferInfo[] =
+				{
+					b3BufferInfoCL(m_mergedAabb.getBufferCL())  //Resulting AABB is stored in m_mergedAabb[0]
+				};
+
+			b3LauncherCL launcher(m_queue, m_findAllNodesMergedAabbKernel, "m_findAllNodesMergedAabbKernel");
+			launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
+			launcher.setConst(numAabbsNeedingMerge);
+
+			launcher.launch1D(numAabbsNeedingMerge);
+		}
+
+		clFinish(m_queue);
+	}
+
+	//Insert the center of the AABBs into a virtual grid,
+	//then convert the discrete grid coordinates into a morton code
+	//For each element in m_mortonCodesAndAabbIndicies, set
+	//	m_key == morton code (value to sort by)
+	//	m_value == small AABB index
+	{
+		B3_PROFILE("Assign morton codes");
+
+		b3BufferInfoCL bufferInfo[] =
+			{
+				b3BufferInfoCL(m_leafNodeAabbs.getBufferCL()),
+				b3BufferInfoCL(m_mergedAabb.getBufferCL()),
+				b3BufferInfoCL(m_mortonCodesAndAabbIndicies.getBufferCL())};
+
+		b3LauncherCL launcher(m_queue, m_assignMortonCodesAndAabbIndiciesKernel, "m_assignMortonCodesAndAabbIndiciesKernel");
+		launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
+		launcher.setConst(numLeaves);
+
+		launcher.launch1D(numLeaves);
+		clFinish(m_queue);
+	}
+
+	//Run the radix sorter on the (morton code, AABB index) pairs
+	{
+		B3_PROFILE("Sort leaves by morton codes");
+
+		m_radixSorter.execute(m_mortonCodesAndAabbIndicies);
+		clFinish(m_queue);
+	}
+
+	//Build the tree from the sorted leaves
+	constructBinaryRadixTree();
+
+	//Since it is a sorted binary radix tree, each internal node contains a contiguous subset of leaf node indices.
+	//The root node contains leaf node indices in the range [0, numLeafNodes - 1].
+	//The child nodes of each node split their parent's index range into 2 contiguous halves.
+	//
+	//For example, if the root has indices [0, 31], its children might partition that range into [0, 11] and [12, 31].
+	//The next level in the tree could then split those ranges into [0, 2], [3, 11], [12, 22], and [23, 31].
+	//
+	//This property can be used for optimizing calculateOverlappingPairs(), to avoid testing each AABB pair twice
+	{
+		B3_PROFILE("m_findLeafIndexRangesKernel");
+
+		b3BufferInfoCL bufferInfo[] =
+			{
+				b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL()),
+				b3BufferInfoCL(m_internalNodeLeafIndexRanges.getBufferCL())};
+
+		b3LauncherCL launcher(m_queue, m_findLeafIndexRangesKernel, "m_findLeafIndexRangesKernel");
+		launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
+		launcher.setConst(numInternalNodes);
+
+		launcher.launch1D(numInternalNodes);
+		clFinish(m_queue);
+	}
+}
+
+void b3GpuParallelLinearBvh::calculateOverlappingPairs(b3OpenCLArray<b3Int4>& out_overlappingPairs)  //writes the overlapping pairs into out_overlappingPairs; its size on entry is the pair capacity, on return the number of pairs
+{
+	int maxPairs = out_overlappingPairs.size();
+	b3OpenCLArray<int>& numPairsGpu = m_temp;  //m_temp (one element, sized in the constructor) serves as the GPU pair counter
+
+	int reset = 0;
+	numPairsGpu.copyFromHostPointer(&reset, 1);
+
+	//Small vs small AABBs; only run when there are at least 2 leaves
+	if (m_leafNodeAabbs.size() > 1)
+	{
+		B3_PROFILE("PLBVH small-small AABB test");
+
+		int numQueryAabbs = m_leafNodeAabbs.size();
+
+		b3BufferInfoCL bufferInfo[] =
+			{
+				b3BufferInfoCL(m_leafNodeAabbs.getBufferCL()),
+
+				b3BufferInfoCL(m_rootNodeIndex.getBufferCL()),
+				b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL()),
+				b3BufferInfoCL(m_internalNodeAabbs.getBufferCL()),
+				b3BufferInfoCL(m_internalNodeLeafIndexRanges.getBufferCL()),
+				b3BufferInfoCL(m_mortonCodesAndAabbIndicies.getBufferCL()),
+
+				b3BufferInfoCL(numPairsGpu.getBufferCL()),
+				b3BufferInfoCL(out_overlappingPairs.getBufferCL())};
+
+		b3LauncherCL launcher(m_queue, m_plbvhCalculateOverlappingPairsKernel, "m_plbvhCalculateOverlappingPairsKernel");
+		launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
+		launcher.setConst(maxPairs);
+		launcher.setConst(numQueryAabbs);
+
+		launcher.launch1D(numQueryAabbs);
+		clFinish(m_queue);
+	}
+
+	int numLargeAabbRigids = m_largeAabbs.size();
+	if (numLargeAabbRigids > 0 && m_leafNodeAabbs.size() > 0)  //large vs small AABBs; needs at least one AABB of each kind
+	{
+		B3_PROFILE("PLBVH large-small AABB test");
+
+		int numQueryAabbs = m_leafNodeAabbs.size();
+
+		b3BufferInfoCL bufferInfo[] =
+			{
+				b3BufferInfoCL(m_leafNodeAabbs.getBufferCL()),
+				b3BufferInfoCL(m_largeAabbs.getBufferCL()),
+
+				b3BufferInfoCL(numPairsGpu.getBufferCL()),
+				b3BufferInfoCL(out_overlappingPairs.getBufferCL())};
+
+		b3LauncherCL launcher(m_queue, m_plbvhLargeAabbAabbTestKernel, "m_plbvhLargeAabbAabbTestKernel");
+		launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
+		launcher.setConst(maxPairs);
+		launcher.setConst(numLargeAabbRigids);
+		launcher.setConst(numQueryAabbs);
+
+		launcher.launch1D(numQueryAabbs);
+		clFinish(m_queue);
+	}
+
+	//Read back the pair counter; on overflow report it and clamp the count on the host and on the GPU
+	int numPairs = -1;
+	numPairsGpu.copyToHostPointer(&numPairs, 1);
+	if (numPairs > maxPairs)
+	{
+		b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs);
+		numPairs = maxPairs;
+		numPairsGpu.copyFromHostPointer(&maxPairs, 1);
+	}
+
+	out_overlappingPairs.resize(numPairs);  //shrink the output to the number of pairs found
+}
+
+void b3GpuParallelLinearBvh::testRaysAgainstBvhAabbs(const b3OpenCLArray<b3RayInfo>& rays,
+													 b3OpenCLArray<int>& out_numRayRigidPairs, b3OpenCLArray<b3Int2>& out_rayRigidPairs)  //out_numRayRigidPairs: GPU counter, reset to 0 here; out_rayRigidPairs: its size on entry is the pair capacity
+{
+	B3_PROFILE("PLBVH testRaysAgainstBvhAabbs()");
+
+	int numRays = rays.size();
+	int maxRayRigidPairs = out_rayRigidPairs.size();
+
+	int reset = 0;
+	out_numRayRigidPairs.copyFromHostPointer(&reset, 1);
+
+	//Rays vs small AABBs (one work item per ray); skipped when there are no leaves
+	if (m_leafNodeAabbs.size() > 0)
+	{
+		B3_PROFILE("PLBVH ray test small AABB");
+
+		b3BufferInfoCL bufferInfo[] =
+			{
+				b3BufferInfoCL(m_leafNodeAabbs.getBufferCL()),
+
+				b3BufferInfoCL(m_rootNodeIndex.getBufferCL()),
+				b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL()),
+				b3BufferInfoCL(m_internalNodeAabbs.getBufferCL()),
+				b3BufferInfoCL(m_internalNodeLeafIndexRanges.getBufferCL()),
+				b3BufferInfoCL(m_mortonCodesAndAabbIndicies.getBufferCL()),
+
+				b3BufferInfoCL(rays.getBufferCL()),
+
+				b3BufferInfoCL(out_numRayRigidPairs.getBufferCL()),
+				b3BufferInfoCL(out_rayRigidPairs.getBufferCL())};
+
+		b3LauncherCL launcher(m_queue, m_plbvhRayTraverseKernel, "m_plbvhRayTraverseKernel");
+		launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
+		launcher.setConst(maxRayRigidPairs);
+		launcher.setConst(numRays);
+
+		launcher.launch1D(numRays);
+		clFinish(m_queue);
+	}
+
+	int numLargeAabbRigids = m_largeAabbs.size();
+	if (numLargeAabbRigids > 0)  //rays vs large AABBs; writes to the same counter and pair buffer as the stage above
+	{
+		B3_PROFILE("PLBVH ray test large AABB");
+
+		b3BufferInfoCL bufferInfo[] =
+			{
+				b3BufferInfoCL(m_largeAabbs.getBufferCL()),
+				b3BufferInfoCL(rays.getBufferCL()),
+
+				b3BufferInfoCL(out_numRayRigidPairs.getBufferCL()),
+				b3BufferInfoCL(out_rayRigidPairs.getBufferCL())};
+
+		b3LauncherCL launcher(m_queue, m_plbvhLargeAabbRayTestKernel, "m_plbvhLargeAabbRayTestKernel");
+		launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
+		launcher.setConst(numLargeAabbRigids);
+		launcher.setConst(maxRayRigidPairs);
+		launcher.setConst(numRays);
+
+		launcher.launch1D(numRays);
+		clFinish(m_queue);
+	}
+
+	//Read back the pair counter; an overflow is only reported here - neither the counter nor out_rayRigidPairs is clamped or resized
+	int numRayRigidPairs = -1;
+	out_numRayRigidPairs.copyToHostPointer(&numRayRigidPairs, 1);
+
+	if (numRayRigidPairs > maxRayRigidPairs)
+		b3Error("Error running out of rayRigid pairs: numRayRigidPairs = %d, maxRayRigidPairs = %d.\n", numRayRigidPairs, maxRayRigidPairs);
+}
+
+void b3GpuParallelLinearBvh::constructBinaryRadixTree()
+{
+	B3_PROFILE("b3GpuParallelLinearBvh::constructBinaryRadixTree()");
+	//Runs five kernel stages in order: adjacent-pair common prefixes, leaf parent assignment, internal node linking, distance from root, and level-by-level AABB merging.
+	int numLeaves = m_leafNodeAabbs.size();
+	int numInternalNodes = numLeaves - 1;  //-1 with no leaves, 0 with one leaf - NOTE(review): presumably callers only get here with at least 2 leaves; confirm
+
+	//Each internal node is placed in between 2 leaf nodes.
+	//By using this arrangement and computing the common prefix between
+	//these 2 adjacent leaf nodes, it is possible to quickly construct a binary radix tree.
+	{
+		B3_PROFILE("m_computeAdjacentPairCommonPrefixKernel");
+
+		b3BufferInfoCL bufferInfo[] =
+			{
+				b3BufferInfoCL(m_mortonCodesAndAabbIndicies.getBufferCL()),
+				b3BufferInfoCL(m_commonPrefixes.getBufferCL()),
+				b3BufferInfoCL(m_commonPrefixLengths.getBufferCL())};
+
+		b3LauncherCL launcher(m_queue, m_computeAdjacentPairCommonPrefixKernel, "m_computeAdjacentPairCommonPrefixKernel");
+		launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
+		launcher.setConst(numInternalNodes);
+
+		launcher.launch1D(numInternalNodes);
+		clFinish(m_queue);  //wait for this stage to complete; the next stage is given m_commonPrefixLengths
+	}
+
+	//For each leaf node, select its parent node by
+	//comparing the 2 nearest internal nodes and assign child node indices
+	{
+		B3_PROFILE("m_buildBinaryRadixTreeLeafNodesKernel");
+
+		b3BufferInfoCL bufferInfo[] =
+			{
+				b3BufferInfoCL(m_commonPrefixLengths.getBufferCL()),
+				b3BufferInfoCL(m_leafNodeParentNodes.getBufferCL()),
+				b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL())};
+
+		b3LauncherCL launcher(m_queue, m_buildBinaryRadixTreeLeafNodesKernel, "m_buildBinaryRadixTreeLeafNodesKernel");
+		launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
+		launcher.setConst(numLeaves);
+
+		launcher.launch1D(numLeaves);  //this stage is sized by the leaf count; the other stages use numInternalNodes
+		clFinish(m_queue);
+	}
+
+	//For each internal node, perform 2 binary searches among the other internal nodes
+	//to its left and right to find its potential parent nodes and assign child node indices
+	{
+		B3_PROFILE("m_buildBinaryRadixTreeInternalNodesKernel");
+
+		b3BufferInfoCL bufferInfo[] =
+			{
+				b3BufferInfoCL(m_commonPrefixes.getBufferCL()),
+				b3BufferInfoCL(m_commonPrefixLengths.getBufferCL()),
+				b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL()),
+				b3BufferInfoCL(m_internalNodeParentNodes.getBufferCL()),
+				b3BufferInfoCL(m_rootNodeIndex.getBufferCL())};
+
+		b3LauncherCL launcher(m_queue, m_buildBinaryRadixTreeInternalNodesKernel, "m_buildBinaryRadixTreeInternalNodesKernel");
+		launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
+		launcher.setConst(numInternalNodes);
+
+		launcher.launch1D(numInternalNodes);
+		clFinish(m_queue);
+	}
+
+	//Find the number of nodes separating each internal node and the root node
+	//so that the AABBs can be set using the next kernel.
+	//Also determine the maximum number of nodes separating an internal node and the root node.
+	{
+		B3_PROFILE("m_findDistanceFromRootKernel");
+
+		b3BufferInfoCL bufferInfo[] =
+			{
+				b3BufferInfoCL(m_rootNodeIndex.getBufferCL()),
+				b3BufferInfoCL(m_internalNodeParentNodes.getBufferCL()),
+				b3BufferInfoCL(m_maxDistanceFromRoot.getBufferCL()),
+				b3BufferInfoCL(m_distanceFromRoot.getBufferCL())};
+
+		b3LauncherCL launcher(m_queue, m_findDistanceFromRootKernel, "m_findDistanceFromRootKernel");
+		launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
+		launcher.setConst(numInternalNodes);
+
+		launcher.launch1D(numInternalNodes);
+		clFinish(m_queue);
+	}
+
+	//Starting from the internal nodes nearest to the leaf nodes, recursively move up
+	//the tree towards the root to set the AABBs of each internal node; each internal node
+	//checks its children and merges their AABBs
+	{
+		B3_PROFILE("m_buildBinaryRadixTreeAabbsRecursiveKernel");
+
+		int maxDistanceFromRoot = -1;  //if the value read back is still negative, the loop below runs zero times
+		{
+			B3_PROFILE("copy maxDistanceFromRoot to CPU");
+			m_maxDistanceFromRoot.copyToHostPointer(&maxDistanceFromRoot, 1);
+			clFinish(m_queue);
+		}
+
+		for (int distanceFromRoot = maxDistanceFromRoot; distanceFromRoot >= 0; --distanceFromRoot)  //one launch per level: largest distance first, distance 0 last
+		{
+			b3BufferInfoCL bufferInfo[] =
+				{
+					b3BufferInfoCL(m_distanceFromRoot.getBufferCL()),
+					b3BufferInfoCL(m_mortonCodesAndAabbIndicies.getBufferCL()),
+					b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL()),
+					b3BufferInfoCL(m_leafNodeAabbs.getBufferCL()),
+					b3BufferInfoCL(m_internalNodeAabbs.getBufferCL())};
+
+			b3LauncherCL launcher(m_queue, m_buildBinaryRadixTreeAabbsRecursiveKernel, "m_buildBinaryRadixTreeAabbsRecursiveKernel");
+			launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
+			launcher.setConst(maxDistanceFromRoot);
+			launcher.setConst(distanceFromRoot);
+			launcher.setConst(numInternalNodes);
+
+			//It may seem inefficient to launch a thread for each internal node when a
+			//much smaller number of nodes is actually processed, but this is actually
+			//faster than determining the exact nodes that are ready to merge their child AABBs.
+			launcher.launch1D(numInternalNodes);
+		}
+		//No clFinish inside the loop: the per-level launches are waited on once here - NOTE(review): presumably relies on the queue executing them in order; confirm.
+		clFinish(m_queue);
+	}
+}

部分文件因为文件数量过多而无法显示