Mat44.inl

// Jolt Physics Library (https://github.com/jrouwe/JoltPhysics)
// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT

#pragma once

#include <Jolt/Math/Vec3.h>
#include <Jolt/Math/Vec4.h>
#include <Jolt/Math/Quat.h>

JPH_NAMESPACE_BEGIN

#define JPH_EL(r, c) mCol[c].mF32[r]

Mat44::Mat44(Vec4Arg inC1, Vec4Arg inC2, Vec4Arg inC3, Vec4Arg inC4) :
    mCol { inC1, inC2, inC3, inC4 }
{
}

Mat44::Mat44(Vec4Arg inC1, Vec4Arg inC2, Vec4Arg inC3, Vec3Arg inC4) :
    mCol { inC1, inC2, inC3, Vec4(inC4, 1.0f) }
{
}

Mat44::Mat44(Type inC1, Type inC2, Type inC3, Type inC4) :
    mCol { inC1, inC2, inC3, inC4 }
{
}

Mat44 Mat44::sZero()
{
    return Mat44(Vec4::sZero(), Vec4::sZero(), Vec4::sZero(), Vec4::sZero());
}

Mat44 Mat44::sIdentity()
{
    return Mat44(Vec4(1, 0, 0, 0), Vec4(0, 1, 0, 0), Vec4(0, 0, 1, 0), Vec4(0, 0, 0, 1));
}

Mat44 Mat44::sNaN()
{
    return Mat44(Vec4::sNaN(), Vec4::sNaN(), Vec4::sNaN(), Vec4::sNaN());
}

Mat44 Mat44::sLoadFloat4x4(const Float4 *inV)
{
    Mat44 result;
    for (int c = 0; c < 4; ++c)
        result.mCol[c] = Vec4::sLoadFloat4(inV + c);
    return result;
}

Mat44 Mat44::sLoadFloat4x4Aligned(const Float4 *inV)
{
    Mat44 result;
    for (int c = 0; c < 4; ++c)
        result.mCol[c] = Vec4::sLoadFloat4Aligned(inV + c);
    return result;
}

Mat44 Mat44::sRotationX(float inX)
{
    Vec4 sv, cv;
    Vec4::sReplicate(inX).SinCos(sv, cv);
    float s = sv.GetX(), c = cv.GetX();
    return Mat44(Vec4(1, 0, 0, 0), Vec4(0, c, s, 0), Vec4(0, -s, c, 0), Vec4(0, 0, 0, 1));
}

Mat44 Mat44::sRotationY(float inY)
{
    Vec4 sv, cv;
    Vec4::sReplicate(inY).SinCos(sv, cv);
    float s = sv.GetX(), c = cv.GetX();
    return Mat44(Vec4(c, 0, -s, 0), Vec4(0, 1, 0, 0), Vec4(s, 0, c, 0), Vec4(0, 0, 0, 1));
}

Mat44 Mat44::sRotationZ(float inZ)
{
    Vec4 sv, cv;
    Vec4::sReplicate(inZ).SinCos(sv, cv);
    float s = sv.GetX(), c = cv.GetX();
    return Mat44(Vec4(c, s, 0, 0), Vec4(-s, c, 0, 0), Vec4(0, 0, 1, 0), Vec4(0, 0, 0, 1));
}

Mat44 Mat44::sRotation(QuatArg inQuat)
{
    JPH_ASSERT(inQuat.IsNormalized());

    // See: https://en.wikipedia.org/wiki/Quaternions_and_spatial_rotation section 'Quaternion-derived rotation matrix'
#ifdef JPH_USE_SSE4_1
    __m128 xyzw = inQuat.mValue.mValue;
    __m128 two_xyzw = _mm_add_ps(xyzw, xyzw);
    __m128 yzxw = _mm_shuffle_ps(xyzw, xyzw, _MM_SHUFFLE(3, 0, 2, 1));
    __m128 two_yzxw = _mm_add_ps(yzxw, yzxw);
    __m128 zxyw = _mm_shuffle_ps(xyzw, xyzw, _MM_SHUFFLE(3, 1, 0, 2));
    __m128 two_zxyw = _mm_add_ps(zxyw, zxyw);
    __m128 wwww = _mm_shuffle_ps(xyzw, xyzw, _MM_SHUFFLE(3, 3, 3, 3));
    __m128 diagonal = _mm_sub_ps(_mm_sub_ps(_mm_set1_ps(1.0f), _mm_mul_ps(two_yzxw, yzxw)), _mm_mul_ps(two_zxyw, zxyw)); // (1 - 2 y^2 - 2 z^2, 1 - 2 x^2 - 2 z^2, 1 - 2 x^2 - 2 y^2, 1 - 4 w^2)
    __m128 plus = _mm_add_ps(_mm_mul_ps(two_xyzw, zxyw), _mm_mul_ps(two_yzxw, wwww)); // 2 * (xz + yw, xy + zw, yz + xw, ww)
    __m128 minus = _mm_sub_ps(_mm_mul_ps(two_yzxw, xyzw), _mm_mul_ps(two_zxyw, wwww)); // 2 * (xy - zw, yz - xw, xz - yw, 0)

    // Workaround for compiler changing _mm_sub_ps(_mm_mul_ps(...), ...) into a fused multiply sub instruction, resulting in w not being 0
    // There doesn't appear to be a reliable way to turn this off in Clang
    minus = _mm_insert_ps(minus, minus, 0b1000);

    __m128 col0 = _mm_blend_ps(_mm_blend_ps(plus, diagonal, 0b0001), minus, 0b1100); // (1 - 2 y^2 - 2 z^2, 2 xy + 2 zw, 2 xz - 2 yw, 0)
    __m128 col1 = _mm_blend_ps(_mm_blend_ps(diagonal, minus, 0b1001), plus, 0b0100); // (2 xy - 2 zw, 1 - 2 x^2 - 2 z^2, 2 yz + 2 xw, 0)
    __m128 col2 = _mm_blend_ps(_mm_blend_ps(minus, plus, 0b0001), diagonal, 0b0100); // (2 xz + 2 yw, 2 yz - 2 xw, 1 - 2 x^2 - 2 y^2, 0)
    __m128 col3 = _mm_set_ps(1, 0, 0, 0);

    return Mat44(col0, col1, col2, col3);
#else
    float x = inQuat.GetX();
    float y = inQuat.GetY();
    float z = inQuat.GetZ();
    float w = inQuat.GetW();

    float tx = x + x; // Note: Using x + x instead of 2.0f * x to force this function to return the same value as the SSE4.1 version across platforms.
    float ty = y + y;
    float tz = z + z;

    float xx = tx * x;
    float yy = ty * y;
    float zz = tz * z;
    float xy = tx * y;
    float xz = tx * z;
    float xw = tx * w;
    float yz = ty * z;
    float yw = ty * w;
    float zw = tz * w;

    return Mat44(Vec4((1.0f - yy) - zz, xy + zw, xz - yw, 0.0f), // Note: Added extra brackets to force this function to return the same value as the SSE4.1 version across platforms.
        Vec4(xy - zw, (1.0f - zz) - xx, yz + xw, 0.0f),
        Vec4(xz + yw, yz - xw, (1.0f - xx) - yy, 0.0f),
        Vec4(0.0f, 0.0f, 0.0f, 1.0f));
#endif
}
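
// Usage sketch: for a normalized quaternion q, the matrix returned above applies the same rotation
// as the quaternion itself, i.e. Mat44::sRotation(q) * v should match q * v up to rounding:
//
//    Quat q = Quat::sRotation(Vec3::sAxisY(), 0.5f * JPH_PI);
//    Vec3 v(1, 0, 0);
//    Vec3 r1 = Mat44::sRotation(q) * v; // ~(0, 0, -1)
//    Vec3 r2 = q * v;                   // same result up to rounding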

Mat44 Mat44::sRotation(Vec3Arg inAxis, float inAngle)
{
    return sRotation(Quat::sRotation(inAxis, inAngle));
}

Mat44 Mat44::sTranslation(Vec3Arg inV)
{
    return Mat44(Vec4(1, 0, 0, 0), Vec4(0, 1, 0, 0), Vec4(0, 0, 1, 0), Vec4(inV, 1));
}

Mat44 Mat44::sRotationTranslation(QuatArg inR, Vec3Arg inT)
{
    Mat44 m = sRotation(inR);
    m.SetTranslation(inT);
    return m;
}

Mat44 Mat44::sInverseRotationTranslation(QuatArg inR, Vec3Arg inT)
{
    Mat44 m = sRotation(inR.Conjugated());
    m.SetTranslation(-m.Multiply3x3(inT));
    return m;
}
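
// Note: sInverseRotationTranslation(r, t) builds the inverse of sRotationTranslation(r, t) directly,
// without constructing the forward matrix first. Sanity check, assuming r is normalized:
//
//    Mat44 m = Mat44::sRotationTranslation(r, t);
//    JPH_ASSERT((m * Mat44::sInverseRotationTranslation(r, t)).IsClose(Mat44::sIdentity()));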

Mat44 Mat44::sScale(float inScale)
{
    return Mat44(Vec4(inScale, 0, 0, 0), Vec4(0, inScale, 0, 0), Vec4(0, 0, inScale, 0), Vec4(0, 0, 0, 1));
}

Mat44 Mat44::sScale(Vec3Arg inV)
{
    return Mat44(Vec4(inV.GetX(), 0, 0, 0), Vec4(0, inV.GetY(), 0, 0), Vec4(0, 0, inV.GetZ(), 0), Vec4(0, 0, 0, 1));
}

Mat44 Mat44::sOuterProduct(Vec3Arg inV1, Vec3Arg inV2)
{
    Vec4 v1(inV1, 0);
    return Mat44(v1 * inV2.SplatX(), v1 * inV2.SplatY(), v1 * inV2.SplatZ(), Vec4(0, 0, 0, 1));
}
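
// Note: the 3x3 part of sOuterProduct(v1, v2) is v1 * v2^T, so multiplying a vector by it scales v1
// by the dot product with v2:
//
//    Vec3 v1(1, 2, 3), v2(4, 5, 6);
//    Vec3 r = Mat44::sOuterProduct(v1, v2).Multiply3x3(Vec3(1, 0, 0)); // == v1 * v2.GetX() == (4, 8, 12)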

Mat44 Mat44::sCrossProduct(Vec3Arg inV)
{
#ifdef JPH_USE_SSE4_1
    // Zero out the W component
    __m128 zero = _mm_setzero_ps();
    __m128 v = _mm_blend_ps(inV.mValue, zero, 0b1000);

    // Negate
    __m128 min_v = _mm_sub_ps(zero, v);

    return Mat44(
        _mm_shuffle_ps(v, min_v, _MM_SHUFFLE(3, 1, 2, 3)), // [0, z, -y, 0]
        _mm_shuffle_ps(min_v, v, _MM_SHUFFLE(3, 0, 3, 2)), // [-z, 0, x, 0]
        _mm_blend_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 1)), _mm_shuffle_ps(min_v, min_v, _MM_SHUFFLE(3, 3, 0, 3)), 0b0010), // [y, -x, 0, 0]
        Vec4(0, 0, 0, 1));
#else
    float x = inV.GetX();
    float y = inV.GetY();
    float z = inV.GetZ();

    return Mat44(
        Vec4(0, z, -y, 0),
        Vec4(-z, 0, x, 0),
        Vec4(y, -x, 0, 0),
        Vec4(0, 0, 0, 1));
#endif
}
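
// Note: this is the skew-symmetric 'hat' matrix of inV, so multiplying by it computes a cross product:
//
//    Mat44::sCrossProduct(v).Multiply3x3(w) == v.Cross(w) // for any Vec3 v, w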

Mat44 Mat44::sLookAt(Vec3Arg inPos, Vec3Arg inTarget, Vec3Arg inUp)
{
    Vec3 direction = (inTarget - inPos).NormalizedOr(-Vec3::sAxisZ());
    Vec3 right = direction.Cross(inUp).NormalizedOr(Vec3::sAxisX());
    Vec3 up = right.Cross(direction);

    return Mat44(Vec4(right, 0), Vec4(up, 0), Vec4(-direction, 0), Vec4(inPos, 1)).InversedRotationTranslation();
}
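
// Usage sketch: sLookAt returns a right-handed view matrix with the camera at inPos looking along
// its negative Z axis. For example, a camera at (0, 0, 10) looking at the origin:
//
//    Mat44 view = Mat44::sLookAt(Vec3(0, 0, 10), Vec3::sZero(), Vec3::sAxisY());
//    Vec3 p = view * Vec3::sZero(); // origin maps to (0, 0, -10): 10 units in front of the camera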

Mat44 Mat44::sPerspective(float inFovY, float inAspect, float inNear, float inFar)
{
    float height = 1.0f / Tan(0.5f * inFovY);
    float width = height / inAspect;
    float range = inFar / (inNear - inFar);

    return Mat44(Vec4(width, 0.0f, 0.0f, 0.0f), Vec4(0.0f, height, 0.0f, 0.0f), Vec4(0.0f, 0.0f, range, -1.0f), Vec4(0.0f, 0.0f, range * inNear, 0.0f));
}
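
// Note: judging from the matrix layout, this appears to be a right-handed projection (camera looks
// along -Z) with a [0, 1] depth range: z = -inNear maps to depth 0 and z = -inFar to depth 1, e.g.:
//
//    Mat44 proj = Mat44::sPerspective(DegreesToRadians(60.0f), 16.0f / 9.0f, 0.1f, 1000.0f);
//    Vec4 clip = proj * Vec4(0, 0, -0.1f, 1); // clip.GetZ() / clip.GetW() == 0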

bool Mat44::operator == (Mat44Arg inM2) const
{
    return UVec4::sAnd(
        UVec4::sAnd(Vec4::sEquals(mCol[0], inM2.mCol[0]), Vec4::sEquals(mCol[1], inM2.mCol[1])),
        UVec4::sAnd(Vec4::sEquals(mCol[2], inM2.mCol[2]), Vec4::sEquals(mCol[3], inM2.mCol[3]))
    ).TestAllTrue();
}

bool Mat44::IsClose(Mat44Arg inM2, float inMaxDistSq) const
{
    for (int i = 0; i < 4; ++i)
        if (!mCol[i].IsClose(inM2.mCol[i], inMaxDistSq))
            return false;
    return true;
}

Mat44 Mat44::operator * (Mat44Arg inM) const
{
    Mat44 result;
#if defined(JPH_USE_SSE)
    for (int i = 0; i < 4; ++i)
    {
        __m128 c = inM.mCol[i].mValue;
        __m128 t = _mm_mul_ps(mCol[0].mValue, _mm_shuffle_ps(c, c, _MM_SHUFFLE(0, 0, 0, 0)));
        t = _mm_add_ps(t, _mm_mul_ps(mCol[1].mValue, _mm_shuffle_ps(c, c, _MM_SHUFFLE(1, 1, 1, 1))));
        t = _mm_add_ps(t, _mm_mul_ps(mCol[2].mValue, _mm_shuffle_ps(c, c, _MM_SHUFFLE(2, 2, 2, 2))));
        t = _mm_add_ps(t, _mm_mul_ps(mCol[3].mValue, _mm_shuffle_ps(c, c, _MM_SHUFFLE(3, 3, 3, 3))));
        result.mCol[i].mValue = t;
    }
#elif defined(JPH_USE_NEON)
    for (int i = 0; i < 4; ++i)
    {
        Type c = inM.mCol[i].mValue;
        Type t = vmulq_f32(mCol[0].mValue, vdupq_laneq_f32(c, 0));
        t = vmlaq_f32(t, mCol[1].mValue, vdupq_laneq_f32(c, 1));
        t = vmlaq_f32(t, mCol[2].mValue, vdupq_laneq_f32(c, 2));
        t = vmlaq_f32(t, mCol[3].mValue, vdupq_laneq_f32(c, 3));
        result.mCol[i].mValue = t;
    }
#else
    for (int i = 0; i < 4; ++i)
        result.mCol[i] = mCol[0] * inM.mCol[i].mF32[0] + mCol[1] * inM.mCol[i].mF32[1] + mCol[2] * inM.mCol[i].mF32[2] + mCol[3] * inM.mCol[i].mF32[3];
#endif
    return result;
}
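
// Note: the matrix is stored column-major (mCol holds the columns, see JPH_EL above), so each result
// column is a linear combination of this matrix's columns weighted by the matching column of inM:
// result.mCol[i] = sum over j of mCol[j] * inM.mCol[i][j].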

Vec3 Mat44::operator * (Vec3Arg inV) const
{
#if defined(JPH_USE_SSE)
    __m128 t = _mm_mul_ps(mCol[0].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(0, 0, 0, 0)));
    t = _mm_add_ps(t, _mm_mul_ps(mCol[1].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(1, 1, 1, 1))));
    t = _mm_add_ps(t, _mm_mul_ps(mCol[2].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(2, 2, 2, 2))));
    t = _mm_add_ps(t, mCol[3].mValue);
    return Vec3::sFixW(t);
#elif defined(JPH_USE_NEON)
    Type t = vmulq_f32(mCol[0].mValue, vdupq_laneq_f32(inV.mValue, 0));
    t = vmlaq_f32(t, mCol[1].mValue, vdupq_laneq_f32(inV.mValue, 1));
    t = vmlaq_f32(t, mCol[2].mValue, vdupq_laneq_f32(inV.mValue, 2));
    t = vaddq_f32(t, mCol[3].mValue); // Don't combine this with the first mul into a fused multiply add, as that causes precision issues
    return Vec3::sFixW(t);
#else
    return Vec3(
        mCol[0].mF32[0] * inV.mF32[0] + mCol[1].mF32[0] * inV.mF32[1] + mCol[2].mF32[0] * inV.mF32[2] + mCol[3].mF32[0],
        mCol[0].mF32[1] * inV.mF32[0] + mCol[1].mF32[1] * inV.mF32[1] + mCol[2].mF32[1] * inV.mF32[2] + mCol[3].mF32[1],
        mCol[0].mF32[2] * inV.mF32[0] + mCol[1].mF32[2] * inV.mF32[1] + mCol[2].mF32[2] * inV.mF32[2] + mCol[3].mF32[2]);
#endif
}

Vec4 Mat44::operator * (Vec4Arg inV) const
{
#if defined(JPH_USE_SSE)
    __m128 t = _mm_mul_ps(mCol[0].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(0, 0, 0, 0)));
    t = _mm_add_ps(t, _mm_mul_ps(mCol[1].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(1, 1, 1, 1))));
    t = _mm_add_ps(t, _mm_mul_ps(mCol[2].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(2, 2, 2, 2))));
    t = _mm_add_ps(t, _mm_mul_ps(mCol[3].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(3, 3, 3, 3))));
    return t;
#elif defined(JPH_USE_NEON)
    Type t = vmulq_f32(mCol[0].mValue, vdupq_laneq_f32(inV.mValue, 0));
    t = vmlaq_f32(t, mCol[1].mValue, vdupq_laneq_f32(inV.mValue, 1));
    t = vmlaq_f32(t, mCol[2].mValue, vdupq_laneq_f32(inV.mValue, 2));
    t = vmlaq_f32(t, mCol[3].mValue, vdupq_laneq_f32(inV.mValue, 3));
    return t;
#else
    return Vec4(
        mCol[0].mF32[0] * inV.mF32[0] + mCol[1].mF32[0] * inV.mF32[1] + mCol[2].mF32[0] * inV.mF32[2] + mCol[3].mF32[0] * inV.mF32[3],
        mCol[0].mF32[1] * inV.mF32[0] + mCol[1].mF32[1] * inV.mF32[1] + mCol[2].mF32[1] * inV.mF32[2] + mCol[3].mF32[1] * inV.mF32[3],
        mCol[0].mF32[2] * inV.mF32[0] + mCol[1].mF32[2] * inV.mF32[1] + mCol[2].mF32[2] * inV.mF32[2] + mCol[3].mF32[2] * inV.mF32[3],
        mCol[0].mF32[3] * inV.mF32[0] + mCol[1].mF32[3] * inV.mF32[1] + mCol[2].mF32[3] * inV.mF32[2] + mCol[3].mF32[3] * inV.mF32[3]);
#endif
}

Vec3 Mat44::Multiply3x3(Vec3Arg inV) const
{
#if defined(JPH_USE_SSE)
    __m128 t = _mm_mul_ps(mCol[0].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(0, 0, 0, 0)));
    t = _mm_add_ps(t, _mm_mul_ps(mCol[1].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(1, 1, 1, 1))));
    t = _mm_add_ps(t, _mm_mul_ps(mCol[2].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(2, 2, 2, 2))));
    return Vec3::sFixW(t);
#elif defined(JPH_USE_NEON)
    Type t = vmulq_f32(mCol[0].mValue, vdupq_laneq_f32(inV.mValue, 0));
    t = vmlaq_f32(t, mCol[1].mValue, vdupq_laneq_f32(inV.mValue, 1));
    t = vmlaq_f32(t, mCol[2].mValue, vdupq_laneq_f32(inV.mValue, 2));
    return Vec3::sFixW(t);
#else
    return Vec3(
        mCol[0].mF32[0] * inV.mF32[0] + mCol[1].mF32[0] * inV.mF32[1] + mCol[2].mF32[0] * inV.mF32[2],
        mCol[0].mF32[1] * inV.mF32[0] + mCol[1].mF32[1] * inV.mF32[1] + mCol[2].mF32[1] * inV.mF32[2],
        mCol[0].mF32[2] * inV.mF32[0] + mCol[1].mF32[2] * inV.mF32[1] + mCol[2].mF32[2] * inV.mF32[2]);
#endif
}

Vec3 Mat44::Multiply3x3Transposed(Vec3Arg inV) const
{
#if defined(JPH_USE_SSE4_1)
    __m128 x = _mm_dp_ps(mCol[0].mValue, inV.mValue, 0x7f);
    __m128 y = _mm_dp_ps(mCol[1].mValue, inV.mValue, 0x7f);
    __m128 xy = _mm_blend_ps(x, y, 0b0010);
    __m128 z = _mm_dp_ps(mCol[2].mValue, inV.mValue, 0x7f);
    __m128 xyzz = _mm_blend_ps(xy, z, 0b1100);
    return xyzz;
#else
    return Transposed3x3().Multiply3x3(inV);
#endif
}

Mat44 Mat44::Multiply3x3(Mat44Arg inM) const
{
    JPH_ASSERT(mCol[0][3] == 0.0f);
    JPH_ASSERT(mCol[1][3] == 0.0f);
    JPH_ASSERT(mCol[2][3] == 0.0f);

    Mat44 result;
#if defined(JPH_USE_SSE)
    for (int i = 0; i < 3; ++i)
    {
        __m128 c = inM.mCol[i].mValue;
        __m128 t = _mm_mul_ps(mCol[0].mValue, _mm_shuffle_ps(c, c, _MM_SHUFFLE(0, 0, 0, 0)));
        t = _mm_add_ps(t, _mm_mul_ps(mCol[1].mValue, _mm_shuffle_ps(c, c, _MM_SHUFFLE(1, 1, 1, 1))));
        t = _mm_add_ps(t, _mm_mul_ps(mCol[2].mValue, _mm_shuffle_ps(c, c, _MM_SHUFFLE(2, 2, 2, 2))));
        result.mCol[i].mValue = t;
    }
#elif defined(JPH_USE_NEON)
    for (int i = 0; i < 3; ++i)
    {
        Type c = inM.mCol[i].mValue;
        Type t = vmulq_f32(mCol[0].mValue, vdupq_laneq_f32(c, 0));
        t = vmlaq_f32(t, mCol[1].mValue, vdupq_laneq_f32(c, 1));
        t = vmlaq_f32(t, mCol[2].mValue, vdupq_laneq_f32(c, 2));
        result.mCol[i].mValue = t;
    }
#else
    for (int i = 0; i < 3; ++i)
        result.mCol[i] = mCol[0] * inM.mCol[i].mF32[0] + mCol[1] * inM.mCol[i].mF32[1] + mCol[2] * inM.mCol[i].mF32[2];
#endif
    result.mCol[3] = Vec4(0, 0, 0, 1);
    return result;
}

Mat44 Mat44::Multiply3x3LeftTransposed(Mat44Arg inM) const
{
    // Transpose left hand side
    Mat44 trans = Transposed3x3();

    // Do 3x3 matrix multiply
    Mat44 result;
    result.mCol[0] = trans.mCol[0] * inM.mCol[0].SplatX() + trans.mCol[1] * inM.mCol[0].SplatY() + trans.mCol[2] * inM.mCol[0].SplatZ();
    result.mCol[1] = trans.mCol[0] * inM.mCol[1].SplatX() + trans.mCol[1] * inM.mCol[1].SplatY() + trans.mCol[2] * inM.mCol[1].SplatZ();
    result.mCol[2] = trans.mCol[0] * inM.mCol[2].SplatX() + trans.mCol[1] * inM.mCol[2].SplatY() + trans.mCol[2] * inM.mCol[2].SplatZ();
    result.mCol[3] = Vec4(0, 0, 0, 1);
    return result;
}

Mat44 Mat44::Multiply3x3RightTransposed(Mat44Arg inM) const
{
    JPH_ASSERT(mCol[0][3] == 0.0f);
    JPH_ASSERT(mCol[1][3] == 0.0f);
    JPH_ASSERT(mCol[2][3] == 0.0f);

    Mat44 result;
    result.mCol[0] = mCol[0] * inM.mCol[0].SplatX() + mCol[1] * inM.mCol[1].SplatX() + mCol[2] * inM.mCol[2].SplatX();
    result.mCol[1] = mCol[0] * inM.mCol[0].SplatY() + mCol[1] * inM.mCol[1].SplatY() + mCol[2] * inM.mCol[2].SplatY();
    result.mCol[2] = mCol[0] * inM.mCol[0].SplatZ() + mCol[1] * inM.mCol[1].SplatZ() + mCol[2] * inM.mCol[2].SplatZ();
    result.mCol[3] = Vec4(0, 0, 0, 1);
    return result;
}
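
// Note: the two variants above are 3x3-only products with one operand transposed:
// Multiply3x3LeftTransposed(m) computes this^T * m and Multiply3x3RightTransposed(m) computes
// this * m^T (the latter without materializing the transpose), so up to rounding:
//
//    a.Multiply3x3RightTransposed(b) == a.Multiply3x3(b.Transposed3x3())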

Mat44 Mat44::operator * (float inV) const
{
    Vec4 multiplier = Vec4::sReplicate(inV);

    Mat44 result;
    for (int c = 0; c < 4; ++c)
        result.mCol[c] = mCol[c] * multiplier;
    return result;
}

Mat44 &Mat44::operator *= (float inV)
{
    for (int c = 0; c < 4; ++c)
        mCol[c] *= inV;
    return *this;
}

Mat44 Mat44::operator + (Mat44Arg inM) const
{
    Mat44 result;
    for (int i = 0; i < 4; ++i)
        result.mCol[i] = mCol[i] + inM.mCol[i];
    return result;
}

Mat44 Mat44::operator - () const
{
    Mat44 result;
    for (int i = 0; i < 4; ++i)
        result.mCol[i] = -mCol[i];
    return result;
}

Mat44 Mat44::operator - (Mat44Arg inM) const
{
    Mat44 result;
    for (int i = 0; i < 4; ++i)
        result.mCol[i] = mCol[i] - inM.mCol[i];
    return result;
}

Mat44 &Mat44::operator += (Mat44Arg inM)
{
    for (int c = 0; c < 4; ++c)
        mCol[c] += inM.mCol[c];
    return *this;
}

void Mat44::StoreFloat4x4(Float4 *outV) const
{
    for (int c = 0; c < 4; ++c)
        mCol[c].StoreFloat4(outV + c);
}

Mat44 Mat44::Transposed() const
{
#if defined(JPH_USE_SSE)
    __m128 tmp1 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(1, 0, 1, 0));
    __m128 tmp3 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(3, 2, 3, 2));
    __m128 tmp2 = _mm_shuffle_ps(mCol[2].mValue, mCol[3].mValue, _MM_SHUFFLE(1, 0, 1, 0));
    __m128 tmp4 = _mm_shuffle_ps(mCol[2].mValue, mCol[3].mValue, _MM_SHUFFLE(3, 2, 3, 2));

    Mat44 result;
    result.mCol[0].mValue = _mm_shuffle_ps(tmp1, tmp2, _MM_SHUFFLE(2, 0, 2, 0));
    result.mCol[1].mValue = _mm_shuffle_ps(tmp1, tmp2, _MM_SHUFFLE(3, 1, 3, 1));
    result.mCol[2].mValue = _mm_shuffle_ps(tmp3, tmp4, _MM_SHUFFLE(2, 0, 2, 0));
    result.mCol[3].mValue = _mm_shuffle_ps(tmp3, tmp4, _MM_SHUFFLE(3, 1, 3, 1));
    return result;
#elif defined(JPH_USE_NEON)
    float32x4x2_t tmp1 = vzipq_f32(mCol[0].mValue, mCol[2].mValue);
    float32x4x2_t tmp2 = vzipq_f32(mCol[1].mValue, mCol[3].mValue);
    float32x4x2_t tmp3 = vzipq_f32(tmp1.val[0], tmp2.val[0]);
    float32x4x2_t tmp4 = vzipq_f32(tmp1.val[1], tmp2.val[1]);

    Mat44 result;
    result.mCol[0].mValue = tmp3.val[0];
    result.mCol[1].mValue = tmp3.val[1];
    result.mCol[2].mValue = tmp4.val[0];
    result.mCol[3].mValue = tmp4.val[1];
    return result;
#else
    Mat44 result;
    for (int c = 0; c < 4; ++c)
        for (int r = 0; r < 4; ++r)
            result.mCol[r].mF32[c] = mCol[c].mF32[r];
    return result;
#endif
}

Mat44 Mat44::Transposed3x3() const
{
#if defined(JPH_USE_SSE)
    __m128 zero = _mm_setzero_ps();
    __m128 tmp1 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(1, 0, 1, 0));
    __m128 tmp3 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(3, 2, 3, 2));
    __m128 tmp2 = _mm_shuffle_ps(mCol[2].mValue, zero, _MM_SHUFFLE(1, 0, 1, 0));
    __m128 tmp4 = _mm_shuffle_ps(mCol[2].mValue, zero, _MM_SHUFFLE(3, 2, 3, 2));

    Mat44 result;
    result.mCol[0].mValue = _mm_shuffle_ps(tmp1, tmp2, _MM_SHUFFLE(2, 0, 2, 0));
    result.mCol[1].mValue = _mm_shuffle_ps(tmp1, tmp2, _MM_SHUFFLE(3, 1, 3, 1));
    result.mCol[2].mValue = _mm_shuffle_ps(tmp3, tmp4, _MM_SHUFFLE(2, 0, 2, 0));
#elif defined(JPH_USE_NEON)
    float32x4x2_t tmp1 = vzipq_f32(mCol[0].mValue, mCol[2].mValue);
    float32x4x2_t tmp2 = vzipq_f32(mCol[1].mValue, vdupq_n_f32(0));
    float32x4x2_t tmp3 = vzipq_f32(tmp1.val[0], tmp2.val[0]);
    float32x4x2_t tmp4 = vzipq_f32(tmp1.val[1], tmp2.val[1]);

    Mat44 result;
    result.mCol[0].mValue = tmp3.val[0];
    result.mCol[1].mValue = tmp3.val[1];
    result.mCol[2].mValue = tmp4.val[0];
#else
    Mat44 result;
    for (int c = 0; c < 3; ++c)
    {
        for (int r = 0; r < 3; ++r)
            result.mCol[c].mF32[r] = mCol[r].mF32[c];
        result.mCol[c].mF32[3] = 0;
    }
#endif
    result.mCol[3] = Vec4(0, 0, 0, 1);
    return result;
}

Mat44 Mat44::Inversed() const
{
#if defined(JPH_USE_SSE)
    // Algorithm from: http://download.intel.com/design/PentiumIII/sml/24504301.pdf
    // Streaming SIMD Extensions - Inverse of 4x4 Matrix
    // Adapted to load data using _mm_shuffle_ps instead of loading from memory
    // Replaced _mm_rcp_ps with _mm_div_ps for better accuracy
    __m128 tmp1 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(1, 0, 1, 0));
    __m128 row1 = _mm_shuffle_ps(mCol[2].mValue, mCol[3].mValue, _MM_SHUFFLE(1, 0, 1, 0));
    __m128 row0 = _mm_shuffle_ps(tmp1, row1, _MM_SHUFFLE(2, 0, 2, 0));
    row1 = _mm_shuffle_ps(row1, tmp1, _MM_SHUFFLE(3, 1, 3, 1));
    tmp1 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(3, 2, 3, 2));
    __m128 row3 = _mm_shuffle_ps(mCol[2].mValue, mCol[3].mValue, _MM_SHUFFLE(3, 2, 3, 2));
    __m128 row2 = _mm_shuffle_ps(tmp1, row3, _MM_SHUFFLE(2, 0, 2, 0));
    row3 = _mm_shuffle_ps(row3, tmp1, _MM_SHUFFLE(3, 1, 3, 1));
    tmp1 = _mm_mul_ps(row2, row3);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
    __m128 minor0 = _mm_mul_ps(row1, tmp1);
    __m128 minor1 = _mm_mul_ps(row0, tmp1);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
    minor0 = _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0);
    minor1 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1);
    minor1 = _mm_shuffle_ps(minor1, minor1, _MM_SHUFFLE(1, 0, 3, 2));
    tmp1 = _mm_mul_ps(row1, row2);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
    minor0 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0);
    __m128 minor3 = _mm_mul_ps(row0, tmp1);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
    minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1));
    minor3 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor3);
    minor3 = _mm_shuffle_ps(minor3, minor3, _MM_SHUFFLE(1, 0, 3, 2));
    tmp1 = _mm_mul_ps(_mm_shuffle_ps(row1, row1, _MM_SHUFFLE(1, 0, 3, 2)), row3);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
    row2 = _mm_shuffle_ps(row2, row2, _MM_SHUFFLE(1, 0, 3, 2));
    minor0 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0);
    __m128 minor2 = _mm_mul_ps(row0, tmp1);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
    minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1));
    minor2 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2);
    minor2 = _mm_shuffle_ps(minor2, minor2, _MM_SHUFFLE(1, 0, 3, 2));
    tmp1 = _mm_mul_ps(row0, row1);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
    minor2 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2);
    minor3 = _mm_sub_ps(_mm_mul_ps(row2, tmp1), minor3);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
    minor2 = _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2);
    minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row2, tmp1));
    tmp1 = _mm_mul_ps(row0, row3);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
    minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1));
    minor2 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
    minor1 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1);
    minor2 = _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1));
    tmp1 = _mm_mul_ps(row0, row2);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
    minor1 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1);
    minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row1, tmp1));
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
    minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1));
    minor3 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor3);

    __m128 det = _mm_mul_ps(row0, minor0);
    det = _mm_add_ps(_mm_shuffle_ps(det, det, _MM_SHUFFLE(2, 3, 0, 1)), det); // Original code did (x + z) + (y + w), changed to (x + y) + (z + w) to match the ARM code below and make the result cross platform deterministic
    det = _mm_add_ss(_mm_shuffle_ps(det, det, _MM_SHUFFLE(1, 0, 3, 2)), det);
    det = _mm_div_ss(_mm_set_ss(1.0f), det);
    det = _mm_shuffle_ps(det, det, _MM_SHUFFLE(0, 0, 0, 0));

    Mat44 result;
    result.mCol[0].mValue = _mm_mul_ps(det, minor0);
    result.mCol[1].mValue = _mm_mul_ps(det, minor1);
    result.mCol[2].mValue = _mm_mul_ps(det, minor2);
    result.mCol[3].mValue = _mm_mul_ps(det, minor3);
    return result;
#elif defined(JPH_USE_NEON)
    // Adapted from the SSE version; there are surprisingly few articles about efficient ways of calculating an inverse for ARM on the internet
    Type tmp1 = JPH_NEON_SHUFFLE_F32x4(mCol[0].mValue, mCol[1].mValue, 0, 1, 4, 5);
    Type row1 = JPH_NEON_SHUFFLE_F32x4(mCol[2].mValue, mCol[3].mValue, 0, 1, 4, 5);
    Type row0 = JPH_NEON_SHUFFLE_F32x4(tmp1, row1, 0, 2, 4, 6);
    row1 = JPH_NEON_SHUFFLE_F32x4(row1, tmp1, 1, 3, 5, 7);
    tmp1 = JPH_NEON_SHUFFLE_F32x4(mCol[0].mValue, mCol[1].mValue, 2, 3, 6, 7);
    Type row3 = JPH_NEON_SHUFFLE_F32x4(mCol[2].mValue, mCol[3].mValue, 2, 3, 6, 7);
    Type row2 = JPH_NEON_SHUFFLE_F32x4(tmp1, row3, 0, 2, 4, 6);
    row3 = JPH_NEON_SHUFFLE_F32x4(row3, tmp1, 1, 3, 5, 7);
    tmp1 = vmulq_f32(row2, row3);
    tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
    Type minor0 = vmulq_f32(row1, tmp1);
    Type minor1 = vmulq_f32(row0, tmp1);
    tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
    minor0 = vsubq_f32(vmulq_f32(row1, tmp1), minor0);
    minor1 = vsubq_f32(vmulq_f32(row0, tmp1), minor1);
    minor1 = JPH_NEON_SHUFFLE_F32x4(minor1, minor1, 2, 3, 0, 1);
    tmp1 = vmulq_f32(row1, row2);
    tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
    minor0 = vaddq_f32(vmulq_f32(row3, tmp1), minor0);
    Type minor3 = vmulq_f32(row0, tmp1);
    tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
    minor0 = vsubq_f32(minor0, vmulq_f32(row3, tmp1));
    minor3 = vsubq_f32(vmulq_f32(row0, tmp1), minor3);
    minor3 = JPH_NEON_SHUFFLE_F32x4(minor3, minor3, 2, 3, 0, 1);
    tmp1 = JPH_NEON_SHUFFLE_F32x4(row1, row1, 2, 3, 0, 1);
    tmp1 = vmulq_f32(tmp1, row3);
    tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
    row2 = JPH_NEON_SHUFFLE_F32x4(row2, row2, 2, 3, 0, 1);
    minor0 = vaddq_f32(vmulq_f32(row2, tmp1), minor0);
    Type minor2 = vmulq_f32(row0, tmp1);
    tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
    minor0 = vsubq_f32(minor0, vmulq_f32(row2, tmp1));
    minor2 = vsubq_f32(vmulq_f32(row0, tmp1), minor2);
    minor2 = JPH_NEON_SHUFFLE_F32x4(minor2, minor2, 2, 3, 0, 1);
    tmp1 = vmulq_f32(row0, row1);
    tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
    minor2 = vaddq_f32(vmulq_f32(row3, tmp1), minor2);
    minor3 = vsubq_f32(vmulq_f32(row2, tmp1), minor3);
    tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
    minor2 = vsubq_f32(vmulq_f32(row3, tmp1), minor2);
    minor3 = vsubq_f32(minor3, vmulq_f32(row2, tmp1));
    tmp1 = vmulq_f32(row0, row3);
    tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
    minor1 = vsubq_f32(minor1, vmulq_f32(row2, tmp1));
    minor2 = vaddq_f32(vmulq_f32(row1, tmp1), minor2);
    tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
    minor1 = vaddq_f32(vmulq_f32(row2, tmp1), minor1);
    minor2 = vsubq_f32(minor2, vmulq_f32(row1, tmp1));
    tmp1 = vmulq_f32(row0, row2);
    tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
    minor1 = vaddq_f32(vmulq_f32(row3, tmp1), minor1);
    minor3 = vsubq_f32(minor3, vmulq_f32(row1, tmp1));
    tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
    minor1 = vsubq_f32(minor1, vmulq_f32(row3, tmp1));
    minor3 = vaddq_f32(vmulq_f32(row1, tmp1), minor3);

    Type det = vmulq_f32(row0, minor0);
    det = vdupq_n_f32(vaddvq_f32(det));
    det = vdivq_f32(vdupq_n_f32(1.0f), det);

    Mat44 result;
    result.mCol[0].mValue = vmulq_f32(det, minor0);
    result.mCol[1].mValue = vmulq_f32(det, minor1);
    result.mCol[2].mValue = vmulq_f32(det, minor2);
    result.mCol[3].mValue = vmulq_f32(det, minor3);
    return result;
#else
    float m00 = JPH_EL(0, 0), m10 = JPH_EL(1, 0), m20 = JPH_EL(2, 0), m30 = JPH_EL(3, 0);
    float m01 = JPH_EL(0, 1), m11 = JPH_EL(1, 1), m21 = JPH_EL(2, 1), m31 = JPH_EL(3, 1);
    float m02 = JPH_EL(0, 2), m12 = JPH_EL(1, 2), m22 = JPH_EL(2, 2), m32 = JPH_EL(3, 2);
    float m03 = JPH_EL(0, 3), m13 = JPH_EL(1, 3), m23 = JPH_EL(2, 3), m33 = JPH_EL(3, 3);

    float m10211120 = m10 * m21 - m11 * m20;
    float m10221220 = m10 * m22 - m12 * m20;
    float m10231320 = m10 * m23 - m13 * m20;
    float m10311130 = m10 * m31 - m11 * m30;
    float m10321230 = m10 * m32 - m12 * m30;
    float m10331330 = m10 * m33 - m13 * m30;
    float m11221221 = m11 * m22 - m12 * m21;
    float m11231321 = m11 * m23 - m13 * m21;
    float m11321231 = m11 * m32 - m12 * m31;
    float m11331331 = m11 * m33 - m13 * m31;
    float m12231322 = m12 * m23 - m13 * m22;
    float m12331332 = m12 * m33 - m13 * m32;
    float m20312130 = m20 * m31 - m21 * m30;
    float m20322230 = m20 * m32 - m22 * m30;
    float m20332330 = m20 * m33 - m23 * m30;
    float m21322231 = m21 * m32 - m22 * m31;
    float m21332331 = m21 * m33 - m23 * m31;
    float m22332332 = m22 * m33 - m23 * m32;

    Vec4 col0(m11 * m22332332 - m12 * m21332331 + m13 * m21322231, -m10 * m22332332 + m12 * m20332330 - m13 * m20322230, m10 * m21332331 - m11 * m20332330 + m13 * m20312130, -m10 * m21322231 + m11 * m20322230 - m12 * m20312130);
    Vec4 col1(-m01 * m22332332 + m02 * m21332331 - m03 * m21322231, m00 * m22332332 - m02 * m20332330 + m03 * m20322230, -m00 * m21332331 + m01 * m20332330 - m03 * m20312130, m00 * m21322231 - m01 * m20322230 + m02 * m20312130);
    Vec4 col2(m01 * m12331332 - m02 * m11331331 + m03 * m11321231, -m00 * m12331332 + m02 * m10331330 - m03 * m10321230, m00 * m11331331 - m01 * m10331330 + m03 * m10311130, -m00 * m11321231 + m01 * m10321230 - m02 * m10311130);
    Vec4 col3(-m01 * m12231322 + m02 * m11231321 - m03 * m11221221, m00 * m12231322 - m02 * m10231320 + m03 * m10221220, -m00 * m11231321 + m01 * m10231320 - m03 * m10211120, m00 * m11221221 - m01 * m10221220 + m02 * m10211120);

    float det = m00 * col0.mF32[0] + m01 * col0.mF32[1] + m02 * col0.mF32[2] + m03 * col0.mF32[3];

    return Mat44(col0 / det, col1 / det, col2 / det, col3 / det);
#endif
}

Mat44 Mat44::InversedRotationTranslation() const
{
    Mat44 m = Transposed3x3();
    m.SetTranslation(-m.Multiply3x3(GetTranslation()));
    return m;
}

float Mat44::GetDeterminant3x3() const
{
    return GetAxisX().Dot(GetAxisY().Cross(GetAxisZ()));
}

Mat44 Mat44::Adjointed3x3() const
{
    return Mat44(
        Vec4(JPH_EL(1, 1), JPH_EL(1, 2), JPH_EL(1, 0), 0) * Vec4(JPH_EL(2, 2), JPH_EL(2, 0), JPH_EL(2, 1), 0)
            - Vec4(JPH_EL(1, 2), JPH_EL(1, 0), JPH_EL(1, 1), 0) * Vec4(JPH_EL(2, 1), JPH_EL(2, 2), JPH_EL(2, 0), 0),
        Vec4(JPH_EL(0, 2), JPH_EL(0, 0), JPH_EL(0, 1), 0) * Vec4(JPH_EL(2, 1), JPH_EL(2, 2), JPH_EL(2, 0), 0)
            - Vec4(JPH_EL(0, 1), JPH_EL(0, 2), JPH_EL(0, 0), 0) * Vec4(JPH_EL(2, 2), JPH_EL(2, 0), JPH_EL(2, 1), 0),
        Vec4(JPH_EL(0, 1), JPH_EL(0, 2), JPH_EL(0, 0), 0) * Vec4(JPH_EL(1, 2), JPH_EL(1, 0), JPH_EL(1, 1), 0)
            - Vec4(JPH_EL(0, 2), JPH_EL(0, 0), JPH_EL(0, 1), 0) * Vec4(JPH_EL(1, 1), JPH_EL(1, 2), JPH_EL(1, 0), 0),
        Vec4(0, 0, 0, 1));
}

Mat44 Mat44::Inversed3x3() const
{
    float det = GetDeterminant3x3();

    return Mat44(
        (Vec4(JPH_EL(1, 1), JPH_EL(1, 2), JPH_EL(1, 0), 0) * Vec4(JPH_EL(2, 2), JPH_EL(2, 0), JPH_EL(2, 1), 0)
            - Vec4(JPH_EL(1, 2), JPH_EL(1, 0), JPH_EL(1, 1), 0) * Vec4(JPH_EL(2, 1), JPH_EL(2, 2), JPH_EL(2, 0), 0)) / det,
        (Vec4(JPH_EL(0, 2), JPH_EL(0, 0), JPH_EL(0, 1), 0) * Vec4(JPH_EL(2, 1), JPH_EL(2, 2), JPH_EL(2, 0), 0)
            - Vec4(JPH_EL(0, 1), JPH_EL(0, 2), JPH_EL(0, 0), 0) * Vec4(JPH_EL(2, 2), JPH_EL(2, 0), JPH_EL(2, 1), 0)) / det,
        (Vec4(JPH_EL(0, 1), JPH_EL(0, 2), JPH_EL(0, 0), 0) * Vec4(JPH_EL(1, 2), JPH_EL(1, 0), JPH_EL(1, 1), 0)
            - Vec4(JPH_EL(0, 2), JPH_EL(0, 0), JPH_EL(0, 1), 0) * Vec4(JPH_EL(1, 1), JPH_EL(1, 2), JPH_EL(1, 0), 0)) / det,
        Vec4(0, 0, 0, 1));
}

bool Mat44::SetInversed3x3(Mat44Arg inM)
{
    float det = inM.GetDeterminant3x3();

    // If the determinant is zero, the matrix is singular and we return false
    if (det == 0.0f)
        return false;

    // Finish calculating the inverse
    *this = inM.Adjointed3x3();
    mCol[0] /= det;
    mCol[1] /= det;
    mCol[2] /= det;
    return true;
}
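
// Usage sketch: unlike Inversed3x3, which divides by the determinant unconditionally, SetInversed3x3
// reports failure for a singular 3x3 part:
//
//    Mat44 inv;
//    if (inv.SetInversed3x3(m))
//    {
//        // use inv
//    }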

Quat Mat44::GetQuaternion() const
{
    float tr = mCol[0].mF32[0] + mCol[1].mF32[1] + mCol[2].mF32[2];

    if (tr >= 0.0f)
    {
        float s = sqrt(tr + 1.0f);
        float is = 0.5f / s;
        return Quat(
            (mCol[1].mF32[2] - mCol[2].mF32[1]) * is,
            (mCol[2].mF32[0] - mCol[0].mF32[2]) * is,
            (mCol[0].mF32[1] - mCol[1].mF32[0]) * is,
            0.5f * s);
    }
    else
    {
        int i = 0;
        if (mCol[1].mF32[1] > mCol[0].mF32[0]) i = 1;
        if (mCol[2].mF32[2] > mCol[i].mF32[i]) i = 2;

        if (i == 0)
        {
            float s = sqrt(mCol[0].mF32[0] - (mCol[1].mF32[1] + mCol[2].mF32[2]) + 1);
            float is = 0.5f / s;
            return Quat(
                0.5f * s,
                (mCol[1].mF32[0] + mCol[0].mF32[1]) * is,
                (mCol[0].mF32[2] + mCol[2].mF32[0]) * is,
                (mCol[1].mF32[2] - mCol[2].mF32[1]) * is);
        }
        else if (i == 1)
        {
            float s = sqrt(mCol[1].mF32[1] - (mCol[2].mF32[2] + mCol[0].mF32[0]) + 1);
            float is = 0.5f / s;
            return Quat(
                (mCol[1].mF32[0] + mCol[0].mF32[1]) * is,
                0.5f * s,
                (mCol[2].mF32[1] + mCol[1].mF32[2]) * is,
                (mCol[2].mF32[0] - mCol[0].mF32[2]) * is);
        }
        else
        {
            JPH_ASSERT(i == 2);

            float s = sqrt(mCol[2].mF32[2] - (mCol[0].mF32[0] + mCol[1].mF32[1]) + 1);
            float is = 0.5f / s;
            return Quat(
                (mCol[0].mF32[2] + mCol[2].mF32[0]) * is,
                (mCol[2].mF32[1] + mCol[1].mF32[2]) * is,
                0.5f * s,
                (mCol[0].mF32[1] - mCol[1].mF32[0]) * is);
        }
    }
}
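
// Note: GetQuaternion uses the usual branching scheme (often attributed to Shepperd): it selects the
// largest of the trace and the three diagonal elements so that s stays well away from zero, keeping
// the division by s numerically stable. Round trip, assuming the matrix is a pure rotation:
//
//    Quat q = m.GetQuaternion();
//    JPH_ASSERT(Mat44::sRotation(q).IsClose(m));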

Mat44 Mat44::sQuatLeftMultiply(QuatArg inQ)
{
    return Mat44(
        Vec4(1, 1, -1, -1) * inQ.mValue.Swizzle<SWIZZLE_W, SWIZZLE_Z, SWIZZLE_Y, SWIZZLE_X>(),
        Vec4(-1, 1, 1, -1) * inQ.mValue.Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_X, SWIZZLE_Y>(),
        Vec4(1, -1, 1, -1) * inQ.mValue.Swizzle<SWIZZLE_Y, SWIZZLE_X, SWIZZLE_W, SWIZZLE_Z>(),
        inQ.mValue);
}

Mat44 Mat44::sQuatRightMultiply(QuatArg inQ)
{
    return Mat44(
        Vec4(1, -1, 1, -1) * inQ.mValue.Swizzle<SWIZZLE_W, SWIZZLE_Z, SWIZZLE_Y, SWIZZLE_X>(),
        Vec4(1, 1, -1, -1) * inQ.mValue.Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_X, SWIZZLE_Y>(),
        Vec4(-1, 1, 1, -1) * inQ.mValue.Swizzle<SWIZZLE_Y, SWIZZLE_X, SWIZZLE_W, SWIZZLE_Z>(),
        inQ.mValue);
}
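
// Note: these are the 4x4 matrices of quaternion multiplication acting on the (x, y, z, w) components,
// which makes it possible to treat quaternion products as linear operators (e.g. when building Jacobians).
// For quaternions p and q, up to rounding:
//
//    Mat44::sQuatLeftMultiply(p) * q.mValue == (p * q).mValue
//    Mat44::sQuatRightMultiply(p) * q.mValue == (q * p).mValue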

Mat44 Mat44::GetRotation() const
{
    JPH_ASSERT(mCol[0][3] == 0.0f);
    JPH_ASSERT(mCol[1][3] == 0.0f);
    JPH_ASSERT(mCol[2][3] == 0.0f);

    return Mat44(mCol[0], mCol[1], mCol[2], Vec4(0, 0, 0, 1));
}

Mat44 Mat44::GetRotationSafe() const
{
#if defined(JPH_USE_AVX512)
    return Mat44(_mm_maskz_mov_ps(0b0111, mCol[0].mValue),
        _mm_maskz_mov_ps(0b0111, mCol[1].mValue),
        _mm_maskz_mov_ps(0b0111, mCol[2].mValue),
        Vec4(0, 0, 0, 1));
#elif defined(JPH_USE_SSE4_1)
    __m128 zero = _mm_setzero_ps();
    return Mat44(_mm_blend_ps(mCol[0].mValue, zero, 8),
        _mm_blend_ps(mCol[1].mValue, zero, 8),
        _mm_blend_ps(mCol[2].mValue, zero, 8),
        Vec4(0, 0, 0, 1));
#elif defined(JPH_USE_NEON)
    return Mat44(vsetq_lane_f32(0, mCol[0].mValue, 3),
        vsetq_lane_f32(0, mCol[1].mValue, 3),
        vsetq_lane_f32(0, mCol[2].mValue, 3),
        Vec4(0, 0, 0, 1));
#else
    return Mat44(Vec4(mCol[0].mF32[0], mCol[0].mF32[1], mCol[0].mF32[2], 0),
        Vec4(mCol[1].mF32[0], mCol[1].mF32[1], mCol[1].mF32[2], 0),
        Vec4(mCol[2].mF32[0], mCol[2].mF32[1], mCol[2].mF32[2], 0),
        Vec4(0, 0, 0, 1));
#endif
}

void Mat44::SetRotation(Mat44Arg inRotation)
{
    mCol[0] = inRotation.mCol[0];
    mCol[1] = inRotation.mCol[1];
    mCol[2] = inRotation.mCol[2];
}

Mat44 Mat44::PreTranslated(Vec3Arg inTranslation) const
{
    return Mat44(mCol[0], mCol[1], mCol[2], Vec4(GetTranslation() + Multiply3x3(inTranslation), 1));
}

Mat44 Mat44::PostTranslated(Vec3Arg inTranslation) const
{
    return Mat44(mCol[0], mCol[1], mCol[2], Vec4(GetTranslation() + inTranslation, 1));
}

Mat44 Mat44::PreScaled(Vec3Arg inScale) const
{
    return Mat44(inScale.GetX() * mCol[0], inScale.GetY() * mCol[1], inScale.GetZ() * mCol[2], mCol[3]);
}

Mat44 Mat44::PostScaled(Vec3Arg inScale) const
{
    Vec4 scale(inScale, 1);
    return Mat44(scale * mCol[0], scale * mCol[1], scale * mCol[2], scale * mCol[3]);
}

Mat44 Mat44::Decompose(Vec3 &outScale) const
{
    // Start the modified Gram-Schmidt algorithm
    // X axis will just be normalized
    Vec3 x = GetAxisX();

    // Make Y axis perpendicular to X
    Vec3 y = GetAxisY();
    float x_dot_x = x.LengthSq();
    y -= (x.Dot(y) / x_dot_x) * x;

    // Make Z axis perpendicular to X
    Vec3 z = GetAxisZ();
    z -= (x.Dot(z) / x_dot_x) * x;

    // Make Z axis perpendicular to Y
    float y_dot_y = y.LengthSq();
    z -= (y.Dot(z) / y_dot_y) * y;

    // Determine the scale
    float z_dot_z = z.LengthSq();
    outScale = Vec3(x_dot_x, y_dot_y, z_dot_z).Sqrt();

    // If the resulting x, y and z vectors don't form a right-handed matrix, flip the z axis.
    if (x.Cross(y).Dot(z) < 0.0f)
        outScale.SetZ(-outScale.GetZ());

    // Determine the rotation and translation
    return Mat44(Vec4(x / outScale.GetX(), 0), Vec4(y / outScale.GetY(), 0), Vec4(z / outScale.GetZ(), 0), GetColumn4(3));
}
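
// Usage sketch: Decompose splits a shear-free affine transform into a rotation/translation matrix
// and a scale vector, so that, up to rounding:
//
//    Vec3 scale;
//    Mat44 rot_trans = m.Decompose(scale);
//    JPH_ASSERT(rot_trans.PreScaled(scale).IsClose(m));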

#undef JPH_EL

JPH_NAMESPACE_END