Mat44.inl

// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT

#pragma once

#include <Jolt/Math/Vec3.h>
#include <Jolt/Math/Vec4.h>
#include <Jolt/Math/Quat.h>

JPH_NAMESPACE_BEGIN

#define JPH_EL(r, c) mCol[c].mF32[r]

Mat44::Mat44(Vec4Arg inC1, Vec4Arg inC2, Vec4Arg inC3, Vec4Arg inC4) :
	mCol { inC1, inC2, inC3, inC4 }
{
}

Mat44::Mat44(Vec4Arg inC1, Vec4Arg inC2, Vec4Arg inC3, Vec3Arg inC4) :
	mCol { inC1, inC2, inC3, Vec4(inC4, 1.0f) }
{
}

Mat44::Mat44(Type inC1, Type inC2, Type inC3, Type inC4) :
	mCol { inC1, inC2, inC3, inC4 }
{
}

Mat44 Mat44::sZero()
{
	return Mat44(Vec4::sZero(), Vec4::sZero(), Vec4::sZero(), Vec4::sZero());
}

Mat44 Mat44::sIdentity()
{
	return Mat44(Vec4(1, 0, 0, 0), Vec4(0, 1, 0, 0), Vec4(0, 0, 1, 0), Vec4(0, 0, 0, 1));
}

Mat44 Mat44::sNaN()
{
	return Mat44(Vec4::sNaN(), Vec4::sNaN(), Vec4::sNaN(), Vec4::sNaN());
}

Mat44 Mat44::sLoadFloat4x4(const Float4 *inV)
{
	Mat44 result;
	for (int c = 0; c < 4; ++c)
		result.mCol[c] = Vec4::sLoadFloat4(inV + c);
	return result;
}

Mat44 Mat44::sLoadFloat4x4Aligned(const Float4 *inV)
{
	Mat44 result;
	for (int c = 0; c < 4; ++c)
		result.mCol[c] = Vec4::sLoadFloat4Aligned(inV + c);
	return result;
}

Mat44 Mat44::sRotationX(float inX)
{
	Vec4 sv, cv;
	Vec4::sReplicate(inX).SinCos(sv, cv);
	float s = sv.GetX(), c = cv.GetX();
	return Mat44(Vec4(1, 0, 0, 0), Vec4(0, c, s, 0), Vec4(0, -s, c, 0), Vec4(0, 0, 0, 1));
}

Mat44 Mat44::sRotationY(float inY)
{
	Vec4 sv, cv;
	Vec4::sReplicate(inY).SinCos(sv, cv);
	float s = sv.GetX(), c = cv.GetX();
	return Mat44(Vec4(c, 0, -s, 0), Vec4(0, 1, 0, 0), Vec4(s, 0, c, 0), Vec4(0, 0, 0, 1));
}

Mat44 Mat44::sRotationZ(float inZ)
{
	Vec4 sv, cv;
	Vec4::sReplicate(inZ).SinCos(sv, cv);
	float s = sv.GetX(), c = cv.GetX();
	return Mat44(Vec4(c, s, 0, 0), Vec4(-s, c, 0, 0), Vec4(0, 0, 1, 0), Vec4(0, 0, 0, 1));
}

Mat44 Mat44::sRotation(QuatArg inQuat)
{
	JPH_ASSERT(inQuat.IsNormalized());
	// See: https://en.wikipedia.org/wiki/Quaternions_and_spatial_rotation section 'Quaternion-derived rotation matrix'
#ifdef JPH_USE_SSE4_1
	__m128 xyzw = inQuat.mValue.mValue;
	__m128 two_xyzw = _mm_add_ps(xyzw, xyzw);
	__m128 yzxw = _mm_shuffle_ps(xyzw, xyzw, _MM_SHUFFLE(3, 0, 2, 1));
	__m128 two_yzxw = _mm_add_ps(yzxw, yzxw);
	__m128 zxyw = _mm_shuffle_ps(xyzw, xyzw, _MM_SHUFFLE(3, 1, 0, 2));
	__m128 two_zxyw = _mm_add_ps(zxyw, zxyw);
	__m128 wwww = _mm_shuffle_ps(xyzw, xyzw, _MM_SHUFFLE(3, 3, 3, 3));
	__m128 diagonal = _mm_sub_ps(_mm_sub_ps(_mm_set1_ps(1.0f), _mm_mul_ps(two_yzxw, yzxw)), _mm_mul_ps(two_zxyw, zxyw)); // (1 - 2 y^2 - 2 z^2, 1 - 2 x^2 - 2 z^2, 1 - 2 x^2 - 2 y^2, 1 - 4 w^2)
	__m128 plus = _mm_add_ps(_mm_mul_ps(two_xyzw, zxyw), _mm_mul_ps(two_yzxw, wwww)); // 2 * (xz + yw, xy + zw, yz + xw, ww)
	__m128 minus = _mm_sub_ps(_mm_mul_ps(two_yzxw, xyzw), _mm_mul_ps(two_zxyw, wwww)); // 2 * (xy - zw, yz - xw, xz - yw, 0)
	// Workaround for compiler changing _mm_sub_ps(_mm_mul_ps(...), ...) into a fused multiply sub instruction, resulting in w not being 0
	// There doesn't appear to be a reliable way to turn this off in Clang
	minus = _mm_insert_ps(minus, minus, 0b1000);
	__m128 col0 = _mm_blend_ps(_mm_blend_ps(plus, diagonal, 0b0001), minus, 0b1100); // (1 - 2 y^2 - 2 z^2, 2 xy + 2 zw, 2 xz - 2 yw, 0)
	__m128 col1 = _mm_blend_ps(_mm_blend_ps(diagonal, minus, 0b1001), plus, 0b0100); // (2 xy - 2 zw, 1 - 2 x^2 - 2 z^2, 2 yz + 2 xw, 0)
	__m128 col2 = _mm_blend_ps(_mm_blend_ps(minus, plus, 0b0001), diagonal, 0b0100); // (2 xz + 2 yw, 2 yz - 2 xw, 1 - 2 x^2 - 2 y^2, 0)
	__m128 col3 = _mm_set_ps(1, 0, 0, 0);
	return Mat44(col0, col1, col2, col3);
#else
	float x = inQuat.GetX();
	float y = inQuat.GetY();
	float z = inQuat.GetZ();
	float w = inQuat.GetW();
	float tx = x + x; // Note: Using x + x instead of 2.0f * x to force this function to return the same value as the SSE4.1 version across platforms.
	float ty = y + y;
	float tz = z + z;
	float xx = tx * x;
	float yy = ty * y;
	float zz = tz * z;
	float xy = tx * y;
	float xz = tx * z;
	float xw = tx * w;
	float yz = ty * z;
	float yw = ty * w;
	float zw = tz * w;
	return Mat44(Vec4((1.0f - yy) - zz, xy + zw, xz - yw, 0.0f), // Note: Added extra brackets to force this function to return the same value as the SSE4.1 version across platforms.
		Vec4(xy - zw, (1.0f - zz) - xx, yz + xw, 0.0f),
		Vec4(xz + yw, yz - xw, (1.0f - xx) - yy, 0.0f),
		Vec4(0.0f, 0.0f, 0.0f, 1.0f));
#endif
}

Mat44 Mat44::sRotation(Vec3Arg inAxis, float inAngle)
{
	return sRotation(Quat::sRotation(inAxis, inAngle));
}

Mat44 Mat44::sTranslation(Vec3Arg inV)
{
	return Mat44(Vec4(1, 0, 0, 0), Vec4(0, 1, 0, 0), Vec4(0, 0, 1, 0), Vec4(inV, 1));
}

Mat44 Mat44::sRotationTranslation(QuatArg inR, Vec3Arg inT)
{
	Mat44 m = sRotation(inR);
	m.SetTranslation(inT);
	return m;
}

Mat44 Mat44::sInverseRotationTranslation(QuatArg inR, Vec3Arg inT)
{
	Mat44 m = sRotation(inR.Conjugated());
	m.SetTranslation(-m.Multiply3x3(inT));
	return m;
}

Mat44 Mat44::sScale(float inScale)
{
	return Mat44(Vec4(inScale, 0, 0, 0), Vec4(0, inScale, 0, 0), Vec4(0, 0, inScale, 0), Vec4(0, 0, 0, 1));
}

Mat44 Mat44::sScale(Vec3Arg inV)
{
	return Mat44(Vec4(inV.GetX(), 0, 0, 0), Vec4(0, inV.GetY(), 0, 0), Vec4(0, 0, inV.GetZ(), 0), Vec4(0, 0, 0, 1));
}

Mat44 Mat44::sOuterProduct(Vec3Arg inV1, Vec3Arg inV2)
{
	Vec4 v1(inV1, 0);
	return Mat44(v1 * inV2.SplatX(), v1 * inV2.SplatY(), v1 * inV2.SplatZ(), Vec4(0, 0, 0, 1));
}

Mat44 Mat44::sCrossProduct(Vec3Arg inV)
{
#ifdef JPH_USE_SSE4_1
	// Zero out the W component
	__m128 zero = _mm_setzero_ps();
	__m128 v = _mm_blend_ps(inV.mValue, zero, 0b1000);
	// Negate
	__m128 min_v = _mm_sub_ps(zero, v);
	return Mat44(
		_mm_shuffle_ps(v, min_v, _MM_SHUFFLE(3, 1, 2, 3)), // [0, z, -y, 0]
		_mm_shuffle_ps(min_v, v, _MM_SHUFFLE(3, 0, 3, 2)), // [-z, 0, x, 0]
		_mm_blend_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 1)), _mm_shuffle_ps(min_v, min_v, _MM_SHUFFLE(3, 3, 0, 3)), 0b0010), // [y, -x, 0, 0]
		Vec4(0, 0, 0, 1));
#else
	float x = inV.GetX();
	float y = inV.GetY();
	float z = inV.GetZ();
	return Mat44(
		Vec4(0, z, -y, 0),
		Vec4(-z, 0, x, 0),
		Vec4(y, -x, 0, 0),
		Vec4(0, 0, 0, 1));
#endif
}

Mat44 Mat44::sLookAt(Vec3Arg inPos, Vec3Arg inTarget, Vec3Arg inUp)
{
	Vec3 direction = (inTarget - inPos).NormalizedOr(-Vec3::sAxisZ());
	Vec3 right = direction.Cross(inUp).NormalizedOr(Vec3::sAxisX());
	Vec3 up = right.Cross(direction);
	return Mat44(Vec4(right, 0), Vec4(up, 0), Vec4(-direction, 0), Vec4(inPos, 1)).InversedRotationTranslation();
}

bool Mat44::operator == (Mat44Arg inM2) const
{
	return UVec4::sAnd(
		UVec4::sAnd(Vec4::sEquals(mCol[0], inM2.mCol[0]), Vec4::sEquals(mCol[1], inM2.mCol[1])),
		UVec4::sAnd(Vec4::sEquals(mCol[2], inM2.mCol[2]), Vec4::sEquals(mCol[3], inM2.mCol[3]))
	).TestAllTrue();
}

bool Mat44::IsClose(Mat44Arg inM2, float inMaxDistSq) const
{
	for (int i = 0; i < 4; ++i)
		if (!mCol[i].IsClose(inM2.mCol[i], inMaxDistSq))
			return false;
	return true;
}

Mat44 Mat44::operator * (Mat44Arg inM) const
{
	Mat44 result;
#if defined(JPH_USE_SSE)
	for (int i = 0; i < 4; ++i)
	{
		__m128 c = inM.mCol[i].mValue;
		__m128 t = _mm_mul_ps(mCol[0].mValue, _mm_shuffle_ps(c, c, _MM_SHUFFLE(0, 0, 0, 0)));
		t = _mm_add_ps(t, _mm_mul_ps(mCol[1].mValue, _mm_shuffle_ps(c, c, _MM_SHUFFLE(1, 1, 1, 1))));
		t = _mm_add_ps(t, _mm_mul_ps(mCol[2].mValue, _mm_shuffle_ps(c, c, _MM_SHUFFLE(2, 2, 2, 2))));
		t = _mm_add_ps(t, _mm_mul_ps(mCol[3].mValue, _mm_shuffle_ps(c, c, _MM_SHUFFLE(3, 3, 3, 3))));
		result.mCol[i].mValue = t;
	}
#elif defined(JPH_USE_NEON)
	for (int i = 0; i < 4; ++i)
	{
		Type c = inM.mCol[i].mValue;
		Type t = vmulq_f32(mCol[0].mValue, vdupq_laneq_f32(c, 0));
		t = vmlaq_f32(t, mCol[1].mValue, vdupq_laneq_f32(c, 1));
		t = vmlaq_f32(t, mCol[2].mValue, vdupq_laneq_f32(c, 2));
		t = vmlaq_f32(t, mCol[3].mValue, vdupq_laneq_f32(c, 3));
		result.mCol[i].mValue = t;
	}
#else
	for (int i = 0; i < 4; ++i)
		result.mCol[i] = mCol[0] * inM.mCol[i].mF32[0] + mCol[1] * inM.mCol[i].mF32[1] + mCol[2] * inM.mCol[i].mF32[2] + mCol[3] * inM.mCol[i].mF32[3];
#endif
	return result;
}

Vec3 Mat44::operator * (Vec3Arg inV) const
{
#if defined(JPH_USE_SSE)
	__m128 t = _mm_mul_ps(mCol[0].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(0, 0, 0, 0)));
	t = _mm_add_ps(t, _mm_mul_ps(mCol[1].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(1, 1, 1, 1))));
	t = _mm_add_ps(t, _mm_mul_ps(mCol[2].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(2, 2, 2, 2))));
	t = _mm_add_ps(t, mCol[3].mValue);
	return Vec3::sFixW(t);
#elif defined(JPH_USE_NEON)
	Type t = vmulq_f32(mCol[0].mValue, vdupq_laneq_f32(inV.mValue, 0));
	t = vmlaq_f32(t, mCol[1].mValue, vdupq_laneq_f32(inV.mValue, 1));
	t = vmlaq_f32(t, mCol[2].mValue, vdupq_laneq_f32(inV.mValue, 2));
	t = vaddq_f32(t, mCol[3].mValue); // Don't combine this with the first mul into a fused multiply add, causes precision issues
	return Vec3::sFixW(t);
#else
	return Vec3(
		mCol[0].mF32[0] * inV.mF32[0] + mCol[1].mF32[0] * inV.mF32[1] + mCol[2].mF32[0] * inV.mF32[2] + mCol[3].mF32[0],
		mCol[0].mF32[1] * inV.mF32[0] + mCol[1].mF32[1] * inV.mF32[1] + mCol[2].mF32[1] * inV.mF32[2] + mCol[3].mF32[1],
		mCol[0].mF32[2] * inV.mF32[0] + mCol[1].mF32[2] * inV.mF32[1] + mCol[2].mF32[2] * inV.mF32[2] + mCol[3].mF32[2]);
#endif
}

Vec4 Mat44::operator * (Vec4Arg inV) const
{
#if defined(JPH_USE_SSE)
	__m128 t = _mm_mul_ps(mCol[0].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(0, 0, 0, 0)));
	t = _mm_add_ps(t, _mm_mul_ps(mCol[1].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(1, 1, 1, 1))));
	t = _mm_add_ps(t, _mm_mul_ps(mCol[2].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(2, 2, 2, 2))));
	t = _mm_add_ps(t, _mm_mul_ps(mCol[3].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(3, 3, 3, 3))));
	return t;
#elif defined(JPH_USE_NEON)
	Type t = vmulq_f32(mCol[0].mValue, vdupq_laneq_f32(inV.mValue, 0));
	t = vmlaq_f32(t, mCol[1].mValue, vdupq_laneq_f32(inV.mValue, 1));
	t = vmlaq_f32(t, mCol[2].mValue, vdupq_laneq_f32(inV.mValue, 2));
	t = vmlaq_f32(t, mCol[3].mValue, vdupq_laneq_f32(inV.mValue, 3));
	return t;
#else
	return Vec4(
		mCol[0].mF32[0] * inV.mF32[0] + mCol[1].mF32[0] * inV.mF32[1] + mCol[2].mF32[0] * inV.mF32[2] + mCol[3].mF32[0] * inV.mF32[3],
		mCol[0].mF32[1] * inV.mF32[0] + mCol[1].mF32[1] * inV.mF32[1] + mCol[2].mF32[1] * inV.mF32[2] + mCol[3].mF32[1] * inV.mF32[3],
		mCol[0].mF32[2] * inV.mF32[0] + mCol[1].mF32[2] * inV.mF32[1] + mCol[2].mF32[2] * inV.mF32[2] + mCol[3].mF32[2] * inV.mF32[3],
		mCol[0].mF32[3] * inV.mF32[0] + mCol[1].mF32[3] * inV.mF32[1] + mCol[2].mF32[3] * inV.mF32[2] + mCol[3].mF32[3] * inV.mF32[3]);
#endif
}

Vec3 Mat44::Multiply3x3(Vec3Arg inV) const
{
#if defined(JPH_USE_SSE)
	__m128 t = _mm_mul_ps(mCol[0].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(0, 0, 0, 0)));
	t = _mm_add_ps(t, _mm_mul_ps(mCol[1].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(1, 1, 1, 1))));
	t = _mm_add_ps(t, _mm_mul_ps(mCol[2].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(2, 2, 2, 2))));
	return Vec3::sFixW(t);
#elif defined(JPH_USE_NEON)
	Type t = vmulq_f32(mCol[0].mValue, vdupq_laneq_f32(inV.mValue, 0));
	t = vmlaq_f32(t, mCol[1].mValue, vdupq_laneq_f32(inV.mValue, 1));
	t = vmlaq_f32(t, mCol[2].mValue, vdupq_laneq_f32(inV.mValue, 2));
	return Vec3::sFixW(t);
#else
	return Vec3(
		mCol[0].mF32[0] * inV.mF32[0] + mCol[1].mF32[0] * inV.mF32[1] + mCol[2].mF32[0] * inV.mF32[2],
		mCol[0].mF32[1] * inV.mF32[0] + mCol[1].mF32[1] * inV.mF32[1] + mCol[2].mF32[1] * inV.mF32[2],
		mCol[0].mF32[2] * inV.mF32[0] + mCol[1].mF32[2] * inV.mF32[1] + mCol[2].mF32[2] * inV.mF32[2]);
#endif
}

Vec3 Mat44::Multiply3x3Transposed(Vec3Arg inV) const
{
#if defined(JPH_USE_SSE4_1)
	__m128 x = _mm_dp_ps(mCol[0].mValue, inV.mValue, 0x7f);
	__m128 y = _mm_dp_ps(mCol[1].mValue, inV.mValue, 0x7f);
	__m128 xy = _mm_blend_ps(x, y, 0b0010);
	__m128 z = _mm_dp_ps(mCol[2].mValue, inV.mValue, 0x7f);
	__m128 xyzz = _mm_blend_ps(xy, z, 0b1100);
	return xyzz;
#else
	return Transposed3x3().Multiply3x3(inV);
#endif
}

Mat44 Mat44::Multiply3x3(Mat44Arg inM) const
{
	JPH_ASSERT(mCol[0][3] == 0.0f);
	JPH_ASSERT(mCol[1][3] == 0.0f);
	JPH_ASSERT(mCol[2][3] == 0.0f);
	Mat44 result;
#if defined(JPH_USE_SSE)
	for (int i = 0; i < 3; ++i)
	{
		__m128 c = inM.mCol[i].mValue;
		__m128 t = _mm_mul_ps(mCol[0].mValue, _mm_shuffle_ps(c, c, _MM_SHUFFLE(0, 0, 0, 0)));
		t = _mm_add_ps(t, _mm_mul_ps(mCol[1].mValue, _mm_shuffle_ps(c, c, _MM_SHUFFLE(1, 1, 1, 1))));
		t = _mm_add_ps(t, _mm_mul_ps(mCol[2].mValue, _mm_shuffle_ps(c, c, _MM_SHUFFLE(2, 2, 2, 2))));
		result.mCol[i].mValue = t;
	}
#elif defined(JPH_USE_NEON)
	for (int i = 0; i < 3; ++i)
	{
		Type c = inM.mCol[i].mValue;
		Type t = vmulq_f32(mCol[0].mValue, vdupq_laneq_f32(c, 0));
		t = vmlaq_f32(t, mCol[1].mValue, vdupq_laneq_f32(c, 1));
		t = vmlaq_f32(t, mCol[2].mValue, vdupq_laneq_f32(c, 2));
		result.mCol[i].mValue = t;
	}
#else
	for (int i = 0; i < 3; ++i)
		result.mCol[i] = mCol[0] * inM.mCol[i].mF32[0] + mCol[1] * inM.mCol[i].mF32[1] + mCol[2] * inM.mCol[i].mF32[2];
#endif
	result.mCol[3] = Vec4(0, 0, 0, 1);
	return result;
}

Mat44 Mat44::Multiply3x3LeftTransposed(Mat44Arg inM) const
{
	// Transpose left hand side
	Mat44 trans = Transposed3x3();
	// Do 3x3 matrix multiply
	Mat44 result;
	result.mCol[0] = trans.mCol[0] * inM.mCol[0].SplatX() + trans.mCol[1] * inM.mCol[0].SplatY() + trans.mCol[2] * inM.mCol[0].SplatZ();
	result.mCol[1] = trans.mCol[0] * inM.mCol[1].SplatX() + trans.mCol[1] * inM.mCol[1].SplatY() + trans.mCol[2] * inM.mCol[1].SplatZ();
	result.mCol[2] = trans.mCol[0] * inM.mCol[2].SplatX() + trans.mCol[1] * inM.mCol[2].SplatY() + trans.mCol[2] * inM.mCol[2].SplatZ();
	result.mCol[3] = Vec4(0, 0, 0, 1);
	return result;
}

Mat44 Mat44::Multiply3x3RightTransposed(Mat44Arg inM) const
{
	JPH_ASSERT(mCol[0][3] == 0.0f);
	JPH_ASSERT(mCol[1][3] == 0.0f);
	JPH_ASSERT(mCol[2][3] == 0.0f);
	Mat44 result;
	result.mCol[0] = mCol[0] * inM.mCol[0].SplatX() + mCol[1] * inM.mCol[1].SplatX() + mCol[2] * inM.mCol[2].SplatX();
	result.mCol[1] = mCol[0] * inM.mCol[0].SplatY() + mCol[1] * inM.mCol[1].SplatY() + mCol[2] * inM.mCol[2].SplatY();
	result.mCol[2] = mCol[0] * inM.mCol[0].SplatZ() + mCol[1] * inM.mCol[1].SplatZ() + mCol[2] * inM.mCol[2].SplatZ();
	result.mCol[3] = Vec4(0, 0, 0, 1);
	return result;
}

Mat44 Mat44::operator * (float inV) const
{
	Vec4 multiplier = Vec4::sReplicate(inV);
	Mat44 result;
	for (int c = 0; c < 4; ++c)
		result.mCol[c] = mCol[c] * multiplier;
	return result;
}

Mat44 &Mat44::operator *= (float inV)
{
	for (int c = 0; c < 4; ++c)
		mCol[c] *= inV;
	return *this;
}

Mat44 Mat44::operator + (Mat44Arg inM) const
{
	Mat44 result;
	for (int i = 0; i < 4; ++i)
		result.mCol[i] = mCol[i] + inM.mCol[i];
	return result;
}

Mat44 Mat44::operator - () const
{
	Mat44 result;
	for (int i = 0; i < 4; ++i)
		result.mCol[i] = -mCol[i];
	return result;
}

Mat44 Mat44::operator - (Mat44Arg inM) const
{
	Mat44 result;
	for (int i = 0; i < 4; ++i)
		result.mCol[i] = mCol[i] - inM.mCol[i];
	return result;
}

Mat44 &Mat44::operator += (Mat44Arg inM)
{
	for (int c = 0; c < 4; ++c)
		mCol[c] += inM.mCol[c];
	return *this;
}

void Mat44::StoreFloat4x4(Float4 *outV) const
{
	for (int c = 0; c < 4; ++c)
		mCol[c].StoreFloat4(outV + c);
}

Mat44 Mat44::Transposed() const
{
#if defined(JPH_USE_SSE)
	__m128 tmp1 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(1, 0, 1, 0));
	__m128 tmp3 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(3, 2, 3, 2));
	__m128 tmp2 = _mm_shuffle_ps(mCol[2].mValue, mCol[3].mValue, _MM_SHUFFLE(1, 0, 1, 0));
	__m128 tmp4 = _mm_shuffle_ps(mCol[2].mValue, mCol[3].mValue, _MM_SHUFFLE(3, 2, 3, 2));
	Mat44 result;
	result.mCol[0].mValue = _mm_shuffle_ps(tmp1, tmp2, _MM_SHUFFLE(2, 0, 2, 0));
	result.mCol[1].mValue = _mm_shuffle_ps(tmp1, tmp2, _MM_SHUFFLE(3, 1, 3, 1));
	result.mCol[2].mValue = _mm_shuffle_ps(tmp3, tmp4, _MM_SHUFFLE(2, 0, 2, 0));
	result.mCol[3].mValue = _mm_shuffle_ps(tmp3, tmp4, _MM_SHUFFLE(3, 1, 3, 1));
	return result;
#elif defined(JPH_USE_NEON)
	float32x4x2_t tmp1 = vzipq_f32(mCol[0].mValue, mCol[2].mValue);
	float32x4x2_t tmp2 = vzipq_f32(mCol[1].mValue, mCol[3].mValue);
	float32x4x2_t tmp3 = vzipq_f32(tmp1.val[0], tmp2.val[0]);
	float32x4x2_t tmp4 = vzipq_f32(tmp1.val[1], tmp2.val[1]);
	Mat44 result;
	result.mCol[0].mValue = tmp3.val[0];
	result.mCol[1].mValue = tmp3.val[1];
	result.mCol[2].mValue = tmp4.val[0];
	result.mCol[3].mValue = tmp4.val[1];
	return result;
#else
	Mat44 result;
	for (int c = 0; c < 4; ++c)
		for (int r = 0; r < 4; ++r)
			result.mCol[r].mF32[c] = mCol[c].mF32[r];
	return result;
#endif
}

Mat44 Mat44::Transposed3x3() const
{
#if defined(JPH_USE_SSE)
	__m128 zero = _mm_setzero_ps();
	__m128 tmp1 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(1, 0, 1, 0));
	__m128 tmp3 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(3, 2, 3, 2));
	__m128 tmp2 = _mm_shuffle_ps(mCol[2].mValue, zero, _MM_SHUFFLE(1, 0, 1, 0));
	__m128 tmp4 = _mm_shuffle_ps(mCol[2].mValue, zero, _MM_SHUFFLE(3, 2, 3, 2));
	Mat44 result;
	result.mCol[0].mValue = _mm_shuffle_ps(tmp1, tmp2, _MM_SHUFFLE(2, 0, 2, 0));
	result.mCol[1].mValue = _mm_shuffle_ps(tmp1, tmp2, _MM_SHUFFLE(3, 1, 3, 1));
	result.mCol[2].mValue = _mm_shuffle_ps(tmp3, tmp4, _MM_SHUFFLE(2, 0, 2, 0));
#elif defined(JPH_USE_NEON)
	float32x4x2_t tmp1 = vzipq_f32(mCol[0].mValue, mCol[2].mValue);
	float32x4x2_t tmp2 = vzipq_f32(mCol[1].mValue, vdupq_n_f32(0));
	float32x4x2_t tmp3 = vzipq_f32(tmp1.val[0], tmp2.val[0]);
	float32x4x2_t tmp4 = vzipq_f32(tmp1.val[1], tmp2.val[1]);
	Mat44 result;
	result.mCol[0].mValue = tmp3.val[0];
	result.mCol[1].mValue = tmp3.val[1];
	result.mCol[2].mValue = tmp4.val[0];
#else
	Mat44 result;
	for (int c = 0; c < 3; ++c)
	{
		for (int r = 0; r < 3; ++r)
			result.mCol[c].mF32[r] = mCol[r].mF32[c];
		result.mCol[c].mF32[3] = 0;
	}
#endif
	result.mCol[3] = Vec4(0, 0, 0, 1);
	return result;
}

Mat44 Mat44::Inversed() const
{
#if defined(JPH_USE_SSE)
	// Algorithm from: http://download.intel.com/design/PentiumIII/sml/24504301.pdf
	// Streaming SIMD Extensions - Inverse of 4x4 Matrix
	// Adapted to load data using _mm_shuffle_ps instead of loading from memory
	// Replaced _mm_rcp_ps with _mm_div_ps for better accuracy
	__m128 tmp1 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(1, 0, 1, 0));
	__m128 row1 = _mm_shuffle_ps(mCol[2].mValue, mCol[3].mValue, _MM_SHUFFLE(1, 0, 1, 0));
	__m128 row0 = _mm_shuffle_ps(tmp1, row1, _MM_SHUFFLE(2, 0, 2, 0));
	row1 = _mm_shuffle_ps(row1, tmp1, _MM_SHUFFLE(3, 1, 3, 1));
	tmp1 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(3, 2, 3, 2));
	__m128 row3 = _mm_shuffle_ps(mCol[2].mValue, mCol[3].mValue, _MM_SHUFFLE(3, 2, 3, 2));
	__m128 row2 = _mm_shuffle_ps(tmp1, row3, _MM_SHUFFLE(2, 0, 2, 0));
	row3 = _mm_shuffle_ps(row3, tmp1, _MM_SHUFFLE(3, 1, 3, 1));
	tmp1 = _mm_mul_ps(row2, row3);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
	__m128 minor0 = _mm_mul_ps(row1, tmp1);
	__m128 minor1 = _mm_mul_ps(row0, tmp1);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
	minor0 = _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0);
	minor1 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1);
	minor1 = _mm_shuffle_ps(minor1, minor1, _MM_SHUFFLE(1, 0, 3, 2));
	tmp1 = _mm_mul_ps(row1, row2);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
	minor0 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0);
	__m128 minor3 = _mm_mul_ps(row0, tmp1);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
	minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1));
	minor3 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor3);
	minor3 = _mm_shuffle_ps(minor3, minor3, _MM_SHUFFLE(1, 0, 3, 2));
	tmp1 = _mm_mul_ps(_mm_shuffle_ps(row1, row1, _MM_SHUFFLE(1, 0, 3, 2)), row3);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
	row2 = _mm_shuffle_ps(row2, row2, _MM_SHUFFLE(1, 0, 3, 2));
	minor0 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0);
	__m128 minor2 = _mm_mul_ps(row0, tmp1);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
	minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1));
	minor2 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2);
	minor2 = _mm_shuffle_ps(minor2, minor2, _MM_SHUFFLE(1, 0, 3, 2));
	tmp1 = _mm_mul_ps(row0, row1);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
	minor2 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2);
	minor3 = _mm_sub_ps(_mm_mul_ps(row2, tmp1), minor3);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
	minor2 = _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2);
	minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row2, tmp1));
	tmp1 = _mm_mul_ps(row0, row3);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
	minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1));
	minor2 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
	minor1 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1);
	minor2 = _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1));
	tmp1 = _mm_mul_ps(row0, row2);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
	minor1 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1);
	minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row1, tmp1));
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
	minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1));
	minor3 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor3);
	__m128 det = _mm_mul_ps(row0, minor0);
	det = _mm_add_ps(_mm_shuffle_ps(det, det, _MM_SHUFFLE(2, 3, 0, 1)), det); // Original code did (x + z) + (y + w), changed to (x + y) + (z + w) to match the ARM code below and make the result cross platform deterministic
	det = _mm_add_ss(_mm_shuffle_ps(det, det, _MM_SHUFFLE(1, 0, 3, 2)), det);
	det = _mm_div_ss(_mm_set_ss(1.0f), det);
	det = _mm_shuffle_ps(det, det, _MM_SHUFFLE(0, 0, 0, 0));
	Mat44 result;
	result.mCol[0].mValue = _mm_mul_ps(det, minor0);
	result.mCol[1].mValue = _mm_mul_ps(det, minor1);
	result.mCol[2].mValue = _mm_mul_ps(det, minor2);
	result.mCol[3].mValue = _mm_mul_ps(det, minor3);
	return result;
#elif defined(JPH_USE_NEON)
	// Adapted from the SSE version, there are surprisingly few articles about efficient ways of calculating an inverse for ARM on the internet
	Type tmp1 = JPH_NEON_SHUFFLE_F32x4(mCol[0].mValue, mCol[1].mValue, 0, 1, 4, 5);
	Type row1 = JPH_NEON_SHUFFLE_F32x4(mCol[2].mValue, mCol[3].mValue, 0, 1, 4, 5);
	Type row0 = JPH_NEON_SHUFFLE_F32x4(tmp1, row1, 0, 2, 4, 6);
	row1 = JPH_NEON_SHUFFLE_F32x4(row1, tmp1, 1, 3, 5, 7);
	tmp1 = JPH_NEON_SHUFFLE_F32x4(mCol[0].mValue, mCol[1].mValue, 2, 3, 6, 7);
	Type row3 = JPH_NEON_SHUFFLE_F32x4(mCol[2].mValue, mCol[3].mValue, 2, 3, 6, 7);
	Type row2 = JPH_NEON_SHUFFLE_F32x4(tmp1, row3, 0, 2, 4, 6);
	row3 = JPH_NEON_SHUFFLE_F32x4(row3, tmp1, 1, 3, 5, 7);
	tmp1 = vmulq_f32(row2, row3);
	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
	Type minor0 = vmulq_f32(row1, tmp1);
	Type minor1 = vmulq_f32(row0, tmp1);
	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
	minor0 = vsubq_f32(vmulq_f32(row1, tmp1), minor0);
	minor1 = vsubq_f32(vmulq_f32(row0, tmp1), minor1);
	minor1 = JPH_NEON_SHUFFLE_F32x4(minor1, minor1, 2, 3, 0, 1);
	tmp1 = vmulq_f32(row1, row2);
	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
	minor0 = vaddq_f32(vmulq_f32(row3, tmp1), minor0);
	Type minor3 = vmulq_f32(row0, tmp1);
	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
	minor0 = vsubq_f32(minor0, vmulq_f32(row3, tmp1));
	minor3 = vsubq_f32(vmulq_f32(row0, tmp1), minor3);
	minor3 = JPH_NEON_SHUFFLE_F32x4(minor3, minor3, 2, 3, 0, 1);
	tmp1 = JPH_NEON_SHUFFLE_F32x4(row1, row1, 2, 3, 0, 1);
	tmp1 = vmulq_f32(tmp1, row3);
	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
	row2 = JPH_NEON_SHUFFLE_F32x4(row2, row2, 2, 3, 0, 1);
	minor0 = vaddq_f32(vmulq_f32(row2, tmp1), minor0);
	Type minor2 = vmulq_f32(row0, tmp1);
	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
	minor0 = vsubq_f32(minor0, vmulq_f32(row2, tmp1));
	minor2 = vsubq_f32(vmulq_f32(row0, tmp1), minor2);
	minor2 = JPH_NEON_SHUFFLE_F32x4(minor2, minor2, 2, 3, 0, 1);
	tmp1 = vmulq_f32(row0, row1);
	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
	minor2 = vaddq_f32(vmulq_f32(row3, tmp1), minor2);
	minor3 = vsubq_f32(vmulq_f32(row2, tmp1), minor3);
	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
	minor2 = vsubq_f32(vmulq_f32(row3, tmp1), minor2);
	minor3 = vsubq_f32(minor3, vmulq_f32(row2, tmp1));
	tmp1 = vmulq_f32(row0, row3);
	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
	minor1 = vsubq_f32(minor1, vmulq_f32(row2, tmp1));
	minor2 = vaddq_f32(vmulq_f32(row1, tmp1), minor2);
	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
	minor1 = vaddq_f32(vmulq_f32(row2, tmp1), minor1);
	minor2 = vsubq_f32(minor2, vmulq_f32(row1, tmp1));
	tmp1 = vmulq_f32(row0, row2);
	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
	minor1 = vaddq_f32(vmulq_f32(row3, tmp1), minor1);
	minor3 = vsubq_f32(minor3, vmulq_f32(row1, tmp1));
	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
	minor1 = vsubq_f32(minor1, vmulq_f32(row3, tmp1));
	minor3 = vaddq_f32(vmulq_f32(row1, tmp1), minor3);
	Type det = vmulq_f32(row0, minor0);
	det = vdupq_n_f32(vaddvq_f32(det));
	det = vdivq_f32(vdupq_n_f32(1.0f), det);
	Mat44 result;
	result.mCol[0].mValue = vmulq_f32(det, minor0);
	result.mCol[1].mValue = vmulq_f32(det, minor1);
	result.mCol[2].mValue = vmulq_f32(det, minor2);
	result.mCol[3].mValue = vmulq_f32(det, minor3);
	return result;
#else
	float m00 = JPH_EL(0, 0), m10 = JPH_EL(1, 0), m20 = JPH_EL(2, 0), m30 = JPH_EL(3, 0);
	float m01 = JPH_EL(0, 1), m11 = JPH_EL(1, 1), m21 = JPH_EL(2, 1), m31 = JPH_EL(3, 1);
	float m02 = JPH_EL(0, 2), m12 = JPH_EL(1, 2), m22 = JPH_EL(2, 2), m32 = JPH_EL(3, 2);
	float m03 = JPH_EL(0, 3), m13 = JPH_EL(1, 3), m23 = JPH_EL(2, 3), m33 = JPH_EL(3, 3);
	float m10211120 = m10 * m21 - m11 * m20;
	float m10221220 = m10 * m22 - m12 * m20;
	float m10231320 = m10 * m23 - m13 * m20;
	float m10311130 = m10 * m31 - m11 * m30;
	float m10321230 = m10 * m32 - m12 * m30;
	float m10331330 = m10 * m33 - m13 * m30;
	float m11221221 = m11 * m22 - m12 * m21;
	float m11231321 = m11 * m23 - m13 * m21;
	float m11321231 = m11 * m32 - m12 * m31;
	float m11331331 = m11 * m33 - m13 * m31;
	float m12231322 = m12 * m23 - m13 * m22;
	float m12331332 = m12 * m33 - m13 * m32;
	float m20312130 = m20 * m31 - m21 * m30;
	float m20322230 = m20 * m32 - m22 * m30;
	float m20332330 = m20 * m33 - m23 * m30;
	float m21322231 = m21 * m32 - m22 * m31;
	float m21332331 = m21 * m33 - m23 * m31;
	float m22332332 = m22 * m33 - m23 * m32;
	Vec4 col0(m11 * m22332332 - m12 * m21332331 + m13 * m21322231, -m10 * m22332332 + m12 * m20332330 - m13 * m20322230, m10 * m21332331 - m11 * m20332330 + m13 * m20312130, -m10 * m21322231 + m11 * m20322230 - m12 * m20312130);
	Vec4 col1(-m01 * m22332332 + m02 * m21332331 - m03 * m21322231, m00 * m22332332 - m02 * m20332330 + m03 * m20322230, -m00 * m21332331 + m01 * m20332330 - m03 * m20312130, m00 * m21322231 - m01 * m20322230 + m02 * m20312130);
	Vec4 col2(m01 * m12331332 - m02 * m11331331 + m03 * m11321231, -m00 * m12331332 + m02 * m10331330 - m03 * m10321230, m00 * m11331331 - m01 * m10331330 + m03 * m10311130, -m00 * m11321231 + m01 * m10321230 - m02 * m10311130);
	Vec4 col3(-m01 * m12231322 + m02 * m11231321 - m03 * m11221221, m00 * m12231322 - m02 * m10231320 + m03 * m10221220, -m00 * m11231321 + m01 * m10231320 - m03 * m10211120, m00 * m11221221 - m01 * m10221220 + m02 * m10211120);
	float det = m00 * col0.mF32[0] + m01 * col0.mF32[1] + m02 * col0.mF32[2] + m03 * col0.mF32[3];
	return Mat44(col0 / det, col1 / det, col2 / det, col3 / det);
#endif
}

Mat44 Mat44::InversedRotationTranslation() const
{
	Mat44 m = Transposed3x3();
	m.SetTranslation(-m.Multiply3x3(GetTranslation()));
	return m;
}

float Mat44::GetDeterminant3x3() const
{
	return GetAxisX().Dot(GetAxisY().Cross(GetAxisZ()));
}

Mat44 Mat44::Adjointed3x3() const
{
	// Adapted from Inversed() to remove 4th column and the division by the determinant
	// Note: This can be optimized.
	JPH_ASSERT(mCol[0][3] == 0.0f);
	JPH_ASSERT(mCol[1][3] == 0.0f);
	JPH_ASSERT(mCol[2][3] == 0.0f);
#if defined(JPH_USE_SSE)
	__m128 tmp1 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(1, 0, 1, 0));
	__m128 row1 = _mm_shuffle_ps(mCol[2].mValue, _mm_setzero_ps(), _MM_SHUFFLE(1, 0, 1, 0));
	__m128 row0 = _mm_shuffle_ps(tmp1, row1, _MM_SHUFFLE(2, 0, 2, 0));
	row1 = _mm_shuffle_ps(row1, tmp1, _MM_SHUFFLE(3, 1, 3, 1));
	tmp1 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(3, 2, 3, 2));
	__m128 row3 = _mm_shuffle_ps(mCol[2].mValue, _mm_set_ps(1, 0, 0, 0), _MM_SHUFFLE(3, 2, 3, 2));
	__m128 row2 = _mm_shuffle_ps(tmp1, row3, _MM_SHUFFLE(2, 0, 2, 0));
	row3 = _mm_shuffle_ps(row3, tmp1, _MM_SHUFFLE(3, 1, 3, 1));
	tmp1 = _mm_mul_ps(row2, row3);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
	__m128 minor0 = _mm_mul_ps(row1, tmp1);
	__m128 minor1 = _mm_mul_ps(row0, tmp1);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
	minor0 = _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0);
	minor1 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1);
	minor1 = _mm_shuffle_ps(minor1, minor1, _MM_SHUFFLE(1, 0, 3, 2));
	tmp1 = _mm_mul_ps(row1, row2);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
	minor0 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
	minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1));
	tmp1 = _mm_mul_ps(_mm_shuffle_ps(row1, row1, _MM_SHUFFLE(1, 0, 3, 2)), row3);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
	row2 = _mm_shuffle_ps(row2, row2, _MM_SHUFFLE(1, 0, 3, 2));
	minor0 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0);
	__m128 minor2 = _mm_mul_ps(row0, tmp1);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
	minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1));
	minor2 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2);
	minor2 = _mm_shuffle_ps(minor2, minor2, _MM_SHUFFLE(1, 0, 3, 2));
	tmp1 = _mm_mul_ps(row0, row1);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
	minor2 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
	minor2 = _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2);
	tmp1 = _mm_mul_ps(row0, row3);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
	minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1));
	minor2 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
	minor1 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1);
	minor2 = _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1));
	tmp1 = _mm_mul_ps(row0, row2);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
	minor1 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
	minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1));
	Mat44 result;
	result.mCol[0].mValue = minor0;
	result.mCol[1].mValue = minor1;
	result.mCol[2].mValue = minor2;
	result.mCol[3] = Vec4(0, 0, 0, 1);
	return result;
#elif defined(JPH_USE_NEON)
	Type v0001 = vsetq_lane_f32(1, vdupq_n_f32(0), 3);
	Type tmp1 = JPH_NEON_SHUFFLE_F32x4(mCol[0].mValue, mCol[1].mValue, 0, 1, 4, 5);
	Type row1 = JPH_NEON_SHUFFLE_F32x4(mCol[2].mValue, v0001, 0, 1, 4, 5);
	Type row0 = JPH_NEON_SHUFFLE_F32x4(tmp1, row1, 0, 2, 4, 6);
	row1 = JPH_NEON_SHUFFLE_F32x4(row1, tmp1, 1, 3, 5, 7);
	tmp1 = JPH_NEON_SHUFFLE_F32x4(mCol[0].mValue, mCol[1].mValue, 2, 3, 6, 7);
	Type row3 = JPH_NEON_SHUFFLE_F32x4(mCol[2].mValue, v0001, 2, 3, 6, 7);
	Type row2 = JPH_NEON_SHUFFLE_F32x4(tmp1, row3, 0, 2, 4, 6);
	row3 = JPH_NEON_SHUFFLE_F32x4(row3, tmp1, 1, 3, 5, 7);
	tmp1 = vmulq_f32(row2, row3);
	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
	Type minor0 = vmulq_f32(row1, tmp1);
	Type minor1 = vmulq_f32(row0, tmp1);
	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
	minor0 = vsubq_f32(vmulq_f32(row1, tmp1), minor0);
	minor1 = vsubq_f32(vmulq_f32(row0, tmp1), minor1);
	minor1 = JPH_NEON_SHUFFLE_F32x4(minor1, minor1, 2, 3, 0, 1);
	tmp1 = vmulq_f32(row1, row2);
	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
	minor0 = vaddq_f32(vmulq_f32(row3, tmp1), minor0);
	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
	minor0 = vsubq_f32(minor0, vmulq_f32(row3, tmp1));
	tmp1 = JPH_NEON_SHUFFLE_F32x4(row1, row1, 2, 3, 0, 1);
	tmp1 = vmulq_f32(tmp1, row3);
	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
	row2 = JPH_NEON_SHUFFLE_F32x4(row2, row2, 2, 3, 0, 1);
	minor0 = vaddq_f32(vmulq_f32(row2, tmp1), minor0);
	Type minor2 = vmulq_f32(row0, tmp1);
	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
	minor0 = vsubq_f32(minor0, vmulq_f32(row2, tmp1));
	minor2 = vsubq_f32(vmulq_f32(row0, tmp1), minor2);
	minor2 = JPH_NEON_SHUFFLE_F32x4(minor2, minor2, 2, 3, 0, 1);
	tmp1 = vmulq_f32(row0, row1);
	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
	minor2 = vaddq_f32(vmulq_f32(row3, tmp1), minor2);
	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
	minor2 = vsubq_f32(vmulq_f32(row3, tmp1), minor2);
	tmp1 = vmulq_f32(row0, row3);
	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
	minor1 = vsubq_f32(minor1, vmulq_f32(row2, tmp1));
	minor2 = vaddq_f32(vmulq_f32(row1, tmp1), minor2);
	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
	minor1 = vaddq_f32(vmulq_f32(row2, tmp1), minor1);
	minor2 = vsubq_f32(minor2, vmulq_f32(row1, tmp1));
	tmp1 = vmulq_f32(row0, row2);
	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
	minor1 = vaddq_f32(vmulq_f32(row3, tmp1), minor1);
	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
	minor1 = vsubq_f32(minor1, vmulq_f32(row3, tmp1));
	Mat44 result;
	result.mCol[0].mValue = minor0;
	result.mCol[1].mValue = minor1;
	result.mCol[2].mValue = minor2;
	result.mCol[3].mValue = v0001;
	return result;
#else
	return Mat44(
		Vec4(JPH_EL(1, 1) * JPH_EL(2, 2) - JPH_EL(1, 2) * JPH_EL(2, 1),
			JPH_EL(1, 2) * JPH_EL(2, 0) - JPH_EL(1, 0) * JPH_EL(2, 2),
			JPH_EL(1, 0) * JPH_EL(2, 1) - JPH_EL(1, 1) * JPH_EL(2, 0),
			0),
		Vec4(JPH_EL(0, 2) * JPH_EL(2, 1) - JPH_EL(0, 1) * JPH_EL(2, 2),
			JPH_EL(0, 0) * JPH_EL(2, 2) - JPH_EL(0, 2) * JPH_EL(2, 0),
			JPH_EL(0, 1) * JPH_EL(2, 0) - JPH_EL(0, 0) * JPH_EL(2, 1),
			0),
		Vec4(JPH_EL(0, 1) * JPH_EL(1, 2) - JPH_EL(0, 2) * JPH_EL(1, 1),
			JPH_EL(0, 2) * JPH_EL(1, 0) - JPH_EL(0, 0) * JPH_EL(1, 2),
			JPH_EL(0, 0) * JPH_EL(1, 1) - JPH_EL(0, 1) * JPH_EL(1, 0),
			0),
		Vec4(0, 0, 0, 1));
#endif
}

Mat44 Mat44::Inversed3x3() const
{
	// Adapted from Inversed() to remove 4th column
	// Note: This can be optimized.
	JPH_ASSERT(mCol[0][3] == 0.0f);
	JPH_ASSERT(mCol[1][3] == 0.0f);
	JPH_ASSERT(mCol[2][3] == 0.0f);
#if defined(JPH_USE_SSE)
	__m128 tmp1 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(1, 0, 1, 0));
	__m128 row1 = _mm_shuffle_ps(mCol[2].mValue, _mm_setzero_ps(), _MM_SHUFFLE(1, 0, 1, 0));
	__m128 row0 = _mm_shuffle_ps(tmp1, row1, _MM_SHUFFLE(2, 0, 2, 0));
	row1 = _mm_shuffle_ps(row1, tmp1, _MM_SHUFFLE(3, 1, 3, 1));
	tmp1 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(3, 2, 3, 2));
	__m128 row3 = _mm_shuffle_ps(mCol[2].mValue, _mm_set_ps(1, 0, 0, 0), _MM_SHUFFLE(3, 2, 3, 2));
	__m128 row2 = _mm_shuffle_ps(tmp1, row3, _MM_SHUFFLE(2, 0, 2, 0));
	row3 = _mm_shuffle_ps(row3, tmp1, _MM_SHUFFLE(3, 1, 3, 1));
	tmp1 = _mm_mul_ps(row2, row3);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
	__m128 minor0 = _mm_mul_ps(row1, tmp1);
	__m128 minor1 = _mm_mul_ps(row0, tmp1);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
	minor0 = _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0);
	minor1 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1);
	minor1 = _mm_shuffle_ps(minor1, minor1, _MM_SHUFFLE(1, 0, 3, 2));
	tmp1 = _mm_mul_ps(row1, row2);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
	minor0 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
	minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1));
	tmp1 = _mm_mul_ps(_mm_shuffle_ps(row1, row1, _MM_SHUFFLE(1, 0, 3, 2)), row3);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
	row2 = _mm_shuffle_ps(row2, row2, _MM_SHUFFLE(1, 0, 3, 2));
	minor0 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0);
	__m128 minor2 = _mm_mul_ps(row0, tmp1);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
	minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1));
	minor2 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2);
	minor2 = _mm_shuffle_ps(minor2, minor2, _MM_SHUFFLE(1, 0, 3, 2));
	tmp1 = _mm_mul_ps(row0, row1);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
	minor2 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
	minor2 = _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2);
	tmp1 = _mm_mul_ps(row0, row3);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
	minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1));
	minor2 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
	minor1 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1);
	minor2 = _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1));
	tmp1 = _mm_mul_ps(row0, row2);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
	minor1 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
	minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1));
	__m128 det = _mm_mul_ps(row0, minor0);
	det = _mm_add_ps(_mm_shuffle_ps(det, det, _MM_SHUFFLE(2, 3, 0, 1)), det); // Original code did (x + z) + (y + w), changed to (x + y) + (z + w) to match the ARM code below and make the result cross platform deterministic
	det = _mm_add_ss(_mm_shuffle_ps(det, det, _MM_SHUFFLE(1, 0, 3, 2)), det);
	det = _mm_div_ss(_mm_set_ss(1.0f), det);
	det = _mm_shuffle_ps(det, det, _MM_SHUFFLE(0, 0, 0, 0));
	Mat44 result;
	result.mCol[0].mValue = _mm_mul_ps(det, minor0);
	result.mCol[1].mValue = _mm_mul_ps(det, minor1);
	result.mCol[2].mValue = _mm_mul_ps(det, minor2);
	result.mCol[3] = Vec4(0, 0, 0, 1);
	return result;
#elif defined(JPH_USE_NEON)
	Type v0001 = vsetq_lane_f32(1, vdupq_n_f32(0), 3);
	Type tmp1 = JPH_NEON_SHUFFLE_F32x4(mCol[0].mValue, mCol[1].mValue, 0, 1, 4, 5);
	Type row1 = JPH_NEON_SHUFFLE_F32x4(mCol[2].mValue, v0001, 0, 1, 4, 5);
	Type row0 = JPH_NEON_SHUFFLE_F32x4(tmp1, row1, 0, 2, 4, 6);
	row1 = JPH_NEON_SHUFFLE_F32x4(row1, tmp1, 1, 3, 5, 7);
	tmp1 = JPH_NEON_SHUFFLE_F32x4(mCol[0].mValue, mCol[1].mValue, 2, 3, 6, 7);
	Type row3 = JPH_NEON_SHUFFLE_F32x4(mCol[2].mValue, v0001, 2, 3, 6, 7);
	Type row2 = JPH_NEON_SHUFFLE_F32x4(tmp1, row3, 0, 2, 4, 6);
	row3 = JPH_NEON_SHUFFLE_F32x4(row3, tmp1, 1, 3, 5, 7);
	tmp1 = vmulq_f32(row2, row3);
	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
	Type minor0 = vmulq_f32(row1, tmp1);
	Type minor1 = vmulq_f32(row0, tmp1);
	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
	minor0 = vsubq_f32(vmulq_f32(row1, tmp1), minor0);
	minor1 = vsubq_f32(vmulq_f32(row0, tmp1), minor1);
	minor1 = JPH_NEON_SHUFFLE_F32x4(minor1, minor1, 2, 3, 0, 1);
	tmp1 = vmulq_f32(row1, row2);
	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
	minor0 = vaddq_f32(vmulq_f32(row3, tmp1), minor0);
	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
	minor0 = vsubq_f32(minor0, vmulq_f32(row3, tmp1));
	tmp1 = JPH_NEON_SHUFFLE_F32x4(row1, row1, 2, 3, 0, 1);
	tmp1 = vmulq_f32(tmp1, row3);
	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
	row2 = JPH_NEON_SHUFFLE_F32x4(row2, row2, 2, 3, 0, 1);
	minor0 = vaddq_f32(vmulq_f32(row2, tmp1), minor0);
	Type minor2 = vmulq_f32(row0, tmp1);
	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
	minor0 = vsubq_f32(minor0, vmulq_f32(row2, tmp1));
	minor2 = vsubq_f32(vmulq_f32(row0, tmp1), minor2);
	minor2 = JPH_NEON_SHUFFLE_F32x4(minor2, minor2, 2, 3, 0, 1);
	tmp1 = vmulq_f32(row0, row1);
	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
	minor2 = vaddq_f32(vmulq_f32(row3, tmp1), minor2);
	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
	minor2 = vsubq_f32(vmulq_f32(row3, tmp1), minor2);
	tmp1 = vmulq_f32(row0, row3);
	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
	minor1 = vsubq_f32(minor1, vmulq_f32(row2, tmp1));
	minor2 = vaddq_f32(vmulq_f32(row1, tmp1), minor2);
	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
	minor1 = vaddq_f32(vmulq_f32(row2, tmp1), minor1);
	minor2 = vsubq_f32(minor2, vmulq_f32(row1, tmp1));
	tmp1 = vmulq_f32(row0, row2);
	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
	minor1 = vaddq_f32(vmulq_f32(row3, tmp1), minor1);
	tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
	minor1 = vsubq_f32(minor1, vmulq_f32(row3, tmp1));
	Type det = vmulq_f32(row0, minor0);
	det = vdupq_n_f32(vaddvq_f32(det));
	det = vdivq_f32(vdupq_n_f32(1.0f), det);
	Mat44 result;
	result.mCol[0].mValue = vmulq_f32(det, minor0);
	result.mCol[1].mValue = vmulq_f32(det, minor1);
	result.mCol[2].mValue = vmulq_f32(det, minor2);
	result.mCol[3].mValue = v0001;
	return result;
#else
	float det = GetDeterminant3x3();
	return Mat44(
		Vec4((JPH_EL(1, 1) * JPH_EL(2, 2) - JPH_EL(1, 2) * JPH_EL(2, 1)) / det,
			(JPH_EL(1, 2) * JPH_EL(2, 0) - JPH_EL(1, 0) * JPH_EL(2, 2)) / det,
			(JPH_EL(1, 0) * JPH_EL(2, 1) - JPH_EL(1, 1) * JPH_EL(2, 0)) / det,
			0),
		Vec4((JPH_EL(0, 2) * JPH_EL(2, 1) - JPH_EL(0, 1) * JPH_EL(2, 2)) / det,
			(JPH_EL(0, 0) * JPH_EL(2, 2) - JPH_EL(0, 2) * JPH_EL(2, 0)) / det,
			(JPH_EL(0, 1) * JPH_EL(2, 0) - JPH_EL(0, 0) * JPH_EL(2, 1)) / det,
			0),
		Vec4((JPH_EL(0, 1) * JPH_EL(1, 2) - JPH_EL(0, 2) * JPH_EL(1, 1)) / det,
			(JPH_EL(0, 2) * JPH_EL(1, 0) - JPH_EL(0, 0) * JPH_EL(1, 2)) / det,
			(JPH_EL(0, 0) * JPH_EL(1, 1) - JPH_EL(0, 1) * JPH_EL(1, 0)) / det,
			0),
		Vec4(0, 0, 0, 1));
#endif
}

Quat Mat44::GetQuaternion() const
{
	JPH_ASSERT(mCol[3] == Vec4(0, 0, 0, 1));
	float tr = mCol[0].mF32[0] + mCol[1].mF32[1] + mCol[2].mF32[2];
	if (tr >= 0.0f)
	{
		float s = sqrt(tr + 1.0f);
		float is = 0.5f / s;
		return Quat(
			(mCol[1].mF32[2] - mCol[2].mF32[1]) * is,
			(mCol[2].mF32[0] - mCol[0].mF32[2]) * is,
			(mCol[0].mF32[1] - mCol[1].mF32[0]) * is,
			0.5f * s);
	}
	else
	{
		int i = 0;
		if (mCol[1].mF32[1] > mCol[0].mF32[0]) i = 1;
		if (mCol[2].mF32[2] > mCol[i].mF32[i]) i = 2;
		if (i == 0)
		{
			float s = sqrt(mCol[0].mF32[0] - (mCol[1].mF32[1] + mCol[2].mF32[2]) + 1);
			float is = 0.5f / s;
			return Quat(
				0.5f * s,
				(mCol[1].mF32[0] + mCol[0].mF32[1]) * is,
				(mCol[0].mF32[2] + mCol[2].mF32[0]) * is,
				(mCol[1].mF32[2] - mCol[2].mF32[1]) * is);
		}
		else if (i == 1)
		{
			float s = sqrt(mCol[1].mF32[1] - (mCol[2].mF32[2] + mCol[0].mF32[0]) + 1);
			float is = 0.5f / s;
			return Quat(
				(mCol[1].mF32[0] + mCol[0].mF32[1]) * is,
				0.5f * s,
				(mCol[2].mF32[1] + mCol[1].mF32[2]) * is,
				(mCol[2].mF32[0] - mCol[0].mF32[2]) * is);
		}
		else
		{
			JPH_ASSERT(i == 2);
			float s = sqrt(mCol[2].mF32[2] - (mCol[0].mF32[0] + mCol[1].mF32[1]) + 1);
			float is = 0.5f / s;
			return Quat(
				(mCol[0].mF32[2] + mCol[2].mF32[0]) * is,
				(mCol[2].mF32[1] + mCol[1].mF32[2]) * is,
				0.5f * s,
				(mCol[0].mF32[1] - mCol[1].mF32[0]) * is);
		}
	}
}

Mat44 Mat44::sQuatLeftMultiply(QuatArg inQ)
{
	return Mat44(
		Vec4(1, 1, -1, -1) * inQ.mValue.Swizzle<SWIZZLE_W, SWIZZLE_Z, SWIZZLE_Y, SWIZZLE_X>(),
		Vec4(-1, 1, 1, -1) * inQ.mValue.Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_X, SWIZZLE_Y>(),
		Vec4(1, -1, 1, -1) * inQ.mValue.Swizzle<SWIZZLE_Y, SWIZZLE_X, SWIZZLE_W, SWIZZLE_Z>(),
		inQ.mValue);
}

Mat44 Mat44::sQuatRightMultiply(QuatArg inQ)
{
	return Mat44(
		Vec4(1, -1, 1, -1) * inQ.mValue.Swizzle<SWIZZLE_W, SWIZZLE_Z, SWIZZLE_Y, SWIZZLE_X>(),
		Vec4(1, 1, -1, -1) * inQ.mValue.Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_X, SWIZZLE_Y>(),
		Vec4(-1, 1, 1, -1) * inQ.mValue.Swizzle<SWIZZLE_Y, SWIZZLE_X, SWIZZLE_W, SWIZZLE_Z>(),
		inQ.mValue);
}

Mat44 Mat44::GetRotation() const
{
	JPH_ASSERT(mCol[0][3] == 0.0f);
	JPH_ASSERT(mCol[1][3] == 0.0f);
	JPH_ASSERT(mCol[2][3] == 0.0f);
	return Mat44(mCol[0], mCol[1], mCol[2], Vec4(0, 0, 0, 1));
}

Mat44 Mat44::GetRotationSafe() const
{
#if defined(JPH_USE_AVX512)
	return Mat44(_mm_maskz_mov_ps(0b0111, mCol[0].mValue),
		_mm_maskz_mov_ps(0b0111, mCol[1].mValue),
		_mm_maskz_mov_ps(0b0111, mCol[2].mValue),
		Vec4(0, 0, 0, 1));
#elif defined(JPH_USE_SSE4_1)
	__m128 zero = _mm_setzero_ps();
	return Mat44(_mm_blend_ps(mCol[0].mValue, zero, 8),
		_mm_blend_ps(mCol[1].mValue, zero, 8),
		_mm_blend_ps(mCol[2].mValue, zero, 8),
		Vec4(0, 0, 0, 1));
#elif defined(JPH_USE_NEON)
	return Mat44(vsetq_lane_f32(0, mCol[0].mValue, 3),
		vsetq_lane_f32(0, mCol[1].mValue, 3),
		vsetq_lane_f32(0, mCol[2].mValue, 3),
		Vec4(0, 0, 0, 1));
#else
	return Mat44(Vec4(mCol[0].mF32[0], mCol[0].mF32[1], mCol[0].mF32[2], 0),
		Vec4(mCol[1].mF32[0], mCol[1].mF32[1], mCol[1].mF32[2], 0),
		Vec4(mCol[2].mF32[0], mCol[2].mF32[1], mCol[2].mF32[2], 0),
		Vec4(0, 0, 0, 1));
#endif
}

void Mat44::SetRotation(Mat44Arg inRotation)
{
	mCol[0] = inRotation.mCol[0];
	mCol[1] = inRotation.mCol[1];
	mCol[2] = inRotation.mCol[2];
}

Mat44 Mat44::PreTranslated(Vec3Arg inTranslation) const
{
	return Mat44(mCol[0], mCol[1], mCol[2], Vec4(GetTranslation() + Multiply3x3(inTranslation), 1));
}

Mat44 Mat44::PostTranslated(Vec3Arg inTranslation) const
{
	return Mat44(mCol[0], mCol[1], mCol[2], Vec4(GetTranslation() + inTranslation, 1));
}

Mat44 Mat44::PreScaled(Vec3Arg inScale) const
{
	return Mat44(inScale.GetX() * mCol[0], inScale.GetY() * mCol[1], inScale.GetZ() * mCol[2], mCol[3]);
}

Mat44 Mat44::PostScaled(Vec3Arg inScale) const
{
	Vec4 scale(inScale, 1);
	return Mat44(scale * mCol[0], scale * mCol[1], scale * mCol[2], scale * mCol[3]);
}

Mat44 Mat44::Decompose(Vec3 &outScale) const
{
	// Start the modified Gram-Schmidt algorithm
	// X axis will just be normalized
	Vec3 x = GetAxisX();
	// Make Y axis perpendicular to X
	Vec3 y = GetAxisY();
	float x_dot_x = x.LengthSq();
	y -= (x.Dot(y) / x_dot_x) * x;
	// Make Z axis perpendicular to X
	Vec3 z = GetAxisZ();
	z -= (x.Dot(z) / x_dot_x) * x;
	// Make Z axis perpendicular to Y
	float y_dot_y = y.LengthSq();
	z -= (y.Dot(z) / y_dot_y) * y;
	// Determine the scale
	float z_dot_z = z.LengthSq();
	outScale = Vec3(x_dot_x, y_dot_y, z_dot_z).Sqrt();
	// If the resulting x, y and z vectors don't form a right handed matrix, flip the z axis.
	if (x.Cross(y).Dot(z) < 0.0f)
		outScale.SetZ(-outScale.GetZ());
	// Determine the rotation and translation
	return Mat44(Vec4(x / outScale.GetX(), 0), Vec4(y / outScale.GetY(), 0), Vec4(z / outScale.GetZ(), 0), GetColumn4(3));
}

#undef JPH_EL

JPH_NAMESPACE_END