Mat44.inl

  1. // SPDX-FileCopyrightText: 2021 Jorrit Rouwe
  2. // SPDX-License-Identifier: MIT
  3. #pragma once
  4. #include <Jolt/Math/Vec3.h>
  5. #include <Jolt/Math/Vec4.h>
  6. #include <Jolt/Math/Quat.h>
  7. JPH_NAMESPACE_BEGIN
  8. #define JPH_EL(r, c) mCol[c].mF32[r]
  9. Mat44::Mat44(Vec4Arg inC1, Vec4Arg inC2, Vec4Arg inC3, Vec4Arg inC4) :
  10. mCol { inC1, inC2, inC3, inC4 }
  11. {
  12. }
  13. Mat44::Mat44(Type inC1, Type inC2, Type inC3, Type inC4) :
  14. mCol { inC1, inC2, inC3, inC4 }
  15. {
  16. }
  17. Mat44 Mat44::sZero()
  18. {
  19. return Mat44(Vec4::sZero(), Vec4::sZero(), Vec4::sZero(), Vec4::sZero());
  20. }
  21. Mat44 Mat44::sIdentity()
  22. {
  23. return Mat44(Vec4(1, 0, 0, 0), Vec4(0, 1, 0, 0), Vec4(0, 0, 1, 0), Vec4(0, 0, 0, 1));
  24. }
  25. Mat44 Mat44::sNaN()
  26. {
  27. return Mat44(Vec4::sNaN(), Vec4::sNaN(), Vec4::sNaN(), Vec4::sNaN());
  28. }
  29. Mat44 Mat44::sLoadFloat4x4(const Float4 *inV)
  30. {
  31. Mat44 result;
  32. for (int c = 0; c < 4; ++c)
  33. result.mCol[c] = Vec4::sLoadFloat4(inV + c);
  34. return result;
  35. }
  36. Mat44 Mat44::sLoadFloat4x4Aligned(const Float4 *inV)
  37. {
  38. Mat44 result;
  39. for (int c = 0; c < 4; ++c)
  40. result.mCol[c] = Vec4::sLoadFloat4Aligned(inV + c);
  41. return result;
  42. }
  43. Mat44 Mat44::sRotationX(float inX)
  44. {
  45. Vec4 sv, cv;
  46. Vec4::sReplicate(inX).SinCos(sv, cv);
  47. float s = sv.GetX(), c = cv.GetX();
  48. return Mat44(Vec4(1, 0, 0, 0), Vec4(0, c, s, 0), Vec4(0, -s, c, 0), Vec4(0, 0, 0, 1));
  49. }
  50. Mat44 Mat44::sRotationY(float inY)
  51. {
  52. Vec4 sv, cv;
  53. Vec4::sReplicate(inY).SinCos(sv, cv);
  54. float s = sv.GetX(), c = cv.GetX();
  55. return Mat44(Vec4(c, 0, -s, 0), Vec4(0, 1, 0, 0), Vec4(s, 0, c, 0), Vec4(0, 0, 0, 1));
  56. }
  57. Mat44 Mat44::sRotationZ(float inZ)
  58. {
  59. Vec4 sv, cv;
  60. Vec4::sReplicate(inZ).SinCos(sv, cv);
  61. float s = sv.GetX(), c = cv.GetX();
  62. return Mat44(Vec4(c, s, 0, 0), Vec4(-s, c, 0, 0), Vec4(0, 0, 1, 0), Vec4(0, 0, 0, 1));
  63. }
  64. Mat44 Mat44::sRotation(QuatArg inQuat)
  65. {
  66. JPH_ASSERT(inQuat.IsNormalized());
  67. // See: https://en.wikipedia.org/wiki/Quaternions_and_spatial_rotation section 'Quaternion-derived rotation matrix'
  68. #ifdef JPH_USE_SSE4_1
  69. __m128 xyzw = inQuat.mValue.mValue;
  70. __m128 two_xyzw = _mm_add_ps(xyzw, xyzw);
  71. __m128 yzxw = _mm_shuffle_ps(xyzw, xyzw, _MM_SHUFFLE(3, 0, 2, 1));
  72. __m128 two_yzxw = _mm_add_ps(yzxw, yzxw);
  73. __m128 zxyw = _mm_shuffle_ps(xyzw, xyzw, _MM_SHUFFLE(3, 1, 0, 2));
  74. __m128 two_zxyw = _mm_add_ps(zxyw, zxyw);
  75. __m128 wwww = _mm_shuffle_ps(xyzw, xyzw, _MM_SHUFFLE(3, 3, 3, 3));
  76. __m128 diagonal = _mm_sub_ps(_mm_sub_ps(_mm_set1_ps(1.0f), _mm_mul_ps(two_yzxw, yzxw)), _mm_mul_ps(two_zxyw, zxyw)); // (1 - 2 y^2 - 2 z^2, 1 - 2 x^2 - 2 z^2, 1 - 2 x^2 - 2 y^2, 1 - 4 w^2)
  77. __m128 plus = _mm_add_ps(_mm_mul_ps(two_xyzw, zxyw), _mm_mul_ps(two_yzxw, wwww)); // 2 * (xz + yw, xy + zw, yz + xw, ww)
  78. __m128 minus = _mm_sub_ps(_mm_mul_ps(two_yzxw, xyzw), _mm_mul_ps(two_zxyw, wwww)); // 2 * (xy - zw, yz - xw, xz - yw, 0)
  79. // Workaround for compiler changing _mm_sub_ps(_mm_mul_ps(...), ...) into a fused multiply sub instruction, resulting in w not being 0
  80. // There doesn't appear to be a reliable way to turn this off in Clang
  81. minus = _mm_insert_ps(minus, minus, 0b1000);
  82. __m128 col0 = _mm_blend_ps(_mm_blend_ps(plus, diagonal, 0b0001), minus, 0b1100); // (1 - 2 y^2 - 2 z^2, 2 xy + 2 zw, 2 xz - 2 yw, 0)
  83. __m128 col1 = _mm_blend_ps(_mm_blend_ps(diagonal, minus, 0b1001), plus, 0b0100); // (2 xy - 2 zw, 1 - 2 x^2 - 2 z^2, 2 yz + 2 xw, 0)
  84. __m128 col2 = _mm_blend_ps(_mm_blend_ps(minus, plus, 0b0001), diagonal, 0b0100); // (2 xz + 2 yw, 2 yz - 2 xw, 1 - 2 x^2 - 2 y^2, 0)
  85. __m128 col3 = _mm_set_ps(1, 0, 0, 0);
  86. return Mat44(col0, col1, col2, col3);
  87. #else
  88. float x = inQuat.GetX();
  89. float y = inQuat.GetY();
  90. float z = inQuat.GetZ();
  91. float w = inQuat.GetW();
  92. float tx = x + x; // Note: Using x + x instead of 2.0f * x to force this function to return the same value as the SSE4.1 version across platforms.
  93. float ty = y + y;
  94. float tz = z + z;
  95. float xx = tx * x;
  96. float yy = ty * y;
  97. float zz = tz * z;
  98. float xy = tx * y;
  99. float xz = tx * z;
  100. float xw = tx * w;
  101. float yz = ty * z;
  102. float yw = ty * w;
  103. float zw = tz * w;
  104. return Mat44(Vec4((1.0f - yy) - zz, xy + zw, xz - yw, 0.0f), // Note: Added extra brackets to force this function to return the same value as the SSE4.1 version across platforms.
  105. Vec4(xy - zw, (1.0f - zz) - xx, yz + xw, 0.0f),
  106. Vec4(xz + yw, yz - xw, (1.0f - xx) - yy, 0.0f),
  107. Vec4(0.0f, 0.0f, 0.0f, 1.0f));
  108. #endif
  109. }
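// For reference, both paths above produce the standard quaternion-derived rotation matrix from the
// Wikipedia article linked above. With column-major storage, the 3x3 part written out row by row is:
//
//   | 1 - 2y^2 - 2z^2   2xy - 2zw          2xz + 2yw        |
//   | 2xy + 2zw         1 - 2x^2 - 2z^2    2yz - 2xw        |
//   | 2xz - 2yw         2yz + 2xw          1 - 2x^2 - 2y^2  |
//
// i.e. column 0 is (1 - 2y^2 - 2z^2, 2xy + 2zw, 2xz - 2yw), matching the scalar fallback.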
  110. Mat44 Mat44::sRotation(Vec3Arg inAxis, float inAngle)
  111. {
  112. return sRotation(Quat::sRotation(inAxis, inAngle));
  113. }
  114. Mat44 Mat44::sTranslation(Vec3Arg inV)
  115. {
  116. return Mat44(Vec4(1, 0, 0, 0), Vec4(0, 1, 0, 0), Vec4(0, 0, 1, 0), Vec4(inV, 1));
  117. }
  118. Mat44 Mat44::sRotationTranslation(QuatArg inR, Vec3Arg inT)
  119. {
  120. Mat44 m = sRotation(inR);
  121. m.SetTranslation(inT);
  122. return m;
  123. }
  124. Mat44 Mat44::sInverseRotationTranslation(QuatArg inR, Vec3Arg inT)
  125. {
  126. Mat44 m = sRotation(inR.Conjugated());
  127. m.SetTranslation(-m.Multiply3x3(inT));
  128. return m;
  129. }
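// This is the inverse of sRotationTranslation(inR, inT) without a general matrix inverse: the inverse
// of a rotation r followed by a translation t has rotation r^-1 (the conjugated quaternion) and
// translation -(r^-1 * t), which is exactly what the code above builds.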
  130. Mat44 Mat44::sScale(float inScale)
  131. {
  132. return Mat44(Vec4(inScale, 0, 0, 0), Vec4(0, inScale, 0, 0), Vec4(0, 0, inScale, 0), Vec4(0, 0, 0, 1));
  133. }
  134. Mat44 Mat44::sScale(Vec3Arg inV)
  135. {
  136. return Mat44(Vec4(inV.GetX(), 0, 0, 0), Vec4(0, inV.GetY(), 0, 0), Vec4(0, 0, inV.GetZ(), 0), Vec4(0, 0, 0, 1));
  137. }
  138. Mat44 Mat44::sOuterProduct(Vec3Arg inV1, Vec3Arg inV2)
  139. {
  140. Vec4 v1(inV1, 0);
  141. return Mat44(v1 * inV2.SplatX(), v1 * inV2.SplatY(), v1 * inV2.SplatZ(), Vec4(0, 0, 0, 1));
  142. }
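// The 3x3 part of the result is the outer product inV1 * inV2^T, i.e. element (row r, column c)
// equals inV1[r] * inV2[c]; each column above is inV1 scaled by one component of inV2.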
  143. Mat44 Mat44::sCrossProduct(Vec3Arg inV)
  144. {
  145. #ifdef JPH_USE_SSE4_1
  146. // Zero out the W component
  147. __m128 zero = _mm_setzero_ps();
  148. __m128 v = _mm_blend_ps(inV.mValue, zero, 0b1000);
  149. // Negate
  150. __m128 min_v = _mm_sub_ps(zero, v);
  151. return Mat44(
  152. _mm_shuffle_ps(v, min_v, _MM_SHUFFLE(3, 1, 2, 3)), // [0, z, -y, 0]
  153. _mm_shuffle_ps(min_v, v, _MM_SHUFFLE(3, 0, 3, 2)), // [-z, 0, x, 0]
  154. _mm_blend_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 1)), _mm_shuffle_ps(min_v, min_v, _MM_SHUFFLE(3, 3, 0, 3)), 0b0010), // [y, -x, 0, 0]
  155. Vec4(0, 0, 0, 1));
  156. #else
  157. float x = inV.GetX();
  158. float y = inV.GetY();
  159. float z = inV.GetZ();
  160. return Mat44(
  161. Vec4(0, z, -y, 0),
  162. Vec4(-z, 0, x, 0),
  163. Vec4(y, -x, 0, 0),
  164. Vec4(0, 0, 0, 1));
  165. #endif
  166. }
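// The result is the skew-symmetric matrix of inV: for any Vec3 w, sCrossProduct(v) * w == v.Cross(w).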
  167. Mat44 Mat44::sLookAt(Vec3Arg inPos, Vec3Arg inTarget, Vec3Arg inUp)
  168. {
  169. Vec3 direction = (inTarget - inPos).NormalizedOr(-Vec3::sAxisZ());
  170. Vec3 right = direction.Cross(inUp).NormalizedOr(Vec3::sAxisX());
  171. Vec3 up = right.Cross(direction);
  172. return Mat44(Vec4(right, 0), Vec4(up, 0), Vec4(-direction, 0), Vec4(inPos, 1)).InversedRotationTranslation();
  173. }
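// Usage sketch (the variable names below are illustrative, not part of this file): sLookAt returns a
// view matrix that maps world space into a camera space where the camera looks down -Z, e.g.
//
//   Mat44 view = Mat44::sLookAt(camera_position, focus_point, Vec3::sAxisY());
//   Vec3 point_in_camera_space = view * point_in_world_space;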
  174. bool Mat44::operator == (Mat44Arg inM2) const
  175. {
  176. return UVec4::sAnd(
  177. UVec4::sAnd(Vec4::sEquals(mCol[0], inM2.mCol[0]), Vec4::sEquals(mCol[1], inM2.mCol[1])),
  178. UVec4::sAnd(Vec4::sEquals(mCol[2], inM2.mCol[2]), Vec4::sEquals(mCol[3], inM2.mCol[3]))
  179. ).TestAllTrue();
  180. }
  181. bool Mat44::IsClose(Mat44Arg inM2, float inMaxDistSq) const
  182. {
  183. for (int i = 0; i < 4; ++i)
  184. if (!mCol[i].IsClose(inM2.mCol[i], inMaxDistSq))
  185. return false;
  186. return true;
  187. }
  188. Mat44 Mat44::operator * (Mat44Arg inM) const
  189. {
  190. Mat44 result;
  191. #if defined(JPH_USE_SSE)
  192. for (int i = 0; i < 4; ++i)
  193. {
  194. __m128 c = inM.mCol[i].mValue;
  195. __m128 t = _mm_mul_ps(mCol[0].mValue, _mm_shuffle_ps(c, c, _MM_SHUFFLE(0, 0, 0, 0)));
  196. t = _mm_add_ps(t, _mm_mul_ps(mCol[1].mValue, _mm_shuffle_ps(c, c, _MM_SHUFFLE(1, 1, 1, 1))));
  197. t = _mm_add_ps(t, _mm_mul_ps(mCol[2].mValue, _mm_shuffle_ps(c, c, _MM_SHUFFLE(2, 2, 2, 2))));
  198. t = _mm_add_ps(t, _mm_mul_ps(mCol[3].mValue, _mm_shuffle_ps(c, c, _MM_SHUFFLE(3, 3, 3, 3))));
  199. result.mCol[i].mValue = t;
  200. }
  201. #elif defined(JPH_USE_NEON)
  202. for (int i = 0; i < 4; ++i)
  203. {
  204. Type c = inM.mCol[i].mValue;
  205. Type t = vmulq_f32(mCol[0].mValue, vdupq_laneq_f32(c, 0));
  206. t = vmlaq_f32(t, mCol[1].mValue, vdupq_laneq_f32(c, 1));
  207. t = vmlaq_f32(t, mCol[2].mValue, vdupq_laneq_f32(c, 2));
  208. t = vmlaq_f32(t, mCol[3].mValue, vdupq_laneq_f32(c, 3));
  209. result.mCol[i].mValue = t;
  210. }
  211. #else
  212. for (int i = 0; i < 4; ++i)
  213. result.mCol[i] = mCol[0] * inM.mCol[i].mF32[0] + mCol[1] * inM.mCol[i].mF32[1] + mCol[2] * inM.mCol[i].mF32[2] + mCol[3] * inM.mCol[i].mF32[3];
  214. #endif
  215. return result;
  216. }
  217. Vec3 Mat44::operator * (Vec3Arg inV) const
  218. {
  219. #if defined(JPH_USE_SSE)
  220. __m128 t = _mm_mul_ps(mCol[0].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(0, 0, 0, 0)));
  221. t = _mm_add_ps(t, _mm_mul_ps(mCol[1].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(1, 1, 1, 1))));
  222. t = _mm_add_ps(t, _mm_mul_ps(mCol[2].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(2, 2, 2, 2))));
  223. t = _mm_add_ps(t, mCol[3].mValue);
  224. return Vec3::sFixW(t);
  225. #elif defined(JPH_USE_NEON)
  226. Type t = vmulq_f32(mCol[0].mValue, vdupq_laneq_f32(inV.mValue, 0));
  227. t = vmlaq_f32(t, mCol[1].mValue, vdupq_laneq_f32(inV.mValue, 1));
  228. t = vmlaq_f32(t, mCol[2].mValue, vdupq_laneq_f32(inV.mValue, 2));
  229. t = vaddq_f32(t, mCol[3].mValue); // Don't combine this with the first mul into a fused multiply add, as that causes precision issues
  230. return Vec3::sFixW(t);
  231. #else
  232. return Vec3(
  233. mCol[0].mF32[0] * inV.mF32[0] + mCol[1].mF32[0] * inV.mF32[1] + mCol[2].mF32[0] * inV.mF32[2] + mCol[3].mF32[0],
  234. mCol[0].mF32[1] * inV.mF32[0] + mCol[1].mF32[1] * inV.mF32[1] + mCol[2].mF32[1] * inV.mF32[2] + mCol[3].mF32[1],
  235. mCol[0].mF32[2] * inV.mF32[0] + mCol[1].mF32[2] * inV.mF32[1] + mCol[2].mF32[2] * inV.mF32[2] + mCol[3].mF32[2]);
  236. #endif
  237. }
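// Note that this operator treats inV as a position: the translation column is added (implicit w = 1).
// Use Multiply3x3 below to transform a direction without translation.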
  238. Vec4 Mat44::operator * (Vec4Arg inV) const
  239. {
  240. #if defined(JPH_USE_SSE)
  241. __m128 t = _mm_mul_ps(mCol[0].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(0, 0, 0, 0)));
  242. t = _mm_add_ps(t, _mm_mul_ps(mCol[1].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(1, 1, 1, 1))));
  243. t = _mm_add_ps(t, _mm_mul_ps(mCol[2].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(2, 2, 2, 2))));
  244. t = _mm_add_ps(t, _mm_mul_ps(mCol[3].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(3, 3, 3, 3))));
  245. return t;
  246. #elif defined(JPH_USE_NEON)
  247. Type t = vmulq_f32(mCol[0].mValue, vdupq_laneq_f32(inV.mValue, 0));
  248. t = vmlaq_f32(t, mCol[1].mValue, vdupq_laneq_f32(inV.mValue, 1));
  249. t = vmlaq_f32(t, mCol[2].mValue, vdupq_laneq_f32(inV.mValue, 2));
  250. t = vmlaq_f32(t, mCol[3].mValue, vdupq_laneq_f32(inV.mValue, 3));
  251. return t;
  252. #else
  253. return Vec4(
  254. mCol[0].mF32[0] * inV.mF32[0] + mCol[1].mF32[0] * inV.mF32[1] + mCol[2].mF32[0] * inV.mF32[2] + mCol[3].mF32[0] * inV.mF32[3],
  255. mCol[0].mF32[1] * inV.mF32[0] + mCol[1].mF32[1] * inV.mF32[1] + mCol[2].mF32[1] * inV.mF32[2] + mCol[3].mF32[1] * inV.mF32[3],
  256. mCol[0].mF32[2] * inV.mF32[0] + mCol[1].mF32[2] * inV.mF32[1] + mCol[2].mF32[2] * inV.mF32[2] + mCol[3].mF32[2] * inV.mF32[3],
  257. mCol[0].mF32[3] * inV.mF32[0] + mCol[1].mF32[3] * inV.mF32[1] + mCol[2].mF32[3] * inV.mF32[2] + mCol[3].mF32[3] * inV.mF32[3]);
  258. #endif
  259. }
  260. Vec3 Mat44::Multiply3x3(Vec3Arg inV) const
  261. {
  262. #if defined(JPH_USE_SSE)
  263. __m128 t = _mm_mul_ps(mCol[0].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(0, 0, 0, 0)));
  264. t = _mm_add_ps(t, _mm_mul_ps(mCol[1].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(1, 1, 1, 1))));
  265. t = _mm_add_ps(t, _mm_mul_ps(mCol[2].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(2, 2, 2, 2))));
  266. return Vec3::sFixW(t);
  267. #elif defined(JPH_USE_NEON)
  268. Type t = vmulq_f32(mCol[0].mValue, vdupq_laneq_f32(inV.mValue, 0));
  269. t = vmlaq_f32(t, mCol[1].mValue, vdupq_laneq_f32(inV.mValue, 1));
  270. t = vmlaq_f32(t, mCol[2].mValue, vdupq_laneq_f32(inV.mValue, 2));
  271. return Vec3::sFixW(t);
  272. #else
  273. return Vec3(
  274. mCol[0].mF32[0] * inV.mF32[0] + mCol[1].mF32[0] * inV.mF32[1] + mCol[2].mF32[0] * inV.mF32[2],
  275. mCol[0].mF32[1] * inV.mF32[0] + mCol[1].mF32[1] * inV.mF32[1] + mCol[2].mF32[1] * inV.mF32[2],
  276. mCol[0].mF32[2] * inV.mF32[0] + mCol[1].mF32[2] * inV.mF32[1] + mCol[2].mF32[2] * inV.mF32[2]);
  277. #endif
  278. }
  279. Vec3 Mat44::Multiply3x3Transposed(Vec3Arg inV) const
  280. {
  281. #if defined(JPH_USE_SSE4_1)
  282. __m128 x = _mm_dp_ps(mCol[0].mValue, inV.mValue, 0x7f);
  283. __m128 y = _mm_dp_ps(mCol[1].mValue, inV.mValue, 0x7f);
  284. __m128 xy = _mm_blend_ps(x, y, 0b0010);
  285. __m128 z = _mm_dp_ps(mCol[2].mValue, inV.mValue, 0x7f);
  286. __m128 xyzz = _mm_blend_ps(xy, z, 0b1100);
  287. return xyzz;
  288. #else
  289. return Transposed3x3().Multiply3x3(inV);
  290. #endif
  291. }
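// Computes Transposed3x3() * inV (as the fallback path shows) without materializing the transpose;
// the SSE4.1 path instead takes the dot product of inV with each column. For a pure rotation matrix
// this applies the inverse rotation to inV.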
  292. Mat44 Mat44::Multiply3x3(Mat44Arg inM) const
  293. {
  294. JPH_ASSERT(mCol[0][3] == 0.0f);
  295. JPH_ASSERT(mCol[1][3] == 0.0f);
  296. JPH_ASSERT(mCol[2][3] == 0.0f);
  297. Mat44 result;
  298. #if defined(JPH_USE_SSE)
  299. for (int i = 0; i < 3; ++i)
  300. {
  301. __m128 c = inM.mCol[i].mValue;
  302. __m128 t = _mm_mul_ps(mCol[0].mValue, _mm_shuffle_ps(c, c, _MM_SHUFFLE(0, 0, 0, 0)));
  303. t = _mm_add_ps(t, _mm_mul_ps(mCol[1].mValue, _mm_shuffle_ps(c, c, _MM_SHUFFLE(1, 1, 1, 1))));
  304. t = _mm_add_ps(t, _mm_mul_ps(mCol[2].mValue, _mm_shuffle_ps(c, c, _MM_SHUFFLE(2, 2, 2, 2))));
  305. result.mCol[i].mValue = t;
  306. }
  307. #elif defined(JPH_USE_NEON)
  308. for (int i = 0; i < 3; ++i)
  309. {
  310. Type c = inM.mCol[i].mValue;
  311. Type t = vmulq_f32(mCol[0].mValue, vdupq_laneq_f32(c, 0));
  312. t = vmlaq_f32(t, mCol[1].mValue, vdupq_laneq_f32(c, 1));
  313. t = vmlaq_f32(t, mCol[2].mValue, vdupq_laneq_f32(c, 2));
  314. result.mCol[i].mValue = t;
  315. }
  316. #else
  317. for (int i = 0; i < 3; ++i)
  318. result.mCol[i] = mCol[0] * inM.mCol[i].mF32[0] + mCol[1] * inM.mCol[i].mF32[1] + mCol[2] * inM.mCol[i].mF32[2];
  319. #endif
  320. result.mCol[3] = Vec4(0, 0, 0, 1);
  321. return result;
  322. }
  323. Mat44 Mat44::Multiply3x3LeftTransposed(Mat44Arg inM) const
  324. {
  325. // Transpose left hand side
  326. Mat44 trans = Transposed3x3();
  327. // Do 3x3 matrix multiply
  328. Mat44 result;
  329. result.mCol[0] = trans.mCol[0] * inM.mCol[0].SplatX() + trans.mCol[1] * inM.mCol[0].SplatY() + trans.mCol[2] * inM.mCol[0].SplatZ();
  330. result.mCol[1] = trans.mCol[0] * inM.mCol[1].SplatX() + trans.mCol[1] * inM.mCol[1].SplatY() + trans.mCol[2] * inM.mCol[1].SplatZ();
  331. result.mCol[2] = trans.mCol[0] * inM.mCol[2].SplatX() + trans.mCol[1] * inM.mCol[2].SplatY() + trans.mCol[2] * inM.mCol[2].SplatZ();
  332. result.mCol[3] = Vec4(0, 0, 0, 1);
  333. return result;
  334. }
  335. Mat44 Mat44::Multiply3x3RightTransposed(Mat44Arg inM) const
  336. {
  337. JPH_ASSERT(mCol[0][3] == 0.0f);
  338. JPH_ASSERT(mCol[1][3] == 0.0f);
  339. JPH_ASSERT(mCol[2][3] == 0.0f);
  340. Mat44 result;
  341. result.mCol[0] = mCol[0] * inM.mCol[0].SplatX() + mCol[1] * inM.mCol[1].SplatX() + mCol[2] * inM.mCol[2].SplatX();
  342. result.mCol[1] = mCol[0] * inM.mCol[0].SplatY() + mCol[1] * inM.mCol[1].SplatY() + mCol[2] * inM.mCol[2].SplatY();
  343. result.mCol[2] = mCol[0] * inM.mCol[0].SplatZ() + mCol[1] * inM.mCol[1].SplatZ() + mCol[2] * inM.mCol[2].SplatZ();
  344. result.mCol[3] = Vec4(0, 0, 0, 1);
  345. return result;
  346. }
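// In terms of matrix products: Multiply3x3LeftTransposed(inM) equals the 3x3 part of
// Transposed3x3() * inM, and Multiply3x3RightTransposed(inM) equals the 3x3 part of
// *this * inM.Transposed3x3() (without forming the transpose of inM).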
  347. Mat44 Mat44::operator * (float inV) const
  348. {
  349. Vec4 multiplier = Vec4::sReplicate(inV);
  350. Mat44 result;
  351. for (int c = 0; c < 4; ++c)
  352. result.mCol[c] = mCol[c] * multiplier;
  353. return result;
  354. }
  355. Mat44 &Mat44::operator *= (float inV)
  356. {
  357. for (int c = 0; c < 4; ++c)
  358. mCol[c] *= inV;
  359. return *this;
  360. }
  361. Mat44 Mat44::operator + (Mat44Arg inM) const
  362. {
  363. Mat44 result;
  364. for (int i = 0; i < 4; ++i)
  365. result.mCol[i] = mCol[i] + inM.mCol[i];
  366. return result;
  367. }
  368. Mat44 Mat44::operator - () const
  369. {
  370. Mat44 result;
  371. for (int i = 0; i < 4; ++i)
  372. result.mCol[i] = -mCol[i];
  373. return result;
  374. }
  375. Mat44 Mat44::operator - (Mat44Arg inM) const
  376. {
  377. Mat44 result;
  378. for (int i = 0; i < 4; ++i)
  379. result.mCol[i] = mCol[i] - inM.mCol[i];
  380. return result;
  381. }
  382. Mat44 &Mat44::operator += (Mat44Arg inM)
  383. {
  384. for (int c = 0; c < 4; ++c)
  385. mCol[c] += inM.mCol[c];
  386. return *this;
  387. }
  388. void Mat44::StoreFloat4x4(Float4 *outV) const
  389. {
  390. for (int c = 0; c < 4; ++c)
  391. mCol[c].StoreFloat4(outV + c);
  392. }
  393. Mat44 Mat44::Transposed() const
  394. {
  395. #if defined(JPH_USE_SSE)
  396. __m128 tmp1 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(1, 0, 1, 0));
  397. __m128 tmp3 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(3, 2, 3, 2));
  398. __m128 tmp2 = _mm_shuffle_ps(mCol[2].mValue, mCol[3].mValue, _MM_SHUFFLE(1, 0, 1, 0));
  399. __m128 tmp4 = _mm_shuffle_ps(mCol[2].mValue, mCol[3].mValue, _MM_SHUFFLE(3, 2, 3, 2));
  400. Mat44 result;
  401. result.mCol[0].mValue = _mm_shuffle_ps(tmp1, tmp2, _MM_SHUFFLE(2, 0, 2, 0));
  402. result.mCol[1].mValue = _mm_shuffle_ps(tmp1, tmp2, _MM_SHUFFLE(3, 1, 3, 1));
  403. result.mCol[2].mValue = _mm_shuffle_ps(tmp3, tmp4, _MM_SHUFFLE(2, 0, 2, 0));
  404. result.mCol[3].mValue = _mm_shuffle_ps(tmp3, tmp4, _MM_SHUFFLE(3, 1, 3, 1));
  405. return result;
  406. #elif defined(JPH_USE_NEON)
  407. float32x4x2_t tmp1 = vzipq_f32(mCol[0].mValue, mCol[2].mValue);
  408. float32x4x2_t tmp2 = vzipq_f32(mCol[1].mValue, mCol[3].mValue);
  409. float32x4x2_t tmp3 = vzipq_f32(tmp1.val[0], tmp2.val[0]);
  410. float32x4x2_t tmp4 = vzipq_f32(tmp1.val[1], tmp2.val[1]);
  411. Mat44 result;
  412. result.mCol[0].mValue = tmp3.val[0];
  413. result.mCol[1].mValue = tmp3.val[1];
  414. result.mCol[2].mValue = tmp4.val[0];
  415. result.mCol[3].mValue = tmp4.val[1];
  416. return result;
  417. #else
  418. Mat44 result;
  419. for (int c = 0; c < 4; ++c)
  420. for (int r = 0; r < 4; ++r)
  421. result.mCol[r].mF32[c] = mCol[c].mF32[r];
  422. return result;
  423. #endif
  424. }
  425. Mat44 Mat44::Transposed3x3() const
  426. {
  427. #if defined(JPH_USE_SSE)
  428. __m128 zero = _mm_setzero_ps();
  429. __m128 tmp1 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(1, 0, 1, 0));
  430. __m128 tmp3 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(3, 2, 3, 2));
  431. __m128 tmp2 = _mm_shuffle_ps(mCol[2].mValue, zero, _MM_SHUFFLE(1, 0, 1, 0));
  432. __m128 tmp4 = _mm_shuffle_ps(mCol[2].mValue, zero, _MM_SHUFFLE(3, 2, 3, 2));
  433. Mat44 result;
  434. result.mCol[0].mValue = _mm_shuffle_ps(tmp1, tmp2, _MM_SHUFFLE(2, 0, 2, 0));
  435. result.mCol[1].mValue = _mm_shuffle_ps(tmp1, tmp2, _MM_SHUFFLE(3, 1, 3, 1));
  436. result.mCol[2].mValue = _mm_shuffle_ps(tmp3, tmp4, _MM_SHUFFLE(2, 0, 2, 0));
  437. #elif defined(JPH_USE_NEON)
  438. float32x4x2_t tmp1 = vzipq_f32(mCol[0].mValue, mCol[2].mValue);
  439. float32x4x2_t tmp2 = vzipq_f32(mCol[1].mValue, vdupq_n_f32(0));
  440. float32x4x2_t tmp3 = vzipq_f32(tmp1.val[0], tmp2.val[0]);
  441. float32x4x2_t tmp4 = vzipq_f32(tmp1.val[1], tmp2.val[1]);
  442. Mat44 result;
  443. result.mCol[0].mValue = tmp3.val[0];
  444. result.mCol[1].mValue = tmp3.val[1];
  445. result.mCol[2].mValue = tmp4.val[0];
  446. #else
  447. Mat44 result;
  448. for (int c = 0; c < 3; ++c)
  449. {
  450. for (int r = 0; r < 3; ++r)
  451. result.mCol[c].mF32[r] = mCol[r].mF32[c];
  452. result.mCol[c].mF32[3] = 0;
  453. }
  454. #endif
  455. result.mCol[3] = Vec4(0, 0, 0, 1);
  456. return result;
  457. }
  458. Mat44 Mat44::Inversed() const
  459. {
  460. #if defined(JPH_USE_SSE)
  461. // Algorithm from: http://download.intel.com/design/PentiumIII/sml/24504301.pdf
  462. // Streaming SIMD Extensions - Inverse of 4x4 Matrix
  463. // Adapted to load data using _mm_shuffle_ps instead of loading from memory
  464. // Replaced _mm_rcp_ps with _mm_div_ps for better accuracy
  465. __m128 tmp1 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(1, 0, 1, 0));
  466. __m128 row1 = _mm_shuffle_ps(mCol[2].mValue, mCol[3].mValue, _MM_SHUFFLE(1, 0, 1, 0));
  467. __m128 row0 = _mm_shuffle_ps(tmp1, row1, _MM_SHUFFLE(2, 0, 2, 0));
  468. row1 = _mm_shuffle_ps(row1, tmp1, _MM_SHUFFLE(3, 1, 3, 1));
  469. tmp1 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(3, 2, 3, 2));
  470. __m128 row3 = _mm_shuffle_ps(mCol[2].mValue, mCol[3].mValue, _MM_SHUFFLE(3, 2, 3, 2));
  471. __m128 row2 = _mm_shuffle_ps(tmp1, row3, _MM_SHUFFLE(2, 0, 2, 0));
  472. row3 = _mm_shuffle_ps(row3, tmp1, _MM_SHUFFLE(3, 1, 3, 1));
  473. tmp1 = _mm_mul_ps(row2, row3);
  474. tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
  475. __m128 minor0 = _mm_mul_ps(row1, tmp1);
  476. __m128 minor1 = _mm_mul_ps(row0, tmp1);
  477. tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
  478. minor0 = _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0);
  479. minor1 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1);
  480. minor1 = _mm_shuffle_ps(minor1, minor1, _MM_SHUFFLE(1, 0, 3, 2));
  481. tmp1 = _mm_mul_ps(row1, row2);
  482. tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
  483. minor0 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0);
  484. __m128 minor3 = _mm_mul_ps(row0, tmp1);
  485. tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
  486. minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1));
  487. minor3 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor3);
  488. minor3 = _mm_shuffle_ps(minor3, minor3, _MM_SHUFFLE(1, 0, 3, 2));
  489. tmp1 = _mm_mul_ps(_mm_shuffle_ps(row1, row1, _MM_SHUFFLE(1, 0, 3, 2)), row3);
  490. tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
  491. row2 = _mm_shuffle_ps(row2, row2, _MM_SHUFFLE(1, 0, 3, 2));
  492. minor0 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0);
  493. __m128 minor2 = _mm_mul_ps(row0, tmp1);
  494. tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
  495. minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1));
  496. minor2 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2);
  497. minor2 = _mm_shuffle_ps(minor2, minor2, _MM_SHUFFLE(1, 0, 3, 2));
  498. tmp1 = _mm_mul_ps(row0, row1);
  499. tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
  500. minor2 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2);
  501. minor3 = _mm_sub_ps(_mm_mul_ps(row2, tmp1), minor3);
  502. tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
  503. minor2 = _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2);
  504. minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row2, tmp1));
  505. tmp1 = _mm_mul_ps(row0, row3);
  506. tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
  507. minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1));
  508. minor2 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2);
  509. tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
  510. minor1 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1);
  511. minor2 = _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1));
  512. tmp1 = _mm_mul_ps(row0, row2);
  513. tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
  514. minor1 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1);
  515. minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row1, tmp1));
  516. tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
  517. minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1));
  518. minor3 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor3);
  519. __m128 det = _mm_mul_ps(row0, minor0);
  520. det = _mm_add_ps(_mm_shuffle_ps(det, det, _MM_SHUFFLE(2, 3, 0, 1)), det); // Original code did (x + z) + (y + w), changed to (x + y) + (z + w) to match the ARM code below and make the result cross platform deterministic
  521. det = _mm_add_ss(_mm_shuffle_ps(det, det, _MM_SHUFFLE(1, 0, 3, 2)), det);
  522. det = _mm_div_ss(_mm_set_ss(1.0f), det);
  523. det = _mm_shuffle_ps(det, det, _MM_SHUFFLE(0, 0, 0, 0));
  524. Mat44 result;
  525. result.mCol[0].mValue = _mm_mul_ps(det, minor0);
  526. result.mCol[1].mValue = _mm_mul_ps(det, minor1);
  527. result.mCol[2].mValue = _mm_mul_ps(det, minor2);
  528. result.mCol[3].mValue = _mm_mul_ps(det, minor3);
  529. return result;
  530. #elif defined(JPH_USE_NEON)
  531. // Adapted from the SSE version; there are surprisingly few articles on the internet about efficient ways of calculating an inverse for ARM
  532. Type tmp1 = JPH_NEON_SHUFFLE_F32x4(mCol[0].mValue, mCol[1].mValue, 0, 1, 4, 5);
  533. Type row1 = JPH_NEON_SHUFFLE_F32x4(mCol[2].mValue, mCol[3].mValue, 0, 1, 4, 5);
  534. Type row0 = JPH_NEON_SHUFFLE_F32x4(tmp1, row1, 0, 2, 4, 6);
  535. row1 = JPH_NEON_SHUFFLE_F32x4(row1, tmp1, 1, 3, 5, 7);
  536. tmp1 = JPH_NEON_SHUFFLE_F32x4(mCol[0].mValue, mCol[1].mValue, 2, 3, 6, 7);
  537. Type row3 = JPH_NEON_SHUFFLE_F32x4(mCol[2].mValue, mCol[3].mValue, 2, 3, 6, 7);
  538. Type row2 = JPH_NEON_SHUFFLE_F32x4(tmp1, row3, 0, 2, 4, 6);
  539. row3 = JPH_NEON_SHUFFLE_F32x4(row3, tmp1, 1, 3, 5, 7);
  540. tmp1 = vmulq_f32(row2, row3);
  541. tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
  542. Type minor0 = vmulq_f32(row1, tmp1);
  543. Type minor1 = vmulq_f32(row0, tmp1);
  544. tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
  545. minor0 = vsubq_f32(vmulq_f32(row1, tmp1), minor0);
  546. minor1 = vsubq_f32(vmulq_f32(row0, tmp1), minor1);
  547. minor1 = JPH_NEON_SHUFFLE_F32x4(minor1, minor1, 2, 3, 0, 1);
  548. tmp1 = vmulq_f32(row1, row2);
  549. tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
  550. minor0 = vaddq_f32(vmulq_f32(row3, tmp1), minor0);
  551. Type minor3 = vmulq_f32(row0, tmp1);
  552. tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
  553. minor0 = vsubq_f32(minor0, vmulq_f32(row3, tmp1));
  554. minor3 = vsubq_f32(vmulq_f32(row0, tmp1), minor3);
  555. minor3 = JPH_NEON_SHUFFLE_F32x4(minor3, minor3, 2, 3, 0, 1);
  556. tmp1 = JPH_NEON_SHUFFLE_F32x4(row1, row1, 2, 3, 0, 1);
  557. tmp1 = vmulq_f32(tmp1, row3);
  558. tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
  559. row2 = JPH_NEON_SHUFFLE_F32x4(row2, row2, 2, 3, 0, 1);
  560. minor0 = vaddq_f32(vmulq_f32(row2, tmp1), minor0);
  561. Type minor2 = vmulq_f32(row0, tmp1);
  562. tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
  563. minor0 = vsubq_f32(minor0, vmulq_f32(row2, tmp1));
  564. minor2 = vsubq_f32(vmulq_f32(row0, tmp1), minor2);
  565. minor2 = JPH_NEON_SHUFFLE_F32x4(minor2, minor2, 2, 3, 0, 1);
  566. tmp1 = vmulq_f32(row0, row1);
  567. tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
  568. minor2 = vaddq_f32(vmulq_f32(row3, tmp1), minor2);
  569. minor3 = vsubq_f32(vmulq_f32(row2, tmp1), minor3);
  570. tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
  571. minor2 = vsubq_f32(vmulq_f32(row3, tmp1), minor2);
  572. minor3 = vsubq_f32(minor3, vmulq_f32(row2, tmp1));
  573. tmp1 = vmulq_f32(row0, row3);
  574. tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
  575. minor1 = vsubq_f32(minor1, vmulq_f32(row2, tmp1));
  576. minor2 = vaddq_f32(vmulq_f32(row1, tmp1), minor2);
  577. tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
  578. minor1 = vaddq_f32(vmulq_f32(row2, tmp1), minor1);
  579. minor2 = vsubq_f32(minor2, vmulq_f32(row1, tmp1));
  580. tmp1 = vmulq_f32(row0, row2);
  581. tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
  582. minor1 = vaddq_f32(vmulq_f32(row3, tmp1), minor1);
  583. minor3 = vsubq_f32(minor3, vmulq_f32(row1, tmp1));
  584. tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
  585. minor1 = vsubq_f32(minor1, vmulq_f32(row3, tmp1));
  586. minor3 = vaddq_f32(vmulq_f32(row1, tmp1), minor3);
  587. Type det = vmulq_f32(row0, minor0);
  588. det = vdupq_n_f32(vaddvq_f32(det));
  589. det = vdivq_f32(vdupq_n_f32(1.0f), det);
  590. Mat44 result;
  591. result.mCol[0].mValue = vmulq_f32(det, minor0);
  592. result.mCol[1].mValue = vmulq_f32(det, minor1);
  593. result.mCol[2].mValue = vmulq_f32(det, minor2);
  594. result.mCol[3].mValue = vmulq_f32(det, minor3);
  595. return result;
  596. #else
  597. float m00 = JPH_EL(0, 0), m10 = JPH_EL(1, 0), m20 = JPH_EL(2, 0), m30 = JPH_EL(3, 0);
  598. float m01 = JPH_EL(0, 1), m11 = JPH_EL(1, 1), m21 = JPH_EL(2, 1), m31 = JPH_EL(3, 1);
  599. float m02 = JPH_EL(0, 2), m12 = JPH_EL(1, 2), m22 = JPH_EL(2, 2), m32 = JPH_EL(3, 2);
  600. float m03 = JPH_EL(0, 3), m13 = JPH_EL(1, 3), m23 = JPH_EL(2, 3), m33 = JPH_EL(3, 3);
  601. float m10211120 = m10 * m21 - m11 * m20;
  602. float m10221220 = m10 * m22 - m12 * m20;
  603. float m10231320 = m10 * m23 - m13 * m20;
  604. float m10311130 = m10 * m31 - m11 * m30;
  605. float m10321230 = m10 * m32 - m12 * m30;
  606. float m10331330 = m10 * m33 - m13 * m30;
  607. float m11221221 = m11 * m22 - m12 * m21;
  608. float m11231321 = m11 * m23 - m13 * m21;
  609. float m11321231 = m11 * m32 - m12 * m31;
  610. float m11331331 = m11 * m33 - m13 * m31;
  611. float m12231322 = m12 * m23 - m13 * m22;
  612. float m12331332 = m12 * m33 - m13 * m32;
  613. float m20312130 = m20 * m31 - m21 * m30;
  614. float m20322230 = m20 * m32 - m22 * m30;
  615. float m20332330 = m20 * m33 - m23 * m30;
  616. float m21322231 = m21 * m32 - m22 * m31;
  617. float m21332331 = m21 * m33 - m23 * m31;
  618. float m22332332 = m22 * m33 - m23 * m32;
  619. Vec4 col0(m11 * m22332332 - m12 * m21332331 + m13 * m21322231, -m10 * m22332332 + m12 * m20332330 - m13 * m20322230, m10 * m21332331 - m11 * m20332330 + m13 * m20312130, -m10 * m21322231 + m11 * m20322230 - m12 * m20312130);
  620. Vec4 col1(-m01 * m22332332 + m02 * m21332331 - m03 * m21322231, m00 * m22332332 - m02 * m20332330 + m03 * m20322230, -m00 * m21332331 + m01 * m20332330 - m03 * m20312130, m00 * m21322231 - m01 * m20322230 + m02 * m20312130);
  621. Vec4 col2(m01 * m12331332 - m02 * m11331331 + m03 * m11321231, -m00 * m12331332 + m02 * m10331330 - m03 * m10321230, m00 * m11331331 - m01 * m10331330 + m03 * m10311130, -m00 * m11321231 + m01 * m10321230 - m02 * m10311130);
  622. Vec4 col3(-m01 * m12231322 + m02 * m11231321 - m03 * m11221221, m00 * m12231322 - m02 * m10231320 + m03 * m10221220, -m00 * m11231321 + m01 * m10231320 - m03 * m10211120, m00 * m11221221 - m01 * m10221220 + m02 * m10211120);
  623. float det = m00 * col0.mF32[0] + m01 * col0.mF32[1] + m02 * col0.mF32[2] + m03 * col0.mF32[3];
  624. return Mat44(col0 / det, col1 / det, col2 / det, col3 / det);
  625. #endif
  626. }
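// In the scalar fallback, each mRCRC temporary such as m10211120 is a 2x2 sub-determinant
// (m10 * m21 - m11 * m20), col0..col3 are the columns of the adjugate (transposed cofactor matrix),
// det is the cofactor expansion along the first row, and dividing the adjugate by det gives the inverse.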
  627. Mat44 Mat44::InversedRotationTranslation() const
  628. {
  629. Mat44 m = Transposed3x3();
  630. m.SetTranslation(-m.Multiply3x3(GetTranslation()));
  631. return m;
  632. }
  633. float Mat44::GetDeterminant3x3() const
  634. {
  635. return GetAxisX().Dot(GetAxisY().Cross(GetAxisZ()));
  636. }
  637. Mat44 Mat44::Adjointed3x3() const
  638. {
  639. // Adapted from Inversed() to remove 4th column and the division by the determinant
  640. // Note: This can be optimized.
  641. JPH_ASSERT(mCol[0][3] == 0.0f);
  642. JPH_ASSERT(mCol[1][3] == 0.0f);
  643. JPH_ASSERT(mCol[2][3] == 0.0f);
  644. #if defined(JPH_USE_SSE)
  645. __m128 tmp1 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(1, 0, 1, 0));
  646. __m128 row1 = _mm_shuffle_ps(mCol[2].mValue, _mm_setzero_ps(), _MM_SHUFFLE(1, 0, 1, 0));
  647. __m128 row0 = _mm_shuffle_ps(tmp1, row1, _MM_SHUFFLE(2, 0, 2, 0));
  648. row1 = _mm_shuffle_ps(row1, tmp1, _MM_SHUFFLE(3, 1, 3, 1));
  649. tmp1 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(3, 2, 3, 2));
  650. __m128 row3 = _mm_shuffle_ps(mCol[2].mValue, _mm_set_ps(1, 0, 0, 0), _MM_SHUFFLE(3, 2, 3, 2));
  651. __m128 row2 = _mm_shuffle_ps(tmp1, row3, _MM_SHUFFLE(2, 0, 2, 0));
  652. row3 = _mm_shuffle_ps(row3, tmp1, _MM_SHUFFLE(3, 1, 3, 1));
  653. tmp1 = _mm_mul_ps(row2, row3);
  654. tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
  655. __m128 minor0 = _mm_mul_ps(row1, tmp1);
  656. __m128 minor1 = _mm_mul_ps(row0, tmp1);
  657. tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
  658. minor0 = _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0);
  659. minor1 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1);
  660. minor1 = _mm_shuffle_ps(minor1, minor1, _MM_SHUFFLE(1, 0, 3, 2));
  661. tmp1 = _mm_mul_ps(row1, row2);
  662. tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
  663. minor0 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0);
  664. tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
  665. minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1));
  666. tmp1 = _mm_mul_ps(_mm_shuffle_ps(row1, row1, _MM_SHUFFLE(1, 0, 3, 2)), row3);
  667. tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
  668. row2 = _mm_shuffle_ps(row2, row2, _MM_SHUFFLE(1, 0, 3, 2));
  669. minor0 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0);
  670. __m128 minor2 = _mm_mul_ps(row0, tmp1);
  671. tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
  672. minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1));
  673. minor2 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2);
  674. minor2 = _mm_shuffle_ps(minor2, minor2, _MM_SHUFFLE(1, 0, 3, 2));
  675. tmp1 = _mm_mul_ps(row0, row1);
  676. tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
  677. minor2 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2);
  678. tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
  679. minor2 = _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2);
  680. tmp1 = _mm_mul_ps(row0, row3);
  681. tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
  682. minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1));
  683. minor2 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2);
  684. tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
  685. minor1 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1);
  686. minor2 = _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1));
  687. tmp1 = _mm_mul_ps(row0, row2);
  688. tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
  689. minor1 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1);
  690. tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
  691. minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1));
  692. Mat44 result;
  693. result.mCol[0].mValue = minor0;
  694. result.mCol[1].mValue = minor1;
  695. result.mCol[2].mValue = minor2;
  696. result.mCol[3] = Vec4(0, 0, 0, 1);
  697. return result;
  698. #elif defined(JPH_USE_NEON)
  699. Type v0001 = vsetq_lane_f32(1, vdupq_n_f32(0), 3);
  700. Type tmp1 = JPH_NEON_SHUFFLE_F32x4(mCol[0].mValue, mCol[1].mValue, 0, 1, 4, 5);
  701. Type row1 = JPH_NEON_SHUFFLE_F32x4(mCol[2].mValue, v0001, 0, 1, 4, 5);
  702. Type row0 = JPH_NEON_SHUFFLE_F32x4(tmp1, row1, 0, 2, 4, 6);
  703. row1 = JPH_NEON_SHUFFLE_F32x4(row1, tmp1, 1, 3, 5, 7);
  704. tmp1 = JPH_NEON_SHUFFLE_F32x4(mCol[0].mValue, mCol[1].mValue, 2, 3, 6, 7);
  705. Type row3 = JPH_NEON_SHUFFLE_F32x4(mCol[2].mValue, v0001, 2, 3, 6, 7);
  706. Type row2 = JPH_NEON_SHUFFLE_F32x4(tmp1, row3, 0, 2, 4, 6);
  707. row3 = JPH_NEON_SHUFFLE_F32x4(row3, tmp1, 1, 3, 5, 7);
  708. tmp1 = vmulq_f32(row2, row3);
  709. tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
  710. Type minor0 = vmulq_f32(row1, tmp1);
  711. Type minor1 = vmulq_f32(row0, tmp1);
  712. tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
  713. minor0 = vsubq_f32(vmulq_f32(row1, tmp1), minor0);
  714. minor1 = vsubq_f32(vmulq_f32(row0, tmp1), minor1);
  715. minor1 = JPH_NEON_SHUFFLE_F32x4(minor1, minor1, 2, 3, 0, 1);
  716. tmp1 = vmulq_f32(row1, row2);
  717. tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
  718. minor0 = vaddq_f32(vmulq_f32(row3, tmp1), minor0);
  719. tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
  720. minor0 = vsubq_f32(minor0, vmulq_f32(row3, tmp1));
  721. tmp1 = JPH_NEON_SHUFFLE_F32x4(row1, row1, 2, 3, 0, 1);
  722. tmp1 = vmulq_f32(tmp1, row3);
  723. tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
  724. row2 = JPH_NEON_SHUFFLE_F32x4(row2, row2, 2, 3, 0, 1);
  725. minor0 = vaddq_f32(vmulq_f32(row2, tmp1), minor0);
  726. Type minor2 = vmulq_f32(row0, tmp1);
  727. tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
  728. minor0 = vsubq_f32(minor0, vmulq_f32(row2, tmp1));
  729. minor2 = vsubq_f32(vmulq_f32(row0, tmp1), minor2);
  730. minor2 = JPH_NEON_SHUFFLE_F32x4(minor2, minor2, 2, 3, 0, 1);
  731. tmp1 = vmulq_f32(row0, row1);
  732. tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
  733. minor2 = vaddq_f32(vmulq_f32(row3, tmp1), minor2);
  734. tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
  735. minor2 = vsubq_f32(vmulq_f32(row3, tmp1), minor2);
  736. tmp1 = vmulq_f32(row0, row3);
  737. tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
  738. minor1 = vsubq_f32(minor1, vmulq_f32(row2, tmp1));
  739. minor2 = vaddq_f32(vmulq_f32(row1, tmp1), minor2);
  740. tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
  741. minor1 = vaddq_f32(vmulq_f32(row2, tmp1), minor1);
  742. minor2 = vsubq_f32(minor2, vmulq_f32(row1, tmp1));
  743. tmp1 = vmulq_f32(row0, row2);
  744. tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
  745. minor1 = vaddq_f32(vmulq_f32(row3, tmp1), minor1);
  746. tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
  747. minor1 = vsubq_f32(minor1, vmulq_f32(row3, tmp1));
  748. Mat44 result;
  749. result.mCol[0].mValue = minor0;
  750. result.mCol[1].mValue = minor1;
  751. result.mCol[2].mValue = minor2;
  752. result.mCol[3].mValue = v0001;
  753. return result;
  754. #else
  755. return Mat44(
  756. Vec4(JPH_EL(1, 1) * JPH_EL(2, 2) - JPH_EL(1, 2) * JPH_EL(2, 1),
  757. JPH_EL(1, 2) * JPH_EL(2, 0) - JPH_EL(1, 0) * JPH_EL(2, 2),
  758. JPH_EL(1, 0) * JPH_EL(2, 1) - JPH_EL(1, 1) * JPH_EL(2, 0),
  759. 0),
  760. Vec4(JPH_EL(0, 2) * JPH_EL(2, 1) - JPH_EL(0, 1) * JPH_EL(2, 2),
  761. JPH_EL(0, 0) * JPH_EL(2, 2) - JPH_EL(0, 2) * JPH_EL(2, 0),
  762. JPH_EL(0, 1) * JPH_EL(2, 0) - JPH_EL(0, 0) * JPH_EL(2, 1),
  763. 0),
  764. Vec4(JPH_EL(0, 1) * JPH_EL(1, 2) - JPH_EL(0, 2) * JPH_EL(1, 1),
  765. JPH_EL(0, 2) * JPH_EL(1, 0) - JPH_EL(0, 0) * JPH_EL(1, 2),
  766. JPH_EL(0, 0) * JPH_EL(1, 1) - JPH_EL(0, 1) * JPH_EL(1, 0),
  767. 0),
  768. Vec4(0, 0, 0, 1));
  769. #endif
  770. }
  771. Mat44 Mat44::Inversed3x3() const
  772. {
  773. // Adapted from Inversed() to remove 4th column
  774. // Note: This can be optimized.
  775. JPH_ASSERT(mCol[0][3] == 0.0f);
  776. JPH_ASSERT(mCol[1][3] == 0.0f);
  777. JPH_ASSERT(mCol[2][3] == 0.0f);
  778. #if defined(JPH_USE_SSE)
  779. __m128 tmp1 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(1, 0, 1, 0));
  780. __m128 row1 = _mm_shuffle_ps(mCol[2].mValue, _mm_setzero_ps(), _MM_SHUFFLE(1, 0, 1, 0));
  781. __m128 row0 = _mm_shuffle_ps(tmp1, row1, _MM_SHUFFLE(2, 0, 2, 0));
  782. row1 = _mm_shuffle_ps(row1, tmp1, _MM_SHUFFLE(3, 1, 3, 1));
  783. tmp1 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(3, 2, 3, 2));
  784. __m128 row3 = _mm_shuffle_ps(mCol[2].mValue, _mm_set_ps(1, 0, 0, 0), _MM_SHUFFLE(3, 2, 3, 2));
  785. __m128 row2 = _mm_shuffle_ps(tmp1, row3, _MM_SHUFFLE(2, 0, 2, 0));
  786. row3 = _mm_shuffle_ps(row3, tmp1, _MM_SHUFFLE(3, 1, 3, 1));
  787. tmp1 = _mm_mul_ps(row2, row3);
  788. tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
  789. __m128 minor0 = _mm_mul_ps(row1, tmp1);
  790. __m128 minor1 = _mm_mul_ps(row0, tmp1);
  791. tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
  792. minor0 = _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0);
  793. minor1 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1);
  794. minor1 = _mm_shuffle_ps(minor1, minor1, _MM_SHUFFLE(1, 0, 3, 2));
  795. tmp1 = _mm_mul_ps(row1, row2);
  796. tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
  797. minor0 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0);
  798. tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
  799. minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1));
  800. tmp1 = _mm_mul_ps(_mm_shuffle_ps(row1, row1, _MM_SHUFFLE(1, 0, 3, 2)), row3);
  801. tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
  802. row2 = _mm_shuffle_ps(row2, row2, _MM_SHUFFLE(1, 0, 3, 2));
  803. minor0 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0);
  804. __m128 minor2 = _mm_mul_ps(row0, tmp1);
  805. tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
  806. minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1));
  807. minor2 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2);
  808. minor2 = _mm_shuffle_ps(minor2, minor2, _MM_SHUFFLE(1, 0, 3, 2));
  809. tmp1 = _mm_mul_ps(row0, row1);
  810. tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
  811. minor2 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2);
  812. tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
  813. minor2 = _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2);
  814. tmp1 = _mm_mul_ps(row0, row3);
  815. tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
  816. minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1));
  817. minor2 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2);
  818. tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
  819. minor1 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1);
  820. minor2 = _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1));
  821. tmp1 = _mm_mul_ps(row0, row2);
  822. tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
  823. minor1 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1);
  824. tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
  825. minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1));
  826. __m128 det = _mm_mul_ps(row0, minor0);
  827. det = _mm_add_ps(_mm_shuffle_ps(det, det, _MM_SHUFFLE(2, 3, 0, 1)), det); // Original code did (x + z) + (y + w), changed to (x + y) + (z + w) to match the ARM code below and make the result cross platform deterministic
  828. det = _mm_add_ss(_mm_shuffle_ps(det, det, _MM_SHUFFLE(1, 0, 3, 2)), det);
  829. det = _mm_div_ss(_mm_set_ss(1.0f), det);
  830. det = _mm_shuffle_ps(det, det, _MM_SHUFFLE(0, 0, 0, 0));
  831. Mat44 result;
  832. result.mCol[0].mValue = _mm_mul_ps(det, minor0);
  833. result.mCol[1].mValue = _mm_mul_ps(det, minor1);
  834. result.mCol[2].mValue = _mm_mul_ps(det, minor2);
  835. result.mCol[3] = Vec4(0, 0, 0, 1);
  836. return result;
  837. #elif defined(JPH_USE_NEON)
  838. Type v0001 = vsetq_lane_f32(1, vdupq_n_f32(0), 3);
  839. Type tmp1 = JPH_NEON_SHUFFLE_F32x4(mCol[0].mValue, mCol[1].mValue, 0, 1, 4, 5);
  840. Type row1 = JPH_NEON_SHUFFLE_F32x4(mCol[2].mValue, v0001, 0, 1, 4, 5);
  841. Type row0 = JPH_NEON_SHUFFLE_F32x4(tmp1, row1, 0, 2, 4, 6);
  842. row1 = JPH_NEON_SHUFFLE_F32x4(row1, tmp1, 1, 3, 5, 7);
  843. tmp1 = JPH_NEON_SHUFFLE_F32x4(mCol[0].mValue, mCol[1].mValue, 2, 3, 6, 7);
  844. Type row3 = JPH_NEON_SHUFFLE_F32x4(mCol[2].mValue, v0001, 2, 3, 6, 7);
  845. Type row2 = JPH_NEON_SHUFFLE_F32x4(tmp1, row3, 0, 2, 4, 6);
  846. row3 = JPH_NEON_SHUFFLE_F32x4(row3, tmp1, 1, 3, 5, 7);
  847. tmp1 = vmulq_f32(row2, row3);
  848. tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
  849. Type minor0 = vmulq_f32(row1, tmp1);
  850. Type minor1 = vmulq_f32(row0, tmp1);
  851. tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
  852. minor0 = vsubq_f32(vmulq_f32(row1, tmp1), minor0);
  853. minor1 = vsubq_f32(vmulq_f32(row0, tmp1), minor1);
  854. minor1 = JPH_NEON_SHUFFLE_F32x4(minor1, minor1, 2, 3, 0, 1);
  855. tmp1 = vmulq_f32(row1, row2);
  856. tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
  857. minor0 = vaddq_f32(vmulq_f32(row3, tmp1), minor0);
  858. tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
  859. minor0 = vsubq_f32(minor0, vmulq_f32(row3, tmp1));
  860. tmp1 = JPH_NEON_SHUFFLE_F32x4(row1, row1, 2, 3, 0, 1);
  861. tmp1 = vmulq_f32(tmp1, row3);
  862. tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
  863. row2 = JPH_NEON_SHUFFLE_F32x4(row2, row2, 2, 3, 0, 1);
  864. minor0 = vaddq_f32(vmulq_f32(row2, tmp1), minor0);
  865. Type minor2 = vmulq_f32(row0, tmp1);
  866. tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
  867. minor0 = vsubq_f32(minor0, vmulq_f32(row2, tmp1));
  868. minor2 = vsubq_f32(vmulq_f32(row0, tmp1), minor2);
  869. minor2 = JPH_NEON_SHUFFLE_F32x4(minor2, minor2, 2, 3, 0, 1);
  870. tmp1 = vmulq_f32(row0, row1);
  871. tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
  872. minor2 = vaddq_f32(vmulq_f32(row3, tmp1), minor2);
  873. tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
  874. minor2 = vsubq_f32(vmulq_f32(row3, tmp1), minor2);
  875. tmp1 = vmulq_f32(row0, row3);
  876. tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
  877. minor1 = vsubq_f32(minor1, vmulq_f32(row2, tmp1));
  878. minor2 = vaddq_f32(vmulq_f32(row1, tmp1), minor2);
  879. tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
  880. minor1 = vaddq_f32(vmulq_f32(row2, tmp1), minor1);
  881. minor2 = vsubq_f32(minor2, vmulq_f32(row1, tmp1));
  882. tmp1 = vmulq_f32(row0, row2);
  883. tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 1, 0, 3, 2);
  884. minor1 = vaddq_f32(vmulq_f32(row3, tmp1), minor1);
  885. tmp1 = JPH_NEON_SHUFFLE_F32x4(tmp1, tmp1, 2, 3, 0, 1);
  886. minor1 = vsubq_f32(minor1, vmulq_f32(row3, tmp1));
  887. Type det = vmulq_f32(row0, minor0);
  888. det = vdupq_n_f32(vaddvq_f32(det));
  889. det = vdivq_f32(vdupq_n_f32(1.0f), det);
  890. Mat44 result;
  891. result.mCol[0].mValue = vmulq_f32(det, minor0);
  892. result.mCol[1].mValue = vmulq_f32(det, minor1);
  893. result.mCol[2].mValue = vmulq_f32(det, minor2);
  894. result.mCol[3].mValue = v0001;
  895. return result;
  896. #else
  897. float det = GetDeterminant3x3();
  898. return Mat44(
  899. Vec4((JPH_EL(1, 1) * JPH_EL(2, 2) - JPH_EL(1, 2) * JPH_EL(2, 1)) / det,
  900. (JPH_EL(1, 2) * JPH_EL(2, 0) - JPH_EL(1, 0) * JPH_EL(2, 2)) / det,
  901. (JPH_EL(1, 0) * JPH_EL(2, 1) - JPH_EL(1, 1) * JPH_EL(2, 0)) / det,
  902. 0),
  903. Vec4((JPH_EL(0, 2) * JPH_EL(2, 1) - JPH_EL(0, 1) * JPH_EL(2, 2)) / det,
  904. (JPH_EL(0, 0) * JPH_EL(2, 2) - JPH_EL(0, 2) * JPH_EL(2, 0)) / det,
  905. (JPH_EL(0, 1) * JPH_EL(2, 0) - JPH_EL(0, 0) * JPH_EL(2, 1)) / det,
  906. 0),
  907. Vec4((JPH_EL(0, 1) * JPH_EL(1, 2) - JPH_EL(0, 2) * JPH_EL(1, 1)) / det,
  908. (JPH_EL(0, 2) * JPH_EL(1, 0) - JPH_EL(0, 0) * JPH_EL(1, 2)) / det,
  909. (JPH_EL(0, 0) * JPH_EL(1, 1) - JPH_EL(0, 1) * JPH_EL(1, 0)) / det,
  910. 0),
  911. Vec4(0, 0, 0, 1));
  912. #endif
  913. }
  914. Quat Mat44::GetQuaternion() const
  915. {
  916. JPH_ASSERT(mCol[3] == Vec4(0, 0, 0, 1));
  917. float tr = mCol[0].mF32[0] + mCol[1].mF32[1] + mCol[2].mF32[2];
  918. if (tr >= 0.0f)
  919. {
  920. float s = sqrt(tr + 1.0f);
  921. float is = 0.5f / s;
  922. return Quat(
  923. (mCol[1].mF32[2] - mCol[2].mF32[1]) * is,
  924. (mCol[2].mF32[0] - mCol[0].mF32[2]) * is,
  925. (mCol[0].mF32[1] - mCol[1].mF32[0]) * is,
  926. 0.5f * s);
  927. }
  928. else
  929. {
  930. int i = 0;
  931. if (mCol[1].mF32[1] > mCol[0].mF32[0]) i = 1;
  932. if (mCol[2].mF32[2] > mCol[i].mF32[i]) i = 2;
  933. if (i == 0)
  934. {
  935. float s = sqrt(mCol[0].mF32[0] - (mCol[1].mF32[1] + mCol[2].mF32[2]) + 1);
  936. float is = 0.5f / s;
  937. return Quat(
  938. 0.5f * s,
  939. (mCol[1].mF32[0] + mCol[0].mF32[1]) * is,
  940. (mCol[0].mF32[2] + mCol[2].mF32[0]) * is,
  941. (mCol[1].mF32[2] - mCol[2].mF32[1]) * is);
  942. }
  943. else if (i == 1)
  944. {
  945. float s = sqrt(mCol[1].mF32[1] - (mCol[2].mF32[2] + mCol[0].mF32[0]) + 1);
  946. float is = 0.5f / s;
  947. return Quat(
  948. (mCol[1].mF32[0] + mCol[0].mF32[1]) * is,
  949. 0.5f * s,
  950. (mCol[2].mF32[1] + mCol[1].mF32[2]) * is,
  951. (mCol[2].mF32[0] - mCol[0].mF32[2]) * is);
  952. }
  953. else
  954. {
  955. JPH_ASSERT(i == 2);
  956. float s = sqrt(mCol[2].mF32[2] - (mCol[0].mF32[0] + mCol[1].mF32[1]) + 1);
  957. float is = 0.5f / s;
  958. return Quat(
  959. (mCol[0].mF32[2] + mCol[2].mF32[0]) * is,
  960. (mCol[2].mF32[1] + mCol[1].mF32[2]) * is,
  961. 0.5f * s,
  962. (mCol[0].mF32[1] - mCol[1].mF32[0]) * is);
  963. }
  964. }
  965. }
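// This is the standard rotation-matrix-to-quaternion conversion: branching on the trace and on the
// largest diagonal element keeps the value under the square root (and therefore the divisor s) large,
// avoiding the precision loss that a division by a tiny s would cause.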
  966. Mat44 Mat44::sQuatLeftMultiply(QuatArg inQ)
  967. {
  968. return Mat44(
  969. Vec4(1, 1, -1, -1) * inQ.mValue.Swizzle<SWIZZLE_W, SWIZZLE_Z, SWIZZLE_Y, SWIZZLE_X>(),
  970. Vec4(-1, 1, 1, -1) * inQ.mValue.Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_X, SWIZZLE_Y>(),
  971. Vec4(1, -1, 1, -1) * inQ.mValue.Swizzle<SWIZZLE_Y, SWIZZLE_X, SWIZZLE_W, SWIZZLE_Z>(),
  972. inQ.mValue);
  973. }
  974. Mat44 Mat44::sQuatRightMultiply(QuatArg inQ)
  975. {
  976. return Mat44(
  977. Vec4(1, -1, 1, -1) * inQ.mValue.Swizzle<SWIZZLE_W, SWIZZLE_Z, SWIZZLE_Y, SWIZZLE_X>(),
  978. Vec4(1, 1, -1, -1) * inQ.mValue.Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_X, SWIZZLE_Y>(),
  979. Vec4(-1, 1, 1, -1) * inQ.mValue.Swizzle<SWIZZLE_Y, SWIZZLE_X, SWIZZLE_W, SWIZZLE_Z>(),
  980. inQ.mValue);
  981. }
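// These two helpers express quaternion multiplication as a matrix-vector product on the (x, y, z, w)
// components: sQuatLeftMultiply(q) * p.mValue gives the components of q * p, and
// sQuatRightMultiply(q) * p.mValue gives the components of p * q.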
  982. Mat44 Mat44::GetRotation() const
  983. {
  984. JPH_ASSERT(mCol[0][3] == 0.0f);
  985. JPH_ASSERT(mCol[1][3] == 0.0f);
  986. JPH_ASSERT(mCol[2][3] == 0.0f);
  987. return Mat44(mCol[0], mCol[1], mCol[2], Vec4(0, 0, 0, 1));
  988. }
  989. Mat44 Mat44::GetRotationSafe() const
  990. {
  991. #if defined(JPH_USE_AVX512)
  992. return Mat44(_mm_maskz_mov_ps(0b0111, mCol[0].mValue),
  993. _mm_maskz_mov_ps(0b0111, mCol[1].mValue),
  994. _mm_maskz_mov_ps(0b0111, mCol[2].mValue),
  995. Vec4(0, 0, 0, 1));
  996. #elif defined(JPH_USE_SSE4_1)
  997. __m128 zero = _mm_setzero_ps();
  998. return Mat44(_mm_blend_ps(mCol[0].mValue, zero, 8),
  999. _mm_blend_ps(mCol[1].mValue, zero, 8),
  1000. _mm_blend_ps(mCol[2].mValue, zero, 8),
  1001. Vec4(0, 0, 0, 1));
  1002. #elif defined(JPH_USE_NEON)
  1003. return Mat44(vsetq_lane_f32(0, mCol[0].mValue, 3),
  1004. vsetq_lane_f32(0, mCol[1].mValue, 3),
  1005. vsetq_lane_f32(0, mCol[2].mValue, 3),
  1006. Vec4(0, 0, 0, 1));
  1007. #else
  1008. return Mat44(Vec4(mCol[0].mF32[0], mCol[0].mF32[1], mCol[0].mF32[2], 0),
  1009. Vec4(mCol[1].mF32[0], mCol[1].mF32[1], mCol[1].mF32[2], 0),
  1010. Vec4(mCol[2].mF32[0], mCol[2].mF32[1], mCol[2].mF32[2], 0),
  1011. Vec4(0, 0, 0, 1));
  1012. #endif
  1013. }
  1014. void Mat44::SetRotation(Mat44Arg inRotation)
  1015. {
  1016. mCol[0] = inRotation.mCol[0];
  1017. mCol[1] = inRotation.mCol[1];
  1018. mCol[2] = inRotation.mCol[2];
  1019. }
  1020. Mat44 Mat44::PreTranslated(Vec3Arg inTranslation) const
  1021. {
  1022. return Mat44(mCol[0], mCol[1], mCol[2], Vec4(GetTranslation() + Multiply3x3(inTranslation), 1));
  1023. }
  1024. Mat44 Mat44::PostTranslated(Vec3Arg inTranslation) const
  1025. {
  1026. return Mat44(mCol[0], mCol[1], mCol[2], Vec4(GetTranslation() + inTranslation, 1));
  1027. }
  1028. Mat44 Mat44::PreScaled(Vec3Arg inScale) const
  1029. {
  1030. return Mat44(inScale.GetX() * mCol[0], inScale.GetY() * mCol[1], inScale.GetZ() * mCol[2], mCol[3]);
  1031. }
  1032. Mat44 Mat44::PostScaled(Vec3Arg inScale) const
  1033. {
  1034. Vec4 scale(inScale, 1);
  1035. return Mat44(scale * mCol[0], scale * mCol[1], scale * mCol[2], scale * mCol[3]);
  1036. }
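// The four functions above compose as follows (t = inTranslation, s = inScale):
//   PreTranslated(t)  == *this * sTranslation(t)    PostTranslated(t) == sTranslation(t) * *this
//   PreScaled(s)      == *this * sScale(s)          PostScaled(s)     == sScale(s) * *this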
  1037. Mat44 Mat44::Decompose(Vec3 &outScale) const
  1038. {
  1039. // Start the modified Gram-Schmidt algorithm
  1040. // X axis will just be normalized
  1041. Vec3 x = GetAxisX();
  1042. // Make Y axis perpendicular to X
  1043. Vec3 y = GetAxisY();
  1044. float x_dot_x = x.LengthSq();
  1045. y -= (x.Dot(y) / x_dot_x) * x;
  1046. // Make Z axis perpendicular to X
  1047. Vec3 z = GetAxisZ();
  1048. z -= (x.Dot(z) / x_dot_x) * x;
  1049. // Make Z axis perpendicular to Y
  1050. float y_dot_y = y.LengthSq();
  1051. z -= (y.Dot(z) / y_dot_y) * y;
  1052. // Determine the scale
  1053. float z_dot_z = z.LengthSq();
  1054. outScale = Vec3(x_dot_x, y_dot_y, z_dot_z).Sqrt();
  1055. // If the resulting x, y and z vectors don't form a right-handed matrix, flip the z axis.
  1056. if (x.Cross(y).Dot(z) < 0.0f)
  1057. outScale.SetZ(-outScale.GetZ());
  1058. // Determine the rotation and translation
  1059. return Mat44(Vec4(x / outScale.GetX(), 0), Vec4(y / outScale.GetY(), 0), Vec4(z / outScale.GetZ(), 0), GetColumn4(3));
  1060. }
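// Usage sketch (illustrative names): for a matrix built from translation * rotation * scale,
// Decompose splits off the scale so that, up to floating point error:
//
//   Vec3 scale;
//   Mat44 rotation_translation = m.Decompose(scale);
//   // m is approximately rotation_translation * Mat44::sScale(scale)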
  1061. #undef JPH_EL
  1062. JPH_NAMESPACE_END