Mat44.inl

// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT

#pragma once

#include <Jolt/Math/Vec3.h>
#include <Jolt/Math/Vec4.h>
#include <Jolt/Math/Quat.h>

JPH_NAMESPACE_BEGIN

Mat44::Mat44(Vec4Arg inC1, Vec4Arg inC2, Vec4Arg inC3, Vec4Arg inC4) :
	mCol { inC1, inC2, inC3, inC4 }
{
}

Mat44::Mat44(Type inC1, Type inC2, Type inC3, Type inC4) :
	mCol { inC1, inC2, inC3, inC4 }
{
}

Mat44 Mat44::sZero()
{
	return Mat44(Vec4::sZero(), Vec4::sZero(), Vec4::sZero(), Vec4::sZero());
}

Mat44 Mat44::sIdentity()
{
	return Mat44(Vec4(1, 0, 0, 0), Vec4(0, 1, 0, 0), Vec4(0, 0, 1, 0), Vec4(0, 0, 0, 1));
}

Mat44 Mat44::sNaN()
{
	return Mat44(Vec4::sNaN(), Vec4::sNaN(), Vec4::sNaN(), Vec4::sNaN());
}

Mat44 Mat44::sLoadFloat4x4(const Float4 *inV)
{
	Mat44 result;
	for (int c = 0; c < 4; ++c)
		result.mCol[c] = Vec4::sLoadFloat4(inV + c);
	return result;
}

Mat44 Mat44::sLoadFloat4x4Aligned(const Float4 *inV)
{
	Mat44 result;
	for (int c = 0; c < 4; ++c)
		result.mCol[c] = Vec4::sLoadFloat4Aligned(inV + c);
	return result;
}

Mat44 Mat44::sRotationX(float inX)
{
	Vec4 sv, cv;
	Vec4::sReplicate(inX).SinCos(sv, cv);
	float s = sv.GetX(), c = cv.GetX();
	return Mat44(Vec4(1, 0, 0, 0), Vec4(0, c, s, 0), Vec4(0, -s, c, 0), Vec4(0, 0, 0, 1));
}

Mat44 Mat44::sRotationY(float inY)
{
	Vec4 sv, cv;
	Vec4::sReplicate(inY).SinCos(sv, cv);
	float s = sv.GetX(), c = cv.GetX();
	return Mat44(Vec4(c, 0, -s, 0), Vec4(0, 1, 0, 0), Vec4(s, 0, c, 0), Vec4(0, 0, 0, 1));
}

Mat44 Mat44::sRotationZ(float inZ)
{
	Vec4 sv, cv;
	Vec4::sReplicate(inZ).SinCos(sv, cv);
	float s = sv.GetX(), c = cv.GetX();
	return Mat44(Vec4(c, s, 0, 0), Vec4(-s, c, 0, 0), Vec4(0, 0, 1, 0), Vec4(0, 0, 0, 1));
}

Mat44 Mat44::sRotation(QuatArg inQuat)
{
	JPH_ASSERT(inQuat.IsNormalized());

	// See: https://en.wikipedia.org/wiki/Quaternions_and_spatial_rotation section 'Quaternion-derived rotation matrix'
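	// For reference, the columns computed below follow the standard quaternion-derived rotation matrix:
	//   col0 = (1 - 2y^2 - 2z^2, 2xy + 2zw,       2xz - 2yw,       0)
	//   col1 = (2xy - 2zw,       1 - 2x^2 - 2z^2, 2yz + 2xw,       0)
	//   col2 = (2xz + 2yw,       2yz - 2xw,       1 - 2x^2 - 2y^2, 0)
	//   col3 = (0, 0, 0, 1)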
#ifdef JPH_USE_SSE4_1
	__m128 xyzw = inQuat.mValue.mValue;
	__m128 two_xyzw = _mm_add_ps(xyzw, xyzw);
	__m128 yzxw = _mm_shuffle_ps(xyzw, xyzw, _MM_SHUFFLE(3, 0, 2, 1));
	__m128 two_yzxw = _mm_add_ps(yzxw, yzxw);
	__m128 zxyw = _mm_shuffle_ps(xyzw, xyzw, _MM_SHUFFLE(3, 1, 0, 2));
	__m128 two_zxyw = _mm_add_ps(zxyw, zxyw);
	__m128 wwww = _mm_shuffle_ps(xyzw, xyzw, _MM_SHUFFLE(3, 3, 3, 3));
	__m128 diagonal = _mm_sub_ps(_mm_sub_ps(_mm_set1_ps(1.0f), _mm_mul_ps(two_yzxw, yzxw)), _mm_mul_ps(two_zxyw, zxyw)); // (1 - 2 y^2 - 2 z^2, 1 - 2 x^2 - 2 z^2, 1 - 2 x^2 - 2 y^2, 1 - 4 w^2)
	__m128 plus = _mm_add_ps(_mm_mul_ps(two_xyzw, zxyw), _mm_mul_ps(two_yzxw, wwww)); // 2 * (xz + yw, xy + zw, yz + xw, ww)
	__m128 minus = _mm_sub_ps(_mm_mul_ps(two_yzxw, xyzw), _mm_mul_ps(two_zxyw, wwww)); // 2 * (xy - zw, yz - xw, xz - yw, 0)
	// Workaround for compiler changing _mm_sub_ps(_mm_mul_ps(...), ...) into a fused multiply sub instruction, resulting in w not being 0
	// There doesn't appear to be a reliable way to turn this off in Clang
	minus = _mm_insert_ps(minus, minus, 0b1000);
	__m128 col0 = _mm_blend_ps(_mm_blend_ps(plus, diagonal, 0b0001), minus, 0b1100); // (1 - 2 y^2 - 2 z^2, 2 xy + 2 zw, 2 xz - 2 yw, 0)
	__m128 col1 = _mm_blend_ps(_mm_blend_ps(diagonal, minus, 0b1001), plus, 0b0100); // (2 xy - 2 zw, 1 - 2 x^2 - 2 z^2, 2 yz + 2 xw, 0)
	__m128 col2 = _mm_blend_ps(_mm_blend_ps(minus, plus, 0b0001), diagonal, 0b0100); // (2 xz + 2 yw, 2 yz - 2 xw, 1 - 2 x^2 - 2 y^2, 0)
	__m128 col3 = _mm_set_ps(1, 0, 0, 0);
	return Mat44(col0, col1, col2, col3);
#else
	float x = inQuat.GetX();
	float y = inQuat.GetY();
	float z = inQuat.GetZ();
	float w = inQuat.GetW();
	float tx = x + x; // Note: Using x + x instead of 2.0f * x to force this function to return the same value as the SSE4.1 version across platforms.
	float ty = y + y;
	float tz = z + z;
	float xx = tx * x;
	float yy = ty * y;
	float zz = tz * z;
	float xy = tx * y;
	float xz = tx * z;
	float xw = tx * w;
	float yz = ty * z;
	float yw = ty * w;
	float zw = tz * w;
	return Mat44(Vec4((1.0f - yy) - zz, xy + zw, xz - yw, 0.0f), // Note: Added extra brackets to force this function to return the same value as the SSE4.1 version across platforms.
		Vec4(xy - zw, (1.0f - zz) - xx, yz + xw, 0.0f),
		Vec4(xz + yw, yz - xw, (1.0f - xx) - yy, 0.0f),
		Vec4(0.0f, 0.0f, 0.0f, 1.0f));
#endif
}

Mat44 Mat44::sRotation(Vec3Arg inAxis, float inAngle)
{
	return sRotation(Quat::sRotation(inAxis, inAngle));
}

Mat44 Mat44::sTranslation(Vec3Arg inV)
{
	return Mat44(Vec4(1, 0, 0, 0), Vec4(0, 1, 0, 0), Vec4(0, 0, 1, 0), Vec4(inV, 1));
}

Mat44 Mat44::sRotationTranslation(QuatArg inR, Vec3Arg inT)
{
	Mat44 m = sRotation(inR);
	m.SetTranslation(inT);
	return m;
}

Mat44 Mat44::sInverseRotationTranslation(QuatArg inR, Vec3Arg inT)
{
	Mat44 m = sRotation(inR.Conjugated());
	m.SetTranslation(-m.Multiply3x3(inT));
	return m;
}

Mat44 Mat44::sScale(float inScale)
{
	return Mat44(Vec4(inScale, 0, 0, 0), Vec4(0, inScale, 0, 0), Vec4(0, 0, inScale, 0), Vec4(0, 0, 0, 1));
}

Mat44 Mat44::sScale(Vec3Arg inV)
{
	return Mat44(Vec4(inV.GetX(), 0, 0, 0), Vec4(0, inV.GetY(), 0, 0), Vec4(0, 0, inV.GetZ(), 0), Vec4(0, 0, 0, 1));
}
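// Note (added for clarity): the 3x3 part of sOuterProduct is the outer product inV1 * inV2^T,
// so result.Multiply3x3(v) equals inV1 * inV2.Dot(v) for any Vec3 v.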
Mat44 Mat44::sOuterProduct(Vec3Arg inV1, Vec3Arg inV2)
{
	Vec4 v1(inV1, 0);
	return Mat44(v1 * inV2.SplatX(), v1 * inV2.SplatY(), v1 * inV2.SplatZ(), Vec4(0, 0, 0, 1));
}
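// Note (added for clarity): sCrossProduct builds the skew-symmetric matrix of inV,
// so result.Multiply3x3(v) equals inV.Cross(v) for any Vec3 v.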
Mat44 Mat44::sCrossProduct(Vec3Arg inV)
{
#ifdef JPH_USE_SSE4_1
	// Zero out the W component
	__m128 zero = _mm_setzero_ps();
	__m128 v = _mm_blend_ps(inV.mValue, zero, 0b1000);
	// Negate
	__m128 min_v = _mm_sub_ps(zero, v);
	return Mat44(
		_mm_shuffle_ps(v, min_v, _MM_SHUFFLE(3, 1, 2, 3)), // [0, z, -y, 0]
		_mm_shuffle_ps(min_v, v, _MM_SHUFFLE(3, 0, 3, 2)), // [-z, 0, x, 0]
		_mm_blend_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 1)), _mm_shuffle_ps(min_v, min_v, _MM_SHUFFLE(3, 3, 0, 3)), 0b0010), // [y, -x, 0, 0]
		Vec4(0, 0, 0, 1));
#else
	float x = inV.GetX();
	float y = inV.GetY();
	float z = inV.GetZ();
	return Mat44(
		Vec4(0, z, -y, 0),
		Vec4(-z, 0, x, 0),
		Vec4(y, -x, 0, 0),
		Vec4(0, 0, 0, 1));
#endif
}
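// Note (added for clarity): sLookAt returns a view matrix, i.e. the inverse of a right-handed camera
// transform located at inPos, with -Z pointing from inPos toward inTarget and +Y as the up direction closest to inUp.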
Mat44 Mat44::sLookAt(Vec3Arg inPos, Vec3Arg inTarget, Vec3Arg inUp)
{
	Vec3 direction = (inTarget - inPos).NormalizedOr(-Vec3::sAxisZ());
	Vec3 right = direction.Cross(inUp).NormalizedOr(Vec3::sAxisX());
	Vec3 up = right.Cross(direction);
	return Mat44(Vec4(right, 0), Vec4(up, 0), Vec4(-direction, 0), Vec4(inPos, 1)).InversedRotationTranslation();
}

bool Mat44::operator == (Mat44Arg inM2) const
{
	return UVec4::sAnd(
		UVec4::sAnd(Vec4::sEquals(mCol[0], inM2.mCol[0]), Vec4::sEquals(mCol[1], inM2.mCol[1])),
		UVec4::sAnd(Vec4::sEquals(mCol[2], inM2.mCol[2]), Vec4::sEquals(mCol[3], inM2.mCol[3]))
	).TestAllTrue();
}

bool Mat44::IsClose(Mat44Arg inM2, float inMaxDistSq) const
{
	for (int i = 0; i < 4; ++i)
		if (!mCol[i].IsClose(inM2.mCol[i], inMaxDistSq))
			return false;
	return true;
}

Mat44 Mat44::operator * (Mat44Arg inM) const
{
	Mat44 result;
#if defined(JPH_USE_SSE)
	for (int i = 0; i < 4; ++i)
	{
		__m128 c = inM.mCol[i].mValue;
		__m128 t = _mm_mul_ps(mCol[0].mValue, _mm_shuffle_ps(c, c, _MM_SHUFFLE(0, 0, 0, 0)));
		t = _mm_add_ps(t, _mm_mul_ps(mCol[1].mValue, _mm_shuffle_ps(c, c, _MM_SHUFFLE(1, 1, 1, 1))));
		t = _mm_add_ps(t, _mm_mul_ps(mCol[2].mValue, _mm_shuffle_ps(c, c, _MM_SHUFFLE(2, 2, 2, 2))));
		t = _mm_add_ps(t, _mm_mul_ps(mCol[3].mValue, _mm_shuffle_ps(c, c, _MM_SHUFFLE(3, 3, 3, 3))));
		result.mCol[i].mValue = t;
	}
#elif defined(JPH_USE_NEON)
	for (int i = 0; i < 4; ++i)
	{
		Type c = inM.mCol[i].mValue;
		Type t = vmulq_f32(mCol[0].mValue, vdupq_laneq_f32(c, 0));
		t = vmlaq_f32(t, mCol[1].mValue, vdupq_laneq_f32(c, 1));
		t = vmlaq_f32(t, mCol[2].mValue, vdupq_laneq_f32(c, 2));
		t = vmlaq_f32(t, mCol[3].mValue, vdupq_laneq_f32(c, 3));
		result.mCol[i].mValue = t;
	}
#else
#error Unsupported CPU architecture
#endif
	return result;
}

Vec3 Mat44::operator * (Vec3Arg inV) const
{
#if defined(JPH_USE_SSE)
	__m128 t = _mm_mul_ps(mCol[0].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(0, 0, 0, 0)));
	t = _mm_add_ps(t, _mm_mul_ps(mCol[1].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(1, 1, 1, 1))));
	t = _mm_add_ps(t, _mm_mul_ps(mCol[2].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(2, 2, 2, 2))));
	t = _mm_add_ps(t, mCol[3].mValue);
	return Vec3::sFixW(t);
#elif defined(JPH_USE_NEON)
	Type t = vmulq_f32(mCol[0].mValue, vdupq_laneq_f32(inV.mValue, 0));
	t = vmlaq_f32(t, mCol[1].mValue, vdupq_laneq_f32(inV.mValue, 1));
	t = vmlaq_f32(t, mCol[2].mValue, vdupq_laneq_f32(inV.mValue, 2));
	t = vaddq_f32(t, mCol[3].mValue); // Don't combine this with the first mul into a fused multiply add, causes precision issues
	return Vec3::sFixW(t);
#else
#error Unsupported CPU architecture
#endif
}

Vec4 Mat44::operator * (Vec4Arg inV) const
{
#if defined(JPH_USE_SSE)
	__m128 t = _mm_mul_ps(mCol[0].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(0, 0, 0, 0)));
	t = _mm_add_ps(t, _mm_mul_ps(mCol[1].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(1, 1, 1, 1))));
	t = _mm_add_ps(t, _mm_mul_ps(mCol[2].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(2, 2, 2, 2))));
	t = _mm_add_ps(t, _mm_mul_ps(mCol[3].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(3, 3, 3, 3))));
	return t;
#elif defined(JPH_USE_NEON)
	Type t = vmulq_f32(mCol[0].mValue, vdupq_laneq_f32(inV.mValue, 0));
	t = vmlaq_f32(t, mCol[1].mValue, vdupq_laneq_f32(inV.mValue, 1));
	t = vmlaq_f32(t, mCol[2].mValue, vdupq_laneq_f32(inV.mValue, 2));
	t = vmlaq_f32(t, mCol[3].mValue, vdupq_laneq_f32(inV.mValue, 3));
	return t;
#else
#error Unsupported CPU architecture
#endif
}

Vec3 Mat44::Multiply3x3(Vec3Arg inV) const
{
#if defined(JPH_USE_SSE)
	__m128 t = _mm_mul_ps(mCol[0].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(0, 0, 0, 0)));
	t = _mm_add_ps(t, _mm_mul_ps(mCol[1].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(1, 1, 1, 1))));
	t = _mm_add_ps(t, _mm_mul_ps(mCol[2].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(2, 2, 2, 2))));
	return Vec3::sFixW(t);
#elif defined(JPH_USE_NEON)
	Type t = vmulq_f32(mCol[0].mValue, vdupq_laneq_f32(inV.mValue, 0));
	t = vmlaq_f32(t, mCol[1].mValue, vdupq_laneq_f32(inV.mValue, 1));
	t = vmlaq_f32(t, mCol[2].mValue, vdupq_laneq_f32(inV.mValue, 2));
	return Vec3::sFixW(t);
#else
#error Unsupported CPU architecture
#endif
}

Vec3 Mat44::Multiply3x3Transposed(Vec3Arg inV) const
{
#if defined(JPH_USE_SSE4_1)
	__m128 x = _mm_dp_ps(mCol[0].mValue, inV.mValue, 0x7f);
	__m128 y = _mm_dp_ps(mCol[1].mValue, inV.mValue, 0x7f);
	__m128 xy = _mm_blend_ps(x, y, 0b0010);
	__m128 z = _mm_dp_ps(mCol[2].mValue, inV.mValue, 0x7f);
	__m128 xyzz = _mm_blend_ps(xy, z, 0b1100);
	return xyzz;
#else
	return Transposed3x3().Multiply3x3(inV);
#endif
}

Mat44 Mat44::Multiply3x3(Mat44Arg inM) const
{
	JPH_ASSERT(mCol[0][3] == 0.0f);
	JPH_ASSERT(mCol[1][3] == 0.0f);
	JPH_ASSERT(mCol[2][3] == 0.0f);
	Mat44 result;
#if defined(JPH_USE_SSE)
	for (int i = 0; i < 3; ++i)
	{
		__m128 c = inM.mCol[i].mValue;
		__m128 t = _mm_mul_ps(mCol[0].mValue, _mm_shuffle_ps(c, c, _MM_SHUFFLE(0, 0, 0, 0)));
		t = _mm_add_ps(t, _mm_mul_ps(mCol[1].mValue, _mm_shuffle_ps(c, c, _MM_SHUFFLE(1, 1, 1, 1))));
		t = _mm_add_ps(t, _mm_mul_ps(mCol[2].mValue, _mm_shuffle_ps(c, c, _MM_SHUFFLE(2, 2, 2, 2))));
		result.mCol[i].mValue = t;
	}
#elif defined(JPH_USE_NEON)
	for (int i = 0; i < 3; ++i)
	{
		Type c = inM.mCol[i].mValue;
		Type t = vmulq_f32(mCol[0].mValue, vdupq_laneq_f32(c, 0));
		t = vmlaq_f32(t, mCol[1].mValue, vdupq_laneq_f32(c, 1));
		t = vmlaq_f32(t, mCol[2].mValue, vdupq_laneq_f32(c, 2));
		result.mCol[i].mValue = t;
	}
#else
#error Unsupported CPU architecture
#endif
	result.mCol[3] = Vec4(0, 0, 0, 1);
	return result;
}
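// Note (added for clarity): computes the 3x3 product Transposed3x3() * inM (the left operand, this matrix, is transposed);
// the 4th column of the result is set to (0, 0, 0, 1).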
Mat44 Mat44::Multiply3x3LeftTransposed(Mat44Arg inM) const
{
	// Transpose left hand side
	Mat44 trans = Transposed3x3();
	// Do 3x3 matrix multiply
	Mat44 result;
	result.mCol[0] = trans.mCol[0] * inM.mCol[0].SplatX() + trans.mCol[1] * inM.mCol[0].SplatY() + trans.mCol[2] * inM.mCol[0].SplatZ();
	result.mCol[1] = trans.mCol[0] * inM.mCol[1].SplatX() + trans.mCol[1] * inM.mCol[1].SplatY() + trans.mCol[2] * inM.mCol[1].SplatZ();
	result.mCol[2] = trans.mCol[0] * inM.mCol[2].SplatX() + trans.mCol[1] * inM.mCol[2].SplatY() + trans.mCol[2] * inM.mCol[2].SplatZ();
	result.mCol[3] = Vec4(0, 0, 0, 1);
	return result;
}
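// Note (added for clarity): computes the 3x3 product this * inM.Transposed3x3() (the right operand is transposed);
// the 4th column of the result is set to (0, 0, 0, 1).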
Mat44 Mat44::Multiply3x3RightTransposed(Mat44Arg inM) const
{
	JPH_ASSERT(mCol[0][3] == 0.0f);
	JPH_ASSERT(mCol[1][3] == 0.0f);
	JPH_ASSERT(mCol[2][3] == 0.0f);
	Mat44 result;
	result.mCol[0] = mCol[0] * inM.mCol[0].SplatX() + mCol[1] * inM.mCol[1].SplatX() + mCol[2] * inM.mCol[2].SplatX();
	result.mCol[1] = mCol[0] * inM.mCol[0].SplatY() + mCol[1] * inM.mCol[1].SplatY() + mCol[2] * inM.mCol[2].SplatY();
	result.mCol[2] = mCol[0] * inM.mCol[0].SplatZ() + mCol[1] * inM.mCol[1].SplatZ() + mCol[2] * inM.mCol[2].SplatZ();
	result.mCol[3] = Vec4(0, 0, 0, 1);
	return result;
}

Mat44 Mat44::operator * (float inV) const
{
	Vec4 multiplier = Vec4::sReplicate(inV);
	Mat44 result;
	for (int c = 0; c < 4; ++c)
		result.mCol[c] = mCol[c] * multiplier;
	return result;
}

Mat44 &Mat44::operator *= (float inV)
{
	for (int c = 0; c < 4; ++c)
		mCol[c] *= inV;
	return *this;
}

Mat44 Mat44::operator + (Mat44Arg inM) const
{
	Mat44 result;
	for (int i = 0; i < 4; ++i)
		result.mCol[i] = mCol[i] + inM.mCol[i];
	return result;
}

Mat44 Mat44::operator - () const
{
	Mat44 result;
	for (int i = 0; i < 4; ++i)
		result.mCol[i] = -mCol[i];
	return result;
}

Mat44 Mat44::operator - (Mat44Arg inM) const
{
	Mat44 result;
	for (int i = 0; i < 4; ++i)
		result.mCol[i] = mCol[i] - inM.mCol[i];
	return result;
}

Mat44 &Mat44::operator += (Mat44Arg inM)
{
	for (int c = 0; c < 4; ++c)
		mCol[c] += inM.mCol[c];
	return *this;
}

void Mat44::StoreFloat4x4(Float4 *outV) const
{
	for (int c = 0; c < 4; ++c)
		mCol[c].StoreFloat4(outV + c);
}

Mat44 Mat44::Transposed() const
{
#if defined(JPH_USE_SSE)
	__m128 tmp1 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(1, 0, 1, 0));
	__m128 tmp3 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(3, 2, 3, 2));
	__m128 tmp2 = _mm_shuffle_ps(mCol[2].mValue, mCol[3].mValue, _MM_SHUFFLE(1, 0, 1, 0));
	__m128 tmp4 = _mm_shuffle_ps(mCol[2].mValue, mCol[3].mValue, _MM_SHUFFLE(3, 2, 3, 2));
	Mat44 result;
	result.mCol[0].mValue = _mm_shuffle_ps(tmp1, tmp2, _MM_SHUFFLE(2, 0, 2, 0));
	result.mCol[1].mValue = _mm_shuffle_ps(tmp1, tmp2, _MM_SHUFFLE(3, 1, 3, 1));
	result.mCol[2].mValue = _mm_shuffle_ps(tmp3, tmp4, _MM_SHUFFLE(2, 0, 2, 0));
	result.mCol[3].mValue = _mm_shuffle_ps(tmp3, tmp4, _MM_SHUFFLE(3, 1, 3, 1));
	return result;
#elif defined(JPH_USE_NEON)
	float32x4x2_t tmp1 = vzipq_f32(mCol[0].mValue, mCol[2].mValue);
	float32x4x2_t tmp2 = vzipq_f32(mCol[1].mValue, mCol[3].mValue);
	float32x4x2_t tmp3 = vzipq_f32(tmp1.val[0], tmp2.val[0]);
	float32x4x2_t tmp4 = vzipq_f32(tmp1.val[1], tmp2.val[1]);
	Mat44 result;
	result.mCol[0].mValue = tmp3.val[0];
	result.mCol[1].mValue = tmp3.val[1];
	result.mCol[2].mValue = tmp4.val[0];
	result.mCol[3].mValue = tmp4.val[1];
	return result;
#else
#error Unsupported CPU architecture
#endif
}

Mat44 Mat44::Transposed3x3() const
{
#if defined(JPH_USE_SSE)
	__m128 zero = _mm_setzero_ps();
	__m128 tmp1 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(1, 0, 1, 0));
	__m128 tmp3 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(3, 2, 3, 2));
	__m128 tmp2 = _mm_shuffle_ps(mCol[2].mValue, zero, _MM_SHUFFLE(1, 0, 1, 0));
	__m128 tmp4 = _mm_shuffle_ps(mCol[2].mValue, zero, _MM_SHUFFLE(3, 2, 3, 2));
	Mat44 result;
	result.mCol[0].mValue = _mm_shuffle_ps(tmp1, tmp2, _MM_SHUFFLE(2, 0, 2, 0));
	result.mCol[1].mValue = _mm_shuffle_ps(tmp1, tmp2, _MM_SHUFFLE(3, 1, 3, 1));
	result.mCol[2].mValue = _mm_shuffle_ps(tmp3, tmp4, _MM_SHUFFLE(2, 0, 2, 0));
#elif defined(JPH_USE_NEON)
	float32x4x2_t tmp1 = vzipq_f32(mCol[0].mValue, mCol[2].mValue);
	float32x4x2_t tmp2 = vzipq_f32(mCol[1].mValue, vdupq_n_f32(0));
	float32x4x2_t tmp3 = vzipq_f32(tmp1.val[0], tmp2.val[0]);
	float32x4x2_t tmp4 = vzipq_f32(tmp1.val[1], tmp2.val[1]);
	Mat44 result;
	result.mCol[0].mValue = tmp3.val[0];
	result.mCol[1].mValue = tmp3.val[1];
	result.mCol[2].mValue = tmp4.val[0];
#else
#error Unsupported CPU architecture
#endif
	result.mCol[3] = Vec4(0, 0, 0, 1);
	return result;
}

Mat44 Mat44::Inversed() const
{
#if defined(JPH_USE_SSE)
	// Algorithm from: http://download.intel.com/design/PentiumIII/sml/24504301.pdf
	// Streaming SIMD Extensions - Inverse of 4x4 Matrix
	// Adapted to load data using _mm_shuffle_ps instead of loading from memory
	// Replaced _mm_rcp_ps with _mm_div_ps for better accuracy
	__m128 tmp1 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(1, 0, 1, 0));
	__m128 row1 = _mm_shuffle_ps(mCol[2].mValue, mCol[3].mValue, _MM_SHUFFLE(1, 0, 1, 0));
	__m128 row0 = _mm_shuffle_ps(tmp1, row1, _MM_SHUFFLE(2, 0, 2, 0));
	row1 = _mm_shuffle_ps(row1, tmp1, _MM_SHUFFLE(3, 1, 3, 1));
	tmp1 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(3, 2, 3, 2));
	__m128 row3 = _mm_shuffle_ps(mCol[2].mValue, mCol[3].mValue, _MM_SHUFFLE(3, 2, 3, 2));
	__m128 row2 = _mm_shuffle_ps(tmp1, row3, _MM_SHUFFLE(2, 0, 2, 0));
	row3 = _mm_shuffle_ps(row3, tmp1, _MM_SHUFFLE(3, 1, 3, 1));
	tmp1 = _mm_mul_ps(row2, row3);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
	__m128 minor0 = _mm_mul_ps(row1, tmp1);
	__m128 minor1 = _mm_mul_ps(row0, tmp1);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
	minor0 = _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0);
	minor1 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1);
	minor1 = _mm_shuffle_ps(minor1, minor1, _MM_SHUFFLE(1, 0, 3, 2));
	tmp1 = _mm_mul_ps(row1, row2);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
	minor0 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0);
	__m128 minor3 = _mm_mul_ps(row0, tmp1);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
	minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1));
	minor3 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor3);
	minor3 = _mm_shuffle_ps(minor3, minor3, _MM_SHUFFLE(1, 0, 3, 2));
	tmp1 = _mm_mul_ps(_mm_shuffle_ps(row1, row1, _MM_SHUFFLE(1, 0, 3, 2)), row3);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
	row2 = _mm_shuffle_ps(row2, row2, _MM_SHUFFLE(1, 0, 3, 2));
	minor0 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0);
	__m128 minor2 = _mm_mul_ps(row0, tmp1);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
	minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1));
	minor2 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2);
	minor2 = _mm_shuffle_ps(minor2, minor2, _MM_SHUFFLE(1, 0, 3, 2));
	tmp1 = _mm_mul_ps(row0, row1);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
	minor2 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2);
	minor3 = _mm_sub_ps(_mm_mul_ps(row2, tmp1), minor3);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
	minor2 = _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2);
	minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row2, tmp1));
	tmp1 = _mm_mul_ps(row0, row3);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
	minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1));
	minor2 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
	minor1 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1);
	minor2 = _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1));
	tmp1 = _mm_mul_ps(row0, row2);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
	minor1 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1);
	minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row1, tmp1));
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
	minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1));
	minor3 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor3);
	__m128 det = _mm_mul_ps(row0, minor0);
	det = _mm_add_ps(_mm_shuffle_ps(det, det, _MM_SHUFFLE(2, 3, 0, 1)), det); // Original code did (x + z) + (y + w), changed to (x + y) + (z + w) to match the ARM code below and make the result cross platform deterministic
	det = _mm_add_ss(_mm_shuffle_ps(det, det, _MM_SHUFFLE(1, 0, 3, 2)), det);
	det = _mm_div_ss(_mm_set_ss(1.0f), det);
	det = _mm_shuffle_ps(det, det, _MM_SHUFFLE(0, 0, 0, 0));
	Mat44 result;
	result.mCol[0].mValue = _mm_mul_ps(det, minor0);
	result.mCol[1].mValue = _mm_mul_ps(det, minor1);
	result.mCol[2].mValue = _mm_mul_ps(det, minor2);
	result.mCol[3].mValue = _mm_mul_ps(det, minor3);
	return result;
#elif defined(JPH_USE_NEON)
	// Adapted from the SSE version; there are surprisingly few articles about efficient ways of calculating an inverse for ARM on the internet
	Type tmp1 = __builtin_shufflevector(mCol[0].mValue, mCol[1].mValue, 0, 1, 4, 5);
	Type row1 = __builtin_shufflevector(mCol[2].mValue, mCol[3].mValue, 0, 1, 4, 5);
	Type row0 = __builtin_shufflevector(tmp1, row1, 0, 2, 4, 6);
	row1 = __builtin_shufflevector(row1, tmp1, 1, 3, 5, 7);
	tmp1 = __builtin_shufflevector(mCol[0].mValue, mCol[1].mValue, 2, 3, 6, 7);
	Type row3 = __builtin_shufflevector(mCol[2].mValue, mCol[3].mValue, 2, 3, 6, 7);
	Type row2 = __builtin_shufflevector(tmp1, row3, 0, 2, 4, 6);
	row3 = __builtin_shufflevector(row3, tmp1, 1, 3, 5, 7);
	tmp1 = vmulq_f32(row2, row3);
	tmp1 = __builtin_shufflevector(tmp1, tmp1, 1, 0, 3, 2);
	Type minor0 = vmulq_f32(row1, tmp1);
	Type minor1 = vmulq_f32(row0, tmp1);
	tmp1 = __builtin_shufflevector(tmp1, tmp1, 2, 3, 0, 1);
	minor0 = vsubq_f32(vmulq_f32(row1, tmp1), minor0);
	minor1 = vsubq_f32(vmulq_f32(row0, tmp1), minor1);
	minor1 = __builtin_shufflevector(minor1, minor1, 2, 3, 0, 1);
	tmp1 = vmulq_f32(row1, row2);
	tmp1 = __builtin_shufflevector(tmp1, tmp1, 1, 0, 3, 2);
	minor0 = vaddq_f32(vmulq_f32(row3, tmp1), minor0);
	Type minor3 = vmulq_f32(row0, tmp1);
	tmp1 = __builtin_shufflevector(tmp1, tmp1, 2, 3, 0, 1);
	minor0 = vsubq_f32(minor0, vmulq_f32(row3, tmp1));
	minor3 = vsubq_f32(vmulq_f32(row0, tmp1), minor3);
	minor3 = __builtin_shufflevector(minor3, minor3, 2, 3, 0, 1);
	tmp1 = vmulq_f32(__builtin_shufflevector(row1, row1, 2, 3, 0, 1), row3);
	tmp1 = __builtin_shufflevector(tmp1, tmp1, 1, 0, 3, 2);
	row2 = __builtin_shufflevector(row2, row2, 2, 3, 0, 1);
	minor0 = vaddq_f32(vmulq_f32(row2, tmp1), minor0);
	Type minor2 = vmulq_f32(row0, tmp1);
	tmp1 = __builtin_shufflevector(tmp1, tmp1, 2, 3, 0, 1);
	minor0 = vsubq_f32(minor0, vmulq_f32(row2, tmp1));
	minor2 = vsubq_f32(vmulq_f32(row0, tmp1), minor2);
	minor2 = __builtin_shufflevector(minor2, minor2, 2, 3, 0, 1);
	tmp1 = vmulq_f32(row0, row1);
	tmp1 = __builtin_shufflevector(tmp1, tmp1, 1, 0, 3, 2);
	minor2 = vaddq_f32(vmulq_f32(row3, tmp1), minor2);
	minor3 = vsubq_f32(vmulq_f32(row2, tmp1), minor3);
	tmp1 = __builtin_shufflevector(tmp1, tmp1, 2, 3, 0, 1);
	minor2 = vsubq_f32(vmulq_f32(row3, tmp1), minor2);
	minor3 = vsubq_f32(minor3, vmulq_f32(row2, tmp1));
	tmp1 = vmulq_f32(row0, row3);
	tmp1 = __builtin_shufflevector(tmp1, tmp1, 1, 0, 3, 2);
	minor1 = vsubq_f32(minor1, vmulq_f32(row2, tmp1));
	minor2 = vaddq_f32(vmulq_f32(row1, tmp1), minor2);
	tmp1 = __builtin_shufflevector(tmp1, tmp1, 2, 3, 0, 1);
	minor1 = vaddq_f32(vmulq_f32(row2, tmp1), minor1);
	minor2 = vsubq_f32(minor2, vmulq_f32(row1, tmp1));
	tmp1 = vmulq_f32(row0, row2);
	tmp1 = __builtin_shufflevector(tmp1, tmp1, 1, 0, 3, 2);
	minor1 = vaddq_f32(vmulq_f32(row3, tmp1), minor1);
	minor3 = vsubq_f32(minor3, vmulq_f32(row1, tmp1));
	tmp1 = __builtin_shufflevector(tmp1, tmp1, 2, 3, 0, 1);
	minor1 = vsubq_f32(minor1, vmulq_f32(row3, tmp1));
	minor3 = vaddq_f32(vmulq_f32(row1, tmp1), minor3);
	Type det = vmulq_f32(row0, minor0);
	det = vdupq_n_f32(vaddvq_f32(det));
	det = vdivq_f32(vdupq_n_f32(1.0f), det);
	Mat44 result;
	result.mCol[0].mValue = vmulq_f32(det, minor0);
	result.mCol[1].mValue = vmulq_f32(det, minor1);
	result.mCol[2].mValue = vmulq_f32(det, minor2);
	result.mCol[3].mValue = vmulq_f32(det, minor3);
	return result;
#else
#error Undefined CPU architecture
#endif
}

Mat44 Mat44::InversedRotationTranslation() const
{
	Mat44 m = Transposed3x3();
	m.SetTranslation(-m.Multiply3x3(GetTranslation()));
	return m;
}

float Mat44::GetDeterminant3x3() const
{
	return GetAxisX().Dot(GetAxisY().Cross(GetAxisZ()));
}

Mat44 Mat44::Adjointed3x3() const
{
	// Adapted from Inversed() to remove 4th column and the division by the determinant
	// Note: This can be optimized.
	JPH_ASSERT(mCol[0][3] == 0.0f);
	JPH_ASSERT(mCol[1][3] == 0.0f);
	JPH_ASSERT(mCol[2][3] == 0.0f);
#if defined(JPH_USE_SSE)
	__m128 tmp1 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(1, 0, 1, 0));
	__m128 row1 = _mm_shuffle_ps(mCol[2].mValue, _mm_setzero_ps(), _MM_SHUFFLE(1, 0, 1, 0));
	__m128 row0 = _mm_shuffle_ps(tmp1, row1, _MM_SHUFFLE(2, 0, 2, 0));
	row1 = _mm_shuffle_ps(row1, tmp1, _MM_SHUFFLE(3, 1, 3, 1));
	tmp1 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(3, 2, 3, 2));
	__m128 row3 = _mm_shuffle_ps(mCol[2].mValue, _mm_set_ps(1, 0, 0, 0), _MM_SHUFFLE(3, 2, 3, 2));
	__m128 row2 = _mm_shuffle_ps(tmp1, row3, _MM_SHUFFLE(2, 0, 2, 0));
	row3 = _mm_shuffle_ps(row3, tmp1, _MM_SHUFFLE(3, 1, 3, 1));
	tmp1 = _mm_mul_ps(row2, row3);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
	__m128 minor0 = _mm_mul_ps(row1, tmp1);
	__m128 minor1 = _mm_mul_ps(row0, tmp1);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
	minor0 = _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0);
	minor1 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1);
	minor1 = _mm_shuffle_ps(minor1, minor1, _MM_SHUFFLE(1, 0, 3, 2));
	tmp1 = _mm_mul_ps(row1, row2);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
	minor0 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
	minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1));
	tmp1 = _mm_mul_ps(_mm_shuffle_ps(row1, row1, _MM_SHUFFLE(1, 0, 3, 2)), row3);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
	row2 = _mm_shuffle_ps(row2, row2, _MM_SHUFFLE(1, 0, 3, 2));
	minor0 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0);
	__m128 minor2 = _mm_mul_ps(row0, tmp1);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
	minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1));
	minor2 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2);
	minor2 = _mm_shuffle_ps(minor2, minor2, _MM_SHUFFLE(1, 0, 3, 2));
	tmp1 = _mm_mul_ps(row0, row1);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
	minor2 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
	minor2 = _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2);
	tmp1 = _mm_mul_ps(row0, row3);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
	minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1));
	minor2 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
	minor1 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1);
	minor2 = _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1));
	tmp1 = _mm_mul_ps(row0, row2);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
	minor1 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
	minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1));
	Mat44 result;
	result.mCol[0].mValue = minor0;
	result.mCol[1].mValue = minor1;
	result.mCol[2].mValue = minor2;
	result.mCol[3] = Vec4(0, 0, 0, 1);
	return result;
#elif defined(JPH_USE_NEON)
	Type v0001 = vsetq_lane_f32(1, vdupq_n_f32(0), 3);
	Type tmp1 = __builtin_shufflevector(mCol[0].mValue, mCol[1].mValue, 0, 1, 4, 5);
	Type row1 = __builtin_shufflevector(mCol[2].mValue, v0001, 0, 1, 4, 5);
	Type row0 = __builtin_shufflevector(tmp1, row1, 0, 2, 4, 6);
	row1 = __builtin_shufflevector(row1, tmp1, 1, 3, 5, 7);
	tmp1 = __builtin_shufflevector(mCol[0].mValue, mCol[1].mValue, 2, 3, 6, 7);
	Type row3 = __builtin_shufflevector(mCol[2].mValue, v0001, 2, 3, 6, 7);
	Type row2 = __builtin_shufflevector(tmp1, row3, 0, 2, 4, 6);
	row3 = __builtin_shufflevector(row3, tmp1, 1, 3, 5, 7);
	tmp1 = vmulq_f32(row2, row3);
	tmp1 = __builtin_shufflevector(tmp1, tmp1, 1, 0, 3, 2);
	Type minor0 = vmulq_f32(row1, tmp1);
	Type minor1 = vmulq_f32(row0, tmp1);
	tmp1 = __builtin_shufflevector(tmp1, tmp1, 2, 3, 0, 1);
	minor0 = vsubq_f32(vmulq_f32(row1, tmp1), minor0);
	minor1 = vsubq_f32(vmulq_f32(row0, tmp1), minor1);
	minor1 = __builtin_shufflevector(minor1, minor1, 2, 3, 0, 1);
	tmp1 = vmulq_f32(row1, row2);
	tmp1 = __builtin_shufflevector(tmp1, tmp1, 1, 0, 3, 2);
	minor0 = vaddq_f32(vmulq_f32(row3, tmp1), minor0);
	tmp1 = __builtin_shufflevector(tmp1, tmp1, 2, 3, 0, 1);
	minor0 = vsubq_f32(minor0, vmulq_f32(row3, tmp1));
	tmp1 = vmulq_f32(__builtin_shufflevector(row1, row1, 2, 3, 0, 1), row3);
	tmp1 = __builtin_shufflevector(tmp1, tmp1, 1, 0, 3, 2);
	row2 = __builtin_shufflevector(row2, row2, 2, 3, 0, 1);
	minor0 = vaddq_f32(vmulq_f32(row2, tmp1), minor0);
	Type minor2 = vmulq_f32(row0, tmp1);
	tmp1 = __builtin_shufflevector(tmp1, tmp1, 2, 3, 0, 1);
	minor0 = vsubq_f32(minor0, vmulq_f32(row2, tmp1));
	minor2 = vsubq_f32(vmulq_f32(row0, tmp1), minor2);
	minor2 = __builtin_shufflevector(minor2, minor2, 2, 3, 0, 1);
	tmp1 = vmulq_f32(row0, row1);
	tmp1 = __builtin_shufflevector(tmp1, tmp1, 1, 0, 3, 2);
	minor2 = vaddq_f32(vmulq_f32(row3, tmp1), minor2);
	tmp1 = __builtin_shufflevector(tmp1, tmp1, 2, 3, 0, 1);
	minor2 = vsubq_f32(vmulq_f32(row3, tmp1), minor2);
	tmp1 = vmulq_f32(row0, row3);
	tmp1 = __builtin_shufflevector(tmp1, tmp1, 1, 0, 3, 2);
	minor1 = vsubq_f32(minor1, vmulq_f32(row2, tmp1));
	minor2 = vaddq_f32(vmulq_f32(row1, tmp1), minor2);
	tmp1 = __builtin_shufflevector(tmp1, tmp1, 2, 3, 0, 1);
	minor1 = vaddq_f32(vmulq_f32(row2, tmp1), minor1);
	minor2 = vsubq_f32(minor2, vmulq_f32(row1, tmp1));
	tmp1 = vmulq_f32(row0, row2);
	tmp1 = __builtin_shufflevector(tmp1, tmp1, 1, 0, 3, 2);
	minor1 = vaddq_f32(vmulq_f32(row3, tmp1), minor1);
	tmp1 = __builtin_shufflevector(tmp1, tmp1, 2, 3, 0, 1);
	minor1 = vsubq_f32(minor1, vmulq_f32(row3, tmp1));
	Mat44 result;
	result.mCol[0].mValue = minor0;
	result.mCol[1].mValue = minor1;
	result.mCol[2].mValue = minor2;
	result.mCol[3].mValue = v0001;
	return result;
#else
#error Undefined CPU architecture
#endif
}

Mat44 Mat44::Inversed3x3() const
{
	// Adapted from Inversed() to remove 4th column
	// Note: This can be optimized.
	JPH_ASSERT(mCol[0][3] == 0.0f);
	JPH_ASSERT(mCol[1][3] == 0.0f);
	JPH_ASSERT(mCol[2][3] == 0.0f);
#if defined(JPH_USE_SSE)
	__m128 tmp1 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(1, 0, 1, 0));
	__m128 row1 = _mm_shuffle_ps(mCol[2].mValue, _mm_setzero_ps(), _MM_SHUFFLE(1, 0, 1, 0));
	__m128 row0 = _mm_shuffle_ps(tmp1, row1, _MM_SHUFFLE(2, 0, 2, 0));
	row1 = _mm_shuffle_ps(row1, tmp1, _MM_SHUFFLE(3, 1, 3, 1));
	tmp1 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(3, 2, 3, 2));
	__m128 row3 = _mm_shuffle_ps(mCol[2].mValue, _mm_set_ps(1, 0, 0, 0), _MM_SHUFFLE(3, 2, 3, 2));
	__m128 row2 = _mm_shuffle_ps(tmp1, row3, _MM_SHUFFLE(2, 0, 2, 0));
	row3 = _mm_shuffle_ps(row3, tmp1, _MM_SHUFFLE(3, 1, 3, 1));
	tmp1 = _mm_mul_ps(row2, row3);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
	__m128 minor0 = _mm_mul_ps(row1, tmp1);
	__m128 minor1 = _mm_mul_ps(row0, tmp1);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
	minor0 = _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0);
	minor1 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1);
	minor1 = _mm_shuffle_ps(minor1, minor1, _MM_SHUFFLE(1, 0, 3, 2));
	tmp1 = _mm_mul_ps(row1, row2);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
	minor0 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
	minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1));
	tmp1 = _mm_mul_ps(_mm_shuffle_ps(row1, row1, _MM_SHUFFLE(1, 0, 3, 2)), row3);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
	row2 = _mm_shuffle_ps(row2, row2, _MM_SHUFFLE(1, 0, 3, 2));
	minor0 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0);
	__m128 minor2 = _mm_mul_ps(row0, tmp1);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
	minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1));
	minor2 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2);
	minor2 = _mm_shuffle_ps(minor2, minor2, _MM_SHUFFLE(1, 0, 3, 2));
	tmp1 = _mm_mul_ps(row0, row1);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
	minor2 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
	minor2 = _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2);
	tmp1 = _mm_mul_ps(row0, row3);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
	minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1));
	minor2 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
	minor1 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1);
	minor2 = _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1));
	tmp1 = _mm_mul_ps(row0, row2);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
	minor1 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
	minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1));
	__m128 det = _mm_mul_ps(row0, minor0);
	det = _mm_add_ps(_mm_shuffle_ps(det, det, _MM_SHUFFLE(2, 3, 0, 1)), det); // Original code did (x + z) + (y + w), changed to (x + y) + (z + w) to match the ARM code below and make the result cross platform deterministic
	det = _mm_add_ss(_mm_shuffle_ps(det, det, _MM_SHUFFLE(1, 0, 3, 2)), det);
	det = _mm_div_ss(_mm_set_ss(1.0f), det);
	det = _mm_shuffle_ps(det, det, _MM_SHUFFLE(0, 0, 0, 0));
	Mat44 result;
	result.mCol[0].mValue = _mm_mul_ps(det, minor0);
	result.mCol[1].mValue = _mm_mul_ps(det, minor1);
	result.mCol[2].mValue = _mm_mul_ps(det, minor2);
	result.mCol[3] = Vec4(0, 0, 0, 1);
	return result;
#elif defined(JPH_USE_NEON)
	Type v0001 = vsetq_lane_f32(1, vdupq_n_f32(0), 3);
	Type tmp1 = __builtin_shufflevector(mCol[0].mValue, mCol[1].mValue, 0, 1, 4, 5);
	Type row1 = __builtin_shufflevector(mCol[2].mValue, v0001, 0, 1, 4, 5);
	Type row0 = __builtin_shufflevector(tmp1, row1, 0, 2, 4, 6);
	row1 = __builtin_shufflevector(row1, tmp1, 1, 3, 5, 7);
	tmp1 = __builtin_shufflevector(mCol[0].mValue, mCol[1].mValue, 2, 3, 6, 7);
	Type row3 = __builtin_shufflevector(mCol[2].mValue, v0001, 2, 3, 6, 7);
	Type row2 = __builtin_shufflevector(tmp1, row3, 0, 2, 4, 6);
	row3 = __builtin_shufflevector(row3, tmp1, 1, 3, 5, 7);
	tmp1 = vmulq_f32(row2, row3);
	tmp1 = __builtin_shufflevector(tmp1, tmp1, 1, 0, 3, 2);
	Type minor0 = vmulq_f32(row1, tmp1);
	Type minor1 = vmulq_f32(row0, tmp1);
	tmp1 = __builtin_shufflevector(tmp1, tmp1, 2, 3, 0, 1);
	minor0 = vsubq_f32(vmulq_f32(row1, tmp1), minor0);
	minor1 = vsubq_f32(vmulq_f32(row0, tmp1), minor1);
	minor1 = __builtin_shufflevector(minor1, minor1, 2, 3, 0, 1);
	tmp1 = vmulq_f32(row1, row2);
	tmp1 = __builtin_shufflevector(tmp1, tmp1, 1, 0, 3, 2);
	minor0 = vaddq_f32(vmulq_f32(row3, tmp1), minor0);
	tmp1 = __builtin_shufflevector(tmp1, tmp1, 2, 3, 0, 1);
	minor0 = vsubq_f32(minor0, vmulq_f32(row3, tmp1));
	tmp1 = vmulq_f32(__builtin_shufflevector(row1, row1, 2, 3, 0, 1), row3);
	tmp1 = __builtin_shufflevector(tmp1, tmp1, 1, 0, 3, 2);
	row2 = __builtin_shufflevector(row2, row2, 2, 3, 0, 1);
	minor0 = vaddq_f32(vmulq_f32(row2, tmp1), minor0);
	Type minor2 = vmulq_f32(row0, tmp1);
	tmp1 = __builtin_shufflevector(tmp1, tmp1, 2, 3, 0, 1);
	minor0 = vsubq_f32(minor0, vmulq_f32(row2, tmp1));
	minor2 = vsubq_f32(vmulq_f32(row0, tmp1), minor2);
	minor2 = __builtin_shufflevector(minor2, minor2, 2, 3, 0, 1);
	tmp1 = vmulq_f32(row0, row1);
	tmp1 = __builtin_shufflevector(tmp1, tmp1, 1, 0, 3, 2);
	minor2 = vaddq_f32(vmulq_f32(row3, tmp1), minor2);
	tmp1 = __builtin_shufflevector(tmp1, tmp1, 2, 3, 0, 1);
	minor2 = vsubq_f32(vmulq_f32(row3, tmp1), minor2);
	tmp1 = vmulq_f32(row0, row3);
	tmp1 = __builtin_shufflevector(tmp1, tmp1, 1, 0, 3, 2);
	minor1 = vsubq_f32(minor1, vmulq_f32(row2, tmp1));
	minor2 = vaddq_f32(vmulq_f32(row1, tmp1), minor2);
	tmp1 = __builtin_shufflevector(tmp1, tmp1, 2, 3, 0, 1);
	minor1 = vaddq_f32(vmulq_f32(row2, tmp1), minor1);
	minor2 = vsubq_f32(minor2, vmulq_f32(row1, tmp1));
	tmp1 = vmulq_f32(row0, row2);
	tmp1 = __builtin_shufflevector(tmp1, tmp1, 1, 0, 3, 2);
	minor1 = vaddq_f32(vmulq_f32(row3, tmp1), minor1);
	tmp1 = __builtin_shufflevector(tmp1, tmp1, 2, 3, 0, 1);
	minor1 = vsubq_f32(minor1, vmulq_f32(row3, tmp1));
	Type det = vmulq_f32(row0, minor0);
	det = vdupq_n_f32(vaddvq_f32(det));
	det = vdivq_f32(vdupq_n_f32(1.0f), det);
	Mat44 result;
	result.mCol[0].mValue = vmulq_f32(det, minor0);
	result.mCol[1].mValue = vmulq_f32(det, minor1);
	result.mCol[2].mValue = vmulq_f32(det, minor2);
	result.mCol[3].mValue = v0001;
	return result;
#else
#error Undefined CPU architecture
#endif
}
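// Note (added for clarity): converts the rotation part of this matrix back to a quaternion. The code branches on the
// trace / largest diagonal element so that the value under the square root stays well away from zero, which keeps the
// conversion numerically stable for all rotations.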
Quat Mat44::GetQuaternion() const
{
	JPH_ASSERT(mCol[3] == Vec4(0, 0, 0, 1));
	float tr = mCol[0].mF32[0] + mCol[1].mF32[1] + mCol[2].mF32[2];
	if (tr >= 0.0f)
	{
		float s = sqrt(tr + 1.0f);
		float is = 0.5f / s;
		return Quat(
			(mCol[1].mF32[2] - mCol[2].mF32[1]) * is,
			(mCol[2].mF32[0] - mCol[0].mF32[2]) * is,
			(mCol[0].mF32[1] - mCol[1].mF32[0]) * is,
			0.5f * s);
	}
	else
	{
		int i = 0;
		if (mCol[1].mF32[1] > mCol[0].mF32[0]) i = 1;
		if (mCol[2].mF32[2] > mCol[i].mF32[i]) i = 2;
		if (i == 0)
		{
			float s = sqrt(mCol[0].mF32[0] - (mCol[1].mF32[1] + mCol[2].mF32[2]) + 1);
			float is = 0.5f / s;
			return Quat(
				0.5f * s,
				(mCol[1].mF32[0] + mCol[0].mF32[1]) * is,
				(mCol[0].mF32[2] + mCol[2].mF32[0]) * is,
				(mCol[1].mF32[2] - mCol[2].mF32[1]) * is);
		}
		else if (i == 1)
		{
			float s = sqrt(mCol[1].mF32[1] - (mCol[2].mF32[2] + mCol[0].mF32[0]) + 1);
			float is = 0.5f / s;
			return Quat(
				(mCol[1].mF32[0] + mCol[0].mF32[1]) * is,
				0.5f * s,
				(mCol[2].mF32[1] + mCol[1].mF32[2]) * is,
				(mCol[2].mF32[0] - mCol[0].mF32[2]) * is);
		}
		else
		{
			JPH_ASSERT(i == 2);
			float s = sqrt(mCol[2].mF32[2] - (mCol[0].mF32[0] + mCol[1].mF32[1]) + 1);
			float is = 0.5f / s;
			return Quat(
				(mCol[0].mF32[2] + mCol[2].mF32[0]) * is,
				(mCol[2].mF32[1] + mCol[1].mF32[2]) * is,
				0.5f * s,
				(mCol[0].mF32[1] - mCol[1].mF32[0]) * is);
		}
	}
}
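// Note (added for clarity): with a quaternion p stored as the Vec4 (x, y, z, w), multiplying the matrix returned by
// sQuatLeftMultiply(q) with p yields the quaternion product q * p written as a matrix-vector product.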
Mat44 Mat44::sQuatLeftMultiply(QuatArg inQ)
{
	return Mat44(
		Vec4(1, 1, -1, -1) * inQ.mValue.Swizzle<SWIZZLE_W, SWIZZLE_Z, SWIZZLE_Y, SWIZZLE_X>(),
		Vec4(-1, 1, 1, -1) * inQ.mValue.Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_X, SWIZZLE_Y>(),
		Vec4(1, -1, 1, -1) * inQ.mValue.Swizzle<SWIZZLE_Y, SWIZZLE_X, SWIZZLE_W, SWIZZLE_Z>(),
		inQ.mValue);
}
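// Note (added for clarity): the counterpart of sQuatLeftMultiply: multiplying the matrix returned by
// sQuatRightMultiply(q) with a quaternion p stored as the Vec4 (x, y, z, w) yields the quaternion product p * q.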
Mat44 Mat44::sQuatRightMultiply(QuatArg inQ)
{
	return Mat44(
		Vec4(1, -1, 1, -1) * inQ.mValue.Swizzle<SWIZZLE_W, SWIZZLE_Z, SWIZZLE_Y, SWIZZLE_X>(),
		Vec4(1, 1, -1, -1) * inQ.mValue.Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_X, SWIZZLE_Y>(),
		Vec4(-1, 1, 1, -1) * inQ.mValue.Swizzle<SWIZZLE_Y, SWIZZLE_X, SWIZZLE_W, SWIZZLE_Z>(),
		inQ.mValue);
}

Mat44 Mat44::GetRotation() const
{
	JPH_ASSERT(mCol[0][3] == 0.0f);
	JPH_ASSERT(mCol[1][3] == 0.0f);
	JPH_ASSERT(mCol[2][3] == 0.0f);
	return Mat44(mCol[0], mCol[1], mCol[2], Vec4(0, 0, 0, 1));
}

Mat44 Mat44::GetRotationSafe() const
{
#if defined(JPH_USE_SSE4_1)
	__m128 zero = _mm_setzero_ps();
	return Mat44(_mm_blend_ps(mCol[0].mValue, zero, 8),
		_mm_blend_ps(mCol[1].mValue, zero, 8),
		_mm_blend_ps(mCol[2].mValue, zero, 8),
		Vec4(0, 0, 0, 1));
#elif defined(JPH_USE_NEON)
	return Mat44(vsetq_lane_f32(0, mCol[0].mValue, 3),
		vsetq_lane_f32(0, mCol[1].mValue, 3),
		vsetq_lane_f32(0, mCol[2].mValue, 3),
		Vec4(0, 0, 0, 1));
#else
	return Mat44(Vec4(mCol[0].mF32[0], mCol[0].mF32[1], mCol[0].mF32[2], 0),
		Vec4(mCol[1].mF32[0], mCol[1].mF32[1], mCol[1].mF32[2], 0),
		Vec4(mCol[2].mF32[0], mCol[2].mF32[1], mCol[2].mF32[2], 0),
		Vec4(0, 0, 0, 1));
#endif
}

void Mat44::SetRotation(Mat44Arg inRotation)
{
	mCol[0] = inRotation.mCol[0];
	mCol[1] = inRotation.mCol[1];
	mCol[2] = inRotation.mCol[2];
}

Mat44 Mat44::PreTranslated(Vec3Arg inTranslation) const
{
	return Mat44(mCol[0], mCol[1], mCol[2], Vec4(GetTranslation() + Multiply3x3(inTranslation), 1));
}

Mat44 Mat44::PostTranslated(Vec3Arg inTranslation) const
{
	return Mat44(mCol[0], mCol[1], mCol[2], Vec4(GetTranslation() + inTranslation, 1));
}

Mat44 Mat44::PreScaled(Vec3Arg inScale) const
{
	return Mat44(inScale.GetX() * mCol[0], inScale.GetY() * mCol[1], inScale.GetZ() * mCol[2], mCol[3]);
}

Mat44 Mat44::PostScaled(Vec3Arg inScale) const
{
	Vec4 scale(inScale, 1);
	return Mat44(scale * mCol[0], scale * mCol[1], scale * mCol[2], scale * mCol[3]);
}
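// Note (added for clarity): Decompose removes per-axis scale from this matrix and returns the remaining rotation and
// translation. A usage sketch with hypothetical names: for a matrix m built from rotation, scale and translation
// (no shear), Vec3 scale; Mat44 rot_trans = m.Decompose(scale); gives m approximately equal to rot_trans.PreScaled(scale).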
Mat44 Mat44::Decompose(Vec3 &outScale) const
{
	// Start the modified Gram-Schmidt algorithm
	// X axis will just be normalized
	Vec3 x = GetAxisX();
	// Make Y axis perpendicular to X
	Vec3 y = GetAxisY();
	float x_dot_x = x.LengthSq();
	y -= (x.Dot(y) / x_dot_x) * x;
	// Make Z axis perpendicular to X
	Vec3 z = GetAxisZ();
	z -= (x.Dot(z) / x_dot_x) * x;
	// Make Z axis perpendicular to Y
	float y_dot_y = y.LengthSq();
	z -= (y.Dot(z) / y_dot_y) * y;
	// Determine the scale
	float z_dot_z = z.LengthSq();
	outScale = Vec3(x_dot_x, y_dot_y, z_dot_z).Sqrt();
	// If the resulting x, y and z vectors don't form a right handed matrix, flip the z axis.
	if (x.Cross(y).Dot(z) < 0.0f)
		outScale.SetZ(-outScale.GetZ());
	// Determine the rotation and translation
	return Mat44(Vec4(x / outScale.GetX(), 0), Vec4(y / outScale.GetY(), 0), Vec4(z / outScale.GetZ(), 0), GetColumn4(3));
}

JPH_NAMESPACE_END