// SPDX-FileCopyrightText: 2021 Jorrit Rouwe
// SPDX-License-Identifier: MIT

#pragma once

#include <Math/Vec3.h>
#include <Math/Vec4.h>
#include <Math/Quat.h>

namespace JPH {

Mat44::Mat44(Vec4Arg inC1, Vec4Arg inC2, Vec4Arg inC3, Vec4Arg inC4) :
    mCol { inC1, inC2, inC3, inC4 }
{
}

Mat44::Mat44(Type inC1, Type inC2, Type inC3, Type inC4) :
    mCol { inC1, inC2, inC3, inC4 }
{
}

Mat44 Mat44::sZero()
{
    return Mat44(Vec4::sZero(), Vec4::sZero(), Vec4::sZero(), Vec4::sZero());
}

Mat44 Mat44::sIdentity()
{
    return Mat44(Vec4(1, 0, 0, 0), Vec4(0, 1, 0, 0), Vec4(0, 0, 1, 0), Vec4(0, 0, 0, 1));
}

Mat44 Mat44::sNaN()
{
    return Mat44(Vec4::sNaN(), Vec4::sNaN(), Vec4::sNaN(), Vec4::sNaN());
}

Mat44 Mat44::sLoadFloat4x4(const Float4 *inV)
{
    Mat44 result;
    for (int c = 0; c < 4; ++c)
        result.mCol[c] = Vec4::sLoadFloat4(inV + c);
    return result;
}

Mat44 Mat44::sLoadFloat4x4Aligned(const Float4 *inV)
{
    Mat44 result;
    for (int c = 0; c < 4; ++c)
        result.mCol[c] = Vec4::sLoadFloat4Aligned(inV + c);
    return result;
}

Mat44 Mat44::sRotationX(float inX)
{
    // TODO: Could be optimized
    float c = cos(inX), s = sin(inX);
    return Mat44(Vec4(1, 0, 0, 0), Vec4(0, c, s, 0), Vec4(0, -s, c, 0), Vec4(0, 0, 0, 1));
}

Mat44 Mat44::sRotationY(float inY)
{
    // TODO: Could be optimized
    float c = cos(inY), s = sin(inY);
    return Mat44(Vec4(c, 0, -s, 0), Vec4(0, 1, 0, 0), Vec4(s, 0, c, 0), Vec4(0, 0, 0, 1));
}

Mat44 Mat44::sRotationZ(float inZ)
{
    // TODO: Could be optimized
    float c = cos(inZ), s = sin(inZ);
    return Mat44(Vec4(c, s, 0, 0), Vec4(-s, c, 0, 0), Vec4(0, 0, 1, 0), Vec4(0, 0, 0, 1));
}
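
// Converts a unit quaternion q = (x, y, z, w) to its rotation matrix. In scalar form this is the
// standard quaternion-derived rotation matrix (the same values the non-SSE fallback below computes):
//
//   | 1 - 2y^2 - 2z^2   2xy - 2zw         2xz + 2yw        |
//   | 2xy + 2zw         1 - 2x^2 - 2z^2   2yz - 2xw        |
//   | 2xz - 2yw         2yz + 2xw         1 - 2x^2 - 2y^2  |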

Mat44 Mat44::sRotation(QuatArg inQuat)
{
    JPH_ASSERT(inQuat.IsNormalized());

    // See: https://en.wikipedia.org/wiki/Quaternions_and_spatial_rotation section 'Quaternion-derived rotation matrix'
#ifdef JPH_USE_SSE
    __m128 xyzw = inQuat.mValue.mValue;
    __m128 two_xyzw = _mm_add_ps(xyzw, xyzw);
    __m128 yzxw = _mm_shuffle_ps(xyzw, xyzw, _MM_SHUFFLE(3, 0, 2, 1));
    __m128 two_yzxw = _mm_add_ps(yzxw, yzxw);
    __m128 zxyw = _mm_shuffle_ps(xyzw, xyzw, _MM_SHUFFLE(3, 1, 0, 2));
    __m128 two_zxyw = _mm_add_ps(zxyw, zxyw);
    __m128 wwww = _mm_shuffle_ps(xyzw, xyzw, _MM_SHUFFLE(3, 3, 3, 3));
    __m128 diagonal = _mm_sub_ps(_mm_sub_ps(_mm_set1_ps(1.0f), _mm_mul_ps(two_yzxw, yzxw)), _mm_mul_ps(two_zxyw, zxyw)); // (1 - 2 y^2 - 2 z^2, 1 - 2 x^2 - 2 z^2, 1 - 2 x^2 - 2 y^2, 1 - 4 w^2)
    __m128 plus = _mm_add_ps(_mm_mul_ps(two_xyzw, zxyw), _mm_mul_ps(two_yzxw, wwww)); // 2 * (xz + yw, xy + zw, yz + xw, ww)
    __m128 minus = _mm_sub_ps(_mm_mul_ps(two_yzxw, xyzw), _mm_mul_ps(two_zxyw, wwww)); // 2 * (xy - zw, yz - xw, xz - yw, 0)

    // Workaround for compiler changing _mm_sub_ps(_mm_mul_ps(...), ...) into a fused multiply sub instruction, resulting in w not being 0
    // There doesn't appear to be a reliable way to turn this off in Clang
    minus = _mm_insert_ps(minus, minus, 0b1000);

    __m128 col0 = _mm_blend_ps(_mm_blend_ps(plus, diagonal, 0b0001), minus, 0b1100); // (1 - 2 y^2 - 2 z^2, 2 xy + 2 zw, 2 xz - 2 yw, 0)
    __m128 col1 = _mm_blend_ps(_mm_blend_ps(diagonal, minus, 0b1001), plus, 0b0100); // (2 xy - 2 zw, 1 - 2 x^2 - 2 z^2, 2 yz + 2 xw, 0)
    __m128 col2 = _mm_blend_ps(_mm_blend_ps(minus, plus, 0b0001), diagonal, 0b0100); // (2 xz + 2 yw, 2 yz - 2 xw, 1 - 2 x^2 - 2 y^2, 0)
    __m128 col3 = _mm_set_ps(1, 0, 0, 0);

    return Mat44(col0, col1, col2, col3);
#else
    float x = inQuat.GetX();
    float y = inQuat.GetY();
    float z = inQuat.GetZ();
    float w = inQuat.GetW();

    float tx = 2.0f * x;
    float ty = 2.0f * y;
    float tz = 2.0f * z;

    float xx = tx * x;
    float yy = ty * y;
    float zz = tz * z;
    float xy = tx * y;
    float xz = tx * z;
    float xw = tx * w;
    float yz = ty * z;
    float yw = ty * w;
    float zw = tz * w;

    return Mat44(Vec4(1.0f - yy - zz, xy + zw, xz - yw, 0.0f),
                 Vec4(xy - zw, 1.0f - xx - zz, yz + xw, 0.0f),
                 Vec4(xz + yw, yz - xw, 1.0f - xx - yy, 0.0f),
                 Vec4(0.0f, 0.0f, 0.0f, 1.0f));
#endif
}

Mat44 Mat44::sRotation(Vec3Arg inAxis, float inAngle)
{
    return sRotation(Quat::sRotation(inAxis, inAngle));
}

Mat44 Mat44::sTranslation(Vec3Arg inV)
{
    return Mat44(Vec4(1, 0, 0, 0), Vec4(0, 1, 0, 0), Vec4(0, 0, 1, 0), Vec4(inV, 1));
}

Mat44 Mat44::sRotationTranslation(QuatArg inR, Vec3Arg inT)
{
    Mat44 m = sRotation(inR);
    m.SetTranslation(inT);
    return m;
}

Mat44 Mat44::sInverseRotationTranslation(QuatArg inR, Vec3Arg inT)
{
    Mat44 m = sRotation(inR.Conjugated());
    m.SetTranslation(-m.Multiply3x3(inT));
    return m;
}

Mat44 Mat44::sScale(float inScale)
{
    return Mat44(Vec4(inScale, 0, 0, 0), Vec4(0, inScale, 0, 0), Vec4(0, 0, inScale, 0), Vec4(0, 0, 0, 1));
}

Mat44 Mat44::sScale(Vec3Arg inV)
{
    return Mat44(Vec4(inV.GetX(), 0, 0, 0), Vec4(0, inV.GetY(), 0, 0), Vec4(0, 0, inV.GetZ(), 0), Vec4(0, 0, 0, 1));
}

Mat44 Mat44::sOuterProduct(Vec3Arg inV1, Vec3Arg inV2)
{
    Vec4 v1(inV1, 0);
    return Mat44(v1 * inV2.SplatX(), v1 * inV2.SplatY(), v1 * inV2.SplatZ(), Vec4(0, 0, 0, 1));
}
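
// Builds the skew symmetric "cross product matrix" of inV: for any vector v, sCrossProduct(a) * v
// yields a.Cross(v). Its 3x3 columns are (0, z, -y), (-z, 0, x) and (y, -x, 0), exactly as spelled
// out by the scalar fallback below.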

Mat44 Mat44::sCrossProduct(Vec3Arg inV)
{
#ifdef JPH_USE_SSE
    // Zero out the W component
    __m128 zero = _mm_setzero_ps();
    __m128 v = _mm_blend_ps(inV.mValue, zero, 0b1000);

    // Negate
    __m128 min_v = _mm_sub_ps(zero, v);

    return Mat44(
        _mm_shuffle_ps(v, min_v, _MM_SHUFFLE(3, 1, 2, 3)), // [0, z, -y, 0]
        _mm_shuffle_ps(min_v, v, _MM_SHUFFLE(3, 0, 3, 2)), // [-z, 0, x, 0]
        _mm_blend_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 1)), _mm_shuffle_ps(min_v, min_v, _MM_SHUFFLE(3, 3, 0, 3)), 0b0010), // [y, -x, 0, 0]
        Vec4(0, 0, 0, 1));
#else
    float x = inV.GetX();
    float y = inV.GetY();
    float z = inV.GetZ();

    return Mat44(
        Vec4(0, z, -y, 0),
        Vec4(-z, 0, x, 0),
        Vec4(y, -x, 0, 0),
        Vec4(0, 0, 0, 1));
#endif
}

bool Mat44::operator == (Mat44Arg inM2) const
{
    return UVec4::sAnd(
        UVec4::sAnd(Vec4::sEquals(mCol[0], inM2.mCol[0]), Vec4::sEquals(mCol[1], inM2.mCol[1])),
        UVec4::sAnd(Vec4::sEquals(mCol[2], inM2.mCol[2]), Vec4::sEquals(mCol[3], inM2.mCol[3]))
    ).TestAllTrue();
}

bool Mat44::IsClose(Mat44Arg inM2, float inMaxDistSq) const
{
    for (int i = 0; i < 4; ++i)
        if (!mCol[i].IsClose(inM2.mCol[i], inMaxDistSq))
            return false;
    return true;
}
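
// Matrix * matrix, column major: each column of the result is a linear combination of this matrix's
// columns weighted by the matching column of inM. In scalar form (the splat + multiply-add pattern
// both SIMD paths below implement):
//
//   result.col[i] = mCol[0] * inM.col[i].x + mCol[1] * inM.col[i].y + mCol[2] * inM.col[i].z + mCol[3] * inM.col[i].w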

Mat44 Mat44::operator * (Mat44Arg inM) const
{
    Mat44 result;
#if defined(JPH_USE_SSE)
    for (int i = 0; i < 4; ++i)
    {
        __m128 c = inM.mCol[i].mValue;
        __m128 t = _mm_mul_ps(mCol[0].mValue, _mm_shuffle_ps(c, c, _MM_SHUFFLE(0, 0, 0, 0)));
        t = _mm_add_ps(t, _mm_mul_ps(mCol[1].mValue, _mm_shuffle_ps(c, c, _MM_SHUFFLE(1, 1, 1, 1))));
        t = _mm_add_ps(t, _mm_mul_ps(mCol[2].mValue, _mm_shuffle_ps(c, c, _MM_SHUFFLE(2, 2, 2, 2))));
        t = _mm_add_ps(t, _mm_mul_ps(mCol[3].mValue, _mm_shuffle_ps(c, c, _MM_SHUFFLE(3, 3, 3, 3))));
        result.mCol[i].mValue = t;
    }
#elif defined(JPH_USE_NEON)
    for (int i = 0; i < 4; ++i)
    {
        Type c = inM.mCol[i].mValue;
        Type t = vmulq_f32(mCol[0].mValue, vdupq_laneq_f32(c, 0));
        t = vmlaq_f32(t, mCol[1].mValue, vdupq_laneq_f32(c, 1));
        t = vmlaq_f32(t, mCol[2].mValue, vdupq_laneq_f32(c, 2));
        t = vmlaq_f32(t, mCol[3].mValue, vdupq_laneq_f32(c, 3));
        result.mCol[i].mValue = t;
    }
#else
    #error Unsupported CPU architecture
#endif
    return result;
}

Vec3 Mat44::operator * (Vec3Arg inV) const
{
#if defined(JPH_USE_SSE)
    __m128 t = _mm_mul_ps(mCol[0].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(0, 0, 0, 0)));
    t = _mm_add_ps(t, _mm_mul_ps(mCol[1].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(1, 1, 1, 1))));
    t = _mm_add_ps(t, _mm_mul_ps(mCol[2].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(2, 2, 2, 2))));
    t = _mm_add_ps(t, mCol[3].mValue);
    return Vec3::sFixW(t);
#elif defined(JPH_USE_NEON)
    Type t = vmulq_f32(mCol[0].mValue, vdupq_laneq_f32(inV.mValue, 0));
    t = vmlaq_f32(t, mCol[1].mValue, vdupq_laneq_f32(inV.mValue, 1));
    t = vmlaq_f32(t, mCol[2].mValue, vdupq_laneq_f32(inV.mValue, 2));
    t = vaddq_f32(t, mCol[3].mValue); // Don't combine this with the first mul into a fused multiply add, causes precision issues
    return Vec3::sFixW(t);
#else
    #error Unsupported CPU architecture
#endif
}

Vec4 Mat44::operator * (Vec4Arg inV) const
{
#if defined(JPH_USE_SSE)
    __m128 t = _mm_mul_ps(mCol[0].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(0, 0, 0, 0)));
    t = _mm_add_ps(t, _mm_mul_ps(mCol[1].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(1, 1, 1, 1))));
    t = _mm_add_ps(t, _mm_mul_ps(mCol[2].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(2, 2, 2, 2))));
    t = _mm_add_ps(t, _mm_mul_ps(mCol[3].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(3, 3, 3, 3))));
    return t;
#elif defined(JPH_USE_NEON)
    Type t = vmulq_f32(mCol[0].mValue, vdupq_laneq_f32(inV.mValue, 0));
    t = vmlaq_f32(t, mCol[1].mValue, vdupq_laneq_f32(inV.mValue, 1));
    t = vmlaq_f32(t, mCol[2].mValue, vdupq_laneq_f32(inV.mValue, 2));
    t = vmlaq_f32(t, mCol[3].mValue, vdupq_laneq_f32(inV.mValue, 3));
    return t;
#else
    #error Unsupported CPU architecture
#endif
}

Vec3 Mat44::Multiply3x3(Vec3Arg inV) const
{
#if defined(JPH_USE_SSE)
    __m128 t = _mm_mul_ps(mCol[0].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(0, 0, 0, 0)));
    t = _mm_add_ps(t, _mm_mul_ps(mCol[1].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(1, 1, 1, 1))));
    t = _mm_add_ps(t, _mm_mul_ps(mCol[2].mValue, _mm_shuffle_ps(inV.mValue, inV.mValue, _MM_SHUFFLE(2, 2, 2, 2))));
    return Vec3::sFixW(t);
#elif defined(JPH_USE_NEON)
    Type t = vmulq_f32(mCol[0].mValue, vdupq_laneq_f32(inV.mValue, 0));
    t = vmlaq_f32(t, mCol[1].mValue, vdupq_laneq_f32(inV.mValue, 1));
    t = vmlaq_f32(t, mCol[2].mValue, vdupq_laneq_f32(inV.mValue, 2));
    return Vec3::sFixW(t);
#else
    #error Unsupported CPU architecture
#endif
}
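
// Multiplies the transpose of the 3x3 part with inV (result = this^T * inV). Because the matrix is
// stored column major, component i of the result is simply the dot product of the first three
// components of column i with inV, which is what the SSE path computes directly with _mm_dp_ps.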

Vec3 Mat44::Multiply3x3Transposed(Vec3Arg inV) const
{
#if defined(JPH_USE_SSE)
    __m128 x = _mm_dp_ps(mCol[0].mValue, inV.mValue, 0x7f);
    __m128 y = _mm_dp_ps(mCol[1].mValue, inV.mValue, 0x7f);
    __m128 xy = _mm_blend_ps(x, y, 0b0010);
    __m128 z = _mm_dp_ps(mCol[2].mValue, inV.mValue, 0x7f);
    __m128 xyzz = _mm_blend_ps(xy, z, 0b1100);
    return xyzz;
#else
    return Transposed3x3().Multiply3x3(inV);
#endif
}

Mat44 Mat44::Multiply3x3(Mat44Arg inM) const
{
    JPH_ASSERT(mCol[0][3] == 0.0f);
    JPH_ASSERT(mCol[1][3] == 0.0f);
    JPH_ASSERT(mCol[2][3] == 0.0f);

    Mat44 result;
#if defined(JPH_USE_SSE)
    for (int i = 0; i < 3; ++i)
    {
        __m128 c = inM.mCol[i].mValue;
        __m128 t = _mm_mul_ps(mCol[0].mValue, _mm_shuffle_ps(c, c, _MM_SHUFFLE(0, 0, 0, 0)));
        t = _mm_add_ps(t, _mm_mul_ps(mCol[1].mValue, _mm_shuffle_ps(c, c, _MM_SHUFFLE(1, 1, 1, 1))));
        t = _mm_add_ps(t, _mm_mul_ps(mCol[2].mValue, _mm_shuffle_ps(c, c, _MM_SHUFFLE(2, 2, 2, 2))));
        result.mCol[i].mValue = t;
    }
#elif defined(JPH_USE_NEON)
    for (int i = 0; i < 3; ++i)
    {
        Type c = inM.mCol[i].mValue;
        Type t = vmulq_f32(mCol[0].mValue, vdupq_laneq_f32(c, 0));
        t = vmlaq_f32(t, mCol[1].mValue, vdupq_laneq_f32(c, 1));
        t = vmlaq_f32(t, mCol[2].mValue, vdupq_laneq_f32(c, 2));
        result.mCol[i].mValue = t;
    }
#else
    #error Unsupported CPU architecture
#endif
    result.mCol[3] = Vec4(0, 0, 0, 1);
    return result;
}
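
// 3x3 multiply with the left hand side transposed (result = this^T * inM), implemented by explicitly
// transposing this matrix first and then doing the usual column combination.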

Mat44 Mat44::Multiply3x3LeftTransposed(Mat44Arg inM) const
{
    // Transpose left hand side
    Mat44 trans = Transposed3x3();

    // Do 3x3 matrix multiply
    Mat44 result;
    result.mCol[0] = trans.mCol[0] * inM.mCol[0].SplatX() + trans.mCol[1] * inM.mCol[0].SplatY() + trans.mCol[2] * inM.mCol[0].SplatZ();
    result.mCol[1] = trans.mCol[0] * inM.mCol[1].SplatX() + trans.mCol[1] * inM.mCol[1].SplatY() + trans.mCol[2] * inM.mCol[1].SplatZ();
    result.mCol[2] = trans.mCol[0] * inM.mCol[2].SplatX() + trans.mCol[1] * inM.mCol[2].SplatY() + trans.mCol[2] * inM.mCol[2].SplatZ();
    result.mCol[3] = Vec4(0, 0, 0, 1);
    return result;
}
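
// 3x3 multiply with the right hand side transposed (result = this * inM^T): column j of inM^T is
// row j of inM, which is why each result column gathers the same component (SplatX/Y/Z) from all
// three columns of inM.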

Mat44 Mat44::Multiply3x3RightTransposed(Mat44Arg inM) const
{
    JPH_ASSERT(mCol[0][3] == 0.0f);
    JPH_ASSERT(mCol[1][3] == 0.0f);
    JPH_ASSERT(mCol[2][3] == 0.0f);

    Mat44 result;
    result.mCol[0] = mCol[0] * inM.mCol[0].SplatX() + mCol[1] * inM.mCol[1].SplatX() + mCol[2] * inM.mCol[2].SplatX();
    result.mCol[1] = mCol[0] * inM.mCol[0].SplatY() + mCol[1] * inM.mCol[1].SplatY() + mCol[2] * inM.mCol[2].SplatY();
    result.mCol[2] = mCol[0] * inM.mCol[0].SplatZ() + mCol[1] * inM.mCol[1].SplatZ() + mCol[2] * inM.mCol[2].SplatZ();
    result.mCol[3] = Vec4(0, 0, 0, 1);
    return result;
}

Mat44 Mat44::operator * (float inV) const
{
    Vec4 multiplier = Vec4::sReplicate(inV);

    Mat44 result;
    for (int c = 0; c < 4; ++c)
        result.mCol[c] = mCol[c] * multiplier;
    return result;
}

Mat44 &Mat44::operator *= (float inV)
{
    for (int c = 0; c < 4; ++c)
        mCol[c] *= inV;
    return *this;
}

Mat44 Mat44::operator + (Mat44Arg inM) const
{
    Mat44 result;
    for (int i = 0; i < 4; ++i)
        result.mCol[i] = mCol[i] + inM.mCol[i];
    return result;
}

Mat44 Mat44::operator - () const
{
    Mat44 result;
    for (int i = 0; i < 4; ++i)
        result.mCol[i] = -mCol[i];
    return result;
}

Mat44 Mat44::operator - (Mat44Arg inM) const
{
    Mat44 result;
    for (int i = 0; i < 4; ++i)
        result.mCol[i] = mCol[i] - inM.mCol[i];
    return result;
}

Mat44 &Mat44::operator += (Mat44Arg inM)
{
    for (int c = 0; c < 4; ++c)
        mCol[c] += inM.mCol[c];
    return *this;
}

void Mat44::StoreFloat4x4(Float4 *outV) const
{
    for (int c = 0; c < 4; ++c)
        mCol[c].StoreFloat4(outV + c);
}

Mat44 Mat44::Transposed() const
{
#if defined(JPH_USE_SSE)
    __m128 tmp1 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(1, 0, 1, 0));
    __m128 tmp3 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(3, 2, 3, 2));
    __m128 tmp2 = _mm_shuffle_ps(mCol[2].mValue, mCol[3].mValue, _MM_SHUFFLE(1, 0, 1, 0));
    __m128 tmp4 = _mm_shuffle_ps(mCol[2].mValue, mCol[3].mValue, _MM_SHUFFLE(3, 2, 3, 2));

    Mat44 result;
    result.mCol[0].mValue = _mm_shuffle_ps(tmp1, tmp2, _MM_SHUFFLE(2, 0, 2, 0));
    result.mCol[1].mValue = _mm_shuffle_ps(tmp1, tmp2, _MM_SHUFFLE(3, 1, 3, 1));
    result.mCol[2].mValue = _mm_shuffle_ps(tmp3, tmp4, _MM_SHUFFLE(2, 0, 2, 0));
    result.mCol[3].mValue = _mm_shuffle_ps(tmp3, tmp4, _MM_SHUFFLE(3, 1, 3, 1));
    return result;
#elif defined(JPH_USE_NEON)
    float32x4x2_t tmp1 = vzipq_f32(mCol[0].mValue, mCol[2].mValue);
    float32x4x2_t tmp2 = vzipq_f32(mCol[1].mValue, mCol[3].mValue);
    float32x4x2_t tmp3 = vzipq_f32(tmp1.val[0], tmp2.val[0]);
    float32x4x2_t tmp4 = vzipq_f32(tmp1.val[1], tmp2.val[1]);

    Mat44 result;
    result.mCol[0].mValue = tmp3.val[0];
    result.mCol[1].mValue = tmp3.val[1];
    result.mCol[2].mValue = tmp4.val[0];
    result.mCol[3].mValue = tmp4.val[1];
    return result;
#else
    #error Unsupported CPU architecture
#endif
}

Mat44 Mat44::Transposed3x3() const
{
#if defined(JPH_USE_SSE)
    __m128 zero = _mm_setzero_ps();
    __m128 tmp1 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(1, 0, 1, 0));
    __m128 tmp3 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(3, 2, 3, 2));
    __m128 tmp2 = _mm_shuffle_ps(mCol[2].mValue, zero, _MM_SHUFFLE(1, 0, 1, 0));
    __m128 tmp4 = _mm_shuffle_ps(mCol[2].mValue, zero, _MM_SHUFFLE(3, 2, 3, 2));

    Mat44 result;
    result.mCol[0].mValue = _mm_shuffle_ps(tmp1, tmp2, _MM_SHUFFLE(2, 0, 2, 0));
    result.mCol[1].mValue = _mm_shuffle_ps(tmp1, tmp2, _MM_SHUFFLE(3, 1, 3, 1));
    result.mCol[2].mValue = _mm_shuffle_ps(tmp3, tmp4, _MM_SHUFFLE(2, 0, 2, 0));
#elif defined(JPH_USE_NEON)
    float32x4x2_t tmp1 = vzipq_f32(mCol[0].mValue, mCol[2].mValue);
    float32x4x2_t tmp2 = vzipq_f32(mCol[1].mValue, vdupq_n_f32(0));
    float32x4x2_t tmp3 = vzipq_f32(tmp1.val[0], tmp2.val[0]);
    float32x4x2_t tmp4 = vzipq_f32(tmp1.val[1], tmp2.val[1]);

    Mat44 result;
    result.mCol[0].mValue = tmp3.val[0];
    result.mCol[1].mValue = tmp3.val[1];
    result.mCol[2].mValue = tmp4.val[0];
#else
    #error Unsupported CPU architecture
#endif
    result.mCol[3] = Vec4(0, 0, 0, 1);
    return result;
}

Mat44 Mat44::Inversed() const
{
#if defined(JPH_USE_SSE)
    // Algorithm from: http://download.intel.com/design/PentiumIII/sml/24504301.pdf
    // Streaming SIMD Extensions - Inverse of 4x4 Matrix
    // Adapted to load data using _mm_shuffle_ps instead of loading from memory
    // Replaced _mm_rcp_ps with _mm_div_ps for better accuracy
    __m128 tmp1 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(1, 0, 1, 0));
    __m128 row1 = _mm_shuffle_ps(mCol[2].mValue, mCol[3].mValue, _MM_SHUFFLE(1, 0, 1, 0));
    __m128 row0 = _mm_shuffle_ps(tmp1, row1, _MM_SHUFFLE(2, 0, 2, 0));
    row1 = _mm_shuffle_ps(row1, tmp1, _MM_SHUFFLE(3, 1, 3, 1));
    tmp1 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(3, 2, 3, 2));
    __m128 row3 = _mm_shuffle_ps(mCol[2].mValue, mCol[3].mValue, _MM_SHUFFLE(3, 2, 3, 2));
    __m128 row2 = _mm_shuffle_ps(tmp1, row3, _MM_SHUFFLE(2, 0, 2, 0));
    row3 = _mm_shuffle_ps(row3, tmp1, _MM_SHUFFLE(3, 1, 3, 1));
    tmp1 = _mm_mul_ps(row2, row3);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
    __m128 minor0 = _mm_mul_ps(row1, tmp1);
    __m128 minor1 = _mm_mul_ps(row0, tmp1);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
    minor0 = _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0);
    minor1 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1);
    minor1 = _mm_shuffle_ps(minor1, minor1, _MM_SHUFFLE(1, 0, 3, 2));
    tmp1 = _mm_mul_ps(row1, row2);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
    minor0 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0);
    __m128 minor3 = _mm_mul_ps(row0, tmp1);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
    minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1));
    minor3 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor3);
    minor3 = _mm_shuffle_ps(minor3, minor3, _MM_SHUFFLE(1, 0, 3, 2));
    tmp1 = _mm_mul_ps(_mm_shuffle_ps(row1, row1, _MM_SHUFFLE(1, 0, 3, 2)), row3);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
    row2 = _mm_shuffle_ps(row2, row2, _MM_SHUFFLE(1, 0, 3, 2));
    minor0 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0);
    __m128 minor2 = _mm_mul_ps(row0, tmp1);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
    minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1));
    minor2 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2);
    minor2 = _mm_shuffle_ps(minor2, minor2, _MM_SHUFFLE(1, 0, 3, 2));
    tmp1 = _mm_mul_ps(row0, row1);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
    minor2 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2);
    minor3 = _mm_sub_ps(_mm_mul_ps(row2, tmp1), minor3);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
    minor2 = _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2);
    minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row2, tmp1));
    tmp1 = _mm_mul_ps(row0, row3);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
    minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1));
    minor2 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
    minor1 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1);
    minor2 = _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1));
    tmp1 = _mm_mul_ps(row0, row2);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
    minor1 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1);
    minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row1, tmp1));
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
    minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1));
    minor3 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor3);
    __m128 det = _mm_mul_ps(row0, minor0);
    det = _mm_add_ps(_mm_shuffle_ps(det, det, _MM_SHUFFLE(1, 0, 3, 2)), det);
    det = _mm_add_ss(_mm_shuffle_ps(det, det, _MM_SHUFFLE(2, 3, 0, 1)), det);
    det = _mm_div_ss(_mm_set_ss(1.0f), det);
    det = _mm_shuffle_ps(det, det, _MM_SHUFFLE(0, 0, 0, 0));

    Mat44 result;
    result.mCol[0].mValue = _mm_mul_ps(det, minor0);
    result.mCol[1].mValue = _mm_mul_ps(det, minor1);
    result.mCol[2].mValue = _mm_mul_ps(det, minor2);
    result.mCol[3].mValue = _mm_mul_ps(det, minor3);
    return result;
#elif defined(JPH_USE_NEON)
    // Adapted from the SSE version; there are surprisingly few articles about efficient ways of calculating an inverse for ARM on the internet
    Type tmp1 = __builtin_shufflevector(mCol[0].mValue, mCol[1].mValue, 0, 1, 4, 5);
    Type row1 = __builtin_shufflevector(mCol[2].mValue, mCol[3].mValue, 0, 1, 4, 5);
    Type row0 = __builtin_shufflevector(tmp1, row1, 0, 2, 4, 6);
    row1 = __builtin_shufflevector(row1, tmp1, 1, 3, 5, 7);
    tmp1 = __builtin_shufflevector(mCol[0].mValue, mCol[1].mValue, 2, 3, 6, 7);
    Type row3 = __builtin_shufflevector(mCol[2].mValue, mCol[3].mValue, 2, 3, 6, 7);
    Type row2 = __builtin_shufflevector(tmp1, row3, 0, 2, 4, 6);
    row3 = __builtin_shufflevector(row3, tmp1, 1, 3, 5, 7);
    tmp1 = vmulq_f32(row2, row3);
    tmp1 = __builtin_shufflevector(tmp1, tmp1, 1, 0, 3, 2);
    Type minor0 = vmulq_f32(row1, tmp1);
    Type minor1 = vmulq_f32(row0, tmp1);
    tmp1 = __builtin_shufflevector(tmp1, tmp1, 2, 3, 0, 1);
    minor0 = vsubq_f32(vmulq_f32(row1, tmp1), minor0);
    minor1 = vsubq_f32(vmulq_f32(row0, tmp1), minor1);
    minor1 = __builtin_shufflevector(minor1, minor1, 2, 3, 0, 1);
    tmp1 = vmulq_f32(row1, row2);
    tmp1 = __builtin_shufflevector(tmp1, tmp1, 1, 0, 3, 2);
    minor0 = vaddq_f32(vmulq_f32(row3, tmp1), minor0);
    Type minor3 = vmulq_f32(row0, tmp1);
    tmp1 = __builtin_shufflevector(tmp1, tmp1, 2, 3, 0, 1);
    minor0 = vsubq_f32(minor0, vmulq_f32(row3, tmp1));
    minor3 = vsubq_f32(vmulq_f32(row0, tmp1), minor3);
    minor3 = __builtin_shufflevector(minor3, minor3, 2, 3, 0, 1);
    tmp1 = vmulq_f32(__builtin_shufflevector(row1, row1, 2, 3, 0, 1), row3);
    tmp1 = __builtin_shufflevector(tmp1, tmp1, 1, 0, 3, 2);
    row2 = __builtin_shufflevector(row2, row2, 2, 3, 0, 1);
    minor0 = vaddq_f32(vmulq_f32(row2, tmp1), minor0);
    Type minor2 = vmulq_f32(row0, tmp1);
    tmp1 = __builtin_shufflevector(tmp1, tmp1, 2, 3, 0, 1);
    minor0 = vsubq_f32(minor0, vmulq_f32(row2, tmp1));
    minor2 = vsubq_f32(vmulq_f32(row0, tmp1), minor2);
    minor2 = __builtin_shufflevector(minor2, minor2, 2, 3, 0, 1);
    tmp1 = vmulq_f32(row0, row1);
    tmp1 = __builtin_shufflevector(tmp1, tmp1, 1, 0, 3, 2);
    minor2 = vaddq_f32(vmulq_f32(row3, tmp1), minor2);
    minor3 = vsubq_f32(vmulq_f32(row2, tmp1), minor3);
    tmp1 = __builtin_shufflevector(tmp1, tmp1, 2, 3, 0, 1);
    minor2 = vsubq_f32(vmulq_f32(row3, tmp1), minor2);
    minor3 = vsubq_f32(minor3, vmulq_f32(row2, tmp1));
    tmp1 = vmulq_f32(row0, row3);
    tmp1 = __builtin_shufflevector(tmp1, tmp1, 1, 0, 3, 2);
    minor1 = vsubq_f32(minor1, vmulq_f32(row2, tmp1));
    minor2 = vaddq_f32(vmulq_f32(row1, tmp1), minor2);
    tmp1 = __builtin_shufflevector(tmp1, tmp1, 2, 3, 0, 1);
    minor1 = vaddq_f32(vmulq_f32(row2, tmp1), minor1);
    minor2 = vsubq_f32(minor2, vmulq_f32(row1, tmp1));
    tmp1 = vmulq_f32(row0, row2);
    tmp1 = __builtin_shufflevector(tmp1, tmp1, 1, 0, 3, 2);
    minor1 = vaddq_f32(vmulq_f32(row3, tmp1), minor1);
    minor3 = vsubq_f32(minor3, vmulq_f32(row1, tmp1));
    tmp1 = __builtin_shufflevector(tmp1, tmp1, 2, 3, 0, 1);
    minor1 = vsubq_f32(minor1, vmulq_f32(row3, tmp1));
    minor3 = vaddq_f32(vmulq_f32(row1, tmp1), minor3);
    Type det = vmulq_f32(row0, minor0);
    det = vdupq_n_f32(vaddvq_f32(det));
    det = vdivq_f32(vdupq_n_f32(1.0f), det);

    Mat44 result;
    result.mCol[0].mValue = vmulq_f32(det, minor0);
    result.mCol[1].mValue = vmulq_f32(det, minor1);
    result.mCol[2].mValue = vmulq_f32(det, minor2);
    result.mCol[3].mValue = vmulq_f32(det, minor3);
    return result;
#else
    #error Undefined CPU architecture
#endif
}

Mat44 Mat44::InversedRotationTranslation() const
{
    Mat44 m = Transposed3x3();
    m.SetTranslation(-m.Multiply3x3(GetTranslation()));
    return m;
}
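
// Determinant of the 3x3 part, written as the scalar triple product of the basis axes: det = X . (Y x Z).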

float Mat44::GetDeterminant3x3() const
{
    return GetAxisX().Dot(GetAxisY().Cross(GetAxisZ()));
}

Mat44 Mat44::Adjointed3x3() const
{
    // Adapted from Inversed() to remove 4th column and the division by the determinant
    // Note: This can be optimized.
    JPH_ASSERT(mCol[0][3] == 0.0f);
    JPH_ASSERT(mCol[1][3] == 0.0f);
    JPH_ASSERT(mCol[2][3] == 0.0f);

#if defined(JPH_USE_SSE)
    __m128 tmp1 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(1, 0, 1, 0));
    __m128 row1 = _mm_shuffle_ps(mCol[2].mValue, _mm_setzero_ps(), _MM_SHUFFLE(1, 0, 1, 0));
    __m128 row0 = _mm_shuffle_ps(tmp1, row1, _MM_SHUFFLE(2, 0, 2, 0));
    row1 = _mm_shuffle_ps(row1, tmp1, _MM_SHUFFLE(3, 1, 3, 1));
    tmp1 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(3, 2, 3, 2));
    __m128 row3 = _mm_shuffle_ps(mCol[2].mValue, _mm_set_ps(1, 0, 0, 0), _MM_SHUFFLE(3, 2, 3, 2));
    __m128 row2 = _mm_shuffle_ps(tmp1, row3, _MM_SHUFFLE(2, 0, 2, 0));
    row3 = _mm_shuffle_ps(row3, tmp1, _MM_SHUFFLE(3, 1, 3, 1));
    tmp1 = _mm_mul_ps(row2, row3);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
    __m128 minor0 = _mm_mul_ps(row1, tmp1);
    __m128 minor1 = _mm_mul_ps(row0, tmp1);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
    minor0 = _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0);
    minor1 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1);
    minor1 = _mm_shuffle_ps(minor1, minor1, _MM_SHUFFLE(1, 0, 3, 2));
    tmp1 = _mm_mul_ps(row1, row2);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
    minor0 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
    minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1));
    tmp1 = _mm_mul_ps(_mm_shuffle_ps(row1, row1, _MM_SHUFFLE(1, 0, 3, 2)), row3);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
    row2 = _mm_shuffle_ps(row2, row2, _MM_SHUFFLE(1, 0, 3, 2));
    minor0 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0);
    __m128 minor2 = _mm_mul_ps(row0, tmp1);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
    minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1));
    minor2 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2);
    minor2 = _mm_shuffle_ps(minor2, minor2, _MM_SHUFFLE(1, 0, 3, 2));
    tmp1 = _mm_mul_ps(row0, row1);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
    minor2 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
    minor2 = _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2);
    tmp1 = _mm_mul_ps(row0, row3);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
    minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1));
    minor2 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
    minor1 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1);
    minor2 = _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1));
    tmp1 = _mm_mul_ps(row0, row2);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
    minor1 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
    minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1));

    Mat44 result;
    result.mCol[0].mValue = minor0;
    result.mCol[1].mValue = minor1;
    result.mCol[2].mValue = minor2;
    result.mCol[3] = Vec4(0, 0, 0, 1);
    return result;
#elif defined(JPH_USE_NEON)
    Type v0001 = vsetq_lane_f32(1, vdupq_n_f32(0), 3);
    Type tmp1 = __builtin_shufflevector(mCol[0].mValue, mCol[1].mValue, 0, 1, 4, 5);
    Type row1 = __builtin_shufflevector(mCol[2].mValue, v0001, 0, 1, 4, 5);
    Type row0 = __builtin_shufflevector(tmp1, row1, 0, 2, 4, 6);
    row1 = __builtin_shufflevector(row1, tmp1, 1, 3, 5, 7);
    tmp1 = __builtin_shufflevector(mCol[0].mValue, mCol[1].mValue, 2, 3, 6, 7);
    Type row3 = __builtin_shufflevector(mCol[2].mValue, v0001, 2, 3, 6, 7);
    Type row2 = __builtin_shufflevector(tmp1, row3, 0, 2, 4, 6);
    row3 = __builtin_shufflevector(row3, tmp1, 1, 3, 5, 7);
    tmp1 = vmulq_f32(row2, row3);
    tmp1 = __builtin_shufflevector(tmp1, tmp1, 1, 0, 3, 2);
    Type minor0 = vmulq_f32(row1, tmp1);
    Type minor1 = vmulq_f32(row0, tmp1);
    tmp1 = __builtin_shufflevector(tmp1, tmp1, 2, 3, 0, 1);
    minor0 = vsubq_f32(vmulq_f32(row1, tmp1), minor0);
    minor1 = vsubq_f32(vmulq_f32(row0, tmp1), minor1);
    minor1 = __builtin_shufflevector(minor1, minor1, 2, 3, 0, 1);
    tmp1 = vmulq_f32(row1, row2);
    tmp1 = __builtin_shufflevector(tmp1, tmp1, 1, 0, 3, 2);
    minor0 = vaddq_f32(vmulq_f32(row3, tmp1), minor0);
    tmp1 = __builtin_shufflevector(tmp1, tmp1, 2, 3, 0, 1);
    minor0 = vsubq_f32(minor0, vmulq_f32(row3, tmp1));
    tmp1 = vmulq_f32(__builtin_shufflevector(row1, row1, 2, 3, 0, 1), row3);
    tmp1 = __builtin_shufflevector(tmp1, tmp1, 1, 0, 3, 2);
    row2 = __builtin_shufflevector(row2, row2, 2, 3, 0, 1);
    minor0 = vaddq_f32(vmulq_f32(row2, tmp1), minor0);
    Type minor2 = vmulq_f32(row0, tmp1);
    tmp1 = __builtin_shufflevector(tmp1, tmp1, 2, 3, 0, 1);
    minor0 = vsubq_f32(minor0, vmulq_f32(row2, tmp1));
    minor2 = vsubq_f32(vmulq_f32(row0, tmp1), minor2);
    minor2 = __builtin_shufflevector(minor2, minor2, 2, 3, 0, 1);
    tmp1 = vmulq_f32(row0, row1);
    tmp1 = __builtin_shufflevector(tmp1, tmp1, 1, 0, 3, 2);
    minor2 = vaddq_f32(vmulq_f32(row3, tmp1), minor2);
    tmp1 = __builtin_shufflevector(tmp1, tmp1, 2, 3, 0, 1);
    minor2 = vsubq_f32(vmulq_f32(row3, tmp1), minor2);
    tmp1 = vmulq_f32(row0, row3);
    tmp1 = __builtin_shufflevector(tmp1, tmp1, 1, 0, 3, 2);
    minor1 = vsubq_f32(minor1, vmulq_f32(row2, tmp1));
    minor2 = vaddq_f32(vmulq_f32(row1, tmp1), minor2);
    tmp1 = __builtin_shufflevector(tmp1, tmp1, 2, 3, 0, 1);
    minor1 = vaddq_f32(vmulq_f32(row2, tmp1), minor1);
    minor2 = vsubq_f32(minor2, vmulq_f32(row1, tmp1));
    tmp1 = vmulq_f32(row0, row2);
    tmp1 = __builtin_shufflevector(tmp1, tmp1, 1, 0, 3, 2);
    minor1 = vaddq_f32(vmulq_f32(row3, tmp1), minor1);
    tmp1 = __builtin_shufflevector(tmp1, tmp1, 2, 3, 0, 1);
    minor1 = vsubq_f32(minor1, vmulq_f32(row3, tmp1));

    Mat44 result;
    result.mCol[0].mValue = minor0;
    result.mCol[1].mValue = minor1;
    result.mCol[2].mValue = minor2;
    result.mCol[3].mValue = v0001;
    return result;
#else
    #error Undefined CPU architecture
#endif
}

Mat44 Mat44::Inversed3x3() const
{
    // Adapted from Inversed() to remove 4th column
    // Note: This can be optimized.
    JPH_ASSERT(mCol[0][3] == 0.0f);
    JPH_ASSERT(mCol[1][3] == 0.0f);
    JPH_ASSERT(mCol[2][3] == 0.0f);

#if defined(JPH_USE_SSE)
    __m128 tmp1 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(1, 0, 1, 0));
    __m128 row1 = _mm_shuffle_ps(mCol[2].mValue, _mm_setzero_ps(), _MM_SHUFFLE(1, 0, 1, 0));
    __m128 row0 = _mm_shuffle_ps(tmp1, row1, _MM_SHUFFLE(2, 0, 2, 0));
    row1 = _mm_shuffle_ps(row1, tmp1, _MM_SHUFFLE(3, 1, 3, 1));
    tmp1 = _mm_shuffle_ps(mCol[0].mValue, mCol[1].mValue, _MM_SHUFFLE(3, 2, 3, 2));
    __m128 row3 = _mm_shuffle_ps(mCol[2].mValue, _mm_set_ps(1, 0, 0, 0), _MM_SHUFFLE(3, 2, 3, 2));
    __m128 row2 = _mm_shuffle_ps(tmp1, row3, _MM_SHUFFLE(2, 0, 2, 0));
    row3 = _mm_shuffle_ps(row3, tmp1, _MM_SHUFFLE(3, 1, 3, 1));
    tmp1 = _mm_mul_ps(row2, row3);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
    __m128 minor0 = _mm_mul_ps(row1, tmp1);
    __m128 minor1 = _mm_mul_ps(row0, tmp1);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
    minor0 = _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0);
    minor1 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1);
    minor1 = _mm_shuffle_ps(minor1, minor1, _MM_SHUFFLE(1, 0, 3, 2));
    tmp1 = _mm_mul_ps(row1, row2);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
    minor0 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
    minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1));
    tmp1 = _mm_mul_ps(_mm_shuffle_ps(row1, row1, _MM_SHUFFLE(1, 0, 3, 2)), row3);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
    row2 = _mm_shuffle_ps(row2, row2, _MM_SHUFFLE(1, 0, 3, 2));
    minor0 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0);
    __m128 minor2 = _mm_mul_ps(row0, tmp1);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
    minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1));
    minor2 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2);
    minor2 = _mm_shuffle_ps(minor2, minor2, _MM_SHUFFLE(1, 0, 3, 2));
    tmp1 = _mm_mul_ps(row0, row1);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
    minor2 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
    minor2 = _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2);
    tmp1 = _mm_mul_ps(row0, row3);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
    minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1));
    minor2 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
    minor1 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1);
    minor2 = _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1));
    tmp1 = _mm_mul_ps(row0, row2);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(2, 3, 0, 1));
    minor1 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1);
    tmp1 = _mm_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
    minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1));
    __m128 det = _mm_mul_ps(row0, minor0);
    det = _mm_add_ps(_mm_shuffle_ps(det, det, _MM_SHUFFLE(1, 0, 3, 2)), det);
    det = _mm_add_ss(_mm_shuffle_ps(det, det, _MM_SHUFFLE(2, 3, 0, 1)), det);
    det = _mm_div_ss(_mm_set_ss(1.0f), det);
    det = _mm_shuffle_ps(det, det, _MM_SHUFFLE(0, 0, 0, 0));

    Mat44 result;
    result.mCol[0].mValue = _mm_mul_ps(det, minor0);
    result.mCol[1].mValue = _mm_mul_ps(det, minor1);
    result.mCol[2].mValue = _mm_mul_ps(det, minor2);
    result.mCol[3] = Vec4(0, 0, 0, 1);
    return result;
#elif defined(JPH_USE_NEON)
    Type v0001 = vsetq_lane_f32(1, vdupq_n_f32(0), 3);
    Type tmp1 = __builtin_shufflevector(mCol[0].mValue, mCol[1].mValue, 0, 1, 4, 5);
    Type row1 = __builtin_shufflevector(mCol[2].mValue, v0001, 0, 1, 4, 5);
    Type row0 = __builtin_shufflevector(tmp1, row1, 0, 2, 4, 6);
    row1 = __builtin_shufflevector(row1, tmp1, 1, 3, 5, 7);
    tmp1 = __builtin_shufflevector(mCol[0].mValue, mCol[1].mValue, 2, 3, 6, 7);
    Type row3 = __builtin_shufflevector(mCol[2].mValue, v0001, 2, 3, 6, 7);
    Type row2 = __builtin_shufflevector(tmp1, row3, 0, 2, 4, 6);
    row3 = __builtin_shufflevector(row3, tmp1, 1, 3, 5, 7);
    tmp1 = vmulq_f32(row2, row3);
    tmp1 = __builtin_shufflevector(tmp1, tmp1, 1, 0, 3, 2);
    Type minor0 = vmulq_f32(row1, tmp1);
    Type minor1 = vmulq_f32(row0, tmp1);
    tmp1 = __builtin_shufflevector(tmp1, tmp1, 2, 3, 0, 1);
    minor0 = vsubq_f32(vmulq_f32(row1, tmp1), minor0);
    minor1 = vsubq_f32(vmulq_f32(row0, tmp1), minor1);
    minor1 = __builtin_shufflevector(minor1, minor1, 2, 3, 0, 1);
    tmp1 = vmulq_f32(row1, row2);
    tmp1 = __builtin_shufflevector(tmp1, tmp1, 1, 0, 3, 2);
    minor0 = vaddq_f32(vmulq_f32(row3, tmp1), minor0);
    tmp1 = __builtin_shufflevector(tmp1, tmp1, 2, 3, 0, 1);
    minor0 = vsubq_f32(minor0, vmulq_f32(row3, tmp1));
    tmp1 = vmulq_f32(__builtin_shufflevector(row1, row1, 2, 3, 0, 1), row3);
    tmp1 = __builtin_shufflevector(tmp1, tmp1, 1, 0, 3, 2);
    row2 = __builtin_shufflevector(row2, row2, 2, 3, 0, 1);
    minor0 = vaddq_f32(vmulq_f32(row2, tmp1), minor0);
    Type minor2 = vmulq_f32(row0, tmp1);
    tmp1 = __builtin_shufflevector(tmp1, tmp1, 2, 3, 0, 1);
    minor0 = vsubq_f32(minor0, vmulq_f32(row2, tmp1));
    minor2 = vsubq_f32(vmulq_f32(row0, tmp1), minor2);
    minor2 = __builtin_shufflevector(minor2, minor2, 2, 3, 0, 1);
    tmp1 = vmulq_f32(row0, row1);
    tmp1 = __builtin_shufflevector(tmp1, tmp1, 1, 0, 3, 2);
    minor2 = vaddq_f32(vmulq_f32(row3, tmp1), minor2);
    tmp1 = __builtin_shufflevector(tmp1, tmp1, 2, 3, 0, 1);
    minor2 = vsubq_f32(vmulq_f32(row3, tmp1), minor2);
    tmp1 = vmulq_f32(row0, row3);
    tmp1 = __builtin_shufflevector(tmp1, tmp1, 1, 0, 3, 2);
    minor1 = vsubq_f32(minor1, vmulq_f32(row2, tmp1));
    minor2 = vaddq_f32(vmulq_f32(row1, tmp1), minor2);
    tmp1 = __builtin_shufflevector(tmp1, tmp1, 2, 3, 0, 1);
    minor1 = vaddq_f32(vmulq_f32(row2, tmp1), minor1);
    minor2 = vsubq_f32(minor2, vmulq_f32(row1, tmp1));
    tmp1 = vmulq_f32(row0, row2);
    tmp1 = __builtin_shufflevector(tmp1, tmp1, 1, 0, 3, 2);
    minor1 = vaddq_f32(vmulq_f32(row3, tmp1), minor1);
    tmp1 = __builtin_shufflevector(tmp1, tmp1, 2, 3, 0, 1);
    minor1 = vsubq_f32(minor1, vmulq_f32(row3, tmp1));
    Type det = vmulq_f32(row0, minor0);
    det = vdupq_n_f32(vaddvq_f32(det));
    det = vdivq_f32(vdupq_n_f32(1.0f), det);

    Mat44 result;
    result.mCol[0].mValue = vmulq_f32(det, minor0);
    result.mCol[1].mValue = vmulq_f32(det, minor1);
    result.mCol[2].mValue = vmulq_f32(det, minor2);
    result.mCol[3].mValue = v0001;
    return result;
#else
    #error Undefined CPU architecture
#endif
}
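
// Converts the rotation part back to a quaternion. When the trace of the 3x3 part is non-negative,
// w can be computed directly from it; otherwise the largest diagonal element is used so that the
// square root stays well conditioned.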

const Quat Mat44::GetQuaternion() const
{
    JPH_ASSERT(mCol[3] == Vec4(0, 0, 0, 1));

    float tr = mCol[0].mF32[0] + mCol[1].mF32[1] + mCol[2].mF32[2];

    if (tr >= 0.0f)
    {
        float s = sqrt(tr + 1.0f);
        float is = 0.5f / s;
        return Quat(
            (mCol[1].mF32[2] - mCol[2].mF32[1]) * is,
            (mCol[2].mF32[0] - mCol[0].mF32[2]) * is,
            (mCol[0].mF32[1] - mCol[1].mF32[0]) * is,
            0.5f * s);
    }
    else
    {
        int i = 0;
        if (mCol[1].mF32[1] > mCol[0].mF32[0]) i = 1;
        if (mCol[2].mF32[2] > mCol[i].mF32[i]) i = 2;

        if (i == 0)
        {
            float s = sqrt(mCol[0].mF32[0] - (mCol[1].mF32[1] + mCol[2].mF32[2]) + 1);
            float is = 0.5f / s;
            return Quat(
                0.5f * s,
                (mCol[1].mF32[0] + mCol[0].mF32[1]) * is,
                (mCol[0].mF32[2] + mCol[2].mF32[0]) * is,
                (mCol[1].mF32[2] - mCol[2].mF32[1]) * is);
        }
        else if (i == 1)
        {
            float s = sqrt(mCol[1].mF32[1] - (mCol[2].mF32[2] + mCol[0].mF32[0]) + 1);
            float is = 0.5f / s;
            return Quat(
                (mCol[1].mF32[0] + mCol[0].mF32[1]) * is,
                0.5f * s,
                (mCol[2].mF32[1] + mCol[1].mF32[2]) * is,
                (mCol[2].mF32[0] - mCol[0].mF32[2]) * is);
        }
        else
        {
            JPH_ASSERT(i == 2);

            float s = sqrt(mCol[2].mF32[2] - (mCol[0].mF32[0] + mCol[1].mF32[1]) + 1);
            float is = 0.5f / s;
            return Quat(
                (mCol[0].mF32[2] + mCol[2].mF32[0]) * is,
                (mCol[2].mF32[1] + mCol[1].mF32[2]) * is,
                0.5f * s,
                (mCol[0].mF32[1] - mCol[1].mF32[0]) * is);
        }
    }
}
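
// sQuatLeftMultiply and sQuatRightMultiply build the 4x4 matrix form of quaternion multiplication:
// treating a quaternion p as the column vector (x, y, z, w), sQuatLeftMultiply(q) * p computes q * p
// and sQuatRightMultiply(q) * p computes p * q.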

Mat44 Mat44::sQuatLeftMultiply(QuatArg inQ)
{
    return Mat44(
        Vec4(1, 1, -1, -1) * inQ.mValue.Swizzle<SWIZZLE_W, SWIZZLE_Z, SWIZZLE_Y, SWIZZLE_X>(),
        Vec4(-1, 1, 1, -1) * inQ.mValue.Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_X, SWIZZLE_Y>(),
        Vec4(1, -1, 1, -1) * inQ.mValue.Swizzle<SWIZZLE_Y, SWIZZLE_X, SWIZZLE_W, SWIZZLE_Z>(),
        inQ.mValue);
}

Mat44 Mat44::sQuatRightMultiply(QuatArg inQ)
{
    return Mat44(
        Vec4(1, -1, 1, -1) * inQ.mValue.Swizzle<SWIZZLE_W, SWIZZLE_Z, SWIZZLE_Y, SWIZZLE_X>(),
        Vec4(1, 1, -1, -1) * inQ.mValue.Swizzle<SWIZZLE_Z, SWIZZLE_W, SWIZZLE_X, SWIZZLE_Y>(),
        Vec4(-1, 1, 1, -1) * inQ.mValue.Swizzle<SWIZZLE_Y, SWIZZLE_X, SWIZZLE_W, SWIZZLE_Z>(),
        inQ.mValue);
}

Mat44 Mat44::GetRotation() const
{
    JPH_ASSERT(mCol[0][3] == 0.0f);
    JPH_ASSERT(mCol[1][3] == 0.0f);
    JPH_ASSERT(mCol[2][3] == 0.0f);

    return Mat44(mCol[0], mCol[1], mCol[2], Vec4(0, 0, 0, 1));
}

Mat44 Mat44::GetRotationSafe() const
{
#if defined(JPH_USE_SSE)
    __m128 zero = _mm_setzero_ps();
    return Mat44(_mm_blend_ps(mCol[0].mValue, zero, 8),
                 _mm_blend_ps(mCol[1].mValue, zero, 8),
                 _mm_blend_ps(mCol[2].mValue, zero, 8),
                 Vec4(0, 0, 0, 1));
#elif defined(JPH_USE_NEON)
    return Mat44(vsetq_lane_f32(0, mCol[0].mValue, 3),
                 vsetq_lane_f32(0, mCol[1].mValue, 3),
                 vsetq_lane_f32(0, mCol[2].mValue, 3),
                 Vec4(0, 0, 0, 1));
#else
    #error Unsupported CPU architecture
#endif
}

void Mat44::SetRotation(Mat44Arg inRotation)
{
    mCol[0] = inRotation.mCol[0];
    mCol[1] = inRotation.mCol[1];
    mCol[2] = inRotation.mCol[2];
}
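
// PreScaled applies the scale before this transform (equivalent to this * Mat44::sScale(inScale)),
// PostScaled applies it afterwards (equivalent to Mat44::sScale(inScale) * this).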

Mat44 Mat44::PreScaled(Vec3Arg inScale) const
{
    return Mat44(inScale.GetX() * mCol[0], inScale.GetY() * mCol[1], inScale.GetZ() * mCol[2], mCol[3]);
}

Mat44 Mat44::PostScaled(Vec3Arg inScale) const
{
    Vec4 scale(inScale, 1);
    return Mat44(scale * mCol[0], scale * mCol[1], scale * mCol[2], scale * mCol[3]);
}
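
// Splits this transform into an orthonormal rotation/translation matrix and a per axis scale using
// modified Gram-Schmidt: outScale receives the extracted scale (Z negated if the basis is left
// handed) and the returned matrix keeps the normalized axes plus the original translation.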

Mat44 Mat44::Decompose(Vec3 &outScale) const
{
    // Start the modified Gram-Schmidt algorithm
    // X axis will just be normalized
    Vec3 x = GetAxisX();

    // Make Y axis perpendicular to X
    Vec3 y = GetAxisY();
    float x_dot_x = x.LengthSq();
    y -= (x.Dot(y) / x_dot_x) * x;

    // Make Z axis perpendicular to X
    Vec3 z = GetAxisZ();
    z -= (x.Dot(z) / x_dot_x) * x;

    // Make Z axis perpendicular to Y
    float y_dot_y = y.LengthSq();
    z -= (y.Dot(z) / y_dot_y) * y;

    // Determine the scale
    float z_dot_z = z.LengthSq();
    outScale = Vec3(x_dot_x, y_dot_y, z_dot_z).Sqrt();

    // If the resulting x, y and z vectors don't form a right handed matrix, flip the z axis.
    if (x.Cross(y).Dot(z) < 0.0f)
        outScale.SetZ(-outScale.GetZ());

    // Determine the rotation and translation
    return Mat44(Vec4(x / outScale.GetX(), 0), Vec4(y / outScale.GetY(), 0), Vec4(z / outScale.GetZ(), 0), GetColumn4(3));
}

} // JPH