|
@@ -103,17 +103,10 @@ namespace glm {
|
|
|
auto MulRow = [&](int l) {
|
|
auto MulRow = [&](int l) {
|
|
|
float32x4_t const SrcA = m2[l].data;
|
|
float32x4_t const SrcA = m2[l].data;
|
|
|
|
|
|
|
|
-#if GLM_ARCH & GLM_ARCH_ARMV8_BIT
|
|
|
|
|
- float32x4_t r= vmulq_laneq_f32(m1[0].data, SrcA, 0);
|
|
|
|
|
- r = vaddq_f32(r, vmulq_laneq_f32(m1[1].data, SrcA, 1));
|
|
|
|
|
- r = vaddq_f32(r, vmulq_laneq_f32(m1[2].data, SrcA, 2));
|
|
|
|
|
- r = vaddq_f32(r, vmulq_laneq_f32(m1[3].data, SrcA, 3));
|
|
|
|
|
-#else
|
|
|
|
|
- float32x4_t r= vmulq_f32(m1[0].data, vdupq_n_f32(vgetq_lane_f32(SrcA, 0)));
|
|
|
|
|
- r = vaddq_f32(r, vmulq_f32(m1[1].data, vdupq_n_f32(vgetq_lane_f32(SrcA, 1))));
|
|
|
|
|
- r = vaddq_f32(r, vmulq_f32(m1[2].data, vdupq_n_f32(vgetq_lane_f32(SrcA, 2))));
|
|
|
|
|
- r = vaddq_f32(r, vmulq_f32(m1[3].data, vdupq_n_f32(vgetq_lane_f32(SrcA, 3))));
|
|
|
|
|
-#endif
|
|
|
|
|
|
|
+ float32x4_t r = neon::mul_lane(m1[0].data, SrcA, 0);
|
|
|
|
|
+ r = neon::madd_lane(r, m1[1].data, SrcA, 1);
|
|
|
|
|
+ r = neon::madd_lane(r, m1[2].data, SrcA, 2);
|
|
|
|
|
+ r = neon::madd_lane(r, m1[3].data, SrcA, 3);
|
|
|
|
|
|
|
|
return r;
|
|
return r;
|
|
|
};
|
|
};
|
|
@@ -127,5 +120,130 @@ namespace glm {
|
|
|
return Result;
|
|
return Result;
|
|
|
}
|
|
}
|
|
|
#endif // CXX11
|
|
#endif // CXX11
|
|
|
|
|
+
|
|
|
|
|
+ template<qualifier Q>
|
|
|
|
|
+ struct detail::compute_inverse<4, 4, float, Q, true>
|
|
|
|
|
+ {
|
|
|
|
|
+ GLM_FUNC_QUALIFIER static mat<4, 4, float, Q> call(mat<4, 4, float, Q> const& m)
|
|
|
|
|
+ {
|
|
|
|
|
+ float32x4_t const& m0 = m[0].data;
|
|
|
|
|
+ float32x4_t const& m1 = m[1].data;
|
|
|
|
|
+ float32x4_t const& m2 = m[2].data;
|
|
|
|
|
+ float32x4_t const& m3 = m[3].data;
|
|
|
|
|
+
|
|
|
|
|
+ // m[2][2] * m[3][3] - m[3][2] * m[2][3];
|
|
|
|
|
+ // m[2][2] * m[3][3] - m[3][2] * m[2][3];
|
|
|
|
|
+ // m[1][2] * m[3][3] - m[3][2] * m[1][3];
|
|
|
|
|
+ // m[1][2] * m[2][3] - m[2][2] * m[1][3];
|
|
|
|
|
+
|
|
|
|
|
+ float32x4_t Fac0;
|
|
|
|
|
+ {
|
|
|
|
|
+ float32x4_t w0 = vcombine_f32(neon::dup_lane(m2, 2), neon::dup_lane(m1, 2));
|
|
|
|
|
+ float32x4_t w1 = neon::copy_lane(neon::dupq_lane(m3, 3), 3, m2, 3);
|
|
|
|
|
+ float32x4_t w2 = neon::copy_lane(neon::dupq_lane(m3, 2), 3, m2, 2);
|
|
|
|
|
+ float32x4_t w3 = vcombine_f32(neon::dup_lane(m2, 3), neon::dup_lane(m1, 3));
|
|
|
|
|
+ Fac0 = w0 * w1 - w2 * w3;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ // m[2][1] * m[3][3] - m[3][1] * m[2][3];
|
|
|
|
|
+ // m[2][1] * m[3][3] - m[3][1] * m[2][3];
|
|
|
|
|
+ // m[1][1] * m[3][3] - m[3][1] * m[1][3];
|
|
|
|
|
+ // m[1][1] * m[2][3] - m[2][1] * m[1][3];
|
|
|
|
|
+
|
|
|
|
|
+ float32x4_t Fac1;
|
|
|
|
|
+ {
|
|
|
|
|
+ float32x4_t w0 = vcombine_f32(neon::dup_lane(m2, 1), neon::dup_lane(m1, 1));
|
|
|
|
|
+ float32x4_t w1 = neon::copy_lane(neon::dupq_lane(m3, 3), 3, m2, 3);
|
|
|
|
|
+ float32x4_t w2 = neon::copy_lane(neon::dupq_lane(m3, 1), 3, m2, 1);
|
|
|
|
|
+ float32x4_t w3 = vcombine_f32(neon::dup_lane(m2, 3), neon::dup_lane(m1, 3));
|
|
|
|
|
+ Fac1 = w0 * w1 - w2 * w3;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ // m[2][1] * m[3][2] - m[3][1] * m[2][2];
|
|
|
|
|
+ // m[2][1] * m[3][2] - m[3][1] * m[2][2];
|
|
|
|
|
+ // m[1][1] * m[3][2] - m[3][1] * m[1][2];
|
|
|
|
|
+ // m[1][1] * m[2][2] - m[2][1] * m[1][2];
|
|
|
|
|
+
|
|
|
|
|
+ float32x4_t Fac2;
|
|
|
|
|
+ {
|
|
|
|
|
+ float32x4_t w0 = vcombine_f32(neon::dup_lane(m2, 1), neon::dup_lane(m1, 1));
|
|
|
|
|
+ float32x4_t w1 = neon::copy_lane(neon::dupq_lane(m3, 2), 3, m2, 2);
|
|
|
|
|
+ float32x4_t w2 = neon::copy_lane(neon::dupq_lane(m3, 1), 3, m2, 1);
|
|
|
|
|
+ float32x4_t w3 = vcombine_f32(neon::dup_lane(m2, 2), neon::dup_lane(m1, 2));
|
|
|
|
|
+ Fac2 = w0 * w1 - w2 * w3;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ // m[2][0] * m[3][3] - m[3][0] * m[2][3];
|
|
|
|
|
+ // m[2][0] * m[3][3] - m[3][0] * m[2][3];
|
|
|
|
|
+ // m[1][0] * m[3][3] - m[3][0] * m[1][3];
|
|
|
|
|
+ // m[1][0] * m[2][3] - m[2][0] * m[1][3];
|
|
|
|
|
+
|
|
|
|
|
+ float32x4_t Fac3;
|
|
|
|
|
+ {
|
|
|
|
|
+ float32x4_t w0 = vcombine_f32(neon::dup_lane(m2, 0), neon::dup_lane(m1, 0));
|
|
|
|
|
+ float32x4_t w1 = neon::copy_lane(neon::dupq_lane(m3, 3), 3, m2, 3);
|
|
|
|
|
+ float32x4_t w2 = neon::copy_lane(neon::dupq_lane(m3, 0), 3, m2, 0);
|
|
|
|
|
+ float32x4_t w3 = vcombine_f32(neon::dup_lane(m2, 3), neon::dup_lane(m1, 3));
|
|
|
|
|
+ Fac3 = w0 * w1 - w2 * w3;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ // m[2][0] * m[3][2] - m[3][0] * m[2][2];
|
|
|
|
|
+ // m[2][0] * m[3][2] - m[3][0] * m[2][2];
|
|
|
|
|
+ // m[1][0] * m[3][2] - m[3][0] * m[1][2];
|
|
|
|
|
+ // m[1][0] * m[2][2] - m[2][0] * m[1][2];
|
|
|
|
|
+
|
|
|
|
|
+ float32x4_t Fac4;
|
|
|
|
|
+ {
|
|
|
|
|
+ float32x4_t w0 = vcombine_f32(neon::dup_lane(m2, 0), neon::dup_lane(m1, 0));
|
|
|
|
|
+ float32x4_t w1 = neon::copy_lane(neon::dupq_lane(m3, 2), 3, m2, 2);
|
|
|
|
|
+ float32x4_t w2 = neon::copy_lane(neon::dupq_lane(m3, 0), 3, m2, 0);
|
|
|
|
|
+ float32x4_t w3 = vcombine_f32(neon::dup_lane(m2, 2), neon::dup_lane(m1, 2));
|
|
|
|
|
+ Fac4 = w0 * w1 - w2 * w3;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ // m[2][0] * m[3][1] - m[3][0] * m[2][1];
|
|
|
|
|
+ // m[2][0] * m[3][1] - m[3][0] * m[2][1];
|
|
|
|
|
+ // m[1][0] * m[3][1] - m[3][0] * m[1][1];
|
|
|
|
|
+ // m[1][0] * m[2][1] - m[2][0] * m[1][1];
|
|
|
|
|
+
|
|
|
|
|
+ float32x4_t Fac5;
|
|
|
|
|
+ {
|
|
|
|
|
+ float32x4_t w0 = vcombine_f32(neon::dup_lane(m2, 0), neon::dup_lane(m1, 0));
|
|
|
|
|
+ float32x4_t w1 = neon::copy_lane(neon::dupq_lane(m3, 1), 3, m2, 1);
|
|
|
|
|
+ float32x4_t w2 = neon::copy_lane(neon::dupq_lane(m3, 0), 3, m2, 0);
|
|
|
|
|
+ float32x4_t w3 = vcombine_f32(neon::dup_lane(m2, 1), neon::dup_lane(m1, 1));
|
|
|
|
|
+ Fac5 = w0 * w1 - w2 * w3;
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ float32x4_t Vec0 = neon::copy_lane(neon::dupq_lane(m0, 0), 0, m1, 0); // (m[1][0], m[0][0], m[0][0], m[0][0]);
|
|
|
|
|
+ float32x4_t Vec1 = neon::copy_lane(neon::dupq_lane(m0, 1), 0, m1, 1); // (m[1][1], m[0][1], m[0][1], m[0][1]);
|
|
|
|
|
+ float32x4_t Vec2 = neon::copy_lane(neon::dupq_lane(m0, 2), 0, m1, 2); // (m[1][2], m[0][2], m[0][2], m[0][2]);
|
|
|
|
|
+ float32x4_t Vec3 = neon::copy_lane(neon::dupq_lane(m0, 3), 0, m1, 3); // (m[1][3], m[0][3], m[0][3], m[0][3]);
|
|
|
|
|
+
|
|
|
|
|
+ float32x4_t Inv0 = Vec1 * Fac0 - Vec2 * Fac1 + Vec3 * Fac2;
|
|
|
|
|
+ float32x4_t Inv1 = Vec0 * Fac0 - Vec2 * Fac3 + Vec3 * Fac4;
|
|
|
|
|
+ float32x4_t Inv2 = Vec0 * Fac1 - Vec1 * Fac3 + Vec3 * Fac5;
|
|
|
|
|
+ float32x4_t Inv3 = Vec0 * Fac2 - Vec1 * Fac4 + Vec2 * Fac5;
|
|
|
|
|
+
|
|
|
|
|
+ float32x4_t r0 = float32x4_t{-1, +1, -1, +1} * Inv0;
|
|
|
|
|
+ float32x4_t r1 = float32x4_t{+1, -1, +1, -1} * Inv1;
|
|
|
|
|
+ float32x4_t r2 = float32x4_t{-1, +1, -1, +1} * Inv2;
|
|
|
|
|
+ float32x4_t r3 = float32x4_t{+1, -1, +1, -1} * Inv3;
|
|
|
|
|
+
|
|
|
|
|
+ float32x4_t det = neon::mul_lane(r0, m0, 0);
|
|
|
|
|
+ det = neon::madd_lane(det, r1, m0, 1);
|
|
|
|
|
+ det = neon::madd_lane(det, r2, m0, 2);
|
|
|
|
|
+ det = neon::madd_lane(det, r3, m0, 3);
|
|
|
|
|
+
|
|
|
|
|
+ float32x4_t rdet = vdupq_n_f32(1 / vgetq_lane_f32(det, 0));
|
|
|
|
|
+
|
|
|
|
|
+ mat<4, 4, float, Q> r;
|
|
|
|
|
+ r[0].data = vmulq_f32(r0, rdet);
|
|
|
|
|
+ r[1].data = vmulq_f32(r1, rdet);
|
|
|
|
|
+ r[2].data = vmulq_f32(r2, rdet);
|
|
|
|
|
+ r[3].data = vmulq_f32(r3, rdet);
|
|
|
|
|
+ return r;
|
|
|
|
|
+ }
|
|
|
|
|
+ };
|
|
|
}//namespace glm
|
|
}//namespace glm
|
|
|
#endif
|
|
#endif
|