@@ -9,6 +9,23 @@
 
 namespace bx
 {
+#if BX_COMPILER_CLANG
+
+#define SHUFFLE_A(_a, _i0, _i1, _i2, _i3) \
+	__builtin_shufflevector(_a, _a, _i0, _i1, _i2, _i3 )
+#define SHUFFLE_AB(_a, _b, _i0, _i1, _i2, _i3) \
+	__builtin_shufflevector(_a, _b, _i0, _i1, _i2, _i3 )
+
+#else
+
+#define SHUFFLE_A(_a, _i0, _i1, _i2, _i3) \
+	__builtin_shuffle(_a, (uint32x4_t){ _i0, _i1, _i2, _i3 })
+#define SHUFFLE_AB(_a, _b, _i0, _i1, _i2, _i3) \
+	__builtin_shuffle(_a, _b, (uint32x4_t){ _i0, _i1, _i2, _i3 })
+
+#endif
+
+
 #define ELEMx 0
 #define ELEMy 1
 #define ELEMz 2
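Note: Clang does not implement GCC's `__builtin_shuffle`, but it does provide `__builtin_shufflevector`, which takes the two source vectors followed by the lane indices as individual constant arguments instead of a mask vector. The `SHUFFLE_A`/`SHUFFLE_AB` macros added above paper over that difference. A minimal standalone sketch of the same selection written both ways (the function name `interleave_xy` is illustrative, not from the patch):

#include <arm_neon.h>

// Illustrative only, not part of the patch: both builtins express the same
// lane selection. Indices 0-3 address the first vector, 4-7 the second.
static inline float32x4_t interleave_xy(float32x4_t _a, float32x4_t _b)
{
#if defined(__clang__)
	// Clang: lane indices are passed as individual integer constants.
	return __builtin_shufflevector(_a, _b, 0, 4, 1, 5);
#else
	// GCC: lane indices are packed into an integer mask vector.
	return __builtin_shuffle(_a, _b, (uint32x4_t){ 0, 4, 1, 5 });
#endif
}
// Either way the result is { _a[0], _b[0], _a[1], _b[1] }.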
@@ -17,7 +34,7 @@ namespace bx
 	template<> \
 	BX_SIMD_FORCE_INLINE simd128_neon_t simd_swiz_##_x##_y##_z##_w(simd128_neon_t _a) \
 	{ \
-		return __builtin_shuffle(_a, (uint32x4_t){ ELEM##_x, ELEM##_y, ELEM##_z, ELEM##_w }); \
+		return SHUFFLE_A(_a, ELEM##_x, ELEM##_y, ELEM##_z, ELEM##_w ); \
 	}
 
 #include "simd128_swizzle.inl"
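Note: with this hunk, every swizzle specialization that `simd128_swizzle.inl` stamps out routes through `SHUFFLE_A`, so one macro definition covers both compilers. As a sketch, instantiating `BX_SIMD128_IMPLEMENT_SWIZZLE(y, z, w, x)` expands to roughly the following (this relies on `simd128_neon_t` and `BX_SIMD_FORCE_INLINE` from the surrounding file; the concrete permutation is my example):

// Sketch of one generated specialization. With ELEMy=1, ELEMz=2, ELEMw=3,
// ELEMx=0 the mask is { 1, 2, 3, 0 }.
template<>
BX_SIMD_FORCE_INLINE simd128_neon_t simd_swiz_yzwx(simd128_neon_t _a)
{
	return SHUFFLE_A(_a, 1, 2, 3, 0 ); // lanes y, z, w, x of _a
}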
@@ -74,50 +91,52 @@ BX_SIMD128_IMPLEMENT_TEST(yzw, yzww);
 	template<>
 	BX_SIMD_FORCE_INLINE simd128_neon_t simd_shuf_xyAB(simd128_neon_t _a, simd128_neon_t _b)
 	{
-		return __builtin_shuffle(_a, _b, (uint32x4_t){ 0, 1, 4, 5 });
+		return SHUFFLE_AB(_a, _b, 0, 1, 4, 5 );
 	}
 
 	template<>
 	BX_SIMD_FORCE_INLINE simd128_neon_t simd_shuf_ABxy(simd128_neon_t _a, simd128_neon_t _b)
 	{
-		return __builtin_shuffle(_a, _b, (uint32x4_t){ 4, 5, 0, 1 });
+		return SHUFFLE_AB(_a, _b, 4, 5, 0, 1 );
 	}
 
 	template<>
 	BX_SIMD_FORCE_INLINE simd128_neon_t simd_shuf_CDzw(simd128_neon_t _a, simd128_neon_t _b)
 	{
-		return __builtin_shuffle(_a, _b, (uint32x4_t){ 6, 7, 2, 3 });
+		return SHUFFLE_AB(_a, _b, 6, 7, 2, 3 );
 	}
 
 	template<>
 	BX_SIMD_FORCE_INLINE simd128_neon_t simd_shuf_zwCD(simd128_neon_t _a, simd128_neon_t _b)
 	{
-		return __builtin_shuffle(_a, _b, (uint32x4_t){ 2, 3, 6, 7 });
+		return SHUFFLE_AB(_a, _b, 2, 3, 6, 7 );
 	}
 
 	template<>
 	BX_SIMD_FORCE_INLINE simd128_neon_t simd_shuf_xAyB(simd128_neon_t _a, simd128_neon_t _b)
 	{
-		return __builtin_shuffle(_a, _b, (uint32x4_t){ 0, 4, 1, 5 });
+		return SHUFFLE_AB(_a, _b, 0, 4, 1, 5 );
 	}
 
 	template<>
 	BX_SIMD_FORCE_INLINE simd128_neon_t simd_shuf_AxBy(simd128_neon_t _a, simd128_neon_t _b)
 	{
-		return __builtin_shuffle(_a, _b, (uint32x4_t){ 1, 5, 0, 4 });
+		return SHUFFLE_AB(_a, _b, 4, 0, 5, 1 );
 	}
 
 	template<>
 	BX_SIMD_FORCE_INLINE simd128_neon_t simd_shuf_zCwD(simd128_neon_t _a, simd128_neon_t _b)
 	{
-		return __builtin_shuffle(_a, _b, (uint32x4_t){ 2, 6, 3, 7 });
+		return SHUFFLE_AB(_a, _b, 2, 6, 3, 7 );
 	}
 
 	template<>
 	BX_SIMD_FORCE_INLINE simd128_neon_t simd_shuf_CzDw(simd128_neon_t _a, simd128_neon_t _b)
 	{
-		return __builtin_shuffle(_a, _b, (uint32x4_t){ 6, 2, 7, 3 });
+		return SHUFFLE_AB(_a, _b, 6, 2, 7, 3 );
 	}
+#undef SHUFFLE_A
+#undef SHUFFLE_AB
 
 	template<>
 	BX_SIMD_FORCE_INLINE float simd_x(simd128_neon_t _a)
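Note: one change in this hunk is more than a mechanical macro swap. `simd_shuf_AxBy` previously used the mask `{ 1, 5, 0, 4 }`, which, with `_a` = { x, y, z, w } on lane indices 0-3 and `_b` = { A, B, C, D } on indices 4-7, selects { y, B, x, A }; the new mask `4, 0, 5, 1` selects { A, x, B, y }, matching the function's name. A hypothetical spot check (not part of the patch, requires an ARM target):

#include <arm_neon.h>
#include <assert.h>

// Verifies the corrected AxBy mask against scalar lane reads.
int main()
{
	const float af[4] = { 1.0f, 2.0f, 3.0f, 4.0f }; // x, y, z, w
	const float bf[4] = { 5.0f, 6.0f, 7.0f, 8.0f }; // A, B, C, D
	float32x4_t a = vld1q_f32(af);
	float32x4_t b = vld1q_f32(bf);
#if defined(__clang__)
	float32x4_t r = __builtin_shufflevector(a, b, 4, 0, 5, 1);
#else
	float32x4_t r = __builtin_shuffle(a, b, (uint32x4_t){ 4, 0, 5, 1 });
#endif
	assert(vgetq_lane_f32(r, 0) == 5.0f); // A
	assert(vgetq_lane_f32(r, 1) == 1.0f); // x
	assert(vgetq_lane_f32(r, 2) == 6.0f); // B
	assert(vgetq_lane_f32(r, 3) == 2.0f); // y
	return 0;
}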
@@ -367,6 +386,7 @@ BX_SIMD128_IMPLEMENT_TEST(yzw, yzww);
 	template<>
 	BX_SIMD_FORCE_INLINE simd128_neon_t simd_sll(simd128_neon_t _a, int _count)
 	{
+#if !BX_COMPILER_CLANG
 		if (__builtin_constant_p(_count) )
 		{
 			const uint32x4_t tmp0 = vreinterpretq_u32_f32(_a);
@@ -375,7 +395,7 @@ BX_SIMD128_IMPLEMENT_TEST(yzw, yzww);
 
 			return result;
 		}
-
+#endif
 		const uint32x4_t tmp0 = vreinterpretq_u32_f32(_a);
 		const int32x4_t shift = vdupq_n_s32(_count);
 		const uint32x4_t tmp1 = vshlq_u32(tmp0, shift);
@@ -387,6 +407,7 @@ BX_SIMD128_IMPLEMENT_TEST(yzw, yzww);
 	template<>
 	BX_SIMD_FORCE_INLINE simd128_neon_t simd_srl(simd128_neon_t _a, int _count)
 	{
+#if !BX_COMPILER_CLANG
 		if (__builtin_constant_p(_count) )
 		{
 			const uint32x4_t tmp0 = vreinterpretq_u32_f32(_a);
@@ -395,7 +416,7 @@ BX_SIMD128_IMPLEMENT_TEST(yzw, yzww);
 
 			return result;
 		}
-
+#endif
 		const uint32x4_t tmp0 = vreinterpretq_u32_f32(_a);
 		const int32x4_t shift = vdupq_n_s32(-_count);
 		const uint32x4_t tmp1 = vshlq_u32(tmp0, shift);
@@ -407,6 +428,7 @@ BX_SIMD128_IMPLEMENT_TEST(yzw, yzww);
 	template<>
 	BX_SIMD_FORCE_INLINE simd128_neon_t simd_sra(simd128_neon_t _a, int _count)
 	{
+#if !BX_COMPILER_CLANG
 		if (__builtin_constant_p(_count) )
 		{
 			const int32x4_t tmp0 = vreinterpretq_s32_f32(_a);
@@ -415,7 +437,7 @@ BX_SIMD128_IMPLEMENT_TEST(yzw, yzww);
 
 			return result;
 		}
-
+#endif
 		const int32x4_t tmp0 = vreinterpretq_s32_f32(_a);
 		const int32x4_t shift = vdupq_n_s32(-_count);
 		const int32x4_t tmp1 = vshlq_s32(tmp0, shift);
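Note: the `#if !BX_COMPILER_CLANG` guards compile the `__builtin_constant_p` fast path out of `simd_sll`/`simd_srl`/`simd_sra` under Clang. A plausible reading (the patch itself doesn't say): inside that branch the count feeds immediate-form shifts such as `vshlq_n_u32`, and Clang, unlike GCC, does not accept a `__builtin_constant_p`-guarded function parameter where an integer constant is required, so the branch fails to build. The variable-count fallback kept for both compilers works because NEON's `vshlq` shifts left for positive lane counts and right for negative ones. A hypothetical standalone version of that fallback (the name `srl_u32` is mine, not from bx):

#include <arm_neon.h>

// NEON has no variable right-shift intrinsic, so vshlq_u32 is used with a
// negated count: negative shift values shift right.
static inline float32x4_t srl_u32(float32x4_t _a, int _count)
{
	const uint32x4_t tmp0 = vreinterpretq_u32_f32(_a);
	const int32x4_t shift = vdupq_n_s32(-_count);
	const uint32x4_t tmp1 = vshlq_u32(tmp0, shift);
	return vreinterpretq_f32_u32(tmp1);
}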