|
@@ -139,6 +139,27 @@ index d9c3808..b58c36a 100644
|
|
|
v *= Reciprocal(HorizontalMax(Abs(v)));
|
|
|
}
|
|
|
#if POWER_ITERATION_COUNT <= 0
|
|
|
+diff --git a/maths_sse.h b/maths_sse.h
|
|
|
+index eeff0d5..2a8eb89 100644
|
|
|
+--- a/maths_sse.h
|
|
|
++++ b/maths_sse.h
|
|
|
+@@ -25,11 +25,15 @@
|
|
|
+
|
|
|
+ #ifndef SQUISH_MATH_SSE_H
|
|
|
+ #define SQUISH_MATH_SSE_H
|
|
|
+-
|
|
|
++#if SQUISH_USE_NEON && __ARM_NEON
|
|
|
++#include <sse2neon.h>
|
|
|
++#else
|
|
|
+ #include <xmmintrin.h>
|
|
|
+ #if ( SQUISH_USE_SSE > 1 )
|
|
|
+ #include <emmintrin.h>
|
|
|
+ #endif
|
|
|
++#endif
|
|
|
++
|
|
|
+ #if ( SQUISH_USE_SSE >= 3 )
|
|
|
+ #include <pmmintrin.h>
|
|
|
+ #endif
|
|
|
diff --git a/paletteclusterfit.cpp b/paletteclusterfit.cpp
|
|
|
index 2d6f5a1..b98e975 100644
|
|
|
--- a/paletteclusterfit.cpp
|
|
@@ -181,8 +202,21 @@ index bee740c..8c7aea0 100644
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
+diff --git a/simd_float.h b/simd_float.h
|
|
|
+index f401b68..668bb61 100644
|
|
|
+--- a/simd_float.h
|
|
|
++++ b/simd_float.h
|
|
|
+@@ -833,7 +833,7 @@ public:
|
|
|
+ template<const int n, const int p>
|
|
|
+ friend void ConcBits(Col4::Arg left, Col4 &right )
|
|
|
+ {
|
|
|
+- right = ShiftLeft<32>( right );
|
|
|
++ right = ShiftLeft<64>( right );
|
|
|
+ if (n > 0)
|
|
|
+ right += ExtrBits<n, p>( left );
|
|
|
+ }
|
|
|
diff --git a/simd_sse.h b/simd_sse.h
|
|
|
-index f959e20..1a2f6b8 100644
|
|
|
+index f959e20..542fbba 100644
|
|
|
--- a/simd_sse.h
|
|
|
+++ b/simd_sse.h
|
|
|
@@ -1,7 +1,7 @@
|
|
@@ -194,15 +228,26 @@ index f959e20..1a2f6b8 100644
|
|
|
|
|
|
Permission is hereby granted, free of charge, to any person obtaining
|
|
|
a copy of this software and associated documentation files (the
|
|
|
-@@ -33,6 +33,7 @@
|
|
|
+@@ -27,12 +27,18 @@
|
|
|
+ #ifndef SQUISH_SIMD_SSE_H
|
|
|
+ #define SQUISH_SIMD_SSE_H
|
|
|
+
|
|
|
++#if SQUISH_USE_NEON && __ARM_NEON
|
|
|
++#include <sse2neon.h>
|
|
|
++#else
|
|
|
+ #include <xmmintrin.h>
|
|
|
+ #if ( SQUISH_USE_SSE > 1 )
|
|
|
+ #include <emmintrin.h>
|
|
|
#endif
|
|
|
++#endif
|
|
|
++
|
|
|
#if ( SQUISH_USE_SSE >= 3 )
|
|
|
#include <pmmintrin.h>
|
|
|
+#include <smmintrin.h>
|
|
|
#endif
|
|
|
#if ( SQUISH_USE_SSE >= 4 )
|
|
|
#include <smmintrin.h>
|
|
|
-@@ -69,6 +70,12 @@
|
|
|
+@@ -69,6 +75,12 @@
|
|
|
|
|
|
namespace squish {
|
|
|
|
|
@@ -215,7 +260,7 @@ index f959e20..1a2f6b8 100644
|
|
|
#define COL4_CONST( X ) Col4( X )
|
|
|
|
|
|
|
|
|
-@@ -263,7 +270,7 @@ public:
|
|
|
+@@ -263,7 +275,7 @@ public:
|
|
|
Col3& operator/=( short v )
|
|
|
{
|
|
|
__m128
|
|
@@ -224,7 +269,7 @@ index f959e20..1a2f6b8 100644
|
|
|
fp = _mm_cvtepi32_ps(m_v);
|
|
|
fp = _mm_div_ps(fp, _mm_set1_ps(v));
|
|
|
m_v = _mm_cvttps_epi32(fp);
|
|
|
-@@ -351,64 +358,18 @@ public:
|
|
|
+@@ -351,64 +363,18 @@ public:
|
|
|
|
|
|
template<const int n>
|
|
|
friend Col3 ShiftLeft( Arg a );
|
|
@@ -292,7 +337,7 @@ index f959e20..1a2f6b8 100644
|
|
|
|
|
|
template<const int r, const int g, const int b>
|
|
|
friend Col3 ShiftLeftLo( Arg v )
|
|
|
-@@ -422,140 +383,24 @@ public:
|
|
|
+@@ -422,140 +388,24 @@ public:
|
|
|
|
|
|
template<const int n, const int p>
|
|
|
friend Col3 MaskBits( Arg a );
|
|
@@ -436,7 +481,7 @@ index f959e20..1a2f6b8 100644
|
|
|
|
|
|
friend Col3 Mul16x16u( Arg a, Arg b )
|
|
|
{
|
|
|
-@@ -652,18 +497,7 @@ public:
|
|
|
+@@ -652,18 +502,7 @@ public:
|
|
|
template<const int f, const int t>
|
|
|
friend Col3 Exchange( Arg a );
|
|
|
template<const int f, const int t>
|
|
@@ -456,7 +501,7 @@ index f959e20..1a2f6b8 100644
|
|
|
|
|
|
friend Col3 HorizontalAdd( Arg a )
|
|
|
{
|
|
|
-@@ -751,7 +585,7 @@ public:
|
|
|
+@@ -751,7 +590,7 @@ public:
|
|
|
return HorizontalAdd( a, b );
|
|
|
#endif
|
|
|
}
|
|
@@ -465,7 +510,7 @@ index f959e20..1a2f6b8 100644
|
|
|
friend Col3 HorizontalMaxTiny( Arg a )
|
|
|
{
|
|
|
#if ( SQUISH_USE_SSE >= 4 ) && 0
|
|
|
-@@ -867,7 +701,7 @@ public:
|
|
|
+@@ -867,7 +706,7 @@ public:
|
|
|
|
|
|
return Col3( _mm_castps_si128 ( resc ) );
|
|
|
}
|
|
@@ -474,7 +519,7 @@ index f959e20..1a2f6b8 100644
|
|
|
friend bool CompareFirstLessThan( Arg left, Arg right )
|
|
|
{
|
|
|
__m128i bits = _mm_cmplt_epi32( left.m_v, right.m_v );
|
|
|
-@@ -937,7 +771,7 @@ public:
|
|
|
+@@ -937,7 +776,7 @@ public:
|
|
|
|
|
|
loc = _mm_cvtsi128_si32( r );
|
|
|
}
|
|
@@ -483,7 +528,7 @@ index f959e20..1a2f6b8 100644
|
|
|
friend void PackBytes( Arg a, int &loc )
|
|
|
{
|
|
|
__m128i
|
|
|
-@@ -947,7 +781,7 @@ public:
|
|
|
+@@ -947,7 +786,7 @@ public:
|
|
|
|
|
|
loc = _mm_cvtsi128_si32( r );
|
|
|
}
|
|
@@ -492,7 +537,7 @@ index f959e20..1a2f6b8 100644
|
|
|
friend void PackWords( Arg a, unsigned__int64 &loc )
|
|
|
{
|
|
|
__m128i
|
|
|
-@@ -964,17 +798,17 @@ public:
|
|
|
+@@ -964,17 +803,17 @@ public:
|
|
|
// loc = _mm_cvtsi128_si64( r );
|
|
|
_mm_storel_epi64( (__m128i *)&loc, r );
|
|
|
}
|
|
@@ -513,7 +558,7 @@ index f959e20..1a2f6b8 100644
|
|
|
// clamp the output to [0, 1]
|
|
|
Col3 Clamp() const {
|
|
|
Col3 const one (0xFF);
|
|
|
-@@ -1020,17 +854,17 @@ public:
|
|
|
+@@ -1020,17 +859,17 @@ public:
|
|
|
{
|
|
|
_mm_store_si128( (__m128i *)destination, _mm_unpacklo_epi64( a.m_v, b.m_v ) );
|
|
|
}
|
|
@@ -534,7 +579,7 @@ index f959e20..1a2f6b8 100644
|
|
|
friend void StoreUnaligned( Arg a, u8* loc ) {
|
|
|
PackBytes( a, (unsigned int&) (*((unsigned int *)loc)) ); }
|
|
|
friend void StoreUnaligned( Arg a, u16* loc ) {
|
|
|
-@@ -1043,10 +877,202 @@ public:
|
|
|
+@@ -1043,10 +882,202 @@ public:
|
|
|
private:
|
|
|
__m128i m_v;
|
|
|
|
|
@@ -739,7 +784,7 @@ index f959e20..1a2f6b8 100644
|
|
|
class Col4
|
|
|
{
|
|
|
public:
|
|
|
-@@ -1305,317 +1331,56 @@ public:
|
|
|
+@@ -1305,317 +1336,56 @@ public:
|
|
|
|
|
|
template<const int n>
|
|
|
friend Col4 FillSign( Arg a );
|
|
@@ -1066,7 +1111,7 @@ index f959e20..1a2f6b8 100644
|
|
|
|
|
|
friend Col4 RevsBits( Col4::Arg v )
|
|
|
{
|
|
|
-@@ -1679,19 +1444,7 @@ public:
|
|
|
+@@ -1679,19 +1449,7 @@ public:
|
|
|
|
|
|
template<const int f, const int t>
|
|
|
friend Col4 Shuffle( Arg a );
|
|
@@ -1086,7 +1131,7 @@ index f959e20..1a2f6b8 100644
|
|
|
|
|
|
template<const int f, const int t>
|
|
|
friend Col4 Exchange( Arg a );
|
|
|
-@@ -1888,7 +1641,7 @@ public:
|
|
|
+@@ -1888,7 +1646,7 @@ public:
|
|
|
return Col4( _mm_max_epi16( left.m_v, right.m_v ) );
|
|
|
#endif
|
|
|
}
|
|
@@ -1095,7 +1140,7 @@ index f959e20..1a2f6b8 100644
|
|
|
friend Col4 MaxTiny( Arg left, Arg right )
|
|
|
{
|
|
|
__m128 resa = _mm_castsi128_ps( left.m_v );
|
|
|
-@@ -1973,7 +1726,7 @@ public:
|
|
|
+@@ -1973,7 +1731,7 @@ public:
|
|
|
{
|
|
|
return Col4( _mm_cmplt_epi8( left.m_v, right.m_v ) );
|
|
|
}
|
|
@@ -1104,7 +1149,7 @@ index f959e20..1a2f6b8 100644
|
|
|
friend Col4 CompareAllEqualTo_M8( Arg left, Arg right )
|
|
|
{
|
|
|
return Col4( _mm_cmpeq_epi8( left.m_v, right.m_v ) );
|
|
|
-@@ -1996,11 +1749,6 @@ public:
|
|
|
+@@ -1996,11 +1754,6 @@ public:
|
|
|
|
|
|
template<const int value>
|
|
|
friend Col4 IsValue( Arg v );
|
|
@@ -1116,7 +1161,7 @@ index f959e20..1a2f6b8 100644
|
|
|
|
|
|
friend Col4 TransferA( Arg left, Arg right )
|
|
|
{
|
|
|
-@@ -2014,7 +1762,7 @@ public:
|
|
|
+@@ -2014,7 +1767,7 @@ public:
|
|
|
{
|
|
|
return Col4( _mm_or_si128( left.m_v, _mm_setr_epi32( 0x00, 0x00, 0x00, 0xFF ) ) );
|
|
|
}
|
|
@@ -1125,7 +1170,7 @@ index f959e20..1a2f6b8 100644
|
|
|
friend Col4 CollapseA( Arg r, Arg g, Arg b, Arg a )
|
|
|
{
|
|
|
return Col4( _mm_packus_epi16(
|
|
|
-@@ -2032,7 +1780,7 @@ public:
|
|
|
+@@ -2032,7 +1785,7 @@ public:
|
|
|
|
|
|
loc = _mm_cvtsi128_si32 ( r );
|
|
|
}
|
|
@@ -1134,7 +1179,7 @@ index f959e20..1a2f6b8 100644
|
|
|
friend void PackBytes( Arg a, int &loc )
|
|
|
{
|
|
|
__m128i
|
|
|
-@@ -2042,7 +1790,7 @@ public:
|
|
|
+@@ -2042,7 +1795,7 @@ public:
|
|
|
|
|
|
loc = _mm_cvtsi128_si32 ( r );
|
|
|
}
|
|
@@ -1143,7 +1188,7 @@ index f959e20..1a2f6b8 100644
|
|
|
friend void PackWords( Arg a, unsigned__int64 &loc )
|
|
|
{
|
|
|
__m128i
|
|
|
-@@ -2059,11 +1807,11 @@ public:
|
|
|
+@@ -2059,11 +1812,11 @@ public:
|
|
|
// loc = _mm_cvtsi128_si64( r );
|
|
|
_mm_storel_epi64( (__m128i *)&loc, r );
|
|
|
}
|
|
@@ -1157,7 +1202,7 @@ index f959e20..1a2f6b8 100644
|
|
|
r = _mm_packs_epi32( a.m_v, a.m_v );
|
|
|
|
|
|
// loc = _mm_cvtsi128_si64( r );
|
|
|
-@@ -2100,18 +1848,9 @@ public:
|
|
|
+@@ -2100,18 +1853,9 @@ public:
|
|
|
|
|
|
a = Col4( r );
|
|
|
}
|
|
@@ -1178,7 +1223,7 @@ index f959e20..1a2f6b8 100644
|
|
|
friend void UnpackWords( Col4 &a, const unsigned__int64 &loc )
|
|
|
{
|
|
|
__m128i
|
|
|
-@@ -2121,110 +1860,447 @@ public:
|
|
|
+@@ -2121,110 +1865,449 @@ public:
|
|
|
|
|
|
a = Col4( r );
|
|
|
}
|
|
@@ -1332,14 +1377,16 @@ index f959e20..1a2f6b8 100644
|
|
|
+template<const int n>
|
|
|
+Col4 ShiftLeft( Col4::Arg a )
|
|
|
+{
|
|
|
++ constexpr int n_shift_3 = (n) >> 3;
|
|
|
++
|
|
|
+ if ((n) <= 0)
|
|
|
+ return Col4( a.m_v );
|
|
|
+ if ((n) <= 7)
|
|
|
+ return Col4( _mm_slli_epi32( a.m_v, (n) & 7 ) );
|
|
|
+ if ((n) & 7)
|
|
|
-+ return Col4( _mm_slli_epi32( _mm_slli_si128( a.m_v, (n) >> 3 ), (n) & 7 ) );
|
|
|
++ return Col4( _mm_slli_epi32( _mm_slli_si128( a.m_v, n_shift_3 ), (n) & 7 ) );
|
|
|
+
|
|
|
-+ return Col4( _mm_slli_si128( a.m_v, (n) >> 3 ) );
|
|
|
++ return Col4( _mm_slli_si128( a.m_v, n_shift_3 ) );
|
|
|
+}
|
|
|
+
|
|
|
+template<const int n, const int p>
|
|
@@ -1714,7 +1761,7 @@ index f959e20..1a2f6b8 100644
|
|
|
|
|
|
#if !defined(SQUISH_USE_PRE)
|
|
|
inline Col3 LengthSquared( Col3::Arg v )
|
|
|
-@@ -2291,30 +2367,30 @@ public:
|
|
|
+@@ -2291,30 +2374,30 @@ public:
|
|
|
{
|
|
|
return _mm_extract_epi16( m_v, 0 );
|
|
|
}
|
|
@@ -1750,7 +1797,7 @@ index f959e20..1a2f6b8 100644
|
|
|
const u16 &operator[]( int pos ) const
|
|
|
{
|
|
|
return ((u16 *)&m_v)[pos];
|
|
|
-@@ -2331,7 +2407,7 @@ public:
|
|
|
+@@ -2331,7 +2414,7 @@ public:
|
|
|
{
|
|
|
return Col8( _mm_srli_epi16( left.m_v, right ) );
|
|
|
}
|
|
@@ -1759,7 +1806,7 @@ index f959e20..1a2f6b8 100644
|
|
|
friend Col8 operator>>( Arg left, int right )
|
|
|
{
|
|
|
return Col8( _mm_srai_epi16( left.m_v, right ) );
|
|
|
-@@ -2341,7 +2417,7 @@ public:
|
|
|
+@@ -2341,7 +2424,7 @@ public:
|
|
|
{
|
|
|
return Col8( _mm_slli_epi16( left.m_v, right ) );
|
|
|
}
|
|
@@ -1768,7 +1815,7 @@ index f959e20..1a2f6b8 100644
|
|
|
friend Col8 operator<<( Arg left, int right )
|
|
|
{
|
|
|
return Col8( _mm_slli_epi16( left.m_v, right ) );
|
|
|
-@@ -2366,7 +2442,7 @@ public:
|
|
|
+@@ -2366,7 +2449,7 @@ public:
|
|
|
{
|
|
|
return Col8( _mm_mulhi_epu16( left.m_v, _mm_set1_epi16( (unsigned short)right ) ) );
|
|
|
}
|
|
@@ -1777,7 +1824,7 @@ index f959e20..1a2f6b8 100644
|
|
|
friend Col8 operator*( Arg left, int right )
|
|
|
{
|
|
|
return Col8( _mm_mulhi_epi16( left.m_v, _mm_set1_epi16( (short)right ) ) );
|
|
|
-@@ -2374,12 +2450,7 @@ public:
|
|
|
+@@ -2374,12 +2457,7 @@ public:
|
|
|
|
|
|
template<const int n>
|
|
|
friend Col8 ExtendSign(Arg a);
|
|
@@ -1791,7 +1838,7 @@ index f959e20..1a2f6b8 100644
|
|
|
friend Col8 HorizontalMin(Arg a)
|
|
|
{
|
|
|
__m128i res = a.m_v;
|
|
|
-@@ -2420,17 +2491,13 @@ public:
|
|
|
+@@ -2420,17 +2498,13 @@ public:
|
|
|
|
|
|
template<const int n>
|
|
|
friend Col8 ShiftUp(Arg a);
|
|
@@ -1812,7 +1859,7 @@ index f959e20..1a2f6b8 100644
|
|
|
res = _mm_unpackhi_epi16( res, _mm_setzero_si128() );
|
|
|
|
|
|
#ifdef _MSV_VER
|
|
|
-@@ -2445,7 +2512,7 @@ public:
|
|
|
+@@ -2445,7 +2519,7 @@ public:
|
|
|
|
|
|
friend Col4 RepeatUpper(Arg a, const unsigned dummy) {
|
|
|
__m128i res = a.m_v;
|
|
@@ -1821,7 +1868,7 @@ index f959e20..1a2f6b8 100644
|
|
|
res = _mm_unpackhi_epi16( res, _mm_setzero_si128() );
|
|
|
res = _mm_shuffle_epi32( res, SQUISH_SSE_SPLAT(3) );
|
|
|
|
|
|
-@@ -2458,10 +2525,10 @@ public:
|
|
|
+@@ -2458,10 +2532,10 @@ public:
|
|
|
|
|
|
return Col4( res );
|
|
|
}
|
|
@@ -1834,7 +1881,7 @@ index f959e20..1a2f6b8 100644
|
|
|
res = _mm_unpackhi_epi16( a.m_v, b.m_v );
|
|
|
res = _mm_unpackhi_epi16( res, _mm_setzero_si128() );
|
|
|
res = _mm_unpackhi_epi64( res, res );
|
|
|
-@@ -2478,7 +2545,7 @@ public:
|
|
|
+@@ -2478,7 +2552,7 @@ public:
|
|
|
|
|
|
friend Col4 ReplicateUpper(Arg a, Arg b, const unsigned dummy) {
|
|
|
__m128i res;
|
|
@@ -1843,7 +1890,7 @@ index f959e20..1a2f6b8 100644
|
|
|
res = _mm_unpackhi_epi16( a.m_v, b.m_v );
|
|
|
res = _mm_unpackhi_epi16( res, _mm_setzero_si128() );
|
|
|
res = _mm_unpackhi_epi32( res, res );
|
|
|
-@@ -2495,7 +2562,7 @@ public:
|
|
|
+@@ -2495,7 +2569,7 @@ public:
|
|
|
|
|
|
friend Col4 ExpandUpper(Arg a, const signed dummy) {
|
|
|
__m128i res = a.m_v;
|
|
@@ -1852,7 +1899,7 @@ index f959e20..1a2f6b8 100644
|
|
|
res = _mm_unpackhi_epi16( res, res );
|
|
|
res = _mm_srai_epi32( res, 16 );
|
|
|
|
|
|
-@@ -2524,10 +2591,10 @@ public:
|
|
|
+@@ -2524,10 +2598,10 @@ public:
|
|
|
|
|
|
return Col4( res );
|
|
|
}
|
|
@@ -1865,7 +1912,7 @@ index f959e20..1a2f6b8 100644
|
|
|
res = _mm_unpackhi_epi32( a.m_v, b.m_v );
|
|
|
res = _mm_srai_epi32( res, 16 );
|
|
|
res = _mm_unpackhi_epi64( res, res );
|
|
|
-@@ -2544,11 +2611,11 @@ public:
|
|
|
+@@ -2544,11 +2618,11 @@ public:
|
|
|
|
|
|
friend Col4 ReplicateUpper(Arg a, Arg b, const signed dummy) {
|
|
|
__m128i res;
|
|
@@ -1879,7 +1926,7 @@ index f959e20..1a2f6b8 100644
|
|
|
#ifdef _MSV_VER
|
|
|
assert(res.m128i_i32[0] == a.m_v.m128i_i16[7]);
|
|
|
assert(res.m128i_i32[1] == a.m_v.m128i_i16[7]);
|
|
|
-@@ -2559,7 +2626,7 @@ public:
|
|
|
+@@ -2559,7 +2633,7 @@ public:
|
|
|
return Col4( res );
|
|
|
}
|
|
|
#pragma warning ( pop )
|
|
@@ -1888,7 +1935,7 @@ index f959e20..1a2f6b8 100644
|
|
|
/*
|
|
|
friend Col4 Expand(Arg a, int ia) {
|
|
|
__m128i res = _mm_setzero_si128();
|
|
|
-@@ -2601,17 +2668,17 @@ public:
|
|
|
+@@ -2601,17 +2675,17 @@ public:
|
|
|
return Col4( res );
|
|
|
}
|
|
|
*/
|
|
@@ -1909,7 +1956,7 @@ index f959e20..1a2f6b8 100644
|
|
|
friend Col8 CompareAllLessThan( Arg left, Arg right )
|
|
|
{
|
|
|
return Col8( _mm_cmplt_epi16( left.m_v, right.m_v ) );
|
|
|
-@@ -2620,9 +2687,21 @@ public:
|
|
|
+@@ -2620,9 +2694,22 @@ public:
|
|
|
private:
|
|
|
__m128i m_v;
|
|
|
|
|
@@ -1926,13 +1973,14 @@ index f959e20..1a2f6b8 100644
|
|
|
+template<const int n>
|
|
|
+Col8 ShiftUp(Col8::Arg a)
|
|
|
+{
|
|
|
-+ return Col8( _mm_slli_si128( a.m_v, n << 1 ) );
|
|
|
++ constexpr int n_shift_left_1 = n << 1;
|
|
|
++ return Col8( _mm_slli_si128( a.m_v, n_shift_left_1 ) );
|
|
|
+}
|
|
|
+
|
|
|
#define VEC4_CONST( X ) Vec4( X )
|
|
|
|
|
|
class Vec3
|
|
|
-@@ -2649,7 +2728,7 @@ public:
|
|
|
+@@ -2649,7 +2736,7 @@ public:
|
|
|
m_v = _mm_unpacklo_ps(_mm_load_ss(x), _mm_load_ss(y));
|
|
|
m_v = _mm_movelh_ps(m_v, _mm_load_ss(z));
|
|
|
}
|
|
@@ -1941,7 +1989,7 @@ index f959e20..1a2f6b8 100644
|
|
|
Vec3( bool x, bool y, bool z ) : m_v( _mm_castsi128_ps( _mm_setr_epi32( x ? ~0 : 0, y ? ~0 : 0, z ? ~0 : 0, 0 ) ) ) {}
|
|
|
|
|
|
Vec3( float x, float y, float z ) : m_v( _mm_setr_ps( x, y, z, 0.0f ) ) {}
|
|
|
-@@ -2662,7 +2741,7 @@ public:
|
|
|
+@@ -2662,7 +2749,7 @@ public:
|
|
|
void StoreX(float *x) const { _mm_store_ss(x, m_v); }
|
|
|
void StoreY(float *y) const { _mm_store_ss(y, _mm_shuffle_ps( m_v, m_v, SQUISH_SSE_SPLAT( 1 ) )); }
|
|
|
void StoreZ(float *z) const { _mm_store_ss(z, _mm_movehl_ps( m_v, m_v ) ); }
|
|
@@ -1950,7 +1998,7 @@ index f959e20..1a2f6b8 100644
|
|
|
float X() const { return ((float *)&m_v)[0]; }
|
|
|
float Y() const { return ((float *)&m_v)[1]; }
|
|
|
float Z() const { return ((float *)&m_v)[2]; }
|
|
|
-@@ -2729,7 +2808,7 @@ public:
|
|
|
+@@ -2729,7 +2816,7 @@ public:
|
|
|
m_v = _mm_mul_ps( m_v, v.m_v );
|
|
|
return *this;
|
|
|
}
|
|
@@ -1959,7 +2007,7 @@ index f959e20..1a2f6b8 100644
|
|
|
Vec3& operator/=( Arg v )
|
|
|
{
|
|
|
*this *= Reciprocal( v );
|
|
|
-@@ -2863,16 +2942,7 @@ public:
|
|
|
+@@ -2863,16 +2950,7 @@ public:
|
|
|
|
|
|
template<const int n>
|
|
|
friend Vec3 RotateLeft( Arg a );
|
|
@@ -1977,7 +2025,7 @@ index f959e20..1a2f6b8 100644
|
|
|
|
|
|
friend Vec3 HorizontalAdd( Arg a )
|
|
|
{
|
|
|
-@@ -2974,7 +3044,7 @@ public:
|
|
|
+@@ -2974,7 +3052,7 @@ public:
|
|
|
|
|
|
return Vec3( res );
|
|
|
}
|
|
@@ -1986,7 +2034,7 @@ index f959e20..1a2f6b8 100644
|
|
|
friend Vec3 HorizontalMaxXY( Arg a )
|
|
|
{
|
|
|
__m128 res = a.m_v;
|
|
|
-@@ -2986,7 +3056,7 @@ public:
|
|
|
+@@ -2986,7 +3064,7 @@ public:
|
|
|
|
|
|
return Vec3( res );
|
|
|
}
|
|
@@ -1995,7 +2043,7 @@ index f959e20..1a2f6b8 100644
|
|
|
friend Vec3 HorizontalMinXY( Arg a )
|
|
|
{
|
|
|
__m128 res = a.m_v;
|
|
|
-@@ -3063,37 +3133,6 @@ public:
|
|
|
+@@ -3063,37 +3141,6 @@ public:
|
|
|
|
|
|
template<const bool disarm>
|
|
|
friend Vec3 Complement( Arg left );
|
|
@@ -2033,7 +2081,7 @@ index f959e20..1a2f6b8 100644
|
|
|
|
|
|
template<const bool disarm>
|
|
|
friend Vec3 Complement( Vec3 &left, Vec3 &right );
|
|
|
-@@ -3104,20 +3143,20 @@ public:
|
|
|
+@@ -3104,20 +3151,20 @@ public:
|
|
|
Vec3 len = (left * left) + (right * right);
|
|
|
Vec3 adj = ReciprocalSqrt(Max(Vec3(1.0f), len));
|
|
|
|
|
@@ -2058,7 +2106,7 @@ index f959e20..1a2f6b8 100644
|
|
|
return Sqrt(Vec3(1.0f) - len);
|
|
|
}
|
|
|
}
|
|
|
-@@ -3168,7 +3207,7 @@ public:
|
|
|
+@@ -3168,7 +3215,7 @@ public:
|
|
|
{
|
|
|
return Vec3( _mm_and_ps( a.m_v, _mm_castsi128_ps( _mm_set1_epi32( 0x7FFFFFFF ) ) ) );
|
|
|
}
|
|
@@ -2067,7 +2115,7 @@ index f959e20..1a2f6b8 100644
|
|
|
friend Vec3 Neg( Arg a )
|
|
|
{
|
|
|
return Vec3( _mm_or_ps( a.m_v, _mm_castsi128_ps( _mm_set1_epi32( 0x80000000 ) ) ) );
|
|
|
-@@ -3192,21 +3231,9 @@ public:
|
|
|
+@@ -3192,21 +3239,9 @@ public:
|
|
|
return Min(one, Max(zero, *this));
|
|
|
}
|
|
|
|
|
@@ -2092,7 +2140,7 @@ index f959e20..1a2f6b8 100644
|
|
|
|
|
|
friend Vec3 Truncate( Arg v )
|
|
|
{
|
|
|
-@@ -3296,7 +3323,7 @@ public:
|
|
|
+@@ -3296,7 +3331,7 @@ public:
|
|
|
{
|
|
|
return Vec3( _mm_cmpneq_ps( m_v, _mm_set1_ps( 1.0f ) ) );
|
|
|
}
|
|
@@ -2101,7 +2149,7 @@ index f959e20..1a2f6b8 100644
|
|
|
friend Vec3 TransferZ( Arg left, Arg right )
|
|
|
{
|
|
|
return Vec3( _mm_shuffle_ps( left.m_v, right.m_v, SQUISH_SSE_SHUF( 0, 1, 2, 3 ) ) );
|
|
|
-@@ -3351,9 +3378,70 @@ public:
|
|
|
+@@ -3351,9 +3386,70 @@ public:
|
|
|
private:
|
|
|
__m128 m_v;
|
|
|
|
|
@@ -2173,7 +2221,7 @@ index f959e20..1a2f6b8 100644
|
|
|
template<const bool round>
|
|
|
Col3 FloatToUHalf( Vec3::Arg v );
|
|
|
template<const bool round>
|
|
|
-@@ -3382,7 +3470,7 @@ Col3 FloatToSHalf( Vec3::Arg v )
|
|
|
+@@ -3382,7 +3478,7 @@ Col3 FloatToSHalf( Vec3::Arg v )
|
|
|
return h;
|
|
|
}
|
|
|
|
|
@@ -2182,7 +2230,7 @@ index f959e20..1a2f6b8 100644
|
|
|
{
|
|
|
Vec3 f;
|
|
|
|
|
|
-@@ -3393,7 +3481,7 @@ Vec3 UHalfToFloat( Col3::Arg v )
|
|
|
+@@ -3393,7 +3489,7 @@ Vec3 UHalfToFloat( Col3::Arg v )
|
|
|
return f;
|
|
|
}
|
|
|
|
|
@@ -2191,7 +2239,7 @@ index f959e20..1a2f6b8 100644
|
|
|
{
|
|
|
Vec3 f;
|
|
|
|
|
|
-@@ -3427,7 +3515,7 @@ public:
|
|
|
+@@ -3427,7 +3523,7 @@ public:
|
|
|
m_v = arg.m_v;
|
|
|
return *this;
|
|
|
}
|
|
@@ -2200,7 +2248,7 @@ index f959e20..1a2f6b8 100644
|
|
|
operator Vec3()
|
|
|
{
|
|
|
return Vec3(m_v);
|
|
|
-@@ -3458,21 +3546,21 @@ public:
|
|
|
+@@ -3458,21 +3554,21 @@ public:
|
|
|
m_v = _mm_load_ss(x);
|
|
|
m_v = _mm_shuffle_ps( m_v, m_v, SQUISH_SSE_SPLAT( 0 ) );
|
|
|
}
|
|
@@ -2225,7 +2273,7 @@ index f959e20..1a2f6b8 100644
|
|
|
Vec4( bool x, bool y, bool z, bool w ) : m_v( _mm_castsi128_ps( _mm_setr_epi32( x ? ~0 : 0, y ? ~0 : 0, z ? ~0 : 0, w ? ~0 : 0 ) ) ) {}
|
|
|
|
|
|
Vec4( int x, int y, int z, int w ) : m_v( _mm_cvtepi32_ps( _mm_setr_epi32( x, y, z, w ) ) ) {}
|
|
|
-@@ -3498,23 +3586,17 @@ public:
|
|
|
+@@ -3498,23 +3594,17 @@ public:
|
|
|
{
|
|
|
return Vec3( m_v );
|
|
|
}
|
|
@@ -2252,7 +2300,7 @@ index f959e20..1a2f6b8 100644
|
|
|
|
|
|
void StoreX(float *x) const { _mm_store_ss(x, m_v); }
|
|
|
void StoreY(float *y) const { _mm_store_ss(y, _mm_shuffle_ps( m_v, m_v, SQUISH_SSE_SPLAT( 1 ) )); }
|
|
|
-@@ -3619,7 +3701,7 @@ public:
|
|
|
+@@ -3619,7 +3709,7 @@ public:
|
|
|
m_v = _mm_mul_ps( m_v, v.m_v );
|
|
|
return *this;
|
|
|
}
|
|
@@ -2261,7 +2309,7 @@ index f959e20..1a2f6b8 100644
|
|
|
Vec4& operator*=( float v )
|
|
|
{
|
|
|
m_v = _mm_mul_ps( m_v, Vec4( v ).m_v );
|
|
|
-@@ -3631,7 +3713,7 @@ public:
|
|
|
+@@ -3631,7 +3721,7 @@ public:
|
|
|
*this *= Reciprocal( v );
|
|
|
return *this;
|
|
|
}
|
|
@@ -2270,7 +2318,7 @@ index f959e20..1a2f6b8 100644
|
|
|
Vec4& operator/=( float v )
|
|
|
{
|
|
|
*this *= Reciprocal( Vec4( v ) );
|
|
|
-@@ -3732,16 +3814,7 @@ public:
|
|
|
+@@ -3732,16 +3822,7 @@ public:
|
|
|
|
|
|
template<const int a, const int b, const int c, const int d>
|
|
|
friend Vec4 Merge( Arg lo, Arg hi );
|
|
@@ -2288,7 +2336,7 @@ index f959e20..1a2f6b8 100644
|
|
|
|
|
|
template<const int f, const int t>
|
|
|
friend Vec4 Shuffle( Arg a );
|
|
|
-@@ -3900,7 +3973,7 @@ public:
|
|
|
+@@ -3900,7 +3981,7 @@ public:
|
|
|
|
|
|
return Vec4( res );
|
|
|
}
|
|
@@ -2297,7 +2345,7 @@ index f959e20..1a2f6b8 100644
|
|
|
friend Vec4 HorizontalMaxXY( Arg a )
|
|
|
{
|
|
|
__m128 res = a.m_v;
|
|
|
-@@ -3912,7 +3985,7 @@ public:
|
|
|
+@@ -3912,7 +3993,7 @@ public:
|
|
|
|
|
|
return Vec4( res );
|
|
|
}
|
|
@@ -2306,7 +2354,7 @@ index f959e20..1a2f6b8 100644
|
|
|
friend Vec4 HorizontalMinXY( Arg a )
|
|
|
{
|
|
|
__m128 res = a.m_v;
|
|
|
-@@ -3965,7 +4038,7 @@ public:
|
|
|
+@@ -3965,7 +4046,7 @@ public:
|
|
|
|
|
|
return rsq;
|
|
|
}
|
|
@@ -2315,7 +2363,7 @@ index f959e20..1a2f6b8 100644
|
|
|
friend Vec4 Normalize( Arg left )
|
|
|
{
|
|
|
Vec4 sum = HorizontalAdd( Vec4( _mm_mul_ps( left.m_v, left.m_v ) ) );
|
|
|
-@@ -3973,7 +4046,7 @@ public:
|
|
|
+@@ -3973,7 +4054,7 @@ public:
|
|
|
|
|
|
return left * rsq;
|
|
|
}
|
|
@@ -2324,7 +2372,7 @@ index f959e20..1a2f6b8 100644
|
|
|
friend Vec4 Normalize( Vec4& x, Vec4& y, Vec4& z )
|
|
|
{
|
|
|
Vec4 xx = x * x;
|
|
|
-@@ -4006,7 +4079,7 @@ public:
|
|
|
+@@ -4006,7 +4087,7 @@ public:
|
|
|
res = _mm_add_ps( res, _mm_shuffle_ps( res, res, SQUISH_SSE_SHUF( 1, 0, 1, 0 ) ) );
|
|
|
#endif
|
|
|
if (!disarm) {
|
|
@@ -2333,7 +2381,7 @@ index f959e20..1a2f6b8 100644
|
|
|
if ( _mm_comigt_ss( res, rez ) ) {
|
|
|
res = ReciprocalSqrt( Vec4(res) ).m_v;
|
|
|
res = _mm_shuffle_ps( res, res, SQUISH_SSE_SHUF( 0, 0, 0, 0 ) );
|
|
|
-@@ -4028,7 +4101,7 @@ public:
|
|
|
+@@ -4028,7 +4109,7 @@ public:
|
|
|
res = _mm_and_ps( res, _mm_castsi128_ps ( _mm_setr_epi32( ~0, ~0, ~0, 0 ) ) );
|
|
|
}
|
|
|
|
|
@@ -2342,7 +2390,7 @@ index f959e20..1a2f6b8 100644
|
|
|
return Vec4( res );
|
|
|
}
|
|
|
|
|
|
-@@ -4041,20 +4114,20 @@ public:
|
|
|
+@@ -4041,20 +4122,20 @@ public:
|
|
|
Vec4 len = left * left + right * right;
|
|
|
Vec4 adj = ReciprocalSqrt(Max(Vec4(1.0f), len));
|
|
|
|
|
@@ -2367,7 +2415,7 @@ index f959e20..1a2f6b8 100644
|
|
|
return Sqrt(Vec4(1.0f) - len);
|
|
|
}
|
|
|
}
|
|
|
-@@ -4105,7 +4178,7 @@ public:
|
|
|
+@@ -4105,7 +4186,7 @@ public:
|
|
|
{
|
|
|
return Vec4( _mm_and_ps( a.m_v, _mm_castsi128_ps( _mm_set1_epi32( 0x7FFFFFFF ) ) ) );
|
|
|
}
|
|
@@ -2376,7 +2424,7 @@ index f959e20..1a2f6b8 100644
|
|
|
friend Vec4 Neg( Arg a )
|
|
|
{
|
|
|
return Vec4( _mm_or_ps( a.m_v, _mm_castsi128_ps( _mm_set1_epi32( 0x80000000 ) ) ) );
|
|
|
-@@ -4131,19 +4204,7 @@ public:
|
|
|
+@@ -4131,19 +4212,7 @@ public:
|
|
|
|
|
|
template<const bool round>
|
|
|
friend Col4 FloatToInt( Vec4::Arg v );
|
|
@@ -2397,7 +2445,7 @@ index f959e20..1a2f6b8 100644
|
|
|
|
|
|
friend Vec4 Truncate( Arg v )
|
|
|
{
|
|
|
-@@ -4159,7 +4220,7 @@ public:
|
|
|
+@@ -4159,7 +4228,7 @@ public:
|
|
|
|
|
|
// clear out the MMX multimedia state to allow FP calls later
|
|
|
_mm_empty();
|
|
@@ -2406,7 +2454,7 @@ index f959e20..1a2f6b8 100644
|
|
|
return Vec4( truncated );
|
|
|
#else
|
|
|
// use SSE2 instructions
|
|
|
-@@ -4188,7 +4249,7 @@ public:
|
|
|
+@@ -4188,7 +4257,7 @@ public:
|
|
|
{
|
|
|
return _mm_movemask_ps( _mm_cmpeq_ps( left.m_v, right.m_v ) );
|
|
|
}
|
|
@@ -2415,7 +2463,7 @@ index f959e20..1a2f6b8 100644
|
|
|
friend int CompareNotEqualTo( Arg left, Arg right )
|
|
|
{
|
|
|
return _mm_movemask_ps( _mm_cmpneq_ps( left.m_v, right.m_v ) );
|
|
|
-@@ -4198,7 +4259,7 @@ public:
|
|
|
+@@ -4198,7 +4267,7 @@ public:
|
|
|
{
|
|
|
return _mm_movemask_ps( _mm_cmplt_ps( left.m_v, right.m_v ) );
|
|
|
}
|
|
@@ -2424,7 +2472,7 @@ index f959e20..1a2f6b8 100644
|
|
|
friend int CompareGreaterThan( Arg left, Arg right )
|
|
|
{
|
|
|
return _mm_movemask_ps( _mm_cmpgt_ps( left.m_v, right.m_v ) );
|
|
|
-@@ -4234,17 +4295,17 @@ public:
|
|
|
+@@ -4234,17 +4303,17 @@ public:
|
|
|
{
|
|
|
return Col4( _mm_cmpeq_epi32( _mm_castps_si128 ( left.m_v ), _mm_castps_si128 ( right.m_v ) ) );
|
|
|
}
|
|
@@ -2445,7 +2493,7 @@ index f959e20..1a2f6b8 100644
|
|
|
friend int CompareFirstLessEqualTo( Arg left, Arg right )
|
|
|
{
|
|
|
return _mm_comile_ss( left.m_v, right.m_v );
|
|
|
-@@ -4264,17 +4325,17 @@ public:
|
|
|
+@@ -4264,17 +4333,17 @@ public:
|
|
|
{
|
|
|
return _mm_comieq_ss( left.m_v, right.m_v );
|
|
|
}
|
|
@@ -2466,7 +2514,7 @@ index f959e20..1a2f6b8 100644
|
|
|
friend Vec4 IsNotEqualTo( Arg left, Arg right )
|
|
|
{
|
|
|
return Vec4( _mm_cmpneq_ps( left.m_v, right.m_v ) );
|
|
|
-@@ -4326,7 +4387,7 @@ public:
|
|
|
+@@ -4326,7 +4395,7 @@ public:
|
|
|
{
|
|
|
return Vec4( _mm_and_ps( left.m_v, _mm_castsi128_ps ( _mm_setr_epi32( 0, 0, 0, ~0 ) ) ) );
|
|
|
}
|
|
@@ -2475,7 +2523,7 @@ index f959e20..1a2f6b8 100644
|
|
|
friend Vec4 CollapseW( Arg x, Arg y, Arg z, Arg w )
|
|
|
{
|
|
|
return Vec4( _mm_unpackhi_ps( _mm_unpackhi_ps( x.m_v, z.m_v ), _mm_unpackhi_ps( y.m_v, w.m_v ) ) );
|
|
|
-@@ -4420,6 +4481,41 @@ private:
|
|
|
+@@ -4420,6 +4489,41 @@ private:
|
|
|
__m128 m_v;
|
|
|
};
|
|
|
|
|
@@ -2517,7 +2565,7 @@ index f959e20..1a2f6b8 100644
|
|
|
template<const bool round>
|
|
|
Col4 FloatToUHalf( Vec4::Arg v );
|
|
|
template<const bool round>
|
|
|
-@@ -4450,7 +4546,7 @@ Col4 FloatToSHalf( Vec4::Arg v )
|
|
|
+@@ -4450,7 +4554,7 @@ Col4 FloatToSHalf( Vec4::Arg v )
|
|
|
return h;
|
|
|
}
|
|
|
|
|
@@ -2526,7 +2574,7 @@ index f959e20..1a2f6b8 100644
|
|
|
{
|
|
|
Vec4 f;
|
|
|
|
|
|
-@@ -4462,7 +4558,7 @@ Vec4 UHalfToFloat( Col4::Arg v )
|
|
|
+@@ -4462,7 +4566,7 @@ Vec4 UHalfToFloat( Col4::Arg v )
|
|
|
return f;
|
|
|
}
|
|
|
|